lexer.cpp

   1 #include <string.h>
   2 #include <stdlib.h>
   3
   4 #include "gmqcc.h"
   5 #include "lexer.h"
   6
   7 /*
   8  * List of Keywords
   9  */
  10
  11 /* original */
  12 static const char *keywords_qc[] = {
  13     "for", "do", "while",
  14     "if", "else",
  15     "local",
  16     "return",
  17     "const"
  18 };
  19 /* For fte/gmgqcc */
  20 static const char *keywords_fg[] = {
  21     "switch", "case", "default",
  22     "break", "continue",
  23     "typedef",
  24     "goto",
  25
  26     "__builtin_debug_printtype"
  27 };
  28
  29 /*
  30  * Lexer code
  31  */
  32 static char* *lex_filenames;
  33
  34 static void lexerror(lex_file *lex, const char *fmt, ...)
  35 {
  36     va_list ap;
  37
  38     va_start(ap, fmt);
  39     if (lex)
  40         con_vprintmsg(LVL_ERROR, lex->name, lex->sline, lex->column, "parse error", fmt, ap);
  41     else
  42         con_vprintmsg(LVL_ERROR, "", 0, 0, "parse error", fmt, ap);
  43     va_end(ap);
  44 }
  45
  46 static bool lexwarn(lex_file *lex, int warntype, const char *fmt, ...)
  47 {
  48     bool      r;
  49     lex_ctx_t ctx;
  50     va_list   ap;
  51
  52     ctx.file   = lex->name;
  53     ctx.line   = lex->sline;
  54     ctx.column = lex->column;
  55
  56     va_start(ap, fmt);
  57     r = vcompile_warning(ctx, warntype, fmt, ap);
  58     va_end(ap);
  59     return r;
  60 }
  61
  62 static void lex_token_new(lex_file *lex)
  63 {
  64     lex->tok.value.shrinkto(0);
  65
  66     lex->tok.constval.t  = TYPE_VOID;
  67     lex->tok.ctx.line    = lex->sline;
  68     lex->tok.ctx.file    = lex->name;
  69     lex->tok.ctx.column  = lex->column;
  70 }
  71
  72 static void lex_ungetch(lex_file *lex, Token ch);
  73 static Token lex_getch(lex_file *lex);
  74
  75 lex_file* lex_open(const char *file)
  76 {
  77     lex_file  *lex;
  78     FILE *in = fopen(file, "rb");
  79     uint32_t   read;
  80
  81     if (!in) {
  82         lexerror(nullptr, "open failed: '%s'\n", file);
  83         return nullptr;
  84     }
  85
  86     lex = (lex_file*)mem_a(sizeof(*lex));
  87     if (!lex) {
  88         fclose(in);
  89         lexerror(nullptr, "out of memory\n");
  90         return nullptr;
  91     }
  92
  93     memset(lex, 0, sizeof(*lex));
  94
  95     lex->file    = in;
  96     lex->name    = util_strdup(file);
  97     lex->line    = 1; /* we start counting at 1 */
  98     lex->column  = 0;
  99     lex->peekpos = 0;
 100     lex->eof     = false;
 101
 102     /* handle BOM */
 103     if ((read = (lex_getch(lex) << 16) | (lex_getch(lex) << 8) | lex_getch(lex)) != 0xEFBBBF) {
 104         lex_ungetch(lex, static_cast<Token>((read & 0x0000FF)));
 105         lex_ungetch(lex, static_cast<Token>((read & 0x00FF00) >> 8));
 106         lex_ungetch(lex, static_cast<Token>((read & 0xFF0000) >> 16));
 107     } else {
 108         /*
 109          * otherwise the lexer has advanced 3 bytes for the BOM, we need
 110          * to set the column back to 0
 111          */
 112         lex->column = 0;
 113     }
 114
 115     vec_push(lex_filenames, lex->name);
 116     return lex;
 117 }
 118
 119 lex_file* lex_open_string(const char *str, size_t len, const char *name)
 120 {
 121     lex_file *lex;
 122
 123     lex = (lex_file*)mem_a(sizeof(*lex));
 124     if (!lex) {
 125         lexerror(nullptr, "out of memory\n");
 126         return nullptr;
 127     }
 128
 129     memset(lex, 0, sizeof(*lex));
 130
 131     lex->file = nullptr;
 132     lex->open_string        = str;
 133     lex->open_string_length = len;
 134     lex->open_string_pos    = 0;
 135
 136     lex->name    = util_strdup(name ? name : "<string-source>");
 137     lex->line    = 1; /* we start counting at 1 */
 138     lex->peekpos = 0;
 139     lex->eof     = false;
 140     lex->column  = 0;
 141
 142     vec_push(lex_filenames, lex->name);
 143
 144     return lex;
 145 }
 146
 147 void lex_cleanup()
 148 {
 149     for (size_t i = 0; i < vec_size(lex_filenames); ++i)
 150         mem_d(lex_filenames[i]);
 151     vec_free(lex_filenames);
 152 }
 153
 154 void lex_close(lex_file *lex)
 155 {
 156     vec_free(lex->frames);
 157
 158     if (lex->file)
 159         fclose(lex->file);
 160
 161     /* mem_d(lex->name); collected in lex_filenames */
 162     mem_d(lex);
 163 }
 164
 165
 166
 167 static Token lex_fgetc(lex_file *lex)
 168 {
 169     if (lex->file) {
 170         lex->column++;
 171         auto c = fgetc(lex->file);
 172         return c == EOF ? Token::END : static_cast<Token>(c);
 173     }
 174     if (lex->open_string) {
 175         if (lex->open_string_pos >= lex->open_string_length)
 176             return Token::END;
 177         lex->column++;
 178         auto c = lex->open_string[lex->open_string_pos++];
 179         return static_cast<Token>(c);
 180     }
 181     return Token::END;
 182 }
 183
/* Get or put-back data
 * The following functions do NOT understand what kind of data they
 * are working on.
 * They are merely wrapping get/put in order to count line numbers.
 */
 189 static Token lex_try_trigraph(lex_file *lex, Token old)
 190 {
 191     auto c2 = lex_fgetc(lex);
 192     if (!lex->push_line && c2 == Token::LF) {
 193         lex->line++;
 194         lex->column = 0;
 195     }
 196
 197     if (c2 != Token::QUESTION) {
 198         lex_ungetch(lex, c2);
 199         return old;
 200     }
 201
 202     auto c3 = lex_fgetc(lex);
 203     if (!lex->push_line && c3 == Token::LF) {
 204         lex->line++;
 205         lex->column = 0;
 206     }
 207
 208     switch (c3) {
 209         case Token::EQ: return Token::HASH;
 210         case Token::DIV: return Token::BACKSLASH;
 211         case Token::QUOT_SINGLE: return Token::XOR;
 212         case Token::PAREN_OPEN: return Token::BRACKET_OPEN;
 213         case Token::PAREN_CLOSE: return Token::BRACKET_CLOSE;
 214         case Token::NOT: return Token::OR;
 215         case Token::LT: return Token::BRACE_OPEN;
 216         case Token::GT: return Token::BRACE_CLOSE;
 217         case Token::SUB: return Token::BITNOT;
 218         default:
 219             lex_ungetch(lex, c3);
 220             lex_ungetch(lex, c2);
 221             return old;
 222     }
 223 }
 224
 225 static Token lex_try_digraph(lex_file *lex, Token ch)
 226 {
 227     auto c2 = lex_fgetc(lex);
 228     /* we just used fgetc() so count lines
 229      * need to offset a \n the ungetch would recognize
 230      */
 231     if (!lex->push_line && c2 == Token::LF)
 232         lex->line++;
 233     if      (ch == Token::LT && c2 == Token::COLON)
 234         return Token::BRACKET_OPEN;
 235     else if (ch == Token::COLON && c2 == Token::GT)
 236         return Token::BRACKET_CLOSE;
 237     else if (ch == Token::LT && c2 == Token::MOD)
 238         return Token::BRACE_OPEN;
 239     else if (ch == Token::MOD && c2 == Token::GT)
 240         return Token::BRACE_CLOSE;
 241     else if (ch == Token::MOD && c2 == Token::COLON)
 242         return Token::HASH;
 243     lex_ungetch(lex, c2);
 244     return ch;
 245 }
 246
 247 static Token lex_getch(lex_file *lex)
 248 {
 249     if (lex->peekpos) {
 250         lex->peekpos--;
 251         if (!lex->push_line && lex->peek[lex->peekpos] == Token::LF) {
 252             lex->line++;
 253             lex->column = 0;
 254         }
 255         return lex->peek[lex->peekpos];
 256     }
 257
 258     auto ch = lex_fgetc(lex);
 259     if (!lex->push_line && ch == Token::LF) {
 260         lex->line++;
 261         lex->column = 0;
 262     }
 263     else if (ch == Token::QUESTION)
 264         return lex_try_trigraph(lex, ch);
 265     else if (!lex->flags.nodigraphs && (ch == Token::LT || ch == Token::COLON || ch == Token::MOD))
 266         return lex_try_digraph(lex, ch);
 267     return ch;
 268 }
 269
 270 static void lex_ungetch(lex_file *lex, Token ch)
 271 {
 272     lex->peek[lex->peekpos++] = ch;
 273     lex->column--;
 274     if (!lex->push_line && ch == Token::LF) {
 275         lex->line--;
 276         lex->column = 0;
 277     }
 278 }
 279
 280 /* classify characters
 281  * some additions to the is*() functions of ctype.h
 282  */
 283
 284 /* Idents are alphanumberic, but they start with alpha or _ */
 285 static bool isident_start(int ch)
 286 {
 287     return util_isalpha(ch) || ch == '_';
 288 }
 289
 290 static bool isident(int ch)
 291 {
 292     return isident_start(ch) || util_isdigit(ch);
 293 }
 294
 295 /* isxdigit_only is used when we already know it's not a digit
 296  * and want to see if it's a hex digit anyway.
 297  */
 298 static bool isxdigit_only(int ch)
 299 {
 300     return (ch >= 'a' && ch <= 'f') || (ch >= 'A' && ch <= 'F');
 301 }
 302
 303 /* Append a character to the token buffer */
 304 static void lex_tokench(lex_file *lex, int ch)
 305 {
 306     lex->tok.value.push(ch);
 307 }
 308
 309 /* Append a trailing null-byte */
 310 static void lex_endtoken(lex_file *lex)
 311 {
 312     lex->tok.value.push(0);
 313     lex->tok.value.shrinkby(1);
 314 }
 315
 316 static bool lex_try_pragma(lex_file *lex)
 317 {
 318     char *pragma  = nullptr;
 319     char *command = nullptr;
 320     char *param   = nullptr;
 321     size_t line;
 322
 323     if (lex->flags.preprocessing)
 324         return false;
 325
 326     line = lex->line;
 327
 328     auto ch = lex_getch(lex);
 329     if (ch != Token::HASH) {
 330         lex_ungetch(lex, ch);
 331         return false;
 332     }
 333
 334     for (ch = lex_getch(lex); vec_size(pragma) < 8 && ch >= 'a' && ch <= 'z'; ch = lex_getch(lex))
 335         vec_push(pragma, ch);
 336     vec_push(pragma, 0);
 337
 338     if (ch != Token::WS|| strcmp(pragma, "pragma")) {
 339         lex_ungetch(lex, ch);
 340         goto unroll;
 341     }
 342
 343     for (ch = lex_getch(lex); vec_size(command) < 32 && ch >= 'a' && ch <= 'z'; ch = lex_getch(lex))
 344         vec_push(command, ch);
 345     vec_push(command, 0);
 346
 347     if (ch != Token::PAREN_OPEN) {
 348         lex_ungetch(lex, ch);
 349         goto unroll;
 350     }
 351
 352     for (ch = lex_getch(lex); vec_size(param) < 1024 && ch != Token::PAREN_CLOSE && ch != Token::LF; ch = lex_getch(lex))
 353         vec_push(param, ch);
 354     vec_push(param, 0);
 355
 356     if (ch != Token::PAREN_CLOSE) {
 357         lex_ungetch(lex, ch);
 358         goto unroll;
 359     }
 360
 361     if (!strcmp(command, "push")) {
 362         if (!strcmp(param, "line")) {
 363             lex->push_line++;
 364             if (lex->push_line == 1)
 365                 --line;
 366         }
 367         else
 368             goto unroll;
 369     }
 370     else if (!strcmp(command, "pop")) {
 371         if (!strcmp(param, "line")) {
 372             if (lex->push_line)
 373                 lex->push_line--;
 374             if (lex->push_line == 0)
 375                 --line;
 376         }
 377         else
 378             goto unroll;
 379     }
 380     else if (!strcmp(command, "file")) {
 381         lex->name = util_strdup(param);
 382         vec_push(lex_filenames, lex->name);
 383     }
 384     else if (!strcmp(command, "line")) {
 385         line = strtol(param, nullptr, 0)-1;
 386     }
 387     else
 388         goto unroll;
 389
 390     lex->line = line;
 391     while (ch != Token::LF && ch != Token::END)
 392         ch = lex_getch(lex);
 393     vec_free(command);
 394     vec_free(param);
 395     vec_free(pragma);
 396     return true;
 397
 398 unroll:
 399     if (command) {
 400         vec_pop(command);
 401         while (vec_size(command)) {
 402             lex_ungetch(lex, static_cast<Token>(vec_last(command)));
 403             vec_pop(command);
 404         }
 405         vec_free(command);
 406         lex_ungetch(lex, Token::WS);
 407     }
 408     if (param) {
 409         vec_pop(param);
 410         while (vec_size(param)) {
 411             lex_ungetch(lex, static_cast<Token>(vec_last(param)));
 412             vec_pop(param);
 413         }
 414         vec_free(param);
 415         lex_ungetch(lex, Token::WS);
 416     }
 417     if (pragma) {
 418         vec_pop(pragma);
 419         while (vec_size(pragma)) {
 420             lex_ungetch(lex, static_cast<Token>(vec_last(pragma)));
 421             vec_pop(pragma);
 422         }
 423         vec_free(pragma);
 424     }
 425     lex_ungetch(lex, Token::HASH);
 426
 427     lex->line = line;
 428     return false;
 429 }
 430
 431 /* Skip whitespace and comments and return the first
 432  * non-white character.
 433  * As this makes use of the above getch() ungetch() functions,
 434  * we don't need to care at all about line numbering anymore.
 435  *
 436  * In theory, this function should only be used at the beginning
 437  * of lexing, or when we *know* the next character is part of the token.
 438  * Otherwise, if the parser throws an error, the linenumber may not be
 439  * the line of the error, but the line of the next token AFTER the error.
 440  *
 441  * This is currently only problematic when using c-like string-continuation,
 442  * since comments and whitespaces are allowed between 2 such strings.
 443  * Example:
 444 printf(   "line one\n"
 445 // A comment
 446           "A continuation of the previous string"
 447 // This line is skipped
 448       , foo);
 449
 450  * In this case, if the parse decides it didn't actually want a string,
 451  * and uses lex->line to print an error, it will show the ', foo);' line's
 452  * linenumber.
 453  *
 454  * On the other hand, the parser is supposed to remember the line of the next
 455  * token's beginning. In this case we would want skipwhite() to be called
 456  * AFTER reading a token, so that the parser, before reading the NEXT token,
 457  * doesn't store teh *comment's* linenumber, but the actual token's linenumber.
 458  *
 459  * THIS SOLUTION
 460  *    here is to store the line of the first character after skipping
 461  *    the initial whitespace in lex->sline, this happens in lex_do.
 462  */
 463 static Token lex_skipwhite(lex_file *lex, bool hadwhite)
 464 {
 465     Token ch;
 466     bool haswhite = hadwhite;
 467
 468     do
 469     {
 470         ch = lex_getch(lex);
 471         while (ch != Token::END && util_isspace(ch)) {
 472             if (ch == Token::LF) {
 473                 if (lex_try_pragma(lex))
 474                     continue;
 475             }
 476             if (lex->flags.preprocessing) {
 477                 if (ch == Token::LF) {
 478                     /* end-of-line */
 479                     /* see if there was whitespace first */
 480                     if (haswhite) { /* (vec_size(lex->tok.value)) { */
 481                         lex_ungetch(lex, ch);
 482                         lex_endtoken(lex);
 483                         return Token::WHITE;
 484                     }
 485                     /* otherwise return EOL */
 486                     return Token::EOL;
 487                 }
 488                 haswhite = true;
 489                 lex_tokench(lex, ch);
 490             }
 491             ch = lex_getch(lex);
 492         }
 493
 494         if (ch == Token::DIV) {
 495             ch = lex_getch(lex);
 496             if (ch == Token::DIV)
 497             {
 498                 /* one line comment */
 499                 ch = lex_getch(lex);
 500
 501                 if (lex->flags.preprocessing) {
 502                     haswhite = true;
 503                     lex_tokench(lex, Token::WS);
 504                     lex_tokench(lex, Token::WS);
 505                 }
 506
 507                 while (ch != Token::END && ch != Token::LF) {
 508                     if (lex->flags.preprocessing)
 509                         lex_tokench(lex, Token::WS); /* ch); */
 510                     ch = lex_getch(lex);
 511                 }
 512                 if (lex->flags.preprocessing) {
 513                     lex_ungetch(lex, Token::LF);
 514                     lex_endtoken(lex);
 515                     return Token::WHITE;
 516                 }
 517                 continue;
 518             }
 519             if (ch == Token::MUL)
 520             {
 521                 /* multiline comment */
 522                 if (lex->flags.preprocessing) {
 523                     haswhite = true;
 524                     lex_tokench(lex, Token::WS);
 525                     lex_tokench(lex, Token::WS);
 526                 }
 527
 528                 while (ch != Token::END)
 529                 {
 530                     ch = lex_getch(lex);
 531                     if (ch == Token::MUL) {
 532                         ch = lex_getch(lex);
 533                         if (ch == Token::DIV) {
 534                             if (lex->flags.preprocessing) {
 535                                 lex_tokench(lex, Token::WS);
 536                                 lex_tokench(lex, Token::WS);
 537                             }
 538                             break;
 539                         }
 540                         lex_ungetch(lex, ch);
 541                     }
 542                     if (lex->flags.preprocessing) {
 543                         if (ch == Token::LF)
 544                             lex_tokench(lex, Token::LF);
 545                         else
 546                             lex_tokench(lex, Token::WS);
 547                     }
 548                 }
 549                 ch = Token::WS; /* cause TRUE in the isspace check */
 550                 continue;
 551             }
 552             /* Otherwise roll back to the slash and break out of the loop */
 553             lex_ungetch(lex, ch);
 554             ch = Token::DIV;
 555             break;
 556         }
 557     } while (ch != Token::END && util_isspace(ch));
 558
 559     if (haswhite) {
 560         lex_endtoken(lex);
 561         lex_ungetch(lex, ch);
 562         return Token::WHITE;
 563     }
 564     return ch;
 565 }
 566
 567 /* Get a token */
 568 static bool GMQCC_WARN lex_finish_ident(lex_file *lex)
 569 {
 570     auto ch = lex_getch(lex);
 571     while (ch != Token::END && isident(ch))
 572     {
 573         lex_tokench(lex, ch);
 574         ch = lex_getch(lex);
 575     }
 576
 577     /* last ch was not an ident ch: */
 578     lex_ungetch(lex, ch);
 579
 580     return true;
 581 }
 582
 583 /* read one ident for the frame list */
 584 static int lex_parse_frame(lex_file *lex)
 585 {
 586     lex_token_new(lex);
 587
 588     auto ch = lex_getch(lex);
 589     while (ch != Token::END && ch != Token::LF && util_isspace(ch))
 590         ch = lex_getch(lex);
 591
 592     if (ch == Token::LF)
 593         return 1;
 594
 595     if (!isident_start(ch)) {
 596         lexerror(lex, "invalid framename, must start with one of a-z or _, got %c", ch);
 597         return -1;
 598     }
 599
 600     lex_tokench(lex, ch);
 601     if (!lex_finish_ident(lex))
 602         return -1;
 603     lex_endtoken(lex);
 604     return 0;
 605 }
 606
 607 /* read a list of $frames */
 608 static bool lex_finish_frames(lex_file *lex)
 609 {
 610     do {
 611         size_t i;
 612         int    rc;
 613         frame_macro m;
 614
 615         rc = lex_parse_frame(lex);
 616         if (rc > 0) /* end of line */
 617             return true;
 618         if (rc < 0) /* error */
 619             return false;
 620
 621         for (i = 0; i < vec_size(lex->frames); ++i) {
 622             if (lex->frames[i].name == lex->tok.value.c_str()) {
 623                 lex->frames[i].value = lex->framevalue++;
 624                 if (lexwarn(lex, WARN_FRAME_MACROS, "duplicate frame macro defined: `%s`", lex->tok.value))
 625                     return false;
 626                 break;
 627             }
 628         }
 629         if (i < vec_size(lex->frames))
 630             continue;
 631
 632         m.value = lex->framevalue++;
 633         m.name = util_strdup(lex->tok.value.c_str());
 634         lex->tok.value.shrinkto(0);
 635         vec_push(lex->frames, m);
 636     } while (true);
 637
 638     return false;
 639 }
 640
 641 static Token GMQCC_WARN lex_finish_string(lex_file *lex, int quote)
 642 {
 643     utf8ch_t chr = 0;
 644     int ch = 0, texttype = 0;
 645     Token nextch = Token::NONE;
 646     bool hex;
 647     bool oct;
 648     char u8buf[8]; /* way more than enough */
 649     int  u8len, uc;
 650
 651     while (ch != Token::END)
 652     {
 653         ch = lex_getch(lex);
 654         if (ch == quote)
 655             return Token::STRINGCONST;
 656
 657         if (lex->flags.preprocessing && ch == '\\') {
 658             lex_tokench(lex, ch);
 659             ch = lex_getch(lex);
 660             if (ch == Token::END) {
 661                 lexerror(lex, "unexpected end of file");
 662                 lex_ungetch(lex, Token::END); /* next token to be Token::END */
 663                 return (lex->tok.ttype = Token::ERROR);
 664             }
 665             lex_tokench(lex, ch);
 666         }
 667         else if (ch == '\\') {
 668             ch = lex_getch(lex);
 669             if (ch == Token::END) {
 670                 lexerror(lex, "unexpected end of file");
 671                 lex_ungetch(lex, Token::END); /* next token to be Token::END */
 672                 return (lex->tok.ttype = Token::ERROR);
 673             }
 674
 675             switch (ch) {
 676             case '\\': break;
 677             case '\'': break;
 678             case '"':  break;
 679             case 'a': ch = '\a'; break;
 680             case 'r': ch = '\r'; break;
 681             case 'n': ch = '\n'; break;
 682             case 't': ch = '\t'; break;
 683             case 'f': ch = '\f'; break;
 684             case 'v': ch = '\v'; break;
 685             case 'x':
 686             case 'X':
 687                 /* same procedure as in fteqcc */
 688                 ch = 0;
 689                 nextch = lex_getch(lex);
 690                 if      (nextch >= '0' && nextch <= '9')
 691                     ch += nextch - '0';
 692                 else if (nextch >= 'a' && nextch <= 'f')
 693                     ch += nextch - 'a' + 10;
 694                 else if (nextch >= 'A' && nextch <= 'F')
 695                     ch += nextch - 'A' + 10;
 696                 else {
 697                     lexerror(lex, "bad character code");
 698                     lex_ungetch(lex, nextch);
 699                     return (lex->tok.ttype = Token::ERROR);
 700                 }
 701
 702                 ch *= 0x10;
 703                 nextch = lex_getch(lex);
 704                 if      (nextch >= '0' && nextch <= '9')
 705                     ch += nextch - '0';
 706                 else if (nextch >= 'a' && nextch <= 'f')
 707                     ch += nextch - 'a' + 10;
 708                 else if (nextch >= 'A' && nextch <= 'F')
 709                     ch += nextch - 'A' + 10;
 710                 else {
 711                     lexerror(lex, "bad character code");
 712                     lex_ungetch(lex, nextch);
 713                     return (lex->tok.ttype = Token::ERROR);
 714                 }
 715                 break;
 716
 717             /* fteqcc support */
 718             case '0': case '1': case '2': case '3':
 719             case '4': case '5': case '6': case '7':
 720             case '8': case '9':
 721                 ch = 18 + ch - '0';
 722                 break;
 723             case '<':  ch = 29; break;
 724             case '-':  ch = 30; break;
 725             case '>':  ch = 31; break;
 726             case '[':  ch = 16; break;
 727             case ']':  ch = 17; break;
 728             case '{':
 729                 chr = 0;
 730                 nextch = lex_getch(lex);
 731                 hex = (nextch == 'x');
 732                 oct = (nextch == '0');
 733                 if (!hex && !oct)
 734                     lex_ungetch(lex, nextch);
 735                 for (nextch = lex_getch(lex); nextch != Token::BRACE_CLOSE; nextch = lex_getch(lex)) {
 736                     if (!hex && !oct) {
 737                         if (nextch >= '0' && nextch <= '9')
 738                             chr = chr * 10 + nextch - '0';
 739                         else {
 740                             lexerror(lex, "bad character code");
 741                             return (lex->tok.ttype = Token::ERROR);
 742                         }
 743                     } else if (!oct) {
 744                         if (nextch >= '0' && nextch <= '9')
 745                             chr = chr * 0x10 + nextch - '0';
 746                         else if (nextch >= 'a' && nextch <= 'f')
 747                             chr = chr * 0x10 + nextch - 'a' + 10;
 748                         else if (nextch >= 'A' && nextch <= 'F')
 749                             chr = chr * 0x10 + nextch - 'A' + 10;
 750                         else {
 751                             lexerror(lex, "bad character code");
 752                             return (lex->tok.ttype = Token::ERROR);
 753                         }
 754                     } else {
 755                         if (nextch >= '0' && nextch <= '9')
 756                             chr = chr * 8 + chr - '0';
 757                         else {
 758                             lexerror(lex, "bad character code");
 759                             return (lex->tok.ttype = Token::ERROR);
 760                         }
 761                     }
 762                     if (chr > 0x10FFFF || (!OPTS_FLAG(UTF8) && chr > 255))
 763                     {
 764                         lexerror(lex, "character code out of range");
 765                         return (lex->tok.ttype = Token::ERROR);
 766                     }
 767                 }
 768                 if (OPTS_FLAG(UTF8) && chr >= 128) {
 769                     u8len = utf8_from(u8buf, chr);
 770                     if (!u8len)
 771                         ch = 0;
 772                     else {
 773                         --u8len;
 774                         lex->column += u8len;
 775                         for (uc = 0; uc < u8len; ++uc)
 776                             lex_tokench(lex, u8buf[uc]);
 777                         /*
 778                          * the last character will be inserted with the tokench() call
 779                          * below the switch
 780                          */
 781                         ch = u8buf[uc];
 782                     }
 783                 }
 784                 else
 785                     ch = chr;
 786                 break;
 787
 788             /* high bit text */
 789             case 'b': case 's':
 790                 texttype ^= 128;
 791                 continue;
 792
 793             case '\n':
 794                 ch = '\n';
 795                 break;
 796
 797             default:
 798                 lexwarn(lex, WARN_UNKNOWN_CONTROL_SEQUENCE, "unrecognized control sequence: \\%c", ch);
 799                 /* so we just add the character plus backslash no matter what it actually is */
 800                 lex_tokench(lex, '\\');
 801             }
 802             /* add the character finally */
 803             lex_tokench(lex, ch | texttype);
 804         }
 805         else
 806             lex_tokench(lex, ch);
 807     }
 808     lexerror(lex, "unexpected end of file within string constant");
 809     lex_ungetch(lex, Token::END); /* next token to be Token::END */
 810     return (lex->tok.ttype = Token::ERROR);
 811 }
 812
 813 static Token GMQCC_WARN lex_finish_digit(lex_file *lex, Token lastch)
 814 {
 815     bool ishex = false;
 816
 817     Token ch = lastch;
 818
 819     /* parse a number... */
 820     if (ch == Token::DOT)
 821         lex->tok.ttype = Token::FLOATCONST;
 822     else
 823         lex->tok.ttype = Token::INTCONST;
 824
 825     lex_tokench(lex, ch);
 826
 827     ch = lex_getch(lex);
 828     if (ch != Token::DOT && !util_isdigit(ch))
 829     {
 830         if (lastch != '0' || ch != 'x')
 831         {
 832             /* end of the number or EOF */
 833             lex_ungetch(lex, ch);
 834             lex_endtoken(lex);
 835
 836             lex->tok.constval.i = lastch - '0';
 837             return lex->tok.ttype;
 838         }
 839
 840         ishex = true;
 841     }
 842
 843     /* EOF would have been caught above */
 844
 845     if (ch != Token::DOT)
 846     {
 847         lex_tokench(lex, ch);
 848         ch = lex_getch(lex);
 849         while (util_isdigit(ch) || (ishex && isxdigit_only(ch)))
 850         {
 851             lex_tokench(lex, ch);
 852             ch = lex_getch(lex);
 853         }
 854     }
 855     /* NOT else, '.' can come from above as well */
 856     if (lex->tok.ttype != Token::FLOATCONST && ch == Token::DOT && !ishex)
 857     {
 858         /* Allow floating comma in non-hex mode */
 859         lex->tok.ttype = Token::FLOATCONST;
 860         lex_tokench(lex, ch);
 861
 862         /* continue digits-only */
 863         ch = lex_getch(lex);
 864         while (util_isdigit(ch))
 865         {
 866             lex_tokench(lex, ch);
 867             ch = lex_getch(lex);
 868         }
 869     }
 870     /* put back the last character */
 871     /* but do not put back the trailing 'f' or a float */
 872     if (lex->tok.ttype == Token::FLOATCONST && ch == 'f')
 873         ch = lex_getch(lex);
 874
 875     /* generally we don't want words to follow numbers: */
 876     if (isident(ch)) {
 877         lexerror(lex, "unexpected trailing characters after number");
 878         return (lex->tok.ttype = Token::ERROR);
 879     }
 880     lex_ungetch(lex, ch);
 881
 882     lex_endtoken(lex);
 883     if (lex->tok.ttype == Token::FLOATCONST)
 884         lex->tok.constval.f = strtod(lex->tok.value.c_str(), nullptr);
 885     else
 886         lex->tok.constval.i = strtol(lex->tok.value.c_str(), nullptr, 0);
 887     return lex->tok.ttype;
 888 }
 889
 890 Token lex_do(lex_file *lex)
 891 {
 892     Token ch, nextch, thirdch;
 893     bool hadwhite = false;
 894
 895     lex_token_new(lex);
 896
 897     while (true) {
 898         ch = lex_skipwhite(lex, hadwhite);
 899         hadwhite = true;
 900         if (!lex->flags.mergelines || ch != Token::BACKSLASH)
 901             break;
 902         ch = lex_getch(lex);
 903         if (ch == Token::CR)
 904             ch = lex_getch(lex);
 905         if (ch != Token::LF) {
 906             lex_ungetch(lex, ch);
 907             ch = Token::BACKSLASH;
 908             break;
 909         }
 910         /* we reached a linemerge */
 911         lex_tokench(lex, '\n');
 912     }
 913
 914     if (lex->flags.preprocessing && (ch == Token::WHITE || ch == Token::EOL || ch == Token::FATAL)) {
 915         return (lex->tok.ttype = ch);
 916     }
 917
 918     lex->sline = lex->line;
 919     lex->tok.ctx.line = lex->sline;
 920     lex->tok.ctx.file = lex->name;
 921
 922     if (lex->eof)
 923         return (lex->tok.ttype = Token::FATAL);
 924
 925     if (ch == Token::END) {
 926         lex->eof = true;
 927         return (lex->tok.ttype = Token::END);
 928     }
 929
 930     /* modelgen / spiritgen commands */
 931     if (ch == Token::DOLLAR && !lex->flags.preprocessing) {
 932         const char *v;
 933         size_t frame;
 934
 935         ch = lex_getch(lex);
 936         if (!isident_start(ch)) {
 937             lexerror(lex, "hanging '$' modelgen/spritegen command line");
 938             return lex_do(lex);
 939         }
 940         lex_tokench(lex, ch);
 941         if (!lex_finish_ident(lex))
 942             return (lex->tok.ttype = Token::ERROR);
 943         lex_endtoken(lex);
 944         /* skip the known commands */
 945         v = lex->tok.value.c_str();
 946
 947         if (!strcmp(v, "frame") || !strcmp(v, "framesave"))
 948         {
 949             /* frame/framesave command works like an enum
 950              * similar to fteqcc we handle this in the lexer.
 951              * The reason for this is that it is sensitive to newlines,
 952              * which the parser is unaware of
 953              */
 954             if (!lex_finish_frames(lex))
 955                  return (lex->tok.ttype = Token::ERROR);
 956             return lex_do(lex);
 957         }
 958
 959         if (!strcmp(v, "framevalue"))
 960         {
 961             ch = lex_getch(lex);
 962             while (ch != Token::END && util_isspace(ch) && ch != Token::LF)
 963                 ch = lex_getch(lex);
 964
 965             if (!util_isdigit(ch)) {
 966                 lexerror(lex, "$framevalue requires an integer parameter");
 967                 return lex_do(lex);
 968             }
 969
 970             lex_token_new(lex);
 971             lex->tok.ttype = lex_finish_digit(lex, ch);
 972             lex_endtoken(lex);
 973             if (lex->tok.ttype != Token::INTCONST) {
 974                 lexerror(lex, "$framevalue requires an integer parameter");
 975                 return lex_do(lex);
 976             }
 977             lex->framevalue = lex->tok.constval.i;
 978             return lex_do(lex);
 979         }
 980
 981         if (!strcmp(v, "framerestore"))
 982         {
 983             int rc;
 984
 985             lex_token_new(lex);
 986
 987             rc = lex_parse_frame(lex);
 988
 989             if (rc > 0) {
 990                 lexerror(lex, "$framerestore requires a framename parameter");
 991                 return lex_do(lex);
 992             }
 993             if (rc < 0)
 994                 return (lex->tok.ttype = Token::FATAL);
 995
 996             v = lex->tok.value.c_str();
 997             for (frame = 0; frame < vec_size(lex->frames); ++frame) {
 998                 if (lex->frames[frame].name == v) {
 999                     lex->framevalue = lex->frames[frame].value;
1000                     return lex_do(lex);
1001                 }
1002             }
1003             lexerror(lex, "unknown framename `%s`", v);
1004             return lex_do(lex);
1005         }
1006
1007         if (!strcmp(v, "modelname"))
1008         {
1009             int rc;
1010
1011             lex_token_new(lex);
1012
1013             rc = lex_parse_frame(lex);
1014
1015             if (rc > 0) {
1016                 lexerror(lex, "$modelname requires a parameter");
1017                 return lex_do(lex);
1018             }
1019             if (rc < 0)
1020                 return (lex->tok.ttype = Token::FATAL);
1021
1022             if (lex->modelname.size()) {
1023                 frame_macro m;
1024                 m.name = std::move(lex->modelname);
1025                 m.value = lex->framevalue;
1026                 vec_push(lex->frames, m);
1027             }
1028             lex->modelname = std::string(lex->tok.value.c_str());
1029             return lex_do(lex);
1030         }
1031
1032         if (!strcmp(v, "flush"))
1033         {
1034             vec_free(lex->frames);
1035             /* skip line (fteqcc does it too) */
1036             ch = lex_getch(lex);
1037             while (ch != Token::END && ch != Token::LF)
1038                 ch = lex_getch(lex);
1039             return lex_do(lex);
1040         }
1041
1042         if (!strcmp(v, "cd") ||
1043             !strcmp(v, "origin") ||
1044             !strcmp(v, "base") ||
1045             !strcmp(v, "flags") ||
1046             !strcmp(v, "scale") ||
1047             !strcmp(v, "skin"))
1048         {
1049             /* skip line */
1050             ch = lex_getch(lex);
1051             while (ch != Token::END && ch != Token::LF)
1052                 ch = lex_getch(lex);
1053             return lex_do(lex);
1054         }
1055
1056         for (frame = 0; frame < vec_size(lex->frames); ++frame) {
1057             if (lex->frames[frame].name == v) {
1058                 lex->tok.constval.i = lex->frames[frame].value;
1059                 return (lex->tok.ttype = Token::INTCONST);
1060             }
1061         }
1062
1063         lexerror(lex, "invalid frame macro");
1064         return lex_do(lex);
1065     }
1066
1067     /* single-character tokens */
1068     switch (ch)
1069     {
1070         case Token::BRACKET_OPEN:
1071             nextch = lex_getch(lex);
1072             if (nextch == Token::BRACKET_OPEN) {
1073                 lex_tokench(lex, ch);
1074                 lex_tokench(lex, nextch);
1075                 lex_endtoken(lex);
1076                 return (lex->tok.ttype = Token::ATTRIBUTE_OPEN);
1077             }
1078             lex_ungetch(lex, nextch);
1079             /* FALL THROUGH */
1080         case Token::PAREN_OPEN:
1081         case Token::COLON:
1082         case Token::QUESTION:
1083             lex_tokench(lex, ch);
1084             lex_endtoken(lex);
1085             if (lex->flags.noops)
1086                 return (lex->tok.ttype = ch);
1087             else
1088                 return (lex->tok.ttype = Token::OPERATOR);
1089
1090         case Token::BRACKET_CLOSE:
1091             if (lex->flags.noops) {
1092                 nextch = lex_getch(lex);
1093                 if (nextch == Token::BRACKET_CLOSE) {
1094                     lex_tokench(lex, ch);
1095                     lex_tokench(lex, nextch);
1096                     lex_endtoken(lex);
1097                     return (lex->tok.ttype = Token::ATTRIBUTE_CLOSE);
1098                 }
1099                 lex_ungetch(lex, nextch);
1100             }
1101             /* FALL THROUGH */
1102         case Token::PAREN_CLOSE:
1103         case Token::SEMICOLON:
1104         case Token::BRACE_OPEN:
1105         case Token::BRACE_CLOSE:
1106
1107         case Token::HASH:
1108             lex_tokench(lex, ch);
1109             lex_endtoken(lex);
1110             return (lex->tok.ttype = ch);
1111         default:
1112             break;
1113     }
1114
1115     if (ch == Token::DOT) {
1116         nextch = lex_getch(lex);
1117         /* digits starting with a dot */
1118         if (util_isdigit(nextch)) {
1119             lex_ungetch(lex, nextch);
1120             lex->tok.ttype = lex_finish_digit(lex, ch);
1121             lex_endtoken(lex);
1122             return lex->tok.ttype;
1123         }
1124         lex_ungetch(lex, nextch);
1125     }
1126
1127     if (lex->flags.noops)
1128     {
1129         /* Detect characters early which are normally
1130          * operators OR PART of an operator.
1131          */
1132         switch (ch)
1133         {
1134             case Token::MUL:
1135             case Token::DIV:
1136             case Token::MOD:
1137             case Token::ADD:
1138             case Token::SUB:
1139             case Token::LT:
1140             case Token::GT:
1141             case Token::EQ:
1142             case Token::AND:
1143             case Token::OR:
1144             case Token::XOR:
1145             case Token::BITNOT:
1146             case Token::COMMA:
1147             case Token::NOT:
1148                 lex_tokench(lex, ch);
1149                 lex_endtoken(lex);
1150                 return (lex->tok.ttype = ch);
1151             default:
1152                 break;
1153         }
1154     }
1155
1156     if (ch == Token::DOT)
1157     {
1158         lex_tokench(lex, ch);
1159         // peak ahead once
1160         nextch = lex_getch(lex);
1161         if (nextch != Token::DOT) {
1162             lex_ungetch(lex, nextch);
1163             lex_endtoken(lex);
1164             if (lex->flags.noops)
1165                 return (lex->tok.ttype = ch);
1166             else
1167                 return (lex->tok.ttype = Token::OPERATOR);
1168         }
1169         // peak ahead again
1170         nextch = lex_getch(lex);
1171         if (nextch != Token::DOT) {
1172             lex_ungetch(lex, nextch);
1173             lex_ungetch(lex, Token::DOT);
1174             lex_endtoken(lex);
1175             if (lex->flags.noops)
1176                 return (lex->tok.ttype = ch);
1177             else
1178                 return (lex->tok.ttype = Token::OPERATOR);
1179         }
1180         // fill the token to be "..."
1181         lex_tokench(lex, ch);
1182         lex_tokench(lex, ch);
1183         lex_endtoken(lex);
1184         return (lex->tok.ttype = Token::DOTS);
1185     }
1186
1187     if (ch == Token::COMMA || ch == Token::DOT) {
1188         lex_tokench(lex, ch);
1189         lex_endtoken(lex);
1190         return (lex->tok.ttype = Token::OPERATOR);
1191     }
1192
1193     if (ch == Token::ADD || ch == Token::SUB || /* ++, --, +=, -= */
1194         ch == Token::GT || ch == Token::LT|| /* <<, >>, <=, >=  and >< as well! */
1195         ch == Token::EQ || ch == Token::NOT || /* <=>, ==, !=                     */
1196         ch == Token::AND || ch == Token::OR || /* &&, ||, &=, |=                  */
1197         ch == Token::BITNOT || ch == Token::XOR   /* ~=, ~, ^                        */
1198     )  {
1199         lex_tokench(lex, ch);
1200         nextch = lex_getch(lex);
1201
1202         if ((nextch == Token::EQ && ch != Token::LT) || (nextch == Token::LT && ch == Token::GT))
1203             lex_tokench(lex, nextch);
1204         else if (nextch == ch && ch != Token::NOT) {
1205             lex_tokench(lex, nextch);
1206             if ((thirdch = lex_getch(lex)) == Token::EQ)
1207                 lex_tokench(lex, thirdch);
1208             else
1209                 lex_ungetch(lex, thirdch);
1210         } else if (ch == Token::LT && nextch == Token::EQ) {
1211             lex_tokench(lex, nextch);
1212             if ((thirdch = lex_getch(lex)) == Token::GT)
1213                 lex_tokench(lex, thirdch);
1214             else
1215                 lex_ungetch(lex, thirdch);
1216         } else if (ch == Token::AND && nextch == Token::BITNOT) {
1217             thirdch = lex_getch(lex);
1218             if (thirdch != Token::EQ) {
1219                 lex_ungetch(lex, thirdch);
1220                 lex_ungetch(lex, nextch);
1221             }
1222             else {
1223                 lex_tokench(lex, nextch);
1224                 lex_tokench(lex, thirdch);
1225             }
1226         }
1227         else if (lex->flags.preprocessing &&
1228                  ch == Token::SUB && util_isdigit(nextch))
1229         {
1230             lex->tok.ttype = lex_finish_digit(lex, nextch);
1231             if (lex->tok.ttype == Token::INTCONST)
1232                 lex->tok.constval.i = -lex->tok.constval.i;
1233             else
1234                 lex->tok.constval.f = -lex->tok.constval.f;
1235             lex_endtoken(lex);
1236             return lex->tok.ttype;
1237         } else {
1238             lex_ungetch(lex, nextch);
1239         }
1240
1241         lex_endtoken(lex);
1242         return (lex->tok.ttype = Token::OPERATOR);
1243     }
1244
1245     if (ch == Token::MUL || ch == Token::DIV) /* *=, /= */
1246     {
1247         lex_tokench(lex, ch);
1248
1249         nextch = lex_getch(lex);
1250         if (nextch == Token::EQ || nextch == Token::MUL) {
1251             lex_tokench(lex, nextch);
1252         } else
1253             lex_ungetch(lex, nextch);
1254
1255         lex_endtoken(lex);
1256         return (lex->tok.ttype = Token::OPERATOR);
1257     }
1258
1259     if (ch == Token::MOD) {
1260         lex_tokench(lex, ch);
1261         lex_endtoken(lex);
1262         return (lex->tok.ttype = Token::OPERATOR);
1263     }
1264
1265     if (isident_start(ch))
1266     {
1267         const char *v;
1268
1269         lex_tokench(lex, ch);
1270         if (!lex_finish_ident(lex)) {
1271             /* error? */
1272             return (lex->tok.ttype = Token::ERROR);
1273         }
1274         lex_endtoken(lex);
1275         lex->tok.ttype = Token::IDENT;
1276
1277         v = lex->tok.value.c_str();
1278         if (!strcmp(v, "void")) {
1279             lex->tok.ttype = Token::TYPENAME;
1280             lex->tok.constval.t = TYPE_VOID;
1281         } else if (!strcmp(v, "int")) {
1282             lex->tok.ttype = Token::TYPENAME;
1283             lex->tok.constval.t = TYPE_INTEGER;
1284         } else if (!strcmp(v, "float")) {
1285             lex->tok.ttype = Token::TYPENAME;
1286             lex->tok.constval.t = TYPE_FLOAT;
1287         } else if (!strcmp(v, "string")) {
1288             lex->tok.ttype = Token::TYPENAME;
1289             lex->tok.constval.t = TYPE_STRING;
1290         } else if (!strcmp(v, "entity")) {
1291             lex->tok.ttype = Token::TYPENAME;
1292             lex->tok.constval.t = TYPE_ENTITY;
1293         } else if (!strcmp(v, "vector")) {
1294             lex->tok.ttype = Token::TYPENAME;
1295             lex->tok.constval.t = TYPE_VECTOR;
1296         } else if (!strcmp(v, "_length")) {
1297             lex->tok.ttype = Token::OPERATOR;
1298         } else {
1299             size_t kw;
1300             for (kw = 0; kw < GMQCC_ARRAY_COUNT(keywords_qc); ++kw) {
1301                 if (!strcmp(v, keywords_qc[kw]))
1302                     return (lex->tok.ttype = Token::KEYWORD);
1303             }
1304             if (OPTS_OPTION_U32(OPTION_STANDARD) != COMPILER_QCC) {
1305                 for (kw = 0; kw < GMQCC_ARRAY_COUNT(keywords_fg); ++kw) {
1306                     if (!strcmp(v, keywords_fg[kw]))
1307                         return (lex->tok.ttype = Token::KEYWORD);
1308                 }
1309             }
1310         }
1311
1312         return lex->tok.ttype;
1313     }
1314
1315     if (ch == Token::QUOT_DOUBLE)
1316     {
1317         lex->flags.nodigraphs = true;
1318         if (lex->flags.preprocessing)
1319             lex_tokench(lex, ch);
1320         lex->tok.ttype = lex_finish_string(lex, Token::QUOT_DOUBLE);
1321         if (lex->flags.preprocessing)
1322             lex_tokench(lex, ch);
1323         while (!lex->flags.preprocessing && lex->tok.ttype == Token::STRINGCONST)
1324         {
1325             /* Allow c style "string" "continuation" */
1326             ch = lex_skipwhite(lex, false);
1327             if (ch != Token::QUOT_DOUBLE) {
1328                 lex_ungetch(lex, ch);
1329                 break;
1330             }
1331
1332             lex->tok.ttype = lex_finish_string(lex, Token::QUOT_DOUBLE);
1333         }
1334         lex->flags.nodigraphs = false;
1335         lex_endtoken(lex);
1336         return lex->tok.ttype;
1337     }
1338
1339     if (ch == Token::QUOT_SINGLE)
1340     {
1341         /* we parse character constants like string,
1342          * but return Token::CHARCONST, or a vector type if it fits...
1343          * Likewise actual unescaping has to be done by the parser.
1344          * The difference is we don't allow 'char' 'continuation'.
1345          */
1346         if (lex->flags.preprocessing)
1347             lex_tokench(lex, ch);
1348         lex->tok.ttype = lex_finish_string(lex, Token::QUOT_SINGLE);
1349         if (lex->flags.preprocessing)
1350             lex_tokench(lex, ch);
1351         lex_endtoken(lex);
1352
1353         lex->tok.ttype = Token::CHARCONST;
1354
1355         /* It's a vector if we can successfully scan 3 floats */
1356         if (util_sscanf(lex->tok.value.c_str(), " %f %f %f ",
1357                    &lex->tok.constval.v.x, &lex->tok.constval.v.y, &lex->tok.constval.v.z) == 3)
1358
1359         {
1360              lex->tok.ttype = Token::VECTORCONST;
1361         }
1362         else
1363         {
1364             if (!lex->flags.preprocessing && strlen(lex->tok.value.c_str()) > 1) {
1365                 utf8ch_t u8char;
1366                 /* check for a valid utf8 character */
1367                 if (!OPTS_FLAG(UTF8) || !utf8_to(&u8char, (const unsigned char *)lex->tok.value.c_str(), 8)) {
1368                     if (lexwarn(lex, WARN_MULTIBYTE_CHARACTER,
1369                                 ( OPTS_FLAG(UTF8) ? "invalid multibyte character sequence `%s`"
1370                                                   : "multibyte character: `%s`" ),
1371                                 lex->tok.value))
1372                         return (lex->tok.ttype = Token::ERROR);
1373                 }
1374                 else
1375                     lex->tok.constval.i = u8char;
1376             }
1377             else
1378                 lex->tok.constval.i = lex->tok.value.c_str()[0];
1379         }
1380
1381         return lex->tok.ttype;
1382     }
1383
1384     if (util_isdigit(ch))
1385     {
1386         lex->tok.ttype = lex_finish_digit(lex, ch);
1387         lex_endtoken(lex);
1388         return lex->tok.ttype;
1389     }
1390
1391     if (lex->flags.preprocessing) {
1392         lex_tokench(lex, static_cast<int>(ch));
1393         lex_endtoken(lex);
1394         return (lex->tok.ttype = ch);
1395     }
1396
1397     lexerror(lex, "unknown token: `%c`", ch);
1398     return (lex->tok.ttype = Token::ERROR);
1399 }