lexer.c

   1 /*
   2  * Copyright (C) 2012
   3  *     Wolfgang Bumiller
   4  *
   5  * Permission is hereby granted, free of charge, to any person obtaining a copy of
   6  * this software and associated documentation files (the "Software"), to deal in
   7  * the Software without restriction, including without limitation the rights to
   8  * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
   9  * of the Software, and to permit persons to whom the Software is furnished to do
  10  * so, subject to the following conditions:
  11  *
  12  * The above copyright notice and this permission notice shall be included in all
  13  * copies or substantial portions of the Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  18  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  21  * SOFTWARE.
  22  */
  23 #include <stdio.h>
  24 #include <stdlib.h>
  25 #include <string.h>
  26 #include <stdarg.h>
  27
  28 #include "gmqcc.h"
  29 #include "lexer.h"
  30
  31 /*
  32  * List of Keywords
  33  */
  34
  35 /* original */
  36 static const char *keywords_qc[] = {
  37     "for", "do", "while",
  38     "if", "else",
  39     "local",
  40     "return",
  41     "const"
  42 };
  43 static size_t num_keywords_qc = sizeof(keywords_qc) / sizeof(keywords_qc[0]);
  44
/* For fte/gmqcc */
  46 static const char *keywords_fg[] = {
  47     "switch", "case", "default",
  48     "struct", "union",
  49     "break", "continue"
  50 };
  51 static size_t num_keywords_fg = sizeof(keywords_fg) / sizeof(keywords_fg[0]);
  52
  53 /*
  54  * Lexer code
  55  */
  56
  57 char* *lex_filenames;
  58
  59 void lexerror(lex_file *lex, const char *fmt, ...)
  60 {
  61     va_list ap;
  62
  63     va_start(ap, fmt);
  64     if (lex)
  65         con_vprintmsg(LVL_ERROR, lex->name, lex->sline, "parse error", fmt, ap);
  66     else
  67         con_vprintmsg(LVL_ERROR, "", 0, "parse error", fmt, ap);
  68     va_end(ap);
  69 }
  70
  71 bool lexwarn(lex_file *lex, int warntype, const char *fmt, ...)
  72 {
  73     va_list ap;
  74     int lvl = LVL_WARNING;
  75
  76     if (!OPTS_WARN(warntype))
  77         return false;
  78
  79     if (opts_werror)
  80         lvl = LVL_ERROR;
  81
  82     va_start(ap, fmt);
  83     con_vprintmsg(lvl, lex->name, lex->sline, "warning", fmt, ap);
  84     va_end(ap);
  85
  86     return opts_werror;
  87 }
  88
  89
  90 #if 0
  91 token* token_new()
  92 {
  93     token *tok = (token*)mem_a(sizeof(token));
  94     if (!tok)
  95         return NULL;
  96     memset(tok, 0, sizeof(*tok));
  97     return tok;
  98 }
  99
 100 void token_delete(token *self)
 101 {
 102     if (self->next && self->next->prev == self)
 103         self->next->prev = self->prev;
 104     if (self->prev && self->prev->next == self)
 105         self->prev->next = self->next;
 106     MEM_VECTOR_CLEAR(self, value);
 107     mem_d(self);
 108 }
 109
 110 token* token_copy(const token *cp)
 111 {
 112     token* self = token_new();
 113     if (!self)
 114         return NULL;
 115     /* copy the value */
 116     self->value_alloc = cp->value_count + 1;
 117     self->value_count = cp->value_count;
 118     self->value = (char*)mem_a(self->value_alloc);
 119     if (!self->value) {
 120         mem_d(self);
 121         return NULL;
 122     }
 123     memcpy(self->value, cp->value, cp->value_count);
 124     self->value[self->value_alloc-1] = 0;
 125
 126     /* rest */
 127     self->ctx = cp->ctx;
 128     self->ttype = cp->ttype;
 129     memcpy(&self->constval, &cp->constval, sizeof(self->constval));
 130     return self;
 131 }
 132
 133 void token_delete_all(token *t)
 134 {
 135     token *n;
 136
 137     do {
 138         n = t->next;
 139         token_delete(t);
 140         t = n;
 141     } while(t);
 142 }
 143
 144 token* token_copy_all(const token *cp)
 145 {
 146     token *cur;
 147     token *out;
 148
 149     out = cur = token_copy(cp);
 150     if (!out)
 151         return NULL;
 152
 153     while (cp->next) {
 154         cp = cp->next;
 155         cur->next = token_copy(cp);
 156         if (!cur->next) {
 157             token_delete_all(out);
 158             return NULL;
 159         }
 160         cur->next->prev = cur;
 161         cur = cur->next;
 162     }
 163
 164     return out;
 165 }
 166 #else
 167 static void lex_token_new(lex_file *lex)
 168 {
 169 #if 0
 170     if (lex->tok)
 171         token_delete(lex->tok);
 172     lex->tok = token_new();
 173 #else
 174     if (lex->tok.value)
 175         vec_shrinkto(lex->tok.value, 0);
 176     lex->tok.constval.t  = 0;
 177     lex->tok.ctx.line = lex->sline;
 178     lex->tok.ctx.file = lex->name;
 179 #endif
 180 }
 181 #endif
 182
 183 lex_file* lex_open(const char *file)
 184 {
 185     lex_file *lex;
 186     FILE *in = util_fopen(file, "rb");
 187
 188     if (!in) {
 189         lexerror(NULL, "open failed: '%s'\n", file);
 190         return NULL;
 191     }
 192
 193     lex = (lex_file*)mem_a(sizeof(*lex));
 194     if (!lex) {
 195         fclose(in);
 196         lexerror(NULL, "out of memory\n");
 197         return NULL;
 198     }
 199
 200     memset(lex, 0, sizeof(*lex));
 201
 202     lex->file = in;
 203     lex->name = util_strdup(file);
 204     lex->line = 1; /* we start counting at 1 */
 205
 206     lex->peekpos = 0;
 207     lex->eof = false;
 208
 209     vec_push(lex_filenames, lex->name);
 210     return lex;
 211 }
 212
 213 lex_file* lex_open_string(const char *str, size_t len, const char *name)
 214 {
 215     lex_file *lex;
 216
 217     lex = (lex_file*)mem_a(sizeof(*lex));
 218     if (!lex) {
 219         lexerror(NULL, "out of memory\n");
 220         return NULL;
 221     }
 222
 223     memset(lex, 0, sizeof(*lex));
 224
 225     lex->file = NULL;
 226     lex->open_string        = str;
 227     lex->open_string_length = len;
 228     lex->open_string_pos    = 0;
 229
 230     lex->name = util_strdup(name ? name : "<string-source>");
 231     lex->line = 1; /* we start counting at 1 */
 232
 233     lex->peekpos = 0;
 234     lex->eof = false;
 235
 236     vec_push(lex_filenames, lex->name);
 237
 238     return lex;
 239 }
 240
 241 void lex_cleanup(void)
 242 {
 243     size_t i;
 244     for (i = 0; i < vec_size(lex_filenames); ++i)
 245         mem_d(lex_filenames[i]);
 246     vec_free(lex_filenames);
 247 }
 248
 249 void lex_close(lex_file *lex)
 250 {
 251     size_t i;
 252     for (i = 0; i < vec_size(lex->frames); ++i)
 253         mem_d(lex->frames[i].name);
 254     vec_free(lex->frames);
 255
 256     if (lex->modelname)
 257         vec_free(lex->modelname);
 258
 259     if (lex->file)
 260         fclose(lex->file);
 261 #if 0
 262     if (lex->tok)
 263         token_delete(lex->tok);
 264 #else
 265     vec_free(lex->tok.value);
 266 #endif
 267     /* mem_d(lex->name); collected in lex_filenames */
 268     mem_d(lex);
 269 }
 270
 271 static int lex_fgetc(lex_file *lex)
 272 {
 273     if (lex->file)
 274         return fgetc(lex->file);
 275     if (lex->open_string) {
 276         if (lex->open_string_pos >= lex->open_string_length)
 277             return EOF;
 278         return lex->open_string[lex->open_string_pos++];
 279     }
 280     return EOF;
 281 }
 282
/* Get or put-back data
 * The following two functions do NOT understand what kind of data they
 * are working on.
 * They are merely wrapping get/put in order to count line numbers.
 */
 288 static void lex_ungetch(lex_file *lex, int ch);
 289 static int lex_try_trigraph(lex_file *lex, int old)
 290 {
 291     int c2, c3;
 292     c2 = lex_fgetc(lex);
 293     if (c2 != '?') {
 294         lex_ungetch(lex, c2);
 295         return old;
 296     }
 297
 298     c3 = lex_fgetc(lex);
 299     switch (c3) {
 300         case '=': return '#';
 301         case '/': return '\\';
 302         case '\'': return '^';
 303         case '(': return '[';
 304         case ')': return ']';
 305         case '!': return '|';
 306         case '<': return '{';
 307         case '>': return '}';
 308         case '-': return '~';
 309         default:
 310             lex_ungetch(lex, c3);
 311             lex_ungetch(lex, c2);
 312             return old;
 313     }
 314 }
 315
 316 static int lex_try_digraph(lex_file *lex, int ch)
 317 {
 318     int c2;
 319     c2 = lex_fgetc(lex);
 320     if      (ch == '<' && c2 == ':')
 321         return '[';
 322     else if (ch == ':' && c2 == '>')
 323         return ']';
 324     else if (ch == '<' && c2 == '%')
 325         return '{';
 326     else if (ch == '%' && c2 == '>')
 327         return '}';
 328     else if (ch == '%' && c2 == ':')
 329         return '#';
 330     lex_ungetch(lex, c2);
 331     return ch;
 332 }
 333
 334 static int lex_getch(lex_file *lex)
 335 {
 336     int ch;
 337
 338     if (lex->peekpos) {
 339         lex->peekpos--;
 340         if (!lex->push_line && lex->peek[lex->peekpos] == '\n')
 341             lex->line++;
 342         return lex->peek[lex->peekpos];
 343     }
 344
 345     ch = lex_fgetc(lex);
 346     if (!lex->push_line && ch == '\n')
 347         lex->line++;
 348     else if (ch == '?')
 349         return lex_try_trigraph(lex, ch);
 350     else if (!lex->flags.nodigraphs && (ch == '<' || ch == ':' || ch == '%'))
 351         return lex_try_digraph(lex, ch);
 352     return ch;
 353 }
 354
 355 static void lex_ungetch(lex_file *lex, int ch)
 356 {
 357     lex->peek[lex->peekpos++] = ch;
 358     if (!lex->push_line && ch == '\n')
 359         lex->line--;
 360 }
 361
 362 /* classify characters
 363  * some additions to the is*() functions of ctype.h
 364  */
 365
 366 /* Idents are alphanumberic, but they start with alpha or _ */
 367 static bool isident_start(int ch)
 368 {
 369     return isalpha(ch) || ch == '_';
 370 }
 371
 372 static bool isident(int ch)
 373 {
 374     return isident_start(ch) || isdigit(ch);
 375 }
 376
 377 /* isxdigit_only is used when we already know it's not a digit
 378  * and want to see if it's a hex digit anyway.
 379  */
 380 static bool isxdigit_only(int ch)
 381 {
 382     return (ch >= 'a' && ch <= 'f') || (ch >= 'A' && ch <= 'F');
 383 }
 384
 385 /* Append a character to the token buffer */
 386 static void lex_tokench(lex_file *lex, int ch)
 387 {
 388     vec_push(lex->tok.value, ch);
 389 }
 390
 391 /* Append a trailing null-byte */
 392 static void lex_endtoken(lex_file *lex)
 393 {
 394     vec_push(lex->tok.value, 0);
 395     vec_shrinkby(lex->tok.value, 1);
 396 }
 397
 398 static bool lex_try_pragma(lex_file *lex)
 399 {
 400     int ch;
 401     char *pragma  = NULL;
 402     char *command = NULL;
 403     char *param   = NULL;
 404     size_t line;
 405
 406     if (lex->flags.preprocessing)
 407         return false;
 408
 409     line = lex->line;
 410
 411     ch = lex_getch(lex);
 412     if (ch != '#') {
 413         lex_ungetch(lex, ch);
 414         return false;
 415     }
 416
 417     for (ch = lex_getch(lex); vec_size(pragma) < 8 && ch >= 'a' && ch <= 'z'; ch = lex_getch(lex))
 418         vec_push(pragma, ch);
 419     vec_push(pragma, 0);
 420
 421     if (ch != ' ' || strcmp(pragma, "pragma")) {
 422         lex_ungetch(lex, ch);
 423         goto unroll;
 424     }
 425
 426     for (ch = lex_getch(lex); vec_size(command) < 32 && ch >= 'a' && ch <= 'z'; ch = lex_getch(lex))
 427         vec_push(command, ch);
 428     vec_push(command, 0);
 429
 430     if (ch != '(') {
 431         lex_ungetch(lex, ch);
 432         goto unroll;
 433     }
 434
 435     for (ch = lex_getch(lex); vec_size(param) < 32 && ch != ')' && ch != '\n'; ch = lex_getch(lex))
 436         vec_push(param, ch);
 437     vec_push(param, 0);
 438
 439     if (ch != ')') {
 440         lex_ungetch(lex, ch);
 441         goto unroll;
 442     }
 443
 444     if (!strcmp(command, "push")) {
 445         if (!strcmp(param, "line")) {
 446             lex->push_line++;
 447             --line;
 448         }
 449         else
 450             goto unroll;
 451     }
 452     else if (!strcmp(command, "pop")) {
 453         if (!strcmp(param, "line")) {
 454             if (lex->push_line)
 455                 lex->push_line--;
 456             --line;
 457         }
 458         else
 459             goto unroll;
 460     }
 461     else if (!strcmp(command, "file")) {
 462         lex->name = util_strdup(param);
 463         vec_push(lex_filenames, lex->name);
 464     }
 465     else if (!strcmp(command, "line")) {
 466         line = strtol(param, NULL, 0)-1;
 467     }
 468     else
 469         goto unroll;
 470
 471     lex->line = line;
 472     while (ch != '\n' && ch != EOF)
 473         ch = lex_getch(lex);
 474     return true;
 475
 476 unroll:
 477     if (command) {
 478         vec_pop(command);
 479         while (vec_size(command)) {
 480             lex_ungetch(lex, vec_last(command));
 481             vec_pop(command);
 482         }
 483         vec_free(command);
 484     }
 485     if (command) {
 486         vec_pop(command);
 487         while (vec_size(command)) {
 488             lex_ungetch(lex, vec_last(command));
 489             vec_pop(command);
 490         }
 491         vec_free(command);
 492     }
 493     if (pragma) {
 494         vec_pop(pragma);
 495         while (vec_size(pragma)) {
 496             lex_ungetch(lex, vec_last(pragma));
 497             vec_pop(pragma);
 498         }
 499         vec_free(pragma);
 500     }
 501     lex_ungetch(lex, '#');
 502
 503     lex->line = line;
 504     return false;
 505 }
 506
 507 /* Skip whitespace and comments and return the first
 508  * non-white character.
 509  * As this makes use of the above getch() ungetch() functions,
 510  * we don't need to care at all about line numbering anymore.
 511  *
 512  * In theory, this function should only be used at the beginning
 513  * of lexing, or when we *know* the next character is part of the token.
 514  * Otherwise, if the parser throws an error, the linenumber may not be
 515  * the line of the error, but the line of the next token AFTER the error.
 516  *
 517  * This is currently only problematic when using c-like string-continuation,
 518  * since comments and whitespaces are allowed between 2 such strings.
 519  * Example:
 520 printf(   "line one\n"
 521 // A comment
 522           "A continuation of the previous string"
 523 // This line is skipped
 524       , foo);
 525
 * In this case, if the parser decides it didn't actually want a string,
 * and uses lex->line to print an error, it will show the ', foo);' line's
 * line number.
 529  *
 * On the other hand, the parser is supposed to remember the line of the next
 * token's beginning. In this case we would want skipwhite() to be called
 * AFTER reading a token, so that the parser, before reading the NEXT token,
 * doesn't store the *comment's* line number, but the actual token's line number.
 534  *
 535  * THIS SOLUTION
 536  *    here is to store the line of the first character after skipping
 537  *    the initial whitespace in lex->sline, this happens in lex_do.
 538  */
 539 static int lex_skipwhite(lex_file *lex)
 540 {
 541     int ch = 0;
 542     bool haswhite = false;
 543
 544     do
 545     {
 546         ch = lex_getch(lex);
 547         while (ch != EOF && isspace(ch)) {
 548             if (ch == '\n') {
 549                 if (lex_try_pragma(lex))
 550                     continue;
 551             }
 552             if (lex->flags.preprocessing) {
 553                 if (ch == '\n') {
 554                     /* end-of-line */
 555                     /* see if there was whitespace first */
 556                     if (haswhite) { /* (vec_size(lex->tok.value)) { */
 557                         lex_ungetch(lex, ch);
 558                         lex_endtoken(lex);
 559                         return TOKEN_WHITE;
 560                     }
 561                     /* otherwise return EOL */
 562                     return TOKEN_EOL;
 563                 }
 564                 haswhite = true;
 565                 lex_tokench(lex, ch);
 566             }
 567             ch = lex_getch(lex);
 568         }
 569
 570         if (ch == '/') {
 571             ch = lex_getch(lex);
 572             if (ch == '/')
 573             {
 574                 /* one line comment */
 575                 ch = lex_getch(lex);
 576
 577                 if (lex->flags.preprocessing) {
 578                     haswhite = true;
 579                     /*
 580                     lex_tokench(lex, '/');
 581                     lex_tokench(lex, '/');
 582                     */
 583                     lex_tokench(lex, ' ');
 584                     lex_tokench(lex, ' ');
 585                 }
 586
 587                 while (ch != EOF && ch != '\n') {
 588                     if (lex->flags.preprocessing)
 589                         lex_tokench(lex, ' '); /* ch); */
 590                     ch = lex_getch(lex);
 591                 }
 592                 if (lex->flags.preprocessing) {
 593                     lex_ungetch(lex, '\n');
 594                     lex_endtoken(lex);
 595                     return TOKEN_WHITE;
 596                 }
 597                 continue;
 598             }
 599             if (ch == '*')
 600             {
 601                 /* multiline comment */
 602                 if (lex->flags.preprocessing) {
 603                     haswhite = true;
 604                     /*
 605                     lex_tokench(lex, '/');
 606                     lex_tokench(lex, '*');
 607                     */
 608                     lex_tokench(lex, ' ');
 609                     lex_tokench(lex, ' ');
 610                 }
 611
 612                 while (ch != EOF)
 613                 {
 614                     ch = lex_getch(lex);
 615                     if (ch == '*') {
 616                         ch = lex_getch(lex);
 617                         if (ch == '/') {
 618                             if (lex->flags.preprocessing) {
 619                                 /*
 620                                 lex_tokench(lex, '*');
 621                                 lex_tokench(lex, '/');
 622                                 */
 623                                 lex_tokench(lex, ' ');
 624                                 lex_tokench(lex, ' ');
 625                             }
 626                             break;
 627                         }
 628                         lex_ungetch(lex, ch);
 629                     }
 630                     if (lex->flags.preprocessing) {
 631                         if (ch == '\n')
 632                             lex_tokench(lex, '\n');
 633                         else
 634                             lex_tokench(lex, ' '); /* ch); */
 635                     }
 636                 }
 637                 ch = ' '; /* cause TRUE in the isspace check */
 638                 continue;
 639             }
 640             /* Otherwise roll back to the slash and break out of the loop */
 641             lex_ungetch(lex, ch);
 642             ch = '/';
 643             break;
 644         }
 645     } while (ch != EOF && isspace(ch));
 646
 647     if (haswhite) {
 648         lex_endtoken(lex);
 649         lex_ungetch(lex, ch);
 650         return TOKEN_WHITE;
 651     }
 652     return ch;
 653 }
 654
 655 /* Get a token */
 656 static bool GMQCC_WARN lex_finish_ident(lex_file *lex)
 657 {
 658     int ch;
 659
 660     ch = lex_getch(lex);
 661     while (ch != EOF && isident(ch))
 662     {
 663         lex_tokench(lex, ch);
 664         ch = lex_getch(lex);
 665     }
 666
 667     /* last ch was not an ident ch: */
 668     lex_ungetch(lex, ch);
 669
 670     return true;
 671 }
 672
 673 /* read one ident for the frame list */
 674 static int lex_parse_frame(lex_file *lex)
 675 {
 676     int ch;
 677
 678     lex_token_new(lex);
 679
 680     ch = lex_getch(lex);
 681     while (ch != EOF && ch != '\n' && isspace(ch))
 682         ch = lex_getch(lex);
 683
 684     if (ch == '\n')
 685         return 1;
 686
 687     if (!isident_start(ch)) {
 688         lexerror(lex, "invalid framename, must start with one of a-z or _, got %c", ch);
 689         return -1;
 690     }
 691
 692     lex_tokench(lex, ch);
 693     if (!lex_finish_ident(lex))
 694         return -1;
 695     lex_endtoken(lex);
 696     return 0;
 697 }
 698
 699 /* read a list of $frames */
 700 static bool lex_finish_frames(lex_file *lex)
 701 {
 702     do {
 703         size_t i;
 704         int    rc;
 705         frame_macro m;
 706
 707         rc = lex_parse_frame(lex);
 708         if (rc > 0) /* end of line */
 709             return true;
 710         if (rc < 0) /* error */
 711             return false;
 712
 713         for (i = 0; i < vec_size(lex->frames); ++i) {
 714             if (!strcmp(lex->tok.value, lex->frames[i].name)) {
 715                 lex->frames[i].value = lex->framevalue++;
 716                 if (lexwarn(lex, WARN_FRAME_MACROS, "duplicate frame macro defined: `%s`", lex->tok.value))
 717                     return false;
 718                 break;
 719             }
 720         }
 721         if (i < vec_size(lex->frames))
 722             continue;
 723
 724         m.value = lex->framevalue++;
 725         m.name = util_strdup(lex->tok.value);
 726         vec_shrinkto(lex->tok.value, 0);
 727         vec_push(lex->frames, m);
 728     } while (true);
 729 }
 730
 731 static int GMQCC_WARN lex_finish_string(lex_file *lex, int quote)
 732 {
 733     int ch = 0;
 734
 735     while (ch != EOF)
 736     {
 737         ch = lex_getch(lex);
 738         if (ch == quote)
 739             return TOKEN_STRINGCONST;
 740
 741         if (lex->flags.preprocessing && ch == '\\') {
 742             lex_tokench(lex, ch);
 743             ch = lex_getch(lex);
 744             if (ch == EOF) {
 745                 lexerror(lex, "unexpected end of file");
 746                 lex_ungetch(lex, EOF); /* next token to be TOKEN_EOF */
 747                 return (lex->tok.ttype = TOKEN_ERROR);
 748             }
 749             lex_tokench(lex, ch);
 750         }
 751         else if (ch == '\\') {
 752             ch = lex_getch(lex);
 753             if (ch == EOF) {
 754                 lexerror(lex, "unexpected end of file");
 755                 lex_ungetch(lex, EOF); /* next token to be TOKEN_EOF */
 756                 return (lex->tok.ttype = TOKEN_ERROR);
 757             }
 758
 759             switch (ch) {
 760             case '\\': break;
 761             case '\'': break;
 762             case '"':  break;
 763             case 'a':  ch = '\a'; break;
 764             case 'b':  ch = '\b'; break;
 765             case 'r':  ch = '\r'; break;
 766             case 'n':  ch = '\n'; break;
 767             case 't':  ch = '\t'; break;
 768             case 'f':  ch = '\f'; break;
 769             case 'v':  ch = '\v'; break;
 770             default:
 771                 lexwarn(lex, WARN_UNKNOWN_CONTROL_SEQUENCE, "unrecognized control sequence: \\%c", ch);
 772                 /* so we just add the character plus backslash no matter what it actually is */
 773                 lex_tokench(lex, '\\');
 774             }
 775             /* add the character finally */
 776             lex_tokench(lex, ch);
 777         }
 778         else
 779             lex_tokench(lex, ch);
 780     }
 781     lexerror(lex, "unexpected end of file within string constant");
 782     lex_ungetch(lex, EOF); /* next token to be TOKEN_EOF */
 783     return (lex->tok.ttype = TOKEN_ERROR);
 784 }
 785
 786 static int GMQCC_WARN lex_finish_digit(lex_file *lex, int lastch)
 787 {
 788     bool ishex = false;
 789
 790     int  ch = lastch;
 791
 792     /* parse a number... */
 793     lex->tok.ttype = TOKEN_INTCONST;
 794
 795     lex_tokench(lex, ch);
 796
 797     ch = lex_getch(lex);
 798     if (ch != '.' && !isdigit(ch))
 799     {
 800         if (lastch != '0' || ch != 'x')
 801         {
 802             /* end of the number or EOF */
 803             lex_ungetch(lex, ch);
 804             lex_endtoken(lex);
 805
 806             lex->tok.constval.i = lastch - '0';
 807             return lex->tok.ttype;
 808         }
 809
 810         ishex = true;
 811     }
 812
 813     /* EOF would have been caught above */
 814
 815     if (ch != '.')
 816     {
 817         lex_tokench(lex, ch);
 818         ch = lex_getch(lex);
 819         while (isdigit(ch) || (ishex && isxdigit_only(ch)))
 820         {
 821             lex_tokench(lex, ch);
 822             ch = lex_getch(lex);
 823         }
 824     }
 825     /* NOT else, '.' can come from above as well */
 826     if (ch == '.' && !ishex)
 827     {
 828         /* Allow floating comma in non-hex mode */
 829         lex->tok.ttype = TOKEN_FLOATCONST;
 830         lex_tokench(lex, ch);
 831
 832         /* continue digits-only */
 833         ch = lex_getch(lex);
 834         while (isdigit(ch))
 835         {
 836             lex_tokench(lex, ch);
 837             ch = lex_getch(lex);
 838         }
 839     }
 840     /* put back the last character */
 841     /* but do not put back the trailing 'f' or a float */
 842     if (lex->tok.ttype == TOKEN_FLOATCONST && ch == 'f')
 843         ch = lex_getch(lex);
 844
 845     /* generally we don't want words to follow numbers: */
 846     if (isident(ch)) {
 847         lexerror(lex, "unexpected trailing characters after number");
 848         return (lex->tok.ttype = TOKEN_ERROR);
 849     }
 850     lex_ungetch(lex, ch);
 851
 852     lex_endtoken(lex);
 853     if (lex->tok.ttype == TOKEN_FLOATCONST)
 854         lex->tok.constval.f = strtod(lex->tok.value, NULL);
 855     else
 856         lex->tok.constval.i = strtol(lex->tok.value, NULL, 0);
 857     return lex->tok.ttype;
 858 }
 859
 860 int lex_do(lex_file *lex)
 861 {
 862     int ch, nextch, thirdch;
 863
 864     lex_token_new(lex);
 865 #if 0
 866     if (!lex->tok)
 867         return TOKEN_FATAL;
 868 #endif
 869
 870     while (true) {
 871         ch = lex_skipwhite(lex);
 872         if (!lex->flags.mergelines || ch != '\\')
 873             break;
 874         ch = lex_getch(lex);
 875         if (ch != '\n') {
 876             lex_ungetch(lex, ch);
 877             ch = '\\';
 878             break;
 879         }
 880         /* we reached a linemerge */
 881         lex_tokench(lex, '\n');
 882         continue;
 883     }
 884
 885     lex->sline = lex->line;
 886     lex->tok.ctx.line = lex->sline;
 887     lex->tok.ctx.file = lex->name;
 888
 889     if (lex->flags.preprocessing && (ch == TOKEN_WHITE || ch == TOKEN_EOL || ch == TOKEN_FATAL)) {
 890         return (lex->tok.ttype = ch);
 891     }
 892
 893     if (lex->eof)
 894         return (lex->tok.ttype = TOKEN_FATAL);
 895
 896     if (ch == EOF) {
 897         lex->eof = true;
 898         return (lex->tok.ttype = TOKEN_EOF);
 899     }
 900
 901     /* modelgen / spiritgen commands */
 902     if (ch == '$') {
 903         const char *v;
 904         size_t frame;
 905
 906         ch = lex_getch(lex);
 907         if (!isident_start(ch)) {
 908             lexerror(lex, "hanging '$' modelgen/spritegen command line");
 909             return lex_do(lex);
 910         }
 911         lex_tokench(lex, ch);
 912         if (!lex_finish_ident(lex))
 913             return (lex->tok.ttype = TOKEN_ERROR);
 914         lex_endtoken(lex);
 915         /* skip the known commands */
 916         v = lex->tok.value;
 917
 918         if (!strcmp(v, "frame") || !strcmp(v, "framesave"))
 919         {
 920             /* frame/framesave command works like an enum
 921              * similar to fteqcc we handle this in the lexer.
 922              * The reason for this is that it is sensitive to newlines,
 923              * which the parser is unaware of
 924              */
 925             if (!lex_finish_frames(lex))
 926                  return (lex->tok.ttype = TOKEN_ERROR);
 927             return lex_do(lex);
 928         }
 929
 930         if (!strcmp(v, "framevalue"))
 931         {
 932             ch = lex_getch(lex);
 933             while (ch != EOF && isspace(ch) && ch != '\n')
 934                 ch = lex_getch(lex);
 935
 936             if (!isdigit(ch)) {
 937                 lexerror(lex, "$framevalue requires an integer parameter");
 938                 return lex_do(lex);
 939             }
 940
 941             lex_token_new(lex);
 942             lex->tok.ttype = lex_finish_digit(lex, ch);
 943             lex_endtoken(lex);
 944             if (lex->tok.ttype != TOKEN_INTCONST) {
 945                 lexerror(lex, "$framevalue requires an integer parameter");
 946                 return lex_do(lex);
 947             }
 948             lex->framevalue = lex->tok.constval.i;
 949             return lex_do(lex);
 950         }
 951
 952         if (!strcmp(v, "framerestore"))
 953         {
 954             int rc;
 955
 956             lex_token_new(lex);
 957
 958             rc = lex_parse_frame(lex);
 959
 960             if (rc > 0) {
 961                 lexerror(lex, "$framerestore requires a framename parameter");
 962                 return lex_do(lex);
 963             }
 964             if (rc < 0)
 965                 return (lex->tok.ttype = TOKEN_FATAL);
 966
 967             v = lex->tok.value;
 968             for (frame = 0; frame < vec_size(lex->frames); ++frame) {
 969                 if (!strcmp(v, lex->frames[frame].name)) {
 970                     lex->framevalue = lex->frames[frame].value;
 971                     return lex_do(lex);
 972                 }
 973             }
 974             lexerror(lex, "unknown framename `%s`", v);
 975             return lex_do(lex);
 976         }
 977
 978         if (!strcmp(v, "modelname"))
 979         {
 980             int rc;
 981
 982             lex_token_new(lex);
 983
 984             rc = lex_parse_frame(lex);
 985
 986             if (rc > 0) {
 987                 lexerror(lex, "$modelname requires a parameter");
 988                 return lex_do(lex);
 989             }
 990             if (rc < 0)
 991                 return (lex->tok.ttype = TOKEN_FATAL);
 992
 993             v = lex->tok.value;
 994             if (lex->modelname) {
 995                 frame_macro m;
 996                 m.value = lex->framevalue;
 997                 m.name = lex->modelname;
 998                 lex->modelname = NULL;
 999                 vec_push(lex->frames, m);
1000             }
1001             lex->modelname = lex->tok.value;
1002             lex->tok.value = NULL;
1003             return lex_do(lex);
1004         }
1005
1006         if (!strcmp(v, "flush"))
1007         {
1008             size_t fi;
1009             for (fi = 0; fi < vec_size(lex->frames); ++fi)
1010                 mem_d(lex->frames[fi].name);
1011             vec_free(lex->frames);
1012             /* skip line (fteqcc does it too) */
1013             ch = lex_getch(lex);
1014             while (ch != EOF && ch != '\n')
1015                 ch = lex_getch(lex);
1016             return lex_do(lex);
1017         }
1018
1019         if (!strcmp(v, "cd") ||
1020             !strcmp(v, "origin") ||
1021             !strcmp(v, "base") ||
1022             !strcmp(v, "flags") ||
1023             !strcmp(v, "scale") ||
1024             !strcmp(v, "skin"))
1025         {
1026             /* skip line */
1027             ch = lex_getch(lex);
1028             while (ch != EOF && ch != '\n')
1029                 ch = lex_getch(lex);
1030             return lex_do(lex);
1031         }
1032
1033         for (frame = 0; frame < vec_size(lex->frames); ++frame) {
1034             if (!strcmp(v, lex->frames[frame].name)) {
1035                 lex->tok.constval.i = lex->frames[frame].value;
1036                 return (lex->tok.ttype = TOKEN_INTCONST);
1037             }
1038         }
1039
1040         lexerror(lex, "invalid frame macro");
1041         return lex_do(lex);
1042     }
1043
1044     /* single-character tokens */
1045     switch (ch)
1046     {
1047         case '[':
1048         case '(':
1049         case ':':
1050         case '?':
1051             lex_tokench(lex, ch);
1052             lex_endtoken(lex);
1053             if (lex->flags.noops)
1054                 return (lex->tok.ttype = ch);
1055             else
1056                 return (lex->tok.ttype = TOKEN_OPERATOR);
1057         case ')':
1058         case ';':
1059         case '{':
1060         case '}':
1061         case ']':
1062
1063         case '#':
1064             lex_tokench(lex, ch);
1065             lex_endtoken(lex);
1066             return (lex->tok.ttype = ch);
1067         default:
1068             break;
1069     }
1070
1071     if (lex->flags.noops)
1072     {
1073         /* Detect characters early which are normally
1074          * operators OR PART of an operator.
1075          */
1076         switch (ch)
1077         {
1078             case '+':
1079             case '-':
1080             case '*':
1081             case '/':
1082             case '<':
1083             case '>':
1084             case '=':
1085             case '&':
1086             case '|':
1087             case '^':
1088             case '~':
1089             case ',':
1090             case '!':
1091                 lex_tokench(lex, ch);
1092                 lex_endtoken(lex);
1093                 return (lex->tok.ttype = ch);
1094             default:
1095                 break;
1096         }
1097
1098         if (ch == '.')
1099         {
1100             lex_tokench(lex, ch);
1101             /* peak ahead once */
1102             nextch = lex_getch(lex);
1103             if (nextch != '.') {
1104                 lex_ungetch(lex, nextch);
1105                 lex_endtoken(lex);
1106                 return (lex->tok.ttype = ch);
1107             }
1108             /* peak ahead again */
1109             nextch = lex_getch(lex);
1110             if (nextch != '.') {
1111                 lex_ungetch(lex, nextch);
1112                 lex_ungetch(lex, nextch);
1113                 lex_endtoken(lex);
1114                 return (lex->tok.ttype = ch);
1115             }
1116             /* fill the token to be "..." */
1117             lex_tokench(lex, ch);
1118             lex_tokench(lex, ch);
1119             lex_endtoken(lex);
1120             return (lex->tok.ttype = TOKEN_DOTS);
1121         }
1122     }
1123
1124     if (ch == ',' || ch == '.') {
1125         lex_tokench(lex, ch);
1126         lex_endtoken(lex);
1127         return (lex->tok.ttype = TOKEN_OPERATOR);
1128     }
1129
1130     if (ch == '+' || ch == '-' || /* ++, --, +=, -=  and -> as well! */
1131         ch == '>' || ch == '<' || /* <<, >>, <=, >= */
1132         ch == '=' || ch == '!' || /* ==, != */
1133         ch == '&' || ch == '|')   /* &&, ||, &=, |= */
1134     {
1135         lex_tokench(lex, ch);
1136
1137         nextch = lex_getch(lex);
1138         if (nextch == ch || nextch == '=') {
1139             lex_tokench(lex, nextch);
1140         } else if (ch == '-' && nextch == '>') {
1141             lex_tokench(lex, nextch);
1142         } else if (ch == '&' && nextch == '~') {
1143             thirdch = lex_getch(lex);
1144             if (thirdch != '=') {
1145                 lex_ungetch(lex, thirdch);
1146                 lex_ungetch(lex, nextch);
1147             }
1148             else {
1149                 lex_tokench(lex, nextch);
1150                 lex_tokench(lex, thirdch);
1151             }
1152         } else
1153             lex_ungetch(lex, nextch);
1154
1155         lex_endtoken(lex);
1156         return (lex->tok.ttype = TOKEN_OPERATOR);
1157     }
1158
1159     /*
1160     if (ch == '^' || ch == '~' || ch == '!')
1161     {
1162         lex_tokench(lex, ch);
1163         lex_endtoken(lex);
1164         return (lex->tok.ttype = TOKEN_OPERATOR);
1165     }
1166     */
1167
1168     if (ch == '*' || ch == '/') /* *=, /= */
1169     {
1170         lex_tokench(lex, ch);
1171
1172         nextch = lex_getch(lex);
1173         if (nextch == '=') {
1174             lex_tokench(lex, nextch);
1175         } else
1176             lex_ungetch(lex, nextch);
1177
1178         lex_endtoken(lex);
1179         return (lex->tok.ttype = TOKEN_OPERATOR);
1180     }
1181
1182     if (isident_start(ch))
1183     {
1184         const char *v;
1185
1186         lex_tokench(lex, ch);
1187         if (!lex_finish_ident(lex)) {
1188             /* error? */
1189             return (lex->tok.ttype = TOKEN_ERROR);
1190         }
1191         lex_endtoken(lex);
1192         lex->tok.ttype = TOKEN_IDENT;
1193
1194         v = lex->tok.value;
1195         if (!strcmp(v, "void")) {
1196             lex->tok.ttype = TOKEN_TYPENAME;
1197             lex->tok.constval.t = TYPE_VOID;
1198         } else if (!strcmp(v, "int")) {
1199             lex->tok.ttype = TOKEN_TYPENAME;
1200             lex->tok.constval.t = TYPE_INTEGER;
1201         } else if (!strcmp(v, "float")) {
1202             lex->tok.ttype = TOKEN_TYPENAME;
1203             lex->tok.constval.t = TYPE_FLOAT;
1204         } else if (!strcmp(v, "string")) {
1205             lex->tok.ttype = TOKEN_TYPENAME;
1206             lex->tok.constval.t = TYPE_STRING;
1207         } else if (!strcmp(v, "entity")) {
1208             lex->tok.ttype = TOKEN_TYPENAME;
1209             lex->tok.constval.t = TYPE_ENTITY;
1210         } else if (!strcmp(v, "vector")) {
1211             lex->tok.ttype = TOKEN_TYPENAME;
1212             lex->tok.constval.t = TYPE_VECTOR;
1213         } else {
1214             size_t kw;
1215             for (kw = 0; kw < num_keywords_qc; ++kw) {
1216                 if (!strcmp(v, keywords_qc[kw]))
1217                     return (lex->tok.ttype = TOKEN_KEYWORD);
1218             }
1219             if (opts_standard != COMPILER_QCC) {
1220                 for (kw = 0; kw < num_keywords_fg; ++kw) {
1221                     if (!strcmp(v, keywords_fg[kw]))
1222                         return (lex->tok.ttype = TOKEN_KEYWORD);
1223                 }
1224             }
1225         }
1226
1227         return lex->tok.ttype;
1228     }
1229
1230     if (ch == '"')
1231     {
1232         lex->flags.nodigraphs = true;
1233         if (lex->flags.preprocessing)
1234             lex_tokench(lex, ch);
1235         lex->tok.ttype = lex_finish_string(lex, '"');
1236         if (lex->flags.preprocessing)
1237             lex_tokench(lex, ch);
1238         while (!lex->flags.preprocessing && lex->tok.ttype == TOKEN_STRINGCONST)
1239         {
1240             /* Allow c style "string" "continuation" */
1241             ch = lex_skipwhite(lex);
1242             if (ch != '"') {
1243                 lex_ungetch(lex, ch);
1244                 break;
1245             }
1246
1247             lex->tok.ttype = lex_finish_string(lex, '"');
1248         }
1249         lex->flags.nodigraphs = false;
1250         lex_endtoken(lex);
1251         return lex->tok.ttype;
1252     }
1253
1254     if (ch == '\'')
1255     {
1256         /* we parse character constants like string,
1257          * but return TOKEN_CHARCONST, or a vector type if it fits...
1258          * Likewise actual unescaping has to be done by the parser.
1259          * The difference is we don't allow 'char' 'continuation'.
1260          */
1261         if (lex->flags.preprocessing)
1262             lex_tokench(lex, ch);
1263         lex->tok.ttype = lex_finish_string(lex, '\'');
1264         if (lex->flags.preprocessing)
1265             lex_tokench(lex, ch);
1266         lex_endtoken(lex);
1267
1268          /* It's a vector if we can successfully scan 3 floats */
1269 #ifdef WIN32
1270         if (sscanf_s(lex->tok.value, " %f %f %f ",
1271                    &lex->tok.constval.v.x, &lex->tok.constval.v.y, &lex->tok.constval.v.z) == 3)
1272 #else
1273         if (sscanf(lex->tok.value, " %f %f %f ",
1274                    &lex->tok.constval.v.x, &lex->tok.constval.v.y, &lex->tok.constval.v.z) == 3)
1275 #endif
1276
1277         {
1278              lex->tok.ttype = TOKEN_VECTORCONST;
1279         }
1280
1281         return lex->tok.ttype;
1282     }
1283
1284     if (isdigit(ch))
1285     {
1286         lex->tok.ttype = lex_finish_digit(lex, ch);
1287         lex_endtoken(lex);
1288         return lex->tok.ttype;
1289     }
1290
1291     lexerror(lex, "unknown token");
1292     return (lex->tok.ttype = TOKEN_ERROR);
1293 }