1/* 2 * This is a really stupid C tokenizer. It doesn't do any include 3 * files or anything complex at all. That's the preprocessor. 4 * 5 * Copyright (C) 2003 Transmeta Corp. 6 * 2003 Linus Torvalds 7 * 8 * Permission is hereby granted, free of charge, to any person obtaining a copy 9 * of this software and associated documentation files (the "Software"), to deal 10 * in the Software without restriction, including without limitation the rights 11 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 * copies of the Software, and to permit persons to whom the Software is 13 * furnished to do so, subject to the following conditions: 14 * 15 * The above copyright notice and this permission notice shall be included in 16 * all copies or substantial portions of the Software. 17 * 18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 21 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 24 * THE SOFTWARE. 25 */ 26#include <stdio.h> 27#include <stdlib.h> 28#include <stdarg.h> 29#include <stddef.h> 30#include <string.h> 31#include <ctype.h> 32#include <unistd.h> 33#include <stdint.h> 34 35#include "lib.h" 36#include "allocate.h" 37#include "token.h" 38#include "symbol.h" 39 40#define EOF (-1) 41 42int input_stream_nr = 0; 43struct stream *input_streams; 44static int input_streams_allocated; 45unsigned int tabstop = 8; 46 47#define BUFSIZE (8192) 48 49typedef struct { 50 int fd, offset, size; 51 int pos, line, nr; 52 int newline, whitespace; 53 struct token **tokenlist; 54 struct token *token; 55 unsigned char *buffer; 56} stream_t; 57 58const char *stream_name(int stream) 59{ 60 if (stream < 0 || stream > input_stream_nr) 61 return "<bad stream>"; 62 return input_streams[stream].name; 63} 64 65int stream_prev(int stream) 66{ 67 if (stream < 0 || stream > input_stream_nr) 68 return -1; 69 stream = input_streams[stream].pos.stream; 70 if (stream > input_stream_nr) 71 return -1; 72 return stream; 73} 74 75static struct position stream_pos(stream_t *stream) 76{ 77 struct position pos; 78 pos.type = 0; 79 pos.stream = stream->nr; 80 pos.newline = stream->newline; 81 pos.whitespace = stream->whitespace; 82 pos.pos = stream->pos; 83 pos.line = stream->line; 84 pos.noexpand = 0; 85 return pos; 86} 87 88const char *show_special(int val) 89{ 90 static char buffer[4]; 91 92 buffer[0] = val; 93 buffer[1] = 0; 94 if (val >= SPECIAL_BASE) 95 strcpy(buffer, (char *) combinations[val - SPECIAL_BASE]); 96 return buffer; 97} 98 99const char *show_ident(const struct ident *ident) 100{ 101 static char buff[4][256]; 102 static int n; 103 char *buffer; 104 105 if (!ident) 106 return "<noident>"; 107 buffer = buff[3 & ++n]; 108 sprintf(buffer, "%.*s", ident->len, ident->name); 109 return buffer; 110} 111 112static char *charstr(char *ptr, unsigned char c, unsigned char escape, unsigned char next) 113{ 114 if (isprint(c)) { 115 if (c == escape || c == '\\') 116 *ptr++ = '\\'; 117 *ptr++ = c; 118 return ptr; 119 } 120 *ptr++ = '\\'; 121 switch (c) { 122 case '\n': 123 *ptr++ = 'n'; 124 return ptr; 125 case '\t': 126 *ptr++ = 't'; 127 return ptr; 128 } 129 if (!isdigit(next)) 130 return ptr + sprintf(ptr, "%o", c); 131 132 return ptr + sprintf(ptr, "%03o", c); 133} 134 135const char *show_string(const struct string *string) 136{ 137 static char buffer[4 * MAX_STRING + 3]; 138 char *ptr; 139 int i; 140 141 if (!string || !string->length) 142 return "<bad_string>"; 143 ptr = buffer; 144 *ptr++ = '"'; 145 for (i = 0; i < string->length-1; i++) { 146 const char *p = string->data + i; 147 ptr = charstr(ptr, p[0], '"', p[1]); 148 } 149 *ptr++ = '"'; 150 *ptr = '\0'; 151 return buffer; 152} 153 154static const char *show_char(const char *s, size_t len, char prefix, char delim) 155{ 156 static char buffer[MAX_STRING + 4]; 157 char *p = buffer; 158 if (prefix) 159 *p++ = prefix; 160 *p++ = delim; 161 memcpy(p, s, len); 162 p += len; 163 *p++ = delim; 164 *p++ = '\0'; 165 return buffer; 166} 167 168static const char *quote_char(const char *s, size_t len, char prefix, char delim) 169{ 170 static char buffer[2*MAX_STRING + 6]; 171 size_t i; 172 char *p = buffer; 173 if (prefix) 174 *p++ = prefix; 175 if (delim == '"') 176 *p++ = '\\'; 177 *p++ = delim; 178 for (i = 0; i < len; i++) { 179 if (s[i] == '"' || s[i] == '\\') 180 *p++ = '\\'; 181 *p++ = s[i]; 182 } 183 if (delim == '"') 184 *p++ = '\\'; 185 *p++ = delim; 186 *p++ = '\0'; 187 return buffer; 188} 189 190const char *show_token(const struct token *token) 191{ 192 static char buffer[256]; 193 194 if (!token) 195 return "<no token>"; 196 switch (token_type(token)) { 197 case TOKEN_ERROR: 198 return "syntax error"; 199 200 case TOKEN_EOF: 201 return "end-of-input"; 202 203 case TOKEN_IDENT: 204 return show_ident(token->ident); 205 206 case TOKEN_NUMBER: 207 return token->number; 208 209 case TOKEN_SPECIAL: 210 return show_special(token->special); 211 212 case TOKEN_CHAR: 213 return show_char(token->string->data, 214 token->string->length - 1, 0, '\''); 215 case TOKEN_CHAR_EMBEDDED_0 ... TOKEN_CHAR_EMBEDDED_3: 216 return show_char(token->embedded, 217 token_type(token) - TOKEN_CHAR, 0, '\''); 218 case TOKEN_WIDE_CHAR: 219 return show_char(token->string->data, 220 token->string->length - 1, 'L', '\''); 221 case TOKEN_WIDE_CHAR_EMBEDDED_0 ... TOKEN_WIDE_CHAR_EMBEDDED_3: 222 return show_char(token->embedded, 223 token_type(token) - TOKEN_WIDE_CHAR, 'L', '\''); 224 case TOKEN_STRING: 225 return show_char(token->string->data, 226 token->string->length - 1, 0, '"'); 227 case TOKEN_WIDE_STRING: 228 return show_char(token->string->data, 229 token->string->length - 1, 'L', '"'); 230 231 case TOKEN_STREAMBEGIN: 232 sprintf(buffer, "<beginning of '%s'>", stream_name(token->pos.stream)); 233 return buffer; 234 235 case TOKEN_STREAMEND: 236 sprintf(buffer, "<end of '%s'>", stream_name(token->pos.stream)); 237 return buffer; 238 239 case TOKEN_UNTAINT: 240 sprintf(buffer, "<untaint>"); 241 return buffer; 242 243 case TOKEN_ARG_COUNT: 244 sprintf(buffer, "<argcnt>"); 245 return buffer; 246 247 default: 248 sprintf(buffer, "unhandled token type '%d' ", token_type(token)); 249 return buffer; 250 } 251} 252 253const char *quote_token(const struct token *token) 254{ 255 static char buffer[256]; 256 257 switch (token_type(token)) { 258 case TOKEN_ERROR: 259 return "syntax error"; 260 261 case TOKEN_IDENT: 262 return show_ident(token->ident); 263 264 case TOKEN_NUMBER: 265 return token->number; 266 267 case TOKEN_SPECIAL: 268 return show_special(token->special); 269 270 case TOKEN_CHAR: 271 return quote_char(token->string->data, 272 token->string->length - 1, 0, '\''); 273 case TOKEN_CHAR_EMBEDDED_0 ... TOKEN_CHAR_EMBEDDED_3: 274 return quote_char(token->embedded, 275 token_type(token) - TOKEN_CHAR, 0, '\''); 276 case TOKEN_WIDE_CHAR: 277 return quote_char(token->string->data, 278 token->string->length - 1, 'L', '\''); 279 case TOKEN_WIDE_CHAR_EMBEDDED_0 ... TOKEN_WIDE_CHAR_EMBEDDED_3: 280 return quote_char(token->embedded, 281 token_type(token) - TOKEN_WIDE_CHAR, 'L', '\''); 282 case TOKEN_STRING: 283 return quote_char(token->string->data, 284 token->string->length - 1, 0, '"'); 285 case TOKEN_WIDE_STRING: 286 return quote_char(token->string->data, 287 token->string->length - 1, 'L', '"'); 288 default: 289 sprintf(buffer, "unhandled token type '%d' ", token_type(token)); 290 return buffer; 291 } 292} 293 294#define HASHED_INPUT_BITS (6) 295#define HASHED_INPUT (1 << HASHED_INPUT_BITS) 296#define HASH_PRIME 0x9e370001UL 297 298static int input_stream_hashes[HASHED_INPUT] = { [0 ... HASHED_INPUT-1] = -1 }; 299 300int *hash_stream(const char *name) 301{ 302 uint32_t hash = 0; 303 unsigned char c; 304 305 while ((c = *name++) != 0) 306 hash = (hash + (c << 4) + (c >> 4)) * 11; 307 308 hash *= HASH_PRIME; 309 hash >>= 32 - HASHED_INPUT_BITS; 310 return input_stream_hashes + hash; 311} 312 313int init_stream(const struct position *pos, const char *name, int fd, const char **next_path) 314{ 315 int stream = input_stream_nr, *hash; 316 struct stream *current; 317 318 if (stream >= input_streams_allocated) { 319 int newalloc = stream * 4 / 3 + 10; 320 input_streams = realloc(input_streams, newalloc * sizeof(struct stream)); 321 if (!input_streams) 322 die("Unable to allocate more streams space"); 323 input_streams_allocated = newalloc; 324 } 325 current = input_streams + stream; 326 memset(current, 0, sizeof(*current)); 327 current->name = name; 328 current->fd = fd; 329 current->next_path = next_path; 330 current->path = NULL; 331 current->constant = CONSTANT_FILE_MAYBE; 332 if (pos) 333 current->pos = *pos; 334 else 335 current->pos.stream = -1; 336 input_stream_nr = stream+1; 337 hash = hash_stream(name); 338 current->next_stream = *hash; 339 *hash = stream; 340 return stream; 341} 342 343static struct token * alloc_token(stream_t *stream) 344{ 345 struct token *token = __alloc_token(0); 346 token->pos = stream_pos(stream); 347 return token; 348} 349 350/* 351 * Argh... That was surprisingly messy - handling '\r' complicates the 352 * things a _lot_. 353 */ 354static int nextchar_slow(stream_t *stream) 355{ 356 int offset = stream->offset; 357 int size = stream->size; 358 int c; 359 int spliced = 0, had_cr, had_backslash; 360 361restart: 362 had_cr = had_backslash = 0; 363 364repeat: 365 if (offset >= size) { 366 if (stream->fd < 0) 367 goto got_eof; 368 size = read(stream->fd, stream->buffer, BUFSIZE); 369 if (size <= 0) 370 goto got_eof; 371 stream->size = size; 372 stream->offset = offset = 0; 373 } 374 375 c = stream->buffer[offset++]; 376 if (had_cr) 377 goto check_lf; 378 379 if (c == '\r') { 380 had_cr = 1; 381 goto repeat; 382 } 383 384norm: 385 if (!had_backslash) { 386 switch (c) { 387 case '\t': 388 stream->pos += tabstop - stream->pos % tabstop; 389 break; 390 case '\n': 391 stream->line++; 392 stream->pos = 0; 393 stream->newline = 1; 394 break; 395 case '\\': 396 had_backslash = 1; 397 stream->pos++; 398 goto repeat; 399 default: 400 stream->pos++; 401 } 402 } else { 403 if (c == '\n') { 404 stream->line++; 405 stream->pos = 0; 406 spliced = 1; 407 goto restart; 408 } 409 offset--; 410 c = '\\'; 411 } 412out: 413 stream->offset = offset; 414 415 return c; 416 417check_lf: 418 if (c != '\n') 419 offset--; 420 c = '\n'; 421 goto norm; 422 423got_eof: 424 if (had_backslash) { 425 c = '\\'; 426 goto out; 427 } 428 if (stream->pos & Wnewline_eof) 429 warning(stream_pos(stream), "no newline at end of file"); 430 else if (spliced) 431 warning(stream_pos(stream), "backslash-newline at end of file"); 432 return EOF; 433} 434 435/* 436 * We want that as light as possible while covering all normal cases. 437 * Slow path (including the logics with line-splicing and EOF sanity 438 * checks) is in nextchar_slow(). 439 */ 440static inline int nextchar(stream_t *stream) 441{ 442 int offset = stream->offset; 443 444 if (offset < stream->size) { 445 int c = stream->buffer[offset++]; 446 static const char special[256] = { 447 ['\t'] = 1, ['\r'] = 1, ['\n'] = 1, ['\\'] = 1 448 }; 449 if (!special[c]) { 450 stream->offset = offset; 451 stream->pos++; 452 return c; 453 } 454 } 455 return nextchar_slow(stream); 456} 457 458struct token eof_token_entry; 459 460static struct token *mark_eof(stream_t *stream) 461{ 462 struct token *end; 463 464 end = alloc_token(stream); 465 eof_token_entry.pos = end->pos; 466 token_type(end) = TOKEN_STREAMEND; 467 end->pos.newline = 1; 468 469 eof_token_entry.next = &eof_token_entry; 470 eof_token_entry.pos.newline = 1; 471 472 end->next = &eof_token_entry; 473 *stream->tokenlist = end; 474 stream->tokenlist = NULL; 475 return end; 476} 477 478static void add_token(stream_t *stream) 479{ 480 struct token *token = stream->token; 481 482 stream->token = NULL; 483 token->next = NULL; 484 *stream->tokenlist = token; 485 stream->tokenlist = &token->next; 486} 487 488static void drop_token(stream_t *stream) 489{ 490 stream->newline |= stream->token->pos.newline; 491 stream->whitespace |= stream->token->pos.whitespace; 492 stream->token = NULL; 493} 494 495enum { 496 Letter = 1, 497 Digit = 2, 498 Hex = 4, 499 Exp = 8, 500 Dot = 16, 501 ValidSecond = 32, 502 Quote = 64, 503}; 504 505static const char cclass[257] = { 506 ['0' + 1 ... '9' + 1] = Digit | Hex, 507 ['A' + 1 ... 'D' + 1] = Letter | Hex, 508 ['E' + 1] = Letter | Hex | Exp, /* E<exp> */ 509 ['F' + 1] = Letter | Hex, 510 ['G' + 1 ... 'O' + 1] = Letter, 511 ['P' + 1] = Letter | Exp, /* P<exp> */ 512 ['Q' + 1 ... 'Z' + 1] = Letter, 513 ['a' + 1 ... 'd' + 1] = Letter | Hex, 514 ['e' + 1] = Letter | Hex | Exp, /* e<exp> */ 515 ['f' + 1] = Letter | Hex, 516 ['g' + 1 ... 'o' + 1] = Letter, 517 ['p' + 1] = Letter | Exp, /* p<exp> */ 518 ['q' + 1 ... 'z' + 1] = Letter, 519 ['_' + 1] = Letter, 520 ['.' + 1] = Dot | ValidSecond, 521 ['=' + 1] = ValidSecond, 522 ['+' + 1] = ValidSecond, 523 ['-' + 1] = ValidSecond, 524 ['>' + 1] = ValidSecond, 525 ['<' + 1] = ValidSecond, 526 ['&' + 1] = ValidSecond, 527 ['|' + 1] = ValidSecond, 528 ['#' + 1] = ValidSecond, 529 ['\'' + 1] = Quote, 530 ['"' + 1] = Quote, 531}; 532 533/* 534 * pp-number: 535 * digit 536 * . digit 537 * pp-number digit 538 * pp-number identifier-nodigit 539 * pp-number e sign 540 * pp-number E sign 541 * pp-number p sign 542 * pp-number P sign 543 * pp-number . 544 */ 545static int get_one_number(int c, int next, stream_t *stream) 546{ 547 struct token *token; 548 static char buffer[4095]; 549 char *p = buffer, *buffer_end = buffer + sizeof (buffer); 550 551 *p++ = c; 552 for (;;) { 553 long class = cclass[next + 1]; 554 if (!(class & (Dot | Digit | Letter))) 555 break; 556 if (p != buffer_end) 557 *p++ = next; 558 next = nextchar(stream); 559 if (class & Exp) { 560 if (next == '-' || next == '+') { 561 if (p != buffer_end) 562 *p++ = next; 563 next = nextchar(stream); 564 } 565 } 566 } 567 568 if (p == buffer_end) { 569 sparse_error(stream_pos(stream), "number token exceeds %td characters", 570 buffer_end - buffer); 571 // Pretend we saw just "1". 572 buffer[0] = '1'; 573 p = buffer + 1; 574 } 575 576 *p++ = 0; 577 token = stream->token; 578 token_type(token) = TOKEN_NUMBER; 579 token->number = xmemdup(buffer, p - buffer); 580 add_token(stream); 581 582 return next; 583} 584 585static int eat_string(int next, stream_t *stream, enum token_type type) 586{ 587 static char buffer[MAX_STRING]; 588 struct string *string; 589 struct token *token = stream->token; 590 int len = 0; 591 int escape; 592 int want_hex = 0; 593 char delim = type < TOKEN_STRING ? '\'' : '"'; 594 595 for (escape = 0; escape || next != delim; next = nextchar(stream)) { 596 if (len < MAX_STRING) 597 buffer[len] = next; 598 len++; 599 if (next == '\n') { 600 warning(stream_pos(stream), 601 "missing terminating %c character", delim); 602 /* assume delimiter is lost */ 603 break; 604 } 605 if (next == EOF) { 606 warning(stream_pos(stream), 607 "End of file in middle of string"); 608 return next; 609 } 610 if (!escape) { 611 if (want_hex && !(cclass[next + 1] & Hex)) 612 warning(stream_pos(stream), 613 "\\x used with no following hex digits"); 614 want_hex = 0; 615 escape = next == '\\'; 616 } else { 617 escape = 0; 618 want_hex = next == 'x'; 619 } 620 } 621 if (want_hex) 622 warning(stream_pos(stream), 623 "\\x used with no following hex digits"); 624 if (len > MAX_STRING) { 625 warning(stream_pos(stream), "string too long (%d bytes, %d bytes max)", len, MAX_STRING); 626 len = MAX_STRING; 627 } 628 if (delim == '\'' && len && len <= 4) { 629 token_type(token) = type + len; 630 memset(buffer + len, '\0', 4 - len); 631 memcpy(token->embedded, buffer, 4); 632 } else { 633 token_type(token) = type; 634 string = __alloc_string(len+1); 635 memcpy(string->data, buffer, len); 636 string->data[len] = '\0'; 637 string->length = len+1; 638 token->string = string; 639 } 640 641 /* Pass it on.. */ 642 token = stream->token; 643 add_token(stream); 644 return nextchar(stream); 645} 646 647static int drop_stream_eoln(stream_t *stream) 648{ 649 drop_token(stream); 650 for (;;) { 651 switch (nextchar(stream)) { 652 case EOF: 653 return EOF; 654 case '\n': 655 return nextchar(stream); 656 } 657 } 658} 659 660static int drop_stream_comment(stream_t *stream) 661{ 662 int newline; 663 int next; 664 drop_token(stream); 665 newline = stream->newline; 666 667 next = nextchar(stream); 668 for (;;) { 669 int curr = next; 670 if (curr == EOF) { 671 warning(stream_pos(stream), "End of file in the middle of a comment"); 672 return curr; 673 } 674 next = nextchar(stream); 675 if (curr == '*' && next == '/') 676 break; 677 } 678 stream->newline = newline; 679 return nextchar(stream); 680} 681 682unsigned char combinations[][4] = COMBINATION_STRINGS; 683 684#define NR_COMBINATIONS (SPECIAL_ARG_SEPARATOR - SPECIAL_BASE) 685 686/* hash function for two-character punctuators - all give unique values */ 687#define special_hash(c0, c1) (((c0*8+c1*2)+((c0*8+c1*2)>>5))&31) 688 689/* 690 * note that we won't get false positives - special_hash(0,0) is 0 and 691 * entry 0 is filled (by +=), so all the missing ones are OK. 692 */ 693static unsigned char hash_results[32][2] = { 694#define RES(c0, c1) [special_hash(c0, c1)] = {c0, c1} 695 RES('+', '='), /* 00 */ 696 RES('/', '='), /* 01 */ 697 RES('^', '='), /* 05 */ 698 RES('&', '&'), /* 07 */ 699 RES('#', '#'), /* 08 */ 700 RES('<', '<'), /* 0a */ 701 RES('<', '='), /* 0c */ 702 RES('!', '='), /* 0e */ 703 RES('%', '='), /* 0f */ 704 RES('-', '-'), /* 10 */ 705 RES('-', '='), /* 11 */ 706 RES('-', '>'), /* 13 */ 707 RES('=', '='), /* 15 */ 708 RES('&', '='), /* 17 */ 709 RES('*', '='), /* 18 */ 710 RES('.', '.'), /* 1a */ 711 RES('+', '+'), /* 1b */ 712 RES('|', '='), /* 1c */ 713 RES('>', '='), /* 1d */ 714 RES('|', '|'), /* 1e */ 715 RES('>', '>') /* 1f */ 716#undef RES 717}; 718static int code[32] = { 719#define CODE(c0, c1, value) [special_hash(c0, c1)] = value 720 CODE('+', '=', SPECIAL_ADD_ASSIGN), /* 00 */ 721 CODE('/', '=', SPECIAL_DIV_ASSIGN), /* 01 */ 722 CODE('^', '=', SPECIAL_XOR_ASSIGN), /* 05 */ 723 CODE('&', '&', SPECIAL_LOGICAL_AND), /* 07 */ 724 CODE('#', '#', SPECIAL_HASHHASH), /* 08 */ 725 CODE('<', '<', SPECIAL_LEFTSHIFT), /* 0a */ 726 CODE('<', '=', SPECIAL_LTE), /* 0c */ 727 CODE('!', '=', SPECIAL_NOTEQUAL), /* 0e */ 728 CODE('%', '=', SPECIAL_MOD_ASSIGN), /* 0f */ 729 CODE('-', '-', SPECIAL_DECREMENT), /* 10 */ 730 CODE('-', '=', SPECIAL_SUB_ASSIGN), /* 11 */ 731 CODE('-', '>', SPECIAL_DEREFERENCE), /* 13 */ 732 CODE('=', '=', SPECIAL_EQUAL), /* 15 */ 733 CODE('&', '=', SPECIAL_AND_ASSIGN), /* 17 */ 734 CODE('*', '=', SPECIAL_MUL_ASSIGN), /* 18 */ 735 CODE('.', '.', SPECIAL_DOTDOT), /* 1a */ 736 CODE('+', '+', SPECIAL_INCREMENT), /* 1b */ 737 CODE('|', '=', SPECIAL_OR_ASSIGN), /* 1c */ 738 CODE('>', '=', SPECIAL_GTE), /* 1d */ 739 CODE('|', '|', SPECIAL_LOGICAL_OR), /* 1e */ 740 CODE('>', '>', SPECIAL_RIGHTSHIFT) /* 1f */ 741#undef CODE 742}; 743 744static int get_one_special(int c, stream_t *stream) 745{ 746 struct token *token; 747 int next, value, i; 748 749 next = nextchar(stream); 750 751 /* 752 * Check for numbers, strings, character constants, and comments 753 */ 754 switch (c) { 755 case '.': 756 if (next >= '0' && next <= '9') 757 return get_one_number(c, next, stream); 758 break; 759 case '"': 760 return eat_string(next, stream, TOKEN_STRING); 761 case '\'': 762 return eat_string(next, stream, TOKEN_CHAR); 763 case '/': 764 if (next == '/') 765 return drop_stream_eoln(stream); 766 if (next == '*') 767 return drop_stream_comment(stream); 768 } 769 770 /* 771 * Check for combinations 772 */ 773 value = c; 774 if (cclass[next + 1] & ValidSecond) { 775 i = special_hash(c, next); 776 if (hash_results[i][0] == c && hash_results[i][1] == next) { 777 value = code[i]; 778 next = nextchar(stream); 779 if (value >= SPECIAL_LEFTSHIFT && 780 next == "==."[value - SPECIAL_LEFTSHIFT]) { 781 value += 3; 782 next = nextchar(stream); 783 } 784 } 785 } 786 787 /* Pass it on.. */ 788 token = stream->token; 789 token_type(token) = TOKEN_SPECIAL; 790 token->special = value; 791 add_token(stream); 792 return next; 793} 794 795#define IDENT_HASH_BITS (13) 796#define IDENT_HASH_SIZE (1<<IDENT_HASH_BITS) 797#define IDENT_HASH_MASK (IDENT_HASH_SIZE-1) 798 799#define ident_hash_init(c) (c) 800#define ident_hash_add(oldhash,c) ((oldhash)*11 + (c)) 801#define ident_hash_end(hash) ((((hash) >> IDENT_HASH_BITS) + (hash)) & IDENT_HASH_MASK) 802 803static struct ident *hash_table[IDENT_HASH_SIZE]; 804static int ident_hit, ident_miss, idents; 805 806void show_identifier_stats(void) 807{ 808 int i; 809 int distribution[100]; 810 811 fprintf(stderr, "identifiers: %d hits, %d misses\n", 812 ident_hit, ident_miss); 813 814 for (i = 0; i < 100; i++) 815 distribution[i] = 0; 816 817 for (i = 0; i < IDENT_HASH_SIZE; i++) { 818 struct ident * ident = hash_table[i]; 819 int count = 0; 820 821 while (ident) { 822 count++; 823 ident = ident->next; 824 } 825 if (count > 99) 826 count = 99; 827 distribution[count]++; 828 } 829 830 for (i = 0; i < 100; i++) { 831 if (distribution[i]) 832 fprintf(stderr, "%2d: %d buckets\n", i, distribution[i]); 833 } 834} 835 836static struct ident *alloc_ident(const char *name, int len) 837{ 838 struct ident *ident = __alloc_ident(len); 839 ident->symbols = NULL; 840 ident->len = len; 841 ident->tainted = 0; 842 memcpy(ident->name, name, len); 843 return ident; 844} 845 846static struct ident * insert_hash(struct ident *ident, unsigned long hash) 847{ 848 ident->next = hash_table[hash]; 849 hash_table[hash] = ident; 850 ident_miss++; 851 return ident; 852} 853 854static struct ident *create_hashed_ident(const char *name, int len, unsigned long hash) 855{ 856 struct ident *ident; 857 struct ident **p; 858 859 p = &hash_table[hash]; 860 while ((ident = *p) != NULL) { 861 if (ident->len == (unsigned char) len) { 862 if (strncmp(name, ident->name, len) != 0) 863 goto next; 864 865 ident_hit++; 866 return ident; 867 } 868next: 869 //misses++; 870 p = &ident->next; 871 } 872 ident = alloc_ident(name, len); 873 *p = ident; 874 ident->next = NULL; 875 ident_miss++; 876 idents++; 877 return ident; 878} 879 880static unsigned long hash_name(const char *name, int len) 881{ 882 unsigned long hash; 883 const unsigned char *p = (const unsigned char *)name; 884 885 hash = ident_hash_init(*p++); 886 while (--len) { 887 unsigned int i = *p++; 888 hash = ident_hash_add(hash, i); 889 } 890 return ident_hash_end(hash); 891} 892 893struct ident *hash_ident(struct ident *ident) 894{ 895 return insert_hash(ident, hash_name(ident->name, ident->len)); 896} 897 898struct ident *built_in_ident(const char *name) 899{ 900 int len = strlen(name); 901 return create_hashed_ident(name, len, hash_name(name, len)); 902} 903 904struct token *built_in_token(int stream, struct ident *ident) 905{ 906 struct token *token; 907 908 token = __alloc_token(0); 909 token->pos.stream = stream; 910 token_type(token) = TOKEN_IDENT; 911 token->ident = ident; 912 return token; 913} 914 915static int get_one_identifier(int c, stream_t *stream) 916{ 917 struct token *token; 918 struct ident *ident; 919 unsigned long hash; 920 char buf[256]; 921 int len = 1; 922 int next; 923 924 hash = ident_hash_init(c); 925 buf[0] = c; 926 for (;;) { 927 next = nextchar(stream); 928 if (!(cclass[next + 1] & (Letter | Digit))) 929 break; 930 if (len >= sizeof(buf)) 931 break; 932 hash = ident_hash_add(hash, next); 933 buf[len] = next; 934 len++; 935 }; 936 if (cclass[next + 1] & Quote) { 937 if (len == 1 && buf[0] == 'L') { 938 if (next == '\'') 939 return eat_string(nextchar(stream), stream, 940 TOKEN_WIDE_CHAR); 941 else 942 return eat_string(nextchar(stream), stream, 943 TOKEN_WIDE_STRING); 944 } 945 } 946 hash = ident_hash_end(hash); 947 ident = create_hashed_ident(buf, len, hash); 948 949 /* Pass it on.. */ 950 token = stream->token; 951 token_type(token) = TOKEN_IDENT; 952 token->ident = ident; 953 add_token(stream); 954 return next; 955} 956 957static int get_one_token(int c, stream_t *stream) 958{ 959 long class = cclass[c + 1]; 960 if (class & Digit) 961 return get_one_number(c, nextchar(stream), stream); 962 if (class & Letter) 963 return get_one_identifier(c, stream); 964 return get_one_special(c, stream); 965} 966 967static struct token *setup_stream(stream_t *stream, int idx, int fd, 968 unsigned char *buf, unsigned int buf_size) 969{ 970 struct token *begin; 971 972 stream->nr = idx; 973 stream->line = 1; 974 stream->newline = 1; 975 stream->whitespace = 0; 976 stream->pos = 0; 977 978 stream->token = NULL; 979 stream->fd = fd; 980 stream->offset = 0; 981 stream->size = buf_size; 982 stream->buffer = buf; 983 984 begin = alloc_token(stream); 985 token_type(begin) = TOKEN_STREAMBEGIN; 986 stream->tokenlist = &begin->next; 987 return begin; 988} 989 990static struct token *tokenize_stream(stream_t *stream) 991{ 992 int c = nextchar(stream); 993 while (c != EOF) { 994 if (!isspace(c)) { 995 struct token *token = alloc_token(stream); 996 stream->token = token; 997 stream->newline = 0; 998 stream->whitespace = 0; 999 c = get_one_token(c, stream); 1000 continue; 1001 } 1002 stream->whitespace = 1; 1003 c = nextchar(stream); 1004 } 1005 return mark_eof(stream); 1006} 1007 1008struct token * tokenize_buffer(void *buffer, unsigned long size, struct token **endtoken) 1009{ 1010 stream_t stream; 1011 struct token *begin; 1012 1013 begin = setup_stream(&stream, 0, -1, buffer, size); 1014 *endtoken = tokenize_stream(&stream); 1015 return begin; 1016} 1017 1018struct token * tokenize(const struct position *pos, const char *name, int fd, struct token *endtoken, const char **next_path) 1019{ 1020 struct token *begin, *end; 1021 stream_t stream; 1022 unsigned char buffer[BUFSIZE]; 1023 int idx; 1024 1025 idx = init_stream(pos, name, fd, next_path); 1026 if (idx < 0) { 1027 // info(endtoken->pos, "File %s is const", name); 1028 return endtoken; 1029 } 1030 1031 begin = setup_stream(&stream, idx, fd, buffer, 0); 1032 end = tokenize_stream(&stream); 1033 if (endtoken) 1034 end->next = endtoken; 1035 return begin; 1036} 1037