1/*
2 * This is a really stupid C tokenizer. It doesn't do any include
3 * files or anything complex at all. That's the preprocessor.
4 *
5 * Copyright (C) 2003 Transmeta Corp.
6 *               2003 Linus Torvalds
7 *
8 * Permission is hereby granted, free of charge, to any person obtaining a copy
9 * of this software and associated documentation files (the "Software"), to deal
10 * in the Software without restriction, including without limitation the rights
11 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 * copies of the Software, and to permit persons to whom the Software is
13 * furnished to do so, subject to the following conditions:
14 *
15 * The above copyright notice and this permission notice shall be included in
16 * all copies or substantial portions of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
24 * THE SOFTWARE.
25 */
26#include <stdio.h>
27#include <stdlib.h>
28#include <stdarg.h>
29#include <stddef.h>
30#include <string.h>
31#include <ctype.h>
32#include <unistd.h>
33#include <stdint.h>
34
35#include "lib.h"
36#include "allocate.h"
37#include "token.h"
38#include "symbol.h"
39
40#define EOF (-1)
41
42int input_stream_nr = 0;
43struct stream *input_streams;
44static int input_streams_allocated;
45unsigned int tabstop = 8;
46
47#define BUFSIZE (8192)
48
49typedef struct {
50	int fd, offset, size;
51	int pos, line, nr;
52	int newline, whitespace;
53	struct token **tokenlist;
54	struct token *token;
55	unsigned char *buffer;
56} stream_t;
57
58const char *stream_name(int stream)
59{
60	if (stream < 0 || stream > input_stream_nr)
61		return "<bad stream>";
62	return input_streams[stream].name;
63}
64
65int stream_prev(int stream)
66{
67	if (stream < 0 || stream > input_stream_nr)
68		return -1;
69	stream = input_streams[stream].pos.stream;
70	if (stream > input_stream_nr)
71		return -1;
72	return stream;
73}
74
75static struct position stream_pos(stream_t *stream)
76{
77	struct position pos;
78	pos.type = 0;
79	pos.stream = stream->nr;
80	pos.newline = stream->newline;
81	pos.whitespace = stream->whitespace;
82	pos.pos = stream->pos;
83	pos.line = stream->line;
84	pos.noexpand = 0;
85	return pos;
86}
87
88const char *show_special(int val)
89{
90	static char buffer[4];
91
92	buffer[0] = val;
93	buffer[1] = 0;
94	if (val >= SPECIAL_BASE)
95		strcpy(buffer, (char *) combinations[val - SPECIAL_BASE]);
96	return buffer;
97}
98
99const char *show_ident(const struct ident *ident)
100{
101	static char buff[4][256];
102	static int n;
103	char *buffer;
104
105	if (!ident)
106		return "<noident>";
107	buffer = buff[3 & ++n];
108	sprintf(buffer, "%.*s", ident->len, ident->name);
109	return buffer;
110}
111
112static char *charstr(char *ptr, unsigned char c, unsigned char escape, unsigned char next)
113{
114	if (isprint(c)) {
115		if (c == escape || c == '\\')
116			*ptr++ = '\\';
117		*ptr++ = c;
118		return ptr;
119	}
120	*ptr++ = '\\';
121	switch (c) {
122	case '\n':
123		*ptr++ = 'n';
124		return ptr;
125	case '\t':
126		*ptr++ = 't';
127		return ptr;
128	}
129	if (!isdigit(next))
130		return ptr + sprintf(ptr, "%o", c);
131
132	return ptr + sprintf(ptr, "%03o", c);
133}
134
135const char *show_string(const struct string *string)
136{
137	static char buffer[4 * MAX_STRING + 3];
138	char *ptr;
139	int i;
140
141	if (!string || !string->length)
142		return "<bad_string>";
143	ptr = buffer;
144	*ptr++ = '"';
145	for (i = 0; i < string->length-1; i++) {
146		const char *p = string->data + i;
147		ptr = charstr(ptr, p[0], '"', p[1]);
148	}
149	*ptr++ = '"';
150	*ptr = '\0';
151	return buffer;
152}
153
154static const char *show_char(const char *s, size_t len, char prefix, char delim)
155{
156	static char buffer[MAX_STRING + 4];
157	char *p = buffer;
158	if (prefix)
159		*p++ = prefix;
160	*p++ = delim;
161	memcpy(p, s, len);
162	p += len;
163	*p++ = delim;
164	*p++ = '\0';
165	return buffer;
166}
167
168static const char *quote_char(const char *s, size_t len, char prefix, char delim)
169{
170	static char buffer[2*MAX_STRING + 6];
171	size_t i;
172	char *p = buffer;
173	if (prefix)
174		*p++ = prefix;
175	if (delim == '"')
176		*p++ = '\\';
177	*p++ = delim;
178	for (i = 0; i < len; i++) {
179		if (s[i] == '"' || s[i] == '\\')
180			*p++ = '\\';
181		*p++ = s[i];
182	}
183	if (delim == '"')
184		*p++ = '\\';
185	*p++ = delim;
186	*p++ = '\0';
187	return buffer;
188}
189
190const char *show_token(const struct token *token)
191{
192	static char buffer[256];
193
194	if (!token)
195		return "<no token>";
196	switch (token_type(token)) {
197	case TOKEN_ERROR:
198		return "syntax error";
199
200	case TOKEN_EOF:
201		return "end-of-input";
202
203	case TOKEN_IDENT:
204		return show_ident(token->ident);
205
206	case TOKEN_NUMBER:
207		return token->number;
208
209	case TOKEN_SPECIAL:
210		return show_special(token->special);
211
212	case TOKEN_CHAR:
213		return show_char(token->string->data,
214			token->string->length - 1, 0, '\'');
215	case TOKEN_CHAR_EMBEDDED_0 ... TOKEN_CHAR_EMBEDDED_3:
216		return show_char(token->embedded,
217			token_type(token) - TOKEN_CHAR, 0, '\'');
218	case TOKEN_WIDE_CHAR:
219		return show_char(token->string->data,
220			token->string->length - 1, 'L', '\'');
221	case TOKEN_WIDE_CHAR_EMBEDDED_0 ... TOKEN_WIDE_CHAR_EMBEDDED_3:
222		return show_char(token->embedded,
223			token_type(token) - TOKEN_WIDE_CHAR, 'L', '\'');
224	case TOKEN_STRING:
225		return show_char(token->string->data,
226			token->string->length - 1, 0, '"');
227	case TOKEN_WIDE_STRING:
228		return show_char(token->string->data,
229			token->string->length - 1, 'L', '"');
230
231	case TOKEN_STREAMBEGIN:
232		sprintf(buffer, "<beginning of '%s'>", stream_name(token->pos.stream));
233		return buffer;
234
235	case TOKEN_STREAMEND:
236		sprintf(buffer, "<end of '%s'>", stream_name(token->pos.stream));
237		return buffer;
238
239	case TOKEN_UNTAINT:
240		sprintf(buffer, "<untaint>");
241		return buffer;
242
243	case TOKEN_ARG_COUNT:
244		sprintf(buffer, "<argcnt>");
245		return buffer;
246
247	default:
248		sprintf(buffer, "unhandled token type '%d' ", token_type(token));
249		return buffer;
250	}
251}
252
253const char *quote_token(const struct token *token)
254{
255	static char buffer[256];
256
257	switch (token_type(token)) {
258	case TOKEN_ERROR:
259		return "syntax error";
260
261	case TOKEN_IDENT:
262		return show_ident(token->ident);
263
264	case TOKEN_NUMBER:
265		return token->number;
266
267	case TOKEN_SPECIAL:
268		return show_special(token->special);
269
270	case TOKEN_CHAR:
271		return quote_char(token->string->data,
272			token->string->length - 1, 0, '\'');
273	case TOKEN_CHAR_EMBEDDED_0 ... TOKEN_CHAR_EMBEDDED_3:
274		return quote_char(token->embedded,
275			token_type(token) - TOKEN_CHAR, 0, '\'');
276	case TOKEN_WIDE_CHAR:
277		return quote_char(token->string->data,
278			token->string->length - 1, 'L', '\'');
279	case TOKEN_WIDE_CHAR_EMBEDDED_0 ... TOKEN_WIDE_CHAR_EMBEDDED_3:
280		return quote_char(token->embedded,
281			token_type(token) - TOKEN_WIDE_CHAR, 'L', '\'');
282	case TOKEN_STRING:
283		return quote_char(token->string->data,
284			token->string->length - 1, 0, '"');
285	case TOKEN_WIDE_STRING:
286		return quote_char(token->string->data,
287			token->string->length - 1, 'L', '"');
288	default:
289		sprintf(buffer, "unhandled token type '%d' ", token_type(token));
290		return buffer;
291	}
292}
293
294#define HASHED_INPUT_BITS (6)
295#define HASHED_INPUT (1 << HASHED_INPUT_BITS)
296#define HASH_PRIME 0x9e370001UL
297
298static int input_stream_hashes[HASHED_INPUT] = { [0 ... HASHED_INPUT-1] = -1 };
299
300int *hash_stream(const char *name)
301{
302	uint32_t hash = 0;
303	unsigned char c;
304
305	while ((c = *name++) != 0)
306		hash = (hash + (c << 4) + (c >> 4)) * 11;
307
308	hash *= HASH_PRIME;
309	hash >>= 32 - HASHED_INPUT_BITS;
310	return input_stream_hashes + hash;
311}
312
313int init_stream(const struct position *pos, const char *name, int fd, const char **next_path)
314{
315	int stream = input_stream_nr, *hash;
316	struct stream *current;
317
318	if (stream >= input_streams_allocated) {
319		int newalloc = stream * 4 / 3 + 10;
320		input_streams = realloc(input_streams, newalloc * sizeof(struct stream));
321		if (!input_streams)
322			die("Unable to allocate more streams space");
323		input_streams_allocated = newalloc;
324	}
325	current = input_streams + stream;
326	memset(current, 0, sizeof(*current));
327	current->name = name;
328	current->fd = fd;
329	current->next_path = next_path;
330	current->path = NULL;
331	current->constant = CONSTANT_FILE_MAYBE;
332	if (pos)
333		current->pos = *pos;
334	else
335		current->pos.stream = -1;
336	input_stream_nr = stream+1;
337	hash = hash_stream(name);
338	current->next_stream = *hash;
339	*hash = stream;
340	return stream;
341}
342
343static struct token * alloc_token(stream_t *stream)
344{
345	struct token *token = __alloc_token(0);
346	token->pos = stream_pos(stream);
347	return token;
348}
349
350/*
351 *  Argh...  That was surprisingly messy - handling '\r' complicates the
352 *  things a _lot_.
353 */
354static int nextchar_slow(stream_t *stream)
355{
356	int offset = stream->offset;
357	int size = stream->size;
358	int c;
359	int spliced = 0, had_cr, had_backslash;
360
361restart:
362	had_cr = had_backslash = 0;
363
364repeat:
365	if (offset >= size) {
366		if (stream->fd < 0)
367			goto got_eof;
368		size = read(stream->fd, stream->buffer, BUFSIZE);
369		if (size <= 0)
370			goto got_eof;
371		stream->size = size;
372		stream->offset = offset = 0;
373	}
374
375	c = stream->buffer[offset++];
376	if (had_cr)
377		goto check_lf;
378
379	if (c == '\r') {
380		had_cr = 1;
381		goto repeat;
382	}
383
384norm:
385	if (!had_backslash) {
386		switch (c) {
387		case '\t':
388			stream->pos += tabstop - stream->pos % tabstop;
389			break;
390		case '\n':
391			stream->line++;
392			stream->pos = 0;
393			stream->newline = 1;
394			break;
395		case '\\':
396			had_backslash = 1;
397			stream->pos++;
398			goto repeat;
399		default:
400			stream->pos++;
401		}
402	} else {
403		if (c == '\n') {
404			stream->line++;
405			stream->pos = 0;
406			spliced = 1;
407			goto restart;
408		}
409		offset--;
410		c = '\\';
411	}
412out:
413	stream->offset = offset;
414
415	return c;
416
417check_lf:
418	if (c != '\n')
419		offset--;
420	c = '\n';
421	goto norm;
422
423got_eof:
424	if (had_backslash) {
425		c = '\\';
426		goto out;
427	}
428	if (stream->pos & Wnewline_eof)
429		warning(stream_pos(stream), "no newline at end of file");
430	else if (spliced)
431		warning(stream_pos(stream), "backslash-newline at end of file");
432	return EOF;
433}
434
435/*
436 *  We want that as light as possible while covering all normal cases.
437 *  Slow path (including the logics with line-splicing and EOF sanity
438 *  checks) is in nextchar_slow().
439 */
440static inline int nextchar(stream_t *stream)
441{
442	int offset = stream->offset;
443
444	if (offset < stream->size) {
445		int c = stream->buffer[offset++];
446		static const char special[256] = {
447			['\t'] = 1, ['\r'] = 1, ['\n'] = 1, ['\\'] = 1
448		};
449		if (!special[c]) {
450			stream->offset = offset;
451			stream->pos++;
452			return c;
453		}
454	}
455	return nextchar_slow(stream);
456}
457
458struct token eof_token_entry;
459
460static struct token *mark_eof(stream_t *stream)
461{
462	struct token *end;
463
464	end = alloc_token(stream);
465	eof_token_entry.pos = end->pos;
466	token_type(end) = TOKEN_STREAMEND;
467	end->pos.newline = 1;
468
469	eof_token_entry.next = &eof_token_entry;
470	eof_token_entry.pos.newline = 1;
471
472	end->next =  &eof_token_entry;
473	*stream->tokenlist = end;
474	stream->tokenlist = NULL;
475	return end;
476}
477
478static void add_token(stream_t *stream)
479{
480	struct token *token = stream->token;
481
482	stream->token = NULL;
483	token->next = NULL;
484	*stream->tokenlist = token;
485	stream->tokenlist = &token->next;
486}
487
488static void drop_token(stream_t *stream)
489{
490	stream->newline |= stream->token->pos.newline;
491	stream->whitespace |= stream->token->pos.whitespace;
492	stream->token = NULL;
493}
494
495enum {
496	Letter = 1,
497	Digit = 2,
498	Hex = 4,
499	Exp = 8,
500	Dot = 16,
501	ValidSecond = 32,
502	Quote = 64,
503};
504
505static const char cclass[257] = {
506	['0' + 1 ... '9' + 1] = Digit | Hex,
507	['A' + 1 ... 'D' + 1] = Letter | Hex,
508	['E' + 1] = Letter | Hex | Exp,	/* E<exp> */
509	['F' + 1] = Letter | Hex,
510	['G' + 1 ... 'O' + 1] = Letter,
511	['P' + 1] = Letter | Exp,	/* P<exp> */
512	['Q' + 1 ... 'Z' + 1] = Letter,
513	['a' + 1 ... 'd' + 1] = Letter | Hex,
514	['e' + 1] = Letter | Hex | Exp,	/* e<exp> */
515	['f' + 1] = Letter | Hex,
516	['g' + 1 ... 'o' + 1] = Letter,
517	['p' + 1] = Letter | Exp,	/* p<exp> */
518	['q' + 1 ... 'z' + 1] = Letter,
519	['_' + 1] = Letter,
520	['.' + 1] = Dot | ValidSecond,
521	['=' + 1] = ValidSecond,
522	['+' + 1] = ValidSecond,
523	['-' + 1] = ValidSecond,
524	['>' + 1] = ValidSecond,
525	['<' + 1] = ValidSecond,
526	['&' + 1] = ValidSecond,
527	['|' + 1] = ValidSecond,
528	['#' + 1] = ValidSecond,
529	['\'' + 1] = Quote,
530	['"' + 1] = Quote,
531};
532
533/*
534 * pp-number:
535 *	digit
536 *	. digit
537 *	pp-number digit
538 *	pp-number identifier-nodigit
539 *	pp-number e sign
540 *	pp-number E sign
541 *	pp-number p sign
542 *	pp-number P sign
543 *	pp-number .
544 */
545static int get_one_number(int c, int next, stream_t *stream)
546{
547	struct token *token;
548	static char buffer[4095];
549	char *p = buffer, *buffer_end = buffer + sizeof (buffer);
550
551	*p++ = c;
552	for (;;) {
553		long class =  cclass[next + 1];
554		if (!(class & (Dot | Digit | Letter)))
555			break;
556		if (p != buffer_end)
557			*p++ = next;
558		next = nextchar(stream);
559		if (class & Exp) {
560			if (next == '-' || next == '+') {
561				if (p != buffer_end)
562					*p++ = next;
563				next = nextchar(stream);
564			}
565		}
566	}
567
568	if (p == buffer_end) {
569		sparse_error(stream_pos(stream), "number token exceeds %td characters",
570		      buffer_end - buffer);
571		// Pretend we saw just "1".
572		buffer[0] = '1';
573		p = buffer + 1;
574	}
575
576	*p++ = 0;
577	token = stream->token;
578	token_type(token) = TOKEN_NUMBER;
579	token->number = xmemdup(buffer, p - buffer);
580	add_token(stream);
581
582	return next;
583}
584
585static int eat_string(int next, stream_t *stream, enum token_type type)
586{
587	static char buffer[MAX_STRING];
588	struct string *string;
589	struct token *token = stream->token;
590	int len = 0;
591	int escape;
592	int want_hex = 0;
593	char delim = type < TOKEN_STRING ? '\'' : '"';
594
595	for (escape = 0; escape || next != delim; next = nextchar(stream)) {
596		if (len < MAX_STRING)
597			buffer[len] = next;
598		len++;
599		if (next == '\n') {
600			warning(stream_pos(stream),
601				"missing terminating %c character", delim);
602			/* assume delimiter is lost */
603			break;
604		}
605		if (next == EOF) {
606			warning(stream_pos(stream),
607				"End of file in middle of string");
608			return next;
609		}
610		if (!escape) {
611			if (want_hex && !(cclass[next + 1] & Hex))
612				warning(stream_pos(stream),
613					"\\x used with no following hex digits");
614			want_hex = 0;
615			escape = next == '\\';
616		} else {
617			escape = 0;
618			want_hex = next == 'x';
619		}
620	}
621	if (want_hex)
622		warning(stream_pos(stream),
623			"\\x used with no following hex digits");
624	if (len > MAX_STRING) {
625		warning(stream_pos(stream), "string too long (%d bytes, %d bytes max)", len, MAX_STRING);
626		len = MAX_STRING;
627	}
628	if (delim == '\'' && len && len <= 4) {
629		token_type(token) = type + len;
630		memset(buffer + len, '\0', 4 - len);
631		memcpy(token->embedded, buffer, 4);
632	} else {
633		token_type(token) = type;
634		string = __alloc_string(len+1);
635		memcpy(string->data, buffer, len);
636		string->data[len] = '\0';
637		string->length = len+1;
638		token->string = string;
639	}
640
641	/* Pass it on.. */
642	token = stream->token;
643	add_token(stream);
644	return nextchar(stream);
645}
646
647static int drop_stream_eoln(stream_t *stream)
648{
649	drop_token(stream);
650	for (;;) {
651		switch (nextchar(stream)) {
652		case EOF:
653			return EOF;
654		case '\n':
655			return nextchar(stream);
656		}
657	}
658}
659
660static int drop_stream_comment(stream_t *stream)
661{
662	int newline;
663	int next;
664	drop_token(stream);
665	newline = stream->newline;
666
667	next = nextchar(stream);
668	for (;;) {
669		int curr = next;
670		if (curr == EOF) {
671			warning(stream_pos(stream), "End of file in the middle of a comment");
672			return curr;
673		}
674		next = nextchar(stream);
675		if (curr == '*' && next == '/')
676			break;
677	}
678	stream->newline = newline;
679	return nextchar(stream);
680}
681
682unsigned char combinations[][4] = COMBINATION_STRINGS;
683
684#define NR_COMBINATIONS (SPECIAL_ARG_SEPARATOR - SPECIAL_BASE)
685
686/* hash function for two-character punctuators - all give unique values */
687#define special_hash(c0, c1) (((c0*8+c1*2)+((c0*8+c1*2)>>5))&31)
688
689/*
690 * note that we won't get false positives - special_hash(0,0) is 0 and
691 * entry 0 is filled (by +=), so all the missing ones are OK.
692 */
693static unsigned char hash_results[32][2] = {
694#define RES(c0, c1) [special_hash(c0, c1)] = {c0, c1}
695	RES('+', '='), /* 00 */
696	RES('/', '='), /* 01 */
697	RES('^', '='), /* 05 */
698	RES('&', '&'), /* 07 */
699	RES('#', '#'), /* 08 */
700	RES('<', '<'), /* 0a */
701	RES('<', '='), /* 0c */
702	RES('!', '='), /* 0e */
703	RES('%', '='), /* 0f */
704	RES('-', '-'), /* 10 */
705	RES('-', '='), /* 11 */
706	RES('-', '>'), /* 13 */
707	RES('=', '='), /* 15 */
708	RES('&', '='), /* 17 */
709	RES('*', '='), /* 18 */
710	RES('.', '.'), /* 1a */
711	RES('+', '+'), /* 1b */
712	RES('|', '='), /* 1c */
713	RES('>', '='), /* 1d */
714	RES('|', '|'), /* 1e */
715	RES('>', '>')  /* 1f */
716#undef RES
717};
718static int code[32] = {
719#define CODE(c0, c1, value) [special_hash(c0, c1)] = value
720	CODE('+', '=', SPECIAL_ADD_ASSIGN), /* 00 */
721	CODE('/', '=', SPECIAL_DIV_ASSIGN), /* 01 */
722	CODE('^', '=', SPECIAL_XOR_ASSIGN), /* 05 */
723	CODE('&', '&', SPECIAL_LOGICAL_AND), /* 07 */
724	CODE('#', '#', SPECIAL_HASHHASH), /* 08 */
725	CODE('<', '<', SPECIAL_LEFTSHIFT), /* 0a */
726	CODE('<', '=', SPECIAL_LTE), /* 0c */
727	CODE('!', '=', SPECIAL_NOTEQUAL), /* 0e */
728	CODE('%', '=', SPECIAL_MOD_ASSIGN), /* 0f */
729	CODE('-', '-', SPECIAL_DECREMENT), /* 10 */
730	CODE('-', '=', SPECIAL_SUB_ASSIGN), /* 11 */
731	CODE('-', '>', SPECIAL_DEREFERENCE), /* 13 */
732	CODE('=', '=', SPECIAL_EQUAL), /* 15 */
733	CODE('&', '=', SPECIAL_AND_ASSIGN), /* 17 */
734	CODE('*', '=', SPECIAL_MUL_ASSIGN), /* 18 */
735	CODE('.', '.', SPECIAL_DOTDOT), /* 1a */
736	CODE('+', '+', SPECIAL_INCREMENT), /* 1b */
737	CODE('|', '=', SPECIAL_OR_ASSIGN), /* 1c */
738	CODE('>', '=', SPECIAL_GTE), /* 1d */
739	CODE('|', '|', SPECIAL_LOGICAL_OR), /* 1e */
740	CODE('>', '>', SPECIAL_RIGHTSHIFT)  /* 1f */
741#undef CODE
742};
743
744static int get_one_special(int c, stream_t *stream)
745{
746	struct token *token;
747	int next, value, i;
748
749	next = nextchar(stream);
750
751	/*
752	 * Check for numbers, strings, character constants, and comments
753	 */
754	switch (c) {
755	case '.':
756		if (next >= '0' && next <= '9')
757			return get_one_number(c, next, stream);
758		break;
759	case '"':
760		return eat_string(next, stream, TOKEN_STRING);
761	case '\'':
762		return eat_string(next, stream, TOKEN_CHAR);
763	case '/':
764		if (next == '/')
765			return drop_stream_eoln(stream);
766		if (next == '*')
767			return drop_stream_comment(stream);
768	}
769
770	/*
771	 * Check for combinations
772	 */
773	value = c;
774	if (cclass[next + 1] & ValidSecond) {
775		i = special_hash(c, next);
776		if (hash_results[i][0] == c && hash_results[i][1] == next) {
777			value = code[i];
778			next = nextchar(stream);
779			if (value >= SPECIAL_LEFTSHIFT &&
780			    next == "==."[value - SPECIAL_LEFTSHIFT]) {
781				value += 3;
782				next = nextchar(stream);
783			}
784		}
785	}
786
787	/* Pass it on.. */
788	token = stream->token;
789	token_type(token) = TOKEN_SPECIAL;
790	token->special = value;
791	add_token(stream);
792	return next;
793}
794
795#define IDENT_HASH_BITS (13)
796#define IDENT_HASH_SIZE (1<<IDENT_HASH_BITS)
797#define IDENT_HASH_MASK (IDENT_HASH_SIZE-1)
798
799#define ident_hash_init(c)		(c)
800#define ident_hash_add(oldhash,c)	((oldhash)*11 + (c))
801#define ident_hash_end(hash)		((((hash) >> IDENT_HASH_BITS) + (hash)) & IDENT_HASH_MASK)
802
803static struct ident *hash_table[IDENT_HASH_SIZE];
804static int ident_hit, ident_miss, idents;
805
806void show_identifier_stats(void)
807{
808	int i;
809	int distribution[100];
810
811	fprintf(stderr, "identifiers: %d hits, %d misses\n",
812		ident_hit, ident_miss);
813
814	for (i = 0; i < 100; i++)
815		distribution[i] = 0;
816
817	for (i = 0; i < IDENT_HASH_SIZE; i++) {
818		struct ident * ident = hash_table[i];
819		int count = 0;
820
821		while (ident) {
822			count++;
823			ident = ident->next;
824		}
825		if (count > 99)
826			count = 99;
827		distribution[count]++;
828	}
829
830	for (i = 0; i < 100; i++) {
831		if (distribution[i])
832			fprintf(stderr, "%2d: %d buckets\n", i, distribution[i]);
833	}
834}
835
836static struct ident *alloc_ident(const char *name, int len)
837{
838	struct ident *ident = __alloc_ident(len);
839	ident->symbols = NULL;
840	ident->len = len;
841	ident->tainted = 0;
842	memcpy(ident->name, name, len);
843	return ident;
844}
845
846static struct ident * insert_hash(struct ident *ident, unsigned long hash)
847{
848	ident->next = hash_table[hash];
849	hash_table[hash] = ident;
850	ident_miss++;
851	return ident;
852}
853
854static struct ident *create_hashed_ident(const char *name, int len, unsigned long hash)
855{
856	struct ident *ident;
857	struct ident **p;
858
859	p = &hash_table[hash];
860	while ((ident = *p) != NULL) {
861		if (ident->len == (unsigned char) len) {
862			if (strncmp(name, ident->name, len) != 0)
863				goto next;
864
865			ident_hit++;
866			return ident;
867		}
868next:
869		//misses++;
870		p = &ident->next;
871	}
872	ident = alloc_ident(name, len);
873	*p = ident;
874	ident->next = NULL;
875	ident_miss++;
876	idents++;
877	return ident;
878}
879
880static unsigned long hash_name(const char *name, int len)
881{
882	unsigned long hash;
883	const unsigned char *p = (const unsigned char *)name;
884
885	hash = ident_hash_init(*p++);
886	while (--len) {
887		unsigned int i = *p++;
888		hash = ident_hash_add(hash, i);
889	}
890	return ident_hash_end(hash);
891}
892
893struct ident *hash_ident(struct ident *ident)
894{
895	return insert_hash(ident, hash_name(ident->name, ident->len));
896}
897
898struct ident *built_in_ident(const char *name)
899{
900	int len = strlen(name);
901	return create_hashed_ident(name, len, hash_name(name, len));
902}
903
904struct token *built_in_token(int stream, struct ident *ident)
905{
906	struct token *token;
907
908	token = __alloc_token(0);
909	token->pos.stream = stream;
910	token_type(token) = TOKEN_IDENT;
911	token->ident = ident;
912	return token;
913}
914
915static int get_one_identifier(int c, stream_t *stream)
916{
917	struct token *token;
918	struct ident *ident;
919	unsigned long hash;
920	char buf[256];
921	int len = 1;
922	int next;
923
924	hash = ident_hash_init(c);
925	buf[0] = c;
926	for (;;) {
927		next = nextchar(stream);
928		if (!(cclass[next + 1] & (Letter | Digit)))
929			break;
930		if (len >= sizeof(buf))
931			break;
932		hash = ident_hash_add(hash, next);
933		buf[len] = next;
934		len++;
935	};
936	if (cclass[next + 1] & Quote) {
937		if (len == 1 && buf[0] == 'L') {
938			if (next == '\'')
939				return eat_string(nextchar(stream), stream,
940							TOKEN_WIDE_CHAR);
941			else
942				return eat_string(nextchar(stream), stream,
943							TOKEN_WIDE_STRING);
944		}
945	}
946	hash = ident_hash_end(hash);
947	ident = create_hashed_ident(buf, len, hash);
948
949	/* Pass it on.. */
950	token = stream->token;
951	token_type(token) = TOKEN_IDENT;
952	token->ident = ident;
953	add_token(stream);
954	return next;
955}
956
957static int get_one_token(int c, stream_t *stream)
958{
959	long class = cclass[c + 1];
960	if (class & Digit)
961		return get_one_number(c, nextchar(stream), stream);
962	if (class & Letter)
963		return get_one_identifier(c, stream);
964	return get_one_special(c, stream);
965}
966
967static struct token *setup_stream(stream_t *stream, int idx, int fd,
968	unsigned char *buf, unsigned int buf_size)
969{
970	struct token *begin;
971
972	stream->nr = idx;
973	stream->line = 1;
974	stream->newline = 1;
975	stream->whitespace = 0;
976	stream->pos = 0;
977
978	stream->token = NULL;
979	stream->fd = fd;
980	stream->offset = 0;
981	stream->size = buf_size;
982	stream->buffer = buf;
983
984	begin = alloc_token(stream);
985	token_type(begin) = TOKEN_STREAMBEGIN;
986	stream->tokenlist = &begin->next;
987	return begin;
988}
989
990static struct token *tokenize_stream(stream_t *stream)
991{
992	int c = nextchar(stream);
993	while (c != EOF) {
994		if (!isspace(c)) {
995			struct token *token = alloc_token(stream);
996			stream->token = token;
997			stream->newline = 0;
998			stream->whitespace = 0;
999			c = get_one_token(c, stream);
1000			continue;
1001		}
1002		stream->whitespace = 1;
1003		c = nextchar(stream);
1004	}
1005	return mark_eof(stream);
1006}
1007
1008struct token * tokenize_buffer(void *buffer, unsigned long size, struct token **endtoken)
1009{
1010	stream_t stream;
1011	struct token *begin;
1012
1013	begin = setup_stream(&stream, 0, -1, buffer, size);
1014	*endtoken = tokenize_stream(&stream);
1015	return begin;
1016}
1017
1018struct token * tokenize(const struct position *pos, const char *name, int fd, struct token *endtoken, const char **next_path)
1019{
1020	struct token *begin, *end;
1021	stream_t stream;
1022	unsigned char buffer[BUFSIZE];
1023	int idx;
1024
1025	idx = init_stream(pos, name, fd, next_path);
1026	if (idx < 0) {
1027		// info(endtoken->pos, "File %s is const", name);
1028		return endtoken;
1029	}
1030
1031	begin = setup_stream(&stream, idx, fd, buffer, 0);
1032	end = tokenize_stream(&stream);
1033	if (endtoken)
1034		end->next = endtoken;
1035	return begin;
1036}
1037