xref: /third_party/libwebsockets/lib/misc/fts/trie.c (revision d4afb5ce)
1 /*
2 * libwebsockets - small server side websockets and web server implementation
3 *
4 * Copyright (C) 2010 - 2019 Andy Green <andy@warmcat.com>
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22 * IN THE SOFTWARE.
23 *
24 * The functions allow
25 *
26 *  - collecting a concordance of strings from one or more files (eg, a
27 *    directory of files) into a single in-memory, lac-backed trie;
28 *
29 *  - to optimize and serialize the in-memory trie to an fd;
30 *
31 *  - to very quickly report any instances of a string in any of the files
32 *    indexed by the trie, by a seeking around a serialized trie fd, without
33 *    having to load it all in memory
34 */
35
36#include "private-lib-core.h"
37#include "private-lib-misc-fts.h"
38
39#include <stdio.h>
40#include <string.h>
41#include <assert.h>
42#include <fcntl.h>
43#include <errno.h>
44#include <sys/types.h>
45
46struct lws_fts_entry;
47
48/* notice these are stored in t->lwsac_input_head which has input file scope */
49
50struct lws_fts_filepath {
51	struct lws_fts_filepath *next;
52	struct lws_fts_filepath *prev;
53	char filepath[256];
54	jg2_file_offset ofs;
55	jg2_file_offset line_table_ofs;
56	int filepath_len;
57	int file_index;
58	int total_lines;
59	int priority;
60};
61
62/* notice these are stored in t->lwsac_input_head which has input file scope */
63
64struct lws_fts_lines {
65	struct lws_fts_lines *lines_next;
66	/*
67	 * amount of line numbers needs to meet average count for best
68	 * efficiency.
69	 *
70	 * Line numbers are stored in VLI format since if we don't, around half
71	 * the total lac allocation consists of struct lws_fts_lines...
72	 * size chosen to maintain 8-byte struct alignment
73	 */
74	uint8_t vli[119];
75	char count;
76};
77
78/* this represents the instances of a symbol inside a given filepath */
79
80struct lws_fts_instance_file {
81	/* linked-list of tifs generated for current file */
82	struct lws_fts_instance_file *inst_file_next;
83	struct lws_fts_entry *owner;
84	struct lws_fts_lines *lines_list, *lines_tail;
85	uint32_t file_index;
86	uint32_t total;
87
88	/*
89	 * optimization for the common case there's only 1 - ~3 matches, so we
90	 * don't have to allocate any lws_fts_lines struct
91	 *
92	 * Using 8 bytes total for this maintains 8-byte struct alignment...
93	 */
94
95	uint8_t vli[7];
96	char count;
97};
98
99/*
100 * this is the main trie in-memory allocation object
101 */
102
103struct lws_fts_entry {
104	struct lws_fts_entry *parent;
105
106	struct lws_fts_entry *child_list;
107	struct lws_fts_entry *sibling;
108
109	/*
110	 * care... this points to content in t->lwsac_input_head, it goes
111	 * out of scope when the input file being indexed completes
112	 */
113	struct lws_fts_instance_file *inst_file_list;
114
115	jg2_file_offset ofs_last_inst_file;
116
117	char *suffix; /* suffix string or NULL if one char (in .c) */
118	jg2_file_offset ofs;
119	uint32_t child_count;
120	uint32_t instance_count;
121	uint32_t agg_inst_count;
122	uint32_t agg_child_count;
123	uint32_t suffix_len;
124	unsigned char c;
125};
126
127/* there's only one of these per trie file */
128
129struct lws_fts {
130	struct lwsac *lwsac_head;
131	struct lwsac *lwsac_input_head;
132	struct lws_fts_entry *root;
133	struct lws_fts_filepath *filepath_list;
134	struct lws_fts_filepath *fp;
135
136	struct lws_fts_entry *parser;
137	struct lws_fts_entry *root_lookup[256];
138
139	/*
140	 * head of linked-list of tifs generated for current file
141	 * care... this points to content in t->lwsac_input_head
142	 */
143	struct lws_fts_instance_file *tif_list;
144
145	jg2_file_offset c; /* length of output file so far */
146
147	uint64_t agg_trie_creation_us;
148	uint64_t agg_raw_input;
149	uint64_t worst_lwsac_input_size;
150	int last_file_index;
151	int chars_in_line;
152	jg2_file_offset last_block_len_ofs;
153	int line_number;
154	int lines_in_unsealed_linetable;
155	int next_file_index;
156	int count_entries;
157
158	int fd;
159	unsigned int agg_pos;
160	unsigned int str_match_pos;
161
162	unsigned char aggregate;
163	unsigned char agg[128];
164};
165
166/* since the kernel case allocates >300MB, no point keeping this too low */
167
168#define TRIE_LWSAC_BLOCK_SIZE (1024 * 1024)
169
170#define spill(margin, force) \
171	if (bp && ((uint32_t)bp >= (sizeof(buf) - (size_t)(margin)) || (force))) { \
172		if ((int)write(t->fd, buf, (size_t)bp) != bp) { \
173			lwsl_err("%s: write %d failed (%d)\n", __func__, \
174				 bp, errno); \
175			return 1; \
176		} \
177		t->c += (unsigned int)bp; \
178		bp = 0; \
179	}
180
181static int
182g32(unsigned char *b, uint32_t d)
183{
184	*b++ = (uint8_t)((d >> 24) & 0xff);
185	*b++ = (uint8_t)((d >> 16) & 0xff);
186	*b++ = (uint8_t)((d >> 8) & 0xff);
187	*b = (uint8_t)(d & 0xff);
188
189	return 4;
190}
191
192static int
193g16(unsigned char *b, int d)
194{
195	*b++ = (uint8_t)((d >> 8) & 0xff);
196	*b = (uint8_t)(d & 0xff);
197
198	return 2;
199}
200
201static int
202wq32(unsigned char *b, uint32_t d)
203{
204	unsigned char *ob = b;
205
206	if (d > (1 << 28) - 1)
207		*b++ = (uint8_t)(((d >> 28) | 0x80) & 0xff);
208
209	if (d > (1 << 21) - 1)
210		*b++ = (uint8_t)(((d >> 21) | 0x80) & 0xff);
211
212	if (d > (1 << 14) - 1)
213		*b++ = (uint8_t)(((d >> 14) | 0x80) & 0xff);
214
215	if (d > (1 << 7) - 1)
216		*b++ = (uint8_t)(((d >> 7) | 0x80) & 0xff);
217
218	*b++ = (uint8_t)(d & 0x7f);
219
220	return lws_ptr_diff(b, ob);
221}
222
223
224/* read a VLI, return the number of bytes used */
225
226int
227rq32(unsigned char *b, uint32_t *d)
228{
229	unsigned char *ob = b;
230	uint32_t t = 0;
231
232	t = *b & 0x7f;
233	if (*(b++) & 0x80) {
234		t = (t << 7) | (*b & 0x7f);
235		if (*(b++) & 0x80) {
236			t = (t << 7) | (*b & 0x7f);
237			if (*(b++) & 0x80) {
238				t = (t << 7) | (*b & 0x7f);
239				if (*(b++) & 0x80) {
240					t = (t << 7) | (*b & 0x7f);
241					b++;
242				}
243			}
244		}
245	}
246
247	*d = t;
248
249	return (int)(b - ob);
250}
251
252struct lws_fts *
253lws_fts_create(int fd)
254{
255	struct lws_fts *t;
256	struct lwsac *lwsac_head = NULL;
257	unsigned char buf[TRIE_FILE_HDR_SIZE];
258
259	t = lwsac_use(&lwsac_head, sizeof(*t), TRIE_LWSAC_BLOCK_SIZE);
260	if (!t)
261		return NULL;
262
263	memset(t, 0, sizeof(*t));
264
265	t->fd = fd;
266	t->lwsac_head = lwsac_head;
267	t->root = lwsac_use(&lwsac_head, sizeof(*t->root),
268			    TRIE_LWSAC_BLOCK_SIZE);
269	if (!t->root)
270		goto unwind;
271
272	memset(t->root, 0, sizeof(*t->root));
273	t->parser = t->root;
274	t->last_file_index = -1;
275	t->line_number = 1;
276	t->filepath_list = NULL;
277
278	memset(t->root_lookup, 0, sizeof(*t->root_lookup));
279
280	/* write the header */
281
282	buf[0] = 0xca;
283	buf[1] = 0x7a;
284	buf[2] = 0x5f;
285	buf[3] = 0x75;
286
287	/* (these are filled in with correct data at the end) */
288
289	/* file offset to root trie entry */
290	g32(&buf[4], 0);
291	/* file length when it was created */
292	g32(&buf[8], 0);
293	/* fileoffset to the filepath table */
294	g32(&buf[0xc], 0);
295	/* count of filepaths */
296	g32(&buf[0x10], 0);
297
298	if (write(t->fd, buf, TRIE_FILE_HDR_SIZE) != TRIE_FILE_HDR_SIZE) {
299		lwsl_err("%s: trie header write failed\n", __func__);
300		goto unwind;
301	}
302
303	t->c = TRIE_FILE_HDR_SIZE;
304
305	return t;
306
307unwind:
308	lwsac_free(&lwsac_head);
309
310	return NULL;
311}
312
313void
314lws_fts_destroy(struct lws_fts **trie)
315{
316	struct lwsac *lwsac_head = (*trie)->lwsac_head;
317	lwsac_free(&(*trie)->lwsac_input_head);
318	lwsac_free(&lwsac_head);
319	*trie = NULL;
320}
321
322int
323lws_fts_file_index(struct lws_fts *t, const char *filepath, int filepath_len,
324		    int priority)
325{
326	struct lws_fts_filepath *fp = t->filepath_list;
327#if 0
328	while (fp) {
329		if (fp->filepath_len == filepath_len &&
330		    !strcmp(fp->filepath, filepath))
331			return fp->file_index;
332
333		fp = fp->next;
334	}
335#endif
336	fp = lwsac_use(&t->lwsac_head, sizeof(*fp), TRIE_LWSAC_BLOCK_SIZE);
337	if (!fp)
338		return -1;
339
340	fp->next = t->filepath_list;
341	t->filepath_list = fp;
342	strncpy(fp->filepath, filepath, sizeof(fp->filepath) - 1);
343	fp->filepath[sizeof(fp->filepath) - 1] = '\0';
344	fp->filepath_len = filepath_len;
345	fp->file_index = t->next_file_index++;
346	fp->line_table_ofs = t->c;
347	fp->priority = priority;
348	fp->total_lines = 0;
349	t->fp = fp;
350
351	return fp->file_index;
352}
353
354static struct lws_fts_entry *
355lws_fts_entry_child_add(struct lws_fts *t, unsigned char c,
356			struct lws_fts_entry *parent)
357{
358	struct lws_fts_entry *e, **pe;
359
360	e = lwsac_use(&t->lwsac_head, sizeof(*e), TRIE_LWSAC_BLOCK_SIZE);
361	if (!e)
362		return NULL;
363
364	memset(e, 0, sizeof(*e));
365
366	e->c = c;
367	parent->child_count++;
368	e->parent = parent;
369	t->count_entries++;
370
371	/* keep the parent child list in ascending sort order for c */
372
373	pe = &parent->child_list;
374	while (*pe) {
375		assert((*pe)->parent == parent);
376		if ((*pe)->c > c) {
377			/* add it before */
378			e->sibling = *pe;
379			*pe = e;
380			break;
381		}
382		pe = &(*pe)->sibling;
383	}
384
385	if (!*pe) {
386		/* add it at the end */
387		e->sibling = NULL;
388		*pe = e;
389	}
390
391	return e;
392}
393
394static int
395finalize_per_input(struct lws_fts *t)
396{
397	struct lws_fts_instance_file *tif;
398	unsigned char buf[8192];
399	uint64_t lwsac_input_size;
400	jg2_file_offset temp;
401	int bp = 0;
402
403	bp += g16(&buf[bp], 0);
404	bp += g16(&buf[bp], 0);
405	bp += g32(&buf[bp], 0);
406	if ((int)write(t->fd, buf, (size_t)bp) != bp)
407		return 1;
408	t->c += (unsigned int)bp;
409	bp = 0;
410
411	/*
412	 * Write the generated file index + instances (if any)
413	 *
414	 * Notice the next same-parent file instance fileoffset list is
415	 * backwards, so it does not require seeks to fill in.  The first
416	 * entry has 0 but the second entry points to the first entry (whose
417	 * fileoffset is known).
418	 *
419	 * After all the file instance structs are finalized,
420	 * .ofs_last_inst_file contains the fileoffset of that child's tif
421	 * list head in the file.
422	 *
423	 * The file instances are written to disk in the order that the files
424	 * were indexed, along with their prev pointers inline.
425	 */
426
427	tif = t->tif_list;
428	while (tif) {
429		struct lws_fts_lines *i;
430
431		spill((3 * MAX_VLI) + tif->count, 0);
432
433		temp = tif->owner->ofs_last_inst_file;
434		if (tif->total)
435			tif->owner->ofs_last_inst_file = t->c + (unsigned int)bp;
436
437		assert(!temp || (temp > TRIE_FILE_HDR_SIZE && temp < t->c));
438
439		/* fileoffset of prev instance file for this entry, or 0 */
440		bp += wq32(&buf[bp], temp);
441		bp += wq32(&buf[bp], tif->file_index);
442		bp += wq32(&buf[bp], tif->total);
443
444		/* remove any pointers into this disposable lac footprint */
445		tif->owner->inst_file_list = NULL;
446
447		memcpy(&buf[bp], &tif->vli, (size_t)tif->count);
448		bp += tif->count;
449
450		i = tif->lines_list;
451		while (i) {
452			spill(i->count, 0);
453			memcpy(&buf[bp], &i->vli, (size_t)i->count);
454			bp += i->count;
455
456			i = i->lines_next;
457		}
458
459		tif = tif->inst_file_next;
460	}
461
462	spill(0, 1);
463
464	assert(lseek(t->fd, 0, SEEK_END) == (off_t)t->c);
465
466	if (t->lwsac_input_head) {
467		lwsac_input_size = lwsac_total_alloc(t->lwsac_input_head);
468		if (lwsac_input_size > t->worst_lwsac_input_size)
469			t->worst_lwsac_input_size = lwsac_input_size;
470	}
471
472	/*
473	 * those per-file allocations are all on a separate lac so we can
474	 * free it cleanly afterwards
475	 */
476	lwsac_free(&t->lwsac_input_head);
477
478	/* and lose the pointer into the deallocated lac */
479	t->tif_list = NULL;
480
481	return 0;
482}
483
484/*
485 * 0 = punctuation, whitespace, brackets etc
486 * 1 = character inside symbol set
487 * 2 = upper-case character inside symbol set
488 */
489
490static char classify[] = {
491	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
492	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
493	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
494	1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
495	0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
496	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 1, //1,
497	0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
498	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
499	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
500	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
501	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
502	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
503	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
504	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
505	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
506	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
507};
508
509#if 0
510static const char *
511name_entry(struct lws_fts_entry *e1, char *s, int len)
512{
513	struct lws_fts_entry *e2;
514	int n = len;
515
516	s[--n] = '\0';
517
518	e2 = e1;
519	while (e2) {
520		if (e2->suffix) {
521			if ((int)e2->suffix_len < n) {
522				n -= e2->suffix_len;
523				memcpy(&s[n], e2->suffix, e2->suffix_len);
524			}
525		} else {
526			n--;
527			s[n] = e2->c;
528		}
529
530		e2 = e2->parent;
531	}
532
533	return &s[n + 1];
534}
535#endif
536
537/*
538 * as we parse the input, we create a line length table for the file index.
539 * Only the file header has been written before we start doing this.
540 */
541
542int
543lws_fts_fill(struct lws_fts *t, uint32_t file_index, const char *buf,
544	     size_t len)
545{
546	unsigned long long tf = (unsigned long long)lws_now_usecs();
547	unsigned char c, linetable[256], vlibuf[8];
548	struct lws_fts_entry *e, *e1, *dcl;
549	struct lws_fts_instance_file *tif;
550	int bp = 0, sline, chars, m;
551	char *osuff, skipline = 0;
552	struct lws_fts_lines *tl;
553	unsigned int olen, n;
554	off_t lbh;
555
556	if ((int)file_index != t->last_file_index) {
557		if (t->last_file_index >= 0)
558			finalize_per_input(t);
559		t->last_file_index = (int)file_index;
560		t->line_number = 1;
561		t->chars_in_line = 0;
562		t->lines_in_unsealed_linetable = 0;
563	}
564
565	t->agg_raw_input += len;
566
567resume:
568
569	chars = 0;
570	lbh = (off_t)t->c;
571	sline = t->line_number;
572	bp += g16(&linetable[bp], 0);
573	bp += g16(&linetable[bp], 0);
574	bp += g32(&linetable[bp], 0);
575
576	while (len) {
577		char go_around = 0;
578
579		if (t->lines_in_unsealed_linetable >= LWS_FTS_LINES_PER_CHUNK)
580			break;
581
582		len--;
583
584		c = (unsigned char)*buf++;
585		t->chars_in_line++;
586		if (c == '\n') {
587			skipline = 0;
588			t->filepath_list->total_lines++;
589			t->lines_in_unsealed_linetable++;
590			t->line_number++;
591
592			bp += wq32(&linetable[bp], (uint32_t)t->chars_in_line);
593			if ((unsigned int)bp > sizeof(linetable) - 6) {
594				if ((int)write(t->fd, linetable, (unsigned int)bp) != bp) {
595					lwsl_err("%s: linetable write failed\n",
596							__func__);
597					return 1;
598				}
599				t->c += (unsigned int)bp;
600				bp = 0;
601				// assert(lseek(t->fd, 0, SEEK_END) == t->c);
602			}
603
604			chars += t->chars_in_line;
605			t->chars_in_line = 0;
606
607			/*
608			 * Detect overlength lines and skip them (eg, BASE64
609			 * in css etc)
610			 */
611
612			if (len > 200) {
613				n = 0;
614				m = 0;
615				while (n < 200 && m < 80 && buf[n] != '\n') {
616				       if (buf[n] == ' ' || buf[n] == '\t')
617					       m = 0;
618					n++;
619					m++;
620				}
621
622				/* 80 lines no whitespace, or >=200-char line */
623
624				if (m == 80 || n == 200)
625					skipline = 1;
626			}
627
628			goto seal;
629		}
630		if (skipline)
631			continue;
632
633		m = classify[(int)c];
634		if (!m)
635			goto seal;
636		if (m == 2)
637			c = (unsigned char)((char)c + 'a' - 'A');
638
639		if (t->aggregate) {
640
641			/*
642			 * We created a trie entry for an earlier char in this
643			 * symbol already.  So we know at the moment, any
644			 * further chars in the symbol are the only children.
645			 *
646			 * Aggregate them and add them as a string suffix to
647			 * the trie symbol at the end (when we know how much to
648			 * allocate).
649			 */
650
651			if (t->agg_pos < sizeof(t->agg) - 1)
652				/* symbol is not too long to stash */
653				t->agg[t->agg_pos++] = c;
654
655			continue;
656		}
657
658		if (t->str_match_pos) {
659			go_around = 1;
660			goto seal;
661		}
662
663		/* zeroth-iteration child matching */
664
665		if (t->parser == t->root) {
666			e = t->root_lookup[(int)c];
667			if (e) {
668				t->parser = e;
669				continue;
670			}
671		} else {
672
673			/* look for the char amongst the children */
674
675			e = t->parser->child_list;
676			while (e) {
677
678				/* since they're alpha ordered... */
679				if (e->c > c) {
680					e = NULL;
681					break;
682				}
683				if (e->c == c) {
684					t->parser = e;
685
686					if (e->suffix)
687						t->str_match_pos = 1;
688
689					break;
690				}
691
692				e = e->sibling;
693			}
694
695			if (e)
696				continue;
697		}
698
699		/*
700		 * we are blazing a new trail, add a new child representing
701		 * the whole suffix that couldn't be matched until now.
702		 */
703
704		e = lws_fts_entry_child_add(t, c, t->parser);
705		if (!e) {
706			lwsl_err("%s: lws_fts_entry_child_add failed\n",
707					__func__);
708			return 1;
709		}
710
711		/* if it's the root node, keep the root_lookup table in sync */
712
713		if (t->parser == t->root)
714			t->root_lookup[(int)c] = e;
715
716		/* follow the new path */
717		t->parser = e;
718
719		{
720			struct lws_fts_entry **pe = &e->child_list;
721			while (*pe) {
722				assert((*pe)->parent == e);
723
724				pe = &(*pe)->sibling;
725			}
726		}
727
728		/*
729		 * If there are any more symbol characters coming, just
730		 * create a suffix string on t->parser instead of what must
731		 * currently be single-child nodes, since we just created e
732		 * as a child with a single character due to no existing match
733		 * on that single character... so if no match on 'h' with this
734		 * guy's parent, we created e that matches on the single char
735		 * 'h'.  If the symbol continues ... 'a' 'p' 'p' 'y', then
736		 * instead of creating singleton child nodes under e,
737		 * modify e to match on the whole string suffix "happy".
738		 *
739		 * If later "hoppy" appears, we will remove the suffix on e,
740		 * so it reverts to a char match for 'h', add singleton children
741		 * for 'a' and 'o', and attach a "ppy" suffix child to each of
742		 * those.
743		 *
744		 * We want to do this so we don't have to allocate trie entries
745		 * for every char in the string to save memory and consequently
746		 * time.
747		 *
748		 * Don't try this optimization if the parent is the root node...
749		 * it's not compatible with it's root_lookup table and it's
750		 * highly likely children off the root entry are going to have
751		 * to be fragmented.
752		 */
753
754		if (e->parent != t->root) {
755			t->aggregate = 1;
756			t->agg_pos = 0;
757		}
758
759		continue;
760
761seal:
762		if (t->str_match_pos) {
763
764			/*
765			 * We're partway through matching an elaborated string
766			 * on a child, not just a character.  String matches
767			 * only exist when we met a child entry that only had
768			 * one path until now... so we had an 'h', and the
769			 * only child had a string "hello".
770			 *
771			 * We are following the right path and will not need
772			 * to back up, but we may find as we go we have the
773			 * first instance of a second child path, eg, "help".
774			 *
775			 * When we get to the 'p', we have to split what was
776			 * the only string option "hello" into "hel" and then
777			 * two child entries, for "lo" and 'p'.
778			 */
779
780			if (c == t->parser->suffix[t->str_match_pos++]) {
781				if (t->str_match_pos < t->parser->suffix_len)
782					continue;
783
784				/*
785				 * We simply matched everything, continue
786				 * parsing normally from this trie entry.
787				 */
788
789				t->str_match_pos = 0;
790				continue;
791			}
792
793			/*
794			 * So... we hit a mismatch somewhere... it means we
795			 * have to split this string entry.
796			 *
797			 * We know the first char actually matched in order to
798			 * start down this road.  So for the current trie entry,
799			 * we need to truncate his suffix at the char before
800			 * this mismatched one, where we diverged (if the
801			 * second char, simply remove the suffix string from the
802			 * current trie entry to turn it back to a 1-char match)
803			 *
804			 * The original entry, which becomes the lhs post-split,
805			 * is t->parser.
806			 */
807
808			olen = t->parser->suffix_len;
809			osuff = t->parser->suffix;
810
811			if (t->str_match_pos == 2)
812				t->parser->suffix = NULL;
813			else
814				t->parser->suffix_len = t->str_match_pos - 1;
815
816			/*
817			 * Then we need to create a new child trie entry that
818			 * represents the remainder of the original string
819			 * path that we didn't match.  For the "hello" /
820			 * "help" case, this guy will have "lo".
821			 *
822			 * Any instances or children (not siblings...) that were
823			 * attached to the original trie entry must be detached
824			 * first and then migrate to this new guy that completes
825			 * the original string.
826			 */
827
828			dcl = t->parser->child_list;
829			m = (int)t->parser->child_count;
830
831			t->parser->child_list = NULL;
832			t->parser->child_count = 0;
833
834			e = lws_fts_entry_child_add(t, (unsigned char)
835					osuff[t->str_match_pos - 1], t->parser);
836			if (!e) {
837				lwsl_err("%s: lws_fts_entry_child_add fail1\n",
838						__func__);
839				return 1;
840			}
841
842			e->child_list = dcl;
843			e->child_count = (uint32_t)m;
844			/*
845			 * any children we took over must point to us as the
846			 * parent now they appear on our child list
847			 */
848			e1 = e->child_list;
849			while (e1) {
850				e1->parent = e;
851				e1 = e1->sibling;
852			}
853
854			/*
855			 * We detached any children, gave them to the new guy
856			 * and replaced them with just our new guy
857			 */
858			t->parser->child_count = 1;
859			t->parser->child_list = e;
860
861			/*
862			 * any instances that belonged to the original entry we
863			 * are splitting now must be reassigned to the end
864			 * part
865			 */
866
867			e->inst_file_list = t->parser->inst_file_list;
868			if (e->inst_file_list)
869				e->inst_file_list->owner = e;
870			t->parser->inst_file_list = NULL;
871			e->instance_count = t->parser->instance_count;
872			t->parser->instance_count = 0;
873
874			e->ofs_last_inst_file = t->parser->ofs_last_inst_file;
875			t->parser->ofs_last_inst_file = 0;
876
877			if (t->str_match_pos != olen) {
878				/* we diverged partway */
879				e->suffix = &osuff[t->str_match_pos - 1];
880				e->suffix_len = olen - (t->str_match_pos - 1);
881			}
882
883			/*
884			 * if the current char is a terminal, skip creating a
885			 * new way forward.
886			 */
887
888			if (classify[(int)c]) {
889
890				/*
891				 * Lastly we need to create a new child trie
892				 * entry that represents the new way forward
893				 * from the point that we diverged.  For the
894				 * "hello" / "help" case, this guy will start
895				 * as a child of "hel" with the single
896				 * character match 'p'.
897				 *
898				 * Since he becomes the current parser context,
899				 * more symbol characters may be coming to make
900				 * him into, eg, "helping", in which case he
901				 * will acquire a suffix eventually of "ping"
902				 * via the aggregation stuff
903				 */
904
905				e = lws_fts_entry_child_add(t, c, t->parser);
906				if (!e) {
907					lwsl_err("%s: child_add fail2\n",
908						 __func__);
909					return 1;
910				}
911			}
912
913			/* go on following this path */
914			t->parser = e;
915
916			t->aggregate = 1;
917			t->agg_pos = 0;
918
919			t->str_match_pos = 0;
920
921			if (go_around)
922				continue;
923
924			/* this is intended to be a seal */
925		}
926
927
928		/* end of token */
929
930		if (t->aggregate && t->agg_pos) {
931
932			/* if nothing in agg[]: leave as single char match */
933
934			/* otherwise copy out the symbol aggregation */
935			t->parser->suffix = lwsac_use(&t->lwsac_head,
936						    t->agg_pos + 1,
937						    TRIE_LWSAC_BLOCK_SIZE);
938			if (!t->parser->suffix) {
939				lwsl_err("%s: lac for suffix failed\n",
940						__func__);
941				return 1;
942			}
943
944			/* add the first char at the beginning */
945			*t->parser->suffix = (char)t->parser->c;
946			/* and then add the agg buffer stuff */
947			memcpy(t->parser->suffix + 1, t->agg, t->agg_pos);
948			t->parser->suffix_len = t->agg_pos + 1;
949		}
950		t->aggregate = 0;
951
952		if (t->parser == t->root) /* multiple terminal chars */
953			continue;
954
955		if (!t->parser->inst_file_list ||
956		    t->parser->inst_file_list->file_index != file_index) {
957			tif = lwsac_use(&t->lwsac_input_head, sizeof(*tif),
958				      TRIE_LWSAC_BLOCK_SIZE);
959			if (!tif) {
960				lwsl_err("%s: lac for tif failed\n",
961						__func__);
962				return 1;
963			}
964
965			tif->file_index = file_index;
966			tif->owner = t->parser;
967			tif->lines_list = NULL;
968			tif->lines_tail = NULL;
969			tif->total = 0;
970			tif->count = 0;
971			tif->inst_file_next = t->tif_list;
972			t->tif_list = tif;
973
974			t->parser->inst_file_list = tif;
975		}
976
977		/*
978		 * A naive allocation strategy for this leads to 50% of the
979		 * total inmem lac allocation being for line numbers...
980		 *
981		 * It's mainly solved by only holding the instance and line
982		 * number tables for the duration of a file being input, as soon
983		 * as one input file is finished it is written to disk.
984		 *
985		 * For the common case of 1 - ~3 matches the line number are
986		 * stored in a small VLI array inside the filepath inst.  If the
987		 * next one won't fit, it allocates a line number struct with
988		 * more vli space and continues chaining those if needed.
989		 */
990
991		n = (unsigned int)wq32(vlibuf, (uint32_t)t->line_number);
992		tif = t->parser->inst_file_list;
993
994		if (!tif->lines_list) {
995			/* we are still trying to use the file inst vli */
996			if (LWS_ARRAY_SIZE(tif->vli) - (size_t)tif->count >= n) {
997				tif->count = (char)((char)tif->count + (char)wq32(tif->vli + tif->count,
998						   (uint32_t)t->line_number));
999				goto after;
1000			}
1001			/* we are going to have to allocate */
1002		}
1003
1004		/* can we add to an existing line numbers struct? */
1005		if (tif->lines_tail &&
1006		    LWS_ARRAY_SIZE(tif->lines_tail->vli) -
1007				(unsigned char)tif->lines_tail->count >= n) {
1008			tif->lines_tail->count = (char)((char)tif->lines_tail->count + (char)wq32(tif->lines_tail->vli +
1009						       tif->lines_tail->count,
1010						       (uint32_t)t->line_number));
1011			goto after;
1012		}
1013
1014		/* either no existing line numbers struct at tail, or full */
1015
1016		/* have to create a(nother) line numbers struct */
1017		tl = lwsac_use(&t->lwsac_input_head, sizeof(*tl),
1018			     TRIE_LWSAC_BLOCK_SIZE);
1019		if (!tl) {
1020			lwsl_err("%s: lac for tl failed\n", __func__);
1021			return 1;
1022		}
1023		tl->lines_next = NULL;
1024		if (tif->lines_tail)
1025			tif->lines_tail->lines_next = tl;
1026
1027		tif->lines_tail = tl;
1028		if (!tif->lines_list)
1029			tif->lines_list = tl;
1030
1031		tl->count = (char)wq32(tl->vli, (uint32_t)t->line_number);
1032after:
1033		tif->total++;
1034#if 0
1035		{
1036			char s[128];
1037			const char *ne = name_entry(t->parser, s, sizeof(s));
1038
1039			if (!strcmp(ne, "describ")) {
1040				lwsl_err("     %s %d\n", ne, t->str_match_pos);
1041				write(1, buf - 10, 20);
1042			}
1043		}
1044#endif
1045		t->parser->instance_count++;
1046		t->parser = t->root;
1047		t->str_match_pos = 0;
1048	}
1049
1050	/* seal off the line length table block */
1051
1052	if (bp) {
1053		if ((int)write(t->fd, linetable, (size_t)bp) != bp)
1054			return 1;
1055		t->c += (unsigned int)bp;
1056		bp = 0;
1057	}
1058
1059	if (lseek(t->fd, lbh, SEEK_SET) < 0) {
1060		lwsl_err("%s: seek to 0x%llx failed\n", __func__,
1061				(unsigned long long)lbh);
1062		return 1;
1063	}
1064
1065	g16(linetable, (uint16_t)(t->c - (jg2_file_offset)lbh));
1066	g16(linetable + 2, (uint16_t)(t->line_number - sline));
1067	g32(linetable + 4, (uint32_t)chars);
1068	if ((int)write(t->fd, linetable, 8) != 8) {
1069		lwsl_err("%s: write linetable header failed\n", __func__);
1070		return 1;
1071	}
1072
1073	assert(lseek(t->fd, 0, SEEK_END) == (off_t)t->c);
1074
1075	if (lseek(t->fd, (off_t)t->c, SEEK_SET) < 0) {
1076		lwsl_err("%s: end seek failed\n", __func__);
1077		return 1;
1078	}
1079
1080	bp = 0;
1081
1082	if (len) {
1083		t->lines_in_unsealed_linetable = 0;
1084		goto resume;
1085	}
1086
1087	/* dump the collected per-input instance and line data, and free it */
1088
1089	t->agg_trie_creation_us += (uint64_t)((uint64_t)lws_now_usecs() - tf);
1090
1091	return 0;
1092}
1093
1094/* refer to ./README.md */
1095
1096int
1097lws_fts_serialize(struct lws_fts *t)
1098{
1099	struct lws_fts_filepath *fp = t->filepath_list, *ofp;
1100	unsigned long long tf = (unsigned long long)lws_now_usecs();
1101	struct lws_fts_entry *e, *e1, *s[256];
1102	unsigned char buf[8192], stasis;
1103	int n, bp, sp = 0, do_parent;
1104
1105	(void)tf;
1106	finalize_per_input(t);
1107
1108	/*
1109	 * Compute aggregated instance counts (parents should know the total
1110	 * number of instances below each child path)
1111	 *
1112	 *
1113	 * If we have
1114	 *
1115	 * (root) -> (c1) -> (c2)
1116	 *        -> (c3)
1117	 *
1118	 * we need to visit the nodes in the order
1119	 *
1120	 * c2, c1, c3, root
1121	 */
1122
1123	sp = 0;
1124	s[0] = t->root;
1125	do_parent = 0;
1126	while (sp >= 0) {
1127		int n;
1128
1129		/* aggregate in every antecedent */
1130
1131		for (n = 0; n <= sp; n++) {
1132			s[n]->agg_inst_count += s[sp]->instance_count;
1133			s[n]->agg_child_count += s[sp]->child_count;
1134		}
1135
1136		/* handle any children before the parent */
1137
1138		if (s[sp]->child_list) {
1139			if (sp + 1 == LWS_ARRAY_SIZE(s)) {
1140				lwsl_err("Stack too deep\n");
1141
1142				goto bail;
1143			}
1144
1145			s[sp + 1] = s[sp]->child_list;
1146			sp++;
1147			continue;
1148		}
1149
1150		do {
1151			if (s[sp]->sibling) {
1152				s[sp] = s[sp]->sibling;
1153				break;
1154			} else
1155				sp--;
1156		} while (sp >= 0);
1157	}
1158
1159	/* dump the filepaths and set prev */
1160
1161	fp = t->filepath_list;
1162	ofp = NULL;
1163	bp = 0;
1164	while (fp) {
1165
1166		fp->ofs = t->c + (unsigned int)bp;
1167		n = (int)strlen(fp->filepath);
1168		spill(15 + n, 0);
1169
1170		bp += wq32(&buf[bp], fp->line_table_ofs);
1171		bp += wq32(&buf[bp], (uint32_t)fp->total_lines);
1172		bp += wq32(&buf[bp], (uint32_t)n);
1173		memcpy(&buf[bp], fp->filepath, (unsigned int)n);
1174		bp += n;
1175
1176		fp->prev = ofp;
1177		ofp = fp;
1178		fp = fp->next;
1179	}
1180
1181	spill(0, 1);
1182
1183	/* record the fileoffset of the filepath map and filepath count */
1184
1185	if (lseek(t->fd, 0xc, SEEK_SET) < 0)
1186		goto bail_seek;
1187
1188	g32(buf, t->c + (unsigned int)bp);
1189	g32(buf + 4, (uint32_t)t->next_file_index);
1190	if ((int)write(t->fd, buf, 8) != 8)
1191		goto bail;
1192
1193	if (lseek(t->fd, (off_t)(t->c + (unsigned int)bp), SEEK_SET) < 0)
1194		goto bail_seek;
1195
1196	/* dump the filepath map, starting from index 0, which is at the tail */
1197
1198	fp = ofp;
1199	bp = 0;
1200	while (fp) {
1201		spill(5, 0);
1202		g32(buf + bp, fp->ofs);
1203		bp += 4;
1204		fp = fp->prev;
1205	}
1206	spill(0, 1);
1207
1208	/*
1209	 * The trie entries in reverse order... because of the reversal, we have
1210	 * always written children first, and marked them with their file offset
1211	 * before we come to refer to them.
1212	 */
1213
1214	bp = 0;
1215	sp = 0;
1216	s[0] = t->root;
1217	do_parent = 0;
1218	while (s[sp]) {
1219
1220		/* handle any children before the parent */
1221
1222		if (!do_parent && s[sp]->child_list) {
1223
1224			if (sp + 1 == LWS_ARRAY_SIZE(s)) {
1225				lwsl_err("Stack too deep\n");
1226
1227				goto bail;
1228			}
1229
1230			s[sp + 1] = s[sp]->child_list;
1231			sp++;
1232			continue;
1233		}
1234
1235		/* leaf nodes with no children */
1236
1237		e = s[sp];
1238		e->ofs = t->c + (unsigned int)bp;
1239
1240		/* write the trie entry header */
1241
1242		spill((3 * MAX_VLI), 0);
1243
1244		bp += wq32(&buf[bp], e->ofs_last_inst_file);
1245		bp += wq32(&buf[bp], e->child_count);
1246		bp += wq32(&buf[bp], e->instance_count);
1247		bp += wq32(&buf[bp], e->agg_inst_count);
1248
1249		/* sort the children in order of highest aggregate hits first */
1250
1251		do {
1252			struct lws_fts_entry **pe, *te1, *te2;
1253
1254			stasis = 1;
1255
1256			/* bubble sort keeps going until nothing changed */
1257
1258			pe = &e->child_list;
1259			while (*pe) {
1260
1261				te1 = *pe;
1262				te2 = te1->sibling;
1263
1264				if (te2 && te1->agg_inst_count <
1265					   te2->agg_inst_count) {
1266					stasis = 0;
1267
1268					*pe = te2;
1269					te1->sibling = te2->sibling;
1270					te2->sibling = te1;
1271				}
1272
1273				pe = &(*pe)->sibling;
1274			}
1275
1276		} while (!stasis);
1277
1278		/* write the children */
1279
1280		e1 = e->child_list;
1281		while (e1) {
1282			spill((5 * MAX_VLI) + e1->suffix_len + 1, 0);
1283
1284			bp += wq32(&buf[bp], e1->ofs);
1285			bp += wq32(&buf[bp], e1->instance_count);
1286			bp += wq32(&buf[bp], e1->agg_inst_count);
1287			bp += wq32(&buf[bp], e1->agg_child_count);
1288
1289			if (e1->suffix) { /* string  */
1290				bp += wq32(&buf[bp], e1->suffix_len);
1291				memmove(&buf[bp], e1->suffix, e1->suffix_len);
1292				bp += (int)e1->suffix_len;
1293			} else { /* char */
1294				bp += wq32(&buf[bp], 1);
1295				buf[bp++] = e1->c;
1296			}
1297#if 0
1298			if (e1->suffix && e1->suffix_len == 3 &&
1299			    !memcmp(e1->suffix, "cri", 3)) {
1300				struct lws_fts_entry *e2;
1301
1302				e2 = e1;
1303				while (e2){
1304					if (e2->suffix)
1305						lwsl_notice("%s\n", e2->suffix);
1306					else
1307						lwsl_notice("%c\n", e2->c);
1308
1309					e2 = e2->parent;
1310				}
1311
1312				lwsl_err("*** %c CRI inst %d ch %d\n", e1->parent->c,
1313						e1->instance_count, e1->child_count);
1314			}
1315#endif
1316			e1 = e1->sibling;
1317		}
1318
1319		/* if there are siblings, do those next */
1320
1321		if (do_parent) {
1322			do_parent = 0;
1323			sp--;
1324		}
1325
1326		if (s[sp]->sibling)
1327			s[sp] = s[sp]->sibling;
1328		else {
1329			/* if there are no siblings, do the parent */
1330			do_parent = 1;
1331			s[sp] = s[sp]->parent;
1332		}
1333	}
1334
1335	spill(0, 1);
1336
1337	assert(lseek(t->fd, 0, SEEK_END) == (off_t)t->c);
1338
1339	/* drop the correct root trie offset + file length into the header */
1340
1341	if (lseek(t->fd, 4, SEEK_SET) < 0) {
1342		lwsl_err("%s: unable to seek\n", __func__);
1343
1344		goto bail;
1345	}
1346
1347	g32(buf, t->root->ofs);
1348	g32(buf + 4, t->c);
1349	if (write(t->fd, buf, 0x8) != 0x8)
1350		goto bail;
1351
1352	lwsl_notice("%s: index %d files (%uMiB) cpu time %dms, "
1353		    "alloc: %dKiB + %dKiB, "
1354		    "serialize: %dms, file: %dKiB\n", __func__,
1355		    t->next_file_index,
1356		    (int)(t->agg_raw_input / (1024 * 1024)),
1357		    (int)(t->agg_trie_creation_us / 1000),
1358		    (int)(lwsac_total_alloc(t->lwsac_head) / 1024),
1359		    (int)(t->worst_lwsac_input_size / 1024),
1360		    (int)(((uint64_t)lws_now_usecs() - tf) / 1000),
1361		    (int)(t->c / 1024));
1362
1363	return 0;
1364
1365bail_seek:
1366	lwsl_err("%s: problem seekings\n", __func__);
1367
1368bail:
1369	return 1;
1370}
1371
1372
1373