1 /* 2 * libwebsockets - small server side websockets and web server implementation 3 * 4 * Copyright (C) 2010 - 2019 Andy Green <andy@warmcat.com> 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a copy 7 * of this software and associated documentation files (the "Software"), to 8 * deal in the Software without restriction, including without limitation the 9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 10 * sell copies of the Software, and to permit persons to whom the Software is 11 * furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 22 * IN THE SOFTWARE. 23 * 24 * The functions allow 25 * 26 * - collecting a concordance of strings from one or more files (eg, a 27 * directory of files) into a single in-memory, lac-backed trie; 28 * 29 * - to optimize and serialize the in-memory trie to an fd; 30 * 31 * - to very quickly report any instances of a string in any of the files 32 * indexed by the trie, by a seeking around a serialized trie fd, without 33 * having to load it all in memory 34 */ 35 36#include "private-lib-core.h" 37#include "private-lib-misc-fts.h" 38 39#include <stdio.h> 40#include <string.h> 41#include <assert.h> 42#include <fcntl.h> 43#include <errno.h> 44#include <sys/types.h> 45 46struct lws_fts_entry; 47 48/* notice these are stored in t->lwsac_input_head which has input file scope */ 49 50struct lws_fts_filepath { 51 struct lws_fts_filepath *next; 52 struct lws_fts_filepath *prev; 53 char filepath[256]; 54 jg2_file_offset ofs; 55 jg2_file_offset line_table_ofs; 56 int filepath_len; 57 int file_index; 58 int total_lines; 59 int priority; 60}; 61 62/* notice these are stored in t->lwsac_input_head which has input file scope */ 63 64struct lws_fts_lines { 65 struct lws_fts_lines *lines_next; 66 /* 67 * amount of line numbers needs to meet average count for best 68 * efficiency. 69 * 70 * Line numbers are stored in VLI format since if we don't, around half 71 * the total lac allocation consists of struct lws_fts_lines... 72 * size chosen to maintain 8-byte struct alignment 73 */ 74 uint8_t vli[119]; 75 char count; 76}; 77 78/* this represents the instances of a symbol inside a given filepath */ 79 80struct lws_fts_instance_file { 81 /* linked-list of tifs generated for current file */ 82 struct lws_fts_instance_file *inst_file_next; 83 struct lws_fts_entry *owner; 84 struct lws_fts_lines *lines_list, *lines_tail; 85 uint32_t file_index; 86 uint32_t total; 87 88 /* 89 * optimization for the common case there's only 1 - ~3 matches, so we 90 * don't have to allocate any lws_fts_lines struct 91 * 92 * Using 8 bytes total for this maintains 8-byte struct alignment... 93 */ 94 95 uint8_t vli[7]; 96 char count; 97}; 98 99/* 100 * this is the main trie in-memory allocation object 101 */ 102 103struct lws_fts_entry { 104 struct lws_fts_entry *parent; 105 106 struct lws_fts_entry *child_list; 107 struct lws_fts_entry *sibling; 108 109 /* 110 * care... this points to content in t->lwsac_input_head, it goes 111 * out of scope when the input file being indexed completes 112 */ 113 struct lws_fts_instance_file *inst_file_list; 114 115 jg2_file_offset ofs_last_inst_file; 116 117 char *suffix; /* suffix string or NULL if one char (in .c) */ 118 jg2_file_offset ofs; 119 uint32_t child_count; 120 uint32_t instance_count; 121 uint32_t agg_inst_count; 122 uint32_t agg_child_count; 123 uint32_t suffix_len; 124 unsigned char c; 125}; 126 127/* there's only one of these per trie file */ 128 129struct lws_fts { 130 struct lwsac *lwsac_head; 131 struct lwsac *lwsac_input_head; 132 struct lws_fts_entry *root; 133 struct lws_fts_filepath *filepath_list; 134 struct lws_fts_filepath *fp; 135 136 struct lws_fts_entry *parser; 137 struct lws_fts_entry *root_lookup[256]; 138 139 /* 140 * head of linked-list of tifs generated for current file 141 * care... this points to content in t->lwsac_input_head 142 */ 143 struct lws_fts_instance_file *tif_list; 144 145 jg2_file_offset c; /* length of output file so far */ 146 147 uint64_t agg_trie_creation_us; 148 uint64_t agg_raw_input; 149 uint64_t worst_lwsac_input_size; 150 int last_file_index; 151 int chars_in_line; 152 jg2_file_offset last_block_len_ofs; 153 int line_number; 154 int lines_in_unsealed_linetable; 155 int next_file_index; 156 int count_entries; 157 158 int fd; 159 unsigned int agg_pos; 160 unsigned int str_match_pos; 161 162 unsigned char aggregate; 163 unsigned char agg[128]; 164}; 165 166/* since the kernel case allocates >300MB, no point keeping this too low */ 167 168#define TRIE_LWSAC_BLOCK_SIZE (1024 * 1024) 169 170#define spill(margin, force) \ 171 if (bp && ((uint32_t)bp >= (sizeof(buf) - (size_t)(margin)) || (force))) { \ 172 if ((int)write(t->fd, buf, (size_t)bp) != bp) { \ 173 lwsl_err("%s: write %d failed (%d)\n", __func__, \ 174 bp, errno); \ 175 return 1; \ 176 } \ 177 t->c += (unsigned int)bp; \ 178 bp = 0; \ 179 } 180 181static int 182g32(unsigned char *b, uint32_t d) 183{ 184 *b++ = (uint8_t)((d >> 24) & 0xff); 185 *b++ = (uint8_t)((d >> 16) & 0xff); 186 *b++ = (uint8_t)((d >> 8) & 0xff); 187 *b = (uint8_t)(d & 0xff); 188 189 return 4; 190} 191 192static int 193g16(unsigned char *b, int d) 194{ 195 *b++ = (uint8_t)((d >> 8) & 0xff); 196 *b = (uint8_t)(d & 0xff); 197 198 return 2; 199} 200 201static int 202wq32(unsigned char *b, uint32_t d) 203{ 204 unsigned char *ob = b; 205 206 if (d > (1 << 28) - 1) 207 *b++ = (uint8_t)(((d >> 28) | 0x80) & 0xff); 208 209 if (d > (1 << 21) - 1) 210 *b++ = (uint8_t)(((d >> 21) | 0x80) & 0xff); 211 212 if (d > (1 << 14) - 1) 213 *b++ = (uint8_t)(((d >> 14) | 0x80) & 0xff); 214 215 if (d > (1 << 7) - 1) 216 *b++ = (uint8_t)(((d >> 7) | 0x80) & 0xff); 217 218 *b++ = (uint8_t)(d & 0x7f); 219 220 return lws_ptr_diff(b, ob); 221} 222 223 224/* read a VLI, return the number of bytes used */ 225 226int 227rq32(unsigned char *b, uint32_t *d) 228{ 229 unsigned char *ob = b; 230 uint32_t t = 0; 231 232 t = *b & 0x7f; 233 if (*(b++) & 0x80) { 234 t = (t << 7) | (*b & 0x7f); 235 if (*(b++) & 0x80) { 236 t = (t << 7) | (*b & 0x7f); 237 if (*(b++) & 0x80) { 238 t = (t << 7) | (*b & 0x7f); 239 if (*(b++) & 0x80) { 240 t = (t << 7) | (*b & 0x7f); 241 b++; 242 } 243 } 244 } 245 } 246 247 *d = t; 248 249 return (int)(b - ob); 250} 251 252struct lws_fts * 253lws_fts_create(int fd) 254{ 255 struct lws_fts *t; 256 struct lwsac *lwsac_head = NULL; 257 unsigned char buf[TRIE_FILE_HDR_SIZE]; 258 259 t = lwsac_use(&lwsac_head, sizeof(*t), TRIE_LWSAC_BLOCK_SIZE); 260 if (!t) 261 return NULL; 262 263 memset(t, 0, sizeof(*t)); 264 265 t->fd = fd; 266 t->lwsac_head = lwsac_head; 267 t->root = lwsac_use(&lwsac_head, sizeof(*t->root), 268 TRIE_LWSAC_BLOCK_SIZE); 269 if (!t->root) 270 goto unwind; 271 272 memset(t->root, 0, sizeof(*t->root)); 273 t->parser = t->root; 274 t->last_file_index = -1; 275 t->line_number = 1; 276 t->filepath_list = NULL; 277 278 memset(t->root_lookup, 0, sizeof(*t->root_lookup)); 279 280 /* write the header */ 281 282 buf[0] = 0xca; 283 buf[1] = 0x7a; 284 buf[2] = 0x5f; 285 buf[3] = 0x75; 286 287 /* (these are filled in with correct data at the end) */ 288 289 /* file offset to root trie entry */ 290 g32(&buf[4], 0); 291 /* file length when it was created */ 292 g32(&buf[8], 0); 293 /* fileoffset to the filepath table */ 294 g32(&buf[0xc], 0); 295 /* count of filepaths */ 296 g32(&buf[0x10], 0); 297 298 if (write(t->fd, buf, TRIE_FILE_HDR_SIZE) != TRIE_FILE_HDR_SIZE) { 299 lwsl_err("%s: trie header write failed\n", __func__); 300 goto unwind; 301 } 302 303 t->c = TRIE_FILE_HDR_SIZE; 304 305 return t; 306 307unwind: 308 lwsac_free(&lwsac_head); 309 310 return NULL; 311} 312 313void 314lws_fts_destroy(struct lws_fts **trie) 315{ 316 struct lwsac *lwsac_head = (*trie)->lwsac_head; 317 lwsac_free(&(*trie)->lwsac_input_head); 318 lwsac_free(&lwsac_head); 319 *trie = NULL; 320} 321 322int 323lws_fts_file_index(struct lws_fts *t, const char *filepath, int filepath_len, 324 int priority) 325{ 326 struct lws_fts_filepath *fp = t->filepath_list; 327#if 0 328 while (fp) { 329 if (fp->filepath_len == filepath_len && 330 !strcmp(fp->filepath, filepath)) 331 return fp->file_index; 332 333 fp = fp->next; 334 } 335#endif 336 fp = lwsac_use(&t->lwsac_head, sizeof(*fp), TRIE_LWSAC_BLOCK_SIZE); 337 if (!fp) 338 return -1; 339 340 fp->next = t->filepath_list; 341 t->filepath_list = fp; 342 strncpy(fp->filepath, filepath, sizeof(fp->filepath) - 1); 343 fp->filepath[sizeof(fp->filepath) - 1] = '\0'; 344 fp->filepath_len = filepath_len; 345 fp->file_index = t->next_file_index++; 346 fp->line_table_ofs = t->c; 347 fp->priority = priority; 348 fp->total_lines = 0; 349 t->fp = fp; 350 351 return fp->file_index; 352} 353 354static struct lws_fts_entry * 355lws_fts_entry_child_add(struct lws_fts *t, unsigned char c, 356 struct lws_fts_entry *parent) 357{ 358 struct lws_fts_entry *e, **pe; 359 360 e = lwsac_use(&t->lwsac_head, sizeof(*e), TRIE_LWSAC_BLOCK_SIZE); 361 if (!e) 362 return NULL; 363 364 memset(e, 0, sizeof(*e)); 365 366 e->c = c; 367 parent->child_count++; 368 e->parent = parent; 369 t->count_entries++; 370 371 /* keep the parent child list in ascending sort order for c */ 372 373 pe = &parent->child_list; 374 while (*pe) { 375 assert((*pe)->parent == parent); 376 if ((*pe)->c > c) { 377 /* add it before */ 378 e->sibling = *pe; 379 *pe = e; 380 break; 381 } 382 pe = &(*pe)->sibling; 383 } 384 385 if (!*pe) { 386 /* add it at the end */ 387 e->sibling = NULL; 388 *pe = e; 389 } 390 391 return e; 392} 393 394static int 395finalize_per_input(struct lws_fts *t) 396{ 397 struct lws_fts_instance_file *tif; 398 unsigned char buf[8192]; 399 uint64_t lwsac_input_size; 400 jg2_file_offset temp; 401 int bp = 0; 402 403 bp += g16(&buf[bp], 0); 404 bp += g16(&buf[bp], 0); 405 bp += g32(&buf[bp], 0); 406 if ((int)write(t->fd, buf, (size_t)bp) != bp) 407 return 1; 408 t->c += (unsigned int)bp; 409 bp = 0; 410 411 /* 412 * Write the generated file index + instances (if any) 413 * 414 * Notice the next same-parent file instance fileoffset list is 415 * backwards, so it does not require seeks to fill in. The first 416 * entry has 0 but the second entry points to the first entry (whose 417 * fileoffset is known). 418 * 419 * After all the file instance structs are finalized, 420 * .ofs_last_inst_file contains the fileoffset of that child's tif 421 * list head in the file. 422 * 423 * The file instances are written to disk in the order that the files 424 * were indexed, along with their prev pointers inline. 425 */ 426 427 tif = t->tif_list; 428 while (tif) { 429 struct lws_fts_lines *i; 430 431 spill((3 * MAX_VLI) + tif->count, 0); 432 433 temp = tif->owner->ofs_last_inst_file; 434 if (tif->total) 435 tif->owner->ofs_last_inst_file = t->c + (unsigned int)bp; 436 437 assert(!temp || (temp > TRIE_FILE_HDR_SIZE && temp < t->c)); 438 439 /* fileoffset of prev instance file for this entry, or 0 */ 440 bp += wq32(&buf[bp], temp); 441 bp += wq32(&buf[bp], tif->file_index); 442 bp += wq32(&buf[bp], tif->total); 443 444 /* remove any pointers into this disposable lac footprint */ 445 tif->owner->inst_file_list = NULL; 446 447 memcpy(&buf[bp], &tif->vli, (size_t)tif->count); 448 bp += tif->count; 449 450 i = tif->lines_list; 451 while (i) { 452 spill(i->count, 0); 453 memcpy(&buf[bp], &i->vli, (size_t)i->count); 454 bp += i->count; 455 456 i = i->lines_next; 457 } 458 459 tif = tif->inst_file_next; 460 } 461 462 spill(0, 1); 463 464 assert(lseek(t->fd, 0, SEEK_END) == (off_t)t->c); 465 466 if (t->lwsac_input_head) { 467 lwsac_input_size = lwsac_total_alloc(t->lwsac_input_head); 468 if (lwsac_input_size > t->worst_lwsac_input_size) 469 t->worst_lwsac_input_size = lwsac_input_size; 470 } 471 472 /* 473 * those per-file allocations are all on a separate lac so we can 474 * free it cleanly afterwards 475 */ 476 lwsac_free(&t->lwsac_input_head); 477 478 /* and lose the pointer into the deallocated lac */ 479 t->tif_list = NULL; 480 481 return 0; 482} 483 484/* 485 * 0 = punctuation, whitespace, brackets etc 486 * 1 = character inside symbol set 487 * 2 = upper-case character inside symbol set 488 */ 489 490static char classify[] = { 491 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 492 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 493 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 494 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 495 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 496 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 1, //1, 497 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 498 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 499 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 500 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 501 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 502 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 503 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 504 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 505 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 506 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 507}; 508 509#if 0 510static const char * 511name_entry(struct lws_fts_entry *e1, char *s, int len) 512{ 513 struct lws_fts_entry *e2; 514 int n = len; 515 516 s[--n] = '\0'; 517 518 e2 = e1; 519 while (e2) { 520 if (e2->suffix) { 521 if ((int)e2->suffix_len < n) { 522 n -= e2->suffix_len; 523 memcpy(&s[n], e2->suffix, e2->suffix_len); 524 } 525 } else { 526 n--; 527 s[n] = e2->c; 528 } 529 530 e2 = e2->parent; 531 } 532 533 return &s[n + 1]; 534} 535#endif 536 537/* 538 * as we parse the input, we create a line length table for the file index. 539 * Only the file header has been written before we start doing this. 540 */ 541 542int 543lws_fts_fill(struct lws_fts *t, uint32_t file_index, const char *buf, 544 size_t len) 545{ 546 unsigned long long tf = (unsigned long long)lws_now_usecs(); 547 unsigned char c, linetable[256], vlibuf[8]; 548 struct lws_fts_entry *e, *e1, *dcl; 549 struct lws_fts_instance_file *tif; 550 int bp = 0, sline, chars, m; 551 char *osuff, skipline = 0; 552 struct lws_fts_lines *tl; 553 unsigned int olen, n; 554 off_t lbh; 555 556 if ((int)file_index != t->last_file_index) { 557 if (t->last_file_index >= 0) 558 finalize_per_input(t); 559 t->last_file_index = (int)file_index; 560 t->line_number = 1; 561 t->chars_in_line = 0; 562 t->lines_in_unsealed_linetable = 0; 563 } 564 565 t->agg_raw_input += len; 566 567resume: 568 569 chars = 0; 570 lbh = (off_t)t->c; 571 sline = t->line_number; 572 bp += g16(&linetable[bp], 0); 573 bp += g16(&linetable[bp], 0); 574 bp += g32(&linetable[bp], 0); 575 576 while (len) { 577 char go_around = 0; 578 579 if (t->lines_in_unsealed_linetable >= LWS_FTS_LINES_PER_CHUNK) 580 break; 581 582 len--; 583 584 c = (unsigned char)*buf++; 585 t->chars_in_line++; 586 if (c == '\n') { 587 skipline = 0; 588 t->filepath_list->total_lines++; 589 t->lines_in_unsealed_linetable++; 590 t->line_number++; 591 592 bp += wq32(&linetable[bp], (uint32_t)t->chars_in_line); 593 if ((unsigned int)bp > sizeof(linetable) - 6) { 594 if ((int)write(t->fd, linetable, (unsigned int)bp) != bp) { 595 lwsl_err("%s: linetable write failed\n", 596 __func__); 597 return 1; 598 } 599 t->c += (unsigned int)bp; 600 bp = 0; 601 // assert(lseek(t->fd, 0, SEEK_END) == t->c); 602 } 603 604 chars += t->chars_in_line; 605 t->chars_in_line = 0; 606 607 /* 608 * Detect overlength lines and skip them (eg, BASE64 609 * in css etc) 610 */ 611 612 if (len > 200) { 613 n = 0; 614 m = 0; 615 while (n < 200 && m < 80 && buf[n] != '\n') { 616 if (buf[n] == ' ' || buf[n] == '\t') 617 m = 0; 618 n++; 619 m++; 620 } 621 622 /* 80 lines no whitespace, or >=200-char line */ 623 624 if (m == 80 || n == 200) 625 skipline = 1; 626 } 627 628 goto seal; 629 } 630 if (skipline) 631 continue; 632 633 m = classify[(int)c]; 634 if (!m) 635 goto seal; 636 if (m == 2) 637 c = (unsigned char)((char)c + 'a' - 'A'); 638 639 if (t->aggregate) { 640 641 /* 642 * We created a trie entry for an earlier char in this 643 * symbol already. So we know at the moment, any 644 * further chars in the symbol are the only children. 645 * 646 * Aggregate them and add them as a string suffix to 647 * the trie symbol at the end (when we know how much to 648 * allocate). 649 */ 650 651 if (t->agg_pos < sizeof(t->agg) - 1) 652 /* symbol is not too long to stash */ 653 t->agg[t->agg_pos++] = c; 654 655 continue; 656 } 657 658 if (t->str_match_pos) { 659 go_around = 1; 660 goto seal; 661 } 662 663 /* zeroth-iteration child matching */ 664 665 if (t->parser == t->root) { 666 e = t->root_lookup[(int)c]; 667 if (e) { 668 t->parser = e; 669 continue; 670 } 671 } else { 672 673 /* look for the char amongst the children */ 674 675 e = t->parser->child_list; 676 while (e) { 677 678 /* since they're alpha ordered... */ 679 if (e->c > c) { 680 e = NULL; 681 break; 682 } 683 if (e->c == c) { 684 t->parser = e; 685 686 if (e->suffix) 687 t->str_match_pos = 1; 688 689 break; 690 } 691 692 e = e->sibling; 693 } 694 695 if (e) 696 continue; 697 } 698 699 /* 700 * we are blazing a new trail, add a new child representing 701 * the whole suffix that couldn't be matched until now. 702 */ 703 704 e = lws_fts_entry_child_add(t, c, t->parser); 705 if (!e) { 706 lwsl_err("%s: lws_fts_entry_child_add failed\n", 707 __func__); 708 return 1; 709 } 710 711 /* if it's the root node, keep the root_lookup table in sync */ 712 713 if (t->parser == t->root) 714 t->root_lookup[(int)c] = e; 715 716 /* follow the new path */ 717 t->parser = e; 718 719 { 720 struct lws_fts_entry **pe = &e->child_list; 721 while (*pe) { 722 assert((*pe)->parent == e); 723 724 pe = &(*pe)->sibling; 725 } 726 } 727 728 /* 729 * If there are any more symbol characters coming, just 730 * create a suffix string on t->parser instead of what must 731 * currently be single-child nodes, since we just created e 732 * as a child with a single character due to no existing match 733 * on that single character... so if no match on 'h' with this 734 * guy's parent, we created e that matches on the single char 735 * 'h'. If the symbol continues ... 'a' 'p' 'p' 'y', then 736 * instead of creating singleton child nodes under e, 737 * modify e to match on the whole string suffix "happy". 738 * 739 * If later "hoppy" appears, we will remove the suffix on e, 740 * so it reverts to a char match for 'h', add singleton children 741 * for 'a' and 'o', and attach a "ppy" suffix child to each of 742 * those. 743 * 744 * We want to do this so we don't have to allocate trie entries 745 * for every char in the string to save memory and consequently 746 * time. 747 * 748 * Don't try this optimization if the parent is the root node... 749 * it's not compatible with it's root_lookup table and it's 750 * highly likely children off the root entry are going to have 751 * to be fragmented. 752 */ 753 754 if (e->parent != t->root) { 755 t->aggregate = 1; 756 t->agg_pos = 0; 757 } 758 759 continue; 760 761seal: 762 if (t->str_match_pos) { 763 764 /* 765 * We're partway through matching an elaborated string 766 * on a child, not just a character. String matches 767 * only exist when we met a child entry that only had 768 * one path until now... so we had an 'h', and the 769 * only child had a string "hello". 770 * 771 * We are following the right path and will not need 772 * to back up, but we may find as we go we have the 773 * first instance of a second child path, eg, "help". 774 * 775 * When we get to the 'p', we have to split what was 776 * the only string option "hello" into "hel" and then 777 * two child entries, for "lo" and 'p'. 778 */ 779 780 if (c == t->parser->suffix[t->str_match_pos++]) { 781 if (t->str_match_pos < t->parser->suffix_len) 782 continue; 783 784 /* 785 * We simply matched everything, continue 786 * parsing normally from this trie entry. 787 */ 788 789 t->str_match_pos = 0; 790 continue; 791 } 792 793 /* 794 * So... we hit a mismatch somewhere... it means we 795 * have to split this string entry. 796 * 797 * We know the first char actually matched in order to 798 * start down this road. So for the current trie entry, 799 * we need to truncate his suffix at the char before 800 * this mismatched one, where we diverged (if the 801 * second char, simply remove the suffix string from the 802 * current trie entry to turn it back to a 1-char match) 803 * 804 * The original entry, which becomes the lhs post-split, 805 * is t->parser. 806 */ 807 808 olen = t->parser->suffix_len; 809 osuff = t->parser->suffix; 810 811 if (t->str_match_pos == 2) 812 t->parser->suffix = NULL; 813 else 814 t->parser->suffix_len = t->str_match_pos - 1; 815 816 /* 817 * Then we need to create a new child trie entry that 818 * represents the remainder of the original string 819 * path that we didn't match. For the "hello" / 820 * "help" case, this guy will have "lo". 821 * 822 * Any instances or children (not siblings...) that were 823 * attached to the original trie entry must be detached 824 * first and then migrate to this new guy that completes 825 * the original string. 826 */ 827 828 dcl = t->parser->child_list; 829 m = (int)t->parser->child_count; 830 831 t->parser->child_list = NULL; 832 t->parser->child_count = 0; 833 834 e = lws_fts_entry_child_add(t, (unsigned char) 835 osuff[t->str_match_pos - 1], t->parser); 836 if (!e) { 837 lwsl_err("%s: lws_fts_entry_child_add fail1\n", 838 __func__); 839 return 1; 840 } 841 842 e->child_list = dcl; 843 e->child_count = (uint32_t)m; 844 /* 845 * any children we took over must point to us as the 846 * parent now they appear on our child list 847 */ 848 e1 = e->child_list; 849 while (e1) { 850 e1->parent = e; 851 e1 = e1->sibling; 852 } 853 854 /* 855 * We detached any children, gave them to the new guy 856 * and replaced them with just our new guy 857 */ 858 t->parser->child_count = 1; 859 t->parser->child_list = e; 860 861 /* 862 * any instances that belonged to the original entry we 863 * are splitting now must be reassigned to the end 864 * part 865 */ 866 867 e->inst_file_list = t->parser->inst_file_list; 868 if (e->inst_file_list) 869 e->inst_file_list->owner = e; 870 t->parser->inst_file_list = NULL; 871 e->instance_count = t->parser->instance_count; 872 t->parser->instance_count = 0; 873 874 e->ofs_last_inst_file = t->parser->ofs_last_inst_file; 875 t->parser->ofs_last_inst_file = 0; 876 877 if (t->str_match_pos != olen) { 878 /* we diverged partway */ 879 e->suffix = &osuff[t->str_match_pos - 1]; 880 e->suffix_len = olen - (t->str_match_pos - 1); 881 } 882 883 /* 884 * if the current char is a terminal, skip creating a 885 * new way forward. 886 */ 887 888 if (classify[(int)c]) { 889 890 /* 891 * Lastly we need to create a new child trie 892 * entry that represents the new way forward 893 * from the point that we diverged. For the 894 * "hello" / "help" case, this guy will start 895 * as a child of "hel" with the single 896 * character match 'p'. 897 * 898 * Since he becomes the current parser context, 899 * more symbol characters may be coming to make 900 * him into, eg, "helping", in which case he 901 * will acquire a suffix eventually of "ping" 902 * via the aggregation stuff 903 */ 904 905 e = lws_fts_entry_child_add(t, c, t->parser); 906 if (!e) { 907 lwsl_err("%s: child_add fail2\n", 908 __func__); 909 return 1; 910 } 911 } 912 913 /* go on following this path */ 914 t->parser = e; 915 916 t->aggregate = 1; 917 t->agg_pos = 0; 918 919 t->str_match_pos = 0; 920 921 if (go_around) 922 continue; 923 924 /* this is intended to be a seal */ 925 } 926 927 928 /* end of token */ 929 930 if (t->aggregate && t->agg_pos) { 931 932 /* if nothing in agg[]: leave as single char match */ 933 934 /* otherwise copy out the symbol aggregation */ 935 t->parser->suffix = lwsac_use(&t->lwsac_head, 936 t->agg_pos + 1, 937 TRIE_LWSAC_BLOCK_SIZE); 938 if (!t->parser->suffix) { 939 lwsl_err("%s: lac for suffix failed\n", 940 __func__); 941 return 1; 942 } 943 944 /* add the first char at the beginning */ 945 *t->parser->suffix = (char)t->parser->c; 946 /* and then add the agg buffer stuff */ 947 memcpy(t->parser->suffix + 1, t->agg, t->agg_pos); 948 t->parser->suffix_len = t->agg_pos + 1; 949 } 950 t->aggregate = 0; 951 952 if (t->parser == t->root) /* multiple terminal chars */ 953 continue; 954 955 if (!t->parser->inst_file_list || 956 t->parser->inst_file_list->file_index != file_index) { 957 tif = lwsac_use(&t->lwsac_input_head, sizeof(*tif), 958 TRIE_LWSAC_BLOCK_SIZE); 959 if (!tif) { 960 lwsl_err("%s: lac for tif failed\n", 961 __func__); 962 return 1; 963 } 964 965 tif->file_index = file_index; 966 tif->owner = t->parser; 967 tif->lines_list = NULL; 968 tif->lines_tail = NULL; 969 tif->total = 0; 970 tif->count = 0; 971 tif->inst_file_next = t->tif_list; 972 t->tif_list = tif; 973 974 t->parser->inst_file_list = tif; 975 } 976 977 /* 978 * A naive allocation strategy for this leads to 50% of the 979 * total inmem lac allocation being for line numbers... 980 * 981 * It's mainly solved by only holding the instance and line 982 * number tables for the duration of a file being input, as soon 983 * as one input file is finished it is written to disk. 984 * 985 * For the common case of 1 - ~3 matches the line number are 986 * stored in a small VLI array inside the filepath inst. If the 987 * next one won't fit, it allocates a line number struct with 988 * more vli space and continues chaining those if needed. 989 */ 990 991 n = (unsigned int)wq32(vlibuf, (uint32_t)t->line_number); 992 tif = t->parser->inst_file_list; 993 994 if (!tif->lines_list) { 995 /* we are still trying to use the file inst vli */ 996 if (LWS_ARRAY_SIZE(tif->vli) - (size_t)tif->count >= n) { 997 tif->count = (char)((char)tif->count + (char)wq32(tif->vli + tif->count, 998 (uint32_t)t->line_number)); 999 goto after; 1000 } 1001 /* we are going to have to allocate */ 1002 } 1003 1004 /* can we add to an existing line numbers struct? */ 1005 if (tif->lines_tail && 1006 LWS_ARRAY_SIZE(tif->lines_tail->vli) - 1007 (unsigned char)tif->lines_tail->count >= n) { 1008 tif->lines_tail->count = (char)((char)tif->lines_tail->count + (char)wq32(tif->lines_tail->vli + 1009 tif->lines_tail->count, 1010 (uint32_t)t->line_number)); 1011 goto after; 1012 } 1013 1014 /* either no existing line numbers struct at tail, or full */ 1015 1016 /* have to create a(nother) line numbers struct */ 1017 tl = lwsac_use(&t->lwsac_input_head, sizeof(*tl), 1018 TRIE_LWSAC_BLOCK_SIZE); 1019 if (!tl) { 1020 lwsl_err("%s: lac for tl failed\n", __func__); 1021 return 1; 1022 } 1023 tl->lines_next = NULL; 1024 if (tif->lines_tail) 1025 tif->lines_tail->lines_next = tl; 1026 1027 tif->lines_tail = tl; 1028 if (!tif->lines_list) 1029 tif->lines_list = tl; 1030 1031 tl->count = (char)wq32(tl->vli, (uint32_t)t->line_number); 1032after: 1033 tif->total++; 1034#if 0 1035 { 1036 char s[128]; 1037 const char *ne = name_entry(t->parser, s, sizeof(s)); 1038 1039 if (!strcmp(ne, "describ")) { 1040 lwsl_err(" %s %d\n", ne, t->str_match_pos); 1041 write(1, buf - 10, 20); 1042 } 1043 } 1044#endif 1045 t->parser->instance_count++; 1046 t->parser = t->root; 1047 t->str_match_pos = 0; 1048 } 1049 1050 /* seal off the line length table block */ 1051 1052 if (bp) { 1053 if ((int)write(t->fd, linetable, (size_t)bp) != bp) 1054 return 1; 1055 t->c += (unsigned int)bp; 1056 bp = 0; 1057 } 1058 1059 if (lseek(t->fd, lbh, SEEK_SET) < 0) { 1060 lwsl_err("%s: seek to 0x%llx failed\n", __func__, 1061 (unsigned long long)lbh); 1062 return 1; 1063 } 1064 1065 g16(linetable, (uint16_t)(t->c - (jg2_file_offset)lbh)); 1066 g16(linetable + 2, (uint16_t)(t->line_number - sline)); 1067 g32(linetable + 4, (uint32_t)chars); 1068 if ((int)write(t->fd, linetable, 8) != 8) { 1069 lwsl_err("%s: write linetable header failed\n", __func__); 1070 return 1; 1071 } 1072 1073 assert(lseek(t->fd, 0, SEEK_END) == (off_t)t->c); 1074 1075 if (lseek(t->fd, (off_t)t->c, SEEK_SET) < 0) { 1076 lwsl_err("%s: end seek failed\n", __func__); 1077 return 1; 1078 } 1079 1080 bp = 0; 1081 1082 if (len) { 1083 t->lines_in_unsealed_linetable = 0; 1084 goto resume; 1085 } 1086 1087 /* dump the collected per-input instance and line data, and free it */ 1088 1089 t->agg_trie_creation_us += (uint64_t)((uint64_t)lws_now_usecs() - tf); 1090 1091 return 0; 1092} 1093 1094/* refer to ./README.md */ 1095 1096int 1097lws_fts_serialize(struct lws_fts *t) 1098{ 1099 struct lws_fts_filepath *fp = t->filepath_list, *ofp; 1100 unsigned long long tf = (unsigned long long)lws_now_usecs(); 1101 struct lws_fts_entry *e, *e1, *s[256]; 1102 unsigned char buf[8192], stasis; 1103 int n, bp, sp = 0, do_parent; 1104 1105 (void)tf; 1106 finalize_per_input(t); 1107 1108 /* 1109 * Compute aggregated instance counts (parents should know the total 1110 * number of instances below each child path) 1111 * 1112 * 1113 * If we have 1114 * 1115 * (root) -> (c1) -> (c2) 1116 * -> (c3) 1117 * 1118 * we need to visit the nodes in the order 1119 * 1120 * c2, c1, c3, root 1121 */ 1122 1123 sp = 0; 1124 s[0] = t->root; 1125 do_parent = 0; 1126 while (sp >= 0) { 1127 int n; 1128 1129 /* aggregate in every antecedent */ 1130 1131 for (n = 0; n <= sp; n++) { 1132 s[n]->agg_inst_count += s[sp]->instance_count; 1133 s[n]->agg_child_count += s[sp]->child_count; 1134 } 1135 1136 /* handle any children before the parent */ 1137 1138 if (s[sp]->child_list) { 1139 if (sp + 1 == LWS_ARRAY_SIZE(s)) { 1140 lwsl_err("Stack too deep\n"); 1141 1142 goto bail; 1143 } 1144 1145 s[sp + 1] = s[sp]->child_list; 1146 sp++; 1147 continue; 1148 } 1149 1150 do { 1151 if (s[sp]->sibling) { 1152 s[sp] = s[sp]->sibling; 1153 break; 1154 } else 1155 sp--; 1156 } while (sp >= 0); 1157 } 1158 1159 /* dump the filepaths and set prev */ 1160 1161 fp = t->filepath_list; 1162 ofp = NULL; 1163 bp = 0; 1164 while (fp) { 1165 1166 fp->ofs = t->c + (unsigned int)bp; 1167 n = (int)strlen(fp->filepath); 1168 spill(15 + n, 0); 1169 1170 bp += wq32(&buf[bp], fp->line_table_ofs); 1171 bp += wq32(&buf[bp], (uint32_t)fp->total_lines); 1172 bp += wq32(&buf[bp], (uint32_t)n); 1173 memcpy(&buf[bp], fp->filepath, (unsigned int)n); 1174 bp += n; 1175 1176 fp->prev = ofp; 1177 ofp = fp; 1178 fp = fp->next; 1179 } 1180 1181 spill(0, 1); 1182 1183 /* record the fileoffset of the filepath map and filepath count */ 1184 1185 if (lseek(t->fd, 0xc, SEEK_SET) < 0) 1186 goto bail_seek; 1187 1188 g32(buf, t->c + (unsigned int)bp); 1189 g32(buf + 4, (uint32_t)t->next_file_index); 1190 if ((int)write(t->fd, buf, 8) != 8) 1191 goto bail; 1192 1193 if (lseek(t->fd, (off_t)(t->c + (unsigned int)bp), SEEK_SET) < 0) 1194 goto bail_seek; 1195 1196 /* dump the filepath map, starting from index 0, which is at the tail */ 1197 1198 fp = ofp; 1199 bp = 0; 1200 while (fp) { 1201 spill(5, 0); 1202 g32(buf + bp, fp->ofs); 1203 bp += 4; 1204 fp = fp->prev; 1205 } 1206 spill(0, 1); 1207 1208 /* 1209 * The trie entries in reverse order... because of the reversal, we have 1210 * always written children first, and marked them with their file offset 1211 * before we come to refer to them. 1212 */ 1213 1214 bp = 0; 1215 sp = 0; 1216 s[0] = t->root; 1217 do_parent = 0; 1218 while (s[sp]) { 1219 1220 /* handle any children before the parent */ 1221 1222 if (!do_parent && s[sp]->child_list) { 1223 1224 if (sp + 1 == LWS_ARRAY_SIZE(s)) { 1225 lwsl_err("Stack too deep\n"); 1226 1227 goto bail; 1228 } 1229 1230 s[sp + 1] = s[sp]->child_list; 1231 sp++; 1232 continue; 1233 } 1234 1235 /* leaf nodes with no children */ 1236 1237 e = s[sp]; 1238 e->ofs = t->c + (unsigned int)bp; 1239 1240 /* write the trie entry header */ 1241 1242 spill((3 * MAX_VLI), 0); 1243 1244 bp += wq32(&buf[bp], e->ofs_last_inst_file); 1245 bp += wq32(&buf[bp], e->child_count); 1246 bp += wq32(&buf[bp], e->instance_count); 1247 bp += wq32(&buf[bp], e->agg_inst_count); 1248 1249 /* sort the children in order of highest aggregate hits first */ 1250 1251 do { 1252 struct lws_fts_entry **pe, *te1, *te2; 1253 1254 stasis = 1; 1255 1256 /* bubble sort keeps going until nothing changed */ 1257 1258 pe = &e->child_list; 1259 while (*pe) { 1260 1261 te1 = *pe; 1262 te2 = te1->sibling; 1263 1264 if (te2 && te1->agg_inst_count < 1265 te2->agg_inst_count) { 1266 stasis = 0; 1267 1268 *pe = te2; 1269 te1->sibling = te2->sibling; 1270 te2->sibling = te1; 1271 } 1272 1273 pe = &(*pe)->sibling; 1274 } 1275 1276 } while (!stasis); 1277 1278 /* write the children */ 1279 1280 e1 = e->child_list; 1281 while (e1) { 1282 spill((5 * MAX_VLI) + e1->suffix_len + 1, 0); 1283 1284 bp += wq32(&buf[bp], e1->ofs); 1285 bp += wq32(&buf[bp], e1->instance_count); 1286 bp += wq32(&buf[bp], e1->agg_inst_count); 1287 bp += wq32(&buf[bp], e1->agg_child_count); 1288 1289 if (e1->suffix) { /* string */ 1290 bp += wq32(&buf[bp], e1->suffix_len); 1291 memmove(&buf[bp], e1->suffix, e1->suffix_len); 1292 bp += (int)e1->suffix_len; 1293 } else { /* char */ 1294 bp += wq32(&buf[bp], 1); 1295 buf[bp++] = e1->c; 1296 } 1297#if 0 1298 if (e1->suffix && e1->suffix_len == 3 && 1299 !memcmp(e1->suffix, "cri", 3)) { 1300 struct lws_fts_entry *e2; 1301 1302 e2 = e1; 1303 while (e2){ 1304 if (e2->suffix) 1305 lwsl_notice("%s\n", e2->suffix); 1306 else 1307 lwsl_notice("%c\n", e2->c); 1308 1309 e2 = e2->parent; 1310 } 1311 1312 lwsl_err("*** %c CRI inst %d ch %d\n", e1->parent->c, 1313 e1->instance_count, e1->child_count); 1314 } 1315#endif 1316 e1 = e1->sibling; 1317 } 1318 1319 /* if there are siblings, do those next */ 1320 1321 if (do_parent) { 1322 do_parent = 0; 1323 sp--; 1324 } 1325 1326 if (s[sp]->sibling) 1327 s[sp] = s[sp]->sibling; 1328 else { 1329 /* if there are no siblings, do the parent */ 1330 do_parent = 1; 1331 s[sp] = s[sp]->parent; 1332 } 1333 } 1334 1335 spill(0, 1); 1336 1337 assert(lseek(t->fd, 0, SEEK_END) == (off_t)t->c); 1338 1339 /* drop the correct root trie offset + file length into the header */ 1340 1341 if (lseek(t->fd, 4, SEEK_SET) < 0) { 1342 lwsl_err("%s: unable to seek\n", __func__); 1343 1344 goto bail; 1345 } 1346 1347 g32(buf, t->root->ofs); 1348 g32(buf + 4, t->c); 1349 if (write(t->fd, buf, 0x8) != 0x8) 1350 goto bail; 1351 1352 lwsl_notice("%s: index %d files (%uMiB) cpu time %dms, " 1353 "alloc: %dKiB + %dKiB, " 1354 "serialize: %dms, file: %dKiB\n", __func__, 1355 t->next_file_index, 1356 (int)(t->agg_raw_input / (1024 * 1024)), 1357 (int)(t->agg_trie_creation_us / 1000), 1358 (int)(lwsac_total_alloc(t->lwsac_head) / 1024), 1359 (int)(t->worst_lwsac_input_size / 1024), 1360 (int)(((uint64_t)lws_now_usecs() - tf) / 1000), 1361 (int)(t->c / 1024)); 1362 1363 return 0; 1364 1365bail_seek: 1366 lwsl_err("%s: problem seekings\n", __func__); 1367 1368bail: 1369 return 1; 1370} 1371 1372 1373