1/* 2 * minilex.c 3 * 4 * High efficiency lexical state parser 5 * 6 * Copyright (C)2011-2014 Andy Green <andy@warmcat.com> 7 * 8 * Licensed under MIT 9 * 10 * Usage: gcc minihuf.c -o minihuf && ./minihuf > huftable.h 11 * 12 * Run it twice to test parsing on the generated table on stderr 13 */ 14 15#include <stdio.h> 16#include <stdlib.h> 17#include <string.h> 18 19#define LWS_ARRAY_SIZE(n) (sizeof(n) / sizeof(n[0])) 20 21struct huf { 22 unsigned int code; 23 unsigned char len; 24}; 25 26static struct huf huf_literal[] = { 27 /* 0x00 */ { 0x1ff8, 13 }, 28 /* 0x01 */ { 0x7fffd8, 23 }, 29 /* 0x02 */ { 0xfffffe2, 28 }, 30 /* 0x03 */ { 0xfffffe3, 28 }, 31 /* 0x04 */ { 0xfffffe4, 28 }, 32 /* 0x05 */ { 0xfffffe5, 28 }, 33 /* 0x06 */ { 0xfffffe6, 28 }, 34 /* 0x07 */ { 0xfffffe7, 28 }, 35 /* 0x08 */ { 0xfffffe8, 28 }, 36 /* 0x09 */ { 0xffffea, 24 }, 37 /* 0x0a */ { 0x3ffffffc, 30 }, 38 /* 0x0b */ { 0xfffffe9, 28 }, 39 40 /* 0x0c */ { 0xfffffea, 28 }, 41 /* 0x0d */ { 0x3ffffffd, 30 }, 42 /* 0x0e */ { 0xfffffeb, 28 }, 43 /* 0x0f */ { 0xfffffec, 28 }, 44 /* 0x10 */ { 0xfffffed, 28 }, 45 /* 0x11 */ { 0xfffffee, 28 }, 46 /* 0x12 */ { 0xfffffef, 28 }, 47 /* 0x13 */ { 0xffffff0, 28 }, 48 /* 0x14 */ { 0xffffff1, 28 }, 49 /* 0x15 */ { 0xffffff2, 28 }, 50 /* 0x16 */ { 0x3ffffffe, 30 }, 51 /* 0x17 */ { 0xffffff3, 28 }, 52 /* 0x18 */ { 0xffffff4, 28 }, 53 /* 0x19 */ { 0xffffff5, 28 }, 54 /* 0x1a */ { 0xffffff6, 28 }, 55 /* 0x1b */ { 0xffffff7, 28 }, 56 /* 0x1c */ { 0xffffff8, 28 }, 57 /* 0x1d */ { 0xffffff9, 28 }, 58 /* 0x1e */ { 0xffffffa, 28 }, 59 /* 0x1f */ { 0xffffffb, 28 }, 60 /* 0x20 */ { 0x14, 6 }, 61 /* 0x21 */ { 0x3f8, 10 }, 62 /* 0x22 */ { 0x3f9, 10 }, 63 /* 0x23 */ { 0xffa, 12 }, 64 /* 0x24 */ { 0x1ff9, 13 }, 65 /* 0x25 */ { 0x15, 6 }, 66 /* 0x26 */ { 0xf8, 8 }, 67 /* 0x27 */ { 0x7fa, 11 }, 68 /* 0x28 */ { 0x3fa, 10 }, 69 /* 0x29 */ { 0x3fb, 10 }, 70 /* 0x2a */ { 0xf9, 8 }, 71 /* 0x2b */ { 0x7fb, 11 }, 72 /* 0x2c */ { 0xfa, 8 }, 73 /* 0x2d */ { 0x16, 6 }, 74 /* 0x2e */ { 0x17, 6 }, 75 /* 0x2f */ { 0x18, 6 }, 76 /* 0x30 */ { 0x0, 5 }, 77 /* 0x31 */ { 0x1, 5 }, 78 /* 0x32 */ { 0x2, 5 }, 79 /* 0x33 */ { 0x19, 6 }, 80 /* 0x34 */ { 0x1a, 6 }, 81 /* 0x35 */ { 0x1b, 6 }, 82 /* 0x36 */ { 0x1c, 6 }, 83 /* 0x37 */ { 0x1d, 6 }, 84 /* 0x38 */ { 0x1e, 6 }, 85 /* 0x39 */ { 0x1f, 6 }, 86 /* 0x3a */ { 0x5c, 7 }, 87 /* 0x3b */ { 0xfb, 8 }, 88 89 /* 0x3c */ { 0x7ffc, 15 }, 90 /* 0x3d */ { 0x20, 6 }, 91 /* 0x3e */ { 0xffb, 12 }, 92 /* 0x3f */ { 0x3fc, 10 }, 93 /* 0x40 */ { 0x1ffa, 13 }, 94 /* 0x41 */ { 0x21, 6 }, 95 /* 0x42 */ { 0x5d, 7 }, 96 /* 0x43 */ { 0x5e, 7 }, 97 /* 0x44 */ { 0x5f, 7 }, 98 /* 0x45 */ { 0x60, 7 }, 99 /* 0x46 */ { 0x61, 7 }, 100 /* 0x47 */ { 0x62, 7 }, 101 /* 0x48 */ { 0x63, 7 }, 102 /* 0x49 */ { 0x64, 7 }, 103 /* 0x4a */ { 0x65, 7 }, 104 /* 0x4b */ { 0x66, 7 }, 105 /* 0x4c */ { 0x67, 7 }, 106 /* 0x4d */ { 0x68, 7 }, 107 /* 0x4e */ { 0x69, 7 }, 108 /* 0x4f */ { 0x6a, 7 }, 109 /* 0x50 */ { 0x6b, 7 }, 110 /* 0x51 */ { 0x6c, 7 }, 111 /* 0x52 */ { 0x6d, 7 }, 112 /* 0x53 */ { 0x6e, 7 }, 113 /* 0x54 */ { 0x6f, 7 }, 114 /* 0x55 */ { 0x70, 7 }, 115 /* 0x56 */ { 0x71, 7 }, 116 /* 0x57 */ { 0x72, 7 }, 117 /* 0x58 */ { 0xfc, 8 }, 118 /* 0x59 */ { 0x73, 7 }, 119 /* 0x5a */ { 0xfd, 8 }, 120 /* 0x5b */ { 0x1ffb, 13 }, 121 /* 0x5c */ { 0x7fff0, 19 }, 122 /* 0x5d */ { 0x1ffc, 13 }, 123 /* 0x5e */ { 0x3ffc, 14 }, 124 /* 0x5f */ { 0x22, 6 }, 125 /* 0x60 */ { 0x7ffd, 15 }, 126 /* 0x61 */ { 0x3, 5 }, 127 /* 0x62 */ { 0x23, 6 }, 128 /* 0x63 */ { 0x4, 5 }, 129 /* 0x64 */ { 0x24, 6 }, 130 /* 0x65 */ { 0x5, 5 }, 131 /* 0x66 */ { 0x25, 6 }, 132 /* 0x67 */ { 0x26, 6 }, 133 /* 0x68 */ { 0x27, 6 }, 134 /* 0x69 */ { 0x6, 5 }, 135 /* 0x6a */ { 0x74, 7 }, 136 /* 0x6b */ { 0x75, 7 }, 137 138 139 /* 0x6c */ { 0x28, 6 }, 140 /* 0x6d */ { 0x29, 6 }, 141 /* 0x6e */ { 0x2a, 6 }, 142 /* 0x6f */ { 0x7, 5 }, 143 /* 0x70 */ { 0x2b, 6 }, 144 /* 0x71 */ { 0x76, 7 }, 145 /* 0x72 */ { 0x2c, 6 }, 146 /* 0x73 */ { 0x8, 5 }, 147 /* 0x74 */ { 0x9, 5 }, 148 /* 0x75 */ { 0x2d, 6 }, 149 /* 0x76 */ { 0x77, 7 }, 150 /* 0x77 */ { 0x78, 7 }, 151 /* 0x78 */ { 0x79, 7 }, 152 /* 0x79 */ { 0x7a, 7 }, 153 /* 0x7a */ { 0x7b, 7 }, 154 /* 0x7b */ { 0x7ffe, 15 }, 155 /* 0x7c */ { 0x7fc, 11 }, 156 /* 0x7d */ { 0x3ffd, 14 }, 157 /* 0x7e */ { 0x1ffd, 13 }, 158 /* 0x7f */ { 0xffffffc, 28 }, 159 /* 0x80 */ { 0xfffe6, 20 }, 160 /* 0x81 */ { 0x3fffd2, 22 }, 161 /* 0x82 */ { 0xfffe7, 20 }, 162 /* 0x83 */ { 0xfffe8, 20 }, 163 /* 0x84 */ { 0x3fffd3, 22 }, 164 /* 0x85 */ { 0x3fffd4, 22 }, 165 /* 0x86 */ { 0x3fffd5, 22 }, 166 /* 0x87 */ { 0x7fffd9, 23 }, 167 /* 0x88 */ { 0x3fffd6, 22 }, 168 /* 0x89 */ { 0x7fffda, 23 }, 169 /* 0x8a */ { 0x7fffdb, 23 }, 170 /* 0x8b */ { 0x7fffdc, 23 }, 171 /* 0x8c */ { 0x7fffdd, 23 }, 172 /* 0x8d */ { 0x7fffde, 23 }, 173 /* 0x8e */ { 0xffffeb, 24 }, 174 /* 0x8f */ { 0x7fffdf, 23 }, 175 /* 0x90 */ { 0xffffec, 24 }, 176 /* 0x91 */ { 0xffffed, 24 }, 177 /* 0x92 */ { 0x3fffd7, 22 }, 178 /* 0x93 */ { 0x7fffe0, 23 }, 179 /* 0x94 */ { 0xffffee, 24 }, 180 /* 0x95 */ { 0x7fffe1, 23 }, 181 /* 0x96 */ { 0x7fffe2, 23 }, 182 /* 0x97 */ { 0x7fffe3, 23 }, 183 /* 0x98 */ { 0x7fffe4, 23 }, 184 /* 0x99 */ { 0x1fffdc, 21 }, 185 /* 0x9a */ { 0x3fffd8, 22 }, 186 /* 0x9b */ { 0x7fffe5, 23 }, 187 188 /* 0x9c */ { 0x3fffd9, 22 }, 189 /* 0x9d */ { 0x7fffe6, 23 }, 190 /* 0x9e */ { 0x7fffe7, 23 }, 191 /* 0x9f */ { 0xffffef, 24 }, 192 /* 0xa0 */ { 0x3fffda, 22 }, 193 /* 0xa1 */ { 0x1fffdd, 21 }, 194 /* 0xa2 */ { 0xfffe9, 20 }, 195 /* 0xa3 */ { 0x3fffdb, 22 }, 196 /* 0xa4 */ { 0x3fffdc, 22 }, 197 /* 0xa5 */ { 0x7fffe8, 23 }, 198 /* 0xa6 */ { 0x7fffe9, 23 }, 199 /* 0xa7 */ { 0x1fffde, 21 }, 200 /* 0xa8 */ { 0x7fffea, 23 }, 201 /* 0xa9 */ { 0x3fffdd, 22 }, 202 /* 0xaa */ { 0x3fffde, 22 }, 203 /* 0xab */ { 0xfffff0, 24 }, 204 /* 0xac */ { 0x1fffdf, 21 }, 205 /* 0xad */ { 0x3fffdf, 22 }, 206 /* 0xae */ { 0x7fffeb, 23 }, 207 /* 0xaf */ { 0x7fffec, 23 }, 208 /* 0xb0 */ { 0x1fffe0, 21 }, 209 /* 0xb1 */ { 0x1fffe1, 21 }, 210 /* 0xb2 */ { 0x3fffe0, 22 }, 211 /* 0xb3 */ { 0x1fffe2, 21 }, 212 /* 0xb4 */ { 0x7fffed, 23 }, 213 /* 0xb5 */ { 0x3fffe1, 22 }, 214 /* 0xb6 */ { 0x7fffee, 23 }, 215 /* 0xb7 */ { 0x7fffef, 23 }, 216 /* 0xb8 */ { 0xfffea, 20 }, 217 /* 0xb9 */ { 0x3fffe2, 22 }, 218 /* 0xba */ { 0x3fffe3, 22 }, 219 /* 0xbb */ { 0x3fffe4, 22 }, 220 /* 0xbc */ { 0x7ffff0, 23 }, 221 /* 0xbd */ { 0x3fffe5, 22 }, 222 /* 0xbe */ { 0x3fffe6, 22 }, 223 /* 0xbf */ { 0x7ffff1, 23 }, 224 /* 0xc0 */ { 0x3ffffe0, 26 }, 225 /* 0xc1 */ { 0x3ffffe1, 26 }, 226 /* 0xc2 */ { 0xfffeb, 20 }, 227 /* 0xc3 */ { 0x7fff1, 19 }, 228 /* 0xc4 */ { 0x3fffe7, 22 }, 229 /* 0xc5 */ { 0x7ffff2, 23 }, 230 /* 0xc6 */ { 0x3fffe8, 22 }, 231 /* 0xc7 */ { 0x1ffffec, 25 }, 232 /* 0xc8 */ { 0x3ffffe2, 26 }, 233 /* 0xc9 */ { 0x3ffffe3, 26 }, 234 /* 0xca */ { 0x3ffffe4, 26 }, 235 /* 0xcb */ { 0x7ffffde, 27 }, 236 237 /* 0xcc */ { 0x7ffffdf, 27 }, 238 /* 0xcd */ { 0x3ffffe5, 26 }, 239 /* 0xce */ { 0xfffff1, 24 }, 240 /* 0xcf */ { 0x1ffffed, 25 }, 241 /* 0xd0 */ { 0x7fff2, 19 }, 242 /* 0xd1 */ { 0x1fffe3, 21 }, 243 /* 0xd2 */ { 0x3ffffe6, 26 }, 244 /* 0xd3 */ { 0x7ffffe0, 27 }, 245 /* 0xd4 */ { 0x7ffffe1, 27 }, 246 /* 0xd5 */ { 0x3ffffe7, 26 }, 247 /* 0xd6 */ { 0x7ffffe2, 27 }, 248 /* 0xd7 */ { 0xfffff2, 24 }, 249 /* 0xd8 */ { 0x1fffe4, 21 }, 250 /* 0xd9 */ { 0x1fffe5, 21 }, 251 /* 0xda */ { 0x3ffffe8, 26 }, 252 /* 0xdb */ { 0x3ffffe9, 26 }, 253 /* 0xdc */ { 0xffffffd, 28 }, 254 /* 0xdd */ { 0x7ffffe3, 27 }, 255 /* 0xde */ { 0x7ffffe4, 27 }, 256 /* 0xdf */ { 0x7ffffe5, 27 }, 257 /* 0xe0 */ { 0xfffec, 20 }, 258 /* 0xe1 */ { 0xfffff3, 24 }, 259 /* 0xe2 */ { 0xfffed, 20 }, 260 /* 0xe3 */ { 0x1fffe6, 21 }, 261 /* 0xe4 */ { 0x3fffe9, 22 }, 262 /* 0xe5 */ { 0x1fffe7, 21 }, 263 /* 0xe6 */ { 0x1fffe8, 21 }, 264 /* 0xe7 */ { 0x7ffff3, 23 }, 265 /* 0xe8 */ { 0x3fffea, 22 }, 266 /* 0xe9 */ { 0x3fffeb, 22 }, 267 /* 0xea */ { 0x1ffffee, 25 }, 268 /* 0xeb */ { 0x1ffffef, 25 }, 269 /* 0xec */ { 0xfffff4, 24 }, 270 /* 0xed */ { 0xfffff5, 24 }, 271 /* 0xee */ { 0x3ffffea, 26 }, 272 /* 0xef */ { 0x7ffff4, 23 }, 273 /* 0xf0 */ { 0x3ffffeb, 26 }, 274 /* 0xf1 */ { 0x7ffffe6, 27 }, 275 /* 0xf2 */ { 0x3ffffec, 26 }, 276 /* 0xf3 */ { 0x3ffffed, 26 }, 277 /* 0xf4 */ { 0x7ffffe7, 27 }, 278 /* 0xf5 */ { 0x7ffffe8, 27 }, 279 /* 0xf6 */ { 0x7ffffe9, 27 }, 280 /* 0xf7 */ { 0x7ffffea, 27 }, 281 /* 0xf8 */ { 0x7ffffeb, 27 }, 282 /* 0xf9 */ { 0xffffffe, 28 }, 283 /* 0xfa */ { 0x7ffffec, 27 }, 284 /* 0xfb */ { 0x7ffffed, 27 }, 285 286 /* 0xfc */ { 0x7ffffee, 27 }, 287 /* 0xfd */ { 0x7ffffef, 27 }, 288 /* 0xfe */ { 0x7fffff0, 27 }, 289 /* 0xff */ { 0x3ffffee, 26 }, 290 /* 0x100 */ { 0x3fffffff, 30 }, 291}; 292 293int code_bit(int idx, int bit) 294{ 295 if (bit < huf_literal[idx].len) 296 return !!(huf_literal[idx].code & (1 << (huf_literal[idx].len - 1 - bit))); 297 298 return -1; 299} 300 301#include "huftable.h" 302 303#define PARALLEL 2 304 305struct state { 306 int terminal; 307 int state[PARALLEL]; 308 int bytepos; 309 310 int real_pos; 311}; 312 313struct state state[2000]; 314unsigned char terms[2000]; 315int next = 1; 316 317int lextable_decode(int pos, char c) 318{ 319 int q = pos + !!c; 320 321 if (lextable_terms[q >> 3] & (1 << (q & 7))) /* terminal */ 322 return lextable[q] | 0x8000; 323 324 return pos + (lextable[q] << 1); 325} 326 327int main(void) 328{ 329 int n = 0; 330 int m = 0; 331 int prev; 332 char c; 333 int walk; 334 int saw; 335 int y; 336 int j; 337 int pos = 0; 338 int biggest = 0; 339 int fails = 0; 340 341 m = 0; 342 while (m < LWS_ARRAY_SIZE(state)) { 343 for (j = 0; j < PARALLEL; j++) { 344 state[m].state[j] = 0xffff; 345 state[m].terminal = 0; 346 } 347 m++; 348 } 349 350 while (n < LWS_ARRAY_SIZE(huf_literal)) { 351 352 m = 0; 353 walk = 0; 354 prev = 0; 355 356 while (m < huf_literal[n].len) { 357 358 saw = 0; 359 if (state[walk].state[code_bit(n, m)] != 0xffff) { 360 /* exists -- go forward */ 361 walk = state[walk].state[code_bit(n, m)]; 362 goto again; 363 } 364 365 /* something we didn't see before */ 366 367 state[walk].state[code_bit(n, m)] = next; 368 walk = next++; 369again: 370 m++; 371 } 372 373 state[walk].terminal = n++; 374 state[walk].state[0] = 0; /* terminal marker */ 375 } 376 377 walk = 0; 378 for (n = 0; n < next; n++) { 379 state[n].bytepos = walk; 380 walk += (2 * 2); 381 } 382 383 /* compute everyone's position first */ 384 385 pos = 0; 386 walk = 0; 387 for (n = 0; n < next; n++) { 388 389 state[n].real_pos = pos; 390 391 if (state[n].state[0]) /* nonterminal */ 392 pos += 2; 393 394 walk ++; 395 } 396 397 fprintf(stdout, "static unsigned char lextable[] = {\n"); 398 399#define TERMINAL_MASK 0x8000 400 401 walk = 0; 402 pos = 0; 403 404 for (n = 0; n < next; n++) { 405 for (m = 0; m < 2; m++) { 406 saw = state[n].state[m]; 407 408 if (saw == 0) { // c is a terminal then 409 m = 2; 410 continue; 411 } 412 if (!m) 413 fprintf(stdout, "/* pos %04x: %3d */ ", 414 state[n].real_pos, n); 415 else 416 fprintf(stdout, " "); 417 418 if (saw == 0xffff) { 419 fprintf(stdout, 420 " 0xff, 0xff, /* 0 = fail */\n "); 421 pos ++; /* fail */ 422 fails++; 423 continue; 424 } 425 426 if (state[saw].state[0] == 0) { /* points to terminal */ 427 fprintf(stdout, " /* terminal %d */ 0x%02X,\n", 428 state[saw].terminal, 429 state[saw].terminal & 0xff); 430 terms[(state[n].real_pos + m) >> 3] |= 431 1 << ((state[n].real_pos + m) & 7); 432 pos++; 433 walk++; 434 continue; 435 } 436 437 j = (state[saw].real_pos - q) >> 1; 438 439 if (j > biggest) 440 biggest = j; 441 442 if (j > 0xffff) { 443 fprintf(stderr, 444 "Jump > 64K bytes ahead (%d to %d)\n", 445 state[n].real_pos, state[saw].real_pos); 446 return 1; 447 } 448 449 fprintf(stdout, " /* %d */ 0x%02X " 450 "/* (to 0x%04X state %3d) */,\n", 451 m, 452 j & 0xff, 453 state[saw].real_pos, saw); 454 pos++; 455 456 walk++; 457 } 458 } 459 460 fprintf(stdout, "/* total size %d bytes, biggest jump %d/256, fails=%d */\n};\n" 461 "\n static unsigned char lextable_terms[] = {\n", 462 pos, biggest, fails); 463 464 for (n = 0; n < (walk + 7) / 8; n++) { 465 if (!(n & 7)) 466 fprintf(stdout, "\n\t"); 467 fprintf(stdout, "0x%02x, ", terms[n]); 468 } 469 fprintf(stdout, "\n};\n"); 470 471 /* 472 * Try to parse every legal input string 473 */ 474 475 for (n = 0; n < LWS_ARRAY_SIZE(huf_literal); n++) { 476 walk = 0; 477 m = 0; 478 y = -1; 479 480 fprintf(stderr, " trying %d\n", n); 481 482 while (m < huf_literal[n].len) { 483 prev = walk; 484 walk = lextable_decode(walk, code_bit(n, m)); 485 486 if (walk == 0xffff) { 487 fprintf(stderr, "failed\n"); 488 return 3; 489 } 490 491 if (walk & 0x8000) { 492 y = walk & 0x7fff; 493 if (y == 0 && m == 29) { 494 y |= 0x100; 495 fprintf(stdout, 496 "\n/* state that points to " 497 "0x100 for disambiguation with " 498 "0x0 */\n" 499 "#define HUFTABLE_0x100_PREV " 500 "%d\n", prev); 501 } 502 break; 503 } 504 m++; 505 } 506 507 if (y != n) { 508 fprintf(stderr, "decode failed %d got %d (0x%x)\n", n, y, y); 509 return 4; 510 } 511 } 512 513 fprintf(stderr, "All decode OK\n"); 514 515 return 0; 516} 517