1/* Copyright Joyent, Inc. and other Node contributors. 2 * 3 * Permission is hereby granted, free of charge, to any person obtaining a copy 4 * of this software and associated documentation files (the "Software"), to 5 * deal in the Software without restriction, including without limitation the 6 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 7 * sell copies of the Software, and to permit persons to whom the Software is 8 * furnished to do so, subject to the following conditions: 9 * 10 * The above copyright notice and this permission notice shall be included in 11 * all copies or substantial portions of the Software. 12 * 13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 18 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 19 * IN THE SOFTWARE. 20 */ 21#include "url_parser.h" 22#include <assert.h> 23#include <stddef.h> 24#include <ctype.h> 25#include <string.h> 26#include <limits.h> 27 28#ifndef BIT_AT 29# define BIT_AT(a, i) \ 30 (!!((unsigned int) (a)[(unsigned int) (i) >> 3] & \ 31 (1 << ((unsigned int) (i) & 7)))) 32#endif 33 34#if HTTP_PARSER_STRICT 35# define T(v) 0 36#else 37# define T(v) v 38#endif 39 40static const uint8_t normal_url_char[32] = { 41/* 0 nul 1 soh 2 stx 3 etx 4 eot 5 enq 6 ack 7 bel */ 42 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0, 43/* 8 bs 9 ht 10 nl 11 vt 12 np 13 cr 14 so 15 si */ 44 0 | T(2) | 0 | 0 | T(16) | 0 | 0 | 0, 45/* 16 dle 17 dc1 18 dc2 19 dc3 20 dc4 21 nak 22 syn 23 etb */ 46 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0, 47/* 24 can 25 em 26 sub 27 esc 28 fs 29 gs 30 rs 31 us */ 48 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0, 49/* 32 sp 33 ! 34 " 35 # 36 $ 37 % 38 & 39 ' */ 50 0 | 2 | 4 | 0 | 16 | 32 | 64 | 128, 51/* 40 ( 41 ) 42 * 43 + 44 , 45 - 46 . 47 / */ 52 1 | 2 | 4 | 8 | 16 | 32 | 64 | 128, 53/* 48 0 49 1 50 2 51 3 52 4 53 5 54 6 55 7 */ 54 1 | 2 | 4 | 8 | 16 | 32 | 64 | 128, 55/* 56 8 57 9 58 : 59 ; 60 < 61 = 62 > 63 ? */ 56 1 | 2 | 4 | 8 | 16 | 32 | 64 | 0, 57/* 64 @ 65 A 66 B 67 C 68 D 69 E 70 F 71 G */ 58 1 | 2 | 4 | 8 | 16 | 32 | 64 | 128, 59/* 72 H 73 I 74 J 75 K 76 L 77 M 78 N 79 O */ 60 1 | 2 | 4 | 8 | 16 | 32 | 64 | 128, 61/* 80 P 81 Q 82 R 83 S 84 T 85 U 86 V 87 W */ 62 1 | 2 | 4 | 8 | 16 | 32 | 64 | 128, 63/* 88 X 89 Y 90 Z 91 [ 92 \ 93 ] 94 ^ 95 _ */ 64 1 | 2 | 4 | 8 | 16 | 32 | 64 | 128, 65/* 96 ` 97 a 98 b 99 c 100 d 101 e 102 f 103 g */ 66 1 | 2 | 4 | 8 | 16 | 32 | 64 | 128, 67/* 104 h 105 i 106 j 107 k 108 l 109 m 110 n 111 o */ 68 1 | 2 | 4 | 8 | 16 | 32 | 64 | 128, 69/* 112 p 113 q 114 r 115 s 116 t 117 u 118 v 119 w */ 70 1 | 2 | 4 | 8 | 16 | 32 | 64 | 128, 71/* 120 x 121 y 122 z 123 { 124 | 125 } 126 ~ 127 del */ 72 1 | 2 | 4 | 8 | 16 | 32 | 64 | 0, }; 73 74#undef T 75 76enum state 77 { s_dead = 1 /* important that this is > 0 */ 78 79 , s_start_req_or_res 80 , s_res_or_resp_H 81 , s_start_res 82 , s_res_H 83 , s_res_HT 84 , s_res_HTT 85 , s_res_HTTP 86 , s_res_http_major 87 , s_res_http_dot 88 , s_res_http_minor 89 , s_res_http_end 90 , s_res_first_status_code 91 , s_res_status_code 92 , s_res_status_start 93 , s_res_status 94 , s_res_line_almost_done 95 96 , s_start_req 97 98 , s_req_method 99 , s_req_spaces_before_url 100 , s_req_schema 101 , s_req_schema_slash 102 , s_req_schema_slash_slash 103 , s_req_server_start 104 , s_req_server 105 , s_req_server_with_at 106 , s_req_path 107 , s_req_query_string_start 108 , s_req_query_string 109 , s_req_fragment_start 110 , s_req_fragment 111 , s_req_http_start 112 , s_req_http_H 113 , s_req_http_HT 114 , s_req_http_HTT 115 , s_req_http_HTTP 116 , s_req_http_I 117 , s_req_http_IC 118 , s_req_http_major 119 , s_req_http_dot 120 , s_req_http_minor 121 , s_req_http_end 122 , s_req_line_almost_done 123 124 , s_header_field_start 125 , s_header_field 126 , s_header_value_discard_ws 127 , s_header_value_discard_ws_almost_done 128 , s_header_value_discard_lws 129 , s_header_value_start 130 , s_header_value 131 , s_header_value_lws 132 133 , s_header_almost_done 134 135 , s_chunk_size_start 136 , s_chunk_size 137 , s_chunk_parameters 138 , s_chunk_size_almost_done 139 140 , s_headers_almost_done 141 , s_headers_done 142 143 /* Important: 's_headers_done' must be the last 'header' state. All 144 * states beyond this must be 'body' states. It is used for overflow 145 * checking. See the PARSING_HEADER() macro. 146 */ 147 148 , s_chunk_data 149 , s_chunk_data_almost_done 150 , s_chunk_data_done 151 152 , s_body_identity 153 , s_body_identity_eof 154 155 , s_message_done 156 }; 157 158enum http_host_state 159 { 160 s_http_host_dead = 1 161 , s_http_userinfo_start 162 , s_http_userinfo 163 , s_http_host_start 164 , s_http_host_v6_start 165 , s_http_host 166 , s_http_host_v6 167 , s_http_host_v6_end 168 , s_http_host_v6_zone_start 169 , s_http_host_v6_zone 170 , s_http_host_port_start 171 , s_http_host_port 172}; 173 174/* Macros for character classes; depends on strict-mode */ 175#define CR '\r' 176#define LF '\n' 177#define LOWER(c) (unsigned char)(c | 0x20) 178#define IS_ALPHA(c) (LOWER(c) >= 'a' && LOWER(c) <= 'z') 179#define IS_NUM(c) ((c) >= '0' && (c) <= '9') 180#define IS_ALPHANUM(c) (IS_ALPHA(c) || IS_NUM(c)) 181#define IS_HEX(c) (IS_NUM(c) || (LOWER(c) >= 'a' && LOWER(c) <= 'f')) 182#define IS_MARK(c) ((c) == '-' || (c) == '_' || (c) == '.' || \ 183 (c) == '!' || (c) == '~' || (c) == '*' || (c) == '\'' || (c) == '(' || \ 184 (c) == ')') 185#define IS_USERINFO_CHAR(c) (IS_ALPHANUM(c) || IS_MARK(c) || (c) == '%' || \ 186 (c) == ';' || (c) == ':' || (c) == '&' || (c) == '=' || (c) == '+' || \ 187 (c) == '$' || (c) == ',') 188 189#define STRICT_TOKEN(c) ((c == ' ') ? 0 : tokens[(unsigned char)c]) 190 191#if HTTP_PARSER_STRICT 192#define TOKEN(c) STRICT_TOKEN(c) 193#define IS_URL_CHAR(c) (BIT_AT(normal_url_char, (unsigned char)c)) 194#define IS_HOST_CHAR(c) (IS_ALPHANUM(c) || (c) == '.' || (c) == '-') 195#else 196#define TOKEN(c) tokens[(unsigned char)c] 197#define IS_URL_CHAR(c) \ 198 (BIT_AT(normal_url_char, (unsigned char)c) || ((c) & 0x80)) 199#define IS_HOST_CHAR(c) \ 200 (IS_ALPHANUM(c) || (c) == '.' || (c) == '-' || (c) == '_') 201#endif 202 203/* Our URL parser. 204 * 205 * This is designed to be shared by http_parser_execute() for URL validation, 206 * hence it has a state transition + byte-for-byte interface. In addition, it 207 * is meant to be embedded in http_parser_parse_url(), which does the dirty 208 * work of turning state transitions URL components for its API. 209 * 210 * This function should only be invoked with non-space characters. It is 211 * assumed that the caller cares about (and can detect) the transition between 212 * URL and non-URL states by looking for these. 213 */ 214static enum state 215parse_url_char(enum state s, const char ch) 216{ 217 if (ch == ' ' || ch == '\r' || ch == '\n') { 218 return s_dead; 219 } 220 221#if HTTP_PARSER_STRICT 222 if (ch == '\t' || ch == '\f') { 223 return s_dead; 224 } 225#endif 226 227 switch (s) { 228 case s_req_spaces_before_url: 229 /* Proxied requests are followed by scheme of an absolute URI (alpha). 230 * All methods except CONNECT are followed by '/' or '*'. 231 */ 232 233 if (ch == '/' || ch == '*') { 234 return s_req_path; 235 } 236 237 if (IS_ALPHA(ch)) { 238 return s_req_schema; 239 } 240 241 break; 242 243 case s_req_schema: 244 if (IS_ALPHA(ch)) { 245 return s; 246 } 247 248 if (ch == ':') { 249 return s_req_schema_slash; 250 } 251 252 break; 253 254 case s_req_schema_slash: 255 if (ch == '/') { 256 return s_req_schema_slash_slash; 257 } 258 259 break; 260 261 case s_req_schema_slash_slash: 262 if (ch == '/') { 263 return s_req_server_start; 264 } 265 266 break; 267 268 case s_req_server_with_at: 269 if (ch == '@') { 270 return s_dead; 271 } 272 273 /* fall through */ 274 case s_req_server_start: 275 case s_req_server: 276 if (ch == '/') { 277 return s_req_path; 278 } 279 280 if (ch == '?') { 281 return s_req_query_string_start; 282 } 283 284 if (ch == '@') { 285 return s_req_server_with_at; 286 } 287 288 if (IS_USERINFO_CHAR(ch) || ch == '[' || ch == ']') { 289 return s_req_server; 290 } 291 292 break; 293 294 case s_req_path: 295 if (IS_URL_CHAR(ch)) { 296 return s; 297 } 298 299 switch (ch) { 300 case '?': 301 return s_req_query_string_start; 302 303 case '#': 304 return s_req_fragment_start; 305 } 306 307 break; 308 309 case s_req_query_string_start: 310 case s_req_query_string: 311 if (IS_URL_CHAR(ch)) { 312 return s_req_query_string; 313 } 314 315 switch (ch) { 316 case '?': 317 /* allow extra '?' in query string */ 318 return s_req_query_string; 319 320 case '#': 321 return s_req_fragment_start; 322 } 323 324 break; 325 326 case s_req_fragment_start: 327 if (IS_URL_CHAR(ch)) { 328 return s_req_fragment; 329 } 330 331 switch (ch) { 332 case '?': 333 return s_req_fragment; 334 335 case '#': 336 return s; 337 } 338 339 break; 340 341 case s_req_fragment: 342 if (IS_URL_CHAR(ch)) { 343 return s; 344 } 345 346 switch (ch) { 347 case '?': 348 case '#': 349 return s; 350 } 351 352 break; 353 354 default: 355 break; 356 } 357 358 /* We should never fall out of the switch above unless there's an error */ 359 return s_dead; 360} 361 362static enum http_host_state 363http_parse_host_char(enum http_host_state s, const char ch) { 364 switch(s) { 365 case s_http_userinfo: 366 case s_http_userinfo_start: 367 if (ch == '@') { 368 return s_http_host_start; 369 } 370 371 if (IS_USERINFO_CHAR(ch)) { 372 return s_http_userinfo; 373 } 374 break; 375 376 case s_http_host_start: 377 if (ch == '[') { 378 return s_http_host_v6_start; 379 } 380 381 if (IS_HOST_CHAR(ch)) { 382 return s_http_host; 383 } 384 385 break; 386 387 case s_http_host: 388 if (IS_HOST_CHAR(ch)) { 389 return s_http_host; 390 } 391 392 /* fall through */ 393 case s_http_host_v6_end: 394 if (ch == ':') { 395 return s_http_host_port_start; 396 } 397 398 break; 399 400 case s_http_host_v6: 401 if (ch == ']') { 402 return s_http_host_v6_end; 403 } 404 405 /* fall through */ 406 case s_http_host_v6_start: 407 if (IS_HEX(ch) || ch == ':' || ch == '.') { 408 return s_http_host_v6; 409 } 410 411 if (s == s_http_host_v6 && ch == '%') { 412 return s_http_host_v6_zone_start; 413 } 414 break; 415 416 case s_http_host_v6_zone: 417 if (ch == ']') { 418 return s_http_host_v6_end; 419 } 420 421 /* fall through */ 422 case s_http_host_v6_zone_start: 423 /* RFC 6874 Zone ID consists of 1*( unreserved / pct-encoded) */ 424 if (IS_ALPHANUM(ch) || ch == '%' || ch == '.' || ch == '-' || ch == '_' || 425 ch == '~') { 426 return s_http_host_v6_zone; 427 } 428 break; 429 430 case s_http_host_port: 431 case s_http_host_port_start: 432 if (IS_NUM(ch)) { 433 return s_http_host_port; 434 } 435 436 break; 437 438 default: 439 break; 440 } 441 return s_http_host_dead; 442} 443 444static int 445http_parse_host(const char * buf, struct http_parser_url *u, int found_at) { 446 enum http_host_state s; 447 448 const char *p; 449 size_t buflen = u->field_data[UF_HOST].off + u->field_data[UF_HOST].len; 450 451 assert(u->field_set & (1 << UF_HOST)); 452 453 u->field_data[UF_HOST].len = 0; 454 455 s = found_at ? s_http_userinfo_start : s_http_host_start; 456 457 for (p = buf + u->field_data[UF_HOST].off; p < buf + buflen; p++) { 458 enum http_host_state new_s = http_parse_host_char(s, *p); 459 460 if (new_s == s_http_host_dead) { 461 return 1; 462 } 463 464 switch(new_s) { 465 case s_http_host: 466 if (s != s_http_host) { 467 u->field_data[UF_HOST].off = (uint16_t)(p - buf); 468 } 469 u->field_data[UF_HOST].len++; 470 break; 471 472 case s_http_host_v6: 473 if (s != s_http_host_v6) { 474 u->field_data[UF_HOST].off = (uint16_t)(p - buf); 475 } 476 u->field_data[UF_HOST].len++; 477 break; 478 479 case s_http_host_v6_zone_start: 480 case s_http_host_v6_zone: 481 u->field_data[UF_HOST].len++; 482 break; 483 484 case s_http_host_port: 485 if (s != s_http_host_port) { 486 u->field_data[UF_PORT].off = (uint16_t)(p - buf); 487 u->field_data[UF_PORT].len = 0; 488 u->field_set |= (1 << UF_PORT); 489 } 490 u->field_data[UF_PORT].len++; 491 break; 492 493 case s_http_userinfo: 494 if (s != s_http_userinfo) { 495 u->field_data[UF_USERINFO].off = (uint16_t)(p - buf); 496 u->field_data[UF_USERINFO].len = 0; 497 u->field_set |= (1 << UF_USERINFO); 498 } 499 u->field_data[UF_USERINFO].len++; 500 break; 501 502 default: 503 break; 504 } 505 s = new_s; 506 } 507 508 /* Make sure we don't end somewhere unexpected */ 509 switch (s) { 510 case s_http_host_start: 511 case s_http_host_v6_start: 512 case s_http_host_v6: 513 case s_http_host_v6_zone_start: 514 case s_http_host_v6_zone: 515 case s_http_host_port_start: 516 case s_http_userinfo: 517 case s_http_userinfo_start: 518 return 1; 519 default: 520 break; 521 } 522 523 return 0; 524} 525 526void 527http_parser_url_init(struct http_parser_url *u) { 528 memset(u, 0, sizeof(*u)); 529} 530 531int 532http_parser_parse_url(const char *buf, size_t buflen, int is_connect, 533 struct http_parser_url *u) 534{ 535 enum state s; 536 const char *p; 537 enum http_parser_url_fields uf, old_uf; 538 int found_at = 0; 539 540 if (buflen == 0) { 541 return 1; 542 } 543 544 u->port = u->field_set = 0; 545 s = is_connect ? s_req_server_start : s_req_spaces_before_url; 546 old_uf = UF_MAX; 547 548 for (p = buf; p < buf + buflen; p++) { 549 s = parse_url_char(s, *p); 550 551 /* Figure out the next field that we're operating on */ 552 switch (s) { 553 case s_dead: 554 return 1; 555 556 /* Skip delimeters */ 557 case s_req_schema_slash: 558 case s_req_schema_slash_slash: 559 case s_req_server_start: 560 case s_req_query_string_start: 561 case s_req_fragment_start: 562 continue; 563 564 case s_req_schema: 565 uf = UF_SCHEMA; 566 break; 567 568 case s_req_server_with_at: 569 found_at = 1; 570 571 /* fall through */ 572 case s_req_server: 573 uf = UF_HOST; 574 break; 575 576 case s_req_path: 577 uf = UF_PATH; 578 break; 579 580 case s_req_query_string: 581 uf = UF_QUERY; 582 break; 583 584 case s_req_fragment: 585 uf = UF_FRAGMENT; 586 break; 587 588 default: 589 assert(!"Unexpected state"); 590 return 1; 591 } 592 593 /* Nothing's changed; soldier on */ 594 if (uf == old_uf) { 595 u->field_data[uf].len++; 596 continue; 597 } 598 599 u->field_data[uf].off = (uint16_t)(p - buf); 600 u->field_data[uf].len = 1; 601 602 u->field_set |= (1 << uf); 603 old_uf = uf; 604 } 605 606 /* host must be present if there is a schema */ 607 /* parsing http:///toto will fail */ 608 if ((u->field_set & (1 << UF_SCHEMA)) && 609 (u->field_set & (1 << UF_HOST)) == 0) { 610 return 1; 611 } 612 613 if (u->field_set & (1 << UF_HOST)) { 614 if (http_parse_host(buf, u, found_at) != 0) { 615 return 1; 616 } 617 } 618 619 /* CONNECT requests can only contain "hostname:port" */ 620 if (is_connect && u->field_set != ((1 << UF_HOST)|(1 << UF_PORT))) { 621 return 1; 622 } 623 624 if (u->field_set & (1 << UF_PORT)) { 625 uint16_t off; 626 uint16_t len; 627 const char* p; 628 const char* end; 629 unsigned long v; 630 631 off = u->field_data[UF_PORT].off; 632 len = u->field_data[UF_PORT].len; 633 end = buf + off + len; 634 635 /* NOTE: The characters are already validated and are in the [0-9] range */ 636 assert(off + len <= buflen && "Port number overflow"); 637 v = 0; 638 for (p = buf + off; p < end; p++) { 639 v *= 10; 640 v += *p - '0'; 641 642 /* Ports have a max value of 2^16 */ 643 if (v > 0xffff) { 644 return 1; 645 } 646 } 647 648 u->port = (uint16_t) v; 649 } 650 651 return 0; 652} 653