1/* Copyright Joyent, Inc. and other Node contributors.
2 *
3 * Permission is hereby granted, free of charge, to any person obtaining a copy
4 * of this software and associated documentation files (the "Software"), to
5 * deal in the Software without restriction, including without limitation the
6 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
7 * sell copies of the Software, and to permit persons to whom the Software is
8 * furnished to do so, subject to the following conditions:
9 *
10 * The above copyright notice and this permission notice shall be included in
11 * all copies or substantial portions of the Software.
12 *
13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
18 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
19 * IN THE SOFTWARE.
20 */
21#include "url_parser.h"
22#include <assert.h>
23#include <stddef.h>
24#include <ctype.h>
25#include <string.h>
26#include <limits.h>
27
#ifndef BIT_AT
/* BIT_AT(a, i): value (0 or 1, as an int) of bit number `i` in the byte
 * array `a`. Byte i / 8 holds the bit, at position i % 8 counted from the
 * least significant bit. Both arguments are evaluated exactly once.
 */
# define BIT_AT(a, i)                                                \
  ((int) ((((unsigned int) (a)[(unsigned int) (i) / 8u]) >>          \
           ((unsigned int) (i) % 8u)) & 1u))
#endif
33
/* T(v) marks table entries that are only accepted outside strict mode: it
 * expands to 0 when HTTP_PARSER_STRICT is non-zero and to v otherwise. It is
 * a helper for the table below only and is #undef'd right after it.
 */
#if HTTP_PARSER_STRICT
# define T(v) 0
#else
# define T(v) v
#endif

/* Bitmap with one bit per 7-bit ASCII code (32 bytes x 8 bits = 128 codes).
 * Byte (c >> 3), bit (c & 7) is set when character c counts as a "normal"
 * URL character; it is queried through IS_URL_CHAR() / BIT_AT().
 *
 * Each row below is one byte: the comment above it names the eight
 * characters it covers, and the OR-ed terms are the individual bit values
 * (1, 2, 4, ... 128) for those characters, with 0 meaning "not allowed".
 *
 * Cleared bits: all control characters, space, '#', '?' and DEL. Tab (9) and
 * form feed (12) are wrapped in T(), so they are allowed only when
 * HTTP_PARSER_STRICT is zero.
 */
static const uint8_t normal_url_char[32] = {
/*   0 nul    1 soh    2 stx    3 etx    4 eot    5 enq    6 ack    7 bel  */
        0    |   0    |   0    |   0    |   0    |   0    |   0    |   0,
/*   8 bs     9 ht    10 nl    11 vt    12 np    13 cr    14 so    15 si   */
        0    | T(2)   |   0    |   0    | T(16)  |   0    |   0    |   0,
/*  16 dle   17 dc1   18 dc2   19 dc3   20 dc4   21 nak   22 syn   23 etb */
        0    |   0    |   0    |   0    |   0    |   0    |   0    |   0,
/*  24 can   25 em    26 sub   27 esc   28 fs    29 gs    30 rs    31 us  */
        0    |   0    |   0    |   0    |   0    |   0    |   0    |   0,
/*  32 sp    33  !    34  "    35  #    36  $    37  %    38  &    39  '  */
        0    |   2    |   4    |   0    |   16   |   32   |   64   |  128,
/*  40  (    41  )    42  *    43  +    44  ,    45  -    46  .    47  /  */
        1    |   2    |   4    |   8    |   16   |   32   |   64   |  128,
/*  48  0    49  1    50  2    51  3    52  4    53  5    54  6    55  7  */
        1    |   2    |   4    |   8    |   16   |   32   |   64   |  128,
/*  56  8    57  9    58  :    59  ;    60  <    61  =    62  >    63  ?  */
        1    |   2    |   4    |   8    |   16   |   32   |   64   |   0,
/*  64  @    65  A    66  B    67  C    68  D    69  E    70  F    71  G  */
        1    |   2    |   4    |   8    |   16   |   32   |   64   |  128,
/*  72  H    73  I    74  J    75  K    76  L    77  M    78  N    79  O  */
        1    |   2    |   4    |   8    |   16   |   32   |   64   |  128,
/*  80  P    81  Q    82  R    83  S    84  T    85  U    86  V    87  W  */
        1    |   2    |   4    |   8    |   16   |   32   |   64   |  128,
/*  88  X    89  Y    90  Z    91  [    92  \    93  ]    94  ^    95  _  */
        1    |   2    |   4    |   8    |   16   |   32   |   64   |  128,
/*  96  `    97  a    98  b    99  c   100  d   101  e   102  f   103  g  */
        1    |   2    |   4    |   8    |   16   |   32   |   64   |  128,
/* 104  h   105  i   106  j   107  k   108  l   109  m   110  n   111  o  */
        1    |   2    |   4    |   8    |   16   |   32   |   64   |  128,
/* 112  p   113  q   114  r   115  s   116  t   117  u   118  v   119  w  */
        1    |   2    |   4    |   8    |   16   |   32   |   64   |  128,
/* 120  x   121  y   122  z   123  {   124  |   125  }   126  ~   127 del */
        1    |   2    |   4    |   8    |   16   |   32   |   64   |   0, };

#undef T
75
/* Parser states. Enumerators are numbered consecutively starting at 1, so
 * their relative order is significant (see the note after s_headers_done).
 */
enum state {
  /* important that this is > 0 */
  s_dead = 1,

  s_start_req_or_res,
  s_res_or_resp_H,
  s_start_res,
  s_res_H,
  s_res_HT,
  s_res_HTT,
  s_res_HTTP,
  s_res_http_major,
  s_res_http_dot,
  s_res_http_minor,
  s_res_http_end,
  s_res_first_status_code,
  s_res_status_code,
  s_res_status_start,
  s_res_status,
  s_res_line_almost_done,

  s_start_req,

  s_req_method,
  /* s_req_spaces_before_url .. s_req_fragment are the states handled by
   * parse_url_char(). */
  s_req_spaces_before_url,
  s_req_schema,
  s_req_schema_slash,
  s_req_schema_slash_slash,
  s_req_server_start,
  s_req_server,
  s_req_server_with_at,
  s_req_path,
  s_req_query_string_start,
  s_req_query_string,
  s_req_fragment_start,
  s_req_fragment,
  s_req_http_start,
  s_req_http_H,
  s_req_http_HT,
  s_req_http_HTT,
  s_req_http_HTTP,
  s_req_http_I,
  s_req_http_IC,
  s_req_http_major,
  s_req_http_dot,
  s_req_http_minor,
  s_req_http_end,
  s_req_line_almost_done,

  s_header_field_start,
  s_header_field,
  s_header_value_discard_ws,
  s_header_value_discard_ws_almost_done,
  s_header_value_discard_lws,
  s_header_value_start,
  s_header_value,
  s_header_value_lws,

  s_header_almost_done,

  s_chunk_size_start,
  s_chunk_size,
  s_chunk_parameters,
  s_chunk_size_almost_done,

  s_headers_almost_done,
  s_headers_done,

  /* Important: 's_headers_done' must be the last 'header' state. All
   * states beyond this must be 'body' states. It is used for overflow
   * checking. See the PARSING_HEADER() macro.
   */

  s_chunk_data,
  s_chunk_data_almost_done,
  s_chunk_data_done,

  s_body_identity,
  s_body_identity_eof,

  s_message_done
};
157
/* States of the authority ("userinfo@host:port") sub-parser implemented by
 * http_parse_host_char(). s_http_host_dead is the rejection state.
 */
enum http_host_state {
  s_http_host_dead = 1,
  s_http_userinfo_start,
  s_http_userinfo,
  s_http_host_start,
  s_http_host_v6_start,
  s_http_host,
  s_http_host_v6,
  s_http_host_v6_end,
  s_http_host_v6_zone_start,
  s_http_host_v6_zone,
  s_http_host_port_start,
  s_http_host_port
};
173
/* Macros for character classes; depends on strict-mode.
 *
 * Every use of a macro argument is parenthesized so that expression
 * arguments (e.g. a conditional expression) are classified as a whole.
 * Most of these macros still evaluate their argument more than once, so do
 * not pass expressions with side effects.
 */
#define CR                  '\r'
#define LF                  '\n'
/* Sets bit 0x20, which maps 'A'..'Z' onto 'a'..'z'; other bytes may change
 * too, so use the result only for range comparisons as done below. */
#define LOWER(c)            ((unsigned char)((c) | 0x20))
#define IS_ALPHA(c)         (LOWER(c) >= 'a' && LOWER(c) <= 'z')
#define IS_NUM(c)           ((c) >= '0' && (c) <= '9')
#define IS_ALPHANUM(c)      (IS_ALPHA(c) || IS_NUM(c))
#define IS_HEX(c)           (IS_NUM(c) || (LOWER(c) >= 'a' && LOWER(c) <= 'f'))
#define IS_MARK(c)          ((c) == '-' || (c) == '_' || (c) == '.' || \
  (c) == '!' || (c) == '~' || (c) == '*' || (c) == '\'' || (c) == '(' || \
  (c) == ')')
#define IS_USERINFO_CHAR(c) (IS_ALPHANUM(c) || IS_MARK(c) || (c) == '%' || \
  (c) == ';' || (c) == ':' || (c) == '&' || (c) == '=' || (c) == '+' || \
  (c) == '$' || (c) == ',')

/* NOTE(review): `tokens` is not defined in the visible part of this file;
 * STRICT_TOKEN/TOKEN can only be used where such a table is in scope. */
#define STRICT_TOKEN(c)     (((c) == ' ') ? 0 : tokens[(unsigned char)(c)])

#if HTTP_PARSER_STRICT
/* Strict mode: only bytes flagged in normal_url_char are URL characters and
 * host names may not contain '_'. */
#define TOKEN(c)            STRICT_TOKEN(c)
#define IS_URL_CHAR(c)      (BIT_AT(normal_url_char, (unsigned char)(c)))
#define IS_HOST_CHAR(c)     (IS_ALPHANUM(c) || (c) == '.' || (c) == '-')
#else
/* Non-strict mode: bytes with the high bit set are also accepted as URL
 * characters, and '_' is accepted in host names. */
#define TOKEN(c)            tokens[(unsigned char)(c)]
#define IS_URL_CHAR(c)                                                         \
  (BIT_AT(normal_url_char, (unsigned char)(c)) || ((c) & 0x80))
#define IS_HOST_CHAR(c)                                                        \
  (IS_ALPHANUM(c) || (c) == '.' || (c) == '-' || (c) == '_')
#endif
202
203/* Our URL parser.
204 *
205 * This is designed to be shared by http_parser_execute() for URL validation,
206 * hence it has a state transition + byte-for-byte interface. In addition, it
207 * is meant to be embedded in http_parser_parse_url(), which does the dirty
208 * work of turning state transitions URL components for its API.
209 *
210 * This function should only be invoked with non-space characters. It is
211 * assumed that the caller cares about (and can detect) the transition between
212 * URL and non-URL states by looking for these.
213 */
214static enum state
215parse_url_char(enum state s, const char ch)
216{
217  if (ch == ' ' || ch == '\r' || ch == '\n') {
218    return s_dead;
219  }
220
221#if HTTP_PARSER_STRICT
222  if (ch == '\t' || ch == '\f') {
223    return s_dead;
224  }
225#endif
226
227  switch (s) {
228    case s_req_spaces_before_url:
229      /* Proxied requests are followed by scheme of an absolute URI (alpha).
230       * All methods except CONNECT are followed by '/' or '*'.
231       */
232
233      if (ch == '/' || ch == '*') {
234        return s_req_path;
235      }
236
237      if (IS_ALPHA(ch)) {
238        return s_req_schema;
239      }
240
241      break;
242
243    case s_req_schema:
244      if (IS_ALPHA(ch)) {
245        return s;
246      }
247
248      if (ch == ':') {
249        return s_req_schema_slash;
250      }
251
252      break;
253
254    case s_req_schema_slash:
255      if (ch == '/') {
256        return s_req_schema_slash_slash;
257      }
258
259      break;
260
261    case s_req_schema_slash_slash:
262      if (ch == '/') {
263        return s_req_server_start;
264      }
265
266      break;
267
268    case s_req_server_with_at:
269      if (ch == '@') {
270        return s_dead;
271      }
272
273    /* fall through */
274    case s_req_server_start:
275    case s_req_server:
276      if (ch == '/') {
277        return s_req_path;
278      }
279
280      if (ch == '?') {
281        return s_req_query_string_start;
282      }
283
284      if (ch == '@') {
285        return s_req_server_with_at;
286      }
287
288      if (IS_USERINFO_CHAR(ch) || ch == '[' || ch == ']') {
289        return s_req_server;
290      }
291
292      break;
293
294    case s_req_path:
295      if (IS_URL_CHAR(ch)) {
296        return s;
297      }
298
299      switch (ch) {
300        case '?':
301          return s_req_query_string_start;
302
303        case '#':
304          return s_req_fragment_start;
305      }
306
307      break;
308
309    case s_req_query_string_start:
310    case s_req_query_string:
311      if (IS_URL_CHAR(ch)) {
312        return s_req_query_string;
313      }
314
315      switch (ch) {
316        case '?':
317          /* allow extra '?' in query string */
318          return s_req_query_string;
319
320        case '#':
321          return s_req_fragment_start;
322      }
323
324      break;
325
326    case s_req_fragment_start:
327      if (IS_URL_CHAR(ch)) {
328        return s_req_fragment;
329      }
330
331      switch (ch) {
332        case '?':
333          return s_req_fragment;
334
335        case '#':
336          return s;
337      }
338
339      break;
340
341    case s_req_fragment:
342      if (IS_URL_CHAR(ch)) {
343        return s;
344      }
345
346      switch (ch) {
347        case '?':
348        case '#':
349          return s;
350      }
351
352      break;
353
354    default:
355      break;
356  }
357
358  /* We should never fall out of the switch above unless there's an error */
359  return s_dead;
360}
361
362static enum http_host_state
363http_parse_host_char(enum http_host_state s, const char ch) {
364  switch(s) {
365    case s_http_userinfo:
366    case s_http_userinfo_start:
367      if (ch == '@') {
368        return s_http_host_start;
369      }
370
371      if (IS_USERINFO_CHAR(ch)) {
372        return s_http_userinfo;
373      }
374      break;
375
376    case s_http_host_start:
377      if (ch == '[') {
378        return s_http_host_v6_start;
379      }
380
381      if (IS_HOST_CHAR(ch)) {
382        return s_http_host;
383      }
384
385      break;
386
387    case s_http_host:
388      if (IS_HOST_CHAR(ch)) {
389        return s_http_host;
390      }
391
392    /* fall through */
393    case s_http_host_v6_end:
394      if (ch == ':') {
395        return s_http_host_port_start;
396      }
397
398      break;
399
400    case s_http_host_v6:
401      if (ch == ']') {
402        return s_http_host_v6_end;
403      }
404
405    /* fall through */
406    case s_http_host_v6_start:
407      if (IS_HEX(ch) || ch == ':' || ch == '.') {
408        return s_http_host_v6;
409      }
410
411      if (s == s_http_host_v6 && ch == '%') {
412        return s_http_host_v6_zone_start;
413      }
414      break;
415
416    case s_http_host_v6_zone:
417      if (ch == ']') {
418        return s_http_host_v6_end;
419      }
420
421    /* fall through */
422    case s_http_host_v6_zone_start:
423      /* RFC 6874 Zone ID consists of 1*( unreserved / pct-encoded) */
424      if (IS_ALPHANUM(ch) || ch == '%' || ch == '.' || ch == '-' || ch == '_' ||
425          ch == '~') {
426        return s_http_host_v6_zone;
427      }
428      break;
429
430    case s_http_host_port:
431    case s_http_host_port_start:
432      if (IS_NUM(ch)) {
433        return s_http_host_port;
434      }
435
436      break;
437
438    default:
439      break;
440  }
441  return s_http_host_dead;
442}
443
444static int
445http_parse_host(const char * buf, struct http_parser_url *u, int found_at) {
446  enum http_host_state s;
447
448  const char *p;
449  size_t buflen = u->field_data[UF_HOST].off + u->field_data[UF_HOST].len;
450
451  assert(u->field_set & (1 << UF_HOST));
452
453  u->field_data[UF_HOST].len = 0;
454
455  s = found_at ? s_http_userinfo_start : s_http_host_start;
456
457  for (p = buf + u->field_data[UF_HOST].off; p < buf + buflen; p++) {
458    enum http_host_state new_s = http_parse_host_char(s, *p);
459
460    if (new_s == s_http_host_dead) {
461      return 1;
462    }
463
464    switch(new_s) {
465      case s_http_host:
466        if (s != s_http_host) {
467          u->field_data[UF_HOST].off = (uint16_t)(p - buf);
468        }
469        u->field_data[UF_HOST].len++;
470        break;
471
472      case s_http_host_v6:
473        if (s != s_http_host_v6) {
474          u->field_data[UF_HOST].off = (uint16_t)(p - buf);
475        }
476        u->field_data[UF_HOST].len++;
477        break;
478
479      case s_http_host_v6_zone_start:
480      case s_http_host_v6_zone:
481        u->field_data[UF_HOST].len++;
482        break;
483
484      case s_http_host_port:
485        if (s != s_http_host_port) {
486          u->field_data[UF_PORT].off = (uint16_t)(p - buf);
487          u->field_data[UF_PORT].len = 0;
488          u->field_set |= (1 << UF_PORT);
489        }
490        u->field_data[UF_PORT].len++;
491        break;
492
493      case s_http_userinfo:
494        if (s != s_http_userinfo) {
495          u->field_data[UF_USERINFO].off = (uint16_t)(p - buf);
496          u->field_data[UF_USERINFO].len = 0;
497          u->field_set |= (1 << UF_USERINFO);
498        }
499        u->field_data[UF_USERINFO].len++;
500        break;
501
502      default:
503        break;
504    }
505    s = new_s;
506  }
507
508  /* Make sure we don't end somewhere unexpected */
509  switch (s) {
510    case s_http_host_start:
511    case s_http_host_v6_start:
512    case s_http_host_v6:
513    case s_http_host_v6_zone_start:
514    case s_http_host_v6_zone:
515    case s_http_host_port_start:
516    case s_http_userinfo:
517    case s_http_userinfo_start:
518      return 1;
519    default:
520      break;
521  }
522
523  return 0;
524}
525
526void
527http_parser_url_init(struct http_parser_url *u) {
528  memset(u, 0, sizeof(*u));
529}
530
531int
532http_parser_parse_url(const char *buf, size_t buflen, int is_connect,
533                      struct http_parser_url *u)
534{
535  enum state s;
536  const char *p;
537  enum http_parser_url_fields uf, old_uf;
538  int found_at = 0;
539
540  if (buflen == 0) {
541    return 1;
542  }
543
544  u->port = u->field_set = 0;
545  s = is_connect ? s_req_server_start : s_req_spaces_before_url;
546  old_uf = UF_MAX;
547
548  for (p = buf; p < buf + buflen; p++) {
549    s = parse_url_char(s, *p);
550
551    /* Figure out the next field that we're operating on */
552    switch (s) {
553      case s_dead:
554        return 1;
555
556      /* Skip delimeters */
557      case s_req_schema_slash:
558      case s_req_schema_slash_slash:
559      case s_req_server_start:
560      case s_req_query_string_start:
561      case s_req_fragment_start:
562        continue;
563
564      case s_req_schema:
565        uf = UF_SCHEMA;
566        break;
567
568      case s_req_server_with_at:
569        found_at = 1;
570
571      /* fall through */
572      case s_req_server:
573        uf = UF_HOST;
574        break;
575
576      case s_req_path:
577        uf = UF_PATH;
578        break;
579
580      case s_req_query_string:
581        uf = UF_QUERY;
582        break;
583
584      case s_req_fragment:
585        uf = UF_FRAGMENT;
586        break;
587
588      default:
589        assert(!"Unexpected state");
590        return 1;
591    }
592
593    /* Nothing's changed; soldier on */
594    if (uf == old_uf) {
595      u->field_data[uf].len++;
596      continue;
597    }
598
599    u->field_data[uf].off = (uint16_t)(p - buf);
600    u->field_data[uf].len = 1;
601
602    u->field_set |= (1 << uf);
603    old_uf = uf;
604  }
605
606  /* host must be present if there is a schema */
607  /* parsing http:///toto will fail */
608  if ((u->field_set & (1 << UF_SCHEMA)) &&
609      (u->field_set & (1 << UF_HOST)) == 0) {
610    return 1;
611  }
612
613  if (u->field_set & (1 << UF_HOST)) {
614    if (http_parse_host(buf, u, found_at) != 0) {
615      return 1;
616    }
617  }
618
619  /* CONNECT requests can only contain "hostname:port" */
620  if (is_connect && u->field_set != ((1 << UF_HOST)|(1 << UF_PORT))) {
621    return 1;
622  }
623
624  if (u->field_set & (1 << UF_PORT)) {
625    uint16_t off;
626    uint16_t len;
627    const char* p;
628    const char* end;
629    unsigned long v;
630
631    off = u->field_data[UF_PORT].off;
632    len = u->field_data[UF_PORT].len;
633    end = buf + off + len;
634
635    /* NOTE: The characters are already validated and are in the [0-9] range */
636    assert(off + len <= buflen && "Port number overflow");
637    v = 0;
638    for (p = buf + off; p < end; p++) {
639      v *= 10;
640      v += *p - '0';
641
642      /* Ports have a max value of 2^16 */
643      if (v > 0xffff) {
644        return 1;
645      }
646    }
647
648    u->port = (uint16_t) v;
649  }
650
651  return 0;
652}
653