1//     __ _____ _____ _____
2//  __|  |   __|     |   | |  JSON for Modern C++
3// |  |  |__   |  |  | | | |  version 3.11.2
4// |_____|_____|_____|_|___|  https://github.com/nlohmann/json
5//
6// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann <https://nlohmann.me>
7// SPDX-License-Identifier: MIT
8
9#pragma once
10
11#include <array> // array
12#include <clocale> // localeconv
13#include <cstddef> // size_t
14#include <cstdio> // snprintf
15#include <cstdlib> // strtof, strtod, strtold, strtoll, strtoull
16#include <initializer_list> // initializer_list
17#include <string> // char_traits, string
18#include <utility> // move
19#include <vector> // vector
20
21#include <nlohmann/detail/input/input_adapters.hpp>
22#include <nlohmann/detail/input/position_t.hpp>
23#include <nlohmann/detail/macro_scope.hpp>
24
25NLOHMANN_JSON_NAMESPACE_BEGIN
26namespace detail
27{
28
29///////////
30// lexer //
31///////////
32
33template<typename BasicJsonType>
34class lexer_base
35{
36  public:
37    /// token types for the parser
38    enum class token_type
39    {
40        uninitialized,    ///< indicating the scanner is uninitialized
41        literal_true,     ///< the `true` literal
42        literal_false,    ///< the `false` literal
43        literal_null,     ///< the `null` literal
44        value_string,     ///< a string -- use get_string() for actual value
45        value_unsigned,   ///< an unsigned integer -- use get_number_unsigned() for actual value
46        value_integer,    ///< a signed integer -- use get_number_integer() for actual value
47        value_float,      ///< an floating point number -- use get_number_float() for actual value
48        begin_array,      ///< the character for array begin `[`
49        begin_object,     ///< the character for object begin `{`
50        end_array,        ///< the character for array end `]`
51        end_object,       ///< the character for object end `}`
52        name_separator,   ///< the name separator `:`
53        value_separator,  ///< the value separator `,`
54        parse_error,      ///< indicating a parse error
55        end_of_input,     ///< indicating the end of the input buffer
56        literal_or_value  ///< a literal or the begin of a value (only for diagnostics)
57    };
58
59    /// return name of values of type token_type (only used for errors)
60    JSON_HEDLEY_RETURNS_NON_NULL
61    JSON_HEDLEY_CONST
62    static const char* token_type_name(const token_type t) noexcept
63    {
64        switch (t)
65        {
66            case token_type::uninitialized:
67                return "<uninitialized>";
68            case token_type::literal_true:
69                return "true literal";
70            case token_type::literal_false:
71                return "false literal";
72            case token_type::literal_null:
73                return "null literal";
74            case token_type::value_string:
75                return "string literal";
76            case token_type::value_unsigned:
77            case token_type::value_integer:
78            case token_type::value_float:
79                return "number literal";
80            case token_type::begin_array:
81                return "'['";
82            case token_type::begin_object:
83                return "'{'";
84            case token_type::end_array:
85                return "']'";
86            case token_type::end_object:
87                return "'}'";
88            case token_type::name_separator:
89                return "':'";
90            case token_type::value_separator:
91                return "','";
92            case token_type::parse_error:
93                return "<parse error>";
94            case token_type::end_of_input:
95                return "end of input";
96            case token_type::literal_or_value:
97                return "'[', '{', or a literal";
98            // LCOV_EXCL_START
99            default: // catch non-enum values
100                return "unknown token";
101                // LCOV_EXCL_STOP
102        }
103    }
104};
105/*!
106@brief lexical analysis
107
108This class organizes the lexical analysis during JSON deserialization.
109*/
110template<typename BasicJsonType, typename InputAdapterType>
111class lexer : public lexer_base<BasicJsonType>
112{
113    using number_integer_t = typename BasicJsonType::number_integer_t;
114    using number_unsigned_t = typename BasicJsonType::number_unsigned_t;
115    using number_float_t = typename BasicJsonType::number_float_t;
116    using string_t = typename BasicJsonType::string_t;
117    using char_type = typename InputAdapterType::char_type;
118    using char_int_type = typename std::char_traits<char_type>::int_type;
119
120  public:
121    using token_type = typename lexer_base<BasicJsonType>::token_type;
122
123    explicit lexer(InputAdapterType&& adapter, bool ignore_comments_ = false) noexcept
124        : ia(std::move(adapter))
125        , ignore_comments(ignore_comments_)
126        , decimal_point_char(static_cast<char_int_type>(get_decimal_point()))
127    {}
128
129    // delete because of pointer members
130    lexer(const lexer&) = delete;
131    lexer(lexer&&) = default; // NOLINT(hicpp-noexcept-move,performance-noexcept-move-constructor)
132    lexer& operator=(lexer&) = delete;
133    lexer& operator=(lexer&&) = default; // NOLINT(hicpp-noexcept-move,performance-noexcept-move-constructor)
134    ~lexer() = default;
135
136  private:
137    /////////////////////
138    // locales
139    /////////////////////
140
141    /// return the locale-dependent decimal point
142    JSON_HEDLEY_PURE
143    static char get_decimal_point() noexcept
144    {
145        const auto* loc = localeconv();
146        JSON_ASSERT(loc != nullptr);
147        return (loc->decimal_point == nullptr) ? '.' : *(loc->decimal_point);
148    }
149
150    /////////////////////
151    // scan functions
152    /////////////////////
153
154    /*!
155    @brief get codepoint from 4 hex characters following `\u`
156
157    For input "\u c1 c2 c3 c4" the codepoint is:
158      (c1 * 0x1000) + (c2 * 0x0100) + (c3 * 0x0010) + c4
159    = (c1 << 12) + (c2 << 8) + (c3 << 4) + (c4 << 0)
160
161    Furthermore, the possible characters '0'..'9', 'A'..'F', and 'a'..'f'
162    must be converted to the integers 0x0..0x9, 0xA..0xF, 0xA..0xF, resp. The
163    conversion is done by subtracting the offset (0x30, 0x37, and 0x57)
164    between the ASCII value of the character and the desired integer value.
165
166    @return codepoint (0x0000..0xFFFF) or -1 in case of an error (e.g. EOF or
167            non-hex character)
168    */
169    int get_codepoint()
170    {
171        // this function only makes sense after reading `\u`
172        JSON_ASSERT(current == 'u');
173        int codepoint = 0;
174
175        const auto factors = { 12u, 8u, 4u, 0u };
176        for (const auto factor : factors)
177        {
178            get();
179
180            if (current >= '0' && current <= '9')
181            {
182                codepoint += static_cast<int>((static_cast<unsigned int>(current) - 0x30u) << factor);
183            }
184            else if (current >= 'A' && current <= 'F')
185            {
186                codepoint += static_cast<int>((static_cast<unsigned int>(current) - 0x37u) << factor);
187            }
188            else if (current >= 'a' && current <= 'f')
189            {
190                codepoint += static_cast<int>((static_cast<unsigned int>(current) - 0x57u) << factor);
191            }
192            else
193            {
194                return -1;
195            }
196        }
197
198        JSON_ASSERT(0x0000 <= codepoint && codepoint <= 0xFFFF);
199        return codepoint;
200    }
201
202    /*!
203    @brief check if the next byte(s) are inside a given range
204
205    Adds the current byte and, for each passed range, reads a new byte and
206    checks if it is inside the range. If a violation was detected, set up an
207    error message and return false. Otherwise, return true.
208
209    @param[in] ranges  list of integers; interpreted as list of pairs of
210                       inclusive lower and upper bound, respectively
211
212    @pre The passed list @a ranges must have 2, 4, or 6 elements; that is,
213         1, 2, or 3 pairs. This precondition is enforced by an assertion.
214
215    @return true if and only if no range violation was detected
216    */
217    bool next_byte_in_range(std::initializer_list<char_int_type> ranges)
218    {
219        JSON_ASSERT(ranges.size() == 2 || ranges.size() == 4 || ranges.size() == 6);
220        add(current);
221
222        for (auto range = ranges.begin(); range != ranges.end(); ++range)
223        {
224            get();
225            if (JSON_HEDLEY_LIKELY(*range <= current && current <= *(++range)))
226            {
227                add(current);
228            }
229            else
230            {
231                error_message = "invalid string: ill-formed UTF-8 byte";
232                return false;
233            }
234        }
235
236        return true;
237    }
238
239    /*!
240    @brief scan a string literal
241
242    This function scans a string according to Sect. 7 of RFC 8259. While
243    scanning, bytes are escaped and copied into buffer token_buffer. Then the
244    function returns successfully, token_buffer is *not* null-terminated (as it
245    may contain \0 bytes), and token_buffer.size() is the number of bytes in the
246    string.
247
248    @return token_type::value_string if string could be successfully scanned,
249            token_type::parse_error otherwise
250
251    @note In case of errors, variable error_message contains a textual
252          description.
253    */
254    token_type scan_string()
255    {
256        // reset token_buffer (ignore opening quote)
257        reset();
258
259        // we entered the function by reading an open quote
260        JSON_ASSERT(current == '\"');
261
262        while (true)
263        {
264            // get next character
265            switch (get())
266            {
267                // end of file while parsing string
268                case std::char_traits<char_type>::eof():
269                {
270                    error_message = "invalid string: missing closing quote";
271                    return token_type::parse_error;
272                }
273
274                // closing quote
275                case '\"':
276                {
277                    return token_type::value_string;
278                }
279
280                // escapes
281                case '\\':
282                {
283                    switch (get())
284                    {
285                        // quotation mark
286                        case '\"':
287                            add('\"');
288                            break;
289                        // reverse solidus
290                        case '\\':
291                            add('\\');
292                            break;
293                        // solidus
294                        case '/':
295                            add('/');
296                            break;
297                        // backspace
298                        case 'b':
299                            add('\b');
300                            break;
301                        // form feed
302                        case 'f':
303                            add('\f');
304                            break;
305                        // line feed
306                        case 'n':
307                            add('\n');
308                            break;
309                        // carriage return
310                        case 'r':
311                            add('\r');
312                            break;
313                        // tab
314                        case 't':
315                            add('\t');
316                            break;
317
318                        // unicode escapes
319                        case 'u':
320                        {
321                            const int codepoint1 = get_codepoint();
322                            int codepoint = codepoint1; // start with codepoint1
323
324                            if (JSON_HEDLEY_UNLIKELY(codepoint1 == -1))
325                            {
326                                error_message = "invalid string: '\\u' must be followed by 4 hex digits";
327                                return token_type::parse_error;
328                            }
329
330                            // check if code point is a high surrogate
331                            if (0xD800 <= codepoint1 && codepoint1 <= 0xDBFF)
332                            {
333                                // expect next \uxxxx entry
334                                if (JSON_HEDLEY_LIKELY(get() == '\\' && get() == 'u'))
335                                {
336                                    const int codepoint2 = get_codepoint();
337
338                                    if (JSON_HEDLEY_UNLIKELY(codepoint2 == -1))
339                                    {
340                                        error_message = "invalid string: '\\u' must be followed by 4 hex digits";
341                                        return token_type::parse_error;
342                                    }
343
344                                    // check if codepoint2 is a low surrogate
345                                    if (JSON_HEDLEY_LIKELY(0xDC00 <= codepoint2 && codepoint2 <= 0xDFFF))
346                                    {
347                                        // overwrite codepoint
348                                        codepoint = static_cast<int>(
349                                                        // high surrogate occupies the most significant 22 bits
350                                                        (static_cast<unsigned int>(codepoint1) << 10u)
351                                                        // low surrogate occupies the least significant 15 bits
352                                                        + static_cast<unsigned int>(codepoint2)
353                                                        // there is still the 0xD800, 0xDC00 and 0x10000 noise
354                                                        // in the result, so we have to subtract with:
355                                                        // (0xD800 << 10) + DC00 - 0x10000 = 0x35FDC00
356                                                        - 0x35FDC00u);
357                                    }
358                                    else
359                                    {
360                                        error_message = "invalid string: surrogate U+D800..U+DBFF must be followed by U+DC00..U+DFFF";
361                                        return token_type::parse_error;
362                                    }
363                                }
364                                else
365                                {
366                                    error_message = "invalid string: surrogate U+D800..U+DBFF must be followed by U+DC00..U+DFFF";
367                                    return token_type::parse_error;
368                                }
369                            }
370                            else
371                            {
372                                if (JSON_HEDLEY_UNLIKELY(0xDC00 <= codepoint1 && codepoint1 <= 0xDFFF))
373                                {
374                                    error_message = "invalid string: surrogate U+DC00..U+DFFF must follow U+D800..U+DBFF";
375                                    return token_type::parse_error;
376                                }
377                            }
378
379                            // result of the above calculation yields a proper codepoint
380                            JSON_ASSERT(0x00 <= codepoint && codepoint <= 0x10FFFF);
381
382                            // translate codepoint into bytes
383                            if (codepoint < 0x80)
384                            {
385                                // 1-byte characters: 0xxxxxxx (ASCII)
386                                add(static_cast<char_int_type>(codepoint));
387                            }
388                            else if (codepoint <= 0x7FF)
389                            {
390                                // 2-byte characters: 110xxxxx 10xxxxxx
391                                add(static_cast<char_int_type>(0xC0u | (static_cast<unsigned int>(codepoint) >> 6u)));
392                                add(static_cast<char_int_type>(0x80u | (static_cast<unsigned int>(codepoint) & 0x3Fu)));
393                            }
394                            else if (codepoint <= 0xFFFF)
395                            {
396                                // 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx
397                                add(static_cast<char_int_type>(0xE0u | (static_cast<unsigned int>(codepoint) >> 12u)));
398                                add(static_cast<char_int_type>(0x80u | ((static_cast<unsigned int>(codepoint) >> 6u) & 0x3Fu)));
399                                add(static_cast<char_int_type>(0x80u | (static_cast<unsigned int>(codepoint) & 0x3Fu)));
400                            }
401                            else
402                            {
403                                // 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
404                                add(static_cast<char_int_type>(0xF0u | (static_cast<unsigned int>(codepoint) >> 18u)));
405                                add(static_cast<char_int_type>(0x80u | ((static_cast<unsigned int>(codepoint) >> 12u) & 0x3Fu)));
406                                add(static_cast<char_int_type>(0x80u | ((static_cast<unsigned int>(codepoint) >> 6u) & 0x3Fu)));
407                                add(static_cast<char_int_type>(0x80u | (static_cast<unsigned int>(codepoint) & 0x3Fu)));
408                            }
409
410                            break;
411                        }
412
413                        // other characters after escape
414                        default:
415                            error_message = "invalid string: forbidden character after backslash";
416                            return token_type::parse_error;
417                    }
418
419                    break;
420                }
421
422                // invalid control characters
423                case 0x00:
424                {
425                    error_message = "invalid string: control character U+0000 (NUL) must be escaped to \\u0000";
426                    return token_type::parse_error;
427                }
428
429                case 0x01:
430                {
431                    error_message = "invalid string: control character U+0001 (SOH) must be escaped to \\u0001";
432                    return token_type::parse_error;
433                }
434
435                case 0x02:
436                {
437                    error_message = "invalid string: control character U+0002 (STX) must be escaped to \\u0002";
438                    return token_type::parse_error;
439                }
440
441                case 0x03:
442                {
443                    error_message = "invalid string: control character U+0003 (ETX) must be escaped to \\u0003";
444                    return token_type::parse_error;
445                }
446
447                case 0x04:
448                {
449                    error_message = "invalid string: control character U+0004 (EOT) must be escaped to \\u0004";
450                    return token_type::parse_error;
451                }
452
453                case 0x05:
454                {
455                    error_message = "invalid string: control character U+0005 (ENQ) must be escaped to \\u0005";
456                    return token_type::parse_error;
457                }
458
459                case 0x06:
460                {
461                    error_message = "invalid string: control character U+0006 (ACK) must be escaped to \\u0006";
462                    return token_type::parse_error;
463                }
464
465                case 0x07:
466                {
467                    error_message = "invalid string: control character U+0007 (BEL) must be escaped to \\u0007";
468                    return token_type::parse_error;
469                }
470
471                case 0x08:
472                {
473                    error_message = "invalid string: control character U+0008 (BS) must be escaped to \\u0008 or \\b";
474                    return token_type::parse_error;
475                }
476
477                case 0x09:
478                {
479                    error_message = "invalid string: control character U+0009 (HT) must be escaped to \\u0009 or \\t";
480                    return token_type::parse_error;
481                }
482
483                case 0x0A:
484                {
485                    error_message = "invalid string: control character U+000A (LF) must be escaped to \\u000A or \\n";
486                    return token_type::parse_error;
487                }
488
489                case 0x0B:
490                {
491                    error_message = "invalid string: control character U+000B (VT) must be escaped to \\u000B";
492                    return token_type::parse_error;
493                }
494
495                case 0x0C:
496                {
497                    error_message = "invalid string: control character U+000C (FF) must be escaped to \\u000C or \\f";
498                    return token_type::parse_error;
499                }
500
501                case 0x0D:
502                {
503                    error_message = "invalid string: control character U+000D (CR) must be escaped to \\u000D or \\r";
504                    return token_type::parse_error;
505                }
506
507                case 0x0E:
508                {
509                    error_message = "invalid string: control character U+000E (SO) must be escaped to \\u000E";
510                    return token_type::parse_error;
511                }
512
513                case 0x0F:
514                {
515                    error_message = "invalid string: control character U+000F (SI) must be escaped to \\u000F";
516                    return token_type::parse_error;
517                }
518
519                case 0x10:
520                {
521                    error_message = "invalid string: control character U+0010 (DLE) must be escaped to \\u0010";
522                    return token_type::parse_error;
523                }
524
525                case 0x11:
526                {
527                    error_message = "invalid string: control character U+0011 (DC1) must be escaped to \\u0011";
528                    return token_type::parse_error;
529                }
530
531                case 0x12:
532                {
533                    error_message = "invalid string: control character U+0012 (DC2) must be escaped to \\u0012";
534                    return token_type::parse_error;
535                }
536
537                case 0x13:
538                {
539                    error_message = "invalid string: control character U+0013 (DC3) must be escaped to \\u0013";
540                    return token_type::parse_error;
541                }
542
543                case 0x14:
544                {
545                    error_message = "invalid string: control character U+0014 (DC4) must be escaped to \\u0014";
546                    return token_type::parse_error;
547                }
548
549                case 0x15:
550                {
551                    error_message = "invalid string: control character U+0015 (NAK) must be escaped to \\u0015";
552                    return token_type::parse_error;
553                }
554
555                case 0x16:
556                {
557                    error_message = "invalid string: control character U+0016 (SYN) must be escaped to \\u0016";
558                    return token_type::parse_error;
559                }
560
561                case 0x17:
562                {
563                    error_message = "invalid string: control character U+0017 (ETB) must be escaped to \\u0017";
564                    return token_type::parse_error;
565                }
566
567                case 0x18:
568                {
569                    error_message = "invalid string: control character U+0018 (CAN) must be escaped to \\u0018";
570                    return token_type::parse_error;
571                }
572
573                case 0x19:
574                {
575                    error_message = "invalid string: control character U+0019 (EM) must be escaped to \\u0019";
576                    return token_type::parse_error;
577                }
578
579                case 0x1A:
580                {
581                    error_message = "invalid string: control character U+001A (SUB) must be escaped to \\u001A";
582                    return token_type::parse_error;
583                }
584
585                case 0x1B:
586                {
587                    error_message = "invalid string: control character U+001B (ESC) must be escaped to \\u001B";
588                    return token_type::parse_error;
589                }
590
591                case 0x1C:
592                {
593                    error_message = "invalid string: control character U+001C (FS) must be escaped to \\u001C";
594                    return token_type::parse_error;
595                }
596
597                case 0x1D:
598                {
599                    error_message = "invalid string: control character U+001D (GS) must be escaped to \\u001D";
600                    return token_type::parse_error;
601                }
602
603                case 0x1E:
604                {
605                    error_message = "invalid string: control character U+001E (RS) must be escaped to \\u001E";
606                    return token_type::parse_error;
607                }
608
609                case 0x1F:
610                {
611                    error_message = "invalid string: control character U+001F (US) must be escaped to \\u001F";
612                    return token_type::parse_error;
613                }
614
615                // U+0020..U+007F (except U+0022 (quote) and U+005C (backspace))
616                case 0x20:
617                case 0x21:
618                case 0x23:
619                case 0x24:
620                case 0x25:
621                case 0x26:
622                case 0x27:
623                case 0x28:
624                case 0x29:
625                case 0x2A:
626                case 0x2B:
627                case 0x2C:
628                case 0x2D:
629                case 0x2E:
630                case 0x2F:
631                case 0x30:
632                case 0x31:
633                case 0x32:
634                case 0x33:
635                case 0x34:
636                case 0x35:
637                case 0x36:
638                case 0x37:
639                case 0x38:
640                case 0x39:
641                case 0x3A:
642                case 0x3B:
643                case 0x3C:
644                case 0x3D:
645                case 0x3E:
646                case 0x3F:
647                case 0x40:
648                case 0x41:
649                case 0x42:
650                case 0x43:
651                case 0x44:
652                case 0x45:
653                case 0x46:
654                case 0x47:
655                case 0x48:
656                case 0x49:
657                case 0x4A:
658                case 0x4B:
659                case 0x4C:
660                case 0x4D:
661                case 0x4E:
662                case 0x4F:
663                case 0x50:
664                case 0x51:
665                case 0x52:
666                case 0x53:
667                case 0x54:
668                case 0x55:
669                case 0x56:
670                case 0x57:
671                case 0x58:
672                case 0x59:
673                case 0x5A:
674                case 0x5B:
675                case 0x5D:
676                case 0x5E:
677                case 0x5F:
678                case 0x60:
679                case 0x61:
680                case 0x62:
681                case 0x63:
682                case 0x64:
683                case 0x65:
684                case 0x66:
685                case 0x67:
686                case 0x68:
687                case 0x69:
688                case 0x6A:
689                case 0x6B:
690                case 0x6C:
691                case 0x6D:
692                case 0x6E:
693                case 0x6F:
694                case 0x70:
695                case 0x71:
696                case 0x72:
697                case 0x73:
698                case 0x74:
699                case 0x75:
700                case 0x76:
701                case 0x77:
702                case 0x78:
703                case 0x79:
704                case 0x7A:
705                case 0x7B:
706                case 0x7C:
707                case 0x7D:
708                case 0x7E:
709                case 0x7F:
710                {
711                    add(current);
712                    break;
713                }
714
715                // U+0080..U+07FF: bytes C2..DF 80..BF
716                case 0xC2:
717                case 0xC3:
718                case 0xC4:
719                case 0xC5:
720                case 0xC6:
721                case 0xC7:
722                case 0xC8:
723                case 0xC9:
724                case 0xCA:
725                case 0xCB:
726                case 0xCC:
727                case 0xCD:
728                case 0xCE:
729                case 0xCF:
730                case 0xD0:
731                case 0xD1:
732                case 0xD2:
733                case 0xD3:
734                case 0xD4:
735                case 0xD5:
736                case 0xD6:
737                case 0xD7:
738                case 0xD8:
739                case 0xD9:
740                case 0xDA:
741                case 0xDB:
742                case 0xDC:
743                case 0xDD:
744                case 0xDE:
745                case 0xDF:
746                {
747                    if (JSON_HEDLEY_UNLIKELY(!next_byte_in_range({0x80, 0xBF})))
748                    {
749                        return token_type::parse_error;
750                    }
751                    break;
752                }
753
754                // U+0800..U+0FFF: bytes E0 A0..BF 80..BF
755                case 0xE0:
756                {
757                    if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0xA0, 0xBF, 0x80, 0xBF}))))
758                    {
759                        return token_type::parse_error;
760                    }
761                    break;
762                }
763
764                // U+1000..U+CFFF: bytes E1..EC 80..BF 80..BF
765                // U+E000..U+FFFF: bytes EE..EF 80..BF 80..BF
766                case 0xE1:
767                case 0xE2:
768                case 0xE3:
769                case 0xE4:
770                case 0xE5:
771                case 0xE6:
772                case 0xE7:
773                case 0xE8:
774                case 0xE9:
775                case 0xEA:
776                case 0xEB:
777                case 0xEC:
778                case 0xEE:
779                case 0xEF:
780                {
781                    if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0xBF, 0x80, 0xBF}))))
782                    {
783                        return token_type::parse_error;
784                    }
785                    break;
786                }
787
788                // U+D000..U+D7FF: bytes ED 80..9F 80..BF
789                case 0xED:
790                {
791                    if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0x9F, 0x80, 0xBF}))))
792                    {
793                        return token_type::parse_error;
794                    }
795                    break;
796                }
797
798                // U+10000..U+3FFFF F0 90..BF 80..BF 80..BF
799                case 0xF0:
800                {
801                    if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF}))))
802                    {
803                        return token_type::parse_error;
804                    }
805                    break;
806                }
807
808                // U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF
809                case 0xF1:
810                case 0xF2:
811                case 0xF3:
812                {
813                    if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF}))))
814                    {
815                        return token_type::parse_error;
816                    }
817                    break;
818                }
819
820                // U+100000..U+10FFFF F4 80..8F 80..BF 80..BF
821                case 0xF4:
822                {
823                    if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF}))))
824                    {
825                        return token_type::parse_error;
826                    }
827                    break;
828                }
829
830                // remaining bytes (80..C1 and F5..FF) are ill-formed
831                default:
832                {
833                    error_message = "invalid string: ill-formed UTF-8 byte";
834                    return token_type::parse_error;
835                }
836            }
837        }
838    }
839
840    /*!
841     * @brief scan a comment
842     * @return whether comment could be scanned successfully
843     */
844    bool scan_comment()
845    {
846        switch (get())
847        {
848            // single-line comments skip input until a newline or EOF is read
849            case '/':
850            {
851                while (true)
852                {
853                    switch (get())
854                    {
855                        case '\n':
856                        case '\r':
857                        case std::char_traits<char_type>::eof():
858                        case '\0':
859                            return true;
860
861                        default:
862                            break;
863                    }
864                }
865            }
866
867            // multi-line comments skip input until */ is read
868            case '*':
869            {
870                while (true)
871                {
872                    switch (get())
873                    {
874                        case std::char_traits<char_type>::eof():
875                        case '\0':
876                        {
877                            error_message = "invalid comment; missing closing '*/'";
878                            return false;
879                        }
880
881                        case '*':
882                        {
883                            switch (get())
884                            {
885                                case '/':
886                                    return true;
887
888                                default:
889                                {
890                                    unget();
891                                    continue;
892                                }
893                            }
894                        }
895
896                        default:
897                            continue;
898                    }
899                }
900            }
901
902            // unexpected character after reading '/'
903            default:
904            {
905                error_message = "invalid comment; expecting '/' or '*' after '/'";
906                return false;
907            }
908        }
909    }
910
911    JSON_HEDLEY_NON_NULL(2)
912    static void strtof(float& f, const char* str, char** endptr) noexcept
913    {
914        f = std::strtof(str, endptr);
915    }
916
917    JSON_HEDLEY_NON_NULL(2)
918    static void strtof(double& f, const char* str, char** endptr) noexcept
919    {
920        f = std::strtod(str, endptr);
921    }
922
923    JSON_HEDLEY_NON_NULL(2)
924    static void strtof(long double& f, const char* str, char** endptr) noexcept
925    {
926        f = std::strtold(str, endptr);
927    }
928
    /*!
    @brief scan a number literal

    This function scans a string according to Sect. 6 of RFC 8259.

    The function is realized with a deterministic finite state machine derived
    from the grammar described in RFC 8259. Starting in state "init", the
    input is read and used to determine the next state. Only state "done"
    accepts the number. State "error" is a trap state to model errors. In the
    table below, "anything" means any character but the ones listed before.

    state    | 0        | 1-9      | e E      | +       | -       | .        | anything
    ---------|----------|----------|----------|---------|---------|----------|-----------
    init     | zero     | any1     | [error]  | [error] | minus   | [error]  | [error]
    minus    | zero     | any1     | [error]  | [error] | [error] | [error]  | [error]
    zero     | done     | done     | exponent | done    | done    | decimal1 | done
    any1     | any1     | any1     | exponent | done    | done    | decimal1 | done
    decimal1 | decimal2 | decimal2 | [error]  | [error] | [error] | [error]  | [error]
    decimal2 | decimal2 | decimal2 | exponent | done    | done    | done     | done
    exponent | any2     | any2     | [error]  | sign    | sign    | [error]  | [error]
    sign     | any2     | any2     | [error]  | [error] | [error] | [error]  | [error]
    any2     | any2     | any2     | done     | done    | done    | done     | done

    The state machine is realized with one label per state (prefixed with
    "scan_number_") and `goto` statements between them. The state machine
    contains cycles, but any cycle can be left when EOF is read. Therefore,
    the function is guaranteed to terminate.

    During scanning, the read bytes are stored in token_buffer. This string is
    then converted to a signed integer, an unsigned integer, or a
    floating-point number. The conversion that is tried first depends on what
    was seen while scanning: a plain digit sequence is tried as unsigned, a
    digit sequence with a leading minus as signed, and anything containing a
    decimal point or an exponent as floating-point. If an integer conversion
    fails (range error or the value does not survive the cast to the
    configured integer type), the text is converted as a floating-point number
    instead.

    On error, error_message describes which character was expected.

    @pre `current` holds the first character of the number ('-' or a digit)

    @return token_type::value_unsigned, token_type::value_integer, or
            token_type::value_float if number could be successfully scanned,
            token_type::parse_error otherwise

    @note The scanner is independent of the current locale. Internally, the
          locale's decimal point is used instead of `.` to work with the
          locale-dependent converters.
    */
    token_type scan_number()  // lgtm [cpp/use-of-goto]
    {
        // reset token_buffer to store the number's bytes
        reset();

        // the type of the parsed number; initially set to unsigned; will be
        // changed if minus sign, decimal point or exponent is read
        token_type number_type = token_type::value_unsigned;

        // state (init): we just found out we need to scan a number
        switch (current)
        {
            case '-':
            {
                add(current);
                goto scan_number_minus;
            }

            case '0':
            {
                add(current);
                goto scan_number_zero;
            }

            case '1':
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
            case '7':
            case '8':
            case '9':
            {
                add(current);
                goto scan_number_any1;
            }

            // all other characters are rejected outside scan_number(), so
            // reaching this branch indicates a bug in the caller
            default:            // LCOV_EXCL_LINE
                JSON_ASSERT(false); // NOLINT(cert-dcl03-c,hicpp-static-assert,misc-static-assert) LCOV_EXCL_LINE
        }

scan_number_minus:
        // state: we just parsed a leading minus sign; the number can no
        // longer be stored as an unsigned integer
        number_type = token_type::value_integer;
        switch (get())
        {
            case '0':
            {
                add(current);
                goto scan_number_zero;
            }

            case '1':
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
            case '7':
            case '8':
            case '9':
            {
                add(current);
                goto scan_number_any1;
            }

            default:
            {
                error_message = "invalid number; expected digit after '-'";
                return token_type::parse_error;
            }
        }

scan_number_zero:
        // state: we just parsed a zero (maybe with a leading minus sign); no
        // further digits may follow directly (no leading zeros)
        switch (get())
        {
            case '.':
            {
                // store decimal_point_char rather than the literal '.' (see
                // the @note in the function documentation)
                add(decimal_point_char);
                goto scan_number_decimal1;
            }

            case 'e':
            case 'E':
            {
                add(current);
                goto scan_number_exponent;
            }

            default:
                goto scan_number_done;
        }

scan_number_any1:
        // state: we just parsed a number 0-9 (maybe with a leading minus sign)
        switch (get())
        {
            case '0':
            case '1':
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
            case '7':
            case '8':
            case '9':
            {
                add(current);
                goto scan_number_any1;
            }

            case '.':
            {
                // store decimal_point_char rather than the literal '.' (see
                // the @note in the function documentation)
                add(decimal_point_char);
                goto scan_number_decimal1;
            }

            case 'e':
            case 'E':
            {
                add(current);
                goto scan_number_exponent;
            }

            default:
                goto scan_number_done;
        }

scan_number_decimal1:
        // state: we just parsed a decimal point; at least one digit must
        // follow, and the number is a floating-point number from here on
        number_type = token_type::value_float;
        switch (get())
        {
            case '0':
            case '1':
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
            case '7':
            case '8':
            case '9':
            {
                add(current);
                goto scan_number_decimal2;
            }

            default:
            {
                error_message = "invalid number; expected digit after '.'";
                return token_type::parse_error;
            }
        }

scan_number_decimal2:
        // state: we just parsed at least one digit after a decimal point
        switch (get())
        {
            case '0':
            case '1':
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
            case '7':
            case '8':
            case '9':
            {
                add(current);
                goto scan_number_decimal2;
            }

            case 'e':
            case 'E':
            {
                add(current);
                goto scan_number_exponent;
            }

            default:
                goto scan_number_done;
        }

scan_number_exponent:
        // state: we just parsed an exponent marker ('e' or 'E'); the number
        // is a floating-point number even if no decimal point was read
        number_type = token_type::value_float;
        switch (get())
        {
            case '+':
            case '-':
            {
                add(current);
                goto scan_number_sign;
            }

            case '0':
            case '1':
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
            case '7':
            case '8':
            case '9':
            {
                add(current);
                goto scan_number_any2;
            }

            default:
            {
                error_message =
                    "invalid number; expected '+', '-', or digit after exponent";
                return token_type::parse_error;
            }
        }

scan_number_sign:
        // state: we just parsed an exponent sign; at least one digit must follow
        switch (get())
        {
            case '0':
            case '1':
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
            case '7':
            case '8':
            case '9':
            {
                add(current);
                goto scan_number_any2;
            }

            default:
            {
                error_message = "invalid number; expected digit after exponent sign";
                return token_type::parse_error;
            }
        }

scan_number_any2:
        // state: we just parsed a digit after the exponent or exponent sign
        switch (get())
        {
            case '0':
            case '1':
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
            case '7':
            case '8':
            case '9':
            {
                add(current);
                goto scan_number_any2;
            }

            default:
                goto scan_number_done;
        }

scan_number_done:
        // unget the character after the number (we only read it to know that
        // we are done scanning a number)
        unget();

        char* endptr = nullptr; // NOLINT(cppcoreguidelines-pro-type-vararg,hicpp-vararg)
        // clear errno so that a failure of the integer conversions below can
        // be told apart from a stale error
        errno = 0;

        // try to parse integers first and fall back to floats
        if (number_type == token_type::value_unsigned)
        {
            const auto x = std::strtoull(token_buffer.data(), &endptr, 10);

            // we checked the number format before
            JSON_ASSERT(endptr == token_buffer.data() + token_buffer.size());

            if (errno == 0)
            {
                // accept the value only if it survives the conversion to
                // number_unsigned_t unchanged
                value_unsigned = static_cast<number_unsigned_t>(x);
                if (value_unsigned == x)
                {
                    return token_type::value_unsigned;
                }
            }
        }
        else if (number_type == token_type::value_integer)
        {
            const auto x = std::strtoll(token_buffer.data(), &endptr, 10);

            // we checked the number format before
            JSON_ASSERT(endptr == token_buffer.data() + token_buffer.size());

            if (errno == 0)
            {
                // accept the value only if it survives the conversion to
                // number_integer_t unchanged
                value_integer = static_cast<number_integer_t>(x);
                if (value_integer == x)
                {
                    return token_type::value_integer;
                }
            }
        }

        // this code is reached if we parse a floating-point number or if an
        // integer conversion above failed; the strtof overload is selected by
        // the type of value_float
        strtof(value_float, token_buffer.data(), &endptr);

        // we checked the number format before
        JSON_ASSERT(endptr == token_buffer.data() + token_buffer.size());

        return token_type::value_float;
    }
1293
1294    /*!
1295    @param[in] literal_text  the literal text to expect
1296    @param[in] length        the length of the passed literal text
1297    @param[in] return_type   the token type to return on success
1298    */
1299    JSON_HEDLEY_NON_NULL(2)
1300    token_type scan_literal(const char_type* literal_text, const std::size_t length,
1301                            token_type return_type)
1302    {
1303        JSON_ASSERT(std::char_traits<char_type>::to_char_type(current) == literal_text[0]);
1304        for (std::size_t i = 1; i < length; ++i)
1305        {
1306            if (JSON_HEDLEY_UNLIKELY(std::char_traits<char_type>::to_char_type(get()) != literal_text[i]))
1307            {
1308                error_message = "invalid literal";
1309                return token_type::parse_error;
1310            }
1311        }
1312        return return_type;
1313    }
1314
1315    /////////////////////
1316    // input management
1317    /////////////////////
1318
1319    /// reset token_buffer; current character is beginning of token
1320    void reset() noexcept
1321    {
1322        token_buffer.clear();
1323        token_string.clear();
1324        token_string.push_back(std::char_traits<char_type>::to_char_type(current));
1325    }
1326
1327    /*
1328    @brief get next character from the input
1329
1330    This function provides the interface to the used input adapter. It does
1331    not throw in case the input reached EOF, but returns a
1332    `std::char_traits<char>::eof()` in that case.  Stores the scanned characters
1333    for use in error messages.
1334
1335    @return character read from the input
1336    */
1337    char_int_type get()
1338    {
1339        ++position.chars_read_total;
1340        ++position.chars_read_current_line;
1341
1342        if (next_unget)
1343        {
1344            // just reset the next_unget variable and work with current
1345            next_unget = false;
1346        }
1347        else
1348        {
1349            current = ia.get_character();
1350        }
1351
1352        if (JSON_HEDLEY_LIKELY(current != std::char_traits<char_type>::eof()))
1353        {
1354            token_string.push_back(std::char_traits<char_type>::to_char_type(current));
1355        }
1356
1357        if (current == '\n')
1358        {
1359            ++position.lines_read;
1360            position.chars_read_current_line = 0;
1361        }
1362
1363        return current;
1364    }
1365
1366    /*!
1367    @brief unget current character (read it again on next get)
1368
1369    We implement unget by setting variable next_unget to true. The input is not
1370    changed - we just simulate ungetting by modifying chars_read_total,
1371    chars_read_current_line, and token_string. The next call to get() will
1372    behave as if the unget character is read again.
1373    */
1374    void unget()
1375    {
1376        next_unget = true;
1377
1378        --position.chars_read_total;
1379
1380        // in case we "unget" a newline, we have to also decrement the lines_read
1381        if (position.chars_read_current_line == 0)
1382        {
1383            if (position.lines_read > 0)
1384            {
1385                --position.lines_read;
1386            }
1387        }
1388        else
1389        {
1390            --position.chars_read_current_line;
1391        }
1392
1393        if (JSON_HEDLEY_LIKELY(current != std::char_traits<char_type>::eof()))
1394        {
1395            JSON_ASSERT(!token_string.empty());
1396            token_string.pop_back();
1397        }
1398    }
1399
1400    /// add a character to token_buffer
1401    void add(char_int_type c)
1402    {
1403        token_buffer.push_back(static_cast<typename string_t::value_type>(c));
1404    }
1405
1406  public:
1407    /////////////////////
1408    // value getters
1409    /////////////////////
1410
    /// @brief return the stored signed integer value
    ///
    /// value_integer is written by scan_number() right before it returns
    /// token_type::value_integer, so the result is only meaningful after such
    /// a token; it is 0 until a signed integer has been scanned.
    /// @return the current content of value_integer
    constexpr number_integer_t get_number_integer() const noexcept
    {
        return value_integer;
    }
1416
    /// @brief return the stored unsigned integer value
    ///
    /// value_unsigned is written by scan_number() right before it returns
    /// token_type::value_unsigned, so the result is only meaningful after
    /// such a token; it is 0 until an unsigned integer has been scanned.
    /// @return the current content of value_unsigned
    constexpr number_unsigned_t get_number_unsigned() const noexcept
    {
        return value_unsigned;
    }
1422
    /// @brief return the stored floating-point value
    ///
    /// value_float is written by scan_number() right before it returns
    /// token_type::value_float - both for literals with a decimal point or
    /// exponent and for integers whose integer conversion failed. It is 0
    /// until such a number has been scanned.
    /// @return the current content of value_float
    constexpr number_float_t get_number_float() const noexcept
    {
        return value_float;
    }
1428
    /// @brief return current string value
    ///
    /// Hands out a mutable reference to token_buffer; no copy is made and
    /// this function itself does not modify the buffer.
    /// NOTE(review): the original remark "implicitly resets the token; useful
    /// only once" presumably refers to callers moving from the returned
    /// reference - confirm against the call sites.
    /// @return reference to token_buffer
    string_t& get_string()
    {
        return token_buffer;
    }
1434
1435    /////////////////////
1436    // diagnostics
1437    /////////////////////
1438
    /// @brief return the current input position
    ///
    /// @return a copy of `position`, i.e. the character and line counters
    ///         that get() advances and unget() takes back
    constexpr position_t get_position() const noexcept
    {
        return position;
    }
1444
1445    /// return the last read token (for errors only).  Will never contain EOF
1446    /// (an arbitrary value that is not a valid char value, often -1), because
1447    /// 255 may legitimately occur.  May contain NUL, which should be escaped.
1448    std::string get_token_string() const
1449    {
1450        // escape control characters
1451        std::string result;
1452        for (const auto c : token_string)
1453        {
1454            if (static_cast<unsigned char>(c) <= '\x1F')
1455            {
1456                // escape control characters
1457                std::array<char, 9> cs{{}};
1458                static_cast<void>((std::snprintf)(cs.data(), cs.size(), "<U+%.4X>", static_cast<unsigned char>(c))); // NOLINT(cppcoreguidelines-pro-type-vararg,hicpp-vararg)
1459                result += cs.data();
1460            }
1461            else
1462            {
1463                // add character as is
1464                result.push_back(static_cast<std::string::value_type>(c));
1465            }
1466        }
1467
1468        return result;
1469    }
1470
    /// @brief return syntax error message
    ///
    /// @return the text most recently assigned to error_message by one of the
    ///         scan functions; the empty string if no error was recorded yet
    ///         (error_message is initialized to ""), hence never null
    JSON_HEDLEY_RETURNS_NON_NULL
    constexpr const char* get_error_message() const noexcept
    {
        return error_message;
    }
1477
1478    /////////////////////
1479    // actual scanner
1480    /////////////////////
1481
1482    /*!
1483    @brief skip the UTF-8 byte order mark
1484    @return true iff there is no BOM or the correct BOM has been skipped
1485    */
1486    bool skip_bom()
1487    {
1488        if (get() == 0xEF)
1489        {
1490            // check if we completely parse the BOM
1491            return get() == 0xBB && get() == 0xBF;
1492        }
1493
1494        // the first character is not the beginning of the BOM; unget it to
1495        // process is later
1496        unget();
1497        return true;
1498    }
1499
1500    void skip_whitespace()
1501    {
1502        do
1503        {
1504            get();
1505        }
1506        while (current == ' ' || current == '\t' || current == '\n' || current == '\r');
1507    }
1508
1509    token_type scan()
1510    {
1511        // initially, skip the BOM
1512        if (position.chars_read_total == 0 && !skip_bom())
1513        {
1514            error_message = "invalid BOM; must be 0xEF 0xBB 0xBF if given";
1515            return token_type::parse_error;
1516        }
1517
1518        // read next character and ignore whitespace
1519        skip_whitespace();
1520
1521        // ignore comments
1522        while (ignore_comments && current == '/')
1523        {
1524            if (!scan_comment())
1525            {
1526                return token_type::parse_error;
1527            }
1528
1529            // skip following whitespace
1530            skip_whitespace();
1531        }
1532
1533        switch (current)
1534        {
1535            // structural characters
1536            case '[':
1537                return token_type::begin_array;
1538            case ']':
1539                return token_type::end_array;
1540            case '{':
1541                return token_type::begin_object;
1542            case '}':
1543                return token_type::end_object;
1544            case ':':
1545                return token_type::name_separator;
1546            case ',':
1547                return token_type::value_separator;
1548
1549            // literals
1550            case 't':
1551            {
1552                std::array<char_type, 4> true_literal = {{static_cast<char_type>('t'), static_cast<char_type>('r'), static_cast<char_type>('u'), static_cast<char_type>('e')}};
1553                return scan_literal(true_literal.data(), true_literal.size(), token_type::literal_true);
1554            }
1555            case 'f':
1556            {
1557                std::array<char_type, 5> false_literal = {{static_cast<char_type>('f'), static_cast<char_type>('a'), static_cast<char_type>('l'), static_cast<char_type>('s'), static_cast<char_type>('e')}};
1558                return scan_literal(false_literal.data(), false_literal.size(), token_type::literal_false);
1559            }
1560            case 'n':
1561            {
1562                std::array<char_type, 4> null_literal = {{static_cast<char_type>('n'), static_cast<char_type>('u'), static_cast<char_type>('l'), static_cast<char_type>('l')}};
1563                return scan_literal(null_literal.data(), null_literal.size(), token_type::literal_null);
1564            }
1565
1566            // string
1567            case '\"':
1568                return scan_string();
1569
1570            // number
1571            case '-':
1572            case '0':
1573            case '1':
1574            case '2':
1575            case '3':
1576            case '4':
1577            case '5':
1578            case '6':
1579            case '7':
1580            case '8':
1581            case '9':
1582                return scan_number();
1583
1584            // end of input (the null byte is needed when parsing from
1585            // string literals)
1586            case '\0':
1587            case std::char_traits<char_type>::eof():
1588                return token_type::end_of_input;
1589
1590            // error
1591            default:
1592                error_message = "invalid literal";
1593                return token_type::parse_error;
1594        }
1595    }
1596
1597  private:
    /// input adapter; get() fetches new characters via ia.get_character()
    InputAdapterType ia;

    /// whether comments should be ignored (true) or signaled as errors (false);
    /// scan() only calls scan_comment() when this is set
    const bool ignore_comments = false;

    /// the current character, i.e. the value most recently returned by get();
    /// EOF until the first character has been read
    char_int_type current = std::char_traits<char_type>::eof();

    /// whether the next get() call should just return current; set by
    /// unget() and cleared again by get()
    bool next_unget = false;

    /// the input position: counters for the characters read in total, the
    /// characters read in the current line, and the lines read; advanced by
    /// get() and taken back by unget()
    position_t position {};

    /// raw input token string (for error messages); restarted by reset(),
    /// extended by get(), shortened by unget()
    std::vector<char_type> token_string {};

    /// buffer for variable-length tokens (numbers, strings); filled via add()
    /// and exposed via get_string()
    string_t token_buffer {};

    /// a description of occurred lexer errors; empty (never null) as long as
    /// no error was recorded
    const char* error_message = "";

    // number values; scan_number() stores its result in the member that
    // matches the token type it returns
    number_integer_t value_integer = 0;
    number_unsigned_t value_unsigned = 0;
    number_float_t value_float = 0;

    /// the decimal point; scan_number() stores this character in token_buffer
    /// in place of a '.' read from the input
    /// NOTE(review): presumably set to the locale's decimal point by a
    /// constructor outside this section; '.' is only the default - confirm
    const char_int_type decimal_point_char = '.';
1629};
1630
1631}  // namespace detail
1632NLOHMANN_JSON_NAMESPACE_END
1633