1/* A fuzz test for CPython.
2
3  The only exposed function is LLVMFuzzerTestOneInput, which is called by
4  fuzzers and by the _fuzz module for smoke tests.
5
6  To build exactly one fuzz test, as when running in oss-fuzz etc.,
7  build with -D _Py_FUZZ_ONE and -D _Py_FUZZ_<test_name>. e.g. to build
8  LLVMFuzzerTestOneInput to only run "fuzz_builtin_float", build this file with
9      -D _Py_FUZZ_ONE -D _Py_FUZZ_fuzz_builtin_float.
10
11  See the source code for LLVMFuzzerTestOneInput for details. */
12
13#include <Python.h>
14#include <stdlib.h>
15#include <inttypes.h>
16
17/*  Fuzz PyFloat_FromString as a proxy for float(str). */
18static int fuzz_builtin_float(const char* data, size_t size) {
19    PyObject* s = PyBytes_FromStringAndSize(data, size);
20    if (s == NULL) return 0;
21    PyObject* f = PyFloat_FromString(s);
22    if (PyErr_Occurred() && PyErr_ExceptionMatches(PyExc_ValueError)) {
23        PyErr_Clear();
24    }
25
26    Py_XDECREF(f);
27    Py_DECREF(s);
28    return 0;
29}
30
31#define MAX_INT_TEST_SIZE 0x10000
32
33/* Fuzz PyLong_FromUnicodeObject as a proxy for int(str). */
34static int fuzz_builtin_int(const char* data, size_t size) {
35    /* Ignore test cases with very long ints to avoid timeouts
36       int("9" * 1000000) is not a very interesting test caase */
37    if (size > MAX_INT_TEST_SIZE) {
38        return 0;
39    }
40    /* Pick a random valid base. (When the fuzzed function takes extra
41       parameters, it's somewhat normal to hash the input to generate those
42       parameters. We want to exercise all code paths, so we do so here.) */
43    int base = _Py_HashBytes(data, size) % 37;
44    if (base == 1) {
45        // 1 is the only number between 0 and 36 that is not a valid base.
46        base = 0;
47    }
48    if (base == -1) {
49        return 0;  // An error occurred, bail early.
50    }
51    if (base < 0) {
52        base = -base;
53    }
54
55    PyObject* s = PyUnicode_FromStringAndSize(data, size);
56    if (s == NULL) {
57        if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
58            PyErr_Clear();
59        }
60        return 0;
61    }
62    PyObject* l = PyLong_FromUnicodeObject(s, base);
63    if (l == NULL && PyErr_ExceptionMatches(PyExc_ValueError)) {
64        PyErr_Clear();
65    }
66    PyErr_Clear();
67    Py_XDECREF(l);
68    Py_DECREF(s);
69    return 0;
70}
71
72/* Fuzz PyUnicode_FromStringAndSize as a proxy for unicode(str). */
73static int fuzz_builtin_unicode(const char* data, size_t size) {
74    PyObject* s = PyUnicode_FromStringAndSize(data, size);
75    if (s == NULL && PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
76        PyErr_Clear();
77    }
78    Py_XDECREF(s);
79    return 0;
80}
81
82
83PyObject* struct_unpack_method = NULL;
84PyObject* struct_error = NULL;
85/* Called by LLVMFuzzerTestOneInput for initialization */
86static int init_struct_unpack(void) {
87    /* Import struct.unpack */
88    PyObject* struct_module = PyImport_ImportModule("struct");
89    if (struct_module == NULL) {
90        return 0;
91    }
92    struct_error = PyObject_GetAttrString(struct_module, "error");
93    if (struct_error == NULL) {
94        return 0;
95    }
96    struct_unpack_method = PyObject_GetAttrString(struct_module, "unpack");
97    return struct_unpack_method != NULL;
98}
99/* Fuzz struct.unpack(x, y) */
100static int fuzz_struct_unpack(const char* data, size_t size) {
101    /* Everything up to the first null byte is considered the
102       format. Everything after is the buffer */
103    const char* first_null = memchr(data, '\0', size);
104    if (first_null == NULL) {
105        return 0;
106    }
107
108    size_t format_length = first_null - data;
109    size_t buffer_length = size - format_length - 1;
110
111    PyObject* pattern = PyBytes_FromStringAndSize(data, format_length);
112    if (pattern == NULL) {
113        return 0;
114    }
115    PyObject* buffer = PyBytes_FromStringAndSize(first_null + 1, buffer_length);
116    if (buffer == NULL) {
117        Py_DECREF(pattern);
118        return 0;
119    }
120
121    PyObject* unpacked = PyObject_CallFunctionObjArgs(
122        struct_unpack_method, pattern, buffer, NULL);
123    /* Ignore any overflow errors, these are easily triggered accidentally */
124    if (unpacked == NULL && PyErr_ExceptionMatches(PyExc_OverflowError)) {
125        PyErr_Clear();
126    }
127    /* The pascal format string will throw a negative size when passing 0
128       like: struct.unpack('0p', b'') */
129    if (unpacked == NULL && PyErr_ExceptionMatches(PyExc_SystemError)) {
130        PyErr_Clear();
131    }
132    /* Ignore any struct.error exceptions, these can be caused by invalid
133       formats or incomplete buffers both of which are common. */
134    if (unpacked == NULL && PyErr_ExceptionMatches(struct_error)) {
135        PyErr_Clear();
136    }
137
138    Py_XDECREF(unpacked);
139    Py_DECREF(pattern);
140    Py_DECREF(buffer);
141    return 0;
142}
143
144
145#define MAX_JSON_TEST_SIZE 0x10000
146
147PyObject* json_loads_method = NULL;
148/* Called by LLVMFuzzerTestOneInput for initialization */
149static int init_json_loads(void) {
150    /* Import json.loads */
151    PyObject* json_module = PyImport_ImportModule("json");
152    if (json_module == NULL) {
153        return 0;
154    }
155    json_loads_method = PyObject_GetAttrString(json_module, "loads");
156    return json_loads_method != NULL;
157}
158/* Fuzz json.loads(x) */
159static int fuzz_json_loads(const char* data, size_t size) {
160    /* Since python supports arbitrarily large ints in JSON,
161       long inputs can lead to timeouts on boring inputs like
162       `json.loads("9" * 100000)` */
163    if (size > MAX_JSON_TEST_SIZE) {
164        return 0;
165    }
166    PyObject* input_bytes = PyBytes_FromStringAndSize(data, size);
167    if (input_bytes == NULL) {
168        return 0;
169    }
170    PyObject* parsed = PyObject_CallOneArg(json_loads_method, input_bytes);
171    if (parsed == NULL) {
172        /* Ignore ValueError as the fuzzer will more than likely
173           generate some invalid json and values */
174        if (PyErr_ExceptionMatches(PyExc_ValueError) ||
175        /* Ignore RecursionError as the fuzzer generates long sequences of
176           arrays such as `[[[...` */
177            PyErr_ExceptionMatches(PyExc_RecursionError) ||
178        /* Ignore unicode errors, invalid byte sequences are common */
179            PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)
180        ) {
181            PyErr_Clear();
182        }
183    }
184    Py_DECREF(input_bytes);
185    Py_XDECREF(parsed);
186    return 0;
187}
188
189#define MAX_RE_TEST_SIZE 0x10000
190
191PyObject* sre_compile_method = NULL;
192PyObject* sre_error_exception = NULL;
193int SRE_FLAG_DEBUG = 0;
194/* Called by LLVMFuzzerTestOneInput for initialization */
195static int init_sre_compile(void) {
196    /* Import sre_compile.compile and sre.error */
197    PyObject* sre_compile_module = PyImport_ImportModule("sre_compile");
198    if (sre_compile_module == NULL) {
199        return 0;
200    }
201    sre_compile_method = PyObject_GetAttrString(sre_compile_module, "compile");
202    if (sre_compile_method == NULL) {
203        return 0;
204    }
205
206    PyObject* sre_constants = PyImport_ImportModule("sre_constants");
207    if (sre_constants == NULL) {
208        return 0;
209    }
210    sre_error_exception = PyObject_GetAttrString(sre_constants, "error");
211    if (sre_error_exception == NULL) {
212        return 0;
213    }
214    PyObject* debug_flag = PyObject_GetAttrString(sre_constants, "SRE_FLAG_DEBUG");
215    if (debug_flag == NULL) {
216        return 0;
217    }
218    SRE_FLAG_DEBUG = PyLong_AsLong(debug_flag);
219    return 1;
220}
221/* Fuzz _sre.compile(x) */
222static int fuzz_sre_compile(const char* data, size_t size) {
223    /* Ignore really long regex patterns that will timeout the fuzzer */
224    if (size > MAX_RE_TEST_SIZE) {
225        return 0;
226    }
227    /* We treat the first 2 bytes of the input as a number for the flags */
228    if (size < 2) {
229        return 0;
230    }
231    uint16_t flags = ((uint16_t*) data)[0];
232    /* We remove the SRE_FLAG_DEBUG if present. This is because it
233       prints to stdout which greatly decreases fuzzing speed */
234    flags &= ~SRE_FLAG_DEBUG;
235
236    /* Pull the pattern from the remaining bytes */
237    PyObject* pattern_bytes = PyBytes_FromStringAndSize(data + 2, size - 2);
238    if (pattern_bytes == NULL) {
239        return 0;
240    }
241    PyObject* flags_obj = PyLong_FromUnsignedLong(flags);
242    if (flags_obj == NULL) {
243        Py_DECREF(pattern_bytes);
244        return 0;
245    }
246
247    /* compiled = _sre.compile(data[2:], data[0:2] */
248    PyObject* compiled = PyObject_CallFunctionObjArgs(
249        sre_compile_method, pattern_bytes, flags_obj, NULL);
250    /* Ignore ValueError as the fuzzer will more than likely
251       generate some invalid combination of flags */
252    if (compiled == NULL && PyErr_ExceptionMatches(PyExc_ValueError)) {
253        PyErr_Clear();
254    }
255    /* Ignore some common errors thrown by sre_parse:
256       Overflow, Assertion, Recursion and Index */
257    if (compiled == NULL && (PyErr_ExceptionMatches(PyExc_OverflowError) ||
258                             PyErr_ExceptionMatches(PyExc_AssertionError) ||
259                             PyErr_ExceptionMatches(PyExc_RecursionError) ||
260                             PyErr_ExceptionMatches(PyExc_IndexError))
261    ) {
262        PyErr_Clear();
263    }
264    /* Ignore re.error */
265    if (compiled == NULL && PyErr_ExceptionMatches(sre_error_exception)) {
266        PyErr_Clear();
267    }
268
269    Py_DECREF(pattern_bytes);
270    Py_DECREF(flags_obj);
271    Py_XDECREF(compiled);
272    return 0;
273}
274
/* Some random patterns used to test re.match.
   Be careful not to add catastrophically slow regexes here, we want to
   exercise the matching code without causing timeouts.*/
static const char* regex_patterns[] = {
    ".", "^", "abc", "abc|def", "^xxx$", "\\b", "()", "[a-zA-Z0-9]",
    "abc+", "[^A-Z]", "[x]", "(?=)", "a{z}", "a+b", "a*?", "a??", "a+?",
    "{}", "a{,}", "{", "}", "^\\(*\\d{3}\\)*( |-)*\\d{3}( |-)*\\d{4}$",
    "(?:a*)*", "a{1,2}?"
};
/* Number of entries in regex_patterns, computed at compile time. */
const size_t NUM_PATTERNS = sizeof(regex_patterns) / sizeof(regex_patterns[0]);
285PyObject** compiled_patterns = NULL;
286/* Called by LLVMFuzzerTestOneInput for initialization */
287static int init_sre_match(void) {
288    PyObject* re_module = PyImport_ImportModule("re");
289    if (re_module == NULL) {
290        return 0;
291    }
292    compiled_patterns = (PyObject**) PyMem_RawMalloc(
293        sizeof(PyObject*) * NUM_PATTERNS);
294    if (compiled_patterns == NULL) {
295        PyErr_NoMemory();
296        return 0;
297    }
298
299    /* Precompile all the regex patterns on the first run for faster fuzzing */
300    for (size_t i = 0; i < NUM_PATTERNS; i++) {
301        PyObject* compiled = PyObject_CallMethod(
302            re_module, "compile", "y", regex_patterns[i]);
303        /* Bail if any of the patterns fail to compile */
304        if (compiled == NULL) {
305            return 0;
306        }
307        compiled_patterns[i] = compiled;
308    }
309    return 1;
310}
311/* Fuzz re.match(x) */
312static int fuzz_sre_match(const char* data, size_t size) {
313    if (size < 1 || size > MAX_RE_TEST_SIZE) {
314        return 0;
315    }
316    /* Use the first byte as a uint8_t specifying the index of the
317       regex to use */
318    unsigned char idx = (unsigned char) data[0];
319    idx = idx % NUM_PATTERNS;
320
321    /* Pull the string to match from the remaining bytes */
322    PyObject* to_match = PyBytes_FromStringAndSize(data + 1, size - 1);
323    if (to_match == NULL) {
324        return 0;
325    }
326
327    PyObject* pattern = compiled_patterns[idx];
328    PyObject* match_callable = PyObject_GetAttrString(pattern, "match");
329
330    PyObject* matches = PyObject_CallOneArg(match_callable, to_match);
331
332    Py_XDECREF(matches);
333    Py_DECREF(match_callable);
334    Py_DECREF(to_match);
335    return 0;
336}
337
338#define MAX_CSV_TEST_SIZE 0x10000
339PyObject* csv_module = NULL;
340PyObject* csv_error = NULL;
341/* Called by LLVMFuzzerTestOneInput for initialization */
342static int init_csv_reader(void) {
343    /* Import csv and csv.Error */
344    csv_module = PyImport_ImportModule("csv");
345    if (csv_module == NULL) {
346        return 0;
347    }
348    csv_error = PyObject_GetAttrString(csv_module, "Error");
349    return csv_error != NULL;
350}
351/* Fuzz csv.reader([x]) */
352static int fuzz_csv_reader(const char* data, size_t size) {
353    if (size < 1 || size > MAX_CSV_TEST_SIZE) {
354        return 0;
355    }
356    /* Ignore non null-terminated strings since _csv can't handle
357       embedded nulls */
358    if (memchr(data, '\0', size) == NULL) {
359        return 0;
360    }
361
362    PyObject* s = PyUnicode_FromString(data);
363    /* Ignore exceptions until we have a valid string */
364    if (s == NULL) {
365        PyErr_Clear();
366        return 0;
367    }
368
369    /* Split on \n so we can test multiple lines */
370    PyObject* lines = PyObject_CallMethod(s, "split", "s", "\n");
371    if (lines == NULL) {
372        Py_DECREF(s);
373        return 0;
374    }
375
376    PyObject* reader = PyObject_CallMethod(csv_module, "reader", "N", lines);
377    if (reader) {
378        /* Consume all of the reader as an iterator */
379        PyObject* parsed_line;
380        while ((parsed_line = PyIter_Next(reader))) {
381            Py_DECREF(parsed_line);
382        }
383    }
384
385    /* Ignore csv.Error because we're probably going to generate
386       some bad files (embedded new-lines, unterminated quotes etc) */
387    if (PyErr_ExceptionMatches(csv_error)) {
388        PyErr_Clear();
389    }
390
391    Py_XDECREF(reader);
392    Py_DECREF(s);
393    return 0;
394}
395
396#define MAX_AST_LITERAL_EVAL_TEST_SIZE 0x10000
397PyObject* ast_literal_eval_method = NULL;
398/* Called by LLVMFuzzerTestOneInput for initialization */
399static int init_ast_literal_eval(void) {
400    PyObject* ast_module = PyImport_ImportModule("ast");
401    if (ast_module == NULL) {
402        return 0;
403    }
404    ast_literal_eval_method = PyObject_GetAttrString(ast_module, "literal_eval");
405    return ast_literal_eval_method != NULL;
406}
407/* Fuzz ast.literal_eval(x) */
408static int fuzz_ast_literal_eval(const char* data, size_t size) {
409    if (size > MAX_AST_LITERAL_EVAL_TEST_SIZE) {
410        return 0;
411    }
412    /* Ignore non null-terminated strings since ast can't handle
413       embedded nulls */
414    if (memchr(data, '\0', size) == NULL) {
415        return 0;
416    }
417
418    PyObject* s = PyUnicode_FromString(data);
419    /* Ignore exceptions until we have a valid string */
420    if (s == NULL) {
421        PyErr_Clear();
422        return 0;
423    }
424
425    PyObject* literal = PyObject_CallOneArg(ast_literal_eval_method, s);
426    /* Ignore some common errors thrown by ast.literal_eval */
427    if (literal == NULL && (PyErr_ExceptionMatches(PyExc_ValueError) ||
428                            PyErr_ExceptionMatches(PyExc_TypeError) ||
429                            PyErr_ExceptionMatches(PyExc_SyntaxError) ||
430                            PyErr_ExceptionMatches(PyExc_MemoryError) ||
431                            PyErr_ExceptionMatches(PyExc_RecursionError))
432    ) {
433        PyErr_Clear();
434    }
435
436    Py_XDECREF(literal);
437    Py_DECREF(s);
438    return 0;
439}
440
/* Run one fuzz target on the given input and abort on failure.

   Returns whatever the target returned.  If a Python exception is still
   pending once the target has finished, it is printed and the process is
   aborted. */
static int _run_fuzz(const uint8_t *data, size_t size, int(*fuzzer)(const char* , size_t)) {
    const int result = fuzzer((const char*) data, size);
    if (!PyErr_Occurred()) {
        /* Someday the return value might mean something, propagate it. */
        return result;
    }
    /* Fuzz tests should handle expected errors for themselves.
       This is last-ditch check in case they didn't. */
    PyErr_Print();
    abort();
}
453
/* CPython generates a lot of leak warnings for whatever reason, so this
   hook unconditionally reports leak checking as turned off (returns 1). */
int __lsan_is_turned_off(void)
{
    return 1;
}
456
457
458int LLVMFuzzerInitialize(int *argc, char ***argv) {
459    PyConfig config;
460    PyConfig_InitPythonConfig(&config);
461    config.install_signal_handlers = 0;
462    PyStatus status;
463    status = PyConfig_SetBytesString(&config, &config.program_name, *argv[0]);
464    if (PyStatus_Exception(status)) {
465        goto fail;
466    }
467
468    status = Py_InitializeFromConfig(&config);
469    if (PyStatus_Exception(status)) {
470        goto fail;
471    }
472    PyConfig_Clear(&config);
473
474    return 0;
475
476fail:
477    PyConfig_Clear(&config);
478    Py_ExitStatusException(status);
479}
480
481/* Fuzz test interface.
482   This returns the bitwise or of all fuzz test's return values.
483
484   All fuzz tests must return 0, as all nonzero return codes are reserved for
485   future use -- we propagate the return values for that future case.
486   (And we bitwise or when running multiple tests to verify that normally we
487   only return 0.) */
488int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
489    assert(Py_IsInitialized());
490
491    int rv = 0;
492
493#if !defined(_Py_FUZZ_ONE) || defined(_Py_FUZZ_fuzz_builtin_float)
494    rv |= _run_fuzz(data, size, fuzz_builtin_float);
495#endif
496#if !defined(_Py_FUZZ_ONE) || defined(_Py_FUZZ_fuzz_builtin_int)
497    rv |= _run_fuzz(data, size, fuzz_builtin_int);
498#endif
499#if !defined(_Py_FUZZ_ONE) || defined(_Py_FUZZ_fuzz_builtin_unicode)
500    rv |= _run_fuzz(data, size, fuzz_builtin_unicode);
501#endif
502#if !defined(_Py_FUZZ_ONE) || defined(_Py_FUZZ_fuzz_struct_unpack)
503    static int STRUCT_UNPACK_INITIALIZED = 0;
504    if (!STRUCT_UNPACK_INITIALIZED && !init_struct_unpack()) {
505        PyErr_Print();
506        abort();
507    } else {
508        STRUCT_UNPACK_INITIALIZED = 1;
509    }
510    rv |= _run_fuzz(data, size, fuzz_struct_unpack);
511#endif
512#if !defined(_Py_FUZZ_ONE) || defined(_Py_FUZZ_fuzz_json_loads)
513    static int JSON_LOADS_INITIALIZED = 0;
514    if (!JSON_LOADS_INITIALIZED && !init_json_loads()) {
515        PyErr_Print();
516        abort();
517    } else {
518        JSON_LOADS_INITIALIZED = 1;
519    }
520
521    rv |= _run_fuzz(data, size, fuzz_json_loads);
522#endif
523#if !defined(_Py_FUZZ_ONE) || defined(_Py_FUZZ_fuzz_sre_compile)
524    static int SRE_COMPILE_INITIALIZED = 0;
525    if (!SRE_COMPILE_INITIALIZED && !init_sre_compile()) {
526        PyErr_Print();
527        abort();
528    } else {
529        SRE_COMPILE_INITIALIZED = 1;
530    }
531
532    rv |= _run_fuzz(data, size, fuzz_sre_compile);
533#endif
534#if !defined(_Py_FUZZ_ONE) || defined(_Py_FUZZ_fuzz_sre_match)
535    static int SRE_MATCH_INITIALIZED = 0;
536    if (!SRE_MATCH_INITIALIZED && !init_sre_match()) {
537        PyErr_Print();
538        abort();
539    } else {
540        SRE_MATCH_INITIALIZED = 1;
541    }
542
543    rv |= _run_fuzz(data, size, fuzz_sre_match);
544#endif
545#if !defined(_Py_FUZZ_ONE) || defined(_Py_FUZZ_fuzz_csv_reader)
546    static int CSV_READER_INITIALIZED = 0;
547    if (!CSV_READER_INITIALIZED && !init_csv_reader()) {
548        PyErr_Print();
549        abort();
550    } else {
551        CSV_READER_INITIALIZED = 1;
552    }
553
554    rv |= _run_fuzz(data, size, fuzz_csv_reader);
555#endif
556#if !defined(_Py_FUZZ_ONE) || defined(_Py_FUZZ_fuzz_ast_literal_eval)
557    static int AST_LITERAL_EVAL_INITIALIZED = 0;
558    if (!AST_LITERAL_EVAL_INITIALIZED && !init_ast_literal_eval()) {
559        PyErr_Print();
560        abort();
561    } else {
562        AST_LITERAL_EVAL_INITIALIZED = 1;
563    }
564
565    rv |= _run_fuzz(data, size, fuzz_ast_literal_eval);
566#endif
567  return rv;
568}
569