1/* A fuzz test for CPython. 2 3 The only exposed function is LLVMFuzzerTestOneInput, which is called by 4 fuzzers and by the _fuzz module for smoke tests. 5 6 To build exactly one fuzz test, as when running in oss-fuzz etc., 7 build with -D _Py_FUZZ_ONE and -D _Py_FUZZ_<test_name>. e.g. to build 8 LLVMFuzzerTestOneInput to only run "fuzz_builtin_float", build this file with 9 -D _Py_FUZZ_ONE -D _Py_FUZZ_fuzz_builtin_float. 10 11 See the source code for LLVMFuzzerTestOneInput for details. */ 12 13#include <Python.h> 14#include <stdlib.h> 15#include <inttypes.h> 16 17/* Fuzz PyFloat_FromString as a proxy for float(str). */ 18static int fuzz_builtin_float(const char* data, size_t size) { 19 PyObject* s = PyBytes_FromStringAndSize(data, size); 20 if (s == NULL) return 0; 21 PyObject* f = PyFloat_FromString(s); 22 if (PyErr_Occurred() && PyErr_ExceptionMatches(PyExc_ValueError)) { 23 PyErr_Clear(); 24 } 25 26 Py_XDECREF(f); 27 Py_DECREF(s); 28 return 0; 29} 30 31#define MAX_INT_TEST_SIZE 0x10000 32 33/* Fuzz PyLong_FromUnicodeObject as a proxy for int(str). */ 34static int fuzz_builtin_int(const char* data, size_t size) { 35 /* Ignore test cases with very long ints to avoid timeouts 36 int("9" * 1000000) is not a very interesting test caase */ 37 if (size > MAX_INT_TEST_SIZE) { 38 return 0; 39 } 40 /* Pick a random valid base. (When the fuzzed function takes extra 41 parameters, it's somewhat normal to hash the input to generate those 42 parameters. We want to exercise all code paths, so we do so here.) */ 43 int base = _Py_HashBytes(data, size) % 37; 44 if (base == 1) { 45 // 1 is the only number between 0 and 36 that is not a valid base. 46 base = 0; 47 } 48 if (base == -1) { 49 return 0; // An error occurred, bail early. 50 } 51 if (base < 0) { 52 base = -base; 53 } 54 55 PyObject* s = PyUnicode_FromStringAndSize(data, size); 56 if (s == NULL) { 57 if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) { 58 PyErr_Clear(); 59 } 60 return 0; 61 } 62 PyObject* l = PyLong_FromUnicodeObject(s, base); 63 if (l == NULL && PyErr_ExceptionMatches(PyExc_ValueError)) { 64 PyErr_Clear(); 65 } 66 PyErr_Clear(); 67 Py_XDECREF(l); 68 Py_DECREF(s); 69 return 0; 70} 71 72/* Fuzz PyUnicode_FromStringAndSize as a proxy for unicode(str). */ 73static int fuzz_builtin_unicode(const char* data, size_t size) { 74 PyObject* s = PyUnicode_FromStringAndSize(data, size); 75 if (s == NULL && PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) { 76 PyErr_Clear(); 77 } 78 Py_XDECREF(s); 79 return 0; 80} 81 82 83PyObject* struct_unpack_method = NULL; 84PyObject* struct_error = NULL; 85/* Called by LLVMFuzzerTestOneInput for initialization */ 86static int init_struct_unpack(void) { 87 /* Import struct.unpack */ 88 PyObject* struct_module = PyImport_ImportModule("struct"); 89 if (struct_module == NULL) { 90 return 0; 91 } 92 struct_error = PyObject_GetAttrString(struct_module, "error"); 93 if (struct_error == NULL) { 94 return 0; 95 } 96 struct_unpack_method = PyObject_GetAttrString(struct_module, "unpack"); 97 return struct_unpack_method != NULL; 98} 99/* Fuzz struct.unpack(x, y) */ 100static int fuzz_struct_unpack(const char* data, size_t size) { 101 /* Everything up to the first null byte is considered the 102 format. Everything after is the buffer */ 103 const char* first_null = memchr(data, '\0', size); 104 if (first_null == NULL) { 105 return 0; 106 } 107 108 size_t format_length = first_null - data; 109 size_t buffer_length = size - format_length - 1; 110 111 PyObject* pattern = PyBytes_FromStringAndSize(data, format_length); 112 if (pattern == NULL) { 113 return 0; 114 } 115 PyObject* buffer = PyBytes_FromStringAndSize(first_null + 1, buffer_length); 116 if (buffer == NULL) { 117 Py_DECREF(pattern); 118 return 0; 119 } 120 121 PyObject* unpacked = PyObject_CallFunctionObjArgs( 122 struct_unpack_method, pattern, buffer, NULL); 123 /* Ignore any overflow errors, these are easily triggered accidentally */ 124 if (unpacked == NULL && PyErr_ExceptionMatches(PyExc_OverflowError)) { 125 PyErr_Clear(); 126 } 127 /* The pascal format string will throw a negative size when passing 0 128 like: struct.unpack('0p', b'') */ 129 if (unpacked == NULL && PyErr_ExceptionMatches(PyExc_SystemError)) { 130 PyErr_Clear(); 131 } 132 /* Ignore any struct.error exceptions, these can be caused by invalid 133 formats or incomplete buffers both of which are common. */ 134 if (unpacked == NULL && PyErr_ExceptionMatches(struct_error)) { 135 PyErr_Clear(); 136 } 137 138 Py_XDECREF(unpacked); 139 Py_DECREF(pattern); 140 Py_DECREF(buffer); 141 return 0; 142} 143 144 145#define MAX_JSON_TEST_SIZE 0x10000 146 147PyObject* json_loads_method = NULL; 148/* Called by LLVMFuzzerTestOneInput for initialization */ 149static int init_json_loads(void) { 150 /* Import json.loads */ 151 PyObject* json_module = PyImport_ImportModule("json"); 152 if (json_module == NULL) { 153 return 0; 154 } 155 json_loads_method = PyObject_GetAttrString(json_module, "loads"); 156 return json_loads_method != NULL; 157} 158/* Fuzz json.loads(x) */ 159static int fuzz_json_loads(const char* data, size_t size) { 160 /* Since python supports arbitrarily large ints in JSON, 161 long inputs can lead to timeouts on boring inputs like 162 `json.loads("9" * 100000)` */ 163 if (size > MAX_JSON_TEST_SIZE) { 164 return 0; 165 } 166 PyObject* input_bytes = PyBytes_FromStringAndSize(data, size); 167 if (input_bytes == NULL) { 168 return 0; 169 } 170 PyObject* parsed = PyObject_CallOneArg(json_loads_method, input_bytes); 171 if (parsed == NULL) { 172 /* Ignore ValueError as the fuzzer will more than likely 173 generate some invalid json and values */ 174 if (PyErr_ExceptionMatches(PyExc_ValueError) || 175 /* Ignore RecursionError as the fuzzer generates long sequences of 176 arrays such as `[[[...` */ 177 PyErr_ExceptionMatches(PyExc_RecursionError) || 178 /* Ignore unicode errors, invalid byte sequences are common */ 179 PyErr_ExceptionMatches(PyExc_UnicodeDecodeError) 180 ) { 181 PyErr_Clear(); 182 } 183 } 184 Py_DECREF(input_bytes); 185 Py_XDECREF(parsed); 186 return 0; 187} 188 189#define MAX_RE_TEST_SIZE 0x10000 190 191PyObject* sre_compile_method = NULL; 192PyObject* sre_error_exception = NULL; 193int SRE_FLAG_DEBUG = 0; 194/* Called by LLVMFuzzerTestOneInput for initialization */ 195static int init_sre_compile(void) { 196 /* Import sre_compile.compile and sre.error */ 197 PyObject* sre_compile_module = PyImport_ImportModule("sre_compile"); 198 if (sre_compile_module == NULL) { 199 return 0; 200 } 201 sre_compile_method = PyObject_GetAttrString(sre_compile_module, "compile"); 202 if (sre_compile_method == NULL) { 203 return 0; 204 } 205 206 PyObject* sre_constants = PyImport_ImportModule("sre_constants"); 207 if (sre_constants == NULL) { 208 return 0; 209 } 210 sre_error_exception = PyObject_GetAttrString(sre_constants, "error"); 211 if (sre_error_exception == NULL) { 212 return 0; 213 } 214 PyObject* debug_flag = PyObject_GetAttrString(sre_constants, "SRE_FLAG_DEBUG"); 215 if (debug_flag == NULL) { 216 return 0; 217 } 218 SRE_FLAG_DEBUG = PyLong_AsLong(debug_flag); 219 return 1; 220} 221/* Fuzz _sre.compile(x) */ 222static int fuzz_sre_compile(const char* data, size_t size) { 223 /* Ignore really long regex patterns that will timeout the fuzzer */ 224 if (size > MAX_RE_TEST_SIZE) { 225 return 0; 226 } 227 /* We treat the first 2 bytes of the input as a number for the flags */ 228 if (size < 2) { 229 return 0; 230 } 231 uint16_t flags = ((uint16_t*) data)[0]; 232 /* We remove the SRE_FLAG_DEBUG if present. This is because it 233 prints to stdout which greatly decreases fuzzing speed */ 234 flags &= ~SRE_FLAG_DEBUG; 235 236 /* Pull the pattern from the remaining bytes */ 237 PyObject* pattern_bytes = PyBytes_FromStringAndSize(data + 2, size - 2); 238 if (pattern_bytes == NULL) { 239 return 0; 240 } 241 PyObject* flags_obj = PyLong_FromUnsignedLong(flags); 242 if (flags_obj == NULL) { 243 Py_DECREF(pattern_bytes); 244 return 0; 245 } 246 247 /* compiled = _sre.compile(data[2:], data[0:2] */ 248 PyObject* compiled = PyObject_CallFunctionObjArgs( 249 sre_compile_method, pattern_bytes, flags_obj, NULL); 250 /* Ignore ValueError as the fuzzer will more than likely 251 generate some invalid combination of flags */ 252 if (compiled == NULL && PyErr_ExceptionMatches(PyExc_ValueError)) { 253 PyErr_Clear(); 254 } 255 /* Ignore some common errors thrown by sre_parse: 256 Overflow, Assertion, Recursion and Index */ 257 if (compiled == NULL && (PyErr_ExceptionMatches(PyExc_OverflowError) || 258 PyErr_ExceptionMatches(PyExc_AssertionError) || 259 PyErr_ExceptionMatches(PyExc_RecursionError) || 260 PyErr_ExceptionMatches(PyExc_IndexError)) 261 ) { 262 PyErr_Clear(); 263 } 264 /* Ignore re.error */ 265 if (compiled == NULL && PyErr_ExceptionMatches(sre_error_exception)) { 266 PyErr_Clear(); 267 } 268 269 Py_DECREF(pattern_bytes); 270 Py_DECREF(flags_obj); 271 Py_XDECREF(compiled); 272 return 0; 273} 274 275/* Some random patterns used to test re.match. 276 Be careful not to add catostraphically slow regexes here, we want to 277 exercise the matching code without causing timeouts.*/ 278static const char* regex_patterns[] = { 279 ".", "^", "abc", "abc|def", "^xxx$", "\\b", "()", "[a-zA-Z0-9]", 280 "abc+", "[^A-Z]", "[x]", "(?=)", "a{z}", "a+b", "a*?", "a??", "a+?", 281 "{}", "a{,}", "{", "}", "^\\(*\\d{3}\\)*( |-)*\\d{3}( |-)*\\d{4}$", 282 "(?:a*)*", "a{1,2}?" 283}; 284const size_t NUM_PATTERNS = sizeof(regex_patterns) / sizeof(regex_patterns[0]); 285PyObject** compiled_patterns = NULL; 286/* Called by LLVMFuzzerTestOneInput for initialization */ 287static int init_sre_match(void) { 288 PyObject* re_module = PyImport_ImportModule("re"); 289 if (re_module == NULL) { 290 return 0; 291 } 292 compiled_patterns = (PyObject**) PyMem_RawMalloc( 293 sizeof(PyObject*) * NUM_PATTERNS); 294 if (compiled_patterns == NULL) { 295 PyErr_NoMemory(); 296 return 0; 297 } 298 299 /* Precompile all the regex patterns on the first run for faster fuzzing */ 300 for (size_t i = 0; i < NUM_PATTERNS; i++) { 301 PyObject* compiled = PyObject_CallMethod( 302 re_module, "compile", "y", regex_patterns[i]); 303 /* Bail if any of the patterns fail to compile */ 304 if (compiled == NULL) { 305 return 0; 306 } 307 compiled_patterns[i] = compiled; 308 } 309 return 1; 310} 311/* Fuzz re.match(x) */ 312static int fuzz_sre_match(const char* data, size_t size) { 313 if (size < 1 || size > MAX_RE_TEST_SIZE) { 314 return 0; 315 } 316 /* Use the first byte as a uint8_t specifying the index of the 317 regex to use */ 318 unsigned char idx = (unsigned char) data[0]; 319 idx = idx % NUM_PATTERNS; 320 321 /* Pull the string to match from the remaining bytes */ 322 PyObject* to_match = PyBytes_FromStringAndSize(data + 1, size - 1); 323 if (to_match == NULL) { 324 return 0; 325 } 326 327 PyObject* pattern = compiled_patterns[idx]; 328 PyObject* match_callable = PyObject_GetAttrString(pattern, "match"); 329 330 PyObject* matches = PyObject_CallOneArg(match_callable, to_match); 331 332 Py_XDECREF(matches); 333 Py_DECREF(match_callable); 334 Py_DECREF(to_match); 335 return 0; 336} 337 338#define MAX_CSV_TEST_SIZE 0x10000 339PyObject* csv_module = NULL; 340PyObject* csv_error = NULL; 341/* Called by LLVMFuzzerTestOneInput for initialization */ 342static int init_csv_reader(void) { 343 /* Import csv and csv.Error */ 344 csv_module = PyImport_ImportModule("csv"); 345 if (csv_module == NULL) { 346 return 0; 347 } 348 csv_error = PyObject_GetAttrString(csv_module, "Error"); 349 return csv_error != NULL; 350} 351/* Fuzz csv.reader([x]) */ 352static int fuzz_csv_reader(const char* data, size_t size) { 353 if (size < 1 || size > MAX_CSV_TEST_SIZE) { 354 return 0; 355 } 356 /* Ignore non null-terminated strings since _csv can't handle 357 embedded nulls */ 358 if (memchr(data, '\0', size) == NULL) { 359 return 0; 360 } 361 362 PyObject* s = PyUnicode_FromString(data); 363 /* Ignore exceptions until we have a valid string */ 364 if (s == NULL) { 365 PyErr_Clear(); 366 return 0; 367 } 368 369 /* Split on \n so we can test multiple lines */ 370 PyObject* lines = PyObject_CallMethod(s, "split", "s", "\n"); 371 if (lines == NULL) { 372 Py_DECREF(s); 373 return 0; 374 } 375 376 PyObject* reader = PyObject_CallMethod(csv_module, "reader", "N", lines); 377 if (reader) { 378 /* Consume all of the reader as an iterator */ 379 PyObject* parsed_line; 380 while ((parsed_line = PyIter_Next(reader))) { 381 Py_DECREF(parsed_line); 382 } 383 } 384 385 /* Ignore csv.Error because we're probably going to generate 386 some bad files (embedded new-lines, unterminated quotes etc) */ 387 if (PyErr_ExceptionMatches(csv_error)) { 388 PyErr_Clear(); 389 } 390 391 Py_XDECREF(reader); 392 Py_DECREF(s); 393 return 0; 394} 395 396#define MAX_AST_LITERAL_EVAL_TEST_SIZE 0x10000 397PyObject* ast_literal_eval_method = NULL; 398/* Called by LLVMFuzzerTestOneInput for initialization */ 399static int init_ast_literal_eval(void) { 400 PyObject* ast_module = PyImport_ImportModule("ast"); 401 if (ast_module == NULL) { 402 return 0; 403 } 404 ast_literal_eval_method = PyObject_GetAttrString(ast_module, "literal_eval"); 405 return ast_literal_eval_method != NULL; 406} 407/* Fuzz ast.literal_eval(x) */ 408static int fuzz_ast_literal_eval(const char* data, size_t size) { 409 if (size > MAX_AST_LITERAL_EVAL_TEST_SIZE) { 410 return 0; 411 } 412 /* Ignore non null-terminated strings since ast can't handle 413 embedded nulls */ 414 if (memchr(data, '\0', size) == NULL) { 415 return 0; 416 } 417 418 PyObject* s = PyUnicode_FromString(data); 419 /* Ignore exceptions until we have a valid string */ 420 if (s == NULL) { 421 PyErr_Clear(); 422 return 0; 423 } 424 425 PyObject* literal = PyObject_CallOneArg(ast_literal_eval_method, s); 426 /* Ignore some common errors thrown by ast.literal_eval */ 427 if (literal == NULL && (PyErr_ExceptionMatches(PyExc_ValueError) || 428 PyErr_ExceptionMatches(PyExc_TypeError) || 429 PyErr_ExceptionMatches(PyExc_SyntaxError) || 430 PyErr_ExceptionMatches(PyExc_MemoryError) || 431 PyErr_ExceptionMatches(PyExc_RecursionError)) 432 ) { 433 PyErr_Clear(); 434 } 435 436 Py_XDECREF(literal); 437 Py_DECREF(s); 438 return 0; 439} 440 441/* Run fuzzer and abort on failure. */ 442static int _run_fuzz(const uint8_t *data, size_t size, int(*fuzzer)(const char* , size_t)) { 443 int rv = fuzzer((const char*) data, size); 444 if (PyErr_Occurred()) { 445 /* Fuzz tests should handle expected errors for themselves. 446 This is last-ditch check in case they didn't. */ 447 PyErr_Print(); 448 abort(); 449 } 450 /* Someday the return value might mean something, propagate it. */ 451 return rv; 452} 453 454/* CPython generates a lot of leak warnings for whatever reason. */ 455int __lsan_is_turned_off(void) { return 1; } 456 457 458int LLVMFuzzerInitialize(int *argc, char ***argv) { 459 PyConfig config; 460 PyConfig_InitPythonConfig(&config); 461 config.install_signal_handlers = 0; 462 PyStatus status; 463 status = PyConfig_SetBytesString(&config, &config.program_name, *argv[0]); 464 if (PyStatus_Exception(status)) { 465 goto fail; 466 } 467 468 status = Py_InitializeFromConfig(&config); 469 if (PyStatus_Exception(status)) { 470 goto fail; 471 } 472 PyConfig_Clear(&config); 473 474 return 0; 475 476fail: 477 PyConfig_Clear(&config); 478 Py_ExitStatusException(status); 479} 480 481/* Fuzz test interface. 482 This returns the bitwise or of all fuzz test's return values. 483 484 All fuzz tests must return 0, as all nonzero return codes are reserved for 485 future use -- we propagate the return values for that future case. 486 (And we bitwise or when running multiple tests to verify that normally we 487 only return 0.) */ 488int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { 489 assert(Py_IsInitialized()); 490 491 int rv = 0; 492 493#if !defined(_Py_FUZZ_ONE) || defined(_Py_FUZZ_fuzz_builtin_float) 494 rv |= _run_fuzz(data, size, fuzz_builtin_float); 495#endif 496#if !defined(_Py_FUZZ_ONE) || defined(_Py_FUZZ_fuzz_builtin_int) 497 rv |= _run_fuzz(data, size, fuzz_builtin_int); 498#endif 499#if !defined(_Py_FUZZ_ONE) || defined(_Py_FUZZ_fuzz_builtin_unicode) 500 rv |= _run_fuzz(data, size, fuzz_builtin_unicode); 501#endif 502#if !defined(_Py_FUZZ_ONE) || defined(_Py_FUZZ_fuzz_struct_unpack) 503 static int STRUCT_UNPACK_INITIALIZED = 0; 504 if (!STRUCT_UNPACK_INITIALIZED && !init_struct_unpack()) { 505 PyErr_Print(); 506 abort(); 507 } else { 508 STRUCT_UNPACK_INITIALIZED = 1; 509 } 510 rv |= _run_fuzz(data, size, fuzz_struct_unpack); 511#endif 512#if !defined(_Py_FUZZ_ONE) || defined(_Py_FUZZ_fuzz_json_loads) 513 static int JSON_LOADS_INITIALIZED = 0; 514 if (!JSON_LOADS_INITIALIZED && !init_json_loads()) { 515 PyErr_Print(); 516 abort(); 517 } else { 518 JSON_LOADS_INITIALIZED = 1; 519 } 520 521 rv |= _run_fuzz(data, size, fuzz_json_loads); 522#endif 523#if !defined(_Py_FUZZ_ONE) || defined(_Py_FUZZ_fuzz_sre_compile) 524 static int SRE_COMPILE_INITIALIZED = 0; 525 if (!SRE_COMPILE_INITIALIZED && !init_sre_compile()) { 526 PyErr_Print(); 527 abort(); 528 } else { 529 SRE_COMPILE_INITIALIZED = 1; 530 } 531 532 rv |= _run_fuzz(data, size, fuzz_sre_compile); 533#endif 534#if !defined(_Py_FUZZ_ONE) || defined(_Py_FUZZ_fuzz_sre_match) 535 static int SRE_MATCH_INITIALIZED = 0; 536 if (!SRE_MATCH_INITIALIZED && !init_sre_match()) { 537 PyErr_Print(); 538 abort(); 539 } else { 540 SRE_MATCH_INITIALIZED = 1; 541 } 542 543 rv |= _run_fuzz(data, size, fuzz_sre_match); 544#endif 545#if !defined(_Py_FUZZ_ONE) || defined(_Py_FUZZ_fuzz_csv_reader) 546 static int CSV_READER_INITIALIZED = 0; 547 if (!CSV_READER_INITIALIZED && !init_csv_reader()) { 548 PyErr_Print(); 549 abort(); 550 } else { 551 CSV_READER_INITIALIZED = 1; 552 } 553 554 rv |= _run_fuzz(data, size, fuzz_csv_reader); 555#endif 556#if !defined(_Py_FUZZ_ONE) || defined(_Py_FUZZ_fuzz_ast_literal_eval) 557 static int AST_LITERAL_EVAL_INITIALIZED = 0; 558 if (!AST_LITERAL_EVAL_INITIALIZED && !init_ast_literal_eval()) { 559 PyErr_Print(); 560 abort(); 561 } else { 562 AST_LITERAL_EVAL_INITIALIZED = 1; 563 } 564 565 rv |= _run_fuzz(data, size, fuzz_ast_literal_eval); 566#endif 567 return rv; 568} 569