From 1af7b7db0d1cd6756ecb6081364fdd1378b1605c Mon Sep 17 00:00:00 2001
From: Lysandros Nikolaou
Date: Wed, 18 Oct 2023 00:34:56 +0200
Subject: [PATCH] [3.11] gh-107450: Check for overflow in the tokenizer and
 fix overflow test (GH-110832) (#110939)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

(cherry picked from commit a1ac5590e0f8fe008e5562d22edab65d0c1c5507)

Co-authored-by: Filipe Laíns
Co-authored-by: Serhiy Storchaka
---
 Include/errcode.h           | 39 ++++++++++++++++++++-------------------
 Lib/test/test_exceptions.py | 16 ++++++++++++----
 Parser/pegen_errors.c       | 10 ++++------
 Parser/tokenizer.c          |  4 ++++
 4 files changed, 40 insertions(+), 29 deletions(-)

diff --git a/Include/errcode.h b/Include/errcode.h
index 54ae929bf25..19ee83ec73d 100644
--- a/Include/errcode.h
+++ b/Include/errcode.h
@@ -4,7 +4,6 @@
 extern "C" {
 #endif
 
-
 /* Error codes passed around between file input, tokenizer, parser and
    interpreter.  This is necessary so we can turn them into Python
    exceptions at a higher level.  Note that some errors have a
@@ -13,24 +12,26 @@ extern "C" {
    the parser only returns E_EOF when it hits EOF immediately, and it
    never returns E_OK. */
 
-#define E_OK            10      /* No error */
-#define E_EOF           11      /* End Of File */
-#define E_INTR          12      /* Interrupted */
-#define E_TOKEN         13      /* Bad token */
-#define E_SYNTAX        14      /* Syntax error */
-#define E_NOMEM         15      /* Ran out of memory */
-#define E_DONE          16      /* Parsing complete */
-#define E_ERROR         17      /* Execution error */
-#define E_TABSPACE      18      /* Inconsistent mixing of tabs and spaces */
-#define E_OVERFLOW      19      /* Node had too many children */
-#define E_TOODEEP       20      /* Too many indentation levels */
-#define E_DEDENT        21      /* No matching outer block for dedent */
-#define E_DECODE        22      /* Error in decoding into Unicode */
-#define E_EOFS          23      /* EOF in triple-quoted string */
-#define E_EOLS          24      /* EOL in single-quoted string */
-#define E_LINECONT      25      /* Unexpected characters after a line continuation */
-#define E_BADSINGLE     27      /* Ill-formed single statement input */
-#define E_INTERACT_STOP 28      /* Interactive mode stopped tokenization */
+#define E_OK             10      /* No error */
+#define E_EOF            11      /* End Of File */
+#define E_INTR           12      /* Interrupted */
+#define E_TOKEN          13      /* Bad token */
+#define E_SYNTAX         14      /* Syntax error */
+#define E_NOMEM          15      /* Ran out of memory */
+#define E_DONE           16      /* Parsing complete */
+#define E_ERROR          17      /* Execution error */
+#define E_TABSPACE       18      /* Inconsistent mixing of tabs and spaces */
+#define E_OVERFLOW       19      /* Node had too many children */
+#define E_TOODEEP        20      /* Too many indentation levels */
+#define E_DEDENT         21      /* No matching outer block for dedent */
+#define E_DECODE         22      /* Error in decoding into Unicode */
+#define E_EOFS           23      /* EOF in triple-quoted string */
+#define E_EOLS           24      /* EOL in single-quoted string */
+#define E_LINECONT       25      /* Unexpected characters after a line continuation */
+#define E_BADSINGLE      27      /* Ill-formed single statement input */
+#define E_INTERACT_STOP  28      /* Interactive mode stopped tokenization */
+#define E_COLUMNOVERFLOW 29      /* Column offset overflow */
+
 
 #ifdef __cplusplus
 }
diff --git a/Lib/test/test_exceptions.py b/Lib/test/test_exceptions.py
index c7c63e5e4ae..cd064219c67 100644
--- a/Lib/test/test_exceptions.py
+++ b/Lib/test/test_exceptions.py
@@ -19,6 +19,12 @@ from test.support.os_helper import TESTFN, unlink
 from test.support.warnings_helper import check_warnings
 from test import support
 
 
+try:
+    from _testcapi import INT_MAX
+except ImportError:
+    INT_MAX = 2**31 - 1
+
+
 class NaiveException(Exception):
     def __init__(self, x):
@@ -318,11 +324,13 @@ class ExceptionTests(unittest.TestCase):
         check('(yield i) = 2', 1, 2)
         check('def f(*):\n pass', 1, 7)
 
+    @unittest.skipIf(INT_MAX >= sys.maxsize, "Downcasting to int is safe for col_offset")
     @support.requires_resource('cpu')
-    @support.bigmemtest(support._2G, memuse=1.5)
-    def testMemoryErrorBigSource(self, _size):
-        with self.assertRaises(OverflowError):
-            exec(f"if True:\n {' ' * 2**31}print('hello world')")
+    @support.bigmemtest(INT_MAX, memuse=2, dry_run=False)
+    def testMemoryErrorBigSource(self, size):
+        src = b"if True:\n%*s" % (size, b"pass")
+        with self.assertRaisesRegex(OverflowError, "Parser column offset overflow"):
+            compile(src, '', 'exec')
 
     @cpython_only
     def testSettingException(self):
diff --git a/Parser/pegen_errors.c b/Parser/pegen_errors.c
index ea5c4e227ff..005356110a4 100644
--- a/Parser/pegen_errors.c
+++ b/Parser/pegen_errors.c
@@ -101,6 +101,10 @@ _Pypegen_tokenizer_error(Parser *p)
             msg = "unexpected character after line continuation character";
             break;
         }
+        case E_COLUMNOVERFLOW:
+            PyErr_SetString(PyExc_OverflowError,
+                            "Parser column offset overflow - source line is too big");
+            return -1;
         default:
             msg = "unknown parsing error";
     }
@@ -224,12 +228,6 @@ _PyPegen_raise_error(Parser *p, PyObject *errtype, const char *errmsg, ...)
             col_offset = 0;
         } else {
             const char* start = p->tok->buf ? p->tok->line_start : p->tok->buf;
-            if (p->tok->cur - start > INT_MAX) {
-                PyErr_SetString(PyExc_OverflowError,
-                                "Parser column offset overflow - source line is too big");
-                p->error_indicator = 1;
-                return NULL;
-            }
             col_offset = Py_SAFE_DOWNCAST(p->tok->cur - start, intptr_t, int);
         }
     } else {
diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c
index 7fc8a585621..566ad8dfa00 100644
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -1057,6 +1057,10 @@ tok_nextc(struct tok_state *tok)
     int rc;
     for (;;) {
        if (tok->cur != tok->inp) {
+            if (tok->cur - tok->buf >= INT_MAX) {
+                tok->done = E_COLUMNOVERFLOW;
+                return EOF;
+            }
             return Py_CHARMASK(*tok->cur++); /* Fast path */
         }
         if (tok->done != E_OK) {
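
For reference, a minimal sketch of how the new E_COLUMNOVERFLOW path surfaces
from Python code. This is not part of the patch; it simply mirrors the updated
test above, assumes a 64-bit build (where sys.maxsize exceeds the C int limit),
and needs several GiB of free memory. The INT_MAX value and the "<huge>"
filename are illustrative, not taken from the patch:

    # Pad the second source line out to 2**31 - 1 columns, the C "int" limit
    # assumed by the tokenizer's column-offset downcast.
    INT_MAX = 2**31 - 1
    src = b"if True:\n%*s" % (INT_MAX, b"pass")
    try:
        compile(src, "<huge>", "exec")
    except OverflowError as exc:
        # Raised by the new E_COLUMNOVERFLOW branch in _Pypegen_tokenizer_error()
        print(exc)  # Parser column offset overflow - source line is too big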