From c38e2f64d012929168dfef7363c9e48bd1a6c731 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Mon, 24 Jun 2024 12:17:25 +0300 Subject: [PATCH] gh-119614: Fix truncation of strings with embedded null characters in Tkinter (GH-120909) Now the null character is always represented as \xc0\x80 for Tcl_NewStringObj(). --- Lib/test/test_tcl.py | 24 +++++++++++ Lib/test/test_tkinter/test_misc.py | 9 +++++ ...-06-23-17-50-40.gh-issue-119614.vwPGLB.rst | 2 + Modules/_tkinter.c | 40 +++++++++++++++---- 4 files changed, 68 insertions(+), 7 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2024-06-23-17-50-40.gh-issue-119614.vwPGLB.rst diff --git a/Lib/test/test_tcl.py b/Lib/test/test_tcl.py index 443787d721d..d479f7d7515 100644 --- a/Lib/test/test_tcl.py +++ b/Lib/test/test_tcl.py @@ -73,6 +73,18 @@ class TclTest(unittest.TestCase): tcl.call('set','a','1') self.assertEqual(tcl.call('set','a'),'1') + def test_call_passing_null(self): + tcl = self.interp + tcl.call('set', 'a', 'a\0b') # ASCII-only + self.assertEqual(tcl.getvar('a'), 'a\x00b') + self.assertEqual(tcl.call('set', 'a'), 'a\x00b') + self.assertEqual(tcl.eval('set a'), 'a\x00b') + + tcl.call('set', 'a', '\u20ac\0') # non-ASCII + self.assertEqual(tcl.getvar('a'), '\u20ac\x00') + self.assertEqual(tcl.call('set', 'a'), '\u20ac\x00') + self.assertEqual(tcl.eval('set a'), '\u20ac\x00') + def testCallException(self): tcl = self.interp self.assertRaises(TclError,tcl.call,'set','a') @@ -98,6 +110,18 @@ class TclTest(unittest.TestCase): tcl.setvar('a','1') self.assertEqual(tcl.eval('set a'),'1') + def test_setvar_passing_null(self): + tcl = self.interp + tcl.setvar('a', 'a\0b') # ASCII-only + self.assertEqual(tcl.getvar('a'), 'a\x00b') + self.assertEqual(tcl.call('set', 'a'), 'a\x00b') + self.assertEqual(tcl.eval('set a'), 'a\x00b') + + tcl.setvar('a', '\u20ac\0') # non-ASCII + self.assertEqual(tcl.getvar('a'), '\u20ac\x00') + self.assertEqual(tcl.call('set', 'a'), '\u20ac\x00') + self.assertEqual(tcl.eval('set a'), '\u20ac\x00') + def testSetVarArray(self): tcl = self.interp tcl.setvar('a(1)','1') diff --git a/Lib/test/test_tkinter/test_misc.py b/Lib/test/test_tkinter/test_misc.py index d9ea642881a..b0b9ed60040 100644 --- a/Lib/test/test_tkinter/test_misc.py +++ b/Lib/test/test_tkinter/test_misc.py @@ -476,6 +476,15 @@ class MiscTest(AbstractTkTest, unittest.TestCase): self.assertEqual(vi.micro, 0) self.assertTrue(str(vi).startswith(f'{vi.major}.{vi.minor}')) + def test_embedded_null(self): + widget = tkinter.Entry(self.root) + widget.insert(0, 'abc\0def') # ASCII-only + widget.selection_range(0, 'end') + self.assertEqual(widget.selection_get(), 'abc\x00def') + widget.insert(0, '\u20ac\0') # non-ASCII + widget.selection_range(0, 'end') + self.assertEqual(widget.selection_get(), '\u20ac\0abc\x00def') + class WmTest(AbstractTkTest, unittest.TestCase): diff --git a/Misc/NEWS.d/next/Library/2024-06-23-17-50-40.gh-issue-119614.vwPGLB.rst b/Misc/NEWS.d/next/Library/2024-06-23-17-50-40.gh-issue-119614.vwPGLB.rst new file mode 100644 index 00000000000..d518265a7fe --- /dev/null +++ b/Misc/NEWS.d/next/Library/2024-06-23-17-50-40.gh-issue-119614.vwPGLB.rst @@ -0,0 +1,2 @@ +Fix truncation of strings with embedded null characters in some internal +operations in :mod:`tkinter`. diff --git a/Modules/_tkinter.c b/Modules/_tkinter.c index 8a5347be7de..1542d3af42f 100644 --- a/Modules/_tkinter.c +++ b/Modules/_tkinter.c @@ -512,7 +512,7 @@ unicodeFromTclObj(TkappObject *tkapp, Tcl_Obj *value) else Py_UNREACHABLE(); } -#endif +#endif /* USE_TCL_UNICODE */ const char *s = Tcl_GetStringFromObj(value, &len); return unicodeFromTclStringAndSize(s, len); } @@ -1018,7 +1018,9 @@ AsObj(PyObject *value) PyErr_SetString(PyExc_OverflowError, "string is too long"); return NULL; } - if (PyUnicode_IS_ASCII(value)) { + if (PyUnicode_IS_ASCII(value) && + strlen(PyUnicode_DATA(value)) == (size_t)PyUnicode_GET_LENGTH(value)) + { return Tcl_NewStringObj((const char *)PyUnicode_DATA(value), (int)size); } @@ -1033,9 +1035,6 @@ AsObj(PyObject *value) "surrogatepass", NATIVE_BYTEORDER); else Py_UNREACHABLE(); -#else - encoded = _PyUnicode_AsUTF8String(value, "surrogateescape"); -#endif if (!encoded) { return NULL; } @@ -1045,12 +1044,39 @@ AsObj(PyObject *value) PyErr_SetString(PyExc_OverflowError, "string is too long"); return NULL; } -#if USE_TCL_UNICODE result = Tcl_NewUnicodeObj((const Tcl_UniChar *)PyBytes_AS_STRING(encoded), (int)(size / sizeof(Tcl_UniChar))); #else + encoded = _PyUnicode_AsUTF8String(value, "surrogateescape"); + if (!encoded) { + return NULL; + } + size = PyBytes_GET_SIZE(encoded); + if (strlen(PyBytes_AS_STRING(encoded)) != (size_t)size) { + /* The string contains embedded null characters. + * Tcl needs a null character to be represented as \xc0\x80 in + * the Modified UTF-8 encoding. Otherwise the string can be + * truncated in some internal operations. + * + * NOTE: stringlib_replace() could be used here, but optimizing + * this obscure case isn't worth it unless stringlib_replace() + * was already exposed in the C API for other reasons. */ + Py_SETREF(encoded, + PyObject_CallMethod(encoded, "replace", "y#y#", + "\0", (Py_ssize_t)1, + "\xc0\x80", (Py_ssize_t)2)); + if (!encoded) { + return NULL; + } + size = PyBytes_GET_SIZE(encoded); + } + if (size > INT_MAX) { + Py_DECREF(encoded); + PyErr_SetString(PyExc_OverflowError, "string is too long"); + return NULL; + } result = Tcl_NewStringObj(PyBytes_AS_STRING(encoded), (int)size); -#endif +#endif /* USE_TCL_UNICODE */ Py_DECREF(encoded); return result; }