from _codecs import _unregister_error as _codecs_unregister_error import codecs import html.entities import itertools import sys import unicodedata import unittest class PosReturn: # this can be used for configurable callbacks def __init__(self): self.pos = 0 def handle(self, exc): oldpos = self.pos realpos = oldpos if realpos<0: realpos = len(exc.object) + realpos # if we don't advance this time, terminate on the next call # otherwise we'd get an endless loop if realpos <= exc.start: self.pos = len(exc.object) return ("", oldpos) class RepeatedPosReturn: def __init__(self, repl=""): self.repl = repl self.pos = 0 self.count = 0 def handle(self, exc): if self.count > 0: self.count -= 1 return (self.repl, self.pos) return (self.repl, exc.end) # A UnicodeEncodeError object with a bad start attribute class BadStartUnicodeEncodeError(UnicodeEncodeError): def __init__(self): UnicodeEncodeError.__init__(self, "ascii", "", 0, 1, "bad") self.start = [] # A UnicodeEncodeError object with a bad object attribute class BadObjectUnicodeEncodeError(UnicodeEncodeError): def __init__(self): UnicodeEncodeError.__init__(self, "ascii", "", 0, 1, "bad") self.object = [] # A UnicodeDecodeError object without an end attribute class NoEndUnicodeDecodeError(UnicodeDecodeError): def __init__(self): UnicodeDecodeError.__init__(self, "ascii", bytearray(b""), 0, 1, "bad") del self.end # A UnicodeDecodeError object with a bad object attribute class BadObjectUnicodeDecodeError(UnicodeDecodeError): def __init__(self): UnicodeDecodeError.__init__(self, "ascii", bytearray(b""), 0, 1, "bad") self.object = [] # A UnicodeTranslateError object without a start attribute class NoStartUnicodeTranslateError(UnicodeTranslateError): def __init__(self): UnicodeTranslateError.__init__(self, "", 0, 1, "bad") del self.start # A UnicodeTranslateError object without an end attribute class NoEndUnicodeTranslateError(UnicodeTranslateError): def __init__(self): UnicodeTranslateError.__init__(self, "", 0, 1, "bad") del self.end # A UnicodeTranslateError object without an object attribute class NoObjectUnicodeTranslateError(UnicodeTranslateError): def __init__(self): UnicodeTranslateError.__init__(self, "", 0, 1, "bad") del self.object class CodecCallbackTest(unittest.TestCase): def test_xmlcharrefreplace(self): # replace unencodable characters which numeric character entities. # For ascii, latin-1 and charmaps this is completely implemented # in C and should be reasonably fast. s = "\u30b9\u30d1\u30e2 \xe4nd eggs" self.assertEqual( s.encode("ascii", "xmlcharrefreplace"), b"スパモ änd eggs" ) self.assertEqual( s.encode("latin-1", "xmlcharrefreplace"), b"スパモ \xe4nd eggs" ) def test_xmlcharnamereplace(self): # This time use a named character entity for unencodable # characters, if one is available. def xmlcharnamereplace(exc): if not isinstance(exc, UnicodeEncodeError): raise TypeError("don't know how to handle %r" % exc) l = [] for c in exc.object[exc.start:exc.end]: try: l.append("&%s;" % html.entities.codepoint2name[ord(c)]) except KeyError: l.append("&#%d;" % ord(c)) return ("".join(l), exc.end) codecs.register_error( "test.xmlcharnamereplace", xmlcharnamereplace) sin = "\xab\u211c\xbb = \u2329\u1234\u20ac\u232a" sout = b"«ℜ» = ⟨ሴ€⟩" self.assertEqual(sin.encode("ascii", "test.xmlcharnamereplace"), sout) sout = b"\xabℜ\xbb = ⟨ሴ€⟩" self.assertEqual(sin.encode("latin-1", "test.xmlcharnamereplace"), sout) sout = b"\xabℜ\xbb = ⟨ሴ\xa4⟩" self.assertEqual(sin.encode("iso-8859-15", "test.xmlcharnamereplace"), sout) def test_uninamereplace(self): # We're using the names from the unicode database this time, # and we're doing "syntax highlighting" here, i.e. we include # the replaced text in ANSI escape sequences. For this it is # useful that the error handler is not called for every single # unencodable character, but for a complete sequence of # unencodable characters, otherwise we would output many # unnecessary escape sequences. def uninamereplace(exc): if not isinstance(exc, UnicodeEncodeError): raise TypeError("don't know how to handle %r" % exc) l = [] for c in exc.object[exc.start:exc.end]: l.append(unicodedata.name(c, "0x%x" % ord(c))) return ("\033[1m%s\033[0m" % ", ".join(l), exc.end) codecs.register_error( "test.uninamereplace", uninamereplace) sin = "\xac\u1234\u20ac\u8000" sout = b"\033[1mNOT SIGN, ETHIOPIC SYLLABLE SEE, EURO SIGN, CJK UNIFIED IDEOGRAPH-8000\033[0m" self.assertEqual(sin.encode("ascii", "test.uninamereplace"), sout) sout = b"\xac\033[1mETHIOPIC SYLLABLE SEE, EURO SIGN, CJK UNIFIED IDEOGRAPH-8000\033[0m" self.assertEqual(sin.encode("latin-1", "test.uninamereplace"), sout) sout = b"\xac\033[1mETHIOPIC SYLLABLE SEE\033[0m\xa4\033[1mCJK UNIFIED IDEOGRAPH-8000\033[0m" self.assertEqual(sin.encode("iso-8859-15", "test.uninamereplace"), sout) def test_backslashescape(self): # Does the same as the "unicode-escape" encoding, but with different # base encodings. sin = "a\xac\u1234\u20ac\u8000\U0010ffff" sout = b"a\\xac\\u1234\\u20ac\\u8000\\U0010ffff" self.assertEqual(sin.encode("ascii", "backslashreplace"), sout) sout = b"a\xac\\u1234\\u20ac\\u8000\\U0010ffff" self.assertEqual(sin.encode("latin-1", "backslashreplace"), sout) sout = b"a\xac\\u1234\xa4\\u8000\\U0010ffff" self.assertEqual(sin.encode("iso-8859-15", "backslashreplace"), sout) def test_nameescape(self): # Does the same as backslashescape, but prefers ``\N{...}`` escape # sequences. sin = "a\xac\u1234\u20ac\u8000\U0010ffff" sout = (b'a\\N{NOT SIGN}\\N{ETHIOPIC SYLLABLE SEE}\\N{EURO SIGN}' b'\\N{CJK UNIFIED IDEOGRAPH-8000}\\U0010ffff') self.assertEqual(sin.encode("ascii", "namereplace"), sout) sout = (b'a\xac\\N{ETHIOPIC SYLLABLE SEE}\\N{EURO SIGN}' b'\\N{CJK UNIFIED IDEOGRAPH-8000}\\U0010ffff') self.assertEqual(sin.encode("latin-1", "namereplace"), sout) sout = (b'a\xac\\N{ETHIOPIC SYLLABLE SEE}\xa4' b'\\N{CJK UNIFIED IDEOGRAPH-8000}\\U0010ffff') self.assertEqual(sin.encode("iso-8859-15", "namereplace"), sout) def test_decoding_callbacks(self): # This is a test for a decoding callback handler # that allows the decoding of the invalid sequence # "\xc0\x80" and returns "\x00" instead of raising an error. # All other illegal sequences will be handled strictly. def relaxedutf8(exc): if not isinstance(exc, UnicodeDecodeError): raise TypeError("don't know how to handle %r" % exc) if exc.object[exc.start:exc.start+2] == b"\xc0\x80": return ("\x00", exc.start+2) # retry after two bytes else: raise exc codecs.register_error("test.relaxedutf8", relaxedutf8) # all the "\xc0\x80" will be decoded to "\x00" sin = b"a\x00b\xc0\x80c\xc3\xbc\xc0\x80\xc0\x80" sout = "a\x00b\x00c\xfc\x00\x00" self.assertEqual(sin.decode("utf-8", "test.relaxedutf8"), sout) # "\xc0\x81" is not valid and a UnicodeDecodeError will be raised sin = b"\xc0\x80\xc0\x81" self.assertRaises(UnicodeDecodeError, sin.decode, "utf-8", "test.relaxedutf8") def test_charmapencode(self): # For charmap encodings the replacement string will be # mapped through the encoding again. This means, that # to be able to use e.g. the "replace" handler, the # charmap has to have a mapping for "?". charmap = dict((ord(c), bytes(2*c.upper(), 'ascii')) for c in "abcdefgh") sin = "abc" sout = b"AABBCC" self.assertEqual(codecs.charmap_encode(sin, "strict", charmap)[0], sout) sin = "abcA" self.assertRaises(UnicodeError, codecs.charmap_encode, sin, "strict", charmap) charmap[ord("?")] = b"XYZ" sin = "abcDEF" sout = b"AABBCCXYZXYZXYZ" self.assertEqual(codecs.charmap_encode(sin, "replace", charmap)[0], sout) charmap[ord("?")] = "XYZ" # wrong type in mapping self.assertRaises(TypeError, codecs.charmap_encode, sin, "replace", charmap) def test_callbacks(self): def handler1(exc): r = range(exc.start, exc.end) if isinstance(exc, UnicodeEncodeError): l = ["<%d>" % ord(exc.object[pos]) for pos in r] elif isinstance(exc, UnicodeDecodeError): l = ["<%d>" % exc.object[pos] for pos in r] else: raise TypeError("don't know how to handle %r" % exc) return ("[%s]" % "".join(l), exc.end) codecs.register_error("test.handler1", handler1) def handler2(exc): if not isinstance(exc, UnicodeDecodeError): raise TypeError("don't know how to handle %r" % exc) l = ["<%d>" % exc.object[pos] for pos in range(exc.start, exc.end)] return ("[%s]" % "".join(l), exc.end+1) # skip one character codecs.register_error("test.handler2", handler2) s = b"\x00\x81\x7f\x80\xff" self.assertEqual( s.decode("ascii", "test.handler1"), "\x00[<129>]\x7f[<128>][<255>]" ) self.assertEqual( s.decode("ascii", "test.handler2"), "\x00[<129>][<128>]" ) self.assertEqual( b"\\u3042\\u3xxx".decode("unicode-escape", "test.handler1"), "\u3042[<92><117><51>]xxx" ) self.assertEqual( b"\\u3042\\u3xx".decode("unicode-escape", "test.handler1"), "\u3042[<92><117><51>]xx" ) self.assertEqual( codecs.charmap_decode(b"abc", "test.handler1", {ord("a"): "z"})[0], "z[<98>][<99>]" ) self.assertEqual( "g\xfc\xdfrk".encode("ascii", "test.handler1"), b"g[<252><223>]rk" ) self.assertEqual( "g\xfc\xdf".encode("ascii", "test.handler1"), b"g[<252><223>]" ) def test_longstrings(self): # test long strings to check for memory overflow problems errors = [ "strict", "ignore", "replace", "xmlcharrefreplace", "backslashreplace", "namereplace"] # register the handlers under different names, # to prevent the codec from recognizing the name for err in errors: codecs.register_error("test." + err, codecs.lookup_error(err)) l = 1000 errors += [ "test." + err for err in errors ] for uni in [ s*l for s in ("x", "\u3042", "a\xe4") ]: for enc in ("ascii", "latin-1", "iso-8859-1", "iso-8859-15", "utf-8", "utf-7", "utf-16", "utf-32"): for err in errors: try: uni.encode(enc, err) except UnicodeError: pass def check_exceptionobjectargs(self, exctype, args, msg): # Test UnicodeError subclasses: construction, attribute assignment and __str__ conversion # check with one missing argument self.assertRaises(TypeError, exctype, *args[:-1]) # check with one argument too much self.assertRaises(TypeError, exctype, *(args + ["too much"])) # check with one argument of the wrong type wrongargs = [ "spam", b"eggs", b"spam", 42, 1.0, None ] for i in range(len(args)): for wrongarg in wrongargs: if type(wrongarg) is type(args[i]): continue # build argument array callargs = [] for j in range(len(args)): if i==j: callargs.append(wrongarg) else: callargs.append(args[i]) self.assertRaises(TypeError, exctype, *callargs) # check with the correct number and type of arguments exc = exctype(*args) self.assertEqual(str(exc), msg) def test_unicodeencodeerror(self): self.check_exceptionobjectargs( UnicodeEncodeError, ["ascii", "g\xfcrk", 1, 2, "ouch"], "'ascii' codec can't encode character '\\xfc' in position 1: ouch" ) self.check_exceptionobjectargs( UnicodeEncodeError, ["ascii", "g\xfcrk", 1, 4, "ouch"], "'ascii' codec can't encode characters in position 1-3: ouch" ) self.check_exceptionobjectargs( UnicodeEncodeError, ["ascii", "\xfcx", 0, 1, "ouch"], "'ascii' codec can't encode character '\\xfc' in position 0: ouch" ) self.check_exceptionobjectargs( UnicodeEncodeError, ["ascii", "\u0100x", 0, 1, "ouch"], "'ascii' codec can't encode character '\\u0100' in position 0: ouch" ) self.check_exceptionobjectargs( UnicodeEncodeError, ["ascii", "\uffffx", 0, 1, "ouch"], "'ascii' codec can't encode character '\\uffff' in position 0: ouch" ) self.check_exceptionobjectargs( UnicodeEncodeError, ["ascii", "\U00010000x", 0, 1, "ouch"], "'ascii' codec can't encode character '\\U00010000' in position 0: ouch" ) def test_unicodedecodeerror(self): self.check_exceptionobjectargs( UnicodeDecodeError, ["ascii", bytearray(b"g\xfcrk"), 1, 2, "ouch"], "'ascii' codec can't decode byte 0xfc in position 1: ouch" ) self.check_exceptionobjectargs( UnicodeDecodeError, ["ascii", bytearray(b"g\xfcrk"), 1, 3, "ouch"], "'ascii' codec can't decode bytes in position 1-2: ouch" ) def test_unicodetranslateerror(self): self.check_exceptionobjectargs( UnicodeTranslateError, ["g\xfcrk", 1, 2, "ouch"], "can't translate character '\\xfc' in position 1: ouch" ) self.check_exceptionobjectargs( UnicodeTranslateError, ["g\u0100rk", 1, 2, "ouch"], "can't translate character '\\u0100' in position 1: ouch" ) self.check_exceptionobjectargs( UnicodeTranslateError, ["g\uffffrk", 1, 2, "ouch"], "can't translate character '\\uffff' in position 1: ouch" ) self.check_exceptionobjectargs( UnicodeTranslateError, ["g\U00010000rk", 1, 2, "ouch"], "can't translate character '\\U00010000' in position 1: ouch" ) self.check_exceptionobjectargs( UnicodeTranslateError, ["g\xfcrk", 1, 3, "ouch"], "can't translate characters in position 1-2: ouch" ) def test_badandgoodstrictexceptions(self): # "strict" complains about a non-exception passed in self.assertRaises( TypeError, codecs.strict_errors, 42 ) # "strict" complains about the wrong exception type self.assertRaises( Exception, codecs.strict_errors, Exception("ouch") ) # If the correct exception is passed in, "strict" raises it self.assertRaises( UnicodeEncodeError, codecs.strict_errors, UnicodeEncodeError("ascii", "\u3042", 0, 1, "ouch") ) self.assertRaises( UnicodeDecodeError, codecs.strict_errors, UnicodeDecodeError("ascii", bytearray(b"\xff"), 0, 1, "ouch") ) self.assertRaises( UnicodeTranslateError, codecs.strict_errors, UnicodeTranslateError("\u3042", 0, 1, "ouch") ) def test_badandgoodignoreexceptions(self): # "ignore" complains about a non-exception passed in self.assertRaises( TypeError, codecs.ignore_errors, 42 ) # "ignore" complains about the wrong exception type self.assertRaises( TypeError, codecs.ignore_errors, UnicodeError("ouch") ) # If the correct exception is passed in, "ignore" returns an empty replacement self.assertEqual( codecs.ignore_errors( UnicodeEncodeError("ascii", "a\u3042b", 1, 2, "ouch")), ("", 2) ) self.assertEqual( codecs.ignore_errors( UnicodeDecodeError("ascii", bytearray(b"a\xffb"), 1, 2, "ouch")), ("", 2) ) self.assertEqual( codecs.ignore_errors( UnicodeTranslateError("a\u3042b", 1, 2, "ouch")), ("", 2) ) def test_badandgoodreplaceexceptions(self): # "replace" complains about a non-exception passed in self.assertRaises( TypeError, codecs.replace_errors, 42 ) # "replace" complains about the wrong exception type self.assertRaises( TypeError, codecs.replace_errors, UnicodeError("ouch") ) self.assertRaises( TypeError, codecs.replace_errors, BadObjectUnicodeEncodeError() ) self.assertRaises( TypeError, codecs.replace_errors, BadObjectUnicodeDecodeError() ) # With the correct exception, "replace" returns an "?" or "\ufffd" replacement self.assertEqual( codecs.replace_errors( UnicodeEncodeError("ascii", "a\u3042b", 1, 2, "ouch")), ("?", 2) ) self.assertEqual( codecs.replace_errors( UnicodeDecodeError("ascii", bytearray(b"a\xffb"), 1, 2, "ouch")), ("\ufffd", 2) ) self.assertEqual( codecs.replace_errors( UnicodeTranslateError("a\u3042b", 1, 2, "ouch")), ("\ufffd", 2) ) def test_badandgoodxmlcharrefreplaceexceptions(self): # "xmlcharrefreplace" complains about a non-exception passed in self.assertRaises( TypeError, codecs.xmlcharrefreplace_errors, 42 ) # "xmlcharrefreplace" complains about the wrong exception types self.assertRaises( TypeError, codecs.xmlcharrefreplace_errors, UnicodeError("ouch") ) # "xmlcharrefreplace" can only be used for encoding self.assertRaises( TypeError, codecs.xmlcharrefreplace_errors, UnicodeDecodeError("ascii", bytearray(b"\xff"), 0, 1, "ouch") ) self.assertRaises( TypeError, codecs.xmlcharrefreplace_errors, UnicodeTranslateError("\u3042", 0, 1, "ouch") ) # Use the correct exception cs = (0, 1, 9, 10, 99, 100, 999, 1000, 9999, 10000, 99999, 100000, 999999, 1000000) cs += (0xd800, 0xdfff) s = "".join(chr(c) for c in cs) self.assertEqual( codecs.xmlcharrefreplace_errors( UnicodeEncodeError("ascii", "a" + s + "b", 1, 1 + len(s), "ouch") ), ("".join("&#%d;" % c for c in cs), 1 + len(s)) ) def test_badandgoodbackslashreplaceexceptions(self): # "backslashreplace" complains about a non-exception passed in self.assertRaises( TypeError, codecs.backslashreplace_errors, 42 ) # "backslashreplace" complains about the wrong exception types self.assertRaises( TypeError, codecs.backslashreplace_errors, UnicodeError("ouch") ) # Use the correct exception tests = [ ("\u3042", "\\u3042"), ("\n", "\\x0a"), ("a", "\\x61"), ("\x00", "\\x00"), ("\xff", "\\xff"), ("\u0100", "\\u0100"), ("\uffff", "\\uffff"), ("\U00010000", "\\U00010000"), ("\U0010ffff", "\\U0010ffff"), # Lone surrogates ("\ud800", "\\ud800"), ("\udfff", "\\udfff"), ("\ud800\udfff", "\\ud800\\udfff"), ] for s, r in tests: with self.subTest(str=s): self.assertEqual( codecs.backslashreplace_errors( UnicodeEncodeError("ascii", "a" + s + "b", 1, 1 + len(s), "ouch")), (r, 1 + len(s)) ) self.assertEqual( codecs.backslashreplace_errors( UnicodeTranslateError("a" + s + "b", 1, 1 + len(s), "ouch")), (r, 1 + len(s)) ) tests = [ (b"a", "\\x61"), (b"\n", "\\x0a"), (b"\x00", "\\x00"), (b"\xff", "\\xff"), ] for b, r in tests: with self.subTest(bytes=b): self.assertEqual( codecs.backslashreplace_errors( UnicodeDecodeError("ascii", bytearray(b"a" + b + b"b"), 1, 2, "ouch")), (r, 2) ) def test_badandgoodnamereplaceexceptions(self): # "namereplace" complains about a non-exception passed in self.assertRaises( TypeError, codecs.namereplace_errors, 42 ) # "namereplace" complains about the wrong exception types self.assertRaises( TypeError, codecs.namereplace_errors, UnicodeError("ouch") ) # "namereplace" can only be used for encoding self.assertRaises( TypeError, codecs.namereplace_errors, UnicodeDecodeError("ascii", bytearray(b"\xff"), 0, 1, "ouch") ) self.assertRaises( TypeError, codecs.namereplace_errors, UnicodeTranslateError("\u3042", 0, 1, "ouch") ) # Use the correct exception tests = [ ("\u3042", "\\N{HIRAGANA LETTER A}"), ("\x00", "\\x00"), ("\ufbf9", "\\N{ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH " "HAMZA ABOVE WITH ALEF MAKSURA ISOLATED FORM}"), ("\U000e007f", "\\N{CANCEL TAG}"), ("\U0010ffff", "\\U0010ffff"), # Lone surrogates ("\ud800", "\\ud800"), ("\udfff", "\\udfff"), ("\ud800\udfff", "\\ud800\\udfff"), ] for s, r in tests: with self.subTest(str=s): self.assertEqual( codecs.namereplace_errors( UnicodeEncodeError("ascii", "a" + s + "b", 1, 1 + len(s), "ouch")), (r, 1 + len(s)) ) def test_badandgoodsurrogateescapeexceptions(self): surrogateescape_errors = codecs.lookup_error('surrogateescape') # "surrogateescape" complains about a non-exception passed in self.assertRaises( TypeError, surrogateescape_errors, 42 ) # "surrogateescape" complains about the wrong exception types self.assertRaises( TypeError, surrogateescape_errors, UnicodeError("ouch") ) # "surrogateescape" can not be used for translating self.assertRaises( TypeError, surrogateescape_errors, UnicodeTranslateError("\udc80", 0, 1, "ouch") ) # Use the correct exception for s in ("a", "\udc7f", "\udd00"): with self.subTest(str=s): self.assertRaises( UnicodeEncodeError, surrogateescape_errors, UnicodeEncodeError("ascii", s, 0, 1, "ouch") ) self.assertEqual( surrogateescape_errors( UnicodeEncodeError("ascii", "a\udc80b", 1, 2, "ouch")), (b"\x80", 2) ) self.assertRaises( UnicodeDecodeError, surrogateescape_errors, UnicodeDecodeError("ascii", bytearray(b"a"), 0, 1, "ouch") ) self.assertEqual( surrogateescape_errors( UnicodeDecodeError("ascii", bytearray(b"a\x80b"), 1, 2, "ouch")), ("\udc80", 2) ) def test_badandgoodsurrogatepassexceptions(self): surrogatepass_errors = codecs.lookup_error('surrogatepass') # "surrogatepass" complains about a non-exception passed in self.assertRaises( TypeError, surrogatepass_errors, 42 ) # "surrogatepass" complains about the wrong exception types self.assertRaises( TypeError, surrogatepass_errors, UnicodeError("ouch") ) # "surrogatepass" can not be used for translating self.assertRaises( TypeError, surrogatepass_errors, UnicodeTranslateError("\ud800", 0, 1, "ouch") ) # Use the correct exception for enc in ("utf-8", "utf-16le", "utf-16be", "utf-32le", "utf-32be"): with self.subTest(encoding=enc): self.assertRaises( UnicodeEncodeError, surrogatepass_errors, UnicodeEncodeError(enc, "a", 0, 1, "ouch") ) self.assertRaises( UnicodeDecodeError, surrogatepass_errors, UnicodeDecodeError(enc, "a".encode(enc), 0, 1, "ouch") ) for s in ("\ud800", "\udfff", "\ud800\udfff"): with self.subTest(str=s): self.assertRaises( UnicodeEncodeError, surrogatepass_errors, UnicodeEncodeError("ascii", s, 0, len(s), "ouch") ) tests = [ ("utf-8", "\ud800", b'\xed\xa0\x80', 3), ("utf-16le", "\ud800", b'\x00\xd8', 2), ("utf-16be", "\ud800", b'\xd8\x00', 2), ("utf-32le", "\ud800", b'\x00\xd8\x00\x00', 4), ("utf-32be", "\ud800", b'\x00\x00\xd8\x00', 4), ("utf-8", "\udfff", b'\xed\xbf\xbf', 3), ("utf-16le", "\udfff", b'\xff\xdf', 2), ("utf-16be", "\udfff", b'\xdf\xff', 2), ("utf-32le", "\udfff", b'\xff\xdf\x00\x00', 4), ("utf-32be", "\udfff", b'\x00\x00\xdf\xff', 4), ("utf-8", "\ud800\udfff", b'\xed\xa0\x80\xed\xbf\xbf', 3), ("utf-16le", "\ud800\udfff", b'\x00\xd8\xff\xdf', 2), ("utf-16be", "\ud800\udfff", b'\xd8\x00\xdf\xff', 2), ("utf-32le", "\ud800\udfff", b'\x00\xd8\x00\x00\xff\xdf\x00\x00', 4), ("utf-32be", "\ud800\udfff", b'\x00\x00\xd8\x00\x00\x00\xdf\xff', 4), ] for enc, s, b, n in tests: with self.subTest(encoding=enc, str=s, bytes=b): self.assertEqual( surrogatepass_errors( UnicodeEncodeError(enc, "a" + s + "b", 1, 1 + len(s), "ouch")), (b, 1 + len(s)) ) self.assertEqual( surrogatepass_errors( UnicodeDecodeError(enc, bytearray(b"a" + b[:n] + b"b"), 1, 1 + n, "ouch")), (s[:1], 1 + n) ) def test_badhandlerresults(self): results = ( 42, "foo", (1,2,3), ("foo", 1, 3), ("foo", None), ("foo",), ("foo", 1, 3), ("foo", None), ("foo",) ) encs = ("ascii", "latin-1", "iso-8859-1", "iso-8859-15") for res in results: codecs.register_error("test.badhandler", lambda x: res) for enc in encs: self.assertRaises( TypeError, "\u3042".encode, enc, "test.badhandler" ) for (enc, bytes) in ( ("ascii", b"\xff"), ("utf-8", b"\xff"), ("utf-7", b"+x-"), ): self.assertRaises( TypeError, bytes.decode, enc, "test.badhandler" ) def test_lookup(self): self.assertEqual(codecs.strict_errors, codecs.lookup_error("strict")) self.assertEqual(codecs.ignore_errors, codecs.lookup_error("ignore")) self.assertEqual(codecs.strict_errors, codecs.lookup_error("strict")) self.assertEqual( codecs.xmlcharrefreplace_errors, codecs.lookup_error("xmlcharrefreplace") ) self.assertEqual( codecs.backslashreplace_errors, codecs.lookup_error("backslashreplace") ) self.assertEqual( codecs.namereplace_errors, codecs.lookup_error("namereplace") ) def test_encode_nonascii_replacement(self): def handle(exc): if isinstance(exc, UnicodeEncodeError): return (repl, exc.end) raise TypeError("don't know how to handle %r" % exc) codecs.register_error("test.replacing", handle) for enc, input, repl in ( ("ascii", "[¤]", "abc"), ("iso-8859-1", "[€]", "½¾"), ("iso-8859-15", "[¤]", "œŸ"), ): res = input.encode(enc, "test.replacing") self.assertEqual(res, ("[" + repl + "]").encode(enc)) for enc, input, repl in ( ("utf-8", "[\udc80]", "\U0001f40d"), ("utf-16", "[\udc80]", "\U0001f40d"), ("utf-32", "[\udc80]", "\U0001f40d"), ): with self.subTest(encoding=enc): with self.assertRaises(UnicodeEncodeError) as cm: input.encode(enc, "test.replacing") exc = cm.exception self.assertEqual(exc.start, 1) self.assertEqual(exc.end, 2) self.assertEqual(exc.object, input) def test_encode_unencodable_replacement(self): def unencrepl(exc): if isinstance(exc, UnicodeEncodeError): return (repl, exc.end) else: raise TypeError("don't know how to handle %r" % exc) codecs.register_error("test.unencreplhandler", unencrepl) for enc, input, repl in ( ("ascii", "[¤]", "½"), ("iso-8859-1", "[€]", "œ"), ("iso-8859-15", "[¤]", "½"), ("utf-8", "[\udc80]", "\udcff"), ("utf-16", "[\udc80]", "\udcff"), ("utf-32", "[\udc80]", "\udcff"), ): with self.subTest(encoding=enc): with self.assertRaises(UnicodeEncodeError) as cm: input.encode(enc, "test.unencreplhandler") exc = cm.exception self.assertEqual(exc.start, 1) self.assertEqual(exc.end, 2) self.assertEqual(exc.object, input) def test_encode_bytes_replacement(self): def handle(exc): if isinstance(exc, UnicodeEncodeError): return (repl, exc.end) raise TypeError("don't know how to handle %r" % exc) codecs.register_error("test.replacing", handle) # It works even if the bytes sequence is not decodable. for enc, input, repl in ( ("ascii", "[¤]", b"\xbd\xbe"), ("iso-8859-1", "[€]", b"\xbd\xbe"), ("iso-8859-15", "[¤]", b"\xbd\xbe"), ("utf-8", "[\udc80]", b"\xbd\xbe"), ("utf-16le", "[\udc80]", b"\xbd\xbe"), ("utf-16be", "[\udc80]", b"\xbd\xbe"), ("utf-32le", "[\udc80]", b"\xbc\xbd\xbe\xbf"), ("utf-32be", "[\udc80]", b"\xbc\xbd\xbe\xbf"), ): with self.subTest(encoding=enc): res = input.encode(enc, "test.replacing") self.assertEqual(res, "[".encode(enc) + repl + "]".encode(enc)) def test_encode_odd_bytes_replacement(self): def handle(exc): if isinstance(exc, UnicodeEncodeError): return (repl, exc.end) raise TypeError("don't know how to handle %r" % exc) codecs.register_error("test.replacing", handle) input = "[\udc80]" # Tests in which the replacement bytestring contains not whole number # of code units. for enc, repl in ( *itertools.product(("utf-16le", "utf-16be"), [b"a", b"abc"]), *itertools.product(("utf-32le", "utf-32be"), [b"a", b"ab", b"abc", b"abcde"]), ): with self.subTest(encoding=enc, repl=repl): with self.assertRaises(UnicodeEncodeError) as cm: input.encode(enc, "test.replacing") exc = cm.exception self.assertEqual(exc.start, 1) self.assertEqual(exc.end, 2) self.assertEqual(exc.object, input) self.assertEqual(exc.reason, "surrogates not allowed") def test_badregistercall(self): # enhance coverage of: # Modules/_codecsmodule.c::register_error() # Python/codecs.c::PyCodec_RegisterError() self.assertRaises(TypeError, codecs.register_error, 42) self.assertRaises(TypeError, codecs.register_error, "test.dummy", 42) def test_badlookupcall(self): # enhance coverage of: # Modules/_codecsmodule.c::lookup_error() self.assertRaises(TypeError, codecs.lookup_error) def test_unknownhandler(self): # enhance coverage of: # Modules/_codecsmodule.c::lookup_error() self.assertRaises(LookupError, codecs.lookup_error, "test.unknown") def test_xmlcharrefvalues(self): # enhance coverage of: # Python/codecs.c::PyCodec_XMLCharRefReplaceErrors() # and inline implementations v = (1, 5, 10, 50, 100, 500, 1000, 5000, 10000, 50000, 100000, 500000, 1000000) s = "".join([chr(x) for x in v]) codecs.register_error("test.xmlcharrefreplace", codecs.xmlcharrefreplace_errors) for enc in ("ascii", "iso-8859-15"): for err in ("xmlcharrefreplace", "test.xmlcharrefreplace"): s.encode(enc, err) def test_decodehelper(self): # enhance coverage of: # Objects/unicodeobject.c::unicode_decode_call_errorhandler() # and callers self.assertRaises(LookupError, b"\xff".decode, "ascii", "test.unknown") def baddecodereturn1(exc): return 42 codecs.register_error("test.baddecodereturn1", baddecodereturn1) self.assertRaises(TypeError, b"\xff".decode, "ascii", "test.baddecodereturn1") self.assertRaises(TypeError, b"\\".decode, "unicode-escape", "test.baddecodereturn1") self.assertRaises(TypeError, b"\\x0".decode, "unicode-escape", "test.baddecodereturn1") self.assertRaises(TypeError, b"\\x0y".decode, "unicode-escape", "test.baddecodereturn1") self.assertRaises(TypeError, b"\\Uffffeeee".decode, "unicode-escape", "test.baddecodereturn1") self.assertRaises(TypeError, b"\\uyyyy".decode, "raw-unicode-escape", "test.baddecodereturn1") def baddecodereturn2(exc): return ("?", None) codecs.register_error("test.baddecodereturn2", baddecodereturn2) self.assertRaises(TypeError, b"\xff".decode, "ascii", "test.baddecodereturn2") handler = PosReturn() codecs.register_error("test.posreturn", handler.handle) # Valid negative position handler.pos = -1 self.assertEqual(b"\xff0".decode("ascii", "test.posreturn"), "0") # Valid negative position handler.pos = -2 self.assertEqual(b"\xff0".decode("ascii", "test.posreturn"), "") # Negative position out of bounds handler.pos = -3 self.assertRaises(IndexError, b"\xff0".decode, "ascii", "test.posreturn") # Valid positive position handler.pos = 1 self.assertEqual(b"\xff0".decode("ascii", "test.posreturn"), "0") # Largest valid positive position (one beyond end of input) handler.pos = 2 self.assertEqual(b"\xff0".decode("ascii", "test.posreturn"), "") # Invalid positive position handler.pos = 3 self.assertRaises(IndexError, b"\xff0".decode, "ascii", "test.posreturn") # Restart at the "0" handler.pos = 6 self.assertEqual(b"\\uyyyy0".decode("raw-unicode-escape", "test.posreturn"), "0") class D(dict): def __getitem__(self, key): raise ValueError self.assertRaises(UnicodeError, codecs.charmap_decode, b"\xff", "strict", {0xff: None}) self.assertRaises(ValueError, codecs.charmap_decode, b"\xff", "strict", D()) self.assertRaises(TypeError, codecs.charmap_decode, b"\xff", "strict", {0xff: sys.maxunicode+1}) def test_encodehelper(self): # enhance coverage of: # Objects/unicodeobject.c::unicode_encode_call_errorhandler() # and callers self.assertRaises(LookupError, "\xff".encode, "ascii", "test.unknown") def badencodereturn1(exc): return 42 codecs.register_error("test.badencodereturn1", badencodereturn1) self.assertRaises(TypeError, "\xff".encode, "ascii", "test.badencodereturn1") def badencodereturn2(exc): return ("?", None) codecs.register_error("test.badencodereturn2", badencodereturn2) self.assertRaises(TypeError, "\xff".encode, "ascii", "test.badencodereturn2") handler = PosReturn() codecs.register_error("test.posreturn", handler.handle) # Valid negative position handler.pos = -1 self.assertEqual("\xff0".encode("ascii", "test.posreturn"), b"0") # Valid negative position handler.pos = -2 self.assertEqual("\xff0".encode("ascii", "test.posreturn"), b"") # Negative position out of bounds handler.pos = -3 self.assertRaises(IndexError, "\xff0".encode, "ascii", "test.posreturn") # Valid positive position handler.pos = 1 self.assertEqual("\xff0".encode("ascii", "test.posreturn"), b"0") # Largest valid positive position (one beyond end of input handler.pos = 2 self.assertEqual("\xff0".encode("ascii", "test.posreturn"), b"") # Invalid positive position handler.pos = 3 self.assertRaises(IndexError, "\xff0".encode, "ascii", "test.posreturn") handler.pos = 0 class D(dict): def __getitem__(self, key): raise ValueError for err in ("strict", "replace", "xmlcharrefreplace", "backslashreplace", "namereplace", "test.posreturn"): self.assertRaises(UnicodeError, codecs.charmap_encode, "\xff", err, {0xff: None}) self.assertRaises(ValueError, codecs.charmap_encode, "\xff", err, D()) self.assertRaises(TypeError, codecs.charmap_encode, "\xff", err, {0xff: 300}) def test_decodehelper_bug36819(self): handler = RepeatedPosReturn("x") codecs.register_error("test.bug36819", handler.handle) testcases = [ ("ascii", b"\xff"), ("utf-8", b"\xff"), ("utf-16be", b'\xdc\x80'), ("utf-32be", b'\x00\x00\xdc\x80'), ("iso-8859-6", b"\xff"), ] for enc, bad in testcases: input = "abcd".encode(enc) + bad with self.subTest(encoding=enc): handler.count = 50 decoded = input.decode(enc, "test.bug36819") self.assertEqual(decoded, 'abcdx' * 51) def test_encodehelper_bug36819(self): handler = RepeatedPosReturn() codecs.register_error("test.bug36819", handler.handle) input = "abcd\udc80" encodings = ["ascii", "latin1", "utf-8", "utf-16", "utf-32"] # built-in encodings += ["iso-8859-15"] # charmap codec if sys.platform == 'win32': encodings = ["mbcs", "oem"] # code page codecs handler.repl = "\udcff" for enc in encodings: with self.subTest(encoding=enc): handler.count = 50 with self.assertRaises(UnicodeEncodeError) as cm: input.encode(enc, "test.bug36819") exc = cm.exception self.assertEqual(exc.start, 4) self.assertEqual(exc.end, 5) self.assertEqual(exc.object, input) if sys.platform == "win32": handler.count = 50 with self.assertRaises(UnicodeEncodeError) as cm: codecs.code_page_encode(437, input, "test.bug36819") exc = cm.exception self.assertEqual(exc.start, 4) self.assertEqual(exc.end, 5) self.assertEqual(exc.object, input) handler.repl = "x" for enc in encodings: with self.subTest(encoding=enc): # The interpreter should segfault after a handful of attempts. # 50 was chosen to try to ensure a segfault without a fix, # but not OOM a machine with one. handler.count = 50 encoded = input.encode(enc, "test.bug36819") self.assertEqual(encoded.decode(enc), "abcdx" * 51) if sys.platform == "win32": handler.count = 50 encoded = codecs.code_page_encode(437, input, "test.bug36819") self.assertEqual(encoded[0].decode(), "abcdx" * 51) self.assertEqual(encoded[1], len(input)) def test_translatehelper(self): # enhance coverage of: # Objects/unicodeobject.c::unicode_encode_call_errorhandler() # and callers # (Unfortunately the errors argument is not directly accessible # from Python, so we can't test that much) class D(dict): def __getitem__(self, key): raise ValueError #self.assertRaises(ValueError, "\xff".translate, D()) self.assertRaises(ValueError, "\xff".translate, {0xff: sys.maxunicode+1}) self.assertRaises(TypeError, "\xff".translate, {0xff: ()}) def test_bug828737(self): charmap = { ord("&"): "&", ord("<"): "<", ord(">"): ">", ord('"'): """, } for n in (1, 10, 100, 1000): text = 'abcghi'*n text.translate(charmap) def test_mutatingdecodehandler(self): baddata = [ ("ascii", b"\xff"), ("utf-7", b"++"), ("utf-8", b"\xff"), ("utf-16", b"\xff"), ("utf-32", b"\xff"), ("unicode-escape", b"\\u123g"), ("raw-unicode-escape", b"\\u123g"), ] def replacing(exc): if isinstance(exc, UnicodeDecodeError): exc.object = 42 return ("\u4242", 0) else: raise TypeError("don't know how to handle %r" % exc) codecs.register_error("test.replacing", replacing) for (encoding, data) in baddata: with self.assertRaises(TypeError): data.decode(encoding, "test.replacing") def mutating(exc): if isinstance(exc, UnicodeDecodeError): exc.object = b"" return ("\u4242", 0) else: raise TypeError("don't know how to handle %r" % exc) codecs.register_error("test.mutating", mutating) # If the decoder doesn't pick up the modified input the following # will lead to an endless loop for (encoding, data) in baddata: self.assertEqual(data.decode(encoding, "test.mutating"), "\u4242") # issue32583 def test_crashing_decode_handler(self): # better generating one more character to fill the extra space slot # so in debug build it can steadily fail def forward_shorter_than_end(exc): if isinstance(exc, UnicodeDecodeError): # size one character, 0 < forward < exc.end return ('\ufffd', exc.start+1) else: raise TypeError("don't know how to handle %r" % exc) codecs.register_error( "test.forward_shorter_than_end", forward_shorter_than_end) self.assertEqual( b'\xd8\xd8\xd8\xd8\xd8\x00\x00\x00'.decode( 'utf-16-le', 'test.forward_shorter_than_end'), '\ufffd\ufffd\ufffd\ufffd\xd8\x00' ) self.assertEqual( b'\xd8\xd8\xd8\xd8\x00\xd8\x00\x00'.decode( 'utf-16-be', 'test.forward_shorter_than_end'), '\ufffd\ufffd\ufffd\ufffd\xd8\x00' ) self.assertEqual( b'\x11\x11\x11\x11\x11\x00\x00\x00\x00\x00\x00'.decode( 'utf-32-le', 'test.forward_shorter_than_end'), '\ufffd\ufffd\ufffd\u1111\x00' ) self.assertEqual( b'\x11\x11\x11\x00\x00\x11\x11\x00\x00\x00\x00'.decode( 'utf-32-be', 'test.forward_shorter_than_end'), '\ufffd\ufffd\ufffd\u1111\x00' ) def replace_with_long(exc): if isinstance(exc, UnicodeDecodeError): exc.object = b"\x00" * 8 return ('\ufffd', exc.start) else: raise TypeError("don't know how to handle %r" % exc) codecs.register_error("test.replace_with_long", replace_with_long) self.assertEqual( b'\x00'.decode('utf-16', 'test.replace_with_long'), '\ufffd\x00\x00\x00\x00' ) self.assertEqual( b'\x00'.decode('utf-32', 'test.replace_with_long'), '\ufffd\x00\x00' ) def test_fake_error_class(self): handlers = [ codecs.strict_errors, codecs.ignore_errors, codecs.replace_errors, codecs.backslashreplace_errors, codecs.namereplace_errors, codecs.xmlcharrefreplace_errors, codecs.lookup_error('surrogateescape'), codecs.lookup_error('surrogatepass'), ] for cls in UnicodeEncodeError, UnicodeDecodeError, UnicodeTranslateError: class FakeUnicodeError(str): __class__ = cls for handler in handlers: with self.subTest(handler=handler, error_class=cls): self.assertRaises(TypeError, handler, FakeUnicodeError()) class FakeUnicodeError(Exception): __class__ = cls for handler in handlers: with self.subTest(handler=handler, error_class=cls): with self.assertRaises((TypeError, FakeUnicodeError)): handler(FakeUnicodeError()) def test_reject_unregister_builtin_error_handler(self): for name in [ 'strict', 'ignore', 'replace', 'backslashreplace', 'namereplace', 'xmlcharrefreplace', 'surrogateescape', 'surrogatepass', ]: with self.subTest(name): self.assertRaises(ValueError, _codecs_unregister_error, name) def test_unregister_custom_error_handler(self): def custom_handler(exc): raise exc custom_name = 'test.test_unregister_custom_error_handler' self.assertRaises(LookupError, codecs.lookup_error, custom_name) codecs.register_error(custom_name, custom_handler) self.assertIs(codecs.lookup_error(custom_name), custom_handler) self.assertTrue(_codecs_unregister_error(custom_name)) self.assertRaises(LookupError, codecs.lookup_error, custom_name) def test_unregister_custom_unknown_error_handler(self): unknown_name = 'test.test_unregister_custom_unknown_error_handler' self.assertRaises(LookupError, codecs.lookup_error, unknown_name) self.assertFalse(_codecs_unregister_error(unknown_name)) self.assertRaises(LookupError, codecs.lookup_error, unknown_name) if __name__ == "__main__": unittest.main()