diff --git a/Doc/lib/libcodecs.tex b/Doc/lib/libcodecs.tex index 9f77955a101..136c5289923 100644 --- a/Doc/lib/libcodecs.tex +++ b/Doc/lib/libcodecs.tex @@ -142,16 +142,21 @@ for reading and writing to platform dependent files: \begin{datadesc}{BOM} \dataline{BOM_BE} \dataline{BOM_LE} -\dataline{BOM32_BE} -\dataline{BOM32_LE} -\dataline{BOM64_BE} -\dataline{BOM64_LE} -These constants define the byte order marks (BOM) used in data -streams to indicate the byte order used in the stream or file. -\constant{BOM} is either \constant{BOM_BE} or \constant{BOM_LE} -depending on the platform's native byte order, while the others -represent big endian (\samp{_BE} suffix) and little endian -(\samp{_LE} suffix) byte order using 32-bit and 64-bit encodings. +\dataline{BOM_UTF8} +\dataline{BOM_UTF16} +\dataline{BOM_UTF16_BE} +\dataline{BOM_UTF16_LE} +\dataline{BOM_UTF32} +\dataline{BOM_UTF32_BE} +\dataline{BOM_UTF32_LE} +These constants define various encodings of the Unicode byte order mark +(BOM) used in UTF-16 and UTF-32 data streams to indicate the byte order +used in the stream or file and in UTF-8 as a Unicode signature. +\constant{BOM_UTF16} is either \constant{BOM_UTF16_BE} or +\constant{BOM_UTF16_LE} depending on the platform's native byte order. +\constant{BOM} is an alias for \constant{BOM_UTF16}, \constant{BOM_LE} +for \constant{BOM_UTF16_LE} and \constant{BOM_BE} for \constant{BOM_UTF16_BE}. +The others represent the BOM in UTF-8 and UTF-32 encodings. 
\end{datadesc} diff --git a/Lib/codecs.py b/Lib/codecs.py index 9178db9c3e6..b089e907662 100644 --- a/Lib/codecs.py +++ b/Lib/codecs.py @@ -18,29 +18,44 @@ except ImportError, why: 'Failed to load the builtin codecs: %s' % why __all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE", - "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE"] + "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE", + "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE", + "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE"] ### Constants # -# Byte Order Mark (BOM) and its possible values (BOM_BE, BOM_LE) +# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF) +# and its possible byte string values +# for UTF8/UTF16/UTF32 output and little/big endian machines # -BOM = struct.pack('=H', 0xFEFF) -# -BOM_BE = BOM32_BE = '\376\377' -# corresponds to Unicode U+FEFF in UTF-16 on big endian -# platforms == ZERO WIDTH NO-BREAK SPACE -BOM_LE = BOM32_LE = '\377\376' -# corresponds to Unicode U+FFFE in UTF-16 on little endian -# platforms == defined as being an illegal Unicode character -# -# 64-bit Byte Order Marks -# -BOM64_BE = '\000\000\376\377' -# corresponds to Unicode U+0000FEFF in UCS-4 -BOM64_LE = '\377\376\000\000' -# corresponds to Unicode U+0000FFFE in UCS-4 +# UTF-8 +BOM_UTF8 = '\xef\xbb\xbf' + +# UTF-16, little endian +BOM_LE = BOM_UTF16_LE = '\xff\xfe' + +# UTF-16, big endian +BOM_BE = BOM_UTF16_BE = '\xfe\xff' + +# UTF-32, little endian +BOM_UTF32_LE = '\xff\xfe\x00\x00' + +# UTF-32, big endian +BOM_UTF32_BE = '\x00\x00\xfe\xff' + +# UTF-16, native endianness +BOM = BOM_UTF16 = struct.pack('=H', 0xFEFF) + +# UTF-32, native endianness +BOM_UTF32 = struct.pack('=L', 0x0000FEFF) + +# Old broken names (don't use in new code) +BOM32_LE = BOM_UTF16_LE +BOM32_BE = BOM_UTF16_BE +BOM64_LE = BOM_UTF32_LE +BOM64_BE = BOM_UTF32_BE ### Codec base classes (defining the API) diff --git a/Misc/NEWS b/Misc/NEWS index c90dd7f69c6..7275ad4895f 100644 --- a/Misc/NEWS +++ 
b/Misc/NEWS @@ -124,6 +124,12 @@ Extension modules Library +- Constants BOM_UTF8, BOM_UTF16, BOM_UTF16_LE, BOM_UTF16_BE, + BOM_UTF32, BOM_UTF32_LE and BOM_UTF32_BE that represent the Byte + Order Mark in UTF-8, UTF-16 and UTF-32 encodings for little and + big endian systems were added to the codecs module. The old names + BOM32_* and BOM64_* were off by a factor of 2. + - added degree/radian conversion functions to the math module. - ftplib.retrlines() now tests for callback is None rather than testing