0
0
mirror of https://github.com/python/cpython.git synced 2024-11-30 18:51:15 +01:00
cpython/Lib/_compression.py
Ma Lin f9bedb630e
bpo-41486: Faster bz2/lzma/zlib via new output buffering (GH-21740)
Faster bz2/lzma/zlib via new output buffering.
Also adds .readall() function to _compression.DecompressReader class
to take best advantage of this in the consume-all-output at once scenario.

Often a 5-20% speedup in common scenarios due to less data copying.

Contributed by Ma Lin.
2021-04-27 23:58:54 -07:00

163 lines
5.5 KiB
Python

"""Internal classes used by the gzip, lzma and bz2 modules"""
import io
import sys
BUFFER_SIZE = io.DEFAULT_BUFFER_SIZE # Compressed data read chunk size
class BaseStream(io.BufferedIOBase):
"""Mode-checking helper functions."""
def _check_not_closed(self):
if self.closed:
raise ValueError("I/O operation on closed file")
def _check_can_read(self):
if not self.readable():
raise io.UnsupportedOperation("File not open for reading")
def _check_can_write(self):
if not self.writable():
raise io.UnsupportedOperation("File not open for writing")
def _check_can_seek(self):
if not self.readable():
raise io.UnsupportedOperation("Seeking is only supported "
"on files open for reading")
if not self.seekable():
raise io.UnsupportedOperation("The underlying file object "
"does not support seeking")
class DecompressReader(io.RawIOBase):
"""Adapts the decompressor API to a RawIOBase reader API"""
def readable(self):
return True
def __init__(self, fp, decomp_factory, trailing_error=(), **decomp_args):
self._fp = fp
self._eof = False
self._pos = 0 # Current offset in decompressed stream
# Set to size of decompressed stream once it is known, for SEEK_END
self._size = -1
# Save the decompressor factory and arguments.
# If the file contains multiple compressed streams, each
# stream will need a separate decompressor object. A new decompressor
# object is also needed when implementing a backwards seek().
self._decomp_factory = decomp_factory
self._decomp_args = decomp_args
self._decompressor = self._decomp_factory(**self._decomp_args)
# Exception class to catch from decompressor signifying invalid
# trailing data to ignore
self._trailing_error = trailing_error
def close(self):
self._decompressor = None
return super().close()
def seekable(self):
return self._fp.seekable()
def readinto(self, b):
with memoryview(b) as view, view.cast("B") as byte_view:
data = self.read(len(byte_view))
byte_view[:len(data)] = data
return len(data)
def read(self, size=-1):
if size < 0:
return self.readall()
if not size or self._eof:
return b""
data = None # Default if EOF is encountered
# Depending on the input data, our call to the decompressor may not
# return any data. In this case, try again after reading another block.
while True:
if self._decompressor.eof:
rawblock = (self._decompressor.unused_data or
self._fp.read(BUFFER_SIZE))
if not rawblock:
break
# Continue to next stream.
self._decompressor = self._decomp_factory(
**self._decomp_args)
try:
data = self._decompressor.decompress(rawblock, size)
except self._trailing_error:
# Trailing data isn't a valid compressed stream; ignore it.
break
else:
if self._decompressor.needs_input:
rawblock = self._fp.read(BUFFER_SIZE)
if not rawblock:
raise EOFError("Compressed file ended before the "
"end-of-stream marker was reached")
else:
rawblock = b""
data = self._decompressor.decompress(rawblock, size)
if data:
break
if not data:
self._eof = True
self._size = self._pos
return b""
self._pos += len(data)
return data
def readall(self):
chunks = []
# sys.maxsize means the max length of output buffer is unlimited,
# so that the whole input buffer can be decompressed within one
# .decompress() call.
while data := self.read(sys.maxsize):
chunks.append(data)
return b"".join(chunks)
# Rewind the file to the beginning of the data stream.
def _rewind(self):
self._fp.seek(0)
self._eof = False
self._pos = 0
self._decompressor = self._decomp_factory(**self._decomp_args)
def seek(self, offset, whence=io.SEEK_SET):
# Recalculate offset as an absolute file position.
if whence == io.SEEK_SET:
pass
elif whence == io.SEEK_CUR:
offset = self._pos + offset
elif whence == io.SEEK_END:
# Seeking relative to EOF - we need to know the file's size.
if self._size < 0:
while self.read(io.DEFAULT_BUFFER_SIZE):
pass
offset = self._size + offset
else:
raise ValueError("Invalid value for whence: {}".format(whence))
# Make it so that offset is the number of bytes to skip forward.
if offset < self._pos:
self._rewind()
else:
offset -= self._pos
# Read and discard data until we reach the desired position.
while offset > 0:
data = self.read(min(io.DEFAULT_BUFFER_SIZE, offset))
if not data:
break
offset -= len(data)
return self._pos
def tell(self):
"""Return the current file position."""
return self._pos