diff --git a/Lib/test/test_zipimport.py b/Lib/test/test_zipimport.py
index ae497002943..e9c3218d2bb 100644
--- a/Lib/test/test_zipimport.py
+++ b/Lib/test/test_zipimport.py
@@ -1,8 +1,10 @@
 import sys
 import os
 import marshal
+import glob
 import importlib
 import importlib.util
+import re
 import struct
 import time
 import unittest
@@ -54,6 +56,7 @@ TESTPACK = "ziptestpackage"
 TESTPACK2 = "ziptestpackage2"
 TEMP_DIR = os.path.abspath("junk95142")
 TEMP_ZIP = os.path.abspath("junk95142.zip")
+TEST_DATA_DIR = os.path.join(os.path.dirname(__file__), "zipimport_data")
 
 pyc_file = importlib.util.cache_from_source(TESTMOD + '.py')
 pyc_ext = '.pyc'
@@ -134,7 +137,9 @@ class UncompressedZipImportTestCase(ImportHooksBaseTestCase):
 
     def doTest(self, expected_ext, files, *modules, **kw):
         self.makeZip(files, **kw)
+        self.doTestWithPreBuiltZip(expected_ext, *modules, **kw)
 
+    def doTestWithPreBuiltZip(self, expected_ext, *modules, **kw):
         sys.path.insert(0, TEMP_ZIP)
 
         mod = importlib.import_module(".".join(modules))
@@ -810,6 +815,122 @@ class UncompressedZipImportTestCase(ImportHooksBaseTestCase):
         files = self.getZip64Files()
         self.doTest(".py", files, "f65536", comment=b"c" * ((1 << 16) - 1))
 
+    def testZip64LargeFile(self):
+        support.requires(
+            "largefile",
+            f"test generates files >{0xFFFFFFFF} bytes and takes a long time "
+            "to run"
+        )
+
+        # N.B.: We do a lot of gymnastics below in the ZIP_STORED case to save
+        # and reconstruct a sparse zip on systems that support sparse files.
+        # Instead of creating a ~8GB zip file mainly consisting of null bytes
+        # for every run of the test, we create the zip once and save off the
+        # non-null portions of the resulting file as data blobs with offsets
+        # that allow re-creating the zip file sparsely. This drops disk space
+        # usage to ~9KB for the ZIP_STORED case and cuts the test time by ~2
+        # orders of magnitude. For the ZIP_DEFLATED case, however, we bite the
+        # bullet. The resulting zip file is ~8MB of non-null data, so the sparse
+        # trick doesn't work and would result in that full ~8MB zip data file
+        # being checked in to source control.
+        parts_glob = f"sparse-zip64-c{self.compression:d}-0x*.part"
+        full_parts_glob = os.path.join(TEST_DATA_DIR, parts_glob)
+        pre_built_zip_parts = glob.glob(full_parts_glob)
+
+        self.addCleanup(os_helper.unlink, TEMP_ZIP)
+        if not pre_built_zip_parts:
+            if self.compression != ZIP_STORED:
+                support.requires(
+                    "cpu",
+                    "test requires a lot of CPU for compression."
+                )
+            self.addCleanup(os_helper.unlink, os_helper.TESTFN)
+            with open(os_helper.TESTFN, "wb") as f:
+                f.write(b"data")
+                f.write(os.linesep.encode())
+                f.seek(0xffff_ffff, os.SEEK_CUR)
+                f.write(os.linesep.encode())
+            os.utime(os_helper.TESTFN, (0.0, 0.0))
+            with ZipFile(
+                TEMP_ZIP,
+                "w",
+                compression=self.compression,
+                strict_timestamps=False
+            ) as z:
+                z.write(os_helper.TESTFN, "data1")
+                z.writestr(
+                    ZipInfo("module.py", (1980, 1, 1, 0, 0, 0)), test_src
+                )
+                z.write(os_helper.TESTFN, "data2")
+
+            # This "works" but relies on the zip format having a non-empty
+            # final page due to the trailing central directory to wind up with
+            # the correct length file.
+            def make_sparse_zip_parts(name):
+                empty_page = b"\0" * 4096
+                with open(name, "rb") as f:
+                    part = None
+                    try:
+                        while True:
+                            offset = f.tell()
+                            data = f.read(len(empty_page))
+                            if not data:
+                                break
+                            if data != empty_page:
+                                if not part:
+                                    part_fullname = os.path.join(
+                                        TEST_DATA_DIR,
+                                        f"sparse-zip64-c{self.compression:d}-"
+                                        f"{offset:#011x}.part",
+                                    )
+                                    os.makedirs(
+                                        os.path.dirname(part_fullname),
+                                        exist_ok=True
+                                    )
+                                    part = open(part_fullname, "wb")
+                                    print("Created", part_fullname)
+                                part.write(data)
+                            else:
+                                if part:
+                                    part.close()
+                                part = None
+                    finally:
+                        if part:
+                            part.close()
+
+            if self.compression == ZIP_STORED:
+                print(f"Creating sparse parts to check in to {TEST_DATA_DIR}:")
+                make_sparse_zip_parts(TEMP_ZIP)
+
+        else:
+            def extract_offset(name):
+                if m := re.search(r"-(0x[0-9a-f]{9})\.part$", name):
+                    return int(m.group(1), base=16)
+                raise ValueError(f"{name=} does not fit expected pattern.")
+            offset_parts = [(extract_offset(n), n) for n in pre_built_zip_parts]
+            with open(TEMP_ZIP, "wb") as f:
+                for offset, part_fn in sorted(offset_parts):
+                    with open(part_fn, "rb") as part:
+                        f.seek(offset, os.SEEK_SET)
+                        f.write(part.read())
+            # Confirm that the reconstructed zip file works and looks right.
+            with ZipFile(TEMP_ZIP, "r") as z:
+                self.assertEqual(
+                    z.getinfo("module.py").date_time, (1980, 1, 1, 0, 0, 0)
+                )
+                self.assertEqual(
+                    z.read("module.py"), test_src.encode(),
+                    msg=f"Recreate {full_parts_glob}, unexpected contents."
+                )
+
+                def assertDataEntry(name):
+                    zinfo = z.getinfo(name)
+                    self.assertEqual(zinfo.date_time, (1980, 1, 1, 0, 0, 0))
+                    self.assertGreater(zinfo.file_size, 0xffff_ffff)
+
+                assertDataEntry("data1")
+                assertDataEntry("data2")
+
+        self.doTestWithPreBuiltZip(".py", "module")
+
 
 @support.requires_zlib()
 class CompressedZipImportTestCase(UncompressedZipImportTestCase):
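The heavy lifting in `testZip64LargeFile` comes down to one trick: seeking ~4 GiB past the current position before the final write produces a file whose logical size overflows the 32-bit ZIP header fields, while a sparse-capable filesystem backs the skipped range with no real blocks. Below is a minimal standalone sketch of that trick, assuming a POSIX filesystem with sparse-file support (the paths and names are illustrative, not part of the patch):

```python
import os
import tempfile

# Write a few real bytes, skip ~4 GiB with a relative seek, then write one
# more byte; the skipped range becomes a filesystem "hole".
with tempfile.NamedTemporaryFile(delete=False) as f:
    f.write(b"data\n")
    f.seek(0xFFFF_FFFF, os.SEEK_CUR)  # jump past the 32-bit size limit
    f.write(b"\n")
    path = f.name

print(os.path.getsize(path))          # logical size: > 0xFFFF_FFFF bytes
print(os.stat(path).st_blocks * 512)  # allocated size: a few KB when sparse
os.unlink(path)
```

Because the entry's true size no longer fits in 32 bits, ZipFile has to record it via ZIP64 extra fields, which is exactly the metadata the zipimport change further down must parse correctly.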
diff --git a/Lib/test/zipimport_data/sparse-zip64-c0-0x000000000.part b/Lib/test/zipimport_data/sparse-zip64-c0-0x000000000.part
new file mode 100644
index 00000000000..c6beae8e255
Binary files /dev/null and b/Lib/test/zipimport_data/sparse-zip64-c0-0x000000000.part differ
diff --git a/Lib/test/zipimport_data/sparse-zip64-c0-0x100000000.part b/Lib/test/zipimport_data/sparse-zip64-c0-0x100000000.part
new file mode 100644
index 00000000000..74ab03b4648
Binary files /dev/null and b/Lib/test/zipimport_data/sparse-zip64-c0-0x100000000.part differ
diff --git a/Lib/test/zipimport_data/sparse-zip64-c0-0x200000000.part b/Lib/test/zipimport_data/sparse-zip64-c0-0x200000000.part
new file mode 100644
index 00000000000..9769a404f67
Binary files /dev/null and b/Lib/test/zipimport_data/sparse-zip64-c0-0x200000000.part differ
diff --git a/Lib/zipimport.py b/Lib/zipimport.py
index 21d2dca46f5..4e41d109865 100644
--- a/Lib/zipimport.py
+++ b/Lib/zipimport.py
@@ -517,8 +517,8 @@ def _read_directory(archive):
                             num_extra_values = (len(extra_data) - 4) // 8
                             if num_extra_values > 3:
                                 raise ZipImportError(f"can't read header extra: {archive!r}", path=archive)
-                            values = struct.unpack_from(f"<{min(num_extra_values, 3)}Q",
-                                                        extra_data, offset=4)
+                            values = list(struct.unpack_from(f"<{min(num_extra_values, 3)}Q",
+                                                             extra_data, offset=4))
 
                             # N.b. Here be dragons: the ordering of these is different than
                             # the header fields, and it's really easy to get it wrong since
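Some context for the `list(...)` change above: the ZIP64 extended-information extra field (tag 0x0001, APPNOTE section 4.5.3) stores only the values whose 32-bit counterparts in the central directory record are saturated to 0xFFFFFFFF, always in the fixed order uncompressed size, compressed size, local-header offset. Consumers therefore have to peel values off positionally, which the tuple returned by `struct.unpack_from` does not support. A self-contained sketch of that parsing pattern (this mirrors the spec, not zipimport's exact code; `apply_zip64_extra` is a made-up helper name):

```python
import struct

MAX_UINT32 = 0xFFFF_FFFF

def apply_zip64_extra(extra, file_size, data_size, file_offset):
    tag, size = struct.unpack_from("<HH", extra)
    assert tag == 0x0001, "not a ZIP64 extra record"
    # Up to three 8-byte values, present only for saturated 32-bit fields.
    values = list(struct.unpack_from(f"<{size // 8}Q", extra, offset=4))
    if file_size == MAX_UINT32:
        file_size = values.pop(0)    # uncompressed size always comes first
    if data_size == MAX_UINT32:
        data_size = values.pop(0)    # then compressed size
    if file_offset == MAX_UINT32:
        file_offset = values.pop(0)  # then the local file header offset
    return file_size, data_size, file_offset

# Example: only the offset overflowed 32 bits, so the record holds one value.
extra = struct.pack("<HHQ", 0x0001, 8, 0x1_0000_0000)
print(apply_zip64_extra(extra, 10, 10, MAX_UINT32))  # -> (10, 10, 4294967296)
```

Getting this wrong is easy precisely because all three values share one type, hence the "here be dragons" warning in the hunk's trailing context.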
diff --git a/Makefile.pre.in b/Makefile.pre.in
index 88ec2009fbf..74a438b015a 100644
--- a/Makefile.pre.in
+++ b/Makefile.pre.in
@@ -2483,7 +2483,8 @@ TESTSUBDIRS=	idlelib/idle_test \
 		test/typinganndata \
 		test/wheeldata \
 		test/xmltestdata \
-		test/xmltestdata/c14n-20
+		test/xmltestdata/c14n-20 \
+		test/zipimport_data
 
 COMPILEALL_OPTS=-j0
 
diff --git a/Misc/NEWS.d/next/Library/2024-04-19-09-28-43.gh-issue-118107.Mdsr1J.rst b/Misc/NEWS.d/next/Library/2024-04-19-09-28-43.gh-issue-118107.Mdsr1J.rst
new file mode 100644
index 00000000000..0e358d6605e
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2024-04-19-09-28-43.gh-issue-118107.Mdsr1J.rst
@@ -0,0 +1,2 @@
+Fix :mod:`zipimport` reading of ZIP64 files with file entries that are too big or
+offset too far.
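For anyone wanting to poke at the fix outside the test harness, the checked-in .part blobs can be reassembled the same way the test's reconstruction branch does it: each filename encodes the blob's byte offset, and seeking between writes recreates the sparse ~8 GiB archive. A rough sketch under those assumptions (paths are illustrative and presume a CPython checkout; importing module.py executes the test's stored source):

```python
import glob
import importlib
import os
import re
import sys

DATA_DIR = "Lib/test/zipimport_data"  # assumed checkout-relative location
OUT_ZIP = "/tmp/zip64-demo.zip"

def part_offset(name):
    # Filenames look like sparse-zip64-c0-0x100000000.part; the hex field
    # is the blob's byte offset within the reconstructed archive.
    m = re.search(r"-(0x[0-9a-f]{9})\.part$", name)
    assert m, f"unexpected part name: {name}"
    return int(m.group(1), 16)

with open(OUT_ZIP, "wb") as out:
    parts = glob.glob(os.path.join(DATA_DIR, "sparse-zip64-c0-0x*.part"))
    for part in sorted(parts, key=part_offset):
        out.seek(part_offset(part))  # forward seeks leave sparse holes
        with open(part, "rb") as blob:
            out.write(blob.read())

sys.path.insert(0, OUT_ZIP)  # zipimport handles zip archives on sys.path
print(importlib.import_module("module"))  # entry stored past the 4 GiB mark
```

In the normal workflow the same code path is covered by running `./python -m test test_zipimport -u largefile`, since the new test guards itself with `support.requires("largefile")`.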