fix: patchDocument, looks for namespaces more carefully over whole doc… (#2943)

* fix: patchDocument, looks for namespaces more carefully over the whole document; added support for different placeholder brackets (default `{{`: `}}`); extended InputDataType to accept JSZip * fix: Attribute value string not converted to HTML escape characters by js2xml; original issue: https://github.com/nashwaan/xml-js/issues/142 * fix: skip processing UTF-16 encoded files * fix: patching makes a second pass over the already-patched document; added parameter `recursive` (= true by default) to be able to disable this behavior * Fix linting * Simplify code * write tests to skip UTF-16 types --------- Co-authored-by: Vladimir Bulyga <vladimir.bulyga@legatics.com> Co-authored-by: Dolan Miu <dolan_miu@hotmail.com>
2025-04-16 10:03:49 +01:00
parent b614b74a2a
commit fc86e1842d
4 changed files with 62 additions and 20 deletions
--- a/src/patcher/from-docx.spec.ts
+++ b/src/patcher/from-docx.spec.ts
@ -202,15 +202,11 @@ describe("from-docx", () => {
    describe("patchDocument", () => {
        describe("document.xml and [Content_Types].xml", () => {
            beforeEach(() => {
-                vi.spyOn(JSZip, "loadAsync").mockReturnValue(
-                    new Promise<JSZip>((resolve) => {
-                        const zip = new JSZip();
+                const zip = new JSZip();

-                        zip.file("word/document.xml", MOCK_XML);
-                        zip.file("[Content_Types].xml", `<?xml version="1.0" encoding="UTF-8" standalone="yes"?>`);
-                        resolve(zip);
-                    }),
-                );
+                zip.file("word/document.xml", MOCK_XML);
+                zip.file("[Content_Types].xml", `<?xml version="1.0" encoding="UTF-8" standalone="yes"?>`);
+                vi.spyOn(JSZip, "loadAsync").mockResolvedValue(zip);
            });

            afterEach(() => {
@ -289,6 +285,32 @@ describe("from-docx", () => {
                expect(output).to.not.be.undefined;
            });

+            it("should work with the raw JSZip type", async () => {
+                const zip = new JSZip();
+
+                zip.file("word/document.xml", MOCK_XML);
+                zip.file("[Content_Types].xml", `<?xml version="1.0" encoding="UTF-8" standalone="yes"?>`);
+                const output = await patchDocument({
+                    outputType: "uint8array",
+                    data: zip,
+                    patches: {},
+                });
+                expect(output).to.not.be.undefined;
+            });
+
+            it("should skiup UTF-16 types", async () => {
+                const zip = new JSZip();
+
+                zip.file("word/document.xml", MOCK_XML);
+                zip.file("[Content_Types].xml", Buffer.from([0xff, 0xfe]));
+                const output = await patchDocument({
+                    outputType: "uint8array",
+                    data: zip,
+                    patches: {},
+                });
+                expect(output).to.not.be.undefined;
+            });
+
            it("should patch the document", async () => {
                const output = await patchDocument({
                    outputType: "uint8array",
--- a/src/patcher/from-docx.ts
+++ b/src/patcher/from-docx.ts
@ -19,7 +19,7 @@ import { replacer } from "./replacer";
 import { toJson } from "./util";

 // eslint-disable-next-line functional/prefer-readonly-type
-export type InputDataType = Buffer | string | number[] | Uint8Array | ArrayBuffer | Blob | NodeJS.ReadableStream;
+export type InputDataType = Buffer | string | number[] | Uint8Array | ArrayBuffer | Blob | NodeJS.ReadableStream | JSZip;

 export const PatchType = {
    DOCUMENT: "file",
@ -59,9 +59,12 @@ export type PatchDocumentOptions<T extends PatchDocumentOutputType = PatchDocume
        readonly start: string;
        readonly end: string;
    }>;
+    readonly recursive?: boolean;
 };

 const imageReplacer = new ImageReplacer();
+const UTF16LE = Buffer.from([0xff, 0xfe]);
+const UTF16BE = Buffer.from([0xfe, 0xff]);

 export const patchDocument = async <T extends PatchDocumentOutputType = PatchDocumentOutputType>({
    outputType,
@ -69,8 +72,12 @@ export const patchDocument = async <T extends PatchDocumentOutputType = PatchDoc
    patches,
    keepOriginalStyles,
    placeholderDelimiters = { start: "{{", end: "}}" } as const,
+    /**
+     * Search for occurrences over patched document
+     */
+    recursive = true,
 }: PatchDocumentOptions<T>): Promise<OutputByType[T]> => {
-    const zipContent = await JSZip.loadAsync(data);
+    const zipContent = data instanceof JSZip ? data : await JSZip.loadAsync(data);
    const contexts = new Map<string, IContext>();
    const file = {
        Media: new Media(),
@ -87,8 +94,15 @@ export const patchDocument = async <T extends PatchDocumentOutputType = PatchDoc
    const binaryContentMap = new Map<string, Uint8Array>();

    for (const [key, value] of Object.entries(zipContent.files)) {
+        const binaryValue = await value.async("uint8array");
+        const startBytes = binaryValue.slice(0, 2);
+        if (UTF16LE.equals(startBytes) || UTF16BE.equals(startBytes)) {
+            binaryContentMap.set(key, binaryValue);
+            continue;
+        }
+
        if (!key.endsWith(".xml") && !key.endsWith(".rels")) {
-            binaryContentMap.set(key, await value.async("uint8array"));
+            binaryContentMap.set(key, binaryValue);
            continue;
        }

@ -96,12 +110,10 @@ export const patchDocument = async <T extends PatchDocumentOutputType = PatchDoc

        if (key === "word/document.xml") {
            const document = json.elements?.find((i) => i.name === "w:document");
-            if (document) {
+            if (document && document.attributes) {
                // We could check all namespaces from Document, but we'll instead
                // check only those that may be used by our element types.

-                // eslint-disable-next-line functional/immutable-data
-                document.attributes = document.attributes ?? {};
                for (const ns of ["mc", "wp", "r", "w15", "m"] as const) {
                    // eslint-disable-next-line functional/immutable-data
                    document.attributes[`xmlns:${ns}`] = DocumentAttributeNamespaces[ns];
@ -179,7 +191,8 @@ export const patchDocument = async <T extends PatchDocumentOutputType = PatchDoc
                        context,
                        keepOriginalStyles,
                    });
-                    if (!didFindOccurrence) {
+                    // Why is this needed? Once the document is patched, the loop searches the patched JSON again, which takes too long if the patched document has a big and deep structure. Passing `recursive: false` stops after the first replacement pass.
+                    if (!recursive || !didFindOccurrence) {
                        break;
                    }
                }
@ -275,7 +288,15 @@ export const patchDocument = async <T extends PatchDocumentOutputType = PatchDoc
 };

 const toXml = (jsonObj: Element): string => {
-    const output = js2xml(jsonObj);
+    const output = js2xml(jsonObj, {
+        attributeValueFn: (str) =>
+            String(str)
+                .replace(/&(?!amp;|lt;|gt;|quot;|apos;)/g, "&amp;")
+                .replace(/</g, "&lt;")
+                .replace(/>/g, "&gt;")
+                .replace(/"/g, "&quot;")
+                .replace(/'/g, "&apos;"), // cspell:words apos
+    });
    return output;
 };

--- a/src/patcher/patch-detector.ts
+++ b/src/patcher/patch-detector.ts
@ -10,7 +10,7 @@ type PatchDetectorOptions = {

 /** Detects which patches are needed/present in a template */
 export const patchDetector = async ({ data }: PatchDetectorOptions): Promise<readonly string[]> => {
-    const zipContent = await JSZip.loadAsync(data);
+    const zipContent = data instanceof JSZip ? data : await JSZip.loadAsync(data);
    const patches = new Set<string>();

    for (const [key, value] of Object.entries(zipContent.files)) {
--- a/src/patcher/traverser.ts
+++ b/src/patcher/traverser.ts
@ -34,10 +34,9 @@ export const traverse = (node: Element): readonly IRenderedParagraphNode[] => {

        if (currentNode.element.name === "w:p") {
            renderedParagraphs = [...renderedParagraphs, renderParagraphNode(currentNode)];
-        } else {
-            // eslint-disable-next-line functional/immutable-data
-            queue.push(...elementsToWrapper(currentNode));
        }
+        // eslint-disable-next-line functional/immutable-data
+        queue.push(...elementsToWrapper(currentNode));
    }

    return renderedParagraphs;