fix: patchDocument, looks for namspaces more carefully over whole doc… (#2943)

* fix: patchDocument, looks for namspaces more carefully over whole document added support of different placeholder brackets, default `{{`: `}}` extended inputDataType to accept JSZip * fix: Attribute value string not converted to HTML esc char by js2xml original: https://github.com/nashwaan/xml-js/issues/142 * fix: skip processing utf-16 encoded files * fix: patching goes on the second circle over patched document added parameter `recursive`(= true by default) to be able to disable this behavior * Fix linting * Simplify code * write tests to skip UTF-16 types --------- Co-authored-by: Vladimir Bulyga <vladimir.bulyga@legatics.com> Co-authored-by: Dolan Miu <dolan_miu@hotmail.com>
2025-04-16 10:03:49 +01:00
parent b614b74a2a
commit fc86e1842d
4 changed files with 62 additions and 20 deletions
--- a/src/patcher/from-docx.ts
+++ b/src/patcher/from-docx.ts
@ -19,7 +19,7 @@ import { replacer } from "./replacer";
 import { toJson } from "./util";

 // eslint-disable-next-line functional/prefer-readonly-type
-export type InputDataType = Buffer | string | number[] | Uint8Array | ArrayBuffer | Blob | NodeJS.ReadableStream;
+export type InputDataType = Buffer | string | number[] | Uint8Array | ArrayBuffer | Blob | NodeJS.ReadableStream | JSZip;

 export const PatchType = {
    DOCUMENT: "file",
@ -59,9 +59,12 @@ export type PatchDocumentOptions<T extends PatchDocumentOutputType = PatchDocume
        readonly start: string;
        readonly end: string;
    }>;
+    readonly recursive?: boolean;
 };

 const imageReplacer = new ImageReplacer();
+const UTF16LE = Buffer.from([0xff, 0xfe]);
+const UTF16BE = Buffer.from([0xfe, 0xff]);

 export const patchDocument = async <T extends PatchDocumentOutputType = PatchDocumentOutputType>({
    outputType,
@ -69,8 +72,12 @@ export const patchDocument = async <T extends PatchDocumentOutputType = PatchDoc
    patches,
    keepOriginalStyles,
    placeholderDelimiters = { start: "{{", end: "}}" } as const,
+    /**
+     * Search for occurrences over patched document
+     */
+    recursive = true,
 }: PatchDocumentOptions<T>): Promise<OutputByType[T]> => {
-    const zipContent = await JSZip.loadAsync(data);
+    const zipContent = data instanceof JSZip ? data : await JSZip.loadAsync(data);
    const contexts = new Map<string, IContext>();
    const file = {
        Media: new Media(),
@ -87,8 +94,15 @@ export const patchDocument = async <T extends PatchDocumentOutputType = PatchDoc
    const binaryContentMap = new Map<string, Uint8Array>();

    for (const [key, value] of Object.entries(zipContent.files)) {
+        const binaryValue = await value.async("uint8array");
+        const startBytes = binaryValue.slice(0, 2);
+        if (UTF16LE.equals(startBytes) || UTF16BE.equals(startBytes)) {
+            binaryContentMap.set(key, binaryValue);
+            continue;
+        }
+
        if (!key.endsWith(".xml") && !key.endsWith(".rels")) {
-            binaryContentMap.set(key, await value.async("uint8array"));
+            binaryContentMap.set(key, binaryValue);
            continue;
        }

@ -96,12 +110,10 @@ export const patchDocument = async <T extends PatchDocumentOutputType = PatchDoc

        if (key === "word/document.xml") {
            const document = json.elements?.find((i) => i.name === "w:document");
-            if (document) {
+            if (document && document.attributes) {
                // We could check all namespaces from Document, but we'll instead
                // check only those that may be used by our element types.

-                // eslint-disable-next-line functional/immutable-data
-                document.attributes = document.attributes ?? {};
                for (const ns of ["mc", "wp", "r", "w15", "m"] as const) {
                    // eslint-disable-next-line functional/immutable-data
                    document.attributes[`xmlns:${ns}`] = DocumentAttributeNamespaces[ns];
@ -179,7 +191,8 @@ export const patchDocument = async <T extends PatchDocumentOutputType = PatchDoc
                        context,
                        keepOriginalStyles,
                    });
-                    if (!didFindOccurrence) {
+                    // What the reason doing that? Once document is patched - it search over patched json again, that takes too long if patched document has big and deep structure.
+                    if (!recursive || !didFindOccurrence) {
                        break;
                    }
                }
@ -275,7 +288,15 @@ export const patchDocument = async <T extends PatchDocumentOutputType = PatchDoc
 };

 const toXml = (jsonObj: Element): string => {
-    const output = js2xml(jsonObj);
+    const output = js2xml(jsonObj, {
+        attributeValueFn: (str) =>
+            String(str)
+                .replace(/&(?!amp;|lt;|gt;|quot;|apos;)/g, "&amp;")
+                .replace(/</g, "&lt;")
+                .replace(/>/g, "&gt;")
+                .replace(/"/g, "&quot;")
+                .replace(/'/g, "&apos;"), // cspell:words apos
+    });
    return output;
 };