fix: patchDocument looks for namespaces more carefully over the whole document (#2943)

* fix: patchDocument looks for namespaces more carefully over the whole document

added support for custom placeholder brackets; the default is `{{` / `}}`

extended `InputDataType` to accept a `JSZip` instance
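
A minimal sketch of how the two options combine (the file names, patch key, and values are illustrative, and the template is assumed to contain `<<name>>` placeholders):

```ts
import * as fs from "fs";
import JSZip from "jszip";
import { patchDocument, PatchType, TextRun } from "docx";

// The caller can now load the zip itself and pass the JSZip instance directly.
const zip = await JSZip.loadAsync(fs.readFileSync("template.docx"));

const output = await patchDocument({
    outputType: "nodebuffer",
    data: zip, // InputDataType now also accepts JSZip
    placeholderDelimiters: { start: "<<", end: ">>" }, // defaults to {{ and }}
    patches: {
        name: {
            type: PatchType.PARAGRAPH,
            children: [new TextRun("John Doe")],
        },
    },
});

fs.writeFileSync("output.docx", output);
```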

* fix: attribute value strings not converted to XML escape entities by js2xml

original: https://github.com/nashwaan/xml-js/issues/142
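
The fix passes an `attributeValueFn` to js2xml (shown in the diff below) so attribute values are entity-escaped. A standalone sketch of that replacement chain, using `escapeAttribute` as an illustrative name:

```ts
// Escape &, <, >, " and ' in attribute values, leaving already-escaped entities untouched.
const escapeAttribute = (value: string): string =>
    value
        .replace(/&(?!amp;|lt;|gt;|quot;|apos;)/g, "&amp;")
        .replace(/</g, "&lt;")
        .replace(/>/g, "&gt;")
        .replace(/"/g, "&quot;")
        .replace(/'/g, "&apos;");

console.log(escapeAttribute(`Bread & "Butter"`)); // Bread &amp; &quot;Butter&quot;
console.log(escapeAttribute("already &amp; escaped")); // already &amp; escaped
```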

* fix: skip processing UTF-16 encoded files
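
UTF-16 entries are detected by their byte-order mark and copied through unchanged instead of being parsed as XML. A rough sketch of the check (`isUtf16` is an illustrative name, not part of the library):

```ts
// UTF-16 text starts with a byte-order mark: FF FE (little-endian) or FE FF (big-endian).
const UTF16LE = Buffer.from([0xff, 0xfe]);
const UTF16BE = Buffer.from([0xfe, 0xff]);

const isUtf16 = (bytes: Uint8Array): boolean => {
    const bom = bytes.slice(0, 2);
    return UTF16LE.equals(bom) || UTF16BE.equals(bom);
};

console.log(isUtf16(Buffer.from([0xff, 0xfe, 0x3c, 0x00]))); // true  -> copied through untouched
console.log(isUtf16(Buffer.from('<?xml version="1.0"?>'))); // false -> parsed and patched as usual
```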

* fix: patching makes a second pass over the already patched document

added a `recursive` parameter (`true` by default) so this behavior can be disabled
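
For large, deeply nested documents that re-scan can be expensive. A sketch of opting out (file name and patch values are illustrative):

```ts
import * as fs from "fs";
import { patchDocument, PatchType, TextRun } from "docx";

const output = await patchDocument({
    outputType: "uint8array",
    data: fs.readFileSync("template.docx"),
    patches: {
        name: { type: PatchType.PARAGRAPH, children: [new TextRun("John Doe")] },
    },
    // Single pass only: do not re-scan the patched document for further occurrences.
    recursive: false,
});
```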

* Fix linting

* Simplify code

* write tests for skipping UTF-16 encoded files

---------

Co-authored-by: Vladimir Bulyga <vladimir.bulyga@legatics.com>
Co-authored-by: Dolan Miu <dolan_miu@hotmail.com>
Author: Volodymyr Bulyha
Date: 2025-04-16 10:03:49 +01:00
Committed by: GitHub
Parent: b614b74a2a
Commit: fc86e1842d

4 changed files with 62 additions and 20 deletions


@@ -202,15 +202,11 @@ describe("from-docx", () => {
     describe("patchDocument", () => {
         describe("document.xml and [Content_Types].xml", () => {
             beforeEach(() => {
-                vi.spyOn(JSZip, "loadAsync").mockReturnValue(
-                    new Promise<JSZip>((resolve) => {
-                        const zip = new JSZip();
-                        zip.file("word/document.xml", MOCK_XML);
-                        zip.file("[Content_Types].xml", `<?xml version="1.0" encoding="UTF-8" standalone="yes"?>`);
-                        resolve(zip);
-                    }),
-                );
+                const zip = new JSZip();
+                zip.file("word/document.xml", MOCK_XML);
+                zip.file("[Content_Types].xml", `<?xml version="1.0" encoding="UTF-8" standalone="yes"?>`);
+                vi.spyOn(JSZip, "loadAsync").mockResolvedValue(zip);
             });

             afterEach(() => {
@@ -289,6 +285,32 @@ describe("from-docx", () => {
                 expect(output).to.not.be.undefined;
             });

+            it("should work with the raw JSZip type", async () => {
+                const zip = new JSZip();
+                zip.file("word/document.xml", MOCK_XML);
+                zip.file("[Content_Types].xml", `<?xml version="1.0" encoding="UTF-8" standalone="yes"?>`);
+
+                const output = await patchDocument({
+                    outputType: "uint8array",
+                    data: zip,
+                    patches: {},
+                });
+                expect(output).to.not.be.undefined;
+            });
+
+            it("should skip UTF-16 types", async () => {
+                const zip = new JSZip();
+                zip.file("word/document.xml", MOCK_XML);
+                zip.file("[Content_Types].xml", Buffer.from([0xff, 0xfe]));
+
+                const output = await patchDocument({
+                    outputType: "uint8array",
+                    data: zip,
+                    patches: {},
+                });
+                expect(output).to.not.be.undefined;
+            });
+
             it("should patch the document", async () => {
                 const output = await patchDocument({
                     outputType: "uint8array",


@@ -19,7 +19,7 @@ import { replacer } from "./replacer";
 import { toJson } from "./util";

 // eslint-disable-next-line functional/prefer-readonly-type
-export type InputDataType = Buffer | string | number[] | Uint8Array | ArrayBuffer | Blob | NodeJS.ReadableStream;
+export type InputDataType = Buffer | string | number[] | Uint8Array | ArrayBuffer | Blob | NodeJS.ReadableStream | JSZip;

 export const PatchType = {
     DOCUMENT: "file",
@@ -59,9 +59,12 @@ export type PatchDocumentOptions<T extends PatchDocumentOutputType = PatchDocume
         readonly start: string;
         readonly end: string;
     }>;
+    readonly recursive?: boolean;
 };

 const imageReplacer = new ImageReplacer();
+const UTF16LE = Buffer.from([0xff, 0xfe]);
+const UTF16BE = Buffer.from([0xfe, 0xff]);

 export const patchDocument = async <T extends PatchDocumentOutputType = PatchDocumentOutputType>({
     outputType,
@@ -69,8 +72,12 @@ export const patchDocument = async <T extends PatchDocumentOutputType = PatchDoc
     patches,
     keepOriginalStyles,
     placeholderDelimiters = { start: "{{", end: "}}" } as const,
+    /**
+     * Search for occurrences over patched document
+     */
+    recursive = true,
 }: PatchDocumentOptions<T>): Promise<OutputByType[T]> => {
-    const zipContent = await JSZip.loadAsync(data);
+    const zipContent = data instanceof JSZip ? data : await JSZip.loadAsync(data);
     const contexts = new Map<string, IContext>();
     const file = {
         Media: new Media(),
@@ -87,8 +94,15 @@ export const patchDocument = async <T extends PatchDocumentOutputType = PatchDoc
     const binaryContentMap = new Map<string, Uint8Array>();

     for (const [key, value] of Object.entries(zipContent.files)) {
+        const binaryValue = await value.async("uint8array");
+        const startBytes = binaryValue.slice(0, 2);
+        if (UTF16LE.equals(startBytes) || UTF16BE.equals(startBytes)) {
+            binaryContentMap.set(key, binaryValue);
+            continue;
+        }
+
         if (!key.endsWith(".xml") && !key.endsWith(".rels")) {
-            binaryContentMap.set(key, await value.async("uint8array"));
+            binaryContentMap.set(key, binaryValue);
             continue;
         }
@@ -96,12 +110,10 @@ export const patchDocument = async <T extends PatchDocumentOutputType = PatchDoc
         if (key === "word/document.xml") {
             const document = json.elements?.find((i) => i.name === "w:document");

-            if (document) {
+            if (document && document.attributes) {
                 // We could check all namespaces from Document, but we'll instead
                 // check only those that may be used by our element types.
-                // eslint-disable-next-line functional/immutable-data
-                document.attributes = document.attributes ?? {};
                 for (const ns of ["mc", "wp", "r", "w15", "m"] as const) {
                     // eslint-disable-next-line functional/immutable-data
                     document.attributes[`xmlns:${ns}`] = DocumentAttributeNamespaces[ns];
@@ -179,7 +191,8 @@ export const patchDocument = async <T extends PatchDocumentOutputType = PatchDoc
                     context,
                     keepOriginalStyles,
                 });

-                if (!didFindOccurrence) {
+                // Re-scanning the patched JSON for further occurrences can be slow for big, deeply nested documents, so allow opting out via `recursive`.
+                if (!recursive || !didFindOccurrence) {
                     break;
                 }
             }
@ -275,7 +288,15 @@ export const patchDocument = async <T extends PatchDocumentOutputType = PatchDoc
}; };
const toXml = (jsonObj: Element): string => { const toXml = (jsonObj: Element): string => {
const output = js2xml(jsonObj); const output = js2xml(jsonObj, {
attributeValueFn: (str) =>
String(str)
.replace(/&(?!amp;|lt;|gt;|quot;|apos;)/g, "&amp;")
.replace(/</g, "&lt;")
.replace(/>/g, "&gt;")
.replace(/"/g, "&quot;")
.replace(/'/g, "&apos;"), // cspell:words apos
});
return output; return output;
}; };


@@ -10,7 +10,7 @@ type PatchDetectorOptions = {
 /** Detects which patches are needed/present in a template */
 export const patchDetector = async ({ data }: PatchDetectorOptions): Promise<readonly string[]> => {
-    const zipContent = await JSZip.loadAsync(data);
+    const zipContent = data instanceof JSZip ? data : await JSZip.loadAsync(data);
     const patches = new Set<string>();

     for (const [key, value] of Object.entries(zipContent.files)) {


@@ -34,11 +34,10 @@ export const traverse = (node: Element): readonly IRenderedParagraphNode[] => {
         if (currentNode.element.name === "w:p") {
             renderedParagraphs = [...renderedParagraphs, renderParagraphNode(currentNode)];
-        } else {
-            // eslint-disable-next-line functional/immutable-data
-            queue.push(...elementsToWrapper(currentNode));
         }
+        // eslint-disable-next-line functional/immutable-data
+        queue.push(...elementsToWrapper(currentNode));
     }

     return renderedParagraphs;
 };