fix: patchDocument, looks for namspaces more carefully over whole doc… (#2943)

* fix: patchDocument, looks for namspaces more carefully over whole document

added support of different placeholder brackets, default `{{`: `}}`

extended inputDataType to accept JSZip

* fix: Attribute value string not converted to HTML esc char by js2xml

original: https://github.com/nashwaan/xml-js/issues/142

* fix: skip processing utf-16 encoded files

* fix: patching goes on the second circle over patched document

added parameter `recursive`(= true by default) to be able to disable
this behavior

* Fix linting

* Simplify code

* write tests to skip UTF-16 types

---------

Co-authored-by: Vladimir Bulyga <vladimir.bulyga@legatics.com>
Co-authored-by: Dolan Miu <dolan_miu@hotmail.com>
This commit is contained in:
Volodymyr Bulyha
2025-04-16 10:03:49 +01:00
committed by GitHub
parent b614b74a2a
commit fc86e1842d
4 changed files with 62 additions and 20 deletions

View File

@ -19,7 +19,7 @@ import { replacer } from "./replacer";
import { toJson } from "./util";
// eslint-disable-next-line functional/prefer-readonly-type
export type InputDataType = Buffer | string | number[] | Uint8Array | ArrayBuffer | Blob | NodeJS.ReadableStream;
export type InputDataType = Buffer | string | number[] | Uint8Array | ArrayBuffer | Blob | NodeJS.ReadableStream | JSZip;
export const PatchType = {
DOCUMENT: "file",
@ -59,9 +59,12 @@ export type PatchDocumentOptions<T extends PatchDocumentOutputType = PatchDocume
readonly start: string;
readonly end: string;
}>;
readonly recursive?: boolean;
};
const imageReplacer = new ImageReplacer();
const UTF16LE = Buffer.from([0xff, 0xfe]);
const UTF16BE = Buffer.from([0xfe, 0xff]);
export const patchDocument = async <T extends PatchDocumentOutputType = PatchDocumentOutputType>({
outputType,
@ -69,8 +72,12 @@ export const patchDocument = async <T extends PatchDocumentOutputType = PatchDoc
patches,
keepOriginalStyles,
placeholderDelimiters = { start: "{{", end: "}}" } as const,
/**
* Search for occurrences over patched document
*/
recursive = true,
}: PatchDocumentOptions<T>): Promise<OutputByType[T]> => {
const zipContent = await JSZip.loadAsync(data);
const zipContent = data instanceof JSZip ? data : await JSZip.loadAsync(data);
const contexts = new Map<string, IContext>();
const file = {
Media: new Media(),
@ -87,8 +94,15 @@ export const patchDocument = async <T extends PatchDocumentOutputType = PatchDoc
const binaryContentMap = new Map<string, Uint8Array>();
for (const [key, value] of Object.entries(zipContent.files)) {
const binaryValue = await value.async("uint8array");
const startBytes = binaryValue.slice(0, 2);
if (UTF16LE.equals(startBytes) || UTF16BE.equals(startBytes)) {
binaryContentMap.set(key, binaryValue);
continue;
}
if (!key.endsWith(".xml") && !key.endsWith(".rels")) {
binaryContentMap.set(key, await value.async("uint8array"));
binaryContentMap.set(key, binaryValue);
continue;
}
@ -96,12 +110,10 @@ export const patchDocument = async <T extends PatchDocumentOutputType = PatchDoc
if (key === "word/document.xml") {
const document = json.elements?.find((i) => i.name === "w:document");
if (document) {
if (document && document.attributes) {
// We could check all namespaces from Document, but we'll instead
// check only those that may be used by our element types.
// eslint-disable-next-line functional/immutable-data
document.attributes = document.attributes ?? {};
for (const ns of ["mc", "wp", "r", "w15", "m"] as const) {
// eslint-disable-next-line functional/immutable-data
document.attributes[`xmlns:${ns}`] = DocumentAttributeNamespaces[ns];
@ -179,7 +191,8 @@ export const patchDocument = async <T extends PatchDocumentOutputType = PatchDoc
context,
keepOriginalStyles,
});
if (!didFindOccurrence) {
// What the reason doing that? Once document is patched - it search over patched json again, that takes too long if patched document has big and deep structure.
if (!recursive || !didFindOccurrence) {
break;
}
}
@ -275,7 +288,15 @@ export const patchDocument = async <T extends PatchDocumentOutputType = PatchDoc
};
const toXml = (jsonObj: Element): string => {
const output = js2xml(jsonObj);
const output = js2xml(jsonObj, {
attributeValueFn: (str) =>
String(str)
.replace(/&(?!amp;|lt;|gt;|quot;|apos;)/g, "&amp;")
.replace(/</g, "&lt;")
.replace(/>/g, "&gt;")
.replace(/"/g, "&quot;")
.replace(/'/g, "&apos;"), // cspell:words apos
});
return output;
};