fix: patchDocument, looks for namspaces more carefully over whole doc… (#2943)
* fix: patchDocument, looks for namspaces more carefully over whole document added support of different placeholder brackets, default `{{`: `}}` extended inputDataType to accept JSZip * fix: Attribute value string not converted to HTML esc char by js2xml original: https://github.com/nashwaan/xml-js/issues/142 * fix: skip processing utf-16 encoded files * fix: patching goes on the second circle over patched document added parameter `recursive`(= true by default) to be able to disable this behavior * Fix linting * Simplify code * write tests to skip UTF-16 types --------- Co-authored-by: Vladimir Bulyga <vladimir.bulyga@legatics.com> Co-authored-by: Dolan Miu <dolan_miu@hotmail.com>
This commit is contained in:
@ -202,15 +202,11 @@ describe("from-docx", () => {
|
||||
describe("patchDocument", () => {
|
||||
describe("document.xml and [Content_Types].xml", () => {
|
||||
beforeEach(() => {
|
||||
vi.spyOn(JSZip, "loadAsync").mockReturnValue(
|
||||
new Promise<JSZip>((resolve) => {
|
||||
const zip = new JSZip();
|
||||
const zip = new JSZip();
|
||||
|
||||
zip.file("word/document.xml", MOCK_XML);
|
||||
zip.file("[Content_Types].xml", `<?xml version="1.0" encoding="UTF-8" standalone="yes"?>`);
|
||||
resolve(zip);
|
||||
}),
|
||||
);
|
||||
zip.file("word/document.xml", MOCK_XML);
|
||||
zip.file("[Content_Types].xml", `<?xml version="1.0" encoding="UTF-8" standalone="yes"?>`);
|
||||
vi.spyOn(JSZip, "loadAsync").mockResolvedValue(zip);
|
||||
});
|
||||
|
||||
afterEach(() => {
|
||||
@ -289,6 +285,32 @@ describe("from-docx", () => {
|
||||
expect(output).to.not.be.undefined;
|
||||
});
|
||||
|
||||
it("should work with the raw JSZip type", async () => {
|
||||
const zip = new JSZip();
|
||||
|
||||
zip.file("word/document.xml", MOCK_XML);
|
||||
zip.file("[Content_Types].xml", `<?xml version="1.0" encoding="UTF-8" standalone="yes"?>`);
|
||||
const output = await patchDocument({
|
||||
outputType: "uint8array",
|
||||
data: zip,
|
||||
patches: {},
|
||||
});
|
||||
expect(output).to.not.be.undefined;
|
||||
});
|
||||
|
||||
it("should skiup UTF-16 types", async () => {
|
||||
const zip = new JSZip();
|
||||
|
||||
zip.file("word/document.xml", MOCK_XML);
|
||||
zip.file("[Content_Types].xml", Buffer.from([0xff, 0xfe]));
|
||||
const output = await patchDocument({
|
||||
outputType: "uint8array",
|
||||
data: zip,
|
||||
patches: {},
|
||||
});
|
||||
expect(output).to.not.be.undefined;
|
||||
});
|
||||
|
||||
it("should patch the document", async () => {
|
||||
const output = await patchDocument({
|
||||
outputType: "uint8array",
|
||||
|
@ -19,7 +19,7 @@ import { replacer } from "./replacer";
|
||||
import { toJson } from "./util";
|
||||
|
||||
// eslint-disable-next-line functional/prefer-readonly-type
|
||||
export type InputDataType = Buffer | string | number[] | Uint8Array | ArrayBuffer | Blob | NodeJS.ReadableStream;
|
||||
export type InputDataType = Buffer | string | number[] | Uint8Array | ArrayBuffer | Blob | NodeJS.ReadableStream | JSZip;
|
||||
|
||||
export const PatchType = {
|
||||
DOCUMENT: "file",
|
||||
@ -59,9 +59,12 @@ export type PatchDocumentOptions<T extends PatchDocumentOutputType = PatchDocume
|
||||
readonly start: string;
|
||||
readonly end: string;
|
||||
}>;
|
||||
readonly recursive?: boolean;
|
||||
};
|
||||
|
||||
const imageReplacer = new ImageReplacer();
|
||||
const UTF16LE = Buffer.from([0xff, 0xfe]);
|
||||
const UTF16BE = Buffer.from([0xfe, 0xff]);
|
||||
|
||||
export const patchDocument = async <T extends PatchDocumentOutputType = PatchDocumentOutputType>({
|
||||
outputType,
|
||||
@ -69,8 +72,12 @@ export const patchDocument = async <T extends PatchDocumentOutputType = PatchDoc
|
||||
patches,
|
||||
keepOriginalStyles,
|
||||
placeholderDelimiters = { start: "{{", end: "}}" } as const,
|
||||
/**
|
||||
* Search for occurrences over patched document
|
||||
*/
|
||||
recursive = true,
|
||||
}: PatchDocumentOptions<T>): Promise<OutputByType[T]> => {
|
||||
const zipContent = await JSZip.loadAsync(data);
|
||||
const zipContent = data instanceof JSZip ? data : await JSZip.loadAsync(data);
|
||||
const contexts = new Map<string, IContext>();
|
||||
const file = {
|
||||
Media: new Media(),
|
||||
@ -87,8 +94,15 @@ export const patchDocument = async <T extends PatchDocumentOutputType = PatchDoc
|
||||
const binaryContentMap = new Map<string, Uint8Array>();
|
||||
|
||||
for (const [key, value] of Object.entries(zipContent.files)) {
|
||||
const binaryValue = await value.async("uint8array");
|
||||
const startBytes = binaryValue.slice(0, 2);
|
||||
if (UTF16LE.equals(startBytes) || UTF16BE.equals(startBytes)) {
|
||||
binaryContentMap.set(key, binaryValue);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!key.endsWith(".xml") && !key.endsWith(".rels")) {
|
||||
binaryContentMap.set(key, await value.async("uint8array"));
|
||||
binaryContentMap.set(key, binaryValue);
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -96,12 +110,10 @@ export const patchDocument = async <T extends PatchDocumentOutputType = PatchDoc
|
||||
|
||||
if (key === "word/document.xml") {
|
||||
const document = json.elements?.find((i) => i.name === "w:document");
|
||||
if (document) {
|
||||
if (document && document.attributes) {
|
||||
// We could check all namespaces from Document, but we'll instead
|
||||
// check only those that may be used by our element types.
|
||||
|
||||
// eslint-disable-next-line functional/immutable-data
|
||||
document.attributes = document.attributes ?? {};
|
||||
for (const ns of ["mc", "wp", "r", "w15", "m"] as const) {
|
||||
// eslint-disable-next-line functional/immutable-data
|
||||
document.attributes[`xmlns:${ns}`] = DocumentAttributeNamespaces[ns];
|
||||
@ -179,7 +191,8 @@ export const patchDocument = async <T extends PatchDocumentOutputType = PatchDoc
|
||||
context,
|
||||
keepOriginalStyles,
|
||||
});
|
||||
if (!didFindOccurrence) {
|
||||
// What the reason doing that? Once document is patched - it search over patched json again, that takes too long if patched document has big and deep structure.
|
||||
if (!recursive || !didFindOccurrence) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
@ -275,7 +288,15 @@ export const patchDocument = async <T extends PatchDocumentOutputType = PatchDoc
|
||||
};
|
||||
|
||||
const toXml = (jsonObj: Element): string => {
|
||||
const output = js2xml(jsonObj);
|
||||
const output = js2xml(jsonObj, {
|
||||
attributeValueFn: (str) =>
|
||||
String(str)
|
||||
.replace(/&(?!amp;|lt;|gt;|quot;|apos;)/g, "&")
|
||||
.replace(/</g, "<")
|
||||
.replace(/>/g, ">")
|
||||
.replace(/"/g, """)
|
||||
.replace(/'/g, "'"), // cspell:words apos
|
||||
});
|
||||
return output;
|
||||
};
|
||||
|
||||
|
@ -10,7 +10,7 @@ type PatchDetectorOptions = {
|
||||
|
||||
/** Detects which patches are needed/present in a template */
|
||||
export const patchDetector = async ({ data }: PatchDetectorOptions): Promise<readonly string[]> => {
|
||||
const zipContent = await JSZip.loadAsync(data);
|
||||
const zipContent = data instanceof JSZip ? data : await JSZip.loadAsync(data);
|
||||
const patches = new Set<string>();
|
||||
|
||||
for (const [key, value] of Object.entries(zipContent.files)) {
|
||||
|
@ -34,10 +34,9 @@ export const traverse = (node: Element): readonly IRenderedParagraphNode[] => {
|
||||
|
||||
if (currentNode.element.name === "w:p") {
|
||||
renderedParagraphs = [...renderedParagraphs, renderParagraphNode(currentNode)];
|
||||
} else {
|
||||
// eslint-disable-next-line functional/immutable-data
|
||||
queue.push(...elementsToWrapper(currentNode));
|
||||
}
|
||||
// eslint-disable-next-line functional/immutable-data
|
||||
queue.push(...elementsToWrapper(currentNode));
|
||||
}
|
||||
|
||||
return renderedParagraphs;
|
||||
|
Reference in New Issue
Block a user