1// word2md - Word to Markdown conversion tool
2//
3// word2md converts a Microsoft Word document to Markdown formatted text. The tool uses the
4// Word Automation APIs to start an instance of Word and access the contents of the document
5// being converted. The tool must be run using the cscript.exe script host and requires Word
// to be installed on the target machine. The document to convert and the output file must be
// specified as command line arguments; the resulting Markdown is written to the output file. The
8// tool recognizes the specific Word styles used in the TypeScript Language Specification.
9
10/// <reference lib="scripthost" />
11// eslint-disable-next-line @typescript-eslint/triple-slash-reference
12/// <reference path="./word.d.ts" />
13
/**
 * Host services used by the tool: the command-line arguments, COM object creation,
 * console output, and BOM-less UTF-8 file output.
 *
 * @type {{
    args: string[];
    createObject: (typeName: string) => any;
    write(s: string): void;
    writeFile: (fileName: string, data: string) => void;
}} */
const sys = (() => {
    // Two ADODB streams are created once and reused by every writeFile call: one encodes
    // the text, the other receives the encoded bytes and saves them to disk.
    const textStream = new ActiveXObject("ADODB.Stream");
    textStream.Type = 2 /* text */;
    const byteStream = new ActiveXObject("ADODB.Stream");
    byteStream.Type = 1 /* binary */;

    // Copy the script host's argument collection into a plain array.
    const commandLine = [];
    let index = 0;
    while (index < WScript.Arguments.length) {
        commandLine[index] = WScript.Arguments.Item(index);
        index++;
    }

    return {
        args: commandLine,
        createObject: (typeName) => {
            return new ActiveXObject(typeName);
        },
        write(s) {
            WScript.StdOut.Write(s);
        },
        writeFile: (fileName, data) => {
            textStream.Open();
            byteStream.Open();
            try {
                // Encode the characters as UTF-8.
                textStream.Charset = "utf-8";
                textStream.WriteText(data);
                // The encoded stream starts with a 3-byte BOM that we do not want in the
                // file, so copying starts right after it.
                textStream.Position = 3;
                textStream.CopyTo(byteStream);
                byteStream.SaveToFile(fileName, 2 /*overwrite*/);
            }
            finally {
                // Always release both streams so that the next call can reopen them.
                byteStream.Close();
                textStream.Close();
            }
        }
    };
})();
54
55/** @typedef {{
56    style?: any;
57    font?: {
58        bold?: boolean;
59        italic?: boolean;
60        subscript?: boolean;
61    };
62}} FindReplaceOptions */
63
64/**
65 * @param {Word.Document} doc
66 * @returns {string}
67 */
68function convertDocumentToMarkdown(doc) {
69    /** @type {number[]} */
70    const columnAlignment = [];
71    /** @type {number} */
72    let tableColumnCount;
73    /** @type {number} */
74    let tableCellIndex;
75    /** @type {boolean} */
76    let lastInTable;
77    /** @type {string} */
78    let lastStyle;
79    let result = "";
80
81    /**
82     * @param {any} target
83     * @param {any} properties
84     */
85    function setProperties(target, properties) {
86        for (const name in properties) {
87            if (Object.prototype.hasOwnProperty.call(properties, name)) {
88                const value = properties[name];
89                if (typeof value === "object") {
90                    setProperties(target[name], value);
91                }
92                else {
93                    target[name] = value;
94                }
95            }
96        }
97    }
98
99    /**
100     * @param {string} findText
101     * @param {FindReplaceOptions} findOptions
102     * @param {string} replaceText
103     * @param {FindReplaceOptions} replaceOptions
104     */
105    function findReplace(findText, findOptions, replaceText, replaceOptions) {
106        const find = doc.range().find;
107        find.clearFormatting();
108        setProperties(find, findOptions);
109        const replace = find.replacement;
110        replace.clearFormatting();
111        setProperties(replace, replaceOptions);
112        find.execute(findText,
113            /* matchCase */ false,
114            /* matchWholeWord */ false,
115            /* matchWildcards */ false,
116            /* matchSoundsLike */ false,
117            /* matchAllWordForms */ false,
118            /* forward */ true,
119            0,
120            /* format */ true,
121            replaceText,
122            2
123        );
124    }
125
126    function fixHyperlinks() {
127        const count = doc.hyperlinks.count;
128        for (let i = 0; i < count; i++) {
129            const hyperlink = doc.hyperlinks.item(i + 1);
130            const address = hyperlink.address;
131            if (address && address.length > 0) {
132                const textToDisplay = hyperlink.textToDisplay;
133                hyperlink.textToDisplay = "[" + textToDisplay + "](" + address + ")";
134            }
135        }
136    }
137
138    /**
139     * @param {string} s
140     */
141    function write(s) {
142        result += s;
143    }
144
145    function writeTableHeader() {
146        for (let i = 0; i < tableColumnCount - 1; i++) {
147            switch (columnAlignment[i]) {
148                case 1:
149                    write("|:---:");
150                    break;
151                case 2:
152                    write("|---:");
153                    break;
154                default:
155                    write("|---");
156            }
157        }
158        write("|\n");
159    }
160
161    /**
162     * @param {string} text
163     */
164    function trimEndFormattingMarks(text) {
165        let i = text.length;
166        while (i > 0 && text.charCodeAt(i - 1) < 0x20) i--;
167        return text.substr(0, i);
168    }
169
170    function writeBlockEnd() {
171        switch (lastStyle) {
172            case "Code":
173                write("```\n\n");
174                break;
175            case "List Paragraph":
176            case "Table":
177            case "TOC":
178                write("\n");
179                break;
180        }
181    }
182
183    /**
184     * @param {Word.Paragraph} p
185     */
186    function writeParagraph(p) {
187
188        const range = p.range;
189        const inTable = range.tables.count > 0;
190        const sectionBreak = range.text.indexOf("\x0C") >= 0;
191
192        let level = 1;
193        let style = p.style.nameLocal;
194        let text = range.text;
195
196        text = trimEndFormattingMarks(text);
197        if (text === "/") {
198            // An inline image shows up in the text as a "/". When we see a paragraph
199            // consisting of nothing but "/", we check to see if the paragraph contains
200            // hidden text and, if so, emit that instead. The hidden text is assumed to
201            // contain an appropriate markdown image link.
202            range.textRetrievalMode.includeHiddenText = true;
203            const fullText = range.text;
204            range.textRetrievalMode.includeHiddenText = false;
205            if (text !== fullText) {
206                text = "&emsp;&emsp;" + fullText.substr(1);
207            }
208        }
209
210        if (inTable) {
211            style = "Table";
212        }
213        else if (style.match(/\s\d$/)) {
214            level = +style.substr(style.length - 1);
215            style = style.substr(0, style.length - 2);
216        }
217        if (lastStyle && style !== lastStyle) {
218            writeBlockEnd();
219        }
220
221        switch (style) {
222
223            case "Heading":
224            case "Appendix":
225                const section = range.listFormat.listString;
226                write("####".substr(0, level) + ' <a name="' + section + '"/>' + section + " " + text + "\n\n");
227                break;
228
229            case "Normal":
230                if (text.length) {
231                    write(text + "\n\n");
232                }
233                break;
234
235            case "List Paragraph":
236                write("        ".substr(0, range.listFormat.listLevelNumber * 2 - 2) + "* " + text + "\n");
237                break;
238
239            case "Grammar":
240                write("&emsp;&emsp;" + text.replace(/\s\s\s/g, "&emsp;").replace(/\x0B/g, "  \n&emsp;&emsp;&emsp;") + "\n\n");
241                break;
242
243            case "Code":
244                if (lastStyle !== "Code") {
245                    write("```TypeScript\n");
246                }
247                else {
248                    write("\n");
249                }
250                write(text.replace(/\x0B/g, "  \n") + "\n");
251                break;
252
253            case "Table":
254                if (!lastInTable) {
255                    tableColumnCount = range.tables.item(1).columns.count + 1;
256                    tableCellIndex = 0;
257                }
258                if (tableCellIndex < tableColumnCount) {
259                    columnAlignment[tableCellIndex] = p.alignment;
260                }
261                write("|" + text);
262                tableCellIndex++;
263                if (tableCellIndex % tableColumnCount === 0) {
264                    write("\n");
265                    if (tableCellIndex === tableColumnCount) {
266                        writeTableHeader();
267                    }
268                }
269                break;
270
271            case "TOC Heading":
272                write("## " + text + "\n\n");
273                break;
274
275            case "TOC":
276                const strings = text.split("\t");
277                write("        ".substr(0, level * 2 - 2) + "* [" + strings[0] + " " + strings[1] + "](#" + strings[0] + ")\n");
278                break;
279        }
280
281        if (sectionBreak) {
282            write("<br/>\n\n");
283        }
284        lastStyle = style;
285        lastInTable = inTable;
286    }
287
288    function writeDocument() {
289        const title = doc.builtInDocumentProperties.item(1) + "";
290        if (title.length) {
291            write("# " + title + "\n\n");
292        }
293        for (let p = doc.paragraphs.first; p; p = p.next()) {
294            writeParagraph(p);
295        }
296        writeBlockEnd();
297    }
298
299    findReplace("<", {}, "&lt;", {});
300    findReplace("&lt;", { style: "Code" }, "<", {});
301    findReplace("&lt;", { style: "Code Fragment" }, "<", {});
302    findReplace("&lt;", { style: "Terminal" }, "<", {});
303    findReplace("", { font: { subscript: true } }, "<sub>^&</sub>", { font: { subscript: false } });
304    findReplace("", { style: "Code Fragment" }, "`^&`", { style: -66 /* default font */ });
305    findReplace("", { style: "Production" }, "*^&*", { style: -66 /* default font */ });
306    findReplace("", { style: "Terminal" }, "`^&`", { style: -66 /* default font */ });
307    findReplace("", { font: { bold: true, italic: true } }, "***^&***", { font: { bold: false, italic: false } });
308    findReplace("", { font: { italic: true } }, "*^&*", { font: { italic: false } });
309
310    doc.fields.toggleShowCodes();
311    findReplace("^19 REF", {}, "[^&](#^&)", {});
312    doc.fields.toggleShowCodes();
313
314    fixHyperlinks();
315
316    writeDocument();
317
318    result = result.replace(/\x85/g, "\u2026");
319    result = result.replace(/\x96/g, "\u2013");
320    result = result.replace(/\x97/g, "\u2014");
321
322    return result;
323}
324
325/**
326 * @param {string[]} args
327 */
328function main(args) {
329    if (args.length !== 2) {
330        sys.write("Syntax: word2md <inputfile> <outputfile>\n");
331        return;
332    }
333
334    /** @type {Word.Application} */
335    const app = sys.createObject("Word.Application");
336    const doc = app.documents.open(args[0]);
337    sys.writeFile(args[1], convertDocumentToMarkdown(doc));
338    doc.close(/* saveChanges */ false);
339    app.quit();
340}
341
// Script entry point: run the conversion with the command-line arguments collected in sys.args.
main(sys.args);
343