// word2md - Word to Markdown conversion tool
//
// word2md converts a Microsoft Word document to Markdown formatted text. The tool uses the
// Word Automation APIs to start an instance of Word and access the contents of the document
// being converted. The tool must be run using the cscript.exe script host and requires Word
// to be installed on the target machine. The name of the document to convert and the name of
// the Markdown file to produce must be specified as command line arguments; the resulting
// Markdown is written to the output file as UTF-8 without a byte order mark. The tool
// recognizes the specific Word styles used in the TypeScript Language Specification.

/// <reference lib="scripthost" />
// eslint-disable-next-line @typescript-eslint/triple-slash-reference
/// <reference path="./word.d.ts" />

/**
 * Script-host services used by the tool: command line arguments, COM object
 * creation, console output and UTF-8 file output.
 *
 * @type {{
    args: string[];
    createObject: (typeName: string) => any;
    write(s: string): void;
    writeFile: (fileName: string, data: string) => void;
}} */
const sys = (() => {
    // Two ADODB streams are shared by every writeFile call: the text stream encodes
    // the characters, the binary stream receives the encoded bytes minus the BOM.
    const fileStream = new ActiveXObject("ADODB.Stream");
    fileStream.Type = 2 /* text */;
    const binaryStream = new ActiveXObject("ADODB.Stream");
    binaryStream.Type = 1 /* binary */;

    // Copy the WScript argument collection into a plain array.
    const args = [];
    for (let i = 0; i < WScript.Arguments.length; i++) {
        args[i] = WScript.Arguments.Item(i);
    }

    return {
        args,
        createObject: (typeName) => new ActiveXObject(typeName),
        write(s) {
            WScript.StdOut.Write(s);
        },
        writeFile: (fileName, data) => {
            // Each stream is closed by its own finally block, so a failure while
            // opening the second stream cannot leave the first one open.
            fileStream.Open();
            try {
                binaryStream.Open();
                try {
                    // Write characters in UTF-8 encoding
                    fileStream.Charset = "utf-8";
                    fileStream.WriteText(data);
                    // We don't want the BOM, skip it by setting the starting location to 3 (size of BOM).
                    fileStream.Position = 3;
                    fileStream.CopyTo(binaryStream);
                    binaryStream.SaveToFile(fileName, 2 /*overwrite*/);
                }
                finally {
                    binaryStream.Close();
                }
            }
            finally {
                fileStream.Close();
            }
        },
    };
})();

/** @typedef {{
    style?: any;
    font?: {
        bold?: boolean;
        italic?: boolean;
        subscript?: boolean;
    };
}} FindReplaceOptions */

/**
 * Converts an open Word document to Markdown text.
 *
 * The document is modified in place (find/replace passes and hyperlink rewriting)
 * before its paragraphs are walked, so the caller is expected to close it without
 * saving.
 *
 * @param {Word.Document} doc
 * @returns {string}
 */
function convertDocumentToMarkdown(doc) {
    /**
     * Paragraph alignment of each cell in the first row of the current table;
     * writeTableHeader maps 1 to centered and 2 to right-aligned columns.
     * @type {number[]}
     */
    const columnAlignment = [];
    let tableColumnCount = 0;
    let tableCellIndex = 0;
    let lastInTable = false;
    let lastStyle = "";
    let result = "";

    /**
     * Recursively copies the properties of `properties` onto `target`. Nested
     * objects are applied to the corresponding nested object of the target.
     *
     * @param {any} target
     * @param {any} properties
     */
    function setProperties(target, properties) {
        for (const name in properties) {
            if (Object.prototype.hasOwnProperty.call(properties, name)) {
                const value = properties[name];
                if (typeof value === "object") {
                    setProperties(target[name], value);
                }
                else {
                    target[name] = value;
                }
            }
        }
    }

    /**
     * Replaces every occurrence of `findText` that matches `findOptions` with
     * `replaceText`, applying `replaceOptions` to the replacement.
     *
     * @param {string} findText
     * @param {FindReplaceOptions} findOptions
     * @param {string} replaceText
     * @param {FindReplaceOptions} replaceOptions
     */
    function findReplace(findText, findOptions, replaceText, replaceOptions) {
        const find = doc.range().find;
        find.clearFormatting();
        setProperties(find, findOptions);
        const replace = find.replacement;
        replace.clearFormatting();
        setProperties(replace, replaceOptions);
        find.execute(
            findText,
            /* matchCase */ false,
            /* matchWholeWord */ false,
            /* matchWildcards */ false,
            /* matchSoundsLike */ false,
            /* matchAllWordForms */ false,
            /* forward */ true,
            /* wrap: wdFindStop */ 0,
            /* format */ true,
            replaceText,
            /* replace: wdReplaceAll */ 2
        );
    }

    /**
     * Rewrites the display text of every hyperlink that has an address as a
     * Markdown link: `[text](address)`.
     */
    function fixHyperlinks() {
        const count = doc.hyperlinks.count;
        // The hyperlinks collection is 1-based.
        for (let index = 1; index <= count; index++) {
            const hyperlink = doc.hyperlinks.item(index);
            const address = hyperlink.address;
            if (address && address.length > 0) {
                const textToDisplay = hyperlink.textToDisplay;
                hyperlink.textToDisplay = "[" + textToDisplay + "](" + address + ")";
            }
        }
    }

    /**
     * Appends text to the Markdown result.
     *
     * @param {string} s
     */
    function write(s) {
        result += s;
    }

    /**
     * Returns a string of `width` spaces (empty when `width` is zero or negative).
     * Used to indent nested list and table-of-contents items.
     *
     * @param {number} width
     * @returns {string}
     */
    function indent(width) {
        let pad = "";
        for (let i = 0; i < width; i++) {
            pad += " ";
        }
        return pad;
    }

    /**
     * Writes the Markdown separator row that follows the first row of a table,
     * using the alignment recorded for each column.
     */
    function writeTableHeader() {
        for (let i = 0; i < tableColumnCount - 1; i++) {
            switch (columnAlignment[i]) {
                case 1:
                    write("|:---:");
                    break;
                case 2:
                    write("|---:");
                    break;
                default:
                    write("|---");
            }
        }
        write("|\n");
    }

    /**
     * Removes trailing control characters (code points below 0x20) from `text`.
     *
     * @param {string} text
     * @returns {string}
     */
    function trimEndFormattingMarks(text) {
        let end = text.length;
        while (end > 0 && text.charCodeAt(end - 1) < 0x20) end--;
        return text.slice(0, end);
    }

    /**
     * Terminates the block started by the previous paragraph style: closes an
     * open code fence, or adds the blank line that ends a list, table or TOC.
     */
    function writeBlockEnd() {
        switch (lastStyle) {
            case "Code":
                write("```\n\n");
                break;
            case "List Paragraph":
            case "Table":
            case "TOC":
                write("\n");
                break;
        }
    }

    /**
     * Writes the Markdown for a single paragraph, chosen by the paragraph's
     * style name (or "Table" when the paragraph lies inside a table).
     *
     * @param {Word.Paragraph} p
     */
    function writeParagraph(p) {
        const range = p.range;
        const rawText = range.text;
        const inTable = range.tables.count > 0;
        // A form feed in the paragraph text is treated as a section break.
        const sectionBreak = rawText.indexOf("\x0C") >= 0;

        let level = 1;
        let style = p.style.nameLocal;
        let text = trimEndFormattingMarks(rawText);

        if (text === "/") {
            // An inline image shows up in the text as a "/". When we see a paragraph
            // consisting of nothing but "/", we check to see if the paragraph contains
            // hidden text and, if so, emit that instead. The hidden text is assumed to
            // contain an appropriate markdown image link.
            range.textRetrievalMode.includeHiddenText = true;
            const fullText = range.text;
            range.textRetrievalMode.includeHiddenText = false;
            if (text !== fullText) {
                // Two em spaces (U+2003) of indentation, then the hidden text without the "/".
                text = "\u2003\u2003" + fullText.slice(1);
            }
        }

        if (inTable) {
            style = "Table";
        }
        else if (/\s\d$/.test(style)) {
            // Styles such as "Heading 2" or "TOC 3": split off the trailing level digit.
            level = +style.slice(-1);
            style = style.slice(0, -2);
        }
        if (lastStyle && style !== lastStyle) {
            writeBlockEnd();
        }

        switch (style) {
            case "Heading":
            case "Appendix": {
                const section = range.listFormat.listString;
                write("####".slice(0, level) + ' <a name="' + section + '"/>' + section + " " + text + "\n\n");
                break;
            }

            case "Normal":
                if (text.length) {
                    write(text + "\n\n");
                }
                break;

            case "List Paragraph":
                // Two spaces of indentation per nesting level below the first.
                write(indent(range.listFormat.listLevelNumber * 2 - 2) + "* " + text + "\n");
                break;

            case "Grammar":
                // Runs of three whitespace characters become an em space (U+2003). A vertical
                // tab (Word's manual line break) becomes a Markdown hard line break, which
                // needs two trailing spaces, followed by three em spaces of indentation.
                write(
                    "\u2003\u2003" +
                        text.replace(/\s\s\s/g, "\u2003").replace(/\x0B/g, "  \n\u2003\u2003\u2003") +
                        "\n\n"
                );
                break;

            case "Code":
                // Consecutive Code paragraphs share one fence, separated by a blank line.
                write(lastStyle !== "Code" ? "```TypeScript\n" : "\n");
                write(text.replace(/\x0B/g, " \n") + "\n");
                break;

            case "Table":
                if (!lastInTable) {
                    // First paragraph of a new table.
                    // NOTE(review): the "+ 1" presumably accounts for the end-of-row
                    // paragraph Word reports after the last cell of each row - confirm.
                    tableColumnCount = range.tables.item(1).columns.count + 1;
                    tableCellIndex = 0;
                }
                if (tableCellIndex < tableColumnCount) {
                    // Still in the first row: remember the alignment for the separator row.
                    columnAlignment[tableCellIndex] = p.alignment;
                }
                write("|" + text);
                tableCellIndex++;
                if (tableCellIndex % tableColumnCount === 0) {
                    write("\n");
                    if (tableCellIndex === tableColumnCount) {
                        writeTableHeader();
                    }
                }
                break;

            case "TOC Heading":
                write("## " + text + "\n\n");
                break;

            case "TOC": {
                // The first two tab-separated fields are used as section number and title.
                const strings = text.split("\t");
                write(indent(level * 2 - 2) + "* [" + strings[0] + " " + strings[1] + "](#" + strings[0] + ")\n");
                break;
            }
        }

        if (sectionBreak) {
            write("<br/>\n\n");
        }
        lastStyle = style;
        lastInTable = inTable;
    }

    /**
     * Writes the document title (built-in document property 1) followed by every
     * paragraph of the document, then terminates the last open block.
     */
    function writeDocument() {
        const title = doc.builtInDocumentProperties.item(1) + "";
        if (title.length) {
            write("# " + title + "\n\n");
        }
        for (let p = doc.paragraphs.first; p; p = p.next()) {
            writeParagraph(p);
        }
        writeBlockEnd();
    }

    // Escape "<" everywhere so document text cannot be mistaken for HTML, then restore
    // it inside the code styles, where the text is emitted verbatim. This must happen
    // before the passes below insert real tags such as <sub>.
    findReplace("<", {}, "&lt;", {});
    findReplace("&lt;", { style: "Code" }, "<", {});
    findReplace("&lt;", { style: "Code Fragment" }, "<", {});
    findReplace("&lt;", { style: "Terminal" }, "<", {});

    // Turn character formatting into Markdown/HTML markup ("^&" is the matched text).
    findReplace("", { font: { subscript: true } }, "<sub>^&</sub>", { font: { subscript: false } });
    findReplace("", { style: "Code Fragment" }, "`^&`", { style: -66 /* default font */ });
    findReplace("", { style: "Production" }, "*^&*", { style: -66 /* default font */ });
    findReplace("", { style: "Terminal" }, "`^&`", { style: -66 /* default font */ });
    findReplace("", { font: { bold: true, italic: true } }, "***^&***", { font: { bold: false, italic: false } });
    findReplace("", { font: { italic: true } }, "*^&*", { font: { italic: false } });

    // With field codes shown, rewrite REF fields as links to the referenced section.
    doc.fields.toggleShowCodes();
    findReplace("^19 REF", {}, "[^&](#^&)", {});
    doc.fields.toggleShowCodes();

    fixHyperlinks();

    writeDocument();

    // Map Windows-1252 code points to the corresponding Unicode punctuation:
    // horizontal ellipsis, en dash and em dash.
    result = result.replace(/\x85/g, "\u2026");
    result = result.replace(/\x96/g, "\u2013");
    result = result.replace(/\x97/g, "\u2014");

    return result;
}

/**
 * Entry point: converts the Word document named by `args[0]` and writes the
 * Markdown to the file named by `args[1]`. Prints a usage message when the
 * argument count is wrong.
 *
 * @param {string[]} args
 */
function main(args) {
    if (args.length !== 2) {
        sys.write("Syntax: word2md <inputfile> <outputfile>\n");
        return;
    }

    /** @type {Word.Application} */
    const app = sys.createObject("Word.Application");
    try {
        const doc = app.documents.open(args[0]);
        try {
            sys.writeFile(args[1], convertDocumentToMarkdown(doc));
        }
        finally {
            // The conversion edits the document in place; never save those edits.
            doc.close(/* saveChanges */ false);
        }
    }
    finally {
        // Always shut Word down, even when the conversion fails, so that no
        // Word process is left running.
        app.quit();
    }
}

main(sys.args);