// html2markdown.js
  /**
   * HTML2Markdown - An HTML to Markdown converter.
   *
   * This implementation uses HTML DOM parsing for conversion. The parsing code
   * is abstracted into a separate parsing function, so it should be easy to
   * replace with another parsing library.
   *
   * The converted Markdown was tested with the ShowDown library for HTML
   * rendering, and the converter tries to produce Markdown that does not
   * confuse ShowDown when certain combinations of HTML tags occur together.
   *
   * @author Himanshu Gilani
   * @author Kates Gasis (original author)
   *
   */
  16. if (typeof require != "undefined") {
  17. var htmlparser = require("./htmldomparser");
  18. var HTMLParser = htmlparser.HTMLParser;
  19. }
  /**
   * HTML2Markdown - convert HTML to Markdown text.
   *
   * The markup is handed to HTMLParser together with start / chars / end
   * callbacks. The callbacks append Markdown fragments to an internal buffer
   * that is joined into the result at the end.
   *
   * @param html - html string to convert; when falsy, window.document.body is
   *               converted instead
   * @param opts - optional settings object. opts.inlineStyle = true emits
   *               links and images inline ("[text](url)"); otherwise numbered
   *               reference definitions are appended after the text.
   * @return converted markdown text. If conversion throws, the error is logged
   *         and whatever was converted so far is returned.
   */
  function HTML2Markdown(html, opts) {
    var logging = false;
    // Output fragments, joined with "" to build the result. Whitespace-only
    // text nodes are stored as "" placeholders.
    var nodeList = [];
    // Markdown markers of the lists that are currently open (innermost last).
    var listTagStack = [];
    // Attribute maps of the anchors that are currently open.
    var linkAttrStack = [];
    var blockquoteStack = [];
    // One entry per open <pre>/<code>; non-empty means "inside code".
    var preStack = [];
    // Targets of reference-style links; the array index is the reference id.
    var links = [];

    opts = opts || {};
    var inlineStyle = opts['inlineStyle'] || false;

    var markdownTags = {
      "hr": "- - -\n\n",
      // A Markdown hard line break needs two trailing spaces before the newline.
      "br": "  \n",
      "title": "# ",
      "h1": "# ",
      "h2": "## ",
      "h3": "### ",
      "h4": "#### ",
      "h5": "##### ",
      "h6": "###### ",
      "b": "**",
      "strong": "**",
      "i": "_",
      "em": "_",
      "dfn": "_",
      "var": "_",
      "cite": "_",
      "span": " ",
      "ul": "* ",
      "ol": "1. ",
      "dl": "- ",
      "blockquote": "> "
    };

    // Marker for an item of the innermost open list, prefixed with one space
    // per enclosing list.
    function getListMarkdownTag() {
      var listItem = "";
      for (var i = 0; i < listTagStack.length - 1; i++) {
        listItem += " ";
      }
      return listItem + peek(listTagStack);
    }

    // Index the parser's attribute collection by attribute name. Entries that
    // are not attribute-like objects (null, numbers, methods picked up by
    // for-in) are skipped instead of throwing or polluting the map.
    function convertAttrs(attrs) {
      var attributes = {};
      for (var k in attrs) {
        var attr = attrs[k];
        if (attr && typeof attr === "object" && attr.name != null) {
          attributes[attr.name] = attr;
        }
      }
      return attributes;
    }

    // Last element of list, or "" when the list is missing or empty.
    function peek(list) {
      if (list && list.length > 0) {
        return list[list.length - 1];
      }
      return "";
    }

    // Last element of list that is not "" (loose comparison), or "".
    function peekTillNotEmpty(list) {
      if (!list) {
        return "";
      }
      for (var i = list.length - 1; i >= 0; i--) {
        if (list[i] != "") {
          return list[i];
        }
      }
      return "";
    }

    // If nothing but "" placeholders follows the opening marker `start`, drop
    // the marker (and the placeholders) and report true: the tag was empty.
    function removeIfEmptyTag(start) {
      if (start != peekTillNotEmpty(nodeList)) {
        return false;
      }
      while (peek(nodeList) != start) {
        nodeList.pop();
      }
      nodeList.pop();
      return true;
    }

    // Remove and concatenate every fragment after the last `start` marker.
    // The marker itself stays in the buffer.
    function sliceText(start) {
      var text = [];
      while (nodeList.length > 0 && peek(nodeList) != start) {
        text.unshift(nodeList.pop());
      }
      return text.join("");
    }

    // Ensure the buffer ends with a block separator.
    // isEndBlock falsy: called before a block element opens; trailing
    // whitespace of the previous fragment is normalised to at most one blank
    // line. isEndBlock true: called after a block element closes.
    function block(isEndBlock) {
      var lastItem = nodeList.pop();
      // Skip "" placeholders left by whitespace-only text nodes; otherwise a
      // line break between two elements would suppress the separator.
      while (lastItem === "" && nodeList.length > 0) {
        lastItem = nodeList.pop();
      }
      if (!lastItem) {
        return;
      }
      if (isEndBlock) {
        nodeList.push(lastItem);
        if (!lastItem.endsWith("\n")) {
          nodeList.push("\n\n");
        }
        return;
      }
      var separator = "\n\n";
      if (/\s*\n\n\s*$/.test(lastItem)) {
        lastItem = lastItem.replace(/\s*\n\n\s*$/, "\n\n");
        separator = "";
      } else if (/\s*\n\s*$/.test(lastItem)) {
        lastItem = lastItem.replace(/\s*\n\s*$/, "\n");
        separator = "\n";
      }
      nodeList.push(lastItem);
      nodeList.push(separator);
    }

    // Ensure the buffer ends with a newline (single line break inside lists).
    function listBlock() {
      if (nodeList.length === 0 || !peek(nodeList).endsWith("\n")) {
        nodeList.push("\n");
      }
    }

    // Index of url in the reference table, adding it when not yet present.
    function referenceIndex(url) {
      var index = links.indexOf(url);
      if (index == -1) {
        links.push(url);
        index = links.length - 1;
      }
      return index;
    }

    try {
      var dom;
      if (html) {
        var container = document.createElement('div');
        container.innerHTML = html;
        dom = container;
      } else {
        dom = window.document.body;
      }

      HTMLParser(dom, {
        start: function(tag, attrs, unary) {
          tag = tag.toLowerCase();
          if (logging) {
            console.log("start: " + tag);
          }
          // Only these self-closing tags produce output.
          if (unary && (tag != "br" && tag != "hr" && tag != "img")) {
            return;
          }
          switch (tag) {
          case "br":
            nodeList.push(markdownTags[tag]);
            break;
          case "hr":
            block();
            nodeList.push(markdownTags[tag]);
            break;
          case "title":
          case "h1":
          case "h2":
          case "h3":
          case "h4":
          case "h5":
          case "h6":
            block();
            nodeList.push(markdownTags[tag]);
            break;
          case "b":
          case "strong":
          case "i":
          case "em":
          case "dfn":
          case "var":
          case "cite":
            nodeList.push(markdownTags[tag]);
            break;
          case "span":
            // Separate the span from preceding text unless already separated.
            if (!/\s+$/.test(peek(nodeList))) {
              nodeList.push(markdownTags[tag]);
            }
            break;
          case "p":
          case "div":
          case "td":
            block();
            break;
          case "ul":
          case "ol":
          case "dl":
            listTagStack.push(markdownTags[tag]);
            // lists are block elements; nested lists only need a line break
            if (listTagStack.length > 1) {
              listBlock();
            } else {
              block();
            }
            break;
          case "li":
          case "dt":
            nodeList.push(getListMarkdownTag());
            break;
          case "a":
            linkAttrStack.push(convertAttrs(attrs));
            nodeList.push("[");
            break;
          case "img":
            var imgAttrs = convertAttrs(attrs);
            var src = imgAttrs["src"] ? getNormalizedUrl(imgAttrs["src"].value) : "";
            if (!src) {
              break;
            }
            var alt = imgAttrs["alt"] ? imgAttrs["alt"].value.trim() : "";
            var imgTitle = imgAttrs["title"] ? imgAttrs["title"].value.trim() : "";
            // The image directly follows a link opening "[": decided once,
            // before the image itself is pushed and becomes the last fragment.
            var insideLink = peekTillNotEmpty(nodeList).startsWith("[");
            if (!inlineStyle && !insideLink) {
              var imgRef = referenceIndex(src);
              block();
              nodeList.push("![");
              nodeList.push(alt || imgTitle);
              nodeList.push("][" + imgRef + "]");
              block();
            } else {
              // an image nested in an anchor always uses inline style; an
              // image that is not a link image is treated as a block element
              if (!insideLink) {
                block();
              }
              nodeList.push("![" + alt + "](" + src + (imgTitle ? " \"" + imgTitle + "\"" : "") + ")");
              if (!insideLink) {
                block(true);
              }
            }
            break;
          case "blockquote":
            block();
            blockquoteStack.push(markdownTags[tag]);
            nodeList.push(blockquoteStack.join(""));
            break;
          case "pre":
          case "code":
            block();
            preStack.push(true);
            break;
          }
        },
        chars: function(text) {
          if (preStack.length > 0) {
            // Markdown code blocks are indented by four spaces on every line.
            text = "    " + text.replace(/\n/g, "\n    ");
          } else if (text.trim() != "") {
            text = text.replace(/\s+/g, " ");
            var prevText = peekTillNotEmpty(nodeList);
            // Avoid doubled whitespace across fragment boundaries.
            if (/\s+$/.test(prevText)) {
              text = text.replace(/^\s+/g, "");
            }
          } else {
            // Whitespace-only text is kept as a "" placeholder.
            nodeList.push("");
            return;
          }
          if (logging) {
            console.log("text: " + text);
          }
          nodeList.push(text);
        },
        end: function(tag) {
          tag = tag.toLowerCase();
          if (logging) {
            console.log("end: " + tag);
          }
          switch (tag) {
          case "title":
          case "h1":
          case "h2":
          case "h3":
          case "h4":
          case "h5":
          case "h6":
            if (!removeIfEmptyTag(markdownTags[tag])) {
              block(true);
            }
            break;
          case "p":
          case "div":
          case "td":
            // Drop trailing blank fragments before closing the block.
            while (nodeList.length > 0 && peek(nodeList).trim() == "") {
              nodeList.pop();
            }
            block(true);
            break;
          case "b":
          case "strong":
          case "i":
          case "em":
          case "dfn":
          case "var":
          case "cite":
            if (!removeIfEmptyTag(markdownTags[tag])) {
              nodeList.push(sliceText(markdownTags[tag]).trim());
              nodeList.push(markdownTags[tag]);
            }
            break;
          case "a":
            var linkText = sliceText("[").replace(/\s+/g, " ").trim();
            // Pop unconditionally so the stack stays aligned with the open
            // anchors even when this link turns out to be empty.
            var linkAttrs = linkAttrStack.pop() || {};
            if (linkText == "") {
              nodeList.pop(); // discard the "[" marker
              break;
            }
            var href = "";
            if (linkAttrs["href"] && linkAttrs["href"].value != "") {
              href = getNormalizedUrl(linkAttrs["href"].value);
            }
            if (href == "") {
              // No target: keep the text, drop the "[" marker.
              nodeList.pop();
              nodeList.push(linkText);
              break;
            }
            nodeList.push(linkText);
            var isImageLink = linkText.startsWith("!");
            if (!inlineStyle && !isImageLink) {
              nodeList.push("][" + referenceIndex(href) + "]");
            } else {
              if (isImageLink) {
                // Re-emit "[" + image as one fragment, as a block element.
                var imageText = nodeList.pop();
                imageText = nodeList.pop() + imageText;
                block();
                nodeList.push(imageText);
              }
              var linkTitle = linkAttrs["title"];
              nodeList.push("](" + href + (linkTitle ? " \"" + linkTitle.value.trim().replace(/\s+/g, " ") + "\"" : "") + ")");
              if (isImageLink) {
                block(true);
              }
            }
            break;
          case "ul":
          case "ol":
          case "dl":
            listBlock();
            listTagStack.pop();
            break;
          case "li":
          case "dt":
            var marker = getListMarkdownTag();
            if (!removeIfEmptyTag(marker)) {
              var itemText = sliceText(marker).trim();
              if (itemText.startsWith("[![")) {
                // A linked image is emitted as a block, without the marker.
                nodeList.pop();
                block();
                nodeList.push(itemText);
                block(true);
              } else {
                nodeList.push(itemText);
                listBlock();
              }
            }
            break;
          case "blockquote":
            blockquoteStack.pop();
            break;
          case "pre":
          case "code":
            block(true);
            preStack.pop();
            break;
          case "span":
            if (peek(nodeList).trim() == "") {
              nodeList.pop();
              if (peek(nodeList) == " ") {
                // The span was empty: remove its leading separator as well.
                nodeList.pop();
              } else {
                nodeList.push(markdownTags[tag]);
              }
            } else {
              var spanText = nodeList.pop();
              nodeList.push(spanText.trim());
              nodeList.push(markdownTags[tag]);
            }
            break;
          case "br":
          case "hr":
          case "img":
          case "table":
          case "tr":
            break;
          }
        }
      }, {"nodesToIgnore": ["script", "noscript", "object", "iframe", "frame", "head", "style", "label"]});

      // Append the reference definitions for reference-style links/images.
      if (!inlineStyle) {
        for (var i = 0; i < links.length; i++) {
          if (i == 0) {
            var lastItem = nodeList.pop() || "";
            nodeList.push(lastItem.replace(/\s+$/g, ""));
            nodeList.push("\n\n[" + i + "]: " + links[i]);
          } else {
            nodeList.push("\n[" + i + "]: " + links[i]);
          }
        }
      }
    } catch (e) {
      // Best effort: log the failure and return what was converted so far.
      console.log(e.stack);
      console.trace();
    }
    return nodeList.join("");
  }
  /**
   * Turn a URL taken from the document into an absolute URL, resolved against
   * the global `location` object.
   *
   * - URLs that already start with a scheme ("http:", "mailto:", ...) are kept.
   * - Protocol-relative URLs ("//host/path") get the current protocol.
   * - Site-relative URLs ("/path") get the current protocol, host and port.
   * - Fragment URLs ("#id") are appended to the current page URL.
   * - Anything else is resolved against the directory of the current page.
   *
   * @param s - URL string as found in an href/src attribute
   * @return the absolute URL, passed through encodeURI
   */
  function getNormalizedUrl(s) {
    var urlBase = location.href;
    // Directory of the current page; query string and fragment are removed
    // first so a "/" inside them cannot be mistaken for a path separator.
    var urlDir = urlBase.replace(/[?#].*$/, '').replace(/\/[^\/]*$/, '/');
    // Current page without its fragment.
    var urlPage = urlBase.replace(/#.*$/, '');
    var url;

    if (/^[a-zA-Z][a-zA-Z0-9+.\-]*:/.test(s)) {
      // already absolute url (scheme = letter followed by letters, digits, "+", "." or "-")
      url = s;
    } else if (/^\/\//.test(s)) {
      // protocol-relative url: only the scheme is missing
      url = location.protocol + s;
    } else if (/^\//.test(s)) {
      // url is relative to site
      url = location.protocol != "" ? location.protocol + "//" : "";
      url += location.hostname;
      // location.port is "" when the default port is used; never emit "host:"
      if (location.port && location.port != "80") {
        url += ":" + location.port;
      }
      url += s;
    } else if (/^#/.test(s)) {
      // url is relative to page
      url = urlPage + s;
    } else {
      // url is relative to the current directory
      url = urlDir + s;
    }
    return encodeURI(url);
  }
  // CommonJS export. `HTML2MarkDown` (capital D) is exposed as an alias of the
  // same function; the converter itself is declared as `HTML2Markdown`, so the
  // alias must reference that name.
  if (typeof exports != "undefined") {
    exports.HTML2Markdown = HTML2Markdown;
    exports.HTML2MarkDown = HTML2Markdown;
  }
  466. /* add the useful functions to String object*/
  // Fallback for engines without a native String.prototype.trim: strips
  // leading and trailing whitespace from the receiver.
  if (typeof String.prototype.trim != 'function') {
    String.prototype.trim = function() {
      // `replace` must be called on the string itself; a bare `replace(...)`
      // would look up a global function and throw.
      return this.replace(/^\s+|\s+$/g, "");
    };
  }
  // Adds String#isNotEmpty when it is not already defined: true when the
  // string contains at least one non-whitespace character.
  if (typeof String.prototype.isNotEmpty != 'function') {
    String.prototype.isNotEmpty = function() {
      return /\S/.test(this);
    };
  }
  // Fallback for engines without a native String.prototype.replaceAll:
  // replaces every occurrence of the literal stringToFind.
  if (typeof String.prototype.replaceAll != 'function') {
    String.prototype.replaceAll = function(stringToFind, stringToReplace) {
      var source = String(this);
      var needle = String(stringToFind);
      // An empty search string has nothing to replace in this fallback; a
      // repeated indexOf/replace loop would never terminate on it.
      if (needle === "") {
        return source;
      }
      // split/join visits each occurrence exactly once, so a replacement that
      // itself contains the search string cannot cause an endless loop.
      return source.split(needle).join(stringToReplace);
    };
  }
  // Fallback for engines without a native String.prototype.startsWith.
  if (typeof String.prototype.startsWith != 'function') {
    String.prototype.startsWith = function(str) {
      // Searching backwards from index 0 can only ever match at index 0.
      return this.lastIndexOf(str, 0) == 0;
    };
  }
  // Fallback for engines without a native String.prototype.endsWith.
  if (typeof String.prototype.endsWith != 'function') {
    String.prototype.endsWith = function(suffix) {
      var source = String(this);
      var tail = String(suffix);
      // Compare literally: building a regular expression from the suffix
      // would throw or mismatch on characters such as "(", "[" or ".".
      var start = source.length - tail.length;
      return start >= 0 && source.indexOf(tail, start) === start;
    };
  }
  // Fallback for engines without a native Array.prototype.indexOf: returns the
  // first index at or after fromIndex whose element is strictly equal to obj,
  // or -1. A negative fromIndex counts back from the end of the array.
  if (typeof Array.prototype.indexOf != 'function') {
    Array.prototype.indexOf = function(obj, fromIndex) {
      var pos;
      if (fromIndex == null) {
        pos = 0;
      } else if (fromIndex < 0) {
        pos = Math.max(0, this.length + fromIndex);
      } else {
        pos = fromIndex;
      }
      var total = this.length;
      while (pos < total) {
        if (this[pos] === obj)
          return pos;
        pos++;
      }
      return -1;
    };
  }
  515. }