/** * HTML2Markdown - An HTML to Markdown converter. * * This implementation uses HTML DOM parsing for conversion. Parsing code was * abstracted out in a parsing function which should be easy to remove in favor * of other parsing libraries. * * Converted MarkDown was tested with ShowDown library for HTML rendering. And * it tries to create MarkDown that does not confuse ShowDown when certain * combination of HTML tags come together. * * @author Himanshu Gilani * @author Kates Gasis (original author) * */ if (typeof require != "undefined") { var htmlparser = require("./htmldomparser"); var HTMLParser = htmlparser.HTMLParser; } /** * HTML2Markdown * @param html - html string to convert * @return converted markdown text */ function HTML2Markdown(html, opts) { var logging = false; var nodeList = []; var listTagStack = []; var linkAttrStack = []; var blockquoteStack = []; var preStack = []; var links = []; opts = opts || {}; var inlineStyle = opts['inlineStyle'] || false; var markdownTags = { "hr": "- - -\n\n", "br": " \n", "title": "# ", "h1": "# ", "h2": "## ", "h3": "### ", "h4": "#### ", "h5": "##### ", "h6": "###### ", "b": "**", "strong": "**", "i": "_", "em": "_", "dfn": "_", "var": "_", "cite": "_", "span": " ", "ul": "* ", "ol": "1. ", "dl": "- ", "blockquote": "> " }; function getListMarkdownTag() { var listItem = ""; if(listTagStack) { for ( var i = 0; i < listTagStack.length - 1; i++) { listItem += " "; } } listItem += peek(listTagStack); return listItem; } function convertAttrs(attrs) { var attributes = {}; for(var k in attrs) { var attr = attrs[k]; attributes[attr.name] = attr; } return attributes; } function peek(list) { if(list && list.length > 0) { return list.slice(-1)[0]; } return ""; } function peekTillNotEmpty(list) { if(!list) { return ""; } for(var i = list.length - 1; i>=0; i-- ){ if(list[i] != "") { return list[i]; } } return ""; } function removeIfEmptyTag(start) { var cleaned = false; if(start == peekTillNotEmpty(nodeList)) { while(peek(nodeList) != start) { nodeList.pop(); } nodeList.pop(); cleaned = true; } return cleaned; } function sliceText(start) { var text = []; while(nodeList.length > 0 && peek(nodeList) != start) { var t = nodeList.pop(); text.unshift(t); } return text.join(""); } function block(isEndBlock) { var lastItem = nodeList.pop(); if (!lastItem) { return; } if(!isEndBlock) { var block; if(/\s*\n\n\s*$/.test(lastItem)) { lastItem = lastItem.replace(/\s*\n\n\s*$/, "\n\n"); block = ""; } else if(/\s*\n\s*$/.test(lastItem)) { lastItem = lastItem.replace(/\s*\n\s*$/, "\n"); block = "\n"; } else if(/\s+$/.test(lastItem)) { block = "\n\n"; } else { block = "\n\n"; } nodeList.push(lastItem); nodeList.push(block); } else { nodeList.push(lastItem); if(!lastItem.endsWith("\n")) { nodeList.push("\n\n"); } } } function listBlock() { if(nodeList.length > 0) { var li = peek(nodeList); if(!li.endsWith("\n")) { nodeList.push("\n"); } } else { nodeList.push("\n"); } } try { var dom; if(html) { var e = document.createElement('div'); e.innerHTML = html; dom = e; } else { dom = window.document.body; } HTMLParser(dom,{ start: function(tag, attrs, unary) { tag = tag.toLowerCase(); if(logging) { console.log("start: "+ tag); } if(unary && (tag != "br" && tag != "hr" && tag != "img")) { return; } switch (tag) { case "br": nodeList.push(markdownTags[tag]); break; case "hr": block(); nodeList.push(markdownTags[tag]); break; case "title": case "h1": case "h2": case "h3": case "h4": case "h5": case "h6": block(); nodeList.push(markdownTags[tag]); break; case "b": case "strong": case "i": case "em": case "dfn": case "var": case "cite": nodeList.push(markdownTags[tag]); break; case "span": if(! /\s+$/.test(peek(nodeList))) { nodeList.push(markdownTags[tag]); } break; case "p": case "div": case "td": block(); break; case "ul": case "ol": case "dl": listTagStack.push(markdownTags[tag]); // lists are block elements if(listTagStack.length > 1) { listBlock(); } else { block(); } break; case "li": case "dt": var li = getListMarkdownTag(); nodeList.push(li); break; case "a": var attribs = convertAttrs(attrs); linkAttrStack.push(attribs); nodeList.push("["); break; case "img": var attribs = convertAttrs(attrs); var alt, title, url; attribs["src"] ? url = getNormalizedUrl(attribs["src"].value) : url = ""; if(!url) { break; } attribs['alt'] ? alt = attribs['alt'].value.trim() : alt = ""; attribs['title'] ? title = attribs['title'].value.trim() : title = ""; // if parent of image tag is nested in anchor tag use inline style if(!inlineStyle && !peekTillNotEmpty(nodeList).startsWith("[")) { var l = links.indexOf(url); if(l == -1) { links.push(url); l=links.length-1; } block(); nodeList.push("!["); if(alt!= "") { nodeList.push(alt); } else if (title != null) { nodeList.push(title); } nodeList.push("][" + l + "]"); block(); } else { //if image is not a link image then treat images as block elements if(!peekTillNotEmpty(nodeList).startsWith("[")) { block(); } nodeList.push("![" + alt + "](" + url + (title ? " \"" + title + "\"" : "") + ")"); if(!peekTillNotEmpty(nodeList).startsWith("[")) { block(true); } } break; case "blockquote": block(); blockquoteStack.push(markdownTags[tag]); nodeList.push(blockquoteStack.join("")); break; case "pre": case "code": block(); preStack.push(true); break; } }, chars: function(text) { if(preStack.length > 0) { text = " " + text.replace(/\n/g,"\n "); } else if(text.trim() != "") { text = text.replace(/\s+/g, " "); var prevText = peekTillNotEmpty(nodeList); if(/\s+$/.test(prevText)) { text = text.replace(/^\s+/g, ""); } } else { nodeList.push(""); return; } if(logging) { console.log("text: "+ text); } nodeList.push(text); }, end: function(tag) { tag = tag.toLowerCase(); if(logging) { console.log("end: "+ tag); } switch (tag) { case "title": case "h1": case "h2": case "h3": case "h4": case "h5": case "h6": if(!removeIfEmptyTag(markdownTags[tag])) { block(true); } break; case "p": case "div": case "td": while(nodeList.length > 0 && peek(nodeList).trim() == "") { nodeList.pop(); } block(true); break; case "b": case "strong": case "i": case "em": case "dfn": case "var": case "cite": if(!removeIfEmptyTag(markdownTags[tag])) { nodeList.push(sliceText(markdownTags[tag]).trim()); nodeList.push(markdownTags[tag]); } break; case "a": var text = sliceText("["); text = text.replace(/\s+/g, " "); text = text.trim(); if(text == "") { nodeList.pop(); break; } var attrs = linkAttrStack.pop(); var url; attrs["href"] && attrs["href"].value != "" ? url = getNormalizedUrl(attrs["href"].value) : url = ""; if(url == "") { nodeList.pop(); nodeList.push(text); break; } nodeList.push(text); if(!inlineStyle && !peek(nodeList).startsWith("!")){ var l = links.indexOf(url); if(l == -1) { links.push(url); l=links.length-1; } nodeList.push("][" + l + "]"); } else { if(peek(nodeList).startsWith("!")){ var text = nodeList.pop(); text = nodeList.pop() + text; block(); nodeList.push(text); } var title = attrs["title"]; nodeList.push("](" + url + (title ? " \"" + title.value.trim().replace(/\s+/g, " ") + "\"" : "") + ")"); if(peek(nodeList).startsWith("!")){ block(true); } } break; case "ul": case "ol": case "dl": listBlock(); listTagStack.pop(); break; case "li": case "dt": var li = getListMarkdownTag(); if(!removeIfEmptyTag(li)) { var text = sliceText(li).trim(); if(text.startsWith("[![")) { nodeList.pop(); block(); nodeList.push(text); block(true); } else { nodeList.push(text); listBlock(); } } break; case "blockquote": blockquoteStack.pop(); break; case "pre": case "code": block(true); preStack.pop(); break; case "span": if(peek(nodeList).trim() == "") { nodeList.pop(); if(peek(nodeList) == " ") { nodeList.pop(); } else { nodeList.push(markdownTags[tag]); } } else { var text = nodeList.pop(); nodeList.push(text.trim()); nodeList.push(markdownTags[tag]); } break; case "br": case "hr": case "img": case "table": case "tr": break; } } }, {"nodesToIgnore": ["script", "noscript", "object", "iframe", "frame", "head", "style", "label"]}); if(!inlineStyle) { for ( var i = 0; i < links.length; i++) { if(i == 0) { var lastItem = nodeList.pop(); nodeList.push(lastItem.replace(/\s+$/g, "")); nodeList.push("\n\n[" + i + "]: " + links[i]); } else { nodeList.push("\n[" + i + "]: " + links[i]); } } } } catch(e) { console.log(e.stack); console.trace(); } return nodeList.join(""); } function getNormalizedUrl(s) { var urlBase = location.href; var urlDir = urlBase.replace(/\/[^\/]*$/, '/'); var urlPage = urlBase.replace(/#[^\/#]*$/, ''); var url; if(/^[a-zA-Z]([a-zA-Z0-9 -.])*:/.test(s)) { // already absolute url url = s; } else if(/^\x2f/.test(s)) {// %2f --> / // url is relative to site location.protocol != "" ? url = location.protocol + "//" : url =""; url+= location.hostname; if(location.port != "80") { url+=":"+location.port; } url += s; } else if(/^#/.test(s)) { // url is relative to page url = urlPage + s; } else { url = urlDir + s; } return encodeURI(url); } if (typeof exports != "undefined") { exports.HTML2Markdown = HTML2Markdown; } if (typeof exports != "undefined") { exports.HTML2MarkDown = HTML2MarkDown; } /* add the useful functions to String object*/ if (typeof String.prototype.trim != 'function') { String.prototype.trim = function() { return replace(/^\s+|\s+$/g,""); }; } if (typeof String.prototype.isNotEmpty != 'function') { String.prototype.isNotEmpty = function() { if (/\S/.test(this)) { return true; } else { return false; } }; } if (typeof String.prototype.replaceAll != 'function') { String.prototype.replaceAll = function(stringToFind,stringToReplace){ var temp = this; var index = temp.indexOf(stringToFind); while(index != -1){ temp = temp.replace(stringToFind,stringToReplace); index = temp.indexOf(stringToFind); } return temp; }; } if (typeof String.prototype.startsWith != 'function') { String.prototype.startsWith = function(str) { return this.indexOf(str) == 0; }; } if (typeof String.prototype.endsWith != 'function') { String.prototype.endsWith = function(suffix) { return this.match(suffix+"$") == suffix; }; } if (typeof Array.prototype.indexOf != 'function') { Array.prototype.indexOf = function(obj, fromIndex) { if (fromIndex == null) { fromIndex = 0; } else if (fromIndex < 0) { fromIndex = Math.max(0, this.length + fromIndex); } for ( var i = fromIndex, j = this.length; i < j; i++) { if (this[i] === obj) return i; } return -1; }; }