123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568 |
/**
 * HTML2Markdown - An HTML to Markdown converter.
 *
 * This implementation uses HTML DOM parsing for conversion. The parsing code
 * is abstracted into a separate parsing function, so it should be easy to
 * replace with another parsing library.
 *
 * The generated Markdown was tested by rendering it back to HTML with the
 * Showdown library. The converter tries to emit Markdown that does not confuse
 * Showdown when certain combinations of HTML tags occur together.
 *
 * @author Himanshu Gilani
 * @author Kates Gasis (original author)
 *
 */
- if (typeof require != "undefined") {
- var htmlparser = require("./htmldomparser");
- var HTMLParser = htmlparser.HTMLParser;
- }
/**
 * HTML2Markdown - converts HTML to Markdown text.
 *
 * The HTML is loaded into a detached <div> and walked by the external
 * HTMLParser(dom, handler, options) function, which is expected to call
 * handler.start(tag, attrs, unary), handler.chars(text) and handler.end(tag).
 * Output fragments are collected in `nodeList` and joined at the end.
 *
 * Any exception raised during conversion is logged and swallowed; whatever
 * was collected up to that point is returned (best-effort behaviour).
 *
 * @param html - html string to convert; when falsy, window.document.body is
 *               converted instead
 * @param opts - optional settings object; a truthy opts.inlineStyle emits
 *               inline links "[text](url)" instead of reference links
 *               "[text][n]" followed by a "[n]: url" list
 * @return converted markdown text
 */
function HTML2Markdown(html, opts) {
  var logging = false;
  // Output fragments in document order. Opening markers ("**", "[", list
  // bullets, ...) are pushed as their own entries so that the matching end
  // handler can find them again and slice out everything emitted in between.
  var nodeList = [];
  var listTagStack = [];    // bullet of every list that is currently open
  var linkAttrStack = [];   // attribute maps of every <a> that is currently open
  var blockquoteStack = []; // one "> " per open <blockquote>
  var preStack = [];        // non-empty while inside <pre>/<code>

  var links = [];           // urls referenced as "[n]" in reference style

  opts = opts || {};
  var inlineStyle = opts['inlineStyle'] || false;

  var markdownTags = {
    "hr": "- - -\n\n",
    // A Markdown hard line break needs at least TWO trailing spaces.
    "br": "  \n",
    "title": "# ",
    "h1": "# ",
    "h2": "## ",
    "h3": "### ",
    "h4": "#### ",
    "h5": "##### ",
    "h6": "###### ",
    "b": "**",
    "strong": "**",
    "i": "_",
    "em": "_",
    "dfn": "_",
    "var": "_",
    "cite": "_",
    "span": " ",
    "ul": "* ",
    "ol": "1. ",
    "dl": "- ",
    "blockquote": "> "
  };

  // Indentation per list nesting level and for code blocks. Four spaces is
  // what Markdown requires for an indented code block and is enough to nest
  // a list under both "* " and "1. " items.
  var LIST_INDENT = "    ";
  var CODE_INDENT = "    ";

  /** Returns the bullet of the innermost open list, indented by nesting depth. */
  function getListMarkdownTag() {
    var listItem = "";
    for (var i = 0; i < listTagStack.length - 1; i++) {
      listItem += LIST_INDENT;
    }
    listItem += peek(listTagStack);
    return listItem;
  }

  /** Builds a map from attribute name to attribute object ({name, value}). */
  function convertAttrs(attrs) {
    var attributes = {};
    for (var k in attrs) {
      var attr = attrs[k];
      // NOTE(review): the exact shape of `attrs` depends on HTMLParser. Skip
      // enumerable members that are not attribute objects (e.g. a numeric
      // length or methods of an array-like) instead of storing them.
      if (attr && typeof attr === "object" && attr.name != null) {
        attributes[attr.name] = attr;
      }
    }
    return attributes;
  }

  /** Returns the last element of `list`, or "" when the list is empty. */
  function peek(list) {
    if (list && list.length > 0) {
      return list[list.length - 1];
    }
    return "";
  }

  /** Returns the last element of `list` that is not "", or "" if none. */
  function peekTillNotEmpty(list) {
    if (!list) {
      return "";
    }
    for (var i = list.length - 1; i >= 0; i--) {
      if (list[i] != "") {
        return list[i];
      }
    }
    return "";
  }

  /**
   * If nothing but empty entries was emitted since the opening marker
   * `start`, removes the marker (and those empty entries) from nodeList.
   * @return true when the marker was removed
   */
  function removeIfEmptyTag(start) {
    if (start != peekTillNotEmpty(nodeList)) {
      return false;
    }
    while (peek(nodeList) != start) {
      nodeList.pop();
    }
    nodeList.pop();
    return true;
  }

  /**
   * Pops everything emitted after the marker `start` and returns it joined.
   * The marker itself stays on nodeList.
   */
  function sliceText(start) {
    var text = [];
    while (nodeList.length > 0 && peek(nodeList) != start) {
      text.unshift(nodeList.pop());
    }
    return text.join("");
  }

  /**
   * Makes sure the output collected so far ends in a block boundary.
   * Does nothing useful on empty output; note that a trailing "" entry is
   * dropped in that case.
   *
   * @param isEndBlock - truthy when closing a block: appends "\n\n" unless
   *        the last fragment already ends in a newline. Otherwise (opening a
   *        block) trailing whitespace is normalised to exactly one blank line.
   */
  function block(isEndBlock) {
    var lastItem = nodeList.pop();
    if (!lastItem) {
      return;
    }

    if (isEndBlock) {
      nodeList.push(lastItem);
      if (!lastItem.endsWith("\n")) {
        nodeList.push("\n\n");
      }
      return;
    }

    var separator;
    if (/\s*\n\n\s*$/.test(lastItem)) {
      // already ends in a blank line: just tidy it
      lastItem = lastItem.replace(/\s*\n\n\s*$/, "\n\n");
      separator = "";
    } else if (/\s*\n\s*$/.test(lastItem)) {
      // ends in a single newline: add the second one
      lastItem = lastItem.replace(/\s*\n\s*$/, "\n");
      separator = "\n";
    } else {
      separator = "\n\n";
    }

    nodeList.push(lastItem);
    nodeList.push(separator);
  }

  /** Makes sure the output ends in a newline (list items are single-spaced). */
  function listBlock() {
    if (nodeList.length > 0) {
      if (!peek(nodeList).endsWith("\n")) {
        nodeList.push("\n");
      }
    } else {
      nodeList.push("\n");
    }
  }

  /** Returns the index of `url` in the reference list, adding it if new. */
  function referenceIndex(url) {
    var index = links.indexOf(url);
    if (index == -1) {
      links.push(url);
      index = links.length - 1;
    }
    return index;
  }

  try {
    var dom;
    if (html) {
      var container = document.createElement('div');
      container.innerHTML = html;
      dom = container;
    } else {
      dom = window.document.body;
    }

    HTMLParser(dom, {
      start: function(tag, attrs, unary) {
        tag = tag.toLowerCase();
        if (logging) {
          console.log("start: " + tag);
        }

        // Of the self-closing tags only these three produce output.
        if (unary && (tag != "br" && tag != "hr" && tag != "img")) {
          return;
        }

        switch (tag) {
        case "br":
        case "b":
        case "strong":
        case "i":
        case "em":
        case "dfn":
        case "var":
        case "cite":
          nodeList.push(markdownTags[tag]);
          break;
        case "hr":
        case "title":
        case "h1":
        case "h2":
        case "h3":
        case "h4":
        case "h5":
        case "h6":
          block();
          nodeList.push(markdownTags[tag]);
          break;
        case "span":
          // separate the span from preceding text with a single space
          if (!/\s+$/.test(peek(nodeList))) {
            nodeList.push(markdownTags[tag]);
          }
          break;
        case "p":
        case "div":
        case "td":
          block();
          break;
        case "ul":
        case "ol":
        case "dl":
          listTagStack.push(markdownTags[tag]);
          // lists are block elements; a nested list only needs a line break
          if (listTagStack.length > 1) {
            listBlock();
          } else {
            block();
          }
          break;
        case "li":
        case "dt":
          nodeList.push(getListMarkdownTag());
          break;
        case "a":
          linkAttrStack.push(convertAttrs(attrs));
          nodeList.push("[");
          break;
        case "img":
          var imgAttrs = convertAttrs(attrs);
          var src = imgAttrs["src"] ? getNormalizedUrl(imgAttrs["src"].value) : "";
          if (!src) {
            break;
          }

          var alt = imgAttrs["alt"] ? imgAttrs["alt"].value.trim() : "";
          var imgTitle = imgAttrs["title"] ? imgAttrs["title"].value.trim() : "";

          // An image that directly follows the "[" of an open link is always
          // written inline, so the link end handler can wrap it.
          // NOTE(review): this also matches ordinary text that merely starts
          // with "[".
          var followsLinkMarker = peekTillNotEmpty(nodeList).startsWith("[");

          if (!inlineStyle && !followsLinkMarker) {
            var imgRef = referenceIndex(src);

            block();
            nodeList.push("![");
            nodeList.push(alt != "" ? alt : imgTitle);
            nodeList.push("][" + imgRef + "]");
            block();
          } else {
            // a free-standing image is treated as a block element
            if (!followsLinkMarker) {
              block();
            }

            nodeList.push("![" + alt + "](" + src + (imgTitle ? " \"" + imgTitle + "\"" : "") + ")");

            // The fragment just pushed never starts with "[", so the block is
            // always closed here; inside a link the extra newlines are
            // collapsed again when the link text is sliced.
            block(true);
          }
          break;
        case "blockquote":
          block();
          blockquoteStack.push(markdownTags[tag]);
          nodeList.push(blockquoteStack.join(""));
          break;
        case "pre":
        case "code":
          block();
          preStack.push(true);
          break;
        }
      },
      chars: function(text) {
        if (preStack.length > 0) {
          // preformatted text: indent every line so it renders as a code block
          text = CODE_INDENT + text.replace(/\n/g, "\n" + CODE_INDENT);
        } else if (text.trim() != "") {
          text = text.replace(/\s+/g, " ");

          // avoid doubled whitespace across fragment boundaries
          var prevText = peekTillNotEmpty(nodeList);
          if (/\s+$/.test(prevText)) {
            text = text.replace(/^\s+/g, "");
          }
        } else {
          // whitespace-only text is recorded as an empty placeholder
          nodeList.push("");
          return;
        }

        if (logging) {
          console.log("text: " + text);
        }

        nodeList.push(text);
      },
      end: function(tag) {
        tag = tag.toLowerCase();
        if (logging) {
          console.log("end: " + tag);
        }

        switch (tag) {
        case "title":
        case "h1":
        case "h2":
        case "h3":
        case "h4":
        case "h5":
        case "h6":
          if (!removeIfEmptyTag(markdownTags[tag])) {
            block(true);
          }
          break;
        case "p":
        case "div":
        case "td":
          // drop trailing whitespace-only fragments before closing the block
          while (nodeList.length > 0 && peek(nodeList).trim() == "") {
            nodeList.pop();
          }
          block(true);
          break;
        case "b":
        case "strong":
        case "i":
        case "em":
        case "dfn":
        case "var":
        case "cite":
          if (!removeIfEmptyTag(markdownTags[tag])) {
            nodeList.push(sliceText(markdownTags[tag]).trim());
            nodeList.push(markdownTags[tag]);
          }
          break;
        case "a":
          var linkText = sliceText("[").replace(/\s+/g, " ").trim();

          // Pop unconditionally so the attribute stack stays in step with the
          // "[" markers; previously an empty link left its attributes behind
          // and an enclosing link would pick them up.
          var linkAttrs = linkAttrStack.pop();

          if (linkText == "") {
            nodeList.pop(); // discard the "[" marker
            break;
          }

          var hrefAttr = linkAttrs && linkAttrs["href"];
          var href = hrefAttr && hrefAttr.value != "" ? getNormalizedUrl(hrefAttr.value) : "";

          if (href == "") {
            // no target: keep the text, drop the "[" marker
            nodeList.pop();
            nodeList.push(linkText);
            break;
          }

          var wrapsImage = linkText.startsWith("!");
          if (!inlineStyle && !wrapsImage) {
            nodeList.push(linkText);
            nodeList.push("][" + referenceIndex(href) + "]");
          } else {
            if (wrapsImage) {
              // linked images start a new block: re-attach the "[" marker to
              // the text and put a block boundary in front of it
              linkText = nodeList.pop() + linkText;
              block();
            }
            nodeList.push(linkText);

            var linkTitle = linkAttrs["title"];
            nodeList.push("](" + href + (linkTitle ? " \"" + linkTitle.value.trim().replace(/\s+/g, " ") + "\"" : "") + ")");
            // NOTE(review): the original followed this with a block(true)
            // guarded by peek(nodeList).startsWith("!"), which could never be
            // true because the fragment just pushed starts with "]". The
            // effective behaviour (no trailing block boundary) is kept.
          }
          break;
        case "ul":
        case "ol":
        case "dl":
          listBlock();
          listTagStack.pop();
          break;
        case "li":
        case "dt":
          var bullet = getListMarkdownTag();
          if (!removeIfEmptyTag(bullet)) {
            var itemText = sliceText(bullet).trim();

            if (itemText.startsWith("[![")) {
              // a list item holding a linked image becomes a plain block
              nodeList.pop(); // discard the bullet
              block();
              nodeList.push(itemText);
              block(true);
            } else {
              nodeList.push(itemText);
              listBlock();
            }
          }
          break;
        case "blockquote":
          blockquoteStack.pop();
          break;
        case "pre":
        case "code":
          block(true);
          preStack.pop();
          break;
        case "span":
          if (peek(nodeList).trim() == "") {
            nodeList.pop();
            if (peek(nodeList) == " ") {
              // empty span: remove the separator added when it was opened
              nodeList.pop();
            } else {
              nodeList.push(markdownTags[tag]);
            }
          } else {
            var spanText = nodeList.pop();
            nodeList.push(spanText.trim());
            nodeList.push(markdownTags[tag]);
          }
          break;
        case "br":
        case "hr":
        case "img":
        case "table":
        case "tr":
          // nothing to emit when these close
          break;
        }
      }
    }, {"nodesToIgnore": ["script", "noscript", "object", "iframe", "frame", "head", "style", "label"]});

    // Reference style: append the "[n]: url" definitions after the content.
    if (!inlineStyle && links.length > 0) {
      // strip trailing whitespace from the content; tolerate empty output
      var tail = nodeList.pop();
      nodeList.push((tail || "").replace(/\s+$/g, ""));

      for (var i = 0; i < links.length; i++) {
        nodeList.push((i == 0 ? "\n\n[" : "\n[") + i + "]: " + links[i]);
      }
    }
  } catch (err) {
    // best effort: log and return whatever was converted so far
    console.log(err.stack);
    console.trace();
  }

  return nodeList.join("");
}
/**
 * Resolves a URL taken from an href/src attribute against the current page
 * (the global `location`) and percent-encodes it.
 *
 * @param s - absolute, protocol-relative ("//host/x"), site-relative ("/x"),
 *            fragment ("#x") or directory-relative ("x") url
 * @return the absolute, encoded url
 */
function getNormalizedUrl(s) {
  var urlBase = location.href;
  var urlDir = urlBase.replace(/\/[^\/]*$/, '/');    // page url up to the last "/"
  var urlPage = urlBase.replace(/#[^\/#]*$/, '');    // page url without its fragment
  var url;

  if (/^[a-zA-Z][a-zA-Z0-9+.\-]*:/.test(s)) {
    // already absolute url. A scheme is a letter followed by letters, digits,
    // "+", "-" or "."; the previous character range " -." also accepted
    // characters such as space and "#".
    url = s;
  } else if (/^\/\//.test(s)) {
    // protocol-relative url: only the scheme is missing
    url = location.protocol + s;
  } else if (/^\//.test(s)) {
    // url is relative to site
    url = location.protocol != "" ? location.protocol + "//" : "";
    url += location.hostname;
    // location.port is "" for the default port; never emit a bare ":"
    if (location.port && location.port != "80") {
      url += ":" + location.port;
    }
    url += s;
  } else if (/^#/.test(s)) {
    // url is relative to page
    url = urlPage + s;
  } else {
    // url is relative to the directory of the page
    url = urlDir + s;
  }

  // encodeURI also escapes "%", which would turn an existing "%20" into
  // "%2520"; restore escapes that were already present.
  return encodeURI(url).replace(/%25([0-9a-fA-F]{2})/g, "%$1");
}
// Publish the converter when a CommonJS-style `exports` object is available.
if (typeof exports != "undefined") {
  exports.HTML2Markdown = HTML2Markdown;
  // Alias with the alternate capitalisation. It previously read the
  // undeclared identifier `HTML2MarkDown`, which throws a ReferenceError.
  exports.HTML2MarkDown = HTML2Markdown;
}
/* Add useful functions to the String object when the runtime lacks them. */
if (typeof String.prototype.trim != 'function') {
  /** @return the string without leading and trailing whitespace */
  String.prototype.trim = function() {
    // must be called on `this`; a bare replace() is an undefined global
    return this.replace(/^\s+|\s+$/g, "");
  };
}
if (typeof String.prototype.isNotEmpty != 'function') {
  /** @return true when the string contains at least one non-whitespace character */
  String.prototype.isNotEmpty = function() {
    var hasVisibleChar = /\S/.test(this);
    return hasVisibleChar;
  };
}
if (typeof String.prototype.replaceAll != 'function') {
  /**
   * Replaces every occurrence of a literal string.
   *
   * @param stringToFind - literal text to search for
   * @param stringToReplace - literal replacement text
   * @return a new string with all occurrences replaced
   */
  String.prototype.replaceAll = function(stringToFind, stringToReplace) {
    // A single split/join pass terminates even when the replacement contains
    // the search text (e.g. "a" -> "aa") or the search text is empty; the
    // previous replace-until-not-found loop never finished in those cases.
    return this.split(stringToFind).join(stringToReplace);
  };
}
if (typeof String.prototype.startsWith != 'function') {
  /** @return true when the string begins with `str` */
  String.prototype.startsWith = function(str) {
    var firstMatch = this.indexOf(str);
    return firstMatch == 0;
  };
}
if (typeof String.prototype.endsWith != 'function') {
  /** @return true when the string ends with the literal text `suffix` */
  String.prototype.endsWith = function(suffix) {
    // Compare literally. The suffix used to be compiled into a regular
    // expression, so suffixes containing ".", "(", "*" and similar
    // characters matched wrongly or threw.
    suffix = String(suffix);
    var start = this.length - suffix.length;
    return start >= 0 && this.indexOf(suffix, start) === start;
  };
}
if (typeof Array.prototype.indexOf != 'function') {
  /**
   * Finds the first element strictly equal to `obj`.
   *
   * @param obj - value to look for
   * @param fromIndex - optional start position; a negative value counts
   *        back from the end of the array
   * @return index of the first match, or -1 when there is none
   */
  Array.prototype.indexOf = function(obj, fromIndex) {
    var start = 0;
    if (fromIndex != null) {
      start = fromIndex < 0 ? Math.max(0, this.length + fromIndex) : fromIndex;
    }

    var total = this.length;
    var pos = start;
    while (pos < total) {
      if (this[pos] === obj) {
        return pos;
      }
      pos++;
    }
    return -1;
  };
}
|