htmldomparser.js 2.7 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798
/*
 * HTMLParser - This parser implementation assumes it runs in a browser
 * and uses the DOM methods available there to parse HTML.
 *
 * @author Himanshu Gilani
 *
 */
  8. var HTMLParser = function(node, handler, opts) {
  9. opts = opts || {};
  10. var nodesToIgnore = opts['nodesToIgnore'] || [];
  11. var parseHiddenNodes = opts['parseHiddenNodes'] || 'false';
  12. var c = node.childNodes;
  13. for ( var i = 0; i < c.length; i++) {
  14. try {
  15. var ignore = false;
  16. for(var k=0; k< nodesToIgnore.length; k++) {
  17. if(c[i].nodeName.toLowerCase() == nodesToIgnore[k]) {
  18. ignore= true;
  19. break;
  20. }
  21. }
  22. //NOTE hidden node testing is expensive in FF.
  23. if (ignore || (!parseHiddenNodes && isHiddenNode(c[i])) ){
  24. continue;
  25. }
  26. if (c[i].nodeName.toLowerCase() != "#text" && c[i].nodeName.toLowerCase() != "#comment") {
  27. var attrs = [];
  28. if (c[i].hasAttributes()) {
  29. var attributes = c[i].attributes;
  30. for ( var a = 0; a < attributes.length; a++) {
  31. var attribute = attributes.item(a);
  32. attrs.push({
  33. name : attribute.nodeName,
  34. value : attribute.nodeValue,
  35. });
  36. }
  37. }
  38. if (handler.start) {
  39. if (c[i].hasChildNodes()) {
  40. handler.start(c[i].nodeName, attrs, false);
  41. if (c[i].nodeName.toLowerCase() == "pre" || c[i].nodeName.toLowerCase() == "code") {
  42. handler.chars(c[i].innerHTML);
  43. } else if (c[i].nodeName.toLowerCase() == "iframe" || c[i].nodeName.toLowerCase() == "frame") {
  44. if (c[i].contentDocument && c[i].contentDocument.documentElement) {
  45. return HTMLParser(c[i].contentDocument.documentElement, handler, opts);
  46. }
  47. } else if (c[i].hasChildNodes()) {
  48. HTMLParser(c[i], handler, opts);
  49. }
  50. if (handler.end) {
  51. handler.end(c[i].nodeName);
  52. }
  53. } else {
  54. handler.start(c[i].nodeName, attrs, true);
  55. }
  56. }
  57. } else if (c[i].nodeName.toLowerCase() == "#text") {
  58. if (handler.chars) {
  59. handler.chars(c[i].nodeValue);
  60. }
  61. } else if (c[i].nodeName.toLowerCase() == "#comment") {
  62. if (handler.comment) {
  63. handler.comment(c[i].nodeValue);
  64. }
  65. }
  66. } catch (e) {
  67. //properly log error
  68. console.log("error while parsing node: " + c[i].nodeName.toLowerCase());
  69. }
  70. }
  71. };
  72. function isHiddenNode(node) {
  73. if(node.nodeName.toLowerCase() == "title"){
  74. return false;
  75. }
  76. if (window.getComputedStyle) {
  77. try {
  78. var style = window.getComputedStyle(node, null);
  79. if (style.getPropertyValue && style.getPropertyValue('display') == 'none') {
  80. return true;
  81. }
  82. } catch (e) {
  83. // consume and ignore. node styles are not accessible
  84. }
  85. return false;
  86. }
  87. }