Don't like ads? PRO users don't see any ads ;-)
Guest

Untitled

By: a guest on Apr 29th, 2012  |  syntax: None  |  size: 3.35 KB  |  hits: 15  |  expires: Never
download  |  raw  |  embed  |  report abuse  |  print
Text below is selected. Please press Ctrl+C to copy to your clipboard. (⌘+C on Mac)
  1. // To use: save this file and run `node <filename.js>`.
  2. // There are two URLs at the bottom. The test case is the URL pointing to Sun's Javadoc, a page that makes heavy use of iframes.
  3. // Expected: the script processes the page's HTML text and dumps it on the terminal.
  4. // Actual: an error is printed on the terminal with the backtrace below.
  5. /*
  6. /home/sid/opt/lib/node/.npm/jsdom/0.1.20/package/lib/jsdom/level2/html.js:1400
  7.         this._contentDocument = new HTMLDocument();
  8.         ^
  9. ReferenceError: HTMLDocument is not defined
  10.     at Object.contentDocument (/home/sid/opt/lib/node/.npm/jsdom/0.1.20/package/lib/jsdom/level2/html.js:1400:9)
  11.     at Object.<anonymous> (/media/dev/workspace/nodejs/scrobblepage.js:27:38)
  12.     at Function.each (evalmachine.<anonymous>:30:151)
  13.     at Object.each (evalmachine.<anonymous>:24:147)
  14.     at DocumentSaver.processRecursive (/media/dev/workspace/nodejs/scrobblepage.js:21:22)
  15.     at Object.<anonymous> (/media/dev/workspace/nodejs/scrobblepage.js:33:12)
  16.     at Function.each (evalmachine.<anonymous>:30:151)
  17.     at Object.each (evalmachine.<anonymous>:24:147)
  18.     at DocumentSaver.processRecursive (/media/dev/workspace/nodejs/scrobblepage.js:21:22)
  19.     at Object.<anonymous> (/media/dev/workspace/nodejs/scrobblepage.js:33:12)
  20. */
  21.  
  /**
   * Collects the text of an HTML document by walking its element tree with a
   * jQuery-compatible function and concatenating the direct text of every
   * element whose tag is not listed in `ignoredTypes`.
   *
   * Typical use: `new DocumentSaver().processDocument(doc, callback, jQuery)`.
   */
  function DocumentSaver() {
    // Text accumulated by the most recent processDocument() run.
    this.textContent = '';

    // Upper-case tag names whose elements (and everything below them) are skipped.
    this.ignoredTypes = [
      'SCRIPT', 'TITLE', 'META', 'STYLE', 'LINK', 'CANVAS', 'VIDEO', 'AUDIO',
      'AREA', 'IMG', 'MAP', 'EMBED', 'OBJECT', 'PARAM', 'SOURCE', 'DEVICE',
      'NOSCRIPT', 'HEAD'
    ];

    // jQuery-compatible function; assigned by processDocument().
    this.$ = null;

    /**
     * Array.prototype.indexOf with a manual fallback for arrays lacking it.
     *
     * @param {Array} arr - Array to search.
     * @param {*} item - Value to look for (strict equality).
     * @param {number} [from] - Start index; a negative value counts from the end.
     * @returns {number} Index of the first match, or -1 when absent.
     */
    this.indexOf = function (arr, item, from) {
      if (arr.indexOf) {
        return arr.indexOf(item, from);
      }
      var len = arr.length;
      var start = (from < 0) ? Math.max(0, len + from) : from || 0;
      for (var i = start; i < len; i++) {
        if (arr[i] === item) {
          return i;
        }
      }
      return -1;
    };

    /**
     * Depth-first walk over the element children of `rootNode`. Descendants are
     * visited before the element's own text is appended, so nested text comes
     * first in `this.textContent`. The direct text of `rootNode` itself is not
     * collected; only that of its descendants is.
     *
     * @param {Object} rootNode - DOM element whose subtree is scanned.
     */
    this.processRecursive = function (rootNode) {
      var self = this;
      var $ = this.$;

      $(rootNode).children().each(function () {
        var element = this;

        // Skip nodes without a tag name and every ignored tag (with its subtree).
        if (!element.tagName ||
            self.indexOf(self.ignoredTypes, element.tagName.toUpperCase()) !== -1) {
          return;
        }

        self.processRecursive(element);

        // Only direct text nodes (nodeType 3): child elements were handled by
        // the recursive call above.
        var ownText = $.trim(
          $(element).contents().filter(function () {
            return this.nodeType === 3;
          }).text()
        );

        if (ownText !== '') {
          // Format kept as-is for compatibility: each appended fragment also
          // adds one more leading space to the accumulated text.
          self.textContent = ' ' + self.textContent + ' ' + ownText;
        }
      });
    };

    /**
     * Extracts the text of `doc` and hands it to `callback`.
     *
     * @param {Object} doc - DOM document; null/undefined yields `callback('')`.
     * @param {function(string)} callback - Receives the collected text.
     * @param {Function} jQuery - jQuery-compatible function used to query `doc`.
     */
    this.processDocument = function (doc, callback, jQuery) {
      if (doc == null) {
        callback('');
        // Stop here: without this return the code below dereferenced the null
        // document and threw after the callback had already been invoked.
        return;
      }
      // TODO check content type
      this.doc = doc;
      this.$ = jQuery;
      this.callback = callback;

      var rootNode = doc.getElementsByTagName('html')[0];
      if (!rootNode) {
        // Note: the callback is not invoked in this case.
        console.error("No html node in document");
        return;
      }

      this.textContent = "";
      this.processRecursive(rootNode);
      this.callback(this.textContent);
    };
  }
  82.  
  var request = require('request');
  var jsdom = require('jsdom');
  var sys = require('sys');

  // Sample pages. testURL2 is the one actually loaded below; per the notes at
  // the top of this file it is the iframe-heavy Javadoc page that triggers the
  // reported error.
  var testURL = 'http://winnipeg.ctv.ca/servlet/an/local/CTVNews/20101121/taliban-afghanistan-101121/20101121/?hub=WinnipegHome';
  var testURL2 = 'http://download.oracle.com/javase/1.5.0/docs/api/index.html';

  // Load the page with a local jQuery injected, then print its extracted text.
  jsdom.env(testURL2, ['./jquery.min.js'], function (errors, window) {
    if (errors) {
      // On failure, dump everything jsdom passed to this callback.
      console.log(arguments);
      return;
    }

    var printText = function (text) {
      console.log('Got some text...');
      console.log(text);
    };

    var documentSaver = new DocumentSaver();
    documentSaver.processDocument(window.document, printText, window.jQuery);
  });