- //TO USE JUST save it and FIRE node <filename.js>
- //AT THE BOTTOM THERE ARE TWO URL'S. TEST CASE IS FOR URL POINTING TO SUN'S JAVADOC ,A PAGE THAT MAKEE HEAVY //USE OF IFRAMES
- // Should process HTML text and dump it on terminal
- // Error on terminal with a backtrace
- /*
- /home/sid/opt/lib/node/.npm/jsdom/0.1.20/package/lib/jsdom/level2/html.js:1400
- this._contentDocument = new HTMLDocument();
- ^
- ReferenceError: HTMLDocument is not defined
- at Object.contentDocument (/home/sid/opt/lib/node/.npm/jsdom/0.1.20/package/lib/jsdom/level2/html.js:1400:9)
- at Object.<anonymous> (/media/dev/workspace/nodejs/scrobblepage.js:27:38)
- at Function.each (evalmachine.<anonymous>:30:151)
- at Object.each (evalmachine.<anonymous>:24:147)
- at DocumentSaver.processRecursive (/media/dev/workspace/nodejs/scrobblepage.js:21:22)
- at Object.<anonymous> (/media/dev/workspace/nodejs/scrobblepage.js:33:12)
- at Function.each (evalmachine.<anonymous>:30:151)
- at Object.each (evalmachine.<anonymous>:24:147)
- at DocumentSaver.processRecursive (/media/dev/workspace/nodejs/scrobblepage.js:21:22)
- at Object.<anonymous> (/media/dev/workspace/nodejs/scrobblepage.js:33:12)
- */
- function DocumentSaver() {
- this.textContent = ''; // Root of the cloned document
- this.ignoredTypes = [ 'SCRIPT', 'TITLE', 'META', 'STYLE', 'LINK','CANVAS', 'VIDEO','AUDIO', 'AREA', 'IMG', 'MAP', 'EMBED', 'OBJECT'
- , 'PARAM', 'SOURCE', 'DEVICE', 'NOSCRIPT' ,'HEAD']
- this.$ = null;
- this.indexOf = function(arr, item, from) {
- if (arr.indexOf) return arr.indexOf(item, from);
- var len = arr.length;
- for (var i = (from < 0) ? Math.max(0, len + from) : from || 0; i < len; i++) {
- if (arr[i] === item) return i;
- }
- return -1;
- };
- this.processRecursive = function(rootNode) {
- var children_t = this.$(rootNode).children();
- var self = this;
- this.$(children_t).each(function(){
- if(this.tagName && self.indexOf(self.ignoredTypes, this.tagName.toUpperCase()) == -1){
- self.processRecursive(this);
- var textContent_t = self.$(this)
- .contents()
- .filter(function() {
- return this.nodeType == 3;
- }).text();
- if(self.$.trim(textContent_t)!=''){
- self.textContent = ' '+self.textContent+' '+self.$.trim(textContent_t);
- //console.log(textContent)
- }
- }
- })
- }
- this.processDocument = function(doc, callback, jQuery) {
- if(doc == null) {
- callback('')
- }
- // TODO check content type
- this.doc = doc;
- this.$ = jQuery;
- this.callback = callback;
- var rootNode = doc.getElementsByTagName('html')[0];
- if (!rootNode) {
- console.error("No html node in document");
- return;
- }
- this.textContent = "";
- this.processRecursive( rootNode );
- this.callback(this.textContent );
- }
- }
- var request = require('request'),
- jsdom = require('jsdom'),
- sys = require('sys');
- var testURL = 'http://winnipeg.ctv.ca/servlet/an/local/CTVNews/20101121/taliban-afghanistan-101121/20101121/?hub=WinnipegHome';
- var testURL2 = 'http://download.oracle.com/javase/1.5.0/docs/api/index.html'
- jsdom.env(testURL2, ['./jquery.min.js'], function(errors, window) {
- if (!errors) {
- var saver = new DocumentSaver();
- saver.processDocument(window.document, function(text){
- console.log('Got some text...')
- console.log(text)
- }, window.jQuery);
- } else {
- console.log(arguments);
- }
- });