/**
 * DocumentSaver: walks a DOM document and gathers the text of its text nodes.
 *
 * Unit-tested usage from the Chrome debugger console:
 *   1. Open the URL 'http://download.oracle.com/javase/1.5.0/docs/api/index.html'.
 *   2. Run:
 *
 *        d = new DocumentSaver();
 *        d.processDocument(document, function(text_t) {
 *          console.log('********HTML TEXT******** ');
 *          console.log(text_t);
 *        });
 *
 *   3. Result: the text gathered from the page is logged.
 *
 * The walk is synchronous; the callback is invoked before processDocument()
 * returns.
 */
function DocumentSaver() {
  // Numeric value of Node.TEXT_NODE. Spelled out so the check does not depend
  // on every visited node exposing the TEXT_NODE constant itself.
  var TEXT_NODE = 3;

  // Text accumulated so far; reset at the start of every processDocument() call.
  this.textContent = '';

  // Upper-case tag names whose whole subtree is skipped by processRecursive().
  this.ignoredTypes = [
    'SCRIPT', 'TITLE', 'META', 'STYLE', 'LINK', 'CANVAS', 'VIDEO', 'AUDIO',
    'AREA', 'IMG', 'MAP', 'EMBED', 'OBJECT', 'PARAM', 'SOURCE', 'DEVICE',
    'NOSCRIPT', 'HEAD'
  ];

  /**
   * Returns the index of `item` in `arr`, or -1 when it is absent.
   * Delegates to arr.indexOf when available, otherwise scans manually.
   *
   * @param {Array|Object} arr  Array (or array-like with `length`) to search.
   * @param {*} item            Value to look for (compared with ===).
   * @param {number} [from]     Start index; a negative value counts from the end.
   * @returns {number} Index of the first match, or -1.
   */
  this.indexOf = function(arr, item, from) {
    if (arr.indexOf) {
      return arr.indexOf(item, from);
    }
    var len = arr.length;
    for (var i = (from < 0) ? Math.max(0, len + from) : from || 0; i < len; i++) {
      if (arr[i] === item) {
        return i;
      }
    }
    return -1;
  };

  /**
   * Depth-first walk over the descendants of `rootNode`, appending the content
   * of every non-blank text node to this.textContent as " " + text.
   * Elements listed in this.ignoredTypes are skipped together with their
   * subtrees. FRAME elements are crawled through their contentDocument with a
   * nested DocumentSaver.
   *
   * @param {Node} rootNode  Node whose children are visited (rootNode itself is not).
   */
  this.processRecursive = function(rootNode) {
    var self = this;
    for (var child = rootNode.firstChild; child != null; child = child.nextSibling) {
      var tag = child.tagName ? child.tagName.toUpperCase() : null;

      // NOTE(review): only FRAME is crawled here; IFRAME may have been intended
      // as well (the original locals were named after iframes) - confirm.
      // A frame without an accessible contentDocument is skipped instead of
      // letting a null document abort the whole crawl.
      if (tag === 'FRAME' && child.contentDocument) {
        var frameSaver = new DocumentSaver();
        frameSaver.processDocument(child.contentDocument, function(frameText) {
          self.textContent = self.textContent + ' ' + frameText;
        });
      }

      if (tag !== null && this.indexOf(this.ignoredTypes, tag) !== -1) {
        continue;
      }

      this.processRecursive(child);

      // Append only; prepending a separator as well would make the leading
      // whitespace grow with every text node found.
      if (child.nodeType === TEXT_NODE && child.textContent.trim() !== '') {
        this.textContent = this.textContent + ' ' + child.textContent;
      }
    }
  };

  /**
   * Gathers the text of `doc` and hands it to `callback`.
   * Logs an error and returns without invoking the callback when `doc` is
   * missing or contains no <html> element.
   *
   * @param {Document} doc                Document to crawl.
   * @param {function(string)} callback   Receives the gathered text.
   */
  this.processDocument = function(doc, callback) {
    // TODO check content type
    this.doc = doc;
    this.callback = callback;
    if (!doc) {
      console.error("No document to process");
      return;
    }
    var rootNode = doc.getElementsByTagName('html')[0];
    if (!rootNode) {
      console.error("No html node in document");
      return;
    }
    this.textContent = '';
    // TODO process html root too
    this.processRecursive(rootNode);
    this.callback(this.textContent);
  };
}