Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
// Modules used by the request handler below: HTTP fetching (request),
// DOM construction (jsdom), socket I/O (net) and the two project-local
// helpers for text extraction and filtering.
var request = require('request'),
    jsdom = require('jsdom'),
    net = require('net'),
    DocumentSaver = require('./scrobblepage.js'),
    filter = require('./documentfilter.js');
- exports.handleRequests = function(response, URLlst){
- //Crawl a page, gather text, filter it
- //Send it to the server and get the topic representations back here.
- //Calculate the similarity based on topics and the representative words.
- //return similarity based on topics...
- this.URLList = URLlst;//ARRAY OF URL's
- this.response = response;
- //What is to be sent back on this response object.
- this.topicDist = {};//{'URL':'Topic Distribution'}
- this.repWords = {};//{'URL':'Rep words'}
- this.docStrings = {};//{'URL': 'word occurances'}
- this.docsDownLoaded =0;
- this.docsProcessed =0;
- this.init = function(){
- //Make the URL List by crawling the pages
- //After crawling a page, filter it to make the word occurance thingy
- //TODO: Cache last 10000 documents in DB
- //TODO: How to regulate the traffic to the C++ server?
- //TODO: Will this scale as # of connection from the users go up?
- var self = this;
- for(u in self.URLList){
- (function f(url, urlnum){
- request({uri:url}, function (error, response, body) {
- if (!error && response.statusCode == 200) {
- var window = jsdom.jsdom(body).createWindow();
- jsdom.jQueryify(window, 'jquery.min.js', function (window, jQuery) {
- var saver = new DocumentSaver();
- //get the text from the document
- saver.processDocument(window.document, function(text){
- self.docsDownLoaded++;
- console.log('Got some text...')
- filter.filterPage(text, function(wordOccurance){
- //console.log(wordOccurance);
- //got the text after filtering
- //Send it to C++ process
- self.calculateTopicDist(wordOccurance, urlnum);
- })
- }, jQuery);
- });
- }else{
- //Increase document count
- console.log('Unprocessed doc:: ');
- console.log(url);
- self.docStrings[url] ='!!UNPROCESSED!!';
- self.docsProcessed++;
- }
- });
- })(self.URLList[u],u)
- }
- }
- this.calculateTopicDist = function(wordOccurance, urlnum){
- var s = new net.Stream();
- var self = this;
- //Make a new connection and send it to the C++ process
- s.setEncoding('utf8');
- s.on('connect', function(){
- console.log('***Sending data for document:'+urlnum);
- console.log(wordOccurance.length);
- s.write(wordOccurance, encoding='utf8');
- })
- //handler to recieve the data back after the inference. /Recieve the inference results.
- s.on('data', function(data){
- console.log('*******DATA RECIEVED FOR:'+urlnum);
- console.log(data);
- self.docStrings[self.URLList[urlnum]] = data.trim();
- self.docsProcessed++;
- if(self.docsProcessed > self.URLList.length-1){
- console.log(JSON.stringify(self.docStrings));
- self.response.writeHead(200, {'Content-Type': 'text/html'});
- self.response.write(JSON.stringify(self.docStrings), encoding ='utf8');
- self.response.end();
- }
- s.end();
- })
- s.on('close', function(had_error){
- console.log('Closed the stream');
- })
- s.connect('/tmp/ts1');
- }
- }
Add Comment
Please, Sign In to add comment