Advertisement
Guest User

sexy

a guest
Oct 5th, 2015
81
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Scala 0.83 KB | None | 0 0
  1. package ch.ethz.ir
  2. import ch.ethz.dal.tinyir.io.DocStream
  3. import ch.ethz.dal.tinyir.processing.XMLDocument
  4. import scala.collection.mutable.PriorityQueue
  5. import java.net.URL
  6.  
  /**
   * Minimal web-crawler sketch: downloads a single page and prints every line
   * of its source that appears to contain a hyperlink.
   *
   * Link extraction and queueing are not implemented yet; `urlQueue` is declared
   * for that purpose but nothing is enqueued so far.
   *
   * @author florangmehlin
   */
  object WebCrawler {

    // Stricter candidate pattern left by the original author (captures only
    // relative links); kept here for reference, not used yet:
    //   "(a.*href=\")((?!http)[^\\s]+)(\")".r
    /**
     * Loose pattern for "an `a`, later `href`, later a double quote".
     * It is meant to be searched for inside a line, not matched against the
     * whole line.
     */
    val findUrlRegex: scala.util.matching.Regex = "(a).*(href).*(\")".r

    /** Pending URLs to visit. Currently never filled. */
    val urlQueue: PriorityQueue[String] = new PriorityQueue[String]

    /**
     * Entry point: scans the hard-coded start page.
     *
     * @param args command-line arguments (ignored)
     */
    def main(args: Array[String]): Unit = {
      val initPage = "http://idvm-infk-hofmann03.inf.ethz.ch/eth/www.ethz.ch/en.html"
      readURL(initPage)
    }

    /** True when `findUrlRegex` occurs anywhere inside `line`. */
    private def containsLink(line: String): Boolean =
      // A `case findUrlRegex(_*)` pattern match is anchored to the whole line
      // and therefore rejects typical HTML such as `<a href="x">text</a>`;
      // searching for the first occurrence is what is wanted here.
      findUrlRegex.findFirstIn(line).isDefined

    /**
     * Reads the resource at `targetURL` line by line and prints, one per output
     * line, each source line in which `findUrlRegex` is found.
     *
     * @param targetURL textual URL understood by `java.net.URL`
     *                  (for example `http://...` or `file:/...`)
     */
    def readURL(targetURL: String): Unit = {
      val sourceCode = scala.io.Source.fromURL(targetURL)
      try {
        sourceCode.getLines().filter(containsLink).foreach(line => println(line))
      } finally {
        // Always release the underlying stream, even if reading fails midway.
        sourceCode.close()
      }
    }

  }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement