Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- package ch.ethz.ir
- import ch.ethz.dal.tinyir.io.DocStream
- import ch.ethz.dal.tinyir.processing.XMLDocument
- import scala.collection.mutable.PriorityQueue
- import java.net.URL
/**
 * Minimal crawler prototype: fetches a single page and prints every line of its source
 * that looks like it contains a hyperlink.
 *
 * @author florangmehlin
 */
object WebCrawler {

  /**
   * Loose link detector: an `a`, later `href`, later a double quote, anywhere in the text.
   * It has three capture groups, but callers in this object only test for a match.
   */
  // Stricter alternative left by the author (captures only non-http targets):
  // "(a.*href=\")((?!http)[^\\s]+)(\")".r
  val findUrlRegex: scala.util.matching.Regex = "(a).*(href).*(\")".r

  /** Pending URLs. Nothing in this object enqueues into it yet. */
  val urlQueue: PriorityQueue[String] = new PriorityQueue[String]

  /**
   * Entry point: scans the hard-coded start page.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val initPage = "http://idvm-infk-hofmann03.inf.ethz.ch/eth/www.ethz.ch/en.html"
    readURL(initPage)
  }

  /**
   * Reads the resource at `targetURL` line by line and prints, one per output line,
   * every source line in which [[findUrlRegex]] finds a match.
   *
   * Any exception raised while opening or reading the URL propagates to the caller;
   * the underlying stream is closed in either case.
   *
   * @param targetURL URL of the page to scan (any scheme accepted by `java.net.URL`)
   */
  def readURL(targetURL: String): Unit = {
    val sourceCode = scala.io.Source.fromURL(targetURL)
    try {
      for (line <- sourceCode.getLines()) {
        // Search *inside* the line. Matching with `case findUrlRegex(_*)` would demand that
        // the whole line match, i.e. start with 'a' and end with '"', which real HTML
        // anchor lines almost never do.
        if (findUrlRegex.findFirstIn(line).isDefined) {
          // TODO: extract the href target and urlQueue.enqueue(...) it instead of printing.
          // println (not print): getLines() strips the newline, so print would glue hits together.
          println(line)
        }
      }
    } finally {
      // Release the connection / input stream even if reading fails half-way.
      sourceCode.close()
    }
  }
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement