Advertisement
Guest User

skert

a guest
May 3rd, 2015
263
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Scala 6.90 KB | None | 0 0
  1. class IndexActor extends Actor with ActorLogging {
  2.  
  3.   def receive = {
  4.     case CheckPage(url, html) => if(checkUrlExistance(url) == false) sender ! ParsePage(url, html)
  5.     case CheckLink(url) => if(checkUrlExistance(url) == false) sender ! QueueLink(url)
  6.     case Word(url, word) => {
  7.       addDoc(url)
  8.       addWord(word)
  9.       val docid = getDocId(url)
  10.       val wordid = getWordId(word)
  11.       println(docid + " " + wordid + " about to index")
  12.       addIndex(wordid, docid)
  13.     }
  14.   }
  15.  
  16.   def addDoc(url: String): Int = {
  17.     //println(checkUrlExistance(url) == false)
  18.     //println(url)
  19.     if(checkUrlExistance(url) == false){
  20.       sql"""
  21.         insert into documents (url) values (${url});
  22.      """.update.apply()
  23.       }
  24.       getDocId(url)
  25.     }
  26.  
  27.   def addWord(word: String): Int = {
  28.     //println(checkWordExistance(word) == false)
  29.     //println(word)
  30.     if(checkWordExistance(word) == false) {
  31.       sql"""
  32.        insert into words (word) values (${word});
  33.      """.update.apply()
  34.       }
  35.     getWordId(word)
  36.   }
  37.  
  38.   def getDocId(url: String): Int = {
  39.     sql"""
  40.      select docid from documents where url = ${url};
  41.    """.map(rs => rs.int("docid")).single.apply().get
  42.   }
  43.  
  44.   def getWordId(word: String): Int = {
  45.     sql"""
  46.      select wordid from words where word = ${word};
  47.    """.map(rs => rs.int("wordid")).single.apply().get
  48.   }
  49.  
  50.   def addIndex(wordid: Int, docid: Int): Unit = {
  51.     sql"""
  52.      insert into index (docid, wordid) values (${docid}, ${wordid});
  53.    """.update.apply()
  54.   }
  55.  
  56.   def checkUrlExistance(link: String): Boolean = {
  57.     //println(link)
  58.     val check: List[String] = {
  59.       sql"""
  60.        select url from documents
  61.      """.map(rs => rs.string("url")).list.apply()
  62.       }
  63.       //println(check)  
  64.       if(check.isEmpty) false
  65.       else if(check.contains(link)) true
  66.       else false
  67.   }
  68.  
  69.   def checkWordExistance(word: String): Boolean = {
  70.      val check: List[String] = {
  71.       sql"""
  72.        select word from words
  73.      """.map(rs => rs.string("word")).list.apply()
  74.       }
  75.       //println(check)  
  76.       if(check.isEmpty) false
  77.       else if(check.contains(word)) true
  78.       else false
  79.   }
  80.  
  81.   ///////////////////////////////////////////////////////////////////
  82.   // The code below is a starting point for your queries/updates to
  83.   // the database. We have provided the database creation SQL for
  84.   // you. You will not need to add any additional tables. Your goal
  85.   // is to populate it with data you have received from parsed HTML
  86.   // documents. We strongly suggest that you implement each of your
  87.   // queries as individual methods in this class, where each method
  88.   // corresponds to some query that is useful in building the index.
  89.   ///////////////////////////////////////////////////////////////////
  90.  
  91.   // Necessary setup for connecting to the H2 database:
  92.   Class.forName("org.h2.Driver")
  93.   ConnectionPool.singleton("jdbc:h2:./indexer", "sa", "")
  94.   implicit val session = AutoSession
  95.  
  96.   // Create the database when this object is referenced.
  97.   createDatabase
  98.  
  99.   def createDatabase: Unit = {
  100.     sql"""
  101.      drop table words if exists;
  102.      drop table documents if exists;
  103.      drop table index if exists;
  104.    """.update.apply()
  105.  
  106.     // Create the tables if they do not already exist:
  107.     sql"""
  108.    create table if not exists words (
  109.      wordid int auto_increment,
  110.      word varchar(50),
  111.      primary key (wordid)
  112.    );
  113.    """.update.apply()
  114.  
  115.     sql"""
  116.    create table if not exists documents (
  117.      docid int auto_increment,
  118.      url varchar(1024),
  119.      primary key (docid)
  120.    );
  121.    """.update.apply()
  122.  
  123.     sql"""
  124.    create table if not exists index (
  125.      wordid int,
  126.      docid int,
  127.      foreign key (wordid) references words (wordid) on delete cascade,
  128.      foreign key (docid) references documents (docid) on delete cascade
  129.    );
  130.    """.update.apply()
  131.   }
  132.  
  133. }
  134.  
  135. class LinkQueueActor(parseQueue: ActorRef) extends Actor with ActorLogging {
  136.   // We have provided some definitions below which will help you with
  137.   // you implementation. You are welcome to modify these, however, this
  138.   // is what we used for our implementation.
  139.   val queue        = Queue[String]()
  140.   var limit        = 500 //variable, subtract one and make sure its greater than 0
  141.  
  142.   def receive = {
  143.     case Page(url, html) => {
  144.       //println(url)
  145.       //println(html)
  146.       parseQueue ! Page(url, html)
  147.     }
  148.     case NeedLink => {
  149.       //println(limit)
  150.       //println(queue.isEmpty)
  151.         if(limit == 0) context.system.shutdown()
  152.         if(queue.isEmpty) sender ! NoLinks
  153.         else {
  154.             limit = limit - 1
  155.             sender ! FetchLink(queue.dequeue())
  156.         }
  157.     }
  158.     case QueueLink(url) => queue += url
  159.   }
  160. }
  161.  
  162.  
  163. class FetchActor(queue: ActorRef) extends Actor with ActorLogging {
  164.  
  165.   // This message will start off the process of fetching
  166.   // links from the QueueActor. We include this for you!
  167.   queue ! NeedLink
  168.  
  169.   def receive = {
  170.     case NoLinks => queue ! NeedLink
  171.     case FetchLink(url) => {
  172.         if(fetch(url).isSuccess) queue ! Page(url, fetch(url).get)
  173.         queue ! NeedLink
  174.     }
  175.   }
  176.  
  177.   def fetch(url: String): Try[String] =
  178.     Try(Http(url).asString.body)
  179. }
  180.  
  181. class ParseQueueActor(indexer: ActorRef) extends Actor with ActorLogging {
  182.   var linkQueue: Option[ActorRef] = None
  183.   val queue = Queue[ParsePage]()
  184.   def receive = {
  185.     case Page(url, html) => {
  186.       //println(url)
  187.       //println(html)
  188.       if (linkQueue == None) linkQueue = Some(sender)
  189.       indexer ! CheckPage(url, html)
  190.     }
  191.     case ParsePage(url, html) => queue += ParsePage(url, html) //Response to CheckPage from ParseActor
  192.     case NeedPage => {
  193.       if(queue.isEmpty) sender ! NoPages
  194.       else sender ! queue.dequeue
  195.     }
  196.     case Link(url) => indexer ! CheckLink(url)
  197.     case QueueLink(url) => linkQueue.get ! QueueLink(url)
  198.     case Word(url, word) => {
  199.       //print(url)
  200.       //println(" " + word)
  201.       indexer ! Word(url, word)
  202.     }
  203.   }
  204. }
  205.  
  206. class ParseActor(pq: ActorRef) extends Actor with ActorLogging {
  207.   log.info("ParseActor created")
  208.   pq ! NeedPage
  209.  
  210.   def receive = {
  211.     case ParsePage(url, html) => {
  212.         val link = """\"(https?://[^\"]+)""".r
  213.         var linklist = link.findAllIn(html).toList
  214.         linklist = for {link <- linklist; newlinklist = link.substring(1)} yield newlinklist
  215.         linklist.foreach {sender ! Link(_)}
  216.         parse(html).foreach {pq ! Word(url, _)}
  217.         pq ! NeedPage
  218.     }
  219.     case NoPages => pq ! NeedPage
  220.   }
  221.     def parse(parse: String): List[String] = {
  222.         var html = parse;
  223.         html = html.replaceAll("""\"(https?://[^\"]+)""", "")
  224.         html = html.replaceAll("<[^>]*>", "")
  225.         html = html.replaceAll("""[ \t\x0B\f]+""", " ")
  226.         html = html.replaceAll("""(?m)^\s+$""", "")
  227.         html = html.replaceAll("""[^a-zA-Z0-9 ]""", "")
  228.         html = html.replaceAll("""/d""", "")
  229.         val list = html.split(" ").toList.distinct
  230.       //print(list)
  231.           return list
  232.     }  
  233. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement