Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
/**
 * Actor that owns the H2-backed index (tables `words`, `documents`, `index`).
 *
 * Handled messages:
 *  - `CheckPage(url, html)`: replies `ParsePage(url, html)` to the sender when
 *    `url` is not yet in `documents`.
 *  - `CheckLink(url)`: replies `QueueLink(url)` to the sender when `url` is not
 *    yet in `documents`.
 *  - `Word(url, word)`: makes sure the document and word rows exist, then
 *    records the (word, document) pair in `index`.
 *
 * NOTE(review): the tables are dropped and recreated from the constructor, so
 * every new instance of this actor (presumably including a supervisor restart)
 * starts from an empty database -- confirm this is intended.
 */
class IndexActor extends Actor with ActorLogging {

  ///////////////////////////////////////////////////////////////////
  // The code below is a starting point for your queries/updates to
  // the database. We have provided the database creation SQL for
  // you. You will not need to add any additional tables. Your goal
  // is to populate it with data you have received from parsed HTML
  // documents. We strongly suggest that you implement each of your
  // queries as individual methods in this class, where each method
  // corresponds to some query that is useful in building the index.
  ///////////////////////////////////////////////////////////////////

  // Necessary setup for connecting to the H2 database:
  Class.forName("org.h2.Driver")
  ConnectionPool.singleton("jdbc:h2:./indexer", "sa", "")

  // Typed explicitly and declared before the query methods so that implicit
  // resolution does not depend on declaration order.
  implicit val session: AutoSession.type = AutoSession

  // Create the database when this actor is instantiated.
  createDatabase

  def receive = {
    case CheckPage(url, html) =>
      if (!checkUrlExistance(url)) sender ! ParsePage(url, html)

    case CheckLink(url) =>
      if (!checkUrlExistance(url)) sender ! QueueLink(url)

    case Word(url, word) =>
      // addDoc/addWord already return the row ids, so no second lookup is needed.
      val docid = addDoc(url)
      val wordid = addWord(word)
      log.debug("{} {} about to index", docid, wordid)
      addIndex(wordid, docid)
  }

  /**
   * Inserts `url` into `documents` unless it is already present.
   *
   * @return the `docid` of the (new or existing) row
   */
  def addDoc(url: String): Int = {
    if (!checkUrlExistance(url)) {
      sql"""
        insert into documents (url) values (${url});
      """.update.apply()
    }
    getDocId(url)
  }

  /**
   * Inserts `word` into `words` unless it is already present.
   *
   * @return the `wordid` of the (new or existing) row
   */
  def addWord(word: String): Int = {
    if (!checkWordExistance(word)) {
      sql"""
        insert into words (word) values (${word});
      """.update.apply()
    }
    getWordId(word)
  }

  /**
   * Looks up the `docid` for `url`.
   *
   * @throws NoSuchElementException if `url` has no row in `documents`
   */
  def getDocId(url: String): Int =
    sql"""
      select docid from documents where url = ${url};
    """.map(rs => rs.int("docid")).single.apply()
      .getOrElse(throw new NoSuchElementException(s"no document row for url: $url"))

  /**
   * Looks up the `wordid` for `word`.
   *
   * @throws NoSuchElementException if `word` has no row in `words`
   */
  def getWordId(word: String): Int =
    sql"""
      select wordid from words where word = ${word};
    """.map(rs => rs.int("wordid")).single.apply()
      .getOrElse(throw new NoSuchElementException(s"no word row for word: $word"))

  /** Records that the word `wordid` occurs in the document `docid`. */
  def addIndex(wordid: Int, docid: Int): Unit = {
    sql"""
      insert into index (docid, wordid) values (${docid}, ${wordid});
    """.update.apply()
  }

  /**
   * Returns true when `link` is already stored in `documents`.
   *
   * The comparison is done by the database with the same `url = ?` predicate
   * that `getDocId` uses, instead of loading every url into memory.
   */
  def checkUrlExistance(link: String): Boolean =
    sql"""
      select count(*) as n from documents where url = ${link};
    """.map(rs => rs.int("n")).single.apply().exists(_ > 0)

  /**
   * Returns true when `word` is already stored in `words`.
   *
   * The comparison is done by the database with the same `word = ?` predicate
   * that `getWordId` uses, instead of loading every word into memory.
   */
  def checkWordExistance(word: String): Boolean =
    sql"""
      select count(*) as n from words where word = ${word};
    """.map(rs => rs.int("n")).single.apply().exists(_ > 0)

  /** Drops any existing tables and creates empty `words`, `documents` and `index` tables. */
  def createDatabase: Unit = {
    // `index` holds foreign keys into the other two tables, so it is dropped
    // first; dropping a referenced table while the referencing one still
    // exists can be rejected by the database.
    sql"""
      drop table index if exists;
      drop table words if exists;
      drop table documents if exists;
    """.update.apply()

    // Create the tables if they do not already exist:
    sql"""
      create table if not exists words (
        wordid int auto_increment,
        word varchar(50),
        primary key (wordid)
      );
    """.update.apply()

    sql"""
      create table if not exists documents (
        docid int auto_increment,
        url varchar(1024),
        primary key (docid)
      );
    """.update.apply()

    sql"""
      create table if not exists index (
        wordid int,
        docid int,
        foreign key (wordid) references words (wordid) on delete cascade,
        foreign key (docid) references documents (docid) on delete cascade
      );
    """.update.apply()
  }
}
/**
 * Actor that holds the queue of links waiting to be fetched and enforces an
 * upper bound on how many links are handed out.
 *
 * Handled messages:
 *  - `Page(url, html)`: passed on to `parseQueue` (with this actor as sender).
 *  - `NeedLink`: replies `FetchLink(url)` with the next queued link, or
 *    `NoLinks` when the queue is empty; once the budget is used up the actor
 *    system is shut down instead.
 *  - `QueueLink(url)`: appends `url` to the queue.
 *
 * @param parseQueue actor that receives every fetched page
 */
class LinkQueueActor(parseQueue: ActorRef) extends Actor with ActorLogging {
  // We have provided some definitions below which will help you with
  // you implementation. You are welcome to modify these, however, this
  // is what we used for our implementation.
  val queue = Queue[String]()

  // Remaining number of links that may still be handed out to fetchers.
  var limit = 500

  def receive = {
    case page @ Page(_, _) =>
      parseQueue ! page

    case NeedLink =>
      if (limit <= 0) {
        // Budget exhausted: stop the crawl and hand out nothing further.
        // `<=` (rather than `== 0`) keeps the limit in force even if several
        // requests arrive after the budget has been used up.
        context.system.shutdown()
      } else if (queue.isEmpty) {
        sender ! NoLinks
      } else {
        limit -= 1
        sender ! FetchLink(queue.dequeue())
      }

    case QueueLink(url) =>
      queue += url
  }
}
/**
 * Actor that repeatedly asks `queue` for a link, downloads it and sends the
 * resulting page back.
 *
 * Handled messages:
 *  - `NoLinks`: asks again with `NeedLink`.
 *  - `FetchLink(url)`: downloads `url`; on success sends `Page(url, body)` to
 *    `queue`; in either case asks for the next link.
 *
 * @param queue actor that hands out links and receives fetched pages
 */
class FetchActor(queue: ActorRef) extends Actor with ActorLogging {
  // This message will start off the process of fetching
  // links from the QueueActor. We include this for you!
  queue ! NeedLink

  def receive = {
    case NoLinks =>
      queue ! NeedLink

    case FetchLink(url) =>
      // Fetch exactly once: a second request would double the network traffic
      // and could fail even though the first one succeeded.
      val page = fetch(url)
      page.foreach(html => queue ! Page(url, html))
      page.failed.foreach(e => log.debug("failed to fetch {}: {}", url, e.getMessage))
      queue ! NeedLink
  }

  /**
   * Downloads `url` and returns the response body.
   *
   * @return `Success(body)`, or `Failure` wrapping whatever the HTTP call threw
   */
  def fetch(url: String): Try[String] =
    Try(Http(url).asString.body)
}
/**
 * Actor that sits between the link queue, the parsers and the indexer.
 *
 * Handled messages:
 *  - `Page(url, html)`: remembers the first sender as the link queue and asks
 *    the indexer (`CheckPage`) whether the page still needs parsing.
 *  - `ParsePage(url, html)`: queued until a parser asks for work.
 *  - `NeedPage`: replies with the next queued `ParsePage`, or `NoPages`.
 *  - `Link(url)`: asks the indexer (`CheckLink`) whether the link is new.
 *  - `QueueLink(url)`: passed on to the remembered link queue.
 *  - `Word(url, word)`: passed on to the indexer.
 *
 * @param indexer actor that answers `CheckPage`/`CheckLink` and stores words
 */
class ParseQueueActor(indexer: ActorRef) extends Actor with ActorLogging {
  // Set from the sender of the first `Page` message received.
  var linkQueue: Option[ActorRef] = None
  val queue = Queue[ParsePage]()

  def receive = {
    case Page(url, html) =>
      if (linkQueue.isEmpty) linkQueue = Some(sender)
      indexer ! CheckPage(url, html)

    // The indexer's reply to CheckPage for a page that is not indexed yet.
    case page @ ParsePage(_, _) =>
      queue += page

    case NeedPage =>
      if (queue.isEmpty) sender ! NoPages
      else sender ! queue.dequeue()

    case Link(url) =>
      indexer ! CheckLink(url)

    case QueueLink(url) =>
      linkQueue match {
        case Some(target) => target ! QueueLink(url)
        // No Page has been seen yet, so there is nowhere to send the link;
        // report it instead of failing on an empty Option.
        case None => log.warning("dropping link {}: no link queue known yet", url)
      }

    case word @ Word(_, _) =>
      indexer ! word
  }
}
/**
 * Actor that pulls pages from the parse queue, extracts their links and
 * words, and reports both.
 *
 * Handled messages:
 *  - `ParsePage(url, html)`: sends one `Link` per quoted http(s) URL found in
 *    `html` to the sender, one `Word(url, w)` per distinct word to `pq`, then
 *    asks `pq` for the next page.
 *  - `NoPages`: asks `pq` again with `NeedPage`.
 *
 * @param pq the parse-queue actor that supplies pages and receives words
 */
class ParseActor(pq: ActorRef) extends Actor with ActorLogging {
  // A double quote followed by an http/https URL; group 1 is the URL itself.
  private val linkPattern = """\"(https?://[^\"]+)""".r

  log.info("ParseActor created")
  pq ! NeedPage

  def receive = {
    case ParsePage(url, html) =>
      val replyTo = sender
      linkPattern.findAllMatchIn(html).map(_.group(1)).foreach(href => replyTo ! Link(href))
      parse(html).foreach(word => pq ! Word(url, word))
      pq ! NeedPage

    case NoPages =>
      pq ! NeedPage
  }

  /**
   * Extracts the distinct words of an HTML document, in order of first
   * occurrence.
   *
   * Quoted http(s) URLs and tags are removed, every character other than
   * ASCII letters, digits and whitespace is deleted, and the remainder is
   * split on whitespace. Tags and line breaks act as word separators, and
   * empty strings are never returned.
   *
   * @param parse the raw HTML text
   * @return the distinct words found; empty when there are none
   */
  def parse(parse: String): List[String] = {
    val withoutUrls = linkPattern.replaceAllIn(parse, "")
    // A tag becomes a space so that text on either side of it is not fused
    // into one word (e.g. "<p>a</p><p>b</p>").
    val withoutTags = withoutUrls.replaceAll("<[^>]*>", " ")
    // Whitespace (including newlines) is kept here so it can separate words
    // in the split below; everything else that is not alphanumeric is dropped.
    val cleaned = withoutTags.replaceAll("""[^a-zA-Z0-9\s]""", "")
    cleaned.split("""\s+""").toList.filter(_.nonEmpty).distinct
  }
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement