Advertisement
Guest User

Untitled

a guest
Jan 21st, 2017
88
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.94 KB | None | 0 0
  1. import org.jsoup.Jsoup
  2. import org.jsoup.nodes.Document
  3. import org.jsoup.nodes.Element
  4. import org.jsoup.select.Elements
  5. import scala.collection.JavaConversions._
  6. import java.net.{ URL, MalformedURLException }
  7. import scala.util.control.Exception._
  8.  
  /**
    * A hyperlink taken from a page.
    *
    * @param title the link's text
    * @param href  the value of the link's `href` attribute
    */
  final case class Link(title: String, href: String)
  10.  
  /**
    * The pieces of a web page that the crawler keeps.
    *
    * @param title           the page title text
    * @param body            the page body text
    * @param links           the links found on the page
    * @param metaDescription content of the page's `description` meta tag
    */
  final case class WebDocument(
    title: String,
    body: String,
    links: Seq[Link],
    metaDescription: String
  )
  15.  
  /**
    * Helpers that fetch a page with jsoup and pull out its title, body text,
    * links and selected meta tags.
    */
  object Crawler {

    // Explicit `.asScala` decorators for the Java collections jsoup returns,
    // instead of depending on an implicit Java-to-Scala conversion.
    import scala.collection.JavaConverters._

    type JDoc = org.jsoup.nodes.Document

    /**
      * Fetches `url` via `Jsoup.connect(url).get()`.
      * Nothing thrown by that call is caught here.
      */
    def get(url: String): JDoc = Jsoup.connect(url).get()

    /** Text of whatever the `title` selector matches in `doc`. */
    def titleText(doc: JDoc): String = doc.select("title").text

    /** Text of whatever the `body` selector matches in `doc`. */
    def bodyText(doc: JDoc): String = doc.select("body").text

    /**
      * Reads the `content` attribute of the first `meta` element whose
      * `name` attribute is `meta`.
      *
      * The result of `first` is wrapped in `Option`, so a page without a
      * matching tag yields the empty string rather than a null dereference.
      *
      * @param doc  the parsed page
      * @param meta the value to match against the `name` attribute
      * @return the `content` attribute, or `""` when no element matches
      */
    def safeMetaExtract(doc: JDoc, meta: String): String =
      Option(doc.select(s"meta[name=$meta]").first)
        .map(_.attr("content"))
        .getOrElse("")

    /** Content of the `keywords` meta tag, or `""` when there is none. */
    def metaKeywords(doc: JDoc): String = safeMetaExtract(doc, "keywords")

    /** Content of the `description` meta tag, or `""` when there is none. */
    def metaDescription(doc: JDoc): String = safeMetaExtract(doc, "description")

    /**
      * Extracts every `a[href]` element of `doc` as a [[Link]], in the
      * order the selector returns them. The `href` is taken as written in
      * the attribute.
      */
    def linkSequence(doc: JDoc): Seq[Link] =
      doc.select("a[href]").iterator.asScala
        .map(anchor => Link(anchor.text, anchor.attr("href")))
        .toList

    /** Builds a [[WebDocument]] from the title, body, links and description of `doc`. */
    def extract(doc: JDoc): WebDocument =
      WebDocument(
        title = titleText(doc),
        body = bodyText(doc),
        links = linkSequence(doc),
        metaDescription = metaDescription(doc)
      )

    /**
      * Validates `url` by constructing a `java.net.URL` from it.
      *
      * @return `Some` of the constructed URL's `toString`, or `None` when
      *         construction throws `MalformedURLException`
      */
    def safeURL(url: String): Option[String] =
      catching(classOf[MalformedURLException])
        .opt(new URL(url))
        .map(_.toString)

    /**
      * Crawls `url` and returns its [[WebDocument]]: fetch with [[get]],
      * then [[extract]].
      */
    def crawl(url: String): WebDocument = extract(get(url))
  }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement