Advertisement
Guest User

Untitled

a guest
Jun 16th, 2016
57
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
F# 2.54 KB | None | 0 0
  1. #r "../packages/FSharp.Data.2.2.0/lib/net40/FSharp.Data.dll"
  2.  
  3.  
  4. open FSharp.Data
  5. open System.IO
  6. open System
  7. open System.Collections.Concurrent
  8. open System.Collections.Generic
  9. open System.Threading.Tasks
  10.  
  11.  
  12.  
  13.  
  14. let getDomain (absUrl:string) =
  15.     let cut = absUrl.Replace("https://", "").Replace("http://", "")
  16.     let firstSlash = cut.IndexOf("/")
  17.     if firstSlash = -1 then "" else cut.Substring(0, firstSlash).ToLower()
  18.  
  19. let isRel (url:string) = url.StartsWith("/")
  20.  
  21. let notAFileUrl (url:string) = url.EndsWith("/")
  22.  
  23. let haveSameDomains (url1:string) (url2:string) =
  24.     if isRel url1 || isRel url2 then true
  25.     else
  26.         let domain1 = getDomain url1
  27.         let domain2 = getDomain url2
  28.        
  29.         String.Compare(domain1, "") <> 0 &&
  30.         String.Compare(domain2, "") <> 0 &&
  31.         (isRel url2 || (String.Compare(domain1, domain2, true) = 0))
  32.  
  33.  
  34. let getAbs (domain:string) = function
  35.     | what when isRel what -> "http://" + domain + what
  36.     | what -> what
  37.  
  38.  
  39. let getPagesWithImportances (urls_limit:int) (url:string) : ConcurrentDictionary<string, float> =
  40.     let importancesDict = ConcurrentDictionary<string, float>()
  41.     let visitedUrls = ConcurrentDictionary<string, int>()
  42.     let rec walkThrough (url:string) (max_depth:int) =
  43.         if max_depth > 0 then
  44.             printfn "GET %s" url
  45.             visitedUrls.AddOrUpdate(url, 0, fun _ _ -> 0) |> ignore
  46.             let links = (true, ["a"], (HtmlDocument.Load url))
  47.                         |||> HtmlDocument.descendantsNamedWithPath
  48.                         |> Seq.filter (fun (x, y) -> x.TryGetAttribute("href").IsSome)
  49.                         |> Seq.filter (fun (x, y) -> haveSameDomains url (x.AttributeValue("href")))
  50.                         |> Seq.map (fun (x, y) -> (getAbs (getDomain url) (x.AttributeValue "href"), 1.0 / float(List.length y)))                    
  51.             links
  52.                 |> Seq.toArray
  53.                 |> Array.Parallel.iter (fun (x, y) -> (importancesDict.AddOrUpdate(x, y, fun _ z -> y + z) |> ignore)) // update importances
  54.             links
  55.                 |> Seq.filter (fun (x, y) -> not (visitedUrls.ContainsKey(x))) // delete already visited urls
  56.                 |> Seq.filter (fun (x, y) -> notAFileUrl x)
  57.                 |> Seq.toArray
  58.                 |> Array.Parallel.iter (fun (x, y) -> walkThrough x (max_depth - 1))
  59.     walkThrough url urls_limit
  60.     importancesDict
  61.  
  62.  
  63. let start = "http://www.vox.com/2015/5/13/8598655/north-korea-execution"
  64.  
  65. let dict = getPagesWithImportances 4 start
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement