Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #r "../packages/FSharp.Data.2.2.0/lib/net40/FSharp.Data.dll"
- open FSharp.Data
- open System.IO
- open System
- open System.Collections.Concurrent
- open System.Collections.Generic
- open System.Threading.Tasks
/// Extracts the lower-cased host part of a URL.
///
/// A leading "http://" or "https://" scheme (matched case-insensitively) is
/// removed, and everything up to the first '/', '?' or '#' is returned.
/// A URL without any path ("http://example.com") yields its host rather than
/// an empty string. Inputs that start with '/', '?' or '#' (for example
/// site-relative links) yield "".
let getDomain (absUrl:string) =
    // Remove the scheme only when it is a real prefix; a blanket Replace would
    // also rewrite scheme-like text embedded later in the URL.
    let stripPrefix (prefix:string) (text:string) =
        if text.StartsWith(prefix, StringComparison.OrdinalIgnoreCase)
        then text.Substring(prefix.Length)
        else text
    let cut = absUrl |> stripPrefix "https://" |> stripPrefix "http://"
    // The host ends at the first path, query or fragment delimiter.
    let hostEnd = cut.IndexOfAny([| '/'; '?'; '#' |])
    let host = if hostEnd = -1 then cut else cut.Substring(0, hostEnd)
    // Host names are ASCII protocol text: lower-case them culture-independently.
    host.ToLowerInvariant()
/// True when `url` is a site-relative link, i.e. it starts with a single '/'.
/// Protocol-relative URLs ("//host/path") name their own host and are
/// therefore NOT treated as relative.
let isRel (url:string) =
    // Ordinal comparison: URLs are protocol text, not culture-dependent prose.
    url.StartsWith("/", StringComparison.Ordinal)
    && not (url.StartsWith("//", StringComparison.Ordinal))
/// Heuristic used by the crawler: a URL is taken to be a page/directory
/// (rather than a file) exactly when it ends with '/'.
let notAFileUrl (url:string) =
    // Ordinal comparison: URLs are protocol text, not culture-dependent prose.
    url.EndsWith("/", StringComparison.Ordinal)
/// Decides whether two URLs should be considered part of the same site.
///
/// If either URL is relative (per `isRel`) the answer is `true`, since a
/// relative link has no host of its own. Otherwise both domains (per
/// `getDomain`) must be non-empty and equal, ignoring case.
let haveSameDomains (url1:string) (url2:string) =
    if isRel url1 || isRel url2 then true
    else
        let domain1 = getDomain url1
        let domain2 = getDomain url2
        // getDomain signals "no domain" with an empty string; such URLs never match.
        not (String.IsNullOrEmpty domain1)
        && not (String.IsNullOrEmpty domain2)
        // Host names are compared ordinally, not with culture-specific rules.
        && String.Equals(domain1, domain2, StringComparison.OrdinalIgnoreCase)
/// Makes a link absolute: a relative link (per `isRel`) is prefixed with
/// "http://" and `domain`; any other link is returned unchanged.
/// Note that the scheme is always "http://", whatever scheme the page used.
let getAbs (domain:string) (link:string) =
    if isRel link then
        "http://" + domain + link
    else
        link
/// Crawls pages starting at `url` and accumulates an "importance" score for
/// every same-site link found.
///
/// Each `<a href>` on a visited page adds `1 / n` to the score of its
/// (absolute) target, where `n` is the length of the path list that
/// `HtmlDocument.descendantsNamedWithPath` returns alongside the anchor.
///
/// `urls_limit` - despite its name, this is the maximum crawl depth (it is
///                passed as `max_depth`), not a cap on the number of URLs.
/// `url`        - the page the crawl starts from.
/// Returns the dictionary mapping link URL to accumulated score.
///
/// Side effects: prints "GET <url>" for every fetched page and writes
/// "SKIP <url> (<reason>)" to stderr for pages that could not be fetched.
let getPagesWithImportances (urls_limit:int) (url:string) : ConcurrentDictionary<string, float> =
    let importancesDict = ConcurrentDictionary<string, float>()
    let visitedUrls = ConcurrentDictionary<string, int>()

    // Failures that mean "this page cannot be fetched", as opposed to a bug.
    // NOTE(review): the exact exception set thrown by HtmlDocument.Load is
    // assumed here - confirm against the FSharp.Data version in use.
    let isFetchFailure (e:exn) =
        (e :? System.Net.WebException)
        || (e :? IOException)
        || (e :? UriFormatException)
        || (e :? NotSupportedException)

    // One unreachable page must not abort the whole (parallel) crawl:
    // log it and carry on with the remaining pages.
    let tryLoad (pageUrl:string) =
        try Some (HtmlDocument.Load pageUrl)
        with e when isFetchFailure e ->
            eprintfn "SKIP %s (%s)" pageUrl e.Message
            None

    let rec walkThrough (pageUrl:string) (max_depth:int) =
        // TryAdd atomically claims the URL, so when several parallel branches
        // reach the same page only one of them fetches and scores it.
        if max_depth > 0 && visitedUrls.TryAdd(pageUrl, 0) then
            printfn "GET %s" pageUrl
            match tryLoad pageUrl with
            | None -> ()
            | Some page ->
                let pageDomain = getDomain pageUrl
                // Materialised once: the array is consumed twice below.
                let links =
                    page
                    |> HtmlDocument.descendantsNamedWithPath true ["a"]
                    |> Seq.filter (fun (anchor, _) -> anchor.TryGetAttribute("href").IsSome)
                    |> Seq.map (fun (anchor, path) -> anchor.AttributeValue("href"), path)
                    |> Seq.filter (fun (href, _) -> haveSameDomains pageUrl href)
                    |> Seq.map (fun (href, path) ->
                        // max 1 guards against an empty path, which would
                        // otherwise produce an infinite weight.
                        getAbs pageDomain href, 1.0 / float (max 1 (List.length path)))
                    |> Seq.toArray
                // Update importances.
                links
                |> Array.Parallel.iter (fun (link, weight) ->
                    importancesDict.AddOrUpdate(link, weight, fun _ total -> total + weight) |> ignore)
                // Recurse into page-like links. The ContainsKey test merely
                // avoids pointless calls; the TryAdd above is what guarantees
                // a single visit per URL.
                links
                |> Seq.map fst
                |> Seq.filter (fun link -> notAFileUrl link && not (visitedUrls.ContainsKey link))
                |> Seq.distinct
                |> Seq.toArray
                |> Array.Parallel.iter (fun link -> walkThrough link (max_depth - 1))

    walkThrough url urls_limit
    importancesDict
// Script entry point: crawl from the start page, passing 4 as the limit
// (used by getPagesWithImportances as the maximum depth), and keep the scores.
let start = "http://www.vox.com/2015/5/13/8598655/north-korea-execution"
let dict = start |> getPagesWithImportances 4
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement