Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- open System
- open System.Net
- let log fmt =
- Printf.kprintf (fun str -> // todo: lock | agent
- printfn "%-4O %-10O %O" Environment.CurrentManagedThreadId (DateTime.Now.ToLongTimeString()) str
- )
- fmt
- type File = {
- ContentType : string;
- Path : string
- }
- type DownloadResult =
- | Error of exn
- | Content of File
- module files =
- let download trgFileName srcUrl = async {
- try
- let w = WebRequest.Create(Uri srcUrl)
- use! r = w.AsyncGetResponse()
- use f = new IO.FileStream(trgFileName, IO.FileMode.Create, IO.FileAccess.Write, IO.FileShare.None)
- r.GetResponseStream().CopyTo f
- return Content { ContentType = r.ContentType; Path = trgFileName }
- with e ->
- return Error e
- }
- module filesTest =
- let download trgFileName srcUrl = async {
- return Error (exn "todo")
- }
- let computeHash (s:IO.Stream) =
- Security.Cryptography.HashAlgorithm.Create().ComputeHash s
- let fileHash (path:string) =
- use fs = new IO.FileStream(path, IO.FileMode.Open, IO.FileAccess.Read)
- let hc = computeHash fs
- BitConverter.ToString(hc).Replace("-", "")
- let trimQuery url =
- Uri(url).GetLeftPart(UriPartial.Path)
- open files
- // open filesTest
- let urls htmlPath : string list = [
- // todo: extract links from html. Load HtmlDocument and travers DOM nodes
- ]
- let crawl trgFolder maxDepth url =
- let cs = new Threading.CancellationTokenSource()
- let m = MailboxProcessor<int * string>.Start(fun inbox -> async { // depth * url
- use entries = new IO.StreamWriter(IO.Path.Combine(trgFolder, "entries.txt"), true)
- let visited = Collections.Generic.HashSet<string>()
- while true do
- let! depth, url = inbox.Receive()
- if depth < maxDepth then
- let url' = trimQuery url
- if not(visited.Add url') then
- log "downloading %O" url
- let! r = download (IO.Path.GetTempFileName()) url'
- match r with
- | Error e -> log "error %O" e.Message
- | Content f -> // todo: async { } |> Async.Start + cs.Token + visited - Concurrent.ConcurrentDictionary
- let hc = fileHash f.Path
- log "processing %O as %O" url hc
- let path = IO.Path.Combine(trgFolder, hc)
- IO.File.Move(f.Path, path)
- entries.WriteLine(sprintf "%O t %O" hc url)
- for u in urls path |> Seq.map trimQuery do
- if visited.Add u then
- inbox.Post (depth+1, u)
- if inbox.CurrentQueueLength = 0 then
- log "done"
- cs.Cancel()
- }, cs.Token)
- m.Post (0, url)
- cs
- let c = crawl __SOURCE_DIRECTORY__ 2 "https://gist.github.com"
- // c.Cancel()
- type Url = Url of string
- let! r = download (IO.Path.GetTempFileName()) url'
- match r with
- | Error e -> log "error %O" e.Message
- | Content f -> // todo: async { } |> Async.Start + cs.Token + visited - Concurrent.ConcurrentDictionary
- let hc = fileHash f.Path
- log "processing %O as %O" url hc
- let path = IO.Path.Combine(trgFolder, hc)
- IO.File.Move(f.Path, path)
- entries.WriteLine(sprintf "%O t %O" hc url)
- for u in urls path |> Seq.map trimQuery do
- if visited.Add u then
- inbox.Post (depth+1, u)
Add Comment
Please, Sign In to add comment