Guest User

Untitled

a guest
May 21st, 2018
84
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.69 KB | None | 0 0
  1. open System
  2. open System.Net
  3.  
  4. let log fmt =
  5. Printf.kprintf (fun str -> // todo: lock | agent
  6. printfn "%-4O %-10O %O" Environment.CurrentManagedThreadId (DateTime.Now.ToLongTimeString()) str
  7. )
  8. fmt
  9.  
  10. type File = {
  11. ContentType : string;
  12. Path : string
  13. }
  14.  
  15. type DownloadResult =
  16. | Error of exn
  17. | Content of File
  18.  
  19. module files =
  20. let download trgFileName srcUrl = async {
  21. try
  22. let w = WebRequest.Create(Uri srcUrl)
  23. use! r = w.AsyncGetResponse()
  24. use f = new IO.FileStream(trgFileName, IO.FileMode.Create, IO.FileAccess.Write, IO.FileShare.None)
  25. r.GetResponseStream().CopyTo f
  26. return Content { ContentType = r.ContentType; Path = trgFileName }
  27. with e ->
  28. return Error e
  29. }
  30.  
  31. module filesTest =
  32. let download trgFileName srcUrl = async {
  33. return Error (exn "todo")
  34. }
  35.  
  36. let computeHash (s:IO.Stream) =
  37. Security.Cryptography.HashAlgorithm.Create().ComputeHash s
  38.  
  39. let fileHash (path:string) =
  40. use fs = new IO.FileStream(path, IO.FileMode.Open, IO.FileAccess.Read)
  41. let hc = computeHash fs
  42. BitConverter.ToString(hc).Replace("-", "")
  43.  
  44. let trimQuery url =
  45. Uri(url).GetLeftPart(UriPartial.Path)
  46.  
  47. open files
  48. // open filesTest
  49.  
  50. let urls htmlPath : string list = [
  51. // todo: extract links from html. Load HtmlDocument and travers DOM nodes
  52. ]
  53.  
  54. let crawl trgFolder maxDepth url =
  55. let cs = new Threading.CancellationTokenSource()
  56. let m = MailboxProcessor<int * string>.Start(fun inbox -> async { // depth * url
  57. use entries = new IO.StreamWriter(IO.Path.Combine(trgFolder, "entries.txt"), true)
  58. let visited = Collections.Generic.HashSet<string>()
  59. while true do
  60. let! depth, url = inbox.Receive()
  61. if depth < maxDepth then
  62. let url' = trimQuery url
  63. if not(visited.Add url') then
  64. log "downloading %O" url
  65. let! r = download (IO.Path.GetTempFileName()) url'
  66. match r with
  67. | Error e -> log "error %O" e.Message
  68. | Content f -> // todo: async { } |> Async.Start + cs.Token + visited - Concurrent.ConcurrentDictionary
  69. let hc = fileHash f.Path
  70. log "processing %O as %O" url hc
  71. let path = IO.Path.Combine(trgFolder, hc)
  72. IO.File.Move(f.Path, path)
  73. entries.WriteLine(sprintf "%O t %O" hc url)
  74. for u in urls path |> Seq.map trimQuery do
  75. if visited.Add u then
  76. inbox.Post (depth+1, u)
  77. if inbox.CurrentQueueLength = 0 then
  78. log "done"
  79. cs.Cancel()
  80. }, cs.Token)
  81. m.Post (0, url)
  82. cs
  83.  
  84. let c = crawl __SOURCE_DIRECTORY__ 2 "https://gist.github.com"
  85. // c.Cancel()
  86.  
  87. type Url = Url of string
  88.  
  89. let! r = download (IO.Path.GetTempFileName()) url'
  90. match r with
  91. | Error e -> log "error %O" e.Message
  92. | Content f -> // todo: async { } |> Async.Start + cs.Token + visited - Concurrent.ConcurrentDictionary
  93. let hc = fileHash f.Path
  94. log "processing %O as %O" url hc
  95. let path = IO.Path.Combine(trgFolder, hc)
  96. IO.File.Move(f.Path, path)
  97. entries.WriteLine(sprintf "%O t %O" hc url)
  98. for u in urls path |> Seq.map trimQuery do
  99. if visited.Add u then
  100. inbox.Post (depth+1, u)
Add Comment
Please, Sign In to add comment