Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- (ns scraper
- (:use [clojure.contrib.duck-streams :only [spit]]
- [clojure.contrib.seq-utils :only [partition-all]])
- (:import (java.net URL)
- (java.io BufferedReader InputStreamReader FileReader)))
;; Target number of worker futures. `run` divides the URL count by this value
;; to decide how many URLs each future receives.
- (def num-threads 20)
(defn scrape
  "Fetches the resource at `url` (a string) and writes its text to a new file
  whose name is a freshly generated random UUID.

  The body is read line by line and re-joined with a newline between lines, so
  the saved text keeps its line structure. The stream is decoded with the
  platform default charset.

  Best-effort: any Exception (malformed URL, network failure, write failure)
  is reported on *err* and nil is returned instead of propagating."
  [url]
  (try
    (with-open [stream (.openStream (URL. url))]
      (let [reader (BufferedReader. (InputStreamReader. stream))
            ;; line-seq strips line terminators; without a separator, tokens
            ;; split across lines (e.g. `<a` / `href=`) would be fused.
            body   (apply str (interpose "\n" (line-seq reader)))]
        (spit (str (java.util.UUID/randomUUID)) body)))
    (catch Exception e
      ;; Keep going on failure, but leave a trace of which URL was skipped.
      (binding [*out* *err*]
        (println "scrape failed for" url "-" (str e)))
      nil)))
(defn process-urls
  "Calls `scrape` on each element of `list`, strictly in order, purely for its
  side effects. Always returns nil."
  [list]
  (loop [remaining (seq list)]
    ;; (seq ...) yields nil for an empty collection, which ends the loop.
    (when remaining
      (scrape (first remaining))
      (recur (next remaining)))))
(defn run
  "Reads URLs, one per line, from the file named `f` and scrapes them
  concurrently.

  The URLs are split into at most `num-threads` batches of near-equal size and
  each batch is handed to its own future running `process-urls`. The futures
  are not awaited; this function returns nil as soon as they are started."
  [f]
  (let [;; Realize every line before with-open closes the reader; line-seq is
        ;; lazy and would otherwise read from a closed stream.
        urls      (with-open [rdr (BufferedReader. (FileReader. f))]
                    (doall (line-seq rdr)))
        total     (count urls)
        ;; Ceiling division, so a URL count that is an exact multiple of
        ;; num-threads still uses all workers. Clamped to 1 so an empty file
        ;; never asks partition-all for zero-sized batches.
        per-batch (max 1 (quot (+ total (dec num-threads)) num-threads))]
    (doseq [batch (partition-all per-batch urls)]
      (future (process-urls batch)))))
- ;;(run "url.list")
Add Comment
Please, Sign In to add comment