SHARE
TWEET

Untitled

Teivar Dec 14th, 2019 (edited) 105 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. (ns word-crawler.service
  2.   (:require [clj-http.client :as client]
  3.             [clj-http.cookies :as cookies]
  4.             [word-crawler.config :refer [env]]
  5.             [mount.core :refer [defstate]]
  6.             [clojure.core.async :refer [chan close! >! <!! go go-loop <! thread]]
  7.             [lambdaisland.uri :refer [uri]]
  8.             [remus :as xml]
  9.             [clojure.tools.logging :as log]))
  10.  
  ;; Shared queue of pending search requests. Each message is a map consumed by
  ;; `search-engine`: it reads `:word` and `:res-fn`, and dispatches on `:engine`.
  (def ^:private word-requests
    ;; The buffer capacity is an element count, so it is written as an integer
    ;; literal instead of the floating-point 1e6.
    (chan 1000000))

  (declare run-crawler)

  ;; mount-managed lifecycle of the crawler: starting the state calls
  ;; `run-crawler` and keeps whatever it returns; stopping the state closes
  ;; that value.
  ;; NOTE(review): whether closing it really halts the dispatch loop depends on
  ;; what `run-crawler` returns -- confirm against its implementation.
  (defstate word-crawler
    :start (run-crawler)
    :stop (close! word-crawler))
  18.  
  ;; Dispatches a request message on its :engine key. Messages whose engine has
  ;; no registered method (including a missing :engine) use the :bing method.
  (defmulti ^:private search-engine (fn [msg] (:engine msg)) :default :bing)

  ;; One cookie store is shared by every Bing request made from this namespace.
  (let [bing-cookies-store (cookies/cookie-store)]
    ;; Bing implementation.
    ;;
    ;; Requests the RSS rendering of the Bing result page for `word`, extracts
    ;; the :link of every feed entry and passes the resulting vector to
    ;; `res-fn`. Returns whatever `res-fn` returns.
    ;;
    ;; `res-fn` is called exactly once per request: with the links on success,
    ;; or with [] when anything in the HTTP/parsing phase throws. That includes
    ;; the cookie-priming HEAD request, so a caller waiting on `res-fn` is never
    ;; left hanging by a network failure.
    (defmethod search-engine :bing
      [{:keys [word res-fn]}]
      (let [links (try
                    ;; Prime the cookie store on first use.
                    (when (empty? (cookies/get-cookies bing-cookies-store))
                      (client/head "https://www.bing.com"
                                   {:cookie-store bing-cookies-store}))
                    (let [resp (client/get "https://www.bing.com/search?"
                                           {:query-params {:q word
                                                           :count 10
                                                           :format "rss"}
                                            :cookie-store bing-cookies-store
                                            :as :stream})]
                      ;; With `:as :stream` the caller owns the body stream, so
                      ;; close it here. mapv realizes the links before closing.
                      (with-open [^java.io.Closeable body (:body resp)]
                        (->> body xml/parse-stream :entries (mapv :link))))
                    (catch Exception e
                      (log/error e "Bing search failed for word:" word)
                      []))]
        ;; Deliver outside the try so an exception thrown by `res-fn` itself
        ;; cannot trigger a second delivery from the catch branch.
        (res-fn links))))
  40.  
  (defn- run-crawler
    "Starts the dispatcher that feeds messages from `word-requests` to
    `search-engine`, running at most `(env :max-http-connection)` (default 10,
    minimum 1) searches at the same time, each on its own `thread`.

    Returns a channel that acts as the stop handle: closing it makes the
    dispatcher exit after the message it is currently handling. The dispatcher
    also exits when `word-requests` is closed. Searches already running are
    allowed to finish."
    []
    (let [max-http-connection (max 1 (or (env :max-http-connection) 10))
          ;; One buffered element per running search. Putting parks the loop
          ;; (instead of spinning) while the limit is reached.
          in-flight (chan max-http-connection)
          stop (chan)]
      (go-loop []
        ;; :priority makes a closed stop channel win over pending requests.
        (let [[msg port] (clojure.core.async/alts! [stop word-requests]
                                                   :priority true)]
          ;; A nil msg means either `stop` or `word-requests` was closed.
          (when (and (some? msg) (= port word-requests))
            ;; Reserve the slot before spawning, so the limit cannot be overrun.
            (>! in-flight :slot)
            (thread
              (try
                (search-engine msg)
                (catch Exception e
                  (log/error e "search-engine failed for message:" (dissoc msg :res-fn)))
                (finally
                  ;; Always release the slot, even when the search throws.
                  (<!! in-flight))))
            (recur))))
      stop))
  53.  
  54.  
  (defn- get-sld
    "Returns the last two dot-separated labels of the host name `url`,
    e.g. \"www.bing.com\" -> \"bing.com\".

    Returns nil when `url` is nil or has no dot-separated pair at its end
    (e.g. \"localhost\"). Multi-part public suffixes are not recognised:
    \"bbc.co.uk\" yields \"co.uk\"."
    [url]
    ;; re-find throws on nil, and a link without a host component yields nil.
    (when url
      (re-find #"[^\.]+[\.]{1}[^\.]+$" url)))
  58.  
  59. #_(time (let [words ["Clojure" "Python" "Delphi" "Scala" "JS" "NodeJS" "Kotlin" "Haskell" "tmp" "Oracle" "Kafka"
  60.                      "Kafka""Kafka""Kafka""Kafka""Kafka""Kafka""Kafka""Kafka""Kafka""Kafka""Kafka"]
  61.               c (chan (count words))
  62.               res-fn (fn [res] (go (>! c res)))
  63.               results (atom [])]
  64.           (doseq [w words]
  65.             (go (>! word-requests {:word w :res-fn res-fn})))
  66.           (doseq [w words]
  67.             (let [value (<!! c)]
  68.               (swap! results concat value)))
  69.           (close! c)
  70.           (->> @results distinct (map #(-> % uri :host get-sld)) frequencies)))
  71.  
  (defn- words->links
    "Queues one search request per element of `words` on `word-requests` and
    blocks until one result per word has been delivered. Returns a vector with
    all delivered links; results arrive in completion order, not in the order
    of `words`. Returns [] for empty input without touching the queue.

    Blocks indefinitely if a request is never answered through its `:res-fn`."
    [words]
    (let [words (vec words)
          n (count words)]
      (if (zero? n)
        []
        (let [c (chan n)
              ;; c can buffer every expected result, so this put never pends.
              res-fn (fn [res] (go (>! c res)))]
          (doseq [w words]
            (go (>! word-requests {:word w :res-fn res-fn})))
          ;; Collect eagerly into a vector. Chaining `concat` once per result
          ;; builds nested lazy seqs that can overflow the stack when realized.
          (let [links (into [] (mapcat (fn [_] (<!! c))) words)]
            (close! c)
            links)))))
  83.  
  (defn words-frequencies
    "Searches for every word in `words` and returns a map from the domain
    produced by `get-sld` for each distinct result link's host to the number
    of distinct links that map to it."
    [words]
    (let [link->domain (fn [link] (get-sld (:host (uri link))))
          unique-links (distinct (words->links words))]
      (frequencies (map link->domain unique-links))))
  88.  
  89. #_(let []
  90.     (future (time (words->links (take 100 (repeat "clojure")))))
  91.     (future (time (words->links (take 100 (repeat "scala")))))
  92.     (future (time (words->links (take 100 (repeat "kotlin")))))
  93.     (future (time (words->links (take 100 (repeat "delphi"))))))
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
Top