Teivar

Untitled

Dec 14th, 2019
138
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. (ns word-crawler.service
  2.   (:require [clj-http.client :as client]
  3.             [clj-http.cookies :as cookies]
  4.             [word-crawler.config :refer [env]]
  5.             [mount.core :refer [defstate]]
  6.             [clojure.core.async :refer [chan close! >! <!! go go-loop <! thread]]
  7.             [lambdaisland.uri :refer [uri]]
  8.             [remus :as xml]
  9.             [clojure.tools.logging :as log]))
  10.  
  ;; Queue of pending search requests. Producers put maps onto it (the puts in
  ;; this file carry :word and :res-fn); the crawler loop takes them off.
  ;; Buffered for one million entries.
  (def ^:private word-requests
    (chan 1000000))
  12.  
  ;; run-crawler is defined further down; declare it so the state below can
  ;; refer to it.
  (declare run-crawler)

  ;; mount state for the crawler.
  ;;   :start  calls run-crawler and keeps whatever channel it returns.
  ;;   :stop   closes that channel. Whether this halts the crawl loop depends
  ;;           on run-crawler treating closure of its returned channel as a
  ;;           stop signal.
  (defstate word-crawler
    :start (run-crawler)
    :stop  (close! word-crawler))
  18.  
  ;; Dispatches a request map on its :engine key. Because the default dispatch
  ;; value is :bing, requests with a missing or unrecognised :engine are handled
  ;; by the :bing method.
  (defmulti ^:private search-engine
    :engine
    :default :bing)
  20.  
  ;; Bing implementation of search-engine.
  ;;
  ;; Takes a request map with:
  ;;   :word    the search term
  ;;   :res-fn  one-argument callback that receives the collection of result links
  ;;
  ;; The cookie store is shared by every call. When it is empty a HEAD request
  ;; to the Bing home page is made first so that the store gets populated.
  ;; The search itself asks for 10 results in RSS format and extracts the :link
  ;; of every feed entry.
  ;;
  ;; res-fn is called exactly once per request: with the links on success, or
  ;; with [] when anything (including the cookie-priming request) fails.
  (let [bing-cookies-store (cookies/cookie-store)]
    (defmethod search-engine :bing
      [{:keys [word res-fn]}]
      (let [links (try
                    ;; Priming lives inside the try: if it threw outside, res-fn
                    ;; would never be called and the requester would wait forever.
                    (when (empty? (cookies/get-cookies bing-cookies-store))
                      (client/head "https://www.bing.com"
                                   {:cookie-store bing-cookies-store}))
                    (let [response (client/get "https://www.bing.com/search?"
                                               {:query-params {:q word
                                                               :count 10
                                                               :format "rss"}
                                                :cookie-store bing-cookies-store
                                                :as :stream})]
                      ;; A streamed body must be closed by the caller to release
                      ;; the connection; mapv realises the links before it closes.
                      (with-open [body (:body response)]
                        (->> body
                             xml/parse-stream
                             :entries
                             (mapv :link))))
                    (catch Exception e
                      ;; Throwable first + message so the stack trace is logged.
                      (log/error e "Bing search failed for word:" word)
                      []))]
        (res-fn links))))
  40.  
  (defn- run-crawler
    "Starts the loop that drains `word-requests` and runs `search-engine` for
    every request on its own thread.

    At most `(env :max-http-connection)` searches (default 10) run at the same
    time. The limit is enforced with a buffered channel of permits: the loop
    parks while all permits are taken instead of spinning, and a permit is
    always handed back when a search finishes, even if it throws.

    Returns a channel that acts as the stop signal: closing it ends the loop.
    The loop also ends when `word-requests` is closed. Searches already running
    are allowed to finish."
    []
    (let [max-http-connection (or (env :max-http-connection) 10)
          permits             (chan max-http-connection)
          stop                (chan)]
      (go-loop []
        ;; Acquire a permit before taking a request so that a request is only
        ;; removed from the queue when there is capacity to serve it.
        ;; alts! is written fully qualified because the ns form does not refer it.
        (let [[_ acquired-on] (clojure.core.async/alts! [stop [permits :permit]]
                                                        :priority true)]
          (when-not (= acquired-on stop)
            (let [[msg taken-from] (clojure.core.async/alts! [stop word-requests]
                                                             :priority true)]
              ;; A nil msg means word-requests was closed: nothing more to do.
              (when-not (or (= taken-from stop) (nil? msg))
                (thread
                  (try
                    (search-engine msg)
                    (catch Exception e
                      (log/error e "search-engine failed for request word:" (:word msg)))
                    (finally
                      ;; Release the permit no matter how the search ended.
                      (<!! permits))))
                (recur))))))
      stop))
  53.  
  54.  
  (defn- get-sld
    "Returns the last two dot-separated labels of `url` (in this file it is
    given a host name), e.g. \"www.example.com\" -> \"example.com\".

    Only the final two labels are kept, so \"www.example.co.uk\" yields
    \"co.uk\". Returns nil when `url` is nil, contains no dot, or ends with
    a dot."
    [url]
    ;; some->> guards against nil: re-find throws on a nil input string.
    (some->> url (re-find #"[^.]+\.[^.]+$")))
  58.  
  ;; Discarded REPL scratch: times a search for a sample word list and tallies
  ;; the resulting hosts.
  #_(time
     (let [words   (into ["Clojure" "Python" "Delphi" "Scala" "JS" "NodeJS"
                          "Kotlin" "Haskell" "tmp" "Oracle"]
                         (repeat 12 "Kafka"))
           out     (chan (count words))
           deliver (fn [res] (go (>! out res)))]
       (doseq [w words]
         (go (>! word-requests {:word w :res-fn deliver})))
       ;; One take per submitted word; the takes happen eagerly inside reduce.
       (let [links (reduce (fn [acc _] (concat acc (<!! out))) [] words)]
         (close! out)
         (->> links
              distinct
              (map #(-> % uri :host get-sld))
              frequencies))))
  71.  
  (defn- words->links
    "Submits one search request per element of `words` to `word-requests` and
    blocks until a result has been delivered for each of them.

    Returns a vector with the links of all results concatenated, in the order
    the results arrived. Returns [] immediately for empty `words`, without
    creating a channel or submitting anything.

    Blocks the calling thread; do not call it from inside a go block."
    [words]
    (if (empty? words)
      []
      (let [pending (count words)
            ;; Buffer sized to hold every result, so delivering one never parks.
            c       (chan pending)
            res-fn  (fn [res] (go (>! c res)))]
        (doseq [w words]
          (go (>! word-requests {:word w :res-fn res-fn})))
        ;; Exactly one blocking take per submitted word; `into` realises them
        ;; all before the channel is closed.
        (let [links (into [] cat (repeatedly pending #(<!! c)))]
          (close! c)
          links))))
  83.  
  (defn words-frequencies
    "Searches for every word in `words` and counts, over the distinct result
    links, how often each domain (as computed by `get-sld`) occurs.

    Returns a map of domain string -> count. Links for which no domain can be
    derived (nil link, no host, host without a dot) are counted under the
    nil key instead of aborting the whole tally."
    [words]
    (->> (words->links words)
         distinct
         ;; some-> stops at the first nil so get-sld is never handed a nil host.
         (map #(some-> % uri :host get-sld))
         frequencies))
  88.  
  ;; Discarded REPL scratch: fires four timed batches of 100 identical
  ;; requests in parallel.
  #_(doseq [w ["clojure" "scala" "kotlin" "delphi"]]
      (future (time (words->links (repeat 100 w)))))
RAW Paste Data