Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- (ns clj-etl-utils.sequences)
;; random-sample-seq
;;
;; Selection sampling over a sequence: walks `population-seq` once and
;; picks each item with probability
;;   remaining-samples-needed / population-size
;; where both counters are decremented as items are consumed / selected.
;;
;; Arguments:
;;   population-seq            - the items to sample from (any seqable).
;;   population-size           - number of items the caller expects in
;;                               population-seq.  It is handed to
;;                               java.util.Random.nextInt, so it must fit in
;;                               an int.
;;   remaining-samples-needed  - how many items to select.
;;
;; Returns nil when nothing (more) can be selected, otherwise a seq whose
;; first element is the next selected item and whose rest is computed
;; lazily.  Selected items keep their original relative order.
;;
;; Notes:
;;   * Rejected items are skipped with loop/recur rather than a nested
;;     self-call, so a long run of rejections (small sample, huge
;;     population) cannot overflow the stack.
;;   * If population-size runs down to zero before population-seq is
;;     exhausted (the caller underestimated the size), sampling simply
;;     stops instead of calling nextInt with a non-positive bound, which
;;     would throw IllegalArgumentException.
;;   * One java.util.Random instance is shared by every call.
(def random-sample-seq
  (let [rnd (java.util.Random.)]
    (fn self [population-seq population-size remaining-samples-needed]
      (loop [items  (seq population-seq)
             size   population-size
             needed remaining-samples-needed]
        (cond
          ;; Done: enough samples taken, input exhausted, or the declared
          ;; population size has been used up.
          (or (zero? needed) (nil? items) (not (pos? size)))
          nil

          ;; Select the current item; the tail is only sampled on demand.
          (< (.nextInt rnd size) needed)
          (let [item (first items)
                more (next items)]
            (cons item
                  (lazy-seq (self more (dec size) (dec needed)))))

          ;; Reject the current item and keep scanning in constant stack.
          :else
          (recur (next items) (dec size) needed))))))
(comment
  ;; REPL usage example (never evaluated when the file is loaded):
  ;; write a 20,000-line random sample of "data-to-be-sampled.tab" to
  ;; "/tmp/20k-sample.tab", telling the sampler the input has 390,000,000
  ;; lines.
  ;;
  ;; NOTE(review): `ds` is not aliased by the ns form in this file;
  ;; presumably it is a stream/IO helper namespace required at the REPL --
  ;; confirm before running.
  (with-open [outp (ds/writer "/tmp/20k-sample.tab")]
    (doseq [line (clj-etl-utils.sequences/random-sample-seq
                  (ds/read-lines "data-to-be-sampled.tab")
                  390000000
                  20000
                  ;; random-sample-seq takes exactly three arguments, so
                  ;; passing this progress reporter as a fourth raised an
                  ;; arity error.  It is kept here, discarded by the reader
                  ;; via #_, until the sampler grows a progress hook.
                  #_(clj-etl-utils.lang/make-periodic-invoker
                     10
                     #(printf "at %s out of %s: %3.2f\n" %1 100 (* 100 (/ %1 100.0)))))]
      (.println outp line)))
  )
Add Comment
Please, Sign In to add comment