Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- require 'set'
# Extensions to the core File class used by the top-N script.
# NOTE(review): this reopens a core class; a refinement or a helper module
# would avoid leaking these methods into every consumer of File.
class File
  # Returns the last +n+ lines of the file at +path+, like `tail -n`.
  #
  # The file is scanned backwards in fixed-size buffers so that only its tail
  # is read. A newline that terminates the file does not count as starting an
  # additional (empty) line.
  #
  # @param path [String] path of the file to read
  # @param n [Integer] number of trailing lines wanted
  # @return [String] the last +n+ lines (the whole file when it has fewer
  #   lines, an empty string when +n+ is zero or negative)
  def self.tail(path, n = 10)
    File.open(path, 'r') do |file|
      file.seek(tail_start_offset(file, n))
      file.read
    end
  end

  # Finds the byte offset at which the last +n+ lines of +file+ begin.
  #
  # @param file [File] an open, seekable file
  # @param n [Integer] number of trailing lines wanted
  # @param buffer_size [Integer] bytes read per backwards step
  # @return [Integer] byte offset of the first wanted line
  def self.tail_start_offset(file, n, buffer_size = 512)
    size = file.size
    return size if n <= 0

    newline = "\n".ord
    remaining = n
    offset = size
    while offset > 0
      to_read = [buffer_size, offset].min
      offset -= to_read
      file.seek(offset)
      data = file.read(to_read)
      # Work on bytes, not characters, so offsets stay valid for any encoding.
      (to_read - 1).downto(0) do |i|
        next unless data.getbyte(i) == newline

        position = offset + i
        # The newline ending the file closes the last line; it opens no new one.
        next if position == size - 1

        remaining -= 1
        return position + 1 if remaining.zero?
      end
    end
    # Fewer than n lines in the file: the tail is the whole file.
    0
  end
  private_class_method :tail_start_offset

  # Yields successive chunks of at most +chunk_size+ bytes until end of file.
  #
  # @param chunk_size [Integer] maximum number of bytes per chunk
  # @yieldparam chunk [String] the bytes read
  # @return [Enumerator, nil] an Enumerator when no block is given
  def each_chunk(chunk_size)
    return enum_for(:each_chunk, chunk_size) unless block_given?

    yield read(chunk_size) until eof?
  end
end
# Returns the +n+ largest distinct integers stored in +filename+.
#
# Works from pre-sorted chunk files named ".<filename>_sorted_chunk_<i>" in the
# current directory, building them first when none exist. Each chunk is sorted
# ascending, so its largest values are its last lines; only those tails are
# read and merged.
#
# The tails are read in one thread per chunk. The original author measured
# this as slightly faster than shelling out to `tail -n` for every chunk.
#
# @param filename [String] name of the file holding whitespace-separated integers
# @param n [Integer] how many of the largest values to return
# @return [Array<Integer>] up to +n+ distinct values, largest first
def top_n(filename, n = 100)
  chunk_pattern = ".#{filename}_sorted_chunk_*"
  pre_sorted_chunks = Dir[chunk_pattern]
  if pre_sorted_chunks.empty?
    build_pre_sorted_chunks_for(filename)
    # Re-scan: the list captured above predates the chunk files just written.
    pre_sorted_chunks = Dir[chunk_pattern]
  end

  # Each thread returns its own values, so no collection is shared between threads.
  tasks = pre_sorted_chunks.map do |chunk_file_path|
    Thread.new(chunk_file_path) do |file_path|
      File.tail(file_path, n).split.map(&:to_i)
    end
  end

  # Thread#value joins the thread and re-raises any error it hit.
  tasks.flat_map(&:value).uniq.max(n)
end
# Splits +filename+ into chunk files of sorted, de-duplicated integers.
#
# Chunk i is written to ".<filename>_sorted_chunk_<i>" in the current
# directory, one integer per line in ascending order.
#
# NOTE: the original author measured about 5 minutes for the previous
# SortedSet-based version; this version has not been re-measured.
#
# @param filename [String] file of whitespace-separated integers
# @param chunk_size [Integer] approximate bytes per chunk (default 500 MB)
# @return [nil]
def build_pre_sorted_chunks_for(filename, chunk_size = 1024**2 * 500)
  File.open(filename) do |file|
    chunk_index = 0
    file.each_chunk(chunk_size) do |chunk|
      # A fixed-size read can stop in the middle of a number; pull in the rest
      # of the current line so no value is split across two chunks.
      # Assumes newline-separated input — TODO confirm for other separators.
      rest_of_line = file.gets
      chunk << rest_of_line.b if rest_of_line

      sorted_unique = chunk.split.map(&:to_i).uniq.sort
      File.write(".#{filename}_sorted_chunk_#{chunk_index}", sorted_unique.join("\n"))
      chunk_index += 1
    end
  end
end
# Command-line entry point: `<script> FILENAME N` prints the N largest numbers
# found in FILENAME, one per line.
#
# Both arguments are required. With N missing, `nil.to_i` would silently turn
# into a request for the top 0 values, so the usage message is shown instead.
if ARGV.size < 2
  puts "Please provide a filename and a N value to calculate the top N."
else
  filename, requested_count = ARGV
  puts top_n(filename, requested_count.to_i)
end
# Writes the integers 0...n in random order to a file, one per line.
#
# The previous approach drew random values until +n+ distinct ones had been
# seen, which yields a random permutation of 0...n; shuffling the range gives
# the same result directly.
#
# @param n [Integer] how many distinct numbers to write
# @param path [String] destination file (overwritten if it exists)
# @return [Integer] number of bytes written
def generate_random_number_file(n = 15_000_000, path = 'randos.txt')
  random_list = (0...n).to_a.shuffle.join("\n")
  File.write(path, random_list)
end
# Runs the given block once and prints how long it took, in milliseconds.
#
# Uses the monotonic clock, so wall-clock adjustments do not affect the result.
#
# @param task_name [String] label used in the printed message
# @yield the work to be timed
# @return [nil]
def _time(task_name = 'Task')
  monotonic_now = -> { Process.clock_gettime(Process::CLOCK_MONOTONIC) }
  started_at = monotonic_now.call
  yield
  elapsed_ms = (monotonic_now.call - started_at) * 1000
  puts "#{task_name} took #{elapsed_ms} milliseconds."
end
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement