Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import time
- import sys
- import os
- from heapq import heappop, heappush
- from itertools import islice
- from tempfile import gettempdir
- LINES_BUFFER = 2000 # 1MB / 502 bytes
- KB_64 = 64 * 1024
- def merge(chunks):
- key = lambda x: x
- values = []
- for index, chunk in enumerate(chunks):
- try:
- iterator = iter(chunk)
- value = iterator.next()
- except StopIteration:
- try:
- chunk.close()
- os.remove(chunk.name)
- chunks.remove(chunk)
- except:
- pass
- else:
- heappush(values, (key(value), index, value, iterator, chunk))
- while values:
- k, index, value, iterator, chunk = heappop(values)
- yield value
- try:
- value = iterator.next()
- except StopIteration:
- try:
- chunk.close()
- os.remove(chunk.name)
- chunks.remove(chunk)
- except:
- pass
- else:
- heappush(values, (key(value), index, value, iterator, chunk))
- def external_sort(input_file, output_file):
- temp_dir = gettempdir()
- input_file = file(input_file, 'rb', KB_64)
- try:
- input_iterator = iter(input_file)
- chunks = []
- output_chunk = None
- try:
- while True:
- current_chunk = list(islice(input_iterator, LINES_BUFFER))
- if current_chunk:
- current_chunk.sort()
- output_chunk = file(os.path.join(temp_dir, '%06i' % len(chunks)), 'w+b', KB_64)
- output_chunk.writelines(current_chunk)
- output_chunk.flush()
- output_chunk.seek(0)
- chunks.append(output_chunk)
- else:
- break
- except:
- for chunk in chunks:
- try:
- chunk.close()
- os.remove(chunk.name)
- except:
- pass
- if output_chunk not in chunks:
- try:
- output_chunk.close()
- os.remove(output_chunk.name)
- except:
- pass
- return
- finally:
- input_file.close()
- output_file = file(output_file, 'wb', KB_64)
- try:
- output_file.writelines(merge(chunks))
- finally:
- for chunk in chunks:
- try:
- chunk.close()
- os.remove(chunk.name)
- except:
- pass
- output_file.close()
- if __name__ == '__main__':
- t = time.time()
- if len(sys.argv) < 2:
- print("Input [output] file(s) please....")
- else:
- external_sort(sys.argv[1], sys.argv[2] if len(sys.argv) > 2 else "%s.out" % sys.argv[1])
- print("Sorted in %f sec." % (time.time() - t))
Add Comment
Please, Sign In to add comment