Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/python
- """Counts the appearance of a word in a (probably large) file or stream reading it by chunks."""
- import os
- import sys
- from time import time
- DEFAULT_CHUNK_SIZE = 2048
- REPORT_EVERY_SECS = 1
def _print_usage():
    """Print the command-line help text to stdout.

    The program name shown in the first line is taken from ``sys.argv[0]``.
    """
    help_lines = (
        'Usage: %s <file> <word> [chunk_size] [-q]\n' % sys.argv[0],
        ' Counts occurrences of <word> in (probably large) <file> reading it by chunks.',
        ' If <file> is "-" then stdin will be used as input.\n',
        ' Options:',
        ' -q\t\tto disable progress report (forced if read goes from stdin)',
    )
    print('\n'.join(help_lines))
def _humanreadable_time(secs):
    """Format a duration given in seconds as ``HH:MM:SS``.

    Hours are not wrapped at 24, so long durations simply produce a wider
    hours field (e.g. ``100:00:00``).
    """
    whole_minutes, sec_part = divmod(secs, 60)
    hour_part, min_part = divmod(whole_minutes, 60)
    return "%02d:%02d:%02d" % (hour_part, min_part, sec_part)
def _report(tell, size, start_time, occurrences):
    """Rewrite the one-line progress report on stdout.

    tell        -- current read position in the input
    size        -- total input size; nothing is printed when it is falsy
    start_time  -- ``time()`` value taken when processing started
    occurrences -- number of matches found so far

    The line starts with a carriage return so each report overwrites the
    previous one, and is padded with spaces to 80 characters.
    """
    if not size:
        return

    elapsed = int(time() - start_time)
    percent = 100. * tell / size

    # The ETA extrapolates from the average time spent per percent so far;
    # it stays unknown until both some time and some progress are recorded.
    if percent:
        secs_per_percent = elapsed / percent
    else:
        secs_per_percent = 0
    if secs_per_percent:
        eta = _humanreadable_time(int((100 - percent) * secs_per_percent))
    else:
        eta = '???'

    if elapsed:
        speed = '%0.1f' % (percent / elapsed)
    else:
        speed = '???'

    pieces = [
        '\r%0.1f%% in %s ' % (percent, _humanreadable_time(elapsed)),
        'Found: %d ' % occurrences,
        'Avg speed: %s %%/s ' % speed,
    ]
    # The ETA is omitted from the final (100%) report.
    if tell != size:
        pieces.append('ETA: %s' % eta)

    sys.stdout.write(''.join(pieces).ljust(80))
    sys.stdout.flush()
def _iter_running_counts(stream, term, chunk_size):
    """Yield the running match count of ``term`` after each chunk of ``stream``.

    Matches are counted left to right without overlapping, exactly as
    ``whole_data.count(term)`` would, regardless of where chunk boundaries
    fall. ``term`` must be non-empty.
    """
    chunk = stream.read(chunk_size)
    if isinstance(chunk, bytes) and not isinstance(term, bytes):
        # Python 3: the stream is binary but the word came from argv as text.
        # os.fsencode restores the bytes that were typed on the command line.
        term = os.fsencode(term)
    term_len = len(term)
    overlap = term_len - 1
    total = 0
    tail = chunk[:0]  # empty value of the same type as the stream data
    while chunk:
        data = tail + chunk
        resume = 0
        hit = data.find(term)
        while hit >= 0:
            total += 1
            resume = hit + term_len
            hit = data.find(term, resume)
        # Keep only data that may still start a match completed by the next
        # chunk: at most len(term) - 1 characters, and never anything that
        # was already consumed by a counted match.
        tail = data[max(resume, len(data) - overlap):]
        yield total
        chunk = stream.read(chunk_size)


def _main(*args):
    """Run the command-line tool.

    ``args`` mirrors ``sys.argv``: program name, <file> (``-`` for stdin),
    <word>, then optionally a chunk size and/or ``-q`` in any order.

    Prints the usage text and exits when arguments are missing (status 1) or
    help is requested with ``-h``/``--help`` (status 0). Exits with an error
    message when <word> is empty or the chunk size is not an integer.
    Otherwise prints the number of occurrences of <word> on its own line,
    preceded by progress reports unless reading from stdin or ``-q`` is given.
    """
    wants_help = bool(set(args) & {'--help', '-h'})
    # Both <file> and <word> are required, i.e. at least three argv entries.
    if len(args) < 3 or wants_help:
        _print_usage()
        sys.exit(int(not wants_help))

    path, term = args[1], args[2]
    if not term:
        sys.exit('Error: <word> must not be empty')

    quiet = '-q' in args
    # '-q' may appear with or without a chunk size, so skip it when looking
    # for the optional numeric argument.
    extras = [arg for arg in args[3:] if arg != '-q']
    try:
        chunk_size = int(extras[0]) if extras else DEFAULT_CHUNK_SIZE
    except ValueError:
        sys.exit('Error: invalid chunk size %r' % extras[0])
    overlap = len(term) - 1
    if chunk_size <= overlap:
        chunk_size = overlap + DEFAULT_CHUNK_SIZE

    from_stdin = path == '-'
    if from_stdin:
        # Prefer the binary layer (Python 3) so stdin and files are both
        # searched as bytes.
        stream = getattr(sys.stdin, 'buffer', sys.stdin)
    else:
        # Opened outside the try block: if this fails there is nothing to
        # close and the original error propagates unchanged.
        stream = open(path, 'rb')

    try:
        occurrences = 0
        last_report = start = time()
        show_progress = False
        if not from_stdin and not quiet:
            size = os.fstat(stream.fileno()).st_size
            show_progress = bool(size)
            _report(0, size, start, occurrences)

        for occurrences in _iter_running_counts(stream, term, chunk_size):
            if show_progress and time() - last_report > REPORT_EVERY_SECS:
                # Re-read the size on every report in case the file grows.
                size = os.fstat(stream.fileno()).st_size
                _report(stream.tell(), size, start, occurrences)
                last_report = time()

        if show_progress:
            size = os.fstat(stream.fileno()).st_size
            _report(size, size, start, occurrences)
            sys.stdout.write('\n')
            sys.stdout.flush()
        print("%d" % occurrences)
    finally:
        if not from_stdin:
            stream.close()
# Script entry point: hand the complete argv (program name included) to _main.
if __name__ == "__main__":
    _main(*sys.argv)
Add Comment
Please, Sign In to add comment