daily pastebin goal
44%
SHARE
TWEET

Untitled

a guest Oct 21st, 2018 60 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. #!/usr/bin/python
  2. """Counts the appearance of a word in a (probably large) file or stream reading it by chunks."""
  3.  
  4. import os
  5. import sys
  6. from time import time
  7.  
# Amount of data requested per read() when no chunk_size argument is given.
DEFAULT_CHUNK_SIZE = 2048
# Minimum number of seconds between two refreshes of the progress line.
REPORT_EVERY_SECS = 1
  10.  
  11. def _print_usage():
  12.     usage = []
  13.     usage.append('Usage: %s <file> <word> [chunk_size] [-q]\n' % sys.argv[0])
  14.     usage.append('  Counts occurrences of <word> in (probably large) <file> reading it by chunks.')
  15.     usage.append('  If <file> is "-" then stdin will be used as input.\n')
  16.     usage.append('  Options:')
  17.     usage.append('     -q\t\tto disable progress report (forced if read goes from stdin)')
  18.     print('\n'.join(usage))
  19.  
  20.  
  21. def _humanreadable_time(secs):
  22.     hours = mins = 0
  23.     mins, secs = divmod(secs, 60)
  24.     hours, mins = divmod(mins, 60)
  25.     return "%02d:%02d:%02d" % (hours, mins, secs)
  26.  
  27.  
  28. def _report(tell, size, start_time, occurrences):
  29.     if not size:
  30.         return
  31.     eta = speed = '???'
  32.     seconds = int(time() - start_time)
  33.     prcnt = 100. * tell / size
  34.     seconds_per_one_pcnt = seconds / prcnt if prcnt else 0
  35.  
  36.     if seconds_per_one_pcnt:
  37.         eta = _humanreadable_time(int((100 - prcnt) * seconds_per_one_pcnt))
  38.     if seconds:
  39.         speed = '%0.1f' % (prcnt / seconds)
  40.     msg = '\r%0.1f%% in %s  ' % (prcnt, _humanreadable_time(seconds))
  41.     msg += 'Found: %d  ' %  occurrences
  42.     msg += 'Avg speed: %s %%/s  ' % speed
  43.     if tell != size:
  44.         msg += 'ETA: %s' % eta
  45.     msg += ' ' * (80 - len(msg))
  46.  
  47.     sys.stdout.write(msg)
  48.     sys.stdout.flush()
  49.  
  50.  
  51. def _main(*args):
  52.     if len(args) < 2 or set(args) & {'--help', '-h'}:
  53.         _print_usage()
  54.         exit(not set(args) & {'--help', '-h'})
  55.     size = None
  56.     stream = args[1]
  57.  
  58.     try:
  59.         if args[1] == '-':
  60.             stream = sys.stdin
  61.         else:
  62.             stream = open(args[1], 'rb')
  63.  
  64.         term, step_back = args[2], len(args[2]) - 1
  65.  
  66.         chunk_size = int(args[3]) if len(args) > 3 else DEFAULT_CHUNK_SIZE
  67.         if chunk_size <= step_back:
  68.             chunk_size = step_back + DEFAULT_CHUNK_SIZE
  69.  
  70.         occurrences = 0
  71.         last_time = start = time()
  72.         size = None
  73.         if stream != sys.stdin and '-q' not in args:
  74.             size = os.fstat(stream.fileno()).st_size
  75.             _report(0, size, start, occurrences)
  76.  
  77.         tail, chunk = '', stream.read(chunk_size)
  78.         while chunk:
  79.             data = tail + chunk
  80.             occurrences += data.count(term)
  81.             tail = chunk[-step_back:]
  82.             chunk = stream.read(chunk_size)
  83.  
  84.             if size and time() - last_time > REPORT_EVERY_SECS:
  85.                 size = os.fstat(stream.fileno()).st_size
  86.                 _report(stream.tell(), size, start, occurrences)
  87.                 last_time = time()
  88.  
  89.         if size:
  90.             size = os.fstat(stream.fileno()).st_size
  91.             _report(size, size, start, occurrences)
  92.             sys.stdout.write('\n')
  93.             sys.stdout.flush()
  94.         print("%d" % occurrences)
  95.  
  96.     finally:
  97.         if stream != sys.stdin:
  98.             stream.close()
  99.  
  100.  
# Script entry point: pass the full argv (program name included) to _main.
if __name__ == '__main__':
    _main(*sys.argv)
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
 
Top