Guest User

Untitled

a guest
Oct 21st, 2018
78
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.00 KB | None | 0 0
  1. #!/usr/bin/python
  2. """Counts the appearance of a word in a (probably large) file or stream reading it by chunks."""
  3.  
  4. import os
  5. import sys
  6. from time import time
  7.  
# Size passed to stream.read() when no chunk_size argument is given; also
# added on top of the overlap length when the requested size is too small.
DEFAULT_CHUNK_SIZE = 2048
# Minimum number of seconds that must pass between two progress reports.
REPORT_EVERY_SECS = 1
  10.  
  11. def _print_usage():
  12. usage = []
  13. usage.append('Usage: %s <file> <word> [chunk_size] [-q]\n' % sys.argv[0])
  14. usage.append(' Counts occurrences of <word> in (probably large) <file> reading it by chunks.')
  15. usage.append(' If <file> is "-" then stdin will be used as input.\n')
  16. usage.append(' Options:')
  17. usage.append(' -q\t\tto disable progress report (forced if read goes from stdin)')
  18. print('\n'.join(usage))
  19.  
  20.  
  21. def _humanreadable_time(secs):
  22. hours = mins = 0
  23. mins, secs = divmod(secs, 60)
  24. hours, mins = divmod(mins, 60)
  25. return "%02d:%02d:%02d" % (hours, mins, secs)
  26.  
  27.  
  28. def _report(tell, size, start_time, occurrences):
  29. if not size:
  30. return
  31. eta = speed = '???'
  32. seconds = int(time() - start_time)
  33. prcnt = 100. * tell / size
  34. seconds_per_one_pcnt = seconds / prcnt if prcnt else 0
  35.  
  36. if seconds_per_one_pcnt:
  37. eta = _humanreadable_time(int((100 - prcnt) * seconds_per_one_pcnt))
  38. if seconds:
  39. speed = '%0.1f' % (prcnt / seconds)
  40. msg = '\r%0.1f%% in %s ' % (prcnt, _humanreadable_time(seconds))
  41. msg += 'Found: %d ' % occurrences
  42. msg += 'Avg speed: %s %%/s ' % speed
  43. if tell != size:
  44. msg += 'ETA: %s' % eta
  45. msg += ' ' * (80 - len(msg))
  46.  
  47. sys.stdout.write(msg)
  48. sys.stdout.flush()
  49.  
  50.  
  51. def _main(*args):
  52. if len(args) < 2 or set(args) & {'--help', '-h'}:
  53. _print_usage()
  54. exit(not set(args) & {'--help', '-h'})
  55. size = None
  56. stream = args[1]
  57.  
  58. try:
  59. if args[1] == '-':
  60. stream = sys.stdin
  61. else:
  62. stream = open(args[1], 'rb')
  63.  
  64. term, step_back = args[2], len(args[2]) - 1
  65.  
  66. chunk_size = int(args[3]) if len(args) > 3 else DEFAULT_CHUNK_SIZE
  67. if chunk_size <= step_back:
  68. chunk_size = step_back + DEFAULT_CHUNK_SIZE
  69.  
  70. occurrences = 0
  71. last_time = start = time()
  72. size = None
  73. if stream != sys.stdin and '-q' not in args:
  74. size = os.fstat(stream.fileno()).st_size
  75. _report(0, size, start, occurrences)
  76.  
  77. tail, chunk = '', stream.read(chunk_size)
  78. while chunk:
  79. data = tail + chunk
  80. occurrences += data.count(term)
  81. tail = chunk[-step_back:]
  82. chunk = stream.read(chunk_size)
  83.  
  84. if size and time() - last_time > REPORT_EVERY_SECS:
  85. size = os.fstat(stream.fileno()).st_size
  86. _report(stream.tell(), size, start, occurrences)
  87. last_time = time()
  88.  
  89. if size:
  90. size = os.fstat(stream.fileno()).st_size
  91. _report(size, size, start, occurrences)
  92. sys.stdout.write('\n')
  93. sys.stdout.flush()
  94. print("%d" % occurrences)
  95.  
  96. finally:
  97. if stream != sys.stdin:
  98. stream.close()
  99.  
  100.  
# Run only when executed as a script; the whole argument vector, including
# the program name, is passed on as positional arguments.
if __name__ == '__main__':
    _main(*sys.argv)
Add Comment
Please, Sign In to add comment