Guest User

Untitled

a guest
Apr 25th, 2018
79
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.33 KB | None | 0 0
  1. def get_row_iterator_jsonl_gz(filename, log_period_s=5): # log_period_s=None for no logging
  2. import gzip
  3. import time
  4. import os.path
  5. import json
  6.  
  7.  
  8. # this is the compressed file size
  9. # looks like the gzip format only defines the uncompressed file size modulo 2^32
  10. file_size = os.path.getsize(filename)
  11.  
  12.  
  13. # NOTE: we will use f.fileobj.tell() to get the number of compressed bytes read so far
  14. # this is a little hacky, not public api of the gzip module
  15.  
  16. t0 = t_log = time.time()
  17.  
  18. with gzip.open(filename, 'rb') as f:
  19. for line_number, line_content in enumerate(f): # read line by line
  20.  
  21.  
  22. line_content = line_content.replace('\\\\', '\\')
  23. if line_content=='\N':
  24. continue
  25.  
  26.  
  27. if log_period_s and time.time() > t_log + log_period_s:
  28. t_log = t_log + log_period_s
  29. logging.info('{:,} / {:,} ({:0.1f}% in {:0.1f}s)'.format(f.fileobj.tell(), file_size, 100.0*float(f.fileobj.tell())/file_size, time.time()-t0))
  30.  
  31.  
  32. try:
  33. yield json.loads( line_content )
  34. except ValueError:
  35. logging.error('line_number={}, line_content={}'.format(line_number, line_content))
  36.  
Usage:

    for item in get_row_iterator_jsonl_gz(filename):
        ...  # do something with item
Add Comment
Please, Sign In to add comment