Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
def get_row_iterator_jsonl_gz(filename, log_period_s=5):
    """Yield one parsed JSON object per line of a gzip-compressed JSONL file.

    Args:
        filename: path to a ``.jsonl.gz`` file.
        log_period_s: minimum seconds between progress log lines;
            pass ``None`` (or 0) to disable progress logging.

    Yields:
        The object decoded from each JSON line. Lines that are a bare
        ``\\N`` marker (MySQL-dump NULL) are skipped; lines that fail to
        parse are logged as errors and skipped rather than aborting the scan.
    """
    import gzip
    import json
    import logging
    import os.path
    import time

    # Compressed size on disk. The gzip format only stores the
    # uncompressed size modulo 2^32, so progress is reported against
    # compressed bytes instead.
    file_size = os.path.getsize(filename)
    t0 = t_log = time.time()
    # 'rt' text mode so lines are str (json.loads and str.replace both work).
    with gzip.open(filename, 'rt') as f:
        for line_number, line_content in enumerate(f):
            # Undo doubled backslashes produced by the export.
            line_content = line_content.replace('\\\\', '\\')
            # Skip MySQL-style NULL marker lines; strip the trailing
            # newline so the comparison can actually match.
            if line_content.strip() == '\\N':
                continue
            if log_period_s and time.time() > t_log + log_period_s:
                # Re-anchor to now so a long stall doesn't trigger a
                # burst of catch-up log lines.
                t_log = time.time()
                # NOTE: f.fileobj.tell() is the number of COMPRESSED bytes
                # read so far — a little hacky, not public API of gzip.
                logging.info('{:,} / {:,} ({:0.1f}% in {:0.1f}s)'.format(
                    f.fileobj.tell(), file_size,
                    100.0 * float(f.fileobj.tell()) / file_size,
                    time.time() - t0))
            try:
                yield json.loads(line_content)
            except ValueError:
                # Bad line: report it and keep scanning.
                logging.error('line_number={}, line_content={}'.format(
                    line_number, line_content))
Usage:
    for item in get_row_iterator_jsonl_gz(filename):
        ...  # process the parsed row
Add Comment
Please, Sign In to add comment