Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
def get_row_iterator_jsonl_gz(filename, log_period_s=5):
    """Yield one parsed JSON object per line of a gzip-compressed JSONL file.

    Args:
        filename: path to a ``.jsonl.gz`` file.
        log_period_s: minimum seconds between progress log lines;
            pass ``None`` (or 0) to disable progress logging.

    Yields:
        The object decoded from each JSON line. Lines that are a bare
        ``\\N`` marker (MySQL-dump NULL) are skipped; lines that fail to
        parse are logged as errors and skipped rather than aborting the scan.
    """
    import gzip
    import json
    import logging
    import os.path
    import time

    # Compressed size on disk. The gzip format only stores the
    # uncompressed size modulo 2^32, so progress is reported against
    # compressed bytes instead.
    file_size = os.path.getsize(filename)
    t0 = t_log = time.time()
    # 'rt' text mode so lines are str (json.loads and str.replace both work).
    with gzip.open(filename, 'rt') as f:
        for line_number, line_content in enumerate(f):
            # Undo doubled backslashes produced by the export.
            line_content = line_content.replace('\\\\', '\\')
            # Skip MySQL-style NULL marker lines; strip the trailing
            # newline so the comparison can actually match.
            if line_content.strip() == '\\N':
                continue
            if log_period_s and time.time() > t_log + log_period_s:
                # Re-anchor to now so a long stall doesn't trigger a
                # burst of catch-up log lines.
                t_log = time.time()
                # NOTE: f.fileobj.tell() is the number of COMPRESSED bytes
                # read so far — a little hacky, not public API of gzip.
                logging.info('{:,} / {:,} ({:0.1f}% in {:0.1f}s)'.format(
                    f.fileobj.tell(), file_size,
                    100.0 * float(f.fileobj.tell()) / file_size,
                    time.time() - t0))
            try:
                yield json.loads(line_content)
            except ValueError:
                # Bad line: report it and keep scanning.
                logging.error('line_number={}, line_content={}'.format(
                    line_number, line_content))
Usage:
    for item in get_row_iterator_jsonl_gz(filename):
        ...  # process the parsed row
Add Comment
Please, Sign In to add comment