Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Obtain the latest Wikidata JSON dump — either by downloading it from
# dumps.wikimedia.org ('download') or by copying the dump published on the
# public NFS share ('read') — then iterate its entities one per line.
#
# NOTE(review): relies on names defined elsewhere in this file:
# `requests`, `shutil`, `gzip`, `json`, `dumps_path`, `databases_path`.
option = 'read'

if option == 'download':
    print('* Downloading the latest Wikidata dump.')
    # Dated dumps also available, e.g.
    # https://dumps.wikimedia.org/wikidatawiki/entities/20180212/
    url = "https://dumps.wikimedia.org/wikidatawiki/entities/latest-all.json.gz"
    local_filename = url.split('/')[-1]
    # stream=True: the dump is tens of GB, so write it out in chunks
    # instead of loading the whole response body into memory.
    r = requests.get(url, stream=True)
    with open(dumps_path + local_filename, 'wb') as f:
        for chunk in r.iter_content(chunk_size=10240):
            if chunk:  # filter out keep-alive new chunks
                f.write(chunk)
    read_dump = databases_path + local_filename

if option == 'read':
    # Read the dump straight from the public share.  8 hours to process
    # the 2% when read from the other server; reading the dump appears to
    # be slower than downloading it.
    read_dump = '/public/dumps/public/wikidatawiki/entities/latest-all.json.gz'
    # BUGFIX: the original derived local_filename from `url` and copied
    # `filename` — both undefined in this branch. Use `read_dump` for both.
    local_filename = read_dump.split('/')[-1]
    try:
        shutil.copyfile(read_dump, dumps_path + local_filename)
        print('Wikidata Dump copied.')
    except OSError:  # was a bare except; copy failure is best-effort
        print('Not possible to copy the wikidata dump.')

n_qitems = 85696352  # approximate number of entities in the dump

print('Iterating the dump.')
entity_count = 0  # renamed from `iter`, which shadowed the builtin
# Text mode ('rt') yields str lines directly, avoiding the original's
# bytes-vs-str loop guard bug; `with` guarantees the gzip handle closes.
with gzip.open(read_dump, 'rt', encoding='utf-8') as dump_in:
    for raw_line in dump_in:
        entity_count += 1
        # Each entity line ends with ','; strip it (and trailing newline).
        line = raw_line.rstrip()[:-1]
        if not line:
            # The '[' and ']' array-delimiter lines reduce to '' here;
            # skip them instead of logging a spurious JSON error.
            continue
        try:
            entity = json.loads(line)
            qitem = entity['id']
            if not qitem.startswith('Q'):
                continue
            # OPERATIONS
        except (json.JSONDecodeError, KeyError):
            print('JSON error.')
Advertisement
Add Comment
Please, Sign In to add comment