import gzip
import json
import shutil

import requests

option = 'read'

# dumps_path is assumed to be defined elsewhere: the local directory
# (ending in '/') where dump copies are stored.
dumps_path = './'

if option == 'download':
    print('* Downloading the latest Wikidata dump.')
    # Dated dumps are listed at e.g. https://dumps.wikimedia.org/wikidatawiki/entities/20180212/
    url = 'https://dumps.wikimedia.org/wikidatawiki/entities/latest-all.json.gz'
    local_filename = url.split('/')[-1]
    # NOTE the stream=True parameter: the dump is tens of GB,
    # so write it to disk in chunks instead of holding it in memory.
    r = requests.get(url, stream=True)
    with open(dumps_path + local_filename, 'wb') as f:
        for chunk in r.iter_content(chunk_size=10240):
            if chunk:  # filter out keep-alive chunks
                f.write(chunk)
                f.flush()
    read_dump = dumps_path + local_filename

if option == 'read':
    # About 8 hours to process 2% of the dump when read from the other
    # server; it seems there is a problem, namely that reading the dump
    # is slower than downloading it.
    read_dump = '/public/dumps/public/wikidatawiki/entities/latest-all.json.gz'
    local_filename = read_dump.split('/')[-1]
    try:
        shutil.copyfile(read_dump, dumps_path + local_filename)
        print('Wikidata dump copied.')
    except OSError:
        print('Not possible to copy the Wikidata dump.')

dump_in = gzip.open(read_dump, 'rb')
line = dump_in.readline()  # skip the opening '[' of the JSON array
n_lines = 0

n_qitems = 85696352  # approximate number of entities in the dump

print('Iterating the dump.')
while line != '':
    n_lines += 1
    line = dump_in.readline()
    # Each entity sits on its own line of the JSON array: strip the
    # newline, decode the bytes, and drop the trailing comma.
    line = line.rstrip().decode('utf-8')[:-1]
    if line == '':  # closing ']' of the array, or end of file
        continue

    try:
        entity = json.loads(line)
        qitem = entity['id']
        if not qitem.startswith('Q'):
            continue

        # OPERATIONS

    except (json.JSONDecodeError, KeyError):
        print('JSON error.')