Advertisement
Guest User

Untitled

a guest
May 11th, 2020
105
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. read_dump = '/public/dumps/public/wikidatawiki/entities/latest-all.json.gz'
  2.  
  3. print (str(datetime.timedelta(seconds=time.time() - functionstartTime)))
  4. # return
  5.  
  6. dump_in = gzip.open(read_dump, 'r')
  7. line = dump_in.readline()
  8. iter = 0
  9.  
  10. n_qitems = 85696352
  11.  
  12. sitelinks_values = []
  13. labels_values = []
  14. metadata_list = []; geolocated_property_list = []; time_properties_list = []; language_strong_properties_list = []; language_weak_properties_list = []; country_properties_list = []; location_properties_list = []; has_part_properties_list = []; affiliation_properties_list = []; created_by_properties_list = []; part_of_properties_list = []; industry_properties_list = []; sexual_orientation_properties_list = []; religious_group_properties_list = []; ethnic_group_properties_list = []; people_properties_list = []; instance_of_subclasses_of_properties_list = []
  15.  
  16. print ('Iterating the dump.')
  17. while line != '':
  18. iter += 1
  19. line = dump_in.readline()
  20. line = line.rstrip().decode('utf-8')[:-1]
  21.  
  22. try:
  23. entity = json.loads(line)
  24. qitem = entity['id']
  25. if not qitem.startswith('Q'): continue
  26.  
  27. except:
  28. print ('JSON error.')
  29.  
  30. sitelinks = []
  31. wd_sitelinks = entity['sitelinks']
  32. if len(wd_sitelinks) == 0: continue
  33.  
  34.  
  35.  
  36. # # SITELINKS
  37. # for code, title in wd_sitelinks.items():
  38. # if code in wikilanguagecodeswiki:
  39. # sitelinks_values.append((qitem,code,title['title']))
  40. # sitelinks.append(code)
  41.  
  42. # # LABELS
  43. # if len(sitelinks) != 0:
  44. # for code, title in entity['labels'].items(): # bucle de llengües
  45. # code = code + 'wiki'
  46. # if code not in wd_sitelinks and code in wikilanguagecodeswiki:
  47. # labels_values.append((qitem,code,title['value']))
  48.  
  49. # # PROPERTIES
  50. # # print ([qitem,len(claims),len(entity['sitelinks'])])
  51. # claims = entity['claims']
  52.  
  53. # # meta info
  54. # metadata_list.append((qitem,len(claims),len(sitelinks)-1))
  55.  
  56. # # properties
  57. # for claim in claims:
  58. # wdproperty = claim
  59. # if wdproperty not in allproperties: continue
  60. # claimlist = claims[claim]
  61. # for snak in claimlist:
  62. # mainsnak = snak['mainsnak']
  63.  
  64. # if wdproperty in geolocated_property:
  65. # try:
  66. # coordinates = str(mainsnak['datavalue']['value']['latitude'])+','+str(mainsnak['datavalue']['value']['longitude'])
  67. # except:
  68. # continue
  69.  
  70. # geolocated_property_list.append((qitem,wdproperty,coordinates))
  71. # continue
  72.  
  73. # if wdproperty in time_properties:
  74. # try:
  75. # value = str(mainsnak['datavalue']['value']['time'])
  76. # except:
  77. # continue
  78.  
  79. # value = int(value[0]+value[1:].split('-')[0])
  80. # time_properties_list.append((qitem,wdproperty,value))
  81. # continue
  82.  
  83. # # the rest of properties
  84. # try:
  85. # qitem2 = 'Q{}'.format(mainsnak['datavalue']['value']['numeric-id'])
  86. # except:
  87. # continue
  88.  
  89. # if wdproperty in language_strong_properties:
  90. # # print ('language properties')
  91. # # print (qitem,wdproperty,qitem2)
  92. # language_strong_properties_list.append((qitem,wdproperty,qitem2))
  93. # continue
  94.  
  95. # if wdproperty in language_weak_properties:
  96. # # print ('language properties')
  97. # # print (qitem,wdproperty,qitem2)
  98. # language_weak_properties_list.append((qitem,wdproperty,qitem2))
  99. # continue
  100.  
  101. # if wdproperty in country_properties:
  102. # # print ('country properties')
  103. # # print (qitem,wdproperty,qitem2)
  104. # country_properties_list.append((qitem,wdproperty,qitem2))
  105. # continue
  106.  
  107. # if wdproperty in location_properties:
  108. # # print ('location properties')
  109. # # print (qitem,wdproperty,qitem2)
  110. # location_properties_list.append((qitem,wdproperty,qitem2))
  111. # continue
  112.  
  113. # if wdproperty in has_part_properties:
  114. # # print ('has part properties')
  115. # # print (qitem,wdproperty,qitem2)
  116. # has_part_properties_list.append((qitem,wdproperty,qitem2))
  117. # continue
  118.  
  119. # if wdproperty in affiliation_properties:
  120. # # print ('affiliation_properties')
  121. # # print (qitem,wdproperty,qitem2)
  122. # affiliation_properties_list.append((qitem,wdproperty,qitem2))
  123. # continue
  124.  
  125. # if wdproperty in created_by_properties:
  126. # # print ('created by properties')
  127. # # print (qitem,wdproperty,qitem2)
  128. # created_by_properties_list.append((qitem,wdproperty,qitem2))
  129. # continue
  130.  
  131. # if wdproperty in part_of_properties:
  132. # # print ('part of properties')
  133. # # print (qitem,wdproperty,qitem2)
  134. # part_of_properties_list.append((qitem,wdproperty,qitem2))
  135. # continue
  136.  
  137. # if wdproperty in industry_properties:
  138. # # print ('industry properties')
  139. # # print (qitem,wdproperty,qitem2)
  140. # industry_properties_list.append((qitem,wdproperty,qitem2))
  141. # continue
  142.  
  143.  
  144. # if wdproperty in sexual_orientation_properties:
  145. # # print ('sexual_orientation_properties')
  146. # # print (qitem,wdproperty,qitem2)
  147. # sexual_orientation_properties_list.append((qitem,wdproperty,qitem2))
  148. # continue
  149.  
  150. # if wdproperty in religious_group_properties:
  151. # # print ('religious_group_properties')
  152. # # print (qitem,wdproperty,qitem2)
  153. # religious_group_properties_list.append((qitem,wdproperty,qitem2))
  154. # continue
  155.  
  156. # if wdproperty in ethnic_group_properties:
  157. # # print ('ethnic_group_properties')
  158. # # print (qitem,wdproperty,qitem2)
  159. # ethnic_group_properties_list.append((qitem,wdproperty,qitem2))
  160. # continue
  161.  
  162. # if wdproperty in people_properties:
  163. # if wdproperty == 'P21' or (wdproperty == 'P31' and qitem2 == 'Q5'):
  164. # # print ('people properties')
  165. # # print (qitem,wdproperty,qitem2)
  166. # people_properties_list.append((qitem,wdproperty,qitem2))
  167. # continue
  168.  
  169. # if wdproperty in instance_of_subclasses_of_properties:
  170. # if wdproperty == 'P31' and qitem2 == 'Q5': continue # if human, continue
  171. # values = [qitem,wdproperty,qitem2]
  172. # # print ('instance_of_subclasses_of_properties properties')
  173. # # print (qitem,wdproperty,qitem2)
  174. # instance_of_subclasses_of_properties_list.append((qitem,wdproperty,qitem2))
  175. # continue
  176.  
  177.  
  178. if iter % 900000 == 0:
  179. # insert
  180. # cursor.executemany("INSERT INTO sitelinks (qitem, langcode, page_title) VALUES (?,?,?)",sitelinks_values)
  181. # cursor.executemany("INSERT INTO labels (qitem, langcode, label) VALUES (?,?,?)",labels_values)
  182.  
  183. # cursor.executemany("INSERT OR IGNORE INTO metadata (qitem, properties, sitelinks) VALUES (?,?,?)", metadata_list)
  184. # cursor.executemany("INSERT OR IGNORE INTO geolocated_property (qitem, property, coordinates) VALUES (?,?,?)",geolocated_property_list)
  185. # cursor.executemany("INSERT OR IGNORE INTO time_properties (qitem, property, value) VALUES (?,?,?)",time_properties_list)
  186. # cursor.executemany("INSERT OR IGNORE INTO language_strong_properties (qitem, property, qitem2) VALUES (?,?,?)",language_strong_properties_list)
  187. # cursor.executemany("INSERT OR IGNORE INTO language_weak_properties (qitem, property, qitem2) VALUES (?,?,?)",language_weak_properties_list)
  188. # cursor.executemany("INSERT OR IGNORE INTO country_properties (qitem, property, qitem2) VALUES (?,?,?)",country_properties_list)
  189. # cursor.executemany("INSERT OR IGNORE INTO location_properties (qitem, property, qitem2) VALUES (?,?,?)",location_properties_list)
  190. # cursor.executemany("INSERT OR IGNORE INTO has_part_properties (qitem, property, qitem2) VALUES (?,?,?)",has_part_properties_list)
  191. # cursor.executemany("INSERT OR IGNORE INTO affiliation_properties (qitem, property, qitem2) VALUES (?,?,?)",affiliation_properties_list)
  192. # cursor.executemany("INSERT OR IGNORE INTO created_by_properties (qitem, property, qitem2) VALUES (?,?,?)",created_by_properties_list)
  193. # cursor.executemany("INSERT OR IGNORE INTO part_of_properties (qitem, property, qitem2) VALUES (?,?,?)",part_of_properties_list)
  194. # cursor.executemany("INSERT OR IGNORE INTO industry_properties (qitem, property, qitem2) VALUES (?,?,?)",industry_properties_list)
  195. # cursor.executemany("INSERT OR IGNORE INTO sexual_orientation_properties (qitem, property, qitem2) VALUES (?,?,?)",sexual_orientation_properties_list)
  196. # cursor.executemany("INSERT OR IGNORE INTO religious_group_properties (qitem, property, qitem2) VALUES (?,?,?)",religious_group_properties_list)
  197. # cursor.executemany("INSERT OR IGNORE INTO ethnic_group_properties (qitem, property, qitem2) VALUES (?,?,?)",ethnic_group_properties_list)
  198. # cursor.executemany("INSERT OR IGNORE INTO people_properties (qitem, property, qitem2) VALUES (?,?,?)",people_properties_list)
  199. # cursor.executemany("INSERT OR IGNORE INTO instance_of_subclasses_of_properties (qitem, property, qitem2) VALUES (?,?,?)",instance_of_subclasses_of_properties_list)
  200. # conn.commit()
  201.  
  202. # sitelinks_values = []
  203. # labels_values = []
  204. # metadata_list = []; geolocated_property_list = []; time_properties_list = []; language_strong_properties_list = []; language_weak_properties_list = []; country_properties_list = []; location_properties_list = []; has_part_properties_list = []; affiliation_properties_list = []; created_by_properties_list = []; part_of_properties_list = []; industry_properties_list = []; sexual_orientation_properties_list = []; religious_group_properties_list = []; ethnic_group_properties_list = []; people_properties_list = []; instance_of_subclasses_of_properties_list = []
  205.  
  206. print (iter)
  207. print (100*iter/n_qitems)
  208. print ('current time: ' + str(time.time() - functionstartTime))
  209. print ('number of line per second: '+str(iter/(time.time() - functionstartTime)))
  210. # break
  211.  
  212.  
  213. # last round
  214. # insert
  215. cursor.executemany("INSERT OR IGNORE INTO sitelinks (qitem, langcode, page_title) VALUES (?,?,?)",sitelinks_values)
  216. cursor.executemany("INSERT OR IGNORE INTO labels (qitem, langcode, label) VALUES (?,?,?)",values)
  217.  
  218. cursor.executemany("INSERT OR IGNORE INTO metadata (qitem, properties, sitelinks) VALUES (?,?,?)", metadata_list)
  219. cursor.executemany("INSERT OR IGNORE INTO geolocated_property (qitem, property, coordinates) VALUES (?,?,?)",geolocated_property_list)
  220. cursor.executemany("INSERT OR IGNORE INTO time_properties (qitem, property, value) VALUES (?,?,?)",time_properties_list)
  221. cursor.executemany("INSERT OR IGNORE INTO language_strong_properties (qitem, property, qitem2) VALUES (?,?,?)",language_strong_properties_list)
  222. cursor.executemany("INSERT OR IGNORE INTO language_weak_properties (qitem, property, qitem2) VALUES (?,?,?)",language_weak_properties_list)
  223. cursor.executemany("INSERT OR IGNORE INTO country_properties (qitem, property, qitem2) VALUES (?,?,?)",country_properties_list)
  224. cursor.executemany("INSERT OR IGNORE INTO location_properties (qitem, property, qitem2) VALUES (?,?,?)",location_properties_list)
  225. cursor.executemany("INSERT OR IGNORE INTO has_part_properties (qitem, property, qitem2) VALUES (?,?,?)",has_part_properties_list)
  226. cursor.executemany("INSERT OR IGNORE INTO affiliation_properties (qitem, property, qitem2) VALUES (?,?,?)",affiliation_properties_list)
  227. cursor.executemany("INSERT OR IGNORE INTO created_by_properties (qitem, property, qitem2) VALUES (?,?,?)",created_by_properties_list)
  228. cursor.executemany("INSERT OR IGNORE INTO part_of_properties (qitem, property, qitem2) VALUES (?,?,?)",part_of_properties_list)
  229. cursor.executemany("INSERT OR IGNORE INTO industry_properties (qitem, property, qitem2) VALUES (?,?,?)",industry_properties_list)
  230. cursor.executemany("INSERT OR IGNORE INTO sexual_orientation_properties (qitem, property, qitem2) VALUES (?,?,?)",sexual_orientation_properties_list)
  231. cursor.executemany("INSERT OR IGNORE INTO religious_group_properties (qitem, property, qitem2) VALUES (?,?,?)",religious_group_properties_list)
  232. cursor.executemany("INSERT OR IGNORE INTO ethnic_group_properties (qitem, property, qitem2) VALUES (?,?,?)",ethnic_group_properties_list)
  233. cursor.executemany("INSERT OR IGNORE INTO people_properties (qitem, property, qitem2) VALUES (?,?,?)",people_properties_list)
  234. cursor.executemany("INSERT OR IGNORE INTO instance_of_subclasses_of_properties (qitem, property, qitem2) VALUES (?,?,?)",instance_of_subclasses_of_properties_list)
  235. conn.commit()
  236. conn.close()
  237.  
  238. print ('DONE with the JSON.')
  239. print ('It has this number of lines: '+str(iter))
Advertisement
RAW Paste Data Copied
Advertisement