Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Location of the gzipped Wikidata entity dump that is iterated below.
read_dump = '/public/dumps/public/wikidatawiki/entities/latest-all.json.gz'

# Print the time elapsed since `functionstartTime` (set outside this fragment).
elapsed_so_far = datetime.timedelta(seconds=time.time() - functionstartTime)
print(str(elapsed_so_far))
# return

# gzip.open() treats mode 'r' as binary, so readline() returns bytes.
dump_in = gzip.open(read_dump, 'r')
# Read the first line of the dump before the main loop starts.
line = dump_in.readline()

# Running count of lines read by the main loop.
# NOTE: the name shadows the builtin `iter`; it is kept because later code
# reads it under this name.
iter = 0
# Hard-coded total used as the denominator of the progress percentage
# (presumably the expected number of Q-items -- confirm against the dump).
n_qitems = 85696352

# Row buffers, one list per destination table.
sitelinks_values = []
labels_values = []
metadata_list = []
geolocated_property_list = []
time_properties_list = []
language_strong_properties_list = []
language_weak_properties_list = []
country_properties_list = []
location_properties_list = []
has_part_properties_list = []
affiliation_properties_list = []
created_by_properties_list = []
part_of_properties_list = []
industry_properties_list = []
sexual_orientation_properties_list = []
religious_group_properties_list = []
ethnic_group_properties_list = []
people_properties_list = []
instance_of_subclasses_of_properties_list = []

print('Iterating the dump.')
# Stream the rest of the dump, one entity per line.
#
# The Wikidata JSON dump is a single JSON array written with one entity per
# line: the first line is "[", the last is "]", and every entity line except
# the final one ends with a comma.  The line read before this loop is
# therefore not an entity and is intentionally never parsed.
#
# Fixes over the previous version of this loop:
#   * a line that fails to parse is skipped instead of falling through and
#     re-processing the previous (or an undefined) `entity`;
#   * only a real trailing comma is stripped, so the last entity (which has
#     none) is no longer truncated and lost;
#   * iteration stops at end of file instead of when a line happens to
#     collapse to an empty string;
#   * the periodic checkpoint runs before any `continue`, so filtered lines
#     can no longer suppress it;
#   * the dump file is closed even if processing raises.
#
# NOTE: `iter` shadows the builtin; the name is kept because the surrounding
# code initialises it and reports it after the loop.
try:
    for raw_line in dump_in:
        iter += 1

        # Periodic checkpoint (batch flush + progress report).
        if iter % 900000 == 0:
            # insert
            # cursor.executemany("INSERT INTO sitelinks (qitem, langcode, page_title) VALUES (?,?,?)",sitelinks_values)
            # cursor.executemany("INSERT INTO labels (qitem, langcode, label) VALUES (?,?,?)",labels_values)
            # cursor.executemany("INSERT OR IGNORE INTO metadata (qitem, properties, sitelinks) VALUES (?,?,?)", metadata_list)
            # cursor.executemany("INSERT OR IGNORE INTO geolocated_property (qitem, property, coordinates) VALUES (?,?,?)",geolocated_property_list)
            # cursor.executemany("INSERT OR IGNORE INTO time_properties (qitem, property, value) VALUES (?,?,?)",time_properties_list)
            # cursor.executemany("INSERT OR IGNORE INTO language_strong_properties (qitem, property, qitem2) VALUES (?,?,?)",language_strong_properties_list)
            # cursor.executemany("INSERT OR IGNORE INTO language_weak_properties (qitem, property, qitem2) VALUES (?,?,?)",language_weak_properties_list)
            # cursor.executemany("INSERT OR IGNORE INTO country_properties (qitem, property, qitem2) VALUES (?,?,?)",country_properties_list)
            # cursor.executemany("INSERT OR IGNORE INTO location_properties (qitem, property, qitem2) VALUES (?,?,?)",location_properties_list)
            # cursor.executemany("INSERT OR IGNORE INTO has_part_properties (qitem, property, qitem2) VALUES (?,?,?)",has_part_properties_list)
            # cursor.executemany("INSERT OR IGNORE INTO affiliation_properties (qitem, property, qitem2) VALUES (?,?,?)",affiliation_properties_list)
            # cursor.executemany("INSERT OR IGNORE INTO created_by_properties (qitem, property, qitem2) VALUES (?,?,?)",created_by_properties_list)
            # cursor.executemany("INSERT OR IGNORE INTO part_of_properties (qitem, property, qitem2) VALUES (?,?,?)",part_of_properties_list)
            # cursor.executemany("INSERT OR IGNORE INTO industry_properties (qitem, property, qitem2) VALUES (?,?,?)",industry_properties_list)
            # cursor.executemany("INSERT OR IGNORE INTO sexual_orientation_properties (qitem, property, qitem2) VALUES (?,?,?)",sexual_orientation_properties_list)
            # cursor.executemany("INSERT OR IGNORE INTO religious_group_properties (qitem, property, qitem2) VALUES (?,?,?)",religious_group_properties_list)
            # cursor.executemany("INSERT OR IGNORE INTO ethnic_group_properties (qitem, property, qitem2) VALUES (?,?,?)",ethnic_group_properties_list)
            # cursor.executemany("INSERT OR IGNORE INTO people_properties (qitem, property, qitem2) VALUES (?,?,?)",people_properties_list)
            # cursor.executemany("INSERT OR IGNORE INTO instance_of_subclasses_of_properties (qitem, property, qitem2) VALUES (?,?,?)",instance_of_subclasses_of_properties_list)
            # conn.commit()
            # sitelinks_values = []
            # labels_values = []
            # metadata_list = []; geolocated_property_list = []; time_properties_list = []; language_strong_properties_list = []; language_weak_properties_list = []; country_properties_list = []; location_properties_list = []; has_part_properties_list = []; affiliation_properties_list = []; created_by_properties_list = []; part_of_properties_list = []; industry_properties_list = []; sexual_orientation_properties_list = []; religious_group_properties_list = []; ethnic_group_properties_list = []; people_properties_list = []; instance_of_subclasses_of_properties_list = []
            print(iter)
            print(100*iter/n_qitems)
            print('current time: ' + str(time.time() - functionstartTime))
            print('number of line per second: '+str(iter/(time.time() - functionstartTime)))
            # break

        # Decode once, then drop the array separator only if it is present.
        line = raw_line.decode('utf-8').strip()
        if line.endswith(','):
            line = line[:-1]
        # Array brackets and blank lines carry no entity.
        if line in ('', '[', ']'):
            continue

        try:
            entity = json.loads(line)
            qitem = entity['id']
        except (ValueError, KeyError, TypeError):
            # Malformed JSON, or a value without an 'id': report and skip.
            print('JSON error.')
            continue
        # Only Q-items are of interest.
        if not qitem.startswith('Q'):
            continue

        sitelinks = []
        # A missing or empty 'sitelinks' entry means there is nothing to store.
        wd_sitelinks = entity.get('sitelinks')
        if not wd_sitelinks:
            continue

        # ------------------------------------------------------------------
        # Disabled processing pipeline, kept for reference.
        # NOTE(review): the nesting below was reconstructed from a paste that
        # had lost its indentation -- confirm it before re-enabling.
        # ------------------------------------------------------------------
        # # SITELINKS
        # for code, title in wd_sitelinks.items():
        #     if code in wikilanguagecodeswiki:
        #         sitelinks_values.append((qitem,code,title['title']))
        #         sitelinks.append(code)
        # # LABELS
        # if len(sitelinks) != 0:
        #     for code, title in entity['labels'].items(): # loop over languages
        #         code = code + 'wiki'
        #         if code not in wd_sitelinks and code in wikilanguagecodeswiki:
        #             labels_values.append((qitem,code,title['value']))
        # # PROPERTIES
        # # print ([qitem,len(claims),len(entity['sitelinks'])])
        # claims = entity['claims']
        # # meta info
        # metadata_list.append((qitem,len(claims),len(sitelinks)-1))
        # # properties
        # for claim in claims:
        #     wdproperty = claim
        #     if wdproperty not in allproperties: continue
        #     claimlist = claims[claim]
        #     for snak in claimlist:
        #         mainsnak = snak['mainsnak']
        #         if wdproperty in geolocated_property:
        #             try:
        #                 coordinates = str(mainsnak['datavalue']['value']['latitude'])+','+str(mainsnak['datavalue']['value']['longitude'])
        #             except:
        #                 continue
        #             geolocated_property_list.append((qitem,wdproperty,coordinates))
        #             continue
        #         if wdproperty in time_properties:
        #             try:
        #                 value = str(mainsnak['datavalue']['value']['time'])
        #             except:
        #                 continue
        #             value = int(value[0]+value[1:].split('-')[0])
        #             time_properties_list.append((qitem,wdproperty,value))
        #             continue
        #         # the rest of properties
        #         try:
        #             qitem2 = 'Q{}'.format(mainsnak['datavalue']['value']['numeric-id'])
        #         except:
        #             continue
        #         if wdproperty in language_strong_properties:
        #             # print ('language properties')
        #             # print (qitem,wdproperty,qitem2)
        #             language_strong_properties_list.append((qitem,wdproperty,qitem2))
        #             continue
        #         if wdproperty in language_weak_properties:
        #             # print ('language properties')
        #             # print (qitem,wdproperty,qitem2)
        #             language_weak_properties_list.append((qitem,wdproperty,qitem2))
        #             continue
        #         if wdproperty in country_properties:
        #             # print ('country properties')
        #             # print (qitem,wdproperty,qitem2)
        #             country_properties_list.append((qitem,wdproperty,qitem2))
        #             continue
        #         if wdproperty in location_properties:
        #             # print ('location properties')
        #             # print (qitem,wdproperty,qitem2)
        #             location_properties_list.append((qitem,wdproperty,qitem2))
        #             continue
        #         if wdproperty in has_part_properties:
        #             # print ('has part properties')
        #             # print (qitem,wdproperty,qitem2)
        #             has_part_properties_list.append((qitem,wdproperty,qitem2))
        #             continue
        #         if wdproperty in affiliation_properties:
        #             # print ('affiliation_properties')
        #             # print (qitem,wdproperty,qitem2)
        #             affiliation_properties_list.append((qitem,wdproperty,qitem2))
        #             continue
        #         if wdproperty in created_by_properties:
        #             # print ('created by properties')
        #             # print (qitem,wdproperty,qitem2)
        #             created_by_properties_list.append((qitem,wdproperty,qitem2))
        #             continue
        #         if wdproperty in part_of_properties:
        #             # print ('part of properties')
        #             # print (qitem,wdproperty,qitem2)
        #             part_of_properties_list.append((qitem,wdproperty,qitem2))
        #             continue
        #         if wdproperty in industry_properties:
        #             # print ('industry properties')
        #             # print (qitem,wdproperty,qitem2)
        #             industry_properties_list.append((qitem,wdproperty,qitem2))
        #             continue
        #         if wdproperty in sexual_orientation_properties:
        #             # print ('sexual_orientation_properties')
        #             # print (qitem,wdproperty,qitem2)
        #             sexual_orientation_properties_list.append((qitem,wdproperty,qitem2))
        #             continue
        #         if wdproperty in religious_group_properties:
        #             # print ('religious_group_properties')
        #             # print (qitem,wdproperty,qitem2)
        #             religious_group_properties_list.append((qitem,wdproperty,qitem2))
        #             continue
        #         if wdproperty in ethnic_group_properties:
        #             # print ('ethnic_group_properties')
        #             # print (qitem,wdproperty,qitem2)
        #             ethnic_group_properties_list.append((qitem,wdproperty,qitem2))
        #             continue
        #         if wdproperty in people_properties:
        #             if wdproperty == 'P21' or (wdproperty == 'P31' and qitem2 == 'Q5'):
        #                 # print ('people properties')
        #                 # print (qitem,wdproperty,qitem2)
        #                 people_properties_list.append((qitem,wdproperty,qitem2))
        #             continue
        #         if wdproperty in instance_of_subclasses_of_properties:
        #             if wdproperty == 'P31' and qitem2 == 'Q5': continue # if human, continue
        #             values = [qitem,wdproperty,qitem2]
        #             # print ('instance_of_subclasses_of_properties properties')
        #             # print (qitem,wdproperty,qitem2)
        #             instance_of_subclasses_of_properties_list.append((qitem,wdproperty,qitem2))
        #             continue
finally:
    # Release the gzip handle whether or not the loop finished cleanly.
    dump_in.close()
# last round
# Flush whatever is still buffered once the dump has been fully read, then
# commit and close the database connection.
#
# Each entry is (table name, column list, buffered rows).  Driving every
# insert from this one table fixes the earlier copy-paste slip where the
# `labels` insert was given the undefined name `values` instead of
# `labels_values`, which raised NameError at the very end of the run.
final_batches = (
    ('sitelinks', 'qitem, langcode, page_title', sitelinks_values),
    ('labels', 'qitem, langcode, label', labels_values),
    ('metadata', 'qitem, properties, sitelinks', metadata_list),
    ('geolocated_property', 'qitem, property, coordinates', geolocated_property_list),
    ('time_properties', 'qitem, property, value', time_properties_list),
    ('language_strong_properties', 'qitem, property, qitem2', language_strong_properties_list),
    ('language_weak_properties', 'qitem, property, qitem2', language_weak_properties_list),
    ('country_properties', 'qitem, property, qitem2', country_properties_list),
    ('location_properties', 'qitem, property, qitem2', location_properties_list),
    ('has_part_properties', 'qitem, property, qitem2', has_part_properties_list),
    ('affiliation_properties', 'qitem, property, qitem2', affiliation_properties_list),
    ('created_by_properties', 'qitem, property, qitem2', created_by_properties_list),
    ('part_of_properties', 'qitem, property, qitem2', part_of_properties_list),
    ('industry_properties', 'qitem, property, qitem2', industry_properties_list),
    ('sexual_orientation_properties', 'qitem, property, qitem2', sexual_orientation_properties_list),
    ('religious_group_properties', 'qitem, property, qitem2', religious_group_properties_list),
    ('ethnic_group_properties', 'qitem, property, qitem2', ethnic_group_properties_list),
    ('people_properties', 'qitem, property, qitem2', people_properties_list),
    ('instance_of_subclasses_of_properties', 'qitem, property, qitem2', instance_of_subclasses_of_properties_list),
)
for table, columns, rows in final_batches:
    # Table and column names come only from the fixed literals above (never
    # from external input), so formatting them into the statement is safe;
    # the row values themselves are still bound through ? placeholders.
    cursor.executemany(
        "INSERT OR IGNORE INTO {} ({}) VALUES (?,?,?)".format(table, columns),
        rows,
    )
conn.commit()
conn.close()
print('DONE with the JSON.')
print('It has this number of lines: ' + str(iter))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement