Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- First, I load every category page (namespace 14) from the page dump:
# Pass 1: scan the `page` SQL dump line by line and register every row whose
# namespace is 14 (the rows this script treats as categories).
#
# Reads  : languagecode, script_name, functionstartTime (defined outside this excerpt)
# Writes : category_page_ids_page_titles[page_id] -> title
#          category_links_cat_cat[title]          -> empty set
#          category_links_cat_art[title]          -> empty set
dumps_path = '/public/dumps/public/'+languagecode+'wiki/latest/'+languagecode+'wiki-latest-page.sql.gz'
wikilanguages_utils.check_dump(dumps_path, script_name)

# Number of lines read so far. The name shadows the builtin `iter`; it is kept
# because code outside this excerpt may still read it.
iter = 0
# Count of consecutive empty reads. The original never initialised it before
# `i+=1`, which raises NameError if the very first read is empty.
i = 0

# `with` guarantees the gzip handle is closed even if parsing raises.
with gzip.open(dumps_path, 'r') as dump_in:
    while True:
        iter += 1
        line = dump_in.readline()
        try:
            line = line.decode("utf-8")
        except UnicodeDecodeError:
            print ('error.')
            # str(bytes) would produce the "b'...'" repr instead of the text;
            # decode leniently so the remaining rows on the line stay usable.
            line = line.decode("utf-8", errors="replace")

        # An empty string (not even a newline) means nothing more was read;
        # stop after three such reads in a row, as the original did.
        if line == '':
            i += 1
            if i == 3:
                break
        else:
            i = 0

        if wikilanguages_utils.is_insert(line):
            values = wikilanguages_utils.get_values(line)
            # Only iterate when this line's values passed the sanity check;
            # the original fell through and re-used `rows` from an earlier line
            # (or hit an unbound name on the first failure).
            if wikilanguages_utils.values_sanity_check(values):
                rows = wikilanguages_utils.parse_values(values)
                for row in rows:
                    page_id = int(row[0])
                    page_namespace = int(row[1])
                    cat_title = str(row[2])
                    if page_namespace != 14:
                        continue
                    category_page_ids_page_titles[page_id] = cat_title
                    category_links_cat_cat[cat_title] = set()
                    category_links_cat_art[cat_title] = set()

        # Progress report every 10000 lines read. Note that `iter` counts dump
        # lines, not categories, despite the wording of the message.
        if iter % 10000 == 0:
            print (str(iter)+' categories loaded.')
            print (str(datetime.timedelta(seconds=time.time() - functionstartTime)))
            print (len(category_links_cat_cat))

print ('all categories loaded')
- Then, I read the categorylinks dump and attach each link to its category:
# Pass 2: scan the `categorylinks` SQL dump and, for every link whose target
# title is a known category, record the linked page either as a sub-category
# (its page id is in category_page_ids_page_titles) or as an article.
#
# Reads  : languagecode, script_name, category_page_ids_page_titles
# Writes : category_links_cat_cat[title] -> set of member category titles
#          category_links_cat_art[title] -> set of member article page ids
#          c / a                         -> number of category / article links seen
dumps_path = '/public/dumps/public/'+languagecode+'wiki/latest/'+languagecode+'wiki-latest-categorylinks.sql.gz'
wikilanguages_utils.check_dump(dumps_path, script_name)

a = 0
c = 0
# Number of lines read so far. The name shadows the builtin `iter`; it is kept
# because code outside this excerpt may still read it.
iter = 0
# Count of consecutive empty reads. The original relied on whatever value `i`
# held from earlier code (NameError if none); start from a known state.
i = 0

# `with` guarantees the gzip handle is closed even if parsing raises.
with gzip.open(dumps_path, 'r') as dump_in:
    while True:
        iter += 1
        line = dump_in.readline()
        try:
            line = line.decode("utf-8")
        except UnicodeDecodeError:
            print ('error.')
            # str(bytes) would produce the "b'...'" repr instead of the text;
            # decode leniently so the remaining rows on the line stay usable.
            line = line.decode("utf-8", errors="replace")

        # An empty string (not even a newline) means nothing more was read;
        # stop after three such reads in a row, as the original did.
        if line == '':
            i += 1
            if i == 3:
                break
        else:
            i = 0

        if not wikilanguages_utils.is_insert(line):
            continue

        values = wikilanguages_utils.get_values(line)
        # Skip the line when its values fail the sanity check; the original
        # fell through and re-processed `rows` left over from an earlier line.
        if not wikilanguages_utils.values_sanity_check(values):
            continue

        rows = wikilanguages_utils.parse_values(values)
        for row in rows:
            try:
                page_id = int(row[0])
                # NOTE(review): quotes are stripped here, but the page-dump
                # pass stores str(row[2]) unstripped - confirm that
                # parse_values yields titles in the same form for both dumps,
                # otherwise the membership test below never matches.
                cat_title = str(row[1].strip("'"))
            except (ValueError, TypeError, IndexError, AttributeError):
                # Malformed row: skip it, but do not swallow KeyboardInterrupt
                # or SystemExit the way the original bare `except:` did.
                continue

            if cat_title not in category_links_cat_cat:
                continue

            if page_id in category_page_ids_page_titles:  # is this a category
                c += 1
                category_links_cat_cat[cat_title].add(category_page_ids_page_titles[page_id])
            else:  # this is an article
                a += 1
                category_links_cat_art[cat_title].add(page_id)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement