Advertisement
Guest User

Untitled

a guest
Oct 9th, 2020
70
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.06 KB | None | 0 0
  1. FIRST, I DO THIS:
  # Pass 1: read the `page` table dump and register every page whose namespace
  # is 14, recording page_id -> title and creating an empty link set per title.
  #
  # Names used but not defined in this snippet (expected from the surrounding
  # script): languagecode, script_name, functionstartTime, wikilanguages_utils,
  # category_page_ids_page_titles, category_links_cat_cat,
  # category_links_cat_art.
  dumps_path = '/public/dumps/public/'+languagecode+'wiki/latest/'+languagecode+'wiki-latest-page.sql.gz'
  wikilanguages_utils.check_dump(dumps_path, script_name)

  lines_read = 0

  # The context manager closes the dump even if parsing raises. Iterating the
  # handle stops at end of file, so no manual empty-read counter is needed
  # (the previous counter was incremented before ever being initialised).
  with gzip.open(dumps_path, 'r') as dump_in:
      for raw_line in dump_in:
          lines_read += 1

          try:
              line = raw_line.decode("utf-8")
          except UnicodeDecodeError:
              print('error.')
              # NOTE(review): str() on bytes yields the "b'...'" repr, so this
              # line will most likely not be recognised as an INSERT below and
              # its rows are dropped. Confirm whether
              # decode("utf-8", errors="replace") is the intended behaviour.
              line = str(raw_line)

          if wikilanguages_utils.is_insert(line):
              values = wikilanguages_utils.get_values(line)

              # Rows are parsed and consumed only when this line's values pass
              # the sanity check; otherwise the rows of a previous INSERT
              # would be processed a second time.
              if wikilanguages_utils.values_sanity_check(values):
                  for row in wikilanguages_utils.parse_values(values):
                      # presumably columns are (page_id, page_namespace,
                      # page_title, ...) -- verify against the dump schema
                      page_id = int(row[0])
                      page_namespace = int(row[1])
                      cat_title = str(row[2])

                      # Namespace 14 is the only one kept (category pages).
                      if page_namespace != 14:
                          continue

                      category_page_ids_page_titles[page_id] = cat_title
                      category_links_cat_cat[cat_title] = set()
                      category_links_cat_art[cat_title] = set()

          # NOTE(review): the number printed is dump lines read, not
          # categories; message text kept unchanged.
          if lines_read % 10000 == 0:
              print(str(lines_read)+' categories loaded.')

  print(str(datetime.timedelta(seconds=time.time() - functionstartTime)))
  print(len(category_links_cat_cat))
  print('all categories loaded')
  42.  
  43.  
  44. THEN, I DO THIS:
  # Pass 2: read the `categorylinks` table dump and, for every link whose
  # target title is already a key of category_links_cat_cat, record the member
  # page either as a category (its id is in category_page_ids_page_titles) or
  # as an article.
  #
  # Names used but not defined in this snippet (expected from the surrounding
  # script): languagecode, script_name, wikilanguages_utils,
  # category_page_ids_page_titles, category_links_cat_cat,
  # category_links_cat_art.
  dumps_path = '/public/dumps/public/'+languagecode+'wiki/latest/'+languagecode+'wiki-latest-categorylinks.sql.gz'
  wikilanguages_utils.check_dump(dumps_path, script_name)

  a = 0  # links whose member page is not a known category (stored by page id)
  c = 0  # links whose member page is a known category (stored by title)
  lines_read = 0

  # The context manager closes the dump even if parsing raises. Iterating the
  # handle stops at end of file, so no manual empty-read counter is needed
  # (the previous counter was incremented before ever being initialised).
  with gzip.open(dumps_path, 'r') as dump_in:
      for raw_line in dump_in:
          lines_read += 1

          try:
              line = raw_line.decode("utf-8")
          except UnicodeDecodeError:
              print('error.')
              # NOTE(review): str() on bytes yields the "b'...'" repr, so this
              # line will most likely not be recognised as an INSERT below and
              # its rows are dropped. Confirm whether
              # decode("utf-8", errors="replace") is the intended behaviour.
              line = str(raw_line)

          if not wikilanguages_utils.is_insert(line):
              continue

          values = wikilanguages_utils.get_values(line)

          # Skip the line entirely when its values fail the sanity check;
          # reusing the rows of a previous INSERT would inflate `a` and `c`.
          if not wikilanguages_utils.values_sanity_check(values):
              continue

          for row in wikilanguages_utils.parse_values(values):
              # presumably columns are (cl_from, cl_to, ...) -- verify against
              # the dump schema
              try:
                  page_id = int(row[0])
                  cat_title = str(row[1].strip("'"))
              except (ValueError, TypeError, IndexError, AttributeError):
                  # Malformed row: skip it, but let KeyboardInterrupt and
                  # other unrelated errors propagate.
                  continue

              # Ignore links that point at a title not loaded as a category.
              if cat_title not in category_links_cat_cat:
                  continue

              if page_id in category_page_ids_page_titles:  # member is a category
                  c += 1
                  category_links_cat_cat[cat_title].add(category_page_ids_page_titles[page_id])
              else:  # member is an article
                  a += 1
                  category_links_cat_art[cat_title].add(page_id)
  87.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement