Advertisement
Guest User

Untitled

a guest
Apr 25th, 2022
23
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.89 KB | None | 0 0
  1. import requests
  2. import re
  3. from time import sleep
  4.  
  5. def get_cache():
  6.     text = requests.get("https://en.wiktionary.org/?curid=8756089&action=raw").text
  7.     lines = text.split("\n")
  8.  
  9.     a = lines.index("==Valid==")
  10.     b = lines.index("==Broken==")
  11.     c = lines.index("==Special cases==")
  12.  
  13.     link = r"\[\[(.*?)\]\]"
  14.     kbd = r"<kbd>(.*?)</kbd> *\|\| *"
  15.  
  16.     valid = re.findall(link, "".join(lines[a:b]))
  17.     broken = re.findall(link, "".join(lines[b:c]))
  18.     valid += re.findall(kbd + "valid", "".join(lines[c:]))
  19.     broken += re.findall(kbd + "broken", "".join(lines[c:]))
  20.  
  21.     return set(valid), set(broken)
  22.  
  23. def get_links(cont=None):
  24.     url = "https://en.wiktionary.org/w/api.php"
  25.     params = {
  26.         "action": "query",
  27.         "format": "json",
  28.         "list": "exturlusage",
  29.         "euquery": "www.ancestry.com",
  30.         "eulimit": 500,
  31.         "eunamespace": 0,
  32.         "eucontinue": cont,
  33.     }
  34.  
  35.     data = requests.get(url, params).json()
  36.     results = data["query"]["exturlusage"]
  37.  
  38.     if not results:
  39.         return []
  40.     if "continue" not in data:
  41.         return results
  42.     return results + get_links(data["continue"]["eucontinue"])
  43.  
  44. def check_link(url, query, cache=None, retries=5):
  45.     if cache:
  46.         valid, broken = cache
  47.         if query in valid:
  48.             return "valid"
  49.         if query in broken:
  50.             return "broken"
  51.  
  52.     try:
  53.         sleep(1)
  54.         r = requests.get(url).text
  55.     except:
  56.         if retries <= 0:
  57.             return None
  58.  
  59.         sleep(60)
  60.         return check_link(query, cache, retries - 1)
  61.  
  62.     return "valid" if "Dictionary of American Family Names" in r else "broken"
  63.  
  64. def check_all_links():
  65.     cache = get_cache()
  66.  
  67.     simple_valid = []
  68.     simple_broken = []
  69.     special = []
  70.  
  71.     links = get_links()
  72.     for link in links:
  73.         if not "www.ancestry.com/name-origin?surname=" in link["url"]:
  74.             continue
  75.  
  76.         surname = link["url"].split("?surname=")[1]
  77.         status = check_link(link["url"], surname, cache)
  78.  
  79.         if surname == link["title"] and status == "valid":
  80.             simple_valid.append(f"[[{surname}]]")
  81.         elif surname == link["title"] and status == "broken":
  82.             simple_broken.append(f"[[{surname}]]")
  83.         else:
  84.             special.append(
  85.                 f"|-\n| [[{link['title']}]] || <kbd>{surname}</kbd> || {status}"
  86.             )
  87.  
  88.     simple_valid = ", ".join(simple_valid)
  89.     simple_broken = ", ".join(simple_broken)
  90.     special = "\n".join(special)
  91.     collapse_top = "{{collapse-top|class=custom|title=Pages}}"
  92.     collapse_bottom = "{{collapse-bottom}}"
  93.     print(
  94.         f"""
  95. ==Valid==
  96. {collapse_top}
  97. {simple_valid}
  98. {collapse_bottom}
  99. ==Broken==
  100. {collapse_top}
  101. {simple_broken}
  102. {collapse_bottom}
  103.  
  104. ==Special cases==
  105. {{| class="wikitable sortable"
  106. ! Page !! Query !! Status
  107. {special}
  108. |}}"""
  109.     )
  110.  
  111. check_all_links()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement