Guest User

ESC Reddit Thread Parsing

a guest
Apr 22nd, 2025
50
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 6.33 KB | None | 0 0
  1. from bs4 import BeautifulSoup
  2. import pandas as pd
  3.  
# Canonical nation names, upper-case. parse_entry() compares normalized entry
# text against these, and every value in FLAGS, SONGS and ALTERNATE_NAMES below
# is one of these names. The list order is also the iteration order used by
# parse_entry()'s substring fallback.
NATION_NAMES = ['ALBANIA', 'ARMENIA', 'AUSTRALIA', 'AUSTRIA', 'AZERBAIJAN', 'BELGIUM', 'CROATIA', 'CYPRUS', 'CZECHIA', 'DENMARK', 'ESTONIA', 'FINLAND', 'FRANCE', 'GEORGIA', 'GERMANY', 'GREECE', 'ICELAND', 'IRELAND', 'ISRAEL', 'ITALY', 'LATVIA', 'LITHUANIA', 'LUXEMBOURG', 'MALTA', 'MONTENEGRO', 'NETHERLANDS', 'NORWAY', 'POLAND', 'PORTUGAL', 'SAN MARINO', 'SERBIA', 'SLOVENIA', 'SPAIN', 'SWEDEN', 'SWITZERLAND', 'UKRAINE', 'UNITED KINGDOM']
  5.  
# Flag emoji -> canonical nation name. parse_entry() searches the raw
# (un-normalized) entry text for these keys, because its normalization step
# keeps only letters, whitespace, apostrophes and question marks.
FLAGS = {'🇱🇺': 'LUXEMBOURG', '🇨🇭': 'SWITZERLAND', '🇸🇮': 'SLOVENIA', '🇦🇱': 'ALBANIA', '🇬🇪': 'GEORGIA', '🇱🇹': 'LITHUANIA', '🇸🇪': 'SWEDEN', '🇨🇿': 'CZECHIA', '🇮🇹': 'ITALY', '🇬🇷': 'GREECE', '🇵🇹': 'PORTUGAL', '🇱🇻': 'LATVIA', '🇫🇮': 'FINLAND', '🇧🇪': 'BELGIUM', '🇷🇸': 'SERBIA', '🇦🇺': 'AUSTRALIA', '🇺🇦': 'UKRAINE', '🇮🇱': 'ISRAEL', '🇦🇹': 'AUSTRIA', '🇪🇸': 'SPAIN', '🇳🇴': 'NORWAY', '🇭🇷': 'CROATIA', '🇲🇪': 'MONTENEGRO', '🇦🇿': 'AZERBAIJAN', '🇳🇱': 'NETHERLANDS', '🇲🇹': 'MALTA', '🇦🇲': 'ARMENIA', '🇬🇧': 'UNITED KINGDOM', '🇩🇰': 'DENMARK', '🇪🇪': 'ESTONIA', '🇨🇾': 'CYPRUS', '🇮🇸': 'ICELAND', '🇮🇪': 'IRELAND', '🇩🇪': 'GERMANY', '🇵🇱': 'POLAND', '🇫🇷': 'FRANCE', '🇸🇲': 'SAN MARINO'}
  7.  
# Upper-case song title -> canonical nation name, so an entry can name the
# song instead of the country (presumably each nation's entry title for the
# contest being ranked -- verify when reusing for another year). Keys must
# survive parse_entry()'s normalization, which is why "'" and "?" are kept
# there.
SONGS = {'ZJERM': 'ALBANIA', 'SURVIVOR': 'ARMENIA', 'MILKSHAKE MAN': 'AUSTRALIA', 'WASTED LOVE': 'AUSTRIA', 'RUN WITH U': 'AZERBAIJAN', 'STROBE LIGHTS': 'BELGIUM', 'POISON CAKE': 'CROATIA', 'SHH': 'CYPRUS', 'KISS KISS GOODBYE': 'CZECHIA', 'HALLUCINATION': 'DENMARK', 'ESPRESSO MACCHIATO': 'ESTONIA', 'ICH KOMME': 'FINLAND', 'MAMAN': 'FRANCE', 'FREEDOM': 'GEORGIA', 'BALLER': 'GERMANY', 'ASTEROMATA': 'GREECE', 'RÓA': 'ICELAND', 'LAIKA PARTY': 'IRELAND', 'NEW DAY WILL RISE': 'ISRAEL', 'VOLEVO ESSERE UN DURO': 'ITALY', 'BUR MAN LAIMI': 'LATVIA', 'TAVO AKYS': 'LITHUANIA', 'LA POUPÉE MONTE LE SON': 'LUXEMBOURG', 'SERVING': 'MALTA', 'DOBRODOŠLI': 'MONTENEGRO', "C'EST LA VIE": 'NETHERLANDS', 'LIGHTER': 'NORWAY', 'GAJA': 'POLAND', 'DESLOCADO': 'PORTUGAL', "TUTTA L'ITALIA": 'SAN MARINO', 'MILA': 'SERBIA', 'HOW MUCH TIME DO WE HAVE LEFT': 'SLOVENIA', 'ESA DIVA': 'SPAIN', 'BARA BADA BASTU': 'SWEDEN', 'VOYAGE': 'SWITZERLAND', 'BIRD OF PRAY': 'UKRAINE', 'WHAT THE HELL JUST HAPPENED?': 'UNITED KINGDOM'}
  9.  
# Extra spellings -> canonical nation name. parse_entry() consults this table
# both for exact matches and, as its last fallback, for substring matches, so
# short keys such as 'UK' or 'ROA' can match inside longer words.
ALTERNATE_NAMES = {
    # Alternative nation names: abbreviations, other-language spellings and
    # misspellings of the names in NATION_NAMES.
    'UK': 'UNITED KINGDOM',
    'GREAT BRITAIN': 'UNITED KINGDOM',
    'UNITED KINDDOM': 'UNITED KINGDOM',
    'DANEMARK': 'DENMARK',
    'NETHELANDS': 'NETHERLANDS',
    'THE NETHERLANDS': 'NETHERLANDS',
    'LUXEMBOUR': 'LUXEMBOURG',
    'LUCEMBOURG': 'LUXEMBOURG',
    'LUXEMBURG': 'LUXEMBOURG',
    'BELGIEN': 'BELGIUM',
    'CZECH REPUBLIC': 'CZECHIA',
    'LRELAND': 'IRELAND',

    # Alternative song names: initialisms, misspellings and accent-free
    # variants of the titles in SONGS.
    'LA POUPPÉ MONTE LE SON': 'LUXEMBOURG',
    'WTHJH?': 'UNITED KINGDOM',
    'NDWR': 'ISRAEL',
    'DOBRODOSLI': 'MONTENEGRO',
    'VEUD': 'ITALY',
    'ESPRESSO MACHIATO': 'ESTONIA',
    'ESPRESSO MACCHIATTO': 'ESTONIA',
    'ROA': 'ICELAND',
    'HMTDWHL?': 'SLOVENIA'
}
  36.  
  37. # --- Grabbing Post Data ---
  38. def load_data_from_html():
  39.     with open('rankings.html', encoding='utf8') as f:
  40.         soup = BeautifulSoup(f.read(), 'html.parser')
  41.     comments_section = soup.find(class_='sitetable nestedlisting')
  42.     comments = comments_section.children
  43.     comments = [comment.find(class_='md') for comment in comments if comment.get('data-type', []) == 'comment']
  44.     return comments
  45.  
  46. # --- Parsing Data ---
# Accumulates one "<original text> --> <normalized text>" string for each
# entry parse_entry() could not map to a nation; main() prints these at the
# end of the run.
failed_to_parse = []
  48.  
  49. def parse_comment(comment_md):
  50.     # Entered as a list
  51.     if list_element := comment_md.find(['ol', 'ul']):
  52.         vals = [x.text for x in list_element.find_all('li')]
  53.         filtered = []
  54.         for val in vals:
  55.             parsed = parse_entry(val)
  56.             if parsed:
  57.                 filtered.append(parsed)
  58.             else:
  59.                 print(f'Failed on {val}')
  60.                 return None
  61.         return filtered
  62.     # Entered as paragraphs
  63.     if len(rows := comment_md.find_all('p')) >= 10:
  64.         filtered = []
  65.         for row in rows:
  66.             parsed = parse_entry(row.text)
  67.             if parsed and parsed not in filtered:
  68.                 filtered.append(parsed)
  69.         if len(filtered) >= 10:
  70.             return filtered
  71.     # Entered on lines
  72.     if len(rows := comment_md.text.split('\n')) >= 10:
  73.         filtered = []
  74.         for row in rows:
  75.             parsed = parse_entry(row)
  76.             if parsed and parsed not in filtered:
  77.                 filtered.append(parsed)
  78.         if len(filtered) >= 10:
  79.             return filtered
  80.     # Failed
  81.     return None
  82.  
  83. def parse_entry(orig_entry):
  84.     global failed_to_parse
  85.     entry = orig_entry.replace('‘', "'").replace('’', "'")
  86.     entry = ''.join(c for c in entry if c.isalpha() or c.isspace() or c in "'?")
  87.     entry = entry.strip().upper()
  88.     if entry in NATION_NAMES:
  89.         return entry
  90.     if entry in ALTERNATE_NAMES:
  91.         return ALTERNATE_NAMES[entry]
  92.     if entry in SONGS:
  93.         return SONGS[entry]
  94.     for flag, nation in FLAGS.items():
  95.         if flag in orig_entry:
  96.             return nation
  97.     for nation in NATION_NAMES:
  98.         if nation in entry:
  99.             return nation
  100.     for song, nation in SONGS.items():
  101.         if song in entry:
  102.             return nation
  103.     for alt_name, nation in ALTERNATE_NAMES.items():
  104.         if alt_name in entry:
  105.             return nation
  106.     failed_to_parse.append(orig_entry + ' --> ' + entry)
  107.     return None
  108.  
  109. def main():
  110.     global failed_to_parse
  111.     comments = load_data_from_html()
  112.     rankings = []
  113.     for comment in comments:
  114.         ranking = parse_comment(comment)
  115.         if not ranking:
  116.             print('-------------------------')
  117.             print('Failed to parse comment:')
  118.             print(comment)
  119.         else:
  120.             rankings.append(ranking)
  121.     rankings = pd.DataFrame(rankings)
  122.     if failed_to_parse:
  123.         print('-------------------------')
  124.         print('Failed to parse the following lines:')
  125.         for x in failed_to_parse:
  126.             print('\t' + x)
  127.     print('-------------------------')
  128.     print(f'Successfully parsed {len(rankings)}/{len(comments)}')
  129.     print('-------------------------')
  130.     print(rankings)
  131.     rankings.to_pickle('rankings.pkl')
  132.  
# Run the parser only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
  135.  
Advertisement
Add Comment
Please, Sign In to add comment