Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- from bs4 import BeautifulSoup
- import pandas as pd
# Canonical, upper-case names of the participating nations.  Every lookup
# table below maps some alternative spelling onto one of these values.
NATION_NAMES = [
    'ALBANIA', 'ARMENIA', 'AUSTRALIA', 'AUSTRIA', 'AZERBAIJAN', 'BELGIUM',
    'CROATIA', 'CYPRUS', 'CZECHIA', 'DENMARK', 'ESTONIA', 'FINLAND',
    'FRANCE', 'GEORGIA', 'GERMANY', 'GREECE', 'ICELAND', 'IRELAND',
    'ISRAEL', 'ITALY', 'LATVIA', 'LITHUANIA', 'LUXEMBOURG', 'MALTA',
    'MONTENEGRO', 'NETHERLANDS', 'NORWAY', 'POLAND', 'PORTUGAL',
    'SAN MARINO', 'SERBIA', 'SLOVENIA', 'SPAIN', 'SWEDEN', 'SWITZERLAND',
    'UKRAINE', 'UNITED KINGDOM',
]


def _flag_emoji(country_code):
    """Return the flag emoji for a two-letter ISO 3166-1 alpha-2 code.

    A flag emoji is the pair of Unicode regional-indicator symbols
    (U+1F1E6 for "A" through U+1F1FF for "Z") spelling the country code.
    Deriving the keys from ASCII codes keeps literal emoji out of the file,
    so they cannot be corrupted by a wrong-encoding round trip.
    """
    return ''.join(
        chr(0x1F1E6 + ord(letter) - ord('A')) for letter in country_code
    )


# Flag emoji -> nation.  Insertion order is kept as in the original table
# because lookups scan this dict in order and return the first flag found.
FLAGS = {
    _flag_emoji(code): nation
    for code, nation in (
        ('LU', 'LUXEMBOURG'),
        ('CH', 'SWITZERLAND'),
        ('SI', 'SLOVENIA'),
        ('AL', 'ALBANIA'),
        ('GE', 'GEORGIA'),
        ('LT', 'LITHUANIA'),
        ('SE', 'SWEDEN'),
        ('CZ', 'CZECHIA'),
        ('IT', 'ITALY'),
        ('GR', 'GREECE'),
        ('PT', 'PORTUGAL'),
        ('LV', 'LATVIA'),
        ('FI', 'FINLAND'),
        ('BE', 'BELGIUM'),
        ('RS', 'SERBIA'),
        ('AU', 'AUSTRALIA'),
        ('UA', 'UKRAINE'),
        ('IL', 'ISRAEL'),
        ('AT', 'AUSTRIA'),
        ('ES', 'SPAIN'),
        ('NO', 'NORWAY'),
        ('HR', 'CROATIA'),
        ('ME', 'MONTENEGRO'),
        ('AZ', 'AZERBAIJAN'),
        ('NL', 'NETHERLANDS'),
        ('MT', 'MALTA'),
        ('AM', 'ARMENIA'),
        ('GB', 'UNITED KINGDOM'),
        ('DK', 'DENMARK'),
        ('EE', 'ESTONIA'),
        ('CY', 'CYPRUS'),
        ('IS', 'ICELAND'),
        ('IE', 'IRELAND'),
        ('DE', 'GERMANY'),
        ('PL', 'POLAND'),
        ('FR', 'FRANCE'),
        ('SM', 'SAN MARINO'),
    )
}

# Upper-case song title -> nation.
SONGS = {
    'ZJERM': 'ALBANIA',
    'SURVIVOR': 'ARMENIA',
    'MILKSHAKE MAN': 'AUSTRALIA',
    'WASTED LOVE': 'AUSTRIA',
    'RUN WITH U': 'AZERBAIJAN',
    'STROBE LIGHTS': 'BELGIUM',
    'POISON CAKE': 'CROATIA',
    'SHH': 'CYPRUS',
    'KISS KISS GOODBYE': 'CZECHIA',
    'HALLUCINATION': 'DENMARK',
    'ESPRESSO MACCHIATO': 'ESTONIA',
    'ICH KOMME': 'FINLAND',
    'MAMAN': 'FRANCE',
    'FREEDOM': 'GEORGIA',
    'BALLER': 'GERMANY',
    'ASTEROMATA': 'GREECE',
    'RÓA': 'ICELAND',
    'LAIKA PARTY': 'IRELAND',
    'NEW DAY WILL RISE': 'ISRAEL',
    'VOLEVO ESSERE UN DURO': 'ITALY',
    'BUR MAN LAIMI': 'LATVIA',
    'TAVO AKYS': 'LITHUANIA',
    'LA POUPÉE MONTE LE SON': 'LUXEMBOURG',
    'SERVING': 'MALTA',
    'DOBRODOŠLI': 'MONTENEGRO',
    "C'EST LA VIE": 'NETHERLANDS',
    'LIGHTER': 'NORWAY',
    'GAJA': 'POLAND',
    'DESLOCADO': 'PORTUGAL',
    "TUTTA L'ITALIA": 'SAN MARINO',
    'MILA': 'SERBIA',
    'HOW MUCH TIME DO WE HAVE LEFT': 'SLOVENIA',
    'ESA DIVA': 'SPAIN',
    'BARA BADA BASTU': 'SWEDEN',
    'VOYAGE': 'SWITZERLAND',
    'BIRD OF PRAY': 'UKRAINE',
    'WHAT THE HELL JUST HAPPENED?': 'UNITED KINGDOM',
}

# Misspellings, abbreviations and translations seen in the comments.
ALTERNATE_NAMES = {
    # Alternative Nation Names
    'UK': 'UNITED KINGDOM',
    'GREAT BRITAIN': 'UNITED KINGDOM',
    'UNITED KINDDOM': 'UNITED KINGDOM',
    'DANEMARK': 'DENMARK',
    'NETHELANDS': 'NETHERLANDS',
    'THE NETHERLANDS': 'NETHERLANDS',
    'LUXEMBOUR': 'LUXEMBOURG',
    'LUCEMBOURG': 'LUXEMBOURG',
    'LUXEMBURG': 'LUXEMBOURG',
    'BELGIEN': 'BELGIUM',
    'CZECH REPUBLIC': 'CZECHIA',
    'LRELAND': 'IRELAND',
    # Alternative Song Names
    'LA POUPPÉ MONTE LE SON': 'LUXEMBOURG',
    'WTHJH?': 'UNITED KINGDOM',
    'NDWR': 'ISRAEL',
    'DOBRODOSLI': 'MONTENEGRO',
    'VEUD': 'ITALY',
    'ESPRESSO MACHIATO': 'ESTONIA',
    'ESPRESSO MACCHIATTO': 'ESTONIA',
    'ROA': 'ICELAND',
    'HMTDWHL?': 'SLOVENIA',
}
# --- Grabbing Post Data ---
def load_data_from_html():
    """Load the saved comment page and return the body of each comment.

    Reads ``rankings.html`` (UTF-8) from the current working directory,
    finds the element whose class is ``sitetable nestedlisting`` and, for
    each of its direct children carrying ``data-type="comment"``, looks up
    the first descendant with class ``md``.

    Returns:
        A list with one entry per matching comment: the ``md`` element, or
        ``None`` when a comment contains no such element.

    Raises:
        ValueError: If the page contains no comment listing.
    """
    with open('rankings.html', encoding='utf8') as f:
        soup = BeautifulSoup(f.read(), 'html.parser')

    comments_section = soup.find(class_='sitetable nestedlisting')
    if comments_section is None:
        raise ValueError(
            "rankings.html has no element with class 'sitetable nestedlisting'"
        )

    # find_all() only yields tags, so whitespace/text nodes between comments
    # are skipped instead of raising AttributeError on ``.get``.
    # recursive=False keeps the search on direct children only, so nested
    # replies are still ignored.
    comments = comments_section.find_all(
        attrs={'data-type': 'comment'}, recursive=False
    )
    return [comment.find(class_='md') for comment in comments]
# --- Parsing Data ---

# Lines that parse_entry could not resolve; reported at the end of the run.
failed_to_parse = []


def _distinct_nations(texts):
    """Parse each text in order and return the recognised nations, no repeats."""
    nations = []
    for text in texts:
        nation = parse_entry(text)
        if not nation or nation in nations:
            continue
        nations.append(nation)
    return nations


def parse_comment(comment_md):
    """Extract an ordered ranking of nations from one comment body.

    Three layouts are tried in turn:

    1. The first ``<ol>``/``<ul>`` in the comment: every ``<li>`` must
       parse, otherwise the failure is printed and ``None`` is returned
       without trying the other layouts.
    2. At least ten ``<p>`` elements, one entry per paragraph.
    3. At least ten newline-separated lines of the comment's text.

    For layouts 2 and 3, unparseable rows and repeated nations are skipped
    and the result is accepted only if it names at least ten nations.

    Args:
        comment_md: The parsed comment body element.

    Returns:
        A list of nation names in ranked order, or ``None`` if no layout
        produced a usable ranking.
    """
    # Entered as a list
    list_element = comment_md.find(['ol', 'ul'])
    if list_element:
        ranking = []
        for item in list_element.find_all('li'):
            val = item.text
            nation = parse_entry(val)
            if not nation:
                print(f'Failed on {val}')
                return None
            ranking.append(nation)
        return ranking

    # Entered as paragraphs
    paragraphs = comment_md.find_all('p')
    if len(paragraphs) >= 10:
        ranking = _distinct_nations(p.text for p in paragraphs)
        if len(ranking) >= 10:
            return ranking

    # Entered on lines
    lines = comment_md.text.split('\n')
    if len(lines) >= 10:
        ranking = _distinct_nations(lines)
        if len(ranking) >= 10:
            return ranking

    # Failed
    return None
def parse_entry(orig_entry):
    """Resolve one line of a ranking to a canonical nation name.

    The text is normalised (curly apostrophes folded to ``'``; everything
    except letters, whitespace, ``'`` and ``?`` removed; stripped and
    upper-cased) and then matched, in this order:

    1. exact nation name, alternate name, or song title;
    2. a flag emoji anywhere in the *original* text;
    3. a nation name, song title, or alternate name appearing as a
       substring of the normalised text.

    Args:
        orig_entry: Raw text of a single list item, paragraph or line.

    Returns:
        The matching entry of ``NATION_NAMES``, or ``None`` if nothing
        matched. On failure ``"<original> --> <normalised>"`` is appended to
        the module-level ``failed_to_parse`` list.
    """
    # Fold typographic apostrophes (U+2018 / U+2019) to ASCII so titles such
    # as "C'EST LA VIE" still match. They are written as escapes so the
    # literals cannot be corrupted by a wrong-encoding round trip.
    entry = orig_entry.replace('\u2018', "'").replace('\u2019', "'")
    entry = ''.join(c for c in entry if c.isalpha() or c.isspace() or c in "'?")
    entry = entry.strip().upper()

    # Exact matches first: they are unambiguous.
    if entry in NATION_NAMES:
        return entry
    if entry in ALTERNATE_NAMES:
        return ALTERNATE_NAMES[entry]
    if entry in SONGS:
        return SONGS[entry]

    # Flags are looked up in the raw text because the normalisation above
    # strips emoji.
    for flag, nation in FLAGS.items():
        if flag in orig_entry:
            return nation

    # Substring fallbacks. Nation names are checked before the short
    # alternate keys so that, for example, 'UKRAINE' is not claimed by 'UK'.
    for nation in NATION_NAMES:
        if nation in entry:
            return nation
    for song, nation in SONGS.items():
        if song in entry:
            return nation
    for alt_name, nation in ALTERNATE_NAMES.items():
        if alt_name in entry:
            return nation

    failed_to_parse.append(orig_entry + ' --> ' + entry)
    return None
def main():
    """Parse every comment in the saved page and pickle the rankings.

    Loads the comment bodies, parses each into a ranking, prints the
    comments and individual lines that could not be parsed, prints a
    summary, and writes the rankings as a DataFrame (one row per parsed
    comment) to ``rankings.pkl``.
    """
    separator = '-------------------------'

    comments = load_data_from_html()
    parsed = []
    for comment in comments:
        ranking = parse_comment(comment)
        if ranking:
            parsed.append(ranking)
            continue
        print(separator)
        print('Failed to parse comment:')
        print(comment)

    table = pd.DataFrame(parsed)

    if failed_to_parse:
        print(separator)
        print('Failed to parse the following lines:')
        for line in failed_to_parse:
            print('\t' + line)

    print(separator)
    print(f'Successfully parsed {len(table)}/{len(comments)}')
    print(separator)
    print(table)
    table.to_pickle('rankings.pkl')


if __name__ == '__main__':
    main()
Advertisement
Add Comment
Please, Sign In to add comment