Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import argparse
- import requests
- from bs4 import BeautifulSoup
def get_ent_names(url, limit=None, *, timeout=30):
    """Fetch *url* and return the unique texts of its ``ent-name`` elements.

    The page is downloaded with ``requests`` and parsed with BeautifulSoup's
    ``html.parser``. Every element whose CSS class is ``ent-name`` contributes
    its whitespace-stripped text; empty texts and repeats are skipped, and the
    first-seen order on the page is preserved.

    Args:
        url: Address of the page to download.
        limit: Maximum number of names to return. ``None`` (the default)
            means no cap; zero or a negative value yields an empty list.
        timeout: Seconds passed to ``requests.get`` so an unresponsive
            server cannot block the call indefinitely.

    Returns:
        A list of unique, non-empty name strings in document order.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (raised by ``response.raise_for_status()``).
        requests.RequestException: For connection problems or timeouts.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    names = []
    # Mirror of ``names`` used only for O(1) duplicate checks; the list
    # itself keeps the page order for the caller.
    seen = set()
    for el in soup.find_all(class_='ent-name'):
        # Check the cap before appending so ``limit=0`` returns nothing.
        if limit is not None and len(names) >= limit:
            break
        text = el.get_text(strip=True)
        if text and text not in seen:
            seen.add(text)
            names.append(text)
    return names
def compare_ent_names(url1, url2):
    """Print how the ``ent-name`` texts of two pages differ.

    Both pages are fetched with ``get_ent_names``. Three sections are then
    written to standard output: names found only on the first page, names
    found only on the second page, and names present on both. Within each
    section the names keep the order in which their source page listed them
    (the shared section follows the first page's order). An empty section is
    printed as ``(none)``.

    Args:
        url1: Address of the first page.
        url2: Address of the second page.

    Returns:
        None. The comparison is printed, not returned.
    """
    names1 = get_ent_names(url1)
    names2 = get_ent_names(url2)

    # Membership is tested against sets so each lookup is O(1); the lists
    # are still what we iterate, so output order is unchanged.
    lookup1 = set(names1)
    lookup2 = set(names2)

    only_in_1 = [n for n in names1 if n not in lookup2]
    only_in_2 = [n for n in names2 if n not in lookup1]
    in_both = [n for n in names1 if n in lookup2]

    def fmt(lst):
        """Join names with commas, or show a placeholder for an empty list."""
        return ", ".join(lst) if lst else "(none)"

    print(f"\n--- Only in URL 1 ({url1}) ---")
    print(fmt(only_in_1))
    print(f"\n--- Only in URL 2 ({url2}) ---")
    print(fmt(only_in_2))
    # Plain literal: there is nothing to interpolate in this header.
    print("\n--- In both ---")
    print(fmt(in_both))
def main():
    """Command-line entry point: read two URLs and print their comparison.

    Two required positional arguments, ``url1`` and ``url2``, are parsed
    from the command line and handed to ``compare_ent_names``.
    """
    cli = argparse.ArgumentParser(
        description="Compare .ent-name elements between two webpages."
    )

    # Positional arguments in the order they must appear on the command line.
    positionals = (
        ("url1", "First URL with `.ent-name`s to compare"),
        ("url2", "Second URL with `.ent-name`s to compare"),
    )
    for dest, help_text in positionals:
        cli.add_argument(dest, help=help_text)

    options = cli.parse_args()
    compare_ent_names(options.url1, options.url2)
# Run the command-line interface only when this file is executed as a
# script, so importing it has no side effects.
if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment