Advertisement
Guest User

scraper help

a guest
Jul 25th, 2024
55
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 7.49 KB | Source Code | 0 0
  1. import subprocess
  2. import sys
  3. import traceback
  4. import requests
  5. from bs4 import BeautifulSoup
  6. import spacy
  7. import csv
  8.  
  9. # Function to check that required libraries are importable (it does not install them)
  10. def check_and_install_libraries():
  11.     required_libraries = {
  12.         'requests': 'https://pypi.org/project/requests/',
  13.         'beautifulsoup4': 'https://pypi.org/project/beautifulsoup4/',
  14.         'spacy': 'https://pypi.org/project/spacy/',
  15.         'csv': 'csv is included in the Python Standard Library'
  16.     }
  17.  
  18.     missing_libraries = []
  19.  
  20.     for library in required_libraries:
  21.         try:
  22.             __import__(library)
  23.         except ImportError:
  24.             missing_libraries.append(library)
  25.  
  26.     if missing_libraries:
  27.         print("The following libraries are missing:")
  28.         for lib in missing_libraries:
  29.             print(f"- {lib}")
  30.         print("\nTo install missing libraries, use the following commands:")
  31.         for lib in missing_libraries:
  32.             print(f"- pip install {lib}")
  33.             print(f"  (Documentation: {required_libraries[lib]})")
  34.         sys.exit(1)
  35.  
  36. # Function to handle errors and print traceback
  37. def debug_error(message):
  38.     exc_type, exc_value, exc_traceback = sys.exc_info()
  39.     print(f"Error: {message}")
  40.     traceback.print_exception(exc_type, exc_value, exc_traceback, file=sys.stdout)
  41.     sys.exit(1)
  42.  
  43. # Check and install required libraries
  44. check_and_install_libraries()
  45.  
  46. # Initialize SpaCy NLP model
  47. nlp = spacy.load("en_core_web_sm")
  48.  
  49. # Known keywords for Games Workshop products
  50. known_keywords = ['warhammer', 'citadel', 'aos', 'age of sigmar', '40k']
  51.  
  52. # Create a requests session to reuse connections
  53. session = requests.Session()
  54.  
  55. # Function to scrape Canadian Games Workshop site for products and prices
  56. def scrape_games_workshop_ca():
  57.     url = 'https://www.games-workshop.com/en-CA'
  58.     products = {}
  59.  
  60.     try:
  61.         response = session.get(url)
  62.         response.raise_for_status()
  63.     except requests.RequestException as e:
  64.         debug_error(f"Error fetching Games Workshop site: {e}")
  65.  
  66.     soup = BeautifulSoup(response.content, 'html.parser')
  67.     product_elements = soup.find_all('div', {'class': 'product-item'})
  68.  
  69.     for product in product_elements:
  70.         title_elem = product.find('h4', {'class': 'product-title'})
  71.         price_elem = product.find('span', {'class': 'price'})
  72.         if title_elem and price_elem:
  73.             title = title_elem.text.strip()
  74.             price = parse_price(price_elem.text.strip())
  75.             if price:
  76.                 products[title.lower()] = price
  77.  
  78.     return products
  79.  
  80. # Function to parse price string and convert to float
  81. def parse_price(price_str):
  82.     price_str = price_str.replace('$', '').replace(',', '').strip()
  83.     try:
  84.         return float(price_str)
  85.     except ValueError:
  86.         debug_error(f"Error parsing price: {price_str}")
  87.         return None
  88.  
  89. # Function to check if an item might be mislisted
  90. def is_mislisted(item_title):
  91.     doc = nlp(item_title.lower())
  92.     return not any(token.text in known_keywords for token in doc)
  93.  
  94. # Function to search using a generic scraping function
  95. def search_site(url, title_selector, price_selector, title_key, price_key):
  96.     items = []
  97.     try:
  98.         response = session.get(url)
  99.         response.raise_for_status()
  100.     except requests.RequestException as e:
  101.         debug_error(f"Error fetching results from {url}: {e}")
  102.  
  103.     soup = BeautifulSoup(response.content, 'html.parser')
  104.     for listing in soup.find_all('div', {'class': 'search-item'}):
  105.         title_elem = listing.select_one(title_selector)
  106.         price_elem = listing.select_one(price_selector)
  107.         if title_elem and price_elem:
  108.             title = title_elem.get_text(strip=True)
  109.             price = parse_price(price_elem.get_text(strip=True))
  110.             if price is not None:
  111.                 items.append({title_key: title, price_key: price})
  112.  
  113.     return items
  114.  
  115. # Functions for specific sites
  116. def search_ebay(keyword):
  117.     return search_site(
  118.         f"https://www.ebay.ca/sch/i.html?_nkw={keyword.replace(' ', '+')}",
  119.         'h3.s-item__title',
  120.         'span.s-item__price',
  121.         'title',
  122.         'price'
  123.     )
  124.  
  125. def search_facebook_marketplace(query):
  126.     return search_site(
  127.         f"https://www.facebook.com/marketplace/search/?query={query}&location=Canada",
  128.         'span.title',
  129.         'span.price',
  130.         'title',
  131.         'price'
  132.     )
  133.  
  134. def search_kijiji(query):
  135.     return search_site(
  136.         f"https://www.kijiji.ca/b-buy-sell/canada/{query}/k0c10l0",
  137.         'a.title',
  138.         'div.price',
  139.         'title',
  140.         'price'
  141.     )
  142.  
  143. def search_google_shopping(query):
  144.     return search_site(
  145.         f"https://www.google.ca/search?tbm=shop&q={query.replace(' ', '+')}",
  146.         'h4.A2sOrd',
  147.         'span.a8Pemb',
  148.         'title',
  149.         'price'
  150.     )
  151.  
  152. # Collect mislisted items from all platforms
  153. def collect_mislisted_items(msrp_data):
  154.     mislisted_items = []
  155.  
  156.     for product_title in msrp_data:
  157.         for search_func, platform in [
  158.             (search_ebay, 'eBay'),
  159.             (search_facebook_marketplace, 'Facebook Marketplace'),
  160.             (search_kijiji, 'Kijiji'),
  161.             (search_google_shopping, 'Google Shopping')
  162.         ]:
  163.             try:
  164.                 results = search_func(product_title)
  165.                 for item in results:
  166.                     title, price = item['title'], item['price']
  167.                     if is_mislisted(title):
  168.                         percentage = calculate_percentage_of_msdp(price, msrp_data[product_title])
  169.                         if percentage is not None:
  170.                             mislisted_items.append({
  171.                                 'platform': platform,
  172.                                 'title': title,
  173.                                 'price': price,
  174.                                 'percentage_of_msrp': percentage
  175.                             })
  176.             except Exception as e:
  177.                 debug_error(f"Error searching on {platform}: {e}")
  178.  
  179.     return mislisted_items
  180.  
  181. # Function to calculate the percentage of MSRP
  182. def calculate_percentage_of_msdp(price, msrp):
  183.     return (price / msrp) * 100 if msrp > 0 else None
  184.  
  185. # Function to write results to a CSV file
  186. def write_results_to_csv(mislisted_items):
  187.     filename = 'mislisted_items.csv'
  188.     sorted_items = sorted(mislisted_items, key=lambda x: (x['platform'], x['percentage_of_msrp']))
  189.  
  190.     with open(filename, 'w', newline='', encoding='utf-8') as file:
  191.         writer = csv.writer(file)
  192.         writer.writerow(['Platform', 'Title', 'Price', 'Percentage of MSRP'])
  193.  
  194.         current_platform = None
  195.         for item in sorted_items:
  196.             if item['platform'] != current_platform:
  197.                 current_platform = item['platform']
  198.                 writer.writerow([])  # Add an empty row to separate platforms
  199.             writer.writerow([item['platform'], item['title'], item['price'], f"{item['percentage_of_msrp']:.2f}%"])
  200.  
  201.     print(f"Results have been written to {filename}")
  202.  
  203. # Main workflow
  204. def main():
  205.     try:
  206.         msrp_data = scrape_games_workshop_ca()
  207.         if not msrp_data:
  208.             print("No MSRP data found.")
  209.             return
  210.  
  211.         mislisted_items = collect_mislisted_items(msrp_data)
  212.         if not mislisted_items:
  213.             print("No mislisted items found.")
  214.             return
  215.  
  216.         write_results_to_csv(mislisted_items)
  217.     except Exception:
  218.         debug_error("An unexpected error occurred in the main workflow.")
  219.  
  220. if __name__ == "__main__":
  221.     main()
  222.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement