Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import subprocess
- import sys
- import traceback
- import requests
- from bs4 import BeautifulSoup
- import spacy
- import csv
# Function to check and install required libraries
def check_and_install_libraries(required_libraries=None):
    """Verify that the script's dependencies can be imported; exit if not.

    Despite the name, nothing is installed. For every library that fails to
    import, the matching ``pip install`` command and a documentation pointer
    are printed, and the process then exits with status 1.

    Args:
        required_libraries: Optional mapping of pip distribution name to an
            ``(import_name, documentation)`` pair. When omitted, the
            libraries used by this script are checked.

    Raises:
        SystemExit: With code 1 when at least one library cannot be imported.
    """
    import importlib

    if required_libraries is None:
        # The pip name and the importable module name are not always the
        # same: the ``beautifulsoup4`` distribution is imported as ``bs4``.
        # Importing it under its pip name would always fail.
        required_libraries = {
            'requests': ('requests', 'https://pypi.org/project/requests/'),
            'beautifulsoup4': ('bs4', 'https://pypi.org/project/beautifulsoup4/'),
            'spacy': ('spacy', 'https://pypi.org/project/spacy/'),
            'csv': ('csv', 'csv is included in the Python Standard Library'),
        }

    missing_libraries = []
    for pip_name, (module_name, _documentation) in required_libraries.items():
        try:
            importlib.import_module(module_name)
        except ImportError:
            missing_libraries.append(pip_name)

    if not missing_libraries:
        return

    print("The following libraries are missing:")
    for lib in missing_libraries:
        print(f"- {lib}")
    print("\nTo install missing libraries, use the following commands:")
    for lib in missing_libraries:
        print(f"- pip install {lib}")
        print(f" (Documentation: {required_libraries[lib][1]})")
    sys.exit(1)
# Function to handle errors and print traceback
def debug_error(message):
    """Report *message* and the current traceback on stdout, then exit.

    Args:
        message: Human-readable description of what went wrong.

    Raises:
        SystemExit: Always, with exit code 1.
    """
    print(f"Error: {message}")
    # sys.exc_info() yields (type, value, traceback) for the exception that
    # is currently being handled; it is unpacked straight into the printer.
    traceback.print_exception(*sys.exc_info(), file=sys.stdout)
    sys.exit(1)
# Stop early, with installation hints, if a dependency cannot be imported
check_and_install_libraries()

# SpaCy English pipeline, loaded once at module level
nlp = spacy.load("en_core_web_sm")

# Lower-case keywords that identify Games Workshop products
known_keywords = [
    'warhammer',
    'citadel',
    'aos',
    'age of sigmar',
    '40k',
]

# A single shared session so HTTP connections are reused between requests
session = requests.Session()
# Function to scrape Canadian Games Workshop site for products and prices
def scrape_games_workshop_ca():
    """Scrape product titles and prices from the Games Workshop en-CA page.

    Returns:
        dict: Lower-cased product title mapped to its price as a float.
        Products whose title or price element is missing, or whose parsed
        price is falsy (``None`` or ``0``), are left out.

    On a request failure (including a timeout) ``debug_error`` is called,
    which terminates the program.
    """
    url = 'https://www.games-workshop.com/en-CA'
    products = {}
    try:
        # Without an explicit timeout requests waits indefinitely on a
        # stalled server; a Timeout is a RequestException and is handled
        # by the clause below.
        response = session.get(url, timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        debug_error(f"Error fetching Games Workshop site: {e}")

    soup = BeautifulSoup(response.content, 'html.parser')
    for product in soup.find_all('div', {'class': 'product-item'}):
        title_elem = product.find('h4', {'class': 'product-title'})
        price_elem = product.find('span', {'class': 'price'})
        if not (title_elem and price_elem):
            continue
        title = title_elem.text.strip()
        price = parse_price(price_elem.text.strip())
        # A zero or missing price is useless as an MSRP baseline.
        if price:
            products[title.lower()] = price
    return products
# Function to parse price string and convert to float
def parse_price(price_str):
    """Convert a price string such as ``"$1,234.50"`` to a float.

    Dollar signs, thousands separators and surrounding whitespace are
    removed before conversion.

    Args:
        price_str: Raw price text taken from a page element.

    Returns:
        float | None: The numeric price, or ``None`` when the text is not a
        plain number (for example ``"Free"`` or a price range).
    """
    cleaned = price_str.replace('$', '').replace(',', '').strip()
    try:
        return float(cleaned)
    except ValueError:
        # One odd price label must not abort the whole scrape; callers test
        # the result for a missing price and skip the listing.
        print(f"Skipping unparsable price: {cleaned!r}")
        return None
# Function to check if an item might be mislisted
def is_mislisted(item_title):
    """Return True when *item_title* contains none of the known keywords.

    The title is lower-cased and tokenized with the module-level ``nlp``
    pipeline. Keywords are matched on whole tokens, and multi-word keywords
    such as ``'age of sigmar'`` are matched as a run of consecutive tokens.

    Args:
        item_title: Listing title to inspect.

    Returns:
        bool: ``True`` if no entry of ``known_keywords`` occurs in the title.
    """
    doc = nlp(item_title.lower())
    # Whitespace-only tokens are dropped so that repeated spaces in a title
    # cannot break up a multi-word keyword.
    words = [token.text for token in doc if not token.text.isspace()]
    # Padding with spaces makes a substring test respect token boundaries:
    # " aos " matches the token "aos" but not part of "chaos". Comparing
    # single tokens alone could never match a keyword that contains spaces.
    padded_title = f" {' '.join(words)} "
    return not any(f" {keyword} " in padded_title for keyword in known_keywords)
# Function to search using a generic scraping function
def search_site(url, title_selector, price_selector, title_key, price_key):
    """Fetch *url* and extract title/price pairs from its listings.

    Args:
        url: Search-results page to download.
        title_selector: CSS selector for the title inside one listing.
        price_selector: CSS selector for the price inside one listing.
        title_key: Dictionary key under which each title is stored.
        price_key: Dictionary key under which each parsed price is stored.

    Returns:
        list[dict]: One ``{title_key: title, price_key: price}`` entry per
        listing that has both elements and a parsable price.

    On a request failure (including a timeout) ``debug_error`` is called,
    which terminates the program.
    """
    items = []
    try:
        # Without an explicit timeout requests waits indefinitely on a
        # stalled server; a Timeout is a RequestException and is handled
        # by the clause below.
        response = session.get(url, timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        debug_error(f"Error fetching results from {url}: {e}")

    soup = BeautifulSoup(response.content, 'html.parser')
    # NOTE(review): the listing container class is the same hard-coded
    # 'search-item' for every site, while the selectors are per-site —
    # confirm that each target page really uses this container.
    for listing in soup.find_all('div', {'class': 'search-item'}):
        title_elem = listing.select_one(title_selector)
        price_elem = listing.select_one(price_selector)
        if not (title_elem and price_elem):
            continue
        title = title_elem.get_text(strip=True)
        price = parse_price(price_elem.get_text(strip=True))
        if price is not None:
            items.append({title_key: title, price_key: price})
    return items
# Functions for specific sites
def search_ebay(keyword):
    """Search eBay Canada for *keyword*.

    Args:
        keyword: Free-text search term, typically a product title.

    Returns:
        list[dict]: Entries with ``'title'`` and ``'price'`` keys, as
        produced by ``search_site``.
    """
    from urllib.parse import quote_plus

    # quote_plus turns spaces into '+' as before, and also escapes
    # characters such as '&' or '#' that would otherwise cut the query short.
    return search_site(
        f"https://www.ebay.ca/sch/i.html?_nkw={quote_plus(keyword)}",
        'h3.s-item__title',
        'span.s-item__price',
        'title',
        'price',
    )
def search_facebook_marketplace(query):
    """Search Facebook Marketplace (location Canada) for *query*.

    Args:
        query: Free-text search term, typically a product title.

    Returns:
        list[dict]: Entries with ``'title'`` and ``'price'`` keys, as
        produced by ``search_site``.
    """
    from urllib.parse import quote_plus

    # The raw query used to be pasted into the URL, so a title containing
    # '&' or '#' would corrupt the query string; escape it first.
    return search_site(
        f"https://www.facebook.com/marketplace/search/?query={quote_plus(query)}&location=Canada",
        'span.title',
        'span.price',
        'title',
        'price',
    )
def search_kijiji(query):
    """Search Kijiji's Canada-wide buy & sell section for *query*.

    Args:
        query: Free-text search term, typically a product title.

    Returns:
        list[dict]: Entries with ``'title'`` and ``'price'`` keys, as
        produced by ``search_site``.
    """
    from urllib.parse import quote

    # The query sits inside the URL path, so every reserved character is
    # escaped (safe=''), including '/', which would otherwise add a path
    # segment and change which page is requested.
    return search_site(
        f"https://www.kijiji.ca/b-buy-sell/canada/{quote(query, safe='')}/k0c10l0",
        'a.title',
        'div.price',
        'title',
        'price',
    )
def search_google_shopping(query):
    """Search Google Shopping (google.ca) for *query*.

    Args:
        query: Free-text search term, typically a product title.

    Returns:
        list[dict]: Entries with ``'title'`` and ``'price'`` keys, as
        produced by ``search_site``.
    """
    from urllib.parse import quote_plus

    # quote_plus turns spaces into '+' as before, and also escapes
    # characters such as '&' or '#' that would otherwise cut the query short.
    return search_site(
        f"https://www.google.ca/search?tbm=shop&q={quote_plus(query)}",
        'h4.A2sOrd',
        'span.a8Pemb',
        'title',
        'price',
    )
# Collect mislisted items from all platforms
def collect_mislisted_items(msrp_data):
    """Search every platform for each product and gather mislisted listings.

    Args:
        msrp_data: Mapping of product title to its MSRP.

    Returns:
        list[dict]: One entry per listing that ``is_mislisted`` flags and
        for which a percentage could be computed, with the keys
        ``'platform'``, ``'title'``, ``'price'`` and ``'percentage_of_msrp'``.

    Any exception raised while handling a platform is passed to
    ``debug_error``, which terminates the program.
    """
    flagged = []
    for product_title in msrp_data:
        platforms = (
            ('eBay', search_ebay),
            ('Facebook Marketplace', search_facebook_marketplace),
            ('Kijiji', search_kijiji),
            ('Google Shopping', search_google_shopping),
        )
        for platform, search_func in platforms:
            try:
                for listing in search_func(product_title):
                    title = listing['title']
                    price = listing['price']
                    if not is_mislisted(title):
                        continue
                    percentage = calculate_percentage_of_msdp(price, msrp_data[product_title])
                    if percentage is None:
                        continue
                    flagged.append({
                        'platform': platform,
                        'title': title,
                        'price': price,
                        'percentage_of_msrp': percentage
                    })
            except Exception as e:
                debug_error(f"Error searching on {platform}: {e}")
    return flagged
# Function to calculate the percentage of MSRP
def calculate_percentage_of_msdp(price, msrp):
    """Express *price* as a percentage of *msrp*.

    Args:
        price: Listing price.
        msrp: Reference price.

    Returns:
        The percentage as a float, or ``None`` when *msrp* is not greater
        than zero.
    """
    if msrp > 0:
        return (price / msrp) * 100
    return None
# Function to write results to a CSV file
def write_results_to_csv(mislisted_items, filename='mislisted_items.csv'):
    """Write the mislisted items to a CSV file, grouped by platform.

    Rows are sorted by platform and then by percentage of MSRP. An empty
    row is written between consecutive platform groups.

    Args:
        mislisted_items: Iterable of dicts with the keys ``'platform'``,
            ``'title'``, ``'price'`` and ``'percentage_of_msrp'``.
        filename: Destination path; defaults to ``'mislisted_items.csv'``.
    """
    sorted_items = sorted(
        mislisted_items,
        key=lambda item: (item['platform'], item['percentage_of_msrp']),
    )
    with open(filename, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['Platform', 'Title', 'Price', 'Percentage of MSRP'])
        current_platform = None
        for item in sorted_items:
            if item['platform'] != current_platform:
                # Separate platform groups with an empty row, but do not put
                # a stray blank row between the header and the first group.
                if current_platform is not None:
                    writer.writerow([])
                current_platform = item['platform']
            writer.writerow([
                item['platform'],
                item['title'],
                item['price'],
                f"{item['percentage_of_msrp']:.2f}%",
            ])
    print(f"Results have been written to {filename}")
# Main workflow
def main():
    """Run the pipeline: scrape MSRPs, search the platforms, write the CSV.

    Stops early with a message when no MSRP data or no mislisted items are
    found. Any exception is passed to ``debug_error``, which terminates the
    program.
    """
    try:
        catalogue = scrape_games_workshop_ca()
        if not catalogue:
            print("No MSRP data found.")
            return

        flagged = collect_mislisted_items(catalogue)
        if not flagged:
            print("No mislisted items found.")
            return

        write_results_to_csv(flagged)
    except Exception:
        debug_error("An unexpected error occurred in the main workflow.")


if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement