Not a member of Pastebin yet? Sign up — it unlocks many cool features!
import requests
from tqdm import tqdm
import pandas as pd
from bs4 import BeautifulSoup

# Accumulator for one dict per scraped machine listing; dumped to CSV at the end.
RESULT = []

BASE_URL = "https://www.constructionequipmentguide.com"

# Category index page: lists every excavator sub-type with a relative link.
url = 'https://www.constructionequipmentguide.com/used-excavators-for-sale/'
r = requests.get(url)
soup = BeautifulSoup(r.text, "html.parser")

# Walk the "columns-three" <ul> once and pull both the link and its label,
# instead of traversing the same <li> list twice and then re-mapping links.
type_ul_tags = soup.find("ul", class_="columns-three")
list_items = type_ul_tags.findAll("li")
links = [BASE_URL + item.find("a").get("href") for item in list_items]
subtypes = [item.find("a").text for item in list_items]
def get_equipment_details(listing, subtype):
    """Extract one search-result listing element into a flat record dict.

    Parameters
    ----------
    listing : bs4.Tag
        A ``<div class="machine-listing">`` element from a results page.
    subtype : str
        Sub-type label taken from the category index page.

    Returns
    -------
    dict
        Listing-level fields, merged with detail-page fields from
        ``get_more_details`` when a detail URL exists.
    """

    def _try(getter):
        # Fields are frequently absent on this site; treat any lookup
        # failure (missing tag -> AttributeError, etc.) as "not present".
        try:
            return getter()
        except Exception:
            return None

    model = _try(lambda: listing.find("div", class_="machine-model")
                 .find("a").text.strip())
    # The model string starts with the year, e.g. "2014 CAT 320E".
    year = model.split(" ")[0].strip() if model else None
    dealer = _try(lambda: listing.find("div", class_="machine-contact")
                  .find("div", class_="machine-dealer").text.strip())
    price = _try(lambda: listing.find("div", class_="machine-details")
                 .find("div", class_="machine-price").text.strip())
    currency = price.split(" ")[-1].strip() if price else None
    serial_no = _try(lambda: listing.find("div", class_="machine-details")
                     .find("div", class_="machine-serial-stock")
                     .text.strip().replace("Serial:", "").strip())
    href = _try(lambda: listing.find("div", class_="machine-model")
                .find("a").get("href"))
    url = "https://www.constructionequipmentguide.com" + href if href else None

    obj = {
        "Model": model,
        "Year": year,
        "Seller_Name": dealer,
        "Price": price,
        # "Call for Price"-style listings end in the word "Price",
        # which is not a currency code.
        "Currency": currency if currency != 'Price' else None,
        "Serial_Number": serial_no,
        "URL": url,
        "SubType": subtype,
        "Manufacturer": None,
    }

    if url:
        more_details = get_more_details(url)
        return {**obj, **more_details}
    # Bug fix: the original implicitly returned None here, discarding the
    # listing-level fields it had already collected; return them instead.
    return obj
def get_more_details(url):
    """Fetch a machine's detail page and extract the extra fields.

    Parameters
    ----------
    url : str
        Absolute URL of the machine detail page.

    Returns
    -------
    dict
        Category, Condition, Size_Class and seller-location fields;
        any field missing from the page is set to ``None``.
    """
    r = requests.get(url)
    soup = BeautifulSoup(r.text, "html.parser")

    # Condition is only present when the first meta-descriptions row is
    # literally headed "Condition".
    condition = None
    try:
        meta = soup.find("div", class_="meta-descriptions")
        if meta.find("tr", class_="categories").find("td").text.strip() == "Condition":
            condition = meta.findAll("tr")[1].findAll("td")[1].text.strip()
    except Exception:
        pass

    # Size class is read off the "Power" row of the spec table.
    size_class = None
    try:
        table = soup.find("table", class_="category-specs")
        for row in table.findAll("tr"):
            if "Power" in row.find("td").text.strip():
                size_class = row.findAll("td")[1].text.strip()
                break
    except Exception:
        pass

    try:
        category = soup.find("span", {"content": "3"}).text.strip()
    except Exception:
        category = None

    # The location cell reads like "City, ST 12345"; parse it in a single
    # pass instead of re-fetching and re-splitting it three times.
    # (`zip_code` also avoids shadowing the builtin `zip`.)
    city = state = zip_code = None
    try:
        location_text = soup.find("table", class_="machine-info").find(
            "tr", class_="machine-location").findAll("td")[-1].text
        parts = location_text.split(" ")
        city = parts[0].strip()
        state = parts[1].replace(",", "").strip() if len(parts) > 1 else None
        zip_code = parts[-1].strip()
    except Exception:
        pass

    return {
        "Category": category,
        "Condition": condition,
        "Size_Class": size_class,
        "Seller_Zip": zip_code,
        "Seller_State": state,
        "Seller_City": city,
        # Fields below are not available on this site; kept for a stable schema.
        "Seller_Country": None,
        "Meter_Reads": None,
        "Meter_Reads_UOM": None,
    }
# Crawl every sub-type's paginated results and collect all listings.
for index, link in enumerate(links):
    for page_num in range(1, 500):
        # Bug fix: the original fetched a hard-coded "large-excavators" URL
        # for every sub-type, so all categories scraped identical pages.
        # Paginated pages are "<category-link>/<page_num>".
        r = requests.get(f"{link.rstrip('/')}/{page_num}")
        soup = BeautifulSoup(r.text, "html.parser")
        try:
            big_paragraph = soup.find("p", class_="larger-text").text.strip()
            if big_paragraph == "We currently don't have any equipment listed for this type of machine.":
                break  # walked past the last page for this sub-type
        except Exception as e:
            # Banner absent on normal pages; log and keep scraping.
            print(e)
        for listing in tqdm(soup.findAll("div", class_="machine-listing")):
            listing_item = get_equipment_details(listing=listing, subtype=subtypes[index])
            if listing_item is not None:  # skip listings that failed to parse
                RESULT.append(listing_item)

df = pd.DataFrame(RESULT)
df.to_csv("result.csv", index=False)
Add Comment
Please sign in to add a comment.