import requests
from tqdm import tqdm
import pandas as pd
from bs4 import BeautifulSoup

# Site root; all relative hrefs scraped from the index are joined onto this.
BASE_URL = "https://www.constructionequipmentguide.com"

# Paragraph the site renders on a listings page past the last real page.
# (In the original this literal was the page-loop termination sentinel.)
EMPTY_PAGE_MESSAGE = (
    "We currently don't have any equipment listed for this type of machine."
)

# Accumulates one dict per scraped listing; dumped to result.csv at the end.
RESULT = []


def get_equipment_details(listing, subtype):
    """Extract one listing's fields from a search-results "machine-listing" div.

    Args:
        listing: a bs4 Tag for one ``div.machine-listing`` element.
        subtype: human-readable equipment subtype name for this listing.

    Returns:
        dict of listing fields, merged with the detail-page fields from
        :func:`get_more_details` when a detail URL is present.

    Every field is scraped best-effort: a missing element yields ``None``
    rather than aborting the whole listing.
    """
    obj = {}
    try:
        model = listing.find("div", class_="machine-model").find("a").text.strip()
    except Exception:
        model = None
    try:
        # The model string starts with the year (e.g. "2015 CAT 320"),
        # so the first whitespace-separated token is taken as the year.
        year = (
            listing.find("div", class_="machine-model").find("a")
            .text.strip().split(" ")[0].strip()
        )
    except Exception:
        year = None
    try:
        dealer = (
            listing.find("div", class_="machine-contact")
            .find("div", class_="machine-dealer").text.strip()
        )
    except Exception:
        dealer = None
    try:
        price = (
            listing.find("div", class_="machine-details")
            .find("div", class_="machine-price").text.strip()
        )
    except Exception:
        price = None
    try:
        # Currency code is the last token of the price text (e.g. "1,000 USD").
        currency = price.split(" ")[-1].strip() if price else None
    except Exception:
        currency = None
    try:
        serial_no = (
            listing.find("div", class_="machine-details")
            .find("div", class_="machine-serial-stock")
            .text.strip().replace("Serial:", "").strip()
        )
    except Exception:
        serial_no = None
    try:
        url = listing.find("div", class_="machine-model").find("a").get("href")
        url = BASE_URL + url
    except Exception as e:
        url = None
        print(e)

    obj["Model"] = model
    obj["Year"] = year
    obj["Seller_Name"] = dealer
    obj["Price"] = price
    # "Call for Price" style listings end in the token "Price"; treat as unknown.
    obj["Currency"] = currency if currency != "Price" else None
    obj["Serial_Number"] = serial_no
    obj["URL"] = url
    obj["SubType"] = subtype
    obj["Manufacturer"] = None

    if url:
        more_details = get_more_details(url)
        return {**obj, **more_details}
    # BUG FIX: the original implicitly returned None when no detail URL was
    # found, appending None to RESULT and corrupting the final DataFrame.
    return obj


def get_more_details(url):
    """Scrape the listing's detail page for condition, size class and location.

    Args:
        url: absolute URL of a single machine's detail page.

    Returns:
        dict with Category, Condition, Size_Class, Seller_Zip, Seller_State,
        Seller_City plus always-None placeholder columns (Seller_Country,
        Meter_Reads, Meter_Reads_UOM). Fields are best-effort: any parsing
        failure yields None for that field.
    """
    obj = {}
    r = requests.get(url)
    soup = BeautifulSoup(r.text, "html.parser")

    try:
        condition_check = (
            soup.find("div", class_="meta-descriptions")
            .find("tr", class_="categories").find("td").text.strip()
        )
        if condition_check == "Condition":
            condition = (
                soup.find("div", class_="meta-descriptions")
                .findAll("tr")[1].findAll("td")[1].text.strip()
            )
        else:
            condition = None
    except Exception:
        condition = None

    try:
        size_class = None
        table = soup.find("table", class_="category-specs")
        # The "Power" spec row doubles as the size-class indicator.
        for row in table.findAll("tr"):
            if "Power" in row.find("td").text.strip():
                size_class = row.findAll("td")[1].text.strip()
                break
    except Exception:
        size_class = None

    try:
        category = soup.find("span", {"content": "3"}).text.strip()
    except Exception:
        category = None

    # Location cell reads like "City, ST 12345"; split on spaces to pick
    # city / state / zip. (Multi-word city names would break this — same
    # limitation as the original.)
    try:
        zip_code = (
            soup.find("table", class_="machine-info")
            .find("tr", class_="machine-location").findAll("td")[-1]
            .text.split(" ")[-1].strip()
        )
    except Exception:
        zip_code = None
    try:
        state = (
            soup.find("table", class_="machine-info")
            .find("tr", class_="machine-location").findAll("td")[-1]
            .text.split(" ")[1].replace(",", "").strip()
        )
    except Exception:
        state = None
    try:
        city = (
            soup.find("table", class_="machine-info")
            .find("tr", class_="machine-location").findAll("td")[-1]
            .text.split(" ")[0].strip()
        )
    except Exception:
        city = None

    obj["Category"] = category
    obj["Condition"] = condition
    obj["Size_Class"] = size_class
    obj["Seller_Zip"] = zip_code
    obj["Seller_State"] = state
    obj["Seller_City"] = city
    obj["Seller_Country"] = None
    obj["Meter_Reads"] = None
    obj["Meter_Reads_UOM"] = None
    return obj


def main():
    """Crawl every excavator subtype, scrape all listings, write result.csv."""
    r = requests.get(BASE_URL + "/used-excavators-for-sale/")
    soup = BeautifulSoup(r.text, "html.parser")
    type_ul_tags = soup.find("ul", class_="columns-three")
    links = [
        BASE_URL + item.find("a").get("href")
        for item in type_ul_tags.findAll("li")
    ]
    subtypes = [item.find("a").text for item in type_ul_tags.findAll("li")]

    for index, link in enumerate(links):
        # BUG FIX: the original hard-coded the "large-excavators" pagination
        # URL here, so every subtype re-scraped the same listings. Paginate
        # the current subtype's own index page instead.
        for page_num in range(1, 500):
            r = requests.get(f"{link.rstrip('/')}/{page_num}")
            soup = BeautifulSoup(r.text, "html.parser")
            try:
                big_paragraph = soup.find("p", class_="larger-text").text.strip()
                if big_paragraph == EMPTY_PAGE_MESSAGE:
                    break  # past the last page of this subtype
            except Exception as e:
                # Banner paragraph absent on normal pages; keep scraping.
                print(e)
            for listing in tqdm(soup.findAll("div", class_="machine-listing")):
                listing_item = get_equipment_details(
                    listing=listing, subtype=subtypes[index]
                )
                RESULT.append(listing_item)

    df = pd.DataFrame(RESULT)
    df.to_csv("result.csv", index=False)


if __name__ == "__main__":
    main()