collinsanele

edwards

May 19th, 2021 (edited)
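
# Scrape used-excavator listings from constructionequipmentguide.com: collect the
# subtype links from the landing page, walk each subtype's paginated results,
# pull the listing fields plus extra details from each listing's own page, and
# write everything to result.csv.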
import requests
from tqdm import tqdm
import pandas as pd
from bs4 import BeautifulSoup


RESULT = []
url = 'https://www.constructionequipmentguide.com/used-excavators-for-sale/'

# Fetch the landing page and collect the link and name of every excavator
# subtype from the "columns-three" list.
r = requests.get(url)
soup = BeautifulSoup(r.text, "html.parser")
type_ul_tags = soup.find("ul", class_="columns-three")
links = [item.find("a").get("href") for item in type_ul_tags.findAll("li")]
subtypes = [item.find("a").text for item in type_ul_tags.findAll("li")]
links = ["https://www.constructionequipmentguide.com" + item for item in links]


def get_equipment_details(listing, subtype):
    """Pull the basic fields from one listing card on a search-results page."""
    obj = {}

    try:
        model = listing.find("div", class_="machine-model").find("a").text.strip()
    except Exception:
        model = None

    try:
        # The listing title begins with the model year.
        year = listing.find("div", class_="machine-model").find("a").text.strip().split(" ")[0].strip()
    except Exception:
        year = None

    try:
        dealer = listing.find("div", class_="machine-contact").find("div", class_="machine-dealer").text.strip()
    except Exception:
        dealer = None

    try:
        price = listing.find("div", class_="machine-details").find("div", class_="machine-price").text.strip()
    except Exception:
        price = None

    try:
        currency = price.split(" ")[-1].strip() if price else None
    except Exception:
        currency = None

    try:
        serial_no = listing.find("div", class_="machine-details").find("div", class_="machine-serial-stock").text.strip().replace("Serial:", "").strip()
    except Exception:
        serial_no = None

    try:
        url = listing.find("div", class_="machine-model").find("a").get("href")
        url = "https://www.constructionequipmentguide.com" + url
    except Exception as e:
        url = None
        print(e)

    obj["Model"] = model
    obj["Year"] = year
    obj["Seller_Name"] = dealer
    obj["Price"] = price
    obj["Currency"] = currency if currency != 'Price' else None  # last token is "Price" when no amount is listed
    obj["Serial_Number"] = serial_no
    obj["URL"] = url
    obj["SubType"] = subtype
    obj["Manufacturer"] = None

    # Merge in the extra fields scraped from the listing's own page; if there is
    # no URL, return only the basic fields.
    if url:
        obj = {**obj, **get_more_details(url)}

    return obj


def get_more_details(url):
    """Scrape extra fields (category, condition, size class, location) from a listing's detail page."""
    obj = {}
    r = requests.get(url)
    soup = BeautifulSoup(r.text, "html.parser")

    try:
        condition_check = soup.find("div", class_="meta-descriptions").find("tr", class_="categories").find("td").text.strip()
        if condition_check == "Condition":
            condition = soup.find("div", class_="meta-descriptions").findAll("tr")[1].findAll("td")[1].text.strip()
        else:
            condition = None
    except Exception:
        condition = None

    try:
        # Use the "Power" row of the spec table as the size class.
        size_class = None
        table = soup.find("table", class_="category-specs")
        for row in table.findAll("tr"):
            if "Power" in row.find("td").text.strip():
                size_class = row.findAll("td")[1].text.strip()
                break
    except Exception:
        size_class = None

    try:
        category = soup.find("span", {"content": "3"}).text.strip()
    except Exception:
        category = None

    # The location cell is split on spaces: first token taken as the city,
    # second as the state (comma stripped), last as the zip code.
    try:
        zip_code = soup.find("table", class_="machine-info").find("tr", class_="machine-location").findAll("td")[-1].text.split(" ")[-1].strip()
    except Exception:
        zip_code = None

    try:
        state = soup.find("table", class_="machine-info").find("tr", class_="machine-location").findAll("td")[-1].text.split(" ")[1].replace(",", "").strip()
    except Exception:
        state = None

    try:
        city = soup.find("table", class_="machine-info").find("tr", class_="machine-location").findAll("td")[-1].text.split(" ")[0].strip()
    except Exception:
        city = None

    obj["Category"] = category
    obj["Condition"] = condition
    obj["Size_Class"] = size_class
    obj["Seller_Zip"] = zip_code
    obj["Seller_State"] = state
    obj["Seller_City"] = city
    obj["Seller_Country"] = None
    obj["Meter_Reads"] = None
    obj["Meter_Reads_UOM"] = None

    return obj
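

# Crawl loop: for every excavator subtype collected above, walk its paginated
# result pages and scrape each listing card, stopping at the first page that
# carries the "no equipment listed" message.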
for index, link in enumerate(links):
    # Assumes each subtype paginates as <subtype link>/<page number>,
    # e.g. .../used-excavators-for-sale/all/type/large-excavators/2.
    for page_num in range(1, 500):
        r = requests.get(f"{link.rstrip('/')}/{page_num}")
        soup = BeautifulSoup(r.text, "html.parser")

        try:
            big_paragraph = soup.find("p", class_="larger-text").text.strip()
            if big_paragraph == "We currently don't have any equipment listed for this type of machine.":
                break
        except Exception as e:
            print(e)

        for listing in tqdm(soup.findAll("div", class_="machine-listing")):
            listing_item = get_equipment_details(listing=listing, subtype=subtypes[index])
            RESULT.append(listing_item)

df = pd.DataFrame(RESULT)
df.to_csv("result.csv", index=False)
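
# Optional quick check of the scraped output (uses only columns set above).
print(f"Scraped {len(df)} listings across {df['SubType'].nunique()} subtype(s).")
print(df.head())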