Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import csv

from bs4 import BeautifulSoup as soup  # HTML data structure
from urllib.request import urlopen as uReq  # Web client

# URL to web-scrape from.
# In this example we scrape graphics cards (GTX) from Newegg.com.
page_url = "http://www.newegg.com/Product/ProductList.aspx?Submit=ENE&N=-1&IsNodeId=1&Description=GTX&bop=And&Page=1&PageSize=36&order=BESTMATCH"

# Open the connection and download the HTML page; close the connection
# even if parsing raises (the original leaked it on error).
uClient = uReq(page_url)
try:
    # Parse the HTML into a soup structure so it can be traversed
    # like nested data.
    page_soup = soup(uClient.read(), "html.parser")
finally:
    uClient.close()

# Each product on the store page lives in a <div class="item-container">.
# find_all is the modern name for the deprecated findAll alias.
containers = page_soup.find_all("div", {"class": "item-container"})

# Name of the output file written to local disk.
out_filename = "graphics_cards.csv"

# Use the stdlib csv module instead of hand-joined strings: it quotes
# commas inside product names correctly (the original mangled them to
# "|") and avoids the stray space the original wrote into the header.
# newline="" is required by the csv docs to prevent blank rows on Windows.
with open(out_filename, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["brand", "product_name", "shipping"])

    # Loop over each product and grab its attributes.
    for container in containers:
        # All <a> tags within the first div of the container.
        # Queried once and reused (the original ran the same
        # select("a") twice per product).
        links = container.div.select("a")

        # Brand comes from the image's title attribute;
        # .title() applies proper casing.
        brand = links[0].img["title"].title()

        # Product name is the text of the third <a> tag.
        product_name = links[2].text

        # Shipping info: first <li class="price-ship">, stripped of
        # whitespace and the "$ ... Shipping" decoration so only the
        # number (or "Free") remains.
        shipping = (
            container.find_all("li", {"class": "price-ship"})[0]
            .text.strip()
            .replace("$", "")
            .replace(" Shipping", "")
        )

        # Print the dataset to the console.
        print("brand: " + brand + "\n")
        print("product_name: " + product_name + "\n")
        print("shipping: " + shipping + "\n")

        # Write the dataset row to the CSV file.
        writer.writerow([brand, product_name, shipping])
- Python - newegg_com.py:58
- Traceback (most recent call last):
- File "/home/martin/dev/python/newegg_com.py", line 4, in <module>
- from urllib.request import urlopen as uReq # Web client
- ImportError: No module named request
- [Finished in 4.16s]
- (Note: this ImportError means the script was run with Python 2 — urllib.request exists only in Python 3. Run it with python3 instead.)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement