Advertisement
Guest User

bs4_request_1

a guest
Mar 27th, 2020
332
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.36 KB | None | 0 0
  1.  
  2. from bs4 import BeautifulSoup as soup # HTML data structure
  3. from urllib.request import urlopen as uReq # Web client
  4.  
  5. # URl to web scrap from.
  6. # in this example we web scrap graphics cards from Newegg.com
  7. page_url = "http://www.newegg.com/Product/ProductList.aspx?Submit=ENE&N=-1&IsNodeId=1&Description=GTX&bop=And&Page=1&PageSize=36&order=BESTMATCH"
  8.  
  9. # opens the connection and downloads html page from url
  10. uClient = uReq(page_url)
  11.  
  12. # parses html into a soup data structure to traverse html
  13. # as if it were a json data type.
  14. page_soup = soup(uClient.read(), "html.parser")
  15. uClient.close()
  16.  
  17. # finds each product from the store page
  18. containers = page_soup.findAll("div", {"class": "item-container"})
  19.  
  20. # name the output file to write to local disk
  21. out_filename = "graphics_cards.csv"
  22. # header of csv file to be written
  23. headers = "brand,product_name,shipping \n"
  24.  
  25. # opens file, and writes headers
  26. f = open(out_filename, "w")
  27. f.write(headers)
  28.  
  29. # loops over each product and grabs attributes about
  30. # each product
  31. for container in containers:
  32. # Finds all link tags "a" from within the first div.
  33. make_rating_sp = container.div.select("a")
  34.  
  35. # Grabs the title from the image title attribute
  36. # Then does proper casing using .title()
  37. brand = make_rating_sp[0].img["title"].title()
  38.  
  39. # Grabs the text within the second "(a)" tag from within
  40. # the list of queries.
  41. product_name = container.div.select("a")[2].text
  42.  
  43. # Grabs the product shipping information by searching
  44. # all lists with the class "price-ship".
  45. # Then cleans the text of white space with strip()
  46. # Cleans the strip of "Shipping $" if it exists to just get number
  47. shipping = container.findAll("li", {"class": "price-ship"})[0].text.strip().replace("$", "").replace(" Shipping", "")
  48.  
  49. # prints the dataset to console
  50. print("brand: " + brand + "\n")
  51. print("product_name: " + product_name + "\n")
  52. print("shipping: " + shipping + "\n")
  53.  
  54. # writes the dataset to file
  55. f.write(brand + ", " + product_name.replace(",", "|") + ", " + shipping + "\n")
  56.  
  57. f.close() # Close the file
  58.  
  59.  
  60.  
  61. Python - newegg_com.py:58
  62. Traceback (most recent call last):
  63. File "/home/martin/dev/python/newegg_com.py", line 4, in <module>
  64. from urllib.request import urlopen as uReq # Web client
  65. ImportError: No module named request
  66. [Finished in 4.16s]
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement