
Scraper Issue

a guest
May 16th, 2021
from bs4 import BeautifulSoup as Soup
import pandas as pd
import requests


def clean_row(row):
    """
    Takes a table row (or header row) and gets the data out of it,
    storing it in a list of strings.
    """
    return [str(x.string) for x in row.find_all(['td', 'th'])]

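# Illustrative usage (not part of the original paste): for a row parsed from
# '<tr><td>1</td><td>Alice</td></tr>', clean_row would return ['1', 'Alice'].
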
# Get the data from the website.
raw_data = requests.get(
    'URL_Here')

# Parse the raw HTML into a BeautifulSoup tree.
# Naming the parser avoids BeautifulSoup's "no parser was explicitly specified" warning.
raw_soup = Soup(raw_data.text, 'html.parser')

# Find all tables (there should only be one),
# then grab all the rows in the first table.
tables = raw_soup.find_all('table')
rows = tables[0].find_all('tr')

# "Clean" the rows, then store the data in a data frame.
# Skip the first row (the header); note the slice also excludes the table's last row.
cleaned_rows = [clean_row(row) for row in rows[1:-1]]
df = pd.DataFrame(cleaned_rows)

# 'Clean' the column headings in the same way and add them to the dataframe.
cleaned_header = clean_row(rows[0])
df.columns = cleaned_header

# Set the index to the '#' column, which is just the row number. This will do for now.
df.set_index('#', inplace=True)

df.sample(10)  # test: preview 10 random rows
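
For comparison, here is a minimal sketch of the same table extraction using pandas' built-in HTML table reader. It assumes the page's first <table> is the one of interest, that a '#' column exists as in the script above, and that an HTML parser such as lxml is installed; 'URL_Here' again stands in for the real URL.

import pandas as pd
import requests

raw_data = requests.get('URL_Here')

# pd.read_html returns one DataFrame per <table> found in the HTML.
tables = pd.read_html(raw_data.text)
df = tables[0].set_index('#')

df.sample(10)  # preview 10 random rows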