Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- from bs4 import BeautifulSoup as Soup
- import pandas as pd
- import requests
def clean_row(row):
    """Extract the cell text from a table row (or header row).

    Parameters
    ----------
    row : bs4.element.Tag
        A ``<tr>`` tag whose ``<td>``/``<th>`` children hold the data.

    Returns
    -------
    list[str]
        One string per cell, in document order.

    Notes
    -----
    Uses ``get_text()`` rather than ``str(cell.string)``: ``.string`` is
    ``None`` whenever a cell is empty or contains nested markup (e.g. a
    link inside a ``<td>``), which would turn those cells into the
    literal string ``"None"``. ``get_text()`` returns the concatenated
    text of all descendants, and ``""`` for an empty cell.
    """
    return [cell.get_text() for cell in row.find_all(['td', 'th'])]
# Fetch the page. A timeout prevents the script from hanging forever on
# an unresponsive server, and raise_for_status() stops us from silently
# parsing an HTTP error page as if it were the data table.
raw_data = requests.get(
    'URL_Here', timeout=30)
raw_data.raise_for_status()

# Parse the raw HTML. The parser is named explicitly ('html.parser' is
# in the stdlib): omitting it raises GuessedAtParserWarning and can give
# different parse trees depending on which parsers are installed.
raw_soup = Soup(raw_data.text, 'html.parser')

# Find all tables (the page is expected to contain only one), then grab
# every row of the first table.
tables = raw_soup.find_all('table')
rows = tables[0].find_all('tr')

# "Clean" the rows and load them into a DataFrame. The slice starts at
# the second row because the first row is the header.
# NOTE(review): the slice also drops the LAST row (rows[1:-1]); this
# looks like it may be skipping a footer row — confirm against the page,
# otherwise it silently loses one data row.
cleaned_rows = [clean_row(row) for row in rows[1:-1]]
df = pd.DataFrame(cleaned_rows)

# 'Clean' the header row the same way and use it as the column labels.
cleaned_header = clean_row(rows[0])
df.columns = cleaned_header

# Index by the '#' column, which is just the row number. Good enough for now.
df.set_index('#', inplace=True)

# Show a random sample as a sanity check. In a plain script a bare
# expression's value is discarded (it only auto-displays in a notebook),
# so print it explicitly.
print(df.sample(10))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement