Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Placeholder stored in a column when a field cannot be found in an entry.
_MISSING = 'none'


def _anchor_text(anchors, index):
    """Return the text of ``anchors[index]``, or 'none' if it is absent or empty."""
    if index < len(anchors):
        return anchors[index].text or _MISSING
    return _MISSING


def _parse_player_entry(entry):
    """Build one output row from a single ``<li>`` entry.

    The year is the first whitespace-separated token of the text node that
    directly precedes the entry's first ``<span>``.  Country, player and team
    come from the entry's first three ``<a>`` tags: the ``title`` attribute of
    the first, and the text of the second and third.  Any field that cannot be
    found is reported as the string 'none' instead of raising.

    Args:
        entry: A BeautifulSoup ``Tag`` for one list item.

    Returns:
        dict with the keys 'Year', 'Country', 'Player' and 'Team'.
    """
    year = _MISSING
    span = entry.find('span')
    if span is not None:
        preceding = span.previous_sibling
        # Only a text node can be split; NavigableString is a str subclass,
        # whereas a preceding Tag (or no sibling at all) carries no year text.
        if isinstance(preceding, str):
            tokens = preceding.split()
            if tokens:
                year = tokens[0]

    # Look the anchors up once and index defensively, so short entries fall
    # back to 'none' rather than raising IndexError.
    anchors = entry.find_all('a')
    country = anchors[0].get('title') if anchors else None

    return {
        'Year': year,
        'Country': country or _MISSING,
        'Player': _anchor_text(anchors, 1),
        'Team': _anchor_text(anchors, 2),
    }


def parse_base_url():
    """Scrape the last ten list entries from the page at ``base_url``.

    Fetches ``base_url`` (a module-level name defined outside this function),
    locates the first ``<ul>`` inside the first ``<table class="multicol">``,
    and parses its last ten ``<li>`` items into a DataFrame with the columns
    Year, Country, Player and Team.  The DataFrame is printed and written to
    'players_of_the_year.csv'.

    If the response status is not 200, or the expected table/list is not on
    the page, a message is printed and nothing is written.

    Returns:
        None.
    """
    # A timeout keeps the script from hanging forever on an unresponsive host.
    page = requests.get(base_url, timeout=10)
    if page.status_code != 200:
        print('Request for {} failed with status code {}'.format(
            base_url, page.status_code))
        return

    # BeautifulSoup reads the page text, using lxml as the parser.
    bs = BeautifulSoup(page.text, 'lxml')
    table = bs.find('table', class_='multicol')
    listing = table.find('ul') if table is not None else None
    if listing is None:
        print("Could not find a list inside a 'multicol' table at {}".format(
            base_url))
        return

    # The last ten entries, kept in page order.
    last_ten_players = listing.find_all('li')[-10:]
    rows = [_parse_player_entry(entry) for entry in last_ten_players]

    columns = ['Year', 'Country', 'Player', 'Team']
    df = pd.DataFrame(rows, columns=columns)
    # Number the rows from 1 for the printed table; the CSV below is written
    # without the index, so this only affects what is displayed.
    df.index = df.index + 1
    print(df)
    df.to_csv('players_of_the_year.csv', sep=',', index=False, encoding='utf-8')


# Run the scrape as soon as the script is executed.
parse_base_url()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement