Advertisement
Not a member of Pastebin yet? Sign up — it unlocks many cool features!
import requests
from bs4 import BeautifulSoup as bs
import mysql.connector

# Module-level connection shared by get_page_data().
# NOTE(review): credentials are hard-coded; move them to environment
# variables or a config file before deploying.
mydb = mysql.connector.connect(
    host='localhost',
    user='root',
    passwd='1',
    database='scraping',
)
def get_html(url):
    """Fetch *url* and return the response body as text.

    Parameters
    ----------
    url : str
        Absolute URL to download.

    Raises
    ------
    requests.HTTPError
        On a non-2xx status, so callers never parse an error page as if
        it were a player page.
    """
    # timeout added: the original could hang forever on a stalled host.
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    return r.text
# US states seen in hockey-reference birthplaces, kept in the ORIGINAL
# replacement order because these are substring replacements and order
# matters ('Virginia' must run before 'West Virginia').  Duplicates in
# the original chain (Washington, Michigan) are removed.
_US_STATES = (
    'California', 'Michigan', 'Florida', 'Texas', 'Washington', 'Hawaii',
    'New York', 'Arizona', 'Georgia', 'New Jersey', 'Pennsylvania',
    'Alabama', 'Virginia', 'Illinois', 'Massachusetts', 'Colorado',
    'Minnesota', 'Ohio', 'Alaska', 'Tennessee', 'Oregon', 'Missouri',
    'Maryland', 'South Carolina', 'Indiana', 'Wisconsin', 'Utah',
    'Kentucky', 'Louisiana', 'Connecticut', 'Oklahoma', 'Nevada', 'Maine',
    'Mississippi', 'New Mexico', 'Kansas', 'Iowa', 'Montana', 'Nebraska',
    'Arkansas', 'Wyoming', 'Rhode Island', 'New Hampshire', 'Delaware',
    'West Virginia', 'Vermont', 'Idaho', 'North Dakota', 'South Dakota',
)


def get_page_data(html):
    """Parse one hockey-reference.com player page, print the scraped
    fields, and insert the player's name into the ``skate_stat`` table.

    Parameters
    ----------
    html : str
        Raw HTML of a player page (e.g. /players/a/aaltoan01.html).
    """
    soup = bs(html, 'lxml')
    # Hoisted: the original re-queried this same <div id="meta"> for
    # every single field.
    meta = soup.find('div', id='meta')

    name = meta.find('h1').text
    print(name)

    # Optional fields below use narrow AttributeError handlers: find()
    # returns None when the label is absent, and None.next_sibling /
    # None.text is exactly what raises.  The original's bare excepts
    # hid real bugs.
    position = meta.find('p').find('strong', text='Position')
    try:
        print(position.next_sibling[2:9].replace('•', ''))
    except AttributeError:
        pass  # no 'Position' label on this page

    hand = meta.find('p').find('strong', text='Shoots')
    try:
        print(hand.next_sibling[2:8])
    except AttributeError:
        pass  # no 'Shoots' label on this page

    # Height and weight share the second <p>; the second <span> is the
    # metric pair, e.g. '(180cm, 84kg)' in its next_sibling text.
    body_span = meta.find_all('p')[1].find_all('span')[1]
    print(body_span.next_sibling[2:7])                  # height slice
    print(body_span.next_sibling[9:].replace(')', ''))  # weight slice

    born = meta.find('span', id='necro-birth').get('data-birth')
    print(born)

    # One lookup serves both the city (text before the <a>) and the
    # region/country (the <a> text); the original ran it twice.
    birthplace = meta.find('span', itemprop='birthPlace').find('a')
    try:
        print(birthplace.previous_sibling.replace(',', '').lstrip())
    except AttributeError:
        pass  # birthplace missing

    try:
        country = birthplace.text
        for state in _US_STATES:
            country = country.replace(state, 'US')
        print(country)
    except AttributeError:
        pass  # birthplace missing

    # Draft round info lives somewhere in the 4th/5th paragraphs.
    for paragraph in meta.find_all('p')[3:5]:
        try:
            draft_link = paragraph.find('a')
            print(draft_link.next_sibling.replace('\\', '')
                  .replace(',', '').replace(')', '').replace('(', '')
                  .lstrip())
        except AttributeError:
            pass  # this paragraph has no draft link

    twitter_link = meta.find('a', rel='nofollow')
    try:
        print(twitter_link.get('href'))
    except AttributeError:
        pass  # no twitter link

    link_pic = meta.find('div', class_='media-item')
    try:
        print(link_pic.find('img').get('src'))
    except AttributeError:
        pass  # no photo

    # BUG FIXES vs. original:
    #  * it executed with an undefined name ``namelastname`` (NameError
    #    on every call) — the scraped ``name`` is what was meant;
    #  * the SQL string had a trailing comma after VALUES (%s), which
    #    is invalid SQL;
    #  * the cursor was never closed.
    cursor = mydb.cursor()
    try:
        cursor.execute('INSERT INTO skate_stat (name) VALUES (%s)',
                       (name,))
        mydb.commit()
    finally:
        cursor.close()
def main():
    """Crawl the player index pages for a few letters and scrape every
    linked player page.

    Only letters 'a', 'b' and 'h' are crawled, matching the original
    script's hard-coded sample.
    """
    base = 'https://www.hockey-reference.com'
    # Renamed from ``main``: the original shadowed the function's own
    # name with this local string.
    index = 'https://www.hockey-reference.com/players/'
    for letter in ('a', 'b', 'h'):
        letter_url = index + letter
        print(letter_url)
        soup = bs(get_html(letter_url), 'lxml')
        paragraphs = soup.find('div', class_='section_content').find_all('p')
        for paragraph in paragraphs:
            try:
                href = paragraph.find('a').get('href')
            except AttributeError:
                # A paragraph without a link marks the end of the player
                # list; the original also stopped here — but its bare
                # ``except: break`` also swallowed every error raised
                # inside get_page_data(), silently truncating the crawl.
                break
            player_url = base + href
            print(player_url)
            get_page_data(get_html(player_url))


if __name__ == '__main__':
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement