Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import time
from io import StringIO
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
def searchsport(terme, timeout=10):
    """Download the first verif.com search-results page for a search term.

    Args:
        terme: Search term interpolated into the results URL path.
        timeout: Seconds to wait on the server before giving up; forwarded
            to ``requests.get``.

    Returns:
        A ``(terme, html)`` tuple: the term exactly as passed in, and the
        response body as text.

    Raises:
        requests.HTTPError: If the response status is 4xx/5xx
            (via ``raise_for_status``).
        requests.RequestException: On connection failure or timeout.
    """
    url = f'https://www.verif.com/recherche/{terme}/1/ca/d/?ville=null'
    # Without an explicit timeout requests waits indefinitely on a server
    # that accepts the connection but never answers.
    response = requests.get(
        url,
        headers={'User-Agent': 'Mozilla/5.0'},
        timeout=timeout,
    )
    response.raise_for_status()
    return terme, response.text
def crawl(keyword, delay=5):
    """Return the absolute URL of the first search hit for *keyword*.

    The search page is fetched with ``searchsport`` and the first element
    matching ``td.verif_col1 a[href]`` is taken as the result link.

    Args:
        keyword: Search term handed to ``searchsport``.
        delay: Seconds to sleep before returning, whether or not the lookup
            succeeded (keeps successive calls spaced out).

    Returns:
        The result link resolved against ``https://www.verif.com``, or an
        empty string when the page has no matching link or any exception
        occurred (the exception is printed, not raised).
    """
    # Assigned up front so every path - no hit, or any failure - has a value
    # to return.
    truelink = ''
    try:
        keyword, html = searchsport(keyword)
        soup = bs(html, 'lxml')
        a_tag = soup.select_one('td.verif_col1 a[href]')
        # a_tag is None when the search produced no result, e.g. for
        # 'PARIS-SAINT-GERMAIN-FOOTBALL' as opposed to
        # 'PARIS SAINT GERMAIN FOOTBALL'.
        if a_tag is not None:
            # urljoin gives the same result as plain concatenation for a
            # root-relative href, and also stays correct if the href is
            # absolute or lacks the leading slash.
            truelink = urljoin('https://www.verif.com', a_tag['href'])
    except Exception as e:
        # Best-effort lookup: report the problem and fall back to ''.
        print(e)
    finally:
        time.sleep(delay)
    return truelink
def single_text(item_url, timeout=10):
    """Fetch *item_url* and return its first ``.table`` element as a DataFrame.

    Args:
        item_url: URL of the page to download.
        timeout: Seconds to wait on the server before giving up; forwarded
            to ``requests.get``.

    Returns:
        The first table pandas parses out of the first element carrying the
        CSS class ``table``, or an empty ``DataFrame`` when the page has no
        such element.

    Raises:
        requests.RequestException: On connection failure or timeout.
    """
    # Without an explicit timeout requests waits indefinitely on a server
    # that never answers.
    source_code = requests.get(
        item_url,
        headers={'User-Agent': 'Mozilla/5.0'},
        timeout=timeout,
    )
    print('nivo1 ok')
    plain_text = source_code.text  # the page's HTML, with all of its tags
    soup = bs(plain_text, features="lxml")
    print('nivo2 ok')
    # Only the first element with class "table" is of interest.
    table = soup.select_one('.table')
    if table is None:
        return pd.DataFrame()
    # Literal HTML strings passed to read_html are deprecated (pandas >= 2.1);
    # a file-like wrapper is accepted by old and new versions alike.
    return pd.read_html(StringIO(str(table)))[0]
def main():
    """Look up each sample search term and print what was found.

    For every term the result URL from ``crawl`` is printed when it is
    non-empty, followed by the first row of the table returned by
    ``single_text`` when that table is non-empty.
    """
    for search_term in ('PARIS-SAINT-GERMAIN-FOOTBALL', 'PARIS SAINT GERMAIN FOOTBALL'):
        company_url = crawl(search_term)
        if not company_url:
            # No search hit (or the lookup failed): nothing to fetch.
            continue
        print(company_url)

        frame = single_text(company_url)
        if frame.empty:
            continue
        print(frame.head(1))


if __name__ == '__main__':
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement