Advertisement
Guest User

Untitled

a guest
Oct 18th, 2019
100
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.36 KB | None | 0 0
import time
from io import StringIO
from urllib.parse import quote

import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
  4.  
  5.  
  6. def searchsport(terme):
  7. url = f'https://www.verif.com/recherche/{terme}/1/ca/d/?ville=null'
  8. response = requests.get(url, headers = {'User-Agent':'Mozilla/5.0'})
  9. response.raise_for_status()
  10. return terme, response.text
  11.  
  12.  
  13. def crawl(keyword):
  14. try:
  15. keyword, html = searchsport(keyword)
  16. soup = bs(html,'lxml')
  17. a_tag = soup.select_one('td.verif_col1 a[href]')
  18. # your code before when looping tds would just overwrite truelink if more than one found. Instead
  19. if a_tag is None:
  20. #handle case of no result e.g. with using crawl('PARIS-SAINT-GERMAIN-FOOTBALL') instead of
  21. #crawl('PARIS SAINT GERMAIN FOOTBALL')
  22. truelink = ''
  23. else:
  24. # print(a_tag['href'])
  25. # adding to the list premier served no purpose. Using split on href would result in list index out of range
  26. truelink = f'https://www.verif.com{a_tag["href"]}' #relative link already so no extra / after .com
  27.  
  28. except Exception as e:
  29. print(e)
  30. truelink = '' #handle case of 'other' fail. Make sure there is an assigment
  31. finally:
  32. time.sleep(5)
  33. return truelink #unless try succeeded this would have failed with local variable referenced before assignment
  34.  
  35.  
  36. def single_text(item_url):
  37. source_code = requests.get(item_url, headers = {'User-Agent':'Mozilla/5.0'})
  38. print('nivo1 ok')
  39. plain_text = source_code.text # La page en html avec toutes ces balises
  40. soup = bs(plain_text,features="lxml")
  41. print('nivo2 ok')
  42. table = soup.select_one('.table') # on cherche que la balise table
  43. #print('nivo1 ok', '\n', table)
  44. if table is None:
  45. df = pd.DataFrame()
  46. else:
  47. df = pd.read_html(str(table))[0] #simplify to work direct with table and pandas;avoid your loops
  48. return df
  49.  
  50. def main():
  51.  
  52. terms = ['PARIS-SAINT-GERMAIN-FOOTBALL', 'PARIS SAINT GERMAIN FOOTBALL']
  53.  
  54. for term in terms:
  55. item_url = crawl(term)
  56. if item_url:
  57. print(item_url)
  58. df = single_text(item_url) # what is single_item in your question? There is single_text
  59. if not df.empty: #test if dataframe is empty
  60. print(df.head(1))
  61.  
  62. if __name__ == '__main__':
  63. main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement