Advertisement
RayanRam

Untitled

Mar 14th, 2018
119
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.56 KB | None | 0 0
  1. import requests
  2. import bs4
  3. from bs4 import BeautifulSoup
  4. import pandas as pd
  5. import time
  6.  
# Indeed search-results URL: query "data scientist", location Paris (75).
URL = 'https://www.indeed.fr/jobs?q=data+scientist&l=Paris+%2875%29'
# Fetch the raw HTML of the results page (network I/O; no retry/timeout here).
page = requests.get(URL)
# Parse the response body with the stdlib html.parser so we can navigate the
# document tree instead of treating it as one long string.
soup = BeautifulSoup(page.text, 'html.parser')
# Debug aid: dump the parsed tree in indented form for easier reading.
print(soup.prettify())
  14.  
  15.  
  16. def extract_job_title_from_result(soup):
  17.     jobs = []
  18.     for div in soup.find_all(name='div', attrs={'class':'row'}):
  19.         for a in div.find_all(name='a', attrs={'data-tn-element':'jobTitle'}):
  20.             jobs.append(a['title'])
  21.     return(jobs)
  22. extract_job_title_from_result(soup)
  23. def extract_company_from_result(soup):
  24.     companies = []
  25.     for div in soup.find_all(name='div', attrs={'class':'row'}):
  26.         company = div.find_all(name='span', attrs={'class':'company'})
  27.     if len(company) > 0:
  28.         for b in company:
  29.             companies.append(b.text.strip())
  30.     else:
  31.         sec_try = div.find_all(name='span', attrs={'class':'result-link-source'})
  32.         for span in sec_try:
  33.             companies.append(span.text.strip())
  34. return(companies)
  35.  
  36. extract_company_from_result(soup)
  37.  
  38. def extract_salary_from_result(soup):
  39.     salaries = []
  40.     for div in soup.find_all(name='div', attrs={'class':'row'}):
  41.         try:
  42.             salaries.append(div.find('nobr').text)
  43.         except:
  44.         try:
  45.             div_two = div.find(name='div', attrs={'class':'sjcl'})
  46.             div_three = div_two.find('div')
  47.             salaries.append(div_three.text.strip())
  48.         except:
  49.             salaries.append('Nothing_found')
  50.     return(salaries)
  51. extract_salary_from_result(soup)
  52.  
  53.  
  54. def extract_summary_from_result(soup):
  55.     summaries = []
  56.     spans = soup.findAll('span', attrs={'class': 'summary'})
  57.     for span in spans:
  58.         summaries.append(span.text.strip())
  59.     return(summaries)
  60. extract_summary_from_result(soup)
  61. max_results_per_city = 100
  62. city_set = [‘New+York’,’Chicago’,’San+Francisco’, ‘Austin’, ‘Seattle’, ‘Los+Angeles’, ‘Philadelphia’, ‘Atlanta’, ‘Dallas’, ‘Pittsburgh’, ‘Portland’, ‘Phoenix’, ‘Denver’, ‘Houston’, ‘Miami’, ‘Washington+DC’, ‘Boulder’]
  63. columns = [“city”, “job_title”, ”company_name”, ”location”, ”summary”, ”salary”]
  64. sample_df = pd.DataFrame(columns = columns)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement