Advertisement
przedmarancza

app.py

Mar 5th, 2018
148
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.34 KB | None | 0 0
  1. import urllib.parse
  2. import urllib.request
  3. from bs4 import BeautifulSoup
  4. import db_init
  5. import send_email
  6.  
  7.  
  8. def search_url(search='python'): #full web scraping
  9.  
  10.     site = 'http://www.rzeszowiak.pl/Praca-Zatrudnie-3040011505'
  11.     headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 '
  12.                              'Safari/537.17'}
  13.     values = {'z': search}
  14.     data = urllib.parse.urlencode(values)
  15.     url = site + '?' + data
  16.     req = urllib.request.Request(url, headers=headers)
  17.     resp = urllib.request.urlopen(req)
  18.     resp_data = resp.read()
  19.     parsed_html = BeautifulSoup(resp_data, "lxml")
  20.     listLinks = []
  21.  
  22.     for div in parsed_html.find_all('div', class_='normalbox-title-left'):
  23.         listLinks.append('http://www.rzeszowiak.pl' + div.a.get('href'))
  24.  
  25.     return listLinks[::-1]
  26.  
  27.  
  28. class DataAdvertisement:
  29.     """
  30.  
  31.    Scraping important data from advertisement
  32.  
  33.    """
  34.     def __init__(self, link):
  35.         self.link = link
  36.         self.headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko)'
  37.                                       ' Chrome/24.0.1312.27 Safari/537.17'}
  38.         self.req = urllib.request.Request(self.link, headers=self.headers)
  39.         self.resp = urllib.request.urlopen(self.req)
  40.         self.respData = self.resp.read()
  41.         self.parsedHtml = BeautifulSoup(self.respData, "lxml")
  42.         self.value = self.parsedHtml.find_all('div', class_='value')
  43.  
  44.     def take_title(self):
  45.         title = self.value[1].text
  46.         return title
  47.  
  48.     def take_date(self):
  49.         date = self.value[2].text[0:17]
  50.         return date
  51.  
  52.     def take_cash(self):  # salary
  53.         cash = self.value[4].text
  54.         return cash
  55.  
  56.     def take_content(self):
  57.         cont = self.parsedHtml.find('div', class_='content')
  58.         content = cont.text
  59.         return content
  60.  
  61.  
  62. def add_new_records():
  63.     db_init.create_table()
  64.     db_init.clear_new()
  65.     for oneLink in search_url():
  66.         loopLink = DataAdvertisement(oneLink)
  67.         if db_init.find_record(oneLink) is None:
  68.             db_init.data_entry(loopLink.take_title(),loopLink.take_date(), loopLink.take_cash(),
  69.                                           loopLink.take_content(), oneLink)
  70.     send_email.create_msg()  # send email
  71.  
  72.  
# Script entry point: the scrape/store/notify job runs as soon as this module
# is executed (and also whenever it is imported, since there is no guard).
add_new_records()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement