Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/python
- # -*- coding: utf-8 -*-
- import MySQLdb
- import ConfigParser
- import threading
- import time
- import lxml.etree as etree
- import urllib
- import pycurl
- from StringIO import StringIO
- import subprocess
- import lxml.html as HT
- import re
def ConfigSectionMap(section):
    """Return the named config section as an option -> value dict.

    Options that fail to load are reported and stored as None rather than
    aborting the whole run.
    Relies on module global: Config (a ConfigParser instance).
    """
    section_map = {}
    options = Config.options(section)
    for option in options:
        try:
            section_map[option] = Config.get(section, option)
        except ConfigParser.Error:
            # Narrowed from a bare "except:" so KeyboardInterrupt/SystemExit
            # are no longer swallowed; only config lookup errors are tolerated.
            print("exception on %s!" % option)
            section_map[option] = None
    return section_map
- def get_html_for_all_new_url():
- cursor.execute("""SELECT
- id,
- IdWebsite,
- CheckNumber,
- Url,
- status
- FROM url_check
- where status = 'New'""")
- allUrls = cursor.fetchall()
- url = "http://www.sheboygancadillac.com/new-cars-sheboygan-wi?model=ATS&_trc=db&_no_memcached=1"
- crl.setopt(pycurl.URL, url)
- crl.setopt(pycurl.COOKIEFILE, igniteCookies)
- crl.setopt(pycurl.WRITEFUNCTION, returnHtml.write)
- crl.setopt(pycurl.FOLLOWLOCATION, 1)
- crl.setopt(pycurl.MAXREDIRS, 5)
- crl.perform()
- for url in allUrls:
- crl.setopt(pycurl.URL, str(url["Url"]))
- crl.setopt(pycurl.COOKIEFILE, igniteCookies)
- crl.setopt(pycurl.WRITEFUNCTION, returnHtml.write)
- crl.setopt(pycurl.FOLLOWLOCATION, 1)
- crl.setopt(pycurl.MAXREDIRS, 5)
- crl.perform()
- content1 = str(returnHtml.getvalue())
- print content1
- #print str(url["Url"])
- tree = etree.fromstring(content1, parser=etree.HTMLParser())
- ##print HT.tostring(tree)
- contentnav = tree.xpath(".//div[@class='sqlTrace']")
- div1 = contentnav[0]
- sqlFull = div1.xpath(".//div[@style='display:none']")[0]
- sqlFull_str = re.sub('<[^>]*>', '', HT.tostring(sqlFull)).replace('\n', '').replace(' ', '')
- print sqlFull_str
- sqlStatement = HT.tostring(div1.xpath(".//div[@class='sqlQuery']")[0])
- sqlStatement = sqlStatement[:sqlStatement.find("""<span class="sqlParam">""")]
- sqlStatement_str = re.sub('<[^>]*>', '', sqlStatement).replace('\n', '').replace(' ', '')
- print sqlStatement_str
- sqlTime = re.sub('<[^>]*>', '', HT.tostring(div1.xpath(".//div[@class='sqlTime']")[0])).replace(' ', '')
- sqlTime_int = (sqlTime[12:].replace('.', ',').replace('\n', ''))
- print sqlTime_int
- exit(1)
def check_all_sites():
    """Rebuild url_check from scratch for every site not yet checked.

    Relies on module global: cursor.
    """
    # Start from a clean slate before the crawl repopulates the table.
    cursor.execute("Delete from url_check")
    cursor.execute("""SELECT
                          Id,
                          dateAdded,
                          siteUrl,
                          status
                      FROM new_sites
                      where checkcount is null""")
    pending_sites = cursor.fetchall()
    run_number = 1
    get_urls_from_site(pending_sites, run_number)
    cursor.execute("Commit")
def get_urls_from_site(allSites, checkcount):
    """For every dealer site, look up its 'new' (?t=n) and 'used' (?t=u)
    inventory URL templates and crawl their filter boxes into url_check.

    allSites   -- rows with at least a 'siteUrl' key
    checkcount -- check-run number recorded with each discovered URL
    Relies on module globals: cursorRead, parseLevel.
    """
    for site in allSites:
        link = str(site['siteUrl']).strip()
        # Normalize to the exact form "http://domain/".
        if not link.startswith('http://'):
            link = "http://" + link
        if not link.endswith('/'):
            link += "/"
        # Bare domain: drop scheme, trailing slash and a leading "www.".
        clearLink = link[len('http://'):-1]
        if clearLink.startswith('www.'):
            clearLink = clearLink[len('www.'):]
        # Parameterized query (was passed unescaped -- SQL injection risk).
        cursorRead.execute("Select id from dealer_websites where domain = %s ",
                           (clearLink,))
        idWebsite = int(cursorRead.fetchone()['id'])
        usedLinkTemplate = _inventory_template(idWebsite, 'u')
        newLinkTemplate = _inventory_template(idWebsite, 'n')
        get_links_filter_box(link + newLinkTemplate + "/",
                             parseLevel, ',', 0, idWebsite, checkcount)
        get_links_filter_box(link + usedLinkTemplate + "/",
                             parseLevel, ',', 0, idWebsite, checkcount)


def _inventory_template(idWebsite, inventory_type):
    """Return the seo-url template (alias 'inventory') for filter ?t=<type>.

    Extracted to deduplicate the two identical lookups; query is now
    parameterized instead of %-interpolated (SQL injection risk).
    """
    cursorRead.execute("""SELECT template FROM dealer_website_seo_urls
                          WHERE websiteid = %s
                            AND alias = 'inventory'
                            AND filter = %s
                            AND legacy = 0""",
                       (idWebsite, '?t=' + inventory_type))
    return cursorRead.fetchone()['template']
def get_links_filter_box(link, lvl, path, autoinc, idWebsite, checkcount):
    """Recursively crawl the "filters-box" widgets of an inventory page and
    record every filter link into url_check.

    link       -- page URL to fetch
    lvl        -- remaining recursion depth (decremented on entry)
    path       -- comma-separated trail of filter-box headings already seen,
                  used to avoid revisiting the same filter category
    autoinc    -- recursion counter (incremented, otherwise unused)
    idWebsite  -- dealer_websites.id the URLs belong to
    checkcount -- check-run number stored with each row
    Relies on module global: cursor.
    """
    lvl = lvl - 1
    autoinc = autoinc + 1
    page = etree.HTML(urllib.urlopen(link).read())
    for div in page.xpath('//div[@class="filters-box"]'):
        path_curr = div.xpath('.//h5/text()')[0]
        # Only descend into filter boxes whose heading we have not seen on
        # this path yet (NOTE(review): source indentation was lost; this
        # nesting assumes already-seen boxes are skipped -- confirm).
        if path.find(path_curr) == -1:
            path = path + "," + path_curr
            for div2 in div.xpath('.//div[@class="filter"]'):
                for a in div2.xpath(".//a"):
                    linkNew = str(a.attrib['href']).strip()
                    # Force SQL tracing and bypass memcached on the target.
                    linkNew += "&_trc=db&_no_memcached=1"
                    # Parameterized insert (was %-interpolation with a
                    # scraped href -- SQL injection risk).
                    cursor.execute("""INSERT INTO url_check
                                      (IdWebsite,
                                       CheckNumber,
                                       Url)
                                      VALUES
                                      (%s, %s, %s)""",
                                   (idWebsite, checkcount, linkNew))
                    if lvl > 1:
                        # Bug fix: the recursive call previously dropped
                        # idWebsite and passed checkcount in its slot,
                        # raising TypeError (5 args for 6 params).
                        get_links_filter_box(linkNew, lvl, path, autoinc,
                                             idWebsite, checkcount)
def authorize():
    """Log in to the ignite backend via curl, storing the session cookies
    in /tmp/igniteCookies.

    Returns True when the response body does not contain "alert", which is
    treated as a successful login.
    Relies on module globals: igniteLogin, ignitePassword, ignitePath.
    """
    command = [
        'curl', '-s',
        '-F', 'login=' + igniteLogin,
        '-F', 'pass=' + ignitePassword,
        '-b', '/tmp/igniteCookies',
        '-c', '/tmp/igniteCookies',
        "-X", "POST",
        ignitePath + "auth/login",
    ]
    process = subprocess.Popen(command, stdout=subprocess.PIPE)
    response_body = process.communicate()[0]
    return "alert" not in response_body
if __name__ == "__main__":
    # config: config.cfg must provide [Main] and [mainmysqlserver] sections.
    Config = ConfigParser.ConfigParser()
    Config.read("config.cfg")
    # Read/write connection ([Main]) -- the url_check table lives here.
    db = MySQLdb.connect(
        host=ConfigSectionMap("Main")['host'],
        user=ConfigSectionMap("Main")['user'],
        passwd=ConfigSectionMap("Main")['pass'],
        db=ConfigSectionMap("Main")['database'],
        port=int(ConfigSectionMap("Main")['port']),
        charset='utf8')
    cursor = db.cursor(MySQLdb.cursors.DictCursor)
    # Read-only connection ([mainmysqlserver]) -- dealer_websites and
    # dealer_website_seo_urls lookups.
    dbRead = MySQLdb.connect(
        host=ConfigSectionMap("mainmysqlserver")['host'],
        user=ConfigSectionMap("mainmysqlserver")['user'],
        passwd=ConfigSectionMap("mainmysqlserver")['pass'],
        db=ConfigSectionMap("mainmysqlserver")['database'],
        port=int(ConfigSectionMap("mainmysqlserver")['port']))
    cursorRead = dbRead.cursor(MySQLdb.cursors.DictCursor)
    # Crawl depth and ignite credentials -- module globals read by the
    # functions above.
    parseLevel = int(ConfigSectionMap("Main")['parselevel'])
    igniteLogin = ConfigSectionMap("Main")['ignitelogin']
    ignitePassword = ConfigSectionMap("Main")['ignitepassword']
    ignitePath = ConfigSectionMap("Main")['ignitepath']
    igniteCookies = ConfigSectionMap("Main")['ignitecookies']
    #auth -- disabled; presumably the cookie file at igniteCookies is
    # already valid (TODO confirm).
    ##authorize()
    # Shared curl handle + response buffer used by get_html_for_all_new_url().
    returnHtml = StringIO()
    crl = pycurl.Curl()
    crl.setopt(pycurl.COOKIEFILE, igniteCookies)
    crl.setopt(pycurl.URL, ignitePath)
    crl.setopt(pycurl.FOLLOWLOCATION, 1)
    crl.setopt(pycurl.WRITEFUNCTION, returnHtml.write)
    crl.setopt(pycurl.MAXREDIRS, 5)
    # Warm-up fetch of ignitePath -- NOTE(review): the response lands in
    # returnHtml and the buffer is never cleared before reuse.
    crl.perform()
    #main part
    ## check_all_sites()
    get_html_for_all_new_url()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement