Advertisement
Sorok7

Untitled

Jul 12th, 2013
119
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 6.85 KB | None | 0 0
  1. #!/usr/bin/python
  2. # -*- coding: utf-8 -*-
  3. import MySQLdb
  4. import ConfigParser
  5. import threading
  6. import time
  7. import lxml.etree as etree
  8. import urllib
  9. import pycurl
  10. from StringIO import StringIO
  11. import subprocess
  12. import lxml.html as HT
  13. import re
  14.  
  15.  
  16. def ConfigSectionMap(section):
  17.     dict1 = {}
  18.     options = Config.options(section)
  19.     for option in options:
  20.         try:
  21.             dict1[option] = Config.get(section, option)
  22.         except:
  23.             print("exception on %s!" % option)
  24.             dict1[option] = None
  25.     return dict1
  26.  
  27.  
  28. def get_html_for_all_new_url():
  29.     cursor.execute("""SELECT
  30.            id,
  31.            IdWebsite,
  32.            CheckNumber,
  33.            Url,
  34.            status
  35.        FROM url_check
  36.        where status = 'New'""")
  37.     allUrls = cursor.fetchall()
  38.     url = "http://www.sheboygancadillac.com/new-cars-sheboygan-wi?model=ATS&_trc=db&_no_memcached=1"
  39.     crl.setopt(pycurl.URL, url)
  40.     crl.setopt(pycurl.COOKIEFILE, igniteCookies)
  41.     crl.setopt(pycurl.WRITEFUNCTION, returnHtml.write)
  42.     crl.setopt(pycurl.FOLLOWLOCATION, 1)
  43.     crl.setopt(pycurl.MAXREDIRS, 5)
  44.     crl.perform()
  45.     for url in allUrls:
  46.         crl.setopt(pycurl.URL, str(url["Url"]))
  47.         crl.setopt(pycurl.COOKIEFILE, igniteCookies)
  48.         crl.setopt(pycurl.WRITEFUNCTION, returnHtml.write)
  49.         crl.setopt(pycurl.FOLLOWLOCATION, 1)
  50.         crl.setopt(pycurl.MAXREDIRS, 5)
  51.         crl.perform()
  52.         content1 = str(returnHtml.getvalue())
  53.         print content1
  54.         #print str(url["Url"])
  55.         tree = etree.fromstring(content1, parser=etree.HTMLParser())
  56.         ##print HT.tostring(tree)
  57.         contentnav = tree.xpath(".//div[@class='sqlTrace']")
  58.         div1 = contentnav[0]
  59.         sqlFull = div1.xpath(".//div[@style='display:none']")[0]
  60.         sqlFull_str = re.sub('<[^>]*>', '', HT.tostring(sqlFull)).replace('\n', '').replace('  ', '')
  61.         print sqlFull_str
  62.         sqlStatement = HT.tostring(div1.xpath(".//div[@class='sqlQuery']")[0])
  63.         sqlStatement = sqlStatement[:sqlStatement.find("""<span class="sqlParam">""")]
  64.         sqlStatement_str = re.sub('<[^>]*>', '', sqlStatement).replace('\n', '').replace('  ', '')
  65.         print sqlStatement_str
  66.         sqlTime = re.sub('<[^>]*>', '', HT.tostring(div1.xpath(".//div[@class='sqlTime']")[0])).replace('  ', '')
  67.         sqlTime_int = (sqlTime[12:].replace('.', ',').replace('\n', ''))
  68.         print sqlTime_int
  69.         exit(1)
  70.  
  71.  
  72.  
  73. def check_all_sites():
  74.     cursor.execute("Delete from url_check")
  75.     cursor.execute("""SELECT
  76.            Id,
  77.            dateAdded,
  78.            siteUrl,
  79.            status
  80.        FROM new_sites
  81.        where checkcount is null""")
  82.     allSites = cursor.fetchall()
  83.     checkcount = 1
  84.     get_urls_from_site(allSites, checkcount)
  85.     cursor.execute("Commit")
  86.  
  87. def get_urls_from_site(allSites, checkcount):
  88.     for site in allSites:
  89.         link = str(site['siteUrl']).strip()
  90.         if link.find('http://') < 0:
  91.             link = "http://" + link
  92.         if link[-1] != '/':
  93.             link += "/"
  94.         clearLink = link[7:]
  95.         clearLink = clearLink[:clearLink.__len__() - 1]
  96.         if clearLink.find('www.') >= 0:
  97.             clearLink = clearLink[4:]
  98.         cursorRead.execute("Select id from dealer_websites where domain = %s ", clearLink)
  99.         idWebsite = int(cursorRead.fetchone()['id'])
  100.         cursorRead.execute("""SELECT template FROM dealer_website_seo_urls
  101.            WHERE websiteid = %d
  102.            AND alias = 'inventory'
  103.            AND filter = '?t=%s'
  104.            AND legacy = 0
  105.             """ % (idWebsite, 'u'))
  106.         usedLinkTemplate = cursorRead.fetchone()['template']
  107.         cursorRead.execute("""SELECT template FROM dealer_website_seo_urls
  108.            WHERE websiteid = %d
  109.            AND alias = 'inventory'
  110.            AND filter = '?t=%s'
  111.            AND legacy = 0
  112.             """ % (idWebsite, 'n'))
  113.         newLinkTemplate = cursorRead.fetchone()['template']
  114.         get_links_filter_box(link + newLinkTemplate + "/",
  115.             parseLevel, ',', 0, idWebsite, checkcount)
  116.         get_links_filter_box(link + usedLinkTemplate + "/",
  117.             parseLevel, ',', 0, idWebsite, checkcount)
  118.  
  119.  
  120. def get_links_filter_box(link, lvl, path, autoinc, idWebsite, checkcount):
  121.     lvl = lvl - 1
  122.     autoinc = autoinc + 1
  123.     page = etree.HTML(urllib.urlopen(link).read())
  124.     for div in page.xpath('//div[@class="filters-box"]'):
  125.         path_curr = div.xpath('.//h5/text()')[0]
  126.         if path.find(path_curr) == -1:
  127.             path = path + "," + path_curr
  128.             for div2 in div.xpath('.//div[@class="filter"]'):
  129.                 for a in div2.xpath(".//a"):
  130.                     linkNew = str(a.attrib['href'])
  131.                     linkNew = linkNew.strip()
  132.                 linkNew += "&_trc=db&_no_memcached=1"
  133.                 cursor.execute("""INSERT INTO url_check
  134.                    (IdWebsite,
  135.                    CheckNumber,
  136.                    Url)
  137.                    VALUES
  138.                    (%d,%d,"%s") """ % (idWebsite, checkcount, linkNew))
  139.                 if lvl > 1:
  140.                     get_links_filter_box(linkNew, lvl, path, autoinc, checkcount)
  141.  
  142.  
  143. def authorize():
  144.     result = subprocess.Popen(['curl', '-s', '-F', 'login=' + igniteLogin,
  145.         '-F', 'pass=' + ignitePassword, '-b', '/tmp/igniteCookies', '-c',
  146.         '/tmp/igniteCookies', "-X", "POST", ignitePath + "auth/login"],
  147.          stdout=subprocess.PIPE).communicate()[0]
  148.     return "alert" not in result
  149.  
  150.  
  151.  
  152. if __name__ == "__main__":
  153.     #config
  154.     Config = ConfigParser.ConfigParser()
  155.     Config.read("config.cfg")
  156.     db = MySQLdb.connect(
  157.         host=ConfigSectionMap("Main")['host'],
  158.         user=ConfigSectionMap("Main")['user'],
  159.         passwd=ConfigSectionMap("Main")['pass'],
  160.         db=ConfigSectionMap("Main")['database'],
  161.         port=int(ConfigSectionMap("Main")['port']),
  162.         charset='utf8')
  163.     cursor = db.cursor(MySQLdb.cursors.DictCursor)
  164.     dbRead = MySQLdb.connect(
  165.         host=ConfigSectionMap("mainmysqlserver")['host'],
  166.         user=ConfigSectionMap("mainmysqlserver")['user'],
  167.         passwd=ConfigSectionMap("mainmysqlserver")['pass'],
  168.         db=ConfigSectionMap("mainmysqlserver")['database'],
  169.         port=int(ConfigSectionMap("mainmysqlserver")['port']))
  170.     cursorRead = dbRead.cursor(MySQLdb.cursors.DictCursor)
  171.     parseLevel = int(ConfigSectionMap("Main")['parselevel'])
  172.     igniteLogin = ConfigSectionMap("Main")['ignitelogin']
  173.     ignitePassword = ConfigSectionMap("Main")['ignitepassword']
  174.     ignitePath = ConfigSectionMap("Main")['ignitepath']
  175.     igniteCookies = ConfigSectionMap("Main")['ignitecookies']
  176.     #auth
  177.     ##authorize()
  178.     returnHtml = StringIO()
  179.     crl = pycurl.Curl()
  180.     crl.setopt(pycurl.COOKIEFILE, igniteCookies)
  181.     crl.setopt(pycurl.URL, ignitePath)
  182.     crl.setopt(pycurl.FOLLOWLOCATION, 1)
  183.     crl.setopt(pycurl.WRITEFUNCTION, returnHtml.write)
  184.     crl.setopt(pycurl.MAXREDIRS, 5)
  185.     crl.perform()
  186.     #main part
  187.     ## check_all_sites()
  188.     get_html_for_all_new_url()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement