Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#!/usr/local/bin/python
# -*- coding: utf-8 -*-
"""Scraper for datawarehouse.dbd.go.th (Thai DBD business data warehouse).

Logs in through an antigate-solved captcha, then pages through the
company search results. Python 2 codebase.
"""
from lxml import etree
from lxml import html

import sys
import math
import time
import shutil
import traceback

import requests
import requests.packages.urllib3

from captcha_solver import CaptchaSolver

# The site is queried with verify=False everywhere, so silence the
# InsecureRequestWarning noise up front.
requests.packages.urllib3.disable_warnings()

# --- Shared module state -------------------------------------------------
# Default request headers; NOTE: several functions reassign this global.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:47.0) Gecko/20100101 Firefox/47.0',
}

# Local intercepting proxy (e.g. Burp). Defined but not currently passed
# to any request — presumably for debugging; confirm before removing.
proxy = {
    "http": "127.0.0.1:8080",
    "https": "127.0.0.1:8080",
}

# One session so login cookies persist across all requests.
s = requests.Session()

main_domain = 'http://datawarehouse.dbd.go.th'

# Single shared parser; the site serves UTF-8.
html_parser = etree.HTMLParser(encoding="utf-8")
def grab_captcha():
    """Fetch the login-page captcha image and solve it via antigate.

    Side effects: primes the session cookie jar, writes the captcha
    image to ``img.png``, and replaces the module-level ``headers``
    dict (all later requests inherit the browser-like headers below).

    Returns:
        The solved captcha text as returned by the antigate service.
    """
    global headers
    # Hit the login page first so the session picks up its cookies;
    # the captcha is only valid for that session.
    url = "http://datawarehouse.dbd.go.th/bdw/home/login.html"
    s.get(url, headers=headers)

    # NOTE(review): this intentionally clobbers the module-level
    # headers for every subsequent call in the script.
    url = "http://datawarehouse.dbd.go.th/bdw/home/captcha.html"
    headers = {
        'Host': 'datawarehouse.dbd.go.th',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:47.0) Gecko/20100101 Firefox/47.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate, br',
    }

    # Stream the image body straight to disk without buffering it all.
    r = s.get(url, headers=headers, verify=False, stream=True)
    with open('img.png', 'wb') as out_file:
        shutil.copyfileobj(r.raw, out_file)

    # SECURITY(review): API key is hard-coded; move it to an
    # environment variable or config file.
    solver = CaptchaSolver('antigate', api_key='39e2c871f81922e30c85108df1c8486c')

    # Fix: read the image through a context manager so the file handle
    # is closed (the original leaked it via open(...).read()).
    with open('img.png', 'rb') as img_file:
        raw_data = img_file.read()
    return solver.solve_captcha(raw_data)
- def login(user,password,captcha):
- global headers
- #Set url and header
- url = "http://datawarehouse.dbd.go.th/bdw/home/authen.html"
- headers = {
- 'Host': 'datawarehouse.dbd.go.th',
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0',
- 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
- 'Accept-Language': 'en-US,en;q=0.5',
- 'Accept-Encoding': 'gzip, deflate, br',
- 'Content-Type': 'application/x-www-form-urlencoded',
- 'Referer': 'http://datawarehouse.dbd.go.th/bdw/home/login.html',
- 'Upgrade-Insecure-Requests': '1',
- 'Connection': 'close'
- }
- payload = "userName="+str("hexkey6%40gmail.com")+"&userPassword="+str(password)+"&captchaText="+str(captcha)
- r = s.post(url,headers=headers ,data=payload,verify=False)
- if r.content.find('logout')!=-1:
- print "Logged"
- return True
- else:
- print "Not Logged"
- return False
def list_obj_group():
    """Scrape the business-objective-group dropdown from the search page.

    Returns:
        Dict mapping group code -> description, parsed from options
        shaped like "CODE:DESCRIPTION".
    """
    search_url = "http://datawarehouse.dbd.go.th/bdw/search/search2.html"
    response = s.get(search_url, headers=headers, verify=False)
    doc = etree.HTML(response.content, parser=html_parser)

    groups = {}
    for label in doc.xpath('//select[@name="objGrpCode"]/option/text()'):
        parts = label.split(':')
        # Skip placeholder options that don't follow the CODE:DESC shape.
        if len(parts) == 2:
            groups[parts[0]] = parts[1]
    return groups
- ##def list_obj(business_group):
- ## url = "http://datawarehouse.dbd.go.th/bdw/search/objective.html"
- ## payload = "value=" + str(business_group)
- ## headers = {
- ## 'Content-Type': 'application/x-www-form-urlencoded',
- ## 'X-Requested-With': 'XMLHttpRequest'
- ## }
- ## r = s.post(url,headers=headers ,data=payload,verify=False)
- ## tree = etree.HTML(r.content, parser=html_parser)
- ## result_dict = {}
- ## for i in tree.xpath('//option/text()'):
- ## if len(i.split(':'))==2:
- ## code = i.split(':')[0]
- ## dest = i.split(':')[1]
- ## result_dict[code] = dest
- ## return result_dict
def list_region():
    """Scrape the region ("zone") dropdown from the search page.

    Returns:
        Dict mapping zone value -> display text, skipping the empty
        placeholder option.
    """
    search_url = "http://datawarehouse.dbd.go.th/bdw/search/search2.html"
    response = s.get(search_url, headers=headers, verify=False)
    doc = etree.HTML(response.content, parser=html_parser)

    regions = {}
    for option in doc.xpath('//select[@name="zone"]/option'):
        value = option.xpath('@value')[0]
        if value != '':
            regions[value] = option.xpath('text()')[0]
    return regions
def list_province(region):
    """Fetch the province options for one region via the site's AJAX endpoint.

    Args:
        region: zone code as returned by list_region().

    Returns:
        Dict mapping province value -> display text, skipping the empty
        placeholder option.
    """
    ajax_url = "http://datawarehouse.dbd.go.th/bdw/search/province.html"
    body = "value=" + str(region) + "&province="
    # Local dict deliberately shadows the module-level headers:
    # this endpoint expects a bare XHR-style request.
    headers = {
        'Content-Type': 'application/x-www-form-urlencoded',
        'X-Requested-With': 'XMLHttpRequest'
    }
    response = s.post(ajax_url, headers=headers, data=body, verify=False)
    doc = etree.HTML(response.content, parser=html_parser)

    provinces = {}
    for option in doc.xpath('//option'):
        value = option.xpath('@value')[0]
        if value != '':
            provinces[value] = option.xpath('text()')[0]
    return provinces
- ###Old
- ##def get_business_list(objGrpCode,obj,region,province,year):
- ## total_page =-1
- ## page = 1
- ## url = "http://datawarehouse.dbd.go.th/bdw/search/search2.html"
- ## while total_page!=str(page-1):
- ## payload = 'objGrpCode={0}&submitObjCode={1}&zone={2}&province={3}&hur=&jpTypeCode=&fiscalYear={4}&balIn09=&balIn09Max=&balIn21=&balIn21Max=&balBs11=&balBs11Max=&balBs23=&balBs23Max=&capAmt=&capAmtMax=&sortBy=JP_TNAME&search=search¤tPage={5}'.format(objGrpCode,obj,region,province,year,page)
- ## print payload
- ## headers = {
- ## 'Content-Type': 'application/x-www-form-urlencoded',
- ## 'X-Requested-With': 'XMLHttpRequest'
- ## }
- ## r = s.post(url,headers=headers ,data=payload,verify=False)
- ## tree = etree.HTML(r.content, parser=html_parser)
- ## try:
- ## total_page = tree.xpath('//*[@id="content"]/div[2]/div/div[3]/div[3]/b/text()')[0].split(' : ')[1].split(' ')[0]
- ## except:
- ## print "No result"
- ## break
- ##
- ## for i in tree.xpath('//table[@class="horizontal"]/tr'):
- ## if(i.xpath('td[2]/a/text()')!=[]):
- ## print i.xpath('td[2]/a/text()'),i.xpath('td[2]/a/@href')
- ## print total_page,page
- ## page = page + 1
- def get_business_list(objGrpCode,region,province,year):
- total_page =-1
- page = 1
- obj = ''
- url = "http://datawarehouse.dbd.go.th/bdw/search/search2.html"
- while total_page!=str(page-1):
- payload = 'objGrpCode={0}&submitObjCode={1}&zone={2}&province={3}&hur=&jpTypeCode=&fiscalYear={4}&balIn09=&balIn09Max=&balIn21=&balIn21Max=&balBs11=&balBs11Max=&balBs23=&balBs23Max=&capAmt=&capAmtMax=&sortBy=JP_TNAME&search=search¤tPage={5}'.format(objGrpCode,obj,region,province,year,page)
- print payload
- headers = {
- 'Content-Type': 'application/x-www-form-urlencoded',
- 'X-Requested-With': 'XMLHttpRequest'
- }
- r = s.post(url,headers=headers ,data=payload,verify=False)
- tree = etree.HTML(r.content, parser=html_parser)
- try:
- total_page = tree.xpath('//*[@id="content"]/div[2]/div/div[3]/div[3]/b/text()')[0].split(' : ')[1].split(' ')[0]
- except:
- print "No result"
- break
- for i in tree.xpath('//table[@class="horizontal"]/tr'):
- if(i.xpath('td[2]/a/text()')!=[]):
- ##print i.xpath('td[2]/a/text()'),i.xpath('td[2]/a/@href')
- ##print 'http://datawarehouse.dbd.go.th'+ i.xpath('td[2]/a/@href')[0]
- company_info = 'http://datawarehouse.dbd.go.th'+ i.xpath('td[2]/a/@href')[0]
- pageContent=s.get(company_info)
- tree = html.fromstring(pageContent.content)
- ##detail = tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[' +str(i) +']/td/text()')
- company_detail =[]
- for i in range (1,14):
- company_detail.append(tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr['+ str(i)+']/td/text()'))
- print total_page,page
- page = page + 1
- return company_detail.append
- if __name__ == '__main__':
- #Parsing Arguments
- captcha = grab_captcha()
- print "Got Captcha:",captcha
- login("hexkey6@gmail.com","password",captcha)
- get_business_list("A","C","10","2555")
- ##for obj_group in list_obj_group().keys():
- ## for region in list_region().keys():
- ## for province in list_province(region).keys():
- ## print get_business_list(obj_group,"C","10","2555")
- print "######"
- ##print list_obj("A")
- ##print list_region().keys()
- ##print list_province("C").keys()
- ##get_business_list("F","41000","C","10","2556")
- ##get_business_list("A","C","10","2555")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement