Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/local/bin/python
- # -*- coding: utf-8 -*-
- from lxml import etree
- import sys, requests, traceback, math, time,shutil
- import requests.packages.urllib3
- from captcha_solver import CaptchaSolver
- from lxml import html
- from bs4 import BeautifulSoup
- import re
- import urllib
- import MySQLdb
# Disable urllib3's SSL warnings (the scraper uses verify=False throughout).
requests.packages.urllib3.disable_warnings()
# --- Global state shared by every request helper below ---
# Default browser-like headers; NOTE: several functions rebind this global.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:47.0) Gecko/20100101 Firefox/47.0',
}
# Local intercepting proxy (e.g. Burp on 8080); defined but never passed to requests here.
proxy = {
    "http": "127.0.0.1:8080",
    "https": "127.0.0.1:8080",
}
# Single session so the login cookies persist across all scraping requests.
s = requests.Session()
main_domain = 'http://datawarehouse.dbd.go.th'
# Parser used to decode the site's UTF-8 HTML responses.
html_parser = etree.HTMLParser(encoding="utf-8")
def grab_captcha():
    """Fetch the login captcha and solve it via the antigate service.

    Side effects: primes the session cookies by visiting the login page,
    writes the captcha image to 'img.png', and rebinds the module-level
    `headers` dict (kept from the original code -- later requests read the
    rebound value).

    Returns:
        The solved captcha text from the solver service.
    """
    global headers
    # Visit the login page first so the session receives its cookies;
    # the captcha image is tied to that cookie.
    url = "http://datawarehouse.dbd.go.th/bdw/home/login.html"
    s.get(url, headers=headers)
    # Grab the captcha image.
    url = "http://datawarehouse.dbd.go.th/bdw/home/captcha.html"
    headers = {
        'Host': 'datawarehouse.dbd.go.th',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:47.0) Gecko/20100101 Firefox/47.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate, br',
    }
    # Stream the image body straight to disk.
    r = s.get(url, headers=headers, verify=False, stream=True)
    with open('img.png', 'wb') as out_file:
        shutil.copyfileobj(r.raw, out_file)
    # SECURITY NOTE: the API key is hard-coded in source; move it to
    # configuration or an environment variable.
    solver = CaptchaSolver('antigate', api_key='39e2c871f81922e30c85108df1c8486c')
    # Fix: read the image back with a context manager -- the original left
    # the file handle from open(...).read() unclosed.
    with open('img.png', 'rb') as img_file:
        raw_data = img_file.read()
    captcha_val = solver.solve_captcha(raw_data)
    return captcha_val
- def login(user,password,captcha):
- global headers
- #Set url and header
- url = "http://datawarehouse.dbd.go.th/bdw/home/authen.html"
- headers = {
- 'Host': 'datawarehouse.dbd.go.th',
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0',
- 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
- 'Accept-Language': 'en-US,en;q=0.5',
- 'Accept-Encoding': 'gzip, deflate, br',
- 'Content-Type': 'application/x-www-form-urlencoded',
- 'Referer': 'http://datawarehouse.dbd.go.th/bdw/home/login.html',
- 'Upgrade-Insecure-Requests': '1',
- 'Connection': 'close'
- }
- payload = "userName="+str("hexkey6%40gmail.com")+"&userPassword="+str(password)+"&captchaText="+str(captcha)
- r = s.post(url,headers=headers ,data=payload,verify=False)
- if r.content.find('logout')!=-1:
- print "Logged"
- return True
- else:
- print "Not Logged"
- return False
def list_obj_group():
    """Scrape the business-objective-group drop-down from the search page.

    Returns a dict mapping objective-group code -> description, parsed from
    'CODE:DESCRIPTION' option labels.
    """
    url = "http://datawarehouse.dbd.go.th/bdw/search/search2.html"
    response = s.get(url, headers=headers, verify=False)
    tree = etree.HTML(response.content, parser=html_parser)
    groups = {}
    for option_text in tree.xpath('//select[@name="objGrpCode"]/option/text()'):
        parts = option_text.split(':')
        # Only well-formed 'code:description' entries are kept.
        if len(parts) == 2:
            groups[parts[0]] = parts[1]
    return groups
- ##def list_obj(business_group):
- ## url = "http://datawarehouse.dbd.go.th/bdw/search/objective.html"
- ## payload = "value=" + str(business_group)
- ## headers = {
- ## 'Content-Type': 'application/x-www-form-urlencoded',
- ## 'X-Requested-With': 'XMLHttpRequest'
- ## }
- ## r = s.post(url,headers=headers ,data=payload,verify=False)
- ## tree = etree.HTML(r.content, parser=html_parser)
- ## result_dict = {}
- ## for i in tree.xpath('//option/text()'):
- ## if len(i.split(':'))==2:
- ## code = i.split(':')[0]
- ## dest = i.split(':')[1]
- ## result_dict[code] = dest
- ## return result_dict
def list_region():
    """Scrape the region ('zone') drop-down from the search page.

    Returns a dict mapping zone code -> display name, skipping the empty
    placeholder option.
    """
    url = "http://datawarehouse.dbd.go.th/bdw/search/search2.html"
    response = s.get(url, headers=headers, verify=False)
    tree = etree.HTML(response.content, parser=html_parser)
    regions = {}
    for option in tree.xpath('//select[@name="zone"]/option'):
        value = option.xpath('@value')[0]
        if value != '':
            regions[value] = option.xpath('text()')[0]
    return regions
def list_province(region):
    """Fetch the province options for a region via the site's AJAX endpoint.

    Args:
        region: zone code as returned by list_region().

    Returns a dict mapping province code -> display name, skipping the
    empty placeholder option.
    """
    url = "http://datawarehouse.dbd.go.th/bdw/search/province.html"
    payload = "value=" + str(region) + "&province="
    # Local headers for the AJAX call (intentionally not the module default).
    request_headers = {
        'Content-Type': 'application/x-www-form-urlencoded',
        'X-Requested-With': 'XMLHttpRequest'
    }
    response = s.post(url, headers=request_headers, data=payload, verify=False)
    tree = etree.HTML(response.content, parser=html_parser)
    provinces = {}
    for option in tree.xpath('//option'):
        value = option.xpath('@value')[0]
        if value != '':
            provinces[value] = option.xpath('text()')[0]
    return provinces
- ###Old
- ##def get_business_list(objGrpCode,obj,region,province,year):
- ## total_page =-1
- ## page = 1
- ## url = "http://datawarehouse.dbd.go.th/bdw/search/search2.html"
- ## while total_page!=str(page-1):
- ## payload = 'objGrpCode={0}&submitObjCode={1}&zone={2}&province={3}&hur=&jpTypeCode=&fiscalYear={4}&balIn09=&balIn09Max=&balIn21=&balIn21Max=&balBs11=&balBs11Max=&balBs23=&balBs23Max=&capAmt=&capAmtMax=&sortBy=JP_TNAME&search=search¤tPage={5}'.format(objGrpCode,obj,region,province,year,page)
- ## print payload
- ## headers = {
- ## 'Content-Type': 'application/x-www-form-urlencoded',
- ## 'X-Requested-With': 'XMLHttpRequest'
- ## }
- ## r = s.post(url,headers=headers ,data=payload,verify=False)
- ## tree = etree.HTML(r.content, parser=html_parser)
- ## try:
- ## total_page = tree.xpath('//*[@id="content"]/div[2]/div/div[3]/div[3]/b/text()')[0].split(' : ')[1].split(' ')[0]
- ## except:
- ## print "No result"
- ## break
- ##
- ## for i in tree.xpath('//table[@class="horizontal"]/tr'):
- ## if(i.xpath('td[2]/a/text()')!=[]):
- ## print i.xpath('td[2]/a/text()'),i.xpath('td[2]/a/@href')
- ## print total_page,page
- ## page = page + 1
- def get_business_list(objGrpCode,region,province,year,cursor):
- total_page =-1
- page = 1
- obj = ''
- url = "http://datawarehouse.dbd.go.th/bdw/search/search2.html"
- while total_page!=str(page-1):
- payload = 'objGrpCode={0}&submitObjCode={1}&zone={2}&province={3}&hur=&jpTypeCode=&fiscalYear={4}&balIn09=&balIn09Max=&balIn21=&balIn21Max=&balBs11=&balBs11Max=&balBs23=&balBs23Max=&capAmt=&capAmtMax=&sortBy=JP_TNAME&search=search¤tPage={5}'.format(objGrpCode,obj,region,province,year,page)
- print payload
- headers = {
- 'Content-Type': 'application/x-www-form-urlencoded',
- 'X-Requested-With': 'XMLHttpRequest'
- }
- r = s.post(url,headers=headers ,data=payload,verify=False)
- tree = etree.HTML(r.content, parser=html_parser)
- try:
- total_page = tree.xpath('//*[@id="content"]/div[2]/div/div[3]/div[3]/b/text()')[0].split(' : ')[1].split(' ')[0]
- except:
- print "No result"
- break
- for i in tree.xpath('//table[@class="horizontal"]/tr'):
- if(i.xpath('td[2]/a/text()')!=[]):
- try:
- ##print i.xpath('td[2]/a/text()'),i.xpath('td[2]/a/@href')
- company_link = 'http://datawarehouse.dbd.go.th' + i.xpath('td[2]/a/@href')[0]
- insert_company_db(cursor,company_detail(company_link,year))
- ##print "company link "+ company_link
- ##print company_detail(company_link)
- except:
- print "company link :" +company_link #+'http://datawarehouse.dbd.go.th' + i.xpath('td[2]/a/@href')[0]
- print 'err'
- print total_page,page
- page = page + 1
def clear_text(list_):
    """Concatenate a list of text fragments and return it UTF-8 encoded.

    Args:
        list_: sequence of text fragments (may be empty).

    Returns:
        The fragments joined and UTF-8 encoded, or the single-space
        sentinel " " when the list is empty (original contract kept).
    """
    if not list_:
        return " "
    # join() replaces the original quadratic += accumulation loop.
    return "".join(list_).encode('utf-8')
def check(data):
    """Concatenate an iterable of text and return it UTF-8 encoded.

    Args:
        data: iterable of text fragments (a string also works -- it
              concatenates to itself).

    Returns:
        The joined, UTF-8 encoded text; the literal string "None" when the
        input is not iterable / not text (original fallback contract kept).
    """
    try:
        # join() replaces the original quadratic += loop.
        return "".join(data).encode('utf-8')
    except Exception:
        # Fix: was a bare except (also caught KeyboardInterrupt/SystemExit).
        print("error")
        return "None"
def check_int(data):
    """Return the first element of `data` UTF-8 encoded, or "0" on failure.

    Args:
        data: sequence whose first element is expected to be text.

    Returns:
        data[0] UTF-8 encoded; the literal string "0" when the sequence is
        empty or the element is not text (original fallback contract kept).
    """
    try:
        return data[0].encode('utf-8')
    except Exception:
        # Fix: was a bare except (also caught KeyboardInterrupt/SystemExit).
        print("error")
        return "0"
def date_sql_format(date):
    """Rearrange a 'DD/MM/YYYY' date string into SQL's 'YYYY-MM-DD' order."""
    parts = date.split('/')
    return "{0}-{1}-{2}".format(parts[2], parts[1], parts[0])
def company_detail(company_url,year_query):
    """Scrape one company detail page into a flat list for insert_company_db.

    Args:
        company_url: absolute URL of the company detail page.
        year_query:  fiscal year string recorded alongside the row.

    Returns:
        A list whose first two entries are the SQL keywords 'DEFAULT' and
        'CURRENT_TIMESTAMP' (consumed literally by insert_company_db),
        followed by the scraped fields and the source URL.
    """
    # Fix: removed `f = open('temp.csv','a')` -- the handle was never
    # written to or closed (resource leak, dead code).
    pageContent=s.get(company_url)
    tree = html.fromstring(pageContent.content)
    company_detail =[]
    # Header line holds both the registration id and the name.
    company_id_name = check(tree.xpath('//*[@class="txt-violet"]/text()')[0].replace('\n','').replace('\t',''))
    if (company_id_name == "None"):
        company_id = "None"
        company_name = "None"
    else :
        # Fixed-offset slicing of the header text: [4:17] is the 13-digit
        # registration number, [23:] the display name.
        company_id = company_id_name[4:17]
        company_name = company_id_name[23:]
    # Each tr[N] below is one labelled row of the detail table.
    company_type = check(tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[1]/td/text()'))
    company_date = re.sub('\s+','',check(tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[2]/td/text()')))
    company_date = date_sql_format(company_date)
    company_status = check(tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[3]/td/text()'))
    company_value = re.sub('\s+','',check_int(tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[4]/td/text()'))).replace(',','')
    company_place = check(tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[5]/td/text()'))
    company_group = check(tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[6]/td/text()'))
    company_obj = re.sub('\s+','',check(tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[7]/td/text()')))
    company_years = filter(None,check(tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[8]/td/text()')).lstrip())
    company_referee = ""
    company_fax = ""
    company_tel = ""
    company_notice = ""
    company_relative = ""
    company_referee = check(tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[9]/td/text()'))
    company_referee = company_referee.replace('\n','').replace('/','').replace('\t','')
    company_relative = check(tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[10]/td/text()'))
    company_relative = company_relative.replace('\n','').replace('/','').replace('\t','')
    company_tel = check(tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[11]/td/text()'))
    company_fax = check( tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[12]/td/text()'))
    company_notice = check(tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[13]/td/text()'))
    company_notice = company_notice.replace('\n','').replace('/','').replace('\t','')
    company_detail.extend(['DEFAULT','CURRENT_TIMESTAMP',year_query,company_name,company_id,company_type,company_date,company_status,
                           company_value,company_place,company_group,company_obj,company_years,company_referee,
                           company_relative,company_tel,company_fax,company_notice,company_url])
    return company_detail
def insert_company_db(cursor,company):
    """Insert one scraped company row into company_record.

    Args:
        cursor:  open MySQL cursor (DB-API 2, %s paramstyle).
        company: flat value list from company_detail(); the literal strings
                 'DEFAULT' and 'CURRENT_TIMESTAMP' are emitted as SQL
                 keywords, every other entry is bound as a parameter.

    Fix: the original concatenated quoted values directly into the SQL
    string (injection-prone, and broken for any value containing a quote);
    it now uses parameterized execution.
    """
    placeholders = []
    params = []
    for data in company:
        if data == "CURRENT_TIMESTAMP" or data == "DEFAULT":
            # SQL keywords must appear literally, not as bound values.
            placeholders.append(data)
        else:
            placeholders.append("%s")
            params.append(data)
    querry_string = "INSERT INTO company_record VALUES(" + ",".join(placeholders) + ");"
    # The driver performs quoting/escaping of every bound parameter.
    cursor.execute(querry_string, params)
- if __name__ == '__main__':
- #Parsing Arguments
- captcha = grab_captcha()
- print "Got Captcha:",captcha
- login("hexkey6@gmail.com","password",captcha)
- db = MySQLdb.connect(host="localhost", # your host, usually localhost
- user="root", # your username
- passwd="", # your password
- db="list_company",use_unicode=True,
- init_command='SET NAMES UTF8') # name of the data base
- db.set_character_set("utf8")
- cursor = db.cursor()
- #a = company_detail('http://datawarehouse.dbd.go.th/bdw/est/details/index.html?jpNo=0105550089172&jpTypeCode=5',"1234")
- #insert_company_db(cursor,a)
- print get_business_list("A","C","10","2557",cursor)
- # print "Finish"
- ## for obj_group in list_obj_group().keys():
- ## for region in list_region().keys():
- ## for province in list_province(region).keys():
- ## print get_business_list(obj_group,"C","10","2555",cursor)
- #### print "######"
- ##print list_obj("A")
- ##print list_region().keys()
- ##print list_province("C").keys()
- ##get_business_list("F","41000","C","10","2556")
- ##get_business_list("A","C","10","2557")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement