#!/usr/local/bin/python
# -*- coding: utf-8 -*-

from lxml import etree, html
import sys, requests, traceback, math, time, shutil
import requests.packages.urllib3
from captcha_solver import CaptchaSolver

# Disable SSL warnings
requests.packages.urllib3.disable_warnings()

# Global variables
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:47.0) Gecko/20100101 Firefox/47.0',
}

# Local debugging proxy (defined but not currently passed to any request)
proxy = {
    "http": "127.0.0.1:8080",
    "https": "127.0.0.1:8080",
}

s = requests.Session()
main_domain = 'http://datawarehouse.dbd.go.th'
html_parser = etree.HTMLParser(encoding="utf-8")

def grab_captcha():
    global headers
    # Get a session cookie from the login page
    url = "http://datawarehouse.dbd.go.th/bdw/home/login.html"
    s.get(url, headers=headers)

    # Request the captcha image
    url = "http://datawarehouse.dbd.go.th/bdw/home/captcha.html"
    headers = {
        'Host': 'datawarehouse.dbd.go.th',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:47.0) Gecko/20100101 Firefox/47.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate, br',
    }

    # Stream the image to disk, then hand it to the antigate solver
    r = s.get(url, headers=headers, verify=False, stream=True)
    with open('img.png', 'wb') as out_file:
        shutil.copyfileobj(r.raw, out_file)
    solver = CaptchaSolver('antigate', api_key='39e2c871f81922e30c85108df1c8486c')  # alternate key: 2395fb864412a999e5b998ac59a5b996
    raw_data = open('img.png', 'rb').read()
    captcha_val = solver.solve_captcha(raw_data)
    return captcha_val


def login(user, password, captcha):
    global headers
    # Set url and headers
    url = "http://datawarehouse.dbd.go.th/bdw/home/authen.html"
    headers = {
        'Host': 'datawarehouse.dbd.go.th',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate, br',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Referer': 'http://datawarehouse.dbd.go.th/bdw/home/login.html',
        'Upgrade-Insecure-Requests': '1',
        'Connection': 'close'
    }

    # Build the urlencoded login form body ('@' in the user name must be encoded as %40)
    payload = "userName=" + str(user).replace('@', '%40') + "&userPassword=" + str(password) + "&captchaText=" + str(captcha)

    r = s.post(url, headers=headers, data=payload, verify=False)
    # A successful login shows a 'logout' link in the returned page
    if r.content.find('logout') != -1:
        print "Logged in"
        return True
    else:
        print "Not logged in"
        return False

def list_obj_group():
    # Scrape the business objective group options ("code:description") from the search form
    url = "http://datawarehouse.dbd.go.th/bdw/search/search2.html"
    r = s.get(url, headers=headers, verify=False)
    tree = etree.HTML(r.content, parser=html_parser)
    result_dict = {}
    for i in tree.xpath('//select[@name="objGrpCode"]/option/text()'):
        if len(i.split(':')) == 2:
            code = i.split(':')[0]
            dest = i.split(':')[1]
            result_dict[code] = dest
    return result_dict
##def list_obj(business_group):
##    url = "http://datawarehouse.dbd.go.th/bdw/search/objective.html"
##    payload = "value=" + str(business_group)
##    headers = {
##        'Content-Type': 'application/x-www-form-urlencoded',
##        'X-Requested-With': 'XMLHttpRequest'
##    }
##    r = s.post(url, headers=headers, data=payload, verify=False)
##    tree = etree.HTML(r.content, parser=html_parser)
##    result_dict = {}
##    for i in tree.xpath('//option/text()'):
##        if len(i.split(':')) == 2:
##            code = i.split(':')[0]
##            dest = i.split(':')[1]
##            result_dict[code] = dest
##    return result_dict

def list_region():
    # Scrape the region (zone) codes from the search form's dropdown
    url = "http://datawarehouse.dbd.go.th/bdw/search/search2.html"
    r = s.get(url, headers=headers, verify=False)
    tree = etree.HTML(r.content, parser=html_parser)
    result_dict = {}
    for i in tree.xpath('//select[@name="zone"]/option'):
        if i.xpath('@value')[0] != '':
            code = i.xpath('@value')[0]
            dest = i.xpath('text()')[0]
            result_dict[code] = dest
    return result_dict

def list_province(region):
    # Fetch the provinces of a region via the site's AJAX endpoint
    url = "http://datawarehouse.dbd.go.th/bdw/search/province.html"
    payload = "value=" + str(region) + "&province="
    headers = {
        'Content-Type': 'application/x-www-form-urlencoded',
        'X-Requested-With': 'XMLHttpRequest'
    }
    r = s.post(url, headers=headers, data=payload, verify=False)
    tree = etree.HTML(r.content, parser=html_parser)
    result_dict = {}
    for i in tree.xpath('//option'):
        if i.xpath('@value')[0] != '':
            code = i.xpath('@value')[0]
            dest = i.xpath('text()')[0]
            result_dict[code] = dest
    return result_dict

### Old version (kept for reference)
##def get_business_list(objGrpCode, obj, region, province, year):
##    total_page = -1
##    page = 1
##    url = "http://datawarehouse.dbd.go.th/bdw/search/search2.html"
##    while total_page != str(page - 1):
##        payload = 'objGrpCode={0}&submitObjCode={1}&zone={2}&province={3}&amphur=&jpTypeCode=&fiscalYear={4}&balIn09=&balIn09Max=&balIn21=&balIn21Max=&balBs11=&balBs11Max=&balBs23=&balBs23Max=&capAmt=&capAmtMax=&sortBy=JP_TNAME&search=search&currentPage={5}'.format(objGrpCode, obj, region, province, year, page)
##        print payload
##        headers = {
##            'Content-Type': 'application/x-www-form-urlencoded',
##            'X-Requested-With': 'XMLHttpRequest'
##        }
##        r = s.post(url, headers=headers, data=payload, verify=False)
##        tree = etree.HTML(r.content, parser=html_parser)
##        try:
##            total_page = tree.xpath('//*[@id="content"]/div[2]/div/div[3]/div[3]/b/text()')[0].split(' : ')[1].split(' ')[0]
##        except:
##            print "No result"
##            break
##
##        for i in tree.xpath('//table[@class="horizontal"]/tr'):
##            if i.xpath('td[2]/a/text()') != []:
##                print i.xpath('td[2]/a/text()'), i.xpath('td[2]/a/@href')
##        print total_page, page
##        page = page + 1

def get_business_list(objGrpCode, region, province, year):
    # Page through the search results for one objective group / region / province / fiscal year
    # and collect the detail table rows of every company found.
    total_page = -1
    page = 1
    obj = ''
    url = "http://datawarehouse.dbd.go.th/bdw/search/search2.html"
    results = []
    while total_page != str(page - 1):
        payload = 'objGrpCode={0}&submitObjCode={1}&zone={2}&province={3}&amphur=&jpTypeCode=&fiscalYear={4}&balIn09=&balIn09Max=&balIn21=&balIn21Max=&balBs11=&balBs11Max=&balBs23=&balBs23Max=&capAmt=&capAmtMax=&sortBy=JP_TNAME&search=search&currentPage={5}'.format(objGrpCode, obj, region, province, year, page)
        print payload
        headers = {
            'Content-Type': 'application/x-www-form-urlencoded',
            'X-Requested-With': 'XMLHttpRequest'
        }
        r = s.post(url, headers=headers, data=payload, verify=False)
        tree = etree.HTML(r.content, parser=html_parser)
        try:
            # The total page count is read from the result footer text
            total_page = tree.xpath('//*[@id="content"]/div[2]/div/div[3]/div[3]/b/text()')[0].split(' : ')[1].split(' ')[0]
        except:
            print "No result"
            break

        # Follow each company link in the result table and scrape its detail rows
        for row in tree.xpath('//table[@class="horizontal"]/tr'):
            if row.xpath('td[2]/a/text()') != []:
                company_info = 'http://datawarehouse.dbd.go.th' + row.xpath('td[2]/a/@href')[0]
                pageContent = s.get(company_info)
                detail_tree = html.fromstring(pageContent.content)
                company_detail = []
                for j in range(1, 14):
                    company_detail.append(detail_tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[' + str(j) + ']/td/text()'))
                results.append(company_detail)
        print total_page, page
        page = page + 1
    return results

if __name__ == '__main__':
    # Solve the login captcha, authenticate, then run one sample search
    captcha = grab_captcha()
    print "Got Captcha:", captcha
    login("hexkey6@gmail.com", "password", captcha)
    get_business_list("A", "C", "10", "2555")
    ##for obj_group in list_obj_group().keys():
    ##    for region in list_region().keys():
    ##        for province in list_province(region).keys():
    ##            print get_business_list(obj_group, "C", "10", "2555")
    print "######"
    ##print list_obj("A")
    ##print list_region().keys()
    ##print list_province("C").keys()
    ##get_business_list("F", "41000", "C", "10", "2556")
    ##get_business_list("A", "C", "10", "2555")