#!/usr/local/bin/python
# -*- coding: utf-8 -*-

from lxml import etree
import sys, requests, traceback, math, time, shutil
import requests.packages.urllib3
from captcha_solver import CaptchaSolver
from lxml import html
from bs4 import BeautifulSoup
import re
import urllib
# Disable SSL warnings
requests.packages.urllib3.disable_warnings()

# Define global variables
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:47.0) Gecko/20100101 Firefox/47.0',
}

proxy = {
    "http": "127.0.0.1:8080",
    "https": "127.0.0.1:8080",
}
s = requests.Session()
main_domain = 'http://datawarehouse.dbd.go.th'
html_parser = etree.HTMLParser(encoding="utf-8")
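# Illustrative note: the `proxy` dict above is defined but never passed to any
# request in this script. If the intent is to route all traffic through the
# local proxy, one minimal way (an assumption, not part of the original flow)
# would be to attach it to the session:
##
## s.proxies.update(proxy)   # requests.Session applies these to every request
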
def grab_captcha():
    global headers
    # Get cookie
    url = "http://datawarehouse.dbd.go.th/bdw/home/login.html"
    s.get(url, headers=headers)

    # Grab captcha
    url = "http://datawarehouse.dbd.go.th/bdw/home/captcha.html"
    headers = {
        'Host': 'datawarehouse.dbd.go.th',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:47.0) Gecko/20100101 Firefox/47.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate, br',
    }

    # Request the captcha image and stream it to disk
    r = s.get(url, headers=headers, verify=False, stream=True)
    with open('img.png', 'wb') as out_file:
        shutil.copyfileobj(r.raw, out_file)
    solver = CaptchaSolver('antigate', api_key='39e2c871f81922e30c85108df1c8486c')  # alternate key: 2395fb864412a999e5b998ac59a5b996
    raw_data = open('img.png', 'rb').read()
    captcha_val = solver.solve_captcha(raw_data)
    return captcha_val

def login(user, password, captcha):
    global headers
    # Set url and headers
    url = "http://datawarehouse.dbd.go.th/bdw/home/authen.html"
    headers = {
        'Host': 'datawarehouse.dbd.go.th',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate, br',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Referer': 'http://datawarehouse.dbd.go.th/bdw/home/login.html',
        'Upgrade-Insecure-Requests': '1',
        'Connection': 'close'
    }

    # Build the form body; the username must be URL-encoded (e.g. '@' -> '%40')
    payload = "userName=" + urllib.quote(str(user), safe='') + "&userPassword=" + str(password) + "&captchaText=" + str(captcha)

    r = s.post(url, headers=headers, data=payload, verify=False)
    # The page contains a 'logout' link only when authentication succeeded
    if r.content.find('logout') != -1:
        print "Logged in"
        return True
    else:
        print "Not logged in"
        return False

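# Illustrative sketch: the captcha solver can misread the image, so a caller
# might retry the grab/login cycle a few times before giving up. The helper
# name and `max_tries` parameter are assumptions for illustration only.
def login_with_retry(user, password, max_tries=3):
    for attempt in range(max_tries):
        captcha = grab_captcha()
        print "Attempt", attempt + 1, "captcha:", captcha
        if login(user, password, captcha):
            return True
        time.sleep(2)  # brief pause before requesting a fresh captcha
    return False
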
def list_obj_group():
    # Scrape the business-objective group codes from the search form's dropdown
    url = "http://datawarehouse.dbd.go.th/bdw/search/search2.html"
    r = s.get(url, headers=headers, verify=False)
    tree = etree.HTML(r.content, parser=html_parser)
    result_dict = {}
    for i in tree.xpath('//select[@name="objGrpCode"]/option/text()'):
        if len(i.split(':')) == 2:
            code = i.split(':')[0]
            dest = i.split(':')[1]
            result_dict[code] = dest
    return result_dict

##def list_obj(business_group):
##    url = "http://datawarehouse.dbd.go.th/bdw/search/objective.html"
##    payload = "value=" + str(business_group)
##    headers = {
##        'Content-Type': 'application/x-www-form-urlencoded',
##        'X-Requested-With': 'XMLHttpRequest'
##    }
##    r = s.post(url, headers=headers, data=payload, verify=False)
##    tree = etree.HTML(r.content, parser=html_parser)
##    result_dict = {}
##    for i in tree.xpath('//option/text()'):
##        if len(i.split(':')) == 2:
##            code = i.split(':')[0]
##            dest = i.split(':')[1]
##            result_dict[code] = dest
##    return result_dict

def list_region():
    # Scrape the region (zone) codes from the search form's dropdown
    url = "http://datawarehouse.dbd.go.th/bdw/search/search2.html"
    r = s.get(url, headers=headers, verify=False)
    tree = etree.HTML(r.content, parser=html_parser)
    result_dict = {}
    for i in tree.xpath('//select[@name="zone"]/option'):
        if (i.xpath('@value')[0] != ''):
            code = i.xpath('@value')[0]
            dest = i.xpath('text()')[0]
            result_dict[code] = dest
    return result_dict

def list_province(region):
    # Fetch the provinces for a region via the site's AJAX endpoint
    url = "http://datawarehouse.dbd.go.th/bdw/search/province.html"
    payload = "value=" + str(region) + "&province="
    headers = {
        'Content-Type': 'application/x-www-form-urlencoded',
        'X-Requested-With': 'XMLHttpRequest'
    }
    r = s.post(url, headers=headers, data=payload, verify=False)
    tree = etree.HTML(r.content, parser=html_parser)
    result_dict = {}
    for i in tree.xpath('//option'):
        if (i.xpath('@value')[0] != ''):
            code = i.xpath('@value')[0]
            dest = i.xpath('text()')[0]
            result_dict[code] = dest
    return result_dict

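# Illustrative sketch mirroring the commented-out loop in __main__ below: the
# three lookup helpers chain together to enumerate every objective group /
# region / province combination for one fiscal year. The function name and
# `year` argument are assumptions for illustration.
def iterate_search_space(year):
    for obj_group in list_obj_group().keys():
        for region in list_region().keys():
            for province in list_province(region).keys():
                get_business_list(obj_group, region, province, year)
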
### Old
##def get_business_list(objGrpCode, obj, region, province, year):
##    total_page = -1
##    page = 1
##    url = "http://datawarehouse.dbd.go.th/bdw/search/search2.html"
##    while total_page != str(page - 1):
##        payload = 'objGrpCode={0}&submitObjCode={1}&zone={2}&province={3}&amphur=&jpTypeCode=&fiscalYear={4}&balIn09=&balIn09Max=&balIn21=&balIn21Max=&balBs11=&balBs11Max=&balBs23=&balBs23Max=&capAmt=&capAmtMax=&sortBy=JP_TNAME&search=search&currentPage={5}'.format(objGrpCode, obj, region, province, year, page)
##        print payload
##        headers = {
##            'Content-Type': 'application/x-www-form-urlencoded',
##            'X-Requested-With': 'XMLHttpRequest'
##        }
##        r = s.post(url, headers=headers, data=payload, verify=False)
##        tree = etree.HTML(r.content, parser=html_parser)
##        try:
##            total_page = tree.xpath('//*[@id="content"]/div[2]/div/div[3]/div[3]/b/text()')[0].split(' : ')[1].split(' ')[0]
##        except:
##            print "No result"
##            break
##
##        for i in tree.xpath('//table[@class="horizontal"]/tr'):
##            if (i.xpath('td[2]/a/text()') != []):
##                print i.xpath('td[2]/a/text()'), i.xpath('td[2]/a/@href')
##        print total_page, page
##        page = page + 1

def get_business_list(objGrpCode, region, province, year):
    # Page through the search results for one objective group / region /
    # province / fiscal year combination and print each company's detail link.
    total_page = -1
    page = 1
    obj = ''
    url = "http://datawarehouse.dbd.go.th/bdw/search/search2.html"
    while total_page != str(page - 1):
        payload = 'objGrpCode={0}&submitObjCode={1}&zone={2}&province={3}&amphur=&jpTypeCode=&fiscalYear={4}&balIn09=&balIn09Max=&balIn21=&balIn21Max=&balBs11=&balBs11Max=&balBs23=&balBs23Max=&capAmt=&capAmtMax=&sortBy=JP_TNAME&search=search&currentPage={5}'.format(objGrpCode, obj, region, province, year, page)
        print payload
        headers = {
            'Content-Type': 'application/x-www-form-urlencoded',
            'X-Requested-With': 'XMLHttpRequest'
        }
        r = s.post(url, headers=headers, data=payload, verify=False)
        tree = etree.HTML(r.content, parser=html_parser)
        try:
            # The pager text contains the total page count after ' : '
            total_page = tree.xpath('//*[@id="content"]/div[2]/div/div[3]/div[3]/b/text()')[0].split(' : ')[1].split(' ')[0]
        except:
            print "No result"
            break

        for i in tree.xpath('//table[@class="horizontal"]/tr'):
            if (i.xpath('td[2]/a/text()') != []):
                try:
                    ##print i.xpath('td[2]/a/text()'), i.xpath('td[2]/a/@href')
                    company_link = 'http://datawarehouse.dbd.go.th' + i.xpath('td[2]/a/@href')[0]
                    print "company link " + company_link
                    ##print company_detail(company_link)
                except:
                    print 'err'
        print total_page, page
        page = page + 1

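# Illustrative aside: the `time` module is imported above but never used. A
# polite crawl would normally pause between search requests; one simple way
# (the wrapper name and `delay_seconds` parameter are assumptions) is to space
# out repeated calls made in a loop:
def get_business_list_throttled(objGrpCode, region, province, year, delay_seconds=1.0):
    time.sleep(delay_seconds)  # wait before hitting the search endpoint again
    return get_business_list(objGrpCode, region, province, year)
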
def clear_text(text):
    # Normalise an XPath result to a UTF-8 byte string; returns "" when the
    # value is missing or cannot be encoded.
    if isinstance(text, list):
        text = text[0] if text else ""
    print text
    try:
        text = text.encode('utf-8')
    except:
        text = ""
    return text

def company_detail(company_url):
    # Fetch one company detail page and extract its fields into a list
    f = open('temp.csv', 'a')
    pageContent = s.get(company_url)
    tree = html.fromstring(pageContent.content)
    company_detail = []
    ## print pageContent.content
    ## Soup
    ## soup = BeautifulSoup(pageContent.content, "lxml")
    ## print soup.prettify()
    ## table = soup.find(class_='horizontal')
    ## list_ = []
    ## for info in table.find_all('td'):
    ##     list_.append(info.get_text())
    ## print list_
    ## print "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"

    ## for i in range(1, 14):
    ##     text = tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[' + str(i) + ']/td/text()')
    ##     try:
    ##         print text
    ##         text = text[0].encode('utf-8')
    ##         print '---'
    ##     except:
    ##         print 'eer', text
    ##         print(traceback.format_exc())
    ##         text = ''
    ##         pass
    ##     company_detail.append(text)
    ##     f.write(text)
    # Extract each field from the detail table, row by row
    company_type = tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[1]/td/text()')[0].encode('utf-8')
    company_date = re.sub('\s+', '', tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[2]/td/text()')[0])
    company_status = tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[3]/td/text()')[0].encode('utf-8')
    company_value = re.sub('\s+', '', tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[4]/td/text()')[0].encode('utf-8'))
    company_place = re.sub('\s+', '', tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[5]/td/text()')[0].encode('utf-8'))
    company_group = re.sub('\s+', '', tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[6]/td/text()')[0].encode('utf-8'))
    company_obj = re.sub('\s+', '', tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[7]/td/text()')[0].encode('utf-8'))
    company_years = filter(None, tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[8]/td/text()')[0].lstrip().split(' '))
    company_referee = []
    company_fax = ""
    company_tel = ""
    company_notice = ""
    for person in tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[9]/td/text()'):
        company_referee.append(person.replace('/', '').lstrip())
    company_referee = filter(None, company_referee)  # clear empty elements in the list
    company_relative = ""
    for person in tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[10]/td/text()'):
        company_relative += person.replace('/', '').lstrip()
    company_tel = clear_text(tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[11]/td/text()')[0])
    company_fax = clear_text(tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[12]/td/text()'))
    company_notice = clear_text(tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[13]/td/text()'))
    company_detail.extend([company_type, company_date, company_status, company_value, company_place, company_group, company_obj,
                           company_years, company_referee, company_relative, company_tel, company_fax, company_notice])
    ## print company_detail
    ## print company_fax.encode('utf-8')
    ## print company_notice.encode('utf-8')
    ## company_tel = tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[11]/td/text()')
    ## company_fax = tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[12]/td/text()')
    ## company_notice = tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[13]/td/text()')
    ## print tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[1]/td/text()')[0].encode('utf-8')
    ## print re.sub('\s+', '', tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[2]/td/text()')[0])
    ## print tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[3]/td/text()')[0].encode('utf-8')
    ## print re.sub('\s+', '', tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[4]/td/text()')[0].encode('utf-8'))
    ## print re.sub('\s+', '', tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[5]/td/text()')[0].encode('utf-8'))
    ## print re.sub('\s+', '', tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[6]/td/text()')[0].encode('utf-8'))
    ## print re.sub('\s+', '', tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[7]/td/text()')[0].encode('utf-8'))
    ## print filter(None, tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[8]/td/text()')[0].lstrip().split(' '))
    ## print company_relative
    ## print tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[6]/td/text()')[0].encode('utf-8')
    ## print tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[7]/td/text()')[0].encode('utf-8')
    ## print tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[8]/td/text()')[0].encode('utf-8')
    ## print tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[9]/td/text()')[0].encode('utf-8')
    ## print tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[10]/td/text()')[0].encode('utf-8')
    ## try:
    ##     print tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[12]/td/text()')[0].encode('utf-8')
    ## except:
    ##     print ""
    ## try:
    ##     print tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[13]/td/text()')[0].encode('utf-8')
    ## except:
    ##     print ""
    ##
    f.write('\n')
    f.close()
    return company_detail

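# Illustrative persistence sketch: company_detail() above opens temp.csv but
# only ever writes a newline (the per-field writes are commented out). One
# way to save a returned record with the stdlib csv module; the helper name
# and `path` default are assumptions for illustration.
def save_company_detail(detail, path='temp.csv'):
    import csv  # stdlib; imported locally to keep the sketch self-contained
    row = []
    for field in detail:
        if isinstance(field, list):
            field = '|'.join(field)        # flatten list fields such as the referee list
        if isinstance(field, unicode):
            field = field.encode('utf-8')  # Python 2 csv expects byte strings
        row.append(field)
    with open(path, 'ab') as out:
        csv.writer(out).writerow(row)
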
if __name__ == '__main__':
    # Example run: solve the captcha, log in, then fetch one company's details
    captcha = grab_captcha()
    print "Got captcha:", captcha
    login("hexkey6@gmail.com", "password", captcha)
    print company_detail('http://datawarehouse.dbd.go.th/bdw/est/details1.html?jpNo=0105555108647&jpTypeCode=5&t=')
    ## get_business_list("A", "C", "10", "2555")
    ## for obj_group in list_obj_group().keys():
    ##     for region in list_region().keys():
    ##         for province in list_province(region).keys():
    ##             print get_business_list(obj_group, "C", "10", "2555")
    ## print "######"
    ## print list_obj("A")
    ## print list_region().keys()
    ## print list_province("C").keys()
    ## get_business_list("F", "41000", "C", "10", "2556")
    ## get_business_list("A", "C", "10", "2557")