#!/usr/local/bin/python
# -*- coding: utf-8 -*-

from lxml import etree
import sys, requests, traceback, math, time, shutil
import requests.packages.urllib3
from captcha_solver import CaptchaSolver
from lxml import html
from bs4 import BeautifulSoup
import re
import urllib

# Disable SSL warnings
requests.packages.urllib3.disable_warnings()

# Global variables
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:47.0) Gecko/20100101 Firefox/47.0',
}

proxy = {
    "http": "127.0.0.1:8080",
    "https": "127.0.0.1:8080",
}
s = requests.Session()
main_domain = 'http://datawarehouse.dbd.go.th'
html_parser = etree.HTMLParser(encoding="utf-8")

def grab_captcha():
    global headers
    # Get a session cookie from the login page
    url = "http://datawarehouse.dbd.go.th/bdw/home/login.html"
    s.get(url, headers=headers)

    # Fetch the captcha image
    url = "http://datawarehouse.dbd.go.th/bdw/home/captcha.html"
    headers = {
        'Host': 'datawarehouse.dbd.go.th',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:47.0) Gecko/20100101 Firefox/47.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate, br',
    }

    # Stream the captcha image to disk
    r = s.get(url, headers=headers, verify=False, stream=True)
    with open('img.png', 'wb') as out_file:
        shutil.copyfileobj(r.raw, out_file)
    # Send the saved image to the antigate solving service
    solver = CaptchaSolver('antigate', api_key='39e2c871f81922e30c85108df1c8486c')
    raw_data = open('img.png', 'rb').read()
    captcha_val = solver.solve_captcha(raw_data)
    return captcha_val


def login(user, password, captcha):
    global headers
    # Set URL and headers for the authentication request
    url = "http://datawarehouse.dbd.go.th/bdw/home/authen.html"
    headers = {
        'Host': 'datawarehouse.dbd.go.th',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate, br',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Referer': 'http://datawarehouse.dbd.go.th/bdw/home/login.html',
        'Upgrade-Insecure-Requests': '1',
        'Connection': 'close'
    }

    # Use the user argument (URL-encoded) instead of a hard-coded address
    payload = "userName=" + urllib.quote(str(user)) + "&userPassword=" + str(password) + "&captchaText=" + str(captcha)

    r = s.post(url, headers=headers, data=payload, verify=False)
    # A successful login page contains a "logout" link
    if r.content.find('logout') != -1:
        print "Logged in"
        return True
    else:
        print "Not logged in"
        return False

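# Possible usage sketch (not part of the original script): captcha solving can fail,
# so one might retry grab_captcha()/login() a few times before giving up. The helper
# name and the number of attempts below are illustrative assumptions only.
def login_with_retry(user, password, attempts=3):
    for attempt in range(attempts):
        captcha = grab_captcha()
        print "Attempt", attempt + 1, "captcha:", captcha
        if login(user, password, captcha):
            return True
        time.sleep(2)  # brief pause before requesting a fresh captcha
    return False
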
def list_obj_group():
    # Return a dict mapping business objective group codes to their descriptions
    url = "http://datawarehouse.dbd.go.th/bdw/search/search2.html"
    r = s.get(url, headers=headers, verify=False)
    tree = etree.HTML(r.content, parser=html_parser)
    result_dict = {}
    for i in tree.xpath('//select[@name="objGrpCode"]/option/text()'):
        if len(i.split(':')) == 2:
            code, dest = i.split(':')
            result_dict[code] = dest
    return result_dict

##def list_obj(business_group):
##    url = "http://datawarehouse.dbd.go.th/bdw/search/objective.html"
##    payload = "value=" + str(business_group)
##    headers = {
##        'Content-Type': 'application/x-www-form-urlencoded',
##        'X-Requested-With': 'XMLHttpRequest'
##        }
##    r = s.post(url,headers=headers ,data=payload,verify=False)
##    tree = etree.HTML(r.content, parser=html_parser)
##    result_dict = {}
##    for i in tree.xpath('//option/text()'):
##        if len(i.split(':'))==2:
##            code = i.split(':')[0]
##            dest = i.split(':')[1]
##            result_dict[code] = dest
##    return result_dict

def list_region():
    # Return a dict mapping region (zone) codes to their names
    url = "http://datawarehouse.dbd.go.th/bdw/search/search2.html"
    r = s.get(url, headers=headers, verify=False)
    tree = etree.HTML(r.content, parser=html_parser)
    result_dict = {}
    for i in tree.xpath('//select[@name="zone"]/option'):
        if i.xpath('@value')[0] != '':
            code = i.xpath('@value')[0]
            dest = i.xpath('text()')[0]
            result_dict[code] = dest
    return result_dict

def list_province(region):
    # Return a dict mapping province codes to names for the given region code
    url = "http://datawarehouse.dbd.go.th/bdw/search/province.html"
    payload = "value=" + str(region) + "&province="
    headers = {
        'Content-Type': 'application/x-www-form-urlencoded',
        'X-Requested-With': 'XMLHttpRequest'
        }
    r = s.post(url, headers=headers, data=payload, verify=False)
    tree = etree.HTML(r.content, parser=html_parser)
    result_dict = {}
    for i in tree.xpath('//option'):
        if i.xpath('@value')[0] != '':
            code = i.xpath('@value')[0]
            dest = i.xpath('text()')[0]
            result_dict[code] = dest
    return result_dict

###Old
##def get_business_list(objGrpCode,obj,region,province,year):
##    total_page =-1
##    page = 1
##    url = "http://datawarehouse.dbd.go.th/bdw/search/search2.html"
##    while total_page!=str(page-1):
##        payload = 'objGrpCode={0}&submitObjCode={1}&zone={2}&province={3}&amphur=&jpTypeCode=&fiscalYear={4}&balIn09=&balIn09Max=&balIn21=&balIn21Max=&balBs11=&balBs11Max=&balBs23=&balBs23Max=&capAmt=&capAmtMax=&sortBy=JP_TNAME&search=search&currentPage={5}'.format(objGrpCode,obj,region,province,year,page)
##        print payload
##        headers = {
##            'Content-Type': 'application/x-www-form-urlencoded',
##            'X-Requested-With': 'XMLHttpRequest'
##            }
##        r = s.post(url,headers=headers ,data=payload,verify=False)
##        tree = etree.HTML(r.content, parser=html_parser)
##        try:
##            total_page = tree.xpath('//*[@id="content"]/div[2]/div/div[3]/div[3]/b/text()')[0].split(' : ')[1].split(' ')[0]
##        except:
##            print "No result"
##            break
##
##        for i in tree.xpath('//table[@class="horizontal"]/tr'):
##            if(i.xpath('td[2]/a/text()')!=[]):
##                print i.xpath('td[2]/a/text()'),i.xpath('td[2]/a/@href')
##        print total_page,page
##        page = page + 1

def get_business_list(objGrpCode, region, province, year):
    # Page through the search results and print the detail-page link of each company found
    total_page = -1
    page = 1
    obj = ''
    url = "http://datawarehouse.dbd.go.th/bdw/search/search2.html"
    while total_page != str(page - 1):
        payload = 'objGrpCode={0}&submitObjCode={1}&zone={2}&province={3}&amphur=&jpTypeCode=&fiscalYear={4}&balIn09=&balIn09Max=&balIn21=&balIn21Max=&balBs11=&balBs11Max=&balBs23=&balBs23Max=&capAmt=&capAmtMax=&sortBy=JP_TNAME&search=search&currentPage={5}'.format(objGrpCode, obj, region, province, year, page)
        print payload
        headers = {
            'Content-Type': 'application/x-www-form-urlencoded',
            'X-Requested-With': 'XMLHttpRequest'
            }
        r = s.post(url, headers=headers, data=payload, verify=False)
        tree = etree.HTML(r.content, parser=html_parser)
        try:
            # The result footer reports the total number of pages
            total_page = tree.xpath('//*[@id="content"]/div[2]/div/div[3]/div[3]/b/text()')[0].split(' : ')[1].split(' ')[0]
        except:
            print "No result"
            break

        for i in tree.xpath('//table[@class="horizontal"]/tr'):
            if i.xpath('td[2]/a/text()') != []:
                try:
                    ##print i.xpath('td[2]/a/text()'),i.xpath('td[2]/a/@href')
                    company_link = 'http://datawarehouse.dbd.go.th' + i.xpath('td[2]/a/@href')[0]
                    print "company link: " + company_link
                    ##print company_detail(company_link)
                except:
                    print 'err'
        print total_page, page
        page = page + 1


def clear_text(text):
    # Accept either an xpath result list or a single string; return UTF-8 bytes or ""
    if isinstance(text, list):
        text = text[0] if text else ""
    try:
        text = text.encode('utf-8')
    except:
        text = ""
    return text


def company_detail(company_url):
    # Fetch a company detail page and return its parsed fields as a list
    f = open('temp.csv', 'a')
    pageContent = s.get(company_url)
    tree = html.fromstring(pageContent.content)
    details = []
##    ##print pageContent.content
## Soup
##    soup = BeautifulSoup(pageContent.content,"lxml")
##    print soup.prettify()
##    table = soup.find(class_ = 'horizontal')
##    list_ = []
##    for info in  table.find_all('td'):
##        list_.append( info.get_text())
##    print list_
##    print "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"

##    for i in range (1,14):
##        text = tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr['+ str(i)+']/td/text()')
##        try:
##            print text
##            text = text[0].encode('utf-8')
##            print '---'
##        except:
##            print 'eer',text
##            print(traceback.format_exc())
##            text = ''
##            pass
##        company_detail.append(text)
##        f.write(text)
    company_type = tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[1]/td/text()')[0].encode('utf-8')
    company_date = re.sub(r'\s+', '', tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[2]/td/text()')[0])
    company_status = tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[3]/td/text()')[0].encode('utf-8')
    company_value = re.sub(r'\s+', '', tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[4]/td/text()')[0].encode('utf-8'))
    company_place = re.sub(r'\s+', '', tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[5]/td/text()')[0].encode('utf-8'))
    company_group = re.sub(r'\s+', '', tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[6]/td/text()')[0].encode('utf-8'))
    company_obj = re.sub(r'\s+', '', tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[7]/td/text()')[0].encode('utf-8'))
    company_years = filter(None, tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[8]/td/text()')[0].lstrip().split(' '))
    company_referee = []
    company_fax = ""
    company_tel = ""
    company_notice = ""
    for person in tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[9]/td/text()'):
        company_referee.append(person.replace('/', '').lstrip())
    company_referee = filter(None, company_referee)  # drop empty elements from the list
    company_relative = ""
    for person in tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[10]/td/text()'):
        company_relative += person.replace('/', '').lstrip()
    # Rows 11-13 (tel, fax, notice) may be missing, so pass the raw xpath result to clear_text()
    company_tel = clear_text(tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[11]/td/text()'))
    company_fax = clear_text(tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[12]/td/text()'))
    company_notice = clear_text(tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[13]/td/text()'))
    details.extend([company_type, company_date, company_status, company_value, company_place, company_group, company_obj, company_years,
                    company_referee, company_relative, company_tel, company_fax, company_notice])
    # Only a blank separator line is written to temp.csv here; the parsed fields
    # themselves are returned to the caller
    f.write('\n')
    f.close()
    return details

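# Possible follow-up sketch (not in the original script): the fields returned by
# company_detail() could be persisted with the csv module instead of the bare
# f.write('\n') above. save_company_row() and its output filename are assumptions.
import csv

def save_company_row(row, filename='companies.csv'):
    # Flatten list fields (e.g. company_years, company_referee) and encode any
    # unicode values so the csv writer only sees plain byte strings
    flat = []
    for field in row:
        if isinstance(field, list):
            field = '|'.join(field)
        if isinstance(field, unicode):
            field = field.encode('utf-8')
        flat.append(field)
    with open(filename, 'ab') as out:
        csv.writer(out).writerow(flat)
# e.g. save_company_row(company_detail(company_link))
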
if __name__ == '__main__':
    # Solve the captcha, log in, then fetch one company's details
    captcha = grab_captcha()
    print "Got captcha:", captcha
    login("hexkey6@gmail.com", "password", captcha)
    print company_detail('http://datawarehouse.dbd.go.th/bdw/est/details1.html?jpNo=0105555108647&jpTypeCode=5&t=')
##  get_business_list("A","C","10","2555")
##    ##for obj_group in list_obj_group().keys():
##    ##    for region in list_region().keys():
##    ##            for province in list_province(region).keys():
##    ##                print get_business_list(obj_group,"C","10","2555")
##    print "######"
    ##print list_obj("A")
    ##print list_region().keys()
    ##print list_province("C").keys()
    ##get_business_list("F","41000","C","10","2556")
    ##get_business_list("A","C","10","2557")