#!/usr/local/bin/python
# -*- coding: utf-8 -*-

from lxml import etree
import sys, requests, traceback, math, time, shutil
import requests.packages.urllib3
from captcha_solver import CaptchaSolver
from lxml import html
from bs4 import BeautifulSoup
import re
import urllib
import MySQLdb

# Disable SSL warnings (the site is fetched with verify=False throughout)
requests.packages.urllib3.disable_warnings()

# Global state shared by all requests
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:47.0) Gecko/20100101 Firefox/47.0',
}

proxy = {
    "http": "127.0.0.1:8080",
    "https": "127.0.0.1:8080",
}
s = requests.Session()
main_domain = 'http://datawarehouse.dbd.go.th'
html_parser = etree.HTMLParser(encoding="utf-8")

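## Note: `proxy` is defined but never used. A minimal sketch, if requests
## should be routed through the local intercepting proxy configured above:
##     r = s.get(main_domain, headers=headers, proxies=proxy, verify=False)
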
def grab_captcha():
    global headers
    # Get a session cookie from the login page
    url = "http://datawarehouse.dbd.go.th/bdw/home/login.html"
    s.get(url, headers=headers)

    # Fetch the captcha image
    url = "http://datawarehouse.dbd.go.th/bdw/home/captcha.html"
    headers = {
        'Host': 'datawarehouse.dbd.go.th',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:47.0) Gecko/20100101 Firefox/47.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate, br',
    }

    # Save the image to disk, then hand it to the antigate solver
    r = s.get(url, headers=headers, verify=False, stream=True)
    with open('img.png', 'wb') as out_file:
        shutil.copyfileobj(r.raw, out_file)
    solver = CaptchaSolver('antigate', api_key='39e2c871f81922e30c85108df1c8486c')
    raw_data = open('img.png', 'rb').read()
    captcha_val = solver.solve_captcha(raw_data)
    return captcha_val

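## A minimal retry sketch (assumption: the captcha_solver package raises
## CaptchaServiceError, importable from captcha_solver.error, when solving fails):
##     from captcha_solver.error import CaptchaServiceError
##     for attempt in range(3):
##         try:
##             captcha = grab_captcha()
##             break
##         except CaptchaServiceError:
##             time.sleep(5)
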
def login(user, password, captcha):
    global headers
    # Set URL and headers for the authentication POST
    url = "http://datawarehouse.dbd.go.th/bdw/home/authen.html"
    headers = {
        'Host': 'datawarehouse.dbd.go.th',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate, br',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Referer': 'http://datawarehouse.dbd.go.th/bdw/home/login.html',
        'Upgrade-Insecure-Requests': '1',
        'Connection': 'close'
    }

    # Use the `user` argument (the original hardcoded one account here),
    # URL-encoding it since the body is form-encoded by hand
    payload = "userName=" + urllib.quote(user, safe='') + "&userPassword=" + str(password) + "&captchaText=" + str(captcha)

    r = s.post(url, headers=headers, data=payload, verify=False)
    # A successful login lands on a page containing a logout link
    if r.content.find('logout') != -1:
        print "Logged in"
        return True
    else:
        print "Not logged in"
        return False

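## Note: grab_captcha() and login() both rebind the module-level `headers`
## via `global headers`, so every request made after login reuses the login
## header set. Keep that in mind when adding new endpoints.
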
def list_obj_group():
    url = "http://datawarehouse.dbd.go.th/bdw/search/search2.html"
    r = s.get(url, headers=headers, verify=False)
    tree = etree.HTML(r.content, parser=html_parser)
    result_dict = {}
    for i in tree.xpath('//select[@name="objGrpCode"]/option/text()'):
        if len(i.split(':')) == 2:
            code = i.split(':')[0]
            dest = i.split(':')[1]
            result_dict[code] = dest
    return result_dict
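## Returns a {code: description} dict parsed from the objGrpCode <select>
## options; the descriptions are Thai text served by the site, so the keys
## (e.g. 'A') are what the search payload below expects.
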
## Unused variant kept from development: fetch objectives for one business group
##def list_obj(business_group):
##    url = "http://datawarehouse.dbd.go.th/bdw/search/objective.html"
##    payload = "value=" + str(business_group)
##    headers = {
##        'Content-Type': 'application/x-www-form-urlencoded',
##        'X-Requested-With': 'XMLHttpRequest'
##    }
##    r = s.post(url, headers=headers, data=payload, verify=False)
##    tree = etree.HTML(r.content, parser=html_parser)
##    result_dict = {}
##    for i in tree.xpath('//option/text()'):
##        if len(i.split(':')) == 2:
##            code = i.split(':')[0]
##            dest = i.split(':')[1]
##            result_dict[code] = dest
##    return result_dict

def list_region():
    url = "http://datawarehouse.dbd.go.th/bdw/search/search2.html"
    r = s.get(url, headers=headers, verify=False)
    tree = etree.HTML(r.content, parser=html_parser)
    result_dict = {}
    for i in tree.xpath('//select[@name="zone"]/option'):
        if i.xpath('@value')[0] != '':
            code = i.xpath('@value')[0]
            dest = i.xpath('text()')[0]
            result_dict[code] = dest
    return result_dict

def list_province(region):
    url = "http://datawarehouse.dbd.go.th/bdw/search/province.html"
    payload = "value=" + str(region) + "&province="
    headers = {
        'Content-Type': 'application/x-www-form-urlencoded',
        'X-Requested-With': 'XMLHttpRequest'
    }
    r = s.post(url, headers=headers, data=payload, verify=False)
    tree = etree.HTML(r.content, parser=html_parser)
    result_dict = {}
    for i in tree.xpath('//option'):
        if i.xpath('@value')[0] != '':
            code = i.xpath('@value')[0]
            dest = i.xpath('text()')[0]
            result_dict[code] = dest
    return result_dict
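## Example traversal (sketch; region codes come from the live <select> values):
##     for region in list_region().keys():
##         print list_province(region)
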
def get_business_list(objGrpCode, region, province, year, cursor):
    total_page = -1
    page = 1
    obj = ''
    url = "http://datawarehouse.dbd.go.th/bdw/search/search2.html"
    while total_page != str(page - 1):
        payload = 'objGrpCode={0}&submitObjCode={1}&zone={2}&province={3}&amphur=&jpTypeCode=&fiscalYear={4}&balIn09=&balIn09Max=&balIn21=&balIn21Max=&balBs11=&balBs11Max=&balBs23=&balBs23Max=&capAmt=&capAmtMax=&sortBy=JP_TNAME&search=search&currentPage={5}'.format(objGrpCode, obj, region, province, year, page)
        print payload
        headers = {
            'Content-Type': 'application/x-www-form-urlencoded',
            'X-Requested-With': 'XMLHttpRequest'
        }
        r = s.post(url, headers=headers, data=payload, verify=False)
        tree = etree.HTML(r.content, parser=html_parser)
        try:
            total_page = tree.xpath('//*[@id="content"]/div[2]/div/div[3]/div[3]/b/text()')[0].split(' : ')[1].split(' ')[0]
        except:
            print "No result"
            break

        for i in tree.xpath('//table[@class="horizontal"]/tr'):
            if i.xpath('td[2]/a/text()') != []:
                company_link = ''  # initialized so the except clause can always print it
                try:
                    company_link = main_domain + i.xpath('td[2]/a/@href')[0]
                    insert_company_db(cursor, company_detail(company_link, year))
                except:
                    print "company link: " + company_link
                    print 'err'
        print total_page, page
        page = page + 1
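## Pagination: the result header is split on ' : ' and the first token after it
## taken as the last page number; after fetching page p the loop increments to
## p + 1 and stops once str(page - 1) equals that value, i.e. once the final
## page has been processed.
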
def clear_text(list_):
    if len(list_) > 0:
        text = ""
        for temp in list_:
            text = text + temp
    else:
        return " "
    return text.encode('utf-8')


def check(data):
    b = ""
    try:
        for a in data:
            b = b + a
        return b.encode('utf-8')
    except:
        print "error"
        return "None"


def check_int(data):
    try:
        return data[0].encode('utf-8')
    except:
        print "error"
        return "0"
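## Behavior note: check() returns the *string* "None" (not None) when it cannot
## iterate its input, and check_int() falls back to the string "0"; callers
## such as company_detail() compare against those sentinel strings.
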

def date_sql_format(date):
    date = date.split('/')
    return date[2] + "-" + date[1] + "-" + date[0]
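## Worked example: date_sql_format("31/12/2557") returns "2557-12-31". The site
## reports Buddhist-era years, which are stored unconverted.
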
def company_detail(company_url, year_query):
    pageContent = s.get(company_url)
    tree = html.fromstring(pageContent.content)
    detail = []
    company_id_name = check(tree.xpath('//*[@class="txt-violet"]/text()')[0].replace('\n', '').replace('\t', ''))
    if company_id_name == "None":
        company_id = "None"
        company_name = "None"
    else:
        # Fixed offsets into the "id : name" header string on the detail page
        company_id = company_id_name[4:17]
        company_name = company_id_name[23:]
    company_type = check(tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[1]/td/text()'))
    company_date = re.sub('\s+', '', check(tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[2]/td/text()')))
    company_date = date_sql_format(company_date)
    company_status = check(tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[3]/td/text()'))
    company_value = re.sub('\s+', '', check_int(tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[4]/td/text()'))).replace(',', '')
    company_place = check(tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[5]/td/text()'))
    company_group = check(tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[6]/td/text()'))
    company_obj = re.sub('\s+', '', check(tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[7]/td/text()')))
    company_years = filter(None, check(tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[8]/td/text()')).lstrip())
    company_referee = check(tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[9]/td/text()'))
    company_referee = company_referee.replace('\n', '').replace('/', '').replace('\t', '')
    company_relative = check(tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[10]/td/text()'))
    company_relative = company_relative.replace('\n', '').replace('/', '').replace('\t', '')
    company_tel = check(tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[11]/td/text()'))
    company_fax = check(tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[12]/td/text()'))
    company_notice = check(tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[13]/td/text()'))
    company_notice = company_notice.replace('\n', '').replace('/', '').replace('\t', '')
    detail.extend(['DEFAULT', 'CURRENT_TIMESTAMP', year_query, company_name, company_id, company_type, company_date, company_status,
                   company_value, company_place, company_group, company_obj, company_years, company_referee,
                   company_relative, company_tel, company_fax, company_notice, company_url])
    return detail
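## Positional contract: the returned list must match the column order of the
## company_record table (an id taking DEFAULT, a timestamp column taking
## CURRENT_TIMESTAMP, then the scraped fields ending with the source URL).
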
def insert_company_db(cursor, company):
    query_string = "INSERT INTO company_record VALUES("
    for data in company:
        if data == "CURRENT_TIMESTAMP" or data == "DEFAULT":
            # SQL keywords go in unquoted
            query_string = query_string + data + ","
        else:
            # Escape the value before quoting it; the original concatenated raw
            # strings, which breaks on quotes in scraped text
            query_string = query_string + "'" + MySQLdb.escape_string(data) + "'" + ","
    query_string = unicode(query_string[:-1] + ");", 'utf-8')
    cursor.execute(query_string)
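## A safer sketch using a parameterized query (assumes, as above, that the
## first two company_record columns take DEFAULT / CURRENT_TIMESTAMP):
##     placeholders = ", ".join(["%s"] * len(company[2:]))
##     cursor.execute("INSERT INTO company_record VALUES (DEFAULT, CURRENT_TIMESTAMP, " + placeholders + ")",
##                    company[2:])
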

if __name__ == '__main__':
    # Log in, then scrape one objective-group/region/province/year combination
    captcha = grab_captcha()
    print "Got captcha:", captcha
    login("hexkey6@gmail.com", "password", captcha)
    db = MySQLdb.connect(host="localhost",   # your host, usually localhost
                         user="root",        # your username
                         passwd="",          # your password
                         db="list_company",  # name of the database
                         use_unicode=True,
                         init_command='SET NAMES UTF8')
    db.set_character_set("utf8")
    cursor = db.cursor()
    #a = company_detail('http://datawarehouse.dbd.go.th/bdw/est/details/index.html?jpNo=0105550089172&jpTypeCode=5', "1234")
    #insert_company_db(cursor, a)
    get_business_list("A", "C", "10", "2557", cursor)
    db.commit()  # MySQLdb does not autocommit; without this the inserts are lost
    # print "Finish"
    ## Full crawl over every objective group, region and province:
    ##for obj_group in list_obj_group().keys():
    ##    for region in list_region().keys():
    ##        for province in list_province(region).keys():
    ##            get_business_list(obj_group, region, province, "2555", cursor)
    ##print list_obj("A")
    ##print list_region().keys()
    ##print list_province("C").keys()