#!/usr/local/bin/python
# -*- coding: utf-8 -*-

from lxml import etree
import sys, requests, traceback, math, time, shutil
import requests.packages.urllib3
from captcha_solver import CaptchaSolver
from lxml import html
from bs4 import BeautifulSoup
import re
import urllib
# Disable SSL warnings
requests.packages.urllib3.disable_warnings()

# Define global variables
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:47.0) Gecko/20100101 Firefox/47.0',
}

proxy = {
    "http": "127.0.0.1:8080",
    "https": "127.0.0.1:8080",
}
s = requests.Session()
main_domain = 'http://datawarehouse.dbd.go.th'
html_parser = etree.HTMLParser(encoding="utf-8")
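# Illustrative note: the `proxy` dict above is defined but never passed to any
# request in this script. If the intent is to route all traffic through the
# local proxy, one minimal way (an assumption, not part of the original flow)
# would be to attach it to the session:
##
## s.proxies.update(proxy)   # requests.Session applies these to every request
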
def grab_captcha():
    global headers
    # Get cookie
    url = "http://datawarehouse.dbd.go.th/bdw/home/login.html"
    s.get(url, headers=headers)

    # Grab captcha
    url = "http://datawarehouse.dbd.go.th/bdw/home/captcha.html"
    headers = {
        'Host': 'datawarehouse.dbd.go.th',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:47.0) Gecko/20100101 Firefox/47.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate, br',
    }

    # Request the captcha image and stream it to disk
    r = s.get(url, headers=headers, verify=False, stream=True)
    with open('img.png', 'wb') as out_file:
        shutil.copyfileobj(r.raw, out_file)
    solver = CaptchaSolver('antigate', api_key='39e2c871f81922e30c85108df1c8486c')  # alternate key: 2395fb864412a999e5b998ac59a5b996
    raw_data = open('img.png', 'rb').read()
    captcha_val = solver.solve_captcha(raw_data)
    return captcha_val

def login(user, password, captcha):
    global headers
    # Set url and headers
    url = "http://datawarehouse.dbd.go.th/bdw/home/authen.html"
    headers = {
        'Host': 'datawarehouse.dbd.go.th',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate, br',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Referer': 'http://datawarehouse.dbd.go.th/bdw/home/login.html',
        'Upgrade-Insecure-Requests': '1',
        'Connection': 'close'
    }

    # Build the form body; the username must be URL-encoded (e.g. '@' -> '%40')
    payload = "userName=" + urllib.quote(str(user), safe='') + "&userPassword=" + str(password) + "&captchaText=" + str(captcha)

    r = s.post(url, headers=headers, data=payload, verify=False)
    # The page contains a 'logout' link only when authentication succeeded
    if r.content.find('logout') != -1:
        print "Logged in"
        return True
    else:
        print "Not logged in"
        return False

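# Illustrative sketch: the captcha solver can misread the image, so a caller
# might retry the grab/login cycle a few times before giving up. The helper
# name and `max_tries` parameter are assumptions for illustration only.
def login_with_retry(user, password, max_tries=3):
    for attempt in range(max_tries):
        captcha = grab_captcha()
        print "Attempt", attempt + 1, "captcha:", captcha
        if login(user, password, captcha):
            return True
        time.sleep(2)  # brief pause before requesting a fresh captcha
    return False
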
def list_obj_group():
    # Scrape the business-objective group codes from the search form's dropdown
    url = "http://datawarehouse.dbd.go.th/bdw/search/search2.html"
    r = s.get(url, headers=headers, verify=False)
    tree = etree.HTML(r.content, parser=html_parser)
    result_dict = {}
    for i in tree.xpath('//select[@name="objGrpCode"]/option/text()'):
        if len(i.split(':')) == 2:
            code = i.split(':')[0]
            dest = i.split(':')[1]
            result_dict[code] = dest
    return result_dict

##def list_obj(business_group):
##    url = "http://datawarehouse.dbd.go.th/bdw/search/objective.html"
##    payload = "value=" + str(business_group)
##    headers = {
##        'Content-Type': 'application/x-www-form-urlencoded',
##        'X-Requested-With': 'XMLHttpRequest'
##    }
##    r = s.post(url, headers=headers, data=payload, verify=False)
##    tree = etree.HTML(r.content, parser=html_parser)
##    result_dict = {}
##    for i in tree.xpath('//option/text()'):
##        if len(i.split(':')) == 2:
##            code = i.split(':')[0]
##            dest = i.split(':')[1]
##            result_dict[code] = dest
##    return result_dict

def list_region():
    # Scrape the region (zone) codes from the search form's dropdown
    url = "http://datawarehouse.dbd.go.th/bdw/search/search2.html"
    r = s.get(url, headers=headers, verify=False)
    tree = etree.HTML(r.content, parser=html_parser)
    result_dict = {}
    for i in tree.xpath('//select[@name="zone"]/option'):
        if (i.xpath('@value')[0] != ''):
            code = i.xpath('@value')[0]
            dest = i.xpath('text()')[0]
            result_dict[code] = dest
    return result_dict

def list_province(region):
    # Fetch the provinces for a region via the site's AJAX endpoint
    url = "http://datawarehouse.dbd.go.th/bdw/search/province.html"
    payload = "value=" + str(region) + "&province="
    headers = {
        'Content-Type': 'application/x-www-form-urlencoded',
        'X-Requested-With': 'XMLHttpRequest'
    }
    r = s.post(url, headers=headers, data=payload, verify=False)
    tree = etree.HTML(r.content, parser=html_parser)
    result_dict = {}
    for i in tree.xpath('//option'):
        if (i.xpath('@value')[0] != ''):
            code = i.xpath('@value')[0]
            dest = i.xpath('text()')[0]
            result_dict[code] = dest
    return result_dict

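# Illustrative sketch mirroring the commented-out loop in __main__ below: the
# three lookup helpers chain together to enumerate every objective group /
# region / province combination for one fiscal year. The function name and
# `year` argument are assumptions for illustration.
def iterate_search_space(year):
    for obj_group in list_obj_group().keys():
        for region in list_region().keys():
            for province in list_province(region).keys():
                get_business_list(obj_group, region, province, year)
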
### Old
##def get_business_list(objGrpCode, obj, region, province, year):
##    total_page = -1
##    page = 1
##    url = "http://datawarehouse.dbd.go.th/bdw/search/search2.html"
##    while total_page != str(page - 1):
##        payload = 'objGrpCode={0}&submitObjCode={1}&zone={2}&province={3}&amphur=&jpTypeCode=&fiscalYear={4}&balIn09=&balIn09Max=&balIn21=&balIn21Max=&balBs11=&balBs11Max=&balBs23=&balBs23Max=&capAmt=&capAmtMax=&sortBy=JP_TNAME&search=search&currentPage={5}'.format(objGrpCode, obj, region, province, year, page)
##        print payload
##        headers = {
##            'Content-Type': 'application/x-www-form-urlencoded',
##            'X-Requested-With': 'XMLHttpRequest'
##        }
##        r = s.post(url, headers=headers, data=payload, verify=False)
##        tree = etree.HTML(r.content, parser=html_parser)
##        try:
##            total_page = tree.xpath('//*[@id="content"]/div[2]/div/div[3]/div[3]/b/text()')[0].split(' : ')[1].split(' ')[0]
##        except:
##            print "No result"
##            break
##
##        for i in tree.xpath('//table[@class="horizontal"]/tr'):
##            if (i.xpath('td[2]/a/text()') != []):
##                print i.xpath('td[2]/a/text()'), i.xpath('td[2]/a/@href')
##        print total_page, page
##        page = page + 1

def get_business_list(objGrpCode, region, province, year):
    # Page through the search results for one objective group / region /
    # province / fiscal year combination and print each company's detail link.
    total_page = -1
    page = 1
    obj = ''
    url = "http://datawarehouse.dbd.go.th/bdw/search/search2.html"
    while total_page != str(page - 1):
        payload = 'objGrpCode={0}&submitObjCode={1}&zone={2}&province={3}&amphur=&jpTypeCode=&fiscalYear={4}&balIn09=&balIn09Max=&balIn21=&balIn21Max=&balBs11=&balBs11Max=&balBs23=&balBs23Max=&capAmt=&capAmtMax=&sortBy=JP_TNAME&search=search&currentPage={5}'.format(objGrpCode, obj, region, province, year, page)
        print payload
        headers = {
            'Content-Type': 'application/x-www-form-urlencoded',
            'X-Requested-With': 'XMLHttpRequest'
        }
        r = s.post(url, headers=headers, data=payload, verify=False)
        tree = etree.HTML(r.content, parser=html_parser)
        try:
            # The pager text contains the total page count after ' : '
            total_page = tree.xpath('//*[@id="content"]/div[2]/div/div[3]/div[3]/b/text()')[0].split(' : ')[1].split(' ')[0]
        except:
            print "No result"
            break

        for i in tree.xpath('//table[@class="horizontal"]/tr'):
            if (i.xpath('td[2]/a/text()') != []):
                try:
                    ##print i.xpath('td[2]/a/text()'), i.xpath('td[2]/a/@href')
                    company_link = 'http://datawarehouse.dbd.go.th' + i.xpath('td[2]/a/@href')[0]
                    print "company link " + company_link
                    ##print company_detail(company_link)
                except:
                    print 'err'
        print total_page, page
        page = page + 1

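# Illustrative aside: the `time` module is imported above but never used. A
# polite crawl would normally pause between search requests; one simple way
# (the wrapper name and `delay_seconds` parameter are assumptions) is to space
# out repeated calls made in a loop:
def get_business_list_throttled(objGrpCode, region, province, year, delay_seconds=1.0):
    time.sleep(delay_seconds)  # wait before hitting the search endpoint again
    return get_business_list(objGrpCode, region, province, year)
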
def clear_text(text):
    # Normalise an XPath result to a UTF-8 byte string; returns "" when the
    # value is missing or cannot be encoded.
    if isinstance(text, list):
        text = text[0] if text else ""
    print text
    try:
        text = text.encode('utf-8')
    except:
        text = ""
    return text

def company_detail(company_url):
    # Fetch one company detail page and extract its fields into a list
    f = open('temp.csv', 'a')
    pageContent = s.get(company_url)
    tree = html.fromstring(pageContent.content)
    company_detail = []
    ## print pageContent.content
    ## Soup
    ## soup = BeautifulSoup(pageContent.content, "lxml")
    ## print soup.prettify()
    ## table = soup.find(class_='horizontal')
    ## list_ = []
    ## for info in table.find_all('td'):
    ##     list_.append(info.get_text())
    ## print list_
    ## print "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"

    ## for i in range(1, 14):
    ##     text = tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[' + str(i) + ']/td/text()')
    ##     try:
    ##         print text
    ##         text = text[0].encode('utf-8')
    ##         print '---'
    ##     except:
    ##         print 'eer', text
    ##         print(traceback.format_exc())
    ##         text = ''
    ##         pass
    ##     company_detail.append(text)
    ##     f.write(text)
    # Extract each field from the detail table, row by row
    company_type = tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[1]/td/text()')[0].encode('utf-8')
    company_date = re.sub('\s+', '', tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[2]/td/text()')[0])
    company_status = tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[3]/td/text()')[0].encode('utf-8')
    company_value = re.sub('\s+', '', tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[4]/td/text()')[0].encode('utf-8'))
    company_place = re.sub('\s+', '', tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[5]/td/text()')[0].encode('utf-8'))
    company_group = re.sub('\s+', '', tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[6]/td/text()')[0].encode('utf-8'))
    company_obj = re.sub('\s+', '', tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[7]/td/text()')[0].encode('utf-8'))
    company_years = filter(None, tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[8]/td/text()')[0].lstrip().split(' '))
    company_referee = []
    company_fax = ""
    company_tel = ""
    company_notice = ""
    for person in tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[9]/td/text()'):
        company_referee.append(person.replace('/', '').lstrip())
    company_referee = filter(None, company_referee)  # clear empty elements in the list
    company_relative = ""
    for person in tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[10]/td/text()'):
        company_relative += person.replace('/', '').lstrip()
    company_tel = clear_text(tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[11]/td/text()')[0])
    company_fax = clear_text(tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[12]/td/text()'))
    company_notice = clear_text(tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[13]/td/text()'))
    company_detail.extend([company_type, company_date, company_status, company_value, company_place, company_group, company_obj,
                           company_years, company_referee, company_relative, company_tel, company_fax, company_notice])
    ## print company_detail
    ## print company_fax.encode('utf-8')
    ## print company_notice.encode('utf-8')
    ## company_tel = tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[11]/td/text()')
    ## company_fax = tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[12]/td/text()')
    ## company_notice = tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[13]/td/text()')
    ## print tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[1]/td/text()')[0].encode('utf-8')
    ## print re.sub('\s+', '', tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[2]/td/text()')[0])
    ## print tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[3]/td/text()')[0].encode('utf-8')
    ## print re.sub('\s+', '', tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[4]/td/text()')[0].encode('utf-8'))
    ## print re.sub('\s+', '', tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[5]/td/text()')[0].encode('utf-8'))
    ## print re.sub('\s+', '', tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[6]/td/text()')[0].encode('utf-8'))
    ## print re.sub('\s+', '', tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[7]/td/text()')[0].encode('utf-8'))
    ## print filter(None, tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[8]/td/text()')[0].lstrip().split(' '))
    ## print company_relative
    ## print tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[6]/td/text()')[0].encode('utf-8')
    ## print tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[7]/td/text()')[0].encode('utf-8')
    ## print tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[8]/td/text()')[0].encode('utf-8')
    ## print tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[9]/td/text()')[0].encode('utf-8')
    ## print tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[10]/td/text()')[0].encode('utf-8')
    ## try:
    ##     print tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[12]/td/text()')[0].encode('utf-8')
    ## except:
    ##     print ""
    ## try:
    ##     print tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[13]/td/text()')[0].encode('utf-8')
    ## except:
    ##     print ""
    ##
    f.write('\n')
    f.close()
    return company_detail

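# Illustrative persistence sketch: company_detail() above opens temp.csv but
# only ever writes a newline (the per-field writes are commented out). One
# way to save a returned record with the stdlib csv module; the helper name
# and `path` default are assumptions for illustration.
def save_company_detail(detail, path='temp.csv'):
    import csv  # stdlib; imported locally to keep the sketch self-contained
    row = []
    for field in detail:
        if isinstance(field, list):
            field = '|'.join(field)        # flatten list fields such as the referee list
        if isinstance(field, unicode):
            field = field.encode('utf-8')  # Python 2 csv expects byte strings
        row.append(field)
    with open(path, 'ab') as out:
        csv.writer(out).writerow(row)
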
if __name__ == '__main__':
    # Example run: solve the captcha, log in, then fetch one company's details
    captcha = grab_captcha()
    print "Got captcha:", captcha
    login("hexkey6@gmail.com", "password", captcha)
    print company_detail('http://datawarehouse.dbd.go.th/bdw/est/details1.html?jpNo=0105555108647&jpTypeCode=5&t=')
    ## get_business_list("A", "C", "10", "2555")
    ## for obj_group in list_obj_group().keys():
    ##     for region in list_region().keys():
    ##         for province in list_province(region).keys():
    ##             print get_business_list(obj_group, "C", "10", "2555")
    ## print "######"
    ## print list_obj("A")
    ## print list_region().keys()
    ## print list_province("C").keys()
    ## get_business_list("F", "41000", "C", "10", "2556")
    ## get_business_list("A", "C", "10", "2557")