Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/local/bin/python
- # -*- coding: utf-8 -*-
- from lxml import etree
- import sys, requests, traceback, math, time,shutil
- import requests.packages.urllib3
- from captcha_solver import CaptchaSolver
- from lxml import html
- from bs4 import BeautifulSoup
- import re
- import urllib
- import MySQLdb
# Disable urllib3's SSL warnings (the scraper uses verify=False throughout).
requests.packages.urllib3.disable_warnings()
# --- Global state shared by every request helper below ---
# Default browser-like headers; NOTE: several functions rebind this global.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:47.0) Gecko/20100101 Firefox/47.0',
}
# Local intercepting proxy (e.g. Burp on 8080); defined but never passed to requests here.
proxy = {
    "http": "127.0.0.1:8080",
    "https": "127.0.0.1:8080",
}
# Single session so the login cookies persist across all scraping requests.
s = requests.Session()
main_domain = 'http://datawarehouse.dbd.go.th'
# Parser used to decode the site's UTF-8 HTML responses.
html_parser = etree.HTMLParser(encoding="utf-8")
def grab_captcha():
    """Fetch the login captcha and solve it via the antigate service.

    Side effects: primes the session cookies by visiting the login page,
    writes the captcha image to 'img.png', and rebinds the module-level
    `headers` dict (kept from the original code -- later requests read the
    rebound value).

    Returns:
        The solved captcha text from the solver service.
    """
    global headers
    # Visit the login page first so the session receives its cookies;
    # the captcha image is tied to that cookie.
    url = "http://datawarehouse.dbd.go.th/bdw/home/login.html"
    s.get(url, headers=headers)
    # Grab the captcha image.
    url = "http://datawarehouse.dbd.go.th/bdw/home/captcha.html"
    headers = {
        'Host': 'datawarehouse.dbd.go.th',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:47.0) Gecko/20100101 Firefox/47.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate, br',
    }
    # Stream the image body straight to disk.
    r = s.get(url, headers=headers, verify=False, stream=True)
    with open('img.png', 'wb') as out_file:
        shutil.copyfileobj(r.raw, out_file)
    # SECURITY NOTE: the API key is hard-coded in source; move it to
    # configuration or an environment variable.
    solver = CaptchaSolver('antigate', api_key='39e2c871f81922e30c85108df1c8486c')
    # Fix: read the image back with a context manager -- the original left
    # the file handle from open(...).read() unclosed.
    with open('img.png', 'rb') as img_file:
        raw_data = img_file.read()
    captcha_val = solver.solve_captcha(raw_data)
    return captcha_val
- def login(user,password,captcha):
- global headers
- #Set url and header
- url = "http://datawarehouse.dbd.go.th/bdw/home/authen.html"
- headers = {
- 'Host': 'datawarehouse.dbd.go.th',
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0',
- 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
- 'Accept-Language': 'en-US,en;q=0.5',
- 'Accept-Encoding': 'gzip, deflate, br',
- 'Content-Type': 'application/x-www-form-urlencoded',
- 'Referer': 'http://datawarehouse.dbd.go.th/bdw/home/login.html',
- 'Upgrade-Insecure-Requests': '1',
- 'Connection': 'close'
- }
- payload = "userName="+str("hexkey6%40gmail.com")+"&userPassword="+str(password)+"&captchaText="+str(captcha)
- r = s.post(url,headers=headers ,data=payload,verify=False)
- if r.content.find('logout')!=-1:
- print "Logged"
- return True
- else:
- print "Not Logged"
- return False
def list_obj_group():
    """Scrape the business-objective-group drop-down from the search page.

    Returns a dict mapping objective-group code -> description, parsed from
    'CODE:DESCRIPTION' option labels.
    """
    url = "http://datawarehouse.dbd.go.th/bdw/search/search2.html"
    response = s.get(url, headers=headers, verify=False)
    tree = etree.HTML(response.content, parser=html_parser)
    groups = {}
    for option_text in tree.xpath('//select[@name="objGrpCode"]/option/text()'):
        parts = option_text.split(':')
        # Only well-formed 'code:description' entries are kept.
        if len(parts) == 2:
            groups[parts[0]] = parts[1]
    return groups
- ##def list_obj(business_group):
- ## url = "http://datawarehouse.dbd.go.th/bdw/search/objective.html"
- ## payload = "value=" + str(business_group)
- ## headers = {
- ## 'Content-Type': 'application/x-www-form-urlencoded',
- ## 'X-Requested-With': 'XMLHttpRequest'
- ## }
- ## r = s.post(url,headers=headers ,data=payload,verify=False)
- ## tree = etree.HTML(r.content, parser=html_parser)
- ## result_dict = {}
- ## for i in tree.xpath('//option/text()'):
- ## if len(i.split(':'))==2:
- ## code = i.split(':')[0]
- ## dest = i.split(':')[1]
- ## result_dict[code] = dest
- ## return result_dict
def list_region():
    """Scrape the region ('zone') drop-down from the search page.

    Returns a dict mapping zone code -> display name, skipping the empty
    placeholder option.
    """
    url = "http://datawarehouse.dbd.go.th/bdw/search/search2.html"
    response = s.get(url, headers=headers, verify=False)
    tree = etree.HTML(response.content, parser=html_parser)
    regions = {}
    for option in tree.xpath('//select[@name="zone"]/option'):
        value = option.xpath('@value')[0]
        if value != '':
            regions[value] = option.xpath('text()')[0]
    return regions
def list_province(region):
    """Fetch the province options for a region via the site's AJAX endpoint.

    Args:
        region: zone code as returned by list_region().

    Returns a dict mapping province code -> display name, skipping the
    empty placeholder option.
    """
    url = "http://datawarehouse.dbd.go.th/bdw/search/province.html"
    payload = "value=" + str(region) + "&province="
    # Local headers for the AJAX call (intentionally not the module default).
    request_headers = {
        'Content-Type': 'application/x-www-form-urlencoded',
        'X-Requested-With': 'XMLHttpRequest'
    }
    response = s.post(url, headers=request_headers, data=payload, verify=False)
    tree = etree.HTML(response.content, parser=html_parser)
    provinces = {}
    for option in tree.xpath('//option'):
        value = option.xpath('@value')[0]
        if value != '':
            provinces[value] = option.xpath('text()')[0]
    return provinces
- ###Old
- ##def get_business_list(objGrpCode,obj,region,province,year):
- ## total_page =-1
- ## page = 1
- ## url = "http://datawarehouse.dbd.go.th/bdw/search/search2.html"
- ## while total_page!=str(page-1):
- ## payload = 'objGrpCode={0}&submitObjCode={1}&zone={2}&province={3}&hur=&jpTypeCode=&fiscalYear={4}&balIn09=&balIn09Max=&balIn21=&balIn21Max=&balBs11=&balBs11Max=&balBs23=&balBs23Max=&capAmt=&capAmtMax=&sortBy=JP_TNAME&search=search¤tPage={5}'.format(objGrpCode,obj,region,province,year,page)
- ## print payload
- ## headers = {
- ## 'Content-Type': 'application/x-www-form-urlencoded',
- ## 'X-Requested-With': 'XMLHttpRequest'
- ## }
- ## r = s.post(url,headers=headers ,data=payload,verify=False)
- ## tree = etree.HTML(r.content, parser=html_parser)
- ## try:
- ## total_page = tree.xpath('//*[@id="content"]/div[2]/div/div[3]/div[3]/b/text()')[0].split(' : ')[1].split(' ')[0]
- ## except:
- ## print "No result"
- ## break
- ##
- ## for i in tree.xpath('//table[@class="horizontal"]/tr'):
- ## if(i.xpath('td[2]/a/text()')!=[]):
- ## print i.xpath('td[2]/a/text()'),i.xpath('td[2]/a/@href')
- ## print total_page,page
- ## page = page + 1
- def get_business_list(objGrpCode,region,province,year,cursor):
- total_page =-1
- page = 1
- obj = ''
- url = "http://datawarehouse.dbd.go.th/bdw/search/search2.html"
- while total_page!=str(page-1):
- payload = 'objGrpCode={0}&submitObjCode={1}&zone={2}&province={3}&hur=&jpTypeCode=&fiscalYear={4}&balIn09=&balIn09Max=&balIn21=&balIn21Max=&balBs11=&balBs11Max=&balBs23=&balBs23Max=&capAmt=&capAmtMax=&sortBy=JP_TNAME&search=search¤tPage={5}'.format(objGrpCode,obj,region,province,year,page)
- print payload
- headers = {
- 'Content-Type': 'application/x-www-form-urlencoded',
- 'X-Requested-With': 'XMLHttpRequest'
- }
- r = s.post(url,headers=headers ,data=payload,verify=False)
- tree = etree.HTML(r.content, parser=html_parser)
- try:
- total_page = tree.xpath('//*[@id="content"]/div[2]/div/div[3]/div[3]/b/text()')[0].split(' : ')[1].split(' ')[0]
- except:
- print "No result"
- break
- for i in tree.xpath('//table[@class="horizontal"]/tr'):
- if(i.xpath('td[2]/a/text()')!=[]):
- try:
- ##print i.xpath('td[2]/a/text()'),i.xpath('td[2]/a/@href')
- company_link = 'http://datawarehouse.dbd.go.th' + i.xpath('td[2]/a/@href')[0]
- insert_company_db(cursor,company_detail(company_link,year))
- ##print "company link "+ company_link
- ##print company_detail(company_link)
- except:
- print "company link :" +company_link #+'http://datawarehouse.dbd.go.th' + i.xpath('td[2]/a/@href')[0]
- print 'err'
- print total_page,page
- page = page + 1
def clear_text(list_):
    """Concatenate a list of text fragments and return it UTF-8 encoded.

    Args:
        list_: sequence of text fragments (may be empty).

    Returns:
        The fragments joined and UTF-8 encoded, or the single-space
        sentinel " " when the list is empty (original contract kept).
    """
    if not list_:
        return " "
    # join() replaces the original quadratic += accumulation loop.
    return "".join(list_).encode('utf-8')
def check(data):
    """Concatenate an iterable of text and return it UTF-8 encoded.

    Args:
        data: iterable of text fragments (a string also works -- it
              concatenates to itself).

    Returns:
        The joined, UTF-8 encoded text; the literal string "None" when the
        input is not iterable / not text (original fallback contract kept).
    """
    try:
        # join() replaces the original quadratic += loop.
        return "".join(data).encode('utf-8')
    except Exception:
        # Fix: was a bare except (also caught KeyboardInterrupt/SystemExit).
        print("error")
        return "None"
def check_int(data):
    """Return the first element of `data` UTF-8 encoded, or "0" on failure.

    Args:
        data: sequence whose first element is expected to be text.

    Returns:
        data[0] UTF-8 encoded; the literal string "0" when the sequence is
        empty or the element is not text (original fallback contract kept).
    """
    try:
        return data[0].encode('utf-8')
    except Exception:
        # Fix: was a bare except (also caught KeyboardInterrupt/SystemExit).
        print("error")
        return "0"
def date_sql_format(date):
    """Rearrange a 'DD/MM/YYYY' date string into SQL's 'YYYY-MM-DD' order."""
    parts = date.split('/')
    return "{0}-{1}-{2}".format(parts[2], parts[1], parts[0])
def company_detail(company_url,year_query):
    """Scrape one company detail page into a flat list for insert_company_db.

    Args:
        company_url: absolute URL of the company detail page.
        year_query:  fiscal year string recorded alongside the row.

    Returns:
        A list whose first two entries are the SQL keywords 'DEFAULT' and
        'CURRENT_TIMESTAMP' (consumed literally by insert_company_db),
        followed by the scraped fields and the source URL.
    """
    # Fix: removed `f = open('temp.csv','a')` -- the handle was never
    # written to or closed (resource leak, dead code).
    pageContent=s.get(company_url)
    tree = html.fromstring(pageContent.content)
    company_detail =[]
    # Header line holds both the registration id and the name.
    company_id_name = check(tree.xpath('//*[@class="txt-violet"]/text()')[0].replace('\n','').replace('\t',''))
    if (company_id_name == "None"):
        company_id = "None"
        company_name = "None"
    else :
        # Fixed-offset slicing of the header text: [4:17] is the 13-digit
        # registration number, [23:] the display name.
        company_id = company_id_name[4:17]
        company_name = company_id_name[23:]
    # Each tr[N] below is one labelled row of the detail table.
    company_type = check(tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[1]/td/text()'))
    company_date = re.sub('\s+','',check(tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[2]/td/text()')))
    company_date = date_sql_format(company_date)
    company_status = check(tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[3]/td/text()'))
    company_value = re.sub('\s+','',check_int(tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[4]/td/text()'))).replace(',','')
    company_place = check(tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[5]/td/text()'))
    company_group = check(tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[6]/td/text()'))
    company_obj = re.sub('\s+','',check(tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[7]/td/text()')))
    company_years = filter(None,check(tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[8]/td/text()')).lstrip())
    company_referee = ""
    company_fax = ""
    company_tel = ""
    company_notice = ""
    company_relative = ""
    company_referee = check(tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[9]/td/text()'))
    company_referee = company_referee.replace('\n','').replace('/','').replace('\t','')
    company_relative = check(tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[10]/td/text()'))
    company_relative = company_relative.replace('\n','').replace('/','').replace('\t','')
    company_tel = check(tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[11]/td/text()'))
    company_fax = check( tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[12]/td/text()'))
    company_notice = check(tree.xpath('//*[@id="content"]/div[2]/div/div[2]/div[2]/table/tr[13]/td/text()'))
    company_notice = company_notice.replace('\n','').replace('/','').replace('\t','')
    company_detail.extend(['DEFAULT','CURRENT_TIMESTAMP',year_query,company_name,company_id,company_type,company_date,company_status,
                           company_value,company_place,company_group,company_obj,company_years,company_referee,
                           company_relative,company_tel,company_fax,company_notice,company_url])
    return company_detail
def insert_company_db(cursor,company):
    """Insert one scraped company row into company_record.

    Args:
        cursor:  open MySQL cursor (DB-API 2, %s paramstyle).
        company: flat value list from company_detail(); the literal strings
                 'DEFAULT' and 'CURRENT_TIMESTAMP' are emitted as SQL
                 keywords, every other entry is bound as a parameter.

    Fix: the original concatenated quoted values directly into the SQL
    string (injection-prone, and broken for any value containing a quote);
    it now uses parameterized execution.
    """
    placeholders = []
    params = []
    for data in company:
        if data == "CURRENT_TIMESTAMP" or data == "DEFAULT":
            # SQL keywords must appear literally, not as bound values.
            placeholders.append(data)
        else:
            placeholders.append("%s")
            params.append(data)
    querry_string = "INSERT INTO company_record VALUES(" + ",".join(placeholders) + ");"
    # The driver performs quoting/escaping of every bound parameter.
    cursor.execute(querry_string, params)
- if __name__ == '__main__':
- #Parsing Arguments
- captcha = grab_captcha()
- print "Got Captcha:",captcha
- login("hexkey6@gmail.com","password",captcha)
- db = MySQLdb.connect(host="localhost", # your host, usually localhost
- user="root", # your username
- passwd="", # your password
- db="list_company",use_unicode=True,
- init_command='SET NAMES UTF8') # name of the data base
- db.set_character_set("utf8")
- cursor = db.cursor()
- #a = company_detail('http://datawarehouse.dbd.go.th/bdw/est/details/index.html?jpNo=0105550089172&jpTypeCode=5',"1234")
- #insert_company_db(cursor,a)
- print get_business_list("A","C","10","2557",cursor)
- # print "Finish"
- ## for obj_group in list_obj_group().keys():
- ## for region in list_region().keys():
- ## for province in list_province(region).keys():
- ## print get_business_list(obj_group,"C","10","2555",cursor)
- #### print "######"
- ##print list_obj("A")
- ##print list_region().keys()
- ##print list_province("C").keys()
- ##get_business_list("F","41000","C","10","2556")
- ##get_business_list("A","C","10","2557")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement