# crawler_DIA_simple

# coding: utf-8

# 開始動工
'''
2018/01/15
預計改成class寫法....

2018/01/14
可批次寫入資料庫，程式大致完成
2018/01/13
解決sql插入問題
2018/01/08
重新啟動專案
2017/10/25
'''
import requests
from bs4 import BeautifulSoup
import sqlite3 as lite
import os,time, shutil, dataset

# Output directory: a "crawler_DIA" folder under the current working directory.
# os.path.join supplies the platform's path separator; the previous
# '\crawler_DIA' literal depended on an invalid escape sequence and only
# produced a sub-directory path on Windows.
workpath = os.path.join(os.getcwd(), 'crawler_DIA')

# Create the directory when it is missing (no error if it already exists).
os.makedirs(workpath, exist_ok=True)

# The database is set up before the crawler starts running.
# Database settings: file name and its full path inside the output directory.
database = 'dia.sqlite'
savetodb = os.path.join(workpath, database)

'''
#20180113 sqlite連線時會自動建立所以下面程式碼可以省略
#如果檔案不存在就建立該檔案
if not os.path.isfile(os.path.join(workpath,database)):
    makefile=open(os.path.join(workpath,database),'w')
    makefile.close()
'''

# SQL that creates the `profile` table when it does not exist yet.
# The statement is assembled from adjacent string literals, one column per
# line; only the first line ends with a real newline character.
sql1 = (
    "CREATE TABLE IF NOT EXISTS 'profile'\n"
    "('id' INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL ,"
    "'name' VARCHAR NOT NULL ,"
    "'profile_pic' TEXT NOT NULL DEFAULT (null) ,"
    "'age' INTEGER NOT NULL ,"
    "'gender' VARCHAR NOT NULL ,"
    "'appearance' TEXT,"
    "'occupation' VARCHAR,"
    "'education' VARCHAR,"
    "'religion' VARCHAR,"
    "'relationship_status' VARCHAR,"
    "'has_children' VARCHAR,"
    "'wants_children' VARCHAR,"
    "'willing_to_relocate' VARCHAR,"
    "'smoking' VARCHAR,"
    "'drinking' VARCHAR)"
)

# Initial values for every column of the `profile` table, in column order.
# `id` must be None so SQLite assigns the AUTOINCREMENT key itself.
# `name` now has a placeholder as well, so building an insert row from this
# mapping cannot raise KeyError when no name was scraped.
# NOTE: this variable shadows the builtin `dict`; the name is kept because
# other parts of the script refer to it.
dict = {
    'id': None,
    'name': 'NA',
    'profile_pic': 'NA',
    'age': 'NA',
    'gender': 'NA',
    'appearance': 'NA',
    'occupation': 'NA',
    'education': 'NA',
    'religion': 'NA',
    'relationship_status': 'NA',
    'has_children': 'NA',
    'wants_children': 'NA',
    'willing_to_relocate': 'NA',
    'smoking': 'NA',
    'drinking': 'NA',
}

# Open the database; sqlite3 creates dia.sqlite automatically when the file
# does not exist under the output directory.
conn = lite.connect(savetodb)
try:
    cur = conn.cursor()
    # Create the profile table (no-op when it already exists) and persist it.
    cur.execute(sql1)
    conn.commit()
finally:
    # Always release the connection, even when the statement fails.
    conn.close()

# Root URL of the target site.
base_url = 'https://www.dateinasia.com'

# Search filter appended to every result-page URL.
# Alternative filter (Philippines / Cebu, female, aged 18-30, sorted by last
# active); it differs from the active one only in the s= value:
#searchfor='&g=2&af=18&at=30&c=PH&ci=Cebu&s=2'
# Active filter: Philippines / Cebu, female, aged 18-30, sorted by newest members.
searchfor = '&g=2&af=18&at=30&c=PH&ci=Cebu&s=3'

# Index of the first result page to request.
pg = 0

# Page-count setting; the range below is inclusive, so num + 1 pages are queued.
num = 4

# URLs of the search-result pages to crawl, in ascending page order.
search_prefix = base_url + '/Search.aspx?'
page_list = [
    search_prefix + 'pg=' + str(page_no) + searchfor
    for page_no in range(pg, pg + num + 1)
]

# Leave the page counter one past the last queued page.
pg += len(page_list)

#測試
#print(len(page_list),page_list)

# Links to the individual user profile pages.
User_Page_Link = []
# Search-result pages that could not be fetched or were only partly parsed.
error_page = []

# Collect the user links from every search-result page
# (the original author notes that one page lists 60 users).
for get_user_link in page_list:
    try:
        # A timeout keeps one stalled connection from hanging the whole crawl.
        res = requests.get(get_user_link, timeout=30)
        # Treat HTTP error responses as failures instead of parsing them.
        res.raise_for_status()
    except requests.RequestException:
        print('error: {}'.format(get_user_link))
        error_page.append(get_user_link)
        continue

    soup = BeautifulSoup(res.content, 'html5lib')
    get_data = soup.find_all("span", {'class': 'responsive-container galleryphoto-responsive'})

    malformed_tile = False
    for link in get_data:
        # The image's alt text is used as the user name.
        img = link.find('img')
        alt_text = img.get('alt') if img is not None else None
        if not alt_text:
            # Skip only this tile; the remaining tiles on the page are still collected.
            malformed_tile = True
            continue
        UserName = alt_text.replace(' ', '+')
        UserLink = base_url + '/' + UserName
        User_Page_Link.append(UserLink)

    if malformed_tile:
        # Record the page once so it can be inspected or retried later.
        print('error: {}'.format(get_user_link))
        error_page.append(get_user_link)
#測試
#print('User_Page_Link=',str(len(User_Page_Link)),User_Page_Link)

def _parse_profile(page_soup, defaults):
    """Build one `profile` row from a parsed profile page.

    Args:
        page_soup: BeautifulSoup document of a single user profile page.
        defaults: mapping of column name to placeholder value; it is copied,
            never modified.

    Returns:
        A new dict holding the defaults overridden by the scraped values, or
        None when the page has no profile image carrying both `alt` and `src`.
    """
    # CSS-selector lookup of the profile image; when several elements match,
    # the last one is used.
    images = page_soup.select('.responsive-image-local')
    if not images:
        return None
    username = images[-1].get('alt')
    userpic = images[-1].get('src')
    if username is None or userpic is None:
        return None

    record = defaults.copy()
    record['name'] = username
    record['profile_pic'] = userpic

    # Each <dt> label is normalised into a key ("Has children:" ->
    # "has_children") and paired with the text of the node that follows it.
    for term in page_soup.find_all('dt'):
        value = getattr(term.next_sibling, 'text', None)
        if value is None:
            # No following node, or one without text: keep the default.
            continue
        key = term.text.replace(':', '').replace(' ', '_').lower()
        record[key] = value
    return record


# Profile pages that could not be fetched or parsed.
error_profile = []

# Snapshot of the default row. `dict` here is the module-level defaults
# mapping defined earlier in this script, not the builtin type. Every profile
# starts from a fresh copy so values never leak from one profile to the next.
profile_defaults = dict.copy()

# Column order of the `profile` table; must match the placeholders below.
profile_columns = (
    'id', 'name', 'profile_pic', 'age', 'gender', 'appearance',
    'occupation', 'education', 'religion', 'relationship_status',
    'has_children', 'wants_children', 'willing_to_relocate',
    'smoking', 'drinking',
)
insert_sql = "INSERT or ignore INTO profile VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)"

# Open the database connection.
con = lite.connect(savetodb)

# Measure how long the crawl takes.
time_start = time.time()

try:
    cur = con.cursor()
    for get_profile in User_Page_Link:
        record = None
        try:
            # A timeout keeps one stalled connection from hanging the crawl.
            page_html = requests.get(get_profile, timeout=30)
            page_html.raise_for_status()
        except requests.RequestException:
            pass
        else:
            page_soup = BeautifulSoup(page_html.text, 'lxml')
            record = _parse_profile(page_soup, profile_defaults)

        if record is None:
            # Record the failure and skip the insert, so no row is written
            # for a profile that could not be read.
            print('error: {}'.format(get_profile))
            error_profile.append(get_profile)
        else:
            # Write the row and commit immediately so completed profiles
            # survive a later failure.
            cur.execute(insert_sql, [record[column] for column in profile_columns])
            con.commit()

        # Pause 3 seconds after every profile.
        time.sleep(3)
finally:
    # Close the database connection even if the crawl aborts.
    con.close()

time_end = time.time()
# Print the elapsed run time in seconds.
print(time_end - time_start)