Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# -*- coding: UTF-8 -*-
import contextlib
import os
import shutil
import sqlite3
import sys
import time

import lxml
import requests
import scrapy
from bs4 import BeautifulSoup
#from scrapy.spiders import CrawlSpider , Rule
#from scrapy.linkextractors import LinkExtractor
from scrapy.selector import Selector
from selenium import webdriver

from sky.items import BoyClothesItem
class boyclothes(scrapy.Spider):
    """Crawl Ruten category listing pages and track product price changes.

    Listing pages are rendered with a Selenium-driven PhantomJS browser and
    parsed with BeautifulSoup. Each product found is compared, by its URL,
    against a snapshot of the ``boyclothes`` table taken when the crawl
    starts:

    * unknown product  -> yielded as a ``BoyClothesItem`` (and, unless the
      snapshot was empty, also written to the ``temp`` table);
    * known, new price -> written to ``temp``; price and flag updated in
      ``boyclothes``;
    * known, same price -> only the flag is updated.

    ``__init__`` resets every row's ``flag`` to 0 and every product seen in
    the crawl is set back to ``self.flag``, so rows still at 0 afterwards
    were not encountered on the pages crawled in this run.

    NOTE(review): yielded items are presumably inserted into ``boyclothes``
    by an item pipeline that is not part of this file -- confirm.
    """

    name = 'boyclothes'
    # NOTE(review): Scrapy's offsite filtering reads ``allowed_domains``
    # (bare host names); this ``domain`` attribute does not appear to be
    # read by anything in this file. Left unchanged for compatibility.
    domain = ['http://www.ruten.com.tw/']
    # Category listing URLs to crawl; the page number is appended to each.
    start_urls = [
        #'http://class.ruten.com.tw/category/sub00.php?c=00180001&p=',  # T-shirts
        #'http://class.ruten.com.tw/category/sub00.php?c=00180002&p=',  # shirts
        'http://class.ruten.com.tw/category/sub00.php?c=00180003&p=',  # polo shirts
        #'http://class.ruten.com.tw/category/sub00.php?c=00180035&p=',  # knitwear
        #'http://class.ruten.com.tw/category/sub00.php?c=00180004&p=',  # vests
        #'http://class.ruten.com.tw/category/sub00.php?c=00180005&p=',  # sweaters
        #'http://class.ruten.com.tw/category/sub00.php?c=00180006&p=',  # jackets / coats
        #'http://class.ruten.com.tw/category/sub00.php?c=00180007&p=',  # trousers
        #'http://class.ruten.com.tw/category/sub00.php?c=00180008&p=',  # suits
        #'http://class.ruten.com.tw/category/sub00.php?c=00180037&p=',  # underwear
        #'http://class.ruten.com.tw/category/sub00.php?c=00180039&p=',  # plus sizes
        #'http://class.ruten.com.tw/category/sub00.php?c=00180011&p=',  # performance / costume wear
        #'http://class.ruten.com.tw/category/sub00.php?c=00180012&p=',  # other menswear
        #'http://class.ruten.com.tw/category/sub00.php?c=00180013&p=',  # backpacks / briefcases
        #'http://class.ruten.com.tw/category/sub00.php?c=00180014&p=',  # jewelry and accessories
        #'http://class.ruten.com.tw/category/sub00.php?c=00180015&p=',  # lighters
        #'http://class.ruten.com.tw/category/sub00.php?c=00180016&p=',  # key rings
        #'http://class.ruten.com.tw/category/sub00.php?c=00180017&p=',  # wallets
        #'http://class.ruten.com.tw/category/sub00.php?c=00180018&p=',  # ties and suspenders
        #'http://class.ruten.com.tw/category/sub00.php?c=00180038&p=',  # pajamas
        #'http://class.ruten.com.tw/category/sub00.php?c=00180040&p=',  # leather goods and belts
        #'http://class.ruten.com.tw/category/sub00.php?c=00180020&p=',  # hats
        #'http://class.ruten.com.tw/category/sub00.php?c=00180036&p=',  # gloves
        #'http://class.ruten.com.tw/category/sub00.php?c=00180021&p=',  # wigs
        #'http://class.ruten.com.tw/category/sub00.php?c=00180022&p=',  # scarves and handkerchiefs
        #'http://class.ruten.com.tw/category/sub00.php?c=00180041&p=',  # socks
        #'http://class.ruten.com.tw/category/sub00.php?c=00180026&p=',  # other
        #'http://class.ruten.com.tw/category/sub00.php?c=000300030001&p=',  # men's sportswear
        #'http://class.ruten.com.tw/category/sub00.php?c=000100020007&p=',  # boys' clothing
        #'http://class.ruten.com.tw/category/sub00.php?c=00170001&p=',  # men's watches
        #'http://class.ruten.com.tw/category/sub00.php?c=00180033&p=',  # men's shoes
        #'http://class.ruten.com.tw/category/sub00.php?c=00120016&p=',  # men's grooming
        #'http://class.ruten.com.tw/category/sub00.php?c=00180042&p=',  # other personal accessories
        #'http://class.ruten.com.tw/category/sub00.php?c=00180043&p=',  # international purchasing agents
    ]
    urlpage = 0
    point = 0
    count = 0
    results = 0
    temp_count = 0
    db_count = 0
    # Value written to the ``flag`` column for every product seen in a crawl.
    flag = 1
    # Number of listing pages fetched per start URL. Can be overridden from
    # the command line (``-a pages_per_category=3``); Scrapy passes such
    # arguments as strings, hence the int() conversion where it is used.
    pages_per_category = 1
    HEADER = {
        #"Host": "www.ruten.com.tw/",
        #"Connection": "keep-alive",
        #"Cache-Control": "max-age=0",
        #"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "User-Agent": "Mozilla/4.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36",
        #"Referer": "http://www.zhihu.com/people/raymond-wang",
        #"Accept-Encoding": "gzip,deflate,sdch",
        #"Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.6,en;q=0.4,zh-TW;q=0.2",
    }

    DB_PATH = 'boyclothes.sqlite'
    _CREATE_MAIN_SQL = 'create table if not exists boyclothes(title varchar(50) , price varchar(10) , website varchar(100) , date varchar(100), flag bit);'
    _CREATE_TEMP_SQL = 'create table if not exists temp(title varchar(50) , price varchar(10) , website varchar(100) , date varchar(100),flag bit);'
    _INSERT_TEMP_SQL = 'insert into temp values (?,?,?,?,?);'

    def __init__(self, *args, **kwargs):
        """Create the main table if needed and reset every row's flag to 0.

        Positional and keyword arguments are forwarded to ``scrapy.Spider``
        so that spider arguments (``scrapy crawl boyclothes -a key=value``)
        keep working.

        Raises:
            sqlite3.Error: if the database cannot be opened or the table
                cannot be created. A failure of the flag reset itself is
                logged and tolerated.
        """
        super(boyclothes, self).__init__(*args, **kwargs)
        # Per-instance snapshot of the table, filled by start_requests().
        # These used to be class-level lists shared by all instances.
        self.db_title = []
        self.db_price = []
        self.db_website = []
        self.db_date = []
        self.db_flag = []
        # Created in start_requests(); released in spider_closed().
        self.driver = None

        with contextlib.closing(sqlite3.connect(self.DB_PATH)) as conn:
            conn.execute(self._CREATE_MAIN_SQL)
            print("init")
            try:
                conn.execute('UPDATE boyclothes SET flag = 0;')
                conn.commit()
                print("update the flag.")
            except sqlite3.Error:
                self.logger.exception("could not reset the flag column")
                print("not update the flag.")
        print("database close")

    def start_requests(self):
        """Snapshot the database, start the browser and emit the first request.

        Only one Scrapy request is produced; its callback, ``parse``, then
        walks every entry of ``start_urls`` itself through the Selenium
        driver.

        Yields:
            scrapy.Request: request for the first start URL.
        """
        self._load_known_items()
        if not self.start_urls:
            # Nothing to crawl; avoid an IndexError and a stray browser.
            return
        # Alternative driver kept for reference (Chrome browser):
        #self.driver = webdriver.Chrome(executable_path=os.getcwd() + r'\selenium_driver_chrome\chromedriver.exe')
        # NOTE(review): PhantomJS support is deprecated in recent Selenium
        # releases -- confirm the pinned Selenium version still provides it.
        self.driver = webdriver.PhantomJS(executable_path=os.getcwd() + r'\phantomjs-2.1.1-windows\bin\phantomjs.exe')
        first_url = self.start_urls[self.urlpage]
        self.driver.get(first_url)
        yield scrapy.Request(first_url, callback=self.parse)
        self.urlpage = self.urlpage + 1

    def _load_known_items(self):
        """Append every row of the ``boyclothes`` table to the ``db_*`` lists."""
        try:
            with contextlib.closing(sqlite3.connect(self.DB_PATH)) as conn:
                conn.execute(self._CREATE_MAIN_SQL)
                rows = conn.execute('SELECT * FROM boyclothes;').fetchall()
        except sqlite3.Error:
            self.logger.exception("could not read the boyclothes table")
            print("can't print website")
        else:
            for record in rows:
                self.db_title.append(record[0])
                self.db_price.append(record[1])
                self.db_website.append(record[2])
                self.db_date.append(record[3])
            print("success")
        print("database close")

    def parse(self, response):
        """Render each listing page and reconcile its products with the snapshot.

        Args:
            response: the Scrapy response that triggered the callback. It is
                not read; pages are fetched through ``self.driver`` instead.

        Yields:
            BoyClothesItem: one item per product whose URL is not in the
            snapshot loaded by ``start_requests``.

        Raises:
            sqlite3.Error: if one of the database writes fails.
        """
        # The page source is round-tripped through the console encoding so
        # that characters the console cannot represent are replaced instead
        # of breaking the print() calls below. stdin may be missing or have
        # no encoding when the process is not attached to a terminal.
        encoding = getattr(sys.stdin, 'encoding', None) or 'utf-8'

        # Index the snapshot once so each product is an O(1) lookup instead
        # of a scan. A URL stored in several rows keeps one price per row,
        # preserving the original row-by-row comparison.
        prices_by_website = {}
        for known_site, known_price in zip(self.db_website, self.db_price):
            prices_by_website.setdefault(known_site, []).append(known_price)
        snapshot_is_empty = not self.db_website

        last_page = int(self.pages_per_category)
        for start in self.start_urls:
            for page in range(1, last_page + 1):
                self.driver.get(start + str(page))
                html = self.driver.page_source.encode(encoding, "replace").decode(encoding)
                soup = BeautifulSoup(html, "lxml")
                for product in soup.find_all('li', {'itemtype': 'http://schema.org/Product'}):
                    link = product.find('a', {'class': 'rt-goods-list-item-name-link'})
                    price_tag = product.find('strong', {'class': 'rt-text-price'})
                    if link is None or price_tag is None or not link.get('href'):
                        # Incomplete markup: skip this entry rather than
                        # abort the whole crawl.
                        continue

                    title = link.getText()
                    price = price_tag.getText()
                    website = link['href']
                    date = time.strftime("%Y/%m/%d/%H:%M:%S")
                    row = (title, price, website, date, self.flag)

                    if snapshot_is_empty:
                        # Nothing to compare against: every product is new.
                        yield self.parse_detail(title, price, website, date, self.flag)
                        continue

                    stored_prices = prices_by_website.get(website)
                    if stored_prices is None:
                        # Not in the snapshot: log it in temp and emit it.
                        self._execute_write([
                            (self._CREATE_TEMP_SQL, ()),
                            (self._INSERT_TEMP_SQL, row),
                        ])
                        print("title = " + title)
                        print("insert into database")
                        yield self.parse_detail(title, price, website, date, self.flag)
                        continue

                    for stored_price in stored_prices:
                        if price != stored_price:
                            self._record_price_change(row)
                        else:
                            self._execute_write([
                                ('update boyclothes set flag = ? where website = ?;', (self.flag, website)),
                            ])
                            print("update flag success.")

    def _record_price_change(self, row):
        """Log a changed price in ``temp`` and update the main table.

        Args:
            row: ``(title, price, website, date, flag)`` of the product as
                currently listed.
        """
        _title, price, website, _date, flag = row
        # One transaction: the temp entry and the main-table update are
        # committed together or not at all.
        self._execute_write([
            (self._CREATE_TEMP_SQL, ()),
            (self._INSERT_TEMP_SQL, row),
            ('update boyclothes set price = ? where website = ?;', (price, website)),
            ('update boyclothes set flag = ? where website = ?;', (flag, website)),
        ])
        # NOTE(review): 'true.txt' is presumably a marker read by an external
        # process to learn that a price changed -- confirm.
        if not os.path.exists('true.txt'):
            with open('true.txt', 'w'):
                pass
        else:
            print("true exists")
        print("insert data success.")
        print("update data success.")

    def _execute_write(self, statements):
        """Run ``(sql, params)`` pairs in one transaction on a fresh connection.

        The connection is opened and closed per call so no lock is held
        while the generator in ``parse`` is suspended between yields.
        """
        with contextlib.closing(sqlite3.connect(self.DB_PATH)) as conn:
            with conn:  # commit on success, roll back on error
                for sql, params in statements:
                    conn.execute(sql, params)

    def parse_detail(self, title, price, website, date, flag):
        """Build a ``BoyClothesItem`` from the scraped fields and return it."""
        item = BoyClothesItem()
        item['title'] = title
        item['price'] = price
        item['website'] = website
        item['date'] = date
        item['flag'] = flag
        return item

    def spider_closed(self, spider):
        """Quit the Selenium driver if one is running; safe to call repeatedly."""
        driver = getattr(self, 'driver', None)
        if driver is not None:
            driver.quit()
            self.driver = None

    def closed(self, reason):
        """Scrapy's spider-close hook: release the browser process.

        ``spider_closed`` was never connected to a signal, so the driver was
        never quit; Scrapy calls ``closed`` automatically when the spider
        finishes.
        """
        self.spider_closed(self)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement