Advertisement
Guest User

Untitled

a guest
Dec 17th, 2017
75
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 10.62 KB | None | 0 0
#-*- coding: UTF-8 -*-
import os
import shutil
import sqlite3
import sys
import time
from contextlib import closing

import lxml
import requests
import scrapy
from bs4 import BeautifulSoup
#from scrapy.spiders import CrawlSpider , Rule
#from scrapy.linkextractors import LinkExtractor
from scrapy.selector import Selector
from selenium import webdriver

from sky.items import BoyClothesItem
  16. class boyclothes(scrapy.Spider):
  17. name = 'boyclothes'
  18. domain = ['http://www.ruten.com.tw/']
  19. #set the url you want to search
  20. start_urls = [
  21. #'http://class.ruten.com.tw/category/sub00.php?c=00180001&p=',#T恤
  22. #'http://class.ruten.com.tw/category/sub00.php?c=00180002&p=',#襯衫
  23. 'http://class.ruten.com.tw/category/sub00.php?c=00180003&p=',#POLO衫
  24. #'http://class.ruten.com.tw/category/sub00.php?c=00180035&p=',#針織衫
  25. #'http://class.ruten.com.tw/category/sub00.php?c=00180004&p=',#背心
  26. #'http://class.ruten.com.tw/category/sub00.php?c=00180005&p=',#毛衣
  27. #'http://class.ruten.com.tw/category/sub00.php?c=00180006&p=',#外套
  28. #'http://class.ruten.com.tw/category/sub00.php?c=00180007&p=',#褲子
  29. #'http://class.ruten.com.tw/category/sub00.php?c=00180008&p=',#西裝
  30. #'http://class.ruten.com.tw/category/sub00.php?c=00180037&p=',#內衣褲
  31. #'http://class.ruten.com.tw/category/sub00.php?c=00180039&p=',#大尺寸
  32. #'http://class.ruten.com.tw/category/sub00.php?c=00180011&p=',#表演、道具服
  33. #'http://class.ruten.com.tw/category/sub00.php?c=00180012&p=',#其他男裝
  34. #'http://class.ruten.com.tw/category/sub00.php?c=00180013&p=',#背包/公事包
  35. #'http://class.ruten.com.tw/category/sub00.php?c=00180014&p=',#首飾配件
  36. #'http://class.ruten.com.tw/category/sub00.php?c=00180015&p=',#打火機
  37. #'http://class.ruten.com.tw/category/sub00.php?c=00180016&p=',#鑰匙圈
  38. #'http://class.ruten.com.tw/category/sub00.php?c=00180017&p=',#皮夾
  39. #'http://class.ruten.com.tw/category/sub00.php?c=00180018&p=',#領帶、吊帶
  40. #'http://class.ruten.com.tw/category/sub00.php?c=00180038&p=',#睡衣
  41. #'http://class.ruten.com.tw/category/sub00.php?c=00180040&p=',#皮飾、皮帶
  42. #'http://class.ruten.com.tw/category/sub00.php?c=00180020&p=',#帽子
  43. #'http://class.ruten.com.tw/category/sub00.php?c=00180036&p=',#手套
  44. #'http://class.ruten.com.tw/category/sub00.php?c=00180021&p=',#假髮
  45. #'http://class.ruten.com.tw/category/sub00.php?c=00180022&p=',#圍巾、手帕
  46. #'http://class.ruten.com.tw/category/sub00.php?c=00180041&p=',#襪子
  47. #'http://class.ruten.com.tw/category/sub00.php?c=00180026&p=',#其他
  48. #'http://class.ruten.com.tw/category/sub00.php?c=000300030001&p=',#男運動服
  49. #'http://class.ruten.com.tw/category/sub00.php?c=000100020007&p=',#男童裝
  50. #'http://class.ruten.com.tw/category/sub00.php?c=00170001&p=',#男錶
  51. #'http://class.ruten.com.tw/category/sub00.php?c=00180033&p=',#男鞋
  52. #'http://class.ruten.com.tw/category/sub00.php?c=00120016&p=',#男士保養
  53. #'http://class.ruten.com.tw/category/sub00.php?c=00180042&p=',#其他隨身配件
  54. #'http://class.ruten.com.tw/category/sub00.php?c=00180043&p=',#國際代購/代買
  55.  
  56. ]
  57. urlpage = 0
  58. db_title = []
  59. db_price = []
  60. db_website = []
  61. db_date = []
  62. db_flag = []
  63. point = 0
  64. count = 0
  65. results = 0
  66. temp_count = 0
  67. db_count = 0
  68. flag = 1
  69. HEADER={
  70. #"Host": "www.ruten.com.tw/",
  71. #"Connection": "keep-alive",
  72. #"Cache-Control": "max-age=0",
  73. #"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
  74. "User-Agent": "Mozilla/4.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36",
  75. #"Referer": "http://www.zhihu.com/people/raymond-wang",
  76. #"Accept-Encoding": "gzip,deflate,sdch",
  77. #"Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.6,en;q=0.4,zh-TW;q=0.2",
  78. }
  79.  
  80. def __init__(self):
  81. #self.driver = webdriver.Chrome(executable_path=os.getcwd() + r'\selenium_driver_chrome\chromedriver.exe') # chrome瀏覽器
  82. #self.driver = webdriver.PhantomJS(executable_path=os.getcwd() +r'\phantomjs-2.1.1-windows\bin\phantomjs.exe') #phantomjs
  83. self.conn = sqlite3.connect('boyclothes.sqlite')
  84. self.cur = self.conn.cursor()
  85. self.cur.execute('create table if not exists boyclothes(title varchar(50) , price varchar(10) , website varchar(100) , date varchar(100), flag bit);')
  86. print("init")
  87. try:
  88. self.cur.execute('UPDATE boyclothes SET flag = 0;')
  89. self.conn.commit()
  90. print("update the flag.")
  91. except:
  92. print("not update the flag.")
  93. #self.conn.close()
  94. print("database close")
  95. def start_requests(self):
  96. #self.driver = webdriver.Chrome(executable_path=os.getcwd() + r'\selenium_driver_chrome\chromedriver.exe') # chrome瀏覽器
  97. try:
  98. self.conn = sqlite3.connect('boyclothes.sqlite')
  99. self.cur = self.conn.cursor()
  100. self.cur.execute('create table if not exists boyclothes(title varchar(50) , price varchar(10) , website varchar(100) , date varchar(100), flag bit);')
  101. self.cur.execute('SELECT * FROM boyclothes;')
  102. results = self.cur.fetchall()
  103. #print(count)
  104. #self.cur.execute('SELECT * FROM boyclothes')
  105. for record in results:
  106. self.db_title.append(record[0])
  107. self.db_price.append(record[1])
  108. self.db_website.append(record[2])
  109. self.db_date.append(record[3])
  110. #print(len(self.db_website))
  111. #try:
  112. # self.cur.execute('UPDATE boyclothes SET flag = 0;')
  113. # self.conn.commit()
  114. # print("update the flag.")
  115. #except:
  116. # print("not update the flag.")
  117. print("success")
  118.  
  119.  
  120. except:
  121. print("can't print website")
  122. self.conn.close()
  123. print("database close")
  124.  
  125. self.driver = webdriver.PhantomJS(executable_path=os.getcwd() +r'\phantomjs-2.1.1-windows\bin\phantomjs.exe') #phantomjs
  126. self.driver.get(self.start_urls[self.urlpage])
  127. yield scrapy.Request(self.start_urls[self.urlpage] , callback = self.parse)
  128. self.urlpage = self.urlpage + 1
  129.  
  130. def parse(self , response):
  131. for start in self.start_urls:
  132. #self.driver.get(self.start_urls[urlpage])
  133.  
  134. for page in range(0,1):#how many pages want spider
  135. url = start + str(page+1)
  136. self.driver.get(url)
  137. #time.sleep(1)
  138. #sp = []
  139. sp = BeautifulSoup(self.driver.page_source.encode(sys.stdin.encoding, "replace").decode(sys.stdin.encoding) , "lxml")
  140. #sp = BeautifulSoup(self.driver.page_source , "lxml")
  141. page += 1
  142. #items = []
  143. items = sp.find_all('li',{'itemtype':'http://schema.org/Product'})
  144.  
  145. for item in items: #catch the product's data
  146. title = item.find('a',{'class':'rt-goods-list-item-name-link'}).getText()
  147. price = item.find('strong',{'class':'rt-text-price'}).getText()
  148. website = item.find('a',{'class':'rt-goods-list-item-name-link'})['href']
  149. date = time.strftime("%Y/%m/%d/%H:%M:%S")
  150. #print(title + price)
  151.  
  152. if len(self.db_website) == 0: #website no data , input data in database
  153. yield self.parse_detail(title ,price,website ,date,self.flag)
  154. else:
  155. for i in range(len(self.db_website)):
  156. if not (website == self.db_website[i]):#if new data not in the old database
  157. self.count = self.count + 1
  158. #find the different item
  159. else:
  160. #print("find the same item's website")
  161. #insert new price into the another table
  162. if not (price == self.db_price[i]):
  163. #-------------insert the new data in new table-----------------#
  164. self.conn = sqlite3.connect('boyclothes.sqlite')
  165. self.cur = self.conn.cursor()
  166. self.cur.execute('create table if not exists temp(title varchar(50) , price varchar(10) , website varchar(100) , date varchar(100),flag bit);')
  167. self.cur.execute('insert into temp values (?,?,?,?,?);',(title , price , website , date,self.flag))
  168. self.conn.commit()
  169. self.conn.close()
  170. if not os.path.exists('true.txt'):#close spider , write a true.txt
  171. open('true.txt', 'w').close()
  172. else:
  173. print("true exists")
  174. print("insert data success.")
  175. #------------------------------end------------------------------#
  176. #-----------update the new data into the old database-----------#
  177. self.conn = sqlite3.connect('boyclothes.sqlite')
  178. self.cur = self.conn.cursor()
  179. self.cur.execute('update boyclothes set price = ? where website = ?;',(price,website))
  180. self.cur.execute('update boyclothes set flag = ? where website = ?;',(self.flag,website))
  181. self.conn.commit()
  182. self.conn.close()
  183. print("update data success.")
  184. #-----------------------------end--------------------------------#
  185. else:#if find the same website and price same , update the flag
  186. self.conn = sqlite3.connect('boyclothes.sqlite')
  187. self.cur = self.conn.cursor()
  188. self.cur.execute('update boyclothes set flag = ? where website = ?;',(self.flag,website))
  189. self.conn.commit()
  190. self.conn.close()
  191. print("update flag success.")
  192. #if not find the same item , insert the new item into the database
  193. if (self.count == len(self.db_website)):
  194. self.conn = sqlite3.connect('boyclothes.sqlite')
  195. self.cur = self.conn.cursor()
  196. self.cur.execute('create table if not exists temp(title varchar(50) , price varchar(10) , website varchar(100) , date varchar(100),flag bit);')
  197. self.cur.execute('insert into temp values (?,?,?,?,?);',(title , price , website , date,self.flag))
  198. self.conn.commit()
  199. self.conn.close()
  200. print("title = " + title)
  201. print("insert into database")
  202. yield self.parse_detail(title ,price,website ,date , self.flag)
  203. #print(self.count)
  204. self.count = 0
  205.  
  206. '''
  207. #修改資料庫
  208. try:
  209. self.conn = sqlite3.connect('boyclothes.sqlite')
  210. self.cur = self.conn.cursor()
  211. #self.cur.execute('create table if not exists temp(title varchar(50) , price varchar(10) , website varchar(100) , date varchar(100));')
  212. self.cur.execute('UPDATE boyclothes SET price = 11111 where website = "http://goods.ruten.com.tw/item/show?21408182947927";')
  213. self.cur.execute('DELETE from boyclothes where website = "http://goods.ruten.com.tw/item/show?21111144215687";')
  214. self.cur.execute('UPDATE boyclothes SET price = 1465456 where website = "http://goods.ruten.com.tw/item/show?21303195214113";')
  215. self.conn.commit()
  216. self.conn.close()
  217. print("update data success.")
  218. except:
  219. print("can't update data")
  220. '''
  221.  
  222. def parse_detail(self,title ,price , website,date,flag):
  223. Item = BoyClothesItem()
  224. Item['title'] = title
  225. Item['price'] = price
  226. Item['website'] = website
  227. Item['date'] = date
  228. Item['flag'] = flag
  229. return Item
  230.  
  231.  
  232. def spider_closed(self, spider):
  233. self.driver.quit()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement