Advertisement
Guest User

Untitled

a guest
Sep 9th, 2016
92
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 15.14 KB | None | 0 0
  1. import logging.config
  2. import os
  3.  
  4. if not os.path.exists('log'):
  5. os.mkdir('log')
  6. if not os.path.exists('caches'):
  7. os.mkdir('caches')
  8. CONF_LOG = "configs/logging.conf"
  9. logging.config.fileConfig(CONF_LOG) # 采用配置文件
  10. logger = logging.getLogger()
  11.  
  12.  
  13. class BacRecover(object):
  14. """
  15. 爬虫的备份恢复类
  16.  
  17. 用于备份爬虫的记录信息,已保证在程序中断后的数据恢复
  18. 暂时只能备份used/new data
  19.  
  20. 搭配 Manager 使用
  21. """
  22.  
  23. MODE_NEW = 1
  24. MODE_OLD = 0
  25.  
  26. def __init__(self):
  27. self.minOptRecords = 5000
  28. self.crashDate = None
  29. self.usedData = set()
  30. self.newData = set()
  31. self.isBackupToFile = True
  32. self.__backupFilePath = self.__readLatestBackupFile()
  33.  
  34. @staticmethod
  35. def __writeLatestBackupFile(backupFile):
  36. """
  37. 将最新的备份文件信息写入备份文件列表
  38. :param backupFile: 需要被写入备份文件列表的文件路径
  39.  
  40. """
  41. f = open('caches/latest-backup-file', 'a')
  42. f.write(backupFile)
  43. f.write('\n')
  44.  
  45. @staticmethod
  46. def __parseLine(line):
  47. """
  48. 解析从备份文件中的每一行信息
  49. :param line: line
  50. :return: 如果匹配失败返回None,否则返回[data,mode]
  51. """
  52. import re
  53. pattenNum = re.compile('\d+')
  54. pattenMode = re.compile('\{\[\(\d+\)\]\}')
  55. pattenData = re.compile('.*\{\[\(')
  56.  
  57. modeOnce = re.search(pattenMode, line).group(0)
  58. dataOnce = re.search(pattenData, line).group(0)
  59. if modeOnce is None and dataOnce is None: # 匹配失败
  60. return None
  61.  
  62. return dataOnce[0: len(dataOnce) - 3], re.search(pattenNum, modeOnce).group(0)
  63.  
  64. @staticmethod
  65. def __writeToFile(path, data, mode):
  66. """ writeToFile
  67. 将data于mode信息写入文件
  68. :param path: 备份文件路径
  69. :param data: data
  70. :param mode: mode
  71. """
  72. backupFile = open(path, 'a')
  73. try:
  74. data = str(data)
  75. backupFile.write(data) # 写入数据
  76. backupFile.write('{[(' + str(mode) + ')]}') # 写入mode
  77. backupFile.write('\n') # 数据分隔符
  78. except Exception:
  79. logger.warning('backup path does not exist , please create it ')
  80. finally:
  81. backupFile.close()
  82.  
  83. def __readLatestBackupFile(self):
  84. """
  85. 从备份文件列表中恢复最新的备份文件信息
  86. :return: 返回最新的备份文件路径
  87. """
  88. from datetime import datetime
  89. try:
  90. f = open('caches/latest-backup-file', 'r')
  91.  
  92. lines = f.readlines()
  93. if len(lines) == 0:
  94. fn = 'caches/backup-' + str(datetime.now()) + '.bak'
  95. self.__writeLatestBackupFile(fn)
  96. else:
  97. fn = lines[-1]
  98. return fn.lstrip().rstrip()
  99.  
  100. except FileNotFoundError:
  101. self.__backupFilePath = None
  102. fn = 'caches/backup-' + str(datetime.now()) + '.bak'
  103. self.__writeLatestBackupFile(fn)
  104. logger.warning('备份文件列表未找到,已创建新文件:' + fn)
  105. return fn
  106.  
  107. def __recoverFromFile(self):
  108. """
  109. 从备份文件中恢复信息
  110. :return:
  111. """
  112. usedSet = set()
  113. newSet = set()
  114. rf = open(self.__backupFilePath, 'r')
  115. logger.info('from file : %s' % self.__backupFilePath)
  116. lines = rf.readlines()
  117. for line in lines[::-1]:
  118. if line is None or line == '':
  119. continue
  120. (data, mode) = self.__parseLine(line)
  121. mode = int(mode)
  122. if data not in usedSet and data not in newSet: # 如果数据已经存在在其中一个集合,代表已经存在最新记录
  123. if mode == self.MODE_NEW:
  124. newSet.add(data)
  125. elif mode == self.MODE_OLD:
  126. usedSet.add(data)
  127. self.usedData = usedSet
  128. self.newData = newSet
  129. totalData = len(usedSet) + len(self.newData)
  130. logger.info('Recovered %d data from %d lines' % (totalData, len(lines)))
  131. if totalData == 0:
  132. logger.info('Backup is empty。')
  133. else:
  134. logger.info('Recovered : UsedData:%i , NewData:%i' % (
  135. len(self.usedData), len(self.newData)))
  136.  
  137. if totalData + self.minOptRecords < len(lines): # 如果两者相差minRecord 条数据
  138. logger.info('The backup file need to be optimized , it will start now .')
  139. self.optimizeBackupFile()
  140. return usedSet, newSet
  141.  
  142. def setBackupToFile(self, isToFile):
  143. self.isBackupToFile = isToFile
  144. return self
  145.  
  146. def setMinOptimizeRecords(self, minRecords):
  147. """ 指定备份文件信息优化的最小条数
  148. :param minRecords: 最小优化条数
  149. """
  150. self.minOptRecords = minRecords
  151. return self
  152.  
  153. def optimizeBackupFile(self):
  154. """ 优化备份文件信息
  155. 用于缩短备份文件,去除冗余的部分。
  156. 主要是用于去除已被使用过的记录的new mode插入记录,只保留old mode 插入记录即可完成功能
  157. """
  158. if self.__backupFilePath is None:
  159. return
  160. from datetime import datetime
  161.  
  162. bfp = 'caches/optimized-backup' + str(datetime.now()) + '.bak'
  163. for newdata in self.newData:
  164. self.__writeToFile(bfp, newdata, self.MODE_NEW)
  165. for useddata in self.usedData:
  166. self.__writeToFile(bfp, useddata, self.MODE_OLD)
  167. self.__backupFilePath = bfp
  168. self.__writeLatestBackupFile(bfp)
  169.  
  170. def isBackupExists(self):
  171. """
  172. 判断备份当前的最新备份文件是否存在
  173. :return:
  174. """
  175. import os
  176. logger.info(self.__backupFilePath)
  177. return os.path.exists(self.__backupFilePath)
  178.  
  179. def recover(self):
  180. """
  181. 外部调用的恢复 api
  182. 如果可恢复则恢复
  183. :return: 返回恢复后的数据
  184. """
  185. if self.isBackupToFile is True:
  186. if self.isBackupExists():
  187. logger.info('backup file exists , starting recover from this file ...')
  188. self.usedData, self.newData = self.__recoverFromFile()
  189. else:
  190. logger.info('backup file not found ...')
  191.  
  192. return self.usedData, self.newData
  193.  
  194. def updateBackup(self, data, _mode):
  195. """
  196. 更新备份信息,
  197. :param _mode: 将数据更新为指定的模式
  198. :param data: 数据
  199. :return:
  200. """
  201. if data is None:
  202. return
  203.  
  204. # 如果 mode = old 则需要将原有的数据从
  205. if _mode == self.MODE_OLD:
  206. self.usedData.add(data)
  207. if data in self.newData:
  208. self.newData.remove(data)
  209. # self.newData.add(data) # ???
  210. # 无论模式是什么,只要符合要求,则都需要在文件中更新该记录
  211. if _mode in (self.MODE_OLD, self.MODE_NEW):
  212. if self.isBackupToFile is True:
  213. self.__writeToFile(self.__backupFilePath, data, _mode)
  214.  
  215. def backupList(self, dataList, _mode):
  216. """ 备份整个列表的信息
  217. :param dataList: 信息列表
  218. :param _mode: 指定的信息模式
  219. """
  220. if dataList is None or len(dataList) == 0:
  221. return
  222. for data in dataList:
  223. self.updateBackup(data, _mode)
  224.  
  225. def printStatus(self):
  226. print(" new data : ", len(self.newData))
  227. print(" used data : ", len(self.usedData))
  228.  
  229.  
  230. class CookiesPool(object):
  231. """ CookiesPool
  232. 缓存连接池,用于管理多个账号的缓存信息,均匀分配每个账号的使用次数。
  233. 以达到有效解决403的问题,
  234.  
  235. 需要配置参数 :
  236. 1. maxSize : 设置最大的缓存个数
  237. 2. continuous : 每个缓存连续使用的次数,默认50次后 将该缓存失效
  238. 3. force : 在所有缓存都失效后是否sleep一段时间
  239. 4. sleepSecs : 在设置force后生效,设置sleep的时间
  240. 5. loginFunc: 登陆函数,在不存cookies缓存的时候,通过该函数获取cookies
  241. """
  242.  
  243. def __init__(self):
  244. if not os.path.exists('caches/persistcookies'):
  245. os.mkdir('caches/persistcookies')
  246.  
  247. self.__configFile = 'configs/accounts-list.conf'
  248. self.__cookiesPool = list()
  249.  
  250. self.__current = -1
  251. self.__last = -1
  252. self.__continuousUseCount = 0
  253.  
  254. self.__sleepSecs = 5.0
  255.  
  256. self.size = 0
  257. self.__maxSize = -1
  258. self.force = True
  259. self.__continuous = 50
  260.  
  261. self.__invalidatedCookies = None
  262. self.infos = self.__loadInfo()
  263.  
  264. @staticmethod
  265. def __isPersistCookies():
  266. import os
  267.  
  268. cachadir = 'caches/persistcookies/'
  269. fns = os.listdir(cachadir)
  270. return len(fns) != 0
  271.  
  272. @staticmethod
  273. def __initArray(length):
  274. arr = []
  275. for i in range(0, length):
  276. arr.append(0)
  277. return arr
  278.  
  279. def __loadInfo(self):
  280. retList = list()
  281. if self.__isPersistCookies():
  282. self.__recCookies()
  283. else:
  284. if os.path.exists(self.__configFile):
  285. counter = 0
  286. confReader = open(self.__configFile, 'r')
  287. for line in confReader.readlines():
  288. if line.rstrip().lstrip() == '':
  289. continue
  290. if self.__maxSize != -1 and self.__maxSize >= counter:
  291. break
  292. try:
  293. # 解析从配置文件中读取的信息并添加到返回列表中
  294. [username, password] = line.split(":")
  295. username = str(username).lstrip().rstrip().replace('\n', '')
  296. password = str(password).lstrip().rstrip().replace('\n', '')
  297. retList.append((username, password))
  298. counter += 1
  299. except ValueError:
  300. logger.error('配置文件错误!格式为:【账号:密码】,冒号为英文冒号')
  301.  
  302. confReader.close()
  303. else:
  304. logger.warning('headers conf file does\'t exists.')
  305.  
  306. self.size = len(self.__cookiesPool)
  307. self.__invalidatedCookies = self.__initArray(self.size)
  308. return retList
  309.  
  310. def __recCookies(self):
  311. import os
  312. cachadir = 'caches/persistcookies/'
  313. fns = os.listdir(cachadir)
  314. for fn in fns:
  315. f = open(cachadir + fn, 'r')
  316. cookiesDict = dict()
  317. lines = f.readlines()
  318. for line in lines:
  319. if ':' in line:
  320. k, v = line.split(':', 1)
  321. k = str(k).rstrip().lstrip()
  322. v = str(v).rstrip().lstrip()
  323. cookiesDict[k] = v
  324. self.__cookiesPool.append(cookiesDict)
  325. logger.info('发现%d个缓存数据,并已成功使用' % len(self.__cookiesPool))
  326.  
  327. def __invalidateReduce(self):
  328. """
  329. 减少每个失效缓存的死亡时间
  330. :return:
  331. """
  332. for i in range(self.size):
  333. if self.__invalidatedCookies[i] != 0:
  334. self.__invalidatedCookies[i] -= 1
  335.  
  336. def __setCaches(self, loginFunc):
  337. if not self.__isPersistCookies():
  338. for (username, password) in self.infos:
  339. r = loginFunc(username, password)
  340. if r is None:
  341. continue
  342. self.__cookiesPool.append(r.request.headers)
  343. self.persistCookies()
  344.  
  345. def setMaxSize(self, maxSize):
  346. self.__maxSize = maxSize
  347. return self
  348.  
  349. def setContinuous(self, continuous):
  350. self.__continuous = continuous
  351. return self
  352.  
  353. def setForce(self, force):
  354. self.force = force
  355.  
  356. def setSleepSecs(self, sleepSecs):
  357. self.__sleepSecs = sleepSecs
  358. return self
  359.  
  360. def setConfigFile(self, configFile):
  361. self.__configFile = configFile
  362. return self
  363.  
  364. def setLoginFunc(self, loginFunc):
  365. """ 设置登陆函数
  366. 必须传入登陆函数,该函数需要两个参数,分别是用户名和密码。
  367. 返回值必须上requests.post()的返回值
  368.  
  369. :param loginFunc: 登陆函数
  370. """
  371. self.__setCaches(loginFunc)
  372. return self
  373.  
  374. def getCookies(self):
  375. """getCookies()
  376. 获取缓存池中的有效cookies
  377. 1. 如果存在没有失效的缓存,则缓存第一个遍历到的未失效的缓存
  378. 2. 如果缓存都已失效,则判断force属性:
  379. a)如果 force 为true 则sleep一段时间后随机返回一个cookies
  380. b)如果 force 为false 则直接返回一个cookies
  381.  
  382. :return: cookies
  383. """
  384. logger.debug(self.__invalidatedCookies)
  385. self.__last = self.__current
  386. index = -1
  387. for i in range(0, len(self.__invalidatedCookies)):
  388. if self.__invalidatedCookies[i] == 0: # 未被休眠
  389. if i == self.__last: # 如果于上一次使用的为同一个cookies
  390. self.__continuousUseCount += 1
  391. # self.invalidateCurrent()
  392. logger.debug('连续使用次数:%s : %s' % (self.__continuousUseCount, self.__continuous))
  393. if self.__continuousUseCount >= self.__continuous: # 如果连续使用的次数大于40,则继续选择下一个
  394. index = i
  395. self.invalidateCurrent()
  396. continue
  397.  
  398. else:
  399. self.__current = i
  400. self.__continuousUseCount = 0
  401. logger.debug("use pool index : %s" % i)
  402. self.__invalidateReduce()
  403. return self.__cookiesPool[i]
  404.  
  405. # 如果可以执行到这里,代表所有的资源都已失效
  406. if self.force:
  407. from time import sleep
  408. logger.warning('cookiesPool中的所有资源都已失效。请等待%s秒' % self.__sleepSecs)
  409. sleep(self.__sleepSecs)
  410. if index == -1:
  411. import random
  412. idxRange = self.size - 1
  413. if idxRange < 0:
  414. idxRange = 1
  415. index = random.randint(0, idxRange)
  416. logger.debug("use pool index : %s" % index)
  417. self.__invalidateReduce()
  418. return self.__cookiesPool[index]
  419.  
  420. def invalidateCurrent(self):
  421. """ invalidateCurrent
  422. 使当前缓存失效
  423. :return:
  424. """
  425. if self.size > 10: # 如果有10个以上的cookies 则不需要中途休眠
  426. self.__invalidatedCookies[self.__current] = self.__continuous * (self.size - 1)
  427. else:
  428. self.__invalidatedCookies[self.__current] = (self.__continuous + 1) * (self.size - 1)
  429.  
  430. def persistCookies(self):
  431. """
  432. 持久化 cookies
  433. :return:
  434. """
  435. print('persistCookies')
  436. from datetime import datetime
  437. cachadir = 'caches/persistcookies/'
  438. for cookies in self.__cookiesPool:
  439. filename = cachadir + 'cache-' + str(datetime.now())
  440. f = open(filename, 'w')
  441. for k, v in cookies.items():
  442. f.writelines(k + ":" + v + "\n")
  443. f.close()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement