Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import logging.config
- import os
# Make sure the working directories for log output and cache files exist
# before anything tries to write into them.
for _required_dir in ('log', 'caches'):
    if not os.path.exists(_required_dir):
        os.mkdir(_required_dir)
del _required_dir  # do not leak the loop variable into the module namespace

# Logging is set up from a configuration file.
CONF_LOG = "configs/logging.conf"
logging.config.fileConfig(CONF_LOG)
logger = logging.getLogger()
class BacRecover(object):
    """Backup / recovery helper for the crawler's bookkeeping.

    Records which data items are "new" and which are "used" in an
    append-only backup file so they can be restored after the program is
    interrupted.  Only used/new data can be backed up for now.  Intended to
    be used together with a Manager.

    File formats, as written by this class:

    * ``caches/latest-backup-file``: one backup file path per line; the
      last non-blank line names the current backup file.
    * backup file: one record per line, ``<data>{[(<mode>)]}``, where
      ``<mode>`` is ``MODE_NEW`` or ``MODE_OLD``.  Later lines override
      earlier lines for the same data.
    """

    MODE_NEW = 1
    MODE_OLD = 0

    def __init__(self):
        # Compact the backup file once it holds at least this many more
        # lines than distinct records.
        self.minOptRecords = 5000
        self.crashDate = None
        self.usedData = set()
        self.newData = set()
        self.isBackupToFile = True
        self.__backupFilePath = self.__readLatestBackupFile()

    @staticmethod
    def __writeLatestBackupFile(backupFile):
        """Append a backup file path to the backup file list.

        :param backupFile: path to record as the newest backup file
        """
        # ``with`` guarantees the handle is flushed and closed.
        with open('caches/latest-backup-file', 'a') as f:
            f.write(backupFile)
            f.write('\n')

    @staticmethod
    def __parseLine(line):
        """Parse one line of a backup file.

        :param line: raw line, with or without its trailing newline
        :return: ``(data, mode)`` as two strings, or ``None`` when the line
            does not contain a ``{[(<digits>)]}`` marker
        """
        import re
        if not line:
            return None
        # The greedy ``.*`` binds the marker to its LAST occurrence on the
        # line, which is where __writeToFile puts it; data that happens to
        # contain a marker-like substring is therefore kept intact.
        matched = re.match(r'(.*)\{\[\((\d+)\)\]\}', line)
        if matched is None:
            return None
        return matched.group(1), matched.group(2)

    @staticmethod
    def __writeToFile(path, data, mode):
        """Append one ``<data>{[(<mode>)]}`` record to a backup file.

        A failure to open or write the file is logged as a warning and
        otherwise ignored.

        :param path: backup file path
        :param data: data to record (converted with ``str``)
        :param mode: mode to record
        """
        record = str(data) + '{[(' + str(mode) + ')]}' + '\n'
        try:
            # open() is inside the try so that a missing directory ends up
            # in the warning below instead of propagating.
            with open(path, 'a') as backupFile:
                backupFile.write(record)
        except OSError:
            logger.warning('backup path does not exist , please create it ')

    def __readLatestBackupFile(self):
        """Read the newest backup file path from the backup file list.

        When the list is missing or has no usable entry, a new timestamped
        backup file name is generated and appended to the list.

        :return: path of the newest backup file
        """
        from datetime import datetime
        try:
            with open('caches/latest-backup-file', 'r') as f:
                entries = [line.strip() for line in f if line.strip()]
        except FileNotFoundError:
            entries = None  # distinguishes "no list file" from "empty list"
        if entries:
            return entries[-1]
        fn = 'caches/backup-' + str(datetime.now()) + '.bak'
        self.__writeLatestBackupFile(fn)
        if entries is None:
            logger.warning('备份文件列表未找到,已创建新文件:' + fn)
        return fn

    def __recoverFromFile(self):
        """Rebuild the used/new sets from the current backup file.

        Lines are replayed newest-first, so the first record seen for a
        data item is its latest state.  Blank and unparsable lines are
        skipped.  When the file has grown far beyond the number of distinct
        records it is compacted via ``optimizeBackupFile``.

        :return: ``(usedSet, newSet)``
        """
        usedSet = set()
        newSet = set()
        with open(self.__backupFilePath, 'r') as rf:
            lines = rf.readlines()
        logger.info('from file : %s' % self.__backupFilePath)
        skipped = 0
        for line in reversed(lines):
            parsed = self.__parseLine(line)
            if parsed is None:
                if line.strip():
                    skipped += 1  # non-blank but malformed
                continue
            data, mode = parsed
            mode = int(mode)
            # Already present in either set: a newer record exists.
            if data in usedSet or data in newSet:
                continue
            if mode == self.MODE_NEW:
                newSet.add(data)
            elif mode == self.MODE_OLD:
                usedSet.add(data)
        if skipped:
            logger.warning('Skipped %d unparsable line(s) in backup file' % skipped)
        self.usedData = usedSet
        self.newData = newSet
        totalData = len(usedSet) + len(newSet)
        logger.info('Recovered %d data from %d lines' % (totalData, len(lines)))
        if totalData == 0:
            logger.info('Backup is empty。')
        else:
            logger.info('Recovered : UsedData:%i , NewData:%i' % (
                len(self.usedData), len(self.newData)))
        # Compact when lines outnumber records by more than minOptRecords.
        if totalData + self.minOptRecords < len(lines):
            logger.info('The backup file need to be optimized , it will start now .')
            self.optimizeBackupFile()
        return usedSet, newSet

    def setBackupToFile(self, isToFile):
        """Enable or disable writing records to the backup file.

        :param isToFile: ``True`` to write to file
        :return: ``self`` for chaining
        """
        self.isBackupToFile = isToFile
        return self

    def setMinOptimizeRecords(self, minRecords):
        """Set the redundancy threshold that triggers compaction.

        :param minRecords: minimum number of surplus lines
        :return: ``self`` for chaining
        """
        self.minOptRecords = minRecords
        return self

    def optimizeBackupFile(self):
        """Write a compacted backup file and make it the current one.

        The new file holds exactly one record per item currently in
        ``newData`` / ``usedData``, dropping superseded lines (mainly the
        new-mode records of items that were later marked used).
        """
        if self.__backupFilePath is None:
            return
        from datetime import datetime
        bfp = 'caches/optimized-backup' + str(datetime.now()) + '.bak'
        for newdata in self.newData:
            self.__writeToFile(bfp, newdata, self.MODE_NEW)
        for useddata in self.usedData:
            self.__writeToFile(bfp, useddata, self.MODE_OLD)
        self.__backupFilePath = bfp
        self.__writeLatestBackupFile(bfp)

    def isBackupExists(self):
        """Tell whether the current backup file exists on disk.

        :return: ``True`` if the file exists
        """
        import os
        logger.info(self.__backupFilePath)
        if self.__backupFilePath is None:
            return False
        return os.path.exists(self.__backupFilePath)

    def recover(self):
        """Public recovery API: restore from the backup file if possible.

        :return: ``(usedData, newData)`` after the recovery attempt
        """
        if self.isBackupToFile is True:
            if self.isBackupExists():
                logger.info('backup file exists , starting recover from this file ...')
                self.usedData, self.newData = self.__recoverFromFile()
            else:
                logger.info('backup file not found ...')
        return self.usedData, self.newData

    def updateBackup(self, data, _mode):
        """Record that ``data`` is now in state ``_mode``.

        ``MODE_OLD`` moves the item from ``newData`` to ``usedData`` in
        memory.  ``MODE_NEW`` only appends a record to the backup file and
        leaves the in-memory sets alone.
        NOTE(review): the original author had the ``newData.add`` for
        ``MODE_NEW`` disabled with a "???" remark; presumably the caller
        owns that set - confirm before changing.

        :param data: the data item; ``None`` is ignored
        :param _mode: ``MODE_NEW`` or ``MODE_OLD``; other values are ignored
        """
        if data is None:
            return
        if _mode == self.MODE_OLD:
            self.usedData.add(data)
            self.newData.discard(data)
        # Whatever the mode, a valid record is appended to the file.
        if _mode in (self.MODE_OLD, self.MODE_NEW):
            if self.isBackupToFile is True:
                self.__writeToFile(self.__backupFilePath, data, _mode)

    def backupList(self, dataList, _mode):
        """Record every item of ``dataList`` with the given mode.

        :param dataList: iterable of data items; ``None`` is ignored
        :param _mode: mode applied to every item
        """
        if dataList is None:
            return
        for data in dataList:
            self.updateBackup(data, _mode)

    def printStatus(self):
        """Print the number of new and used records."""
        print(" new data : ", len(self.newData))
        print(" used data : ", len(self.usedData))
class CookiesPool(object):
    """Pool of login cookies shared across several accounts.

    Spreads requests over the cached cookies of multiple accounts so that
    no single account is used too many times in a row, as a way to avoid
    HTTP 403 responses.

    Settings:

    1. maxSize: maximum number of cookies to obtain through login.
    2. continuous: how many consecutive uses of one cookie are allowed
       before it is put to rest (default 50).
    3. force: whether to sleep when every cookie is resting.
    4. sleepSecs: sleep duration used when ``force`` is set.
    5. loginFunc: login function used to obtain cookies when none are
       persisted on disk.

    NOTE: the account list is read inside ``__init__``, so ``setMaxSize``
    and ``setConfigFile`` cannot influence that read; ``setMaxSize`` takes
    effect when ``setLoginFunc`` performs the logins.
    """

    def __init__(self):
        # makedirs also creates the parent 'caches' directory if missing.
        os.makedirs('caches/persistcookies', exist_ok=True)
        self.__configFile = 'configs/accounts-list.conf'
        self.__cookiesPool = list()
        self.__current = -1   # index handed out by the latest selection
        self.__last = -1      # value of __current before this selection
        self.__continuousUseCount = 0
        self.__sleepSecs = 5.0
        self.size = 0
        self.__maxSize = -1   # -1 means "no limit"
        self.force = True
        self.__continuous = 50
        # One counter per cookie: 0 = usable, >0 = number of getCookies()
        # calls it still has to sit out.
        self.__invalidatedCookies = None
        self.infos = self.__loadInfo()

    @staticmethod
    def __isPersistCookies():
        """Return True when the persisted-cookies directory is not empty."""
        import os
        return len(os.listdir('caches/persistcookies/')) != 0

    @staticmethod
    def __initArray(length):
        """Return a list of ``length`` zeros."""
        return [0] * length

    def __syncPoolState(self):
        """Align ``size`` and the rest counters with the current pool."""
        self.size = len(self.__cookiesPool)
        self.__invalidatedCookies = self.__initArray(self.size)

    def __loadInfo(self):
        """Load persisted cookies or, failing that, the account list.

        If persisted cookie files exist they are loaded into the pool and
        an empty list is returned.  Otherwise ``user:password`` lines are
        read from the config file; the password is everything after the
        first colon.

        :return: list of ``(username, password)`` tuples
        """
        retList = list()
        if self.__isPersistCookies():
            self.__recCookies()
        elif os.path.exists(self.__configFile):
            with open(self.__configFile, 'r') as confReader:
                for line in confReader:
                    if not line.strip():
                        continue
                    # Stop once the configured maximum has been collected.
                    if self.__maxSize != -1 and len(retList) >= self.__maxSize:
                        break
                    username, colon, password = line.partition(':')
                    if not colon:
                        logger.error('配置文件错误!格式为:【账号:密码】,冒号为英文冒号')
                        continue
                    retList.append((username.strip(), password.strip()))
        else:
            logger.warning('headers conf file does\'t exists.')
        self.__syncPoolState()
        return retList

    def __recCookies(self):
        """Load every persisted cookie file into the pool.

        Each file holds ``key:value`` lines; lines without a colon are
        ignored.
        """
        import os
        cachadir = 'caches/persistcookies/'
        for fn in os.listdir(cachadir):
            cookiesDict = dict()
            with open(cachadir + fn, 'r') as f:
                for line in f:
                    if ':' in line:
                        k, v = line.split(':', 1)
                        cookiesDict[k.strip()] = v.strip()
            self.__cookiesPool.append(cookiesDict)
        logger.info('发现%d个缓存数据,并已成功使用' % len(self.__cookiesPool))

    def __invalidateReduce(self):
        """Decrease the remaining rest time of every resting cookie by one."""
        for i in range(self.size):
            if self.__invalidatedCookies[i] != 0:
                self.__invalidatedCookies[i] -= 1

    def __setCaches(self, loginFunc):
        """Log every account in and store the resulting request headers.

        Does nothing when persisted cookies already exist.  Accounts whose
        login returns ``None`` are skipped, and at most ``maxSize`` cookies
        are collected when a maximum is set.
        """
        if self.__isPersistCookies():
            return
        for (username, password) in self.infos:
            if self.__maxSize != -1 and len(self.__cookiesPool) >= self.__maxSize:
                break
            r = loginFunc(username, password)
            if r is None:
                continue
            self.__cookiesPool.append(r.request.headers)
        self.persistCookies()
        # The pool has just grown: without this, size stays 0 and
        # getCookies() would treat every cookie as exhausted.
        self.__syncPoolState()

    def setMaxSize(self, maxSize):
        """Set the maximum number of cookies; returns ``self``."""
        self.__maxSize = maxSize
        return self

    def setContinuous(self, continuous):
        """Set the allowed number of consecutive uses; returns ``self``."""
        self.__continuous = continuous
        return self

    def setForce(self, force):
        """Set whether to sleep when all cookies rest; returns ``self``."""
        self.force = force
        return self

    def setSleepSecs(self, sleepSecs):
        """Set the sleep duration in seconds; returns ``self``."""
        self.__sleepSecs = sleepSecs
        return self

    def setConfigFile(self, configFile):
        """Set the account list path; returns ``self``."""
        self.__configFile = configFile
        return self

    def setLoginFunc(self, loginFunc):
        """Provide the login function and obtain cookies with it.

        The function takes a username and a password and must return the
        result of ``requests.post()`` (or ``None`` on failure).

        :param loginFunc: login function
        :return: ``self`` for chaining
        """
        self.__setCaches(loginFunc)
        return self

    def getCookies(self):
        """Return a usable cookie set from the pool.

        1. The first cookie that is not resting is returned.  If it is the
           same as last time its consecutive-use counter is increased, and
           once that reaches ``continuous`` the cookie is put to rest and
           the search goes on.
        2. If no cookie is usable: sleep ``sleepSecs`` when ``force`` is
           set, then return the cookie that was just put to rest or, if
           there is none, a random one.

        :return: cookies
        :raises IndexError: when the pool holds no cookies at all
        """
        logger.debug(self.__invalidatedCookies)
        if not self.__cookiesPool:
            # Fail fast instead of sleeping and then indexing an empty list.
            raise IndexError('cookies pool is empty')
        self.__last = self.__current
        index = -1
        for i in range(len(self.__invalidatedCookies)):
            if self.__invalidatedCookies[i] != 0:
                continue  # still resting
            if i == self.__last:
                # Same cookie as on the previous call.
                self.__continuousUseCount += 1
                logger.debug('连续使用次数:%s : %s' % (self.__continuousUseCount, self.__continuous))
                if self.__continuousUseCount >= self.__continuous:
                    # Used too many times in a row: rest it, try the next.
                    index = i
                    self.invalidateCurrent()
                    continue
            else:
                self.__current = i
                self.__continuousUseCount = 0
            logger.debug("use pool index : %s" % i)
            self.__invalidateReduce()
            return self.__cookiesPool[i]
        # Reaching this point means every cookie is resting.
        if self.force:
            from time import sleep
            logger.warning('cookiesPool中的所有资源都已失效。请等待%s秒' % self.__sleepSecs)
            sleep(self.__sleepSecs)
        if index == -1:
            import random
            index = random.randrange(len(self.__cookiesPool))
        logger.debug("use pool index : %s" % index)
        self.__invalidateReduce()
        return self.__cookiesPool[index]

    def invalidateCurrent(self):
        """Put the current cookie to rest.

        The rest time is counted in ``getCookies`` calls.  Does nothing
        when no cookie has been selected yet.
        """
        if not self.__invalidatedCookies or self.__current < 0:
            return
        if self.size > 10:  # with more than 10 cookies no extra pause is needed
            rest = self.__continuous * (self.size - 1)
        else:
            rest = (self.__continuous + 1) * (self.size - 1)
        self.__invalidatedCookies[self.__current] = rest

    def persistCookies(self):
        """Write every cookie set in the pool to its own file.

        Files go to ``caches/persistcookies/`` as ``key:value`` lines.
        """
        print('persistCookies')
        from datetime import datetime
        cachadir = 'caches/persistcookies/'
        for position, cookies in enumerate(self.__cookiesPool):
            # The pool position keeps names unique even when two timestamps
            # coincide, so no cookie file overwrites another.
            filename = cachadir + 'cache-' + str(datetime.now()) + '-' + str(position)
            with open(filename, 'w') as f:
                for k, v in cookies.items():
                    f.write('{}:{}\n'.format(k, v))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement