Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # 邂逅在迷宫up池爬虫
- # 作者: Chillax
- # 请勿以任何形式商用
- # 转载请注明出处:
- # v1.0 2020-4-17
- # v1.1 2020-4-27 增加了匹配时间的正则表达式的完备性
- import json
- import sys
- import requests
- from bs4 import BeautifulSoup
- import lxml
- import re
- from datetime import datetime
- from datetime import date
- from datetime import timedelta
- from prettytable import PrettyTable
- from pypinyin import lazy_pinyin
- from itertools import chain
- from collections import defaultdict
# Marker line that brackets the embedded data section of the data file.
# NOTE: the exact spelling is part of the on-disk format; changing it would
# orphan data that has already been written.
FLAG_STRING = "## DON NOT MODIFY ##"
# Path of the file holding the data section; the script entry point assigns
# it before RWJson is called.
FILE_NAME = ""


def RWJson(READ=True, jsonObj=''):
    """Read or rewrite the JSON list stored in the data section of FILE_NAME.

    File layout: arbitrary lines, a FLAG_STRING line, a triple-quote line,
    the JSON list (one element per line), a triple-quote line and a closing
    FLAG_STRING line.

    Args:
        READ: when true, parse and return the stored list; when false,
            replace the stored list with ``jsonObj``.
        jsonObj: iterable of JSON-serialisable items to store (write mode
            only). To keep the file readable and compact it must be a list.

    Returns:
        Read mode: the decoded list, or ``[]`` if the section is missing or
        is not valid JSON. Write mode: ``True``.
    """
    codeLines = []
    jsonLines = []
    with open(FILE_NAME, 'r', encoding="utf-8") as f:
        markers = 0
        for line in f:
            # Compare without the newline so a marker on the very last line
            # of a file lacking a trailing newline is still recognised.
            if line.rstrip('\n') == FLAG_STRING:
                markers += 1
            elif markers == 0:
                codeLines.append(line)
            elif markers == 1:
                jsonLines.append(line)
    # Drop the opening and closing triple-quote lines.
    jsonLines = jsonLines[1:-1]
    if READ:
        try:
            return json.loads(''.join(jsonLines))
        except ValueError:
            print("can't loads:\n", jsonLines, end='\n' + "-" * 80 + '\n')
            return []
    # Keep the marker on its own line even if the last kept line had no
    # trailing newline.
    if codeLines and not codeLines[-1].endswith('\n'):
        codeLines[-1] += '\n'
    encoded = [json.dumps(item, ensure_ascii=False) for item in jsonObj]
    # Every element but the last carries a trailing comma; an empty list
    # leaves the brackets untouched.
    body = [text + ',' for text in encoded[:-1]] + encoded[-1:]
    section = [FLAG_STRING, '"""', '['] + body + [']', '"""', FLAG_STRING]
    with open(FILE_NAME, 'w', encoding="utf-8") as f:
        f.writelines(codeLines)
        f.writelines(line + '\n' for line in section)
    return True
def GetTopic(pageNum):
    """Fetch one page of the official announcement list.

    Args:
        pageNum: 1-based page number substituted into the list URL.

    Returns:
        The child nodes of the page's ``div.data-list`` element with bare
        newline nodes removed (the original notes at most 15 per page).

    Raises:
        ValueError: the page has no ``div.data-list`` element.
    """
    url = "https://www.taptap.com/app/67245/topic?type=official&sort=created&page=%d" % pageNum
    # Without a timeout a stalled connection would block the script forever.
    r = requests.get(url, timeout=30)
    bs = BeautifulSoup(r.text, 'lxml')
    container = bs.find("div", {"class": "data-list"})
    if container is None:
        raise ValueError(
            "no announcement list found on page %d (HTTP status %s)"
            % (pageNum, r.status_code))
    # The children include stray newline text nodes between the entries.
    return [i for i in container.children if i != '\n']
def _AskHeros():
    """Ask the user for the hero names until a well-formed answer is given.

    Returns:
        ``[diamondNames, goldNames]`` as entered by the user.
    """
    while True:
        heros = [[], []]
        try:
            for i in range(int(input("这里面有几个英雄?"))):
                # The parentheses matter: "%" binds tighter than "+".
                isGold = int(input("第%d个英雄是钻石(0)还是恒金(1)?" % (i + 1)))
                heros[1 if isGold else 0].append(input("TA的名字:"))
            return heros
        except ValueError:
            # Only malformed numbers restart the dialogue; Ctrl-C still
            # interrupts the program.
            print("输入格式错误,请重新输入")


def ParsePage(url):
    """Download one announcement and extract its heroes and time period.

    Args:
        url: address of the announcement page.

    Returns:
        A dict with two keys:
        ``'heros'``: ``[names, names]`` - names found before "up卡池" and
        names found before "恒晶封印", duplicates removed, in order of first
        appearance. If neither pattern matches, the user is asked instead.
        ``'time'``: eight ints - year, month, day, hour of the start followed
        by the same four fields of the end; a field that is absent from the
        text is ``-1``.

    Raises:
        ValueError: no time period could be found in the text.
    """
    r = requests.get(url, timeout=30)
    bs = BeautifulSoup(r.text, 'lxml')
    topic = bs.find('div', {'class': 'topic-content'})
    # Title text followed by body text.
    content = (topic.find('div', {'class': 'top-title-author'}).text
               + topic.find('div', {'class': 'bbcode-body'}).text)
    # dict.fromkeys removes duplicates but, unlike set(), keeps a stable order.
    heros = [
        list(dict.fromkeys(re.findall("【([^【】]*?)】up卡池", content))),
        list(dict.fromkeys(re.findall("【([^【】]*?)】恒晶封印", content))),
    ]
    if not heros[0] and not heros[1]:
        print("无法解析本公告,请手动解析添加,公告内容:\n%s" % content.strip())
        heros = _AskHeros()
    period = re.search(
        r"(\d*?)[年/]?(\d*?)[月/](\d*?)[日 ] ?(\d*?)[时:更]\d*?\D*?\d*?\D*?(\d*?)[年/]?(\d*?)[月/](\d*?)[日 ](\d*?)[时:]",
        content)
    if period is None:
        raise ValueError("no time period found in announcement %s" % url)
    return {
        'heros': heros,
        'time': [int(i) if i.isdigit() else -1 for i in period.groups()],
    }
def _FillYears(start, end, pubTime):
    """Fill in, in place, the years (-1) that the announcement left out.

    Args:
        start: ``[year, month, day, hour]`` of the start.
        end: ``[year, month, day, hour]`` of the end.
        pubTime: datetime at which the announcement was published.
    """
    if start[0] == -1:
        if end[0] != -1:
            # Only the end year is known: same year unless the period wraps
            # around New Year.
            start[0] = end[0] if start[1:3] <= end[1:3] else end[0] - 1
        elif pubTime.month - start[1] > 6 and start[1] > 0:
            # Heuristic: a start month far before the publication month
            # (e.g. published in December, starts in January) is taken to
            # be in the following year.
            start[0] = pubTime.year + 1
        else:
            start[0] = pubTime.year
    if end[0] == -1:
        # The end is in the start year unless its month/day is earlier.
        end[0] = start[0] if start[1:3] <= end[1:3] else start[0] + 1


def ParseTopic(topic):
    """Turn one entry of the announcement list into a record.

    Args:
        topic: parsed list entry as returned by GetTopic.

    Returns:
        A dict of the form::

            {
                "pubtime": "YYYY-MM-DD HH:MM:SS",
                "isHero": True,
                "heros": [[hero1, ...], [hero2, ...]],
                "starttime": [year, month, day, hour],
                "endtime": [year, month, day, hour],
            }

        Entries whose text does not contain both "SSR" and "限时" get only
        ``pubtime`` and ``isHero`` (False). An hour the announcement does
        not state is ``-1``.

    Raises:
        AttributeError: the entry has no publish-time element; the entry is
            printed first to help diagnosis.
    """
    try:
        pubTime = topic.find("span", {"class": "item-publish-time"}).text
    except AttributeError:
        print(topic)
        raise
    # Explicit time fields: "%X" depends on the active locale.
    tPubTime = datetime.strptime(pubTime, '%Y-%m-%d %H:%M:%S')
    isHero = 'SSR' in topic.text and '限时' in topic.text
    dRes = {'pubtime': str(tPubTime), 'isHero': isHero}
    if not isHero:
        return dRes
    url = topic.find('a', {"class": "taptap-btn-link"})['href']
    dTemp = ParsePage(url)
    dRes['heros'] = dTemp['heros']
    dRes['starttime'] = dTemp['time'][:4]
    dRes['endtime'] = dTemp['time'][4:]
    _FillYears(dRes['starttime'], dRes['endtime'], tPubTime)
    return dRes
def ListToDate(l):
    """Convert a ``[year, month, day[, hour]]`` list to a date or datetime.

    Args:
        l: list of ints; the hour may be omitted or given as ``-1`` when it
            is unknown, and may be ``24`` to mean midnight at the end of
            that day.

    Returns:
        A ``date`` when the hour is unknown, otherwise a ``datetime``.
    """
    year, month, day = l[:3]
    hour = l[3] if len(l) > 3 else -1
    if hour == -1:
        return date(year, month, day)
    if hour == 24:
        # datetime() rejects hour 24; it is 00:00 of the following day.
        return datetime(year, month, day) + timedelta(days=1)
    return datetime(year, month, day, hour)
def GetData():
    """Load the stored records, add newer announcements and save the result.

    Pages of the announcement list are walked from page 1 until an entry is
    reached that is not newer than the newest stored record (or than
    2019-01-01 when nothing is stored), or until a page has no entries.

    Returns:
        The full list of hero records, newest first.
    """
    print("初始化数据库...")
    data = RWJson()
    newData = []
    print("数据库中已有数据数:%d 条,开始更新..." % len(data))
    # Explicit time fields: "%X" depends on the active locale.
    timeFormat = '%Y-%m-%d %H:%M:%S'
    if data:
        lastDate = datetime.strptime(data[0]['pubtime'], timeFormat)
    else:
        lastDate = datetime(2019, 1, 1)
    pageNum = 1
    loopFlag = True
    while loopFlag:
        print("解析第 %d 页..." % pageNum)
        lTopic = GetTopic(pageNum)
        if not lTopic:
            # Ran past the last page; without this the loop never ends.
            break
        for topic in lTopic:
            res = ParseTopic(topic)
            if datetime.strptime(res['pubtime'], timeFormat) <= lastDate:
                loopFlag = False
                break
            if not res['isHero']:
                continue
            print(res)
            newData.append(res)
        pageNum += 1
    # New records are already newest-first; put them in front in one step.
    data[:0] = newData
    if not newData:
        print("数据库为最新~")
    RWJson(False, data)
    print("初始化完成")
    return data
def PrintChart(lines):
    """Placeholder for chart output; currently does nothing.

    Args:
        lines: unused.
    """
    return None
def _AskDate(prompt, default):
    """Ask for a YYYYMMDD date until the input parses.

    Args:
        prompt: prompt text with one ``%s`` slot for the default date.
        default: datetime returned when the user enters nothing.
    """
    while True:
        text = input(prompt % default.strftime("%Y%m%d"))
        if not text:
            return default
        try:
            return datetime.strptime(text, "%Y%m%d")
        except ValueError:
            print("格式错误,请重新输入")


def _HeroCells(names):
    """Fit a list of hero names into exactly two table cells."""
    cells = list(names[:1])
    if len(names) > 1:
        # More names than columns: share the second cell instead of
        # producing a row that is too long for the table.
        cells.append('/'.join(names[1:]))
    return cells + ['-'] * (2 - len(cells))


def PrintData(data):
    """Print a table of the records whose start day lies in a chosen range.

    The user is asked for a start and end date (defaults: earliest and
    latest start day in ``data``) and whether periods longer than seven
    days should be shown.

    Args:
        data: list of hero records as produced by GetData.
    """
    if not data:
        print("暂无数据")
        return
    # Start day of every record at midnight, so that plain dates and
    # datetimes with an hour compare consistently.
    startDays = []
    for line in data:
        st = ListToDate(line["starttime"])
        startDays.append(datetime(st.year, st.month, st.day))
    starttime = _AskDate('开始日期(YYYYMMDD)(默认为%s):', min(startDays))
    endtime = _AskDate('结束日期(YYYYMMDD)(默认为%s):', max(startDays))
    # bool() of any non-empty answer, including "0", would be True.
    showAct = input("是否显示活动up(是(1),否(0))(默认为0):").strip() == "1"
    table = PrettyTable(
        ['序号', '公告时间', '开始时间', '结束时间', '持续天数', '钻石01', '钻石02', '恒金01', '恒金02'])
    num = 1
    for line, startDay in zip(data, startDays):
        # The records are not sorted by start day, so out-of-range records
        # are skipped one by one rather than ending the loop.
        if not starttime <= startDay <= endtime:
            continue
        st = ListToDate(line["starttime"])
        et = ListToDate(line["endtime"])
        if not isinstance(et, datetime):
            # An end without an hour counts from midnight.
            et = datetime(et.year, et.month, et.day)
        deltaDays = (et - startDay).days
        if deltaDays > 7 and not showAct:
            continue
        pubTime = datetime.strptime(line['pubtime'], "%Y-%m-%d %H:%M:%S")
        lTemp = [num, pubTime.strftime("%Y-%m-%d %a %X")]
        # Test against datetime: datetime is a subclass of date, so a test
        # against date could not tell the two apart.
        if isinstance(st, datetime):
            lTemp.append(st.strftime("%Y-%m-%d %a %H") + "时")
        else:
            lTemp.append(st.strftime("%Y-%m-%d %a ") + "更新后")
        lTemp.append(et.strftime("%Y-%m-%d %a %H") + "时")
        lTemp.append(deltaDays)
        lTemp += _HeroCells(line['heros'][0])
        lTemp += _HeroCells(line['heros'][1])
        table.add_row(lTemp)
        num += 1
    print(table)
def EqPinYin(str1, str2):
    """Tell whether two strings have the same lazy_pinyin spelling.

    Each string is passed to ``lazy_pinyin`` and the returned pieces are
    concatenated; the two concatenations are then compared.

    Args:
        str1: first string.
        str2: second string.

    Returns:
        True when both concatenated spellings are equal.
    """
    spelled = [''.join(lazy_pinyin(text)) for text in (str1, str2)]
    return spelled[0] == spelled[1]
def SearchHero(data):
    """Interactively list every period in which a given hero appeared.

    Keeps asking for a hero name until an empty line is entered. Names are
    matched with EqPinYin. For each match, oldest first, the table shows
    the start day, the duration in days, the days elapsed since the start
    and the days since the previous matching start.

    Args:
        data: list of hero records, newest first.
    """
    while True:
        heroName = input("输入你想要查找的英雄(无输入退出):")
        if not heroName:
            return
        table = PrettyTable(['开始时间', '持续时间', '距今时间', '距上次时间'])
        trueName = ''
        found = False
        previous = None
        for record in reversed(data):
            matches = [name for name in chain.from_iterable(record['heros'])
                       if EqPinYin(name, heroName)]
            if not matches:
                continue
            found = True
            if not trueName:
                # Remember the stored spelling of the first match.
                trueName = next((name for name in matches if name), trueName)
            began = ListToDate(record["starttime"])
            began = datetime(began.year, began.month, began.day)
            ended = ListToDate(record["endtime"])
            duration = (ended - began).days
            elapsed = (datetime.now() - began).days
            sincePrevious = (began - previous).days if previous else '-'
            table.add_row([began.strftime("%Y-%m-%d %a"), duration,
                           elapsed, sincePrevious])
            previous = began
        if found:
            print("%s:" % trueName)
            print(table)
        else:
            print("未查到此英雄“%s”,请重新输入" % heroName)
def NotUptime(data):
    """Interactively show how long each hero of one tier has gone without a period.

    Keeps asking for a tier ("0" or "1") until anything else is entered.
    Periods lasting more than seven days are ignored. For every remaining
    hero the days since the start of the last listed period are shown,
    largest first.

    Args:
        data: list of hero records, newest first.
    """
    print("注:不包括所有首发活动up")
    while True:
        choice = input("查看英雄类别(钻石(0),恒晶(1))(其他输入退出):")
        if choice != '0' and choice != '1':
            return
        tier = int(choice)
        # Walking oldest to newest, later records overwrite earlier ones.
        lastStart = {}
        for record in reversed(data):
            names = record['heros'][tier]
            began = ListToDate(record["starttime"])
            began = datetime(began.year, began.month, began.day)
            ended = ListToDate(record["endtime"])
            if (ended - began).days > 7:
                continue
            for name in names:
                lastStart[name] = began
        now = datetime.now()
        rows = sorted(
            ([name, (now - began).days] for name, began in lastStart.items()),
            key=lambda row: row[1], reverse=True)
        table = PrettyTable(['英雄名称', '距今时间'])
        for row in rows:
            table.add_row(row)
        print(table)
def Interact(data):
    """Run the main menu until the user enters "0".

    "1" calls PrintData, "2" calls SearchHero and "3" calls NotUptime, each
    with ``data``; any other input shows the menu again.

    Args:
        data: list of hero records passed on to the chosen action.
    """
    menu = "你想要...:\n(1)查看过往up信息\n(2)查看某英雄UP日期\n(3)查看英雄持续未up天数\n(0)退出\n"
    actions = {"1": PrintData, "2": SearchHero, "3": NotUptime}
    while True:
        chNum = input(menu)
        if chNum == "0":
            return
        action = actions.get(chNum)
        if action is not None:
            action(data)
if __name__ == "__main__":
    # The data file is the path this script was started with.
    FILE_NAME = sys.argv[0]
    data = GetData()
    try:
        Interact(data)
    except KeyboardInterrupt:
        # Ctrl-C inside the menu is treated as a normal way to leave.
        pass
    print("感谢使用~")
- # res = RWJson()
- # print(res)
- # res.append({'\测试': '测试\n', ' 测试测试 ': ['测试 ', ' 测试']})
- # RWJson(False, res)
- # 无法识别数据(需要手动填写):
- # 1、https://www.taptap.com/topic/10792651
- # {"pubtime": "2020-03-19 15:18:11", "isHero": true, "heros": [[], ["诗寇蒂"]], "starttime": [2020, 3, 20, -1], "endtime": [2020, 3, 27, 0]},
- # 2、
- # 以下用于存储数据,避免每次运行时重新爬取/误删外部数据
- ## DON NOT MODIFY ##
- """
- [
- {"pubtime": "2020-04-27 11:55:46", "isHero": true, "heros": [["莉莉", "希帕提娅"], ["塔利亚"]], "starttime": [2020, 5, 1, 0], "endtime": [2020, 5, 8, 0]},
- {"pubtime": "2020-04-27 11:48:33", "isHero": true, "heros": [[], ["芬里尔·魔"]], "starttime": [2020, 4, 30, -1], "endtime": [2020, 5, 22, 16]},
- {"pubtime": "2020-04-20 09:46:07", "isHero": true, "heros": [["夏莉", "松"], ["诗寇蒂"]], "starttime": [2020, 4, 24, 0], "endtime": [2020, 5, 1, 0]},
- {"pubtime": "2020-04-13 10:52:56", "isHero": true, "heros": [["异邦人", "拉弥亚"], ["希尔芙"]], "starttime": [2020, 4, 17, 0], "endtime": [2020, 4, 24, 0]},
- {"pubtime": "2020-04-07 12:03:37", "isHero": true, "heros": [[], ["达芙妮"]], "starttime": [2020, 4, 9, -1], "endtime": [2020, 5, 1, 16]},
- {"pubtime": "2020-04-07 11:53:58", "isHero": true, "heros": [["茉莉安", "摩伊拉"], ["克里斯汀"]], "starttime": [2020, 4, 10, 0], "endtime": [2020, 4, 17, 0]},
- {"pubtime": "2020-03-31 11:17:57", "isHero": true, "heros": [[], ["希露达", "潘多拉"]], "starttime": [2020, 4, 3, 0], "endtime": [2020, 4, 10, 0]},
- {"pubtime": "2020-03-30 13:14:24", "isHero": true, "heros": [["尼采", "索尔"], []], "starttime": [2020, 4, 3, 0], "endtime": [2020, 4, 10, 0]},
- {"pubtime": "2020-03-23 11:22:44", "isHero": true, "heros": [[], ["莉莉丝", "塔利亚"]], "starttime": [2020, 3, 27, 0], "endtime": [2020, 4, 3, 0]},
- {"pubtime": "2020-03-23 11:17:52", "isHero": true, "heros": [["芙蕾雅", "奥维德"], []], "starttime": [2020, 3, 27, 0], "endtime": [2020, 4, 3, 0]},
- {"pubtime": "2020-03-19 15:18:11", "isHero": true, "heros": [[], ["诗寇蒂"]], "starttime": [2020, 3, 20, -1], "endtime": [2020, 3, 27, 0]},
- {"pubtime": "2020-03-16 10:12:24", "isHero": true, "heros": [["布伦希尔德", "艾薇尔"], ["薇薇安"]], "starttime": [2020, 3, 20, 0], "endtime": [2020, 3, 27, 0]},
- {"pubtime": "2020-03-09 10:39:07", "isHero": true, "heros": [["欧若拉", "阿塔兰忒"], ["希尔芙"]], "starttime": [2020, 3, 13, 0], "endtime": [2020, 3, 20, 0]},
- {"pubtime": "2020-03-02 10:55:21", "isHero": true, "heros": [["希帕提娅", "菲碧"], ["莉莉丝"]], "starttime": [2020, 3, 6, 0], "endtime": [2020, 3, 13, 0]},
- {"pubtime": "2020-02-24 10:44:23", "isHero": true, "heros": [["莉莉", "海尼尔"], ["塔利亚"]], "starttime": [2020, 2, 28, 0], "endtime": [2020, 3, 6, 0]},
- {"pubtime": "2020-02-17 15:28:25", "isHero": true, "heros": [["云翘", "尤里乌斯二世"], ["希露达"]], "starttime": [2020, 2, 21, 0], "endtime": [2020, 2, 28, 0]},
- {"pubtime": "2020-02-11 14:12:59", "isHero": true, "heros": [["拉弥亚"], ["诗寇蒂"]], "starttime": [2020, 2, 14, 0], "endtime": [2020, 2, 21, 0]},
- {"pubtime": "2020-02-11 13:58:15", "isHero": true, "heros": [["苏鲁特"], ["潘多拉"]], "starttime": [2020, 2, 13, 16], "endtime": [2020, 3, 6, 16]},
- {"pubtime": "2020-02-03 10:44:14", "isHero": true, "heros": [["松"], ["希尔芙"]], "starttime": [2020, 2, 7, 0], "endtime": [2020, 2, 14, 0]},
- {"pubtime": "2020-01-29 09:42:16", "isHero": true, "heros": [["茉莉安", "索尔"], ["莉莉丝"]], "starttime": [2020, 1, 31, 0], "endtime": [2020, 2, 7, 0]},
- {"pubtime": "2020-01-20 10:04:04", "isHero": true, "heros": [["夏莉", "尼采"], ["希露达"]], "starttime": [2020, 1, 24, 0], "endtime": [2020, 1, 31, 0]},
- {"pubtime": "2020-01-19 17:17:09", "isHero": true, "heros": [[], ["薇薇安"]], "starttime": [2020, 1, 21, 15], "endtime": [2020, 2, 12, 15]},
- {"pubtime": "2020-01-15 10:51:51", "isHero": true, "heros": [["摩伊拉"], ["塔利亚"]], "starttime": [2020, 1, 17, 0], "endtime": [2020, 1, 24, 0]},
- {"pubtime": "2020-01-07 11:45:47", "isHero": true, "heros": [["奥维德"], ["诗寇蒂"]], "starttime": [2020, 1, 10, 0], "endtime": [2020, 1, 17, 0]},
- {"pubtime": "2019-12-30 14:26:55", "isHero": true, "heros": [["异邦人"], ["希尔芙"]], "starttime": [2020, 1, 3, 0], "endtime": [2020, 1, 10, 0]},
- {"pubtime": "2019-12-24 15:22:59", "isHero": true, "heros": [["妮可"], []], "starttime": [2019, 12, 25, 16], "endtime": [2020, 1, 10, 16]},
- {"pubtime": "2019-12-23 14:04:19", "isHero": true, "heros": [[], ["莉莉丝"]], "starttime": [2019, 12, 27, 0], "endtime": [2020, 1, 3, 0]},
- {"pubtime": "2019-12-12 16:26:49", "isHero": true, "heros": [[], ["塔利亚"]], "starttime": [2019, 12, 13, 0], "endtime": [2019, 12, 20, 0]},
- {"pubtime": "2019-12-03 15:40:28", "isHero": true, "heros": [["芙蕾雅"], ["诗寇蒂"]], "starttime": [2019, 12, 6, 0], "endtime": [2019, 12, 13, 0]},
- {"pubtime": "2019-11-21 19:48:47", "isHero": true, "heros": [["欧若拉"], ["希尔芙"]], "starttime": [2019, 11, 22, 0], "endtime": [2019, 11, 28, 24]}
- ]
- """
- ## DON NOT MODIFY ##
Add Comment
Please, Sign In to add comment