Guest User

Untitled

a guest
May 1st, 2020
257
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 17.07 KB | None | 0 0
  1. # 邂逅在迷宫up池爬虫
  2. # 作者: Chillax
  3. # 请勿以任何形式商用
  4. # 转载请注明出处:
  5.  
  6. # v1.0 2020-4-17
  7.  
  8. # v1.1 2020-4-27 增加了匹配时间的正则表达式的完备性
  9.  
  10. import json
  11. import sys
  12. import requests
  13. from bs4 import BeautifulSoup
  14. import lxml
  15. import re
  16. from datetime import datetime
  17. from datetime import date
  18. from datetime import timedelta
  19. from prettytable import PrettyTable
  20. from pypinyin import lazy_pinyin
  21. from itertools import chain
  22. from collections import defaultdict
  23.  
  24. FLAG_STRING = "## DON NOT MODIFY ##"
  25. FILE_NAME = ""
  26.  
  27.  
  28. def RWJson(READ=True, jsonObj=''):
  29.     # 为了兼顾可读性与节省空间,限制jsonobj必须为list
  30.     codeLines = []
  31.     jsonLines = []
  32.     with open(FILE_NAME, 'r', encoding="utf-8") as f:
  33.         flag = False
  34.         for line in f.readlines():
  35.             if not flag and line != FLAG_STRING + '\n':
  36.                 codeLines.append(line)
  37.             elif line == FLAG_STRING + '\n':
  38.                 flag = True
  39.             elif flag:
  40.                 jsonLines.append(line)
  41.     jsonLines = jsonLines[1:-1]
  42.  
  43.     if READ:
  44.         try:
  45.             js = json.loads(''.join(jsonLines))
  46.         except:
  47.             print("can't loads:\n", jsonLines, end='\n' + "-" * 80 + '\n')
  48.             return []
  49.         return js
  50.     else:
  51.         jsonLines = [FLAG_STRING, '"""', '[', ']', '"""', FLAG_STRING]
  52.         for i in jsonObj:
  53.             jsonLines.insert(-3, json.dumps(i, ensure_ascii=False)+',')
  54.         jsonLines[-4] = jsonLines[-4][:-1]
  55.  
  56.         with open(FILE_NAME, 'w', encoding="utf-8") as f:
  57.             for line in codeLines:
  58.                 f.write(line)
  59.             for line in jsonLines:
  60.                 f.write(line + '\n')
  61.         return True
  62.  
  63.  
  64. def GetTopic(pageNum):
  65.     url = "https://www.taptap.com/app/67245/topic?type=official&sort=created&page=%d" % pageNum
  66.     r = requests.get(url)
  67.     bs = BeautifulSoup(r.text, 'lxml')
  68.  
  69.     # 公告列表(上限15)
  70.     lTopic = list(bs.find("div", {"class": "data-list"}).children)
  71.     lTopic = [i for i in lTopic if i != '\n']  # 筛掉莫名其妙的空行
  72.     return lTopic
  73.  
  74.  
  75. def ParsePage(url):
  76.     dRes = {}
  77.  
  78.     r = requests.get(url)
  79.     bs = BeautifulSoup(r.text, 'lxml')
  80.  
  81.     content = ''
  82.     content += bs.find('div', {'class': 'topic-content'}
  83.                        ).find('div', {'class': 'top-title-author'}).text  # 标题文本
  84.     content += bs.find('div', {'class': 'topic-content'}
  85.                        ).find('div', {'class': 'bbcode-body'}).text  # 正文文本
  86.  
  87.     heros = [[], []]
  88.     heros[0] = re.findall("【([^【】]*?)】up卡池", content)
  89.     heros[0] = list(set(heros[0]))
  90.     heros[1] = re.findall("【([^【】]*?)】恒晶封印", content)
  91.     heros[1] = list(set(heros[1]))
  92.  
  93.     if not heros[0] and not heros[1]:
  94.         print("无法解析本公告,请手动解析添加,公告内容:\n%s" % content.strip())
  95.         while 1:
  96.             try:
  97.                 for i in range(int(input("这里面有几个英雄?"))):
  98.                     if int(input("第%d个英雄是钻石(0)还是恒金(1)?" % i + 1)):
  99.                         heros[1].append(input("TA的名字:"))
  100.                     else:
  101.                         heros[0].append(input("TA的名字:"))
  102.                 break
  103.             except:
  104.                 print("输入格式错误,请重新输入")
  105.                 heros = [[], []]
  106.  
  107.     dRes['heros'] = heros
  108.  
  109.     lTime = list(re.findall(
  110.         "(\d*?)[年/]?(\d*?)[月/](\d*?)[日 ] ?(\d*?)[时:更]\d*?\D*?\d*?\D*?(\d*?)[年/]?(\d*?)[月/](\d*?)[日 ](\d*?)[时:]", content)[0])
  111.     dRes['time'] = [int(i) if i.isdigit() else -1 for i in lTime]
  112.  
  113.     return dRes
  114.  
  115.  
  116. def ParseTopic(topic):
  117.     # {
  118.     #     "pubtime": "%Y-%m-%d %X",
  119.     #     "isHero": True,
  120.     #     "heros": [[hero1,][hero2,]],
  121.     #     "starttime": [%Y, %m, %d[, %H]],
  122.     #     "endtime": [%Y, %m, %d, %H]
  123.     # }
  124.     dRes = {}
  125.     try:
  126.         pubTime = topic.find("span", {"class": "item-publish-time"}).text
  127.     except:
  128.         print(topic)
  129.     tPubTime = datetime.strptime(
  130.         pubTime, '%Y-%m-%d %X')
  131.     # sPubTime = tPubTime.strftime("%F, %a, %T")
  132.     dRes['pubtime'] = str(tPubTime)
  133.  
  134.     isHero = 'SSR' in topic.text and '限时' in topic.text
  135.     dRes['isHero'] = isHero
  136.  
  137.     if not isHero:
  138.         return dRes
  139.  
  140.     url = topic.find('a', {"class": "taptap-btn-link"})['href']
  141.     dTemp = ParsePage(url)
  142.     dRes['heros'] = dTemp['heros']
  143.     dRes['starttime'] = dTemp['time'][:4]
  144.     dRes['endtime'] = dTemp['time'][4:]
  145.  
  146.     if dRes['endtime'][0] == -1 and dRes['starttime'][0] == -1:
  147.         dRes['endtime'][0] = dRes['starttime'][0] = tPubTime.year
  148.     if dRes['endtime'][0] == -1:
  149.         if dRes['starttime'][1:3] <= dRes['endtime'][1:3]:
  150.             dRes['endtime'][0] = dRes['starttime'][0]
  151.         else:
  152.             dRes['endtime'][0] = dRes['starttime'][0] + 1
  153.  
  154.     return dRes
  155.  
  156.  
  157. def ListToDate(l):
  158.     if l[3] == -1:
  159.         return date(l[0], l[1], l[2])
  160.     else:
  161.         if l[3] == 24:
  162.             return datetime(l[0], l[1], l[2], l[3] - 1) + timedelta(hours=1)
  163.         return datetime(l[0], l[1], l[2], l[3])
  164.  
  165.  
  166. def GetData():
  167.     print("初始化数据库...")
  168.  
  169.     data = RWJson()
  170.     newData = []
  171.     print("数据库中已有数据数:%d 条,开始更新..." % len(data))
  172.  
  173.     if data:
  174.         lastDate = datetime.strptime(data[0]['pubtime'], '%Y-%m-%d %X')
  175.     else:
  176.         lastDate = datetime(2019, 1, 1)
  177.  
  178.     pageNum = 1
  179.     loopFlag = True
  180.     while loopFlag:
  181.         print("解析第 %d 页..." % pageNum)
  182.         lTopic = GetTopic(pageNum)
  183.  
  184.         for topic in lTopic:
  185.             res = ParseTopic(topic)
  186.  
  187.             if datetime.strptime(res['pubtime'], "%Y-%m-%d %X") <= lastDate:
  188.                 loopFlag = False
  189.                 break
  190.             if not res['isHero']:
  191.                 continue
  192.  
  193.             print(res)
  194.             newData.append(res)
  195.  
  196.         pageNum += 1
  197.  
  198.     for res in newData[::-1]:
  199.         data.insert(0, res)
  200.  
  201.     if not newData:
  202.         print("数据库为最新~")
  203.  
  204.     RWJson(False, data)
  205.     print("初始化完成")
  206.     return data
  207.  
  208.  
  209. def PrintChart(lines):
  210.     pass
  211.  
  212.  
  213. def PrintData(data):
  214.     minDate = ListToDate(data[-1]['starttime'])
  215.     maxDate = ListToDate(data[0]['starttime'])
  216.     while True:
  217.         try:
  218.             starttime = input('开始日期(YYYYMMDD)(默认为%s):' %
  219.                               minDate.strftime("%Y%m%d"))
  220.             if not starttime:
  221.                 starttime = minDate
  222.             else:
  223.                 starttime = datetime.strptime(starttime, "%Y%m%d")
  224.             break
  225.         except ValueError:
  226.             print("格式错误,请重新输入")
  227.     while True:
  228.         try:
  229.             endtime = input('结束日期(YYYYMMDD)(默认为%s):' %
  230.                             maxDate.strftime("%Y%m%d"))
  231.             if not endtime:
  232.                 endtime = maxDate
  233.             else:
  234.                 endtime = datetime.strptime(endtime, "%Y%m%d")
  235.             break
  236.         except ValueError:
  237.             print("格式错误,请重新输入")
  238.     showAct = bool(input("是否显示活动up(是(1),否(0))(默认为0):"))
  239.  
  240.     # heroNum = int(input("英雄类别(恒金(1),钻石(0),全都要(2))(默认为2):"))
  241.  
  242.     table = PrettyTable(
  243.         ['序号', '公告时间', '开始时间', '结束时间', '持续天数', '钻石01', '钻石02', '恒金01', '恒金02'])
  244.     num = 1
  245.     for line in data:
  246.         st = ListToDate(line["starttime"])
  247.         et = ListToDate(line["endtime"])
  248.         if datetime(st.year, st.month, st.day) < starttime:
  249.             continue
  250.         if datetime(st.year, st.month, st.day) > endtime:
  251.             break
  252.  
  253.         lTemp = [num]
  254.  
  255.         lTemp.append(datetime.strptime(
  256.             line['pubtime'], "%Y-%m-%d %X").strftime("%Y-%m-%d %a %X"))
  257.  
  258.         if not isinstance(st, datetime):  # 用date区分不出来,应该是子类关系导致的
  259.             lTemp.append(st.strftime("%Y-%m-%d %a ") + "更新后")
  260.         else:
  261.             lTemp.append(st.strftime("%Y-%m-%d %a %H") + "时")
  262.  
  263.         lTemp.append(et.strftime("%Y-%m-%d %a %H") + "时")
  264.  
  265.         deltaDays = (et - datetime(st.year, st.month, st.day)).days
  266.         if deltaDays > 7 and not showAct:
  267.             continue
  268.         lTemp.append(deltaDays)
  269.  
  270.         tHeros = [line['heros'][0][:], line['heros'][1][:]]
  271.         for i in tHeros:
  272.             i.extend('-' * (2 - len(i)))
  273.             lTemp += i
  274.  
  275.         table.add_row(lTemp)
  276.         num += 1
  277.     print(table)
  278.  
  279.  
  280. def EqPinYin(str1, str2):
  281.     py1 = ''.join(lazy_pinyin(str1))
  282.     py2 = ''.join(lazy_pinyin(str2))
  283.     return py1 == py2
  284.  
  285.  
  286. def SearchHero(data):
  287.     while True:
  288.         heroName = input("输入你想要查找的英雄(无输入退出):")
  289.         if not heroName:
  290.             return
  291.         table = PrettyTable(['开始时间', '持续时间', '距今时间', '距上次时间'])
  292.         heroExist = False
  293.  
  294.         preDate = None
  295.         trueName = ''
  296.         for line in data[::-1]:
  297.             heros = list(chain.from_iterable(line['heros']))
  298.             flag = False
  299.             for h in heros:
  300.                 if EqPinYin(h, heroName):
  301.                     flag = True
  302.                     heroExist = True
  303.                     if not trueName:
  304.                         trueName = h
  305.             if not flag:
  306.                 continue
  307.  
  308.             st = ListToDate(line["starttime"])
  309.             st = datetime(st.year, st.month, st.day)
  310.             et = ListToDate(line["endtime"])
  311.             lTemp = []
  312.  
  313.             lTemp.append(st.strftime("%Y-%m-%d %a"))
  314.  
  315.             deltaDays = (et - st).days
  316.             lTemp.append(deltaDays)
  317.  
  318.             deltaDays = (datetime.now() - st).days
  319.             lTemp.append(deltaDays)
  320.  
  321.             if preDate:
  322.                 deltaDays = (st - preDate).days
  323.                 lTemp.append(deltaDays)
  324.             else:
  325.                 lTemp.append('-')
  326.             preDate = st
  327.  
  328.             table.add_row(lTemp)
  329.         # 不支持中文键值
  330.         # print(table.get_string(sortby="开始日期", reversesort=True))
  331.         # table.sortby = "开始日期"
  332.         if heroExist:
  333.             print("%s:" % trueName)
  334.             print(table)
  335.         else:
  336.             print("未查到此英雄“%s”,请重新输入" % heroName)
  337.  
  338.  
  339. def NotUptime(data):
  340.     print("注:不包括所有首发活动up")
  341.     while True:
  342.         heroCate = input("查看英雄类别(钻石(0),恒晶(1))(其他输入退出):")
  343.         if heroCate not in ['0', '1']:
  344.             return
  345.         heroCate = int(heroCate)
  346.  
  347.         table = PrettyTable(['英雄名称', '距今时间'])
  348.  
  349.         preDate = defaultdict(list)
  350.         for line in data[::-1]:
  351.             heros = line['heros'][heroCate]
  352.  
  353.             st = ListToDate(line["starttime"])
  354.             st = datetime(st.year, st.month, st.day)
  355.             et = ListToDate(line["endtime"])
  356.             if (et - st).days > 7:
  357.                 continue
  358.  
  359.             for h in heros:
  360.                 preDate[h].append(st)
  361.  
  362.         lTable = []
  363.         nowDate = datetime.now()
  364.         for k, v in preDate.items():
  365.             lTable.append([k, (nowDate - v[-1]).days])
  366.         lTable.sort(key=lambda x: x[1], reverse=True)
  367.  
  368.         for i in lTable:
  369.             table.add_row(i)
  370.  
  371.         print(table)
  372.  
  373.  
  374. def Interact(data):
  375.     while True:
  376.         chNum = input(
  377.             "你想要...:\n(1)查看过往up信息\n(2)查看某英雄UP日期\n(3)查看英雄持续未up天数\n(0)退出\n")
  378.         if chNum == "0":
  379.             return
  380.         elif chNum == "1":
  381.             PrintData(data)
  382.         elif chNum == "2":
  383.             SearchHero(data)
  384.         elif chNum == "3":
  385.             NotUptime(data)
  386.         else:
  387.             continue
  388.  
  389.  
# Script entry point: refresh the embedded data, then run the interactive menu.
if __name__ == "__main__":
    # The data is stored inside this very script, so RWJson reads and
    # rewrites the file the interpreter was started with.
    FILE_NAME = sys.argv[0]

    data = GetData()
    try:
        Interact(data)
    except KeyboardInterrupt:
        # Ctrl-C inside the menu is treated as a normal exit.
        pass
    print("感谢使用~")
    # Manual round-trip check for RWJson (kept disabled):
    # res = RWJson()
    # print(res)
    # res.append({'\测试': '测试\n', ' 测试测试 ': ['测试 ', ' 测试']})
    # RWJson(False, res)
  403.  
  404. # 无法识别数据(需要手动填写):
  405. # 1、https://www.taptap.com/topic/10792651
  406. # {"pubtime": "2020-03-19 15:18:11", "isHero": true, "heros": [[], ["诗寇蒂"]], "starttime": [2020, 3, 20, -1], "endtime": [2020, 3, 27, 0]},
  407. # 2、
  408.  
  409. # 以下用于存储数据,避免每次运行时重新爬取/误删外部数据
  410. ## DON NOT MODIFY ##
  411. """
  412. [
  413. {"pubtime": "2020-04-27 11:55:46", "isHero": true, "heros": [["莉莉", "希帕提娅"], ["塔利亚"]], "starttime": [2020, 5, 1, 0], "endtime": [2020, 5, 8, 0]},
  414. {"pubtime": "2020-04-27 11:48:33", "isHero": true, "heros": [[], ["芬里尔·魔"]], "starttime": [2020, 4, 30, -1], "endtime": [2020, 5, 22, 16]},
  415. {"pubtime": "2020-04-20 09:46:07", "isHero": true, "heros": [["夏莉", "松"], ["诗寇蒂"]], "starttime": [2020, 4, 24, 0], "endtime": [2020, 5, 1, 0]},
  416. {"pubtime": "2020-04-13 10:52:56", "isHero": true, "heros": [["异邦人", "拉弥亚"], ["希尔芙"]], "starttime": [2020, 4, 17, 0], "endtime": [2020, 4, 24, 0]},
  417. {"pubtime": "2020-04-07 12:03:37", "isHero": true, "heros": [[], ["达芙妮"]], "starttime": [2020, 4, 9, -1], "endtime": [2020, 5, 1, 16]},
  418. {"pubtime": "2020-04-07 11:53:58", "isHero": true, "heros": [["茉莉安", "摩伊拉"], ["克里斯汀"]], "starttime": [2020, 4, 10, 0], "endtime": [2020, 4, 17, 0]},
  419. {"pubtime": "2020-03-31 11:17:57", "isHero": true, "heros": [[], ["希露达", "潘多拉"]], "starttime": [2020, 4, 3, 0], "endtime": [2020, 4, 10, 0]},
  420. {"pubtime": "2020-03-30 13:14:24", "isHero": true, "heros": [["尼采", "索尔"], []], "starttime": [2020, 4, 3, 0], "endtime": [2020, 4, 10, 0]},
  421. {"pubtime": "2020-03-23 11:22:44", "isHero": true, "heros": [[], ["莉莉丝", "塔利亚"]], "starttime": [2020, 3, 27, 0], "endtime": [2020, 4, 3, 0]},
  422. {"pubtime": "2020-03-23 11:17:52", "isHero": true, "heros": [["芙蕾雅", "奥维德"], []], "starttime": [2020, 3, 27, 0], "endtime": [2020, 4, 3, 0]},
  423. {"pubtime": "2020-03-19 15:18:11", "isHero": true, "heros": [[], ["诗寇蒂"]], "starttime": [2020, 3, 20, -1], "endtime": [2020, 3, 27, 0]},
  424. {"pubtime": "2020-03-16 10:12:24", "isHero": true, "heros": [["布伦希尔德", "艾薇尔"], ["薇薇安"]], "starttime": [2020, 3, 20, 0], "endtime": [2020, 3, 27, 0]},
  425. {"pubtime": "2020-03-09 10:39:07", "isHero": true, "heros": [["欧若拉", "阿塔兰忒"], ["希尔芙"]], "starttime": [2020, 3, 13, 0], "endtime": [2020, 3, 20, 0]},
  426. {"pubtime": "2020-03-02 10:55:21", "isHero": true, "heros": [["希帕提娅", "菲碧"], ["莉莉丝"]], "starttime": [2020, 3, 6, 0], "endtime": [2020, 3, 13, 0]},
  427. {"pubtime": "2020-02-24 10:44:23", "isHero": true, "heros": [["莉莉", "海尼尔"], ["塔利亚"]], "starttime": [2020, 2, 28, 0], "endtime": [2020, 3, 6, 0]},
  428. {"pubtime": "2020-02-17 15:28:25", "isHero": true, "heros": [["云翘", "尤里乌斯二世"], ["希露达"]], "starttime": [2020, 2, 21, 0], "endtime": [2020, 2, 28, 0]},
  429. {"pubtime": "2020-02-11 14:12:59", "isHero": true, "heros": [["拉弥亚"], ["诗寇蒂"]], "starttime": [2020, 2, 14, 0], "endtime": [2020, 2, 21, 0]},
  430. {"pubtime": "2020-02-11 13:58:15", "isHero": true, "heros": [["苏鲁特"], ["潘多拉"]], "starttime": [2020, 2, 13, 16], "endtime": [2020, 3, 6, 16]},
  431. {"pubtime": "2020-02-03 10:44:14", "isHero": true, "heros": [["松"], ["希尔芙"]], "starttime": [2020, 2, 7, 0], "endtime": [2020, 2, 14, 0]},
  432. {"pubtime": "2020-01-29 09:42:16", "isHero": true, "heros": [["茉莉安", "索尔"], ["莉莉丝"]], "starttime": [2020, 1, 31, 0], "endtime": [2020, 2, 7, 0]},
  433. {"pubtime": "2020-01-20 10:04:04", "isHero": true, "heros": [["夏莉", "尼采"], ["希露达"]], "starttime": [2020, 1, 24, 0], "endtime": [2020, 1, 31, 0]},
  434. {"pubtime": "2020-01-19 17:17:09", "isHero": true, "heros": [[], ["薇薇安"]], "starttime": [2020, 1, 21, 15], "endtime": [2020, 2, 12, 15]},
  435. {"pubtime": "2020-01-15 10:51:51", "isHero": true, "heros": [["摩伊拉"], ["塔利亚"]], "starttime": [2020, 1, 17, 0], "endtime": [2020, 1, 24, 0]},
  436. {"pubtime": "2020-01-07 11:45:47", "isHero": true, "heros": [["奥维德"], ["诗寇蒂"]], "starttime": [2020, 1, 10, 0], "endtime": [2020, 1, 17, 0]},
  437. {"pubtime": "2019-12-30 14:26:55", "isHero": true, "heros": [["异邦人"], ["希尔芙"]], "starttime": [2020, 1, 3, 0], "endtime": [2020, 1, 10, 0]},
  438. {"pubtime": "2019-12-24 15:22:59", "isHero": true, "heros": [["妮可"], []], "starttime": [2019, 12, 25, 16], "endtime": [2020, 1, 10, 16]},
  439. {"pubtime": "2019-12-23 14:04:19", "isHero": true, "heros": [[], ["莉莉丝"]], "starttime": [2019, 12, 27, 0], "endtime": [2020, 1, 3, 0]},
  440. {"pubtime": "2019-12-12 16:26:49", "isHero": true, "heros": [[], ["塔利亚"]], "starttime": [2019, 12, 13, 0], "endtime": [2019, 12, 20, 0]},
  441. {"pubtime": "2019-12-03 15:40:28", "isHero": true, "heros": [["芙蕾雅"], ["诗寇蒂"]], "starttime": [2019, 12, 6, 0], "endtime": [2019, 12, 13, 0]},
  442. {"pubtime": "2019-11-21 19:48:47", "isHero": true, "heros": [["欧若拉"], ["希尔芙"]], "starttime": [2019, 11, 22, 0], "endtime": [2019, 11, 28, 24]}
  443. ]
  444. """
  445. ## DON NOT MODIFY ##
Add Comment
Please, Sign In to add comment