Advertisement
DeaD_EyE

wikispy

May 4th, 2019
306
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 15.32 KB | None | 0 0
  1. #!/usr/bin/python3
  2.  
"""
This program is just a simple demonstration of how to
process the EventStream of Wikipedia, which is public to everyone.

Python >= 3.7 is needed (datetime.fromisoformat is used).

More information about EventStream:
https://wikitech.wikimedia.org/wiki/Event_Platform/EventStreams

Information about the JSON schema of recentchange version 2:
https://github.com/wikimedia/mediawiki-event-schemas/blob/master/jsonschema/mediawiki/recentchange/2.yaml

External dependencies are used to reduce my own workload.

aiohttp: https://github.com/aio-libs/aiohttp

aiohttp is a dependency of aiosseclient.

aiosseclient: https://github.com/ebraminio/aiosseclient

To install the dependencies:
    # in a virtual environment
    pip install pytz aiohttp
    pip install git+https://github.com/ebraminio/aiosseclient

    # or as a user, if you're not in a virtual environment
    pip3 install pytz aiohttp --user
    pip3 install git+https://github.com/ebraminio/aiosseclient --user


The aiosseclient is a very simple implementation of an SSE client.
The stream sent by Wikimedia is a JSON string.
Python has a JSON parser in the standard library, which is used.
The URLs are quoted; the unquote function makes the URLs readable.

Fix for ssl-error: https://github.com/aio-libs/aiohttp/issues/3535#issuecomment-483268542
"""
  40.  
import asyncio
import csv
import json
import logging
import ssl
from datetime import datetime as dt
from pathlib import Path
from urllib.parse import quote, unquote

import pytz
from aiosseclient import aiosseclient
  50.  
  51.  
  52. TIMEZONE = pytz.timezone('Europe/Berlin')
  53. # stream url to get information about recent changes
  54. STREAM_URL = 'https://stream.wikimedia.org/v2/stream/recentchange'
  55. # just to know which types are supported
  56. TYPES = ["edit", "new", "log","categorize", "external"]
  57. TYPES_LEN = len(max(TYPES, key=len))
  58. # https://pbs.twimg.com/media/D1cnO9KXgAAV6mf.jpg:large
  59.  
  60. # funtionäre
  61. USERS = {'Hen3ry', 'Horst G.', 'Sargoth', 'XenonX3', 'Alraunenstern'}
  62.  
  63. # admins
  64. USERS |= {
  65.     'Aspirinikis', 'Cymothoa', 'DaB', 'Doc Taxon', 'Drahreg01', 'Felistoria',
  66.     'Filzstift', 'Gereon K', 'JD', 'Kuebi', 'Krizolina', 'Kurator71',
  67.     'Lustiger Seth', 'Partynia', 'Seewolf', 'Sqasher', 'Stefan64', 'XenonX3'
  68.     }
  69.  
  70. # sichter
  71. USERS |= {
  72.     'Abutoum', 'Alnilam', 'Amilamia', 'Andropov', 'Anidaat', 'AnnaS.',
  73.     'Atomiccocktail', 'Benatrevqre', 'Berichtbestatter', 'David Navan',
  74.     'Density', 'Elektrofisch', 'EH42', 'Feliks', 'Fiona B.', 'Fossa',
  75.     'Framhein', 'Gabel1960', 'Gonzo Lubitsch', 'Gustav v.A.', 'Hardenacke',
  76.     'Hot Gadling', 'Hvd69', 'In dubio pro dubio', 'JensB.', '-jkb-', 'Jmb1982',
  77.     'Jonaster', 'JosFritz', 'KaiMarting', 'KarlV', 'Kopilot', 'KurtR',
  78.     'Liberaler Humanist', 'Marcus C.', 'MBurch', 'Miltrak', 'Orik',
  79.     'Perfect Tommy', 'Phi', 'Port(u*o)s', 'SanFran Farmer', 'Sänger',
  80.     'Schwarze Feder', 'Simplicicus', 'Thoma', 'Toter alter Mann', 'Über-Blick',
  81.     'Viciarg', 'Webverbesserer'
  82.     }
  83.  
  84.  
  85.  
  86.  
  87. def ignore_aiohttp_ssl_eror(loop, aiohttpversion='3.5.4'):
  88.     """Ignore aiohttp #3535 issue with SSL data after close
  89.  
  90.    There appears to be an issue on Python 3.7 and aiohttp SSL that throws a
  91.    ssl.SSLError fatal error (ssl.SSLError: [SSL: KRB5_S_INIT] application data
  92.    after close notify (_ssl.c:2609)) after we are already done with the
  93.    connection. See GitHub issue aio-libs/aiohttp#3535
  94.  
  95.    Given a loop, this sets up a exception handler that ignores this specific
  96.    exception, but passes everything else on to the previous exception handler
  97.    this one replaces.
  98.  
  99.    If the current aiohttp version is not exactly equal to aiohttpversion
  100.    nothing is done, assuming that the next version will have this bug fixed.
  101.    This can be disabled by setting this parameter to None
  102.  
  103.    """
  104.     orig_handler = loop.get_exception_handler() or loop.default_exception_handler
  105.  
  106.     def ignore_ssl_error(loop, context):
  107.         if context.get('message') == 'SSL error in data received':
  108.             # validate we have the right exception, transport and protocol
  109.             exception = context.get('exception')
  110.             protocol = context.get('protocol')
  111.             if (
  112.                 isinstance(exception, ssl.SSLError) and exception.reason == 'KRB5_S_INIT' and
  113.                 isinstance(protocol, asyncio.sslproto.SSLProtocol) and
  114.                 isinstance(protocol._app_protocol, aiohttp.client_proto.ResponseHandler)
  115.             ):
  116.                 if loop.get_debug():
  117.                     asyncio.log.logger.debug('Ignoring aiohttp SSL KRB5_S_INIT error')
  118.                 return
  119.         orig_handler(context)
  120.  
  121.  
  122. async def event_generator(stream_url):
  123.     """
  124.    Infinite async stream generator.
  125.    All errors are silently thrown away.
  126.    If you use this function, just look what
  127.    happens, if you got a timeout.
  128.    The exception(s) should catched explicit.
  129.    """
  130.     while True:
  131.         client = aiosseclient(stream_url)
  132.         try:
  133.             async for event in client:
  134.                 yield json.loads(event.data)
  135.         except Exception as e:
  136.             # should be more specific
  137.             pass
  138.  
  139.  
  140. def selector(event, **kwargs):
  141.     """
  142.    This selector returns True, if all values are equal
  143.    to the values of the event. If the value is in callable,
  144.    it's called with the value of the event-field. If the
  145.    result is False, the function returns also False.
  146.    """
  147.     for key, value in kwargs.items():
  148.         event_value = event.get(key)
  149.         if callable(value):
  150.             if not value(event_value):
  151.                 return False
  152.         else:
  153.             if value != event_value:
  154.                 return False
  155.     return True
  156.  
  157.  
  158. def formatter(event):
  159.     """
  160.    Formatting the text and return it together
  161.    with the fields in a tuple.
  162.    """
  163.     uri = unquote(event['meta']['uri'])
  164.     user = event['user']
  165.     revision = event.get('revision')
  166.     timestamp = dt.fromisoformat(event['meta']['dt']).astimezone(TIMEZONE)
  167.     if revision and 'old' in revision and 'new' in revision:
  168.         old = revision['old']
  169.         new = revision['new']
  170.         title = quote(event['title'])
  171.         uri = f'https://de.wikipedia.org/w/index.php?title={title}&type=revision&diff={new}&oldid={old}'
  172.     if len(user) > 15:
  173.         user = user[:15-3]
  174.         user += '...'
  175.     title = event['title']
  176.     typ = event['type']
  177.     if len(title) > 30:
  178.         title = title[:30-3]
  179.         title += '...'
  180.     # a little bit black format magic
  181.     text = f'{timestamp.time()!s} | {user:<15} | {title.title():<30} | {typ:<{TYPES_LEN}} | {uri}'
  182.     return text, (timestamp.isoformat(), user, event['title'], typ, uri)
  183.  
  184.  
  185. async def main():
  186.     """
  187.    Main programm.
  188.    Prepares the filters, iterating over the stream
  189.    and print output to the console.
  190.    """
  191.     # only stupid german servers
  192.     server_name = lambda ev: ev.startswith('de.')
  193.     # filter for users
  194.     users = lambda user: user in USERS
  195.     # users = lambda user: True
  196.     # filter for different types
  197.     types = lambda typ: typ in ['edit', 'categorize', 'new']
  198.     # instanciate the event_generator
  199.     stream = event_generator(STREAM_URL)
  200.     # set the options for the filter
  201.     options = {'bot': False, 'server_name': server_name, 'type': types, 'user': users}
  202.     # iterate over the stream
  203.     csv_file = Path('wikispy.csv')
  204.     if csv_file.exists():
  205.         write_header = False
  206.     else:
  207.         write_header = True
  208.     with open(csv_file, 'a') as fd:
  209.         writer = csv.writer(fd, delimiter=';')
  210.         if write_header:
  211.             writer.writerow(['timestamp', 'user', 'title', 'type', 'uri'])
  212.         async for event in stream:
  213.             # if the event matches the filter, it's printed
  214.             if selector(event, **options):
  215.                 text, fields = formatter(event)
  216.                 # print text, then write to file
  217.                 print(text)
  218.                 writer.writerow(fields)
  219.  
# Fields of an event.
# 'meta' holds a sub-dict with more information.
# ['bot', 'comment', 'id', 'length', 'meta', 'minor', 'namespace',
# 'parsedcomment', 'patrolled', 'revision', 'server_name',
# 'server_script_path', 'server_url', 'timestamp', 'title',
# 'type', 'user', 'wiki']
  226.  
  227. """
  228. The resulting output:
  229.  
  230. 18:34:22 | JosFritz        | Benutzer Diskussion:Andropov   | edit       | https://de.wikipedia.org/w/index.php?title=Benutzer%20Diskussion%3AAndropov&type=revision&diff=188219653&oldid=188219250
  231. 18:34:25 | Kurator71       | Benutzer:Kurator71/Erzurum ... | edit       | https://de.wikipedia.org/w/index.php?title=Benutzer%3AKurator71/Erzurum%20Congress&type=revision&diff=188219655&oldid=188218480
  232. 18:43:10 | Phi             | Ns-Staat                       | edit       | https://de.wikipedia.org/w/index.php?title=NS-Staat&type=revision&diff=188219980&oldid=188216757
  233. 18:45:44 | Sänger          | Benutzer Diskussion:Sänger     | edit       | https://de.wikipedia.org/w/index.php?title=Benutzer%20Diskussion%3AS%C3%A4nger&type=revision&diff=188220046&oldid=188219908
  234. 18:48:03 | Kurator71       | Benutzer:Kurator71/Erzurum ... | edit       | https://de.wikipedia.org/w/index.php?title=Benutzer%3AKurator71/Erzurum%20Congress&type=revision&diff=188220107&oldid=188219655
  235. 18:48:03 | Kurator71       | Kategorie:Türkischer Befrei... | categorize | https://de.wikipedia.org/wiki/Kategorie:Türkischer_Befreiungskrieg
  236. 18:48:03 | Kurator71       | Kategorie:Türkische Militär... | categorize | https://de.wikipedia.org/wiki/Kategorie:Türkische_Militärgeschichte
  237. 18:48:03 | Kurator71       | Kategorie:Mustafa Kemal Ata... | categorize | https://de.wikipedia.org/wiki/Kategorie:Mustafa_Kemal_Atatürk
  238. 18:48:03 | Kurator71       | Kategorie:Erzurum              | categorize | https://de.wikipedia.org/wiki/Kategorie:Erzurum
  239. 18:48:54 | Kurator71       | Benutzer:Kurator71/Erzurum ... | edit       | https://de.wikipedia.org/w/index.php?title=Benutzer%3AKurator71/Erzurum%20Congress&type=revision&diff=188220131&oldid=188220107
  240. 18:50:11 | Partynia        | Benutzer:Partynia              | edit       | https://de.wikipedia.org/w/index.php?title=Benutzer%3APartynia&type=revision&diff=188220167&oldid=188213600
  241. 18:51:26 | JosFritz        | Benutzer Diskussion:Andropov   | edit       | https://de.wikipedia.org/w/index.php?title=Benutzer%20Diskussion%3AAndropov&type=revision&diff=188220201&oldid=188220159
  242. 18:51:53 | Kurator71       | Benutzer:Kurator71/Erzurum ... | edit       | https://de.wikipedia.org/w/index.php?title=Benutzer%3AKurator71/Erzurum%20Congress&type=revision&diff=188220217&oldid=188220131
  243. 18:53:08 | Kurator71       | Erzurum-Kongress               | edit       | https://de.wikipedia.org/w/index.php?title=Erzurum-Kongress&type=revision&diff=188220253&oldid=188220224
  244. 18:53:32 | Kurator71       | Erzurum-Kongress               | edit       | https://de.wikipedia.org/w/index.php?title=Erzurum-Kongress&type=revision&diff=188220265&oldid=188220253
  245. 18:54:13 | Kurator71       | Benutzer:Kurator71/Erzurum ... | edit       | https://de.wikipedia.org/w/index.php?title=Benutzer%3AKurator71/Erzurum%20Congress&type=revision&diff=188220283&oldid=188220225
  246. 18:54:13 | Kurator71       | Kategorie:Wikipedia:Schnell... | categorize | https://de.wikipedia.org/wiki/Kategorie:Wikipedia:Schnelllöschen
  247. 18:54:58 | Kurator71       | Wikipedia:Wikimedia Cee Spr... | edit       | https://de.wikipedia.org/w/index.php?title=Wikipedia%3AWikimedia%20CEE%20Spring%202019&type=revision&diff=188220305&oldid=188217610
  248. 18:56:37 | Partynia        | Benutzer:Partynia/Gesundhei... | edit       | https://de.wikipedia.org/w/index.php?title=Benutzer%3APartynia/Gesundheitswesen%20Neu&type=revision&diff=188220351&oldid=187751173
  249. 18:56:43 | Phi             | Diskussion:Sven Felix Kelle... | edit       | https://de.wikipedia.org/w/index.php?title=Diskussion%3ASven%20Felix%20Kellerhoff&type=revision&diff=188220354&oldid=188210108
  250. 18:57:50 | Hardenacke      | Günther Stempel                | edit       | https://de.wikipedia.org/w/index.php?title=G%C3%BCnther%20Stempel&type=revision&diff=188220383&oldid=188114758
  251. 18:57:50 | Phi             | Diskussion:Sven Felix Kelle... | edit       | https://de.wikipedia.org/w/index.php?title=Diskussion%3ASven%20Felix%20Kellerhoff&type=revision&diff=188220384&oldid=188220354
  252. 18:58:24 | Phi             | Oxymoron                       | edit       | https://de.wikipedia.org/w/index.php?title=Oxymoron&type=revision&diff=188220403&oldid=188220369
  253. 18:59:04 | -jkb-           | Jan Líbezný                    | edit       | https://de.wikipedia.org/w/index.php?title=Jan%20L%C3%ADbezn%C3%BD&type=revision&diff=188220423&oldid=187718616
  254. 18:59:04 | -jkb-           | Kategorie:Wikipedia:Defekte... | categorize | https://de.wikipedia.org/wiki/Kategorie:Wikipedia:Defekte_Weblinks/Ungeprüfte_Archivlinks_2018-04
  255. 18:59:11 | Partynia        | Benutzer:Partynia/Geschichte   | edit       | https://de.wikipedia.org/w/index.php?title=Benutzer%3APartynia/Geschichte&type=revision&diff=188220427&oldid=187217751
  256. 19:03:22 | -jkb-           | Jüdische Gemeinde Loštice      | edit       | https://de.wikipedia.org/w/index.php?title=J%C3%BCdische%20Gemeinde%20Lo%C5%A1tice&type=revision&diff=188220528&oldid=187767523
  257. 19:03:22 | -jkb-           | Kategorie:Wikipedia:Weblink... | categorize | https://de.wikipedia.org/wiki/Kategorie:Wikipedia:Weblink_offline
  258. 19:03:22 | -jkb-           | Kategorie:Wikipedia:Weblink... | categorize | https://de.wikipedia.org/wiki/Kategorie:Wikipedia:Weblink_offline_IABot
  259. 19:03:22 | -jkb-           | Kategorie:Wikipedia:Defekte... | categorize | https://de.wikipedia.org/wiki/Kategorie:Wikipedia:Defekte_Weblinks/Ungeprüfte_Botmarkierungen_2018-04
  260. 19:06:02 | JosFritz        | Benutzerin Diskussion:Josfritz | edit       | https://de.wikipedia.org/w/index.php?title=Benutzerin%20Diskussion%3AJosFritz&type=revision&diff=188220602&oldid=188211731
  261. 19:08:18 | Sänger          | Benutzer Diskussion:Sänger     | edit       | https://de.wikipedia.org/w/index.php?title=Benutzer%20Diskussion%3AS%C3%A4nger&type=revision&diff=188220646&oldid=188220617
  262. 19:14:05 | Phi             | Wikipedia:Review/Geschichte    | edit       | https://de.wikipedia.org/w/index.php?title=Wikipedia%3AReview/Geschichte&type=revision&diff=188220781&oldid=188220559
  263. 19:22:56 | Sänger          | Benutzer Diskussion:Sänger     | edit       | https://de.wikipedia.org/w/index.php?title=Benutzer%20Diskussion%3AS%C3%A4nger&type=revision&diff=188220979&oldid=188220944
  264. 19:24:08 | Phi             | Diskussion:Adolf Hitlers Mö... | edit       | https://de.wikipedia.org/w/index.php?title=Diskussion%3AAdolf%20Hitlers%20m%C3%B6gliche%20Monorchie&type=revision&diff=188221011&oldid=188220836
  265. 19:25:29 | JosFritz        | Benutzer Diskussion:Sänger     | edit       | https://de.wikipedia.org/w/index.php?title=Benutzer%20Diskussion%3AS%C3%A4nger&type=revision&diff=188221065&oldid=188221024
  266. 19:26:44 | Sänger          | Wikipedia:Vandalismusmeldung   | edit       | https://de.wikipedia.org/w/index.php?title=Wikipedia%3AVandalismusmeldung&type=revision&diff=188221097&oldid=188220985
  267. 19:28:53 | Über-Blick      | Christlich Demokratische Un... | edit       | https://de.wikipedia.org/w/index.php?title=Christlich%20Demokratische%20Union%20Deutschlands&type=revision&diff=188221145&oldid=188173337
  268. """
  269.  
  270.  
  271. if __name__ == '__main__':
  272.     loop = asyncio.get_event_loop()
  273.     loop.set_exception_handler(ignore_aiohttp_ssl_eror)
  274.     try:
  275.         loop.run_until_complete(main())
  276.         loop.run_forever()
  277.     except KeyboardInterrupt:
  278.         loop.stop()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement