Advertisement
Guest User

Untitled

a guest
Oct 10th, 2017
4,839
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 42.60 KB | None | 0 0
  1. #!/usr/bin/python3
  2. # -*- coding: utf-8 -*-
  3.  
  4. """
  5. USAGE: Set up as described below, then just run the script somewhere in the
  6.       background, screen works nicely for this.
  7.  
  8. Non-standard dependencies: sqlite3, tld
  9.  
  10. It is recommended to wrap this in the following script:
  11. #!/bin/bash
  12. RET=1
  13. while [ $RET -eq 1 ]
  14. do
  15.        removalbot
  16.        RET=$?
  17. done
  18.  
  19. Ctrl+C is handled as success, expected failures have exit code 2.
  20. Likely programming errors have exit code 3.
  21.  
  22. Files:
  23. - CONFDIR, defined below
  24. - CONFDIR/maintainer, your reddit username
  25. - CONFDIR/secrets in the format "user\npass\nclient_id\nclient_secret"
  26. - CONFDIR/version
  27. - CONFDIR/safedomains, empty or newline-separated domains
  28. - CONFDIR/unknowndomains, empty
  29. - CONFDIR/posts.db, sqlite3 database
  30.  
  31. Be aware that you need to set up oauth to use this and gather the client ID and secret for the login process.
  32.  
  33. DB schema:
  34. CREATE TABLE notify
  35.    (author TEXT UNIQUE);
  36. CREATE TABLE comments
  37.    (name TEXT UNIQUE PRIMARY KEY,
  38.     subreddit TEXT,
  39.     author TEXT,
  40.     created_utc INTEGER,
  41.     link_id TEXT,
  42.     body TEXT);
  43. CREATE TABLE comments_deleted
  44.    (name TEXT UNIQUE PRIMARY KEY,
  45.     subreddit TEXT,
  46.     author TEXT,
  47.     created_utc INTEGER,
  48.     link_id TEXT,
  49.     body TEXT,
  50.     spotted INTEGER);
  51. CREATE TABLE flairs
  52.    (name TEXT UNIQUE PRIMARY KEY,
  53.     flair_template_id TEXT UNIQUE);
  54. CREATE TABLE times
  55.    (name TEXT UNIQUE PRIMARY KEY,
  56.     time INTEGER);
  57. CREATE TABLE submissions
  58.    (name TEXT UNIQUE PRIMARY KEY,
  59.     subreddit TEXT,
  60.     author TEXT,
  61.     created_utc INTEGER,
  62.     link_flair_text TEXT,
  63.     title TEXT,
  64.     domain TEXT,
  65.     selftext TEXT);
  66. CREATE TABLE submissions_deleted
  67.    (name TEXT UNIQUE PRIMARY KEY,
  68.     subreddit TEXT,
  69.     author TEXT,
  70.     created_utc INTEGER,
  71.     link_flair_text TEXT,
  72.     title TEXT,
  73.     domain TEXT,
  74.     selftext TEXT,
  75.     spotted INTEGER);
  76.  
  77. # TODO: shadowbanned vs deleted users
  78. # TODO: log.db
  79. # TODO: deletion.db
  80. """
  81.  
# Runtime switches
LOGGING = True          # write per-run log files under LOGDIR (see newlog())
DUMMY = False           # when True, skip every write action against reddit (flairs, PMs)
SUBMIT_ERRORS = False   # not referenced in this part of the file -- TODO confirm use
DEBUG = True            # extra console output (e.g. names of deleted posts)
  86.  
  87. import os
  88. import sys
  89. import time
  90. import json
  91. import urllib.request, urllib.parse, urllib.error
  92. import ssl
  93. import socket
  94. import re
  95. import traceback
  96. import sqlite3
  97. import tld
  98. from random import sample
  99. from datetime import datetime
  100. from contextlib import closing
  101. from sys import stderr
  102. from html.parser import HTMLParser
  103.  
# Time units, in seconds
MINUTE = 60
HOUR = MINUTE * 60
DAY = HOUR * 24

# Abort any hung network operation after 10s; reddit() retries on timeout
socket.setdefaulttimeout(10)

# Threads whose comment authors receive deletion notifications
NOTIFY_THREADS = ['3rmc4v']
# all in seconds
NEW_SPACING = 10
DELETION_SPACING = 10 * MINUTE
FLAIR_SPACING = 24 * HOUR
SUBSCRIBER_SPACING = 12 * HOUR
ALLOWED_TITLE_LENGTH = 300  # reddit's maximum submission title length
INTROLEN = 100              # not referenced in this part of the file -- TODO confirm use

CONFDIR = '/etc/removalbot'
PIDFILE = "/tmp/removalbot.pid"
LOGDIR = os.path.join(CONFDIR, 'log')
# Authors (mostly well-known bots, plus '[deleted]') whose posts are never tracked
IGNORE = ["godwins_law_bot", "totes_meta_bot", "redditbots", "ttumblrbots",
          "autowikibot", "SRScreenshot", "MRSPArchiver", "AutoModerator",
          "image_linker_bot", "SmallSubBot", "autourbanbot",
          "note-to-self-bot", "ObamaRobot", "TotesMessenger",
          "TweetsInCommentsBot", "TweetPoster", "JoeBidenBot",
          "smilesbot", "DailMail_Bot", "TrollaBot", "TotesHuman",
          "youtubefactsbot", "imgurtranscriber", "isreactionary_bot",
          "iscuck_bot", "author", "reginaldtato", "NotTheOnionBot",
          "rSGSpolice", "hwsbot", "yes_it_is_weird", "r_PictureGame",
          "prairiechicken2", "domoarigatobtfcboto", "SkydivingHaylz",
          "I_Like_Spaghetti", "STEALTHM0UNTAIN", "Google_Panda",
          "AakashMasani", "Forestl", "lurkattwork", "drgoku282",
          "texasmommie", "Really_Like_Pancakes", "BlaineWolfe",
          "Blassie098", "ghort98765", "GustavoFrings", "WritingPromptsRobot",
          "sontato", "ramsesniblick3rd", "300BlackoutSober",
          "flair_your_post_bot", "GoomyTooOP", "arbutus_", "foamed",
          "DumbCollegeStudent", "[deleted]", "GOTradeRuleBot",
          "ShadowBanCheckBot", "ShadowBannedBot", "Shiny_Sylveon",
          "PaidBot", "xbamsod", "enriquepaz13", "Moskau50", "PornOverlord",
          "ConvertsToMetric", "removalbot"]

# Broad URL-matching pattern used to strip links out of removed-comment
# bodies before building titles/bodies.
URLREGEX = r'''(?i)\b((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9
   .\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(
   ([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))'''

# Link domains never added to the unknowndomains counter file
DOMAIN_BLACKLIST = ["malformed.domain", "goo.gl", "tinyurl.com"]

# (type, field-name) pairs describing the columns stored per post kind.
# Order matches the DB schema given in the module docstring.
COMMENTMODEL = ((str, 'name'),
                (str, 'subreddit'),
                (str, 'author'),
                (float, 'created_utc'),
                (str, 'link_id'),
                (str, 'body'))

SUBMISSIONMODEL = ((str, 'name'),
                   (str, 'subreddit'),
                   (str, 'author'),
                   (float, 'created_utc'),
                   (str, 'link_flair_text'),
                   (str, 'title'),
                   (str, 'domain'),
                   (str, 'selftext'))

MODELS = { 'comments': COMMENTMODEL, 'submissions': SUBMISSIONMODEL }

# Run everything in UTC regardless of the host's timezone
os.environ['TZ'] = 'UTC'
time.tzset()

# Record our pid so the wrapper script / operator can find and kill the bot
pid = str(os.getpid())
with open(PIDFILE, 'w') as f:
    f.write(pid)
  173.  
  174.  
  175. ALSO_FATAL = ['SyntaxError', 'NameError', 'IndexError', 'TypeError',
  176.               'ValueError', 'UnboundLocalError']
  177.  
  178.  
  179. def errorhook(extype, value, trace):
  180.     os.unlink(PIDFILE)
  181.     print(traceback.print_exception(extype, value, trace))
  182.     if extype.__name__ == "KeyboardInterrupt":
  183.         exit(0)
  184.     elif extype.__name__ == "FatalError":
  185.         send_pm(maintainer, "REMOVALBOT CRASHED", "Reason: " +
  186.                 extype.__name__+": " + value.message)
  187.         writefile(conf('error'), extype.__name__+": " + value.message)
  188.         exit(2)
  189.     elif extype.__name__ in ALSO_FATAL:
  190.         send_pm(maintainer, "REMOVALBOT CRASHED", "Reason: " +
  191.                 extype.__name__+": " + value.message)
  192.         writefile(conf('error'), extype.__name__+": " + value.message)
  193.         exit(3)
  194.  
  195. sys.excepthook = errorhook
  196.  
  197.  
  198. class FatalError(Exception):
  199.     def __init__(self, message):
  200.         self.message = message
  201.         Exception.__init__(self, message)
  202.  
  203.  
  204. def current_epoch():
  205.     return (datetime.now() - datetime.utcfromtimestamp(0)).total_seconds()
  206.  
  207.  
  208. def epoch_to_string(epoch=None, tech=False, short=False):
  209.     if epoch is None:
  210.         epoch = current_epoch()
  211.     try:
  212.         epoch = float(epoch)
  213.     except:
  214.         epoch = 0
  215.     if tech:
  216.         model = "%y%m%d-%H%M"
  217.     elif short:
  218.         model = "%m-%d %H:%M"
  219.     else:
  220.         model = "%Y-%m-%d %H:%M %Z"
  221.     return time.strftime(model, time.localtime(epoch))
  222.  
  223.  
  224. def conf(name):
  225.     return str(os.path.join(CONFDIR, name))
  226.  
  227.  
  228. def newlog(name):
  229.     if LOGGING:
  230.         name = str(os.path.join(LOGDIR, name))
  231.         with closing(open(name, 'w')):
  232.             pass
  233.         return name
  234.     else:
  235.         return '/dev/null'
  236.  
  237.  
  238. def readfile(f):
  239.     with closing(open(f)) as f:
  240.         return f.read()
  241.  
  242.  
  243. def writefile(f, data):
  244.     with closing(open(f, 'w')) as f:
  245.         f.write(data)
  246.  
  247.  
  248. def censor(s, fraction):
  249.     num = int(round(fraction * len(s)))
  250.     change_locs = set(sample(list(range(len(s))), num))
  251.     changed = ('*' if i in change_locs else c for i, c in enumerate(s))
  252.     return ''.join(changed)
  253.  
  254.  
  255. def getv(query, args=()):
  256.     return (c.execute(query, args).fetchone() or (None,))[0]
  257.  
  258.  
  259. def getlast(what):
  260.     return getv('SELECT time FROM times WHERE name=?', (what,))
  261.  
  262.  
  263. def setlast(what, utc):
  264.     c.execute('INSERT OR REPLACE INTO times VALUES (?, ?)', (what, utc))
  265.     db.commit()
  266.  
  267.  
  268. def login():
  269.     print("> Logging in ", end=' ')
  270.     sys.stdout.flush()
  271.     secrets = readfile(conf('secrets')).split()
  272.     username = secrets[0]
  273.     password = secrets[1]
  274.     client_id = secrets[2]
  275.     client_secret = secrets[3]
  276.  
  277.     post_data = {"grant_type": "password",
  278.                  "username": username,
  279.                  "password": password}
  280.     headers = {"User-Agent": USERAGENT}
  281.     url = "https://www.reddit.com/api/v1/access_token"
  282.  
  283.     password_mgr = urllib.request.HTTPPasswordMgrWithDefaultRealm()
  284.     password_mgr.add_password(None, url, client_id, client_secret)
  285.     handler = urllib.request.HTTPBasicAuthHandler(password_mgr)
  286.     opener = urllib.request.build_opener(handler)
  287.  
  288.     response = reddit(url, opener=opener.open,
  289.                       post_data=post_data, headers=headers,
  290.                       raw=True)
  291.  
  292.     if not ('token_type' in list(response.keys()) and
  293.             'access_token' in list(response.keys())):
  294.         print(response, file=stderr)
  295.         raise FatalError("Authorization Failed")
  296.     token = response['token_type']+" "+response['access_token']
  297.     print("- done")
  298.     return {"Authorization": token, "User-Agent": USERAGENT}
  299.  
  300.  
  301. def mkrequest(url, headers=None, post_data=None):
  302.     if not post_data:
  303.         return urllib.request.Request(url, None, headers)
  304.     else:
  305.         for k, v in post_data.items():
  306.             if isinstance(v, str):
  307.                 v = v.encode('utf8')
  308.             post_data[k] = v
  309.         post_data = urllib.parse.urlencode(post_data).encode('utf-8')
  310.         return urllib.request.Request(url, post_data, headers)
  311.  
  312.  
  313. def errordir(e):
  314.     for attr in dir(e):
  315.         print(attr, getattr(e, attr), file=stderr)
  316.  
  317.  
def reddit(url, opener=urllib.request.urlopen,
           headers=None, post_data=None,
           raw=False, catch_also=[]):
    """Request *url* from reddit, retrying until a parseable response arrives.

    Honors the x-ratelimit-* response headers by sleeping proportionally
    when the remaining request budget runs low. HTTP errors whose code is
    in *catch* (plus anything in *catch_also*), network errors, and JSON
    parse errors sleep 5s and retry; a 401 re-authenticates first. Other
    errors propagate. With raw=False the 'data' member of the JSON is
    returned, otherwise the whole document.
    NOTE(review): catch_also is a mutable default argument, but it is only
    read here, so the usual aliasing bug cannot trigger.
    """
    global auth
    global requests_used
    global requests_remaining
    global requests_reset
    # Retryable HTTP status codes (auth, transient server, CDN errors)
    catch = [400, 401, 500, 502, 503, 504, 521] + catch_also
    while True:
        try:
            request = mkrequest(url, headers=headers, post_data=post_data)
            response = opener(request)
            # Rate-limit bookkeeping published as globals for other code
            requests_remaining = response.headers.get('x-ratelimit-remaining')
            requests_used = response.headers.get('x-ratelimit-used')
            requests_reset = response.headers.get('x-ratelimit-reset')
            if requests_remaining and requests_reset:
                remaining = float(requests_remaining)
                reset = int(requests_reset)
                # Fewer requests left than seconds until reset: slow down
                if remaining < reset:
                    time.sleep(float(reset - remaining) /
                               remaining)
            j = json.loads(response.read().decode('utf-8'))
            if not raw:
                j = j['data']
            break
        except (ValueError, KeyError,
                socket.timeout, socket.error,
                ssl.SSLError,
                urllib.error.HTTPError, urllib.error.URLError) as e:
            print('<' + type(e).__name__ + ': ', end=' ')
            if type(e).__name__ == 'HTTPError':
                print(str(e.code) + '!>', end=' ')
                if e.code not in catch:
                    raise
                elif e.code == 401:
                    # OAuth token expired: log in again and retry with new headers
                    print("(Token expired)", end=' ')
                    auth = login()
                    headers = auth
            elif type(e).__name__ == 'URLError':
                # Only TLS-handshake failures are considered transient
                if "handshake operation" not in str(e.reason):
                    raise
            else:
                print('!>', end=' ')
            sys.stdout.flush()
            time.sleep(5)
        except Exception as e:
            # Unexpected error: dump everything we know, then propagate
            print(file=stderr)
            print("ERROR!", file=stderr)
            print(file=stderr)
            errordir(e)
            raise
    return j
  370.  
  371.  
def fetch(query, lastseen, model, kind, catch_also=[]):
    """Yield posts from listing *query* newer than *lastseen*, paging via 'after'.

    Iteration stops once a post is older than *lastseen* or more than 7
    days old (listings are newest-first, so everything after is older).
    Every 10th page, the newest posts are re-polled via fetch_posts_since
    so long backfills don't miss fresh activity; kind='noupdate' disables
    that. Each yielded post is a dict of fields coerced according to
    *model* ((type, name) pairs); bool fields are stored as 1/0.
    """
    pagecount = 1
    newest = 0      # created_utc of the newest post seen so far
    after = ''      # pagination query fragment

    # Local sentinel used to break out of the nested loops
    class Done(Exception):
        pass
    try:
        while True:
            # Interleave a quick poll for brand-new posts on long fetches
            if pagecount % 10 == 0 and kind != 'noupdate':
                print('u(', end=' ')
                sys.stdout.flush()
                for post in fetch_posts_since(kind, lastcheck=newest, quiet=True):
                    yield post
                print(')', end=' ')
                sys.stdout.flush()

            print('p'+str(pagecount), end=' ')
            sys.stdout.flush()

            response = reddit(query+after, headers=auth, catch_also=catch_also)

            for child in [c['data'] for c in response['children']]:

                # Listing is newest-first: first too-old post ends the fetch
                if child['created_utc'] <= lastseen or \
                   current_epoch() - child['created_utc'] > DAY * 7:
                    raise Done
                if child['author'] in IGNORE:
                    continue

                fields = {}
                for field in model:
                    fieldtype = field[0]
                    fieldname = field[1]
                    if fieldtype is bool:
                        # store booleans as 1/0 for sqlite
                        fields[fieldname] = child[fieldname] and 1 or 0
                    else:
                        fields[fieldname] = fieldtype(child[fieldname])
                if fields['created_utc'] > newest:
                    newest = fields['created_utc']

                yield fields

            if not response['after']:
                raise Done
            else:
                after = '&after='+response['after']
            pagecount += 1
    except Done:
        pass
  425.  
  426. def fetch_posts_since(kind, lastcheck=0, quiet=False):
  427.     url = 'https://oauth.reddit.com/user/removalbot/m/monitor/' + \
  428.             (kind if kind == 'comments' else 'new') + \
  429.             '/.json?sort=new&limit=100'
  430.     count = 0
  431.     if not quiet:
  432.         start = current_epoch()
  433.         print("Reading " + kind + " -", end=' ')
  434.         sys.stdout.flush()
  435.     for post in fetch(url,
  436.                         lastcheck,
  437.                         MODELS[kind],
  438.                         kind,
  439.                         catch_also=[403]):
  440.         count += 1
  441.         yield post
  442.     if not quiet:
  443.         print("- "+str(count)+" "+kind+" new")
  444.         print("Read in " + str(current_epoch() - start))
  445.  
  446.  
  447. def model_to_tuple(item, model):
  448.     tup = ()
  449.     for field in model:
  450.         tup += (item[field[1]],)
  451.     return tup
  452.  
  453.  
  454. def insert_tuple(post, kind):
  455.     c.execute('INSERT OR IGNORE INTO ' + kind + ' VALUES ' +
  456.                 ('(' + ','.join(['?'] * len(post)) + ')'),
  457.               post)
  458.  
  459.  
  460. def get_new_of(kind):
  461.     lastcheck = getv('SELECT MAX(created_utc) FROM ' + kind)
  462.     for post in fetch_posts_since(kind, lastcheck=lastcheck):
  463.         post = model_to_tuple(post, MODELS[kind])
  464.         insert_tuple(post, kind)
  465.     db.commit()
  466.  
  467.  
  468. def get_new():
  469.     print("> Checking for new posts,", end=' ')
  470.     nextdelcheck = str(DELETION_SPACING - int(current_epoch() -
  471.                        last_deletion_check * DELETION_SPACING))
  472.     print("next comparison in "+nextdelcheck+"s                               ")
  473.     get_new_of('comments')
  474.     get_new_of('submissions')
  475.  
  476.  
  477. def recurse_into_get_authors(subthread):
  478.     subthread = subthread['data']['children']
  479.     notify = []
  480.     for post in subthread:
  481.         post = post['data']
  482.         if post['author'] != '[deleted]':
  483.             notify += [post['author']]
  484.         if 'replies' in list(post.keys()) and post['replies']:
  485.             notify += recurse_into_get_authors(post['replies'])
  486.     return notify
  487.  
  488.  
  489. def update_notify():
  490.     global notify
  491.     print("> Updating users to notify", end=' ')
  492.     sys.stdout.flush()
  493.     notify = []
  494.     for thread in NOTIFY_THREADS:
  495.         thread = 'https://oauth.reddit.com/r/removalbot/comments/'+thread
  496.         pm_thread = reddit(thread+'.json',
  497.                            headers=auth, raw=True)
  498.         notify += recurse_into_get_authors(pm_thread[1])
  499.     notify += [maintainer]
  500.     notify = list(set(notify))
  501.     c.execute('DELETE FROM notify')
  502.     for user in notify:
  503.         c.execute('INSERT INTO notify VALUES (?)', (user,))
  504.     db.commit()
  505.     print("- done (" + str(len(notify)) + " users: " + ", ".join(notify) + ")")
  506.  
  507.  
  508. def update_flairs():
  509.     print("> Fetching flairs", end=' ')
  510.     sys.stdout.flush()
  511.     latestposturl = 'https://oauth.reddit.com/r/removalbot/new/.json?limit=1'
  512.     latestpostname = reddit(latestposturl, headers=auth)
  513.     latestpostname = latestpostname['children'][0]['data']['name']
  514.     flairurl = 'https://oauth.reddit.com/r/removalbot/api/flairselector'
  515.     post_data = {'link': latestpostname}
  516.     flairchoices = reddit(flairurl,
  517.                           post_data=post_data, headers=auth,
  518.                           raw=True)['choices']
  519.     c.execute('DELETE FROM flairs')
  520.     for flair in flairchoices:
  521.         c.execute('INSERT INTO flairs VALUES (?,?)',
  522.                   (flair['flair_text'], flair['flair_template_id']))
  523.     db.commit()
  524.     print("- done")
  525.  
  526.  
  527. def get_fullname_new(response):
  528.     response = response['jquery']
  529.     is_redirect = False
  530.     fullname_new = None
  531.     for line in response:
  532.         if is_redirect:
  533.             fullname_new = 't3_'+line[3][0].split('/')[-3]
  534.             break
  535.         if line[2] == 'attr' and line[3] == 'redirect':
  536.             is_redirect = True
  537.     if fullname_new:
  538.         return fullname_new
  539.     else:
  540.         print(response)
  541.         raise FatalError("Malformed response from reddit")
  542.  
  543.  
  544. def setflair(fullname, text):
  545.     fid = getv('SELECT flair_template_id FROM flairs WHERE name=?', (text,))
  546.     if not fid:
  547.         return False
  548.     post_data = {'link': fullname,
  549.                  'api_type': 'json',
  550.                  'flair_template_id': fid}
  551.     if not DUMMY:
  552.         reddit('https://oauth.reddit.com/r/removalbot/api/selectflair',
  553.                post_data=post_data, headers=auth, raw=True)
  554.     return True
  555.  
  556.  
  557. def send_pm(to, subject, text):
  558.     post_data = {'api_type': 'json',
  559.                  'subject': subject,
  560.                  'text': text,
  561.                  'to': to}
  562.     print("Sending PM '"+subject+"' to /u/"+to, end=' ')
  563.     sys.stdout.flush()
  564.     if not DUMMY:
  565.         reddit('https://oauth.reddit.com/api/compose',
  566.                post_data=post_data, headers=auth, raw=True)
  567.     print("- sent")
  568.  
  569.  
  570. #def timestring(span):
  571. #    spanstring = ''
  572. #    times = ((60.0, 'min'), (60.0, 'h'), (24.0, 'd'))
  573. #    for t in times:
  574. #        span /= t[0]
  575. #        if span > 5:
  576. #            spanstring = str(round(span, 1)) + t[1]
  577. #    return spanstring
  578.  
  579.  
def compare_update(kind, newposts):
    """Diff freshly fetched *kind* posts against the stored table.

    Rows that vanished from the live listing are copied into
    <kind>_deleted with a 'spotted' timestamp, then the fresh snapshot
    replaces the stored table. Prints deleted/new/fell-out/total counts.
    """
    # Scratch table shaped like *kind*, initially empty (WHERE 0)
    c.execute('CREATE TABLE IF NOT EXISTS ' + kind + '_new \
               AS SELECT * FROM ' + kind + ' WHERE 0')

    for post in newposts:
        post = model_to_tuple(post, MODELS[kind])
        insert_tuple(post, kind + '_new')

    # Drop stored rows older than the oldest fresh row per subreddit: they
    # fell out of the 100-post listing window and can no longer be compared,
    # so their absence would otherwise look like a deletion.
    q = '''DELETE FROM ''' + kind + ''' WHERE name IN (
            SELECT name FROM (
              SELECT subreddit, MIN(created_utc) AS mintime
              FROM ''' + kind + '''_new GROUP BY subreddit
            ) AS mintimes INNER JOIN ''' + kind + '''
            ON ''' + kind + '''.subreddit = mintimes.subreddit
            AND ''' + kind + '''.created_utc < mintimes.mintime
          )'''

    oldcount = getv('SELECT COUNT(name) FROM ' + kind)
    c.execute(q)
    newcount = getv('SELECT COUNT(name) FROM ' + kind)
    fell = oldcount - newcount

    # Anything still stored but absent from the fresh listing was removed;
    # stamp it with the moment we noticed (the extra 'spotted' column).
    q = '''INSERT OR IGNORE INTO ''' + kind + '''_deleted
            SELECT *, STRFTIME("%s", "now") FROM ''' + kind + '''
            WHERE name NOT IN (
              SELECT name FROM ''' + kind + '''_new
            )'''

    c.execute(q)
    if DEBUG:
        deleted = tuple(c.execute('SELECT name FROM ' + kind + '_deleted'))
        print("Deleted " + kind + ": " + str([item[0] for item in deleted]))
    deleted = getv('SELECT COUNT(*) FROM ' + kind + '_deleted')

    # Fresh rows newer than anything previously stored, per subreddit
    q = '''SELECT COUNT(*) FROM (
            SELECT subreddit, MAX(created_utc) AS maxtime
            FROM ''' + kind + ''' GROUP BY subreddit
          ) AS maxtimes INNER JOIN ''' + kind + '''_new
          ON ''' + kind + '''_new.subreddit = maxtimes.subreddit
          AND ''' + kind + '''_new.created_utc > maxtimes.maxtime'''
    new = getv(q)

    # Promote the fresh snapshot to be the canonical table
    c.execute('DROP TABLE ' + kind)
    c.execute('ALTER TABLE ' + kind + '_new RENAME TO ' + kind)

    db.commit()

    total = getv('SELECT COUNT(name) FROM ' + kind)

    print(str(deleted) + ' ' + kind + ' deleted,', end=' ')
    print(str(new) + ' new,', end=' ')
    print(str(fell) + ' fell out -', end=' ')
    print(str(total) + ' new total')
  633.  
  634.  
  635. def check_deletions():
  636.  
  637.     print("> Checking for deletions at " + str(int(current_epoch())) + "                   ")
  638.  
  639.     compare_update('comments', fetch_posts_since('comments', 0))
  640.     compare_update('submissions', fetch_posts_since('submissions', 0))
  641.  
  642. def check_user_deletion(post, kind):
  643.     safe_domains = readfile(conf('safedomains')).strip().split()
  644.     u = readfile(conf('unknowndomains')).strip().split('\n')
  645.     unknown_domains = {}
  646.     for d in u:
  647.         if not d:
  648.             continue
  649.         d = d.strip().split()
  650.         unknown_domains[d[0]] = int(d[1])
  651.  
  652.     print("> Checking for user deletion of " + kind + " " + post[0] + \
  653.         (" from " + post[4] if kind == 'comment' else '') + \
  654.         " in " + post[1] + ",", end=' ')
  655.     left = getv('SELECT COUNT(*) FROM (SELECT name FROM comments_deleted \
  656.                     UNION SELECT name FROM submissions_deleted)')-1
  657.     print(str(left or 'no') + " more left to check -", end=' ')
  658.     sys.stdout.flush()
  659.  
  660.     spotted = post[-1]
  661.  
  662.     name = post[0].split('_')[1]
  663.     sub = post[1]
  664.     author = post[2]
  665.     posted = post[3]
  666.  
  667.     compare = tuple(comment[0] for comment in
  668.                     tuple(c.execute('SELECT created_utc FROM ' + kind + 's \
  669.                                    WHERE subreddit=? \
  670.                                    ORDER BY created_utc ASC', (sub,))))
  671.     mincompare = min(compare) if compare else current_epoch() - DAY*7
  672.     compare = compare[-int(len(compare)*0.95)] if compare \
  673.         else current_epoch() - DAY*7
  674.  
  675.     print(str(round((current_epoch() - posted) / HOUR, 2)) + "h vs cutoff " + \
  676.         str(round((current_epoch() - compare) / HOUR, 2)) + "h, oldest " + \
  677.         str(round((current_epoch() - mincompare) / HOUR, 2)) + "h -", end=' ')
  678.     sys.stdout.flush()
  679.     if posted <= compare:
  680.         print("too old")
  681.         return False
  682.  
  683.     title = ''
  684.     if kind == 'comment':
  685.         link_id = post[4].split('_')[1]
  686.         content = post[5]
  687.         baseurl = 'https://oauth.reddit.com/user/'+author
  688.         usercomments = {}
  689.         shadowbanned = False
  690.         try:
  691.             usercomments = fetch(baseurl+'/comments/.json?sort=new&limit=100',
  692.                                  posted - 30,
  693.                                  ((str, 'name'),
  694.                                   (str, 'subreddit'),
  695.                                   (str, 'author'),
  696.                                   (float, 'created_utc'),
  697.                                   (str, 'link_id'),
  698.                                   (str, 'body')),
  699.                                  kind='noupdate')
  700.         except urllib.error.HTTPError as e:
  701.             if e.code in [403, 404]:
  702.                 print("- " + author + " shadowbanned or deleted", end=' ')
  703.                 url = 'https://oauth.reddit.com/r/' + sub + '/comments/' + \
  704.                       link_id + '/comment/' + name + '/.json'
  705.                 try:
  706.                     reddit(url, headers=auth, raw=True)
  707.                     print("- deleted")
  708.                     return False
  709.                 except urllib.error.HTTPError as e:
  710.                     if e.code in [403, 404]:
  711.                         print("- shadowbanned", end=' ')
  712.                         shadowbanned = True
  713.                     else:
  714.                         raise
  715.             else:
  716.                 raise
  717.         usercomments = {comment['name']: model_to_tuple(comment, MODELS['comments'])
  718.                         for comment in usercomments}
  719.         if post[0] not in list(usercomments.keys()) and not shadowbanned:
  720.             print("- deleted by "+author+" ("+((content[:47]+"...")
  721.                                                if len(content) > 50
  722.                                                else content).replace('\n',
  723.                                                                      ' / ')+")")
  724.             return False
  725.         else:
  726.             if not shadowbanned:
  727.                 post = usercomments[post[0]]
  728.                 name = post[0].split('_')[1]
  729.                 sub = post[1]
  730.                 author = post[2]
  731.                 posted = post[3]
  732.                 link_id = post[4].split('_')[1]
  733.                 content = post[5]
  734.             print("- deleted by mods")
  735.             title1 = str(epoch_to_string(short=True) + " - '")
  736.             title2 = str("' by /u/" + author +
  737.                              " removed from /r/" + sub)
  738.             lower_frame_boundary = spotted - DELETION_SPACING
  739.             upper_frame_boundary = spotted
  740.             frame_lower = round((float(lower_frame_boundary) - posted) / 60)
  741.             if frame_lower < 0:
  742.                 frame_lower = 0
  743.             frame_upper = round((float(upper_frame_boundary) - posted) / 60)
  744.             if frame_lower != frame_upper:
  745.                 frame = str(int(frame_lower)) + "-" + str(int(frame_upper))
  746.             else:
  747.                 frame = str(int(frame_lower))
  748.             title2 += " within " + frame + "min"
  749.             if shadowbanned:
  750.                 title2 += " (user shadowbanned)"
  751.             restlen = ALLOWED_TITLE_LENGTH - (len(title1) + len(title2))
  752.             intro = re.sub(r'&gt;.*\n', '[quote]', content)
  753.  
  754.             intro = re.sub(r'\[([^\]]*)\]\([^\)]*\)', r'[\1]', intro)
  755.             intro = re.sub(URLREGEX, '[link]', intro)
  756.  
  757.             intro = intro.replace('/r/', 'r/')
  758.             intro = intro.replace('/u/', 'u/')
  759.             intro = re.sub(r' +', ' ', intro)
  760.             intro = re.sub(r'[ \n/][ \n/]+', ' / ', intro)
  761.             intro = intro.strip(' \n/')
  762.  
  763.             links = []
  764.             for url in re.finditer(r'\[([^\]]*)\][ \n]?\(([^\)]*)\)', content):
  765.                 links += [url.group(2)]
  766.                 content = content.replace(url.group(0),
  767.                                           '[' + url.group(1) + ']^^' +
  768.                                           str(len(links)) + ' ')
  769.             for url in re.finditer(URLREGEX, content):
  770.                 links += [url.group(0)]
  771.                 content = content.replace(url.group(0),
  772.                                           '[link]^^' + str(len(links)) + ' ')
  773.  
  774.             if len(intro) > restlen:
  775.                 intro = str(intro[:restlen-3].strip(' ./,') + "...")
  776.             else:
  777.                 intro = str(intro)
  778.             title = title1 + intro + title2
  779.             title = title[:ALLOWED_TITLE_LENGTH]
  780.             body = "'''\n\n"+content+"\n\n'''\n\n"
  781.             if post[4] == 'None':
  782.                 body = "No link could be determined."
  783.                 link = "Unknown"
  784.             else:
  785.                 linkbase = "/r/" + sub + "/comments/" + link_id + \
  786.                            "/comment/" + name + "?context=999"
  787.                 link = "https://reddit.com" + linkbase
  788.                 goldfishlink = "http://r.go1dfish.me" + linkbase
  789.                 unredditlink = "https://unreddit.com" + linkbase
  790.                 body += "[Context Link](" + link + ")\n\n"
  791.                 body += "[Go1dfish undelete link](" + goldfishlink + ")\n\n"
  792.                 body += "[unreddit undelete link](" + unredditlink + ")"
  793.             body += "\n\nAuthor: /u/" + author
  794.             if links:
  795.                 body += "\n\n"
  796.                 unknowns = False
  797.                 for l in range(len(links)):
  798.                     try:
  799.                         domain = tld.get_tld(links[l])
  800.                     except tld.exceptions.TldBadUrl:
  801.                         domain = 'reddit.com'
  802.                     except (tld.exceptions.TldDomainNotFound, ValueError):
  803.                         domain = 'malformed.domain'
  804.                         print("Malformed domain: " + links[l])
  805.                     if domain in safe_domains:
  806.                         body += str(l+1) + ': ' + links[l] + '  \n'
  807.                     else:
  808.                         unknowns = True
  809.                         if domain not in DOMAIN_BLACKLIST:
  810.                             if domain in list(unknown_domains.keys()):
  811.                                 unknown_domains[domain] += 1
  812.                             else:
  813.                                 unknown_domains[domain] = 1
  814.                             with closing(open(conf('unknowndomains'),
  815.                                          'w')) as f:
  816.                                 for d in unknown_domains:
  817.                                     f.write(d+' '+str(unknown_domains[d])+'\n')
  818.                         oblink = re.sub(r'.*://', '', links[l])
  819.                         if domain != "maldormed.domain":
  820.                             oblink = censor(oblink, 0.25)
  821.                         body += str(l+1) + ': `' + oblink + '`  \n'
  822.                 if unknowns:
  823.                     body += "\nUnknown links are censored to prevent \
  824.                        spreading illicit content."
  825.             print(title, end=' ')
  826.  
  827.     elif kind == 'submission':
  828.         reason = post[4]
  829.         subject = post[5]
  830.         domain = post[6]
  831.         selftext = post[7]
  832.         baseurl = 'https://oauth.reddit.com/user/'+author
  833.         shadowbanned = False
  834.         usersubmissions = {}
  835.         try:
  836.             usersubmissions = fetch(baseurl +
  837.                                     '/submitted/.json?sort=new&limit=100',
  838.                                     posted - 30,
  839.                                     ((str, 'name'),
  840.                                      (str, 'subreddit'),
  841.                                      (str, 'author'),
  842.                                      (float, 'created_utc'),
  843.                                      (str, 'link_flair_text'),
  844.                                      (str, 'title'),
  845.                                      (str, 'domain'),
  846.                                      (str, 'selftext')),
  847.                                     kind='noupdate')
  848.         except urllib.error.HTTPError as e:
  849.             if e.code in [403, 404]:
  850.                 print("- " + author + " shadowbanned or deleted", end=' ')
  851.                 url = 'https://oauth.reddit.com/r/' + sub + '/comments/' + \
  852.                       name + '/.json'
  853.                 try:
  854.                     reddit(url, headers=auth, raw=True)
  855.                     print("- deleted")
  856.                     return False
  857.                 except urllib.error.HTTPError as e:
  858.                     if e.code in [403, 404]:
  859.                         print("- shadowbanned", end=' ')
  860.                         shadowbanned = True
  861.                     else:
  862.                         raise
  863.             else:
  864.                 raise
  865.         usersubmissions = {submission['name']: model_to_tuple(submission, MODELS['submissions'])
  866.                            for submission in usersubmissions}
  867.         if post[0] not in list(usersubmissions.keys()) and not shadowbanned:
  868.             print("- deleted by "+author+" ("+((subject[:47]+"...")
  869.                                                if len(subject) > 50
  870.                                                else subject)+")")
  871.             return False
  872.         else:
  873.             reason = 'None'
  874.             if not shadowbanned and not post[7]:
  875.                 post = usersubmissions[post[0]]
  876.                 name = post[0].split('_')[1]
  877.                 sub = post[1]
  878.                 author = post[2]
  879.                 posted = post[3]
  880.                 reason = post[4]
  881.                 subject = post[5]
  882.                 domain = post[6]
  883.                 selftext = post[7]
  884.             reason = reason.lower()
  885.             if 'removed' in reason:
  886.                 reason = reason.replace('removed', '').strip(' -|—')
  887.                 reason = reason.capitalize() or 'None'
  888.                 reason = " - reason: "+reason
  889.             else:
  890.                 reason = ''
  891.             print("- deleted by mods" + reason)
  892.             lower_frame_boundary = spotted - DELETION_SPACING
  893.             upper_frame_boundary = spotted
  894.             frame_lower = round((float(lower_frame_boundary) - posted) / 60)
  895.             if frame_lower < 0:
  896.                 frame_lower = 0
  897.             frame_upper = round((float(upper_frame_boundary) - posted) / 60)
  898.             if frame_lower != frame_upper:
  899.                 frame = str(int(frame_lower)) + "-" + str(int(frame_upper))
  900.             else:
  901.                 frame = str(int(frame_lower))
  902.             title1 = str(epoch_to_string(short=True) + " - '")
  903.             title2 = str("' (" + domain + ") by /u/" + author +
  904.                              " removed from /r/" + sub +
  905.                              " within " + frame + "min" + reason)
  906.             if shadowbanned:
  907.                 title2 += " (user shadowbanned)"
  908.             restlen = ALLOWED_TITLE_LENGTH - (len(title1) + len(title2))
  909.             if len(subject) > restlen:
  910.                 intro = str(subject[:restlen-3].strip(' ./') + "...")
  911.             else:
  912.                 intro = str(subject)
  913.             title = title1 + intro + title2
  914.             title = title[:ALLOWED_TITLE_LENGTH]
  915.             linkbase = "/r/" + sub + "/comments/" + name
  916.             link = "https://reddit.com/" + linkbase
  917.             goldfishlink = "http://r.go1dfish.me" + linkbase
  918.             unredditlink = "https://unreddit.com" + linkbase
  919.             body = ""
  920.             links = []
  921.             if selftext:
  922.                 links = []
  923.                 for url in re.finditer(r'\[([^\]]*)\][ \n]?\(([^\)]*)\)',
  924.                                        selftext):
  925.                     links += [url.group(2)]
  926.                     selftext = selftext.replace(url.group(0),
  927.                                                 '[' + url.group(1) + ']^^' +
  928.                                                 str(len(links)) + ' ')
  929.                 for url in re.finditer(URLREGEX, selftext):
  930.                     links += [url.group(0)]
  931.                     selftext = selftext.replace(url.group(0),
  932.                                                 '[link]^^' +
  933.                                                 str(len(links)) + ' ')
  934.                 body = "'''\n\n" + selftext + "\n\n'''\n\n"
  935.             body += "[" + subject + "](" + link + ")\n\n"
  936.             body += "[Go1dfish undelete link](" + goldfishlink + ")\n\n"
  937.             body += "[unreddit undelete link](" + unredditlink + ")\n\n"
  938.             body += "Author: /u/" + author
  939.             if links:
  940.                 body += "\n\n"
  941.                 unknowns = False
  942.                 for l in range(len(links)):
  943.                     try:
  944.                         domain = tld.get_tld(links[l])
  945.                     except tld.exceptions.TldBadUrl:
  946.                         domain = 'reddit.com'
  947.                     except (tld.exceptions.TldDomainNotFound, ValueError):
  948.                         domain = 'malformed.domain'
  949.                         print("Malformed domain: " + links[l])
  950.                     if domain in safe_domains:
  951.                         body += str(l+1) + ': ' + links[l] + '  \n'
  952.                     else:
  953.                         unknowns = True
  954.                         if domain not in DOMAIN_BLACKLIST:
  955.                             if domain in list(unknown_domains.keys()):
  956.                                 unknown_domains[domain] += 1
  957.                             else:
  958.                                 unknown_domains[domain] = 1
  959.                             with closing(open(conf('unknowndomains'),
  960.                                          'w')) as f:
  961.                                 for d in unknown_domains:
  962.                                     f.write(d+' '+str(unknown_domains[d])+'\n')
  963.                         oblink = re.sub(r'.*://', '', links[l])
  964.                         if domain != "maldormed.domain":
  965.                             oblink = censor(oblink, 0.25)
  966.                         body += str(l+1) + ': `' + oblink + '`  \n'
  967.                 if unknowns:
  968.                     body += "\nUnknown links are censored to prevent \
  969.                        spreading illicit content."
  970.             print(title, end=' ')
  971.  
  972.     h = HTMLParser()
  973.     title = h.unescape(title)
  974.     body = h.unescape(body)
  975.     if len(body) > 40000:
  976.         body = body[:39900] + '[... post size limit of 40,000 characters reached]'
  977.     post_data = {'sr': 'removalbot', 'title': title,
  978.                  'kind': 'self', 'text': body}
  979.     if not DUMMY:
  980.         response = reddit('https://oauth.reddit.com/api/submit',
  981.                           post_data=post_data, headers=auth, raw=True)
  982.     print("- submitted", end=' ')
  983.     sys.stdout.flush()
  984.     if not DUMMY:
  985.         fullname_new = get_fullname_new(response)
  986.         if setflair(fullname_new, kind+'-'+sub.lower()):
  987.             print("- flaired")
  988.         else:
  989.             print("- no flair")
  990.  
  991.     if not shadowbanned and getv('SELECT author FROM notify WHERE author=?',
  992.                                  (author,)):
  993.         pm_subject = "Your "+kind+" was deleted from /r/"+sub
  994.         if kind == 'comment':
  995.             content = content.strip(' \n')
  996.             if '\n\n' in content:
  997.                 content = '\n\n'+content+'\n\n'
  998.         else:
  999.             content = '  \n&nbsp;**Reason**: '+str(reason)
  1000.         content = h.unescape(content)
  1001.         pm_body = 'Hello, **'+author+'**!\n\n&nbsp;\n\n'
  1002.         pm_body += 'Your '+kind+' appears to have been deleted '
  1003.         pm_body += 'from **/r/' + sub + '** by the moderators, '
  1004.         pm_body += '/u/AutoModerator or the administrators.\n\n'
  1005.         pm_body += '&nbsp;**'+kind.capitalize()+'**: '+content+'  \n'
  1006.         pm_body += '&nbsp;**Posted at**: '+epoch_to_string(posted)+'  \n'
  1007.         pm_body += '&nbsp;**Delay until deletion**: '+frame+'min  \n'
  1008.         pm_body += '&nbsp;**Link**: '+str(link)+'\n\n&nbsp;\n\n'
  1009.         pm_body += 'Have a nice day!  \n'
  1010.         pm_body += '/u/removalbot\n\n'
  1011.         pm_body += '----\n\n'
  1012.         pm_body += '^(Note that the deletion may have been accidental '
  1013.         pm_body += 'or its detection a false positive caused by heavy load '
  1014.         pm_body += 'on reddit\'s servers.)  \n'
  1015.         pm_body += '^^^This ^^^is ^^^an ^^^automated ^^^message ^^^from '
  1016.         pm_body += '^^^/r/removalbot.'
  1017.         send_pm(author, pm_subject, pm_body)
  1018.  
  1019.     return True
  1020.  
# Rate-limit bookkeeping, presumably filled in from reddit's
# X-Ratelimit-* response headers by the request helper defined elsewhere
# in this file; None until the first API call. -- TODO confirm against
# the `reddit()` helper.
requests_used = None
requests_remaining = None
requests_reset = None

# Identify the bot to reddit: maintainer username and version string are
# read from plain files in CONFDIR (see file header for the layout).
maintainer = readfile(conf('maintainer')).strip()
version = readfile(conf('version')).strip()
USERAGENT = 'removalbot by /u/'+maintainer+', v'+version

# Work out of the config directory so relative paths resolve there.
os.chdir(CONFDIR)

# Persistent post-tracking database (schema documented in the file header).
db = sqlite3.connect(conf('posts.db'))
c = db.cursor()

# OAuth login; `auth` holds the headers used on every subsequent request.
auth = login()

update_flairs()

# Crash-report handling: a previous run that died writes its reason into
# CONFDIR/error. On restart, PM the maintainer and (depending on config)
# post the error publicly. A clean Ctrl+C ('KeyboardInterrupt') is ignored.
if os.path.isfile(conf('error')) and \
   readfile(conf('error')) != 'KeyboardInterrupt':
    # Timestamp of the crash, taken from the error file's creation time.
    failed = os.path.getctime(conf('error'))
    failed = epoch_to_string(epoch=failed)
    reason = readfile(conf('error'))
    send_pm(maintainer, "REMOVALBOT CRASHED", "Reason: "+reason)
    # First colon-separated token is the exception class name.
    e = reason.split(':')[0]
    print(e, e in ALSO_FATAL)
    # Publicly submit the error only when configured to, or when it is
    # fatal (FatalError / listed in ALSO_FATAL), and never in DUMMY mode.
    if not DUMMY and (SUBMIT_ERRORS or e == "FatalError" or e in ALSO_FATAL):
        print("> Submitting error "+reason, end=' ')
        sys.stdout.flush()
        title = "[!] Bot encountered an error at " + failed + \
            ", reason: " + reason
        post_data = {'sr': 'removalbot', 'kind': 'self', 'title': title}
        if reason != "FatalError":
            post_data['text'] = "Such errors usually indicate that reddit is \
                                overloaded or in maintenance mode, i. e. \
                                they are unavoidable.\n\nA database is used \
                                to minimize the impact but cannot negate it."
        else:
            post_data['text'] = "The bot encountered a fatal error. This \
                                should not happen. The maintainer has been \
                                notified; until further action, the bot is \
                                suspended."
        response = reddit('https://oauth.reddit.com/api/submit',
                          post_data=post_data, headers=auth, raw=True)
        # Tag the error post with the 'error' flair.
        setflair(get_fullname_new(response), 'error')
        print("- done")
    else:
        print("> Encountered error "+reason)
    # Remove the marker so the next restart does not re-report it.
    os.remove(conf('error'))

update_notify()
  1071.  
# Main scheduler loop. Each periodic task fires when the current epoch,
# divided by that task's spacing, differs from the bucket recorded at its
# last run (via getlast/setlast). At most ONE periodic task runs per
# iteration (if/elif chain); only when none is due does the loop drain the
# *_deleted work queues, one row per iteration.
while True:
    now = int(current_epoch())
    last_deletion_check = getlast('deletion')
    last_new_check = getlast('new')
    last_subscriber_check = getlast('subscribers')
    last_flair_check = getlast('flair')
    changed = False
    if int(now / DELETION_SPACING) != last_deletion_check:
        check_deletions()
        setlast('deletion', int(now / DELETION_SPACING))
    elif int(now / NEW_SPACING) != last_new_check:
        get_new()
        setlast('new', int(now / NEW_SPACING))
    elif int(now / SUBSCRIBER_SPACING) != last_subscriber_check:
        update_notify()
        setlast('subscribers', int(now / SUBSCRIBER_SPACING))
    elif int(now / FLAIR_SPACING) != last_flair_check:
        update_flairs()
        setlast('flair', int(now / FLAIR_SPACING))
    else:
        # No periodic task due: process one queued deletion, submissions
        # taking priority over comments.
        deleted_submission = \
            c.execute('SELECT * FROM submissions_deleted LIMIT 1').fetchone()
        deleted_comment = \
            c.execute('SELECT * FROM comments_deleted LIMIT 1').fetchone()
        # Items remaining across both queues AFTER this one is handled.
        left = getv('SELECT COUNT(*) FROM (SELECT name FROM comments_deleted \
                UNION SELECT name FROM submissions_deleted)')-1
        if deleted_submission:
            check_user_deletion(deleted_submission, 'submission')
            c.execute('DELETE FROM submissions_deleted WHERE name=?',
                      (deleted_submission[0],))
        elif deleted_comment:
            check_user_deletion(deleted_comment, 'comment')
            c.execute('DELETE FROM comments_deleted WHERE name=?',
                      (deleted_comment[0],))
        # Ask sqlite to release unused page-cache memory.
        c.execute('PRAGMA shrink_memory')
        if not deleted_submission and not deleted_comment:
            # Both queues empty: idle briefly and skip the status report.
            time.sleep(0.1)
            continue
        elif left == 0:
            # Queue just drained: report how many unknown domains were
            # logged to CONFDIR/unknowndomains during this batch.
            c_unknown = readfile(conf('unknowndomains')).strip().split('\n')
            print("> Done undeleting, " + str(len(c_unknown)) + \
                " unknown domains logged")

    print()
    # Rate-limit status line, printed only once all three counters have
    # been populated by a prior API response.
    if requests_remaining and requests_reset and requests_used:
        # Header value may be a float string; normalize to an int string.
        requests_remaining = str(int(float(requests_remaining)))
        print("Used: " + requests_used + ", remaining: " + \
              requests_remaining + " in " + requests_reset + "s", end=' ')
        remaining = float(requests_remaining)
        reset = int(requests_reset)
        # Sustainable request budget in requests per second.
        print("(" + str(round(remaining / reset, 2)) + "/s)", end=' ')
        if remaining < reset:
            # Fewer requests left than seconds until reset: compute a
            # backoff in milliseconds. NOTE(review): this only prints the
            # sleep time; no time.sleep() is visible here -- presumably
            # throttling happens in the request helper, confirm.
            sleeptime = str(int(round(float(reset - remaining) /
                                remaining, 3) * 1000))
            print("- OVERLOAD, sleep " + sleeptime + "ms")
        else:
            # '\r' keeps the OK status on one self-overwriting line.
            print("- OK\r", end=' ')
        sys.stdout.flush()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement