Advertisement
Guest User

Untitled

a guest
Mar 24th, 2016
5,537
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 43.13 KB | None | 0 0
  1. #!/usr/bin/python2.7
  2. # -*- coding: utf-8 -*-
  3.  
  4. """
  5. USAGE: Set up as described below, then just run the script somewhere in the
  6.       background, screen works nicely for this.
  7.  
  8. Non-standard dependencies: sqlite3, tld
  9.  
  10. It is recommended to wrap this in the following script:
  11. #!/bin/bash
  12. RET=1
  13. while [ $RET -eq 1 ]
  14. do
  15.        removalbot
  16.        RET=$?
  17. done
  18.  
  19. Ctrl+C is handled as success, expected failures have exit code 2.
  20. Likely programming errors have exit code 3.
  21.  
  22. Files:
  23. - CONFDIR, defined below
  24. - CONFDIR/maintainer, your reddit username
  25. - CONFDIR/secrets in the format "user\npass\nclient_id\nclient_secret"
  26. - CONFDIR/version
  27. - CONFDIR/safedomains, empty or newline-separated domains
  28. - CONFDIR/unknowndomains, empty
  29. - CONFDIR/posts.db, sqlite3 database
  30.  
  31. Be aware that you need to set up oauth to use this and gather the client ID and secret for the login process.
  32.  
  33. DB schema:
  34. CREATE TABLE notify
  35.    (author TEXT UNIQUE);
  36. CREATE TABLE comments
  37.    (name TEXT UNIQUE PRIMARY KEY,
  38.     subreddit TEXT,
  39.     author TEXT,
  40.     created_utc INTEGER,
  41.     link_id TEXT,
  42.     body TEXT);
  43. CREATE TABLE comments_deleted
  44.    (name TEXT UNIQUE PRIMARY KEY,
  45.     subreddit TEXT,
  46.     author TEXT,
  47.     created_utc INTEGER,
  48.     link_id TEXT,
  49.     body TEXT,
  50.     spotted INTEGER);
  51. CREATE TABLE flairs
  52.    (name TEXT UNIQUE PRIMARY KEY,
  53.     flair_template_id TEXT UNIQUE);
  54. CREATE TABLE times
  55.    (name TEXT UNIQUE PRIMARY KEY,
  56.     time INTEGER);
  57. CREATE TABLE submissions
  58.    (name TEXT UNIQUE PRIMARY KEY,
  59.     subreddit TEXT,
  60.     author TEXT,
  61.     created_utc INTEGER,
  62.     link_flair_text TEXT,
  63.     title TEXT,
  64.     domain TEXT,
  65.     selftext TEXT);
  66. CREATE TABLE submissions_deleted
  67.    (name TEXT UNIQUE PRIMARY KEY,
  68.     subreddit TEXT,
  69.     author TEXT,
  70.     created_utc INTEGER,
  71.     link_flair_text TEXT,
  72.     title TEXT,
  73.     domain TEXT,
  74.     selftext TEXT,
  75.     spotted INTEGER);
  76.  
  77. # TODO: shadowbanned vs deleted users
  78. # TODO: log.db
  79. # TODO: deletion.db
  80. """
  81.  
# Runtime switches.
LOGGING = True         # when False, newlog() hands out /dev/null instead of a log file
DUMMY = False          # dry-run: when True, skip every write action against reddit
SUBMIT_ERRORS = False  # NOTE(review): not referenced in the visible part of this file
  85.  
  86. import os
  87. import sys
  88. import time
  89. import json
  90. import urllib
  91. import urllib2
  92. import ssl
  93. import socket
  94. import re
  95. import traceback
  96. import sqlite3
  97. import tld
  98. from random import sample
  99. from datetime import datetime
  100. from contextlib import closing
  101. from sys import stderr
  102. from HTMLParser import HTMLParser
  103. # from pprint import pprint
  104.  
# Time unit helpers (seconds).
MINUTE = 60
HOUR = MINUTE * 60
DAY = HOUR * 24

# Global socket timeout so hung HTTP requests fail instead of blocking forever.
socket.setdefaulttimeout(10)

# Thread whose comment authors get PM notifications (see update_notify()).
NOTIFY_THREAD = 'https://oauth.reddit.com/r/removalbot/comments/3rmc4v/'
# all in seconds
NEW_SPACING = 10                  # pause between new-post polls
DELETION_SPACING = 5 * MINUTE     # interval between deletion comparisons
FLAIR_SPACING = 24 * HOUR         # interval between flair refreshes
SUBSCRIBER_SPACING = 12 * HOUR    # interval between subscriber checks
ALLOWED_TITLE_LENGTH = 300        # reddit's maximum submission title length
INTROLEN = 100                    # target length of the quoted intro snippet
  119.  
# Filesystem layout (see module docstring for the expected files).
CONFDIR = '/etc/removalbot'
PIDFILE = "/tmp/removalbot.pid"
LOGDIR = os.path.join(CONFDIR, 'log')
# Authors whose posts are never tracked (bots, self, known noise accounts).
IGNORE = ["godwins_law_bot", "totes_meta_bot", "redditbots", "ttumblrbots",
          "autowikibot", "SRScreenshot", "MRSPArchiver", "AutoModerator",
          "image_linker_bot", "SmallSubBot", "autourbanbot",
          "note-to-self-bot", "ObamaRobot", "TotesMessenger",
          "TweetsInCommentsBot", "TweetPoster", "JoeBidenBot",
          "smilesbot", "DailMail_Bot", "TrollaBot", "TotesHuman",
          "youtubefactsbot", "imgurtranscriber", "isreactionary_bot",
          "iscuck_bot", "author", "reginaldtato", "NotTheOnionBot",
          "rSGSpolice", "hwsbot", "yes_it_is_weird", "r_PictureGame",
          "prairiechicken2", "domoarigatobtfcboto", "SkydivingHaylz",
          "I_Like_Spaghetti", "STEALTHM0UNTAIN", "Google_Panda",
          "AakashMasani", "Forestl", "lurkattwork", "drgoku282",
          "texasmommie", "Really_Like_Pancakes", "BlaineWolfe",
          "Blassie098", "ghort98765", "GustavoFrings", "WritingPromptsRobot",
          "sontato", "ramsesniblick3rd", "300BlackoutSober",
          "flair_your_post_bot", "GoomyTooOP", "arbutus_", "foamed",
          "DumbCollegeStudent", "[deleted]", "GOTradeRuleBot",
          "ShadowBanCheckBot", "ShadowBannedBot", "Shiny_Sylveon",
          "PaidBot", "xbamsod", "enriquepaz13", "Moskau50", "PornOverlord",
          "ConvertsToMetric", "removalbot"]

# Matches URLs in free text for link extraction/stripping.
# NOTE(review): the literal newlines + indent inside this raw string become
# part of the pattern — this looks like a paste-wrapping artifact of the
# original one-line regex; confirm against the upstream source before reuse.
URLREGEX = r'''(?i)\b((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9
   .\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(
   ([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))'''

# Domains never treated as safe (shorteners / unparsable).
DOMAIN_BLACKLIST = ["malformed.domain", "goo.gl", "tinyurl.com"]

# Force the whole process into UTC so epoch <-> string conversions agree.
os.environ['TZ'] = 'UTC'
time.tzset()
  152.  
  153. pid = str(os.getpid())
  154. with open(PIDFILE, 'w') as f:
  155.     f.write(pid)
  156.  
  157.  
# Exception names treated as programming errors by errorhook(): these make
# the process exit with code 3 so the wrapper loop does NOT restart it.
ALSO_FATAL = ['SyntaxError', 'NameError', 'IndexError', 'TypeError',
              'ValueError', 'UnboundLocalError']
  160.  
  161.  
  162. def errorhook(extype, value, trace):
  163.     os.unlink(PIDFILE)
  164.     print traceback.print_exception(extype, value, trace)
  165.     if extype.__name__ == "KeyboardInterrupt":
  166.         exit(0)
  167.     elif extype.__name__ == "FatalError":
  168.         send_pm(maintainer, "REMOVALBOT CRASHED", "Reason: " +
  169.                 extype.__name__+": " + value.message)
  170.         writefile(conf('error'), extype.__name__+": " + value.message)
  171.         exit(2)
  172.     elif extype.__name__ in ALSO_FATAL:
  173.         send_pm(maintainer, "REMOVALBOT CRASHED", "Reason: " +
  174.                 extype.__name__+": " + value.message)
  175.         writefile(conf('error'), extype.__name__+": " + value.message)
  176.         exit(3)
  177.  
  178. sys.excepthook = errorhook
  179.  
  180.  
  181. class FatalError(Exception):
  182.     def __init__(self, message):
  183.         self.message = message
  184.         Exception.__init__(self, message)
  185.  
  186.  
  187. def current_epoch():
  188.     return (datetime.now() - datetime.utcfromtimestamp(0)).total_seconds()
  189.  
  190.  
  191. def epoch_to_string(epoch=None, tech=False, short=False):
  192.     if epoch is None:
  193.         epoch = current_epoch()
  194.     try:
  195.         epoch = float(epoch)
  196.     except:
  197.         epoch = 0
  198.     if tech:
  199.         model = "%y%m%d-%H%M"
  200.     elif short:
  201.         model = "%m-%d %H:%M"
  202.     else:
  203.         model = "%Y-%m-%d %H:%M %Z"
  204.     return time.strftime(model, time.localtime(epoch))
  205.  
  206.  
  207. def conf(name):
  208.     return str(os.path.join(CONFDIR, name))
  209.  
  210.  
  211. def newlog(name):
  212.     if LOGGING:
  213.         name = str(os.path.join(LOGDIR, name))
  214.         with closing(open(name, 'w')):
  215.             pass
  216.         return name
  217.     else:
  218.         return '/dev/null'
  219.  
  220.  
  221. def readfile(f):
  222.     with closing(open(f)) as f:
  223.         return f.read()
  224.  
  225.  
  226. def writefile(f, data):
  227.     with closing(open(f, 'w')) as f:
  228.         f.write(data)
  229.  
  230.  
  231. def censor(s, fraction):
  232.     num = int(round(fraction * len(s)))
  233.     change_locs = set(sample(range(len(s)), num))
  234.     changed = ('*' if i in change_locs else c for i, c in enumerate(s))
  235.     return ''.join(changed)
  236.  
  237.  
  238. def getv(query, args=()):
  239.     return (c.execute(query, args).fetchone() or (None,))[0]
  240.  
  241.  
  242. def getlast(what):
  243.     return getv('SELECT time FROM times WHERE name=?', (what,))
  244.  
  245.  
  246. def setlast(what, utc):
  247.     c.execute('INSERT OR REPLACE INTO times VALUES (?, ?)', (what, utc))
  248.     db.commit()
  249.  
  250.  
def login():
    # Perform reddit's OAuth2 "password" grant and return the header dict
    # ({"Authorization": ..., "User-Agent": ...}) used on every subsequent
    # oauth.reddit.com request.  Raises FatalError if no token is granted.
    # Reads CONFDIR/secrets: user, password, client_id, client_secret
    # (whitespace-separated, see module docstring).
    # NOTE(review): USERAGENT is defined outside this chunk.
    print "> Logging in ",
    sys.stdout.flush()
    secrets = readfile(conf('secrets')).split()
    username = secrets[0]
    password = secrets[1]
    client_id = secrets[2]
    client_secret = secrets[3]

    post_data = {"grant_type": "password",
                 "username": username,
                 "password": password}
    headers = {"User-Agent": USERAGENT}
    url = "https://www.reddit.com/api/v1/access_token"

    # The token endpoint requires HTTP basic auth with the app's
    # client_id/client_secret.
    password_mgr = urllib2.HTTPPasswordMgrWithDefaultRealm()
    password_mgr.add_password(None, url, client_id, client_secret)
    handler = urllib2.HTTPBasicAuthHandler(password_mgr)
    opener = urllib2.build_opener(handler)

    response = reddit(url, opener=opener.open,
                      post_data=post_data, headers=headers,
                      raw=True)

    if not ('token_type' in response.keys() and
            'access_token' in response.keys()):
        print >> stderr, response
        raise FatalError("Authorization Failed")
    token = response['token_type']+" "+response['access_token']
    print "- done"
    return {"Authorization": token, "User-Agent": USERAGENT}
  282.  
  283.  
def mkrequest(url, headers=None, post_data=None):
    # Build a urllib2.Request; a non-empty post_data dict makes it a POST
    # with UTF-8 encoded, urlencoded values.
    if not post_data:
        return urllib2.Request(url, None, headers)
    else:
        for k, v in post_data.iteritems():
            if isinstance(v, unicode):
                v = v.encode('utf8')
            elif isinstance(v, str):
                # NOTE(review): the decode result is discarded on purpose or
                # by accident -- as written this only *validates* that v is
                # well-formed UTF-8 (raises UnicodeDecodeError otherwise);
                # it does not transform v.
                v.decode('utf8')
            post_data[k] = v  # mutates the caller's dict in place
        post_data = urllib.urlencode(post_data)
        return urllib2.Request(url, post_data, headers)
  296.  
  297.  
def errordir(e):
    # Debugging aid: dump every attribute of exception *e* to stderr.
    for attr in dir(e):
        print >> stderr, attr, getattr(e, attr)
  301.  
  302.  
  303. def reddit(url, opener=urllib2.urlopen,
  304.            headers=None, post_data=None,
  305.            raw=False, catch_also=[]):
  306.     global auth
  307.     global requests_used
  308.     global requests_remaining
  309.     global requests_reset
  310.     catch = [400, 401, 500, 502, 503, 504, 521] + catch_also
  311.     while True:
  312.         try:
  313.             request = mkrequest(url, headers=headers, post_data=post_data)
  314.             response = opener(request)
  315.             requests_remaining = response.headers.get('x-ratelimit-remaining')
  316.             requests_used = response.headers.get('x-ratelimit-used')
  317.             requests_reset = response.headers.get('x-ratelimit-reset')
  318.             if requests_remaining and requests_reset:
  319.                 remaining = float(requests_remaining)
  320.                 reset = int(requests_reset)
  321.                 if remaining < reset:
  322.                     time.sleep(float(reset - remaining) /
  323.                                remaining)
  324.             response = response.read()
  325.             response = json.loads(response)
  326.             if not raw:
  327.                 response = response['data']
  328.             break
  329.         except (ValueError, KeyError,
  330.                 socket.timeout, socket.error,
  331.                 ssl.SSLError,
  332.                 urllib2.HTTPError, urllib2.URLError) as e:
  333.             print '<' + type(e).__name__ + ': ',
  334.             if type(e).__name__ == 'HTTPError':
  335.                 print str(e.code) + '!>',
  336.                 if e.code not in catch:
  337.                     raise
  338.                 elif e.code == 401:
  339.                     print "(Token expired)",
  340.                     auth = login()
  341.                     headers = auth
  342.             elif type(e).__name__ == 'URLError':
  343.                 if "handshake operation" not in str(e.reason):
  344.                     raise
  345.             else:
  346.                 print '!>',
  347.             sys.stdout.flush()
  348.             time.sleep(5)
  349.         except Exception as e:
  350.             print >> stderr
  351.             print >> stderr, "ERROR!"
  352.             print >> stderr
  353.             errordir(e)
  354.             raise
  355.     return response
  356.  
  357.  
  358. def fetch(query, lastseen, model, kind, catch_also=[]):
  359.     pagecount = 1
  360.     log = ()
  361.     after = ''
  362.  
  363.     class Done(Exception):
  364.         pass
  365.     try:
  366.         while True:
  367.             if pagecount % 10 == 0 and kind != 'noupdate':
  368.                 print 'u(',
  369.                 sys.stdout.flush()
  370.                 if kind == 'comments':
  371.                     new = fetch_posts_since(c_lastcheck=log[0][3], quiet=True)
  372.                     log = new + log
  373.                 elif kind == 'submissions':
  374.                     new = fetch_posts_since(s_lastcheck=log[0][3], quiet=True)
  375.                     log = new + log
  376.                 print ')',
  377.                 sys.stdout.flush
  378.  
  379.             print 'p'+str(pagecount),
  380.             sys.stdout.flush()
  381.  
  382.             response = reddit(query+after, headers=auth, catch_also=catch_also)
  383.  
  384.             for child in [c['data'] for c in response['children']]:
  385.  
  386.                 if child['created_utc'] <= lastseen or \
  387.                    current_epoch() - child['created_utc'] > DAY * 7:
  388.                     raise Done
  389.                 if child['author'] in IGNORE:
  390.                     continue
  391.  
  392.                 fields = []
  393.                 for field in model:
  394.                     fieldtype = field[0]
  395.                     fieldname = field[1]
  396.                     if fieldtype is bool:
  397.                         fields += [child[fieldname] and 1 or 0]
  398.                     else:
  399.                         fields += [fieldtype(child[fieldname])]
  400.  
  401.                 log += (tuple(fields),)
  402.  
  403.             if not response['after']:
  404.                 raise Done
  405.             else:
  406.                 after = '&after='+response['after']
  407.             pagecount += 1
  408.     except Done:
  409.         return tuple(sorted(list(log), key=lambda post: post[3], reverse=True))
  410.  
  411.  
  412. def fetch_posts_since(c_lastcheck=None, s_lastcheck=None, quiet=False):
  413.     baseurl = 'https://oauth.reddit.com/user/removalbot/m/monitor'
  414.     if c_lastcheck is not None:
  415.         if not quiet:
  416.             print "Reading comments -",
  417.             sys.stdout.flush()
  418.         newcomments = fetch(baseurl+'/comments/.json?sort=new&limit=100',
  419.                             c_lastcheck,
  420.                             ((unicode, 'name'),
  421.                              (unicode, 'subreddit'),
  422.                              (unicode, 'author'),
  423.                              (float, 'created_utc'),
  424.                              (unicode, 'link_id'),
  425.                              (unicode, 'body')),
  426.                             kind='comments',
  427.                             catch_also=[403])
  428.         if not quiet:
  429.             print "- "+str(len(newcomments))+" comments new"
  430.  
  431.     if s_lastcheck is not None:
  432.         if not quiet:
  433.             print "Reading submissions -",
  434.             sys.stdout.flush()
  435.         newsubmissions = fetch(baseurl+'/new/.json?limit=100',
  436.                                s_lastcheck,
  437.                                ((unicode, 'name'),
  438.                                 (unicode, 'subreddit'),
  439.                                 (unicode, 'author'),
  440.                                 (float, 'created_utc'),
  441.                                 (unicode, 'link_flair_text'),
  442.                                 (unicode, 'title'),
  443.                                 (unicode, 'domain'),
  444.                                 (unicode, 'selftext')),
  445.                                kind='submissions',
  446.                                catch_also=[403])
  447.         if not quiet:
  448.             print "- "+str(len(newsubmissions))+" submissions new"
  449.  
  450.     if c_lastcheck is not None and s_lastcheck is not None:
  451.         return newcomments, newsubmissions
  452.     elif c_lastcheck is None:
  453.         return newsubmissions
  454.     elif s_lastcheck is None:
  455.         return newcomments
  456.     else:
  457.         raise FatalError("Malformed post retrieval request")
  458.  
  459.  
def get_new():
    # Pull posts newer than the newest stored comment/submission and add
    # them to the working tables.
    print "> Checking for new posts,",
    # NOTE(review): precedence here multiplies last_deletion_check by
    # DELETION_SPACING *before* subtracting from current_epoch() -- this
    # only makes sense if last_deletion_check is a cycle counter rather
    # than an epoch; it is not defined in this chunk, so verify.
    nextdelcheck = str(DELETION_SPACING - int(current_epoch() -
                       last_deletion_check * DELETION_SPACING))
    print "next comparison in "+nextdelcheck+"s                               "

    global c_lastcheck
    global s_lastcheck
    global known_comments
    global known_submissions

    # Resume from the newest post already stored.
    c_lastcheck = getv('SELECT MAX(created_utc) FROM comments')
    s_lastcheck = getv('SELECT MAX(created_utc) FROM submissions')

    newcomments, newsubmissions = fetch_posts_since(c_lastcheck=c_lastcheck,
                                                    s_lastcheck=s_lastcheck)

    for comment in newcomments:
        c.execute('INSERT OR IGNORE INTO comments VALUES (?,?,?,?,?,?)',
                  comment)
    for submission in newsubmissions:
        c.execute('INSERT OR IGNORE INTO submissions VALUES (?,?,?,?,?,?,?,?)',
                  submission)
    db.commit()
  484.  
  485.  
  486. def recurse_into_get_authors(subthread):
  487.     subthread = subthread['data']['children']
  488.     notify = []
  489.     for post in subthread:
  490.         post = post['data']
  491.         if post['author'] != '[deleted]':
  492.             notify += [post['author']]
  493.         if 'replies' in post.keys() and post['replies']:
  494.             notify += recurse_into_get_authors(post['replies'])
  495.     return notify
  496.  
  497.  
def update_notify():
    # Rebuild the notification list from the authors commenting in
    # NOTIFY_THREAD (plus the maintainer) and persist it in the notify table.
    global notify
    print "> Updating users to notify",
    sys.stdout.flush()
    pm_thread = reddit(NOTIFY_THREAD+'.json',
                       headers=auth, raw=True)
    # Element [1] of a comments-page response is the comment listing.
    notify = recurse_into_get_authors(pm_thread[1])
    notify = list(set(notify))  # de-duplicate
    notify += [maintainer]
    c.execute('DELETE FROM notify')
    for user in notify:
        c.execute('INSERT INTO notify VALUES (?)', (user,))
    db.commit()
    print "- done (" + str(len(notify)) + " users: " + ", ".join(notify) + ")"
  512.  
  513.  
def update_flairs():
    # Refresh the flairs table with the flair templates currently available
    # in /r/removalbot.  The flairselector endpoint needs a concrete link,
    # so the newest submission is used as the probe.
    print "> Fetching flairs",
    sys.stdout.flush()
    latestposturl = 'https://oauth.reddit.com/r/removalbot/new/.json?limit=1'
    latestpostname = reddit(latestposturl, headers=auth)
    latestpostname = latestpostname['children'][0]['data']['name']
    flairurl = 'https://oauth.reddit.com/r/removalbot/api/flairselector'
    post_data = {'link': latestpostname}
    flairchoices = reddit(flairurl,
                          post_data=post_data, headers=auth,
                          raw=True)['choices']
    c.execute('DELETE FROM flairs')
    for flair in flairchoices:
        c.execute('INSERT INTO flairs VALUES (?,?)',
                  (flair['flair_text'], flair['flair_template_id']))
    db.commit()
    print "- done"
  531.  
  532.  
  533. def get_fullname_new(response):
  534.     response = response['jquery']
  535.     is_redirect = False
  536.     fullname_new = None
  537.     for line in response:
  538.         if is_redirect:
  539.             fullname_new = 't3_'+line[3][0].split('/')[-3]
  540.         if line[2] == 'attr' and line[3] == 'redirect':
  541.             is_redirect = True
  542.     if fullname_new:
  543.         return fullname_new
  544.     else:
  545.         print response
  546.         raise FatalError("Malformed response from reddit")
  547.  
  548.  
  549. def setflair(fullname, text):
  550.     fid = getv('SELECT flair_template_id FROM flairs WHERE name=?', (text,))
  551.     if not fid:
  552.         return False
  553.     post_data = {'link': fullname,
  554.                  'api_type': 'json',
  555.                  'flair_template_id': fid}
  556.     if not DUMMY:
  557.         reddit('https://oauth.reddit.com/r/removalbot/api/selectflair',
  558.                post_data=post_data, headers=auth, raw=True)
  559.     return True
  560.  
  561.  
def send_pm(to, subject, text):
    # Send a private message via the reddit compose endpoint.
    # Honors the DUMMY dry-run switch (still prints either way).
    post_data = {'api_type': 'json',
                 'subject': subject,
                 'text': text,
                 'to': to}
    print "Sending PM '"+subject+"' to /u/"+to,
    sys.stdout.flush()
    if not DUMMY:
        reddit('https://oauth.reddit.com/api/compose',
               post_data=post_data, headers=auth, raw=True)
    print "- sent"
  573.  
  574.  
  575. def timestring(span):
  576.     spanstring = ''
  577.     times = ((60.0, 'min'), (60.0, 'h'), (24.0, 'd'))
  578.     for t in times:
  579.         span /= t[0]
  580.         if span > 5:
  581.             spanstring = str(round(span, 1)) + t[1]
  582.     return spanstring
  583.  
  584.  
def compare_update(kind, newposts):
    # Diff the freshly fetched listing (*newposts*) against the stored
    # table *kind* ('comments' or 'submissions'): posts present in the DB
    # but missing from the listing are moved to <kind>_deleted with a
    # 'spotted' timestamp; the working table is then rebuilt from newposts.
    # NOTE(review): *kind* is interpolated into SQL -- safe only while it
    # is one of the two hard-coded table names.

    # Oldest new post per subreddit = lower edge of the comparison window.
    new_oldestposts = {}
    for post in newposts:
        if post[1] not in new_oldestposts.keys() or \
           post[3] < new_oldestposts[post[1]]:
            new_oldestposts[post[1]] = post[3]

    # Drop stored posts older than the window; they "fell out" of the
    # listing naturally and can no longer be compared.
    fell = 0
    old_newestposts = {}
    for sub in new_oldestposts.keys():
        old_newestposts[sub] = getv('SELECT MAX(created_utc) \
                                    FROM ' + kind + ' \
                                    WHERE subreddit=?', (sub,))
        fell += getv('SELECT COUNT(name) FROM ' + kind + ' WHERE \
                     subreddit=? AND created_utc<=?',
                     (sub, new_oldestposts[sub]))
        c.execute('DELETE FROM ' + kind + ' WHERE \
                  subreddit=? AND created_utc<=?',
                  (sub, new_oldestposts[sub]))

    db.commit()

    # NOTE(review): str(new_ids) abuses tuple repr as an SQL IN-list; it
    # produces invalid SQL for a 1-element tuple ("('x',)") and for an
    # empty one ("()") -- apparently relied on never happening here.
    new_ids = tuple(str(post[0]) for post in newposts)

    # Whatever is still stored but absent from the listing was removed.
    deleted = tuple(c.execute('SELECT name FROM ' + kind + ' \
                              WHERE name NOT IN ' + str(new_ids)))

    if deleted:
        print "Deleted "+kind+": " + ', '.join([post[0].split('_')[1]
                                               for post in deleted])

    # Archive removed posts with the current time as 'spotted'.
    c.execute('INSERT OR IGNORE INTO ' + kind + '_deleted \
              SELECT *, STRFTIME("%s", "now") FROM ' + kind + ' \
              WHERE name NOT IN ' + str(new_ids))

    c.execute('DELETE FROM '+kind)

    db.commit()

    # Rebuild the working table from the fresh listing.
    for post in newposts:
        c.execute('INSERT OR IGNORE INTO ' + kind + ' VALUES ' +
                  ('(' + ','.join(['?'] * len(post)) + ')'),
                  post)

    # Count posts newer than the previously stored newest, per subreddit.
    new = 0
    for sub in old_newestposts.keys():
        new += getv('SELECT COUNT(name) FROM ' + kind + ' WHERE \
                    subreddit=? AND created_utc>?',
                    (sub, old_newestposts[sub]))

    total = getv('SELECT COUNT(name) FROM ' + kind)

    db.commit()

    print str(len(deleted)) + ' ' + kind + ' deleted,',
    print str(new) + ' new,',
    print str(fell) + ' fell out -',
    print str(total) + ' total'
  643.  
  644.  
def check_deletions():
    # Re-fetch the complete recent listing (lastcheck=0 means "everything
    # within the 7-day window") and diff it against the stored tables.

    print "> Checking for deletions                                       "

    (newcomments, newsubmissions) = fetch_posts_since(c_lastcheck=0,
                                                      s_lastcheck=0)

    compare_update('comments', newcomments)
    compare_update('submissions', newsubmissions)
  654.  
  655.  
  656. def check_user_deletion(post, kind):
  657.     safe_domains = readfile(conf('safedomains')).strip().split()
  658.     u = readfile(conf('unknowndomains')).strip().split('\n')
  659.     unknown_domains = {}
  660.     for d in u:
  661.         if not d:
  662.             continue
  663.         d = d.strip().split()
  664.         unknown_domains[d[0]] = int(d[1])
  665.  
  666.     print "> Checking for user deletion of " + kind + " " + post[0] + \
  667.         (" from " + post[4] if kind == 'comment' else '') + \
  668.         " in " + post[1] + ",",
  669.     left = getv('SELECT COUNT(*) FROM (SELECT name FROM comments_deleted \
  670.                     UNION SELECT name FROM submissions_deleted)')-1
  671.     print str(left or 'no') + " more left to check -",
  672.     sys.stdout.flush()
  673.  
  674.     spotted = post[-1]
  675.  
  676.     name = post[0].split('_')[1]
  677.     sub = post[1]
  678.     author = post[2]
  679.     posted = post[3]
  680.  
  681.     compare = tuple(comment[0] for comment in
  682.                     tuple(c.execute('SELECT created_utc FROM ' + kind + 's \
  683.                                    WHERE subreddit=? \
  684.                                    ORDER BY created_utc ASC', (sub,))))
  685.     mincompare = min(compare) if compare else current_epoch() - DAY*7
  686.     compare = compare[-int(len(compare)*0.95)] if compare \
  687.         else current_epoch() - DAY*7
  688.  
  689.     print str(round((current_epoch() - posted) / HOUR, 2)) + "h vs cutoff " + \
  690.         str(round((current_epoch() - compare) / HOUR, 2)) + "h, oldest " + \
  691.         str(round((current_epoch() - mincompare) / HOUR, 2)) + "h -",
  692.     sys.stdout.flush()
  693.     if posted <= compare:
  694.         print "too old"
  695.         return False
  696.  
  697.     title = ''
  698.     if kind == 'comment':
  699.         link_id = post[4].split('_')[1]
  700.         content = post[5]
  701.         baseurl = 'https://oauth.reddit.com/user/'+author
  702.         usercomments = {}
  703.         shadowbanned = False
  704.         try:
  705.             usercomments = fetch(baseurl+'/comments/.json?sort=new&limit=100',
  706.                                  posted - 30,
  707.                                  ((unicode, 'name'),
  708.                                   (unicode, 'subreddit'),
  709.                                   (unicode, 'author'),
  710.                                   (float, 'created_utc'),
  711.                                   (unicode, 'link_id'),
  712.                                   (unicode, 'body')),
  713.                                  kind='noupdate')
  714.         except urllib2.HTTPError as e:
  715.             if e.code == 404:
  716.                 print "- " + author + " shadowbanned or deleted",
  717.                 url = 'https://oauth.reddit.com/r/' + sub + '/comments/' + \
  718.                       link_id + '/comment/' + name + '/.json'
  719.                 try:
  720.                     reddit(url, headers=auth, raw=True)
  721.                     print "- deleted"
  722.                     return False
  723.                 except urllib2.HTTPError as e:
  724.                     if e.code == 404:
  725.                         print "- shadowbanned",
  726.                         shadowbanned = True
  727.                     else:
  728.                         raise
  729.             else:
  730.                 raise
  731.         usercomments = {comment[0]: comment
  732.                         for comment in usercomments}
  733.         if post[0] not in usercomments.keys() and not shadowbanned:
  734.             print "- deleted by "+author+" ("+((content[:47]+"...")
  735.                                                if len(content) > 50
  736.                                                else content).replace('\n',
  737.                                                                      ' / ')+")"
  738.             return False
  739.         else:
  740.             if not shadowbanned:
  741.                 post = usercomments[post[0]]
  742.                 name = post[0].split('_')[1]
  743.                 sub = post[1]
  744.                 author = post[2]
  745.                 posted = post[3]
  746.                 link_id = post[4].split('_')[1]
  747.                 content = post[5]
  748.             print "- deleted by mods"
  749.             title1 = unicode(epoch_to_string(short=True) + " - '")
  750.             title2 = unicode("' by /u/" + author +
  751.                              " removed from /r/" + sub)
  752.             lower_frame_boundary = spotted - DELETION_SPACING
  753.             upper_frame_boundary = spotted
  754.             frame_lower = round((float(lower_frame_boundary) - posted) / 60)
  755.             if frame_lower < 0:
  756.                 frame_lower = 0
  757.             frame_upper = round((float(upper_frame_boundary) - posted) / 60)
  758.             if frame_lower != frame_upper:
  759.                 frame = str(int(frame_lower)) + "-" + str(int(frame_upper))
  760.             else:
  761.                 frame = str(int(frame_lower))
  762.             title2 += " within " + frame + "min"
  763.             if shadowbanned:
  764.                 title2 += u" (user shadowbanned)"
  765.             restlen = ALLOWED_TITLE_LENGTH - (len(title1) + len(title2))
  766.             intro = re.sub(r'&gt;.*\n', '[quote]', content)
  767.  
  768.             intro = re.sub(r'\[([^\]]*)\]\([^\)]*\)', r'[\1]', intro)
  769.             intro = re.sub(URLREGEX, '[link]', intro)
  770.  
  771.             intro = intro.replace('/r/', 'r/')
  772.             intro = intro.replace('/u/', 'u/')
  773.             intro = re.sub(r' +', ' ', intro)
  774.             intro = re.sub(r'[ \n/][ \n/]+', ' / ', intro)
  775.             intro = intro.strip(' \n/')
  776.  
  777.             links = []
  778.             for url in re.finditer(r'\[([^\]]*)\][ \n]?\(([^\)]*)\)', content):
  779.                 links += [url.group(2)]
  780.                 content = content.replace(url.group(0),
  781.                                           '[' + url.group(1) + ']^^' +
  782.                                           str(len(links)) + ' ')
  783.             for url in re.finditer(URLREGEX, content):
  784.                 links += [url.group(0)]
  785.                 content = content.replace(url.group(0),
  786.                                           '[link]^^' + str(len(links)) + ' ')
  787.  
  788.             if len(intro) > restlen:
  789.                 intro = unicode(intro[:restlen-3].strip(' ./,') + "...")
  790.             else:
  791.                 intro = unicode(intro)
  792.             title = title1 + intro + title2
  793.             title = title[:ALLOWED_TITLE_LENGTH]
  794.             body = "'''\n\n"+content+"\n\n'''\n\n"
  795.             if post[4] == 'None':
  796.                 body = "No link could be determined."
  797.                 link = "Unknown"
  798.             else:
  799.                 linkbase = "/r/" + sub + "/comments/" + link_id + \
  800.                            "/comment/" + name + "?context=999"
  801.                 link = "https://reddit.com" + linkbase
  802.                 goldfishlink = "http://r.go1dfish.me" + linkbase
  803.                 unredditlink = "https://unreddit.com" + linkbase
  804.                 body += "[Context Link](" + link + ")\n\n"
  805.                 body += "[Go1dfish undelete link](" + goldfishlink + ")\n\n"
  806.                 body += "[unreddit undelete link](" + unredditlink + ")"
  807.             body += "\n\nAuthor: /u/" + author
  808.             if links:
  809.                 body += "\n\n"
  810.                 unknowns = False
  811.                 for l in range(len(links)):
  812.                     try:
  813.                         domain = tld.get_tld(links[l])
  814.                     except tld.exceptions.TldBadUrl:
  815.                         domain = 'reddit.com'
  816.                     except tld.exceptions.TldDomainNotFound:
  817.                         domain = 'malformed.domain'
  818.                         print "Malformed domain: " + links[l]
  819.                     if domain in safe_domains:
  820.                         body += str(l+1) + ': ' + links[l] + '  \n'
  821.                     else:
  822.                         unknowns = True
  823.                         if domain not in DOMAIN_BLACKLIST:
  824.                             if domain in unknown_domains.keys():
  825.                                 unknown_domains[domain] += 1
  826.                             else:
  827.                                 unknown_domains[domain] = 1
  828.                             with closing(open(conf('unknowndomains'),
  829.                                          'w')) as f:
  830.                                 for d in unknown_domains:
  831.                                     f.write(d+' '+str(unknown_domains[d])+'\n')
  832.                         oblink = re.sub(r'.*://', '', links[l])
  833.                         if domain != "maldormed.domain":
  834.                             oblink = censor(oblink, 0.25)
  835.                         body += str(l+1) + ': `' + oblink + '`  \n'
  836.                 if unknowns:
  837.                     body += "\nUnknown links are censored to prevent \
  838.                        spreading illicit content."
  839.             print title,
  840.  
  841.     elif kind == 'submission':
  842.         reason = post[4]
  843.         subject = post[5]
  844.         domain = post[6]
  845.         selftext = post[7]
  846.         baseurl = 'https://oauth.reddit.com/user/'+author
  847.         shadowbanned = False
  848.         usersubmissions = {}
  849.         try:
  850.             usersubmissions = fetch(baseurl +
  851.                                     '/submitted/.json?sort=new&limit=100',
  852.                                     posted - 30,
  853.                                     ((unicode, 'name'),
  854.                                      (unicode, 'subreddit'),
  855.                                      (unicode, 'author'),
  856.                                      (float, 'created_utc'),
  857.                                      (unicode, 'link_flair_text'),
  858.                                      (unicode, 'title'),
  859.                                      (unicode, 'domain'),
  860.                                      (unicode, 'selftext')),
  861.                                     kind='noupdate')
  862.         except urllib2.HTTPError as e:
  863.             if e.code in [403, 404]:
  864.                 print "- " + author + " shadowbanned or deleted",
  865.                 url = 'https://oauth.reddit.com/r/' + sub + '/comments/' + \
  866.                       name + '/.json'
  867.                 try:
  868.                     reddit(url, headers=auth, raw=True)
  869.                     print "- deleted"
  870.                     return False
  871.                 except urllib2.HTTPError as e:
  872.                     if e.code in [403, 404]:
  873.                         print "- shadowbanned",
  874.                         shadowbanned = True
  875.                     else:
  876.                         raise
  877.             else:
  878.                 raise
  879.         usersubmissions = {submission[0]: submission
  880.                            for submission in usersubmissions}
  881.         if post[0] not in usersubmissions.keys() and not shadowbanned:
  882.             print "- deleted by "+author+" ("+((subject[:47]+"...")
  883.                                                if len(subject) > 50
  884.                                                else subject)+")"
  885.             return False
  886.         else:
  887.             reason = 'None'
  888.             if not shadowbanned and not post[7]:
  889.                 post = usersubmissions[post[0]]
  890.                 name = post[0].split('_')[1]
  891.                 sub = post[1]
  892.                 author = post[2]
  893.                 posted = post[3]
  894.                 reason = post[4]
  895.                 subject = post[5]
  896.                 domain = post[6]
  897.                 selftext = post[7]
  898.             reason = reason.lower()
  899.             if 'removed' in reason:
  900.                 reason = reason.replace('removed', '').strip(u' -|—')
  901.                 reason = reason.capitalize() or 'None'
  902.                 reason = " - reason: "+reason
  903.             else:
  904.                 reason = ''
  905.             print "- deleted by mods" + reason
  906.             lower_frame_boundary = spotted - DELETION_SPACING
  907.             upper_frame_boundary = spotted
  908.             frame_lower = round((float(lower_frame_boundary) - posted) / 60)
  909.             if frame_lower < 0:
  910.                 frame_lower = 0
  911.             frame_upper = round((float(upper_frame_boundary) - posted) / 60)
  912.             if frame_lower != frame_upper:
  913.                 frame = str(int(frame_lower)) + "-" + str(int(frame_upper))
  914.             else:
  915.                 frame = str(int(frame_lower))
  916.             title1 = unicode(epoch_to_string(short=True) + " - '")
  917.             title2 = unicode("' (" + domain + ") by /u/" + author +
  918.                              " removed from /r/" + sub +
  919.                              " within " + frame + "min" + reason)
  920.             if shadowbanned:
  921.                 title2 += u" (user shadowbanned)"
  922.             restlen = ALLOWED_TITLE_LENGTH - (len(title1) + len(title2))
  923.             if len(subject) > restlen:
  924.                 intro = unicode(subject[:restlen-3].strip(' ./') + "...")
  925.             else:
  926.                 intro = unicode(subject)
  927.             title = title1 + intro + title2
  928.             title = title[:ALLOWED_TITLE_LENGTH]
  929.             linkbase = "/r/" + sub + "/comments/" + name
  930.             link = "https://reddit.com/" + linkbase
  931.             goldfishlink = "http://r.go1dfish.me" + linkbase
  932.             unredditlink = "https://unreddit.com" + linkbase
  933.             body = ""
  934.             links = []
  935.             if selftext:
  936.                 links = []
  937.                 for url in re.finditer(r'\[([^\]]*)\][ \n]?\(([^\)]*)\)',
  938.                                        selftext):
  939.                     links += [url.group(2)]
  940.                     selftext = selftext.replace(url.group(0),
  941.                                                 '[' + url.group(1) + ']^^' +
  942.                                                 str(len(links)) + ' ')
  943.                 for url in re.finditer(URLREGEX, selftext):
  944.                     links += [url.group(0)]
  945.                     selftext = selftext.replace(url.group(0),
  946.                                                 '[link]^^' +
  947.                                                 str(len(links)) + ' ')
  948.                 body = "'''\n\n" + selftext + "\n\n'''\n\n"
  949.             body += "[" + subject + "](" + link + ")\n\n"
  950.             body += "[Go1dfish undelete link](" + goldfishlink + ")\n\n"
  951.             body += "[unreddit undelete link](" + unredditlink + ")\n\n"
  952.             body += "Author: /u/" + author
  953.             if links:
  954.                 body += "\n\n"
  955.                 unknowns = False
  956.                 for l in range(len(links)):
  957.                     try:
  958.                         domain = tld.get_tld(links[l])
  959.                     except tld.exceptions.TldBadUrl:
  960.                         domain = 'reddit.com'
  961.                     except tld.exceptions.TldDomainNotFound:
  962.                         domain = 'malformed.domain'
  963.                         print "Malformed domain: " + links[l]
  964.                     if domain in safe_domains:
  965.                         body += str(l+1) + ': ' + links[l] + '  \n'
  966.                     else:
  967.                         unknowns = True
  968.                         if domain not in DOMAIN_BLACKLIST:
  969.                             if domain in unknown_domains.keys():
  970.                                 unknown_domains[domain] += 1
  971.                             else:
  972.                                 unknown_domains[domain] = 1
  973.                             with closing(open(conf('unknowndomains'),
  974.                                          'w')) as f:
  975.                                 for d in unknown_domains:
  976.                                     f.write(d+' '+str(unknown_domains[d])+'\n')
  977.                         oblink = re.sub(r'.*://', '', links[l])
  978.                         if domain != "maldormed.domain":
  979.                             oblink = censor(oblink, 0.25)
  980.                         body += str(l+1) + ': `' + oblink + '`  \n'
  981.                 if unknowns:
  982.                     body += "\nUnknown links are censored to prevent \
  983.                        spreading illicit content."
  984.             print title,
  985.  
  986.     h = HTMLParser()
  987.     title = h.unescape(title)
  988.     body = h.unescape(body)
  989.     post_data = {'sr': 'removalbot', 'title': title,
  990.                  'kind': 'self', 'text': body}
  991.     if not DUMMY:
  992.         response = reddit('https://oauth.reddit.com/api/submit',
  993.                           post_data=post_data, headers=auth, raw=True)
  994.     print "- submitted",
  995.     sys.stdout.flush()
  996.     if not DUMMY:
  997.         fullname_new = get_fullname_new(response)
  998.         if setflair(fullname_new, kind+'-'+sub.lower()):
  999.             print "- flaired"
  1000.         else:
  1001.             print "- no flair"
  1002.  
  1003.     if not shadowbanned and getv('SELECT author FROM notify WHERE author=?',
  1004.                                  (author,)):
  1005.         pm_subject = "Your "+kind+" was deleted from /r/"+sub
  1006.         if kind == 'comment':
  1007.             content = content.strip(' \n')
  1008.             if '\n\n' in content:
  1009.                 content = '\n\n'+content+'\n\n'
  1010.         else:
  1011.             content = '  \n&nbsp;**Reason**: '+str(reason)
  1012.         content = h.unescape(content)
  1013.         pm_body = 'Hello, **'+author+'**!\n\n&nbsp;\n\n'
  1014.         pm_body += 'Your '+kind+' appears to have been deleted '
  1015.         pm_body += 'from **/r/' + sub + '** by the moderators, '
  1016.         pm_body += '/u/AutoModerator or the administrators.\n\n'
  1017.         pm_body += '&nbsp;**'+kind.capitalize()+'**: '+content+'  \n'
  1018.         pm_body += '&nbsp;**Posted at**: '+epoch_to_string(posted)+'  \n'
  1019.         pm_body += '&nbsp;**Delay until deletion**: '+frame+'min  \n'
  1020.         pm_body += '&nbsp;**Link**: '+str(link)+'\n\n&nbsp;\n\n'
  1021.         pm_body += 'Have a nice day!  \n'
  1022.         pm_body += '/u/removalbot\n\n'
  1023.         pm_body += '----\n\n'
  1024.         pm_body += '^(Note that the deletion may have been accidental '
  1025.         pm_body += 'or its detection a false positive caused by heavy load '
  1026.         pm_body += 'on reddit\'s servers.)  \n'
  1027.         pm_body += '^^^This ^^^is ^^^an ^^^automated ^^^message ^^^from '
  1028.         pm_body += '^^^/r/removalbot. ^^^If ^^^you ^^^wish ^^^to ^^^disable '
  1029.         pm_body += '^^^such ^^^messages ^^^in ^^^the ^^^future, ^^^please '
  1030.         pm_body += '^^^visit [^^^this ^^^thread]('
  1031.         pm_body += NOTIFY_THREAD.replace('oauth', 'www')+') ^^^and '
  1032.         pm_body += '^^^delete ^^^your ^^^comment(s) ^^^there.'
  1033.         send_pm(author, pm_subject, pm_body)
  1034.  
  1035.     return True
  1036.  
  1037. requests_used = None
  1038. requests_remaining = None
  1039. requests_reset = None
  1040.  
  1041. maintainer = readfile(conf('maintainer')).strip()
  1042. version = readfile(conf('version')).strip()
  1043. USERAGENT = 'removalbot by /u/'+maintainer+', v'+version
  1044.  
  1045. os.chdir(CONFDIR)
  1046.  
  1047. db = sqlite3.connect(conf('posts.db'))
  1048. c = db.cursor()
  1049.  
  1050. auth = login()
  1051.  
  1052. update_flairs()
  1053.  
  1054. if os.path.isfile(conf('error')) and \
  1055.    readfile(conf('error')) != 'KeyboardInterrupt':
  1056.     failed = os.path.getctime(conf('error'))
  1057.     failed = epoch_to_string(epoch=failed)
  1058.     reason = readfile(conf('error'))
  1059.     send_pm(maintainer, "REMOVALBOT CRASHED", "Reason: "+reason)
  1060.     e = reason.split(':')[0]
  1061.     print e, e in ALSO_FATAL
  1062.     if not DUMMY and (SUBMIT_ERRORS or e == "FatalError" or e in ALSO_FATAL):
  1063.         print "> Submitting error "+reason,
  1064.         sys.stdout.flush()
  1065.         title = "[!] Bot encountered an error at " + failed + \
  1066.             ", reason: " + reason
  1067.         post_data = {'sr': 'removalbot', 'kind': 'self', 'title': title}
  1068.         if reason != "FatalError":
  1069.             post_data['text'] = "Such errors usually indicate that reddit is \
  1070.                                 overloaded or in maintenance mode, i. e. \
  1071.                                 they are unavoidable.\n\nA database is used \
  1072.                                 to minimize the impact but cannot negate it."
  1073.         else:
  1074.             post_data['text'] = "The bot encountered a fatal error. This \
  1075.                                 should not happen. The maintainer has been \
  1076.                                 notified; until further action, the bot is \
  1077.                                 suspended."
  1078.         response = reddit('https://oauth.reddit.com/api/submit',
  1079.                           post_data=post_data, headers=auth, raw=True)
  1080.         setflair(get_fullname_new(response), 'error')
  1081.         print "- done"
  1082.     else:
  1083.         print "> Encountered error "+reason
  1084.     os.remove(conf('error'))
  1085.  
  1086. update_notify()
  1087.  
  1088. while True:
  1089.     now = int(current_epoch())
  1090.     last_deletion_check = getlast('deletion')
  1091.     last_new_check = getlast('new')
  1092.     last_subscriber_check = getlast('subscribers')
  1093.     last_flair_check = getlast('flair')
  1094.     changed = False
  1095.     if int(now / DELETION_SPACING) != last_deletion_check:
  1096.         check_deletions()
  1097.         setlast('deletion', int(now / DELETION_SPACING))
  1098.     elif int(now / NEW_SPACING) != last_new_check:
  1099.         get_new()
  1100.         setlast('new', int(now / NEW_SPACING))
  1101.     elif int(now / SUBSCRIBER_SPACING) != last_subscriber_check:
  1102.         update_notify()
  1103.         setlast('subscribers', int(now / SUBSCRIBER_SPACING))
  1104.     elif int(now / FLAIR_SPACING) != last_flair_check:
  1105.         update_flairs()
  1106.         setlast('flair', int(now / FLAIR_SPACING))
  1107.     else:
  1108.         deleted_submission = \
  1109.             c.execute('SELECT * FROM submissions_deleted LIMIT 1').fetchone()
  1110.         deleted_comment = \
  1111.             c.execute('SELECT * FROM comments_deleted LIMIT 1').fetchone()
  1112.         left = getv('SELECT COUNT(*) FROM (SELECT name FROM comments_deleted \
  1113.                 UNION SELECT name FROM submissions_deleted)')-1
  1114.         if deleted_submission:
  1115.             check_user_deletion(deleted_submission, 'submission')
  1116.             c.execute('DELETE FROM submissions_deleted WHERE name=?',
  1117.                       (deleted_submission[0],))
  1118.         elif deleted_comment:
  1119.             check_user_deletion(deleted_comment, 'comment')
  1120.             c.execute('DELETE FROM comments_deleted WHERE name=?',
  1121.                       (deleted_comment[0],))
  1122.  
  1123.         if not deleted_submission and not deleted_comment:
  1124.             time.sleep(0.1)
  1125.             continue
  1126.         elif left == 0:
  1127.             c_unknown = readfile(conf('unknowndomains')).strip().split('\n')
  1128.             print "> Done undeleting, " + str(len(c_unknown)) + \
  1129.                 " unknown domains logged"
  1130.  
  1131.     print
  1132.     if requests_remaining and requests_reset and requests_used:
  1133.         requests_remaining = str(int(float(requests_remaining)))
  1134.         print "Used: " + requests_used + ", remaining: " + \
  1135.               requests_remaining + " in " + requests_reset + "s",
  1136.         remaining = float(requests_remaining)
  1137.         reset = int(requests_reset)
  1138.         print "(" + str(round(remaining / reset, 2)) + "/s)",
  1139.         if remaining < reset:
  1140.             sleeptime = str(int(round(float(reset - remaining) /
  1141.                                 remaining, 3) * 1000))
  1142.             print "- OVERLOAD, sleep " + sleeptime + "ms"
  1143.         else:
  1144.             print "- OK\r",
  1145.         sys.stdout.flush()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement