Advertisement
Draugexa

Untitled

Feb 9th, 2018
108
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 5.68 KB | None | 0 0
  1. import sqlite3
  2. import json
  3. from datetime import datetime
  4. import bz2
  5.  
  6. timeframe = 'comments_fiveplus'
  7. sql_transaction = []
  8.  
  9. connection = sqlite3.connect('{}.db'.format(timeframe))
  10. c = connection.cursor()
  11.  
  12. def create_table(time):
  13.     c.execute("""CREATE TABLE IF NOT EXISTS '{}'(parent_id TEXT PRIMARY KEY,
  14.                                                        comment_id TEXT UNIQUE,
  15.                                                        parent TEXT,
  16.                                                        comment TEXT,
  17.                                                        subreddit TEXT,
  18.                                                        unix INT,
  19.                                                        score INT)""".format(time))
  20.  
  21. def format_data(data):
  22.     data = data.replace("\n", " newlinechar ").replace("\r", " returnchar ").replace('"',"'")
  23.     return data
  24.  
  25. def find_existing_score(pid, time):
  26.     try:
  27.         sql = "SELECT score FROM '{}' WHERE parent_id = '{}' LIMIT 1".format(time, pid)
  28.         c.execute(sql)
  29.         result = c.fetchone()
  30.         if result != None:
  31.             return result[0]
  32.         else:
  33.             return False
  34.     except Exception as e:
  35.         return False
  36.  
  37. def acceptable(data):
  38.     # if more than fifty words
  39.     if len(data.split(' ')) > 50 or len(data) < 1:
  40.         return False
  41.     # if more than 1000 characters (spam)
  42.     elif len(data) > 1000:
  43.         return False
  44.     # if deleted
  45.     elif data == '[deleted]' or data == '[removed]':
  46.         return False
  47.     else:
  48.         return True
  49.  
  50. def find_parent(pid, time):
  51.     try:
  52.         sql = "SELECT comment FROM '{}' WHERE comment_id = '{}' LIMIT 1".format(time, pid)
  53.         c.execute(sql)
  54.         result = c.fetchone()
  55.         if result != None:
  56.             return result[0]
  57.         else:
  58.             return False
  59.     except Exception as e:
  60.         print("find_parent", e)
  61.         return False
  62.  
  63. def transaction_bldr(sql):
  64.     global sql_transaction
  65.     sql_transaction.append(sql)
  66.     if len(sql_transaction) > 5000:
  67.         c.execute('BEGIN TRANSACTION')
  68.         for s in sql_transaction:
  69.             try:
  70.                 c.execute(s)
  71.             except:
  72.                 pass
  73.         connection.commit()
  74.         sql_transaction = []
  75.  
  76. def sql_insert_replace_comment(sourcetime, commentid,parentid,parent,comment,subreddit,time,score):
  77.     try:
  78.         sql = """UPDATE '{}' SET parent_id = ?, comment_id = ?, parent = ?, comment = ?, subreddit = ?, unix = ?, score = ? WHERE parent_id =?;""".format(sourcetime, parentid, commentid, parent, comment, subreddit, int(time), score, parentid)
  79.         transaction_bldr(sql)
  80.     except Exception as e:
  81.         print('s-UPDATE insertion',str(e))
  82.  
  83. def sql_insert_has_parent(sourcetime, commentid,parentid,parent,comment,subreddit,time,score):
  84.     try:
  85.         sql = """INSERT INTO '{}' (parent_id, comment_id, parent, comment, subreddit, unix, score) VALUES ("{}","{}","{}","{}","{}",{},{});""".format(sourcetime, parentid, commentid, parent, comment, subreddit, int(time), score)
  86.         transaction_bldr(sql)
  87.     except Exception as e:
  88.         print('s-PARENT insertion',str(e))
  89.  
  90. def sql_insert_no_parent(sourcetime, commentid,parentid,comment,subreddit,time,score):
  91.     try:
  92.         sql = """INSERT INTO '{}' (parent_id, comment_id, comment, subreddit, unix, score) VALUES ("{}","{}","{}","{}",{},{});""".format(sourcetime, parentid, commentid, comment, subreddit, int(time), score)
  93.         transaction_bldr(sql)
  94.     except Exception as e:
  95.         print('s-NO_PARENT insertion',str(e))
  96.  
  97. def openloop(time):
  98.     row_counter = 0
  99.     paired_rows = 0
  100.     sourcefile = "//APOLLO/home/Programming/reddit_data/Uncompressed/RC_{}".format(time)
  101.  
  102.     try:
  103.         with open(sourcefile, buffering = 5000) as f:
  104.             create_table(time)
  105.             for row in f:
  106.                 row_counter += 1
  107.                 row = json.loads(row)
  108.                 parent_id = row['parent_id']
  109.                 body = format_data(row['body'])
  110.                 created_utc = row['created_utc']
  111.                 score = row['score']
  112.                 subreddit = row['subreddit']
  113.  
  114.                 if 'name' in row:
  115.                     # old reddit storage
  116.                     comment_id = row['name']
  117.                 else:
  118.                     # new reddit storage. add 't1_' to make it same as before
  119.                     comment_id = 't1_' + row['id']
  120.  
  121.                 parent_data = find_parent(parent_id, time)
  122.  
  123.                 if score >= 5:
  124.                     if acceptable(body):
  125.                         existing_comment_score = find_existing_score(parent_id, time)
  126.                         if existing_comment_score:
  127.                             if score > existing_comment_score:
  128.                                 sql_insert_replace_comment(time, comment_id, parent_id, parent_data, body, subreddit, created_utc, score)
  129.                         else:
  130.                             if parent_data:
  131.                                 sql_insert_has_parent(time, comment_id, parent_id, parent_data, body, subreddit, created_utc, score)
  132.                                 paired_rows += 1
  133.                             else:
  134.                                 sql_insert_no_parent(time, comment_id, parent_id, body, subreddit, created_utc, score)
  135.  
  136.             print("Total rows read: {}, Paired Rows: {}, Time: {}".format(row_counter, paired_rows, str(datetime.now())))
  137.     except Exception as e:
  138.         print(e)
  139.  
  140. def main():
  141.     for y in range(14,16):
  142.         for x in range(1,13):
  143.             time = "20{:0>2d}-{:0>2d}".format(16,x)
  144.             print('Beginning work on RC_{}'.format(time))
  145.             openloop(time)
  146.  
  147. if __name__ == "__main__":
  148.     main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement