Advertisement
Guest User

Untitled

a guest
May 2nd, 2021
123
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 4.86 KB | None | 0 0
  1. import sqlite3
  2. import json
  3. from datetime import datetime
  4.  
  5. timeframe = '2015-05'
  6. sql_transaction = []
  7. #creates data base and names it the value in timeframe variable
  8. connection = sqlite3.connect('{}.db'.format(timeframe))
  9. c = connection.cursor()
  10.    
  11. def create_table():
  12.     #creates a table if one does not exist and names it parent reply
  13.     c.execute("""CREATE TABLE IF NOT EXISTS parent_reply
  14. (parent_id TEXT PRIMARY KEY, comment_id TEXT UNIQUE, parent TEXT, comment TEXT,
  15. subreddit TEXT, unix INT, score INT)""")
  16.  
  17. def format_data(data):
  18.     data=data.replace("\n"," newlinechar ").replace("\r"," newlinechar ").replace('"',"'")
  19.     return data
  20.  
  21. def find_parent(pid):
  22.     try:
  23.         sql = "SLECT comment FROM parent_reply WHERE comment_id = '{}' LIMIT 1".format(pid)
  24.         c.execute(sql)
  25.         result = c.fetchone()
  26.         if result != None:
  27.             return result[0]
  28.         else: return False
  29.     except Exception as e:
  30.         print("find_parent", e)
  31.         return False
  32.  
  33. def find_existing_score(pid):
  34.     try:
  35.         sql = "SLECT score FROM parent_reply WHERE parent_id = '{}' LIMIT 1".format(pid)
  36.         c.execute(sql)
  37.         result = c.fetchone()
  38.         if result != None:
  39.             return result[0]
  40.         else: return False
  41.     except Exception as e:
  42.         print("find_parent", e)
  43.         return False
  44.  
  45. def acceptable(data):
  46.     if len(data.split(' ')) > 50 or len(data) < 1:
  47.         return False
  48.     elif len(data) > 1000:
  49.         return False
  50.     elif data == '[deleted]' or data == '[removed]':
  51.         return False
  52.     else:
  53.         return True
  54.  
  55. def sql_insert_replace_comment(commentid,parentid,parent,comment,subreddit,time,score):
  56.     try:
  57.         sql = """UPDATE parent_reply SET parent_id = ?, comment_id = ?, parent = ?, comment = ?, subreddit = ?, unix = ?, score = ? WHERE parent_id =?;""".format(parentid, commentid, parent, comment, subreddit, int(time), score, parentid)
  58.         transaction_bldr(sql)
  59.     except Exception as e:
  60.         print('s-UPDATE insertion',str(e))
  61. def sql_insert_has_parent(commentid,parentid,parent,comment,subreddit,time,score):
  62.     try:
  63.         sql = """INSERT INTO parent_reply (parent_id, comment_id, parent, comment, subreddit, unix, score) VALUES ("{}","{}","{}","{}","{}",{},{});""".format(parentid, commentid, parent, comment, subreddit, int(time), score)
  64.         transaction_bldr(sql)
  65.     except Exception as e:
  66.         print('s-PARENT insertion',str(e))
  67.  
  68.  
  69. def sql_insert_no_parent(commentid,parentid,comment,subreddit,time,score):
  70.     try:
  71.         sql = """INSERT INTO parent_reply (parent_id, comment_id, comment, subreddit, unix, score) VALUES ("{}","{}","{}","{}",{},{});""".format(parentid, commentid, comment, subreddit, int(time), score)
  72.         transaction_bldr(sql)
  73.     except Exception as e:
  74.         print('s-NO_PARENT insertion',str(e))
  75.  
  76. def transaction_bldr(sql):
  77.     global sql_transaction
  78.     sql_transaction.append(sql)
  79.     if len(sql_transaction) > 1000:
  80.         c.execute('BEGIN TRANSACTION')
  81.         for s in sql_transaction:
  82.             try:
  83.                 c.execute(s)
  84.             except:
  85.                 pass
  86.         connection.commit()
  87.         sql_transaction = []
  88.    
  89.  
  90. if __name__=="__main__":
  91.     create_table()
  92.     row_counter = 0
  93.     paired_rows = 0
  94.  
  95.     with open("F:/Nebula0.0.5/chatdata/reddit_data/{}/RC_{}".format(timeframe.split('-')[0]), timeframe, buffering=1000) as f:
  96.               for row in f:
  97.                   print(row)
  98.                   row_counter += 1
  99.                   row = json.loads(row)
  100.                   parent_id = row['parent_id']
  101.                 #this function will clean up data
  102.                   body = format_data(row['body'])
  103.                   created_utc = row['created_utc']
  104.                   score = row['score']
  105.                   subreddit = row['subreddit']
  106.                   parent_data = find_parent(parent_id)
  107.  
  108.  
  109.                   if score >= 5:
  110.                       if acceptable(body):
  111.                           existing_comment_score = find_existing_score(parent_id)
  112.                           if existing_comment_score:
  113.                               if score > existing_comment_score:
  114.                                   sql_insert_replace_comment(comment_id, parent_id, parent_data, body, subreddit, created_utc, score)
  115.                           else:
  116.                               if parent_data:
  117.                                   sql_inser_has_parent(comment_id, parent_id, parent_data, body, subreddit, created_utc, score)
  118.                                   paired_rows+=1
  119.                               else:
  120.                                   sql_insert_no_parent(comment_id, parent_id, body, subreddit, created_utc, score)
  121.                   if row_counter % 100000 == 0:
  122.                       print('Total Rows Read: {}, Paired Rows: {}, Time: {}'.format(row_counter, paired_rows, str(datetime.now()))
  123.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement