Advertisement
Guest User

Untitled

a guest
May 2nd, 2021
67
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 3.25 KB | None | 0 0
  1. import sqlite3
  2. import json
  3. from datetime import datetime
  4.  
  5. timeframe = '2015-05'
  6. sql_transaction = []
  7. #creates data base and names it the value in timeframe variable
  8. connection = sqlite3.connect('{}.db'.format(timeframe))
  9. c = connection.cursor()
  10.    
  11. def create_table():
  12.     #creates a table if one does not exist and names it parent reply
  13.     c.execute("""CREATE TABLE IF NOT EXISTS parent_reply
  14. (parent_id TEXT PRIMARY KEY, comment_id TEXT UNIQUE, parent TEXT, comment TEXT,
  15. subreddit TEXT, unix INT, score INT)""")
  16.  
  17. def format_data(data):
  18.     data=data.replace("\n"," newlinechar ").replace("\r"," newlinechar ").replace('"',"'")
  19.     return data
  20.  
  21. def find_parent(pid):
  22.     try:
  23.         sql = "SLECT comment FROM parent_reply WHERE comment_id = '{}' LIMIT 1".format(pid)
  24.         c.execute(sql)
  25.         result = c.fetchone()
  26.         if result != None:
  27.             return result[0]
  28.         else: return False
  29.     except Exception as e:
  30.         print("find_parent", e)
  31.         return False
  32.  
  33. def find_existing_score(pid):
  34.     try:
  35.         sql = "SLECT score FROM parent_reply WHERE parent_id = '{}' LIMIT 1".format(pid)
  36.         c.execute(sql)
  37.         result = c.fetchone()
  38.         if result != None:
  39.             return result[0]
  40.         else: return False
  41.     except Exception as e:
  42.         print("find_parent", e)
  43.         return False
  44.  
  45. def acceptable(data):
  46.     if len(data.split(' ')) > 50 or len(data) < 1:
  47.         return False
  48.     elif len(data) > 1000:
  49.         return False
  50.     elif data == '[deleted]' or data == '[removed]':
  51.         return False
  52.     else:
  53.         return True
  54.  
  55. def sql_insert_replace_comment(commentid,parentid,parent,comment,subreddit,time,score):
  56.     try:
  57.  
  58.     except Exception as e:
  59.         print('replace_comment',e)
  60.  
  61. if __name__=="__main__":
  62.     create_table()
  63.     row_counter = 0
  64.     paired_rows = 0
  65.  
  66.     with open("F:/Nebula0.0.5/chatdata/reddit_data/{}/RC_{}".format(timeframe.split('-')[0]), timeframe, buffering=1000) as f:
  67.               for row in f:
  68.                   print(row)
  69.                   row_counter += 1
  70.                   row = json.loads(row)
  71.                   parent_id = row['parent_id']
  72.                 #this function will clean up data
  73.                   body = format_data(row['body'])
  74.                   created_utc = row['created_utc']
  75.                   score = row['score']
  76.                   subreddit = row['subreddit']
  77.                   parent_data = find_parent(parent_id)
  78.  
  79.  
  80.                   if score >= 5:
  81.                       if acceptable(body):
  82.                           existing_comment_score = find_existing_score(parent_id)
  83.                           if existing_comment_score:
  84.                               if score > existing_comment_score:
  85.                                   sql_insert_replace_comment(comment_id, parent_id, parent_data, body, subreddit, created_utc, score)
  86.                           else:
  87.                               if parent_data:
  88.                                   sql_inser_has_parent(comment_id, parent_id, parent_data, body, subreddit, created_utc, score)
  89.                               else:
  90.                                   sql_insert_no_parent(comment_id, parent_id, body, subreddit, created_utc, score)
  91.                      
  92.              
  93.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement