Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import sqlite3
import json
from datetime import datetime

# Identifier of the data slice being processed; it is used both as the
# database file name and to build the path of the input dump file.
timeframe = '2019-12'

# NOTE(review): not used anywhere in the visible code — presumably a buffer
# of pending SQL statements; confirm against the rest of the script.
sql_transaction = []

# Open the SQLite database named after `timeframe` (sqlite3.connect creates
# the file if it does not exist) and keep one shared module-level cursor.
connection = sqlite3.connect('{}.db'.format(timeframe))
c = connection.cursor()
def create_table() -> None:
    """Create the ``parent_reply`` table if it does not already exist.

    Runs on the module-level cursor ``c``. The table has the columns
    ``parent_id`` (primary key), ``comment_id`` (unique), ``parent``,
    ``comment``, ``subreddit``, ``unix`` and ``score``.
    """
    c.execute(
        """CREATE TABLE IF NOT EXISTS parent_reply
        (parent_id TEXT PRIMARY KEY, comment_id TEXT UNIQUE, parent TEXT, comment TEXT,
        subreddit TEXT, unix INT, score INT)"""
    )
def format_data(data: str) -> str:
    """Return *data* flattened onto a single line for storage.

    Each line-feed and each carriage-return character is replaced by the
    token ``" newlinechar "`` and every double quote by a single quote.

    Note that the two line-break characters are replaced independently, so
    a ``"\\r\\n"`` pair produces two consecutive tokens.
    """
    data = (
        data.replace("\n", " newlinechar ")
        .replace("\r", " newlinechar ")
        .replace('"', "'")
    )
    return data
def find_parent(pid):
    """Look up the stored comment text whose ``comment_id`` equals *pid*.

    Args:
        pid: The comment id to search for in the ``parent_reply`` table.

    Returns:
        The ``comment`` column of the first matching row, or ``False`` when
        no row matches or the database reports an error.
    """
    try:
        # Bind pid as a parameter rather than formatting it into the SQL
        # text, so quotes inside the id cannot break or alter the query.
        c.execute(
            "SELECT comment FROM parent_reply WHERE comment_id = ? LIMIT 1",
            (pid,),
        )
        result = c.fetchone()
    except sqlite3.Error as e:
        # Best-effort lookup: report the database error and treat it as
        # "no parent found" instead of aborting the caller.
        print("find_parent", e)
        return False

    if result is not None:
        return result[0]
    return False
if __name__ == "__main__":
    create_table()
    row_counter = 0
    paired_rows = 0

    # The input path is derived from `timeframe`: the part before the first
    # '-' selects the directory and the full value names the file.
    # `buffering` is an argument of open(), not of str.format().
    with open(
        "F:/Nebula0.0.5/chatdata/reddit_data/{}/RC_{}".format(
            timeframe.split('-')[0], timeframe
        ),
        buffering=1000,
    ) as f:
        # Each line of the file is parsed as one JSON object.
        for row in f:
            print(row)
            row_counter += 1
            row = json.loads(row)
            parent_id = row['parent_id']
            # Flatten the comment body onto one line before it is used.
            body = format_data(row['body'])
            created_utc = row['created_utc']
            score = row['score']
            subreddit = row['subreddit']
            parent_data = find_parent(parent_id)
            # NOTE(review): the fields extracted above, `parent_data` and
            # `paired_rows` are not consumed in the visible code — the
            # script appears to be cut off here; confirm against the
            # complete source.
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement