Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import sqlite3
- import json
- from datetime import datetime
- import bz2
- timeframe = 'comments_fiveplus'
- sql_transaction = []
- connection = sqlite3.connect('{}.db'.format(timeframe))
- c = connection.cursor()
- def create_table(time):
- c.execute("""CREATE TABLE IF NOT EXISTS '{}'(parent_id TEXT PRIMARY KEY,
- comment_id TEXT UNIQUE,
- parent TEXT,
- comment TEXT,
- subreddit TEXT,
- unix INT,
- score INT)""".format(time))
- def format_data(data):
- data = data.replace("\n", " newlinechar ").replace("\r", " returnchar ").replace('"',"'")
- return data
- def find_existing_score(pid, time):
- try:
- sql = "SELECT score FROM '{}' WHERE parent_id = '{}' LIMIT 1".format(time, pid)
- c.execute(sql)
- result = c.fetchone()
- if result != None:
- return result[0]
- else:
- return False
- except Exception as e:
- return False
- def acceptable(data):
- # if more than fifty words
- if len(data.split(' ')) > 50 or len(data) < 1:
- return False
- # if more than 1000 characters (spam)
- elif len(data) > 1000:
- return False
- # if deleted
- elif data == '[deleted]' or data == '[removed]':
- return False
- else:
- return True
- def find_parent(pid, time):
- try:
- sql = "SELECT comment FROM '{}' WHERE comment_id = '{}' LIMIT 1".format(time, pid)
- c.execute(sql)
- result = c.fetchone()
- if result != None:
- return result[0]
- else:
- return False
- except Exception as e:
- print("find_parent", e)
- return False
- def transaction_bldr(sql):
- global sql_transaction
- sql_transaction.append(sql)
- if len(sql_transaction) > 5000:
- c.execute('BEGIN TRANSACTION')
- for s in sql_transaction:
- try:
- c.execute(s)
- except:
- pass
- connection.commit()
- sql_transaction = []
- def sql_insert_replace_comment(sourcetime, commentid,parentid,parent,comment,subreddit,time,score):
- try:
- sql = """UPDATE '{}' SET parent_id = ?, comment_id = ?, parent = ?, comment = ?, subreddit = ?, unix = ?, score = ? WHERE parent_id =?;""".format(sourcetime, parentid, commentid, parent, comment, subreddit, int(time), score, parentid)
- transaction_bldr(sql)
- except Exception as e:
- print('s-UPDATE insertion',str(e))
- def sql_insert_has_parent(sourcetime, commentid,parentid,parent,comment,subreddit,time,score):
- try:
- sql = """INSERT INTO '{}' (parent_id, comment_id, parent, comment, subreddit, unix, score) VALUES ("{}","{}","{}","{}","{}",{},{});""".format(sourcetime, parentid, commentid, parent, comment, subreddit, int(time), score)
- transaction_bldr(sql)
- except Exception as e:
- print('s-PARENT insertion',str(e))
- def sql_insert_no_parent(sourcetime, commentid,parentid,comment,subreddit,time,score):
- try:
- sql = """INSERT INTO '{}' (parent_id, comment_id, comment, subreddit, unix, score) VALUES ("{}","{}","{}","{}",{},{});""".format(sourcetime, parentid, commentid, comment, subreddit, int(time), score)
- transaction_bldr(sql)
- except Exception as e:
- print('s-NO_PARENT insertion',str(e))
- def openloop(time):
- row_counter = 0
- paired_rows = 0
- sourcefile = "//APOLLO/home/Programming/reddit_data/Uncompressed/RC_{}".format(time)
- try:
- with open(sourcefile, buffering = 5000) as f:
- create_table(time)
- for row in f:
- row_counter += 1
- row = json.loads(row)
- parent_id = row['parent_id']
- body = format_data(row['body'])
- created_utc = row['created_utc']
- score = row['score']
- subreddit = row['subreddit']
- if 'name' in row:
- # old reddit storage
- comment_id = row['name']
- else:
- # new reddit storage. add 't1_' to make it same as before
- comment_id = 't1_' + row['id']
- parent_data = find_parent(parent_id, time)
- if score >= 5:
- if acceptable(body):
- existing_comment_score = find_existing_score(parent_id, time)
- if existing_comment_score:
- if score > existing_comment_score:
- sql_insert_replace_comment(time, comment_id, parent_id, parent_data, body, subreddit, created_utc, score)
- else:
- if parent_data:
- sql_insert_has_parent(time, comment_id, parent_id, parent_data, body, subreddit, created_utc, score)
- paired_rows += 1
- else:
- sql_insert_no_parent(time, comment_id, parent_id, body, subreddit, created_utc, score)
- print("Total rows read: {}, Paired Rows: {}, Time: {}".format(row_counter, paired_rows, str(datetime.now())))
- except Exception as e:
- print(e)
- def main():
- for y in range(14,16):
- for x in range(1,13):
- time = "20{:0>2d}-{:0>2d}".format(16,x)
- print('Beginning work on RC_{}'.format(time))
- openloop(time)
- if __name__ == "__main__":
- main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement