Advertisement
Guest User

Untitled

a guest
Apr 21st, 2018
73
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 5.49 KB | None | 0 0
  1. import sqlite3
  2. import json
  3. from datetime import datetime
  4. import time
  5.  
  6. timeframe = '2015-01'
  7. sql_transaction = []
  8. start_row = 0
  9. cleanup = 1000000
  10.  
  11. connection = sqlite3.connect('{}.db'.format(timeframe))
  12. c = connection.cursor()
  13.  
  14. def create_table():
  15. c.execute("CREATE TABLE IF NOT EXISTS parent_reply(parent_id TEXT PRIMARY KEY, comment_id TEXT UNIQUE, parent TEXT, comment TEXT, subreddit TEXT, unix INT, score INT)")
  16.  
  17. def format_data(data):
  18. data = data.replace('\n',' newlinechar ').replace('\r',' newlinechar ').replace('"',"'")
  19. return data
  20.  
  21. def transaction_bldr(sql):
  22. global sql_transaction
  23. sql_transaction.append(sql)
  24. if len(sql_transaction) > 1000:
  25. c.execute('BEGIN TRANSACTION')
  26. for s in sql_transaction:
  27. try:
  28. c.execute(s)
  29. except:
  30. pass
  31. connection.commit()
  32. sql_transaction = []
  33.  
  34. def sql_insert_replace_comment(commentid,parentid,parent,comment,subreddit,time,score):
  35. try:
  36. sql = """UPDATE parent_reply SET parent_id = ?, comment_id = ?, parent = ?, comment = ?, subreddit = ?, unix = ?, score = ? WHERE parent_id =?;""".format(parentid, commentid, parent, comment, subreddit, int(time), score, parentid)
  37. transaction_bldr(sql)
  38. except Exception as e:
  39. print('s0 insertion',str(e))
  40.  
  41. def sql_insert_has_parent(commentid,parentid,parent,comment,subreddit,time,score):
  42. try:
  43. sql = """INSERT INTO parent_reply (parent_id, comment_id, parent, comment, subreddit, unix, score) VALUES ("{}","{}","{}","{}","{}",{},{});""".format(parentid, commentid, parent, comment, subreddit, int(time), score)
  44. transaction_bldr(sql)
  45. except Exception as e:
  46. print('s0 insertion',str(e))
  47.  
  48. def sql_insert_no_parent(commentid,parentid,comment,subreddit,time,score):
  49. try:
  50. sql = """INSERT INTO parent_reply (parent_id, comment_id, comment, subreddit, unix, score) VALUES ("{}","{}","{}","{}",{},{});""".format(parentid, commentid, comment, subreddit, int(time), score)
  51. transaction_bldr(sql)
  52. except Exception as e:
  53. print('s0 insertion',str(e))
  54.  
  55. def acceptable(data):
  56. if len(data.split(' ')) > 1000 or len(data) < 1:
  57. return False
  58. elif len(data) > 32000:
  59. return False
  60. elif data == '[deleted]':
  61. return False
  62. elif data == '[removed]':
  63. return False
  64. else:
  65. return True
  66.  
  67. def find_parent(pid):
  68. try:
  69. sql = "SELECT comment FROM parent_reply WHERE comment_id = '{}' LIMIT 1".format(pid)
  70. c.execute(sql)
  71. result = c.fetchone()
  72. if result != None:
  73. return result[0]
  74. else: return False
  75. except Exception as e:
  76. #print(str(e))
  77. return False
  78.  
  79. def find_existing_score(pid):
  80. try:
  81. sql = "SELECT score FROM parent_reply WHERE parent_id = '{}' LIMIT 1".format(pid)
  82. c.execute(sql)
  83. result = c.fetchone()
  84. if result != None:
  85. return result[0]
  86. else: return False
  87. except Exception as e:
  88. #print(str(e))
  89. return False
  90.  
  91. if __name__ == '__main__':
  92. create_table()
  93. row_counter = 0
  94. paired_rows = 0
  95.  
  96. #with open('J:/chatdata/reddit_data/{}/RC_{}'.format(timeframe.split('-')[0],timeframe), buffering=1000) as f:
  97. with open('./RC_{}'.format(timeframe), buffering=1000) as f:
  98. for row in f:
  99. print(row)
  100. #time.sleep(555)
  101. row_counter += 1
  102.  
  103. if row_counter > start_row:
  104. try:
  105. row = json.loads(row)
  106. parent_id = row['parent_id'].split('_')[1]
  107. body = format_data(row['body'])
  108. created_utc = row['created_utc']
  109. score = row['score']
  110.  
  111. comment_id = row['id']
  112.  
  113. subreddit = row['subreddit']
  114. parent_data = find_parent(parent_id)
  115.  
  116. existing_comment_score = find_existing_score(parent_id)
  117. if existing_comment_score:
  118. if score > existing_comment_score:
  119. if acceptable(body):
  120. sql_insert_replace_comment(comment_id,parent_id,parent_data,body,subreddit,created_utc,score)
  121.  
  122. else:
  123. if acceptable(body):
  124. if parent_data:
  125. if score >= 2:
  126. sql_insert_has_parent(comment_id,parent_id,parent_data,body,subreddit,created_utc,score)
  127. paired_rows += 1
  128. else:
  129. sql_insert_no_parent(comment_id,parent_id,body,subreddit,created_utc,score)
  130. except Exception as e:
  131. print(str(e))
  132.  
  133. if row_counter % 100000 == 0:
  134. print('Total Rows Read: {}, Paired Rows: {}, Time: {}'.format(row_counter, paired_rows, str(datetime.now())))
  135.  
  136. if row_counter > start_row:
  137. if row_counter % cleanup == 0:
  138. print("Cleanin up!")
  139. sql = "DELETE FROM parent_reply WHERE parent IS NULL"
  140. c.execute(sql)
  141. connection.commit()
  142. c.execute("VACUUM")
  143. connection.commit()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement