Advertisement
Guest User

Untitled

a guest
Mar 26th, 2019
87
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.80 KB | None | 0 0
  1. import json
  2. import sqlite3
  3. from datetime import datetime
  4.  
  5. zeit = '2014-01'
  6. sql_trans = []
  7.  
  8. connection = sqlite3.connect('{}.db'.format(zeit))
  9. cursor = connection.cursor()
  10.  
  11. def create_table():
  12. cursor.execute("CREATE TABLE IF NOT EXISTS parent_reply(parent_id TEXT PRIMARY KEY, comment_id TEXT UNIQUE, parent TEXT, comment TEXT, subreddit TEXT, unix INT, score INT)")
  13.  
  14.  
  15. def body_format(change):
  16. change = change.replace('\n', ' newline ').replace('\r',' newline ').replace('"',"'")
  17. return change
  18.  
  19.  
  20. def search_parent(pid):
  21. try:
  22. abfrage = "SELECT comment FROM parent_reply WHERE comment_id = '{}' LIMIT 1".format(pid)
  23. cursor.execute(abfrage)
  24. ergebnis = cursor.fetchone()
  25. if ergebnis != None:
  26. return ergebnis[0]
  27. else:
  28. return False
  29. except Exception as ex:
  30. return False
  31.  
  32.  
  33. def search_score(pid):
  34. try:
  35. abfrage = "SELECT score FROM parent_reply WHERE parent_id = '{}' LIMIT 1".format(pid)
  36. cursor.execute(abfrage)
  37. ergebnis = cursor.fetchone()
  38. if ergebnis != None:
  39. return ergebnis[0]
  40. else:
  41. return False
  42. except Exception as ex:
  43. return False
  44.  
  45.  
  46. def ignore(ign):
  47. if ign == '[deleted]':
  48. return False
  49. elif ign == '[removed]':
  50. return False
  51. elif len(ign.split(' ')) > 40 or len(ign) < 1:
  52. return False
  53. elif len(ign) > 700:
  54. return False
  55. else:
  56. return True
  57.  
  58.  
  59. def comment_ersetzen(commentid, parentid, parent, comment, subreddit, time, score):
  60. try:
  61. abfrage = """UPDATE parent_reply SET parent_id = ?, comment_id = ?, parent = ?, comment = ?, subreddit = ?, unix = ?, score = ? WHERE parent_id = ?;""".format(parentid, commentid, parent, comment, subreddit, int(time), score, parentid)
  62. trans(abfrage)
  63. except Exception as ex:
  64. print('insertion', str(ex))
  65.  
  66.  
  67. def comment_eintrag(commentid, parentid, parent, comment, subreddit, time, score):
  68. try:
  69. abfrage = """INSERT INTO parent_reply (parent_id, comment_id, parent, comment, subreddit, unix, score) VALUES ("{}","{}","{}","{}","{}",{},{});""".format(parentid, commentid, parent, comment, subreddit, int(time), score)
  70. trans(abfrage)
  71. except Exception as ex:
  72. print('insertion', str(ex))
  73.  
  74.  
  75. def comment_kein_eintrag(commentid, parentid, comment, subreddit, time, score):
  76. try:
  77. abfrage = """INSERT INTO parent_reply (parent_id, comment_id, comment, subreddit, unix, score) VALUES ("{}","{}","{}","{}",{},{});""".format(parentid, commentid, comment, subreddit, int(time), score)
  78. trans(abfrage)
  79. except Exception as ex:
  80. print('insertion', str(ex))
  81.  
  82.  
  83. def trans(abfrage):
  84. global sql_trans
  85. sql_trans.append(abfrage)
  86. if len(sql_trans) > 1000:
  87. cursor.execute('Starte Transaktion')
  88. for b in sql_trans:
  89. try:
  90. cursor.execute(b)
  91. except:
  92. pass
  93. connection.commit()
  94. sql_trans = []
  95.  
  96.  
  97. if __name__ == '__main__':
  98. create_table()
  99. zeilen = 0
  100. paare = 0
  101.  
  102. with open("E:/RC_2014-01".format(zeit.split('-')[0], zeit), buffering=1000) as f:
  103. for zeile in f:
  104. zeile = json.loads(zeile)
  105. zeilen = zeilen + 1
  106. parent_id = zeile['parent_id']
  107. created_utc = zeile['created_utc']
  108. score = zeile['score']
  109. comment_id = zeile['name']
  110. subreddit = zeile['subreddit']
  111. body = body_format(zeile['body'])
  112. parent_info = search_parent(parent_id)
  113.  
  114. if score >= 4:
  115. vorhandener_score = search_score(parent_id)
  116. if vorhandener_score:
  117. if score > vorhandener_score:
  118. if ignore(body):
  119. comment_ersetzen(comment_id, parent_id, parent_info, body, subreddit, created_utc, score)
  120.  
  121. else:
  122. if ignore(body):
  123. if parent_info:
  124. comment_eintrag(comment_id, parent_id, parent_info, body, subreddit, created_utc, score)
  125. paare = paare + 1
  126. else:
  127. comment_kein_eintrag(comment_id, parent_id, body, subreddit, created_utc, score)
  128.  
  129. if zeilen % 10000 == 0:
  130. print('Zeilen gelesen: {}, Anzahl Paare: {}, Zeitpunkt: {}'.format(zeilen, paare, str(datetime.now())))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement