Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import json
- import sqlite3
- from datetime import datetime
- zeit = '2014-01'
- sql_trans = []
- connection = sqlite3.connect('{}.db'.format(zeit))
- cursor = connection.cursor()
- def create_table():
- cursor.execute("CREATE TABLE IF NOT EXISTS parent_reply(parent_id TEXT PRIMARY KEY, comment_id TEXT UNIQUE, parent TEXT, comment TEXT, subreddit TEXT, unix INT, score INT)")
- def body_format(change):
- change = change.replace('\n', ' newline ').replace('\r',' newline ').replace('"',"'")
- return change
- def search_parent(pid):
- try:
- abfrage = "SELECT comment FROM parent_reply WHERE comment_id = '{}' LIMIT 1".format(pid)
- cursor.execute(abfrage)
- ergebnis = cursor.fetchone()
- if ergebnis != None:
- return ergebnis[0]
- else:
- return False
- except Exception as ex:
- return False
- def search_score(pid):
- try:
- abfrage = "SELECT score FROM parent_reply WHERE parent_id = '{}' LIMIT 1".format(pid)
- cursor.execute(abfrage)
- ergebnis = cursor.fetchone()
- if ergebnis != None:
- return ergebnis[0]
- else:
- return False
- except Exception as ex:
- return False
- def ignore(ign):
- if ign == '[deleted]':
- return False
- elif ign == '[removed]':
- return False
- elif len(ign.split(' ')) > 40 or len(ign) < 1:
- return False
- elif len(ign) > 700:
- return False
- else:
- return True
- def comment_ersetzen(commentid, parentid, parent, comment, subreddit, time, score):
- try:
- abfrage = """UPDATE parent_reply SET parent_id = ?, comment_id = ?, parent = ?, comment = ?, subreddit = ?, unix = ?, score = ? WHERE parent_id = ?;""".format(parentid, commentid, parent, comment, subreddit, int(time), score, parentid)
- trans(abfrage)
- except Exception as ex:
- print('insertion', str(ex))
- def comment_eintrag(commentid, parentid, parent, comment, subreddit, time, score):
- try:
- abfrage = """INSERT INTO parent_reply (parent_id, comment_id, parent, comment, subreddit, unix, score) VALUES ("{}","{}","{}","{}","{}",{},{});""".format(parentid, commentid, parent, comment, subreddit, int(time), score)
- trans(abfrage)
- except Exception as ex:
- print('insertion', str(ex))
- def comment_kein_eintrag(commentid, parentid, comment, subreddit, time, score):
- try:
- abfrage = """INSERT INTO parent_reply (parent_id, comment_id, comment, subreddit, unix, score) VALUES ("{}","{}","{}","{}",{},{});""".format(parentid, commentid, comment, subreddit, int(time), score)
- trans(abfrage)
- except Exception as ex:
- print('insertion', str(ex))
- def trans(abfrage):
- global sql_trans
- sql_trans.append(abfrage)
- if len(sql_trans) > 1000:
- cursor.execute('Starte Transaktion')
- for b in sql_trans:
- try:
- cursor.execute(b)
- except:
- pass
- connection.commit()
- sql_trans = []
- if __name__ == '__main__':
- create_table()
- zeilen = 0
- paare = 0
- with open("E:/RC_2014-01".format(zeit.split('-')[0], zeit), buffering=1000) as f:
- for zeile in f:
- zeile = json.loads(zeile)
- zeilen = zeilen + 1
- parent_id = zeile['parent_id']
- created_utc = zeile['created_utc']
- score = zeile['score']
- comment_id = zeile['name']
- subreddit = zeile['subreddit']
- body = body_format(zeile['body'])
- parent_info = search_parent(parent_id)
- if score >= 4:
- vorhandener_score = search_score(parent_id)
- if vorhandener_score:
- if score > vorhandener_score:
- if ignore(body):
- comment_ersetzen(comment_id, parent_id, parent_info, body, subreddit, created_utc, score)
- else:
- if ignore(body):
- if parent_info:
- comment_eintrag(comment_id, parent_id, parent_info, body, subreddit, created_utc, score)
- paare = paare + 1
- else:
- comment_kein_eintrag(comment_id, parent_id, body, subreddit, created_utc, score)
- if zeilen % 10000 == 0:
- print('Zeilen gelesen: {}, Anzahl Paare: {}, Zeitpunkt: {}'.format(zeilen, paare, str(datetime.now())))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement