Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env python
- import sys
- import os
- import json
- from datetime import datetime
- import re
- import psycopg2
- A_JSON_OUT_DIR = "dump-article"
- M_JSON_OUT_DIR = "dump-match"
- DATETIME_FORMAT = "%d.%m.%Y %H:%M"
- PAGE_ID = 1
- CONN = None
- DB_NAME = "efschema"
- DB_USER = "efuser"
- DB_PASS = "efpw"
- def process_article(directory, content_type, number):
- """Process single page"""
- global PAGE_ID
- filepath = "%s/%d" % (directory, number)
- file = open(filepath, 'r')
- data = json.load(file)
- # Empty struct from emty article
- if not data:
- return
- page_date = datetime.strptime(data['created'], DATETIME_FORMAT)
- try:
- cur = CONN.cursor()
- cur.execute("INSERT INTO efPage(id, content_type, page_number, created, views, name) VALUES (%s, %s, %s, %s, %s, %s)", (PAGE_ID, content_type, data['id'], page_date, data['views'], data['name'],))
- for post in data['forum']:
- post_date = datetime.strptime(post['created'], DATETIME_FORMAT)
- cur.execute("INSERT INTO efPost(anchor_id, page_id, parent_anchor_id, created, author, content) VALUES (%s, %s, %s, %s, %s, %s)", (post['id'], PAGE_ID, post['parent'], post_date, post['author'], post['text'],))
- except psycopg2.DatabaseError, e:
- print "Cannot insert into DB.: %s" % e
- sys.exit(3)
- PAGE_ID += 1
- file.close()
- def process_articles(directory, content_type, top, bottom):
- """Process articles interval"""
- # Loop article ids
- for i in xrange(top, bottom, -1):
- process_article(directory, content_type, i)
- if (i % 10000) == 0:
- CONN.commit()
- print "At %d commited." % i
- CONN.commit()
- def create_indexes():
- try:
- cur = CONN.cursor()
- cur.execute("CREATE INDEX ON efPost(author)")
- cur.execute("CREATE INDEX ON efPost(created)")
- cur.execute("CREATE INDEX ON efPost USING gin(content gin_trgm_ops)")
- cur.execute("CREATE INDEX ON efUser(posts)")
- cur.execute("CREATE INDEX ON efUser(name)")
- CONN.commit()
- except psycopg2.DatabaseError, e:
- print "Cannot create indexes.: %s" % e
- sys.exit(3)
- def init_db():
- try:
- global CONN
- CONN = psycopg2.connect(database=DB_NAME, user=DB_USER, password=DB_PASS)
- cur = CONN.cursor()
- cur.execute("DROP TABLE IF EXISTS efPage CASCADE")
- cur.execute("DROP TABLE IF EXISTS efPost CASCADE")
- cur.execute("DROP TABLE IF EXISTS efUser CASCADE")
- cur.execute("CREATE TABLE efPage(id integer PRIMARY KEY, page_number integer NOT NULL, content_type char(1) NOT NULL, created timestamp, views integer, name text)")
- cur.execute("CREATE TABLE efPost(id serial PRIMARY KEY, anchor_id integer, created timestamp, page_id integer REFERENCES efPage (id) NOT NULL, parent_anchor_id integer, author text, content text)")
- cur.execute("CREATE TABLE efUser(name text, posts integer)")
- CONN.commit()
- except psycopg2.DatabaseError, e:
- print "Cannot init DB.: %s" % e
- sys.exit(2)
- def process_support_tables():
- try:
- cur = CONN.cursor()
- cur.execute("INSERT INTO efUser SELECT author, count(*) FROM efPost GROUP BY author ORDER BY 2 DESC")
- # Posts of all users
- cur.execute("DELETE FROM efUser WHERE name = ''")
- cur.execute("INSERT INTO efUser SELECT '' as name, count(*) FROM efPost")
- CONN.commit()
- except psycopg2.DatabaseError, e:
- print "Cannot populate support tables.: %s" % e
- sys.exit(4)
- if __name__ == "__main__":
- if len(sys.argv) == 3:
- articles = int(sys.argv[1])
- matches = int(sys.argv[2])
- else:
- sys.exit(0)
- if not os.path.exists(A_JSON_OUT_DIR) or not os.path.exists(M_JSON_OUT_DIR):
- print "Cannot find dir with JSONs in local directory."
- sys.exit(1)
- init_db()
- if CONN:
- process_articles(A_JSON_OUT_DIR, "A", articles, 0)
- process_articles(M_JSON_OUT_DIR, "M", matches, 0)
- process_support_tables()
- create_indexes()
- CONN.close()
Advertisement
Add Comment
Please, Sign In to add comment