Guest User

efinsert

a guest
Aug 22nd, 2015
122
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 4.11 KB | None | 0 0
  1. #!/usr/bin/env python
  2.  
  3. import sys
  4. import os
  5. import json
  6. from datetime import datetime
  7. import re
  8. import psycopg2
  9.  
  10. A_JSON_OUT_DIR = "dump-article"
  11. M_JSON_OUT_DIR = "dump-match"
  12. DATETIME_FORMAT = "%d.%m.%Y %H:%M"
  13.  
  14. PAGE_ID = 1
  15.  
  16. CONN = None
  17.  
  18. DB_NAME = "efschema"
  19. DB_USER = "efuser"
  20. DB_PASS = "efpw"
  21.  
  22. def process_article(directory, content_type, number):
  23.     """Process single page"""
  24.     global PAGE_ID
  25.     filepath = "%s/%d" % (directory, number)
  26.     file = open(filepath, 'r')
  27.  
  28.     data = json.load(file)
  29.  
  30.     # Empty struct from emty article
  31.     if not data:
  32.         return
  33.  
  34.     page_date = datetime.strptime(data['created'], DATETIME_FORMAT)
  35.  
  36.     try:
  37.         cur = CONN.cursor()
  38.         cur.execute("INSERT INTO efPage(id, content_type, page_number, created, views, name) VALUES (%s, %s, %s, %s, %s, %s)", (PAGE_ID, content_type, data['id'], page_date, data['views'], data['name'],))
  39.         for post in data['forum']:
  40.             post_date = datetime.strptime(post['created'], DATETIME_FORMAT)
  41.             cur.execute("INSERT INTO efPost(anchor_id, page_id, parent_anchor_id, created, author, content) VALUES (%s, %s, %s, %s, %s, %s)", (post['id'], PAGE_ID, post['parent'], post_date, post['author'], post['text'],))
  42.     except psycopg2.DatabaseError, e:
  43.         print "Cannot insert into DB.: %s" % e
  44.         sys.exit(3)
  45.  
  46.     PAGE_ID += 1
  47.     file.close()
  48.  
  49. def process_articles(directory, content_type, top, bottom):
  50.     """Process articles interval"""
  51.     # Loop article ids
  52.     for i in xrange(top, bottom, -1):
  53.         process_article(directory, content_type, i)
  54.         if (i % 10000) == 0:
  55.             CONN.commit()
  56.             print "At %d commited." % i
  57.     CONN.commit()
  58.  
  59. def create_indexes():
  60.     try:
  61.         cur = CONN.cursor()
  62.         cur.execute("CREATE INDEX ON efPost(author)")
  63.         cur.execute("CREATE INDEX ON efPost(created)")
  64.         cur.execute("CREATE INDEX ON efPost USING gin(content gin_trgm_ops)")
  65.         cur.execute("CREATE INDEX ON efUser(posts)")
  66.         cur.execute("CREATE INDEX ON efUser(name)")
  67.         CONN.commit()
  68.     except psycopg2.DatabaseError, e:
  69.         print "Cannot create indexes.: %s" % e
  70.         sys.exit(3)
  71.  
  72. def init_db():
  73.     try:
  74.         global CONN
  75.         CONN = psycopg2.connect(database=DB_NAME, user=DB_USER, password=DB_PASS)
  76.         cur = CONN.cursor()
  77.         cur.execute("DROP TABLE IF EXISTS efPage CASCADE")
  78.         cur.execute("DROP TABLE IF EXISTS efPost CASCADE")
  79.         cur.execute("DROP TABLE IF EXISTS efUser CASCADE")
  80.         cur.execute("CREATE TABLE efPage(id integer PRIMARY KEY, page_number integer NOT NULL, content_type char(1) NOT NULL, created timestamp, views integer, name text)")
  81.         cur.execute("CREATE TABLE efPost(id serial PRIMARY KEY, anchor_id integer, created timestamp, page_id integer REFERENCES efPage (id) NOT NULL, parent_anchor_id integer, author text, content text)")
  82.         cur.execute("CREATE TABLE efUser(name text, posts integer)")
  83.         CONN.commit()
  84.     except psycopg2.DatabaseError, e:
  85.         print "Cannot init DB.: %s" % e
  86.         sys.exit(2)
  87.  
  88. def process_support_tables():
  89.     try:
  90.         cur = CONN.cursor()
  91.         cur.execute("INSERT INTO efUser SELECT author, count(*) FROM efPost GROUP BY author ORDER BY 2 DESC")
  92.         # Posts of all users
  93.         cur.execute("DELETE FROM efUser WHERE name = ''")
  94.         cur.execute("INSERT INTO efUser SELECT '' as name, count(*) FROM efPost")
  95.         CONN.commit()
  96.     except psycopg2.DatabaseError, e:
  97.         print "Cannot populate support tables.: %s" % e
  98.         sys.exit(4)
  99.  
  100. if __name__ == "__main__":
  101.     if len(sys.argv) == 3:
  102.         articles = int(sys.argv[1])
  103.         matches = int(sys.argv[2])
  104.     else:
  105.         sys.exit(0)
  106.  
  107.     if not os.path.exists(A_JSON_OUT_DIR) or not os.path.exists(M_JSON_OUT_DIR):
  108.         print "Cannot find dir with JSONs in local directory."
  109.         sys.exit(1)
  110.  
  111.     init_db()
  112.  
  113.     if CONN:
  114.         process_articles(A_JSON_OUT_DIR, "A", articles, 0)
  115.         process_articles(M_JSON_OUT_DIR, "M", matches, 0)
  116.         process_support_tables()
  117.         create_indexes()
  118.         CONN.close()
Advertisement
Add Comment
Please, Sign In to add comment