Advertisement
Guest User

Untitled

a guest
Apr 25th, 2017
178
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.45 KB | None | 0 0
  1. import psycopg2
  2. import ast
  3. import time
  4. from psycopg2.extras import execute_batch
  5. #connect to sql server
  6. try:
  7. conn = psycopg2.connect("dbname = 'reddit' user = 'postgres' host = 'localhost' password = 'butt'")
  8. except:
  9. print("L2type baddie (Can't connect to the database)")
  10.  
  11. #list of attributes we want
  12. ourKeys = ["body", "score", "id", "controversiality", "author_flair_css_class", "author_flair_text", "parent_id", "author", "link_id", "gilded", "created_utc", "subreddit"]
  13.  
  14. cur = conn.cursor()
  15.  
  16. #sql insertion statements (format strings to prevent injections)
  17. sqlRedditor = '''INSERT INTO redditor(author) VALUES (%(author)s) ON CONFLICT(author) DO NOTHING'''
  18. sqlComment = '''INSERT INTO comment(id, body, created_utc, gilded, score, controversiality) VALUES (%(id)s, %(body)s, %(created_utc)s, %(gilded)s, %(score)s, %(controversiality)s) ON CONFLICT(id) DO NOTHING'''
  19. sqlPostedIn = ''' INSERT INTO posted_in(id, subreddit) VALUES(%(id)s, %(subreddit)s)'''
  20. sqlSubreddit = '''INSERT INTO subreddit(subreddit) VALUES( %(subreddit)s) ON CONFLICT(subreddit) DO NOTHING'''
  21. sqlPostedby = '''INSERT INTO posted_by(author, id) VALUES(%(author)s, %(id)s)'''
  22. startTime = time.time()
  23. dicList = []
  24. ### CHANGE THE RANGE DEPENDING ON HOW MANY FILES YOU WANT TO LOAD. SUGGEST range(0,2)
  25. for x in range(0,2):
  26. #### CHANGE THIS TO THE PROPER DIRECTORY
  27. with open('F:/Downloads (2)/json/jan17('+str(x).zfill(12)+')', 'r', encoding = "utf-8") as f:
  28. for line in f:
  29. deletionList = []
  30. if line[-1:] == '\n':
  31. line = line[:-1]
  32.  
  33. #prevent errors from line breaks; transform each line from a string to dictionary
  34. line = line.replace("\n", "\\n")
  35. line = ast.literal_eval(line)
  36.  
  37. #remove attributes that are not in our desired list
  38. for key in line.keys():
  39. if key not in ourKeys:
  40. deletionList.append(key)
  41. for key in deletionList:
  42. del line[key]
  43.  
  44. #add missing attributes from our desired list; instantiate with null value
  45. for key in ourKeys:
  46. if key not in line.keys():
  47. line[key]='null'
  48. dicList.append(line)
  49.  
  50. print("Done creating dictionaries. Elapsed: "+format(time.time()-startTime))
  51. #psycop requires tuples
  52. dicList = tuple(dicList)
  53.  
  54. #executemany: 577s (136+100+134+95+112)
  55. #execute sql statements using list of dictionaries
  56. try:
  57. startTime = time.time()
  58. execute_batch(cur, sqlComment, dicList)
  59. #cur.executemany(sqlComment,dicList)
  60. print("comment db updated. Elapsed: "+format(time.time()-startTime))
  61. except Exception as e:
  62. print("failed to add to db 'comment'", e)
  63. #"""
  64. try:
  65. startTime = time.time()
  66. execute_batch(cur, sqlRedditor, dicList)
  67. #cur.executemany(sqlRedditor,dicList)
  68. print("redditor db updated. Elapsed: "+format(time.time()-startTime))
  69. except:
  70. print("failed to add to db 'redditor'")
  71. #"""
  72. try:
  73. startTime = time.time()
  74. execute_batch(cur, sqlSubreddit, dicList)
  75. #cur.executemany(sqlSubreddit,dicList)
  76. print("subreddit db updated. Elapsed: "+format(time.time()-startTime))
  77. except:
  78. print("failed to add to db 'subreddit'")
  79. try:
  80. startTime = time.time()
  81. execute_batch(cur, sqlPostedIn, dicList)
  82. #cur.executemany(sqlPostedIn,dicList)
  83. print("posted_in db updated. Elapsed: "+format(time.time()-startTime))
  84. except Exception as e:
  85. print("failed to add to db 'posted_in'", e)
  86.  
  87. try:
  88. startTime = time.time()
  89. execute_batch(cur, sqlPostedby, dicList)
  90. #cur.executemany(sqlPostedby, dicList)
  91. print("posted_by db updated. Elapsed: "+format(time.time()-startTime))
  92. except:
  93. print("failed to add to db 'posted_by'")
  94.  
  95.  
  96. conn.commit()
  97. cur.close()
  98.  
  99. if conn is not None:
  100. conn.close()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement