iama_alpaca

SubredditTextPostScraper

Jun 8th, 2017
89
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 5.78 KB | None | 0 0
#!/usr/bin/python3
import datetime
import html
import os
import sys
import time

import praw
  3. start = time.time()
  4.  
  5. #if usedarkmode:
  6. textcolor = "fff"
  7. postbg = "333"
  8. bgcolor = "212121"
  9. shadow = "000"
  10. link = "add8e6"
  11. hoverlink = "white"
  12. visitedlink = "adbce6"
  13. #else:
  14. #   textcolor = "000"
  15. #   postbg = "e6e6e6"
  16. #   bgcolor = "fff"
  17. #   shadow = "636363"
  18. #   link = "386fff"
  19. #   hoverlink = "759bff"
  20. #   visitedlink = "ba75ff"
  21.  
  22. # Setting Variables
  23. client_id = ""
  24. client_secret = ""
  25. subname = sys.argv[1]
  26. reddit = praw.Reddit(client_id=client_id, client_secret=client_secret, user_agent='Text Post Archiver')
  27. postnumber = 0
  28.  
  29. # Making directory with the title of the selected subreddit
  30. os.makedirs(subname, exist_ok=True)
  31.  
  32. # Gets a post's submission date and time
  33. def getdate(submission):
  34.     time = submission.created
  35.     return datetime.datetime.fromtimestamp(time)
  36.  
  37. # Makes a blank html file and writes data to it
  38. postslist = open(subname+' Post Archive.html', "a")
  39. archive = """<html>
  40. <LINK REL=StyleSheet HREF=\""""+subname+'/'"""style.css" TYPE="text/css">
  41. <center><head>Posts from /r/"""+subname+"""</head></center>
  42. <body> <br> <br>\n"""
  43. postslist.write(archive)
  44.  
  45. # Making lists for the posts' data to go into
  46. global scores
  47. scores = []
  48. global postlist
  49. postlist = []
  50.  
  51. # Function for downloading and writing posts and comments
  52. def dothestuff():
  53.  
  54.     # Makes an html file for the current post and sets the file's name to the post's id
  55.     savepost = open(os.path.join(subname, submission.id+'.html'), "a")
  56.  
  57.     # Increases the index of posts downloaded by 1. Used to display "Saving Post #___..." in console
  58.     global postnumber
  59.     postnumber+= 1
  60.     print('Saving Post #'+str(postnumber)+': '+submission.title)
  61.  
  62.     # Setting variable to be added to the submission's html file (The ".replace"s fix some errors that make the browser unable to read certain characters properly)
  63.     if submission.selftext_html is not None:
  64.         post = """<html>
  65. <link rel=StyleSheet href="style.css" type="text/css">
  66. <head></head>
  67. <body> \n<p>"""+submission.title.replace("’","'").replace('”','"').replace("—","--").replace('“','"').replace("‘","'").replace("…","...").replace("–","-")+"""</p> <div class="mainpost"> <p>"""+submission.selftext_html.replace("’","'").replace('”','"').replace("—","--").replace('“','"').replace("‘","'").replace("…","...").replace("–","-")+"""</p> </div>"""
  68.     else:
  69.         post = """<html>
  70. <link rel=StyleSheet href="style.css" type="text/css">
  71. <head></head>
  72. <body> \n<p>"""+submission.title.replace("’","'").replace('”','"').replace("—","--").replace('“','"').replace("‘","'").replace("…","...").replace("–","-")+"""</p> <div class="mainpost"> <p>"""+submission.selftext.replace("’","'").replace('”','"').replace("—","--").replace('“','"').replace("‘","'").replace("…","...").replace("–","-")+"""</p> </div>"""
  73.    
  74.     # Writing current post's title and content to its html file
  75.     savepost.write(post)
  76.  
  77.     # Closes the html tags for the current submission's file
  78.     finish = """ </body> </html>"""
  79.     savepost.write(finish)
  80.     savepost.close()
  81.  
  82.     # Gets the post's author, score, number of comments, date, and title to be added to the main html file, then adds them
  83.     currentpost = """ <div class="postinfo"> """+str(submission.author)+' ['+str(submission.score)+' points] '+'['+str(submission.num_comments)+' comments] '+str(getdate(submission))+' '+""" <br> <a href=\""""+subname+'/'+submission.id+'.html'+'">'+submission.title.replace("’","'").replace('”','"').replace("—","--").replace('“','"').replace("‘","'").replace("…","...").replace("–","-")+'</a> </div> <br>\n\n'""""""
  84.     scores.append(submission.score)
  85.     postlist.append(currentpost)
  86.  
  87. # Gets and downloads submissions and comments, then adds them to the "Archive" html file in the current directory
  88. for submission in reddit.subreddit(subname).submissions():
  89.     dothestuff()
  90.  
  91. # Sorting the posts by score, then writing them to the list file
  92. postlist_sorted = [postlist for scores, postlist in sorted(zip(scores, postlist), reverse=True)]
  93. global postindex
  94. postindex = 0
  95. for thing in postlist_sorted:
  96.     postindex += 1
  97.     postslist.write('#'+str(postindex)+'\n'+thing+'\n\n\n\n')
  98.  
  99. # Closes the Main html file's tags
  100. finished = """</body> </html>"""
  101. postslist.write(finished)
  102. postslist.close()
  103.  
  104. # Writing a "style.css" file to be used by the downloaded html files
  105. cssfile = open(os.path.join(subname, 'style.css'), "a")
  106. csscode = """.acomment{
  107.     color: #"""+textcolor+""";
  108.     background-color: #"""+postbg+""";
  109.     padding-left: 10px;
  110.     width: 1000px;
  111.     padding-right: 10px;
  112.     min-height: 10em;
  113.     display: table-cell;
  114.     vertical-align: middle;
  115.     box-shadow: 5px 5px 10px #"""+shadow+""";
  116.     }
  117.  
  118. .bcomment{
  119.     color: #"""+textcolor+""";
  120.     background-color: #"""+postbg+""";
  121.     padding-left: 10px;
  122.     padding-top:5px;
  123.     padding-bottom: 5px;
  124.     margin-left: 50px;
  125.     width: 1000px;
  126.     padding-right: 10px;
  127.     box-shadow: 5px 5px 10px #"""+shadow+""";
  128.     }
  129.  
  130. .bcommentinfo{
  131.     margin-left: 50px;
  132. }
  133.  
  134. .postinfo{
  135.     color: #"""+textcolor+""";
  136.     margin: auto;
  137.     background-color: #"""+postbg+""";
  138.     padding-left: 10px;
  139.     width: 1000px;
  140.     padding-right: 10px;
  141.     box-shadow: 5px 5px 10px #"""+shadow+""";
  142.     padding-bottom: 10px;
  143.     padding-top: 5px;
  144. }
  145.  
  146. .mainpost{
  147.     color: #"""+textcolor+""";
  148.     background-color: #"""+postbg+""";
  149.     box-shadow: 5px 5px 10px #"""+shadow+""";
  150.     padding-left: 10px;
  151.     padding-right: 10px;
  152.     padding-bottom: 10px;
  153.     padding-top: 5px;
  154.     display: block;
  155.     margin: auto;
  156.     width: 90%;
  157. }
  158.  
  159. body{
  160.     background-color: #"""+bgcolor+""";
  161.     color: #"""+textcolor+""";
  162. }
  163.  
  164. A:link {
  165.     color: #"""+link+""";
  166.     font-weight: bold;
  167. }
  168.  
  169. A:visited {
  170.     color: #"""+visitedlink+""";
  171.     font-weight: bold;
  172. }
  173.  
  174. A:hover {
  175.     color: #"""+hoverlink+""";
  176. }
  177.     """
  178. cssfile.write(csscode)
  179. cssfile.close()
  180. print('Got '+str(postnumber)+' posts in {0:0.1f} seconds'.format(time.time() - start))
Add Comment
Please, Sign In to add comment