import os
import urllib.request
from datetime import datetime

import pandas as pd
import praw
import praw.models
import requests
from bs4 import BeautifulSoup

# How to use:
# 1. Install Python 3 - https://www.python.org/downloads/
# 2. Open "Command Prompt" on your PC and run: pip install praw pandas requests bs4
# 3. Fill in your details below (the link below explains how)
# 4. Run this file, and it will download your last 1000 upvoted posts (1000 is the max set by Reddit)
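
# When run, this script creates images/ and posts/ folders next to itself and
# writes a timestamped .csv index of everything it grabbed (see the bottom of
# this file).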

# Fails to work for: Redgifs, bdsmlr
# Downloads crap along with the correct post for: gfycat (also fails half of the time)

# Fill in your details here
# https://praw.readthedocs.io/en/stable/getting_started/authentication.html#password-flow
reddit = praw.Reddit(
    client_id="",
    client_secret="",
    password="",
    user_agent="Downloads images from /u/<username>/upvoted before Imgur deletes them all",
    username=""
)
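
# Optional sanity check (a minimal sketch, safe to remove): reddit.user.me()
# returns the authenticated Redditor, and should raise a prawcore exception if
# the credentials above are wrong. Uncomment to test before the long download:
# print(reddit.user.me())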

column_list = ["title", "post_url", "user", "image_url", "image_loc", "notes"]
upvoted_df = pd.DataFrame(columns=column_list)


def clean_title(submission_title: str) -> str:
    """
    Remove all characters not allowed in Windows file names.
    Also keeps the name shorter than the max file-name length Windows allows.
    :param submission_title: Raw submission title
    :return: Sanitized, truncated title
    """
    for bad_char in list('\\/:*?\"<>|'):
        submission_title = submission_title.replace(bad_char, "#")
    return submission_title[:180]
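

# clean_title examples (illustrative, not executed):
#   clean_title('Cats: the "best" pets?')  ->  'Cats# the #best# pets#'
#   clean_title("A" * 300)                 ->  "A" * 180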


def download_images(url, folder_name) -> None:
    """
    Download all images and videos found on a page.
    Adapted from:
    https://www.geeksforgeeks.org/how-to-download-all-images-from-a-web-page-in-python/

    # TODO Doesn't work with redgifs
    :param url: URL to download all images from
    :param folder_name: Relative folder destination for images
    :return:
    """
    # Fetch and parse the page
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')

    for tag_name in ["img", "video"]:
        # Find all <img>/<video> tags on the page
        tags = soup.find_all(tag_name, limit=100)
        print(f"Total {len(tags)} {tag_name} tags found!")
        if not tags:
            continue

        count = 0
        for i, tag in enumerate(tags):
            # The source URL can live in any of several attributes; take the
            # first one present, in order of preference:
            # data-srcset, data-src, data-fallback-src, src
            image_link = None
            for attr in ("data-srcset", "data-src", "data-fallback-src", "src"):
                if tag.has_attr(attr):
                    image_link = tag[attr]
                    break
            if image_link is None:
                # No source URL found on this tag
                continue

            try:
                content = requests.get(image_link).content
                try:
                    # If the body decodes as UTF-8 it is text (e.g. an error
                    # page), not media, so skip it
                    str(content, 'utf-8')
                except UnicodeDecodeError:
                    # Binary content: save it
                    with open(f"{folder_name}/image{i + 1}.jpg", "wb+") as f:
                        f.write(content)
                    count += 1
            except requests.RequestException as e:
                print(f"Could not fetch content for '{image_link}' ({e})")

        # It is possible that not every tag was downloaded
        if count == len(tags):
            print(f"All {tag_name}s downloaded!")
        else:
            print(f"Total {count} {tag_name}s downloaded out of {len(tags)}")
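

# Example usage (illustrative; the URL and folder are hypothetical, and the
# folder must already exist):
#   download_images("https://imgur.com/gallery/abc123", "images/0-example/")
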
print(f"Downloading upvoted posts for: {reddit.user.me()}")
current_redditor: praw.models.Redditor = reddit.user.me()
# os.path.abspath keeps the paths valid even when the script is run from its
# own directory, where os.path.dirname(__file__) can be ""
cwd = os.path.dirname(os.path.abspath(__file__))

try:
    os.mkdir(f"{cwd}/images/")
except FileExistsError:
    print("/images/ already exists")
try:
    os.mkdir(f"{cwd}/posts/")
except FileExistsError:
    print("/posts/ already exists")

# Max limit that PRAW allows easily (1000)
for counter, submission in enumerate(current_redditor.upvoted(limit=1000)):
    try:
        submission: praw.models.Submission  # So the editor knows the type

        filetype: str
        df_row: pd.DataFrame

        # Comments have no title; skip them
        if not hasattr(submission, 'title'):
            print("Found a comment.")
            print(submission.link_title)
            print(submission.link_permalink)
            print("Skipping.")
            continue

        title = clean_title(submission.title)
        title_with_counter = f"{counter}-{title}"
        author = submission.author
        if author is None:
            author = "[deleted]"
        else:
            author = submission.author.name

        # If a URL link
        if submission.selftext == "":
            # If an image/video link
            # https://help.imgur.com/hc/en-us/articles/115000083326-What-files-can-I-upload-Is-there-a-size-limit-
            (_, filetype) = os.path.splitext(submission.url)
            if filetype.upper() in [".PNG", ".GIF", ".JPG", ".JPEG", ".MP4", ".MPEG", ".AVI", ".WEBM", ".APNG",
                                    ".TIFF", ".MOV", ".QT", ".MKV", ".MK3D", ".MKA", ".MKS", ".FLV", ".F4V",
                                    ".F4P", ".F4A", ".F4B"]:
                print(f"Directly downloading: '{submission.url}' as {filetype}")

                # filetype already includes the leading dot from os.path.splitext
                image_loc = f"{cwd}/images/{title_with_counter}{filetype}"

                # Save the image
                urllib.request.urlretrieve(submission.url, image_loc)

                df_row = pd.DataFrame(
                    [
                        [
                            submission.title,
                            submission.permalink,
                            author,
                            submission.url,
                            image_loc,
                            "IMAGE"
                        ]
                    ],
                    columns=column_list)

            # Non-image URL:
            # download all images found on the page
            else:
                print(f"Downloading files on page for: '{submission.url}'")

                image_folder_loc = f"{cwd}/images/{title_with_counter}/"
                try:
                    os.mkdir(image_folder_loc)
                except FileExistsError:
                    print(f"/images/{title_with_counter} already exists")

                download_images(submission.url, image_folder_loc)

                df_row = pd.DataFrame(
                    [
                        [
                            submission.title,
                            submission.permalink,
                            author,
                            submission.url,
                            image_folder_loc,
                            "IMAGE FOLDER"
                        ]
                    ],
                    columns=column_list)

        # If a non-URL (text) post
        # TODO could be a Poll, I guess
        else:
            print(f"Downloading text for: '{submission.url}'")
            txt_loc = f"{cwd}/posts/{counter}-{title}.txt"
            with open(txt_loc, "w+") as file:
                file.write(submission.selftext)

            df_row = pd.DataFrame(
                [
                    [
                        submission.title,
                        submission.permalink,
                        author,
                        "",
                        txt_loc,
                        "TEXT POST"
                    ]
                ],
                columns=column_list)

        # Append to the running DataFrame
        upvoted_df = pd.concat([upvoted_df, df_row])

    except Exception as e:
        print(f"Failed to download {submission.title} ({e})")
        df_row = pd.DataFrame(
            [
                [
                    submission.title,
                    submission.permalink,
                    "FAILED",
                    "",
                    "FAILED",
                    "FAILED"
                ]
            ],
            columns=column_list)
        upvoted_df = pd.concat([upvoted_df, df_row])

# ':' is not allowed in Windows file names, so swap it out of the timestamp
upvoted_df.to_csv(f"{str(datetime.now()).replace(':', '-')}.csv")
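
# The resulting CSV index has one row per post, using the column_list columns:
# title, post_url, user, image_url, image_loc, notes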