import os
import urllib.request
from datetime import datetime

import pandas as pd
import praw
import praw.models
import requests
from bs4 import BeautifulSoup
# How to use:
# 1. Install Python 3 - https://www.python.org/downloads/
# 2. Open "Command Prompt" on your PC and run: pip install praw pandas requests beautifulsoup4
#    (urllib is part of the standard library, so it does not need installing)
# 3. Fill in your details below (the link explains how)
# 4. Run this file, and it will download your last 1000 upvoted posts (1000 is the max set by Reddit)
# Fails to work for: Redgifs, bdsmlr
# Downloads crap along with the correct post for: gfycat (also fails half of the time)
# Fill in your details here
# https://praw.readthedocs.io/en/stable/getting_started/authentication.html#password-flow
reddit = praw.Reddit(
    client_id="",
    client_secret="",
    password="",
    user_agent="Downloads images from /u/<username>/upvoted before Imgur deletes them all",
    username=""
)
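# A minimal alternative, if you would rather keep credentials out of the script:
# PRAW can also read them from a praw.ini file next to this script. The site
# name "bot1" below is just an example:
#
#   [bot1]
#   client_id=...
#   client_secret=...
#   username=...
#   password=...
#
# reddit = praw.Reddit("bot1", user_agent="...")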
column_list = ["title", "post_url", "user", "image_url", "image_loc", "notes"]
upvoted_df = pd.DataFrame(columns=column_list)
def clean_title(submission_title: str) -> str:
    """
    Remove all characters not allowed in Windows file names and truncate the
    result so it stays under the maximum file-name length Windows allows.
    :param submission_title: Raw title of a Reddit submission
    :return: Sanitized title, safe to use as a Windows file name
    """
    for bad_char in list('\\/:*?\"<>|'):
        submission_title = submission_title.replace(bad_char, "#")
    return submission_title[:180]
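# Quick sanity check (example input/output):
# clean_title('Who? Me: "yes"') -> 'Who# Me# #yes#'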
def download_images(url, folder_name) -> None:
    """
    Download all images and videos found on a page.
    Adapted from:
    https://www.geeksforgeeks.org/how-to-download-all-images-from-a-web-page-in-python/
    # TODO Doesn't work with redgifs
    :param url: URL to download all images from
    :param folder_name: Relative folder destination for images
    :return: None
    """
    # Fetch the page and parse its HTML
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    for thing in ["img", "video"]:
        # Find all matching tags on the page
        images = soup.find_all(thing, limit=100)
        count = 0
        print(f"Total {len(images)} {thing} Found!")
        for i, image in enumerate(images):
            # The source URL may live in any of several attributes depending
            # on how the page lazy-loads media; try them in order of preference.
            image_link = None
            for attr in ("data-srcset", "data-src", "data-fallback-src", "src"):
                if image.get(attr):
                    image_link = image[attr]
                    break
            if image_link is None:
                # No usable source URL on this tag; skip it
                continue
            try:
                content = requests.get(image_link).content
                try:
                    # If the body decodes as UTF-8 it is text (e.g. an error
                    # page), not media, so don't save it
                    str(content, 'utf-8')
                except UnicodeDecodeError:
                    # Binary content: assume it is the media file and write it
                    with open(f"{folder_name}/image{i + 1}.jpg", "wb+") as f:
                        f.write(content)
                    count += 1
            except requests.RequestException as e:
                print(f"Could not fetch content for '{image_link}' ({e})")
        if count == len(images):
            print(f"All {thing}s Downloaded!")
        else:
            print(f"Total {count} {thing}s downloaded out of {len(images)}")
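# Example call (hypothetical URL and folder):
# download_images("https://imgur.com/gallery/abc123", "images/0-example/")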
print(f"Downloading upvoted posts for: {reddit.user.me()}")
current_redditor: praw.models.Redditor = reddit.user.me()
# abspath ensures this works even when __file__ is a bare file name
cwd = os.path.dirname(os.path.abspath(__file__))
try:
    os.mkdir(f"{cwd}/images/")
except FileExistsError:
    print("/images/ already exists")
try:
    os.mkdir(f"{cwd}/posts/")
except FileExistsError:
    print("/posts/ already exists")

# Max limit that PRAW allows easily (1000)
for counter, submission in enumerate(current_redditor.upvoted(limit=1000)):
    try:
        submission: praw.models.Submission  # Annotation so the editor knows the type
        filetype: str
        df_row: pd.DataFrame
        if not hasattr(submission, 'title'):
            print("Found a comment.")
            print(submission.link_title)
            print(submission.link_permalink)
            print("Skipping.")
            continue
        title = clean_title(submission.title)
        title_with_counter = f"{counter}-{title}"
        author = submission.author
        if author is None:
            author = "[deleted]"
        else:
            author = submission.author.name
        # If a URL link (no self text)
        if submission.selftext == "":
            # If a direct image/video link
            # https://help.imgur.com/hc/en-us/articles/115000083326-What-files-can-I-upload-Is-there-a-size-limit-
            (_, filetype) = os.path.splitext(submission.url)
            if filetype.upper() in [".PNG", ".GIF", ".JPG", ".JPEG", ".MP4", ".MPEG", ".AVI", ".WEBM",
                                    ".APNG", ".TIFF", ".MOV", ".QT", ".MKV", ".MK3D", ".MKA", ".MKS",
                                    ".FLV", ".F4V", ".F4P", ".F4A", ".F4B"]:
                print(f"Directly Downloading: '{submission.url}' as {filetype}")
                # splitext keeps the leading dot, so don't add another one
                image_loc = f"{cwd}/images/{title_with_counter}{filetype}"
                # Save image
                urllib.request.urlretrieve(submission.url, image_loc)
                df_row = pd.DataFrame(
                    [
                        [
                            submission.title,
                            submission.permalink,
                            author,
                            submission.url,
                            image_loc,
                            "IMAGE"
                        ]
                    ],
                    columns=column_list)
            # Non-image URL:
            # download all images found on the page
            else:
                print(f"Downloading files on page for: '{submission.url}'")
                image_folder_loc = f"{cwd}/images/{title_with_counter}/"
                try:
                    os.mkdir(image_folder_loc)
                except FileExistsError:
                    print(f"/images/{title_with_counter} already exists")
                download_images(submission.url, image_folder_loc)
                df_row = pd.DataFrame(
                    [
                        [
                            submission.title,
                            submission.permalink,
                            author,
                            submission.url,
                            image_folder_loc,
                            "IMAGE FOLDER"
                        ]
                    ],
                    columns=column_list)
        # If a non-URL (text) post
        # TODO could be a Poll, I guess
        else:
            print(f"Downloading Text For: '{submission.url}'")
            txt_loc = f"{cwd}/posts/{counter}-{title}.txt"
            # utf-8 so bodies with emoji don't crash on Windows' default encoding
            with open(txt_loc, "w+", encoding="utf-8") as file:
                file.write(submission.selftext)
            df_row = pd.DataFrame(
                [
                    [
                        submission.title,
                        submission.permalink,
                        author,
                        "",
                        txt_loc,
                        "TEXT POST"
                    ]
                ],
                columns=column_list)
        # Append this post's row to the running DataFrame
        upvoted_df = pd.concat([upvoted_df, df_row], ignore_index=True)
    except Exception as e:
        print(f"Failed to download {submission.title} ({e})")
        df_row = pd.DataFrame(
            [
                [
                    submission.title,
                    submission.permalink,
                    "FAILED",
                    "",
                    "FAILED",
                    "FAILED"
                ]
            ],
            columns=column_list)
        upvoted_df = pd.concat([upvoted_df, df_row], ignore_index=True)
upvoted_df.to_csv(f"{str(datetime.now()).replace(':', '-')}.csv")
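# The CSV doubles as an index of everything that was saved; to inspect it
# later (hypothetical file name):
# pd.read_csv("2023-04-20 12-00-00.000000.csv")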