import os
import urllib.request
from datetime import datetime

import pandas as pd
import praw
import praw.models
import requests
from bs4 import BeautifulSoup

# How to use:
# 1. Install Python 3 - https://www.python.org/downloads/
# 2. Open "Command Prompt" on your PC and run: pip install praw pandas requests bs4
# 3. Fill in your details below (the link below explains how)
# 4. Run this file, and it will download your last 1000 upvoted posts (1000 is the max set by Reddit)
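
# When run, this script creates images/ and posts/ folders next to itself and
# writes a timestamped .csv index of everything it grabbed (see the bottom of
# this file).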

# Fails to work for: Redgifs, bdsmlr
# Downloads crap along with the correct post for: gfycat (also fails half of the time)

# Fill in your details here
# https://praw.readthedocs.io/en/stable/getting_started/authentication.html#password-flow
reddit = praw.Reddit(
    client_id="",
    client_secret="",
    password="",
    user_agent="Downloads images from /u/<username>/upvoted before Imgur deletes them all",
    username=""
)
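
# Optional sanity check (a minimal sketch, safe to remove): reddit.user.me()
# returns the authenticated Redditor, and should raise a prawcore exception if
# the credentials above are wrong. Uncomment to test before the long download:
# print(reddit.user.me())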

column_list = ["title", "post_url", "user", "image_url", "image_loc", "notes"]
upvoted_df = pd.DataFrame(columns=column_list)


def clean_title(submission_title: str) -> str:
    """
    Remove all characters not allowed in Windows file names.
    Also keeps the name shorter than the max file-name length Windows allows.
    :param submission_title: Raw submission title
    :return: Sanitized, truncated title
    """
    for bad_char in list('\\/:*?\"<>|'):
        submission_title = submission_title.replace(bad_char, "#")
    return submission_title[:180]
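

# clean_title examples (illustrative, not executed):
#   clean_title('Cats: the "best" pets?')  ->  'Cats# the #best# pets#'
#   clean_title("A" * 300)                 ->  "A" * 180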


def download_images(url, folder_name) -> None:
    """
    Download all images and videos found on a page.
    Adapted from:
    https://www.geeksforgeeks.org/how-to-download-all-images-from-a-web-page-in-python/

    # TODO Doesn't work with redgifs
    :param url: URL to download all images from
    :param folder_name: Relative folder destination for images
    :return:
    """
    # Fetch and parse the page
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')

    for tag_name in ["img", "video"]:
        # Find all <img>/<video> tags on the page
        tags = soup.find_all(tag_name, limit=100)
        print(f"Total {len(tags)} {tag_name} tags found!")
        if not tags:
            continue

        count = 0
        for i, tag in enumerate(tags):
            # The source URL can live in any of several attributes; take the
            # first one present, in order of preference:
            # data-srcset, data-src, data-fallback-src, src
            image_link = None
            for attr in ("data-srcset", "data-src", "data-fallback-src", "src"):
                if tag.has_attr(attr):
                    image_link = tag[attr]
                    break
            if image_link is None:
                # No source URL found on this tag
                continue

            try:
                content = requests.get(image_link).content
                try:
                    # If the body decodes as UTF-8 it is text (e.g. an error
                    # page), not media, so skip it
                    str(content, 'utf-8')
                except UnicodeDecodeError:
                    # Binary content: save it
                    with open(f"{folder_name}/image{i + 1}.jpg", "wb+") as f:
                        f.write(content)
                    count += 1
            except requests.RequestException as e:
                print(f"Could not fetch content for '{image_link}' ({e})")

        # It is possible that not every tag was downloaded
        if count == len(tags):
            print(f"All {tag_name}s downloaded!")
        else:
            print(f"Total {count} {tag_name}s downloaded out of {len(tags)}")
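

# Example usage (illustrative; the URL and folder are hypothetical, and the
# folder must already exist):
#   download_images("https://imgur.com/gallery/abc123", "images/0-example/")
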
print(f"Downloading upvoted posts for: {reddit.user.me()}")
current_redditor: praw.models.Redditor = reddit.user.me()
# os.path.abspath keeps the paths valid even when the script is run from its
# own directory, where os.path.dirname(__file__) can be ""
cwd = os.path.dirname(os.path.abspath(__file__))

try:
    os.mkdir(f"{cwd}/images/")
except FileExistsError:
    print("/images/ already exists")
try:
    os.mkdir(f"{cwd}/posts/")
except FileExistsError:
    print("/posts/ already exists")

# Max limit that PRAW allows easily (1000)
for counter, submission in enumerate(current_redditor.upvoted(limit=1000)):
    try:
        submission: praw.models.Submission  # So the editor knows the type

        filetype: str
        df_row: pd.DataFrame

        # Comments have no title; skip them
        if not hasattr(submission, 'title'):
            print("Found a comment.")
            print(submission.link_title)
            print(submission.link_permalink)
            print("Skipping.")
            continue

        title = clean_title(submission.title)
        title_with_counter = f"{counter}-{title}"
        author = submission.author
        if author is None:
            author = "[deleted]"
        else:
            author = submission.author.name

        # If a URL link
        if submission.selftext == "":
            # If an image/video link
            # https://help.imgur.com/hc/en-us/articles/115000083326-What-files-can-I-upload-Is-there-a-size-limit-
            (_, filetype) = os.path.splitext(submission.url)
            if filetype.upper() in [".PNG", ".GIF", ".JPG", ".JPEG", ".MP4", ".MPEG", ".AVI", ".WEBM", ".APNG",
                                    ".TIFF", ".MOV", ".QT", ".MKV", ".MK3D", ".MKA", ".MKS", ".FLV", ".F4V",
                                    ".F4P", ".F4A", ".F4B"]:
                print(f"Directly downloading: '{submission.url}' as {filetype}")

                # filetype already includes the leading dot from os.path.splitext
                image_loc = f"{cwd}/images/{title_with_counter}{filetype}"

                # Save the image
                urllib.request.urlretrieve(submission.url, image_loc)

                df_row = pd.DataFrame(
                    [
                        [
                            submission.title,
                            submission.permalink,
                            author,
                            submission.url,
                            image_loc,
                            "IMAGE"
                        ]
                    ],
                    columns=column_list)

            # Non-image URL:
            # download all images found on the page
            else:
                print(f"Downloading files on page for: '{submission.url}'")

                image_folder_loc = f"{cwd}/images/{title_with_counter}/"
                try:
                    os.mkdir(image_folder_loc)
                except FileExistsError:
                    print(f"/images/{title_with_counter} already exists")

                download_images(submission.url, image_folder_loc)

                df_row = pd.DataFrame(
                    [
                        [
                            submission.title,
                            submission.permalink,
                            author,
                            submission.url,
                            image_folder_loc,
                            "IMAGE FOLDER"
                        ]
                    ],
                    columns=column_list)

        # If a non-URL (text) post
        # TODO could be a Poll, I guess
        else:
            print(f"Downloading text for: '{submission.url}'")
            txt_loc = f"{cwd}/posts/{counter}-{title}.txt"
            with open(txt_loc, "w+") as file:
                file.write(submission.selftext)

            df_row = pd.DataFrame(
                [
                    [
                        submission.title,
                        submission.permalink,
                        author,
                        "",
                        txt_loc,
                        "TEXT POST"
                    ]
                ],
                columns=column_list)

        # Append to the running DataFrame
        upvoted_df = pd.concat([upvoted_df, df_row])

    except Exception as e:
        print(f"Failed to download {submission.title} ({e})")
        df_row = pd.DataFrame(
            [
                [
                    submission.title,
                    submission.permalink,
                    "FAILED",
                    "",
                    "FAILED",
                    "FAILED"
                ]
            ],
            columns=column_list)
        upvoted_df = pd.concat([upvoted_df, df_row])

# ':' is not allowed in Windows file names, so swap it out of the timestamp
upvoted_df.to_csv(f"{str(datetime.now()).replace(':', '-')}.csv")
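
# The resulting CSV index has one row per post, using the column_list columns:
# title, post_url, user, image_url, image_loc, notes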