# Analyze Playlist.py

# Given the URL of a playlist and the name of a CSV file (which will be created if necessary), update the contents of the CSV file to reflect the contents of the playlist and any changes in, for instance, accessibility.

# Location of the YT-DLP executable that this script shells out to.
# The value is converted to an absolute path and checked for existence further down in this file, so a relative path is interpreted relative to the current working directory.
ytdlp = "yt-dlp.exe" # The path to your copy of YT-DLP. Relative paths are supported, but you need the extension.

from json import loads
from os import path, replace
from re import finditer, sub
from shutil import get_terminal_size
from subprocess import run
from sys import argv, exit

# Version history:
    # (no version number): initial release.
    # (still no version number): added reporting of number of changes to CSV.
    # (version number next time seriously): fixed bug in previous feature.
    # 2022-May-28: added flag support ('-nsp' and '-nkr').
    # 2022-Jun-06: a video's status changing from 'New' to 'Ok' is no longer noteworthy.
    # 2022-Jun-30: if the list of URLs in the CSV doesn't match the list of URLs in the playlist, the program reports how many URLs are new (in the playlist but not the CSV), old (in the CSV but not the playlist), and common (in both).
    # 2022-Jul-01: changed the format of the previous report to be more clear.
    # 2022-Jul-02: added a 'Description' column (saving the video's description) between the video name and channel ID. Because this can be spammy, also added the '-nsd' flag.
# Command-line handling. The last two arguments are always the playlist URL and the CSV file;
# anything between the script name and those two must be one of the known flags.
valid_arg_flags = dict.fromkeys(["-nsp", "-nkr", "-nsd"], True)
arg_flags = {}
okay_go = len(argv) >= 3 # script name + playlist URL + CSV file is the bare minimum
if okay_go:
    for flag in argv[1:-2]: # empty when exactly three arguments were given
        if flag in valid_arg_flags:
            arg_flags[flag] = True
        else: # an unknown flag invalidates the whole command line
            okay_go = False
if not okay_go:
    usage_lines = [
        "Usage: %s [flags] [playlist URL] [CSV file]" % argv[0],
        "",
        "Valid flags:",
        "    -nsd: do Not Save the video's Description to the CSV.",
        "          (Does not change existing description information.)",
        "    -nsp: do Not Save Playlist number changes to the CSV.",
        "    -nkr: do Not Keep data on videos Removed from the playlist.",
        "          (Only data already in the CSV will be removed.)",
    ]
    exit("\n".join(usage_lines))
# State shared between calls to nice_print: the slice sizes are only recomputed when the terminal width changes.
last_terminal_width = 0
ellipses = " ... "
left_slice = 0
right_slice = 0

def nice_print(s = ""):
    """Overwrite the current terminal line with s, without emitting a newline.

    The line is first blanked out. If s is wider than the terminal (minus one column),
    its middle is replaced by the ellipses marker so that the start and the end stay visible.
    """
    global last_terminal_width, left_slice, right_slice
    width = get_terminal_size().columns - 1
    if width != last_terminal_width:
        # The terminal was resized (or this is the first call): work out how much of each end of a long string fits.
        room = width - len(ellipses)
        last_terminal_width = width
        left_slice = round(room / 2)
        right_slice = room - left_slice
    print("\r" + (" " * width), end = "") # wipe whatever was on the line before
    if len(s) > width:
        shown = s[0:left_slice] + ellipses + s[-right_slice:]
    else:
        shown = s
    print("\r" + shown, end = "")

def what_is_it(s):
    """Classify the filesystem item at path s.

    Returns "file" for a regular file, "dir" for a directory, and False if nothing exists at that path.
    Terminates the program with an error message if the path exists but is neither.
    """
    if not path.exists(s):
        return False
    if path.isfile(s):
        return "file"
    if path.isdir(s):
        return "dir"
    # Exists, yet is neither a file nor a directory. Not expected to happen, but bail out loudly if it does.
    exit("Error: %s exists, but is neither a file nor a directory? Wha?" % s)

# Resolve the configured YT-DLP location to an absolute path and refuse to continue unless it points at a file.
ytdlp = path.abspath(ytdlp)
ytdlp_kind = what_is_it(ytdlp)
if ytdlp_kind != "file":
    exit("Error: could not find YT-DLP at %s. Double-check the path and edit the header of this file." % ytdlp)

def pluralize(n, s1 = "", s2 = "s"):
    """Pick the suffix that matches the count n: s1 when n equals 1, s2 for every other value."""
    # An explicit comparison with 1 is used (rather than truthiness) because 0 must take the plural form too.
    return s1 if n == 1 else s2

def entryies(n):
    """Return the ending that turns "entr" into "entry" (n equals 1) or "entries" (any other n)."""
    return pluralize(n, s1 = "y", s2 = "ies")

def csv_to_list(csv):
    """Parse the text of a CSV file into a list of rows, each row being a list of column strings.

    Leading and trailing whitespace of the whole text is stripped first. Elements may be wrapped in
    double quotes, in which case they may contain commas, newlines, and double quotes escaped by
    doubling them. A double quote in the middle of an unquoted element is kept as-is.

    Terminates the program (via exit) with an error message naming the row number if a quoted
    element contains an unescaped double quote or is never closed.
    """
    csv = csv.strip() # remove any blank bits at beginning and end
    output = []
    row = []
    # Commas, double quotes, and newlines are where interesting stuff happens...
    stops = [match.start() for match in finditer('[",\n]', csv)]
    stops.append(len(csv)) # ...and so is the end of the contents (where symbol below is '').
    element = "" # Text gathered so far for the current quoted element (only non-empty once an escaped quote was seen).
    in_quotes = False # Are we inside a quoted element? (If not, we are at the start of / inside an unquoted one.)
    cursor = 0 # Index of the first character of the current element that has not been consumed yet.
    quote_lock = None # Index of a double quote seen inside a quoted element that is either an escape or the closing quote.
    for n in stops:
        symbol = csv[n: n + 1]
        if not in_quotes:
            if n == cursor: # We hit a stop right at the start of the element.
                if symbol == '"': # This is the start of a quoted element.
                    in_quotes = True
                else: # This is the end of an empty element.
                    row.append('')
                    if symbol != ',': # And the end of the row.
                        output.append(row)
                        row = []
                cursor = cursor + 1 # Regardless, skip past the stop.
            elif symbol != '"': # A double quote in the middle of an unquoted element is just data; anything else ends the element.
                row.append(csv[cursor: n])
                if symbol != ',': # And the end of the row.
                    output.append(row)
                    row = []
                cursor = n + 1
        elif quote_lock is None:
            if symbol == '"': # Either an escaping double quote or the end of the quoted element. We don't know yet.
                quote_lock = n
            elif symbol == '': # The contents ended without closing the quoted element!
                exit("Error: Malformed CSV, line %d: last element is quoted, but quote is not closed!" % (len(output) + 1))
            # Commas and newlines in the middle of a quoted element are just data.
        elif quote_lock + 1 != n: # Ordinary text followed the pending double quote, so it was neither an escape nor a closing quote.
            exit("Error: Malformed CSV, line %d: double quote was not escaped with another double quote!" % (len(output) + 1))
        elif symbol == '"': # We've hit an escaped double quote.
            element = element + csv[cursor: n] # Keeps one of the two double quotes (technically the escaping one).
            cursor, quote_lock = n + 1, None
        else: # The pending double quote closed the element.
            row.append(element + csv[cursor: n - 1]) # Leaving off the closing double quote of the element.
            element, in_quotes, quote_lock = "", False, None
            cursor = n + 1
            if symbol != ',': # And the end of the row.
                output.append(row)
                row = []
    return output

def list_to_csv(list_of_lists):
    """Encode rows of values as CSV text.

    Every value is converted with str(), wrapped in double quotes (with embedded double quotes doubled),
    joined with commas inside a row, and rows are joined with newlines. No trailing newline is added.
    """
    return "\n".join(
        ",".join('"%s"' % str(cell).replace('"', '""') for cell in record)
        for record in list_of_lists
    )

def load_csv(filename):
    """Read filename as UTF-8 text and parse it with csv_to_list.

    Returns the parsed list of rows, or False when the file is missing or may not be read.
    """
    try:
        with open(filename, encoding = "utf-8") as source:
            raw_text = source.read()
            return csv_to_list(raw_text)
    except (FileNotFoundError, PermissionError):
        return False

# Maps each CSV column name to the key under which YT-DLP reports that value, or to True for columns this script fills in itself.
# The order of the keys is the column order used when writing the CSV.
csv_header = {"#": "playlist_index", "URL": "url", "Video Name": "title", "Description": "description", "Channel ID": "channel_id", "Status": True, "Changed?": True, "Auto Notes": True, "User Notes": True}
def load_url_data_from_csv(csv_filename):
    """Load the CSV file csv_filename into a dictionary keyed by URL.

    If the file does not exist, a warning is printed and an empty data set (header only) is used.
    Each value of the returned dictionary is itself a dictionary mapping column name to cell contents.

    Terminates the program with an error message if the path is a directory, the file cannot be read,
    a line has the wrong number of columns, the header is not made up of exactly the expected column
    names, or a URL appears more than once.
    """
    # Does the file exist? If so, can we read it?
    csv_filetype = what_is_it(csv_filename)
    if csv_filetype == "dir":
        exit("Error: %s is a directory!" % csv_filename)
    if csv_filetype == "file":
        csv_url_data = load_csv(csv_filename)
        if csv_url_data is False:
            exit("Error: couldn't read the contents of %s!" % csv_filename)
    else:
        print("Warning: CSV file %s not found. A new one will be created." % csv_filename)
        csv_url_data = [list(csv_header)]
    # A simple check that the data is the correct size and has the correct header.
    if len(csv_url_data) == 0:
        exit("Error: CSV file %s exists, but contains no data!" % csv_filename)
    for line_number, row in enumerate(csv_url_data, start = 1):
        if len(row) != len(csv_header):
            exit("Error: CSV file %s, line %s, contains %d entr%s (should contain %d)!" % (csv_filename, line_number, len(row), entryies(len(row)), len(csv_header)))
    # The header must contain every expected column name exactly once. (The length was verified above, so equal
    # sets rule out both unknown and duplicated names.) The columns may appear in any order.
    file_header = csv_url_data[0]
    if set(file_header) != set(csv_header):
        exit("Error: CSV file %s header not valid!" % csv_filename)
    # Okay, we got this far. Now we start constructing the dictionary.
    output = {}
    for row in csv_url_data[1:]:
        # Pair each cell with the column name the file itself declares for that position, so a file whose
        # columns are in a different order than csv_header is still read correctly.
        entry = dict(zip(file_header, row))
        if entry["URL"] in output:
            exit("Error: URL %s appears more than once in CSV file %s!" % (entry["URL"], csv_filename))
        output[entry["URL"]] = entry
    if len(output) > 0:
        print("CSV file loaded. %d entr%s found." % (len(output), entryies(len(output))))
    return output

def load_url_data_from_playlist(url):
    """Ask YT-DLP for a flat listing of the playlist at url and return it keyed by video URL.

    The result has the same shape as the one from load_url_data_from_csv: each value maps every
    column name of csv_header to a string. Anything YT-DLP writes to standard error is printed.
    """
    listing = run([ytdlp, "-j", "--flat-playlist", url], capture_output = True, text = True)
    problems = listing.stderr.strip()
    if problems:
        print("Errors detected during playlist scan:\n%s" % problems)
    output = {}
    for raw_line in listing.stdout.split("\n"):
        if not raw_line: # skip blank lines (e.g. after the final newline)
            continue
        record = loads(raw_line) # one JSON object per video
        # Columns that YT-DLP does not provide (those mapped to True in csv_header) come out as empty strings.
        output[record["url"]] = {
            column: str(record.get(source_key, ""))
            for column, source_key in csv_header.items()
        }
    return output

def save_csv(filename, contents):
    """Write contents (a list of lists of values) to filename as CSV without ever leaving a half-written target.

    The text is first written to filename + ".tmp", which then replaces the real file in a single step.
    """
    scratch_path = filename + ".tmp"
    final_path = path.abspath(filename)
    encoded = list_to_csv(contents)
    with open(scratch_path, "w", encoding = "utf-8") as scratch:
        scratch.write(encoded)
    replace(scratch_path, final_path)

def save_url_data(filename, contents):
    """Write a URL dictionary (as produced by the load_url_data_from* functions) to filename via save_csv.

    Rows are ordered by playlist number, with rows lacking a number placed last, and by URL among rows with equal numbers.
    """
    columns = list(csv_header)
    rows = [[entry[column] for column in columns] for entry in contents.values()]
    # An empty playlist number sorts as infinity, i.e. after every numbered row; the URL breaks ties.
    rows.sort(key = lambda cells: (float(cells[0] or "inf"), cells[1]))
    save_csv(filename, [columns] + rows)

# Load the URL data from the CSV file, if any. (The CSV file is always the last command-line argument.)
csv_filename = path.abspath(argv[-1])
print("Loading CSV file %s..." % csv_filename)
csv_url_data = load_url_data_from_csv(csv_filename)

# If -nkr is given, remove any 'removed' items.
if arg_flags.get("-nkr", False):
    zz = len(csv_url_data) # entry count before pruning, used to report how many were dropped
    for entry in list(csv_url_data.keys()): # iterate over a copy of the keys, since entries are deleted along the way
        if csv_url_data[entry]["Status"] == "Removed":
            del csv_url_data[entry]
    if zz != len(csv_url_data):
        save_url_data(csv_filename, csv_url_data)
        print("%d item%s no longer in the playlist removed from the CSV." % (zz - len(csv_url_data), pluralize(zz - len(csv_url_data))))

# Load the URL data from the playlist. (The playlist URL is always the second-to-last command-line argument.)
print("Getting video data from %s..." % argv[-2])
playlist_url_data = load_url_data_from_playlist(argv[-2])
if len(playlist_url_data) == 0:
    # Only 'exit' is imported from sys in this file, so it must be called unqualified.
    exit("No videos in the playlist! (Did you mistype the URL?)")
print("Playlist data loaded. %d entr%s found." % (len(playlist_url_data), entryies(len(playlist_url_data))))

def prepare_set():
    """Compare the URLs of the CSV and the playlist, and prepare the set of URLs to check.

    Reads the module-level csv_url_data and playlist_url_data, stores the union of their URLs in the
    module-level urls_to_check, and prints how many URLs there are in total, plus how many are missing
    from the CSV or from the playlist. (Mainly a function to keep some variables local.)
    """
    global urls_to_check
    csv_urls, playlist_urls = set(csv_url_data), set(playlist_url_data)
    urls_to_check = playlist_urls | csv_urls
    new_urls, old_urls = playlist_urls - csv_urls, csv_urls - playlist_urls
    differences = []
    if new_urls:
        differences.append("%d not in CSV" % len(new_urls))
    if old_urls:
        differences.append("%d not in playlist" % len(old_urls))
    # Only mention differences when there are any.
    parenthetical = " (%s)" % ", ".join(differences) if differences else ""
    print("%d URL%s total to analyze%s." % (len(urls_to_check), pluralize(len(urls_to_check)), parenthetical))
prepare_set()

# Check every URL individually with YT-DLP, merge the result with what the CSV and the playlist say about it,
# and save the CSV after each URL so that an interrupted run loses no finished work.
changed_entries = 0
for i, url_to_check in enumerate(urls_to_check):
    nice_print("Checking URL %d/%d (%.1f%%), %s..." % (i + 1, len(urls_to_check), (i + 1) * 100 / len(urls_to_check), url_to_check))
    video_data = run([ytdlp, "-j", url_to_check], capture_output = True, text = True)
    couldaccess = video_data.stdout.split("\n")[0].strip() # this is either a wodge of JSON (which will test true), in which case we could access it, or an empty string (which will test false), in which case we couldn't.
    entry = dict.fromkeys(csv_header, "") # Initialize the entries in, well, entry for convenience later.
    csv_entry, playlist_entry = csv_url_data.get(url_to_check, False), playlist_url_data.get(url_to_check, False)
    # The description of the video is in the data from the video URL, not the playlist URL. So if we're getting it, we have to parse couldaccess, if possible.
    # playlist_entry is False for a URL that is only in the CSV (i.e. removed from the playlist); there is nothing to attach the description to in that case.
    if couldaccess and playlist_entry and (not arg_flags.get("-nsd")):
        # A missing or null description is stored as an empty string.
        playlist_entry["Description"] = str(loads(couldaccess).get("description") or "")
    if csv_entry:
        # We already have an entry for this URL in the CSV. Do we have one in the playlist?
        if playlist_entry: # We need to merge the data.
            new_auto_notes = []
            if csv_entry["Auto Notes"]:
                new_auto_notes.append(csv_entry["Auto Notes"])
            entry.update(playlist_entry)
            # if we aren't saving description info, keep the old info, whatever it is. Otherwise, note changes, if any.
            if arg_flags.get("-nsd"):
                entry["Description"] = csv_entry["Description"]
            elif (entry["Description"] != csv_entry["Description"]) and (csv_entry["Description"] != ""):
                new_auto_notes.append("Previous description:\n%s" % csv_entry["Description"])
            if (entry["#"] != csv_entry["#"]) and not arg_flags.get("-nsp"):
                new_auto_notes.append("Was previously playlist entry %s." % csv_entry["#"])
            if entry["Video Name"] != csv_entry["Video Name"]:
                new_auto_notes.append("Was previously named %s." % csv_entry["Video Name"])
            if entry["Channel ID"] != csv_entry["Channel ID"]:
                new_auto_notes.append("Was previously channel ID %s." % csv_entry["Channel ID"])
            # 'Status', 'Auto Notes', and 'User Notes' are blank in the playlist data, so they are still '' in entry at this point.
            # Valid statuses are 'New' (added to the playlist since the last time the program was run), 'Removed' (no longer in the playlist), 'Error' (couldn't access the file), and 'Ok' (no problems accessing the file).
            if couldaccess:
                entry["Status"] = "Ok"
                if csv_entry["Status"] == "Removed":
                    new_auto_notes.append("Was re-added to playlist.")
                if csv_entry["Status"] == "Error":
                    new_auto_notes.append("Was able to access the file this time.")
            else:
                entry["Status"] = "Error"
                sanitized_error = video_data.stderr.strip()
                if csv_entry["Status"] == "Removed":
                    new_auto_notes.append("Was re-added to playlist, but cannot access.\n%s" % sanitized_error)
                elif csv_entry["Status"] != "Error": # a video that was already in error gets no additional note
                    new_auto_notes.append("Was unable to access file.\n%s" % sanitized_error)
            entry["Changed?"] = "Y" if entry["Status"] != csv_entry["Status"] else "N"
            if (entry["Status"] == "Ok") and (csv_entry["Status"] == "New"): # New to Ok status change unremarkable.
                entry["Changed?"] = "N"
            entry["User Notes"] = csv_entry["User Notes"]
            entry["Auto Notes"] = "\n".join(new_auto_notes)
        else: # The URL has been removed from the playlist. We won't touch it other than to update the status and remove the playlist ID.
            entry.update(csv_entry)
            entry["Changed?"] = "N" if entry["Status"] == "Removed" else "Y" # only the transition to 'Removed' counts as a change
            entry["Status"] = "Removed"
            entry["#"] = ""
    else: # This is a new entry, so we need to add the data. (A URL that is not in the CSV is necessarily in the playlist.)
        entry.update(playlist_entry)
        if arg_flags.get("-nsd"):
            entry["Description"] = ""
        if couldaccess:
            entry["Status"] = "New"
            entry["Auto Notes"] = ""
        else:
            entry["Status"] = "Error"
            entry["Auto Notes"] = "Was unable to access file.\n%s" % video_data.stderr.strip()
        entry["Changed?"] = "Y"
        entry["User Notes"] = ""
    if entry["Changed?"] == "Y":
        changed_entries = changed_entries + 1
    csv_url_data[url_to_check] = entry
    save_url_data(csv_filename, csv_url_data)
nice_print() # clear the progress line
summary = "Processing complete. %d URL%s checked." % (len(urls_to_check), pluralize(len(urls_to_check)))
if changed_entries:
    summary = summary + " %d entr%s changed." % (changed_entries, entryies(changed_entries))
print(summary)