# wrdcldgen.py

### wrdcldgen v1.0
# This script uses selenium to scrape data from warosu.org and the WordCloud python module to create a wordcloud
# It starts from the newest specified thread and works backwards until a stop condition is fulfilled

    ### You can use the following arguments:
    # -g        specify a general by subject, e.g. flip; this will find the latest thread. Note that this might catch other threads that have "flip" in their subject
    # -thread   specify any thread by its threadnumber; this is the preferred method
    # -to       the last thread that should be included, specified by its threadnumber
    # -n        the amount of threads that should be used for the wordcloud, overrides -to
    # -e        exports the posts as an array, useful for testing wordcloud settings and debugging
    # -i        imports the array, overrides other inputs
    # -wc       counts unique words
    # -nocloud  doesn't create a wordcloud, only useful for debugging

    ### Examples
    # wrdcldgen -thread 91556315 -to 91531858
    # wrdcldgen -n 3 -e -thread 94235698 -nocloud
    # wrdcldgen -i -wc
    # wrdcldgen -g mint -n 4

### Dependencies
# python 3.12.6
# firefox or chrome+chromedriver
# selenium
# wordcloud
### Probably all python 3.x versions will work but I'm not sure
### You can install selenium and wordcloud running pip in cmd: pip install selenium wordcloud
### pip comes with python, so you should already have it
### Get chromedriver here: https://googlechromelabs.github.io/chrome-for-testing/
### You will have to add environment variables for chromedriver and python

### Settings
# WordCloud settings can be found at the end of the script

# Browser that selenium should drive: "firefox" or "chrome"
browser = "firefox"
# browser = "chrome"

# Directory that receives the wordcloud image and the import/export file
path = r"D:\Downloads"

# Image format of the generated wordcloud
fileformat = ".png"
# fileformat = ".jpg"  # jpg might be a good option for huge wordclouds if you change the wordcloud options down below

# Generals whose backlink to the previous thread is in the second post
# instead of the first one; add yours to this list if that applies
anchor_L = ["flip"]

### end of settings
### import
import sys
import re
from wordcloud import WordCloud
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

### declarations
# Flags that take a value: element 0 becomes 1 once the flag was seen,
# the value(s) given after the flag are appended behind it.
argX = {"-g": [0], "-thread": [0], "-to": [0], "-n": [0]}
# Switches without a value: element 0 becomes 1 once the switch was seen.
argZ = {"-nocloud": [0], "-i": [0], "-e": [0], "-wc": [0]}
# post_A[0] is the metadata dict; every later element is the list of posts of one thread.
post_A = [{"subject": "", "anchor": 0, "ignore_count": 1, "filename": "wrdcld"}]
post_L = []
# error_L[0] is the error marker, the messages are appended behind it.
error_L = [0]


def parse_args(argv, value_flags, switch_flags):
    """Record the command line arguments in the two flag tables.

    argv          list of argument strings (without the script name)
    value_flags   dict mapping a flag to its list, e.g. argX
    switch_flags  dict mapping a switch to its list, e.g. argZ

    A known flag sets element 0 of its list to 1. Every other argument is
    appended to the list of the most recently seen flag. Arguments that
    appear before the first known flag are ignored; previously they raised
    a NameError because no flag had been remembered yet.
    Both tables are modified in place, nothing is returned.
    """
    current = None  # list of the flag seen last, None until the first flag
    for arg in argv:
        if arg in switch_flags:
            current = switch_flags[arg]
            current[0] = 1
        elif arg in value_flags:
            current = value_flags[arg]
            current[0] = 1
        elif current is not None:
            current.append(arg)


### processing arguments
parse_args(sys.argv[1:], argX, argZ)

### check for missing arguments
# With -i the posts come from the import file, so the scraping inputs are not needed.
importing = argZ["-i"][0] == 1

# a starting point is required: either a general or a threadnumber
if not importing and len(argX["-g"]) != 2 and len(argX["-thread"]) != 2:
    error_L[0] = 1
    error_L.append("Error: Missing input specify either a general with -g, a threadnumber with -thread or an import a file with -i")

# a stop condition is required: either a thread count or the last threadnumber
if not importing and len(argX["-n"]) != 2 and len(argX["-to"]) != 2:
    error_L[0] = 1
    error_L.append("Error: Missing input specify amount of threads with -n or the last threadnumber with -to")

# These values are converted with int() while scraping; report a bad value
# here instead of failing with a traceback after the browser was started.
if not importing:
    for flag in ("-thread", "-to", "-n"):
        if len(argX[flag]) == 2:
            try:
                int(argX[flag][1])
            except ValueError:
                error_L[0] = 1
                error_L.append(f"Error: {flag} expects a number, got {argX[flag][1]}")

### error message
if error_L[0] == 1:
    for e in error_L[1:]:
        print(e)
    # non-zero status so that callers can tell the run failed
    sys.exit(1)

### search for general on warosu
# Unless -i was given, the posts are scraped with selenium; otherwise the
# array is read from the export file (see the else branch at the end).
if argZ["-i"][0] != 1:
    print("start selenium...")

    # options for using firefox
    if browser == "firefox":
        from selenium.webdriver.firefox.options import Options
        firefox_options = Options()
        firefox_options.page_load_strategy = "eager"
        firefox_options.set_preference("permissions.default.image", 2)   # presumably disables image loading
        firefox_options.add_argument("--headless")      # comment this line to see what selenium is doing
        driver = webdriver.Firefox(options=firefox_options)

    # options for using chrome
    elif browser == "chrome":
        from selenium.webdriver.chrome.options import Options
        chrome_options = Options()
        chrome_options.page_load_strategy = "eager"
        chrome_options.add_argument("--disable-images")
        chrome_options.add_argument("--headless")      # comment this line to see what selenium is doing
        driver = webdriver.Chrome(options=chrome_options)

    # any other value would leave driver undefined, so stop with a clear message
    else:
        sys.exit(f'Error: unknown browser "{browser}", set browser to "firefox" or "chrome" in the settings')

    # try/finally makes sure the browser and its driver process are shut down
    # even when scraping fails or the user enters BREAK
    try:
        driver.implicitly_wait(10)

        ### if threadnumber was specified
        if argX["-thread"][0] == 1:
            thread = argX["-thread"][1]
            url = "https://warosu.org/vt/thread/" + thread
            driver.get(url)
            subject = driver.find_element(By.CLASS_NAME, "filetitle")
            # keep only letters and digits of the subject, it becomes part of the filename
            subject = re.sub('[^0-9A-Za-z]', "", subject.text)

        ### if general was specified (e.g. flip)
        elif argX["-g"][0] == 1:
            subject = argX["-g"][1]
            url = "https://warosu.org/vt/?task=search2&ghost=false&search_text=&search_subject=" + subject
            driver.get(url)
            # follow the first link of the first search result
            thread_element = driver.find_element(By.XPATH, "/html/body/form/div/table[1]/tbody/tr/td[2]/a[1]")
            thread_element.click()
            # the threadnumber is the last path segment of the opened url
            thread = driver.current_url.split("/")[-1]

        ### metadata for import/export
        post_A[0]["subject"] = subject
        post_A[0]["filename"] = "\\" + subject + "_" + post_A[0]["filename"]
        post_A[0]["thread"] = thread
        if subject in anchor_L:
            post_A[0]["anchor"] = 1
            post_A[0]["ignore_count"] = 2

        # set the and-condition for the while loop:
        # the limit that was not given gets -1, which never stops the loop
        if argX["-to"][0] == 1:
            argX["-n"].append(-1)

        elif argX["-n"][0] == 1:
            argX["-to"].append(-1)

        stop_thread = int(argX["-to"][1])
        max_threads = int(argX["-n"][1])
        anchor = post_A[0]["anchor"]   # index of the post that holds the backlink

        print("scraping threads...")
        i = 0
        ### create an array containing threads and posts
        while int(thread) >= stop_thread and max_threads != i:
            i = i + 1
            # create a list of posts of current thread and add it to the array
            post_L = [p.text for p in driver.find_elements(By.TAG_NAME, "blockquote")]
            post_A.append(post_L)

            # find backlink to previous thread
            # this will always be the first backlink it finds
            # a page with too few posts is treated like a missing backlink
            regex = re.search(">>[0-9]+", post_L[anchor]) if len(post_L) > anchor else None

            if regex:
                thread = regex.group()[2:]
            # handle missing backlinks
            else:
                print(f"No backlink found to previous thread in {driver.current_url}")
                answer = ""
                # ask again until the answer is a number or one of the two keywords,
                # anything else would fail in int(thread) above
                while not (answer.isdecimal() or answer in ("EXIT", "BREAK")):
                    answer = input(f"Enter the threadnumber to continue or EXIT to start the wordcloud with {i} threads. Enter BREAK to exit without doing anything. ").strip()
                if answer == "EXIT":
                    # thread keeps the number of the last scraped thread
                    break
                elif answer == "BREAK":
                    sys.exit()
                thread = answer

            # go to previous thread
            url = "https://warosu.org/vt/thread/" + thread
            driver.get(url)

    finally:
        print("exit selenium...")
        # quit() instead of close(): close() only closes the current window
        driver.quit()

    post_A[0]["thread"] = thread + "-" + post_A[0]["thread"]

### import array
else:
    import ast
    with open(path + r"\wrdcld.txt", "r", encoding="utf-8") as file:
        # The export file holds str(post_A), a literal made of lists, dicts,
        # strings and ints. ast.literal_eval reads exactly that and, unlike
        # eval, does not execute code placed in the file.
        post_A = ast.literal_eval(file.read())
    ignore_count = post_A[0]["ignore_count"]

### export array
# Write the array only when -e was given. The condition used to be inverted,
# which wrote the file on every run except when -e was passed.
if argZ["-e"][0] == 1:
    # the with-block closes the file even if writing fails
    with open(path + r"\wrdcld.txt", "w", encoding="utf-8") as file:
        file.write(str(post_A))
    #print("array exported")

### manipulate posts
# Builds wordcloud_s, the text the wordcloud and the word count are made from.
# pattern for links, compiled once because it is applied to every post
link_regex = re.compile(r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))''')

cleaned_L = []
for L in post_A[1:]:
    # the first ignore_count posts of every thread are skipped
    for p in L[post_A[0]["ignore_count"]:]:
        y = p.replace("\n", " ")
        #y = re.sub('>>[0-9]+', "", y) # removing backlinks, redundant with standard settings
        y = re.sub("really.*", "really", y) # reducing butt touch anon's ritual post to a single really
        y = link_regex.sub("", y) #removing links
        y = y.replace(">", " ")
        cleaned_L.append(y)

# Join with a space: without a separator the last word of a post was glued
# to the first word of the next post and counted as one word.
wordcloud_s = " ".join(cleaned_L)

### unique word count
# Only runs with -wc; prints the number of distinct words in wordcloud_s.
if argZ["-wc"][0] == 1:
    wordcloud_wc = wordcloud_s.lower().replace("'s", "")
    wordcloud_wc = re.sub('[^a-z ]', "", wordcloud_wc) # remove everything but letters
    # split() without argument drops empty strings, so every key is a real word.
    # The old split(" ") kept them and the result was corrected with -1, which
    # was off by one whenever the text contained no empty token.
    wordcloud_L = wordcloud_wc.split()
    wordcloud_D = {}
    for w in wordcloud_L:
        wordcloud_D[w] = wordcloud_D.get(w, 0) + 1
    #for d in wordcloud_D: print(d, wordcloud_D[d]) # print a list of all unique words and their frequency
    print(f"{len(wordcloud_D)} unique words in {len(post_A)-1} threads")


# WordCloud settings
# The documentation can be found here https://amueller.github.io/word_cloud/generated/wordcloud.WordCloud.html
# Number of thread entries in post_A (element 0 is the metadata dict);
# the size related settings below grow with it up to a fixed cap.
thread_count = len(post_A) - 1
wc_settings = {
    "background_color": "#323232",
    "colormap": "Blues",       # you can check usable colorschemes here: https://matplotlib.org/stable/users/explain/colors/colormaps.html#sphx-glr-users-explain-colors-colormaps-py
    "relative_scaling": 0.8,
    "font_step": 8,
    "font_path": r"C:\windows\fonts\impact.ttf",   # changing the font will also lead to file size changes
    "width": min(6400, thread_count * 1000),       # poor attempt at dynamically sizing the wordcloud
    "height": min(6400, thread_count * 1000),
    "max_words": min(6400, thread_count * 800),
    "max_font_size": min(1200, thread_count * 200),
    "min_font_size": min(32, thread_count * 2 + 10),
    # fixed alternatives to the dynamic values above:
    #"font_path": None,
    #"width": 4000,
    #"height": 4000,
    #"max_words": 2500,
    #"max_font_size": None,
    #"min_font_size": 16,
    "min_word_length": 2,
    "prefer_horizontal": 1,
    "collocations": False,
    "include_numbers": False,
}
wc = WordCloud(**wc_settings)

### Generate the wordcloud
# -nocloud skips rendering; otherwise the image is written into path,
# named after the filename stored in the metadata plus the chosen format.
cloud_wanted = argZ["-nocloud"][0] != 1
if cloud_wanted:
    print("creating wordcloud, this may take a while...")
    wc.generate(wordcloud_s)
    target = path + post_A[0]["filename"] + fileformat
    wc.to_file(target)
    # [1:] drops the first character of the stored filename for the message
    shown_name = post_A[0]["filename"][1:] + fileformat
    print("wordcloud generated: " + shown_name)
else:
    print("no wordcloud generated")
print("done.")