Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- ### wrdcldgen v1.0
- # This script uses selenium to scrape data from warosu.org and the WordCloud python module to create a wordcloud
- # It starts from the newest specified thread and works backwards until a stop condition is fulfilled
- ### You can use the following arguments:
- # -g specify a general by subject, e.g. flip; this finds the latest matching thread. Note that it might also catch other threads that have "flip" in their subject
- # -thread specify any thread by its thread number; this is the preferred method
- # -to the last thread that should be included, specified by its thread number
- # -n the number of threads that should be used for the wordcloud, overrides -to
- # -e exports the posts as an array, useful for testing wordcloud settings and debugging
- # -i imports the array, overrides other inputs
- # -wc counts unique words
- # -nocloud doesn't create a wordcloud, only useful for debugging
- ### Examples
- # wrdcldgen -thread 91556315 -to 91531858
- # wrdcldgen -n 3 -e -thread 94235698 -nocloud
- # wrdcldgen -i -wc
- # wrdcldgen -g mint -n 4
- ### Dependencies
- # python 3.12.6
- # firefox or chrome+chromedriver
- # selenium
- # wordcloud
- ### Probably all python 3.x versions will work but I'm not sure
- ### You can install selenium and wordcloud running pip in cmd: pip install selenium wordcloud
- ### pip comes with python, so you should already have it
- ### Get chromedriver here: https://googlechromelabs.github.io/chrome-for-testing/
- ### You will have to add environment variables for chromedriver and python
- ### Settings
- # WordCloud settings can be found at the end of the script
- # pick your browser
# Browser that selenium drives; swap which of the two lines is commented out.
browser = "firefox"
# browser = "chrome"

# Folder that receives the wordcloud image and holds the import/export file.
path = r"D:\Downloads"

# Image format of the wordcloud. ".jpg" might be a good option for huge
# wordclouds if the WordCloud options further down are changed as well.
fileformat = ".png"
# fileformat = ".jpg"

# Generals whose backlink to the previous thread is in the second post.
anchor_L = ["flip"]
- ### end of settings
- ### import
- import sys
- import re
- from wordcloud import WordCloud
- from selenium import webdriver
- from selenium.webdriver.common.by import By
- from selenium.webdriver.common.keys import Keys
- ### declarations
# Options that take a value: slot 0 is the "was given" flag, values follow it.
argX = {"-g": [0], "-thread": [0], "-to": [0], "-n": [0]}
# Switches without a value; same layout as argX.
argZ = {"-nocloud": [0], "-i": [0], "-e": [0], "-wc": [0]}
# post_A[0] is the metadata dict; every later entry is the post list of one thread.
post_A = [{"subject": "", "anchor": 0, "ignore_count": 1, "filename": "wrdcld"}]
post_L = []
# error_L[0] is the error flag; the error messages follow it.
error_L = [0]


### processing arguments
def parse_args(argv, value_options, switch_options):
    """Record the command line arguments in the two option tables.

    Each table maps an option name to a list whose first slot is set to 1
    when the option appears in *argv*. Every argument that is not itself an
    option name is appended to the list of the most recently seen option.

    Args:
        argv: the arguments to process (without the program name).
        value_options: table of options that expect a value (e.g. argX).
        switch_options: table of plain switches (e.g. argZ).

    Returns:
        The arguments that appeared before any known option. They cannot be
        attributed to an option and are returned so the caller can report them.
    """
    unexpected = []
    # No option seen yet; None is never a key of either table.
    key = None
    for a in argv:
        if a in switch_options:
            key = a
            switch_options[a][0] = 1
        elif a in value_options:
            key = a
            value_options[a][0] = 1
        elif key in value_options:
            value_options[key].append(a)
        elif key in switch_options:
            switch_options[key].append(a)
        else:
            unexpected.append(a)
    return unexpected


# A stray leading argument is reported through the regular error list.
for a in parse_args(sys.argv[1:], argX, argZ):
    error_L[0] = 1
    error_L.append(f"Error: Unexpected argument {a!r} before any option")
- ### check for missing arguments
# Scraping needs a start thread and a stop condition; importing (-i) needs neither.
if argZ["-i"][0] != 1:
    if len(argX["-g"]) != 2 and len(argX["-thread"]) != 2:
        error_L[0] = 1
        error_L.append("Error: Missing input specify either a general with -g, a threadnumber with -thread or an import a file with -i")
    if len(argX["-n"]) != 2 and len(argX["-to"]) != 2:
        error_L[0] = 1
        error_L.append("Error: Missing input specify amount of threads with -n or the last threadnumber with -to")
    # These two values are read unconditionally once their option was given,
    # so a bare option has to be rejected here instead of crashing later.
    for opt in ("-thread", "-to"):
        if argX[opt][0] == 1 and len(argX[opt]) < 2:
            error_L[0] = 1
            error_L.append(f"Error: {opt} was given without a threadnumber")
### error message
if error_L[0] == 1:
    for e in error_L[1:]:
        print(e)
    # Non-zero status so callers can tell the run failed.
    sys.exit(1)
- ### search for general on warosu
if argZ["-i"][0] != 1:
    print("start selenium...")
    # options for using firefox
    if browser == "firefox":
        from selenium.webdriver.firefox.options import Options
        firefox_options = Options()
        firefox_options.page_load_strategy = "eager"
        firefox_options.set_preference("permissions.default.image", 2)
        firefox_options.add_argument("--headless")  # comment this line to see what selenium is doing
        driver = webdriver.Firefox(options=firefox_options)
    # options for using chrome
    elif browser == "chrome":
        from selenium.webdriver.chrome.options import Options
        chrome_options = Options()
        chrome_options.page_load_strategy = "eager"
        chrome_options.add_argument("--disable-images")
        chrome_options.add_argument("--headless")  # comment this line to see what selenium is doing
        driver = webdriver.Chrome(options=chrome_options)
    else:
        # Without this guard an unknown setting shows up later as a NameError on driver.
        sys.exit(f'Error: Unknown browser "{browser}", set browser to "firefox" or "chrome"')
    driver.implicitly_wait(10)
    # try/finally so the browser is shut down on every way out of the scraping
    # code, including BREAK and unexpected errors.
    try:
        ### if threadnumber was specified
        if argX["-thread"][0] == 1:
            thread = argX["-thread"][1]
            driver.get("https://warosu.org/vt/thread/" + thread)
            title = driver.find_element(By.CLASS_NAME, "filetitle")
            # keep only letters and digits of the thread title
            subject = re.sub('[^0-9A-Za-z]', "", title.text)
        ### if general was specified (e.g. flip)
        elif argX["-g"][0] == 1:
            subject = argX["-g"][1]
            driver.get("https://warosu.org/vt/?task=search2&ghost=false&search_text=&search_subject=" + subject)
            # open the first link of the search result; the thread number is
            # the last segment of the url the click leads to
            driver.find_element(By.XPATH, "/html/body/form/div/table[1]/tbody/tr/td[2]/a[1]").click()
            thread = driver.current_url.split("/")[-1]
        ### metadata for import/export
        post_A[0]["subject"] = subject
        post_A[0]["filename"] = "\\" + subject + "_" + post_A[0]["filename"]
        post_A[0]["thread"] = thread
        if subject in anchor_L:
            post_A[0]["anchor"] = 1
            post_A[0]["ignore_count"] = 2
        # stop conditions for the while loop; -1 disables a condition that was
        # not specified (a threadnumber is never below -1, a counter never is -1)
        last_thread = int(argX["-to"][1]) if len(argX["-to"]) > 1 else -1
        max_threads = int(argX["-n"][1]) if len(argX["-n"]) > 1 else -1
        print("scraping threads...")
        i = 0
        ### create an array containing threads and posts
        while int(thread) >= last_thread and max_threads != i:
            i += 1
            current_thread = thread
            # create a list of posts of current thread and add it to the array
            post_L = [p.text for p in driver.find_elements(By.TAG_NAME, "blockquote")]
            post_A.append(post_L)
            # find backlink to previous thread
            # this will always be the first backlink in the anchor post
            anchor = post_A[0]["anchor"]
            regex = re.search(">>[0-9]+", post_L[anchor]) if len(post_L) > anchor else None
            if regex is not None:
                thread = regex.group()[2:]
            else:
                # handle missing backlinks: ask until the answer is usable
                print(f"No backlink found to previous thread in {driver.current_url}")
                thread = ""
                while not (re.fullmatch("[0-9]+", thread) or thread in ("EXIT", "BREAK")):
                    thread = input(f"Enter the threadnumber to continue or EXIT to start the wordcloud with {i} threads. Enter BREAK to exit without doing anything. ").strip()
                if thread == "BREAK":
                    sys.exit()
                if thread == "EXIT":
                    # stop scraping and keep a real threadnumber for the metadata
                    thread = current_thread
                    break
            # go to previous thread
            driver.get("https://warosu.org/vt/thread/" + thread)
    finally:
        print("exit selenium...")
        # quit() ends the whole session; close() only closes the current window
        driver.quit()
    post_A[0]["thread"] = thread + "-" + post_A[0]["thread"]
### import array
else:
    # NOTE(security): eval() runs whatever Python code the file contains. Only
    # import files written by this script's own export; ast.literal_eval would
    # be the safe way to read the exported literal.
    with open(path + r"\wrdcld.txt", "r", encoding="utf-8") as file:
        post_A = eval(file.read())
- ### export array
# Write the scraped array to disk only when -e was given. The original test
# was inverted and exported on every run that did NOT use -e.
if argZ["-e"][0] == 1:
    # str(post_A) is the format the import branch (-i) reads back
    with open(path + r"\wrdcld.txt", "w", encoding="utf-8") as file:
        file.write(str(post_A))
    #print("array exported")
- ### manipulate posts
# pattern for links, compiled once instead of once per post
link_re = re.compile(r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))''')


def clean_post(post):
    """Return the text of one post prepared for the wordcloud.

    Line breaks become spaces, everything after the first "really" is cut,
    links are removed and ">" (quote and backlink markers) becomes a space.
    """
    text = post.replace("\n", " ")
    #text = re.sub('>>[0-9]+', "", text) # removing backlinks, redundant with standard settings
    text = re.sub("really.*", "really", text)  # reducing butt touch anon's ritual post to a single really
    text = link_re.sub("", text)  # removing links
    return text.replace(">", " ")


# Skip the first ignore_count posts of every thread. The posts are joined with
# a space so the last word of one post and the first word of the next stay
# separate words.
wordcloud_s = " ".join(
    clean_post(p)
    for L in post_A[1:]
    for p in L[post_A[0]["ignore_count"]:]
)
- ### unique word count
if argZ["-wc"][0] == 1:
    wordcloud_wc = wordcloud_s.lower().replace("'s", "")
    wordcloud_wc = re.sub('[^a-z ]', "", wordcloud_wc)  # remove everything but letters
    # split() without an argument never yields empty strings, so the
    # dictionary holds real words only and its size is the exact count
    wordcloud_D = {}
    for w in wordcloud_wc.split():
        wordcloud_D[w] = wordcloud_D.get(w, 0) + 1
    #for d in wordcloud_D: print(d, wordcloud_D[d]) # print a list of all unique words and their frequency
    print(f"{len(wordcloud_D)} unique words in {len(post_A)-1} threads")
- # WordCloud settings
- # The documentation can be found here https://amueller.github.io/word_cloud/generated/wordcloud.WordCloud.html
# Number of thread entries in post_A (index 0 is the metadata dict).
thread_count = len(post_A) - 1
wc = WordCloud(
    background_color="#323232",
    # usable colorschemes: https://matplotlib.org/stable/users/explain/colors/colormaps.html#sphx-glr-users-explain-colors-colormaps-py
    colormap="Blues",
    relative_scaling=0.8,
    font_step=8,
    # changing the font will also lead to file size changes
    font_path=r"C:\windows\fonts\impact.ttf",
    # poor attempt at dynamically sizing the wordcloud: every value grows
    # with the number of threads up to a fixed cap
    width=min(6400, thread_count * 1000),
    height=min(6400, thread_count * 1000),
    max_words=min(6400, thread_count * 800),
    max_font_size=min(1200, thread_count * 200),
    min_font_size=min(32, thread_count * 2 + 10),
    # fixed alternatives to the values above:
    #font_path=None,
    #width=4000,
    #height=4000,
    #max_words=2500,
    #max_font_size=None,
    #min_font_size=16,
    min_word_length=2,
    prefer_horizontal=1,
    collocations=False,
    include_numbers=False,
)
- ### Generate the wordcloud
# Render and save the image unless -nocloud was given.
if argZ["-nocloud"][0] != 1:
    print("creating wordcloud, this may take a while...")
    wc.generate(wordcloud_s)
    stem = post_A[0]["filename"]
    wc.to_file(path + stem + fileformat)
    # the first character of the stored name is dropped for the message
    print("wordcloud generated: " + stem[1:] + fileformat)
else:
    print("no wordcloud generated")
print("done.")
Advertisement
Add Comment
Please, Sign In to add comment