Guest User

wrdcldgen.py

a guest
Jan 29th, 2025
42
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 10.19 KB | None | 0 0
### wrdcldgen v1.0
# This script uses selenium to scrape data from warosu.org and the WordCloud python module to create a wordcloud.
# It starts from the newest specified thread and works backwards until a stop condition is fulfilled.

### You can use the following arguments:
# -g specify a general by subject, e.g. flip; this will find the latest thread. Note that this might catch other threads that have "flip" in their subject
# -thread specify any thread by its thread number; this is the preferred method
# -to the last thread that should be included, specified by its thread number
# -n the number of threads that should be used for the wordcloud, overrides -to
# -e exports the posts as an array, useful for testing wordcloud settings and debugging
# -i imports the array, overrides other inputs
# -wc counts unique words
# -nocloud doesn't create a wordcloud, only useful for debugging

### Examples
# wrdcldgen -thread 91556315 -to 91531858
# wrdcldgen -n 3 -e -thread 94235698 -nocloud
# wrdcldgen -i -wc
# wrdcldgen -g mint -n 4

### Dependencies
# python 3.12.6
# firefox or chrome+chromedriver
# selenium
# wordcloud
### Probably all python 3.x versions will work, but I'm not sure
### You can install selenium and wordcloud by running pip in cmd: pip install selenium wordcloud
### pip comes with python, so you should already have it
### Get chromedriver here: https://googlechromelabs.github.io/chrome-for-testing/
### You will have to add environment variables for chromedriver and python

### Settings
# WordCloud settings can be found at the end of the script

  35. # pick your browser
  36. browser="firefox"
  37. #browser="chrome"
  38.  
  39. # directory for wordcloud output & import/export file
  40. path=r"D:\Downloads"
  41.  
  42. # file format for wordcloud
  43. fileformat=".png"
  44. #fileformat=".jpg" # jpg might be a good option for huge wordclouds if you change the wordcloud options down below
  45.  
  46. # If your general has it's backlink in the second post, add it to the list below
  47. anchor_L=["flip",]
  48.  
### end of settings
### import
import ast
import re
import sys
from collections import Counter

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from wordcloud import WordCloud

  58. ### declarations
  59. argX = {"-g":[0], "-thread":[0],"-to":[0],"-n":[0]}
  60. argZ = {"-nocloud":[0], "-i":[0],"-e":[0], "-wc":[0]}
  61. post_A=[{"subject":"", "anchor":0, "ignore_count":1, "filename":"wrdcld"}]
  62. post_L=[]
  63. error_L=[0]
  64.  
  65. ### processing arguments
  66. for a in sys.argv[1:]:
  67. if a in argZ:
  68. key=a
  69. argZ[a][0]=1
  70. elif a in argX:
  71. key=a
  72. argX[a][0]=1
  73. elif key in argX:
  74. argX[key].append(a)
  75. elif key in argZ:
  76. argZ[key].append(a)
  77.  
  78. ### check for missing arguments
  79. if len(argX["-g"]) != 2 and len(argX["-thread"]) != 2 and argZ["-i"][0] != 1:
  80. error_L[0]=1
  81. error_L.append("Error: Missing input specify either a general with -g, a threadnumber with -thread or an import a file with -i")
  82.  
  83. if len(argX["-n"]) != 2 and len(argX["-to"]) != 2 and argZ["-i"][0] != 1:
  84. error_L[0]=1
  85. error_L.append("Error: Missing input specify amount of threads with -n or the last threadnumber with -to")
  86.  
  87. ### error message
  88. if error_L[0]==1:
  89. for e in error_L[1:]:
  90. print(e)
  91. sys.exit()
  92.  
  93. ### search for general on warosu
  94. if argZ["-i"][0]!=1:
  95. print("start selenium...")
  96.  
  97. # options for using firefox
  98. if browser=="firefox":
  99. from selenium.webdriver.firefox.options import Options
  100. firefox_options = Options()
  101. firefox_options.page_load_strategy = "eager"
  102. firefox_options.set_preference("permissions.default.image", 2)
  103. firefox_options.add_argument("--headless") # comment this line to see what selenium is doing
  104. driver = webdriver.Firefox(options=firefox_options)
  105.  
  106. # options for using chrome
  107. elif browser=="chrome":
  108. from selenium.webdriver.chrome.options import Options
  109. chrome_options = Options()
  110. chrome_options.page_load_strategy = "eager"
  111. chrome_options.add_argument("--disable-images")
  112. chrome_options.add_argument("--headless") # uncomment this line to hide what selenium is doing
  113. driver = webdriver.Chrome(options=chrome_options)
  114. driver.implicitly_wait(10)
  115.  
  116. ### if threadnumber was specified
  117. if argX["-thread"][0]==1:
  118. thread = argX["-thread"][1]
  119. url="https://warosu.org/vt/thread/"+thread
  120. driver.get(url)
  121. subject = driver.find_element(By.CLASS_NAME, "filetitle")
  122. subject = re.sub('[^0-9A-Za-z]', "", subject.text)
  123.  
  124. ### if general was specified (e.g. flip)
  125. elif argX["-g"][0]==1:
  126. subject=argX["-g"][1]
  127. url="https://warosu.org/vt/?task=search2&ghost=false&search_text=&search_subject="+subject
  128. driver.get(url)
  129. thread_element = driver.find_element(By.XPATH, "/html/body/form/div/table[1]/tbody/tr/td[2]/a[1]")
  130. thread_element.click()
  131. split_L=driver.current_url.split("/")
  132. thread=split_L[len(split_L)-1]
  133.  
  134. ### metadata for import/export
  135. post_A[0]["subject"]=subject
  136. post_A[0]["filename"]="\\"+subject+"_"+post_A[0]["filename"]
  137. post_A[0]["thread"]=thread
  138. if subject in anchor_L:
  139. post_A[0]["anchor"]=1
  140. post_A[0]["ignore_count"]=2
  141.  
  142. # set the and-condition for the while loop
  143. if argX["-to"][0]==1:
  144. argX["-n"].append(-1)
  145.  
  146. elif argX["-n"][0]==1:
  147. argX["-to"].append(-1)
  148.  
  149. print("scraping threads...")
  150. i=0
  151. ### create an array containing threads and posts
  152. while (int(thread) >= int(argX["-to"][1])) and (int(argX["-n"][1])!=i):
  153. post_L=[] # new list for this iteration
  154. i=i+1
  155. # create a list of posts of current thread
  156. postmeta = driver.find_elements(By.TAG_NAME, "blockquote")
  157. for p in postmeta:
  158. post_L.append(p.text)
  159. post_A.append(post_L) # add list to array
  160.  
  161. # find backlink to previous thread
  162. # this will always be the first backlink it finds
  163. regex = re.search(">>[0-9]+", post_L[post_A[0]["anchor"]])
  164. # handle missing backlinks
  165. try:
  166. thread=regex.group()[2:]
  167. except:
  168. thread=""
  169. print(f"No backlink found to previous thread in {driver.current_url}")
  170. while thread=="":
  171. thread = input(f"Enter the threadnumber to continue or EXIT to start the wordcloud with {i} threads. Enter BREAK to exit without doing anything. ")
  172. if thread == "EXIT":
  173. break
  174. elif thread == "BREAK":
  175. sys.exit()
  176.  
  177. # go to previous thread
  178. url="https://warosu.org/vt/thread/"+thread
  179. driver.get(url)
  180.  
  181. print("exit selenium...")
  182. driver.close()
  183.  
  184. #post_A[0]["filename"]=post_A[0]["filename"]+"-"+thread
  185. post_A[0]["thread"]=thread+"-"+post_A[0]["thread"]
  186.  
  187. ### import array
  188. else:
  189. file=open(path+r"\wrdcld.txt", "r", encoding="utf-8")
  190. post_A=eval(file.read())
  191. ignore_count=post_A[0]["ignore_count"]
  192. file.close()
  193.  
  194. ### export array
  195. if argZ["-e"][0]==0:
  196. file=open(path+r"\wrdcld.txt", "w", encoding="utf-8")
  197. file.write(str(post_A))
  198. #print("array exported")
  199. file.close()
  200.  
  201. ### manipulate posts
  202. wordcloud_s=""
  203. for L in post_A[1:]:
  204. for p in L[post_A[0]["ignore_count"]:]:
  205. y=p
  206. y = y.replace("\n", " ")
  207. #y = re.sub('>>[0-9]+', "", y) # removing backlinks, redundant with standard settings
  208. y = re.sub("really.*","really", y) # reducing butt touch anon's ritual post to a single really
  209. y = re.sub(r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))''', "", y) #removing links
  210. y = y.replace(">", " ")
  211. wordcloud_s=wordcloud_s+y
  212.  
  213. ### unique word count
  214. if argZ["-wc"][0]==1:
  215. wordcloud_wc = wordcloud_s
  216. wordcloud_D={}
  217. wordcloud_wc = wordcloud_wc.lower()
  218. wordcloud_wc = wordcloud_wc.replace("'s", "")
  219. wordcloud_wc = re.sub('[^a-z ]', "", wordcloud_wc) # remove everything but letters
  220. wordcloud_L=wordcloud_wc.split(" ")
  221. for w in wordcloud_L:
  222. if w not in wordcloud_D:
  223. wordcloud_D[w]=1
  224. else:
  225. wordcloud_D[w]+=1
  226. #for d in wordcloud_D: print(d, wordcloud_D[d]) # print a list of all unique words and their frequency
  227. print(f"{len(wordcloud_D)-1} unique words in {len(post_A)-1} threads")
  228.  
  229.  
  230. # WordCloud settings
  231. # The documentation can be found here https://amueller.github.io/word_cloud/generated/wordcloud.WordCloud.html
  232. wc = WordCloud(
  233. background_color="#323232",
  234. colormap="Blues", # you can check usable cholorschemes here: https://matplotlib.org/stable/users/explain/colors/colormaps.html#sphx-glr-users-explain-colors-colormaps-py
  235. relative_scaling=0.8,
  236. font_step=8,
  237. font_path=r"C:\windows\fonts\impact.ttf", # changing the font will also lead to file size changes
  238. width=min(6400,(len(post_A)-1)*1000), # poor attempt at dynamically sizing the wordcloud
  239. height=min(6400,(len(post_A)-1)*1000),
  240. max_words=min(6400,(len(post_A)-1)*800),
  241. max_font_size=min(1200,(len(post_A)-1)*200),
  242. min_font_size=min(32,(len(post_A)-1)*2+10),
  243. #font_path=None,
  244. #width=4000,
  245. #height=4000,
  246. #max_words=2500,
  247. #max_font_size=None,
  248. #min_font_size=16,
  249. min_word_length=2,
  250. prefer_horizontal=1,
  251. collocations=False,
  252. include_numbers=False,
  253. )
  254.  
  255. ### Generate the wordcloud
  256. if argZ["-nocloud"][0]==1:
  257. print("no wordcloud generated")
  258. else:
  259. print("creating wordcloud, this may take a while...")
  260. wc.generate(wordcloud_s)
  261. wc.to_file(path+post_A[0]["filename"]+fileformat)
  262. print("wordcloud generated: "+post_A[0]["filename"][1:]+fileformat)
  263. print("done.")
Advertisement
Add Comment
Please, Sign In to add comment