Guest User

Untitled

a guest
Apr 22nd, 2018
204
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 14.44 KB | None | 0 0
  1. #!/usr/bin/env ruby
  2.  
  # Splits +string+ into words, keeping the original letter case.
  #
  # A word is a maximal run of regexp word characters (+\w+); every other
  # character acts as a separator and is discarded.
  #
  # @param string [String] text to split
  # @return [Array<String>] the words in order of appearance, duplicates kept
  def words_from_string(string)
    string.scan(/\w+/)
  end
  6.  
  # Lower-cases +string+ and splits the result into words.
  #
  # A word is a maximal run of regexp word characters (+\w+); every other
  # character acts as a separator and is discarded.
  #
  # @param string [String] text to split
  # @return [Array<String>] the lower-cased words in order of appearance
  def words_from_string_lowercase(string)
    string.downcase.scan(/\w+/)
  end
  10.  
  # Inverts the case of every letter in +string+ and splits the result into
  # words.
  #
  # A word is a maximal run of regexp word characters (+\w+); every other
  # character acts as a separator and is discarded.
  #
  # @param string [String] text to split
  # @return [Array<String>] the case-swapped words in order of appearance
  def words_from_string_swapcase(string)
    string.swapcase.scan(/\w+/)
  end
  14.  
  # Upper-cases +string+ and splits the result into words.
  #
  # A word is a maximal run of regexp word characters (+\w+); every other
  # character acts as a separator and is discarded.
  #
  # @param string [String] text to split
  # @return [Array<String>] the upper-cased words in order of appearance
  def words_from_string_upcase(string)
    # Must be upcase: swapcase here would turn "Hello" into "hELLO" rather
    # than "HELLO" and no upper-case variants would ever be produced.
    string.upcase.scan(/\w+/)
  end
  18.  
  # Adds one to the global tally <tt>$counts</tt> for every entry of
  # +word_list+.
  #
  # <tt>$counts</tt> must already hold a Hash whose default value is numeric
  # (the script sets it to <tt>Hash.new(0)</tt> before any file is processed);
  # otherwise the first increment of an unseen word fails on +nil+.
  #
  # @param word_list [Array<String>] words to tally; duplicates count each time
  # @return [Array<String>] +word_list+ itself, not the tally
  def count_frequency(word_list)
    word_list.each { |word| $counts[word] += 1 }
  end
  26.  
  # Reads +file+ and tallies its words in four case variants: as written,
  # lower-cased, case-swapped and upper-cased.
  #
  # Prints a timestamp and a progress line first. The tallying is delegated to
  # count_frequency, which updates the global <tt>$counts</tt>.
  #
  # @param file [String] path of the text file to read
  # @return [Array<String>] the word list of the last variant processed
  # @raise [SystemCallError] if +file+ cannot be read (raised by File.read)
  def do_it(file)
    # Same layout as the default output of date(1), without spawning a
    # subprocess for every file.
    puts Time.now.strftime("%a %b %e %H:%M:%S %Z %Y")
    puts "Doing a new file!"

    # The whole file is held in memory and scanned once per case variant.
    raw_text = File.read(file)

    count_frequency(words_from_string(raw_text))
    count_frequency(words_from_string_lowercase(raw_text))
    count_frequency(words_from_string_swapcase(raw_text))
    count_frequency(words_from_string_upcase(raw_text))
  end
  40.  
  # Paths of every input word list, in processing order. Paths are relative,
  # so they resolve against the directory the script is run from.
  files = %w[
    short1.txt
    long1.txt
    toolong1.txt
    txt4/1/0...9999999.dic
    txt4/1/0...ffffff.dic
    txt4/1/10001fr.equ
    txt4/1/10002fr.equ
    txt4/1/10196pla.ces
    txt4/1/113809of.fic
    txt4/1/1185kjvf.req
    txt4/1/1984.txt
    txt4/1/1lower.lst
    txt4/1/1mixed.lst
    txt4/1/21986na.mes
    txt4/1/256772co.mpo
    txt4/1/354984si.ngl
    txt4/1/366often.mis
    txt4/1/3897male.nam
    txt4/1/4160offi.cia
    txt4/1/467popul.arf
    txt4/1/4946fema.len
    txt4/1/6213acro.nym
    txt4/1/74550com.mon
    txt4/1/all.lst
    txt4/1/ALLUP&R.DIC
    txt4/1/all-word
    txt4/1/allwords
    txt4/1/Antworth
    txt4/1/arthur
    txt4/1/ASSurnames
    txt4/1/cmudict0.3
    txt4/1/Common.dic
    txt4/1/Cracklib.dic
    txt4/1/creadme
    txt4/1/D8.DIC
    txt4/1/Dates.dic
    txt4/1/des-how-to.txt
    txt4/1/dic-0294.txt
    txt4/1/dsgerman.txt
    txt4/1/Dutch.dic
    txt4/1/English.dic
    txt4/1/fdsitalian.txt
    txt4/1/fdsjapanese.txt
    txt4/1/ffreadme
    txt4/1/ffrench.txt
    txt4/1/French.dic
    txt4/1/french.txt
    txt4/1/fsreadme
    txt4/1/GDict_v2.txt
    txt4/1/german.txt
    txt4/1/gsdspanish.txt
    txt4/1/italian.txt
    txt4/1/japanese.txt
    txt4/1/lower.lst
    txt4/1/mhyph.txt
    txt4/1/mixed.lst
    txt4/1/mobyposi.i
    txt4/1/mobypron.unc
    txt4/1/mobythes.aur
    txt4/1/phoneset.3
    txt4/1/qsdffsdq0readme
    txt4/1/qsdgreadme
    txt4/1/readme
    txt4/1/roget13a.txt
    txt4/1/rreadme
    txt4/1/sdgreadme
    txt4/1/sdsdqfdsfreadme.txt
    txt4/1/shakespe.are
    txt4/1/spanish.txt
    txt4/1/usaconst.itu
    txt4/1/CainandAbel.dic
    txt4/1/CaseMutation.dic
    txt4/1/Facebook(Usernames).dic
    txt4/2/0sd0freadme
    txt4/2/abbr
    txt4/2/all-words
    txt4/2/asteroids
    txt4/2/biology
    txt4/2/cartoon
    txt4/2/chars
    txt4/2/chinese
    txt4/2/common-passwords.txt
    txt4/2/etc-hosts
    txt4/2/famous
    txt4/2/fast-names
    txt4/2/female-names
    txt4/2/junk
    txt4/2/meh.txt
    txt4/2/NAMES.DIC
    txt4/2/n_common
    txt4/2/NORM&R.DIC
    txt4/2/ONEUP&R.DIC
    txt4/3/0README
    txt4/3/1dfg2README
    txt4/3/1README
    txt4/3/2README
    txt4/3/3README
    txt4/3/computer.names
    txt4/3/dfgREADME
    txt4/3/dsfREADME
    txt4/3/dutch.maybe
    txt4/3/dutch.trash
    txt4/3/dutch.words
    txt4/3/english.abbrs
    txt4/3/english.maybe
    txt4/3/english.names
    txt4/3/english.trash
    txt4/3/english.words
    txt4/3/german.trash
    txt4/3/german.words
    txt4/3/italian.trash
    txt4/3/italian.words
    txt4/3/kjbible
    txt4/3/male-names
    txt4/3/misc.names
    txt4/3/movies
    txt4/3/myths-legends
    txt4/3/norwegian.trash
    txt4/3/norwegian.words
    txt4/3/numbers
    txt4/3/org.names
    txt4/3/other-names
    txt4/3/phrases
    txt4/3/places
    txt4/3/sf
    txt4/3/shakespeare
    txt4/3/sports
    txt4/3/surnames
    txt4/3/swedish.trash
    txt4/3/swedish.words
    txt4/3/yiddish
    txt4/4/1sfreja.diku.dk
    txt4/4/1web2
    txt4/4/Fandboken
    txt4/4/ffospd
    txt4/4/firstnames.finnish
    txt4/4/germanl
    txt4/4/hindu-names
    txt4/4/mhg@lance.hss.bu.oz.au.txt
    txt4/4/qsdffreja.diku.dk
    txt4/4/qsfdsqfqsf0ftp.cs.vu.nl
    txt4/4/qsfsource.unknown
    txt4/4/README.ftp.funet.fi
    txt4/4/sdg1ftp.cs.vu.nl
    txt4/4/sffreja.diku.dk
    txt4/4/sfweb2a
    txt4/4/sqfqssdfwietze@swi.psy.uva.nl
    txt4/4/sqqsqqsasimtel20.army.mil
    txt4/4/tzeftp.uu.net
    txt4/4/web2
    txt4/4/web2a
    txt4/4/words.dutch
    txt4/4/words.english
    txt4/4/words.finnish
    txt4/4/words.finnish.FAQ
    txt4/4/words.german
    txt4/4/words.italian
    txt4/4/words.norwegian
    txt4/4/words.swedish
    txt4/5/124freja.diku.dk
    txt4/5/4ghdsource.unknown
    txt4/5/4s7etechnik.vok
    txt4/5/d3s3g1freja.diku.dk
    txt4/5/dfg.waseda.ac.jp
    txt4/5/greyftp.cs.vu.nl-2
    txt4/5/HUGE-words.nic.funet.fi
    txt4/5/latin2
    txt4/5/names.hp.ycy
    txt4/5/qs2d4ggerman-wordlist.new
    txt4/5/qsdfarcher@frmug.fr.mugnet.org
    txt4/5/s2gidioms.vok
    txt4/5/sd4gsource.unknown
    txt4/5/sdg4ftp.cs.vu.nl-1
    txt4/5/sgdqfexercise.vok
    txt4/5/treftp.cs.vu.nl
    txt4/5/words.nic.funet.fi
    txt4/6/Antworth
    txt4/6/CIS
    txt4/6/Colleges
    txt4/6/CRL.words
    txt4/6/Domains
    txt4/6/Dosref
    txt4/6/etc-hosts
    txt4/6/Ethnologue
    txt4/6/Ftpsites
    txt4/6/Jargon
    txt4/6/Koran
    txt4/6/LCarrol
    txt4/6/Movies
    txt4/6/Paradise.Lost
    txt4/6/Python
    txt4/6/README
    txt4/6/Roget.words
    txt4/6/Trek
    txt4/6/Unabr.dict
    txt4/6/Unix.dict
    txt4/6/World.factbook
    txt4/6/Zipcodes
    txt4/7/abbr
    txt4/7/all-words
    txt4/7/asteroids
    txt4/7/biology
    txt4/7/cartoon
    txt4/7/chars
    txt4/7/chinese
    txt4/7/common-passwords.txt
    txt4/7/etc-hosts
    txt4/7/famous
    txt4/7/fast-names
    txt4/7/female-names
    txt4/7/junk
    txt4/7/kjbible
    txt4/7/male-names
    txt4/7/movies
    txt4/7/myths-legends
    txt4/7/numbers
    txt4/7/other-names
    txt4/7/phrases
    txt4/7/places
    txt4/7/sf
    txt4/7/shakespeare
    txt4/7/sports
    txt4/7/surnames
    txt4/7/yiddish
    txt4/8/asteroids
    txt4/8/bsd-words
    txt4/8/cars
    txt4/8/cartoons
    txt4/8/chinese
    txt4/8/CIS.DIC
    txt4/8/computer-companies
    txt4/8/crackdict
    txt4/8/dictionaries
    txt4/8/digits
    txt4/8/ego
    txt4/8/famous
    txt4/8/fantasy
    txt4/8/geography
    txt4/8/GNU-wordlist
    txt4/8/greek
    txt4/8/hackdict
    txt4/8/hosts
    txt4/8/jargon
    txt4/8/JUNK.DIC
    txt4/8/look.freja.diku.dk
    txt4/8/misc
    txt4/8/misc.other
    txt4/8/music
    txt4/8/phonenums
    txt4/8/phrases
    txt4/8/PHRASES.DIC
    txt4/8/Purdue
    txt4/8/rsk.dict
    txt4/8/sequences
    txt4/8/sports
    txt4/8/stava
    txt4/8/unix
    txt4/8/webster.phrases
    txt4/8/words.bad
    txt4/8/wormlist
    txt4/9/names1
    txt4/9/names10
    txt4/9/names10.source
    txt4/9/names11
    txt4/9/names11.source
    txt4/9/names12
    txt4/9/names12.source
    txt4/9/names13
    txt4/9/names13.source
    txt4/9/names14
    txt4/9/names14.source
    txt4/9/names15
    txt4/9/names15.source
    txt4/9/names16
    txt4/9/names16.source
    txt4/9/names17
    txt4/9/names17.source
    txt4/9/names18
    txt4/9/names18.source
    txt4/9/names19
    txt4/9/names19.source
    txt4/9/names1.source
    txt4/9/names2
    txt4/9/names20
    txt4/9/names20.source
    txt4/9/names21
    txt4/9/names21.source
    txt4/9/names22
    txt4/9/names22.source
    txt4/9/names23
    txt4/9/names23.source
    txt4/9/names24
    txt4/9/names24.source
    txt4/9/names25
    txt4/9/names25.source
    txt4/9/names26
    txt4/9/names26.source
    txt4/9/names27
    txt4/9/names27.source
    txt4/9/names28
    txt4/9/names28.source
    txt4/9/names29
    txt4/9/names29.source
    txt4/9/names2.source
    txt4/9/names3
    txt4/9/names30
    txt4/9/names30.source
    txt4/9/names31
    txt4/9/names31.source
    txt4/9/names32
    txt4/9/names32.source
    txt4/9/names33
    txt4/9/names33.source
    txt4/9/names34
    txt4/9/names34.source
    txt4/9/names35
    txt4/9/names35.source
    txt4/9/names36
    txt4/9/names36.source
    txt4/9/names37
    txt4/9/names37.source
    txt4/9/names38
    txt4/9/names38.source
    txt4/9/names39
    txt4/9/names39.source
    txt4/9/names3.source
    txt4/9/names4
    txt4/9/names4.source
    txt4/9/names5
    txt4/9/names5.source
    txt4/9/names6
    txt4/9/names6.source
    txt4/9/names7
    txt4/9/names7.source
    txt4/9/names8
    txt4/9/names8.source
    txt4/9/names9
    txt4/9/names9.source
    txt4/10/4ftp.cs.vu.nl
    txt4/10/acronyms.txt
    txt4/10/allwords2
    txt4/10/foldoc.txt
    txt4/10/ftp.cs.vu.nl
    txt4/10/ftp.uu.net
    txt4/10/source.unknown
    txt4/11/4fREADME.TXT
    txt4/11/abbr
    txt4/11/Antworth
    txt4/11/ASSurnames
    txt4/11/asteroids
    txt4/11/biology
    txt4/11/cartoon
    txt4/11/chars
    txt4/11/chinese
    txt4/11/CIS
    txt4/11/Colleges
    txt4/11/common-passwords.txt
    txt4/11/Congress
    txt4/11/CRL.words
    txt4/11/danish.words
    txt4/11/dico
    txt4/11/Domains
    txt4/11/Dosref
    txt4/11/etc-hosts
    txt4/11/Ethnologue
    txt4/11/Family-Names
    txt4/11/famous
    txt4/11/fast-names
    txt4/11/female-names
    txt4/11/Ftpsites
    txt4/11/germanl
    txt4/11/Given-Names
    txt4/11/Jargon
    txt4/11/junk
    txt4/11/kjbible
    txt4/11/Koran
    txt4/11/LCarrol
    txt4/11/male-names
    txt4/11/Movies
    txt4/11/myths-legends
    txt4/11/names.french
    txt4/11/names.hp
    txt4/11/numbers
    txt4/11/other-names
    txt4/11/oz
    txt4/11/Paradise.Lost
    txt4/11/phrases
    txt4/11/places
    txt4/11/Python
    txt4/11/README
    txt4/11/README.txt
    txt4/11/Roget.words
    txt4/11/sf
    txt4/11/shakespeare
    txt4/11/sports
    txt4/11/surnames.finnish
    txt4/11/Trek
    txt4/11/Unabr.dict
    txt4/11/Unix.dict
    txt4/11/words.dutch
    txt4/11/words.german
    txt4/11/words.italian
    txt4/11/words.japanese
    txt4/11/words.norwegian
    txt4/11/words.spanish
    txt4/11/words.swedish
    txt4/11/World.factbook
    txt4/11/yiddish
    txt4/11/Zipcodes
  ]
  452.  
  puts "number of files to process: #{files.length}"

  # Global word tally shared with count_frequency; a default of 0 lets unseen
  # words be incremented directly.
  $counts = Hash.new(0)

  files.each { |file| do_it(file) }

  puts "Start sorting!"

  # Only the words themselves are written out, so the counts are dropped here.
  sorted = $counts.keys.sort

  puts "Started writing!"

  # Block form closes each output file even if writing raises part-way.
  File.open("short.txt", "w") do |shortfile|
    File.open("long.txt", "w") do |longfile|
      File.open("toolong.txt", "w") do |toolongfile|
        sorted.each do |word|
          # Bucket by length: under 8 chars, 8 to 63 chars, 64 chars and up.
          if word.length < 8
            shortfile.puts word
          elsif word.length < 64
            longfile.puts word
          else
            toolongfile.puts word
          end
        end
      end
    end
  end
Add Comment
Please, Sign In to add comment