Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env ruby
# Splits a string into its words, keeping the original letter case.
#
# A "word" is a maximal run of regexp word characters (`\w`: letters,
# digits and underscore); everything else acts as a separator.
#
# @param string [String] text to split
# @return [Array<String>] the words in order of appearance (empty if none)
def words_from_string(string)
  string.scan(/\w+/)
end
# Splits a string into its words after converting the text to lower case.
#
# A "word" is a maximal run of regexp word characters (`\w`).
#
# @param string [String] text to split
# @return [Array<String>] the lower-cased words in order of appearance
def words_from_string_lowercase(string)
  string.downcase.scan(/\w+/)
end
# Splits a string into its words after inverting the case of every letter
# (upper becomes lower and vice versa).
#
# A "word" is a maximal run of regexp word characters (`\w`).
#
# @param string [String] text to split
# @return [Array<String>] the case-swapped words in order of appearance
def words_from_string_swapcase(string)
  string.swapcase.scan(/\w+/)
end
# Splits a string into its words after converting the text to upper case.
#
# A "word" is a maximal run of regexp word characters (`\w`).
#
# @param string [String] text to split
# @return [Array<String>] the upper-cased words in order of appearance
def words_from_string_upcase(string)
  # Must be upcase: the earlier version called swapcase here, which merely
  # repeated words_from_string_swapcase and never produced upper-case words.
  string.upcase.scan(/\w+/)
end
# Adds one to the global $counts tally for every word in the list.
#
# $counts must already be a Hash whose default value is 0 (the script
# creates it with Hash.new(0) before processing any file).
#
# @param word_list [Array<String>] words to tally; duplicates count each time
# @return [Array<String>] the word_list that was passed in
def count_frequency(word_list)
  word_list.each { |word| $counts[word] += 1 }
end
# Reads one file and tallies four case variants of each of its words
# (as written, lower-cased, case-swapped and upper-cased) into $counts.
#
# Prints a timestamp and a progress line first so long runs can be followed.
#
# NOTE(review): File.read uses the default external encoding; a word list in
# a different encoding may make the word scan raise -- confirm against the
# real input files.
#
# @param file [String] path of the file to read
# @return [Array<String>] the upper-cased word list (result of the last tally)
# @raise [SystemCallError] if the file cannot be read (e.g. Errno::ENOENT)
def do_it(file)
  # Formatted in-process to resemble the default output of `date`, instead of
  # spawning a `date` subprocess for every file.
  puts Time.now.strftime("%a %b %e %H:%M:%S %Z %Y")
  puts "Doing a new file!"

  raw_text = File.read(file)

  count_frequency(words_from_string(raw_text))
  count_frequency(words_from_string_lowercase(raw_text))
  count_frequency(words_from_string_swapcase(raw_text))
  count_frequency(words_from_string_upcase(raw_text))
end
# Paths of every input file to process, in processing order. The first three
# entries sit next to the script; the rest are word lists under txt4/.
files = %w[
  short1.txt
  long1.txt
  toolong1.txt
  txt4/1/0...9999999.dic
  txt4/1/0...ffffff.dic
  txt4/1/10001fr.equ
  txt4/1/10002fr.equ
  txt4/1/10196pla.ces
  txt4/1/113809of.fic
  txt4/1/1185kjvf.req
  txt4/1/1984.txt
  txt4/1/1lower.lst
  txt4/1/1mixed.lst
  txt4/1/21986na.mes
  txt4/1/256772co.mpo
  txt4/1/354984si.ngl
  txt4/1/366often.mis
  txt4/1/3897male.nam
  txt4/1/4160offi.cia
  txt4/1/467popul.arf
  txt4/1/4946fema.len
  txt4/1/6213acro.nym
  txt4/1/74550com.mon
  txt4/1/all.lst
  txt4/1/ALLUP&R.DIC
  txt4/1/all-word
  txt4/1/allwords
  txt4/1/Antworth
  txt4/1/arthur
  txt4/1/ASSurnames
  txt4/1/cmudict0.3
  txt4/1/Common.dic
  txt4/1/Cracklib.dic
  txt4/1/creadme
  txt4/1/D8.DIC
  txt4/1/Dates.dic
  txt4/1/des-how-to.txt
  txt4/1/dic-0294.txt
  txt4/1/dsgerman.txt
  txt4/1/Dutch.dic
  txt4/1/English.dic
  txt4/1/fdsitalian.txt
  txt4/1/fdsjapanese.txt
  txt4/1/ffreadme
  txt4/1/ffrench.txt
  txt4/1/French.dic
  txt4/1/french.txt
  txt4/1/fsreadme
  txt4/1/GDict_v2.txt
  txt4/1/german.txt
  txt4/1/gsdspanish.txt
  txt4/1/italian.txt
  txt4/1/japanese.txt
  txt4/1/lower.lst
  txt4/1/mhyph.txt
  txt4/1/mixed.lst
  txt4/1/mobyposi.i
  txt4/1/mobypron.unc
  txt4/1/mobythes.aur
  txt4/1/phoneset.3
  txt4/1/qsdffsdq0readme
  txt4/1/qsdgreadme
  txt4/1/readme
  txt4/1/roget13a.txt
  txt4/1/rreadme
  txt4/1/sdgreadme
  txt4/1/sdsdqfdsfreadme.txt
  txt4/1/shakespe.are
  txt4/1/spanish.txt
  txt4/1/usaconst.itu
  txt4/1/CainandAbel.dic
  txt4/1/CaseMutation.dic
  txt4/1/Facebook(Usernames).dic
  txt4/2/0sd0freadme
  txt4/2/abbr
  txt4/2/all-words
  txt4/2/asteroids
  txt4/2/biology
  txt4/2/cartoon
  txt4/2/chars
  txt4/2/chinese
  txt4/2/common-passwords.txt
  txt4/2/etc-hosts
  txt4/2/famous
  txt4/2/fast-names
  txt4/2/female-names
  txt4/2/junk
  txt4/2/meh.txt
  txt4/2/NAMES.DIC
  txt4/2/n_common
  txt4/2/NORM&R.DIC
  txt4/2/ONEUP&R.DIC
  txt4/3/0README
  txt4/3/1dfg2README
  txt4/3/1README
  txt4/3/2README
  txt4/3/3README
  txt4/3/computer.names
  txt4/3/dfgREADME
  txt4/3/dsfREADME
  txt4/3/dutch.maybe
  txt4/3/dutch.trash
  txt4/3/dutch.words
  txt4/3/english.abbrs
  txt4/3/english.maybe
  txt4/3/english.names
  txt4/3/english.trash
  txt4/3/english.words
  txt4/3/german.trash
  txt4/3/german.words
  txt4/3/italian.trash
  txt4/3/italian.words
  txt4/3/kjbible
  txt4/3/male-names
  txt4/3/misc.names
  txt4/3/movies
  txt4/3/myths-legends
  txt4/3/norwegian.trash
  txt4/3/norwegian.words
  txt4/3/numbers
  txt4/3/org.names
  txt4/3/other-names
  txt4/3/phrases
  txt4/3/places
  txt4/3/sf
  txt4/3/shakespeare
  txt4/3/sports
  txt4/3/surnames
  txt4/3/swedish.trash
  txt4/3/swedish.words
  txt4/3/yiddish
  txt4/4/1sfreja.diku.dk
  txt4/4/1web2
  txt4/4/Fandboken
  txt4/4/ffospd
  txt4/4/firstnames.finnish
  txt4/4/germanl
  txt4/4/hindu-names
  txt4/4/mhg@lance.hss.bu.oz.au.txt
  txt4/4/qsdffreja.diku.dk
  txt4/4/qsfdsqfqsf0ftp.cs.vu.nl
  txt4/4/qsfsource.unknown
  txt4/4/README.ftp.funet.fi
  txt4/4/sdg1ftp.cs.vu.nl
  txt4/4/sffreja.diku.dk
  txt4/4/sfweb2a
  txt4/4/sqfqssdfwietze@swi.psy.uva.nl
  txt4/4/sqqsqqsasimtel20.army.mil
  txt4/4/tzeftp.uu.net
  txt4/4/web2
  txt4/4/web2a
  txt4/4/words.dutch
  txt4/4/words.english
  txt4/4/words.finnish
  txt4/4/words.finnish.FAQ
  txt4/4/words.german
  txt4/4/words.italian
  txt4/4/words.norwegian
  txt4/4/words.swedish
  txt4/5/124freja.diku.dk
  txt4/5/4ghdsource.unknown
  txt4/5/4s7etechnik.vok
  txt4/5/d3s3g1freja.diku.dk
  txt4/5/dfg.waseda.ac.jp
  txt4/5/greyftp.cs.vu.nl-2
  txt4/5/HUGE-words.nic.funet.fi
  txt4/5/latin2
  txt4/5/names.hp.ycy
  txt4/5/qs2d4ggerman-wordlist.new
  txt4/5/qsdfarcher@frmug.fr.mugnet.org
  txt4/5/s2gidioms.vok
  txt4/5/sd4gsource.unknown
  txt4/5/sdg4ftp.cs.vu.nl-1
  txt4/5/sgdqfexercise.vok
  txt4/5/treftp.cs.vu.nl
  txt4/5/words.nic.funet.fi
  txt4/6/Antworth
  txt4/6/CIS
  txt4/6/Colleges
  txt4/6/CRL.words
  txt4/6/Domains
  txt4/6/Dosref
  txt4/6/etc-hosts
  txt4/6/Ethnologue
  txt4/6/Ftpsites
  txt4/6/Jargon
  txt4/6/Koran
  txt4/6/LCarrol
  txt4/6/Movies
  txt4/6/Paradise.Lost
  txt4/6/Python
  txt4/6/README
  txt4/6/Roget.words
  txt4/6/Trek
  txt4/6/Unabr.dict
  txt4/6/Unix.dict
  txt4/6/World.factbook
  txt4/6/Zipcodes
  txt4/7/abbr
  txt4/7/all-words
  txt4/7/asteroids
  txt4/7/biology
  txt4/7/cartoon
  txt4/7/chars
  txt4/7/chinese
  txt4/7/common-passwords.txt
  txt4/7/etc-hosts
  txt4/7/famous
  txt4/7/fast-names
  txt4/7/female-names
  txt4/7/junk
  txt4/7/kjbible
  txt4/7/male-names
  txt4/7/movies
  txt4/7/myths-legends
  txt4/7/numbers
  txt4/7/other-names
  txt4/7/phrases
  txt4/7/places
  txt4/7/sf
  txt4/7/shakespeare
  txt4/7/sports
  txt4/7/surnames
  txt4/7/yiddish
  txt4/8/asteroids
  txt4/8/bsd-words
  txt4/8/cars
  txt4/8/cartoons
  txt4/8/chinese
  txt4/8/CIS.DIC
  txt4/8/computer-companies
  txt4/8/crackdict
  txt4/8/dictionaries
  txt4/8/digits
  txt4/8/ego
  txt4/8/famous
  txt4/8/fantasy
  txt4/8/geography
  txt4/8/GNU-wordlist
  txt4/8/greek
  txt4/8/hackdict
  txt4/8/hosts
  txt4/8/jargon
  txt4/8/JUNK.DIC
  txt4/8/look.freja.diku.dk
  txt4/8/misc
  txt4/8/misc.other
  txt4/8/music
  txt4/8/phonenums
  txt4/8/phrases
  txt4/8/PHRASES.DIC
  txt4/8/Purdue
  txt4/8/rsk.dict
  txt4/8/sequences
  txt4/8/sports
  txt4/8/stava
  txt4/8/unix
  txt4/8/webster.phrases
  txt4/8/words.bad
  txt4/8/wormlist
  txt4/9/names1
  txt4/9/names10
  txt4/9/names10.source
  txt4/9/names11
  txt4/9/names11.source
  txt4/9/names12
  txt4/9/names12.source
  txt4/9/names13
  txt4/9/names13.source
  txt4/9/names14
  txt4/9/names14.source
  txt4/9/names15
  txt4/9/names15.source
  txt4/9/names16
  txt4/9/names16.source
  txt4/9/names17
  txt4/9/names17.source
  txt4/9/names18
  txt4/9/names18.source
  txt4/9/names19
  txt4/9/names19.source
  txt4/9/names1.source
  txt4/9/names2
  txt4/9/names20
  txt4/9/names20.source
  txt4/9/names21
  txt4/9/names21.source
  txt4/9/names22
  txt4/9/names22.source
  txt4/9/names23
  txt4/9/names23.source
  txt4/9/names24
  txt4/9/names24.source
  txt4/9/names25
  txt4/9/names25.source
  txt4/9/names26
  txt4/9/names26.source
  txt4/9/names27
  txt4/9/names27.source
  txt4/9/names28
  txt4/9/names28.source
  txt4/9/names29
  txt4/9/names29.source
  txt4/9/names2.source
  txt4/9/names3
  txt4/9/names30
  txt4/9/names30.source
  txt4/9/names31
  txt4/9/names31.source
  txt4/9/names32
  txt4/9/names32.source
  txt4/9/names33
  txt4/9/names33.source
  txt4/9/names34
  txt4/9/names34.source
  txt4/9/names35
  txt4/9/names35.source
  txt4/9/names36
  txt4/9/names36.source
  txt4/9/names37
  txt4/9/names37.source
  txt4/9/names38
  txt4/9/names38.source
  txt4/9/names39
  txt4/9/names39.source
  txt4/9/names3.source
  txt4/9/names4
  txt4/9/names4.source
  txt4/9/names5
  txt4/9/names5.source
  txt4/9/names6
  txt4/9/names6.source
  txt4/9/names7
  txt4/9/names7.source
  txt4/9/names8
  txt4/9/names8.source
  txt4/9/names9
  txt4/9/names9.source
  txt4/10/4ftp.cs.vu.nl
  txt4/10/acronyms.txt
  txt4/10/allwords2
  txt4/10/foldoc.txt
  txt4/10/ftp.cs.vu.nl
  txt4/10/ftp.uu.net
  txt4/10/source.unknown
  txt4/11/4fREADME.TXT
  txt4/11/abbr
  txt4/11/Antworth
  txt4/11/ASSurnames
  txt4/11/asteroids
  txt4/11/biology
  txt4/11/cartoon
  txt4/11/chars
  txt4/11/chinese
  txt4/11/CIS
  txt4/11/Colleges
  txt4/11/common-passwords.txt
  txt4/11/Congress
  txt4/11/CRL.words
  txt4/11/danish.words
  txt4/11/dico
  txt4/11/Domains
  txt4/11/Dosref
  txt4/11/etc-hosts
  txt4/11/Ethnologue
  txt4/11/Family-Names
  txt4/11/famous
  txt4/11/fast-names
  txt4/11/female-names
  txt4/11/Ftpsites
  txt4/11/germanl
  txt4/11/Given-Names
  txt4/11/Jargon
  txt4/11/junk
  txt4/11/kjbible
  txt4/11/Koran
  txt4/11/LCarrol
  txt4/11/male-names
  txt4/11/Movies
  txt4/11/myths-legends
  txt4/11/names.french
  txt4/11/names.hp
  txt4/11/numbers
  txt4/11/other-names
  txt4/11/oz
  txt4/11/Paradise.Lost
  txt4/11/phrases
  txt4/11/places
  txt4/11/Python
  txt4/11/README
  txt4/11/README.txt
  txt4/11/Roget.words
  txt4/11/sf
  txt4/11/shakespeare
  txt4/11/sports
  txt4/11/surnames.finnish
  txt4/11/Trek
  txt4/11/Unabr.dict
  txt4/11/Unix.dict
  txt4/11/words.dutch
  txt4/11/words.german
  txt4/11/words.italian
  txt4/11/words.japanese
  txt4/11/words.norwegian
  txt4/11/words.spanish
  txt4/11/words.swedish
  txt4/11/World.factbook
  txt4/11/yiddish
  txt4/11/Zipcodes
]
# Tally the words of every listed file into the global $counts hash.
puts "number of files to process: #{files.length}"

# Default value 0 lets count_frequency increment unseen words directly.
$counts = Hash.new(0)

files.each { |file| do_it(file) }
# Write every distinct word collected in $counts, sorted, into one of three
# files chosen by word length:
#   short.txt   - fewer than 8 characters
#   long.txt    - 8 to 63 characters
#   toolong.txt - 64 characters or more
# All three files are created (or truncated) even when a bucket is empty.
puts "Start sorting!"
sorted_words = $counts.keys.sort

puts "Started writing!"
short_words, longer_words = sorted_words.partition { |word| word.length < 8 }
long_words, too_long_words = longer_words.partition { |word| word.length < 64 }

{
  "short.txt" => short_words,
  "long.txt" => long_words,
  "toolong.txt" => too_long_words
}.each do |path, words|
  # Block form closes the handle even if a write raises.
  File.open(path, "w") do |out|
    words.each { |word| out.puts word }
  end
end
Add Comment
Please, Sign In to add comment