Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#!/usr/bin/ruby
# CS132A - Ruby - Lab 3 - Text Analyzer
# Author: Zach
# Purpose: Utilize Ruby's File class to analyze text files
#          and generate statistical information therein.

# Stop-word list (one word per line), looked up in the working directory.
STOP_WORDS_FILE = 'stop_words.txt'
# A sentence ends at a period, a question mark or an exclamation mark.
SENTENCE_END = /\.|\?|!/
# Paragraphs are separated by one or more blank (or whitespace-only) lines.
PARAGRAPH_BREAK = /\n\s*\n/

# Reads the stop-word list from +path+.
#
# @param path [String] file holding one stop word per line
# @return [Array<String>] stripped, non-empty entries; an empty array (plus a
#   warning on stderr) when the file does not exist, so the analysis can
#   still run with every word treated as meaningful
def load_stop_words(path)
  unless File.file?(path)
    warn "Warning: #{path} not found; treating every word as meaningful."
    return []
  end
  File.readlines(path).map(&:strip).reject(&:empty?)
end

STOPWORDS = load_stop_words(STOP_WORDS_FILE).freeze

# Integer division that yields 0 instead of raising when the divisor is 0
# (an empty text has no words, sentences or paragraphs).
def safe_div(numerator, denominator)
  denominator.zero? ? 0 : numerator / denominator
end

# Lower-cases +word+ and trims leading/trailing non-alphanumeric characters so
# that "The" and "the," both compare equal to the stop word "the".
def normalize_word(word)
  word.downcase.gsub(/\A[^[:alnum:]]+|[^[:alnum:]]+\z/, '')
end

# Splits +text+ into sentences on SENTENCE_END after collapsing whitespace.
# Fragments that are empty or whitespace-only (e.g. after "!!" or a trailing
# newline) are dropped so they are not counted as sentences.
#
# @return [Array<String>] stripped sentences without their end punctuation
def split_sentences(text)
  text.gsub(/\s+/, ' ').split(SENTENCE_END).map(&:strip).reject(&:empty?)
end

# Counts the non-blank paragraphs of +text+.
def count_paragraphs(text)
  text.split(PARAGRAPH_BREAK).count { |paragraph| !paragraph.strip.empty? }
end

# Returns the tokens of +words+ whose normalized form is not a stop word.
# The original tokens (with their case and punctuation) are returned.
#
# @param words [Array<String>] whitespace-separated tokens
# @param stopwords [Array<String>] words to filter out (case-insensitive)
def meaningful_words(words, stopwords)
  lookup = stopwords.each_with_object({}) { |stop, seen| seen[stop.downcase] = true }
  words.reject { |word| lookup.key?(normalize_word(word)) }
end

# Returns up to +limit+ most frequent entries of +words+, most frequent first.
# Ties keep first-occurrence order so the result is deterministic.
def most_common_words(words, limit = 10)
  counts = words.each_with_object(Hash.new(0)) { |word, tally| tally[word] += 1 }
  ranked = counts.each_with_index.sort_by { |(_word, count), index| [-count, index] }
  ranked.first(limit).map { |(word, _count), _index| word }
end

# Picks "ideal" sentences: sorts by length, takes the slice that starts at
# index (count / 7) with length (count / 7) + 1, and keeps only the sentences
# that contain " is" or " are" followed by a non-word character.
def ideal_sentences(sentences)
  by_length = sentences.sort_by(&:length)
  offset = by_length.length / 7
  by_length.slice(offset, offset + 1).select { |sentence| sentence =~ /\sis\W|\sare\W/ }
end

# Computes all statistics for +text+.
#
# @param text [String] full contents of one text file
# @param stopwords [Array<String>] words considered "fluff"
# @return [Hash] counts, percentages (integers), keywords, common words and
#   ideal sentences; every average/percentage is 0 for an empty text
def analyze_text(text, stopwords = STOPWORDS)
  words = text.split
  sentences = split_sentences(text)
  keywords = meaningful_words(words, stopwords)
  paragraph_count = count_paragraphs(text)

  {
    line_count: text.lines.count, # lines of the content, not the file name length
    total_chars: text.length,
    total_chars_no_space: text.gsub(/\s+/, '').length,
    word_count: words.length,
    sentence_count: sentences.length,
    paragraph_count: paragraph_count,
    keywords: keywords,
    # Integer arithmetic avoids float truncation (29/100 must give 29, not 28).
    good_percent: safe_div(keywords.length * 100, words.length),
    sentences_per_paragraph: safe_div(sentences.length, paragraph_count),
    words_per_sentence: safe_div(words.length, sentences.length),
    common_words: most_common_words(keywords),
    ideal_sentences: ideal_sentences(sentences)
  }
end

# Prints the report for one analyzed text to stdout.
#
# @param stats [Hash] result of analyze_text
def print_report(stats)
  # Stop Words Processing
  puts "Meaningful Words"
  puts
  puts stats[:keywords].join(' ')
  # Ideal Sentences
  puts "Ideal sentences from text"
  puts
  # Common Words
  puts "10 Most common words from text"
  puts stats[:common_words]
  puts
  # Output Statements
  puts "Innagural Speech Statistics"
  puts "The total number of lines in the inaugural speech is #{stats[:line_count]}."
  puts "The total number of characters in the first part is #{stats[:total_chars]}."
  puts "The total number of characters less whitespace is #{stats[:total_chars_no_space]}."
  puts "The total number of words is #{stats[:word_count]}."
  puts "The total number of sentences is #{stats[:sentence_count]}."
  puts "The total number of paragraphs is #{stats[:paragraph_count]}."
  puts "The average sentences per paragraph is #{stats[:sentences_per_paragraph]}."
  puts "The average words per sentence is #{stats[:words_per_sentence]}."
  puts "#{stats[:good_percent]}% of all words in the text are non-fluff words."
  puts
  puts "The ideal sentences are:\n\n#{stats[:ideal_sentences].join('. ')}"
  puts
end

# Analyze every text file in the working directory except the stop-word list
# itself (it also matches the *.txt glob but is not a text to report on).
your_text_files = Dir.glob("*.txt").reject { |path| path == STOP_WORDS_FILE }.sort
your_text_files.each do |text_file|
  print_report(analyze_text(File.read(text_file)))
end
Add Comment
Please, Sign In to add comment