Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/ruby
- # Text Analyzer
- # Author: Zach
- # Purpose: Utilize Ruby to analyze Text files
- # and generate statistical information therein.
- require 'cgi'
# Words that are ignored when keyword statistics are computed, loaded one per
# line from conf/stop_words.txt (path is relative to the working directory).
# File.readlines is required here: File.read returns a single String, and
# String has no #map on Ruby 1.9 and later.
STOPWORDS = File.readlines('conf/stop_words.txt').map { |line| line.chomp }.freeze
# Tally the characters in +text+.
#
# Returns a hash with two entries:
#   :tot_chars          - length of the whole string
#   :tot_chars_no_space - number of characters that are not whitespace
def count_chars(text)
  non_blank = text.scan(/\S/)

  totals = {}
  totals[:tot_chars] = text.length
  totals[:tot_chars_no_space] = non_blank.length
  totals
end
# Count the words, sentences, paragraphs and lines in +text+ and collect its
# keywords.
#
# Returns a hash with:
#   :word_count - number of whitespace-separated tokens
#   :sent_count - number of pieces obtained by splitting on '.', '?' or '!'
#   :para_count - number of pieces obtained by splitting on a blank line ("\n\n")
#   :line_count - number of lines in the text
#   :keywords   - whitespace-separated tokens that are not in STOPWORDS; the
#                 comparison is exact, so case and attached punctuation matter
def count_chunks(text)
  {
    :word_count => text.split.length,
    :sent_count => text.split(/\.|\?|!/).length,
    :para_count => text.split(/\n\n/).length,
    # Previously text.size, which is the character count, not the line count.
    :line_count => text.lines.count,
    :keywords => text.split(/\s+/).reject { |word| STOPWORDS.include?(word) }
  }
end
# Summarise how "useful" the vocabulary of a text is.
#
# word_count - total number of words in the text
# keywords   - array of words considered non-fluff (may contain repeats)
#
# Returns a hash with:
#   :pgw               - integer percentage (truncated) of keywords relative
#                        to word_count; 0 when word_count is zero
#   :most_common_words - up to ten most frequent keywords that are not in
#                        STOPWORDS, most frequent first, joined with '-- '
def useful_words(word_count, keywords)
  # Guard the empty-text case: 0.0 / 0.0 is NaN and NaN.to_i raises
  # FloatDomainError.
  percentage =
    if word_count.to_f.zero?
      0
    else
      ((keywords.length.to_f / word_count.to_f) * 100).to_i
    end

  ranked = (keywords - STOPWORDS)
           .group_by { |word| word }
           .sort_by { |_word, hits| -hits.length }

  {
    :pgw => percentage,
    :most_common_words => ranked.first(10).map(&:first).join('-- ')
  }
end
# Pick the "ideal" sentences out of +sentences+.
#
# The sentences are ordered by length; a window starting at index
# (count / 7) and spanning (count / 7) + 1 entries is taken from that
# ordering, and only the entries containing " is" or " are" followed by a
# non-word character are kept.
#
# Returns a hash of the form { :ideal_sentences => [...] }.
def ideal_sentences(sentences)
  by_length = sentences.sort_by(&:length)
  seventh = by_length.length / 7
  window = by_length[seventh, seventh + 1]
  chosen = window.grep(/\sis\W|\sare\W/)
  { :ideal_sentences => chosen }
end
# Render one plain-text report per entry of +stats+.
#
# stats - enumerable of hashes with the keys :text, :chars, :chunks, :words
#         and :sent, as read by the report template below
#
# Returns an array of report strings, in the same order as +stats+.
def build_reports(stats)
  stats.map do |stat|
    chunks = stat[:chunks]
    # Integer averages; a zero divisor (e.g. an empty file) yields 0 instead
    # of raising ZeroDivisionError.
    sentences_per_paragraph = whole_average(chunks[:sent_count], chunks[:para_count])
    words_per_sentence = whole_average(chunks[:word_count], chunks[:sent_count])

    # Heredoc content is kept flush-left so no indentation leaks into the report.
    <<-REPORT
#{stat[:text]} Inaugural Speech - Analysis Results
Total number of characters is: #{stat[:chars][:tot_chars]}.
Total number of characters less whitespace is: #{stat[:chars][:tot_chars_no_space]}
Total number of words is: #{chunks[:word_count]}.
Total number of sentences is: #{chunks[:sent_count]}
Total number of paragraphs is: #{chunks[:para_count]}
The average sentences per paragraph is: #{sentences_per_paragraph}
The average words per sentence is: #{words_per_sentence}
#{stat[:words][:pgw]} % of all words in the text are non-fluff words.
The ideal sentences are: #{stat[:sent][:ideal_sentences].join("-- ")}
The top 10 most common words are: #{stat[:words][:most_common_words]}
    REPORT
  end
end

# Integer quotient of +total+ by +count+, or 0 when +count+ is zero.
def whole_average(total, count)
  count.zero? ? 0 : total / count
end
# Analyse every text file matching files/*.txt (relative to the working
# directory).
#
# Returns an array with one hash per file, holding:
#   :chars  - result of count_chars for the file's text
#   :chunks - result of count_chunks for the file's text
#   :words  - result of useful_words for the word count and keywords
#   :sent   - result of ideal_sentences for the file's sentences
#   :text   - the path of the file
def collect_stats
  # Sorted so the report order does not depend on the platform's glob order.
  Dir.glob("files/*.txt").sort.map do |text_file|
    text = File.read(text_file)
    # Collapse whitespace runs first so sentences do not carry line breaks.
    sentences = text.gsub(/\s+/, ' ').strip.split(/\.|\?|!/)

    chunk_stats = count_chunks(text)

    {
      :chars => count_chars(text),
      :chunks => chunk_stats,
      :words => useful_words(chunk_stats[:word_count], chunk_stats[:keywords]),
      :sent => ideal_sentences(sentences),
      :text => text_file
    }
  end
end
# Gather the statistics for every text file and render one report per file.
stats = collect_stats
reports = build_reports(stats)

header = <<HTML
<html>
<body>
<pre>
CS 132A Lab3
Inaugural Speech Analysis
HTML

# Tags are closed in the reverse of the order in which the header opens them.
footer = <<HTML
</pre>
</body>
</html>
HTML

# Join the reports explicitly: interpolating the Array itself emits its
# inspect form on Ruby 1.9 and later. The text comes from the analysed files,
# so escape it before embedding it in the HTML page.
report_text = CGI.escapeHTML(reports.join)

output = <<OUT
#{header}
#{report_text}
#{footer}
OUT

# Send the page through CGI so the HTTP headers are written before the body.
cgi = CGI.new
cgi.out { output }
Add Comment
Please, Sign In to add comment