# Untitled

#!/usr/bin/ruby
# Text Analyzer
# Author: Zach
# Purpose: Utilize Ruby to analyze Text files
# and generate statistical information therein.

require 'cgi'

# Words excluded from the keyword statistics, read one per line from
# conf/stop_words.txt (path is relative to the working directory).
# File.readlines is used because File.read returns a single String, which
# does not respond to #map on Ruby 1.9 and later.
STOPWORDS = File.readlines('conf/stop_words.txt').map(&:chomp).freeze

# Character statistics for +text+.
#
# Returns a hash with:
#   :tot_chars          - length of the text exactly as given
#   :tot_chars_no_space - length once every run of whitespace is removed
def count_chars(text)
  without_whitespace = text.gsub(/\s+/, '')

  totals = {}
  totals[:tot_chars] = text.length
  totals[:tot_chars_no_space] = without_whitespace.length
  totals
end

# Count the words, sentences, paragraphs and lines of +text+ and collect its
# keywords.
#
# Returns a hash with:
#   :word_count - number of whitespace-separated words
#   :sent_count - number of pieces produced by splitting on ".", "?" or "!"
#   :para_count - number of pieces produced by splitting on a blank line ("\n\n")
#   :line_count - number of lines in the text
#   :keywords   - the words that are not listed in STOPWORDS (exact match,
#                 so case and attached punctuation are significant)
def count_chunks(text)
  # Split once with the whitespace-aware default so leading whitespace cannot
  # produce an empty-string "word", and so :keywords is always a subset of
  # the words counted in :word_count.
  words = text.split

  {
    :word_count => words.length,
    :sent_count => text.split(/\.|\?|!/).length,
    :para_count => text.split(/\n\n/).length,
    # String#size is the character count; count actual lines instead.
    :line_count => text.lines.count,
    :keywords => words.reject { |word| STOPWORDS.include?(word) },
  }
end

# Summarise how "useful" the vocabulary of a text is.
#
# word_count - total number of words in the text
# keywords   - array of the words considered keywords
#
# Returns a hash with:
#   :pgw               - keywords as a whole-number percentage of word_count
#                        (0 when word_count is 0)
#   :most_common_words - up to ten most frequent keywords (STOPWORDS removed),
#                        most frequent first, joined with "-- "
def useful_words(word_count, keywords)
  total = word_count.to_f
  # 0.0 / 0.0 is NaN and NaN.to_i raises FloatDomainError, so an empty text
  # is reported as 0 % instead of crashing.
  percentage = total.zero? ? 0 : ((keywords.length.to_f / total) * 100).to_i

  ranked = (keywords - STOPWORDS).group_by { |word| word }.sort_by { |_word, hits| -hits.length }

  {
    :pgw => percentage,
    :most_common_words => ranked.first(10).map(&:first).join('-- ')
  }
end

# Pick candidate sentences from a window of the length-ordered list.
#
# The sentences are ordered shortest-first. Starting at index (count / 7),
# a window of (count / 7) + 1 sentences is taken, and only the sentences in
# which whitespace is followed by "is" or "are" and then a non-word
# character are kept.
#
# Returns a hash with the single key :ideal_sentences.
def ideal_sentences(sentences)
  by_length = sentences.sort_by(&:length)
  offset = by_length.length / 7
  window = by_length[offset, offset + 1]

  { :ideal_sentences => window.grep(/\sis\W|\sare\W/) }
end

# Render one plain-text report per entry of +stats+.
#
# stats - array of hashes, each with the keys :text, :chars, :chunks, :words
#         and :sent (the values read below show the expected nested keys).
#
# Returns an array of report strings, in the same order as +stats+.
def build_reports(stats)
  # Integer average that reports 0 instead of raising ZeroDivisionError when
  # the divisor is 0 (for example, an empty input file has no sentences or
  # paragraphs).
  average = lambda { |total, parts| parts.zero? ? 0 : total / parts }

  stats.map do |stat|
    chunks = stat[:chunks]
    <<-REPORT
    #{stat[:text]} Inaugural Speech - Analysis Results
    Total number of characters is: #{stat[:chars][:tot_chars]}.
    Total number of characters less whitespace is: #{stat[:chars][:tot_chars_no_space]}
    Total number of words is: #{chunks[:word_count]}.
    Total number of sentences is: #{chunks[:sent_count]}
    Total number of paragraphs is: #{chunks[:para_count]}
    The average sentences per paragraph is: #{average.call(chunks[:sent_count], chunks[:para_count])}
    The average words per sentence is:  #{average.call(chunks[:word_count], chunks[:sent_count])}
    #{stat[:words][:pgw]} % of all words in the text are non-fluff words.
    The ideal sentences are: #{stat[:sent][:ideal_sentences].join("-- ")}
    The top 10 most common words are: #{stat[:words][:most_common_words]}

    REPORT
  end
end

# Analyse every text file matching files/*.txt (relative to the working
# directory).
#
# Returns an array with one hash per file, in sorted path order, holding:
#   :chars  - result of count_chars for the file's text
#   :chunks - result of count_chunks for the file's text
#   :words  - result of useful_words for the chunk statistics
#   :sent   - result of ideal_sentences for the file's sentences
#   :text   - the path of the file
def collect_stats
  # Sort so the report order does not depend on directory listing order.
  Dir.glob("files/*.txt").sort.map do |text_file|
    text = File.read(text_file)

    # Collapse all whitespace runs to single spaces before splitting into
    # sentences on ".", "?" or "!".
    sentences = text.gsub(/\s+/, ' ').strip.split(/\.|\?|!/)

    # No debug printing here: anything written to stdout would precede the
    # CGI headers, and String has no #each on Ruby 1.9 and later.
    chunk_stats = count_chunks(text)

    {
      :chars  => count_chars(text),
      :chunks => chunk_stats,
      :words  => useful_words(chunk_stats[:word_count], chunk_stats[:keywords]),
      :sent   => ideal_sentences(sentences),
      :text   => text_file,
    }
  end
end

# Gather the per-file statistics and render one plain-text report per file.
stats = collect_stats
reports = build_reports(stats)

header = <<HTML
<html>
<body>
<pre>
CS 132A Lab3
Inaugural Speech Analysis
HTML

# Close the tags in the reverse of the order in which the header opened them.
footer = <<HTML
</pre>
</body>
</html>
HTML

# Join the reports explicitly: interpolating the Array itself embeds its
# inspect form (brackets, quotes, escaped newlines) on Ruby 1.9 and later.
# The joined text comes from the analysed files, so escape it before placing
# it inside the HTML markup.
report_text = CGI.escapeHTML(reports.join)

output = <<OUT
#{header}
#{report_text}
#{footer}
OUT

# Send the page as the CGI response body.
cgi = CGI.new
cgi.out do
  output
end