Guest User

Untitled

a guest
Jan 21st, 2018
62
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.69 KB | None | 0 0
  1. #!/usr/bin/ruby
  2. # Text Analyzer
  3. # Author: Zach
  4. # Purpose: Utilize Ruby to analyze Text files
  5. # and generate statistical information therein.
  6.  
  7. require 'cgi'
  8.  
  9. STOPWORDS = File.read('conf/stop_words.txt').map{|x| x.chomp}
  10.  
  11. # Count the characters; return a hash with stats
  12. def count_chars(text)
  13. {
  14. :tot_chars => text.length,
  15. :tot_chars_no_space => text.gsub(/\s+/, '').length
  16. }
  17. end
  18.  
  19. # Count the words, sentences and paragraphs; return a hash with stats
  20. def count_chunks(text)
  21. {
  22. :word_count => text.split.length,
  23. :sent_count => text.split(/\.|\?|!/).length,
  24. :para_count => text.split(/\n\n/).length,
  25. :line_count => text.size,
  26. :keywords => text.split(/\s+/).select { |word| !STOPWORDS.include?(word) },
  27. }
  28. end
  29.  
  30. def useful_words(word_count, keywords)
  31. {
  32. :pgw => ((keywords.length.to_f / word_count.to_f) * 100).to_i,
  33. :most_common_words => (keywords - STOPWORDS).group_by{ |x| x}.sort_by{ |word, hits| -hits.length}[0..9].map(&:first).join('-- ')
  34. }
  35. end
  36.  
  37. # Display stats for the sentences
  38. def ideal_sentences(sentences)
  39. sentences_sorted = sentences.sort_by { |sentence| sentence.length }
  40. foo = sentences_sorted.length / 7
  41. ideal_sentences = sentences_sorted.slice(foo, foo + 1)
  42. { :ideal_sentences => ideal_sentences.select { |sentence| sentence =~/\sis\W|\sare\W/ } }
  43. end
  44.  
  45. def build_reports (stats)
  46. reports = []
  47. stats.each do |stat|
  48. report = <<-REPORT
  49. #{stat[:text]} Inaugural Speech - Analysis Results
  50. Total number of characters is: #{stat[:chars][:tot_chars]}.
  51. Total number of characters less whitespace is: #{stat[:chars][:tot_chars_no_space]}
  52. Total number of words is: #{stat[:chunks][:word_count]}.
  53. Total number of sentences is: #{stat[:chunks][:sent_count]}
  54. Total number of paragraphs is: #{stat[:chunks][:para_count]}
  55. The average sentences per paragraph is: #{stat[:chunks][:sent_count] / stat[:chunks][:para_count]}
  56. The average words per sentence is: #{stat[:chunks][:word_count] / stat[:chunks][:sent_count]}
  57. #{stat[:words][:pgw]} % of all words in the text are non-fluff words.
  58. The ideal sentences are: #{stat[:sent][:ideal_sentences].join("-- ")}
  59. The top 10 most common words are: #{stat[:words][:most_common_words]}
  60.  
  61. REPORT
  62. reports << report
  63. end #end each-loop
  64. reports
  65. end #end build_reports
  66.  
  67. def collect_stats
  68. file_stats = []
  69. Dir.glob("files/*.txt").each do |text_file| # Iterate through all text files
  70. # Local Variables / Text Files
  71. text = File.read(text_file)
  72. sentences = text.gsub(/\s+/, ' ').strip.split(/\.|\?|!/)
  73. text_file.each {|x| p x}
  74. # Count the characters
  75. char_stats = count_chars(text) #Assign returned results from count_chars method
  76.  
  77. # Count the words, sentences and paragraphs
  78. chunk_stats = count_chunks(text) #Assign returned retults to chunk_stats
  79.  
  80. # Figure out the most Useful Words
  81. word_stats = useful_words(chunk_stats[:word_count],chunk_stats[:keywords]) #assign results to word_stats
  82.  
  83. # Call to ideal_sentences which will produce the ideal sentences from text
  84. best_sentences = ideal_sentences(sentences) #assign results to best_sentences
  85.  
  86. # Collect our stats and stuff them in the results array
  87. file_stats << {
  88. :chars => char_stats,
  89. :chunks => chunk_stats,
  90. :words => word_stats,
  91. :sent => best_sentences,
  92. :text => text_file,
  93. }
  94. end # End each-loop
  95. # Return the filestats at the end
  96. file_stats
  97. end
  98.  
  99. stats = collect_stats #
  100. #Build reports that contain the values from our stats
  101. reports = build_reports(stats)
  102.  
  103. header = <<HTML
  104. <html>
  105. <body>
  106. <pre>
  107. CS 132A Lab3
  108. Innagural Speech Analysis
  109. HTML
  110.  
  111. footer = <<HTML
  112. </html>
  113. </body>
  114. </pre>
  115. HTML
  116.  
  117. output = <<OUT
  118. #{header}
  119. #{reports}
  120. #{footer}
  121. OUT
  122.  
  123. cgi = CGI.new
  124. cgi.out do
  125. output
  126. end
Add Comment
Please, Sign In to add comment