Advertisement
Guest User

Untitled

a guest
Jan 30th, 2015
202
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.88 KB | None | 0 0
  1. #!/usr/bin/ruby -w
  2.  
  3. require 'csv'
  4. require 'active_support/core_ext'
  5.  
  # Reads a delimited text file whose first row is a header, keeps the first
  # row for every distinct (identifier value, criteria value) pair, writes the
  # retained rows to a file of the same name in the output folder and prints
  # a short summary to standard output.
  class Parser

    # Defaults applied to every option the caller does not supply.
    DEFAULT_OPTIONS = {
      :input_folder => "../csv/",
      :output_folder => "../output/",
      :output_columns => :all,
      :seperator => ";",
    }.freeze

    # NOTE: the spelling "seperator" is part of the public interface and is
    # kept as-is so existing callers continue to work.
    attr_accessor :input_folder, :output_folder, :filename, :seperator
    attr_accessor :column_names, :entries, :output
    attr_accessor :identifier, :criteria, :output_columns

    # @param filename [String] file name, used for both input and output
    # @param identifier [String] header name of the column rows are grouped by
    # @param criteria [String] header name of the column that must be unique
    #   within each group
    # @param options [Hash] optional overrides for DEFAULT_OPTIONS
    #   (:input_folder, :output_folder, :output_columns, :seperator);
    #   the hash passed in is never modified
    def initialize(filename, identifier, criteria, options = {})
      # Merge into a new hash: the caller may share one options hash between
      # several parsers, so it must not be mutated here.
      settings = DEFAULT_OPTIONS.merge(options)

      @filename = filename
      @identifier = identifier
      @criteria = criteria

      @input_folder = settings[:input_folder]
      @output_folder = settings[:output_folder]
      @output_columns = settings[:output_columns]
      @seperator = settings[:seperator]

      # Start with empty collections so every public method is safe to call
      # before read_csv has run.
      @column_names = []
      @entries = []
      @output = []
    end

    # @return [String] location of the file to read
    def input_path
      join_path(input_folder, filename)
    end

    # @return [String] location of the file to write
    def output_path
      join_path(output_folder, filename)
    end

    # Loads the input file, storing the first row in column_names and the
    # remaining rows in entries. Any previous output is discarded.
    #
    # @return [Array] the header row (empty when the file has no rows)
    def read_csv
      @output = []
      @entries = CSV.read(input_path, :col_sep => seperator)
      @column_names = @entries.shift || []
    end

    # Rebuilds output from entries: rows are grouped by their identifier value
    # (groups in order of first appearance) and, inside each group, only the
    # first row for each criteria value is kept. Kept rows are passed through
    # filter_row. Calling this method again replaces output instead of
    # appending to it.
    #
    # @return [Array<Array>] the rows stored in output
    # @raise [ArgumentError] if identifier, criteria or an output column is
    #   not present in the header
    def remove_duplicates
      identifier_index = index_of_column(identifier)
      criteria_index = index_of_column(criteria)

      groups = entries.group_by { |entry| entry[identifier_index] }

      @output = groups.values.flat_map do |rows|
        rows.uniq { |row| row[criteria_index] }.map { |row| filter_row(row) }
      end
    end

    # Reduces a row to the configured output columns, in the configured order.
    #
    # @param row [Array] a row laid out like the input header
    # @return [Array] the same row object when output_columns is :all,
    #   otherwise a new array
    # @raise [ArgumentError] if an output column is not present in the header
    def filter_row(row)
      return row if output_columns == :all

      output_columns.map { |column| row[index_of_column(column)] }
    end

    # Writes the (filtered) header followed by every row in output.
    def write_csv
      CSV.open(output_path, "w", :col_sep => seperator) do |csv|
        csv << filter_row(column_names)
        output.each { |row| csv << row }
      end
    end

    # Prints the number of input rows, the number of output rows, how many
    # output rows have a blank criteria value, and the number of output rows
    # per criteria value found in the input.
    #
    # @raise [ArgumentError] if criteria is not present in the header
    def statistic
      criteria_index = index_of_column(criteria)
      criteria_values = entries.map { |row| row[criteria_index] }.uniq

      # Output rows may have been reduced/reordered by filter_row, so the
      # criteria value must be looked up at its position in the output layout.
      output_index = output_criteria_index

      if output_index
        kept_values = output.map { |row| row[output_index] }
        empty = kept_values.count { |value| blank_value?(value) }

        hits = Hash.new(0)
        kept_values.each { |value| hits[value] += 1 }
        identifier_per_criteria = criteria_values.map { |field| "#{field} (#{hits[field]}); " }.join
      else
        # The criteria column was filtered out, so nothing can be counted.
        empty = "n/a"
        identifier_per_criteria = "n/a"
      end

      puts
      puts " #total: #{entries.size}"
      puts " #rows: #{output.size}"
      puts " empty: #{empty}"
      puts " #{column_names[criteria_index]}: #{identifier_per_criteria}\n\n\n"
    end

    # Runs the whole pipeline: read, de-duplicate, write, report.
    def run
      puts "=> #{filename}"
      read_csv

      remove_duplicates

      write_csv

      statistic
    end

    private

    # Joins folder and file name; works with or without a trailing slash on
    # the folder. An empty folder yields the bare file name.
    def join_path(folder, name)
      folder.to_s.empty? ? name : File.join(folder, name)
    end

    # Position of a header name in the input layout; fails loudly when the
    # column does not exist.
    def index_of_column(name)
      column_names.index(name) ||
        raise(ArgumentError, "unknown column #{name.inspect} in #{filename}")
    end

    # Position of the criteria column inside an output row, or nil when the
    # configured output columns do not include it.
    def output_criteria_index
      return column_names.index(criteria) if output_columns == :all

      output_columns.index(criteria)
    end

    # True for nil and for values whose text holds nothing but whitespace.
    def blank_value?(value)
      value.nil? || value.to_s !~ /[^[:space:]]/
    end

  end
  129.  
  # Driver: run the parser once per input file with shared settings.
  filenames = %w[file1.csv file2.csv file3.csv]

  # Column used to group rows, and column that must be unique per group.
  identifier = "ID"
  criteria = "City"

  options = { :output_columns => :all }

  filenames.each { |filename| Parser.new(filename, identifier, criteria, options).run }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement