Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/ruby -w
- require 'csv'
- require 'active_support/core_ext'
# De-duplicates the rows of a delimited text file.
#
# Rows are grouped by the value of the +identifier+ column; inside each group
# only the first row for every distinct value of the +criteria+ column is
# kept. The surviving rows (optionally reduced to +output_columns+) are
# written to a file of the same name in +output_folder+, and a short summary
# is printed to standard output.
class Parser
  # Values used for options the caller does not supply.
  DEFAULT_OPTIONS = {
    :input_folder => "../csv/",
    :output_folder => "../output/",
    :output_columns => :all,
    :separator => ";"
  }.freeze

  attr_accessor :input_folder, :output_folder, :filename, :seperator,
                :column_names, :entries, :output, :identifier, :criteria,
                :output_columns

  # Correctly spelled aliases; the historical +seperator+ accessor is kept so
  # existing callers continue to work.
  alias_method :separator, :seperator
  alias_method :separator=, :seperator=

  # filename   - name of the file, relative to both the input and output folder
  # identifier - header name of the column rows are grouped by
  # criteria   - header name of the column that must be unique within a group
  # options    - Hash with optional keys :input_folder, :output_folder,
  #              :output_columns (:all or an Array of header names) and
  #              :separator (column delimiter, ";" by default)
  def initialize(filename, identifier, criteria, options = {})
    # Merge into a fresh hash so the caller's options hash is never mutated.
    settings = DEFAULT_OPTIONS.merge(options)

    @filename = filename
    @identifier = identifier
    @criteria = criteria
    @input_folder = settings[:input_folder]
    @output_folder = settings[:output_folder]
    @output_columns = settings[:output_columns]
    @seperator = settings[:separator]
    @entries = []
    @output = []
    @unique_rows = []
  end

  # Location of the file to read (folder and file name are concatenated as-is,
  # so the folder is expected to end with a path separator).
  def input_path
    input_folder + filename
  end

  # Location of the file to write (same concatenation rule as #input_path).
  def output_path
    output_folder + filename
  end

  # Loads the input file. The first row becomes +column_names+, the remaining
  # rows become +entries+; results of a previous run are discarded.
  def read_csv
    @output = []
    @unique_rows = []
    @entries = CSV.read(input_path, :col_sep => seperator)
    # An empty file has no header row; fall back to an empty header so that
    # later column lookups fail with a descriptive error instead of on nil.
    @column_names = @entries.shift || []
  end

  # Fills +output+ with the de-duplicated (and column-filtered) rows.
  #
  # Groups appear in order of first occurrence of their identifier, and within
  # a group the first row for each criteria value wins.
  #
  # Raises ArgumentError if the identifier or criteria column is not present
  # in the header.
  def remove_duplicates
    identifier_index = column_index(identifier)
    criteria_index = column_index(criteria)

    groups = entries.group_by { |row| row[identifier_index] }.values
    # Keep the unfiltered survivors: #statistic needs the criteria column even
    # when it is not part of the exported columns.
    @unique_rows = groups.flat_map do |rows|
      rows.uniq { |row| row[criteria_index] }
    end
    @output = @unique_rows.map { |row| filter_row(row) }
  end

  # Reduces +row+ to the configured output columns, in the configured order.
  # Returns +row+ itself when all columns are exported.
  #
  # Raises ArgumentError if an output column is not present in the header.
  def filter_row(row)
    return row if output_columns == :all

    output_columns.map { |column| row[column_index(column)] }
  end

  # Writes the (filtered) header followed by every row in +output+.
  def write_csv
    CSV.open(output_path, "w", :col_sep => seperator) do |csv|
      csv << filter_row(column_names)
      output.each { |row| csv << row }
    end
  end

  # Prints the number of rows read and kept, how many kept rows have a blank
  # criteria value, and how many kept rows exist per criteria value.
  def statistic
    criteria_index = column_index(criteria)
    criteria_values = entries.map { |row| row[criteria_index] }.uniq

    # Count on the unfiltered survivors: rows in +output+ may have a different
    # column layout (or lack the criteria column entirely).
    kept_values = @unique_rows.map { |row| row[criteria_index] }
    empty = kept_values.count { |value| blank_value?(value) }
    hits = Hash.new(0)
    kept_values.each { |value| hits[value] += 1 }
    identifier_per_criteria =
      criteria_values.map { |value| "#{value} (#{hits[value]}); " }.join

    puts
    puts " #total: #{entries.size}"
    puts " #rows: #{output.size}"
    puts " empty: #{empty}"
    puts " #{column_names[criteria_index]}: #{identifier_per_criteria}\n\n\n"
  end

  # Runs the full pipeline for this file: read, de-duplicate, write, report.
  def run
    puts "=> #{filename}"
    read_csv
    remove_duplicates
    write_csv
    statistic
  end

  private

  # Position of the column called +name+ in the header row.
  def column_index(name)
    index = (column_names || []).index(name)
    return index if index

    raise ArgumentError, "column #{name.inspect} not found in #{filename}"
  end

  # True for nil and for strings that contain nothing but whitespace.
  def blank_value?(value)
    value.to_s !~ /[^[:space:]]/
  end
end
# Batch entry point: run the Parser over each listed file, grouping rows by
# the "ID" column, de-duplicating on the "City" column, and exporting all
# columns.
shared_options = { :output_columns => :all }
id_column = "ID"
criteria_column = "City"

%w[file1.csv file2.csv file3.csv].each do |csv_name|
  Parser.new(csv_name, id_column, criteria_column, shared_options).run
end
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement