Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- require 'csv'
- class MSSQLAdministratorCSVParser
- QUOTE = '"'
- ESCAPE = '"'
- SEPARATORS = %W(; , \t) # when equally probable, separatur occurring first is being picked
- UTF8_BOM = "\xef\xbb\xbf".force_encoding('ASCII-8BIT')
- def initialize(file_name, options={})
- error("file '#{file_name}' does not exist") unless File.exists? file_name
- @file_name = file_name
- detected_options = detect_options(file_name)
- @options = detected_options.merge(options)
- @file = read_file(@file_name, detected_options[:is_utf8])
- end
- # as correctly quoted, readable csv
- def as_csv(options={})
- options = @options.merge(options)
- CSV.generate(
- col_sep: options[:separator_char],
- row_sep: options[:line_end],
- quote_char: options[:quote_char],
- force_quotes: true
- ) do |csv|
- parse.each { |line|
- csv << line
- }
- end
- end
- # for validation purposes
- def as_administrator_csv
- lines = parse.map{|line|
- line.map{|col|
- (col.include?(@options[:separator_char]) ? "#{@options[:quote_char]}#{col}#{@options[:quote_char]}" : col)
- }.join(@options[:separator_char])
- }.join(@options[:line_end])
- if @options[:is_utf8]
- UTF8_BOM + lines.force_encoding('ASCII-8BIT') + @options[:line_end]
- else
- lines.force_encoding('ASCII-8BIT') + @options[:line_end]
- end
- end
- private
- def detect_options(file_name)
- content = File.read(file_name, mode: 'rb')
- utf8 = content.start_with? UTF8_BOM
- content = read_file(file_name, utf8)
- content.match /\A([^\n]*\r?\n?)/
- line = $1
- error('no information found in first line') unless line && line.match(/\S/)
- line.match /(\r?\n?)\z/
- line_end = $1
- error('cannot determine line ending sequence') unless line_end
- error('unsupported line ending "\r"') if line_end == "\r"
- separator = SEPARATORS.reverse.max {|sep|
- line.count sep
- }
- error("no field separator (tried #{%W(; , \t).map(&:inspect).join ', '}) found in header") if line.count(separator) == 0
- header_count = CSV.parse_line(line,
- row_sep: line_end,
- col_sep: separator,
- quote_char: QUOTE
- ).length
- {
- is_utf8: utf8,
- line_end: line_end,
- separator_char: separator,
- quote_char: QUOTE,
- escape_char: ESCAPE,
- header_count: header_count
- }
- end
- def error(message)
- STDERR.puts "Error: #{message}"
- exit false
- end
- def read_file(file_name, is_utf8)
- if is_utf8
- # Read as UTF8 with BOM
- File.read(file_name, mode: 'r:BOM|UTF-8')
- else
- # Read as ISO-8859-1
- File.read(file_name, mode: 'r:ISO-8859-1')
- end
- end
- def parse
- content = @file
- column_count = @options[:header_count]
- lines = []
- columns = []
- value = ''
- skip = 0
- in_field = quoted = escaping_or_end = false
- content.chars.each.with_index {|char, i|
- if skip > 0
- skip -= 1
- next
- end
- if in_field
- if quoted
- # in quoted field
- if escaping_or_end
- # found another quote char (may be end of the quote or an escape char)
- case
- when char == @options[:quote_char]
- # a quote followed by a quote (in quoting mode) is an escaped quote
- escaping_or_end = false
- value << char
- when char == @options[:line_end][0] && content[i..(i+@options[:line_end].length-1)] == @options[:line_end]
- # after a closing quote followed by a linebreak the whole line is definitely over
- in_field = quoted = escaping_or_end = false
- columns << value
- value = ''
- lines << columns
- columns = []
- skip = @options[:line_end].length - 1
- when char == @options[:separator_char]
- # a separator after a closing quote ends the field
- in_field = quoted = escaping_or_end = false
- columns << value
- value = ''
- else # ...;"bla "blubb"" ";...
- raise "invalid quote escape sequence '#{@options[:quote_char]}#{char.inspect.gsub(/^"(.+)"$/, '\1')}' (parsing column #{columns.length} of line #{lines.length+1})"
- end
- else
- case char
- when @options[:quote_char]
- # quote in quoting mode starts escaping (...;" bla ""blubb"" ";..) or ends the quote (...;"bla ";...)
- escaping_or_end = true
- else
- value << char
- end
- end
- else
- # in unquoted field
- case
- when char == @options[:separator_char]
- # a separator in non-quoting mode ends the currently read in field
- in_field = false
- columns << value
- value = ''
- when char == @options[:line_end][0] && content[i..(i+@options[:line_end].length-1)] == @options[:line_end]
- # in an unquoted field, a linebreak may mean the end of the current line or a linebreak in the current field
- # if it's the last column, check if the next line contains a separator (this would mean it is a new line as
- # we have enough columns already)
- if columns.length >= column_count - 1 && (content[(i+@options[:line_end].length)..-1] =~ /\A.*#{@options[:separator_char]}.*$/ || i+@options[:line_end].length >= content.length)
- # end the line
- in_field = false
- columns << value
- value = ''
- lines << columns
- columns = []
- else
- # it's not the last column or the next line should add to the last column, treat as linebreak
- value << @options[:line_end]
- end
- skip = @options[:line_end].length - 1
- else
- value << char
- end
- end
- else
- # not in a field
- case
- when char == @options[:quote_char]
- # new quoted field starts
- in_field = quoted = true
- when char == @options[:separator_char]
- # field ends and starts another
- columns << value
- value = ''
- when char == @options[:line_end][0] && content[i..(i+@options[:line_end].length-1)] == @options[:line_end]
- # a linebreak may indicate the end of the current line or a new field starting with a linebreak
- if columns.length == column_count
- # enough columns/fields parsed, treat as line end
- lines << columns
- columns = []
- else
- # not enough columns/fields in current line, treat as new field
- in_field = true
- value << @options[:line_end]
- end
- skip = @options[:line_end].length - 1
- else
- in_field = true
- value << char
- end
- end
- }
- if in_field && quoted && !escaping_or_end
- raise 'missing closing quote on last line'
- end
- lines
- end
- end
- file_name = ARGV[0]
- unless file_name
- puts
- puts 'Usage: fix_sql_administrator_csv.rb <file name>'
- puts
- puts 'Generates "<file name>_fixed.csv" and "<file name>_validation.csv" where the'
- puts 'first file contains the fixed CSV and the second file contains a byte-exact'
- puts 'recreation of the input file (useful for validation).'
- exit
- end
- puts 'Reading input file...'
- parser = MSSQLAdministratorCSVParser.new file_name
- puts 'Writing fixed CSV file...'
- File.binwrite("#{file_name}_fixed.csv", parser.as_csv)
- puts 'Writing validation file...'
- validation_file = "#{file_name}_validation.csv"
- File.binwrite(validation_file, parser.as_administrator_csv)
- puts 'Comparing original and validation file...'
- raise 'Generated file does not match input file (keeping validation file for reference)!' unless File.binread(file_name) == File.binread(validation_file)
- puts 'Removing validation file...'
- File.delete(validation_file)
- puts 'All done.'
Add Comment
Please, Sign In to add comment