Guest User

Untitled

a guest
Oct 22nd, 2017
79
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 7.90 KB | None | 0 0
  1. require 'csv'
  2.  
  3. class MSSQLAdministratorCSVParser
  4.  
  5. QUOTE = '"'
  6. ESCAPE = '"'
  7. SEPARATORS = %W(; , \t) # when equally probable, separatur occurring first is being picked
  8. UTF8_BOM = "\xef\xbb\xbf".force_encoding('ASCII-8BIT')
  9.  
  10. def initialize(file_name, options={})
  11. error("file '#{file_name}' does not exist") unless File.exists? file_name
  12. @file_name = file_name
  13. detected_options = detect_options(file_name)
  14. @options = detected_options.merge(options)
  15. @file = read_file(@file_name, detected_options[:is_utf8])
  16. end
  17.  
  18. # as correctly quoted, readable csv
  19. def as_csv(options={})
  20. options = @options.merge(options)
  21.  
  22. CSV.generate(
  23. col_sep: options[:separator_char],
  24. row_sep: options[:line_end],
  25. quote_char: options[:quote_char],
  26. force_quotes: true
  27. ) do |csv|
  28. parse.each { |line|
  29. csv << line
  30. }
  31. end
  32. end
  33.  
  34. # for validation purposes
  35. def as_administrator_csv
  36. lines = parse.map{|line|
  37. line.map{|col|
  38. (col.include?(@options[:separator_char]) ? "#{@options[:quote_char]}#{col}#{@options[:quote_char]}" : col)
  39. }.join(@options[:separator_char])
  40. }.join(@options[:line_end])
  41.  
  42. if @options[:is_utf8]
  43. UTF8_BOM + lines.force_encoding('ASCII-8BIT') + @options[:line_end]
  44. else
  45. lines.force_encoding('ASCII-8BIT') + @options[:line_end]
  46. end
  47. end
  48.  
  49. private
  50.  
  51. def detect_options(file_name)
  52. content = File.read(file_name, mode: 'rb')
  53. utf8 = content.start_with? UTF8_BOM
  54. content = read_file(file_name, utf8)
  55.  
  56. content.match /\A([^\n]*\r?\n?)/
  57. line = $1
  58. error('no information found in first line') unless line && line.match(/\S/)
  59.  
  60. line.match /(\r?\n?)\z/
  61. line_end = $1
  62. error('cannot determine line ending sequence') unless line_end
  63. error('unsupported line ending "\r"') if line_end == "\r"
  64.  
  65. separator = SEPARATORS.reverse.max {|sep|
  66. line.count sep
  67. }
  68. error("no field separator (tried #{%W(; , \t).map(&:inspect).join ', '}) found in header") if line.count(separator) == 0
  69.  
  70. header_count = CSV.parse_line(line,
  71. row_sep: line_end,
  72. col_sep: separator,
  73. quote_char: QUOTE
  74. ).length
  75.  
  76. {
  77. is_utf8: utf8,
  78. line_end: line_end,
  79. separator_char: separator,
  80. quote_char: QUOTE,
  81. escape_char: ESCAPE,
  82. header_count: header_count
  83. }
  84. end
  85.  
  86. def error(message)
  87. STDERR.puts "Error: #{message}"
  88. exit false
  89. end
  90.  
  91. def read_file(file_name, is_utf8)
  92. if is_utf8
  93. # Read as UTF8 with BOM
  94. File.read(file_name, mode: 'r:BOM|UTF-8')
  95. else
  96. # Read as ISO-8859-1
  97. File.read(file_name, mode: 'r:ISO-8859-1')
  98. end
  99. end
  100.  
  101. def parse
  102. content = @file
  103. column_count = @options[:header_count]
  104. lines = []
  105. columns = []
  106. value = ''
  107. skip = 0
  108.  
  109. in_field = quoted = escaping_or_end = false
  110.  
  111. content.chars.each.with_index {|char, i|
  112. if skip > 0
  113. skip -= 1
  114. next
  115. end
  116. if in_field
  117. if quoted
  118. # in quoted field
  119. if escaping_or_end
  120. # found another quote char (may be end of the quote or an escape char)
  121. case
  122. when char == @options[:quote_char]
  123. # a quote followed by a quote (in quoting mode) is an escaped quote
  124. escaping_or_end = false
  125. value << char
  126. when char == @options[:line_end][0] && content[i..(i+@options[:line_end].length-1)] == @options[:line_end]
  127. # after a closing quote followed by a linebreak the whole line is definitely over
  128. in_field = quoted = escaping_or_end = false
  129. columns << value
  130. value = ''
  131. lines << columns
  132. columns = []
  133. skip = @options[:line_end].length - 1
  134. when char == @options[:separator_char]
  135. # a separator after a closing quote ends the field
  136. in_field = quoted = escaping_or_end = false
  137. columns << value
  138. value = ''
  139. else # ...;"bla "blubb"" ";...
  140. raise "invalid quote escape sequence '#{@options[:quote_char]}#{char.inspect.gsub(/^"(.+)"$/, '\1')}' (parsing column #{columns.length} of line #{lines.length+1})"
  141. end
  142. else
  143. case char
  144. when @options[:quote_char]
  145. # quote in quoting mode starts escaping (...;" bla ""blubb"" ";..) or ends the quote (...;"bla ";...)
  146. escaping_or_end = true
  147. else
  148. value << char
  149. end
  150. end
  151. else
  152. # in unquoted field
  153. case
  154. when char == @options[:separator_char]
  155. # a separator in non-quoting mode ends the currently read in field
  156. in_field = false
  157. columns << value
  158. value = ''
  159. when char == @options[:line_end][0] && content[i..(i+@options[:line_end].length-1)] == @options[:line_end]
  160. # in an unquoted field, a linebreak may mean the end of the current line or a linebreak in the current field
  161. # if it's the last column, check if the next line contains a separator (this would mean it is a new line as
  162. # we have enough columns already)
  163. if columns.length >= column_count - 1 && (content[(i+@options[:line_end].length)..-1] =~ /\A.*#{@options[:separator_char]}.*$/ || i+@options[:line_end].length >= content.length)
  164. # end the line
  165. in_field = false
  166. columns << value
  167. value = ''
  168. lines << columns
  169. columns = []
  170. else
  171. # it's not the last column or the next line should add to the last column, treat as linebreak
  172. value << @options[:line_end]
  173. end
  174. skip = @options[:line_end].length - 1
  175. else
  176. value << char
  177. end
  178. end
  179. else
  180. # not in a field
  181. case
  182. when char == @options[:quote_char]
  183. # new quoted field starts
  184. in_field = quoted = true
  185. when char == @options[:separator_char]
  186. # field ends and starts another
  187. columns << value
  188. value = ''
  189. when char == @options[:line_end][0] && content[i..(i+@options[:line_end].length-1)] == @options[:line_end]
  190. # a linebreak may indicate the end of the current line or a new field starting with a linebreak
  191. if columns.length == column_count
  192. # enough columns/fields parsed, treat as line end
  193. lines << columns
  194. columns = []
  195. else
  196. # not enough columns/fields in current line, treat as new field
  197. in_field = true
  198. value << @options[:line_end]
  199. end
  200. skip = @options[:line_end].length - 1
  201. else
  202. in_field = true
  203. value << char
  204. end
  205. end
  206. }
  207.  
  208. if in_field && quoted && !escaping_or_end
  209. raise 'missing closing quote on last line'
  210. end
  211.  
  212. lines
  213. end
  214. end
  215.  
  216. file_name = ARGV[0]
  217. unless file_name
  218. puts
  219. puts 'Usage: fix_sql_administrator_csv.rb <file name>'
  220. puts
  221. puts 'Generates "<file name>_fixed.csv" and "<file name>_validation.csv" where the'
  222. puts 'first file contains the fixed CSV and the second file contains a byte-exact'
  223. puts 'recreation of the input file (useful for validation).'
  224. exit
  225. end
  226. puts 'Reading input file...'
  227. parser = MSSQLAdministratorCSVParser.new file_name
  228. puts 'Writing fixed CSV file...'
  229. File.binwrite("#{file_name}_fixed.csv", parser.as_csv)
  230. puts 'Writing validation file...'
  231. validation_file = "#{file_name}_validation.csv"
  232. File.binwrite(validation_file, parser.as_administrator_csv)
  233. puts 'Comparing original and validation file...'
  234. raise 'Generated file does not match input file (keeping validation file for reference)!' unless File.binread(file_name) == File.binread(validation_file)
  235. puts 'Removing validation file...'
  236. File.delete(validation_file)
  237. puts 'All done.'
Add Comment
Please, Sign In to add comment