Advertisement
Guest User

Translate mis-encoded characters

a guest
Nov 3rd, 2016
259
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.81 KB | None | 0 0
  1. #!/usr/bin/python
  2. # -*- coding: utf-8 -*-
  3.  
  4.  
  5. """
  6. Inspired by: http://www.sebastianviereck.de/mysql-php-umlaute-sonderzeichen-utf8-iso/
  7. """
  8.  
  9.  
  10. # suppress deprecation warnings
  11. import warnings
  12. warnings.filterwarnings("ignore", category=DeprecationWarning)
  13.  
  14. import sys
  15. from StringIO import StringIO
  16. import csv
  17.  
  18.  
  19. # translation table
  20. TABLE = {
  21.     'ü': 'ü',
  22.     'ä': 'ä',
  23.     'ö': 'ö',
  24.     'Ö': 'Ö',
  25.     'ß': 'ß',
  26.     'Ã ': 'à',
  27.     'á': 'á',
  28.     'â': 'â',
  29.     'ã': 'ã',
  30.     'ù': 'ù',
  31.     'ú': 'ú',
  32.     'û': 'û',
  33.     'Ù': 'Ù',
  34.     'Ú': 'Ú',
  35.     'Û': 'Û',
  36.     'Ãœ': 'Ü',
  37.     'ò': 'ò',
  38.     'ó': 'ó',
  39.     'ô': 'ô',
  40.     'è': 'è',
  41.     'é': 'é',
  42.     'ê': 'ê',
  43.     'ë': 'ë',
  44.     'À': 'À',
  45.     'Á': 'Á',
  46.     'Â': 'Â',
  47.     'Ã': 'Ã',
  48.     'Ä': 'Ä',
  49.     'Ã…': 'Å',
  50.     'Ç': 'Ç',
  51.     'È': 'È',
  52.     'É': 'É',
  53.     'Ê': 'Ê',
  54.     'Ë': 'Ë',
  55.     'ÃŒ': 'Ì',
  56.     'Í': 'Í',
  57.     'ÃŽ': 'Î',
  58.     'Ï': 'Ï',
  59.     'Ñ': 'Ñ',
  60.     'Ã’': 'Ò',
  61.     'Ó': 'Ó',
  62.     'Ô': 'Ô',
  63.     'Õ': 'Õ',
  64.     'Ø': 'Ø',
  65.     'Ã¥': 'å',
  66.     'æ': 'æ',
  67.     'ç': 'ç',
  68.     'ì': 'ì',
  69.     'í': 'í',
  70.     'î': 'î',
  71.     'ï': 'ï',
  72.     'ð': 'ð',
  73.     'ñ': 'ñ',
  74.     'õ': 'õ',
  75.     'ø': 'ø',
  76.     'ý': 'ý',
  77.     'ÿ': 'ÿ',
  78.     '€': '€'
  79.     }
  80.  
  81.  
  82. def encode(s, coding="iso-8859-15"):
  83.     return s.encode(coding)
  84.  
  85.  
  86. def decode(s, coding="utf-8"):
  87.     return s.decode(coding)
  88.  
  89.  
  90. def main(inFile, outFile):
  91.     # read whole file
  92.     with open(inFile, 'r') as f:
  93.         data = f.read()
  94.     data = decode(data)
  95.     # replace characters
  96.     for x, y in TABLE.items():
  97.         data = data.replace(decode(x), decode(y))
  98.     data = encode(data)
  99.     # parse data as CSV file
  100.     data = StringIO(data)
  101.     reader = csv.reader(data, delimiter=':', quoting=csv.QUOTE_NONE)
  102.     # add column to CSV
  103.     csv_ = []
  104.    
  105.     # no do something that is not really important to understand this issue...
  106.     for r, row in enumerate(reader):
  107.         row_ = []
  108.         for c, col in enumerate(row):
  109.             # add extra column before column no. 6
  110.             if c == 5:
  111.                 # add header field
  112.                 if r == 0:
  113.                     row_.append("JobId")
  114.                 else:
  115.                     row_.append(col.split('_')[0])
  116.             row_.append(col)
  117.         csv_.append(row_)
  118.    
  119.     # write as CSV using Excel dialect
  120.     with open(outFile, 'wb') as w:
  121.         writer = csv.writer(w, dialect=csv.excel, delimiter=';', quoting=csv.QUOTE_MINIMAL)
  122.         writer.writerows(csv_)
  123.  
  124.  
  125. if __name__ == "__main__":
  126.     inFile = sys.argv[1]
  127.     outFile = sys.argv[2]
  128.     main(inFile, outFile)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement