Don't like ads? PRO users don't see any ads ;-)
Guest

Untitled

By: a guest on Jun 1st, 2012  |  syntax: None  |  size: 2.16 KB  |  hits: 11  |  expires: Never
download  |  raw  |  embed  |  report abuse  |  print
Text below is selected. Please press Ctrl+C to copy to your clipboard. (⌘+C on Mac)
  1. Cannot write text as UTF-8 to file using python
  2. >>> print r.info()
  3. Content-Type: text/html; charset=ISO-8859-1
  4. Connection: close
  5. Cache-Control: no-cache
  6. Date: Sun, 20 Feb 2011 15:16:31 GMT
  7. Server: Apache/2.0.40 (Red Hat Linux)
  8. X-Accel-Cache-Control: no-cache
  9.        
  10. <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
  11.        
  12. with open('../results/1.html','r') as f:                                  
  13.     page = f.read()
  14. ...
  15. with open('../parsed.txt','w') as f:
  16.     for key in fieldD:
  17.         f.write(key+'\t'+fieldD[key]+'\n')
  18.        
  19. with codecs.open('../results/1.html','r','utf-8') as f:                                
  20.     page = f.read()
  21. ...
  22. with codecs.open('../parsed.txt','w','utf-8') as f:                                  
  23.     for key in fieldD:
  24.         f.write(key+'\t'+fieldD[key]+'\n')
  25.        
  26. with codecs.open('../results/1.html','r','iso_8859_1') as f:
  27.     page = f.read()
  28. ...
  29. with codecs.open('../parsed.txt','w','utf-8') as f:                        
  30.     for key in fieldD:
  31.         f.write(key+'\t'+fieldD[key]+'\n')
  32.        
  33. >>> from unicodedata import name
  34. >>> oacute = u"\xf3"
  35. >>> print name(oacute)
  36. LATIN SMALL LETTER O WITH ACUTE
  37. >>> guff = oacute.encode('utf8').decode('latin1').encode('utf8')
  38. >>> guff
  39. '\xc3\x83\xc2\xb3'
  40. >>> for c in guff.decode('macroman'):
  41. ...     print name(c)
  42. ...
  43. SQUARE ROOT
  44. LATIN CAPITAL LETTER E WITH ACUTE
  45. NOT SIGN
  46. GREATER-THAN OR EQUAL TO
  47. >>>
  48.        
  49. >>> data = open('g0.htm', 'rb').read()
  50. >>> uc = data.decode('utf8')
  51. Traceback (most recent call last):
  52.   File "<stdin>", line 1, in <module>
  53.   File "c:\python27\lib\encodings\utf_8.py", line 16, in decode
  54.     return codecs.utf_8_decode(input, errors, True)
  55. UnicodeDecodeError: 'utf8' codec can't decode byte 0xb7 in position 1130: invalid start byte
  56. >>> pos = data.find("Iglesia Cat")
  57. >>> data[pos:pos+20]
  58. 'Iglesia Cat\xf3lica</a>'
  59. >>> # Looks like one of ISO-8859-1 and its cousins to me.
  60.        
  61. >>> url = 'http://213.97.164.119/ABSYS/abwebp.cgi/X5104/ID31295/G0?ACC=DCT1'
  62. >>> data = urllib2.urlopen(url).read()[4016:4052]; data
  63. 'Iglesia+Cat%f3lica">Iglesia Cat\xf3lica'
  64.  
  65. >>> data.decode('latin-1')
  66. u'Iglesia+Cat%f3lica">Iglesia Cat\xf3lica'
  67.  
  68. >>> data.decode('latin-1').encode('utf-8')
  69. 'Iglesia+Cat%f3lica">Iglesia Cat\xc3\xb3lica'