Don't like ads? PRO users don't see any ads ;-)
Guest

Untitled

By: a guest on Jun 30th, 2012  |  syntax: None  |  size: 0.93 KB  |  hits: 15  |  expires: Never
download  |  raw  |  embed  |  report abuse  |  print
Text below is selected. Please press Ctrl+C to copy to your clipboard. (⌘+C on Mac)
  1. How to handle Unicode characters in Python with regex
  2. aren’t=ain’t
  3. hello=hey
  4.        
  5. u'aren\u2019t' = u'ain\u2019t'
  6. u'hello' = u'hey'
  7.        
  8. text = u"aren't"
  9.  
  10. def replace_all(text, dict):
  11.     for i, k in dict.iteritems():
  12.         #replace all whole words of I with K in lower cased text, regex = \bSTRING\b
  13.         text = re.sub(r"\b" + i + r"\b", k , text.lower())
  14.     return text
  15.        
  16. text = u"aren't"
  17.        
  18. text = u"aren’t"
  19.        
  20. #!/usr/bin/env python
  21. # -*- coding: utf-8 -*-
  22.  
  23. import re
  24.  
  25. d = {
  26.     u'aren’t': u'ain’t',
  27.     u'hello': u'hey'
  28.     }
  29. #text = u"aren't"
  30. text = u"aren’t"
  31.  
  32.  
  33. def replace_all(text, d):
  34.     for i, k in d.iteritems():
  35.         #replace all whole words of I with K in lower cased text, regex = \bSTRING\b
  36.         text = re.sub(r"\b" + i + r"\b", k , text.lower())
  37.     return text
  38.  
  39. if __name__ == '__main__':
  40.     newtext = replace_all(text, d)
  41.     print newtext
  42.        
  43. ain’t
  44.        
  45. >>> re.sub(ur'\baren\u2019t\b', 'rep', u'aren\u2019t')
  46. u'rep'