Advertisement
Guest User

Untitled

a guest
Jul 18th, 2019
79
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 0.60 KB | None | 0 0
  1. import unicodedata
  2.  
  3. SG_DIACRITICS = [
  4. 0x0300, # COMBINING GRAVE ACCENT
  5. 0x0301, # COMBINING ACUTE ACCENT
  6. 0x0302, # COMBINING CIRCUMFLEX ACCENT
  7. 0x0308, # COMBINING DIAERESIS
  8. # TODO: what about ǜ (U+01DC) and ß ?
  9. ]
  10.  
  11. SG_ACCENTED_CHARS = unicodedata.normalize('NFC',
  12. ''.join([ f'{c}{chr(diac)}' for diac in SG_DIACRITICS for c in list('aeiou')])
  13. )
  14.  
  15. import re
  16. _pattern = re.compile(f'[^\W\da-z{SG_ACCENTED_CHARS}]', re.IGNORECASE) #re.compile('[^\W\däÄöÖüÜa-zA-Z]')
  17.  
  18. def is_sg_charset(sentence, upper_limit=1):
  19. matches = _pattern.findall(sentence)
  20. return len(matches) <= upper_limit
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement