Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import re
import unicodedata

# Combining marks that are accepted on top of a vowel.
# NOTE(review): "SG" presumably stands for Swiss German -- confirm.
SG_DIACRITICS = [
    0x0300,  # COMBINING GRAVE ACCENT
    0x0301,  # COMBINING ACUTE ACCENT
    0x0302,  # COMBINING CIRCUMFLEX ACCENT
    0x0308,  # COMBINING DIAERESIS
    # TODO: what about ǜ (U+01DC) and ß ?
]

# Every vowel/diacritic combination, composed (NFC) so that each pair becomes
# a single code point that can be listed inside a regex character class.
SG_ACCENTED_CHARS = unicodedata.normalize(
    'NFC',
    ''.join(f'{vowel}{chr(diac)}' for diac in SG_DIACRITICS for vowel in 'aeiou'),
)

# Matches one "foreign" character: a word character (so whitespace and
# punctuation never match) that is neither a digit, nor a-z, nor one of the
# accepted accented vowels.  IGNORECASE extends the accepted letters to their
# upper-case forms.  The raw f-string keeps ``\W`` / ``\d`` as regex escapes
# instead of (invalid) string escapes.
_pattern = re.compile(rf'[^\W\da-z{SG_ACCENTED_CHARS}]', re.IGNORECASE)


def is_sg_charset(sentence: str, upper_limit: int = 1) -> bool:
    """Tell whether *sentence* stays within the accepted character set.

    Characters that are not word characters (spaces, punctuation, ...) and
    digits are ignored.  Every remaining character that is not a-z/A-Z or a
    vowel carrying one of ``SG_DIACRITICS`` counts as foreign.  Note that the
    underscore is a word character and therefore counts as foreign too.

    Args:
        sentence: Text to inspect.  It is normalised to NFC first, so
            composed and decomposed spellings of the same text give the
            same answer.
        upper_limit: Maximum number of foreign characters tolerated.

    Returns:
        True if at most ``upper_limit`` foreign characters were found.
    """
    # The accepted set is NFC-composed; without normalising the input, a
    # decomposed letter (base + combining mark) would slip through because
    # combining marks are not word characters and are never counted.
    normalized = unicodedata.normalize('NFC', sentence)
    return len(_pattern.findall(normalized)) <= upper_limit
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement