# Verbose-mode token pattern. Alternatives are tried left to right at each
# position, so their order matters:
#   * every group is non-capturing, so tokenizer_re.findall() returns the
#     matched token strings instead of tuples of sub-groups;
#   * decimals and percentages are tried before plain words; otherwise `\w+`
#     consumes the leading digits and "82%" comes out as "82" (and "12.40" as
#     "12", "40");
#   * the original currency alternative stays last, where it still picks up
#     `$`-prefixed amounts such as "$12" that no earlier alternative matches.
t_pattern = r'''(?x) # set flag to allow verbose regexps
(?:[A-Z]\.)+ # abbreviations, e.g. U.S.A.
| \w+\'\w+ # words with an internal apostrophe, e.g. don't
| \$?\d+(?:\.\d+%?|%)(?!\w) # decimals and percentages, e.g. $12.40, 82%
| \w+(?:-\w+)* # words with optional internal hyphens
| \$?\d+(?:\.\d+)?%? # remaining currency amounts, e.g. $12
'''
# Compiled once at import time; use tokenizer_re.findall(text) to tokenize.
tokenizer_re = re.compile(t_pattern)