Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
def custom_tokenise_forum(text):
    """Tokenise forum text into URLs, mentions, hashtags, words, and punctuation.

    Splits *text* into tokens, keeping URLs, @mentions, #hashtags,
    hyphenated words (``state-of``), apostrophised words (``it's``),
    plain words, and runs of punctuation as single tokens each.

    Parameters
    ----------
    text : str
        Raw forum post text.

    Returns
    -------
    list[str]
        Tokens in order of appearance; an empty list for empty input.
    """
    # Raw strings are required here: '\w', '\-', and '\'' are invalid
    # escape sequences in plain string literals (SyntaxWarning on
    # Python >= 3.12). The regex behavior is unchanged.
    # This is one possible URL pattern; more complicated patterns that
    # catch different URLs are possible.
    URL = r'(?:\w+://)?(?:[-\w]+\.)+[a-zA-Z]{2,9}[-\w/#~:;.?+=&%@~]*'
    hyphenated = r'\w+\-\w+'
    apostrophes = r'\w+\'\w+'
    word = r'\w+'
    pun = r'[^\w\s\']+'  # runs of punctuation (apostrophes excluded)
    hashtags = r'#\w+'
    mentions = r'@\w+'
    # Order matters: alternation tries branches left to right, so the
    # more specific patterns must precede the bare \w+ word pattern.
    patterns = (URL, hyphenated, apostrophes, mentions, hashtags, pun, word)
    # The patterns are split with | for alternation.
    joint_patterns = '|'.join(patterns)
    # format builds the final pattern, wrapped in (?:...) so the
    # alternation forms one non-capturing group (findall then returns
    # whole matches rather than group contents).
    p = re.compile(r'(?:{})'.format(joint_patterns), re.UNICODE)
    return p.findall(text)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement