Advertisement
Guest User

Untitled

a guest
Feb 23rd, 2019
164
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 0.70 KB | None | 0 0
  def custom_tokenise_forum(text):
      """Split forum-style text into a list of string tokens.

      The tokeniser is a single regular expression built as an alternation of
      sub-patterns. At each position the first alternative that matches wins,
      so the more specific patterns are listed before the generic ones:

      1. URLs, with or without a scheme (``http://example.com/a?b=1``)
      2. hyphenated compounds of any length (``state-of-the-art``)
      3. words with internal apostrophes (``don't``, ``rock'n'roll``)
      4. mentions (``@name``)
      5. hashtags (``#topic``)
      6. runs of punctuation / symbols
      7. plain words

      Whitespace is never part of a token. An apostrophe that is not enclosed
      by word characters is matched by none of the patterns and is therefore
      dropped from the output.

      Args:
          text: The string to tokenise.

      Returns:
          A list of the matched substrings, in order of appearance. An empty
          string yields an empty list.
      """
      # Imported locally so the function is self-contained; the surrounding
      # paste never imports ``re``.
      import re

      # Raw strings throughout: sequences such as ``\w`` are invalid escapes
      # in ordinary string literals and trigger warnings on modern Python.

      # This is one possible URL pattern; more complicated patterns that
      # catch different URLs are possible.
      url = r'(?:\w+://)?(?:[-\w]+\.)+[a-zA-Z]{2,9}[-\w/#~:;.?+=&%@~]*'
      # One or more "-segment" repetitions so multi-part compounds stay whole.
      hyphenated = r'\w+(?:-\w+)+'
      # Likewise for words containing several internal apostrophes.
      apostrophes = r"\w+(?:'\w+)+"
      mentions = r'@\w+'
      hashtags = r'#\w+'
      # Anything that is not a word character, whitespace or an apostrophe.
      punctuation = r"[^\w\s']+"
      word = r'\w+'

      # Order matters: alternation is tried left to right.
      patterns = (url, hyphenated, apostrophes, mentions, hashtags,
                  punctuation, word)
      # The alternatives are joined with | and wrapped in a non-capturing
      # group, so findall() returns whole matches rather than group tuples.
      joint_patterns = '|'.join(patterns)
      tokeniser = re.compile(r'(?:{})'.format(joint_patterns), re.UNICODE)
      return tokeniser.findall(text)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement