Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- from nltk.tokenize import sent_tokenize
def lines(a, b):
    """Return the unique lines that appear in both a and b.

    Args:
        a: First text; split on line boundaries with str.splitlines().
        b: Second text; split the same way.

    Returns:
        A list (arbitrary order, no duplicates) of the lines common
        to both texts.
    """
    # Set intersection deduplicates and runs in O(len(a) + len(b)),
    # replacing the original quadratic list-membership loop.
    return list(set(a.splitlines()) & set(b.splitlines()))
def sentences(a, b):
    """Return the unique sentences that appear in both a and b.

    Args:
        a: First text; split into sentences with nltk's sent_tokenize.
        b: Second text; split the same way.

    Returns:
        A list (arbitrary order, no duplicates) of the sentences
        common to both texts.
    """
    # Set intersection deduplicates and avoids the quadratic
    # list-membership loop of the original implementation.
    return list(set(sent_tokenize(a)) & set(sent_tokenize(b)))
def substrings(a, b, n):
    """Return the unique substrings of length n present in both a and b.

    Args:
        a: First string.
        b: Second string.
        n: Substring (window) length; if n exceeds a string's length,
           that string contributes no substrings.

    Returns:
        A list (arbitrary order, no duplicates) of length-n substrings
        common to both strings.
    """
    # Build each string's set of length-n windows with a set
    # comprehension; range is empty when n > len(s), so short inputs
    # safely yield no windows.
    subs_a = {a[i:i + n] for i in range(len(a) - n + 1)}
    subs_b = {b[i:i + n] for i in range(len(b) - n + 1)}
    # Intersection replaces the original membership loop.
    return list(subs_a & subs_b)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement