Advertisement
venom049

Similarities / helpers.py

Mar 10th, 2018
91
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.30 KB | None | 0 0
  1. from nltk.tokenize import sent_tokenize
  2.  
  3.  
  4. def lines(a, b):
  5.     """Return lines in both a and b"""
  6.  
  7. # split "a" text by lines
  8.     linesA = a.splitlines()
  9.  
  10. # split "b" text by lines
  11.     linesB = b.splitlines()
  12.     c = set()
  13.     c1 = []
  14. # find unique matches between files
  15.     for lineA in linesA:
  16.         if lineA in linesB:
  17.             c.add(lineA)
  18.             c1 = list(c)
  19.  
  20.     return c1
  21.  
  22.  
  23. def sentences(a, b):
  24.     """Return sentences in both a and b"""
  25.  
  26. # split "a" text by sentences
  27.     sentsA = sent_tokenize(a)
  28. # split "b" text by sentences
  29.     sentsB = sent_tokenize(b)
  30. # find unique matches between files
  31.     d = set()
  32.     d1 = []
  33.     for sentA in sentsA:
  34.         if sentA in sentsB:
  35.             d.add(sentA)
  36.             d1 = list(d)
  37.  
  38.     return d1
  39.  
  40.  
  41. def substrings(a, b, n):
  42.     """Return substrings of length n in both a and b"""
  43.     subsA = set()
  44.     subsB = set()
  45.     subsD = set()
  46.     subsD1 = []
  47. # split a text by n letters
  48.     for i in range(0, len(a) - n + 1):
  49.         subsA.add(a[i:(n + i)])
  50.  
  51. # split b text by n letters
  52.     for i in range(0, len(b) - n + 1):
  53.         subsB.add(b[i:(n + i)])
  54.  
  55. # find unique matches between files
  56.  
  57.     for strA in subsA:
  58.         if strA in subsB:
  59.             subsD.add(strA)
  60.             subsD1 = list(subsD)
  61.  
  62.     return subsD1
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement