Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- from nltk.tokenize import sent_tokenize
def lines(a, b):
    """Return the unique lines that appear in both a and b.

    Args:
        a: First text; split on line boundaries with str.splitlines().
        b: Second text; split the same way.

    Returns:
        A list (arbitrary order, no duplicates) of the lines common
        to both texts.
    """
    # Set intersection deduplicates and runs in O(len(a) + len(b)),
    # replacing the original quadratic list-membership loop.
    return list(set(a.splitlines()) & set(b.splitlines()))
def sentences(a, b):
    """Return the unique sentences that appear in both a and b.

    Args:
        a: First text; split into sentences with nltk's sent_tokenize.
        b: Second text; split the same way.

    Returns:
        A list (arbitrary order, no duplicates) of the sentences
        common to both texts.
    """
    # Set intersection deduplicates and avoids the quadratic
    # list-membership loop of the original implementation.
    return list(set(sent_tokenize(a)) & set(sent_tokenize(b)))
def substrings(a, b, n):
    """Return the unique substrings of length n present in both a and b.

    Args:
        a: First string.
        b: Second string.
        n: Substring (window) length; if n exceeds a string's length,
           that string contributes no substrings.

    Returns:
        A list (arbitrary order, no duplicates) of length-n substrings
        common to both strings.
    """
    # Build each string's set of length-n windows with a set
    # comprehension; range is empty when n > len(s), so short inputs
    # safely yield no windows.
    subs_a = {a[i:i + n] for i in range(len(a) - n + 1)}
    subs_b = {b[i:i + n] for i in range(len(b) - n + 1)}
    # Intersection replaces the original membership loop.
    return list(subs_a & subs_b)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement