Example: Common Words in a Pair of Strings

# CSV files at https://docs.google.com/spreadsheets/d/1sxiP9qB00U4VLq35c_E4nJDhj5Z_tJFvofYYbgEMt50

import pandas as pd

def alphaStr(oStr:str, allowNum=True, allowWS=True, allowedChars=''):
    """Return ``oStr`` with every disallowed character removed.

    A character is kept when it is alphabetic, or a digit (only if
    ``allowNum``), or whitespace (only if ``allowWS``), or contained in
    ``allowedChars``. Everything else is dropped.
    """
    def isKept(ch):
        if ch.isalpha():
            return True
        if allowNum and ch.isdigit():
            return True
        # A single character that strips down to '' is whitespace.
        if allowWS and not ch.strip():
            return True
        return ch in allowedChars

    return ''.join(filter(isKept, oStr))

def commonWords(sentence1, sentence2):
    """Report the words that two sentences have in common.

    Each sentence is cleaned with ``alphaStr`` (default settings) and split
    on whitespace. Matching is exact, so it is case-sensitive.

    Returns a dict with:
        overlap_frac: ``'<common>/<total>'`` where ``common`` counts every
            occurrence (in either sentence) of a word present in both, and
            ``total`` is the combined word count of both sentences.
        overlap_perc: the same ratio as a percentage with two decimals;
            ``'0.00%'`` when neither sentence contains any word.
        overlap_half: ``common`` halved (rounded down) over ``total``.
        overlap: the shared-word occurrences, those from ``sentence1``
            first, then those from ``sentence2``.
        wordCt_1, wordCt_2: word counts of the two cleaned sentences.
    """
    wl1, wl2 = (alphaStr(s).split() for s in (sentence1, sentence2))
    allWords = wl1 + wl2
    # Set intersection gives O(1) membership tests instead of scanning
    # both word lists for every word.
    shared = set(wl1) & set(wl2)
    cw = [w for w in allWords if w in shared]
    cwCt, twCt = len(cw), len(allWords)
    # Two empty (or punctuation-only) sentences give twCt == 0; report a
    # zero ratio instead of raising ZeroDivisionError.
    ratio = cwCt / twCt if twCt else 0.0
    return {
        'overlap_frac': f'{cwCt}/{twCt}',
        'overlap_perc': f'{ratio:.2%}',
        'overlap_half': f'{cwCt // 2}/{twCt}',
        'overlap': cw, 'wordCt_1': len(wl1), 'wordCt_2': len(wl2)
    }

# Load the headline dataset, using its 'Index' column as the row index.
# pandas is imported under the alias `pd`; the bare name `pandas` is not
# bound in this file and would raise NameError.
dsdf = pd.read_csv('CNN_articles.csv', index_col='Index')
# (row label, headline) pairs as a list so later code can slice by position.
dsRows = list(zip(dsdf.index, dsdf['Headline']))

# Compare every headline with each headline that comes after it, so each
# unordered pair of distinct rows is evaluated exactly once.
comboRows = []
for pos, (idxA, headA) in enumerate(dsRows):
    for idxB, headB in dsRows[pos+1:]:
        comboRows.append({
            'index_1': idxA, 'index_2': idxB, **commonWords(headA, headB),
            'headline_1': headA, 'headline_2': headB
        })
    # The leading '\r' with no trailing newline rewrites the same console
    # line, giving an in-place progress counter.
    print('', end=f'\r[Processed {pos+1}] --> {len(comboRows)} combos')

# One row per pair, keyed by the two original row labels, saved to CSV.
comboDf = pd.DataFrame(comboRows)
comboDf = comboDf.set_index(['index_1', 'index_2'], drop=True)
comboDf.to_csv('pairCombos_CNN_articles.csv')

## recover old results with:
## comboDf = pd.read_csv('pairCombos_CNN_articles.csv', index_col=['index_1', 'index_2'])