Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # CSV files at https://docs.google.com/spreadsheets/d/1sxiP9qB00U4VLq35c_E4nJDhj5Z_tJFvofYYbgEMt50
- import pandas as pd
def alphaStr(oStr:str, allowNum=True, allowWS=True, allowedChars=''):
    """Return ``oStr`` with every character that is not allowed removed.

    A character is kept when at least one of these holds:

    * it is alphabetic (``str.isalpha``);
    * ``allowNum`` is true and it is a digit (``str.isdigit``);
    * ``allowWS`` is true and it is whitespace (it strips to nothing);
    * it occurs in ``allowedChars``.

    Everything else is dropped; the surviving characters keep their order.
    """
    def _isAllowed(ch):
        # Checks are tried in the same order as the rules listed above.
        if ch.isalpha():
            return True
        if allowNum and ch.isdigit():
            return True
        if allowWS and not ch.strip():
            return True
        return ch in allowedChars

    return ''.join(filter(_isAllowed, oStr))
def commonWords(sentence1, sentence2):
    """Measure the word overlap between two sentences.

    Each sentence is cleaned with ``alphaStr`` (default options) and split on
    whitespace. Words are compared exactly as written, so the comparison is
    case-sensitive.

    Returns a dict with:

    * ``'overlap'``: every word occurrence, taken from sentence 1 then
      sentence 2, whose word appears in both sentences. A word shared once
      by each sentence is therefore listed twice.
    * ``'overlap_frac'``: ``"<len(overlap)>/<total words>"``.
    * ``'overlap_perc'``: the same ratio formatted as a percentage;
      ``'0.00%'`` when neither sentence contains any word.
    * ``'overlap_half'``: ``"<len(overlap) // 2>/<total words>"``.
    * ``'wordCt_1'``, ``'wordCt_2'``: word counts of each sentence.
    """
    wl1, wl2 = [alphaStr(s).split() for s in [sentence1, sentence2]]
    allWords = wl1 + wl2

    # A set intersection gives the same membership answer as scanning both
    # lists for every word, without the repeated linear searches.
    shared = set(wl1) & set(wl2)
    cw = [w for w in allWords if w in shared]
    cwCt, twCt = len(cw), len(allWords)

    # Two sentences with no words at all would otherwise divide by zero.
    overlapRatio = cwCt / twCt if twCt else 0.0

    return {
        'overlap_frac': f'{cwCt}/{twCt}',
        'overlap_perc': f'{overlapRatio:.2%}',
        'overlap_half': f'{cwCt // 2}/{twCt}',
        'overlap': cw, 'wordCt_1': len(wl1), 'wordCt_2': len(wl2)
    }
# Load the articles; the CSV's 'Index' column becomes the DataFrame index.
# The module is imported as `pd`, so the bare name `pandas` is not defined here.
dsdf = pd.read_csv('CNN_articles.csv', index_col='Index')
dsRows = list(zip(dsdf.index, dsdf['Headline']))

# Compare every headline with each headline that comes after it, so each
# unordered pair of articles is produced exactly once.
comboRows = []
for i1, (n1, h1) in enumerate(dsRows):
    comboRows += [{
        'index_1': n1, 'index_2': n2, **commonWords(h1, h2),
        'headline_1': h1, 'headline_2': h2
    } for n2, h2 in dsRows[i1+1:]]
    # '\r' returns to the start of the line so the counter updates in place.
    print('', end=f'\r[Processed {i1+1}] --> {len(comboRows)} combos')

# Save all pair comparisons, keyed by the two article indices.
comboDf = pd.DataFrame(comboRows).set_index(['index_1', 'index_2'], drop=True)
comboDf.to_csv('pairCombos_CNN_articles.csv')

## recover old results with:
## comboDf = pd.read_csv('pairCombos_CNN_articles.csv', index_col=['index_1', 'index_2'])
Advertisement
Add Comment
Please sign in to add a comment.