Try95th

Example: Common Words in a Pair of Strings

Jan 22nd, 2023 (edited)
134
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.45 KB | None | 0 0
  1. # CSV files at https://docs.google.com/spreadsheets/d/1sxiP9qB00U4VLq35c_E4nJDhj5Z_tJFvofYYbgEMt50
  2.  
  3. import pandas as pd
  4.  
  5. def alphaStr(oStr:str, allowNum=True, allowWS=True, allowedChars=''):
  6.     return ''.join(c if (
  7.         c.isalpha() or (allowNum and c.isdigit())
  8.         or (allowWS and not c.strip()) or c in allowedChars
  9.     ) else '' for c in oStr)
  10.  
  11. def commonWords(sentence1, sentence2):
  12.     wl1, wl2 = [alphaStr(s).split() for s in [sentence1, sentence2]]
  13.     cw = [w for w in wl1+wl2 if w in wl1 and w in wl2]
  14.     cwCt, twCt = len(cw), len(wl1+wl2)
  15.     return {
  16.         'overlap_frac': f'{cwCt}/{twCt}',
  17.         'overlap_perc': f'{cwCt/twCt:.2%}',
  18.         'overlap_half': f'{int(cwCt/2)}/{twCt}',
  19.         'overlap': cw, 'wordCt_1': len(wl1), 'wordCt_2': len(wl2)
  20.     }
  21.  
  22. dsdf = pandas.read_csv('CNN_articles.csv', index_col='Index')
  23. dsRows = list(zip(dsdf.index, dsdf['Headline']))
  24.  
  25. comboRows = []
  26. for i1, (n1, h1) in enumerate(dsRows):
  27.     comboRows += [{
  28.         'index_1': n1, 'index_2': n2, **commonWords(h1,h2),
  29.         'headline_1': h1, 'headline_2': h2
  30.     } for i2, (n2, h2) in enumerate(dsRows[i1+1:], i1+1)]
  31.     print('', end=f'\r[Processed {i1+1}] --> {len(comboRows)} combos')
  32.  
  33. comboDf = pd.DataFrame(comboRows).set_index(['index_1', 'index_2'], drop=True)
  34. comboDf.to_csv('pairCombos_CNN_articles.csv')
  35.  
  36. ## recover old results with:
  37. ## comboDf = pd.read_csv('pairCombos_CNN_articles.csv', index_col=['index_1', 'index_2'])
Advertisement
Add Comment
Please, Sign In to add comment