Advertisement
Guest User

Untitled

a guest
Jun 26th, 2019
88
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 0.57 KB | None | 0 0
import pandas as pd
import textdistance
  2.  
  3. def remove_near_duplicates(df, threshold = 0.8):
  4.  
  5. new_df = pd.DataFrame(columns=["ID", "SentimentText_Clean",
  6. "Sim"])
  7.  
  8. while len(df) > 0:
  9.  
  10. comp = df["SentimentText_Clean"][0]
  11. t_id = df["Tweet_ID"][0]
  12. df["Sim"] = df["SentimentText_Clean"].apply(lambda x:
  13. textdistance.levenshtein.normalized_similarity(comp, x))
  14. new_df = new_df.append(df.iloc[[0]])
  15. df = df.iloc[1:]
  16. df = df.loc[df["Sim"] < threshold]
  17. df.reset_index(drop = True, inplace = True)
  18.  
  19. return new_df
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement