Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import textdistance
def remove_near_duplicates(df, threshold=0.8):
    """Greedily drop rows whose cleaned text nearly duplicates an earlier row.

    Rows are visited in order. The first remaining row is kept, every later
    row whose normalized Levenshtein similarity to it is at or above
    ``threshold`` is discarded, and the process repeats on what is left.

    Args:
        df: DataFrame with a "SentimentText_Clean" column (presumably
            strings; verify against the caller). It is not modified.
        threshold: Similarity at or above which a later row is treated as
            a duplicate of the kept row.

    Returns:
        A new DataFrame holding the kept rows in their original order,
        indexed 0..k-1. Columns are "ID", "SentimentText_Clean" and "Sim",
        followed by any other input columns. "ID" is filled only when the
        input has a column of that name, and "Sim" is 1.0 on every row;
        both are retained so the output schema matches earlier versions.

    Raises:
        KeyError: If ``df`` is non-empty and lacks "SentimentText_Clean".
    """
    text_col = "SentimentText_Clean"
    leading_cols = ["ID", text_col, "Sim"]

    # Re-index positionally on a new frame: the caller's DataFrame is never
    # written to, and inputs whose index lacks the label 0 still work.
    remaining = df.reset_index(drop=True)

    kept_rows = []
    while len(remaining) > 0:
        anchor_text = remaining[text_col].iloc[0]

        # A kept row used to be scored against itself, which yields 1.0;
        # record that value directly instead of recomputing it.
        kept_rows.append(remaining.iloc[[0]].assign(Sim=1.0))

        candidates = remaining.iloc[1:]
        if len(candidates) == 0:
            break

        similarity = candidates[text_col].apply(
            lambda text: textdistance.levenshtein.normalized_similarity(
                anchor_text, text
            )
        )
        remaining = candidates.loc[similarity < threshold].reset_index(drop=True)

    if not kept_rows:
        return pd.DataFrame(columns=leading_cols)

    # One concat at the end replaces DataFrame.append, which was removed in
    # pandas 2.0 and copied the whole accumulator on every call.
    result = pd.concat(kept_rows, ignore_index=True)
    extra_cols = [col for col in result.columns if col not in leading_cols]
    return result.reindex(columns=leading_cols + extra_cols)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement