from transformers import pipeline
import pandas as pd
import numpy as np

df = pd.read_csv("C:\\Users\\Chris\\Downloads\\tweet.csv")
toxigen_roberta = pipeline("text-classification", model="tomh/toxigen_roberta")
data = pd.DataFrame([])

for name in df.columns:
    # drop NaN rows and comments marked "NaN"/"deleted"/"removed"
    df = df[~df[name].str.contains("NaN|deleted|removed", na=True)]
    # drop comments with more than 250 words (ToxiGen RoBERTa input-length limit)
    df = df.loc[df[name].str.count(" ") < 250]
    # keep the first 500 comments
    df = df.iloc[:500]
    # classify each comment; the pipeline returns one {'label', 'score'} dict per input
    df2 = pd.DataFrame(toxigen_roberta(df[name].values.tolist()))
    print(df2)
    # LABEL_1 = toxic; flag it and take the column's mean toxicity rate
    df2['toxic'] = np.where(df2['label'] == "LABEL_1", 1, 0)
    df2 = pd.DataFrame(pd.Series(df2['toxic'].mean()))
    df2.columns = [name]
    # one column per text column, holding its mean toxicity rate
    data = pd.concat([data, df2], axis=1)
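# Possible follow-up, not part of the original paste: inspect the aggregated
# per-column toxicity rates and write them out. The output path is an assumption.
print(data)
data.to_csv("C:\\Users\\Chris\\Downloads\\toxicity_by_column.csv", index=False)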