# Untitled


from transformers import pipeline
import pandas as pd
import numpy as np
from playwright.sync_api import sync_playwright
from nested_lookup import nested_lookup
from ggplot import *

# Score every column of the tweet CSV with the ToxiGen RoBERTa classifier and
# collect, per column, the fraction of comments the model labels "LABEL_1"
# (treated as toxic by this script).

# Regex alternation: comments matching any of these markers are discarded.
PLACEHOLDER_PATTERN = "NaN|deleted|removed"
# Comments must contain fewer than this many spaces (rough word-count cap).
MAX_SPACES = 250
# Maximum number of comments scored per column.
MAX_COMMENTS = 500

df = pd.read_csv("C:\\Users\\Chris\\Downloads\\tweet.csv")
toxigen_roberta = pipeline("text-classification", model="tomh/toxigen_roberta")

toxic_share = {}
for name in df.columns:
    # Filter a per-column selection instead of reassigning `df`, so cleaning
    # one column never removes rows from the columns processed after it.
    comments = df[name]
    # na=True counts missing / non-string cells as matches, so they are
    # dropped together with the placeholder comments.
    comments = comments[~comments.str.contains(PLACEHOLDER_PATTERN, na=True)]
    comments = comments[comments.str.count(" ") < MAX_SPACES]
    comments = comments.iloc[:MAX_COMMENTS]

    if comments.empty:
        # Nothing to classify: record a missing score rather than failing on
        # a prediction table that has no 'label' column.
        toxic_share[name] = np.nan
        continue

    # NOTE(review): the space-count cap does not bound the token count; if the
    # model rejects long inputs, confirm whether truncation should be enabled.
    predictions = pd.DataFrame(toxigen_roberta(comments.tolist()))
    print(predictions)
    toxic_share[name] = (predictions["label"] == "LABEL_1").mean()

# One row per input column, with a single column (label 0) holding the toxic
# share. Stacking rows avoids the NaN-padded diagonal matrix that a
# column-wise concat of differently-indexed 1x1 frames produces.
data = pd.Series(toxic_share, dtype=float).to_frame()