Advertisement
Guest User

Untitled

a guest
Jul 28th, 2023
384
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 0.95 KB | None | 0 0
  1.  
  2. from transformers import pipeline
  3. import pandas as pd
  4. import numpy as np
  5. from playwright.sync_api import sync_playwright
  6. from nested_lookup import nested_lookup
  7. from ggplot import *
  8.  
  9. df = pd.read_csv("C:\\Users\\Chris\\Downloads\\tweet.csv")
  10. toxigen_roberta = pipeline("text-classification", model="tomh/toxigen_roberta")
  11.  
  12. data = pd.DataFrame([])
  13. for name in df.columns:
  14. df = df[df[str(name)].str.contains("NaN|deleted|removed") == False] # remove NaN, deleted removed
  15. df = df.loc[df[str(name)].str.count(" ") < 250] # remove comments with more than 250 words (toxigen req)
  16. df = df.iloc[:500] # keep 1000 comments
  17. df2 = pd.DataFrame([])
  18. df2 = pd.DataFrame(toxigen_roberta(df[str(name)].values.tolist()))
  19. print(df2)
  20. df2['toxic'] = np.where(df2['label'] == "LABEL_1", 1, 0)
  21. df2 = pd.DataFrame(pd.Series(df2['toxic'].mean()))
  22. df2.columns = [str(name)]
  23. df2 = df2.T
  24. data = pd.concat([data, df2], axis=1)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement