Advertisement
khaiwen1111

remove html tags

Apr 17th, 2020
394
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.18 KB | None | 0 0
  1. import pandas as pd
  2. import csv
  3. import numpy as np
  4. import re
  5. with open(r"D:\Users\sentiment analysis\sentiment\sentiment3.csv", newline='') as f:
  6.     reader = csv.reader(f)
  7.     data = list(reader)
  8.    
  9. def replace_chars(s):
  10.     s= s.replace('<', '')
  11.     s=s.replace('"','')
  12.     s=s.replace('&','')
  13.     s=s.replace(''','')
  14.     return s
  15.  
  16. def recursively_apply(l, f):
  17.     for n, i in enumerate(l):
  18.         if type(i) is list:
  19.             l[n] = recursively_apply(l[n], f)
  20.         elif type(i) is str:
  21.             l[n] = f(i)
  22.     return l
  23.  
  24. data=recursively_apply(data, replace_chars)
  25. headers =["target","id","date","flag",'user',"text"]
  26. data=pd.DataFrame(data,columns=headers)
  27.  
  28. def remove_pattern(input_txt, pattern):
  29.     r = re.findall(pattern, input_txt)
  30.     for i in r:
  31.         input_txt = re.sub(i, '', input_txt)
  32.        
  33.     return input_txt
  34. data['tweet'] = np.vectorize(remove_pattern)(data['text'], "@[\w]*")
  35. data['tweet'] = data['tweet'].replace(r'http\S+', '', regex=True).replace(r'www\S+', '', regex=True)
  36. data['tweet'] = data['tweet'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))
  37. data['tweet'] = data['tweet'].str.replace('[^\w\s]','')
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement