Advertisement
Guest User

Untitled

a guest
Oct 17th, 2019
90
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.55 KB | None | 0 0
  1. import pandas as pd
  2.  
  3. class preprocessor:
  4.  
  5. def __init__(self):
  6. self.book_content=''
  7.  
  8. def __str__(self):
  9. return str(self.book_content)
  10.  
  11. def clean(self):
  12. #at condition for that none and 1
  13. #remove more special charters
  14. self.book_content=self.book_content.replace("'",'').replace('-',' ').replace('_',' ').replace(':','')\
  15. .replace('.','').replace('/','').replace(',','').replace('(','').replace(')','').replace('~','')\
  16. .replace('$','').replace('[','').replace(']','').replace('*','').replace('@','').replace('—','')\
  17. .replace('!','').replace('#','').replace('?','').replace(';','').replace('"','')\
  18. .replace('%','')
  19. self.book_content=self.book_content.lower()
  20.  
  21.  
  22. def read_text(self,text_name):
  23. read=open(text_name,'r')
  24. self.book_content=read.read()
  25.  
  26. class WordAnalyser:
  27.  
  28. def __init__(self):
  29. self.word_counts={}
  30.  
  31. def __str__(self):
  32. return str((self.word_counts))
  33.  
  34. def analyse_words(self, book_text):
  35. self.book_text=book_text.split()
  36. for word in self.book_text:
  37. if word not in self.word_counts:
  38. self.word_counts[word]=1
  39. else:
  40. self.word_counts[word]+=1
  41.  
  42. def get_word_frequency(self):
  43. word=self.word_counts.keys()
  44. word_values=self.word_counts.values()
  45. word_values=list(word_values)
  46. total_words=sum(word_values)
  47. word_frequency=[]
  48. for values in word_values:
  49. word_frequency.append(values/total_words)
  50. self.get_word_frequency=dict(zip(word,word_frequency))
  51. self.word_counts=self.get_word_frequency
  52. return self.word_counts
  53.  
  54. class IDFAnalyser:
  55.  
  56. def __init__(self):
  57. self.data=pd.DataFrame()
  58.  
  59.  
  60. def load_frequency(self,book_frequency,book_title):
  61. self.data=pd.DataFrame(book_frequency,index=[str(book_title)])
  62.  
  63.  
  64. def get_IDF(self,term):
  65. pass
  66.  
  67. def __str__(self):
  68. return self.data
  69.  
  70. #1952-0/txt
  71. b=preprocessor()
  72. b.read_text('1952-0.txt')
  73. b.clean()
  74. book=b.__str__()
  75.  
  76.  
  77. c=WordAnalyser()
  78. c.analyse_words(book)
  79. co=c.get_word_frequency()
  80. print(co)
  81.  
  82. e=IDFAnalyser()
  83. e.load_frequency(co,'199')
  84. r=e.__str__()
  85. print(r)
  86.  
  87. #11-0.txt
  88. t=preprocessor()
  89. t.read_text('11-0.txt')
  90. t.clean()
  91. book2=t.__str__()
  92.  
  93.  
  94. s=WordAnalyser()
  95. s.analyse_words(book2)
  96. so=s.get_word_frequency()
  97. print(so)
  98.  
  99.  
  100.  
  101. f=IDFAnalyser()
  102. f.load_frequency(so,'133')
  103. u=f.__str__()
  104. print(u)
  105.  
  106. #want the both text load_frequency in the same dataframe
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement