Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import pandas as pd
class preprocessor:
    """Load a text file and normalise its contents for word counting.

    The text is kept in ``book_content``; ``str(instance)`` returns it.
    """

    # Characters replaced by a space so the words on either side stay apart.
    _TO_SPACE = "-_"
    # Characters removed outright.
    _TO_DROP = "':./,()~$[]*@—!#?;\"%"
    # Built once: a single translate() pass does the work of a long chain of
    # str.replace() calls.
    _TABLE = str.maketrans(_TO_SPACE, " " * len(_TO_SPACE), _TO_DROP)

    def __init__(self):
        self.book_content = ''

    def __str__(self):
        return str(self.book_content)

    def clean(self):
        """Strip punctuation from ``book_content`` and lower-case it.

        Hyphens and underscores become spaces; every other listed symbol is
        deleted. Whitespace is left untouched, so removing a symbol that sat
        between two spaces leaves both spaces in place.
        """
        self.book_content = self.book_content.translate(self._TABLE).lower()

    def read_text(self, text_name, encoding="utf-8"):
        """Read the whole file ``text_name`` into ``book_content``.

        Args:
            text_name: Path of the text file to read.
            encoding: Text encoding used to decode the file. Defaults to
                UTF-8 so the result does not depend on the platform locale.

        Raises:
            OSError: If the file cannot be opened.
            UnicodeDecodeError: If the file is not valid in ``encoding``.
        """
        # The context manager closes the handle even if reading fails.
        with open(text_name, 'r', encoding=encoding) as handle:
            self.book_content = handle.read()
class WordAnalyser:
    """Count whitespace-separated words and report their relative frequency.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self):
        # word -> number of occurrences seen so far (across all
        # analyse_words calls on this instance).
        self.word_counts = {}
        # word -> share of all counted words, as of the most recent
        # get_word_frequency call.
        self.word_frequency = {}

    def __str__(self):
        return str(self.word_counts)

    def analyse_words(self, book_text):
        """Split ``book_text`` on whitespace and add each word to the counts.

        The list of words from this call is also kept in ``book_text``.
        """
        self.book_text = book_text.split()
        for word in self.book_text:
            self.word_counts[word] = self.word_counts.get(word, 0) + 1

    def get_word_frequency(self):
        """Return a dict mapping each word to its share of all counted words.

        The result is also stored in ``word_frequency``. ``word_counts`` is
        left untouched, so this method can be called repeatedly and more text
        can be analysed afterwards. Returns an empty dict when nothing has
        been counted.
        """
        total_words = sum(self.word_counts.values())
        # Store the result under its own attribute name: assigning it to
        # ``self.get_word_frequency`` would shadow this method on the
        # instance and make every later call fail.
        self.word_frequency = {
            word: count / total_words
            for word, count in self.word_counts.items()
        }
        return self.word_frequency
class IDFAnalyser:
    """Collect per-book word frequencies in one pandas DataFrame.

    ``data`` holds one row per book title and one column per word.
    """

    def __init__(self):
        self.data = pd.DataFrame()

    def load_frequency(self, book_frequency, book_title):
        """Add one book's word frequencies to ``data`` as a row.

        Args:
            book_frequency: Mapping of word -> frequency for the book.
            book_title: Label for the row; converted with ``str``.

        Rows accumulate across calls. Words that a book does not contain are
        NaN in that book's row. Loading a title that is already present
        replaces its earlier row.
        """
        row = pd.DataFrame(book_frequency, index=[str(book_title)])
        # Drop any earlier row for the same title so labels stay unique.
        remaining = self.data.drop(index=row.index, errors="ignore")
        if remaining.index.empty:
            self.data = row
        else:
            self.data = pd.concat([remaining, row])

    def get_IDF(self, term):
        """Not implemented yet; currently returns None for every term."""
        # TODO: compute the inverse document frequency of ``term``.
        pass

    def __str__(self):
        # __str__ must return a str; returning the DataFrame itself makes
        # str(obj) raise TypeError.
        return str(self.data)
# Books to analyse, as (path of the text file, label for its DataFrame row).
BOOKS = (
    ('1952-0.txt', '199'),
    ('11-0.txt', '133'),
)


def report_book(text_name, book_title):
    """Read, clean and analyse one book, printing its results.

    Prints the word-frequency dict first and then the one-row frequency
    table for the book. Returns the IDFAnalyser that holds the table.
    """
    reader = preprocessor()
    reader.read_text(text_name)
    reader.clean()

    counter = WordAnalyser()
    counter.analyse_words(str(reader))
    frequencies = counter.get_word_frequency()
    print(frequencies)

    table = IDFAnalyser()
    table.load_frequency(frequencies, book_title)
    print(table.data)
    return table


for text_name, book_title in BOOKS:
    report_book(text_name, book_title)

# TODO: each book is still loaded into its own IDFAnalyser; the goal is to
# hold the frequencies of both texts in the same DataFrame.
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement