Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# coding:utf-8
import re
from collections import Counter
class Parser(object):
    """Split a text into sentences and pick a representative sentence pair.

    The text is split once at construction time; the resulting list is kept
    in ``self.sentences`` and the original string in ``self.text``.
    """

    # Abbreviations whose trailing period must not end a sentence. The
    # leading ``\b`` keeps the pattern from firing inside longer words
    # (e.g. the "Ex." at the end of "FedEx.").
    _ABBREVIATION = re.compile(r"\b(Sr|Sra|Dr|Dra|Ex|Vs|Versus)\.")
    # Characters that terminate a sentence.
    _SENTENCE_END = re.compile(r"[!?.\n]")
    # Punctuation stripped before sentences are compared word by word.
    _PUNCTUATION = re.compile(r"[?.!,;:()'\"*-]")

    def __init__(self, text):
        """Store ``text`` and split it into sentences."""
        self.text = text
        self.sentences = self.split_sentences()

    def split_sentences(self):
        """Split ``self.text`` into a list of trimmed, non-empty sentences.

        A sentence ends at ``!``, ``?``, ``.`` or a newline. The period that
        follows one of the known abbreviations (``Dr.``, ``Sra.``, ...) is
        dropped first so it does not split the sentence; the abbreviation
        itself is kept. Terminators are not part of the returned strings.
        """
        text = self._ABBREVIATION.sub(r"\1", self.text)
        # Strip each fragment individually so whitespace-only fragments
        # (e.g. between ". " and a newline) are discarded as well.
        fragments = (part.strip() for part in self._SENTENCE_END.split(text))
        return [fragment for fragment in fragments if fragment]

    def remove_punct(self, sentence):
        """Return ``sentence`` with punctuation characters removed.

        Removed characters: ``? . ! , ; - : ( ) ' " *``. Whitespace is left
        untouched.
        """
        return self._PUNCTUATION.sub('', sentence)

    def is_sentence(self, sentence):
        """Return True when ``sentence`` contains at least one space."""
        return ' ' in sentence

    def intersec_score(self, sentenceX, sentenceY):
        """Return the number of words the two sentences have in common.

        Punctuation is removed and both sentences are split on whitespace.
        The comparison is case-sensitive, and a repeated word counts as
        many times as it occurs in *both* sentences (multiset intersection).

        Returns 0 when either argument fails ``is_sentence``.
        """
        if not (self.is_sentence(sentenceX) and self.is_sentence(sentenceY)):
            return 0
        words_x = Counter(self.remove_punct(sentenceX).split())
        words_y = Counter(self.remove_punct(sentenceY).split())
        # ``&`` keeps each shared word with the smaller of its two counts.
        return sum((words_x & words_y).values())

    def calc_score(self):
        """Score every ordered pair of distinct sentences and pick one pair.

        Pairs are visited in order; a pair is kept as a candidate only when
        its ``intersec_score`` is strictly higher than every score seen so
        far. The candidates are then handed to ``get_average``.

        Returns the chosen pair formatted as ``'<first>. <second>.'``, or
        ``None`` when no pair scored above zero.
        """
        # NOTE(review): the source this was recovered from had lost its
        # indentation; recording a candidate only on a new high score is the
        # assumed nesting -- confirm against the original file.
        best_score = 0
        candidates = []
        for first in self.sentences:
            for second in self.sentences:
                # Sentences are compared by text, so duplicates are skipped.
                if first == second:
                    continue
                score = self.intersec_score(first, second)
                if score > best_score:
                    best_score = score
                    candidates.append({
                        'score': score,
                        'sentence': '{}. {}.'.format(first, second),
                    })
        return self.get_average(candidates)

    def get_average(self, data):
        """Return the ``'sentence'`` whose score is closest to the mean score.

        ``data`` is a list of dicts with ``'score'`` and ``'sentence'`` keys.
        The mean is rounded before comparing; on a tie the earliest entry
        wins. Returns ``None`` for an empty list.
        """
        if not data:
            # Nothing to average; avoids dividing by zero below.
            return None
        average = round(sum(entry['score'] for entry in data) / len(data))
        # min() returns the first minimal element, which gives the
        # earliest-entry tie-break.
        closest = min(data, key=lambda entry: abs(entry['score'] - average))
        return closest['sentence']
Add Comment
Please, Sign In to add comment