# Untitled

import spacy;
import re;

# Text Preprocessing Pkg
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
# Punctuation characters that cleanStringPunc strips from text.
punc = [*punctuation]
# spaCy pipeline, loaded once at import time and shared by every call.
nlp = spacy.load('en_core_web_sm')

# spaCy's English stop words as a list; summarize() leaves these out of
# the word-frequency table.
stopwords = [*STOP_WORDS]
# Not referenced by any code visible in this section.
whitelist = dict()

# Value the most frequent document word is scaled to.
internalScaleFactor = 1
# Value the heaviest caller-supplied keyword is scaled to.
externalScaleFactor = 2

def cleanStringPunc(string):
    """Return ``string`` with every character listed in ``punc`` removed.

    Args:
        string: Text to strip.

    Returns:
        A new string; all characters not in ``punc`` are kept in their
        original order.
    """
    # One translate() pass replaces one str.replace() call per punctuation
    # character. The table is built per call so edits to ``punc`` are
    # still honoured. ``punc`` holds single characters.
    return string.translate(str.maketrans('', '', ''.join(punc)))

def scale(dict, proportion):
    """Rescale the values of ``dict`` in place so the largest equals ``proportion``.

    Every value becomes ``value / max_value * proportion``.

    Args:
        dict: Mapping of key -> numeric weight; modified in place. The
            parameter name shadows the builtin but is kept so existing
            keyword callers keep working.
        proportion: Value the largest entry is scaled to.

    Returns:
        None. An empty mapping, or one whose largest value is 0, is left
        unchanged instead of raising.
    """
    # max() raises ValueError on an empty sequence.
    if not dict:
        return
    maximum_frequency = max(dict.values())
    # Avoid ZeroDivisionError when every weight is zero.
    if maximum_frequency == 0:
        return
    for word in dict:
        dict[word] = dict[word] / maximum_frequency * proportion

#Merge dict2 into dict1
def mergeDict(dict1, dict2):
    """Merge ``dict2`` into ``dict1`` in place, lower-casing ``dict2``'s keys.

    A key already present in ``dict1`` (after lower-casing) has the incoming
    value added to it; a new key is inserted with the incoming value.

    Args:
        dict1: Destination mapping; modified in place. Its existing keys are
            used as-is (not lower-cased).
        dict2: Source mapping of string keys to numeric values; not modified.

    Returns:
        None.
    """
    for key, value in dict2.items():
        word = key.lower()
        if word in dict1:
            dict1[word] += value
        else:
            # dict has no .set() method; plain item assignment inserts the key.
            dict1[word] = value

#document ="""Machine learning (ML) is the scientific study of algorithms and statistical models that computer systems use to progressively improve their performance on a specific task. Machine learning algorithms build a mathematical model of sample data, known as "training data", in order to make predictions or decisions without being explicitly programmed to perform the task. Machine learning algorithms are used in the applications of email filtering, detection of network intruders, and computer vision, where it is infeasible to develop an algorithm of specific instructions for performing the task. Machine learning is closely related to computational statistics, which focuses on making predictions using computers. The study of mathematical optimization delivers methods, theory and application domains to the field of machine learning. Data mining is a field of study within machine learning, and focuses on exploratory data analysis through unsupervised learning.In its application across business problems, machine learning is also referred to as predictive analytics."""

# Sample input text (an encyclopedia-style passage about UCLA, including
# bracketed citation markers) passed to summarize() by the call at the
# bottom of this file.
doc = """The University of California, Los Angeles (UCLA)[1] is a public research university in Los Angeles. UCLA traces its early origins back to 1882 as the southern branch of the California State Normal School (now San Jose State University). It became the Southern Branch of the University of California in 1919, making it the fourth-oldest (after UC Berkeley, UC San Francisco, and UC Davis) of the 10-campus University of California system and oldest of the campuses in Southern California.[11] It offers 337 undergraduate and graduate degree programs in a wide range of disciplines.[12] UCLA enrolls about 31,500 undergraduate and 12,800 graduate students[7] and had 119,000 applicants for Fall 2016, including transfer applicants, making the school the most applied-to of any American university.[13]
The university is organized into six undergraduate colleges, seven professional schools, and four professional health science schools. The undergraduate colleges are the College of Letters and Science; Samueli School of Engineering; School of the Arts and Architecture; Herb Alpert School of Music; School of Theater, Film and Television; and School of Nursing.
As of 2017, 24 Nobel laureates, three Fields Medalists, and five Turing Award winners, and two Chief Scientists of the U.S. Air Force have been affiliated with UCLA as faculty, researchers, or alumni.[14][15][16] Among the current faculty members, 55 have been elected to the National Academy of Sciences, 28 to the National Academy of Engineering, 39 to the Institute of Medicine, and 124 to the American Academy of Arts and Sciences.[17] The university was elected to the Association of American Universities in 1974.[18]

UCLA is considered one of the country's Public Ivy universities, meaning that it is a public university thought to provide a quality of education comparable with that of the Ivy League. US News & World Report named UCLA the best public university in the United States for 2019.[19]

UCLA student-athletes compete as the Bruins in the Pac-12 Conference. The Bruins have won 129 national championships, including 118 NCAA team championships, more than any other university except Stanford University, whose athletes have won 126.[20][21][22] UCLA student-athletes, coaches and staff won 251 Olympic medals: 126 gold, 65 silver, and 60 bronze.[23] UCLA student-athletes competed in every Olympics since 1920 with one exception (1924) and won a gold medal in every Olympics the U.S. participated in since 1932"""

def summarize(document, wlistadd, keywords, max_sentences=10):
    """Build an extractive summary of ``document`` from its top-scoring sentences.

    Words are counted in a punctuation-stripped, lower-cased copy of the
    text (stop words excluded), and the counts are scaled so the most
    frequent word weighs ``internalScaleFactor``. ``keywords`` are scaled so
    the heaviest weighs ``externalScaleFactor`` and are then merged into the
    word weights. Each sentence of the original text scores the sum of the
    weights of its tokens.

    The ranked word list, the sentence scores and the summary are also
    printed to stdout.

    Args:
        document: Text to summarize.
        wlistadd: Not used by this function; kept so existing callers keep
            working.
        keywords: Mapping of word -> weight that boosts sentences containing
            that word. May be empty. The caller's mapping is not modified.
        max_sentences: Maximum number of sentences in the summary.

    Returns:
        The selected sentences joined with ``'\\n '``, highest score first.
        An empty string when no sentence receives a score.
    """
    from heapq import nlargest

    docx = nlp(document)
    # Second parse on a normalized copy, used only for word counting.
    stripdoc = nlp(cleanStringPunc(document).lower())

    # Set built once so each stop-word test is O(1) instead of a list scan.
    stopword_set = set(stopwords)
    word_frequencies = {}
    for token in stripdoc:
        text = token.text
        if text not in stopword_set:
            word_frequencies[text] = word_frequencies.get(text, 0) + 1

    # scale() takes max() of the values, which fails on an empty mapping,
    # so empty inputs are skipped.
    if word_frequencies:
        scale(word_frequencies, internalScaleFactor)
    # Work on a copy: scale() rewrites values in place and the caller's
    # dict must not change as a side effect of summarizing.
    keyword_weights = dict(keywords) if keywords else {}
    if keyword_weights:
        scale(keyword_weights, externalScaleFactor)
    mergeDict(word_frequencies, keyword_weights)
    print(sorted(word_frequencies, key=word_frequencies.get, reverse=True))

    # Score sentences of the ORIGINAL parse so the summary keeps the
    # original casing and punctuation. A sentence with no weighted token
    # gets no entry at all.
    sentence_scores = {}
    for sent in docx.sents:
        for word in sent:
            weight = word_frequencies.get(word.text.lower())
            if weight is not None:
                sentence_scores[sent] = sentence_scores.get(sent, 0) + weight
    print(sentence_scores)

    summarized_sentences = nlargest(
        max_sentences, sentence_scores, key=sentence_scores.get
    )
    summary = '\n '.join(sentence.text for sentence in summarized_sentences)
    print("==========\nSummary:")
    print(summary)

    return summary

# Demo run at import time: summarize the sample text, boosting sentences
# that mention "nobel" or "offers".
summarize(
    document=doc,
    wlistadd={},
    keywords={"nobel": 10, "offers": 20},
)