Advertisement
Guest User

Untitled

a guest
Mar 28th, 2020
82
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.32 KB | None | 0 0
  1. import spacy;
  2. import re;
  3.  
  4. # Text Preprocessing Pkg
  5. from spacy.lang.en.stop_words import STOP_WORDS
  6. from string import punctuation
  7. punc = list(punctuation)
  8. nlp = spacy.load('en_core_web_sm')
  9.  
  10. # Build a List of Stopwords
  11. stopwords = list(STOP_WORDS)
  12. whitelist = {};
  13.  
  14. internalScaleFactor = 1;
  15. externalScaleFactor = 2;
  16.  
  17. def cleanStringPunc(string):
  18. for i in punc:
  19. string=string.replace(i,'')
  20. return string;
  21.  
  22. def scale(dict, proportion):
  23. maximum_frequency = max(dict.values())
  24. for word in dict.keys():
  25. dict[word] = (dict[word]/maximum_frequency*proportion)
  26.  
  27. #Merge dict2 into dict1
  28. def mergeDict(dict1, dict2):
  29. for i in dict2.keys():
  30. word = i.lower();
  31. if word not in dict1.keys():
  32. dict1.set(word, dict2.get(i));
  33. else:
  34. dict1[word] += dict2[i]
  35.  
  36. def summarize(document,wlistadd,keywords):
  37. # Build an NLP Object
  38. docx = nlp(document)
  39. stripdoc = nlp(cleanStringPunc(document).lower());
  40.  
  41. # Build Word Frequency
  42. # word.text is tokenization in spacy
  43. word_frequencies = {}
  44. for word in stripdoc:
  45. if word.text not in stopwords:
  46. if word.text not in word_frequencies.keys():
  47. word_frequencies[word.text] = 1
  48. else:
  49. word_frequencies[word.text] += 1
  50.  
  51. # Maximum Word Frequency
  52. scale(word_frequencies, internalScaleFactor)
  53. scale(keywords,externalScaleFactor)
  54. mergeDict(word_frequencies,keywords)
  55. # Frequency Table
  56.  
  57. # Sentence Tokens
  58. sentence_list = [ sentence for sentence in docx.sents ]
  59.  
  60. # Sentence Score via comparing each word with sentence
  61. sentence_scores = {}
  62. for sent in sentence_list:
  63. for word in sent:
  64. if word.text.lower() in word_frequencies.keys():
  65. if sent not in sentence_scores.keys():
  66. sentence_scores[sent] = word_frequencies[word.text.lower()]
  67. else:
  68. sentence_scores[sent] += word_frequencies[word.text.lower()]
  69.  
  70. # Import Heapq
  71. from heapq import nlargest
  72. summarized_sentences = nlargest(10, sentence_scores, key=sentence_scores.get)
  73. #print(summarized_sentences)
  74. final_sentences = [ w.text for w in summarized_sentences ]
  75. summary = ' '.join(final_sentences)
  76. return summary;
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement