Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- from __future__ import absolute_import
- from __future__ import division, print_function, unicode_literals
- from sumy.parsers.html import HtmlParser
- from sumy.parsers.plaintext import PlaintextParser
- from sumy.nlp.tokenizers import Tokenizer
- from sumy.summarizers.lex_rank import LexRankSummarizer
- from sumy.summarizers.lsa import LsaSummarizer
- from sumy.summarizers.luhn import LuhnSummarizer
- from sumy.summarizers.random import RandomSummarizer
- from sumy.summarizers.text_rank import TextRankSummarizer
- from sumy.nlp.stemmers import Stemmer
- from sumy.utils import get_stop_words
- import sys
- import locale
# Command-line arguments of the sumyapi.py call:
#   1: summarizer name [lex_rank|lsa|luhn|random|text_rank];
#      any other value selects lsa
#   2: language; an empty string selects "german"
#   3: target summary length in sentences; an empty string selects 3
#   4: text to summarize
DEFAULT_LANGUAGE = "german"
DEFAULT_SENTENCE_COUNT = 3


def argument_to_utf8(raw, encoding=None):
    """Return a command-line argument as UTF-8 encoded bytes.

    On Python 2 ``sys.argv`` holds byte strings in whatever encoding the
    caller (the Java side) used, so they are decoded first.  On Python 3
    ``sys.argv`` already holds text, which has no ``decode`` method, so
    the decoding step is skipped.

    :param raw: the argument, either bytes or text.
    :param encoding: encoding used to decode a bytes argument; defaults
        to ``locale.getpreferredencoding()``.
    :returns: the argument encoded as UTF-8; characters that cannot be
        encoded (e.g. lone surrogates) are dropped.
    :raises UnicodeDecodeError: if a bytes argument is not valid in
        ``encoding``.
    """
    if isinstance(raw, bytes):
        raw = raw.decode(encoding or locale.getpreferredencoding())
    return raw.encode("utf-8", "ignore")


def _build_summarizer(name, stemmer):
    """Instantiate the summarizer selected by ``name`` (lsa if unknown)."""
    # Built inside the function so the summarizer classes are only looked
    # up when the script actually runs.
    summarizer_classes = {
        "lex_rank": LexRankSummarizer,
        "lsa": LsaSummarizer,
        "luhn": LuhnSummarizer,
        "random": RandomSummarizer,
        "text_rank": TextRankSummarizer,
    }
    return summarizer_classes.get(name, LsaSummarizer)(stemmer)


def main(argv):
    """Summarize the text given on the command line and print the result.

    :param argv: full argument vector, laid out as described in the
        comment at the top of this block (``argv[0]`` is the script name).
    :raises SystemExit: with a usage message if fewer than four arguments
        were supplied.
    :raises ValueError: if the sentence count is neither empty nor an
        integer.
    """
    if len(argv) < 5:
        sys.exit(
            "usage: sumyapi.py <summarizer> <language> "
            "<sentence count> <text>"
        )

    language = str(argv[2]) or DEFAULT_LANGUAGE
    sentence_count = int(argv[3]) if argv[3] else DEFAULT_SENTENCE_COUNT

    # The parser is handed UTF-8 bytes, exactly as the original script did.
    text = argument_to_utf8(argv[4])
    parser = PlaintextParser.from_string(text, Tokenizer(language))

    summarizer = _build_summarizer(str(argv[1]), Stemmer(language))
    summarizer.stop_words = get_stop_words(language)

    # The Java caller is expected to read this output as UTF-8.
    # NOTE(review): on Python 3 the encoding used by print() depends on
    # how stdout is configured (e.g. PYTHONIOENCODING) -- confirm that the
    # caller's environment really yields UTF-8.
    for sentence in summarizer(parser.document, sentence_count):
        print(sentence)


if __name__ == "__main__":
    main(sys.argv)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement