Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- ##POS tagging is labeling each word in a sentence with its part of speech (noun, adjective, verb, etc.)
- import nltk
- from nltk.corpus import state_union
- from nltk.tokenize import PunktSentenceTokenizer
- ##PunktSentenceTokenizer is a new sentence tokenizer
- ## This tokenizer is capable of unsupervised machine learning,
- ##so you can actually train it on any body of text that you use
## Load two State of the Union addresses: the 2005 speech is the training
## text and the 2006 speech is the sample that gets tokenized.
train_text, sample_text = (
    state_union.raw(fileid) for fileid in ("2005-GWBush.txt", "2006-GWBush.txt")
)
## Train a Punkt sentence tokenizer on the training text, then use it to
## split the sample text into sentences.
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenized = custom_sent_tokenizer.tokenize(sample_text)
def process_content():
    """Chunk every sentence in ``tokenized`` and print each ``Chunk`` subtree.

    Each sentence of the module-level ``tokenized`` sequence is split into
    words and POS-tagged, then parsed with a regular-expression chunk
    grammar. Every subtree labelled ``Chunk`` is printed.

    Returns:
        None. Results are written to standard output.

    Any exception raised while parsing is caught and its message printed;
    processing stops at the first failure instead of propagating the error.
    """
    ## Chunk grammar over POS tags: zero or more adverbs (RB*), then zero or
    ## more verbs (VB*), then one or more proper nouns (NNP), optionally
    ## followed by a singular noun (NN).
    chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""
    try:
        ## The grammar is constant, so build the parser once instead of
        ## rebuilding it for every sentence.
        chunkParser = nltk.RegexpParser(chunkGram)
        for sentence in tokenized:
            words = nltk.word_tokenize(sentence)
            tagged = nltk.pos_tag(words)
            ## "chunked" is an NLTK tree; each chunk is a subtree of it.
            ## Call chunked.draw() here to inspect the tree graphically.
            chunked = chunkParser.parse(tagged)
            ## Print only the subtrees carrying the "Chunk" label assigned
            ## by the grammar above.
            for subtree in chunked.subtrees(filter=lambda t: t.label() == 'Chunk'):
                print(subtree)
    except Exception as e:
        print(str(e))
- process_content()  ## Run the chunking demo over the sentences in "tokenized".
- ##Chinking is a lot like chunking, it is basically a way for you to remove a
- ##chunk from a chunk.
- ##The chunk that you remove from your chunk is your chink.
- ##chunkGram = r"""Chunk: {<.*>+}
- ## }<VB.?|IN|DT|TO>+{"""
- ##This means we're removing from the chunk one or more
- ##verbs, prepositions, determiners, or the word 'to'.
Add Comment
Please, Sign In to add comment