Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Part-of-speech tagging demo: fetch a BBC news article, strip its markup,
# tokenize it, and tag the tokens with a unigram tagger trained on the
# "news" category of the Brown corpus.
import re
from contextlib import closing

try:
    from urllib.request import urlopen  # Python 3
except ImportError:
    from urllib import urlopen  # Python 2

import nltk
from nltk.corpus import brown
# from nltk.corpus import mac_morpho  # PT-BR


def _fetch_text(address):
    """Download *address* and return the response body decoded to text."""
    # closing() guarantees the connection is released on both Python 2 and 3.
    with closing(urlopen(address)) as response:
        body = response.read()
        # Only Python 3 header objects expose get_content_charset().
        get_charset = getattr(response.headers, "get_content_charset", None)
    charset = get_charset() if get_charset is not None else None
    try:
        return body.decode(charset or "utf-8", errors="replace")
    except LookupError:
        # The server declared an encoding Python does not know about.
        return body.decode("utf-8", errors="replace")


def _strip_markup(markup):
    """Return *markup* with scripts, styles, comments and tags removed."""
    stripped = re.sub(r"(?is)<(script|style)\b.*?>.*?</\1>", "", markup)
    # Comments go before generic tags because they may contain '>'.
    stripped = re.sub(r"(?s)<!--.*?-->", "", stripped)
    stripped = re.sub(r"(?s)<[^>]*>", " ", stripped)
    stripped = stripped.replace("&nbsp;", " ")
    return re.sub(r"\s+", " ", stripped).strip()


url = "http://news.bbc.co.uk/2/hi/health/2284783.stm"
html = _fetch_text(url)
try:
    raw = nltk.clean_html(html)
except (AttributeError, NotImplementedError):
    # NLTK 3 replaced clean_html() with a stub that raises
    # NotImplementedError, so strip the markup locally instead.
    raw = _strip_markup(html)
text = nltk.word_tokenize(raw)

# To read the input from a local file instead of the web:
#     f = open("C:\\folder\\file.txt")
#     f.readline()

train_corpus = brown.tagged_sents(categories="news")
tags = [tag for (_word, tag) in brown.tagged_words(categories="news")]

# Example: the single most frequent *tag* in the training category.
most_common_tag = nltk.FreqDist(tags).max()

# Example of how tagging works: this tagger labels every token as a
# noun ('NN').
default_tagger = nltk.DefaultTagger('NN')
default_tagged = default_tagger.tag(text)

# Tag with the model trained on train_corpus; tokens it has no match for
# are handed to the backoff tagger, which labels them 'NN'.
t0 = nltk.DefaultTagger('NN')
t1 = nltk.UnigramTagger(train_corpus, backoff=t0)
tagged_text = t1.tag(text)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement