Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import nltk
- from nltk.corpus import brown
- heinlein = brown.sents(categories=['science_fiction'])
- import nltk.chunk, itertools
- from nltk.tag import DefaultTagger
- from nltk.tag import UnigramTagger, BigramTagger, TrigramTagger
- from tag_util import backoff_tagger
- from nltk.corpus import conll2000
- ###########################################
- ## Defining POS-tagger
- ###########################################
# NOTE(review): this definition shadows the `backoff_tagger` imported from
# `tag_util` at the top of the file.
def backoff_tagger(train_sents, tagger_classes, backoff=None):
    """Build a chain of taggers, each one backing off to the previous one.

    The classes are instantiated in the order given; every new tagger is
    constructed as ``cls(train_sents, backoff=<tagger built so far>)``.

    Args:
        train_sents: training data handed unchanged to every tagger class.
        tagger_classes: iterable of tagger classes (or factories) accepting
            ``(train_sents, backoff=...)``.
        backoff: tagger used as the fallback of the first class; may be None.

    Returns:
        The tagger built from the last class, or ``backoff`` itself when
        ``tagger_classes`` is empty.
    """
    for cls in tagger_classes:
        backoff = cls(train_sents, backoff=backoff)
    return backoff
# POS-tagged CoNLL-2000 sentences: one split to train on, one held out.
train_POS_sents, test_POS_sents = (
    conll2000.tagged_sents(fileid) for fileid in ('train.txt', 'test.txt')
)

# Tagger chain: trigram -> bigram -> unigram -> DefaultTagger('NN').
backoff = DefaultTagger('NN')
tagger = backoff_tagger(
    train_POS_sents,
    [UnigramTagger, BigramTagger, TrigramTagger],
    backoff=backoff,
)
# Accuracy on the held-out split can be checked with:
#print tagger.evaluate(test_POS_sents)
- ###########################################
- ## Training IOB-tagger
- ###########################################
# Chunk-annotated CoNLL-2000 sentences: training split and held-out split.
train_IOB_sents, test_IOB_sents = (
    conll2000.chunked_sents(fileid) for fileid in ('train.txt', 'test.txt')
)
def conll_tag_chunks(chunk_sents):
    """Turn chunk trees into (POS tag, chunk tag) training sentences.

    Each tree is flattened with ``nltk.chunk.tree2conlltags`` into
    (word, tag, chunk) triples; the word is dropped so that the result can be
    used to train a tagger that maps POS tags to chunk tags.

    Args:
        chunk_sents: iterable of chunk trees.

    Returns:
        A list with one list of ``(tag, chunk)`` pairs per input tree.
    """
    training = []
    for tree in chunk_sents:
        triples = nltk.chunk.tree2conlltags(tree)
        training.append([(pos, iob) for (_word, pos, iob) in triples])
    return training
class TagChunker(nltk.chunk.ChunkParserI):
    """Chunker that assigns chunk tags by "tagging" a sequence of POS tags.

    The training trees are converted with ``conll_tag_chunks`` and fed to a
    backoff chain built by ``backoff_tagger``.
    """

    # A tuple default avoids sharing one mutable list between all calls; the
    # classes are only iterated, so callers passing a list keep working.
    def __init__(self, train_chunks,
                 tagger_classes=(UnigramTagger, BigramTagger)):
        """Train the chunk tagger.

        Args:
            train_chunks: iterable of chunk trees used for training.
            tagger_classes: tagger classes chained by ``backoff_tagger``, in
                the order given.
        """
        train_sents = conll_tag_chunks(train_chunks)
        self.tagger = backoff_tagger(train_sents, tagger_classes)

    def parse(self, tagged_sent):
        """Add a chunk tag to every (word, POS tag) pair of a sentence.

        Args:
            tagged_sent: sequence of ``(word, tag)`` pairs.

        Returns:
            A list of ``(word, tag, chunk)`` triples, or None when
            ``tagged_sent`` is empty.
        """
        if not tagged_sent:
            return None
        words, tags = zip(*tagged_sent)
        # The inner tagger sees POS tags as its "words" and returns
        # (tag, chunk) pairs in the same order.
        chunks = self.tagger.tag(tags)
        # Builtin zip replaces itertools.izip, which exists on Python 2 only.
        return [(w, t, c) for (w, (t, c)) in zip(words, chunks)]
chunker = TagChunker(train_IOB_sents)

# POS-tag every sentence first, then let the chunker add the chunk tags.
heinlein_sents_tagged = []
for sent in heinlein:
    pos_tagged = tagger.tag(sent)
    heinlein_sents_tagged.append(chunker.parse(pos_tagged))
-- Table filled by the sentence-matching script: one row per stored span.
-- NOTE(review): the script stores str() of Python lists in tag_sequence and
-- word_sequence; long spans may exceed 255 characters -- confirm that the
-- column width is sufficient.
CREATE TABLE IF NOT EXISTS `tags` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `tag_sequence` varchar(255) DEFAULT NULL,
  `word_sequence` varchar(255) DEFAULT NULL,
  `full_sentence` text,
  `length` int(11) DEFAULT NULL,
  PRIMARY KEY (`id`),
  -- word_sequence is looked up before every insert and tag_sequence is the
  -- grouping key of the report query, so both get a secondary index.
  KEY `idx_tags_word_sequence` (`word_sequence`),
  KEY `idx_tags_tag_sequence` (`tag_sequence`)
) ENGINE=InnoDB DEFAULT CHARSET=latin1 AUTO_INCREMENT=0;
- ###########################################
- ## Matching
- ###########################################
from difflib import SequenceMatcher
import MySQLdb as mdb

# A matching block is stored only when it is strictly longer than this.
MIN_MATCH_LENGTH = 5


def _store_match(curs, tag_seq, words, start, length):
    """Insert one matched span into `tags` unless its words are already stored.

    Args:
        curs: open database cursor.
        tag_seq: "POS||chunk" strings of the whole sentence.
        words: words of the whole sentence.
        start: index of the first matched token in this sentence.
        length: number of matched tokens.
    """
    word_span = str(words[start:start + length])
    # Query parameters must be a sequence, hence the 1-tuple.
    curs.execute("SELECT length FROM tags WHERE word_sequence= %s",
                 (word_span,))
    if curs.fetchone() is None:
        curs.execute(
            """INSERT INTO tags (tag_sequence, word_sequence, full_sentence, length) VALUES (%s, %s, %s, %s)""",
            (str(tag_seq[start:start + length]), word_span, str(words), length))


# Build the tag and word sequences once per sentence instead of once per pair.
# TagChunker.parse already returns (word, tag, chunk) triples, so they are
# read directly; running them through tree2conlltags (which expects a chunk
# tree) would replace every chunk tag with "O".
_prepared = []
for sent in heinlein_sents_tagged:
    if not sent:
        continue  # parse() returns None for an empty sentence
    # A token the chunk tagger could not label (None) is treated as "O".
    _prepared.append((sent,
                      [t + "||" + (c or "O") for w, t, c in sent],
                      [w for w, t, c in sent]))

# NOTE(review): connects as `root` with an empty password to the local `test`
# database -- move the credentials to configuration before using this
# outside a sandbox.
conn = mdb.connect('localhost', 'root', '', 'test')
try:
    curs = conn.cursor()
    for sent, tag_seq, sentence in _prepared:  # every sentence ...
        for another_sent, another_tag_seq, another_sentence in _prepared:  # ... against every other one
            if sent == another_sent:  # skip identical sentences
                continue
            # No junk filter: each element is a whole "POS||chunk" string, so
            # the former test `x == "||"` could never be true.
            m = SequenceMatcher(None, tag_seq, another_tag_seq)
            for i, j, n in m.get_matching_blocks():
                if n > MIN_MATCH_LENGTH:
                    # i indexes the first sequence, j the second one.
                    _store_match(curs, tag_seq, sentence, i, n)
                    _store_match(curs, another_tag_seq, another_sentence, j, n)
    conn.commit()
finally:
    # Close the connection even if matching or a query fails.
    conn.close()
-- List every stored span whose tag sequence occurs more than 6 times in
-- `tags`, most frequent sequences first.
-- The per-sequence count is computed once in a grouped derived table instead
-- of repeating the same correlated subquery in the SELECT list and in WHERE.
SELECT t1.tag_sequence, t1.word_sequence, t1.full_sentence,
       tc.count AS count
FROM tags t1
JOIN (SELECT tag_sequence, COUNT( tag_sequence ) AS count
      FROM tags
      GROUP BY tag_sequence
      HAVING COUNT( tag_sequence ) > 6
     ) tc ON tc.tag_sequence = t1.tag_sequence
ORDER BY count DESC, t1.tag_sequence
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement