Untitled

import nltk
from nltk.corpus import brown
# Sentences of the Brown corpus 'science_fiction' category; this is the
# text that gets POS-tagged, chunked and compared further down.
heinlein = brown.sents(categories=['science_fiction'])

import nltk.chunk, itertools
from nltk.tag import DefaultTagger
from nltk.tag import UnigramTagger, BigramTagger, TrigramTagger
from tag_util import backoff_tagger
from nltk.corpus import conll2000
###########################################
##  Defining POS-tagger
###########################################
def backoff_tagger(train_sents, tagger_classes, backoff=None):
    """Build a chain of taggers in which each one backs off to the previous.

    Every class in ``tagger_classes`` is instantiated in order as
    ``cls(train_sents, backoff=<tagger built so far>)``.

    :param train_sents: training data handed unchanged to every class.
    :param tagger_classes: iterable of tagger classes, innermost first.
    :param backoff: initial fallback for the first class (may be None).
    :return: the last tagger built, or ``backoff`` itself when
        ``tagger_classes`` is empty.
    """
    chain = backoff
    for tagger_cls in tagger_classes:
        # The tagger built so far becomes the fallback of the next one.
        chain = tagger_cls(train_sents, backoff=chain)
    return chain

# POS-tagged sentences from the CoNLL-2000 corpus. The test split is only
# referenced by the commented-out evaluation line below.
test_POS_sents = conll2000.tagged_sents('test.txt')
train_POS_sents = conll2000.tagged_sents('train.txt')

# Last-resort fallback: a DefaultTagger constructed with the tag 'NN'.
backoff = DefaultTagger('NN')
# Chain built by backoff_tagger: the trigram tagger is outermost and backs
# off to the bigram tagger, then the unigram tagger, then the default above.
tagger = backoff_tagger(train_POS_sents, [UnigramTagger, BigramTagger,
                                          TrigramTagger], backoff=backoff)

#print tagger.evaluate(test_POS_sents)
###########################################
## Training IOB-tagger
###########################################

# Chunked sentences from the CoNLL-2000 corpus. Only the training split is
# used in the code visible here; the test split is loaded but not referenced.
test_IOB_sents = conll2000.chunked_sents('test.txt')
train_IOB_sents = conll2000.chunked_sents('train.txt')


def conll_tag_chunks(chunk_sents):
    """Turn chunk trees into (pos_tag, chunk_tag) training sentences.

    Each tree is flattened with ``nltk.chunk.tree2conlltags`` into
    (word, pos_tag, chunk_tag) triples and the word is then dropped.

    :param chunk_sents: iterable of chunk trees.
    :return: list with one list of (pos_tag, chunk_tag) pairs per tree.
    """
    pair_sents = []
    for tree in chunk_sents:
        triples = nltk.chunk.tree2conlltags(tree)
        pair_sents.append([(pos, iob) for (_word, pos, iob) in triples])
    return pair_sents
class TagChunker(nltk.chunk.ChunkParserI):
    """Chunker that predicts chunk tags from part-of-speech tags alone.

    A backoff chain of taggers is trained on (pos_tag, chunk_tag) pairs, so
    "tagging" a sequence of POS tags yields the chunk tag for each position.
    """

    def __init__(self, train_chunks, tagger_classes=(UnigramTagger, BigramTagger)):
        """Train the POS-tag -> chunk-tag tagger chain.

        :param train_chunks: chunk trees to train on (converted with
            ``conll_tag_chunks``).
        :param tagger_classes: tagger classes chained by ``backoff_tagger``,
            innermost first. The default is an immutable tuple so it can
            never be shared and mutated between instances.
        """
        train_sents = conll_tag_chunks(train_chunks)
        self.tagger = backoff_tagger(train_sents, tagger_classes)

    def parse(self, tagged_sent):
        """Attach a chunk tag to every token of a POS-tagged sentence.

        :param tagged_sent: sequence of (word, pos_tag) pairs.
        :return: list of (word, pos_tag, chunk_tag) triples, or ``None``
            when ``tagged_sent`` is empty.
        """
        if not tagged_sent:
            return None
        words, tags = zip(*tagged_sent)
        # The "words" fed to the trained tagger are the POS tags; each
        # result item is a (pos_tag, chunk_tag) pair.
        chunks = self.tagger.tag(tags)
        # Built-in zip instead of itertools.izip: same result on Python 2,
        # and izip does not exist on Python 3.
        return [(word, pos, iob) for word, (pos, iob) in zip(words, chunks)]

# Train the chunker on the CoNLL-2000 training trees, then POS-tag every
# Brown sentence and run the chunker over the tagged result. Each entry is
# whatever TagChunker.parse returns: a list of (word, pos_tag, chunk_tag)
# triples, or None for an empty sentence.
chunker = TagChunker(train_IOB_sents)
heinlein_sents_tagged =[chunker.parse(tagger.tag(sent))
                        for sent in heinlein]

-- Table filled by the matching script below: one row per matched span.
--   tag_sequence  : str() of the list of "pos||chunk" tags of the span
--   word_sequence : str() of the list of words of the span; the script looks
--                   rows up by this column before inserting
--   full_sentence : str() of the word list of the whole sentence
--   length        : number of tokens in the matched span
-- NOTE(review): the sequence columns are varchar(255); longer stringified
-- spans would not fit - confirm this is acceptable for the data.
CREATE TABLE IF NOT EXISTS `tags` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `tag_sequence` varchar(255) DEFAULT NULL,
  `word_sequence` varchar(255) DEFAULT NULL,
  `full_sentence` text,
  `length` int(11) DEFAULT NULL,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB  DEFAULT CHARSET=latin1 AUTO_INCREMENT=0;

###########################################
##      Matching
###########################################
from difflib import SequenceMatcher
import MySQLdb as mdb
# Only matching blocks strictly longer than this many tokens are stored.
MIN_MATCH_LENGTH = 5


def _conll_triples(parsed):
    """Return (word, pos_tag, chunk_tag) triples for one chunker result.

    Accepts a chunk tree, an already-flat list of triples (what
    TagChunker.parse in this file returns) or None (empty sentence).
    """
    if parsed is None:
        return []
    if isinstance(parsed, nltk.Tree):
        return nltk.chunk.tree2conlltags(parsed)
    # Flat triples must NOT go through tree2conlltags: it treats every
    # element as a leaf outside any chunk and rewrites the chunk tag to "O".
    # A chunk tag is None when the chunk tagger could not decide; "O" keeps
    # the string concatenation below safe.
    return [(word, pos, iob if iob is not None else "O")
            for (word, pos, iob) in parsed]


def _store_match(cursor, tags, words, full_sentence, length):
    """Insert one matched span unless its word sequence is already stored."""
    word_sequence = str(words)
    # Query arguments must be a sequence; a bare string is not a 1-tuple.
    cursor.execute("SELECT length FROM tags WHERE word_sequence= %s",
                   (word_sequence,))
    if cursor.fetchone() is None:
        cursor.execute("""INSERT INTO tags (tag_sequence, word_sequence, full_sentence, length) VALUES (%s, %s, %s, %s)""",
                       (str(tags), word_sequence, str(full_sentence), length))


conn = mdb.connect('localhost', 'root', '', 'test')
curs = conn.cursor()

try:
    # Convert every sentence once, up front, instead of re-deriving the tag
    # and word lists inside the quadratic comparison loop.
    prepared = []
    for parsed in heinlein_sents_tagged:
        triples = _conll_triples(parsed)
        if not triples:
            continue  # empty sentence: nothing to match
        prepared.append((triples,
                         [t + "||" + c for w, t, c in triples],
                         [w for w, t, c in triples]))

    for triples, tag_seq, sentence in prepared:  # every sentence ...
        for other_triples, another_tag_seq, another_sentence in prepared:  # ... against every other
            if triples == other_triples:  # skip identical sentences
                continue
            # Each element is a whole "pos||chunk" string, so no junk filter
            # is needed (an element can never equal the bare separator).
            m = SequenceMatcher(None, tag_seq, another_tag_seq)
            for i, j, n in m.get_matching_blocks():
                if n <= MIN_MATCH_LENGTH:
                    continue
                # The block satisfies tag_seq[i:i+n] == another_tag_seq[j:j+n]:
                # i indexes the first sentence, j the second one.
                _store_match(curs, tag_seq[i:i + n], sentence[i:i + n],
                             sentence, n)
                _store_match(curs, another_tag_seq[j:j + n],
                             another_sentence[j:j + n], another_sentence, n)

    conn.commit()
finally:
    # Release the connection even when matching or a query fails.
    conn.close()

-- List every stored span whose tag sequence occurs more than 6 times in
-- `tags`, together with that occurrence count, most frequent first.
SELECT t1.tag_sequence, t1.word_sequence, t1.full_sentence, freq.`count`
FROM tags t1
JOIN (
    -- One row per tag sequence that is frequent enough.
    SELECT tc.tag_sequence, COUNT( tc.tag_sequence ) AS `count`
    FROM tags tc
    GROUP BY tc.tag_sequence
    HAVING COUNT( tc.tag_sequence ) > 6
) freq ON freq.tag_sequence = t1.tag_sequence
ORDER BY freq.`count` DESC, t1.tag_sequence