Advertisement
Guest User

Untitled

a guest
Dec 20th, 2014
168
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 4.51 KB | None | 0 0
  1. import nltk
  2. from nltk.corpus import brown
  3. heinlein = brown.sents(categories=['science_fiction'])
  4.  
  5. import nltk.chunk, itertools
  6. from nltk.tag import DefaultTagger
  7. from nltk.tag import UnigramTagger, BigramTagger, TrigramTagger
  8. from tag_util import backoff_tagger
  9. from nltk.corpus import conll2000
  10. ###########################################
  11. ## Defining POS-tagger
  12. ###########################################
  def backoff_tagger(train_sents, tagger_classes, backoff=None):
      """Build a chain of taggers in which each one backs off to the previous.

      Every class in ``tagger_classes`` is instantiated in order as
      ``cls(train_sents, backoff=<previous tagger>)``; the first class
      receives the ``backoff`` argument as its fallback.

      Returns the last tagger created, or ``backoff`` unchanged when
      ``tagger_classes`` is empty.
      """
      chain = backoff
      for tagger_cls in tagger_classes:
          chain = tagger_cls(train_sents, backoff=chain)
      return chain
  17.  
  # CoNLL-2000 POS-tagged sentences: one split to train on, one held out.
  train_POS_sents = conll2000.tagged_sents('train.txt')
  test_POS_sents = conll2000.tagged_sents('test.txt')

  # Trigram -> bigram -> unigram chain; anything still unknown is tagged 'NN'.
  backoff = DefaultTagger('NN')
  tagger = backoff_tagger(
      train_POS_sents,
      [UnigramTagger, BigramTagger, TrigramTagger],
      backoff=backoff,
  )
  24.  
  25. #print tagger.evaluate(test_POS_sents)
  26. ###########################################
  27. ## Training IOB-tagger
  28. ###########################################
  29.  
  # CoNLL-2000 chunk trees: training split and held-out split.
  train_IOB_sents = conll2000.chunked_sents('train.txt')
  test_IOB_sents = conll2000.chunked_sents('test.txt')
  32.  
  33.  
  def conll_tag_chunks(chunk_sents):
      """Convert chunk trees into (POS tag, IOB chunk tag) training sentences.

      Each tree is flattened with ``nltk.chunk.tree2conlltags`` into
      (word, pos, iob) triples; the word is dropped so that a tagger can
      learn to map POS tags to IOB chunk tags.

      Returns a list with one list of (pos, iob) pairs per input tree.
      """
      pairs_per_sentence = []
      for tree in chunk_sents:
          triples = nltk.chunk.tree2conlltags(tree)
          pairs_per_sentence.append([(pos, iob) for (_word, pos, iob) in triples])
      return pairs_per_sentence
  class TagChunker(nltk.chunk.ChunkParserI):
      """Chunker that predicts IOB chunk tags from POS tags alone.

      A backoff chain of n-gram taggers is trained on (POS tag, IOB tag)
      pairs extracted from chunk trees, then used to label the POS tags of
      new sentences.
      """

      def __init__(self, train_chunks,
                   tagger_classes=(UnigramTagger, BigramTagger)):
          """Train the chunk tagger.

          train_chunks   -- iterable of chunk trees (e.g. conll2000 chunked
                            sentences).
          tagger_classes -- tagger classes to chain, first to last; an
                            immutable default avoids sharing a mutable list
                            between instances.
          """
          train_sents = conll_tag_chunks(train_chunks)
          self.tagger = backoff_tagger(train_sents, tagger_classes)

      def parse(self, tagged_sent):
          """Chunk a POS-tagged sentence.

          tagged_sent -- sequence of (word, pos) pairs.

          Returns a chunk tree built with ``nltk.chunk.conlltags2tree``, or
          None for an empty sentence.  Returning a tree (rather than a raw
          list of triples) keeps the result usable with
          ``nltk.chunk.tree2conlltags``, which would otherwise report every
          token as outside any chunk.
          """
          if not tagged_sent:
              return None
          words, tags = zip(*tagged_sent)
          chunks = self.tagger.tag(tags)
          # The chain has no default tagger, so an unseen POS context can
          # yield a chunk tag of None; treat such tokens as outside any chunk
          # because conlltags2tree rejects tags other than B-/I-/O.
          # Plain zip replaces itertools.izip, which does not exist on
          # Python 3.
          conll_tags = [(word, pos, iob or "O")
                        for (word, (pos, iob)) in zip(words, chunks)]
          return nltk.chunk.conlltags2tree(conll_tags)
  47.  
  # POS-tag every science-fiction sentence, then chunk the tagged result.
  chunker = TagChunker(train_IOB_sents)
  heinlein_sents_tagged = []
  for sent in heinlein:
      heinlein_sents_tagged.append(chunker.parse(tagger.tag(sent)))
  51.  
  -- Storage for matched fragments written by the matching script.
  CREATE TABLE IF NOT EXISTS `tags` (
      `id`            INT(11)      NOT NULL AUTO_INCREMENT,
      `tag_sequence`  VARCHAR(255) DEFAULT NULL,  -- stringified list of "POS||IOB" tags
      `word_sequence` VARCHAR(255) DEFAULT NULL,  -- stringified list of the matched words
      `full_sentence` TEXT,                       -- stringified list of all words in the sentence
      `length`        INT(11)      DEFAULT NULL,  -- number of tokens in the match
      PRIMARY KEY (`id`)
  ) ENGINE=InnoDB DEFAULT CHARSET=latin1 AUTO_INCREMENT=0;
  60.  
  61. ###########################################
  62. ## Matching
  63. ###########################################
  64. from difflib import SequenceMatcher
  65. import MySQLdb as mdb
  def _conll_sequences(chunked_sent):
      """Return (tag_seq, words) for one chunked sentence.

      tag_seq holds one "POS||IOB" string per token and words holds the
      tokens themselves, both in sentence order.
      """
      triples = nltk.chunk.tree2conlltags(chunked_sent)
      tag_seq = [t + "||" + c for w, t, c in triples]
      words = [w for w, t, c in triples]
      return tag_seq, words


  def _insert_if_new(cursor, tag_slice, word_slice, full_sentence, length):
      """Store a matched fragment unless its word sequence is already present."""
      word_key = str(word_slice)
      # Query parameters must be a sequence: a bare string is not a valid
      # parameter list and newer MySQLdb releases iterate it per character.
      cursor.execute("SELECT length FROM tags WHERE word_sequence= %s",
                     (word_key,))
      if cursor.fetchone() is None:
          cursor.execute(
              """INSERT INTO tags (tag_sequence, word_sequence, full_sentence, length) VALUES (%s, %s, %s, %s)""",
              (str(tag_slice), word_key, str(full_sentence), length))


  conn = mdb.connect('localhost', 'root', '', 'test')
  try:
      curs = conn.cursor()

      # Flatten every sentence once instead of once per sentence pair.
      # Sentences with no parse (None / empty) are skipped.
      chunked = [s for s in heinlein_sents_tagged if s]
      sequences = [_conll_sequences(s) for s in chunked]

      for sent, (tag_seq, sentence) in zip(chunked, sequences):
          for another_sent, (another_tag_seq, another_sentence) in zip(
                  chunked, sequences):
              if sent == another_sent:  # skip if same sentences
                  continue
              # The junk predicate is kept from the original; it only fires
              # for an element that is exactly "||", i.e. a token whose POS
              # and chunk tags are both empty.
              m = SequenceMatcher(lambda x: x == "||",
                                  tag_seq, another_tag_seq)
              for i, j, n in m.get_matching_blocks():
                  if n <= 5:  # we want matches bigger than 5
                      continue
                  # A matching block (i, j, n) means
                  # tag_seq[i:i+n] == another_tag_seq[j:j+n], so the second
                  # sentence must be sliced with j, not i.
                  _insert_if_new(curs, tag_seq[i:i + n],
                                 sentence[i:i + n], sentence, n)
                  _insert_if_new(curs, another_tag_seq[j:j + n],
                                 another_sentence[j:j + n],
                                 another_sentence, n)

      conn.commit()
  finally:
      # Release the connection even if matching or a query fails.
      conn.close()
  94.  
  -- List every stored fragment whose tag sequence occurs more than six
  -- times, most frequent sequences first.
  SELECT t1.tag_sequence, t1.word_sequence, t1.full_sentence, freq.`count`
  FROM tags t1
  INNER JOIN (
      SELECT tc.tag_sequence, COUNT( tc.tag_sequence ) AS `count`
      FROM tags tc
      GROUP BY tc.tag_sequence
      HAVING COUNT( tc.tag_sequence ) > 6
  ) freq ON freq.tag_sequence = t1.tag_sequence
  ORDER BY freq.`count` DESC, t1.tag_sequence
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement