Advertisement
alvations

europarl-indexer.py

Jan 13th, 2013
384
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 7.39 KB | None | 0 0
# -*- coding: utf8 -*-
# Indexes Europarl bitext files into a Lucene index and queries them back.
# Python 2 only: relies on izip, reload(sys)/setdefaultencoding and print
# statements further down.  Uses an old PyLucene 2.x-era API (Hit,
# IndexWriter.optimize, Field.Index.TOKENIZED).
import codecs, glob, os, string
import lucene
from lucene import Analyzer, StandardAnalyzer, Document, Field, IndexReader, \
IndexWriter, QueryParser, IndexSearcher, Query, ScoreDoc, FSDirectory, Hit
from itertools import izip

import sys
reload(sys)
# HACK: force a process-wide default encoding so the implicit str/unicode
# conversions below do not raise UnicodeDecodeError (removed in Python 3).
sys.setdefaultencoding("utf-8")

# Directory layout: bitexts under ./bitexts/, Lucene index under ./index/.
dataDir = "./"
euroDir = dataDir + "bitexts/"
indexDir = dataDir + "index/"
  15.  
  16. # Gets first instance of matching key given a value and a dictionary.    
  17. def getKey(dic, value):
  18.   return [k for k,v in dic.items() if v == value]
  19.  
  20. def langiso (lang, isochar=3):
  21.   languages = {"de":"deu","es":"spa","fr":"fre", "it":"ita","nl":"nld" }
  22.   if len(lang) == 2 or isochar==3:
  23.     return languages[lang]
  24.   if len(lang) == 3 or isochar==2:
  25.     return getKey(lang)
  26.  
  27. def txtPairs (bitextDir):
  28.   txtpairs = {}
  29.   for infile in glob.glob(os.path.join(bitextDir, '*')):
  30.     #print infile
  31.     k = infile[-8:-3]; lang = infile[-2:]
  32.     try:
  33.       txtpairs[k] = (txtpairs[k],infile) if lang == "en" else (infile,txtpairs[k])
  34.     except:
  35.       txtpairs[k] = infile
  36.   for i in txtpairs:
  37.     if len(txtpairs[i]) != 2:
  38.       del txtpairs[i]
  39.   return txtpairs
  40.  
# NOTE(review): dead code -- an earlier bulk indexer superseded by
# incrementalIndexing(); kept verbatim inside a string literal.
# Consider deleting it outright.
'''def indexDoc(srcfile,trgfile,indexDir, overwrite=True):
 lucene.initVM()
 fileDir = FSDirectory.getDirectory(indexDir)
 analyzer = StandardAnalyzer()
 writer = IndexWriter(fileDir, analyzer, overwrite)

 # Using IndexReader to get maxDoc for uniqID of instances.
 reader = IndexReader.open(fileDir)
 uniqID = reader.maxDoc() + 1
 reader.close()

 trglang = langiso(trgfile[-2:])

 for trg, src in izip(codecs.open(trgfile,'r','utf8'), \
                      codecs.open(srcfile,'r','utf8')):
   trg = trg.strip(); src = src.strip()
   if trg == "" or src == "": continue
   print "Indexing",uniqID,"..."
   doc = Document()
   doc.add(Field("uniqID", str(uniqID), Field.Store.YES, Field.Index.TOKENIZED))
   doc.add(Field("eng", str(src.strip()),Field.Store.YES, Field.Index.TOKENIZED))
   doc.add(Field(trglang, str(trg.strip()),Field.Store.YES, Field.Index.TOKENIZED))
   writer.addDocument(doc); uniqID+=1
 writer.optimize()
 writer.close()
 print "Finished indexing."
 return None'''
  68.  
  69. def retrieveUniqID(indexDir, field, content):
  70.   lucene.initVM()
  71.   # Query index for the doc.
  72.   analyzer = StandardAnalyzer()
  73.   searcher = IndexSearcher(indexDir)
  74.   exclude = set(string.punctuation)
  75.   contentnopunct = ''.join(ch for ch in content if ch not in exclude)  
  76.   q = field + ':"' + contentnopunct+'"'
  77.   query = QueryParser("<default field>", analyzer).parse(q)
  78.   try:
  79.     hits = searcher.search(query, lucene.Sort("uniqID"))
  80.   except:
  81.     searcher.getIndexReader().close(); searcher.close()
  82.     return None
  83.   counter = 0
  84.   while counter < len(hits):
  85.     h = hits[counter]
  86.     id = str(h.get("uniqID"))
  87.     cont = str(h.get(field)) #;spa = str(h.get("spa"))
  88.     if cont == content:
  89.       searcher.getIndexReader().close(); searcher.close()
  90.       return [(id,cont)]
  91.     counter+=1
  92.   return None
  93.  
  94. # Deletes a document from index. **USE WITH CARE!!!
  95. def deleteFromIndex(indexDir, fieldname, content):
  96.   term2del = lucene.Term(fieldname, content)
  97.   lucene.initVM()
  98.   fileDir = FSDirectory.getDirectory(indexDir)
  99.   reader = IndexReader.open(fileDir)
  100.   reader.deleteDocuments(term2del);
  101.   reader.close(); fileDir.close()
  102.   #print "Document deleted from index with", term2del
  103.   return None
  104.  
  105. def createEmptyIndex(indexDir):
  106.   lucene.initVM()
  107.   fileDir = FSDirectory.getDirectory(indexDir)
  108.   analyzer = StandardAnalyzer()
  109.   writer = IndexWriter(fileDir, analyzer, True)
  110.   writer.optimize(); writer.getDirectory().close(); writer.close(); fileDir.close()
  111.   return None
  112.  
# Adds a new field/content to an existing index
# field = instID, field2 = focus
def incrementalIndexing(srcfile,trgfile, indexDir,overwrite=False):
  """Index src/trg sentence pairs, merging into documents already indexed.

  srcfile   -- English side of the bitext, one sentence per line.
  trgfile   -- target-language side; its last two filename characters are
               taken as the 2-letter language code (see langiso).
  indexDir  -- path to an existing Lucene index (see createEmptyIndex).
  overwrite -- unused; kept for interface compatibility.

  For each pair: if the English sentence is not yet indexed, a new document
  is created with a fresh uniqID; otherwise the existing document is
  fetched, deleted, extended with the new target-language field and
  re-added.
  NOTE(review): a new FSDirectory/IndexWriter is opened, optimized and
  closed for EVERY sentence pair, which is extremely slow on large bitexts;
  batching would need a careful rewrite, so it is only flagged here.
  """
  lucene.initVM()
  analyzer = StandardAnalyzer()
  trglang = langiso(trgfile[-2:])

  # Walk both files in lockstep, one sentence pair per iteration.
  for trg, src in izip(codecs.open(trgfile,'r','utf8'), \
                       codecs.open(srcfile,'r','utf8')):
    fileDir = FSDirectory.getDirectory(indexDir)
    #print src,trg
    trg = trg.strip(); src = src.strip()
    if trg == "" or src == "": continue
    # Get uniqID of src sentence (None when the English side is not yet
    # indexed, or when the lookup itself failed).
    uid = retrieveUniqID(indexDir, "eng", src)

    if uid == None:
      # New sentence: mint a uniqID from the current document count.
      # Using IndexReader to get maxDoc for uniqID of instances.
      reader = IndexReader.open(fileDir)
      uniqID = reader.maxDoc()+1
      reader.close()

      print "Indexing uniqID:",uniqID,"..."
      doc = Document()
      doc.add(Field("uniqID", str(uniqID), Field.Store.YES, Field.Index.TOKENIZED))
      doc.add(Field("eng", str(src.strip()),Field.Store.YES, Field.Index.TOKENIZED))
      doc.add(Field(trglang, str(trg.strip()),Field.Store.YES, Field.Index.TOKENIZED))
      writer = IndexWriter(fileDir, analyzer, False)
      writer.addDocument(doc)
      writer.optimize();
      # NOTE(review): writer.getDirectory() is presumably the same object as
      # fileDir, so the directory is closed before the writer and then closed
      # again -- confirm against the PyLucene version in use.
      writer.getDirectory().close(); writer.close(); fileDir.close()
    else:
      # Known sentence: re-index its document with the new target field.
      uid = uid[0][0]
      # Query index for the doc.
      searcher = IndexSearcher(indexDir)
      query = QueryParser('uniqID', analyzer).parse(uid)
      hits = searcher.search(query) # Retrieve only 1 instance.
      '''# Checks if query retrieves more than 1 document.
     if len(hits) > 1:
       print "Query uniqID retrieves more than 1 document, \
              NO documents reindexed!!!"
       return None'''
      # Retrieve the hit whose stored uniqID matches exactly.
      counter = 0
      doc = Document()
      while counter < len(hits):
        if hits[counter].get("uniqID") == uid:
          # NOTE(review): this rebinds doc to the raw hit object rather than
          # a Document; the add()/get() calls below assume it behaves like
          # one -- confirm with the old PyLucene Hit API.
          doc = hits[counter]
          break
        counter+=1
      searcher.getIndexReader().close();searcher.close()
      # Delete doc from index.
      deleteFromIndex(indexDir, "uniqID", doc.get("uniqID"))

      # Add fields/contents to the doc.
      doc.add(Field(trglang, trg, Field.Store.YES, Field.Index.TOKENIZED))

      # Reindexes the doc into the index.
      # IMPT: set create=FALSE to not overwrite existing index
      writer = IndexWriter(fileDir, analyzer, False)
      writer.addDocument(doc)
      writer.optimize(); writer.getDirectory().close(); writer.close(); fileDir.close()
      print "Document reindexed",  "uniqID:"+ doc.get("uniqID"),"..."
  print "Finished reindexing."
  return None
  178.  
# NOTE(review): dead code -- earlier driver that indexed every pair found
# by txtPairs(); kept verbatim inside a string literal.
'''createEmptyIndex(indexDir)
filemap = txtPairs(euroDir)
for i in filemap:
 sfile = filemap[i][1]; tfile = filemap[i][0]
 #print sfile, tfile
 incrementalIndexing(sfile,tfile,indexDir)'''

# Demo run: build a fresh empty index, load two 100-sentence bitext
# samples, then look up one English sentence and print its uniqID.
# NOTE(review): runs at import time; consider a __main__ guard.
createEmptyIndex(indexDir)
incrementalIndexing("eng-spa.100en","eng-spa.100es",indexDir)
incrementalIndexing("eng-deu.100en","eng-deu.100de",indexDir)
sent = "Madam President, on a point of order."
for i,j in retrieveUniqID(indexDir,'eng',sent):
  print i,j

#indexDoc("eng-spa.100en","eng-spa.100es",indexDir)
#sent = "(The House rose and observed a minute' s silence)"
#print retrieveUniqID(indexDir,'eng',sent)[0][0]
#for i,j in retrieveUniqID(indexDir,'eng',sent):
  #print i,j
#incrementalIndexing("eng-deu.100en","eng-deu.100de",indexDir)


# NOTE(review): dead code -- one-off snippet that sampled the first 100
# sentence pairs out of the full Europarl es-en bitext.
'''spa = euroDir + "europarl-v7.es-en.es"
eng = euroDir + "europarl-v7.es-en.en"
srcout = codecs.open('eng-spa.eng100','w','utf8')
trgout = codecs.open('eng-spa.spa100','w','utf8')
count = 0
for t,s in izip(codecs.open(spa,'r','utf8'),codecs.open(eng,'r','utf8')):
 if count == 100:
   break
 print>>trgout, t.strip()
 print>>srcout, s.strip()
 count+=1'''
#filepairs = txtPairs(euroDir)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement