Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # -*- coding: utf8 -*-
- import codecs, glob, os, string
- import lucene
- from lucene import Analyzer, StandardAnalyzer, Document, Field, IndexReader, \
- IndexWriter, QueryParser, IndexSearcher, Query, ScoreDoc, FSDirectory, Hit
- from itertools import izip
- import sys
# HACK: Python 2 removes sys.setdefaultencoding from the namespace at startup;
# reload(sys) re-exposes it so implicit str<->unicode conversions use UTF-8.
# (This is a well-known Py2-only workaround and masks encoding bugs.)
reload(sys)
sys.setdefaultencoding("utf-8")

# Directory layout: bitext corpora under ./bitexts/, Lucene index under ./index/.
dataDir = "./"
euroDir = dataDir + "bitexts/"
indexDir = dataDir + "index/"
# Reverse dictionary lookup helper.
def getKey(dic, value):
    """Return a list of every key in *dic* whose value equals *value*.

    The list may be empty; keys come back in the dict's iteration order.
    """
    matches = []
    for key, val in dic.items():
        if val == value:
            matches.append(key)
    return matches
def langiso(lang, isochar=3):
    """Convert between 2-letter and 3-letter ISO language codes.

    lang    -- a 2- or 3-letter language code (e.g. "de" or "deu")
    isochar -- desired output width: 3 (default) or 2

    Returns the converted code string (a list of matching 2-letter codes
    for the 3->2 direction, via getKey), or None when the input does not
    match the requested direction.  Raises KeyError for an unknown
    2-letter code.
    """
    languages = {"de": "deu", "es": "spa", "fr": "fre", "it": "ita", "nl": "nld"}
    # BUG FIX: the original used `or` in both conditions, so any call with the
    # default isochar=3 hit the first branch regardless of input length (a
    # 3-letter input then raised KeyError) and the second branch was dead.
    if len(lang) == 2 and isochar == 3:
        return languages[lang]
    if len(lang) == 3 and isochar == 2:
        # BUG FIX: the original called getKey(lang) without the dictionary
        # argument, which is an unconditional TypeError.
        return getKey(languages, lang)
def txtPairs(bitextDir):
    """Pair up parallel text files found directly under *bitextDir*.

    Filenames are expected to look like ``eng-spa.100en``: the last two
    characters are the language code and characters [-8:-3] identify the
    text pair.  Returns a dict mapping that pair id to a
    ``(foreign_file, english_file)`` path tuple; files that never found a
    partner are dropped.
    """
    txtpairs = {}
    for infile in glob.glob(os.path.join(bitextDir, '*')):
        k = infile[-8:-3]
        lang = infile[-2:]
        try:
            # Second file for this id: order the tuple as (foreign, english)
            # regardless of which file glob returned first.
            txtpairs[k] = (txtpairs[k], infile) if lang == "en" else (infile, txtpairs[k])
        except KeyError:
            # First file seen for this id: remember the bare path for now.
            # (Narrowed from a bare except: only the missing-key case is expected.)
            txtpairs[k] = infile
    # BUG FIX: the original did `del txtpairs[i]` while iterating the same
    # dict, which raises RuntimeError ("dictionary changed size during
    # iteration").  Build a filtered dict instead, keeping only real pairs.
    return {k: v for k, v in txtpairs.items() if isinstance(v, tuple) and len(v) == 2}
- '''def indexDoc(srcfile,trgfile,indexDir, overwrite=True):
- lucene.initVM()
- fileDir = FSDirectory.getDirectory(indexDir)
- analyzer = StandardAnalyzer()
- writer = IndexWriter(fileDir, analyzer, overwrite)
- # Using IndexReader to get maxDoc for uniqID of instances.
- reader = IndexReader.open(fileDir)
- uniqID = reader.maxDoc() + 1
- reader.close()
- trglang = langiso(trgfile[-2:])
- for trg, src in izip(codecs.open(trgfile,'r','utf8'), \
- codecs.open(srcfile,'r','utf8')):
- trg = trg.strip(); src = src.strip()
- if trg == "" or src == "": continue
- print "Indexing",uniqID,"..."
- doc = Document()
- doc.add(Field("uniqID", str(uniqID), Field.Store.YES, Field.Index.TOKENIZED))
- doc.add(Field("eng", str(src.strip()),Field.Store.YES, Field.Index.TOKENIZED))
- doc.add(Field(trglang, str(trg.strip()),Field.Store.YES, Field.Index.TOKENIZED))
- writer.addDocument(doc); uniqID+=1
- writer.optimize()
- writer.close()
- print "Finished indexing."
- return None'''
def retrieveUniqID(indexDir, field, content):
    """Search the index for an exact match of *content* in *field*.

    Returns ``[(uniqID, stored_content)]`` for the first hit whose stored
    field text equals *content* exactly, or None when nothing matches or
    the search itself fails.
    """
    lucene.initVM()
    analyzer = StandardAnalyzer()
    searcher = IndexSearcher(indexDir)
    # Strip punctuation so the phrase query parses/tokenizes cleanly.
    exclude = set(string.punctuation)
    contentnopunct = ''.join(ch for ch in content if ch not in exclude)
    q = field + ':"' + contentnopunct + '"'
    query = QueryParser("<default field>", analyzer).parse(q)
    try:
        hits = searcher.search(query, lucene.Sort("uniqID"))
    except Exception:
        # Best-effort lookup: a failed search (e.g. missing field) is
        # reported as "not found".  Narrowed from a bare except so that
        # KeyboardInterrupt/SystemExit still propagate.
        searcher.getIndexReader().close()
        searcher.close()
        return None
    counter = 0
    while counter < len(hits):
        h = hits[counter]
        hit_id = str(h.get("uniqID"))  # renamed from `id` (shadowed builtin)
        cont = str(h.get(field))
        if cont == content:
            searcher.getIndexReader().close()
            searcher.close()
            return [(hit_id, cont)]
        counter += 1
    # BUG FIX: the original returned None here without closing the searcher
    # or its underlying reader, leaking them whenever no hit matched exactly.
    searcher.getIndexReader().close()
    searcher.close()
    return None
# Deletes documents from the index. **USE WITH CARE!!!
def deleteFromIndex(indexDir, fieldname, content):
    """Delete every indexed document whose *fieldname* term equals *content*."""
    # BUG FIX: the JVM must be running before any Lucene class is touched;
    # the original constructed lucene.Term(...) *before* calling initVM().
    lucene.initVM()
    term2del = lucene.Term(fieldname, content)
    fileDir = FSDirectory.getDirectory(indexDir)
    reader = IndexReader.open(fileDir)
    reader.deleteDocuments(term2del)
    reader.close()
    fileDir.close()
    return None
def createEmptyIndex(indexDir):
    """Create a brand-new, empty Lucene index at *indexDir*.

    The IndexWriter is opened with create=True, which clobbers any
    existing index at that path.
    """
    lucene.initVM()
    fileDir = FSDirectory.getDirectory(indexDir)
    analyzer = StandardAnalyzer()
    # Third argument True == create/overwrite mode.
    writer = IndexWriter(fileDir, analyzer, True)
    # NOTE(review): the directory is closed *before* the writer here; that
    # ordering looks suspect but is preserved verbatim -- confirm against
    # the PyLucene IndexWriter docs before changing.
    writer.optimize(); writer.getDirectory().close(); writer.close(); fileDir.close()
    return None
# Adds a new field/content to an existing index
# field = instID, field2 = focus
def incrementalIndexing(srcfile, trgfile, indexDir, overwrite=False):
    """Index a bitext pair line-by-line into an existing Lucene index.

    srcfile   -- English side of the bitext (one sentence per line)
    trgfile   -- target-language side; its last two filename chars are the
                 language code, mapped to 3 letters via langiso()
    indexDir  -- path of the index to update
    overwrite -- unused in the body (writers are always opened with
                 create=False); kept for interface compatibility

    For each sentence pair: if the English sentence is not yet indexed, a
    new document (uniqID/eng/<trglang>) is added; otherwise the existing
    document is fetched, deleted, extended with the new target-language
    field, and re-added.
    """
    lucene.initVM()
    analyzer = StandardAnalyzer()
    trglang = langiso(trgfile[-2:])
    # Walk both files in lockstep, one sentence pair per iteration.
    for trg, src in izip(codecs.open(trgfile, 'r', 'utf8'), \
                         codecs.open(srcfile, 'r', 'utf8')):
        # Re-opened every iteration because the writer/reader below close it.
        fileDir = FSDirectory.getDirectory(indexDir)
        trg = trg.strip(); src = src.strip()
        if trg == "" or src == "": continue
        # Look up the uniqID of this English sentence, if already indexed.
        uid = retrieveUniqID(indexDir, "eng", src)
        if uid == None:
            # Not indexed yet: mint a fresh uniqID from the reader's maxDoc.
            # NOTE(review): maxDoc()+1 is not a stable unique id once
            # documents get deleted -- confirm this is acceptable here.
            reader = IndexReader.open(fileDir)
            uniqID = reader.maxDoc()+1
            reader.close()
            print "Indexing uniqID:",uniqID,"..."
            doc = Document()
            doc.add(Field("uniqID", str(uniqID), Field.Store.YES, Field.Index.TOKENIZED))
            doc.add(Field("eng", str(src.strip()), Field.Store.YES, Field.Index.TOKENIZED))
            doc.add(Field(trglang, str(trg.strip()), Field.Store.YES, Field.Index.TOKENIZED))
            # create=False: append to the existing index, do not overwrite.
            writer = IndexWriter(fileDir, analyzer, False)
            writer.addDocument(doc)
            writer.optimize();
            writer.getDirectory().close(); writer.close(); fileDir.close()
        else:
            # Already indexed: re-fetch the document by uniqID.
            uid = uid[0][0]
            searcher = IndexSearcher(indexDir)
            query = QueryParser('uniqID', analyzer).parse(uid)
            hits = searcher.search(query) # Retrieve only 1 instance.
            '''# Checks if query retrieves more than 1 document.
            if len(hits) > 1:
                print "Query uniqID retrieves more than 1 document, \
                NO documents reindexed!!!"
                return None'''
            # Scan the hits for the exact uniqID match.
            counter = 0
            doc = Document()
            while counter < len(hits):
                if hits[counter].get("uniqID") == uid:
                    doc = hits[counter]
                    break
                counter+=1
            searcher.getIndexReader().close();searcher.close()
            # Delete the stale document from the index.
            deleteFromIndex(indexDir, "uniqID", doc.get("uniqID"))
            # Add the new target-language field to the retrieved document.
            doc.add(Field(trglang, trg, Field.Store.YES, Field.Index.TOKENIZED))
            # Reindex the document.
            # IMPT: create=False so the existing index is not overwritten.
            writer = IndexWriter(fileDir, analyzer, False)
            writer.addDocument(doc)
            writer.optimize(); writer.getDirectory().close(); writer.close(); fileDir.close()
            print "Document reindexed", "uniqID:"+ doc.get("uniqID"),"..."
    print "Finished reindexing."
    return None
# --- Demo / driver code (module level) ---------------------------------
# Disabled variant: index every file pair discovered under euroDir.
'''createEmptyIndex(indexDir)
filemap = txtPairs(euroDir)
for i in filemap:
    sfile = filemap[i][1]; tfile = filemap[i][0]
    #print sfile, tfile
    incrementalIndexing(sfile,tfile,indexDir)'''
# Active demo: build a fresh index from two 100-line bitext samples
# (English-Spanish, English-German), then look up one English sentence.
# NOTE(review): assumes the eng-*.100* sample files exist in the CWD.
createEmptyIndex(indexDir)
incrementalIndexing("eng-spa.100en","eng-spa.100es",indexDir)
incrementalIndexing("eng-deu.100en","eng-deu.100de",indexDir)
sent = "Madam President, on a point of order."
# retrieveUniqID returns [(uniqID, content)] -- will raise TypeError here
# if the sentence is not found (it returns None); presumably the demo
# sentence is known to be in the sample data.
for i,j in retrieveUniqID(indexDir,'eng',sent):
    print i,j
#indexDoc("eng-spa.100en","eng-spa.100es",indexDir)
#sent = "(The House rose and observed a minute' s silence)"
#print retrieveUniqID(indexDir,'eng',sent)[0][0]
#for i,j in retrieveUniqID(indexDir,'eng',sent):
#print i,j
#incrementalIndexing("eng-deu.100en","eng-deu.100de",indexDir)
# Disabled one-off: slice the first 100 lines out of the Europarl es-en bitext.
'''spa = euroDir + "europarl-v7.es-en.es"
eng = euroDir + "europarl-v7.es-en.en"
srcout = codecs.open('eng-spa.eng100','w','utf8')
trgout = codecs.open('eng-spa.spa100','w','utf8')
count = 0
for t,s in izip(codecs.open(spa,'r','utf8'),codecs.open(eng,'r','utf8')):
    if count == 100:
        break
    print>>trgout, t.strip()
    print>>srcout, s.strip()
    count+=1'''
#filepairs = txtPairs(euroDir)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement