Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- from __future__ import division
- import sys
- import os
- import urllib2
- import time
- from optparse import OptionParser
- import xml.etree.ElementTree as ET
- import math
- class SearchIndex(object):
- def __init__(self, input_file):
- self.indexname = ""
- self.chunk_size = 0
- if not os.path.exists(input_file):
- print "Dump file does not exist"
- return
- self.input_file = input_file
- def build_index(self):
- filename = self.input_file
- fields = self.field_names()
- with open(filename) as f:
- action = ET.Element('add')
- all_records = f.readlines()
- chunk_count = int(math.ceil(len(all_records) / self.chunk_size))
- chunks = self.chunks(all_records, self.chunk_size)
- f.close()
- current_chunk = 1
- for chunk in chunks:
- sys.stdout.write("Indexing chunk {0}/{1}...".format(current_chunk, chunk_count))
- sys.stdout.flush()
- current_chunk += 1
- for record in chunk:
- valid_record = self.validate_record(record)
- if not valid_record:
- # let validate_record handle what to do and make sure
- # it's not indexed
- continue
- else:
- record_fields = record.split('|')
- doc = ET.SubElement(action, 'doc')
- for i in range(0, len(record_fields)):
- text = unicode(record_fields[i].replace('"', ''), 'utf8')
- text = text.rstrip("\n")
- if text:
- ET.SubElement(doc, 'field', { 'name': fields[i]}).text = text
- self.load_records(ET.tostring(action))
- print "done"
- self.commit()
- def request_url_base(self):
- # no / at the end
- return "http://localhost:8983/solr/" + self.indexname
- def field_names(self):
- return []
- def commit(self):
- request_url = self.request_url_base() + "/update"
- request = urllib2.Request(request_url, headers={"Content-Type" : "text/xml"})
- commit = urllib2.urlopen(request, "<commit />")
- response = commit.read()
- commit.close()
- def flush_index(self):
- sys.stdout.write("flushing old index...")
- sys.stdout.flush()
- request = urllib2.Request(self.request_url_base() + "/update", headers={"Content-Type" : "text/xml"})
- delete = urllib2.urlopen(request, "<delete><query>*:*</query></delete>")
- response = delete.read()
- delete.close()
- request = urllib2.Request(self.request_url_base() + "/update", headers={"Content-Type" : "text/xml"})
- commit = urllib2.urlopen(request, "<commit />")
- response = commit.read()
- commit.close()
- print "done"
- def load_records(self, xml):
- request = urllib2.Request(self.request_url_base() + "/update", headers={"Content-Type" : "text/xml"})
- content = urllib2.urlopen(request, xml)
- response = content.read()
- content.close()
- def analyze_record_for_errors(self, record):
- record_detail = record.split('|')
- sys.stderr.write('Invalid record found: ' + record.split('|')[0].replace('"', '') + '\n')
- def validate_record(self, record):
- field_count = len(self.field_names())
- if len(record.split('|')) == field_count:
- return True
- else:
- return False
- # meh, should just be a function
- def chunks(self, l, n):
- """ Yield successive n-sized chunks from l."""
- for i in range(0, len(l), n):
- yield l[i:i+n]
class CatalogSearchIndex(SearchIndex):
    """Search index for the library catalog (bib records from Sierra)."""

    def __init__(self, input_file):
        super(CatalogSearchIndex, self).__init__(input_file)
        # Core name and batch size for catalog dumps.
        self.indexname = "LibrarySearchIndex"
        self.chunk_size = 75000

    def field_names(self):
        """Ordered Solr field names matching the catalog dump's columns."""
        return ['bib_id', 'title', 'add_title', 'author', 'add_author',
                'series', 'subject', 'description', 'standard_number',
                'upc', 'material_type', 'year']
class EventSearchIndex(SearchIndex):
    """Search index for library event records; drops duplicate rows."""

    def __init__(self, input_file):
        super(EventSearchIndex, self).__init__(input_file)
        self.indexname = "LibraryEventIndex"
        self.chunk_size = 150000
        # Keys of events already accepted, used to drop the duplicate rows
        # the upstream SQL dump produces (dict-as-set).
        self.duplicates = {}

    def field_names(self):
        """Ordered Solr field names matching the event dump's columns."""
        return [
            'event_id',
            'program_name',
            'address',
            'description',
            'age_group',
            'duration_minutes',
            'start_date',
            'location_code',
            'branch_name',
            'record_num',
            'registration_required',
            'branch_id',
        ]

    def validate_record(self, record):
        """Return True for well-formed, first-seen events; False otherwise."""
        # BUG FIX: check the column count first.  The original indexed
        # columns[6] unconditionally, so a malformed (short) row crashed the
        # whole build with IndexError instead of being skipped.
        if not super(EventSearchIndex, self).validate_record(record):
            return False
        # create an event key, don't allow duplicates, pbly an issue
        # with my SQL query that'll I'll have to look at when i get the
        # chance
        columns = record.split('|')
        event_key = columns[6] + columns[2] + columns[5] + columns[1]
        event_key = event_key.replace(' ', '').lower()
        if event_key in self.duplicates:
            return False
        self.duplicates[event_key] = True
        return True
class SearchIndexFactory:
    """Maps an index-type string to the matching SearchIndex subclass."""

    @staticmethod
    def create_search_index(type, input_file):
        """Return a new index for the given type string, or None when the
        type is not recognized."""
        index_classes = {
            'catalog': CatalogSearchIndex,
            'events': EventSearchIndex,
        }
        index_class = index_classes.get(type)
        if index_class is None:
            return None
        return index_class(input_file)
- #####################
- if __name__ == "__main__":
- parser = OptionParser()
- parser.add_option("-f", "--file", dest="file",
- help="File containing records dumped from Sierra", metavar="file.txt")
- parser.add_option("-t", "--type", dest="type",
- help="Search Index type we are building", metavar="catalog|events")
- (options, args) = parser.parse_args()
- if not options.file:
- print "A file name is required"
- sys.exit(1)
- if not options.type:
- print "No index type specified"
- sys.exit(1)
- else:
- search_index = SearchIndexFactory.create_search_index(options.type, options.file)
- if not search_index:
- print "Not a valid search index type"
- sys.exit(1)
- search_index.flush_index()
- search_index.build_index()
- sys.exit(0)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement