Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- from __future__ import division
- import sys
- import os
- import urllib2
- import time
- from optparse import OptionParser
- import xml.etree.ElementTree as ET
- import math
- class SearchIndex(object):
- def __init__(self, input_file):
- self.indexname = ""
- self.chunk_size = 0
- if not os.path.exists(input_file):
- print "Dump file does not exist"
- return
- self.input_file = input_file
- def build_index(self):
- filename = self.input_file
- fields = self.field_names()
- with open(filename) as f:
- action = ET.Element('add')
- all_records = f.readlines()
- chunk_count = int(math.ceil(len(all_records) / self.chunk_size))
- chunks = self.chunks(all_records, self.chunk_size)
- f.close()
- current_chunk = 1
- for chunk in chunks:
- sys.stdout.write("Indexing chunk {0}/{1}...".format(current_chunk, chunk_count))
- sys.stdout.flush()
- current_chunk += 1
- for record in chunk:
- valid_record = self.validate_record(record)
- if not valid_record:
- # let validate_record handle what to do and make sure
- # it's not indexed
- continue
- else:
- record_fields = record.split('|')
- doc = ET.SubElement(action, 'doc')
- for i in range(0, len(record_fields)):
- text = unicode(record_fields[i].replace('"', ''), 'utf8')
- text = text.rstrip("\n")
- if text:
- ET.SubElement(doc, 'field', { 'name': fields[i]}).text = text
- self.load_records(ET.tostring(action))
- print "done"
- self.commit()
- def request_url_base(self):
- # no / at the end
- return "http://localhost:8983/solr/" + self.indexname
- def field_names(self):
- return []
- def commit(self):
- request_url = self.request_url_base() + "/update"
- request = urllib2.Request(request_url, headers={"Content-Type" : "text/xml"})
- commit = urllib2.urlopen(request, "<commit />")
- response = commit.read()
- commit.close()
- def flush_index(self):
- sys.stdout.write("flushing old index...")
- sys.stdout.flush()
- request = urllib2.Request(self.request_url_base() + "/update", headers={"Content-Type" : "text/xml"})
- delete = urllib2.urlopen(request, "<delete><query>*:*</query></delete>")
- response = delete.read()
- delete.close()
- request = urllib2.Request(self.request_url_base() + "/update", headers={"Content-Type" : "text/xml"})
- commit = urllib2.urlopen(request, "<commit />")
- response = commit.read()
- commit.close()
- print "done"
- def load_records(self, xml):
- request = urllib2.Request(self.request_url_base() + "/update", headers={"Content-Type" : "text/xml"})
- content = urllib2.urlopen(request, xml)
- response = content.read()
- content.close()
- def analyze_record_for_errors(self, record):
- record_detail = record.split('|')
- sys.stderr.write('Invalid record found: ' + record.split('|')[0].replace('"', '') + '\n')
- def validate_record(self, record):
- field_count = len(self.field_names())
- if len(record.split('|')) == field_count:
- return True
- else:
- return False
- # meh, should just be a function
- def chunks(self, l, n):
- """ Yield successive n-sized chunks from l."""
- for i in range(0, len(l), n):
- yield l[i:i+n]
class CatalogSearchIndex(SearchIndex):
    """Search index for the library catalog (bib records from Sierra)."""

    def __init__(self, input_file):
        super(CatalogSearchIndex, self).__init__(input_file)
        # Core name and batch size for catalog dumps.
        self.indexname = "LibrarySearchIndex"
        self.chunk_size = 75000

    def field_names(self):
        """Ordered Solr field names matching the catalog dump's columns."""
        return ['bib_id', 'title', 'add_title', 'author', 'add_author',
                'series', 'subject', 'description', 'standard_number',
                'upc', 'material_type', 'year']
class EventSearchIndex(SearchIndex):
    """Search index for library event records; drops duplicate rows."""

    def __init__(self, input_file):
        super(EventSearchIndex, self).__init__(input_file)
        self.indexname = "LibraryEventIndex"
        self.chunk_size = 150000
        # Keys of events already accepted, used to drop the duplicate rows
        # the upstream SQL dump produces (dict-as-set).
        self.duplicates = {}

    def field_names(self):
        """Ordered Solr field names matching the event dump's columns."""
        return [
            'event_id',
            'program_name',
            'address',
            'description',
            'age_group',
            'duration_minutes',
            'start_date',
            'location_code',
            'branch_name',
            'record_num',
            'registration_required',
            'branch_id',
        ]

    def validate_record(self, record):
        """Return True for well-formed, first-seen events; False otherwise."""
        # BUG FIX: check the column count first.  The original indexed
        # columns[6] unconditionally, so a malformed (short) row crashed the
        # whole build with IndexError instead of being skipped.
        if not super(EventSearchIndex, self).validate_record(record):
            return False
        # create an event key, don't allow duplicates, pbly an issue
        # with my SQL query that'll I'll have to look at when i get the
        # chance
        columns = record.split('|')
        event_key = columns[6] + columns[2] + columns[5] + columns[1]
        event_key = event_key.replace(' ', '').lower()
        if event_key in self.duplicates:
            return False
        self.duplicates[event_key] = True
        return True
class SearchIndexFactory:
    """Maps an index-type string to the matching SearchIndex subclass."""

    @staticmethod
    def create_search_index(type, input_file):
        """Return a new index for the given type string, or None when the
        type is not recognized."""
        index_classes = {
            'catalog': CatalogSearchIndex,
            'events': EventSearchIndex,
        }
        index_class = index_classes.get(type)
        if index_class is None:
            return None
        return index_class(input_file)
- #####################
- if __name__ == "__main__":
- parser = OptionParser()
- parser.add_option("-f", "--file", dest="file",
- help="File containing records dumped from Sierra", metavar="file.txt")
- parser.add_option("-t", "--type", dest="type",
- help="Search Index type we are building", metavar="catalog|events")
- (options, args) = parser.parse_args()
- if not options.file:
- print "A file name is required"
- sys.exit(1)
- if not options.type:
- print "No index type specified"
- sys.exit(1)
- else:
- search_index = SearchIndexFactory.create_search_index(options.type, options.file)
- if not search_index:
- print "Not a valid search index type"
- sys.exit(1)
- search_index.flush_index()
- search_index.build_index()
- sys.exit(0)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement