Advertisement
Guest User

Untitled

a guest
Oct 23rd, 2014
173
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 6.61 KB | None | 0 0
  1. from __future__ import division
  2. import sys
  3. import os
  4. import urllib2
  5. import time
  6. from optparse import OptionParser
  7. import xml.etree.ElementTree as ET
  8. import math
  9.  
  10. class SearchIndex(object):
  11.     def __init__(self, input_file):
  12.         self.indexname = ""
  13.         self.chunk_size = 0
  14.         if not os.path.exists(input_file):
  15.             print "Dump file does not exist"
  16.             return
  17.  
  18.         self.input_file = input_file
  19.  
  20.     def build_index(self):
  21.         filename = self.input_file
  22.         fields = self.field_names()
  23.         with open(filename) as f:
  24.             action = ET.Element('add')
  25.  
  26.             all_records = f.readlines()
  27.             chunk_count = int(math.ceil(len(all_records) / self.chunk_size))
  28.             chunks = self.chunks(all_records, self.chunk_size)
  29.             f.close()
  30.  
  31.             current_chunk = 1
  32.             for chunk in chunks:
  33.                 sys.stdout.write("Indexing chunk {0}/{1}...".format(current_chunk, chunk_count))
  34.                 sys.stdout.flush()
  35.                 current_chunk += 1
  36.                 for record in chunk:
  37.                     valid_record = self.validate_record(record)
  38.                     if not valid_record:
  39.                         # let validate_record handle what to do and make sure
  40.                         # it's not indexed
  41.                         continue
  42.                     else:
  43.                         record_fields = record.split('|')
  44.                         doc = ET.SubElement(action, 'doc')
  45.                         for i in range(0, len(record_fields)):
  46.                             text = unicode(record_fields[i].replace('"', ''), 'utf8')
  47.                             text = text.rstrip("\n")
  48.                             if text:
  49.                                 ET.SubElement(doc, 'field', { 'name': fields[i]}).text = text
  50.  
  51.                 self.load_records(ET.tostring(action))
  52.                 print "done"
  53.  
  54.             self.commit()
  55.  
  56.     def request_url_base(self):
  57.         # no / at the end
  58.         return "http://localhost:8983/solr/" + self.indexname
  59.  
  60.     def field_names(self):
  61.         return []
  62.  
  63.     def commit(self):
  64.         request_url = self.request_url_base() + "/update"
  65.         request = urllib2.Request(request_url, headers={"Content-Type" : "text/xml"})
  66.         commit = urllib2.urlopen(request, "<commit />")
  67.         response = commit.read()
  68.         commit.close()
  69.  
  70.     def flush_index(self):
  71.         sys.stdout.write("flushing old index...")
  72.         sys.stdout.flush()
  73.         request = urllib2.Request(self.request_url_base() + "/update", headers={"Content-Type" : "text/xml"})
  74.         delete = urllib2.urlopen(request, "<delete><query>*:*</query></delete>")
  75.         response = delete.read()
  76.         delete.close()
  77.  
  78.         request = urllib2.Request(self.request_url_base() + "/update", headers={"Content-Type" : "text/xml"})
  79.         commit = urllib2.urlopen(request, "<commit />")
  80.         response = commit.read()
  81.         commit.close()
  82.         print "done"
  83.  
  84.     def load_records(self, xml):
  85.         request = urllib2.Request(self.request_url_base() + "/update", headers={"Content-Type" : "text/xml"})
  86.         content = urllib2.urlopen(request, xml)
  87.         response = content.read()
  88.         content.close()
  89.  
  90.     def analyze_record_for_errors(self, record):
  91.         record_detail = record.split('|')
  92.         sys.stderr.write('Invalid record found: ' + record.split('|')[0].replace('"', '') + '\n')
  93.  
  94.     def validate_record(self, record):
  95.         field_count = len(self.field_names())
  96.         if len(record.split('|')) == field_count:
  97.             return True
  98.         else:
  99.             return False
  100.  
  101.     # meh, should just be a function
  102.     def chunks(self, l, n):
  103.         """ Yield successive n-sized chunks from l."""
  104.         for i in range(0, len(l), n):
  105.             yield l[i:i+n]
  106.  
  107. class CatalogSearchIndex(SearchIndex):
  108.     def __init__(self, input_file):
  109.         super(CatalogSearchIndex, self).__init__(input_file)
  110.         self.chunk_size = 75000
  111.         self.indexname = "LibrarySearchIndex"
  112.  
  113.     def field_names(self):
  114.         return [
  115.             'bib_id',
  116.             'title',
  117.             'add_title',
  118.             'author',
  119.             'add_author',
  120.             'series',
  121.             'subject',
  122.             'description',
  123.             'standard_number',
  124.             'upc',
  125.             'material_type',
  126.             'year'
  127.         ]
  128.  
  129. class EventSearchIndex(SearchIndex):
  130.     def __init__(self, input_file):
  131.         super(EventSearchIndex, self).__init__(input_file)
  132.         self.indexname = "LibraryEventIndex"
  133.         self.chunk_size = 150000
  134.         self.duplicates = {}
  135.  
  136.     def field_names(self):
  137.         return [
  138.             'event_id',
  139.             'program_name',
  140.             'address',
  141.             'description',
  142.             'age_group',
  143.             'duration_minutes',
  144.             'start_date',
  145.             'location_code',
  146.             'branch_name',
  147.             'record_num',
  148.             'registration_required',
  149.             'branch_id',
  150.         ]
  151.  
  152.     def validate_record(self, record):
  153.         # create an event key, don't allow duplicates, pbly an issue
  154.         # with my SQL query that'll I'll have to look at when i get the
  155.         # chance
  156.         columns = record.split('|')
  157.         event_key = columns[6] + columns[2] + columns[5] + columns[1]
  158.         event_key = event_key.replace(' ', '').lower()
  159.         if event_key in self.duplicates:
  160.             return False
  161.         else:
  162.             self.duplicates[event_key] = True
  163.             return True
  164.  
  165. class SearchIndexFactory:
  166.     @staticmethod
  167.     def create_search_index(type, input_file):
  168.         if type == 'catalog':
  169.             return CatalogSearchIndex(input_file)
  170.         elif type == 'events':
  171.             return EventSearchIndex(input_file)
  172.  
  173. #####################
  174.  
  175. if __name__ == "__main__":
  176.     parser = OptionParser()
  177.     parser.add_option("-f", "--file", dest="file",
  178.         help="File containing records dumped from Sierra", metavar="file.txt")
  179.  
  180.     parser.add_option("-t", "--type", dest="type",
  181.         help="Search Index type we are building", metavar="catalog|events")
  182.  
  183.     (options, args) = parser.parse_args()
  184.  
  185.     if not options.file:
  186.         print "A file name is required"
  187.         sys.exit(1)
  188.  
  189.     if not options.type:
  190.         print "No index type specified"
  191.         sys.exit(1)
  192.     else:
  193.         search_index = SearchIndexFactory.create_search_index(options.type, options.file)
  194.         if not search_index:
  195.             print "Not a valid search index type"
  196.             sys.exit(1)
  197.  
  198.         search_index.flush_index()
  199.         search_index.build_index()
  200.  
  201.     sys.exit(0)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement