Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # -*- coding: utf-8 -*-
- """
- Search features for :
- * :mod:`elasticsearch.elasticsearch`
- * :mod:`haystack:haystack`
- * :mod:`elasticstack:elasticstack`
- :creationdate: 05/11/15 15:05
- :moduleauthor: François GUÉRIN <fguerin@ville-tourcoing.fr>
- :modulename: intrautils.search
- """
- import base64
- import json
- import logging
- from copy import copy, deepcopy
- import haystack
- from django import forms
- from django.conf import settings
- from django.contrib.contenttypes.models import ContentType
- from django.db import models as dj_models
- from django.db.models.fields.files import FieldFile as dj_File
- from django.utils.translation import ugettext_lazy as _
- from elasticsearch import NotFoundError
- from elasticstack.backends import ConfigurableElasticBackend, ConfigurableElasticSearchEngine
- from elasticstack.fields import FacetField
- from elasticstack.forms import SearchForm
- from filer.models import File as fi_File
- from form_utils.forms import BetterForm
- from haystack import DEFAULT_ALIAS
- from haystack.backends import SQ
- from haystack.constants import DJANGO_CT, DJANGO_ID
- from haystack.fields import SearchField
- from haystack.forms import model_choices
- from urllib3.fields import guess_content_type
- from utils.forms import CollapsibleFieldsetFormMixin
__author__ = 'fguerin'

logger = logging.getLogger('intrautils.search')

#: Mapping used for a field whose type has no entry in ``TYPE_MAPPINGS``
DEFAULT_TYPE_MAPPINGS = {'type': 'string', 'analyzer': 'french'}

#: Type mappings: field type name -> elasticsearch field mapping
TYPE_MAPPINGS = {
    # Text types
    'string': dict(DEFAULT_TYPE_MAPPINGS),
    'edge_ngram': {'type': 'string', 'analyzer': 'edgengram_analyzer'},
    'ngram': {'type': 'string', 'analyzer': 'ngram_analyzer'},
    # Date types
    'date': {'type': 'date'},
    'datetime': {'type': 'date'},
    # Geo / scalar types
    'location': {'type': 'geo_point'},
    'boolean': {'type': 'boolean'},
    'float': {'type': 'float'},
    'long': {'type': 'long'},
    'integer': {'type': 'long'},
    # Attachment type, with its per-sub-field mappings
    'attachment': {
        'type': 'attachment',
        'fields': {
            'content': {
                'copy_to': 'copy',
                'type': 'string',
                'term_vector': 'with_positions_offsets',
                'store': 'yes',
                'analyzer': 'edgengram_analyzer',
            },
            'title': {'analyzer': 'french'},
            'author': {'analyzer': 'edgengram_analyzer'},
            'content_type': {'analyzer': 'edgengram_analyzer'},
            'content_length': {'store': 'yes', 'type': 'integer'},
        },
    },
}
class ExtendedElasticsearchBackend(ConfigurableElasticBackend):
    """
    Adds ***attachment*** support for elasticsearch backend settings
    """

    def setup(self):
        """
        Defers loading until needed.

        Builds the mapping from the unified index search fields and, when it differs from the cached
        existing mapping, creates the index and puts the new mapping.
        """
        # Get the existing mapping & cache it. We'll compare it
        # during the ``update`` & if it doesn't match, we'll put the new
        # mapping.
        try:
            self.existing_mapping = self.conn.indices.get_mapping(index=self.index_name)
        except NotFoundError:
            # Nothing to cache: ``existing_mapping`` keeps its previous value.
            pass
        except Exception:
            if not self.silently_fail:
                raise

        unified_index = haystack.connections[self.connection_alias].get_unified_index()
        self.content_field_name, field_mapping = self.build_schema(unified_index.all_searchfields())
        current_mapping = {
            'modelresult': {
                'properties': field_mapping,
            }
        }

        if current_mapping != self.existing_mapping:
            try:
                # Make sure the index is there first.
                self.conn.indices.create(index=self.index_name, body=self.DEFAULT_SETTINGS, ignore=400)
                self.conn.indices.put_mapping(index=self.index_name, doc_type='modelresult', body=current_mapping)
                self.existing_mapping = current_mapping
            except Exception:
                if not self.silently_fail:
                    raise

        self.setup_complete = True

    def extract_file_contents(self, file_obj):
        """
        Decode base64-encoded file data.

        :param file_obj: base64-encoded data, or an object whose ``read()`` returns such data
        :returns: dict with the decoded ``contents`` and a ``metadata`` dict holding ``content_length``
        """
        # ``base64.decode`` needs an input AND an output file object, so calling it with a single
        # argument always raised ``TypeError``; ``b64decode`` returns the decoded data directly.
        encoded = file_obj.read() if hasattr(file_obj, 'read') else file_obj
        contents = base64.b64decode(encoded)
        metadata = {'content_length': len(contents)}
        return {'contents': contents, 'metadata': metadata}

    def build_schema(self, fields):
        """
        Merge from `haystack` and `elasticstack` `elasticsearch` backend `build_schema` methods.
        It provides an additional feature : custom field mappings, from the TYPE_MAPPINGS dict
        (DEFAULT_TYPE_MAPPINGS for unknown field types).

        :param fields: fields to map to the backend (dict of field name -> search field)
        :returns: tuple content_field_name, mapping
        """
        content_field_name = ''
        final_mapping = {
            DJANGO_CT: {'type': 'string', 'index': 'not_analyzed', 'include_in_all': False},
            DJANGO_ID: {'type': 'string', 'index': 'not_analyzed', 'include_in_all': False},
        }

        if settings.DEBUG:
            logger.debug(u'ExtendedElasticsearchBackend::build_schema() '
                         u'default_type_mappings = \n%s'
                         u'\ntype_mappings = \n%s',
                         json.dumps(DEFAULT_TYPE_MAPPINGS, indent=2),
                         json.dumps(TYPE_MAPPINGS, indent=2))

        for field_class in fields.values():
            # Each field gets its OWN deep copy of the template mapping: the per-field ``boost`` and
            # ``analyzer`` set below must not leak into other fields of the same type, nor into the
            # module-level templates (a shallow copy of the outer dict shares the inner dicts).
            _mapping_for_field = deepcopy(TYPE_MAPPINGS.get(field_class.field_type, DEFAULT_TYPE_MAPPINGS))

            if field_class.boost != 1.0:
                _mapping_for_field['boost'] = field_class.boost

            if field_class.document is True:
                content_field_name = field_class.index_fieldname

            # Do this last to override `text` fields.
            if _mapping_for_field['type'] == 'string' and field_class.indexed:
                if not hasattr(field_class, 'facet_for') and field_class.field_type not in ('ngram', 'edge_ngram'):
                    _mapping_for_field['analyzer'] = getattr(field_class, 'analyzer', self.DEFAULT_ANALYZER)

            final_mapping[field_class.index_fieldname] = _mapping_for_field

        if settings.DEBUG:
            logger.debug(u'ExtendedElasticsearchBackend::build_schema() '
                         u'mapping = \n%s',
                         json.dumps(final_mapping, indent=2))
        return content_field_name, final_mapping

    def more_like_this(self, model_instance, additional_query_string=None, result_class=None, **kwargs):
        """
        Gives "more like this" items

        :param model_instance: model instance
        :param additional_query_string: additional string
        :param result_class: result
        :param kwargs: additional kwargs
        :returns: super
        """
        return super(ExtendedElasticsearchBackend, self).more_like_this(model_instance, additional_query_string,
                                                                        result_class, **kwargs)

    def update(self, index, iterable=None, commit=True):
        """
        Delegates the index update to the parent backend.

        :param index: search index to update
        :param iterable: objects to index
        :param commit: forwarded to the parent backend (it used to be silently dropped)
        :returns: super
        """
        return super(ExtendedElasticsearchBackend, self).update(index, iterable, commit=commit)

    def build_search_kwargs(self, query_string, sort_by=None, start_offset=0, end_offset=None, fields='',
                            highlight=False, facets=None, date_facets=None, query_facets=None, narrow_queries=None,
                            spelling_query=None, within=None, dwithin=None, distance_point=None, models=None,
                            limit_to_registered_models=None, result_class=None):
        """
        Delegates the search kwargs building to the parent backend, all arguments being passed through unchanged.

        :returns: super
        """
        return super(ExtendedElasticsearchBackend, self).build_search_kwargs(query_string, sort_by, start_offset,
                                                                             end_offset, fields,
                                                                             highlight, facets, date_facets,
                                                                             query_facets, narrow_queries,
                                                                             spelling_query, within, dwithin,
                                                                             distance_point, models,
                                                                             limit_to_registered_models, result_class)
class ExtendedElasticSearchEngine(ConfigurableElasticSearchEngine):
    """
    Search engine using :class:`ExtendedElasticsearchBackend` as its backend
    """
    #: backend class of this engine
    backend = ExtendedElasticsearchBackend
class AttachmentField(SearchField):
    """
    Mapping for an `AttachmentField`
    """
    field_type = 'attachment'
    author_field_name = 'user_author'
    author = None
    #: default, so the attribute exists even when the kwarg is not given
    content_type_field = None

    def __init__(self, **kwargs):
        """
        Pops the attachment specific kwargs, the remaining ones being given to the parent field.

        :param kwargs: field kwargs; ``content_type_field`` and ``author`` (or the name held by
                       ``author_field_name``) are consumed here
        """
        if 'content_type_field' in kwargs:
            self.content_type_field = kwargs.pop('content_type_field')
        # The previous code tested for 'author' but popped `author_field_name`, which raised a
        # ``KeyError`` (or left an unknown kwarg for the parent). Both spellings are accepted
        # and removed from the kwargs here.
        for author_key in ('author', self.author_field_name):
            if author_key in kwargs:
                self.author = kwargs.pop(author_key)
        super(AttachmentField, self).__init__(**kwargs)

    def convert(self, value):
        """
        Convert an attachment file to serializable data

        :param value: value to convert
        :returns: converted data (the value itself, unchanged)
        """
        return value

    @staticmethod
    def _get_file_data(field):
        """
        Build the attachment data from a file.

        :param field: a `filer` file (its ``file`` and ``label`` are used) or a file object
                      (the object itself and its ``name`` are used)
        :returns: dict holding the base64-encoded content and its metadata
        """
        if isinstance(field, fi_File):
            field_file = field.file
            title = name = field.label
        else:  # isinstance(field, dj_File):
            field_file = field
            title = name = field_file.name

        content_type = guess_content_type(name)
        try:
            content_length = len(field_file)
        except TypeError:
            content_length = len(field_file.file)
        try:
            content = base64.b64encode(field_file.read())
        except AttributeError:
            # No ``read()``: the value is encoded as is.
            content = base64.b64encode(field_file)

        return {'_language': 'fr',
                '_content': content,
                '_content_type': content_type,
                '_name': name,
                '_title': title,
                '_content_length': content_length}

    def prepare(self, obj):
        """
        Prepare the attachment data of an object for indexing.

        :param obj: object to index; ``model_attr`` (when set) names the attribute holding the file,
                    otherwise the object itself is used as the file
        :returns: attachment data, as built by :meth:`_get_file_data`
        :raises NotImplementedError: when the file is neither a django file nor a `filer` file
        """
        if self.model_attr:
            field = getattr(obj, self.model_attr)
        else:
            field = obj

        if not isinstance(field, (dj_File, fi_File)):
            raise NotImplementedError('AttachmentField does not implement file reading for %s file'
                                      % field.__class__.__name__)

        output = self._get_file_data(field)
        if settings.DEBUG:
            # Only the first characters of the content are logged: copy the dict with a truncated
            # content rather than deep-copying the whole encoded file.
            _output = dict(output, _content=output['_content'][:50] + '...')
            logger.debug(u'AttachmentField::prepare() output = %s', json.dumps(_output, indent=2))
        return output
class FacetedAttachmentField(FacetField, AttachmentField):
    """
    Glue class: a field that is both a `FacetField` and an `AttachmentField`, adding nothing of its own
    """
def application_model_choices(app_name, using=DEFAULT_ALIAS):
    """
    Gets the model choices matching one or more applications.

    :param app_name: application name, or a tuple / list of application names; a choice matches when
                     the name is contained in its value (first item of the choice)
    :param using: alias given to `model_choices`
    :returns: list of matching choices, each of them once, sorted on their second item
    """
    if isinstance(app_name, (tuple, list)):
        matches = []
        for app in app_name:
            for choice in application_model_choices(app, using):
                # A choice can match more than one name (containment test): keep it only once.
                if choice not in matches:
                    matches.append(choice)
    else:
        matches = [choice for choice in model_choices(using) if app_name in choice[0]]
    return sorted(matches, key=lambda choice: choice[1])
class HaystackSearchForm(CollapsibleFieldsetFormMixin, SearchForm, BetterForm):
    """
    :mod:`haystack:haystack` search form for main `searching` feature
    """

    class Meta:
        fieldsets = (('main', {'legend': _('search'), 'fields': ('search_query', 'models', 'more_like_this')}),)

    search_field_name = 'search_query'
    load_all = True
    #: can be a single application or a list of applications
    search_app = None
    #: global search field
    search_query = forms.CharField(label=_('Search'), required=False, max_length=255,
                                   help_text=_('You can use the wildcard * to search for words fragments, '
                                               'by example "comm*" will search for words starting by "comm". '
                                               'You can also write more than a word, each word will be searched.'))

    # more_like_this = forms.BooleanField(label=_('More like this'), required=False)

    def get_search_apps(self):
        """
        Gets the application(s) the search is restricted to.

        :returns: `search_app` when it is set, else None
        """
        return self.search_app or None

    def get_models(self):
        """
        Return the model classes selected in the `models` field of the form.

        :returns: list of model classes, empty when the form is invalid or when no model is selected
        """
        if not self.is_valid():
            return []
        # ``get`` rather than an index: the form may be used without a bound `models` field.
        selected = self.cleaned_data.get('models') or []
        # noinspection PyUnresolvedReferences
        return [dj_models.get_model(*model.split('.')) for model in selected]

    @staticmethod
    def _term_filter(term):
        """
        Build the filter for a single search term, a leading / trailing ``*`` being a wildcard.

        :param term: search term
        :returns: built filter
        """
        bare_term = term.strip('*')
        term_filter = SQ(text__contains=bare_term)
        if term.startswith('*'):
            term_filter |= SQ(text__endswith=bare_term)
        if term.endswith('*'):
            term_filter |= SQ(text__startswith=bare_term)
        return term_filter

    def get_filters(self, search_query):
        """
        Build filter from a search_query

        The query is split on whitespace, each term must match (terms are AND-ed).

        :param search_query: search query
        :returns: built filters
        """
        # ``split()`` without argument drops the empty strings produced by repeated, leading or
        # trailing spaces, which were turned into empty-term filters before. A query made of
        # whitespace only is kept as a single raw term.
        terms = search_query.split() or [search_query]
        if len(terms) == 1:
            filters = self._term_filter(terms[0])
        else:
            filters = SQ()
            for term in terms:
                filters &= self._term_filter(term)
        if settings.DEBUG:
            logger.debug(u'HaystackSearchForm::get_filters(%s) filters = %s', search_query, filters)
        return filters

    @staticmethod
    def get_fields():
        """
        Gets the fields for the search

        :returns: list of fields
        """
        fields = ['document_file.content', 'text', 'content', 'title', ]
        if settings.DEBUG:
            logger.debug(u'HaystackSearchForm::get_fields() fields = %s', fields)
        return fields

    def search(self):
        """
        Performs the search.

        The text filters are restricted to the selected models or, when none is selected, to the
        search application(s).

        :returns: search query set, or the result of `no_query_found` when the form is invalid or
                  when the query is empty
        """
        if not self.is_valid():
            return self.no_query_found()
        if not self.cleaned_data.get(self.search_field_name):
            return self.no_query_found()

        search_apps = self.get_search_apps()
        search_query = self.cleaned_data.get(self.search_field_name, None)
        search_models = self.get_models()
        more_like_this = self.cleaned_data.get('more_like_this', False)

        filters = self.get_filters(search_query)

        if search_models:
            # Any of the selected models (OR), combined with the text filters (AND).
            sub_filters = None
            for model in search_models:
                model_ct = ContentType.objects.get_for_model(model)
                _filter = SQ(django_ct__iexact='%s.%s' % (model_ct.app_label, model_ct.model))
                sub_filters = (sub_filters | _filter) if sub_filters else _filter
            filters = filters & sub_filters if filters else sub_filters
        elif isinstance(search_apps, basestring):
            filters &= SQ(django_ct__startswith=search_apps)
        elif isinstance(search_apps, (tuple, list)):
            # Any of the applications (OR), combined with the text filters (AND).
            sub_filters = None
            for search_app in search_apps:
                _filter = SQ(django_ct__startswith=search_app)
                sub_filters = (sub_filters | _filter) if sub_filters else _filter
            if sub_filters:
                filters &= sub_filters

        search_query_set = self.searchqueryset.filter(filters)
        if settings.DEBUG:
            logger.debug(u'HaystackSearchForm::search() '
                         u'search_query_set.query = %s (%d)', search_query_set.query,
                         len(search_query_set))

        # Search for data
        # NOTE(review): this test looks like a no-op apart from evaluating the query set - confirm
        # whether it can be removed.
        if not search_query_set:
            search_query_set = search_query_set.load_all()

        # Search for `more_like_this` items
        # NOTE(review): the search string is given to `more_like_this` here - confirm that it
        # accepts something else than a model instance before enabling the `more_like_this` field.
        if search_query and more_like_this:
            search_query_set = search_query_set.more_like_this(search_query).load_all()

        if self.load_all:
            search_query_set = search_query_set.load_all()

        if settings.DEBUG:
            logger.debug(u'HaystackSearchForm::search() search_query (1) = %s ', search_query_set.query)
            logger.debug(u'HaystackSearchForm::search() len(search_query_set) = %d '
                         u'(after models filtering)', len(search_query_set))
        return search_query_set

    @staticmethod
    def no_query_found():
        """
        Result of a search without usable query.

        :returns: empty list
        """
        return []
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement