Untitled

# -*- coding: utf-8 -*-
"""
Search features for :

*  :mod:`elasticsearch.elasticsearch`
*  :mod:`haystack:haystack`
*  :mod:`elasticstack:elasticstack`

:creationdate: 05/11/15 15:05
:moduleauthor: François GUÉRIN <fguerin@ville-tourcoing.fr>
:modulename: intrautils.search

"""
import base64
import json
import logging
from copy import copy, deepcopy

import haystack
from django import forms
from django.conf import settings
from django.contrib.contenttypes.models import ContentType
from django.db import models as dj_models
from django.db.models.fields.files import FieldFile as dj_File
from django.utils.translation import ugettext_lazy as _
from elasticsearch import NotFoundError
from elasticstack.backends import ConfigurableElasticBackend, ConfigurableElasticSearchEngine
from elasticstack.fields import FacetField
from elasticstack.forms import SearchForm
from filer.models import File as fi_File
from form_utils.forms import BetterForm
from haystack import DEFAULT_ALIAS
from haystack.backends import SQ
from haystack.constants import DJANGO_CT, DJANGO_ID
from haystack.fields import SearchField
from haystack.forms import model_choices
from urllib3.fields import guess_content_type

from utils.forms import CollapsibleFieldsetFormMixin

__author__ = 'fguerin'
logger = logging.getLogger('intrautils.search')

#: Mapping used for the field types that have no entry in ``TYPE_MAPPINGS``
DEFAULT_TYPE_MAPPINGS = {
    'type': 'string',
    'analyzer': 'french',
}

#: Type mappings: backend mapping applied to each known field type
TYPE_MAPPINGS = {
    # Text based types
    'string': {
        'type': 'string',
        'analyzer': 'french',
    },
    'edge_ngram': {
        'type': 'string',
        'analyzer': 'edgengram_analyzer',
    },
    'ngram': {
        'type': 'string',
        'analyzer': 'ngram_analyzer',
    },
    # Dates, geographic points, booleans and numbers
    'date': {'type': 'date'},
    'datetime': {'type': 'date'},
    'location': {'type': 'geo_point'},
    'boolean': {'type': 'boolean'},
    'float': {'type': 'float'},
    'long': {'type': 'long'},
    'integer': {'type': 'long'},
    # Attached files, with one sub-mapping for each attachment property
    'attachment': {
        'type': 'attachment',
        'fields': {
            'content': {
                'copy_to': 'copy',
                'type': 'string',
                'term_vector': 'with_positions_offsets',
                'store': 'yes',
                'analyzer': 'edgengram_analyzer',
            },
            'title': {
                'analyzer': 'french',
            },
            'author': {
                'analyzer': 'edgengram_analyzer',
            },
            'content_type': {
                'analyzer': 'edgengram_analyzer',
            },
            'content_length': {
                'store': 'yes',
                'type': 'integer',
            },
        },
    },
}


class ExtendedElasticsearchBackend(ConfigurableElasticBackend):
    """
    Adds ***attachment*** support for elasticsearch backend settings
    """

    def setup(self):
        """
        Defers loading until needed.

        Reads the mapping stored for the index, builds the mapping of the registered search fields and,
        when both differ, creates the index (if needed) and puts the new mapping.

        :raises Exception: any backend error, unless ``silently_fail`` is set
        """
        # Get the existing mapping & cache it. We'll compare it
        # during the ``update`` & if it doesn't match, we'll put the new
        # mapping.
        try:
            self.existing_mapping = self.conn.indices.get_mapping(index=self.index_name)
        except NotFoundError:
            # Nothing to cache: the index is created below when the mappings differ.
            pass
        except Exception:
            if not self.silently_fail:
                raise

        unified_index = haystack.connections[self.connection_alias].get_unified_index()
        self.content_field_name, field_mapping = self.build_schema(unified_index.all_searchfields())
        current_mapping = {
            'modelresult': {
                'properties': field_mapping,
            }
        }

        if current_mapping != self.existing_mapping:
            try:
                # Make sure the index is there first.
                self.conn.indices.create(index=self.index_name, body=self.DEFAULT_SETTINGS, ignore=400)
                self.conn.indices.put_mapping(index=self.index_name, doc_type='modelresult', body=current_mapping)
                self.existing_mapping = current_mapping
            except Exception:
                if not self.silently_fail:
                    raise

        self.setup_complete = True

    def extract_file_contents(self, file_obj):
        """
        Decode base64 data and give it back with its length.

        :param file_obj: base64 encoded data, or a file-like object (anything with a `read` method) holding it
        :returns: dict with the decoded `contents` and a `metadata` dict holding the `content_length`
        """
        encoded = file_obj.read() if hasattr(file_obj, 'read') else file_obj
        # `base64.decode` works from an input file to an output file, `b64decode` gives the decoded data back.
        contents = base64.b64decode(encoded)
        return {'contents': contents, 'metadata': {'content_length': len(contents)}}

    def build_schema(self, fields):
        """
        Merge from `haystack` and `elasticstack` `elasticsearch` backend `build_schema` methods.
        It provides an additional feature : custom field mappings, taken from the `TYPE_MAPPINGS` dict,
        with `DEFAULT_TYPE_MAPPINGS` as fallback for the unknown field types.

        :param fields: fields to map to the backend, as a dict `{field name: field instance}`
        :returns: tuple content_field_name, mapping
        """
        content_field_name = ''
        final_mapping = {
            DJANGO_CT: {'type': 'string', 'index': 'not_analyzed', 'include_in_all': False},
            DJANGO_ID: {'type': 'string', 'index': 'not_analyzed', 'include_in_all': False},
        }

        if settings.DEBUG:
            logger.debug(u'ExtendedElasticsearchBackend::build_schema() '
                         u'default_type_mappings = \n%s'
                         u'\ntype_mappings = \n%s',
                         json.dumps(DEFAULT_TYPE_MAPPINGS, indent=2),
                         json.dumps(TYPE_MAPPINGS, indent=2))

        for field_class in fields.values():
            # Each field gets its own deep copy of the mapping template: the `boost` and `analyzer` set below
            # must neither leak to the other fields of the same type nor alter the module level templates.
            field_mapping = deepcopy(TYPE_MAPPINGS.get(field_class.field_type, DEFAULT_TYPE_MAPPINGS))

            if field_class.boost != 1.0:
                field_mapping['boost'] = field_class.boost

            if field_class.document is True:
                content_field_name = field_class.index_fieldname

            # Do this last to override `text` fields.
            if field_mapping['type'] == 'string' and field_class.indexed:
                if not hasattr(field_class, 'facet_for') and field_class.field_type not in ('ngram', 'edge_ngram'):
                    field_mapping['analyzer'] = getattr(field_class, 'analyzer', self.DEFAULT_ANALYZER)

            final_mapping[field_class.index_fieldname] = field_mapping

        if settings.DEBUG:
            logger.debug(u'ExtendedElasticsearchBackend::build_schema() '
                         u'mapping = \n%s',
                         json.dumps(final_mapping, indent=2))

        return content_field_name, final_mapping

    def more_like_this(self, model_instance, additional_query_string=None, result_class=None, **kwargs):
        """
        Gives "more like this" items

        :param model_instance: model instance
        :param additional_query_string: additional string
        :param result_class: result
        :param kwargs: additional kwargs, forwarded to the parent backend
        :returns: super
        """
        # `result_class` goes by keyword: the third positional parameter of the haystack backend
        # `more_like_this` is `start_offset`.
        return super(ExtendedElasticsearchBackend, self).more_like_this(model_instance, additional_query_string,
                                                                        result_class=result_class, **kwargs)

    def update(self, index, iterable=None, commit=True):
        """
        Updates the backend with the items of `iterable`

        :param index: search index of the items
        :param iterable: items to index
        :param commit: forwarded to the parent backend
        :returns: super
        """
        return super(ExtendedElasticsearchBackend, self).update(index, iterable, commit=commit)

    def build_search_kwargs(self, query_string, sort_by=None, start_offset=0, end_offset=None, fields='',
                            highlight=False, facets=None, date_facets=None, query_facets=None, narrow_queries=None,
                            spelling_query=None, within=None, dwithin=None, distance_point=None, models=None,
                            limit_to_registered_models=None, result_class=None):
        """
        Builds the search kwargs: every argument is handed over, unchanged, to the parent backend.

        :returns: super
        """
        return super(ExtendedElasticsearchBackend, self).build_search_kwargs(query_string, sort_by, start_offset,
                                                                             end_offset, fields,
                                                                             highlight, facets, date_facets,
                                                                             query_facets, narrow_queries,
                                                                             spelling_query, within, dwithin,
                                                                             distance_point, models,
                                                                             limit_to_registered_models, result_class)


class ExtendedElasticSearchEngine(ConfigurableElasticSearchEngine):
    """
    Search engine using :class:`ExtendedElasticsearchBackend` as backend
    """
    backend = ExtendedElasticsearchBackend


class AttachmentField(SearchField):
    """
    Mapping for an `AttachmentField`
    """
    field_type = 'attachment'
    author_field_name = 'user_author'
    author = None

    def __init__(self, **kwargs):
        """
        Takes the attachment specific options out of `kwargs`, the remaining ones go to the parent field.

        :param kwargs: field options; `content_type_field`, `author` and the option named by
                       `author_field_name` are handled here
        """
        if 'content_type_field' in kwargs:
            self.content_type_field = kwargs.pop('content_type_field')
        # Both spellings are accepted and removed from `kwargs`, whichever one the caller used,
        # so none of them is left over for the parent initializer.
        for author_key in ('author', self.author_field_name):
            if author_key in kwargs:
                self.author = kwargs.pop(author_key)

        super(AttachmentField, self).__init__(**kwargs)

    def convert(self, value):
        """
        Convert an attachment file to serializable data

        :param value: value to convert
        :returns: converted data (the value itself, unchanged)
        """
        return value

    @staticmethod
    def _get_file_data(field):
        """
        Build the attachment payload of a file

        :param field: `filer` file or `django` field file
        :returns: dict with the base64 encoded content, the content type, the name, the title,
                  the content length and the language of the file
        """
        if isinstance(field, fi_File):
            field_file = field.file
            name = field.label
        else:  # isinstance(field, dj_File):
            field_file = field
            name = field_file.name

        try:
            content = base64.b64encode(field_file.read())
        except AttributeError:
            # Not a readable object: encode the value itself
            content = base64.b64encode(field_file)

        try:
            content_length = len(field_file)
        except TypeError:
            content_length = len(field_file.file)

        return {'_language': 'fr',
                '_content': content,
                '_content_type': guess_content_type(name),
                '_name': name,
                '_title': name,
                '_content_length': content_length}

    def prepare(self, obj):
        """
        Prepare the attachment data of an object for indexing

        :param obj: object holding the file in its `model_attr` attribute, or the file itself
        :returns: attachment payload, see :meth:`_get_file_data`
        :raises NotImplementedError: when the value is neither a `django` nor a `filer` file
        """
        if self.model_attr:
            field = getattr(obj, self.model_attr)
        else:
            field = obj

        if not isinstance(field, (dj_File, fi_File)):
            raise NotImplementedError('AttachmentField does not implement file reading for %s file'
                                      % field.__class__.__name__)
        output = self._get_file_data(field)

        if settings.DEBUG:
            # Only the first characters of the content are logged; a shallow copy is enough for that
            # and avoids duplicating the whole encoded file.
            _output = dict(output, _content=output['_content'][:50] + '...')
            logger.debug(u'AttachmentField::prepare() output = %s', json.dumps(_output, indent=2))

        return output


class FacetedAttachmentField(FacetField, AttachmentField):
    """
    Faceted attachment field: `FacetField` and `AttachmentField` joined through multiple inheritance,
    without any addition.
    """


def application_model_choices(app_name, using=DEFAULT_ALIAS):
    """
    Gives the model choices of one or many applications, sorted on their second item.

    :param app_name: text looked up in the first item of each choice, or a list / tuple of such texts
    :param using: alias handed over to `model_choices`
    :returns: sorted list of the matching choices
    """
    available = model_choices(using)
    if isinstance(app_name, (tuple, list)):
        # Many applications: gather the choices of each of them
        matching = []
        for single_app in app_name:
            matching += application_model_choices(single_app, using)
    else:
        matching = [entry for entry in available if app_name in entry[0]]
    matching.sort(key=lambda entry: entry[1])
    return matching


class HaystackSearchForm(CollapsibleFieldsetFormMixin, SearchForm, BetterForm):
    """
     :mod:`haystack:haystack` search form for main `searching` feature
    """

    class Meta:
        fieldsets = (('main', {'legend': _('search'), 'fields': ('search_query', 'models', 'more_like_this')}),)

    #: name of the form field holding the searched text
    search_field_name = 'search_query'
    #: when set, `load_all()` is applied to the search results
    load_all = True

    #: can be a single application or a list of applications
    search_app = None

    #: global search field
    search_query = forms.CharField(label=_('Search'), required=False, max_length=255,
                                   help_text=_('You can use the wildcard * to search for words fragments, '
                                               'by example "comm*" will search for words starting by "comm". '
                                               'You can also write more than a word, each word will be searched.'))

    # more_like_this = forms.BooleanField(label=_('More like this'), required=False)

    def get_search_apps(self):
        """
        Gives the application(s) the search is restricted to

        :returns: `search_app` value, or None when it is empty
        """
        return self.search_app or None

    def get_models(self):
        """
        Return the model classes selected through the `models` form data.

        :returns: list of model classes, empty when the form is not valid or when no model is selected
        """
        if not self.is_valid():
            return []
        # The `models` field is not declared on this class: no selection is assumed when it is missing.
        selected = self.cleaned_data.get('models') or []
        # noinspection PyUnresolvedReferences
        return [dj_models.get_model(*model.split('.')) for model in selected]

    @staticmethod
    def _get_term_filters(term):
        """
        Build the filter for a single searched term

        :param term: searched term, with optional leading and / or trailing `*` wildcards
        :returns: built filter
        """
        word = term.strip('*')
        term_filters = SQ(text__contains=word)
        if term.startswith('*'):
            term_filters |= SQ(text__endswith=word)
        if term.endswith('*'):
            term_filters |= SQ(text__startswith=word)
        return term_filters

    def get_filters(self, search_query):
        """
        Build filter from a search_query: each whitespace separated term has to match.

        :param search_query: search query
        :returns: built filters
        """
        # Terms left empty once their wildcards are removed are skipped: they would only add
        # filters on an empty text (this also covers repeated, leading or trailing spaces).
        terms = [term for term in search_query.split() if term.strip('*')]
        if not terms:
            # Nothing but wildcards / spaces: the query is kept as a single term
            terms = [search_query]

        filters = None
        for term in terms:
            term_filters = self._get_term_filters(term)
            filters = term_filters if filters is None else filters & term_filters

        if settings.DEBUG:
            logger.debug(u'HaystackSearchForm::get_filters(%s) filters = %s', search_query, filters)
        return filters

    @staticmethod
    def get_fields():
        """
        Gets the fields for the search

        :returns: list of fields
        """
        fields = ['document_file.content', 'text', 'content', 'title', ]
        if settings.DEBUG:
            logger.debug(u'HaystackSearchForm::get_fields() fields = %s', fields)
        return fields

    def search(self):
        """
        Run the search described by the form data

        The searched text is filtered on the selected models or, when no model is selected,
        on the application(s) given by :meth:`get_search_apps`.

        :returns: search query set, or the result of :meth:`no_query_found` when the form is not valid
                  or when no text is searched
        """
        if not self.is_valid():
            return self.no_query_found()

        search_query = self.cleaned_data.get(self.search_field_name, None)
        if not search_query:
            return self.no_query_found()

        search_apps = self.get_search_apps()
        search_models = self.get_models()
        more_like_this = self.cleaned_data.get('more_like_this', False)

        filters = self.get_filters(search_query)

        if search_models:
            # Any of the selected models is accepted: the alternatives are gathered first and joined
            # to the text filters once, after the loop, so that no model is left out.
            models_filters = None
            for model in search_models:
                model_ct = ContentType.objects.get_for_model(model)
                _filter = SQ(django_ct__iexact='%s.%s' % (model_ct.app_label, model_ct.model))
                models_filters = _filter if models_filters is None else models_filters | _filter
            filters = (filters & models_filters) if filters else models_filters
        elif isinstance(search_apps, basestring):
            filters &= SQ(django_ct__startswith=search_apps)
        elif isinstance(search_apps, (tuple, list)):
            apps_filters = None
            for search_app in search_apps:
                _filter = SQ(django_ct__startswith=search_app)
                apps_filters = _filter if apps_filters is None else apps_filters | _filter
            if apps_filters is not None:
                filters &= apps_filters

        search_query_set = self.searchqueryset.filter(filters)
        if settings.DEBUG:
            logger.debug(u'HaystackSearchForm::search() '
                         u'search_query_set.query = %s (%d)', search_query_set.query,
                         len(search_query_set))

        # Search for `more_like_this` items
        if more_like_this:
            # NOTE(review): the searched text is given to `more_like_this`, which looks like it expects
            # a model instance - to be confirmed before the `more_like_this` field is enabled again.
            search_query_set = search_query_set.more_like_this(search_query).load_all()

        if self.load_all:
            search_query_set = search_query_set.load_all()

        if settings.DEBUG:
            logger.debug(u'HaystackSearchForm::search() search_query (1) = %s ', search_query_set.query)
            logger.debug(u'HaystackSearchForm::search() len(search_query_set) = %d '
                         u'(after models filtering)', len(search_query_set))
        return search_query_set

    @staticmethod
    def no_query_found():
        """
        Gives the results used when there is nothing to search for

        :returns: empty list
        """
        return []