Advertisement
Guest User

Untitled

a guest
Feb 8th, 2016
61
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 17.23 KB | None | 0 0
  1. # -*- coding: utf-8 -*-
  2. """
  3. Search features for :
  4.  
  5. * :mod:`elasticsearch.elasticsearch`
  6. * :mod:`haystack:haystack`
  7. * :mod:`elasticstack:elasticstack`
  8.  
  9. :creationdate: 05/11/15 15:05
  10. :moduleauthor: François GUÉRIN <fguerin@ville-tourcoing.fr>
  11. :modulename: intrautils.search
  12.  
  13. """
  14. import base64
  15. import json
  16. import logging
  17. from copy import copy, deepcopy
  18.  
  19. import haystack
  20. from django import forms
  21. from django.conf import settings
  22. from django.contrib.contenttypes.models import ContentType
  23. from django.db import models as dj_models
  24. from django.db.models.fields.files import FieldFile as dj_File
  25. from django.utils.translation import ugettext_lazy as _
  26. from elasticsearch import NotFoundError
  27. from elasticstack.backends import ConfigurableElasticBackend, ConfigurableElasticSearchEngine
  28. from elasticstack.fields import FacetField
  29. from elasticstack.forms import SearchForm
  30. from filer.models import File as fi_File
  31. from form_utils.forms import BetterForm
  32. from haystack import DEFAULT_ALIAS
  33. from haystack.backends import SQ
  34. from haystack.constants import DJANGO_CT, DJANGO_ID
  35. from haystack.fields import SearchField
  36. from haystack.forms import model_choices
  37. from urllib3.fields import guess_content_type
  38.  
  39. from utils.forms import CollapsibleFieldsetFormMixin
  40.  
  41. __author__ = 'fguerin'
  42. logger = logging.getLogger('intrautils.search')
  43.  
  44. DEFAULT_TYPE_MAPPINGS = {'type': 'string', 'analyzer': 'french'}
  45. #: Type mapings
  46. TYPE_MAPPINGS = {
  47. 'string': {'type': 'string', 'analyzer': 'french',},
  48. 'edge_ngram': {'type': 'string', 'analyzer': 'edgengram_analyzer'},
  49. 'ngram': {'type': 'string', 'analyzer': 'ngram_analyzer'},
  50. 'date': {'type': 'date'},
  51. 'datetime': {'type': 'date'},
  52. 'location': {'type': 'geo_point'},
  53. 'boolean': {'type': 'boolean'},
  54. 'float': {'type': 'float'},
  55. 'long': {'type': 'long'},
  56. 'integer': {'type': 'long'},
  57. 'attachment': {'type': 'attachment',
  58. 'fields': {
  59. 'content': {
  60. 'copy_to': 'copy',
  61. 'type': 'string',
  62. 'term_vector': 'with_positions_offsets',
  63. 'store': 'yes',
  64. 'analyzer': 'edgengram_analyzer'},
  65. 'title': {'analyzer': 'french'},
  66. 'author': {
  67. 'analyzer': 'edgengram_analyzer'},
  68. 'content_type': {
  69. 'analyzer': 'edgengram_analyzer'},
  70. 'content_length': {
  71. 'store': 'yes',
  72. 'type': 'integer'}},
  73. }
  74. }
  75.  
  76.  
  77. class ExtendedElasticsearchBackend(ConfigurableElasticBackend):
  78. """
  79. Adds ***attachment*** support for elasticsearch backend settings
  80. """
  81.  
  82. def setup(self):
  83. """
  84. Defers loading until needed.
  85. """
  86. # Get the existing mapping & cache it. We'll compare it
  87. # during the ``update`` & if it doesn't match, we'll put the new
  88. # mapping.
  89. try:
  90. self.existing_mapping = self.conn.indices.get_mapping(index=self.index_name)
  91. except NotFoundError:
  92. pass
  93. except Exception:
  94. if not self.silently_fail:
  95. raise
  96.  
  97. unified_index = haystack.connections[self.connection_alias].get_unified_index()
  98. self.content_field_name, field_mapping = self.build_schema(unified_index.all_searchfields())
  99. current_mapping = {
  100. 'modelresult': {
  101. 'properties': field_mapping,
  102. }
  103. }
  104.  
  105. if current_mapping != self.existing_mapping:
  106. try:
  107. # Make sure the index is there first.
  108. self.conn.indices.create(index=self.index_name, body=self.DEFAULT_SETTINGS, ignore=400)
  109. self.conn.indices.put_mapping(index=self.index_name, doc_type='modelresult', body=current_mapping)
  110. self.existing_mapping = current_mapping
  111. except Exception:
  112. if not self.silently_fail:
  113. raise
  114.  
  115. self.setup_complete = True
  116.  
  117. def extract_file_contents(self, file_obj):
  118. contents = base64.decode(file_obj)
  119. metadata = {'content_length': len(contents)}
  120. return {'contents': contents, 'metadata': metadata}
  121.  
  122. def build_schema(self, fields):
  123. """
  124. Merge from `haystack` and `elasticstack` `elasticsearch` backend `build_shema` methods.
  125. It provides an additional feature : custom field mappings, from settings or default FIELD_MAPPINGS dict.
  126.  
  127. :param fields: fields to map to the backend
  128. :returns: tuple content_field_name, mapping
  129. """
  130. content_field_name = ''
  131. final_mapping = {
  132. DJANGO_CT: {'type': 'string', 'index': 'not_analyzed', 'include_in_all': False},
  133. DJANGO_ID: {'type': 'string', 'index': 'not_analyzed', 'include_in_all': False},
  134. }
  135. type_mappings = copy(TYPE_MAPPINGS)
  136. default_type_mappings = copy(DEFAULT_TYPE_MAPPINGS)
  137.  
  138. settings.DEBUG and logger.debug(u'ExtendedElasticsearchBackend::build_schema() '
  139. u'default_type_mappings = \n%s'
  140. u'\ntype_mappings = \n%s',
  141. json.dumps(default_type_mappings, indent=2),
  142. json.dumps(type_mappings, indent=2))
  143.  
  144. for field_name, field_class in fields.items():
  145. field_type = field_class.field_type
  146. _mapping_for_field = type_mappings.get(field_type, default_type_mappings)
  147. # settings.DEBUG and logger.debug(u'ExtendedElasticsearchBackend::build_schema() '
  148. # u'field_name = %s / field_type = %s / _mapping_for_field = \n%s',
  149. # field_name, field_type, json.dumps(_mapping_for_field, indent=2))
  150. if field_class.boost != 1.0:
  151. _mapping_for_field['boost'] = field_class.boost
  152.  
  153. if field_class.document is True:
  154. content_field_name = field_class.index_fieldname
  155.  
  156. # Do this last to override `text` fields.
  157. if _mapping_for_field['type'] == 'string' and field_class.indexed:
  158. if not hasattr(field_class, 'facet_for') and not field_class.field_type in ('ngram', 'edge_ngram'):
  159. _mapping_for_field['analyzer'] = getattr(field_class, 'analyzer', self.DEFAULT_ANALYZER)
  160.  
  161. final_mapping[field_class.index_fieldname] = _mapping_for_field
  162.  
  163. settings.DEBUG and logger.debug(u'ExtendedElasticsearchBackend::build_schema() '
  164. u'mapping = \n%s',
  165. json.dumps(final_mapping, indent=2))
  166.  
  167. return content_field_name, final_mapping
  168.  
  169. def more_like_this(self, model_instance, additional_query_string=None, result_class=None, **kwargs):
  170. """
  171. Gives "more like this" items
  172.  
  173. :param model_instance: model instance
  174. :param additional_query_string: additional srting
  175. :param result_class: result
  176. :param kwargs: additional kwargs
  177. :returns: super
  178. """
  179. return super(ExtendedElasticsearchBackend, self).more_like_this(model_instance, additional_query_string,
  180. result_class, **kwargs)
  181.  
  182. def update(self, index, iterable=None, commit=True):
  183. return super(ExtendedElasticsearchBackend, self).update(index, iterable)
  184.  
  185. def build_search_kwargs(self, query_string, sort_by=None, start_offset=0, end_offset=None, fields='',
  186. highlight=False, facets=None, date_facets=None, query_facets=None, narrow_queries=None,
  187. spelling_query=None, within=None, dwithin=None, distance_point=None, models=None,
  188. limit_to_registered_models=None, result_class=None):
  189.  
  190. return super(ExtendedElasticsearchBackend, self).build_search_kwargs(query_string, sort_by, start_offset,
  191. end_offset, fields,
  192. highlight, facets, date_facets,
  193. query_facets, narrow_queries,
  194. spelling_query, within, dwithin,
  195. distance_point, models,
  196. limit_to_registered_models, result_class)
  197.  
  198.  
  199. class ExtendedElasticSearchEngine(ConfigurableElasticSearchEngine):
  200. backend = ExtendedElasticsearchBackend
  201.  
  202.  
  203. class AttachmentField(SearchField):
  204. """
  205. Mapping for an `AttachmentField`
  206. """
  207. field_type = 'attachment'
  208. author_field_name = 'user_author'
  209. author = None
  210.  
  211. def __init__(self, **kwargs):
  212. if 'content_type_field' in kwargs:
  213. self.content_type_field = kwargs.pop('content_type_field')
  214. if 'author' in kwargs:
  215. self.author = kwargs.pop(self.author_field_name)
  216.  
  217. super(AttachmentField, self).__init__(**kwargs)
  218.  
  219. def convert(self, value):
  220. """
  221. Convert an attachment file to serializable data
  222.  
  223. :param value: value to convert
  224. :returns: converted data
  225. """
  226. output = value
  227. return output
  228.  
  229. @staticmethod
  230. def _get_file_data(field):
  231. if isinstance(field, fi_File):
  232. field_file = field.file
  233. title = name = field.label
  234. content_type = guess_content_type(name)
  235. try:
  236. content = base64.b64encode(field_file.read())
  237. except AttributeError:
  238. content = base64.b64encode(field_file)
  239. try:
  240. content_length = len(field_file)
  241. except TypeError:
  242. content_length = len(field_file.file)
  243.  
  244. else: # isinstance(field, dj_File):
  245. field_file = field
  246. title = name = field_file.name
  247. content_type = guess_content_type(name)
  248. try:
  249. content_length = len(field_file)
  250. except TypeError:
  251. content_length = len(field_file.file)
  252. try:
  253. content = base64.b64encode(field_file.read())
  254. except AttributeError:
  255. content = base64.b64encode(field_file)
  256.  
  257. output = {'_language': 'fr',
  258. '_content': content,
  259. '_content_type': content_type,
  260. '_name': name,
  261. '_title': title,
  262. '_content_length': content_length}
  263. # output = content
  264. return output
  265.  
  266. def prepare(self, obj):
  267. if self.model_attr:
  268. field = getattr(obj, self.model_attr)
  269. else:
  270. field = obj
  271.  
  272. if not isinstance(field, (dj_File, fi_File)):
  273. raise NotImplementedError('AttachmentField does not implement file reading for %s file'
  274. % field.__class__.__name__)
  275. output = self._get_file_data(field)
  276.  
  277. if settings.DEBUG:
  278. _output = deepcopy(output)
  279. _output.update({'_content': _output['_content'][:50] + '...'})
  280. logger.debug(u'AttachmentField::prepare() output = %s', json.dumps(_output, indent=2))
  281.  
  282. return output
  283.  
  284.  
  285. class FacetedAttachmentField(FacetField, AttachmentField):
  286. """
  287. Glue class to bind together `FacetField` and `AttachmentField`
  288. """
  289. pass
  290.  
  291.  
  292. def application_model_choices(app_name, using=DEFAULT_ALIAS):
  293. choices = model_choices(using)
  294. output = []
  295. if isinstance(app_name, (tuple, list)):
  296. for app in app_name:
  297. output.extend(application_model_choices(app, using))
  298. else:
  299. for choice in choices:
  300. if app_name in choice[0]:
  301. output.append(choice)
  302. output = sorted(output, key=(lambda x: x[1]))
  303. return output
  304.  
  305.  
  306. class HaystackSearchForm(CollapsibleFieldsetFormMixin, SearchForm, BetterForm):
  307. """
  308. :mod:`haystack:haystack` search form for main `searching` feature
  309. """
  310.  
  311. class Meta:
  312. fieldsets = (('main', {'legend': _('search'), 'fields': ('search_query', 'models', 'more_like_this')}),)
  313.  
  314. search_field_name = 'search_query'
  315. load_all = True
  316.  
  317. #: can be a single application or a list of applications
  318. search_app = None
  319.  
  320. #: global search field
  321. search_query = forms.CharField(label=_('Search'), required=False, max_length=255,
  322. help_text=_('You can use the wildcard * to search for words fragments, '
  323. 'by example "comm*" will search for words starting by "comm". '
  324. 'You can also write more than a word, each word will be searched.'))
  325.  
  326. # more_like_this = forms.BooleanField(label=_('More like this'), required=False)
  327.  
  328. def get_search_apps(self):
  329. if self.search_app:
  330. return self.search_app
  331. return None
  332.  
  333. def get_models(self):
  334. """
  335. Return an alphabetical list of model classes in the index.
  336. """
  337. search_models = []
  338. if self.is_valid():
  339. for model in self.cleaned_data['models']:
  340. # noinspection PyUnresolvedReferences
  341. search_models.append(dj_models.get_model(*model.split('.')))
  342.  
  343. return search_models
  344.  
  345. def get_filters(self, search_query):
  346. """
  347. Build filter from a search_query
  348.  
  349. :param search_query: search query
  350. :returns: built filters
  351. """
  352. searched = search_query.strip('*')
  353. if ' ' in searched:
  354. filters = SQ()
  355. search_list = search_query.split(' ')
  356. for item in search_list:
  357. sub_filters = SQ(text__contains=item.strip('*'))
  358. if item.startswith('*'):
  359. sub_filters |= SQ(text__endswith=item.strip('*'))
  360. if item.endswith('*'):
  361. sub_filters |= SQ(text__startswith=item.strip('*'))
  362. filters &= sub_filters
  363. else:
  364. filters = SQ(text__contains=searched)
  365. if search_query.startswith('*'):
  366. filters |= SQ(text__endswith=searched)
  367. if search_query.endswith('*'):
  368. filters |= SQ(text__startswith=searched)
  369. settings.DEBUG and logger.debug(u'HaystackSearchForm::get_filters(%s) filters = %s', search_query, filters)
  370. return filters
  371.  
  372. @staticmethod
  373. def get_fields():
  374. """
  375. Gets the fields for the search
  376.  
  377. :returns: list of fields
  378. """
  379. fields = ['document_file.content', 'text', 'content', 'title', ]
  380. settings.DEBUG and logger.debug(u'HaystackSearchForm::get_fields() fields = %s', fields)
  381. return fields
  382.  
  383. def search(self):
  384. if not self.is_valid():
  385. return self.no_query_found()
  386.  
  387. if not self.cleaned_data.get(self.search_field_name):
  388. return self.no_query_found()
  389.  
  390. search_apps = self.get_search_apps()
  391. search_query = self.cleaned_data.get(self.search_field_name, None)
  392. search_models = self.get_models()
  393. more_liks_this = self.cleaned_data.get('more_like_this', False)
  394.  
  395. filters = self.get_filters(search_query)
  396.  
  397. if search_models:
  398. sub_filters = None
  399. for model in search_models:
  400. model_ct = ContentType.objects.get_for_model(model)
  401. _filter = SQ(django_ct__iexact='%s.%s' % (model_ct.app_label, model_ct.model))
  402. sub_filters = (sub_filters | _filter) if sub_filters else _filter
  403. filters = filters & sub_filters if filters else sub_filters
  404. else:
  405. if isinstance(search_apps, basestring):
  406. filters &= SQ(django_ct__startswith=search_apps)
  407. elif isinstance(search_apps, (tuple, list)):
  408. sub_filters = None
  409. for search_app in search_apps:
  410. _filter = SQ(django_ct__startswith=search_app)
  411. if sub_filters:
  412. sub_filters |= _filter
  413. else:
  414. sub_filters = _filter
  415. if sub_filters:
  416. filters &= sub_filters
  417.  
  418. search_query_set = self.searchqueryset.filter(filters)
  419. settings.DEBUG and logger.debug(u'HaystackSearchForm::search() '
  420. u'search_query_set.query = %s (%d)', search_query_set.query,
  421. len(search_query_set))
  422.  
  423. # Search for data
  424. if not search_query_set:
  425. search_query_set = search_query_set.load_all()
  426.  
  427. # Search for `more_liks_this` items
  428. if search_query and more_liks_this:
  429. search_query_set = search_query_set.more_like_this(search_query).load_all()
  430.  
  431. if self.load_all:
  432. search_query_set = search_query_set.load_all()
  433.  
  434. settings.DEBUG and logger.debug(u'HaystackSearchForm::search() search_query (1) = %s ', search_query_set.query)
  435. settings.DEBUG and logger.debug(u'HaystackSearchForm::search() len(search_query_set) = %d '
  436. u'(after models filtering)', len(search_query_set))
  437. return search_query_set
  438.  
  439. @staticmethod
  440. def no_query_found():
  441. return []
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement