[IFullTextIndexSerializable] use fulltext_indexable_attributes attribute to update cw_attr_cache

  • get rid of CUSTOM_ATTRIBUTES; replace it with the custom_indexable_attributes attribute of the IFullTextIndexSerializable adapter;
  • IFullTextIndexSerializable.serialize method: the default value of the complete argument is now True.

closes #15473085

author: Katia Saurfelt <katia.saurfelt@logilab.fr>
changeset: af6334be7b14
branch: default
phase: draft
hidden: yes
parent revision: #e6392f5058c4 [pkg] make elasticsearch a new-layout-style cube
child revision: <not specified>
files modified by this revision
cubicweb_elasticsearch/ccplugin.py
cubicweb_elasticsearch/entities.py
cubicweb_elasticsearch/es.py
cubicweb_elasticsearch/hooks.py
cubicweb_elasticsearch/testutils.py
test/test_elastic_search.py
test/test_hooks.py
test/test_ifulltextadapter.py
test/test_parents.py
# HG changeset patch
# User Katia Saurfelt <katia.saurfelt@logilab.fr>
# Date 1490017003 -3600
# Mon Mar 20 14:36:43 2017 +0100
# Node ID af6334be7b14a96b93b7e6793f8ea5602a6a3217
# Parent e6392f5058c418affb1b40649c9460ced29f1d61
[IFullTextIndexSerializable] use fulltext_indexable_attributes attribute to update cw_attr_cache

* get rid of CUSTOM_ATTRIBUTES. Replace it with IFullTextIndexSerializable adapter `custom_indexable_attributes`
attribute;
* IFullTextIndexSerializable.serialize method: the default value of the `complete` argument is now True


closes #15473085

diff --git a/cubicweb_elasticsearch/ccplugin.py b/cubicweb_elasticsearch/ccplugin.py
@@ -95,18 +95,18 @@
1 
2      def bulk_actions(self, etypes, cnx, index_name=None, dry_run=False):
3          if index_name is None:
4              index_name = cnx.vreg.config['index-name']
5          for etype in etypes:
6 -            rql = fulltext_indexable_rql(etype, cnx.vreg.schema)
7 +            rql = fulltext_indexable_rql(etype, cnx)
8              rset = cnx.execute(rql)
9              cnx.info(u'[{}] indexing {} {} entities'.format(index_name, len(rset), etype))
10              cnx.debug(u'RQL: {}'.format(rql))
11 
12              for entity in rset.entities():
13                  serializer = entity.cw_adapt_to('IFullTextIndexSerializable')
14 -                json = serializer.serialize()
15 +                json = serializer.serialize(complete=False)
16                  if not dry_run and json:
17                      # Entities with
18                      # fulltext_containers relations return their container
19                      # IFullTextIndex serializer , therefor the "id" and
20                      # "doc_type" in kwargs bellow must be container data.
diff --git a/cubicweb_elasticsearch/entities.py b/cubicweb_elasticsearch/entities.py
@@ -17,10 +17,12 @@
21 
22  """cubicweb-elasticsearch entity's classes"""
23 
24  import collections
25 
26 +from logilab.common.decorators import cachedproperty
27 +
28  from cubicweb import view, neg_role
29  from cubicweb.predicates import is_instance
30 
31  from cubicweb.appobject import AppObject
32 
@@ -86,21 +88,40 @@
33      directly serialized to e.g. JSON.
34      """
35 
36      __regid__ = 'IFullTextIndexSerializable'
37      __select__ = is_instance('Any')
38 +    custom_indexable_attributes = ()
39 +    skip_indexable_attributes = ()
40 
41 -    def serialize(self, complete=False):
42 +    @cachedproperty
43 +    def fulltext_indexable_attributes(self):
44 +        eschema = self._cw.vreg.schema[self.entity.cw_etype]
45 +        attrs = ['creation_date', 'modification_date', 'cwuri']
46 +        attrs.extend([r.type for r in eschema.indexable_attributes()
47 +                      if r.type not in self.skip_indexable_attributes])
48 +        for rschema, tschema in eschema.attribute_definitions():
49 +            if rschema.type == 'eid':
50 +                continue
51 +            # XXX
52 +            if tschema.type in ('Int', 'Float'):
53 +                attrs.append(rschema.type)
54 +        attrs.extend(self.custom_indexable_attributes)
55 +        return attrs
56 +
57 +    def serialize(self, complete=True):
58          entity = self.entity
59          if complete:
60              entity.complete()
61          data = {
62              'cw_etype': entity.cw_etype,
63              'eid': entity.eid,
64              'cwuri': entity.cwuri,
65          }
66 -        data.update(entity.cw_attr_cache)
67 +        for attr in self.fulltext_indexable_attributes:
68 +            if attr in entity.cw_attr_cache:
69 +                data[attr] = entity.cw_attr_cache[attr]
70          self.update_parent_info(data, entity)
71          # TODO take a look at what's in entity.cw_relation_cache
72          return data
73 
74      def update_parent_info(self, data, entity):
@@ -112,11 +133,11 @@
75 
76 
77  class File(IFullTextIndexSerializable):
78      __select__ = IFullTextIndexSerializable.__select__ & is_instance('File')
79 
80 -    def serialize(self, complete=False):
81 +    def serialize(self, complete=True):
82          """this could be a generic implementation of fulltext_containers indexation, but for
83 
84          now we can not return more than one parent json which is fine
85          for Files
86          """
diff --git a/cubicweb_elasticsearch/es.py b/cubicweb_elasticsearch/es.py
@@ -25,12 +25,10 @@
87 
88  INDEXABLE_TYPES = None
89 
90  # customization mechanism, in your cube, add your type as a key, and a list of
91  # additionnal attributes
92 -# eg. CUSTOM_ATTRIBUTES['BlogEntry'] = ('description',)
93 -CUSTOM_ATTRIBUTES = {}
94 
95  log = logging.getLogger(__name__)
96 
97 
98  def indexable_types(schema, custom_skip_list=None):
@@ -54,11 +52,11 @@
99                  indexable_types.append(eschema.type)
100      INDEXABLE_TYPES = indexable_types
101      return indexable_types
102 
103 
104 -def fulltext_indexable_rql(etype, schema, eid=None):
105 +def fulltext_indexable_rql(etype, cnx, eid=None):
106      '''
107      Generate RQL with fulltext_indexable attributes for a given entity type
108 
109      :eid:
110         defaults to None, set it to an eid to get RQL for a single element (used in hooks)
@@ -68,28 +66,16 @@
111      rql = ['WHERE %s is %s' % (V, etype)]
112      if eid:
113          rql.append('%s eid %i' % (V, eid))
114      var = next(varmaker)
115      selected = []
116 -    for rschema in schema.eschema(etype).indexable_attributes():
117 -        attr = rschema.type
118 +    cw_entity = cnx.vreg['etypes'].etype_class(etype)(cnx)
119 +    for attr in cw_entity.cw_adapt_to(
120 +            'IFullTextIndexSerializable').fulltext_indexable_attributes:
121          var = next(varmaker)
122          rql.append('%s %s %s' % (V, attr, var))
123          selected.append(var)
124 -    for rschema, tschema in schema.eschema(etype).attribute_definitions():
125 -        if rschema.type == 'eid':
126 -            continue
127 -        if tschema.type in ('Int', 'Float'):
128 -            attr = rschema.type
129 -            var = next(varmaker)
130 -            rql.append('%s %s %s' % (V, attr, var))
131 -            selected.append(var)
132 -    for attr in ('creation_date', 'modification_date', 'cwuri') + CUSTOM_ATTRIBUTES.get(etype, ()):
133 -        var = next(varmaker)
134 -        rql.append('%s %s %s' % (V, attr, var))
135 -        selected.append(var)
136 -    # TODO inlined relations ?
137      return 'Any %s,%s %s' % (V, ','.join(selected),
138                               ','.join(rql))
139 
140 
141  def create_index(es, index_name, settings=None):
diff --git a/cubicweb_elasticsearch/hooks.py b/cubicweb_elasticsearch/hooks.py
@@ -22,18 +22,19 @@
142  from elasticsearch.exceptions import ConnectionError
143  from urllib3.exceptions import ProtocolError
144 
145  from cubicweb.server import hook
146  from cubicweb.predicates import score_entity
147 -from cubicweb_elasticsearch.es import indexable_types, fulltext_indexable_rql, CUSTOM_ATTRIBUTES
148 +
149 +from cubicweb_elasticsearch.es import indexable_types
150 
151  log = logging.getLogger(__name__)
152 
153 
154  def entity_indexable(entity):
155      return entity.cw_etype in indexable_types(entity._cw.vreg.schema) or \
156 -        entity.cw_etype in CUSTOM_ATTRIBUTES
157 +        entity.cw_adapt_to('IFullTextIndexSerializable').custom_indexable_attributes
158 
159 
160  class ContentUpdateIndexES(hook.Hook):
161      """detect content change and updates ES indexing"""
162 
@@ -55,13 +56,11 @@
163 
164      def __call__(self):
165          # XXX add a selector for object and subject
166          for entity in (self._cw.entity_from_eid(self.eidfrom),
167                         self._cw.entity_from_eid(self.eidto)):
168 -            cw_etype = entity.cw_etype
169 -            if (cw_etype in indexable_types(entity._cw.vreg.schema) or
170 -                    cw_etype in CUSTOM_ATTRIBUTES):
171 +            if entity_indexable(entity):
172                  IndexEsOperation.get_instance(self._cw).add_data(entity)
173 
174 
175  class IndexEsOperation(hook.DataOperationMixIn, hook.Operation):
176 
@@ -80,16 +79,12 @@
177                      # TODO option for async ?
178                      es.delete(**kwargs)
179                  except (ConnectionError, ProtocolError):
180                      log.debug('Failed to index in hook, could not connect to ES')
181                  continue
182 -            rql = fulltext_indexable_rql(entity.cw_etype,
183 -                                         entity._cw.vreg.schema,
184 -                                         eid=entity.eid)
185 -            indexable_entity = self.cnx.execute(rql).one()
186 -            serializer = indexable_entity.cw_adapt_to('IFullTextIndexSerializable')
187 -            json = serializer.serialize(complete=True)
188 +            serializer = entity.cw_adapt_to('IFullTextIndexSerializable')
189 +            json = serializer.serialize()
190              if not json:
191                  # if en entity has been already indexed, we still
192                  # keep the first indexation
193                  # which is wrong. We should remove the existing es entry.
194                  continue
diff --git a/cubicweb_elasticsearch/testutils.py b/cubicweb_elasticsearch/testutils.py
@@ -3,23 +3,26 @@
195 
196  from elasticsearch_dsl.connections import connections
197 
198  from cubicweb.predicates import is_instance
199 
200 -from cubicweb_elasticsearch.es import CUSTOM_ATTRIBUTES
201  from cubicweb_elasticsearch.entities import IFullTextIndexSerializable
202 
203 
204 -CUSTOM_ATTRIBUTES['Blog'] = ('title',)
205 +class BlogEntryFTIAdapter(IFullTextIndexSerializable):
206 +    __select__ = (IFullTextIndexSerializable.__select__ &
207 +                  is_instance('BlogEntry'))
208 +    custom_indexable_attributes = ('title', 'content')
209 +
210 +    def update_parent_info(self, data, entity):
211 +        data['parent'] = entity.entry_of[0].eid
212 
213 
214  class BlogFTIAdapter(IFullTextIndexSerializable):
215      __select__ = (IFullTextIndexSerializable.__select__ &
216 -                  is_instance('BlogEntry'))
217 -
218 -    def update_parent_info(self, data, entity):
219 -        data['parent'] = entity.entry_of[0].eid
220 +                  is_instance('Blog'))
221 +    custom_indexable_attributes = ('title', )
222 
223 
224  class RealESTestMixin(object):
225 
226      @classmethod
diff --git a/test/test_elastic_search.py b/test/test_elastic_search.py
@@ -175,13 +175,12 @@
227 
228  class ElasticsearchTC(testlib.CubicWebTC):
229 
230      def test_1(self):
231          with self.admin_access.cnx() as cnx:
232 -            schema = cnx.vreg.schema
233              etype = 'Person'
234 -            rql = fulltext_indexable_rql(etype, schema)
235 +            rql = fulltext_indexable_rql(etype, cnx)
236              self.assertIn('age', rql)
237              self.assertNotIn('eid', rql)
238              self.assertEqual(rql.count('modification_date'), 1)
239 
240 
diff --git a/test/test_hooks.py b/test/test_hooks.py
@@ -3,19 +3,19 @@
241 
242  from elasticsearch_dsl import Search
243 
244  from cubicweb.devtools import testlib
245 
246 -from cubicweb_elasticsearch.testutils import RealESTestMixin, BlogFTIAdapter
247 +from cubicweb_elasticsearch.testutils import RealESTestMixin, BlogEntryFTIAdapter
248  from cubicweb_elasticsearch.search_helpers import compose_search
249 
250 
251  class ReindexOnRelationTests(RealESTestMixin, testlib.CubicWebTC):
252 
253      def test_es_hooks_modify_relation(self):
254          with self.admin_access.cnx() as cnx:
255 -            with self.temporary_appobjects(BlogFTIAdapter):
256 +            with self.temporary_appobjects(BlogEntryFTIAdapter):
257                  indexer = cnx.vreg['es'].select('indexer', cnx)
258                  indexer.create_index(custom_settings={
259                      'mappings': {
260                          'BlogEntry': {'_parent': {"type": "Blog"}},
261                      }
diff --git a/test/test_ifulltextadapter.py b/test/test_ifulltextadapter.py
@@ -0,0 +1,72 @@
262 +import unittest
263 +
264 +from mock import patch
265 +
266 +from cubicweb.devtools import testlib
267 +from cubicweb.cwconfig import CubicWebConfiguration
268 +
269 +from cubes.elasticsearch.testutils import BlogFTIAdapter, BlogEntryFTIAdapter
270 +
271 +
272 +class IFullTextIndexSerializableTC(testlib.CubicWebTC):
273 +
274 +    def setup_database(self):
275 +        super(IFullTextIndexSerializableTC, self).setup_database()
276 +        self.orig_config_for = CubicWebConfiguration.config_for
277 +        config_for = lambda appid: self.config  # noqa
278 +        CubicWebConfiguration.config_for = staticmethod(config_for)
279 +        self.config['elasticsearch-locations'] = 'http://nonexistant.elastic.search:9200'
280 +        self.config['index-name'] = 'unittest_index_name'
281 +
282 +    @patch('elasticsearch.client.indices.IndicesClient.create')
283 +    @patch('elasticsearch.client.indices.IndicesClient.exists')
284 +    @patch('elasticsearch.client.Elasticsearch.index')
285 +    def test_index_entity(self, create, exists, index):
286 +        """Only update indexable attributes while call entity.complete()
287 +           on IFullTextIndexSerializable.serialze()
288 +        """
289 +        with self.admin_access.repo_cnx() as cnx:
290 +            with self.temporary_appobjects(BlogFTIAdapter, BlogEntryFTIAdapter):
291 +                indexer = cnx.vreg['es'].select('indexer', cnx)
292 +                es = indexer.get_connection()
293 +                blog = cnx.create_entity('Blog', title=u'Blog')
294 +                cnx.commit()
295 +                self.assertTrue(es.index.called)
296 +                args, kwargs = es.index.call_args
297 +                # blog title is a in custom_indexable_attributes
298 +                self.assertEqual(kwargs['doc_type'], 'Blog')
299 +                self.assertEqual(kwargs['body']['title'], u'Blog')
300 +                index.reset_mock()
301 +                # create a BlogEntry
302 +                bentry = cnx.create_entity('BlogEntry', title=u'program',
303 +                                           content=u'Le nouveau programme',
304 +                                           entry_of=blog)
305 +                cnx.commit()
306 +                self.assertTrue(es.index.called)
307 +                args, kwargs = es.index.call_args
308 +                self.assertEqual(kwargs['doc_type'], 'BlogEntry')
309 +                for arg_name, expected_value in (
310 +                        ('content', u'Le nouveau programme'),
311 +                        ('cwuri', bentry.cwuri),
312 +                        ('title', 'program')):
313 +                    self.assertEqual(kwargs['body'][arg_name], expected_value)
314 +                self.assertFalse('content_format' in kwargs['body'])
315 +                # update BlogEntry
316 +                bentry.cw_set(title=u'Programme')
317 +                cnx.commit()
318 +                index.reset_mock()
319 +                self.assertTrue(es.index.called)
320 +                args, kwargs = es.index.call_args
321 +                for arg_name, expected_value in (
322 +                        ('id', bentry.eid), ('doc_type', bentry.cw_etype)):
323 +                    self.assertEqual(kwargs[arg_name], expected_value)
324 +                for arg_name, expected_value in (
325 +                        ('content', u'Le nouveau programme'),
326 +                        ('cwuri', bentry.cwuri),
327 +                        ('title', u'Programme')):
328 +                    self.assertEqual(kwargs['body'][arg_name], expected_value)
329 +                self.assertFalse('content_format' in kwargs['body'])
330 +
331 +
332 +if __name__ == '__main__':
333 +    unittest.main()
diff --git a/test/test_parents.py b/test/test_parents.py
@@ -7,18 +7,18 @@
334 
335  from cubicweb.devtools import testlib
336 
337  from cubicweb_elasticsearch.search_helpers import compose_search
338 
339 -from cubicweb_elasticsearch.testutils import RealESTestMixin, BlogFTIAdapter
340 +from cubicweb_elasticsearch.testutils import RealESTestMixin, BlogEntryFTIAdapter
341 
342 
343  class ParentsSearchTC(RealESTestMixin, testlib.CubicWebTC):
344 
345      def test_parent_search(self):
346          with self.admin_access.cnx() as cnx:
347 -            with self.temporary_appobjects(BlogFTIAdapter):
348 +            with self.temporary_appobjects(BlogEntryFTIAdapter):
349                  indexer = cnx.vreg['es'].select('indexer', cnx)
350                  indexer.create_index(custom_settings={
351                      'mappings': {
352                          'BlogEntry': {'_parent': {"type": "Blog"}},
353                      }