[IFullTextIndexSerializable] use fulltext_indexable_attributes attribute to update cw_attr_cache

  • get rid of CUSTOM_ATTRIBUTES; replace it with the `custom_indexable_attributes` attribute of the IFullTextIndexSerializable adapter;
  • IFullTextIndexSerializable.serialize method: the default value of `complete` is now True.

closes #15473085

author: Katia Saurfelt <katia.saurfelt@logilab.fr>
changeset: 112ff3005d97
branch: default
phase: draft
hidden: yes
parent revision: #3d7845ff31dd [hook] cleaning: `parent` arg is no more used
child revision: <not specified>
files modified by this revision
ccplugin.py
entities.py
es.py
hooks.py
test/test_elastic_search.py
test/test_hooks.py
test/test_ifulltextadapter.py
test/test_parents.py
testutils.py
# HG changeset patch
# User Katia Saurfelt <katia.saurfelt@logilab.fr>
# Date 1490017003 -3600
# Mon Mar 20 14:36:43 2017 +0100
# Node ID 112ff3005d97929e54e524560526203305e92e99
# Parent 3d7845ff31dd9681b2c3cf55bbb8fd0b3ebe0bf7
[IFullTextIndexSerializable] use fulltext_indexable_attributes attribute to update cw_attr_cache

* get rid of CUSTOM_ATTRIBUTES. Replace it with IFullTextIndexSerializable adapter `custom_indexable_attributes`
attribute;
* FullTextIndexSerializable.serialize method : default complete value is now True*


closes #15473085

diff --git a/ccplugin.py b/ccplugin.py
@@ -95,18 +95,18 @@
1 
2      def bulk_actions(self, etypes, cnx, index_name=None, dry_run=False):
3          if index_name is None:
4              index_name = cnx.vreg.config['index-name']
5          for etype in etypes:
6 -            rql = fulltext_indexable_rql(etype, cnx.vreg.schema)
7 +            rql = fulltext_indexable_rql(etype, cnx)
8              rset = cnx.execute(rql)
9              cnx.info(u'[{}] indexing {} {} entities'.format(index_name, len(rset), etype))
10              cnx.debug(u'RQL: {}'.format(rql))
11 
12              for entity in rset.entities():
13                  serializer = entity.cw_adapt_to('IFullTextIndexSerializable')
14 -                json = serializer.serialize()
15 +                json = serializer.serialize(complete=False)
16                  if not dry_run and json:
17                      # Entities with
18                      # fulltext_containers relations return their container
19                      # IFullTextIndex serializer , therefor the "id" and
20                      # "doc_type" in kwargs bellow must be container data.
diff --git a/entities.py b/entities.py
@@ -17,10 +17,12 @@
21 
22  """cubicweb-elasticsearch entity's classes"""
23 
24  import collections
25 
26 +from logilab.common.decorators import cachedproperty
27 +
28  from cubicweb import view, neg_role
29  from cubicweb.predicates import is_instance
30 
31  from cubicweb.appobject import AppObject
32 
@@ -86,21 +88,40 @@
33      directly serialized to e.g. JSON.
34      """
35 
36      __regid__ = 'IFullTextIndexSerializable'
37      __select__ = is_instance('Any')
38 +    custom_indexable_attributes = ()
39 +    skip_indexable_attributes = ()
40 
41 -    def serialize(self, complete=False):
42 +    @cachedproperty
43 +    def fulltext_indexable_attributes(self):
44 +        eschema = self._cw.vreg.schema[self.entity.cw_etype]
45 +        attrs = ['creation_date', 'modification_date', 'cwuri']
46 +        attrs.extend([r.type for r in eschema.indexable_attributes()
47 +                      if r.type not in self.skip_indexable_attributes])
48 +        for rschema, tschema in eschema.attribute_definitions():
49 +            if rschema.type == 'eid':
50 +                continue
51 +            # XXX
52 +            if tschema.type in ('Int', 'Float'):
53 +                attrs.append(rschema.type)
54 +        attrs.extend(self.custom_indexable_attributes)
55 +        return attrs
56 +
57 +    def serialize(self, complete=True):
58          entity = self.entity
59          if complete:
60              entity.complete()
61          data = {
62              'cw_etype': entity.cw_etype,
63              'eid': entity.eid,
64              'cwuri': entity.cwuri,
65          }
66 -        data.update(entity.cw_attr_cache)
67 +        for attr in self.fulltext_indexable_attributes:
68 +            if attr in entity.cw_attr_cache:
69 +                data[attr] = entity.cw_attr_cache[attr]
70          self.update_parent_info(data, entity)
71          # TODO take a look at what's in entity.cw_relation_cache
72          return data
73 
74      def update_parent_info(self, data, entity):
@@ -112,11 +133,11 @@
75 
76 
77  class File(IFullTextIndexSerializable):
78      __select__ = IFullTextIndexSerializable.__select__ & is_instance('File')
79 
80 -    def serialize(self, complete=False):
81 +    def serialize(self, complete=True):
82          """this could be a generic implementation of fulltext_containers indexation, but for
83 
84          now we can not return more than one parent json which is fine
85          for Files
86          """
diff --git a/es.py b/es.py
@@ -25,12 +25,10 @@
87 
88  INDEXABLE_TYPES = None
89 
90  # customization mechanism, in your cube, add your type as a key, and a list of
91  # additionnal attributes
92 -# eg. CUSTOM_ATTRIBUTES['BlogEntry'] = ('description',)
93 -CUSTOM_ATTRIBUTES = {}
94 
95  log = logging.getLogger(__name__)
96 
97 
98  def indexable_types(schema, custom_skip_list=None):
@@ -54,11 +52,11 @@
99                  indexable_types.append(eschema.type)
100      INDEXABLE_TYPES = indexable_types
101      return indexable_types
102 
103 
104 -def fulltext_indexable_rql(etype, schema, eid=None):
105 +def fulltext_indexable_rql(etype, cnx, eid=None):
106      '''
107      Generate RQL with fulltext_indexable attributes for a given entity type
108 
109      :eid:
110         defaults to None, set it to an eid to get RQL for a single element (used in hooks)
@@ -68,28 +66,16 @@
111      rql = ['WHERE %s is %s' % (V, etype)]
112      if eid:
113          rql.append('%s eid %i' % (V, eid))
114      var = next(varmaker)
115      selected = []
116 -    for rschema in schema.eschema(etype).indexable_attributes():
117 -        attr = rschema.type
118 +    cw_entity = cnx.vreg['etypes'].etype_class(etype)(cnx)
119 +    for attr in cw_entity.cw_adapt_to(
120 +            'IFullTextIndexSerializable').fulltext_indexable_attributes:
121          var = next(varmaker)
122          rql.append('%s %s %s' % (V, attr, var))
123          selected.append(var)
124 -    for rschema, tschema in schema.eschema(etype).attribute_definitions():
125 -        if rschema.type == 'eid':
126 -            continue
127 -        if tschema.type in ('Int', 'Float'):
128 -            attr = rschema.type
129 -            var = next(varmaker)
130 -            rql.append('%s %s %s' % (V, attr, var))
131 -            selected.append(var)
132 -    for attr in ('creation_date', 'modification_date', 'cwuri') + CUSTOM_ATTRIBUTES.get(etype, ()):
133 -        var = next(varmaker)
134 -        rql.append('%s %s %s' % (V, attr, var))
135 -        selected.append(var)
136 -    # TODO inlined relations ?
137      return 'Any %s,%s %s' % (V, ','.join(selected),
138                               ','.join(rql))
139 
140 
141  def create_index(es, index_name, settings=None):
diff --git a/hooks.py b/hooks.py
@@ -22,18 +22,18 @@
142  from elasticsearch.exceptions import ConnectionError
143  from urllib3.exceptions import ProtocolError
144 
145  from cubicweb.server import hook
146  from cubicweb.predicates import score_entity
147 -from cubes.elasticsearch.es import indexable_types, fulltext_indexable_rql, CUSTOM_ATTRIBUTES
148 +from cubes.elasticsearch.es import indexable_types
149 
150  log = logging.getLogger(__name__)
151 
152 
153  def entity_indexable(entity):
154      return entity.cw_etype in indexable_types(entity._cw.vreg.schema) or \
155 -        entity.cw_etype in CUSTOM_ATTRIBUTES
156 +        entity.cw_adapt_to('IFullTextIndexSerializable').custom_indexable_attributes
157 
158 
159  class ContentUpdateIndexES(hook.Hook):
160      """detect content change and updates ES indexing"""
161 
@@ -55,13 +55,11 @@
162 
163      def __call__(self):
164          # XXX add a selector for object and subject
165          for entity in (self._cw.entity_from_eid(self.eidfrom),
166                         self._cw.entity_from_eid(self.eidto)):
167 -            cw_etype = entity.cw_etype
168 -            if (cw_etype in indexable_types(entity._cw.vreg.schema) or
169 -                    cw_etype in CUSTOM_ATTRIBUTES):
170 +            if entity_indexable(entity):
171                  IndexEsOperation.get_instance(self._cw).add_data(entity)
172 
173 
174  class IndexEsOperation(hook.DataOperationMixIn, hook.Operation):
175 
@@ -80,16 +78,12 @@
176                      # TODO option for async ?
177                      es.delete(**kwargs)
178                  except (ConnectionError, ProtocolError):
179                      log.debug('Failed to index in hook, could not connect to ES')
180                  continue
181 -            rql = fulltext_indexable_rql(entity.cw_etype,
182 -                                         entity._cw.vreg.schema,
183 -                                         eid=entity.eid)
184 -            indexable_entity = self.cnx.execute(rql).one()
185 -            serializer = indexable_entity.cw_adapt_to('IFullTextIndexSerializable')
186 -            json = serializer.serialize(complete=True)
187 +            serializer = entity.cw_adapt_to('IFullTextIndexSerializable')
188 +            json = serializer.serialize()
189              if not json:
190                  # if en entity has been already indexed, we still
191                  # keep the first indexation
192                  # which is wrong. We should remove the existing es entry.
193                  continue
diff --git a/test/test_elastic_search.py b/test/test_elastic_search.py
@@ -175,13 +175,12 @@
194 
195  class ElasticsearchTC(testlib.CubicWebTC):
196 
197      def test_1(self):
198          with self.admin_access.cnx() as cnx:
199 -            schema = cnx.vreg.schema
200              etype = 'Person'
201 -            rql = fulltext_indexable_rql(etype, schema)
202 +            rql = fulltext_indexable_rql(etype, cnx)
203              self.assertIn('age', rql)
204              self.assertNotIn('eid', rql)
205              self.assertEqual(rql.count('modification_date'), 1)
206 
207 
diff --git a/test/test_hooks.py b/test/test_hooks.py
@@ -3,19 +3,19 @@
208 
209  from elasticsearch_dsl import Search
210 
211  from cubicweb.devtools import testlib
212 
213 -from cubes.elasticsearch.testutils import RealESTestMixin, BlogFTIAdapter
214 +from cubes.elasticsearch.testutils import RealESTestMixin, BlogEntryFTIAdapter
215  from cubes.elasticsearch.search_helpers import compose_search
216 
217 
218  class ReindexOnRelationTests(RealESTestMixin, testlib.CubicWebTC):
219 
220      def test_es_hooks_modify_relation(self):
221          with self.admin_access.cnx() as cnx:
222 -            with self.temporary_appobjects(BlogFTIAdapter):
223 +            with self.temporary_appobjects(BlogEntryFTIAdapter):
224                  indexer = cnx.vreg['es'].select('indexer', cnx)
225                  indexer.create_index(custom_settings={
226                      'mappings': {
227                          'BlogEntry': {'_parent': {"type": "Blog"}},
228                      }
diff --git a/test/test_ifulltextadapter.py b/test/test_ifulltextadapter.py
@@ -0,0 +1,72 @@
229 +import unittest
230 +
231 +from mock import patch
232 +
233 +from cubicweb.devtools import testlib
234 +from cubicweb.cwconfig import CubicWebConfiguration
235 +
236 +from cubes.elasticsearch.testutils import BlogFTIAdapter, BlogEntryFTIAdapter
237 +
238 +
239 +class IFullTextIndexSerializableTC(testlib.CubicWebTC):
240 +
241 +    def setup_database(self):
242 +        super(IFullTextIndexSerializableTC, self).setup_database()
243 +        self.orig_config_for = CubicWebConfiguration.config_for
244 +        config_for = lambda appid: self.config  # noqa
245 +        CubicWebConfiguration.config_for = staticmethod(config_for)
246 +        self.config['elasticsearch-locations'] = 'http://nonexistant.elastic.search:9200'
247 +        self.config['index-name'] = 'unittest_index_name'
248 +
249 +    @patch('elasticsearch.client.indices.IndicesClient.create')
250 +    @patch('elasticsearch.client.indices.IndicesClient.exists')
251 +    @patch('elasticsearch.client.Elasticsearch.index')
252 +    def test_index_entity(self, create, exists, index):
253 +        """Only update indexable attributes while call entity.complete()
254 +           on IFullTextIndexSerializable.serialze()
255 +        """
256 +        with self.admin_access.repo_cnx() as cnx:
257 +            with self.temporary_appobjects(BlogFTIAdapter, BlogEntryFTIAdapter):
258 +                indexer = cnx.vreg['es'].select('indexer', cnx)
259 +                es = indexer.get_connection()
260 +                blog = cnx.create_entity('Blog', title=u'Blog')
261 +                cnx.commit()
262 +                self.assertTrue(es.index.called)
263 +                args, kwargs = es.index.call_args
264 +                # blog title is a in custom_indexable_attributes
265 +                self.assertEqual(kwargs['doc_type'], 'Blog')
266 +                self.assertEqual(kwargs['body']['title'], u'Blog')
267 +                index.reset_mock()
268 +                # create a BlogEntry
269 +                bentry = cnx.create_entity('BlogEntry', title=u'program',
270 +                                           content=u'Le nouveau programme',
271 +                                           entry_of=blog)
272 +                cnx.commit()
273 +                self.assertTrue(es.index.called)
274 +                args, kwargs = es.index.call_args
275 +                self.assertEqual(kwargs['doc_type'], 'BlogEntry')
276 +                for arg_name, expected_value in (
277 +                        ('content', u'Le nouveau programme'),
278 +                        ('cwuri', bentry.cwuri),
279 +                        ('title', 'program')):
280 +                    self.assertEqual(kwargs['body'][arg_name], expected_value)
281 +                self.assertFalse('content_format' in kwargs['body'])
282 +                # update BlogEntry
283 +                bentry.cw_set(title=u'Programme')
284 +                cnx.commit()
285 +                index.reset_mock()
286 +                self.assertTrue(es.index.called)
287 +                args, kwargs = es.index.call_args
288 +                for arg_name, expected_value in (
289 +                        ('id', bentry.eid), ('doc_type', bentry.cw_etype)):
290 +                    self.assertEqual(kwargs[arg_name], expected_value)
291 +                for arg_name, expected_value in (
292 +                        ('content', u'Le nouveau programme'),
293 +                        ('cwuri', bentry.cwuri),
294 +                        ('title', u'Programme')):
295 +                    self.assertEqual(kwargs['body'][arg_name], expected_value)
296 +                self.assertFalse('content_format' in kwargs['body'])
297 +
298 +
299 +if __name__ == '__main__':
300 +    unittest.main()
diff --git a/test/test_parents.py b/test/test_parents.py
@@ -7,18 +7,18 @@
301 
302  from cubicweb.devtools import testlib
303 
304  from cubes.elasticsearch.search_helpers import compose_search
305 
306 -from cubes.elasticsearch.testutils import RealESTestMixin, BlogFTIAdapter
307 +from cubes.elasticsearch.testutils import RealESTestMixin, BlogEntryFTIAdapter
308 
309 
310  class ParentsSearchTC(RealESTestMixin, testlib.CubicWebTC):
311 
312      def test_parent_search(self):
313          with self.admin_access.cnx() as cnx:
314 -            with self.temporary_appobjects(BlogFTIAdapter):
315 +            with self.temporary_appobjects(BlogEntryFTIAdapter):
316                  indexer = cnx.vreg['es'].select('indexer', cnx)
317                  indexer.create_index(custom_settings={
318                      'mappings': {
319                          'BlogEntry': {'_parent': {"type": "Blog"}},
320                      }
diff --git a/testutils.py b/testutils.py
@@ -3,23 +3,26 @@
321 
322  from elasticsearch_dsl.connections import connections
323 
324  from cubicweb.predicates import is_instance
325 
326 -from cubes.elasticsearch.es import CUSTOM_ATTRIBUTES
327  from cubes.elasticsearch.entities import IFullTextIndexSerializable
328 
329 
330 -CUSTOM_ATTRIBUTES['Blog'] = ('title',)
331 +class BlogEntryFTIAdapter(IFullTextIndexSerializable):
332 +    __select__ = (IFullTextIndexSerializable.__select__ &
333 +                  is_instance('BlogEntry'))
334 +    custom_indexable_attributes = ('title', 'content')
335 +
336 +    def update_parent_info(self, data, entity):
337 +        data['parent'] = entity.entry_of[0].eid
338 
339 
340  class BlogFTIAdapter(IFullTextIndexSerializable):
341      __select__ = (IFullTextIndexSerializable.__select__ &
342 -                  is_instance('BlogEntry'))
343 -
344 -    def update_parent_info(self, data, entity):
345 -        data['parent'] = entity.entry_of[0].eid
346 +                  is_instance('Blog'))
347 +    custom_indexable_attributes = ('title', )
348 
349 
350  class RealESTestMixin(object):
351 
352      @classmethod
obsoleted by