[IFullTextIndexSerializable] use fulltext_indexable_attributes attribute to update cw_attr_cache

  • get rid of CUSTOM_ATTRIBUTES; replace it with the custom_indexable_attributes attribute of the IFullTextIndexSerializable adapter;
  • IFullTextIndexSerializable.serialize method: the default value of `complete` is now True.

closes #17079905

author: Katia Saurfelt <katia.saurfelt@logilab.fr>
changeset: 4cb5b3c2a4b3
branch: default
phase: public
hidden: no
parent revision: #f10462760dac Added tag 0.5.2, debian/0.5.2-1, centos/0.5.2-1 for changeset 3f5b5b287d5f
child revision: #a9a0b699f3da attr indexation should not depend on its presence in cw_attr_cache
files modified by this revision
cubicweb_elasticsearch/ccplugin.py
cubicweb_elasticsearch/entities.py
cubicweb_elasticsearch/es.py
cubicweb_elasticsearch/hooks.py
cubicweb_elasticsearch/testutils.py
test/test_elastic_search.py
test/test_ifulltextadapter.py
# HG changeset patch
# User Katia Saurfelt <katia.saurfelt@logilab.fr>
# Date 1490017003 -3600
# Mon Mar 20 14:36:43 2017 +0100
# Node ID 4cb5b3c2a4b362f54ad2364a644d7ea1e4e92946
# Parent f10462760dac0376dd7ba36d28f4a7f33f868027
[IFullTextIndexSerializable] use fulltext_indexable_attributes attribute to update cw_attr_cache

* get rid of CUSTOM_ATTRIBUTES; replace it with the `custom_indexable_attributes`
  attribute of the IFullTextIndexSerializable adapter;
* IFullTextIndexSerializable.serialize method: the default value of `complete` is now True.


closes #17079905

diff --git a/cubicweb_elasticsearch/ccplugin.py b/cubicweb_elasticsearch/ccplugin.py
@@ -94,24 +94,23 @@
1 
2      def bulk_actions(self, etypes, cnx, index_name=None, dry_run=False):
3          if index_name is None:
4              index_name = cnx.vreg.config['index-name']
5          for etype in etypes:
6 -            rql = fulltext_indexable_rql(etype, cnx.vreg.schema)
7 +            rql = fulltext_indexable_rql(etype, cnx)
8              rset = cnx.execute(rql)
9              cnx.info(u'[{}] indexing {} {} entities'.format(index_name, len(rset), etype))
10              cnx.debug(u'RQL: {}'.format(rql))
11 
12              for entity in rset.entities():
13                  try:
14                      serializer = entity.cw_adapt_to('IFullTextIndexSerializable')
15 -                    json = serializer.serialize()
16 +                    json = serializer.serialize(complete=False)
17                  except Exception as e:
18                      cnx.error('[{}] Failed to serialize entity {} ({})'.format(
19                          index_name, entity.eid, etype))
20                      continue
21 -
22                  if not dry_run and json:
23                      # Entities with
24                      # fulltext_containers relations return their container
25                      # IFullTextIndex serializer , therefor the "id" and
26                      # "doc_type" in kwargs bellow must be container data.
diff --git a/cubicweb_elasticsearch/entities.py b/cubicweb_elasticsearch/entities.py
@@ -17,10 +17,12 @@
27 
28  """cubicweb-elasticsearch entity's classes"""
29 
30  import collections
31 
32 +from logilab.common.decorators import cachedproperty
33 +
34  from cubicweb import view, neg_role
35  from cubicweb.predicates import is_instance
36 
37  from cubicweb.appobject import AppObject
38 
@@ -86,29 +88,48 @@
39      directly serialized to e.g. JSON.
40      """
41 
42      __regid__ = 'IFullTextIndexSerializable'
43      __select__ = is_instance('Any')
44 +    custom_indexable_attributes = ()
45 +    skip_indexable_attributes = ()
46 
47 -    def serialize(self, complete=False):
48 +    @cachedproperty
49 +    def fulltext_indexable_attributes(self):
50 +        eschema = self._cw.vreg.schema[self.entity.cw_etype]
51 +        attrs = ['creation_date', 'modification_date', 'cwuri']
52 +        attrs.extend([r.type for r in eschema.indexable_attributes()
53 +                      if r.type not in self.skip_indexable_attributes])
54 +        for rschema, tschema in eschema.attribute_definitions():
55 +            if rschema.type == 'eid':
56 +                continue
57 +            # XXX
58 +            if tschema.type in ('Int', 'Float'):
59 +                attrs.append(rschema.type)
60 +        attrs.extend(self.custom_indexable_attributes)
61 +        return attrs
62 +
63 +    def serialize(self, complete=True):
64          entity = self.entity
65          if complete:
66              entity.complete()
67          data = {
68              'cw_etype': entity.cw_etype,
69              'eid': entity.eid,
70              'cwuri': entity.cwuri,
71          }
72 -        data.update(entity.cw_attr_cache)
73 +        for attr in self.fulltext_indexable_attributes:
74 +            if attr in entity.cw_attr_cache:
75 +                data[attr] = entity.cw_attr_cache[attr]
76          # TODO take a look at what's in entity.cw_relation_cache
77          return data
78 
79 
80  class File(IFullTextIndexSerializable):
81      __select__ = IFullTextIndexSerializable.__select__ & is_instance('File')
82 
83 -    def serialize(self, complete=False):
84 +    def serialize(self, complete=True):
85          """this could be a generic implementation of fulltext_containers indexation, but for
86 
87          now we can not return more than one parent json which is fine
88          for Files
89          """
diff --git a/cubicweb_elasticsearch/es.py b/cubicweb_elasticsearch/es.py
@@ -25,12 +25,10 @@
90 
91  INDEXABLE_TYPES = None
92 
93  # customization mechanism, in your cube, add your type as a key, and a list of
94  # additionnal attributes
95 -# eg. CUSTOM_ATTRIBUTES['BlogEntry'] = ('description',)
96 -CUSTOM_ATTRIBUTES = {}
97 
98  log = logging.getLogger(__name__)
99 
100 
101  def indexable_types(schema, custom_skip_list=None):
@@ -54,11 +52,11 @@
102                  indexable_types.append(eschema.type)
103      INDEXABLE_TYPES = indexable_types
104      return indexable_types
105 
106 
107 -def fulltext_indexable_rql(etype, schema, eid=None):
108 +def fulltext_indexable_rql(etype, cnx, eid=None):
109      '''
110      Generate RQL with fulltext_indexable attributes for a given entity type
111 
112      :eid:
113         defaults to None, set it to an eid to get RQL for a single element (used in hooks)
@@ -68,28 +66,16 @@
114      rql = ['WHERE %s is %s' % (V, etype)]
115      if eid:
116          rql.append('%s eid %i' % (V, eid))
117      var = next(varmaker)
118      selected = []
119 -    for rschema in schema.eschema(etype).indexable_attributes():
120 -        attr = rschema.type
121 +    cw_entity = cnx.vreg['etypes'].etype_class(etype)(cnx)
122 +    for attr in cw_entity.cw_adapt_to(
123 +            'IFullTextIndexSerializable').fulltext_indexable_attributes:
124          var = next(varmaker)
125          rql.append('%s %s %s' % (V, attr, var))
126          selected.append(var)
127 -    for rschema, tschema in schema.eschema(etype).attribute_definitions():
128 -        if rschema.type == 'eid':
129 -            continue
130 -        if tschema.type in ('Int', 'Float'):
131 -            attr = rschema.type
132 -            var = next(varmaker)
133 -            rql.append('%s %s %s' % (V, attr, var))
134 -            selected.append(var)
135 -    for attr in ('creation_date', 'modification_date', 'cwuri') + CUSTOM_ATTRIBUTES.get(etype, ()):
136 -        var = next(varmaker)
137 -        rql.append('%s %s %s' % (V, attr, var))
138 -        selected.append(var)
139 -    # TODO inlined relations ?
140      return 'Any %s,%s %s' % (V, ','.join(selected),
141                               ','.join(rql))
142 
143 
144  def create_index(es, index_name, settings=None):
diff --git a/cubicweb_elasticsearch/hooks.py b/cubicweb_elasticsearch/hooks.py
@@ -22,18 +22,19 @@
145  from elasticsearch.exceptions import ConnectionError, NotFoundError
146  from urllib3.exceptions import ProtocolError
147 
148  from cubicweb.server import hook
149  from cubicweb.predicates import score_entity
150 -from cubicweb_elasticsearch.es import indexable_types, fulltext_indexable_rql, CUSTOM_ATTRIBUTES
151 +
152 +from cubicweb_elasticsearch.es import indexable_types
153 
154  log = logging.getLogger(__name__)
155 
156 
157  def entity_indexable(entity):
158      return entity.cw_etype in indexable_types(entity._cw.vreg.schema) or \
159 -        entity.cw_etype in CUSTOM_ATTRIBUTES
160 +        entity.cw_adapt_to('IFullTextIndexSerializable').custom_indexable_attributes
161 
162 
163  class ContentUpdateIndexES(hook.Hook):
164      """detect content change and updates ES indexing"""
165 
@@ -55,13 +56,11 @@
166 
167      def __call__(self):
168          # XXX add a selector for object and subject
169          for entity in (self._cw.entity_from_eid(self.eidfrom),
170                         self._cw.entity_from_eid(self.eidto)):
171 -            cw_etype = entity.cw_etype
172 -            if (cw_etype in indexable_types(entity._cw.vreg.schema) or
173 -                    cw_etype in CUSTOM_ATTRIBUTES):
174 +            if entity_indexable(entity):
175                  IndexEsOperation.get_instance(self._cw).add_data(entity)
176 
177 
178  class IndexEsOperation(hook.DataOperationMixIn, hook.Operation):
179 
@@ -86,16 +85,12 @@
180                            id=entity.eid,
181                            doc_type=entity.cw_etype)
182              if self.cnx.deleted_in_transaction(entity.eid):
183                  self.delete_doc(es, **kwargs)
184                  continue
185 -            rql = fulltext_indexable_rql(entity.cw_etype,
186 -                                         entity._cw.vreg.schema,
187 -                                         eid=entity.eid)
188 -            indexable_entity = self.cnx.execute(rql).one()
189 -            serializer = indexable_entity.cw_adapt_to('IFullTextIndexSerializable')
190 -            json = serializer.serialize(complete=True)
191 +            serializer = entity.cw_adapt_to('IFullTextIndexSerializable')
192 +            json = serializer.serialize()
193              if not json:
194                  # if en entity has been already indexed, we still
195                  # keep the first indexation
196                  # which is wrong. We should remove the existing es entry.
197                  continue
diff --git a/cubicweb_elasticsearch/testutils.py b/cubicweb_elasticsearch/testutils.py
@@ -1,14 +1,25 @@
198  import unittest
199  import httplib
200 
201  from elasticsearch_dsl.connections import connections
202 
203 -from cubicweb_elasticsearch.es import CUSTOM_ATTRIBUTES
204 +from cubicweb.predicates import is_instance
205 +
206 +from cubicweb_elasticsearch.entities import IFullTextIndexSerializable
207 
208 
209 -CUSTOM_ATTRIBUTES['Blog'] = ('title',)
210 +class BlogEntryFTIAdapter(IFullTextIndexSerializable):
211 +    __select__ = (IFullTextIndexSerializable.__select__ &
212 +                  is_instance('BlogEntry'))
213 +    custom_indexable_attributes = ('title', 'content')
214 +
215 +
216 +class BlogFTIAdapter(IFullTextIndexSerializable):
217 +    __select__ = (IFullTextIndexSerializable.__select__ &
218 +                  is_instance('Blog'))
219 +    custom_indexable_attributes = ('title', )
220 
221 
222  class RealESTestMixin(object):
223 
224      @classmethod
diff --git a/test/test_elastic_search.py b/test/test_elastic_search.py
@@ -11,11 +11,11 @@
225 
226  from cubicweb.devtools import testlib
227  from cubicweb.cwconfig import CubicWebConfiguration
228  from cubicweb_elasticsearch import ccplugin
229  from cubicweb_elasticsearch.es import (indexable_types,
230 -                                    fulltext_indexable_rql)
231 +                                       fulltext_indexable_rql)
232 
233 
234  # TODO - find a way to configure ElasticSearch as non threaded while running tests
235  # so that the traces show the full stack, not just starting from connection.http_*
236  class ExportElasticSearchTC(testlib.AutoPopulateTest):
@@ -165,13 +165,12 @@
237 
238  class ElasticsearchTC(testlib.CubicWebTC):
239 
240      def test_1(self):
241          with self.admin_access.cnx() as cnx:
242 -            schema = cnx.vreg.schema
243              etype = 'Person'
244 -            rql = fulltext_indexable_rql(etype, schema)
245 +            rql = fulltext_indexable_rql(etype, cnx)
246              self.assertIn('age', rql)
247              self.assertNotIn('eid', rql)
248              self.assertEqual(rql.count('modification_date'), 1)
249 
250 
diff --git a/test/test_ifulltextadapter.py b/test/test_ifulltextadapter.py
@@ -0,0 +1,72 @@
251 +import unittest
252 +
253 +from mock import patch
254 +
255 +from cubicweb.devtools import testlib
256 +from cubicweb.cwconfig import CubicWebConfiguration
257 +
258 +from cubes.elasticsearch.testutils import BlogFTIAdapter, BlogEntryFTIAdapter
259 +
260 +
261 +class IFullTextIndexSerializableTC(testlib.CubicWebTC):
262 +
263 +    def setup_database(self):
264 +        super(IFullTextIndexSerializableTC, self).setup_database()
265 +        self.orig_config_for = CubicWebConfiguration.config_for
266 +        config_for = lambda appid: self.config  # noqa
267 +        CubicWebConfiguration.config_for = staticmethod(config_for)
268 +        self.config['elasticsearch-locations'] = 'http://nonexistant.elastic.search:9200'
269 +        self.config['index-name'] = 'unittest_index_name'
270 +
271 +    @patch('elasticsearch.client.indices.IndicesClient.create')
272 +    @patch('elasticsearch.client.indices.IndicesClient.exists')
273 +    @patch('elasticsearch.client.Elasticsearch.index')
274 +    def test_index_entity(self, create, exists, index):
275 +        """Only update indexable attributes while call entity.complete()
276 +           on IFullTextIndexSerializable.serialze()
277 +        """
278 +        with self.admin_access.repo_cnx() as cnx:
279 +            with self.temporary_appobjects(BlogFTIAdapter, BlogEntryFTIAdapter):
280 +                indexer = cnx.vreg['es'].select('indexer', cnx)
281 +                es = indexer.get_connection()
282 +                blog = cnx.create_entity('Blog', title=u'Blog')
283 +                cnx.commit()
284 +                self.assertTrue(es.index.called)
285 +                args, kwargs = es.index.call_args
286 +                # blog title is a in custom_indexable_attributes
287 +                self.assertEqual(kwargs['doc_type'], 'Blog')
288 +                self.assertEqual(kwargs['body']['title'], u'Blog')
289 +                index.reset_mock()
290 +                # create a BlogEntry
291 +                bentry = cnx.create_entity('BlogEntry', title=u'program',
292 +                                           content=u'Le nouveau programme',
293 +                                           entry_of=blog)
294 +                cnx.commit()
295 +                self.assertTrue(es.index.called)
296 +                args, kwargs = es.index.call_args
297 +                self.assertEqual(kwargs['doc_type'], 'BlogEntry')
298 +                for arg_name, expected_value in (
299 +                        ('content', u'Le nouveau programme'),
300 +                        ('cwuri', bentry.cwuri),
301 +                        ('title', 'program')):
302 +                    self.assertEqual(kwargs['body'][arg_name], expected_value)
303 +                self.assertFalse('content_format' in kwargs['body'])
304 +                # update BlogEntry
305 +                bentry.cw_set(title=u'Programme')
306 +                cnx.commit()
307 +                index.reset_mock()
308 +                self.assertTrue(es.index.called)
309 +                args, kwargs = es.index.call_args
310 +                for arg_name, expected_value in (
311 +                        ('id', bentry.eid), ('doc_type', bentry.cw_etype)):
312 +                    self.assertEqual(kwargs[arg_name], expected_value)
313 +                for arg_name, expected_value in (
314 +                        ('content', u'Le nouveau programme'),
315 +                        ('cwuri', bentry.cwuri),
316 +                        ('title', u'Programme')):
317 +                    self.assertEqual(kwargs['body'][arg_name], expected_value)
318 +                self.assertFalse('content_format' in kwargs['body'])
319 +
320 +
321 +if __name__ == '__main__':
322 +    unittest.main()
obsoletes