[IFullTextIndexSerializable] use fulltext_indexable_attributes attribute to update cw_attr_cache

  • get rid of CUSTOM_ATTRIBUTES; replace it with the custom_indexable_attributes attribute of the IFullTextIndexSerializable adapter;
  • IFullTextIndexSerializable.serialize method: the default value of complete is now True.

closes #15473085

author: Katia Saurfelt <katia.saurfelt@logilab.fr>
changeset: 8d5b7639bf17
branch: default
phase: draft
hidden: yes
parent revision: #3eca03ec5f5b [pkg] add symlink to __pkginfo__ to please apycot/vcxxx
child revision: <not specified>
files modified by this revision
cubicweb_elasticsearch/ccplugin.py
cubicweb_elasticsearch/entities.py
cubicweb_elasticsearch/es.py
cubicweb_elasticsearch/hooks.py
cubicweb_elasticsearch/testutils.py
test/test_elastic_search.py
test/test_ifulltextadapter.py
# HG changeset patch
# User Katia Saurfelt <katia.saurfelt@logilab.fr>
# Date 1490017003 -3600
# Mon Mar 20 14:36:43 2017 +0100
# Node ID 8d5b7639bf17d69d371221e74c3ce688e294c397
# Parent 3eca03ec5f5b812312ca746ebc307f9cdd3a7327
[IFullTextIndexSerializable] use fulltext_indexable_attributes attribute to update cw_attr_cache

* get rid of CUSTOM_ATTRIBUTES; replace it with the `custom_indexable_attributes` attribute of the
IFullTextIndexSerializable adapter;
* IFullTextIndexSerializable.serialize method: the default value of `complete` is now True.


closes #15473085

diff --git a/cubicweb_elasticsearch/ccplugin.py b/cubicweb_elasticsearch/ccplugin.py
@@ -94,18 +94,18 @@
1 
2      def bulk_actions(self, etypes, cnx, index_name=None, dry_run=False):
3          if index_name is None:
4              index_name = cnx.vreg.config['index-name']
5          for etype in etypes:
6 -            rql = fulltext_indexable_rql(etype, cnx.vreg.schema)
7 +            rql = fulltext_indexable_rql(etype, cnx)
8              rset = cnx.execute(rql)
9              cnx.info(u'[{}] indexing {} {} entities'.format(index_name, len(rset), etype))
10              cnx.debug(u'RQL: {}'.format(rql))
11 
12              for entity in rset.entities():
13                  serializer = entity.cw_adapt_to('IFullTextIndexSerializable')
14 -                json = serializer.serialize()
15 +                json = serializer.serialize(complete=False)
16                  if not dry_run and json:
17                      # Entities with
18                      # fulltext_containers relations return their container
19                      # IFullTextIndex serializer , therefor the "id" and
20                      # "doc_type" in kwargs bellow must be container data.
diff --git a/cubicweb_elasticsearch/entities.py b/cubicweb_elasticsearch/entities.py
@@ -17,10 +17,12 @@
21 
22  """cubicweb-elasticsearch entity's classes"""
23 
24  import collections
25 
26 +from logilab.common.decorators import cachedproperty
27 +
28  from cubicweb import view, neg_role
29  from cubicweb.predicates import is_instance
30 
31  from cubicweb.appobject import AppObject
32 
@@ -86,29 +88,49 @@
33      directly serialized to e.g. JSON.
34      """
35 
36      __regid__ = 'IFullTextIndexSerializable'
37      __select__ = is_instance('Any')
38 +    custom_indexable_attributes = ()
39 +    skip_indexable_attributes = ()
40 
41 -    def serialize(self, complete=False):
42 +    @cachedproperty
43 +    def fulltext_indexable_attributes(self):
44 +        eschema = self._cw.vreg.schema[self.entity.cw_etype]
45 +        attrs = ['creation_date', 'modification_date', 'cwuri']
46 +        attrs.extend([r.type for r in eschema.indexable_attributes()
47 +                      if r.type not in self.skip_indexable_attributes])
48 +        for rschema, tschema in eschema.attribute_definitions():
49 +            if rschema.type == 'eid':
50 +                continue
51 +            # XXX
52 +            if tschema.type in ('Int', 'Float'):
53 +                attrs.append(rschema.type)
54 +        attrs.extend(self.custom_indexable_attributes)
55 +        return attrs
56 +
57 +    def serialize(self, complete=True):
58          entity = self.entity
59          if complete:
60              entity.complete()
61          data = {
62              'cw_etype': entity.cw_etype,
63              'eid': entity.eid,
64              'cwuri': entity.cwuri,
65          }
66 -        data.update(entity.cw_attr_cache)
67 +        for attr in self.fulltext_indexable_attributes:
68 +            if attr in entity.cw_attr_cache:
69 +                data[attr] = entity.cw_attr_cache[attr]
70 +        self.update_parent_info(data, entity)
71          # TODO take a look at what's in entity.cw_relation_cache
72          return data
73 
74 
75  class File(IFullTextIndexSerializable):
76      __select__ = IFullTextIndexSerializable.__select__ & is_instance('File')
77 
78 -    def serialize(self, complete=False):
79 +    def serialize(self, complete=True):
80          """this could be a generic implementation of fulltext_containers indexation, but for
81 
82          now we can not return more than one parent json which is fine
83          for Files
84          """
diff --git a/cubicweb_elasticsearch/es.py b/cubicweb_elasticsearch/es.py
@@ -25,12 +25,10 @@
85 
86  INDEXABLE_TYPES = None
87 
88  # customization mechanism, in your cube, add your type as a key, and a list of
89  # additionnal attributes
90 -# eg. CUSTOM_ATTRIBUTES['BlogEntry'] = ('description',)
91 -CUSTOM_ATTRIBUTES = {}
92 
93  log = logging.getLogger(__name__)
94 
95 
96  def indexable_types(schema, custom_skip_list=None):
@@ -54,11 +52,11 @@
97                  indexable_types.append(eschema.type)
98      INDEXABLE_TYPES = indexable_types
99      return indexable_types
100 
101 
102 -def fulltext_indexable_rql(etype, schema, eid=None):
103 +def fulltext_indexable_rql(etype, cnx, eid=None):
104      '''
105      Generate RQL with fulltext_indexable attributes for a given entity type
106 
107      :eid:
108         defaults to None, set it to an eid to get RQL for a single element (used in hooks)
@@ -68,28 +66,16 @@
109      rql = ['WHERE %s is %s' % (V, etype)]
110      if eid:
111          rql.append('%s eid %i' % (V, eid))
112      var = next(varmaker)
113      selected = []
114 -    for rschema in schema.eschema(etype).indexable_attributes():
115 -        attr = rschema.type
116 +    cw_entity = cnx.vreg['etypes'].etype_class(etype)(cnx)
117 +    for attr in cw_entity.cw_adapt_to(
118 +            'IFullTextIndexSerializable').fulltext_indexable_attributes:
119          var = next(varmaker)
120          rql.append('%s %s %s' % (V, attr, var))
121          selected.append(var)
122 -    for rschema, tschema in schema.eschema(etype).attribute_definitions():
123 -        if rschema.type == 'eid':
124 -            continue
125 -        if tschema.type in ('Int', 'Float'):
126 -            attr = rschema.type
127 -            var = next(varmaker)
128 -            rql.append('%s %s %s' % (V, attr, var))
129 -            selected.append(var)
130 -    for attr in ('creation_date', 'modification_date', 'cwuri') + CUSTOM_ATTRIBUTES.get(etype, ()):
131 -        var = next(varmaker)
132 -        rql.append('%s %s %s' % (V, attr, var))
133 -        selected.append(var)
134 -    # TODO inlined relations ?
135      return 'Any %s,%s %s' % (V, ','.join(selected),
136                               ','.join(rql))
137 
138 
139  def create_index(es, index_name, settings=None):
diff --git a/cubicweb_elasticsearch/hooks.py b/cubicweb_elasticsearch/hooks.py
@@ -22,18 +22,19 @@
140  from elasticsearch.exceptions import ConnectionError, NotFoundError
141  from urllib3.exceptions import ProtocolError
142 
143  from cubicweb.server import hook
144  from cubicweb.predicates import score_entity
145 -from cubicweb_elasticsearch.es import indexable_types, fulltext_indexable_rql, CUSTOM_ATTRIBUTES
146 +
147 +from cubicweb_elasticsearch.es import indexable_types
148 
149  log = logging.getLogger(__name__)
150 
151 
152  def entity_indexable(entity):
153      return entity.cw_etype in indexable_types(entity._cw.vreg.schema) or \
154 -        entity.cw_etype in CUSTOM_ATTRIBUTES
155 +        entity.cw_adapt_to('IFullTextIndexSerializable').custom_indexable_attributes
156 
157 
158  class ContentUpdateIndexES(hook.Hook):
159      """detect content change and updates ES indexing"""
160 
@@ -55,13 +56,11 @@
161 
162      def __call__(self):
163          # XXX add a selector for object and subject
164          for entity in (self._cw.entity_from_eid(self.eidfrom),
165                         self._cw.entity_from_eid(self.eidto)):
166 -            cw_etype = entity.cw_etype
167 -            if (cw_etype in indexable_types(entity._cw.vreg.schema) or
168 -                    cw_etype in CUSTOM_ATTRIBUTES):
169 +            if entity_indexable(entity):
170                  IndexEsOperation.get_instance(self._cw).add_data(entity)
171 
172 
173  class IndexEsOperation(hook.DataOperationMixIn, hook.Operation):
174 
@@ -86,16 +85,12 @@
175                            id=entity.eid,
176                            doc_type=entity.cw_etype)
177              if self.cnx.deleted_in_transaction(entity.eid):
178                  self.delete_doc(es, **kwargs)
179                  continue
180 -            rql = fulltext_indexable_rql(entity.cw_etype,
181 -                                         entity._cw.vreg.schema,
182 -                                         eid=entity.eid)
183 -            indexable_entity = self.cnx.execute(rql).one()
184 -            serializer = indexable_entity.cw_adapt_to('IFullTextIndexSerializable')
185 -            json = serializer.serialize(complete=True)
186 +            serializer = entity.cw_adapt_to('IFullTextIndexSerializable')
187 +            json = serializer.serialize()
188              if not json:
189                  # if en entity has been already indexed, we still
190                  # keep the first indexation
191                  # which is wrong. We should remove the existing es entry.
192                  continue
diff --git a/cubicweb_elasticsearch/testutils.py b/cubicweb_elasticsearch/testutils.py
@@ -1,14 +1,25 @@
193  import unittest
194  import httplib
195 
196  from elasticsearch_dsl.connections import connections
197 
198 -from cubicweb_elasticsearch.es import CUSTOM_ATTRIBUTES
199 +from cubicweb.predicates import is_instance
200 +
201 +from cubicweb_elasticsearch.entities import IFullTextIndexSerializable
202 
203 
204 -CUSTOM_ATTRIBUTES['Blog'] = ('title',)
205 +class BlogEntryFTIAdapter(IFullTextIndexSerializable):
206 +    __select__ = (IFullTextIndexSerializable.__select__ &
207 +                  is_instance('BlogEntry'))
208 +    custom_indexable_attributes = ('title', 'content')
209 +
210 +
211 +class BlogFTIAdapter(IFullTextIndexSerializable):
212 +    __select__ = (IFullTextIndexSerializable.__select__ &
213 +                  is_instance('Blog'))
214 +    custom_indexable_attributes = ('title', )
215 
216 
217  class RealESTestMixin(object):
218 
219      @classmethod
diff --git a/test/test_elastic_search.py b/test/test_elastic_search.py
@@ -165,13 +165,12 @@
220 
221  class ElasticsearchTC(testlib.CubicWebTC):
222 
223      def test_1(self):
224          with self.admin_access.cnx() as cnx:
225 -            schema = cnx.vreg.schema
226              etype = 'Person'
227 -            rql = fulltext_indexable_rql(etype, schema)
228 +            rql = fulltext_indexable_rql(etype, cnx)
229              self.assertIn('age', rql)
230              self.assertNotIn('eid', rql)
231              self.assertEqual(rql.count('modification_date'), 1)
232 
233 
diff --git a/test/test_ifulltextadapter.py b/test/test_ifulltextadapter.py
@@ -0,0 +1,72 @@
234 +import unittest
235 +
236 +from mock import patch
237 +
238 +from cubicweb.devtools import testlib
239 +from cubicweb.cwconfig import CubicWebConfiguration
240 +
241 +from cubes.elasticsearch.testutils import BlogFTIAdapter, BlogEntryFTIAdapter
242 +
243 +
244 +class IFullTextIndexSerializableTC(testlib.CubicWebTC):
245 +
246 +    def setup_database(self):
247 +        super(IFullTextIndexSerializableTC, self).setup_database()
248 +        self.orig_config_for = CubicWebConfiguration.config_for
249 +        config_for = lambda appid: self.config  # noqa
250 +        CubicWebConfiguration.config_for = staticmethod(config_for)
251 +        self.config['elasticsearch-locations'] = 'http://nonexistant.elastic.search:9200'
252 +        self.config['index-name'] = 'unittest_index_name'
253 +
254 +    @patch('elasticsearch.client.indices.IndicesClient.create')
255 +    @patch('elasticsearch.client.indices.IndicesClient.exists')
256 +    @patch('elasticsearch.client.Elasticsearch.index')
257 +    def test_index_entity(self, create, exists, index):
258 +        """Only indexable attributes are serialized, even though
259 +           IFullTextIndexSerializable.serialize() calls entity.complete()
260 +        """
261 +        with self.admin_access.repo_cnx() as cnx:
262 +            with self.temporary_appobjects(BlogFTIAdapter, BlogEntryFTIAdapter):
263 +                indexer = cnx.vreg['es'].select('indexer', cnx)
264 +                es = indexer.get_connection()
265 +                blog = cnx.create_entity('Blog', title=u'Blog')
266 +                cnx.commit()
267 +                self.assertTrue(es.index.called)
268 +                args, kwargs = es.index.call_args
269 +                # blog title is in custom_indexable_attributes
270 +                self.assertEqual(kwargs['doc_type'], 'Blog')
271 +                self.assertEqual(kwargs['body']['title'], u'Blog')
272 +                index.reset_mock()
273 +                # create a BlogEntry
274 +                bentry = cnx.create_entity('BlogEntry', title=u'program',
275 +                                           content=u'Le nouveau programme',
276 +                                           entry_of=blog)
277 +                cnx.commit()
278 +                self.assertTrue(es.index.called)
279 +                args, kwargs = es.index.call_args
280 +                self.assertEqual(kwargs['doc_type'], 'BlogEntry')
281 +                for arg_name, expected_value in (
282 +                        ('content', u'Le nouveau programme'),
283 +                        ('cwuri', bentry.cwuri),
284 +                        ('title', 'program')):
285 +                    self.assertEqual(kwargs['body'][arg_name], expected_value)
286 +                self.assertFalse('content_format' in kwargs['body'])
287 +                # update BlogEntry
288 +                bentry.cw_set(title=u'Programme')
289 +                cnx.commit()
290 +                index.reset_mock()
291 +                self.assertTrue(es.index.called)
292 +                args, kwargs = es.index.call_args
293 +                for arg_name, expected_value in (
294 +                        ('id', bentry.eid), ('doc_type', bentry.cw_etype)):
295 +                    self.assertEqual(kwargs[arg_name], expected_value)
296 +                for arg_name, expected_value in (
297 +                        ('content', u'Le nouveau programme'),
298 +                        ('cwuri', bentry.cwuri),
299 +                        ('title', u'Programme')):
300 +                    self.assertEqual(kwargs['body'][arg_name], expected_value)
301 +                self.assertFalse('content_format' in kwargs['body'])
302 +
303 +
304 +if __name__ == '__main__':
305 +    unittest.main()