[IFullTextIndexSerializable] use fulltext_indexable_attributes attribute to update cw_attr_cache

  • get rid of CUSTOM_ATTRIBUTES; replace it with the custom_indexable_attributes attribute of the IFullTextIndexSerializable adapter;
  • IFullTextIndexSerializable.serialize method: the default value of `complete` is now True.

closes #15473085

author: Katia Saurfelt <katia.saurfelt@logilab.fr>
changeset: a465024a09f2
branch: default
phase: draft
hidden: yes
parent revision: #f10462760dac Added tag 0.5.2, debian/0.5.2-1, centos/0.5.2-1 for changeset 3f5b5b287d5f
child revision: <not specified>
files modified by this revision
cubicweb_elasticsearch/ccplugin.py
cubicweb_elasticsearch/entities.py
cubicweb_elasticsearch/es.py
cubicweb_elasticsearch/hooks.py
cubicweb_elasticsearch/testutils.py
test/test_elastic_search.py
test/test_ifulltextadapter.py
# HG changeset patch
# User Katia Saurfelt <katia.saurfelt@logilab.fr>
# Date 1490017003 -3600
# Mon Mar 20 14:36:43 2017 +0100
# Node ID a465024a09f2ff342a1c89698dda8fd1f5cc0722
# Parent f10462760dac0376dd7ba36d28f4a7f33f868027
[IFullTextIndexSerializable] use fulltext_indexable_attributes attribute to update cw_attr_cache

* get rid of CUSTOM_ATTRIBUTES; replace it with the `custom_indexable_attributes`
attribute of the IFullTextIndexSerializable adapter;
* IFullTextIndexSerializable.serialize method: the default value of `complete` is now True.


closes #15473085

diff --git a/cubicweb_elasticsearch/ccplugin.py b/cubicweb_elasticsearch/ccplugin.py
@@ -94,24 +94,23 @@
1 
2      def bulk_actions(self, etypes, cnx, index_name=None, dry_run=False):
3          if index_name is None:
4              index_name = cnx.vreg.config['index-name']
5          for etype in etypes:
6 -            rql = fulltext_indexable_rql(etype, cnx.vreg.schema)
7 +            rql = fulltext_indexable_rql(etype, cnx)
8              rset = cnx.execute(rql)
9              cnx.info(u'[{}] indexing {} {} entities'.format(index_name, len(rset), etype))
10              cnx.debug(u'RQL: {}'.format(rql))
11 
12              for entity in rset.entities():
13                  try:
14                      serializer = entity.cw_adapt_to('IFullTextIndexSerializable')
15 -                    json = serializer.serialize()
16 +                    json = serializer.serialize(complete=False)
17                  except Exception as e:
18                      cnx.error('[{}] Failed to serialize entity {} ({})'.format(
19                          index_name, entity.eid, etype))
20                      continue
21 -
22                  if not dry_run and json:
23                      # Entities with
24                      # fulltext_containers relations return their container
25                      # IFullTextIndex serializer , therefor the "id" and
26                      # "doc_type" in kwargs bellow must be container data.
diff --git a/cubicweb_elasticsearch/entities.py b/cubicweb_elasticsearch/entities.py
@@ -17,10 +17,12 @@
27 
28  """cubicweb-elasticsearch entity's classes"""
29 
30  import collections
31 
32 +from logilab.common.decorators import cachedproperty
33 +
34  from cubicweb import view, neg_role
35  from cubicweb.predicates import is_instance
36 
37  from cubicweb.appobject import AppObject
38 
@@ -86,29 +88,49 @@
39      directly serialized to e.g. JSON.
40      """
41 
42      __regid__ = 'IFullTextIndexSerializable'
43      __select__ = is_instance('Any')
44 +    custom_indexable_attributes = ()
45 +    skip_indexable_attributes = ()
46 
47 -    def serialize(self, complete=False):
48 +    @cachedproperty
49 +    def fulltext_indexable_attributes(self):
50 +        eschema = self._cw.vreg.schema[self.entity.cw_etype]
51 +        attrs = ['creation_date', 'modification_date', 'cwuri']
52 +        attrs.extend([r.type for r in eschema.indexable_attributes()
53 +                      if r.type not in self.skip_indexable_attributes])
54 +        for rschema, tschema in eschema.attribute_definitions():
55 +            if rschema.type == 'eid':
56 +                continue
57 +            # XXX
58 +            if tschema.type in ('Int', 'Float'):
59 +                attrs.append(rschema.type)
60 +        attrs.extend(self.custom_indexable_attributes)
61 +        return attrs
62 +
63 +    def serialize(self, complete=True):
64          entity = self.entity
65          if complete:
66              entity.complete()
67          data = {
68              'cw_etype': entity.cw_etype,
69              'eid': entity.eid,
70              'cwuri': entity.cwuri,
71          }
72 -        data.update(entity.cw_attr_cache)
73 +        for attr in self.fulltext_indexable_attributes:
74 +            if attr in entity.cw_attr_cache:
75 +                data[attr] = entity.cw_attr_cache[attr]
76 +        self.update_parent_info(data, entity)
77          # TODO take a look at what's in entity.cw_relation_cache
78          return data
79 
80 
81  class File(IFullTextIndexSerializable):
82      __select__ = IFullTextIndexSerializable.__select__ & is_instance('File')
83 
84 -    def serialize(self, complete=False):
85 +    def serialize(self, complete=True):
86          """this could be a generic implementation of fulltext_containers indexation, but for
87 
88          now we can not return more than one parent json which is fine
89          for Files
90          """
diff --git a/cubicweb_elasticsearch/es.py b/cubicweb_elasticsearch/es.py
@@ -25,12 +25,10 @@
91 
92  INDEXABLE_TYPES = None
93 
94  # customization mechanism, in your cube, add your type as a key, and a list of
95  # additionnal attributes
96 -# eg. CUSTOM_ATTRIBUTES['BlogEntry'] = ('description',)
97 -CUSTOM_ATTRIBUTES = {}
98 
99  log = logging.getLogger(__name__)
100 
101 
102  def indexable_types(schema, custom_skip_list=None):
@@ -54,11 +52,11 @@
103                  indexable_types.append(eschema.type)
104      INDEXABLE_TYPES = indexable_types
105      return indexable_types
106 
107 
108 -def fulltext_indexable_rql(etype, schema, eid=None):
109 +def fulltext_indexable_rql(etype, cnx, eid=None):
110      '''
111      Generate RQL with fulltext_indexable attributes for a given entity type
112 
113      :eid:
114         defaults to None, set it to an eid to get RQL for a single element (used in hooks)
@@ -68,28 +66,16 @@
115      rql = ['WHERE %s is %s' % (V, etype)]
116      if eid:
117          rql.append('%s eid %i' % (V, eid))
118      var = next(varmaker)
119      selected = []
120 -    for rschema in schema.eschema(etype).indexable_attributes():
121 -        attr = rschema.type
122 +    cw_entity = cnx.vreg['etypes'].etype_class(etype)(cnx)
123 +    for attr in cw_entity.cw_adapt_to(
124 +            'IFullTextIndexSerializable').fulltext_indexable_attributes:
125          var = next(varmaker)
126          rql.append('%s %s %s' % (V, attr, var))
127          selected.append(var)
128 -    for rschema, tschema in schema.eschema(etype).attribute_definitions():
129 -        if rschema.type == 'eid':
130 -            continue
131 -        if tschema.type in ('Int', 'Float'):
132 -            attr = rschema.type
133 -            var = next(varmaker)
134 -            rql.append('%s %s %s' % (V, attr, var))
135 -            selected.append(var)
136 -    for attr in ('creation_date', 'modification_date', 'cwuri') + CUSTOM_ATTRIBUTES.get(etype, ()):
137 -        var = next(varmaker)
138 -        rql.append('%s %s %s' % (V, attr, var))
139 -        selected.append(var)
140 -    # TODO inlined relations ?
141      return 'Any %s,%s %s' % (V, ','.join(selected),
142                               ','.join(rql))
143 
144 
145  def create_index(es, index_name, settings=None):
diff --git a/cubicweb_elasticsearch/hooks.py b/cubicweb_elasticsearch/hooks.py
@@ -22,18 +22,19 @@
146  from elasticsearch.exceptions import ConnectionError, NotFoundError
147  from urllib3.exceptions import ProtocolError
148 
149  from cubicweb.server import hook
150  from cubicweb.predicates import score_entity
151 -from cubicweb_elasticsearch.es import indexable_types, fulltext_indexable_rql, CUSTOM_ATTRIBUTES
152 +
153 +from cubicweb_elasticsearch.es import indexable_types
154 
155  log = logging.getLogger(__name__)
156 
157 
158  def entity_indexable(entity):
159      return entity.cw_etype in indexable_types(entity._cw.vreg.schema) or \
160 -        entity.cw_etype in CUSTOM_ATTRIBUTES
161 +        entity.cw_adapt_to('IFullTextIndexSerializable').custom_indexable_attributes
162 
163 
164  class ContentUpdateIndexES(hook.Hook):
165      """detect content change and updates ES indexing"""
166 
@@ -55,13 +56,11 @@
167 
168      def __call__(self):
169          # XXX add a selector for object and subject
170          for entity in (self._cw.entity_from_eid(self.eidfrom),
171                         self._cw.entity_from_eid(self.eidto)):
172 -            cw_etype = entity.cw_etype
173 -            if (cw_etype in indexable_types(entity._cw.vreg.schema) or
174 -                    cw_etype in CUSTOM_ATTRIBUTES):
175 +            if entity_indexable(entity):
176                  IndexEsOperation.get_instance(self._cw).add_data(entity)
177 
178 
179  class IndexEsOperation(hook.DataOperationMixIn, hook.Operation):
180 
@@ -86,16 +85,12 @@
181                            id=entity.eid,
182                            doc_type=entity.cw_etype)
183              if self.cnx.deleted_in_transaction(entity.eid):
184                  self.delete_doc(es, **kwargs)
185                  continue
186 -            rql = fulltext_indexable_rql(entity.cw_etype,
187 -                                         entity._cw.vreg.schema,
188 -                                         eid=entity.eid)
189 -            indexable_entity = self.cnx.execute(rql).one()
190 -            serializer = indexable_entity.cw_adapt_to('IFullTextIndexSerializable')
191 -            json = serializer.serialize(complete=True)
192 +            serializer = entity.cw_adapt_to('IFullTextIndexSerializable')
193 +            json = serializer.serialize()
194              if not json:
195                  # if en entity has been already indexed, we still
196                  # keep the first indexation
197                  # which is wrong. We should remove the existing es entry.
198                  continue
diff --git a/cubicweb_elasticsearch/testutils.py b/cubicweb_elasticsearch/testutils.py
@@ -1,14 +1,25 @@
199  import unittest
200  import httplib
201 
202  from elasticsearch_dsl.connections import connections
203 
204 -from cubicweb_elasticsearch.es import CUSTOM_ATTRIBUTES
205 +from cubicweb.predicates import is_instance
206 +
207 +from cubicweb_elasticsearch.entities import IFullTextIndexSerializable
208 
209 
210 -CUSTOM_ATTRIBUTES['Blog'] = ('title',)
211 +class BlogEntryFTIAdapter(IFullTextIndexSerializable):
212 +    __select__ = (IFullTextIndexSerializable.__select__ &
213 +                  is_instance('BlogEntry'))
214 +    custom_indexable_attributes = ('title', 'content')
215 +
216 +
217 +class BlogFTIAdapter(IFullTextIndexSerializable):
218 +    __select__ = (IFullTextIndexSerializable.__select__ &
219 +                  is_instance('Blog'))
220 +    custom_indexable_attributes = ('title', )
221 
222 
223  class RealESTestMixin(object):
224 
225      @classmethod
diff --git a/test/test_elastic_search.py b/test/test_elastic_search.py
@@ -165,13 +165,12 @@
226 
227  class ElasticsearchTC(testlib.CubicWebTC):
228 
229      def test_1(self):
230          with self.admin_access.cnx() as cnx:
231 -            schema = cnx.vreg.schema
232              etype = 'Person'
233 -            rql = fulltext_indexable_rql(etype, schema)
234 +            rql = fulltext_indexable_rql(etype, cnx)
235              self.assertIn('age', rql)
236              self.assertNotIn('eid', rql)
237              self.assertEqual(rql.count('modification_date'), 1)
238 
239 
diff --git a/test/test_ifulltextadapter.py b/test/test_ifulltextadapter.py
@@ -0,0 +1,72 @@
240 +import unittest
241 +
242 +from mock import patch
243 +
244 +from cubicweb.devtools import testlib
245 +from cubicweb.cwconfig import CubicWebConfiguration
246 +
247 +from cubes.elasticsearch.testutils import BlogFTIAdapter, BlogEntryFTIAdapter
248 +
249 +
250 +class IFullTextIndexSerializableTC(testlib.CubicWebTC):
251 +
252 +    def setup_database(self):
253 +        super(IFullTextIndexSerializableTC, self).setup_database()
254 +        self.orig_config_for = CubicWebConfiguration.config_for
255 +        config_for = lambda appid: self.config  # noqa
256 +        CubicWebConfiguration.config_for = staticmethod(config_for)
257 +        self.config['elasticsearch-locations'] = 'http://nonexistant.elastic.search:9200'
258 +        self.config['index-name'] = 'unittest_index_name'
259 +
260 +    @patch('elasticsearch.client.indices.IndicesClient.create')
261 +    @patch('elasticsearch.client.indices.IndicesClient.exists')
262 +    @patch('elasticsearch.client.Elasticsearch.index')
263 +    def test_index_entity(self, create, exists, index):
264 +        """Only update indexable attributes while call entity.complete()
265 +           on IFullTextIndexSerializable.serialze()
266 +        """
267 +        with self.admin_access.repo_cnx() as cnx:
268 +            with self.temporary_appobjects(BlogFTIAdapter, BlogEntryFTIAdapter):
269 +                indexer = cnx.vreg['es'].select('indexer', cnx)
270 +                es = indexer.get_connection()
271 +                blog = cnx.create_entity('Blog', title=u'Blog')
272 +                cnx.commit()
273 +                self.assertTrue(es.index.called)
274 +                args, kwargs = es.index.call_args
275 +                # blog title is a in custom_indexable_attributes
276 +                self.assertEqual(kwargs['doc_type'], 'Blog')
277 +                self.assertEqual(kwargs['body']['title'], u'Blog')
278 +                index.reset_mock()
279 +                # create a BlogEntry
280 +                bentry = cnx.create_entity('BlogEntry', title=u'program',
281 +                                           content=u'Le nouveau programme',
282 +                                           entry_of=blog)
283 +                cnx.commit()
284 +                self.assertTrue(es.index.called)
285 +                args, kwargs = es.index.call_args
286 +                self.assertEqual(kwargs['doc_type'], 'BlogEntry')
287 +                for arg_name, expected_value in (
288 +                        ('content', u'Le nouveau programme'),
289 +                        ('cwuri', bentry.cwuri),
290 +                        ('title', 'program')):
291 +                    self.assertEqual(kwargs['body'][arg_name], expected_value)
292 +                self.assertFalse('content_format' in kwargs['body'])
293 +                # update BlogEntry
294 +                bentry.cw_set(title=u'Programme')
295 +                cnx.commit()
296 +                index.reset_mock()
297 +                self.assertTrue(es.index.called)
298 +                args, kwargs = es.index.call_args
299 +                for arg_name, expected_value in (
300 +                        ('id', bentry.eid), ('doc_type', bentry.cw_etype)):
301 +                    self.assertEqual(kwargs[arg_name], expected_value)
302 +                for arg_name, expected_value in (
303 +                        ('content', u'Le nouveau programme'),
304 +                        ('cwuri', bentry.cwuri),
305 +                        ('title', u'Programme')):
306 +                    self.assertEqual(kwargs['body'][arg_name], expected_value)
307 +                self.assertFalse('content_format' in kwargs['body'])
308 +
309 +
310 +if __name__ == '__main__':
311 +    unittest.main()
obsoletes