# HG changeset patch
# User Katia Saurfelt <katia.saurfelt@logilab.fr>
# Date 1490017003 -3600
# Mon Mar 20 14:36:43 2017 +0100
# Node ID 4cb5b3c2a4b362f54ad2364a644d7ea1e4e92946
# Parent f10462760dac0376dd7ba36d28f4a7f33f868027
[IFullTextIndexSerializable] use fulltext_indexable_attributes attribute to update cw_attr_cache
* get rid of CUSTOM_ATTRIBUTES. Replace it with the `custom_indexable_attributes`
attribute of the IFullTextIndexSerializable adapter;
* IFullTextIndexSerializable.serialize method: the default value of `complete` is now True
closes #17079905
# User Katia Saurfelt <katia.saurfelt@logilab.fr>
# Date 1490017003 -3600
# Mon Mar 20 14:36:43 2017 +0100
# Node ID 4cb5b3c2a4b362f54ad2364a644d7ea1e4e92946
# Parent f10462760dac0376dd7ba36d28f4a7f33f868027
[IFullTextIndexSerializable] use fulltext_indexable_attributes attribute to update cw_attr_cache
* get rid of CUSTOM_ATTRIBUTES. Replace it with the `custom_indexable_attributes`
attribute of the IFullTextIndexSerializable adapter;
* IFullTextIndexSerializable.serialize method: the default value of `complete` is now True
closes #17079905
@@ -94,24 +94,23 @@
1 2 def bulk_actions(self, etypes, cnx, index_name=None, dry_run=False): 3 if index_name is None: 4 index_name = cnx.vreg.config['index-name'] 5 for etype in etypes: 6 - rql = fulltext_indexable_rql(etype, cnx.vreg.schema) 7 + rql = fulltext_indexable_rql(etype, cnx) 8 rset = cnx.execute(rql) 9 cnx.info(u'[{}] indexing {} {} entities'.format(index_name, len(rset), etype)) 10 cnx.debug(u'RQL: {}'.format(rql)) 11 12 for entity in rset.entities(): 13 try: 14 serializer = entity.cw_adapt_to('IFullTextIndexSerializable') 15 - json = serializer.serialize() 16 + json = serializer.serialize(complete=False) 17 except Exception as e: 18 cnx.error('[{}] Failed to serialize entity {} ({})'.format( 19 index_name, entity.eid, etype)) 20 continue 21 - 22 if not dry_run and json: 23 # Entities with 24 # fulltext_containers relations return their container 25 # IFullTextIndex serializer , therefor the "id" and 26 # "doc_type" in kwargs bellow must be container data.
@@ -17,10 +17,12 @@
27 28 """cubicweb-elasticsearch entity's classes""" 29 30 import collections 31 32 +from logilab.common.decorators import cachedproperty 33 + 34 from cubicweb import view, neg_role 35 from cubicweb.predicates import is_instance 36 37 from cubicweb.appobject import AppObject 38
@@ -86,29 +88,48 @@
39 directly serialized to e.g. JSON. 40 """ 41 42 __regid__ = 'IFullTextIndexSerializable' 43 __select__ = is_instance('Any') 44 + custom_indexable_attributes = () 45 + skip_indexable_attributes = () 46 47 - def serialize(self, complete=False): 48 + @cachedproperty 49 + def fulltext_indexable_attributes(self): 50 + eschema = self._cw.vreg.schema[self.entity.cw_etype] 51 + attrs = ['creation_date', 'modification_date', 'cwuri'] 52 + attrs.extend([r.type for r in eschema.indexable_attributes() 53 + if r.type not in self.skip_indexable_attributes]) 54 + for rschema, tschema in eschema.attribute_definitions(): 55 + if rschema.type == 'eid': 56 + continue 57 + # XXX 58 + if tschema.type in ('Int', 'Float'): 59 + attrs.append(rschema.type) 60 + attrs.extend(self.custom_indexable_attributes) 61 + return attrs 62 + 63 + def serialize(self, complete=True): 64 entity = self.entity 65 if complete: 66 entity.complete() 67 data = { 68 'cw_etype': entity.cw_etype, 69 'eid': entity.eid, 70 'cwuri': entity.cwuri, 71 } 72 - data.update(entity.cw_attr_cache) 73 + for attr in self.fulltext_indexable_attributes: 74 + if attr in entity.cw_attr_cache: 75 + data[attr] = entity.cw_attr_cache[attr] 76 # TODO take a look at what's in entity.cw_relation_cache 77 return data 78 79 80 class File(IFullTextIndexSerializable): 81 __select__ = IFullTextIndexSerializable.__select__ & is_instance('File') 82 83 - def serialize(self, complete=False): 84 + def serialize(self, complete=True): 85 """this could be a generic implementation of fulltext_containers indexation, but for 86 87 now we can not return more than one parent json which is fine 88 for Files 89 """
@@ -25,12 +25,10 @@
90 91 INDEXABLE_TYPES = None 92 93 # customization mechanism, in your cube, add your type as a key, and a list of 94 # additionnal attributes 95 -# eg. CUSTOM_ATTRIBUTES['BlogEntry'] = ('description',) 96 -CUSTOM_ATTRIBUTES = {} 97 98 log = logging.getLogger(__name__) 99 100 101 def indexable_types(schema, custom_skip_list=None):
@@ -54,11 +52,11 @@
102 indexable_types.append(eschema.type) 103 INDEXABLE_TYPES = indexable_types 104 return indexable_types 105 106 107 -def fulltext_indexable_rql(etype, schema, eid=None): 108 +def fulltext_indexable_rql(etype, cnx, eid=None): 109 ''' 110 Generate RQL with fulltext_indexable attributes for a given entity type 111 112 :eid: 113 defaults to None, set it to an eid to get RQL for a single element (used in hooks)
@@ -68,28 +66,16 @@
114 rql = ['WHERE %s is %s' % (V, etype)] 115 if eid: 116 rql.append('%s eid %i' % (V, eid)) 117 var = next(varmaker) 118 selected = [] 119 - for rschema in schema.eschema(etype).indexable_attributes(): 120 - attr = rschema.type 121 + cw_entity = cnx.vreg['etypes'].etype_class(etype)(cnx) 122 + for attr in cw_entity.cw_adapt_to( 123 + 'IFullTextIndexSerializable').fulltext_indexable_attributes: 124 var = next(varmaker) 125 rql.append('%s %s %s' % (V, attr, var)) 126 selected.append(var) 127 - for rschema, tschema in schema.eschema(etype).attribute_definitions(): 128 - if rschema.type == 'eid': 129 - continue 130 - if tschema.type in ('Int', 'Float'): 131 - attr = rschema.type 132 - var = next(varmaker) 133 - rql.append('%s %s %s' % (V, attr, var)) 134 - selected.append(var) 135 - for attr in ('creation_date', 'modification_date', 'cwuri') + CUSTOM_ATTRIBUTES.get(etype, ()): 136 - var = next(varmaker) 137 - rql.append('%s %s %s' % (V, attr, var)) 138 - selected.append(var) 139 - # TODO inlined relations ? 140 return 'Any %s,%s %s' % (V, ','.join(selected), 141 ','.join(rql)) 142 143 144 def create_index(es, index_name, settings=None):
@@ -22,18 +22,19 @@
145 from elasticsearch.exceptions import ConnectionError, NotFoundError 146 from urllib3.exceptions import ProtocolError 147 148 from cubicweb.server import hook 149 from cubicweb.predicates import score_entity 150 -from cubicweb_elasticsearch.es import indexable_types, fulltext_indexable_rql, CUSTOM_ATTRIBUTES 151 + 152 +from cubicweb_elasticsearch.es import indexable_types 153 154 log = logging.getLogger(__name__) 155 156 157 def entity_indexable(entity): 158 return entity.cw_etype in indexable_types(entity._cw.vreg.schema) or \ 159 - entity.cw_etype in CUSTOM_ATTRIBUTES 160 + entity.cw_adapt_to('IFullTextIndexSerializable').custom_indexable_attributes 161 162 163 class ContentUpdateIndexES(hook.Hook): 164 """detect content change and updates ES indexing""" 165
@@ -55,13 +56,11 @@
166 167 def __call__(self): 168 # XXX add a selector for object and subject 169 for entity in (self._cw.entity_from_eid(self.eidfrom), 170 self._cw.entity_from_eid(self.eidto)): 171 - cw_etype = entity.cw_etype 172 - if (cw_etype in indexable_types(entity._cw.vreg.schema) or 173 - cw_etype in CUSTOM_ATTRIBUTES): 174 + if entity_indexable(entity): 175 IndexEsOperation.get_instance(self._cw).add_data(entity) 176 177 178 class IndexEsOperation(hook.DataOperationMixIn, hook.Operation): 179
@@ -86,16 +85,12 @@
180 id=entity.eid, 181 doc_type=entity.cw_etype) 182 if self.cnx.deleted_in_transaction(entity.eid): 183 self.delete_doc(es, **kwargs) 184 continue 185 - rql = fulltext_indexable_rql(entity.cw_etype, 186 - entity._cw.vreg.schema, 187 - eid=entity.eid) 188 - indexable_entity = self.cnx.execute(rql).one() 189 - serializer = indexable_entity.cw_adapt_to('IFullTextIndexSerializable') 190 - json = serializer.serialize(complete=True) 191 + serializer = entity.cw_adapt_to('IFullTextIndexSerializable') 192 + json = serializer.serialize() 193 if not json: 194 # if en entity has been already indexed, we still 195 # keep the first indexation 196 # which is wrong. We should remove the existing es entry. 197 continue
@@ -1,14 +1,25 @@
198 import unittest 199 import httplib 200 201 from elasticsearch_dsl.connections import connections 202 203 -from cubicweb_elasticsearch.es import CUSTOM_ATTRIBUTES 204 +from cubicweb.predicates import is_instance 205 + 206 +from cubicweb_elasticsearch.entities import IFullTextIndexSerializable 207 208 209 -CUSTOM_ATTRIBUTES['Blog'] = ('title',) 210 +class BlogEntryFTIAdapter(IFullTextIndexSerializable): 211 + __select__ = (IFullTextIndexSerializable.__select__ & 212 + is_instance('BlogEntry')) 213 + custom_indexable_attributes = ('title', 'content') 214 + 215 + 216 +class BlogFTIAdapter(IFullTextIndexSerializable): 217 + __select__ = (IFullTextIndexSerializable.__select__ & 218 + is_instance('Blog')) 219 + custom_indexable_attributes = ('title', ) 220 221 222 class RealESTestMixin(object): 223 224 @classmethod
@@ -11,11 +11,11 @@
225 226 from cubicweb.devtools import testlib 227 from cubicweb.cwconfig import CubicWebConfiguration 228 from cubicweb_elasticsearch import ccplugin 229 from cubicweb_elasticsearch.es import (indexable_types, 230 - fulltext_indexable_rql) 231 + fulltext_indexable_rql) 232 233 234 # TODO - find a way to configure ElasticSearch as non threaded while running tests 235 # so that the traces show the full stack, not just starting from connection.http_* 236 class ExportElasticSearchTC(testlib.AutoPopulateTest):
@@ -165,13 +165,12 @@
237 238 class ElasticsearchTC(testlib.CubicWebTC): 239 240 def test_1(self): 241 with self.admin_access.cnx() as cnx: 242 - schema = cnx.vreg.schema 243 etype = 'Person' 244 - rql = fulltext_indexable_rql(etype, schema) 245 + rql = fulltext_indexable_rql(etype, cnx) 246 self.assertIn('age', rql) 247 self.assertNotIn('eid', rql) 248 self.assertEqual(rql.count('modification_date'), 1) 249 250
@@ -0,0 +1,72 @@
251 +import unittest 252 + 253 +from mock import patch 254 + 255 +from cubicweb.devtools import testlib 256 +from cubicweb.cwconfig import CubicWebConfiguration 257 + 258 +from cubes.elasticsearch.testutils import BlogFTIAdapter, BlogEntryFTIAdapter 259 + 260 + 261 +class IFullTextIndexSerializableTC(testlib.CubicWebTC): 262 + 263 + def setup_database(self): 264 + super(IFullTextIndexSerializableTC, self).setup_database() 265 + self.orig_config_for = CubicWebConfiguration.config_for 266 + config_for = lambda appid: self.config # noqa 267 + CubicWebConfiguration.config_for = staticmethod(config_for) 268 + self.config['elasticsearch-locations'] = 'http://nonexistant.elastic.search:9200' 269 + self.config['index-name'] = 'unittest_index_name' 270 + 271 + @patch('elasticsearch.client.indices.IndicesClient.create') 272 + @patch('elasticsearch.client.indices.IndicesClient.exists') 273 + @patch('elasticsearch.client.Elasticsearch.index') 274 + def test_index_entity(self, create, exists, index): 275 + """Only update indexable attributes while call entity.complete() 276 + on IFullTextIndexSerializable.serialze() 277 + """ 278 + with self.admin_access.repo_cnx() as cnx: 279 + with self.temporary_appobjects(BlogFTIAdapter, BlogEntryFTIAdapter): 280 + indexer = cnx.vreg['es'].select('indexer', cnx) 281 + es = indexer.get_connection() 282 + blog = cnx.create_entity('Blog', title=u'Blog') 283 + cnx.commit() 284 + self.assertTrue(es.index.called) 285 + args, kwargs = es.index.call_args 286 + # blog title is a in custom_indexable_attributes 287 + self.assertEqual(kwargs['doc_type'], 'Blog') 288 + self.assertEqual(kwargs['body']['title'], u'Blog') 289 + index.reset_mock() 290 + # create a BlogEntry 291 + bentry = cnx.create_entity('BlogEntry', title=u'program', 292 + content=u'Le nouveau programme', 293 + entry_of=blog) 294 + cnx.commit() 295 + self.assertTrue(es.index.called) 296 + args, kwargs = es.index.call_args 297 + self.assertEqual(kwargs['doc_type'], 'BlogEntry') 298 + for 
arg_name, expected_value in ( 299 + ('content', u'Le nouveau programme'), 300 + ('cwuri', bentry.cwuri), 301 + ('title', 'program')): 302 + self.assertEqual(kwargs['body'][arg_name], expected_value) 303 + self.assertFalse('content_format' in kwargs['body']) 304 + # update BlogEntry 305 + bentry.cw_set(title=u'Programme') 306 + cnx.commit() 307 + index.reset_mock() 308 + self.assertTrue(es.index.called) 309 + args, kwargs = es.index.call_args 310 + for arg_name, expected_value in ( 311 + ('id', bentry.eid), ('doc_type', bentry.cw_etype)): 312 + self.assertEqual(kwargs[arg_name], expected_value) 313 + for arg_name, expected_value in ( 314 + ('content', u'Le nouveau programme'), 315 + ('cwuri', bentry.cwuri), 316 + ('title', u'Programme')): 317 + self.assertEqual(kwargs['body'][arg_name], expected_value) 318 + self.assertFalse('content_format' in kwargs['body']) 319 + 320 + 321 +if __name__ == '__main__': 322 + unittest.main()