[wip] External resources enrichment/aligner

author: Vincent Michel <vincent.michel@logilab.fr>
changeset: 47f581626a1f
branch: default
phase: draft
hidden: no
parent revision: #57a9d36063bf [interfaces] Protect Librdf blank nodes, closes #3478264
child revision: <not specified>
files modified by this revision
__pkginfo__.py
entities.py
external_resources.py
views.py
# HG changeset patch
# User Vincent Michel <vincent.michel@logilab.fr>
# Date 1391441253 0
# Mon Feb 03 15:27:33 2014 +0000
# Node ID 47f581626a1f6c0a65afcf760732a1b1b6e94ff6
# Parent 57a9d36063bfe602f16d25a83e7bb543ed3f0340
[wip] External resources enrichment/aligner

diff --git a/__pkginfo__.py b/__pkginfo__.py
@@ -11,11 +11,12 @@
1  author = 'LOGILAB S.A. (Paris, FRANCE)'
2  author_email = 'contact@logilab.fr'
3  description = 'Cube for data input/output, import and export'
4  web = 'http://www.cubicweb.org/project/%s' % distname
5 
6 -__depends__ =  {'cubicweb': '>= 3.17.1'}
7 +__depends__ =  {'cubicweb': '>= 3.17.1',
8 +                'nazca': '>=0.4.0'}
9  __recommends__ = {}
10 
11  classifiers = [
12      'Environment :: Web Environment',
13      'Framework :: CubicWeb',
diff --git a/entities.py b/entities.py
@@ -14,5 +14,35 @@
14  #
15  # You should have received a copy of the GNU Lesser General Public License along
16  # with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  """cubicweb-dataio entity's classes"""
19 +
20 +from cubicweb.predicates import is_instance, relation_possible
21 +
22 +from cubes.dataio.external_resources import AbstractIExternalResourceAdapter
23 +
24 +
25 +###############################################################################
26 +### EXTERNAL RESOURCES ADAPTERS - CONCRETE CLASSES ############################
27 +###############################################################################
class ExternalUriIExternalResourceAdapter(AbstractIExternalResourceAdapter):
    """IExternalResource adapter for ``ExternalUri`` entities.

    The only uri exposed is the ``uri`` attribute of the adapted entity.
    """
    __select__ = (AbstractIExternalResourceAdapter.__select__
                  & is_instance('ExternalUri'))

    @property
    def uris(self):
        """Return a 1-tuple holding the uri of the adapted entity."""
        entity_uri = self.entity.uri
        return (entity_uri,)
36 +
37 +
class SameAsExternalResourceAdapter(AbstractIExternalResourceAdapter):
    """IExternalResource adapter for entities having a ``same_as`` relation.

    The uris are gathered from every entity reachable through ``same_as``,
    using the IExternalResource adapter of each of them.
    """
    __select__ = (AbstractIExternalResourceAdapter.__select__
                  & relation_possible('same_as'))

    @property
    def uris(self):
        """Return the list of uris of all the ``same_as`` related entities.

        Related entities that cannot be adapted to IExternalResource are
        skipped instead of raising an AttributeError.
        """
        uris = []
        for related in self.entity.same_as:
            adapter = related.cw_adapt_to('IExternalResource')
            if adapter is None:
                # no adapter selectable for this entity: nothing to collect
                continue
            uris.extend(adapter.uris)
        return uris
diff --git a/external_resources.py b/external_resources.py
@@ -0,0 +1,236 @@
49 +# copyright 2003-2013 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
50 +# contact http://www.logilab.fr/ -- mailto:contact@logilab.fr
51 +#
52 +# This file is part of CubicWeb.
53 +#
54 +# CubicWeb is free software: you can redistribute it and/or modify it under the
55 +# terms of the GNU Lesser General Public License as published by the Free
56 +# Software Foundation, either version 2.1 of the License, or (at your option)
57 +# any later version.
58 +#
59 +# CubicWeb is distributed in the hope that it will be useful, but WITHOUT ANY
60 +# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
61 +# A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
62 +# details.
63 +#
64 +# You should have received a copy of the GNU Lesser General Public License along
65 +# with CubicWeb.  If not, see <http://www.gnu.org/licenses/>.
66 +
67 +""" External resources enrichment/alignment
68 +
69 +This module provides:
70 +
71 + * adapters for entities (e.g. ExternalUri) in order to fetch information
72 +   from sparql endpoints (or other sources/API);
73 +
74 + * sources for matching/enriching entities on sparql endpoints (or other sources/API);
75 +
76 + * some views for these informations.
77 +
78 +"""
79 +from collections import defaultdict
80 +from nazca.utils.dataio import sparqlquery
81 +
82 +from cubicweb.view import EntityAdapter
83 +
84 +
85 +
86 +
87 +###############################################################################
88 +### EXTERNAL RESOURCE RESOLVER ################################################
89 +###############################################################################
class ExternalResourceResolver(object):
    """Resolve and match uris against a sparql endpoint.

    Resolved data is a mapping ``{uri: {predicate: [object, ...]}}`` built
    from the rows returned by ``sparqlquery``. The ``fetch_*`` helpers
    extract some well-known predicates from such a mapping, and the
    ``_fetch_*`` helpers do the same on the ``{predicate: [object, ...]}``
    mapping of a single uri.
    """
    default_endpoint = 'http://demo.cubicweb.org/sparql'

    # Predicate followed by `_resolve` when recursing and read by
    # `fetch_same_as`.
    _same_as_rel = 'http://www.w3.org/2002/07/owl#sameAs'

    # `basestring` does not exist on every interpreter: fall back to `str`.
    try:
        _string_types = basestring
    except NameError:
        _string_types = str

    def __init__(self, uris=()):
        """Store the uris to resolve.

        :param uris: an iterable of uris, or a single uri given as a string.
        """
        if isinstance(uris, self._string_types):
            uris = (uris,)
        self.orig_uris = uris

    ###########################################################################
    ### RESOLVE/MATCH #########################################################
    ###########################################################################
    @property
    def uris(self):
        """The uris given at construction time."""
        return self.orig_uris

    def _resolve(self, uris, endpoint=None, recursion=0):
        """ Resolve a set of uris fetching information from a source.

        :param uris: iterable of uris to query.
        :param endpoint: sparql endpoint, `default_endpoint` if not given.
        :param recursion: number of times the owl:sameAs objects found
            should be resolved in turn.
        :return: a mapping ``{uri: {predicate: [object, ...]}}``; every
            queried uri gets an entry, possibly empty.
        """
        endpoint = endpoint or self.default_endpoint
        assert endpoint
        results = defaultdict(dict)
        same_as = set()
        for uri in uris:
            query = 'SELECT ?r ?o WHERE {<%s> ?r ?o.}' % uri
            for rel, obj in sparqlquery(endpoint, query):
                results[uri].setdefault(rel, []).append(obj)
            same_as.update(results[uri].get(self._same_as_rel, []))
        if recursion and same_as:
            results.update(self._resolve(same_as, endpoint, recursion=recursion-1))
        return results

    def resolve(self, endpoint=None, recursion=0):
        """ Resolve the uris of this object by fetching information from a
        source (see `_resolve` for the parameters and the returned mapping).
        """
        return self._resolve(self.uris, endpoint, recursion=recursion)

    def _escape_literal(self, value):
        """Escape `value` so that it can be put between double quotes in a
        sparql query. Non-string values are returned untouched.
        """
        if not isinstance(value, self._string_types):
            return value
        # the backslash has to be handled first, it is the escape character
        for char, escaped in (('\\', '\\\\'), ('"', '\\"'),
                              ('\n', '\\n'), ('\r', '\\r')):
            value = value.replace(char, escaped)
        return value

    def _build_match_query(self, value, rdfprop, case_insensitive, lang):
        """Build the sparql query used by `match`.

        `lang` is only used for the exact (case sensitive) match. In case
        insensitive mode, `value` is used as the pattern of a regex filter.
        """
        if case_insensitive:
            query = '''SELECT ?u WHERE {?u %(p)s ?title.
                           FILTER(regex(str(?title), "%(w)s", "i"))}'''
        elif lang:
            query = 'SELECT ?u WHERE {?u %(p)s "%(w)s"@%(l)s.}'
        else:
            query = 'SELECT ?u WHERE {?u %(p)s "%(w)s".}'
        return query % {'w': self._escape_literal(value), 'p': rdfprop, 'l': lang}

    def match(self, value, endpoint=None, rdfprop='rdfs:label',
              case_insensitive=False, lang=None):
        """ Match a value on a sparql endpoint.

        :param value: the value searched as object of `rdfprop`.
        :param endpoint: sparql endpoint, `default_endpoint` if not given.
        :param rdfprop: the property used for matching.
        :param case_insensitive: use a case insensitive regex filter instead
            of an exact match.
        :param lang: optional language tag of the literal (exact match only).
        :return: the set of matching uris, or an empty list if there is none.
        """
        endpoint = endpoint or self.default_endpoint
        query = self._build_match_query(value, rdfprop, case_insensitive, lang)
        uris = sparqlquery(endpoint, query)
        if uris:
            return set([u[0] for u in uris])
        return []

    def match_and_resolve(self, value, endpoint=None, rdfprop='rdfs:label',
                          case_insensitive=False, lang=None, recursion=0):
        """ Match a value on a sparql endpoint and resolve the uris found
        (see `match` and `_resolve` for the parameters).
        """
        uris = self.match(value, endpoint, rdfprop=rdfprop, lang=lang,
                          case_insensitive=case_insensitive)
        return self._resolve(uris, endpoint, recursion=recursion)

    ###########################################################################
    ### INFORMATION HELPERS ###################################################
    ###########################################################################
    def _collect(self, extract, data, endpoint):
        """Concatenate the result of `extract(infos)` for the infos of each
        uri of `data`. If `data` is empty or None, the uris of this object
        are resolved on `endpoint` first.
        """
        data = data if data else self.resolve(endpoint)
        global_data = []
        for infos in data.values():
            global_data.extend(extract(infos))
        return global_data

    def fetch_types(self, data=None, endpoint=None):
        """ Fetch the types """
        return self._collect(lambda infos: infos.get('rdf:type', []),
                             data, endpoint)

    def fetch_same_as(self, data=None, endpoint=None):
        """ Fetch the same_as """
        return self._collect(lambda infos: infos.get(self._same_as_rel, []),
                             data, endpoint)

    def _fetch_labels(self, data):
        """Return the skos prefLabel then rdfs label values of a single uri."""
        global_data = []
        global_data.extend(data.get('http://www.w3.org/2004/02/skos/core#prefLabel', []))
        global_data.extend(data.get('http://www.w3.org/2000/01/rdf-schema#label', []))
        return global_data

    def fetch_labels(self, data=None, endpoint=None):
        """ Fetch the labels """
        return self._collect(self._fetch_labels, data, endpoint)

    def _fetch_abstracts(self, data):
        """Return the skos editorialNote then dbpedia abstract values of a
        single uri.
        """
        global_data = []
        global_data.extend(data.get('http://www.w3.org/2004/02/skos/core#editorialNote', []))
        global_data.extend(data.get('http://dbpedia.org/ontology/abstract', []))
        return global_data

    def fetch_abstracts(self, data=None, endpoint=None):
        """ Fetch the abstracts """
        return self._collect(self._fetch_abstracts, data, endpoint)

    def _fetch_subjects(self, data=None, endpoint=None):
        """ Fetch the subjects of a single uri (`endpoint` is unused) """
        return data.get('http://purl.org/dc/terms/subject', [])

    def fetch_subjects(self, data=None, endpoint=None):
        """ Fetch the subjects """
        return self._collect(self._fetch_subjects, data, endpoint)

    def _fetch_images(self, data):
        """Return the dbpedia thumbnail then foaf depiction values of a
        single uri.
        """
        global_data = []
        global_data.extend(data.get('http://dbpedia.org/ontology/thumbnail', []))
        global_data.extend(data.get('foaf:depiction', []))
        return global_data

    def fetch_images(self, data=None, endpoint=None):
        """ Fetch the images """
        return self._collect(self._fetch_images, data, endpoint)

    def fetch_geo_infos(self, data=None, endpoint=None):
        """ Fetch the geo informations

        :return: one dictionary per uri, with the 'latitude', 'longitude'
            and 'geometry' keys; a value is None when the uri does not have
            the corresponding predicate.
        """
        data = data if data else self.resolve(endpoint)
        global_data = []
        for infos in data.values():
            # read the infos of the current uri, not the whole mapping
            global_data.append({'latitude': infos.get('geo:lat'),
                                'longitude': infos.get('geo:long'),
                                'geometry': infos.get('geo:geometry')})
        return global_data
236 +
237 +
238 +###############################################################################
239 +### ABSTRACT ADAPTER ##########################################################
240 +###############################################################################
class AbstractIExternalResourceAdapter(EntityAdapter, ExternalResourceResolver):
    """Abstract base of the 'IExternalResource' adapters.

    Concrete adapters have to override the `uris` property.
    """
    __regid__ = 'IExternalResource'
    __abstract__ = True

    def __init__(self, *args, **kwargs):
        # plain delegation to the next initializer in the MRO
        super(AbstractIExternalResourceAdapter, self).__init__(*args, **kwargs)

    ###########################################################################
    ### RESOLVE/MATCH #########################################################
    ###########################################################################
    @property
    def uris(self):
        """The uris of the adapted entity, to be provided by subclasses."""
        raise NotImplementedError
254 +
255 +##     def uris_by_categories(self, caturi, limit=None):
256 +##         """ Fetch all the uris from a category
257 +##         """
258 +##         query = '''select distinct ?uri  where {
259 +##         ?uri <http://purl.org/dc/terms/subject> <%s>.
260 +##         }''' % caturi
261 +##         return self.fetch(query, limit)
262 +
263 +##     def get_events(self, uri):
264 +##         """ Get events from an uri
265 +##         """
266 +##         # Persons
267 +##         birthdate = self.get_dbpedia_info(uri, 'birthDate')
268 +##         deathdate = self.get_dbpedia_info(uri, 'deathDate')
269 +##         if birthdate and deathdate:
270 +##             return {'start': birthdate[0][0], 'stop': deathdate[0][0]}
271 +##         # Events/Year
272 +##         types = self.get_types(uri)
273 +##         if 'http://dbpedia.org/ontology/Year' in types.get('type', []):
274 +##             try:
275 +##                 year = (uri.rsplit('/', 1)[-1]) + '-1-1'
276 +##                 return {'start': year, 'stop': year}
277 +##             except ValueError:
278 +##                 pass
279 +##         elif 'http://dbpedia.org/ontology/Agent' in types.get('type', []):
280 +##             start = self.get_dbpedia_info(uri, 'activeYearsStartDate')
281 +##             stop = self.get_dbpedia_info(uri, 'activeYearsEndDate')
282 +##             if start and stop:
283 +##                 return {'start': start[0][0], 'stop': stop[0][0]}
284 +##         return {}
diff --git a/views.py b/views.py
@@ -19,10 +19,13 @@
285  from logilab.common.decorators import monkeypatch
286 
287  from cubicweb.web import Redirect
288  from cubicweb.web import controller
289  from cubicweb.web import component
290 +from cubicweb.selectors import adaptable
291 +from cubicweb.web.views.tabs import LazyViewMixin
292 +from cubicweb.view import EntityView
293 
294  from cubes.dataio.xy import XY
295 
296  try:
297      from fyzz.yappsparser import parse
@@ -162,10 +165,146 @@
298              url = self._cw.build_url('view', rql=rql % args, vid=sparql_format)
299              raise Redirect(url)
300          raise Redirect(self._cw.build_url())
301 
302 
303 +###############################################################################
304 +### EXTERNAL RESOURCES VIEWS ##################################################
305 +###############################################################################
class ResolveExternalResourcesView(LazyViewMixin, EntityView):
    """Display every piece of information resolved for the uris of an entity
    adaptable to IExternalResource.
    """
    __regid__ = 'resolve-all-external-resources'
    __select__ = EntityView.__select__ & adaptable('IExternalResource')

    def display_uri(self, uri, infos):
        """ Write the uri as a title, then a definition list with one entry
        per key of `infos`, its values being comma separated.
        """
        write = self.w
        write(u'<h5>%s</h5>' % uri)
        write(u'<dl>')
        for key, values in infos.iteritems():
            write(u'<dt>%s</dt>' % key)
            rendered = []
            for item in values:
                # non-string values are converted before being joined
                rendered.append(item if isinstance(item, basestring) else str(item))
            write(u'<dd>%s</dd>' % ', '.join(rendered))
        write(u'</dl>')

    def cell_call(self, row, col, data=None, recursion=0, endpoint=None):
        """ Display the uris of `data` having some information; when `data`
        is not given, it is resolved through the adapter of the entity.
        """
        # XXX LazyView ?
        adapter = self.cw_rset.get_entity(row, col).cw_adapt_to('IExternalResource')
        # kept on the view, for the use of display_uri implementations
        self.adapted = adapter
        if not data:
            data = adapter.resolve(endpoint=endpoint, recursion=recursion)
        for uri, infos in data.iteritems():
            if not infos:
                continue
            self.display_uri(uri, infos)
332 +
333 +
class InfosExternalResourcesView(ResolveExternalResourcesView):
    """Display a summary (image, label, abstracts, subjects) of the
    information resolved for each uri.
    """
    __regid__ = 'resolve-infos-external-resources'

    def display_uri(self, uri, infos):
        """ Render the infos for an uri

        `infos` is the mapping of a single uri, hence the use of the
        `_fetch_*` helpers of the adapter: the `fetch_*` ones expect the
        whole ``{uri: infos}`` mapping.
        """
        # NOTE(review): the fetched values are written without any html
        # escaping -- confirm whether they should be escaped first.
        adapted = self.adapted
        self.w(u'<h5><a href="%s">%s</a></h5>' % (uri, uri))
        self.w(u'<div class="row">')
        self.w(u'<div class="col-md-2">')
        images = adapted._fetch_images(infos)
        if images:
            self.w(u'<a href="#" class="thumbnail"><img src="%s"/></a>' % images[0])
        self.w(u'</div>')
        self.w(u'<div class="col-md-8">')
        labels = adapted._fetch_labels(infos)
        if labels:
            self.w(u'<h4>%s</h4>' % labels[0])
        abstracts = adapted._fetch_abstracts(infos)
        if abstracts:
            self.w(u'<br/>'.join(abstracts))
        subjects = adapted._fetch_subjects(infos)
        # the links only depend on the subjects, not on the labels
        if subjects:
            self.w(u', '.join([u'<a href="%s">%s</a>' % (c, c)
                               for c in subjects]))
        self.w(u'</div>')
        self.w(u'</div>')
360 +
361 +
class MatchExternalResourcesView(ResolveExternalResourcesView):
    """Display the information of the uris matching a value computed from
    the entity.
    """
    __regid__ = 'match-all-external-resources'

    def cell_call(self, row, col, data=None, case_insensitive=False,
                  match_callback=None, lang=None, recursion=0, endpoint=None):
        """ Display the uris of `data` having some information; when `data`
        is not given, it comes from `match_and_resolve` called on the adapter
        with `match_callback(entity)`, or the dc_title of the entity if there
        is no callback.
        """
        # XXX LazyView ?
        entity = self.cw_rset.get_entity(row, col)
        self.adapted = entity.cw_adapt_to('IExternalResource')
        if match_callback:
            value = match_callback(entity)
        else:
            value = entity.dc_title()
        if not data:
            options = {'endpoint': endpoint, 'value': value,
                       'case_insensitive': case_insensitive,
                       'lang': lang, 'recursion': recursion}
            data = self.adapted.match_and_resolve(**options)
        for uri, infos in data.iteritems():
            if not infos:
                continue
            self.display_uri(uri, infos)
380 +
381 +
class InfosMatchExternalResourcesView(MatchExternalResourcesView):
    """Display a summary (image, label, abstracts, subjects) of the
    information of each matched uri.
    """
    __regid__ = 'match-infos-external-resources'

    def display_uri(self, uri, infos):
        """ Render the infos for an uri

        `infos` is the mapping of a single uri, hence the use of the
        `_fetch_*` helpers of the adapter: the `fetch_*` ones expect the
        whole ``{uri: infos}`` mapping.
        """
        # NOTE(review): the fetched values are written without any html
        # escaping -- confirm whether they should be escaped first.
        adapted = self.adapted
        self.w(u'<h5><a href="%s">%s</a></h5>' % (uri, uri))
        self.w(u'<div class="row">')
        self.w(u'<div class="col-md-2">')
        images = adapted._fetch_images(infos)
        if images:
            self.w(u'<a href="#" class="thumbnail"><img src="%s"/></a>' % images[0])
        self.w(u'</div>')
        self.w(u'<div class="col-md-8">')
        labels = adapted._fetch_labels(infos)
        if labels:
            self.w(u'<h4>%s</h4>' % labels[0])
        abstracts = adapted._fetch_abstracts(infos)
        if abstracts:
            self.w(u'<br/>'.join(abstracts))
        subjects = adapted._fetch_subjects(infos)
        # the links only depend on the subjects, not on the labels
        if subjects:
            self.w(u', '.join([u'<a href="%s">%s</a>' % (c, c)
                               for c in subjects]))
        self.w(u'</div>')
        self.w(u'</div>')
408 +
409 +
class GlobalExternalResourcesView(LazyViewMixin, EntityView):
    """Display every piece of information resolved for the uris of an entity
    adaptable to IExternalResource.
    """
    # NOTE(review): same __regid__ and __select__ as
    # ResolveExternalResourcesView -- confirm which identifier is meant here.
    __regid__ = 'resolve-all-external-resources'
    __select__ = EntityView.__select__ & adaptable('IExternalResource')

    def display_uri(self, uri, infos):
        """ Write the uri as a title, then a definition list with one entry
        per key of `infos`, its values being comma separated.
        """
        self.w(u'<h5>%s</h5>' % uri)
        self.w(u'<dl>')
        for predicate, objects in infos.iteritems():
            # non-string values are converted before being joined
            as_text = [obj if isinstance(obj, basestring) else str(obj)
                       for obj in objects]
            self.w(u'<dt>%s</dt>' % predicate)
            self.w(u'<dd>%s</dd>' % ', '.join(as_text))
        self.w(u'</dl>')

    def cell_call(self, row, col, data=None, recursion=0, endpoint=None):
        """ Display the uris of `data` having some information; when `data`
        is not given, it is resolved through the adapter of the entity.
        """
        # XXX LazyView ?
        entity = self.cw_rset.get_entity(row, col)
        self.adapted = entity.cw_adapt_to('IExternalResource')
        resolved = data
        if not resolved:
            resolved = self.adapted.resolve(endpoint=endpoint, recursion=recursion)
        for uri, infos in resolved.iteritems():
            if not infos:
                continue
            self.display_uri(uri, infos)
436 +
437 +
438 +
439  def registration_callback(vreg):
440      vreg.register_all(globals().values(), __name__, (DataIORDFView,))
441      if rdflib is not None:
442          vreg.register_and_replace(DataIORDFView, RDFView)
443      # Register parser