diff --git a/docs/index.rst b/docs/index.rst index 5d0364eaa..b4d779d61 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -12,6 +12,7 @@ install gettingstarted + plugins views inference usecases diff --git a/docs/plugins.md b/docs/plugins.md new file mode 100644 index 000000000..ae486b6b6 --- /dev/null +++ b/docs/plugins.md @@ -0,0 +1,190 @@ +# Search Plugins + +Whyis supports multiple full-text search backends through a plugin system. Search plugins provide entity resolution and full-text search capabilities across your knowledge graph. + +## Available Plugins + +### Fuseki Search Plugin + +The **Fuseki Search Plugin** (`whyis_fuseki_search`) integrates with Apache Jena Fuseki's text search functionality using Apache Lucene. + +**Use this plugin when:** +- Using Apache Jena Fuseki as your triple store +- You have Fuseki configured with a text index +- Running Whyis in standard on-premise or self-hosted environments + +**Configuration:** +```python +RESOLVER_TYPE = 'fuseki' # or 'sparql' +RESOLVER_DB = 'knowledge' +PLUGINENGINE_PLUGINS = ['whyis_fuseki_search'] +``` + +See [fuseki_search plugin documentation](../whyis/plugins/fuseki_search/README.md) for detailed setup instructions. + +### Neptune Search Plugin + +The **Neptune Search Plugin** (`whyis_neptune_search`) integrates with AWS Neptune's OpenSearch full-text search. + +**Use this plugin when:** +- Using AWS Neptune as your triple store +- You have Neptune configured with OpenSearch integration +- Running Whyis in AWS cloud environments + +**Configuration:** +```python +RESOLVER_TYPE = 'neptune' +RESOLVER_DB = 'knowledge' +PLUGINENGINE_PLUGINS = ['whyis_neptune_search'] +``` + +See [neptune_search plugin documentation](../whyis/plugins/neptune_search/README.md) for detailed setup instructions. 
+ +## Choosing a Search Plugin + +The choice of search plugin depends on your triple store backend: + +| Triple Store | Plugin | Search Backend | +|-------------|---------|----------------| +| Apache Jena Fuseki | `whyis_fuseki_search` | Apache Lucene | +| AWS Neptune | `whyis_neptune_search` | Amazon OpenSearch | +| Other SPARQL endpoints with text: namespace | `whyis_fuseki_search` | Varies | + +## Configuration Options + +Both plugins support the following configuration options: + +### RESOLVER_TYPE +The type of resolver to use. Valid values: +- `'fuseki'` or `'sparql'` - Uses Fuseki Search Plugin +- `'neptune'` - Uses Neptune Search Plugin + +Default: `'fuseki'` + +### RESOLVER_DB +The name of the database to search. + +Default: `'knowledge'` + +### PLUGINENGINE_PLUGINS +List of plugins to load. Include the appropriate search plugin: + +```python +PLUGINENGINE_PLUGINS = ['whyis_fuseki_search'] # For Fuseki +# or +PLUGINENGINE_PLUGINS = ['whyis_neptune_search'] # For Neptune +``` + +## Entity Resolution + +Both plugins implement entity resolution, which allows you to search for entities by term. The resolve view is accessible at: + +``` +/?view=resolve&term= +``` + +Optional parameters: +- `type` - Filter results by RDF type +- `context` - Context term for relevance boosting + +Example: +``` +/?view=resolve&term=protein&type=http://example.org/Protein +``` + +## Search Data View + +Both plugins provide a search data view that returns JSON results. This is used by the search interface and is accessible at: + +``` +/home?view=search_data&query= +``` + +Example: +``` +/home?view=search_data&query=enzyme +``` + +## Implementation Details + +### Query Differences + +The main difference between the plugins is the SPARQL syntax used: + +**Fuseki Search:** +```sparql +(?label ?relevance) text:search 'search_term'. +``` + +**Neptune Search:** +```sparql +SERVICE { + [] fts:search 'search_term' ; + fts:matchQuery '*' ; + fts:entity ?node ; + fts:score ?relevance . 
+} +``` + +Neptune uses a SERVICE clause to invoke OpenSearch integration, while Fuseki uses a direct predicate-based approach with Apache Lucene. + +### Searched Properties + +Both plugins search across these RDF properties: +- `dc:title` +- `rdfs:label` +- `skos:prefLabel` +- `skos:altLabel` +- `foaf:name` +- `dc:identifier` +- `schema:name` +- `skos:notation` + +### Filtered Resource Types + +Both plugins exclude these resource types from results: +- Semantic Science Integrated Ontology terms +- Nanopublication metadata (Nanopublication, Assertion, Provenance, PublicationInfo) + +## Extending Search + +To create a custom search plugin: + +1. Create a new plugin directory under `whyis/plugins/` +2. Implement an `EntityResolverListener` subclass with `on_resolve` method +3. Create a `Plugin` subclass that registers the resolver +4. Add templates for search views +5. Register the plugin in `setup.py` entry_points + +See the existing plugins as examples: +- [fuseki_search/plugin.py](../whyis/plugins/fuseki_search/plugin.py) +- [neptune_search/plugin.py](../whyis/plugins/neptune_search/plugin.py) + +## Troubleshooting + +### No search results + +1. Verify the search index is configured correctly for your triple store +2. Check that the `RESOLVER_TYPE` matches your triple store +3. Ensure the appropriate plugin is listed in `PLUGINENGINE_PLUGINS` +4. Verify that data has been indexed (may require rebuild/reindex) + +### Wrong plugin loaded + +Check your configuration: +```python +# Verify RESOLVER_TYPE +print(app.config['RESOLVER_TYPE']) + +# Verify loaded plugins +print(app.config['PLUGINENGINE_PLUGINS']) +``` + +### Import errors + +Ensure the plugin is properly installed: +```bash +pip install -e . +``` + +This will register the plugin entry points from `setup.py`. 
diff --git a/setup.py b/setup.py index 95c8837ee..3087c56c6 100644 --- a/setup.py +++ b/setup.py @@ -231,7 +231,8 @@ def run(self): 'text/turtle = rdflib.plugins.sparql.results.graph:GraphResultParser' ], 'whyis': [ - 'whyis_sparql_entity_resolver = whyis.plugins.sparql_entity_resolver:SPARQLEntityResolverPlugin', + 'whyis_fuseki_search = whyis.plugins.fuseki_search:FusekiSearchPlugin', + 'whyis_neptune_search = whyis.plugins.neptune_search:NeptuneSearchPlugin', 'whyis_knowledge_explorer = whyis.plugins.knowledge_explorer:KnowledgeExplorerPlugin' ] }, diff --git a/tests/unit/test_search_plugins.py b/tests/unit/test_search_plugins.py new file mode 100644 index 000000000..3df48344f --- /dev/null +++ b/tests/unit/test_search_plugins.py @@ -0,0 +1,438 @@ +""" +Unit tests for search plugins (fuseki_search and neptune_search). + +Tests both entity resolution and search functionality for Fuseki and Neptune backends. +""" + +import unittest +from unittest.mock import Mock, MagicMock, patch +import rdflib +from rdflib import Namespace, Literal, URIRef + + +class TestFusekiSearchPlugin(unittest.TestCase): + """Test the FusekiSearchPlugin and FusekiEntityResolver.""" + + def setUp(self): + """Set up test fixtures.""" + from whyis.plugins.fuseki_search.plugin import FusekiEntityResolver, FusekiSearchPlugin + self.resolver_class = FusekiEntityResolver + self.plugin_class = FusekiSearchPlugin + + def test_resolver_init(self): + """Test FusekiEntityResolver initialization.""" + resolver = self.resolver_class(database="test_db") + self.assertEqual(resolver.database, "test_db") + + # Test default database + resolver_default = self.resolver_class() + self.assertEqual(resolver_default.database, "knowledge") + + def test_resolver_query_format(self): + """Test that FusekiEntityResolver generates correct SPARQL queries.""" + resolver = self.resolver_class() + + # Check query structure contains text:search + self.assertIn("text:search", resolver.query) + self.assertIn("(?label 
?relevance)", resolver.query) + + # Check for proper filtering + self.assertIn("filter not exists", resolver.query) + self.assertIn("np:Nanopublication", resolver.query) + + def test_resolver_type_query(self): + """Test type filtering in queries.""" + resolver = self.resolver_class() + type_uri = "http://example.org/TestType" + type_query = resolver.type_query % type_uri + + self.assertIn("rdf:type", type_query) + self.assertIn(type_uri, type_query) + + def test_resolver_context_query(self): + """Test context filtering in queries.""" + resolver = self.resolver_class() + context = "test context" + context_query = resolver.context_query % context + + self.assertIn("text:search", context_query) + self.assertIn("optional", context_query.lower()) + + def test_resolver_on_resolve_basic(self): + """Test basic entity resolution.""" + with patch('flask.current_app') as mock_app: + resolver = self.resolver_class() + + # Mock the database and query results + mock_graph = Mock() + mock_app.databases = {"knowledge": mock_graph} + + # Mock query result + mock_result = Mock() + mock_result.asdict.return_value = { + 'node': 'http://example.org/entity1', + 'label': 'Test Entity', + 'types': 'http://example.org/Type1||http://example.org/Type2', + 'score': 1.0 + } + mock_graph.query.return_value = [mock_result] + + # Mock labelize + mock_app.labelize.side_effect = lambda d, k, v: d.update({v: 'Labeled'}) + + results = resolver.on_resolve("test", label=False) + + self.assertEqual(len(results), 1) + self.assertEqual(results[0]['node'], 'http://example.org/entity1') + self.assertIn('types', results[0]) + self.assertEqual(len(results[0]['types']), 2) + + def test_resolver_on_resolve_with_type(self): + """Test entity resolution with type filtering.""" + with patch('flask.current_app') as mock_app: + resolver = self.resolver_class() + + mock_graph = Mock() + mock_app.databases = {"knowledge": mock_graph} + mock_graph.query.return_value = [] + + type_uri = "http://example.org/TestType" + 
results = resolver.on_resolve("test", type=type_uri, label=False) + + # Verify query was called + mock_graph.query.assert_called_once() + call_args = mock_graph.query.call_args[0][0] + + # Check that type filter is in query + self.assertIn(type_uri, call_args) + + def test_resolver_on_resolve_with_context(self): + """Test entity resolution with context.""" + with patch('flask.current_app') as mock_app: + resolver = self.resolver_class() + + mock_graph = Mock() + mock_app.databases = {"knowledge": mock_graph} + mock_graph.query.return_value = [] + + context = "test context" + results = resolver.on_resolve("test", context=context, label=False) + + # Verify query was called + mock_graph.query.assert_called_once() + call_args = mock_graph.query.call_args[0][0] + + # Check that context is in query + self.assertIn(context, call_args) + + def test_plugin_resolvers_dict(self): + """Test that plugin has correct resolver mappings.""" + plugin = self.plugin_class() + + self.assertIn("fuseki", plugin.resolvers) + self.assertIn("sparql", plugin.resolvers) + self.assertEqual(plugin.resolvers["fuseki"], self.resolver_class) + self.assertEqual(plugin.resolvers["sparql"], self.resolver_class) + + def test_plugin_create_blueprint(self): + """Test plugin blueprint creation.""" + with patch('whyis.plugins.fuseki_search.plugin.PluginBlueprint') as mock_blueprint: + plugin = self.plugin_class() + blueprint = plugin.create_blueprint() + + # Verify PluginBlueprint was called with correct arguments + mock_blueprint.assert_called_once_with('fuseki_search', + 'whyis.plugins.fuseki_search.plugin', + template_folder='templates') + + def test_plugin_init_valid_type(self): + """Test plugin initialization with valid resolver type.""" + plugin = self.plugin_class() + mock_app = Mock() + plugin.app = mock_app + mock_app.config.get.side_effect = lambda k, d: {'RESOLVER_TYPE': 'fuseki', + 'RESOLVER_DB': 'knowledge'}.get(k, d) + + plugin.init() + + # Verify add_listener was called + 
mock_app.add_listener.assert_called_once() + + def test_plugin_init_invalid_type(self): + """Test plugin initialization with invalid resolver type.""" + plugin = self.plugin_class() + mock_app = Mock() + plugin.app = mock_app + mock_app.config.get.side_effect = lambda k, d: {'RESOLVER_TYPE': 'invalid', + 'RESOLVER_DB': 'knowledge'}.get(k, d) + + # Should raise ValueError for invalid type + with self.assertRaises(ValueError) as context: + plugin.init() + + self.assertIn("Invalid RESOLVER_TYPE", str(context.exception)) + + +class TestNeptuneSearchPlugin(unittest.TestCase): + """Test the NeptuneSearchPlugin and NeptuneEntityResolver.""" + + def setUp(self): + """Set up test fixtures.""" + from whyis.plugins.neptune_search.plugin import NeptuneEntityResolver, NeptuneSearchPlugin + self.resolver_class = NeptuneEntityResolver + self.plugin_class = NeptuneSearchPlugin + + def test_resolver_init(self): + """Test NeptuneEntityResolver initialization.""" + resolver = self.resolver_class(database="test_db") + self.assertEqual(resolver.database, "test_db") + + # Test default database + resolver_default = self.resolver_class() + self.assertEqual(resolver_default.database, "knowledge") + + def test_resolver_query_format(self): + """Test that NeptuneEntityResolver generates correct SPARQL queries.""" + resolver = self.resolver_class() + + # Check query structure contains SERVICE clause and fts:search + self.assertIn("SERVICE ftsEndpoint", resolver.query) + self.assertIn("fts:search", resolver.query) + self.assertIn("fts:matchQuery", resolver.query) + self.assertIn("fts:entity", resolver.query) + self.assertIn("fts:score", resolver.query) + + # Check for proper filtering + self.assertIn("filter not exists", resolver.query) + self.assertIn("np:Nanopublication", resolver.query) + + def test_resolver_service_clause(self): + """Test that SERVICE clause is properly formatted.""" + resolver = self.resolver_class() + + # Verify SERVICE clause structure + self.assertIn("SERVICE 
ftsEndpoint", resolver.query) + self.assertIn("[] fts:search", resolver.query) + + # Check context query also uses SERVICE + self.assertIn("SERVICE ftsEndpoint", resolver.context_query) + + def test_resolver_type_query(self): + """Test type filtering in queries.""" + resolver = self.resolver_class() + type_uri = "http://example.org/TestType" + type_query = resolver.type_query % type_uri + + self.assertIn("rdf:type", type_query) + self.assertIn(type_uri, type_query) + + def test_resolver_context_query(self): + """Test context filtering in queries.""" + resolver = self.resolver_class() + context = "test context" + context_query = resolver.context_query % (context, context) + + self.assertIn("fts:search", context_query) + self.assertIn("fts:matchQuery", context_query) + self.assertIn("optional", context_query.lower()) + + def test_resolver_on_resolve_basic(self): + """Test basic entity resolution.""" + with patch('flask.current_app') as mock_app: + resolver = self.resolver_class() + + # Mock the database and query results + mock_graph = Mock() + mock_app.databases = {"knowledge": mock_graph} + + # Mock query result + mock_result = Mock() + mock_result.asdict.return_value = { + 'node': 'http://example.org/entity1', + 'label': 'Test Entity', + 'types': 'http://example.org/Type1||http://example.org/Type2', + 'score': 1.0 + } + mock_graph.query.return_value = [mock_result] + + # Mock labelize + mock_app.labelize.side_effect = lambda d, k, v: d.update({v: 'Labeled'}) + + results = resolver.on_resolve("test", label=False) + + self.assertEqual(len(results), 1) + self.assertEqual(results[0]['node'], 'http://example.org/entity1') + self.assertIn('types', results[0]) + self.assertEqual(len(results[0]['types']), 2) + + def test_resolver_on_resolve_with_empty_types(self): + """Test entity resolution handles empty types correctly.""" + with patch('flask.current_app') as mock_app: + resolver = self.resolver_class() + + mock_graph = Mock() + mock_app.databases = {"knowledge": 
mock_graph} + + # Mock query result with empty types + mock_result = Mock() + mock_result.asdict.return_value = { + 'node': 'http://example.org/entity1', + 'label': 'Test Entity', + 'types': '', # Empty types string + 'score': 1.0 + } + mock_graph.query.return_value = [mock_result] + + results = resolver.on_resolve("test", label=False) + + # Should handle empty types gracefully + self.assertEqual(len(results), 1) + self.assertEqual(results[0]['types'], []) + + def test_resolver_on_resolve_with_type(self): + """Test entity resolution with type filtering.""" + with patch('flask.current_app') as mock_app: + resolver = self.resolver_class() + + mock_graph = Mock() + mock_app.databases = {"knowledge": mock_graph} + mock_graph.query.return_value = [] + + type_uri = "http://example.org/TestType" + results = resolver.on_resolve("test", type=type_uri, label=False) + + # Verify query was called + mock_graph.query.assert_called_once() + call_args = mock_graph.query.call_args[0][0] + + # Check that type filter is in query + self.assertIn(type_uri, call_args) + + def test_resolver_on_resolve_with_context(self): + """Test entity resolution with context.""" + with patch('flask.current_app') as mock_app: + resolver = self.resolver_class() + + mock_graph = Mock() + mock_app.databases = {"knowledge": mock_graph} + mock_graph.query.return_value = [] + + context = "test context" + results = resolver.on_resolve("test", context=context, label=False) + + # Verify query was called + mock_graph.query.assert_called_once() + call_args = mock_graph.query.call_args[0][0] + + # Check that context is in query (appears twice for matchQuery) + self.assertEqual(call_args.count(context), 2) + + def test_resolver_match_query_parameter(self): + """Test that matchQuery parameter is included correctly.""" + with patch('flask.current_app') as mock_app: + resolver = self.resolver_class() + + mock_graph = Mock() + mock_app.databases = {"knowledge": mock_graph} + mock_graph.query.return_value = [] + + 
results = resolver.on_resolve("test", label=False) + + # Verify query was called + mock_graph.query.assert_called_once() + call_args = mock_graph.query.call_args[0][0] + + # Check that matchQuery with '*' is in query + self.assertIn("fts:matchQuery '*'", call_args) + + def test_plugin_resolvers_dict(self): + """Test that plugin has correct resolver mappings.""" + plugin = self.plugin_class() + + self.assertIn("neptune", plugin.resolvers) + self.assertEqual(plugin.resolvers["neptune"], self.resolver_class) + + def test_plugin_create_blueprint(self): + """Test plugin blueprint creation.""" + with patch('whyis.plugins.neptune_search.plugin.PluginBlueprint') as mock_blueprint: + plugin = self.plugin_class() + blueprint = plugin.create_blueprint() + + # Verify PluginBlueprint was called with correct arguments + mock_blueprint.assert_called_once_with('neptune_search', + 'whyis.plugins.neptune_search.plugin', + template_folder='templates') + + def test_plugin_init_valid_type(self): + """Test plugin initialization with valid resolver type.""" + mock_app = Mock() + plugin = self.plugin_class() + plugin.app = mock_app + mock_app.config.get.side_effect = lambda k, d: {'RESOLVER_TYPE': 'neptune', + 'RESOLVER_DB': 'knowledge'}.get(k, d) + + plugin.init() + + # Verify add_listener was called + mock_app.add_listener.assert_called_once() + + def test_plugin_init_invalid_type_silent(self): + """Test plugin initialization silently skips invalid resolver type.""" + mock_app = Mock() + plugin = self.plugin_class() + plugin.app = mock_app + mock_app.config.get.side_effect = lambda k, d: {'RESOLVER_TYPE': 'fuseki', + 'RESOLVER_DB': 'knowledge'}.get(k, d) + + # Should not raise, should silently skip + plugin.init() + + # Verify add_listener was NOT called + mock_app.add_listener.assert_not_called() + + +class TestSearchPluginIntegration(unittest.TestCase): + """Integration tests comparing Fuseki and Neptune plugins.""" + + def test_both_plugins_have_same_interface(self): + """Test that 
both plugins implement the same interface.""" + from whyis.plugins.fuseki_search.plugin import FusekiEntityResolver + from whyis.plugins.neptune_search.plugin import NeptuneEntityResolver + + fuseki_methods = {m for m in dir(FusekiEntityResolver) if not m.startswith('_')} + neptune_methods = {m for m in dir(NeptuneEntityResolver) if not m.startswith('_')} + + # Both should have on_resolve method + self.assertIn('on_resolve', fuseki_methods) + self.assertIn('on_resolve', neptune_methods) + + def test_both_plugins_filter_same_types(self): + """Test that both plugins filter the same resource types.""" + from whyis.plugins.fuseki_search.plugin import FusekiEntityResolver + from whyis.plugins.neptune_search.plugin import NeptuneEntityResolver + + fuseki = FusekiEntityResolver() + neptune = NeptuneEntityResolver() + + # Both should filter nanopublication types + self.assertIn("np:Nanopublication", fuseki.query) + self.assertIn("np:Nanopublication", neptune.query) + + self.assertIn("np:Assertion", fuseki.query) + self.assertIn("np:Assertion", neptune.query) + + def test_prefixes_compatibility(self): + """Test that both plugins define compatible prefixes.""" + from whyis.plugins.fuseki_search.plugin import prefixes as fuseki_prefixes + from whyis.plugins.neptune_search.plugin import prefixes as neptune_prefixes + + # Common prefixes should exist + common_keys = ['skos', 'foaf', 'schema', 'owl', 'rdfs', 'rdf', 'dc'] + + for key in common_keys: + self.assertIn(key, fuseki_prefixes) + self.assertIn(key, neptune_prefixes) + + +if __name__ == '__main__': + unittest.main() diff --git a/whyis/config/default.py b/whyis/config/default.py index 285c582c5..bef718aa3 100644 --- a/whyis/config/default.py +++ b/whyis/config/default.py @@ -87,7 +87,7 @@ class Config: MULTIUSER = True PLUGINENGINE_NAMESPACE = "whyis" - PLUGINENGINE_PLUGINS = ['whyis_sparql_entity_resolver'] + PLUGINENGINE_PLUGINS = ['whyis_fuseki_search'] SECURITY_EMAIL_SENDER = "Name " SECURITY_FLASH_MESSAGES = True diff 
--git a/whyis/default_vocab.ttl b/whyis/default_vocab.ttl index 67687ca9e..2f5a1449a 100644 --- a/whyis/default_vocab.ttl +++ b/whyis/default_vocab.ttl @@ -495,15 +495,11 @@ whyis:searchApi whyis:hasView "search-api.json". a whyis:search . -whyis:HomePage whyis:searchView "search.html"; - whyis:searchData "search.json". +whyis:HomePage whyis:searchView "search.html". whyis:searchView rdfs:subPropertyOf whyis:hasView; dc:identifier "search". -whyis:searchData rdfs:subPropertyOf whyis:hasView; - dc:identifier "search_data". - # whyis:search whyis:hasView "search-view.html"; # whyis:searchApi "search-api.json". diff --git a/whyis/plugins/fuseki_search/README.md b/whyis/plugins/fuseki_search/README.md new file mode 100644 index 000000000..f4d7d2fba --- /dev/null +++ b/whyis/plugins/fuseki_search/README.md @@ -0,0 +1,88 @@ +# Fuseki Search Plugin + +This plugin provides full-text search capabilities using Apache Jena Fuseki's text search functionality. + +## Overview + +The Fuseki Search plugin integrates with Apache Jena Fuseki's full-text search using the `text:search` predicate. It provides: +- Entity resolution via full-text search +- Search data view for the search interface +- Support for context-aware and type-filtered search + +## Configuration + +To use this plugin, set the following configuration in your Whyis application: + +```python +RESOLVER_TYPE = 'fuseki' # or 'sparql' (both use this plugin) +RESOLVER_DB = 'knowledge' # name of the database to search +PLUGINENGINE_PLUGINS = ['whyis_fuseki_search'] +``` + +## Features + +### Entity Resolution + +The plugin implements the `on_resolve` method to search entities by term, with optional filters: +- `term`: Search term (required) +- `type`: RDF type to filter results (optional) +- `context`: Context for relevance boosting (optional) +- `label`: Whether to fetch labels for results (default: True) + +### Search View + +The plugin registers a `search.json` view that provides full-text search results. 
This view is accessible via the `?view=search_data` parameter on the HomePage resource. + +## SPARQL Query Syntax + +The plugin uses Apache Jena's text search syntax: + +```sparql +(?label ?relevance) text:search 'search_term'. +``` + +This requires that Fuseki is configured with a text index. See [Jena Text Search documentation](https://jena.apache.org/documentation/query/text-query.html) for configuration details. + +## Fuseki Text Index Configuration + +To use this plugin effectively, your Fuseki server must be configured with a text index. Example configuration: + +```turtle +<#text_dataset> rdf:type text:TextDataset ; + text:dataset <#dataset> ; + text:index <#indexLucene> . + +<#indexLucene> a text:TextIndexLucene ; + text:directory ; + text:entityMap <#entMap> . + +<#entMap> a text:EntityMap ; + text:entityField "uri" ; + text:defaultField "label" ; + text:map ( + [ text:field "label" ; text:predicate rdfs:label ] + [ text:field "prefLabel" ; text:predicate skos:prefLabel ] + [ text:field "title" ; text:predicate dc:title ] + ) . 
+``` + +## Search Properties + +The plugin searches across multiple properties: +- `dc:title` +- `rdfs:label` +- `skos:prefLabel` +- `skos:altLabel` +- `foaf:name` +- `dc:identifier` +- `schema:name` +- `skos:notation` + +## Filtered Resources + +The following resource types are excluded from search results: +- `sio:Term` (Semantic Science Integrated Ontology terms) +- `np:Nanopublication` +- `np:Assertion` +- `np:Provenance` +- `np:PublicationInfo` diff --git a/whyis/plugins/sparql_entity_resolver/__init__.py b/whyis/plugins/fuseki_search/__init__.py similarity index 100% rename from whyis/plugins/sparql_entity_resolver/__init__.py rename to whyis/plugins/fuseki_search/__init__.py diff --git a/whyis/plugins/sparql_entity_resolver/plugin.py b/whyis/plugins/fuseki_search/plugin.py similarity index 82% rename from whyis/plugins/sparql_entity_resolver/plugin.py rename to whyis/plugins/fuseki_search/plugin.py index e2659a3a0..a3ddabc58 100644 --- a/whyis/plugins/sparql_entity_resolver/plugin.py +++ b/whyis/plugins/fuseki_search/plugin.py @@ -1,4 +1,4 @@ -from whyis.plugin import Plugin, EntityResolverListener +from whyis.plugin import Plugin, EntityResolverListener, PluginBlueprint import rdflib from flask import current_app @@ -14,7 +14,7 @@ dc = rdflib.URIRef("http://purl.org/dc/terms/") ) -class SPARQLEntityResolver(EntityResolverListener): +class FusekiEntityResolver(EntityResolverListener): context_query=""" optional { @@ -76,7 +76,7 @@ def on_resolve(self, term, type=None, context=None, label=True): type_query = '' if type is not None: - type_query = self.type_query% type + type_query = self.type_query % type query = self.query % (term, type_query, context_query) #print(query) @@ -94,15 +94,21 @@ def on_resolve(self, term, type=None, context=None, label=True): return results -class SPARQLEntityResolverPlugin(Plugin): +class FusekiSearchPlugin(Plugin): resolvers = { - "sparql" : SPARQLEntityResolver, - "fuseki" : SPARQLEntityResolver + "sparql" : 
FusekiEntityResolver, + "fuseki" : FusekiEntityResolver } + def create_blueprint(self): + blueprint = PluginBlueprint('fuseki_search', __name__, template_folder='templates') + return blueprint + def init(self): resolver_type = self.app.config.get('RESOLVER_TYPE', 'fuseki') resolver_db = self.app.config.get('RESOLVER_DB', "knowledge") + if resolver_type not in self.resolvers: + raise ValueError(f"Invalid RESOLVER_TYPE '{resolver_type}'. Valid options: {list(self.resolvers.keys())}") resolver = self.resolvers[resolver_type](resolver_db) self.app.add_listener(resolver) diff --git a/whyis/plugins/fuseki_search/templates/search.json b/whyis/plugins/fuseki_search/templates/search.json new file mode 100644 index 000000000..ea2961064 --- /dev/null +++ b/whyis/plugins/fuseki_search/templates/search.json @@ -0,0 +1,14 @@ +{{''' + SELECT ?identifier (sample(?d) as ?description) (max(?s) as ?score) (sample(?o) as ?text) + WHERE { + (?o ?s) text:search ?query . + filter(lang(?o) = "" || langMatches(lang(?o), "en")) + ?identifier ?p ?o . + filter(!isBlank(?identifier)) + OPTIONAL { + ?identifier dc:description|skos:definition|rdfs:comment|sioc:content|dc:abstract|dc:summary|rdfs:comment|dcelements:description|prov:value|sio:hasValue ?d. + filter(lang(?d) = "" || langMatches(lang(?d), "en")) + } + } group by ?identifier having max(?s) + ORDER BY DESC(?score) + LIMIT 1000''' | query(values={"query":rdflib.Literal(args['query'])}) | iter_labelize("identifier","label") | tojson }} diff --git a/whyis/plugins/fuseki_search/vocab.ttl b/whyis/plugins/fuseki_search/vocab.ttl new file mode 100644 index 000000000..afbc66fe8 --- /dev/null +++ b/whyis/plugins/fuseki_search/vocab.ttl @@ -0,0 +1,10 @@ +@prefix : . +@prefix dc: . +@prefix rdfs: . +@prefix whyis: . + +# Search data view registration for Fuseki full-text search +whyis:HomePage whyis:searchData "search.json". + +whyis:searchData rdfs:subPropertyOf whyis:hasView; + dc:identifier "search_data". 
diff --git a/whyis/plugins/neptune_search/README.md b/whyis/plugins/neptune_search/README.md new file mode 100644 index 000000000..8f2da34eb --- /dev/null +++ b/whyis/plugins/neptune_search/README.md @@ -0,0 +1,144 @@ +# Neptune Search Plugin + +This plugin provides full-text search capabilities using AWS Neptune's OpenSearch integration. + +## Overview + +The Neptune Search plugin integrates with AWS Neptune's OpenSearch full-text search using the `fts:search` predicate. It provides: +- Entity resolution via full-text search +- Search data view for the search interface +- Support for context-aware and type-filtered search +- Compatible with Neptune's OpenSearch backend + +## Configuration + +To use this plugin, set the following configuration in your Whyis application: + +```python +RESOLVER_TYPE = 'neptune' +RESOLVER_DB = 'knowledge' # name of the database to search +PLUGINENGINE_PLUGINS = ['whyis_neptune_search'] +``` + +## Features + +### Entity Resolution + +The plugin implements the `on_resolve` method to search entities by term, with optional filters: +- `term`: Search term (required) +- `type`: RDF type to filter results (optional) +- `context`: Context for relevance boosting (optional) +- `label`: Whether to fetch labels for results (default: True) + +### Search View + +The plugin registers a `search.json` view that provides full-text search results. This view is accessible via the `?view=search_data` parameter on the HomePage resource. + +## SPARQL Query Syntax + +The plugin uses AWS Neptune's full-text search syntax with OpenSearch via a SERVICE clause: + +```sparql +SERVICE { + [] fts:search 'search_term' ; + fts:matchQuery '*' ; + fts:entity ?node ; + fts:score ?relevance . +} +``` + +This requires that Neptune is configured with OpenSearch integration enabled. See [Neptune Full-Text Search documentation](https://docs.aws.amazon.com/neptune/latest/userguide/full-text-search.html) for configuration details. 
+ +The SERVICE clause parameters: +- `fts:search` - The search term/query string +- `fts:matchQuery` - The field pattern to search (use '*' for all indexed fields) +- `fts:entity` - Returns the matching RDF resource URI +- `fts:score` - Returns the relevance score + +## Neptune OpenSearch Configuration + +To use this plugin, your Neptune cluster must have: + +1. **OpenSearch Integration Enabled**: Neptune must be configured to integrate with Amazon OpenSearch Service +2. **Full-Text Search Endpoint**: The Neptune cluster must have a full-text search endpoint configured +3. **Indexed Properties**: Properties to be searched must be indexed in OpenSearch + +### Example Configuration Steps + +1. Enable OpenSearch integration on your Neptune cluster +2. Configure the OpenSearch domain +3. Index the properties you want to search (see Search Properties below) +4. Ensure proper IAM roles and permissions are configured + +For detailed setup instructions, refer to the [AWS Neptune documentation](https://docs.aws.amazon.com/neptune/latest/userguide/full-text-search.html). + +## Search Properties + +The plugin searches across multiple properties: +- `dc:title` +- `rdfs:label` +- `skos:prefLabel` +- `skos:altLabel` +- `foaf:name` +- `dc:identifier` +- `schema:name` +- `skos:notation` + +## Filtered Resources + +The following resource types are excluded from search results: +- `sio:Term` (Semantic Science Integrated Ontology terms) +- `np:Nanopublication` +- `np:Assertion` +- `np:Provenance` +- `np:PublicationInfo` + +## Differences from Fuseki Search + +The main differences between Neptune and Fuseki search plugins are: + +1. **Namespace**: Neptune uses `fts:` (http://aws.amazon.com/neptune/vocab/v01/services/fts#) instead of `text:` (http://jena.apache.org/fulltext#) +2. **Query Syntax**: + - Fuseki: `(?label ?relevance) text:search 'term'` + - Neptune: Uses SERVICE clause with `fts:search`, `fts:matchQuery`, `fts:entity`, and `fts:score` +3. 
**SERVICE Clause**: Neptune requires a SERVICE clause pointing to `<http://aws.amazon.com/neptune/vocab/v01/services/fts#search>` +4. **Backend**: Fuseki uses Apache Lucene; Neptune uses Amazon OpenSearch Service +5. **Configuration**: Fuseki configuration is in assembler files; Neptune is configured via AWS console/API + +### Example Comparison + +**Fuseki:** +```sparql +SELECT ?node ?score WHERE { + (?label ?score) text:search 'term' . + ?node rdfs:label ?label . +} +``` + +**Neptune:** +```sparql +SELECT ?node ?score WHERE { + SERVICE <http://aws.amazon.com/neptune/vocab/v01/services/fts#search> { + [] fts:search 'term' ; + fts:matchQuery '*' ; + fts:entity ?node ; + fts:score ?score . + } +} +``` + +## Connection String + +When connecting to Neptune with OpenSearch, ensure your SPARQL endpoint URL includes the proper Neptune endpoint. Example: + +```python +KNOWLEDGE_ENDPOINT = 'https://your-neptune-cluster.region.neptune.amazonaws.com:8182/sparql' +``` + +## IAM Authentication + +Neptune typically requires IAM authentication. Ensure your application has proper AWS credentials configured with permissions to: +- Execute queries on Neptune +- Access the OpenSearch domain (if applicable) + +Refer to the [Neptune IAM documentation](https://docs.aws.amazon.com/neptune/latest/userguide/iam-auth.html) for authentication setup.
from whyis.plugin import Plugin, EntityResolverListener, PluginBlueprint
import rdflib
from flask import current_app


# Prefix -> namespace map passed to graph.query() as initNs; each entry is
# emitted as a PREFIX declaration ahead of the query text.
prefixes = dict(
    skos = rdflib.URIRef("http://www.w3.org/2004/02/skos/core#"),
    foaf = rdflib.URIRef("http://xmlns.com/foaf/0.1/"),
    fts = rdflib.URIRef("http://aws.amazon.com/neptune/vocab/v01/services/fts#"),
    ftsEndpoint = rdflib.URIRef("http://aws.amazon.com/neptune/vocab/v01/services/fts"),
    schema = rdflib.URIRef("http://schema.org/"),
    owl = rdflib.OWL,
    rdfs = rdflib.RDFS,
    rdf = rdflib.RDF,
    dc = rdflib.URIRef("http://purl.org/dc/terms/")
)


class NeptuneEntityResolver(EntityResolverListener):
    """
    Entity resolver for AWS Neptune with OpenSearch full-text search integration.
    Uses Neptune's SERVICE clause with fts:search for full-text queries.

    Based on AWS Neptune documentation:
    https://docs.aws.amazon.com/neptune/latest/userguide/full-text-search-sparql-examples.html
    """

    # Optional context clause: requires ?node to be linked (via any predicate)
    # to an entity matching the context term, binding its score as ?cr.
    # NOTE(review): `ftsEndpoint:` is a prefixed-name form (PNAME_NS) that
    # resolves through initNs to the Neptune fts service namespace; a bare
    # `ftsEndpoint` token is not valid SPARQL. Confirm this IRI matches your
    # cluster's full-text-search endpoint configuration.
    context_query = """
    optional {
        SERVICE ftsEndpoint: {
            [] fts:search '%s' ;
               fts:matchQuery '%s' ;
               fts:entity ?context ;
               fts:score ?cr .
        }
        ?node ?p ?context.
    }
"""

    # Restricts results to instances of the given RDF type IRI.
    type_query = """
    ?node rdf:type <%s> .
"""

    # Main resolution query: full-text match binds ?node and ?relevance,
    # labels are gathered from the common label properties, nanopublication
    # bookkeeping resources and sio:Term individuals are filtered out, and the
    # top 10 hits by relevance are returned. The type IRIs in the filters are
    # the ones listed in this plugin's README ("Filtered Resources").
    query = """
select distinct
?node
?label
(group_concat(distinct ?type; separator="||") as ?types)
(?relevance as ?score)
where {
    SERVICE ftsEndpoint: {
        [] fts:search '%s' ;
           fts:matchQuery '%s' ;
           fts:entity ?node ;
           fts:score ?relevance .
    }
    ?node dc:title|rdfs:label|skos:prefLabel|skos:altLabel|foaf:name|dc:identifier|schema:name|skos:notation ?label.
    %s
    optional {
        ?node rdf:type ?type.
    }

    %s

    filter not exists {
        ?node a <http://semanticscience.org/resource/Term>
    }
    filter not exists {
        ?node a <http://www.nanopub.org/nschema#Nanopublication>
    }
    filter not exists {
        ?node a <http://www.nanopub.org/nschema#Assertion>
    }
    filter not exists {
        ?node a <http://www.nanopub.org/nschema#Provenance>
    }
    filter not exists {
        ?node a <http://www.nanopub.org/nschema#PublicationInfo>
    }
} group by ?node ?label ?relevance order by desc(?relevance) limit 10"""

    def __init__(self, database="knowledge"):
        """
        :param database: key into ``current_app.databases`` naming the graph
            to query (default ``"knowledge"``).
        """
        self.database = database

    @staticmethod
    def _escape_literal(value):
        # Escape backslashes and single quotes so a user-supplied string
        # cannot break out of the single-quoted SPARQL literal it is
        # interpolated into.
        return str(value).replace('\\', '\\\\').replace("'", "\\'")

    def on_resolve(self, term, type=None, context=None, label=True):
        """
        Resolve entities matching *term* via Neptune full-text search.

        :param term: search string (required).
        :param type: optional RDF type IRI used to filter results.
        :param context: optional context term used for relevance boosting.
        :param label: when True, attach human-readable labels via
            ``current_app.labelize``.
        :return: list of result dicts (node, label, types, score).
        """
        graph = current_app.databases[self.database]

        context_clause = ''
        if context is not None:
            escaped_context = self._escape_literal(context)
            context_clause = self.context_query % (escaped_context, escaped_context)

        type_clause = ''
        if type is not None:
            # NOTE(review): *type* is interpolated into <...> unescaped; it is
            # expected to be a trusted IRI -- verify callers.
            type_clause = self.type_query % type

        # Neptune requires the search term and matchQuery (field to search).
        # For entity resolution we search across all indexed fields with '*'.
        query = self.query % (self._escape_literal(term), '*', type_clause, context_clause)

        results = []
        for hit in graph.query(query, initNs=prefixes):
            result = hit.asdict()
            # group_concat packed the types into a ||-separated string; unpack
            # into the [{'uri': ...}, ...] shape labelize expects.
            result['types'] = [{'uri': x} for x in result.get('types', '').split('||') if x]
            if label:
                current_app.labelize(result, 'node', 'preflabel')
                result['types'] = [
                    current_app.labelize(x, 'uri', 'label')
                    for x in result['types']
                ]
            results.append(result)
        return results


class NeptuneSearchPlugin(Plugin):
    """Whyis plugin wiring up the Neptune entity resolver and search views."""

    # Maps RESOLVER_TYPE config values to resolver classes.
    resolvers = {
        "neptune" : NeptuneEntityResolver
    }

    def create_blueprint(self):
        """Expose this plugin's templates (e.g. search.json) to Flask."""
        blueprint = PluginBlueprint('neptune_search', __name__, template_folder='templates')
        return blueprint

    def init(self):
        """Register the resolver selected by RESOLVER_TYPE against RESOLVER_DB."""
        resolver_type = self.app.config.get('RESOLVER_TYPE', 'fuseki')
        resolver_db = self.app.config.get('RESOLVER_DB', "knowledge")
        if resolver_type in self.resolvers:
            resolver = self.resolvers[resolver_type](resolver_db)
            self.app.add_listener(resolver)
        # Silently skip if not in resolvers - another plugin may handle this type
a/whyis/plugins/neptune_search/templates/search.json b/whyis/plugins/neptune_search/templates/search.json new file mode 100644 index 000000000..13318c829 --- /dev/null +++ b/whyis/plugins/neptune_search/templates/search.json @@ -0,0 +1,19 @@ +{{''' + SELECT ?identifier (sample(?d) as ?description) (max(?s) as ?score) (sample(?label) as ?text) + WHERE { + SERVICE <http://aws.amazon.com/neptune/vocab/v01/services/fts#search> { + [] fts:search ?query ; + fts:matchQuery "*" ; + fts:entity ?identifier ; + fts:score ?s . + } + ?identifier ?p ?label . + filter(!isBlank(?identifier)) + filter(lang(?label) = "" || langMatches(lang(?label), "en")) + OPTIONAL { + ?identifier dc:description|skos:definition|rdfs:comment|sioc:content|dc:abstract|dc:summary|dcelements:description|prov:value|sio:hasValue ?d. + filter(lang(?d) = "" || langMatches(lang(?d), "en")) + } + } group by ?identifier having (max(?s) > 0) + ORDER BY DESC(?score) + LIMIT 1000''' | query(values={"query":rdflib.Literal(args['query'])}) | iter_labelize("identifier","label") | tojson }} diff --git a/whyis/plugins/neptune_search/vocab.ttl b/whyis/plugins/neptune_search/vocab.ttl new file mode 100644 index 000000000..abaad3375 --- /dev/null +++ b/whyis/plugins/neptune_search/vocab.ttl @@ -0,0 +1,10 @@ +# NOTE(review): prefix IRIs were stripped in the diff; default namespace restored as the whyis vocab — TODO confirm. +@prefix : <http://vocab.rpi.edu/whyis/> . +@prefix dc: <http://purl.org/dc/terms/> . +@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> . +@prefix whyis: <http://vocab.rpi.edu/whyis/> . + +# Search data view registration for Neptune OpenSearch full-text search +whyis:HomePage whyis:searchData "search.json". + +whyis:searchData rdfs:subPropertyOf whyis:hasView; +dc:identifier "search_data".