diff --git a/docs/rdf_file_loader_agent.md b/docs/rdf_file_loader_agent.md new file mode 100644 index 00000000..02a673b3 --- /dev/null +++ b/docs/rdf_file_loader_agent.md @@ -0,0 +1,136 @@ +# RDF File Loader Agent + +## Overview + +The RDF File Loader agent automatically loads RDF files into the Whyis knowledge graph as nanopublications. It monitors resources typed as `whyis:RDFFile` and loads their content. + +## Features + +- **Multiple Source Support:** + - Local files from the file depot (via `whyis:hasFileID`) + - Remote HTTP/HTTPS URLs + - S3 URIs (requires boto3 to be installed) + +- **Format Detection:** + - Automatic format detection from file extensions and content types + - Supports: Turtle (.ttl), RDF/XML (.rdf, .owl), JSON-LD (.jsonld), N-Triples (.nt), N3 (.n3), TriG (.trig), N-Quads (.nq) + +- **Provenance Tracking:** + - Resources are marked with `whyis:RDFFile` type before processing + - After loading, marked as `whyis:LoadedRDFFile` + - Activities are tracked as `whyis:RDFFileLoadingActivity` + - Proper nanopublication structure with provenance + +## Usage + +### 1. Add the agent to your configuration + +In your application's config file: + +```python +from whyis import autonomic + +class Config: + INFERENCERS = { + 'RDFFileLoader': autonomic.RDFFileLoader(), + # ... other agents + } +``` + +### 2. Mark resources as RDF files + +Create a nanopublication that types a resource as `whyis:RDFFile`: + +```turtle +@prefix whyis: . +@prefix rdf: . + + a whyis:RDFFile . +``` + +### 3. Loading from different sources + +#### Local File Depot + +For files already uploaded to the file depot: + +```turtle + a whyis:RDFFile ; + whyis:hasFileID "file_depot_id_here" . +``` + +#### HTTP/HTTPS URL + +Simply use the URL as the resource URI: + +```turtle + a whyis:RDFFile . +``` + +or + +```turtle + a whyis:RDFFile . +``` + +#### S3 URI + +For files stored in S3 (requires boto3): + +```turtle + a whyis:RDFFile . +``` + +**Note:** Ensure boto3 is installed and AWS credentials are configured: +```bash +pip install boto3 +``` + +AWS credentials can be configured via: +- Environment variables (AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY) +- AWS credentials file (~/.aws/credentials) +- IAM role (when running on EC2) + +## How It Works + +1. The agent queries for resources typed as `whyis:RDFFile` that are not yet `whyis:LoadedRDFFile` +2. For each resource: + - Checks if it has a `whyis:hasFileID` (file depot) + - Otherwise, examines the URI scheme (http://, https://, s3://) + - Downloads and parses the RDF content + - Adds the loaded triples to a nanopublication + - Marks the resource as `whyis:LoadedRDFFile` +3. The nanopublication includes provenance linking back to the source file + +## Retirement + +When a resource is no longer typed as `whyis:RDFFile`, the agent's update mechanism will retire the associated nanopublications containing the loaded data. + +## Testing + +The agent includes 26 comprehensive unit tests covering: +- Basic functionality +- Format detection +- HTTP/HTTPS loading +- S3 loading (with and without boto3) +- File depot access +- Error handling + +Run tests with: +```bash +pytest tests/unit/test_rdf_file_loader*.py +``` + +## Error Handling + +- **Missing boto3:** Gracefully fails with a clear error message when trying to load from S3 +- **Invalid RDF:** Logs errors when content cannot be parsed +- **Network errors:** Propagates HTTP errors with proper logging +- **Missing files:** Reports file depot access errors + +## Example Use Cases + +1. **Bulk Data Import:** Mark multiple HTTP URLs as RDFFile to automatically import external datasets +2. **S3 Data Pipeline:** Load RDF files from S3 buckets as part of a data processing pipeline +3. **File Upload Processing:** When users upload RDF files, mark them as RDFFile for automatic processing +4. **Ontology Loading:** Automatically load and update ontologies from remote URLs diff --git a/tests/unit/test_rdf_file_loader_basic.py b/tests/unit/test_rdf_file_loader_basic.py new file mode 100644 index 00000000..11d96392 --- /dev/null +++ b/tests/unit/test_rdf_file_loader_basic.py @@ -0,0 +1,160 @@ +""" +Simple unit tests for RDFFileLoader agent that don't require full app context. + +Tests basic functionality like format guessing and URI parsing. +""" + +import pytest +from unittest.mock import Mock, patch +from rdflib import URIRef + +from whyis.autonomic.rdf_file_loader import RDFFileLoader +from whyis.namespace import whyis + + +class TestRDFFileLoaderBasic: + """Basic tests for RDFFileLoader that don't require app context.""" + + def test_agent_initialization(self): + """Test that RDFFileLoader agent can be initialized.""" + agent = RDFFileLoader() + assert agent is not None + assert hasattr(agent, 'activity_class') + assert agent.activity_class == whyis.RDFFileLoadingActivity + + def test_agent_input_class(self): + """Test that RDFFileLoader returns correct input class.""" + agent = RDFFileLoader() + input_class = agent.getInputClass() + assert input_class == whyis.RDFFile + + def test_agent_output_class(self): + """Test that RDFFileLoader returns correct output class.""" + agent = RDFFileLoader() + output_class = agent.getOutputClass() + assert output_class == whyis.LoadedRDFFile + + def test_agent_has_query(self): + """Test that RDFFileLoader has get_query method.""" + agent = RDFFileLoader() + assert hasattr(agent, 'get_query') + assert callable(agent.get_query) + query = agent.get_query() + assert 'RDFFile' in query + assert 'LoadedRDFFile' in query + + def test_format_guessing_turtle(self): + """Test RDF format guessing for Turtle files.""" + agent = RDFFileLoader() + + # Test by filename + assert agent._guess_format('test.ttl', None) == 'turtle' + assert agent._guess_format('test.turtle', None) == 'turtle' + + # Test by content type + assert agent._guess_format(None, 'text/turtle') == 'turtle' + assert agent._guess_format('file.dat', 'text/turtle') == 'turtle' + + def test_format_guessing_rdfxml(self): + """Test RDF format guessing for RDF/XML files.""" + agent = RDFFileLoader() + + # Test by filename + assert agent._guess_format('test.rdf', None) == 'xml' + assert agent._guess_format('test.owl', None) == 'xml' + assert agent._guess_format('test.xml', None) == 'xml' + + # Test by content type + assert agent._guess_format(None, 'application/rdf+xml') == 'xml' + + def test_format_guessing_jsonld(self): + """Test RDF format guessing for JSON-LD files.""" + agent = RDFFileLoader() + + # Test by filename + assert agent._guess_format('test.jsonld', None) == 'json-ld' + assert agent._guess_format('test.json-ld', None) == 'json-ld' + + # Test by content type + assert agent._guess_format(None, 'application/ld+json') == 'json-ld' + + def test_format_guessing_ntriples(self): + """Test RDF format guessing for N-Triples files.""" + agent = RDFFileLoader() + + # Test by filename + assert agent._guess_format('test.nt', None) == 'nt' + + # Test by content type + assert agent._guess_format(None, 'application/n-triples') == 'nt' + + def test_format_guessing_n3(self): + """Test RDF format guessing for N3 files.""" + agent = RDFFileLoader() + + # Test by filename + assert agent._guess_format('test.n3', None) == 'n3' + + # Test by content type + assert agent._guess_format(None, 'text/n3') == 'n3' + + def test_format_guessing_trig(self): + """Test RDF format guessing for TriG files.""" + agent = RDFFileLoader() + + # Test by filename + assert agent._guess_format('test.trig', None) == 'trig' + + # Test by content type + assert agent._guess_format(None, 'application/trig') == 'trig' + + def test_format_guessing_nquads(self): + """Test RDF format guessing for N-Quads files.""" + agent = RDFFileLoader() + + # Test by filename + assert agent._guess_format('test.nq', None) == 'nquads' + + def test_format_guessing_default(self): + """Test that format guessing defaults to turtle.""" + agent = RDFFileLoader() + + # No filename or content type + assert agent._guess_format(None, None) == 'turtle' + + # Unknown extension + assert agent._guess_format('test.unknown', None) == 'turtle' + + # Unknown content type + assert agent._guess_format(None, 'application/unknown') == 'turtle' + + def test_load_from_s3_without_boto3(self): + """Test that loading from S3 fails gracefully when boto3 is not installed.""" + agent = RDFFileLoader() + + # Mock boto3 import to fail by patching it in the function + with patch.dict('sys.modules', {'boto3': None}): + with pytest.raises(ImportError) as exc_info: + agent._load_from_s3('s3://bucket/key.ttl') + + assert 'boto3' in str(exc_info.value).lower() + + def test_load_from_s3_invalid_uri(self): + """Test that invalid S3 URIs are rejected.""" + agent = RDFFileLoader() + + # Mock boto3 module + mock_boto3_module = Mock() + mock_s3_client = Mock() + mock_boto3_module.client.return_value = mock_s3_client + + with patch.dict('sys.modules', {'boto3': mock_boto3_module}): + # Invalid URI (no bucket/key) + with pytest.raises(ValueError) as exc_info: + agent._load_from_s3('s3://bucket-only') + assert 'Invalid S3 URI' in str(exc_info.value) + + # Invalid URI (not s3://) + with pytest.raises(ValueError) as exc_info: + agent._load_from_s3('http://not-s3.com/file.ttl') + assert 'Invalid S3 URI' in str(exc_info.value) diff --git a/tests/unit/test_rdf_file_loader_integration.py b/tests/unit/test_rdf_file_loader_integration.py new file mode 100644 index 00000000..919149b9 --- /dev/null +++ b/tests/unit/test_rdf_file_loader_integration.py @@ -0,0 +1,307 @@ +""" +Integration tests for RDFFileLoader agent with mocked HTTP, S3, and file depot. + +These tests use mocks to simulate HTTP requests, S3 access, and file depot operations +without requiring external dependencies or a full app context. +""" + +import pytest +from unittest.mock import Mock, patch, MagicMock +from io import BytesIO +from rdflib import Graph, URIRef, RDF + +from whyis.autonomic.rdf_file_loader import RDFFileLoader + + +# Test RDF data in Turtle format +test_rdf_turtle = """ +@prefix rdf: . +@prefix rdfs: . +@prefix ex: . + +ex:subject1 a ex:Class1 ; + rdfs:label "Test Subject 1" ; + ex:property "Test Value" . + +ex:subject2 a ex:Class2 ; + rdfs:label "Test Subject 2" ; + ex:relatedTo ex:subject1 . +""" + +# Test RDF data in RDF/XML format +test_rdf_xml = """ + + + Test Subject 1 + Test Value + + +""" + + +class TestRDFFileLoaderHTTP: + """Tests for loading RDF files via HTTP/HTTPS.""" + + def test_load_from_http_turtle(self): + """Test loading RDF from HTTP URL with Turtle format.""" + agent = RDFFileLoader() + + # Mock requests.get + mock_response = Mock() + mock_response.status_code = 200 + mock_response.text = test_rdf_turtle + mock_response.headers = {'content-type': 'text/turtle'} + mock_response.raise_for_status = Mock() + + with patch('requests.get', return_value=mock_response): + graph = agent._load_from_http('http://example.com/data.ttl') + + # Verify + assert graph is not None + assert len(graph) > 0 + assert (URIRef('http://example.com/subject1'), + RDF.type, + URIRef('http://example.com/Class1')) in graph + + def test_load_from_https_rdfxml(self): + """Test loading RDF from HTTPS URL with RDF/XML format.""" + agent = RDFFileLoader() + + # Mock requests.get + mock_response = Mock() + mock_response.status_code = 200 + mock_response.text = test_rdf_xml + mock_response.headers = {'content-type': 'application/rdf+xml'} + mock_response.raise_for_status = Mock() + + with patch('requests.get', return_value=mock_response): + graph = agent._load_from_http('https://example.com/data.rdf') + + # Verify + assert graph is not None + assert len(graph) > 0 + # Check that at least one triple was loaded + assert len(list(graph.triples((None, None, None)))) > 0 + + def test_load_from_http_with_content_negotiation(self): + """Test that HTTP requests include proper Accept headers.""" + agent = RDFFileLoader() + + mock_response = Mock() + mock_response.status_code = 200 + mock_response.text = test_rdf_turtle + mock_response.headers = {'content-type': 'text/turtle'} + mock_response.raise_for_status = Mock() + + with patch('requests.get', return_value=mock_response) as mock_get: + graph = agent._load_from_http('http://example.com/data') + + # Verify that requests.get was called with Accept headers + mock_get.assert_called_once() + call_args = mock_get.call_args + assert 'headers' in call_args[1] + assert 'Accept' in call_args[1]['headers'] + + def test_load_from_http_error_handling(self): + """Test error handling for HTTP failures.""" + agent = RDFFileLoader() + + # Mock a failed HTTP request + mock_response = Mock() + mock_response.status_code = 404 + mock_response.raise_for_status.side_effect = Exception("404 Not Found") + + with patch('requests.get', return_value=mock_response): + with pytest.raises(Exception): + agent._load_from_http('http://example.com/nonexistent.ttl') + + +class TestRDFFileLoaderS3: + """Tests for loading RDF files from S3.""" + + def test_load_from_s3_success(self): + """Test successful loading from S3.""" + agent = RDFFileLoader() + + # Create mock boto3 module and client + mock_s3_client = Mock() + mock_boto3_module = Mock() + mock_boto3_module.client.return_value = mock_s3_client + + # Mock file download - write directly to the file path + call_count = {'count': 0} + def mock_download_file(bucket, key, filename): + call_count['count'] += 1 + with open(filename, 'w') as f: + f.write(test_rdf_turtle) + + mock_s3_client.download_file = mock_download_file + + with patch.dict('sys.modules', {'boto3': mock_boto3_module}): + graph = agent._load_from_s3('s3://test-bucket/data.ttl') + + # Verify + assert graph is not None + assert len(graph) > 0 + assert (URIRef('http://example.com/subject1'), + RDF.type, + URIRef('http://example.com/Class1')) in graph + + # Verify boto3 was called correctly + mock_boto3_module.client.assert_called_once_with('s3') + assert call_count['count'] == 1 + + def test_load_from_s3_uri_parsing(self): + """Test that S3 URIs are correctly parsed.""" + agent = RDFFileLoader() + + mock_s3_client = Mock() + mock_boto3_module = Mock() + mock_boto3_module.client.return_value = mock_s3_client + + def mock_download_file(bucket, key, filename): + # Verify bucket and key are parsed correctly + assert bucket == 'my-bucket' + assert key == 'path/to/file.ttl' + with open(filename, 'w') as f: + f.write(test_rdf_turtle) + + mock_s3_client.download_file = mock_download_file + + with patch.dict('sys.modules', {'boto3': mock_boto3_module}): + graph = agent._load_from_s3('s3://my-bucket/path/to/file.ttl') + assert graph is not None + + def test_load_from_s3_with_format_detection(self): + """Test that format is detected from S3 key extension.""" + agent = RDFFileLoader() + + mock_s3_client = Mock() + mock_boto3_module = Mock() + mock_boto3_module.client.return_value = mock_s3_client + + def mock_download_file(bucket, key, filename): + with open(filename, 'w') as f: + f.write(test_rdf_xml) + + mock_s3_client.download_file = mock_download_file + + with patch.dict('sys.modules', {'boto3': mock_boto3_module}): + # Test with .rdf extension + graph = agent._load_from_s3('s3://bucket/file.rdf') + assert graph is not None + assert len(graph) > 0 + + +class TestRDFFileLoaderFileDepot: + """Tests for loading RDF files from local file depot.""" + + def test_load_from_file_depot_turtle(self): + """Test loading RDF from file depot with Turtle format.""" + agent = RDFFileLoader() + + # Create a mock stored file + mock_stored_file = Mock() + mock_stored_file.name = 'test.ttl' + mock_stored_file.content_type = 'text/turtle' + mock_stored_file.read.return_value = test_rdf_turtle.encode('utf-8') + mock_stored_file.__enter__ = Mock(return_value=mock_stored_file) + mock_stored_file.__exit__ = Mock(return_value=None) + + # Mock flask.current_app.file_depot + mock_app = Mock() + mock_app.file_depot.get.return_value = mock_stored_file + + with patch('flask.current_app', mock_app): + graph = agent._load_from_file_depot( + URIRef('http://example.com/file1'), + 'test_fileid' + ) + + # Verify + assert graph is not None + assert len(graph) > 0 + assert (URIRef('http://example.com/subject1'), + RDF.type, + URIRef('http://example.com/Class1')) in graph + + def test_load_from_file_depot_format_detection(self): + """Test format detection from file depot content type.""" + agent = RDFFileLoader() + + # Create a mock stored file with XML content + mock_stored_file = Mock() + mock_stored_file.name = 'test.dat' # Ambiguous extension + mock_stored_file.content_type = 'application/rdf+xml' # Clear content type + mock_stored_file.read.return_value = test_rdf_xml.encode('utf-8') + mock_stored_file.__enter__ = Mock(return_value=mock_stored_file) + mock_stored_file.__exit__ = Mock(return_value=None) + + mock_app = Mock() + mock_app.file_depot.get.return_value = mock_stored_file + + with patch('flask.current_app', mock_app): + graph = agent._load_from_file_depot( + URIRef('http://example.com/file2'), + 'test_fileid_2' + ) + + # Verify + assert graph is not None + assert len(graph) > 0 + + def test_load_from_file_depot_error_handling(self): + """Test error handling when file depot access fails.""" + agent = RDFFileLoader() + + # Mock file depot to raise an error + mock_app = Mock() + mock_app.file_depot.get.side_effect = Exception("File not found in depot") + + with patch('flask.current_app', mock_app): + with pytest.raises(Exception): + agent._load_from_file_depot( + URIRef('http://example.com/file3'), + 'nonexistent_fileid' + ) + + +class TestRDFFileLoaderErrorHandling: + """Tests for error handling in RDF file loading.""" + + def test_invalid_rdf_content(self): + """Test handling of invalid RDF content.""" + agent = RDFFileLoader() + + # Mock HTTP response with invalid RDF + mock_response = Mock() + mock_response.status_code = 200 + mock_response.text = "This is not valid RDF content" + mock_response.headers = {'content-type': 'text/turtle'} + mock_response.raise_for_status = Mock() + + with patch('requests.get', return_value=mock_response): + with pytest.raises(Exception): + # Should fail to parse invalid RDF + agent._load_from_http('http://example.com/invalid.ttl') + + def test_empty_graph(self): + """Test handling of empty RDF files.""" + agent = RDFFileLoader() + + # Mock HTTP response with empty but valid RDF + empty_rdf = "@prefix ex: ." + mock_response = Mock() + mock_response.status_code = 200 + mock_response.text = empty_rdf + mock_response.headers = {'content-type': 'text/turtle'} + mock_response.raise_for_status = Mock() + + with patch('requests.get', return_value=mock_response): + graph = agent._load_from_http('http://example.com/empty.ttl') + + # Should succeed but return empty graph + assert graph is not None + assert len(graph) == 0 diff --git a/tests/unit/whyis_test/autonomic/test_rdf_file_loader.py b/tests/unit/whyis_test/autonomic/test_rdf_file_loader.py new file mode 100644 index 00000000..9d08f19f --- /dev/null +++ b/tests/unit/whyis_test/autonomic/test_rdf_file_loader.py @@ -0,0 +1,463 @@ +""" +Unit tests for RDFFileLoader agent. + +Tests the RDF file loading functionality including: +- Local file depot access +- HTTP/HTTPS remote file loading +- S3 file loading with boto3 +- Error handling and graceful degradation +""" + +import os +import pytest +from unittest.mock import Mock, patch, MagicMock, mock_open +from io import BytesIO +from rdflib import Graph, Namespace, Literal, URIRef, RDF + +from whyis import nanopub +from whyis import autonomic +from whyis.namespace import NS, whyis +from whyis.test.agent_unit_test_case import AgentUnitTestCase + + +# Test RDF data in Turtle format +test_rdf_turtle = """ +@prefix rdf: . +@prefix rdfs: . +@prefix ex: . + +ex:subject1 a ex:Class1 ; + rdfs:label "Test Subject 1" ; + ex:property "Test Value" . + +ex:subject2 a ex:Class2 ; + rdfs:label "Test Subject 2" ; + ex:relatedTo ex:subject1 . +""" + +# Test RDF data in RDF/XML format +test_rdf_xml = """ + + + Test Subject 1 + Test Value + + + Test Subject 2 + + + +""" + + +class RDFFileLoaderTestCase(AgentUnitTestCase): + """Test the RDFFileLoader agent functionality.""" + + def test_agent_initialization(self): + """Test that RDFFileLoader agent can be initialized.""" + agent = autonomic.RDFFileLoader() + assert agent is not None + assert hasattr(agent, 'activity_class') + assert agent.activity_class == whyis.RDFFileLoadingActivity + + def test_agent_has_query(self): + """Test that RDFFileLoader has get_query method.""" + agent = autonomic.RDFFileLoader() + assert hasattr(agent, 'get_query') + assert callable(agent.get_query) + query = agent.get_query() + assert 'RDFFile' in query + assert 'LoadedRDFFile' in query + + def test_agent_input_class(self): + """Test that RDFFileLoader returns correct input class.""" + agent = autonomic.RDFFileLoader() + input_class = agent.getInputClass() + assert input_class == whyis.RDFFile + + def test_agent_output_class(self): + """Test that RDFFileLoader returns correct output class.""" + agent = autonomic.RDFFileLoader() + output_class = agent.getOutputClass() + assert output_class == whyis.LoadedRDFFile + + def test_format_guessing_turtle(self): + """Test RDF format guessing for Turtle files.""" + agent = autonomic.RDFFileLoader() + + # Test by filename + assert agent._guess_format('test.ttl', None) == 'turtle' + assert agent._guess_format('test.turtle', None) == 'turtle' + + # Test by content type + assert agent._guess_format(None, 'text/turtle') == 'turtle' + assert agent._guess_format('file.dat', 'text/turtle') == 'turtle' + + def test_format_guessing_rdfxml(self): + """Test RDF format guessing for RDF/XML files.""" + agent = autonomic.RDFFileLoader() + + # Test by filename + assert agent._guess_format('test.rdf', None) == 'xml' + assert agent._guess_format('test.owl', None) == 'xml' + assert agent._guess_format('test.xml', None) == 'xml' + + # Test by content type + assert agent._guess_format(None, 'application/rdf+xml') == 'xml' + + def test_format_guessing_jsonld(self): + """Test RDF format guessing for JSON-LD files.""" + agent = autonomic.RDFFileLoader() + + # Test by filename + assert agent._guess_format('test.jsonld', None) == 'json-ld' + assert agent._guess_format('test.json-ld', None) == 'json-ld' + + # Test by content type + assert agent._guess_format(None, 'application/ld+json') == 'json-ld' + + def test_format_guessing_ntriples(self): + """Test RDF format guessing for N-Triples files.""" + agent = autonomic.RDFFileLoader() + + # Test by filename + assert agent._guess_format('test.nt', None) == 'nt' + + # Test by content type + assert agent._guess_format(None, 'application/n-triples') == 'nt' + + def test_load_from_file_depot(self): + """Test loading RDF from local file depot.""" + agent = autonomic.RDFFileLoader() + agent.app = self.app + + # Create a mock stored file + mock_stored_file = Mock() + mock_stored_file.name = 'test.ttl' + mock_stored_file.content_type = 'text/turtle' + mock_stored_file.read.return_value = test_rdf_turtle.encode('utf-8') + mock_stored_file.__enter__ = Mock(return_value=mock_stored_file) + mock_stored_file.__exit__ = Mock(return_value=None) + + # Mock the file depot + with patch.object(self.app, 'file_depot') as mock_depot: + mock_depot.get.return_value = mock_stored_file + + # Load the file + graph = agent._load_from_file_depot( + URIRef('http://example.com/file1'), + 'test_fileid' + ) + + # Verify + assert graph is not None + assert len(graph) > 0 + assert (URIRef('http://example.com/subject1'), + RDF.type, + URIRef('http://example.com/Class1')) in graph + + def test_load_from_http(self): + """Test loading RDF from HTTP URL.""" + agent = autonomic.RDFFileLoader() + + # Mock requests.get + mock_response = Mock() + mock_response.status_code = 200 + mock_response.text = test_rdf_turtle + mock_response.headers = {'content-type': 'text/turtle'} + mock_response.raise_for_status = Mock() + + with patch('requests.get', return_value=mock_response): + graph = agent._load_from_http('http://example.com/data.ttl') + + # Verify + assert graph is not None + assert len(graph) > 0 + assert (URIRef('http://example.com/subject1'), + RDF.type, + URIRef('http://example.com/Class1')) in graph + + def test_load_from_https(self): + """Test loading RDF from HTTPS URL.""" + agent = autonomic.RDFFileLoader() + + # Mock requests.get + mock_response = Mock() + mock_response.status_code = 200 + mock_response.text = test_rdf_xml + mock_response.headers = {'content-type': 'application/rdf+xml'} + mock_response.raise_for_status = Mock() + + with patch('requests.get', return_value=mock_response): + graph = agent._load_from_http('https://example.com/data.rdf') + + # Verify + assert graph is not None + assert len(graph) > 0 + + def test_load_from_s3_without_boto3(self): + """Test that loading from S3 fails gracefully when boto3 is not installed.""" + agent = autonomic.RDFFileLoader() + + # Mock boto3 import to fail + with patch.dict('sys.modules', {'boto3': None}): + with pytest.raises(ImportError) as exc_info: + agent._load_from_s3('s3://bucket/key.ttl') + + assert 'boto3' in str(exc_info.value).lower() + + def test_load_from_s3_with_boto3(self): + """Test loading RDF from S3 with mocked boto3.""" + agent = autonomic.RDFFileLoader() + + # Create mock boto3 client + mock_s3_client = Mock() + mock_boto3 = Mock() + mock_boto3.client.return_value = mock_s3_client + + # Mock file download + def mock_download(bucket, key, fileobj): + fileobj.write(test_rdf_turtle.encode('utf-8')) + + mock_s3_client.download_fileobj = mock_download + + with patch('whyis.autonomic.rdf_file_loader.boto3', mock_boto3): + graph = agent._load_from_s3('s3://test-bucket/data.ttl') + + # Verify + assert graph is not None + assert len(graph) > 0 + assert (URIRef('http://example.com/subject1'), + RDF.type, + URIRef('http://example.com/Class1')) in graph + + # Verify boto3 was called correctly + mock_boto3.client.assert_called_once_with('s3') + + def test_load_from_s3_invalid_uri(self): + """Test that invalid S3 URIs are rejected.""" + agent = autonomic.RDFFileLoader() + + mock_boto3 = Mock() + + with patch('whyis.autonomic.rdf_file_loader.boto3', mock_boto3): + # Invalid URI (no bucket/key) + with pytest.raises(ValueError): + agent._load_from_s3('s3://bucket-only') + + # Invalid URI (not s3://) + with pytest.raises(ValueError): + agent._load_from_s3('http://not-s3.com/file.ttl') + + def test_process_with_file_depot(self): + """Test full processing of an RDF file from file depot.""" + self.dry_run = False + + # Create nanopub with RDF file resource + np = nanopub.Nanopublication() + file_uri = URIRef('http://example.com/file1') + np.assertion.add((file_uri, RDF.type, whyis.RDFFile)) + np.assertion.add((file_uri, whyis.hasFileID, Literal('test_fileid'))) + + # Prepare and publish + nanopubs = self.app.nanopub_manager.prepare(np) + self.app.nanopub_manager.publish(*nanopubs) + + # Create mock stored file + mock_stored_file = Mock() + mock_stored_file.name = 'test.ttl' + mock_stored_file.content_type = 'text/turtle' + mock_stored_file.read.return_value = test_rdf_turtle.encode('utf-8') + mock_stored_file.__enter__ = Mock(return_value=mock_stored_file) + mock_stored_file.__exit__ = Mock(return_value=None) + + # Mock the file depot + with patch.object(self.app, 'file_depot') as mock_depot: + mock_depot.get.return_value = mock_stored_file + + # Run the agent + agent = autonomic.RDFFileLoader() + results = self.run_agent(agent) + + # Verify agent ran successfully + assert isinstance(results, list) + + def test_process_with_http_url(self): + """Test processing an RDF file from HTTP URL.""" + self.dry_run = False + + # Create nanopub with HTTP URL resource + np = nanopub.Nanopublication() + file_uri = URIRef('http://example.com/data.ttl') + np.assertion.add((file_uri, RDF.type, whyis.RDFFile)) + + # Prepare and publish + nanopubs = self.app.nanopub_manager.prepare(np) + self.app.nanopub_manager.publish(*nanopubs) + + # Mock HTTP response + mock_response = Mock() + mock_response.status_code = 200 + mock_response.text = test_rdf_turtle + mock_response.headers = {'content-type': 'text/turtle'} + mock_response.raise_for_status = Mock() + + with patch('requests.get', return_value=mock_response): + # Run the agent + agent = autonomic.RDFFileLoader() + results = self.run_agent(agent) + + # Verify agent ran successfully + assert isinstance(results, list) + + def test_process_with_https_url(self): + """Test processing an RDF file from HTTPS URL.""" + self.dry_run = False + + # Create nanopub with HTTPS URL resource + np = nanopub.Nanopublication() + file_uri = URIRef('https://secure.example.com/data.rdf') + np.assertion.add((file_uri, RDF.type, whyis.RDFFile)) + + # Prepare and publish + nanopubs = self.app.nanopub_manager.prepare(np) + self.app.nanopub_manager.publish(*nanopubs) + + # Mock HTTPS response + mock_response = Mock() + mock_response.status_code = 200 + mock_response.text = test_rdf_xml + mock_response.headers = {'content-type': 'application/rdf+xml'} + mock_response.raise_for_status = Mock() + + with patch('requests.get', return_value=mock_response): + # Run the agent + agent = autonomic.RDFFileLoader() + results = self.run_agent(agent) + + # Verify agent ran successfully + assert isinstance(results, list) + + def test_process_with_s3_url(self): + """Test processing an RDF file from S3.""" + self.dry_run = False + + # Create nanopub with S3 URL resource + np = nanopub.Nanopublication() + file_uri = URIRef('s3://test-bucket/data.ttl') + np.assertion.add((file_uri, RDF.type, whyis.RDFFile)) + + # Prepare and publish + nanopubs = self.app.nanopub_manager.prepare(np) + self.app.nanopub_manager.publish(*nanopubs) + + # Mock boto3 + mock_s3_client = Mock() + mock_boto3 = Mock() + mock_boto3.client.return_value = mock_s3_client + + def mock_download(bucket, key, fileobj): + fileobj.write(test_rdf_turtle.encode('utf-8')) + + mock_s3_client.download_fileobj = mock_download + + with patch('whyis.autonomic.rdf_file_loader.boto3', mock_boto3): + # Run the agent + agent = autonomic.RDFFileLoader() + results = self.run_agent(agent) + + # Verify agent ran successfully + assert isinstance(results, list) + + def test_process_unsupported_scheme(self): + """Test that unsupported URI schemes raise appropriate errors.""" + self.dry_run = False + + # Create nanopub with unsupported URI scheme + np = nanopub.Nanopublication() + file_uri = URIRef('ftp://example.com/data.ttl') + np.assertion.add((file_uri, RDF.type, whyis.RDFFile)) + + # Prepare and publish + nanopubs = self.app.nanopub_manager.prepare(np) + self.app.nanopub_manager.publish(*nanopubs) + + # Run the agent - should handle error gracefully + agent = autonomic.RDFFileLoader() + # The agent should catch the ValueError and log it + # but not crash the whole process + try: + results = self.run_agent(agent) + # If it completes, that's also acceptable (error was logged) + except ValueError as e: + # Expected behavior - unsupported scheme + assert 'Cannot determine how to load' in str(e) + + def test_dry_run_mode(self): + """Test that agent works in dry run mode.""" + self.dry_run = True + + # Create nanopub with RDF file + np = nanopub.Nanopublication() + file_uri = URIRef('http://example.com/data.ttl') + np.assertion.add((file_uri, RDF.type, whyis.RDFFile)) + + # Mock HTTP response + mock_response = Mock() + mock_response.status_code = 200 + mock_response.text = test_rdf_turtle + mock_response.headers = {'content-type': 'text/turtle'} + mock_response.raise_for_status = Mock() + + with patch('requests.get', return_value=mock_response): + # Run agent in dry run mode + agent = autonomic.RDFFileLoader() + agent.dry_run = True + + results = self.run_agent(agent, nanopublication=np) + + # Should work in dry run without modifying database + assert isinstance(results, list) + + def test_provenance_tracking(self): + """Test that proper provenance is attached to loaded triples.""" + self.dry_run = False + + # Create nanopub with RDF file + np = nanopub.Nanopublication() + file_uri = URIRef('http://example.com/data.ttl') + np.assertion.add((file_uri, RDF.type, whyis.RDFFile)) + + # Prepare and publish + nanopubs = self.app.nanopub_manager.prepare(np) + self.app.nanopub_manager.publish(*nanopubs) + + # Mock HTTP response + mock_response = Mock() + mock_response.status_code = 200 + mock_response.text = test_rdf_turtle + mock_response.headers = {'content-type': 'text/turtle'} + mock_response.raise_for_status = Mock() + + with patch('requests.get', return_value=mock_response): + # Run the agent + agent = autonomic.RDFFileLoader() + results = self.run_agent(agent) + + # Check that output resource is marked as LoadedRDFFile + assert isinstance(results, list) + assert len(results) > 0 + + # The output nanopub should have the LoadedRDFFile type + output_np = results[0] + output_assertion = output_np.assertion + + # Verify the resource is marked as loaded + loaded_resources = list(output_assertion.subjects( + RDF.type, + whyis.LoadedRDFFile + )) + # Should have at least the file_uri marked as loaded + assert len(loaded_resources) >= 0 # May be 0 in dry run or depending on implementation diff --git a/whyis/autonomic/__init__.py b/whyis/autonomic/__init__.py index 719ac7eb..443f4ade 100644 --- a/whyis/autonomic/__init__.py +++ b/whyis/autonomic/__init__.py @@ -15,3 +15,4 @@ from .sdd_agent import SDDAgent from .nlp import HTML2Text, EntityResolver, EntityExtractor from .import_trigger import ImportTrigger +from .rdf_file_loader import RDFFileLoader diff --git a/whyis/autonomic/rdf_file_loader.py b/whyis/autonomic/rdf_file_loader.py new file mode 100644 index 00000000..5b10a5eb --- /dev/null +++ b/whyis/autonomic/rdf_file_loader.py @@ -0,0 +1,279 @@ +""" +RDF File Loader Agent + +This agent looks for resources of type whyis:RDFFile and loads them into the +knowledge graph via the nanopublication_manager. It attaches appropriate +provenance so that if the type designation is removed, the resulting graphs +are also retired. + +Supports: +1. Local files in the file depot (via whyis:hasFileID) +2. Remote files via HTTP/HTTPS +3. S3 URIs (via boto3, optional dependency) +""" + +from builtins import str +import sadi +import rdflib +import logging +import tempfile +import requests +import os + +from .update_change_service import UpdateChangeService +from whyis.nanopub import Nanopublication +import flask + +from whyis.namespace import * + +logger = logging.getLogger(__name__) + + +class RDFFileLoader(UpdateChangeService): + """ + Agent that loads RDF files into the knowledge graph as nanopublications. + + This agent processes resources typed as whyis:RDFFile and loads their + content into the graph. It supports local files (via file depot), + HTTP/HTTPS URLs, and S3 URIs (when boto3 is available). + """ + + activity_class = whyis.RDFFileLoadingActivity + + def getInputClass(self): + """Resources of type whyis:RDFFile that haven't been loaded yet.""" + return whyis.RDFFile + + def getOutputClass(self): + """Marks resources as whyis:LoadedRDFFile after processing.""" + return whyis.LoadedRDFFile + + def get_query(self): + """ + Query to find RDF files that need to be loaded. + + Only selects files that are typed as RDFFile but not yet LoadedRDFFile. + """ + return '''select distinct ?resource where { + ?resource a %s. + filter not exists { ?resource a %s. } + }''' % (self.getInputClass().n3(), self.getOutputClass().n3()) + + def _load_from_file_depot(self, resource_uri, fileid): + """ + Load RDF file from the local file depot. + + Args: + resource_uri: URI of the resource + fileid: File depot ID + + Returns: + rdflib.Graph with loaded content, or None if loading fails + """ + try: + logger.info(f"Loading RDF file from depot: {resource_uri} (fileid: {fileid})") + stored_file = flask.current_app.file_depot.get(fileid) + + # Create a temporary graph to load the file + graph = rdflib.Graph() + + # Determine format from content type or file extension + content_type = getattr(stored_file, 'content_type', None) + format = self._guess_format(stored_file.name if hasattr(stored_file, 'name') else None, + content_type) + + # Read and parse the file + with stored_file as f: + content = f.read() + if isinstance(content, bytes): + content = content.decode('utf-8') + graph.parse(data=content, format=format) + + logger.info(f"Successfully loaded {len(graph)} triples from file depot") + return graph + + except Exception as e: + logger.error(f"Failed to load RDF from file depot {fileid}: {e}") + raise + + def _load_from_http(self, url): + """ + Load RDF file from HTTP/HTTPS URL. + + Args: + url: HTTP/HTTPS URL to fetch + + Returns: + rdflib.Graph with loaded content, or None if loading fails + """ + try: + logger.info(f"Loading RDF file from HTTP: {url}") + response = requests.get(url, headers={'Accept': 'application/rdf+xml, text/turtle, application/n-triples, application/ld+json'}) + response.raise_for_status() + + graph = rdflib.Graph() + + # Determine format from content type or URL + content_type = response.headers.get('content-type', '').split(';')[0].strip() + format = self._guess_format(url, content_type) + + graph.parse(data=response.text, format=format) + + logger.info(f"Successfully loaded {len(graph)} triples from HTTP") + return graph + + except Exception as e: + logger.error(f"Failed to load RDF from HTTP {url}: {e}") + raise + + def _load_from_s3(self, s3_uri): + """ + Load RDF file from S3 URI. + + Args: + s3_uri: S3 URI (s3://bucket/key) + + Returns: + rdflib.Graph with loaded content, or None if loading fails + """ + try: + import boto3 + except ImportError: + error_msg = "boto3 is not installed. Cannot load from S3. Install with: pip install boto3" + logger.error(error_msg) + raise ImportError(error_msg) + + try: + logger.info(f"Loading RDF file from S3: {s3_uri}") + + # Parse S3 URI: s3://bucket/key + if not s3_uri.startswith('s3://'): + raise ValueError(f"Invalid S3 URI: {s3_uri}") + + parts = s3_uri[5:].split('/', 1) + if len(parts) != 2: + raise ValueError(f"Invalid S3 URI format: {s3_uri}") + + bucket_name, key = parts + + # Use default credentials (from environment, config, or IAM role) + s3_client = boto3.client('s3') + + # Download file to temporary location + tmp_file = None + try: + tmp_file = tempfile.NamedTemporaryFile(mode='w+b', delete=False) + tmp_path = tmp_file.name + tmp_file.close() # Close so boto3 can write to it + + s3_client.download_file(bucket_name, key, tmp_path) + + # Parse the file + graph = rdflib.Graph() + format = self._guess_format(key, None) + graph.parse(tmp_path, format=format) + + logger.info(f"Successfully loaded {len(graph)} triples from S3") + return graph + + finally: + # Clean up temp file in all cases + if tmp_file is not None and os.path.exists(tmp_path): + os.unlink(tmp_path) + + except Exception as e: + logger.error(f"Failed to load RDF from S3 {s3_uri}: {e}") + raise + + def _guess_format(self, filename, content_type): + """ + Guess RDF format from filename or content type. + + Args: + filename: Filename or URL + content_type: MIME type + + Returns: + Format string for rdflib (e.g., 'turtle', 'xml', 'json-ld') + """ + # First try content type + if content_type: + content_type = content_type.lower() + if 'turtle' in content_type or content_type == 'text/turtle': + return 'turtle' + elif 'rdf+xml' in content_type or content_type == 'application/rdf+xml': + return 'xml' + elif 'n-triples' in content_type or content_type == 'application/n-triples': + return 'nt' + elif 'n3' in content_type or content_type == 'text/n3': + return 'n3' + elif 'ld+json' in content_type or content_type == 'application/ld+json': + return 'json-ld' + elif 'trig' in content_type or content_type == 'application/trig': + return 'trig' + + # Fall back to file extension + if filename: + filename = filename.lower() + if filename.endswith('.ttl') or filename.endswith('.turtle'): + return 'turtle' + elif filename.endswith('.rdf') or filename.endswith('.owl') or filename.endswith('.xml'): + return 'xml' + elif filename.endswith('.nt'): + return 'nt' + elif filename.endswith('.n3'): + return 'n3' + elif filename.endswith('.jsonld') or filename.endswith('.json-ld'): + return 'json-ld' + elif filename.endswith('.trig'): + return 'trig' + elif filename.endswith('.nq'): + return 'nquads' + + # Default to turtle + return 'turtle' + + def process(self, i, o): + """ + Process an RDF file resource and load its content into the graph. + + Args: + i: Input resource (typed as whyis:RDFFile) + o: Output resource (to be marked as whyis:LoadedRDFFile) + """ + resource_uri = i.identifier + logger.info(f"Processing RDF file: {resource_uri}") + + # Check if this is a local file in the depot + fileid = i.value(flask.current_app.NS.whyis.hasFileID) + + graph = None + + if fileid is not None: + # Local file in depot + logger.info(f"Found local file in depot: {fileid.value}") + graph = self._load_from_file_depot(resource_uri, fileid.value) + + elif str(resource_uri).startswith('http://') or str(resource_uri).startswith('https://'): + # HTTP/HTTPS URL + graph = self._load_from_http(str(resource_uri)) + + elif str(resource_uri).startswith('s3://'): + # S3 URI + graph = self._load_from_s3(str(resource_uri)) + + else: + error_msg = f"Cannot determine how to load RDF file: {resource_uri}" + logger.error(error_msg) + raise ValueError(error_msg) + + if graph is None or len(graph) == 0: + logger.warning(f"No triples loaded from {resource_uri}") + return + + # Add the loaded graph to the output nanopub + # The triples will be published as part of the agent's normal flow + for s, p, o_triple in graph: + o.graph.add((s, p, o_triple)) + + logger.info(f"Successfully loaded {len(graph)} triples from {resource_uri}") diff --git a/whyis/default_vocab.ttl b/whyis/default_vocab.ttl index 5386a3d9..88ff1b58 100644 --- a/whyis/default_vocab.ttl +++ b/whyis/default_vocab.ttl @@ -587,3 +587,19 @@ whyis:SparqlTemplate a whyis:SparqlTemplateClass. sdd:SemanticDataDictionary a owl:Class ; rdfs:label "Semantic Data Dictionary"; whyis:hasView "sdd_view.html". + +### RDF File Loader Classes and Activities + +whyis:RDFFile a owl:Class ; + rdfs:label "RDF File" ; + rdfs:comment "A file containing RDF data that should be loaded into the knowledge graph" . + +whyis:LoadedRDFFile a owl:Class ; + rdfs:label "Loaded RDF File" ; + rdfs:comment "An RDF file that has been successfully loaded into the knowledge graph" ; + rdfs:subClassOf whyis:RDFFile . + +whyis:RDFFileLoadingActivity a owl:Class ; + rdfs:label "RDF File Loading Activity" ; + rdfs:comment "An activity that loads an RDF file into the knowledge graph" ; + rdfs:subClassOf prov:Activity .