From fec62d3a7713cc8117cde3fe3ccca4da5193b391 Mon Sep 17 00:00:00 2001
From: "Mark A. Lifson, Ph.D."
Date: Mon, 15 Dec 2025 17:23:36 -0800
Subject: [PATCH] Add schema registry with auto-detection

Implement a global registry for storing and retrieving site schemas,
with support for automatic URL-based schema detection.

Registry functions:
- register_schema(): Add schema to registry
- unregister_schema(): Remove schema from registry
- get_schema(): Retrieve schema by name
- detect_schema(): Auto-detect schema from URL pattern
- list_schemas(): List all registered schema names
- get_all_schemas(): Get all schemas as dict
- clear_registry(): Clear all schemas (for testing)
- is_registered(): Check if schema exists
- schema_count(): Count registered schemas
- @schema decorator: Register class or instance

Features:
- Duplicate name detection with optional overwrite
- Type validation for registration
- Sorted listing of schema names
- First-match-wins auto-detection

Includes 35 comprehensive tests.

Closes #12
---
 fetcharoo/schemas/__init__.py  |  36 +++-
 fetcharoo/schemas/registry.py  | 234 ++++++++++++++++++++
 tests/test_schemas_registry.py | 375 +++++++++++++++++++++++++++++++++
 3 files changed, 642 insertions(+), 3 deletions(-)
 create mode 100644 fetcharoo/schemas/registry.py
 create mode 100644 tests/test_schemas_registry.py

diff --git a/fetcharoo/schemas/__init__.py b/fetcharoo/schemas/__init__.py
index f34042d..4103838 100644
--- a/fetcharoo/schemas/__init__.py
+++ b/fetcharoo/schemas/__init__.py
@@ -6,18 +6,48 @@ for downloading PDFs from different websites.
 
 
 Example:
-    >>> from fetcharoo.schemas import SiteSchema
+    >>> from fetcharoo.schemas import SiteSchema, register_schema, detect_schema
+    >>>
+    >>> # Create and register a schema
     >>> schema = SiteSchema(
     ...     name='my_site',
     ...     url_pattern=r'https://mysite\\.com/.*',
     ...     sort_by='numeric'
     ... )
-    >>> schema.matches('https://mysite.com/docs')
-    True
+    >>> register_schema(schema)
+    >>>
+    >>> # Auto-detect schema from URL
+    >>> detected = detect_schema('https://mysite.com/docs')
+    >>> print(detected.name)
+    my_site
 """
 
 from fetcharoo.schemas.base import SiteSchema
+from fetcharoo.schemas.registry import (
+    register_schema,
+    unregister_schema,
+    get_schema,
+    detect_schema,
+    list_schemas,
+    get_all_schemas,
+    clear_registry,
+    schema,
+    is_registered,
+    schema_count,
+)
 
 __all__ = [
+    # Base class
     "SiteSchema",
+    # Registry functions
+    "register_schema",
+    "unregister_schema",
+    "get_schema",
+    "detect_schema",
+    "list_schemas",
+    "get_all_schemas",
+    "clear_registry",
+    "schema",
+    "is_registered",
+    "schema_count",
 ]
diff --git a/fetcharoo/schemas/registry.py b/fetcharoo/schemas/registry.py
new file mode 100644
index 0000000..6c02696
--- /dev/null
+++ b/fetcharoo/schemas/registry.py
@@ -0,0 +1,234 @@
+"""
+Schema registry for managing site-specific download configurations.
+
+This module provides a global registry for storing and retrieving schemas,
+with support for auto-detection based on URL patterns.
+"""
+
+import logging
+from typing import Dict, List, Optional, Union
+
+from fetcharoo.schemas.base import SiteSchema
+
+logger = logging.getLogger('fetcharoo.schemas')
+
+# Global schema registry
+_SCHEMAS: Dict[str, SiteSchema] = {}
+
+
+def register_schema(schema: SiteSchema, overwrite: bool = False) -> None:
+    """
+    Register a schema in the global registry.
+
+    Args:
+        schema: The SiteSchema instance to register.
+        overwrite: If True, overwrite existing schema with same name.
+                   If False (default), raise ValueError if name exists.
+
+    Raises:
+        ValueError: If schema name already exists and overwrite=False.
+        TypeError: If schema is not a SiteSchema instance.
+
+    Example:
+        >>> schema = SiteSchema(name='my_site', url_pattern=r'https://mysite\\.com/.*')
+        >>> register_schema(schema)
+    """
+    if not isinstance(schema, SiteSchema):
+        raise TypeError(f"Expected SiteSchema, got {type(schema).__name__}")
+
+    if schema.name in _SCHEMAS and not overwrite:
+        raise ValueError(
+            f"Schema '{schema.name}' already registered. "
+            f"Use overwrite=True to replace it."
+        )
+
+    _SCHEMAS[schema.name] = schema
+    logger.debug("Registered schema: %s", schema.name)
+
+
+def unregister_schema(name: str) -> bool:
+    """
+    Remove a schema from the registry.
+
+    Args:
+        name: The name of the schema to remove.
+
+    Returns:
+        True if schema was removed, False if it wasn't registered.
+
+    Example:
+        >>> unregister_schema('my_site')
+        True
+    """
+    if name in _SCHEMAS:
+        del _SCHEMAS[name]
+        logger.debug("Unregistered schema: %s", name)
+        return True
+    return False
+
+
+def get_schema(name: str) -> Optional[SiteSchema]:
+    """
+    Get a schema by name.
+
+    Args:
+        name: The name of the schema to retrieve.
+
+    Returns:
+        The SiteSchema instance, or None if not found.
+
+    Example:
+        >>> schema = get_schema('springer_book')
+        >>> if schema:
+        ...     print(schema.description)
+    """
+    return _SCHEMAS.get(name)
+
+
+def detect_schema(url: str) -> Optional[SiteSchema]:
+    """
+    Auto-detect schema from URL by testing all registered patterns.
+
+    Iterates through registered schemas and returns the first one
+    whose URL pattern matches the given URL. More specific patterns
+    should be registered before generic ones to ensure correct matching.
+
+    Args:
+        url: The URL to match against schema patterns.
+
+    Returns:
+        The first matching SiteSchema, or None if no match.
+
+    Example:
+        >>> schema = detect_schema('https://link.springer.com/book/10.1007/978-3-031-41026-0')
+        >>> if schema:
+        ...     print(f"Detected: {schema.name}")
+    """
+    for candidate in _SCHEMAS.values():
+        if candidate.matches(url):
+            logger.debug("Auto-detected schema '%s' for URL: %s", candidate.name, url)
+            return candidate
+    return None
+
+
+def list_schemas() -> List[str]:
+    """
+    List all registered schema names.
+
+    Returns:
+        A sorted list of registered schema names.
+
+    Example:
+        >>> names = list_schemas()
+        >>> print(names)
+        ['arxiv', 'generic', 'springer_book']
+    """
+    return sorted(_SCHEMAS)
+
+
+def get_all_schemas() -> Dict[str, SiteSchema]:
+    """
+    Get all registered schemas.
+
+    Returns:
+        A copy of the schema registry dictionary.
+
+    Example:
+        >>> schemas = get_all_schemas()
+        >>> for name, schema in schemas.items():
+        ...     print(f"{name}: {schema.description}")
+    """
+    return _SCHEMAS.copy()
+
+
+def clear_registry() -> None:
+    """
+    Clear all schemas from the registry.
+
+    This is mainly useful for testing to ensure a clean state.
+
+    Example:
+        >>> clear_registry()
+        >>> list_schemas()
+        []
+    """
+    _SCHEMAS.clear()
+    logger.debug("Cleared schema registry")
+
+
+def schema(cls_or_instance: Union[type, SiteSchema]):
+    """
+    Decorator to register a schema class or instance.
+
+    Can be used as a decorator on a class that inherits from SiteSchema,
+    or called directly with a SiteSchema instance.
+
+    Args:
+        cls_or_instance: Either a SiteSchema subclass or instance.
+
+    Returns:
+        The original class or instance (for use as decorator).
+
+    Example:
+        >>> # As class decorator
+        >>> @schema
+        ... class MySchema(SiteSchema):
+        ...     name = 'my_schema'
+        ...     url_pattern = r'https://mysite\\.com/.*'
+
+        >>> # As class decorator on a subclass with a zero-argument __init__
+        >>> @schema
+        ... class AnotherSchema(SiteSchema):
+        ...     def __init__(self):
+        ...         super().__init__(
+        ...             name='another',
+        ...             url_pattern=r'.*'
+        ...         )
+    """
+    if isinstance(cls_or_instance, SiteSchema):
+        # Direct instance
+        register_schema(cls_or_instance)
+        return cls_or_instance
+    elif isinstance(cls_or_instance, type) and issubclass(cls_or_instance, SiteSchema):
+        # Class - instantiate and register
+        instance = cls_or_instance()
+        register_schema(instance)
+        return cls_or_instance
+    else:
+        raise TypeError(
+            f"@schema decorator expects SiteSchema class or instance, "
+            f"got {type(cls_or_instance).__name__}"
+        )
+
+
+def is_registered(name: str) -> bool:
+    """
+    Check if a schema is registered.
+
+    Args:
+        name: The schema name to check.
+
+    Returns:
+        True if the schema is registered, False otherwise.
+
+    Example:
+        >>> is_registered('springer_book')
+        True
+        >>> is_registered('nonexistent')
+        False
+    """
+    return name in _SCHEMAS
+
+
+def schema_count() -> int:
+    """
+    Get the number of registered schemas.
+
+    Returns:
+        The count of registered schemas.
+
+    Example:
+        >>> schema_count()
+        3
+    """
+    return len(_SCHEMAS)
diff --git a/tests/test_schemas_registry.py b/tests/test_schemas_registry.py
new file mode 100644
index 0000000..c9e6202
--- /dev/null
+++ b/tests/test_schemas_registry.py
@@ -0,0 +1,375 @@
+"""
+Tests for the schema registry module.
+"""
+
+import unittest
+from fetcharoo.schemas import (
+    SiteSchema,
+    register_schema,
+    unregister_schema,
+    get_schema,
+    detect_schema,
+    list_schemas,
+    get_all_schemas,
+    clear_registry,
+    schema,
+    is_registered,
+    schema_count,
+)
+
+
+class TestSchemaRegistryBase(unittest.TestCase):
+    """Base class for registry tests with cleanup."""
+
+    def setUp(self):
+        """Clear registry before each test."""
+        clear_registry()
+
+    def tearDown(self):
+        """Clear registry after each test."""
+        clear_registry()
+
+
+class TestRegisterSchema(TestSchemaRegistryBase):
+    """Tests for register_schema function."""
+
+    def test_register_schema_basic(self):
+        """Test basic schema registration."""
+        schema = SiteSchema(name='test', url_pattern=r'.*')
+        register_schema(schema)
+        self.assertTrue(is_registered('test'))
+
+    def test_register_multiple_schemas(self):
+        """Test registering multiple schemas."""
+        schema1 = SiteSchema(name='schema1', url_pattern=r'https://site1\.com/.*')
+        schema2 = SiteSchema(name='schema2', url_pattern=r'https://site2\.com/.*')
+        register_schema(schema1)
+        register_schema(schema2)
+        self.assertEqual(schema_count(), 2)
+        self.assertTrue(is_registered('schema1'))
+        self.assertTrue(is_registered('schema2'))
+
+    def test_register_duplicate_raises_error(self):
+        """Test that registering duplicate name raises ValueError."""
+        schema1 = SiteSchema(name='duplicate', url_pattern=r'.*')
+        schema2 = SiteSchema(name='duplicate', url_pattern=r'https://other\.com/.*')
+        register_schema(schema1)
+        with self.assertRaises(ValueError) as context:
+            register_schema(schema2)
+        self.assertIn('already registered', str(context.exception))
+
+    def test_register_with_overwrite(self):
+        """Test that overwrite=True allows replacing schemas."""
+        schema1 = SiteSchema(name='overwrite_test', url_pattern=r'https://old\.com/.*')
+        schema2 = SiteSchema(name='overwrite_test', url_pattern=r'https://new\.com/.*')
+        register_schema(schema1)
+        register_schema(schema2, overwrite=True)
+
+        retrieved = get_schema('overwrite_test')
+        self.assertEqual(retrieved.url_pattern, r'https://new\.com/.*')
+
+    def test_register_invalid_type_raises_error(self):
+        """Test that registering non-SiteSchema raises TypeError."""
+        with self.assertRaises(TypeError) as context:
+            register_schema("not a schema")
+        self.assertIn('Expected SiteSchema', str(context.exception))
+
+    def test_register_dict_raises_error(self):
+        """Test that registering a dict raises TypeError."""
+        with self.assertRaises(TypeError):
+            register_schema({'name': 'test', 'url_pattern': '.*'})
+
+
+class TestUnregisterSchema(TestSchemaRegistryBase):
+    """Tests for unregister_schema function."""
+
+    def test_unregister_existing(self):
+        """Test unregistering an existing schema."""
+        schema = SiteSchema(name='to_remove', url_pattern=r'.*')
+        register_schema(schema)
+        self.assertTrue(is_registered('to_remove'))
+
+        result = unregister_schema('to_remove')
+        self.assertTrue(result)
+        self.assertFalse(is_registered('to_remove'))
+
+    def test_unregister_nonexistent(self):
+        """Test unregistering a schema that doesn't exist."""
+        result = unregister_schema('nonexistent')
+        self.assertFalse(result)
+
+    def test_unregister_and_reregister(self):
+        """Test that schema can be re-registered after unregistering."""
+        schema = SiteSchema(name='reregister', url_pattern=r'.*')
+        register_schema(schema)
+        unregister_schema('reregister')
+
+        # Should be able to register again without overwrite
+        register_schema(schema)
+        self.assertTrue(is_registered('reregister'))
+
+
+class TestGetSchema(TestSchemaRegistryBase):
+    """Tests for get_schema function."""
+
+    def test_get_existing_schema(self):
+        """Test getting an existing schema."""
+        original = SiteSchema(
+            name='get_test',
+            url_pattern=r'https://test\.com/.*',
+            description='Test schema'
+        )
+        register_schema(original)
+
+        retrieved = get_schema('get_test')
+        self.assertIsNotNone(retrieved)
+        self.assertEqual(retrieved.name, 'get_test')
+        self.assertEqual(retrieved.description, 'Test schema')
+
+    def test_get_nonexistent_schema(self):
+        """Test getting a schema that doesn't exist."""
+        result = get_schema('nonexistent')
+        self.assertIsNone(result)
+
+    def test_get_returns_same_instance(self):
+        """Test that get_schema returns the same instance."""
+        original = SiteSchema(name='instance_test', url_pattern=r'.*')
+        register_schema(original)
+
+        retrieved = get_schema('instance_test')
+        self.assertIs(retrieved, original)
+
+
+class TestDetectSchema(TestSchemaRegistryBase):
+    """Tests for detect_schema function."""
+
+    def test_detect_matching_schema(self):
+        """Test detecting a schema that matches the URL."""
+        schema = SiteSchema(
+            name='springer',
+            url_pattern=r'https?://link\.springer\.com/book/.*'
+        )
+        register_schema(schema)
+
+        detected = detect_schema('https://link.springer.com/book/10.1007/978-3-031-41026-0')
+        self.assertIsNotNone(detected)
+        self.assertEqual(detected.name, 'springer')
+
+    def test_detect_no_match(self):
+        """Test that None is returned when no schema matches."""
+        schema = SiteSchema(
+            name='specific',
+            url_pattern=r'https://specific-site\.com/.*'
+        )
+        register_schema(schema)
+
+        detected = detect_schema('https://other-site.com/page')
+        self.assertIsNone(detected)
+
+    def test_detect_first_match_wins(self):
+        """Test that the first matching schema is returned."""
+        schema1 = SiteSchema(name='first', url_pattern=r'https://example\.com/.*')
+        schema2 = SiteSchema(name='second', url_pattern=r'https://example\.com/docs/.*')
+
+        # Register in order - first should match first
+        register_schema(schema1)
+        register_schema(schema2)
+
+        detected = detect_schema('https://example.com/docs/page')
+        # First registered schema that matches wins
+        self.assertEqual(detected.name, 'first')
+
+    def test_detect_with_multiple_schemas(self):
+        """Test detection with multiple registered schemas."""
+        schemas = [
+            SiteSchema(name='arxiv', url_pattern=r'https?://arxiv\.org/.*'),
+            SiteSchema(name='springer', url_pattern=r'https?://link\.springer\.com/.*'),
+            SiteSchema(name='ieee', url_pattern=r'https?://ieeexplore\.ieee\.org/.*'),
+        ]
+        for s in schemas:
+            register_schema(s)
+
+        self.assertEqual(detect_schema('https://arxiv.org/abs/2301.07041').name, 'arxiv')
+        self.assertEqual(detect_schema('https://link.springer.com/book/123').name, 'springer')
+        self.assertEqual(detect_schema('https://ieeexplore.ieee.org/document/123').name, 'ieee')
+        self.assertIsNone(detect_schema('https://unknown.com/'))
+
+    def test_detect_empty_registry(self):
+        """Test detection with empty registry."""
+        detected = detect_schema('https://example.com/')
+        self.assertIsNone(detected)
+
+
+class TestListSchemas(TestSchemaRegistryBase):
+    """Tests for list_schemas function."""
+
+    def test_list_empty_registry(self):
+        """Test listing schemas from empty registry."""
+        names = list_schemas()
+        self.assertEqual(names, [])
+
+    def test_list_single_schema(self):
+        """Test listing single schema."""
+        schema = SiteSchema(name='single', url_pattern=r'.*')
+        register_schema(schema)
+
+        names = list_schemas()
+        self.assertEqual(names, ['single'])
+
+    def test_list_multiple_schemas_sorted(self):
+        """Test that list_schemas returns sorted names."""
+        schemas = [
+            SiteSchema(name='zebra', url_pattern=r'.*'),
+            SiteSchema(name='alpha', url_pattern=r'.*'),
+            SiteSchema(name='middle', url_pattern=r'.*'),
+        ]
+        for s in schemas:
+            register_schema(s)
+
+        names = list_schemas()
+        self.assertEqual(names, ['alpha', 'middle', 'zebra'])
+
+
+class TestGetAllSchemas(TestSchemaRegistryBase):
+    """Tests for get_all_schemas function."""
+
+    def test_get_all_empty(self):
+        """Test get_all_schemas with empty registry."""
+        all_schemas = get_all_schemas()
+        self.assertEqual(all_schemas, {})
+
+    def test_get_all_returns_copy(self):
+        """Test that get_all_schemas returns a copy."""
+        schema = SiteSchema(name='copy_test', url_pattern=r'.*')
+        register_schema(schema)
+
+        all_schemas = get_all_schemas()
+        all_schemas['new_key'] = 'should not affect registry'
+
+        # Original registry should be unchanged
+        self.assertNotIn('new_key', get_all_schemas())
+        self.assertEqual(schema_count(), 1)
+
+    def test_get_all_contains_all_schemas(self):
+        """Test that get_all_schemas contains all registered schemas."""
+        schemas = [
+            SiteSchema(name='one', url_pattern=r'.*'),
+            SiteSchema(name='two', url_pattern=r'.*'),
+        ]
+        for s in schemas:
+            register_schema(s)
+
+        all_schemas = get_all_schemas()
+        self.assertEqual(len(all_schemas), 2)
+        self.assertIn('one', all_schemas)
+        self.assertIn('two', all_schemas)
+
+
+class TestClearRegistry(TestSchemaRegistryBase):
+    """Tests for clear_registry function."""
+
+    def test_clear_empty_registry(self):
+        """Test clearing an empty registry."""
+        clear_registry()  # Should not raise
+        self.assertEqual(schema_count(), 0)
+
+    def test_clear_populated_registry(self):
+        """Test clearing a populated registry."""
+        for i in range(5):
+            schema = SiteSchema(name=f'schema_{i}', url_pattern=r'.*')
+            register_schema(schema)
+
+        self.assertEqual(schema_count(), 5)
+        clear_registry()
+        self.assertEqual(schema_count(), 0)
+        self.assertEqual(list_schemas(), [])
+
+
+class TestSchemaDecorator(TestSchemaRegistryBase):
+    """Tests for the @schema decorator."""
+
+    def test_decorator_with_instance(self):
+        """Test @schema decorator with a SiteSchema instance."""
+        instance = SiteSchema(name='decorated_instance', url_pattern=r'.*')
+        result = schema(instance)
+
+        self.assertIs(result, instance)
+        self.assertTrue(is_registered('decorated_instance'))
+
+    def test_decorator_with_class(self):
+        """Test @schema decorator with a SiteSchema subclass."""
+        @schema
+        class MySchema(SiteSchema):
+            def __init__(self):
+                super().__init__(
+                    name='decorated_class',
+                    url_pattern=r'https://mysite\.com/.*',
+                    description='Decorated class schema'
+                )
+
+        self.assertTrue(is_registered('decorated_class'))
+        retrieved = get_schema('decorated_class')
+        self.assertEqual(retrieved.description, 'Decorated class schema')
+
+    def test_decorator_with_invalid_type(self):
+        """Test @schema decorator with invalid type."""
+        with self.assertRaises(TypeError) as context:
+            schema("not a schema")
+        self.assertIn('expects SiteSchema', str(context.exception))
+
+    def test_decorator_with_non_schema_class(self):
+        """Test @schema decorator with non-SiteSchema class."""
+        class NotASchema:
+            pass
+
+        with self.assertRaises(TypeError):
+            schema(NotASchema)
+
+
+class TestIsRegistered(TestSchemaRegistryBase):
+    """Tests for is_registered function."""
+
+    def test_is_registered_true(self):
+        """Test is_registered returns True for registered schema."""
+        schema = SiteSchema(name='registered', url_pattern=r'.*')
+        register_schema(schema)
+        self.assertTrue(is_registered('registered'))
+
+    def test_is_registered_false(self):
+        """Test is_registered returns False for unregistered schema."""
+        self.assertFalse(is_registered('not_registered'))
+
+    def test_is_registered_after_unregister(self):
+        """Test is_registered returns False after unregistering."""
+        schema = SiteSchema(name='temp', url_pattern=r'.*')
+        register_schema(schema)
+        unregister_schema('temp')
+        self.assertFalse(is_registered('temp'))
+
+
+class TestSchemaCount(TestSchemaRegistryBase):
+    """Tests for schema_count function."""
+
+    def test_count_empty(self):
+        """Test count of empty registry."""
+        self.assertEqual(schema_count(), 0)
+
+    def test_count_after_registrations(self):
+        """Test count after registering schemas."""
+        for i in range(3):
+            schema = SiteSchema(name=f'count_{i}', url_pattern=r'.*')
+            register_schema(schema)
+        self.assertEqual(schema_count(), 3)
+
+    def test_count_after_unregister(self):
+        """Test count after unregistering."""
+        for i in range(3):
+            schema = SiteSchema(name=f'uncount_{i}', url_pattern=r'.*')
+            register_schema(schema)
+
+        unregister_schema('uncount_1')
+        self.assertEqual(schema_count(), 2)
+
+
+if __name__ == '__main__':
+    unittest.main()