@dataclass
class SiteSchema:
    """
    Base class for site-specific download configurations.

    A SiteSchema encapsulates the best way to download PDFs from a specific
    website or type of website. It includes URL pattern matching, PDF filtering,
    sorting strategies, and validation settings.

    Attributes:
        name: Unique identifier for this schema (e.g., 'springer_book').
        url_pattern: Regex pattern to match URLs this schema handles. It is
            compiled once in ``__post_init__``; an empty pattern is never
            compiled, so such a schema matches nothing.
        description: Human-readable description of what this schema is for.
        include_patterns: Filename patterns to include (fnmatch syntax).
        exclude_patterns: Filename patterns to exclude (fnmatch syntax).
        url_include_patterns: URL patterns to include.
        url_exclude_patterns: URL patterns to exclude.
        sort_by: Sort strategy for merging: 'numeric', 'alpha', 'alpha_desc', 'none'.
        sort_key: Custom sort key function (takes URL, returns sortable value).
        default_output_name: Default filename for merged PDFs.
        recommended_depth: Suggested recursion depth for this site.
        request_delay: Suggested delay between requests in seconds.
        test_url: Sample URL for validation testing.
        expected_min_pdfs: Minimum PDFs expected when validating test_url.
        version: Schema version string for tracking updates.

    Raises:
        ValueError: If ``url_pattern`` is not a valid regular expression or
            ``sort_by`` is not one of the supported strategies.

    Example:
        >>> schema = SiteSchema(
        ...     name='example_site',
        ...     url_pattern=r'https?://example\\.com/docs/.*',
        ...     description='Example documentation site',
        ...     sort_by='numeric',
        ...     recommended_depth=1
        ... )
        >>> schema.matches('https://example.com/docs/guide')
        True
    """

    # Required fields
    name: str
    url_pattern: str

    # Description
    description: Optional[str] = None

    # PDF filtering patterns
    include_patterns: List[str] = field(default_factory=list)
    exclude_patterns: List[str] = field(default_factory=list)
    url_include_patterns: List[str] = field(default_factory=list)
    url_exclude_patterns: List[str] = field(default_factory=list)

    # Sorting
    sort_by: Optional[str] = None
    # ``object`` (not the builtin function ``any``) is the return type: the key
    # may return anything the caller's sort can compare.
    sort_key: Optional[Callable[[str], object]] = field(default=None, repr=False)

    # Output
    default_output_name: Optional[str] = None

    # Behavior
    recommended_depth: int = 1
    request_delay: float = 0.5

    # Validation
    test_url: Optional[str] = None
    expected_min_pdfs: int = 1

    # Metadata
    version: str = "1.0.0"

    # Compiled regex (cached); excluded from __init__, repr and equality
    _compiled_pattern: Optional[re.Pattern] = field(
        default=None, init=False, repr=False, compare=False
    )

    def __post_init__(self) -> None:
        """
        Compile the URL pattern and validate ``sort_by`` after initialization.

        Raises:
            ValueError: If ``url_pattern`` fails to compile (the original
                ``re.error`` is attached as ``__cause__``) or ``sort_by`` is
                not a supported option.
        """
        # An empty pattern is deliberately left uncompiled so matches()
        # returns False instead of matching every URL.
        if self.url_pattern:
            try:
                self._compiled_pattern = re.compile(self.url_pattern)
            except re.error as e:
                raise ValueError(f"Invalid url_pattern regex: {e}") from e

        # Validate sort_by
        valid_sort_options = (None, 'none', 'numeric', 'alpha', 'alpha_desc')
        if self.sort_by not in valid_sort_options:
            raise ValueError(
                f"sort_by must be one of {valid_sort_options}, got '{self.sort_by}'"
            )

    def matches(self, url: str) -> bool:
        """
        Check if this schema matches the given URL.

        The pattern is applied with ``re.Pattern.match``, so it is anchored at
        the start of ``url`` but not at the end.

        Args:
            url: The URL to check against this schema's pattern.

        Returns:
            True if the URL matches this schema's url_pattern, False otherwise
            (including when no pattern was compiled).
        """
        if self._compiled_pattern is None:
            return False
        return self._compiled_pattern.match(url) is not None

    def get_filter_config(self) -> Optional[FilterConfig]:
        """
        Convert this schema's filter patterns to a FilterConfig.

        Returns:
            A FilterConfig instance holding fresh list copies of the patterns
            if any filter patterns are defined, None if no filtering is
            configured.
        """
        has_filters = (
            self.include_patterns
            or self.exclude_patterns
            or self.url_include_patterns
            or self.url_exclude_patterns
        )

        if not has_filters:
            return None

        # list(...) copies like .copy() but also accepts tuples and other
        # sequences, so the schema's own collections are never shared.
        return FilterConfig(
            filename_include=list(self.include_patterns),
            filename_exclude=list(self.exclude_patterns),
            url_include=list(self.url_include_patterns),
            url_exclude=list(self.url_exclude_patterns),
        )

    def get_sort_key(self) -> Optional[Callable[[str], object]]:
        """
        Get the custom sort key function for this schema.

        Only an explicitly supplied ``sort_key`` is returned, and it takes
        precedence over ``sort_by``. The built-in ``sort_by`` strategies are
        not turned into key functions here; that is left to the caller.

        Returns:
            The custom sort key callable, or None if none was supplied.
        """
        return self.sort_key

    def __str__(self) -> str:
        """Return a human-readable string representation."""
        desc = self.description or "No description"
        return f"{self.name}: {desc}"
class TestSiteSchemaBasic(unittest.TestCase):
    """Basic tests for SiteSchema instantiation and attributes."""

    def test_create_minimal_schema(self):
        """Test creating a schema with only required fields."""
        schema = SiteSchema(
            name='test_schema',
            url_pattern=r'https://example\.com/.*'
        )
        self.assertEqual(schema.name, 'test_schema')
        self.assertEqual(schema.url_pattern, r'https://example\.com/.*')
        self.assertIsNone(schema.description)
        self.assertEqual(schema.include_patterns, [])
        self.assertEqual(schema.exclude_patterns, [])
        self.assertEqual(schema.url_include_patterns, [])
        self.assertEqual(schema.url_exclude_patterns, [])
        self.assertIsNone(schema.sort_by)
        self.assertIsNone(schema.sort_key)
        self.assertIsNone(schema.default_output_name)
        self.assertEqual(schema.recommended_depth, 1)
        self.assertEqual(schema.request_delay, 0.5)
        self.assertIsNone(schema.test_url)
        self.assertEqual(schema.expected_min_pdfs, 1)
        self.assertEqual(schema.version, "1.0.0")

    def test_create_full_schema(self):
        """Test creating a schema with all fields."""
        def custom_sort(url):
            return url

        schema = SiteSchema(
            name='full_schema',
            url_pattern=r'https://full\.example\.com/.*',
            description='A fully configured schema',
            include_patterns=['*.pdf', 'report*.pdf'],
            exclude_patterns=['*draft*'],
            url_include_patterns=['*/docs/*'],
            url_exclude_patterns=['*/temp/*'],
            sort_by='numeric',
            sort_key=custom_sort,
            default_output_name='output.pdf',
            recommended_depth=2,
            request_delay=1.5,
            test_url='https://full.example.com/test',
            expected_min_pdfs=5,
            version='2.0.0'
        )

        self.assertEqual(schema.name, 'full_schema')
        self.assertEqual(schema.url_pattern, r'https://full\.example\.com/.*')
        self.assertEqual(schema.description, 'A fully configured schema')
        self.assertEqual(schema.include_patterns, ['*.pdf', 'report*.pdf'])
        self.assertEqual(schema.exclude_patterns, ['*draft*'])
        self.assertEqual(schema.url_include_patterns, ['*/docs/*'])
        self.assertEqual(schema.url_exclude_patterns, ['*/temp/*'])
        self.assertEqual(schema.sort_by, 'numeric')
        # Identity, not equality: the schema must store the very same callable.
        self.assertIs(schema.sort_key, custom_sort)
        self.assertEqual(schema.default_output_name, 'output.pdf')
        self.assertEqual(schema.recommended_depth, 2)
        self.assertEqual(schema.request_delay, 1.5)
        self.assertEqual(schema.test_url, 'https://full.example.com/test')
        self.assertEqual(schema.expected_min_pdfs, 5)
        self.assertEqual(schema.version, '2.0.0')


class TestSiteSchemaMatches(unittest.TestCase):
    """Tests for the matches() method."""

    def test_matches_simple_pattern(self):
        """Test matching a simple URL pattern."""
        schema = SiteSchema(
            name='test',
            url_pattern=r'https://example\.com/.*'
        )
        self.assertTrue(schema.matches('https://example.com/'))
        self.assertTrue(schema.matches('https://example.com/page'))
        self.assertTrue(schema.matches('https://example.com/docs/file.pdf'))
        self.assertFalse(schema.matches('https://other.com/'))
        self.assertFalse(schema.matches('http://example.com/'))  # http vs https

    def test_matches_complex_pattern(self):
        """Test matching a complex URL pattern."""
        schema = SiteSchema(
            name='springer',
            url_pattern=r'https?://link\.springer\.com/book/10\.\d+/.*'
        )
        self.assertTrue(schema.matches('https://link.springer.com/book/10.1007/978-3-031-41026-0'))
        self.assertTrue(schema.matches('http://link.springer.com/book/10.1234/some-book'))
        self.assertFalse(schema.matches('https://link.springer.com/article/10.1007/something'))
        self.assertFalse(schema.matches('https://springer.com/book/10.1007/something'))

    def test_matches_with_capture_groups(self):
        """Test that patterns with capture groups work."""
        schema = SiteSchema(
            name='arxiv',
            url_pattern=r'https?://arxiv\.org/(abs|pdf)/(\d+\.\d+)'
        )
        self.assertTrue(schema.matches('https://arxiv.org/abs/2301.07041'))
        self.assertTrue(schema.matches('https://arxiv.org/pdf/2301.07041'))
        self.assertFalse(schema.matches('https://arxiv.org/list/2301.07041'))

    def test_matches_empty_pattern(self):
        """Test behavior with empty URL pattern."""
        schema = SiteSchema(name='empty', url_pattern='')
        # Empty pattern is not compiled, so matches() returns False for all
        self.assertFalse(schema.matches(''))
        self.assertFalse(schema.matches('https://example.com'))

    def test_matches_catch_all_pattern(self):
        """Test a catch-all pattern."""
        schema = SiteSchema(name='generic', url_pattern=r'.*')
        self.assertTrue(schema.matches('https://example.com'))
        self.assertTrue(schema.matches('anything'))
        self.assertTrue(schema.matches(''))


class TestSiteSchemaValidation(unittest.TestCase):
    """Tests for schema validation during initialization."""

    def test_invalid_regex_raises_error(self):
        """Test that invalid regex pattern raises ValueError."""
        with self.assertRaises(ValueError) as context:
            SiteSchema(name='bad', url_pattern=r'[invalid')
        self.assertIn('Invalid url_pattern regex', str(context.exception))

    def test_invalid_sort_by_raises_error(self):
        """Test that invalid sort_by value raises ValueError."""
        with self.assertRaises(ValueError) as context:
            SiteSchema(
                name='bad_sort',
                url_pattern=r'.*',
                sort_by='invalid_sort_option'
            )
        self.assertIn('sort_by must be one of', str(context.exception))

    def test_valid_sort_by_options(self):
        """Test that all valid sort_by options are accepted."""
        valid_options = [None, 'none', 'numeric', 'alpha', 'alpha_desc']
        for option in valid_options:
            # subTest reports which option failed and keeps checking the rest.
            with self.subTest(sort_by=option):
                schema = SiteSchema(
                    name=f'sort_{option}',
                    url_pattern=r'.*',
                    sort_by=option
                )
                self.assertEqual(schema.sort_by, option)


class TestSiteSchemaGetFilterConfig(unittest.TestCase):
    """Tests for the get_filter_config() method."""

    def test_no_filters_returns_none(self):
        """Test that schema with no filters returns None."""
        schema = SiteSchema(name='no_filters', url_pattern=r'.*')
        self.assertIsNone(schema.get_filter_config())

    def test_include_patterns_only(self):
        """Test filter config with only include patterns."""
        schema = SiteSchema(
            name='include_only',
            url_pattern=r'.*',
            include_patterns=['*.pdf', 'report*.pdf']
        )
        config = schema.get_filter_config()
        self.assertIsInstance(config, FilterConfig)
        self.assertEqual(config.filename_include, ['*.pdf', 'report*.pdf'])
        self.assertEqual(config.filename_exclude, [])

    def test_exclude_patterns_only(self):
        """Test filter config with only exclude patterns."""
        schema = SiteSchema(
            name='exclude_only',
            url_pattern=r'.*',
            exclude_patterns=['*draft*', '*temp*']
        )
        config = schema.get_filter_config()
        self.assertIsInstance(config, FilterConfig)
        self.assertEqual(config.filename_include, [])
        self.assertEqual(config.filename_exclude, ['*draft*', '*temp*'])

    def test_url_patterns(self):
        """Test filter config with URL patterns."""
        schema = SiteSchema(
            name='url_filters',
            url_pattern=r'.*',
            url_include_patterns=['*/docs/*'],
            url_exclude_patterns=['*/temp/*']
        )
        config = schema.get_filter_config()
        self.assertIsInstance(config, FilterConfig)
        self.assertEqual(config.url_include, ['*/docs/*'])
        self.assertEqual(config.url_exclude, ['*/temp/*'])

    def test_combined_filters(self):
        """Test filter config with all filter types."""
        schema = SiteSchema(
            name='combined',
            url_pattern=r'.*',
            include_patterns=['*.pdf'],
            exclude_patterns=['*draft*'],
            url_include_patterns=['*/reports/*'],
            url_exclude_patterns=['*/archive/*']
        )
        config = schema.get_filter_config()
        self.assertEqual(config.filename_include, ['*.pdf'])
        self.assertEqual(config.filename_exclude, ['*draft*'])
        self.assertEqual(config.url_include, ['*/reports/*'])
        self.assertEqual(config.url_exclude, ['*/archive/*'])

    def test_filter_config_is_copy(self):
        """Test that filter config returns copies of lists."""
        schema = SiteSchema(
            name='copy_test',
            url_pattern=r'.*',
            include_patterns=['*.pdf']
        )
        config1 = schema.get_filter_config()
        config2 = schema.get_filter_config()

        # Modify one config
        config1.filename_include.append('new.pdf')

        # Other config should be unaffected
        self.assertEqual(config2.filename_include, ['*.pdf'])
        # Original schema should be unaffected
        self.assertEqual(schema.include_patterns, ['*.pdf'])


class TestSiteSchemaGetSortKey(unittest.TestCase):
    """Tests for the get_sort_key() method."""

    def test_no_sort_returns_none(self):
        """Test that no sort config returns None."""
        schema = SiteSchema(name='no_sort', url_pattern=r'.*')
        self.assertIsNone(schema.get_sort_key())

    def test_sort_by_none_returns_none(self):
        """Test that sort_by='none' returns None."""
        schema = SiteSchema(name='sort_none', url_pattern=r'.*', sort_by='none')
        self.assertIsNone(schema.get_sort_key())

    def test_sort_by_without_custom_key_returns_none(self):
        """Test that sort_by without custom key returns None (handled by caller)."""
        schema = SiteSchema(name='sort_numeric', url_pattern=r'.*', sort_by='numeric')
        # The actual sort key implementation is in fetcharoo.py
        # Schema just specifies the strategy
        self.assertIsNone(schema.get_sort_key())

    def test_custom_sort_key_returned(self):
        """Test that custom sort_key is returned."""
        def my_sort_key(url):
            return len(url)

        schema = SiteSchema(
            name='custom_sort',
            url_pattern=r'.*',
            sort_key=my_sort_key
        )
        self.assertIs(schema.get_sort_key(), my_sort_key)

    def test_custom_sort_key_overrides_sort_by(self):
        """Test that custom sort_key takes precedence over sort_by."""
        def custom_key(url):
            return url.lower()

        schema = SiteSchema(
            name='override',
            url_pattern=r'.*',
            sort_by='numeric',  # This would normally use numeric sorting
            sort_key=custom_key  # But custom key takes precedence
        )
        self.assertIs(schema.get_sort_key(), custom_key)


class TestSiteSchemaStr(unittest.TestCase):
    """Tests for string representation."""

    def test_str_with_description(self):
        """Test __str__ with description."""
        schema = SiteSchema(
            name='my_schema',
            url_pattern=r'.*',
            description='A useful schema'
        )
        self.assertEqual(str(schema), 'my_schema: A useful schema')

    def test_str_without_description(self):
        """Test __str__ without description."""
        schema = SiteSchema(name='my_schema', url_pattern=r'.*')
        self.assertEqual(str(schema), 'my_schema: No description')


class TestSiteSchemaEquality(unittest.TestCase):
    """Tests for schema equality comparison."""

    def test_equal_schemas(self):
        """Test that identical schemas are equal."""
        schema1 = SiteSchema(name='test', url_pattern=r'.*', sort_by='numeric')
        schema2 = SiteSchema(name='test', url_pattern=r'.*', sort_by='numeric')
        self.assertEqual(schema1, schema2)

    def test_different_schemas(self):
        """Test that different schemas are not equal."""
        schema1 = SiteSchema(name='test1', url_pattern=r'.*')
        schema2 = SiteSchema(name='test2', url_pattern=r'.*')
        self.assertNotEqual(schema1, schema2)


if __name__ == '__main__':
    unittest.main()