diff --git a/fetcharoo/schemas/__init__.py b/fetcharoo/schemas/__init__.py
new file mode 100644
index 0000000..f34042d
--- /dev/null
+++ b/fetcharoo/schemas/__init__.py
@@ -0,0 +1,23 @@
+"""
+Site-specific download schemas for fetcharoo.
+
+This package provides the schema system for defining site-specific
+PDF download configurations. Schemas encapsulate the best practices
+for downloading PDFs from different websites.
+
+Example:
+    >>> from fetcharoo.schemas import SiteSchema
+    >>> schema = SiteSchema(
+    ...     name='my_site',
+    ...     url_pattern=r'https://mysite\\.com/.*',
+    ...     sort_by='numeric'
+    ... )
+    >>> schema.matches('https://mysite.com/docs')
+    True
+"""
+
+from fetcharoo.schemas.base import SiteSchema
+
+__all__ = [
+    "SiteSchema",  # re-exported from fetcharoo.schemas.base
+]
diff --git a/fetcharoo/schemas/base.py b/fetcharoo/schemas/base.py
new file mode 100644
index 0000000..66ee3a2
--- /dev/null
+++ b/fetcharoo/schemas/base.py
@@ -0,0 +1,167 @@
+"""
+Base schema class for site-specific download configurations.
+
+This module provides the SiteSchema dataclass that defines the structure
+for site-specific PDF download configurations.
+"""
+
+import re
+from dataclasses import dataclass, field
+from typing import Any, Callable, List, Optional
+
+from fetcharoo.filtering import FilterConfig
+
+
+@dataclass
+class SiteSchema:
+    """
+    Base class for site-specific download configurations.
+
+    A SiteSchema encapsulates the best way to download PDFs from a specific
+    website or type of website. It includes URL pattern matching, PDF filtering,
+    sorting strategies, and validation settings.
+
+    Attributes:
+        name: Unique identifier for this schema (e.g., 'springer_book').
+        url_pattern: Regex matched against the start of a URL (re.match semantics).
+        description: Human-readable description of what this schema is for.
+        include_patterns: Filename patterns to include (fnmatch syntax).
+        exclude_patterns: Filename patterns to exclude (fnmatch syntax).
+        url_include_patterns: URL patterns to include.
+        url_exclude_patterns: URL patterns to exclude.
+        sort_by: Sort strategy for merging: 'numeric', 'alpha', 'alpha_desc', 'none'.
+        sort_key: Custom sort key function (takes URL, returns sortable value).
+        default_output_name: Default filename for merged PDFs.
+        recommended_depth: Suggested recursion depth for this site.
+        request_delay: Suggested delay between requests in seconds.
+        test_url: Sample URL for validation testing.
+        expected_min_pdfs: Minimum PDFs expected when validating test_url.
+        version: Schema version string for tracking updates.
+
+    Example:
+        >>> schema = SiteSchema(
+        ...     name='example_site',
+        ...     url_pattern=r'https?://example\\.com/docs/.*',
+        ...     description='Example documentation site',
+        ...     sort_by='numeric',
+        ...     recommended_depth=1
+        ... )
+        >>> schema.matches('https://example.com/docs/guide')
+        True
+    """
+
+    # Required fields
+    name: str
+    url_pattern: str
+
+    # Description
+    description: Optional[str] = None
+
+    # PDF filtering patterns
+    include_patterns: List[str] = field(default_factory=list)
+    exclude_patterns: List[str] = field(default_factory=list)
+    url_include_patterns: List[str] = field(default_factory=list)
+    url_exclude_patterns: List[str] = field(default_factory=list)
+
+    # Sorting
+    sort_by: Optional[str] = None
+    sort_key: Optional[Callable[[str], Any]] = field(default=None, repr=False)
+
+    # Output
+    default_output_name: Optional[str] = None
+
+    # Behavior
+    recommended_depth: int = 1
+    request_delay: float = 0.5
+
+    # Validation
+    test_url: Optional[str] = None
+    expected_min_pdfs: int = 1
+
+    # Metadata
+    version: str = "1.0.0"
+
+    # Compiled regex cache; excluded from __init__, repr and equality
+    _compiled_pattern: Optional[re.Pattern] = field(
+        default=None, init=False, repr=False, compare=False
+    )
+
+    def __post_init__(self):
+        """Compile url_pattern and validate sort_by; raise ValueError if invalid."""
+        if self.url_pattern:  # an empty pattern is left uncompiled and never matches
+            try:
+                self._compiled_pattern = re.compile(self.url_pattern)
+            except re.error as e:
+                raise ValueError(f"Invalid url_pattern regex: {e}") from e
+
+        # Validate sort_by
+        valid_sort_options = (None, 'none', 'numeric', 'alpha', 'alpha_desc')
+        if self.sort_by not in valid_sort_options:
+            raise ValueError(
+                f"sort_by must be one of {valid_sort_options}, got '{self.sort_by}'"
+            )
+
+    def matches(self, url: str) -> bool:
+        """
+        Check if this schema's url_pattern matches the start of the given URL.
+
+        Args:
+            url: The URL to check against this schema's pattern.
+
+        Returns:
+            True if url_pattern matches at the start of url, False otherwise.
+        """
+        if self._compiled_pattern is None:  # empty url_pattern is never compiled
+            return False
+        return bool(self._compiled_pattern.match(url))
+
+    def get_filter_config(self) -> Optional[FilterConfig]:
+        """
+        Convert this schema's filter patterns to a FilterConfig.
+
+        Returns:
+            A FilterConfig built from copies of the pattern lists if any filter
+            patterns are defined, None if no filtering is configured.
+        """
+        has_filters = (
+            self.include_patterns
+            or self.exclude_patterns
+            or self.url_include_patterns
+            or self.url_exclude_patterns
+        )
+
+        if not has_filters:
+            return None
+
+        return FilterConfig(
+            filename_include=self.include_patterns.copy(),
+            filename_exclude=self.exclude_patterns.copy(),
+            url_include=self.url_include_patterns.copy(),
+            url_exclude=self.url_exclude_patterns.copy(),
+        )
+
+    def get_sort_key(self) -> Optional[Callable[[str], Any]]:
+        """
+        Get the custom sort key function for this schema, if any.
+
+        Only a user-supplied sort_key is returned. For the built-in sort_by
+        strategies this method returns None and the caller is expected to
+        supply the actual key function.
+
+        Returns:
+            The custom sort_key if one was given, otherwise None.
+        """
+        # A custom key always wins over the sort_by strategy.
+        if self.sort_key is not None:
+            return self.sort_key
+
+        # Default sort keys are handled by the caller (fetcharoo.py), which
+        # lets a schema specify sort_by without a custom key. Without a
+        # custom key there is therefore no callable to return, whatever
+        # sort_by is set to (including None and 'none').
+        return None
+
+    def __str__(self) -> str:
+        """Return a human-readable string representation."""
+        desc = self.description or "No description"
+        return f"{self.name}: {desc}"
diff --git a/tests/test_schemas_base.py b/tests/test_schemas_base.py
new file mode 100644
index 0000000..f2efe08
--- /dev/null
+++ b/tests/test_schemas_base.py
@@ -0,0 +1,309 @@
+"""
+Tests for the SiteSchema base dataclass.
+"""
+
+import unittest
+from fetcharoo.schemas import SiteSchema
+from fetcharoo.filtering import FilterConfig
+
+
+class TestSiteSchemaBasic(unittest.TestCase):
+    """Instantiation tests: field defaults and storage of explicitly passed values."""
+
+    def test_create_minimal_schema(self):
+        """A schema built from only name and url_pattern exposes the default values."""
+        schema = SiteSchema(
+            name='test_schema',
+            url_pattern=r'https://example\.com/.*'
+        )
+        self.assertEqual(schema.name, 'test_schema')
+        self.assertEqual(schema.url_pattern, r'https://example\.com/.*')
+        self.assertIsNone(schema.description)
+        self.assertEqual(schema.include_patterns, [])
+        self.assertEqual(schema.exclude_patterns, [])
+        self.assertIsNone(schema.sort_by)
+        self.assertEqual(schema.recommended_depth, 1)
+        self.assertEqual(schema.request_delay, 0.5)
+        self.assertEqual(schema.version, "1.0.0")
+
+    def test_create_full_schema(self):
+        """Explicitly passed field values are readable back unchanged from the instance."""
+        def custom_sort(url):  # placeholder key; the test only checks it is stored
+            return url
+
+        schema = SiteSchema(
+            name='full_schema',
+            url_pattern=r'https://full\.example\.com/.*',
+            description='A fully configured schema',
+            include_patterns=['*.pdf', 'report*.pdf'],
+            exclude_patterns=['*draft*'],
+            url_include_patterns=['*/docs/*'],  # NOTE: not asserted below
+            url_exclude_patterns=['*/temp/*'],  # NOTE: not asserted below
+            sort_by='numeric',
+            sort_key=custom_sort,
+            default_output_name='output.pdf',
+            recommended_depth=2,
+            request_delay=1.5,
+            test_url='https://full.example.com/test',
+            expected_min_pdfs=5,
+            version='2.0.0'
+        )
+
+        self.assertEqual(schema.name, 'full_schema')
+        self.assertEqual(schema.description, 'A fully configured schema')
+        self.assertEqual(schema.include_patterns, ['*.pdf', 'report*.pdf'])
+        self.assertEqual(schema.exclude_patterns, ['*draft*'])
+        self.assertEqual(schema.sort_by, 'numeric')
+        self.assertEqual(schema.sort_key, custom_sort)
+        self.assertEqual(schema.default_output_name, 'output.pdf')
+        self.assertEqual(schema.recommended_depth, 2)
+        self.assertEqual(schema.request_delay, 1.5)
+        self.assertEqual(schema.test_url, 'https://full.example.com/test')
+        self.assertEqual(schema.expected_min_pdfs, 5)
+        self.assertEqual(schema.version, '2.0.0')
+
+
+class TestSiteSchemaMatches(unittest.TestCase):
+    """Tests for the matches() method across simple, complex and edge-case patterns."""
+
+    def test_matches_simple_pattern(self):
+        """A single-host https pattern accepts that host's URLs and rejects others."""
+        schema = SiteSchema(
+            name='test',
+            url_pattern=r'https://example\.com/.*'
+        )
+        self.assertTrue(schema.matches('https://example.com/'))
+        self.assertTrue(schema.matches('https://example.com/page'))
+        self.assertTrue(schema.matches('https://example.com/docs/file.pdf'))
+        self.assertFalse(schema.matches('https://other.com/'))
+        self.assertFalse(schema.matches('http://example.com/'))  # http vs https
+
+    def test_matches_complex_pattern(self):
+        """A pattern with optional scheme, escaped dots and digit classes matches as expected."""
+        schema = SiteSchema(
+            name='springer',
+            url_pattern=r'https?://link\.springer\.com/book/10\.\d+/.*'
+        )
+        self.assertTrue(schema.matches('https://link.springer.com/book/10.1007/978-3-031-41026-0'))
+        self.assertTrue(schema.matches('http://link.springer.com/book/10.1234/some-book'))
+        self.assertFalse(schema.matches('https://link.springer.com/article/10.1007/something'))
+        self.assertFalse(schema.matches('https://springer.com/book/10.1007/something'))
+
+    def test_matches_with_capture_groups(self):
+        """Capture groups in the pattern do not affect the boolean result of matches()."""
+        schema = SiteSchema(
+            name='arxiv',
+            url_pattern=r'https?://arxiv\.org/(abs|pdf)/(\d+\.\d+)'
+        )
+        self.assertTrue(schema.matches('https://arxiv.org/abs/2301.07041'))
+        self.assertTrue(schema.matches('https://arxiv.org/pdf/2301.07041'))
+        self.assertFalse(schema.matches('https://arxiv.org/list/2301.07041'))  # not abs|pdf
+
+    def test_matches_empty_pattern(self):
+        """An empty url_pattern makes matches() return False for every input."""
+        schema = SiteSchema(name='empty', url_pattern='')
+        # Empty pattern is not compiled, so matches() returns False for all
+        self.assertFalse(schema.matches(''))
+        self.assertFalse(schema.matches('https://example.com'))
+
+    def test_matches_catch_all_pattern(self):
+        """The '.*' pattern matches any input, including the empty string."""
+        schema = SiteSchema(name='generic', url_pattern=r'.*')
+        self.assertTrue(schema.matches('https://example.com'))
+        self.assertTrue(schema.matches('anything'))
+        self.assertTrue(schema.matches(''))
+
+
+class TestSiteSchemaValidation(unittest.TestCase):
+    """Tests for schema validation during initialization."""
+
+    def test_invalid_regex_raises_error(self):
+        """Test that invalid regex pattern raises ValueError."""
+        with self.assertRaises(ValueError) as context:
+            SiteSchema(name='bad', url_pattern=r'[invalid')
+        self.assertIn('Invalid url_pattern regex', str(context.exception))
+
+    def test_invalid_sort_by_raises_error(self):
+        """Test that invalid sort_by value raises ValueError."""
+        with self.assertRaises(ValueError) as context:
+            SiteSchema(
+                name='bad_sort',
+                url_pattern=r'.*',
+                sort_by='invalid_sort_option'
+            )
+        self.assertIn('sort_by must be one of', str(context.exception))
+
+    def test_valid_sort_by_options(self):
+        """Test that all valid sort_by options are accepted."""
+        valid_options = [None, 'none', 'numeric', 'alpha', 'alpha_desc']
+        for option in valid_options:
+            # subTest keeps going after a failure and names the offending option.
+            with self.subTest(sort_by=option):
+                schema = SiteSchema(
+                    name=f'sort_{option}', url_pattern=r'.*', sort_by=option
+                )
+                self.assertEqual(schema.sort_by, option)
+
+
+class TestSiteSchemaGetFilterConfig(unittest.TestCase):
+    """Tests for get_filter_config(): None without filters, FilterConfig otherwise."""
+
+    def test_no_filters_returns_none(self):
+        """A schema with no filter patterns yields None instead of a FilterConfig."""
+        schema = SiteSchema(name='no_filters', url_pattern=r'.*')
+        self.assertIsNone(schema.get_filter_config())
+
+    def test_include_patterns_only(self):
+        """include_patterns alone map to filename_include; filename_exclude stays empty."""
+        schema = SiteSchema(
+            name='include_only',
+            url_pattern=r'.*',
+            include_patterns=['*.pdf', 'report*.pdf']
+        )
+        config = schema.get_filter_config()
+        self.assertIsInstance(config, FilterConfig)
+        self.assertEqual(config.filename_include, ['*.pdf', 'report*.pdf'])
+        self.assertEqual(config.filename_exclude, [])
+
+    def test_exclude_patterns_only(self):
+        """exclude_patterns alone map to filename_exclude; filename_include stays empty."""
+        schema = SiteSchema(
+            name='exclude_only',
+            url_pattern=r'.*',
+            exclude_patterns=['*draft*', '*temp*']
+        )
+        config = schema.get_filter_config()
+        self.assertIsInstance(config, FilterConfig)
+        self.assertEqual(config.filename_include, [])
+        self.assertEqual(config.filename_exclude, ['*draft*', '*temp*'])
+
+    def test_url_patterns(self):
+        """url_include_patterns/url_exclude_patterns map to url_include/url_exclude."""
+        schema = SiteSchema(
+            name='url_filters',
+            url_pattern=r'.*',
+            url_include_patterns=['*/docs/*'],
+            url_exclude_patterns=['*/temp/*']
+        )
+        config = schema.get_filter_config()
+        self.assertIsInstance(config, FilterConfig)
+        self.assertEqual(config.url_include, ['*/docs/*'])
+        self.assertEqual(config.url_exclude, ['*/temp/*'])
+
+    def test_combined_filters(self):
+        """All four pattern lists are carried over to their FilterConfig counterparts."""
+        schema = SiteSchema(
+            name='combined',
+            url_pattern=r'.*',
+            include_patterns=['*.pdf'],
+            exclude_patterns=['*draft*'],
+            url_include_patterns=['*/reports/*'],
+            url_exclude_patterns=['*/archive/*']
+        )
+        config = schema.get_filter_config()
+        self.assertEqual(config.filename_include, ['*.pdf'])
+        self.assertEqual(config.filename_exclude, ['*draft*'])
+        self.assertEqual(config.url_include, ['*/reports/*'])
+        self.assertEqual(config.url_exclude, ['*/archive/*'])
+
+    def test_filter_config_is_copy(self):
+        """Mutating a returned config's list affects neither other configs nor the schema."""
+        schema = SiteSchema(
+            name='copy_test',
+            url_pattern=r'.*',
+            include_patterns=['*.pdf']
+        )
+        config1 = schema.get_filter_config()
+        config2 = schema.get_filter_config()
+
+        # Modify one config
+        config1.filename_include.append('new.pdf')
+
+        # Other config should be unaffected
+        self.assertEqual(config2.filename_include, ['*.pdf'])
+        # Original schema should be unaffected
+        self.assertEqual(schema.include_patterns, ['*.pdf'])
+
+
+class TestSiteSchemaGetSortKey(unittest.TestCase):
+    """Tests for get_sort_key(): a custom key is returned, anything else gives None."""
+
+    def test_no_sort_returns_none(self):
+        """With neither sort_by nor sort_key set, get_sort_key() returns None."""
+        schema = SiteSchema(name='no_sort', url_pattern=r'.*')
+        self.assertIsNone(schema.get_sort_key())
+
+    def test_sort_by_none_returns_none(self):
+        """The string strategy sort_by='none' also results in None."""
+        schema = SiteSchema(name='sort_none', url_pattern=r'.*', sort_by='none')
+        self.assertIsNone(schema.get_sort_key())
+
+    def test_sort_by_without_custom_key_returns_none(self):
+        """Test that sort_by without custom key returns None (handled by caller)."""
+        schema = SiteSchema(name='sort_numeric', url_pattern=r'.*', sort_by='numeric')
+        # The actual sort key implementation is in fetcharoo.py
+        # Schema just specifies the strategy
+        self.assertIsNone(schema.get_sort_key())
+
+    def test_custom_sort_key_returned(self):
+        """A custom sort_key is handed back unchanged by get_sort_key()."""
+        def my_sort_key(url):
+            return len(url)
+
+        schema = SiteSchema(
+            name='custom_sort',
+            url_pattern=r'.*',
+            sort_key=my_sort_key
+        )
+        self.assertEqual(schema.get_sort_key(), my_sort_key)
+
+    def test_custom_sort_key_overrides_sort_by(self):
+        """When both sort_by and sort_key are given, get_sort_key() returns sort_key."""
+        def custom_key(url):
+            return url.lower()
+
+        schema = SiteSchema(
+            name='override',
+            url_pattern=r'.*',
+            sort_by='numeric',  # This would normally use numeric sorting
+            sort_key=custom_key  # But custom key takes precedence
+        )
+        self.assertEqual(schema.get_sort_key(), custom_key)
+
+
+class TestSiteSchemaStr(unittest.TestCase):
+    """Tests for str(schema), which renders as '<name>: <description>'."""
+
+    def test_str_with_description(self):
+        """str() joins the name and the given description with ': '."""
+        schema = SiteSchema(
+            name='my_schema',
+            url_pattern=r'.*',
+            description='A useful schema'
+        )
+        self.assertEqual(str(schema), 'my_schema: A useful schema')
+
+    def test_str_without_description(self):
+        """str() falls back to 'No description' when description is not set."""
+        schema = SiteSchema(name='my_schema', url_pattern=r'.*')
+        self.assertEqual(str(schema), 'my_schema: No description')
+
+
+class TestSiteSchemaEquality(unittest.TestCase):
+    """Tests for equality comparison between SiteSchema instances."""
+
+    def test_equal_schemas(self):
+        """Two schemas constructed with identical arguments compare equal."""
+        schema1 = SiteSchema(name='test', url_pattern=r'.*', sort_by='numeric')
+        schema2 = SiteSchema(name='test', url_pattern=r'.*', sort_by='numeric')
+        self.assertEqual(schema1, schema2)
+
+    def test_different_schemas(self):
+        """Schemas that differ only in name compare unequal."""
+        schema1 = SiteSchema(name='test1', url_pattern=r'.*')
+        schema2 = SiteSchema(name='test2', url_pattern=r'.*')
+        self.assertNotEqual(schema1, schema2)
+
+
+if __name__ == '__main__':
+    unittest.main()  # run the tests in this module when executed as a script