mozilla-conduit · dklawren · Jan 21, 2026 · Jan 21, 2026 · Jan 21, 2026 · Jan 21, 2026
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -4,6 +4,9 @@ on:
   pull_request:
     branches: [main]
 
+permissions:
+  contents: read
+
 jobs:
   test:
     runs-on: ubuntu-latest
@@ -19,3 +22,14 @@ jobs:
           pip install -e ".[dev]"
       - name: Run all tests
         run: pytest
+
+  integration-test:  # builds and runs the docker compose stack on the runner
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Run integration test with docker compose  # exit status taken from github-etl
+        run: |
+          docker compose up --build --abort-on-container-exit --exit-code-from github-etl
+      - name: Cleanup
+        if: always()  # run the teardown even when an earlier step failed
+        run: docker compose down -v --remove-orphans
diff --git a/Dockerfile b/Dockerfile
@@ -1,5 +1,5 @@
 # Use the latest stable Python image
-FROM python:3.11-slim
+FROM python:3.14.2-slim
 
 # Set environment variables
 ENV PYTHONDONTWRITEBYTECODE=1 \
@@ -34,4 +34,4 @@ RUN chown -R app:app /app
 USER app
 
 # Set the default command
-CMD ["python", "main.py"]
+CMD ["python", "main.py"]
diff --git a/Dockerfile.mock b/Dockerfile.mock
@@ -1,5 +1,5 @@
 # Dockerfile for mock GitHub API service
-FROM python:3.11-slim
+FROM python:3.14.2-slim
 
 WORKDIR /app
 

diff --git a/README.md b/README.md
@@ -48,13 +48,13 @@ docker run --rm \
 
 ### Environment Variables
 
-| Variable | Required | Default | Description |
-|----------|----------|---------|-------------|
-| `GITHUB_REPOS` | Yes | - | Comma separated repositories in format "owner/repo" (e.g., "mozilla/firefox") |
-| `GITHUB_TOKEN` | No | - | GitHub Personal Access Token (recommended to avoid rate limits) |
-| `BIGQUERY_PROJECT` | Yes | - | Google Cloud Project ID |
-| `BIGQUERY_DATASET` | Yes | - | BigQuery dataset ID |
-| `GOOGLE_APPLICATION_CREDENTIALS` | Yes* | - | Path to GCP service account JSON file (*or use Workload Identity) |
+| Variable                         | Required | Default | Description                                                                   |
+| -------------------------------- | -------- | ------- | ----------------------------------------------------------------------------- |
+| `GITHUB_REPOS`                   | Yes      | -       | Comma separated repositories in format "owner/repo" (e.g., "mozilla/firefox") |
+| `GITHUB_TOKEN`                   | No       | -       | GitHub Personal Access Token (recommended to avoid rate limits)               |
+| `BIGQUERY_PROJECT`               | Yes      | -       | Google Cloud Project ID                                                       |
+| `BIGQUERY_DATASET`               | Yes      | -       | BigQuery dataset ID                                                           |
+| `GOOGLE_APPLICATION_CREDENTIALS` | Yes\*    | -       | Path to GCP service account JSON file (\*or use Workload Identity)            |
 
 ## Architecture
 
@@ -66,7 +66,7 @@ docker run --rm \
 
 ### Container Specifications
 
-- **Base Image**: `python:3.11-slim` (latest stable Python)
+- **Base Image**: `python:3.14.2-slim` (latest stable Python)
 - **User**: `app` (uid: 1000, gid: 1000)
 - **Working Directory**: `/app`
 - **Ownership**: All files in `/app` are owned by the `app` user
@@ -157,6 +157,28 @@ This setup includes:
 - **BigQuery Emulator**: Local BigQuery instance for testing
 - **ETL Service**: Configured to use both mock services
 
+### Running Tests
+
+The project includes a test suite using pytest. Tests are in the `tests/` directory.
+
+Install development dependencies:
+
+```bash
+pip install -e ".[dev]"
+```
+
+Run tests:
+
+```bash
+pytest
+```
+
+Run with coverage:
+
+```bash
+pytest --cov=. --cov-report=html
+```
+
 ### Adding Dependencies
 
 Add new Python packages to `requirements.txt` and rebuild the Docker image.

diff --git a/main.py b/main.py
@@ -22,6 +22,8 @@
 
 BUG_RE = re.compile(r"\b(?:bug|b=)\s*#?(\d+)\b", re.I)
 
+logger = logging.getLogger(__name__)
+
 
 def setup_logging() -> None:
     """Configure logging for the ETL process."""
@@ -53,7 +55,6 @@ def extract_pull_requests(
     Yields:
         List of pull request dictionaries (up to chunk_size items)
     """
-    logger = logging.getLogger(__name__)
     logger.info("Starting data extraction from GitHub repositories")
 
     # Support custom API URL for mocking/testing
@@ -155,7 +156,6 @@ def extract_commits(
     Returns:
         List of commit dictionaries for the pull request
     """
-    logger = logging.getLogger(__name__)
     logger.info(f"Extracting commits for PR #{pr_number}")
 
     # Support custom API URL for mocking/testing
@@ -207,7 +207,6 @@ def extract_reviewers(
     Returns:
         List of reviewer dictionaries for the pull request
     """
-    logger = logging.getLogger(__name__)
     logger.info(f"Extracting reviewers for PR #{pr_number}")
 
     # Support custom API URL for mocking/testing
@@ -249,7 +248,6 @@ def extract_comments(
     Returns:
         List of comment dictionaries for the pull request
     """
-    logger = logging.getLogger(__name__)
     logger.info(f"Extracting comments for PR #{pr_number}")
 
     # Support custom API URL for mocking/testing
@@ -295,7 +293,6 @@ def transform_data(raw_data: list[dict], repo: str) -> dict:
     Returns:
         List of transformed pull requests, commits, reviewers, and comments ready for BigQuery
     """
-    logger = logging.getLogger(__name__)
     logger.info(f"Starting data transformation for {len(raw_data)} PRs")
 
     transformed_data: dict = {
@@ -426,8 +423,6 @@ def load_data(
         transformed_data: Dictionary containing tables ('pull_requests',
             'commits', 'reviewers', 'comments') mapped to lists of row dictionaries
     """
-    logger = logging.getLogger(__name__)
-
     if not transformed_data:
         logger.warning("No data to load, skipping")
         return
@@ -475,8 +470,6 @@ def main() -> int:
     4. Repeat until no more data
     """
     setup_logging()
-    logger = logging.getLogger(__name__)
-
     logger.info("Starting GitHub ETL process with chunked processing")
 
     github_token = os.environ.get("GITHUB_TOKEN")
@@ -529,7 +522,7 @@ def main() -> int:
     github_repos = []
     github_repos_str = os.getenv("GITHUB_REPOS")
     if github_repos_str:
-        github_repos = github_repos_str.split(",")
+        github_repos = [r.strip() for r in github_repos_str.split(",") if r.strip()]
     else:
         raise SystemExit(
             "Environment variable GITHUB_REPOS is required (format: 'owner/repo,owner/repo')"

diff --git a/pyproject.toml b/pyproject.toml
@@ -25,6 +25,7 @@ dependencies = [
 [project.optional-dependencies]
 dev = [
     "pytest>=7.0.0",
+    "pytest-cov>=7.0.0",
     "ruff>=0.14.14",
     "black>=24.0.0",
 ]

diff --git a/requirements.txt b/requirements.txt
@@ -2,7 +2,7 @@
 # This file is autogenerated by pip-compile with Python 3.14
 # by the following command:
 #
-#    pip-compile --generate-hashes pyproject.toml
+#    pip-compile --generate-hashes --output-file=requirements.txt pyproject.toml
 #
 certifi==2026.1.4 \
     --hash=sha256:9943707519e4add1115f44c2bc244f782c0249876bf51b6599fee1ffbedd685c \

diff --git a/tests/conftest.py b/tests/conftest.py
@@ -0,0 +1,20 @@
+from unittest.mock import MagicMock, Mock
+
+import pytest
+import requests
+from google.cloud import bigquery
+
+
+@pytest.fixture
+def mock_session() -> Mock:  # fake requests.Session handed to the extract_* tests
+    session = Mock(spec=requests.Session)  # rejects attributes Session lacks
+    session.headers = {}  # plain dict standing in for the session headers
+    return session
+
+
+@pytest.fixture
+def mock_bigquery_client() -> Mock:  # fake bigquery.Client reporting no insert errors
+    client = Mock(spec=bigquery.Client)  # rejects attributes Client lacks
+    client.project = "test-project"
+    client.insert_rows_json = MagicMock(return_value=[])  # Empty list = no errors
+    return client
diff --git a/tests/test_extract_comments.py b/tests/test_extract_comments.py
@@ -0,0 +1,38 @@
+#!/usr/bin/env python3
+from unittest.mock import Mock, patch
+
+import pytest
+
+import main
+
+
+@patch("main.sleep_for_rate_limit")
+def test_rate_limit_handling_comments(mock_sleep, mock_session):
+    """A 403 with no remaining quota on the comments call sleeps exactly once."""
+    rate_limit_response = Mock()
+    rate_limit_response.status_code = 403
+    rate_limit_response.headers = {"X-RateLimit-Remaining": "0"}  # quota exhausted
+
+    success_response = Mock()
+    success_response.status_code = 200
+    success_response.json.return_value = []  # no comments in the body
+
+    mock_session.get.side_effect = [rate_limit_response, success_response]
+
+    main.extract_comments(mock_session, "mozilla/firefox", 123)  # result not asserted
+
+    mock_sleep.assert_called_once()  # patched above, so the test never really waits
+
+
+def test_api_error_comments(mock_session):
+    """A 404 while fetching comments raises SystemExit naming the status code."""
+    error_response = Mock()
+    error_response.status_code = 404
+    error_response.text = "Not Found"
+
+    mock_session.get.return_value = error_response  # every GET returns the 404
+
+    with pytest.raises(SystemExit) as exc_info:
+        main.extract_comments(mock_session, "mozilla/firefox", 123)
+
+    assert "GitHub API error 404" in str(exc_info.value)  # message carries the code
diff --git a/tests/test_extract_commits.py b/tests/test_extract_commits.py
@@ -0,0 +1,123 @@
+#!/usr/bin/env python3
+from unittest.mock import Mock, patch
+
+import pytest
+
+import main
+
+
+def test_extract_commits_with_files(mock_session):
+    """Test that file details are fetched per commit and merged into commit data."""
+    commits_response = Mock()
+    commits_response.status_code = 200
+    commits_response.json.return_value = [  # list response carries SHAs only
+        {"sha": "abc123"},
+        {"sha": "def456"},
+    ]
+
+    commit_detail_1 = Mock()
+    commit_detail_1.status_code = 200
+    commit_detail_1.json.return_value = {  # detail response adds "files"
+        "sha": "abc123",
+        "files": [{"filename": "file1.py", "additions": 10}],
+    }
+
+    commit_detail_2 = Mock()
+    commit_detail_2.status_code = 200
+    commit_detail_2.json.return_value = {
+        "sha": "def456",
+        "files": [{"filename": "file2.py", "deletions": 5}],
+    }
+
+    mock_session.get.side_effect = [  # returned one per GET, in this order
+        commits_response,
+        commit_detail_1,
+        commit_detail_2,
+    ]
+
+    result = main.extract_commits(mock_session, "mozilla/firefox", 123)
+
+    # Verify the individual commit detail endpoints were fetched
+    assert mock_session.get.call_count == 3  # 1 list request + 2 detail requests
+    calls = mock_session.get.call_args_list
+    assert "commits/abc123" in calls[1][0][0]  # [0][0] = first positional argument
+    assert "commits/def456" in calls[2][0][0]
+
+    # Verify file data from detail responses is merged into each commit
+    assert result[0]["files"][0]["filename"] == "file1.py"
+    assert result[1]["files"][0]["filename"] == "file2.py"
+
+
+def test_multiple_files_per_commit(mock_session):
+    """Test that all files from a commit detail response are merged into the commit."""
+    commits_response = Mock()
+    commits_response.status_code = 200
+    commits_response.json.return_value = [{"sha": "abc123"}]  # a single commit
+
+    commit_detail = Mock()
+    commit_detail.status_code = 200
+    commit_detail.json.return_value = {  # detail response listing three files
+        "sha": "abc123",
+        "files": [
+            {"filename": "file1.py", "additions": 10},
+            {"filename": "file2.py", "additions": 20},
+            {"filename": "file3.py", "deletions": 5},
+        ],
+    }
+
+    mock_session.get.side_effect = [commits_response, commit_detail]
+
+    result = main.extract_commits(mock_session, "mozilla/firefox", 123)
+
+    assert len(result[0]["files"]) == 3  # all three detail files survive the merge
+
+
+@patch("main.sleep_for_rate_limit")
+def test_rate_limit_on_commits_list(mock_sleep, mock_session):
+    """A 403 with no remaining quota on the commits list sleeps once, then yields []."""
+    rate_limit_response = Mock()
+    rate_limit_response.status_code = 403
+    rate_limit_response.headers = {"X-RateLimit-Remaining": "0"}  # quota exhausted
+
+    success_response = Mock()
+    success_response.status_code = 200
+    success_response.json.return_value = []  # empty commit list
+
+    mock_session.get.side_effect = [rate_limit_response, success_response]
+
+    result = main.extract_commits(mock_session, "mozilla/firefox", 123)
+
+    mock_sleep.assert_called_once()  # patched above, so the test never really waits
+    assert result == []
+
+
+def test_api_error_on_commits_list(mock_session):
+    """A 500 on the commits list raises SystemExit naming the status code."""
+    error_response = Mock()
+    error_response.status_code = 500
+    error_response.text = "Internal Server Error"
+
+    mock_session.get.return_value = error_response  # every GET returns the 500
+
+    with pytest.raises(SystemExit) as exc_info:
+        main.extract_commits(mock_session, "mozilla/firefox", 123)
+
+    assert "GitHub API error 500" in str(exc_info.value)  # message carries the code
+
+
+def test_api_error_on_individual_commit(mock_session):
+    """A 404 on a per-commit detail request raises SystemExit naming the status."""
+    commits_response = Mock()
+    commits_response.status_code = 200
+    commits_response.json.return_value = [{"sha": "abc123"}]  # list request succeeds
+
+    commit_error = Mock()
+    commit_error.status_code = 404
+    commit_error.text = "Commit not found"
+
+    mock_session.get.side_effect = [commits_response, commit_error]  # list, then detail
+
+    with pytest.raises(SystemExit) as exc_info:
+        main.extract_commits(mock_session, "mozilla/firefox", 123)
+
+    assert "GitHub API error 404" in str(exc_info.value)  # message carries the code