Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@ on:
pull_request:
branches: [main]

permissions:
contents: read

jobs:
test:
runs-on: ubuntu-latest
Expand All @@ -19,3 +22,14 @@ jobs:
pip install -e ".[dev]"
- name: Run all tests
run: pytest

integration-test:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Run integration test with docker compose
run: |
docker compose up --build --abort-on-container-exit --exit-code-from github-etl
- name: Cleanup
if: always()
run: docker compose down -v --remove-orphans
4 changes: 2 additions & 2 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Use the latest stable Python image
FROM python:3.11-slim
FROM python:3.14.2-slim

# Set environment variables
ENV PYTHONDONTWRITEBYTECODE=1 \
Expand Down Expand Up @@ -34,4 +34,4 @@ RUN chown -R app:app /app
USER app

# Set the default command
CMD ["python", "main.py"]
CMD ["python", "main.py"]
2 changes: 1 addition & 1 deletion Dockerfile.mock
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Dockerfile for mock GitHub API service
FROM python:3.11-slim
FROM python:3.14.2-slim

WORKDIR /app

Expand Down
38 changes: 30 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -48,13 +48,13 @@ docker run --rm \

### Environment Variables

| Variable | Required | Default | Description |
|----------|----------|---------|-------------|
| `GITHUB_REPOS` | Yes | - | Comma separated repositories in format "owner/repo" (e.g., "mozilla/firefox") |
| `GITHUB_TOKEN` | No | - | GitHub Personal Access Token (recommended to avoid rate limits) |
| `BIGQUERY_PROJECT` | Yes | - | Google Cloud Project ID |
| `BIGQUERY_DATASET` | Yes | - | BigQuery dataset ID |
| `GOOGLE_APPLICATION_CREDENTIALS` | Yes* | - | Path to GCP service account JSON file (*or use Workload Identity) |
| Variable | Required | Default | Description |
| -------------------------------- | -------- | ------- | ----------------------------------------------------------------------------- |
| `GITHUB_REPOS` | Yes | - | Comma separated repositories in format "owner/repo" (e.g., "mozilla/firefox") |
| `GITHUB_TOKEN` | No | - | GitHub Personal Access Token (recommended to avoid rate limits) |
| `BIGQUERY_PROJECT` | Yes | - | Google Cloud Project ID |
| `BIGQUERY_DATASET` | Yes | - | BigQuery dataset ID |
| `GOOGLE_APPLICATION_CREDENTIALS` | Yes\* | - | Path to GCP service account JSON file (\*or use Workload Identity) |

## Architecture

Expand All @@ -66,7 +66,7 @@ docker run --rm \

### Container Specifications

- **Base Image**: `python:3.11-slim` (latest stable Python)
- **Base Image**: `python:3.14.2-slim` (latest stable Python)
- **User**: `app` (uid: 1000, gid: 1000)
- **Working Directory**: `/app`
- **Ownership**: All files in `/app` are owned by the `app` user
Expand Down Expand Up @@ -157,6 +157,28 @@ This setup includes:
- **BigQuery Emulator**: Local BigQuery instance for testing
- **ETL Service**: Configured to use both mock services

### Running Tests

The project includes a test suite using pytest. Tests are in the `tests/` directory.

Install development dependencies:

```bash
pip install -e ".[dev]"
```

Run tests:

```bash
pytest
```

Run with coverage:

```bash
pytest --cov=. --cov-report=html
```

### Adding Dependencies

Add new Python packages to `requirements.txt` and rebuild the Docker image.
Expand Down
13 changes: 3 additions & 10 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@

BUG_RE = re.compile(r"\b(?:bug|b=)\s*#?(\d+)\b", re.I)

logger = logging.getLogger(__name__)


def setup_logging() -> None:
"""Configure logging for the ETL process."""
Expand Down Expand Up @@ -53,7 +55,6 @@ def extract_pull_requests(
Yields:
List of pull request dictionaries (up to chunk_size items)
"""
logger = logging.getLogger(__name__)
logger.info("Starting data extraction from GitHub repositories")

# Support custom API URL for mocking/testing
Expand Down Expand Up @@ -155,7 +156,6 @@ def extract_commits(
Returns:
List of commit dictionaries for the pull request
"""
logger = logging.getLogger(__name__)
logger.info(f"Extracting commits for PR #{pr_number}")

# Support custom API URL for mocking/testing
Expand Down Expand Up @@ -207,7 +207,6 @@ def extract_reviewers(
Returns:
List of reviewer dictionaries for the pull request
"""
logger = logging.getLogger(__name__)
logger.info(f"Extracting reviewers for PR #{pr_number}")

# Support custom API URL for mocking/testing
Expand Down Expand Up @@ -249,7 +248,6 @@ def extract_comments(
Returns:
List of comment dictionaries for the pull request
"""
logger = logging.getLogger(__name__)
logger.info(f"Extracting comments for PR #{pr_number}")

# Support custom API URL for mocking/testing
Expand Down Expand Up @@ -295,7 +293,6 @@ def transform_data(raw_data: list[dict], repo: str) -> dict:
Returns:
List of transformed pull requests, commits, reviewers, and comments ready for BigQuery
"""
logger = logging.getLogger(__name__)
logger.info(f"Starting data transformation for {len(raw_data)} PRs")

transformed_data: dict = {
Expand Down Expand Up @@ -426,8 +423,6 @@ def load_data(
transformed_data: Dictionary containing tables ('pull_requests',
'commits', 'reviewers', 'comments') mapped to lists of row dictionaries
"""
logger = logging.getLogger(__name__)

if not transformed_data:
logger.warning("No data to load, skipping")
return
Expand Down Expand Up @@ -475,8 +470,6 @@ def main() -> int:
4. Repeat until no more data
"""
setup_logging()
logger = logging.getLogger(__name__)

logger.info("Starting GitHub ETL process with chunked processing")

github_token = os.environ.get("GITHUB_TOKEN")
Expand Down Expand Up @@ -529,7 +522,7 @@ def main() -> int:
github_repos = []
github_repos_str = os.getenv("GITHUB_REPOS")
if github_repos_str:
github_repos = github_repos_str.split(",")
github_repos = [r.strip() for r in github_repos_str.split(",") if r.strip()]
else:
raise SystemExit(
"Environment variable GITHUB_REPOS is required (format: 'owner/repo,owner/repo')"
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ dependencies = [
[project.optional-dependencies]
dev = [
"pytest>=7.0.0",
"pytest-cov>=7.0.0",
"ruff>=0.14.14",
"black>=24.0.0",
]
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# This file is autogenerated by pip-compile with Python 3.14
# by the following command:
#
# pip-compile --generate-hashes pyproject.toml
# pip-compile --generate-hashes --output-file=requirements.txt pyproject.toml
#
certifi==2026.1.4 \
--hash=sha256:9943707519e4add1115f44c2bc244f782c0249876bf51b6599fee1ffbedd685c \
Expand Down
20 changes: 20 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
from unittest.mock import MagicMock, Mock

import pytest
import requests
from google.cloud import bigquery


@pytest.fixture
def mock_session() -> Mock:
session = Mock(spec=requests.Session)
session.headers = {}
return session


@pytest.fixture
def mock_bigquery_client() -> Mock:
client = Mock(spec=bigquery.Client)
client.project = "test-project"
client.insert_rows_json = MagicMock(return_value=[]) # Empty list = no errors
return client
38 changes: 38 additions & 0 deletions tests/test_extract_comments.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
#!/usr/bin/env python3
from unittest.mock import Mock, patch
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We install pytest-mock but we don't seem to use the provided test fixtures anywhere. We're instead importing from unittest.mock directly.

We should get rid of pytest-mock, or switch occurrences to use the mocker pytest fixtures in tests.


import pytest

import main


@patch("main.sleep_for_rate_limit")
def test_rate_limit_handling_comments(mock_sleep, mock_session):
"""Test rate limit handling when fetching comments."""
rate_limit_response = Mock()
rate_limit_response.status_code = 403
rate_limit_response.headers = {"X-RateLimit-Remaining": "0"}

success_response = Mock()
success_response.status_code = 200
success_response.json.return_value = []

mock_session.get.side_effect = [rate_limit_response, success_response]

main.extract_comments(mock_session, "mozilla/firefox", 123)

mock_sleep.assert_called_once()


def test_api_error_comments(mock_session):
"""Test API error handling when fetching comments."""
error_response = Mock()
error_response.status_code = 404
error_response.text = "Not Found"

mock_session.get.return_value = error_response

with pytest.raises(SystemExit) as exc_info:
main.extract_comments(mock_session, "mozilla/firefox", 123)

assert "GitHub API error 404" in str(exc_info.value)
123 changes: 123 additions & 0 deletions tests/test_extract_commits.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
#!/usr/bin/env python3
from unittest.mock import Mock, patch

import pytest

import main


def test_extract_commits_with_files(mock_session):
"""Test that file details are fetched per commit and merged into commit data."""
commits_response = Mock()
commits_response.status_code = 200
commits_response.json.return_value = [
{"sha": "abc123"},
{"sha": "def456"},
]

commit_detail_1 = Mock()
commit_detail_1.status_code = 200
commit_detail_1.json.return_value = {
"sha": "abc123",
"files": [{"filename": "file1.py", "additions": 10}],
}

commit_detail_2 = Mock()
commit_detail_2.status_code = 200
commit_detail_2.json.return_value = {
"sha": "def456",
"files": [{"filename": "file2.py", "deletions": 5}],
}

mock_session.get.side_effect = [
commits_response,
commit_detail_1,
commit_detail_2,
]

result = main.extract_commits(mock_session, "mozilla/firefox", 123)

# Verify the individual commit detail endpoints were fetched
assert mock_session.get.call_count == 3
calls = mock_session.get.call_args_list
assert "commits/abc123" in calls[1][0][0]
assert "commits/def456" in calls[2][0][0]

# Verify file data from detail responses is merged into each commit
assert result[0]["files"][0]["filename"] == "file1.py"
assert result[1]["files"][0]["filename"] == "file2.py"


def test_multiple_files_per_commit(mock_session):
"""Test that all files from a commit detail response are merged into the commit."""
commits_response = Mock()
commits_response.status_code = 200
commits_response.json.return_value = [{"sha": "abc123"}]

commit_detail = Mock()
commit_detail.status_code = 200
commit_detail.json.return_value = {
"sha": "abc123",
"files": [
{"filename": "file1.py", "additions": 10},
{"filename": "file2.py", "additions": 20},
{"filename": "file3.py", "deletions": 5},
],
}

mock_session.get.side_effect = [commits_response, commit_detail]

result = main.extract_commits(mock_session, "mozilla/firefox", 123)

assert len(result[0]["files"]) == 3


@patch("main.sleep_for_rate_limit")
def test_rate_limit_on_commits_list(mock_sleep, mock_session):
"""Test rate limit handling when fetching commits list."""
rate_limit_response = Mock()
rate_limit_response.status_code = 403
rate_limit_response.headers = {"X-RateLimit-Remaining": "0"}

success_response = Mock()
success_response.status_code = 200
success_response.json.return_value = []

mock_session.get.side_effect = [rate_limit_response, success_response]

result = main.extract_commits(mock_session, "mozilla/firefox", 123)

mock_sleep.assert_called_once()
assert result == []


def test_api_error_on_commits_list(mock_session):
"""Test API error handling when fetching commits list."""
error_response = Mock()
error_response.status_code = 500
error_response.text = "Internal Server Error"

mock_session.get.return_value = error_response

with pytest.raises(SystemExit) as exc_info:
main.extract_commits(mock_session, "mozilla/firefox", 123)

assert "GitHub API error 500" in str(exc_info.value)


def test_api_error_on_individual_commit(mock_session):
"""Test API error when fetching individual commit details."""
commits_response = Mock()
commits_response.status_code = 200
commits_response.json.return_value = [{"sha": "abc123"}]

commit_error = Mock()
commit_error.status_code = 404
commit_error.text = "Commit not found"

mock_session.get.side_effect = [commits_response, commit_error]

with pytest.raises(SystemExit) as exc_info:
main.extract_commits(mock_session, "mozilla/firefox", 123)

assert "GitHub API error 404" in str(exc_info.value)
Loading