diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 236c70b..3dda315 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -8,11 +8,57 @@ permissions: contents: write jobs: - release: + build: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.13" + + - name: Install build dependencies + run: pip install build + + - name: Build package + run: python -m build + + - name: Upload dist artifacts + uses: actions/upload-artifact@v4 + with: + name: dist + path: dist/ + + publish: + needs: build + runs-on: ubuntu-latest + environment: pypi + permissions: + id-token: write + steps: + - name: Download dist artifacts + uses: actions/download-artifact@v4 + with: + name: dist + path: dist/ + + - name: Publish to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 + + github-release: + needs: build + runs-on: ubuntu-latest + permissions: + contents: write + steps: + - uses: actions/checkout@v4 + + - name: Download dist artifacts + uses: actions/download-artifact@v4 + with: + name: dist + path: dist/ + - name: Extract changelog for this version id: changelog run: | @@ -26,3 +72,4 @@ jobs: uses: softprops/action-gh-release@v2 with: body_path: release_notes.md + files: dist/* diff --git a/.gitignore b/.gitignore index f5fa882..5b6deb7 100644 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,4 @@ build/ .mypy_cache/ .ruff_cache/ .coverage +uv.lock diff --git a/CHANGELOG.md b/CHANGELOG.md index aa1c3a8..dc0da06 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,17 @@ # Changelog +## Unreleased + +### Added + +- `[[tool.check-unicode.overrides]]` per-file config: apply different + allow-lists, severity, and confusable settings per file pattern +- Per-file severity: override `severity` to `"warning"` for specific file + patterns so findings don't affect exit code +- Per-file confusable toggle: enable or disable `check-confusables` per file + pattern +- 
`uv.lock` added to `.gitignore` + ## 0.3.3 - 2026-02-23 ### Fixed diff --git a/pyproject.toml b/pyproject.toml index 5eb4e7e..715de64 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,13 +24,14 @@ classifiers = [ ] dynamic = [ "version" ] optional-dependencies.dev = [ "bump-my-version", "mypy", "pytest", "pytest-cov", "ruff" ] +urls.Changelog = "https://github.com/mit-d/check-unicode/blob/main/CHANGELOG.md" urls.Issues = "https://github.com/mit-d/check-unicode/issues" urls.Repository = "https://github.com/mit-d/check-unicode" scripts.check-unicode = "check_unicode.main:main" [tool.hatch] build.targets.wheel.packages = [ "src/check_unicode" ] -build.targets.sdist.include = [ "docs/check-unicode.1" ] +build.targets.sdist.include = [ "src/", "docs/check-unicode.1" ] version.path = "src/check_unicode/__init__.py" [tool.ruff] diff --git a/src/check_unicode/main.py b/src/check_unicode/main.py index 041fe9c..474d440 100644 --- a/src/check_unicode/main.py +++ b/src/check_unicode/main.py @@ -7,6 +7,7 @@ import sys import textwrap import tomllib +from dataclasses import dataclass from pathlib import Path from typing import Any @@ -54,6 +55,20 @@ } +@dataclass(frozen=True) +class Override: + """Per-file override from [[tool.check-unicode.overrides]].""" + + patterns: tuple[str, ...] + codepoints: frozenset[int] + ranges: tuple[tuple[int, int], ...] 
# Severity names an [[overrides]] entry may use.  Anything else would be
# silently treated as "not an error" by the exit-code logic, so it is rejected.
_OVERRIDE_SEVERITIES = ("error", "warning")


def _override_patterns(entry: dict[str, Any]) -> tuple[str, ...]:
    """Return the glob patterns declared by one [[overrides]] entry.

    A bare string (``files = "*.md"``) is accepted as a single pattern;
    iterating it would otherwise yield one pattern per character, including
    a lone ``"*"`` that matches every file.

    Raises:
        ValueError: if ``files`` is missing, or is neither a string nor a
            list/tuple of strings.

    """
    if "files" not in entry:
        msg = "Each [[overrides]] entry must have a 'files' key"
        raise ValueError(msg)
    files = entry["files"]
    if isinstance(files, str):
        return (files,)
    if not isinstance(files, (list, tuple)) or not all(
        isinstance(pattern, str) for pattern in files
    ):
        msg = "'files' in [[overrides]] must be a string or a list of strings"
        raise ValueError(msg)
    return tuple(files)


def _override_severity(entry: dict[str, Any]) -> str | None:
    """Return the entry's ``severity``, or None when it inherits the global one.

    Raises:
        ValueError: if ``severity`` is set to anything other than
            ``"error"`` or ``"warning"``.

    """
    severity = entry.get("severity")
    if severity is not None and severity not in _OVERRIDE_SEVERITIES:
        msg = (
            f"Invalid severity {severity!r} in [[overrides]]; "
            "expected 'error' or 'warning'"
        )
        raise ValueError(msg)
    return severity


def _build_overrides(config: dict[str, Any]) -> tuple[Override, ...]:
    """Parse [[overrides]] entries from the config into Override objects.

    Entries are returned in declaration order.  Each entry is validated
    (``files`` and ``severity``) before its allow-list is parsed.

    Raises:
        ValueError: if an entry has no usable ``files`` value or an unknown
            ``severity``.

    """
    overrides: list[Override] = []
    for entry in config.get("overrides", []):
        patterns = _override_patterns(entry)
        severity = _override_severity(entry)
        codepoints, ranges, categories, printable_val, scripts = _allow_from_config(
            entry
        )
        # Tri-state: True/False when the entry sets allow-printable,
        # None when the key is absent so the global value is inherited.
        printable: bool | None
        if printable_val:
            printable = True
        elif "allow-printable" in entry:
            printable = False
        else:
            printable = None
        overrides.append(
            Override(
                patterns=patterns,
                codepoints=frozenset(codepoints),
                ranges=tuple(ranges),
                categories=frozenset(categories),
                printable=printable,
                scripts=frozenset(scripts),
                severity=severity,
                check_confusables=entry.get("check-confusables"),
            )
        )
    return tuple(overrides)
def _resolve_file_settings(
    filepath: str,
    global_severity: str,
    *,
    global_confusables: bool,
    overrides: tuple[Override, ...],
) -> tuple[str, bool]:
    """Compute the effective (severity, do_confusables) pair for one file.

    Only overrides whose patterns match *filepath* take part.  For each
    setting the last matching override that sets it explicitly wins; when
    none does, the global value is returned unchanged.
    """
    applicable = [
        rule for rule in overrides if _file_matches_override(filepath, rule)
    ]
    # Searching from the end finds the last explicit value first.
    effective_severity = next(
        (
            rule.severity
            for rule in reversed(applicable)
            if rule.severity is not None
        ),
        global_severity,
    )
    effective_confusables = next(
        (
            rule.check_confusables
            for rule in reversed(applicable)
            if rule.check_confusables is not None
        ),
        global_confusables,
    )
    return effective_severity, effective_confusables
+ """ findings: list[Finding] = [] + has_errors = False for filepath in files: - findings.extend(check_file(filepath, allow)) - if do_confusables: - findings.extend(check_confusables(filepath)) - return findings + file_allow = _resolve_allow_for_file(filepath, allow, overrides) + file_severity, file_confusables = _resolve_file_settings( + filepath, + severity, + global_confusables=do_confusables, + overrides=overrides, + ) + file_findings = check_file(filepath, file_allow) + if file_confusables: + file_findings.extend(check_confusables(filepath)) + if file_findings and file_severity == "error": + has_errors = True + findings.extend(file_findings) + return findings, has_errors def main(argv: list[str] | None = None) -> int: @@ -462,6 +587,7 @@ def main(argv: list[str] | None = None) -> int: severity = args.severity or config.get("severity", "error") allow = _build_allow_config(args, config) do_confusables = args.check_confusables or config.get("check-confusables", False) + overrides = _build_overrides(config) # Filter out excluded files exclude_patterns = _build_exclude_patterns(args, config) @@ -474,16 +600,20 @@ def main(argv: list[str] | None = None) -> int: if args.fix: fixed = [fix_file(filepath) for filepath in files] any_fixed = any(fixed) - all_findings = _scan_files(files, allow, do_confusables=do_confusables) + all_findings, has_errors = _scan_files( + files, allow, overrides, do_confusables=do_confusables, severity=severity + ) if all_findings: print_findings(all_findings, no_color=args.no_color, quiet=args.quiet) return 1 if any_fixed or all_findings else 0 # Check mode - all_findings = _scan_files(files, allow, do_confusables=do_confusables) + all_findings, has_errors = _scan_files( + files, allow, overrides, do_confusables=do_confusables, severity=severity + ) if all_findings: print_findings(all_findings, no_color=args.no_color, quiet=args.quiet) - return 0 if severity == "warning" else 1 + return 1 if has_errors else 0 return 0 diff --git 
class TestOverrides:
    """Tests for [[tool.check-unicode.overrides]] per-file config."""

    @staticmethod
    def _write(directory: Path, name: str, text: str) -> Path:
        """Create *name* in *directory* with UTF-8 *text* and return its path."""
        target = directory / name
        target.write_text(text, encoding="utf-8")
        return target

    @staticmethod
    def _run(config: Path, *files: Path) -> int:
        """Run the CLI on *files* with ``--config config``; return the exit code."""
        return main(["--config", str(config), *(str(f) for f in files)])

    @staticmethod
    def _override(
        *patterns: str,
        codepoints: frozenset[int] = frozenset(),
        ranges: tuple[tuple[int, int], ...] = (),
        categories: frozenset[str] = frozenset(),
        scripts: frozenset[str] = frozenset(),
        printable: bool | None = None,
        severity: str | None = None,
        check_confusables: bool | None = None,
    ) -> Override:
        """Build an Override for *patterns*; unspecified fields are empty/inherit."""
        return Override(
            patterns=patterns,
            codepoints=codepoints,
            ranges=ranges,
            categories=categories,
            printable=printable,
            scripts=scripts,
            severity=severity,
            check_confusables=check_confusables,
        )

    def test_override_extends_global_allow(self, tmp_path: Path) -> None:
        """Override allows an So-category symbol only in the matched file."""
        config = self._write(
            tmp_path,
            "config.toml",
            '[[overrides]]\nfiles = ["*.md"]\nallow-categories = ["So"]\n',
        )
        # U+00A9 COPYRIGHT SIGN is in the So category.
        md_file = self._write(tmp_path, "README.md", "\u00a9 symbol\n")
        py_file = self._write(tmp_path, "code.py", "\u00a9 symbol\n")

        # md file is allowed via override
        assert self._run(config, md_file) == 0
        # py file is not matched, still flagged
        assert self._run(config, py_file) == 1

    def test_multiple_overrides_multiple_patterns(self, tmp_path: Path) -> None:
        """Multiple overrides and multiple patterns per override work."""
        config = self._write(
            tmp_path,
            "config.toml",
            '[[overrides]]\nfiles = ["*.md", "*.txt"]\n'
            'allow-codepoints = ["U+00B0"]\n\n'
            '[[overrides]]\nfiles = ["*.rst"]\n'
            'allow-codepoints = ["U+00A9"]\n',
        )
        md_file = self._write(tmp_path, "doc.md", "72\u00b0F\n")
        txt_file = self._write(tmp_path, "notes.txt", "72\u00b0F\n")
        rst_file = self._write(tmp_path, "doc.rst", "\u00a9 2024\n")

        assert self._run(config, md_file) == 0
        assert self._run(config, txt_file) == 0
        assert self._run(config, rst_file) == 0

    def test_per_file_severity_warning(self, tmp_path: Path) -> None:
        """Override with severity=warning doesn't affect exit code."""
        config = self._write(
            tmp_path,
            "config.toml",
            '[[overrides]]\nfiles = ["*.md"]\nseverity = "warning"\n',
        )
        md_file = self._write(tmp_path, "README.md", "\u201chello\u201d\n")
        py_file = self._write(tmp_path, "code.py", "\u201chello\u201d\n")

        # md has findings but severity=warning -> exit 0
        assert self._run(config, md_file) == 0
        # py has findings with default severity=error -> exit 1
        assert self._run(config, py_file) == 1

    def test_per_file_severity_mixed(self, tmp_path: Path) -> None:
        """Mixed severity: warning file + error file together exits 1."""
        config = self._write(
            tmp_path,
            "config.toml",
            '[[overrides]]\nfiles = ["*.md"]\nseverity = "warning"\n',
        )
        md_file = self._write(tmp_path, "README.md", "\u201chello\u201d\n")
        py_file = self._write(tmp_path, "code.py", "\u201chello\u201d\n")

        # Both files scanned: py is error -> exit 1
        assert self._run(config, md_file, py_file) == 1

    def test_per_file_severity_all_warnings(self, tmp_path: Path) -> None:
        """All files with severity=warning and findings -> exit 0."""
        config = self._write(
            tmp_path,
            "config.toml",
            '[[overrides]]\nfiles = ["*"]\nseverity = "warning"\n',
        )
        txt_file = self._write(tmp_path, "test.txt", "\u201chello\u201d\n")

        assert self._run(config, txt_file) == 0

    def test_per_file_check_confusables_toggle(self, tmp_path: Path) -> None:
        """Override can disable check-confusables for specific files."""
        # Cyrillic is allowed globally so the plain non-ASCII check passes and
        # only the confusable check can produce a finding.
        config = self._write(
            tmp_path,
            "config.toml",
            "check-confusables = true\n"
            'allow-scripts = ["Cyrillic"]\n'
            '[[overrides]]\nfiles = ["*.md"]\ncheck-confusables = false\n',
        )
        # Cyrillic 'a' (U+0430) mixed into Latin text
        md_file = self._write(tmp_path, "doc.md", "p\u0430ssword\n")
        py_file = self._write(tmp_path, "code.py", "p\u0430ssword\n")

        # md: Cyrillic allowed, confusables OFF -> exit 0
        assert self._run(config, md_file) == 0
        # py: Cyrillic allowed, confusables ON -> exit 1 (confusable found)
        assert self._run(config, py_file) == 1

    def test_override_allow_printable(self, tmp_path: Path) -> None:
        """Override with allow-printable=true for specific files."""
        config = self._write(
            tmp_path,
            "config.toml",
            '[[overrides]]\nfiles = ["*.md"]\nallow-printable = true\n',
        )
        md_file = self._write(tmp_path, "doc.md", "caf\u00e9\n")
        py_file = self._write(tmp_path, "code.py", "caf\u00e9\n")

        # md: printable allowed via override -> exit 0
        assert self._run(config, md_file) == 0
        # py: no override match -> exit 1
        assert self._run(config, py_file) == 1

    def test_override_missing_files_key(self) -> None:
        """Override without 'files' key raises ValueError."""
        with pytest.raises(ValueError, match="files"):
            _build_overrides({"overrides": [{"allow-printable": True}]})

    def test_override_no_allow_fields(self, tmp_path: Path) -> None:
        """Override with only 'files' key is valid but a no-op."""
        config = self._write(
            tmp_path,
            "config.toml",
            '[[overrides]]\nfiles = ["*.md"]\n',
        )
        md_file = self._write(tmp_path, "doc.md", "\u201chello\u201d\n")

        # Still flagged: override is a no-op
        assert self._run(config, md_file) == 1

    def test_glob_basename_pattern(self) -> None:
        """Basename patterns like *.md match regardless of path."""
        ovr = self._override("*.md")
        assert _file_matches_override("docs/README.md", ovr) is True
        assert _file_matches_override("README.md", ovr) is True
        assert _file_matches_override("code.py", ovr) is False

    def test_glob_path_pattern(self) -> None:
        """Path patterns like docs/* match against full path."""
        ovr = self._override("docs/*")
        assert _file_matches_override("docs/guide.md", ovr) is True
        assert _file_matches_override("src/main.py", ovr) is False

    def test_glob_exact_name(self) -> None:
        """Exact file name patterns match."""
        ovr = self._override("README.md")
        assert _file_matches_override("README.md", ovr) is True
        assert _file_matches_override("docs/README.md", ovr) is True
        assert _file_matches_override("CHANGELOG.md", ovr) is False

    def test_resolve_allow_merges_additively(self) -> None:
        """Allow-list fields from a matching override are added to the base."""
        base = AllowConfig(
            codepoints=frozenset({0x00B0}),
            categories=frozenset({"Sc"}),
        )
        ovr = self._override(
            "*.md",
            codepoints=frozenset({0x00A9}),
            ranges=((0x2000, 0x200A),),
            categories=frozenset({"So"}),
            scripts=frozenset({"Cyrillic"}),
        )
        result = _resolve_allow_for_file("doc.md", base, (ovr,))
        assert 0x00B0 in result.codepoints
        assert 0x00A9 in result.codepoints
        assert "Sc" in result.categories
        assert "So" in result.categories
        assert (0x2000, 0x200A) in result.ranges
        assert "Cyrillic" in result.scripts
        assert result.printable is False  # not set in override

    def test_resolve_file_settings_last_wins(self) -> None:
        """For scalar settings, last matching override wins."""
        ovr1 = self._override("*", severity="warning", check_confusables=True)
        ovr2 = self._override("*.py", severity="error", check_confusables=False)

        # The globals deliberately differ from ovr2's values, so this passes
        # only if ovr2 is applied, and applied after ovr1.
        sev, conf = _resolve_file_settings(
            "code.py",
            "warning",
            global_confusables=True,
            overrides=(ovr1, ovr2),
        )
        assert sev == "error"  # ovr2 wins
        assert conf is False  # ovr2 wins

        # A file only ovr1 matches gets ovr1's values, not the globals.
        sev, conf = _resolve_file_settings(
            "doc.md",
            "error",
            global_confusables=False,
            overrides=(ovr1, ovr2),
        )
        assert sev == "warning"
        assert conf is True

    def test_global_severity_warning_still_works(self, tmp_path: Path) -> None:
        """Global severity=warning applies to files that no override matches."""
        config = self._write(
            tmp_path,
            "config.toml",
            'severity = "warning"\n'
            '[[overrides]]\nfiles = ["*.md"]\n'
            'allow-codepoints = ["U+00B0"]\n',
        )
        py_file = self._write(tmp_path, "code.py", "\u201chello\u201d\n")

        # Global severity is warning -> exit 0 even for unmatched files
        assert self._run(config, py_file) == 0