diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 0000000..236c70b --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,28 @@ +name: Release + +on: + push: + tags: ["v*"] + +permissions: + contents: write + +jobs: + release: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Extract changelog for this version + id: changelog + run: | + version="${GITHUB_REF_NAME#v}" + # Extract everything between this version's header and the next + awk "/^## ${version}/{found=1; next} /^## [0-9]/{if(found) exit} found" \ + CHANGELOG.md > release_notes.md + cat release_notes.md + + - name: Create GitHub Release + uses: softprops/action-gh-release@v2 + with: + body_path: release_notes.md diff --git a/CHANGELOG.md b/CHANGELOG.md index 5bb34a6..5b8b93c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,6 @@ # Changelog -## 0.2.0 +## 0.2.0 - 2026-02-19 ### Added @@ -20,7 +20,7 @@ printable i18n text. - TOML config keys: `allow-printable`, `allow-scripts`, `check-confusables`. -## 0.1.0 +## 0.1.0 - 2026-02-18 Initial release. diff --git a/docs/check-unicode.1 b/docs/check-unicode.1 new file mode 100644 index 0000000..019edb7 --- /dev/null +++ b/docs/check-unicode.1 @@ -0,0 +1,357 @@ +.\" Man page for check-unicode +.\" Generate with: man ./docs/check-unicode.1 +.TH CHECK\-UNICODE 1 "2026-02-20" "check-unicode 0.2.0" "User Commands" +. +.SH NAME +check\-unicode \- detect and fix non\-ASCII Unicode characters in text files +. +.SH SYNOPSIS +.B check\-unicode +.RI [ OPTIONS ] +.IR FILE ... +. +.SH DESCRIPTION +.B check\-unicode +scans text files for non\-ASCII Unicode characters and reports their location, +codepoint, name, and category. +It is designed to catch copy\-paste artifacts such as smart quotes, em dashes, +fancy spaces, and dangerous invisible characters +(Trojan Source bidi attacks, zero\-width chars). +.PP +In +.B \-\-fix +mode it replaces known offenders with ASCII equivalents. 
+Dangerous invisible characters are never auto\-fixed and always require manual +review. +.PP +.B check\-unicode +is commonly used as a +.BR pre\-commit (1) +hook but also works as a standalone CLI tool. +. +.SH POSITIONAL ARGUMENTS +.TP +.I FILE ... +One or more files to check. +At least one file is required; the program exits with code\ 2 if none are +provided. +. +.SH OPTIONS +.SS Mode +.TP +.B \-\-fix +Replace known offenders (smart quotes, en/em dashes, fancy spaces, ellipsis) +with their ASCII equivalents using an atomic write (temp file + rename). +Exits\ 1 if any file was modified. +Dangerous invisible characters are never auto\-fixed. +.TP +.BR \-V ", " \-\-version +Print the program version and exit. +. +.SS Allow\-list options +These flags suppress findings for specific characters. +They extend (never replace) any values set in the config file. +Dangerous invisible characters are always flagged unless explicitly allowed by +.BR \-\-allow\-codepoint . +.TP +.BI \-\-allow\-range " RANGE" +Allow a Unicode range. +The format is +.IR U+XXXX\-U+YYYY . +May be repeated for multiple ranges. +.RS +.PP +Example: +.B \-\-allow\-range U+00A0\-U+00FF +.RE +.TP +.BI \-\-allow\-codepoint " CP" +Allow specific Unicode codepoints. +Accepts +.I U+XXXX +notation, comma\-separated and/or repeated. +This is the +.B only +flag that can suppress dangerous invisible characters. +.RS +.PP +Example: +.B \-\-allow\-codepoint U+00B0,U+00A9 +.RE +.TP +.BI \-\-allow\-category " CAT" +Allow a Unicode general category. +May be repeated for multiple categories. +Use +.B \-\-list\-categories +to see all valid values. +.RS +.PP +Example: +.B \-\-allow\-category Sc +(Symbol, currency) +.RE +.TP +.B \-\-allow\-printable +Allow all printable non\-ASCII characters. +Only invisible and control characters will be flagged. +.TP +.BI \-\-allow\-script " SCRIPT" +Allow all characters from a Unicode script. +May be repeated. +Script names are case\-insensitive and normalized to title case. 
+Use +.B \-\-list\-scripts +to see all valid names. +.RS +.PP +Example: +.B \-\-allow\-script Cyrillic \-\-allow\-script Greek +.RE +.TP +.B \-\-list\-categories +Print all 30 Unicode general categories with descriptions and examples, +then exit. +Useful for discovering valid values for +.BR \-\-allow\-category . +.TP +.B \-\-list\-scripts +Print all known Unicode script names, then exit. +Useful for discovering valid values for +.BR \-\-allow\-script . +. +.SS Detection options +.TP +.B \-\-check\-confusables +Detect mixed\-script homoglyph/confusable characters, such as a Cyrillic +.B a +(U+0430) mixed into a Latin identifier. +This check is +.B not +suppressed by +.BR \-\-allow\-script . +. +.SS Output options +.TP +.BI \-\-severity " LEVEL" +Set exit\-code behavior. +.I LEVEL +must be +.B error +(exit\ 1 on findings) or +.B warning +(print findings but exit\ 0). +Default: +.BR error . +.TP +.B \-\-no\-color +Disable ANSI color output. +Color is also disabled when the +.B NO_COLOR +environment variable is set or stdout is not a TTY. +.TP +.BR \-q ", " \-\-quiet +Print the summary line only; suppress per\-finding details. +. +.SS Configuration +.TP +.BI \-\-config " FILE" +Path to a TOML config file. +If omitted, the program auto\-discovers +.I .check\-unicode.toml +in the current directory, or +.I [tool.check\-unicode] +in +.IR pyproject.toml . +.TP +.BI \-\-exclude\-pattern " PATTERN" +Exclude files matching a glob pattern. +May be repeated. +Extends any +.B exclude\-patterns +set in the config file. +Patterns are matched against both the full path and the basename. +.RS +.PP +Example: +.B \-\-exclude\-pattern '*.min.js' \-\-exclude\-pattern 'vendor/*' +.RE +. +.SH CONFIGURATION FILE +Settings can be stored in +.I .check\-unicode.toml +(standalone) or under the +.B [tool.check\-unicode] +table in +.IR pyproject.toml . +CLI flags always extend config\-file values; they never replace them. 
+.PP +.nf +.RS +[tool.check\-unicode] +allow\-codepoints = ["U+00B0", "U+2192"] +allow\-ranges = ["U+00A0\-U+00FF"] +allow\-categories = ["Sc"] +allow\-printable = true +allow\-scripts = ["Latin", "Cyrillic"] +check\-confusables = true +severity = "error" +exclude\-patterns = ["*.min.js", "vendor/*"] +.RE +.fi +. +.SH EXIT CODES +.TP +.B 0 +No findings were detected, or +.B \-\-severity=warning +was used. +.TP +.B 1 +Non\-ASCII findings were detected, or files were modified in +.B \-\-fix +mode. +.TP +.B 2 +Usage error (invalid arguments, no files specified, etc.). +. +.SH WHAT IT CATCHES +.SS Copy\-paste artifacts (fixable with \-\-fix) +.TP +.B Smart quotes +\(lq\(rq \(oq\(cq and variants \(-> replaced with ASCII quotes +.TP +.B Dashes +Em dash (U+2014), en dash (U+2013), minus sign (U+2212) \(-> replaced with +.B \-\- +or +.BR \- . +.TP +.B Fancy spaces +Non\-breaking space, em space, thin space, and 14 other Unicode space characters +\(-> replaced with a regular space. +.TP +.B Ellipsis +Horizontal ellipsis (U+2026) \(-> replaced with +.BR ... . +. +.SS Dangerous invisible characters (never auto\-fixed) +.TP +.B Bidi controls (Trojan Source CVE\-2021\-42574) +U+202A\-202E (embedding/override), U+2066\-2069 (isolate). +These can make source code appear to do something different from what it +actually does. +.TP +.B Zero\-width characters +U+200B\-200F, U+FEFF (mid\-file BOM), U+2060\-2064, U+180E. +Invisible characters that can break identifiers or hide malicious code. +.TP +.B Replacement character +U+FFFD, usually indicates an encoding error. +. +.SS Confusable homoglyphs (with \-\-check\-confusables) +Mixed\-script identifiers where minority\-script characters visually resemble +Latin letters (e.g.\& Cyrillic +.I a +U+0430 in a Latin word). +. 
+.SH OUTPUT FORMAT +For each finding, the program prints the file, line, column, codepoint, +Unicode name, and general category: +.PP +.nf +.RS +path/to/file.txt:42:17: U+201C LEFT DOUBLE QUOTATION MARK [Pi] + He said \(lqhello\(rq to the crowd + ^ +.RE +.fi +.PP +After all findings, a summary line is printed: +.PP +.nf +.RS +Found 5 non\-ASCII characters in 2 files (3 fixable, 1 dangerous) +.RE +.fi +. +.SH ENVIRONMENT +.TP +.B NO_COLOR +If set (to any value), ANSI color output is disabled. +See +.IR https://no\-color.org/ . +. +.SH EXAMPLES +Check all Python files in a project: +.PP +.RS +.B check\-unicode src/**/*.py +.RE +.PP +Auto\-fix smart quotes and dashes: +.PP +.RS +.B check\-unicode \-\-fix *.txt +.RE +.PP +Allow printable characters, flag only invisibles: +.PP +.RS +.B check\-unicode \-\-allow\-printable src/ +.RE +.PP +Detect confusables while allowing Cyrillic script: +.PP +.RS +.B check\-unicode \-\-check\-confusables \-\-allow\-script Cyrillic src/ +.RE +.PP +Warn without failing CI, disable color: +.PP +.RS +.B check\-unicode \-\-severity warning \-\-no\-color src/ +.RE +.PP +List all valid Unicode script names: +.PP +.RS +.B check\-unicode \-\-list\-scripts +.RE +.PP +List all valid Unicode general categories: +.PP +.RS +.B check\-unicode \-\-list\-categories +.RE +.PP +Use with pre\-commit: +.PP +.nf +.RS +repos: + \- repo: https://github.com/mit\-d/check\-unicode + rev: v0.2.0 + hooks: + \- id: check\-unicode + # or for auto\-fix: + \- id: fix\-unicode +.RE +.fi +. +.SH SEE ALSO +.BR pre\-commit (1), +.BR python3 (1), +.BR unicode (7) +.PP +Project repository: +.I https://github.com/mit\-d/check\-unicode +. +.SH AUTHORS +mit\-d +. +.SH LICENSE +MIT License. +See the +.I LICENSE +file in the source distribution. 
diff --git a/pyproject.toml b/pyproject.toml index b3d9754..ef1e555 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,17 +23,20 @@ classifiers = [ "Topic :: Software Development :: Quality Assurance", ] dynamic = [ "version" ] -optional-dependencies.dev = [ "mypy", "pytest", "pytest-cov", "ruff" ] +optional-dependencies.dev = [ "bump-my-version", "mypy", "pytest", "pytest-cov", "ruff" ] urls.Issues = "https://github.com/mit-d/check-unicode/issues" urls.Repository = "https://github.com/mit-d/check-unicode" scripts.check-unicode = "check_unicode.main:main" [tool.hatch] build.targets.wheel.packages = [ "src/check_unicode" ] +build.targets.sdist.include = [ "docs/check-unicode.1" ] version.path = "src/check_unicode/__init__.py" [tool.ruff] target-version = "py311" +line-length = 88 +indent-width = 4 lint.select = [ "ALL" ] lint.ignore = [ "COM812", # trailing comma -- conflicts with formatter @@ -49,9 +52,24 @@ lint.per-file-ignores."tests/fixtures/**" = [ "ALL", # fixture files are intentionally non-conformant ] +[tool.codespell] +ignore-words-list = "nd,caf,te" + [tool.pytest] ini_options.testpaths = [ "tests" ] +[tool.bumpversion] +current_version = "0.2.0" +commit = true +tag = true +tag_name = "v{new_version}" +commit_args = "--no-verify" +message = "release: v{new_version}" +files = [ + { filename = "src/check_unicode/__init__.py" }, + { filename = "CHANGELOG.md", search = "## Unreleased", replace = "## {new_version} - {now:%Y-%m-%d}" }, +] + [tool.mypy] python_version = "3.11" strict = true diff --git a/src/check_unicode/main.py b/src/check_unicode/main.py index c5c5da2..9995630 100644 --- a/src/check_unicode/main.py +++ b/src/check_unicode/main.py @@ -5,6 +5,7 @@ import argparse import fnmatch import sys +import textwrap import tomllib from pathlib import Path from typing import Any @@ -13,9 +14,45 @@ from check_unicode.checker import AllowConfig, Finding, check_confusables, check_file from check_unicode.fixer import fix_file from check_unicode.output 
import print_findings +from check_unicode.scripts import KNOWN_SCRIPTS _EXPECTED_RANGE_PARTS = 2 +# Unicode general categories: abbreviation -> (full name, description). +# Covers all 30 categories from the Unicode standard. +UNICODE_CATEGORIES: dict[str, tuple[str, str]] = { + "Lu": ("Letter, uppercase", "e.g. A, B, \u00c9"), + "Ll": ("Letter, lowercase", "e.g. a, b, \u00e9"), + "Lt": ("Letter, titlecase", "e.g. \u01c5, \u01c8"), + "Lm": ("Letter, modifier", "e.g. \u02b0, \u02c6"), + "Lo": ("Letter, other", "e.g. \u00aa, \u0e01, CJK ideographs"), + "Mn": ("Mark, nonspacing", "e.g. \u0300 (combining grave accent)"), + "Mc": ("Mark, spacing combining", "e.g. \u0903 (Devanagari visarga)"), + "Me": ("Mark, enclosing", "e.g. \u20dd (combining enclosing circle)"), + "Nd": ("Number, decimal digit", "e.g. 0-9, \u0660-\u0669"), + "Nl": ("Number, letter", "e.g. \u2160 (Roman numeral one)"), + "No": ("Number, other", "e.g. \u00b2, \u00b3, \u2153"), + "Pc": ("Punctuation, connector", "e.g. _"), + "Pd": ("Punctuation, dash", "e.g. -, \u2013, \u2014"), + "Ps": ("Punctuation, open", "e.g. (, [, {"), + "Pe": ("Punctuation, close", "e.g. ), ], }"), + "Pi": ("Punctuation, initial quote", "e.g. \u00ab, \u2018, \u201c"), + "Pf": ("Punctuation, final quote", "e.g. \u00bb, \u2019, \u201d"), + "Po": ("Punctuation, other", "e.g. !, ?, @, #"), + "Sm": ("Symbol, math", "e.g. +, =, <, >, \u00b1"), + "Sc": ("Symbol, currency", "e.g. $, \u00a3, \u00a5, \u20ac"), + "Sk": ("Symbol, modifier", "e.g. ^, `, \u00a8, \u02dc"), + "So": ("Symbol, other", "e.g. \u00a9, \u00ae, \u2122"), + "Zs": ("Separator, space", "e.g. U+0020, U+00A0, U+2003"), + "Zl": ("Separator, line", "U+2028"), + "Zp": ("Separator, paragraph", "U+2029"), + "Cc": ("Other, control", "e.g. U+0000-U+001F, U+007F-U+009F"), + "Cf": ("Other, format", "e.g. 
U+200B (zero-width space), U+FEFF (BOM)"), + "Cs": ("Other, surrogate", "U+D800-U+DFFF (not valid in UTF-8)"), + "Co": ("Other, private use", "U+E000-U+F8FF"), + "Cn": ("Other, not assigned", "reserved codepoints"), +} + def _parse_codepoint(s: str) -> int: """Parse 'U+XXXX' or '0xXXXX' into an integer codepoint.""" @@ -118,86 +155,247 @@ def _build_allow_config( ) +def _print_scripts() -> None: + """Print all known Unicode script names accepted by --allow-script.""" + write = sys.stdout.write + write("Unicode scripts accepted by --allow-script:\n\n") + for name in sorted(KNOWN_SCRIPTS): + write(f" {name}\n") + write(f"\nTotal: {len(KNOWN_SCRIPTS)} scripts\n") + write( + "Script names are case-insensitive" + " (e.g. 'cyrillic' and 'Cyrillic' both work).\n" + ) + + +def _print_categories() -> None: + """Print all Unicode general categories accepted by --allow-category.""" + write = sys.stdout.write + write("Unicode general categories accepted by --allow-category:\n\n") + # Group by major class (first letter) + major_classes = { + "L": "Letter", + "M": "Mark", + "N": "Number", + "P": "Punctuation", + "S": "Symbol", + "Z": "Separator", + "C": "Other", + } + current_major = "" + for abbrev in sorted(UNICODE_CATEGORIES): + major = abbrev[0] + if major != current_major: + current_major = major + write(f" {major_classes.get(major, major)}:\n") + full_name, examples = UNICODE_CATEGORIES[abbrev] + write(f" {abbrev} {full_name:<30s} {examples}\n") + write(f"\nTotal: {len(UNICODE_CATEGORIES)} categories\n") + + def _build_parser() -> argparse.ArgumentParser: """Build and return the CLI argument parser.""" + epilog = textwrap.dedent("""\ + examples: + check-unicode src/**/*.py Check all Python files + check-unicode --fix *.txt Auto-fix smart quotes, dashes, etc. + check-unicode --allow-printable . 
Allow printable non-ASCII + check-unicode --check-confusables --allow-script Cyrillic src/ + Detect confusables + check-unicode --allow-codepoint U+00B0,U+00A9 data.txt + Allow specific codepoints + check-unicode --allow-range U+0400-U+04FF src/i18n/ + Allow Cyrillic block + check-unicode --severity warning --no-color src/ + Warn without failing CI + check-unicode --list-scripts Show all valid script names + check-unicode --list-categories Show all valid category abbreviations + + configuration: + Settings can be defined in .check-unicode.toml or pyproject.toml under + [tool.check-unicode]. CLI flags extend (never replace) config-file values. + + Example .check-unicode.toml: + allow-codepoints = ["U+00B0", "U+2192"] + allow-ranges = ["U+00A0-U+00FF"] + allow-categories = ["Sc"] + allow-printable = true + allow-scripts = ["Latin", "Cyrillic"] + check-confusables = true + severity = "error" + exclude-patterns = ["*.min.js", "vendor/*"] + + exit codes: + 0 No findings (or --severity=warning) + 1 Findings detected (or files were fixed in --fix mode) + 2 Usage error (bad arguments) + """) + description = textwrap.dedent("""\ + Detect and fix non-ASCII Unicode characters in text files. + + Catches smart quotes, em dashes, fancy spaces, dangerous invisible + characters (Trojan Source bidi attacks, zero-width chars), and other + copy-paste artifacts. Use --fix to auto-replace known offenders with + ASCII equivalents. 
Dangerous characters are always flagged and never + auto-fixed.""") parser = argparse.ArgumentParser( prog="check-unicode", - description="Detect and fix non-ASCII Unicode characters in text files.", + description=description, + epilog=epilog, + formatter_class=argparse.RawDescriptionHelpFormatter, ) parser.add_argument( "files", nargs="*", metavar="FILE", - help="Files to check (reads from stdin if none given)", + help="files to check (one or more paths required)", ) - parser.add_argument( - "--fix", - action="store_true", - help="Replace known offenders with ASCII equivalents (exit 1 if changed)", + + # Allow-list options + allow_group = parser.add_argument_group( + "allow-list options", + "Suppress findings for specific characters, ranges, categories, or scripts. " + "These flags extend (never replace) any values set in the config file. " + "Dangerous invisible characters are always flagged unless explicitly " + "allowed by --allow-codepoint.", ) - parser.add_argument( + allow_group.add_argument( "--allow-range", action="append", metavar="RANGE", - help="Allow a Unicode range (e.g. U+00A0-U+00FF). Repeatable.", + help=( + "allow a Unicode range, e.g. U+00A0-U+00FF. " + "may be repeated for multiple ranges" + ), ) - parser.add_argument( + allow_group.add_argument( "--allow-codepoint", action="append", metavar="CP", - help="Allow specific codepoints (e.g. U+00B0). Repeatable, comma-separated.", + help=( + "allow specific codepoints, e.g. U+00B0. " + "comma-separated and/or repeated. " + "this is the only flag that can suppress dangerous characters" + ), ) - parser.add_argument( + allow_group.add_argument( "--allow-category", action="append", metavar="CAT", - help="Allow Unicode category (e.g. Sc). Repeatable.", + help=( + "allow a Unicode general category, e.g. Sc (Symbol, currency). " + "may be repeated for multiple categories. 
" + "use --list-categories to see all valid values" + ), ) - parser.add_argument( + allow_group.add_argument( "--allow-printable", action="store_true", - help="Allow all printable characters (only flag invisibles).", + help=( + "allow all printable non-ASCII characters; " + "only invisible/control characters will be flagged" + ), ) - parser.add_argument( + allow_group.add_argument( "--allow-script", action="append", metavar="SCRIPT", - help="Allow Unicode script (e.g. Latin, Cyrillic). Repeatable.", + help=( + "allow all characters from a Unicode script, e.g. Latin, Cyrillic, " + "Greek. may be repeated for multiple scripts. " + "use --list-scripts to see all valid names" + ), ) - parser.add_argument( - "--exclude-pattern", - action="append", - metavar="PATTERN", - help="Exclude files matching glob pattern (e.g. '*.min.js'). Repeatable.", + allow_group.add_argument( + "--list-categories", + action="store_true", + help="list all Unicode general categories and exit", ) - parser.add_argument( + allow_group.add_argument( + "--list-scripts", + action="store_true", + help="list all known Unicode script names and exit", + ) + + # Detection options + detect_group = parser.add_argument_group( + "detection options", + "Control what is detected beyond the default non-ASCII scan.", + ) + detect_group.add_argument( "--check-confusables", action="store_true", - help="Detect mixed-script homoglyph/confusable characters.", + help=( + "detect mixed-script homoglyph/confusable characters " + "(e.g. Cyrillic 'a' in a Latin identifier). " + "not suppressed by --allow-script" + ), ) - parser.add_argument( + + # Output options + output_group = parser.add_argument_group( + "output options", + "Control output format, severity, and color.", + ) + output_group.add_argument( "--severity", choices=["error", "warning"], default=None, - help="error (exit 1) or warning (print, exit 0). 
Default: error.", + help=( + "set exit-code behavior: 'error' exits 1 on findings, " + "'warning' prints findings but exits 0. default: error" + ), ) - parser.add_argument( + output_group.add_argument( "--no-color", action="store_true", - help="Disable ANSI color output.", + help="disable ANSI color output (also respects NO_COLOR env var)", ) - parser.add_argument( + output_group.add_argument( + "-q", + "--quiet", + action="store_true", + help="print summary line only, suppress per-finding details", + ) + + # Configuration + config_group = parser.add_argument_group( + "configuration", + "Specify a config file or rely on auto-discovery.", + ) + config_group.add_argument( "--config", metavar="FILE", - help="Path to TOML config file.", + help=( + "path to a TOML config file. " + "if omitted, auto-discovers .check-unicode.toml " + "or [tool.check-unicode] in pyproject.toml" + ), ) - parser.add_argument( - "-q", - "--quiet", + config_group.add_argument( + "--exclude-pattern", + action="append", + metavar="PATTERN", + help=( + "exclude files matching a glob pattern, e.g. '*.min.js'. " + "may be repeated; extends config-file exclude-patterns" + ), + ) + + # Mode + mode_group = parser.add_argument_group( + "mode", + ) + mode_group.add_argument( + "--fix", action="store_true", - help="Summary only, no per-finding details.", + help=( + "replace known offenders (smart quotes, dashes, fancy spaces, " + "ellipsis) with ASCII equivalents. exits 1 if any file was " + "changed. 
dangerous characters are never auto-fixed" + ), ) - parser.add_argument( + mode_group.add_argument( "-V", "--version", action="version", @@ -249,6 +447,14 @@ def main(argv: list[str] | None = None) -> int: parser = _build_parser() args = parser.parse_args(argv) + # Informational flags that exit immediately + if args.list_scripts: + _print_scripts() + return 0 + if args.list_categories: + _print_categories() + return 0 + if not args.files: parser.error("No files specified.") diff --git a/tests/test_cli.py b/tests/test_cli.py index fd1a37c..f13ff99 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -2,11 +2,18 @@ from __future__ import annotations +import argparse from pathlib import Path import pytest -from check_unicode.main import _is_excluded, _parse_codepoint, _parse_range, main +from check_unicode.main import ( + _build_parser, + _is_excluded, + _parse_codepoint, + _parse_range, + main, +) FIXTURES = Path(__file__).parent / "fixtures" @@ -423,3 +430,165 @@ def test_matches_full_path(self) -> None: def test_empty_patterns(self) -> None: """Empty pattern list excludes nothing.""" assert _is_excluded("foo.py", []) is False + + +class TestHelpOutput: + """Tests for the -h/--help output.""" + + def test_help_flag_exits_0(self) -> None: + """--help causes SystemExit with code 0.""" + with pytest.raises(SystemExit) as exc_info: + main(["--help"]) + assert exc_info.value.code == 0 + + def test_help_contains_argument_groups( + self, capsys: pytest.CaptureFixture[str] + ) -> None: + """Help output includes named argument groups.""" + with pytest.raises(SystemExit): + main(["--help"]) + out = capsys.readouterr().out + assert "allow-list options" in out + assert "detection options" in out + assert "output options" in out + assert "configuration" in out + assert "mode" in out + + def test_help_contains_examples(self, capsys: pytest.CaptureFixture[str]) -> None: + """Help output includes the examples epilog section.""" + with pytest.raises(SystemExit): + 
main(["--help"]) + out = capsys.readouterr().out + assert "examples:" in out + assert "check-unicode --fix" in out + assert "--allow-printable" in out + + def test_help_contains_exit_codes(self, capsys: pytest.CaptureFixture[str]) -> None: + """Help output includes exit code documentation.""" + with pytest.raises(SystemExit): + main(["--help"]) + out = capsys.readouterr().out + assert "exit codes:" in out + assert "No findings" in out + assert "Usage error" in out + + def test_help_contains_config_example( + self, capsys: pytest.CaptureFixture[str] + ) -> None: + """Help output includes a TOML configuration example.""" + with pytest.raises(SystemExit): + main(["--help"]) + out = capsys.readouterr().out + assert "allow-codepoints" in out + assert ".check-unicode.toml" in out + + def test_parser_uses_raw_formatter(self) -> None: + """Parser uses RawDescriptionHelpFormatter to preserve epilog formatting.""" + parser = _build_parser() + assert parser.formatter_class is argparse.RawDescriptionHelpFormatter + + def test_help_describes_dangerous_characters( + self, capsys: pytest.CaptureFixture[str] + ) -> None: + """Help mentions that dangerous characters require explicit allow-codepoint.""" + with pytest.raises(SystemExit): + main(["--help"]) + out = capsys.readouterr().out + assert "dangerous" in out.lower() + assert "--allow-codepoint" in out + + def test_help_mentions_list_flags(self, capsys: pytest.CaptureFixture[str]) -> None: + """Help text cross-references --list-scripts and --list-categories.""" + with pytest.raises(SystemExit): + main(["--help"]) + out = capsys.readouterr().out + assert "--list-scripts" in out + assert "--list-categories" in out + + +class TestListScripts: + """Tests for the --list-scripts flag.""" + + def test_list_scripts_exits_0(self) -> None: + """--list-scripts exits with code 0.""" + assert main(["--list-scripts"]) == 0 + + def test_list_scripts_output(self, capsys: pytest.CaptureFixture[str]) -> None: + """--list-scripts prints known 
script names.""" + main(["--list-scripts"]) + out = capsys.readouterr().out + assert "Latin" in out + assert "Cyrillic" in out + assert "Greek" in out + assert "Han" in out + assert "Arabic" in out + + def test_list_scripts_contains_count( + self, capsys: pytest.CaptureFixture[str] + ) -> None: + """--list-scripts shows a total count.""" + main(["--list-scripts"]) + out = capsys.readouterr().out + assert "Total:" in out + + def test_list_scripts_mentions_case_insensitive( + self, capsys: pytest.CaptureFixture[str] + ) -> None: + """--list-scripts reminds users that names are case-insensitive.""" + main(["--list-scripts"]) + out = capsys.readouterr().out + assert "case-insensitive" in out.lower() + + def test_list_scripts_does_not_require_files(self) -> None: + """--list-scripts works without specifying any files.""" + assert main(["--list-scripts"]) == 0 + + +class TestListCategories: + """Tests for the --list-categories flag.""" + + def test_list_categories_exits_0(self) -> None: + """--list-categories exits with code 0.""" + assert main(["--list-categories"]) == 0 + + def test_list_categories_output(self, capsys: pytest.CaptureFixture[str]) -> None: + """--list-categories prints all 30 Unicode general categories.""" + main(["--list-categories"]) + out = capsys.readouterr().out + assert "Sc" in out + assert "Lu" in out + assert "Mn" in out + assert "Zs" in out + + def test_list_categories_shows_major_groups( + self, capsys: pytest.CaptureFixture[str] + ) -> None: + """--list-categories organizes output by major class.""" + main(["--list-categories"]) + out = capsys.readouterr().out + assert "Letter:" in out + assert "Symbol:" in out + assert "Punctuation:" in out + assert "Number:" in out + assert "Separator:" in out + + def test_list_categories_contains_count( + self, capsys: pytest.CaptureFixture[str] + ) -> None: + """--list-categories shows the total count of 30.""" + main(["--list-categories"]) + out = capsys.readouterr().out + assert "Total: 30" in out + + 
def test_list_categories_contains_descriptions( + self, capsys: pytest.CaptureFixture[str] + ) -> None: + """--list-categories includes human-readable descriptions.""" + main(["--list-categories"]) + out = capsys.readouterr().out + assert "Symbol, currency" in out + assert "Letter, uppercase" in out + + def test_list_categories_does_not_require_files(self) -> None: + """--list-categories works without specifying any files.""" + assert main(["--list-categories"]) == 0 diff --git a/tests/test_integration.py b/tests/test_integration.py index e1abefc..8ff265c 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -91,9 +91,7 @@ def test_partial_allow(self) -> None: assert 0x20AC in codepoints # euro not allowed -# --------------------------------------------------------------------------- # Classic literature fixtures (see tests/fixtures/LICENSES.fixtures) -# --------------------------------------------------------------------------- # (fixture_stem, scripts needed to cover all non-ASCII characters) CLASSIC_TEXTS: list[tuple[str, frozenset[str]]] = [