From 85c37e85ab01aac04878d3242bc917dd29a4bfb9 Mon Sep 17 00:00:00 2001 From: Cristovao Cordeiro Date: Fri, 6 Feb 2026 17:43:39 +0100 Subject: [PATCH 01/15] ci: add hint validation checks --- .../scripts/validate-hints/requirements.txt | 3 + .../validate-hints/test_validate_hints.py | 149 +++++++++++++++ .../scripts/validate-hints/validate_hints.py | 177 ++++++++++++++++++ .github/workflows/lint.yaml | 4 + .github/workflows/test-ci.yaml | 27 +++ .github/workflows/validate-hints.yaml | 52 +++++ 6 files changed, 412 insertions(+) create mode 100644 .github/scripts/validate-hints/requirements.txt create mode 100644 .github/scripts/validate-hints/test_validate_hints.py create mode 100755 .github/scripts/validate-hints/validate_hints.py create mode 100644 .github/workflows/test-ci.yaml create mode 100644 .github/workflows/validate-hints.yaml diff --git a/.github/scripts/validate-hints/requirements.txt b/.github/scripts/validate-hints/requirements.txt new file mode 100644 index 000000000..e0559f929 --- /dev/null +++ b/.github/scripts/validate-hints/requirements.txt @@ -0,0 +1,3 @@ +spacy>=3.0.0 +https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl +pyyaml>=5.0.0 \ No newline at end of file diff --git a/.github/scripts/validate-hints/test_validate_hints.py b/.github/scripts/validate-hints/test_validate_hints.py new file mode 100644 index 000000000..122943caa --- /dev/null +++ b/.github/scripts/validate-hints/test_validate_hints.py @@ -0,0 +1,149 @@ +#!/usr/bin/env python3 +""" +Unit tests for validate_hints.py +""" +import sys +import os +import pytest +from unittest.mock import patch + +sys.path.append(os.path.dirname(os.path.abspath(__file__))) + +import validate_hints + + +class TestValidators: + """Test individual validator functions.""" + + def test_no_finite_verbs(self): + # Valid cases + assert validate_hints.no_finite_verbs("System configuration") is None + assert validate_hints.no_finite_verbs("Installation of packages") is None + + # Invalid cases + err = [] + err.append(validate_hints.no_finite_verbs("This contains a verb")) + err.append(validate_hints.no_finite_verbs("This is a verb")) + assert len(err) == 2 + assert all("finite verbs are not allowed" in e for e in err) + assert "contains" in err[0] + assert "is" in err[1] + + def test_no_starting_articles(self): + # Valid + assert validate_hints.no_starting_articles("System configuration") is None + + # Invalid + assert ( + validate_hints.no_starting_articles("The system configuration") is not None + ) + assert validate_hints.no_starting_articles("A configuration") is not None + assert validate_hints.no_starting_articles("An apple") is not None + + # Case sensitivity + assert validate_hints.no_starting_articles("the system") is not None + + def test_no_special_characters(self): + # Valid + assert validate_hints.no_special_characters("System configuration") is None + assert validate_hints.no_special_characters("File (config.yaml)") is None + assert validate_hints.no_special_characters("Items one, two; three.") is None + + # Invalid + set_of_invalid_chars = "@#$%^:&*!~`<>/\\|" + err = validate_hints.no_special_characters( + f"Some invalid chars: {set_of_invalid_chars}" + ) + assert err is not None + assert all(char in err for char in set_of_invalid_chars) + + def test_no_trailing_punctuation(self): + # Valid + assert validate_hints.no_trailing_punctuation("System configuration") is None + assert validate_hints.no_trailing_punctuation("Ends with parenthesis)") is None + + # Invalid + for punct in {".", "!", "?", ",", ";", ":"}: + assert ( + validate_hints.no_trailing_punctuation(f"Ends with {punct}") is not None + ) + + def test_is_sentence_case(self): + # Valid + assert validate_hints.is_sentence_case("System configuration") is None + assert ( + validate_hints.is_sentence_case("System configuration. Another sentence.") + is None + ) + + # Invalid + err = validate_hints.is_sentence_case("system configuration") + assert err is not None + assert "first letter" in err + + err = validate_hints.is_sentence_case("First sentence. second sentence.") + assert err is not None + assert "'s'" in err + + +class TestValidateHints: + """Test the file validation logic.""" + + def test_validate_hints(self, tmp_path): + f = tmp_path / "sdf.yaml" + yaml_content = """slices: + good: + hint: Pass the test + + bad-article: + hint: A bad hint + + bad-chars: + hint: Hint with invalid characters @#$%^&*() - the end + + bad-sentence-case: + hint: Ok. but not sentence case + + bad-trailing-punctuation: + hint: With trailing punctuation. + + bad-verbs: + hint: Has to fail the test, it is not valid, contains finite verbs +""" + f.write_text(yaml_content, encoding="utf-8") + + errors = validate_hints.validate_hints(str(f)) + assert len(errors) == 5 + assert "article (a, an, the) is not allowed: 'A'" in errors[0] + assert "can only contain alphanumeric characters" in errors[1] + assert "(first letter 'b' is not uppercase)" in errors[2] + assert "trailing punctuation is not allowed: found '.' at the end" in errors[3] + assert "finite verbs are not allowed" in errors[4] + + def test_validate_hints_malformed_yaml(self, tmp_path): + f = tmp_path / "bad.yaml" + f.write_text("slices: [", encoding="utf-8") + + errors = validate_hints.validate_hints(str(f)) + assert len(errors) == 1 + assert "Failed to parse YAML" in errors[0] + + +class TestMain: + """Test the main execution flow.""" + + @patch("validate_hints.validate_hints") + @patch("sys.exit") + def test_main_success(self, mock_exit, mock_validate): + mock_validate.return_value = [] + with patch("sys.argv", ["script", "file.yaml"]): + validate_hints.main() + mock_exit.assert_not_called() + + @patch("validate_hints.validate_hints") + @patch("sys.exit") + def test_main_failure(self, mock_exit, mock_validate): + mock_validate.return_value = ["Some error"] + with patch("sys.argv", ["script", "file.yaml"]): + validate_hints.main() + mock_exit.assert_called_with(1) diff --git a/.github/scripts/validate-hints/validate_hints.py b/.github/scripts/validate-hints/validate_hints.py new file mode 100755 index 000000000..c0eab3b99 --- /dev/null +++ b/.github/scripts/validate-hints/validate_hints.py @@ -0,0 +1,177 @@ +#!/usr/bin/env python3 +""" +Script to validate hints in slice definition files. +""" + +import argparse +import logging +import re +import sys +from typing import List, Optional + +import spacy +import yaml + +# Configure logging +logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") + +# Load NLP model +try: + NLP = spacy.load("en_core_web_sm") +except OSError: + logging.warning("Downloading en_core_web_sm model...") + from spacy.cli import download + + download("en_core_web_sm") + NLP = spacy.load("en_core_web_sm") + +ErrorMessage = str + + +def no_finite_verbs(text: str) -> Optional[ErrorMessage]: + """Check that the text does not contain finite verbs.""" + doc = NLP(text) + findings = [] + for token in doc: + if token.pos_ in ["VERB", "AUX"] and token.morph.get("VerbForm") == ["Fin"]: + findings.append(f"{token.text} ({token.lemma_})") + + if findings: + return f"finite verbs are not allowed: {', '.join(findings)}" + return None + + +def no_starting_articles(text: str) -> Optional[ErrorMessage]: + """Check that the text does not start with an article.""" + words = text.split() + if not words: + return None + + first_word = words[0] + articles = {"a", "an", "the"} + if first_word.lower() in articles: + return ( + f"starting with an article ({', '.join(sorted(articles))}) " + f"is not allowed: '{first_word}'" + ) + return None + + +def no_special_characters(text: str) -> Optional[ErrorMessage]: + """ + Check that the text contains only allowed characters. + Allowed: alphanumeric, periods, commas, semicolons, parentheses. + """ + # Regex for characters that are NOT allowed + forbidden_pattern = r"[^a-zA-Z0-9.,;()\s]" + + bad_chars = re.findall(forbidden_pattern, text) + if bad_chars: + unique_bad_chars = set(bad_chars) + return ( + f"can only contain alphanumeric characters, periods, commas, " + f"semicolons, parentheses: found {', '.join(unique_bad_chars)}" + ) + return None + + +def no_trailing_punctuation(text: str) -> Optional[ErrorMessage]: + """Check that the text does not end with punctuation, except parentheses.""" + punctuation_marks = {".", "!", "?", ",", ";", ":"} + + if text and text[-1] in punctuation_marks: + return f"trailing punctuation is not allowed: found '{text[-1]}' at the end" + return None + + +def is_sentence_case(text: str) -> Optional[ErrorMessage]: + """Check that each sentence in the text starts with an uppercase letter.""" + # It is not enough to split the text by '.' and check each sentence separately, + # because we can have complex punctuation like "Single 1.1 sentence" + doc = NLP(text) + findings = [] + + for sent in doc.sents: + s_text = sent.text.strip() + if not s_text: + continue + + # Check if first letter is upper + if not s_text[0].isupper(): + findings.append( + f"'{s_text}' (first letter '{s_text[0]}' is not uppercase)" + ) + + if findings: + return f"text must be sentence case: {', '.join(findings)}" + return None + + +def validate_hints(file_path: str) -> List[str]: + """Validate hints in a single slice definition file.""" + logging.info(f"Processing {file_path}...") + errors = [] + + try: + with open(file_path, "r", encoding="utf-8") as f: + content = yaml.safe_load(f) + except Exception as e: + return [f"File={file_path}, Error=Failed to parse YAML: {e}"] + + slices = content.get("slices", {}) + if not isinstance(slices, dict): + return [] + + validators = [ + no_finite_verbs, + no_starting_articles, + no_special_characters, + no_trailing_punctuation, + is_sentence_case, + ] + + for slice_name, values in slices.items(): + if not isinstance(values, dict): + continue + + hint = values.get("hint", "") + # Skip empty hints or non-string hints + if not hint or not isinstance(hint, str): + continue + + for validator in validators: + error_msg = validator(hint) + if error_msg: + errors.append( + f"File={file_path}, Slice={slice_name}, Error={error_msg}" + ) + + return errors + + +def main(): + parser = argparse.ArgumentParser( + description="Validate hints in slice definitions" + ) + parser.add_argument("files", nargs="+", help="Slice definition files to validate") + args = parser.parse_args() + + logging.info("Validating slice definition hints") + + all_errors = [] + for input_file in args.files: + all_errors.extend(validate_hints(input_file)) + + if all_errors: + logging.error("\033[91mThe following 'hints' are not valid\033[0m") + all_errors.sort() + for error in all_errors: + logging.error(error) + + sys.exit(1) + + logging.info("All hints are valid") + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/.github/workflows/lint.yaml b/.github/workflows/lint.yaml index 2b64528e6..e4fa10dff 100644 --- a/.github/workflows/lint.yaml +++ b/.github/workflows/lint.yaml @@ -107,3 +107,7 @@ jobs: done exit $EXIT_CODE + + validate-hints: + name: Validate hints + uses: ./.github/workflows/validate-hints.yaml diff --git a/.github/workflows/test-ci.yaml b/.github/workflows/test-ci.yaml new file mode 100644 index 000000000..b46365499 --- /dev/null +++ b/.github/workflows/test-ci.yaml @@ -0,0 +1,27 @@ +name: Test chisel-releases CI + +on: + pull_request: + branches: + - "main" + paths: + - ".github/workflows/*.yaml" + - ".github/scripts/validate-hints/**" + +jobs: + test-validate-hints: + name: Test validate hints workflow + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Setup Python + uses: actions/setup-python@v6 + with: + python-version: "3.12" + pip-install: -r .github/scripts/validate-hints/requirements.txt + - run: pip install pytest + - name: Run "validate-hints" unit tests + run: | + pytest .github/scripts/validate-hints/ + + # TODO: add tests for remaining CI scripts diff --git a/.github/workflows/validate-hints.yaml b/.github/workflows/validate-hints.yaml new file mode 100644 index 000000000..8105700d3 --- /dev/null +++ b/.github/workflows/validate-hints.yaml @@ -0,0 +1,52 @@ +name: Validate hints + +on: + workflow_call: + +jobs: + validate: + name: Validate slice definition hints + runs-on: ubuntu-latest + # Needs to be called, from a workflow triggered by a PR + if: | + startswith(github.event_name, 'pull_request') && + startswith(github.base_ref, 'ubuntu-') + env: + main-branch-path: files-from-main + steps: + - uses: actions/checkout@v4 + with: + ref: ${{ github.head_ref }} + repository: ${{ github.event.pull_request.head.repo.full_name }} + + - name: Check changed paths + id: changed-paths + uses: dorny/paths-filter@v3 + with: + # ref: https://github.com/marketplace/actions/paths-changes-filter + filters: | + slices: + - added|modified: 'slices/**/*.yaml' + # Space delimited list usable as command-line argument list in + # Linux shell. If needed, it uses single or double quotes to + # wrap filename with unsafe characters. + list-files: shell + + - name: Checkout main branch + uses: actions/checkout@v4 + with: + ref: main + path: ${{ env.main-branch-path }} + + - name: Setup Python + uses: actions/setup-python@v6 + with: + python-version: "3.12" + pip-install: -r ${{ env.main-branch-path }}/.github/scripts/validate-hints/requirements.txt + + - name: Validate hints + env: + script-dir: "${{ env.main-branch-path }}/.github/scripts/validate-hints" + run: | + ./${{ env.script-dir }}/validate_hints.py \ + ${{ steps.changed-paths.outputs.slices_files }} From 007cab17b9dc9e04e75dd606adee65152b1137e1 Mon Sep 17 00:00:00 2001 From: Cristovao Cordeiro Date: Thu, 12 Feb 2026 14:49:49 +0100 Subject: [PATCH 02/15] fix(ci): pin 3rd party action to a commit hash --- .github/workflows/validate-hints.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/validate-hints.yaml b/.github/workflows/validate-hints.yaml index 8105700d3..389696358 100644 --- a/.github/workflows/validate-hints.yaml +++ b/.github/workflows/validate-hints.yaml @@ -21,7 +21,7 @@ jobs: - name: Check changed paths id: changed-paths - uses: dorny/paths-filter@v3 + uses: dorny/paths-filter@de90cc6fb38fc0963ad72b210f1f284cd68cea36 # v3.0.2 with: # ref: https://github.com/marketplace/actions/paths-changes-filter filters: | From 38659f4c8fe901c5214707ecaceac0e08f974f14 Mon Sep 17 00:00:00 2001 From: Cristovao Cordeiro Date: Thu, 12 Feb 2026 14:59:05 +0100 Subject: [PATCH 03/15] feat(ci): disallow trailing spaces --- .github/scripts/validate-hints/test_validate_hints.py | 4 ++-- .github/scripts/validate-hints/validate_hints.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/scripts/validate-hints/test_validate_hints.py b/.github/scripts/validate-hints/test_validate_hints.py index 122943caa..29bf9f89b 100644 --- a/.github/scripts/validate-hints/test_validate_hints.py +++ b/.github/scripts/validate-hints/test_validate_hints.py @@ -63,7 +63,7 @@ def test_no_trailing_punctuation(self): assert validate_hints.no_trailing_punctuation("Ends with parenthesis)") is None # Invalid - for punct in {".", "!", "?", ",", ";", ":"}: + for punct in {".", "!", "?", ",", ";", ":", " "}: assert ( validate_hints.no_trailing_punctuation(f"Ends with {punct}") is not None ) @@ -117,7 +117,7 @@ def test_validate_hints(self, tmp_path): assert "article (a, an, the) is not allowed: 'A'" in errors[0] assert "can only contain alphanumeric characters" in errors[1] assert "(first letter 'b' is not uppercase)" in errors[2] - assert "trailing punctuation is not allowed: found '.' at the end" in errors[3] + assert "trailing punctuation and spaces are not allowed: found '.' at the end" in errors[3] assert "finite verbs are not allowed" in errors[4] def test_validate_hints_malformed_yaml(self, tmp_path): diff --git a/.github/scripts/validate-hints/validate_hints.py b/.github/scripts/validate-hints/validate_hints.py index c0eab3b99..47e4ea987 100755 --- a/.github/scripts/validate-hints/validate_hints.py +++ b/.github/scripts/validate-hints/validate_hints.py @@ -77,10 +77,10 @@ def no_special_characters(text: str) -> Optional[ErrorMessage]: def no_trailing_punctuation(text: str) -> Optional[ErrorMessage]: """Check that the text does not end with punctuation, except parentheses.""" - punctuation_marks = {".", "!", "?", ",", ";", ":"} + punctuation_marks = {".", "!", "?", ",", ";", ":", " "} if text and text[-1] in punctuation_marks: - return f"trailing punctuation is not allowed: found '{text[-1]}' at the end" + return f"trailing punctuation and spaces are not allowed: found '{text[-1]}' at the end" return None From 2887b2ab8a96600ea3854cf228f20add951e1407 Mon Sep 17 00:00:00 2001 From: Cristovao Cordeiro Date: Tue, 17 Feb 2026 07:32:53 +0100 Subject: [PATCH 04/15] fix(ci): remove unused pytest import --- .github/scripts/validate-hints/test_validate_hints.py | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/scripts/validate-hints/test_validate_hints.py b/.github/scripts/validate-hints/test_validate_hints.py index 29bf9f89b..d1d8f93a0 100644 --- a/.github/scripts/validate-hints/test_validate_hints.py +++ b/.github/scripts/validate-hints/test_validate_hints.py @@ -4,7 +4,6 @@ """ import sys import os -import pytest from unittest.mock import patch sys.path.append(os.path.dirname(os.path.abspath(__file__))) From a5083ca2fc7cfb069bb08605b4b45b7174d15ade Mon Sep 17 00:00:00 2001 From: Cristovao Cordeiro Date: Tue, 17 Feb 2026 07:47:32 +0100 Subject: [PATCH 05/15] chore(ci): adopt PEP 585 and 604 Python syntax --- .github/scripts/validate-hints/validate_hints.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/.github/scripts/validate-hints/validate_hints.py b/.github/scripts/validate-hints/validate_hints.py index 47e4ea987..3df7227df 100755 --- a/.github/scripts/validate-hints/validate_hints.py +++ b/.github/scripts/validate-hints/validate_hints.py @@ -7,7 +7,6 @@ import logging import re import sys -from typing import List, Optional import spacy import yaml @@ -28,7 +27,7 @@ ErrorMessage = str -def no_finite_verbs(text: str) -> Optional[ErrorMessage]: +def no_finite_verbs(text: str) -> ErrorMessage | None: """Check that the text does not contain finite verbs.""" doc = NLP(text) findings = [] @@ -41,7 +40,7 @@ def no_finite_verbs(text: str) -> Optional[ErrorMessage]: return None -def no_starting_articles(text: str) -> Optional[ErrorMessage]: +def no_starting_articles(text: str) -> ErrorMessage | None: """Check that the text does not start with an article.""" words = text.split() if not words: @@ -57,7 +56,7 @@ def no_starting_articles(text: str) -> Optional[ErrorMessage]: return None -def no_special_characters(text: str) -> Optional[ErrorMessage]: +def no_special_characters(text: str) -> ErrorMessage | None: """ Check that the text contains only allowed characters. Allowed: alphanumeric, periods, commas, semicolons, parentheses. @@ -75,7 +74,7 @@ def no_special_characters(text: str) -> Optional[ErrorMessage]: return None -def no_trailing_punctuation(text: str) -> Optional[ErrorMessage]: +def no_trailing_punctuation(text: str) -> ErrorMessage | None: """Check that the text does not end with punctuation, except parentheses.""" punctuation_marks = {".", "!", "?", ",", ";", ":", " "} @@ -84,7 +83,7 @@ def no_trailing_punctuation(text: str) -> Optional[ErrorMessage]: return None -def is_sentence_case(text: str) -> Optional[ErrorMessage]: +def is_sentence_case(text: str) -> ErrorMessage | None: """Check that each sentence in the text starts with an uppercase letter.""" # It is not enough to split the text by '.' and check each sentence separately, # because we can have complex punctuation like "Single 1.1 sentence" @@ -107,7 +106,7 @@ def is_sentence_case(text: str) -> Optional[ErrorMessage]: return None -def validate_hints(file_path: str) -> List[str]: +def validate_hints(file_path: str) -> list[str]: """Validate hints in a single slice definition file.""" logging.info(f"Processing {file_path}...") errors = [] From f388703aa8d37519c81336f8675666155e1323ff Mon Sep 17 00:00:00 2001 From: Cristovao Cordeiro Date: Tue, 17 Feb 2026 07:51:44 +0100 Subject: [PATCH 06/15] Apply suggestions from code review Co-authored-by: Marcin Konowalczyk --- .github/scripts/validate-hints/requirements.txt | 2 +- .github/scripts/validate-hints/validate_hints.py | 16 ++++++++-------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/.github/scripts/validate-hints/requirements.txt b/.github/scripts/validate-hints/requirements.txt index e0559f929..76b2df95e 100644 --- a/.github/scripts/validate-hints/requirements.txt +++ b/.github/scripts/validate-hints/requirements.txt @@ -1,3 +1,3 @@ spacy>=3.0.0 https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl -pyyaml>=5.0.0 \ No newline at end of file +pyyaml>=5.0.0 diff --git a/.github/scripts/validate-hints/validate_hints.py b/.github/scripts/validate-hints/validate_hints.py index 3df7227df..c10feb735 100755 --- a/.github/scripts/validate-hints/validate_hints.py +++ b/.github/scripts/validate-hints/validate_hints.py @@ -30,7 +30,7 @@ def no_finite_verbs(text: str) -> ErrorMessage | None: """Check that the text does not contain finite verbs.""" doc = NLP(text) - findings = [] + findings: list[str] = [] for token in doc: if token.pos_ in ["VERB", "AUX"] and token.morph.get("VerbForm") == ["Fin"]: findings.append(f"{token.text} ({token.lemma_})") @@ -47,7 +47,7 @@ def no_starting_articles(text: str) -> ErrorMessage | None: return None first_word = words[0] - articles = {"a", "an", "the"} + articles: set[str] = {"a", "an", "the"} if first_word.lower() in articles: return ( f"starting with an article ({', '.join(sorted(articles))}) " @@ -76,7 +76,7 @@ def no_special_characters(text: str) -> ErrorMessage | None: def no_trailing_punctuation(text: str) -> ErrorMessage | None: """Check that the text does not end with punctuation, except parentheses.""" - punctuation_marks = {".", "!", "?", ",", ";", ":", " "} + punctuation_marks: set[str] = {".", "!", "?", ",", ";", ":", " "} if text and text[-1] in punctuation_marks: return f"trailing punctuation and spaces are not allowed: found '{text[-1]}' at the end" @@ -88,7 +88,7 @@ def is_sentence_case(text: str) -> ErrorMessage | None: # It is not enough to split the text by '.' and check each sentence separately, # because we can have complex punctuation like "Single 1.1 sentence" doc = NLP(text) - findings = [] + findings: list[str] = [] for sent in doc.sents: s_text = sent.text.strip() @@ -109,7 +109,7 @@ def is_sentence_case(text: str) -> ErrorMessage | None: def validate_hints(file_path: str) -> list[str]: """Validate hints in a single slice definition file.""" logging.info(f"Processing {file_path}...") - errors = [] + errors: list[str] = [] try: with open(file_path, "r", encoding="utf-8") as f: @@ -148,7 +148,7 @@ def validate_hints(file_path: str) -> list[str]: return errors -def main(): +def main() -> None: parser = argparse.ArgumentParser( description="Validate hints in slice definitions" ) @@ -157,7 +157,7 @@ def main(): logging.info("Validating slice definition hints") - all_errors = [] + all_errors: list[str] = [] for input_file in args.files: all_errors.extend(validate_hints(input_file)) @@ -173,4 +173,4 @@ def main(): if __name__ == "__main__": - main() \ No newline at end of file + main() From 7f5f29c402605d328ebe9c4f1510fae14f43d959 Mon Sep 17 00:00:00 2001 From: Cristovao Cordeiro Date: Tue, 17 Feb 2026 10:41:58 +0100 Subject: [PATCH 07/15] Apply suggestion from @lczyk Co-authored-by: Marcin Konowalczyk --- .github/scripts/validate-hints/validate_hints.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/scripts/validate-hints/validate_hints.py b/.github/scripts/validate-hints/validate_hints.py index c10feb735..8058f8099 100755 --- a/.github/scripts/validate-hints/validate_hints.py +++ b/.github/scripts/validate-hints/validate_hints.py @@ -32,7 +32,7 @@ def no_finite_verbs(text: str) -> ErrorMessage | None: doc = NLP(text) findings: list[str] = [] for token in doc: - if token.pos_ in ["VERB", "AUX"] and token.morph.get("VerbForm") == ["Fin"]: + if token.pos_ in ["VERB", "AUX"] and token.morph.get("VerbForm", None) == ["Fin"]: findings.append(f"{token.text} ({token.lemma_})") if findings: From 3181288ee3188bf90d26a39bc4ee507aee8a6d03 Mon Sep 17 00:00:00 2001 From: Cristovao Cordeiro Date: Tue, 17 Feb 2026 10:51:01 +0100 Subject: [PATCH 08/15] ci(validate-hints): assert YAML mapping --- .../validate-hints/test_validate_hints.py | 9 +++++++++ .../scripts/validate-hints/validate_hints.py | 20 ++++++++++--------- 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/.github/scripts/validate-hints/test_validate_hints.py b/.github/scripts/validate-hints/test_validate_hints.py index d1d8f93a0..84d62826f 100644 --- a/.github/scripts/validate-hints/test_validate_hints.py +++ b/.github/scripts/validate-hints/test_validate_hints.py @@ -127,6 +127,15 @@ def test_validate_hints_malformed_yaml(self, tmp_path): assert len(errors) == 1 assert "Failed to parse YAML" in errors[0] + def test_validate_hints_non_dict_yaml(self, tmp_path): + f = tmp_path / "bad.yaml" + f.write_text("- foo", encoding="utf-8") + + errors = validate_hints.validate_hints(str(f)) + assert len(errors) == 1 + assert "Failed to parse YAML" in errors[0] + assert "YAML mapping" in errors[0] + class TestMain: """Test the main execution flow.""" diff --git a/.github/scripts/validate-hints/validate_hints.py b/.github/scripts/validate-hints/validate_hints.py index 8058f8099..575783566 100755 --- a/.github/scripts/validate-hints/validate_hints.py +++ b/.github/scripts/validate-hints/validate_hints.py @@ -32,7 +32,9 @@ def no_finite_verbs(text: str) -> ErrorMessage | None: doc = NLP(text) findings: list[str] = [] for token in doc: - if token.pos_ in ["VERB", "AUX"] and token.morph.get("VerbForm", None) == ["Fin"]: + if token.pos_ in ["VERB", "AUX"] and token.morph.get("VerbForm", None) == [ + "Fin" + ]: findings.append(f"{token.text} ({token.lemma_})") if findings: @@ -97,9 +99,7 @@ def is_sentence_case(text: str) -> ErrorMessage | None: # Check if first letter is upper if not s_text[0].isupper(): - findings.append( - f"'{s_text}' (first letter '{s_text[0]}' is not uppercase)" - ) + findings.append(f"'{s_text}' (first letter '{s_text[0]}' is not uppercase)") if findings: return f"text must be sentence case: {', '.join(findings)}" @@ -114,9 +114,13 @@ def validate_hints(file_path: str) -> list[str]: try: with open(file_path, "r", encoding="utf-8") as f: content = yaml.safe_load(f) + + assert isinstance( + content, dict + ), f"Expected {file_path} to be a YAML mapping (dict) at the top level" except Exception as e: return [f"File={file_path}, Error=Failed to parse YAML: {e}"] - + slices = content.get("slices", {}) if not isinstance(slices, dict): return [] @@ -149,9 +153,7 @@ def validate_hints(file_path: str) -> list[str]: def main() -> None: - parser = argparse.ArgumentParser( - description="Validate hints in slice definitions" - ) + parser = argparse.ArgumentParser(description="Validate hints in slice definitions") parser.add_argument("files", nargs="+", help="Slice definition files to validate") args = parser.parse_args() @@ -162,7 +164,7 @@ def main() -> None: all_errors.extend(validate_hints(input_file)) if all_errors: - logging.error("\033[91mThe following 'hints' are not valid\033[0m") + logging.error("\033[91mThe 'hint' validation steps failed\033[0m") all_errors.sort() for error in all_errors: logging.error(error) From 18cbbb2ad8c488d4f0688339fe8abc9a549df8e8 Mon Sep 17 00:00:00 2001 From: Cristovao Cordeiro Date: Tue, 17 Feb 2026 10:52:34 +0100 Subject: [PATCH 09/15] fix(ci): annotate list of callable validators --- .github/scripts/validate-hints/validate_hints.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/scripts/validate-hints/validate_hints.py b/.github/scripts/validate-hints/validate_hints.py index 575783566..8b42d5616 100755 --- a/.github/scripts/validate-hints/validate_hints.py +++ b/.github/scripts/validate-hints/validate_hints.py @@ -7,6 +7,7 @@ import logging import re import sys +from typing import Callable import spacy import yaml @@ -125,7 +126,7 @@ def validate_hints(file_path: str) -> list[str]: if not isinstance(slices, dict): return [] - validators = [ + validators: list[Callable[[str], ErrorMessage | None]] = [ no_finite_verbs, no_starting_articles, no_special_characters, From 88f535361e2cba344be37322474f45c52ab9cd54 Mon Sep 17 00:00:00 2001 From: Cristovao Cordeiro Date: Tue, 17 Feb 2026 10:58:16 +0100 Subject: [PATCH 10/15] ci: parameterize colored logs --- .github/scripts/validate-hints/validate_hints.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/.github/scripts/validate-hints/validate_hints.py b/.github/scripts/validate-hints/validate_hints.py index 8b42d5616..41213105a 100755 --- a/.github/scripts/validate-hints/validate_hints.py +++ b/.github/scripts/validate-hints/validate_hints.py @@ -26,6 +26,10 @@ NLP = spacy.load("en_core_web_sm") ErrorMessage = str +COLORED_LOGGING = { + "red": "\033[31m", + "reset": "\033[0m", +} def no_finite_verbs(text: str) -> ErrorMessage | None: @@ -165,7 +169,9 @@ def main() -> None: all_errors.extend(validate_hints(input_file)) if all_errors: - logging.error("\033[91mThe 'hint' validation steps failed\033[0m") + logging.error( + f"{COLORED_LOGGING['red']}The 'hint' validation steps failed{COLORED_LOGGING['reset']}" + ) all_errors.sort() for error in all_errors: logging.error(error) From 7a789f4eef5f199c66b72a060c5f4b129a59257d Mon Sep 17 00:00:00 2001 From: Cristovao Cordeiro Date: Tue, 17 Feb 2026 12:08:40 +0100 Subject: [PATCH 11/15] ci(validate-hints): only validate if needed --- .github/workflows/validate-hints.yaml | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/.github/workflows/validate-hints.yaml b/.github/workflows/validate-hints.yaml index 389696358..af42c289a 100644 --- a/.github/workflows/validate-hints.yaml +++ b/.github/workflows/validate-hints.yaml @@ -4,15 +4,16 @@ on: workflow_call: jobs: - validate: - name: Validate slice definition hints + check-changed-files: + name: Prepare files to be validated runs-on: ubuntu-latest # Needs to be called, from a workflow triggered by a PR if: | startswith(github.event_name, 'pull_request') && startswith(github.base_ref, 'ubuntu-') - env: - main-branch-path: files-from-main + outputs: + changed-files: ${{ steps.changed-paths.outputs.slices_files }} + has-changes: ${{ steps.changed-paths.outputs.slices }} steps: - uses: actions/checkout@v4 with: @@ -32,6 +33,14 @@ jobs: # wrap filename with unsafe characters. list-files: shell + validate: + name: Validate hints + runs-on: ubuntu-latest + needs: [ check-changed-files ] + if: needs.check-changed-files.outputs.has-changes == 'true' + env: + main-branch-path: files-from-main + steps: - name: Checkout main branch uses: actions/checkout@v4 with: @@ -49,4 +58,4 @@ jobs: script-dir: "${{ env.main-branch-path }}/.github/scripts/validate-hints" run: | ./${{ env.script-dir }}/validate_hints.py \ - ${{ steps.changed-paths.outputs.slices_files }} + ${{ needs.check-changed-files.outputs.changed-files }} From cd34fceaf5f8f5bcf75714352dc554f9eec0d083 Mon Sep 17 00:00:00 2001 From: Cristovao Cordeiro Date: Tue, 17 Feb 2026 12:20:38 +0100 Subject: [PATCH 12/15] feat(validate-hints): disallow consecutive spaces --- .../validate-hints/test_validate_hints.py | 18 +++++++++++++++++- .../scripts/validate-hints/validate_hints.py | 10 ++++++++++ 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/.github/scripts/validate-hints/test_validate_hints.py b/.github/scripts/validate-hints/test_validate_hints.py index 84d62826f..edb94cbdb 100644 --- a/.github/scripts/validate-hints/test_validate_hints.py +++ b/.github/scripts/validate-hints/test_validate_hints.py @@ -84,6 +84,19 @@ def test_is_sentence_case(self): assert err is not None assert "'s'" in err + def test_has_consecutive_spaces(self): + # Valid + assert validate_hints.has_consecutive_spaces("System configuration") is None + + # Invalid + err = [ + validate_hints.has_consecutive_spaces("No consecutive spaces"), + validate_hints.has_consecutive_spaces("Ends with\t\tspace"), + ] + + assert None not in err + assert all("two or more consecutive spaces" in e for e in err) + class TestValidateHints: """Test the file validation logic.""" @@ -116,7 +129,10 @@ def test_validate_hints(self, tmp_path): assert "article (a, an, the) is not allowed: 'A'" in errors[0] assert "can only contain alphanumeric characters" in errors[1] assert "(first letter 'b' is not uppercase)" in errors[2] - assert "trailing punctuation and spaces are not allowed: found '.' at the end" in errors[3] + assert ( + "trailing punctuation and spaces are not allowed: found '.' at the end" + in errors[3] + ) assert "finite verbs are not allowed" in errors[4] def test_validate_hints_malformed_yaml(self, tmp_path): diff --git a/.github/scripts/validate-hints/validate_hints.py b/.github/scripts/validate-hints/validate_hints.py index 41213105a..29268394e 100755 --- a/.github/scripts/validate-hints/validate_hints.py +++ b/.github/scripts/validate-hints/validate_hints.py @@ -111,6 +111,15 @@ def is_sentence_case(text: str) -> ErrorMessage | None: return None +def has_consecutive_spaces(text: str) -> ErrorMessage | None: + """Check that the text does not contain consecutive spaces.""" + pattern = r"\s{2,}" + + if re.search(pattern, text): + return "contains two or more consecutive spaces" + return None + + def validate_hints(file_path: str) -> list[str]: """Validate hints in a single slice definition file.""" logging.info(f"Processing {file_path}...") @@ -136,6 +145,7 @@ def validate_hints(file_path: str) -> list[str]: no_special_characters, no_trailing_punctuation, is_sentence_case, + has_consecutive_spaces, ] for slice_name, values in slices.items(): From 4e73a8ad09c7bf41f12ed298d13ca07843e2c1e6 Mon Sep 17 00:00:00 2001 From: Cristovao Cordeiro Date: Tue, 17 Feb 2026 17:15:02 +0100 Subject: [PATCH 13/15] Apply suggestions from code review Co-authored-by: Marcin Konowalczyk --- .github/scripts/validate-hints/validate_hints.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/scripts/validate-hints/validate_hints.py b/.github/scripts/validate-hints/validate_hints.py index 29268394e..07b8bce6c 100755 --- a/.github/scripts/validate-hints/validate_hints.py +++ b/.github/scripts/validate-hints/validate_hints.py @@ -26,7 +26,7 @@ NLP = spacy.load("en_core_web_sm") ErrorMessage = str -COLORED_LOGGING = { +COLORED_LOGGING: dict[str, str] = { "red": "\033[31m", "reset": "\033[0m", } @@ -111,7 +111,7 @@ def is_sentence_case(text: str) -> ErrorMessage | None: return None -def has_consecutive_spaces(text: str) -> ErrorMessage | None: +def no_consecutive_spaces(text: str) -> ErrorMessage | None: """Check that the text does not contain consecutive spaces.""" pattern = r"\s{2,}" @@ -145,7 +145,7 @@ def validate_hints(file_path: str) -> list[str]: no_special_characters, no_trailing_punctuation, is_sentence_case, - has_consecutive_spaces, + no_consecutive_spaces, ] for slice_name, values in slices.items(): From 96d2deae208bb33bb6f6ec86b0b20de0c3ced0f6 Mon Sep 17 00:00:00 2001 From: Cristovao Cordeiro Date: Tue, 17 Feb 2026 17:39:42 +0100 Subject: [PATCH 14/15] fix: validate-hints tests --- .github/scripts/validate-hints/test_validate_hints.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/scripts/validate-hints/test_validate_hints.py b/.github/scripts/validate-hints/test_validate_hints.py index edb94cbdb..f327b8aab 100644 --- a/.github/scripts/validate-hints/test_validate_hints.py +++ b/.github/scripts/validate-hints/test_validate_hints.py @@ -84,14 +84,14 @@ def test_is_sentence_case(self): assert err is not None assert "'s'" in err - def test_has_consecutive_spaces(self): + def test_no_consecutive_spaces(self): # Valid - assert validate_hints.has_consecutive_spaces("System configuration") is None + assert validate_hints.no_consecutive_spaces("System configuration") is None # Invalid err = [ - validate_hints.has_consecutive_spaces("No consecutive spaces"), - validate_hints.has_consecutive_spaces("Ends with\t\tspace"), + validate_hints.no_consecutive_spaces("No consecutive spaces"), + validate_hints.no_consecutive_spaces("Ends with\t\tspace"), ] assert None not in err From 1a6cebb6c2a54a7bce6393b28ba1be4df5ee7777 Mon Sep 17 00:00:00 2001 From: Cristovao Cordeiro Date: Tue, 17 Feb 2026 18:03:51 +0100 Subject: [PATCH 15/15] feat: lazy load NLP initialization --- .../validate-hints/test_validate_hints.py | 21 +++++++++- .../scripts/validate-hints/validate_hints.py | 38 +++++++++++-------- 2 files changed, 43 insertions(+), 16 deletions(-) diff --git a/.github/scripts/validate-hints/test_validate_hints.py b/.github/scripts/validate-hints/test_validate_hints.py index f327b8aab..7ded57892 100644 --- a/.github/scripts/validate-hints/test_validate_hints.py +++ b/.github/scripts/validate-hints/test_validate_hints.py @@ -12,7 +12,26 @@ class TestValidators: - """Test individual validator functions.""" + """Test individual validator functions, including NLP initialization.""" + + def test_get_nlp_caching(self): + # Ensure that the NLP model is loaded and cached + nlp1 = validate_hints.get_nlp() + nlp2 = validate_hints.get_nlp() + assert nlp1 is nlp2, "NLP model should be cached and reused" + + @patch("spacy.load") + @patch("spacy.cli.download") + def test_nonexistent_model(self, mock_download, mock_load): + # Simulate the case where the model is not available and needs to be downloaded + validate_hints.get_nlp() # Ensure model is loaded and cached + mock_load.side_effect = [OSError("Model not found"), validate_hints._NLP_CACHE] + + validate_hints._NLP_CACHE = None # Reset cache + nlp = validate_hints.get_nlp() + mock_load.assert_called_with("en_core_web_sm") + mock_download.assert_called_with("en_core_web_sm") + assert nlp is not None, "NLP model should be loaded after download" def test_no_finite_verbs(self): # Valid cases diff --git a/.github/scripts/validate-hints/validate_hints.py b/.github/scripts/validate-hints/validate_hints.py index 07b8bce6c..91d52ad2f 100755 --- a/.github/scripts/validate-hints/validate_hints.py +++ b/.github/scripts/validate-hints/validate_hints.py @@ -12,19 +12,8 @@ import spacy import yaml -# Configure logging -logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") - -# Load NLP model -try: - NLP = spacy.load("en_core_web_sm") -except OSError: - logging.warning("Downloading en_core_web_sm model...") - from spacy.cli import download - - download("en_core_web_sm") - NLP = spacy.load("en_core_web_sm") +_NLP_CACHE: spacy.language.Language | None = None ErrorMessage = str COLORED_LOGGING: dict[str, str] = { "red": "\033[31m", @@ -32,9 +21,25 @@ } +def get_nlp() -> spacy.language.Language: + """Lazy load the NLP model.""" + global _NLP_CACHE + if _NLP_CACHE is None: + try: + _NLP_CACHE = spacy.load("en_core_web_sm") + except OSError: + logging.warning("Downloading en_core_web_sm model...") + from spacy.cli import download + + download("en_core_web_sm") + _NLP_CACHE = spacy.load("en_core_web_sm") + + return _NLP_CACHE + + def no_finite_verbs(text: str) -> ErrorMessage | None: """Check that the text does not contain finite verbs.""" - doc = NLP(text) + doc = get_nlp()(text) findings: list[str] = [] for token in doc: if token.pos_ in ["VERB", "AUX"] and token.morph.get("VerbForm", None) == [ @@ -94,7 +99,7 @@ def is_sentence_case(text: str) -> ErrorMessage | None: """Check that each sentence in the text starts with an uppercase letter.""" # It is not enough to split the text by '.' and check each sentence separately, # because we can have complex punctuation like "Single 1.1 sentence" - doc = NLP(text) + doc = get_nlp()(text) findings: list[str] = [] for sent in doc.sents: @@ -117,7 +122,7 @@ def no_consecutive_spaces(text: str) -> ErrorMessage | None: if re.search(pattern, text): return "contains two or more consecutive spaces" - return None + return None def validate_hints(file_path: str) -> list[str]: @@ -172,6 +177,9 @@ def main() -> None: parser.add_argument("files", nargs="+", help="Slice definition files to validate") args = parser.parse_args() + # Configure logging + logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") + logging.info("Validating slice definition hints") all_errors: list[str] = []