diff --git a/.github/scripts/validate-hints/requirements.txt b/.github/scripts/validate-hints/requirements.txt new file mode 100644 index 000000000..76b2df95e --- /dev/null +++ b/.github/scripts/validate-hints/requirements.txt @@ -0,0 +1,3 @@ +spacy>=3.0.0 +https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl +pyyaml>=5.0.0 diff --git a/.github/scripts/validate-hints/test_validate_hints.py b/.github/scripts/validate-hints/test_validate_hints.py new file mode 100644 index 000000000..7ded57892 --- /dev/null +++ b/.github/scripts/validate-hints/test_validate_hints.py @@ -0,0 +1,192 @@ +#!/usr/bin/env python3 +""" +Unit tests for validate_hints.py +""" +import sys +import os +from unittest.mock import patch + +sys.path.append(os.path.dirname(os.path.abspath(__file__))) + +import validate_hints + + +class TestValidators: + """Test individual validator functions, including NLP initialization.""" + + def test_get_nlp_caching(self): + # Ensure that the NLP model is loaded and cached + nlp1 = validate_hints.get_nlp() + nlp2 = validate_hints.get_nlp() + assert nlp1 is nlp2, "NLP model should be cached and reused" + + @patch("spacy.load") + @patch("spacy.cli.download") + def test_nonexistent_model(self, mock_download, mock_load): + # Simulate the case where the model is not available and needs to be downloaded + validate_hints.get_nlp() # Ensure model is loaded and cached + mock_load.side_effect = [OSError("Model not found"), validate_hints._NLP_CACHE] + + validate_hints._NLP_CACHE = None # Reset cache + nlp = validate_hints.get_nlp() + mock_load.assert_called_with("en_core_web_sm") + mock_download.assert_called_with("en_core_web_sm") + assert nlp is not None, "NLP model should be loaded after download" + + def test_no_finite_verbs(self): + # Valid cases + assert validate_hints.no_finite_verbs("System configuration") is None + assert validate_hints.no_finite_verbs("Installation of packages") is None + + # Invalid cases + err = [] + err.append(validate_hints.no_finite_verbs("This contains a verb")) + err.append(validate_hints.no_finite_verbs("This is a verb")) + assert len(err) == 2 + assert all("finite verbs are not allowed" in e for e in err) + assert "contains" in err[0] + assert "is" in err[1] + + def test_no_starting_articles(self): + # Valid + assert validate_hints.no_starting_articles("System configuration") is None + + # Invalid + assert ( + validate_hints.no_starting_articles("The system configuration") is not None + ) + assert validate_hints.no_starting_articles("A configuration") is not None + assert validate_hints.no_starting_articles("An apple") is not None + + # Case sensitivity + assert validate_hints.no_starting_articles("the system") is not None + + def test_no_special_characters(self): + # Valid + assert validate_hints.no_special_characters("System configuration") is None + assert validate_hints.no_special_characters("File (config.yaml)") is None + assert validate_hints.no_special_characters("Items one, two; three.") is None + + # Invalid + set_of_invalid_chars = "@#$%^:&*!~`<>/\\|" + err = validate_hints.no_special_characters( + f"Some invalid chars: {set_of_invalid_chars}" + ) + assert err is not None + assert all(char in err for char in set_of_invalid_chars) + + def test_no_trailing_punctuation(self): + # Valid + assert validate_hints.no_trailing_punctuation("System configuration") is None + assert validate_hints.no_trailing_punctuation("Ends with parenthesis)") is None + + # Invalid + for punct in {".", "!", "?", ",", ";", ":", " "}: + assert ( + validate_hints.no_trailing_punctuation(f"Ends with {punct}") is not None + ) + + def test_is_sentence_case(self): + # Valid + assert validate_hints.is_sentence_case("System configuration") is None + assert ( + validate_hints.is_sentence_case("System configuration. Another sentence.") + is None + ) + + # Invalid + err = validate_hints.is_sentence_case("system configuration") + assert err is not None + assert "first letter" in err + + err = validate_hints.is_sentence_case("First sentence. second sentence.") + assert err is not None + assert "'s'" in err + + def test_no_consecutive_spaces(self): + # Valid + assert validate_hints.no_consecutive_spaces("System configuration") is None + + # Invalid + err = [ + validate_hints.no_consecutive_spaces("No consecutive spaces"), + validate_hints.no_consecutive_spaces("Ends with\t\tspace"), + ] + + assert None not in err + assert all("two or more consecutive spaces" in e for e in err) + + +class TestValidateHints: + """Test the file validation logic.""" + + def test_validate_hints(self, tmp_path): + f = tmp_path / "sdf.yaml" + yaml_content = """slices: + good: + hint: Pass the test + + bad-article: + hint: A bad hint + + bad-chars: + hint: Hint with invalid characters @#$%^&*() - the end + + bad-sentence-case: + hint: Ok. but not sentence case + + bad-trailing-punctuation: + hint: With trailing punctuation. + + bad-verbs: + hint: Has to fail the test, it is not valid, contains finite verbs +""" + f.write_text(yaml_content, encoding="utf-8") + + errors = validate_hints.validate_hints(str(f)) + assert len(errors) == 5 + assert "article (a, an, the) is not allowed: 'A'" in errors[0] + assert "can only contain alphanumeric characters" in errors[1] + assert "(first letter 'b' is not uppercase)" in errors[2] + assert ( + "trailing punctuation and spaces are not allowed: found '.' at the end" + in errors[3] + ) + assert "finite verbs are not allowed" in errors[4] + + def test_validate_hints_malformed_yaml(self, tmp_path): + f = tmp_path / "bad.yaml" + f.write_text("slices: [", encoding="utf-8") + + errors = validate_hints.validate_hints(str(f)) + assert len(errors) == 1 + assert "Failed to parse YAML" in errors[0] + + def test_validate_hints_non_dict_yaml(self, tmp_path): + f = tmp_path / "bad.yaml" + f.write_text("- foo", encoding="utf-8") + + errors = validate_hints.validate_hints(str(f)) + assert len(errors) == 1 + assert "Failed to parse YAML" in errors[0] + assert "YAML mapping" in errors[0] + + +class TestMain: + """Test the main execution flow.""" + + @patch("validate_hints.validate_hints") + @patch("sys.exit") + def test_main_success(self, mock_exit, mock_validate): + mock_validate.return_value = [] + with patch("sys.argv", ["script", "file.yaml"]): + validate_hints.main() + mock_exit.assert_not_called() + + @patch("validate_hints.validate_hints") + @patch("sys.exit") + def test_main_failure(self, mock_exit, mock_validate): + mock_validate.return_value = ["Some error"] + with patch("sys.argv", ["script", "file.yaml"]): + validate_hints.main() + mock_exit.assert_called_with(1) diff --git a/.github/scripts/validate-hints/validate_hints.py b/.github/scripts/validate-hints/validate_hints.py new file mode 100755 index 000000000..91d52ad2f --- /dev/null +++ b/.github/scripts/validate-hints/validate_hints.py @@ -0,0 +1,203 @@ +#!/usr/bin/env python3 +""" +Script to validate hints in slice definition files. +""" + +import argparse +import logging +import re +import sys +from typing import Callable + +import spacy +import yaml + + +_NLP_CACHE: spacy.language.Language | None = None +ErrorMessage = str +COLORED_LOGGING: dict[str, str] = { + "red": "\033[31m", + "reset": "\033[0m", +} + + +def get_nlp() -> spacy.language.Language: + """Lazy load the NLP model.""" + global _NLP_CACHE + if _NLP_CACHE is None: + try: + _NLP_CACHE = spacy.load("en_core_web_sm") + except OSError: + logging.warning("Downloading en_core_web_sm model...") + from spacy.cli import download + + download("en_core_web_sm") + _NLP_CACHE = spacy.load("en_core_web_sm") + + return _NLP_CACHE + + +def no_finite_verbs(text: str) -> ErrorMessage | None: + """Check that the text does not contain finite verbs.""" + doc = get_nlp()(text) + findings: list[str] = [] + for token in doc: + if token.pos_ in ["VERB", "AUX"] and token.morph.get("VerbForm", None) == [ + "Fin" + ]: + findings.append(f"{token.text} ({token.lemma_})") + + if findings: + return f"finite verbs are not allowed: {', '.join(findings)}" + return None + + +def no_starting_articles(text: str) -> ErrorMessage | None: + """Check that the text does not start with an article.""" + words = text.split() + if not words: + return None + + first_word = words[0] + articles: set[str] = {"a", "an", "the"} + if first_word.lower() in articles: + return ( + f"starting with an article ({', '.join(sorted(articles))}) " + f"is not allowed: '{first_word}'" + ) + return None + + +def no_special_characters(text: str) -> ErrorMessage | None: + """ + Check that the text contains only allowed characters. + Allowed: alphanumeric, periods, commas, semicolons, parentheses. + """ + # Regex for characters that are NOT allowed + forbidden_pattern = r"[^a-zA-Z0-9.,;()\s]" + + bad_chars = re.findall(forbidden_pattern, text) + if bad_chars: + unique_bad_chars = set(bad_chars) + return ( + f"can only contain alphanumeric characters, periods, commas, " + f"semicolons, parentheses: found {', '.join(unique_bad_chars)}" + ) + return None + + +def no_trailing_punctuation(text: str) -> ErrorMessage | None: + """Check that the text does not end with punctuation, except parentheses.""" + punctuation_marks: set[str] = {".", "!", "?", ",", ";", ":", " "} + + if text and text[-1] in punctuation_marks: + return f"trailing punctuation and spaces are not allowed: found '{text[-1]}' at the end" + return None + + +def is_sentence_case(text: str) -> ErrorMessage | None: + """Check that each sentence in the text starts with an uppercase letter.""" + # It is not enough to split the text by '.' and check each sentence separately, + # because we can have complex punctuation like "Single 1.1 sentence" + doc = get_nlp()(text) + findings: list[str] = [] + + for sent in doc.sents: + s_text = sent.text.strip() + if not s_text: + continue + + # Check if first letter is upper + if not s_text[0].isupper(): + findings.append(f"'{s_text}' (first letter '{s_text[0]}' is not uppercase)") + + if findings: + return f"text must be sentence case: {', '.join(findings)}" + return None + + +def no_consecutive_spaces(text: str) -> ErrorMessage | None: + """Check that the text does not contain consecutive spaces.""" + pattern = r"\s{2,}" + + if re.search(pattern, text): + return "contains two or more consecutive spaces" + return None + + +def validate_hints(file_path: str) -> list[str]: + """Validate hints in a single slice definition file.""" + logging.info(f"Processing {file_path}...") + errors: list[str] = [] + + try: + with open(file_path, "r", encoding="utf-8") as f: + content = yaml.safe_load(f) + + assert isinstance( + content, dict + ), f"Expected {file_path} to be a YAML mapping (dict) at the top level" + except Exception as e: + return [f"File={file_path}, Error=Failed to parse YAML: {e}"] + + slices = content.get("slices", {}) + if not isinstance(slices, dict): + return [] + + validators: list[Callable[[str], ErrorMessage | None]] = [ + no_finite_verbs, + no_starting_articles, + no_special_characters, + no_trailing_punctuation, + is_sentence_case, + no_consecutive_spaces, + ] + + for slice_name, values in slices.items(): + if not isinstance(values, dict): + continue + + hint = values.get("hint", "") + # Skip empty hints or non-string hints + if not hint or not isinstance(hint, str): + continue + + for validator in validators: + error_msg = validator(hint) + if error_msg: + errors.append( + f"File={file_path}, Slice={slice_name}, Error={error_msg}" + ) + + return errors + + +def main() -> None: + parser = argparse.ArgumentParser(description="Validate hints in slice definitions") + parser.add_argument("files", nargs="+", help="Slice definition files to validate") + args = parser.parse_args() + + # Configure logging + logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") + + logging.info("Validating slice definition hints") + + all_errors: list[str] = [] + for input_file in args.files: + all_errors.extend(validate_hints(input_file)) + + if all_errors: + logging.error( + f"{COLORED_LOGGING['red']}The 'hint' validation steps failed{COLORED_LOGGING['reset']}" + ) + all_errors.sort() + for error in all_errors: + logging.error(error) + + sys.exit(1) + + logging.info("All hints are valid") + + +if __name__ == "__main__": + main() diff --git a/.github/workflows/lint.yaml b/.github/workflows/lint.yaml index 2b64528e6..e4fa10dff 100644 --- a/.github/workflows/lint.yaml +++ b/.github/workflows/lint.yaml @@ -107,3 +107,7 @@ jobs: done exit $EXIT_CODE + + validate-hints: + name: Validate hints + uses: ./.github/workflows/validate-hints.yaml diff --git a/.github/workflows/test-ci.yaml b/.github/workflows/test-ci.yaml new file mode 100644 index 000000000..b46365499 --- /dev/null +++ b/.github/workflows/test-ci.yaml @@ -0,0 +1,27 @@ +name: Test chisel-releases CI + +on: + pull_request: + branches: + - "main" + paths: + - ".github/workflows/*.yaml" + - ".github/scripts/validate-hints/**" + +jobs: + test-validate-hints: + name: Test validate hints workflow + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Setup Python + uses: actions/setup-python@v6 + with: + python-version: "3.12" + pip-install: -r .github/scripts/validate-hints/requirements.txt + - run: pip install pytest + - name: Run "validate-hints" unit tests + run: | + pytest .github/scripts/validate-hints/ + + # TODO: add tests for remaining CI scripts diff --git a/.github/workflows/validate-hints.yaml b/.github/workflows/validate-hints.yaml new file mode 100644 index 000000000..af42c289a --- /dev/null +++ b/.github/workflows/validate-hints.yaml @@ -0,0 +1,61 @@ +name: Validate hints + +on: + workflow_call: + +jobs: + check-changed-files: + name: Prepare files to be validated + runs-on: ubuntu-latest + # Needs to be called, from a workflow triggered by a PR + if: | + startswith(github.event_name, 'pull_request') && + startswith(github.base_ref, 'ubuntu-') + outputs: + changed-files: ${{ steps.changed-paths.outputs.slices_files }} + has-changes: ${{ steps.changed-paths.outputs.slices }} + steps: + - uses: actions/checkout@v4 + with: + ref: ${{ github.head_ref }} + repository: ${{ github.event.pull_request.head.repo.full_name }} + + - name: Check changed paths + id: changed-paths + uses: dorny/paths-filter@de90cc6fb38fc0963ad72b210f1f284cd68cea36 # v3.0.2 + with: + # ref: https://github.com/marketplace/actions/paths-changes-filter + filters: | + slices: + - added|modified: 'slices/**/*.yaml' + # Space delimited list usable as command-line argument list in + # Linux shell. If needed, it uses single or double quotes to + # wrap filename with unsafe characters. + list-files: shell + + validate: + name: Validate hints + runs-on: ubuntu-latest + needs: [ check-changed-files ] + if: needs.check-changed-files.outputs.has-changes == 'true' + env: + main-branch-path: files-from-main + steps: + - name: Checkout main branch + uses: actions/checkout@v4 + with: + ref: main + path: ${{ env.main-branch-path }} + + - name: Setup Python + uses: actions/setup-python@v6 + with: + python-version: "3.12" + pip-install: -r ${{ env.main-branch-path }}/.github/scripts/validate-hints/requirements.txt + + - name: Validate hints + env: + script-dir: "${{ env.main-branch-path }}/.github/scripts/validate-hints" + run: | + ./${{ env.script-dir }}/validate_hints.py \ + ${{ needs.check-changed-files.outputs.changed-files }}