From bf362a9db1e40b3108f741f2516f885990fe183c Mon Sep 17 00:00:00 2001 From: Aaron Steven White Date: Tue, 28 Oct 2025 10:37:35 -0400 Subject: [PATCH] Fixes FrameNet lexical unit loading by parsing `luIndex.xml` and relaxing validators to handle all actual data. --- .pre-commit-config.yaml | 1 + CHANGELOG.md | 16 ++- README.md | 20 +-- docs/api/index.md | 2 +- docs/api/references/index.md | 6 +- docs/api/utils/fuzzy-match.md | 2 +- docs/citation.md | 8 +- docs/index.md | 2 +- docs/installation.md | 4 +- docs/quick-start.md | 4 +- docs/user-guide/cli.md | 16 +-- docs/user-guide/cross-references.md | 14 +- docs/user-guide/fuzzy-search.md | 72 +++++----- pyproject.toml | 2 +- src/glazing/__version__.py | 2 +- src/glazing/cli/convert.py | 6 +- src/glazing/framenet/converter.py | 193 ++++++++++++++++++++++++-- src/glazing/framenet/types.py | 20 ++- tests/test_framenet/test_converter.py | 171 +++++++++++++++++++++++ tests/test_framenet/test_models.py | 4 +- tests/test_framenet/test_types.py | 11 +- 21 files changed, 474 insertions(+), 102 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index af0d53a..e624b77 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -9,6 +9,7 @@ repos: - id: check-added-large-files - id: check-merge-conflict - id: debug-statements + language_version: python3.13 - id: mixed-line-ending - repo: local diff --git a/CHANGELOG.md b/CHANGELOG.md index 0b28a0d..05c574a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,17 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.2.1] - 2025-10-28 + +### Fixed + +- **FrameNet lexical units now properly loaded during conversion** + - Lexical units are now parsed from `luIndex.xml` during frame conversion + - All frames now include their associated lexical units with complete metadata + - Fixes critical data completeness issue where `frame.lexical_units` was always empty + - Enables querying frames by lexical unit name via the frame index + - Approximately 13,500 lexical units now correctly associated with their frames + ## [0.2.0] - 2025-09-30 ### Added @@ -20,7 +31,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Support for parsing complex symbols like ARG1-PPT, ?Theme_i, Core[Agent] #### Fuzzy Search and Matching -- **Fuzzy search capability** with Levenshtein distance-based matching +- **Fuzzy search capability** with Levenshtein distance-based matching to find data with typos, morphological variants, and spelling inconsistencies - **Configurable similarity thresholds** for controlling match precision - **Multi-field fuzzy matching** across names, descriptions, and identifiers - **Search result ranking** - New ranking module for scoring search results by match type and field relevance @@ -186,7 +197,8 @@ Initial release of `glazing`, a package containing unified data models and inter - `tqdm >= 4.60.0` (progress bars) - `rich >= 13.0.0` (CLI formatting) -[Unreleased]: https://github.com/aaronstevenwhite/glazing/compare/v0.2.0...HEAD +[Unreleased]: https://github.com/aaronstevenwhite/glazing/compare/v0.2.1...HEAD +[0.2.1]: https://github.com/aaronstevenwhite/glazing/releases/tag/v0.2.1 [0.2.0]: https://github.com/aaronstevenwhite/glazing/releases/tag/v0.2.0 [0.1.1]: https://github.com/aaronstevenwhite/glazing/releases/tag/v0.1.1 [0.1.0]: https://github.com/aaronstevenwhite/glazing/releases/tag/v0.1.0 diff --git a/README.md b/README.md index 183c6a7..3b120c4 100644 --- a/README.md +++ 
b/README.md @@ -15,7 +15,7 @@ Unified data models and interfaces for syntactic and semantic frame ontologies. - 📦 **Type-safe models**: Pydantic v2 validation for all data structures - 🔍 **Unified search**: Query across all datasets with consistent API - 🔗 **Cross-references**: Automatic mapping between resources with confidence scores -- 🎯 **Fuzzy search**: Find matches even with typos or partial queries +- 🎯 **Fuzzy search**: Find data with typos, spelling variants, and inconsistencies - 🐳 **Docker support**: Use via Docker without local installation - 💾 **Efficient storage**: JSON Lines format with streaming support - 🐍 **Modern Python**: Full type hints, Python 3.13+ support @@ -83,9 +83,9 @@ glazing search query "abandon" # Search specific dataset glazing search query "run" --dataset verbnet -# Use fuzzy search for typos -glazing search query "giv" --fuzzy -glazing search query "instrment" --fuzzy --threshold 0.7 +# Find data with typos or spelling variants +glazing search query "realize" --fuzzy +glazing search query "organize" --fuzzy --threshold 0.8 ``` Resolve cross-references: @@ -98,8 +98,8 @@ glazing xref extract glazing xref resolve "give.01" --source propbank glazing xref resolve "give-13.1" --source verbnet -# Use fuzzy matching -glazing xref resolve "giv.01" --source propbank --fuzzy +# Find data with variations or inconsistencies +glazing xref resolve "realize.01" --source propbank --fuzzy ``` ## Python API @@ -131,8 +131,8 @@ refs = xref.resolve("give.01", source="propbank") print(f"VerbNet classes: {refs['verbnet_classes']}") print(f"Confidence scores: {refs['confidence_scores']}") -# Use fuzzy matching for typos -refs = xref.resolve("giv.01", source="propbank", fuzzy=True) +# Find data with variations or inconsistencies +refs = xref.resolve("realize.01", source="propbank", fuzzy=True) print(f"Found match with fuzzy search: {refs['verbnet_classes']}") ``` @@ -141,9 +141,9 @@ Fuzzy search in Python: ```python from glazing.search import UnifiedSearch -# Use fuzzy search to handle typos +# Find data with typos or spelling variants search = UnifiedSearch() -results = search.search_with_fuzzy("instrment", fuzzy_threshold=0.8) +results = search.search_with_fuzzy("organize", fuzzy_threshold=0.8) for result in results[:5]: print(f"{result.dataset}: {result.name} (score: {result.score:.2f})") diff --git a/docs/api/index.md b/docs/api/index.md index ca41a95..86288bf 100644 --- a/docs/api/index.md +++ b/docs/api/index.md @@ -118,7 +118,7 @@ except ValidationError as e: ## Version Compatibility -This documentation covers Glazing version 0.2.0. Check your installed version: +This documentation covers Glazing version 0.2.1. 
Check your installed version: ```python import glazing diff --git a/docs/api/references/index.md b/docs/api/references/index.md index ecf158e..f8f9cb2 100644 --- a/docs/api/references/index.md +++ b/docs/api/references/index.md @@ -18,8 +18,8 @@ xref = CrossReferenceIndex() refs = xref.resolve("give.01", source="propbank") print(refs["verbnet_classes"]) # ['give-13.1'] -# Use fuzzy matching for typos -refs = xref.resolve("giv.01", source="propbank", fuzzy=True) +# Find data with variations or inconsistencies +refs = xref.resolve("realize.01", source="propbank", fuzzy=True) ``` ## Main Classes @@ -53,7 +53,7 @@ class CrossReferenceIndex( - **Automatic Extraction**: References are extracted automatically on first use - **Caching**: Extracted references are cached for fast subsequent loads -- **Fuzzy Matching**: Handle typos and variations with configurable thresholds +- **Fuzzy Matching**: Find data with typos, morphological variants, and spelling inconsistencies - **Confidence Scores**: All mappings include confidence scores - **Progress Indicators**: Visual feedback during extraction diff --git a/docs/api/utils/fuzzy-match.md b/docs/api/utils/fuzzy-match.md index 8cf4e27..4d76e60 100644 --- a/docs/api/utils/fuzzy-match.md +++ b/docs/api/utils/fuzzy-match.md @@ -4,7 +4,7 @@ Fuzzy string matching utilities using Levenshtein distance. ## Overview -The fuzzy_match module provides functions for fuzzy string matching using Levenshtein distance and other similarity metrics. It includes text normalization and caching for performance. +The fuzzy_match module provides functions for fuzzy string matching using Levenshtein distance and other similarity metrics. It includes text normalization and caching for performance. The primary use case is finding data that contains typos, morphological variants, or spelling inconsistencies in the underlying datasets. ## Functions diff --git a/docs/citation.md b/docs/citation.md index 6730224..5a59caf 100644 --- a/docs/citation.md +++ b/docs/citation.md @@ -12,22 +12,22 @@ If you use Glazing in your research, please cite our work. title = {Glazing: Unified Data Models and Interfaces for Syntactic and Semantic Frame Ontologies}, year = {2025}, url = {https://github.com/aaronstevenwhite/glazing}, - version = {0.2.0}, + version = {0.2.1}, doi = {10.5281/zenodo.17185626} } ``` ### APA -White, A. S. (2025). *Glazing: Unified Data Models and Interfaces for Syntactic and Semantic Frame Ontologies* (Version 0.2.0) [Computer software]. https://github.com/aaronstevenwhite/glazing +White, A. S. (2025). *Glazing: Unified Data Models and Interfaces for Syntactic and Semantic Frame Ontologies* (Version 0.2.1) [Computer software]. https://github.com/aaronstevenwhite/glazing ### Chicago -White, Aaron Steven. 2025. *Glazing: Unified Data Models and Interfaces for Syntactic and Semantic Frame Ontologies*. Version 0.2.0. https://github.com/aaronstevenwhite/glazing. +White, Aaron Steven. 2025. *Glazing: Unified Data Models and Interfaces for Syntactic and Semantic Frame Ontologies*. Version 0.2.1. https://github.com/aaronstevenwhite/glazing. ### MLA -White, Aaron Steven. *Glazing: Unified Data Models and Interfaces for Syntactic and Semantic Frame Ontologies*. Version 0.2.0, 2025, https://github.com/aaronstevenwhite/glazing. +White, Aaron Steven. *Glazing: Unified Data Models and Interfaces for Syntactic and Semantic Frame Ontologies*. Version 0.2.1, 2025, https://github.com/aaronstevenwhite/glazing. 
## Citing Datasets diff --git a/docs/index.md b/docs/index.md index 42b2bbc..f811334 100644 --- a/docs/index.md +++ b/docs/index.md @@ -93,7 +93,7 @@ If you use Glazing in your research, please cite: title = {Glazing: Unified Data Models and Interfaces for Syntactic and Semantic Frame Ontologies}, year = {2025}, url = {https://github.com/aaronstevenwhite/glazing}, - version = {0.2.0}, + version = {0.2.1}, doi = {10.5281/zenodo.17185626} } ``` diff --git a/docs/installation.md b/docs/installation.md index 57f5640..bd6a53f 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -168,8 +168,8 @@ docker run --rm -v glazing-data:/data glazing:latest init # Search across datasets docker run --rm -v glazing-data:/data glazing:latest search query "give" -# Search with fuzzy matching -docker run --rm -v glazing-data:/data glazing:latest search query "giv" --fuzzy +# Find data with variations using fuzzy matching +docker run --rm -v glazing-data:/data glazing:latest search query "realize" --fuzzy # Extract cross-references docker run --rm -v glazing-data:/data glazing:latest xref extract diff --git a/docs/quick-start.md b/docs/quick-start.md index b847a36..46305d8 100644 --- a/docs/quick-start.md +++ b/docs/quick-start.md @@ -81,8 +81,8 @@ refs = xref.resolve("give.01", source="propbank") print(f"VerbNet classes: {refs['verbnet_classes']}") print(f"Confidence scores: {refs['confidence_scores']}") -# Use fuzzy matching for typos -refs = xref.resolve("giv.01", source="propbank", fuzzy=True) +# Find data with variations or inconsistencies +refs = xref.resolve("realize.01", source="propbank", fuzzy=True) print(f"VerbNet classes: {refs['verbnet_classes']}") ``` diff --git a/docs/user-guide/cli.md b/docs/user-guide/cli.md index a2c5b6e..7d33f8e 100644 --- a/docs/user-guide/cli.md +++ b/docs/user-guide/cli.md @@ -38,15 +38,15 @@ glazing search query "give" --limit 10 --json ### Fuzzy Search -Use fuzzy matching to find results even with typos or partial matches: +Use fuzzy matching to find data with typos, morphological variants, or spelling inconsistencies: ```bash -# Find matches for typos -glazing search query "giv" --fuzzy -glazing search query "instrment" --fuzzy --threshold 0.7 +# Find data with variations +glazing search query "realize" --fuzzy +glazing search query "organize" --fuzzy --threshold 0.8 # Adjust the threshold (0.0-1.0, higher is stricter) -glazing search query "runing" --fuzzy --threshold 0.85 +glazing search query "analyze" --fuzzy --threshold 0.85 ``` ### Syntactic Pattern Search @@ -110,9 +110,9 @@ Find mappings between datasets: glazing xref resolve "give.01" --source propbank glazing xref resolve "give-13.1" --source verbnet -# Use fuzzy matching for typos -glazing xref resolve "giv.01" --source propbank --fuzzy -glazing xref resolve "transfer-11.1" --source verbnet --fuzzy --threshold 0.8 +# Find data with variations or inconsistencies +glazing xref resolve "realize.01" --source propbank --fuzzy +glazing xref resolve "organize-74" --source verbnet --fuzzy --threshold 0.8 # Get JSON output glazing xref resolve "Giving" --source framenet --json diff --git a/docs/user-guide/cross-references.md b/docs/user-guide/cross-references.md index d9c0413..b998255 100644 --- a/docs/user-guide/cross-references.md +++ b/docs/user-guide/cross-references.md @@ -30,8 +30,8 @@ refs = xref.resolve("give.01", source="propbank") print(f"VerbNet classes: {refs['verbnet_classes']}") print(f"Confidence scores: {refs['confidence_scores']}") -# Use fuzzy matching for typos -refs = 
xref.resolve("giv.01", source="propbank", fuzzy=True) +# Find data with variations or inconsistencies +refs = xref.resolve("realize.01", source="propbank", fuzzy=True) print(f"VerbNet classes: {refs['verbnet_classes']}") ``` @@ -93,13 +93,13 @@ xref.clear_cache() ### Fuzzy Matching -The system supports fuzzy matching for handling typos and variations: +The system supports fuzzy matching for finding data with typos, morphological variants, and spelling inconsistencies: ```python -# Find matches even with typos -refs = xref.resolve("transferr.01", source="propbank", fuzzy=True, threshold=0.7) +# Find data with variations +refs = xref.resolve("organize.01", source="propbank", fuzzy=True, threshold=0.8) -# The system will find "transfer.01" and return its references +# The system will find variants if they exist and return their references ``` ### Confidence Scores @@ -111,4 +111,4 @@ All mappings include confidence scores based on: ## Limitations -Cross-references in these datasets are incomplete and sometimes approximate. VerbNet members don't always have WordNet mappings. PropBank rolesets may lack VerbNet mappings. The quality and coverage of references varies between dataset pairs. Fuzzy matching can occasionally produce false positives at lower thresholds. +Cross-references in these datasets are incomplete and sometimes approximate. VerbNet members don't always have WordNet mappings. PropBank rolesets may lack VerbNet mappings. The quality and coverage of references varies between dataset pairs. The datasets themselves may contain typos or morphological variants, which fuzzy matching helps to address. Fuzzy matching can occasionally produce false positives at lower thresholds. diff --git a/docs/user-guide/fuzzy-search.md b/docs/user-guide/fuzzy-search.md index 7a17fee..0abba3e 100644 --- a/docs/user-guide/fuzzy-search.md +++ b/docs/user-guide/fuzzy-search.md @@ -1,10 +1,10 @@ # Fuzzy Search -Fuzzy search uses Levenshtein distance to find matches despite typos, misspellings, or partial queries. +Fuzzy search uses Levenshtein distance to find data that contains typos, morphological variants, or spelling inconsistencies. ## When to Use Fuzzy Search -Fuzzy search is useful when exact matches fail due to typos in queries, uncertain spelling, partial matches, or when searching for similar but not exact terms. +Fuzzy search is useful when the underlying datasets contain typos, morphological variants (e.g., "realise" vs "realize"), spelling inconsistencies, or partial forms. These datasets are compiled from various sources over time and may contain inconsistencies that would otherwise prevent exact matches. 
## Implementation @@ -16,23 +16,23 @@ The system calculates Levenshtein distance between strings, measuring the minimu ```bash # Enable fuzzy matching with default threshold (0.8) -glazing search query "instrment" --fuzzy +glazing search query "realize" --fuzzy # Custom threshold (lower = more permissive) -glazing search query "giv" --fuzzy --threshold 0.6 +glazing search query "organize" --fuzzy --threshold 0.7 # Combine with dataset filter -glazing search query "trasfer" --fuzzy --dataset propbank +glazing search query "analyze" --fuzzy --dataset propbank ``` ### Cross-Reference Resolution ```bash # Fuzzy match cross-references -glazing xref resolve "giv.01" --source propbank --fuzzy +glazing xref resolve "realize.01" --source propbank --fuzzy # With custom threshold -glazing xref resolve "trasnsfer.01" --source propbank --fuzzy --threshold 0.7 +glazing xref resolve "organize.01" --source propbank --fuzzy --threshold 0.8 ``` ## Python API @@ -45,10 +45,10 @@ from glazing.search import UnifiedSearch search = UnifiedSearch() # Fuzzy search with default threshold -results = search.search_with_fuzzy("instrment") +results = search.search_with_fuzzy("realize") # Custom threshold -results = search.search_with_fuzzy("giv", fuzzy_threshold=0.6) +results = search.search_with_fuzzy("organize", fuzzy_threshold=0.7) # Check match scores for result in results[:5]: @@ -63,14 +63,14 @@ from glazing.references.index import CrossReferenceIndex xref = CrossReferenceIndex() # Resolve with fuzzy matching -refs = xref.resolve("giv.01", source="propbank", fuzzy=True) +refs = xref.resolve("realize.01", source="propbank", fuzzy=True) # With confidence threshold refs = xref.resolve( - "trasfer.01", + "organize.01", source="propbank", fuzzy=True, - confidence_threshold=0.7 + confidence_threshold=0.8 ) ``` @@ -80,22 +80,22 @@ refs = xref.resolve( from glazing.utils.fuzzy_match import fuzzy_match, find_best_match # Find multiple matches -candidates = ["instrument", "argument", "document"] -results = fuzzy_match("instrment", candidates, threshold=0.7) +candidates = ["realize", "realise", "recognition"] +results = fuzzy_match("realise", candidates, threshold=0.8) # Find single best match -best = find_best_match("giv", ["give", "take", "have"]) +best = find_best_match("organize", ["organize", "organise", "organisation"]) ``` ## Threshold Selection | Threshold | Use Case | Example Matches | |-----------|----------|-----------------| -| 0.9-1.0 | Near-exact matches | "give" → "give" | -| 0.8-0.9 | Minor typos | "instrment" → "instrument" | -| 0.7-0.8 | Multiple typos | "trasfer" → "transfer" | -| 0.6-0.7 | Significant differences | "giv" → "give" | -| Below 0.6 | Very loose matching | "doc" → "document" | +| 0.9-1.0 | Near-exact matches | "organize" → "organise" | +| 0.8-0.9 | Minor variations | "realize" → "realise" | +| 0.7-0.8 | Multiple variations | "color" → "colour" | +| 0.6-0.7 | Significant differences | "analyze" → "analyse" | +| Below 0.6 | Very loose matching | "recognise" → "recognize" | ## Text Normalization @@ -103,32 +103,32 @@ Text undergoes automatic normalization before matching: accents are removed (caf ## Examples -### Finding Misspelled Verbs +### Finding Data with Spelling Variants ```python -search.search_with_fuzzy("recieve") # Finds "receive" -search.search_with_fuzzy("occure") # Finds "occur" -search.search_with_fuzzy("seperate") # Finds "separate" +search.search_with_fuzzy("realize") # Finds "realise" if dataset contains this variant +search.search_with_fuzzy("organize") # Finds "organise" if 
dataset contains this variant +search.search_with_fuzzy("analyze") # Finds "analyse" if dataset contains this variant ``` -### Partial Matches +### Partial Forms -Short queries require lower thresholds: +Short forms or abbreviations in data require lower thresholds: ```python -search.search_with_fuzzy("giv", fuzzy_threshold=0.6) # Finds "give" -search.search_with_fuzzy("tak", fuzzy_threshold=0.6) # Finds "take" -search.search_with_fuzzy("trans", fuzzy_threshold=0.7) # Finds "transfer" +search.search_with_fuzzy("recognise", fuzzy_threshold=0.7) # Finds "recognize" +search.search_with_fuzzy("colour", fuzzy_threshold=0.7) # Finds "color" +search.search_with_fuzzy("favour", fuzzy_threshold=0.8) # Finds "favor" ``` -### Spelling Variants +### Morphological Variants -The system handles British and American spelling differences: +The system finds British and American spelling differences in the data: ```python -search.search_with_fuzzy("realise") # Finds "realize" -search.search_with_fuzzy("colour") # Finds "color" -search.search_with_fuzzy("analyse") # Finds "analyze" +search.search_with_fuzzy("realise") # Finds "realize" if present +search.search_with_fuzzy("colour") # Finds "color" if present +search.search_with_fuzzy("analyse") # Finds "analyze" if present ``` ## Performance @@ -147,9 +147,9 @@ from glazing.search import UnifiedSearch search = UnifiedSearch() # Process multiple queries -queries = ["giv", "tak", "mak"] +queries = ["realize", "organize", "analyze"] for query in queries: - results = search.search_with_fuzzy(query, fuzzy_threshold=0.7) + results = search.search_with_fuzzy(query, fuzzy_threshold=0.8) if results: print(f"{query} → {results[0].name}") ``` diff --git a/pyproject.toml b/pyproject.toml index 9f5e114..f7cc609 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "glazing" -version = "0.2.0" +version = "0.2.1" description = "Unified data models and interfaces for syntactic and semantic frame ontologies" readme = "README.md" requires-python = ">=3.13" diff --git a/src/glazing/__version__.py b/src/glazing/__version__.py index 5659225..7aa23f1 100644 --- a/src/glazing/__version__.py +++ b/src/glazing/__version__.py @@ -1,4 +1,4 @@ """Version information for the glazing package.""" -__version__ = "0.2.0" +__version__ = "0.2.1" __version_info__ = tuple(int(i) for i in __version__.split(".")) diff --git a/src/glazing/cli/convert.py b/src/glazing/cli/convert.py index 857c382..20c1560 100644 --- a/src/glazing/cli/convert.py +++ b/src/glazing/cli/convert.py @@ -181,7 +181,7 @@ def convert_wordnet(input_dir: Path, output_dir: Path, verbose: bool = False) -> def convert_framenet(input_dir: Path, output_dir: Path, verbose: bool = False) -> None: - """Convert FrameNet XML frames to JSON Lines. + """Convert FrameNet XML frames to JSON Lines with lexical units. 
Parameters ---------- @@ -209,14 +209,14 @@ def convert_framenet(input_dir: Path, output_dir: Path, verbose: bool = False) - TimeElapsedColumn(), console=console, ) as progress: - task = progress.add_task("Converting FrameNet files...", total=None) + task = progress.add_task("Converting FrameNet frames and lexical units...", total=None) count = converter.convert_frames_directory(str(frames_dir), str(output_file)) progress.update(task, completed=True) if verbose: - console.print(f"[green]✓[/green] Converted {count} frames") + console.print(f"[green]✓[/green] Converted {count} frames with lexical units") console.print(f" Output: {output_file}") diff --git a/src/glazing/framenet/converter.py b/src/glazing/framenet/converter.py index 4a98c65..60081cd 100644 --- a/src/glazing/framenet/converter.py +++ b/src/glazing/framenet/converter.py @@ -45,6 +45,9 @@ AnnotatedText, Frame, FrameElement, + Lexeme, + LexicalUnit, + SentenceCount, ) from glazing.utils.xml_parser import ( parse_attributes, @@ -358,13 +361,154 @@ def convert_frame_file(self, filepath: Path | str) -> Frame: ), ) + def _parse_lu_from_index(self, lu_elem: etree._Element) -> LexicalUnit: + """Parse a lexical unit from luIndex.xml element. + + Parameters + ---------- + lu_elem : etree._Element + The LU element from luIndex.xml. + + Returns + ------- + LexicalUnit + Parsed lexical unit. + """ + attrs = parse_attributes( + lu_elem, + {"ID": int, "frameID": int, "numAnnotInstances": int, "hasAnnotation": bool}, + ) + + # Extract required attributes + lu_id = int(attrs["ID"]) + lu_name = str(attrs["name"]) + frame_id = int(attrs["frameID"]) + frame_name = str(attrs["frameName"]) + status = str(attrs.get("status", "Unknown")) + has_annotation = bool(attrs.get("hasAnnotation", False)) + num_annotated = int(attrs.get("numAnnotInstances", 0)) + + # Extract POS from name (e.g., "abandon.v" -> "V") + parts = lu_name.split(".") + if len(parts) >= 2: + pos_lower = parts[-1] + # Map lowercase POS to uppercase + pos_map = {"v": "V", "n": "N", "a": "A", "adv": "ADV", "prep": "PREP", "num": "NUM"} + pos = pos_map.get(pos_lower, pos_lower.upper()) + else: + # Default to V if no POS specified + pos = "V" + + # Create lexemes from the name + # Extract lemma part (everything before the last dot) + lemma = ".".join(parts[:-1]) if len(parts) >= 2 else lu_name + + # Split multi-word LUs: try underscore first, then space + # Examples: "give_up.v" -> ["give", "up"], "a bit.n" -> ["a", "bit"] + if "_" in lemma: + word_parts = lemma.split("_") + elif " " in lemma: + word_parts = lemma.split(" ") + else: + word_parts = [lemma] + + lexemes = [] + for i, word in enumerate(word_parts): + if not word: # Skip empty strings from splitting + continue + # First word is typically the headword + is_headword = i == 0 + # Keep the word as-is: real FrameNet data has parentheses, brackets, etc. + lexemes.append( + Lexeme( + name=word, + pos=pos, # type: ignore[arg-type] + headword=is_headword, + order=i + 1, + ) + ) + + # Create sentence count from annotation data + sentence_count = SentenceCount( + annotated=num_annotated if has_annotation else 0, + total=num_annotated, # For luIndex, we only have annotated count + ) + + # Create minimal definition (luIndex doesn't have full definitions) + definition = f"Lexical unit '{lu_name}' in frame '{frame_name}'." 
+ + # Note: Using field names, not aliases, since we're constructing programmatically + # The model uses aliases for deserialization from JSON + lu_dict = { + "id": lu_id, + "name": lu_name, + "pos": pos, + "definition": definition, + "status": status if status != "Unknown" else None, # annotation_status alias + "totalAnnotated": num_annotated if has_annotation else None, # total_annotated + "has_annotated_examples": has_annotation, + "frame_id": frame_id, + "frame_name": frame_name, + "sentence_count": sentence_count, + "lexemes": lexemes, + } + return LexicalUnit(**lu_dict) # type: ignore[arg-type] + + def convert_lu_index_file(self, filepath: Path | str) -> list[LexicalUnit]: + """Convert luIndex.xml to a list of LexicalUnit models. + + Parameters + ---------- + filepath : Path | str + Path to luIndex.xml file. + + Returns + ------- + list[LexicalUnit] + List of parsed LexicalUnit models. + + Examples + -------- + >>> converter = FrameNetConverter() + >>> lus = converter.convert_lu_index_file("framenet_v17/luIndex.xml") + >>> print(f"Loaded {len(lus)} lexical units") + 'Loaded 13575 lexical units' + """ + filepath = Path(filepath) + + # Parse XML + if self.validate_schema: + root = parse_with_schema(filepath) + else: + tree = etree.parse(str(filepath)) + root = tree.getroot() + + # Parse all LU elements + lexical_units = [] + lu_tag = f"{{{self.namespace}}}lu" if self.namespace else "lu" + for lu_elem in root.findall(lu_tag): + try: + lu = self._parse_lu_from_index(lu_elem) + lexical_units.append(lu) + except (ValueError, KeyError, TypeError) as e: + # Skip invalid LUs but continue processing + lu_name = lu_elem.get("name", "unknown") + # Log error but don't fail entire conversion + print(f"Warning: Failed to parse LU '{lu_name}': {e}") + continue + + return lexical_units + def convert_frames_directory( self, input_dir: Path | str, output_file: Path | str, pattern: str = "*.xml", ) -> int: - """Convert all frame files in a directory to JSON Lines. + """Convert all frame files in a directory to JSON Lines with lexical units. + + This method parses frame XML files and associates them with lexical units + from luIndex.xml (expected to be in the parent directory of input_dir). 
Parameters ---------- @@ -393,19 +537,46 @@ def convert_frames_directory( input_dir = Path(input_dir) output_file = Path(output_file) - count = 0 + # First, parse all frames + frames: list[Frame] = [] errors: list[tuple[Path, Exception]] = [] + for xml_file in sorted(input_dir.glob(pattern)): + try: + frame = self.convert_frame_file(xml_file) + frames.append(frame) + except (etree.XMLSyntaxError, ValueError, TypeError) as e: + errors.append((xml_file, e)) + + # Load lexical units from luIndex.xml (in parent directory) + parent_dir = input_dir.parent if input_dir.name == "frame" else input_dir + lu_index_path = parent_dir / "luIndex.xml" + + lexical_units: list[LexicalUnit] = [] + if lu_index_path.exists(): + try: + lexical_units = self.convert_lu_index_file(lu_index_path) + except (etree.XMLSyntaxError, ValueError, TypeError) as e: + print(f"Warning: Failed to load lexical units from {lu_index_path}: {e}") + + # Associate LUs with frames by frame_id + lu_by_frame: dict[int, list[LexicalUnit]] = {} + for lu in lexical_units: + if lu.frame_id not in lu_by_frame: + lu_by_frame[lu.frame_id] = [] + lu_by_frame[lu.frame_id].append(lu) + + # Update frames with their lexical units + for frame in frames: + frame.lexical_units = lu_by_frame.get(frame.id, []) + + # Write frames with LUs to output file + count = 0 with output_file.open("w", encoding="utf-8") as f: - for xml_file in sorted(input_dir.glob(pattern)): - try: - frame = self.convert_frame_file(xml_file) - # Write as JSON Lines - json_line = frame.model_dump_json(exclude_none=True) - f.write(json_line + "\n") - count += 1 - except (etree.XMLSyntaxError, ValueError, TypeError) as e: - errors.append((xml_file, e)) + for frame in frames: + json_line = frame.model_dump_json(exclude_none=True) + f.write(json_line + "\n") + count += 1 # If there were any errors, raise an exception with details if errors: diff --git a/src/glazing/framenet/types.py b/src/glazing/framenet/types.py index 7551062..a30cb2b 100644 --- a/src/glazing/framenet/types.py +++ b/src/glazing/framenet/types.py @@ -378,6 +378,11 @@ def is_valid_fe_abbrev(abbrev: str) -> bool: def is_valid_lu_name(name: str) -> bool: """Check if a string is a valid lexical unit name. + FrameNet LU names follow the pattern: lemma.pos where lemma can be + any string including multi-word expressions, proper nouns, acronyms, + and special characters. Examples: "abandon.v", "April.n", "a bit.n", + "(can't) help.v", "American [N and S Am].n" + Parameters ---------- name : str @@ -386,9 +391,10 @@ def is_valid_lu_name(name: str) -> bool: Returns ------- bool - True if the name matches the LU name pattern. + True if the name matches the LU name pattern (has at least one char and .pos). """ - return bool(re.match(r"^[a-z][a-z0-9_\'-]*\.[a-z]+$", name, re.IGNORECASE)) + # Very permissive: just require something.pos format + return bool(re.match(LU_NAME_PATTERN, name)) def is_valid_username(username: str) -> bool: @@ -426,10 +432,14 @@ def is_valid_hex_color(color: str) -> bool: # FrameNet-specific pattern constants FRAME_NAME_PATTERN = r"^[A-Za-z0-9_\-]+$" # Allow hyphens in frame names FE_NAME_PATTERN = r"^[A-Za-z0-9_\-\.\'\s]+$" # Allow hyphens, periods, apostrophes, spaces -FE_ABBREV_PATTERN = r"^[A-Za-z0-9_\-\.\'\s/]+$" # Allow slashes for abbreviations like H/C -LU_NAME_PATTERN = r"^[a-z][a-z0-9_\'-]*\.[a-z]+$" +FE_ABBREV_PATTERN = r"^[A-Za-z0-9_\-\.\'\s/]+$" # Allow slashes like H/C +# LU names: proper nouns, multi-word expressions, special chars (parentheses, brackets, etc.) 
+# Examples: "abandon.v", "April.n", "(can't) help.v", "a bit.n" +LU_NAME_PATTERN = r"^.+\.[a-z]+$" # Anything followed by .pos (very permissive) USERNAME_PATTERN = r"^[A-Za-z][A-Za-z0-9]*$" -LEXEME_NAME_PATTERN = r"^[a-zA-Z][a-zA-Z0-9\'-]*$" +# Lexeme names: spaces, hyphens, apostrophes, slashes, parentheses, brackets, digits +# Examples: "abandon", "a bit", "Boxing Day", "Scud-B missile", "(can't" +LEXEME_NAME_PATTERN = r"^.+$" # Very permissive - any non-empty string # Counts and limits MAX_FRAME_ELEMENTS = 100 # Maximum FEs per frame diff --git a/tests/test_framenet/test_converter.py b/tests/test_framenet/test_converter.py index e71a4cb..c79079b 100644 --- a/tests/test_framenet/test_converter.py +++ b/tests/test_framenet/test_converter.py @@ -165,3 +165,174 @@ def test_convenience_function(self, framenet_frame_xml): assert frame.id == 2031 assert frame.name == "Abandonment" assert len(frame.frame_elements) == 5 + + +class TestLexicalUnitParsing: + """Test lexical unit parsing from luIndex.xml.""" + + def test_parse_lu_from_index_basic(self, tmp_path): + """Test parsing basic LU from luIndex element.""" + lu_xml = """ + + + """ + + lu_file = tmp_path / "luIndex.xml" + lu_file.write_text(lu_xml) + + converter = FrameNetConverter() + lus = converter.convert_lu_index_file(lu_file) + + assert len(lus) == 1 + lu = lus[0] + assert lu.id == 12345 + assert lu.name == "abandon.v" + assert lu.pos == "V" + assert lu.frame_id == 2031 + assert lu.frame_name == "Abandonment" + assert lu.annotation_status == "Finished_Initial" + assert lu.has_annotated_examples is True + assert lu.sentence_count.total == 11 + assert lu.sentence_count.annotated == 11 + assert len(lu.lexemes) == 1 + assert lu.lexemes[0].name == "abandon" + assert lu.lexemes[0].pos == "V" + assert lu.lexemes[0].headword is True + + def test_parse_multi_word_lu(self, tmp_path): + """Test parsing multi-word LU like 'give_up.v'.""" + lu_xml = """ + + + """ + + lu_file = tmp_path / "luIndex.xml" + lu_file.write_text(lu_xml) + + converter = FrameNetConverter() + lus = converter.convert_lu_index_file(lu_file) + + assert len(lus) == 1 + lu = lus[0] + assert lu.name == "give_up.v" + assert len(lu.lexemes) == 2 + assert lu.lexemes[0].name == "give" + assert lu.lexemes[0].headword is True + assert lu.lexemes[0].order == 1 + assert lu.lexemes[1].name == "up" + assert lu.lexemes[1].headword is False + assert lu.lexemes[1].order == 2 + + def test_parse_lu_different_pos(self, tmp_path): + """Test parsing LUs with different parts of speech.""" + lu_xml = """ + + + + + """ + + lu_file = tmp_path / "luIndex.xml" + lu_file.write_text(lu_xml) + + converter = FrameNetConverter() + lus = converter.convert_lu_index_file(lu_file) + + assert len(lus) == 3 + assert lus[0].pos == "N" + assert lus[1].pos == "A" + assert lus[2].pos == "ADV" + + def test_convert_frames_with_lus(self, tmp_path): + """Test converting frames directory with LU association.""" + # Create frame directory structure + frames_dir = tmp_path / "frame" + frames_dir.mkdir() + + # Create a test frame XML + frame_xml = """ + + A test frame. + + The agent. 
+ + """ + + frame_file = frames_dir / "TestFrame.xml" + frame_file.write_text(frame_xml) + + # Create luIndex.xml in parent directory + lu_xml = """ + + + + + """ + + lu_index_file = tmp_path / "luIndex.xml" + lu_index_file.write_text(lu_xml) + + # Convert + output_file = tmp_path / "output.jsonl" + converter = FrameNetConverter() + count = converter.convert_frames_directory(frames_dir, output_file) + + assert count == 1 + + # Verify output + with output_file.open("r") as f: + data = json.loads(f.readline()) + + assert data["id"] == 100 + assert data["name"] == "TestFrame" + assert "lexical_units" in data + assert len(data["lexical_units"]) == 2 + + lu_names = [lu["name"] for lu in data["lexical_units"]] + assert "test.v" in lu_names + assert "examine.v" in lu_names + assert "other.v" not in lu_names # Different frame + + def test_lu_index_missing_no_crash(self, tmp_path): + """Test that conversion works even if luIndex.xml is missing.""" + frames_dir = tmp_path / "frame" + frames_dir.mkdir() + + frame_xml = """ + + A test frame. + """ + + frame_file = frames_dir / "TestFrame.xml" + frame_file.write_text(frame_xml) + + # No luIndex.xml created + + output_file = tmp_path / "output.jsonl" + converter = FrameNetConverter() + count = converter.convert_frames_directory(frames_dir, output_file) + + assert count == 1 + + # Frame should have empty lexical_units list + with output_file.open("r") as f: + data = json.loads(f.readline()) + + assert data["id"] == 100 + assert "lexical_units" in data + assert len(data["lexical_units"]) == 0 diff --git a/tests/test_framenet/test_models.py b/tests/test_framenet/test_models.py index 3c749ae..84cb42f 100644 --- a/tests/test_framenet/test_models.py +++ b/tests/test_framenet/test_models.py @@ -980,8 +980,10 @@ def test_non_headword_lexeme(self): def test_invalid_lexeme_name(self): """Test validation of lexeme name.""" + # Lexeme names are very permissive in real FrameNet data (spaces, special chars, etc.) + # Only empty strings should be rejected with pytest.raises(ValueError, match="Invalid lexeme name format"): - Lexeme(name="123invalid", pos="V") + Lexeme(name="", pos="V") def test_break_before_alias(self): """Test breakBefore field alias.""" diff --git a/tests/test_framenet/test_types.py b/tests/test_framenet/test_types.py index aa7a0e2..525a978 100644 --- a/tests/test_framenet/test_types.py +++ b/tests/test_framenet/test_types.py @@ -266,8 +266,9 @@ def test_valid_lu_names(self): def test_invalid_lu_names(self): """Test LU name validation with invalid names.""" - # Note: The regex uses re.IGNORECASE, so ABANDON.V is valid - invalid_names = ["abandon", "abandon.", ".v", "123.v", "test word.v", ""] + # Real FrameNet data is very permissive (proper nouns, spaces, special chars, etc.) 
+ # Only reject truly invalid patterns: no .pos suffix, empty, or just a dot + invalid_names = ["abandon", "abandon.", ".v", "", "test."] for name in invalid_names: assert not is_valid_lu_name(name), f"Should reject invalid LU name: {name}" @@ -353,11 +354,15 @@ def test_hex_color_pattern(self): def test_lexeme_name_pattern(self): """Test LEXEME_NAME_PATTERN regex.""" + # Lexeme names are very permissive in real FrameNet data (spaces, digits, symbols) + # Examples from real data: "a bit", "Boxing Day", "Scud-B missile", "(can't" pattern = re.compile(LEXEME_NAME_PATTERN) assert pattern.match("abandon") assert pattern.match("give") assert pattern.match("it's") - assert not pattern.match("123word") + assert pattern.match("123word") # Real FrameNet has numbers + assert pattern.match("a bit") # Real FrameNet multi-word expressions + assert not pattern.match("") # Only empty string is invalid class TestConstants:
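For readers of this patch, a minimal sketch of how the new lexical-unit loading could be exercised once the change is applied. The class and method names (`FrameNetConverter`, `convert_lu_index_file`, `convert_frames_directory`, the `lexical_units` field) come from the diff above; the module path, the FrameNet 1.7 directory layout (`framenet_v17/frame` with `luIndex.xml` in its parent), the output filename, and the `Counter` summary are illustrative assumptions, not part of the patch.

```python
# Sketch only: paths and summary logic are illustrative, not from the patch.
import json
from collections import Counter

from glazing.framenet.converter import FrameNetConverter  # module path assumed from src/ layout

converter = FrameNetConverter()

# Lexical units can now be parsed directly from luIndex.xml.
lus = converter.convert_lu_index_file("framenet_v17/luIndex.xml")
print(f"Parsed {len(lus)} lexical units")

# convert_frames_directory() now attaches lexical units to frames automatically,
# looking for luIndex.xml in the parent of the frame/ directory.
count = converter.convert_frames_directory("framenet_v17/frame", "framenet.jsonl")
print(f"Converted {count} frames")

# Each JSON Lines record now carries its lexical units.
lu_counts: Counter[str] = Counter()
with open("framenet.jsonl", encoding="utf-8") as f:
    for line in f:
        frame = json.loads(line)
        lu_counts[frame["name"]] = len(frame.get("lexical_units", []))

# Frames with the most associated lexical units.
print(lu_counts.most_common(5))
```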