From 300f77b9b85cb50f7e53998437e54e64807d79c5 Mon Sep 17 00:00:00 2001 From: Aaron Steven White Date: Thu, 5 Feb 2026 19:32:46 -0500 Subject: [PATCH 01/11] Rewrites WordNet converter and loader to use enriched single-file JSONL output with supplementary sense and exception files. --- src/glazing/wordnet/converter.py | 259 +++++++++++++++++ src/glazing/wordnet/loader.py | 264 ++++++++---------- src/glazing/wordnet/models.py | 15 + tests/test_wordnet/test_converter.py | 2 +- tests/test_wordnet/test_loader.py | 185 +++++------- tests/test_wordnet/test_morphy.py | 403 ++++----------------------- 6 files changed, 523 insertions(+), 605 deletions(-) diff --git a/src/glazing/wordnet/converter.py b/src/glazing/wordnet/converter.py index f16112b..cdb498d 100644 --- a/src/glazing/wordnet/converter.py +++ b/src/glazing/wordnet/converter.py @@ -315,6 +315,120 @@ def parse_exception_file(self, filepath: Path | str) -> list[ExceptionEntry]: return entries + def parse_verb_framestext(self, filepath: Path | str) -> dict[int, str]: + """Parse verb.Framestext into a mapping of frame number to template string. + + Parameters + ---------- + filepath : Path | str + Path to verb.Framestext file. + + Returns + ------- + dict[int, str] + Mapping from frame number to template string. + """ + filepath = Path(filepath) + if not filepath.exists(): + return {} + + frames: dict[int, str] = {} + + with filepath.open("r", encoding="utf-8") as f: + for line_raw in f: + line = line_raw.strip() + if not line: + continue + + parts = line.split(None, 1) + if len(parts) < 2: + continue + + try: + frame_num = int(parts[0]) + template = parts[1] + frames[frame_num] = template + except ValueError: + continue + + return frames + + def parse_verb_sentences(self, filepath: Path | str) -> dict[int, str]: + """Parse sents.vrb into a mapping of frame number to example sentence. + + Parameters + ---------- + filepath : Path | str + Path to sents.vrb file. + + Returns + ------- + dict[int, str] + Mapping from frame number to example sentence. + """ + filepath = Path(filepath) + if not filepath.exists(): + return {} + + sentences: dict[int, str] = {} + + with filepath.open("r", encoding="utf-8") as f: + for line_raw in f: + line = line_raw.strip() + if not line: + continue + + parts = line.split(None, 1) + if len(parts) < 2: + continue + + try: + sent_num = int(parts[0]) + sentence = parts[1] + sentences[sent_num] = sentence + except ValueError: + continue + + return sentences + + def parse_cntlist(self, filepath: Path | str) -> dict[str, int]: + """Parse cntlist into a mapping of sense key to frequency count. + + Parameters + ---------- + filepath : Path | str + Path to cntlist file. + + Returns + ------- + dict[str, int] + Mapping from sense key to frequency count. 
+ """ + filepath = Path(filepath) + if not filepath.exists(): + return {} + + counts: dict[str, int] = {} + + with filepath.open("r", encoding="utf-8") as f: + for line_raw in f: + line = line_raw.strip() + if not line: + continue + + parts = line.split() + if len(parts) < 2: + continue + + try: + count = int(parts[0]) + sense_key = parts[1] + counts[sense_key] = count + except ValueError: + continue + + return counts + def convert_wordnet_database( self, wordnet_dir: Path | str, output_file: Path | str ) -> dict[str, int]: @@ -363,6 +477,71 @@ def convert_wordnet_database( all_synsets.extend(synsets) counts[f"synsets_{pos_name}"] = len(synsets) + # Parse supplementary files for enrichment + framestext = self.parse_verb_framestext(wordnet_dir / "verb.Framestext") + sents = self.parse_verb_sentences(wordnet_dir / "sents.vrb") + + # Build sense_key → (sense_number, tag_count) map from index.sense + sense_map: dict[str, tuple[int, int]] = {} + sense_index_file = wordnet_dir / "index.sense" + if sense_index_file.exists(): + with sense_index_file.open("r", encoding="utf-8") as f: + for line_raw in f: + line = line_raw.strip() + if not line: + continue + parts = line.split() + if len(parts) != 4: + continue + try: + sk = parts[0] + sense_number = int(parts[2]) + tag_count = int(parts[3]) + sense_map[sk] = (sense_number, tag_count) + except ValueError: + continue + + # Parse cntlist to enhance tag_count data + cntlist = self.parse_cntlist(wordnet_dir / "cntlist") + for sk, count in cntlist.items(): + if sk in sense_map: + sn, _ = sense_map[sk] + sense_map[sk] = (sn, count) + else: + sense_map[sk] = (0, count) + + # ss_type to number mapping for sense key construction + ss_type_num_map: dict[str, int] = { + "n": 1, + "v": 2, + "a": 3, + "r": 4, + "s": 5, + } + + # Enrich synsets with sense data and verb frame templates + for synset in all_synsets: + ss_num = ss_type_num_map.get(synset.ss_type, 1) + + # Enrich words with sense_number and tag_count + for word in synset.words: + lemma_lower = word.lemma.lower() + sense_key = f"{lemma_lower}%{ss_num}:{synset.lex_filenum:02d}:{word.lex_id:02d}::" + if sense_key in sense_map: + sn, tc = sense_map[sense_key] + if sn > 0: + word.sense_number = sn + word.tag_count = tc + + # Enrich verb frames with template and example_sentence + if synset.frames: + for frame in synset.frames: + fn = frame.frame_number + if fn in framestext: + frame.template = framestext[fn] + if fn in sents: + frame.example_sentence = sents[fn] + # Write all synsets to single output file with output_file.open("w", encoding="utf-8") as f: for synset in all_synsets: @@ -372,6 +551,83 @@ def convert_wordnet_database( return counts + def convert_sense_index(self, wordnet_dir: Path | str, output_file: Path | str) -> int: + """Parse index.sense and output Sense objects to JSONL. + + Parameters + ---------- + wordnet_dir : Path | str + Directory containing WordNet database files. + output_file : Path | str + Output JSON Lines file path. + + Returns + ------- + int + Number of sense entries written. + + Raises + ------ + FileNotFoundError + If index.sense file does not exist. 
+ """ + wordnet_dir = Path(wordnet_dir) + output_file = Path(output_file) + + output_file.parent.mkdir(parents=True, exist_ok=True) + + sense_file = wordnet_dir / "index.sense" + senses = self.parse_sense_index(sense_file) + + with output_file.open("w", encoding="utf-8") as f: + for sense in senses: + f.write(f"{sense.model_dump_json()}\n") + + return len(senses) + + def convert_exceptions(self, wordnet_dir: Path | str, output_file: Path | str) -> int: + """Parse *.exc files and output ExceptionEntry objects to JSONL. + + Parameters + ---------- + wordnet_dir : Path | str + Directory containing WordNet database files. + output_file : Path | str + Output JSON Lines file path. + + Returns + ------- + int + Number of exception entries written. + """ + wordnet_dir = Path(wordnet_dir) + output_file = Path(output_file) + + output_file.parent.mkdir(parents=True, exist_ok=True) + + all_entries: list[ExceptionEntry] = [] + + exc_files: list[tuple[str, WordNetPOS]] = [ + ("noun.exc", "n"), + ("verb.exc", "v"), + ("adj.exc", "a"), + ("adv.exc", "r"), + ] + + for exc_name, pos in exc_files: + exc_path = wordnet_dir / exc_name + if exc_path.exists(): + entries = self.parse_exception_file(exc_path) + for entry in entries: + entry.pos = pos + all_entries.extend(entries) + + with output_file.open("w", encoding="utf-8") as f: + for entry in all_entries: + f.write(f"{entry.model_dump_json()}\n") + + return len(all_entries) + def _parse_data_line(self, line: str) -> Synset | None: """Parse a line from WordNet data file. @@ -458,6 +714,9 @@ def _parse_data_line(self, line: str) -> Synset | None: if ss_type == "v" and idx < len(parts): frames = [] + # Skip frame count field + idx += 1 + # Parse frames until no more "+" markers while idx + 2 < len(parts) and parts[idx] == "+": frame_marker = parts[idx] # "+" diff --git a/src/glazing/wordnet/loader.py b/src/glazing/wordnet/loader.py index 82b86fb..625f85b 100644 --- a/src/glazing/wordnet/loader.py +++ b/src/glazing/wordnet/loader.py @@ -32,7 +32,6 @@ import json from collections import defaultdict from pathlib import Path -from typing import cast from pydantic import ValidationError @@ -40,7 +39,6 @@ from glazing.utils.cache import LRUCache from glazing.wordnet.models import ( ExceptionEntry, - IndexEntry, Sense, Synset, ) @@ -62,7 +60,7 @@ class WordNetLoader: Parameters ---------- data_path : Path | str | None, optional - Path to directory containing WordNet JSON Lines files. + Path to the WordNet JSONL file (e.g., wordnet.jsonl). If None, uses default path from environment. lazy : bool, default=False If True, load synsets on demand rather than all at once. @@ -76,8 +74,8 @@ class WordNetLoader: ---------- synsets : dict[SynsetOffset, Synset] All loaded synsets indexed by offset. - lemma_index : dict[str, dict[WordNetPOS, list[IndexEntry]]] - Index from lemmas to their index entries by POS. + lemma_index : dict[str, dict[WordNetPOS, list[SynsetOffset]]] + Index from lemmas to synset offsets by POS. sense_index : dict[SenseKey, Sense] Index from sense keys to sense objects. exceptions : dict[WordNetPOS, dict[str, list[str]]] @@ -120,7 +118,7 @@ def __init__( Parameters ---------- data_path : Path | str | None, optional - Path to directory containing WordNet JSON Lines files. + Path to the WordNet JSONL file (e.g., wordnet.jsonl). If None, uses default path from environment. lazy : bool, default=False If True, load synsets on demand. 
@@ -138,7 +136,7 @@ def __init__( # Core data structures self.synsets: dict[SynsetOffset, Synset] = {} - self.lemma_index: dict[str, dict[WordNetPOS, list[IndexEntry]]] = defaultdict(dict) + self.lemma_index: dict[str, dict[WordNetPOS, list[SynsetOffset]]] = defaultdict(dict) self.sense_index: dict[SenseKey, Sense] = {} self.exceptions: dict[WordNetPOS, dict[str, list[str]]] = {} @@ -148,8 +146,8 @@ def __init__( self.meronym_index: dict[SynsetOffset, list[SynsetOffset]] = defaultdict(list) self.holonym_index: dict[SynsetOffset, list[SynsetOffset]] = defaultdict(list) - # File paths for lazy loading - self._synset_file_index: dict[SynsetOffset, tuple[Path, int]] = {} + # File index for lazy loading (offset -> byte position in file) + self._synset_file_index: dict[SynsetOffset, int] = {} # Cache for lazy loading if lazy: @@ -167,33 +165,32 @@ def __init__( def load(self) -> None: """Load all WordNet data from JSON Lines files. - This method loads synsets, builds indices, loads exceptions, - and constructs relation graphs. If lazy loading is enabled, - it only builds the file index without loading synset data. + This method loads synsets from the primary JSONL file, builds + lemma and relation indices from loaded data, and optionally loads + supplementary sense and exception data. Raises ------ FileNotFoundError - If data directory or required files don't exist. + If the primary JSONL file doesn't exist. ValidationError If JSON data doesn't match expected schema. """ if self._loaded: return - # Load synsets + # Load synsets from single JSONL file if self.lazy: self._build_file_index() else: self._load_all_synsets() - # Load index files - self._load_index_files() + # Build lemma index from loaded synsets + if not self.lazy: + self._build_lemma_index() - # Load sense index + # Load supplementary data if available self._load_sense_index() - - # Load exceptions self._load_exceptions() # Build relation indices @@ -203,45 +200,44 @@ def load(self) -> None: self._loaded = True def _load_all_synsets(self) -> None: - """Load all synsets from JSON Lines files.""" - for pos in ["noun", "verb", "adj", "adv"]: - synset_file = self.data_path / f"data.{pos}.jsonl" - if not synset_file.exists(): - continue - - with synset_file.open(encoding="utf-8") as f: - for line in f: - if not line.strip(): - continue - - try: - data = json.loads(line) - synset = Synset.model_validate(data) - self.synsets[synset.offset] = synset - except (json.JSONDecodeError, ValidationError) as e: - # Log error but continue loading - print(f"Error loading synset: {e}") + """Load all synsets from single JSONL file.""" + if not self.data_path.exists(): + return + + with self.data_path.open(encoding="utf-8") as f: + for line in f: + if not line.strip(): + continue + + try: + data = json.loads(line) + synset = Synset.model_validate(data) + self.synsets[synset.offset] = synset + except (json.JSONDecodeError, ValidationError): + continue def _build_file_index(self) -> None: - """Build index of synset locations for lazy loading.""" - for pos in ["noun", "verb", "adj", "adv"]: - synset_file = self.data_path / f"data.{pos}.jsonl" - if not synset_file.exists(): - continue - - with synset_file.open(encoding="utf-8") as f: - for line_num, line in enumerate(f): - if not line.strip(): - continue - - try: - # Just extract offset without full validation - data = json.loads(line) - offset = data.get("offset") - if offset: - self._synset_file_index[offset] = (synset_file, line_num) - except json.JSONDecodeError: - pass + """Build byte-offset index for 
lazy loading from single JSONL file.""" + if not self.data_path.exists(): + return + + with self.data_path.open(encoding="utf-8") as f: + while True: + byte_pos = f.tell() + line = f.readline() + if not line: + break + + if not line.strip(): + continue + + try: + data = json.loads(line) + offset = data.get("offset") + if offset: + self._synset_file_index[offset] = byte_pos + except json.JSONDecodeError: + pass def _load_synset_lazy(self, offset: SynsetOffset) -> Synset | None: """Load a single synset on demand. @@ -261,61 +257,44 @@ def _load_synset_lazy(self, offset: SynsetOffset) -> Synset | None: # Check cache first if self._cache is not None: - # Create cache key from offset (cache expects strings) cached = self._cache.get(offset) if cached is not None: return cached - # Load from file - file_info = self._synset_file_index.get(offset) - if not file_info: + # Load from file using byte offset + byte_pos = self._synset_file_index.get(offset) + if byte_pos is None: return None - synset_file, line_num = file_info - try: - with synset_file.open(encoding="utf-8") as f: - for i, line in enumerate(f): - if i == line_num: - data = json.loads(line) - synset = Synset.model_validate(data) + with self.data_path.open(encoding="utf-8") as f: + f.seek(byte_pos) + line = f.readline() + data = json.loads(line) + synset = Synset.model_validate(data) - # Cache it - if self._cache is not None: - self._cache.put(offset, synset) + # Cache it + if self._cache is not None: + self._cache.put(offset, synset) - return synset + return synset except (json.JSONDecodeError, ValidationError): return None - return None - - def _load_index_files(self) -> None: - """Load lemma index files.""" - for pos_name, pos_tag in [("noun", "n"), ("verb", "v"), ("adj", "a"), ("adv", "r")]: - index_file = self.data_path / f"index.{pos_name}.jsonl" - if not index_file.exists(): - continue - - with index_file.open(encoding="utf-8") as f: - for line in f: - if not line.strip(): - continue - - try: - data = json.loads(line) - entry = IndexEntry.model_validate(data) - - # Add to lemma index - if pos_tag not in self.lemma_index[entry.lemma]: - self.lemma_index[entry.lemma][cast(WordNetPOS, pos_tag)] = [] - self.lemma_index[entry.lemma][cast(WordNetPOS, pos_tag)].append(entry) - except (json.JSONDecodeError, ValidationError) as e: - print(f"Error loading index entry: {e}") + def _build_lemma_index(self) -> None: + """Build lemma→synset index from loaded synset data.""" + for synset in self.synsets.values(): + pos = synset.ss_type + for word in synset.words: + lemma = word.lemma.lower() + if pos not in self.lemma_index[lemma]: + self.lemma_index[lemma][pos] = [] + if synset.offset not in self.lemma_index[lemma][pos]: + self.lemma_index[lemma][pos].append(synset.offset) def _load_sense_index(self) -> None: - """Load sense index file.""" - sense_file = self.data_path / "index.sense.jsonl" + """Load sense index from supplementary JSONL file.""" + sense_file = self.data_path.parent / "wordnet_senses.jsonl" if not sense_file.exists(): return @@ -328,31 +307,32 @@ def _load_sense_index(self) -> None: data = json.loads(line) sense = Sense.model_validate(data) self.sense_index[sense.sense_key] = sense - except (json.JSONDecodeError, ValidationError) as e: - print(f"Error loading sense: {e}") + except (json.JSONDecodeError, ValidationError): + continue def _load_exceptions(self) -> None: - """Load morphological exception files.""" - for pos_name, pos_tag in [("noun", "n"), ("verb", "v"), ("adj", "a"), ("adv", "r")]: - exc_file = self.data_path / 
f"{pos_name}.exc.jsonl" - if not exc_file.exists(): - continue - - if pos_tag not in self.exceptions: - self.exceptions[cast(WordNetPOS, pos_tag)] = {} - - with exc_file.open(encoding="utf-8") as f: - for line in f: - if not line.strip(): - continue - - try: - data = json.loads(line) - entry = ExceptionEntry.model_validate(data) - pos_exceptions = self.exceptions[cast(WordNetPOS, pos_tag)] - pos_exceptions[entry.inflected_form] = entry.base_forms - except (json.JSONDecodeError, ValidationError) as e: - print(f"Error loading exception: {e}") + """Load morphological exceptions from supplementary JSONL file.""" + exc_file = self.data_path.parent / "wordnet_exceptions.jsonl" + if not exc_file.exists(): + return + + with exc_file.open(encoding="utf-8") as f: + for line in f: + if not line.strip(): + continue + + try: + data = json.loads(line) + entry = ExceptionEntry.model_validate(data) + # Determine POS from the base form by looking up in synsets + # Store under all POS for simplicity (exceptions file doesn't have POS) + pos = data.get("pos") + if pos and pos in ("n", "v", "a", "r", "s"): + if pos not in self.exceptions: + self.exceptions[pos] = {} + self.exceptions[pos][entry.inflected_form] = entry.base_forms + except (json.JSONDecodeError, ValidationError): + continue def _build_relation_indices(self) -> None: """Build relation indices for efficient traversal.""" @@ -371,12 +351,12 @@ def _build_relation_indices(self) -> None: self.hypernym_index[pointer.offset].append(synset.offset) # Meronym/holonym relations - elif pointer.symbol in ["%m", "%s", "%p"]: + elif pointer.symbol in ("%m", "%s", "%p"): if pointer.offset not in self.meronym_index[synset.offset]: self.meronym_index[synset.offset].append(pointer.offset) if synset.offset not in self.holonym_index[pointer.offset]: self.holonym_index[pointer.offset].append(synset.offset) - elif pointer.symbol in ["#m", "#s", "#p"]: + elif pointer.symbol in ("#m", "#s", "#p"): if pointer.offset not in self.holonym_index[synset.offset]: self.holonym_index[synset.offset].append(pointer.offset) if synset.offset not in self.meronym_index[pointer.offset]: @@ -426,23 +406,24 @@ def get_synsets_by_lemma(self, lemma: str, pos: WordNetPOS | None = None) -> lis ... print(synset.gloss) """ synsets: list[Synset] = [] + lemma_lower = lemma.lower() - if lemma not in self.lemma_index: + if lemma_lower not in self.lemma_index: return synsets # Get POS tags to search + pos_tags: list[WordNetPOS] if pos: - pos_tags = [pos] if pos in self.lemma_index[lemma] else [] + pos_tags = [pos] if pos in self.lemma_index[lemma_lower] else [] else: - pos_tags = list(self.lemma_index[lemma].keys()) + pos_tags = list(self.lemma_index[lemma_lower].keys()) - # Collect synsets + # Collect synsets from offset lists for pos_tag in pos_tags: - for entry in self.lemma_index[lemma][pos_tag]: - for offset in entry.synset_offsets: - synset = self.get_synset(offset) - if synset: - synsets.append(synset) + for offset in self.lemma_index[lemma_lower].get(pos_tag, []): + synset = self.get_synset(offset) + if synset: + synsets.append(synset) return synsets @@ -479,7 +460,7 @@ def get_senses_by_lemma(self, lemma: str, pos: WordNetPOS | None = None) -> list Returns ------- list[Sense] - List of senses for the lemma. + List of senses for the lemma, sorted by sense number. 
Examples -------- @@ -489,18 +470,9 @@ def get_senses_by_lemma(self, lemma: str, pos: WordNetPOS | None = None) -> list """ senses = [] - # Get synsets first - synsets = self.get_synsets_by_lemma(lemma, pos) - - # Extract senses from synsets - for synset in synsets: - for word in synset.words: - if word.lemma == lemma: - # Try to find corresponding sense - for _key, sense in self.sense_index.items(): - if sense.lemma == lemma and sense.synset_offset == synset.offset: - senses.append(sense) - break + for sense in self.sense_index.values(): + if sense.lemma == lemma and (pos is None or sense.ss_type == pos): + senses.append(sense) # Sort by sense number (frequency order) senses.sort(key=lambda s: s.sense_number) @@ -611,7 +583,7 @@ def load_wordnet( Parameters ---------- data_path : Path | str - Path to directory containing WordNet JSON Lines files. + Path to the WordNet JSONL file (e.g., wordnet.jsonl). lazy : bool, default=False If True, load synsets on demand. cache_size : int, default=1000 @@ -624,7 +596,7 @@ def load_wordnet( Examples -------- - >>> wn = load_wordnet("data/wordnet") + >>> wn = load_wordnet("data/wordnet.jsonl") >>> dog = wn.get_synsets_by_lemma("dog", "n")[0] >>> print(dog.gloss) """ diff --git a/src/glazing/wordnet/models.py b/src/glazing/wordnet/models.py index bdc58c0..2336d3a 100644 --- a/src/glazing/wordnet/models.py +++ b/src/glazing/wordnet/models.py @@ -66,6 +66,10 @@ class Word(GlazingBaseModel): Word form (lowercase, underscores for spaces). lex_id : LexID Distinguishes same word in synset (0-15). + sense_number : int | None, default=None + Frequency-based sense ordering from index.sense. + tag_count : int, default=0 + Semantic concordance tag count. Examples -------- @@ -78,6 +82,8 @@ class Word(GlazingBaseModel): lemma: str = Field(description="Word form (lowercase, underscores for spaces)") lex_id: LexID = Field(description="Lexical ID distinguishing same word in synset") + sense_number: int | None = Field(default=None, description="Frequency-based sense ordering") + tag_count: int = Field(default=0, ge=0, description="Semantic concordance tag count") @field_validator("lemma") @classmethod @@ -177,6 +183,10 @@ class VerbFrame(GlazingBaseModel): Frame number (1-35). word_indices : list[int] Word indices (0 = all words, or specific indices). + template : str | None, default=None + Natural language frame template (e.g., "Something ----s"). + example_sentence : str | None, default=None + Example sentence with %s placeholder for verb. 
Examples -------- @@ -189,6 +199,10 @@ class VerbFrame(GlazingBaseModel): word_indices: list[int] = Field( default_factory=list, description="Word indices (0 = all words)" ) + template: str | None = Field(default=None, description="Natural language frame template") + example_sentence: str | None = Field( + default=None, description="Example sentence with %s placeholder" + ) @field_validator("word_indices") @classmethod @@ -517,6 +531,7 @@ class ExceptionEntry(GlazingBaseModel): inflected_form: str = Field(description="Inflected/irregular form") base_forms: list[str] = Field(description="Base/lemma forms") + pos: WordNetPOS | None = Field(default=None, description="Part of speech") @field_validator("inflected_form", "base_forms") @classmethod diff --git a/tests/test_wordnet/test_converter.py b/tests/test_wordnet/test_converter.py index a08b4a7..f4bc050 100644 --- a/tests/test_wordnet/test_converter.py +++ b/tests/test_wordnet/test_converter.py @@ -30,7 +30,7 @@ def sample_data_file_content(self): return """ Copyright notice and license text here More license text 00001740 29 v 01 breathe 0 005 $ 00001740 v 0000 @ 00002084 v 0000 ~ 00001740 v 0000 + 00002760 v 0000 ^ 00001740 v 0000 | take in and expel air through lungs -00002084 29 v 02 respire 0 breathe 1 003 $ 00001740 v 0000 @ 00002325 v 0000 ~ 00002760 v 0000 + 01 00 + 02 01 | undergo the biomedical and metabolic processes of respiration by taking up oxygen and producing carbon monoxide +00002084 29 v 02 respire 0 breathe 1 003 $ 00001740 v 0000 @ 00002325 v 0000 ~ 00002760 v 0000 02 + 01 00 + 02 01 | undergo the biomedical and metabolic processes of respiration by taking up oxygen and producing carbon monoxide """ @pytest.fixture diff --git a/tests/test_wordnet/test_loader.py b/tests/test_wordnet/test_loader.py index 0c92c77..63084c0 100644 --- a/tests/test_wordnet/test_loader.py +++ b/tests/test_wordnet/test_loader.py @@ -13,12 +13,12 @@ class TestWordNetLoader: """Test WordNet loader functionality.""" @pytest.fixture - def temp_data_dir(self): - """Create temporary directory with test data.""" + def temp_data_file(self): + """Create temporary directory with test data in single-file format.""" with tempfile.TemporaryDirectory() as tmpdir: data_path = Path(tmpdir) - # Create test synset data + # All synsets go into a single wordnet.jsonl file synsets_data = [ { "offset": "00001740", @@ -45,75 +45,27 @@ def temp_data_dir(self): ], "gloss": "an entity that has physical existence", }, + { + "offset": "00002325", + "lex_filenum": 29, + "lex_filename": "verb.body", + "ss_type": "v", + "words": [{"lemma": "run", "lex_id": 0}, {"lemma": "go", "lex_id": 1}], + "pointers": [], + "frames": [ + {"frame_number": 1, "word_indices": [0]}, + {"frame_number": 2, "word_indices": [0, 1]}, + ], + "gloss": "move fast by using one's feet", + }, ] - # Write noun synsets - with open(data_path / "data.noun.jsonl", "w") as f: + wordnet_file = data_path / "wordnet.jsonl" + with open(wordnet_file, "w") as f: for synset in synsets_data: f.write(json.dumps(synset) + "\n") - # Create test verb synset - verb_synset = { - "offset": "00002325", - "lex_filenum": 29, - "lex_filename": "verb.body", - "ss_type": "v", - "words": [{"lemma": "run", "lex_id": 0}, {"lemma": "go", "lex_id": 1}], - "pointers": [], - "frames": [ - {"frame_number": 1, "word_indices": [0]}, - {"frame_number": 2, "word_indices": [0, 1]}, - ], - "gloss": "move fast by using one's feet", - } - - with open(data_path / "data.verb.jsonl", "w") as f: - f.write(json.dumps(verb_synset) + "\n") - - # Create 
index entries - index_data = [ - { - "lemma": "entity", - "pos": "n", - "synset_cnt": 1, - "p_cnt": 1, - "ptr_symbols": ["~"], - "sense_cnt": 1, - "tagsense_cnt": 0, - "synset_offsets": ["00001740"], - }, - { - "lemma": "physical_entity", - "pos": "n", - "synset_cnt": 1, - "p_cnt": 1, - "ptr_symbols": ["@"], - "sense_cnt": 1, - "tagsense_cnt": 0, - "synset_offsets": ["00001930"], - }, - ] - - with open(data_path / "index.noun.jsonl", "w") as f: - for entry in index_data: - f.write(json.dumps(entry) + "\n") - - # Create verb index - verb_index = { - "lemma": "run", - "pos": "v", - "synset_cnt": 1, - "p_cnt": 0, - "ptr_symbols": [], - "sense_cnt": 1, - "tagsense_cnt": 0, - "synset_offsets": ["00002325"], - } - - with open(data_path / "index.verb.jsonl", "w") as f: - f.write(json.dumps(verb_index) + "\n") - - # Create sense index + # Create sense index (supplementary file alongside primary) sense_data = [ { "sense_key": "entity%1:03:00::", @@ -137,40 +89,36 @@ def temp_data_dir(self): }, ] - with open(data_path / "index.sense.jsonl", "w") as f: + with open(data_path / "wordnet_senses.jsonl", "w") as f: for sense in sense_data: f.write(json.dumps(sense) + "\n") - # Create exception entries + # Create exception entries (with pos field) exc_data = [ - {"inflected_form": "children", "base_forms": ["child"]}, - {"inflected_form": "geese", "base_forms": ["goose"]}, + {"inflected_form": "children", "base_forms": ["child"], "pos": "n"}, + {"inflected_form": "geese", "base_forms": ["goose"], "pos": "n"}, + {"inflected_form": "ran", "base_forms": ["run"], "pos": "v"}, ] - with open(data_path / "noun.exc.jsonl", "w") as f: + with open(data_path / "wordnet_exceptions.jsonl", "w") as f: for exc in exc_data: f.write(json.dumps(exc) + "\n") - verb_exc = {"inflected_form": "ran", "base_forms": ["run"]} - - with open(data_path / "verb.exc.jsonl", "w") as f: - f.write(json.dumps(verb_exc) + "\n") + yield wordnet_file - yield data_path - - def test_loader_initialization(self, temp_data_dir): + def test_loader_initialization(self, temp_data_file): """Test loader initialization without autoload.""" - loader = WordNetLoader(temp_data_dir, autoload=False) + loader = WordNetLoader(temp_data_file, autoload=False) - assert loader.data_path == temp_data_dir + assert loader.data_path == temp_data_file assert loader.lazy is False assert loader.cache_size == 1000 assert not loader._loaded assert len(loader.synsets) == 0 - def test_load_synsets(self, temp_data_dir): + def test_load_synsets(self, temp_data_file): """Test loading synsets from JSON Lines.""" - loader = WordNetLoader(temp_data_dir) + loader = WordNetLoader(temp_data_file) loader.load() # Check synsets loaded @@ -186,23 +134,32 @@ def test_load_synsets(self, temp_data_dir): assert entity.words[0].lemma == "entity" assert len(entity.pointers) == 1 - def test_load_index(self, temp_data_dir): - """Test loading index files.""" - loader = WordNetLoader(temp_data_dir) + def test_load_lemma_index(self, temp_data_file): + """Test building lemma index from synset data.""" + loader = WordNetLoader(temp_data_file) loader.load() - # Check lemma index + # Check lemma index built from synset words assert "entity" in loader.lemma_index assert "n" in loader.lemma_index["entity"] assert len(loader.lemma_index["entity"]["n"]) == 1 - entry = loader.lemma_index["entity"]["n"][0] - assert entry.lemma == "entity" - assert entry.synset_offsets == ["00001740"] + # lemma_index values are SynsetOffset strings now + offset = loader.lemma_index["entity"]["n"][0] + assert offset == 
"00001740" + + # Check verb lemmas + assert "run" in loader.lemma_index + assert "v" in loader.lemma_index["run"] + assert loader.lemma_index["run"]["v"][0] == "00002325" + + # "go" should also be indexed + assert "go" in loader.lemma_index + assert "v" in loader.lemma_index["go"] - def test_load_sense_index(self, temp_data_dir): + def test_load_sense_index(self, temp_data_file): """Test loading sense index.""" - loader = WordNetLoader(temp_data_dir) + loader = WordNetLoader(temp_data_file) loader.load() # Check sense index @@ -212,9 +169,9 @@ def test_load_sense_index(self, temp_data_dir): assert sense.synset_offset == "00001740" assert sense.sense_number == 1 - def test_load_exceptions(self, temp_data_dir): + def test_load_exceptions(self, temp_data_file): """Test loading exception files.""" - loader = WordNetLoader(temp_data_dir) + loader = WordNetLoader(temp_data_file) loader.load() # Check noun exceptions @@ -227,9 +184,9 @@ def test_load_exceptions(self, temp_data_dir): assert "ran" in loader.exceptions["v"] assert loader.exceptions["v"]["ran"] == ["run"] - def test_build_relation_indices(self, temp_data_dir): + def test_build_relation_indices(self, temp_data_file): """Test building relation indices.""" - loader = WordNetLoader(temp_data_dir) + loader = WordNetLoader(temp_data_file) loader.load() # Check hypernym index @@ -240,9 +197,9 @@ def test_build_relation_indices(self, temp_data_dir): assert "00001740" in loader.hyponym_index assert "00001930" in loader.hyponym_index["00001740"] - def test_get_synset(self, temp_data_dir): + def test_get_synset(self, temp_data_file): """Test getting synset by offset.""" - loader = WordNetLoader(temp_data_dir) + loader = WordNetLoader(temp_data_file) loader.load() synset = loader.get_synset("00001740") @@ -254,9 +211,9 @@ def test_get_synset(self, temp_data_dir): synset = loader.get_synset("99999999") assert synset is None - def test_get_synsets_by_lemma(self, temp_data_dir): + def test_get_synsets_by_lemma(self, temp_data_file): """Test getting synsets by lemma.""" - loader = WordNetLoader(temp_data_dir) + loader = WordNetLoader(temp_data_file) loader.load() # Test noun @@ -277,9 +234,9 @@ def test_get_synsets_by_lemma(self, temp_data_dir): synsets = loader.get_synsets_by_lemma("nonexistent") assert len(synsets) == 0 - def test_get_sense_by_key(self, temp_data_dir): + def test_get_sense_by_key(self, temp_data_file): """Test getting sense by key.""" - loader = WordNetLoader(temp_data_dir) + loader = WordNetLoader(temp_data_file) loader.load() sense = loader.get_sense_by_key("entity%1:03:00::") @@ -291,9 +248,9 @@ def test_get_sense_by_key(self, temp_data_dir): sense = loader.get_sense_by_key("nonexistent%1:00:00::") assert sense is None - def test_get_senses_by_lemma(self, temp_data_dir): + def test_get_senses_by_lemma(self, temp_data_file): """Test getting senses by lemma.""" - loader = WordNetLoader(temp_data_dir) + loader = WordNetLoader(temp_data_file) loader.load() senses = loader.get_senses_by_lemma("entity", "n") @@ -304,9 +261,9 @@ def test_get_senses_by_lemma(self, temp_data_dir): assert len(senses) == 1 assert senses[0].sense_key == "run%2:38:00::" - def test_get_hypernyms(self, temp_data_dir): + def test_get_hypernyms(self, temp_data_file): """Test getting hypernyms.""" - loader = WordNetLoader(temp_data_dir) + loader = WordNetLoader(temp_data_file) loader.load() synset = loader.get_synset("00001930") @@ -314,9 +271,9 @@ def test_get_hypernyms(self, temp_data_dir): assert len(hypernyms) == 1 assert hypernyms[0].offset == "00001740" - 
def test_get_hyponyms(self, temp_data_dir): + def test_get_hyponyms(self, temp_data_file): """Test getting hyponyms.""" - loader = WordNetLoader(temp_data_dir) + loader = WordNetLoader(temp_data_file) loader.load() synset = loader.get_synset("00001740") @@ -324,9 +281,9 @@ def test_get_hyponyms(self, temp_data_dir): assert len(hyponyms) == 1 assert hyponyms[0].offset == "00001930" - def test_lazy_loading(self, temp_data_dir): + def test_lazy_loading(self, temp_data_file): """Test lazy loading mode.""" - loader = WordNetLoader(temp_data_dir, lazy=True, cache_size=2) + loader = WordNetLoader(temp_data_file, lazy=True, cache_size=2) loader.load() # Synsets should not be loaded yet @@ -346,9 +303,9 @@ def test_lazy_loading(self, temp_data_dir): assert cached is not None assert cached.offset == "00001740" - def test_get_exceptions(self, temp_data_dir): + def test_get_exceptions(self, temp_data_file): """Test getting morphological exceptions.""" - loader = WordNetLoader(temp_data_dir) + loader = WordNetLoader(temp_data_file) loader.load() noun_exc = loader.get_exceptions("n") @@ -363,9 +320,9 @@ def test_get_exceptions(self, temp_data_dir): adv_exc = loader.get_exceptions("r") assert len(adv_exc) == 0 - def test_load_wordnet_function(self, temp_data_dir): + def test_load_wordnet_function(self, temp_data_file): """Test the convenience load_wordnet function.""" - wn = load_wordnet(temp_data_dir) + wn = load_wordnet(temp_data_file) assert isinstance(wn, WordNetLoader) assert wn._loaded is True diff --git a/tests/test_wordnet/test_morphy.py b/tests/test_wordnet/test_morphy.py index 97efaa8..f9bfa33 100644 --- a/tests/test_wordnet/test_morphy.py +++ b/tests/test_wordnet/test_morphy.py @@ -10,6 +10,26 @@ from glazing.wordnet.morphy import Morphy, morphy +def _write_wordnet_files( + data_path: Path, synsets: list[dict], exceptions: list[dict] | None = None +) -> Path: + """Helper to write synsets and exceptions in the single-file format. + + Returns the path to the primary wordnet.jsonl file. 
+ """ + wordnet_file = data_path / "wordnet.jsonl" + with open(wordnet_file, "w") as f: + for synset in synsets: + f.write(json.dumps(synset) + "\n") + + if exceptions: + with open(data_path / "wordnet_exceptions.jsonl", "w") as f: + for exc in exceptions: + f.write(json.dumps(exc) + "\n") + + return wordnet_file + + class TestMorphy: """Test WordNet morphological processing.""" @@ -19,8 +39,9 @@ def temp_data_with_lemmas(self): with tempfile.TemporaryDirectory() as tmpdir: data_path = Path(tmpdir) - # Create noun synsets with various lemmas - noun_synsets = [ + # All synsets in a single file + all_synsets = [ + # Noun synsets { "offset": "02084442", "lex_filenum": 5, @@ -60,14 +81,7 @@ def temp_data_with_lemmas(self): "pointers": [], "gloss": "two-winged insects", }, - ] - - with open(data_path / "data.noun.jsonl", "w") as f: - for synset in noun_synsets: - f.write(json.dumps(synset) + "\n") - - # Create verb synsets - verb_synsets = [ + # Verb synsets { "offset": "01926311", "lex_filenum": 38, @@ -108,14 +122,7 @@ def temp_data_with_lemmas(self): "frames": [], "gloss": "look attentively", }, - ] - - with open(data_path / "data.verb.jsonl", "w") as f: - for synset in verb_synsets: - f.write(json.dumps(synset) + "\n") - - # Create adjective synsets - adj_synsets = [ + # Adjective synsets { "offset": "00001740", "lex_filenum": 0, @@ -145,201 +152,33 @@ def temp_data_with_lemmas(self): }, ] - with open(data_path / "data.adj.jsonl", "w") as f: - for synset in adj_synsets: - f.write(json.dumps(synset) + "\n") - - # Create noun index - noun_index = [ - { - "lemma": "dog", - "pos": "n", - "synset_cnt": 1, - "p_cnt": 0, - "ptr_symbols": [], - "sense_cnt": 1, - "tagsense_cnt": 0, - "synset_offsets": ["02084442"], - }, - { - "lemma": "child", - "pos": "n", - "synset_cnt": 1, - "p_cnt": 0, - "ptr_symbols": [], - "sense_cnt": 1, - "tagsense_cnt": 0, - "synset_offsets": ["09917593"], - }, - { - "lemma": "box", - "pos": "n", - "synset_cnt": 1, - "p_cnt": 0, - "ptr_symbols": [], - "sense_cnt": 1, - "tagsense_cnt": 0, - "synset_offsets": ["02866578"], - }, - { - "lemma": "fly", - "pos": "n", - "synset_cnt": 1, - "p_cnt": 0, - "ptr_symbols": [], - "sense_cnt": 1, - "tagsense_cnt": 0, - "synset_offsets": ["01930374"], - }, - ] - - with open(data_path / "index.noun.jsonl", "w") as f: - for entry in noun_index: - f.write(json.dumps(entry) + "\n") - - # Create verb index - verb_index = [ - { - "lemma": "run", - "pos": "v", - "synset_cnt": 1, - "p_cnt": 0, - "ptr_symbols": [], - "sense_cnt": 1, - "tagsense_cnt": 0, - "synset_offsets": ["01926311"], - }, - { - "lemma": "fly", - "pos": "v", - "synset_cnt": 1, - "p_cnt": 0, - "ptr_symbols": [], - "sense_cnt": 1, - "tagsense_cnt": 0, - "synset_offsets": ["01835496"], - }, - { - "lemma": "be", - "pos": "v", - "synset_cnt": 1, - "p_cnt": 0, - "ptr_symbols": [], - "sense_cnt": 1, - "tagsense_cnt": 0, - "synset_offsets": ["00010435"], - }, - { - "lemma": "watch", - "pos": "v", - "synset_cnt": 1, - "p_cnt": 0, - "ptr_symbols": [], - "sense_cnt": 1, - "tagsense_cnt": 0, - "synset_offsets": ["00654625"], - }, - ] - - with open(data_path / "index.verb.jsonl", "w") as f: - for entry in verb_index: - f.write(json.dumps(entry) + "\n") - - # Create adjective index - adj_index = [ - { - "lemma": "big", - "pos": "a", - "synset_cnt": 1, - "p_cnt": 0, - "ptr_symbols": [], - "sense_cnt": 1, - "tagsense_cnt": 0, - "synset_offsets": ["00001740"], - }, - { - "lemma": "nice", - "pos": "a", - "synset_cnt": 1, - "p_cnt": 0, - "ptr_symbols": [], - "sense_cnt": 1, - "tagsense_cnt": 0, - 
"synset_offsets": ["00001741"], - }, - { - "lemma": "good", - "pos": "a", - "synset_cnt": 1, - "p_cnt": 0, - "ptr_symbols": [], - "sense_cnt": 1, - "tagsense_cnt": 0, - "synset_offsets": ["00001742"], - }, - { - "lemma": "well", - "pos": "a", - "synset_cnt": 1, - "p_cnt": 0, - "ptr_symbols": [], - "sense_cnt": 1, - "tagsense_cnt": 0, - "synset_offsets": ["00001742"], - }, - ] - - with open(data_path / "index.adj.jsonl", "w") as f: - for entry in adj_index: - f.write(json.dumps(entry) + "\n") - - # Create noun exceptions - noun_exc = [ - {"inflected_form": "children", "base_forms": ["child"]}, - {"inflected_form": "geese", "base_forms": ["goose"]}, - {"inflected_form": "men", "base_forms": ["man"]}, - {"inflected_form": "women", "base_forms": ["woman"]}, - {"inflected_form": "teeth", "base_forms": ["tooth"]}, - {"inflected_form": "feet", "base_forms": ["foot"]}, - {"inflected_form": "mice", "base_forms": ["mouse"]}, - ] - - with open(data_path / "noun.exc.jsonl", "w") as f: - for exc in noun_exc: - f.write(json.dumps(exc) + "\n") - - # Create verb exceptions - verb_exc = [ - {"inflected_form": "ran", "base_forms": ["run"]}, - {"inflected_form": "went", "base_forms": ["go"]}, - {"inflected_form": "was", "base_forms": ["be"]}, - {"inflected_form": "were", "base_forms": ["be"]}, - {"inflected_form": "been", "base_forms": ["be"]}, - {"inflected_form": "flew", "base_forms": ["fly"]}, - {"inflected_form": "flown", "base_forms": ["fly"]}, + # All exceptions in a single file with pos field + all_exceptions = [ + # Noun exceptions + {"inflected_form": "children", "base_forms": ["child"], "pos": "n"}, + {"inflected_form": "geese", "base_forms": ["goose"], "pos": "n"}, + {"inflected_form": "men", "base_forms": ["man"], "pos": "n"}, + {"inflected_form": "women", "base_forms": ["woman"], "pos": "n"}, + {"inflected_form": "teeth", "base_forms": ["tooth"], "pos": "n"}, + {"inflected_form": "feet", "base_forms": ["foot"], "pos": "n"}, + {"inflected_form": "mice", "base_forms": ["mouse"], "pos": "n"}, + # Verb exceptions + {"inflected_form": "ran", "base_forms": ["run"], "pos": "v"}, + {"inflected_form": "went", "base_forms": ["go"], "pos": "v"}, + {"inflected_form": "was", "base_forms": ["be"], "pos": "v"}, + {"inflected_form": "were", "base_forms": ["be"], "pos": "v"}, + {"inflected_form": "been", "base_forms": ["be"], "pos": "v"}, + {"inflected_form": "flew", "base_forms": ["fly"], "pos": "v"}, + {"inflected_form": "flown", "base_forms": ["fly"], "pos": "v"}, + # Adjective exceptions + {"inflected_form": "better", "base_forms": ["good", "well"], "pos": "a"}, + {"inflected_form": "best", "base_forms": ["good", "well"], "pos": "a"}, + {"inflected_form": "worse", "base_forms": ["bad"], "pos": "a"}, + {"inflected_form": "worst", "base_forms": ["bad"], "pos": "a"}, ] - with open(data_path / "verb.exc.jsonl", "w") as f: - for exc in verb_exc: - f.write(json.dumps(exc) + "\n") - - # Create adjective exceptions - adj_exc = [ - {"inflected_form": "better", "base_forms": ["good", "well"]}, - {"inflected_form": "best", "base_forms": ["good", "well"]}, - {"inflected_form": "worse", "base_forms": ["bad"]}, - {"inflected_form": "worst", "base_forms": ["bad"]}, - ] - - with open(data_path / "adj.exc.jsonl", "w") as f: - for exc in adj_exc: - f.write(json.dumps(exc) + "\n") - - # Create empty sense index (required but not used in tests) - with open(data_path / "index.sense.jsonl", "w") as f: - pass - - yield data_path + wordnet_file = _write_wordnet_files(data_path, all_synsets, all_exceptions) + yield wordnet_file 
@pytest.fixture def loader_with_data(self, temp_data_with_lemmas): @@ -581,12 +420,10 @@ def test_period_removal(self, loader_with_data): def test_ful_suffix_handling(self): """Test special handling of nouns ending with 'ful'.""" - # Create test data with "box" and "boxful" with tempfile.TemporaryDirectory() as tmpdir: data_path = Path(tmpdir) - # Create noun synsets - noun_synsets = [ + synsets = [ { "offset": "02883344", "lex_filenum": 6, @@ -607,44 +444,8 @@ def test_ful_suffix_handling(self): }, ] - with open(data_path / "data.noun.jsonl", "w") as f: - for synset in noun_synsets: - f.write(json.dumps(synset) + "\n") - - # Create index - noun_index = [ - { - "lemma": "box", - "pos": "n", - "synset_cnt": 1, - "p_cnt": 0, - "ptr_symbols": [], - "sense_cnt": 1, - "tagsense_cnt": 0, - "synset_offsets": ["02883344"], - }, - { - "lemma": "boxful", - "pos": "n", - "synset_cnt": 1, - "p_cnt": 0, - "ptr_symbols": [], - "sense_cnt": 1, - "tagsense_cnt": 0, - "synset_offsets": ["13767879"], - }, - ] - - with open(data_path / "index.noun.jsonl", "w") as f: - for entry in noun_index: - f.write(json.dumps(entry) + "\n") - - # Create empty files - with open(data_path / "index.sense.jsonl", "w") as f: - pass - - # Load and test - loader = WordNetLoader(data_path) + wordnet_file = _write_wordnet_files(data_path, synsets) + loader = WordNetLoader(wordnet_file) loader.load() processor = Morphy(loader) @@ -654,12 +455,10 @@ def test_ful_suffix_handling(self): def test_collocation_simple(self): """Test simple multi-word expressions.""" - # Create test data with "attorney_general" with tempfile.TemporaryDirectory() as tmpdir: data_path = Path(tmpdir) - # Create noun synset with multi-word expression - noun_synsets = [ + synsets = [ { "offset": "09780632", "lex_filenum": 15, @@ -689,54 +488,8 @@ def test_collocation_simple(self): }, ] - with open(data_path / "data.noun.jsonl", "w") as f: - for synset in noun_synsets: - f.write(json.dumps(synset) + "\n") - - # Create index - noun_index = [ - { - "lemma": "attorney", - "pos": "n", - "synset_cnt": 1, - "p_cnt": 0, - "ptr_symbols": [], - "sense_cnt": 1, - "tagsense_cnt": 0, - "synset_offsets": ["09780632"], - }, - { - "lemma": "general", - "pos": "n", - "synset_cnt": 1, - "p_cnt": 0, - "ptr_symbols": [], - "sense_cnt": 1, - "tagsense_cnt": 0, - "synset_offsets": ["10260706"], - }, - { - "lemma": "attorney_general", - "pos": "n", - "synset_cnt": 1, - "p_cnt": 0, - "ptr_symbols": [], - "sense_cnt": 1, - "tagsense_cnt": 0, - "synset_offsets": ["09781263"], - }, - ] - - with open(data_path / "index.noun.jsonl", "w") as f: - for entry in noun_index: - f.write(json.dumps(entry) + "\n") - - # Create empty files - with open(data_path / "index.sense.jsonl", "w") as f: - pass - - # Load and test - loader = WordNetLoader(data_path) + wordnet_file = _write_wordnet_files(data_path, synsets) + loader = WordNetLoader(wordnet_file) loader.load() processor = Morphy(loader) @@ -746,12 +499,10 @@ def test_collocation_simple(self): def test_hyphenated_words(self): """Test hyphenated multi-word expressions.""" - # Create test data with tempfile.TemporaryDirectory() as tmpdir: data_path = Path(tmpdir) - # Create noun synsets - noun_synsets = [ + synsets = [ { "offset": "10639637", "lex_filenum": 15, @@ -772,44 +523,8 @@ def test_hyphenated_words(self): }, ] - with open(data_path / "data.noun.jsonl", "w") as f: - for synset in noun_synsets: - f.write(json.dumps(synset) + "\n") - - # Create index - noun_index = [ - { - "lemma": "son", - "pos": "n", - "synset_cnt": 1, - "p_cnt": 0, - 
"ptr_symbols": [], - "sense_cnt": 1, - "tagsense_cnt": 0, - "synset_offsets": ["10639637"], - }, - { - "lemma": "son_in_law", - "pos": "n", - "synset_cnt": 1, - "p_cnt": 0, - "ptr_symbols": [], - "sense_cnt": 1, - "tagsense_cnt": 0, - "synset_offsets": ["10105733"], - }, - ] - - with open(data_path / "index.noun.jsonl", "w") as f: - for entry in noun_index: - f.write(json.dumps(entry) + "\n") - - # Create empty files - with open(data_path / "index.sense.jsonl", "w") as f: - pass - - # Load and test - loader = WordNetLoader(data_path) + wordnet_file = _write_wordnet_files(data_path, synsets) + loader = WordNetLoader(wordnet_file) loader.load() processor = Morphy(loader) From b5cc8553174e76be29f5803fb84a784b6efc9f96 Mon Sep 17 00:00:00 2001 From: Aaron Steven White Date: Thu, 5 Feb 2026 19:34:45 -0500 Subject: [PATCH 02/11] Adds frame relation, LU enrichment, semantic type, and fulltext parsing to FrameNet converter and loader. --- pyproject.toml | 3 +- src/glazing/framenet/converter.py | 655 ++++++++++++++++++++++++++++++ src/glazing/framenet/loader.py | 45 +- 3 files changed, 699 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 4b07a5b..ab89ebd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -125,6 +125,7 @@ ignore = [ "tests/test_verbnet/test_converter.py" = ["E501", "S314"] "tests/test_verbnet/test_inheritance.py" = ["E501"] "tests/test_wordnet/test_converter.py" = ["E501"] +"tests/test_integration/test_converter_loader_roundtrip.py" = ["E501"] "tests/test_cli/test_download.py" = ["E501", "EM102"] "tests/test_downloader.py" = ["E501", "F841"] "tests/test_verbnet/test_downloader.py" = ["E501", "F841"] @@ -136,7 +137,7 @@ ignore = [ "src/glazing/wordnet/converter.py" = ["C901", "PLR0912", "PLR0915"] "src/glazing/wordnet/loader.py" = ["C901", "PLR0912"] "src/glazing/verbnet/converter.py" = ["C901", "S320"] -"src/glazing/framenet/converter.py" = ["S320"] +"src/glazing/framenet/converter.py" = ["S320", "C901", "PLR0912", "PLR0915"] "src/glazing/propbank/converter.py" = ["S320"] [tool.ruff.lint.pydocstyle] diff --git a/src/glazing/framenet/converter.py b/src/glazing/framenet/converter.py index 60081cd..a3054d5 100644 --- a/src/glazing/framenet/converter.py +++ b/src/glazing/framenet/converter.py @@ -38,22 +38,54 @@ import html from datetime import UTC, datetime from pathlib import Path +from typing import get_args from lxml import etree from glazing.framenet.models import ( AnnotatedText, + AnnotationLayer, + AnnotationSet, + FERealization, + FERelation, Frame, FrameElement, + FrameRelation, + Label, Lexeme, LexicalUnit, + SemanticType, + SemTypeRef, + Sentence, SentenceCount, + ValenceAnnotationPattern, + ValencePattern, + ValenceRealizationPattern, + ValenceUnit, +) +from glazing.framenet.types import ( + AnnotationStatus, + LayerType, ) from glazing.utils.xml_parser import ( parse_attributes, parse_with_schema, ) +# Map from frRelation.xml relation type names to (sub_type, super_type) pairs. +# sub_type is the relation from the sub-frame's perspective; +# super_type is the relation from the super-frame's perspective (None if one-directional). 
+FRAME_RELATION_TYPE_MAP: dict[str, tuple[str, str | None]] = { + "Inheritance": ("Inherits from", "Is Inherited by"), + "Using": ("Uses", "Is Used by"), + "Subframe": ("Subframe of", "Has Subframe(s)"), + "Precedes": ("Precedes", "Is Preceded by"), + "Perspective_on": ("Perspective on", "Is Perspectivized in"), + "Causative_of": ("Is Causative of", None), + "Inchoative_of": ("Is Inchoative of", None), + "See_also": ("See also", "See also"), +} + class FrameNetConverter: """Convert FrameNet XML files to JSON Lines format. @@ -80,6 +112,14 @@ class FrameNetConverter: Convert a lexical unit XML file to LexicalUnit model. convert_frames_directory(input_dir, output_file) Convert all frames in a directory to JSON Lines. + convert_frame_relations_file(filepath) + Convert frRelation.xml to frame relation mappings. + convert_semtypes_file(filepath, output_file) + Convert semTypes.xml to JSON Lines. + convert_fulltext_file(filepath) + Convert a fulltext XML file to Sentence models. + convert_fulltext_directory(input_dir, output_file) + Convert all fulltext files in a directory to JSON Lines. """ def __init__( @@ -99,6 +139,22 @@ def __init__( self.namespace = namespace self.ns = {"fn": namespace} if namespace else {} self.validate_schema = validate_schema + self._ns_prefix = f"{{{namespace}}}" if namespace else "" + + def _tag(self, local_name: str) -> str: + """Build a namespace-qualified tag name. + + Parameters + ---------- + local_name : str + The local element name. + + Returns + ------- + str + Namespace-qualified tag name. + """ + return f"{self._ns_prefix}{local_name}" def _parse_definition(self, element: etree._Element | None) -> AnnotatedText: """Parse a definition element with embedded markup. @@ -499,6 +555,574 @@ def convert_lu_index_file(self, filepath: Path | str) -> list[LexicalUnit]: return lexical_units + def convert_frame_relations_file(self, filepath: Path | str) -> dict[int, list[FrameRelation]]: + """Convert frRelation.xml to frame relation mappings. + + Parses the frame relation types and individual frame relations, + creating FrameRelation objects grouped by frame ID. + + Parameters + ---------- + filepath : Path | str + Path to frRelation.xml file. + + Returns + ------- + dict[int, list[FrameRelation]] + Dictionary mapping frame IDs to their FrameRelation objects. 
+ + Examples + -------- + >>> converter = FrameNetConverter() + >>> relations = converter.convert_frame_relations_file("frRelation.xml") + >>> print(f"Found relations for {len(relations)} frames") + """ + filepath = Path(filepath) + + tree = etree.parse(str(filepath)) + root = tree.getroot() + + relations_by_frame: dict[int, list[FrameRelation]] = {} + + for rel_type_elem in root.findall(self._tag("frameRelationType")): + type_name = rel_type_elem.get("name", "") + + if type_name not in FRAME_RELATION_TYPE_MAP: + continue + + sub_type, super_type = FRAME_RELATION_TYPE_MAP[type_name] + + for fr_elem in rel_type_elem.findall(self._tag("frameRelation")): + sub_frame_id = int(fr_elem.get("subID", "0")) + sup_frame_id = int(fr_elem.get("supID", "0")) + sub_frame_name = fr_elem.get("subFrameName", "") + super_frame_name = fr_elem.get("superFrameName", "") + relation_id = int(fr_elem.get("ID", "0")) + + # Parse FE relations + fe_relations: list[FERelation] = [] + for fe_rel_elem in fr_elem.findall(self._tag("FERelation")): + try: + fe_rel = FERelation( # type: ignore[call-arg] + sub_fe_id=int(fe_rel_elem.get("subID", "0")), + sub_fe_name=fe_rel_elem.get("subFEName"), + super_fe_id=int(fe_rel_elem.get("supID", "0")), + super_fe_name=fe_rel_elem.get("superFEName"), + ) + fe_relations.append(fe_rel) + except (ValueError, TypeError): + continue + + # Create FrameRelation for the sub-frame's perspective + try: + sub_relation = FrameRelation( + id=relation_id, + type=sub_type, # type: ignore[arg-type] + sub_frame_id=sub_frame_id, + sub_frame_name=sub_frame_name, + super_frame_id=sup_frame_id, + super_frame_name=super_frame_name, + fe_relations=fe_relations, + ) + relations_by_frame.setdefault(sub_frame_id, []).append(sub_relation) + except (ValueError, TypeError): + pass + + # Create FrameRelation for the super-frame's perspective (if applicable) + if super_type is not None: + try: + super_relation = FrameRelation( + id=relation_id, + type=super_type, # type: ignore[arg-type] + sub_frame_id=sub_frame_id, + sub_frame_name=sub_frame_name, + super_frame_id=sup_frame_id, + super_frame_name=super_frame_name, + fe_relations=fe_relations, + ) + relations_by_frame.setdefault(sup_frame_id, []).append(super_relation) + except (ValueError, TypeError): + pass + + return relations_by_frame + + def convert_lu_file( + self, filepath: Path | str + ) -> tuple[list[ValencePattern], list[SemTypeRef], list[AnnotationSet]]: + """Convert an individual lu/*.xml file to extract valence patterns and semtypes. + + Parses valence patterns (FE realizations and their syntactic patterns), + semantic type references, and annotation sets from a lexical unit file. + + Parameters + ---------- + filepath : Path | str + Path to individual lu XML file (e.g., lu/lu10.xml). + + Returns + ------- + tuple[list[ValencePattern], list[SemTypeRef], list[AnnotationSet]] + Tuple of (valence_patterns, semtypes, annotation_sets). 
+ + Examples + -------- + >>> converter = FrameNetConverter() + >>> patterns, semtypes, annosets = converter.convert_lu_file("lu/lu10.xml") + >>> print(f"Found {len(patterns)} valence patterns") + """ + filepath = Path(filepath) + + tree = etree.parse(str(filepath)) + root = tree.getroot() + + # Parse semantic types (direct children of root) + semtypes: list[SemTypeRef] = [] + for semtype_elem in root.findall(self._tag("semType")): + st_name = semtype_elem.get("name") + st_id = semtype_elem.get("ID") + if st_name and st_id: + try: + semtypes.append(SemTypeRef(name=st_name, id=int(st_id))) + except (ValueError, TypeError): + continue + + # Parse valence patterns from element + valence_patterns: list[ValencePattern] = [] + valences_elem = root.find(self._tag("valences")) + if valences_elem is not None: + # Parse FE realizations + fe_realizations: list[FERealization] = [] + for fe_real_elem in valences_elem.findall(self._tag("FERealization")): + fe_real_total = int(fe_real_elem.get("total", "0")) + + # Get FE name from child element + fe_child = fe_real_elem.find(self._tag("FE")) + fe_name = fe_child.get("name", "") if fe_child is not None else "" + + if not fe_name: + continue + + # Parse patterns within this FE realization + patterns: list[ValenceRealizationPattern] = [] + for pattern_elem in fe_real_elem.findall(self._tag("pattern")): + pattern_total = int(pattern_elem.get("total", "0")) + + # Parse valence units + valence_units: list[ValenceUnit] = [] + for vu_elem in pattern_elem.findall(self._tag("valenceUnit")): + try: + vu = ValenceUnit( + GF=vu_elem.get("GF", ""), + PT=vu_elem.get("PT", ""), + FE=vu_elem.get("FE", ""), + ) + valence_units.append(vu) + except (ValueError, TypeError): + continue + + # Parse annotation set IDs + anno_set_ids: list[int] = [] + for anno_elem in pattern_elem.findall(self._tag("annoSet")): + anno_id = anno_elem.get("ID") + if anno_id: + anno_set_ids.append(int(anno_id)) + + if valence_units and pattern_total > 0: + try: + patterns.append( + ValenceRealizationPattern( + valence_units=valence_units, + anno_set_ids=anno_set_ids, + total=pattern_total, + ) + ) + except (ValueError, TypeError): + continue + + try: + fe_realizations.append( + FERealization( + fe_name=fe_name, + total=fe_real_total, + patterns=patterns, + ) + ) + except (ValueError, TypeError): + continue + + # Build a single ValencePattern if we have FE realizations + if fe_realizations: + # Compute total annotated from the root or LU attributes + total_annotated = int(root.get("totalAnnotated", "0")) + + # Parse FEGroupRealization / ValenceAnnotationPattern entries + valence_anno_patterns: list[ValenceAnnotationPattern] = [] + # These come from elements in the valences section + # (not all LU files have these) + + valence_patterns.append( + ValencePattern( + total_annotated=total_annotated, + fe_realizations=fe_realizations, + patterns=valence_anno_patterns, + ) + ) + + # Parse annotation sets (from sections) + annotation_sets: list[AnnotationSet] = [] + # Annotation sets in lu files are nested inside subCorpus > sentence > annotationSet + # We collect them but don't return full sentences here + for subcorpus_elem in root.findall(self._tag("subCorpus")): + for sentence_elem in subcorpus_elem.findall(self._tag("sentence")): + sent_id = int(sentence_elem.get("ID", "0")) + for annoset_elem in sentence_elem.findall(self._tag("annotationSet")): + try: + annoset = self._parse_annotation_set(annoset_elem, sent_id) + if annoset is not None: + annotation_sets.append(annoset) + except (ValueError, 
TypeError): + continue + + return valence_patterns, semtypes, annotation_sets + + def convert_semtypes_file(self, filepath: Path | str, output_file: Path | str) -> int: + """Convert semTypes.xml to JSON Lines format. + + Parses the semantic type hierarchy and writes each type as a JSON line. + + Parameters + ---------- + filepath : Path | str + Path to semTypes.xml file. + output_file : Path | str + Output JSON Lines file path. + + Returns + ------- + int + Number of semantic types converted. + + Examples + -------- + >>> converter = FrameNetConverter() + >>> count = converter.convert_semtypes_file("semTypes.xml", "semtypes.jsonl") + >>> print(f"Converted {count} semantic types") + """ + filepath = Path(filepath) + output_file = Path(output_file) + + tree = etree.parse(str(filepath)) + root = tree.getroot() + + # semTypes.xml uses the FrameNet namespace + semtype_tag = self._tag("semType") + definition_tag = self._tag("definition") + supertype_tag = self._tag("superType") + + semantic_types: list[SemanticType] = [] + + for st_elem in root.findall(semtype_tag): + st_id = st_elem.get("ID") + st_name = st_elem.get("name", "") + st_abbrev = st_elem.get("abbrev", "") + + if not st_id or not st_name: + continue + + # Parse definition + def_elem = st_elem.find(definition_tag) + definition_text = "" + if def_elem is not None and def_elem.text: + definition_text = def_elem.text.strip() + if not definition_text: + definition_text = f"Semantic type: {st_name}" + + # Parse super type + super_type_id = None + super_type_name = None + sup_elem = st_elem.find(supertype_tag) + if sup_elem is not None: + sup_id = sup_elem.get("supID") + sup_name = sup_elem.get("superTypeName") + if sup_id: + super_type_id = int(sup_id) + super_type_name = sup_name + + try: + sem_type = SemanticType( + id=int(st_id), + name=st_name, + abbrev=st_abbrev if st_abbrev else st_name, + definition=definition_text, + super_type_id=super_type_id, + super_type_name=super_type_name, + root_type_id=None, + root_type_name=None, + ) + semantic_types.append(sem_type) + except (ValueError, TypeError) as e: + print(f"Warning: Failed to parse semantic type '{st_name}': {e}") + continue + + # Write to output file + count = 0 + with output_file.open("w", encoding="utf-8") as f: + for sem_type in semantic_types: + json_line = sem_type.model_dump_json(exclude_none=True) + f.write(json_line + "\n") + count += 1 + + return count + + def _parse_annotation_set( + self, annoset_elem: etree._Element, sentence_id: int + ) -> AnnotationSet | None: + """Parse an annotation set element. + + Parameters + ---------- + annoset_elem : etree._Element + The annotationSet XML element. + sentence_id : int + ID of the containing sentence. + + Returns + ------- + AnnotationSet | None + Parsed annotation set, or None if invalid. 
+ """ + anno_id = annoset_elem.get("ID") + status = annoset_elem.get("status", "") + + if not anno_id: + return None + + # Validate status against allowed values + valid_statuses = get_args(AnnotationStatus.__value__) + if status not in valid_statuses: + return None + + # Parse created_by and created_date + cby = annoset_elem.get("cBy") + cdate_str = annoset_elem.get("cDate") + cdate = self._parse_datetime(cdate_str) if cdate_str else None + + # Parse layers + layers: list[AnnotationLayer] = [] + valid_layer_types = get_args(LayerType.__value__) + + for layer_elem in annoset_elem.findall(self._tag("layer")): + layer_name = layer_elem.get("name", "") + layer_rank = int(layer_elem.get("rank", "1")) + + if layer_name not in valid_layer_types: + continue + + # Parse labels + labels: list[Label] = [] + for label_elem in layer_elem.findall(self._tag("label")): + label_name = label_elem.get("name", "") + if not label_name: + continue + + start_str = label_elem.get("start") + end_str = label_elem.get("end") + itype = label_elem.get("itype") + label_id_str = label_elem.get("ID") + fe_id_str = label_elem.get("feID") + + # Handle null instantiation labels (no start/end attributes) + if itype and (start_str is None or end_str is None): + # Null instantiation: set start=0, end=0 + start_val = 0 + end_val = 0 + is_null = True + elif start_str is not None and end_str is not None: + start_val = int(start_str) + end_val = int(end_str) + is_null = bool(itype) + else: + # Labels without start/end and without itype - skip + continue + + # Validate positions + if start_val < 0 or end_val < start_val: + if is_null: + start_val = 0 + end_val = 0 + else: + continue + + try: + label = Label( + id=int(label_id_str) if label_id_str else None, + name=label_name, + start=start_val, + end=end_val, + fe_id=int(fe_id_str) if fe_id_str else None, + itype=is_null, + ) + labels.append(label) + except (ValueError, TypeError): + continue + + try: + layers.append( + AnnotationLayer( + name=layer_name, # type: ignore[arg-type] + rank=layer_rank, + labels=labels, + ) + ) + except (ValueError, TypeError): + continue + + try: + return AnnotationSet( + id=int(anno_id), + status=status, # type: ignore[arg-type] + sentence_id=sentence_id, + layers=layers, + cBy=cby, + cDate=cdate, + ) + except (ValueError, TypeError): + return None + + def convert_fulltext_file(self, filepath: Path | str) -> list[Sentence]: + """Convert a fulltext/*.xml file to Sentence models. + + Parses annotated corpus sentences with their annotation sets, + layers, and labels. + + Parameters + ---------- + filepath : Path | str + Path to fulltext XML file. + + Returns + ------- + list[Sentence] + List of parsed Sentence models. 
+ + Examples + -------- + >>> converter = FrameNetConverter() + >>> sentences = converter.convert_fulltext_file("fulltext/ANC__110CYL067.xml") + >>> print(f"Found {len(sentences)} sentences") + """ + filepath = Path(filepath) + + tree = etree.parse(str(filepath)) + root = tree.getroot() + + sentences: list[Sentence] = [] + + for sent_elem in root.findall(self._tag("sentence")): + sent_id_str = sent_elem.get("ID") + if not sent_id_str: + continue + sent_id = int(sent_id_str) + + # Get sentence text + text_elem = sent_elem.find(self._tag("text")) + if text_elem is None or not text_elem.text: + continue + text = text_elem.text + + # Get sentence metadata + parag_no_str = sent_elem.get("paragNo") + sent_no_str = sent_elem.get("sentNo") + corp_id_str = sent_elem.get("corpID") + doc_id_str = sent_elem.get("docID") + apos_str = sent_elem.get("aPos") + + # Parse annotation sets + annotation_sets: list[AnnotationSet] = [] + for annoset_elem in sent_elem.findall(self._tag("annotationSet")): + try: + annoset = self._parse_annotation_set(annoset_elem, sent_id) + if annoset is not None: + annotation_sets.append(annoset) + except (ValueError, TypeError): + continue + + try: + sentence = Sentence( + id=sent_id, + text=text, + paragNo=int(parag_no_str) if parag_no_str else None, + sentNo=int(sent_no_str) if sent_no_str else None, + corpID=int(corp_id_str) if corp_id_str else None, + docID=int(doc_id_str) if doc_id_str else None, + apos=int(apos_str) if apos_str else None, + annotation_sets=annotation_sets, + ) + sentences.append(sentence) + except (ValueError, TypeError) as e: + print(f"Warning: Failed to parse sentence {sent_id}: {e}") + continue + + return sentences + + def convert_fulltext_directory( + self, + input_dir: Path | str, + output_file: Path | str, + pattern: str = "*.xml", + ) -> int: + """Convert all fulltext files in a directory to JSON Lines. + + Parameters + ---------- + input_dir : Path | str + Directory containing fulltext XML files. + output_file : Path | str + Output JSON Lines file path. + pattern : str, default="*.xml" + File pattern to match. + + Returns + ------- + int + Number of sentences converted. + + Examples + -------- + >>> converter = FrameNetConverter() + >>> count = converter.convert_fulltext_directory( + ... "framenet_v17/fulltext", + ... "fulltext.jsonl" + ... ) + >>> print(f"Converted {count} sentences") + """ + input_dir = Path(input_dir) + output_file = Path(output_file) + + count = 0 + errors: list[tuple[Path, Exception]] = [] + + with output_file.open("w", encoding="utf-8") as f: + for xml_file in sorted(input_dir.glob(pattern)): + try: + sentences = self.convert_fulltext_file(xml_file) + for sentence in sentences: + json_line = sentence.model_dump_json(exclude_none=True) + f.write(json_line + "\n") + count += 1 + except (etree.XMLSyntaxError, ValueError, TypeError) as e: + errors.append((xml_file, e)) + + if errors: + error_details = "\n".join(f" - {file}: {error}" for file, error in errors) + total_files = len(list(input_dir.glob(pattern))) + error_msg = ( + f"Failed to convert {len(errors)} out of {total_files} files:\n{error_details}" + ) + raise RuntimeError(error_msg) + + return count + def convert_frames_directory( self, input_dir: Path | str, @@ -509,6 +1133,8 @@ def convert_frames_directory( This method parses frame XML files and associates them with lexical units from luIndex.xml (expected to be in the parent directory of input_dir). 
+ It also loads frame relations from frRelation.xml and enriches LUs with + valence patterns and semantic types from individual lu/*.xml files. Parameters ---------- @@ -570,6 +1196,35 @@ def convert_frames_directory( for frame in frames: frame.lexical_units = lu_by_frame.get(frame.id, []) + # Load frame relations from frRelation.xml + fr_relation_path = parent_dir / "frRelation.xml" + if fr_relation_path.exists(): + try: + relations_by_frame = self.convert_frame_relations_file(fr_relation_path) + for frame in frames: + frame.frame_relations = relations_by_frame.get(frame.id, []) + except (etree.XMLSyntaxError, ValueError, TypeError) as e: + print(f"Warning: Failed to load frame relations from {fr_relation_path}: {e}") + + # Enrich LUs with valence patterns and semtypes from individual lu/*.xml files + lu_dir = parent_dir / "lu" + if lu_dir.is_dir(): + for frame in frames: + for lu in frame.lexical_units: + lu_file = lu_dir / f"lu{lu.id}.xml" + if lu_file.exists(): + try: + valence_patterns, semtypes, _annotation_sets = self.convert_lu_file( + lu_file + ) + if valence_patterns: + lu.valence_patterns = valence_patterns + if semtypes: + lu.semtypes = semtypes + except (etree.XMLSyntaxError, ValueError, TypeError) as e: + print(f"Warning: Failed to parse LU file {lu_file}: {e}") + continue + # Write frames with LUs to output file count = 0 with output_file.open("w", encoding="utf-8") as f: diff --git a/src/glazing/framenet/loader.py b/src/glazing/framenet/loader.py index 5d91108..8d5e6df 100644 --- a/src/glazing/framenet/loader.py +++ b/src/glazing/framenet/loader.py @@ -36,7 +36,7 @@ from collections import defaultdict from pathlib import Path -from glazing.framenet.models import Frame, LexicalUnit, SemanticType +from glazing.framenet.models import Frame, LexicalUnit, SemanticType, Sentence from glazing.framenet.types import FrameID from glazing.initialize import get_default_data_path @@ -452,14 +452,15 @@ def load_lexical_units( return lexical_units def load_semantic_types( - self, filepath: Path | str, skip_errors: bool = False + self, filepath: Path | str | None = None, skip_errors: bool = False ) -> list[SemanticType]: """Load SemanticType models from JSON Lines file. Parameters ---------- - filepath : Path | str + filepath : Path | str | None, optional Path to JSON Lines file containing SemanticType data. + If None, looks for ``framenet_semtypes.jsonl`` alongside the primary data file. skip_errors : bool, default=False If True, skip invalid lines rather than raising errors. @@ -475,6 +476,8 @@ def load_semantic_types( ValueError If skip_errors=False and a line fails validation. """ + if filepath is None: + filepath = self.data_path.parent / "framenet_semtypes.jsonl" filepath = Path(filepath) if not filepath.exists(): msg = f"FrameNet semantic types file not found: {filepath}" @@ -486,6 +489,42 @@ def load_semantic_types( return sem_types + def load_fulltext( + self, filepath: Path | str | None = None, skip_errors: bool = False + ) -> list[Sentence]: + """Load Sentence models from fulltext JSON Lines file. + + Parameters + ---------- + filepath : Path | str | None, optional + Path to JSON Lines file containing Sentence data. + If None, looks for ``framenet_fulltext.jsonl`` alongside the primary data file. + skip_errors : bool, default=False + If True, skip invalid lines rather than raising errors. + + Returns + ------- + list[Sentence] + List of loaded Sentence models. + + Raises + ------ + FileNotFoundError + If the input file does not exist. 
+ """ + if filepath is None: + filepath = self.data_path.parent / "framenet_fulltext.jsonl" + filepath = Path(filepath) + if not filepath.exists(): + msg = f"FrameNet fulltext file not found: {filepath}" + raise FileNotFoundError(msg) + + sentences = [] + for sentence in Sentence.from_json_lines_file(filepath, skip_errors=skip_errors): + sentences.append(sentence) + + return sentences + def build_frame_index(self, frames: list[Frame]) -> FrameIndex: """Build searchable index from frames data. From 6486fa8d3ab0c2014b6edaadd5234c0a01addb93 Mon Sep 17 00:00:00 2001 From: Aaron Steven White Date: Thu, 5 Feb 2026 19:38:56 -0500 Subject: [PATCH 03/11] Fixes VerbNet converter to populate framenet_mappings and propbank_mappings from member attributes. --- pyproject.toml | 2 +- src/glazing/verbnet/converter.py | 49 +++++++++++++++++++++++++++++--- 2 files changed, 46 insertions(+), 5 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index ab89ebd..4af2bf1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -136,7 +136,7 @@ ignore = [ "src/glazing/utils/xml_parser.py" = ["S320"] "src/glazing/wordnet/converter.py" = ["C901", "PLR0912", "PLR0915"] "src/glazing/wordnet/loader.py" = ["C901", "PLR0912"] -"src/glazing/verbnet/converter.py" = ["C901", "S320"] +"src/glazing/verbnet/converter.py" = ["C901", "S320", "PLR0912"] "src/glazing/framenet/converter.py" = ["S320", "C901", "PLR0912", "PLR0915"] "src/glazing/propbank/converter.py" = ["S320"] diff --git a/src/glazing/verbnet/converter.py b/src/glazing/verbnet/converter.py index 5a6452d..e843326 100644 --- a/src/glazing/verbnet/converter.py +++ b/src/glazing/verbnet/converter.py @@ -39,11 +39,17 @@ from __future__ import annotations import re +from datetime import UTC, datetime from pathlib import Path from typing import cast from lxml import etree +from glazing.references.models import ( + CrossReference, + MappingMetadata, + VerbNetFrameNetMapping, +) from glazing.types import LogicType from glazing.utils.xml_parser import parse_attributes from glazing.verbnet.models import ( @@ -268,8 +274,8 @@ def _parse_members(self, element: etree._Element) -> list[Member]: # Parse cross-references from attributes features = {} wn_senses = str(attrs.get("wn", "")).strip() - str(attrs.get("grouping", "")).strip() - str(attrs.get("fn_mapping", "")).strip() + grouping = str(attrs.get("grouping", "")).strip() + fn_mapping = str(attrs.get("fn_mapping", "")).strip() features_str = str(attrs.get("features", "")).strip() # Parse features if present @@ -292,14 +298,49 @@ def _parse_members(self, element: etree._Element) -> list[Member]: # Skip invalid percentage notation continue + # Parse FrameNet mappings from fn_mapping attribute + framenet_mappings: list[VerbNetFrameNetMapping] = [] + if fn_mapping and fn_mapping != "None": + framenet_mappings.append( + VerbNetFrameNetMapping( + frame_name=fn_mapping, + confidence=None, + mapping_source="manual", + role_mappings=[], + ) + ) + + # Parse PropBank mappings from grouping attribute + propbank_mappings: list[CrossReference] = [] + if grouping and grouping != "None": + mapping_metadata = MappingMetadata( + created_date=datetime.now(tz=UTC), + created_by="verbnet_xml", + version="3.4", + validation_status="unvalidated", + ) + for roleset_id in grouping.split(): + if roleset_id.strip(): + propbank_mappings.append( + CrossReference( + source_dataset="verbnet", + source_id=verbnet_key, + source_version="3.4", + target_dataset="propbank", + target_id=roleset_id.strip(), + mapping_type="direct", + metadata=mapping_metadata, 
+ ) + ) + # Create member model member = Member( name=name, verbnet_key=verbnet_key, wordnet_mappings=wordnet_mappings, + framenet_mappings=framenet_mappings, + propbank_mappings=propbank_mappings, features=features, - # PropBank and FrameNet mappings would be parsed from - # grouping and fn_mapping attributes here in a full implementation ) members.append(member) From 1e5e275fbbeccbf356ba4e33860d16c8d1298b2e Mon Sep 17 00:00:00 2001 From: Aaron Steven White Date: Thu, 5 Feb 2026 19:39:09 -0500 Subject: [PATCH 04/11] Adds AMR-UMR-91 roleset conversion and XML fixes to PropBank converter. --- src/glazing/propbank/converter.py | 82 +++++++++++++++++++++++++++--- src/glazing/utils/special_cases.py | 9 +++- 2 files changed, 82 insertions(+), 9 deletions(-) diff --git a/src/glazing/propbank/converter.py b/src/glazing/propbank/converter.py index 8489524..8a360e2 100644 --- a/src/glazing/propbank/converter.py +++ b/src/glazing/propbank/converter.py @@ -33,6 +33,7 @@ from pathlib import Path from lxml import etree +from pydantic import ValidationError from glazing.propbank.models import ( Alias, @@ -281,14 +282,18 @@ def _parse_propbank_annotation(self, propbank_elem: etree._Element) -> PropBankA with contextlib.suppress(ValueError, TypeError): end = int(end) - args.append( - Arg( - type=str(attrs["type"]), # type: ignore[arg-type] - start=start, # type: ignore[arg-type] - end=end, # type: ignore[arg-type] - text=arg.text, + try: + args.append( + Arg( + type=str(attrs["type"]), # type: ignore[arg-type] + start=start, # type: ignore[arg-type] + end=end, # type: ignore[arg-type] + text=arg.text, + ) ) - ) + except ValidationError: + # Skip args with non-standard types (e.g., AMR annotations) + continue # PropBankAnnotation expects a single Rel, not a list # Handle missing rel element (some annotations don't have it) @@ -486,6 +491,52 @@ def convert_frameset_file(self, filepath: Path | str) -> Frameset: return Frameset(predicate_lemma=predicate_lemma, rolesets=rolesets, notes=notes) + def convert_combined_frameset_file(self, filepath: Path | str) -> list[Frameset]: + """Convert a combined frameset XML file with multiple predicates. + + Handles files like AMR-UMR-91-rolesets.xml where a single + root contains multiple children. + + Parameters + ---------- + filepath : Path | str + Path to combined frameset XML file. + + Returns + ------- + list[Frameset] + List of parsed Frameset model instances, one per predicate. + """ + filepath = Path(filepath) + xml_content = filepath.read_text(encoding="utf-8") + xml_content = self._fix_xml_errors(xml_content, filepath) + + tree = etree.parse(BytesIO(xml_content.encode("utf-8"))) + root = tree.getroot() + + framesets: list[Frameset] = [] + for predicate_elem in root.findall("predicate"): + predicate_lemma = predicate_elem.get("lemma", "") + + rolesets = [] + for roleset in predicate_elem.findall("roleset"): + try: + rolesets.append(self._parse_roleset(roleset)) + except (ValidationError, ValueError, TypeError): + # Skip rolesets with non-standard values (e.g., AMR-specific types) + continue + + notes = [] + for note in predicate_elem.findall("note"): + if note.text: + notes.append(note.text) + + framesets.append( + Frameset(predicate_lemma=predicate_lemma, rolesets=rolesets, notes=notes) + ) + + return framesets + def convert_framesets_directory( self, input_dir: Path | str, @@ -494,6 +545,9 @@ def convert_framesets_directory( ) -> int: """Convert all frameset files in a directory to JSON Lines. 
+ Also processes combined frameset files (e.g., AMR-UMR-91-rolesets.xml) + found in the parent directory. + Parameters ---------- input_dir : Path | str @@ -525,16 +579,28 @@ def convert_framesets_directory( errors: list[tuple[Path, Exception]] = [] with output_file.open("w", encoding="utf-8") as f: + # Convert individual frameset files for xml_file in sorted(input_dir.glob(pattern)): try: frameset = self.convert_frameset_file(xml_file) - # Write as JSON Lines json_line = frameset.model_dump_json(exclude_none=True) f.write(json_line + "\n") count += 1 except (etree.XMLSyntaxError, ValueError, TypeError) as e: errors.append((xml_file, e)) + # Also process combined frameset files in parent directory + amr_file = input_dir.parent / "AMR-UMR-91-rolesets.xml" + if amr_file.exists(): + try: + amr_framesets = self.convert_combined_frameset_file(amr_file) + for frameset in amr_framesets: + json_line = frameset.model_dump_json(exclude_none=True) + f.write(json_line + "\n") + count += 1 + except (etree.XMLSyntaxError, ValueError, TypeError) as e: + errors.append((amr_file, e)) + # If there were any errors, raise an exception with details if errors: error_details = "\n".join(f" - {file}: {error}" for file, error in errors) diff --git a/src/glazing/utils/special_cases.py b/src/glazing/utils/special_cases.py index 9d2def5..6ae1147 100644 --- a/src/glazing/utils/special_cases.py +++ b/src/glazing/utils/special_cases.py @@ -30,7 +30,14 @@ class SpecialCaseRegistry: "replacement": ">in", "description": "Mismatched closing tag", } - ] + ], + "AMR-UMR-91-rolesets.xml": [ + { + "pattern": " \n ", + "replacement": " ", + "description": "Duplicate closing tag in reference-illustration.91", + } + ], } PROPBANK_ROLESET_EXCEPTIONS: ClassVar[dict[str, str]] = { From 38ec4fe2abc14d3a062aa32a569df4df37787fc5 Mon Sep 17 00:00:00 2001 From: Aaron Steven White Date: Thu, 5 Feb 2026 19:57:50 -0500 Subject: [PATCH 05/11] Updates initialize.py to convert supplementary WordNet and FrameNet data files. 
--- src/glazing/initialize.py | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/src/glazing/initialize.py b/src/glazing/initialize.py index 7dde27b..9e3fb2b 100644 --- a/src/glazing/initialize.py +++ b/src/glazing/initialize.py @@ -168,6 +168,30 @@ def _process_dataset(name: str, data_dir: Path, verbose: bool) -> bool: return True +def _convert_wordnet_supplementary( + converter: object, source: Path, converted_dir: Path, verbose: bool +) -> None: + sense_count = converter.convert_sense_index(source, converted_dir / "wordnet_senses.jsonl") # type: ignore[attr-defined] + exc_count = converter.convert_exceptions(source, converted_dir / "wordnet_exceptions.jsonl") # type: ignore[attr-defined] + if verbose: + click.echo(f" ✓ Converted {sense_count} senses, {exc_count} exceptions") + + +def _convert_framenet_supplementary( + converter: object, download_path: Path, converted_dir: Path, verbose: bool +) -> None: + semtype_count = converter.convert_semtypes_file( # type: ignore[attr-defined] + download_path / "semTypes.xml", converted_dir / "framenet_semtypes.jsonl" + ) + if verbose: + click.echo(f" ✓ Converted {semtype_count} semantic types") + fulltext_count = converter.convert_fulltext_directory( # type: ignore[attr-defined] + download_path / "fulltext", converted_dir / "framenet_fulltext.jsonl" + ) + if verbose: + click.echo(f" ✓ Converted {fulltext_count} fulltext sentences") + + def _convert_dataset( name: str, download_path: Path, output: Path, converter: object, verbose: bool ) -> None: @@ -200,13 +224,15 @@ def _convert_dataset( source = download_path stats = converter.convert_wordnet_database(source, output) # type: ignore[attr-defined] if verbose: - synset_count = sum(v for k, v in stats.items() if k.startswith("synsets_")) - click.echo(f" ✓ Converted {synset_count} synsets") + total = stats.get("total_synsets", 0) + click.echo(f" ✓ Converted {total} synsets") + _convert_wordnet_supplementary(converter, source, output.parent, verbose) elif name == "framenet": source = download_path / "frame" count = converter.convert_frames_directory(source, output) # type: ignore[attr-defined] if verbose: click.echo(f" ✓ Converted {count} frames") + _convert_framenet_supplementary(converter, download_path, output.parent, verbose) def initialize_datasets( From e0d0ad954ca1dc04dc330fb5f2e3a6f3340841e4 Mon Sep 17 00:00:00 2001 From: Aaron Steven White Date: Thu, 5 Feb 2026 22:06:36 -0500 Subject: [PATCH 06/11] Adds converter-to-loader round-trip integration tests for all four resources. --- tests/test_integration/__init__.py | 0 .../test_converter_loader_roundtrip.py | 829 ++++++++++++++++++ 2 files changed, 829 insertions(+) create mode 100644 tests/test_integration/__init__.py create mode 100644 tests/test_integration/test_converter_loader_roundtrip.py diff --git a/tests/test_integration/__init__.py b/tests/test_integration/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_integration/test_converter_loader_roundtrip.py b/tests/test_integration/test_converter_loader_roundtrip.py new file mode 100644 index 0000000..1829a00 --- /dev/null +++ b/tests/test_integration/test_converter_loader_roundtrip.py @@ -0,0 +1,829 @@ +"""Converter-to-loader round-trip integration tests. + +Tests that data survives the full pipeline: raw format → converter → JSONL → loader. +Covers all four resources (WordNet, FrameNet, VerbNet, PropBank) plus contract +and field completeness checks. 
+""" + +import json + +import pytest + +from glazing.framenet.converter import FrameNetConverter +from glazing.framenet.loader import FrameNetLoader +from glazing.propbank.converter import PropBankConverter +from glazing.propbank.loader import PropBankLoader +from glazing.verbnet.converter import VerbNetConverter +from glazing.verbnet.loader import VerbNetLoader +from glazing.wordnet.converter import WordNetConverter +from glazing.wordnet.loader import WordNetLoader + +# ── WordNet ──────────────────────────────────────────────────────────────── + + +WN_LICENSE_HEADER = """\ + 1 This software and database is being provided to you, the LICENSEE, by + 2 Princeton University under the following license. + 3 + 4 + 5 +""" + + +class TestWordNetRoundTrip: + """WordNet converter → JSONL → loader pipeline.""" + + @pytest.fixture + def wordnet_data(self, tmp_path): + """Create a minimal WordNet database and run the full conversion pipeline.""" + wn_dir = tmp_path / "wn" + wn_dir.mkdir() + output_dir = tmp_path / "out" + output_dir.mkdir() + + # data.verb (2 synsets, second has 2 verb frames) + (wn_dir / "data.verb").write_text( + WN_LICENSE_HEADER + + "00001740 29 v 01 breathe 0 002 $ 00001740 v 0000 @ 00002084 v 0000 01 + 02 00 | draw air into and expel out of the lungs\n" + + "00002084 29 v 02 respire 0 breathe 1 001 @ 00001740 v 0000 02 + 01 00 + 02 01 | undergo respiration\n", + encoding="utf-8", + ) + # data.noun (1 synset) + (wn_dir / "data.noun").write_text( + WN_LICENSE_HEADER + + "00002325 03 n 01 entity 0 001 ~ 00002684 n 0000 | something having concrete existence\n", + encoding="utf-8", + ) + # data.adj / data.adv (empty but present) + for name in ("data.adj", "data.adv"): + (wn_dir / name).write_text(WN_LICENSE_HEADER, encoding="utf-8") + + # index.sense (sense_key synset_offset sense_number tag_count) + (wn_dir / "index.sense").write_text( + "breathe%2:29:00:: 00001740 1 25\n" + "respire%2:29:00:: 00002084 1 3\n" + "breathe%2:29:01:: 00002084 2 0\n" + "entity%1:03:00:: 00002325 1 11\n", + encoding="utf-8", + ) + + # verb.Framestext + (wn_dir / "verb.Framestext").write_text( + "1 Something ----s\n2 Somebody ----s\n", + encoding="utf-8", + ) + + # sents.vrb + (wn_dir / "sents.vrb").write_text( + "1 The children %s to the playground\n2 The banks %s the check\n", + encoding="utf-8", + ) + + # cntlist + (wn_dir / "cntlist").write_text( + "25 breathe%2:29:00:: 1\n3 respire%2:29:00:: 1\n11 entity%1:03:00:: 1\n", + encoding="utf-8", + ) + + # verb.exc + (wn_dir / "verb.exc").write_text( + "breathed breathe\nrespired respire\n", + encoding="utf-8", + ) + # Create empty exception files for remaining POS categories + for name in ("noun.exc", "adj.exc", "adv.exc"): + (wn_dir / name).write_text("", encoding="utf-8") + + # Run conversions + converter = WordNetConverter() + stats = converter.convert_wordnet_database(wn_dir, output_dir / "wordnet.jsonl") + sense_count = converter.convert_sense_index(wn_dir, output_dir / "wordnet_senses.jsonl") + exc_count = converter.convert_exceptions(wn_dir, output_dir / "wordnet_exceptions.jsonl") + + loader = WordNetLoader(data_path=output_dir / "wordnet.jsonl") + + return { + "stats": stats, + "sense_count": sense_count, + "exc_count": exc_count, + "loader": loader, + "output_dir": output_dir, + } + + def test_synset_count_preserved(self, wordnet_data): + """Converter counts match loader counts.""" + stats = wordnet_data["stats"] + wn = wordnet_data["loader"] + + assert stats["synsets_verb"] == 2 + assert stats["synsets_noun"] == 1 + assert stats["total_synsets"] 
== 3 + assert len(wn.synsets) == 3 + + def test_word_enrichment(self, wordnet_data): + """Words have tag_count and sense_number from cntlist/index.sense.""" + wn = wordnet_data["loader"] + + # breathe in synset 00001740 should have tag_count=25, sense_number=1 + synset = wn.synsets["00001740"] + breathe_word = synset.words[0] + assert breathe_word.lemma == "breathe" + assert breathe_word.tag_count == 25 + assert breathe_word.sense_number == 1 + + # entity in synset 00002325 should have tag_count=11 + entity_synset = wn.synsets["00002325"] + entity_word = entity_synset.words[0] + assert entity_word.lemma == "entity" + assert entity_word.tag_count == 11 + + def test_verb_frame_templates(self, wordnet_data): + """VerbFrames have template and example_sentence from verb.Framestext/sents.vrb.""" + wn = wordnet_data["loader"] + synset = wn.synsets["00001740"] + + assert synset.frames is not None + assert len(synset.frames) == 1 + + frame = synset.frames[0] + assert frame.frame_number == 2 + assert frame.template == "Somebody ----s" + assert frame.example_sentence == "The banks %s the check" + + def test_pointers_preserved(self, wordnet_data): + """Pointer relations survive the round-trip.""" + wn = wordnet_data["loader"] + synset = wn.synsets["00001740"] + + assert len(synset.pointers) == 2 + symbols = {p.symbol for p in synset.pointers} + assert "$" in symbols + assert "@" in symbols + + def test_lemma_index_builds(self, wordnet_data): + """Lemma index enables word lookups after loading.""" + wn = wordnet_data["loader"] + + assert "breathe" in wn.lemma_index + assert "v" in wn.lemma_index["breathe"] + assert "entity" in wn.lemma_index + assert "n" in wn.lemma_index["entity"] + + def test_sense_index_loads(self, wordnet_data): + """Sense index populated from supplementary wordnet_senses.jsonl.""" + wn = wordnet_data["loader"] + + assert len(wn.sense_index) == 4 + assert "breathe%2:29:00::" in wn.sense_index + sense = wn.sense_index["breathe%2:29:00::"] + assert sense.synset_offset == "00001740" + assert sense.tag_count == 25 + + def test_exceptions_load(self, wordnet_data): + """Morphological exceptions loaded from supplementary wordnet_exceptions.jsonl.""" + wn = wordnet_data["loader"] + + assert "v" in wn.exceptions + assert "breathed" in wn.exceptions["v"] + assert wn.exceptions["v"]["breathed"] == ["breathe"] + + +# ── FrameNet ────────────────────────────────────────────────────────────── + + +class TestFrameNetRoundTrip: + """FrameNet converter → JSONL → loader pipeline.""" + + @pytest.fixture + def framenet_data(self, tmp_path): + """Create a minimal FrameNet dataset and run the full conversion pipeline.""" + fn_root = tmp_path / "framenet" + frames_dir = fn_root / "frame" + lu_dir = fn_root / "lu" + fulltext_dir = fn_root / "fulltext" + output_dir = tmp_path / "output" + + frames_dir.mkdir(parents=True) + lu_dir.mkdir() + fulltext_dir.mkdir() + output_dir.mkdir() + + # Frame XML + (frames_dir / "Giving.xml").write_text( + """\ + + + <def-root>A Donor transfers a Theme to a Recipient.</def-root> + + <def-root>The person that gives.</def-root> + + + <def-root>The object given.</def-root> + + + <def-root>The person receiving.</def-root> + + + Transferring + + + + COD: freely transfer the possession of + + + + + COD: give to a good cause + + + +""", + encoding="utf-8", + ) + + # luIndex.xml + (fn_root / "luIndex.xml").write_text( + """\ + + + + +""", + encoding="utf-8", + ) + + # frRelation.xml + (fn_root / "frRelation.xml").write_text( + """\ + + + + + + + +""", + encoding="utf-8", + ) + + # 
semTypes.xml + (fn_root / "semTypes.xml").write_text( + """\ + + + + A type for physical entities + + +""", + encoding="utf-8", + ) + + # Fulltext XML + (fulltext_dir / "TestDoc.xml").write_text( + """\ + + +
+ + + +
+ + He gave her a book. + + + + + + + +
""", + encoding="utf-8", + ) + + # Run conversions + converter = FrameNetConverter() + frame_count = converter.convert_frames_directory(frames_dir, output_dir / "framenet.jsonl") + semtype_count = converter.convert_semtypes_file( + fn_root / "semTypes.xml", output_dir / "framenet_semtypes.jsonl" + ) + fulltext_count = converter.convert_fulltext_directory( + fulltext_dir, output_dir / "framenet_fulltext.jsonl" + ) + + loader = FrameNetLoader(data_path=output_dir / "framenet.jsonl") + + return { + "loader": loader, + "frame_count": frame_count, + "semtype_count": semtype_count, + "fulltext_count": fulltext_count, + } + + def test_frame_count_preserved(self, framenet_data): + """Converter reports 1 frame; loader reads 1 frame.""" + assert framenet_data["frame_count"] == 1 + frames = framenet_data["loader"].frames + assert len(frames) == 1 + assert frames[0].id == 139 + assert frames[0].name == "Giving" + + def test_frame_elements_preserved(self, framenet_data): + """FE names and core types survive the round trip.""" + frame = framenet_data["loader"].frames[0] + fe_names = {fe.name for fe in frame.frame_elements} + assert fe_names == {"Donor", "Theme", "Recipient"} + for fe in frame.frame_elements: + assert fe.core_type == "Core" + + def test_lexical_units_preserved(self, framenet_data): + """LU names and POS survive the round trip.""" + frame = framenet_data["loader"].frames[0] + lu_names = {lu.name for lu in frame.lexical_units} + assert "give.v" in lu_names + assert "donate.v" in lu_names + for lu in frame.lexical_units: + assert lu.pos == "V" + + def test_frame_relations_populated(self, framenet_data): + """Frame relations from frRelation.xml are attached to the frame.""" + frame = framenet_data["loader"].frames[0] + inherits = [r for r in frame.frame_relations if r.type == "Inherits from"] + assert len(inherits) == 1 + rel = inherits[0] + assert rel.sub_frame_id == 139 + assert rel.super_frame_id == 230 + assert rel.super_frame_name == "Transferring" + assert len(rel.fe_relations) == 1 + assert rel.fe_relations[0].sub_fe_name == "Donor" + assert rel.fe_relations[0].super_fe_name == "Sender" + + def test_semantic_types_load(self, framenet_data): + """Semantic types loaded from supplementary framenet_semtypes.jsonl.""" + sem_types = framenet_data["loader"].load_semantic_types() + assert len(sem_types) == 1 + assert sem_types[0].id == 68 + assert sem_types[0].name == "Physical_entity" + + def test_fulltext_loads(self, framenet_data): + """Fulltext sentences loaded from supplementary framenet_fulltext.jsonl.""" + sentences = framenet_data["loader"].load_fulltext() + assert len(sentences) == 1 + assert sentences[0].id == 100 + assert sentences[0].text == "He gave her a book." + assert len(sentences[0].annotation_sets) == 1 + + +# ── VerbNet ─────────────────────────────────────────────────────────────── + + +VERBNET_XML = """\ + + + + + + + + + + + + + + + + + + + + + + + + + + + + They lent a bicycle to me. 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + +""" + + +class TestVerbNetRoundTrip: + """VerbNet converter → JSONL → loader pipeline.""" + + @pytest.fixture + def verbnet_data(self, tmp_path): + """Create VerbNet XML and run the conversion pipeline.""" + vn_dir = tmp_path / "verbnet" + vn_dir.mkdir() + output_dir = tmp_path / "output" + output_dir.mkdir() + + (vn_dir / "give-13.1.xml").write_text(VERBNET_XML, encoding="utf-8") + + converter = VerbNetConverter() + count = converter.convert_verbnet_directory(vn_dir, output_dir / "verbnet.jsonl") + + loader = VerbNetLoader(data_path=output_dir / "verbnet.jsonl") + + return {"count": count, "loader": loader} + + def test_class_count_preserved(self, verbnet_data): + """Converter count matches loader count.""" + assert verbnet_data["count"] == 1 + assert len(verbnet_data["loader"].classes) == 1 + assert "give-13.1" in verbnet_data["loader"].classes + + def test_members_preserved(self, verbnet_data): + """Member names and keys survive the round trip.""" + vc = verbnet_data["loader"].classes["give-13.1"] + member_names = {m.name for m in vc.members} + assert member_names == {"give", "deal", "loan"} + + give = next(m for m in vc.members if m.name == "give") + assert give.verbnet_key == "give#2" + + def test_member_framenet_mappings(self, verbnet_data): + """fn_mapping attribute parsed into framenet_mappings.""" + vc = verbnet_data["loader"].classes["give-13.1"] + + give = next(m for m in vc.members if m.name == "give") + assert len(give.framenet_mappings) == 1 + assert give.framenet_mappings[0].frame_name == "Giving" + + # "None" fn_mapping should result in no mappings + deal = next(m for m in vc.members if m.name == "deal") + assert len(deal.framenet_mappings) == 0 + + def test_member_propbank_mappings(self, verbnet_data): + """grouping attribute parsed into propbank_mappings.""" + vc = verbnet_data["loader"].classes["give-13.1"] + + give = next(m for m in vc.members if m.name == "give") + pb_ids = {xr.target_id for xr in give.propbank_mappings} + assert "give.01" in pb_ids + assert "give.02" in pb_ids + + # empty grouping → no propbank mappings + loan = next(m for m in vc.members if m.name == "loan") + assert len(loan.propbank_mappings) == 0 + + def test_themroles_preserved(self, verbnet_data): + """Thematic roles survive the round trip.""" + vc = verbnet_data["loader"].classes["give-13.1"] + role_types = {r.type for r in vc.themroles} + assert role_types == {"Agent", "Theme", "Recipient"} + + def test_subclass_hierarchy(self, verbnet_data): + """Subclass members accessible through the class hierarchy.""" + vc = verbnet_data["loader"].classes["give-13.1"] + assert len(vc.subclasses) == 1 + + sub = vc.subclasses[0] + assert sub.id == "give-13.1-1" + assert len(sub.members) == 1 + assert sub.members[0].name == "sell" + + def test_member_index_builds(self, verbnet_data): + """Member index allows looking up class by verbnet key.""" + vn = verbnet_data["loader"] + assert "give#2" in vn.member_index + assert vn.member_index["give#2"] == "give-13.1" + + +# ── PropBank ────────────────────────────────────────────────────────────── + + +PROPBANK_XML = """\ + + + + + + + abandon + abandonment + + + + + theme + + + + + + + + + + + + + John abandoned the project. 
+ + abandoned + John + the project + + + + +""" + + +class TestPropBankRoundTrip: + """PropBank converter → JSONL → loader pipeline.""" + + @pytest.fixture + def propbank_data(self, tmp_path): + """Create PropBank XML and run the conversion pipeline.""" + pb_dir = tmp_path / "frames" + pb_dir.mkdir() + output_dir = tmp_path / "output" + output_dir.mkdir() + + (pb_dir / "abandon.xml").write_text(PROPBANK_XML, encoding="utf-8") + + converter = PropBankConverter() + count = converter.convert_framesets_directory(pb_dir, output_dir / "propbank.jsonl") + + loader = PropBankLoader(data_path=output_dir / "propbank.jsonl") + + return {"count": count, "loader": loader} + + def test_frameset_count_preserved(self, propbank_data): + """Converter count matches loader count.""" + assert propbank_data["count"] == 1 + assert len(propbank_data["loader"].framesets) == 1 + assert "abandon" in propbank_data["loader"].framesets + + def test_roles_preserved(self, propbank_data): + """Roles survive the round trip with correct attributes.""" + fs = propbank_data["loader"].framesets["abandon"] + rs = fs.rolesets[0] + assert rs.id == "abandon.01" + assert rs.name == "leave behind" + + role_numbers = {r.n for r in rs.roles} + assert "0" in role_numbers + assert "1" in role_numbers + + role0 = next(r for r in rs.roles if r.n == "0") + assert role0.descr == "abandoner" + + def test_lexlinks_preserved(self, propbank_data): + """Lexical links survive the round trip.""" + rs = propbank_data["loader"].framesets["abandon"].rolesets[0] + + assert len(rs.lexlinks) == 2 + resources = {ll.resource for ll in rs.lexlinks} + assert "FrameNet" in resources + assert "VerbNet" in resources + + fn_link = next(ll for ll in rs.lexlinks if ll.resource == "FrameNet") + assert fn_link.class_name == "Abandonment" + assert fn_link.confidence == pytest.approx(0.8) + + def test_examples_preserved(self, propbank_data): + """Example annotations survive the round trip.""" + rs = propbank_data["loader"].framesets["abandon"].rolesets[0] + assert len(rs.examples) == 1 + + ex = rs.examples[0] + assert ex.name == "typical transitive" + assert "abandoned" in ex.text + + def test_roleset_index_builds(self, propbank_data): + """Roleset index allows looking up frameset by roleset ID.""" + pb = propbank_data["loader"] + assert "abandon.01" in pb.roleset_index + assert pb.roleset_index["abandon.01"] == "abandon" + + +# ── Contract Tests ──────────────────────────────────────────────────────── + + +class TestConverterLoaderContracts: + """Contract tests: all converters produce valid JSONL, all loaders can read it.""" + + def test_all_converters_produce_valid_jsonl(self, tmp_path): + """Every line in every converter output is valid JSON.""" + output_dir = tmp_path / "output" + output_dir.mkdir() + wn_dir = tmp_path / "wn" + wn_dir.mkdir() + + # Minimal WordNet data + (wn_dir / "data.verb").write_text( + WN_LICENSE_HEADER + + "00001740 29 v 01 breathe 0 001 @ 00001740 v 0000 01 + 02 00 | breathe\n", + encoding="utf-8", + ) + for name in ("data.noun", "data.adj", "data.adv"): + (wn_dir / name).write_text(WN_LICENSE_HEADER, encoding="utf-8") + (wn_dir / "index.sense").write_text("breathe%2:29:00:: 00001740 1 0\n", encoding="utf-8") + (wn_dir / "verb.Framestext").write_text("", encoding="utf-8") + (wn_dir / "sents.vrb").write_text("", encoding="utf-8") + (wn_dir / "cntlist").write_text("", encoding="utf-8") + for name in ("verb.exc", "noun.exc", "adj.exc", "adv.exc"): + (wn_dir / name).write_text("", encoding="utf-8") + + converter = WordNetConverter() + 
converter.convert_wordnet_database(wn_dir, output_dir / "wordnet.jsonl") + + # Verify every line is valid JSON + with (output_dir / "wordnet.jsonl").open() as f: + for i, line in enumerate(f): + obj = json.loads(line) + assert isinstance(obj, dict), f"Line {i} is not a JSON object" + + def test_supplementary_files_optional(self, tmp_path): + """Loaders work without supplementary files.""" + output_dir = tmp_path / "output" + output_dir.mkdir() + wn_dir = tmp_path / "wn" + wn_dir.mkdir() + + (wn_dir / "data.verb").write_text( + WN_LICENSE_HEADER + + "00001740 29 v 01 breathe 0 001 @ 00001740 v 0000 01 + 02 00 | breathe\n", + encoding="utf-8", + ) + for name in ("data.noun", "data.adj", "data.adv"): + (wn_dir / name).write_text(WN_LICENSE_HEADER, encoding="utf-8") + (wn_dir / "index.sense").write_text("breathe%2:29:00:: 00001740 1 0\n", encoding="utf-8") + (wn_dir / "verb.Framestext").write_text("", encoding="utf-8") + (wn_dir / "sents.vrb").write_text("", encoding="utf-8") + (wn_dir / "cntlist").write_text("", encoding="utf-8") + for name in ("verb.exc", "noun.exc", "adj.exc", "adv.exc"): + (wn_dir / name).write_text("", encoding="utf-8") + + converter = WordNetConverter() + converter.convert_wordnet_database(wn_dir, output_dir / "wordnet.jsonl") + + # Load without supplementary files + wn = WordNetLoader(data_path=output_dir / "wordnet.jsonl") + assert len(wn.synsets) == 1 + # sense_index and exceptions should be empty but loader shouldn't crash + assert len(wn.sense_index) == 0 + assert len(wn.exceptions) == 0 + + +# ── Field Completeness Tests ────────────────────────────────────────────── + + +class TestFieldCompleteness: + """Verify that key model fields are populated after conversion.""" + + def test_wordnet_field_completeness(self, tmp_path): + """WordNet synsets have all expected fields.""" + wn_dir = tmp_path / "wn" + wn_dir.mkdir() + output = tmp_path / "wordnet.jsonl" + + (wn_dir / "data.verb").write_text( + WN_LICENSE_HEADER + + "00001740 29 v 01 breathe 0 001 @ 00002084 v 0000 01 + 02 00 | draw air\n", + encoding="utf-8", + ) + for name in ("data.noun", "data.adj", "data.adv"): + (wn_dir / name).write_text(WN_LICENSE_HEADER, encoding="utf-8") + (wn_dir / "index.sense").write_text("breathe%2:29:00:: 00001740 1 5\n", encoding="utf-8") + (wn_dir / "verb.Framestext").write_text("2 Somebody ----s\n", encoding="utf-8") + (wn_dir / "sents.vrb").write_text("2 The banks %s\n", encoding="utf-8") + (wn_dir / "cntlist").write_text("5 breathe%2:29:00:: 1\n", encoding="utf-8") + for name in ("verb.exc", "noun.exc", "adj.exc", "adv.exc"): + (wn_dir / name).write_text("", encoding="utf-8") + + converter = WordNetConverter() + converter.convert_wordnet_database(wn_dir, output) + + with output.open() as f: + obj = json.loads(f.readline()) + + # Core fields + assert "offset" in obj + assert "lex_filenum" in obj + assert "ss_type" in obj + assert "words" in obj + assert len(obj["words"]) > 0 + assert "pointers" in obj + assert "gloss" in obj + + # Enriched word fields + word = obj["words"][0] + assert word["lemma"] == "breathe" + assert word["tag_count"] == 5 + assert word["sense_number"] == 1 + + # Verb frame fields + assert "frames" in obj + assert len(obj["frames"]) == 1 + frame = obj["frames"][0] + assert "frame_number" in frame + assert frame["template"] == "Somebody ----s" + assert frame["example_sentence"] == "The banks %s" + + def test_verbnet_field_completeness(self, tmp_path): + """VerbNet members have cross-resource mapping fields populated.""" + vn_dir = tmp_path / "vn" + 
vn_dir.mkdir() + output = tmp_path / "verbnet.jsonl" + + (vn_dir / "give-13.1.xml").write_text(VERBNET_XML, encoding="utf-8") + + converter = VerbNetConverter() + converter.convert_verbnet_directory(vn_dir, output) + + with output.open() as f: + obj = json.loads(f.readline()) + + # Check members + give = next(m for m in obj["members"] if m["name"] == "give") + assert len(give["framenet_mappings"]) == 1 + assert len(give["propbank_mappings"]) == 2 + assert len(give["wordnet_mappings"]) >= 1 + + # Check themroles + assert len(obj["themroles"]) == 3 + + # Check frames + assert len(obj["frames"]) == 1 + + def test_propbank_field_completeness(self, tmp_path): + """PropBank rolesets have all expected fields.""" + pb_dir = tmp_path / "pb" + pb_dir.mkdir() + output = tmp_path / "propbank.jsonl" + + (pb_dir / "abandon.xml").write_text(PROPBANK_XML, encoding="utf-8") + + converter = PropBankConverter() + converter.convert_framesets_directory(pb_dir, output) + + with output.open() as f: + obj = json.loads(f.readline()) + + assert "predicate_lemma" in obj + assert obj["predicate_lemma"] == "abandon" + assert "rolesets" in obj + + rs = obj["rolesets"][0] + assert rs["id"] == "abandon.01" + assert len(rs["roles"]) == 2 + assert len(rs["lexlinks"]) == 2 + assert len(rs["examples"]) == 1 From 6da7cb44fb3770c1a3c5c8570438adb028752be9 Mon Sep 17 00:00:00 2001 From: Aaron Steven White Date: Fri, 6 Feb 2026 13:06:01 -0500 Subject: [PATCH 07/11] Refines lemma validation to allow uppercase letters, digits at the start, and dots; updates tests accordingly. --- src/glazing/references/models.py | 6 +++--- src/glazing/types.py | 4 +++- src/glazing/wordnet/models.py | 2 +- tests/test_base.py | 7 +++++-- tests/test_references/test_models.py | 2 +- tests/test_types.py | 6 ++++-- tests/test_wordnet/test_models.py | 7 +++++-- 7 files changed, 22 insertions(+), 12 deletions(-) diff --git a/src/glazing/references/models.py b/src/glazing/references/models.py index cdb3547..ddb181c 100644 --- a/src/glazing/references/models.py +++ b/src/glazing/references/models.py @@ -57,7 +57,7 @@ from pydantic import BaseModel, Field, field_validator from glazing.propbank.models import LexLink, RoleLink -from glazing.types import DatasetType, MappingSource +from glazing.types import LEMMA_PATTERN, DatasetType, MappingSource from glazing.wordnet.models import Sense, WordNetCrossRef from glazing.wordnet.types import SynsetOffset @@ -584,8 +584,8 @@ def validate_lemma(cls, v: str) -> str: ValueError If lemma format is invalid. """ - if not re.match(r"^[a-z][a-z0-9_\'-]*$", v): - msg = f"Invalid lemma format: {v}" + if not re.match(LEMMA_PATTERN, v): + msg = f"Invalid lemma format: {v!r}" raise ValueError(msg) return v diff --git a/src/glazing/types.py b/src/glazing/types.py index c041ecb..5ad3e00 100644 --- a/src/glazing/types.py +++ b/src/glazing/types.py @@ -161,7 +161,9 @@ VERBNET_KEY_PATTERN = r"^[a-z_-]+#\d+$" # e.g., "give#2" # Name validation patterns -LEMMA_PATTERN = r"^[a-z][a-z0-9_\'-]*$" # Word lemmas +LEMMA_PATTERN = ( + r"^[a-zA-Z0-9][a-zA-Z0-9_\'\-\.\/]*$" # Word lemmas (incl. proper nouns, abbreviations) +) # Color validation for FrameNet HEX_COLOR_PATTERN = r"^#?[0-9A-Fa-f]{6}$" # 6-digit hex color with optional # prefix diff --git a/src/glazing/wordnet/models.py b/src/glazing/wordnet/models.py index 2336d3a..61fba80 100644 --- a/src/glazing/wordnet/models.py +++ b/src/glazing/wordnet/models.py @@ -106,7 +106,7 @@ def validate_lemma(cls, v: str) -> str: If lemma format is invalid. 
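+
+        Examples
+        --------
+        A quick illustration of the relaxed pattern (the lemmas shown are
+        arbitrary examples):
+
+        >>> import re
+        >>> from glazing.types import LEMMA_PATTERN
+        >>> bool(re.match(LEMMA_PATTERN, "Dr."))
+        True
+        >>> bool(re.match(LEMMA_PATTERN, "123abandon"))
+        True
+        >>> bool(re.match(LEMMA_PATTERN, " abandon"))
+        False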
""" if not re.match(LEMMA_PATTERN, v): - msg = f"Invalid lemma format: {v}" + msg = f"Invalid lemma format: {v!r}" raise ValueError(msg) return v diff --git a/tests/test_base.py b/tests/test_base.py index 9ee24a1..486ec46 100644 --- a/tests/test_base.py +++ b/tests/test_base.py @@ -385,11 +385,14 @@ def test_validate_lemma(self): assert validate_lemma("abandon") == "abandon" assert validate_lemma("spray_paint") == "spray_paint" assert validate_lemma("don't") == "don't" + assert validate_lemma("Abandon") == "Abandon" # Uppercase allowed + assert validate_lemma("123abandon") == "123abandon" # Digit start allowed + assert validate_lemma("Dr.") == "Dr." # Dots allowed with pytest.raises(ValueError): - validate_lemma("Abandon") # Uppercase + validate_lemma("") # Empty with pytest.raises(ValueError): - validate_lemma("123abandon") # Starts with number + validate_lemma(" abandon") # Leading space def test_validate_hex_color(self): """Test hex color validation.""" diff --git a/tests/test_references/test_models.py b/tests/test_references/test_models.py index 399639e..eeac360 100644 --- a/tests/test_references/test_models.py +++ b/tests/test_references/test_models.py @@ -455,7 +455,7 @@ def test_invalid_lemma_format(self): """Test invalid lemma format.""" with pytest.raises(ValidationError) as exc_info: UnifiedLemma( - lemma="Give", # Capital letter invalid + lemma=" give", # Leading space invalid pos="v", framenet_lus=[], propbank_rolesets=[], diff --git a/tests/test_types.py b/tests/test_types.py index e573e58..c275e63 100644 --- a/tests/test_types.py +++ b/tests/test_types.py @@ -318,11 +318,13 @@ def test_lemma_pattern(self): assert pattern.match("run_up") assert pattern.match("don't") assert pattern.match("mother-in-law") + assert pattern.match("Give") # Capital letter allowed + assert pattern.match("123run") # Digit start allowed + assert pattern.match("Dr.") # Dots allowed # Invalid lemmas - assert not pattern.match("Give") # Capital letter - assert not pattern.match("123run") # Number start assert not pattern.match("") # Empty string + assert not pattern.match(" give") # Leading space def test_hex_color_pattern(self): """Test hex color pattern.""" diff --git a/tests/test_wordnet/test_models.py b/tests/test_wordnet/test_models.py index 341a271..fe9998e 100644 --- a/tests/test_wordnet/test_models.py +++ b/tests/test_wordnet/test_models.py @@ -34,12 +34,15 @@ def test_word_lemma_validation(self): Word(lemma="dog", lex_id=0) Word(lemma="run_up", lex_id=1) Word(lemma="mother-in-law", lex_id=0) + Word(lemma="Dog", lex_id=0) # Uppercase allowed (proper nouns) + Word(lemma="Dr.", lex_id=0) # Dots allowed (abbreviations) + Word(lemma="123dog", lex_id=0) # Digit start allowed # Invalid lemmas with pytest.raises(ValidationError): - Word(lemma="Dog", lex_id=0) # Capital letter + Word(lemma="", lex_id=0) # Empty with pytest.raises(ValidationError): - Word(lemma="123dog", lex_id=0) # Number start + Word(lemma=" dog", lex_id=0) # Leading space def test_word_lex_id_validation(self): """Test lex_id range validation.""" From 34dc404ad91d3f6fbe25683f6710ca3c75174177 Mon Sep 17 00:00:00 2001 From: Aaron Steven White Date: Fri, 6 Feb 2026 13:06:27 -0500 Subject: [PATCH 08/11] Adds pypi workflow. 
--- .github/workflows/publish.yml | 52 +++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) create mode 100644 .github/workflows/publish.yml diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml new file mode 100644 index 0000000..9851ad1 --- /dev/null +++ b/.github/workflows/publish.yml @@ -0,0 +1,52 @@ +name: Publish to PyPI + +on: + push: + tags: + - "v*" + +env: + PYTHON_VERSION: "3.13" + +jobs: + build: + name: Build Distribution + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: ${{ env.PYTHON_VERSION }} + + - name: Install build dependencies + run: | + python -m pip install --upgrade pip + pip install build + + - name: Build distribution + run: python -m build + + - name: Upload artifacts + uses: actions/upload-artifact@v4 + with: + name: dist + path: dist/ + + publish: + name: Publish to PyPI + needs: build + runs-on: ubuntu-latest + environment: pypi + permissions: + id-token: write + steps: + - name: Download artifacts + uses: actions/download-artifact@v4 + with: + name: dist + path: dist/ + + - name: Publish to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 From 8596c9916ced0d0dcf49c54faecb5b823aaf901d Mon Sep 17 00:00:00 2001 From: Aaron Steven White Date: Fri, 6 Feb 2026 15:31:57 -0500 Subject: [PATCH 09/11] Bumps version and changes repo location. --- CHANGELOG.md | 38 +++++++++++++++++++++++++++----------- README.md | 36 ++++++++++++++++++------------------ docs/api/index.md | 2 +- docs/citation.md | 12 ++++++------ docs/contributing.md | 4 ++-- docs/index.md | 16 ++++++++-------- docs/installation.md | 4 ++-- mkdocs.yml | 6 +++--- pyproject.toml | 8 ++++---- src/glazing/__version__.py | 2 +- 10 files changed, 72 insertions(+), 56 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 05c574a..b6e3a33 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,16 +7,31 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.2.2] - 2026-02-06 + +### Added + +- **PyPI publish workflow** triggered on tag creation using trusted publishers (OIDC) +- **Converter-to-loader round-trip integration tests** for all four resources +- **FrameNet frame relation, LU enrichment, semantic type, and fulltext parsing** in converter and loader +- **Supplementary data conversion** for WordNet senses/exceptions and FrameNet semantic types/fulltext in `initialize.py` + +### Changed + +- **Rewrote WordNet converter and loader** to use enriched single-file JSONL output with supplementary sense and exception files +- **Relaxed lemma validation** to allow uppercase letters, digits at the start, and dots (supporting proper nouns, abbreviations, and numeric prefixes) +- **Moved repository** from `aaronstevenwhite/glazing` to `factslab/glazing` + +### Fixed + +- **VerbNet converter** now populates `framenet_mappings` and `propbank_mappings` from member attributes +- **PropBank converter** now handles AMR-UMR-91 roleset conversion and XML edge cases + ## [0.2.1] - 2025-10-28 ### Fixed -- **FrameNet lexical units now properly loaded during conversion** - - Lexical units are now parsed from `luIndex.xml` during frame conversion - - All frames now include their associated lexical units with complete metadata - - Fixes critical data completeness issue where `frame.lexical_units` was always empty - - Enables querying frames by lexical unit name via the frame index - - Approximately 13,500 lexical units now correctly associated 
with their frames +- **FrameNet converter** now properly loads lexical units from `luIndex.xml` during frame conversion, fixing a critical issue where `frame.lexical_units` was always empty (~13,500 LUs now correctly associated) ## [0.2.0] - 2025-09-30 @@ -197,8 +212,9 @@ Initial release of `glazing`, a package containing unified data models and inter - `tqdm >= 4.60.0` (progress bars) - `rich >= 13.0.0` (CLI formatting) -[Unreleased]: https://github.com/aaronstevenwhite/glazing/compare/v0.2.1...HEAD -[0.2.1]: https://github.com/aaronstevenwhite/glazing/releases/tag/v0.2.1 -[0.2.0]: https://github.com/aaronstevenwhite/glazing/releases/tag/v0.2.0 -[0.1.1]: https://github.com/aaronstevenwhite/glazing/releases/tag/v0.1.1 -[0.1.0]: https://github.com/aaronstevenwhite/glazing/releases/tag/v0.1.0 +[Unreleased]: https://github.com/factslab/glazing/compare/v0.2.2...HEAD +[0.2.2]: https://github.com/factslab/glazing/releases/tag/v0.2.2 +[0.2.1]: https://github.com/factslab/glazing/releases/tag/v0.2.1 +[0.2.0]: https://github.com/factslab/glazing/releases/tag/v0.2.0 +[0.1.1]: https://github.com/factslab/glazing/releases/tag/v0.1.1 +[0.1.0]: https://github.com/factslab/glazing/releases/tag/v0.1.0 diff --git a/README.md b/README.md index 5f93af7..1f1542b 100644 --- a/README.md +++ b/README.md @@ -2,23 +2,23 @@ [![PyPI version](https://img.shields.io/pypi/v/glazing)](https://pypi.org/project/glazing/) [![Python versions](https://img.shields.io/pypi/pyversions/glazing)](https://pypi.org/project/glazing/) -[![CI](https://github.com/aaronstevenwhite/glazing/actions/workflows/ci.yml/badge.svg?branch=main)](https://github.com/aaronstevenwhite/glazing/actions/workflows/ci.yml) +[![CI](https://github.com/factslab/glazing/actions/workflows/ci.yml/badge.svg?branch=main)](https://github.com/factslab/glazing/actions/workflows/ci.yml) [![Documentation](https://readthedocs.org/projects/glazing/badge/?version=latest)](https://glazing.readthedocs.io/en/latest/?badge=latest) -[![License](https://img.shields.io/pypi/l/glazing)](https://github.com/aaronstevenwhite/glazing/blob/main/LICENSE) +[![License](https://img.shields.io/pypi/l/glazing)](https://github.com/factslab/glazing/blob/main/LICENSE) [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.17467082.svg)](https://doi.org/10.5281/zenodo.17467082) Unified data models and interfaces for syntactic and semantic frame ontologies. 
## Features -- 🚀 **One-command setup**: `glazing init` downloads and prepares all datasets -- 📦 **Type-safe models**: Pydantic v2 validation for all data structures -- 🔍 **Unified search**: Query across all datasets with consistent API -- 🔗 **Cross-references**: Automatic mapping between resources with confidence scores -- 🎯 **Fuzzy search**: Find data with typos, spelling variants, and inconsistencies -- 🐳 **Docker support**: Use via Docker without local installation -- 💾 **Efficient storage**: JSON Lines format with streaming support -- 🐍 **Modern Python**: Full type hints, Python 3.13+ support +- **One-command setup**: `glazing init` downloads and prepares all datasets +- **Type-safe models**: Pydantic v2 validation for all data structures +- **Unified search**: Query across all datasets with consistent API +- **Cross-references**: Automatic mapping between resources with confidence scores +- **Fuzzy search**: Find data with typos, spelling variants, and inconsistencies +- **Docker support**: Use via Docker without local installation +- **Efficient storage**: JSON Lines format with streaming support +- **Modern Python**: Full type hints, Python 3.13+ support ## Installation @@ -34,7 +34,7 @@ Build and run Glazing in a containerized environment: ```bash # Build the image -git clone https://github.com/aaronstevenwhite/glazing.git +git clone https://github.com/factslab/glazing.git cd glazing docker build -t glazing:latest . @@ -167,11 +167,11 @@ Full documentation available at [https://glazing.readthedocs.io](https://glazing ## Contributing -We welcome contributions! See [CONTRIBUTING.md](https://github.com/aaronstevenwhite/glazing/blob/main/CONTRIBUTING.md) for guidelines. +We welcome contributions! See [CONTRIBUTING.md](https://github.com/factslab/glazing/blob/main/CONTRIBUTING.md) for guidelines. ```bash # Development setup -git clone https://github.com/aaronstevenwhite/glazing +git clone https://github.com/factslab/glazing cd glazing pip install -e ".[dev]" ``` @@ -185,22 +185,22 @@ If you use Glazing in your research, please cite: author = {White, Aaron Steven}, title = {Glazing: Unified Data Models and Interfaces for Syntactic and Semantic Frame Ontologies}, year = {2025}, - url = {https://github.com/aaronstevenwhite/glazing}, + url = {https://github.com/factslab/glazing}, doi = {10.5281/zenodo.17467082} } ``` ## License -This package is licensed under an MIT License. See [LICENSE](https://github.com/aaronstevenwhite/glazing/blob/main/LICENSE) file for details. +This package is licensed under an MIT License. See [LICENSE](https://github.com/factslab/glazing/blob/main/LICENSE) file for details. ## Links -- [GitHub Repository](https://github.com/aaronstevenwhite/glazing) +- [GitHub Repository](https://github.com/factslab/glazing) - [PyPI Package](https://pypi.org/project/glazing/) - [Documentation](https://glazing.readthedocs.io) -- [Issue Tracker](https://github.com/aaronstevenwhite/glazing/issues) +- [Issue Tracker](https://github.com/factslab/glazing/issues) ## Acknowledgments -This project was funded by a [National Science Foundation](https://www.nsf.gov/) ([BCS-2040831](https://www.nsf.gov/awardsearch/showAward?AWD_ID=2040831)) and builds upon the foundational work of the FrameNet, PropBank, VerbNet, and WordNet teams. +This project was funded by a [National Science Foundation](https://www.nsf.gov/) ([BCS-2040831](https://www.nsf.gov/awardsearch/showAward?AWD_ID=2040831)) and builds upon the foundational work of the FrameNet, PropBank, VerbNet, and WordNet teams. 
It was architected and implemented with the help of Claude Code. diff --git a/docs/api/index.md b/docs/api/index.md index 86288bf..b42cbcd 100644 --- a/docs/api/index.md +++ b/docs/api/index.md @@ -118,7 +118,7 @@ except ValidationError as e: ## Version Compatibility -This documentation covers Glazing version 0.2.1. Check your installed version: +This documentation covers Glazing version 0.2.2. Check your installed version: ```python import glazing diff --git a/docs/citation.md b/docs/citation.md index 051dcfb..c05d7a1 100644 --- a/docs/citation.md +++ b/docs/citation.md @@ -11,23 +11,23 @@ If you use Glazing in your research, please cite our work. author = {White, Aaron Steven}, title = {Glazing: Unified Data Models and Interfaces for Syntactic and Semantic Frame Ontologies}, year = {2025}, - url = {https://github.com/aaronstevenwhite/glazing}, - version = {0.2.1}, + url = {https://github.com/factslab/glazing}, + version = {0.2.2}, doi = {10.5281/zenodo.17467082} } ``` ### APA -White, A. S. (2025). *Glazing: Unified Data Models and Interfaces for Syntactic and Semantic Frame Ontologies* (Version 0.2.1) [Computer software]. https://github.com/aaronstevenwhite/glazing +White, A. S. (2025). *Glazing: Unified Data Models and Interfaces for Syntactic and Semantic Frame Ontologies* (Version 0.2.2) [Computer software]. https://github.com/factslab/glazing ### Chicago -White, Aaron Steven. 2025. *Glazing: Unified Data Models and Interfaces for Syntactic and Semantic Frame Ontologies*. Version 0.2.1. https://github.com/aaronstevenwhite/glazing. +White, Aaron Steven. 2025. *Glazing: Unified Data Models and Interfaces for Syntactic and Semantic Frame Ontologies*. Version 0.2.2. https://github.com/factslab/glazing. ### MLA -White, Aaron Steven. *Glazing: Unified Data Models and Interfaces for Syntactic and Semantic Frame Ontologies*. Version 0.2.1, 2025, https://github.com/aaronstevenwhite/glazing. +White, Aaron Steven. *Glazing: Unified Data Models and Interfaces for Syntactic and Semantic Frame Ontologies*. Version 0.2.2, 2025, https://github.com/factslab/glazing. ## Citing Datasets @@ -65,4 +65,4 @@ This project was funded by a [National Science Foundation](https://www.nsf.gov/) For questions about citing Glazing, contact: - Aaron Steven White: aaron.white@rochester.edu -- GitHub Issues: https://github.com/aaronstevenwhite/glazing/issues +- GitHub Issues: https://github.com/factslab/glazing/issues diff --git a/docs/contributing.md b/docs/contributing.md index 7403217..02c9b40 100644 --- a/docs/contributing.md +++ b/docs/contributing.md @@ -23,7 +23,7 @@ cd glazing 3. Add the upstream repository: ```bash -git remote add upstream https://github.com/aaronstevenwhite/glazing.git +git remote add upstream https://github.com/factslab/glazing.git ``` ### Development Setup @@ -245,7 +245,7 @@ Check the issue tracker for `enhancement` labels. 
Feel free to discuss implement Contributors are recognized in: -- The project's [CHANGELOG.md](https://github.com/aaronstevenwhite/glazing/blob/main/CHANGELOG.md) +- The project's [CHANGELOG.md](https://github.com/factslab/glazing/blob/main/CHANGELOG.md) - GitHub's contributor graph - Special mentions for significant contributions diff --git a/docs/index.md b/docs/index.md index 51cc33f..f4a55f9 100644 --- a/docs/index.md +++ b/docs/index.md @@ -2,8 +2,8 @@ [![PyPI version](https://img.shields.io/pypi/v/glazing)](https://pypi.org/project/glazing/) [![Python versions](https://img.shields.io/pypi/pyversions/glazing)](https://pypi.org/project/glazing/) -[![License](https://img.shields.io/pypi/l/glazing)](https://github.com/aaronstevenwhite/glazing/blob/main/LICENSE) -[![CI](https://github.com/aaronstevenwhite/glazing/actions/workflows/ci.yml/badge.svg?branch=main)](https://github.com/aaronstevenwhite/glazing/actions/workflows/ci.yml) +[![License](https://img.shields.io/pypi/l/glazing)](https://github.com/factslab/glazing/blob/main/LICENSE) +[![CI](https://github.com/factslab/glazing/actions/workflows/ci.yml/badge.svg?branch=main)](https://github.com/factslab/glazing/actions/workflows/ci.yml) [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.17467082.svg)](https://doi.org/10.5281/zenodo.17467082) Unified data models and interfaces for syntactic and semantic frame ontologies. @@ -74,14 +74,14 @@ Glazing is actively maintained and welcomes contributions. The project follows s ## Links -- [GitHub Repository](https://github.com/aaronstevenwhite/glazing) +- [GitHub Repository](https://github.com/factslab/glazing) - [PyPI Package](https://pypi.org/project/glazing/) -- [Issue Tracker](https://github.com/aaronstevenwhite/glazing/issues) -- [Changelog](https://github.com/aaronstevenwhite/glazing/blob/main/CHANGELOG.md) +- [Issue Tracker](https://github.com/factslab/glazing/issues) +- [Changelog](https://github.com/factslab/glazing/blob/main/CHANGELOG.md) ## License -This package is licensed under an MIT License. See [LICENSE](https://github.com/aaronstevenwhite/glazing/blob/main/LICENSE) file for details. +This package is licensed under an MIT License. See [LICENSE](https://github.com/factslab/glazing/blob/main/LICENSE) file for details. ## Citation @@ -92,8 +92,8 @@ If you use Glazing in your research, please cite: author = {White, Aaron Steven}, title = {Glazing: Unified Data Models and Interfaces for Syntactic and Semantic Frame Ontologies}, year = {2025}, - url = {https://github.com/aaronstevenwhite/glazing}, - version = {0.2.1}, + url = {https://github.com/factslab/glazing}, + version = {0.2.2}, doi = {10.5281/zenodo.17467082} } ``` diff --git a/docs/installation.md b/docs/installation.md index bd6a53f..5e0a898 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -22,7 +22,7 @@ pip install glazing To install the latest development version: ```bash -git clone https://github.com/aaronstevenwhite/glazing.git +git clone https://github.com/factslab/glazing.git cd glazing pip install -e . ``` @@ -149,7 +149,7 @@ Glazing provides a Docker image for containerized usage, allowing you to use the Clone the repository and build the image: ```bash -git clone https://github.com/aaronstevenwhite/glazing.git +git clone https://github.com/factslab/glazing.git cd glazing docker build -t glazing:latest . 
``` diff --git a/mkdocs.yml b/mkdocs.yml index d9f3d2f..ce2d0b8 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -2,8 +2,8 @@ site_name: Glazing Documentation site_description: Unified data models and interfaces for syntactic and semantic frame ontologies site_author: Aaron Steven White site_url: https://glazing.readthedocs.io -repo_name: aaronstevenwhite/glazing -repo_url: https://github.com/aaronstevenwhite/glazing +repo_name: factslab/glazing +repo_url: https://github.com/factslab/glazing edit_uri: edit/main/docs/ theme: @@ -127,7 +127,7 @@ markdown_extensions: extra: social: - icon: fontawesome/brands/github - link: https://github.com/aaronstevenwhite + link: https://github.com/factslab - icon: fontawesome/brands/python link: https://pypi.org/project/glazing/ version: diff --git a/pyproject.toml b/pyproject.toml index 4af2bf1..d8411ff 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "glazing" -version = "0.2.1" +version = "0.2.2" description = "Unified data models and interfaces for syntactic and semantic frame ontologies" readme = "README.md" requires-python = ">=3.13" @@ -60,9 +60,9 @@ docs = [ glazing = "glazing.cli:cli" [project.urls] -"Homepage" = "https://github.com/aaronstevenwhite/glazing" -"Bug Reports" = "https://github.com/aaronstevenwhite/glazing/issues" -"Source" = "https://github.com/aaronstevenwhite/glazing" +"Homepage" = "https://github.com/factslab/glazing" +"Bug Reports" = "https://github.com/factslab/glazing/issues" +"Source" = "https://github.com/factslab/glazing" "Documentation" = "https://glazing.readthedocs.io" [tool.setuptools.packages.find] diff --git a/src/glazing/__version__.py b/src/glazing/__version__.py index 7aa23f1..051b025 100644 --- a/src/glazing/__version__.py +++ b/src/glazing/__version__.py @@ -1,4 +1,4 @@ """Version information for the glazing package.""" -__version__ = "0.2.1" +__version__ = "0.2.2" __version_info__ = tuple(int(i) for i in __version__.split(".")) From 630147e59ffdced37cb808a303c0b32e3d6ca78b Mon Sep 17 00:00:00 2001 From: Aaron Steven White Date: Fri, 6 Feb 2026 15:41:52 -0500 Subject: [PATCH 10/11] Adds PR and issue templates. --- .github/ISSUE_TEMPLATE/bug_report.md | 36 +++++++++++++++++++++++ .github/ISSUE_TEMPLATE/feature_request.md | 27 +++++++++++++++++ .github/PULL_REQUEST_TEMPLATE.md | 30 +++++++++++++++++++ 3 files changed, 93 insertions(+) create mode 100644 .github/ISSUE_TEMPLATE/bug_report.md create mode 100644 .github/ISSUE_TEMPLATE/feature_request.md create mode 100644 .github/PULL_REQUEST_TEMPLATE.md diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 0000000..ef79b37 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,36 @@ +--- +name: Bug Report +about: Report a bug to help improve Glazing +title: "" +labels: bug +assignees: "" +--- + +## Description + + + +## Steps to reproduce + +1. +2. +3. 
+ +## Expected behavior + + + +## Actual behavior + + + +## Environment + +- Glazing version: +- Python version: +- OS: +- Installation method: + +## Additional context + + diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md new file mode 100644 index 0000000..bde95eb --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -0,0 +1,27 @@ +--- +name: Feature Request +about: Suggest a new feature or improvement +title: "" +labels: enhancement +assignees: "" +--- + +## Description + + + +## Motivation + + + +## Proposed solution + + + +## Alternatives considered + + + +## Additional context + + diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 0000000..0d487a8 --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,30 @@ +## Description + + + +## Type of Change + +- [ ] Bug fix (non-breaking change which fixes an issue) +- [ ] New feature (non-breaking change which adds functionality) +- [ ] Breaking change (fix or feature that would cause existing functionality to not work as expected) +- [ ] Documentation update + +## Key Changes + + + +- + +## Impact + + + +Closes # + +## Testing + +- [ ] All tests pass (`pytest`) +- [ ] New tests added (if applicable) +- [ ] Type checking passes (`mypy --strict src/`) +- [ ] Linting passes (`ruff check`) +- [ ] Formatting passes (`ruff format`) From 32819d5a043877d9936b810d7782100097e7b8bb Mon Sep 17 00:00:00 2001 From: Aaron Steven White Date: Fri, 6 Feb 2026 15:52:09 -0500 Subject: [PATCH 11/11] Fixes ruff PLW0108 and PLC0207 lint errors. --- src/glazing/cli/search.py | 20 ++++++++++---------- src/glazing/references/mapper.py | 4 ++-- src/glazing/search.py | 4 ++-- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/src/glazing/cli/search.py b/src/glazing/cli/search.py index 96244ac..a178065 100644 --- a/src/glazing/cli/search.py +++ b/src/glazing/cli/search.py @@ -175,7 +175,7 @@ def search() -> None: @click.option( "--data-dir", type=click.Path(exists=True, file_okay=False, dir_okay=True), - default=lambda: get_default_data_path(), + default=get_default_data_path, help="Directory containing converted JSON Lines files " "(default: ~/.local/share/glazing/converted).", ) @@ -318,7 +318,7 @@ def search_query( # noqa: PLR0913 @click.option( "--data-dir", type=click.Path(exists=True, file_okay=False, dir_okay=True), - default=lambda: get_default_data_path(), + default=get_default_data_path, help="Directory containing converted JSON Lines files " "(default: ~/.local/share/glazing/converted).", ) @@ -371,7 +371,7 @@ def get_entity( @click.option( "--data-dir", type=click.Path(exists=True, file_okay=False, dir_okay=True), - default=lambda: get_default_data_path(), + default=get_default_data_path, help="Directory containing converted JSON Lines files " "(default: ~/.local/share/glazing/converted).", ) @@ -456,7 +456,7 @@ def search_role( @click.option( "--data-dir", type=click.Path(exists=True, file_okay=False, dir_okay=True), - default=lambda: get_default_data_path(), + default=get_default_data_path, help="Directory containing converted JSON Lines files " "(default: ~/.local/share/glazing/converted).", ) @@ -518,7 +518,7 @@ def find_cross_ref( @click.option( "--data-dir", type=click.Path(exists=True, file_okay=False, dir_okay=True), - default=lambda: get_default_data_path(), + default=get_default_data_path, help="Directory containing converted JSON Lines files.", ) @click.option( @@ -579,7 +579,7 @@ def search_fuzzy( 
@click.option( "--data-dir", type=click.Path(exists=True, file_okay=False, dir_okay=True), - default=lambda: get_default_data_path(), + default=get_default_data_path, help="Directory containing converted JSON Lines files.", ) @click.option("--optional", is_flag=True, help="Find optional roles.") @@ -643,7 +643,7 @@ def search_roles( @click.option( "--data-dir", type=click.Path(exists=True, file_okay=False, dir_okay=True), - default=lambda: get_default_data_path(), + default=get_default_data_path, help="Directory containing converted JSON Lines files.", ) @click.option( @@ -722,7 +722,7 @@ def search_args( # noqa: PLR0913 @click.option( "--data-dir", type=click.Path(exists=True, file_okay=False, dir_okay=True), - default=lambda: get_default_data_path(), + default=get_default_data_path, help="Directory containing converted JSON Lines files.", ) @click.option( @@ -786,7 +786,7 @@ def search_relations( @click.option( "--data-dir", type=click.Path(exists=True, file_okay=False, dir_okay=True), - default=lambda: get_default_data_path(), + default=get_default_data_path, help="Directory containing converted JSON Lines files.", ) @click.option( @@ -874,7 +874,7 @@ def search_syntax( @click.option( "--data-dir", type=click.Path(exists=True, file_okay=False, dir_okay=True), - default=lambda: get_default_data_path(), + default=get_default_data_path, help="Directory containing converted JSON Lines files.", ) @click.option( diff --git a/src/glazing/references/mapper.py b/src/glazing/references/mapper.py index 10e1b95..b97ef33 100644 --- a/src/glazing/references/mapper.py +++ b/src/glazing/references/mapper.py @@ -919,8 +919,8 @@ def _generate_roleset_name(self, roleset_id: str) -> str: Descriptive name. """ if "." in roleset_id: - lemma_part = roleset_id.split(".")[0] - return f"{lemma_part} (sense {roleset_id.split('.')[-1]})" + lemma_part, sense_part = roleset_id.split(".", maxsplit=1) + return f"{lemma_part} (sense {sense_part})" return f"Roleset {roleset_id}" def _build_verbnet_member_refs( diff --git a/src/glazing/search.py b/src/glazing/search.py index 75ee365..5f4e849 100644 --- a/src/glazing/search.py +++ b/src/glazing/search.py @@ -876,7 +876,7 @@ def _propbank_to_verbnet_refs(self, entity_id: str) -> list[dict[str, str | floa if not self.propbank: return references - pb_frameset = self.propbank.by_lemma(entity_id.split(".")[0]) + pb_frameset = self.propbank.by_lemma(entity_id.split(".", maxsplit=1)[0]) if not pb_frameset: return references @@ -1051,7 +1051,7 @@ def _propbank_to_framenet_refs(self, entity_id: str) -> list[dict[str, str | flo if not self.propbank: return references - pb_frameset = self.propbank.by_lemma(entity_id.split(".")[0]) + pb_frameset = self.propbank.by_lemma(entity_id.split(".", maxsplit=1)[0]) if not pb_frameset: return references
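
---

The two ruff rules addressed in this final patch are narrow but easy to misread in diff form. PLW0108 (unnecessary lambda) fires when a lambda does nothing but forward to another callable, as in `default=lambda: get_default_data_path()`; PLC0207 (use-maxsplit-arg) fires when only part of a `str.split()` result is used without bounding the split. Below is a minimal standalone sketch of both patterns; the body of `get_default_data_path` and the `split_roleset` helper are illustrative reconstructions based on the help text and roleset format visible in the diff, not the package's actual implementations.

```python
"""Sketch of the patterns behind the PLW0108 and PLC0207 fixes (illustrative only)."""

from pathlib import Path


def get_default_data_path() -> Path:
    """Stand-in for the CLI default helper; path taken from the option help text."""
    return Path.home() / ".local" / "share" / "glazing" / "converted"


# PLW0108 (unnecessary lambda): `lambda: get_default_data_path()` only forwards
# the call, so the callable itself can be handed to any API that accepts a
# zero-argument factory. It is still evaluated lazily, at call time.
lazy_default = get_default_data_path


def split_roleset(roleset_id: str) -> tuple[str, str]:
    """Split a roleset ID such as 'abandon.01' into ('abandon', '01').

    PLC0207 (use-maxsplit-arg): when only the text on either side of the first
    separator is needed, bounding the split stops Python from splitting the
    remainder and yields both halves in a single call.
    """
    if "." not in roleset_id:
        return roleset_id, ""
    lemma, sense = roleset_id.split(".", maxsplit=1)
    return lemma, sense


if __name__ == "__main__":
    print(lazy_default())               # resolved only when invoked
    print(split_roleset("abandon.01"))  # -> ('abandon', '01')
```

Passing the callable directly preserves lazy evaluation of the option default (click accepts a callable and invokes it when the value is needed), which is presumably why the original code wrapped the helper in a lambda rather than calling it eagerly at import time.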