diff --git a/Cargo.lock b/Cargo.lock index 9001283..1e7d801 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -341,7 +341,7 @@ checksum = "2687e6cf9c00f48e9284cf9fd15f2ef341d03cc7743abf9df4c5f07fdee50b18" [[package]] name = "minimalist-grammar-parser" version = "0.1.0" -source = "git+https://github.com/MichaelGoodale/minimalist-grammar-parser.git#342aac1c51c1c4125e2b7ca27e9d91cde3ac73ed" +source = "git+https://github.com/MichaelGoodale/minimalist-grammar-parser.git#4876725b425835b5a205d94fe93aab8c68046e9b" dependencies = [ "ahash 0.8.12", "bitvec", @@ -557,11 +557,13 @@ name = "python-mg" version = "0.1.0" dependencies = [ "anyhow", + "itertools", "logprob", "minimalist-grammar-parser", "numpy", "pyo3", "rand", + "simple-semantics", ] [[package]] @@ -636,7 +638,7 @@ dependencies = [ "aho-corasick", "memchr", "regex-automata 0.4.14", - "regex-syntax 0.8.9", + "regex-syntax 0.8.10", ] [[package]] @@ -658,7 +660,7 @@ checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f" dependencies = [ "aho-corasick", "memchr", - "regex-syntax 0.8.9", + "regex-syntax 0.8.10", ] [[package]] @@ -669,9 +671,9 @@ checksum = "dbb5fb1acd8a1a18b3dd5be62d25485eb770e05afb408a9627d14d451bae12da" [[package]] name = "regex-syntax" -version = "0.8.9" +version = "0.8.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a96887878f22d7bad8a3b6dc5b7440e0ada9a245242924394987b21cf2210a4c" +checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a" [[package]] name = "rustc-hash" @@ -743,11 +745,12 @@ checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" [[package]] name = "simple-semantics" version = "0.1.0" -source = "git+https://github.com/MichaelGoodale/simple-semantics.git#2cef7d759b37c4a466b068f984bb549ec040b218" +source = "git+https://github.com/MichaelGoodale/simple-semantics.git#86afc64524a554b08ca693c7911671d919fb6413" dependencies = [ "ahash 0.8.12", "chumsky", "itertools", + "rand", "serde", 
"serde_json", "thiserror", diff --git a/Cargo.toml b/Cargo.toml index 59a5b41..6ecf8e0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,6 +9,7 @@ name = "python_mg" crate-type = ["cdylib"] [dependencies] +simple-semantics = { git = "https://github.com/MichaelGoodale/simple-semantics.git" } pyo3 = { version = "0.28.0", features = [ "anyhow", "extension-module", @@ -22,3 +23,4 @@ anyhow = "1.0.98" logprob = "0.2.1" rand = "0.10.0" numpy = "0.28.0" +itertools = "0.14.0" diff --git a/docs/source/index.rst b/docs/source/index.rst index fb4db23..d56d406 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -15,6 +15,7 @@ It provides the tools necessary to generate strings from a Minimalist Grammar an lexicon syntax + semantics metrics diff --git a/docs/source/semantics.rst b/docs/source/semantics.rst new file mode 100644 index 0000000..3d9b288 --- /dev/null +++ b/docs/source/semantics.rst @@ -0,0 +1,25 @@ +Semantics +========= + +These are the classes that allow you to evaluate Language of Thought expressions generated by Semantic lexica. + +.. autoclass:: python_mg.semantics.Scenario + :members: + :undoc-members: + +.. autoclass:: python_mg.semantics.Actor + :members: + :undoc-members: + +.. autoclass:: python_mg.semantics.Event + :members: + :undoc-members: + +.. autoclass:: python_mg.semantics.PossibleEvent + :members: + :undoc-members: + +.. autoclass:: python_mg.semantics.ScenarioGenerator + :members: + :undoc-members: + diff --git a/docs/source/syntax.rst b/docs/source/syntax.rst index 8344093..5dc30e9 100644 --- a/docs/source/syntax.rst +++ b/docs/source/syntax.rst @@ -1,5 +1,5 @@ -Syntax tree utilities -===================== +Syntax +====== These are the classes that are useful for manipulating or plotting parse trees directly. 
diff --git a/pyproject.toml b/pyproject.toml index b9f03b4..c7a2072 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,10 +12,10 @@ classifiers = [ ] dynamic = ["version"] dependencies = [ - "numpy>=1.24.4", - "pillow>=10.4.0", - "pydot>=4.0.1", - "rustworkx>=0.15.1", + "numpy>=1.24.4", + "pillow>=10.4.0", + "pydot>=4.0.1", + "rustworkx>=0.15.1", ] [tool.maturin] features = ["pyo3/extension-module"] @@ -24,16 +24,16 @@ module-name = "python_mg._lib_name" [dependency-groups] dev = [ - "patchelf>=0.17.2.2", - "pytest>=8.3.5", - "sphinx>=7.1.2", - "sphinx-rtd-theme>=3.0.2", + "patchelf>=0.17.2.2", + "pytest>=8.3.5", + "sphinx>=7.1.2", + "sphinx-rtd-theme>=3.0.2", ] examples = [ - "datasets>=3.1.0", - "scipy>=1.10.1", - "torch>=2.5.1", - "transformers[torch]>=4.46.3", + "datasets>=3.1.0", + "scipy>=1.10.1", + "torch>=2.5.1", + "transformers[torch]>=4.46.3", ] [tool.setuptools.package-data] @@ -43,4 +43,12 @@ examples = [ where = ["src"] [tool.uv] -cache-keys = [{file = "pyproject.toml"}, {file = "Cargo.toml"}, {file = "**/*.rs"}] +cache-keys = [ + { file = "pyproject.toml" }, + { file = "Cargo.toml" }, + { file = "**/*.rs" }, +] + +[tool.ruff.lint] +ignore = ["E501"] +select = ["E", "F", "D", "ANN"] diff --git a/python/python_mg/_lib_name.pyi b/python/python_mg/_lib_name.pyi index aa96e0c..819fbf1 100644 --- a/python/python_mg/_lib_name.pyi +++ b/python/python_mg/_lib_name.pyi @@ -1,4 +1,5 @@ -from typing import Sequence +import datetime +from typing import Literal, Sequence import numpy as np import numpy.typing as npt @@ -6,93 +7,57 @@ from python_mg.syntax import ParseTree class MGNode: def is_trace(self) -> bool: ... - def trace_id(self) -> int: - """Gets the trace id of traces and raises an error otherwise""" - - def lemma_string(self) -> str: - """Format the node as a string in a tree if leaf or trace""" - - def is_stolen(self) -> str: - """Checks if the head has been stolen by head-movement""" + def trace_id(self) -> int: ... 
+ def lemma_string(self) -> str: ... + def is_stolen(self) -> str: ... class MGEdge: - def is_move(self) -> bool: - """Checks whether the edge is a movement edge""" - - def is_head_move(self) -> bool: - """Checks whether the edge is a head-movement edge""" - - def is_merge(self) -> bool: - """Checks whether the edge is a merge edge""" + def is_move(self) -> bool: ... + def is_head_move(self) -> bool: ... + def is_merge(self) -> bool: ... class SyntacticStructure: - """A parse tree for some string""" + """A parse tree for some string.""" def __init__(self) -> None: ... - def log_prob(self) -> float: - """Return the log probability.""" - - def n_steps(self) -> int: - """Returns the number of steps in the derivation of this structure""" - - def contains_lexical_entry(self, s: str) -> bool: - """Check if this structure contains a specific lexical entry (formatted as an MG entry, will raise an error if unparseable)""" - - def contains_word(self, s: str | None) -> bool: - """Check if this structure contains a specific word.""" - - def prob(self) -> float: - """Return the probability of this syntactic structure.""" - - def latex(self) -> str: - """Return a LaTeX representation of this syntactic structure.""" - - def to_tree(self) -> ParseTree: - """Converts a syntactic structure into a graph structure""" - - def max_memory_load(self) -> int: - """Gets the largest amount of movers at a single point""" - - def tokens(self) -> npt.NDArray[np.uint]: - """Converts the string of this SyntacticStructure into a tokenized numpy array""" - + def pronunciation(self) -> list[str]: ... + def log_prob(self) -> float: ... + def n_steps(self) -> int: ... + def contains_lexical_entry(self, s: str) -> bool: ... + def contains_word(self, s: str | None) -> bool: ... + def prob(self) -> float: ... + def latex(self) -> str: ... + def to_tree(self) -> ParseTree: ... + def max_memory_load(self) -> int: ... + def tokens(self) -> npt.NDArray[np.uint]: ... 
+ @property + def meaning(self) -> list[str] | None: ... def __to_tree_inner( self, ) -> tuple[list[tuple[int, MGNode]], list[tuple[int, int, MGEdge]], int]: ... class Continuation: - """A continuation of a prefix string""" + """A continuation of a prefix string.""" def __init__(self, word: str) -> None: ... @staticmethod def EOS() -> "Continuation": ... - def is_end_of_string(self) -> bool: - """Check if the continuation is a end of string marker""" - - def is_word(self) -> bool: - """Check if the continuation is a word""" - - def is_multi_word(self) -> bool: - """Check if the continuation is an affixed word""" + def is_end_of_string(self) -> bool: ... + def is_word(self) -> bool: ... + def is_multi_word(self) -> bool: ... class GrammarIterator: def __iter__(self) -> GrammarIterator: ... def __next__(self) -> SyntacticStructure: ... class Lexicon: - """A Minimalist Grammar Lexicon""" + """A Minimalist Grammar Lexicon.""" def __init__(self, s: str) -> None: ... @staticmethod - def random_lexicon(lemmas: list[str]) -> "Lexicon": - """Generate a random lexicon from the list of lemmas""" - - def mdl(self, n_phonemes: int) -> float: - """Returns the model description length of the lexicon""" - - def is_semantic(self) -> bool: - """Returns whether the lexicon has semantic interpretations""" - + def random_lexicon(lemmas: list[str]) -> "Lexicon": ... + def mdl(self, n_phonemes: int) -> float: ... + def is_semantic(self) -> bool: ... def continuations( self, prefix: str, @@ -102,9 +67,7 @@ class Lexicon: max_steps: int | None = 64, n_beams: int | None = 256, max_strings: int | None = None, - ) -> set[Continuation]: - """Returns a set of all valid continuations from this prefix""" - + ) -> set[Continuation]: ... 
def generate_unique_strings( self, category: str, @@ -113,9 +76,7 @@ class Lexicon: max_steps: int | None = 64, n_beams: int | None = 256, max_strings: int | None = None, - ) -> list[tuple[list[str], float]]: - """Returns a list of all unique strings and their probabilities""" - + ) -> list[tuple[list[str], float]]: ... def generate_grammar( self, category: str, @@ -124,9 +85,7 @@ class Lexicon: max_steps: int | None = 64, n_beams: int | None = 256, max_strings: int | None = None, - ) -> GrammarIterator: - """Returns an iterator over all possible parses""" - + ) -> GrammarIterator: ... def parse( self, s: str, @@ -136,11 +95,7 @@ class Lexicon: max_steps: int | None = 64, n_beams: int | None = 256, max_strings: int | None = None, - ) -> list[SyntacticStructure]: - """Returns a list of all possible parses of that string. - The string, s, should be delimited by spaces for words and hyphens for multi-word expressions from head-movement - """ - + ) -> list[SyntacticStructure]: ... def parse_tokens( self, s: Sequence[int] | npt.NDArray[np.uint], @@ -150,21 +105,13 @@ class Lexicon: max_steps: int | None = 64, n_beams: int | None = 256, max_strings: int | None = None, - ) -> list[SyntacticStructure]: - """Returns a list of all possible parses of a string represented by tokens.""" - - def tokens(self) -> dict[str, int]: - pass - - def detokenize(self, s: Sequence[int] | npt.NDArray[np.uint]) -> list[str]: - pass - + ) -> list[SyntacticStructure]: ... + def tokens(self) -> dict[str, int]: ... + def detokenize(self, s: Sequence[int] | npt.NDArray[np.uint]) -> list[str]: ... def detokenize_batch( self, s: Sequence[Sequence[int]] | list[npt.NDArray[np.uint]] | npt.NDArray[np.uint], - ) -> list[list[str]]: - pass - + ) -> list[list[str]]: ... 
def token_continuations( self, x: npt.NDArray[np.uint], @@ -173,5 +120,68 @@ class Lexicon: move_prob: float = 0.5, max_steps: int | None = 64, n_beams: int | None = 256, - ) -> npt.NDArray[np.bool]: - pass + ) -> npt.NDArray[np.bool]: ... + +class Actor: + name: str + properties: set[str] + + def __init__( + self, + name: str, + properties: set[str] | None = None, + ) -> None: ... + +class Event: + agent: str | None + patient: str | None + properties: set[str] + + def __init__( + self, + agent: str | None = None, + patient: str | None = None, + properties: set[str] | None = None, + ) -> None: ... + +class PossibleEvent: + has_agent: bool + has_patient: bool + is_reflexive: bool + name: str + + def __init__( + self, + name: str, + has_agent: bool = True, + has_patient: bool = False, + is_reflexive: bool = True, + ) -> None: ... + def event_kind(self) -> Literal[ + "Transitive", + "TransitiveNonReflexive", + "Unergative", + "Unaccusative", + "Avalent", + ]: ... + +class Scenario: + actors: list[Actor] + events: list[Event] + questions: list[str] + + def __init__(self, s: str) -> None: ... + def evaluate( + self, + expression: str, + max_steps: int | None = 256, + timeout: datetime.timedelta | None = None, + ) -> bool | Actor | Event | set[Actor] | set[Event]: ... + @staticmethod + def all_scenarios( + actors: list[str], event_kinds: list[PossibleEvent], actor_properties: list[str] + ) -> ScenarioGenerator: ... + +class ScenarioGenerator: + def __iter__(self) -> ScenarioGenerator: ... + def __next__(self) -> Scenario: ... diff --git a/python/python_mg/metrics.py b/python/python_mg/metrics.py index f1e1253..4249d07 100644 --- a/python/python_mg/metrics.py +++ b/python/python_mg/metrics.py @@ -10,16 +10,18 @@ def grammar_f1( preds: npt.NDArray[np.float64], correct: npt.NDArray[np.bool], ) -> dict[str, npt.NDArray[np.float64]]: - """ - Compute grammar F1 scores from boolean arrays of valid next moves and predictions. 
- The metric is described in `Meta-Learning Neural Mechanisms rather than Bayesian Priors `_ (Goodale et al., ACL 2025) + """Compute grammar F1 scores from boolean arrays of next moves and predictions. + + The metric is described in `Meta-Learning Neural Mechanisms rather than Bayesian + Priors `_ (Goodale et al., ACL 2025) Parameters ---------- preds : ndarray of float64 Predicted log probabilities for each token. Shape (..., seq_length, vocab_size). correct: ndarray of int - Boolean array for each valid token that can come next at that point in the sequence. Shape (..., seq_length, vocab_size). + Boolean array for each next valid token in the sequence. + Shape (..., seq_length, vocab_size). Returns ------- @@ -29,6 +31,7 @@ def grammar_f1( - 'precision': Precision scores - 'recall': Recall scores - 'f1': F1 scores + """ if preds.shape != correct.shape: raise ValueError("correct and preds must have matching shapes") @@ -66,14 +69,16 @@ def grammar_f1_from_strings( n_beams: int | None = 256, reduction: Literal["none", "sentence_mean", "length_mean"] = "sentence_mean", ) -> dict[str, npt.NDArray[np.float64]]: - """ - Compute grammar F1 scores from token sequences and predictions. - The metric is described in `Meta-Learning Neural Mechanisms rather than Bayesian Priors `_ (Goodale et al., ACL 2025) + """Compute grammar F1 scores from token sequences and predictions. + + The metric is described in `Meta-Learning Neural Mechanisms rather than Bayesian + Priors `_ (Goodale et al., ACL 2025) Parameters ---------- lexicon : Lexicon + the lexicon to use as ground truth for the measurement tokens : ndarray of int Token IDs representing the input sequences. Shape (..., seq_length). 
preds : ndarray of float64 @@ -108,11 +113,11 @@ def grammar_f1_from_strings( - 'precision': Precision scores - 'recall': Recall scores - 'f1': F1 scores - """ + """ if np.any(tokens < 0): raise ValueError( - "Some tokens are negative which means they will be cast to unsigned integers incorrectly" + "Some tokens are negative meaning they will be cast to unsigned integers incorrectly" ) conts = lexicon.token_continuations( diff --git a/python/python_mg/semantics.py b/python/python_mg/semantics.py new file mode 100644 index 0000000..014a47d --- /dev/null +++ b/python/python_mg/semantics.py @@ -0,0 +1,5 @@ +"""Defines tools related to semantics and interpretation of semantic grammars.""" + +from python_mg._lib_name import Scenario, Actor, Event, PossibleEvent, ScenarioGenerator + +__all__ = ["Scenario", "Actor", "Event", "PossibleEvent", "ScenarioGenerator"] diff --git a/python/python_mg/syntax.py b/python/python_mg/syntax.py index d20e524..30790c4 100644 --- a/python/python_mg/syntax.py +++ b/python/python_mg/syntax.py @@ -1,3 +1,5 @@ +"""Defines tools related to syntax and viewing trees.""" + from __future__ import annotations from dataclasses import dataclass from python_mg._lib_name import SyntacticStructure, MGNode, MGEdge @@ -6,14 +8,17 @@ from rustworkx.visualization import graphviz_draw -def sort_key(G: rx.PyDiGraph[MGNode, MGEdge], e: int) -> int: - (n, _) = G.get_edge_endpoints_by_index(e) +def _sort_key(G: rx.PyDiGraph[MGNode, MGEdge], e: int) -> int: + n, _ = G.get_edge_endpoints_by_index(e) return G.get_node_data(n).trace_id() @dataclass class Mover: - """A list of words used to indicate where movement has occurred. See :meth:`python_mg.ParseTree.base_string`""" + """A list of words used to indicate where movement has occurred. + + See :meth:`python_mg.ParseTree.base_string`. 
+ """ s: list[str | Mover | Trace] """ The moved words """ @@ -24,13 +29,14 @@ class Mover: @dataclass class Trace: - """A representation of a trace index left by movement""" + """A representation of a trace index left by movement.""" trace: int """ the trace ID """ -def node_attrs(node: MGNode): +def node_attrs(node: MGNode) -> dict[str, str]: + """Get the attributes that defines node styling.""" attrs = {"label": str(node), "ordering": "out"} if node.is_stolen(): attrs["style"] = "dashed" @@ -41,77 +47,89 @@ def node_attrs(node: MGNode): def edge_attrs(edge: MGEdge) -> dict[str, str]: + """Get the attributes that defines edge styling.""" if edge.is_move() or edge.is_head_move(): return {"style": "dashed", "constraint": "false"} return {} class ParseTree: - """A class used for ParseTree that is generated by :meth:`python_mg.SyntacticStructure.to_tree`. - It can be used to get a GraphViz representation of the tree or to investigate the ParseTree as a graph. + """A class for ParseTree generated by :meth:`python_mg.SyntacticStructure.to_tree`. + + It can be used to get a GraphViz representation of the tree or to investigate the + ParseTree as a graph. 
""" def __init__( self, G: rx.PyDiGraph[MGNode, MGEdge], root: int, structure: SyntacticStructure - ): + ) -> None: + """Make a new ParseTree.""" self.root: int = root self.structure: SyntacticStructure = structure movement_edges = sorted( [x for x in G.filter_edges(lambda x: x.is_move())], - key=lambda x: sort_key(G, x), + key=lambda x: _sort_key(G, x), reverse=True, ) movements: dict[int, int] = {} for e in movement_edges: - (src, tgt) = G.get_edge_endpoints_by_index(e) + src, tgt = G.get_edge_endpoints_by_index(e) trace_id = G.get_node_data(src).trace_id() movements[trace_id] = tgt self.__movement_sources: dict[int, int] = {m: i for i, m in movements.items()} self.G: rx.PyDiGraph[MGNode, MGEdge] = G - """PyDiGraph[MGNode, MGEdge]: A `RustworkX `_ PyDiGraph which contains the syntactice structure of a sentence""" + """PyDiGraph[MGNode, MGEdge]: A `RustworkX ` + _ PyDiGraph which contains the syntactice structure of a sentence""" def normal_string(self) -> str: - """The string used by a ParseTree + """Get the string used by a ParseTree. Returns ------- str the parsed sentence + """ return str(self.structure) def base_string(self) -> list[str | Mover | Trace]: - """A richer representation of the parsed string, with traces where movement had occurred, and :meth:`python_mg.Mover` objects to indicated moved phrases. + """Get a richer representation of the parsed string. + + This representation has traces where movement had occurred, and + :meth:`python_mg.Mover` objects to indicated moved phrases. Returns ------- str the parsed sentence + """ linear_order = self.__explore(self.root) return linear_order def to_dot(self) -> str | None: - """Converts a tree to GraphViz DOT format + """Convert a tree to GraphViz DOT format. Returns ------- str The dot file for this tree + """ return self.G.to_dot(node_attr=node_attrs, edge_attr=edge_attrs) def to_image(self) -> Image.Image: - """Converts a tree to a PIL Image + """Convert a tree to a PIL Image. 
Returns ------- Image An image representation of the tree + """ return graphviz_draw( self.G, @@ -144,15 +162,17 @@ def __explore(self, n_i: int) -> list[str | Mover | Trace]: def to_tree(self: SyntacticStructure) -> ParseTree: - """Converts a SyntacticStructure to a ParseTree + """Convert a SyntacticStructure to a ParseTree. Returns ------- - The SyntacticStructure as a :meth:`python_mg.ParseTree` + :meth:`python_mg.ParseTree` + The SyntacticStructure as a :meth:`python_mg.ParseTree` + """ - (nodes, edges, root) = self.__to_tree_inner() # pyright: ignore[reportPrivateUsage] + nodes, edges, root = self.__to_tree_inner() # pyright: ignore[reportPrivateUsage] - # This will usually be the identity function, but on the off chance its not, we do this. + # This will usually be the identity function, but if not, we do this. # Waste computation in exchange for not having a horrible headache old2new: dict[int, int] = {} diff --git a/python/tests/test_mg.py b/python/tests/test_mg.py index ec795bd..b8a3172 100644 --- a/python/tests/test_mg.py +++ b/python/tests/test_mg.py @@ -1,11 +1,13 @@ -import pytest +# ruff: disable[D103,D100,E501] + import pickle from python_mg import Lexicon, Continuation +from python_mg.semantics import Scenario, Actor, Event from python_mg.syntax import Trace, Mover -def test_lexicon(): +def test_lexicon() -> None: x = Lexicon("a::b= a\nb::b") assert [str(s) for s in x.generate_grammar("a")] == ["a b"] parse = next(x.generate_grammar("a")) @@ -15,13 +17,13 @@ def test_lexicon(): ) -def test_pickling(): +def test_pickling() -> None: x = Lexicon("a::b= a\nb::b") x_pickle = pickle.dumps(x) assert pickle.loads(x_pickle) == x -def test_memory_load(): +def test_memory_load() -> None: grammar = Lexicon("a::b= c= +a +e C\nb::b -a\nc::c -e") parse = grammar.parse("c b a", "C")[0] assert parse.max_memory_load() == 2 @@ -30,16 +32,77 @@ def test_memory_load(): assert parse.max_memory_load() == 1 -def test_semantic_lexicon(): - grammar = """John::d::a_j -run::=d 
v::lambda a x some_e(e, pe_run(e), AgentOf(x,e)) -Mary::d::a_m +def test_generation() -> None: + grammar = """John::d::a_John +runs::=d v::lambda a x some_e(e, pe_run(e), AgentOf(x,e)) +Mary::d::a_Mary +likes::d= =d v::lambda a x lambda a y some_e(e, pe_likes(e), AgentOf(y,e) & PatientOf(x, e))""" + lexicon = Lexicon(grammar) + strings = [str(p) for p in lexicon.generate_grammar("v")] + assert strings == [ + "John runs", + "Mary runs", + "Mary likes John", + "John likes John", + "John likes Mary", + "Mary likes Mary", + ] + + +def test_semantic_lexicon() -> None: + grammar = """John::d::a_John +runs::=d v::lambda a x some_e(e, pe_run(e), AgentOf(x,e)) +Mary::d::a_Mary likes::d= =d v::lambda a x lambda a y some_e(e, pe_likes(e), AgentOf(y,e) & PatientOf(x, e))""" semantic_lexicon = Lexicon(grammar) assert semantic_lexicon.is_semantic() + s = semantic_lexicon.parse("John likes Mary", "v") + assert len(s) == 1 + parse = s[0] + assert parse.meaning is not None + assert parse.meaning == [ + "some_e(x, pe_likes(x), AgentOf(a_John, x) & PatientOf(a_Mary, x))" + ] + meaning: str = parse.meaning[0] + + s = Scenario( + " lambda a x some_e(e, pe_likes(e), AgentOf(x, e)); lambda a x some_e(e, pe_likes(e), PatientOf(x, e))" + ) + assert len(s.questions) == 2 + + assert s.evaluate(meaning) + answers = [ + s.evaluate(f"({q})(a_{name})") for q, name in zip(s.questions, ["John", "Mary"]) + ] + assert answers[0] + assert answers[1] + + +def test_scenario() -> None: + s = Scenario("") + assert s.actors == [Actor("John", properties={"nice", "quick"})] + assert s.events == [Event(agent="John", properties={"run"})] + + scenarios: list[Scenario] = [ + x for x in Scenario.all_scenarios(["John", "Mary"], [], ["kind"]) + ] + assert len(scenarios) == 9 + + phi = Scenario("").evaluate( + "(lambda a x some_e(e, pe_runs(e), AgentOf(x, e)))(a_John)" + ) + assert isinstance(phi, bool) + assert phi + + john = Scenario("").evaluate( + "iota(x, some_e(e, pe_runs(e), AgentOf(x, e)))" + ) + assert 
isinstance(john, Actor) + assert john.name == "John" + assert john.properties == {"cool"} -def test_trees(): +def test_trees() -> None: grammar = """ ::T= C ::T= +W C @@ -116,7 +179,7 @@ def test_trees(): assert tree.to_dot() == digraph -def test_continuations(): +def test_continuations() -> None: x = Lexicon("a::b= S\nb::b") assert x.continuations("a", "S") == {Continuation("b")} x = Lexicon("a::S= b= S\n::S\nb::b") diff --git a/rust-toolchain.toml b/rust-toolchain.toml index 45ef49e..c4ea774 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,3 +1,3 @@ [toolchain] profile = "default" -channel = "1.88.0" +channel = "1.93.0" diff --git a/src/graphing.rs b/src/graphing.rs index 525c3a2..90617c4 100644 --- a/src/graphing.rs +++ b/src/graphing.rs @@ -2,6 +2,7 @@ use minimalist_grammar_parser::parsing::rules::{TreeEdge, TreeNode}; use pyo3::{exceptions::PyValueError, prelude::*}; use std::fmt::Display; +///A node on a tree. #[pyclass(name = "MGNode", str, eq, frozen)] #[derive(Debug, PartialEq, Eq)] pub struct PyMgNode(pub TreeNode<'static, String, String>); @@ -59,6 +60,7 @@ impl PyMgNode { } } +///A node representing the edge in a tree, whether in merging or movement. #[pyclass(name = "MGEdge", str, eq, frozen)] #[derive(Debug, PartialEq, PartialOrd, Ord, Eq)] pub struct PyMgEdge(pub TreeEdge); @@ -80,14 +82,32 @@ impl Display for PyMgEdge { #[pymethods] impl PyMgEdge { + ///Check if the edge is a movement edge. + /// + ///Returns + ///------- + ///bool + /// Whether it's a movement edge. fn is_move(&self) -> bool { matches!(self.0, TreeEdge::Move) } + ///Check if the edge is a head-movement edge. + /// + ///Returns + ///------- + ///bool + /// Whether it's a head-movement edge. fn is_head_move(&self) -> bool { matches!(self.0, TreeEdge::MoveHead) } + ///Check if the edge is a merge edge. + /// + ///Returns + ///------- + ///bool + /// Whether it's a merge edge. 
fn is_merge(&self) -> bool { matches!(self.0, TreeEdge::Merge(_)) } diff --git a/src/lib.rs b/src/lib.rs index 5cb8532..3439ecf 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -7,181 +7,28 @@ use std::{ use anyhow::anyhow; use logprob::LogProb; use minimalist_grammar_parser::{ - Generator, ParsingConfig, PhonContent, Pronounciation, RulePool, + Generator, ParsingConfig, PhonContent, Pronounciation, lexicon::{LexemeId, LexicalEntry, Lexicon, SemanticLexicon}, parsing::beam::Continuation, }; use pyo3::{exceptions::PyValueError, prelude::*}; -mod graphing; +pub mod graphing; use graphing::{PyMgEdge, PyMgNode}; -use crate::tokenizers::TokenMap; - -#[pyclass(name = "SyntacticStructure", str, eq, frozen)] -#[derive(Debug)] -///The representation of a syntactic structure generated by a grammar, or alternatively the result -///of parsing a string. -struct PySyntacticStructure { - prob: LogProb, - string: Vec>, - rules: RulePool, - lex: Py, -} - -impl PartialEq for PySyntacticStructure { - fn eq(&self, other: &Self) -> bool { - self.prob == other.prob && self.string == other.string && self.rules == other.rules - } -} - -impl Display for PySyntacticStructure { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - let len = self.string.len(); - for (i, x) in self.string.iter().enumerate() { - match x { - PhonContent::Normal(s) => write!(f, "{s}")?, - PhonContent::Affixed(items) => write!(f, "{}", items.join("-"))?, - }; - if i != len - 1 { - write!(f, " ")?; - } - } - Ok(()) - } -} - -#[pymethods] -impl PySyntacticStructure { - ///The log probability of generating this SyntacticStructure using its associated Lexicon. 
- /// - ///Returns - ///------- - ///float - /// the log probability - fn log_prob(&self) -> f64 { - self.prob.into_inner() - } - - fn contains_lexical_entry(&self, s: &str) -> PyResult { - let lex = self.lex.get(); - let entry = LexicalEntry::parse(s).map_err(|e| PyValueError::new_err(e.to_string()))?; - Ok(lex - .lexeme_to_id - .get(&entry) - .is_some_and(|x| self.rules.used_lemmas().any(|y| &y == x))) - } - - ///The probability of generating this SyntacticStructure using its associated Lexicon. - /// - ///Parameters - ///---------- - ///s : str or None - /// The word (or empty word) that may or may not be present - /// - ///Returns - ///------- - ///bool - /// whether the word is present in the structure - fn contains_word(&self, mut s: Option<&str>) -> bool { - let lex = self.lex.get(); - if let Some(s_inner) = &s - && s_inner.is_empty() - { - s = None; - } - lex.lemma_to_id - .get(&s.into()) - .is_some_and(|x| self.rules.used_lemmas().any(|y| x.contains(&y))) - } - - ///The probability of generating this SyntacticStructure using its associated Lexicon. - /// - ///Returns - ///------- - ///float - /// the probability of the structure - fn prob(&self) -> f64 { - self.prob.into_inner().exp() - } - - ///The number of derivational steps necessary to derive this SyntacticStructure using its Lexicon - /// - ///Returns - ///------- - ///int - /// the number of steps - fn n_steps(&self) -> usize { - self.rules.n_steps() - } - - ///Turns the SyntacticStructure into a tree that can be rendered with LaTeX. - ///Requires including `latex-commands.tex `_) in the LaTeX preamble. - /// - ///Returns - ///------- - ///str - /// A LaTeX representation of the parse tree - fn latex(&self) -> String { - let lex = self.lex.get(); - lex.lexicon - .lexicon() - .derivation(self.rules.clone()) - .tree() - .latex() - } - - ///The maximum number of moving elements stored in memory at one time. 
- /// - ///Returns - ///------- - ///int - /// the maximum number of moved items held in memory in the derivation - fn max_memory_load(&self) -> usize { - self.rules.max_memory_load() - } - - #[allow(clippy::type_complexity)] - fn __to_tree_inner(&self) -> (Vec<(usize, PyMgNode)>, Vec<(usize, usize, PyMgEdge)>, usize) { - let d = self - .lex - .get() - .lexicon - .lexicon() - .derivation(self.rules.clone()); - let tree = d.tree(); - let (g, root) = tree.petgraph(); - let nodes = g - .node_indices() - .map(|n| { - ( - n.index(), - PyMgNode( - g.node_weight(n) - .unwrap() - .clone() - .map(|x| x.to_string(), |x| x.to_string()), - ), - ) - }) - .collect::>(); - - let mut edges = g - .edge_indices() - .map(|e| { - let (src, tgt) = g.edge_endpoints(e).unwrap(); - ( - src.index(), - tgt.index(), - PyMgEdge(*g.edge_weight(e).unwrap()), - ) - }) - .collect::>(); - - edges.sort_by_key(|(_, _, x)| x.0); - (nodes, edges, root.index()) - } -} +mod semantics; +mod syntax; +mod tokenizers; +use syntax::PySyntacticStructure; + +use crate::{ + semantics::{ + PyPossibleEvent, PyScenarioGenerator, + lot_types::{PyActor, PyEvent}, + scenario::PyScenario, + }, + tokenizers::TokenMap, +}; #[derive(Debug, Clone, Eq, PartialEq)] enum PossiblySemanticLexicon { @@ -228,12 +75,20 @@ impl SelfOwningLexicon { }) } - fn lexicon(&self) -> &Lexicon<&'static str, &'static str> { + #[expect(clippy::needless_lifetimes)] + fn lexicon<'a>(&'a self) -> &'a Lexicon<&'a str, &'a str> { match &self.lexicon { PossiblySemanticLexicon::Normal(lexicon) => lexicon, PossiblySemanticLexicon::Semantic(semantic_lexicon) => semantic_lexicon.lexicon(), } } + + fn semantic_lexicon<'a>(&'a self) -> Option<&'a SemanticLexicon<'a, &'a str, &'a str>> { + match &self.lexicon { + PossiblySemanticLexicon::Normal(_) => None, + PossiblySemanticLexicon::Semantic(lex) => Some(lex), + } + } } impl Display for SelfOwningLexicon { @@ -252,7 +107,58 @@ impl Display for SelfOwningLexicon { )] #[derive(Debug, Clone, Eq, PartialEq)] 
///A MG grammar that can be used to generate SyntacticStructures or parse strings into -///SyntacticStructures +///SyntacticStructures. +/// +///You may include semantic interpretations or not. You may also generate all valid sentences in the grammar. +/// +///Parameters +///---------- +///s : str +/// +///Raises +///------ +///ValueError +/// If the string is not a valid lexicon. +/// +///Examples +///-------- +///Generating all sentences of a grammar. +/// +///.. code-block:: python +/// +/// grammar = """John::d +/// runs::=d v +/// Mary::d +/// likes::d= =d v""" +/// lexicon = Lexicon(grammar) +/// strings = [str(p) for p in lexicon.generate_grammar("v")] +/// assert strings == [ +/// "John runs", +/// "Mary runs", +/// "Mary likes John", +/// "John likes John", +/// "John likes Mary", +/// "Mary likes Mary", +/// ] +/// +///Creating a lexicon with interpretations and getting the interpretation of a sentence. +/// +///.. code-block:: python +/// +/// grammar = """John::d::a_John +/// run::=d v::lambda a x some_e(e, pe_run(e), AgentOf(x,e)) +/// Mary::d::a_Mary +/// likes::d= =d v::lambda a x lambda a y some_e(e, pe_likes(e), AgentOf(y,e) & PatientOf(x, e))""" +/// semantic_lexicon = Lexicon(grammar) +/// assert semantic_lexicon.is_semantic() +/// s = semantic_lexicon.parse("John likes Mary", "v") +/// assert len(s) == 1 +/// parse = s[0] +/// assert parse.meaning is not None +/// assert parse.meaning == [ +/// "some_e(x, pe_likes(x), AgentOf(a_John, x) & PatientOf(a_Mary, x))" +/// ] +/// struct PyLexicon { word_id: TokenMap, lexeme_to_id: HashMap, LexemeId>, @@ -262,14 +168,18 @@ struct PyLexicon { lexicon: SelfOwningLexicon, } -mod tokenizers; - impl Display for PyLexicon { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "MGLexicon{{\n{}\n}}", self.lexicon) } } +impl PyLexicon { + fn semantics<'a>(&'a self) -> Option<&'a SemanticLexicon<'a, &'a str, &'a str>> { + self.lexicon.semantic_lexicon() + } +} + #[pyclass] struct 
GrammarIterator { generator: Generator, String, String>, @@ -285,21 +195,21 @@ impl GrammarIterator { } fn __next__(mut slf: PyRefMut<'_, Self>) -> Option { - if let Some(n) = slf.max_strings { - if slf.n_strings >= n { - return None; - } + if let Some(n) = slf.max_strings + && slf.n_strings >= n + { + return None; } if let Some((prob, string, rules)) = slf.generator.next() { slf.n_strings += 1; let py = slf.py(); - Some(PySyntacticStructure { + Some(PySyntacticStructure::new( + slf.lexicon.clone_ref(py), prob, string, rules, - lex: slf.lexicon.clone_ref(py), - }) + )) } else { None } @@ -420,21 +330,31 @@ fn get_config( impl PyLexicon { fn from_lexicon(lexicon: SelfOwningLexicon) -> PyResult { + //unsafe here because the lexicon has the lifetime of the reference of the SelfOwningLexicon. + //We are owning it in the arc, so we have to make sure we can refer to it. + let lexeme_to_id: HashMap<_, LexemeId> = lexicon .lexicon() .lexemes_and_ids() .map_err(|e| anyhow!(e))? - .map(|(id, entry)| (entry, id)) + .map(|(id, entry)| { + let entry: LexicalEntry<&'static str, &'static str> = + unsafe { std::mem::transmute(entry) }; + (entry, id) + }) .collect(); let mut lemma_to_id = HashMap::default(); let mut word_id = TokenMap::default(); for leaf in lexicon.lexicon().leaves().iter().copied() { - let lemma = *lexicon + let lemma = lexicon .lexicon() .leaf_to_lemma(leaf) .expect("Invalid lexicon!"); + + let lemma: Pronounciation<&'static str> = unsafe { std::mem::transmute(*lemma) }; + if let Pronounciation::Pronounced(word) = lemma.as_ref() { word_id.add_word(word); } @@ -453,7 +373,7 @@ impl PyLexicon { impl PyLexicon { #[allow(clippy::too_many_arguments)] fn inner_parse( - slf: PyRef<'_, Self>, + slf: &Bound<'_, Self>, s: &[PhonContent<&str>], category: String, min_log_prob: Option, @@ -462,48 +382,27 @@ impl PyLexicon { n_beams: Option, max_parses: Option, ) -> PyResult> { + let lex = slf.borrow(); let config = get_config(min_log_prob, move_prob, max_steps, n_beams)?; - 
let parser = slf + let parser = lex .lexicon .lexicon() .parse(s, category.as_str(), &config) .map_err(|e| PyValueError::new_err(e.to_string()))?; - let py = slf.py(); - let self_ref: Py = slf.clone().into_pyobject(py).unwrap().into(); + // let self_ref: Py = slf.clone().into_pyobject(py).unwrap().into(); + if let Some(max_parses) = max_parses { Ok(parser .take(max_parses) - .map(|(prob, string, rules)| PySyntacticStructure { - prob, - rules, - string: string - .iter() - .map(|x| match x { - PhonContent::Normal(x) => PhonContent::Normal(x.to_string()), - PhonContent::Affixed(items) => { - PhonContent::Affixed(items.iter().map(|x| x.to_string()).collect()) - } - }) - .collect(), - lex: self_ref.clone_ref(py), + .map(|(prob, string, rules)| { + PySyntacticStructure::into_syntax_structure(slf, prob, string, rules) }) .collect()) } else { Ok(parser - .map(|(prob, string, rules)| PySyntacticStructure { - prob, - rules, - string: string - .iter() - .map(|x| match x { - PhonContent::Normal(x) => PhonContent::Normal(x.to_string()), - PhonContent::Affixed(items) => { - PhonContent::Affixed(items.iter().map(|x| x.to_string()).collect()) - } - }) - .collect(), - lex: self_ref.clone_ref(py), + .map(|(prob, string, rules)| { + PySyntacticStructure::into_syntax_structure(slf, prob, string, rules) }) .collect()) } @@ -512,6 +411,7 @@ impl PyLexicon { #[pymethods] impl PyLexicon { + ///Check if this lexicon has semantics fn is_semantic(&self) -> bool { matches!(self.lexicon.lexicon, PossiblySemanticLexicon::Semantic(_)) } @@ -652,10 +552,10 @@ impl PyLexicon { }) .or_insert(prob); - if let Some(max_strings) = max_strings { - if hashmap.len() > max_strings { - break; - } + if let Some(max_strings) = max_strings + && hashmap.len() > max_strings + { + break; } } @@ -725,7 +625,7 @@ impl PyLexicon { }) } - #[allow(clippy::too_many_arguments)] + #[expect(clippy::too_many_arguments)] #[pyo3(signature = (s, category, min_log_prob=-128.0, move_prob=0.5, max_steps=64, n_beams=256, 
max_parses=None))] ///Parses a string and returns all found parses in a list ///The string, s, should be delimited by spaces for words and hyphens for multi-word expressions from head-movement @@ -753,7 +653,7 @@ impl PyLexicon { ///list of SyntacticStructure /// All found parses of the string. fn parse( - slf: PyRef<'_, Self>, + slf: &Bound<'_, Self>, s: &str, category: String, min_log_prob: Option, @@ -790,5 +690,10 @@ fn python_mg(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_class::()?; m.add_class::()?; m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; Ok(()) } diff --git a/src/semantics.rs b/src/semantics.rs new file mode 100644 index 0000000..45f924d --- /dev/null +++ b/src/semantics.rs @@ -0,0 +1,283 @@ +use std::{ + collections::{BTreeMap, BTreeSet, HashSet}, + fmt::Display, + hash::Hash, + sync::Arc, + time::Duration, +}; + +use itertools::Itertools; +use pyo3::{IntoPyObjectExt, exceptions::PyValueError, prelude::*}; +use simple_semantics::{ + Entity, EventType, LanguageResult, PossibleEvent, Scenario, ScenarioIterator, ThetaRoles, + lambda::RootedLambdaPool, + language::{ExecutionConfig, Expr}, +}; + +pub mod lot_types; +use lot_types::{PyActor, PyEvent, convert_to_py_actor, convert_to_py_event}; +pub mod scenario; +use scenario::PyScenario; + +struct LanguageResultWrapper<'a>(LanguageResult<'a>, Scenario<'a>); + +impl<'py> IntoPyObject<'py> for LanguageResultWrapper<'_> { + type Target = PyAny; + + type Output = Bound<'py, Self::Target>; + + type Error = PyErr; + + fn into_pyobject(self, py: Python<'py>) -> Result { + match self.0 { + LanguageResult::Bool(bool) => bool.into_bound_py_any(py), + LanguageResult::Actor(name) => convert_to_py_actor(name, &self.1).into_bound_py_any(py), + LanguageResult::Event(e_i) => convert_to_py_event(e_i, &self.1)?.into_bound_py_any(py), + LanguageResult::ActorSet(items) => items + .into_iter() + .map(|name| convert_to_py_actor(name, &self.1)) + 
.collect::>() + .into_bound_py_any(py), + LanguageResult::EventSet(items) => items + .into_iter() + .map(|e_i| convert_to_py_event(e_i, &self.1)) + .collect::, _>>()? + .into_bound_py_any(py), + } + } +} + +impl PyScenario { + fn execute<'a>( + &'a self, + mut expr: RootedLambdaPool<'a, Expr<'a>>, + config: Option, + ) -> PyResult> { + let scenario = self.as_scenario(); + expr.reduce() + .map_err(|e| PyValueError::new_err(e.to_string()))?; + expr.cleanup(); + + let pool = expr + .into_pool() + .map_err(|e| PyValueError::new_err(e.to_string()))?; + + let language_result = pool + .run(&scenario, config) + .map_err(|e| PyValueError::new_err(e.to_string()))?; + Ok(LanguageResultWrapper(language_result, scenario)) + } +} + +#[pymethods] +impl PyScenario { + #[new] + fn new(s: String) -> PyResult { + let scenario = + Scenario::parse(s.as_str()).map_err(|e| PyValueError::new_err(e.to_string()))?; + Ok(scenario.into()) + } + + fn __repr__(&self) -> String { + format!("Scenario({self})") + } + + ///Executes a language of thought expression in this scenario. Will potentially throw a PresuppositionException if + ///something is referenced that isn't in the scenario. It will also reduce any lambda + ///expressions if possible, and then will only execute the expression if it is fully reducible. + /// + ///Parameters + ///---------- + ///expression : str + /// The expression in the language of thought to execute. + ///max_steps : int or None, optional + /// The number of steps in the virtual machine to execute before giving up. + /// Default is 64. + ///timeout : datetime.timedelta or None, optional + /// The amount of time before the execution gives up. + /// Default is None + ///Returns + ///------- + ///bool or Actor or Event or set[Actor] or set[Event] + /// the value of the expression + ///Raises + ///------ + ///ValueError + /// If the expression is incorrectly formatted or if there is a presupposition error.
+ #[pyo3(signature = (expression, max_steps=64, timeout=None))] + fn evaluate<'a>( + &'a self, + expression: &'a str, + max_steps: Option, + timeout: Option, + ) -> PyResult> { + let expr = RootedLambdaPool::parse(expression) + .map_err(|e| PyValueError::new_err(e.to_string()))?; + self.execute(expr, Some(ExecutionConfig::new(max_steps, timeout))) + } + + ///Creates a generator that goes over all possible scenarios that can be generated according to + ///its parameters. This gets very large very quickly. + /// + ///Parameters + ///---------- + ///actors : list[str] + /// The actors who may or may not be present. + ///event_kinds : list[``PossibleEvent``] + /// The possible kinds of events + /// + ///Returns + ///------- + ///ScenarioGenerator + #[staticmethod] + fn all_scenarios( + actors: Vec, + event_kinds: Vec, + actor_properties: Vec, + ) -> PyScenarioGenerator { + let parameter_holder = Arc::new(ParameterHolder { + actors, + event_kinds, + actor_properties, + }); + + let actors: Vec<&'static str> = parameter_holder + .actors + .iter() + .map(|x| { + let s: &'static str = unsafe { std::mem::transmute(x.as_str()) }; + s + }) + .collect::>(); + let properties: Vec<&'static str> = parameter_holder + .actor_properties + .iter() + .map(|x| { + let s: &'static str = unsafe { std::mem::transmute(x.as_str()) }; + s + }) + .collect::>(); + + let event_kinds: Vec> = parameter_holder + .event_kinds + .iter() + .map(|x| { + let x = x.as_possible_event(); + let x: PossibleEvent<'static> = unsafe { std::mem::transmute(x) }; + x + }) + .collect::>(); + + PyScenarioGenerator { + generator: Scenario::all_scenarios(&actors, &event_kinds, &properties), + _parameter_holder: parameter_holder, + } + } +} + +/// A possible linguistic event with theta role structure. +/// +/// Parameters +/// ---------- +/// name : str +/// Identifier for the event. +/// has_agent : bool, optional +/// Whether the event has an agent participant. Default is ``True``.
+/// has_patient : bool, optional +/// Whether the event has a patient participant. Default is ``False``. +/// is_reflexive : bool, optional +/// Whether the event allows reflexive construal. Default is ``True``. +#[pyclass(name = "PossibleEvent", eq, from_py_object)] +#[derive(Debug, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)] +pub struct PyPossibleEvent { + ///Whether the event takes an agent + #[pyo3(get, set)] + pub has_agent: bool, + ///Whether the event takes a patient + #[pyo3(get, set)] + pub has_patient: bool, + ///Whether the event can have the same agent and patient + #[pyo3(get, set)] + pub is_reflexive: bool, + ///The name of this kind of event (e.g. `running` could be a unaccusative event) + #[pyo3(get, set)] + pub name: String, +} + +#[pymethods] +impl PyPossibleEvent { + #[new] + #[pyo3(signature = (name, has_agent=true, has_patient=false, is_reflexive=true))] + fn new(name: String, has_agent: bool, has_patient: bool, is_reflexive: bool) -> Self { + PyPossibleEvent { + name, + has_agent, + has_patient, + is_reflexive, + } + } + + /// Classify the event based on its argument structure. + /// + /// Returns + /// ------- + /// Literal['Transitive', 'TransitiveNonReflexive', 'Unergative', 'Unaccusative', 'Avalent']. 
+ fn event_type(&self) -> &'static str { + match (self.has_agent, self.has_patient) { + (true, true) if self.is_reflexive => "Transitive", + (true, true) => "TransitiveNonReflexive", + (true, false) => "Unergative", + (false, true) => "Unaccusative", + (false, false) => "Avalent", + } + } +} + +impl PyPossibleEvent { + fn as_event_type(&self) -> EventType { + match (self.has_agent, self.has_patient) { + (true, true) if self.is_reflexive => EventType::Transitive, + (true, true) => EventType::TransitiveNonReflexive, + (true, false) => EventType::Unergative, + (false, true) => EventType::Unaccusative, + (false, false) => EventType::Avalent, + } + } + + fn as_possible_event<'a>(&'a self) -> PossibleEvent<'a> { + PossibleEvent { + label: self.name.as_str(), + event_type: self.as_event_type(), + } + } +} + +///Yields +///------ +///Scenario +/// Another scenario that can be generated according to the parameters. +/// +#[pyclass(name = "ScenarioGenerator", eq, from_py_object)] +#[derive(Debug, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)] +pub struct PyScenarioGenerator { + generator: ScenarioIterator<'static>, + _parameter_holder: Arc, +} + +#[pymethods] +impl PyScenarioGenerator { + fn __iter__(slf: PyRef<'_, Self>) -> PyRef<'_, Self> { + slf + } + + fn __next__(mut slf: PyRefMut<'_, Self>) -> Option { + slf.generator.next().map(|s| s.into()) + } +} + +#[derive(Debug, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)] +struct ParameterHolder { + actors: Vec, + event_kinds: Vec, + actor_properties: Vec, +} diff --git a/src/semantics/lot_types.rs b/src/semantics/lot_types.rs new file mode 100644 index 0000000..f07834d --- /dev/null +++ b/src/semantics/lot_types.rs @@ -0,0 +1,198 @@ +use super::*; + +pub(super) fn convert_to_py_actor(name: &str, scenario: &Scenario<'_>) -> PyActor { + PyActor { + name: name.to_string(), + properties: scenario + .properties() + .iter() + .filter_map(|(prop, entries)| { + if entries.contains(&Entity::Actor(name)) { + Some(prop.to_string()) + } 
else { + None + } + }) + .collect(), + } +} + +pub(super) fn convert_to_py_event(e_i: u8, scenario: &Scenario<'_>) -> Result { + let e = scenario + .thematic_relations() + .get(e_i as usize) + .ok_or_else(|| { + PyValueError::new_err(format!( + "Result is event {e_i}, but no such event exists in the scenario!" + )) + })?; + + Ok(PyEvent { + agent: e.agent.map(|x| x.to_string()), + patient: e.patient.map(|x| x.to_string()), + properties: scenario + .properties() + .iter() + .filter_map(|(prop, entries)| { + if entries.contains(&Entity::Event(e_i)) { + Some(prop.to_string()) + } else { + None + } + }) + .collect(), + }) +} + +///Represents an actor with a name and a set of properties to be used in Scenarios. +/// +///Parameters +///---------- +///name : str +/// The name of the actor. +///properties : set[str], optional +/// Any properties that apply to the actor. Defaults to an empty set. +/// +/// +///Examples +///-------- +///Creating an actor and modifying its properties: +/// +///.. code-block:: python +/// +/// actor = Actor("John", properties={"mean", "unfriendly"}) +/// actor.name = "Alice" +/// actor.properties = {"nice", "friendly"} +/// +#[pyclass(name = "Actor", eq, str, from_py_object)] +#[derive(Debug, Clone, Eq, PartialEq, Hash, PartialOrd, Ord)] +pub struct PyActor { + /// The name of the actor + #[pyo3(get, set)] + pub name: String, + + /// An unordered set of properties that apply to this actor + #[pyo3(get, set)] + pub properties: BTreeSet, +} + +#[pymethods] +impl PyActor { + #[new] + #[pyo3(signature = (name, properties=None))] + fn new(name: String, properties: Option>) -> Self { + PyActor { + name, + properties: properties.unwrap_or_default(), + } + } + + fn __repr__(&self) -> String { + format!("Actor({self})") + } +} + +impl Display for PyActor { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "{}{}{}{}", + self.name, + if self.properties.is_empty() { "" } else { " (" }, + self.properties.iter().join(", 
"), + if self.properties.is_empty() { "" } else { ")" }, + ) + } +} + +///Represents an event to be used in a Scenario. +/// +///Parameters +///---------- +///agent : str, optional +/// The name of the agent (if there is one) +///patient : str, optional +/// The name of the patient (if there is one) +///properties : set[str], optional +/// Any properties that apply to the event. Defaults to an empty set. +/// +/// +///Examples +///-------- +///Creating an event +/// +///.. code-block:: python +/// +/// running = Event(agent="John", properties={"run", "quickly"}) +/// +#[pyclass(name = "Event", eq, str, from_py_object)] +#[derive(Debug, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)] +pub struct PyEvent { + ///The agent of the event. + #[pyo3(get, set)] + pub agent: Option, + + ///The patient of the event. + #[pyo3(get, set)] + pub patient: Option, + + ///Any properties of the event. + #[pyo3(get, set)] + pub properties: BTreeSet, +} + +#[pymethods] +impl PyEvent { + #[new] + #[pyo3(signature = (agent=None, patient=None, properties=None))] + + fn new( + agent: Option, + patient: Option, + properties: Option>, + ) -> Self { + PyEvent { + agent, + patient, + properties: properties.unwrap_or_default(), + } + } + + fn __repr__(&self) -> String { + format!("Event({self})") + } +} + +impl Display for PyEvent { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "{{{}{}{}{}{}{}}}", + self.agent + .as_deref() + .map(|x| format!("A = {x}")) + .unwrap_or("".to_string()), + if self.patient.is_some() && self.agent.is_some() { + ", " + } else { + "" + }, + self.patient + .as_deref() + .map(|x| format!("P = {x}")) + .unwrap_or("".to_string()), + if self.properties.is_empty() { "" } else { " (" }, + self.properties.iter().join(" "), + if self.properties.is_empty() { "" } else { ")" }, + ) + } +} + +impl PyEvent { + pub fn into_theta_roles<'a>(self: &'a PyEvent) -> ThetaRoles<'a> { + ThetaRoles { + agent: self.agent.as_deref(), + patient: 
self.patient.as_deref(), + } + } +} diff --git a/src/semantics/scenario.rs b/src/semantics/scenario.rs new file mode 100644 index 0000000..94beea3 --- /dev/null +++ b/src/semantics/scenario.rs @@ -0,0 +1,125 @@ +use super::*; + +///Represents a Scenario, a model that meanings are evaluated in. +/// +///Parameters +///---------- +///actors : list[Actor] +/// The actors present in the scenario +///events: list[Event] +/// The events happening in the scenario +///questions: list[str] +/// The questions in a scenario. (Will raise a `ValueError` if set with a `str` which is not a +/// valid Language of Thought expression) +#[pyclass(name = "Scenario", str, eq, from_py_object)] +#[derive(Debug, Clone, Eq, PartialEq)] +pub struct PyScenario { + ///A list of Actors in the scenario + #[pyo3(get, set)] + actors: Vec, + ///A list of Events in the scenario + #[pyo3(get, set)] + events: Vec, + + ///A list of questions to be asked in the scenario + #[pyo3(get)] + questions: Vec, +} + +impl From> for PyScenario { + fn from(value: Scenario) -> Self { + let actors = value + .actors() + .iter() + .map(|x| PyActor { + name: x.to_string(), + properties: value + .properties() + .iter() + .filter_map(|(k, v)| { + if v.contains(&Entity::Actor(x)) { + Some(k.to_string()) + } else { + None + } + }) + .collect(), + }) + .collect(); + + let events = value + .thematic_relations() + .iter() + .enumerate() + .map(|(i, x)| PyEvent { + agent: x.agent.map(|x| x.to_string()), + patient: x.patient.map(|x| x.to_string()), + properties: value + .properties() + .iter() + .filter_map(|(k, v)| { + if v.contains(&Entity::Event(u8::try_from(i).expect("Too many events!"))) { + Some(k.to_string()) + } else { + None + } + }) + .collect(), + }) + .collect(); + + let questions = value.questions().iter().map(|x| x.to_string()).collect(); + + PyScenario { + actors, + events, + questions, + } + } +} + +#[pymethods] +impl PyScenario { + #[setter] + fn set_questions(&mut self, questions: Vec) -> PyResult<()> { + for q 
in &questions { + let _ = RootedLambdaPool::parse(q).map_err(|e| PyValueError::new_err(e.to_string()))?; + } + + self.questions = questions; + Ok(()) + } +} + +impl Display for PyScenario { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.as_scenario()) + } +} + +impl PyScenario { + pub(super) fn as_scenario<'a>(&'a self) -> Scenario<'a> { + let actors = self.actors.iter().map(|x| x.name.as_str()).collect(); + let thematic_relations = self.events.iter().map(|x| x.into_theta_roles()).collect(); + let mut properties: BTreeMap<_, Vec<_>> = BTreeMap::new(); + + for a in &self.actors { + for p in &a.properties { + properties + .entry(p.as_str()) + .or_default() + .push(Entity::Actor(a.name.as_str())); + } + } + for (i, e) in self.events.iter().enumerate() { + for p in &e.properties { + properties + .entry(p.as_str()) + .or_default() + .push(Entity::Event(u8::try_from(i).expect("Too many events!"))); + } + } + + Scenario::new(actors, thematic_relations, properties) + } +} diff --git a/src/syntax.rs b/src/syntax.rs new file mode 100644 index 0000000..b5d0f7d --- /dev/null +++ b/src/syntax.rs @@ -0,0 +1,263 @@ +use std::fmt::Display; + +use crate::graphing::{PyMgEdge, PyMgNode}; + +use super::PyLexicon; +use logprob::LogProb; +use minimalist_grammar_parser::{PhonContent, RulePool, lexicon::LexicalEntry}; +use pyo3::{exceptions::PyValueError, prelude::*}; + +#[pyclass(name = "SyntacticStructure", str, eq, frozen)] +#[derive(Debug)] +///The representation of a syntactic structure generated by a grammar, or alternatively the result +///of parsing a string. 
+pub struct PySyntacticStructure { + prob: LogProb, + string: Vec>, + rules: RulePool, + meaning: Option>, + lex: Py, +} + +impl PartialEq for PySyntacticStructure { + fn eq(&self, other: &Self) -> bool { + self.prob == other.prob && self.string == other.string && self.rules == other.rules + } +} + +impl Display for PySyntacticStructure { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let len = self.string.len(); + for (i, x) in self.string.iter().enumerate() { + match x { + PhonContent::Normal(s) => write!(f, "{s}")?, + PhonContent::Affixed(items) => write!(f, "{}", items.join("-"))?, + }; + if i != len - 1 { + write!(f, " ")?; + } + } + Ok(()) + } +} + +impl PySyntacticStructure { + pub fn new( + lex: Py, + prob: LogProb, + string: Vec>, + rules: RulePool, + ) -> PySyntacticStructure { + PySyntacticStructure { + prob, + meaning: lex.get().semantics().map(|lex| { + rules + .to_interpretation(lex) + .map(|(a, _)| a.to_string()) + .collect() + }), + rules, + string, + lex, + } + } + + pub fn into_syntax_structure( + lexicon: &Bound<'_, PyLexicon>, + prob: LogProb, + string: &[PhonContent<&str>], + rules: RulePool, + ) -> PySyntacticStructure { + PySyntacticStructure { + prob, + meaning: lexicon.get().semantics().map(|lex| { + rules + .to_interpretation(lex) + .map(|(a, _)| a.to_string()) + .collect() + }), + rules, + string: string + .iter() + .map(|x| match x { + PhonContent::Normal(x) => PhonContent::Normal(x.to_string()), + PhonContent::Affixed(items) => { + PhonContent::Affixed(items.iter().map(|x| x.to_string()).collect()) + } + }) + .collect(), + lex: lexicon.as_unbound().clone_ref(lexicon.py()), + } + } + + pub fn lex(&self) -> &Py { + &self.lex + } + + pub fn string(&self) -> &Vec> { + &self.string + } +} + +#[pymethods] +impl PySyntacticStructure { + ///Returns the interpretation of this SyntacticStructure, provided that its associated Lexicon + ///has semantics + #[getter] + fn meaning(&self) -> &Option> { + &self.meaning + } + + 
///The pronunciation of this SyntacticStructure. + /// + ///Returns + ///------- + ///list[str] + /// A list of strings of each word. Multi-morphemic words are separated by `-`. + fn pronunciation(&self) -> Vec { + self.string + .iter() + .map(|x| x.to_string()) + .collect::>() + } + + ///The log probability of generating this SyntacticStructure using its associated Lexicon. + /// + ///Returns + ///------- + ///float + /// the log probability + fn log_prob(&self) -> f64 { + self.prob.into_inner() + } + + ///Check whether this string (representing a lexical entry) is used in this tree. + /// + ///Returns + ///------- + ///bool + /// Whether the lexical entry is used + /// + ///Raises + ///------ + ///ValueError + /// If the lexical entry is not parseable as a lexical entry. + fn contains_lexical_entry(&self, s: &str) -> PyResult { + let lex = self.lex.get(); + let entry = LexicalEntry::parse(s).map_err(|e| PyValueError::new_err(e.to_string()))?; + Ok(lex + .lexeme_to_id + .get(&entry) + .is_some_and(|x| self.rules.used_lemmas().any(|y| &y == x))) + } + + ///Check whether this word (or the empty word) is used in this SyntacticStructure. + /// + ///Parameters + ///---------- + ///s : str or None + /// The word (or empty word) that may or may not be present + /// + ///Returns + ///------- + ///bool + /// whether the word is present in the structure + fn contains_word(&self, mut s: Option<&str>) -> bool { + let lex = self.lex.get(); + if let Some(s_inner) = &s + && s_inner.is_empty() + { + s = None; + } + lex.lemma_to_id + .get(&s.into()) + .is_some_and(|x| self.rules.used_lemmas().any(|y| x.contains(&y))) + } + + ///The probability of generating this SyntacticStructure using its associated Lexicon. 
+ /// + ///Returns + ///------- + ///float + /// the probability of the structure + fn prob(&self) -> f64 { + self.prob.into_inner().exp() + } + + ///The number of derivational steps necessary to derive this SyntacticStructure using its Lexicon + /// + ///Returns + ///------- + ///int + /// the number of steps + fn n_steps(&self) -> usize { + self.rules.n_steps() + } + + ///Turns the SyntacticStructure into a tree that can be rendered with LaTeX. + ///Requires including `latex-commands.tex `_) in the LaTeX preamble. + /// + ///Returns + ///------- + ///str + /// A LaTeX representation of the parse tree + fn latex(&self) -> String { + let lex = self.lex.get(); + lex.lexicon + .lexicon() + .derivation(self.rules.clone()) + .tree() + .latex() + } + + ///The maximum number of moving elements stored in memory at one time. + /// + ///Returns + ///------- + ///int + /// the maximum number of moved items held in memory in the derivation + fn max_memory_load(&self) -> usize { + self.rules.max_memory_load() + } + + #[allow(clippy::type_complexity)] + fn __to_tree_inner(&self) -> (Vec<(usize, PyMgNode)>, Vec<(usize, usize, PyMgEdge)>, usize) { + let d = self + .lex + .get() + .lexicon + .lexicon() + .derivation(self.rules.clone()); + let tree = d.tree(); + let (g, root) = tree.petgraph(); + let nodes = g + .node_indices() + .map(|n| { + ( + n.index(), + PyMgNode( + g.node_weight(n) + .unwrap() + .clone() + .map(|x| x.to_string(), |x| x.to_string()), + ), + ) + }) + .collect::>(); + + let mut edges = g + .edge_indices() + .map(|e| { + let (src, tgt) = g.edge_endpoints(e).unwrap(); + ( + src.index(), + tgt.index(), + PyMgEdge(*g.edge_weight(e).unwrap()), + ) + }) + .collect::>(); + + edges.sort_by_key(|(_, _, x)| x.0); + (nodes, edges, root.index()) + } +} diff --git a/src/tokenizers.rs b/src/tokenizers.rs index 4b93407..ab66e5d 100644 --- a/src/tokenizers.rs +++ b/src/tokenizers.rs @@ -371,7 +371,7 @@ impl PyLexicon { /// list of :meth:`python_mg.SyntacticStructure` /// List 
of all parses of the token string fn parse_tokens( - slf: PyRef<'_, Self>, + slf: &Bound<'_, Self>, s: Vec, category: String, min_log_prob: Option, @@ -380,7 +380,7 @@ impl PyLexicon { n_beams: Option, max_parses: Option, ) -> PyResult> { - let v = to_phon_content(&s, &slf.word_id)?; + let v = to_phon_content(&s, &slf.borrow().word_id)?; PyLexicon::inner_parse( slf, @@ -411,10 +411,10 @@ impl PySyntacticStructure { ///ndarray of uint /// the tokenized string. fn tokens<'py>(slf: PyRef<'py, Self>) -> Bound<'py, PyArray1> { - let tokens = slf.lex.get().tokens(); + let tokens = slf.lex().get().tokens(); let mut output = vec![SOS]; - for c in &slf.string { + for c in slf.string() { match c { PhonContent::Normal(w) => output.push( *tokens