From 15e09edaa2fd1a2ee56712f9906388e208a725cb Mon Sep 17 00:00:00 2001 From: killeroonie Date: Wed, 25 Jun 2025 19:25:48 -0700 Subject: [PATCH 01/11] Deleted this unused file. Have already implemented cyclic checking in the evaluator. --- killerbunny/evaluating/cyclic_data.txt | 103 ------------------------- 1 file changed, 103 deletions(-) delete mode 100644 killerbunny/evaluating/cyclic_data.txt diff --git a/killerbunny/evaluating/cyclic_data.txt b/killerbunny/evaluating/cyclic_data.txt deleted file mode 100644 index d0d3777..0000000 --- a/killerbunny/evaluating/cyclic_data.txt +++ /dev/null @@ -1,103 +0,0 @@ -Core Idea: Tracking Visited Object InstancesThe fundamental approach is to keep track of the actual JSON object/array instances you've already encountered during a specific traversal operation. If you encounter the exact same instance again while still "inside" its own processing path, you've found a cycle.Implementation Strategy within the Evaluator:This logic would primarily reside in the parts of your evaluator that handle deep traversals, most notably the descendant segment (..).1.Identify Traversal Operations: The main place this is needed is in your SegmentNode._eval_descendant_node_impl method (or a helper function it calls). The .. operator can potentially visit every node in a subtree.2.Use Python's id() for Instance Tracking: Python's built-in id(obj) function returns a unique integer for each live object. 
This is perfect for identifying if you're seeing the exact same instance of a list or dictionary again.3.Maintain a visited_ids Set per Traversal Context:•When you start processing a descendant segment for a particular input_node from the input_nodelist, you'll initiate a traversal (e.g., a breadth-first or depth-first collection of the input node and its descendants).•For this specific traversal operation, maintain a set (let's call it visited_ids_in_current_descent).•Before processing the children of a JSON object or array:•Get its id(): current_id = id(current_json_container).•Check if current_id is in visited_ids_in_current_descent.•If yes: You've detected a cycle. Log a warning, and do not proceed to traverse its children. This breaks the infinite loop for this path.•If no: Add current_id to visited_ids_in_current_descent before you start processing/recursing into its children.•Backtracking (Important for Correctness): After you've finished processing all children of current_json_container (i.e., when the recursion unwinds or you move to the next item at the same level in a breadth-first search), you should remove current_id from visited_ids_in_current_descent. This is crucial because an object might be legitimately reachable via two different non-cyclic paths. You only want to detect cycles where an object is an ancestor of itself in the current active traversal path.4.Maximum Depth Limit (Failsafe):•In addition to cycle detection, implement a maximum traversal depth. This acts as a failsafe for extremely deep (but not necessarily cyclic) structures or if the cycle detection somehow has a bug.•Pass a current_depth parameter during your traversal. If current_depth exceeds a predefined MAX_DEPTH, stop traversing that branch and log a warning.5.Breadth-First Traversal for Descendants: The RFC 9535 (Section 2.5.2.2) states for descendant segments: "nodes are visited before their descendants." 
This implies a breadth-first collection of nodes.Revised SegmentNode._eval_descendant_node_impl and Helper:Your current SegmentNode._eval_descendant_node_impl passes the input_nodelist directly to selector.eval(). For descendant semantics, you first need to collect the input node itself and all its unique descendants, and then apply the selectors to this collected list.Here's a conceptual sketch of how you might modify it: - -# In your parser_nodes.py, within the SegmentNode class - -# Helper method to collect an input node and all its unique descendants -# using breadth-first traversal with cycle and depth protection. -def _collect_node_and_its_descendants( - self, - initial_node_to_scan: JSON_VALUE, - max_depth: int = 32 # Configurable: A reasonable default -) -> list[JSON_VALUE]: - - collected_nodes_output: list[JSON_VALUE] = [] - # Tracks object IDs already added to collected_nodes_output to ensure uniqueness - ids_in_collected_output: set[int] = set() - - # Queue for breadth-first traversal: (json_node_instance, current_processing_depth) - queue: list[tuple[JSON_VALUE, int]] = [(initial_node_to_scan, 0)] - - # Set to track object IDs visited in the *current expansion path* to detect cycles - # This set is effectively managed by the BFS queue structure implicitly for simple cycles, - # but an explicit set for `visited_on_path` would be needed for more complex DFS. - # For BFS, checking `ids_in_collected_output` before adding to queue can prevent re-processing known branches. - - head = 0 - while head < len(queue): - current_json_node, depth = queue[head] - head += 1 # Dequeue - - current_node_id = id(current_json_node) - - # Add to the final list if it's a unique instance - if current_node_id not in ids_in_collected_output: - collected_nodes_output.append(current_json_node) - ids_in_collected_output.add(current_node_id) - else: - # We've already processed this exact node instance and its children via another path - # or it was the initial node. 
For BFS, this means we don't need to add its children again. - # This also implicitly handles simple cycles for BFS if we only add children of new nodes. - pass # Continue to next item in queue - - # If max depth reached for this path, don't explore its children - if depth >= max_depth: - _logger.warning( - f"Max traversal depth ({max_depth}) reached while collecting descendants of an input node. " - f"Stopping further descent for this branch. ASTNode: {str(current_json_node)[:100]}" - ) - continue - - # If it's a compound type, add its children to the queue for processing - if isinstance(current_json_node, JSON_ARRAY_TYPES): - for item in current_json_node: - # Add child to queue. Cycle detection for adding to queue is implicitly handled - # by only fully processing unique items from the queue (via ids_in_collected_output). - # A more explicit check before adding to queue: if id(item) in ids_in_collected_output and item is compound, - # it might indicate a more complex cycle or shared structure already explored. - queue.append((item, depth + 1)) - elif isinstance(current_json_node, JSON_OBJECT_TYPES): - obj_dict = cast(JSON_OBJECT, current_json_node) - for value in obj_dict.values(): - queue.append((value, depth + 1)) - - return collected_nodes_output - -def _eval_descendant_node_impl(self, input_nodelist: list[Any]) -> list[Any] | None: - """ - RFC 9535 Section 2.5.2.2. (page 13) - A descendant segment produces zero or more descendants of an input value. - For each node in the input nodelist, a descendant selector visits the input node - and each of its descendants such that: - • nodes of any array are visited in array order, and - • nodes are visited before their descendants. - The nodelist resulting from these visits is the result of the descendant segment. - This means selectors are applied to the collection of (input_node + its descendants). 
- """ - overall_results_from_selectors: list[Any] = [] - - for node_from_input_list in input_nodelist: - # For each node from the input, collect it and all its unique descendants. - # This collection process handles cycle detection and depth limits. - nodes_to_pass_to_selectors = self._collect_node_and_its_descendants( - node_from_input_list - # Consider making max_depth configurable, perhaps via Context - ) - - # Apply all selectors associated with this descendant segment - # to the collected list of (node_from_input_list + its descendants). - for sel_node in self._selectors.nodes: - selector = cast(SelectorNode, sel_node) - # The selector.eval() method operates on the fully collected list. - # It should not need to do its own descent. - current_selector_results = selector.eval(nodes_to_pass_to_selectors) - if current_selector_results is not None: - overall_results_from_selectors.extend(current_selector_results) - - # Per RFC 2.2, a nodelist may contain duplicate values. - # Uniqueness is typically handled by the overall query semantics or specific functions, - # not necessarily at each segment's output aggregation. - self._input_nodelist = input_nodelist # Save original input to this segment - self._output_nodelist = overall_results_from_selectors - return self._output_nodelist - -Key Changes and Considerations:1._collect_node_and_its_descendants Helper:•This new method is responsible for taking a single JSON node and returning a list containing that node and all its unique descendants.•It uses a breadth-first search (BFS) strategy with a queue to ensure "nodes are visited before their descendants."•Cycle Prevention during Collection: It uses ids_in_collected_output to ensure that each unique object instance is added to collected_nodes_output only once and its children are enqueued for exploration only once. 
This implicitly handles cycles for BFS because you won't re-explore children of an already fully processed node.•Max Depth: It incorporates a max_depth check.2._eval_descendant_node_impl Usage:•It iterates through each node_from_input_list.•For each, it calls _collect_node_and_its_descendants to get the comprehensive list of nodes (the original input node + its descendants) that the segment's selectors should operate on.•It then passes this collected list to each selector.eval() method.3.Selector eval Methods:•The eval methods of your individual SelectorNode subclasses (like NameSelectorNode, WildcardSelectorNode, etc.) will now receive a list that could potentially contain many nodes (the result of the .. expansion). They should iterate through this list and apply their specific selection logic to each item. They generally should not perform their own recursive descent.4.Logging: Use _logger.warning(...) when cycles or depth limits are hit. This provides feedback without crashing.5.Configuration: The max_depth could be a configurable parameter of your JPathEvaluator or passed via the Context.This approach separates the concern of safely collecting nodes for a descendant operation from the concern of applying specific selectors to that collection. It directly addresses the risk of infinite loops from cyclic objects and also provides a failsafe for excessively deep structures. Remember to thoroughly test this with various cyclic and deep JSON structures! \ No newline at end of file From b1d9e5dcf3170e8f363a90b279a8b93697599ffe Mon Sep 17 00:00:00 2001 From: killeroonie Date: Thu, 26 Jun 2025 03:28:09 -0700 Subject: [PATCH 02/11] Implemented cycle detection and handling in the evaluator for JSONPath query execution. Added extensive unit tests for cyclic data structures. Improved logging for circular reference warnings. Fixed linter issues and optimized imports. 
--- killerbunny/__init__.py | 2 + killerbunny/evaluating/evaluator.py | 245 +++++++---------- killerbunny/evaluating/value_nodes.py | 48 ++-- .../jsonpointer/test_json_pointer.py | 5 - .../evaluating/test_root_value_cycles.py | 250 ++++++++++++++++++ 5 files changed, 371 insertions(+), 179 deletions(-) create mode 100644 tests/jpath/evaluating/test_root_value_cycles.py diff --git a/killerbunny/__init__.py b/killerbunny/__init__.py index 5b88b0d..198f612 100644 --- a/killerbunny/__init__.py +++ b/killerbunny/__init__.py @@ -5,3 +5,5 @@ # Created by: Robert L. Ross # +# In killerbunny/__init__.py +"""KillerBunny package for JSONPath Query evaluation.""" \ No newline at end of file diff --git a/killerbunny/evaluating/evaluator.py b/killerbunny/evaluating/evaluator.py index 4dea835..51bdddd 100644 --- a/killerbunny/evaluating/evaluator.py +++ b/killerbunny/evaluating/evaluator.py @@ -7,19 +7,12 @@ """Evaluates a jsonpath query""" +import logging from collections import deque -from logging import Logger from typing import Callable, NamedTuple, TypeAlias, Union, cast, Any -from killerbunny.shared.constants import ( - ROOT_JSON_VALUE_KEY, - SEGMENT_INPUT_NODELIST_KEY, - FILTER_SELECTOR_INPUT_NODE_KEY, - CURRENT_NODE_KEY, - JPATH_QUERY_RESULT_NODE_KEY -) -from killerbunny.shared.context import Context -from killerbunny.shared.errors import RTError, Error + from killerbunny.evaluating.compare_ops import COMPARISON_OP_TYPE_LOOKUP +from killerbunny.evaluating.evaluator_types import EvaluatorValue, NormalizedJPath from killerbunny.evaluating.runtime_result import RuntimeResult from killerbunny.evaluating.value_nodes import ( VNodeList, @@ -29,14 +22,6 @@ NullValue, StringValue ) -from killerbunny.evaluating.evaluator_types import EvaluatorValue, NormalizedJPath -from killerbunny.shared.json_type_defs import ( - JSON_PRIMITIVE_TYPES, - JSON_ARRAY_TYPES, - JSON_OBJECT_TYPES, - JSON_STRUCTURED_TYPES, - JSON_VALUE_TYPES -) from killerbunny.parsing.function import Nothing, 
LogicalType, ValueType, FunctionCallNode, FunctionArgument from killerbunny.parsing.node_type import ASTNode, ASTNodeType from killerbunny.parsing.parser_nodes import ( @@ -64,9 +49,25 @@ StringLiteralNode, IdentifierNode ) +from killerbunny.shared.constants import ( + ROOT_JSON_VALUE_KEY, + SEGMENT_INPUT_NODELIST_KEY, + FILTER_SELECTOR_INPUT_NODE_KEY, + CURRENT_NODE_KEY, + JPATH_QUERY_RESULT_NODE_KEY +) +from killerbunny.shared.context import Context +from killerbunny.shared.errors import RTError, Error +from killerbunny.shared.json_type_defs import ( + JSON_PRIMITIVE_TYPES, + JSON_ARRAY_TYPES, + JSON_OBJECT_TYPES, + JSON_STRUCTURED_TYPES, + JSON_VALUE_TYPES +) from killerbunny.shared.position import Position -_logger = Logger(__name__) +_logger = logging.getLogger(__name__) ComparableType: TypeAlias = Union[ValueType, EvaluatorValue] @@ -74,6 +75,7 @@ #################################################################### # EVALUATOR #################################################################### +# noinspection PyPep8Naming,PyMethodMayBeStatic,DuplicatedCode class JPathEvaluator: """Evaluates a json path query string, in the form of an AST, for the json path query argument stored in the supplied Context. The main entry point is visit(), which is passed an AST from a successful parse result, along with @@ -105,10 +107,10 @@ def no_visit_method(self, node: ASTNode, context: Context) -> RuntimeResult: def visit_RootNode(self, node: RootNode, context: Context) -> RuntimeResult: """Set the JSON path query argument in the RootNode. - This value is used by the json path query to generate ouput nodes that + This value is used by the JSON path query to generate ouput nodes that match the query parameters. The value is obtained from the Context's symbol table with the key given by ROOT_JSON_VALUE_KEY. 
- :return the root nodelist in RuntimeResult.value + :return: The root nodelist in RuntimeResult.value """ rt_res = RuntimeResult() root_json_value = context.get_symbol(ROOT_JSON_VALUE_KEY) @@ -121,12 +123,12 @@ def visit_RootNode(self, node: RootNode, context: Context) -> RuntimeResult: return rt_res.success(node.root_nodelist) def visit_JsonPathQueryNode(self, node: JsonPathQueryNode, context: Context) -> RuntimeResult: - """This node can represent the main entry point in the query, i.e., the entire json path query string - starting with $, or it can represnt an AbsoluteSingularQueryNode used inside a filter-selector. - Write result of query to the context with JPATH_QUERY_RESULT_NODE_KEY. If this is the top-level JsonPathQueryNode, - this is the final nodelist output of the query. + """This node can represent the main entry point in the query, i.e., the entire JSON path query string + starting with $, or it can represent an AbsoluteSingularQueryNode used inside a filter-selector. + Write the result of the query to the context with JPATH_QUERY_RESULT_NODE_KEY. + If this is the top-level JsonPathQueryNode, this is the final nodelist output of the query. 
- returns BooleanValue in RunttimeResult.value + returns: VNodeList in RuntimeResult.value """ rt_res = RuntimeResult() input_nodelist: VNodeList = rt_res.register(self.visit_RootNode(node.root_node, context)) @@ -146,22 +148,19 @@ def visit_JsonPathQueryNode(self, node: JsonPathQueryNode, context: Context) -> seg_context.set_symbol(SEGMENT_INPUT_NODELIST_KEY, input_nodelist) segment_output = rt_res.register(self.visit(seg_node, seg_context)) if rt_res.error: return rt_res - input_nodelist = segment_output # output of this segment becomes input for next segment + input_nodelist = segment_output # output of this segment becomes input for the next segment - # last segment output is the final output nodelist + # the last segment output is the final output nodelist context.set_symbol(JPATH_QUERY_RESULT_NODE_KEY, segment_output) - # apply existence test here and return BooleanValue - # we know that segments is not empty here, so this is safe: - ouput_pos = Position(node.position.text, segments[0].position.start, segments[-1].position.end) # type: ignore return rt_res.success(segment_output) - # if segment_output: - # return rt_res.success(BooleanValue.value_for(True).set_context(context).set_pos(ouput_pos)) - # else: - # return rt_res.success(BooleanValue.value_for(False).set_context(context).set_pos(ouput_pos)) + - def _collect_vnodes_and_their_descendants(self, initial_vnode: VNode, max_depth: int = 32) -> VNodeList: + def _collect_vnodes_and_their_descendants(self, + initial_vnode: VNode, + max_depth: int = 32, + ) -> VNodeList: """ - Performs a breadth-first collection of VNodes, starting with initial_vnode, + Performs a breadth-first collection of VNodes, starting with the initial VNode, then its children, then grandchildren, etc. Each collected item is a VNode with its correct path. Handles circular references/cycle detection using VNode.jvalue's id(). 
If an object node has already been seen, @@ -171,7 +170,8 @@ def _collect_vnodes_and_their_descendants(self, initial_vnode: VNode, max_depth: """ collected_vnodes: list[VNode] = [] instance_ids: dict[int, VNode] = {} # keeps track of instance ids to detect circular references - # Queue for BFS: (VNode, current_depth) + + # Queue for BFS: (VNode, current_depth) node_queue = deque([(initial_vnode, 0)]) while node_queue: @@ -192,7 +192,7 @@ def _collect_vnodes_and_their_descendants(self, initial_vnode: VNode, max_depth: instance_ids[id(jvalue)] = cur_node collected_vnodes.append(cur_node) - # add children to queue for processing during next iteration + # add children to the queue for processing during the next iteration if isinstance(jvalue, JSON_ARRAY_TYPES): for index, element in enumerate(jvalue): new_node = VNode(NormalizedJPath(f"{jpath}[{index}]"), element, @@ -203,55 +203,52 @@ def _collect_vnodes_and_their_descendants(self, initial_vnode: VNode, max_depth: new_node = VNode(NormalizedJPath(f"{jpath}['{name}']"), value, cur_node.root_value, cur_node.node_depth + 1) node_queue.append((new_node, depth + 1)) + return VNodeList(collected_vnodes) def _children_of(self, value_node: VNode) -> VNodeList: """Return the children of the argument node. - see section 1.1., "Terminology", pg 6, RFC 9535 + See section 1.1., "Terminology", pg 6, RFC 9535 Children (of a node): If the node is an array, the nodes of its elements; if the node is an object, the nodes of its member values. If the node is neither an array nor an object, it has no children. - If caller intends to recurse further into each returned child node, caller is responsible + If the caller intends to recurse further into each returned child node, the caller is responsible for checking that these children have not been seen yet to prevent a cycle and infinite recursion. 
- """ child_nodes: list[VNode] = [] + instance_ids: dict[int, VNode] = {id(value_node.jvalue):value_node} + if isinstance(value_node.jvalue, JSON_STRUCTURED_TYPES): base_path = value_node.jpath if isinstance(value_node.jvalue, JSON_ARRAY_TYPES): for index, element in enumerate(value_node.jvalue): element_path = NormalizedJPath(f"{base_path}[{index}]") - child_nodes.append(VNode(element_path, element, value_node.root_value, value_node.node_depth + 1)) + vnode = VNode(element_path, element, value_node.root_value, value_node.node_depth + 1) + if id(element) in instance_ids: + _logger.warning(f"Circular reference cycle detected: current node: {vnode} already included as: {instance_ids[id(element)]}") + print(f"\n+++Circular reference cycle detected: current node: {vnode} already included as: {instance_ids[id(element)]}") + continue + if isinstance(element, JSON_STRUCTURED_TYPES): + instance_ids[id(element)] = vnode + child_nodes.append(vnode) elif isinstance(value_node.jvalue, JSON_OBJECT_TYPES): for member_name, member_value in value_node.jvalue.items(): element_path = NormalizedJPath(f"{base_path}['{member_name}']") - child_nodes.append(VNode(element_path, member_value, value_node.root_value, value_node.node_depth + 1)) + vnode = VNode(element_path, member_value, value_node.root_value, value_node.node_depth + 1) + if id(member_value) in instance_ids: + _logger.warning(f"Circular reference cycle detected: current node: {vnode} already included as: {instance_ids[id(member_value)]}") + print(f"\n+++Circular reference cycle detected: current node: {vnode} already included as: {instance_ids[id(member_value)]}") + continue + if isinstance(member_value, JSON_STRUCTURED_TYPES): + instance_ids[id(member_value)] = vnode + child_nodes.append(vnode) return VNodeList(child_nodes) - def visit_DescendantSegment_new_in_progress(self, node: SegmentNode, context: Context) -> RuntimeResult: - """Descendants (of a node): The children of the node, together with the children of its 
children, - and so forth recursively. More formally, the "descendants" relation between nodes is the transitive closure - of the "children" relation. - - Children (of a node): If the node is an array, the nodes of its elements; if the node is an object, - the nodes of its member values. If the node is neither an array nor an object, it has no children. - - """ - - rt_res = RuntimeResult() - input_nodelist: VNodeList = context.get_symbol(SEGMENT_INPUT_NODELIST_KEY) - # we collect the descendants of each node in the input_nodelist. - - segment_result_nodes: list[VNode] = [] - for input_node in input_nodelist: - ... - - return rt_res.success(VNodeList(segment_result_nodes)) def visit_DescendantSegment(self, node: SegmentNode, context: Context) -> RuntimeResult: - """Descendant segment creates input nodelist from all input nodes and all descendants of input nodes. + """Descendant segment creates an input nodelist from all input nodes and all descendants of input nodes. Drill down into all children of all nodes in the input VNodeList and apply all selectors to produce a concatenated output VNodeList 2.5.2.2. Semantics, pg 43 RFC 9535 @@ -292,10 +289,15 @@ def visit_DescendantSegment(self, node: SegmentNode, context: Context) -> Runtim # where D1_vnode is current_d1_vnode. di_vnode_sequence: VNodeList = self._collect_vnodes_and_their_descendants(current_d1_vnode) + instance_ids: dict[int, VNode] = {} # keeps track of instance ids to detect circular references + # Inner loop: "For each i such that 1 <= i <= n" (iterating through D1...Dn) # Let's call the current node from this sequence 'di_from_sequence'. 
+ di_from_sequence: VNode for di_from_sequence in di_vnode_sequence: - # Ri is the result of applying child segment [] to di_from_sequence + instance_ids[id(di_from_sequence.jvalue)] = di_from_sequence + + # Ri is the result of applying the child segment [] to di_from_sequence # Create a temporary VNodeList containing only di_from_sequence, # as selectors expect a VNodeList as input. @@ -309,16 +311,19 @@ def visit_DescendantSegment(self, node: SegmentNode, context: Context) -> Runtim # Apply this individual selector to temp_input_for_selectors descendant_segment_context = Context("", context, node.position ) - descendant_segment_context.set_symbol(SEGMENT_INPUT_NODELIST_KEY, temp_input_for_selectors) current_selector_output: VNodeList = rt_res.register(self.visit(selector, descendant_segment_context)) if rt_res.error: return rt_res if current_selector_output: # VNodeList might be empty - ri_for_this_di.extend(current_selector_output.node_list) - - # Now, ri_for_this_di is complete (it's Ri). + # exclude cycle nodes + cycle_nodes = [node for node in current_selector_output.node_list if not id(node.jvalue) in instance_ids] + # ri_for_this_di.extend(current_selector_output.node_list) + ri_for_this_di.extend(cycle_nodes) + + + # Now, ri_for_this_di is complete (it's Ri). # Add it to the list that accumulates R values for the current_d1_vnode. r_values_for_current_d1.extend(ri_for_this_di) @@ -330,25 +335,7 @@ def visit_DescendantSegment(self, node: SegmentNode, context: Context) -> Runtim # After processing all nodes from the input_vnodelist, # overall_segment_results contains the final concatenated list. return rt_res.success(VNodeList(overall_segment_results)) - - def visit_DescendantSegment_new(self, ast_node: SegmentNode, context: Context) -> RuntimeResult: - rt_res = RuntimeResult() - input_nodelist: VNodeList = context.get_symbol(SEGMENT_INPUT_NODELIST_KEY) - if input_nodelist is None: - # todo after testing convert this exception to a logging error msg. 
- raise ValueError(f"No input nodelist '{SEGMENT_INPUT_NODELIST_KEY}' found in Context for {ast_node}") - # This list will hold the final aggregated results for the entire segment - segment_output_list: list[VNode] = [] - for current_node in input_nodelist: - descent_result_list: list[VNode] = [] - # nodes are visited before their descendants - for selector in ast_node.selectors: - selector_output:VNodeList = rt_res.register(self.visit(selector, context)) - if rt_res.error: return rt_res - descent_result_list.extend(selector_output.node_list) - - return rt_res.success(VNodeList(segment_output_list)) def visit_SegmentNode(self, node: SegmentNode, context: Context) -> RuntimeResult: """A segment contains one or more Selectors""" @@ -402,23 +389,6 @@ def visit_WildcardSelectorNode(self, node: WildcardSelectorNode, context: Contex input_node: VNode for input_node in input_nodelist: output_nodelist.extend(self._children_of(input_node)) - # if not isinstance(input_node.jvalue, (*JSON_ARRAY_TYPES, *JSON_OBJECT_TYPES) ): - # continue # Nothing is selected for JSON primitives. 
- # if isinstance(input_node.jvalue, JSON_ARRAY_TYPES): - # # copy all list elements to output_nodelist - # jpath = input_node.jpath - # jvalue_list = input_node.jvalue - # for index, child_element in enumerate(jvalue_list): - # new_node = VNode( NormalizedJPath( f"{jpath}[{index}]" ), child_element) - # output_nodelist.append(new_node) - # - # elif isinstance(input_node.jvalue, JSON_OBJECT_TYPES): - # # copy all dict member values to output_nodelist - # jpath = input_node.jpath - # jvalue_dict = input_node.jvalue - # for member_name, member_value in jvalue_dict.items(): - # new_node = VNode( NormalizedJPath( f"{jpath}['{member_name}']" ), member_value) - # output_nodelist.append(new_node) return rt_res.success(VNodeList(output_nodelist)) @@ -443,25 +413,7 @@ def visit_SliceSelectorNode(self, node: SliceSelectorNode, context: Context) -> jvalue_list = input_node.jvalue length:int = len(jvalue_list) if length == 0 or current_slice_obj.step == 0: continue - # see 2.3.4.2.2. Normative Semantics, pg 23, RFC 9535 - # step_normal: int = self._step if self._step is not None else JPathBNFConstants.instance().STEP_DEFAULT - # start_default: int = 0 if step_normal >= 0 else length - 1 - # end_default: int = length if step_normal >= 0 else -length - 1 - # - # nindxs = slice_bounds(self._start or start_default, - # self._end or end_default, - # step_normal, - # length) - # #todo if we are creating slices of multiple arrays of the same size, we might want to cache these slice values - # #todo investigate just using self instance slice values with a Python slice object. It might implement - # # the same normalization algorithm as above, but it should be faster since it's in C code - # result_elements = input_node[slice(nindxs.lower, nindxs.upper, nindxs.step)] - # # each of these results is a separate result node value, which is protentially another list or dict - - # Define the slice object using attributes from the node - # self._start, self._end can be None. 
self._step can be None (defaulting to 1). - # current_slice_obj = node.slice_op - # Get the actual start, stop, and step for this array's length + # See 2.3.4.2.2. Normative Semantics, pg 23, RFC 9535 actual_start, actual_stop, actual_step = current_slice_obj.indices(length) # Iterate using the original indices for original_idx in range(actual_start, actual_stop, actual_step): @@ -630,7 +582,7 @@ def visit_FilterSelectorNode(self, node: FilterSelectorNode, context: Context) - def visit_RelativeQueryNode(self, node: RelativeQueryNode, context: Context) -> RuntimeResult: - """Return the resulting output VNodeList in RuntimeResult.value + """Return the resulting output VNodeList in RuntimeResult.value """ rt_res = RuntimeResult() @@ -643,9 +595,8 @@ def visit_RelativeQueryNode(self, node: RelativeQueryNode, context: Context) -> input_nodelist: VNodeList = VNodeList([current_node]) if segments.is_empty(): - # no segments in the relative query, so just return the current node + # no segments in the relative query, so we just return the current node return rt_res.success(input_nodelist) - #return rt_res.success(BooleanValue.value_for(True).set_context(context).set_pos(node.position.copy())) """Evaluating all these segments in a chain from input-> output VNodeLists result in a final output nodelist. This node list is evaluated for truthiness as an existence test or to compare to another logical_expr @@ -661,17 +612,9 @@ def visit_RelativeQueryNode(self, node: RelativeQueryNode, context: Context) -> continue # todo should we break here instead of just continue? What do we get by skipping a segment? 
if not isinstance(segment_output, VNodeList): raise TypeError(f"visit_RelativeQueryNode: visit seg_node returned Unsupported value type '{type(segment_output)}'") - input_nodelist = segment_output # output of this segment becomes input for next segment + input_nodelist = segment_output # output of this segment becomes input for the next segment - # apply existence test here and return BooleanValue - # we know that segments is not empty here, so this is safe: - ouput_pos = Position(node.position.text, segments[0].position.start, segments[-1].position.end) # type: ignore return rt_res.success(segment_output) - # - # if segment_output: - # return rt_res.success(BooleanValue.value_for(True).set_context(context).set_pos(ouput_pos)) - # else: - # return rt_res.success(BooleanValue.value_for(False).set_context(context).set_pos(ouput_pos)) def visit_RelativeSingularQueryNode(self, node: RelativeSingularQueryNode, context: Context) -> RuntimeResult: @@ -681,7 +624,7 @@ def visit_RelativeSingularQueryNode(self, node: RelativeSingularQueryNode, conte current_node: VNode = context.get_symbol(CURRENT_NODE_KEY) segments = node.segments if segments.is_empty(): - # if no segments we just return the current node + # if no segments, we just return the current node return rt_res.success(VNodeList([current_node])) input_nodelist: VNodeList = VNodeList([current_node]) @@ -692,7 +635,7 @@ def visit_RelativeSingularQueryNode(self, node: RelativeSingularQueryNode, conte seg_context.set_symbol(SEGMENT_INPUT_NODELIST_KEY, input_nodelist) segment_ouput = rt_res.register(self.visit(seg_node, seg_context)) if rt_res.error: return rt_res # todo we're not supposd to raise errors, just ignore this segment? 
- input_nodelist = segment_ouput # output of this segment becomes input for next segment + input_nodelist = segment_ouput # output of this segment becomes input for the next segment return rt_res.success(segment_ouput) @@ -704,10 +647,9 @@ def visit_AbsoluteSingularQueryNode(self, node: AbsoluteSingularQueryNode, conte if rt_res.error: return rt_res segments = node.segments if segments.is_empty(): - # if no segments we just return the current node + # if no segments, we just return the current node return rt_res.success(input_nodelist) - #input_nodelist: VNodeList = VNodeList([root_value]) segment_ouput: VNodeList = input_nodelist # default, in case there are no segments for seg_node in node.segments: # creating a new Context provides a local scope for the segment visit @@ -715,7 +657,7 @@ def visit_AbsoluteSingularQueryNode(self, node: AbsoluteSingularQueryNode, conte seg_context.set_symbol(SEGMENT_INPUT_NODELIST_KEY, input_nodelist) segment_ouput = rt_res.register(self.visit(seg_node, seg_context)) if rt_res.error: return rt_res # todo we're not supposd to raise errors, just ignore this segment? - input_nodelist = segment_ouput # output of this segment becomes input for next segment + input_nodelist = segment_ouput # output of this segment becomes input for the next segment return rt_res.success(segment_ouput) @@ -725,19 +667,19 @@ def visit_FunctionCallNode(self, node: FunctionCallNode, context: Context) -> Ru if not isinstance(node, FunctionCallNode): raise TypeError(f"Expected FunctionCallNode, got '{type(node)}'") - # we must evaluate each argument, and call the function's eval method for it. + # we must evaluate each argument and call the function's eval method for it. func_args = node.func_args arg: FunctionArgument func_name = node.func_node.func_name func_context = Context(f"{func_name}()", context, context.parent_entry_pos) - # we could add the argument to the context and let the function retrieve values from the context. 
but, here + # we could add the argument to the context and let the function retrieve values from the context. but here # we just call directly with the kwargs dict. call_args: list[Any] = [] for fa in func_args: arg = cast(FunctionArgument, fa) arg_value = rt_res.register(self.visit(arg.arg_node, func_context)) if rt_res.error: raise ValueError(rt_res.error) - # we need logic to translate the fa to a compatible type for the function. Some functions need a VNodeList, + # We need logic to translate the fa to a compatible type for the function. Some functions need a VNodeList, # some just want the singular value from the VNode in a 1-item VNodeList. call_args.append(arg_value) @@ -751,8 +693,8 @@ def visit_FunctionCallNode(self, node: FunctionCallNode, context: Context) -> Ru func_value = node.func_node.eval(**kwargs) # method call happens here return rt_res.success(func_value) - - + + # noinspection GrazieInspection def visit_UnaryOpNode(self, node: UnaryOpNode, context: Context) -> RuntimeResult: """The only use of this node is to invert the truth value of a logical_expr in a test_expr and paren_expr: @@ -801,10 +743,10 @@ def get_comparable_value( self, context ) return None, err - # Check if it's a EvaluatorValue wrapper (like NumberValue) or a raw JSON_ValueType + # Check if it's an EvaluatorValue wrapper (like NumberValue) or a raw JSON_ValueType elif isinstance(eval_result, (EvaluatorValue, *JSON_VALUE_TYPES)): return eval_result, None - # If the result of a sub-expression was already a LogicalType (e.g. from another comparison or NOT) + # If the result of a sub-expression was already a LogicalType (e.g., from another comparison or NOT) elif isinstance(eval_result, LogicalType): # For comparison, we'd use its underlying bool if comparing against another bool. # For AND/OR, we use LogicalType directly. 
@@ -861,8 +803,8 @@ def visit_BinaryOpNode(self, node: BinaryOpNode, context: Context) -> RuntimeRes def visit_RepetitionNode(self, node: RepetitionNode, context: Context) -> RuntimeResult: """We use a RepetitionNode for evaluating logical_and_expr and logical_or_expr in this context. This is to - avoid having to create another ASTNode for each, however, that might make more sense. This is in-progress. - Also, we implement these as a list of nodes instead of nested binary nodes so we can short circuit evaluation + avoid having to create another ASTNode for each, however, that might make more sense. This is in progress. + Also, we implement these as a list of nodes instead of nested binary nodes so we can short-circuit evaluation at the top level of node evaluation. """ rt_res = RuntimeResult() @@ -894,6 +836,7 @@ def visit_RepetitionNode(self, node: RepetitionNode, context: Context) -> Runtim break # short circuit else: current_bool_result = current_bool_result or bool_for_node if current_bool_result else bool_for_node + # noinspection PySimplifyBooleanCheck if current_bool_result == True: break # short circuit @@ -945,7 +888,7 @@ def visit_IdentifierNode(self, node: IdentifierNode, context: Context ) -> Runti if not isinstance(node, IdentifierNode): raise TypeError(f"Unsupported type '{type(node.node_type)}', expected IdentifierNode") - # we have to add quotes, because the StringValue expects parsed strings to be quoted, + # we have to add quotes because the StringValue expects parsed strings to be quoted, # thus it removes the first and last characters of the argument string when saving the string's value. 
str_value = f"'{node.value}'" id_str = StringValue(str_value).set_context(context).set_pos(node.position) @@ -966,7 +909,7 @@ class SliceBounds(NamedTuple): def normalize_slice_parameter(slice_parameter: int, array_len: int ) -> int: - """Slice expreession parameters `start` and `end` are not directly usable as slice bounds and must first be + """Slice expression parameters `start` and `end` are not directly usable as slice bounds and must first be normalized. (RCF 9535 page 23). """ diff --git a/killerbunny/evaluating/value_nodes.py b/killerbunny/evaluating/value_nodes.py index c90578c..5b053c8 100644 --- a/killerbunny/evaluating/value_nodes.py +++ b/killerbunny/evaluating/value_nodes.py @@ -11,8 +11,8 @@ """ import json import logging -from typing import Iterator, Generator, override - +from typing import Iterator, Generator +from typing_extensions import override from killerbunny.evaluating.evaluator_types import EvaluatorValue, NormalizedJPath from killerbunny.parsing.helper import unescape_string_content from killerbunny.shared.json_type_defs import JSON_ValueType @@ -70,6 +70,10 @@ def __repr__(self) -> str: repr_str = f"VNode(jpath={self._jpath}, json_value={repr(self._json_value)})" return repr_str + def __str__(self) -> str: + return f"{self._jpath.jpath_str}, {self._json_value}" + + @override def __eq__(self, other: object) -> bool: if not isinstance(other, VNode): @@ -112,9 +116,7 @@ def __hash__(self) -> int: raise TypeError(f"VNode instances with unhashable jvalue (type: {type(self._json_value).__name__}) cannot be hashed.") - - - +# noinspection SpellCheckingInspection class VNodeList: """This class holds a list of VNodes: normalized JSON Paths and the JSON values to which they refer. It functions as a list-like container, supporting common sequence operations such as iteration @@ -140,8 +142,8 @@ def empty_instance(cls) -> 'VNodeList': def node_list(self) -> list[VNode]: """Return the list of VNodes managed by this VNodeList. 
- Note: this class is itself a list-like container, so you can also use this VNodeList instance itself - in iteration contexts without needing to call this method. + Note: this class is itself a list-like container, so you can also use this VNodeList instance itself + in iteration contexts without needing to call this method. """ return self._node_list @@ -169,7 +171,7 @@ def copy(self) -> 'VNodeList': def pretty_print(self, flag: str = '') -> None: """Display the VNodes of this instance in a more user-friendly format, one line per VNode. - If flag =='-l', pretty prints the value with json.dumps(jvalue, indent==2)""" + If `flag` == '-l', pretty prints the value with json.dumps(jvalue, indent==2)""" if len(self) == 0 : print("VNodeList is empty.") return @@ -199,6 +201,7 @@ def value_generator() -> Generator[JSON_ValueType, None, None]: def paths(self) -> Iterator[NormalizedJPath]: """Return an iterator over the NormalizedJpath paths of this VNodeList. """ def paths_generator() -> Generator[NormalizedJPath, None, None]: + vnode: VNode for vnode in self: yield vnode.jpath return paths_generator() @@ -249,10 +252,6 @@ def clear(self) -> None: """Remove all items from the list.""" self._node_list.clear() - -# # Defined here instead of json_type_defs.py to prefent circular reference -# NodesType: TypeAlias = VNodeList - class NumberValue(EvaluatorValue): @@ -263,8 +262,8 @@ def __init__(self, value: int | float) -> None: super().__init__() self._value = value - @override @property + @override def value(self) -> int | float: return self._value @@ -288,8 +287,8 @@ def __init__(self, value: str) -> None: member_name = unescape_string_content(content_to_unescape) self._value = member_name - @override @property + @override def value(self) -> str: return self._value @@ -315,9 +314,10 @@ def value_for(bool_expr: bool) -> 'BooleanValue': else: return BooleanValue(False) - @override @property + @override def value(self) -> bool: + """Return the internal boolean value.""" return 
self._value def __repr__(self) -> str: @@ -333,11 +333,11 @@ def __bool__(self) -> bool: @override def __eq__(self, other: object) -> bool: """ - Compares this BooleanValue for equality with another object. + Compare this BooleanValue for equality with another object. - - If 'other' is a BooleanValue, compares their underlying boolean states. - - If 'other' is a Python bool, compares this BooleanValue's state to it. - - For any other type, compares this BooleanValue's state to the + - If 'other' is a BooleanValue, compare their underlying boolean states. + - If 'other' is a Python bool, compare this BooleanValue's state to it. + - For any other type, compare this BooleanValue's state to the truthiness of 'other' (i.e., bool(other)). Returns True if they are considered equal under these rules, False otherwise. @@ -351,6 +351,7 @@ def __eq__(self, other: object) -> bool: # For any other type, compare our boolean value to the truthiness of 'other'. # This fulfills "compare itself to any type that supports the concept of 'truthiness'". + # noinspection PyBroadException try: # Compare our internal boolean state to the truthiness of the other object. 
# Python's bool() will use __bool__ if available, then __len__, @@ -365,6 +366,7 @@ def __eq__(self, other: object) -> bool: def negate(self) -> 'BooleanValue': """Returns the logically opposite BooleanValue instance.""" + # noinspection PySimplifyBooleanCheck if self._value == True: return BooleanValue(False).set_pos(self.position).set_context(self.context) else: # self.value == False here @@ -376,15 +378,15 @@ class NullValue(EvaluatorValue): def __init__(self) -> None: super().__init__() - @override + def __str__(self) -> str: + return "null" + @property + @override def value(self) -> None: return None def __repr__(self) -> str: return f"NullValue(value={None})" - def __str__(self) -> str: - return "null" - \ No newline at end of file diff --git a/tests/incubator/jsonpointer/test_json_pointer.py b/tests/incubator/jsonpointer/test_json_pointer.py index 1e60f5e..50d20c2 100644 --- a/tests/incubator/jsonpointer/test_json_pointer.py +++ b/tests/incubator/jsonpointer/test_json_pointer.py @@ -52,11 +52,6 @@ def test_resolve_json_pointer(filename, json_obj, path, expected_value): assert actual_value == expected_value, f"{actual_value=}\n{expected_value=}" -# @pytest.fixture(scope="module") -# def json_object_fixture() -> dict: -# jobj =load_obj_from_json_file(JSON_FILES_DIR /"json.1.json") -# return jobj - @pytest.fixture(scope="module") def json_object_fixture(json_files_dir_fixture: Path, load_obj_from_json_file_func) -> dict: jobj_path = json_files_dir_fixture / "json.1.json" diff --git a/tests/jpath/evaluating/test_root_value_cycles.py b/tests/jpath/evaluating/test_root_value_cycles.py new file mode 100644 index 0000000..3e7cb70 --- /dev/null +++ b/tests/jpath/evaluating/test_root_value_cycles.py @@ -0,0 +1,250 @@ +# File: test_root_value_cycles.py +# Copyright (c) 2025 Robert L. Ross +# All rights reserved. +# Open-source license to come. +# Created by: Robert L. Ross +# + +"""Tests the evaluator with cyclic data. Cycles cannot be (easily?) 
represented in JSON text but could easily happen +when generating nested data programmatically. Cycles can be used as an attack vector to crash the evaluator with +a stack overflow error. """ +from typing import Any + +import pytest +from _pytest.logging import LogCaptureFixture + +from killerbunny.evaluating.value_nodes import VNodeList +from killerbunny.evaluating.well_formed_query import WellFormedValidQuery +from killerbunny.shared.json_type_defs import JSON_ValueType + +@pytest.fixture(scope="module", autouse=True) +def child_dict_cycle() -> JSON_ValueType: + parent_dict: dict[str, JSON_ValueType] = { "one": 1 } + dict_cycle = parent_dict + parent_dict["cycle"] = dict_cycle + return parent_dict + +@pytest.fixture(scope="module", autouse=True) +def grandchild_dict_cycle() -> JSON_ValueType: + nested_dict: dict[str, JSON_ValueType] = { "apple": "a", "banana": "b", "orange": "o" } + parent_dict: dict[str, JSON_ValueType] = { "one": 1, "two": 2 , "nested":nested_dict } + dict_cycle = parent_dict + nested_dict["cycle"] = dict_cycle + return parent_dict + +@pytest.fixture(scope="module", autouse=True) +def child_list_cycle() -> JSON_ValueType: + parent_list: list[Any] = [1] + list_cycle = parent_list + parent_list.append(list_cycle) + return parent_list + +@pytest.fixture(scope="module", autouse=True) +def grandchild_list_cycle() -> JSON_ValueType: + """a list with a cycle in an element list""" + nested_list: list[Any] = [ 1, 2, 3] + parent_list: list[Any] = [ "one", "two", nested_list] # nested_list will contain the parent list as an element + list_cycle = parent_list + nested_list.append(list_cycle) + return parent_list + +def log_msg_assert(msg: str, caplog: LogCaptureFixture) -> None: + """Helper method to ensure the logging message in `msg` was logged as expected.""" + assert len(caplog.records) > 0 + record = caplog.records[0] + msgs = [record.msg for record in caplog.records] + assert record.levelname == "WARNING" + assert msg in msgs + + +def 
test_cs_dict(child_dict_cycle: JSON_ValueType, caplog: LogCaptureFixture) -> None: + jpath_query_str = '$[*]' + query = WellFormedValidQuery.from_str(jpath_query_str) + + node_list:VNodeList = query.eval(child_dict_cycle) + expected_values = [1] # eval will not include a cycle in the result, so the cyclic parent_dict is not included + + actual_values = list(node_list.values()) + assert len(actual_values) == len(expected_values) + assert actual_values[0] == expected_values[0] + + actual_paths = [npath.jpath_str for npath in node_list.paths() ] + assert actual_paths == ["$['one']"] + + log_msg_assert( + "Circular reference cycle detected: current node: $['cycle'], {'one': 1, 'cycle': {...}} already included as: $, {'one': 1, 'cycle': {...}}", + caplog + ) + +# noinspection SpellCheckingInspection +def test_ds_dict(child_dict_cycle: JSON_ValueType, caplog: LogCaptureFixture)-> None: + jpath_query_str = '$..[*]' + query = WellFormedValidQuery.from_str(jpath_query_str) + + node_list:VNodeList = query.eval(child_dict_cycle) + + expected_values = [1] # eval will not include a cycle in the result, so the cyclic parent_dict is not included + + actual_values = list(node_list.values()) + print(f"\nactual_values = {actual_values}") + assert len(actual_values) == len(expected_values) + assert actual_values[0] == expected_values[0] + + actual_paths = [npath.jpath_str for npath in node_list.paths() ] + assert actual_paths == ["$['one']"] + + log_msg_assert( + "Circular reference cycle detected: current node: $['cycle'], {'one': 1, 'cycle': {...}} already included as: $, {'one': 1, 'cycle': {...}}", + caplog + ) + +def test_cs_dict_grandchild(grandchild_dict_cycle: JSON_ValueType, caplog: LogCaptureFixture) -> None: + jpath_query_str = '$[*]' + query = WellFormedValidQuery.from_str(jpath_query_str) + + node_list:VNodeList = query.eval(grandchild_dict_cycle) + expected_values = [1, 2, { "apple": "a", "banana": "b", "orange": "o", "cycle": grandchild_dict_cycle}] + + actual_values = 
list(node_list.values()) + assert len(actual_values) == len(expected_values) + assert actual_values[0] == expected_values[0] + assert actual_values[1] == expected_values[1] + assert actual_values[2] == expected_values[2] + + actual_paths = [npath.jpath_str for npath in node_list.paths() ] + assert actual_paths == ["$['one']", "$['two']", "$['nested']"] + +def test_ds_dict_grandchild(grandchild_dict_cycle: JSON_ValueType, caplog: LogCaptureFixture) -> None: + jpath_query_str = '$..[*]' + query = WellFormedValidQuery.from_str(jpath_query_str) + + node_list:VNodeList = query.eval(grandchild_dict_cycle) + expected_values = [1, 2, { "apple": "a", "banana": "b", "orange": "o", "cycle": grandchild_dict_cycle}, 'a', 'b', 'o'] + + actual_values = list(node_list.values()) + assert len(actual_values) == len(expected_values) + assert actual_values[0] == expected_values[0] + assert actual_values[1] == expected_values[1] + assert actual_values[2] == expected_values[2] + assert actual_values[3] == expected_values[3] + assert actual_values[4] == expected_values[4] + assert actual_values[5] == expected_values[5] + + actual_paths = [npath.jpath_str for npath in node_list.paths() ] + expected_paths = ["$['one']", + "$['two']", + "$['nested']", + "$['nested']['apple']", + "$['nested']['banana']", + "$['nested']['orange']"] + assert actual_paths == expected_paths + + log_msg_assert( + "Circular reference cycle detected: current node: $['nested']['cycle'], " + "{'one': 1, 'two': 2, 'nested': {'apple': 'a', 'banana': 'b', 'orange': 'o', " + "'cycle': {...}}} already included as: $, {'one': 1, 'two': 2, 'nested': " + "{'apple': 'a', 'banana': 'b', 'orange': 'o', 'cycle': {...}}}", + caplog + ) + +def test_cs_list_grandchild(grandchild_list_cycle: JSON_ValueType, caplog: LogCaptureFixture) -> None: + """There is a cycle from the parent list to its last element, which is a list that includes + the parent list as a member. 
+ This query will not cause infinite recursion because it only processes the children of the parent""" + jpath_query_str = '$[*]' + query = WellFormedValidQuery.from_str(jpath_query_str) + + node_list:VNodeList = query.eval(grandchild_list_cycle) + # eval will not include a cycle in the result, so the cyclic parent_list is not included in the nested list + expected_values = ["one", "two", [1,2,3, grandchild_list_cycle]] + + actual_values = list(node_list.values()) + assert len(actual_values) == len(expected_values) + actual_values_str = f"{actual_values}" + expected_values_str = f"['one', 'two', [1, 2, 3, ['one', 'two', [...]]]]" + assert actual_values_str == expected_values_str + + assert actual_values[0] == expected_values[0] + assert actual_values[1] == expected_values[1] + assert actual_values[2] == expected_values[2] + + actual_paths = [npath.jpath_str for npath in node_list.paths() ] + assert actual_paths == ['$[0]', '$[1]', '$[2]'] + +def test_ds_list_grandchild(grandchild_list_cycle: JSON_ValueType, caplog: LogCaptureFixture) -> None: + """There is a cycle from the parent list to its last element, which is a list that includes + the parent list as a member.Cyclic parent is not included in as a result node. 
+ """ + jpath_query_str = '$..[*]' + query = WellFormedValidQuery.from_str(jpath_query_str) + + node_list:VNodeList = query.eval(grandchild_list_cycle) + # eval will not include a cycle in the result, so the cyclic parent_list is not included in the nested list + expected_values = ["one", "two", [1,2,3, grandchild_list_cycle], 1, 2, 3] + + actual_values = list(node_list.values()) + actual_values_str = f"{actual_values}" + expected_values_str = f"['one', 'two', [1, 2, 3, ['one', 'two', [...]]], 1, 2, 3]" + + assert actual_values_str == expected_values_str + + assert len(actual_values) == len(expected_values) + assert actual_values[0] == expected_values[0] + assert actual_values[1] == expected_values[1] + assert actual_values[2] == expected_values[2] + assert actual_values[3] == expected_values[3] + assert actual_values[4] == expected_values[4] + assert actual_values[5] == expected_values[5] + + actual_paths = [npath.jpath_str for npath in node_list.paths() ] + expected_paths = ['$[0]', '$[1]', '$[2]', '$[2][0]', '$[2][1]', '$[2][2]'] + assert actual_paths == expected_paths + + log_msg_assert( + "Circular reference cycle detected: current node: $[2][3], ['one', 'two', [1, " + "2, 3, [...]]] already included as: $, ['one', 'two', [1, 2, 3, [...]]]", + caplog + ) + + +def test_cs_list(child_list_cycle: JSON_ValueType, caplog: LogCaptureFixture) -> None: + jpath_query_str = '$[*]' + query = WellFormedValidQuery.from_str(jpath_query_str) + + node_list:VNodeList = query.eval(child_list_cycle) + expected_values = [1] # eval will not include a cycle in the result, so the cyclic parent_list is not included + + actual_values = list(node_list.values()) + assert len(actual_values) == len(expected_values) + assert actual_values[0] == expected_values[0] + + actual_paths = [npath.jpath_str for npath in node_list.paths() ] + assert actual_paths == ['$[0]'] + + log_msg_assert( + "Circular reference cycle detected: current node: $[1], [1, [...]] already included as: $, [1, [...]]", + 
caplog + ) + +# noinspection SpellCheckingInspection +def test_ds_list(child_list_cycle: JSON_ValueType, caplog: LogCaptureFixture)-> None: + jpath_query_str = '$..[*]' + query = WellFormedValidQuery.from_str(jpath_query_str) + + node_list:VNodeList = query.eval(child_list_cycle) + + expected_values = [1] # eval will not include a cycle in the result, so the cyclic parent_list is not included + + actual_values = list(node_list.values()) + assert len(actual_values) == len(expected_values) + assert actual_values[0] == expected_values[0] + + actual_paths = [npath.jpath_str for npath in node_list.paths() ] + assert actual_paths == ['$[0]'] + + log_msg_assert( + "Circular reference cycle detected: current node: $[1], [1, [...]] already included as: $, [1, [...]]", + caplog + ) + + \ No newline at end of file From b48f32dc0ae77522224712c03f6f4ed6cd385e54 Mon Sep 17 00:00:00 2001 From: killeroonie Date: Thu, 26 Jun 2025 03:30:45 -0700 Subject: [PATCH 03/11] Deleted file as no longer needed. --- .../incubator/jsonpointer/refactoring_bnf.txt | 296 ------------------ 1 file changed, 296 deletions(-) delete mode 100644 killerbunny/incubator/jsonpointer/refactoring_bnf.txt diff --git a/killerbunny/incubator/jsonpointer/refactoring_bnf.txt b/killerbunny/incubator/jsonpointer/refactoring_bnf.txt deleted file mode 100644 index 2d5cf36..0000000 --- a/killerbunny/incubator/jsonpointer/refactoring_bnf.txt +++ /dev/null @@ -1,296 +0,0 @@ -Part 1: Refactoring JPathBNFConstants to use Class Variables for Grammar Rules -Yes, you can absolutely transform your -__init__ method into a class method that declares and defines class variables. -This is a good approach if JPathBNFConstants is meant to be a stateless container for these grammar rules, -and you want these rules to be associated with the class itself rather than individual (singleton) instances. 
-Here's how you can refactor JPathBNFConstants: -1.Define Simple Constants Directly: Basic constants like SLASH, SINGLE_QUOTE, DIGITS can remain -as directly defined class variables. -2.Annotate Complex Class Variables: For the more complex regex patterns that will be built, -add class-level type annotations (e.g., HEX_CHAR: str). - -3.Create _init_grammar_patterns(cls) Class Method:•Move all the logic from your current __init__ method -that constructs the regex patterns into this new class method. -•Change all self.VARIABLE_NAME = ... assignments to cls.VARIABLE_NAME = .... -•This method will use other class variables (e.g., cls.DIGIT_CHAR_SET, cls.NON_SURROGATE) to build more complex ones. - -4.Ensure One-Time Initialization per Class:•Use a dictionary _grammar_patterns_initialized_for_class to track -if _init_grammar_patterns has been run for a specific class (JPathBNFConstants or its subclasses). -•Create a class method _ensure_grammar_initialized(cls) that checks this flag - and calls cls._init_grammar_patterns() under a lock if needed. - -5.Modify instance(cls) Class Method: -•This method should first call cls._ensure_grammar_initialized(). -•Then, it should manage a per-class singleton instance cache (_instances) as before. -The instance itself will be stateless regarding the grammar rules. - -6.Empty __init__(self): The instance __init__ method will likely become empty (pass) -as all grammar state is now at the class level. - - - - -import re -import threading -from typing import Pattern # Removed NamedTuple as it's not used in this snippet - - -# Helper functions (assumed to be at module level as they are) -# pattern_str, concat, alternatives, star_rep, etc. 
- -class JPathBNFConstants: - # Basic characters / Simple constants defined directly as class attributes - SLASH = SOLIDUS = chr(0x2F) - BACKSLASH = REVERSE_SOLIDUS = chr(0x5C) - SINGLE_QUOTE = chr(0x27) - DOUBLE_QUOTE = chr(0x22) - ESC = BACKSLASH - UNDERSCORE = '_' - COMMA = ',' - LEFT_PAREN = '(' - RIGHT_PAREN = ')' - LEFT_BRACKET = '[' - RIGHT_BRACKET = ']' - # ... (all other simple string constants) ... - DIGITS = '0123456789' - DIGIT_CHAR_SET = '[0-9]' - DIGITS1 = '123456789' - DIGIT1_CHAR_SET = '[1-9]' - BLANK_CHAR = f"{chr(0x20)}{chr(0x09)}{chr(0x0A)}{chr(0x0D)}" - SPACES = f"(?:[{BLANK_CHAR}]*)" - - # Escaped versions - BACKSLASH_ESC = re.escape(BACKSLASH) - # ... (all other _ESC constants) ... - - # Annotate complex patterns that will be initialized by _init_grammar_patterns - # These will be set as class attributes. - NON_SURROGATE: str - HIGH_SURROGATE: str - LOW_SURROGATE: str - HEX_CHAR: str - FUNCTION_NAME_FIRST: str - FUNCTION_NAME_CHAR: str - FUNCTION_NAME: str - NON_SURROGATE_CODEPOINTS: str - UNESCAPED_CHAR: str - ESCAPABLE_CHAR: str - SINGLE_QUOTED: str - DOUBLE_QUOTED: str - STRING_LITERAL_DOUBLE_QUOTEABLE: str - STRING_LITERAL_DQ: str - STRING_LITERAL_SINGLE_QUOTEABLE: str - STRING_LITERAL_SQ: str - STRING_LITERAL: str - LITERAL: str - INT: str - START: str - END: str - STEP: str - SLICE_CHARS: str - EXPONENT: str - FRACTION: str - NUMBER: str - _2HEXDIGITS: str # If these are used by _init_grammar_patterns, they should be init'd or simple - _3HEXDIGITS: str # Same as above - - INDEX_SELECTOR: str - SLICE_SELECTOR: str - NAME_SELECTOR: str - INDEX_SEGMENT: str - NAME_FIRST: str - NAME_CHAR: str - MEMBER_NAME_SHORTHAND: str - NAME_SEGMENT: str - SINGULAR_QUERY_SEGMENTS: str - ABSOLUTE_SINGULAR_QUERY: str - RELATIVE_SINGULAR_QUERY: str - SINGULAR_QUERY: str - - # Non-terminals (recursive ones will remain as string comments or conceptual) - FILTER_SELECTOR_DEF: str # Using _DEF to distinguish from the simple constant - SELECTOR_DEF: str - 
BRACKETED_SELECTION_DEF: str - DESCENDANT_SEGMENT_DEF: str - CHILD_SEGMENT_DEF: str - SEGMENT_DEF: str - SEGMENTS_DEF: str - RELATIVE_QUERY_DEF: str - JSON_PATH_QUERY_DEF: str - FILTER_QUERY_DEF: str - LOGICAL_EXPRESSION_DEF: str # Placeholder for recursive parts - FUNCTION_EXPRESSION_DEF: str - FUNCTION_ARGUMENT_DEF: str - TEST_EXPRESSION_DEF: str - COMPARABLE_DEF: str - COMPARISON_OP_DEF: str - COMPARISON_EXPRESSION_DEF: str - PAREN_EXPRESSION_DEF: str - BASIC_EXPRESSION_DEF: str - LOGICAL_AND_EXPRESSION_DEF: str - LOGICAL_OR_EXPRESSION_DEF: str - - _grammar_patterns_initialized_for_class = {} # Key: class, Value: bool - _init_patterns_lock = threading.Lock() # Lock for initializing patterns - - @classmethod - def _init_grammar_patterns(cls): - """Initializes the complex grammar regex patterns as class variables for `cls`.""" - # These are used by other patterns, ensure they are set if not simple constants - cls._3HEXDIGITS = f"{cls.HEXDIGITS}{{{cls.LEFT_BRACE}3{cls.RIGHT_BRACE}}}" # Assuming HEXDIGITS is simple - cls._2HEXDIGITS = f"{cls.HEXDIGITS}{{{cls.LEFT_BRACE}2{cls.RIGHT_BRACE}}}" - - # Terminals (not strictly terminals but these can be parsed without further recursion) - cls.NON_SURROGATE = f"(?:[0-9a-cA-C]{cls._3HEXDIGITS})|(?:[dD][0-7]{cls._2HEXDIGITS})|(?:[eEfF]{cls._3HEXDIGITS})" - cls.HIGH_SURROGATE = f"(?:[dD][89aAbB]{cls._2HEXDIGITS})" - cls.LOW_SURROGATE = f"(?:[dD][c-fC-F]{cls._2HEXDIGITS})" - cls.HEX_CHAR = f"(?:(?:{cls.NON_SURROGATE})|(?:{cls.HIGH_SURROGATE}{cls.BACKSLASH_ESC}u{cls.LOW_SURROGATE}))" - - cls.FUNCTION_NAME_FIRST = cls.ALPHA_LC - cls.FUNCTION_NAME_CHAR = alternatives([cls.FUNCTION_NAME_FIRST, cls.UNDERSCORE, cls.DIGIT_CHAR_SET]) - cls.FUNCTION_NAME = concat([cls.FUNCTION_NAME_FIRST, star_rep(cls.FUNCTION_NAME_CHAR)]) - - cls.NON_SURROGATE_CODEPOINTS = r'[\x80-\xFF\u0100-\uD7FF\uE000-\U0010FFFF]' - cls.UNESCAPED_CHAR = rf'(?:(?:[\x20\x21\x23-\x26\x28-\x5B\x5D-\x7F])|(?:{cls.NON_SURROGATE_CODEPOINTS}))' - cls.ESCAPABLE_CHAR = 
rf'(?:(?:[bfnrt{cls.SLASH}{cls.BACKSLASH_ESC}])|(?:u{cls.HEX_CHAR}))' - cls.SINGLE_QUOTED = rf"(?:{cls.UNESCAPED_CHAR}|{cls.DOUBLE_QUOTE}|(?:{cls.BACKSLASH_ESC}{cls.SINGLE_QUOTE})|(?:{cls.BACKSLASH_ESC}{cls.ESCAPABLE_CHAR}))" - cls.DOUBLE_QUOTED = rf"(?:{cls.UNESCAPED_CHAR}|{cls.SINGLE_QUOTE}|(?:{cls.BACKSLASH_ESC}{cls.DOUBLE_QUOTE})|(?:{cls.BACKSLASH_ESC}{cls.ESCAPABLE_CHAR}))" - cls.STRING_LITERAL_DOUBLE_QUOTEABLE = f"(?P{cls.DOUBLE_QUOTED}*)" - cls.STRING_LITERAL_DQ = f"(?:{cls.DOUBLE_QUOTE}{cls.STRING_LITERAL_DOUBLE_QUOTEABLE}{cls.DOUBLE_QUOTE})" - cls.STRING_LITERAL_SINGLE_QUOTEABLE = f"(?P{cls.SINGLE_QUOTED}*)" - cls.STRING_LITERAL_SQ = f"(?:{cls.SINGLE_QUOTE}{cls.STRING_LITERAL_SINGLE_QUOTEABLE}{cls.SINGLE_QUOTE})" - cls.STRING_LITERAL = alternatives([cls.STRING_LITERAL_SQ, cls.STRING_LITERAL_DQ]) - - cls.INT = f'(?:0|-?{cls.DIGIT1_CHAR_SET}{cls.DIGIT_CHAR_SET}*)' - cls.START = cls.INT - cls.END = cls.INT - cls.STEP = cls.INT - cls.SLICE_CHARS = f"{cls.DIGITS}:-" - cls.EXPONENT = f"[eE][-+]?{cls.DIGIT_CHAR_SET}+" - cls.FRACTION = rf"\.{cls.DIGIT_CHAR_SET}+" - cls.NUMBER = f'(?P(?P{cls.INT}|-0)(?P{cls.FRACTION})?(?P{cls.EXPONENT})?)' - - cls.LITERAL = rf"{cls.NUMBER}|{cls.STRING_LITERAL}|{cls.TRUE}|{cls.FALSE}|{cls.NULL}" - - cls.INDEX_SELECTOR = cls.INT - cls.SLICE_SELECTOR = f"(?:(?:(?P{cls.START}){cls.SPACES})?{cls.COLON}{cls.SPACES}(?P{cls.END})?{cls.SPACES}(?:{cls.COLON}(?:{cls.SPACES}(?P{cls.STEP}))?)?)" - cls.NAME_SELECTOR = cls.LITERAL # Note: was cls.STRING_LITERAL, but grammar says literal. For name selector, it's specifically string_literal. 
- - cls.INDEX_SEGMENT = rf"\[{cls.INDEX_SELECTOR}\]" - cls.NAME_FIRST = alternatives([cls.ALPHA, cls.UNDERSCORE, cls.NON_SURROGATE_CODEPOINTS]) - cls.NAME_CHAR = alternatives([cls.NAME_FIRST, cls.DIGIT_CHAR_SET, ]) - cls.MEMBER_NAME_SHORTHAND = concat([cls.NAME_FIRST, star_rep(cls.NAME_CHAR)]) - cls.NAME_SEGMENT = rf"(?:\[{cls.NAME_SELECTOR}\])|(?:\.{cls.MEMBER_NAME_SHORTHAND})" # NAME_SELECTOR here should be string_literal for the bracketed form. - - cls.SINGULAR_QUERY_SEGMENTS = rf"(?:{cls.SPACES}(?:{cls.NAME_SEGMENT}|{cls.INDEX_SEGMENT}))*" - cls.ABSOLUTE_SINGULAR_QUERY = rf"{cls.ROOT_IDENTIFIER}{cls.SINGULAR_QUERY_SEGMENTS}" - cls.RELATIVE_SINGULAR_QUERY = rf"{cls.CURRENT_NODE_IDENTIFIER}{cls.SINGULAR_QUERY_SEGMENTS}" - cls.SINGULAR_QUERY = rf"{cls.RELATIVE_SINGULAR_QUERY}|{cls.ABSOLUTE_SINGULAR_QUERY}" - - # Non-terminals (conceptual definitions for recursive parts) - # These are illustrative and won't be directly usable as full regexes if recursive - cls.LOGICAL_EXPRESSION_DEF = "self.LOGICAL_OR_EXPRESSION_DEF" # Conceptual link - - cls.FILTER_SELECTOR_DEF = rf"{cls.QUESTION}{cls.SPACES}{cls.LOGICAL_EXPRESSION_DEF}" - - cls.SELECTOR_DEF = alternatives([ - cls.NAME_SELECTOR, # This should be STRING_LITERAL for name_selector - cls.WILDCARD_SELECTOR, - cls.SLICE_SELECTOR, - cls.INDEX_SELECTOR, - cls.FILTER_SELECTOR_DEF, - ]) - - cls.BRACKETED_SELECTION_DEF = rf"\[{cls.SPACES}{cls.SELECTOR_DEF}(?:{cls.SPACES}{cls.COMMA}{cls.SPACES}{cls.SELECTOR_DEF})*{cls.SPACES}\]" # added * for multiple selectors - cls.DESCENDANT_SEGMENT_DEF = rf"\.\.{alternatives([ - cls.BRACKETED_SELECTION_DEF, - cls.WILDCARD_SELECTOR, - cls.MEMBER_NAME_SHORTHAND, - ])}" - cls.CHILD_SEGMENT_DEF = alternatives([ - cls.BRACKETED_SELECTION_DEF, - rf"(?:\.(?:{cls.WILDCARD_SELECTOR}|{cls.MEMBER_NAME_SHORTHAND}))"]) - cls.SEGMENT_DEF = alternatives([cls.CHILD_SEGMENT_DEF, cls.DESCENDANT_SEGMENT_DEF]) - cls.SEGMENTS_DEF = star_rep(f"{cls.SPACES}{cls.SEGMENT_DEF}") - - cls.RELATIVE_QUERY_DEF = 
concat([cls.CURRENT_NODE_IDENTIFIER, cls.SEGMENTS_DEF]) - cls.JSON_PATH_QUERY_DEF = concat([cls.ROOT_IDENTIFIER, cls.SEGMENTS_DEF]) - - cls.FILTER_QUERY_DEF = alternatives( - [cls.RELATIVE_QUERY_DEF, cls.JSON_PATH_QUERY_DEF]) # Corrected: alternatives - - # ... (Continue for other _DEF conceptual patterns like FUNCTION_EXPRESSION_DEF, etc.) - - cls._grammar_patterns_initialized_for_class[cls] = True - - @classmethod - def _ensure_grammar_initialized(cls): - if not cls._grammar_patterns_initialized_for_class.get(cls, False): - with cls._init_patterns_lock: - if not cls._grammar_patterns_initialized_for_class.get(cls, False): - # If a superclass in MRO also has this mechanism and isn't initialized, - # initialize it first. This allows subclasses to build upon initialized superclass patterns. - for base in reversed(cls.__mro__[1:]): # Exclude 'object' and 'cls' itself initially - if hasattr(base, "_ensure_grammar_initialized") and \ - isinstance(getattr(base, "_ensure_grammar_initialized"), classmethod) and \ - not cls._grammar_patterns_initialized_for_class.get(base, False): - # Call the superclass's ensure method. - # This assumes that _ensure_grammar_initialized on the base - # will correctly initialize 'base's patterns. - base._ensure_grammar_initialized() - - cls._init_grammar_patterns() # Initialize patterns for the current class 'cls' - - _instances = {} # Key: class, Value: instance of that class - _instance_creation_lock = threading.Lock() - - @classmethod - def instance(cls) -> 'JPathBNFConstants': - cls._ensure_grammar_initialized() # Ensure class attributes are set for 'cls' - - if cls not in cls._instances: - with cls._instance_creation_lock: - if cls not in cls._instances: - cls._instances[cls] = cls() # Calls __init__ - return cls._instances[cls] - - def __init__(self): - # __init__ is now empty as all state is at the class level. - # The instance() method ensures class-level patterns are initialized. 
- pass - -PART TWO: SUBLCLASSES OF BNF -How it Works: -1.Define Overrides in Subclass: The subclass defines its version of the "leaf" or base regex string constants as class attributes. - - class RelaxedJPathBNF(JPathBNFConstants): - # Override a simple constant used in INT pattern construction - # Original JPathBNFConstants.INT uses "-?" - # Let's say we want to allow "+" as well for integers. - # We need to identify which base component contributes to the sign. - # JPathBNFConstants.INT = f'(?:0|-?{JPathBNFConstants.DIGIT1_CHAR_SET}{JPathBNFConstants.DIGIT_CHAR_SET}*)' - # We can redefine the whole INT pattern: - INT = f'(?:0|[-+]?{JPathBNFConstants.DIGIT1_CHAR_SET}{JPathBNFConstants.DIGIT_CHAR_SET}*)' - - # Example: Allow trailing comma in bracketed selection (conceptual) - # This would require modifying how BRACKETED_SELECTION_DEF is built, - # perhaps by overriding a COMMA_RULE used by alternatives() or concat(). - # For simplicity, let's focus on overriding a base pattern like INT. - -2.Inherited _init_grammar_patterns Uses Subclass's Values:•When RelaxedJPathBNF.instance() is called, it eventually calls RelaxedJPathBNF._ensure_grammar_initialized().•This, in turn, calls RelaxedJPathBNF._init_grammar_patterns() (which is inherited from JPathBNFConstants if not overridden by RelaxedJPathBNF).•Inside _init_grammar_patterns(cls), where cls is now RelaxedJPathBNF, expressions like cls.INT will resolve to RelaxedJPathBNF.INT (the overridden version).•Therefore, more complex patterns like cls.NUMBER or cls.INDEX_SELECTOR that are built using cls.INT will automatically use the relaxed definition from RelaxedJPathBNF. -3.No Need to Override _init_grammar_patterns (Ideally): If JPathBNFConstants._init_grammar_patterns is written carefully to always use cls.SOME_COMPONENT to build cls.SOME_LARGER_PATTERN, then subclasses only need to override the SOME_COMPONENT class variables. 
They wouldn't need to override the _init_grammar_patterns method itself unless the logic of how patterns are combined changes, not just the constituent parts. - -Example of _ensure_grammar_initialized for superclasses (Refined): The logic for initializing superclasses first in _ensure_grammar_initialized can be tricky. A simpler model is that _init_grammar_patterns for a given class cls is responsible for defining all necessary grammar patterns for cls. If it needs to refer to a pattern from a superclass as it was defined for the superclass, it would use super().PATTERN_NAME or BaseClassName.PATTERN_NAME. If it uses cls.PATTERN_NAME, it gets the version potentially overridden by cls or its subclasses.For your use case, where a subclass primarily redefines leaf patterns, the _init_grammar_patterns inherited from JPathBNFConstants should work correctly when called with cls being the subclass, as it will pick up the subclass's overridden leaf patterns. The superclass initialization in _ensure_grammar_initialized might be simplified or removed if each class's _init_grammar_patterns is self-contained.Let's simplify _ensure_grammar_initialized assuming _init_grammar_patterns for cls sets everything cls needs, using cls.LEAF_PATTERN which might be overridden: - - @classmethod - def _ensure_grammar_initialized(cls): - if not cls._grammar_patterns_initialized_for_class.get(cls, False): - with cls._init_patterns_lock: - if not cls._grammar_patterns_initialized_for_class.get(cls, False): - # If this class is a subclass, its _init_grammar_patterns might - # implicitly use/override constants from its superclasses. - # The key is that _init_grammar_patterns is called ONCE for this specific cls. - cls._init_grammar_patterns() - - -The _DEF suffixed variables are good for conceptual clarity for the recursive parts of the grammar that can't be fully represented by a single regex. 
\ No newline at end of file From 3cbb0e4994985d56e68f06c7908d12f2de5233bb Mon Sep 17 00:00:00 2001 From: killeroonie Date: Thu, 26 Jun 2025 03:42:21 -0700 Subject: [PATCH 04/11] Stop tracking python-killerbunny.iml --- python-killerbunny.iml | 19 ------------------- 1 file changed, 19 deletions(-) delete mode 100644 python-killerbunny.iml diff --git a/python-killerbunny.iml b/python-killerbunny.iml deleted file mode 100644 index a60b472..0000000 --- a/python-killerbunny.iml +++ /dev/null @@ -1,19 +0,0 @@ - - - - - - - - - - - - - - - - - \ No newline at end of file From 1377be70a50fab0a84cc50b4f9646d67329b318f Mon Sep 17 00:00:00 2001 From: killeroonie Date: Thu, 26 Jun 2025 03:44:17 -0700 Subject: [PATCH 05/11] Update gitignore --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 9714c5d..d436810 100644 --- a/.gitignore +++ b/.gitignore @@ -33,7 +33,7 @@ coverage.xml ### IntelliJ IDEA ### out/ .idea/ # JetBrains IDEs (PyCharm, IntelliJ) -.iml +*.iml .vscode/ # Visual Studio Code (can sometimes share launch.json/settings.json if desired) *.sublime-* # Sublime Text nbproject/ # NetBeans From 9691b709ce2834d46872aac5a2b5830e52d94460 Mon Sep 17 00:00:00 2001 From: killeroonie Date: Thu, 26 Jun 2025 03:45:31 -0700 Subject: [PATCH 06/11] Update gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index d436810..c43f129 100644 --- a/.gitignore +++ b/.gitignore @@ -64,3 +64,4 @@ logs/ wordspy_home/ +/.idea/inspectionProfiles/Project_Default.xml From 5df518dad6d1b76fe9edc4537c09d0c911d45699 Mon Sep 17 00:00:00 2001 From: killeroonie Date: Thu, 26 Jun 2025 14:23:49 -0700 Subject: [PATCH 07/11] Update gitignore --- .gitignore | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/.gitignore b/.gitignore index c43f129..f7b2355 100644 --- a/.gitignore +++ b/.gitignore @@ -34,6 +34,9 @@ coverage.xml out/ .idea/ # JetBrains IDEs (PyCharm, IntelliJ) *.iml 
+*.iws +*.ipr + .vscode/ # Visual Studio Code (can sometimes share launch.json/settings.json if desired) *.sublime-* # Sublime Text nbproject/ # NetBeans @@ -58,10 +61,3 @@ secrets.* # Log files *.log logs/ - - -# local data and config home directory -wordspy_home/ - - -/.idea/inspectionProfiles/Project_Default.xml From e58e16cc65997f73681548bd2410f64f8ec98487 Mon Sep 17 00:00:00 2001 From: killeroonie Date: Thu, 26 Jun 2025 14:45:47 -0700 Subject: [PATCH 08/11] Removed comments at end of lines as these are not supported in .gitignore. --- .gitignore | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/.gitignore b/.gitignore index f7b2355..e398cc1 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ +# Reminder: .gitignore does not support comments at the end of a line. + # Python bytecode and caches __pycache__/ *.py[cod] @@ -32,20 +34,20 @@ coverage.xml # IDEs / Editors ### IntelliJ IDEA ### out/ -.idea/ # JetBrains IDEs (PyCharm, IntelliJ) +.idea/ *.iml *.iws *.ipr -.vscode/ # Visual Studio Code (can sometimes share launch.json/settings.json if desired) -*.sublime-* # Sublime Text -nbproject/ # NetBeans -*.swp # Vim swap files -*~ # Backup files (common convention) +.vscode/ +*.sublime-* +nbproject/ +*.swp +*~ # Operating System files -.DS_Store # macOS -Thumbs.db # Windows +.DS_Store +Thumbs.db # Secrets (use with caution, prefer dedicated secret management) @@ -56,7 +58,7 @@ Thumbs.db # Windows *.pem credentials.* secrets.* -*.env # If used for secrets +*.env # Log files *.log From 77e5e3188309b797b814fb6c62dfb849c2b94118 Mon Sep 17 00:00:00 2001 From: killeroonie Date: Thu, 26 Jun 2025 15:55:31 -0700 Subject: [PATCH 09/11] WIP. Committing and pushing now to test the pre-push hook which should run all unit tests, and since they currently fail, should prevent the push. 
--- killerbunny/evaluating/evaluator.py | 29 ++++++------ .../evaluating/test_root_value_cycles.py | 45 ++++++++++++++++++- 2 files changed, 56 insertions(+), 18 deletions(-) diff --git a/killerbunny/evaluating/evaluator.py b/killerbunny/evaluating/evaluator.py index 51bdddd..6fde366 100644 --- a/killerbunny/evaluating/evaluator.py +++ b/killerbunny/evaluating/evaluator.py @@ -206,42 +206,39 @@ def _collect_vnodes_and_their_descendants(self, return VNodeList(collected_vnodes) - def _children_of(self, value_node: VNode) -> VNodeList: + def _children_of(self, parent_node: VNode) -> VNodeList: """Return the children of the argument node. See section 1.1., "Terminology", pg 6, RFC 9535 Children (of a node): If the node is an array, the nodes of its elements; if the node is an object, the nodes of its member values. If the node is neither an array nor an object, it has no children. - If the caller intends to recurse further into each returned child node, the caller is responsible - for checking that these children have not been seen yet to prevent a cycle and infinite recursion. + This method will detect a cycle when a child of the parent node `parent_node` is the same instance as the + parent node, as determined by calling id() on the parent and child nodes. In this case, that child will not + be included in the retrned VNodeList. 
""" child_nodes: list[VNode] = [] - instance_ids: dict[int, VNode] = {id(value_node.jvalue):value_node} + instance_ids: dict[int, VNode] = {id(parent_node.jvalue):parent_node} - if isinstance(value_node.jvalue, JSON_STRUCTURED_TYPES): - base_path = value_node.jpath - if isinstance(value_node.jvalue, JSON_ARRAY_TYPES): - for index, element in enumerate(value_node.jvalue): + if isinstance(parent_node.jvalue, JSON_STRUCTURED_TYPES): + base_path = parent_node.jpath + if isinstance(parent_node.jvalue, JSON_ARRAY_TYPES): + for index, element in enumerate(parent_node.jvalue): element_path = NormalizedJPath(f"{base_path}[{index}]") - vnode = VNode(element_path, element, value_node.root_value, value_node.node_depth + 1) + vnode = VNode(element_path, element, parent_node.root_value, parent_node.node_depth + 1) if id(element) in instance_ids: _logger.warning(f"Circular reference cycle detected: current node: {vnode} already included as: {instance_ids[id(element)]}") print(f"\n+++Circular reference cycle detected: current node: {vnode} already included as: {instance_ids[id(element)]}") continue - if isinstance(element, JSON_STRUCTURED_TYPES): - instance_ids[id(element)] = vnode child_nodes.append(vnode) - elif isinstance(value_node.jvalue, JSON_OBJECT_TYPES): - for member_name, member_value in value_node.jvalue.items(): + elif isinstance(parent_node.jvalue, JSON_OBJECT_TYPES): + for member_name, member_value in parent_node.jvalue.items(): element_path = NormalizedJPath(f"{base_path}['{member_name}']") - vnode = VNode(element_path, member_value, value_node.root_value, value_node.node_depth + 1) + vnode = VNode(element_path, member_value, parent_node.root_value, parent_node.node_depth + 1) if id(member_value) in instance_ids: _logger.warning(f"Circular reference cycle detected: current node: {vnode} already included as: {instance_ids[id(member_value)]}") print(f"\n+++Circular reference cycle detected: current node: {vnode} already included as: {instance_ids[id(member_value)]}") 
continue - if isinstance(member_value, JSON_STRUCTURED_TYPES): - instance_ids[id(member_value)] = vnode child_nodes.append(vnode) return VNodeList(child_nodes) diff --git a/tests/jpath/evaluating/test_root_value_cycles.py b/tests/jpath/evaluating/test_root_value_cycles.py index 3e7cb70..eec98ab 100644 --- a/tests/jpath/evaluating/test_root_value_cycles.py +++ b/tests/jpath/evaluating/test_root_value_cycles.py @@ -215,10 +215,13 @@ def test_cs_list(child_list_cycle: JSON_ValueType, caplog: LogCaptureFixture) -> expected_values = [1] # eval will not include a cycle in the result, so the cyclic parent_list is not included actual_values = list(node_list.values()) + actual_paths = [npath.jpath_str for npath in node_list.paths() ] + + print(f"\nactual_paths: {actual_paths}, actual_values: {actual_values}") + assert len(actual_values) == len(expected_values) assert actual_values[0] == expected_values[0] - actual_paths = [npath.jpath_str for npath in node_list.paths() ] assert actual_paths == ['$[0]'] log_msg_assert( @@ -246,5 +249,43 @@ def test_ds_list(child_list_cycle: JSON_ValueType, caplog: LogCaptureFixture)-> "Circular reference cycle detected: current node: $[1], [1, [...]] already included as: $, [1, [...]]", caplog ) + +def test_ds_deeply_nested() -> None: + ... 
+ + +def test_duplicated_member_list_no_nesting() -> None: + """If we have a member used multiple times but not nested, this should not prevent inclusion""" + shared_list = [ "one", "two", "three"] + parent_list = [ shared_list, shared_list, shared_list ] + + jpath_query_str = '$[*]' + query = WellFormedValidQuery.from_str(jpath_query_str) + node_list:VNodeList = query.eval(parent_list) + + actual_values = list(node_list.values()) + actual_paths = [npath.jpath_str for npath in node_list.paths() ] + + expected_values: list[Any] = [['one', 'two', 'three'], ['one', 'two', 'three'], ['one', 'two', 'three']] + expected_paths = ['$[0]', '$[1]', '$[2]'] + + assert actual_values == expected_values + assert actual_paths == expected_paths + +def test_duplicated_member_dict_no_nesting() -> None: + """If we have a member used multiple times but not nested, this should not prevent inclusion""" + shared_dict = { "one": "one", "two": "two", "three": "three" } + parent_dict = { "first": shared_dict, "second": shared_dict, "third": shared_dict } + + jpath_query_str = '$[*]' + query = WellFormedValidQuery.from_str(jpath_query_str) + node_list:VNodeList = query.eval(parent_dict) + + actual_values = list(node_list.values()) + actual_paths = [npath.jpath_str for npath in node_list.paths() ] + + expected_values: list[Any] = [['one', 'two', 'three'], ['one', 'two', 'three'], ['one', 'two', 'three']] + expected_paths = ['$[0]', '$[1]', '$[2]'] - \ No newline at end of file + assert actual_values == expected_values + assert actual_paths == expected_paths \ No newline at end of file From a63a51e1a0a87697e6f5063d0752ab213ad72d7f Mon Sep 17 00:00:00 2001 From: killeroonie Date: Thu, 26 Jun 2025 18:40:31 -0700 Subject: [PATCH 10/11] Final tests and implementation changes to guard against infinite recursion in cyclic data graphs. 
--- killerbunny/evaluating/evaluator.py | 8 +- .../evaluating/test_root_value_cycles.py | 125 ++++++++++++++++-- 2 files changed, 120 insertions(+), 13 deletions(-) diff --git a/killerbunny/evaluating/evaluator.py b/killerbunny/evaluating/evaluator.py index 6fde366..6d8e8f3 100644 --- a/killerbunny/evaluating/evaluator.py +++ b/killerbunny/evaluating/evaluator.py @@ -227,8 +227,8 @@ def _children_of(self, parent_node: VNode) -> VNodeList: element_path = NormalizedJPath(f"{base_path}[{index}]") vnode = VNode(element_path, element, parent_node.root_value, parent_node.node_depth + 1) if id(element) in instance_ids: - _logger.warning(f"Circular reference cycle detected: current node: {vnode} already included as: {instance_ids[id(element)]}") - print(f"\n+++Circular reference cycle detected: current node: {vnode} already included as: {instance_ids[id(element)]}") + _logger.warning(f"Circular reference cycle detected: child node: {vnode} same as parent: {instance_ids[id(element)]}") + #print(f"\n+++Circular reference cycle detected: current node: {vnode} already included as: {instance_ids[id(element)]}") continue child_nodes.append(vnode) elif isinstance(parent_node.jvalue, JSON_OBJECT_TYPES): @@ -236,8 +236,8 @@ def _children_of(self, parent_node: VNode) -> VNodeList: element_path = NormalizedJPath(f"{base_path}['{member_name}']") vnode = VNode(element_path, member_value, parent_node.root_value, parent_node.node_depth + 1) if id(member_value) in instance_ids: - _logger.warning(f"Circular reference cycle detected: current node: {vnode} already included as: {instance_ids[id(member_value)]}") - print(f"\n+++Circular reference cycle detected: current node: {vnode} already included as: {instance_ids[id(member_value)]}") + _logger.warning(f"Circular reference cycle detected: child node: {vnode} same as parent: {instance_ids[id(member_value)]}") + #print(f"\n+++Circular reference cycle detected: current node: {vnode} already included as: {instance_ids[id(member_value)]}") 
continue child_nodes.append(vnode) diff --git a/tests/jpath/evaluating/test_root_value_cycles.py b/tests/jpath/evaluating/test_root_value_cycles.py index eec98ab..49292fd 100644 --- a/tests/jpath/evaluating/test_root_value_cycles.py +++ b/tests/jpath/evaluating/test_root_value_cycles.py @@ -8,7 +8,7 @@ """Tests the evaluator with cyclic data. Cycles cannot be (easily?) represented in JSON text but could easily happen when generating nested data programmatically. Cycles can be used as an attack vector to crash the evaluator with a stack overflow error. """ -from typing import Any +from typing import Any, cast import pytest from _pytest.logging import LogCaptureFixture @@ -53,6 +53,7 @@ def log_msg_assert(msg: str, caplog: LogCaptureFixture) -> None: assert len(caplog.records) > 0 record = caplog.records[0] msgs = [record.msg for record in caplog.records] + print(f"\nmsgs={msgs}") assert record.levelname == "WARNING" assert msg in msgs @@ -72,7 +73,8 @@ def test_cs_dict(child_dict_cycle: JSON_ValueType, caplog: LogCaptureFixture) -> assert actual_paths == ["$['one']"] log_msg_assert( - "Circular reference cycle detected: current node: $['cycle'], {'one': 1, 'cycle': {...}} already included as: $, {'one': 1, 'cycle': {...}}", + "Circular reference cycle detected: child node: $['cycle'], {'one': 1, " + "'cycle': {...}} same as parent: $, {'one': 1, 'cycle': {...}}", caplog ) @@ -225,7 +227,7 @@ def test_cs_list(child_list_cycle: JSON_ValueType, caplog: LogCaptureFixture) -> assert actual_paths == ['$[0]'] log_msg_assert( - "Circular reference cycle detected: current node: $[1], [1, [...]] already included as: $, [1, [...]]", + "Circular reference cycle detected: child node: $[1], [1, [...]] same as parent: $, [1, [...]]", caplog ) @@ -250,9 +252,7 @@ def test_ds_list(child_list_cycle: JSON_ValueType, caplog: LogCaptureFixture)-> caplog ) -def test_ds_deeply_nested() -> None: - ... 
- + def test_duplicated_member_list_no_nesting() -> None: """If we have a member used multiple times but not nested, this should not prevent inclusion""" @@ -284,8 +284,115 @@ def test_duplicated_member_dict_no_nesting() -> None: actual_values = list(node_list.values()) actual_paths = [npath.jpath_str for npath in node_list.paths() ] - expected_values: list[Any] = [['one', 'two', 'three'], ['one', 'two', 'three'], ['one', 'two', 'three']] - expected_paths = ['$[0]', '$[1]', '$[2]'] + expected_values: list[Any] = [{'one': 'one', 'three': 'three', 'two': 'two'}, + {'one': 'one', 'three': 'three', 'two': 'two'}, + {'one': 'one', 'three': 'three', 'two': 'two'} + ] + expected_paths = ["$['first']", "$['second']", "$['third']"] + + assert actual_values == expected_values + assert actual_paths == expected_paths + +@pytest.fixture(scope="module", autouse=True) +def doubly_nested_cycle() -> JSON_ValueType: + shared_list: list[Any] = [ "1", "2", "3"] + shared_dict: dict[str, Any] = { "one": "one", "two": "two", "three": "three" } + parent_list: list[Any] = [ "a", "b", "c", shared_list, shared_dict ] + shared_list.append(shared_dict) + shared_dict["shared_list"] = shared_list + return parent_list + +def test_cs_list_doubly_nested(doubly_nested_cycle: JSON_ValueType, caplog: LogCaptureFixture) -> None: + jpath_query_str = '$[*]' + query = WellFormedValidQuery.from_str(jpath_query_str) + node_list:VNodeList = query.eval(doubly_nested_cycle) + + actual_values = list(node_list.values()) + actual_values_str = f"{actual_values}" + actual_paths = [npath.jpath_str for npath in node_list.paths() ] + + expected_values = doubly_nested_cycle + expected_values_str = ("['a', 'b', 'c', ['1', '2', '3', {'one': 'one', 'two': 'two', 'three': 'three', 'shared_list': [...]}], " + "{'one': 'one', 'two': 'two', 'three': 'three', 'shared_list': ['1', '2', '3', {...}]}]") + expected_paths: list[Any] = ['$[0]', '$[1]', '$[2]', '$[3]', '$[4]'] + + assert actual_values_str == expected_values_str + 
assert actual_values == expected_values + assert actual_paths == expected_paths + +def test_ds_list_doubly_nested(doubly_nested_cycle: JSON_ValueType, caplog: LogCaptureFixture) -> None: + root_value: list[Any] = cast(list[Any], doubly_nested_cycle) + + + jpath_query_str = '$..[*]' + query = WellFormedValidQuery.from_str(jpath_query_str) + node_list:VNodeList = query.eval(root_value) + actual_values = list(node_list.values()) + actual_values_str = f"{actual_values}" + actual_paths = [npath.jpath_str for npath in node_list.paths() ] + + expected_values: list[Any] = [ 'a', 'b', 'c', root_value[3], root_value[4], '1', '2', '3', root_value[4], 'one', 'two', 'three'] + + expected_values_str = ("['a', 'b', 'c', ['1', '2', '3', {'one': 'one', 'two': 'two', 'three': 'three', 'shared_list': [...]}], {'one': 'one', 'two': 'two', 'three': 'three', 'shared_list': ['1', '2', '3', {...}]}, " + "'1', '2', '3', {'one': 'one', 'two': 'two', 'three': 'three', 'shared_list': ['1', '2', '3', {...}]}, 'one', 'two', 'three']") + expected_paths: list[Any] = ['$[0]', '$[1]', '$[2]', '$[3]', '$[4]', '$[3][0]', '$[3][1]', '$[3][2]', '$[3][3]', "$[4]['one']", "$[4]['two']", "$[4]['three']"] + + assert actual_values_str == expected_values_str + assert actual_values == expected_values + assert actual_paths == expected_paths + + log_msg_assert( + "Circular reference cycle detected: current node: $[3][3], {'one': 'one', " + "'two': 'two', 'three': 'three', 'shared_list': ['1', '2', '3', {...}]} " + "already included as: $[4], {'one': 'one', 'two': 'two', 'three': 'three', " + "'shared_list': ['1', '2', '3', {...}]}", + caplog + ) + +def test_ds_list_deeply_nested(doubly_nested_cycle: JSON_ValueType, caplog: LogCaptureFixture) -> None: + root_value: list[Any] = cast(list[Any], doubly_nested_cycle) + root_value[4]["cycle"] = root_value # nested dict now has member reference to parent + + jpath_query_str = '$..[*]' + query = WellFormedValidQuery.from_str(jpath_query_str) + node_list:VNodeList = 
query.eval(root_value) + + actual_values = list(node_list.values()) + actual_values_str = f"{actual_values}" + actual_paths = [npath.jpath_str for npath in node_list.paths() ] + + expected_values: list[Any] = [ 'a', 'b', 'c', root_value[3], root_value[4], '1', '2', '3', root_value[4], 'one', 'two', 'three'] + + expected_values_str = ("['a', 'b', 'c', ['1', '2', '3', {'one': 'one', 'two': 'two', 'three': " + "'three', 'shared_list': [...], 'cycle': ['a', 'b', 'c', [...], {...}]}], " + "{'one': 'one', 'two': 'two', 'three': 'three', 'shared_list': ['1', '2', " + "'3', {...}], 'cycle': ['a', 'b', 'c', ['1', '2', '3', {...}], {...}]}, '1', " + "'2', '3', {'one': 'one', 'two': 'two', 'three': 'three', 'shared_list': " + "['1', '2', '3', {...}], 'cycle': ['a', 'b', 'c', ['1', '2', '3', {...}], " + "{...}]}, 'one', 'two', 'three']") + + expected_paths: list[Any] = ['$[0]', + '$[1]', + '$[2]', + '$[3]', + '$[4]', + '$[3][0]', + '$[3][1]', + '$[3][2]', + '$[3][3]', + "$[4]['one']", + "$[4]['two']", + "$[4]['three']"] + + assert actual_values_str == expected_values_str + assert actual_paths == expected_paths assert actual_values == expected_values - assert actual_paths == expected_paths \ No newline at end of file + + log_msg_assert( + "Circular reference cycle detected: current node: $[3][3], {'one': 'one', 'two': 'two', 'three': 'three', " + "'shared_list': ['1', '2', '3', {...}], 'cycle': ['a', 'b', 'c', ['1', '2', '3', {...}], {...}]} " + "already included as: $[4], {'one': 'one', 'two': 'two', 'three': 'three', " + "'shared_list': ['1', '2', '3', {...}], 'cycle': ['a', 'b', 'c', ['1', '2', '3', {...}], {...}]}", + caplog + ) \ No newline at end of file From decc9cecaf11b98e709c86956c3e533b10ff735c Mon Sep 17 00:00:00 2001 From: killeroonie Date: Thu, 26 Jun 2025 18:55:00 -0700 Subject: [PATCH 11/11] Fix import issue in value_nodes.py MyPy complains that `override` should be imported from the typing_extensions package, but Python docs show that `override` exists in 
typing as of 3.12. --- killerbunny/evaluating/value_nodes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/killerbunny/evaluating/value_nodes.py b/killerbunny/evaluating/value_nodes.py index 5b053c8..70359e0 100644 --- a/killerbunny/evaluating/value_nodes.py +++ b/killerbunny/evaluating/value_nodes.py @@ -12,7 +12,7 @@ import json import logging from typing import Iterator, Generator -from typing_extensions import override +from typing import override # type: ignore from killerbunny.evaluating.evaluator_types import EvaluatorValue, NormalizedJPath from killerbunny.parsing.helper import unescape_string_content from killerbunny.shared.json_type_defs import JSON_ValueType