From 398eb21277ad7783d5ac05818711d0f21d437316 Mon Sep 17 00:00:00 2001 From: Keigh Rim Date: Sat, 14 Feb 2026 17:09:29 -0500 Subject: [PATCH 1/3] Updated `mmif describe` implementation to be based on pydantic for better documentation --- build-tools/requirements.docs.txt | 3 +- documentation/conf.py | 9 + mmif/utils/cli/__init__.py | 4 +- mmif/utils/cli/describe.py | 88 ++++--- mmif/utils/workflow_helper.py | 414 ++++++++++++++++++------------ requirements.txt | 1 + 6 files changed, 312 insertions(+), 207 deletions(-) diff --git a/build-tools/requirements.docs.txt b/build-tools/requirements.docs.txt index 8d9ee33d..db2d03d8 100644 --- a/build-tools/requirements.docs.txt +++ b/build-tools/requirements.docs.txt @@ -1,3 +1,4 @@ -sphinx>=7.0,<8.0 +sphinx furo m2r2 +autodoc-pydantic diff --git a/documentation/conf.py b/documentation/conf.py index 121054ef..f309f548 100644 --- a/documentation/conf.py +++ b/documentation/conf.py @@ -33,6 +33,7 @@ 'undoc-members': True, 'show-inheritance': True, } +autodoc_member_order = 'bysource' # -- Project information ----------------------------------------------------- @@ -55,8 +56,16 @@ 'sphinx.ext.autodoc', 'sphinx.ext.linkcode', 'm2r2', + 'sphinxcontrib.autodoc_pydantic', ] +autodoc_pydantic_model_show_json = True +autodoc_pydantic_model_show_field_summary = True +autodoc_pydantic_model_show_config_summary = False +autodoc_pydantic_model_show_validator_members = False +autodoc_pydantic_model_show_validator_summary = False +autodoc_pydantic_field_list_validators = False + templates_path = ['_templates'] exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] # dynamically generated files diff --git a/mmif/utils/cli/__init__.py b/mmif/utils/cli/__init__.py index 9b91b60c..935ab0a7 100644 --- a/mmif/utils/cli/__init__.py +++ b/mmif/utils/cli/__init__.py @@ -28,6 +28,7 @@ def open_cli_io_arg(path_or_dash: Optional[str], manager. 
Handles the common CLI pattern where: + - '-' means stdin (read mode) or stdout (write mode) - None means "argument not provided"; when default_stdin=True, it falls back to stdin/stdout @@ -117,7 +118,8 @@ def open_cli_io_arg(path_or_dash: Optional[str], "Expected str or None." ) - yield file_handle + if file_handle is not None: + yield file_handle finally: if should_close and file_handle is not None: diff --git a/mmif/utils/cli/describe.py b/mmif/utils/cli/describe.py index 0bbd49a8..d921b329 100644 --- a/mmif/utils/cli/describe.py +++ b/mmif/utils/cli/describe.py @@ -1,17 +1,29 @@ import argparse import json -import os import sys import textwrap from pathlib import Path -from typing import Union, cast +from typing import Dict, Type, Union, cast + +from pydantic import BaseModel from mmif.utils.cli import open_cli_io_arg -from mmif.utils.workflow_helper import generate_workflow_identifier, describe_single_mmif, \ - describe_mmif_collection -# gen_param_hash is imported for backward compatibility -from mmif.utils.workflow_helper import generate_param_hash +from mmif.utils.workflow_helper import generate_param_hash # noqa: F401 (re-exported for backward compatibility) +from mmif.utils.workflow_helper import ( + CollectionMmifDesc, + SingleMmifDesc, + describe_mmif_collection, + describe_single_mmif, + generate_workflow_identifier, +) + +models_to_help = [SingleMmifDesc, CollectionMmifDesc] +model_modules = set(model.__module__ for model in models_to_help) +def get_all_models() -> Dict[str, Type[BaseModel]]: + return { + model.__name__: model for model in models_to_help + } def get_pipeline_specs(mmif_file: Union[str, Path]): import warnings @@ -33,30 +45,11 @@ def describe_argparser(): 'collection of MMIF files.'
) - # get and clean docstrings - def _extract_describe_docstring(func): - doc = func.__doc__.split(':param')[0] - # then cut off all lines after `---` - doc = doc.split('---')[0] - return textwrap.dedent(doc).strip() - - single_doc = _extract_describe_docstring(describe_single_mmif) - collection_doc = _extract_describe_docstring(describe_mmif_collection) - additional = textwrap.dedent(f""" This command extracts workflow information from a single MMIF file or - summarizes a directory of MMIF files. The output is serialized as JSON and - includes: + a directory of MMIF files. The output is serialized as JSON. - ========================= - Single MMIF file as input - ========================= -{single_doc} - - ================================== - A directory of MMIF files as input - ================================== -{collection_doc} + Use `--help-schemas` to inspect the structure of the JSON output. """) return oneliner, additional @@ -67,6 +60,7 @@ def prep_argparser(**kwargs): formatter_class=argparse.RawDescriptionHelpFormatter, **kwargs ) + parser.add_argument( "MMIF_FILE", nargs="?", @@ -84,24 +78,43 @@ def prep_argparser(**kwargs): action="store_true", help="Pretty-print JSON output" ) + parser.add_argument( + "--help-schemas", + nargs="*", + choices=["all"] + [m.__name__ for m in models_to_help], + metavar="SCHEMA_NAME", + help=f"Print the JSON schema for the output. For human-readable documentation, " + f"visit https://clams.ai/mmif-python and see the following modules: " + f"{', '.join(model_modules)}.\nOptions: all, {', '.join([m.__name__ for m in models_to_help])}." + ) return parser def main(args): """ - Main entry point for the describe CLI command. 
- - Reads a MMIF file and outputs a JSON summary containing: - - - workflow_id: unique identifier for the source and app sequence - - stats: view counts, annotation counts (total/per-view/per-type), and lists of error/warning/empty view IDs - - views: map of view IDs to app configurations and profiling data - - :param args: Parsed command-line arguments + Main block for the describe CLI command. + This function basically works as a wrapper around + :func:`describe_single_mmif` (for single file input) or + :func:`describe_mmif_collection` (for directory input). """ + if hasattr(args, 'help_schemas') and args.help_schemas is not None: + models_map = {m.__name__: m for m in models_to_help} + to_show = [] + if len(args.help_schemas) == 0 or 'all' in args.help_schemas: + to_show = models_to_help + else: + to_show = args.help_schemas + + for name in to_show: + model_cls = models_map[name] + schema = model_cls.model_json_schema() + print(json.dumps(schema, indent=2)) + print() + sys.exit(0) + output = {} # if input is a directory - if isinstance(args.MMIF_FILE, (str, os.PathLike)) and Path(args.MMIF_FILE).is_dir(): + if args.MMIF_FILE is not None and Path(str(args.MMIF_FILE)).is_dir(): output = describe_mmif_collection(args.MMIF_FILE) # if input is a file or stdin else: @@ -125,6 +138,7 @@ def main(args): tmp_path.unlink() if output: + # `output` is already a plain dict: the describe_* helpers return model_dump(by_alias=True) with open_cli_io_arg(args.output, 'w', default_stdin=True) as output_file: json.dump(output, output_file, indent=2 if args.pretty else None) output_file.write('\n') diff --git a/mmif/utils/workflow_helper.py b/mmif/utils/workflow_helper.py index c73c0cd2..bdde664a 100644 --- a/mmif/utils/workflow_helper.py +++ b/mmif/utils/workflow_helper.py @@ -1,13 +1,16 @@ import datetime import hashlib -from collections import Counter, defaultdict -from pathlib import Path -from typing import List, Any, Tuple, Optional, Union import itertools -from mmif import Mmif +from collections import Counter +from pathlib import Path +from typing import Any, 
Dict, List, Literal, Optional, Tuple, Union, overload + +from pydantic import BaseModel, ConfigDict, Field +from mmif.serialize.mmif import Mmif, ViewsList -def group_views_by_app(views: List[Any]) -> List[List[Any]]: + +def group_views_by_app(views: ViewsList) -> List[List[Any]]: """ Groups views into app executions based on app and timestamp. @@ -93,9 +96,21 @@ def _read_mmif_from_path(mmif_input: Union[str, Path, Mmif]) -> Mmif: ) +@overload +def generate_workflow_identifier(mmif_input: Union[str, Path, Mmif], + return_param_dicts: Literal[True] + ) -> Tuple[str, List[dict]]: ... + + +@overload def generate_workflow_identifier(mmif_input: Union[str, Path, Mmif], - return_param_dicts=False) \ - -> Union[str, Tuple[str, List[dict]]]: + return_param_dicts: Literal[False] = False + ) -> str: ... + + +def generate_workflow_identifier(mmif_input: Union[str, Path, Mmif], + return_param_dicts: bool = False + ) -> Union[str, Tuple[str, List[dict]]]: """ Generate a workflow identifier string from a MMIF file or object. @@ -149,7 +164,53 @@ def generate_workflow_identifier(mmif_input: Union[str, Path, Mmif], return '/'.join(segments) -def _get_profile_data(view) -> dict: +## single MMIF summarization + +class SingleMmifStats(BaseModel): + """ + Aggregated statistics for a single MMIF file. 
+ """ + model_config = ConfigDict(populate_by_name=True) + + app_count: int = Field(..., alias="appCount", description="Total number of app executions identified.") + error_views: List[str] = Field(default_factory=list, alias="errorViews", description="List of view IDs that contain errors.") + warning_views: List[str] = Field(default_factory=list, alias="warningViews", description="List of view IDs that contain warnings.") + empty_views: List[str] = Field(default_factory=list, alias="emptyViews", description="List of view IDs that contain no annotations.") + annotation_count_by_type: Dict[str, int] = Field(default_factory=dict, alias="annotationCountByType", description="Total annotation counts across the file.") + +class AppProfiling(BaseModel): + """ + Profiling data for a single app execution. + """ + model_config = ConfigDict(populate_by_name=True) + + running_time_ms: Optional[int] = Field(default=None, alias="runningTimeMS", description="Execution time in milliseconds.") + +class AppExecution(BaseModel): + """ + Represents a single execution of an app, which may produce multiple views. + """ + model_config = ConfigDict(populate_by_name=True) + + app: str = Field(..., description="The URI of the app.") + view_ids: List[str] = Field(..., alias="viewIds", description="List of view IDs generated by this execution.") + app_configuration: Dict = Field(default_factory=dict, alias="appConfiguration", description="Configuration parameters used for this execution.") + app_profiling: AppProfiling = Field(default_factory=lambda: AppProfiling(), alias="appProfiling", description="Profiling data for this execution.") + annotation_count_by_type: Dict[str, int] = Field(default_factory=dict, alias="annotationCountByType", description="Counts of annotations produced, grouped by type.") + + +class SingleMmifDesc(BaseModel): + """ + Description of a workflow extracted from a single MMIF file. 
+ """ + model_config = ConfigDict(populate_by_name=True) + + workflow_id: str = Field(..., alias="workflowId", description="Unique identifier for the workflow structure.") + stats: SingleMmifStats = Field(..., description="Statistics about the views and annotations.") + apps: List[AppExecution] = Field(..., description="Sequence of app executions in the workflow.") + + +def _get_profile_data(view) -> AppProfiling: """ Extract profiling data from a view's metadata. @@ -168,13 +229,13 @@ def _get_profile_data(view) -> dict: running_time_str = profiling.get("runningTime") if running_time_str is None: - return {} + return AppProfiling(runningTimeMS=None) # the format is datetime.timedelta string, e.g. '0:00:02.345678' # need to convert to milliseconds integer time_obj = datetime.datetime.strptime(running_time_str, "%H:%M:%S.%f").time() milliseconds = (time_obj.hour * 3600 + time_obj.minute * 60 + time_obj.second) * 1000 + time_obj.microsecond // 1000 - return {"runningTimeMS": milliseconds} + return AppProfiling(runningTimeMS=milliseconds) def describe_single_mmif(mmif_input: Union[str, Path, Mmif]) -> dict: @@ -188,53 +249,24 @@ def describe_single_mmif(mmif_input: Union[str, Path, Mmif]) -> dict: a single logical "app execution". .. note:: - For MMIF files generated by ``clams-python`` <= 1.3.3, all views - are independently timestamped. This means that even if multiple views - were generated by a single execution of an app, their + For MMIF files generated by apps based on ``clams-python`` <= 1.3.3, all + views are independently timestamped. This means that even if multiple + views were generated by a single execution of an app, their ``metadata.timestamp`` values will be unique. As a result, the grouping logic will treat each view as a separate app execution. The change that aligns timestamps for views from a single app execution is implemented in `clams-python PR #271 `_. 
- The output format is a dictionary with the following keys: - - * ``workflowId`` - A unique identifier for the workflow, based on the - sequence of app executions (app, version, parameter hashes). App - executions with errors are excluded from this identifier. App - executions with warnings are still considered successful for the purpose - of this identifier. - * ``stats`` - A dictionary with the following keys: - - * ``appCount`` - Total number of identified app executions. - * ``errorViews`` - A list of view IDs that reported errors. - * ``warningViews`` - A list of view IDs that reported warnings. - * ``emptyViews`` - A list of view IDs that contain no annotations. - * ``annotationCountByType`` - A dictionary mapping each annotation type to its count, plus a - ``total`` key for the sum of all annotations across all app - executions. - * ``apps`` - A list of objects, where each object represents one app - execution. It includes metadata, profiling, and aggregated statistics - for all views generated by that execution. A special entry for views - that could not be assigned to an execution will be at the end of the list. - - --- - The docstring above is used to generate help messages for the CLI command. - Do not remove the triple-dashed lines. + The output is a serialized :class:`~SingleMmifDesc` object. + .. pydantic_model:: SingleMmifDesc + :noindex: + :param mmif_input: Path to MMIF file (str or Path) or a Mmif object :return: A dictionary containing the workflow specification. 
""" mmif = _read_mmif_from_path(mmif_input) - workflow_id = generate_workflow_identifier(mmif) error_view_ids = [] warning_view_ids = [] @@ -259,17 +291,21 @@ def describe_single_mmif(mmif_input: Union[str, Path, Mmif]) -> dict: execution_view_ids = [v.id for v in group] processed_view_ids.update(execution_view_ids) - app_data = { - "app": first_view.metadata.app, - "viewIds": execution_view_ids, - "appConfiguration": first_view.metadata.get("appConfiguration", {}), - "appProfiling": _get_profile_data(first_view), - } + # Prepare annotation counts total_annotations_in_exec = sum(execution_ann_counter.values()) if total_annotations_in_exec > 0: - app_data['annotationCountByType'] = dict(execution_ann_counter) - app_data['annotationCountByType']['total'] = total_annotations_in_exec - grouped_apps.append(app_data) + count_dict = dict(execution_ann_counter) + count_dict['total'] = total_annotations_in_exec + else: + count_dict = {} + + grouped_apps.append(AppExecution( + app=first_view.metadata.app, + viewIds=execution_view_ids, + appConfiguration=first_view.metadata.get("appConfiguration", {}), + appProfiling=_get_profile_data(first_view), + annotationCountByType=count_dict + )) # Handle unassigned and problematic views all_view_ids = set(v.id for v in mmif.views) @@ -289,19 +325,23 @@ def describe_single_mmif(mmif_input: Union[str, Path, Mmif]) -> dict: app_count = len(grouped_apps) if unassigned_view_ids: - grouped_apps.append({ - "app": "http://apps.clams.ai/non-existing-app/v1", - "viewIds": sorted(list(unassigned_view_ids)) - }) + grouped_apps.append(AppExecution( + app="http://apps.clams.ai/non-existing-app/v1", + viewIds=sorted(list(unassigned_view_ids)), + appConfiguration={}, + appProfiling=AppProfiling(runningTimeMS=None), + annotationCountByType={} + )) # aggregate total annotation counts total_annotations_by_type = Counter() for execution in grouped_apps: # Only aggregate from actual apps, not the special unassigned entry - if execution.get('app') != 
"http://apps.clams.ai/non-existing-app/v1": - if 'annotationCountByType' in execution: - exec_counts = execution['annotationCountByType'].copy() - del exec_counts['total'] + if execution.app != "http://apps.clams.ai/non-existing-app/v1": + if execution.annotation_count_by_type: + exec_counts = execution.annotation_count_by_type.copy() + if 'total' in exec_counts: + del exec_counts['total'] total_annotations_by_type.update(Counter(exec_counts)) final_total_annotations = sum(total_annotations_by_type.values()) @@ -309,17 +349,79 @@ def describe_single_mmif(mmif_input: Union[str, Path, Mmif]) -> dict: if final_total_annotations > 0: final_annotation_counts['total'] = final_total_annotations - return { - "workflowId": workflow_id, - "stats": { - "appCount": app_count, - "errorViews": error_view_ids, - "warningViews": warning_view_ids, - "emptyViews": empty_view_ids, - "annotationCountByType": final_annotation_counts - }, - "apps": grouped_apps - } + return SingleMmifDesc( + workflowId=generate_workflow_identifier(mmif, return_param_dicts=False), + stats=SingleMmifStats( + appCount=app_count, + errorViews=error_view_ids, + warningViews=warning_view_ids, + emptyViews=empty_view_ids, + annotationCountByType=final_annotation_counts + ), + apps=grouped_apps + ).model_dump(by_alias=True) + + +## MMIF collection summarization + +class AppProfilingStats(BaseModel): + """ + Aggregated profiling statistics for an app across a workflow. 
+ """ + model_config = ConfigDict(populate_by_name=True) + + avg_running_time_ms: Optional[float] = Field(default=None, alias="avgRunningTimeMS", description="Average execution time in milliseconds.") + min_running_time_ms: Optional[float] = Field(default=None, alias="minRunningTimeMS", description="Minimum execution time in milliseconds.") + max_running_time_ms: Optional[float] = Field(default=None, alias="maxRunningTimeMS", description="Maximum execution time in milliseconds.") + stdev_running_time_ms: Optional[float] = Field(default=None, alias="stdevRunningTimeMS", description="Standard deviation of execution time.") + + + + +class WorkflowAppExecution(BaseModel): + """ + Aggregated information about an app's usage within a specific workflow across multiple files. + """ + model_config = ConfigDict(populate_by_name=True) + + app: str = Field(..., description="The URI of the app.") + app_configuration: Dict = Field(default_factory=dict, alias="appConfiguration", description="Representative configuration (usually from the first occurrence).") + app_profiling: AppProfilingStats = Field(default_factory=lambda: AppProfilingStats(), alias="appProfiling", description="Aggregated profiling statistics.") + + +class WorkflowCollectionEntry(BaseModel): + """ + Summary of a unique workflow found within a collection. + """ + model_config = ConfigDict(populate_by_name=True) + + workflow_id: str = Field(..., alias="workflowId", description="Unique identifier for the workflow.") + mmifs: List[str] = Field(..., description="List of filenames belonging to this workflow.") + mmif_count: int = Field(..., alias="mmifCount", description="Number of MMIF files matching this workflow.") + apps: List[WorkflowAppExecution] = Field(..., description="Sequence of apps in this workflow with aggregated stats.") + +class MmifCountByStatus(BaseModel): + """ + Breakdown of MMIF files in a collection by their processing status. 
+ """ + model_config = ConfigDict(populate_by_name=True) + + total: int = Field(..., description="Total number of MMIF files found.") + successful: int = Field(..., description="Number of files processed without errors.") + with_errors: int = Field(..., alias="withErrors", description="Number of files containing error views.") + with_warnings: int = Field(..., alias="withWarnings", description="Number of files containing warning views.") + invalid: int = Field(..., description="Number of files that failed to parse as valid MMIF.") + + +class CollectionMmifDesc(BaseModel): + """ + Summary of a collection of MMIF files. + """ + model_config = ConfigDict(populate_by_name=True) + + mmif_count_by_status: MmifCountByStatus = Field(..., alias="mmifCountByStatus", description="Counts of MMIF files by status.") + workflows: List[WorkflowCollectionEntry] = Field(..., description="List of unique workflows identified in the collection.") + annotation_count_by_type: Dict[str, int] = Field(default_factory=dict, alias="annotationCountByType", description="Total annotation counts across the entire collection.") def describe_mmif_collection(mmif_dir: Union[str, Path]) -> dict: @@ -329,139 +431,115 @@ def describe_mmif_collection(mmif_dir: Union[str, Path]) -> dict: This function provides an overview of a collection of MMIF files, aggregating statistics across multiple files. - The output format is a dictionary with the following keys: - - * ``mmifCountByStatus`` - A dictionary summarizing the processing status of all MMIF files in the - collection. It includes: - - ``total`` - Total number of MMIF files found. - ``successful`` - Number of MMIF files processed without errors (may contain warnings). - ``withErrors`` - Number of MMIF files containing app executions that reported errors. - ``withWarnings`` - Number of MMIF files containing app executions that reported warnings. - ``invalid`` - Number of files that failed to be parsed as valid MMIF. 
- * ``workflows`` - A list of "workflow" objects found in the "successful" MMIF files (files - with errors are excluded), where each object contains: - - ``workflowId`` - The unique identifier for the workflow. - ``apps`` - A list of app objects, each with ``app`` (name+ver identifier), - ``appConfiguration``, and ``appProfiling`` statistics (avg, min, max, - stdev running times) aggregated per workflow. - ``mmifs`` - A list of MMIF file basenames belonging to this workflow. - ``mmifCount`` - The number of MMIF files in this workflow. - * ``annotationCountByType`` - A dictionary aggregating annotation counts across the entire collection. - It includes a ``total`` key for the grand total, plus integer counts for - each individual annotation type. - - --- - The docstring above is used to generate help messages for the CLI command. - Do not remove the triple-dashed lines. + The output is a serialized :class:`~CollectionMmifDesc` object. + + .. pydantic_model:: CollectionMmifDesc + :noindex: :param mmif_dir: Path to the directory containing MMIF files. :return: A dictionary containing the summarized collection specification. 
""" import statistics - from collections import defaultdict, Counter + from collections import Counter mmif_files = list(Path(mmif_dir).glob('*.mmif')) - status_summary = defaultdict(int) - status_summary['total'] = len(mmif_files) - status_summary['successful'] = 0 - status_summary['withErrors'] = 0 - status_summary['withWarnings'] = 0 - status_summary['invalid'] = 0 + status_summary = MmifCountByStatus( + total=len(mmif_files), + successful=0, + withErrors=0, + withWarnings=0, + invalid=0 + ) aggregated_counts = Counter() - workflows_data = defaultdict(lambda: { - 'mmifs': [], - 'apps': defaultdict(lambda: { - 'appConfiguration': None, # Store the first config here - 'execution_times': [] - }) - }) + # Structure: {workflow_id: {'mmifs': [...], 'apps': {app_uri: {'appConfiguration': ..., 'execution_times': [...]}}}} + workflows_data: Dict[str, Dict] = {} for mmif_file in mmif_files: try: - single_report = describe_single_mmif(mmif_file) - except Exception as e: - status_summary['invalid'] += 1 + single_report = SingleMmifDesc.model_validate(describe_single_mmif(mmif_file)) + except Exception: + status_summary.invalid += 1 continue - if single_report['stats']['errorViews']: - status_summary['withErrors'] += 1 + if single_report.stats.error_views: + status_summary.with_errors += 1 continue # Exclude from all other stats # If we get here, the MMIF has no errors and is considered "successful" - status_summary['successful'] += 1 - if single_report['stats']['warningViews']: - status_summary['withWarnings'] += 1 - - wf_id = single_report['workflowId'] + status_summary.successful += 1 + if single_report.stats.warning_views: + status_summary.with_warnings += 1 + + wf_id = single_report.workflow_id + # Initialize workflow entry if not exists + if wf_id not in workflows_data: + workflows_data[wf_id] = {'mmifs': [], 'apps': {}} workflows_data[wf_id]['mmifs'].append(Path(mmif_file).name) # Aggregate annotation counts for successful mmifs - report_counts = 
single_report['stats'].get('annotationCountByType', {}) + report_counts = single_report.stats.annotation_count_by_type.copy() if 'total' in report_counts: del report_counts['total'] # don't add the sub-total to the main counter aggregated_counts.update(report_counts) - for app_exec in single_report.get('apps', []): - app_uri = app_exec.get('app') + for app_exec in single_report.apps: + app_uri = app_exec.app # skip the special "unassigned" app if app_uri and app_uri != "http://apps.clams.ai/non-existing-app/v1": - running_time = app_exec.get('appProfiling', {}).get('runningTimeMS') + # Initialize app entry if not exists + if app_uri not in workflows_data[wf_id]['apps']: + workflows_data[wf_id]['apps'][app_uri] = { + 'appConfiguration': None, + 'execution_times': [] + } + + running_time = app_exec.app_profiling.running_time_ms if running_time is not None: workflows_data[wf_id]['apps'][app_uri]['execution_times'].append(running_time) # Store the first non-empty app configuration we find for this app in this workflow if workflows_data[wf_id]['apps'][app_uri]['appConfiguration'] is None: - config = app_exec.get('appConfiguration', {}) + config = app_exec.app_configuration if config: workflows_data[wf_id]['apps'][app_uri]['appConfiguration'] = config # Process collected data into the final output format final_workflows_list = [] for wf_id, wf_data in sorted(workflows_data.items()): - workflow_object = { - 'workflowId': wf_id, - 'mmifs': sorted(wf_data['mmifs']), - 'mmifCount': len(wf_data['mmifs']), - 'apps': [] - } + workflow_apps = [] for app_uri, app_data in sorted(wf_data['apps'].items()): times = app_data['execution_times'] if times: - profiling_stats = { - 'avgRunningTimeMS': statistics.mean(times), - 'minRunningTimeMS': min(times), - 'maxRunningTimeMS': max(times), - 'stdevRunningTimeMS': statistics.stdev(times) if len(times) > 1 else 0 - } + profiling_stats = AppProfilingStats( + avgRunningTimeMS=statistics.mean(times), + minRunningTimeMS=min(times), + 
maxRunningTimeMS=max(times), + stdevRunningTimeMS=statistics.stdev(times) if len(times) > 1 else 0 + ) else: - profiling_stats = {} - - app_object = { - 'app': app_uri, - 'appConfiguration': app_data['appConfiguration'] or {}, # Default to empty dict - 'appProfiling': profiling_stats - } - workflow_object['apps'].append(app_object) - - final_workflows_list.append(workflow_object) + profiling_stats = AppProfilingStats( + avgRunningTimeMS=None, + minRunningTimeMS=None, + maxRunningTimeMS=None, + stdevRunningTimeMS=None + ) + + workflow_apps.append(WorkflowAppExecution( + app=app_uri, + appConfiguration=app_data['appConfiguration'] or {}, + appProfiling=profiling_stats + )) + + final_workflows_list.append(WorkflowCollectionEntry( + workflowId=wf_id, + mmifs=sorted(wf_data['mmifs']), + mmifCount=len(wf_data['mmifs']), + apps=workflow_apps + )) # Finalize annotation counts final_annotation_counts = dict(aggregated_counts) @@ -469,8 +547,8 @@ def describe_mmif_collection(mmif_dir: Union[str, Path]) -> dict: if grand_total > 0: final_annotation_counts['total'] = grand_total - return { - 'mmifCountByStatus': dict(status_summary), - 'workflows': final_workflows_list, - 'annotationCountByType': final_annotation_counts - } + return CollectionMmifDesc( + mmifCountByStatus=status_summary, + workflows=final_workflows_list, + annotationCountByType=final_annotation_counts + ).model_dump(by_alias=True) diff --git a/requirements.txt b/requirements.txt index a97c214e..c3e9d722 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ orderly-set==5.3.* # 5.4 drops py38 support jsonschema +pydantic>=2.0 From 8266a2edfcf9846f3d98baa3ab4e0e7c635074dc Mon Sep 17 00:00:00 2001 From: Keigh Rim Date: Sun, 15 Feb 2026 08:51:34 -0500 Subject: [PATCH 2/3] updated test cases for utils and clis --- mmif/utils/cli/describe.py | 2 +- tests/test_utils.py | 203 +++++++++++++++++++---- tests/test_utils_cli.py | 328 ++++++++++++++++++------------------- 3 files changed, 333 insertions(+), 
200 deletions(-) diff --git a/mmif/utils/cli/describe.py b/mmif/utils/cli/describe.py index d921b329..bb226a81 100644 --- a/mmif/utils/cli/describe.py +++ b/mmif/utils/cli/describe.py @@ -101,7 +101,7 @@ def main(args): models_map = {m.__name__: m for m in models_to_help} to_show = [] if len(args.help_schemas) == 0 or 'all' in args.help_schemas: - to_show = models_to_help + to_show = [m.__name__ for m in models_to_help] else: to_show = args.help_schemas diff --git a/tests/test_utils.py b/tests/test_utils.py index 5f29b9d2..1aa4fdaf 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,17 +1,25 @@ +import json +import os import pathlib -import unittest import tempfile -import json +import unittest +from pathlib import Path import pytest - -from mmif import Mmif, Document, AnnotationTypes +from hypothesis import given +from hypothesis import strategies as st + +from mmif import ( + AnnotationTypes, + Document, + Mmif +) from mmif.utils import sequence_helper as sqh from mmif.utils import text_document_helper as tdh from mmif.utils import timeunit_helper as tuh from mmif.utils import video_document_helper as vdh -from tests.mmif_examples import * -from hypothesis import given, strategies as st +from mmif.utils import workflow_helper as wfh +from tests import mmif_examples class TestTimeunitHelper(unittest.TestCase): @@ -205,7 +213,7 @@ def test_width_based_smoothing(self): class TestTextDocHelper(unittest.TestCase): - mmif_obj = Mmif(MMIF_EXAMPLES['everything']) + mmif_obj = Mmif(mmif_examples.MMIF_EXAMPLES['everything']) @pytest.mark.skip("The only valid test cases come from kaldi app which annotates wrong property") def test_slice_text(self): @@ -232,8 +240,6 @@ def setUp(self) -> None: def create_temp_mmif_file(self, mmif_obj): """Helper to create a temporary MMIF file.""" - import tempfile - import json tmp = tempfile.NamedTemporaryFile(mode='w', suffix='.mmif', delete=False) if isinstance(mmif_obj, Mmif): content_to_write = 
mmif_obj.serialize(pretty=False) @@ -244,24 +250,20 @@ def create_temp_mmif_file(self, mmif_obj): return tmp.name def test_split_appname_appversion(self): - from mmif.utils.workflow_helper import _split_appname_appversion - app_name, app_version = _split_appname_appversion("http://apps.clams.ai/test-app/v1.0.0") + app_name, app_version = wfh._split_appname_appversion("http://apps.clams.ai/test-app/v1.0.0") self.assertEqual(app_name, "test-app") self.assertEqual(app_version, "v1.0.0") def test_generate_param_hash(self): - from mmif.utils.workflow_helper import generate_param_hash params = {"param1": "value1", "param2": 42} - hash1 = generate_param_hash(params) - hash2 = generate_param_hash(params) + hash1 = wfh.generate_param_hash(params) + hash2 = wfh.generate_param_hash(params) self.assertEqual(hash1, hash2) params_reversed = {"param2": 42, "param1": "value1"} - hash3 = generate_param_hash(params_reversed) + hash3 = wfh.generate_param_hash(params_reversed) self.assertEqual(hash1, hash3) def test_generate_workflow_identifier_grouped(self): - from mmif.vocabulary import AnnotationTypes - from mmif.utils import workflow_helper view1 = self.basic_mmif.new_view() view1.metadata.app = "http://apps.clams.ai/app1/v1.0.0" view1.metadata.timestamp = "2024-01-01T12:00:00Z" @@ -274,7 +276,7 @@ def test_generate_workflow_identifier_grouped(self): tmp_file = self.create_temp_mmif_file(self.basic_mmif) import os try: - workflow_id = workflow_helper.generate_workflow_identifier(tmp_file) + workflow_id = wfh.generate_workflow_identifier(tmp_file) segments = workflow_id.split('/') self.assertEqual(len(segments), 6) self.assertIn('app1', segments[0]) @@ -284,39 +286,35 @@ def test_generate_workflow_identifier_grouped(self): def test_generate_workflow_identifier_with_mmif_object(self): """Test that generate_workflow_identifier accepts Mmif objects directly.""" - from mmif.utils import workflow_helper import os # Test with Mmif object directly - workflow_id_from_obj = 
workflow_helper.generate_workflow_identifier(self.basic_mmif) + workflow_id_from_obj = wfh.generate_workflow_identifier(self.basic_mmif) # Test with file path - should produce the same result tmp_file = self.create_temp_mmif_file(self.basic_mmif) try: - workflow_id_from_file = workflow_helper.generate_workflow_identifier(tmp_file) + workflow_id_from_file = wfh.generate_workflow_identifier(tmp_file) self.assertEqual(workflow_id_from_obj, workflow_id_from_file) finally: os.unlink(tmp_file) def test_read_mmif_from_path(self): """Test the _read_mmif_from_path helper function.""" - from mmif.utils.workflow_helper import _read_mmif_from_path - from pathlib import Path - import os # Test with Mmif object - should return as-is - result = _read_mmif_from_path(self.basic_mmif) + result = wfh._read_mmif_from_path(self.basic_mmif) self.assertIs(result, self.basic_mmif) # Test with file path string tmp_file = self.create_temp_mmif_file(self.basic_mmif) try: - result_from_str = _read_mmif_from_path(tmp_file) + result_from_str = wfh._read_mmif_from_path(tmp_file) self.assertIsInstance(result_from_str, Mmif) self.assertEqual(result_from_str.serialize(pretty=False), self.basic_mmif.serialize(pretty=False)) # Test with Path object - result_from_path = _read_mmif_from_path(Path(tmp_file)) + result_from_path = wfh._read_mmif_from_path(Path(tmp_file)) self.assertIsInstance(result_from_path, Mmif) self.assertEqual(result_from_path.serialize(pretty=False), self.basic_mmif.serialize(pretty=False)) finally: @@ -324,27 +322,164 @@ def test_read_mmif_from_path(self): # Test with invalid input with pytest.raises(ValueError): - _read_mmif_from_path(12345) + wfh._read_mmif_from_path(12345) def test_describe_single_mmif_with_mmif_object(self): """Test that describe_single_mmif accepts Mmif objects directly.""" - from mmif.utils.workflow_helper import describe_single_mmif import os # Test with Mmif object directly - result_from_obj = describe_single_mmif(self.basic_mmif) + result_from_obj = 
wfh.describe_single_mmif(self.basic_mmif) # Test with file path - should produce the same result tmp_file = self.create_temp_mmif_file(self.basic_mmif) try: - result_from_file = describe_single_mmif(tmp_file) + result_from_file = wfh.describe_single_mmif(tmp_file) self.assertEqual(result_from_obj, result_from_file) - self.assertIn('workflowId', result_from_obj) - self.assertIn('stats', result_from_obj) - self.assertIn('apps', result_from_obj) + + # Validate that the output conforms to the SingleMmifDesc Pydantic model + # If validation succeeds, all required fields with correct aliases are present + validated = wfh.SingleMmifDesc.model_validate(result_from_obj) + # Can assert on the validated object's attributes if needed + self.assertIsNotNone(validated.workflow_id) + self.assertIsNotNone(validated.stats) + self.assertIsNotNone(validated.apps) + finally: + os.unlink(tmp_file) + + def test_describe_single_mmif_empty(self): + """Test describe_single_mmif with an empty MMIF (no views).""" + tmp_file = self.create_temp_mmif_file(self.basic_mmif) + try: + result = wfh.describe_single_mmif(tmp_file) + # Validate against Pydantic model + validated = wfh.SingleMmifDesc.model_validate(result) + self.assertEqual(validated.stats.app_count, 0) + self.assertEqual(len(validated.apps), 0) + self.assertEqual(validated.stats.annotation_count_by_type, {}) + finally: + os.unlink(tmp_file) + + def test_describe_single_mmif_one_app(self): + """Test describe_single_mmif with a single app execution.""" + view = self.basic_mmif.new_view() + view.metadata.app = "http://apps.clams.ai/test-app/v1.0.0" + view.metadata.timestamp = "2024-01-01T12:00:00Z" + view.metadata.appProfiling = {"runningTime": "0:00:01.234"} + view.new_annotation(AnnotationTypes.TimeFrame) + tmp_file = self.create_temp_mmif_file(self.basic_mmif) + try: + result = wfh.describe_single_mmif(tmp_file) + # Validate against Pydantic model + validated = wfh.SingleMmifDesc.model_validate(result) + 
self.assertEqual(validated.stats.app_count, 1) + self.assertEqual(len(validated.apps), 1) + app_exec = validated.apps[0] + self.assertEqual(app_exec.app, view.metadata.app) + self.assertEqual(app_exec.view_ids, [view.id]) + self.assertEqual(app_exec.app_profiling.running_time_ms, 1234) + finally: + os.unlink(tmp_file) + + def test_describe_single_mmif_one_app_two_views(self): + """Test describe_single_mmif with one app execution producing two views.""" + view1 = self.basic_mmif.new_view() + view1.metadata.app = "http://apps.clams.ai/test-app/v1.0.0" + view1.metadata.timestamp = "2024-01-01T12:00:00Z" + view1.new_annotation(AnnotationTypes.TimeFrame) + view2 = self.basic_mmif.new_view() + view2.metadata.app = "http://apps.clams.ai/test-app/v1.0.0" + view2.metadata.timestamp = "2024-01-01T12:00:00Z" + view2.new_annotation(AnnotationTypes.TimeFrame) + tmp_file = self.create_temp_mmif_file(self.basic_mmif) + try: + result = wfh.describe_single_mmif(tmp_file) + # Validate against Pydantic model + validated = wfh.SingleMmifDesc.model_validate(result) + self.assertEqual(validated.stats.app_count, 1) + self.assertEqual(len(validated.apps), 1) + app_exec = validated.apps[0] + self.assertEqual(app_exec.view_ids, [view1.id, view2.id]) + finally: + os.unlink(tmp_file) + + def test_describe_single_mmif_error_view(self): + """Test describe_single_mmif with a view containing an error.""" + view = self.basic_mmif.new_view() + view.metadata.app = "http://apps.clams.ai/test-app/v1.0.0" + view.metadata.timestamp = "2024-01-01T12:00:00Z" + view.metadata.error = {"message": "Something went wrong"} + tmp_file = self.create_temp_mmif_file(self.basic_mmif) + try: + result = wfh.describe_single_mmif(tmp_file) + # Validate against Pydantic model + validated = wfh.SingleMmifDesc.model_validate(result) + self.assertEqual(validated.stats.app_count, 0) + self.assertEqual(len(validated.apps), 0) + self.assertEqual(len(validated.stats.error_views), 1) finally: os.unlink(tmp_file) + def 
test_describe_single_mmif_with_unassigned_views(self): + """Test describe_single_mmif with views that cannot be grouped.""" + import unittest.mock + raw_mmif = json.loads(self.basic_mmif.serialize()) + raw_mmif['views'].append({'id': 'v1', 'metadata': {'app': 'http://apps.clams.ai/app1/v1.0.0', 'timestamp': '2024-01-01T12:00:00Z'}, 'annotations': []}) + raw_mmif['views'].append({'id': 'v2', 'metadata': {'app': 'http://apps.clams.ai/app2/v2.0.0'}, 'annotations': []}) + raw_mmif['views'].append({'id': 'v3', 'metadata': {'timestamp': '2024-01-01T12:01:00Z', 'app': ''}, 'annotations': []}) + tmp_file = self.create_temp_mmif_file(raw_mmif) + try: + with unittest.mock.patch('jsonschema.validators.validate'): + result = wfh.describe_single_mmif(tmp_file) + # Validate against Pydantic model + validated = wfh.SingleMmifDesc.model_validate(result) + self.assertEqual(validated.stats.app_count, 1) + self.assertEqual(len(validated.apps), 2) + special_entry = validated.apps[-1] + self.assertEqual(special_entry.app, 'http://apps.clams.ai/non-existing-app/v1') + self.assertEqual(len(special_entry.view_ids), 2) + self.assertIn('v2', special_entry.view_ids) + self.assertIn('v3', special_entry.view_ids) + finally: + os.unlink(tmp_file) + + def test_describe_collection_empty(self): + """Test describe_mmif_collection with an empty directory.""" + dummy_dir = 'dummy_mmif_collection' + os.makedirs(dummy_dir, exist_ok=True) + try: + output = wfh.describe_mmif_collection(dummy_dir) + # Validate using Pydantic model + validated = wfh.CollectionMmifDesc.model_validate(output) + self.assertEqual(validated.mmif_count_by_status.total, 0) + self.assertEqual(len(validated.workflows), 0) + finally: + os.rmdir(dummy_dir) + + def test_describe_collection_with_files(self): + """Test describe_mmif_collection with MMIF files.""" + dummy_dir = 'dummy_mmif_collection_with_files' + os.makedirs(dummy_dir, exist_ok=True) + try: + # Create two MMIF files in the directory + for i in range(2): + tmp_file = 
os.path.join(dummy_dir, f'{i}.mmif') + with open(tmp_file, 'w') as f: + f.write(self.basic_mmif.serialize()) + + output = wfh.describe_mmif_collection(dummy_dir) + + # Validate structure using Pydantic model + # If validation succeeds, all required fields with correct aliases are present + validated = wfh.CollectionMmifDesc.model_validate(output) + + # Verify counts using validated object attributes + self.assertEqual(validated.mmif_count_by_status.total, 2) + self.assertIsInstance(validated.workflows, list) + finally: + import shutil + shutil.rmtree(dummy_dir) + if __name__ == '__main__': unittest.main() diff --git a/tests/test_utils_cli.py b/tests/test_utils_cli.py index 10270525..66c77c38 100644 --- a/tests/test_utils_cli.py +++ b/tests/test_utils_cli.py @@ -1,3 +1,4 @@ +import argparse import contextlib import io import json @@ -6,18 +7,121 @@ import unittest.mock import mmif -from mmif.utils.cli import rewind -from mmif.utils.cli import source -from mmif.utils.cli import describe -from mmif.utils.cli import summarize - from mmif.serialize import Mmif -from mmif.vocabulary import DocumentTypes, AnnotationTypes - +from mmif.utils.cli import describe, rewind, source, summarize +from mmif.vocabulary import AnnotationTypes BASIC_MMIF_STRING = '{"metadata": {"mmif": "http://mmif.clams.ai/1.0.0"}, "documents": [{"@type": "http://mmif.clams.ai/vocabulary/VideoDocument/v1", "properties": {"id": "d1", "mime": "video/mp4", "location": "file:///test/video.mp4"}}], "views": []}' +class BaseCliTestCase(unittest.TestCase): + """Base class for CLI module tests with common utilities.""" + + cli_module = None # Override in subclass + + def setUp(self): + """Set up common test fixtures.""" + if self.cli_module: + self.parser = self.cli_module.prep_argparser() + self.basic_mmif = Mmif(BASIC_MMIF_STRING) + self.maxDiff = None + + @staticmethod + def create_temp_mmif_file(mmif_obj): + """Create a temporary MMIF file for testing. 
+ + Args: + mmif_obj: Either a Mmif object or a dict/string to serialize + + Returns: + str: Path to the temporary file (caller must unlink) + """ + tmp = tempfile.NamedTemporaryFile(mode='w', suffix='.mmif', delete=False) + if isinstance(mmif_obj, Mmif): + content = mmif_obj.serialize(pretty=False) + else: + content = json.dumps(mmif_obj) if isinstance(mmif_obj, dict) else mmif_obj + tmp.write(content) + tmp.close() + return tmp.name + + def run_cli_capture_stdout(self, args_namespace): + """Run CLI module and capture stdout as parsed JSON. + + Args: + args_namespace: Namespace object with CLI arguments + + Returns: + dict: Parsed JSON output from stdout + """ + with unittest.mock.patch('sys.stdout', new=io.StringIO()) as stdout: + self.cli_module.main(args_namespace) + return json.loads(stdout.getvalue()) + + +class IOTestMixin: + """Mixin providing common I/O tests for CLI modules. + + Requires the test class to have: + - cli_module attribute + - basic_mmif attribute + - create_temp_mmif_file method + - run_cli_capture_stdout method + - expected_output_keys attribute (list of keys to check in output) + """ + + def test_file_input_stdout_output(self): + """Test reading from file and outputting to stdout.""" + tmp_file = self.create_temp_mmif_file(self.basic_mmif) + try: + args = argparse.Namespace( + MMIF_FILE=tmp_file, + output=None, + pretty=False, + help_schemas=None # For describe module + ) + output = self.run_cli_capture_stdout(args) + self.assertIsInstance(output, dict) + for key in self.expected_output_keys: + self.assertIn(key, output) + finally: + os.unlink(tmp_file) + + def test_file_input_file_output(self): + """Test reading from file and outputting to file.""" + tmp_input = self.create_temp_mmif_file(self.basic_mmif) + tmp_output = tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) + tmp_output.close() + try: + args = self.parser.parse_args([tmp_input, '-o', tmp_output.name]) + self.cli_module.main(args) + with open(tmp_output.name, 
'r') as f: + output = json.load(f) + self.assertIsInstance(output, dict) + for key in self.expected_output_keys: + self.assertIn(key, output) + finally: + os.unlink(tmp_input) + os.unlink(tmp_output.name) + + def test_stdin_input_stdout_output(self): + """Test reading from stdin and outputting to stdout.""" + mmif_str = self.basic_mmif.serialize() + with unittest.mock.patch('sys.stdin', io.StringIO(mmif_str)), \ + unittest.mock.patch('sys.stdout', new=io.StringIO()) as stdout: + args = argparse.Namespace( + MMIF_FILE=None, + output=None, + pretty=False, + help_schemas=None # For describe module + ) + self.cli_module.main(args) + output = json.loads(stdout.getvalue()) + self.assertIsInstance(output, dict) + for key in self.expected_output_keys: + self.assertIn(key, output) + + class TestCli(unittest.TestCase): def setUp(self) -> None: self.parser, _, _ = mmif.prep_argparser_and_subcmds() @@ -179,178 +283,72 @@ def test_app_rewind(self): self.assertIn('dummy_app_two', remaining_apps) -class TestDescribe(unittest.TestCase): +class TestDescribe(BaseCliTestCase, IOTestMixin): """Test suite for the describe CLI module.""" - - def setUp(self): - """Create test MMIF structures.""" - self.parser = describe.prep_argparser() - self.maxDiff = None - self.basic_mmif = Mmif(BASIC_MMIF_STRING) - - def create_temp_mmif_file(self, mmif_obj): - """Helper to create a temporary MMIF file.""" - tmp = tempfile.NamedTemporaryFile(mode='w', suffix='.mmif', delete=False) - if isinstance(mmif_obj, Mmif): - content_to_write = mmif_obj.serialize(pretty=False) - else: - content_to_write = json.dumps(mmif_obj) - tmp.write(content_to_write) - tmp.close() - return tmp.name - - def test_describe_single_mmif_empty(self): - tmp_file = self.create_temp_mmif_file(self.basic_mmif) - try: - result = mmif.utils.workflow_helper.describe_single_mmif(tmp_file) - self.assertEqual(result["stats"]["appCount"], 0) - self.assertEqual(len(result["apps"]), 0) - 
self.assertEqual(result["stats"]["annotationCountByType"], {}) - finally: - os.unlink(tmp_file) - - def test_describe_single_mmif_one_app(self): - view = self.basic_mmif.new_view() - view.metadata.app = "http://apps.clams.ai/test-app/v1.0.0" - view.metadata.timestamp = "2024-01-01T12:00:00Z" - view.metadata.appProfiling = {"runningTime": "0:00:01.234"} - view.new_annotation(AnnotationTypes.TimeFrame) - tmp_file = self.create_temp_mmif_file(self.basic_mmif) - try: - result = mmif.utils.workflow_helper.describe_single_mmif(tmp_file) - self.assertEqual(result["stats"]["appCount"], 1) - self.assertEqual(len(result["apps"]), 1) - app_exec = result["apps"][0] - self.assertEqual(app_exec["app"], view.metadata.app) - self.assertEqual(app_exec["viewIds"], [view.id]) - self.assertEqual(app_exec["appProfiling"]["runningTimeMS"], 1234) - finally: - os.unlink(tmp_file) - - def test_describe_single_mmif_one_app_two_views(self): - view1 = self.basic_mmif.new_view() - view1.metadata.app = "http://apps.clams.ai/test-app/v1.0.0" - view1.metadata.timestamp = "2024-01-01T12:00:00Z" - view1.new_annotation(AnnotationTypes.TimeFrame) - view2 = self.basic_mmif.new_view() - view2.metadata.app = "http://apps.clams.ai/test-app/v1.0.0" - view2.metadata.timestamp = "2024-01-01T12:00:00Z" - view2.new_annotation(AnnotationTypes.TimeFrame) - tmp_file = self.create_temp_mmif_file(self.basic_mmif) - try: - result = mmif.utils.workflow_helper.describe_single_mmif(tmp_file) - self.assertEqual(result["stats"]["appCount"], 1) - self.assertEqual(len(result["apps"]), 1) - app_exec = result["apps"][0] - self.assertEqual(app_exec["viewIds"], [view1.id, view2.id]) - finally: - os.unlink(tmp_file) - - def test_describe_single_mmif_error_view(self): - view = self.basic_mmif.new_view() - view.metadata.app = "http://apps.clams.ai/test-app/v1.0.0" - view.metadata.timestamp = "2024-01-01T12:00:00Z" - view.metadata.error = {"message": "Something went wrong"} + + cli_module = describe + expected_output_keys = 
['workflowId', 'stats', 'apps'] + + def test_help_schemas_all(self): + """Test --help-schemas all""" + from mmif.utils.cli.describe import models_to_help + with unittest.mock.patch('sys.stdout', new=io.StringIO()) as stdout: + args = argparse.Namespace(help_schemas=['all'], MMIF_FILE=None, output=None, pretty=False) + with self.assertRaises(SystemExit) as cm: + describe.main(args) + self.assertEqual(cm.exception.code, 0) + output = stdout.getvalue() + for m in models_to_help: + self.assertIn(m.__name__, output) + self.assertIn("$defs", output) + + def test_describe_main_directory(self): + """Test describe.main with a directory input""" + with tempfile.TemporaryDirectory() as tmp_dir: + # Create two mmif files + with open(os.path.join(tmp_dir, '1.mmif'), 'w') as f: + f.write(self.basic_mmif.serialize()) + with open(os.path.join(tmp_dir, '2.mmif'), 'w') as f: + f.write(self.basic_mmif.serialize()) + + with unittest.mock.patch('sys.stdout', new=io.StringIO()) as stdout: + # MMIF_FILE argument expects a string path + args = argparse.Namespace(MMIF_FILE=tmp_dir, output=None, pretty=False, help_schemas=None) + describe.main(args) + output_json = json.loads(stdout.getvalue()) + # Just verify valid JSON output was produced + self.assertIsInstance(output_json, dict) + self.assertTrue(len(output_json) > 0) + + def test_deprecated_functions(self): + """Test backward compatibility wrapper functions""" tmp_file = self.create_temp_mmif_file(self.basic_mmif) try: - result = mmif.utils.workflow_helper.describe_single_mmif(tmp_file) - self.assertEqual(result["stats"]["appCount"], 0) - self.assertEqual(len(result["apps"]), 0) - self.assertEqual(len(result["stats"]["errorViews"]), 1) - finally: - os.unlink(tmp_file) - - @unittest.mock.patch('jsonschema.validators.validate') - def test_describe_single_mmif_with_unassigned_views(self, mock_validate): - raw_mmif = json.loads(self.basic_mmif.serialize()) - raw_mmif['views'].append({'id': 'v1', 'metadata': {'app': 
'http://apps.clams.ai/app1/v1.0.0', 'timestamp': '2024-01-01T12:00:00Z'}, 'annotations': []}) - raw_mmif['views'].append({'id': 'v2', 'metadata': {'app': 'http://apps.clams.ai/app2/v2.0.0'}, 'annotations': []}) - raw_mmif['views'].append({'id': 'v3', 'metadata': {'timestamp': '2024-01-01T12:01:00Z', 'app': ''}, 'annotations': []}) - tmp_file = self.create_temp_mmif_file(raw_mmif) - try: - result = mmif.utils.workflow_helper.describe_single_mmif(tmp_file) - self.assertEqual(result['stats']['appCount'], 1) - self.assertEqual(len(result['apps']), 2) - special_entry = result['apps'][-1] - self.assertEqual(special_entry['app'], 'http://apps.clams.ai/non-existing-app/v1') - self.assertEqual(len(special_entry['viewIds']), 2) - self.assertIn('v2', special_entry['viewIds']) - self.assertIn('v3', special_entry['viewIds']) + with self.assertWarns(DeprecationWarning): + describe.get_pipeline_specs(tmp_file) + with self.assertWarns(DeprecationWarning): + describe.generate_pipeline_identifier(tmp_file) finally: os.unlink(tmp_file) - def test_describe_collection_empty(self): - dummy_dir = 'dummy_mmif_collection' - os.makedirs(dummy_dir, exist_ok=True) - try: - output = mmif.utils.workflow_helper.describe_mmif_collection(dummy_dir) - expected = { - 'mmifCountByStatus': {'total': 0, 'successful': 0, 'withErrors': 0, 'withWarnings': 0, 'invalid': 0}, - 'workflows': [], - 'annotationCountByType': {} - } - self.assertEqual(output, expected) - finally: - os.rmdir(dummy_dir) - -class TestSummarize(unittest.TestCase): +class TestSummarize(BaseCliTestCase, IOTestMixin): """Test suite for the summarize CLI module.""" + + cli_module = summarize + expected_output_keys = ['mmif_version', 'documents', 'views'] - def setUp(self): - """Create test MMIF structures.""" - self.parser = summarize.prep_argparser() - self.basic_mmif = Mmif(BASIC_MMIF_STRING) - - def create_temp_mmif_file(self, mmif_obj): - """Helper to create a temporary MMIF file.""" - tmp = tempfile.NamedTemporaryFile(mode='w', 
suffix='.mmif', delete=False) - tmp.write(mmif_obj.serialize(pretty=False)) - tmp.close() - return tmp.name - - def test_summarize_positional_input(self): + def test_summarize_validates_content(self): + """Test that summarize produces expected content.""" tmp_file = self.create_temp_mmif_file(self.basic_mmif) try: - with unittest.mock.patch('sys.stdout', new=io.StringIO()) as stdout: - args = self.parser.parse_args([tmp_file]) - # args.output is None by default, which means stdout in open_cli_io_arg - summarize.main(args) - output = json.loads(stdout.getvalue()) - self.assertIn('mmif_version', output) - self.assertEqual(output['mmif_version'], "http://mmif.clams.ai/1.0.0") + output = self.run_cli_capture_stdout( + argparse.Namespace(MMIF_FILE=tmp_file, output=None, pretty=False) + ) + self.assertEqual(output['mmif_version'], "http://mmif.clams.ai/1.0.0") finally: os.unlink(tmp_file) - def test_summarize_output_file(self): - tmp_input = self.create_temp_mmif_file(self.basic_mmif) - tmp_output = tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) - tmp_output.close() - try: - args = self.parser.parse_args([tmp_input, "-o", tmp_output.name]) - summarize.main(args) - # args.output is a path string now; no file handle to close. 
- with open(tmp_output.name, 'r') as f: - output = json.load(f) - self.assertIn('mmif_version', output) - finally: - os.unlink(tmp_input) - os.unlink(tmp_output.name) - - def test_summarize_stdin(self): - mmif_str = self.basic_mmif.serialize() - import argparse - - with unittest.mock.patch('sys.stdin', io.StringIO(mmif_str)), \ - unittest.mock.patch('sys.stdout', new=io.StringIO()) as stdout: - # MMIF_FILE defaults to None -> stdin - # output defaults to None -> stdout - args = argparse.Namespace(MMIF_FILE=None, output=None, pretty=False) - summarize.main(args) - - output = json.loads(stdout.getvalue()) - self.assertEqual(output['mmif_version'], "http://mmif.clams.ai/1.0.0") - if __name__ == '__main__': unittest.main() From 9ee0bd5ac5ecf7fee0b80e5e372b7fec8708dd00 Mon Sep 17 00:00:00 2001 From: Keigh Rim Date: Mon, 16 Feb 2026 05:32:33 -0500 Subject: [PATCH 3/3] added human-friendly summary for pydantic classes in `describe --help` --- mmif/utils/cli/__init__.py | 136 +++++++++++++++++++++++++++++++------ mmif/utils/cli/describe.py | 52 +++++++------- tests/test_utils_cli.py | 34 +++++++--- 3 files changed, 162 insertions(+), 60 deletions(-) diff --git a/mmif/utils/cli/__init__.py b/mmif/utils/cli/__init__.py index 935ab0a7..f24248f2 100644 --- a/mmif/utils/cli/__init__.py +++ b/mmif/utils/cli/__init__.py @@ -6,16 +6,19 @@ import io import os import sys -from typing import Iterator, Optional, TextIO, cast +from typing import Iterator, Optional, TextIO, Type, Union, cast, get_args, get_origin + +from pydantic import BaseModel @contextlib.contextmanager -def open_cli_io_arg(path_or_dash: Optional[str], - mode: str = 'r', - encoding: Optional[str] = None, - errors: Optional[str] = None, - default_stdin: bool = False, - ) -> Iterator[TextIO]: +def open_cli_io_arg( + path_or_dash: Optional[str], + mode: str = "r", + encoding: Optional[str] = None, + errors: Optional[str] = None, + default_stdin: bool = False, +) -> Iterator[TextIO]: """ Context manager for opening files 
with stdin/stdout support. @@ -55,10 +58,10 @@ def open_cli_io_arg(path_or_dash: Optional[str], f.write(content) """ # Valid text modes for file operations - _READ_FLAGS = frozenset({'r', '+'}) - _WRITE_FLAGS = frozenset({'w', 'a', 'x', '+'}) + _READ_FLAGS = frozenset({"r", "+"}) + _WRITE_FLAGS = frozenset({"w", "a", "x", "+"}) - if 'b' in mode: + if "b" in mode: raise ValueError( f"Binary mode '{mode}' is not supported. " "Use text modes ('r', 'w', 'a', 'x') instead." @@ -67,9 +70,7 @@ def open_cli_io_arg(path_or_dash: Optional[str], needs_read = bool(set(mode) & _READ_FLAGS) needs_write = bool(set(mode) & _WRITE_FLAGS) - should_use_stdio = path_or_dash == '-' or ( - path_or_dash is None and default_stdin - ) + should_use_stdio = path_or_dash == "-" or (path_or_dash is None and default_stdin) file_handle: Optional[TextIO] = None should_close = False @@ -84,11 +85,7 @@ def open_cli_io_arg(path_or_dash: Optional[str], if needs_read: # Check for missing input when stdin is a terminal - if ( - path_or_dash is None - and default_stdin - and sys.stdin.isatty() - ): + if path_or_dash is None and default_stdin and sys.stdin.isatty(): raise SystemExit("error: No input provided.") file_handle = sys.stdin @@ -97,14 +94,15 @@ def open_cli_io_arg(path_or_dash: Optional[str], else: raise ValueError( - f"Mode '{mode}' not supported with stdin/stdout " - "(use 'r' or 'w')" + f"Mode '{mode}' not supported with stdin/stdout (use 'r' or 'w')" ) elif isinstance(path_or_dash, str): if needs_read and not os.path.exists(path_or_dash): raise FileNotFoundError(f"Input path does not exist: {path_or_dash}") - file_handle = cast(TextIO, io.open(path_or_dash, mode, encoding=encoding, errors=errors)) + file_handle = cast( + TextIO, io.open(path_or_dash, mode, encoding=encoding, errors=errors) + ) should_close = True elif path_or_dash is None: @@ -126,6 +124,102 @@ def open_cli_io_arg(path_or_dash: Optional[str], file_handle.close() +def generate_model_summary(model: Type[BaseModel], indent: 
int = 0) -> str: + lines = [] + prefix = " " * indent + + # model_fields is a dictionary of FieldInfo objects + for name, field in model.model_fields.items(): + # Get the alias if available, otherwise use the field name + field_name = field.alias if field.alias else name + + # Get type annotation + type_annotation = field.annotation + + def format_type(t) -> str: + origin = get_origin(t) + args = get_args(t) + + # Handle Optional (Union[T, None]) + if origin is Union and type(None) in args: + non_none_args = [arg for arg in args if arg is not type(None)] + if len(non_none_args) == 1: + return f"{format_type(non_none_args[0])}, optional" + + # Handle List + if origin is list: + if args: + return f"[{format_type(args[0])}]" + return "[]" + + # Handle Dict + if origin is dict: + return "obj" + + # Handle Pydantic Models (Custom Classes) + if isinstance(t, type) and issubclass(t, BaseModel): + return "obj" + + # Handle basic types and cleanup + t_str = str(t) + if t_str.startswith(" 1 + and isinstance(args[1], type) + and issubclass(args[1], BaseModel) + ): + nested_model = args[1] + + if nested_model: + lines.append(generate_model_summary(nested_model, indent + 4)) + + return "\n".join(lines) + + # keep imports of CLI modules for historical reasons # keep them here in the bottom to avoid circular imports from mmif.utils.cli import rewind diff --git a/mmif/utils/cli/describe.py b/mmif/utils/cli/describe.py index bb226a81..b8c79ced 100644 --- a/mmif/utils/cli/describe.py +++ b/mmif/utils/cli/describe.py @@ -3,11 +3,9 @@ import sys import textwrap from pathlib import Path -from typing import Dict, Type, Union, cast +from typing import Union, cast -from pydantic import BaseModel - -from mmif.utils.cli import open_cli_io_arg +from mmif.utils.cli import open_cli_io_arg, generate_model_summary # gen_param_hash is imported for backward compatibility from mmif.utils.workflow_helper import ( @@ -18,12 +16,6 @@ generate_workflow_identifier, ) -models_to_help = [SingleMmifDesc, 
CollectionMmifDesc] -model_modules = set(model.__module__ for model in models_to_help) -def get_all_models() -> Dict[str, Type[BaseModel]]: - return { - name: cls for name, cls in models_to_help - } def get_pipeline_specs(mmif_file: Union[str, Path]): import warnings @@ -49,7 +41,15 @@ def describe_argparser(): This command extracts workflow information from a single MMIF file or a directory of MMIF files. The output is serialized as JSON. - Use `--help-schemas` to inspect the structure of the JSON output. + Output Schemas: + + 1. Single MMIF File (mmif-file): +{generate_model_summary(SingleMmifDesc, indent=4)} + + 2. MMIF Collection (mmif-dir): +{generate_model_summary(CollectionMmifDesc, indent=4)} + + Use `--help-schema` to inspect the full JSON schema for a specific output type. """) return oneliner, additional @@ -79,13 +79,11 @@ def prep_argparser(**kwargs): help="Pretty-print JSON output" ) parser.add_argument( - "--help-schemas", - nargs="*", - choices=["all"] + [m.__name__ for m in models_to_help], + "--help-schema", + nargs=1, + choices=["mmif-file", "mmif-dir"], metavar="SCHEMA_NAME", - help=f"Print the JSON schema for the output. For human-readable documentation, " - f"visit https://clams.ai/mmif-python and see the following modules: " - f"{', '.join(model_modules)}.\nOptions: all, {', '.join([m.__name__ for m in models_to_help])}." + help="Print the JSON schema for the output. Options: mmif-file, mmif-dir." ) return parser @@ -97,19 +95,15 @@ def main(args): :func:`describe_single_mmif` (for single file input) or :func:`describe_mmif_collection` (for directory input). 
""" - if hasattr(args, 'help_schemas') and args.help_schemas is not None: - models_map = {m.__name__: m for m in models_to_help} - to_show = [] - if len(args.help_schemas) == 0 or 'all' in args.help_schemas: - to_show = [m.__name__ for m in models_to_help] - else: - to_show = args.help_schemas + if hasattr(args, 'help_schema') and args.help_schema is not None: + schema_name = args.help_schema[0] + if schema_name == 'mmif-file': + model_cls = SingleMmifDesc + elif schema_name == 'mmif-dir': + model_cls = CollectionMmifDesc - for name in to_show: - model_cls = models_map[name] - schema = model_cls.model_json_schema() - print(json.dumps(schema, indent=2)) - print() + schema = model_cls.model_json_schema() + print(json.dumps(schema, indent=2)) sys.exit(0) output = {} diff --git a/tests/test_utils_cli.py b/tests/test_utils_cli.py index 66c77c38..dd33fec2 100644 --- a/tests/test_utils_cli.py +++ b/tests/test_utils_cli.py @@ -78,7 +78,7 @@ def test_file_input_stdout_output(self): MMIF_FILE=tmp_file, output=None, pretty=False, - help_schemas=None # For describe module + help_schema=None # For describe module ) output = self.run_cli_capture_stdout(args) self.assertIsInstance(output, dict) @@ -113,7 +113,7 @@ def test_stdin_input_stdout_output(self): MMIF_FILE=None, output=None, pretty=False, - help_schemas=None # For describe module + help_schema=None # For describe module ) self.cli_module.main(args) output = json.loads(stdout.getvalue()) @@ -289,18 +289,32 @@ class TestDescribe(BaseCliTestCase, IOTestMixin): cli_module = describe expected_output_keys = ['workflowId', 'stats', 'apps'] - def test_help_schemas_all(self): - """Test --help-schemas all""" - from mmif.utils.cli.describe import models_to_help + def test_help_schema(self): + """Test --help-schema with different options""" + from mmif.utils.workflow_helper import SingleMmifDesc, CollectionMmifDesc + + # Test mmif-file + with unittest.mock.patch('sys.stdout', new=io.StringIO()) as stdout: + args = 
argparse.Namespace(help_schema=['mmif-file'], MMIF_FILE=None, output=None, pretty=False) + with self.assertRaises(SystemExit) as cm: + describe.main(args) + self.assertEqual(cm.exception.code, 0) + output = stdout.getvalue() + # Verify SingleMmifDesc schema keys are present + self.assertIn("workflowId", output) + self.assertIn("stats", output) + self.assertIn("apps", output) + + # Test mmif-dir with unittest.mock.patch('sys.stdout', new=io.StringIO()) as stdout: - args = argparse.Namespace(help_schemas=['all'], MMIF_FILE=None, output=None, pretty=False) + args = argparse.Namespace(help_schema=['mmif-dir'], MMIF_FILE=None, output=None, pretty=False) with self.assertRaises(SystemExit) as cm: describe.main(args) self.assertEqual(cm.exception.code, 0) output = stdout.getvalue() - for m in models_to_help: - self.assertIn(m.__name__, output) - self.assertIn("$defs", output) + # Verify CollectionMmifDesc schema keys are present + self.assertIn("mmifCountByStatus", output) + self.assertIn("workflows", output) def test_describe_main_directory(self): """Test describe.main with a directory input""" @@ -313,7 +327,7 @@ def test_describe_main_directory(self): with unittest.mock.patch('sys.stdout', new=io.StringIO()) as stdout: # MMIF_FILE argument expects a string path - args = argparse.Namespace(MMIF_FILE=tmp_dir, output=None, pretty=False, help_schemas=None) + args = argparse.Namespace(MMIF_FILE=tmp_dir, output=None, pretty=False, help_schema=None) describe.main(args) output_json = json.loads(stdout.getvalue()) # Just verify valid JSON output was produced