From 117758249cf12dddf40a2be4a9368aee424e42d9 Mon Sep 17 00:00:00 2001
From: jniestroy
Date: Thu, 6 Nov 2025 08:15:33 -0500
Subject: [PATCH 1/6] first draft

---
 pyproject.toml                            |   4 +-
 src/fairscape_cli/jupyter/__init__.py     |   3 +
 src/fairscape_cli/jupyter/magic.py        | 104 ++++
 src/fairscape_cli/tracking/__init__.py    |  35 ++
 src/fairscape_cli/tracking/config.py      |  46 +++
 src/fairscape_cli/tracking/io_capture.py  | 256 +++++++++++++
 .../tracking/metadata_generator.py        | 183 +++++++++
 .../tracking/provenance_tracker.py        | 349 ++++++++++++++++++
 src/fairscape_cli/tracking/utils.py       |  60 +++
 9 files changed, 1038 insertions(+), 2 deletions(-)
 create mode 100644 src/fairscape_cli/jupyter/__init__.py
 create mode 100644 src/fairscape_cli/jupyter/magic.py
 create mode 100644 src/fairscape_cli/tracking/__init__.py
 create mode 100644 src/fairscape_cli/tracking/config.py
 create mode 100644 src/fairscape_cli/tracking/io_capture.py
 create mode 100644 src/fairscape_cli/tracking/metadata_generator.py
 create mode 100644 src/fairscape_cli/tracking/provenance_tracker.py
 create mode 100644 src/fairscape_cli/tracking/utils.py

diff --git a/pyproject.toml b/pyproject.toml
index df4ca59..26b49bb 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "fairscape-cli"
-version = "1.1.7"
+version = "1.1.9"
 description = "A utility for packaging objects and validating metadata for FAIRSCAPE"
 readme = "README.md"
 requires-python = ">=3.8"
@@ -39,7 +39,7 @@ dependencies = [
     "prettytable>=3.9.0",
     "jsonschema>=4.20.0",
     "sqids>=0.4.1",
-    "fairscape-models>=1.0.8",
+    "fairscape-models>=1.0.11",
     "pyyaml",
     "h5py",
     "frictionless>=5.0,<6.0",
diff --git a/src/fairscape_cli/jupyter/__init__.py b/src/fairscape_cli/jupyter/__init__.py
new file mode 100644
index 0000000..79dd58c
--- /dev/null
+++ b/src/fairscape_cli/jupyter/__init__.py
@@ -0,0 +1,3 @@
+from .magic import fairscape
+
+__all__ = ['fairscape']
diff --git a/src/fairscape_cli/jupyter/magic.py b/src/fairscape_cli/jupyter/magic.py
new file mode 100644
index 0000000..a3c123c
--- /dev/null
+++ b/src/fairscape_cli/jupyter/magic.py
@@ -0,0 +1,104 @@
+import os
+import shlex
+import pathlib
+import argparse
+from IPython.core.magic import register_cell_magic
+from IPython import get_ipython
+
+from fairscape_cli.tracking.io_capture import IOCapture
+from fairscape_cli.tracking.provenance_tracker import ProvenanceTracker
+from fairscape_cli.tracking.config import ProvenanceConfig, TrackerConfig
+from fairscape_cli.tracking.metadata_generator import create_metadata_generator
+
+
+def parse_magic_arguments(line: str) -> argparse.Namespace:
+    """Parse arguments from the magic command line."""
+    parser = argparse.ArgumentParser(description='Track Jupyter cell execution')
+    parser.add_argument('command', nargs='?', default=None)
+    parser.add_argument('--rocrate-path', type=str, default=None)
+    parser.add_argument('--author', type=str, default="Unknown")
+    parser.add_argument('--keywords', nargs='+', default=["jupyter", "computation"])
+    parser.add_argument('--input', nargs='+', default=[], dest='manual_inputs')
+    parser.add_argument('--no-llm', action='store_true', help='Disable LLM descriptions')
+
+    # shlex keeps quoted values such as --author "Jane Doe" together as one token
+    args_list = shlex.split(line)
+
+    try:
+        args = parser.parse_args(args_list)
+    except SystemExit:
+        print("Usage: %%fairscape track [--rocrate-path PATH] [--author AUTHOR] [--keywords KW1 KW2] [--input FILE1 FILE2] [--no-llm]")
+        raise
+
+    return args
+
+
+def execute_cell_safely(cell: str) -> bool:
+    """Execute cell and return success status."""
+    ip = get_ipython()
+    result = ip.run_cell(cell)
+
+    if result.error_in_exec:
+        print("ERROR: Cell execution failed")
+        return False
+
+    return True
+
+
+@register_cell_magic
+def fairscape(line, cell):
+    """
+    Jupyter cell magic for tracking computational provenance.
+
+    Usage:
+        %%fairscape track [options]
+        <code>
+
+    Options:
+        --rocrate-path PATH    Path to RO-Crate directory (default: current directory)
+        --author AUTHOR        Author name (default: from RO-Crate or "Unknown")
+        --keywords KW1 KW2     Keywords for metadata (default: from RO-Crate or ["jupyter", "computation"])
+        --input FILE1 FILE2    Manual input files to track
+        --no-llm               Disable LLM-based description generation
+    """
+    args = parse_magic_arguments(line)
+
+    if args.command != 'track':
+        print("Usage: %%fairscape track [options]")
+        return
+
+    rocrate_path = pathlib.Path(args.rocrate_path) if args.rocrate_path else pathlib.Path.cwd()
+
+    tracker_config = TrackerConfig()
+
+    with IOCapture(config=tracker_config) as capture:
+        if not execute_cell_safely(cell):
+            return
+
+    use_llm = bool(not args.no_llm and os.environ.get("GEMINI_API_KEY"))
+
+    metadata_generator = None
+    if use_llm:
+        from datetime import datetime
+        metadata_generator = create_metadata_generator(
+            provider="gemini",
+            timestamp=datetime.now().strftime("%Y%m%d_%H%M%S")
+        )
+
+    provenance_config = ProvenanceConfig(
+        rocrate_path=rocrate_path,
+        author=args.author,
+        keywords=args.keywords,
+        manual_inputs=args.manual_inputs,
+        use_llm=use_llm
+    )
+
+    try:
+        tracker = ProvenanceTracker(
+            config=provenance_config,
+            metadata_generator=metadata_generator
+        )
+
+        result = tracker.track_execution(cell, capture)
+        print(result)
+
+    except Exception as e:
+        print(f"ERROR: Tracking failed: {e}")
+        raise
diff --git a/src/fairscape_cli/tracking/__init__.py b/src/fairscape_cli/tracking/__init__.py
new file mode 100644
index 0000000..a15e09c
--- /dev/null
+++ b/src/fairscape_cli/tracking/__init__.py
@@ -0,0 +1,35 @@
+from .io_capture import IOCapture
+from .provenance_tracker import ProvenanceTracker
+from .metadata_generator import (
+    MetadataGenerator,
+    GeminiMetadataGenerator,
+    FallbackMetadataGenerator,
+    MockMetadataGenerator,
+    create_metadata_generator
+)
+from .config import TrackerConfig, ProvenanceConfig, TrackingResult
+from .utils import (
+    normalize_path,
+    is_trackable_path,
+    read_dataset_sample,
+    collect_dataset_samples,
+    format_samples_for_prompt
+)
+
+__all__ = [
+    'IOCapture',
+    'ProvenanceTracker',
+    'MetadataGenerator',
+    'GeminiMetadataGenerator',
+    'FallbackMetadataGenerator',
+    'MockMetadataGenerator',
+    'create_metadata_generator',
+    'TrackerConfig',
+    'ProvenanceConfig',
+    'TrackingResult',
+    'normalize_path',
+    'is_trackable_path',
+    'read_dataset_sample',
+    'collect_dataset_samples',
+    'format_samples_for_prompt',
+]
\ No newline at end of file
diff --git a/src/fairscape_cli/tracking/config.py b/src/fairscape_cli/tracking/config.py
new file mode 100644
index 0000000..0feb2d4
--- /dev/null
+++ b/src/fairscape_cli/tracking/config.py
@@ -0,0 +1,46 @@
+from dataclasses import dataclass, field
+from typing import List, Optional
+from pathlib import Path
+
+
+@dataclass
+class TrackerConfig:
+    track_builtins: bool = True
+    track_pathlib: bool = True
+    track_pandas: bool = True
+    track_numpy: bool = True
+    excluded_patterns: List[str] = field(default_factory=lambda: [
+        '.matplotlib',
+        '.ipython',
+        '.jupyter',
+        'site-packages',
+        '/tmp/',
+        '__pycache__'
+    ])
+
+
+@dataclass
+class TrackingResult:
+    computation_guid: str
+    software_guid: str
+    input_count: int
+    output_count: int
+    
reused_count: int + new_datasets: int + + def __str__(self): + return ( + f"Tracked computation: {self.computation_guid}\n" + f" Software: {self.software_guid}\n" + f" Inputs: {self.input_count} datasets ({self.reused_count} reused)\n" + f" Outputs: {self.output_count} datasets" + ) + + +@dataclass +class ProvenanceConfig: + rocrate_path: Path + author: str = "Unknown" + keywords: List[str] = field(default_factory=lambda: ["jupyter", "computation"]) + manual_inputs: List[str] = field(default_factory=list) + use_llm: bool = False diff --git a/src/fairscape_cli/tracking/io_capture.py b/src/fairscape_cli/tracking/io_capture.py new file mode 100644 index 0000000..f02c440 --- /dev/null +++ b/src/fairscape_cli/tracking/io_capture.py @@ -0,0 +1,256 @@ +import builtins +import pathlib +from typing import Set, Dict, Any +import pandas as pd +import numpy as np + +from .config import TrackerConfig +from .utils import normalize_path, is_trackable_path + + +class IOCapture: + """Captures file I/O operations during code execution.""" + + def __init__(self, config: TrackerConfig = None): + self.config = config or TrackerConfig() + self.inputs: Set[str] = set() + self.outputs: Set[str] = set() + self.original_functions: Dict[str, Any] = {} + self.captured_variables: Dict[str, Any] = {} + + def _should_track(self, filepath) -> bool: + """Check if filepath should be tracked.""" + return is_trackable_path(filepath, self.config.excluded_patterns) + + def _normalize_path(self, filepath) -> str: + """Normalize filepath to absolute string.""" + return normalize_path(filepath) + + def patch_open(self): + """Patch builtin open function to track file I/O.""" + if not self.config.track_builtins: + return + + original_open = builtins.open + self.original_functions['builtins.open'] = original_open + + capture = self + + def tracked_open(file, mode='r', *args, **kwargs): + if capture._should_track(file): + normalized = capture._normalize_path(file) + if 'r' in mode: + capture.inputs.add(normalized) + if any(m in mode for m in ['w', 'a', 'x']): + capture.outputs.add(normalized) + return original_open(file, mode, *args, **kwargs) + + builtins.open = tracked_open + + def patch_pathlib(self): + """Patch pathlib methods to track file I/O.""" + if not self.config.track_pathlib: + return + + original_path_open = pathlib.Path.open + original_read_text = pathlib.Path.read_text + original_read_bytes = pathlib.Path.read_bytes + original_write_text = pathlib.Path.write_text + original_write_bytes = pathlib.Path.write_bytes + + self.original_functions['pathlib.Path.open'] = original_path_open + self.original_functions['pathlib.Path.read_text'] = original_read_text + self.original_functions['pathlib.Path.read_bytes'] = original_read_bytes + self.original_functions['pathlib.Path.write_text'] = original_write_text + self.original_functions['pathlib.Path.write_bytes'] = original_write_bytes + + capture = self + + def tracked_path_open(self, mode='r', *args, **kwargs): + if capture._should_track(self): + normalized = capture._normalize_path(self) + if 'r' in mode: + capture.inputs.add(normalized) + if any(m in mode for m in ['w', 'a', 'x']): + capture.outputs.add(normalized) + return original_path_open(self, mode, *args, **kwargs) + + def tracked_read_text(self, *args, **kwargs): + if capture._should_track(self): + capture.inputs.add(capture._normalize_path(self)) + return original_read_text(self, *args, **kwargs) + + def tracked_read_bytes(self, *args, **kwargs): + if capture._should_track(self): + 
capture.inputs.add(capture._normalize_path(self)) + return original_read_bytes(self, *args, **kwargs) + + def tracked_write_text(self, *args, **kwargs): + if capture._should_track(self): + capture.outputs.add(capture._normalize_path(self)) + return original_write_text(self, *args, **kwargs) + + def tracked_write_bytes(self, *args, **kwargs): + if capture._should_track(self): + capture.outputs.add(capture._normalize_path(self)) + return original_write_bytes(self, *args, **kwargs) + + pathlib.Path.open = tracked_path_open + pathlib.Path.read_text = tracked_read_text + pathlib.Path.read_bytes = tracked_read_bytes + pathlib.Path.write_text = tracked_write_text + pathlib.Path.write_bytes = tracked_write_bytes + + def patch_pandas(self): + """Patch pandas methods to track file I/O.""" + if not self.config.track_pandas: + return + + original_read_csv = pd.read_csv + original_read_excel = pd.read_excel + original_read_parquet = pd.read_parquet + original_read_json = pd.read_json + original_to_csv = pd.DataFrame.to_csv + original_to_excel = pd.DataFrame.to_excel + original_to_parquet = pd.DataFrame.to_parquet + original_to_json = pd.DataFrame.to_json + + self.original_functions['pd.read_csv'] = original_read_csv + self.original_functions['pd.read_excel'] = original_read_excel + self.original_functions['pd.read_parquet'] = original_read_parquet + self.original_functions['pd.read_json'] = original_read_json + self.original_functions['pd.DataFrame.to_csv'] = original_to_csv + self.original_functions['pd.DataFrame.to_excel'] = original_to_excel + self.original_functions['pd.DataFrame.to_parquet'] = original_to_parquet + self.original_functions['pd.DataFrame.to_json'] = original_to_json + + capture = self + + def tracked_read_csv(filepath_or_buffer, *args, **kwargs): + if capture._should_track(filepath_or_buffer): + capture.inputs.add(capture._normalize_path(filepath_or_buffer)) + return original_read_csv(filepath_or_buffer, *args, **kwargs) + + def tracked_read_excel(io, *args, **kwargs): + if capture._should_track(io): + capture.inputs.add(capture._normalize_path(io)) + return original_read_excel(io, *args, **kwargs) + + def tracked_read_parquet(path, *args, **kwargs): + if capture._should_track(path): + capture.inputs.add(capture._normalize_path(path)) + return original_read_parquet(path, *args, **kwargs) + + def tracked_read_json(path_or_buf, *args, **kwargs): + if capture._should_track(path_or_buf): + capture.inputs.add(capture._normalize_path(path_or_buf)) + return original_read_json(path_or_buf, *args, **kwargs) + + def tracked_to_csv(df_self, path_or_buf=None, *args, **kwargs): + if path_or_buf and capture._should_track(path_or_buf): + capture.outputs.add(capture._normalize_path(path_or_buf)) + return original_to_csv(df_self, path_or_buf, *args, **kwargs) + + def tracked_to_excel(df_self, excel_writer, *args, **kwargs): + if capture._should_track(excel_writer): + capture.outputs.add(capture._normalize_path(excel_writer)) + return original_to_excel(df_self, excel_writer, *args, **kwargs) + + def tracked_to_parquet(df_self, path, *args, **kwargs): + if capture._should_track(path): + capture.outputs.add(capture._normalize_path(path)) + return original_to_parquet(df_self, path, *args, **kwargs) + + def tracked_to_json(df_self, path_or_buf=None, *args, **kwargs): + if path_or_buf and capture._should_track(path_or_buf): + capture.outputs.add(capture._normalize_path(path_or_buf)) + return original_to_json(df_self, path_or_buf, *args, **kwargs) + + pd.read_csv = tracked_read_csv + pd.read_excel = 
tracked_read_excel + pd.read_parquet = tracked_read_parquet + pd.read_json = tracked_read_json + pd.DataFrame.to_csv = tracked_to_csv + pd.DataFrame.to_excel = tracked_to_excel + pd.DataFrame.to_parquet = tracked_to_parquet + pd.DataFrame.to_json = tracked_to_json + + def patch_numpy(self): + """Patch numpy methods to track file I/O.""" + if not self.config.track_numpy: + return + + original_load = np.load + original_save = np.save + original_loadtxt = np.loadtxt + original_savetxt = np.savetxt + + self.original_functions['np.load'] = original_load + self.original_functions['np.save'] = original_save + self.original_functions['np.loadtxt'] = original_loadtxt + self.original_functions['np.savetxt'] = original_savetxt + + capture = self + + def tracked_load(file, *args, **kwargs): + if capture._should_track(file): + capture.inputs.add(capture._normalize_path(file)) + return original_load(file, *args, **kwargs) + + def tracked_save(file, arr, *args, **kwargs): + if capture._should_track(file): + capture.outputs.add(capture._normalize_path(file)) + return original_save(file, arr, *args, **kwargs) + + def tracked_loadtxt(fname, *args, **kwargs): + if capture._should_track(fname): + capture.inputs.add(capture._normalize_path(fname)) + return original_loadtxt(fname, *args, **kwargs) + + def tracked_savetxt(fname, X, *args, **kwargs): + if capture._should_track(fname): + capture.outputs.add(capture._normalize_path(fname)) + return original_savetxt(fname, X, *args, **kwargs) + + np.load = tracked_load + np.save = tracked_save + np.loadtxt = tracked_loadtxt + np.savetxt = tracked_savetxt + + def restore_all(self): + """Restore all original functions.""" + builtins.open = self.original_functions.get('builtins.open', builtins.open) + + if 'pathlib.Path.open' in self.original_functions: + pathlib.Path.open = self.original_functions['pathlib.Path.open'] + pathlib.Path.read_text = self.original_functions['pathlib.Path.read_text'] + pathlib.Path.read_bytes = self.original_functions['pathlib.Path.read_bytes'] + pathlib.Path.write_text = self.original_functions['pathlib.Path.write_text'] + pathlib.Path.write_bytes = self.original_functions['pathlib.Path.write_bytes'] + + if 'pd.read_csv' in self.original_functions: + pd.read_csv = self.original_functions['pd.read_csv'] + pd.read_excel = self.original_functions['pd.read_excel'] + pd.read_parquet = self.original_functions['pd.read_parquet'] + pd.read_json = self.original_functions['pd.read_json'] + pd.DataFrame.to_csv = self.original_functions['pd.DataFrame.to_csv'] + pd.DataFrame.to_excel = self.original_functions['pd.DataFrame.to_excel'] + pd.DataFrame.to_parquet = self.original_functions['pd.DataFrame.to_parquet'] + pd.DataFrame.to_json = self.original_functions['pd.DataFrame.to_json'] + + if 'np.load' in self.original_functions: + np.load = self.original_functions['np.load'] + np.save = self.original_functions['np.save'] + np.loadtxt = self.original_functions['np.loadtxt'] + np.savetxt = self.original_functions['np.savetxt'] + + def __enter__(self): + self.patch_open() + self.patch_pathlib() + self.patch_pandas() + self.patch_numpy() + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.restore_all() + return False diff --git a/src/fairscape_cli/tracking/metadata_generator.py b/src/fairscape_cli/tracking/metadata_generator.py new file mode 100644 index 0000000..b44705c --- /dev/null +++ b/src/fairscape_cli/tracking/metadata_generator.py @@ -0,0 +1,183 @@ +import os +import json +import re +from typing import Dict, Optional, Any +from abc 
import ABC, abstractmethod + + +DESCRIPTION_PROMPT_TEMPLATE = """ROLE: Research data management expert specializing in FAIR metadata + +TASK: Generate concise, technical descriptions for a computational workflow + +INPUT FORMAT: +- Software code that was executed +- Input datasets with samples (first 5 rows) +- Output datasets with samples (first 5 rows) + +OUTPUT FORMAT: JSON object with these keys: +{{ + "software_description": "What this code does technically", + "computation_description": "What this computation accomplishes", + "input_datasets": {{ + "filename": "Description of this input's role and content" + }}, + "output_datasets": {{ + "filename": "Description of this output's content and meaning" + }} +}} + +REQUIREMENTS: +- Software description: 1-2 sentences, focus on operations performed +- Computation description: 1-2 sentences, focus on scientific/analytical goal +- Dataset descriptions: 1 sentence each, describe content type and role in workflow +- Be technical but clear, assume scientific audience +- No markdown formatting, just plain JSON + +SOFTWARE CODE: +```python +{code} +``` + +INPUT DATASETS: +{input_samples} + +OUTPUT DATASETS: +{output_samples} + +Generate the JSON now:""" + + +class MetadataGenerator(ABC): + """Abstract base class for generating metadata descriptions.""" + + @abstractmethod + def generate_descriptions( + self, + code: str, + input_samples: str, + output_samples: str + ) -> Optional[Dict[str, Any]]: + """Generate descriptions for code and datasets.""" + pass + + +class GeminiMetadataGenerator(MetadataGenerator): + """Generate metadata using Google Gemini.""" + + def __init__(self, api_key: Optional[str] = None, temperature: float = 0.2, max_tokens: int = 2048): + self.api_key = api_key or os.environ.get("GEMINI_API_KEY") + self.temperature = temperature + self.max_tokens = max_tokens + + if not self.api_key: + raise ValueError("GEMINI_API_KEY not found in environment or provided") + + def _build_prompt(self, code: str, input_samples: str, output_samples: str) -> str: + """Build the prompt for the LLM.""" + return DESCRIPTION_PROMPT_TEMPLATE.format( + code=code, + input_samples=input_samples, + output_samples=output_samples + ) + + def _parse_llm_response(self, response_text: str) -> Dict[str, Any]: + """Parse JSON from LLM response, handling various formats.""" + response_text = response_text.strip() + + json_match = re.search(r'```json\s*(\{.*?\})\s*```', response_text, re.DOTALL) + if json_match: + response_text = json_match.group(1) + else: + code_match = re.search(r'```\s*(\{.*?\})\s*```', response_text, re.DOTALL) + if code_match: + response_text = code_match.group(1) + elif not (response_text.startswith('{') and response_text.endswith('}')): + json_search = re.search(r'\{.*\}', response_text, re.DOTALL) + if json_search: + response_text = json_search.group(0) + + return json.loads(response_text) + + def generate_descriptions( + self, + code: str, + input_samples: str, + output_samples: str + ) -> Optional[Dict[str, Any]]: + """Generate descriptions using Gemini.""" + import google.generativeai as genai + + genai.configure(api_key=self.api_key) + model = genai.GenerativeModel('gemini-2.0-flash-exp') + + prompt = self._build_prompt(code, input_samples, output_samples) + response = model.generate_content( + prompt, + generation_config=genai.types.GenerationConfig( + temperature=self.temperature, + max_output_tokens=self.max_tokens, + ) + ) + + return self._parse_llm_response(response.text) + + +class FallbackMetadataGenerator(MetadataGenerator): 
+ """Generate simple timestamp-based descriptions without LLM.""" + + def __init__(self, timestamp: str): + self.timestamp = timestamp + + def generate_descriptions( + self, + code: str, + input_samples: str, + output_samples: str + ) -> Dict[str, Any]: + """Generate simple fallback descriptions.""" + return { + "software_description": f"Code executed at {self.timestamp}", + "computation_description": f"Computation executed at {self.timestamp}", + "input_datasets": {}, + "output_datasets": {} + } + + +class MockMetadataGenerator(MetadataGenerator): + """Mock generator for testing.""" + + def __init__(self, mock_response: Dict[str, Any]): + self.mock_response = mock_response + + def generate_descriptions( + self, + code: str, + input_samples: str, + output_samples: str + ) -> Dict[str, Any]: + """Return mock response.""" + return self.mock_response + + +def create_metadata_generator( + provider: str = "gemini", + api_key: Optional[str] = None, + **kwargs +) -> MetadataGenerator: + """Factory function to create appropriate metadata generator.""" + + if provider == "gemini": + try: + gemini_kwargs = {k: v for k, v in kwargs.items() if k in ['temperature', 'max_tokens']} + return GeminiMetadataGenerator(api_key=api_key, **gemini_kwargs) + except ValueError: + print("WARNING: GEMINI_API_KEY not found, using fallback descriptions") + timestamp = kwargs.get('timestamp', 'unknown') + return FallbackMetadataGenerator(timestamp=timestamp) + elif provider == "fallback": + timestamp = kwargs.get('timestamp', 'unknown') + return FallbackMetadataGenerator(timestamp=timestamp) + elif provider == "mock": + return MockMetadataGenerator(kwargs.get('mock_response', {})) + else: + raise ValueError(f"Unknown metadata generator provider: {provider}") \ No newline at end of file diff --git a/src/fairscape_cli/tracking/provenance_tracker.py b/src/fairscape_cli/tracking/provenance_tracker.py new file mode 100644 index 0000000..215ac57 --- /dev/null +++ b/src/fairscape_cli/tracking/provenance_tracker.py @@ -0,0 +1,349 @@ +from pathlib import Path +from datetime import datetime +from typing import List, Dict, Set, Optional + +from fairscape_cli.models.rocrate import ReadROCrateMetadata, AppendCrate +from fairscape_cli.models.dataset import GenerateDataset, Dataset +from fairscape_cli.models.software import GenerateSoftware, Software +from fairscape_cli.models.computation import GenerateComputation, Computation + +from .config import ProvenanceConfig, TrackingResult +from .io_capture import IOCapture +from .metadata_generator import MetadataGenerator, FallbackMetadataGenerator, create_metadata_generator +from .utils import collect_dataset_samples, format_samples_for_prompt + +from fairscape_cli.models.rocrate import GenerateROCrate +from datetime import datetime + + +class ProvenanceTracker: + """Tracks computational provenance and generates RO-Crate metadata.""" + + def __init__( + self, + config: ProvenanceConfig, + metadata_generator: Optional[MetadataGenerator] = None + ): + self.config = config + self.metadata_generator = metadata_generator + self.filepath_to_guid: Dict[str, str] = {} + self.crate_metadata = None + + self._ensure_crate_exists() + self._load_crate_context() + + def _ensure_crate_exists(self): + """Create RO-Crate if it doesn't exist with placeholder metadata.""" + metadata_path = self.config.rocrate_path / 'ro-crate-metadata.json' + + if metadata_path.exists(): + return + + placeholder_name = f"Research Project {datetime.now().strftime('%Y%m%d')}" + placeholder_description = "Automatically 
generated RO-Crate for computational provenance tracking" + + GenerateROCrate( + path=self.config.rocrate_path, + guid=None, + name=placeholder_name, + description=placeholder_description, + author=self.config.author if self.config.author != "Unknown" else "Researcher", + keywords=self.config.keywords, + datePublished=datetime.now().isoformat(), + version="1.0", + license="https://creativecommons.org/licenses/by/4.0/" + ) + + + def _load_crate_context(self): + """Load existing RO-Crate and build filepath to GUID mapping.""" + try: + self.crate_metadata = ReadROCrateMetadata(self.config.rocrate_path) + + root_dataset = self.crate_metadata['@graph'][1] + + if self.config.author == "Unknown": + if hasattr(root_dataset, 'author') and root_dataset.author: + self.config.author = root_dataset.author + + if self.config.keywords == ["jupyter", "computation"]: + if hasattr(root_dataset, 'keywords') and root_dataset.keywords: + self.config.keywords = root_dataset.keywords + + for entity in self.crate_metadata.get('@graph', []): + entity_types = getattr(entity, '@type', []) + if isinstance(entity_types, str): + entity_types = [entity_types] + + content_url = getattr(entity, 'contentUrl', None) + if content_url and content_url.startswith('file://'): + relative_path = content_url.replace('file:///', '').lstrip('/') + filepath_full = (self.config.rocrate_path / relative_path).resolve() + self.filepath_to_guid[str(filepath_full)] = getattr(entity, 'guid') + + print(self.filepath_to_guid) + + except Exception as e: + raise RuntimeError(f"Could not read RO-Crate at {self.config.rocrate_path}: {e}") + + def _resolve_manual_inputs(self) -> Set[str]: + """Convert manual input paths to absolute paths.""" + manual_input_paths = set() + for manual_input in self.config.manual_inputs: + manual_path = Path(manual_input) + if not manual_path.is_absolute(): + manual_path = (self.config.rocrate_path / manual_input).resolve() + else: + manual_path = manual_path.resolve() + manual_input_paths.add(str(manual_path)) + return manual_input_paths + + def _resolve_inputs(self, io_capture: IOCapture) -> tuple[List[Dataset], int]: + """Resolve input datasets, reusing existing ones where possible.""" + all_input_files = set(io_capture.inputs) + all_input_files.update(self._resolve_manual_inputs()) + + input_datasets = [] + reused_count = 0 + + for input_file in all_input_files: + input_path = Path(input_file) + + if not input_path.exists(): + print(f"WARNING: Input file does not exist: {input_file}") + continue + + normalized_path = input_path.resolve() + + if str(normalized_path) in self.filepath_to_guid: + existing_guid = self.filepath_to_guid[str(normalized_path)] + print(f"Reusing existing dataset: {input_path.name} ({existing_guid})") + + existing_dataset = next( + (e for e in self.crate_metadata['@graph'] if e.get('@id') == existing_guid), + None + ) + if existing_dataset: + dataset_obj = Dataset(**existing_dataset) + input_datasets.append(dataset_obj) + reused_count += 1 + continue + + rel_path = input_path.relative_to(self.config.rocrate_path) if input_path.is_relative_to(self.config.rocrate_path) else input_path + + dataset_metadata = GenerateDataset( + name=input_path.name, + author=self.config.author, + version="1.0", + description=f"Input dataset", + keywords=self.config.keywords, + format=input_path.suffix.lstrip('.') or "unknown", + filepath=str(rel_path), + datePublished=datetime.now().isoformat(), + cratePath=self.config.rocrate_path + ) + input_datasets.append(dataset_metadata) + + return input_datasets, 
reused_count + + def _resolve_outputs(self, io_capture: IOCapture) -> List[Dataset]: + """Resolve output datasets.""" + output_datasets = [] + + for output_file in io_capture.outputs: + output_path = Path(output_file) + normalized_output = output_path.resolve() + + if str(normalized_output) in self.filepath_to_guid: + print(f"WARNING: Overwriting existing dataset: {output_path.name}") + + rel_path = output_path.relative_to(self.config.rocrate_path) if output_path.is_relative_to(self.config.rocrate_path) else output_path + + dataset_metadata = GenerateDataset( + name=output_path.name, + author=self.config.author, + version="1.0", + description=f"Output dataset", + keywords=self.config.keywords, + format=output_path.suffix.lstrip('.') or "unknown", + filepath=str(rel_path), + datePublished=datetime.now().isoformat(), + generatedBy=[], + cratePath=self.config.rocrate_path + ) + output_datasets.append(dataset_metadata) + + return output_datasets + + def _enhance_with_llm( + self, + code: str, + input_datasets: List[Dataset], + output_datasets: List[Dataset] + ) -> Optional[Dict]: + """Use LLM to generate better descriptions if available.""" + + if not self.metadata_generator: + return None + + all_input_files = set() + for ds in input_datasets: + if ds.contentUrl: + url = ds.contentUrl if isinstance(ds.contentUrl, str) else ds.contentUrl[0] + filepath = url.replace('file:///', '').replace('file:', '') + full_path = str((self.config.rocrate_path / filepath).resolve()) + all_input_files.add(full_path) + + all_output_files = set() + for ds in output_datasets: + if ds.contentUrl: + url = ds.contentUrl if isinstance(ds.contentUrl, str) else ds.contentUrl[0] + filepath = url.replace('file://', '').replace('file:', '') + full_path = str((self.config.rocrate_path / filepath).resolve()) + all_output_files.add(full_path) + + input_samples = collect_dataset_samples(all_input_files) + output_samples = collect_dataset_samples(all_output_files) + + if not input_samples and not output_samples: + return None + + input_samples_str = format_samples_for_prompt(input_samples) + output_samples_str = format_samples_for_prompt(output_samples) + + return self.metadata_generator.generate_descriptions( + code, + input_samples_str, + output_samples_str + ) + + def _apply_llm_descriptions( + self, + llm_descriptions: Optional[Dict], + software_name: str, + input_datasets: List[Dataset], + output_datasets: List[Dataset] + ) -> tuple[str, str]: + """Apply LLM descriptions to datasets and return software/computation descriptions.""" + software_description = f"Code executed" + computation_description = f"Computation executed" + + if llm_descriptions: + if 'software_description' in llm_descriptions: + software_description = llm_descriptions['software_description'] + + if 'computation_description' in llm_descriptions: + computation_description = llm_descriptions['computation_description'] + + if 'input_datasets' in llm_descriptions: + for ds in input_datasets: + if ds.name in llm_descriptions['input_datasets']: + ds.description = llm_descriptions['input_datasets'][ds.name] + + if 'output_datasets' in llm_descriptions: + for ds in output_datasets: + if ds.name in llm_descriptions['output_datasets']: + ds.description = llm_descriptions['output_datasets'][ds.name] + + return software_description, computation_description + + def _create_software(self, code: str, name: str, description: str) -> Software: + """Create software metadata.""" + software_filepath = f"software/{name}.py" + software_full_path = self.config.rocrate_path / 
software_filepath + software_full_path.parent.mkdir(parents=True, exist_ok=True) + software_full_path.write_text(code) + + return GenerateSoftware( + name=name, + author=self.config.author, + version="1.0", + description=description, + dateModified=datetime.now().isoformat(), + keywords=self.config.keywords, + fileFormat="py", + filepath=software_filepath, + cratePath=self.config.rocrate_path + ) + + def _create_computation( + self, + name: str, + description: str, + software: Software, + input_datasets: List[Dataset], + output_datasets: List[Dataset] + ) -> Computation: + """Create computation metadata.""" + computation = GenerateComputation( + name=name, + runBy=self.config.author, + dateCreated=datetime.now().isoformat(), + description=description, + keywords=self.config.keywords, + usedSoftware=[software.guid], + usedDataset=[ds.guid for ds in input_datasets], + generated=[ds.guid for ds in output_datasets] + ) + + for output_ds in output_datasets: + output_ds.generatedBy = {"@id": computation.guid} + + return computation + + def track_execution( + self, + code: str, + io_capture: IOCapture, + execution_name: Optional[str] = None + ) -> TrackingResult: + """Track a code execution and generate provenance metadata.""" + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + execution_name = execution_name or f"cell_{timestamp}" + + if not io_capture.inputs and not io_capture.outputs and not self.config.manual_inputs: + print("No file I/O detected in execution") + raise ValueError("No file I/O detected") + + input_datasets, reused_count = self._resolve_inputs(io_capture) + output_datasets = self._resolve_outputs(io_capture) + + llm_descriptions = self._enhance_with_llm(code, input_datasets, output_datasets) + + software_description, computation_description = self._apply_llm_descriptions( + llm_descriptions, + execution_name, + input_datasets, + output_datasets + ) + + software = self._create_software(code, execution_name, software_description) + + computation = self._create_computation( + f"Computation_{execution_name}", + computation_description, + software, + input_datasets, + output_datasets + ) + + new_datasets = [] + for ds in input_datasets: + if ds.contentUrl: + url = ds.contentUrl if isinstance(ds.contentUrl, str) else ds.contentUrl[0] + filepath = url.replace('file://', '').replace('file:', '') + full_path = str((self.config.rocrate_path / filepath).resolve()) + if full_path not in self.filepath_to_guid: + new_datasets.append(ds) + + elements = [software] + new_datasets + output_datasets + [computation] + AppendCrate(cratePath=self.config.rocrate_path, elements=elements) + + return TrackingResult( + computation_guid=computation.guid, + software_guid=software.guid, + input_count=len(input_datasets), + output_count=len(output_datasets), + reused_count=reused_count, + new_datasets=len(new_datasets) + ) diff --git a/src/fairscape_cli/tracking/utils.py b/src/fairscape_cli/tracking/utils.py new file mode 100644 index 0000000..f828061 --- /dev/null +++ b/src/fairscape_cli/tracking/utils.py @@ -0,0 +1,60 @@ +from pathlib import Path +from typing import Optional, Dict +import pandas as pd + + +def normalize_path(filepath) -> str: + """Normalize a filepath to absolute string representation.""" + if isinstance(filepath, (str, Path)): + return str(Path(filepath).resolve()) + return str(filepath) + + +def is_trackable_path(filepath: str, excluded_patterns: list) -> bool: + """Check if a filepath should be tracked based on exclusion patterns.""" + if not filepath: + return False + filepath_str 
= str(filepath) + return not any(pattern in filepath_str.lower() for pattern in excluded_patterns) + + +def read_dataset_sample(filepath: str, n_rows: int = 5) -> Optional[str]: + """Read first n rows from a dataset file as a string sample.""" + try: + path = Path(filepath) + if not path.exists(): + return None + + suffix = path.suffix.lower() + df = None + + if suffix == '.csv': + df = pd.read_csv(filepath) + elif suffix == '.parquet': + df = pd.read_parquet(filepath) + elif suffix in ['.xlsx', '.xls']: + df = pd.read_excel(filepath) + + if df is not None: + return df.head(n_rows).to_string() + + return None + except Exception: + return None + + +def collect_dataset_samples(filepaths: set, n_rows: int = 5) -> Dict[str, str]: + """Collect samples from multiple dataset files.""" + samples = {} + for filepath in filepaths: + sample = read_dataset_sample(filepath, n_rows) + if sample: + samples[Path(filepath).name] = sample + return samples + + +def format_samples_for_prompt(samples_dict: Dict[str, str]) -> str: + """Format dataset samples into a string for LLM prompts.""" + if not samples_dict: + return "None" + return "\n\n".join([f"File: {name}\n{sample}" for name, sample in samples_dict.items()]) From 9e099b0f45939ae0ce34f47d37b15ce423536875 Mon Sep 17 00:00:00 2001 From: jniestroy Date: Thu, 6 Nov 2025 08:17:32 -0500 Subject: [PATCH 2/6] fix pydantic not dict issues --- .../tracking/provenance_tracker.py | 36 +++++++------------ 1 file changed, 13 insertions(+), 23 deletions(-) diff --git a/src/fairscape_cli/tracking/provenance_tracker.py b/src/fairscape_cli/tracking/provenance_tracker.py index 215ac57..1aaaf3c 100644 --- a/src/fairscape_cli/tracking/provenance_tracker.py +++ b/src/fairscape_cli/tracking/provenance_tracker.py @@ -17,7 +17,6 @@ class ProvenanceTracker: - """Tracks computational provenance and generates RO-Crate metadata.""" def __init__( self, @@ -33,7 +32,6 @@ def __init__( self._load_crate_context() def _ensure_crate_exists(self): - """Create RO-Crate if it doesn't exist with placeholder metadata.""" metadata_path = self.config.rocrate_path / 'ro-crate-metadata.json' if metadata_path.exists(): @@ -56,7 +54,6 @@ def _ensure_crate_exists(self): def _load_crate_context(self): - """Load existing RO-Crate and build filepath to GUID mapping.""" try: self.crate_metadata = ReadROCrateMetadata(self.config.rocrate_path) @@ -70,7 +67,7 @@ def _load_crate_context(self): if hasattr(root_dataset, 'keywords') and root_dataset.keywords: self.config.keywords = root_dataset.keywords - for entity in self.crate_metadata.get('@graph', []): + for entity in self.crate_metadata['@graph']: entity_types = getattr(entity, '@type', []) if isinstance(entity_types, str): entity_types = [entity_types] @@ -80,14 +77,11 @@ def _load_crate_context(self): relative_path = content_url.replace('file:///', '').lstrip('/') filepath_full = (self.config.rocrate_path / relative_path).resolve() self.filepath_to_guid[str(filepath_full)] = getattr(entity, 'guid') - - print(self.filepath_to_guid) except Exception as e: raise RuntimeError(f"Could not read RO-Crate at {self.config.rocrate_path}: {e}") def _resolve_manual_inputs(self) -> Set[str]: - """Convert manual input paths to absolute paths.""" manual_input_paths = set() for manual_input in self.config.manual_inputs: manual_path = Path(manual_input) @@ -99,7 +93,6 @@ def _resolve_manual_inputs(self) -> Set[str]: return manual_input_paths def _resolve_inputs(self, io_capture: IOCapture) -> tuple[List[Dataset], int]: - """Resolve input datasets, reusing existing 
ones where possible.""" all_input_files = set(io_capture.inputs) all_input_files.update(self._resolve_manual_inputs()) @@ -120,11 +113,11 @@ def _resolve_inputs(self, io_capture: IOCapture) -> tuple[List[Dataset], int]: print(f"Reusing existing dataset: {input_path.name} ({existing_guid})") existing_dataset = next( - (e for e in self.crate_metadata['@graph'] if e.get('@id') == existing_guid), + (e for e in self.crate_metadata['@graph'] if getattr(e, '@id', None) == existing_guid), None ) if existing_dataset: - dataset_obj = Dataset(**existing_dataset) + dataset_obj = Dataset(**existing_dataset.__dict__) input_datasets.append(dataset_obj) reused_count += 1 continue @@ -147,7 +140,6 @@ def _resolve_inputs(self, io_capture: IOCapture) -> tuple[List[Dataset], int]: return input_datasets, reused_count def _resolve_outputs(self, io_capture: IOCapture) -> List[Dataset]: - """Resolve output datasets.""" output_datasets = [] for output_file in io_capture.outputs: @@ -181,23 +173,24 @@ def _enhance_with_llm( input_datasets: List[Dataset], output_datasets: List[Dataset] ) -> Optional[Dict]: - """Use LLM to generate better descriptions if available.""" if not self.metadata_generator: return None all_input_files = set() for ds in input_datasets: - if ds.contentUrl: - url = ds.contentUrl if isinstance(ds.contentUrl, str) else ds.contentUrl[0] + content_url = getattr(ds, 'contentUrl', None) + if content_url: + url = content_url if isinstance(content_url, str) else content_url[0] filepath = url.replace('file:///', '').replace('file:', '') full_path = str((self.config.rocrate_path / filepath).resolve()) all_input_files.add(full_path) all_output_files = set() for ds in output_datasets: - if ds.contentUrl: - url = ds.contentUrl if isinstance(ds.contentUrl, str) else ds.contentUrl[0] + content_url = getattr(ds, 'contentUrl', None) + if content_url: + url = content_url if isinstance(content_url, str) else content_url[0] filepath = url.replace('file://', '').replace('file:', '') full_path = str((self.config.rocrate_path / filepath).resolve()) all_output_files.add(full_path) @@ -224,7 +217,6 @@ def _apply_llm_descriptions( input_datasets: List[Dataset], output_datasets: List[Dataset] ) -> tuple[str, str]: - """Apply LLM descriptions to datasets and return software/computation descriptions.""" software_description = f"Code executed" computation_description = f"Computation executed" @@ -248,7 +240,6 @@ def _apply_llm_descriptions( return software_description, computation_description def _create_software(self, code: str, name: str, description: str) -> Software: - """Create software metadata.""" software_filepath = f"software/{name}.py" software_full_path = self.config.rocrate_path / software_filepath software_full_path.parent.mkdir(parents=True, exist_ok=True) @@ -274,7 +265,6 @@ def _create_computation( input_datasets: List[Dataset], output_datasets: List[Dataset] ) -> Computation: - """Create computation metadata.""" computation = GenerateComputation( name=name, runBy=self.config.author, @@ -297,7 +287,6 @@ def track_execution( io_capture: IOCapture, execution_name: Optional[str] = None ) -> TrackingResult: - """Track a code execution and generate provenance metadata.""" timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") execution_name = execution_name or f"cell_{timestamp}" @@ -329,8 +318,9 @@ def track_execution( new_datasets = [] for ds in input_datasets: - if ds.contentUrl: - url = ds.contentUrl if isinstance(ds.contentUrl, str) else ds.contentUrl[0] + content_url = getattr(ds, 'contentUrl', None) + if 
content_url: + url = content_url if isinstance(content_url, str) else content_url[0] filepath = url.replace('file://', '').replace('file:', '') full_path = str((self.config.rocrate_path / filepath).resolve()) if full_path not in self.filepath_to_guid: @@ -346,4 +336,4 @@ def track_execution( output_count=len(output_datasets), reused_count=reused_count, new_datasets=len(new_datasets) - ) + ) \ No newline at end of file From 9c4764a938fb25856c548728972d810d84af41fe Mon Sep 17 00:00:00 2001 From: jniestroy Date: Thu, 6 Nov 2025 08:26:07 -0500 Subject: [PATCH 3/6] track cli command --- src/fairscape_cli/__main__.py | 2 + src/fairscape_cli/commands/track.py | 135 ++++++++++++++++++ .../tracking/provenance_tracker.py | 3 +- 3 files changed, 139 insertions(+), 1 deletion(-) create mode 100644 src/fairscape_cli/commands/track.py diff --git a/src/fairscape_cli/__main__.py b/src/fairscape_cli/__main__.py index cc7b3dc..142bd5c 100644 --- a/src/fairscape_cli/__main__.py +++ b/src/fairscape_cli/__main__.py @@ -7,6 +7,7 @@ from fairscape_cli.commands.publish_commands import publish_group from fairscape_cli.commands.schema_commands import schema from fairscape_cli.commands.augment_commands import augment_group +from fairscape_cli.commands.track import track @click.group(invoke_without_command=True) @click.pass_context @@ -25,6 +26,7 @@ def cli(ctx): cli.add_command(publish_group, name='publish') cli.add_command(schema, name='schema') cli.add_command(augment_group, name='augment') +cli.add_command(track, name='track') if __name__ == "__main__": cli() \ No newline at end of file diff --git a/src/fairscape_cli/commands/track.py b/src/fairscape_cli/commands/track.py new file mode 100644 index 0000000..ddafd20 --- /dev/null +++ b/src/fairscape_cli/commands/track.py @@ -0,0 +1,135 @@ +import click +import pathlib +import os +import runpy +from typing import List + +from fairscape_cli.tracking.io_capture import IOCapture +from fairscape_cli.tracking.provenance_tracker import ProvenanceTracker +from fairscape_cli.tracking.config import ProvenanceConfig, TrackerConfig +from fairscape_cli.tracking.metadata_generator import create_metadata_generator + + +@click.command('track') +@click.argument('script-path', type=click.Path(exists=True, path_type=pathlib.Path)) +@click.option('--rocrate-path', type=click.Path(path_type=pathlib.Path), default=None, help='Path to RO-Crate directory (default: current directory)') +@click.option('--author', type=str, default="Unknown", help='Author name (default: from RO-Crate or "Unknown")') +@click.option('--keywords', multiple=True, default=["computation"], help='Keywords for metadata (default: from RO-Crate or ["computation"])') +@click.option('--input', 'manual_inputs', multiple=True, help='Manual input files to track') +@click.option('--no-llm', is_flag=True, default=False, help='Disable LLM-based description generation') +@click.option('--execution-name', type=str, default=None, help='Name for this execution (default: script filename)') +@click.pass_context +def track( + ctx, + script_path: pathlib.Path, + rocrate_path: pathlib.Path, + author: str, + keywords: List[str], + manual_inputs: List[str], + no_llm: bool, + execution_name: str +): + """Track execution of a Python script and generate provenance metadata. + + Executes SCRIPT_PATH while capturing file I/O operations, then generates + RO-Crate metadata documenting the computation, software, input datasets, + and output datasets. 
+
+    Examples:
+
+        fairscape-cli track analysis.py
+
+        fairscape-cli track analysis.py --author "Jane Doe" --keywords ml analysis
+
+        fairscape-cli track analysis.py --rocrate-path ./my-crate --input config.json
+
+        fairscape-cli track analysis.py --no-llm --author "John Smith"
+    """
+
+    rocrate_path = rocrate_path or pathlib.Path.cwd()
+
+    if not script_path.exists():
+        click.echo(f"ERROR: Script file not found: {script_path}", err=True)
+        ctx.exit(code=1)
+
+    try:
+        with script_path.open('r') as f:
+            code = f.read()
+    except Exception as exc:
+        click.echo(f"ERROR: Could not read script file: {exc}", err=True)
+        ctx.exit(code=1)
+
+    tracker_config = TrackerConfig()
+
+    original_cwd = pathlib.Path.cwd()
+    script_dir = script_path.parent.resolve()
+
+    try:
+        os.chdir(script_dir)
+
+        with IOCapture(config=tracker_config) as capture:
+            try:
+                runpy.run_path(str(script_path), run_name='__main__')
+            except SystemExit as e:
+                # sys.exit() with no argument sets e.code to None, which is a clean exit
+                if e.code not in (None, 0):
+                    click.echo(f"WARNING: Script exited with code {e.code}", err=True)
+            except Exception as exc:
+                click.echo(f"ERROR: Script execution failed: {exc}", err=True)
+                ctx.exit(code=1)
+    finally:
+        os.chdir(original_cwd)
+
+    if not capture.inputs and not capture.outputs and not manual_inputs:
+        click.echo("WARNING: No file I/O detected in script execution", err=True)
+        click.echo("No metadata generated.", err=True)
+        return
+
+    use_llm = bool(not no_llm and os.environ.get("GEMINI_API_KEY"))
+
+    metadata_generator = None
+    if use_llm:
+        from datetime import datetime
+        try:
+            metadata_generator = create_metadata_generator(
+                provider="gemini",
+                timestamp=datetime.now().strftime("%Y%m%d_%H%M%S")
+            )
+        except Exception as exc:
+            click.echo(f"WARNING: Could not initialize LLM metadata generator: {exc}", err=True)
+            click.echo("Falling back to simple descriptions", err=True)
+
+    provenance_config = ProvenanceConfig(
+        rocrate_path=rocrate_path,
+        author=author,
+        keywords=list(keywords),
+        manual_inputs=list(manual_inputs),
+        use_llm=use_llm
+    )
+
+    try:
+        tracker = ProvenanceTracker(
+            config=provenance_config,
+            metadata_generator=metadata_generator
+        )
+
+        exec_name = execution_name or script_path.stem
+
+        result = tracker.track_execution(code, capture, execution_name=exec_name)
+
+        click.echo(result.computation_guid)
+
+        if ctx.obj and ctx.obj.get('verbose'):
+            click.echo("\nTracking Summary:", err=True)
+            click.echo(f"  Software: {result.software_guid}", err=True)
+            click.echo(f"  Inputs: {result.input_count} datasets ({result.reused_count} reused)", err=True)
+            click.echo(f"  Outputs: {result.output_count} datasets", err=True)
+
+    except ValueError as exc:
+        click.echo(f"ERROR: {exc}", err=True)
+        ctx.exit(code=1)
+    except RuntimeError as exc:
+        click.echo(f"ERROR: {exc}", err=True)
+        ctx.exit(code=1)
+    except Exception as exc:
+        click.echo(f"ERROR: Tracking failed: {exc}", err=True)
+        ctx.exit(code=1)
\ No newline at end of file
diff --git a/src/fairscape_cli/tracking/provenance_tracker.py b/src/fairscape_cli/tracking/provenance_tracker.py
index 1aaaf3c..f21969a 100644
--- a/src/fairscape_cli/tracking/provenance_tracker.py
+++ b/src/fairscape_cli/tracking/provenance_tracker.py
@@ -49,7 +49,8 @@ def _ensure_crate_exists(self):
             keywords=self.config.keywords,
             datePublished=datetime.now().isoformat(),
             version="1.0",
-            license="https://creativecommons.org/licenses/by/4.0/"
+            license="https://creativecommons.org/licenses/by/4.0/",
+            isPartOf=[]
         )
 

From 0d5fcba1344e9c62c6540c4e64c6cfdeb1834a1c Mon Sep 17 00:00:00 2001
From: jniestroy
Date: Thu, 6 Nov 2025 09:06:30 -0500
Subject: [PATCH 4/6] fix existing dataset lookup

---
 src/fairscape_cli/tracking/provenance_tracker.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/fairscape_cli/tracking/provenance_tracker.py b/src/fairscape_cli/tracking/provenance_tracker.py
index f21969a..9099d36 100644
--- a/src/fairscape_cli/tracking/provenance_tracker.py
+++ b/src/fairscape_cli/tracking/provenance_tracker.py
@@ -114,11 +114,11 @@ def _resolve_inputs(self, io_capture: IOCapture) -> tuple[List[Dataset], int]:
             print(f"Reusing existing dataset: {input_path.name} ({existing_guid})")
 
             existing_dataset = next(
-                (e for e in self.crate_metadata['@graph'] if getattr(e, '@id', None) == existing_guid),
+                (e for e in self.crate_metadata['@graph'] if getattr(e, 'guid', None) == existing_guid),
                 None
             )
             if existing_dataset:
-                dataset_obj = Dataset(**existing_dataset.__dict__)
+                dataset_obj = Dataset.model_validate(existing_dataset)
                 input_datasets.append(dataset_obj)
                 reused_count += 1
                 continue

From 5edd41c111ea18337ebb78f14ef040e43741ac24 Mon Sep 17 00:00:00 2001
From: jniestroy
Date: Thu, 6 Nov 2025 09:13:06 -0500
Subject: [PATCH 5/6] fix overwriting description

---
 .../tracking/provenance_tracker.py | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

diff --git a/src/fairscape_cli/tracking/provenance_tracker.py b/src/fairscape_cli/tracking/provenance_tracker.py
index 9099d36..49cd892 100644
--- a/src/fairscape_cli/tracking/provenance_tracker.py
+++ b/src/fairscape_cli/tracking/provenance_tracker.py
@@ -26,6 +26,7 @@ def __init__(
         self.config = config
         self.metadata_generator = metadata_generator
         self.filepath_to_guid: Dict[str, str] = {}
+        self.existing_guids: Set[str] = set()
         self.crate_metadata = None
 
         self._ensure_crate_exists()
@@ -69,6 +70,10 @@ def _load_crate_context(self):
                     self.config.keywords = root_dataset.keywords
 
             for entity in self.crate_metadata['@graph']:
+                entity_guid = getattr(entity, 'guid', None)
+                if entity_guid:
+                    self.existing_guids.add(entity_guid)
+
                 entity_types = getattr(entity, '@type', [])
                 if isinstance(entity_types, str):
                     entity_types = [entity_types]
@@ -77,7 +82,7 @@ def _load_crate_context(self):
                 if content_url and content_url.startswith('file://'):
                     relative_path = content_url.replace('file:///', '').lstrip('/')
                     filepath_full = (self.config.rocrate_path / relative_path).resolve()
-                    self.filepath_to_guid[str(filepath_full)] = getattr(entity, 'guid')
+                    self.filepath_to_guid[str(filepath_full)] = entity_guid
 
         except Exception as e:
             raise RuntimeError(f"Could not read RO-Crate at {self.config.rocrate_path}: {e}")
@@ -317,15 +322,7 @@ def track_execution(
             output_datasets
         )
 
-        new_datasets = []
-        for ds in input_datasets:
-            content_url = getattr(ds, 'contentUrl', None)
-            if content_url:
-                url = content_url if isinstance(content_url, str) else content_url[0]
-                filepath = url.replace('file://', '').replace('file:', '')
-                full_path = str((self.config.rocrate_path / filepath).resolve())
-                if full_path not in self.filepath_to_guid:
-                    new_datasets.append(ds)
+        new_datasets = [ds for ds in input_datasets if ds.guid not in self.existing_guids]
 
         elements = [software] + new_datasets + output_datasets + [computation]
         AppendCrate(cratePath=self.config.rocrate_path, elements=elements)

From b7f8b3016f8b7325c5316e82c03dd67858393bc5 Mon Sep 17 00:00:00 2001
From: jniestroy
Date: Fri, 7 Nov 2025 07:00:43 -0500
Subject: [PATCH 6/6] final for now

---
 src/fairscape_cli/tracking/metadata_generator.py | 2 +-
 src/fairscape_cli/tracking/provenance_tracker.py | 4 ----
 2 files changed, 1 insertion(+), 5 
deletions(-) diff --git a/src/fairscape_cli/tracking/metadata_generator.py b/src/fairscape_cli/tracking/metadata_generator.py index b44705c..f2bf912 100644 --- a/src/fairscape_cli/tracking/metadata_generator.py +++ b/src/fairscape_cli/tracking/metadata_generator.py @@ -108,7 +108,7 @@ def generate_descriptions( import google.generativeai as genai genai.configure(api_key=self.api_key) - model = genai.GenerativeModel('gemini-2.0-flash-exp') + model = genai.GenerativeModel('gemini-2.5-flash') prompt = self._build_prompt(code, input_samples, output_samples) response = model.generate_content( diff --git a/src/fairscape_cli/tracking/provenance_tracker.py b/src/fairscape_cli/tracking/provenance_tracker.py index 49cd892..f27aa88 100644 --- a/src/fairscape_cli/tracking/provenance_tracker.py +++ b/src/fairscape_cli/tracking/provenance_tracker.py @@ -150,10 +150,6 @@ def _resolve_outputs(self, io_capture: IOCapture) -> List[Dataset]: for output_file in io_capture.outputs: output_path = Path(output_file) - normalized_output = output_path.resolve() - - if str(normalized_output) in self.filepath_to_guid: - print(f"WARNING: Overwriting existing dataset: {output_path.name}") rel_path = output_path.relative_to(self.config.rocrate_path) if output_path.is_relative_to(self.config.rocrate_path) else output_path
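Usage sketch for reviewers: the series adds two entry points, the %%fairscape cell magic and the fairscape-cli track command. The notebook example below is illustrative only; it assumes a running IPython kernel (importing fairscape_cli.jupyter registers the magic, since register_cell_magic runs at import time), an existing measurements.csv next to the notebook, and no GEMINI_API_KEY set, so the fallback descriptions are used. All file and crate names here are made up.

    # Cell 1 -- one-time setup; importing the package registers %%fairscape
    import fairscape_cli.jupyter

    # Cell 2 -- the magic must be the first line of the tracked cell
    %%fairscape track --rocrate-path ./demo-crate --author "Jane Doe" --no-llm
    import pandas as pd
    df = pd.read_csv("measurements.csv")   # recorded by IOCapture as an input
    df.describe().to_csv("summary.csv")    # recorded by IOCapture as an output

The shell equivalent for tracking a whole script is:

    fairscape-cli track analysis.py --rocrate-path ./demo-crate --author "Jane Doe" --no-llm

In both cases the user code runs inside IOCapture, after which ProvenanceTracker appends the Software, Computation, and Dataset entities to ro-crate-metadata.json in the target crate; the CLI additionally prints the new computation GUID on stdout.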