From 117758249cf12dddf40a2be4a9368aee424e42d9 Mon Sep 17 00:00:00 2001
From: jniestroy
Date: Thu, 6 Nov 2025 08:15:33 -0500
Subject: [PATCH 1/6] first draft

---
 pyproject.toml                            |   4 +-
 src/fairscape_cli/jupyter/__init__.py     |   3 +
 src/fairscape_cli/jupyter/magic.py        | 104 ++++
 src/fairscape_cli/tracking/__init__.py    |  35 ++
 src/fairscape_cli/tracking/config.py      |  46 +++
 src/fairscape_cli/tracking/io_capture.py  | 256 +++++++++++++
 .../tracking/metadata_generator.py        | 183 +++++++++
 .../tracking/provenance_tracker.py        | 349 ++++++++++++++++++
 src/fairscape_cli/tracking/utils.py       |  60 +++
 9 files changed, 1038 insertions(+), 2 deletions(-)
 create mode 100644 src/fairscape_cli/jupyter/__init__.py
 create mode 100644 src/fairscape_cli/jupyter/magic.py
 create mode 100644 src/fairscape_cli/tracking/__init__.py
 create mode 100644 src/fairscape_cli/tracking/config.py
 create mode 100644 src/fairscape_cli/tracking/io_capture.py
 create mode 100644 src/fairscape_cli/tracking/metadata_generator.py
 create mode 100644 src/fairscape_cli/tracking/provenance_tracker.py
 create mode 100644 src/fairscape_cli/tracking/utils.py

diff --git a/pyproject.toml b/pyproject.toml
index df4ca59..26b49bb 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "fairscape-cli"
-version = "1.1.7"
+version = "1.1.9"
 description = "A utility for packaging objects and validating metadata for FAIRSCAPE"
 readme = "README.md"
 requires-python = ">=3.8"
@@ -39,7 +39,7 @@ dependencies = [
     "prettytable>=3.9.0",
     "jsonschema>=4.20.0",
     "sqids>=0.4.1",
-    "fairscape-models>=1.0.8",
+    "fairscape-models>=1.0.11",
     "pyyaml",
     "h5py",
     "frictionless>=5.0,<6.0",
diff --git a/src/fairscape_cli/jupyter/__init__.py b/src/fairscape_cli/jupyter/__init__.py
new file mode 100644
index 0000000..79dd58c
--- /dev/null
+++ b/src/fairscape_cli/jupyter/__init__.py
@@ -0,0 +1,3 @@
+from .magic import fairscape
+
+__all__ = ['fairscape']
diff --git a/src/fairscape_cli/jupyter/magic.py b/src/fairscape_cli/jupyter/magic.py
new file mode 100644
index 0000000..a3c123c
--- /dev/null
+++ b/src/fairscape_cli/jupyter/magic.py
@@ -0,0 +1,104 @@
+import os
+import shlex
+import pathlib
+import argparse
+from IPython.core.magic import register_cell_magic
+from IPython import get_ipython
+
+from fairscape_cli.tracking.io_capture import IOCapture
+from fairscape_cli.tracking.provenance_tracker import ProvenanceTracker
+from fairscape_cli.tracking.config import ProvenanceConfig, TrackerConfig
+from fairscape_cli.tracking.metadata_generator import create_metadata_generator
+
+
+def parse_magic_arguments(line: str) -> argparse.Namespace:
+    """Parse arguments from the magic command line."""
+    parser = argparse.ArgumentParser(description='Track Jupyter cell execution')
+    parser.add_argument('command', nargs='?', default=None)
+    parser.add_argument('--rocrate-path', type=str, default=None)
+    parser.add_argument('--author', type=str, default="Unknown")
+    parser.add_argument('--keywords', nargs='+', default=["jupyter", "computation"])
+    parser.add_argument('--input', nargs='+', default=[], dest='manual_inputs')
+    parser.add_argument('--no-llm', action='store_true', help='Disable LLM descriptions')
+
+    # shlex keeps quoted values such as --author "Jane Doe" together as one token
+    args_list = shlex.split(line)
+
+    try:
+        args = parser.parse_args(args_list)
+    except SystemExit:
+        print("Usage: %%fairscape track [--rocrate-path PATH] [--author AUTHOR] [--keywords KW1 KW2] [--input FILE1 FILE2] [--no-llm]")
+        raise
+
+    return args
+
+
+def execute_cell_safely(cell: str) -> bool:
+    """Execute cell and return success status."""
+    ip = get_ipython()
+    result = ip.run_cell(cell)
+
+    if result.error_in_exec:
+        print("ERROR: Cell execution failed")
+        return False
+
+    return True
+
+
+@register_cell_magic
+def fairscape(line, cell):
+    """
+    Jupyter cell magic for tracking computational provenance.
+
+    Usage:
+        %%fairscape track [options]
+        <code>
+
+    Options:
+        --rocrate-path PATH    Path to RO-Crate directory (default: current directory)
+        --author AUTHOR        Author name (default: from RO-Crate or "Unknown")
+        --keywords KW1 KW2     Keywords for metadata (default: from RO-Crate or ["jupyter", "computation"])
+        --input FILE1 FILE2    Manual input files to track
+        --no-llm               Disable LLM-based description generation
+    """
+    args = parse_magic_arguments(line)
+
+    if args.command != 'track':
+        print("Usage: %%fairscape track [options]")
+        return
+
+    rocrate_path = pathlib.Path(args.rocrate_path) if args.rocrate_path else pathlib.Path.cwd()
+
+    tracker_config = TrackerConfig()
+
+    with IOCapture(config=tracker_config) as capture:
+        if not execute_cell_safely(cell):
+            return
+
+    use_llm = bool(not args.no_llm and os.environ.get("GEMINI_API_KEY"))
+
+    metadata_generator = None
+    if use_llm:
+        from datetime import datetime
+        metadata_generator = create_metadata_generator(
+            provider="gemini",
+            timestamp=datetime.now().strftime("%Y%m%d_%H%M%S")
+        )
+
+    provenance_config = ProvenanceConfig(
+        rocrate_path=rocrate_path,
+        author=args.author,
+        keywords=args.keywords,
+        manual_inputs=args.manual_inputs,
+        use_llm=use_llm
+    )
+
+    try:
+        tracker = ProvenanceTracker(
+            config=provenance_config,
+            metadata_generator=metadata_generator
+        )
+
+        result = tracker.track_execution(cell, capture)
+        print(result)
+
+    except Exception as e:
+        print(f"ERROR: Tracking failed: {e}")
+        raise
diff --git a/src/fairscape_cli/tracking/__init__.py b/src/fairscape_cli/tracking/__init__.py
new file mode 100644
index 0000000..a15e09c
--- /dev/null
+++ b/src/fairscape_cli/tracking/__init__.py
@@ -0,0 +1,35 @@
+from .io_capture import IOCapture
+from .provenance_tracker import ProvenanceTracker
+from .metadata_generator import (
+    MetadataGenerator,
+    GeminiMetadataGenerator,
+    FallbackMetadataGenerator,
+    MockMetadataGenerator,
+    create_metadata_generator
+)
+from .config import TrackerConfig, ProvenanceConfig, TrackingResult
+from .utils import (
+    normalize_path,
+    is_trackable_path,
+    read_dataset_sample,
+    collect_dataset_samples,
+    format_samples_for_prompt
+)
+
+__all__ = [
+    'IOCapture',
+    'ProvenanceTracker',
+    'MetadataGenerator',
+    'GeminiMetadataGenerator',
+    'FallbackMetadataGenerator',
+    'MockMetadataGenerator',
+    'create_metadata_generator',
+    'TrackerConfig',
+    'ProvenanceConfig',
+    'TrackingResult',
+    'normalize_path',
+    'is_trackable_path',
+    'read_dataset_sample',
+    'collect_dataset_samples',
+    'format_samples_for_prompt',
+]
\ No newline at end of file
diff --git a/src/fairscape_cli/tracking/config.py b/src/fairscape_cli/tracking/config.py
new file mode 100644
index 0000000..0feb2d4
--- /dev/null
+++ b/src/fairscape_cli/tracking/config.py
@@ -0,0 +1,46 @@
+from dataclasses import dataclass, field
+from typing import List, Optional
+from pathlib import Path
+
+
+@dataclass
+class TrackerConfig:
+    track_builtins: bool = True
+    track_pathlib: bool = True
+    track_pandas: bool = True
+    track_numpy: bool = True
+    excluded_patterns: List[str] = field(default_factory=lambda: [
+        '.matplotlib',
+        '.ipython',
+        '.jupyter',
+        'site-packages',
+        '/tmp/',
+        '__pycache__'
+    ])
+
+
+@dataclass
+class TrackingResult:
+    computation_guid: str
+    software_guid: str
+    input_count: int
+    output_count: int
+    
reused_count: int + new_datasets: int + + def __str__(self): + return ( + f"Tracked computation: {self.computation_guid}\n" + f" Software: {self.software_guid}\n" + f" Inputs: {self.input_count} datasets ({self.reused_count} reused)\n" + f" Outputs: {self.output_count} datasets" + ) + + +@dataclass +class ProvenanceConfig: + rocrate_path: Path + author: str = "Unknown" + keywords: List[str] = field(default_factory=lambda: ["jupyter", "computation"]) + manual_inputs: List[str] = field(default_factory=list) + use_llm: bool = False diff --git a/src/fairscape_cli/tracking/io_capture.py b/src/fairscape_cli/tracking/io_capture.py new file mode 100644 index 0000000..f02c440 --- /dev/null +++ b/src/fairscape_cli/tracking/io_capture.py @@ -0,0 +1,256 @@ +import builtins +import pathlib +from typing import Set, Dict, Any +import pandas as pd +import numpy as np + +from .config import TrackerConfig +from .utils import normalize_path, is_trackable_path + + +class IOCapture: + """Captures file I/O operations during code execution.""" + + def __init__(self, config: TrackerConfig = None): + self.config = config or TrackerConfig() + self.inputs: Set[str] = set() + self.outputs: Set[str] = set() + self.original_functions: Dict[str, Any] = {} + self.captured_variables: Dict[str, Any] = {} + + def _should_track(self, filepath) -> bool: + """Check if filepath should be tracked.""" + return is_trackable_path(filepath, self.config.excluded_patterns) + + def _normalize_path(self, filepath) -> str: + """Normalize filepath to absolute string.""" + return normalize_path(filepath) + + def patch_open(self): + """Patch builtin open function to track file I/O.""" + if not self.config.track_builtins: + return + + original_open = builtins.open + self.original_functions['builtins.open'] = original_open + + capture = self + + def tracked_open(file, mode='r', *args, **kwargs): + if capture._should_track(file): + normalized = capture._normalize_path(file) + if 'r' in mode: + capture.inputs.add(normalized) + if any(m in mode for m in ['w', 'a', 'x']): + capture.outputs.add(normalized) + return original_open(file, mode, *args, **kwargs) + + builtins.open = tracked_open + + def patch_pathlib(self): + """Patch pathlib methods to track file I/O.""" + if not self.config.track_pathlib: + return + + original_path_open = pathlib.Path.open + original_read_text = pathlib.Path.read_text + original_read_bytes = pathlib.Path.read_bytes + original_write_text = pathlib.Path.write_text + original_write_bytes = pathlib.Path.write_bytes + + self.original_functions['pathlib.Path.open'] = original_path_open + self.original_functions['pathlib.Path.read_text'] = original_read_text + self.original_functions['pathlib.Path.read_bytes'] = original_read_bytes + self.original_functions['pathlib.Path.write_text'] = original_write_text + self.original_functions['pathlib.Path.write_bytes'] = original_write_bytes + + capture = self + + def tracked_path_open(self, mode='r', *args, **kwargs): + if capture._should_track(self): + normalized = capture._normalize_path(self) + if 'r' in mode: + capture.inputs.add(normalized) + if any(m in mode for m in ['w', 'a', 'x']): + capture.outputs.add(normalized) + return original_path_open(self, mode, *args, **kwargs) + + def tracked_read_text(self, *args, **kwargs): + if capture._should_track(self): + capture.inputs.add(capture._normalize_path(self)) + return original_read_text(self, *args, **kwargs) + + def tracked_read_bytes(self, *args, **kwargs): + if capture._should_track(self): + 
capture.inputs.add(capture._normalize_path(self)) + return original_read_bytes(self, *args, **kwargs) + + def tracked_write_text(self, *args, **kwargs): + if capture._should_track(self): + capture.outputs.add(capture._normalize_path(self)) + return original_write_text(self, *args, **kwargs) + + def tracked_write_bytes(self, *args, **kwargs): + if capture._should_track(self): + capture.outputs.add(capture._normalize_path(self)) + return original_write_bytes(self, *args, **kwargs) + + pathlib.Path.open = tracked_path_open + pathlib.Path.read_text = tracked_read_text + pathlib.Path.read_bytes = tracked_read_bytes + pathlib.Path.write_text = tracked_write_text + pathlib.Path.write_bytes = tracked_write_bytes + + def patch_pandas(self): + """Patch pandas methods to track file I/O.""" + if not self.config.track_pandas: + return + + original_read_csv = pd.read_csv + original_read_excel = pd.read_excel + original_read_parquet = pd.read_parquet + original_read_json = pd.read_json + original_to_csv = pd.DataFrame.to_csv + original_to_excel = pd.DataFrame.to_excel + original_to_parquet = pd.DataFrame.to_parquet + original_to_json = pd.DataFrame.to_json + + self.original_functions['pd.read_csv'] = original_read_csv + self.original_functions['pd.read_excel'] = original_read_excel + self.original_functions['pd.read_parquet'] = original_read_parquet + self.original_functions['pd.read_json'] = original_read_json + self.original_functions['pd.DataFrame.to_csv'] = original_to_csv + self.original_functions['pd.DataFrame.to_excel'] = original_to_excel + self.original_functions['pd.DataFrame.to_parquet'] = original_to_parquet + self.original_functions['pd.DataFrame.to_json'] = original_to_json + + capture = self + + def tracked_read_csv(filepath_or_buffer, *args, **kwargs): + if capture._should_track(filepath_or_buffer): + capture.inputs.add(capture._normalize_path(filepath_or_buffer)) + return original_read_csv(filepath_or_buffer, *args, **kwargs) + + def tracked_read_excel(io, *args, **kwargs): + if capture._should_track(io): + capture.inputs.add(capture._normalize_path(io)) + return original_read_excel(io, *args, **kwargs) + + def tracked_read_parquet(path, *args, **kwargs): + if capture._should_track(path): + capture.inputs.add(capture._normalize_path(path)) + return original_read_parquet(path, *args, **kwargs) + + def tracked_read_json(path_or_buf, *args, **kwargs): + if capture._should_track(path_or_buf): + capture.inputs.add(capture._normalize_path(path_or_buf)) + return original_read_json(path_or_buf, *args, **kwargs) + + def tracked_to_csv(df_self, path_or_buf=None, *args, **kwargs): + if path_or_buf and capture._should_track(path_or_buf): + capture.outputs.add(capture._normalize_path(path_or_buf)) + return original_to_csv(df_self, path_or_buf, *args, **kwargs) + + def tracked_to_excel(df_self, excel_writer, *args, **kwargs): + if capture._should_track(excel_writer): + capture.outputs.add(capture._normalize_path(excel_writer)) + return original_to_excel(df_self, excel_writer, *args, **kwargs) + + def tracked_to_parquet(df_self, path, *args, **kwargs): + if capture._should_track(path): + capture.outputs.add(capture._normalize_path(path)) + return original_to_parquet(df_self, path, *args, **kwargs) + + def tracked_to_json(df_self, path_or_buf=None, *args, **kwargs): + if path_or_buf and capture._should_track(path_or_buf): + capture.outputs.add(capture._normalize_path(path_or_buf)) + return original_to_json(df_self, path_or_buf, *args, **kwargs) + + pd.read_csv = tracked_read_csv + pd.read_excel = 
tracked_read_excel + pd.read_parquet = tracked_read_parquet + pd.read_json = tracked_read_json + pd.DataFrame.to_csv = tracked_to_csv + pd.DataFrame.to_excel = tracked_to_excel + pd.DataFrame.to_parquet = tracked_to_parquet + pd.DataFrame.to_json = tracked_to_json + + def patch_numpy(self): + """Patch numpy methods to track file I/O.""" + if not self.config.track_numpy: + return + + original_load = np.load + original_save = np.save + original_loadtxt = np.loadtxt + original_savetxt = np.savetxt + + self.original_functions['np.load'] = original_load + self.original_functions['np.save'] = original_save + self.original_functions['np.loadtxt'] = original_loadtxt + self.original_functions['np.savetxt'] = original_savetxt + + capture = self + + def tracked_load(file, *args, **kwargs): + if capture._should_track(file): + capture.inputs.add(capture._normalize_path(file)) + return original_load(file, *args, **kwargs) + + def tracked_save(file, arr, *args, **kwargs): + if capture._should_track(file): + capture.outputs.add(capture._normalize_path(file)) + return original_save(file, arr, *args, **kwargs) + + def tracked_loadtxt(fname, *args, **kwargs): + if capture._should_track(fname): + capture.inputs.add(capture._normalize_path(fname)) + return original_loadtxt(fname, *args, **kwargs) + + def tracked_savetxt(fname, X, *args, **kwargs): + if capture._should_track(fname): + capture.outputs.add(capture._normalize_path(fname)) + return original_savetxt(fname, X, *args, **kwargs) + + np.load = tracked_load + np.save = tracked_save + np.loadtxt = tracked_loadtxt + np.savetxt = tracked_savetxt + + def restore_all(self): + """Restore all original functions.""" + builtins.open = self.original_functions.get('builtins.open', builtins.open) + + if 'pathlib.Path.open' in self.original_functions: + pathlib.Path.open = self.original_functions['pathlib.Path.open'] + pathlib.Path.read_text = self.original_functions['pathlib.Path.read_text'] + pathlib.Path.read_bytes = self.original_functions['pathlib.Path.read_bytes'] + pathlib.Path.write_text = self.original_functions['pathlib.Path.write_text'] + pathlib.Path.write_bytes = self.original_functions['pathlib.Path.write_bytes'] + + if 'pd.read_csv' in self.original_functions: + pd.read_csv = self.original_functions['pd.read_csv'] + pd.read_excel = self.original_functions['pd.read_excel'] + pd.read_parquet = self.original_functions['pd.read_parquet'] + pd.read_json = self.original_functions['pd.read_json'] + pd.DataFrame.to_csv = self.original_functions['pd.DataFrame.to_csv'] + pd.DataFrame.to_excel = self.original_functions['pd.DataFrame.to_excel'] + pd.DataFrame.to_parquet = self.original_functions['pd.DataFrame.to_parquet'] + pd.DataFrame.to_json = self.original_functions['pd.DataFrame.to_json'] + + if 'np.load' in self.original_functions: + np.load = self.original_functions['np.load'] + np.save = self.original_functions['np.save'] + np.loadtxt = self.original_functions['np.loadtxt'] + np.savetxt = self.original_functions['np.savetxt'] + + def __enter__(self): + self.patch_open() + self.patch_pathlib() + self.patch_pandas() + self.patch_numpy() + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.restore_all() + return False diff --git a/src/fairscape_cli/tracking/metadata_generator.py b/src/fairscape_cli/tracking/metadata_generator.py new file mode 100644 index 0000000..b44705c --- /dev/null +++ b/src/fairscape_cli/tracking/metadata_generator.py @@ -0,0 +1,183 @@ +import os +import json +import re +from typing import Dict, Optional, Any +from abc 
import ABC, abstractmethod + + +DESCRIPTION_PROMPT_TEMPLATE = """ROLE: Research data management expert specializing in FAIR metadata + +TASK: Generate concise, technical descriptions for a computational workflow + +INPUT FORMAT: +- Software code that was executed +- Input datasets with samples (first 5 rows) +- Output datasets with samples (first 5 rows) + +OUTPUT FORMAT: JSON object with these keys: +{{ + "software_description": "What this code does technically", + "computation_description": "What this computation accomplishes", + "input_datasets": {{ + "filename": "Description of this input's role and content" + }}, + "output_datasets": {{ + "filename": "Description of this output's content and meaning" + }} +}} + +REQUIREMENTS: +- Software description: 1-2 sentences, focus on operations performed +- Computation description: 1-2 sentences, focus on scientific/analytical goal +- Dataset descriptions: 1 sentence each, describe content type and role in workflow +- Be technical but clear, assume scientific audience +- No markdown formatting, just plain JSON + +SOFTWARE CODE: +```python +{code} +``` + +INPUT DATASETS: +{input_samples} + +OUTPUT DATASETS: +{output_samples} + +Generate the JSON now:""" + + +class MetadataGenerator(ABC): + """Abstract base class for generating metadata descriptions.""" + + @abstractmethod + def generate_descriptions( + self, + code: str, + input_samples: str, + output_samples: str + ) -> Optional[Dict[str, Any]]: + """Generate descriptions for code and datasets.""" + pass + + +class GeminiMetadataGenerator(MetadataGenerator): + """Generate metadata using Google Gemini.""" + + def __init__(self, api_key: Optional[str] = None, temperature: float = 0.2, max_tokens: int = 2048): + self.api_key = api_key or os.environ.get("GEMINI_API_KEY") + self.temperature = temperature + self.max_tokens = max_tokens + + if not self.api_key: + raise ValueError("GEMINI_API_KEY not found in environment or provided") + + def _build_prompt(self, code: str, input_samples: str, output_samples: str) -> str: + """Build the prompt for the LLM.""" + return DESCRIPTION_PROMPT_TEMPLATE.format( + code=code, + input_samples=input_samples, + output_samples=output_samples + ) + + def _parse_llm_response(self, response_text: str) -> Dict[str, Any]: + """Parse JSON from LLM response, handling various formats.""" + response_text = response_text.strip() + + json_match = re.search(r'```json\s*(\{.*?\})\s*```', response_text, re.DOTALL) + if json_match: + response_text = json_match.group(1) + else: + code_match = re.search(r'```\s*(\{.*?\})\s*```', response_text, re.DOTALL) + if code_match: + response_text = code_match.group(1) + elif not (response_text.startswith('{') and response_text.endswith('}')): + json_search = re.search(r'\{.*\}', response_text, re.DOTALL) + if json_search: + response_text = json_search.group(0) + + return json.loads(response_text) + + def generate_descriptions( + self, + code: str, + input_samples: str, + output_samples: str + ) -> Optional[Dict[str, Any]]: + """Generate descriptions using Gemini.""" + import google.generativeai as genai + + genai.configure(api_key=self.api_key) + model = genai.GenerativeModel('gemini-2.0-flash-exp') + + prompt = self._build_prompt(code, input_samples, output_samples) + response = model.generate_content( + prompt, + generation_config=genai.types.GenerationConfig( + temperature=self.temperature, + max_output_tokens=self.max_tokens, + ) + ) + + return self._parse_llm_response(response.text) + + +class FallbackMetadataGenerator(MetadataGenerator): 
+ """Generate simple timestamp-based descriptions without LLM.""" + + def __init__(self, timestamp: str): + self.timestamp = timestamp + + def generate_descriptions( + self, + code: str, + input_samples: str, + output_samples: str + ) -> Dict[str, Any]: + """Generate simple fallback descriptions.""" + return { + "software_description": f"Code executed at {self.timestamp}", + "computation_description": f"Computation executed at {self.timestamp}", + "input_datasets": {}, + "output_datasets": {} + } + + +class MockMetadataGenerator(MetadataGenerator): + """Mock generator for testing.""" + + def __init__(self, mock_response: Dict[str, Any]): + self.mock_response = mock_response + + def generate_descriptions( + self, + code: str, + input_samples: str, + output_samples: str + ) -> Dict[str, Any]: + """Return mock response.""" + return self.mock_response + + +def create_metadata_generator( + provider: str = "gemini", + api_key: Optional[str] = None, + **kwargs +) -> MetadataGenerator: + """Factory function to create appropriate metadata generator.""" + + if provider == "gemini": + try: + gemini_kwargs = {k: v for k, v in kwargs.items() if k in ['temperature', 'max_tokens']} + return GeminiMetadataGenerator(api_key=api_key, **gemini_kwargs) + except ValueError: + print("WARNING: GEMINI_API_KEY not found, using fallback descriptions") + timestamp = kwargs.get('timestamp', 'unknown') + return FallbackMetadataGenerator(timestamp=timestamp) + elif provider == "fallback": + timestamp = kwargs.get('timestamp', 'unknown') + return FallbackMetadataGenerator(timestamp=timestamp) + elif provider == "mock": + return MockMetadataGenerator(kwargs.get('mock_response', {})) + else: + raise ValueError(f"Unknown metadata generator provider: {provider}") \ No newline at end of file diff --git a/src/fairscape_cli/tracking/provenance_tracker.py b/src/fairscape_cli/tracking/provenance_tracker.py new file mode 100644 index 0000000..215ac57 --- /dev/null +++ b/src/fairscape_cli/tracking/provenance_tracker.py @@ -0,0 +1,349 @@ +from pathlib import Path +from datetime import datetime +from typing import List, Dict, Set, Optional + +from fairscape_cli.models.rocrate import ReadROCrateMetadata, AppendCrate +from fairscape_cli.models.dataset import GenerateDataset, Dataset +from fairscape_cli.models.software import GenerateSoftware, Software +from fairscape_cli.models.computation import GenerateComputation, Computation + +from .config import ProvenanceConfig, TrackingResult +from .io_capture import IOCapture +from .metadata_generator import MetadataGenerator, FallbackMetadataGenerator, create_metadata_generator +from .utils import collect_dataset_samples, format_samples_for_prompt + +from fairscape_cli.models.rocrate import GenerateROCrate +from datetime import datetime + + +class ProvenanceTracker: + """Tracks computational provenance and generates RO-Crate metadata.""" + + def __init__( + self, + config: ProvenanceConfig, + metadata_generator: Optional[MetadataGenerator] = None + ): + self.config = config + self.metadata_generator = metadata_generator + self.filepath_to_guid: Dict[str, str] = {} + self.crate_metadata = None + + self._ensure_crate_exists() + self._load_crate_context() + + def _ensure_crate_exists(self): + """Create RO-Crate if it doesn't exist with placeholder metadata.""" + metadata_path = self.config.rocrate_path / 'ro-crate-metadata.json' + + if metadata_path.exists(): + return + + placeholder_name = f"Research Project {datetime.now().strftime('%Y%m%d')}" + placeholder_description = "Automatically 
generated RO-Crate for computational provenance tracking" + + GenerateROCrate( + path=self.config.rocrate_path, + guid=None, + name=placeholder_name, + description=placeholder_description, + author=self.config.author if self.config.author != "Unknown" else "Researcher", + keywords=self.config.keywords, + datePublished=datetime.now().isoformat(), + version="1.0", + license="https://creativecommons.org/licenses/by/4.0/" + ) + + + def _load_crate_context(self): + """Load existing RO-Crate and build filepath to GUID mapping.""" + try: + self.crate_metadata = ReadROCrateMetadata(self.config.rocrate_path) + + root_dataset = self.crate_metadata['@graph'][1] + + if self.config.author == "Unknown": + if hasattr(root_dataset, 'author') and root_dataset.author: + self.config.author = root_dataset.author + + if self.config.keywords == ["jupyter", "computation"]: + if hasattr(root_dataset, 'keywords') and root_dataset.keywords: + self.config.keywords = root_dataset.keywords + + for entity in self.crate_metadata.get('@graph', []): + entity_types = getattr(entity, '@type', []) + if isinstance(entity_types, str): + entity_types = [entity_types] + + content_url = getattr(entity, 'contentUrl', None) + if content_url and content_url.startswith('file://'): + relative_path = content_url.replace('file:///', '').lstrip('/') + filepath_full = (self.config.rocrate_path / relative_path).resolve() + self.filepath_to_guid[str(filepath_full)] = getattr(entity, 'guid') + + print(self.filepath_to_guid) + + except Exception as e: + raise RuntimeError(f"Could not read RO-Crate at {self.config.rocrate_path}: {e}") + + def _resolve_manual_inputs(self) -> Set[str]: + """Convert manual input paths to absolute paths.""" + manual_input_paths = set() + for manual_input in self.config.manual_inputs: + manual_path = Path(manual_input) + if not manual_path.is_absolute(): + manual_path = (self.config.rocrate_path / manual_input).resolve() + else: + manual_path = manual_path.resolve() + manual_input_paths.add(str(manual_path)) + return manual_input_paths + + def _resolve_inputs(self, io_capture: IOCapture) -> tuple[List[Dataset], int]: + """Resolve input datasets, reusing existing ones where possible.""" + all_input_files = set(io_capture.inputs) + all_input_files.update(self._resolve_manual_inputs()) + + input_datasets = [] + reused_count = 0 + + for input_file in all_input_files: + input_path = Path(input_file) + + if not input_path.exists(): + print(f"WARNING: Input file does not exist: {input_file}") + continue + + normalized_path = input_path.resolve() + + if str(normalized_path) in self.filepath_to_guid: + existing_guid = self.filepath_to_guid[str(normalized_path)] + print(f"Reusing existing dataset: {input_path.name} ({existing_guid})") + + existing_dataset = next( + (e for e in self.crate_metadata['@graph'] if e.get('@id') == existing_guid), + None + ) + if existing_dataset: + dataset_obj = Dataset(**existing_dataset) + input_datasets.append(dataset_obj) + reused_count += 1 + continue + + rel_path = input_path.relative_to(self.config.rocrate_path) if input_path.is_relative_to(self.config.rocrate_path) else input_path + + dataset_metadata = GenerateDataset( + name=input_path.name, + author=self.config.author, + version="1.0", + description=f"Input dataset", + keywords=self.config.keywords, + format=input_path.suffix.lstrip('.') or "unknown", + filepath=str(rel_path), + datePublished=datetime.now().isoformat(), + cratePath=self.config.rocrate_path + ) + input_datasets.append(dataset_metadata) + + return input_datasets, 
reused_count + + def _resolve_outputs(self, io_capture: IOCapture) -> List[Dataset]: + """Resolve output datasets.""" + output_datasets = [] + + for output_file in io_capture.outputs: + output_path = Path(output_file) + normalized_output = output_path.resolve() + + if str(normalized_output) in self.filepath_to_guid: + print(f"WARNING: Overwriting existing dataset: {output_path.name}") + + rel_path = output_path.relative_to(self.config.rocrate_path) if output_path.is_relative_to(self.config.rocrate_path) else output_path + + dataset_metadata = GenerateDataset( + name=output_path.name, + author=self.config.author, + version="1.0", + description=f"Output dataset", + keywords=self.config.keywords, + format=output_path.suffix.lstrip('.') or "unknown", + filepath=str(rel_path), + datePublished=datetime.now().isoformat(), + generatedBy=[], + cratePath=self.config.rocrate_path + ) + output_datasets.append(dataset_metadata) + + return output_datasets + + def _enhance_with_llm( + self, + code: str, + input_datasets: List[Dataset], + output_datasets: List[Dataset] + ) -> Optional[Dict]: + """Use LLM to generate better descriptions if available.""" + + if not self.metadata_generator: + return None + + all_input_files = set() + for ds in input_datasets: + if ds.contentUrl: + url = ds.contentUrl if isinstance(ds.contentUrl, str) else ds.contentUrl[0] + filepath = url.replace('file:///', '').replace('file:', '') + full_path = str((self.config.rocrate_path / filepath).resolve()) + all_input_files.add(full_path) + + all_output_files = set() + for ds in output_datasets: + if ds.contentUrl: + url = ds.contentUrl if isinstance(ds.contentUrl, str) else ds.contentUrl[0] + filepath = url.replace('file://', '').replace('file:', '') + full_path = str((self.config.rocrate_path / filepath).resolve()) + all_output_files.add(full_path) + + input_samples = collect_dataset_samples(all_input_files) + output_samples = collect_dataset_samples(all_output_files) + + if not input_samples and not output_samples: + return None + + input_samples_str = format_samples_for_prompt(input_samples) + output_samples_str = format_samples_for_prompt(output_samples) + + return self.metadata_generator.generate_descriptions( + code, + input_samples_str, + output_samples_str + ) + + def _apply_llm_descriptions( + self, + llm_descriptions: Optional[Dict], + software_name: str, + input_datasets: List[Dataset], + output_datasets: List[Dataset] + ) -> tuple[str, str]: + """Apply LLM descriptions to datasets and return software/computation descriptions.""" + software_description = f"Code executed" + computation_description = f"Computation executed" + + if llm_descriptions: + if 'software_description' in llm_descriptions: + software_description = llm_descriptions['software_description'] + + if 'computation_description' in llm_descriptions: + computation_description = llm_descriptions['computation_description'] + + if 'input_datasets' in llm_descriptions: + for ds in input_datasets: + if ds.name in llm_descriptions['input_datasets']: + ds.description = llm_descriptions['input_datasets'][ds.name] + + if 'output_datasets' in llm_descriptions: + for ds in output_datasets: + if ds.name in llm_descriptions['output_datasets']: + ds.description = llm_descriptions['output_datasets'][ds.name] + + return software_description, computation_description + + def _create_software(self, code: str, name: str, description: str) -> Software: + """Create software metadata.""" + software_filepath = f"software/{name}.py" + software_full_path = self.config.rocrate_path / 
software_filepath + software_full_path.parent.mkdir(parents=True, exist_ok=True) + software_full_path.write_text(code) + + return GenerateSoftware( + name=name, + author=self.config.author, + version="1.0", + description=description, + dateModified=datetime.now().isoformat(), + keywords=self.config.keywords, + fileFormat="py", + filepath=software_filepath, + cratePath=self.config.rocrate_path + ) + + def _create_computation( + self, + name: str, + description: str, + software: Software, + input_datasets: List[Dataset], + output_datasets: List[Dataset] + ) -> Computation: + """Create computation metadata.""" + computation = GenerateComputation( + name=name, + runBy=self.config.author, + dateCreated=datetime.now().isoformat(), + description=description, + keywords=self.config.keywords, + usedSoftware=[software.guid], + usedDataset=[ds.guid for ds in input_datasets], + generated=[ds.guid for ds in output_datasets] + ) + + for output_ds in output_datasets: + output_ds.generatedBy = {"@id": computation.guid} + + return computation + + def track_execution( + self, + code: str, + io_capture: IOCapture, + execution_name: Optional[str] = None + ) -> TrackingResult: + """Track a code execution and generate provenance metadata.""" + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + execution_name = execution_name or f"cell_{timestamp}" + + if not io_capture.inputs and not io_capture.outputs and not self.config.manual_inputs: + print("No file I/O detected in execution") + raise ValueError("No file I/O detected") + + input_datasets, reused_count = self._resolve_inputs(io_capture) + output_datasets = self._resolve_outputs(io_capture) + + llm_descriptions = self._enhance_with_llm(code, input_datasets, output_datasets) + + software_description, computation_description = self._apply_llm_descriptions( + llm_descriptions, + execution_name, + input_datasets, + output_datasets + ) + + software = self._create_software(code, execution_name, software_description) + + computation = self._create_computation( + f"Computation_{execution_name}", + computation_description, + software, + input_datasets, + output_datasets + ) + + new_datasets = [] + for ds in input_datasets: + if ds.contentUrl: + url = ds.contentUrl if isinstance(ds.contentUrl, str) else ds.contentUrl[0] + filepath = url.replace('file://', '').replace('file:', '') + full_path = str((self.config.rocrate_path / filepath).resolve()) + if full_path not in self.filepath_to_guid: + new_datasets.append(ds) + + elements = [software] + new_datasets + output_datasets + [computation] + AppendCrate(cratePath=self.config.rocrate_path, elements=elements) + + return TrackingResult( + computation_guid=computation.guid, + software_guid=software.guid, + input_count=len(input_datasets), + output_count=len(output_datasets), + reused_count=reused_count, + new_datasets=len(new_datasets) + ) diff --git a/src/fairscape_cli/tracking/utils.py b/src/fairscape_cli/tracking/utils.py new file mode 100644 index 0000000..f828061 --- /dev/null +++ b/src/fairscape_cli/tracking/utils.py @@ -0,0 +1,60 @@ +from pathlib import Path +from typing import Optional, Dict +import pandas as pd + + +def normalize_path(filepath) -> str: + """Normalize a filepath to absolute string representation.""" + if isinstance(filepath, (str, Path)): + return str(Path(filepath).resolve()) + return str(filepath) + + +def is_trackable_path(filepath: str, excluded_patterns: list) -> bool: + """Check if a filepath should be tracked based on exclusion patterns.""" + if not filepath: + return False + filepath_str 
= str(filepath) + return not any(pattern in filepath_str.lower() for pattern in excluded_patterns) + + +def read_dataset_sample(filepath: str, n_rows: int = 5) -> Optional[str]: + """Read first n rows from a dataset file as a string sample.""" + try: + path = Path(filepath) + if not path.exists(): + return None + + suffix = path.suffix.lower() + df = None + + if suffix == '.csv': + df = pd.read_csv(filepath) + elif suffix == '.parquet': + df = pd.read_parquet(filepath) + elif suffix in ['.xlsx', '.xls']: + df = pd.read_excel(filepath) + + if df is not None: + return df.head(n_rows).to_string() + + return None + except Exception: + return None + + +def collect_dataset_samples(filepaths: set, n_rows: int = 5) -> Dict[str, str]: + """Collect samples from multiple dataset files.""" + samples = {} + for filepath in filepaths: + sample = read_dataset_sample(filepath, n_rows) + if sample: + samples[Path(filepath).name] = sample + return samples + + +def format_samples_for_prompt(samples_dict: Dict[str, str]) -> str: + """Format dataset samples into a string for LLM prompts.""" + if not samples_dict: + return "None" + return "\n\n".join([f"File: {name}\n{sample}" for name, sample in samples_dict.items()]) From 9e099b0f45939ae0ce34f47d37b15ce423536875 Mon Sep 17 00:00:00 2001 From: jniestroy Date: Thu, 6 Nov 2025 08:17:32 -0500 Subject: [PATCH 2/6] fix pydantic not dict issues --- .../tracking/provenance_tracker.py | 36 +++++++------------ 1 file changed, 13 insertions(+), 23 deletions(-) diff --git a/src/fairscape_cli/tracking/provenance_tracker.py b/src/fairscape_cli/tracking/provenance_tracker.py index 215ac57..1aaaf3c 100644 --- a/src/fairscape_cli/tracking/provenance_tracker.py +++ b/src/fairscape_cli/tracking/provenance_tracker.py @@ -17,7 +17,6 @@ class ProvenanceTracker: - """Tracks computational provenance and generates RO-Crate metadata.""" def __init__( self, @@ -33,7 +32,6 @@ def __init__( self._load_crate_context() def _ensure_crate_exists(self): - """Create RO-Crate if it doesn't exist with placeholder metadata.""" metadata_path = self.config.rocrate_path / 'ro-crate-metadata.json' if metadata_path.exists(): @@ -56,7 +54,6 @@ def _ensure_crate_exists(self): def _load_crate_context(self): - """Load existing RO-Crate and build filepath to GUID mapping.""" try: self.crate_metadata = ReadROCrateMetadata(self.config.rocrate_path) @@ -70,7 +67,7 @@ def _load_crate_context(self): if hasattr(root_dataset, 'keywords') and root_dataset.keywords: self.config.keywords = root_dataset.keywords - for entity in self.crate_metadata.get('@graph', []): + for entity in self.crate_metadata['@graph']: entity_types = getattr(entity, '@type', []) if isinstance(entity_types, str): entity_types = [entity_types] @@ -80,14 +77,11 @@ def _load_crate_context(self): relative_path = content_url.replace('file:///', '').lstrip('/') filepath_full = (self.config.rocrate_path / relative_path).resolve() self.filepath_to_guid[str(filepath_full)] = getattr(entity, 'guid') - - print(self.filepath_to_guid) except Exception as e: raise RuntimeError(f"Could not read RO-Crate at {self.config.rocrate_path}: {e}") def _resolve_manual_inputs(self) -> Set[str]: - """Convert manual input paths to absolute paths.""" manual_input_paths = set() for manual_input in self.config.manual_inputs: manual_path = Path(manual_input) @@ -99,7 +93,6 @@ def _resolve_manual_inputs(self) -> Set[str]: return manual_input_paths def _resolve_inputs(self, io_capture: IOCapture) -> tuple[List[Dataset], int]: - """Resolve input datasets, reusing existing 
ones where possible.""" all_input_files = set(io_capture.inputs) all_input_files.update(self._resolve_manual_inputs()) @@ -120,11 +113,11 @@ def _resolve_inputs(self, io_capture: IOCapture) -> tuple[List[Dataset], int]: print(f"Reusing existing dataset: {input_path.name} ({existing_guid})") existing_dataset = next( - (e for e in self.crate_metadata['@graph'] if e.get('@id') == existing_guid), + (e for e in self.crate_metadata['@graph'] if getattr(e, '@id', None) == existing_guid), None ) if existing_dataset: - dataset_obj = Dataset(**existing_dataset) + dataset_obj = Dataset(**existing_dataset.__dict__) input_datasets.append(dataset_obj) reused_count += 1 continue @@ -147,7 +140,6 @@ def _resolve_inputs(self, io_capture: IOCapture) -> tuple[List[Dataset], int]: return input_datasets, reused_count def _resolve_outputs(self, io_capture: IOCapture) -> List[Dataset]: - """Resolve output datasets.""" output_datasets = [] for output_file in io_capture.outputs: @@ -181,23 +173,24 @@ def _enhance_with_llm( input_datasets: List[Dataset], output_datasets: List[Dataset] ) -> Optional[Dict]: - """Use LLM to generate better descriptions if available.""" if not self.metadata_generator: return None all_input_files = set() for ds in input_datasets: - if ds.contentUrl: - url = ds.contentUrl if isinstance(ds.contentUrl, str) else ds.contentUrl[0] + content_url = getattr(ds, 'contentUrl', None) + if content_url: + url = content_url if isinstance(content_url, str) else content_url[0] filepath = url.replace('file:///', '').replace('file:', '') full_path = str((self.config.rocrate_path / filepath).resolve()) all_input_files.add(full_path) all_output_files = set() for ds in output_datasets: - if ds.contentUrl: - url = ds.contentUrl if isinstance(ds.contentUrl, str) else ds.contentUrl[0] + content_url = getattr(ds, 'contentUrl', None) + if content_url: + url = content_url if isinstance(content_url, str) else content_url[0] filepath = url.replace('file://', '').replace('file:', '') full_path = str((self.config.rocrate_path / filepath).resolve()) all_output_files.add(full_path) @@ -224,7 +217,6 @@ def _apply_llm_descriptions( input_datasets: List[Dataset], output_datasets: List[Dataset] ) -> tuple[str, str]: - """Apply LLM descriptions to datasets and return software/computation descriptions.""" software_description = f"Code executed" computation_description = f"Computation executed" @@ -248,7 +240,6 @@ def _apply_llm_descriptions( return software_description, computation_description def _create_software(self, code: str, name: str, description: str) -> Software: - """Create software metadata.""" software_filepath = f"software/{name}.py" software_full_path = self.config.rocrate_path / software_filepath software_full_path.parent.mkdir(parents=True, exist_ok=True) @@ -274,7 +265,6 @@ def _create_computation( input_datasets: List[Dataset], output_datasets: List[Dataset] ) -> Computation: - """Create computation metadata.""" computation = GenerateComputation( name=name, runBy=self.config.author, @@ -297,7 +287,6 @@ def track_execution( io_capture: IOCapture, execution_name: Optional[str] = None ) -> TrackingResult: - """Track a code execution and generate provenance metadata.""" timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") execution_name = execution_name or f"cell_{timestamp}" @@ -329,8 +318,9 @@ def track_execution( new_datasets = [] for ds in input_datasets: - if ds.contentUrl: - url = ds.contentUrl if isinstance(ds.contentUrl, str) else ds.contentUrl[0] + content_url = getattr(ds, 'contentUrl', None) + if 
content_url: + url = content_url if isinstance(content_url, str) else content_url[0] filepath = url.replace('file://', '').replace('file:', '') full_path = str((self.config.rocrate_path / filepath).resolve()) if full_path not in self.filepath_to_guid: @@ -346,4 +336,4 @@ def track_execution( output_count=len(output_datasets), reused_count=reused_count, new_datasets=len(new_datasets) - ) + ) \ No newline at end of file From 9c4764a938fb25856c548728972d810d84af41fe Mon Sep 17 00:00:00 2001 From: jniestroy Date: Thu, 6 Nov 2025 08:26:07 -0500 Subject: [PATCH 3/6] track cli command --- src/fairscape_cli/__main__.py | 2 + src/fairscape_cli/commands/track.py | 135 ++++++++++++++++++ .../tracking/provenance_tracker.py | 3 +- 3 files changed, 139 insertions(+), 1 deletion(-) create mode 100644 src/fairscape_cli/commands/track.py diff --git a/src/fairscape_cli/__main__.py b/src/fairscape_cli/__main__.py index cc7b3dc..142bd5c 100644 --- a/src/fairscape_cli/__main__.py +++ b/src/fairscape_cli/__main__.py @@ -7,6 +7,7 @@ from fairscape_cli.commands.publish_commands import publish_group from fairscape_cli.commands.schema_commands import schema from fairscape_cli.commands.augment_commands import augment_group +from fairscape_cli.commands.track import track @click.group(invoke_without_command=True) @click.pass_context @@ -25,6 +26,7 @@ def cli(ctx): cli.add_command(publish_group, name='publish') cli.add_command(schema, name='schema') cli.add_command(augment_group, name='augment') +cli.add_command(track, name='track') if __name__ == "__main__": cli() \ No newline at end of file diff --git a/src/fairscape_cli/commands/track.py b/src/fairscape_cli/commands/track.py new file mode 100644 index 0000000..ddafd20 --- /dev/null +++ b/src/fairscape_cli/commands/track.py @@ -0,0 +1,135 @@ +import click +import pathlib +import os +import runpy +from typing import List + +from fairscape_cli.tracking.io_capture import IOCapture +from fairscape_cli.tracking.provenance_tracker import ProvenanceTracker +from fairscape_cli.tracking.config import ProvenanceConfig, TrackerConfig +from fairscape_cli.tracking.metadata_generator import create_metadata_generator + + +@click.command('track') +@click.argument('script-path', type=click.Path(exists=True, path_type=pathlib.Path)) +@click.option('--rocrate-path', type=click.Path(path_type=pathlib.Path), default=None, help='Path to RO-Crate directory (default: current directory)') +@click.option('--author', type=str, default="Unknown", help='Author name (default: from RO-Crate or "Unknown")') +@click.option('--keywords', multiple=True, default=["computation"], help='Keywords for metadata (default: from RO-Crate or ["computation"])') +@click.option('--input', 'manual_inputs', multiple=True, help='Manual input files to track') +@click.option('--no-llm', is_flag=True, default=False, help='Disable LLM-based description generation') +@click.option('--execution-name', type=str, default=None, help='Name for this execution (default: script filename)') +@click.pass_context +def track( + ctx, + script_path: pathlib.Path, + rocrate_path: pathlib.Path, + author: str, + keywords: List[str], + manual_inputs: List[str], + no_llm: bool, + execution_name: str +): + """Track execution of a Python script and generate provenance metadata. + + Executes SCRIPT_PATH while capturing file I/O operations, then generates + RO-Crate metadata documenting the computation, software, input datasets, + and output datasets. 
+
+    Examples:
+
+        fairscape-cli track analysis.py
+
+        fairscape-cli track analysis.py --author "Jane Doe" --keywords ml analysis
+
+        fairscape-cli track analysis.py --rocrate-path ./my-crate --input config.json
+
+        fairscape-cli track analysis.py --no-llm --author "John Smith"
+    """
+
+    rocrate_path = rocrate_path or pathlib.Path.cwd()
+
+    if not script_path.exists():
+        click.echo(f"ERROR: Script file not found: {script_path}", err=True)
+        ctx.exit(code=1)
+
+    try:
+        with script_path.open('r') as f:
+            code = f.read()
+    except Exception as exc:
+        click.echo(f"ERROR: Could not read script file: {exc}", err=True)
+        ctx.exit(code=1)
+
+    tracker_config = TrackerConfig()
+
+    original_cwd = pathlib.Path.cwd()
+    script_dir = script_path.parent.resolve()
+
+    try:
+        os.chdir(script_dir)
+
+        with IOCapture(config=tracker_config) as capture:
+            try:
+                runpy.run_path(str(script_path), run_name='__main__')
+            except SystemExit as e:
+                # sys.exit() with no argument sets e.code to None, which is a clean exit
+                if e.code not in (None, 0):
+                    click.echo(f"WARNING: Script exited with code {e.code}", err=True)
+            except Exception as exc:
+                click.echo(f"ERROR: Script execution failed: {exc}", err=True)
+                ctx.exit(code=1)
+    finally:
+        os.chdir(original_cwd)
+
+    if not capture.inputs and not capture.outputs and not manual_inputs:
+        click.echo("WARNING: No file I/O detected in script execution", err=True)
+        click.echo("No metadata generated.", err=True)
+        return
+
+    use_llm = bool(not no_llm and os.environ.get("GEMINI_API_KEY"))
+
+    metadata_generator = None
+    if use_llm:
+        from datetime import datetime
+        try:
+            metadata_generator = create_metadata_generator(
+                provider="gemini",
+                timestamp=datetime.now().strftime("%Y%m%d_%H%M%S")
+            )
+        except Exception as exc:
+            click.echo(f"WARNING: Could not initialize LLM metadata generator: {exc}", err=True)
+            click.echo("Falling back to simple descriptions", err=True)
+
+    provenance_config = ProvenanceConfig(
+        rocrate_path=rocrate_path,
+        author=author,
+        keywords=list(keywords),
+        manual_inputs=list(manual_inputs),
+        use_llm=use_llm
+    )
+
+    try:
+        tracker = ProvenanceTracker(
+            config=provenance_config,
+            metadata_generator=metadata_generator
+        )
+
+        exec_name = execution_name or script_path.stem
+
+        result = tracker.track_execution(code, capture, execution_name=exec_name)
+
+        click.echo(result.computation_guid)
+
+        if ctx.obj and ctx.obj.get('verbose'):
+            click.echo("\nTracking Summary:", err=True)
+            click.echo(f"  Software: {result.software_guid}", err=True)
+            click.echo(f"  Inputs: {result.input_count} datasets ({result.reused_count} reused)", err=True)
+            click.echo(f"  Outputs: {result.output_count} datasets", err=True)
+
+    except ValueError as exc:
+        click.echo(f"ERROR: {exc}", err=True)
+        ctx.exit(code=1)
+    except RuntimeError as exc:
+        click.echo(f"ERROR: {exc}", err=True)
+        ctx.exit(code=1)
+    except Exception as exc:
+        click.echo(f"ERROR: Tracking failed: {exc}", err=True)
+        ctx.exit(code=1)
\ No newline at end of file
diff --git a/src/fairscape_cli/tracking/provenance_tracker.py b/src/fairscape_cli/tracking/provenance_tracker.py
index 1aaaf3c..f21969a 100644
--- a/src/fairscape_cli/tracking/provenance_tracker.py
+++ b/src/fairscape_cli/tracking/provenance_tracker.py
@@ -49,7 +49,8 @@ def _ensure_crate_exists(self):
             keywords=self.config.keywords,
             datePublished=datetime.now().isoformat(),
             version="1.0",
-            license="https://creativecommons.org/licenses/by/4.0/"
+            license="https://creativecommons.org/licenses/by/4.0/",
+            isPartOf=[]
         )
 

From 0d5fcba1344e9c62c6540c4e64c6cfdeb1834a1c Mon Sep 17 00:00:00 2001
From: jniestroy
Date: Thu, 6 Nov 2025 09:06:30 -0500
Subject: [PATCH 4/6] fix existing dataset lookup

---
 src/fairscape_cli/tracking/provenance_tracker.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/fairscape_cli/tracking/provenance_tracker.py b/src/fairscape_cli/tracking/provenance_tracker.py
index f21969a..9099d36 100644
--- a/src/fairscape_cli/tracking/provenance_tracker.py
+++ b/src/fairscape_cli/tracking/provenance_tracker.py
@@ -114,11 +114,11 @@ def _resolve_inputs(self, io_capture: IOCapture) -> tuple[List[Dataset], int]:
             print(f"Reusing existing dataset: {input_path.name} ({existing_guid})")
 
             existing_dataset = next(
-                (e for e in self.crate_metadata['@graph'] if getattr(e, '@id', None) == existing_guid),
+                (e for e in self.crate_metadata['@graph'] if getattr(e, 'guid', None) == existing_guid),
                 None
             )
             if existing_dataset:
-                dataset_obj = Dataset(**existing_dataset.__dict__)
+                dataset_obj = Dataset.model_validate(existing_dataset)
                 input_datasets.append(dataset_obj)
                 reused_count += 1
                 continue

From 5edd41c111ea18337ebb78f14ef040e43741ac24 Mon Sep 17 00:00:00 2001
From: jniestroy
Date: Thu, 6 Nov 2025 09:13:06 -0500
Subject: [PATCH 5/6] fix overwriting description

---
 .../tracking/provenance_tracker.py | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

diff --git a/src/fairscape_cli/tracking/provenance_tracker.py b/src/fairscape_cli/tracking/provenance_tracker.py
index 9099d36..49cd892 100644
--- a/src/fairscape_cli/tracking/provenance_tracker.py
+++ b/src/fairscape_cli/tracking/provenance_tracker.py
@@ -26,6 +26,7 @@ def __init__(
         self.config = config
         self.metadata_generator = metadata_generator
         self.filepath_to_guid: Dict[str, str] = {}
+        self.existing_guids: Set[str] = set()
         self.crate_metadata = None
 
         self._ensure_crate_exists()
@@ -69,6 +70,10 @@ def _load_crate_context(self):
                     self.config.keywords = root_dataset.keywords
 
             for entity in self.crate_metadata['@graph']:
+                entity_guid = getattr(entity, 'guid', None)
+                if entity_guid:
+                    self.existing_guids.add(entity_guid)
+
                 entity_types = getattr(entity, '@type', [])
                 if isinstance(entity_types, str):
                     entity_types = [entity_types]
@@ -77,7 +82,7 @@ def _load_crate_context(self):
                 if content_url and content_url.startswith('file://'):
                     relative_path = content_url.replace('file:///', '').lstrip('/')
                     filepath_full = (self.config.rocrate_path / relative_path).resolve()
-                    self.filepath_to_guid[str(filepath_full)] = getattr(entity, 'guid')
+                    self.filepath_to_guid[str(filepath_full)] = entity_guid
 
         except Exception as e:
             raise RuntimeError(f"Could not read RO-Crate at {self.config.rocrate_path}: {e}")
@@ -317,15 +322,7 @@ def track_execution(
             output_datasets
         )
 
-        new_datasets = []
-        for ds in input_datasets:
-            content_url = getattr(ds, 'contentUrl', None)
-            if content_url:
-                url = content_url if isinstance(content_url, str) else content_url[0]
-                filepath = url.replace('file://', '').replace('file:', '')
-                full_path = str((self.config.rocrate_path / filepath).resolve())
-                if full_path not in self.filepath_to_guid:
-                    new_datasets.append(ds)
+        new_datasets = [ds for ds in input_datasets if ds.guid not in self.existing_guids]
 
         elements = [software] + new_datasets + output_datasets + [computation]
         AppendCrate(cratePath=self.config.rocrate_path, elements=elements)

From b7f8b3016f8b7325c5316e82c03dd67858393bc5 Mon Sep 17 00:00:00 2001
From: jniestroy
Date: Fri, 7 Nov 2025 07:00:43 -0500
Subject: [PATCH 6/6] final for now

---
 src/fairscape_cli/tracking/metadata_generator.py | 2 +-
 src/fairscape_cli/tracking/provenance_tracker.py | 4 ----
 2 files changed, 1 insertion(+), 5 
deletions(-) diff --git a/src/fairscape_cli/tracking/metadata_generator.py b/src/fairscape_cli/tracking/metadata_generator.py index b44705c..f2bf912 100644 --- a/src/fairscape_cli/tracking/metadata_generator.py +++ b/src/fairscape_cli/tracking/metadata_generator.py @@ -108,7 +108,7 @@ def generate_descriptions( import google.generativeai as genai genai.configure(api_key=self.api_key) - model = genai.GenerativeModel('gemini-2.0-flash-exp') + model = genai.GenerativeModel('gemini-2.5-flash') prompt = self._build_prompt(code, input_samples, output_samples) response = model.generate_content( diff --git a/src/fairscape_cli/tracking/provenance_tracker.py b/src/fairscape_cli/tracking/provenance_tracker.py index 49cd892..f27aa88 100644 --- a/src/fairscape_cli/tracking/provenance_tracker.py +++ b/src/fairscape_cli/tracking/provenance_tracker.py @@ -150,10 +150,6 @@ def _resolve_outputs(self, io_capture: IOCapture) -> List[Dataset]: for output_file in io_capture.outputs: output_path = Path(output_file) - normalized_output = output_path.resolve() - - if str(normalized_output) in self.filepath_to_guid: - print(f"WARNING: Overwriting existing dataset: {output_path.name}") rel_path = output_path.relative_to(self.config.rocrate_path) if output_path.is_relative_to(self.config.rocrate_path) else output_path
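Usage sketch for reviewers: the series adds two entry points, the %%fairscape cell magic and the fairscape-cli track command. The notebook example below is illustrative only; it assumes a running IPython kernel (importing fairscape_cli.jupyter registers the magic, since register_cell_magic runs at import time), an existing measurements.csv next to the notebook, and no GEMINI_API_KEY set, so the fallback descriptions are used. All file and crate names here are made up.

    # Cell 1 -- one-time setup; importing the package registers %%fairscape
    import fairscape_cli.jupyter

    # Cell 2 -- the magic must be the first line of the tracked cell
    %%fairscape track --rocrate-path ./demo-crate --author "Jane Doe" --no-llm
    import pandas as pd
    df = pd.read_csv("measurements.csv")   # recorded by IOCapture as an input
    df.describe().to_csv("summary.csv")    # recorded by IOCapture as an output

The shell equivalent for tracking a whole script is:

    fairscape-cli track analysis.py --rocrate-path ./demo-crate --author "Jane Doe" --no-llm

In both cases the user code runs inside IOCapture, after which ProvenanceTracker appends the Software, Computation, and Dataset entities to ro-crate-metadata.json in the target crate; the CLI additionally prints the new computation GUID on stdout.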