Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "fairscape-cli"
version = "1.1.7"
version = "1.1.9"
description = "A utility for packaging objects and validating metadata for FAIRSCAPE"
readme = "README.md"
requires-python = ">=3.8"
Expand Down Expand Up @@ -39,7 +39,7 @@ dependencies = [
"prettytable>=3.9.0",
"jsonschema>=4.20.0",
"sqids>=0.4.1",
"fairscape-models>=1.0.8",
"fairscape-models>=1.0.11",
"pyyaml",
"h5py",
"frictionless>=5.0,<6.0",
Expand Down
2 changes: 2 additions & 0 deletions src/fairscape_cli/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from fairscape_cli.commands.publish_commands import publish_group
from fairscape_cli.commands.schema_commands import schema
from fairscape_cli.commands.augment_commands import augment_group
from fairscape_cli.commands.track import track

@click.group(invoke_without_command=True)
@click.pass_context
Expand All @@ -25,6 +26,7 @@ def cli(ctx):
cli.add_command(publish_group, name='publish')
cli.add_command(schema, name='schema')
cli.add_command(augment_group, name='augment')
cli.add_command(track, name='track')

if __name__ == "__main__":
cli()
135 changes: 135 additions & 0 deletions src/fairscape_cli/commands/track.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
import click
import pathlib
import os
import runpy
from typing import List

from fairscape_cli.tracking.io_capture import IOCapture
from fairscape_cli.tracking.provenance_tracker import ProvenanceTracker
from fairscape_cli.tracking.config import ProvenanceConfig, TrackerConfig
from fairscape_cli.tracking.metadata_generator import create_metadata_generator


@click.command('track')
@click.argument('script-path', type=click.Path(exists=True, path_type=pathlib.Path))
@click.option('--rocrate-path', type=click.Path(path_type=pathlib.Path), default=None, help='Path to RO-Crate directory (default: current directory)')
@click.option('--author', type=str, default="Unknown", help='Author name (default: from RO-Crate or "Unknown")')
@click.option('--keywords', multiple=True, default=["computation"], help='Keywords for metadata (default: from RO-Crate or ["computation"])')
@click.option('--input', 'manual_inputs', multiple=True, help='Manual input files to track')
@click.option('--no-llm', is_flag=True, default=False, help='Disable LLM-based description generation')
@click.option('--execution-name', type=str, default=None, help='Name for this execution (default: script filename)')
@click.pass_context
def track(
    ctx,
    script_path: pathlib.Path,
    rocrate_path: pathlib.Path,
    author: str,
    keywords: List[str],
    manual_inputs: List[str],
    no_llm: bool,
    execution_name: str
):
    """Track execution of a Python script and generate provenance metadata.

    Executes SCRIPT_PATH while capturing file I/O operations, then generates
    RO-Crate metadata documenting the computation, software, input datasets,
    and output datasets.

    Examples:

        fairscape-cli track analysis.py

        fairscape-cli track analysis.py --author "Jane Doe" --keywords ml --keywords analysis

        fairscape-cli track analysis.py --rocrate-path ./my-crate --input config.json

        fairscape-cli track analysis.py --no-llm --author "John Smith"
    """
    # NOTE: the docstring above doubles as the `--help` text, so parameter
    # documentation lives on the click decorators rather than in the docstring.
    # On success the computation GUID is written to stdout; every diagnostic
    # goes to stderr. Failures terminate the command with exit code 1.

    rocrate_path = rocrate_path or pathlib.Path.cwd()

    # click.Path(exists=True) already enforces this for CLI use; the check is
    # kept as a guard for callers that invoke the callback directly.
    if not script_path.exists():
        click.echo(f"ERROR: Script file not found: {script_path}", err=True)
        ctx.exit(code=1)

    # The script source is handed to the tracker later, so read it up front.
    try:
        with script_path.open('r') as f:
            code = f.read()
    except Exception as exc:
        click.echo(f"ERROR: Could not read script file: {exc}", err=True)
        ctx.exit(code=1)

    tracker_config = TrackerConfig()

    original_cwd = pathlib.Path.cwd()
    script_dir = script_path.parent.resolve()
    # Build the absolute script location *before* changing directory. A
    # relative path such as "scripts/run.py" would otherwise be resolved
    # against script_dir after the chdir and not be found.
    script_file = script_dir / script_path.name

    try:
        # Run the script from its own directory so its relative paths resolve
        # as they would when it is launched directly from there.
        os.chdir(script_dir)

        with IOCapture(config=tracker_config) as capture:
            try:
                runpy.run_path(str(script_file), run_name='__main__')
            except SystemExit as e:
                # sys.exit() and sys.exit(None) both signal success; only warn
                # for a genuine non-zero / message exit status.
                if e.code not in (None, 0):
                    click.echo(f"WARNING: Script exited with code {e.code}", err=True)
            except Exception as exc:
                click.echo(f"ERROR: Script execution failed: {exc}", err=True)
                ctx.exit(code=1)
    finally:
        # Always restore the caller's working directory, even on failure.
        os.chdir(original_cwd)

    if not capture.inputs and not capture.outputs and not manual_inputs:
        click.echo("WARNING: No file I/O detected in script execution", err=True)
        click.echo("No metadata generated.", err=True)
        return

    # Coerce to a real bool: the bare `and` expression would otherwise carry
    # the API key string itself into ProvenanceConfig.use_llm.
    use_llm = (not no_llm) and bool(os.environ.get("GEMINI_API_KEY"))

    metadata_generator = None
    if use_llm:
        from datetime import datetime
        try:
            metadata_generator = create_metadata_generator(
                provider="gemini",
                timestamp=datetime.now().strftime("%Y%m%d_%H%M%S")
            )
        except Exception as exc:
            # NOTE(review): use_llm stays True here while no generator is
            # passed on; confirm ProvenanceTracker tolerates that combination.
            click.echo(f"WARNING: Could not initialize LLM metadata generator: {exc}", err=True)
            click.echo("Falling back to simple descriptions", err=True)

    provenance_config = ProvenanceConfig(
        rocrate_path=rocrate_path,
        author=author,
        keywords=list(keywords),
        manual_inputs=list(manual_inputs),
        use_llm=use_llm
    )

    try:
        tracker = ProvenanceTracker(
            config=provenance_config,
            metadata_generator=metadata_generator
        )

        exec_name = execution_name or script_path.stem

        result = tracker.track_execution(code, capture, execution_name=exec_name)

        # stdout carries only the GUID so the command can be used in pipelines.
        click.echo(result.computation_guid)

        if ctx.obj and ctx.obj.get('verbose'):
            click.echo("\nTracking Summary:", err=True)
            click.echo(f" Software: {result.software_guid}", err=True)
            click.echo(f" Inputs: {result.input_count} datasets ({result.reused_count} reused)", err=True)
            click.echo(f" Outputs: {result.output_count} datasets", err=True)

    except (ValueError, RuntimeError) as exc:
        # These are reported verbatim, without the "Tracking failed" prefix.
        click.echo(f"ERROR: {exc}", err=True)
        ctx.exit(code=1)
    except Exception as exc:
        click.echo(f"ERROR: Tracking failed: {exc}", err=True)
        ctx.exit(code=1)
3 changes: 3 additions & 0 deletions src/fairscape_cli/jupyter/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from .magic import fairscape

__all__ = ['fairscape']
104 changes: 104 additions & 0 deletions src/fairscape_cli/jupyter/magic.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
import os
import pathlib
import argparse
from IPython.core.magic import register_cell_magic
from IPython import get_ipython

from fairscape_cli.tracking.io_capture import IOCapture
from fairscape_cli.tracking.provenance_tracker import ProvenanceTracker
from fairscape_cli.tracking.config import ProvenanceConfig, TrackerConfig
from fairscape_cli.tracking.metadata_generator import create_metadata_generator


def parse_magic_arguments(line: str) -> argparse.Namespace:
    """Parse the argument string that follows ``%%fairscape`` on the magic line.

    The line is tokenised with shell-style quoting, so a quoted value such as
    ``--author "Jane Doe"`` is kept as a single argument.

    Args:
        line: Raw text following the magic name, e.g. ``track --no-llm``.

    Returns:
        Namespace with ``command``, ``rocrate_path``, ``author``, ``keywords``,
        ``manual_inputs`` and ``no_llm``.

    Raises:
        SystemExit: Re-raised from argparse when the arguments are invalid,
            after a usage hint has been printed.
    """
    import shlex

    parser = argparse.ArgumentParser(description='Track Jupyter cell execution')
    parser.add_argument('command', nargs='?', default=None)
    parser.add_argument('--rocrate-path', type=str, default=None)
    parser.add_argument('--author', type=str, default="Unknown")
    parser.add_argument('--keywords', nargs='+', default=["jupyter", "computation"])
    parser.add_argument('--input', nargs='+', default=[], dest='manual_inputs')
    parser.add_argument('--no-llm', action='store_true', help='Disable LLM descriptions')

    # Quote-aware tokenising. Backslash escapes are disabled so Windows-style
    # paths (C:\data\crate) pass through untouched, and '#' is not treated as
    # the start of a comment.
    lexer = shlex.shlex(line, posix=True)
    lexer.whitespace_split = True
    lexer.commenters = ''
    lexer.escape = ''
    try:
        args_list = list(lexer)
    except ValueError:
        # Unbalanced quote (e.g. the apostrophe in O'Brien): fall back to
        # plain whitespace splitting rather than rejecting the line.
        args_list = line.split()

    try:
        args = parser.parse_args(args_list)
    except SystemExit:
        print("Usage: %%fairscape track [--rocrate-path PATH] [--author AUTHOR] [--keywords KW1 KW2] [--input FILE1 FILE2] [--no-llm]")
        raise

    return args


def execute_cell_safely(cell: str) -> bool:
    """Run ``cell`` in the active IPython shell and report whether it succeeded.

    Args:
        cell: Source code of the cell to execute.

    Returns:
        True when the cell ran without error; False when it failed either
        before execution (``error_before_exec``, e.g. a syntax error) or
        during execution (``error_in_exec``).
    """
    ip = get_ipython()
    result = ip.run_cell(cell)

    # Both error slots must be inspected: a cell that fails before it starts
    # running leaves error_in_exec unset but has not actually been executed.
    failed_before = getattr(result, 'error_before_exec', None) is not None
    failed_during = result.error_in_exec is not None
    if failed_before or failed_during:
        print("ERROR: Cell execution failed")
        return False

    return True


@register_cell_magic
def fairscape(line, cell):
    """
    Jupyter cell magic for tracking computational provenance.

    Usage:
        %%fairscape track [options]
        <your code here>

    Options:
        --rocrate-path PATH   Path to RO-Crate directory (default: current directory)
        --author AUTHOR       Author name (default: from RO-Crate or "Unknown")
        --keywords KW1 KW2    Keywords for metadata (default: from RO-Crate or ["jupyter", "computation"])
        --input FILE1 FILE2   Manual input files to track
        --no-llm              Disable LLM-based description generation
    """
    args = parse_magic_arguments(line)

    # 'track' is the only supported sub-command.
    if args.command != 'track':
        print("Usage: %%fairscape track [options]")
        return

    rocrate_path = pathlib.Path(args.rocrate_path) if args.rocrate_path else pathlib.Path.cwd()

    tracker_config = TrackerConfig()

    # Execute the cell body while file I/O is being captured; a failed cell
    # produces no provenance record.
    with IOCapture(config=tracker_config) as capture:
        if not execute_cell_safely(cell):
            return

    # Coerce to a real bool: the bare `and` expression would otherwise carry
    # the API key string itself into ProvenanceConfig.use_llm.
    use_llm = (not args.no_llm) and bool(os.environ.get("GEMINI_API_KEY"))

    metadata_generator = None
    if use_llm:
        from datetime import datetime
        try:
            metadata_generator = create_metadata_generator(
                provider="gemini",
                timestamp=datetime.now().strftime("%Y%m%d_%H%M%S")
            )
        except Exception as exc:
            # Mirror the CLI `track` command: a broken LLM setup should not
            # prevent provenance from being recorded.
            print(f"WARNING: Could not initialize LLM metadata generator: {exc}")
            print("Falling back to simple descriptions")

    provenance_config = ProvenanceConfig(
        rocrate_path=rocrate_path,
        author=args.author,
        keywords=args.keywords,
        manual_inputs=args.manual_inputs,
        use_llm=use_llm
    )

    try:
        tracker = ProvenanceTracker(
            config=provenance_config,
            metadata_generator=metadata_generator
        )

        tracker.track_execution(cell, capture)

    except Exception as e:
        # Surface a short message, then re-raise so the notebook shows the
        # full traceback.
        print(f"ERROR: Tracking failed: {e}")
        raise
35 changes: 35 additions & 0 deletions src/fairscape_cli/tracking/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
from .io_capture import IOCapture
from .provenance_tracker import ProvenanceTracker
from .metadata_generator import (
MetadataGenerator,
GeminiMetadataGenerator,
FallbackMetadataGenerator,
MockMetadataGenerator,
create_metadata_generator
)
from .config import TrackerConfig, ProvenanceConfig, TrackingResult
from .utils import (
normalize_path,
is_trackable_path,
read_dataset_sample,
collect_dataset_samples,
format_samples_for_prompt
)

__all__ = [
'IOCapture',
'ProvenanceTracker',
'MetadataGenerator',
'GeminiMetadataGenerator',
'FallbackMetadataGenerator',
'MockMetadataGenerator',
'create_metadata_generator',
'TrackerConfig',
'ProvenanceConfig',
'TrackingResult',
'normalize_path',
'is_trackable_path',
'read_dataset_sample',
'collect_dataset_samples',
'format_samples_for_prompt',
]
46 changes: 46 additions & 0 deletions src/fairscape_cli/tracking/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
from dataclasses import dataclass, field
from typing import List, Optional
from pathlib import Path


def _default_excluded_patterns() -> List[str]:
    """Return a fresh list of the default excluded path fragments."""
    return [
        '.matplotlib',
        '.ipython',
        '.jupyter',
        'site-packages',
        '/tmp/',
        '__pycache__',
    ]


@dataclass
class TrackerConfig:
    """Settings object handed to ``IOCapture``.

    All ``track_*`` switches default to True. ``excluded_patterns`` gets its
    own list per instance, so mutating one config never affects another.
    """

    # Per-layer capture switches (presumably one per I/O API that is hooked;
    # confirm against IOCapture).
    track_builtins: bool = True
    track_pathlib: bool = True
    track_pandas: bool = True
    track_numpy: bool = True
    # Path fragments to leave out of tracking (matching rule lives in the
    # consumer; not visible here).
    excluded_patterns: List[str] = field(default_factory=_default_excluded_patterns)


@dataclass
class TrackingResult:
    """Outcome of one tracked execution: identifiers plus dataset counts."""

    # Identifiers of the recorded computation and software entries.
    computation_guid: str
    software_guid: str
    # Dataset tallies for the run.
    input_count: int
    output_count: int
    reused_count: int
    new_datasets: int

    def __str__(self):
        """Render a four-line, human-readable summary of the result."""
        summary_lines = [
            f"Tracked computation: {self.computation_guid}",
            f" Software: {self.software_guid}",
            f" Inputs: {self.input_count} datasets ({self.reused_count} reused)",
            f" Outputs: {self.output_count} datasets",
        ]
        return "\n".join(summary_lines)


def _default_keywords() -> List[str]:
    """Return a fresh copy of the default keyword list."""
    return ["jupyter", "computation"]


@dataclass
class ProvenanceConfig:
    """Options supplied to ``ProvenanceTracker`` for a single tracked run.

    Only ``rocrate_path`` is required; every other field has a default. The
    list-valued fields are created per instance, so they are never shared.
    """

    # Location of the RO-Crate directory.
    rocrate_path: Path
    # Author name recorded with the run.
    author: str = "Unknown"
    # Keywords recorded with the run.
    keywords: List[str] = field(default_factory=_default_keywords)
    # Input files declared explicitly by the user.
    manual_inputs: List[str] = field(default_factory=list)
    # Whether LLM-based description generation is requested.
    use_llm: bool = False
Loading
Loading