From d9176724edf933245a079c9255583d8ebdc9e8d9 Mon Sep 17 00:00:00 2001 From: fern-api <115122769+fern-api[bot]@users.noreply.github.com> Date: Tue, 13 May 2025 17:35:47 +0000 Subject: [PATCH 1/6] Release 0.8.39 From 99461727fc9f6758aaa85dea057764a05ce4e4f0 Mon Sep 17 00:00:00 2001 From: Ale Pouroullis Date: Tue, 13 May 2025 18:54:33 +0100 Subject: [PATCH 2/6] Add custom code on top of autogenerated SDK --- .gitignore | 2 + pytest.ini | 2 + src/humanloop/cli/__init__.py | 0 src/humanloop/cli/__main__.py | 248 ++++++++++++ src/humanloop/client.py | 107 ++++- src/humanloop/overload.py | 274 ++++++++----- src/humanloop/sync/__init__.py | 3 + src/humanloop/sync/sync_client.py | 374 ++++++++++++++++++ tests/custom/README.md | 19 + tests/custom/__init__.py | 0 tests/custom/assets/exact_match.py | 16 + tests/custom/assets/levenshtein.py | 99 +++++ tests/custom/conftest.py | 170 ++++++++ tests/custom/integration/__init__.py | 0 tests/custom/integration/conftest.py | 259 ++++++++++++ tests/custom/integration/test_decorators.py | 153 ++++++++ tests/custom/integration/test_evals.py | 411 ++++++++++++++++++++ tests/custom/integration/test_sync.py | 206 ++++++++++ tests/custom/integration/test_sync_cli.py | 179 +++++++++ tests/custom/otel/__init__.py | 0 tests/custom/otel/test_helpers.py | 172 ++++++++ tests/custom/sync/__init__.py | 0 tests/custom/sync/test_client.py | 126 ++++++ tests/custom/types.py | 15 + 24 files changed, 2736 insertions(+), 99 deletions(-) create mode 100644 pytest.ini create mode 100644 src/humanloop/cli/__init__.py create mode 100644 src/humanloop/cli/__main__.py create mode 100644 src/humanloop/sync/__init__.py create mode 100644 src/humanloop/sync/sync_client.py create mode 100644 tests/custom/README.md create mode 100644 tests/custom/__init__.py create mode 100644 tests/custom/assets/exact_match.py create mode 100644 tests/custom/assets/levenshtein.py create mode 100644 tests/custom/conftest.py create mode 100644 tests/custom/integration/__init__.py create mode 100644 tests/custom/integration/conftest.py create mode 100644 tests/custom/integration/test_decorators.py create mode 100644 tests/custom/integration/test_evals.py create mode 100644 tests/custom/integration/test_sync.py create mode 100644 tests/custom/integration/test_sync_cli.py create mode 100644 tests/custom/otel/__init__.py create mode 100644 tests/custom/otel/test_helpers.py create mode 100644 tests/custom/sync/__init__.py create mode 100644 tests/custom/sync/test_client.py create mode 100644 tests/custom/types.py diff --git a/.gitignore b/.gitignore index a55ede77..f5cda9d9 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,5 @@ poetry.toml .env tests/assets/*.jsonl tests/assets/*.parquet +# Ignore humanloop directory which could mistakenly be committed when testing sync functionality as it's used as the default sync directory +humanloop diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 00000000..8ab80e5d --- /dev/null +++ b/pytest.ini @@ -0,0 +1,2 @@ +[pytest] +addopts = -n auto diff --git a/src/humanloop/cli/__init__.py b/src/humanloop/cli/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/humanloop/cli/__main__.py b/src/humanloop/cli/__main__.py new file mode 100644 index 00000000..ad582bbc --- /dev/null +++ b/src/humanloop/cli/__main__.py @@ -0,0 +1,248 @@ +import click +import logging +from typing import Optional, Callable +from functools import wraps +from dotenv import load_dotenv +import os +import sys +from humanloop import Humanloop +from 
humanloop.sync.sync_client import SyncClient +import time + +# Set up logging +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) # Set back to INFO level +console_handler = logging.StreamHandler() +formatter = logging.Formatter("%(message)s") # Simplified formatter +console_handler.setFormatter(formatter) +if not logger.hasHandlers(): + logger.addHandler(console_handler) + +# Color constants +SUCCESS_COLOR = "green" +ERROR_COLOR = "red" +INFO_COLOR = "blue" +WARNING_COLOR = "yellow" + + +def load_api_key(env_file: Optional[str] = None) -> str: + """Load API key from .env file or environment variable. + + Args: + env_file: Optional path to .env file + + Returns: + str: The loaded API key + + Raises: + click.ClickException: If no API key is found + """ + # Try specific .env file if provided, otherwise default to .env in current directory + if env_file: + if not load_dotenv(env_file): # load_dotenv returns False if file not found/invalid + raise click.ClickException( + click.style( + f"Failed to load environment file: {env_file} (file not found or invalid format)", + fg=ERROR_COLOR, + ) + ) + else: + load_dotenv() # Attempt to load from default .env in current directory + + # Get API key from environment + api_key = os.getenv("HUMANLOOP_API_KEY") + if not api_key: + raise click.ClickException( + click.style( + "No API key found. Set HUMANLOOP_API_KEY in .env file or environment, or use --api-key", fg=ERROR_COLOR + ) + ) + + return api_key + + +def get_client( + api_key: Optional[str] = None, env_file: Optional[str] = None, base_url: Optional[str] = None +) -> Humanloop: + """Instantiate a Humanloop client for the CLI. + + Args: + api_key: Optional API key provided directly + env_file: Optional path to .env file + base_url: Optional base URL for the API + + Returns: + Humanloop: Configured client instance + + Raises: + click.ClickException: If no API key is found + """ + if not api_key: + api_key = load_api_key(env_file) + return Humanloop(api_key=api_key, base_url=base_url) + + +def common_options(f: Callable) -> Callable: + """Decorator for common CLI options.""" + + @click.option( + "--api-key", + help="Humanloop API key. If not provided, uses HUMANLOOP_API_KEY from .env or environment.", + default=None, + show_default=False, + ) + @click.option( + "--env-file", + help="Path to .env file. If not provided, looks for .env in current directory.", + default=None, + type=click.Path(exists=True), + show_default=False, + ) + @click.option( + "--local-files-directory", + "--local-dir", + help="Directory (relative to the current working directory) where Humanloop files are stored locally (default: humanloop/).", + default="humanloop", + type=click.Path(), + ) + @click.option( + "--base-url", + default=None, + hidden=True, + ) + @wraps(f) + def wrapper(*args, **kwargs): + return f(*args, **kwargs) + + return wrapper + + +def handle_sync_errors(f: Callable) -> Callable: + """Decorator for handling sync operation errors. + + If an error occurs in any operation that uses this decorator, it will be logged and the program will exit with a non-zero exit code. 
+ """ + + @wraps(f) + def wrapper(*args, **kwargs): + try: + return f(*args, **kwargs) + except Exception as e: + click.echo(click.style(str(f"Error: {e}"), fg=ERROR_COLOR)) + sys.exit(1) + + return wrapper + + +@click.group( + help="Humanloop CLI for managing sync operations.", + context_settings={ + "help_option_names": ["-h", "--help"], + "max_content_width": 100, + }, +) +def cli(): # Does nothing because used as a group for other subcommands (pull, push, etc.) + """Humanloop CLI for managing sync operations.""" + pass + + +@cli.command() +@click.option( + "--path", + "-p", + help="Path in the Humanloop workspace to pull from (file or directory). You can pull an entire directory (e.g. 'my/directory') " + "or a specific file (e.g. 'my/directory/my_prompt.prompt'). When pulling a directory, all files within that directory and its subdirectories will be included. " + "If not specified, pulls from the root of the remote workspace.", + default=None, +) +@click.option( + "--environment", + "-e", + help="Environment to pull from (e.g. 'production', 'staging')", + default=None, +) +@click.option( + "--verbose", + "-v", + is_flag=True, + help="Show detailed information about the operation", +) +@click.option( + "--quiet", + "-q", + is_flag=True, + help="Suppress output of successful files", +) +@handle_sync_errors +@common_options +def pull( + path: Optional[str], + environment: Optional[str], + api_key: Optional[str], + env_file: Optional[str], + local_files_directory: str, + base_url: Optional[str], + verbose: bool, + quiet: bool, +): + """Pull Prompt and Agent files from Humanloop to your local filesystem. + + \b + This command will: + 1. Fetch Prompt and Agent files from your Humanloop workspace + 2. Save them to your local filesystem (directory specified by --local-files-directory, default: humanloop/) + 3. Maintain the same directory structure as in Humanloop + 4. Add appropriate file extensions (.prompt or .agent) + + \b + For example, with the default --local-files-directory=humanloop, files will be saved as: + ./humanloop/ + ├── my_project/ + │ ├── prompts/ + │ │ ├── my_prompt.prompt + │ │ └── nested/ + │ │ └── another_prompt.prompt + │ └── agents/ + │ └── my_agent.agent + └── another_project/ + └── prompts/ + └── other_prompt.prompt + + \b + If you specify --local-files-directory=data/humanloop, files will be saved in ./data/humanloop/ instead. + + If a file exists both locally and in the Humanloop workspace, the local file will be overwritten + with the version from Humanloop. Files that only exist locally will not be affected. + + Currently only supports syncing Prompt and Agent files. 
Other file types will be skipped.""" + client = get_client(api_key, env_file, base_url) + sync_client = SyncClient( + client, base_dir=local_files_directory, log_level=logging.DEBUG if verbose else logging.WARNING + ) + + click.echo(click.style("Pulling files from Humanloop...", fg=INFO_COLOR)) + click.echo(click.style(f"Path: {path or '(root)'}", fg=INFO_COLOR)) + click.echo(click.style(f"Environment: {environment or '(default)'}", fg=INFO_COLOR)) + + start_time = time.time() + successful_files, failed_files = sync_client.pull(path, environment) + duration_ms = int((time.time() - start_time) * 1000) + + # Determine if the operation was successful based on failed_files + is_successful = not failed_files + duration_color = SUCCESS_COLOR if is_successful else ERROR_COLOR + click.echo(click.style(f"Pull completed in {duration_ms}ms", fg=duration_color)) + + if successful_files and not quiet: + click.echo(click.style(f"\nSuccessfully pulled {len(successful_files)} files:", fg=SUCCESS_COLOR)) + for file in successful_files: + click.echo(click.style(f" ✓ {file}", fg=SUCCESS_COLOR)) + + if failed_files: + click.echo(click.style(f"\nFailed to pull {len(failed_files)} files:", fg=ERROR_COLOR)) + for file in failed_files: + click.echo(click.style(f" ✗ {file}", fg=ERROR_COLOR)) + + +if __name__ == "__main__": + cli() diff --git a/src/humanloop/client.py b/src/humanloop/client.py index 74cd6c97..fce02a98 100644 --- a/src/humanloop/client.py +++ b/src/humanloop/client.py @@ -1,6 +1,7 @@ import os import typing -from typing import Any, List, Optional, Sequence +from typing import Any, List, Optional, Sequence, Tuple +import logging import httpx from opentelemetry.sdk.resources import Resource @@ -18,7 +19,7 @@ ) from humanloop.base_client import AsyncBaseHumanloop, BaseHumanloop -from humanloop.overload import overload_call, overload_log +from humanloop.overload import overload_client from humanloop.decorators.flow import flow as flow_decorator_factory from humanloop.decorators.prompt import prompt_decorator_factory from humanloop.decorators.tool import tool_decorator_factory as tool_decorator_factory @@ -29,6 +30,9 @@ from humanloop.otel.processor import HumanloopSpanProcessor from humanloop.prompt_utils import populate_template from humanloop.prompts.client import PromptsClient +from humanloop.sync.sync_client import SyncClient, DEFAULT_CACHE_SIZE + +logger = logging.getLogger("humanloop.sdk") class ExtendedEvalsClient(EvaluationsClient): @@ -87,8 +91,9 @@ class Humanloop(BaseHumanloop): """ See docstring of :class:`BaseHumanloop`. - This class extends the base client with custom evaluation utilities - and decorators for declaring Files in code. + This class extends the base client with custom evaluation utilities, + decorators for declaring Files in code, and utilities for syncing + files between Humanloop and local filesystem. """ def __init__( @@ -102,6 +107,9 @@ def __init__( httpx_client: typing.Optional[httpx.Client] = None, opentelemetry_tracer_provider: Optional[TracerProvider] = None, opentelemetry_tracer: Optional[Tracer] = None, + use_local_files: bool = False, + local_files_directory: str = "humanloop", + cache_size: int = DEFAULT_CACHE_SIZE, ): """ Extends the base client with custom evaluation utilities and @@ -111,6 +119,27 @@ def __init__( You can provide a TracerProvider and a Tracer to integrate with your existing telemetry system. If not provided, an internal TracerProvider will be used. 
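+
+        A minimal sketch of enabling local files (the path "samples/test" is
+        illustrative and assumes a file exists at "humanloop/samples/test.prompt"):
+
+        ```
+        client = Humanloop(api_key="...", use_local_files=True)
+        # The prompt definition is read from ./humanloop/samples/test.prompt
+        # rather than fetched from the Humanloop workspace.
+        response = client.prompts.call(
+            path="samples/test",
+            messages=[{"role": "user", "content": "Hello"}],
+        )
+        ```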
+ + Parameters + ---------- + base_url: Optional base URL for the API + environment: The environment to use (default: DEFAULT) + api_key: Your Humanloop API key (default: from HUMANLOOP_API_KEY env var) + timeout: Optional timeout for API requests + follow_redirects: Whether to follow redirects + httpx_client: Optional custom httpx client + opentelemetry_tracer_provider: Optional tracer provider for telemetry + opentelemetry_tracer: Optional tracer for telemetry + use_local_files: Whether to use local files for prompts and agents + local_files_directory: Base directory where local prompt and agent files are stored (default: "humanloop"). + This is relative to the current working directory. For example: + - "humanloop" will look for files in "./humanloop/" + - "data/humanloop" will look for files in "./data/humanloop/" + When using paths in the API, they must be relative to this directory. For example, + if local_files_directory="humanloop" and you have a file at "humanloop/samples/test.prompt", + you would reference it as "samples/test" in your code. + cache_size: Maximum number of files to cache when use_local_files is True (default: DEFAULT_CACHE_SIZE). + This parameter has no effect if use_local_files is False. """ super().__init__( base_url=base_url, @@ -121,6 +150,17 @@ def __init__( httpx_client=httpx_client, ) + self.use_local_files = use_local_files + + # Warn user if cache_size is non-default but use_local_files is False — has no effect and will therefore be ignored + if not self.use_local_files and cache_size != DEFAULT_CACHE_SIZE: + logger.warning( + f"The specified cache_size={cache_size} will have no effect because use_local_files=False. " + f"File caching is only active when local files are enabled." + ) + + # Check if cache_size is non-default but use_local_files is False + self._sync_client = SyncClient(client=self, base_dir=local_files_directory, cache_size=cache_size) eval_client = ExtendedEvalsClient(client_wrapper=self._client_wrapper) eval_client.client = self self.evaluations = eval_client @@ -128,10 +168,14 @@ def __init__( # Overload the .log method of the clients to be aware of Evaluation Context # and the @flow decorator providing the trace_id - self.prompts = overload_log(client=self.prompts) - self.prompts = overload_call(client=self.prompts) - self.flows = overload_log(client=self.flows) - self.tools = overload_log(client=self.tools) + self.prompts = overload_client( + client=self.prompts, sync_client=self._sync_client, use_local_files=self.use_local_files + ) + self.agents = overload_client( + client=self.agents, sync_client=self._sync_client, use_local_files=self.use_local_files + ) + self.flows = overload_client(client=self.flows) + self.tools = overload_client(client=self.tools) if opentelemetry_tracer_provider is not None: self._tracer_provider = opentelemetry_tracer_provider @@ -351,6 +395,53 @@ def agent(): attributes=attributes, ) + def pull(self, path: str | None = None, environment: str | None = None) -> Tuple[List[str], List[str]]: + """Pull Prompt and Agent files from Humanloop to local filesystem. + + This method will: + 1. Fetch Prompt and Agent files from your Humanloop workspace + 2. Save them to your local filesystem (directory specified by `local_files_directory`, default: "humanloop") + 3. Maintain the same directory structure as in Humanloop + 4. Add appropriate file extensions (`.prompt` or `.agent`) + + The path parameter can be used in two ways: + - If it points to a specific file (e.g. 
"path/to/file.prompt" or "path/to/file.agent"), only that file will be pulled + - If it points to a directory (e.g. "path/to/directory"), all Prompt and Agent files in that directory and its subdirectories will be pulled + - If no path is provided, all Prompt and Agent files will be pulled + + The operation will overwrite existing files with the latest version from Humanloop + but will not delete local files that don't exist in the remote workspace. + + Currently only supports syncing Prompt and Agent files. Other file types will be skipped. + + For example, with the default `local_files_directory="humanloop"`, files will be saved as: + ``` + ./humanloop/ + ├── my_project/ + │ ├── prompts/ + │ │ ├── my_prompt.prompt + │ │ └── nested/ + │ │ └── another_prompt.prompt + │ └── agents/ + │ └── my_agent.agent + └── another_project/ + └── prompts/ + └── other_prompt.prompt + ``` + + If you specify `local_files_directory="data/humanloop"`, files will be saved in ./data/humanloop/ instead. + + :param path: Optional path to either a specific file (e.g. "path/to/file.prompt") or a directory (e.g. "path/to/directory"). + If not provided, all Prompt and Agent files will be pulled. + :param environment: The environment to pull the files from. + :return: Tuple of two lists: + - First list contains paths of successfully synced files + - Second list contains paths of files that failed to sync (due to API errors, missing content, + or filesystem issues) + :raises HumanloopRuntimeError: If there's an error communicating with the API + """ + return self._sync_client.pull(environment=environment, path=path) + class AsyncHumanloop(AsyncBaseHumanloop): """ diff --git a/src/humanloop/overload.py b/src/humanloop/overload.py index b0c83215..92c83e6b 100644 --- a/src/humanloop/overload.py +++ b/src/humanloop/overload.py @@ -1,54 +1,69 @@ import inspect import logging import types -from typing import TypeVar, Union +from typing import Any, Dict, Optional, Union, Callable from humanloop.context import ( get_decorator_context, get_evaluation_context, get_trace_id, ) -from humanloop.evals.run import HumanloopRuntimeError - -from humanloop.evaluators.client import EvaluatorsClient -from humanloop.flows.client import FlowsClient +from humanloop.error import HumanloopRuntimeError +from humanloop.sync.sync_client import SyncClient from humanloop.prompts.client import PromptsClient +from humanloop.flows.client import FlowsClient +from humanloop.datasets.client import DatasetsClient +from humanloop.agents.client import AgentsClient from humanloop.tools.client import ToolsClient +from humanloop.evaluators.client import EvaluatorsClient +from humanloop.types import FileType from humanloop.types.create_evaluator_log_response import CreateEvaluatorLogResponse from humanloop.types.create_flow_log_response import CreateFlowLogResponse from humanloop.types.create_prompt_log_response import CreatePromptLogResponse from humanloop.types.create_tool_log_response import CreateToolLogResponse from humanloop.types.prompt_call_response import PromptCallResponse +from humanloop.types.agent_call_response import AgentCallResponse logger = logging.getLogger("humanloop.sdk") - -CLIENT_TYPE = TypeVar("CLIENT_TYPE", PromptsClient, FlowsClient, EvaluatorsClient, ToolsClient) - - -def overload_log(client: CLIENT_TYPE) -> CLIENT_TYPE: - """ - Wrap the `log` method of the provided Humanloop client to use EVALUATION_CONTEXT. - - This makes the overloaded log actions be aware of whether the created Log is - part of an Evaluation (e.g. 
one started by eval_utils.run_eval). - """ - # Copy the original log method in a hidden attribute - client._log = client.log # type: ignore [attr-defined] - - def _overload_log( - # It's safe to only consider kwargs since the original - # log method bans positional arguments - self, - **kwargs, - ) -> Union[ - CreatePromptLogResponse, - CreateToolLogResponse, - CreateFlowLogResponse, - CreateEvaluatorLogResponse, - ]: - trace_id = get_trace_id() - if trace_id is not None and type(client) is FlowsClient: +LogResponseType = Union[ + CreatePromptLogResponse, + CreateToolLogResponse, + CreateFlowLogResponse, + CreateEvaluatorLogResponse, +] + +CallResponseType = Union[ + PromptCallResponse, + AgentCallResponse, +] + + +def _get_file_type_from_client( + client: Union[PromptsClient, AgentsClient, ToolsClient, FlowsClient, DatasetsClient, EvaluatorsClient], +) -> FileType: + """Get the file type based on the client type.""" + if isinstance(client, PromptsClient): + return "prompt" + elif isinstance(client, AgentsClient): + return "agent" + elif isinstance(client, ToolsClient): + return "tool" + elif isinstance(client, FlowsClient): + return "flow" + elif isinstance(client, DatasetsClient): + return "dataset" + elif isinstance(client, EvaluatorsClient): + return "evaluator" + + raise ValueError(f"Unsupported client type: {type(client)}") + + +def _handle_tracing_context(kwargs: Dict[str, Any], client: Any) -> Dict[str, Any]: + """Handle tracing context for both log and call methods.""" + trace_id = get_trace_id() + if trace_id is not None: + if "flow" in str(type(client).__name__).lower(): context = get_decorator_context() if context is None: raise HumanloopRuntimeError("Internal error: trace_id context is set outside a decorator context.") @@ -56,69 +71,146 @@ def _overload_log( f"Using `flows.log()` is not allowed: Flow decorator " f"for File {context.path} manages the tracing and trace completion." 
) - if trace_id is not None: - if "trace_parent_id" in kwargs: - logger.warning( - "Ignoring trace_parent_id argument at line %d: the Flow decorator manages tracing.", - inspect.currentframe().f_lineno, # type: ignore [union-attr] - ) - kwargs = { - **kwargs, - "trace_parent_id": trace_id, - } - evaluation_context = get_evaluation_context() - if evaluation_context is not None: - kwargs_eval, eval_callback = evaluation_context.log_args_with_context( - path=kwargs.get("path"), log_args=kwargs - ) - try: - response = self._log(**kwargs_eval) - except Exception as e: - # Re-raising as HumanloopDecoratorError so the decorators don't catch it - raise HumanloopRuntimeError from e - if eval_callback is not None: - eval_callback(response.id) - else: - try: - response = self._log(**kwargs) - except Exception as e: - # Re-raising as HumanloopDecoratorError so the decorators don't catch it - raise HumanloopRuntimeError from e - - return response - # Replace the original log method with the overloaded one - client.log = types.MethodType(_overload_log, client) # type: ignore [assignment] - # Return the client with the overloaded log method - logger.debug("Overloaded the .call method of %s", client) - return client + if "trace_parent_id" in kwargs: + logger.warning( + "Ignoring trace_parent_id argument at line %d: the Flow decorator manages tracing.", + inspect.currentframe().f_lineno, # type: ignore[union-attr] + ) + kwargs = { + **kwargs, + "trace_parent_id": trace_id, + } + return kwargs + + +def _handle_local_files( + kwargs: Dict[str, Any], + client: Any, + sync_client: Optional[SyncClient], + use_local_files: bool, +) -> Dict[str, Any]: + """Handle local file loading if enabled.""" + if not use_local_files or "path" not in kwargs or sync_client is None: + return kwargs + + if "id" in kwargs: + raise HumanloopRuntimeError("Can only specify one of `id` or `path`") + + # Check if version_id or environment is specified + use_remote = any(["version_id" in kwargs, "environment" in kwargs]) + normalized_path = sync_client._normalize_path(kwargs["path"]) + + if use_remote: + raise HumanloopRuntimeError( + f"Cannot use local file for `{normalized_path}` as version_id or environment was specified. " + "Please either remove version_id/environment to use local files, or set use_local_files=False to use remote files." + ) + + file_type = _get_file_type_from_client(client) + if file_type not in SyncClient.SERIALIZABLE_FILE_TYPES: + raise HumanloopRuntimeError(f"Local files are not supported for `{file_type}` files.") + + # If file_type is already specified in kwargs, it means user provided a PromptKernelRequestParams object + if file_type in kwargs and not isinstance(kwargs[file_type], str): + logger.warning( + f"Ignoring local file for `{normalized_path}` as {file_type} parameters were directly provided. " + "Using provided parameters instead." 
+ ) + return kwargs + + try: + file_content = sync_client.get_file_content(normalized_path, file_type) # type: ignore[arg-type] # file_type was checked above + kwargs[file_type] = file_content + except HumanloopRuntimeError as e: + raise HumanloopRuntimeError(f"Failed to use local file for `{normalized_path}`: {str(e)}") + + return kwargs + + +def _handle_evaluation_context(kwargs: Dict[str, Any]) -> tuple[Dict[str, Any], Optional[Callable[[str], None]]]: + """Handle evaluation context for logging.""" + evaluation_context = get_evaluation_context() + if evaluation_context is not None: + return evaluation_context.log_args_with_context(path=kwargs.get("path"), log_args=kwargs) + return kwargs, None + + +def _overload_log(self: Any, sync_client: Optional[SyncClient], use_local_files: bool, **kwargs) -> LogResponseType: + try: + # Special handling for flows - prevent direct log usage + if type(self) is FlowsClient and get_trace_id() is not None: + context = get_decorator_context() + if context is None: + raise HumanloopRuntimeError("Internal error: trace_id context is set outside a decorator context.") + raise HumanloopRuntimeError( + f"Using `flows.log()` is not allowed: Flow decorator " + f"for File {context.path} manages the tracing and trace completion." + ) + kwargs = _handle_tracing_context(kwargs, self) -def overload_call(client: PromptsClient) -> PromptsClient: - client._call = client.call # type: ignore [attr-defined] - - def _overload_call(self, **kwargs) -> PromptCallResponse: - # None if not logging inside a decorator - trace_id = get_trace_id() - if trace_id is not None: - if "trace_parent_id" in kwargs: - logger.warning( - "Ignoring trace_parent_id argument at line %d: the Flow decorator manages tracing.", - inspect.currentframe().f_lineno, # type: ignore [union-attr] - ) - kwargs = { - **kwargs, - "trace_parent_id": trace_id, - } - - try: - response = self._call(**kwargs) - except Exception as e: - # Re-raising as HumanloopDecoratorError so the decorators don't catch it - raise HumanloopRuntimeError from e + # Handle local files for Prompts and Agents clients + if _get_file_type_from_client(self) in ["prompt", "agent"]: + if sync_client is None: + logger.error("sync_client is None but client has log method and use_local_files=%s", use_local_files) + raise HumanloopRuntimeError("sync_client is required for clients that support local file operations") + kwargs = _handle_local_files(kwargs, self, sync_client, use_local_files) + kwargs, eval_callback = _handle_evaluation_context(kwargs) + response = self._log(**kwargs) # Use stored original method + if eval_callback is not None: + eval_callback(response.id) return response + except HumanloopRuntimeError: + # Re-raise HumanloopRuntimeError without wrapping to preserve the message + raise + except Exception as e: + # Only wrap non-HumanloopRuntimeError exceptions + raise HumanloopRuntimeError from e + + +def _overload_call(self: Any, sync_client: Optional[SyncClient], use_local_files: bool, **kwargs) -> CallResponseType: + try: + kwargs = _handle_tracing_context(kwargs, self) + kwargs = _handle_local_files(kwargs, self, sync_client, use_local_files) + return self._call(**kwargs) # Use stored original method + except HumanloopRuntimeError: + # Re-raise HumanloopRuntimeError without wrapping to preserve the message + raise + except Exception as e: + # Only wrap non-HumanloopRuntimeError exceptions + raise HumanloopRuntimeError from e + + +def overload_client( + client: Any, + sync_client: Optional[SyncClient] = None, + use_local_files: 
bool = False, +) -> Any: + """Overloads client methods to add tracing, local file handling, and evaluation context.""" + # Store original log method as _log for all clients. Used in flow decorator + if hasattr(client, "log") and not hasattr(client, "_log"): + client._log = client.log # type: ignore[attr-defined] + + # Create a closure to capture sync_client and use_local_files + def log_wrapper(self: Any, **kwargs) -> LogResponseType: + return _overload_log(self, sync_client, use_local_files, **kwargs) + + client.log = types.MethodType(log_wrapper, client) + + # Overload call method for Prompt and Agent clients + if _get_file_type_from_client(client) in ["prompt", "agent"]: + if sync_client is None and use_local_files: + logger.error("sync_client is None but client has call method and use_local_files=%s", use_local_files) + raise HumanloopRuntimeError("sync_client is required for clients that support call operations") + if hasattr(client, "call") and not hasattr(client, "_call"): + client._call = client.call # type: ignore[attr-defined] + + # Create a closure to capture sync_client and use_local_files + def call_wrapper(self: Any, **kwargs) -> CallResponseType: + return _overload_call(self, sync_client, use_local_files, **kwargs) + + client.call = types.MethodType(call_wrapper, client) - # Replace the original log method with the overloaded one - client.call = types.MethodType(_overload_call, client) # type: ignore [assignment] return client diff --git a/src/humanloop/sync/__init__.py b/src/humanloop/sync/__init__.py new file mode 100644 index 00000000..007659df --- /dev/null +++ b/src/humanloop/sync/__init__.py @@ -0,0 +1,3 @@ +from humanloop.sync.sync_client import SyncClient + +__all__ = ["SyncClient"] diff --git a/src/humanloop/sync/sync_client.py b/src/humanloop/sync/sync_client.py new file mode 100644 index 00000000..d71f1568 --- /dev/null +++ b/src/humanloop/sync/sync_client.py @@ -0,0 +1,374 @@ +import logging +from pathlib import Path +from typing import List, Tuple, TYPE_CHECKING +from functools import lru_cache +import typing +import time +from humanloop.error import HumanloopRuntimeError +import json + +if TYPE_CHECKING: + from humanloop.base_client import BaseHumanloop + +# Set up logging +logger = logging.getLogger("humanloop.sdk.sync") +logger.setLevel(logging.INFO) +console_handler = logging.StreamHandler() +formatter = logging.Formatter("%(message)s") +console_handler.setFormatter(formatter) +if not logger.hasHandlers(): + logger.addHandler(console_handler) + +# Default cache size for file content caching +DEFAULT_CACHE_SIZE = 100 + + +def format_api_error(error: Exception) -> str: + """Format API error messages to be more user-friendly.""" + error_msg = str(error) + if "status_code" not in error_msg or "body" not in error_msg: + return error_msg + + try: + # Extract the body part and parse as JSON + body_str = error_msg.split("body: ")[1] + # Convert Python dict string to valid JSON by: + # 1. Escaping double quotes + # 2. 
Replacing single quotes with double quotes + body_str = body_str.replace('"', '\\"').replace("'", '"') + body = json.loads(body_str) + + # Get the detail from the body + detail = body.get("detail", {}) + + # Handle both string and dictionary types for detail + if isinstance(detail, str): + return detail + elif isinstance(detail, dict): + return detail.get("description") or detail.get("msg") or error_msg + else: + return error_msg + except Exception as e: + logger.debug(f"Failed to parse error message: {str(e)}") + return error_msg + + +SerializableFileType = typing.Literal["prompt", "agent"] + + +class SyncClient: + """Client for managing synchronization between local filesystem and Humanloop. + + This client provides file synchronization between Humanloop and the local filesystem, + with built-in caching for improved performance. The cache uses Python's LRU (Least + Recently Used) cache to automatically manage memory usage by removing least recently + accessed files when the cache is full. + + The cache is automatically updated when files are pulled or saved, and can be + manually cleared using the clear_cache() method. + """ + + # File types that can be serialized to/from the filesystem + SERIALIZABLE_FILE_TYPES = frozenset(typing.get_args(SerializableFileType)) + + def __init__( + self, + client: "BaseHumanloop", + base_dir: str = "humanloop", + cache_size: int = DEFAULT_CACHE_SIZE, + log_level: int = logging.WARNING, + ): + """ + Parameters + ---------- + client: Humanloop client instance + base_dir: Base directory for synced files (default: "humanloop") + cache_size: Maximum number of files to cache (default: DEFAULT_CACHE_SIZE) + log_level: Log level for logging (default: WARNING) + """ + self.client = client + self.base_dir = Path(base_dir) + self._cache_size = cache_size + + logger.setLevel(log_level) + + # Create a new cached version of get_file_content with the specified cache size + self.get_file_content = lru_cache(maxsize=cache_size)( # type: ignore [assignment] + self._get_file_content_implementation, + ) + + def _get_file_content_implementation(self, path: str, file_type: SerializableFileType) -> str: + """Implementation of get_file_content without the cache decorator. + + This is the actual implementation that gets wrapped by lru_cache. + + Args: + path: The normalized path to the file (without extension) + file_type: The type of file to get the content of (SerializableFileType) + + Returns: + The raw file content + + Raises: + HumanloopRuntimeError: In two cases: + 1. If the file doesn't exist at the expected location + 2. If there's a filesystem error when trying to read the file + (e.g., permission denied, file is locked, etc.) + """ + # Construct path to local file + local_path = self.base_dir / path + # Add appropriate extension + local_path = local_path.parent / f"{local_path.stem}.{file_type}" + + if not local_path.exists(): + raise HumanloopRuntimeError(f"Local file not found: {local_path}") + + try: + # Read the raw file content + with open(local_path) as f: + file_content = f.read() + logger.debug(f"Using local file content from {local_path}") + return file_content + except Exception as e: + raise HumanloopRuntimeError(f"Error reading local file {local_path}: {str(e)}") + + def get_file_content(self, path: str, file_type: SerializableFileType) -> str: + """Get the raw file content of a file from cache or filesystem. + + This method uses an LRU cache to store file contents. 
When the cache is full, + the least recently accessed files are automatically removed to make space. + + Args: + path: The normalized path to the file (without extension) + file_type: The type of file (Prompt or Agent) + + Returns: + The raw file content + + Raises: + HumanloopRuntimeError: If the file doesn't exist or can't be read + """ + return self._get_file_content_implementation(path, file_type) + + def clear_cache(self) -> None: + """Clear the LRU cache.""" + self.get_file_content.cache_clear() # type: ignore [attr-defined] + + def _normalize_path(self, path: str) -> str: + """Normalize the path by: + 1. Converting to a Path object to handle platform-specific separators + 2. Removing any file extensions + 3. Converting to a string with forward slashes and no leading/trailing slashes + """ + # Convert to Path object to handle platform-specific separators + path_obj = Path(path) + + # Reject absolute paths to ensure all paths are relative to base_dir. + # This maintains consistency with the remote filesystem where paths are relative to project root. + if path_obj.is_absolute(): + raise HumanloopRuntimeError( + f"Absolute paths are not supported: `{path}`. " + f"Paths should be relative to the base directory (`{self.base_dir}`)." + ) + + # Remove extension, convert to string with forward slashes, and remove leading/trailing slashes + normalized = str(path_obj.with_suffix("")) + # Replace all backslashes and normalize multiple forward slashes + return "/".join(part for part in normalized.replace("\\", "/").split("/") if part) + + def is_file(self, path: str) -> bool: + """Check if the path is a file by checking for .{file_type} extension for serializable file types.""" + return path.endswith(tuple(f".{file_type}" for file_type in self.SERIALIZABLE_FILE_TYPES)) + + def _save_serialized_file( + self, + serialized_content: str, + file_path: str, + file_type: SerializableFileType, + ) -> None: + """Save serialized file to local filesystem.""" + try: + # Create full path including base_dir prefix + full_path = self.base_dir / file_path + # Create directory if it doesn't exist + full_path.parent.mkdir(parents=True, exist_ok=True) + + # Add file type extension + new_path = full_path.parent / f"{full_path.stem}.{file_type}" + + # Write raw file content to file + with open(new_path, "w") as f: + f.write(serialized_content) + except Exception as e: + logger.error(f"Failed to write {file_type} {file_path} to disk: {str(e)}") + raise + + def _pull_file(self, path: str, environment: str | None = None) -> bool: + """Pull a specific file from Humanloop to local filesystem. 
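+
+        The file is fetched via files.retrieve_by_path (including its raw file
+        content) and written under base_dir with a .prompt or .agent extension.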
+ + Returns: + True if the file was successfully pulled, False otherwise + """ + try: + file = self.client.files.retrieve_by_path( + path=path, + environment=environment, + include_raw_file_content=True, + ) + + if file.type not in self.SERIALIZABLE_FILE_TYPES: + logger.error(f"Unsupported file type: {file.type}") + return False + + if not file.raw_file_content: # type: ignore [union-attr] + logger.error(f"No content found for {file.type} {path}") + return False + + self._save_serialized_file( + serialized_content=file.raw_file_content, # type: ignore [union-attr] + file_path=file.path, + file_type=typing.cast(SerializableFileType, file.type), + ) + return True + except Exception as e: + logger.error(f"Failed to pull file {path}: {str(e)}") + return False + + def _pull_directory( + self, + path: str | None = None, + environment: str | None = None, + ) -> Tuple[List[str], List[str]]: + """Sync Prompt and Agent files from Humanloop to local filesystem. + + Returns: + Tuple of two lists: + - First list contains paths of successfully synced files + - Second list contains paths of files that failed to sync. + Failures can occur due to missing content in the response or errors during local file writing. + + Raises: + HumanloopRuntimeError: If there's an error communicating with the API + """ + successful_files = [] + failed_files = [] + page = 1 + + logger.debug(f"Fetching files from directory: {path or '(root)'} in environment: {environment or '(default)'}") + + while True: + try: + logger.debug(f"`{path}`: Requesting page {page} of files") + response = self.client.files.list_files( + type=list(self.SERIALIZABLE_FILE_TYPES), + page=page, + size=100, + include_raw_file_content=True, + environment=environment, + path=path, + ) + + if len(response.records) == 0: + logger.debug(f"Finished reading files for path `{path}`") + break + + logger.debug(f"`{path}`: Read page {page} containing {len(response.records)} files") + + # Process each file + for file in response.records: + # Skip if not a serializable file type + if file.type not in self.SERIALIZABLE_FILE_TYPES: + logger.warning(f"Skipping unsupported file type: {file.type}") + continue + + file_type: SerializableFileType = typing.cast( + SerializableFileType, + file.type, + ) + + # Skip if no raw file content + if not getattr(file, "raw_file_content", None) or not file.raw_file_content: # type: ignore [union-attr] + logger.warning(f"No content found for {file.type} {file.path}") + failed_files.append(file.path) + continue + + try: + logger.debug(f"Writing {file.type} {file.path} to disk") + self._save_serialized_file( + serialized_content=file.raw_file_content, # type: ignore [union-attr] + file_path=file.path, + file_type=file_type, + ) + successful_files.append(file.path) + except Exception as e: + failed_files.append(file.path) + logger.error(f"Failed to save {file.path}: {str(e)}") + + page += 1 + except Exception as e: + formatted_error = format_api_error(e) + raise HumanloopRuntimeError(f"Failed to fetch page {page}: {formatted_error}") + + if successful_files: + logger.info(f"Successfully pulled {len(successful_files)} files") + if failed_files: + logger.warning(f"Failed to pull {len(failed_files)} files") + + return successful_files, failed_files + + def pull(self, path: str | None = None, environment: str | None = None) -> Tuple[List[str], List[str]]: + """Pull files from Humanloop to local filesystem. + + If the path ends with .prompt or .agent, pulls that specific file. + Otherwise, pulls all files under the specified path. 
+ If no path is provided, pulls all files from the root. + + Args: + path: The path to pull from (either a specific file or directory) + environment: The environment to pull from + + Returns: + Tuple of two lists: + - First list contains paths of successfully synced files + - Second list contains paths of files that failed to sync (e.g. failed to write to disk or missing raw content) + + Raises: + HumanloopRuntimeError: If there's an error communicating with the API + """ + start_time = time.time() + normalized_path = self._normalize_path(path) if path else None + + logger.info( + f"Starting pull operation: path={normalized_path or '(root)'}, environment={environment or '(default)'}" + ) + + try: + if normalized_path is None or path is None: # path being None means normalized_path is None, but we check both for improved type safety + # Pull all files from the root + logger.debug("Pulling all files from root") + successful_files, failed_files = self._pull_directory( + path=None, + environment=environment, + ) + else: + if self.is_file(path.strip()): + logger.debug(f"Pulling file: {normalized_path}") + if self._pull_file(path=normalized_path, environment=environment): + successful_files = [path] + failed_files = [] + else: + successful_files = [] + failed_files = [path] + else: + logger.debug(f"Pulling directory: {normalized_path}") + successful_files, failed_files = self._pull_directory(normalized_path, environment) + + # Clear the cache at the end of each pull operation + self.clear_cache() + + duration_ms = int((time.time() - start_time) * 1000) + logger.info(f"Pull completed in {duration_ms}ms: {len(successful_files)} files succeeded") + + return successful_files, failed_files + except Exception as e: + raise HumanloopRuntimeError(f"Pull operation failed: {str(e)}") diff --git a/tests/custom/README.md b/tests/custom/README.md new file mode 100644 index 00000000..14ff7ed4 --- /dev/null +++ b/tests/custom/README.md @@ -0,0 +1,19 @@ +# Custom Tests Directory + +This directory contains custom tests for the Humanloop Python SDK. While the main SDK is auto-generated using [Fern](https://buildwithfern.com/), this directory allows us to add our own test implementations that won't be overwritten during regeneration. + +## Why Custom Tests? + +- **Preservation**: Tests in this directory won't be overwritten when regenerating the SDK +- **Custom Implementation**: Allows testing of our own implementations beyond the auto-generated code +- **Integration**: Enables testing of how our custom code works with the auto-generated SDK + +## Running Tests + +```bash +# Run all custom tests +pytest tests/custom/ + +# Run specific test file +pytest tests/custom/sync/test_sync_client.py +``` diff --git a/tests/custom/__init__.py b/tests/custom/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/custom/assets/exact_match.py b/tests/custom/assets/exact_match.py new file mode 100644 index 00000000..583d742a --- /dev/null +++ b/tests/custom/assets/exact_match.py @@ -0,0 +1,16 @@ +def extract_answer(generation: str): + """Extracts answer from generation. + + Handles a generation that if separated by "---" with the answer being the first part. + Also handles a generation that starts with "```\n" and removes it. 
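+
+    For example, a generation of '```\nParis\n--- explanation' yields 'Paris'.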
+ """ + answer = generation.split("---")[0].strip() + if answer.startswith("```\n"): + answer = answer[4:].strip() + + return answer + + +def exact_match(log, testcase): + target = testcase["target"]["output"] + return target == extract_answer(log["output"]) diff --git a/tests/custom/assets/levenshtein.py b/tests/custom/assets/levenshtein.py new file mode 100644 index 00000000..b2e279ae --- /dev/null +++ b/tests/custom/assets/levenshtein.py @@ -0,0 +1,99 @@ +def levenshtein_distance_optimized(s1, s2, max_distance=1000): + """ + Calculate the Levenshtein distance between two strings with optimizations and a maximum distance cap. + + This function trims common prefixes and suffixes from the input strings, uses a single-row table + to reduce space complexity, and stops the computation early if the Levenshtein distance is + guaranteed to exceed a maximum distance cap. + + Args: + s1 (str): The first string. + s2 (str): The second string. + max_distance (int, optional): The maximum Levenshtein distance. Defaults to 1000. + + Returns: + int: The Levenshtein distance between the two strings, or max_distance if the distance + exceeds max_distance. + """ + # Trim common prefixes + while s1 and s2 and s1[0] == s2[0]: + s1 = s1[1:] + s2 = s2[1:] + + # Trim common suffixes + while s1 and s2 and s1[-1] == s2[-1]: + s1 = s1[:-1] + s2 = s2[:-1] + + len_s1 = len(s1) + len_s2 = len(s2) + + # If the length difference between the strings exceeds max_distance, stop the computation + if abs(len_s1 - len_s2) > max_distance: + return max_distance + + # If one of the strings is empty, the distance is the length of the other string + if len_s1 == 0: + return min(len_s2, max_distance) + if len_s2 == 0: + return min(len_s1, max_distance) + + # Create a single-row table with len(s2) + 1 columns + distance = list(range(len_s2 + 1)) + + # Fill up the table + for i in range(1, len_s1 + 1): + # Store the value of the previous cell in the previous row + prev_row_cell = i - 1 + # The value at the first column is the row number + distance[0] = i + + # Initialize the minimum distance in the current row to max_distance + min_distance = max_distance + + for j in range(1, len_s2 + 1): + # Store the value of the current cell before it is updated + current_cell = distance[j] + + # If the current characters of the two strings are the same, the cost is 0, otherwise 1 + substitution_cost = 0 if s1[i - 1] == s2[j - 1] else 1 + + # The value at the current cell is the minimum of the values at the previous cell in the + # current row, the current cell in the previous row, and the previous cell in the previous row, + # plus the cost + distance[j] = min( + distance[j - 1] + 1, # deletion + distance[j] + 1, # insertion + prev_row_cell + substitution_cost, + ) # substitution + + # Update the minimum distance in the current row + min_distance = min(min_distance, distance[j]) + + # Update the value of the previous cell in the previous row + prev_row_cell = current_cell + + # If the minimum distance in the current row exceeds max_distance, stop the computation + if min_distance >= max_distance: + return max_distance + + # The Levenshtein distance between the two strings is the value at the last cell in the table + return min(distance[-1], max_distance) + + +def extract_answer(generation: str): + """Extracts answer from generation. + + Handles a generation that if separated by "---" with the answer being the first part. + Also handles a generation that starts with "```\n" and removes it. 
+ """ + answer = generation.split("---")[0].strip() + if answer.startswith("```\n"): + answer = answer[4:].strip() + + return answer + + +def compare_log_and_target(log, testcase): + target = testcase["target"]["output"] + return levenshtein_distance_optimized(target, extract_answer(log["output"])) diff --git a/tests/custom/conftest.py b/tests/custom/conftest.py new file mode 100644 index 00000000..7667dedf --- /dev/null +++ b/tests/custom/conftest.py @@ -0,0 +1,170 @@ +from typing import Generator +import os +from dotenv import load_dotenv +from unittest.mock import MagicMock + +import pytest +from humanloop.client import Humanloop +from humanloop.otel.exporter import HumanloopSpanExporter +from humanloop.otel.processor import HumanloopSpanProcessor +from openai.types.chat.chat_completion_message_param import ChatCompletionMessageParam +from opentelemetry.instrumentation.anthropic import AnthropicInstrumentor +from opentelemetry.instrumentation.cohere import CohereInstrumentor +from opentelemetry.instrumentation.groq import GroqInstrumentor +from opentelemetry.instrumentation.instrumentor import BaseInstrumentor # type: ignore +from opentelemetry.instrumentation.openai import OpenAIInstrumentor +from opentelemetry.instrumentation.replicate import ReplicateInstrumentor +from opentelemetry.sdk.resources import Resource +from opentelemetry.sdk.trace import TracerProvider +from opentelemetry.sdk.trace.export import SimpleSpanProcessor +from opentelemetry.sdk.trace.export.in_memory_span_exporter import InMemorySpanExporter +from opentelemetry.trace import Tracer +from tests.custom.types import GetHumanloopClientFn + + +@pytest.fixture(scope="function") +def opentelemetry_test_provider() -> TracerProvider: + """Create a test TracerProvider with a resource. + + This is similar to the created TracerProvider in the + Humanloop class. + """ + provider = TracerProvider( + resource=Resource.create( + { + "service": "humanloop.sdk", + "environment": "test", + } + ) + ) + return provider + + +@pytest.fixture(scope="function") +def test_span(opentelemetry_test_provider: TracerProvider): + exporter = InMemorySpanExporter() + processor = SimpleSpanProcessor(exporter) + opentelemetry_test_provider.add_span_processor(processor) + tracer = opentelemetry_test_provider.get_tracer("test") + return tracer.start_span("test_span") + + +@pytest.fixture(scope="function") +def opentelemetry_test_configuration( + opentelemetry_test_provider: TracerProvider, +) -> Generator[tuple[Tracer, InMemorySpanExporter], None, None]: + """Configure OTel backend without HumanloopSpanProcessor. + + Spans created by Instrumentors will not be used to enrich + Humanloop Spans. 
+ """ + exporter = InMemorySpanExporter() + processor = SimpleSpanProcessor(exporter) + opentelemetry_test_provider.add_span_processor(processor) + instrumentors: list[BaseInstrumentor] = [ + OpenAIInstrumentor(), + AnthropicInstrumentor(), + GroqInstrumentor(), + CohereInstrumentor(), + ReplicateInstrumentor(), + ] + for instrumentor in instrumentors: + instrumentor.instrument(tracer_provider=opentelemetry_test_provider) + tracer = opentelemetry_test_provider.get_tracer("test") + # Circumvent configuration procedure + + yield tracer, exporter + + for instrumentor in instrumentors: + instrumentor.uninstrument() + + +@pytest.fixture(scope="session") +def get_humanloop_client() -> GetHumanloopClientFn: + load_dotenv() + if not os.getenv("HUMANLOOP_API_KEY"): + pytest.fail("HUMANLOOP_API_KEY is not set for integration tests") + + def _get_humanloop_client(use_local_files: bool = False) -> Humanloop: + return Humanloop( + api_key=os.getenv("HUMANLOOP_API_KEY"), + use_local_files=use_local_files, + ) + + return _get_humanloop_client + + +@pytest.fixture(scope="function") +def opentelemetry_hl_test_configuration( + opentelemetry_test_provider: TracerProvider, +) -> Generator[tuple[Tracer, InMemorySpanExporter], None, None]: + """Configure OTel backend with HumanloopSpanProcessor. + + Spans created by Instrumentors will be used to enrich + Humanloop Spans. + """ + exporter = InMemorySpanExporter() + processor = HumanloopSpanProcessor(exporter=exporter) + opentelemetry_test_provider.add_span_processor(processor) + instrumentors: list[BaseInstrumentor] = [ + OpenAIInstrumentor(), + AnthropicInstrumentor(), + GroqInstrumentor(), + CohereInstrumentor(), + ReplicateInstrumentor(), + AnthropicInstrumentor(), + ] + for instrumentor in instrumentors: + instrumentor.instrument( + tracer_provider=opentelemetry_test_provider, + ) + tracer = opentelemetry_test_provider.get_tracer("test") + + yield tracer, exporter + + for instrumentor in instrumentors: + instrumentor.uninstrument() + + +@pytest.fixture(scope="function") +def hl_test_exporter() -> HumanloopSpanExporter: + """ + Test Exporter where HTTP calls to Humanloop API + are mocked. + """ + client = MagicMock() + exporter = HumanloopSpanExporter(client=client) + return exporter + + +@pytest.fixture(scope="function") +def opentelemetry_hl_with_exporter_test_configuration( + hl_test_exporter: HumanloopSpanExporter, + opentelemetry_test_provider: TracerProvider, +) -> Generator[tuple[Tracer, HumanloopSpanExporter], None, None]: + """Configure OTel backend with HumanloopSpanProcessor and + a HumanloopSpanExporter where HTTP calls are mocked. 
+ """ + processor = HumanloopSpanProcessor(exporter=hl_test_exporter) + opentelemetry_test_provider.add_span_processor(processor) + instrumentor = OpenAIInstrumentor() + instrumentor.instrument(tracer_provider=opentelemetry_test_provider) + tracer = opentelemetry_test_provider.get_tracer("test") + + yield tracer, hl_test_exporter + + instrumentor.uninstrument() + + +@pytest.fixture(scope="session") +def call_llm_messages() -> list[ChatCompletionMessageParam]: + return [ + { + "role": "system", + "content": "You are an assistant on the following topics: greetings in foreign languages.", + }, + { + "role": "user", + "content": "Bonjour!", + }, + ] diff --git a/tests/custom/integration/__init__.py b/tests/custom/integration/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/custom/integration/conftest.py b/tests/custom/integration/conftest.py new file mode 100644 index 00000000..f918c48c --- /dev/null +++ b/tests/custom/integration/conftest.py @@ -0,0 +1,259 @@ +from contextlib import contextmanager, redirect_stdout +from dataclasses import dataclass +import os +import time +from typing import Any, ContextManager, Generator, List, Union +import io +from typing import TextIO +import uuid +import pytest +import dotenv +from humanloop import AgentResponse, PromptResponse +from tests.custom.types import GetHumanloopClientFn, SyncableFile +from click.testing import CliRunner + + +@dataclass +class ResourceIdentifiers: + file_id: str + file_path: str + + +@pytest.fixture() +def capture_stdout() -> ContextManager[TextIO]: + @contextmanager + def _context_manager(): + f = io.StringIO() + with redirect_stdout(f): + yield f + + return _context_manager # type: ignore [return-value] + + +@pytest.fixture(scope="session") +def openai_key() -> str: + dotenv.load_dotenv() + if not os.getenv("OPENAI_API_KEY"): + pytest.fail("OPENAI_API_KEY is not set for integration tests") + return os.getenv("OPENAI_API_KEY") # type: ignore [return-value] + + +@pytest.fixture(scope="function") +def sdk_test_dir(get_humanloop_client: GetHumanloopClientFn) -> Generator[str, None, None]: + humanloop_client = get_humanloop_client() + + def cleanup_directory(directory_id: str): + directory_response = humanloop_client.directories.get(id=directory_id) + for subdirectory in directory_response.subdirectories: + cleanup_directory(subdirectory.id) + for file in directory_response.files: + match file.type: + case "agent": + humanloop_client.agents.delete(id=file.id) + case "prompt": + humanloop_client.prompts.delete(id=file.id) + case "dataset": + humanloop_client.datasets.delete(id=file.id) + case "evaluator": + humanloop_client.evaluators.delete(id=file.id) + case "flow": + humanloop_client.flows.delete(id=file.id) + case "tool": + humanloop_client.tools.delete(id=file.id) + case _: + raise ValueError(f"Unknown file type: {file.type}") + humanloop_client.directories.delete(id=directory_response.id) + + path = f"SDK_INTEGRATION_TEST_{uuid.uuid4()}" + response = None + try: + response = humanloop_client.directories.create(path=path) + yield response.path + except Exception as e: + pytest.fail(f"Failed to create directory {path}: {e}") + finally: + if response: + time.sleep(5) + cleanup_directory(response.id) + + +@pytest.fixture(scope="function") +def test_prompt_config() -> dict[str, Any]: + return { + "provider": "openai", + "model": "gpt-4o-mini", + "temperature": 0.5, + "template": [ + { + "role": "system", + "content": "You are a helpful assistant. 
You must answer the user's question truthfully and at the level of a 5th grader.", + }, + { + "role": "user", + "content": "{{question}}", + }, + ], + } + + +@pytest.fixture(scope="function") +def eval_dataset( + get_humanloop_client: GetHumanloopClientFn, sdk_test_dir: str +) -> Generator[ResourceIdentifiers, None, None]: + humanloop_client = get_humanloop_client() + dataset_path = f"{sdk_test_dir}/eval_dataset" + try: + response = humanloop_client.datasets.upsert( + path=dataset_path, + datapoints=[ + { + "inputs": { + "question": "What is the capital of the France?", + }, + }, + { + "inputs": { + "question": "What is the capital of the Germany?", + }, + }, + { + "inputs": { + "question": "What is 2+2?", + }, + }, + ], + ) + yield ResourceIdentifiers(file_id=response.id, file_path=response.path) + humanloop_client.datasets.delete(id=response.id) + except Exception as e: + pytest.fail(f"Failed to create dataset {dataset_path}: {e}") + + +@pytest.fixture(scope="function") +def eval_prompt( + get_humanloop_client: GetHumanloopClientFn, sdk_test_dir: str, openai_key: str, test_prompt_config: dict[str, Any] +) -> Generator[ResourceIdentifiers, None, None]: + humanloop_client = get_humanloop_client() + prompt_path = f"{sdk_test_dir}/eval_prompt" + try: + response = humanloop_client.prompts.upsert( + path=prompt_path, + **test_prompt_config, + ) + yield ResourceIdentifiers(file_id=response.id, file_path=response.path) + humanloop_client.prompts.delete(id=response.id) + except Exception as e: + pytest.fail(f"Failed to create prompt {prompt_path}: {e}") + + +@pytest.fixture(scope="function") +def output_not_null_evaluator( + get_humanloop_client: GetHumanloopClientFn, sdk_test_dir: str +) -> Generator[ResourceIdentifiers, None, None]: + humanloop_client = get_humanloop_client() + evaluator_path = f"{sdk_test_dir}/output_not_null_evaluator" + try: + response = humanloop_client.evaluators.upsert( + path=evaluator_path, + spec={ + "arguments_type": "target_required", + "return_type": "boolean", + "code": """ +def output_not_null(log: dict) -> bool: + return log["output"] is not None + """, + "evaluator_type": "python", + }, + ) + yield ResourceIdentifiers(file_id=response.id, file_path=response.path) + humanloop_client.evaluators.delete(id=response.id) + except Exception as e: + pytest.fail(f"Failed to create evaluator {evaluator_path}: {e}") + + +@pytest.fixture(scope="function") +def id_for_staging_environment(get_humanloop_client: GetHumanloopClientFn, eval_prompt: ResourceIdentifiers) -> str: + humanloop_client = get_humanloop_client() + response = humanloop_client.prompts.list_environments(id=eval_prompt.file_id) + for environment in response: + if environment.name == "staging": + return environment.id + pytest.fail("Staging environment not found") + + +@pytest.fixture +def syncable_files_fixture( + get_humanloop_client: GetHumanloopClientFn, + sdk_test_dir: str, +) -> Generator[list[SyncableFile], None, None]: + """Creates a predefined structure of files in Humanloop for testing sync.""" + files: List[SyncableFile] = [ + SyncableFile( + path="prompts/gpt-4", + type="prompt", + model="gpt-4", + ), + SyncableFile( + path="prompts/gpt-4o", + type="prompt", + model="gpt-4o", + ), + SyncableFile( + path="prompts/nested/complex/gpt-4o", + type="prompt", + model="gpt-4o", + ), + SyncableFile( + path="agents/gpt-4", + type="agent", + model="gpt-4", + ), + SyncableFile( + path="agents/gpt-4o", + type="agent", + model="gpt-4o", + ), + ] + + humanloop_client = get_humanloop_client() + created_files = [] 
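+    # Upsert each file under the per-test directory and keep the returned
+    # id/version_id so tests can assert against the created remote files.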
+ for file in files: + full_path = f"{sdk_test_dir}/{file.path}" + response: Union[AgentResponse, PromptResponse] + if file.type == "prompt": + response = humanloop_client.prompts.upsert( + path=full_path, + model=file.model, + ) + elif file.type == "agent": + response = humanloop_client.agents.upsert( + path=full_path, + model=file.model, + ) + created_files.append( + SyncableFile( + path=full_path, type=file.type, model=file.model, id=response.id, version_id=response.version_id + ) + ) + + yield created_files + + +@pytest.fixture +def cli_runner() -> CliRunner: + """GIVEN a CLI runner + THEN it should be configured to catch exceptions + """ + return CliRunner(mix_stderr=False) + + +@pytest.fixture +def no_humanloop_api_key_in_env(monkeypatch): + """Fixture that removes HUMANLOOP_API_KEY from environment variables. + + Use this fixture in tests that verify behavior when no API key is available + in the environment (but could still be loaded from .env files). + """ + # Remove API key from environment + monkeypatch.delenv("HUMANLOOP_API_KEY", raising=False) + yield diff --git a/tests/custom/integration/test_decorators.py b/tests/custom/integration/test_decorators.py new file mode 100644 index 00000000..15057ba2 --- /dev/null +++ b/tests/custom/integration/test_decorators.py @@ -0,0 +1,153 @@ +import time +from typing import Any + +from openai import OpenAI +from tests.custom.integration.conftest import GetHumanloopClientFn + + +def test_prompt_decorator( + get_humanloop_client: GetHumanloopClientFn, + sdk_test_dir: str, + test_prompt_config: dict[str, Any], + openai_key: str, +): + try: + humanloop_client = get_humanloop_client() + prompt_path = f"{sdk_test_dir}/test_prompt" + prompt_response = humanloop_client.prompts.upsert( + path=prompt_path, + **test_prompt_config, + ) + + prompt_versions_response = humanloop_client.prompts.list_versions(id=prompt_response.id) + assert len(prompt_versions_response.records) == 1 + + @humanloop_client.prompt(path=prompt_path) + def my_prompt(question: str) -> str: + openai_client = OpenAI(api_key=openai_key) + + response = openai_client.chat.completions.create( + model="gpt-4o-mini", + messages=[{"role": "user", "content": question}], + ) + + assert response.choices[0].message.content is not None + return response.choices[0].message.content + + assert "paris" in my_prompt("What is the capital of the France?").lower() + + time.sleep(5) + prompt_versions_response = humanloop_client.prompts.list_versions(id=prompt_response.id) + assert len(prompt_versions_response.records) == 2 + + logs_response = humanloop_client.logs.list(file_id=prompt_response.id, page=1, size=50) + + assert logs_response.items is not None and len(logs_response.items) == 1 + finally: + humanloop_client.prompts.delete(id=prompt_response.id) + + +def test_call_prompt_in_flow_decorator( + get_humanloop_client: GetHumanloopClientFn, + sdk_test_dir: str, + openai_key: str, +): + try: + humanloop_client = get_humanloop_client() + + @humanloop_client.flow(path=f"{sdk_test_dir}/test_flow") + def my_flow(question: str) -> str: + response = humanloop_client.prompts.call( + path=f"{sdk_test_dir}/test_prompt", + prompt={ + "provider": "openai", + "model": "gpt-4o-mini", + "temperature": 0, + }, + messages=[{"role": "user", "content": question}], + provider_api_keys={"openai": openai_key}, + ) + + assert response.logs[0].output is not None + return response.logs[0].output + + assert "paris" in my_flow("What is the capital of the France?").lower() + time.sleep(5) + prompt_response = 
humanloop_client.files.retrieve_by_path(path=f"{sdk_test_dir}/test_prompt") + assert prompt_response is not None + prompt_logs_response = humanloop_client.logs.list(file_id=prompt_response.id, page=1, size=50) + assert prompt_logs_response.items is not None and len(prompt_logs_response.items) == 1 + prompt_log = prompt_logs_response.items[0] + + flow_response = humanloop_client.files.retrieve_by_path(path=f"{sdk_test_dir}/test_flow") + assert flow_response is not None + flow_logs_response = humanloop_client.logs.list(file_id=flow_response.id, page=1, size=50) + assert flow_logs_response.items is not None and len(flow_logs_response.items) == 1 + flow_log = flow_logs_response.items[0] + assert prompt_log.trace_parent_id == flow_log.id + finally: + flow_response = humanloop_client.files.retrieve_by_path(path=f"{sdk_test_dir}/test_flow") + if flow_response is not None: + humanloop_client.flows.delete(id=flow_response.id) + prompt_response = humanloop_client.files.retrieve_by_path(path=f"{sdk_test_dir}/test_prompt") + if prompt_response is not None: + humanloop_client.prompts.delete(id=prompt_response.id) + + +def test_flow_decorator_logs_exceptions( + get_humanloop_client: GetHumanloopClientFn, + sdk_test_dir: str, +): + try: + humanloop_client = get_humanloop_client() + + @humanloop_client.flow(path=f"{sdk_test_dir}/test_flow_log_error") + def my_flow(question: str) -> str: + raise ValueError("This is a test exception") + + my_flow("test") + + time.sleep(5) + + flow_response = humanloop_client.files.retrieve_by_path(path=f"{sdk_test_dir}/test_flow_log_error") + assert flow_response is not None + flow_logs_response = humanloop_client.logs.list(file_id=flow_response.id, page=1, size=50) + assert flow_logs_response.items is not None and len(flow_logs_response.items) == 1 + flow_log = flow_logs_response.items[0] + assert flow_log.error is not None + assert flow_log.output is None + + finally: + flow_response = humanloop_client.files.retrieve_by_path(path=f"{sdk_test_dir}/test_flow_log_error") + if flow_response is not None: + humanloop_client.flows.delete(id=flow_response.id) + + +def test_flow_decorator_populates_output_message( + get_humanloop_client: GetHumanloopClientFn, + sdk_test_dir: str, +): + try: + humanloop_client = get_humanloop_client() + + @humanloop_client.flow(path=f"{sdk_test_dir}/test_flow_log_output_message") + def my_flow(question: str) -> dict[str, Any]: + return {"role": "user", "content": question} + + assert "france" in my_flow("What is the capital of the France?")["content"].lower() + + time.sleep(5) + + flow_response = humanloop_client.files.retrieve_by_path(path=f"{sdk_test_dir}/test_flow_log_output_message") + assert flow_response is not None + flow_logs_response = humanloop_client.logs.list(file_id=flow_response.id, page=1, size=50) + assert flow_logs_response.items is not None and len(flow_logs_response.items) == 1 + flow_log = flow_logs_response.items[0] + assert flow_log.output_message is not None + assert flow_log.output is None + assert flow_log.error is None + + finally: + flow_response = humanloop_client.files.retrieve_by_path(path=f"{sdk_test_dir}/test_flow_log_output_message") + if flow_response is not None: + humanloop_client.flows.delete(id=flow_response.id) diff --git a/tests/custom/integration/test_evals.py b/tests/custom/integration/test_evals.py new file mode 100644 index 00000000..2ec74d93 --- /dev/null +++ b/tests/custom/integration/test_evals.py @@ -0,0 +1,411 @@ +import time +from typing import Any + +import pytest +from humanloop.error import 
HumanloopRuntimeError +from tests.custom.integration.conftest import ResourceIdentifiers +from tests.custom.types import GetHumanloopClientFn + + +def test_eval_run_works_on_online_files( + get_humanloop_client: GetHumanloopClientFn, + output_not_null_evaluator: ResourceIdentifiers, + eval_dataset: ResourceIdentifiers, + eval_prompt: ResourceIdentifiers, +) -> None: + humanloop_client = get_humanloop_client() + humanloop_client.evaluations.run( # type: ignore [attr-defined] + name="test_eval_run", + file={ + "path": eval_prompt.file_path, + "type": "prompt", + }, + dataset={ + "path": eval_dataset.file_path, + }, + evaluators=[ + { + "path": output_not_null_evaluator.file_path, + } + ], + ) + time.sleep(5) + response = humanloop_client.evaluations.list(file_id=eval_prompt.file_id) + assert response.items and len(response.items) == 1 + evaluation_id = response.items[0].id + run_evaluation_response = humanloop_client.evaluations.list_runs_for_evaluation(id=evaluation_id) # type: ignore [attr-defined] + assert run_evaluation_response.runs[0].status == "completed" + + +def test_eval_run_version_id( + get_humanloop_client: GetHumanloopClientFn, + output_not_null_evaluator: ResourceIdentifiers, + eval_dataset: ResourceIdentifiers, + eval_prompt: ResourceIdentifiers, + test_prompt_config: dict[str, Any], +) -> None: + humanloop_client = get_humanloop_client() + # GIVEN a prompt where a non-default version is created + new_test_prompt_config = test_prompt_config.copy() + new_test_prompt_config["temperature"] = 1 + new_prompt_version_response = humanloop_client.prompts.upsert( + path=eval_prompt.file_path, + **new_test_prompt_config, + ) + # WHEN creating an evaluation using version_id + humanloop_client.evaluations.run( # type: ignore [attr-defined] + name="test_eval_run", + file={ + "id": new_prompt_version_response.id, + "version_id": new_prompt_version_response.version_id, + "type": "prompt", + }, + dataset={ + "path": eval_dataset.file_path, + }, + evaluators=[ + { + "path": output_not_null_evaluator.file_path, + } + ], + ) + # THEN we evaluate the version created in the test + evaluations_response = humanloop_client.evaluations.list(file_id=new_prompt_version_response.id) + assert evaluations_response.items and len(evaluations_response.items) == 1 + evaluation_id = evaluations_response.items[0].id + runs_response = humanloop_client.evaluations.list_runs_for_evaluation(id=evaluation_id) + assert runs_response.runs[0].status == "completed" + assert ( + runs_response.runs[0].version + and runs_response.runs[0].version.version_id == new_prompt_version_response.version_id + ) + list_versions_response = humanloop_client.prompts.list_versions(id=new_prompt_version_response.id) + assert list_versions_response.records and len(list_versions_response.records) == 2 + # THEN the version used in evaluation is not the default version + response = humanloop_client.prompts.get(id=new_prompt_version_response.id) + assert response.version_id != new_prompt_version_response.version_id + + +def test_eval_run_environment( + get_humanloop_client: GetHumanloopClientFn, + output_not_null_evaluator: ResourceIdentifiers, + eval_dataset: ResourceIdentifiers, + eval_prompt: ResourceIdentifiers, + test_prompt_config: dict[str, Any], + id_for_staging_environment: str, +) -> None: + humanloop_client = get_humanloop_client() + # GIVEN a prompt deployed to staging environment + new_test_prompt_config = test_prompt_config.copy() + new_test_prompt_config["temperature"] = 1 + new_prompt_version_response = 
humanloop_client.prompts.upsert( + path=eval_prompt.file_path, + **new_test_prompt_config, + ) + humanloop_client.prompts.set_deployment( + id=new_prompt_version_response.id, + environment_id=id_for_staging_environment, + version_id=new_prompt_version_response.version_id, + ) + # WHEN creating an evaluation using environment + humanloop_client.evaluations.run( # type: ignore [attr-defined] + name="test_eval_run", + file={ + "id": new_prompt_version_response.id, + "type": "prompt", + "environment": "staging", + }, + dataset={ + "path": eval_dataset.file_path, + }, + evaluators=[ + { + "path": output_not_null_evaluator.file_path, + } + ], + ) + # THEN evaluation is done with the version deployed to staging environment + evaluations_response = humanloop_client.evaluations.list(file_id=new_prompt_version_response.id) + assert evaluations_response.items and len(evaluations_response.items) == 1 + evaluation_id = evaluations_response.items[0].id + runs_response = humanloop_client.evaluations.list_runs_for_evaluation(id=evaluation_id) + assert runs_response.runs[0].status == "completed" + assert ( + runs_response.runs[0].version + and runs_response.runs[0].version.version_id == new_prompt_version_response.version_id + ) + default_prompt_version_response = humanloop_client.prompts.get(id=new_prompt_version_response.id) + assert default_prompt_version_response.version_id != new_prompt_version_response.version_id + + +@pytest.mark.parametrize("version_lookup", ["version_id", "environment"]) +def test_eval_run_version_lookup_fails_with_path( + get_humanloop_client: GetHumanloopClientFn, + eval_prompt: ResourceIdentifiers, + eval_dataset: ResourceIdentifiers, + output_not_null_evaluator: ResourceIdentifiers, + version_lookup: str, +): + # GIVEN an eval run where we try to evaluate a non-default version + with pytest.raises(HumanloopRuntimeError) as e: + humanloop_client = get_humanloop_client() + humanloop_client.evaluations.run( # type: ignore [attr-defined] + name="test_eval_run", + file={ + "path": eval_prompt.file_path, + "type": "prompt", + # WHEN the File id is not passed in file + version_lookup: "will_not_work", + }, + dataset={ + "path": eval_dataset.file_path, + }, + evaluators=[ + { + "path": output_not_null_evaluator.file_path, + } + ], + ) + # THEN an error is raised + assert "You must provide the `file.id` when addressing a file by version ID or environment" in str(e.value) + + +def test_eval_run_with_version_upsert( + get_humanloop_client: GetHumanloopClientFn, + eval_prompt: ResourceIdentifiers, + eval_dataset: ResourceIdentifiers, + output_not_null_evaluator: ResourceIdentifiers, + test_prompt_config: dict[str, Any], +): + humanloop_client = get_humanloop_client() + humanloop_client.evaluations.run( # type: ignore [attr-defined] + name="test_eval_run", + file={ + "path": eval_prompt.file_path, + "type": "prompt", + "version": { + **test_prompt_config, + "temperature": 1, + }, + }, + dataset={ + "path": eval_dataset.file_path, + }, + evaluators=[ + { + "path": output_not_null_evaluator.file_path, + } + ], + ) + # THEN the version is upserted and evaluation finishes successfully + evaluations_response = humanloop_client.evaluations.list(file_id=eval_prompt.file_id) + assert evaluations_response.items and len(evaluations_response.items) == 1 + evaluation_id = evaluations_response.items[0].id + runs_response = humanloop_client.evaluations.list_runs_for_evaluation(id=evaluation_id) + assert runs_response.runs[0].status == "completed" + # THEN a version was upserted based on file.version + 
list_prompt_versions_response = humanloop_client.prompts.list_versions(id=eval_prompt.file_id) + assert list_prompt_versions_response.records and len(list_prompt_versions_response.records) == 2 + + +def test_flow_eval_does_not_work_without_callable( + get_humanloop_client: GetHumanloopClientFn, + eval_dataset: ResourceIdentifiers, + output_not_null_evaluator: ResourceIdentifiers, +): + with pytest.raises(HumanloopRuntimeError) as e: + humanloop_client = get_humanloop_client() + humanloop_client.evaluations.run( # type: ignore [attr-defined] + name="test_eval_run", + file={ + "path": "Test Flow", + "type": "flow", + "version": { + "attributes": { + "foo": "bar", + } + }, + }, + dataset={ + "path": eval_dataset.file_path, + }, + evaluators=[ + { + "path": output_not_null_evaluator.file_path, + } + ], + ) + # THEN an error is raised + assert "You must provide a `callable` for your Flow `file` to run a local eval." in str(e.value) + + +def test_flow_eval_works_with_callable( + get_humanloop_client: GetHumanloopClientFn, + eval_dataset: ResourceIdentifiers, + output_not_null_evaluator: ResourceIdentifiers, + sdk_test_dir: str, +): + humanloop_client = get_humanloop_client() + flow_path = f"{sdk_test_dir}/Test Flow" + # GIVEN a flow with a callable + flow_response = humanloop_client.flows.upsert( + path=flow_path, + attributes={ + "foo": "bar", + }, + ) + try: + flow = humanloop_client.flows.upsert( + path=flow_path, + attributes={ + "foo": "bar", + }, + ) + # WHEN we run an evaluation with the flow + humanloop_client.evaluations.run( # type: ignore [attr-defined] + name="test_eval_run", + file={ + "id": flow.id, + "type": "flow", + "callable": lambda question: "bar", + }, + dataset={ + "path": eval_dataset.file_path, + }, + evaluators=[ + { + "path": output_not_null_evaluator.file_path, + } + ], + ) + # THEN the evaluation finishes successfully + evaluations_response = humanloop_client.evaluations.list(file_id=flow.id) + assert evaluations_response.items and len(evaluations_response.items) == 1 + evaluation_id = evaluations_response.items[0].id + runs_response = humanloop_client.evaluations.list_runs_for_evaluation(id=evaluation_id) + assert runs_response.runs[0].status == "completed" + finally: + humanloop_client.flows.delete(id=flow_response.id) + + +def test_cannot_evaluate_agent_with_callable( + get_humanloop_client: GetHumanloopClientFn, + eval_dataset: ResourceIdentifiers, + output_not_null_evaluator: ResourceIdentifiers, +): + with pytest.raises(ValueError) as e: + humanloop_client = get_humanloop_client() + humanloop_client.evaluations.run( # type: ignore [attr-defined] + name="test_eval_run", + file={ + "path": "Test Agent", + "type": "agent", + "callable": lambda question: "bar", + }, + dataset={ + "path": eval_dataset.file_path, + }, + evaluators=[ + { + "path": output_not_null_evaluator.file_path, + } + ], + ) + assert str(e.value) == "Agent evaluation is only possible on the Humanloop runtime, do not provide a `callable`." 
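
The tests above all exercise the same `evaluations.run` call shape on the extended client: a `file` locator (by `path`, or by `id` together with `version_id`/`environment`, optionally with an inline `version` to upsert), a `dataset` locator, and a list of `evaluators`. A minimal sketch of that shape follows; the paths and API key are hypothetical placeholders, not resources created by this test suite, and the client here is assumed to be configured the same way as the fixtures configure it.

# Illustrative sketch only -- placeholder paths/key, not part of the patch.
from humanloop import Humanloop

humanloop_client = Humanloop(api_key="hl_api_key_placeholder")

humanloop_client.evaluations.run(
    name="example_eval_run",
    # Address the evaluated file by `path` (as in test_eval_run_works_on_online_files),
    # or by `id` plus `version_id`/`environment` (as in test_eval_run_version_id and
    # test_eval_run_environment), optionally with an inline `version` to upsert.
    file={
        "path": "example_dir/eval_prompt",
        "type": "prompt",
    },
    dataset={
        "path": "example_dir/eval_dataset",
    },
    evaluators=[
        {
            "path": "example_dir/output_not_null_evaluator",
        }
    ],
)
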
+ + +def test_flow_eval_resolves_to_default_with_callable( + get_humanloop_client: GetHumanloopClientFn, + output_not_null_evaluator: ResourceIdentifiers, + eval_dataset: ResourceIdentifiers, + sdk_test_dir: str, +) -> None: + humanloop_client = get_humanloop_client() + # GIVEN a flow with some attributes + flow_path = f"{sdk_test_dir}/Test Flow" + flow_response = humanloop_client.flows.upsert( + path=flow_path, + attributes={ + "foo": "bar", + }, + ) + try: + # WHEN running an evaluation with the flow's callable but no version + humanloop_client.evaluations.run( # type: ignore [attr-defined] + name="test_eval_run", + file={ + "id": flow_response.id, + "type": "flow", + "callable": lambda question: "It's complicated don't worry about it", + }, + dataset={ + "path": eval_dataset.file_path, + }, + evaluators=[ + { + "path": output_not_null_evaluator.file_path, + } + ], + ) + # THEN the evaluation finishes successfully + evaluations_response = humanloop_client.evaluations.list(file_id=flow_response.id) + assert evaluations_response.items and len(evaluations_response.items) == 1 + evaluation_id = evaluations_response.items and evaluations_response.items[0].id + runs_response = humanloop_client.evaluations.list_runs_for_evaluation(id=evaluation_id) # type: ignore [attr-defined, arg-type] + assert runs_response.runs[0].status == "completed" + finally: + # Clean up test resources + humanloop_client.flows.delete(id=flow_response.id) + + +def test_agent_eval_works_upserting( + get_humanloop_client: GetHumanloopClientFn, + eval_dataset: ResourceIdentifiers, + output_not_null_evaluator: ResourceIdentifiers, + sdk_test_dir: str, +): + humanloop_client = get_humanloop_client() + humanloop_client.evaluations.run( # type: ignore [attr-defined] + name="test_eval_run", + file={ + "path": f"{sdk_test_dir}/Test Agent", + "type": "agent", + "version": { + "model": "gpt-4o", + "template": [ + { + "role": "system", + "content": "You are a helpful assistant, offering very short answers.", + }, + { + "role": "user", + "content": "{{question}}", + }, + ], + "provider": "openai", + "temperature": 0, + "max_iterations": 5, + }, + }, + dataset={ + "path": eval_dataset.file_path, + }, + evaluators=[ + { + "path": output_not_null_evaluator.file_path, + } + ], + ) + files_response = humanloop_client.files.list_files(page=1, size=100) + eval_agent = None + for file in files_response.records: + if file.path == f"{sdk_test_dir}/Test Agent": + eval_agent = file + break + assert eval_agent and eval_agent.type == "agent" + # THEN the evaluation finishes successfully + evaluations_response = humanloop_client.evaluations.list(file_id=eval_agent.id) + assert evaluations_response.items and len(evaluations_response.items) == 1 + evaluation_id = evaluations_response.items[0].id + runs_response = humanloop_client.evaluations.list_runs_for_evaluation(id=evaluation_id) # type: ignore [attr-defined, arg-type] + assert runs_response.runs[0].status == "completed" diff --git a/tests/custom/integration/test_sync.py b/tests/custom/integration/test_sync.py new file mode 100644 index 00000000..6e7b002b --- /dev/null +++ b/tests/custom/integration/test_sync.py @@ -0,0 +1,206 @@ +from typing import List, Union +from pathlib import Path +import pytest +from humanloop import AgentResponse, PromptResponse +from humanloop.prompts.client import PromptsClient +from humanloop.agents.client import AgentsClient +from humanloop.error import HumanloopRuntimeError +from tests.custom.types import GetHumanloopClientFn, SyncableFile + + +@pytest.fixture +def 
cleanup_local_files(): + """Cleanup any locally synced files after tests""" + yield + local_dir = Path("humanloop") + if local_dir.exists(): + import shutil + + shutil.rmtree(local_dir) + + +def test_pull_basic( + syncable_files_fixture: List[SyncableFile], + get_humanloop_client: GetHumanloopClientFn, +): + """Test that humanloop.sync() correctly syncs remote files to local filesystem""" + # GIVEN a set of files in the remote system (from syncable_files_fixture) + humanloop_client = get_humanloop_client() + + # WHEN running the sync + humanloop_client.pull() + + # THEN our local filesystem should mirror the remote filesystem in the HL Workspace + for file in syncable_files_fixture: + extension = f".{file.type}" + local_path = Path("humanloop") / f"{file.path}{extension}" + + # THEN the file and its directory should exist + assert local_path.exists(), f"Expected synced file at {local_path}" + assert local_path.parent.exists(), f"Expected directory at {local_path.parent}" + + # THEN the file should not be empty + content = local_path.read_text() + assert content, f"File at {local_path} should not be empty" + + +def test_overload_with_local_files( + get_humanloop_client: GetHumanloopClientFn, + syncable_files_fixture: List[SyncableFile], +): + """Test that overload_with_local_files correctly handles local files.""" + # GIVEN a client with use_local_files=True and pulled files + humanloop_client = get_humanloop_client(use_local_files=True) + humanloop_client.pull() + + # GIVEN a test file from the structure + test_file = syncable_files_fixture[0] + extension = f".{test_file.type}" + local_path = Path("humanloop") / f"{test_file.path}{extension}" + + # THEN the file should exist locally + assert local_path.exists(), f"Expected pulled file at {local_path}" + assert local_path.parent.exists(), f"Expected directory at {local_path.parent}" + + # WHEN calling the file + response: Union[AgentResponse, PromptResponse] + if test_file.type == "prompt": + response = humanloop_client.prompts.call( # type: ignore [assignment] + path=test_file.path, messages=[{"role": "user", "content": "Testing"}] + ) + elif test_file.type == "agent": + response = humanloop_client.agents.call( # type: ignore [assignment] + path=test_file.path, messages=[{"role": "user", "content": "Testing"}] + ) + # THEN the response should not be None + assert response is not None + + # WHEN calling with an invalid path + # THEN it should raise HumanloopRuntimeError + with pytest.raises(HumanloopRuntimeError): + sub_client: Union[PromptsClient, AgentsClient] + match test_file.type: + case "prompt": + sub_client = humanloop_client.prompts + case "agent": + sub_client = humanloop_client.agents + case _: + raise ValueError(f"Invalid file type: {test_file.type}") + sub_client.call(path="invalid/path") + + +def test_overload_log_with_local_files( + get_humanloop_client: GetHumanloopClientFn, + syncable_files_fixture: List[SyncableFile], + sdk_test_dir: str, +): + """Test that overload_with_local_files correctly handles local files for log operations.""" + # GIVEN a client with use_local_files=True and pulled files + humanloop_client = get_humanloop_client(use_local_files=True) + humanloop_client.pull() + + # GIVEN a test file from the structure + test_file = syncable_files_fixture[0] + extension = f".{test_file.type}" + local_path = Path("humanloop") / f"{test_file.path}{extension}" + + # THEN the file should exist locally + assert local_path.exists(), f"Expected pulled file at {local_path}" + assert local_path.parent.exists(), f"Expected 
directory at {local_path.parent}" + + # WHEN logging with the pulled file + if test_file.type == "prompt": + response = humanloop_client.prompts.log( # type: ignore [assignment] + path=test_file.path, messages=[{"role": "user", "content": "Testing"}], output="Test response" + ) + elif test_file.type == "agent": + response = humanloop_client.agents.log( # type: ignore [assignment] + path=test_file.path, messages=[{"role": "user", "content": "Testing"}], output="Test response" + ) + # THEN the response should not be None + assert response is not None + + # WHEN logging with an invalid path + # THEN it should raise HumanloopRuntimeError + with pytest.raises(HumanloopRuntimeError): + if test_file.type == "prompt": + humanloop_client.prompts.log( + path=f"{sdk_test_dir}/invalid/path", + messages=[{"role": "user", "content": "Testing"}], + output="Test response", + ) + elif test_file.type == "agent": + humanloop_client.agents.log( + path=f"{sdk_test_dir}/invalid/path", + messages=[{"role": "user", "content": "Testing"}], + output="Test response", + ) + + +def test_overload_version_environment_handling( + get_humanloop_client: GetHumanloopClientFn, + syncable_files_fixture: List[SyncableFile], +): + """Test that overload_with_local_files correctly handles version_id and environment parameters.""" + # GIVEN a client with use_local_files=True and pulled files + humanloop_client = get_humanloop_client(use_local_files=True) + humanloop_client.pull() + + # GIVEN a test file from the structure + test_file = syncable_files_fixture[0] + extension = f".{test_file.type}" + local_path = Path("humanloop") / f"{test_file.path}{extension}" + + # THEN the file should exist locally + assert local_path.exists(), f"Expected pulled file at {local_path}" + assert local_path.parent.exists(), f"Expected directory at {local_path.parent}" + + # WHEN calling with version_id + # THEN it should raise HumanloopRuntimeError + with pytest.raises(HumanloopRuntimeError, match="Cannot use local file.*version_id or environment was specified"): + if test_file.type == "prompt": + humanloop_client.prompts.call( + path=test_file.path, + version_id=test_file.version_id, + messages=[{"role": "user", "content": "Testing"}], + ) + elif test_file.type == "agent": + humanloop_client.agents.call( + path=test_file.path, + version_id=test_file.version_id, + messages=[{"role": "user", "content": "Testing"}], + ) + + # WHEN calling with environment + # THEN it should raise HumanloopRuntimeError + with pytest.raises(HumanloopRuntimeError, match="Cannot use local file.*version_id or environment was specified"): + if test_file.type == "prompt": + humanloop_client.prompts.call( + path=test_file.path, + environment="production", + messages=[{"role": "user", "content": "Testing"}], + ) + elif test_file.type == "agent": + humanloop_client.agents.call( + path=test_file.path, + environment="production", + messages=[{"role": "user", "content": "Testing"}], + ) + + # WHEN calling with both version_id and environment + # THEN it should raise HumanloopRuntimeError + with pytest.raises(HumanloopRuntimeError, match="Cannot use local file.*version_id or environment was specified"): + if test_file.type == "prompt": + humanloop_client.prompts.call( + path=test_file.path, + version_id=test_file.version_id, + environment="staging", + messages=[{"role": "user", "content": "Testing"}], + ) + elif test_file.type == "agent": + humanloop_client.agents.call( + path=test_file.path, + version_id=test_file.version_id, + environment="staging", + messages=[{"role": "user", 
"content": "Testing"}], + ) diff --git a/tests/custom/integration/test_sync_cli.py b/tests/custom/integration/test_sync_cli.py new file mode 100644 index 00000000..3957aed2 --- /dev/null +++ b/tests/custom/integration/test_sync_cli.py @@ -0,0 +1,179 @@ +from pathlib import Path +from unittest import mock +import pytest +from click.testing import CliRunner +from humanloop.cli.__main__ import cli +from tests.custom.types import SyncableFile + + +@pytest.fixture +def no_env_file_loading(): + """Fixture that prevents loading API keys from any .env files. + + Use this fixture in tests that verify behavior when no .env files should + be processed, regardless of whether they exist or not. + """ + # Prevent any .env file from being loaded + with mock.patch("humanloop.cli.__main__.load_dotenv", lambda *args, **kwargs: None): + yield + + +def test_pull_without_api_key(cli_runner: CliRunner, no_humanloop_api_key_in_env, no_env_file_loading): + """GIVEN no API key in environment + WHEN running pull command + THEN it should fail with appropriate error message + """ + # WHEN running pull command + result = cli_runner.invoke(cli, ["pull", "--local-files-directory", "humanloop"]) + + # THEN it should fail with appropriate error message + assert result.exit_code == 1 # Our custom error code for API key issues + assert "No API key found" in result.output + assert "Set HUMANLOOP_API_KEY in .env file or environment" in result.output + + +def test_pull_basic( + cli_runner: CliRunner, + syncable_files_fixture: list[SyncableFile], + tmp_path: Path, # this path is used as a temporary store for files locally +): + # GIVEN a base directory for pulled files + base_dir = str(tmp_path / "humanloop") + + # WHEN running pull command + result = cli_runner.invoke(cli, ["pull", "--local-files-directory", base_dir, "--verbose"]) + + # THEN it should succeed + assert result.exit_code == 0 + assert "Pulling files from Humanloop..." 
in result.output + assert "Pull completed" in result.output + + # THEN the files should exist locally + for file in syncable_files_fixture: + extension = f".{file.type}" + local_path = Path(base_dir) / f"{file.path}{extension}" + assert local_path.exists(), f"Expected synced file at {local_path}" + assert local_path.parent.exists(), f"Expected directory at {local_path.parent}" + assert local_path.read_text(), f"File at {local_path} should not be empty" + + +def test_pull_with_specific_path( + cli_runner: CliRunner, + syncable_files_fixture: list[SyncableFile], + tmp_path: Path, +): + """GIVEN a specific path to pull + WHEN running pull command with path + THEN it should pull only files from that path + """ + # GIVEN a base directory and specific path + base_dir = str(tmp_path / "humanloop") + test_path = syncable_files_fixture[ + 0 + ].path.split( + "/" + )[ + 0 + ] # Retrieve the prefix of the first file's path which corresponds to the sdk_test_dir used within syncable_files_fixture + + # WHEN running pull command with path + result = cli_runner.invoke(cli, ["pull", "--local-files-directory", base_dir, "--path", test_path, "--verbose"]) + + # THEN it should succeed and show the path + assert result.exit_code == 0 + assert f"Path: {test_path}" in result.output + + # THEN only files from that path should exist locally + for file in syncable_files_fixture: + extension = f".{file.type}" + local_path = Path(base_dir) / f"{file.path}{extension}" + if file.path.startswith(test_path): + assert local_path.exists(), f"Expected synced file at {local_path}" + else: + assert not local_path.exists(), f"Unexpected file at {local_path}" + + +def test_pull_with_environment( + cli_runner: CliRunner, + syncable_files_fixture: list[SyncableFile], + tmp_path: Path, +): + # GIVEN a base directory and environment + base_dir = str(tmp_path / "humanloop") + environment = "staging" + + # WHEN running pull command with environment + result = cli_runner.invoke( + cli, + [ + "pull", + "--local-files-directory", + base_dir, + "--environment", + environment, + "--verbose", + ], + ) + + # THEN it should succeed and show the environment + assert result.exit_code == 0 + assert f"Environment: {environment}" in result.output + + +def test_pull_with_quiet_mode( + cli_runner: CliRunner, + syncable_files_fixture: list[SyncableFile], + tmp_path: Path, +): + # GIVEN a base directory and quiet mode + base_dir = str(tmp_path / "humanloop") + + # WHEN running pull command with quiet mode + result = cli_runner.invoke(cli, ["pull", "--local-files-directory", base_dir, "--quiet"]) + + # THEN it should succeed but not show file list + assert result.exit_code == 0 + assert "Successfully pulled" not in result.output + + # THEN files should still be pulled + for file in syncable_files_fixture: + extension = f".{file.type}" + local_path = Path(base_dir) / f"{file.path}{extension}" + assert local_path.exists(), f"Expected synced file at {local_path}" + + +def test_pull_with_invalid_path( + cli_runner: CliRunner, +): + # GIVEN an invalid base directory + path = "nonexistent/path" + + # WHEN running pull command + result = cli_runner.invoke(cli, ["pull", "--path", path]) + + # THEN it should fail + assert result.exit_code == 1 + assert "Error" in result.output + + +def test_pull_with_invalid_environment(cli_runner: CliRunner, tmp_path: Path): + # GIVEN an invalid environment + environment = "nonexistent" + base_dir = str(tmp_path / "humanloop") + + # WHEN running pull command + result = cli_runner.invoke( + cli, + [ + "pull", + 
"--local-files-directory", + base_dir, + "--environment", + environment, + "--verbose", + ], + ) + + # THEN it should fail + assert result.exit_code == 1 + assert "Error" in result.output diff --git a/tests/custom/otel/__init__.py b/tests/custom/otel/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/custom/otel/test_helpers.py b/tests/custom/otel/test_helpers.py new file mode 100644 index 00000000..3bd5ce45 --- /dev/null +++ b/tests/custom/otel/test_helpers.py @@ -0,0 +1,172 @@ +import pytest +from humanloop.otel.helpers import read_from_opentelemetry_span, write_to_opentelemetry_span +from opentelemetry.sdk.trace import Span + + +def test_read_empty(test_span: Span): + with pytest.raises(TypeError): + assert read_from_opentelemetry_span(test_span) == {} + + +def test_read_non_existent_key(test_span: Span): + with pytest.raises(TypeError): + assert read_from_opentelemetry_span(test_span, "key") == {} + write_to_opentelemetry_span(test_span, {"x": 7, "y": "foo"}, key="key") + # NOTE: attributes cannot be None at this point + assert dict(test_span.attributes) == { # type: ignore + "key.x": 7, + "key.y": "foo", + } + with pytest.raises(TypeError): + assert read_from_opentelemetry_span(test_span, "key.z") is None + + +def test_simple_dict(test_span: Span): + write_to_opentelemetry_span(test_span, {"x": 7, "y": "foo"}, "key") + # NOTE: attributes cannot be None at this point + assert dict(test_span.attributes) == { # type: ignore + "key.x": 7, + "key.y": "foo", + } + assert read_from_opentelemetry_span(test_span, "key") == {"x": 7, "y": "foo"} + + +def test_no_prefix(test_span: Span): + write_to_opentelemetry_span(test_span, {"x": 7, "y": "foo"}) + # NOTE: attributes cannot be None at this point + assert dict(test_span.attributes) == { # type: ignore + "x": 7, + "y": "foo", + } + assert read_from_opentelemetry_span(test_span) == {"x": 7, "y": "foo"} + + +def test_nested_object(test_span: Span): + write_to_opentelemetry_span(test_span, {"x": 7, "y": {"z": "foo"}}, "key") + # NOTE: attributes cannot be None at this point + assert dict(test_span.attributes) == { # type: ignore + "key.x": 7, + "key.y.z": "foo", + } + assert read_from_opentelemetry_span(test_span, "key") == {"x": 7, "y": {"z": "foo"}} + + +def test_list(test_span: Span): + write_to_opentelemetry_span( + test_span, + [{"x": 7, "y": "foo"}, {"z": "bar"}], # type: ignore + "key", + ) # type: ignore + # NOTE: attributes cannot be None at this point + assert dict(test_span.attributes) == { # type: ignore + "key.0.x": 7, + "key.0.y": "foo", + "key.1.z": "bar", + } + assert read_from_opentelemetry_span(test_span, "key") == [ + {"z": "bar"}, + {"x": 7, "y": "foo"}, + ] + + +def test_list_no_prefix(test_span: Span): + write_to_opentelemetry_span( + test_span, + [{"x": 7, "y": "foo"}, {"z": "bar"}], # type: ignore + ) + # NOTE: attributes cannot be None at this point + assert dict(test_span.attributes) == { # type: ignore + "0.x": 7, + "0.y": "foo", + "1.z": "bar", + } + assert read_from_opentelemetry_span(test_span) == [ + {"z": "bar"}, + {"x": 7, "y": "foo"}, + ] + + +def test_multiple_nestings(test_span: Span): + write_to_opentelemetry_span( + test_span, + [ + {"x": 7, "y": "foo"}, + [{"z": "bar"}, {"a": 42}], + ], # type: ignore + "key", + ) + assert dict(test_span.attributes) == { # type: ignore + "key.0.x": 7, + "key.0.y": "foo", + "key.1.0.z": "bar", + "key.1.1.a": 42, + } + assert read_from_opentelemetry_span(test_span, "key") == [ + [ + {"a": 42}, + {"z": "bar"}, + ], + {"x": 7, "y": "foo"}, + ] + + +def 
test_read_mixed_numeric_string_keys(test_span: Span): + test_span.set_attributes( + { + "key.0.x": 7, + "key.0.y": "foo", + "key.a.z": "bar", + "key.a.a": 42, + } + ) + assert read_from_opentelemetry_span(span=test_span, key="key") == { # type: ignore + "0": {"x": 7, "y": "foo"}, + "a": {"z": "bar", "a": 42}, + } + assert read_from_opentelemetry_span(span=test_span) == { # type: ignore + "key": { + "0": {"x": 7, "y": "foo"}, + "a": {"z": "bar", "a": 42}, + } + } + + +def test_sub_key_same_as_key(test_span: Span): + write_to_opentelemetry_span(test_span, {"key": 7}, "key") + # NOTE: attributes cannot be None at this point + assert dict(test_span.attributes) == { # type: ignore + "key.key": 7, + } + assert read_from_opentelemetry_span(test_span, "key") == {"key": 7} + + +def test_read_nested_key(test_span: Span): + test_span.set_attributes({"key.x": 7, "key.y.z": "foo"}) + assert read_from_opentelemetry_span(span=test_span, key="key.y") == {"z": "foo"} + + +def test_write_read_sub_key(test_span: Span): + write_to_opentelemetry_span(test_span, {"x": 7, "y": "foo"}, "key") + assert read_from_opentelemetry_span(test_span, "key.x") == 7 + assert read_from_opentelemetry_span(test_span, "key.y") == "foo" + assert read_from_opentelemetry_span(test_span, "key") == {"x": 7, "y": "foo"} + + +def test_write_drops_dict_all_null_values(test_span: Span): + # GIVEN a test_span to which a value with null values is written + # NOTE: mypy complains about None value in the dict, but it is intentionally under test + write_to_opentelemetry_span(test_span, {"x": None, "y": None}, "key") # type: ignore + # WHEN reading the value from the span + # THEN the value is not present in the span attributes + assert "key" not in test_span.attributes # type: ignore + with pytest.raises(TypeError): + assert read_from_opentelemetry_span(test_span, "key") == {} + + +def test_write_drops_null_value_from_dict(test_span: Span): + # GIVEN a test_span to which a dict with some null values are written + # NOTE: mypy complains about None value in the dict, but it is intentionally under test + write_to_opentelemetry_span(test_span, {"x": 2, "y": None}, "key") # type: ignore + # WHEN reading the values from the span + # THEN the value with null value is not present in the span attributes + assert read_from_opentelemetry_span(test_span, "key") == {"x": 2} diff --git a/tests/custom/sync/__init__.py b/tests/custom/sync/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/custom/sync/test_client.py b/tests/custom/sync/test_client.py new file mode 100644 index 00000000..a349fd0c --- /dev/null +++ b/tests/custom/sync/test_client.py @@ -0,0 +1,126 @@ +import logging +import pytest +from pathlib import Path +from unittest.mock import Mock, patch +from humanloop.sync.sync_client import SyncClient, SerializableFileType +from humanloop.error import HumanloopRuntimeError +from typing import Literal + + +@pytest.fixture +def mock_client() -> Mock: + return Mock() + + +@pytest.fixture +def sync_client(mock_client: Mock, tmp_path: Path) -> SyncClient: + return SyncClient( + client=mock_client, + base_dir=str(tmp_path), + cache_size=10, + log_level=logging.DEBUG, # DEBUG level for testing # noqa: F821 + ) + + +def test_init(sync_client: SyncClient, tmp_path: Path): + """Test basic initialization of SyncClient.""" + # GIVEN a SyncClient instance + # THEN it should be initialized with correct base directory, cache size and file types + assert sync_client.base_dir == tmp_path + assert sync_client._cache_size == 10 + assert 
sync_client.SERIALIZABLE_FILE_TYPES == frozenset(["prompt", "agent"]) + + +def test_normalize_path(sync_client: SyncClient): + """Test path normalization functionality.""" + # GIVEN various file paths with different formats + test_cases = [ + ("path/to/file.prompt", "path/to/file"), + ("path\\to\\file.agent", "path/to/file"), + ("trailing/slashes/file.agent/", "trailing/slashes/file"), + ("multiple//slashes//file.prompt", "multiple/slashes/file"), + ] + + for input_path, expected in test_cases: + # WHEN they are normalized + normalized = sync_client._normalize_path(input_path) + # THEN they should be converted to the expected format + assert normalized == expected + + # Test absolute path raises error + with pytest.raises(HumanloopRuntimeError, match="Absolute paths are not supported"): + sync_client._normalize_path("/leading/slashes/file.prompt") + + +def test_is_file(sync_client: SyncClient): + """Test file type detection.""" + # GIVEN various file paths + # WHEN checking if they are valid file types + # THEN only .prompt and .agent files should return True + assert sync_client.is_file("test.prompt") + assert sync_client.is_file("test.agent") + assert not sync_client.is_file("test.txt") + assert not sync_client.is_file("test") + + +def test_save_and_read_file(sync_client: SyncClient): + """Test saving and reading files.""" + # GIVEN a file content and path + content = "test content" + path = "test/path" + file_type: SerializableFileType = "prompt" + + # WHEN saving the file + sync_client._save_serialized_file(content, path, "prompt") + saved_path = sync_client.base_dir / path + saved_path = saved_path.parent / f"{saved_path.stem}.{file_type}" + + # THEN the file should exist on disk + assert saved_path.exists() + + # WHEN reading the file + read_content = sync_client.get_file_content(path, file_type) + + # THEN the content should match + assert read_content == content + + +def test_error_handling(sync_client: SyncClient): + """Test error handling in various scenarios.""" + # GIVEN a nonexistent file + # WHEN trying to read it + # THEN a HumanloopRuntimeError should be raised + with pytest.raises(HumanloopRuntimeError, match="Local file not found"): + sync_client.get_file_content("nonexistent", "prompt") + + # GIVEN an API error + # WHEN trying to pull a file + # THEN it should return False + with patch.object(sync_client.client.files, "retrieve_by_path", side_effect=Exception("API Error")): + assert not sync_client._pull_file("test.prompt") + + +def test_cache_functionality(sync_client: SyncClient): + """Test LRU cache functionality.""" + # GIVEN a test file + content = "test content" + path = "test/path" + file_type: Literal["prompt", "agent"] = "prompt" + sync_client._save_serialized_file(content, path, file_type) + + # WHEN reading the file for the first time + sync_client.get_file_content(path, file_type) + # THEN it should hit disk (implicitly verified by no cache hit) + + # WHEN modifying the file on disk + saved_path = sync_client.base_dir / f"{path}.{file_type}" + saved_path.write_text("modified content") + + # THEN subsequent reads should use cache + assert sync_client.get_file_content(path, file_type) == content + + # WHEN clearing the cache + sync_client.clear_cache() + + # THEN new content should be read from disk + assert sync_client.get_file_content(path, file_type) == "modified content" diff --git a/tests/custom/types.py b/tests/custom/types.py new file mode 100644 index 00000000..7a198456 --- /dev/null +++ b/tests/custom/types.py @@ -0,0 +1,15 @@ +from typing import 
Protocol, NamedTuple +from humanloop.client import Humanloop +from humanloop import FileType + + +class GetHumanloopClientFn(Protocol): + def __call__(self, use_local_files: bool = False) -> Humanloop: ... + + +class SyncableFile(NamedTuple): + path: str + type: FileType + model: str + id: str = "" + version_id: str = "" From 602b91212a633ef10194c01b3008abf85265d955 Mon Sep 17 00:00:00 2001 From: Ale Pouroullis Date: Tue, 13 May 2025 19:06:48 +0100 Subject: [PATCH 3/6] mypy + ruff formatting fixes + add prompt tests --- tests/assets/exact_match.py | 16 - tests/assets/levenshtein.py | 99 ----- tests/conftest.py | 278 ------------ tests/custom/conftest.py | 11 +- tests/custom/integration/conftest.py | 42 +- tests/custom/integration/test_decorators.py | 1 + tests/custom/integration/test_evals.py | 1 + .../{ => custom}/integration/test_prompts.py | 24 +- tests/custom/integration/test_sync.py | 6 +- tests/custom/integration/test_sync_cli.py | 2 + tests/custom/otel/test_helpers.py | 3 +- tests/custom/sync/test_client.py | 8 +- tests/custom/types.py | 5 +- tests/integration/__init__.py | 0 tests/integration/conftest.py | 169 -------- tests/integration/test_decorators.py | 154 ------- tests/integration/test_evals.py | 402 ------------------ tests/otel/__init__.py | 0 tests/otel/test_helpers.py | 172 -------- tests/utils/assets/models/__init__.py | 2 +- tests/utils/assets/models/circle.py | 1 + .../assets/models/object_with_defaults.py | 1 - .../models/object_with_optional_field.py | 8 +- tests/utils/assets/models/shape.py | 5 +- tests/utils/assets/models/square.py | 1 + .../assets/models/undiscriminated_shape.py | 1 + tests/utils/test_serialization.py | 4 +- 27 files changed, 85 insertions(+), 1331 deletions(-) delete mode 100644 tests/assets/exact_match.py delete mode 100644 tests/assets/levenshtein.py delete mode 100644 tests/conftest.py rename tests/{ => custom}/integration/test_prompts.py (60%) delete mode 100644 tests/integration/__init__.py delete mode 100644 tests/integration/conftest.py delete mode 100644 tests/integration/test_decorators.py delete mode 100644 tests/integration/test_evals.py delete mode 100644 tests/otel/__init__.py delete mode 100644 tests/otel/test_helpers.py diff --git a/tests/assets/exact_match.py b/tests/assets/exact_match.py deleted file mode 100644 index 583d742a..00000000 --- a/tests/assets/exact_match.py +++ /dev/null @@ -1,16 +0,0 @@ -def extract_answer(generation: str): - """Extracts answer from generation. - - Handles a generation that if separated by "---" with the answer being the first part. - Also handles a generation that starts with "```\n" and removes it. - """ - answer = generation.split("---")[0].strip() - if answer.startswith("```\n"): - answer = answer[4:].strip() - - return answer - - -def exact_match(log, testcase): - target = testcase["target"]["output"] - return target == extract_answer(log["output"]) diff --git a/tests/assets/levenshtein.py b/tests/assets/levenshtein.py deleted file mode 100644 index b2e279ae..00000000 --- a/tests/assets/levenshtein.py +++ /dev/null @@ -1,99 +0,0 @@ -def levenshtein_distance_optimized(s1, s2, max_distance=1000): - """ - Calculate the Levenshtein distance between two strings with optimizations and a maximum distance cap. - - This function trims common prefixes and suffixes from the input strings, uses a single-row table - to reduce space complexity, and stops the computation early if the Levenshtein distance is - guaranteed to exceed a maximum distance cap. - - Args: - s1 (str): The first string. 
- s2 (str): The second string. - max_distance (int, optional): The maximum Levenshtein distance. Defaults to 1000. - - Returns: - int: The Levenshtein distance between the two strings, or max_distance if the distance - exceeds max_distance. - """ - # Trim common prefixes - while s1 and s2 and s1[0] == s2[0]: - s1 = s1[1:] - s2 = s2[1:] - - # Trim common suffixes - while s1 and s2 and s1[-1] == s2[-1]: - s1 = s1[:-1] - s2 = s2[:-1] - - len_s1 = len(s1) - len_s2 = len(s2) - - # If the length difference between the strings exceeds max_distance, stop the computation - if abs(len_s1 - len_s2) > max_distance: - return max_distance - - # If one of the strings is empty, the distance is the length of the other string - if len_s1 == 0: - return min(len_s2, max_distance) - if len_s2 == 0: - return min(len_s1, max_distance) - - # Create a single-row table with len(s2) + 1 columns - distance = list(range(len_s2 + 1)) - - # Fill up the table - for i in range(1, len_s1 + 1): - # Store the value of the previous cell in the previous row - prev_row_cell = i - 1 - # The value at the first column is the row number - distance[0] = i - - # Initialize the minimum distance in the current row to max_distance - min_distance = max_distance - - for j in range(1, len_s2 + 1): - # Store the value of the current cell before it is updated - current_cell = distance[j] - - # If the current characters of the two strings are the same, the cost is 0, otherwise 1 - substitution_cost = 0 if s1[i - 1] == s2[j - 1] else 1 - - # The value at the current cell is the minimum of the values at the previous cell in the - # current row, the current cell in the previous row, and the previous cell in the previous row, - # plus the cost - distance[j] = min( - distance[j - 1] + 1, # deletion - distance[j] + 1, # insertion - prev_row_cell + substitution_cost, - ) # substitution - - # Update the minimum distance in the current row - min_distance = min(min_distance, distance[j]) - - # Update the value of the previous cell in the previous row - prev_row_cell = current_cell - - # If the minimum distance in the current row exceeds max_distance, stop the computation - if min_distance >= max_distance: - return max_distance - - # The Levenshtein distance between the two strings is the value at the last cell in the table - return min(distance[-1], max_distance) - - -def extract_answer(generation: str): - """Extracts answer from generation. - - Handles a generation that if separated by "---" with the answer being the first part. - Also handles a generation that starts with "```\n" and removes it. 
- """ - answer = generation.split("---")[0].strip() - if answer.startswith("```\n"): - answer = answer[4:].strip() - - return answer - - -def compare_log_and_target(log, testcase): - target = testcase["target"]["output"] - return levenshtein_distance_optimized(target, extract_answer(log["output"])) diff --git a/tests/conftest.py b/tests/conftest.py deleted file mode 100644 index 80e3b336..00000000 --- a/tests/conftest.py +++ /dev/null @@ -1,278 +0,0 @@ -from dataclasses import asdict, dataclass -import os -import random -import string -import time -from typing import Callable, Generator -import typing -from unittest.mock import MagicMock - -from dotenv import load_dotenv -import pytest -from humanloop.base_client import BaseHumanloop -from humanloop.client import Humanloop -from humanloop.otel.exporter import HumanloopSpanExporter -from humanloop.otel.processor import HumanloopSpanProcessor -from openai.types.chat.chat_completion_message_param import ChatCompletionMessageParam -from opentelemetry.instrumentation.anthropic import AnthropicInstrumentor -from opentelemetry.instrumentation.cohere import CohereInstrumentor -from opentelemetry.instrumentation.groq import GroqInstrumentor -from opentelemetry.instrumentation.instrumentor import BaseInstrumentor # type: ignore -from opentelemetry.instrumentation.openai import OpenAIInstrumentor -from opentelemetry.instrumentation.replicate import ReplicateInstrumentor -from opentelemetry.sdk.resources import Resource -from opentelemetry.sdk.trace import TracerProvider -from opentelemetry.sdk.trace.export import SimpleSpanProcessor -from opentelemetry.sdk.trace.export.in_memory_span_exporter import InMemorySpanExporter -from opentelemetry.trace import Tracer - -if typing.TYPE_CHECKING: - from humanloop.client import BaseHumanloop - - -@pytest.fixture(scope="function") -def opentelemetry_test_provider() -> TracerProvider: - """Create a test TracerProvider with a resource. - - This is similar to the created TracerProvider in the - Humanloop class. - """ - provider = TracerProvider( - resource=Resource.create( - { - "service": "humanloop.sdk", - "environment": "test", - } - ) - ) - return provider - - -@pytest.fixture(scope="function") -def test_span(opentelemetry_test_provider: TracerProvider): - exporter = InMemorySpanExporter() - processor = SimpleSpanProcessor(exporter) - opentelemetry_test_provider.add_span_processor(processor) - tracer = opentelemetry_test_provider.get_tracer("test") - return tracer.start_span("test_span") - - -@pytest.fixture(scope="function") -def opentelemetry_test_configuration( - opentelemetry_test_provider: TracerProvider, -) -> Generator[tuple[Tracer, InMemorySpanExporter], None, None]: - """Configure OTel backend without HumanloopSpanProcessor. - - Spans created by Instrumentors will not be used to enrich - Humanloop Spans. 
- """ - exporter = InMemorySpanExporter() - processor = SimpleSpanProcessor(exporter) - opentelemetry_test_provider.add_span_processor(processor) - instrumentors: list[BaseInstrumentor] = [ - OpenAIInstrumentor(), - AnthropicInstrumentor(), - GroqInstrumentor(), - CohereInstrumentor(), - ReplicateInstrumentor(), - ] - for instrumentor in instrumentors: - instrumentor.instrument(tracer_provider=opentelemetry_test_provider) - tracer = opentelemetry_test_provider.get_tracer("test") - # Circumvent configuration procedure - - yield tracer, exporter - - for instrumentor in instrumentors: - instrumentor.uninstrument() - - -@pytest.fixture(scope="function") -def opentelemetry_hl_test_configuration( - opentelemetry_test_provider: TracerProvider, - humanloop_client: BaseHumanloop, -) -> Generator[tuple[Tracer, InMemorySpanExporter], None, None]: - """Configure OTel backend with HumanloopSpanProcessor. - - Spans created by Instrumentors will be used to enrich - Humanloop Spans. - """ - exporter = InMemorySpanExporter() - processor = HumanloopSpanProcessor(exporter=exporter) - opentelemetry_test_provider.add_span_processor(processor) - instrumentors: list[BaseInstrumentor] = [ - OpenAIInstrumentor(), - AnthropicInstrumentor(), - GroqInstrumentor(), - CohereInstrumentor(), - ReplicateInstrumentor(), - AnthropicInstrumentor(), - ] - for instrumentor in instrumentors: - instrumentor.instrument( - tracer_provider=opentelemetry_test_provider, - ) - tracer = opentelemetry_test_provider.get_tracer("test") - - yield tracer, exporter - - for instrumentor in instrumentors: - instrumentor.uninstrument() - - -@pytest.fixture(scope="function") -def hl_test_exporter() -> HumanloopSpanExporter: - """ - Test Exporter where HTTP calls to Humanloop API - are mocked. - """ - client = MagicMock() - exporter = HumanloopSpanExporter(client=client) - return exporter - - -@pytest.fixture(scope="function") -def opentelemetry_hl_with_exporter_test_configuration( - hl_test_exporter: HumanloopSpanExporter, - opentelemetry_test_provider: TracerProvider, -) -> Generator[tuple[Tracer, HumanloopSpanExporter], None, None]: - """Configure OTel backend with HumanloopSpanProcessor and - a HumanloopSpanExporter where HTTP calls are mocked. 
- """ - processor = HumanloopSpanProcessor(exporter=hl_test_exporter) - opentelemetry_test_provider.add_span_processor(processor) - instrumentor = OpenAIInstrumentor() - instrumentor.instrument(tracer_provider=opentelemetry_test_provider) - tracer = opentelemetry_test_provider.get_tracer("test") - - yield tracer, hl_test_exporter - - instrumentor.uninstrument() - - -@pytest.fixture(scope="session") -def call_llm_messages() -> list[ChatCompletionMessageParam]: - return [ - { - "role": "system", - "content": "You are an assistant on the following topics: greetings in foreign languages.", - }, - { - "role": "user", - "content": "Bonjour!", - }, - ] - - -@dataclass -class APIKeys: - openai: str - humanloop: str - - -@pytest.fixture(scope="session") -def api_keys() -> APIKeys: - openai_key = os.getenv("OPENAI_API_KEY") - humanloop_key = os.getenv("HUMANLOOP_API_KEY") - for key_name, key_value in [ - ("OPENAI_API_KEY", openai_key), - ("HUMANLOOP_API_KEY", humanloop_key), - ]: - if key_value is None: - raise ValueError(f"{key_name} is not set in .env file") - api_keys = APIKeys( - openai=openai_key, # type: ignore [arg-type] - humanloop=humanloop_key, # type: ignore [arg-type] - ) - for key, value in asdict(api_keys).items(): - if value is None: - raise ValueError(f"{key.upper()} key is not set in .env file") - return api_keys - - -@pytest.fixture(scope="session") -def humanloop_client(api_keys: APIKeys) -> Humanloop: - return Humanloop(api_key=api_keys.humanloop) - - -@pytest.fixture(scope="session", autouse=True) -def load_env(): - load_dotenv() - - -def directory_cleanup(directory_id: str, humanloop_client: Humanloop): - response = humanloop_client.directories.get(directory_id) - for file in response.files: - file_id = file.id - if file.type == "prompt": - client = humanloop_client.prompts # type: ignore [assignment] - elif file.type == "tool": - client = humanloop_client.tools # type: ignore [assignment] - elif file.type == "dataset": - client = humanloop_client.datasets # type: ignore [assignment] - elif file.type == "evaluator": - client = humanloop_client.evaluators # type: ignore [assignment] - elif file.type == "flow": - client = humanloop_client.flows # type: ignore [assignment] - else: - raise NotImplementedError(f"Unknown HL file type {file.type}") - client.delete(file_id) - - for subdirectory in response.subdirectories: - directory_cleanup( - directory_id=subdirectory.id, - humanloop_client=humanloop_client, - ) - - humanloop_client.directories.delete(id=response.id) - - -@dataclass -class DirectoryIdentifiers: - path: str - id: str - - -@pytest.fixture() -def test_directory( - humanloop_client: Humanloop, -) -> Generator[DirectoryIdentifiers, None, None]: - # Generate a random alphanumeric directory name to avoid conflicts - def get_random_string(length: int = 16) -> str: - return "".join([random.choice(string.ascii_letters + "0123456789") for _ in range(length)]) - - directory_path = "SDK_integ_test_" + get_random_string() - response = humanloop_client.directories.create(path=directory_path) - assert response.path == directory_path - try: - yield DirectoryIdentifiers( - path=response.path, - id=response.id, - ) - finally: - time.sleep(1) - directory_cleanup(response.id, humanloop_client) - - -@pytest.fixture() -def get_test_path(test_directory: DirectoryIdentifiers) -> Callable[[str], str]: - def generate_path(name: str) -> str: - return f"{test_directory.path}/{name}" - - return generate_path - - -# @pytest.fixture(scope="session", autouse=True) -# def 
cleanup_test_dirs(humanloop_client: Humanloop): -# def _cleanup_all_test_dirs(): -# dirs = humanloop_client.directories.list() -# for dir in dirs: -# if dir.path.startswith("SDK_integ_test_"): -# directory_cleanup( -# directory_id=dir.id, -# humanloop_client=humanloop_client, -# ) - -# _cleanup_all_test_dirs() -# yield -# _cleanup_all_test_dirs() diff --git a/tests/custom/conftest.py b/tests/custom/conftest.py index 7667dedf..8e400483 100644 --- a/tests/custom/conftest.py +++ b/tests/custom/conftest.py @@ -1,12 +1,9 @@ -from typing import Generator import os -from dotenv import load_dotenv +from typing import Generator from unittest.mock import MagicMock import pytest -from humanloop.client import Humanloop -from humanloop.otel.exporter import HumanloopSpanExporter -from humanloop.otel.processor import HumanloopSpanProcessor +from dotenv import load_dotenv from openai.types.chat.chat_completion_message_param import ChatCompletionMessageParam from opentelemetry.instrumentation.anthropic import AnthropicInstrumentor from opentelemetry.instrumentation.cohere import CohereInstrumentor @@ -19,6 +16,10 @@ from opentelemetry.sdk.trace.export import SimpleSpanProcessor from opentelemetry.sdk.trace.export.in_memory_span_exporter import InMemorySpanExporter from opentelemetry.trace import Tracer + +from humanloop.client import Humanloop +from humanloop.otel.exporter import HumanloopSpanExporter +from humanloop.otel.processor import HumanloopSpanProcessor from tests.custom.types import GetHumanloopClientFn diff --git a/tests/custom/integration/conftest.py b/tests/custom/integration/conftest.py index f918c48c..25dc441f 100644 --- a/tests/custom/integration/conftest.py +++ b/tests/custom/integration/conftest.py @@ -1,16 +1,18 @@ -from contextlib import contextmanager, redirect_stdout -from dataclasses import dataclass +import io import os import time -from typing import Any, ContextManager, Generator, List, Union -import io -from typing import TextIO import uuid -import pytest +from contextlib import contextmanager, redirect_stdout +from dataclasses import dataclass +from typing import ContextManager, Generator, List, TextIO, Union + import dotenv +import pytest +from click.testing import CliRunner + from humanloop import AgentResponse, PromptResponse +from humanloop.requests.prompt_kernel_request import PromptKernelRequestParams from tests.custom.types import GetHumanloopClientFn, SyncableFile -from click.testing import CliRunner @dataclass @@ -78,7 +80,7 @@ def cleanup_directory(directory_id: str): @pytest.fixture(scope="function") -def test_prompt_config() -> dict[str, Any]: +def test_prompt_config() -> PromptKernelRequestParams: return { "provider": "openai", "model": "gpt-4o-mini", @@ -96,6 +98,25 @@ def test_prompt_config() -> dict[str, Any]: } +@pytest.fixture(scope="function") +def prompt( + get_humanloop_client: GetHumanloopClientFn, + sdk_test_dir: str, + test_prompt_config: PromptKernelRequestParams, +) -> Generator[ResourceIdentifiers, None, None]: + humanloop_client = get_humanloop_client() + prompt_path = f"{sdk_test_dir}/prompt" + try: + response = humanloop_client.prompts.upsert( + path=prompt_path, + **test_prompt_config, + ) + yield ResourceIdentifiers(file_id=response.id, file_path=response.path) + humanloop_client.prompts.delete(id=response.id) + except Exception as e: + pytest.fail(f"Failed to create prompt {prompt_path}: {e}") + + @pytest.fixture(scope="function") def eval_dataset( get_humanloop_client: GetHumanloopClientFn, sdk_test_dir: str @@ -131,7 +152,10 @@ def eval_dataset( 
@pytest.fixture(scope="function") def eval_prompt( - get_humanloop_client: GetHumanloopClientFn, sdk_test_dir: str, openai_key: str, test_prompt_config: dict[str, Any] + get_humanloop_client: GetHumanloopClientFn, + sdk_test_dir: str, + openai_key: str, + test_prompt_config: PromptKernelRequestParams, ) -> Generator[ResourceIdentifiers, None, None]: humanloop_client = get_humanloop_client() prompt_path = f"{sdk_test_dir}/eval_prompt" diff --git a/tests/custom/integration/test_decorators.py b/tests/custom/integration/test_decorators.py index 15057ba2..59638896 100644 --- a/tests/custom/integration/test_decorators.py +++ b/tests/custom/integration/test_decorators.py @@ -2,6 +2,7 @@ from typing import Any from openai import OpenAI + from tests.custom.integration.conftest import GetHumanloopClientFn diff --git a/tests/custom/integration/test_evals.py b/tests/custom/integration/test_evals.py index 2ec74d93..d8ba8996 100644 --- a/tests/custom/integration/test_evals.py +++ b/tests/custom/integration/test_evals.py @@ -2,6 +2,7 @@ from typing import Any import pytest + from humanloop.error import HumanloopRuntimeError from tests.custom.integration.conftest import ResourceIdentifiers from tests.custom.types import GetHumanloopClientFn diff --git a/tests/integration/test_prompts.py b/tests/custom/integration/test_prompts.py similarity index 60% rename from tests/integration/test_prompts.py rename to tests/custom/integration/test_prompts.py index 13ca80eb..f6021b7e 100644 --- a/tests/integration/test_prompts.py +++ b/tests/custom/integration/test_prompts.py @@ -1,14 +1,15 @@ -from humanloop.client import Humanloop - -from tests.integration.conftest import TestIdentifiers +from humanloop.requests.prompt_kernel_request import PromptKernelRequestParams +from tests.custom.integration.conftest import ResourceIdentifiers +from tests.custom.types import GetHumanloopClientFn def test_prompts_call( - humanloop_test_client: Humanloop, - prompt: TestIdentifiers, - test_prompt_config: TestIdentifiers, + get_humanloop_client: GetHumanloopClientFn, + prompt: ResourceIdentifiers, + test_prompt_config: PromptKernelRequestParams, ) -> None: - response = humanloop_test_client.prompts.call( # type: ignore [attr-defined] + humanloop_client = get_humanloop_client() + response = humanloop_client.prompts.call( # type: ignore [attr-defined] path=prompt.file_path, prompt={**test_prompt_config}, # type: ignore [misc, arg-type, typeddict-item, dict-item, list-item] inputs={"question": "What is the capital of the France?"}, @@ -24,11 +25,12 @@ def test_prompts_call( def test_prompts_call_stream( - humanloop_test_client: Humanloop, - prompt: TestIdentifiers, - test_prompt_config: TestIdentifiers, + get_humanloop_client: GetHumanloopClientFn, + prompt: ResourceIdentifiers, + test_prompt_config: PromptKernelRequestParams, ) -> None: - response = humanloop_test_client.prompts.call_stream( # type: ignore [attr-defined] + humanloop_client = get_humanloop_client() + response = humanloop_client.prompts.call_stream( # type: ignore [attr-defined] path=prompt.file_path, prompt={**test_prompt_config}, # type: ignore [misc, arg-type, typeddict-item, dict-item, list-item] inputs={"question": "What is the capital of the France?"}, diff --git a/tests/custom/integration/test_sync.py b/tests/custom/integration/test_sync.py index 6e7b002b..80e332a4 100644 --- a/tests/custom/integration/test_sync.py +++ b/tests/custom/integration/test_sync.py @@ -1,10 +1,12 @@ -from typing import List, Union from pathlib import Path +from typing import List, Union 
+ import pytest + from humanloop import AgentResponse, PromptResponse -from humanloop.prompts.client import PromptsClient from humanloop.agents.client import AgentsClient from humanloop.error import HumanloopRuntimeError +from humanloop.prompts.client import PromptsClient from tests.custom.types import GetHumanloopClientFn, SyncableFile diff --git a/tests/custom/integration/test_sync_cli.py b/tests/custom/integration/test_sync_cli.py index 3957aed2..5631d5f0 100644 --- a/tests/custom/integration/test_sync_cli.py +++ b/tests/custom/integration/test_sync_cli.py @@ -1,7 +1,9 @@ from pathlib import Path from unittest import mock + import pytest from click.testing import CliRunner + from humanloop.cli.__main__ import cli from tests.custom.types import SyncableFile diff --git a/tests/custom/otel/test_helpers.py b/tests/custom/otel/test_helpers.py index 3bd5ce45..f7ff6555 100644 --- a/tests/custom/otel/test_helpers.py +++ b/tests/custom/otel/test_helpers.py @@ -1,7 +1,8 @@ import pytest -from humanloop.otel.helpers import read_from_opentelemetry_span, write_to_opentelemetry_span from opentelemetry.sdk.trace import Span +from humanloop.otel.helpers import read_from_opentelemetry_span, write_to_opentelemetry_span + def test_read_empty(test_span: Span): with pytest.raises(TypeError): diff --git a/tests/custom/sync/test_client.py b/tests/custom/sync/test_client.py index a349fd0c..ac83d259 100644 --- a/tests/custom/sync/test_client.py +++ b/tests/custom/sync/test_client.py @@ -1,10 +1,12 @@ import logging -import pytest from pathlib import Path +from typing import Literal from unittest.mock import Mock, patch -from humanloop.sync.sync_client import SyncClient, SerializableFileType + +import pytest + from humanloop.error import HumanloopRuntimeError -from typing import Literal +from humanloop.sync.sync_client import SerializableFileType, SyncClient @pytest.fixture diff --git a/tests/custom/types.py b/tests/custom/types.py index 7a198456..b270d9fa 100644 --- a/tests/custom/types.py +++ b/tests/custom/types.py @@ -1,6 +1,7 @@ -from typing import Protocol, NamedTuple -from humanloop.client import Humanloop +from typing import NamedTuple, Protocol + from humanloop import FileType +from humanloop.client import Humanloop class GetHumanloopClientFn(Protocol): diff --git a/tests/integration/__init__.py b/tests/integration/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py deleted file mode 100644 index d14042a3..00000000 --- a/tests/integration/conftest.py +++ /dev/null @@ -1,169 +0,0 @@ -import io -import os -import uuid -from contextlib import contextmanager, redirect_stdout -from dataclasses import dataclass -from typing import Any, ContextManager, Generator, TextIO - -import dotenv -import pytest -from humanloop.client import Humanloop -from humanloop.requests.prompt_kernel_request import PromptKernelRequestParams - - -@dataclass -class TestIdentifiers: - file_id: str - file_path: str - - -@pytest.fixture() -def capture_stdout() -> ContextManager[TextIO]: - @contextmanager - def _context_manager(): - f = io.StringIO() - with redirect_stdout(f): - yield f - - return _context_manager # type: ignore [return-value] - - -@pytest.fixture(scope="session") -def openai_key() -> str: - dotenv.load_dotenv() - if not os.getenv("OPENAI_API_KEY"): - pytest.fail("OPENAI_API_KEY is not set for integration tests") - return os.getenv("OPENAI_API_KEY") # type: ignore [return-value] - - -@pytest.fixture(scope="session") -def 
humanloop_test_client() -> Humanloop: - dotenv.load_dotenv() - if not os.getenv("HUMANLOOP_API_KEY"): - pytest.fail("HUMANLOOP_API_KEY is not set for integration tests") - return Humanloop(api_key=os.getenv("HUMANLOOP_API_KEY")) # type: ignore [return-value] - - -@pytest.fixture(scope="function") -def sdk_test_dir(humanloop_test_client: Humanloop) -> Generator[str, None, None]: - path = f"SDK_INTEGRATION_TEST_{uuid.uuid4()}" - try: - response = humanloop_test_client.directories.create(path=path) - yield response.path - humanloop_test_client.directories.delete(id=response.id) - except Exception as e: - pytest.fail(f"Failed to create directory {path}: {e}") - - -@pytest.fixture(scope="function") -def test_prompt_config() -> PromptKernelRequestParams: - return { - "provider": "openai", - "model": "gpt-4o-mini", - "temperature": 0.5, - "template": [ - { - "role": "system", - "content": "You are a helpful assistant. You must answer the user's question truthfully and at the level of a 5th grader.", - }, - { - "role": "user", - "content": "{{question}}", - }, - ], - } - - -@pytest.fixture(scope="function") -def eval_dataset(humanloop_test_client: Humanloop, sdk_test_dir: str) -> Generator[TestIdentifiers, None, None]: - dataset_path = f"{sdk_test_dir}/eval_dataset" - try: - response = humanloop_test_client.datasets.upsert( - path=dataset_path, - datapoints=[ - { - "inputs": { - "question": "What is the capital of the France?", - }, - }, - { - "inputs": { - "question": "What is the capital of the Germany?", - }, - }, - { - "inputs": { - "question": "What is 2+2?", - }, - }, - ], - ) - yield TestIdentifiers(file_id=response.id, file_path=response.path) - humanloop_test_client.datasets.delete(id=response.id) - except Exception as e: - pytest.fail(f"Failed to create dataset {dataset_path}: {e}") - - -@pytest.fixture(scope="function") -def eval_prompt( - humanloop_test_client: Humanloop, sdk_test_dir: str, openai_key: str, test_prompt_config: dict[str, Any] -) -> Generator[TestIdentifiers, None, None]: - prompt_path = f"{sdk_test_dir}/eval_prompt" - try: - response = humanloop_test_client.prompts.upsert( - path=prompt_path, - **test_prompt_config, - ) - yield TestIdentifiers(file_id=response.id, file_path=response.path) - humanloop_test_client.prompts.delete(id=response.id) - except Exception as e: - pytest.fail(f"Failed to create prompt {prompt_path}: {e}") - - -@pytest.fixture(scope="function") -def prompt( - humanloop_test_client: Humanloop, sdk_test_dir: str, openai_key: str, test_prompt_config: dict[str, Any] -) -> Generator[TestIdentifiers, None, None]: - prompt_path = f"{sdk_test_dir}/prompt" - try: - response = humanloop_test_client.prompts.upsert( - path=prompt_path, - **test_prompt_config, - ) - yield TestIdentifiers(file_id=response.id, file_path=response.path) - humanloop_test_client.prompts.delete(id=response.id) - except Exception as e: - pytest.fail(f"Failed to create prompt {prompt_path}: {e}") - - -@pytest.fixture(scope="function") -def output_not_null_evaluator( - humanloop_test_client: Humanloop, sdk_test_dir: str -) -> Generator[TestIdentifiers, None, None]: - evaluator_path = f"{sdk_test_dir}/output_not_null_evaluator" - try: - response = humanloop_test_client.evaluators.upsert( - path=evaluator_path, - spec={ - "arguments_type": "target_required", - "return_type": "boolean", - "code": """ -def output_not_null(log: dict) -> bool: - return log["output"] is not None - """, - "evaluator_type": "python", - }, - ) - yield TestIdentifiers(file_id=response.id, file_path=response.path) - 
humanloop_test_client.evaluators.delete(id=response.id) - except Exception as e: - pytest.fail(f"Failed to create evaluator {evaluator_path}: {e}") - - -@pytest.fixture(scope="function") -def id_for_staging_environment(humanloop_test_client: Humanloop, eval_prompt: TestIdentifiers) -> str: - response = humanloop_test_client.prompts.list_environments(id=eval_prompt.file_id) - for environment in response: - if environment.name == "staging": - return environment.id - pytest.fail("Staging environment not found") diff --git a/tests/integration/test_decorators.py b/tests/integration/test_decorators.py deleted file mode 100644 index 218453a6..00000000 --- a/tests/integration/test_decorators.py +++ /dev/null @@ -1,154 +0,0 @@ -import time -from typing import Any - -from openai import OpenAI -from humanloop.client import Humanloop -from humanloop.types.chat_message import ChatMessage - - -def test_prompt_decorator( - humanloop_test_client: Humanloop, - sdk_test_dir: str, - test_prompt_config: dict[str, Any], - openai_key: str, -): - try: - prompt_path = f"{sdk_test_dir}/test_prompt" - prompt_response = humanloop_test_client.prompts.upsert( - path=prompt_path, - **test_prompt_config, - ) - - prompt_versions_response = humanloop_test_client.prompts.list_versions(id=prompt_response.id) - assert len(prompt_versions_response.records) == 1 - - @humanloop_test_client.prompt(path=prompt_path) - def my_prompt(question: str) -> str: - openai_client = OpenAI(api_key=openai_key) - - response = openai_client.chat.completions.create( - model="gpt-4o-mini", - messages=[{"role": "user", "content": question}], - ) - - assert response.choices[0].message.content is not None - return response.choices[0].message.content - - assert "paris" in my_prompt("What is the capital of the France?").lower() - - time.sleep(5) - prompt_versions_response = humanloop_test_client.prompts.list_versions(id=prompt_response.id) - assert len(prompt_versions_response.records) == 2 - - logs_response = humanloop_test_client.logs.list(file_id=prompt_response.id, page=1, size=50) - - assert logs_response.items is not None and len(logs_response.items) == 1 - finally: - humanloop_test_client.prompts.delete(id=prompt_response.id) - - -def test_call_prompt_in_flow_decorator( - humanloop_test_client: Humanloop, - sdk_test_dir: str, - openai_key: str, -): - try: - - @humanloop_test_client.flow(path=f"{sdk_test_dir}/test_flow") - def my_flow(question: str) -> str: - response = humanloop_test_client.prompts.call( - path=f"{sdk_test_dir}/test_prompt", - prompt={ - "provider": "openai", - "model": "gpt-4o-mini", - "temperature": 0, - }, - messages=[{"role": "user", "content": question}], - provider_api_keys={"openai": openai_key}, - ) - - assert response.logs[0].output is not None - return response.logs[0].output - - assert "paris" in my_flow("What is the capital of the France?").lower() - time.sleep(5) - prompt_response = humanloop_test_client.files.retrieve_by_path(path=f"{sdk_test_dir}/test_prompt") - assert prompt_response is not None - prompt_logs_response = humanloop_test_client.logs.list(file_id=prompt_response.id, page=1, size=50) - assert prompt_logs_response.items is not None and len(prompt_logs_response.items) == 1 - prompt_log = prompt_logs_response.items[0] - - flow_response = humanloop_test_client.files.retrieve_by_path(path=f"{sdk_test_dir}/test_flow") - assert flow_response is not None - flow_logs_response = humanloop_test_client.logs.list(file_id=flow_response.id, page=1, size=50) - assert flow_logs_response.items is not None and 
len(flow_logs_response.items) == 1 - flow_log = flow_logs_response.items[0] - assert prompt_log.trace_parent_id == flow_log.id - finally: - flow_response = humanloop_test_client.files.retrieve_by_path(path=f"{sdk_test_dir}/test_flow") - if flow_response is not None: - humanloop_test_client.flows.delete(id=flow_response.id) - prompt_response = humanloop_test_client.files.retrieve_by_path(path=f"{sdk_test_dir}/test_prompt") - if prompt_response is not None: - humanloop_test_client.prompts.delete(id=prompt_response.id) - - -def test_flow_decorator_logs_exceptions( - humanloop_test_client: Humanloop, - sdk_test_dir: str, -): - try: - - @humanloop_test_client.flow(path=f"{sdk_test_dir}/test_flow_log_error") - def my_flow(question: str) -> str: - raise ValueError("This is a test exception") - - my_flow("test") - - time.sleep(5) - - flow_response = humanloop_test_client.files.retrieve_by_path(path=f"{sdk_test_dir}/test_flow_log_error") - assert flow_response is not None - flow_logs_response = humanloop_test_client.logs.list(file_id=flow_response.id, page=1, size=50) - assert flow_logs_response.items is not None and len(flow_logs_response.items) == 1 - flow_log = flow_logs_response.items[0] - assert flow_log.error is not None - assert flow_log.output is None - - finally: - flow_response = humanloop_test_client.files.retrieve_by_path(path=f"{sdk_test_dir}/test_flow_log_error") - if flow_response is not None: - humanloop_test_client.flows.delete(id=flow_response.id) - - -def test_flow_decorator_populates_output_message( - humanloop_test_client: Humanloop, - sdk_test_dir: str, -): - try: - - @humanloop_test_client.flow(path=f"{sdk_test_dir}/test_flow_log_output_message") - def my_flow(question: str) -> dict[str, Any]: - return {"role": "user", "content": question} - - assert "france" in my_flow("What is the capital of the France?")["content"].lower() - - time.sleep(5) - - flow_response = humanloop_test_client.files.retrieve_by_path( - path=f"{sdk_test_dir}/test_flow_log_output_message" - ) - assert flow_response is not None - flow_logs_response = humanloop_test_client.logs.list(file_id=flow_response.id, page=1, size=50) - assert flow_logs_response.items is not None and len(flow_logs_response.items) == 1 - flow_log = flow_logs_response.items[0] - assert flow_log.output_message is not None - assert flow_log.output is None - assert flow_log.error is None - - finally: - flow_response = humanloop_test_client.files.retrieve_by_path( - path=f"{sdk_test_dir}/test_flow_log_output_message" - ) - if flow_response is not None: - humanloop_test_client.flows.delete(id=flow_response.id) diff --git a/tests/integration/test_evals.py b/tests/integration/test_evals.py deleted file mode 100644 index 49bbb6dc..00000000 --- a/tests/integration/test_evals.py +++ /dev/null @@ -1,402 +0,0 @@ -import time -from typing import Any - -import pytest -from humanloop.client import Humanloop -from humanloop.error import HumanloopRuntimeError -from tests.integration.conftest import TestIdentifiers - - -def test_eval_run_works_on_online_files( - humanloop_test_client: Humanloop, - output_not_null_evaluator: TestIdentifiers, - eval_dataset: TestIdentifiers, - eval_prompt: TestIdentifiers, -) -> None: - humanloop_test_client.evaluations.run( # type: ignore [attr-defined] - name="test_eval_run", - file={ - "path": eval_prompt.file_path, - "type": "prompt", - }, - dataset={ - "path": eval_dataset.file_path, - }, - evaluators=[ - { - "path": output_not_null_evaluator.file_path, - } - ], - ) - time.sleep(5) - response = 
humanloop_test_client.evaluations.list(file_id=eval_prompt.file_id) - assert response.items and len(response.items) == 1 - evaluation_id = response.items[0].id - run_evaluation_response = humanloop_test_client.evaluations.list_runs_for_evaluation(id=evaluation_id) # type: ignore [attr-defined] - assert run_evaluation_response.runs[0].status == "completed" - - -def test_eval_run_version_id( - humanloop_test_client: Humanloop, - output_not_null_evaluator: TestIdentifiers, - eval_dataset: TestIdentifiers, - eval_prompt: TestIdentifiers, - test_prompt_config: dict[str, Any], -) -> None: - # GIVEN a prompt where a non-default version is created - new_test_prompt_config = test_prompt_config.copy() - new_test_prompt_config["temperature"] = 1 - new_prompt_version_response = humanloop_test_client.prompts.upsert( - path=eval_prompt.file_path, - **new_test_prompt_config, - ) - # WHEN creating an evaluation using version_id - humanloop_test_client.evaluations.run( # type: ignore [attr-defined] - name="test_eval_run", - file={ - "id": new_prompt_version_response.id, - "version_id": new_prompt_version_response.version_id, - "type": "prompt", - }, - dataset={ - "path": eval_dataset.file_path, - }, - evaluators=[ - { - "path": output_not_null_evaluator.file_path, - } - ], - ) - # THEN we evaluate the version created in the test - evaluations_response = humanloop_test_client.evaluations.list(file_id=new_prompt_version_response.id) - assert evaluations_response.items and len(evaluations_response.items) == 1 - evaluation_id = evaluations_response.items[0].id - runs_response = humanloop_test_client.evaluations.list_runs_for_evaluation(id=evaluation_id) - assert runs_response.runs[0].status == "completed" - assert ( - runs_response.runs[0].version - and runs_response.runs[0].version.version_id == new_prompt_version_response.version_id - ) - list_versions_response = humanloop_test_client.prompts.list_versions(id=new_prompt_version_response.id) - assert list_versions_response.records and len(list_versions_response.records) == 2 - # THEN the version used in evaluation is not the default version - response = humanloop_test_client.prompts.get(id=new_prompt_version_response.id) - assert response.version_id != new_prompt_version_response.version_id - - -def test_eval_run_environment( - humanloop_test_client: Humanloop, - output_not_null_evaluator: TestIdentifiers, - eval_dataset: TestIdentifiers, - eval_prompt: TestIdentifiers, - test_prompt_config: dict[str, Any], - id_for_staging_environment: str, -) -> None: - # GIVEN a prompt deployed to staging environment - new_test_prompt_config = test_prompt_config.copy() - new_test_prompt_config["temperature"] = 1 - new_prompt_version_response = humanloop_test_client.prompts.upsert( - path=eval_prompt.file_path, - **new_test_prompt_config, - ) - humanloop_test_client.prompts.set_deployment( - id=new_prompt_version_response.id, - environment_id=id_for_staging_environment, - version_id=new_prompt_version_response.version_id, - ) - # WHEN creating an evaluation using environment - humanloop_test_client.evaluations.run( # type: ignore [attr-defined] - name="test_eval_run", - file={ - "id": new_prompt_version_response.id, - "type": "prompt", - "environment": "staging", - }, - dataset={ - "path": eval_dataset.file_path, - }, - evaluators=[ - { - "path": output_not_null_evaluator.file_path, - } - ], - ) - # THEN evaluation is done with the version deployed to staging environment - evaluations_response = humanloop_test_client.evaluations.list(file_id=new_prompt_version_response.id) 
- assert evaluations_response.items and len(evaluations_response.items) == 1 - evaluation_id = evaluations_response.items[0].id - runs_response = humanloop_test_client.evaluations.list_runs_for_evaluation(id=evaluation_id) - assert runs_response.runs[0].status == "completed" - assert ( - runs_response.runs[0].version - and runs_response.runs[0].version.version_id == new_prompt_version_response.version_id - ) - default_prompt_version_response = humanloop_test_client.prompts.get(id=new_prompt_version_response.id) - assert default_prompt_version_response.version_id != new_prompt_version_response.version_id - - -@pytest.mark.parametrize("version_lookup", ["version_id", "environment"]) -def test_eval_run_version_lookup_fails_with_path( - humanloop_test_client: Humanloop, - eval_prompt: TestIdentifiers, - eval_dataset: TestIdentifiers, - output_not_null_evaluator: TestIdentifiers, - version_lookup: str, -): - # GIVEN an eval run where we try to evaluate a non-default version - with pytest.raises(HumanloopRuntimeError) as e: - humanloop_test_client.evaluations.run( # type: ignore [attr-defined] - name="test_eval_run", - file={ - "path": eval_prompt.file_path, - "type": "prompt", - # WHEN the File id is not passed in file - version_lookup: "will_not_work", - }, - dataset={ - "path": eval_dataset.file_path, - }, - evaluators=[ - { - "path": output_not_null_evaluator.file_path, - } - ], - ) - # THEN an error is raised - assert "You must provide the `file.id` when addressing a file by version ID or environment" in str(e.value) - - -def test_eval_run_with_version_upsert( - humanloop_test_client: Humanloop, - eval_prompt: TestIdentifiers, - eval_dataset: TestIdentifiers, - output_not_null_evaluator: TestIdentifiers, - test_prompt_config: dict[str, Any], -): - humanloop_test_client.evaluations.run( # type: ignore [attr-defined] - name="test_eval_run", - file={ - "path": eval_prompt.file_path, - "type": "prompt", - "version": { - **test_prompt_config, - "temperature": 1, - }, - }, - dataset={ - "path": eval_dataset.file_path, - }, - evaluators=[ - { - "path": output_not_null_evaluator.file_path, - } - ], - ) - # THEN the version is upserted and evaluation finishes successfully - evaluations_response = humanloop_test_client.evaluations.list(file_id=eval_prompt.file_id) - assert evaluations_response.items and len(evaluations_response.items) == 1 - evaluation_id = evaluations_response.items[0].id - runs_response = humanloop_test_client.evaluations.list_runs_for_evaluation(id=evaluation_id) - assert runs_response.runs[0].status == "completed" - # THEN a version was upserted based on file.version - list_prompt_versions_response = humanloop_test_client.prompts.list_versions(id=eval_prompt.file_id) - assert list_prompt_versions_response.records and len(list_prompt_versions_response.records) == 2 - - -def test_flow_eval_does_not_work_without_callable( - humanloop_test_client: Humanloop, - eval_dataset: TestIdentifiers, - output_not_null_evaluator: TestIdentifiers, -): - with pytest.raises(HumanloopRuntimeError) as e: - humanloop_test_client.evaluations.run( # type: ignore [attr-defined] - name="test_eval_run", - file={ - "path": "Test Flow", - "type": "flow", - "version": { - "attributes": { - "foo": "bar", - } - }, - }, - dataset={ - "path": eval_dataset.file_path, - }, - evaluators=[ - { - "path": output_not_null_evaluator.file_path, - } - ], - ) - # THEN an error is raised - assert "You must provide a `callable` for your Flow `file` to run a local eval." 
in str(e.value) - - -def test_flow_eval_works_with_callable( - humanloop_test_client: Humanloop, - eval_dataset: TestIdentifiers, - output_not_null_evaluator: TestIdentifiers, - sdk_test_dir: str, -): - flow_path = f"{sdk_test_dir}/Test Flow" - # GIVEN a flow with a callable - flow_response = humanloop_test_client.flows.upsert( - path=flow_path, - attributes={ - "foo": "bar", - }, - ) - try: - flow = humanloop_test_client.flows.upsert( - path=flow_path, - attributes={ - "foo": "bar", - }, - ) - # WHEN we run an evaluation with the flow - humanloop_test_client.evaluations.run( # type: ignore [attr-defined] - name="test_eval_run", - file={ - "id": flow.id, - "type": "flow", - "callable": lambda question: "bar", - }, - dataset={ - "path": eval_dataset.file_path, - }, - evaluators=[ - { - "path": output_not_null_evaluator.file_path, - } - ], - ) - # THEN the evaluation finishes successfully - evaluations_response = humanloop_test_client.evaluations.list(file_id=flow.id) - assert evaluations_response.items and len(evaluations_response.items) == 1 - evaluation_id = evaluations_response.items[0].id - runs_response = humanloop_test_client.evaluations.list_runs_for_evaluation(id=evaluation_id) - assert runs_response.runs[0].status == "completed" - finally: - humanloop_test_client.flows.delete(id=flow_response.id) - - -def test_cannot_evaluate_agent_with_callable( - humanloop_test_client: Humanloop, - eval_dataset: TestIdentifiers, - output_not_null_evaluator: TestIdentifiers, -): - with pytest.raises(ValueError) as e: - humanloop_test_client.evaluations.run( # type: ignore [attr-defined] - name="test_eval_run", - file={ - "path": "Test Agent", - "type": "agent", - "callable": lambda question: "bar", - }, - dataset={ - "path": eval_dataset.file_path, - }, - evaluators=[ - { - "path": output_not_null_evaluator.file_path, - } - ], - ) - assert str(e.value) == "Agent evaluation is only possible on the Humanloop runtime, do not provide a `callable`." 
- - -def test_flow_eval_resolves_to_default_with_callable( - humanloop_test_client: Humanloop, - output_not_null_evaluator: TestIdentifiers, - eval_dataset: TestIdentifiers, - sdk_test_dir: str, -) -> None: - # GIVEN a flow with some attributes - flow_path = f"{sdk_test_dir}/Test Flow" - flow_response = humanloop_test_client.flows.upsert( - path=flow_path, - attributes={ - "foo": "bar", - }, - ) - try: - # WHEN running an evaluation with the flow's callable but no version - humanloop_test_client.evaluations.run( # type: ignore [attr-defined] - name="test_eval_run", - file={ - "id": flow_response.id, - "type": "flow", - "callable": lambda question: "It's complicated don't worry about it", - }, - dataset={ - "path": eval_dataset.file_path, - }, - evaluators=[ - { - "path": output_not_null_evaluator.file_path, - } - ], - ) - # THEN the evaluation finishes successfully - evaluations_response = humanloop_test_client.evaluations.list(file_id=flow_response.id) - assert evaluations_response.items and len(evaluations_response.items) == 1 - evaluation_id = evaluations_response.items and evaluations_response.items[0].id - runs_response = humanloop_test_client.evaluations.list_runs_for_evaluation(id=evaluation_id) # type: ignore [attr-defined, arg-type] - assert runs_response.runs[0].status == "completed" - finally: - # Clean up test resources - humanloop_test_client.flows.delete(id=flow_response.id) - - -@pytest.mark.skip(reason="Skip until agents are in prod") -def test_agent_eval_works_upserting( - humanloop_test_client: Humanloop, - eval_dataset: TestIdentifiers, - output_not_null_evaluator: TestIdentifiers, - sdk_test_dir: str, -): - humanloop_test_client.evaluations.run( # type: ignore [attr-defined] - name="test_eval_run", - file={ - "path": f"{sdk_test_dir}/Test Agent", - "type": "agent", - "version": { - "model": "gpt-4o", - "template": [ - { - "role": "system", - "content": "You are a helpful assistant, offering very short answers.", - }, - { - "role": "user", - "content": "{{question}}", - }, - ], - "provider": "openai", - "temperature": 0, - "max_iterations": 5, - }, - }, - dataset={ - "path": eval_dataset.file_path, - }, - evaluators=[ - { - "path": output_not_null_evaluator.file_path, - } - ], - ) - files_response = humanloop_test_client.files.list_files(page=1, size=100) - eval_agent = None - for file in files_response.records: - if file.path == f"{sdk_test_dir}/Test Agent": - eval_agent = file - break - assert eval_agent and eval_agent.type == "agent" - # THEN the evaluation finishes successfully - evaluations_response = humanloop_test_client.evaluations.list(file_id=eval_agent.id) - assert evaluations_response.items and len(evaluations_response.items) == 1 - evaluation_id = evaluations_response.items[0].id - runs_response = humanloop_test_client.evaluations.list_runs_for_evaluation(id=evaluation_id) # type: ignore [attr-defined, arg-type] - assert runs_response.runs[0].status == "completed" diff --git a/tests/otel/__init__.py b/tests/otel/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/otel/test_helpers.py b/tests/otel/test_helpers.py deleted file mode 100644 index 3bd5ce45..00000000 --- a/tests/otel/test_helpers.py +++ /dev/null @@ -1,172 +0,0 @@ -import pytest -from humanloop.otel.helpers import read_from_opentelemetry_span, write_to_opentelemetry_span -from opentelemetry.sdk.trace import Span - - -def test_read_empty(test_span: Span): - with pytest.raises(TypeError): - assert read_from_opentelemetry_span(test_span) == {} - - -def 
test_read_non_existent_key(test_span: Span): - with pytest.raises(TypeError): - assert read_from_opentelemetry_span(test_span, "key") == {} - write_to_opentelemetry_span(test_span, {"x": 7, "y": "foo"}, key="key") - # NOTE: attributes cannot be None at this point - assert dict(test_span.attributes) == { # type: ignore - "key.x": 7, - "key.y": "foo", - } - with pytest.raises(TypeError): - assert read_from_opentelemetry_span(test_span, "key.z") is None - - -def test_simple_dict(test_span: Span): - write_to_opentelemetry_span(test_span, {"x": 7, "y": "foo"}, "key") - # NOTE: attributes cannot be None at this point - assert dict(test_span.attributes) == { # type: ignore - "key.x": 7, - "key.y": "foo", - } - assert read_from_opentelemetry_span(test_span, "key") == {"x": 7, "y": "foo"} - - -def test_no_prefix(test_span: Span): - write_to_opentelemetry_span(test_span, {"x": 7, "y": "foo"}) - # NOTE: attributes cannot be None at this point - assert dict(test_span.attributes) == { # type: ignore - "x": 7, - "y": "foo", - } - assert read_from_opentelemetry_span(test_span) == {"x": 7, "y": "foo"} - - -def test_nested_object(test_span: Span): - write_to_opentelemetry_span(test_span, {"x": 7, "y": {"z": "foo"}}, "key") - # NOTE: attributes cannot be None at this point - assert dict(test_span.attributes) == { # type: ignore - "key.x": 7, - "key.y.z": "foo", - } - assert read_from_opentelemetry_span(test_span, "key") == {"x": 7, "y": {"z": "foo"}} - - -def test_list(test_span: Span): - write_to_opentelemetry_span( - test_span, - [{"x": 7, "y": "foo"}, {"z": "bar"}], # type: ignore - "key", - ) # type: ignore - # NOTE: attributes cannot be None at this point - assert dict(test_span.attributes) == { # type: ignore - "key.0.x": 7, - "key.0.y": "foo", - "key.1.z": "bar", - } - assert read_from_opentelemetry_span(test_span, "key") == [ - {"z": "bar"}, - {"x": 7, "y": "foo"}, - ] - - -def test_list_no_prefix(test_span: Span): - write_to_opentelemetry_span( - test_span, - [{"x": 7, "y": "foo"}, {"z": "bar"}], # type: ignore - ) - # NOTE: attributes cannot be None at this point - assert dict(test_span.attributes) == { # type: ignore - "0.x": 7, - "0.y": "foo", - "1.z": "bar", - } - assert read_from_opentelemetry_span(test_span) == [ - {"z": "bar"}, - {"x": 7, "y": "foo"}, - ] - - -def test_multiple_nestings(test_span: Span): - write_to_opentelemetry_span( - test_span, - [ - {"x": 7, "y": "foo"}, - [{"z": "bar"}, {"a": 42}], - ], # type: ignore - "key", - ) - assert dict(test_span.attributes) == { # type: ignore - "key.0.x": 7, - "key.0.y": "foo", - "key.1.0.z": "bar", - "key.1.1.a": 42, - } - assert read_from_opentelemetry_span(test_span, "key") == [ - [ - {"a": 42}, - {"z": "bar"}, - ], - {"x": 7, "y": "foo"}, - ] - - -def test_read_mixed_numeric_string_keys(test_span: Span): - test_span.set_attributes( - { - "key.0.x": 7, - "key.0.y": "foo", - "key.a.z": "bar", - "key.a.a": 42, - } - ) - assert read_from_opentelemetry_span(span=test_span, key="key") == { # type: ignore - "0": {"x": 7, "y": "foo"}, - "a": {"z": "bar", "a": 42}, - } - assert read_from_opentelemetry_span(span=test_span) == { # type: ignore - "key": { - "0": {"x": 7, "y": "foo"}, - "a": {"z": "bar", "a": 42}, - } - } - - -def test_sub_key_same_as_key(test_span: Span): - write_to_opentelemetry_span(test_span, {"key": 7}, "key") - # NOTE: attributes cannot be None at this point - assert dict(test_span.attributes) == { # type: ignore - "key.key": 7, - } - assert read_from_opentelemetry_span(test_span, "key") == {"key": 7} - - -def 
test_read_nested_key(test_span: Span): - test_span.set_attributes({"key.x": 7, "key.y.z": "foo"}) - assert read_from_opentelemetry_span(span=test_span, key="key.y") == {"z": "foo"} - - -def test_write_read_sub_key(test_span: Span): - write_to_opentelemetry_span(test_span, {"x": 7, "y": "foo"}, "key") - assert read_from_opentelemetry_span(test_span, "key.x") == 7 - assert read_from_opentelemetry_span(test_span, "key.y") == "foo" - assert read_from_opentelemetry_span(test_span, "key") == {"x": 7, "y": "foo"} - - -def test_write_drops_dict_all_null_values(test_span: Span): - # GIVEN a test_span to which a value with null values is written - # NOTE: mypy complains about None value in the dict, but it is intentionally under test - write_to_opentelemetry_span(test_span, {"x": None, "y": None}, "key") # type: ignore - # WHEN reading the value from the span - # THEN the value is not present in the span attributes - assert "key" not in test_span.attributes # type: ignore - with pytest.raises(TypeError): - assert read_from_opentelemetry_span(test_span, "key") == {} - - -def test_write_drops_null_value_from_dict(test_span: Span): - # GIVEN a test_span to which a dict with some null values are written - # NOTE: mypy complains about None value in the dict, but it is intentionally under test - write_to_opentelemetry_span(test_span, {"x": 2, "y": None}, "key") # type: ignore - # WHEN reading the values from the span - # THEN the value with null value is not present in the span attributes - assert read_from_opentelemetry_span(test_span, "key") == {"x": 2} diff --git a/tests/utils/assets/models/__init__.py b/tests/utils/assets/models/__init__.py index 3a1c852e..2cf01263 100644 --- a/tests/utils/assets/models/__init__.py +++ b/tests/utils/assets/models/__init__.py @@ -5,7 +5,7 @@ from .circle import CircleParams from .object_with_defaults import ObjectWithDefaultsParams from .object_with_optional_field import ObjectWithOptionalFieldParams -from .shape import ShapeParams, Shape_CircleParams, Shape_SquareParams +from .shape import Shape_CircleParams, Shape_SquareParams, ShapeParams from .square import SquareParams from .undiscriminated_shape import UndiscriminatedShapeParams diff --git a/tests/utils/assets/models/circle.py b/tests/utils/assets/models/circle.py index 759fe3eb..6125ca54 100644 --- a/tests/utils/assets/models/circle.py +++ b/tests/utils/assets/models/circle.py @@ -3,6 +3,7 @@ # This file was auto-generated by Fern from our API Definition. import typing_extensions + from humanloop.core.serialization import FieldMetadata diff --git a/tests/utils/assets/models/object_with_defaults.py b/tests/utils/assets/models/object_with_defaults.py index ef14f7b2..a977b1d2 100644 --- a/tests/utils/assets/models/object_with_defaults.py +++ b/tests/utils/assets/models/object_with_defaults.py @@ -3,7 +3,6 @@ # This file was auto-generated by Fern from our API Definition. import typing_extensions -import typing_extensions class ObjectWithDefaultsParams(typing_extensions.TypedDict): diff --git a/tests/utils/assets/models/object_with_optional_field.py b/tests/utils/assets/models/object_with_optional_field.py index dc3e3eb7..e4ffe724 100644 --- a/tests/utils/assets/models/object_with_optional_field.py +++ b/tests/utils/assets/models/object_with_optional_field.py @@ -2,15 +2,17 @@ # This file was auto-generated by Fern from our API Definition. 
-import typing_extensions -import typing -from humanloop.core.serialization import FieldMetadata import datetime as dt +import typing import uuid + +import typing_extensions from .color import Color from .shape import ShapeParams from .undiscriminated_shape import UndiscriminatedShapeParams +from humanloop.core.serialization import FieldMetadata + class ObjectWithOptionalFieldParams(typing_extensions.TypedDict): literal: typing.Literal["lit_one"] diff --git a/tests/utils/assets/models/shape.py b/tests/utils/assets/models/shape.py index 540ccabd..56394d93 100644 --- a/tests/utils/assets/models/shape.py +++ b/tests/utils/assets/models/shape.py @@ -3,8 +3,11 @@ # This file was auto-generated by Fern from our API Definition. from __future__ import annotations -import typing_extensions + import typing + +import typing_extensions + from humanloop.core.serialization import FieldMetadata diff --git a/tests/utils/assets/models/square.py b/tests/utils/assets/models/square.py index da4a2111..3f25005d 100644 --- a/tests/utils/assets/models/square.py +++ b/tests/utils/assets/models/square.py @@ -3,6 +3,7 @@ # This file was auto-generated by Fern from our API Definition. import typing_extensions + from humanloop.core.serialization import FieldMetadata diff --git a/tests/utils/assets/models/undiscriminated_shape.py b/tests/utils/assets/models/undiscriminated_shape.py index 68876a23..99f12b30 100644 --- a/tests/utils/assets/models/undiscriminated_shape.py +++ b/tests/utils/assets/models/undiscriminated_shape.py @@ -3,6 +3,7 @@ # This file was auto-generated by Fern from our API Definition. import typing + from .circle import CircleParams from .square import SquareParams diff --git a/tests/utils/test_serialization.py b/tests/utils/test_serialization.py index 2ad8e1b5..40cc847b 100644 --- a/tests/utils/test_serialization.py +++ b/tests/utils/test_serialization.py @@ -2,10 +2,10 @@ from typing import Any, List -from humanloop.core.serialization import convert_and_respect_annotation_metadata - from .assets.models import ObjectWithOptionalFieldParams, ShapeParams +from humanloop.core.serialization import convert_and_respect_annotation_metadata + UNION_TEST: ShapeParams = {"radius_measurement": 1.0, "shape_type": "circle", "id": "1"} UNION_TEST_CONVERTED = {"shapeType": "circle", "radiusMeasurement": 1.0, "id": "1"} From 48aba5702876e3c36fa89de8e9059999d27ddf11 Mon Sep 17 00:00:00 2001 From: Ale Pouroullis Date: Tue, 13 May 2025 19:09:07 +0100 Subject: [PATCH 4/6] Update .fernignore with custom code dirs related to syncing --- .fernignore | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.fernignore b/.fernignore index 112f779b..fd7adc81 100644 --- a/.fernignore +++ b/.fernignore @@ -13,10 +13,13 @@ mypy.ini README.md src/humanloop/decorators src/humanloop/otel +src/humanloop/sync +src/humanloop/cli +pytest.ini ## Tests -tests/ +tests/custom ## CI From 0b5e426eeb7c8972bbedda3340b13c2948a91b1f Mon Sep 17 00:00:00 2001 From: Ale Pouroullis Date: Wed, 14 May 2025 10:48:15 +0100 Subject: [PATCH 5/6] Fix mypy errors for python 3.9 --- src/humanloop/client.py | 2 +- src/humanloop/sync/sync_client.py | 14 ++++---- tests/custom/integration/conftest.py | 49 +++++++++++---------------- tests/custom/integration/test_sync.py | 21 +++++++----- 4 files changed, 40 insertions(+), 46 deletions(-) diff --git a/src/humanloop/client.py b/src/humanloop/client.py index fce02a98..ab6b2abc 100644 --- a/src/humanloop/client.py +++ b/src/humanloop/client.py @@ -395,7 +395,7 @@ def agent(): attributes=attributes, ) - 
def pull(self, path: str | None = None, environment: str | None = None) -> Tuple[List[str], List[str]]: + def pull(self, path: Optional[str] = None, environment: Optional[str] = None) -> Tuple[List[str], List[str]]: """Pull Prompt and Agent files from Humanloop to local filesystem. This method will: diff --git a/src/humanloop/sync/sync_client.py b/src/humanloop/sync/sync_client.py index d71f1568..b1cf091a 100644 --- a/src/humanloop/sync/sync_client.py +++ b/src/humanloop/sync/sync_client.py @@ -1,6 +1,6 @@ import logging from pathlib import Path -from typing import List, Tuple, TYPE_CHECKING +from typing import List, Optional, Tuple, TYPE_CHECKING, Union from functools import lru_cache import typing import time @@ -203,7 +203,7 @@ def _save_serialized_file( logger.error(f"Failed to write {file_type} {file_path} to disk: {str(e)}") raise - def _pull_file(self, path: str, environment: str | None = None) -> bool: + def _pull_file(self, path: str, environment: Optional[str] = None) -> bool: """Pull a specific file from Humanloop to local filesystem. Returns: @@ -236,8 +236,8 @@ def _pull_file(self, path: str, environment: str | None = None) -> bool: def _pull_directory( self, - path: str | None = None, - environment: str | None = None, + path: Optional[str] = None, + environment: Optional[str] = None, ) -> Tuple[List[str], List[str]]: """Sync Prompt and Agent files from Humanloop to local filesystem. @@ -316,7 +316,7 @@ def _pull_directory( return successful_files, failed_files - def pull(self, path: str | None = None, environment: str | None = None) -> Tuple[List[str], List[str]]: + def pull(self, path: Optional[str] = None, environment: Optional[str] = None) -> Tuple[List[str], List[str]]: """Pull files from Humanloop to local filesystem. If the path ends with .prompt or .agent, pulls that specific file. 
@@ -343,7 +343,9 @@ def pull(self, path: str | None = None, environment: str | None = None) -> Tuple ) try: - if normalized_path is None or path is None: # path being None means normalized_path is None, but we check both for improved type safety + if ( + normalized_path is None or path is None + ): # path being None means normalized_path is None, but we check both for improved type safety # Pull all files from the root logger.debug("Pulling all files from root") successful_files, failed_files = self._pull_directory( diff --git a/tests/custom/integration/conftest.py b/tests/custom/integration/conftest.py index 25dc441f..039b0f1c 100644 --- a/tests/custom/integration/conftest.py +++ b/tests/custom/integration/conftest.py @@ -1,10 +1,10 @@ -import io import os import time +import typing import uuid -from contextlib import contextmanager, redirect_stdout +from collections.abc import Generator from dataclasses import dataclass -from typing import ContextManager, Generator, List, TextIO, Union +from typing import Union import dotenv import pytest @@ -21,17 +21,6 @@ class ResourceIdentifiers: file_path: str -@pytest.fixture() -def capture_stdout() -> ContextManager[TextIO]: - @contextmanager - def _context_manager(): - f = io.StringIO() - with redirect_stdout(f): - yield f - - return _context_manager # type: ignore [return-value] - - @pytest.fixture(scope="session") def openai_key() -> str: dotenv.load_dotenv() @@ -44,26 +33,26 @@ def openai_key() -> str: def sdk_test_dir(get_humanloop_client: GetHumanloopClientFn) -> Generator[str, None, None]: humanloop_client = get_humanloop_client() + def _get_subclient(file_type: str): + try: + return { + "agent": humanloop_client.agents, + "prompt": humanloop_client.prompts, + "dataset": humanloop_client.datasets, + "evaluator": humanloop_client.evaluators, + "flow": humanloop_client.flows, + "tool": humanloop_client.tools, + }[file_type] + except KeyError: + raise NotImplementedError(f"Unknown file type: {file_type}") + def cleanup_directory(directory_id: str): directory_response = humanloop_client.directories.get(id=directory_id) for subdirectory in directory_response.subdirectories: cleanup_directory(subdirectory.id) for file in directory_response.files: - match file.type: - case "agent": - humanloop_client.agents.delete(id=file.id) - case "prompt": - humanloop_client.prompts.delete(id=file.id) - case "dataset": - humanloop_client.datasets.delete(id=file.id) - case "evaluator": - humanloop_client.evaluators.delete(id=file.id) - case "flow": - humanloop_client.flows.delete(id=file.id) - case "tool": - humanloop_client.tools.delete(id=file.id) - case _: - raise ValueError(f"Unknown file type: {file.type}") + subclient = _get_subclient(typing.cast(str, file.type)) + subclient.delete(id=file.id) humanloop_client.directories.delete(id=directory_response.id) path = f"SDK_INTEGRATION_TEST_{uuid.uuid4()}" @@ -211,7 +200,7 @@ def syncable_files_fixture( sdk_test_dir: str, ) -> Generator[list[SyncableFile], None, None]: """Creates a predefined structure of files in Humanloop for testing sync.""" - files: List[SyncableFile] = [ + files: list[SyncableFile] = [ SyncableFile( path="prompts/gpt-4", type="prompt", diff --git a/tests/custom/integration/test_sync.py b/tests/custom/integration/test_sync.py index 80e332a4..8b33f7a4 100644 --- a/tests/custom/integration/test_sync.py +++ b/tests/custom/integration/test_sync.py @@ -1,3 +1,4 @@ +import typing from pathlib import Path from typing import List, Union @@ -80,15 +81,17 @@ def test_overload_with_local_files( # WHEN 
calling with an invalid path # THEN it should raise HumanloopRuntimeError with pytest.raises(HumanloopRuntimeError): - sub_client: Union[PromptsClient, AgentsClient] - match test_file.type: - case "prompt": - sub_client = humanloop_client.prompts - case "agent": - sub_client = humanloop_client.agents - case _: - raise ValueError(f"Invalid file type: {test_file.type}") - sub_client.call(path="invalid/path") + try: + sub_client: Union[PromptsClient, AgentsClient] = typing.cast( + Union[PromptsClient, AgentsClient], + { + "prompt": humanloop_client.prompts, + "agent": humanloop_client.agents, + }[test_file.type], + ) + sub_client.call(path="invalid/path") + except KeyError: + raise NotImplementedError(f"Unknown file type: {test_file.type}") def test_overload_log_with_local_files( From 79e1d0db9f9d54dbf4f2ff05095324d281cfe7e1 Mon Sep 17 00:00:00 2001 From: Ale Pouroullis Date: Wed, 14 May 2025 16:53:03 +0100 Subject: [PATCH 6/6] chore: clean up import order in cli --- src/humanloop/cli/__main__.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/humanloop/cli/__main__.py b/src/humanloop/cli/__main__.py index ad582bbc..3ab53cfb 100644 --- a/src/humanloop/cli/__main__.py +++ b/src/humanloop/cli/__main__.py @@ -1,13 +1,15 @@ -import click import logging -from typing import Optional, Callable -from functools import wraps -from dotenv import load_dotenv import os import sys +import time +from functools import wraps +from typing import Callable, Optional + +import click +from dotenv import load_dotenv + from humanloop import Humanloop from humanloop.sync.sync_client import SyncClient -import time # Set up logging logger = logging.getLogger(__name__)
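
Note: the sketch below is not part of the patch series; it is a minimal illustration of how the pull() method added in this series (src/humanloop/client.py delegating to SyncClient.pull, returning a (successful_files, failed_files) pair) might be called. The HUMANLOOP_API_KEY environment variable, the "prompts/gpt-4.prompt" path, and the "staging" environment name are assumptions for illustration only, not values defined by the patches.

    import os

    from humanloop.client import Humanloop

    # Assumes HUMANLOOP_API_KEY is set in the environment.
    client = Humanloop(api_key=os.environ["HUMANLOOP_API_KEY"])

    # Pull all Prompt and Agent files from Humanloop to the local filesystem.
    successful, failed = client.pull()

    # Or pull a single serialized file (paths ending in .prompt or .agent address
    # one file), optionally resolving the version deployed to an environment.
    successful, failed = client.pull(path="prompts/gpt-4.prompt", environment="staging")

    if failed:
        raise RuntimeError(f"Failed to pull {len(failed)} file(s): {failed}")
    print(f"Pulled {len(successful)} file(s)")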