From d9176724edf933245a079c9255583d8ebdc9e8d9 Mon Sep 17 00:00:00 2001 From: fern-api <115122769+fern-api[bot]@users.noreply.github.com> Date: Tue, 13 May 2025 17:35:47 +0000 Subject: [PATCH 1/6] Release 0.8.39 From 99461727fc9f6758aaa85dea057764a05ce4e4f0 Mon Sep 17 00:00:00 2001 From: Ale Pouroullis Date: Tue, 13 May 2025 18:54:33 +0100 Subject: [PATCH 2/6] Add custom code on top of autogenerated SDK --- .gitignore | 2 + pytest.ini | 2 + src/humanloop/cli/__init__.py | 0 src/humanloop/cli/__main__.py | 248 ++++++++++++ src/humanloop/client.py | 107 ++++- src/humanloop/overload.py | 274 ++++++++----- src/humanloop/sync/__init__.py | 3 + src/humanloop/sync/sync_client.py | 374 ++++++++++++++++++ tests/custom/README.md | 19 + tests/custom/__init__.py | 0 tests/custom/assets/exact_match.py | 16 + tests/custom/assets/levenshtein.py | 99 +++++ tests/custom/conftest.py | 170 ++++++++ tests/custom/integration/__init__.py | 0 tests/custom/integration/conftest.py | 259 ++++++++++++ tests/custom/integration/test_decorators.py | 153 ++++++++ tests/custom/integration/test_evals.py | 411 ++++++++++++++++++++ tests/custom/integration/test_sync.py | 206 ++++++++++ tests/custom/integration/test_sync_cli.py | 179 +++++++++ tests/custom/otel/__init__.py | 0 tests/custom/otel/test_helpers.py | 172 ++++++++ tests/custom/sync/__init__.py | 0 tests/custom/sync/test_client.py | 126 ++++++ tests/custom/types.py | 15 + 24 files changed, 2736 insertions(+), 99 deletions(-) create mode 100644 pytest.ini create mode 100644 src/humanloop/cli/__init__.py create mode 100644 src/humanloop/cli/__main__.py create mode 100644 src/humanloop/sync/__init__.py create mode 100644 src/humanloop/sync/sync_client.py create mode 100644 tests/custom/README.md create mode 100644 tests/custom/__init__.py create mode 100644 tests/custom/assets/exact_match.py create mode 100644 tests/custom/assets/levenshtein.py create mode 100644 tests/custom/conftest.py create mode 100644 tests/custom/integration/__init__.py create mode 100644 tests/custom/integration/conftest.py create mode 100644 tests/custom/integration/test_decorators.py create mode 100644 tests/custom/integration/test_evals.py create mode 100644 tests/custom/integration/test_sync.py create mode 100644 tests/custom/integration/test_sync_cli.py create mode 100644 tests/custom/otel/__init__.py create mode 100644 tests/custom/otel/test_helpers.py create mode 100644 tests/custom/sync/__init__.py create mode 100644 tests/custom/sync/test_client.py create mode 100644 tests/custom/types.py diff --git a/.gitignore b/.gitignore index a55ede77..f5cda9d9 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,5 @@ poetry.toml .env tests/assets/*.jsonl tests/assets/*.parquet +# Ignore humanloop directory which could mistakenly be committed when testing sync functionality as it's used as the default sync directory +humanloop diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 00000000..8ab80e5d --- /dev/null +++ b/pytest.ini @@ -0,0 +1,2 @@ +[pytest] +addopts = -n auto diff --git a/src/humanloop/cli/__init__.py b/src/humanloop/cli/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/humanloop/cli/__main__.py b/src/humanloop/cli/__main__.py new file mode 100644 index 00000000..ad582bbc --- /dev/null +++ b/src/humanloop/cli/__main__.py @@ -0,0 +1,248 @@ +import click +import logging +from typing import Optional, Callable +from functools import wraps +from dotenv import load_dotenv +import os +import sys +from humanloop import Humanloop +from 
humanloop.sync.sync_client import SyncClient +import time + +# Set up logging +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) # Set back to INFO level +console_handler = logging.StreamHandler() +formatter = logging.Formatter("%(message)s") # Simplified formatter +console_handler.setFormatter(formatter) +if not logger.hasHandlers(): + logger.addHandler(console_handler) + +# Color constants +SUCCESS_COLOR = "green" +ERROR_COLOR = "red" +INFO_COLOR = "blue" +WARNING_COLOR = "yellow" + + +def load_api_key(env_file: Optional[str] = None) -> str: + """Load API key from .env file or environment variable. + + Args: + env_file: Optional path to .env file + + Returns: + str: The loaded API key + + Raises: + click.ClickException: If no API key is found + """ + # Try specific .env file if provided, otherwise default to .env in current directory + if env_file: + if not load_dotenv(env_file): # load_dotenv returns False if file not found/invalid + raise click.ClickException( + click.style( + f"Failed to load environment file: {env_file} (file not found or invalid format)", + fg=ERROR_COLOR, + ) + ) + else: + load_dotenv() # Attempt to load from default .env in current directory + + # Get API key from environment + api_key = os.getenv("HUMANLOOP_API_KEY") + if not api_key: + raise click.ClickException( + click.style( + "No API key found. Set HUMANLOOP_API_KEY in .env file or environment, or use --api-key", fg=ERROR_COLOR + ) + ) + + return api_key + + +def get_client( + api_key: Optional[str] = None, env_file: Optional[str] = None, base_url: Optional[str] = None +) -> Humanloop: + """Instantiate a Humanloop client for the CLI. + + Args: + api_key: Optional API key provided directly + env_file: Optional path to .env file + base_url: Optional base URL for the API + + Returns: + Humanloop: Configured client instance + + Raises: + click.ClickException: If no API key is found + """ + if not api_key: + api_key = load_api_key(env_file) + return Humanloop(api_key=api_key, base_url=base_url) + + +def common_options(f: Callable) -> Callable: + """Decorator for common CLI options.""" + + @click.option( + "--api-key", + help="Humanloop API key. If not provided, uses HUMANLOOP_API_KEY from .env or environment.", + default=None, + show_default=False, + ) + @click.option( + "--env-file", + help="Path to .env file. If not provided, looks for .env in current directory.", + default=None, + type=click.Path(exists=True), + show_default=False, + ) + @click.option( + "--local-files-directory", + "--local-dir", + help="Directory (relative to the current working directory) where Humanloop files are stored locally (default: humanloop/).", + default="humanloop", + type=click.Path(), + ) + @click.option( + "--base-url", + default=None, + hidden=True, + ) + @wraps(f) + def wrapper(*args, **kwargs): + return f(*args, **kwargs) + + return wrapper + + +def handle_sync_errors(f: Callable) -> Callable: + """Decorator for handling sync operation errors. + + If an error occurs in any operation that uses this decorator, it will be logged and the program will exit with a non-zero exit code. 
+ """ + + @wraps(f) + def wrapper(*args, **kwargs): + try: + return f(*args, **kwargs) + except Exception as e: + click.echo(click.style(str(f"Error: {e}"), fg=ERROR_COLOR)) + sys.exit(1) + + return wrapper + + +@click.group( + help="Humanloop CLI for managing sync operations.", + context_settings={ + "help_option_names": ["-h", "--help"], + "max_content_width": 100, + }, +) +def cli(): # Does nothing because used as a group for other subcommands (pull, push, etc.) + """Humanloop CLI for managing sync operations.""" + pass + + +@cli.command() +@click.option( + "--path", + "-p", + help="Path in the Humanloop workspace to pull from (file or directory). You can pull an entire directory (e.g. 'my/directory') " + "or a specific file (e.g. 'my/directory/my_prompt.prompt'). When pulling a directory, all files within that directory and its subdirectories will be included. " + "If not specified, pulls from the root of the remote workspace.", + default=None, +) +@click.option( + "--environment", + "-e", + help="Environment to pull from (e.g. 'production', 'staging')", + default=None, +) +@click.option( + "--verbose", + "-v", + is_flag=True, + help="Show detailed information about the operation", +) +@click.option( + "--quiet", + "-q", + is_flag=True, + help="Suppress output of successful files", +) +@handle_sync_errors +@common_options +def pull( + path: Optional[str], + environment: Optional[str], + api_key: Optional[str], + env_file: Optional[str], + local_files_directory: str, + base_url: Optional[str], + verbose: bool, + quiet: bool, +): + """Pull Prompt and Agent files from Humanloop to your local filesystem. + + \b + This command will: + 1. Fetch Prompt and Agent files from your Humanloop workspace + 2. Save them to your local filesystem (directory specified by --local-files-directory, default: humanloop/) + 3. Maintain the same directory structure as in Humanloop + 4. Add appropriate file extensions (.prompt or .agent) + + \b + For example, with the default --local-files-directory=humanloop, files will be saved as: + ./humanloop/ + ├── my_project/ + │ ├── prompts/ + │ │ ├── my_prompt.prompt + │ │ └── nested/ + │ │ └── another_prompt.prompt + │ └── agents/ + │ └── my_agent.agent + └── another_project/ + └── prompts/ + └── other_prompt.prompt + + \b + If you specify --local-files-directory=data/humanloop, files will be saved in ./data/humanloop/ instead. + + If a file exists both locally and in the Humanloop workspace, the local file will be overwritten + with the version from Humanloop. Files that only exist locally will not be affected. + + Currently only supports syncing Prompt and Agent files. 
Other file types will be skipped.""" + client = get_client(api_key, env_file, base_url) + sync_client = SyncClient( + client, base_dir=local_files_directory, log_level=logging.DEBUG if verbose else logging.WARNING + ) + + click.echo(click.style("Pulling files from Humanloop...", fg=INFO_COLOR)) + click.echo(click.style(f"Path: {path or '(root)'}", fg=INFO_COLOR)) + click.echo(click.style(f"Environment: {environment or '(default)'}", fg=INFO_COLOR)) + + start_time = time.time() + successful_files, failed_files = sync_client.pull(path, environment) + duration_ms = int((time.time() - start_time) * 1000) + + # Determine if the operation was successful based on failed_files + is_successful = not failed_files + duration_color = SUCCESS_COLOR if is_successful else ERROR_COLOR + click.echo(click.style(f"Pull completed in {duration_ms}ms", fg=duration_color)) + + if successful_files and not quiet: + click.echo(click.style(f"\nSuccessfully pulled {len(successful_files)} files:", fg=SUCCESS_COLOR)) + for file in successful_files: + click.echo(click.style(f" ✓ {file}", fg=SUCCESS_COLOR)) + + if failed_files: + click.echo(click.style(f"\nFailed to pull {len(failed_files)} files:", fg=ERROR_COLOR)) + for file in failed_files: + click.echo(click.style(f" ✗ {file}", fg=ERROR_COLOR)) + + +if __name__ == "__main__": + cli() diff --git a/src/humanloop/client.py b/src/humanloop/client.py index 74cd6c97..fce02a98 100644 --- a/src/humanloop/client.py +++ b/src/humanloop/client.py @@ -1,6 +1,7 @@ import os import typing -from typing import Any, List, Optional, Sequence +from typing import Any, List, Optional, Sequence, Tuple +import logging import httpx from opentelemetry.sdk.resources import Resource @@ -18,7 +19,7 @@ ) from humanloop.base_client import AsyncBaseHumanloop, BaseHumanloop -from humanloop.overload import overload_call, overload_log +from humanloop.overload import overload_client from humanloop.decorators.flow import flow as flow_decorator_factory from humanloop.decorators.prompt import prompt_decorator_factory from humanloop.decorators.tool import tool_decorator_factory as tool_decorator_factory @@ -29,6 +30,9 @@ from humanloop.otel.processor import HumanloopSpanProcessor from humanloop.prompt_utils import populate_template from humanloop.prompts.client import PromptsClient +from humanloop.sync.sync_client import SyncClient, DEFAULT_CACHE_SIZE + +logger = logging.getLogger("humanloop.sdk") class ExtendedEvalsClient(EvaluationsClient): @@ -87,8 +91,9 @@ class Humanloop(BaseHumanloop): """ See docstring of :class:`BaseHumanloop`. - This class extends the base client with custom evaluation utilities - and decorators for declaring Files in code. + This class extends the base client with custom evaluation utilities, + decorators for declaring Files in code, and utilities for syncing + files between Humanloop and local filesystem. """ def __init__( @@ -102,6 +107,9 @@ def __init__( httpx_client: typing.Optional[httpx.Client] = None, opentelemetry_tracer_provider: Optional[TracerProvider] = None, opentelemetry_tracer: Optional[Tracer] = None, + use_local_files: bool = False, + local_files_directory: str = "humanloop", + cache_size: int = DEFAULT_CACHE_SIZE, ): """ Extends the base client with custom evaluation utilities and @@ -111,6 +119,27 @@ def __init__( You can provide a TracerProvider and a Tracer to integrate with your existing telemetry system. If not provided, an internal TracerProvider will be used. 
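+
+        A minimal sketch of enabling local files (the path "samples/test" is
+        illustrative and assumes a file exists at "humanloop/samples/test.prompt"):
+
+        ```
+        client = Humanloop(api_key="...", use_local_files=True)
+        # The prompt definition is read from ./humanloop/samples/test.prompt
+        # rather than fetched from the Humanloop workspace.
+        response = client.prompts.call(
+            path="samples/test",
+            messages=[{"role": "user", "content": "Hello"}],
+        )
+        ```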
+ + Parameters + ---------- + base_url: Optional base URL for the API + environment: The environment to use (default: DEFAULT) + api_key: Your Humanloop API key (default: from HUMANLOOP_API_KEY env var) + timeout: Optional timeout for API requests + follow_redirects: Whether to follow redirects + httpx_client: Optional custom httpx client + opentelemetry_tracer_provider: Optional tracer provider for telemetry + opentelemetry_tracer: Optional tracer for telemetry + use_local_files: Whether to use local files for prompts and agents + local_files_directory: Base directory where local prompt and agent files are stored (default: "humanloop"). + This is relative to the current working directory. For example: + - "humanloop" will look for files in "./humanloop/" + - "data/humanloop" will look for files in "./data/humanloop/" + When using paths in the API, they must be relative to this directory. For example, + if local_files_directory="humanloop" and you have a file at "humanloop/samples/test.prompt", + you would reference it as "samples/test" in your code. + cache_size: Maximum number of files to cache when use_local_files is True (default: DEFAULT_CACHE_SIZE). + This parameter has no effect if use_local_files is False. """ super().__init__( base_url=base_url, @@ -121,6 +150,17 @@ def __init__( httpx_client=httpx_client, ) + self.use_local_files = use_local_files + + # Warn user if cache_size is non-default but use_local_files is False — has no effect and will therefore be ignored + if not self.use_local_files and cache_size != DEFAULT_CACHE_SIZE: + logger.warning( + f"The specified cache_size={cache_size} will have no effect because use_local_files=False. " + f"File caching is only active when local files are enabled." + ) + + # Check if cache_size is non-default but use_local_files is False + self._sync_client = SyncClient(client=self, base_dir=local_files_directory, cache_size=cache_size) eval_client = ExtendedEvalsClient(client_wrapper=self._client_wrapper) eval_client.client = self self.evaluations = eval_client @@ -128,10 +168,14 @@ def __init__( # Overload the .log method of the clients to be aware of Evaluation Context # and the @flow decorator providing the trace_id - self.prompts = overload_log(client=self.prompts) - self.prompts = overload_call(client=self.prompts) - self.flows = overload_log(client=self.flows) - self.tools = overload_log(client=self.tools) + self.prompts = overload_client( + client=self.prompts, sync_client=self._sync_client, use_local_files=self.use_local_files + ) + self.agents = overload_client( + client=self.agents, sync_client=self._sync_client, use_local_files=self.use_local_files + ) + self.flows = overload_client(client=self.flows) + self.tools = overload_client(client=self.tools) if opentelemetry_tracer_provider is not None: self._tracer_provider = opentelemetry_tracer_provider @@ -351,6 +395,53 @@ def agent(): attributes=attributes, ) + def pull(self, path: str | None = None, environment: str | None = None) -> Tuple[List[str], List[str]]: + """Pull Prompt and Agent files from Humanloop to local filesystem. + + This method will: + 1. Fetch Prompt and Agent files from your Humanloop workspace + 2. Save them to your local filesystem (directory specified by `local_files_directory`, default: "humanloop") + 3. Maintain the same directory structure as in Humanloop + 4. Add appropriate file extensions (`.prompt` or `.agent`) + + The path parameter can be used in two ways: + - If it points to a specific file (e.g. 
"path/to/file.prompt" or "path/to/file.agent"), only that file will be pulled + - If it points to a directory (e.g. "path/to/directory"), all Prompt and Agent files in that directory and its subdirectories will be pulled + - If no path is provided, all Prompt and Agent files will be pulled + + The operation will overwrite existing files with the latest version from Humanloop + but will not delete local files that don't exist in the remote workspace. + + Currently only supports syncing Prompt and Agent files. Other file types will be skipped. + + For example, with the default `local_files_directory="humanloop"`, files will be saved as: + ``` + ./humanloop/ + ├── my_project/ + │ ├── prompts/ + │ │ ├── my_prompt.prompt + │ │ └── nested/ + │ │ └── another_prompt.prompt + │ └── agents/ + │ └── my_agent.agent + └── another_project/ + └── prompts/ + └── other_prompt.prompt + ``` + + If you specify `local_files_directory="data/humanloop"`, files will be saved in ./data/humanloop/ instead. + + :param path: Optional path to either a specific file (e.g. "path/to/file.prompt") or a directory (e.g. "path/to/directory"). + If not provided, all Prompt and Agent files will be pulled. + :param environment: The environment to pull the files from. + :return: Tuple of two lists: + - First list contains paths of successfully synced files + - Second list contains paths of files that failed to sync (due to API errors, missing content, + or filesystem issues) + :raises HumanloopRuntimeError: If there's an error communicating with the API + """ + return self._sync_client.pull(environment=environment, path=path) + class AsyncHumanloop(AsyncBaseHumanloop): """ diff --git a/src/humanloop/overload.py b/src/humanloop/overload.py index b0c83215..92c83e6b 100644 --- a/src/humanloop/overload.py +++ b/src/humanloop/overload.py @@ -1,54 +1,69 @@ import inspect import logging import types -from typing import TypeVar, Union +from typing import Any, Dict, Optional, Union, Callable from humanloop.context import ( get_decorator_context, get_evaluation_context, get_trace_id, ) -from humanloop.evals.run import HumanloopRuntimeError - -from humanloop.evaluators.client import EvaluatorsClient -from humanloop.flows.client import FlowsClient +from humanloop.error import HumanloopRuntimeError +from humanloop.sync.sync_client import SyncClient from humanloop.prompts.client import PromptsClient +from humanloop.flows.client import FlowsClient +from humanloop.datasets.client import DatasetsClient +from humanloop.agents.client import AgentsClient from humanloop.tools.client import ToolsClient +from humanloop.evaluators.client import EvaluatorsClient +from humanloop.types import FileType from humanloop.types.create_evaluator_log_response import CreateEvaluatorLogResponse from humanloop.types.create_flow_log_response import CreateFlowLogResponse from humanloop.types.create_prompt_log_response import CreatePromptLogResponse from humanloop.types.create_tool_log_response import CreateToolLogResponse from humanloop.types.prompt_call_response import PromptCallResponse +from humanloop.types.agent_call_response import AgentCallResponse logger = logging.getLogger("humanloop.sdk") - -CLIENT_TYPE = TypeVar("CLIENT_TYPE", PromptsClient, FlowsClient, EvaluatorsClient, ToolsClient) - - -def overload_log(client: CLIENT_TYPE) -> CLIENT_TYPE: - """ - Wrap the `log` method of the provided Humanloop client to use EVALUATION_CONTEXT. - - This makes the overloaded log actions be aware of whether the created Log is - part of an Evaluation (e.g. 
one started by eval_utils.run_eval). - """ - # Copy the original log method in a hidden attribute - client._log = client.log # type: ignore [attr-defined] - - def _overload_log( - # It's safe to only consider kwargs since the original - # log method bans positional arguments - self, - **kwargs, - ) -> Union[ - CreatePromptLogResponse, - CreateToolLogResponse, - CreateFlowLogResponse, - CreateEvaluatorLogResponse, - ]: - trace_id = get_trace_id() - if trace_id is not None and type(client) is FlowsClient: +LogResponseType = Union[ + CreatePromptLogResponse, + CreateToolLogResponse, + CreateFlowLogResponse, + CreateEvaluatorLogResponse, +] + +CallResponseType = Union[ + PromptCallResponse, + AgentCallResponse, +] + + +def _get_file_type_from_client( + client: Union[PromptsClient, AgentsClient, ToolsClient, FlowsClient, DatasetsClient, EvaluatorsClient], +) -> FileType: + """Get the file type based on the client type.""" + if isinstance(client, PromptsClient): + return "prompt" + elif isinstance(client, AgentsClient): + return "agent" + elif isinstance(client, ToolsClient): + return "tool" + elif isinstance(client, FlowsClient): + return "flow" + elif isinstance(client, DatasetsClient): + return "dataset" + elif isinstance(client, EvaluatorsClient): + return "evaluator" + + raise ValueError(f"Unsupported client type: {type(client)}") + + +def _handle_tracing_context(kwargs: Dict[str, Any], client: Any) -> Dict[str, Any]: + """Handle tracing context for both log and call methods.""" + trace_id = get_trace_id() + if trace_id is not None: + if "flow" in str(type(client).__name__).lower(): context = get_decorator_context() if context is None: raise HumanloopRuntimeError("Internal error: trace_id context is set outside a decorator context.") @@ -56,69 +71,146 @@ def _overload_log( f"Using `flows.log()` is not allowed: Flow decorator " f"for File {context.path} manages the tracing and trace completion." 
) - if trace_id is not None: - if "trace_parent_id" in kwargs: - logger.warning( - "Ignoring trace_parent_id argument at line %d: the Flow decorator manages tracing.", - inspect.currentframe().f_lineno, # type: ignore [union-attr] - ) - kwargs = { - **kwargs, - "trace_parent_id": trace_id, - } - evaluation_context = get_evaluation_context() - if evaluation_context is not None: - kwargs_eval, eval_callback = evaluation_context.log_args_with_context( - path=kwargs.get("path"), log_args=kwargs - ) - try: - response = self._log(**kwargs_eval) - except Exception as e: - # Re-raising as HumanloopDecoratorError so the decorators don't catch it - raise HumanloopRuntimeError from e - if eval_callback is not None: - eval_callback(response.id) - else: - try: - response = self._log(**kwargs) - except Exception as e: - # Re-raising as HumanloopDecoratorError so the decorators don't catch it - raise HumanloopRuntimeError from e - - return response - # Replace the original log method with the overloaded one - client.log = types.MethodType(_overload_log, client) # type: ignore [assignment] - # Return the client with the overloaded log method - logger.debug("Overloaded the .call method of %s", client) - return client + if "trace_parent_id" in kwargs: + logger.warning( + "Ignoring trace_parent_id argument at line %d: the Flow decorator manages tracing.", + inspect.currentframe().f_lineno, # type: ignore[union-attr] + ) + kwargs = { + **kwargs, + "trace_parent_id": trace_id, + } + return kwargs + + +def _handle_local_files( + kwargs: Dict[str, Any], + client: Any, + sync_client: Optional[SyncClient], + use_local_files: bool, +) -> Dict[str, Any]: + """Handle local file loading if enabled.""" + if not use_local_files or "path" not in kwargs or sync_client is None: + return kwargs + + if "id" in kwargs: + raise HumanloopRuntimeError("Can only specify one of `id` or `path`") + + # Check if version_id or environment is specified + use_remote = any(["version_id" in kwargs, "environment" in kwargs]) + normalized_path = sync_client._normalize_path(kwargs["path"]) + + if use_remote: + raise HumanloopRuntimeError( + f"Cannot use local file for `{normalized_path}` as version_id or environment was specified. " + "Please either remove version_id/environment to use local files, or set use_local_files=False to use remote files." + ) + + file_type = _get_file_type_from_client(client) + if file_type not in SyncClient.SERIALIZABLE_FILE_TYPES: + raise HumanloopRuntimeError(f"Local files are not supported for `{file_type}` files.") + + # If file_type is already specified in kwargs, it means user provided a PromptKernelRequestParams object + if file_type in kwargs and not isinstance(kwargs[file_type], str): + logger.warning( + f"Ignoring local file for `{normalized_path}` as {file_type} parameters were directly provided. " + "Using provided parameters instead." 
+ ) + return kwargs + + try: + file_content = sync_client.get_file_content(normalized_path, file_type) # type: ignore[arg-type] # file_type was checked above + kwargs[file_type] = file_content + except HumanloopRuntimeError as e: + raise HumanloopRuntimeError(f"Failed to use local file for `{normalized_path}`: {str(e)}") + + return kwargs + + +def _handle_evaluation_context(kwargs: Dict[str, Any]) -> tuple[Dict[str, Any], Optional[Callable[[str], None]]]: + """Handle evaluation context for logging.""" + evaluation_context = get_evaluation_context() + if evaluation_context is not None: + return evaluation_context.log_args_with_context(path=kwargs.get("path"), log_args=kwargs) + return kwargs, None + + +def _overload_log(self: Any, sync_client: Optional[SyncClient], use_local_files: bool, **kwargs) -> LogResponseType: + try: + # Special handling for flows - prevent direct log usage + if type(self) is FlowsClient and get_trace_id() is not None: + context = get_decorator_context() + if context is None: + raise HumanloopRuntimeError("Internal error: trace_id context is set outside a decorator context.") + raise HumanloopRuntimeError( + f"Using `flows.log()` is not allowed: Flow decorator " + f"for File {context.path} manages the tracing and trace completion." + ) + kwargs = _handle_tracing_context(kwargs, self) -def overload_call(client: PromptsClient) -> PromptsClient: - client._call = client.call # type: ignore [attr-defined] - - def _overload_call(self, **kwargs) -> PromptCallResponse: - # None if not logging inside a decorator - trace_id = get_trace_id() - if trace_id is not None: - if "trace_parent_id" in kwargs: - logger.warning( - "Ignoring trace_parent_id argument at line %d: the Flow decorator manages tracing.", - inspect.currentframe().f_lineno, # type: ignore [union-attr] - ) - kwargs = { - **kwargs, - "trace_parent_id": trace_id, - } - - try: - response = self._call(**kwargs) - except Exception as e: - # Re-raising as HumanloopDecoratorError so the decorators don't catch it - raise HumanloopRuntimeError from e + # Handle local files for Prompts and Agents clients + if _get_file_type_from_client(self) in ["prompt", "agent"]: + if sync_client is None: + logger.error("sync_client is None but client has log method and use_local_files=%s", use_local_files) + raise HumanloopRuntimeError("sync_client is required for clients that support local file operations") + kwargs = _handle_local_files(kwargs, self, sync_client, use_local_files) + kwargs, eval_callback = _handle_evaluation_context(kwargs) + response = self._log(**kwargs) # Use stored original method + if eval_callback is not None: + eval_callback(response.id) return response + except HumanloopRuntimeError: + # Re-raise HumanloopRuntimeError without wrapping to preserve the message + raise + except Exception as e: + # Only wrap non-HumanloopRuntimeError exceptions + raise HumanloopRuntimeError from e + + +def _overload_call(self: Any, sync_client: Optional[SyncClient], use_local_files: bool, **kwargs) -> CallResponseType: + try: + kwargs = _handle_tracing_context(kwargs, self) + kwargs = _handle_local_files(kwargs, self, sync_client, use_local_files) + return self._call(**kwargs) # Use stored original method + except HumanloopRuntimeError: + # Re-raise HumanloopRuntimeError without wrapping to preserve the message + raise + except Exception as e: + # Only wrap non-HumanloopRuntimeError exceptions + raise HumanloopRuntimeError from e + + +def overload_client( + client: Any, + sync_client: Optional[SyncClient] = None, + use_local_files: 
bool = False, +) -> Any: + """Overloads client methods to add tracing, local file handling, and evaluation context.""" + # Store original log method as _log for all clients. Used in flow decorator + if hasattr(client, "log") and not hasattr(client, "_log"): + client._log = client.log # type: ignore[attr-defined] + + # Create a closure to capture sync_client and use_local_files + def log_wrapper(self: Any, **kwargs) -> LogResponseType: + return _overload_log(self, sync_client, use_local_files, **kwargs) + + client.log = types.MethodType(log_wrapper, client) + + # Overload call method for Prompt and Agent clients + if _get_file_type_from_client(client) in ["prompt", "agent"]: + if sync_client is None and use_local_files: + logger.error("sync_client is None but client has call method and use_local_files=%s", use_local_files) + raise HumanloopRuntimeError("sync_client is required for clients that support call operations") + if hasattr(client, "call") and not hasattr(client, "_call"): + client._call = client.call # type: ignore[attr-defined] + + # Create a closure to capture sync_client and use_local_files + def call_wrapper(self: Any, **kwargs) -> CallResponseType: + return _overload_call(self, sync_client, use_local_files, **kwargs) + + client.call = types.MethodType(call_wrapper, client) - # Replace the original log method with the overloaded one - client.call = types.MethodType(_overload_call, client) # type: ignore [assignment] return client diff --git a/src/humanloop/sync/__init__.py b/src/humanloop/sync/__init__.py new file mode 100644 index 00000000..007659df --- /dev/null +++ b/src/humanloop/sync/__init__.py @@ -0,0 +1,3 @@ +from humanloop.sync.sync_client import SyncClient + +__all__ = ["SyncClient"] diff --git a/src/humanloop/sync/sync_client.py b/src/humanloop/sync/sync_client.py new file mode 100644 index 00000000..d71f1568 --- /dev/null +++ b/src/humanloop/sync/sync_client.py @@ -0,0 +1,374 @@ +import logging +from pathlib import Path +from typing import List, Tuple, TYPE_CHECKING +from functools import lru_cache +import typing +import time +from humanloop.error import HumanloopRuntimeError +import json + +if TYPE_CHECKING: + from humanloop.base_client import BaseHumanloop + +# Set up logging +logger = logging.getLogger("humanloop.sdk.sync") +logger.setLevel(logging.INFO) +console_handler = logging.StreamHandler() +formatter = logging.Formatter("%(message)s") +console_handler.setFormatter(formatter) +if not logger.hasHandlers(): + logger.addHandler(console_handler) + +# Default cache size for file content caching +DEFAULT_CACHE_SIZE = 100 + + +def format_api_error(error: Exception) -> str: + """Format API error messages to be more user-friendly.""" + error_msg = str(error) + if "status_code" not in error_msg or "body" not in error_msg: + return error_msg + + try: + # Extract the body part and parse as JSON + body_str = error_msg.split("body: ")[1] + # Convert Python dict string to valid JSON by: + # 1. Escaping double quotes + # 2. 
Replacing single quotes with double quotes + body_str = body_str.replace('"', '\\"').replace("'", '"') + body = json.loads(body_str) + + # Get the detail from the body + detail = body.get("detail", {}) + + # Handle both string and dictionary types for detail + if isinstance(detail, str): + return detail + elif isinstance(detail, dict): + return detail.get("description") or detail.get("msg") or error_msg + else: + return error_msg + except Exception as e: + logger.debug(f"Failed to parse error message: {str(e)}") + return error_msg + + +SerializableFileType = typing.Literal["prompt", "agent"] + + +class SyncClient: + """Client for managing synchronization between local filesystem and Humanloop. + + This client provides file synchronization between Humanloop and the local filesystem, + with built-in caching for improved performance. The cache uses Python's LRU (Least + Recently Used) cache to automatically manage memory usage by removing least recently + accessed files when the cache is full. + + The cache is automatically updated when files are pulled or saved, and can be + manually cleared using the clear_cache() method. + """ + + # File types that can be serialized to/from the filesystem + SERIALIZABLE_FILE_TYPES = frozenset(typing.get_args(SerializableFileType)) + + def __init__( + self, + client: "BaseHumanloop", + base_dir: str = "humanloop", + cache_size: int = DEFAULT_CACHE_SIZE, + log_level: int = logging.WARNING, + ): + """ + Parameters + ---------- + client: Humanloop client instance + base_dir: Base directory for synced files (default: "humanloop") + cache_size: Maximum number of files to cache (default: DEFAULT_CACHE_SIZE) + log_level: Log level for logging (default: WARNING) + """ + self.client = client + self.base_dir = Path(base_dir) + self._cache_size = cache_size + + logger.setLevel(log_level) + + # Create a new cached version of get_file_content with the specified cache size + self.get_file_content = lru_cache(maxsize=cache_size)( # type: ignore [assignment] + self._get_file_content_implementation, + ) + + def _get_file_content_implementation(self, path: str, file_type: SerializableFileType) -> str: + """Implementation of get_file_content without the cache decorator. + + This is the actual implementation that gets wrapped by lru_cache. + + Args: + path: The normalized path to the file (without extension) + file_type: The type of file to get the content of (SerializableFileType) + + Returns: + The raw file content + + Raises: + HumanloopRuntimeError: In two cases: + 1. If the file doesn't exist at the expected location + 2. If there's a filesystem error when trying to read the file + (e.g., permission denied, file is locked, etc.) + """ + # Construct path to local file + local_path = self.base_dir / path + # Add appropriate extension + local_path = local_path.parent / f"{local_path.stem}.{file_type}" + + if not local_path.exists(): + raise HumanloopRuntimeError(f"Local file not found: {local_path}") + + try: + # Read the raw file content + with open(local_path) as f: + file_content = f.read() + logger.debug(f"Using local file content from {local_path}") + return file_content + except Exception as e: + raise HumanloopRuntimeError(f"Error reading local file {local_path}: {str(e)}") + + def get_file_content(self, path: str, file_type: SerializableFileType) -> str: + """Get the raw file content of a file from cache or filesystem. + + This method uses an LRU cache to store file contents. 
When the cache is full, + the least recently accessed files are automatically removed to make space. + + Args: + path: The normalized path to the file (without extension) + file_type: The type of file (Prompt or Agent) + + Returns: + The raw file content + + Raises: + HumanloopRuntimeError: If the file doesn't exist or can't be read + """ + return self._get_file_content_implementation(path, file_type) + + def clear_cache(self) -> None: + """Clear the LRU cache.""" + self.get_file_content.cache_clear() # type: ignore [attr-defined] + + def _normalize_path(self, path: str) -> str: + """Normalize the path by: + 1. Converting to a Path object to handle platform-specific separators + 2. Removing any file extensions + 3. Converting to a string with forward slashes and no leading/trailing slashes + """ + # Convert to Path object to handle platform-specific separators + path_obj = Path(path) + + # Reject absolute paths to ensure all paths are relative to base_dir. + # This maintains consistency with the remote filesystem where paths are relative to project root. + if path_obj.is_absolute(): + raise HumanloopRuntimeError( + f"Absolute paths are not supported: `{path}`. " + f"Paths should be relative to the base directory (`{self.base_dir}`)." + ) + + # Remove extension, convert to string with forward slashes, and remove leading/trailing slashes + normalized = str(path_obj.with_suffix("")) + # Replace all backslashes and normalize multiple forward slashes + return "/".join(part for part in normalized.replace("\\", "/").split("/") if part) + + def is_file(self, path: str) -> bool: + """Check if the path is a file by checking for .{file_type} extension for serializable file types.""" + return path.endswith(tuple(f".{file_type}" for file_type in self.SERIALIZABLE_FILE_TYPES)) + + def _save_serialized_file( + self, + serialized_content: str, + file_path: str, + file_type: SerializableFileType, + ) -> None: + """Save serialized file to local filesystem.""" + try: + # Create full path including base_dir prefix + full_path = self.base_dir / file_path + # Create directory if it doesn't exist + full_path.parent.mkdir(parents=True, exist_ok=True) + + # Add file type extension + new_path = full_path.parent / f"{full_path.stem}.{file_type}" + + # Write raw file content to file + with open(new_path, "w") as f: + f.write(serialized_content) + except Exception as e: + logger.error(f"Failed to write {file_type} {file_path} to disk: {str(e)}") + raise + + def _pull_file(self, path: str, environment: str | None = None) -> bool: + """Pull a specific file from Humanloop to local filesystem. 
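+
+        The file is fetched via files.retrieve_by_path (including its raw file
+        content) and written under base_dir with a .prompt or .agent extension.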
+ + Returns: + True if the file was successfully pulled, False otherwise + """ + try: + file = self.client.files.retrieve_by_path( + path=path, + environment=environment, + include_raw_file_content=True, + ) + + if file.type not in self.SERIALIZABLE_FILE_TYPES: + logger.error(f"Unsupported file type: {file.type}") + return False + + if not file.raw_file_content: # type: ignore [union-attr] + logger.error(f"No content found for {file.type} {path}") + return False + + self._save_serialized_file( + serialized_content=file.raw_file_content, # type: ignore [union-attr] + file_path=file.path, + file_type=typing.cast(SerializableFileType, file.type), + ) + return True + except Exception as e: + logger.error(f"Failed to pull file {path}: {str(e)}") + return False + + def _pull_directory( + self, + path: str | None = None, + environment: str | None = None, + ) -> Tuple[List[str], List[str]]: + """Sync Prompt and Agent files from Humanloop to local filesystem. + + Returns: + Tuple of two lists: + - First list contains paths of successfully synced files + - Second list contains paths of files that failed to sync. + Failures can occur due to missing content in the response or errors during local file writing. + + Raises: + HumanloopRuntimeError: If there's an error communicating with the API + """ + successful_files = [] + failed_files = [] + page = 1 + + logger.debug(f"Fetching files from directory: {path or '(root)'} in environment: {environment or '(default)'}") + + while True: + try: + logger.debug(f"`{path}`: Requesting page {page} of files") + response = self.client.files.list_files( + type=list(self.SERIALIZABLE_FILE_TYPES), + page=page, + size=100, + include_raw_file_content=True, + environment=environment, + path=path, + ) + + if len(response.records) == 0: + logger.debug(f"Finished reading files for path `{path}`") + break + + logger.debug(f"`{path}`: Read page {page} containing {len(response.records)} files") + + # Process each file + for file in response.records: + # Skip if not a serializable file type + if file.type not in self.SERIALIZABLE_FILE_TYPES: + logger.warning(f"Skipping unsupported file type: {file.type}") + continue + + file_type: SerializableFileType = typing.cast( + SerializableFileType, + file.type, + ) + + # Skip if no raw file content + if not getattr(file, "raw_file_content", None) or not file.raw_file_content: # type: ignore [union-attr] + logger.warning(f"No content found for {file.type} {file.path}") + failed_files.append(file.path) + continue + + try: + logger.debug(f"Writing {file.type} {file.path} to disk") + self._save_serialized_file( + serialized_content=file.raw_file_content, # type: ignore [union-attr] + file_path=file.path, + file_type=file_type, + ) + successful_files.append(file.path) + except Exception as e: + failed_files.append(file.path) + logger.error(f"Failed to save {file.path}: {str(e)}") + + page += 1 + except Exception as e: + formatted_error = format_api_error(e) + raise HumanloopRuntimeError(f"Failed to fetch page {page}: {formatted_error}") + + if successful_files: + logger.info(f"Successfully pulled {len(successful_files)} files") + if failed_files: + logger.warning(f"Failed to pull {len(failed_files)} files") + + return successful_files, failed_files + + def pull(self, path: str | None = None, environment: str | None = None) -> Tuple[List[str], List[str]]: + """Pull files from Humanloop to local filesystem. + + If the path ends with .prompt or .agent, pulls that specific file. + Otherwise, pulls all files under the specified path. 
+ If no path is provided, pulls all files from the root. + + Args: + path: The path to pull from (either a specific file or directory) + environment: The environment to pull from + + Returns: + Tuple of two lists: + - First list contains paths of successfully synced files + - Second list contains paths of files that failed to sync (e.g. failed to write to disk or missing raw content) + + Raises: + HumanloopRuntimeError: If there's an error communicating with the API + """ + start_time = time.time() + normalized_path = self._normalize_path(path) if path else None + + logger.info( + f"Starting pull operation: path={normalized_path or '(root)'}, environment={environment or '(default)'}" + ) + + try: + if normalized_path is None or path is None: # path being None means normalized_path is None, but we check both for improved type safety + # Pull all files from the root + logger.debug("Pulling all files from root") + successful_files, failed_files = self._pull_directory( + path=None, + environment=environment, + ) + else: + if self.is_file(path.strip()): + logger.debug(f"Pulling file: {normalized_path}") + if self._pull_file(path=normalized_path, environment=environment): + successful_files = [path] + failed_files = [] + else: + successful_files = [] + failed_files = [path] + else: + logger.debug(f"Pulling directory: {normalized_path}") + successful_files, failed_files = self._pull_directory(normalized_path, environment) + + # Clear the cache at the end of each pull operation + self.clear_cache() + + duration_ms = int((time.time() - start_time) * 1000) + logger.info(f"Pull completed in {duration_ms}ms: {len(successful_files)} files succeeded") + + return successful_files, failed_files + except Exception as e: + raise HumanloopRuntimeError(f"Pull operation failed: {str(e)}") diff --git a/tests/custom/README.md b/tests/custom/README.md new file mode 100644 index 00000000..14ff7ed4 --- /dev/null +++ b/tests/custom/README.md @@ -0,0 +1,19 @@ +# Custom Tests Directory + +This directory contains custom tests for the Humanloop Python SDK. While the main SDK is auto-generated using [Fern](https://buildwithfern.com/), this directory allows us to add our own test implementations that won't be overwritten during regeneration. + +## Why Custom Tests? + +- **Preservation**: Tests in this directory won't be overwritten when regenerating the SDK +- **Custom Implementation**: Allows testing of our own implementations beyond the auto-generated code +- **Integration**: Enables testing of how our custom code works with the auto-generated SDK + +## Running Tests + +```bash +# Run all custom tests +pytest tests/custom/ + +# Run specific test file +pytest tests/custom/sync/test_sync_client.py +``` diff --git a/tests/custom/__init__.py b/tests/custom/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/custom/assets/exact_match.py b/tests/custom/assets/exact_match.py new file mode 100644 index 00000000..583d742a --- /dev/null +++ b/tests/custom/assets/exact_match.py @@ -0,0 +1,16 @@ +def extract_answer(generation: str): + """Extracts answer from generation. + + Handles a generation that if separated by "---" with the answer being the first part. + Also handles a generation that starts with "```\n" and removes it. 
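+
+    For example, a generation of '```\nParis\n--- explanation' yields 'Paris'.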
+ """ + answer = generation.split("---")[0].strip() + if answer.startswith("```\n"): + answer = answer[4:].strip() + + return answer + + +def exact_match(log, testcase): + target = testcase["target"]["output"] + return target == extract_answer(log["output"]) diff --git a/tests/custom/assets/levenshtein.py b/tests/custom/assets/levenshtein.py new file mode 100644 index 00000000..b2e279ae --- /dev/null +++ b/tests/custom/assets/levenshtein.py @@ -0,0 +1,99 @@ +def levenshtein_distance_optimized(s1, s2, max_distance=1000): + """ + Calculate the Levenshtein distance between two strings with optimizations and a maximum distance cap. + + This function trims common prefixes and suffixes from the input strings, uses a single-row table + to reduce space complexity, and stops the computation early if the Levenshtein distance is + guaranteed to exceed a maximum distance cap. + + Args: + s1 (str): The first string. + s2 (str): The second string. + max_distance (int, optional): The maximum Levenshtein distance. Defaults to 1000. + + Returns: + int: The Levenshtein distance between the two strings, or max_distance if the distance + exceeds max_distance. + """ + # Trim common prefixes + while s1 and s2 and s1[0] == s2[0]: + s1 = s1[1:] + s2 = s2[1:] + + # Trim common suffixes + while s1 and s2 and s1[-1] == s2[-1]: + s1 = s1[:-1] + s2 = s2[:-1] + + len_s1 = len(s1) + len_s2 = len(s2) + + # If the length difference between the strings exceeds max_distance, stop the computation + if abs(len_s1 - len_s2) > max_distance: + return max_distance + + # If one of the strings is empty, the distance is the length of the other string + if len_s1 == 0: + return min(len_s2, max_distance) + if len_s2 == 0: + return min(len_s1, max_distance) + + # Create a single-row table with len(s2) + 1 columns + distance = list(range(len_s2 + 1)) + + # Fill up the table + for i in range(1, len_s1 + 1): + # Store the value of the previous cell in the previous row + prev_row_cell = i - 1 + # The value at the first column is the row number + distance[0] = i + + # Initialize the minimum distance in the current row to max_distance + min_distance = max_distance + + for j in range(1, len_s2 + 1): + # Store the value of the current cell before it is updated + current_cell = distance[j] + + # If the current characters of the two strings are the same, the cost is 0, otherwise 1 + substitution_cost = 0 if s1[i - 1] == s2[j - 1] else 1 + + # The value at the current cell is the minimum of the values at the previous cell in the + # current row, the current cell in the previous row, and the previous cell in the previous row, + # plus the cost + distance[j] = min( + distance[j - 1] + 1, # deletion + distance[j] + 1, # insertion + prev_row_cell + substitution_cost, + ) # substitution + + # Update the minimum distance in the current row + min_distance = min(min_distance, distance[j]) + + # Update the value of the previous cell in the previous row + prev_row_cell = current_cell + + # If the minimum distance in the current row exceeds max_distance, stop the computation + if min_distance >= max_distance: + return max_distance + + # The Levenshtein distance between the two strings is the value at the last cell in the table + return min(distance[-1], max_distance) + + +def extract_answer(generation: str): + """Extracts answer from generation. + + Handles a generation that if separated by "---" with the answer being the first part. + Also handles a generation that starts with "```\n" and removes it. 
+ """ + answer = generation.split("---")[0].strip() + if answer.startswith("```\n"): + answer = answer[4:].strip() + + return answer + + +def compare_log_and_target(log, testcase): + target = testcase["target"]["output"] + return levenshtein_distance_optimized(target, extract_answer(log["output"])) diff --git a/tests/custom/conftest.py b/tests/custom/conftest.py new file mode 100644 index 00000000..7667dedf --- /dev/null +++ b/tests/custom/conftest.py @@ -0,0 +1,170 @@ +from typing import Generator +import os +from dotenv import load_dotenv +from unittest.mock import MagicMock + +import pytest +from humanloop.client import Humanloop +from humanloop.otel.exporter import HumanloopSpanExporter +from humanloop.otel.processor import HumanloopSpanProcessor +from openai.types.chat.chat_completion_message_param import ChatCompletionMessageParam +from opentelemetry.instrumentation.anthropic import AnthropicInstrumentor +from opentelemetry.instrumentation.cohere import CohereInstrumentor +from opentelemetry.instrumentation.groq import GroqInstrumentor +from opentelemetry.instrumentation.instrumentor import BaseInstrumentor # type: ignore +from opentelemetry.instrumentation.openai import OpenAIInstrumentor +from opentelemetry.instrumentation.replicate import ReplicateInstrumentor +from opentelemetry.sdk.resources import Resource +from opentelemetry.sdk.trace import TracerProvider +from opentelemetry.sdk.trace.export import SimpleSpanProcessor +from opentelemetry.sdk.trace.export.in_memory_span_exporter import InMemorySpanExporter +from opentelemetry.trace import Tracer +from tests.custom.types import GetHumanloopClientFn + + +@pytest.fixture(scope="function") +def opentelemetry_test_provider() -> TracerProvider: + """Create a test TracerProvider with a resource. + + This is similar to the created TracerProvider in the + Humanloop class. + """ + provider = TracerProvider( + resource=Resource.create( + { + "service": "humanloop.sdk", + "environment": "test", + } + ) + ) + return provider + + +@pytest.fixture(scope="function") +def test_span(opentelemetry_test_provider: TracerProvider): + exporter = InMemorySpanExporter() + processor = SimpleSpanProcessor(exporter) + opentelemetry_test_provider.add_span_processor(processor) + tracer = opentelemetry_test_provider.get_tracer("test") + return tracer.start_span("test_span") + + +@pytest.fixture(scope="function") +def opentelemetry_test_configuration( + opentelemetry_test_provider: TracerProvider, +) -> Generator[tuple[Tracer, InMemorySpanExporter], None, None]: + """Configure OTel backend without HumanloopSpanProcessor. + + Spans created by Instrumentors will not be used to enrich + Humanloop Spans. 
+ """ + exporter = InMemorySpanExporter() + processor = SimpleSpanProcessor(exporter) + opentelemetry_test_provider.add_span_processor(processor) + instrumentors: list[BaseInstrumentor] = [ + OpenAIInstrumentor(), + AnthropicInstrumentor(), + GroqInstrumentor(), + CohereInstrumentor(), + ReplicateInstrumentor(), + ] + for instrumentor in instrumentors: + instrumentor.instrument(tracer_provider=opentelemetry_test_provider) + tracer = opentelemetry_test_provider.get_tracer("test") + # Circumvent configuration procedure + + yield tracer, exporter + + for instrumentor in instrumentors: + instrumentor.uninstrument() + + +@pytest.fixture(scope="session") +def get_humanloop_client() -> GetHumanloopClientFn: + load_dotenv() + if not os.getenv("HUMANLOOP_API_KEY"): + pytest.fail("HUMANLOOP_API_KEY is not set for integration tests") + + def _get_humanloop_client(use_local_files: bool = False) -> Humanloop: + return Humanloop( + api_key=os.getenv("HUMANLOOP_API_KEY"), + use_local_files=use_local_files, + ) + + return _get_humanloop_client + + +@pytest.fixture(scope="function") +def opentelemetry_hl_test_configuration( + opentelemetry_test_provider: TracerProvider, +) -> Generator[tuple[Tracer, InMemorySpanExporter], None, None]: + """Configure OTel backend with HumanloopSpanProcessor. + + Spans created by Instrumentors will be used to enrich + Humanloop Spans. + """ + exporter = InMemorySpanExporter() + processor = HumanloopSpanProcessor(exporter=exporter) + opentelemetry_test_provider.add_span_processor(processor) + instrumentors: list[BaseInstrumentor] = [ + OpenAIInstrumentor(), + AnthropicInstrumentor(), + GroqInstrumentor(), + CohereInstrumentor(), + ReplicateInstrumentor(), + AnthropicInstrumentor(), + ] + for instrumentor in instrumentors: + instrumentor.instrument( + tracer_provider=opentelemetry_test_provider, + ) + tracer = opentelemetry_test_provider.get_tracer("test") + + yield tracer, exporter + + for instrumentor in instrumentors: + instrumentor.uninstrument() + + +@pytest.fixture(scope="function") +def hl_test_exporter() -> HumanloopSpanExporter: + """ + Test Exporter where HTTP calls to Humanloop API + are mocked. + """ + client = MagicMock() + exporter = HumanloopSpanExporter(client=client) + return exporter + + +@pytest.fixture(scope="function") +def opentelemetry_hl_with_exporter_test_configuration( + hl_test_exporter: HumanloopSpanExporter, + opentelemetry_test_provider: TracerProvider, +) -> Generator[tuple[Tracer, HumanloopSpanExporter], None, None]: + """Configure OTel backend with HumanloopSpanProcessor and + a HumanloopSpanExporter where HTTP calls are mocked. 
+ """ + processor = HumanloopSpanProcessor(exporter=hl_test_exporter) + opentelemetry_test_provider.add_span_processor(processor) + instrumentor = OpenAIInstrumentor() + instrumentor.instrument(tracer_provider=opentelemetry_test_provider) + tracer = opentelemetry_test_provider.get_tracer("test") + + yield tracer, hl_test_exporter + + instrumentor.uninstrument() + + +@pytest.fixture(scope="session") +def call_llm_messages() -> list[ChatCompletionMessageParam]: + return [ + { + "role": "system", + "content": "You are an assistant on the following topics: greetings in foreign languages.", + }, + { + "role": "user", + "content": "Bonjour!", + }, + ] diff --git a/tests/custom/integration/__init__.py b/tests/custom/integration/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/custom/integration/conftest.py b/tests/custom/integration/conftest.py new file mode 100644 index 00000000..f918c48c --- /dev/null +++ b/tests/custom/integration/conftest.py @@ -0,0 +1,259 @@ +from contextlib import contextmanager, redirect_stdout +from dataclasses import dataclass +import os +import time +from typing import Any, ContextManager, Generator, List, Union +import io +from typing import TextIO +import uuid +import pytest +import dotenv +from humanloop import AgentResponse, PromptResponse +from tests.custom.types import GetHumanloopClientFn, SyncableFile +from click.testing import CliRunner + + +@dataclass +class ResourceIdentifiers: + file_id: str + file_path: str + + +@pytest.fixture() +def capture_stdout() -> ContextManager[TextIO]: + @contextmanager + def _context_manager(): + f = io.StringIO() + with redirect_stdout(f): + yield f + + return _context_manager # type: ignore [return-value] + + +@pytest.fixture(scope="session") +def openai_key() -> str: + dotenv.load_dotenv() + if not os.getenv("OPENAI_API_KEY"): + pytest.fail("OPENAI_API_KEY is not set for integration tests") + return os.getenv("OPENAI_API_KEY") # type: ignore [return-value] + + +@pytest.fixture(scope="function") +def sdk_test_dir(get_humanloop_client: GetHumanloopClientFn) -> Generator[str, None, None]: + humanloop_client = get_humanloop_client() + + def cleanup_directory(directory_id: str): + directory_response = humanloop_client.directories.get(id=directory_id) + for subdirectory in directory_response.subdirectories: + cleanup_directory(subdirectory.id) + for file in directory_response.files: + match file.type: + case "agent": + humanloop_client.agents.delete(id=file.id) + case "prompt": + humanloop_client.prompts.delete(id=file.id) + case "dataset": + humanloop_client.datasets.delete(id=file.id) + case "evaluator": + humanloop_client.evaluators.delete(id=file.id) + case "flow": + humanloop_client.flows.delete(id=file.id) + case "tool": + humanloop_client.tools.delete(id=file.id) + case _: + raise ValueError(f"Unknown file type: {file.type}") + humanloop_client.directories.delete(id=directory_response.id) + + path = f"SDK_INTEGRATION_TEST_{uuid.uuid4()}" + response = None + try: + response = humanloop_client.directories.create(path=path) + yield response.path + except Exception as e: + pytest.fail(f"Failed to create directory {path}: {e}") + finally: + if response: + time.sleep(5) + cleanup_directory(response.id) + + +@pytest.fixture(scope="function") +def test_prompt_config() -> dict[str, Any]: + return { + "provider": "openai", + "model": "gpt-4o-mini", + "temperature": 0.5, + "template": [ + { + "role": "system", + "content": "You are a helpful assistant. 
You must answer the user's question truthfully and at the level of a 5th grader.", + }, + { + "role": "user", + "content": "{{question}}", + }, + ], + } + + +@pytest.fixture(scope="function") +def eval_dataset( + get_humanloop_client: GetHumanloopClientFn, sdk_test_dir: str +) -> Generator[ResourceIdentifiers, None, None]: + humanloop_client = get_humanloop_client() + dataset_path = f"{sdk_test_dir}/eval_dataset" + try: + response = humanloop_client.datasets.upsert( + path=dataset_path, + datapoints=[ + { + "inputs": { + "question": "What is the capital of the France?", + }, + }, + { + "inputs": { + "question": "What is the capital of the Germany?", + }, + }, + { + "inputs": { + "question": "What is 2+2?", + }, + }, + ], + ) + yield ResourceIdentifiers(file_id=response.id, file_path=response.path) + humanloop_client.datasets.delete(id=response.id) + except Exception as e: + pytest.fail(f"Failed to create dataset {dataset_path}: {e}") + + +@pytest.fixture(scope="function") +def eval_prompt( + get_humanloop_client: GetHumanloopClientFn, sdk_test_dir: str, openai_key: str, test_prompt_config: dict[str, Any] +) -> Generator[ResourceIdentifiers, None, None]: + humanloop_client = get_humanloop_client() + prompt_path = f"{sdk_test_dir}/eval_prompt" + try: + response = humanloop_client.prompts.upsert( + path=prompt_path, + **test_prompt_config, + ) + yield ResourceIdentifiers(file_id=response.id, file_path=response.path) + humanloop_client.prompts.delete(id=response.id) + except Exception as e: + pytest.fail(f"Failed to create prompt {prompt_path}: {e}") + + +@pytest.fixture(scope="function") +def output_not_null_evaluator( + get_humanloop_client: GetHumanloopClientFn, sdk_test_dir: str +) -> Generator[ResourceIdentifiers, None, None]: + humanloop_client = get_humanloop_client() + evaluator_path = f"{sdk_test_dir}/output_not_null_evaluator" + try: + response = humanloop_client.evaluators.upsert( + path=evaluator_path, + spec={ + "arguments_type": "target_required", + "return_type": "boolean", + "code": """ +def output_not_null(log: dict) -> bool: + return log["output"] is not None + """, + "evaluator_type": "python", + }, + ) + yield ResourceIdentifiers(file_id=response.id, file_path=response.path) + humanloop_client.evaluators.delete(id=response.id) + except Exception as e: + pytest.fail(f"Failed to create evaluator {evaluator_path}: {e}") + + +@pytest.fixture(scope="function") +def id_for_staging_environment(get_humanloop_client: GetHumanloopClientFn, eval_prompt: ResourceIdentifiers) -> str: + humanloop_client = get_humanloop_client() + response = humanloop_client.prompts.list_environments(id=eval_prompt.file_id) + for environment in response: + if environment.name == "staging": + return environment.id + pytest.fail("Staging environment not found") + + +@pytest.fixture +def syncable_files_fixture( + get_humanloop_client: GetHumanloopClientFn, + sdk_test_dir: str, +) -> Generator[list[SyncableFile], None, None]: + """Creates a predefined structure of files in Humanloop for testing sync.""" + files: List[SyncableFile] = [ + SyncableFile( + path="prompts/gpt-4", + type="prompt", + model="gpt-4", + ), + SyncableFile( + path="prompts/gpt-4o", + type="prompt", + model="gpt-4o", + ), + SyncableFile( + path="prompts/nested/complex/gpt-4o", + type="prompt", + model="gpt-4o", + ), + SyncableFile( + path="agents/gpt-4", + type="agent", + model="gpt-4", + ), + SyncableFile( + path="agents/gpt-4o", + type="agent", + model="gpt-4o", + ), + ] + + humanloop_client = get_humanloop_client() + created_files = [] 
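+    # Upsert each file under the per-test directory and keep the returned
+    # id/version_id so tests can assert against the created remote files.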
+ for file in files: + full_path = f"{sdk_test_dir}/{file.path}" + response: Union[AgentResponse, PromptResponse] + if file.type == "prompt": + response = humanloop_client.prompts.upsert( + path=full_path, + model=file.model, + ) + elif file.type == "agent": + response = humanloop_client.agents.upsert( + path=full_path, + model=file.model, + ) + created_files.append( + SyncableFile( + path=full_path, type=file.type, model=file.model, id=response.id, version_id=response.version_id + ) + ) + + yield created_files + + +@pytest.fixture +def cli_runner() -> CliRunner: + """GIVEN a CLI runner + THEN it should be configured to catch exceptions + """ + return CliRunner(mix_stderr=False) + + +@pytest.fixture +def no_humanloop_api_key_in_env(monkeypatch): + """Fixture that removes HUMANLOOP_API_KEY from environment variables. + + Use this fixture in tests that verify behavior when no API key is available + in the environment (but could still be loaded from .env files). + """ + # Remove API key from environment + monkeypatch.delenv("HUMANLOOP_API_KEY", raising=False) + yield diff --git a/tests/custom/integration/test_decorators.py b/tests/custom/integration/test_decorators.py new file mode 100644 index 00000000..15057ba2 --- /dev/null +++ b/tests/custom/integration/test_decorators.py @@ -0,0 +1,153 @@ +import time +from typing import Any + +from openai import OpenAI +from tests.custom.integration.conftest import GetHumanloopClientFn + + +def test_prompt_decorator( + get_humanloop_client: GetHumanloopClientFn, + sdk_test_dir: str, + test_prompt_config: dict[str, Any], + openai_key: str, +): + try: + humanloop_client = get_humanloop_client() + prompt_path = f"{sdk_test_dir}/test_prompt" + prompt_response = humanloop_client.prompts.upsert( + path=prompt_path, + **test_prompt_config, + ) + + prompt_versions_response = humanloop_client.prompts.list_versions(id=prompt_response.id) + assert len(prompt_versions_response.records) == 1 + + @humanloop_client.prompt(path=prompt_path) + def my_prompt(question: str) -> str: + openai_client = OpenAI(api_key=openai_key) + + response = openai_client.chat.completions.create( + model="gpt-4o-mini", + messages=[{"role": "user", "content": question}], + ) + + assert response.choices[0].message.content is not None + return response.choices[0].message.content + + assert "paris" in my_prompt("What is the capital of the France?").lower() + + time.sleep(5) + prompt_versions_response = humanloop_client.prompts.list_versions(id=prompt_response.id) + assert len(prompt_versions_response.records) == 2 + + logs_response = humanloop_client.logs.list(file_id=prompt_response.id, page=1, size=50) + + assert logs_response.items is not None and len(logs_response.items) == 1 + finally: + humanloop_client.prompts.delete(id=prompt_response.id) + + +def test_call_prompt_in_flow_decorator( + get_humanloop_client: GetHumanloopClientFn, + sdk_test_dir: str, + openai_key: str, +): + try: + humanloop_client = get_humanloop_client() + + @humanloop_client.flow(path=f"{sdk_test_dir}/test_flow") + def my_flow(question: str) -> str: + response = humanloop_client.prompts.call( + path=f"{sdk_test_dir}/test_prompt", + prompt={ + "provider": "openai", + "model": "gpt-4o-mini", + "temperature": 0, + }, + messages=[{"role": "user", "content": question}], + provider_api_keys={"openai": openai_key}, + ) + + assert response.logs[0].output is not None + return response.logs[0].output + + assert "paris" in my_flow("What is the capital of the France?").lower() + time.sleep(5) + prompt_response = 
humanloop_client.files.retrieve_by_path(path=f"{sdk_test_dir}/test_prompt") + assert prompt_response is not None + prompt_logs_response = humanloop_client.logs.list(file_id=prompt_response.id, page=1, size=50) + assert prompt_logs_response.items is not None and len(prompt_logs_response.items) == 1 + prompt_log = prompt_logs_response.items[0] + + flow_response = humanloop_client.files.retrieve_by_path(path=f"{sdk_test_dir}/test_flow") + assert flow_response is not None + flow_logs_response = humanloop_client.logs.list(file_id=flow_response.id, page=1, size=50) + assert flow_logs_response.items is not None and len(flow_logs_response.items) == 1 + flow_log = flow_logs_response.items[0] + assert prompt_log.trace_parent_id == flow_log.id + finally: + flow_response = humanloop_client.files.retrieve_by_path(path=f"{sdk_test_dir}/test_flow") + if flow_response is not None: + humanloop_client.flows.delete(id=flow_response.id) + prompt_response = humanloop_client.files.retrieve_by_path(path=f"{sdk_test_dir}/test_prompt") + if prompt_response is not None: + humanloop_client.prompts.delete(id=prompt_response.id) + + +def test_flow_decorator_logs_exceptions( + get_humanloop_client: GetHumanloopClientFn, + sdk_test_dir: str, +): + try: + humanloop_client = get_humanloop_client() + + @humanloop_client.flow(path=f"{sdk_test_dir}/test_flow_log_error") + def my_flow(question: str) -> str: + raise ValueError("This is a test exception") + + my_flow("test") + + time.sleep(5) + + flow_response = humanloop_client.files.retrieve_by_path(path=f"{sdk_test_dir}/test_flow_log_error") + assert flow_response is not None + flow_logs_response = humanloop_client.logs.list(file_id=flow_response.id, page=1, size=50) + assert flow_logs_response.items is not None and len(flow_logs_response.items) == 1 + flow_log = flow_logs_response.items[0] + assert flow_log.error is not None + assert flow_log.output is None + + finally: + flow_response = humanloop_client.files.retrieve_by_path(path=f"{sdk_test_dir}/test_flow_log_error") + if flow_response is not None: + humanloop_client.flows.delete(id=flow_response.id) + + +def test_flow_decorator_populates_output_message( + get_humanloop_client: GetHumanloopClientFn, + sdk_test_dir: str, +): + try: + humanloop_client = get_humanloop_client() + + @humanloop_client.flow(path=f"{sdk_test_dir}/test_flow_log_output_message") + def my_flow(question: str) -> dict[str, Any]: + return {"role": "user", "content": question} + + assert "france" in my_flow("What is the capital of the France?")["content"].lower() + + time.sleep(5) + + flow_response = humanloop_client.files.retrieve_by_path(path=f"{sdk_test_dir}/test_flow_log_output_message") + assert flow_response is not None + flow_logs_response = humanloop_client.logs.list(file_id=flow_response.id, page=1, size=50) + assert flow_logs_response.items is not None and len(flow_logs_response.items) == 1 + flow_log = flow_logs_response.items[0] + assert flow_log.output_message is not None + assert flow_log.output is None + assert flow_log.error is None + + finally: + flow_response = humanloop_client.files.retrieve_by_path(path=f"{sdk_test_dir}/test_flow_log_output_message") + if flow_response is not None: + humanloop_client.flows.delete(id=flow_response.id) diff --git a/tests/custom/integration/test_evals.py b/tests/custom/integration/test_evals.py new file mode 100644 index 00000000..2ec74d93 --- /dev/null +++ b/tests/custom/integration/test_evals.py @@ -0,0 +1,411 @@ +import time +from typing import Any + +import pytest +from humanloop.error import 
HumanloopRuntimeError +from tests.custom.integration.conftest import ResourceIdentifiers +from tests.custom.types import GetHumanloopClientFn + + +def test_eval_run_works_on_online_files( + get_humanloop_client: GetHumanloopClientFn, + output_not_null_evaluator: ResourceIdentifiers, + eval_dataset: ResourceIdentifiers, + eval_prompt: ResourceIdentifiers, +) -> None: + humanloop_client = get_humanloop_client() + humanloop_client.evaluations.run( # type: ignore [attr-defined] + name="test_eval_run", + file={ + "path": eval_prompt.file_path, + "type": "prompt", + }, + dataset={ + "path": eval_dataset.file_path, + }, + evaluators=[ + { + "path": output_not_null_evaluator.file_path, + } + ], + ) + time.sleep(5) + response = humanloop_client.evaluations.list(file_id=eval_prompt.file_id) + assert response.items and len(response.items) == 1 + evaluation_id = response.items[0].id + run_evaluation_response = humanloop_client.evaluations.list_runs_for_evaluation(id=evaluation_id) # type: ignore [attr-defined] + assert run_evaluation_response.runs[0].status == "completed" + + +def test_eval_run_version_id( + get_humanloop_client: GetHumanloopClientFn, + output_not_null_evaluator: ResourceIdentifiers, + eval_dataset: ResourceIdentifiers, + eval_prompt: ResourceIdentifiers, + test_prompt_config: dict[str, Any], +) -> None: + humanloop_client = get_humanloop_client() + # GIVEN a prompt where a non-default version is created + new_test_prompt_config = test_prompt_config.copy() + new_test_prompt_config["temperature"] = 1 + new_prompt_version_response = humanloop_client.prompts.upsert( + path=eval_prompt.file_path, + **new_test_prompt_config, + ) + # WHEN creating an evaluation using version_id + humanloop_client.evaluations.run( # type: ignore [attr-defined] + name="test_eval_run", + file={ + "id": new_prompt_version_response.id, + "version_id": new_prompt_version_response.version_id, + "type": "prompt", + }, + dataset={ + "path": eval_dataset.file_path, + }, + evaluators=[ + { + "path": output_not_null_evaluator.file_path, + } + ], + ) + # THEN we evaluate the version created in the test + evaluations_response = humanloop_client.evaluations.list(file_id=new_prompt_version_response.id) + assert evaluations_response.items and len(evaluations_response.items) == 1 + evaluation_id = evaluations_response.items[0].id + runs_response = humanloop_client.evaluations.list_runs_for_evaluation(id=evaluation_id) + assert runs_response.runs[0].status == "completed" + assert ( + runs_response.runs[0].version + and runs_response.runs[0].version.version_id == new_prompt_version_response.version_id + ) + list_versions_response = humanloop_client.prompts.list_versions(id=new_prompt_version_response.id) + assert list_versions_response.records and len(list_versions_response.records) == 2 + # THEN the version used in evaluation is not the default version + response = humanloop_client.prompts.get(id=new_prompt_version_response.id) + assert response.version_id != new_prompt_version_response.version_id + + +def test_eval_run_environment( + get_humanloop_client: GetHumanloopClientFn, + output_not_null_evaluator: ResourceIdentifiers, + eval_dataset: ResourceIdentifiers, + eval_prompt: ResourceIdentifiers, + test_prompt_config: dict[str, Any], + id_for_staging_environment: str, +) -> None: + humanloop_client = get_humanloop_client() + # GIVEN a prompt deployed to staging environment + new_test_prompt_config = test_prompt_config.copy() + new_test_prompt_config["temperature"] = 1 + new_prompt_version_response = 
humanloop_client.prompts.upsert( + path=eval_prompt.file_path, + **new_test_prompt_config, + ) + humanloop_client.prompts.set_deployment( + id=new_prompt_version_response.id, + environment_id=id_for_staging_environment, + version_id=new_prompt_version_response.version_id, + ) + # WHEN creating an evaluation using environment + humanloop_client.evaluations.run( # type: ignore [attr-defined] + name="test_eval_run", + file={ + "id": new_prompt_version_response.id, + "type": "prompt", + "environment": "staging", + }, + dataset={ + "path": eval_dataset.file_path, + }, + evaluators=[ + { + "path": output_not_null_evaluator.file_path, + } + ], + ) + # THEN evaluation is done with the version deployed to staging environment + evaluations_response = humanloop_client.evaluations.list(file_id=new_prompt_version_response.id) + assert evaluations_response.items and len(evaluations_response.items) == 1 + evaluation_id = evaluations_response.items[0].id + runs_response = humanloop_client.evaluations.list_runs_for_evaluation(id=evaluation_id) + assert runs_response.runs[0].status == "completed" + assert ( + runs_response.runs[0].version + and runs_response.runs[0].version.version_id == new_prompt_version_response.version_id + ) + default_prompt_version_response = humanloop_client.prompts.get(id=new_prompt_version_response.id) + assert default_prompt_version_response.version_id != new_prompt_version_response.version_id + + +@pytest.mark.parametrize("version_lookup", ["version_id", "environment"]) +def test_eval_run_version_lookup_fails_with_path( + get_humanloop_client: GetHumanloopClientFn, + eval_prompt: ResourceIdentifiers, + eval_dataset: ResourceIdentifiers, + output_not_null_evaluator: ResourceIdentifiers, + version_lookup: str, +): + # GIVEN an eval run where we try to evaluate a non-default version + with pytest.raises(HumanloopRuntimeError) as e: + humanloop_client = get_humanloop_client() + humanloop_client.evaluations.run( # type: ignore [attr-defined] + name="test_eval_run", + file={ + "path": eval_prompt.file_path, + "type": "prompt", + # WHEN the File id is not passed in file + version_lookup: "will_not_work", + }, + dataset={ + "path": eval_dataset.file_path, + }, + evaluators=[ + { + "path": output_not_null_evaluator.file_path, + } + ], + ) + # THEN an error is raised + assert "You must provide the `file.id` when addressing a file by version ID or environment" in str(e.value) + + +def test_eval_run_with_version_upsert( + get_humanloop_client: GetHumanloopClientFn, + eval_prompt: ResourceIdentifiers, + eval_dataset: ResourceIdentifiers, + output_not_null_evaluator: ResourceIdentifiers, + test_prompt_config: dict[str, Any], +): + humanloop_client = get_humanloop_client() + humanloop_client.evaluations.run( # type: ignore [attr-defined] + name="test_eval_run", + file={ + "path": eval_prompt.file_path, + "type": "prompt", + "version": { + **test_prompt_config, + "temperature": 1, + }, + }, + dataset={ + "path": eval_dataset.file_path, + }, + evaluators=[ + { + "path": output_not_null_evaluator.file_path, + } + ], + ) + # THEN the version is upserted and evaluation finishes successfully + evaluations_response = humanloop_client.evaluations.list(file_id=eval_prompt.file_id) + assert evaluations_response.items and len(evaluations_response.items) == 1 + evaluation_id = evaluations_response.items[0].id + runs_response = humanloop_client.evaluations.list_runs_for_evaluation(id=evaluation_id) + assert runs_response.runs[0].status == "completed" + # THEN a version was upserted based on file.version + 
list_prompt_versions_response = humanloop_client.prompts.list_versions(id=eval_prompt.file_id) + assert list_prompt_versions_response.records and len(list_prompt_versions_response.records) == 2 + + +def test_flow_eval_does_not_work_without_callable( + get_humanloop_client: GetHumanloopClientFn, + eval_dataset: ResourceIdentifiers, + output_not_null_evaluator: ResourceIdentifiers, +): + with pytest.raises(HumanloopRuntimeError) as e: + humanloop_client = get_humanloop_client() + humanloop_client.evaluations.run( # type: ignore [attr-defined] + name="test_eval_run", + file={ + "path": "Test Flow", + "type": "flow", + "version": { + "attributes": { + "foo": "bar", + } + }, + }, + dataset={ + "path": eval_dataset.file_path, + }, + evaluators=[ + { + "path": output_not_null_evaluator.file_path, + } + ], + ) + # THEN an error is raised + assert "You must provide a `callable` for your Flow `file` to run a local eval." in str(e.value) + + +def test_flow_eval_works_with_callable( + get_humanloop_client: GetHumanloopClientFn, + eval_dataset: ResourceIdentifiers, + output_not_null_evaluator: ResourceIdentifiers, + sdk_test_dir: str, +): + humanloop_client = get_humanloop_client() + flow_path = f"{sdk_test_dir}/Test Flow" + # GIVEN a flow with a callable + flow_response = humanloop_client.flows.upsert( + path=flow_path, + attributes={ + "foo": "bar", + }, + ) + try: + flow = humanloop_client.flows.upsert( + path=flow_path, + attributes={ + "foo": "bar", + }, + ) + # WHEN we run an evaluation with the flow + humanloop_client.evaluations.run( # type: ignore [attr-defined] + name="test_eval_run", + file={ + "id": flow.id, + "type": "flow", + "callable": lambda question: "bar", + }, + dataset={ + "path": eval_dataset.file_path, + }, + evaluators=[ + { + "path": output_not_null_evaluator.file_path, + } + ], + ) + # THEN the evaluation finishes successfully + evaluations_response = humanloop_client.evaluations.list(file_id=flow.id) + assert evaluations_response.items and len(evaluations_response.items) == 1 + evaluation_id = evaluations_response.items[0].id + runs_response = humanloop_client.evaluations.list_runs_for_evaluation(id=evaluation_id) + assert runs_response.runs[0].status == "completed" + finally: + humanloop_client.flows.delete(id=flow_response.id) + + +def test_cannot_evaluate_agent_with_callable( + get_humanloop_client: GetHumanloopClientFn, + eval_dataset: ResourceIdentifiers, + output_not_null_evaluator: ResourceIdentifiers, +): + with pytest.raises(ValueError) as e: + humanloop_client = get_humanloop_client() + humanloop_client.evaluations.run( # type: ignore [attr-defined] + name="test_eval_run", + file={ + "path": "Test Agent", + "type": "agent", + "callable": lambda question: "bar", + }, + dataset={ + "path": eval_dataset.file_path, + }, + evaluators=[ + { + "path": output_not_null_evaluator.file_path, + } + ], + ) + assert str(e.value) == "Agent evaluation is only possible on the Humanloop runtime, do not provide a `callable`." 
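
The tests above all exercise the same `evaluations.run` call shape on the extended client: a `file` locator (by `path`, or by `id` together with `version_id`/`environment`, optionally with an inline `version` to upsert), a `dataset` locator, and a list of `evaluators`. A minimal sketch of that shape follows; the paths and API key are hypothetical placeholders, not resources created by this test suite, and the client here is assumed to be configured the same way as the fixtures configure it.

# Illustrative sketch only -- placeholder paths/key, not part of the patch.
from humanloop import Humanloop

humanloop_client = Humanloop(api_key="hl_api_key_placeholder")

humanloop_client.evaluations.run(
    name="example_eval_run",
    # Address the evaluated file by `path` (as in test_eval_run_works_on_online_files),
    # or by `id` plus `version_id`/`environment` (as in test_eval_run_version_id and
    # test_eval_run_environment), optionally with an inline `version` to upsert.
    file={
        "path": "example_dir/eval_prompt",
        "type": "prompt",
    },
    dataset={
        "path": "example_dir/eval_dataset",
    },
    evaluators=[
        {
            "path": "example_dir/output_not_null_evaluator",
        }
    ],
)
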
+ + +def test_flow_eval_resolves_to_default_with_callable( + get_humanloop_client: GetHumanloopClientFn, + output_not_null_evaluator: ResourceIdentifiers, + eval_dataset: ResourceIdentifiers, + sdk_test_dir: str, +) -> None: + humanloop_client = get_humanloop_client() + # GIVEN a flow with some attributes + flow_path = f"{sdk_test_dir}/Test Flow" + flow_response = humanloop_client.flows.upsert( + path=flow_path, + attributes={ + "foo": "bar", + }, + ) + try: + # WHEN running an evaluation with the flow's callable but no version + humanloop_client.evaluations.run( # type: ignore [attr-defined] + name="test_eval_run", + file={ + "id": flow_response.id, + "type": "flow", + "callable": lambda question: "It's complicated don't worry about it", + }, + dataset={ + "path": eval_dataset.file_path, + }, + evaluators=[ + { + "path": output_not_null_evaluator.file_path, + } + ], + ) + # THEN the evaluation finishes successfully + evaluations_response = humanloop_client.evaluations.list(file_id=flow_response.id) + assert evaluations_response.items and len(evaluations_response.items) == 1 + evaluation_id = evaluations_response.items and evaluations_response.items[0].id + runs_response = humanloop_client.evaluations.list_runs_for_evaluation(id=evaluation_id) # type: ignore [attr-defined, arg-type] + assert runs_response.runs[0].status == "completed" + finally: + # Clean up test resources + humanloop_client.flows.delete(id=flow_response.id) + + +def test_agent_eval_works_upserting( + get_humanloop_client: GetHumanloopClientFn, + eval_dataset: ResourceIdentifiers, + output_not_null_evaluator: ResourceIdentifiers, + sdk_test_dir: str, +): + humanloop_client = get_humanloop_client() + humanloop_client.evaluations.run( # type: ignore [attr-defined] + name="test_eval_run", + file={ + "path": f"{sdk_test_dir}/Test Agent", + "type": "agent", + "version": { + "model": "gpt-4o", + "template": [ + { + "role": "system", + "content": "You are a helpful assistant, offering very short answers.", + }, + { + "role": "user", + "content": "{{question}}", + }, + ], + "provider": "openai", + "temperature": 0, + "max_iterations": 5, + }, + }, + dataset={ + "path": eval_dataset.file_path, + }, + evaluators=[ + { + "path": output_not_null_evaluator.file_path, + } + ], + ) + files_response = humanloop_client.files.list_files(page=1, size=100) + eval_agent = None + for file in files_response.records: + if file.path == f"{sdk_test_dir}/Test Agent": + eval_agent = file + break + assert eval_agent and eval_agent.type == "agent" + # THEN the evaluation finishes successfully + evaluations_response = humanloop_client.evaluations.list(file_id=eval_agent.id) + assert evaluations_response.items and len(evaluations_response.items) == 1 + evaluation_id = evaluations_response.items[0].id + runs_response = humanloop_client.evaluations.list_runs_for_evaluation(id=evaluation_id) # type: ignore [attr-defined, arg-type] + assert runs_response.runs[0].status == "completed" diff --git a/tests/custom/integration/test_sync.py b/tests/custom/integration/test_sync.py new file mode 100644 index 00000000..6e7b002b --- /dev/null +++ b/tests/custom/integration/test_sync.py @@ -0,0 +1,206 @@ +from typing import List, Union +from pathlib import Path +import pytest +from humanloop import AgentResponse, PromptResponse +from humanloop.prompts.client import PromptsClient +from humanloop.agents.client import AgentsClient +from humanloop.error import HumanloopRuntimeError +from tests.custom.types import GetHumanloopClientFn, SyncableFile + + +@pytest.fixture +def 
cleanup_local_files(): + """Cleanup any locally synced files after tests""" + yield + local_dir = Path("humanloop") + if local_dir.exists(): + import shutil + + shutil.rmtree(local_dir) + + +def test_pull_basic( + syncable_files_fixture: List[SyncableFile], + get_humanloop_client: GetHumanloopClientFn, +): + """Test that humanloop.sync() correctly syncs remote files to local filesystem""" + # GIVEN a set of files in the remote system (from syncable_files_fixture) + humanloop_client = get_humanloop_client() + + # WHEN running the sync + humanloop_client.pull() + + # THEN our local filesystem should mirror the remote filesystem in the HL Workspace + for file in syncable_files_fixture: + extension = f".{file.type}" + local_path = Path("humanloop") / f"{file.path}{extension}" + + # THEN the file and its directory should exist + assert local_path.exists(), f"Expected synced file at {local_path}" + assert local_path.parent.exists(), f"Expected directory at {local_path.parent}" + + # THEN the file should not be empty + content = local_path.read_text() + assert content, f"File at {local_path} should not be empty" + + +def test_overload_with_local_files( + get_humanloop_client: GetHumanloopClientFn, + syncable_files_fixture: List[SyncableFile], +): + """Test that overload_with_local_files correctly handles local files.""" + # GIVEN a client with use_local_files=True and pulled files + humanloop_client = get_humanloop_client(use_local_files=True) + humanloop_client.pull() + + # GIVEN a test file from the structure + test_file = syncable_files_fixture[0] + extension = f".{test_file.type}" + local_path = Path("humanloop") / f"{test_file.path}{extension}" + + # THEN the file should exist locally + assert local_path.exists(), f"Expected pulled file at {local_path}" + assert local_path.parent.exists(), f"Expected directory at {local_path.parent}" + + # WHEN calling the file + response: Union[AgentResponse, PromptResponse] + if test_file.type == "prompt": + response = humanloop_client.prompts.call( # type: ignore [assignment] + path=test_file.path, messages=[{"role": "user", "content": "Testing"}] + ) + elif test_file.type == "agent": + response = humanloop_client.agents.call( # type: ignore [assignment] + path=test_file.path, messages=[{"role": "user", "content": "Testing"}] + ) + # THEN the response should not be None + assert response is not None + + # WHEN calling with an invalid path + # THEN it should raise HumanloopRuntimeError + with pytest.raises(HumanloopRuntimeError): + sub_client: Union[PromptsClient, AgentsClient] + match test_file.type: + case "prompt": + sub_client = humanloop_client.prompts + case "agent": + sub_client = humanloop_client.agents + case _: + raise ValueError(f"Invalid file type: {test_file.type}") + sub_client.call(path="invalid/path") + + +def test_overload_log_with_local_files( + get_humanloop_client: GetHumanloopClientFn, + syncable_files_fixture: List[SyncableFile], + sdk_test_dir: str, +): + """Test that overload_with_local_files correctly handles local files for log operations.""" + # GIVEN a client with use_local_files=True and pulled files + humanloop_client = get_humanloop_client(use_local_files=True) + humanloop_client.pull() + + # GIVEN a test file from the structure + test_file = syncable_files_fixture[0] + extension = f".{test_file.type}" + local_path = Path("humanloop") / f"{test_file.path}{extension}" + + # THEN the file should exist locally + assert local_path.exists(), f"Expected pulled file at {local_path}" + assert local_path.parent.exists(), f"Expected 
directory at {local_path.parent}" + + # WHEN logging with the pulled file + if test_file.type == "prompt": + response = humanloop_client.prompts.log( # type: ignore [assignment] + path=test_file.path, messages=[{"role": "user", "content": "Testing"}], output="Test response" + ) + elif test_file.type == "agent": + response = humanloop_client.agents.log( # type: ignore [assignment] + path=test_file.path, messages=[{"role": "user", "content": "Testing"}], output="Test response" + ) + # THEN the response should not be None + assert response is not None + + # WHEN logging with an invalid path + # THEN it should raise HumanloopRuntimeError + with pytest.raises(HumanloopRuntimeError): + if test_file.type == "prompt": + humanloop_client.prompts.log( + path=f"{sdk_test_dir}/invalid/path", + messages=[{"role": "user", "content": "Testing"}], + output="Test response", + ) + elif test_file.type == "agent": + humanloop_client.agents.log( + path=f"{sdk_test_dir}/invalid/path", + messages=[{"role": "user", "content": "Testing"}], + output="Test response", + ) + + +def test_overload_version_environment_handling( + get_humanloop_client: GetHumanloopClientFn, + syncable_files_fixture: List[SyncableFile], +): + """Test that overload_with_local_files correctly handles version_id and environment parameters.""" + # GIVEN a client with use_local_files=True and pulled files + humanloop_client = get_humanloop_client(use_local_files=True) + humanloop_client.pull() + + # GIVEN a test file from the structure + test_file = syncable_files_fixture[0] + extension = f".{test_file.type}" + local_path = Path("humanloop") / f"{test_file.path}{extension}" + + # THEN the file should exist locally + assert local_path.exists(), f"Expected pulled file at {local_path}" + assert local_path.parent.exists(), f"Expected directory at {local_path.parent}" + + # WHEN calling with version_id + # THEN it should raise HumanloopRuntimeError + with pytest.raises(HumanloopRuntimeError, match="Cannot use local file.*version_id or environment was specified"): + if test_file.type == "prompt": + humanloop_client.prompts.call( + path=test_file.path, + version_id=test_file.version_id, + messages=[{"role": "user", "content": "Testing"}], + ) + elif test_file.type == "agent": + humanloop_client.agents.call( + path=test_file.path, + version_id=test_file.version_id, + messages=[{"role": "user", "content": "Testing"}], + ) + + # WHEN calling with environment + # THEN it should raise HumanloopRuntimeError + with pytest.raises(HumanloopRuntimeError, match="Cannot use local file.*version_id or environment was specified"): + if test_file.type == "prompt": + humanloop_client.prompts.call( + path=test_file.path, + environment="production", + messages=[{"role": "user", "content": "Testing"}], + ) + elif test_file.type == "agent": + humanloop_client.agents.call( + path=test_file.path, + environment="production", + messages=[{"role": "user", "content": "Testing"}], + ) + + # WHEN calling with both version_id and environment + # THEN it should raise HumanloopRuntimeError + with pytest.raises(HumanloopRuntimeError, match="Cannot use local file.*version_id or environment was specified"): + if test_file.type == "prompt": + humanloop_client.prompts.call( + path=test_file.path, + version_id=test_file.version_id, + environment="staging", + messages=[{"role": "user", "content": "Testing"}], + ) + elif test_file.type == "agent": + humanloop_client.agents.call( + path=test_file.path, + version_id=test_file.version_id, + environment="staging", + messages=[{"role": "user", 
"content": "Testing"}], + ) diff --git a/tests/custom/integration/test_sync_cli.py b/tests/custom/integration/test_sync_cli.py new file mode 100644 index 00000000..3957aed2 --- /dev/null +++ b/tests/custom/integration/test_sync_cli.py @@ -0,0 +1,179 @@ +from pathlib import Path +from unittest import mock +import pytest +from click.testing import CliRunner +from humanloop.cli.__main__ import cli +from tests.custom.types import SyncableFile + + +@pytest.fixture +def no_env_file_loading(): + """Fixture that prevents loading API keys from any .env files. + + Use this fixture in tests that verify behavior when no .env files should + be processed, regardless of whether they exist or not. + """ + # Prevent any .env file from being loaded + with mock.patch("humanloop.cli.__main__.load_dotenv", lambda *args, **kwargs: None): + yield + + +def test_pull_without_api_key(cli_runner: CliRunner, no_humanloop_api_key_in_env, no_env_file_loading): + """GIVEN no API key in environment + WHEN running pull command + THEN it should fail with appropriate error message + """ + # WHEN running pull command + result = cli_runner.invoke(cli, ["pull", "--local-files-directory", "humanloop"]) + + # THEN it should fail with appropriate error message + assert result.exit_code == 1 # Our custom error code for API key issues + assert "No API key found" in result.output + assert "Set HUMANLOOP_API_KEY in .env file or environment" in result.output + + +def test_pull_basic( + cli_runner: CliRunner, + syncable_files_fixture: list[SyncableFile], + tmp_path: Path, # this path is used as a temporary store for files locally +): + # GIVEN a base directory for pulled files + base_dir = str(tmp_path / "humanloop") + + # WHEN running pull command + result = cli_runner.invoke(cli, ["pull", "--local-files-directory", base_dir, "--verbose"]) + + # THEN it should succeed + assert result.exit_code == 0 + assert "Pulling files from Humanloop..." 
in result.output + assert "Pull completed" in result.output + + # THEN the files should exist locally + for file in syncable_files_fixture: + extension = f".{file.type}" + local_path = Path(base_dir) / f"{file.path}{extension}" + assert local_path.exists(), f"Expected synced file at {local_path}" + assert local_path.parent.exists(), f"Expected directory at {local_path.parent}" + assert local_path.read_text(), f"File at {local_path} should not be empty" + + +def test_pull_with_specific_path( + cli_runner: CliRunner, + syncable_files_fixture: list[SyncableFile], + tmp_path: Path, +): + """GIVEN a specific path to pull + WHEN running pull command with path + THEN it should pull only files from that path + """ + # GIVEN a base directory and specific path + base_dir = str(tmp_path / "humanloop") + test_path = syncable_files_fixture[ + 0 + ].path.split( + "/" + )[ + 0 + ] # Retrieve the prefix of the first file's path which corresponds to the sdk_test_dir used within syncable_files_fixture + + # WHEN running pull command with path + result = cli_runner.invoke(cli, ["pull", "--local-files-directory", base_dir, "--path", test_path, "--verbose"]) + + # THEN it should succeed and show the path + assert result.exit_code == 0 + assert f"Path: {test_path}" in result.output + + # THEN only files from that path should exist locally + for file in syncable_files_fixture: + extension = f".{file.type}" + local_path = Path(base_dir) / f"{file.path}{extension}" + if file.path.startswith(test_path): + assert local_path.exists(), f"Expected synced file at {local_path}" + else: + assert not local_path.exists(), f"Unexpected file at {local_path}" + + +def test_pull_with_environment( + cli_runner: CliRunner, + syncable_files_fixture: list[SyncableFile], + tmp_path: Path, +): + # GIVEN a base directory and environment + base_dir = str(tmp_path / "humanloop") + environment = "staging" + + # WHEN running pull command with environment + result = cli_runner.invoke( + cli, + [ + "pull", + "--local-files-directory", + base_dir, + "--environment", + environment, + "--verbose", + ], + ) + + # THEN it should succeed and show the environment + assert result.exit_code == 0 + assert f"Environment: {environment}" in result.output + + +def test_pull_with_quiet_mode( + cli_runner: CliRunner, + syncable_files_fixture: list[SyncableFile], + tmp_path: Path, +): + # GIVEN a base directory and quiet mode + base_dir = str(tmp_path / "humanloop") + + # WHEN running pull command with quiet mode + result = cli_runner.invoke(cli, ["pull", "--local-files-directory", base_dir, "--quiet"]) + + # THEN it should succeed but not show file list + assert result.exit_code == 0 + assert "Successfully pulled" not in result.output + + # THEN files should still be pulled + for file in syncable_files_fixture: + extension = f".{file.type}" + local_path = Path(base_dir) / f"{file.path}{extension}" + assert local_path.exists(), f"Expected synced file at {local_path}" + + +def test_pull_with_invalid_path( + cli_runner: CliRunner, +): + # GIVEN an invalid base directory + path = "nonexistent/path" + + # WHEN running pull command + result = cli_runner.invoke(cli, ["pull", "--path", path]) + + # THEN it should fail + assert result.exit_code == 1 + assert "Error" in result.output + + +def test_pull_with_invalid_environment(cli_runner: CliRunner, tmp_path: Path): + # GIVEN an invalid environment + environment = "nonexistent" + base_dir = str(tmp_path / "humanloop") + + # WHEN running pull command + result = cli_runner.invoke( + cli, + [ + "pull", + 
"--local-files-directory", + base_dir, + "--environment", + environment, + "--verbose", + ], + ) + + # THEN it should fail + assert result.exit_code == 1 + assert "Error" in result.output diff --git a/tests/custom/otel/__init__.py b/tests/custom/otel/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/custom/otel/test_helpers.py b/tests/custom/otel/test_helpers.py new file mode 100644 index 00000000..3bd5ce45 --- /dev/null +++ b/tests/custom/otel/test_helpers.py @@ -0,0 +1,172 @@ +import pytest +from humanloop.otel.helpers import read_from_opentelemetry_span, write_to_opentelemetry_span +from opentelemetry.sdk.trace import Span + + +def test_read_empty(test_span: Span): + with pytest.raises(TypeError): + assert read_from_opentelemetry_span(test_span) == {} + + +def test_read_non_existent_key(test_span: Span): + with pytest.raises(TypeError): + assert read_from_opentelemetry_span(test_span, "key") == {} + write_to_opentelemetry_span(test_span, {"x": 7, "y": "foo"}, key="key") + # NOTE: attributes cannot be None at this point + assert dict(test_span.attributes) == { # type: ignore + "key.x": 7, + "key.y": "foo", + } + with pytest.raises(TypeError): + assert read_from_opentelemetry_span(test_span, "key.z") is None + + +def test_simple_dict(test_span: Span): + write_to_opentelemetry_span(test_span, {"x": 7, "y": "foo"}, "key") + # NOTE: attributes cannot be None at this point + assert dict(test_span.attributes) == { # type: ignore + "key.x": 7, + "key.y": "foo", + } + assert read_from_opentelemetry_span(test_span, "key") == {"x": 7, "y": "foo"} + + +def test_no_prefix(test_span: Span): + write_to_opentelemetry_span(test_span, {"x": 7, "y": "foo"}) + # NOTE: attributes cannot be None at this point + assert dict(test_span.attributes) == { # type: ignore + "x": 7, + "y": "foo", + } + assert read_from_opentelemetry_span(test_span) == {"x": 7, "y": "foo"} + + +def test_nested_object(test_span: Span): + write_to_opentelemetry_span(test_span, {"x": 7, "y": {"z": "foo"}}, "key") + # NOTE: attributes cannot be None at this point + assert dict(test_span.attributes) == { # type: ignore + "key.x": 7, + "key.y.z": "foo", + } + assert read_from_opentelemetry_span(test_span, "key") == {"x": 7, "y": {"z": "foo"}} + + +def test_list(test_span: Span): + write_to_opentelemetry_span( + test_span, + [{"x": 7, "y": "foo"}, {"z": "bar"}], # type: ignore + "key", + ) # type: ignore + # NOTE: attributes cannot be None at this point + assert dict(test_span.attributes) == { # type: ignore + "key.0.x": 7, + "key.0.y": "foo", + "key.1.z": "bar", + } + assert read_from_opentelemetry_span(test_span, "key") == [ + {"z": "bar"}, + {"x": 7, "y": "foo"}, + ] + + +def test_list_no_prefix(test_span: Span): + write_to_opentelemetry_span( + test_span, + [{"x": 7, "y": "foo"}, {"z": "bar"}], # type: ignore + ) + # NOTE: attributes cannot be None at this point + assert dict(test_span.attributes) == { # type: ignore + "0.x": 7, + "0.y": "foo", + "1.z": "bar", + } + assert read_from_opentelemetry_span(test_span) == [ + {"z": "bar"}, + {"x": 7, "y": "foo"}, + ] + + +def test_multiple_nestings(test_span: Span): + write_to_opentelemetry_span( + test_span, + [ + {"x": 7, "y": "foo"}, + [{"z": "bar"}, {"a": 42}], + ], # type: ignore + "key", + ) + assert dict(test_span.attributes) == { # type: ignore + "key.0.x": 7, + "key.0.y": "foo", + "key.1.0.z": "bar", + "key.1.1.a": 42, + } + assert read_from_opentelemetry_span(test_span, "key") == [ + [ + {"a": 42}, + {"z": "bar"}, + ], + {"x": 7, "y": "foo"}, + ] + + +def 
test_read_mixed_numeric_string_keys(test_span: Span): + test_span.set_attributes( + { + "key.0.x": 7, + "key.0.y": "foo", + "key.a.z": "bar", + "key.a.a": 42, + } + ) + assert read_from_opentelemetry_span(span=test_span, key="key") == { # type: ignore + "0": {"x": 7, "y": "foo"}, + "a": {"z": "bar", "a": 42}, + } + assert read_from_opentelemetry_span(span=test_span) == { # type: ignore + "key": { + "0": {"x": 7, "y": "foo"}, + "a": {"z": "bar", "a": 42}, + } + } + + +def test_sub_key_same_as_key(test_span: Span): + write_to_opentelemetry_span(test_span, {"key": 7}, "key") + # NOTE: attributes cannot be None at this point + assert dict(test_span.attributes) == { # type: ignore + "key.key": 7, + } + assert read_from_opentelemetry_span(test_span, "key") == {"key": 7} + + +def test_read_nested_key(test_span: Span): + test_span.set_attributes({"key.x": 7, "key.y.z": "foo"}) + assert read_from_opentelemetry_span(span=test_span, key="key.y") == {"z": "foo"} + + +def test_write_read_sub_key(test_span: Span): + write_to_opentelemetry_span(test_span, {"x": 7, "y": "foo"}, "key") + assert read_from_opentelemetry_span(test_span, "key.x") == 7 + assert read_from_opentelemetry_span(test_span, "key.y") == "foo" + assert read_from_opentelemetry_span(test_span, "key") == {"x": 7, "y": "foo"} + + +def test_write_drops_dict_all_null_values(test_span: Span): + # GIVEN a test_span to which a value with null values is written + # NOTE: mypy complains about None value in the dict, but it is intentionally under test + write_to_opentelemetry_span(test_span, {"x": None, "y": None}, "key") # type: ignore + # WHEN reading the value from the span + # THEN the value is not present in the span attributes + assert "key" not in test_span.attributes # type: ignore + with pytest.raises(TypeError): + assert read_from_opentelemetry_span(test_span, "key") == {} + + +def test_write_drops_null_value_from_dict(test_span: Span): + # GIVEN a test_span to which a dict with some null values are written + # NOTE: mypy complains about None value in the dict, but it is intentionally under test + write_to_opentelemetry_span(test_span, {"x": 2, "y": None}, "key") # type: ignore + # WHEN reading the values from the span + # THEN the value with null value is not present in the span attributes + assert read_from_opentelemetry_span(test_span, "key") == {"x": 2} diff --git a/tests/custom/sync/__init__.py b/tests/custom/sync/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/custom/sync/test_client.py b/tests/custom/sync/test_client.py new file mode 100644 index 00000000..a349fd0c --- /dev/null +++ b/tests/custom/sync/test_client.py @@ -0,0 +1,126 @@ +import logging +import pytest +from pathlib import Path +from unittest.mock import Mock, patch +from humanloop.sync.sync_client import SyncClient, SerializableFileType +from humanloop.error import HumanloopRuntimeError +from typing import Literal + + +@pytest.fixture +def mock_client() -> Mock: + return Mock() + + +@pytest.fixture +def sync_client(mock_client: Mock, tmp_path: Path) -> SyncClient: + return SyncClient( + client=mock_client, + base_dir=str(tmp_path), + cache_size=10, + log_level=logging.DEBUG, # DEBUG level for testing # noqa: F821 + ) + + +def test_init(sync_client: SyncClient, tmp_path: Path): + """Test basic initialization of SyncClient.""" + # GIVEN a SyncClient instance + # THEN it should be initialized with correct base directory, cache size and file types + assert sync_client.base_dir == tmp_path + assert sync_client._cache_size == 10 + assert 
sync_client.SERIALIZABLE_FILE_TYPES == frozenset(["prompt", "agent"]) + + +def test_normalize_path(sync_client: SyncClient): + """Test path normalization functionality.""" + # GIVEN various file paths with different formats + test_cases = [ + ("path/to/file.prompt", "path/to/file"), + ("path\\to\\file.agent", "path/to/file"), + ("trailing/slashes/file.agent/", "trailing/slashes/file"), + ("multiple//slashes//file.prompt", "multiple/slashes/file"), + ] + + for input_path, expected in test_cases: + # WHEN they are normalized + normalized = sync_client._normalize_path(input_path) + # THEN they should be converted to the expected format + assert normalized == expected + + # Test absolute path raises error + with pytest.raises(HumanloopRuntimeError, match="Absolute paths are not supported"): + sync_client._normalize_path("/leading/slashes/file.prompt") + + +def test_is_file(sync_client: SyncClient): + """Test file type detection.""" + # GIVEN various file paths + # WHEN checking if they are valid file types + # THEN only .prompt and .agent files should return True + assert sync_client.is_file("test.prompt") + assert sync_client.is_file("test.agent") + assert not sync_client.is_file("test.txt") + assert not sync_client.is_file("test") + + +def test_save_and_read_file(sync_client: SyncClient): + """Test saving and reading files.""" + # GIVEN a file content and path + content = "test content" + path = "test/path" + file_type: SerializableFileType = "prompt" + + # WHEN saving the file + sync_client._save_serialized_file(content, path, "prompt") + saved_path = sync_client.base_dir / path + saved_path = saved_path.parent / f"{saved_path.stem}.{file_type}" + + # THEN the file should exist on disk + assert saved_path.exists() + + # WHEN reading the file + read_content = sync_client.get_file_content(path, file_type) + + # THEN the content should match + assert read_content == content + + +def test_error_handling(sync_client: SyncClient): + """Test error handling in various scenarios.""" + # GIVEN a nonexistent file + # WHEN trying to read it + # THEN a HumanloopRuntimeError should be raised + with pytest.raises(HumanloopRuntimeError, match="Local file not found"): + sync_client.get_file_content("nonexistent", "prompt") + + # GIVEN an API error + # WHEN trying to pull a file + # THEN it should return False + with patch.object(sync_client.client.files, "retrieve_by_path", side_effect=Exception("API Error")): + assert not sync_client._pull_file("test.prompt") + + +def test_cache_functionality(sync_client: SyncClient): + """Test LRU cache functionality.""" + # GIVEN a test file + content = "test content" + path = "test/path" + file_type: Literal["prompt", "agent"] = "prompt" + sync_client._save_serialized_file(content, path, file_type) + + # WHEN reading the file for the first time + sync_client.get_file_content(path, file_type) + # THEN it should hit disk (implicitly verified by no cache hit) + + # WHEN modifying the file on disk + saved_path = sync_client.base_dir / f"{path}.{file_type}" + saved_path.write_text("modified content") + + # THEN subsequent reads should use cache + assert sync_client.get_file_content(path, file_type) == content + + # WHEN clearing the cache + sync_client.clear_cache() + + # THEN new content should be read from disk + assert sync_client.get_file_content(path, file_type) == "modified content" diff --git a/tests/custom/types.py b/tests/custom/types.py new file mode 100644 index 00000000..7a198456 --- /dev/null +++ b/tests/custom/types.py @@ -0,0 +1,15 @@ +from typing import 
Protocol, NamedTuple +from humanloop.client import Humanloop +from humanloop import FileType + + +class GetHumanloopClientFn(Protocol): + def __call__(self, use_local_files: bool = False) -> Humanloop: ... + + +class SyncableFile(NamedTuple): + path: str + type: FileType + model: str + id: str = "" + version_id: str = "" From 602b91212a633ef10194c01b3008abf85265d955 Mon Sep 17 00:00:00 2001 From: Ale Pouroullis Date: Tue, 13 May 2025 19:06:48 +0100 Subject: [PATCH 3/6] mypy + ruff formatting fixes + add prompt tests --- tests/assets/exact_match.py | 16 - tests/assets/levenshtein.py | 99 ----- tests/conftest.py | 278 ------------ tests/custom/conftest.py | 11 +- tests/custom/integration/conftest.py | 42 +- tests/custom/integration/test_decorators.py | 1 + tests/custom/integration/test_evals.py | 1 + .../{ => custom}/integration/test_prompts.py | 24 +- tests/custom/integration/test_sync.py | 6 +- tests/custom/integration/test_sync_cli.py | 2 + tests/custom/otel/test_helpers.py | 3 +- tests/custom/sync/test_client.py | 8 +- tests/custom/types.py | 5 +- tests/integration/__init__.py | 0 tests/integration/conftest.py | 169 -------- tests/integration/test_decorators.py | 154 ------- tests/integration/test_evals.py | 402 ------------------ tests/otel/__init__.py | 0 tests/otel/test_helpers.py | 172 -------- tests/utils/assets/models/__init__.py | 2 +- tests/utils/assets/models/circle.py | 1 + .../assets/models/object_with_defaults.py | 1 - .../models/object_with_optional_field.py | 8 +- tests/utils/assets/models/shape.py | 5 +- tests/utils/assets/models/square.py | 1 + .../assets/models/undiscriminated_shape.py | 1 + tests/utils/test_serialization.py | 4 +- 27 files changed, 85 insertions(+), 1331 deletions(-) delete mode 100644 tests/assets/exact_match.py delete mode 100644 tests/assets/levenshtein.py delete mode 100644 tests/conftest.py rename tests/{ => custom}/integration/test_prompts.py (60%) delete mode 100644 tests/integration/__init__.py delete mode 100644 tests/integration/conftest.py delete mode 100644 tests/integration/test_decorators.py delete mode 100644 tests/integration/test_evals.py delete mode 100644 tests/otel/__init__.py delete mode 100644 tests/otel/test_helpers.py diff --git a/tests/assets/exact_match.py b/tests/assets/exact_match.py deleted file mode 100644 index 583d742a..00000000 --- a/tests/assets/exact_match.py +++ /dev/null @@ -1,16 +0,0 @@ -def extract_answer(generation: str): - """Extracts answer from generation. - - Handles a generation that if separated by "---" with the answer being the first part. - Also handles a generation that starts with "```\n" and removes it. - """ - answer = generation.split("---")[0].strip() - if answer.startswith("```\n"): - answer = answer[4:].strip() - - return answer - - -def exact_match(log, testcase): - target = testcase["target"]["output"] - return target == extract_answer(log["output"]) diff --git a/tests/assets/levenshtein.py b/tests/assets/levenshtein.py deleted file mode 100644 index b2e279ae..00000000 --- a/tests/assets/levenshtein.py +++ /dev/null @@ -1,99 +0,0 @@ -def levenshtein_distance_optimized(s1, s2, max_distance=1000): - """ - Calculate the Levenshtein distance between two strings with optimizations and a maximum distance cap. - - This function trims common prefixes and suffixes from the input strings, uses a single-row table - to reduce space complexity, and stops the computation early if the Levenshtein distance is - guaranteed to exceed a maximum distance cap. - - Args: - s1 (str): The first string. 
- s2 (str): The second string. - max_distance (int, optional): The maximum Levenshtein distance. Defaults to 1000. - - Returns: - int: The Levenshtein distance between the two strings, or max_distance if the distance - exceeds max_distance. - """ - # Trim common prefixes - while s1 and s2 and s1[0] == s2[0]: - s1 = s1[1:] - s2 = s2[1:] - - # Trim common suffixes - while s1 and s2 and s1[-1] == s2[-1]: - s1 = s1[:-1] - s2 = s2[:-1] - - len_s1 = len(s1) - len_s2 = len(s2) - - # If the length difference between the strings exceeds max_distance, stop the computation - if abs(len_s1 - len_s2) > max_distance: - return max_distance - - # If one of the strings is empty, the distance is the length of the other string - if len_s1 == 0: - return min(len_s2, max_distance) - if len_s2 == 0: - return min(len_s1, max_distance) - - # Create a single-row table with len(s2) + 1 columns - distance = list(range(len_s2 + 1)) - - # Fill up the table - for i in range(1, len_s1 + 1): - # Store the value of the previous cell in the previous row - prev_row_cell = i - 1 - # The value at the first column is the row number - distance[0] = i - - # Initialize the minimum distance in the current row to max_distance - min_distance = max_distance - - for j in range(1, len_s2 + 1): - # Store the value of the current cell before it is updated - current_cell = distance[j] - - # If the current characters of the two strings are the same, the cost is 0, otherwise 1 - substitution_cost = 0 if s1[i - 1] == s2[j - 1] else 1 - - # The value at the current cell is the minimum of the values at the previous cell in the - # current row, the current cell in the previous row, and the previous cell in the previous row, - # plus the cost - distance[j] = min( - distance[j - 1] + 1, # deletion - distance[j] + 1, # insertion - prev_row_cell + substitution_cost, - ) # substitution - - # Update the minimum distance in the current row - min_distance = min(min_distance, distance[j]) - - # Update the value of the previous cell in the previous row - prev_row_cell = current_cell - - # If the minimum distance in the current row exceeds max_distance, stop the computation - if min_distance >= max_distance: - return max_distance - - # The Levenshtein distance between the two strings is the value at the last cell in the table - return min(distance[-1], max_distance) - - -def extract_answer(generation: str): - """Extracts answer from generation. - - Handles a generation that if separated by "---" with the answer being the first part. - Also handles a generation that starts with "```\n" and removes it. 
- """ - answer = generation.split("---")[0].strip() - if answer.startswith("```\n"): - answer = answer[4:].strip() - - return answer - - -def compare_log_and_target(log, testcase): - target = testcase["target"]["output"] - return levenshtein_distance_optimized(target, extract_answer(log["output"])) diff --git a/tests/conftest.py b/tests/conftest.py deleted file mode 100644 index 80e3b336..00000000 --- a/tests/conftest.py +++ /dev/null @@ -1,278 +0,0 @@ -from dataclasses import asdict, dataclass -import os -import random -import string -import time -from typing import Callable, Generator -import typing -from unittest.mock import MagicMock - -from dotenv import load_dotenv -import pytest -from humanloop.base_client import BaseHumanloop -from humanloop.client import Humanloop -from humanloop.otel.exporter import HumanloopSpanExporter -from humanloop.otel.processor import HumanloopSpanProcessor -from openai.types.chat.chat_completion_message_param import ChatCompletionMessageParam -from opentelemetry.instrumentation.anthropic import AnthropicInstrumentor -from opentelemetry.instrumentation.cohere import CohereInstrumentor -from opentelemetry.instrumentation.groq import GroqInstrumentor -from opentelemetry.instrumentation.instrumentor import BaseInstrumentor # type: ignore -from opentelemetry.instrumentation.openai import OpenAIInstrumentor -from opentelemetry.instrumentation.replicate import ReplicateInstrumentor -from opentelemetry.sdk.resources import Resource -from opentelemetry.sdk.trace import TracerProvider -from opentelemetry.sdk.trace.export import SimpleSpanProcessor -from opentelemetry.sdk.trace.export.in_memory_span_exporter import InMemorySpanExporter -from opentelemetry.trace import Tracer - -if typing.TYPE_CHECKING: - from humanloop.client import BaseHumanloop - - -@pytest.fixture(scope="function") -def opentelemetry_test_provider() -> TracerProvider: - """Create a test TracerProvider with a resource. - - This is similar to the created TracerProvider in the - Humanloop class. - """ - provider = TracerProvider( - resource=Resource.create( - { - "service": "humanloop.sdk", - "environment": "test", - } - ) - ) - return provider - - -@pytest.fixture(scope="function") -def test_span(opentelemetry_test_provider: TracerProvider): - exporter = InMemorySpanExporter() - processor = SimpleSpanProcessor(exporter) - opentelemetry_test_provider.add_span_processor(processor) - tracer = opentelemetry_test_provider.get_tracer("test") - return tracer.start_span("test_span") - - -@pytest.fixture(scope="function") -def opentelemetry_test_configuration( - opentelemetry_test_provider: TracerProvider, -) -> Generator[tuple[Tracer, InMemorySpanExporter], None, None]: - """Configure OTel backend without HumanloopSpanProcessor. - - Spans created by Instrumentors will not be used to enrich - Humanloop Spans. 
- """ - exporter = InMemorySpanExporter() - processor = SimpleSpanProcessor(exporter) - opentelemetry_test_provider.add_span_processor(processor) - instrumentors: list[BaseInstrumentor] = [ - OpenAIInstrumentor(), - AnthropicInstrumentor(), - GroqInstrumentor(), - CohereInstrumentor(), - ReplicateInstrumentor(), - ] - for instrumentor in instrumentors: - instrumentor.instrument(tracer_provider=opentelemetry_test_provider) - tracer = opentelemetry_test_provider.get_tracer("test") - # Circumvent configuration procedure - - yield tracer, exporter - - for instrumentor in instrumentors: - instrumentor.uninstrument() - - -@pytest.fixture(scope="function") -def opentelemetry_hl_test_configuration( - opentelemetry_test_provider: TracerProvider, - humanloop_client: BaseHumanloop, -) -> Generator[tuple[Tracer, InMemorySpanExporter], None, None]: - """Configure OTel backend with HumanloopSpanProcessor. - - Spans created by Instrumentors will be used to enrich - Humanloop Spans. - """ - exporter = InMemorySpanExporter() - processor = HumanloopSpanProcessor(exporter=exporter) - opentelemetry_test_provider.add_span_processor(processor) - instrumentors: list[BaseInstrumentor] = [ - OpenAIInstrumentor(), - AnthropicInstrumentor(), - GroqInstrumentor(), - CohereInstrumentor(), - ReplicateInstrumentor(), - AnthropicInstrumentor(), - ] - for instrumentor in instrumentors: - instrumentor.instrument( - tracer_provider=opentelemetry_test_provider, - ) - tracer = opentelemetry_test_provider.get_tracer("test") - - yield tracer, exporter - - for instrumentor in instrumentors: - instrumentor.uninstrument() - - -@pytest.fixture(scope="function") -def hl_test_exporter() -> HumanloopSpanExporter: - """ - Test Exporter where HTTP calls to Humanloop API - are mocked. - """ - client = MagicMock() - exporter = HumanloopSpanExporter(client=client) - return exporter - - -@pytest.fixture(scope="function") -def opentelemetry_hl_with_exporter_test_configuration( - hl_test_exporter: HumanloopSpanExporter, - opentelemetry_test_provider: TracerProvider, -) -> Generator[tuple[Tracer, HumanloopSpanExporter], None, None]: - """Configure OTel backend with HumanloopSpanProcessor and - a HumanloopSpanExporter where HTTP calls are mocked. 
- """ - processor = HumanloopSpanProcessor(exporter=hl_test_exporter) - opentelemetry_test_provider.add_span_processor(processor) - instrumentor = OpenAIInstrumentor() - instrumentor.instrument(tracer_provider=opentelemetry_test_provider) - tracer = opentelemetry_test_provider.get_tracer("test") - - yield tracer, hl_test_exporter - - instrumentor.uninstrument() - - -@pytest.fixture(scope="session") -def call_llm_messages() -> list[ChatCompletionMessageParam]: - return [ - { - "role": "system", - "content": "You are an assistant on the following topics: greetings in foreign languages.", - }, - { - "role": "user", - "content": "Bonjour!", - }, - ] - - -@dataclass -class APIKeys: - openai: str - humanloop: str - - -@pytest.fixture(scope="session") -def api_keys() -> APIKeys: - openai_key = os.getenv("OPENAI_API_KEY") - humanloop_key = os.getenv("HUMANLOOP_API_KEY") - for key_name, key_value in [ - ("OPENAI_API_KEY", openai_key), - ("HUMANLOOP_API_KEY", humanloop_key), - ]: - if key_value is None: - raise ValueError(f"{key_name} is not set in .env file") - api_keys = APIKeys( - openai=openai_key, # type: ignore [arg-type] - humanloop=humanloop_key, # type: ignore [arg-type] - ) - for key, value in asdict(api_keys).items(): - if value is None: - raise ValueError(f"{key.upper()} key is not set in .env file") - return api_keys - - -@pytest.fixture(scope="session") -def humanloop_client(api_keys: APIKeys) -> Humanloop: - return Humanloop(api_key=api_keys.humanloop) - - -@pytest.fixture(scope="session", autouse=True) -def load_env(): - load_dotenv() - - -def directory_cleanup(directory_id: str, humanloop_client: Humanloop): - response = humanloop_client.directories.get(directory_id) - for file in response.files: - file_id = file.id - if file.type == "prompt": - client = humanloop_client.prompts # type: ignore [assignment] - elif file.type == "tool": - client = humanloop_client.tools # type: ignore [assignment] - elif file.type == "dataset": - client = humanloop_client.datasets # type: ignore [assignment] - elif file.type == "evaluator": - client = humanloop_client.evaluators # type: ignore [assignment] - elif file.type == "flow": - client = humanloop_client.flows # type: ignore [assignment] - else: - raise NotImplementedError(f"Unknown HL file type {file.type}") - client.delete(file_id) - - for subdirectory in response.subdirectories: - directory_cleanup( - directory_id=subdirectory.id, - humanloop_client=humanloop_client, - ) - - humanloop_client.directories.delete(id=response.id) - - -@dataclass -class DirectoryIdentifiers: - path: str - id: str - - -@pytest.fixture() -def test_directory( - humanloop_client: Humanloop, -) -> Generator[DirectoryIdentifiers, None, None]: - # Generate a random alphanumeric directory name to avoid conflicts - def get_random_string(length: int = 16) -> str: - return "".join([random.choice(string.ascii_letters + "0123456789") for _ in range(length)]) - - directory_path = "SDK_integ_test_" + get_random_string() - response = humanloop_client.directories.create(path=directory_path) - assert response.path == directory_path - try: - yield DirectoryIdentifiers( - path=response.path, - id=response.id, - ) - finally: - time.sleep(1) - directory_cleanup(response.id, humanloop_client) - - -@pytest.fixture() -def get_test_path(test_directory: DirectoryIdentifiers) -> Callable[[str], str]: - def generate_path(name: str) -> str: - return f"{test_directory.path}/{name}" - - return generate_path - - -# @pytest.fixture(scope="session", autouse=True) -# def 
cleanup_test_dirs(humanloop_client: Humanloop): -# def _cleanup_all_test_dirs(): -# dirs = humanloop_client.directories.list() -# for dir in dirs: -# if dir.path.startswith("SDK_integ_test_"): -# directory_cleanup( -# directory_id=dir.id, -# humanloop_client=humanloop_client, -# ) - -# _cleanup_all_test_dirs() -# yield -# _cleanup_all_test_dirs() diff --git a/tests/custom/conftest.py b/tests/custom/conftest.py index 7667dedf..8e400483 100644 --- a/tests/custom/conftest.py +++ b/tests/custom/conftest.py @@ -1,12 +1,9 @@ -from typing import Generator import os -from dotenv import load_dotenv +from typing import Generator from unittest.mock import MagicMock import pytest -from humanloop.client import Humanloop -from humanloop.otel.exporter import HumanloopSpanExporter -from humanloop.otel.processor import HumanloopSpanProcessor +from dotenv import load_dotenv from openai.types.chat.chat_completion_message_param import ChatCompletionMessageParam from opentelemetry.instrumentation.anthropic import AnthropicInstrumentor from opentelemetry.instrumentation.cohere import CohereInstrumentor @@ -19,6 +16,10 @@ from opentelemetry.sdk.trace.export import SimpleSpanProcessor from opentelemetry.sdk.trace.export.in_memory_span_exporter import InMemorySpanExporter from opentelemetry.trace import Tracer + +from humanloop.client import Humanloop +from humanloop.otel.exporter import HumanloopSpanExporter +from humanloop.otel.processor import HumanloopSpanProcessor from tests.custom.types import GetHumanloopClientFn diff --git a/tests/custom/integration/conftest.py b/tests/custom/integration/conftest.py index f918c48c..25dc441f 100644 --- a/tests/custom/integration/conftest.py +++ b/tests/custom/integration/conftest.py @@ -1,16 +1,18 @@ -from contextlib import contextmanager, redirect_stdout -from dataclasses import dataclass +import io import os import time -from typing import Any, ContextManager, Generator, List, Union -import io -from typing import TextIO import uuid -import pytest +from contextlib import contextmanager, redirect_stdout +from dataclasses import dataclass +from typing import ContextManager, Generator, List, TextIO, Union + import dotenv +import pytest +from click.testing import CliRunner + from humanloop import AgentResponse, PromptResponse +from humanloop.requests.prompt_kernel_request import PromptKernelRequestParams from tests.custom.types import GetHumanloopClientFn, SyncableFile -from click.testing import CliRunner @dataclass @@ -78,7 +80,7 @@ def cleanup_directory(directory_id: str): @pytest.fixture(scope="function") -def test_prompt_config() -> dict[str, Any]: +def test_prompt_config() -> PromptKernelRequestParams: return { "provider": "openai", "model": "gpt-4o-mini", @@ -96,6 +98,25 @@ def test_prompt_config() -> dict[str, Any]: } +@pytest.fixture(scope="function") +def prompt( + get_humanloop_client: GetHumanloopClientFn, + sdk_test_dir: str, + test_prompt_config: PromptKernelRequestParams, +) -> Generator[ResourceIdentifiers, None, None]: + humanloop_client = get_humanloop_client() + prompt_path = f"{sdk_test_dir}/prompt" + try: + response = humanloop_client.prompts.upsert( + path=prompt_path, + **test_prompt_config, + ) + yield ResourceIdentifiers(file_id=response.id, file_path=response.path) + humanloop_client.prompts.delete(id=response.id) + except Exception as e: + pytest.fail(f"Failed to create prompt {prompt_path}: {e}") + + @pytest.fixture(scope="function") def eval_dataset( get_humanloop_client: GetHumanloopClientFn, sdk_test_dir: str @@ -131,7 +152,10 @@ def eval_dataset( 
@pytest.fixture(scope="function") def eval_prompt( - get_humanloop_client: GetHumanloopClientFn, sdk_test_dir: str, openai_key: str, test_prompt_config: dict[str, Any] + get_humanloop_client: GetHumanloopClientFn, + sdk_test_dir: str, + openai_key: str, + test_prompt_config: PromptKernelRequestParams, ) -> Generator[ResourceIdentifiers, None, None]: humanloop_client = get_humanloop_client() prompt_path = f"{sdk_test_dir}/eval_prompt" diff --git a/tests/custom/integration/test_decorators.py b/tests/custom/integration/test_decorators.py index 15057ba2..59638896 100644 --- a/tests/custom/integration/test_decorators.py +++ b/tests/custom/integration/test_decorators.py @@ -2,6 +2,7 @@ from typing import Any from openai import OpenAI + from tests.custom.integration.conftest import GetHumanloopClientFn diff --git a/tests/custom/integration/test_evals.py b/tests/custom/integration/test_evals.py index 2ec74d93..d8ba8996 100644 --- a/tests/custom/integration/test_evals.py +++ b/tests/custom/integration/test_evals.py @@ -2,6 +2,7 @@ from typing import Any import pytest + from humanloop.error import HumanloopRuntimeError from tests.custom.integration.conftest import ResourceIdentifiers from tests.custom.types import GetHumanloopClientFn diff --git a/tests/integration/test_prompts.py b/tests/custom/integration/test_prompts.py similarity index 60% rename from tests/integration/test_prompts.py rename to tests/custom/integration/test_prompts.py index 13ca80eb..f6021b7e 100644 --- a/tests/integration/test_prompts.py +++ b/tests/custom/integration/test_prompts.py @@ -1,14 +1,15 @@ -from humanloop.client import Humanloop - -from tests.integration.conftest import TestIdentifiers +from humanloop.requests.prompt_kernel_request import PromptKernelRequestParams +from tests.custom.integration.conftest import ResourceIdentifiers +from tests.custom.types import GetHumanloopClientFn def test_prompts_call( - humanloop_test_client: Humanloop, - prompt: TestIdentifiers, - test_prompt_config: TestIdentifiers, + get_humanloop_client: GetHumanloopClientFn, + prompt: ResourceIdentifiers, + test_prompt_config: PromptKernelRequestParams, ) -> None: - response = humanloop_test_client.prompts.call( # type: ignore [attr-defined] + humanloop_client = get_humanloop_client() + response = humanloop_client.prompts.call( # type: ignore [attr-defined] path=prompt.file_path, prompt={**test_prompt_config}, # type: ignore [misc, arg-type, typeddict-item, dict-item, list-item] inputs={"question": "What is the capital of the France?"}, @@ -24,11 +25,12 @@ def test_prompts_call( def test_prompts_call_stream( - humanloop_test_client: Humanloop, - prompt: TestIdentifiers, - test_prompt_config: TestIdentifiers, + get_humanloop_client: GetHumanloopClientFn, + prompt: ResourceIdentifiers, + test_prompt_config: PromptKernelRequestParams, ) -> None: - response = humanloop_test_client.prompts.call_stream( # type: ignore [attr-defined] + humanloop_client = get_humanloop_client() + response = humanloop_client.prompts.call_stream( # type: ignore [attr-defined] path=prompt.file_path, prompt={**test_prompt_config}, # type: ignore [misc, arg-type, typeddict-item, dict-item, list-item] inputs={"question": "What is the capital of the France?"}, diff --git a/tests/custom/integration/test_sync.py b/tests/custom/integration/test_sync.py index 6e7b002b..80e332a4 100644 --- a/tests/custom/integration/test_sync.py +++ b/tests/custom/integration/test_sync.py @@ -1,10 +1,12 @@ -from typing import List, Union from pathlib import Path +from typing import List, Union 
+ import pytest + from humanloop import AgentResponse, PromptResponse -from humanloop.prompts.client import PromptsClient from humanloop.agents.client import AgentsClient from humanloop.error import HumanloopRuntimeError +from humanloop.prompts.client import PromptsClient from tests.custom.types import GetHumanloopClientFn, SyncableFile diff --git a/tests/custom/integration/test_sync_cli.py b/tests/custom/integration/test_sync_cli.py index 3957aed2..5631d5f0 100644 --- a/tests/custom/integration/test_sync_cli.py +++ b/tests/custom/integration/test_sync_cli.py @@ -1,7 +1,9 @@ from pathlib import Path from unittest import mock + import pytest from click.testing import CliRunner + from humanloop.cli.__main__ import cli from tests.custom.types import SyncableFile diff --git a/tests/custom/otel/test_helpers.py b/tests/custom/otel/test_helpers.py index 3bd5ce45..f7ff6555 100644 --- a/tests/custom/otel/test_helpers.py +++ b/tests/custom/otel/test_helpers.py @@ -1,7 +1,8 @@ import pytest -from humanloop.otel.helpers import read_from_opentelemetry_span, write_to_opentelemetry_span from opentelemetry.sdk.trace import Span +from humanloop.otel.helpers import read_from_opentelemetry_span, write_to_opentelemetry_span + def test_read_empty(test_span: Span): with pytest.raises(TypeError): diff --git a/tests/custom/sync/test_client.py b/tests/custom/sync/test_client.py index a349fd0c..ac83d259 100644 --- a/tests/custom/sync/test_client.py +++ b/tests/custom/sync/test_client.py @@ -1,10 +1,12 @@ import logging -import pytest from pathlib import Path +from typing import Literal from unittest.mock import Mock, patch -from humanloop.sync.sync_client import SyncClient, SerializableFileType + +import pytest + from humanloop.error import HumanloopRuntimeError -from typing import Literal +from humanloop.sync.sync_client import SerializableFileType, SyncClient @pytest.fixture diff --git a/tests/custom/types.py b/tests/custom/types.py index 7a198456..b270d9fa 100644 --- a/tests/custom/types.py +++ b/tests/custom/types.py @@ -1,6 +1,7 @@ -from typing import Protocol, NamedTuple -from humanloop.client import Humanloop +from typing import NamedTuple, Protocol + from humanloop import FileType +from humanloop.client import Humanloop class GetHumanloopClientFn(Protocol): diff --git a/tests/integration/__init__.py b/tests/integration/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py deleted file mode 100644 index d14042a3..00000000 --- a/tests/integration/conftest.py +++ /dev/null @@ -1,169 +0,0 @@ -import io -import os -import uuid -from contextlib import contextmanager, redirect_stdout -from dataclasses import dataclass -from typing import Any, ContextManager, Generator, TextIO - -import dotenv -import pytest -from humanloop.client import Humanloop -from humanloop.requests.prompt_kernel_request import PromptKernelRequestParams - - -@dataclass -class TestIdentifiers: - file_id: str - file_path: str - - -@pytest.fixture() -def capture_stdout() -> ContextManager[TextIO]: - @contextmanager - def _context_manager(): - f = io.StringIO() - with redirect_stdout(f): - yield f - - return _context_manager # type: ignore [return-value] - - -@pytest.fixture(scope="session") -def openai_key() -> str: - dotenv.load_dotenv() - if not os.getenv("OPENAI_API_KEY"): - pytest.fail("OPENAI_API_KEY is not set for integration tests") - return os.getenv("OPENAI_API_KEY") # type: ignore [return-value] - - -@pytest.fixture(scope="session") -def 
humanloop_test_client() -> Humanloop: - dotenv.load_dotenv() - if not os.getenv("HUMANLOOP_API_KEY"): - pytest.fail("HUMANLOOP_API_KEY is not set for integration tests") - return Humanloop(api_key=os.getenv("HUMANLOOP_API_KEY")) # type: ignore [return-value] - - -@pytest.fixture(scope="function") -def sdk_test_dir(humanloop_test_client: Humanloop) -> Generator[str, None, None]: - path = f"SDK_INTEGRATION_TEST_{uuid.uuid4()}" - try: - response = humanloop_test_client.directories.create(path=path) - yield response.path - humanloop_test_client.directories.delete(id=response.id) - except Exception as e: - pytest.fail(f"Failed to create directory {path}: {e}") - - -@pytest.fixture(scope="function") -def test_prompt_config() -> PromptKernelRequestParams: - return { - "provider": "openai", - "model": "gpt-4o-mini", - "temperature": 0.5, - "template": [ - { - "role": "system", - "content": "You are a helpful assistant. You must answer the user's question truthfully and at the level of a 5th grader.", - }, - { - "role": "user", - "content": "{{question}}", - }, - ], - } - - -@pytest.fixture(scope="function") -def eval_dataset(humanloop_test_client: Humanloop, sdk_test_dir: str) -> Generator[TestIdentifiers, None, None]: - dataset_path = f"{sdk_test_dir}/eval_dataset" - try: - response = humanloop_test_client.datasets.upsert( - path=dataset_path, - datapoints=[ - { - "inputs": { - "question": "What is the capital of the France?", - }, - }, - { - "inputs": { - "question": "What is the capital of the Germany?", - }, - }, - { - "inputs": { - "question": "What is 2+2?", - }, - }, - ], - ) - yield TestIdentifiers(file_id=response.id, file_path=response.path) - humanloop_test_client.datasets.delete(id=response.id) - except Exception as e: - pytest.fail(f"Failed to create dataset {dataset_path}: {e}") - - -@pytest.fixture(scope="function") -def eval_prompt( - humanloop_test_client: Humanloop, sdk_test_dir: str, openai_key: str, test_prompt_config: dict[str, Any] -) -> Generator[TestIdentifiers, None, None]: - prompt_path = f"{sdk_test_dir}/eval_prompt" - try: - response = humanloop_test_client.prompts.upsert( - path=prompt_path, - **test_prompt_config, - ) - yield TestIdentifiers(file_id=response.id, file_path=response.path) - humanloop_test_client.prompts.delete(id=response.id) - except Exception as e: - pytest.fail(f"Failed to create prompt {prompt_path}: {e}") - - -@pytest.fixture(scope="function") -def prompt( - humanloop_test_client: Humanloop, sdk_test_dir: str, openai_key: str, test_prompt_config: dict[str, Any] -) -> Generator[TestIdentifiers, None, None]: - prompt_path = f"{sdk_test_dir}/prompt" - try: - response = humanloop_test_client.prompts.upsert( - path=prompt_path, - **test_prompt_config, - ) - yield TestIdentifiers(file_id=response.id, file_path=response.path) - humanloop_test_client.prompts.delete(id=response.id) - except Exception as e: - pytest.fail(f"Failed to create prompt {prompt_path}: {e}") - - -@pytest.fixture(scope="function") -def output_not_null_evaluator( - humanloop_test_client: Humanloop, sdk_test_dir: str -) -> Generator[TestIdentifiers, None, None]: - evaluator_path = f"{sdk_test_dir}/output_not_null_evaluator" - try: - response = humanloop_test_client.evaluators.upsert( - path=evaluator_path, - spec={ - "arguments_type": "target_required", - "return_type": "boolean", - "code": """ -def output_not_null(log: dict) -> bool: - return log["output"] is not None - """, - "evaluator_type": "python", - }, - ) - yield TestIdentifiers(file_id=response.id, file_path=response.path) - 
humanloop_test_client.evaluators.delete(id=response.id) - except Exception as e: - pytest.fail(f"Failed to create evaluator {evaluator_path}: {e}") - - -@pytest.fixture(scope="function") -def id_for_staging_environment(humanloop_test_client: Humanloop, eval_prompt: TestIdentifiers) -> str: - response = humanloop_test_client.prompts.list_environments(id=eval_prompt.file_id) - for environment in response: - if environment.name == "staging": - return environment.id - pytest.fail("Staging environment not found") diff --git a/tests/integration/test_decorators.py b/tests/integration/test_decorators.py deleted file mode 100644 index 218453a6..00000000 --- a/tests/integration/test_decorators.py +++ /dev/null @@ -1,154 +0,0 @@ -import time -from typing import Any - -from openai import OpenAI -from humanloop.client import Humanloop -from humanloop.types.chat_message import ChatMessage - - -def test_prompt_decorator( - humanloop_test_client: Humanloop, - sdk_test_dir: str, - test_prompt_config: dict[str, Any], - openai_key: str, -): - try: - prompt_path = f"{sdk_test_dir}/test_prompt" - prompt_response = humanloop_test_client.prompts.upsert( - path=prompt_path, - **test_prompt_config, - ) - - prompt_versions_response = humanloop_test_client.prompts.list_versions(id=prompt_response.id) - assert len(prompt_versions_response.records) == 1 - - @humanloop_test_client.prompt(path=prompt_path) - def my_prompt(question: str) -> str: - openai_client = OpenAI(api_key=openai_key) - - response = openai_client.chat.completions.create( - model="gpt-4o-mini", - messages=[{"role": "user", "content": question}], - ) - - assert response.choices[0].message.content is not None - return response.choices[0].message.content - - assert "paris" in my_prompt("What is the capital of the France?").lower() - - time.sleep(5) - prompt_versions_response = humanloop_test_client.prompts.list_versions(id=prompt_response.id) - assert len(prompt_versions_response.records) == 2 - - logs_response = humanloop_test_client.logs.list(file_id=prompt_response.id, page=1, size=50) - - assert logs_response.items is not None and len(logs_response.items) == 1 - finally: - humanloop_test_client.prompts.delete(id=prompt_response.id) - - -def test_call_prompt_in_flow_decorator( - humanloop_test_client: Humanloop, - sdk_test_dir: str, - openai_key: str, -): - try: - - @humanloop_test_client.flow(path=f"{sdk_test_dir}/test_flow") - def my_flow(question: str) -> str: - response = humanloop_test_client.prompts.call( - path=f"{sdk_test_dir}/test_prompt", - prompt={ - "provider": "openai", - "model": "gpt-4o-mini", - "temperature": 0, - }, - messages=[{"role": "user", "content": question}], - provider_api_keys={"openai": openai_key}, - ) - - assert response.logs[0].output is not None - return response.logs[0].output - - assert "paris" in my_flow("What is the capital of the France?").lower() - time.sleep(5) - prompt_response = humanloop_test_client.files.retrieve_by_path(path=f"{sdk_test_dir}/test_prompt") - assert prompt_response is not None - prompt_logs_response = humanloop_test_client.logs.list(file_id=prompt_response.id, page=1, size=50) - assert prompt_logs_response.items is not None and len(prompt_logs_response.items) == 1 - prompt_log = prompt_logs_response.items[0] - - flow_response = humanloop_test_client.files.retrieve_by_path(path=f"{sdk_test_dir}/test_flow") - assert flow_response is not None - flow_logs_response = humanloop_test_client.logs.list(file_id=flow_response.id, page=1, size=50) - assert flow_logs_response.items is not None and 
len(flow_logs_response.items) == 1 - flow_log = flow_logs_response.items[0] - assert prompt_log.trace_parent_id == flow_log.id - finally: - flow_response = humanloop_test_client.files.retrieve_by_path(path=f"{sdk_test_dir}/test_flow") - if flow_response is not None: - humanloop_test_client.flows.delete(id=flow_response.id) - prompt_response = humanloop_test_client.files.retrieve_by_path(path=f"{sdk_test_dir}/test_prompt") - if prompt_response is not None: - humanloop_test_client.prompts.delete(id=prompt_response.id) - - -def test_flow_decorator_logs_exceptions( - humanloop_test_client: Humanloop, - sdk_test_dir: str, -): - try: - - @humanloop_test_client.flow(path=f"{sdk_test_dir}/test_flow_log_error") - def my_flow(question: str) -> str: - raise ValueError("This is a test exception") - - my_flow("test") - - time.sleep(5) - - flow_response = humanloop_test_client.files.retrieve_by_path(path=f"{sdk_test_dir}/test_flow_log_error") - assert flow_response is not None - flow_logs_response = humanloop_test_client.logs.list(file_id=flow_response.id, page=1, size=50) - assert flow_logs_response.items is not None and len(flow_logs_response.items) == 1 - flow_log = flow_logs_response.items[0] - assert flow_log.error is not None - assert flow_log.output is None - - finally: - flow_response = humanloop_test_client.files.retrieve_by_path(path=f"{sdk_test_dir}/test_flow_log_error") - if flow_response is not None: - humanloop_test_client.flows.delete(id=flow_response.id) - - -def test_flow_decorator_populates_output_message( - humanloop_test_client: Humanloop, - sdk_test_dir: str, -): - try: - - @humanloop_test_client.flow(path=f"{sdk_test_dir}/test_flow_log_output_message") - def my_flow(question: str) -> dict[str, Any]: - return {"role": "user", "content": question} - - assert "france" in my_flow("What is the capital of the France?")["content"].lower() - - time.sleep(5) - - flow_response = humanloop_test_client.files.retrieve_by_path( - path=f"{sdk_test_dir}/test_flow_log_output_message" - ) - assert flow_response is not None - flow_logs_response = humanloop_test_client.logs.list(file_id=flow_response.id, page=1, size=50) - assert flow_logs_response.items is not None and len(flow_logs_response.items) == 1 - flow_log = flow_logs_response.items[0] - assert flow_log.output_message is not None - assert flow_log.output is None - assert flow_log.error is None - - finally: - flow_response = humanloop_test_client.files.retrieve_by_path( - path=f"{sdk_test_dir}/test_flow_log_output_message" - ) - if flow_response is not None: - humanloop_test_client.flows.delete(id=flow_response.id) diff --git a/tests/integration/test_evals.py b/tests/integration/test_evals.py deleted file mode 100644 index 49bbb6dc..00000000 --- a/tests/integration/test_evals.py +++ /dev/null @@ -1,402 +0,0 @@ -import time -from typing import Any - -import pytest -from humanloop.client import Humanloop -from humanloop.error import HumanloopRuntimeError -from tests.integration.conftest import TestIdentifiers - - -def test_eval_run_works_on_online_files( - humanloop_test_client: Humanloop, - output_not_null_evaluator: TestIdentifiers, - eval_dataset: TestIdentifiers, - eval_prompt: TestIdentifiers, -) -> None: - humanloop_test_client.evaluations.run( # type: ignore [attr-defined] - name="test_eval_run", - file={ - "path": eval_prompt.file_path, - "type": "prompt", - }, - dataset={ - "path": eval_dataset.file_path, - }, - evaluators=[ - { - "path": output_not_null_evaluator.file_path, - } - ], - ) - time.sleep(5) - response = 
humanloop_test_client.evaluations.list(file_id=eval_prompt.file_id) - assert response.items and len(response.items) == 1 - evaluation_id = response.items[0].id - run_evaluation_response = humanloop_test_client.evaluations.list_runs_for_evaluation(id=evaluation_id) # type: ignore [attr-defined] - assert run_evaluation_response.runs[0].status == "completed" - - -def test_eval_run_version_id( - humanloop_test_client: Humanloop, - output_not_null_evaluator: TestIdentifiers, - eval_dataset: TestIdentifiers, - eval_prompt: TestIdentifiers, - test_prompt_config: dict[str, Any], -) -> None: - # GIVEN a prompt where a non-default version is created - new_test_prompt_config = test_prompt_config.copy() - new_test_prompt_config["temperature"] = 1 - new_prompt_version_response = humanloop_test_client.prompts.upsert( - path=eval_prompt.file_path, - **new_test_prompt_config, - ) - # WHEN creating an evaluation using version_id - humanloop_test_client.evaluations.run( # type: ignore [attr-defined] - name="test_eval_run", - file={ - "id": new_prompt_version_response.id, - "version_id": new_prompt_version_response.version_id, - "type": "prompt", - }, - dataset={ - "path": eval_dataset.file_path, - }, - evaluators=[ - { - "path": output_not_null_evaluator.file_path, - } - ], - ) - # THEN we evaluate the version created in the test - evaluations_response = humanloop_test_client.evaluations.list(file_id=new_prompt_version_response.id) - assert evaluations_response.items and len(evaluations_response.items) == 1 - evaluation_id = evaluations_response.items[0].id - runs_response = humanloop_test_client.evaluations.list_runs_for_evaluation(id=evaluation_id) - assert runs_response.runs[0].status == "completed" - assert ( - runs_response.runs[0].version - and runs_response.runs[0].version.version_id == new_prompt_version_response.version_id - ) - list_versions_response = humanloop_test_client.prompts.list_versions(id=new_prompt_version_response.id) - assert list_versions_response.records and len(list_versions_response.records) == 2 - # THEN the version used in evaluation is not the default version - response = humanloop_test_client.prompts.get(id=new_prompt_version_response.id) - assert response.version_id != new_prompt_version_response.version_id - - -def test_eval_run_environment( - humanloop_test_client: Humanloop, - output_not_null_evaluator: TestIdentifiers, - eval_dataset: TestIdentifiers, - eval_prompt: TestIdentifiers, - test_prompt_config: dict[str, Any], - id_for_staging_environment: str, -) -> None: - # GIVEN a prompt deployed to staging environment - new_test_prompt_config = test_prompt_config.copy() - new_test_prompt_config["temperature"] = 1 - new_prompt_version_response = humanloop_test_client.prompts.upsert( - path=eval_prompt.file_path, - **new_test_prompt_config, - ) - humanloop_test_client.prompts.set_deployment( - id=new_prompt_version_response.id, - environment_id=id_for_staging_environment, - version_id=new_prompt_version_response.version_id, - ) - # WHEN creating an evaluation using environment - humanloop_test_client.evaluations.run( # type: ignore [attr-defined] - name="test_eval_run", - file={ - "id": new_prompt_version_response.id, - "type": "prompt", - "environment": "staging", - }, - dataset={ - "path": eval_dataset.file_path, - }, - evaluators=[ - { - "path": output_not_null_evaluator.file_path, - } - ], - ) - # THEN evaluation is done with the version deployed to staging environment - evaluations_response = humanloop_test_client.evaluations.list(file_id=new_prompt_version_response.id) 
- assert evaluations_response.items and len(evaluations_response.items) == 1 - evaluation_id = evaluations_response.items[0].id - runs_response = humanloop_test_client.evaluations.list_runs_for_evaluation(id=evaluation_id) - assert runs_response.runs[0].status == "completed" - assert ( - runs_response.runs[0].version - and runs_response.runs[0].version.version_id == new_prompt_version_response.version_id - ) - default_prompt_version_response = humanloop_test_client.prompts.get(id=new_prompt_version_response.id) - assert default_prompt_version_response.version_id != new_prompt_version_response.version_id - - -@pytest.mark.parametrize("version_lookup", ["version_id", "environment"]) -def test_eval_run_version_lookup_fails_with_path( - humanloop_test_client: Humanloop, - eval_prompt: TestIdentifiers, - eval_dataset: TestIdentifiers, - output_not_null_evaluator: TestIdentifiers, - version_lookup: str, -): - # GIVEN an eval run where we try to evaluate a non-default version - with pytest.raises(HumanloopRuntimeError) as e: - humanloop_test_client.evaluations.run( # type: ignore [attr-defined] - name="test_eval_run", - file={ - "path": eval_prompt.file_path, - "type": "prompt", - # WHEN the File id is not passed in file - version_lookup: "will_not_work", - }, - dataset={ - "path": eval_dataset.file_path, - }, - evaluators=[ - { - "path": output_not_null_evaluator.file_path, - } - ], - ) - # THEN an error is raised - assert "You must provide the `file.id` when addressing a file by version ID or environment" in str(e.value) - - -def test_eval_run_with_version_upsert( - humanloop_test_client: Humanloop, - eval_prompt: TestIdentifiers, - eval_dataset: TestIdentifiers, - output_not_null_evaluator: TestIdentifiers, - test_prompt_config: dict[str, Any], -): - humanloop_test_client.evaluations.run( # type: ignore [attr-defined] - name="test_eval_run", - file={ - "path": eval_prompt.file_path, - "type": "prompt", - "version": { - **test_prompt_config, - "temperature": 1, - }, - }, - dataset={ - "path": eval_dataset.file_path, - }, - evaluators=[ - { - "path": output_not_null_evaluator.file_path, - } - ], - ) - # THEN the version is upserted and evaluation finishes successfully - evaluations_response = humanloop_test_client.evaluations.list(file_id=eval_prompt.file_id) - assert evaluations_response.items and len(evaluations_response.items) == 1 - evaluation_id = evaluations_response.items[0].id - runs_response = humanloop_test_client.evaluations.list_runs_for_evaluation(id=evaluation_id) - assert runs_response.runs[0].status == "completed" - # THEN a version was upserted based on file.version - list_prompt_versions_response = humanloop_test_client.prompts.list_versions(id=eval_prompt.file_id) - assert list_prompt_versions_response.records and len(list_prompt_versions_response.records) == 2 - - -def test_flow_eval_does_not_work_without_callable( - humanloop_test_client: Humanloop, - eval_dataset: TestIdentifiers, - output_not_null_evaluator: TestIdentifiers, -): - with pytest.raises(HumanloopRuntimeError) as e: - humanloop_test_client.evaluations.run( # type: ignore [attr-defined] - name="test_eval_run", - file={ - "path": "Test Flow", - "type": "flow", - "version": { - "attributes": { - "foo": "bar", - } - }, - }, - dataset={ - "path": eval_dataset.file_path, - }, - evaluators=[ - { - "path": output_not_null_evaluator.file_path, - } - ], - ) - # THEN an error is raised - assert "You must provide a `callable` for your Flow `file` to run a local eval." 
in str(e.value) - - -def test_flow_eval_works_with_callable( - humanloop_test_client: Humanloop, - eval_dataset: TestIdentifiers, - output_not_null_evaluator: TestIdentifiers, - sdk_test_dir: str, -): - flow_path = f"{sdk_test_dir}/Test Flow" - # GIVEN a flow with a callable - flow_response = humanloop_test_client.flows.upsert( - path=flow_path, - attributes={ - "foo": "bar", - }, - ) - try: - flow = humanloop_test_client.flows.upsert( - path=flow_path, - attributes={ - "foo": "bar", - }, - ) - # WHEN we run an evaluation with the flow - humanloop_test_client.evaluations.run( # type: ignore [attr-defined] - name="test_eval_run", - file={ - "id": flow.id, - "type": "flow", - "callable": lambda question: "bar", - }, - dataset={ - "path": eval_dataset.file_path, - }, - evaluators=[ - { - "path": output_not_null_evaluator.file_path, - } - ], - ) - # THEN the evaluation finishes successfully - evaluations_response = humanloop_test_client.evaluations.list(file_id=flow.id) - assert evaluations_response.items and len(evaluations_response.items) == 1 - evaluation_id = evaluations_response.items[0].id - runs_response = humanloop_test_client.evaluations.list_runs_for_evaluation(id=evaluation_id) - assert runs_response.runs[0].status == "completed" - finally: - humanloop_test_client.flows.delete(id=flow_response.id) - - -def test_cannot_evaluate_agent_with_callable( - humanloop_test_client: Humanloop, - eval_dataset: TestIdentifiers, - output_not_null_evaluator: TestIdentifiers, -): - with pytest.raises(ValueError) as e: - humanloop_test_client.evaluations.run( # type: ignore [attr-defined] - name="test_eval_run", - file={ - "path": "Test Agent", - "type": "agent", - "callable": lambda question: "bar", - }, - dataset={ - "path": eval_dataset.file_path, - }, - evaluators=[ - { - "path": output_not_null_evaluator.file_path, - } - ], - ) - assert str(e.value) == "Agent evaluation is only possible on the Humanloop runtime, do not provide a `callable`." 
- - -def test_flow_eval_resolves_to_default_with_callable( - humanloop_test_client: Humanloop, - output_not_null_evaluator: TestIdentifiers, - eval_dataset: TestIdentifiers, - sdk_test_dir: str, -) -> None: - # GIVEN a flow with some attributes - flow_path = f"{sdk_test_dir}/Test Flow" - flow_response = humanloop_test_client.flows.upsert( - path=flow_path, - attributes={ - "foo": "bar", - }, - ) - try: - # WHEN running an evaluation with the flow's callable but no version - humanloop_test_client.evaluations.run( # type: ignore [attr-defined] - name="test_eval_run", - file={ - "id": flow_response.id, - "type": "flow", - "callable": lambda question: "It's complicated don't worry about it", - }, - dataset={ - "path": eval_dataset.file_path, - }, - evaluators=[ - { - "path": output_not_null_evaluator.file_path, - } - ], - ) - # THEN the evaluation finishes successfully - evaluations_response = humanloop_test_client.evaluations.list(file_id=flow_response.id) - assert evaluations_response.items and len(evaluations_response.items) == 1 - evaluation_id = evaluations_response.items and evaluations_response.items[0].id - runs_response = humanloop_test_client.evaluations.list_runs_for_evaluation(id=evaluation_id) # type: ignore [attr-defined, arg-type] - assert runs_response.runs[0].status == "completed" - finally: - # Clean up test resources - humanloop_test_client.flows.delete(id=flow_response.id) - - -@pytest.mark.skip(reason="Skip until agents are in prod") -def test_agent_eval_works_upserting( - humanloop_test_client: Humanloop, - eval_dataset: TestIdentifiers, - output_not_null_evaluator: TestIdentifiers, - sdk_test_dir: str, -): - humanloop_test_client.evaluations.run( # type: ignore [attr-defined] - name="test_eval_run", - file={ - "path": f"{sdk_test_dir}/Test Agent", - "type": "agent", - "version": { - "model": "gpt-4o", - "template": [ - { - "role": "system", - "content": "You are a helpful assistant, offering very short answers.", - }, - { - "role": "user", - "content": "{{question}}", - }, - ], - "provider": "openai", - "temperature": 0, - "max_iterations": 5, - }, - }, - dataset={ - "path": eval_dataset.file_path, - }, - evaluators=[ - { - "path": output_not_null_evaluator.file_path, - } - ], - ) - files_response = humanloop_test_client.files.list_files(page=1, size=100) - eval_agent = None - for file in files_response.records: - if file.path == f"{sdk_test_dir}/Test Agent": - eval_agent = file - break - assert eval_agent and eval_agent.type == "agent" - # THEN the evaluation finishes successfully - evaluations_response = humanloop_test_client.evaluations.list(file_id=eval_agent.id) - assert evaluations_response.items and len(evaluations_response.items) == 1 - evaluation_id = evaluations_response.items[0].id - runs_response = humanloop_test_client.evaluations.list_runs_for_evaluation(id=evaluation_id) # type: ignore [attr-defined, arg-type] - assert runs_response.runs[0].status == "completed" diff --git a/tests/otel/__init__.py b/tests/otel/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/otel/test_helpers.py b/tests/otel/test_helpers.py deleted file mode 100644 index 3bd5ce45..00000000 --- a/tests/otel/test_helpers.py +++ /dev/null @@ -1,172 +0,0 @@ -import pytest -from humanloop.otel.helpers import read_from_opentelemetry_span, write_to_opentelemetry_span -from opentelemetry.sdk.trace import Span - - -def test_read_empty(test_span: Span): - with pytest.raises(TypeError): - assert read_from_opentelemetry_span(test_span) == {} - - -def 
test_read_non_existent_key(test_span: Span): - with pytest.raises(TypeError): - assert read_from_opentelemetry_span(test_span, "key") == {} - write_to_opentelemetry_span(test_span, {"x": 7, "y": "foo"}, key="key") - # NOTE: attributes cannot be None at this point - assert dict(test_span.attributes) == { # type: ignore - "key.x": 7, - "key.y": "foo", - } - with pytest.raises(TypeError): - assert read_from_opentelemetry_span(test_span, "key.z") is None - - -def test_simple_dict(test_span: Span): - write_to_opentelemetry_span(test_span, {"x": 7, "y": "foo"}, "key") - # NOTE: attributes cannot be None at this point - assert dict(test_span.attributes) == { # type: ignore - "key.x": 7, - "key.y": "foo", - } - assert read_from_opentelemetry_span(test_span, "key") == {"x": 7, "y": "foo"} - - -def test_no_prefix(test_span: Span): - write_to_opentelemetry_span(test_span, {"x": 7, "y": "foo"}) - # NOTE: attributes cannot be None at this point - assert dict(test_span.attributes) == { # type: ignore - "x": 7, - "y": "foo", - } - assert read_from_opentelemetry_span(test_span) == {"x": 7, "y": "foo"} - - -def test_nested_object(test_span: Span): - write_to_opentelemetry_span(test_span, {"x": 7, "y": {"z": "foo"}}, "key") - # NOTE: attributes cannot be None at this point - assert dict(test_span.attributes) == { # type: ignore - "key.x": 7, - "key.y.z": "foo", - } - assert read_from_opentelemetry_span(test_span, "key") == {"x": 7, "y": {"z": "foo"}} - - -def test_list(test_span: Span): - write_to_opentelemetry_span( - test_span, - [{"x": 7, "y": "foo"}, {"z": "bar"}], # type: ignore - "key", - ) # type: ignore - # NOTE: attributes cannot be None at this point - assert dict(test_span.attributes) == { # type: ignore - "key.0.x": 7, - "key.0.y": "foo", - "key.1.z": "bar", - } - assert read_from_opentelemetry_span(test_span, "key") == [ - {"z": "bar"}, - {"x": 7, "y": "foo"}, - ] - - -def test_list_no_prefix(test_span: Span): - write_to_opentelemetry_span( - test_span, - [{"x": 7, "y": "foo"}, {"z": "bar"}], # type: ignore - ) - # NOTE: attributes cannot be None at this point - assert dict(test_span.attributes) == { # type: ignore - "0.x": 7, - "0.y": "foo", - "1.z": "bar", - } - assert read_from_opentelemetry_span(test_span) == [ - {"z": "bar"}, - {"x": 7, "y": "foo"}, - ] - - -def test_multiple_nestings(test_span: Span): - write_to_opentelemetry_span( - test_span, - [ - {"x": 7, "y": "foo"}, - [{"z": "bar"}, {"a": 42}], - ], # type: ignore - "key", - ) - assert dict(test_span.attributes) == { # type: ignore - "key.0.x": 7, - "key.0.y": "foo", - "key.1.0.z": "bar", - "key.1.1.a": 42, - } - assert read_from_opentelemetry_span(test_span, "key") == [ - [ - {"a": 42}, - {"z": "bar"}, - ], - {"x": 7, "y": "foo"}, - ] - - -def test_read_mixed_numeric_string_keys(test_span: Span): - test_span.set_attributes( - { - "key.0.x": 7, - "key.0.y": "foo", - "key.a.z": "bar", - "key.a.a": 42, - } - ) - assert read_from_opentelemetry_span(span=test_span, key="key") == { # type: ignore - "0": {"x": 7, "y": "foo"}, - "a": {"z": "bar", "a": 42}, - } - assert read_from_opentelemetry_span(span=test_span) == { # type: ignore - "key": { - "0": {"x": 7, "y": "foo"}, - "a": {"z": "bar", "a": 42}, - } - } - - -def test_sub_key_same_as_key(test_span: Span): - write_to_opentelemetry_span(test_span, {"key": 7}, "key") - # NOTE: attributes cannot be None at this point - assert dict(test_span.attributes) == { # type: ignore - "key.key": 7, - } - assert read_from_opentelemetry_span(test_span, "key") == {"key": 7} - - -def 
test_read_nested_key(test_span: Span): - test_span.set_attributes({"key.x": 7, "key.y.z": "foo"}) - assert read_from_opentelemetry_span(span=test_span, key="key.y") == {"z": "foo"} - - -def test_write_read_sub_key(test_span: Span): - write_to_opentelemetry_span(test_span, {"x": 7, "y": "foo"}, "key") - assert read_from_opentelemetry_span(test_span, "key.x") == 7 - assert read_from_opentelemetry_span(test_span, "key.y") == "foo" - assert read_from_opentelemetry_span(test_span, "key") == {"x": 7, "y": "foo"} - - -def test_write_drops_dict_all_null_values(test_span: Span): - # GIVEN a test_span to which a value with null values is written - # NOTE: mypy complains about None value in the dict, but it is intentionally under test - write_to_opentelemetry_span(test_span, {"x": None, "y": None}, "key") # type: ignore - # WHEN reading the value from the span - # THEN the value is not present in the span attributes - assert "key" not in test_span.attributes # type: ignore - with pytest.raises(TypeError): - assert read_from_opentelemetry_span(test_span, "key") == {} - - -def test_write_drops_null_value_from_dict(test_span: Span): - # GIVEN a test_span to which a dict with some null values are written - # NOTE: mypy complains about None value in the dict, but it is intentionally under test - write_to_opentelemetry_span(test_span, {"x": 2, "y": None}, "key") # type: ignore - # WHEN reading the values from the span - # THEN the value with null value is not present in the span attributes - assert read_from_opentelemetry_span(test_span, "key") == {"x": 2} diff --git a/tests/utils/assets/models/__init__.py b/tests/utils/assets/models/__init__.py index 3a1c852e..2cf01263 100644 --- a/tests/utils/assets/models/__init__.py +++ b/tests/utils/assets/models/__init__.py @@ -5,7 +5,7 @@ from .circle import CircleParams from .object_with_defaults import ObjectWithDefaultsParams from .object_with_optional_field import ObjectWithOptionalFieldParams -from .shape import ShapeParams, Shape_CircleParams, Shape_SquareParams +from .shape import Shape_CircleParams, Shape_SquareParams, ShapeParams from .square import SquareParams from .undiscriminated_shape import UndiscriminatedShapeParams diff --git a/tests/utils/assets/models/circle.py b/tests/utils/assets/models/circle.py index 759fe3eb..6125ca54 100644 --- a/tests/utils/assets/models/circle.py +++ b/tests/utils/assets/models/circle.py @@ -3,6 +3,7 @@ # This file was auto-generated by Fern from our API Definition. import typing_extensions + from humanloop.core.serialization import FieldMetadata diff --git a/tests/utils/assets/models/object_with_defaults.py b/tests/utils/assets/models/object_with_defaults.py index ef14f7b2..a977b1d2 100644 --- a/tests/utils/assets/models/object_with_defaults.py +++ b/tests/utils/assets/models/object_with_defaults.py @@ -3,7 +3,6 @@ # This file was auto-generated by Fern from our API Definition. import typing_extensions -import typing_extensions class ObjectWithDefaultsParams(typing_extensions.TypedDict): diff --git a/tests/utils/assets/models/object_with_optional_field.py b/tests/utils/assets/models/object_with_optional_field.py index dc3e3eb7..e4ffe724 100644 --- a/tests/utils/assets/models/object_with_optional_field.py +++ b/tests/utils/assets/models/object_with_optional_field.py @@ -2,15 +2,17 @@ # This file was auto-generated by Fern from our API Definition. 
-import typing_extensions -import typing -from humanloop.core.serialization import FieldMetadata import datetime as dt +import typing import uuid + +import typing_extensions from .color import Color from .shape import ShapeParams from .undiscriminated_shape import UndiscriminatedShapeParams +from humanloop.core.serialization import FieldMetadata + class ObjectWithOptionalFieldParams(typing_extensions.TypedDict): literal: typing.Literal["lit_one"] diff --git a/tests/utils/assets/models/shape.py b/tests/utils/assets/models/shape.py index 540ccabd..56394d93 100644 --- a/tests/utils/assets/models/shape.py +++ b/tests/utils/assets/models/shape.py @@ -3,8 +3,11 @@ # This file was auto-generated by Fern from our API Definition. from __future__ import annotations -import typing_extensions + import typing + +import typing_extensions + from humanloop.core.serialization import FieldMetadata diff --git a/tests/utils/assets/models/square.py b/tests/utils/assets/models/square.py index da4a2111..3f25005d 100644 --- a/tests/utils/assets/models/square.py +++ b/tests/utils/assets/models/square.py @@ -3,6 +3,7 @@ # This file was auto-generated by Fern from our API Definition. import typing_extensions + from humanloop.core.serialization import FieldMetadata diff --git a/tests/utils/assets/models/undiscriminated_shape.py b/tests/utils/assets/models/undiscriminated_shape.py index 68876a23..99f12b30 100644 --- a/tests/utils/assets/models/undiscriminated_shape.py +++ b/tests/utils/assets/models/undiscriminated_shape.py @@ -3,6 +3,7 @@ # This file was auto-generated by Fern from our API Definition. import typing + from .circle import CircleParams from .square import SquareParams diff --git a/tests/utils/test_serialization.py b/tests/utils/test_serialization.py index 2ad8e1b5..40cc847b 100644 --- a/tests/utils/test_serialization.py +++ b/tests/utils/test_serialization.py @@ -2,10 +2,10 @@ from typing import Any, List -from humanloop.core.serialization import convert_and_respect_annotation_metadata - from .assets.models import ObjectWithOptionalFieldParams, ShapeParams +from humanloop.core.serialization import convert_and_respect_annotation_metadata + UNION_TEST: ShapeParams = {"radius_measurement": 1.0, "shape_type": "circle", "id": "1"} UNION_TEST_CONVERTED = {"shapeType": "circle", "radiusMeasurement": 1.0, "id": "1"} From 48aba5702876e3c36fa89de8e9059999d27ddf11 Mon Sep 17 00:00:00 2001 From: Ale Pouroullis Date: Tue, 13 May 2025 19:09:07 +0100 Subject: [PATCH 4/6] Update .fernignore with custom code dirs related to syncing --- .fernignore | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.fernignore b/.fernignore index 112f779b..fd7adc81 100644 --- a/.fernignore +++ b/.fernignore @@ -13,10 +13,13 @@ mypy.ini README.md src/humanloop/decorators src/humanloop/otel +src/humanloop/sync +src/humanloop/cli +pytest.ini ## Tests -tests/ +tests/custom ## CI From 0b5e426eeb7c8972bbedda3340b13c2948a91b1f Mon Sep 17 00:00:00 2001 From: Ale Pouroullis Date: Wed, 14 May 2025 10:48:15 +0100 Subject: [PATCH 5/6] Fix mypy errors for python 3.9 --- src/humanloop/client.py | 2 +- src/humanloop/sync/sync_client.py | 14 ++++---- tests/custom/integration/conftest.py | 49 +++++++++++---------------- tests/custom/integration/test_sync.py | 21 +++++++----- 4 files changed, 40 insertions(+), 46 deletions(-) diff --git a/src/humanloop/client.py b/src/humanloop/client.py index fce02a98..ab6b2abc 100644 --- a/src/humanloop/client.py +++ b/src/humanloop/client.py @@ -395,7 +395,7 @@ def agent(): attributes=attributes, ) - 
def pull(self, path: str | None = None, environment: str | None = None) -> Tuple[List[str], List[str]]: + def pull(self, path: Optional[str] = None, environment: Optional[str] = None) -> Tuple[List[str], List[str]]: """Pull Prompt and Agent files from Humanloop to local filesystem. This method will: diff --git a/src/humanloop/sync/sync_client.py b/src/humanloop/sync/sync_client.py index d71f1568..b1cf091a 100644 --- a/src/humanloop/sync/sync_client.py +++ b/src/humanloop/sync/sync_client.py @@ -1,6 +1,6 @@ import logging from pathlib import Path -from typing import List, Tuple, TYPE_CHECKING +from typing import List, Optional, Tuple, TYPE_CHECKING, Union from functools import lru_cache import typing import time @@ -203,7 +203,7 @@ def _save_serialized_file( logger.error(f"Failed to write {file_type} {file_path} to disk: {str(e)}") raise - def _pull_file(self, path: str, environment: str | None = None) -> bool: + def _pull_file(self, path: str, environment: Optional[str] = None) -> bool: """Pull a specific file from Humanloop to local filesystem. Returns: @@ -236,8 +236,8 @@ def _pull_file(self, path: str, environment: str | None = None) -> bool: def _pull_directory( self, - path: str | None = None, - environment: str | None = None, + path: Optional[str] = None, + environment: Optional[str] = None, ) -> Tuple[List[str], List[str]]: """Sync Prompt and Agent files from Humanloop to local filesystem. @@ -316,7 +316,7 @@ def _pull_directory( return successful_files, failed_files - def pull(self, path: str | None = None, environment: str | None = None) -> Tuple[List[str], List[str]]: + def pull(self, path: Optional[str] = None, environment: Optional[str] = None) -> Tuple[List[str], List[str]]: """Pull files from Humanloop to local filesystem. If the path ends with .prompt or .agent, pulls that specific file. 
@@ -343,7 +343,9 @@ def pull(self, path: str | None = None, environment: str | None = None) -> Tuple ) try: - if normalized_path is None or path is None: # path being None means normalized_path is None, but we check both for improved type safety + if ( + normalized_path is None or path is None + ): # path being None means normalized_path is None, but we check both for improved type safety # Pull all files from the root logger.debug("Pulling all files from root") successful_files, failed_files = self._pull_directory( diff --git a/tests/custom/integration/conftest.py b/tests/custom/integration/conftest.py index 25dc441f..039b0f1c 100644 --- a/tests/custom/integration/conftest.py +++ b/tests/custom/integration/conftest.py @@ -1,10 +1,10 @@ -import io import os import time +import typing import uuid -from contextlib import contextmanager, redirect_stdout +from collections.abc import Generator from dataclasses import dataclass -from typing import ContextManager, Generator, List, TextIO, Union +from typing import Union import dotenv import pytest @@ -21,17 +21,6 @@ class ResourceIdentifiers: file_path: str -@pytest.fixture() -def capture_stdout() -> ContextManager[TextIO]: - @contextmanager - def _context_manager(): - f = io.StringIO() - with redirect_stdout(f): - yield f - - return _context_manager # type: ignore [return-value] - - @pytest.fixture(scope="session") def openai_key() -> str: dotenv.load_dotenv() @@ -44,26 +33,26 @@ def openai_key() -> str: def sdk_test_dir(get_humanloop_client: GetHumanloopClientFn) -> Generator[str, None, None]: humanloop_client = get_humanloop_client() + def _get_subclient(file_type: str): + try: + return { + "agent": humanloop_client.agents, + "prompt": humanloop_client.prompts, + "dataset": humanloop_client.datasets, + "evaluator": humanloop_client.evaluators, + "flow": humanloop_client.flows, + "tool": humanloop_client.tools, + }[file_type] + except KeyError: + raise NotImplementedError(f"Unknown file type: {file_type}") + def cleanup_directory(directory_id: str): directory_response = humanloop_client.directories.get(id=directory_id) for subdirectory in directory_response.subdirectories: cleanup_directory(subdirectory.id) for file in directory_response.files: - match file.type: - case "agent": - humanloop_client.agents.delete(id=file.id) - case "prompt": - humanloop_client.prompts.delete(id=file.id) - case "dataset": - humanloop_client.datasets.delete(id=file.id) - case "evaluator": - humanloop_client.evaluators.delete(id=file.id) - case "flow": - humanloop_client.flows.delete(id=file.id) - case "tool": - humanloop_client.tools.delete(id=file.id) - case _: - raise ValueError(f"Unknown file type: {file.type}") + subclient = _get_subclient(typing.cast(str, file.type)) + subclient.delete(id=file.id) humanloop_client.directories.delete(id=directory_response.id) path = f"SDK_INTEGRATION_TEST_{uuid.uuid4()}" @@ -211,7 +200,7 @@ def syncable_files_fixture( sdk_test_dir: str, ) -> Generator[list[SyncableFile], None, None]: """Creates a predefined structure of files in Humanloop for testing sync.""" - files: List[SyncableFile] = [ + files: list[SyncableFile] = [ SyncableFile( path="prompts/gpt-4", type="prompt", diff --git a/tests/custom/integration/test_sync.py b/tests/custom/integration/test_sync.py index 80e332a4..8b33f7a4 100644 --- a/tests/custom/integration/test_sync.py +++ b/tests/custom/integration/test_sync.py @@ -1,3 +1,4 @@ +import typing from pathlib import Path from typing import List, Union @@ -80,15 +81,17 @@ def test_overload_with_local_files( # WHEN 
calling with an invalid path # THEN it should raise HumanloopRuntimeError with pytest.raises(HumanloopRuntimeError): - sub_client: Union[PromptsClient, AgentsClient] - match test_file.type: - case "prompt": - sub_client = humanloop_client.prompts - case "agent": - sub_client = humanloop_client.agents - case _: - raise ValueError(f"Invalid file type: {test_file.type}") - sub_client.call(path="invalid/path") + try: + sub_client: Union[PromptsClient, AgentsClient] = typing.cast( + Union[PromptsClient, AgentsClient], + { + "prompt": humanloop_client.prompts, + "agent": humanloop_client.agents, + }[test_file.type], + ) + sub_client.call(path="invalid/path") + except KeyError: + raise NotImplementedError(f"Unknown file type: {test_file.type}") def test_overload_log_with_local_files( From 79e1d0db9f9d54dbf4f2ff05095324d281cfe7e1 Mon Sep 17 00:00:00 2001 From: Ale Pouroullis Date: Wed, 14 May 2025 16:53:03 +0100 Subject: [PATCH 6/6] chore: clean up import order in cli --- src/humanloop/cli/__main__.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/humanloop/cli/__main__.py b/src/humanloop/cli/__main__.py index ad582bbc..3ab53cfb 100644 --- a/src/humanloop/cli/__main__.py +++ b/src/humanloop/cli/__main__.py @@ -1,13 +1,15 @@ -import click import logging -from typing import Optional, Callable -from functools import wraps -from dotenv import load_dotenv import os import sys +import time +from functools import wraps +from typing import Callable, Optional + +import click +from dotenv import load_dotenv + from humanloop import Humanloop from humanloop.sync.sync_client import SyncClient -import time # Set up logging logger = logging.getLogger(__name__)
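
Note: the sketch below is not part of the patch series; it is a minimal illustration of how the pull() method added in this series (src/humanloop/client.py delegating to SyncClient.pull, returning a (successful_files, failed_files) pair) might be called. The HUMANLOOP_API_KEY environment variable, the "prompts/gpt-4.prompt" path, and the "staging" environment name are assumptions for illustration only, not values defined by the patches.

    import os

    from humanloop.client import Humanloop

    # Assumes HUMANLOOP_API_KEY is set in the environment.
    client = Humanloop(api_key=os.environ["HUMANLOOP_API_KEY"])

    # Pull all Prompt and Agent files from Humanloop to the local filesystem.
    successful, failed = client.pull()

    # Or pull a single serialized file (paths ending in .prompt or .agent address
    # one file), optionally resolving the version deployed to an environment.
    successful, failed = client.pull(path="prompts/gpt-4.prompt", environment="staging")

    if failed:
        raise RuntimeError(f"Failed to pull {len(failed)} file(s): {failed}")
    print(f"Pulled {len(successful)} file(s)")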