19 changes: 13 additions & 6 deletions src/humanloop/eval_utils.py
@@ -233,16 +233,16 @@ def _run_eval(
         raise NotImplementedError(f"Unsupported File type: {type_}")
 
     # Upsert the Dataset
-    action = dataset.get("action", "set")  # set is the server default - None not allowed.
+    action = dataset.get(
+        "action", "set"
+    )  # set is the server default - None not allowed.
     if "datapoints" not in dataset:
         dataset["datapoints"] = []
         # Use `upsert` to get existing dataset ID if no datapoints provided, given we can't `get` on path.
         action = "add"
     hl_dataset = client.datasets.upsert(**dataset, action=action)
     hl_dataset = client.datasets.get(
-        id=hl_dataset.id,
-        version_id=hl_dataset.version_id,
-        include_datapoints=True
+        id=hl_dataset.id, version_id=hl_dataset.version_id, include_datapoints=True
     )
 
     # Upsert the local Evaluators; other Evaluators are just referenced by `path` or `id`
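For readers skimming the hunk: the Dataset flow here is upsert-then-fetch. `upsert` resolves the Dataset's `id` and `version_id` (switching to `action="add"` when no datapoints are supplied, since a Dataset can't be fetched by path alone), and the follow-up `get` with `include_datapoints=True` pulls the stored datapoints. A minimal sketch of that pattern, assuming a configured `Humanloop` client and a hypothetical dataset path:

from humanloop import Humanloop

client = Humanloop(api_key="...")  # assumes an API key is available

# Hypothetical Dataset spec, shaped like the one _run_eval receives.
dataset = {"path": "evals/my-dataset", "datapoints": []}

action = dataset.get("action", "set")  # "set" is the server default
if not dataset["datapoints"]:
    # No datapoints supplied: "add" lets upsert resolve the existing Dataset by path.
    action = "add"

hl_dataset = client.datasets.upsert(**dataset, action=action)
hl_dataset = client.datasets.get(
    id=hl_dataset.id,
    version_id=hl_dataset.version_id,
    include_datapoints=True,
)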
@@ -422,7 +422,11 @@ def process_datapoint(datapoint: Datapoint):
     while not complete:
         stats = client.evaluations.get_stats(id=evaluation.id)
         logger.info(f"\r{stats.progress}")
-        complete = stats.status == "completed"
+        run_stats = next(
+            (run_stats for run_stats in stats.run_stats if run_stats.run_id == run_id),
+            None,
+        )
+        complete = run_stats is not None and run_stats.status == "completed"
         if not complete:
             time.sleep(5)
 
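The polling change scopes completion to the Run created by this call rather than the Evaluation's overall status, so the loop no longer exits early because some other Run finished. A rough standalone sketch of the same wait loop (the helper name and parameters are hypothetical; it only relies on `evaluations.get_stats` returning `run_stats` entries with `run_id` and `status`, as used above):

import time

def wait_for_run(client, evaluation_id: str, run_id: str, poll_interval: float = 5.0) -> None:
    """Block until the given Run's stats report a "completed" status."""
    while True:
        stats = client.evaluations.get_stats(id=evaluation_id)
        run_stats = next(
            (rs for rs in stats.run_stats if rs.run_id == run_id),
            None,
        )
        if run_stats is not None and run_stats.status == "completed":
            return
        time.sleep(poll_interval)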
@@ -615,7 +619,8 @@ def check_evaluation_improvement(
         return True, 0, 0
 
     previous_evaluator_stats_by_path = get_evaluator_stats_by_path(
-        stat=stats.run_stats[-2], evaluation=evaluation
+        stat=stats.run_stats[1],  # Latest Run is at index 0; previous Run is at index 1
+        evaluation=evaluation,
     )
     if (
         evaluator_path in latest_evaluator_stats_by_path
@@ -625,6 +630,8 @@
         previous_evaluator_stat = previous_evaluator_stats_by_path[evaluator_path]
         latest_score = get_score_from_evaluator_stat(stat=latest_evaluator_stat)
         previous_score = get_score_from_evaluator_stat(stat=previous_evaluator_stat)
+        if latest_score is None or previous_score is None:
+            raise ValueError(f"Could not find score for Evaluator {evaluator_path}.")
         diff = round(latest_score - previous_score, 2)
         if diff >= 0:
             logger.info(
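Together with the index change above, the improvement check compares the latest Run (run_stats[0]) against the previous one (run_stats[1]), and the new guard makes a missing score raise a clear ValueError instead of a TypeError from the subtraction. A small sketch of that comparison in isolation (the helper is hypothetical, not part of the PR):

from typing import Optional

def score_delta(
    latest_score: Optional[float],
    previous_score: Optional[float],
    evaluator_path: str,
) -> float:
    # Fail loudly if either Run is missing a score for this Evaluator,
    # rather than letting the subtraction below raise a TypeError.
    if latest_score is None or previous_score is None:
        raise ValueError(f"Could not find score for Evaluator {evaluator_path}.")
    return round(latest_score - previous_score, 2)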