From c92641ac4c6097a931b2669d596febe16bd8ae84 Mon Sep 17 00:00:00 2001
From: Harry Xie
Date: Thu, 7 Nov 2024 11:35:38 +0000
Subject: [PATCH 1/2] use run_stats[1] to account for new run stats sorting
 (with latest first); formatting

---
 src/humanloop/eval_utils.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/src/humanloop/eval_utils.py b/src/humanloop/eval_utils.py
index b13d6c3b..8ff2cf8a 100644
--- a/src/humanloop/eval_utils.py
+++ b/src/humanloop/eval_utils.py
@@ -233,16 +233,16 @@ def _run_eval(
         raise NotImplementedError(f"Unsupported File type: {type_}")
 
     # Upsert the Dataset
-    action = dataset.get("action", "set")  # set is the server default - None not allowed.
+    action = dataset.get(
+        "action", "set"
+    )  # set is the server default - None not allowed.
     if "datapoints" not in dataset:
         dataset["datapoints"] = []
         # Use `upsert` to get existing dataset ID if no datapoints provided, given we can't `get` on path.
         action = "add"
     hl_dataset = client.datasets.upsert(**dataset, action=action)
     hl_dataset = client.datasets.get(
-        id=hl_dataset.id,
-        version_id=hl_dataset.version_id,
-        include_datapoints=True
+        id=hl_dataset.id, version_id=hl_dataset.version_id, include_datapoints=True
     )
 
     # Upsert the local Evaluators; other Evaluators are just referenced by `path` or `id`
@@ -615,7 +615,8 @@ def check_evaluation_improvement(
         return True, 0, 0
 
     previous_evaluator_stats_by_path = get_evaluator_stats_by_path(
-        stat=stats.run_stats[-2], evaluation=evaluation
+        stat=stats.run_stats[1],  # Latest Run is at index 0; previous Run is at index 1
+        evaluation=evaluation,
     )
     if (
         evaluator_path in latest_evaluator_stats_by_path
@@ -625,6 +626,8 @@
         previous_evaluator_stat = previous_evaluator_stats_by_path[evaluator_path]
         latest_score = get_score_from_evaluator_stat(stat=latest_evaluator_stat)
         previous_score = get_score_from_evaluator_stat(stat=previous_evaluator_stat)
+        if latest_score is None or previous_score is None:
+            raise ValueError(f"Could not find score for Evaluator {evaluator_path}.")
         diff = round(latest_score - previous_score, 2)
         if diff >= 0:
             logger.info(

From 5021e53a15c2fc6278d0c082e34e994a46e87262 Mon Sep 17 00:00:00 2001
From: Harry Xie
Date: Thu, 7 Nov 2024 11:47:36 +0000
Subject: [PATCH 2/2] use run-level status

---
 src/humanloop/eval_utils.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/humanloop/eval_utils.py b/src/humanloop/eval_utils.py
index 8ff2cf8a..1eb09e2f 100644
--- a/src/humanloop/eval_utils.py
+++ b/src/humanloop/eval_utils.py
@@ -422,7 +422,11 @@ def process_datapoint(datapoint: Datapoint):
     while not complete:
         stats = client.evaluations.get_stats(id=evaluation.id)
         logger.info(f"\r{stats.progress}")
-        complete = stats.status == "completed"
+        run_stats = next(
+            (run_stats for run_stats in stats.run_stats if run_stats.run_id == run_id),
+            None,
+        )
+        complete = run_stats is not None and run_stats.status == "completed"
        if not complete:
             time.sleep(5)
 
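Note on PATCH 1/2: the server now returns run_stats sorted newest-first, so the
previous Run moved from run_stats[-2] to run_stats[1]. Below is a minimal
standalone sketch of that comparison logic, including the None-score guard the
patch adds. The RunStats dataclass and its score field are illustrative
stand-ins, not the Humanloop SDK's stats types.

# Sketch (not SDK code): run_stats is assumed sorted newest-first, so the
# latest Run sits at index 0 and the previous Run at index 1.
from dataclasses import dataclass
from typing import List, Optional


@dataclass
class RunStats:
    run_id: str
    score: Optional[float]


def score_diff(run_stats: List[RunStats]) -> float:
    """Latest score minus previous score, assuming newest-first ordering."""
    if len(run_stats) < 2:
        raise ValueError("Need at least two Runs to compare.")
    latest, previous = run_stats[0], run_stats[1]
    # Mirrors the guard added in the patch: fail with a clear error rather
    # than a TypeError from arithmetic on None.
    if latest.score is None or previous.score is None:
        raise ValueError("Could not find score for a Run.")
    return round(latest.score - previous.score, 2)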
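Note on PATCH 2/2: the Evaluation-level status covers every Run in the
Evaluation, so the loop now waits on the status of the specific Run this
invocation created. A standalone sketch of the same polling pattern follows;
the client argument is a hypothetical stand-in exposing the
evaluations.get_stats shape used in the diff, with run_stats entries carrying
run_id and status.

# Sketch (not SDK code): poll until the given Run reports "completed",
# ignoring the completion state of any other Runs in the Evaluation.
import time


def wait_for_run_completion(client, evaluation_id: str, run_id: str) -> None:
    complete = False
    while not complete:
        stats = client.evaluations.get_stats(id=evaluation_id)
        # Find the stats entry for our Run; None if the server has not
        # reported it yet.
        run_stats = next(
            (rs for rs in stats.run_stats if rs.run_id == run_id),
            None,
        )
        complete = run_stats is not None and run_stats.status == "completed"
        if not complete:
            time.sleep(5)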