847 changes: 847 additions & 0 deletions docs/T6_technical_plan.md

Large diffs are not rendered by default.

935 changes: 935 additions & 0 deletions examples/notebooks/t6_m0_analysis.ipynb

Large diffs are not rendered by default.

810 changes: 810 additions & 0 deletions examples/notebooks/t6_m1_vector_scores.ipynb

Large diffs are not rendered by default.

95 changes: 76 additions & 19 deletions opto/trainer/algorithms/basic_algorithms.py
@@ -6,7 +6,8 @@
from opto.trainer.loader import DataLoader
from opto.trainer.utils import batch_run, async_run
from opto.optimizers.utils import print_color
from opto.trainer.evaluators import evaluate
from opto.trainer.evaluators import evaluate, evaluate_vector, aggregate_vector_scores
from opto.trainer.objectives import ObjectiveConfig, select_best


def standard_optimization_step(agent, x, guide, info, min_score=0):
@@ -533,6 +534,7 @@ def train(self,
validate_dataset = None, # dataset of (x, info) pairs to evaluate the agent for candidate selection
validate_guide = None, # to provide scores for the validation set
num_proposals = 4, # number of proposals to get from the optimizer
objective_config = None, # optional ObjectiveConfig for multi-objective selection
num_epochs = 1, # number of training epochs
batch_size = 1, # batch size for updating the agent
test_dataset = None, # dataset of (x, info) pairs to evaluate the agent
@@ -549,6 +551,8 @@
self.validate_guide = validate_guide or guide
self.min_score = min_score
self.current_score = None
self.objective_config = objective_config
self.current_score_dict = None # stores vector score when using multi-objective

return super().train(guide, train_dataset, num_epochs=num_epochs, batch_size=batch_size,
test_dataset=test_dataset, test_frequency=test_frequency, log_frequency=log_frequency,
@@ -571,6 +575,21 @@ def validate():
description="Validating proposals")
return np.mean(scores) if all([s is not None for s in scores]) else -np.inf

def validate_vector():
""" Validate and return aggregated vector score dict. """
score_dicts = evaluate_vector(self.agent,
self.validate_guide,
self.validate_dataset['inputs'],
self.validate_dataset['infos'],
min_score=self.min_score,
num_threads=num_threads,
description="Validating proposals (vector)")
return aggregate_vector_scores(score_dicts)

# Determine whether to use vector scoring for selection
use_vector = (self.objective_config is not None
and self.objective_config.mode != "scalar")

# TODO perhaps we can ask for multiple updates in one query or use different temperatures in different queries
# Generate different proposals
step_kwargs = dict(bypassing=True, verbose='output' if verbose else False) # we don't print the inner full message
@@ -582,25 +601,57 @@ def validate():
kwargs_list=[step_kwargs] * self.num_proposals,
max_workers=num_threads,
description=f"Generating {self.num_proposals} proposals") # async step

# Validate the proposals
candidates = []
backup_dict = {p: copy.deepcopy(p.data) for p in self.agent.parameters()} # backup the current value
for update_dict in update_dicts:
if len(update_dict) == 0:
continue
self.optimizer.update(update_dict) # set the agent with update_dict
score = validate() # check the score on the validation set
candidates.append((score, update_dict))
self.optimizer.update(backup_dict) # restore the backup

# Include the current parameter as a candidate
if self.current_score is None:
self.current_score = validate()
candidates.append((self.current_score, backup_dict))

# Find the candidate with the best score
best_score, best_update = max(candidates, key=lambda x: x[0])
self.current_score = best_score

if use_vector:
# Vector path: collect (score_dict, update_dict) for multi-objective selection
vector_candidates = []
for update_dict in update_dicts:
if len(update_dict) == 0:
continue
self.optimizer.update(update_dict)
score_dict = validate_vector()
scalar_score = float(np.mean(list(score_dict.values())))
candidates.append((scalar_score, update_dict))
vector_candidates.append((score_dict, update_dict))
self.optimizer.update(backup_dict)

# Include current parameters as a candidate
if self.current_score_dict is None:
self.current_score_dict = validate_vector()
if self.current_score is None:
self.current_score = float(np.mean(list(self.current_score_dict.values())))
candidates.append((self.current_score, backup_dict))
vector_candidates.append((self.current_score_dict, backup_dict))

# Select best via multi-objective config
best_idx = select_best(vector_candidates, self.objective_config)
best_score_dict = vector_candidates[best_idx][0]
best_update = vector_candidates[best_idx][1]
best_score = float(np.mean(list(best_score_dict.values())))
self.current_score = best_score
self.current_score_dict = best_score_dict
else:
# Scalar path: unchanged from original behavior
for update_dict in update_dicts:
if len(update_dict) == 0:
continue
self.optimizer.update(update_dict) # set the agent with update_dict
score = validate() # check the score on the validation set
candidates.append((score, update_dict))
self.optimizer.update(backup_dict) # restore the backup

# Include the current parameter as a candidate
if self.current_score is None:
self.current_score = validate()
candidates.append((self.current_score, backup_dict))

# Find the candidate with the best score
best_score, best_update = max(candidates, key=lambda x: x[0])
self.current_score = best_score

if verbose:
print_color(f"Best score: {best_score} out of scores {[c[0] for c in candidates]}", 'green')
@@ -609,5 +660,11 @@ def validate():
# Make the best update
self.optimizer.update(best_update)

# Logging
self.logger.log('Validation score', best_score, self.n_iters, color='green')
# Logging — always log scalar for backward compatibility
self.logger.log('Validation score', best_score, self.n_iters, color='green')

# Log individual vector metrics if available
if use_vector and isinstance(best_score_dict, dict):
for metric_name, metric_value in best_score_dict.items():
self.logger.log(f'Validation score/{metric_name}', metric_value,
self.n_iters, color='green')
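
A note on the selection step above: the actual ObjectiveConfig and select_best live in opto/trainer/objectives.py, which is not shown in this diff. Only their usage is visible here: select_best takes the list of (score_dict, update_dict) pairs plus the config and returns the index of the winning candidate, and the scalar path is kept whenever objective_config is None or its mode is "scalar". The sketch below is a standalone illustration of one plausible selection rule (a weighted mean over the per-metric scores); select_best_weighted and its weights argument are hypothetical stand-ins, not the PR's API.

import numpy as np

def select_best_weighted(vector_candidates, weights=None):
    """Return the index of the candidate whose (optionally weighted) mean score is highest."""
    best_idx, best_value = 0, float("-inf")
    for idx, (score_dict, _update) in enumerate(vector_candidates):
        if weights is None:
            # Unweighted mean, matching the scalar fallback computed in the vector path above.
            value = float(np.mean(list(score_dict.values())))
        else:
            # Weighted sum over metric keys; missing keys contribute zero.
            value = sum(weights.get(k, 0.0) * v for k, v in score_dict.items())
        if value > best_value:
            best_idx, best_value = idx, value
    return best_idx

candidates = [
    ({"accuracy": 0.9, "fluency": 0.6}, "update_A"),
    ({"accuracy": 0.7, "fluency": 0.9}, "update_B"),
]
print(select_best_weighted(candidates))                                      # 1: plain mean favors B (0.80 vs 0.75)
print(select_best_weighted(candidates, {"accuracy": 0.8, "fluency": 0.2}))   # 0: accuracy-heavy weights favor A

A caller opts in by passing an ObjectiveConfig through the new objective_config argument of train(); with the default of None the behavior is unchanged from the original scalar path.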
74 changes: 72 additions & 2 deletions opto/trainer/evaluators.py
@@ -39,6 +39,76 @@ def _evaluate(agent, guide, i):
scores = np.array(scores)
if num_samples > 1:
# scores will be of length N * num_samples
# Reshape scores into an array of shape (N, num_samples)
scores = scores.reshape(N, num_samples)
return scores


def evaluate_vector(agent, guide, inputs, infos, min_score=None,
num_threads=None, description=None):
"""Evaluate the agent and return per-example score dicts.

Like evaluate(), but calls guide.get_score_dict() instead of
guide.metric(), returning a list of Dict[str, float].

Args:
agent: The agent to evaluate
guide: The guide (must have get_score_dict method)
inputs: List of inputs to evaluate on
infos: List of additional information for each input
min_score: Fallback used when evaluation raises an ExecutionError.
May be a dict or a float (wrapped as {"score": value});
None defaults to {"score": -inf}.
num_threads: Maximum threads for parallel evaluation
description: Progress bar description

Returns:
List[Dict[str, float]] of length len(inputs)
"""
assert len(inputs) == len(infos), "Inputs and infos must have the same length"
N = len(inputs)
eval_description = description or f"Evaluating {N} examples (vector)"

if min_score is None:
_fallback = {"score": float("-inf")}
elif isinstance(min_score, dict):
_fallback = min_score
else:
_fallback = {"score": float(min_score)}

@batch_run(max_workers=num_threads, description=eval_description)
def _evaluate_vector(agent, guide, i):
try:
output = agent(inputs[i]).data
score_dict = guide.get_score_dict(inputs[i], output, infos[i])
except ExecutionError:
score_dict = copy.copy(_fallback)
return score_dict

indices = list(range(N))
return _evaluate_vector(agent, guide, indices)


def aggregate_vector_scores(score_dicts):
"""Compute the per-metric mean across a list of score dicts.

Args:
score_dicts: List[Dict[str, float]]

Returns:
Dict[str, float] with the mean value for each metric key.
Empty dict if input is empty.
"""
if not score_dicts:
return {}

all_keys = set()
for sd in score_dicts:
all_keys.update(sd.keys())

result = {}
for key in sorted(all_keys):
values = [sd[key] for sd in score_dicts
if key in sd and sd[key] is not None]
if values:
result[key] = float(np.mean(values))
return result
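
A quick usage sketch of aggregate_vector_scores as defined above: the per-example dicts produced by evaluate_vector are reduced to one mean per metric key, each metric is averaged only over the examples that report it, and None values are skipped. The example values below are made up.

from opto.trainer.evaluators import aggregate_vector_scores

score_dicts = [
    {"accuracy": 1.0, "fluency": 0.8},
    {"accuracy": 0.0, "fluency": 0.6, "latency_s": 0.05},
    {"accuracy": 1.0, "fluency": None},   # None is skipped when averaging "fluency"
]

print(aggregate_vector_scores(score_dicts))
# roughly {'accuracy': 0.667, 'fluency': 0.7, 'latency_s': 0.05}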
16 changes: 16 additions & 0 deletions opto/trainer/guide.py
@@ -47,6 +47,22 @@ def metric(self, query: str, response: str, reference: Optional[str] = None, **k
""" Exact match metric """
return self.get_feedback(query, response, reference)[0]

def get_score_dict(self, query: str, response: str, reference: Optional[str] = None, **kwargs) -> Dict[str, float]:
"""Return the evaluation score as a dictionary.

Default implementation wraps the scalar from get_feedback() as
{"score": float_value}. Subclasses returning multi-metric scores
should override this method to return e.g.
{"accuracy": 0.9, "fluency": 0.8, "latency_s": 0.05}.

If get_feedback() returns a dict as its first element, that dict
is returned directly (with values cast to float).
"""
score = self.get_feedback(query, response, reference, **kwargs)[0]
if isinstance(score, dict):
return {k: float(v) for k, v in score.items()}
return {"score": float(score)}

def copy(self):
""" Create a copy of the guide instance.

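The default get_score_dict() above means a guide opts into vector scoring simply by returning a dict as the first element of get_feedback(). The class below is a standalone sketch of that contract with two made-up metrics; it is not a subclass of the real base class in opto/trainer/guide.py (whose name is outside this hunk), it only mirrors the signatures shown in the diff.

from typing import Dict, Optional, Tuple

class RubricGuideSketch:
    """Illustrative guide returning a multi-metric score dict (hypothetical)."""

    def get_feedback(self, query: str, response: str,
                     reference: Optional[str] = None, **kwargs) -> Tuple[Dict[str, float], str]:
        # Two toy metrics: exact match against the reference, and a crude fluency proxy.
        accuracy = float(response.strip() == (reference or "").strip())
        fluency = 1.0 if response.rstrip().endswith((".", "!", "?")) else 0.5
        return {"accuracy": accuracy, "fluency": fluency}, f"accuracy={accuracy}, fluency={fluency}"

    def get_score_dict(self, query: str, response: str,
                       reference: Optional[str] = None, **kwargs) -> Dict[str, float]:
        # Same default behavior as in the diff: pass dicts through, wrap scalars.
        score = self.get_feedback(query, response, reference, **kwargs)[0]
        if isinstance(score, dict):
            return {k: float(v) for k, v in score.items()}
        return {"score": float(score)}

print(RubricGuideSketch().get_score_dict("capital of France?", "Paris.", "Paris."))
# {'accuracy': 1.0, 'fluency': 1.0}

With evaluate_vector(), each validation example then yields such a dict, and aggregate_vector_scores() averages the dicts across the validation set for candidate selection.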