847 changes: 847 additions & 0 deletions docs/T6_technical_plan.md

Large diffs are not rendered by default.

935 changes: 935 additions & 0 deletions examples/notebooks/t6_m0_analysis.ipynb

Large diffs are not rendered by default.

810 changes: 810 additions & 0 deletions examples/notebooks/t6_m1_vector_scores.ipynb

Large diffs are not rendered by default.

95 changes: 76 additions & 19 deletions opto/trainer/algorithms/basic_algorithms.py
@@ -6,7 +6,8 @@
from opto.trainer.loader import DataLoader
from opto.trainer.utils import batch_run, async_run
from opto.optimizers.utils import print_color
from opto.trainer.evaluators import evaluate
from opto.trainer.evaluators import evaluate, evaluate_vector, aggregate_vector_scores
from opto.trainer.objectives import ObjectiveConfig, select_best


def standard_optimization_step(agent, x, guide, info, min_score=0):
@@ -533,6 +534,7 @@ def train(self,
validate_dataset = None, # dataset of (x, info) pairs to evaluate the agent for candidate selection
validate_guide = None, # to provide scores for the validation set
num_proposals = 4, # number of proposals to get from the optimizer
objective_config = None, # optional ObjectiveConfig for multi-objective selection
num_epochs = 1, # number of training epochs
batch_size = 1, # batch size for updating the agent
test_dataset = None, # dataset of (x, info) pairs to evaluate the agent
@@ -549,6 +551,8 @@
self.validate_guide = validate_guide or guide
self.min_score = min_score
self.current_score = None
self.objective_config = objective_config
self.current_score_dict = None # stores vector score when using multi-objective

return super().train(guide, train_dataset, num_epochs=num_epochs, batch_size=batch_size,
test_dataset=test_dataset, test_frequency=test_frequency, log_frequency=log_frequency,
@@ -571,6 +575,21 @@ def validate():
description="Validating proposals")
return np.mean(scores) if all([s is not None for s in scores]) else -np.inf

def validate_vector():
""" Validate and return aggregated vector score dict. """
score_dicts = evaluate_vector(self.agent,
self.validate_guide,
self.validate_dataset['inputs'],
self.validate_dataset['infos'],
min_score=self.min_score,
num_threads=num_threads,
description="Validating proposals (vector)")
return aggregate_vector_scores(score_dicts)

# Determine whether to use vector scoring for selection
use_vector = (self.objective_config is not None
and self.objective_config.mode != "scalar")

# TODO perhaps we can ask for multiple updates in one query or use different temperatures in different queries
# Generate different proposals
step_kwargs = dict(bypassing=True, verbose='output' if verbose else False) # we don't print the inner full message
@@ -582,25 +601,57 @@ def validate():
kwargs_list=[step_kwargs] * self.num_proposals,
max_workers=num_threads,
description=f"Generating {self.num_proposals} proposals") # async step

# Validate the proposals
candidates = []
backup_dict = {p: copy.deepcopy(p.data) for p in self.agent.parameters()} # backup the current value
for update_dict in update_dicts:
if len(update_dict) == 0:
continue
self.optimizer.update(update_dict) # set the agent with update_dict
score = validate() # check the score on the validation set
candidates.append((score, update_dict))
self.optimizer.update(backup_dict) # restore the backup

# Include the current parameter as a candidate
if self.current_score is None:
self.current_score = validate()
candidates.append((self.current_score, backup_dict))

# Find the candidate with the best score
best_score, best_update = max(candidates, key=lambda x: x[0])
self.current_score = best_score

if use_vector:
# Vector path: collect (score_dict, update_dict) for multi-objective selection
vector_candidates = []
for update_dict in update_dicts:
if len(update_dict) == 0:
continue
self.optimizer.update(update_dict)
score_dict = validate_vector()
scalar_score = float(np.mean(list(score_dict.values())))
candidates.append((scalar_score, update_dict))
vector_candidates.append((score_dict, update_dict))
self.optimizer.update(backup_dict)

# Include current parameters as a candidate
if self.current_score_dict is None:
self.current_score_dict = validate_vector()
if self.current_score is None:
self.current_score = float(np.mean(list(self.current_score_dict.values())))
candidates.append((self.current_score, backup_dict))
vector_candidates.append((self.current_score_dict, backup_dict))

# Select best via multi-objective config
best_idx = select_best(vector_candidates, self.objective_config)
best_score_dict = vector_candidates[best_idx][0]
best_update = vector_candidates[best_idx][1]
best_score = float(np.mean(list(best_score_dict.values())))
self.current_score = best_score
self.current_score_dict = best_score_dict
else:
# Scalar path: unchanged from original behavior
for update_dict in update_dicts:
if len(update_dict) == 0:
continue
self.optimizer.update(update_dict) # set the agent with update_dict
score = validate() # check the score on the validation set
candidates.append((score, update_dict))
self.optimizer.update(backup_dict) # restore the backup

# Include the current parameter as a candidate
if self.current_score is None:
self.current_score = validate()
candidates.append((self.current_score, backup_dict))

# Find the candidate with the best score
best_score, best_update = max(candidates, key=lambda x: x[0])
self.current_score = best_score

if verbose:
print_color(f"Best score: {best_score} out of scores {[c[0] for c in candidates]}", 'green')
@@ -609,5 +660,11 @@ def validate():
# Make the best update
self.optimizer.update(best_update)

# Logging
self.logger.log('Validation score', best_score, self.n_iters, color='green')
# Logging — always log scalar for backward compatibility
self.logger.log('Validation score', best_score, self.n_iters, color='green')

# Log individual vector metrics if available
if use_vector and isinstance(best_score_dict, dict):
for metric_name, metric_value in best_score_dict.items():
self.logger.log(f'Validation score/{metric_name}', metric_value,
self.n_iters, color='green')
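
A note on the selection step above: the actual ObjectiveConfig and select_best live in opto/trainer/objectives.py, which is not shown in this diff. Only their usage is visible here: select_best takes the list of (score_dict, update_dict) pairs plus the config and returns the index of the winning candidate, and the scalar path is kept whenever objective_config is None or its mode is "scalar". The sketch below is a standalone illustration of one plausible selection rule (a weighted mean over the per-metric scores); select_best_weighted and its weights argument are hypothetical stand-ins, not the PR's API.

import numpy as np

def select_best_weighted(vector_candidates, weights=None):
    """Return the index of the candidate whose (optionally weighted) mean score is highest."""
    best_idx, best_value = 0, float("-inf")
    for idx, (score_dict, _update) in enumerate(vector_candidates):
        if weights is None:
            # Unweighted mean, matching the scalar fallback computed in the vector path above.
            value = float(np.mean(list(score_dict.values())))
        else:
            # Weighted sum over metric keys; missing keys contribute zero.
            value = sum(weights.get(k, 0.0) * v for k, v in score_dict.items())
        if value > best_value:
            best_idx, best_value = idx, value
    return best_idx

candidates = [
    ({"accuracy": 0.9, "fluency": 0.6}, "update_A"),
    ({"accuracy": 0.7, "fluency": 0.9}, "update_B"),
]
print(select_best_weighted(candidates))                                      # 1: plain mean favors B (0.80 vs 0.75)
print(select_best_weighted(candidates, {"accuracy": 0.8, "fluency": 0.2}))   # 0: accuracy-heavy weights favor A

A caller opts in by passing an ObjectiveConfig through the new objective_config argument of train(); with the default of None the behavior is unchanged from the original scalar path.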
74 changes: 72 additions & 2 deletions opto/trainer/evaluators.py
@@ -39,6 +39,76 @@ def _evaluate(agent, guide, i):
scores = np.array(scores)
if num_samples > 1:
# scores will be of length N * num_samples
# Reshape scores into an array of shape (N, num_samples)
scores = scores.reshape(N, num_samples)
return scores


def evaluate_vector(agent, guide, inputs, infos, min_score=None,
num_threads=None, description=None):
"""Evaluate the agent and return per-example score dicts.

Like evaluate(), but calls guide.get_score_dict() instead of
guide.metric(), returning a list of Dict[str, float].

Args:
agent: The agent to evaluate
guide: The guide (must have get_score_dict method)
inputs: List of inputs to evaluate on
infos: List of additional information for each input
min_score: Fallback used when evaluation raises an ExecutionError.
May be a dict or a float (wrapped as {"score": value});
None defaults to {"score": -inf}.
num_threads: Maximum threads for parallel evaluation
description: Progress bar description

Returns:
List[Dict[str, float]] of length len(inputs)
"""
assert len(inputs) == len(infos), "Inputs and infos must have the same length"
N = len(inputs)
eval_description = description or f"Evaluating {N} examples (vector)"

if min_score is None:
_fallback = {"score": float("-inf")}
elif isinstance(min_score, dict):
_fallback = min_score
else:
_fallback = {"score": float(min_score)}

@batch_run(max_workers=num_threads, description=eval_description)
def _evaluate_vector(agent, guide, i):
try:
output = agent(inputs[i]).data
score_dict = guide.get_score_dict(inputs[i], output, infos[i])
except ExecutionError:
score_dict = copy.copy(_fallback)
return score_dict

indices = list(range(N))
return _evaluate_vector(agent, guide, indices)


def aggregate_vector_scores(score_dicts):
"""Compute the per-metric mean across a list of score dicts.

Args:
score_dicts: List[Dict[str, float]]

Returns:
Dict[str, float] with the mean value for each metric key.
Empty dict if input is empty.
"""
if not score_dicts:
return {}

all_keys = set()
for sd in score_dicts:
all_keys.update(sd.keys())

result = {}
for key in sorted(all_keys):
values = [sd[key] for sd in score_dicts
if key in sd and sd[key] is not None]
if values:
result[key] = float(np.mean(values))
return result
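
A quick usage sketch of aggregate_vector_scores as defined above: the per-example dicts produced by evaluate_vector are reduced to one mean per metric key, each metric is averaged only over the examples that report it, and None values are skipped. The example values below are made up.

from opto.trainer.evaluators import aggregate_vector_scores

score_dicts = [
    {"accuracy": 1.0, "fluency": 0.8},
    {"accuracy": 0.0, "fluency": 0.6, "latency_s": 0.05},
    {"accuracy": 1.0, "fluency": None},   # None is skipped when averaging "fluency"
]

print(aggregate_vector_scores(score_dicts))
# roughly {'accuracy': 0.667, 'fluency': 0.7, 'latency_s': 0.05}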
16 changes: 16 additions & 0 deletions opto/trainer/guide.py
@@ -47,6 +47,22 @@ def metric(self, query: str, response: str, reference: Optional[str] = None, **k
""" Exact match metric """
return self.get_feedback(query, response, reference)[0]

def get_score_dict(self, query: str, response: str, reference: Optional[str] = None, **kwargs) -> Dict[str, float]:
"""Return the evaluation score as a dictionary.

Default implementation wraps the scalar from get_feedback() as
{"score": float_value}. Subclasses returning multi-metric scores
should override this method to return e.g.
{"accuracy": 0.9, "fluency": 0.8, "latency_s": 0.05}.

If get_feedback() returns a dict as its first element, that dict
is returned directly (with values cast to float).
"""
score = self.get_feedback(query, response, reference, **kwargs)[0]
if isinstance(score, dict):
return {k: float(v) for k, v in score.items()}
return {"score": float(score)}

def copy(self):
""" Create a copy of the guide instance.

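The default get_score_dict() above means a guide opts into vector scoring simply by returning a dict as the first element of get_feedback(). The class below is a standalone sketch of that contract with two made-up metrics; it is not a subclass of the real base class in opto/trainer/guide.py (whose name is outside this hunk), it only mirrors the signatures shown in the diff.

from typing import Dict, Optional, Tuple

class RubricGuideSketch:
    """Illustrative guide returning a multi-metric score dict (hypothetical)."""

    def get_feedback(self, query: str, response: str,
                     reference: Optional[str] = None, **kwargs) -> Tuple[Dict[str, float], str]:
        # Two toy metrics: exact match against the reference, and a crude fluency proxy.
        accuracy = float(response.strip() == (reference or "").strip())
        fluency = 1.0 if response.rstrip().endswith((".", "!", "?")) else 0.5
        return {"accuracy": accuracy, "fluency": fluency}, f"accuracy={accuracy}, fluency={fluency}"

    def get_score_dict(self, query: str, response: str,
                       reference: Optional[str] = None, **kwargs) -> Dict[str, float]:
        # Same default behavior as in the diff: pass dicts through, wrap scalars.
        score = self.get_feedback(query, response, reference, **kwargs)[0]
        if isinstance(score, dict):
            return {k: float(v) for k, v in score.items()}
        return {"score": float(score)}

print(RubricGuideSketch().get_score_dict("capital of France?", "Paris.", "Paris."))
# {'accuracy': 1.0, 'fluency': 1.0}

With evaluate_vector(), each validation example then yields such a dict, and aggregate_vector_scores() averages the dicts across the validation set for candidate selection.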