From ddc693c8c65d219e32398acc8adc42bc2f42bf4a Mon Sep 17 00:00:00 2001 From: Omkar M Date: Tue, 3 Feb 2026 04:49:31 +0000 Subject: [PATCH 1/2] Add Foundry SDK version of Sarcasm demo - New demo using azure-ai-projects SDK instead of raw OpenAI SDK - Uses DefaultAzureCredential for authentication (no API keys) - Uses responses.create() API for inference - Same evals and fine-tuning APIs as original demo - Includes README comparing both approaches Files: - sarcasm_foundry.ipynb: Main notebook - requirements.txt: azure-ai-projects, azure-identity, etc. - .env.template: Configuration template - .gitignore: Excludes .env, .venv, etc. --- Demos/DistillingSarcasm_Foundry/.env.template | 14 + Demos/DistillingSarcasm_Foundry/.gitignore | 5 + Demos/DistillingSarcasm_Foundry/README.md | 152 +++ .../requirements.txt | 19 + .../sarcasm_foundry.ipynb | 1056 +++++++++++++++++ 5 files changed, 1246 insertions(+) create mode 100644 Demos/DistillingSarcasm_Foundry/.env.template create mode 100644 Demos/DistillingSarcasm_Foundry/.gitignore create mode 100644 Demos/DistillingSarcasm_Foundry/README.md create mode 100644 Demos/DistillingSarcasm_Foundry/requirements.txt create mode 100644 Demos/DistillingSarcasm_Foundry/sarcasm_foundry.ipynb diff --git a/Demos/DistillingSarcasm_Foundry/.env.template b/Demos/DistillingSarcasm_Foundry/.env.template new file mode 100644 index 0000000..c143065 --- /dev/null +++ b/Demos/DistillingSarcasm_Foundry/.env.template @@ -0,0 +1,14 @@ +# Microsoft Foundry Project Configuration +MICROSOFT_FOUNDRY_PROJECT_ENDPOINT=https://<your-resource>.services.ai.azure.com/api/projects/<your-project> + +# Azure OpenAI Configuration (for evaluation model) +AZURE_OPENAI_DEPLOYMENT=gpt-4.1 + +# Azure Resource Configuration (for model deployment) +AZURE_SUBSCRIPTION_ID= +AZURE_RESOURCE_GROUP= +AZURE_AOAI_ACCOUNT= + +# Fine-tuning Configuration +BASE_MODEL=gpt-4.1-mini +TEACHER_MODEL=gpt-4.1 diff --git a/Demos/DistillingSarcasm_Foundry/.gitignore b/Demos/DistillingSarcasm_Foundry/.gitignore new file
mode 100644 index 0000000..b9a371e --- /dev/null +++ b/Demos/DistillingSarcasm_Foundry/.gitignore @@ -0,0 +1,5 @@ +.env +.venv/ +__pycache__/ +*.pyc +.ipynb_checkpoints/ diff --git a/Demos/DistillingSarcasm_Foundry/README.md b/Demos/DistillingSarcasm_Foundry/README.md new file mode 100644 index 0000000..cf0d904 --- /dev/null +++ b/Demos/DistillingSarcasm_Foundry/README.md @@ -0,0 +1,152 @@ +# Distilling Sarcasm - Foundry SDK Version + +This is an alternative implementation of the Sarcasm distillation demo using the **Azure AI Foundry SDK** instead of raw API keys. + +## Overview + +This demo teaches language models to generate sarcastic responses through distillation: +1. A **teacher model** (gpt-4.1) generates sarcastic training data +2. A **student model** (gpt-4.1-mini) is fine-tuned on this data +3. Evaluators measure sarcasm quality before and after training + +## SDK Comparison + +| Aspect | Original (API Key) | This Version (Foundry SDK) | +|--------|-------------------|---------------------------| +| **Package** | `openai` | `azure-ai-projects` + `openai` | +| **Auth** | API Key | `DefaultAzureCredential` | +| **Inference** | `client.chat.completions.create()` | `openai_client.responses.create()` | +| **Evaluations** | `client.evals.*` | `openai_client.evals.*` (same!) | +| **Fine-tuning** | `client.fine_tuning.jobs.*` | `openai_client.fine_tuning.jobs.*` (same!) | + +**Key Insight**: The APIs are nearly identical! The main difference is how you get the client: + +```python +# Original (API Key) +client = OpenAI(base_url=..., api_key=...) + +# Foundry SDK +project_client = AIProjectClient(endpoint=..., credential=DefaultAzureCredential()) +openai_client = project_client.get_openai_client() # Same API from here! +``` + +## Prerequisites + +1. **Azure Subscription** with access to Azure AI Foundry +2. **Python 3.10+** +3. **Azure CLI** logged in (`az login`) + +## Setup + +1. 
Create and activate a virtual environment: + ```bash + python -m venv .venv + source .venv/bin/activate # Linux/Mac + # or: .venv\Scripts\activate # Windows + ``` + +2. Install dependencies: + ```bash + pip install -r requirements.txt + ``` + +3. Copy the environment template and fill in your values: + ```bash + cp .env.template .env + ``` + +4. Configure `.env`: + ``` + MICROSOFT_FOUNDRY_PROJECT_ENDPOINT=https://<your-resource>.services.ai.azure.com/api/projects/<your-project> + AZURE_INFERENCE_ENDPOINT=https://<your-resource>.<region>.models.ai.azure.com + AZURE_OPENAI_DEPLOYMENT=gpt-4.1 + AZURE_SUBSCRIPTION_ID= + AZURE_RESOURCE_GROUP= + AZURE_AOAI_ACCOUNT= + BASE_MODEL=gpt-4.1-mini + TEACHER_MODEL=gpt-4.1 + ``` + +## Running the Demo + +Open `sarcasm_foundry.ipynb` in Jupyter and execute cells in order: + +```bash +jupyter notebook sarcasm_foundry.ipynb +``` + +## Key Differences from Original + +### Authentication +```python +# Original (API Key) +client = OpenAI( + base_url=f"https://{resource}.openai.azure.com/openai/v1/", + api_key=os.environ.get("FOUNDRY_API_KEY"), +) + +# Foundry SDK (Azure Credential) +credential = DefaultAzureCredential() +project_client = AIProjectClient(endpoint=endpoint, credential=credential) +openai_client = project_client.get_openai_client() +``` + +### Inference +```python +# Original (OpenAI SDK) +response = client.chat.completions.create( + model="gpt-4.1", + messages=[{"role": "user", "content": "Hello"}] +) + +# Foundry SDK (Responses API via the project's OpenAI client) +response = openai_client.responses.create( + model="gpt-4.1", + input="Hello", +) +print(response.output_text) +``` + +### Evaluations (Same API!) +```python +# Both demos use identical evals API +eval = openai_client.evals.create(name="sarcasm-grader", ...) +run = openai_client.evals.runs.create(eval_id=eval.id, ...) 
+``` + +### Trade-offs + +**Foundry SDK Advantages:** +- ✅ No API keys to manage (uses Azure managed identity) +- ✅ Better security with DefaultAzureCredential +- ✅ Consistent with other Azure AI services +- ✅ Unified project management + +**Original (API Key) Advantages:** +- ✅ Simpler setup (just one API key) +- ✅ Works outside Azure environments +- ✅ Familiar OpenAI SDK patterns + +## Troubleshooting + +### Authentication Issues +- Ensure you're logged in: `az login` +- Check your role assignments include "Azure AI User" on the Foundry resource + +### Model Not Found +- Verify the model deployment names in your `.env` match your Azure deployments + +### Rate Limiting +- Reduce `sample_size` in evaluation cells if hitting rate limits + +## Files + +- `sarcasm_foundry.ipynb` - Main notebook +- `requirements.txt` - Python dependencies +- `.env.template` - Environment variable template +- `README.md` - This file + +## Related + +- [Original Sarcasm Demo](../DistillingSarcasm/) - Uses OpenAI SDK with API keys +- [CNN DailyMail Demo](../SFT_CNN_DailyMail/) - SFT example using Foundry SDK diff --git a/Demos/DistillingSarcasm_Foundry/requirements.txt b/Demos/DistillingSarcasm_Foundry/requirements.txt new file mode 100644 index 0000000..677bc62 --- /dev/null +++ b/Demos/DistillingSarcasm_Foundry/requirements.txt @@ -0,0 +1,19 @@ +# Azure AI Foundry SDK +azure-ai-projects>=2.0.0b1 + +# Azure Authentication +azure-identity + +# Azure Cognitive Services for model deployment +azure-mgmt-cognitiveservices + +# OpenAI SDK (obtained via Foundry client) +openai + +# Data manipulation and visualization +pandas +matplotlib +numpy + +# Core utilities +python-dotenv diff --git a/Demos/DistillingSarcasm_Foundry/sarcasm_foundry.ipynb b/Demos/DistillingSarcasm_Foundry/sarcasm_foundry.ipynb new file mode 100644 index 0000000..027a3a7 --- /dev/null +++ b/Demos/DistillingSarcasm_Foundry/sarcasm_foundry.ipynb @@ -0,0 +1,1056 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": 
{}, + "source": [ + "# Distilling Sarcasm (Foundry SDK Version)\n", + "\n", + "This notebook demonstrates fine-tuning language models to generate sarcastic responses using the **Azure AI Foundry SDK**.\n", + "\n", + "## Comparison with Original Demo\n", + "\n", + "| Aspect | Original | Foundry SDK |\n", + "|--------|----------|-------------|\n", + "| Auth | API Key | DefaultAzureCredential |\n", + "| Inference | `client.chat.completions` | `openai_client.responses` |\n", + "| Evaluations | `client.evals.*` | `openai_client.evals.*` (same!) |\n", + "| Fine-tuning | `client.fine_tuning.jobs.*` | `openai_client.fine_tuning.jobs.*` (same!) |\n", + "\n", + "**Key Insight**: The APIs are nearly identical! The main difference is authentication:\n", + "- Original: `OpenAI(base_url=..., api_key=...)`\n", + "- Foundry: `AIProjectClient(endpoint, credential).get_openai_client()`\n", + "\n", + "**Note**: Foundry project endpoints use `responses.create()` API for inference." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. 
Setup" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "%pip install -r requirements.txt -q" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Libraries imported successfully\n" + ] + } + ], + "source": [ + "import os\n", + "import json\n", + "import uuid\n", + "import time\n", + "from random import shuffle\n", + "\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "\n", + "from dotenv import load_dotenv\n", + "from azure.identity import DefaultAzureCredential\n", + "from azure.ai.projects import AIProjectClient\n", + "\n", + "print(\"Libraries imported successfully\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. 
Connect to Microsoft Foundry" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Run identifier: 4aa88349\n", + "Project: https://omi-ignite-demo-resource.services.ai.azure.com/api/projects/omi-ignite-demo\n" + ] + } + ], + "source": [ + "load_dotenv(override=True)\n", + "\n", + "# Required for Azure OpenAI client\n", + "os.environ.setdefault(\"OPENAI_API_VERSION\", \"2025-03-01-preview\")\n", + "\n", + "project_endpoint = os.environ.get(\"MICROSOFT_FOUNDRY_PROJECT_ENDPOINT\")\n", + "model_deployment = os.environ.get(\"AZURE_OPENAI_DEPLOYMENT\", \"gpt-4.1\")\n", + "base_model = os.environ.get(\"BASE_MODEL\", \"gpt-4.1-mini\")\n", + "teacher_model = os.environ.get(\"TEACHER_MODEL\", \"gpt-4.1\")\n", + "\n", + "UNIQUE_KEY = str(uuid.uuid4()).split(\"-\")[0]\n", + "print(f\"Run identifier: {UNIQUE_KEY}\")\n", + "print(f\"Project: {project_endpoint}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✓ Connected to Microsoft Foundry\n" + ] + } + ], + "source": [ + "# Create clients - this is the KEY difference from original demo\n", + "credential = DefaultAzureCredential()\n", + "project_client = AIProjectClient(endpoint=project_endpoint, credential=credential)\n", + "openai_client = project_client.get_openai_client()\n", + "\n", + "print(\"✓ Connected to Microsoft Foundry\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. 
Load Baseline Data" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loaded 20 baseline examples\n", + "\n", + "Sample:\n", + "Q: Who discovered Antarctica?\n", + "A: Some chaps named Fabian Gottlieb von Bellingshausen and Mikhail Lazarev, as if they don't teach that in every school!\n" + ] + } + ], + "source": [ + "baseline_data = []\n", + "with open(\"../DistillingSarcasm/baseline.jsonl\", \"r\") as f:\n", + " for line in f:\n", + " baseline_data.append(json.loads(line))\n", + "\n", + "print(f\"Loaded {len(baseline_data)} baseline examples\")\n", + "print(f\"\\nSample:\")\n", + "print(f\"Q: {baseline_data[0]['question']}\")\n", + "print(f\"A: {baseline_data[0]['answer']}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. Define Sarcasm Grader" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Grader prompt defined\n" + ] + } + ], + "source": [ + "GRADER_PROMPT = \"\"\"\n", + "You are a connoisseur of finely crafted sarcasm. 
Your job is to evaluate a\n", + "question and answer pair and score it based on how sarcastic the answer is.\n", + "\n", + "# Guidelines\n", + " * Output ONLY a score.\n", + " * A score of 0.0 means the answer is not sarcastic at all.\n", + " * A score of 0.5 means the answer is moderately sarcastic.\n", + " * A score of 1.0 means the answer is dripping with sarcasm.\n", + "\n", + "# Scoring Features\n", + " * Exaggeration or overstatement of the obvious\n", + " * Rhetorical questions that mock the original question\n", + " * Use of phrases like \"Oh really?\", \"Shocking!\", \"Who knew?\"\n", + " * Intentional understatement for comedic effect\n", + " * Irony where literal meaning differs from intended meaning\n", + " * The answer must still be factually correct\n", + "\n", + "# Output Format\n", + "Provide ONLY a single decimal number between 0 and 1.\n", + "\"\"\"\n", + "\n", + "USER_PROMPT = \"\"\"\n", + "Q: {{item.question}}\n", + "A: {{item.answer}}\n", + "\"\"\"\n", + "\n", + "print(\"Grader prompt defined\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5. 
Create Evaluation (Same API as Original!)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✓ Eval file: file-185f7d1d42464c4f91996a2442505ccb\n" + ] + } + ], + "source": [ + "# Upload baseline for evaluation\n", + "with open(\"../DistillingSarcasm/baseline.jsonl\", \"rb\") as f:\n", + " grader_eval_file = openai_client.files.create(purpose=\"evals\", file=f)\n", + " grader_eval_file = openai_client.files.wait_for_processing(grader_eval_file.id)\n", + "\n", + "print(f\"✓ Eval file: {grader_eval_file.id}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Evaluation criteria defined\n" + ] + } + ], + "source": [ + "# Define evaluation schema\n", + "data_source_config = {\n", + " \"type\": \"custom\",\n", + " \"item_schema\": {\n", + " \"type\": \"object\",\n", + " \"properties\": {\n", + " \"question\": {\"type\": \"string\"},\n", + " \"answer\": {\"type\": \"string\"}\n", + " },\n", + " \"required\": [\"question\", \"answer\"]\n", + " }\n", + "}\n", + "\n", + "# Define grader as testing criteria\n", + "testing_criteria = [\n", + " {\n", + " \"type\": \"score_model\",\n", + " \"name\": \"sarcasm_score\",\n", + " \"model\": model_deployment,\n", + " \"input\": [\n", + " {\"role\": \"system\", \"content\": GRADER_PROMPT},\n", + " {\"role\": \"user\", \"content\": USER_PROMPT}\n", + " ],\n", + " \"pass_threshold\": 0.5\n", + " }\n", + "]\n", + "\n", + "print(\"Evaluation criteria defined\")" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✓ Evaluation: eval_d341bcecf17245e48c9131906b3b73fc\n" + ] + } + ], + "source": [ + "# Create evaluation\n", + "grader_eval = openai_client.evals.create(\n", + " name=f\"sarcasm-grader-{UNIQUE_KEY}\",\n", 
+ " data_source_config=data_source_config,\n", + " testing_criteria=testing_criteria\n", + ")\n", + "\n", + "print(f\"✓ Evaluation: {grader_eval.id}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✓ Eval run: evalrun_730150a66c8343c8a62edbbf95b75a4d\n" + ] + } + ], + "source": [ + "# Run evaluation on baseline\n", + "grader_run = openai_client.evals.runs.create(\n", + " eval_id=grader_eval.id,\n", + " name=f\"baseline-{UNIQUE_KEY}\",\n", + " data_source={\n", + " \"type\": \"jsonl\",\n", + " \"source\": {\"type\": \"file_id\", \"id\": grader_eval_file.id}\n", + " }\n", + ")\n", + "\n", + "print(f\"✓ Eval run: {grader_run.id}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Waiting for evaluation...\n", + " Status: in_progress\n", + " Status: in_progress\n", + " Status: in_progress\n", + " Status: in_progress\n", + " Status: in_progress\n", + " Status: in_progress\n", + " Status: in_progress\n", + " Status: in_progress\n", + " Status: in_progress\n", + " Status: completed\n", + "\n", + "✓ completed!\n", + " Passed: 19/20\n" + ] + } + ], + "source": [ + "# Wait for completion\n", + "print(\"Waiting for evaluation...\")\n", + "while True:\n", + " run = openai_client.evals.runs.retrieve(run_id=grader_run.id, eval_id=grader_eval.id)\n", + " print(f\" Status: {run.status}\")\n", + " if run.status in [\"completed\", \"failed\"]:\n", + " break\n", + " time.sleep(5)\n", + "\n", + "print(f\"\\n✓ {run.status}!\")\n", + "if run.result_counts:\n", + " print(f\" Passed: {run.result_counts.passed}/{run.result_counts.total}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Baseline Sarcasm Scores:\n", + " Average: 0.59\n", + " Std Dev: 
0.15\n" + ] + } + ], + "source": [ + "# Get scores\n", + "output_items = list(openai_client.evals.runs.output_items.list(run_id=grader_run.id, eval_id=grader_eval.id))\n", + "baseline_scores = [r.score for item in output_items for r in item.results if r.score is not None]\n", + "\n", + "print(f\"Baseline Sarcasm Scores:\")\n", + "print(f\" Average: {np.mean(baseline_scores):.2f}\")\n", + "print(f\" Std Dev: {np.std(baseline_scores):.2f}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 6. Test Inference" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Q: What is the capital of France?\n", + "A: Oh, what a stumper! The capital of France? Let me consult my ancient scrolls... Oh wait, it’s Paris. You know, the city with the Eiffel Tower, baguettes, and a population of people who perfected the art of looking unimpressed. Paris is the answer you’re searching for, unless France has pulled a fast one overnight.\n" + ] + } + ], + "source": [ + "SARCASM_SYSTEM_PROMPT = \"\"\"You are a sarcastically witty assistant. Answer questions with \n", + "biting wit while remaining factually correct. Channel your inner comedian who's \n", + "slightly annoyed at obvious questions.\"\"\"\n", + "\n", + "def generate_response(question: str, model: str = None) -> str:\n", + " \"\"\"Generate a sarcastic response using responses API.\"\"\"\n", + " response = openai_client.responses.create(\n", + " model=model or teacher_model,\n", + " instructions=SARCASM_SYSTEM_PROMPT,\n", + " input=question,\n", + " temperature=0.7,\n", + " max_output_tokens=150\n", + " )\n", + " return response.output_text\n", + "\n", + "# Test\n", + "test_q = \"What is the capital of France?\"\n", + "print(f\"Q: {test_q}\")\n", + "print(f\"A: {generate_response(test_q)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 7. 
Load Q&A Data" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Training: 250, Validation: 250\n" + ] + } + ], + "source": [ + "qa_data = []\n", + "with open(\"../DistillingSarcasm/qa.jsonl\", \"r\") as f:\n", + " for line in f:\n", + " qa_data.append(json.loads(line))\n", + "\n", + "shuffle(qa_data)\n", + "split_idx = len(qa_data) // 2\n", + "training_questions = qa_data[:split_idx]\n", + "validation_questions = qa_data[split_idx:]\n", + "\n", + "print(f\"Training: {len(training_questions)}, Validation: {len(validation_questions)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 8. Generate Training Data" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Generating 50 training examples...\n", + " 10/50...\n", + " 20/50...\n", + " 30/50...\n", + " 40/50...\n", + " 50/50...\n", + "\n", + "Generating 25 validation examples...\n", + "\n", + "✓ Generated 50 training, 25 validation\n" + ] + } + ], + "source": [ + "MAX_TRAINING = 50\n", + "MAX_VALIDATION = 25\n", + "\n", + "print(f\"Generating {MAX_TRAINING} training examples...\")\n", + "training_data = []\n", + "for i, item in enumerate(training_questions[:MAX_TRAINING]):\n", + " response = generate_response(item['question'])\n", + " training_data.append({\n", + " \"messages\": [\n", + " {\"role\": \"system\", \"content\": SARCASM_SYSTEM_PROMPT},\n", + " {\"role\": \"user\", \"content\": item['question']},\n", + " {\"role\": \"assistant\", \"content\": response}\n", + " ]\n", + " })\n", + " if (i + 1) % 10 == 0:\n", + " print(f\" {i + 1}/{MAX_TRAINING}...\")\n", + "\n", + "print(f\"\\nGenerating {MAX_VALIDATION} validation examples...\")\n", + "validation_data = []\n", + "for item in validation_questions[:MAX_VALIDATION]:\n", + " response = 
generate_response(item['question'])\n", + " validation_data.append({\n", + " \"messages\": [\n", + " {\"role\": \"system\", \"content\": SARCASM_SYSTEM_PROMPT},\n", + " {\"role\": \"user\", \"content\": item['question']},\n", + " {\"role\": \"assistant\", \"content\": response}\n", + " ]\n", + " })\n", + "\n", + "print(f\"\\n✓ Generated {len(training_data)} training, {len(validation_data)} validation\")" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✓ training_sarcasm_4aa88349.jsonl\n", + "✓ validation_sarcasm_4aa88349.jsonl\n" + ] + } + ], + "source": [ + "# Save files\n", + "training_file_path = f\"training_sarcasm_{UNIQUE_KEY}.jsonl\"\n", + "validation_file_path = f\"validation_sarcasm_{UNIQUE_KEY}.jsonl\"\n", + "\n", + "with open(training_file_path, \"w\") as f:\n", + " for item in training_data:\n", + " f.write(json.dumps(item) + \"\\n\")\n", + "\n", + "with open(validation_file_path, \"w\") as f:\n", + " for item in validation_data:\n", + " f.write(json.dumps(item) + \"\\n\")\n", + "\n", + "print(f\"✓ {training_file_path}\")\n", + "print(f\"✓ {validation_file_path}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 9. 
Fine-Tune Model" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Training: file-e457cb500c77408db3a563f2dcfa986f\n", + "Validation: file-b7d2cf189fb9422eb9e57a5c454d7531\n", + "✓ Files ready\n" + ] + } + ], + "source": [ + "# Upload files\n", + "with open(training_file_path, \"rb\") as f:\n", + " train_file = openai_client.files.create(file=f, purpose=\"fine-tune\")\n", + "\n", + "with open(validation_file_path, \"rb\") as f:\n", + " val_file = openai_client.files.create(file=f, purpose=\"fine-tune\")\n", + "\n", + "print(f\"Training: {train_file.id}\")\n", + "print(f\"Validation: {val_file.id}\")\n", + "\n", + "openai_client.files.wait_for_processing(train_file.id)\n", + "openai_client.files.wait_for_processing(val_file.id)\n", + "print(\"✓ Files ready\")" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✓ Job: ftjob-cbccca9eb73d4da991b925d045133219\n", + " Status: pending\n" + ] + } + ], + "source": [ + "# Create fine-tuning job\n", + "fine_tune_job = openai_client.fine_tuning.jobs.create(\n", + " model=base_model,\n", + " training_file=train_file.id,\n", + " validation_file=val_file.id,\n", + " method={\n", + " \"type\": \"supervised\",\n", + " \"supervised\": {\n", + " \"hyperparameters\": {\n", + " \"n_epochs\": 3,\n", + " \"batch_size\": 1,\n", + " \"learning_rate_multiplier\": 1.0\n", + " }\n", + " }\n", + " },\n", + " extra_body={\"trainingType\": \"Standard\"},\n", + " suffix=f\"sarcasm-{UNIQUE_KEY}\"\n", + ")\n", + "\n", + "print(f\"✓ Job: {fine_tune_job.id}\")\n", + "print(f\" Status: {fine_tune_job.status}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 10. 
Monitor Training" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Job: ftjob-71e2eea33ecd4d4e990c54ed09ada149\n", + "Status: succeeded\n", + "Model: gpt-4.1-mini-2025-04-14.ft-71e2eea33ecd4d4e990c54ed09ada149-sarcasm-bc6d797e\n" + ] + } + ], + "source": [ + "# Check status\n", + "job_id = fine_tune_job.id\n", + "job = openai_client.fine_tuning.jobs.retrieve(job_id)\n", + "print(f\"Job: {job.id}\")\n", + "print(f\"Status: {job.status}\")\n", + "if job.fine_tuned_model:\n", + " print(f\"Model: {job.fine_tuned_model}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Waiting for fine-tuning...\n", + " [00:00] succeeded\n", + "\n", + "✓ Model: gpt-4.1-mini-2025-04-14.ft-71e2eea33ecd4d4e990c54ed09ada149-sarcasm-bc6d797e\n" + ] + } + ], + "source": [ + "# Wait for completion (can take 10-30 min)\n", + "print(\"Waiting for fine-tuning...\")\n", + "start_time = time.time()\n", + "\n", + "while True:\n", + " job = openai_client.fine_tuning.jobs.retrieve(job_id)\n", + " elapsed = int(time.time() - start_time)\n", + " print(f\" [{elapsed//60:02d}:{elapsed%60:02d}] {job.status}\")\n", + " \n", + " if job.status in [\"succeeded\", \"failed\", \"cancelled\"]:\n", + " break\n", + " time.sleep(30)\n", + "\n", + "if job.status == \"succeeded\":\n", + " fine_tuned_model_id = job.fine_tuned_model\n", + " print(f\"\\n✓ Model: {fine_tuned_model_id}\")\n", + "else:\n", + " print(f\"\\n✗ {job.status}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 11. 
Deploy Model" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Deploying: gpt-4.1-mini-2025-04-14.ft-71e2eea33ecd4d4e990c54ed09ada149-sarcasm-bc6d797e\n", + "✓ Deployed: sarcasm-ft-4aa88349\n" + ] + } + ], + "source": [ + "from azure.mgmt.cognitiveservices import CognitiveServicesManagementClient\n", + "from azure.mgmt.cognitiveservices.models import Deployment, DeploymentProperties, DeploymentModel, Sku\n", + "\n", + "subscription_id = os.environ.get(\"AZURE_SUBSCRIPTION_ID\")\n", + "resource_group = os.environ.get(\"AZURE_RESOURCE_GROUP\")\n", + "account_name = os.environ.get(\"AZURE_AOAI_ACCOUNT\")\n", + "\n", + "deployment_name = f\"sarcasm-ft-{UNIQUE_KEY}\"\n", + "\n", + "with CognitiveServicesManagementClient(credential=credential, subscription_id=subscription_id) as mgmt:\n", + " deployment_model = DeploymentModel(format=\"OpenAI\", name=fine_tuned_model_id, version=\"1\")\n", + " deployment_properties = DeploymentProperties(model=deployment_model)\n", + " deployment_sku = Sku(name=\"GlobalStandard\", capacity=50)\n", + " deployment_config = Deployment(properties=deployment_properties, sku=deployment_sku)\n", + " \n", + " print(f\"Deploying: {fine_tuned_model_id}\")\n", + " deployment = mgmt.deployments.begin_create_or_update(\n", + " resource_group_name=resource_group,\n", + " account_name=account_name,\n", + " deployment_name=deployment_name,\n", + " deployment=deployment_config,\n", + " )\n", + " deployment.result()\n", + "\n", + "print(f\"✓ Deployed: {deployment_name}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 12. 
Compare Models" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Generating comparison data...\n", + "✓ 20 comparisons\n" + ] + } + ], + "source": [ + "# Generate comparison data\n", + "print(\"Generating comparison data...\")\n", + "comparison_data = []\n", + "test_questions = validation_questions[MAX_VALIDATION:MAX_VALIDATION+20]\n", + "\n", + "for item in test_questions:\n", + " q = item['question']\n", + " base_response = generate_response(q, model=base_model)\n", + " ft_response = generate_response(q, model=deployment_name)\n", + " comparison_data.append({\n", + " \"question\": q,\n", + " \"base_answer\": base_response,\n", + " \"ft_answer\": ft_response\n", + " })\n", + "\n", + "print(f\"✓ {len(comparison_data)} comparisons\")" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✓ Evaluation files saved\n" + ] + } + ], + "source": [ + "# Save for evaluation\n", + "base_eval_file = f\"base_eval_{UNIQUE_KEY}.jsonl\"\n", + "ft_eval_file = f\"ft_eval_{UNIQUE_KEY}.jsonl\"\n", + "\n", + "with open(base_eval_file, \"w\") as f:\n", + " for item in comparison_data:\n", + " f.write(json.dumps({\"question\": item[\"question\"], \"answer\": item[\"base_answer\"]}) + \"\\n\")\n", + "\n", + "with open(ft_eval_file, \"w\") as f:\n", + " for item in comparison_data:\n", + " f.write(json.dumps({\"question\": item[\"question\"], \"answer\": item[\"ft_answer\"]}) + \"\\n\")\n", + "\n", + "print(\"✓ Evaluation files saved\")" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Base run: evalrun_a716d715fdc24e0facffc8b9a90ce170\n", + "Fine-tuned run: evalrun_5180d075fd104d3e962db18db418176c\n" + ] + } + ], + "source": [ + "# Upload and run evaluations\n", + 
"with open(base_eval_file, \"rb\") as f:\n", + " base_file = openai_client.files.create(purpose=\"evals\", file=f)\n", + " base_file = openai_client.files.wait_for_processing(base_file.id)\n", + "\n", + "with open(ft_eval_file, \"rb\") as f:\n", + " ft_file = openai_client.files.create(purpose=\"evals\", file=f)\n", + " ft_file = openai_client.files.wait_for_processing(ft_file.id)\n", + "\n", + "base_run = openai_client.evals.runs.create(\n", + " eval_id=grader_eval.id,\n", + " name=f\"base-{UNIQUE_KEY}\",\n", + " data_source={\"type\": \"jsonl\", \"source\": {\"type\": \"file_id\", \"id\": base_file.id}}\n", + ")\n", + "\n", + "ft_run = openai_client.evals.runs.create(\n", + " eval_id=grader_eval.id,\n", + " name=f\"finetuned-{UNIQUE_KEY}\",\n", + " data_source={\"type\": \"jsonl\", \"source\": {\"type\": \"file_id\", \"id\": ft_file.id}}\n", + ")\n", + "\n", + "print(f\"Base run: {base_run.id}\")\n", + "print(f\"Fine-tuned run: {ft_run.id}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Base: completed\n", + "Fine-tuned: completed\n", + "✓ Evaluations complete\n" + ] + } + ], + "source": [ + "# Wait for both\n", + "for run_id, name in [(base_run.id, \"Base\"), (ft_run.id, \"Fine-tuned\")]:\n", + " while True:\n", + " run = openai_client.evals.runs.retrieve(run_id=run_id, eval_id=grader_eval.id)\n", + " if run.status in [\"completed\", \"failed\"]:\n", + " print(f\"{name}: {run.status}\")\n", + " break\n", + " time.sleep(5)\n", + "\n", + "print(\"✓ Evaluations complete\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 13. 
Results" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "==================================================\n", + "RESULTS\n", + "==================================================\n", + "Baseline (gold): 0.59\n", + "Base model: 0.82 (±0.21)\n", + "Fine-tuned: 0.94 (±0.06)\n", + "\n", + "Improvement: +0.12\n" + ] + } + ], + "source": [ + "def get_scores(eval_id, run_id):\n", + " items = list(openai_client.evals.runs.output_items.list(run_id=run_id, eval_id=eval_id))\n", + " return [r.score for item in items for r in item.results if r.score is not None]\n", + "\n", + "base_scores = get_scores(grader_eval.id, base_run.id)\n", + "ft_scores = get_scores(grader_eval.id, ft_run.id)\n", + "\n", + "print(\"=\" * 50)\n", + "print(\"RESULTS\")\n", + "print(\"=\" * 50)\n", + "print(f\"Baseline (gold): {np.mean(baseline_scores):.2f}\")\n", + "print(f\"Base model: {np.mean(base_scores):.2f} (±{np.std(base_scores):.2f})\")\n", + "print(f\"Fine-tuned: {np.mean(ft_scores):.2f} (±{np.std(ft_scores):.2f})\")\n", + "print(f\"\\nImprovement: {np.mean(ft_scores) - np.mean(base_scores):+.2f}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Visualize\n", + "fig, ax = plt.subplots(figsize=(8, 5))\n", + "\n", + "models = ['Baseline', 'Base Model', 'Fine-Tuned']\n", + "avgs = [np.mean(baseline_scores), np.mean(base_scores), np.mean(ft_scores)]\n", + "colors = ['#e74c3c', '#3498db', '#2ecc71']\n", + "\n", + "bars = ax.bar(models, avgs, color=colors)\n", + "ax.set_ylabel('Sarcasm Score')\n", + "ax.set_title('Model Comparison')\n", + "ax.set_ylim(0, 1)\n", + "\n", + "for bar, v in zip(bars, avgs):\n", + " ax.text(bar.get_x() + bar.get_width()/2, v + 0.2, f'{v:.1f}', ha='center', fontweight='bold')\n", + "\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 14. Cleanup" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Deleted: training_sarcasm_4aa88349.jsonl\n", + "Deleted: validation_sarcasm_4aa88349.jsonl\n", + "Deleted: base_eval_4aa88349.jsonl\n", + "Deleted: ft_eval_4aa88349.jsonl\n", + "\n", + "✓ Cleanup complete\n" + ] + } + ], + "source": [ + "# Clean up local files\n", + "for f in [training_file_path, validation_file_path, base_eval_file, ft_eval_file]:\n", + " if os.path.exists(f):\n", + " os.remove(f)\n", + " print(f\"Deleted: {f}\")\n", + "\n", + "print(\"\\n✓ Cleanup complete\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv (3.12.3)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} From 3abcec11a43ee264b5529604e7bdb9c9965e53bc Mon Sep 17 00:00:00 2001 From: Omkar M Date: Tue, 3 Feb 2026 05:44:14 +0000 
Subject: [PATCH 2/2] Replace DistillingSarcasm with Foundry SDK version - Use azure-ai-projects SDK with DefaultAzureCredential (no API keys) - Use responses.create() API for inference - Simplified evals with normalized pass_threshold (0-1 scale) - Remove eval_utils.py (no longer needed) - Remove separate DistillingSarcasm_Foundry folder Changes: - sarcasm.ipynb: Complete rewrite using Foundry SDK - requirements.txt: azure-ai-projects, azure-identity - README.md: Updated documentation - .env.template: New config template - .gitignore: Updated patterns --- .../.env.template | 0 Demos/DistillingSarcasm/.gitignore | 7 +- Demos/DistillingSarcasm/README.md | 222 +- Demos/DistillingSarcasm/requirements.txt | 28 +- Demos/DistillingSarcasm/sarcasm.ipynb | 2263 ++++------------- Demos/DistillingSarcasm/scripts/eval_utils.py | 254 -- Demos/DistillingSarcasm_Foundry/.gitignore | 5 - Demos/DistillingSarcasm_Foundry/README.md | 152 -- .../requirements.txt | 19 - .../sarcasm_foundry.ipynb | 1056 -------- 10 files changed, 651 insertions(+), 3355 deletions(-) rename Demos/{DistillingSarcasm_Foundry => DistillingSarcasm}/.env.template (100%) delete mode 100644 Demos/DistillingSarcasm/scripts/eval_utils.py delete mode 100644 Demos/DistillingSarcasm_Foundry/.gitignore delete mode 100644 Demos/DistillingSarcasm_Foundry/README.md delete mode 100644 Demos/DistillingSarcasm_Foundry/requirements.txt delete mode 100644 Demos/DistillingSarcasm_Foundry/sarcasm_foundry.ipynb diff --git a/Demos/DistillingSarcasm_Foundry/.env.template b/Demos/DistillingSarcasm/.env.template similarity index 100% rename from Demos/DistillingSarcasm_Foundry/.env.template rename to Demos/DistillingSarcasm/.env.template diff --git a/Demos/DistillingSarcasm/.gitignore b/Demos/DistillingSarcasm/.gitignore index cc708a8..dc0f593 100644 --- a/Demos/DistillingSarcasm/.gitignore +++ b/Demos/DistillingSarcasm/.gitignore @@ -1,8 +1,11 @@ +__pycache__ +*.pyc +.ipynb_checkpoints/ +training_*.jsonl +validation_*.jsonl 
sarcasm-baseline-*.jsonl sarcasm-training-*.jsonl sarcasm-validation-*.jsonl sarcasm-posttraining-*.jsonl .env .venv -**/__pycache__ -__pycache__ \ No newline at end of file diff --git a/Demos/DistillingSarcasm/README.md b/Demos/DistillingSarcasm/README.md index a1d8bb6..cf0d904 100644 --- a/Demos/DistillingSarcasm/README.md +++ b/Demos/DistillingSarcasm/README.md @@ -1,126 +1,152 @@ -# Distilling Sarcasm -> or: "how to teach GPT-4.1-nano how to be sarcastic" +# Distilling Sarcasm - Foundry SDK Version -This is an end-to-end example of **distillation**: squeezing the behaviors of a -larger model into a smaller model. +This is an alternative implementation of the Sarcasm distillation demo using the **Azure AI Foundry SDK** instead of raw API keys. -## tl;dr +## Overview -1. Go spin up base model deployments for all models you want to work with. In - this notebook, I use o3, o4-mini, gpt-4.1, gpt-4.1-mini, gpt-4.1-nano, - gpt-4o, and gpt-4o-mini. You might need TPM quota in your subscription. +This demo teaches language models to generate sarcastic responses through distillation: +1. A **teacher model** (gpt-4.1) generates sarcastic training data +2. A **student model** (gpt-4.1-mini) is fine-tuned on this data +3. Evaluators measure sarcasm quality before and after training -2. Create and populate a `.env` file to simplify stuff. In it, put some Azure - specific details: +## SDK Comparison -```properties -AZURE_OPENAI_ENDPOINT=https://.openai.azure.com -AZURE_OPENAI_API_KEY= -AZURE_SUBSCRIPTION_ID= -AZURE_RESOURCE_GROUP= -AZURE_AOAI_ACCOUNT= -``` +| Aspect | Original (API Key) | This Version (Foundry SDK) | +|--------|-------------------|---------------------------| +| **Package** | `openai` | `azure-ai-projects` + `openai` | +| **Auth** | API Key | `DefaultAzureCredential` | +| **Inference** | `client.chat.completions.create()` | `openai_client.responses.create()` | +| **Evaluations** | `client.evals.*` | `openai_client.evals.*` (same!)
| +| **Fine-tuning** | `client.fine_tuning.jobs.*` | `openai_client.fine_tuning.jobs.*` (same!) | -3. Wrangle the Python stuff. (See below.) +**Key Insight**: The APIs are nearly identical! The main difference is how you get the client: -``` -$ python3 -m venv .venv -$ . .venv/bin/activate -(venv) $ pip install -r requirements -``` +```python +# Original (API Key) +client = OpenAI(base_url=..., api_key=...) -Then launch this sucker in Jupyter notebooks. The easiest way is to fire it up -in **Visual Studio Code**: - -``` -$ code sarcasm.ipynb +# Foundry SDK +project_client = AIProjectClient(endpoint=..., credential=DefaultAzureCredential()) +openai_client = project_client.get_openai_client() # Same API from here! ``` -Happy distilling. 🧪 - -## Background -The basis for this demo is the Distillation [demo](https://github.com/azure-ai-foundry/build-2025-demos/blob/main/Azure%20AI%20Model%20Customization/DistillationDemo/demo.ipynb) -featured at Build 2025: - -And the [tutorial](https://learn.microsoft.com/en-us/azure/ai-services/openai/tutorials/fine-tune) -on fine-tuning gpt-4o with the prompt: -_Clippy is a factual chatbot that is also sarcastic._ - -## Grader -We use a crude grader that rewards two things: - -1. The amount of sarcasm from 1 (no sarcasm) to 10 (the most sarcasm) -2. Correct answers to the users questions. - -More importantly, it *heavily* penalizes wrong answers by driving the scores to -zero. In practice, I don't think I saw this happen, but it's a fun example. - -We leave it to **o3** to decide what we mean by sarcasm 😜 - -## Methodology -More will be written here, but at a high-level the approach takes a few steps. +## Prerequisites + +1. **Azure Subscription** with access to Azure AI Foundry +2. **Python 3.10+** +3. **Azure CLI** logged in (`az login`) + +## Setup + +1. Create and activate a virtual environment: + ```bash + python -m venv .venv + source .venv/bin/activate # Linux/Mac + # or: .venv\Scripts\activate # Windows + ``` + +2. 
Install dependencies: + ```bash + pip install -r requirements.txt + ``` + +3. Copy the environment template and fill in your values: + ```bash + cp .env.template .env + ``` + +4. Configure `.env`: + ``` + MICROSOFT_FOUNDRY_PROJECT_ENDPOINT=https://.services.ai.azure.com/api/projects/ + AZURE_INFERENCE_ENDPOINT=https://..models.ai.azure.com + AZURE_OPENAI_DEPLOYMENT=gpt-4.1 + AZURE_SUBSCRIPTION_ID= + AZURE_RESOURCE_GROUP= + AZURE_AOAI_ACCOUNT= + BASE_MODEL=gpt-4.1-mini + TEACHER_MODEL=gpt-4.1 + ``` + +## Running the Demo + +Open `sarcasm.ipynb` in Jupyter and execute cells in order: + +```bash +jupyter notebook sarcasm.ipynb +``` -For now, just read the [notebook](./sarcasm.ipynb) 😉 +## Key Differences from Original -### 0. Assemble Human Curated Data -We need something of a "gold standard." In this case, we use some pre-canned -examples from the Azure OpenAI docs. +### Authentication +```python +# Original (API Key) +client = OpenAI( + base_url=f"https://{resource}.openai.azure.com/openai/v1/", + api_key=os.environ.get("FOUNDRY_API_KEY"), +) -### 1. Build & Test our Grader -We assemble a Grader: a combination of model (o3) and a prompt. The prompt -tells the grader how to score other models' output. +# Foundry SDK (Azure Credential) +credential = DefaultAzureCredential() +project_client = AIProjectClient(endpoint=endpoint, credential=credential) +openai_client = project_client.get_openai_client() +``` -### 2. Benchmark our Base Models -We use the "gold standard" to check if our Grader is doing its job. If we can't -trust the grader, who can we trust? +### Inference +```python +# Original (OpenAI SDK) +response = client.chat.completions.create( + model="gpt-4.1", + messages=[{"role": "user", "content": "Hello"}] +) -### 3. Pick our Teacher and our Student -We then give an assignment to our base models (`o3`, `o4-mini`, `4.1-*`, -`4o-*`) and have the Grader decide on their scores.
+# Foundry SDK (azure-ai-inference) +from azure.ai.inference import ChatCompletionsClient +from azure.ai.inference.models import UserMessage -We use these scores to determine which base model shows the most aptitude -for our use case. That model we pick as our Teacher. +client = ChatCompletionsClient(endpoint=..., credential=credential) +response = client.complete(messages=[UserMessage(content="Hello")]) +``` -We also figure out who our Student should be based on which model performs the -worst...yet might be much cheaper to use than our Teacher. (We're on a fixed -budget!) +### Evaluations (Same API!) +```python +# Both demos use identical evals API +eval = openai_client.evals.create(name="sarcasm-grader", ...) +run = openai_client.evals.runs.create(eval_id=eval.id, ...) +``` -### 4. Distill from the Teacher -We take the Teacher's answers to the questions and consider them what the -Student needs to learn. We turn them into our training data for fine-tuning. +### Trade-offs -### 5. Train our Student -The Student gets to work learning by studying the Teacher's output. +**Foundry SDK Advantages:** +- ✅ No API keys to manage (uses Azure managed identity) +- ✅ Better security with DefaultAzureCredential +- ✅ Consistent with other Azure AI services +- ✅ Unified project management -### 6. Test our Student against its Peer -Now, to keep things fair, we take _new_ data and ask the Student to generate -responses. We also ask its peer, the un-trained version of itself, to do the -same. We then compare the two. +**Original (API Key) Advantages:** +- ✅ Simpler setup (just one API key) +- ✅ Works outside Azure environments +- ✅ Familiar OpenAI SDK patterns -### 7. Celebrate or Cry -If our Student bests the Peer, we celebrate! Our job is done. +## Troubleshooting -If the Student is close enough to the Teacher, we ship it off to Production! 
+### Authentication Issues +- Ensure you're logged in: `az login` +- Check your role assignments include "Azure AI User" on the Foundry resource -## Troubleshooting +### Model Not Found +- Verify the model deployment names in your `.env` match your Azure deployments -### Common Issues +### Rate Limiting +- Reduce `sample_size` in evaluation cells if hitting rate limits -**Authentication Error** -- Run `az login` to refresh your Azure credentials -- Verify your Azure OpenAI endpoint and API key in `.env` -- Check that you have deployment access for all models (o3, o4-mini, gpt-4.1, etc.) +## Files -**Quota Exceeded** -- This demo uses multiple model deployments - ensure you have TPM quota for each -- Request additional quota in Azure Portal → Azure OpenAI → Quotas -- Try running with fewer models if quota is limited +- `sarcasm.ipynb` - Main notebook +- `requirements.txt` - Python dependencies +- `.env.template` - Environment variable template +- `README.md` - This file -**Model Not Available** -- Check regional availability for gpt-4.1-nano and other newer models -- Some models may require preview access - check Azure documentation +## Related -**Training Job Fails** -- Verify data format matches the expected JSONL schema -- Ensure training data doesn't exceed token limits -- Check that grader outputs are valid JSON +- [Original Sarcasm Demo](../DistillingSarcasm/) - Uses OpenAI SDK with API keys +- [CNN DailyMail Demo](../SFT_CNN_DailyMail/) - SFT example using Foundry SDK diff --git a/Demos/DistillingSarcasm/requirements.txt b/Demos/DistillingSarcasm/requirements.txt index 576bea1..677bc62 100644 --- a/Demos/DistillingSarcasm/requirements.txt +++ b/Demos/DistillingSarcasm/requirements.txt @@ -1,17 +1,19 @@ -# Core libraries -python-dotenv==0.21.0 # For loading environment variables from .env files +# Azure AI Foundry SDK +azure-ai-projects>=2.0.0b1 -# Data manipulation and visualization -pandas # For data manipulation and analysis -matplotlib # For
creating visualizations -numpy # For numerical computations - -# Jupyter Notebook -notebook # For running Jupyter notebooks - -# OpenAI SDK -openai # For interacting with the OpenAI API +# Azure Authentication +azure-identity # Azure Cognitive Services for model deployment -azure-identity azure-mgmt-cognitiveservices + +# OpenAI SDK (obtained via Foundry client) +openai + +# Data manipulation and visualization +pandas +matplotlib +numpy + +# Core utilities +python-dotenv diff --git a/Demos/DistillingSarcasm/sarcasm.ipynb b/Demos/DistillingSarcasm/sarcasm.ipynb index 8b52a9e..ba2e25b 100644 --- a/Demos/DistillingSarcasm/sarcasm.ipynb +++ b/Demos/DistillingSarcasm/sarcasm.ipynb @@ -2,2180 +2,974 @@ "cells": [ { "cell_type": "markdown", - "id": "900d18b5", "metadata": {}, "source": [ - "# Distilling Sarcasm\n", - "> or: \"how to teach GPT-4.1-nano and Ministral-3B how to be sarcastic\"\n", + "# Distilling Sarcasm (Foundry SDK Version)\n", "\n", - "This is an end-to-end example of **distillation**: squeezing the behaviors of a\n", - "larger model into a smaller model.\n", + "This notebook demonstrates fine-tuning language models to generate sarcastic responses using the **Azure AI Foundry SDK**.\n", "\n", - "We're going to show how you can take an un-engineered prompt (`SYSTEM_PROMPT`)\n", - "as simple as:\n", + "## Comparison with Original Demo\n", "\n", - "> Clippy is a factual chatbot that is also sarcastic.\n", + "| Aspect | Original | Foundry SDK |\n", + "|--------|----------|-------------|\n", + "| Auth | API Key | DefaultAzureCredential |\n", + "| Inference | `client.chat.completions` | `openai_client.responses` |\n", + "| Evaluations | `client.evals.*` | `openai_client.evals.*` (same!) |\n", + "| Fine-tuning | `client.fine_tuning.jobs.*` | `openai_client.fine_tuning.jobs.*` (same!) |\n", "\n", - "...and basically go from zero to a fine-tuned model \n", - "*without having to generate your own training data!*\n", + "**Key Insight**: The APIs are nearly identical! 
The main difference is authentication:\n", + "- Original: `OpenAI(base_url=..., api_key=...)`\n", + "- Foundry: `AIProjectClient(endpoint, credential).get_openai_client()`\n", "\n", - "## Requirements\n", - "1. Go spin up base model deployments for all models you want to work with. In\n", - " this notebook, I use `o3-mini`, `gpt-4.1`, `gpt-4.1-mini`, `gpt-4.1-nano`, \n", - " `gpt-4o`, `gpt-4o-mini`, `Ministral-3B`, and `DeepSeek-V3.1`. (You may also \n", - " need TPM quota in your subscription.)\n", - "\n", - "2. Create and populate a `.env` file to simplify stuff. In it, put some Azure\n", - " specific details:\n", - "\n", - "```properties\n", - "FOUNDRY_PARENT_RESOURCE=\n", - "FOUNDRY_API_KEY=\n", - "AZURE_SUBSCRIPTION_ID=\n", - "AZURE_RESOURCE_GROUP=\n", - "```\n", - "\n", - "3. Depending on how you run this notebook, you may need to pre-install a\n", - " **Jupyter** environment. The `pip install` below will handle any other\n", - " dependencies for you." + "**Note**: Foundry project endpoints use `responses.create()` API for inference." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. 
Setup" ] }, { "cell_type": "code", "execution_count": 1, - "id": "74193f85", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Requirement already satisfied: python-dotenv==0.21.0 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from -r requirements.txt (line 2)) (0.21.0)\n", - "Requirement already satisfied: pandas in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from -r requirements.txt (line 5)) (2.3.3)\n", - "Requirement already satisfied: matplotlib in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from -r requirements.txt (line 6)) (3.10.7)\n", - "Requirement already satisfied: numpy in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from -r requirements.txt (line 7)) (2.3.5)\n", - "Requirement already satisfied: notebook in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from -r requirements.txt (line 10)) (7.5.0)\n", - "Requirement already satisfied: openai in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from -r requirements.txt (line 13)) (2.9.0)\n", - "Requirement already satisfied: azure-identity in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from -r requirements.txt (line 16)) (1.25.1)\n", - "Requirement already satisfied: azure-mgmt-cognitiveservices in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from -r requirements.txt (line 17)) (14.1.0)\n", - "Requirement already satisfied: python-dateutil>=2.8.2 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from pandas->-r requirements.txt (line 5)) (2.9.0.post0)\n", - "Requirement already satisfied: pytz>=2020.1 in 
c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from pandas->-r requirements.txt (line 5)) (2025.2)\n", - "Requirement already satisfied: tzdata>=2022.7 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from pandas->-r requirements.txt (line 5)) (2025.2)\n", - "Requirement already satisfied: contourpy>=1.0.1 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from matplotlib->-r requirements.txt (line 6)) (1.3.3)\n", - "Requirement already satisfied: cycler>=0.10 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from matplotlib->-r requirements.txt (line 6)) (0.12.1)\n", - "Requirement already satisfied: fonttools>=4.22.0 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from matplotlib->-r requirements.txt (line 6)) (4.61.0)\n", - "Requirement already satisfied: kiwisolver>=1.3.1 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from matplotlib->-r requirements.txt (line 6)) (1.4.9)\n", - "Requirement already satisfied: packaging>=20.0 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from matplotlib->-r requirements.txt (line 6)) (25.0)\n", - "Requirement already satisfied: pillow>=8 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from matplotlib->-r requirements.txt (line 6)) (12.0.0)\n", - "Requirement already satisfied: pyparsing>=3 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from matplotlib->-r requirements.txt (line 6)) (3.2.5)\n", - "Requirement already satisfied: jupyter-server<3,>=2.4.0 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from notebook->-r requirements.txt (line 
10)) (2.17.0)\n", - "Requirement already satisfied: jupyterlab-server<3,>=2.28.0 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from notebook->-r requirements.txt (line 10)) (2.28.0)\n", - "Requirement already satisfied: jupyterlab<4.6,>=4.5.0rc0 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from notebook->-r requirements.txt (line 10)) (4.5.0)\n", - "Requirement already satisfied: notebook-shim<0.3,>=0.2 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from notebook->-r requirements.txt (line 10)) (0.2.4)\n", - "Requirement already satisfied: tornado>=6.2.0 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from notebook->-r requirements.txt (line 10)) (6.5.2)\n", - "Requirement already satisfied: anyio>=3.1.0 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from jupyter-server<3,>=2.4.0->notebook->-r requirements.txt (line 10)) (4.12.0)\n", - "Requirement already satisfied: argon2-cffi>=21.1 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from jupyter-server<3,>=2.4.0->notebook->-r requirements.txt (line 10)) (25.1.0)\n", - "Requirement already satisfied: jinja2>=3.0.3 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from jupyter-server<3,>=2.4.0->notebook->-r requirements.txt (line 10)) (3.1.6)\n", - "Requirement already satisfied: jupyter-client>=7.4.4 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from jupyter-server<3,>=2.4.0->notebook->-r requirements.txt (line 10)) (8.7.0)\n", - "Requirement already satisfied: jupyter-core!=5.0.*,>=4.12 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from 
jupyter-server<3,>=2.4.0->notebook->-r requirements.txt (line 10)) (5.9.1)\n", - "Requirement already satisfied: jupyter-events>=0.11.0 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from jupyter-server<3,>=2.4.0->notebook->-r requirements.txt (line 10)) (0.12.0)\n", - "Requirement already satisfied: jupyter-server-terminals>=0.4.4 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from jupyter-server<3,>=2.4.0->notebook->-r requirements.txt (line 10)) (0.5.3)\n", - "Requirement already satisfied: nbconvert>=6.4.4 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from jupyter-server<3,>=2.4.0->notebook->-r requirements.txt (line 10)) (7.16.6)\n", - "Requirement already satisfied: nbformat>=5.3.0 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from jupyter-server<3,>=2.4.0->notebook->-r requirements.txt (line 10)) (5.10.4)\n", - "Requirement already satisfied: prometheus-client>=0.9 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from jupyter-server<3,>=2.4.0->notebook->-r requirements.txt (line 10)) (0.23.1)\n", - "Requirement already satisfied: pywinpty>=2.0.1 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from jupyter-server<3,>=2.4.0->notebook->-r requirements.txt (line 10)) (3.0.2)\n", - "Requirement already satisfied: pyzmq>=24 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from jupyter-server<3,>=2.4.0->notebook->-r requirements.txt (line 10)) (27.1.0)\n", - "Requirement already satisfied: send2trash>=1.8.2 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from jupyter-server<3,>=2.4.0->notebook->-r requirements.txt (line 10)) (1.8.3)\n", - "Requirement already 
satisfied: terminado>=0.8.3 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from jupyter-server<3,>=2.4.0->notebook->-r requirements.txt (line 10)) (0.18.1)\n", - "Requirement already satisfied: traitlets>=5.6.0 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from jupyter-server<3,>=2.4.0->notebook->-r requirements.txt (line 10)) (5.14.3)\n", - "Requirement already satisfied: websocket-client>=1.7 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from jupyter-server<3,>=2.4.0->notebook->-r requirements.txt (line 10)) (1.9.0)\n", - "Requirement already satisfied: async-lru>=1.0.0 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from jupyterlab<4.6,>=4.5.0rc0->notebook->-r requirements.txt (line 10)) (2.0.5)\n", - "Requirement already satisfied: httpx<1,>=0.25.0 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from jupyterlab<4.6,>=4.5.0rc0->notebook->-r requirements.txt (line 10)) (0.28.1)\n", - "Requirement already satisfied: ipykernel!=6.30.0,>=6.5.0 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from jupyterlab<4.6,>=4.5.0rc0->notebook->-r requirements.txt (line 10)) (7.1.0)\n", - "Requirement already satisfied: jupyter-lsp>=2.0.0 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from jupyterlab<4.6,>=4.5.0rc0->notebook->-r requirements.txt (line 10)) (2.3.0)\n", - "Requirement already satisfied: setuptools>=41.1.0 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from jupyterlab<4.6,>=4.5.0rc0->notebook->-r requirements.txt (line 10)) (80.9.0)\n", - "Requirement already satisfied: certifi in 
c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from httpx<1,>=0.25.0->jupyterlab<4.6,>=4.5.0rc0->notebook->-r requirements.txt (line 10)) (2025.11.12)\n", - "Requirement already satisfied: httpcore==1.* in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from httpx<1,>=0.25.0->jupyterlab<4.6,>=4.5.0rc0->notebook->-r requirements.txt (line 10)) (1.0.9)\n", - "Requirement already satisfied: idna in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from httpx<1,>=0.25.0->jupyterlab<4.6,>=4.5.0rc0->notebook->-r requirements.txt (line 10)) (3.11)\n", - "Requirement already satisfied: h11>=0.16 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from httpcore==1.*->httpx<1,>=0.25.0->jupyterlab<4.6,>=4.5.0rc0->notebook->-r requirements.txt (line 10)) (0.16.0)\n", - "Requirement already satisfied: babel>=2.10 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from jupyterlab-server<3,>=2.28.0->notebook->-r requirements.txt (line 10)) (2.17.0)\n", - "Requirement already satisfied: json5>=0.9.0 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from jupyterlab-server<3,>=2.28.0->notebook->-r requirements.txt (line 10)) (0.12.1)\n", - "Requirement already satisfied: jsonschema>=4.18.0 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from jupyterlab-server<3,>=2.28.0->notebook->-r requirements.txt (line 10)) (4.25.1)\n", - "Requirement already satisfied: requests>=2.31 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from jupyterlab-server<3,>=2.28.0->notebook->-r requirements.txt (line 10)) (2.32.5)\n", - "Requirement already satisfied: distro<2,>=1.7.0 in 
c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from openai->-r requirements.txt (line 13)) (1.9.0)\n", - "Requirement already satisfied: jiter<1,>=0.10.0 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from openai->-r requirements.txt (line 13)) (0.12.0)\n", - "Requirement already satisfied: pydantic<3,>=1.9.0 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from openai->-r requirements.txt (line 13)) (2.12.5)\n", - "Requirement already satisfied: sniffio in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from openai->-r requirements.txt (line 13)) (1.3.1)\n", - "Requirement already satisfied: tqdm>4 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from openai->-r requirements.txt (line 13)) (4.67.1)\n", - "Requirement already satisfied: typing-extensions<5,>=4.11 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from openai->-r requirements.txt (line 13)) (4.15.0)\n", - "Requirement already satisfied: annotated-types>=0.6.0 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from pydantic<3,>=1.9.0->openai->-r requirements.txt (line 13)) (0.7.0)\n", - "Requirement already satisfied: pydantic-core==2.41.5 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from pydantic<3,>=1.9.0->openai->-r requirements.txt (line 13)) (2.41.5)\n", - "Requirement already satisfied: typing-inspection>=0.4.2 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from pydantic<3,>=1.9.0->openai->-r requirements.txt (line 13)) (0.4.2)\n", - "Requirement already satisfied: azure-core>=1.31.0 in 
c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from azure-identity->-r requirements.txt (line 16)) (1.36.0)\n", - "Requirement already satisfied: cryptography>=2.5 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from azure-identity->-r requirements.txt (line 16)) (46.0.3)\n", - "Requirement already satisfied: msal>=1.30.0 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from azure-identity->-r requirements.txt (line 16)) (1.34.0)\n", - "Requirement already satisfied: msal-extensions>=1.2.0 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from azure-identity->-r requirements.txt (line 16)) (1.3.1)\n", - "Requirement already satisfied: msrest>=0.7.1 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from azure-mgmt-cognitiveservices->-r requirements.txt (line 17)) (0.7.1)\n", - "Requirement already satisfied: azure-mgmt-core>=1.6.0 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from azure-mgmt-cognitiveservices->-r requirements.txt (line 17)) (1.6.0)\n", - "Requirement already satisfied: argon2-cffi-bindings in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from argon2-cffi>=21.1->jupyter-server<3,>=2.4.0->notebook->-r requirements.txt (line 10)) (25.1.0)\n", - "Requirement already satisfied: cffi>=2.0.0 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from cryptography>=2.5->azure-identity->-r requirements.txt (line 16)) (2.0.0)\n", - "Requirement already satisfied: pycparser in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from cffi>=2.0.0->cryptography>=2.5->azure-identity->-r requirements.txt (line 16)) (2.23)\n", - "Requirement 
already satisfied: comm>=0.1.1 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from ipykernel!=6.30.0,>=6.5.0->jupyterlab<4.6,>=4.5.0rc0->notebook->-r requirements.txt (line 10)) (0.2.3)\n", - "Requirement already satisfied: debugpy>=1.6.5 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from ipykernel!=6.30.0,>=6.5.0->jupyterlab<4.6,>=4.5.0rc0->notebook->-r requirements.txt (line 10)) (1.8.17)\n", - "Requirement already satisfied: ipython>=7.23.1 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from ipykernel!=6.30.0,>=6.5.0->jupyterlab<4.6,>=4.5.0rc0->notebook->-r requirements.txt (line 10)) (9.8.0)\n", - "Requirement already satisfied: matplotlib-inline>=0.1 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from ipykernel!=6.30.0,>=6.5.0->jupyterlab<4.6,>=4.5.0rc0->notebook->-r requirements.txt (line 10)) (0.2.1)\n", - "Requirement already satisfied: nest-asyncio>=1.4 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from ipykernel!=6.30.0,>=6.5.0->jupyterlab<4.6,>=4.5.0rc0->notebook->-r requirements.txt (line 10)) (1.6.0)\n", - "Requirement already satisfied: psutil>=5.7 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from ipykernel!=6.30.0,>=6.5.0->jupyterlab<4.6,>=4.5.0rc0->notebook->-r requirements.txt (line 10)) (7.1.3)\n", - "Requirement already satisfied: colorama>=0.4.4 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from ipython>=7.23.1->ipykernel!=6.30.0,>=6.5.0->jupyterlab<4.6,>=4.5.0rc0->notebook->-r requirements.txt (line 10)) (0.4.6)\n", - "Requirement already satisfied: decorator>=4.3.2 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from 
ipython>=7.23.1->ipykernel!=6.30.0,>=6.5.0->jupyterlab<4.6,>=4.5.0rc0->notebook->-r requirements.txt (line 10)) (5.2.1)\n", - "Requirement already satisfied: ipython-pygments-lexers>=1.0.0 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from ipython>=7.23.1->ipykernel!=6.30.0,>=6.5.0->jupyterlab<4.6,>=4.5.0rc0->notebook->-r requirements.txt (line 10)) (1.1.1)\n", - "Requirement already satisfied: jedi>=0.18.1 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from ipython>=7.23.1->ipykernel!=6.30.0,>=6.5.0->jupyterlab<4.6,>=4.5.0rc0->notebook->-r requirements.txt (line 10)) (0.19.2)\n", - "Requirement already satisfied: prompt_toolkit<3.1.0,>=3.0.41 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from ipython>=7.23.1->ipykernel!=6.30.0,>=6.5.0->jupyterlab<4.6,>=4.5.0rc0->notebook->-r requirements.txt (line 10)) (3.0.52)\n", - "Requirement already satisfied: pygments>=2.11.0 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from ipython>=7.23.1->ipykernel!=6.30.0,>=6.5.0->jupyterlab<4.6,>=4.5.0rc0->notebook->-r requirements.txt (line 10)) (2.19.2)\n", - "Requirement already satisfied: stack_data>=0.6.0 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from ipython>=7.23.1->ipykernel!=6.30.0,>=6.5.0->jupyterlab<4.6,>=4.5.0rc0->notebook->-r requirements.txt (line 10)) (0.6.3)\n", - "Requirement already satisfied: wcwidth in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from prompt_toolkit<3.1.0,>=3.0.41->ipython>=7.23.1->ipykernel!=6.30.0,>=6.5.0->jupyterlab<4.6,>=4.5.0rc0->notebook->-r requirements.txt (line 10)) (0.2.14)\n", - "Requirement already satisfied: parso<0.9.0,>=0.8.4 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from 
jedi>=0.18.1->ipython>=7.23.1->ipykernel!=6.30.0,>=6.5.0->jupyterlab<4.6,>=4.5.0rc0->notebook->-r requirements.txt (line 10)) (0.8.5)\n", - "Requirement already satisfied: MarkupSafe>=2.0 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from jinja2>=3.0.3->jupyter-server<3,>=2.4.0->notebook->-r requirements.txt (line 10)) (3.0.3)\n", - "Requirement already satisfied: attrs>=22.2.0 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from jsonschema>=4.18.0->jupyterlab-server<3,>=2.28.0->notebook->-r requirements.txt (line 10)) (25.4.0)\n", - "Requirement already satisfied: jsonschema-specifications>=2023.03.6 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from jsonschema>=4.18.0->jupyterlab-server<3,>=2.28.0->notebook->-r requirements.txt (line 10)) (2025.9.1)\n", - "Requirement already satisfied: referencing>=0.28.4 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from jsonschema>=4.18.0->jupyterlab-server<3,>=2.28.0->notebook->-r requirements.txt (line 10)) (0.37.0)\n", - "Requirement already satisfied: rpds-py>=0.7.1 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from jsonschema>=4.18.0->jupyterlab-server<3,>=2.28.0->notebook->-r requirements.txt (line 10)) (0.30.0)\n", - "Requirement already satisfied: platformdirs>=2.5 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from jupyter-core!=5.0.*,>=4.12->jupyter-server<3,>=2.4.0->notebook->-r requirements.txt (line 10)) (4.5.1)\n", - "Requirement already satisfied: python-json-logger>=2.0.4 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->notebook->-r requirements.txt (line 10)) (4.0.0)\n", - "Requirement already satisfied: 
pyyaml>=5.3 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->notebook->-r requirements.txt (line 10)) (6.0.3)\n", - "Requirement already satisfied: rfc3339-validator in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->notebook->-r requirements.txt (line 10)) (0.1.4)\n", - "Requirement already satisfied: rfc3986-validator>=0.1.1 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->notebook->-r requirements.txt (line 10)) (0.1.1)\n", - "Requirement already satisfied: fqdn in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->notebook->-r requirements.txt (line 10)) (1.5.1)\n", - "Requirement already satisfied: isoduration in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->notebook->-r requirements.txt (line 10)) (20.11.0)\n", - "Requirement already satisfied: jsonpointer>1.13 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->notebook->-r requirements.txt (line 10)) (3.0.0)\n", - "Requirement already satisfied: rfc3987-syntax>=1.1.0 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->notebook->-r requirements.txt (line 10)) (1.1.0)\n", - "Requirement already satisfied: uri-template in 
c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->notebook->-r requirements.txt (line 10)) (1.3.0)\n", - "Requirement already satisfied: webcolors>=24.6.0 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->notebook->-r requirements.txt (line 10)) (25.10.0)\n", - "Requirement already satisfied: PyJWT<3,>=1.0.0 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from PyJWT[crypto]<3,>=1.0.0->msal>=1.30.0->azure-identity->-r requirements.txt (line 16)) (2.10.1)\n", - "Requirement already satisfied: charset_normalizer<4,>=2 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from requests>=2.31->jupyterlab-server<3,>=2.28.0->notebook->-r requirements.txt (line 10)) (3.4.4)\n", - "Requirement already satisfied: urllib3<3,>=1.21.1 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from requests>=2.31->jupyterlab-server<3,>=2.28.0->notebook->-r requirements.txt (line 10)) (2.6.1)\n", - "Requirement already satisfied: isodate>=0.6.0 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from msrest>=0.7.1->azure-mgmt-cognitiveservices->-r requirements.txt (line 17)) (0.7.2)\n", - "Requirement already satisfied: requests-oauthlib>=0.5.0 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from msrest>=0.7.1->azure-mgmt-cognitiveservices->-r requirements.txt (line 17)) (2.0.0)\n", - "Requirement already satisfied: beautifulsoup4 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from nbconvert>=6.4.4->jupyter-server<3,>=2.4.0->notebook->-r 
requirements.txt (line 10)) (4.14.3)\n", - "Requirement already satisfied: bleach!=5.0.0 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from bleach[css]!=5.0.0->nbconvert>=6.4.4->jupyter-server<3,>=2.4.0->notebook->-r requirements.txt (line 10)) (6.3.0)\n", - "Requirement already satisfied: defusedxml in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from nbconvert>=6.4.4->jupyter-server<3,>=2.4.0->notebook->-r requirements.txt (line 10)) (0.7.1)\n", - "Requirement already satisfied: jupyterlab-pygments in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from nbconvert>=6.4.4->jupyter-server<3,>=2.4.0->notebook->-r requirements.txt (line 10)) (0.3.0)\n", - "Requirement already satisfied: mistune<4,>=2.0.3 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from nbconvert>=6.4.4->jupyter-server<3,>=2.4.0->notebook->-r requirements.txt (line 10)) (3.1.4)\n", - "Requirement already satisfied: nbclient>=0.5.0 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from nbconvert>=6.4.4->jupyter-server<3,>=2.4.0->notebook->-r requirements.txt (line 10)) (0.10.2)\n", - "Requirement already satisfied: pandocfilters>=1.4.1 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from nbconvert>=6.4.4->jupyter-server<3,>=2.4.0->notebook->-r requirements.txt (line 10)) (1.5.1)\n", - "Requirement already satisfied: webencodings in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from bleach!=5.0.0->bleach[css]!=5.0.0->nbconvert>=6.4.4->jupyter-server<3,>=2.4.0->notebook->-r requirements.txt (line 10)) (0.5.1)\n", - "Requirement already satisfied: tinycss2<1.5,>=1.1.0 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages 
(from bleach[css]!=5.0.0->nbconvert>=6.4.4->jupyter-server<3,>=2.4.0->notebook->-r requirements.txt (line 10)) (1.4.0)\n", - "Requirement already satisfied: fastjsonschema>=2.15 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from nbformat>=5.3.0->jupyter-server<3,>=2.4.0->notebook->-r requirements.txt (line 10)) (2.21.2)\n", - "Requirement already satisfied: six>=1.5 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from python-dateutil>=2.8.2->pandas->-r requirements.txt (line 5)) (1.17.0)\n", - "Requirement already satisfied: oauthlib>=3.0.0 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from requests-oauthlib>=0.5.0->msrest>=0.7.1->azure-mgmt-cognitiveservices->-r requirements.txt (line 17)) (3.3.1)\n", - "Requirement already satisfied: lark>=1.2.2 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from rfc3987-syntax>=1.1.0->jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->notebook->-r requirements.txt (line 10)) (1.3.1)\n", - "Requirement already satisfied: executing>=1.2.0 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from stack_data>=0.6.0->ipython>=7.23.1->ipykernel!=6.30.0,>=6.5.0->jupyterlab<4.6,>=4.5.0rc0->notebook->-r requirements.txt (line 10)) (2.2.1)\n", - "Requirement already satisfied: asttokens>=2.1.0 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from stack_data>=0.6.0->ipython>=7.23.1->ipykernel!=6.30.0,>=6.5.0->jupyterlab<4.6,>=4.5.0rc0->notebook->-r requirements.txt (line 10)) (3.0.1)\n", - "Requirement already satisfied: pure-eval in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from 
stack_data>=0.6.0->ipython>=7.23.1->ipykernel!=6.30.0,>=6.5.0->jupyterlab<4.6,>=4.5.0rc0->notebook->-r requirements.txt (line 10)) (0.2.3)\n", - "Requirement already satisfied: soupsieve>=1.6.1 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from beautifulsoup4->nbconvert>=6.4.4->jupyter-server<3,>=2.4.0->notebook->-r requirements.txt (line 10)) (2.8)\n", - "Requirement already satisfied: arrow>=0.15.0 in c:\\users\\davevoutila\\src\\fine-tuning\\demos\\distillingsarcasm\\.venv\\lib\\site-packages (from isoduration->jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->notebook->-r requirements.txt (line 10)) (1.4.0)\n", "Note: you may need to restart the kernel to use updated packages.\n" ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "[notice] A new release of pip is available: 25.2 -> 25.3\n", - "[notice] To update, run: python.exe -m pip install --upgrade pip\n" - ] } ], "source": [ - "%pip install -r requirements.txt" - ] - }, - { - "cell_type": "markdown", - "id": "ff847b25", - "metadata": {}, - "source": [ - "## Environment Setup\n", - "First things first: we need an **Azure OpenAI** client instance." - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "4912ccdc", - "metadata": {}, - "outputs": [], - "source": [ - "from openai import OpenAI\n", - "from dotenv import load_dotenv\n", - "import os\n", - "\n", - "load_dotenv(override=True)\n", - "\n", - "client = OpenAI(\n", - " base_url=f\"https://{os.environ.get('FOUNDRY_PARENT_RESOURCE')}.openai.azure.com/openai/v1/\",\n", - " api_key=os.environ.get(\"FOUNDRY_API_KEY\"),\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "19bfcc21", - "metadata": {}, - "source": [ - "We also want to keep an entire experiment run easily identifiable. A few bits of\n", - "randomness should be enough, so just generate a UUID and chop off its head 🪓." 
+ "%pip install -r requirements.txt -q" ] }, { "cell_type": "code", - "execution_count": 3, - "id": "53f86c4a", - "metadata": {}, - "outputs": [], - "source": [ - "# We'll create a \"unique enough\" identifier that lets us run this notebook\n", - "# multiple times and easily keep track of things each run creates.\n", - "import uuid\n", - "UNIQUE_ENOUGH_KEY = str(uuid.uuid4()).split(\"-\")[0]" - ] - }, - { - "cell_type": "markdown", - "id": "c3b55aff", - "metadata": {}, - "source": [ - "## 0. Assembling Human Curated Data\n", - "We need a *gold standard* sample, not to train our model on, but to measure\n", - "the efficacy of our grader.\n", - "\n", - "We'll use some pre-canned sarcasm examples from the **Azure OpenAI** tutorial\n", - "on [fine-tuning a GPT model](https://learn.microsoft.com/en-us/azure/ai-services/openai/tutorials/fine-tune?tabs=command-line).\n", - "\n", - "The [baseline.jsonl](./baseline.jsonl) file contains the prompts from that\n", - "tutorial, but decomposed into a simple format of question/answer pairs." - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "dd7ffbc9", + "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "{'question': 'Who discovered Antarctica?', 'answer': \"Some chaps named Fabian Gottlieb von Bellingshausen and Mikhail Lazarev, as if they don't teach that in every school!\"}\n", - "{'question': 'What is the biggest ocean?', 'answer': \"The Pacific Ocean. 
It's not like it's a small pond or anything.\"}\n", - "{'question': 'What is the largest planet?', 'answer': \"It's called Jupiter, you might have heard of it...or not.\"}\n" + "Libraries imported successfully\n" ] } ], "source": [ - "# Peek at our baseline *gold standard* dataset.\n", + "import os\n", "import json\n", + "import uuid\n", + "import time\n", + "from random import shuffle\n", "\n", - "rows = 0\n", - "with open(\"./baseline.jsonl\", \"r\") as f:\n", - " for line in f.readlines():\n", - " print(json.loads(line))\n", - " rows += 1\n", - " if rows >= 3:\n", - " break" - ] - }, - { - "cell_type": "markdown", - "id": "72852df2", - "metadata": {}, - "source": [ - "## 1. Grading the Grader\n", - "The Grader is the lynchpin here, so it *must* be effective. \n", - "\n", - "Specifically, we'll use a **Score Model Grader** (aka `score_model` via the API).\n", - "It's job is to take a *prompt* and use it to derive a numeric *score*. We define\n", - "to decide if the score is \"passing\" or \"failing\", which we'll use as a simple\n", - "signal of quality.\n", - "\n", - "> ℹ️ **Aside: Garbage In, Garbage Out**\n", - "> If you cannot explain to a reasoning model like `o3-mini` how to identify high \n", - "> and low quality responses based on your objective, you have a *shaky foundation*\n", - "> from which to build upon.\n", - ">\n", - "> But keep in mind, `o3-mini` is a *reasoning* model. 
You do *not* need to give it\n", - "> fine-grained instructions, but you should give it a *clear objective*.\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "id": "778440a4", - "metadata": {}, - "source": [ - "### 📃 Preparing our first Evaluation\n", - "We'll start with the definition of the prompt for our grader (`GRADER_PROMPT`).\n", - "\n", - "Things to note about this prompt:\n", - "- We let the grader know how we want it to behave, in this case as a sarcasm\n", - " detector and fact checker.\n", - "- We inform it of what the input data looks like.\n", - "- We give it a rubrik for scoring each input.\n", - "- We give it a clear schema for its output to match what a Score Model Grader\n", - " must output for the Evaluation system to interpret.\n", - "\n", - "And that's it. In this case, we don't need examples, we rely on the reasoning\n", - "model to decide how to interpret sarcasm, etc." - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "eb62b687", - "metadata": {}, - "outputs": [], - "source": [ - "# We'll use the most capable reasoning model as our grader.\n", - "GRADER_MODEL = \"o3-mini\"\n", - "\n", - "# A simple score grader that quantifies sarcasm while accounting for accuracy\n", - "# of the model's response. We want the model to answer the question *correctly*\n", - "# while adjusting their tone. Incorrect answers are heavily penalized.\n", - "GRADER_PROMPT = \"\"\"\n", - "You are an expert in sarcasm detection, while also being a good fact checker.\n", - "\n", - "You'll be given a conversation in the form of a question and answer. 
\n", - "\n", - "## Scoring Criteria\n", - "Judge the answer using two metrics.\n", - "\n", - "### Metric 1: How sarcastic is the response?\n", - "On a scale from 1 to 10 inclusive with 1 being no sarcasm detected and 10 being\n", - "the most sarcastic response ever, assign an integer value within that range based\n", - "on your assessment.\n", - "\n", - "### Metric 2: How accurate is the response?\n", - "Assign a 1 if the response is factually correct. Assign a 0 for this metric if it's\n", - "incorrect or contains innacuracies.\n", - "\n", - "### Final Score\n", - "The final score you must decide should be based on a weighted blend of Metric 1 and\n", - "Metric 2 using the formula: `(Metric 1) * (Metric 2)`\n", - "\n", - "This means that if Metric 2 is zero, the final score must be zero.\n", - "\n", - "## Response Structure\n", - "Your response must be in a JSON format that can be loaded by Python's json.loads()\n", - "function. It must resemble the following:\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", "\n", - "```\n", - "{\n", - " \"steps\": [\n", - " { \n", - " \"description\": \", \n", - " \"result\": \n", - " },\n", - " { \n", - " \"description\": \", \n", - " \"result\": \n", - " }\n", - " ],\n", - " \"result\": \n", - "}\n", + "from dotenv import load_dotenv\n", + "from azure.identity import DefaultAzureCredential\n", + "from azure.ai.projects import AIProjectClient\n", "\n", - "## General Guidance\n", - "The questions should be simple factual questions with clear answers. 
Deep research is\n", - "not required.\n", - "```\n", - "\"\"\"" + "print(\"Libraries imported successfully\")" ] }, { "cell_type": "markdown", - "id": "da8ae533", "metadata": {}, "source": [ - "Now we need to stage our baseline data in the Azure OpenAI service so the grader can\n", - "access and score each input.\n", - "\n", - "Using the SDK, it's a simple file upload, but with a purpose of `evals`.\n", - "\n", - "A file used for `evals` ideally is in JSONL format, but does **not** need to be in a\n", - "chat completions format. We can just use that `{ \"question\": \"?\", \"answer\": \"...\" }`\n", - "format from our baseline file without any further data engineering.\n" + "## 2. Connect to Microsoft Foundry" ] }, { "cell_type": "code", - "execution_count": 6, - "id": "7f6b9289", + "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Created eval file:\n", - "{\n", - " \"id\": \"file-8ce13f288b2d444eaed9fb0ff8ac15da\",\n", - " \"bytes\": 2510,\n", - " \"created_at\": 1765377018,\n", - " \"filename\": \"baseline.jsonl\",\n", - " \"object\": \"file\",\n", - " \"purpose\": \"evals\",\n", - " \"status\": \"processed\"\n", - "}\n" + "Run identifier: 4aa88349\n", + "Project: https://omi-ignite-demo-resource.services.ai.azure.com/api/projects/omi-ignite-demo\n" ] } ], "source": [ - "# We're going to first evaluate our grader using a human-curated dataset.\n", - "# In this case, these are the examples from our fine-tuning tutorial. 
Let's pretend\n", - "# we know what the scores for these _should_ be.\n", - "grader_eval_file = None\n", - "with open(\"./baseline.jsonl\", \"rb\") as f:\n", - " grader_eval_file = client.files.create(purpose=\"evals\", file=f)\n", - " grader_eval_file = client.files.wait_for_processing(grader_eval_file.id)\n", - "\n", - "print(f\"Created eval file:\\n{grader_eval_file.to_json(indent=2)}\")\n" - ] - }, - { - "cell_type": "markdown", - "id": "4f9d3404", - "metadata": {}, - "source": [ - "### 🏗️ Constructing the Grader\n", - "For evaluating the grader, we'll be giving it pre-canned prompts for it to score.\n", - "\n", - "To do this, we have to provide some templating:\n", - "\n", - "1. We give the system a template (`INPUT`) to plug in data from our baseline\n", - " jsonl to construct a ficticious prompt from a model.\n", - "2. We provide a schema to describe the shape of our test data (the baseline file)\n", - " in `SCHEMA`.\n", - "3. Lastly, we define the testing criteria (`TESTING_CRITERIA`) which takes our\n", - " prompt template (`INPUT`), the name of our grader model (in Azure OpenAI, this\n", - " is the _deployment name_ of the model to use), and the scoring details.\n", - "\n", - "This is a lot of data, so it's important to take a moment and wrap your head around\n", - "this stuff. Remember, this is the _simple_ version of a Score Model Grader! 
Simple\n", - "being we're not generating prompts, we're just using a data file to populate a\n", - "template.\n", - "\n", - "So, for example, if we have a row from `baseline.jsonl` that looks like:\n", - "\n", - "```json\n", - "{ \"question\": \"Who spilled coffee on their desk today?\", \"answer\": \"Dave\" }\n", - "```\n", - "\n", - "The _actual_ prompt that will be shown to the _grader_ will be:\n", - "\n", - "```json\n", - "[\n", - " { \"role\": \"system\", \"content\": \"You are an expert in sarcasm detection,...\" },\n", - " { \"role\": \"user\", \"content\": \"\\nQ: Who spilled coffee on their desk today?\\nA: Dave\\n\" }\n", - "]\n", - "```\n", - "\n", - "> ℹ️ Once you know how to reason about this yourself, you can use something like the\n", - "> Azure OpenAI Chat Playground to manually test your grader! Just set the system\n", - "> prompt to your grader prompt and then provide the _user_ content like in the\n", - "> example above." - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "03a6cbf7", - "metadata": {}, - "outputs": [], - "source": [ - "# Now we define our Evaluation for validating our choice in grader prompt and model.\n", - "\n", - "# The entire user prompt is data driven from the file. No generation is done using\n", - "# a model in this case, just simple string substitution using this pattern. 
This\n", - "# means we directly reference the two fields in our baseline.jsonl file.\n", - "USER_PROMPT = \"\"\"\n", - "Q: {{item.question}}\n", - "A: {{item.answer}}\n", - "\"\"\"\n", - "INPUT = [\n", - " {\n", - " \"type\": \"message\",\n", - " \"role\": \"system\",\n", - " \"content\": { \"type\": \"input_text\", \"text\": GRADER_PROMPT }\n", - " },\n", - " {\n", - " \"type\": \"message\",\n", - " \"role\": \"user\",\n", - " \"content\": { \"type\": \"input_text\", \"text\": USER_PROMPT }\n", - " }\n", - "]\n", + "load_dotenv(override=True)\n", "\n", - "# We need to describe what our evaluation dataset looks like.\n", - "SCHEMA = {\n", - " \"type\": \"object\",\n", - " \"properties\": {\n", - " \"question\": { \"type\": \"string\" },\n", - " \"answer\": { \"type\": \"string\" },\n", - " }\n", - "}\n", - "DATA_SOURCE = {\n", - " \"item_schema\": SCHEMA,\n", - " \"include_sample_schema\": False,\n", - " \"type\": \"custom\",\n", - "}\n", + "# Required for Azure OpenAI client\n", + "os.environ.setdefault(\"OPENAI_API_VERSION\", \"2025-03-01-preview\")\n", "\n", - "# Lastly, we define test criteria that combines all the above.\n", - "TESTING_CRITERIA = {\n", - " \"name\": \"Auto Sarcasm Grader\",\n", - " \"type\": \"score_model\",\n", - " \"model\": GRADER_MODEL,\n", - " \"input\": INPUT,\n", - " \"range\": [1.0, 10.0], # Our grader scores in a range from 1 to 10\n", - " \"pass_threshold\": 4.0, # Let's say a 4 is \"passing\" for now.\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "19368669", - "metadata": {}, - "source": [ - "### 👨‍🔬 Putting it together into an Evaluation\n", - "We have our data source defined (`DATA_SOURCE`) and our testing criteria that defines\n", - "our grader (`TESTING_CRITERIA`). 
Now we have what we need to construct an Evaluation.\n", + "project_endpoint = os.environ.get(\"MICROSOFT_FOUNDRY_PROJECT_ENDPOINT\")\n", + "model_deployment = os.environ.get(\"AZURE_OPENAI_DEPLOYMENT\", \"gpt-4.1\")\n", + "base_model = os.environ.get(\"BASE_MODEL\", \"gpt-4.1-mini\")\n", + "teacher_model = os.environ.get(\"TEACHER_MODEL\", \"gpt-4.1\")\n", "\n", - "> An evaluation can contain multiple testing criteria (i.e. graders), but in our case\n", - "> we just use the one above. Just a heads up on why we're giving it a list here.\n" + "UNIQUE_KEY = str(uuid.uuid4()).split(\"-\")[0]\n", + "print(f\"Run identifier: {UNIQUE_KEY}\")\n", + "print(f\"Project: {project_endpoint}\")" ] }, { "cell_type": "code", - "execution_count": 8, - "id": "2d0a7eb6", + "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "⚖️ Submitted grader evaluation eval_693983fb81d08191b12b619b470c8b19.\n" + "✓ Connected to Microsoft Foundry\n" ] } ], "source": [ - "# We've set up the parameters for our Eval, now we create it via the API.\n", - "grader_eval = client.evals.create(\n", - " name=f\"sarcasm-grader-{UNIQUE_ENOUGH_KEY}\",\n", - " data_source_config=DATA_SOURCE,\n", - " testing_criteria=[TESTING_CRITERIA],\n", - ")\n", + "# Create clients - this is the KEY difference from original demo\n", + "credential = DefaultAzureCredential()\n", + "project_client = AIProjectClient(endpoint=project_endpoint, credential=credential)\n", + "openai_client = project_client.get_openai_client()\n", "\n", - "print(f\"⚖️ Submitted grader evaluation {grader_eval.id}.\")\n" + "print(\"✓ Connected to Microsoft Foundry\")" ] }, { "cell_type": "markdown", - "id": "48ef0227", "metadata": {}, "source": [ - "Oddly, we're **not done!** \n", - "\n", - "We defined the Evaluation, but each Evaluation needs a test Run. 
The Evaluation defines\n", - "the general terms, testing criteria, etc., but since we might want to vary some test\n", - "parameters, we actually need a **Run** to get anything done.\n", - "\n", - "Astute readers may have noticed that _we never specified we want to use baseline.jsonl._\n", - "\n", - "At the Run-level, we provide the specific test file. This lets you separate out the\n", - "schema definition of a test file from the data itself, so if you had multiple files\n", - "you wanted to test, you could create multiple runs.\n", - "\n", - "We define `RUN_DATA_SOURCE` below to specify, by _file id_ which file we want this Run\n", - "to use for data driving our test." + "## 3. Load Baseline Data" ] }, { "cell_type": "code", - "execution_count": 9, - "id": "a0b6954a", + "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "🏃‍➡️ Submitted run evalrun_693983fc25c08191a6718033cc66a6d7 for eval_693983fb81d08191b12b619b470c8b19.\n" + "Loaded 20 baseline examples\n", + "\n", + "Sample:\n", + "Q: Who discovered Antarctica?\n", + "A: Some chaps named Fabian Gottlieb von Bellingshausen and Mikhail Lazarev, as if they don't teach that in every school!\n" ] } ], "source": [ - "# Our evaluation needs a test run. 
This is where we let it know to use our\n", - "# \"gold standard\" file (baseline.jsonl) to test our grader.\n", - "RUN_DATA_SOURCE = {\n", - " \"type\": \"jsonl\",\n", - " \"source\": { \"type\": \"file_id\", \"id\": grader_eval_file.id }\n", - "}\n", - "grader_run = client.evals.runs.create(\n", - " name=GRADER_MODEL,\n", - " eval_id=grader_eval.id,\n", - " data_source=RUN_DATA_SOURCE,\n", - ")\n", - "print(f\"🏃‍➡️ Submitted run {grader_run.id} for {grader_eval.id}.\")" + "baseline_data = []\n", + "with open(\"baseline.jsonl\", \"r\") as f:\n", + " for line in f:\n", + " baseline_data.append(json.loads(line))\n", + "\n", + "print(f\"Loaded {len(baseline_data)} baseline examples\")\n", + "print(f\"\\nSample:\")\n", + "print(f\"Q: {baseline_data[0]['question']}\")\n", + "print(f\"A: {baseline_data[0]['answer']}\")" ] }, { "cell_type": "markdown", - "id": "5018b01b", "metadata": {}, "source": [ - "Nothing is instaneous in life, including Evals. Let's wait for our Run to complete.\n", - "\n", - "We can do this by polling the status of the Run itself. (Note: we don't poll the Eval.)" + "## 4. Define Sarcasm Grader" ] }, { "cell_type": "code", - "execution_count": 10, - "id": "080dd428", + "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "⏱️ Elapsed time: 0 minutes 23 seconds\n", - "🏁 Run evalrun_693983fc25c08191a6718033cc66a6d7: completed!\n" + "Grader prompt defined\n" ] } ], "source": [ - "# An Eval Run takes time to complete. Let's actively wait for it to finish before continuing.\n", - "from IPython.display import clear_output\n", - "import time\n", - "\n", - "start_time = time.time()\n", - "\n", - "grader_run = client.evals.runs.retrieve(eval_id=grader_eval.id, run_id=grader_run.id)\n", - "while grader_run.status not in [\"completed\", \"failed\"]:\n", - " time.sleep(5)\n", - " clear_output(wait=True)\n", + "GRADER_PROMPT = \"\"\"\n", + "You are a connoisseur of finely crafted sarcasm. 
Your job is to evaluate a\n", + "question and answer pair and score it based on how sarcastic the answer is.\n", + "\n", + "# Guidelines\n", + " * Output ONLY a score.\n", + " * A score of 0 means the answer is not sarcastic at all.\n", + " * A score of 0.5 means the answer is moderately sarcastic.\n", + " * A score of 1 means the answer is dripping with sarcasm.\n", + "\n", + "# Scoring Features\n", + " * Exaggeration or overstatement of the obvious\n", + " * Rhetorical questions that mock the original question\n", + " * Use of phrases like \"Oh really?\", \"Shocking!\", \"Who knew?\"\n", + " * Intentional understatement for comedic effect\n", + " * Irony where literal meaning differs from intended meaning\n", + " * The answer must still be factually correct\n", + "\n", + "# Output Format\n", + "Provide ONLY a single decimal number between 0 and 1.\n", + "\"\"\"\n", "\n", - " grader_run = client.evals.runs.retrieve(eval_id=grader_eval.id, run_id=grader_run.id)\n", - " now = time.time()\n", - " mins, secs = int((now - start_time) // 60), int((now - start_time) % 60)\n", - " print(f\"⏱️ Elapsed time: {mins} minutes {secs} seconds\")\n", + "USER_PROMPT = \"\"\"\n", + "Q: {{item.question}}\n", + "A: {{item.answer}}\n", + "\"\"\"\n", "\n", - "print(f\"🏁 Run {grader_run.id}: {grader_run.status}!\")" + "print(\"Grader prompt defined\")" ] }, { "cell_type": "markdown", - "id": "08693c42", "metadata": {}, "source": [ - "### 📊 Viewing and Interpreting Results\n", - "Our Run completed...so now what?\n", - "\n", - "You can view the results in Azure AI Foundry, or stay in the comfort of this\n", - "notebook 😉. Let's stay in the notebook.\n", - "\n", - "There's a help script provided for rendering the results of all Runs for a given\n", - "list of Evaluations. It will render two things:\n", - "\n", - "1. The pass percentage of each Run (i.e. % of prompts scoring above our provided\n", - " pass threshold).\n", - "2.
A histogram of individual scores from a Run letting us see how the score\n", - " distribution looks to see if we're generating excellent (well-above passing)\n", - " results or just barely passing. (_\"C's get degrees,\"_ as they say! 😜)" + "## 5. Create Evaluation (Same API as Original!)" ] }, { "cell_type": "code", - "execution_count": 11, - "id": "bd72521a", + "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Get Evaluation Runs: eval_693983fb81d08191b12b619b470c8b19\n", - "\n", - "==================================================\n", - "Combined Evaluation Summary\n", - "==================================================\n", - " Run ID Run Name Model Status Pass Percentage (%) Error Percentage (%) Evaluation ID Evaluation Name\n", - "evalrun_693983fc25c08191a6718033cc66a6d7 o3-mini None completed 90.0 0.0 eval_693983fb81d08191b12b619b470c8b19 sarcasm-grader-90f2f57b\n", - "==================================================\n" - ] - }, - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "==================================================\n", - "Fetching scores...\n", - "==================================================\n", - "\n", - "==================================================\n", - "Score Summary Table:\n", - "Model Evaluation Name Average Score Min Score Max Score 10th Percentile 25th Percentile 50th Percentile 75th Percentile 90th Percentile\n", - " None sarcasm-grader-90f2f57b 6.25 3.00 8.00 3.90 4.75 7.00 8.00 8.00\n", - "==================================================\n" + "✓ Eval file: file-185f7d1d42464c4f91996a2442505ccb\n" ] - }, - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" } ], "source": [ - "# We've got a handy script for rendering the results from an Evaluations Runs. Let's\n", - "# eyeball this stuff. It uses the Evals API to retrieve the scores and plot them.\n", - "from scripts.eval_utils import display_evaluation_summary\n", + "# Upload baseline for evaluation\n", + "with open(\"baseline.jsonl\", \"rb\") as f:\n", + " grader_eval_file = openai_client.files.create(purpose=\"evals\", file=f)\n", + " grader_eval_file = openai_client.files.wait_for_processing(grader_eval_file.id)\n", "\n", - "display_evaluation_summary(client, [grader_eval.id], x_range=(0, 10))\n", - "\n", - "# We should see that our grader generally thinks our \"gold standard\" is pretty\n", - "# sarcastic. This is where we'd iterate on tuning the grader, making sure we\n", - "# clearly capture features for it to score, etc. We're keeping it simple for now." - ] - }, - { - "cell_type": "markdown", - "id": "457d1a5f", - "metadata": {}, - "source": [ - "### 🤔 So how's it looking?\n", - "Ok! Not bad if we consider our *gold standard* data as generally \"good\" quality\n", - "sarcasm. The Grader thinks these are decent examples with nothing below a `2.0` in\n", - "this case and with the *p90* score being `6.0` (this varies with each run of the\n", - "notebook, but it tends to land around here)." - ] - }, - { - "cell_type": "markdown", - "id": "1b84f91b", - "metadata": {}, - "source": [ - "## 2. Baseline Testing of our Base Models\n", - "At this point we've done two things (hopefully!):\n", - "\n", - "1. Developed a Grader we feel is fit for our task.\n", - "2. Learned how to construct and submit an Eval and Run.\n", - "\n", - "> ℹ️ If you aren't confident in (2), this is a good time to go re-read the above.\n", - "\n", - "### Generating our Dataset\n", - "The real beauty of this type of distillation is we don't have to kill ourselves\n", - "to create a dataset! 
We really only need to generate the end-user's side of the\n", - "prompt and not the model's side, making this _so much simpler._\n", - "\n", - "Since our agent usecase here is 📎Clippy, we just need some simple questions a\n", - "user of our application might ask. A model like `GPT-4.1` can do this for us with\n", - "a prompt like:\n", - "\n", - "```\n", - "Generate 100 question and answer pairs that might be used in a quiz game. Output\n", - "the pairs in JSONL with the following schema:\n", - "\n", - "{ \"question\": , \"answer\": }\n", - "\n", - "Try not to create duplicates!\n", - "```\n", - "\n", - "> Seriously, this is how I created `qa.jsonl`. You can use Github Copilot right\n", - "> inside Visual Studio Code to generate it and also detect/remove duplicates\n", - "> until you have 500 rows.\n", - "\n", - "Let's take a peek at the data in [qa.jsonl](./qa.jsonl):\n" + "print(f\"✓ Eval file: {grader_eval_file.id}\")" ] }, { "cell_type": "code", - "execution_count": 12, - "id": "ccfec9be", + "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Number of Q/A pairs: 500\n", - "{'question': 'What is the freezing point of water in Fahrenheit?', 'answer': '32.'}\n", - "{'question': 'Which famous physicist developed the theory of relativity?', 'answer': 'Albert Einstein.'}\n", - "{'question': \"Who wrote the novel '1984'?\", 'answer': 'George Orwell'}\n" + "Evaluation criteria defined\n" ] } ], "source": [ - "# We've previously generated 500 Question/Answer pairs. Note that they are only\n", - "# focused on factual answers. 
We're not trying to provide any sarcasm here, just\n", - "# facts...and honestly we're not going to use the answers anyways...but you might\n", - "# use them if you had another grader that checked for gold standard answers.\n", - "import json\n", - "\n", - "qa = []\n", - "with open(\"./qa.jsonl\", \"r\") as f:\n", - " for line in f.readlines():\n", - " qa.append(json.loads(line))\n", - "\n", - "print(f\"Number of Q/A pairs: {len(qa)}\")\n", - "for i in range(3):\n", - " print(qa[i])" - ] - }, - { - "cell_type": "markdown", - "id": "3277e523", - "metadata": {}, - "source": [ - "We now need to split this into two sets:\n", + "# Define evaluation schema\n", + "data_source_config = {\n", + " \"type\": \"custom\",\n", + " \"item_schema\": {\n", + " \"type\": \"object\",\n", + " \"properties\": {\n", + " \"question\": {\"type\": \"string\"},\n", + " \"answer\": {\"type\": \"string\"}\n", + " },\n", + " \"required\": [\"question\", \"answer\"]\n", + " }\n", + "}\n", "\n", - "1. A set for **baseline** testing (`qa_baseline`)\n", - "2. A set for **validation** after fine-tuning (`qa_validation`)\n", + "# Define grader as testing criteria\n", + "testing_criteria = [\n", + " {\n", + " \"type\": \"score_model\",\n", + " \"name\": \"sarcasm_score\",\n", + " \"model\": model_deployment,\n", + " \"input\": [\n", + " {\"role\": \"system\", \"content\": GRADER_PROMPT},\n", + " {\"role\": \"user\", \"content\": USER_PROMPT}\n", + " ],\n", + " \"pass_threshold\": 0.5\n", + " }\n", + "]\n", "\n", - "A simple 80/20 split here is good because the point of the validation set here is to have\n", - "a totally _different_ test to use post-training that has _zero overlap_ with what we'll\n", - "be using to train our student." 
+ "print(\"Evaluation criteria defined\")" ] }, { "cell_type": "code", - "execution_count": 13, - "id": "7d3e8982", + "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "400 pairs for baseline testing, 100 for validation.\n" + "✓ Evaluation: eval_d341bcecf17245e48c9131906b3b73fc\n" ] } ], "source": [ - "# Now let's split these into two sets: our baseline set and our validation set. We'll just split\n", - "# them in half for now. \n", - "\n", - "# First we'll randomize it to maybe prove a point that this isn't totally staged 😜\n", - "from random import shuffle\n", - "shuffle(qa)\n", - "\n", - "# Now we split 80/20.\n", - "split_at = int(len(qa) * 0.8)\n", - "qa_baseline = qa[:split_at]\n", - "qa_validation = qa[split_at:]\n", - "\n", - "# Check it.\n", - "print(f\"{len(qa_baseline)} pairs for baseline testing, {len(qa_validation)} for validation.\")" - ] - }, - { - "cell_type": "markdown", - "id": "f11cc4b1", - "metadata": {}, - "source": [ - "Let's upload our baseline dataset.\n", - "\n", - "First, we'll write it to disk both to let you inspect it, but also because the OpenAI SDK\n", - "really prefers to upload data file files. 🙃\n", + "# Create evaluation\n", + "grader_eval = openai_client.evals.create(\n", + " name=f\"sarcasm-grader-{UNIQUE_KEY}\",\n", + " data_source_config=data_source_config,\n", + " testing_criteria=testing_criteria\n", + ")\n", "\n", - "> Yeah, if you're a Python dev and know how to make Files-like objects from data in-memory,\n", - "> _sorry_, but the SDK will barf." 
+ "print(f\"✓ Evaluation: {grader_eval.id}\")" ] }, { "cell_type": "code", - "execution_count": 14, - "id": "b73f399e", + "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Created baseline file:\n", - "{\n", - " \"id\": \"file-ec72c2ec38dc4bf08937ee2d69cf5a80\",\n", - " \"bytes\": 33245,\n", - " \"created_at\": 1765377055,\n", - " \"filename\": \"sarcasm-baseline-90f2f57b.jsonl\",\n", - " \"object\": \"file\",\n", - " \"purpose\": \"evals\",\n", - " \"status\": \"processed\"\n", - "}\n" + "✓ Eval run: evalrun_730150a66c8343c8a62edbbf95b75a4d\n" ] } ], "source": [ - "# Now we'll upload our baseline dataset and prepare our Evaluation. We need to save the data\n", - "# to disk first for...reasons...because of the OpenAI SDK. That's fine.\n", - "filename = f\"./sarcasm-baseline-{UNIQUE_ENOUGH_KEY}.jsonl\"\n", - "\n", - "with open(filename, \"w\") as f:\n", - " for row in qa_baseline:\n", - " json.dump(row, f)\n", - " f.write(\"\\n\")\n", - "\n", - "baseline_file = None\n", - "with open(filename, \"rb\") as f:\n", - " baseline_file = client.files.create(purpose=\"evals\", file=f)\n", - " baseline_file = client.files.wait_for_processing(baseline_file.id)\n", + "# Run evaluation on baseline\n", + "grader_run = openai_client.evals.runs.create(\n", + " eval_id=grader_eval.id,\n", + " name=f\"baseline-{UNIQUE_KEY}\",\n", + " data_source={\n", + " \"type\": \"jsonl\",\n", + " \"source\": {\"type\": \"file_id\", \"id\": grader_eval_file.id}\n", + " }\n", + ")\n", "\n", - "print(f\"Created baseline file:\\n{baseline_file.to_json(indent=2)}\")" + "print(f\"✓ Eval run: {grader_run.id}\")" ] }, { - "cell_type": "markdown", - "id": "ae9d2b91", + "cell_type": "code", + "execution_count": 16, "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Waiting for evaluation...\n", + " Status: in_progress\n", + " Status: in_progress\n", + " Status: in_progress\n", + " Status: 
in_progress\n", + " Status: in_progress\n", + " Status: in_progress\n", + " Status: in_progress\n", + " Status: in_progress\n", + " Status: in_progress\n", + " Status: completed\n", + "\n", + "✓ completed!\n", + " Passed: 19/20\n" + ] + } + ], "source": [ - "### Defining our Baseline Evaluation\n", - "Most of this will look similar to above from section 1 where we evaluated our Grader.\n", - "\n", - "There are some key differences:\n", - "\n", - "1. The prompt template now uses `{{sample.output_text}}` because we'll be using the\n", - " base model (the model under test) to generate the answer to the user's question.\n", - "2. Because we're mixing _both_ the provided dataset file with generated responses,\n", - " we have to make some slight tweaks to our data source definition (`DATA_SOURCE`).\n", + "# Wait for completion\n", + "print(\"Waiting for evaluation...\")\n", + "while True:\n", + " run = openai_client.evals.runs.retrieve(run_id=grader_run.id, eval_id=grader_eval.id)\n", + " print(f\" Status: {run.status}\")\n", + " if run.status in [\"completed\", \"failed\"]:\n", + " break\n", + " time.sleep(5)\n", "\n", - "Other than that, this is pretty similar." + "print(f\"\\n✓ {run.status}!\")\n", + "if run.result_counts:\n", + " print(f\" Passed: {run.result_counts.passed}/{run.result_counts.total}\")" ] }, { "cell_type": "code", - "execution_count": 15, - "id": "69774305", + "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "⚖️ Created baseline eval eval_693984204a7c8191b08abdc67b5d5e23\n" + "Baseline Sarcasm Scores:\n", + " Average: 0.59\n", + " Std Dev: 0.15\n" ] } ], "source": [ - "# We'll now build out the Evaluation details. 
In this case, we'll *generate* responses\n", - "# using a base model, unlike before where we used the pre-canned results just to test\n", - "# the grader.\n", - "\n", - "# We'll use a simple system prompt to show how distillation and fine-tuning let us\n", - "# get away without overly complex prompt engineering.\n", - "SYSTEM_PROMPT = \"Clippy is a factual chatbot that is also sarcastic.\"\n", - "\n", - "# We'll use a flee of base models as our baseline, including `o3` (our grader).\n", - "BASE_MODELS = [\n", - " \"DeepSeek-V3.1\",\n", - " \"gpt-4.1\",\n", - " \"gpt-4.1-mini\",\n", - " \"gpt-4.1-nano\",\n", - " \"gpt-4o\",\n", - " \"gpt-4o-mini\",\n", - " \"Ministral-3B\",\n", - "]\n", - "\n", - "# The prompt we'll grade will look like this pattern. Similar to before, but now we're\n", - "# going to use {{sample.output_text}} to substitute what the model under test generates.\n", - "USER_PROMPT = \"\"\"\n", - "Q: {{item.question}}\n", - "A: {{sample.output_text}}\n", - "\"\"\"\n", - "\n", - "# Input to our grader remains the same as before, but we reproduce it here for context.\n", - "INPUT = [\n", - " {\n", - " \"type\": \"message\",\n", - " \"role\": \"system\",\n", - " \"content\": { \"type\": \"input_text\", \"text\": GRADER_PROMPT }\n", - " },\n", - " {\n", - " \"type\": \"message\",\n", - " \"role\": \"user\",\n", - " \"content\": { \"type\": \"input_text\", \"text\": USER_PROMPT }\n", - " }\n", - "]\n", - "\n", - "# The schema and data source are similar, but with one major difference noted below.\n", - "SCHEMA = {\n", - " \"type\": \"object\",\n", - " \"properties\": {\n", - " \"question\": { \"type\": \"string\" },\n", - " \"answer\": { \"type\": \"string\" },\n", - " },\n", - "}\n", - "DATA_SOURCE = {\n", - " \"item_schema\": SCHEMA, \n", - " \"include_sample_schema\": True, # Note this change! 
Needed for data gen.\n", - " \"type\": \"custom\"\n", - "}\n", - "\n", - "# Same testing criteria, reproduced again for context.\n", - "TESTING_CRITERIA = {\n", - " \"name\": \"Auto Sarcasm Grader\",\n", - " \"type\": \"score_model\",\n", - " \"model\": GRADER_MODEL,\n", - " \"input\": INPUT,\n", - " \"range\": [1.0, 10.0],\n", - " \"pass_threshold\": 4.0,\n", - "}\n", + "# Get scores\n", + "output_items = list(openai_client.evals.runs.output_items.list(run_id=grader_run.id, eval_id=grader_eval.id))\n", + "baseline_scores = [r.score for item in output_items for r in item.results if r.score is not None]\n", "\n", - "# We create one Evaluation for *all* our base models. Each model is tested in a\n", - "# distinct Run that we'll define next.\n", - "baseline_eval = client.evals.create(\n", - " name=f\"sacarsm-baseline-{UNIQUE_ENOUGH_KEY}\",\n", - " data_source_config=DATA_SOURCE,\n", - " testing_criteria=[TESTING_CRITERIA]\n", - ")\n", - "print(f\"⚖️ Created baseline eval {baseline_eval.id}\")\n", - "\n" + "print(f\"Baseline Sarcasm Scores:\")\n", + "print(f\" Average: {np.mean(baseline_scores):.2f}\")\n", + "print(f\" Std Dev: {np.std(baseline_scores):.2f}\")" ] }, { "cell_type": "markdown", - "id": "1a7640fb", "metadata": {}, "source": [ - "Now instead of a single Run, we submit _one Run per base model_.\n", - "\n", - "This part is **new**, so let's look at it closely.\n", - "\n", - "Each Run has its own data source defined. Like with the Grader evaluation where\n", - "we finally said _which_ file to use for a test dataset, we're not providing both\n", - "a prompt template _and_ a reference to the test dataset.\n", - "\n", - "1. `source` -- looks like what we did previously, referencing our test data by\n", - " file id.\n", - "2. `input_messages` -- provides our prompt template, looking similar to how we\n", - " defined the Grader prompt previously. 
Note, however, we're now using our\n", - " `SYSTEM_PROMPT` (the simple 1-liner Clippy one) and wiring in the test data\n", - " as the user's input.\n", - "\n", - "> ℹ️ An astute reader will notice _we're not using the `answer` field_ from our\n", - "> test data. Yup! In this demo, we don't. You _could_ extend the grader to use\n", - "> that as \"ground truth\" for scoring the accuracy. Exercise left to you, my\n", - "> friend!\n", - "\n", - "> ⚠️ Attention!\n", - "> There's one thing to call out and that's the `sampling_params`. This lets us\n", - "> provide tuning of the chat completion parameters to the model under test. In\n", - "> this case, we're tuning the max completion tokens.\n", - "> \n", - "> However, there are two things to note:\n", - "> 1. We use a different value depending on if it's a reasoning model being tested\n", - "> or if it's a GPT model. (I'm not sure if this matters?)\n", - "> 2. More importantly, Azure OpenAI has a 🐛bug where we need to specificy it as\n", - "> `max_completions_tokens` and not `max_completion_tokens`. Note the lack of\n", - "> the `s`." + "## 6. 
Test Inference" ] }, { "cell_type": "code", - "execution_count": 16, - "id": "85308636", + "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "🏃‍➡️ Created run evalrun_69398420f92081919db1b05086cd59ce for eval eval_693984204a7c8191b08abdc67b5d5e23\n", - "🏃‍➡️ Created run evalrun_69398421b4308191bc33dfac7567f082 for eval eval_693984204a7c8191b08abdc67b5d5e23\n", - "🏃‍➡️ Created run evalrun_6939842251f08191b1c8a9e4c5d0b2b1 for eval eval_693984204a7c8191b08abdc67b5d5e23\n", - "🏃‍➡️ Created run evalrun_69398422f7508191a52bfb35d10a2542 for eval eval_693984204a7c8191b08abdc67b5d5e23\n", - "🏃‍➡️ Created run evalrun_69398423c1008191b859c069dbce521e for eval eval_693984204a7c8191b08abdc67b5d5e23\n", - "🏃‍➡️ Created run evalrun_693984246064819180e8989ca09c597a for eval eval_693984204a7c8191b08abdc67b5d5e23\n", - "🏃‍➡️ Created run evalrun_6939842511a48191a67a392e04471fc1 for eval eval_693984204a7c8191b08abdc67b5d5e23\n" + "Q: What is the capital of France?\n", + "A: Oh, what a stumper! The capital of France? Let me consult my ancient scrolls... Oh wait, it’s Paris. You know, the city with the Eiffel Tower, baguettes, and a population of people who perfected the art of looking unimpressed. Paris is the answer you’re searching for, unless France has pulled a fast one overnight.\n" ] } ], "source": [ - "# Each run gets its own data source definition as it needs to specify a different\n", - "# model deployment to use for generation. The template is the prompt template\n", - "# sent to the model under test. 
It uses the simple Clippy system prompt and for\n", - "# the user input, we use the \"question\" from the baseline Q&A data file.\n", - "baseline_runs = []\n", - "for model in BASE_MODELS:\n", - " RUN_DATA_SOURCE = {\n", - " \"type\": \"completions\",\n", - " \"model\": model,\n", - " \"source\": { \"type\": \"file_id\", \"id\": baseline_file.id },\n", - " \"input_messages\": {\n", - " \"type\": \"template\",\n", - " \"template\": [\n", - " { \n", - " \"type\": \"message\", \n", - " \"role\": \"system\", \n", - " \"content\": { \"type\": \"input_text\", \"text\": SYSTEM_PROMPT },\n", - " },\n", - " { \n", - " \"type\": \"message\", \n", - " \"role\": \"user\", \n", - " \"content\": { \"type\": \"input_text\", \"text\": \"{{item.question}}\" },\n", - " },\n", - " ],\n", - " },\n", - " }\n", - " run = client.evals.runs.create(\n", - " name=f\"{model}-{UNIQUE_ENOUGH_KEY}\", \n", - " eval_id=baseline_eval.id,\n", - " data_source=RUN_DATA_SOURCE, \n", + "SARCASM_SYSTEM_PROMPT = \"\"\"You are a sarcastically witty assistant. Answer questions with \n", + "biting wit while remaining factually correct. Channel your inner comedian who's \n", + "slightly annoyed at obvious questions.\"\"\"\n", + "\n", + "def generate_response(question: str, model: str = None) -> str:\n", + " \"\"\"Generate a sarcastic response using responses API.\"\"\"\n", + " response = openai_client.responses.create(\n", + " model=model or teacher_model,\n", + " instructions=SARCASM_SYSTEM_PROMPT,\n", + " input=question,\n", + " temperature=0.7,\n", + " max_output_tokens=150\n", " )\n", - " print(f\"🏃‍➡️ Created run {run.id} for eval {baseline_eval.id}\")\n", - " baseline_runs.append(run)" + " return response.output_text\n", + "\n", + "# Test\n", + "test_q = \"What is the capital of France?\"\n", + "print(f\"Q: {test_q}\")\n", + "print(f\"A: {generate_response(test_q)}\")" ] }, { "cell_type": "markdown", - "id": "daaa0f77", "metadata": {}, "source": [ - "Waiting is always the hardest part. 
What can I say?\n", - "\n", - "This can take 15-20 minutes depending on your TPM limits. Maybe more. Maybe less." + "## 7. Load Q&A Data" ] }, { "cell_type": "code", - "execution_count": 17, - "id": "2d83696a", + "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "🏃‍➡️ Run DeepSeek-V3.1-90f2f57b: completed\n", - "🏃‍➡️ Run gpt-4.1-90f2f57b: completed\n", - "🏃‍➡️ Run gpt-4.1-mini-90f2f57b: completed\n", - "🏃‍➡️ Run gpt-4.1-nano-90f2f57b: completed\n", - "🏃‍➡️ Run gpt-4o-90f2f57b: completed\n", - "🏃‍➡️ Run gpt-4o-mini-90f2f57b: completed\n", - "🏃‍➡️ Run Ministral-3B-90f2f57b: completed\n", - "⏱️ Elapsed time: 8 minutes 1 seconds\n", - "🏁 All 7 runs completed!\n" + "Training: 250, Validation: 250\n" ] } ], "source": [ - "# We now have to wait for our half-dozen or so Runs to finish. Twiddle your thumbs a bit!\n", - "from IPython.display import clear_output\n", - "import time\n", - "\n", - "start_time = time.time()\n", - "\n", - "while any([r.status not in [\"completed\", \"failed\"] for r in baseline_runs]):\n", - " time.sleep(10)\n", - " clear_output(wait=True)\n", + "qa_data = []\n", + "with open(\"qa.jsonl\", \"r\") as f:\n", + " for line in f:\n", + " qa_data.append(json.loads(line))\n", "\n", - " for i in range(len(baseline_runs)):\n", - " baseline_runs[i] = client.evals.runs.retrieve(eval_id=baseline_eval.id, run_id=baseline_runs[i].id)\n", - " print(f\"🏃‍➡️ Run {baseline_runs[i].name}: {baseline_runs[i].status}\")\n", - " \n", - " now = time.time()\n", - " print(\"⏱️ Elapsed time: {} minutes {} seconds\".format(int((now - start_time) // 60), int((now - start_time) % 60)))\n", + "shuffle(qa_data)\n", + "split_idx = len(qa_data) // 2\n", + "training_questions = qa_data[:split_idx]\n", + "validation_questions = qa_data[split_idx:]\n", "\n", - "print(f\"🏁 All {len(baseline_runs)} runs completed!\")" + "print(f\"Training: {len(training_questions)}, Validation: {len(validation_questions)}\")" ] }, { 
"cell_type": "markdown", - "id": "3c425e6f", "metadata": {}, "source": [ - "### Interpreting our Baseline Results\n", - "Now we get to see our winner and loser!\n", - "\n", - "We'll use the same plotting function as before to look at _pass percentage_ and the\n", - "individual score distributions for each model.\n", - "\n", - "What we want to do here is:\n", - "1. Identify the clear winner to designat our *teacher*.\n", - "2. Identify the clear loser to designate our *student*.\n", - "\n", - "Recall our general hypothesis here is:\n", - "\n", - "> A larger, more robust model will excel at our task out of the box at the cost of\n", - "> typically more $'s per token and often higher latency (time-to-first-token).\n", - ">\n", - "> A smaller, less robust model will perform poorly out of the box, but will provide\n", - "> a better $/token price-point and often much lower latency.\n", - ">\n", - "> The ideal model for our agent will have the lowest $/token and latency, while\n", - "> achieving acceptable scores.\n", - "\n", - "Our goal, as a reminder, is to take that fast, cheap model and make it perform\n", - "**as well as the slower, more expensive model**.\n", - "\n", - "> 😜 Better, Faster, Cheaper: Pick 3!" + "## 8. 
Generate Training Data" ] }, { "cell_type": "code", - "execution_count": 18, - "id": "5acc0dcd", + "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Get Evaluation Runs: eval_693984204a7c8191b08abdc67b5d5e23\n", + "Generating 50 training examples...\n", + " 10/50...\n", + " 20/50...\n", + " 30/50...\n", + " 40/50...\n", + " 50/50...\n", "\n", - "==================================================\n", - "Combined Evaluation Summary\n", - "==================================================\n", - " Run ID Run Name Model Status Pass Percentage (%) Error Percentage (%) Evaluation ID Evaluation Name\n", - "evalrun_69398420f92081919db1b05086cd59ce DeepSeek-V3.1-90f2f57b DeepSeek-V3.1 completed 93.750000 3.00 eval_693984204a7c8191b08abdc67b5d5e23 sacarsm-baseline-90f2f57b\n", - "evalrun_69398421b4308191bc33dfac7567f082 gpt-4.1-90f2f57b gpt-4.1 completed 84.750000 5.25 eval_693984204a7c8191b08abdc67b5d5e23 sacarsm-baseline-90f2f57b\n", - "evalrun_693984246064819180e8989ca09c597a gpt-4o-mini-90f2f57b gpt-4o-mini completed 80.000000 2.75 eval_693984204a7c8191b08abdc67b5d5e23 sacarsm-baseline-90f2f57b\n", - "evalrun_6939842251f08191b1c8a9e4c5d0b2b1 gpt-4.1-mini-90f2f57b gpt-4.1-mini completed 74.686717 4.00 eval_693984204a7c8191b08abdc67b5d5e23 sacarsm-baseline-90f2f57b\n", - "evalrun_69398423c1008191b859c069dbce521e gpt-4o-90f2f57b gpt-4o completed 74.500000 3.50 eval_693984204a7c8191b08abdc67b5d5e23 sacarsm-baseline-90f2f57b\n", - "evalrun_6939842511a48191a67a392e04471fc1 Ministral-3B-90f2f57b Ministral-3B completed 29.500000 1.50 eval_693984204a7c8191b08abdc67b5d5e23 sacarsm-baseline-90f2f57b\n", - "evalrun_69398422f7508191a52bfb35d10a2542 gpt-4.1-nano-90f2f57b gpt-4.1-nano completed 27.000000 2.75 eval_693984204a7c8191b08abdc67b5d5e23 sacarsm-baseline-90f2f57b\n", - "==================================================\n" - ] - }, - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "==================================================\n", - "Fetching scores...\n", - "==================================================\n", + "Generating 25 validation examples...\n", "\n", - "==================================================\n", - "Score Summary Table:\n", - " Model Evaluation Name Average Score Min Score Max Score 10th Percentile 25th Percentile 50th Percentile 75th Percentile 90th Percentile\n", - "DeepSeek-V3.1 sacarsm-baseline-90f2f57b 7.07 0.00 10.00 5.00 7.00 8.00 8.00 8.00\n", - " gpt-4.1 sacarsm-baseline-90f2f57b 5.80 0.00 9.00 3.00 5.00 6.00 7.00 8.00\n", - " gpt-4o-mini sacarsm-baseline-90f2f57b 5.54 0.00 9.00 3.00 4.00 6.00 7.00 8.00\n", - " gpt-4o sacarsm-baseline-90f2f57b 5.26 0.00 9.00 3.00 3.00 6.00 7.00 8.00\n", - " gpt-4.1-mini sacarsm-baseline-90f2f57b 5.18 0.00 9.00 3.00 3.00 6.00 7.00 8.00\n", - " gpt-4.1-nano sacarsm-baseline-90f2f57b 2.85 0.00 8.00 1.00 1.00 2.00 4.00 6.00\n", - " Ministral-3B sacarsm-baseline-90f2f57b 2.70 0.00 8.00 1.00 1.00 1.00 5.00 7.00\n", - "==================================================\n" + "✓ Generated 50 training, 25 validation\n" ] - }, - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" } ], "source": [ - "# Let's visualize our evaluation and identify the best and worst performers.\n", - "display_evaluation_summary(client, [baseline_eval.id], x_range=(1, 10))\n" - ] - }, - { - "cell_type": "markdown", - "id": "780e5d56", - "metadata": {}, - "source": [ - "You'll notice we have a very clear winner and a very clear loser.\n", - "\n", - "- `DeepSeek-V3.1` -- consistently scores the highest both in terms of passing scores,\n", - " but also generates generates a lot of high scores.\n", - "- `Ministral-3B` -- higher average score than `gpt-4.1-nano`, but has the most scores\n", - " of `1`. Just an embarassment!\n", - "\n", - "But let's hold that thought for a moment. Visually we can see this, but remember what\n", - "we want to do here is find not just the _best performing model_, but we need it's best\n", - "_example responses_.\n" - ] - }, - { - "cell_type": "markdown", - "id": "a729010c", - "metadata": {}, - "source": [ - "\n", - "## 4. 🧪 Distilling from the Teacher\n", - "Let's look at all our models again, this time via code, and find just the _excellent_\n", - "responses that scored `6.0` or higher.\n", - "\n", - "> ℹ️ This part gets a bit technical! We'll be doing some data engineering on the fly\n", - "> as we analyze the excellent responses. This is maybe the more complex Python in this\n", - "> entire notebook as it works around some limitations with the Evaluations API, but\n", - "> buckle up and I promise it's worth it.\n", - "\n", - "What we're going to do is query each Run, look at the Run's _individual results_,\n", - "collect just the \"excellent\" ones, and while we're doing so we'll be\n", - "_transforming them into chat completions_.\n", - "\n", - "Then it's as simple as seeing which model had the most \"excellent\" scores and\n", - "declaring our winner." 
+ "MAX_TRAINING = 50\n", + "MAX_VALIDATION = 25\n", + "\n", + "print(f\"Generating {MAX_TRAINING} training examples...\")\n", + "training_data = []\n", + "for i, item in enumerate(training_questions[:MAX_TRAINING]):\n", + " response = generate_response(item['question'])\n", + " training_data.append({\n", + " \"messages\": [\n", + " {\"role\": \"system\", \"content\": SARCASM_SYSTEM_PROMPT},\n", + " {\"role\": \"user\", \"content\": item['question']},\n", + " {\"role\": \"assistant\", \"content\": response}\n", + " ]\n", + " })\n", + " if (i + 1) % 10 == 0:\n", + " print(f\" {i + 1}/{MAX_TRAINING}...\")\n", + "\n", + "print(f\"\\nGenerating {MAX_VALIDATION} validation examples...\")\n", + "validation_data = []\n", + "for item in validation_questions[:MAX_VALIDATION]:\n", + " response = generate_response(item['question'])\n", + " validation_data.append({\n", + " \"messages\": [\n", + " {\"role\": \"system\", \"content\": SARCASM_SYSTEM_PROMPT},\n", + " {\"role\": \"user\", \"content\": item['question']},\n", + " {\"role\": \"assistant\", \"content\": response}\n", + " ]\n", + " })\n", + "\n", + "print(f\"\\n✓ Generated {len(training_data)} training, {len(validation_data)} validation\")" ] }, { "cell_type": "code", - "execution_count": 19, - "id": "2ab5fae6", + "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "😜 Ok! Let's use DeepSeek-V3.1. 
It had 357 excellent responses.\n" + "✓ training_sarcasm_4aa88349.jsonl\n", + "✓ validation_sarcasm_4aa88349.jsonl\n" ] } ], "source": [ - "# We'll find the model that generated the most \"excellent\" (>= 6.0) examples of sarcasm.\n", - "CUTOFF = 6.0\n", - "HIGH_SCORES = dict([(m, []) for m in BASE_MODELS])\n", + "# Save files\n", + "training_file_path = f\"training_sarcasm_{UNIQUE_KEY}.jsonl\"\n", + "validation_file_path = f\"validation_sarcasm_{UNIQUE_KEY}.jsonl\"\n", + "\n", + "with open(training_file_path, \"w\") as f:\n", + " for item in training_data:\n", + " f.write(json.dumps(item) + \"\\n\")\n", "\n", - "# Let's find our responses that were Excellent (at or above CUTOFF). We'll collect them\n", - "# and pre-format them into chat completions format to save time later.\n", - "#\n", - "# This part is honestly a bit tricky...we're extracting the prompts and responses for the\n", - "# model under test and *not* the prompts to the grader, so we have to do surgery. 🔪\n", - "for run in baseline_runs:\n", - " pages = client.evals.runs.output_items.list(run.id, eval_id=baseline_eval.id).iter_pages()\n", - " for page in pages:\n", - " for item in page.data:\n", - " # We only used 1 grader. 
If you use multiple, you should look for which ones you want.\n", - " if not item.results:\n", - " continue\n", - " result = item.results[0]\n", - " if result.score >= CUTOFF:\n", - " generated = result.sample[\"input\"][-1][\"content\"].strip().split(\"\\nA: \")\n", - " question = generated[0][3:] # drops the \"Q: \"\n", - " answer = generated[-1]\n", - " messages = [\n", - " { \"role\": \"system\", \"content\": SYSTEM_PROMPT },\n", - " { \"role\": \"user\", \"content\": question },\n", - " { \"role\": \"assistant\", \"content\": answer },\n", - " ]\n", - " HIGH_SCORES[run.model].append({ \"messages\": messages })\n", + "with open(validation_file_path, \"w\") as f:\n", + " for item in validation_data:\n", + " f.write(json.dumps(item) + \"\\n\")\n", "\n", - "# Time to find the winner! Obviously, this is probably o3...\n", - "winning_model = \"\"\n", - "winning_cnt = 0\n", - "for key in HIGH_SCORES.keys():\n", - " if len(HIGH_SCORES[key]) > winning_cnt:\n", - " winning_model = key\n", - " winning_cnt = len(HIGH_SCORES[key])\n", - " \n", - "print(f\"😜 Ok! Let's use {winning_model}. It had {winning_cnt} excellent responses.\")" + "print(f\"✓ {training_file_path}\")\n", + "print(f\"✓ {validation_file_path}\")" ] }, { "cell_type": "markdown", - "id": "f3bbdd48", "metadata": {}, "source": [ - "We now take _just the Teacher's excellent responses_ and construct our training data.\n", - "\n", - "We should well over 100 examples if all goes well (based on previous testing of this notebook),\n", - "so let's give it a train/test split:\n", - "\n", - "1. Split the data in-memory.\n", - "2. Write the training and validation data out as JSONL to disk.\n", - "3. Upload them via the Files API." + "## 9. 
Fine-Tune Model" ] }, { "cell_type": "code", - "execution_count": 20, - "id": "f70963f8", + "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Split into 285 training / 72 validation rows.\n", - "🏋️‍♂️ Created training file:\n", - "{\n", - " \"id\": \"file-8289de70b4af4de98f2136dcd694e0ec\",\n", - " \"bytes\": 120909,\n", - " \"created_at\": 1765377614,\n", - " \"filename\": \"sarcasm-training-90f2f57b.jsonl\",\n", - " \"object\": \"file\",\n", - " \"purpose\": \"fine-tune\",\n", - " \"status\": \"processed\"\n", - "}\n", - "📋 Created validation file:\n", - "{\n", - " \"id\": \"file-8d808872611d45b1b1d38221d4e777ae\",\n", - " \"bytes\": 30810,\n", - " \"created_at\": 1765377620,\n", - " \"filename\": \"sarcasm-validation-90f2f57b.jsonl\",\n", - " \"object\": \"file\",\n", - " \"purpose\": \"fine-tune\",\n", - " \"status\": \"processed\"\n", - "}\n" + "Training: file-e457cb500c77408db3a563f2dcfa986f\n", + "Validation: file-b7d2cf189fb9422eb9e57a5c454d7531\n", + "✓ Files ready\n" ] } ], "source": [ - "# Before we go any further, let's turn our collected excellent responses into our\n", - "# training and validation fine-tuning datasets. 
Like before, we have to write these\n", - "# to disk and then upload them via the Files API.\n", - "training_filename = f\"sarcasm-training-{UNIQUE_ENOUGH_KEY}.jsonl\"\n", - "validation_filename = f\"sarcasm-validation-{UNIQUE_ENOUGH_KEY}.jsonl\"\n", - "\n", - "# Make an 80/20 split to form our training/validation data.\n", - "split_at = int(len(HIGH_SCORES[winning_model]) * 0.80)\n", - "training_data = HIGH_SCORES[winning_model][:split_at]\n", - "validation_data = HIGH_SCORES[winning_model][split_at:]\n", - "print(f\"Split into {len(training_data)} training / {len(validation_data)} validation rows.\")\n", - "\n", - "# Create and upload the training data.\n", - "with open(training_filename, \"w\") as f:\n", - " for message in training_data:\n", - " json.dump(message, f)\n", - " f.write(\"\\n\")\n", - "with open(training_filename, \"rb\") as f:\n", - " training_file = client.files.create(file=f, purpose=\"fine-tune\")\n", - " training_file = client.files.wait_for_processing(training_file.id)\n", - "print(f\"🏋️‍♂️ Created training file:\\n{training_file.to_json(indent=2)}\")\n", - "\n", - "# Create and upload the validation data.\n", - "with open(validation_filename, \"w\") as f:\n", - " for message in validation_data:\n", - " json.dump(message, f)\n", - " f.write(\"\\n\")\n", - "with open(validation_filename, \"rb\") as f:\n", - " validation_file = client.files.create(file=f, purpose=\"fine-tune\")\n", - " validation_file = client.files.wait_for_processing(validation_file.id)\n", - "print(f\"📋 Created validation file:\\n{validation_file.to_json(indent=2)}\")" - ] - }, - { - "cell_type": "markdown", - "id": "ce255aed", - "metadata": {}, - "source": [ - "## 5. 
🏋️‍♂️ Training the Students\n", - "We've got our training dataset, so let's get to fine-tuning!\n", - "\n", - "We've chosen `Ministral-3B` and `gpt-4.1-nano` as the students because they are desperately\n", - "in the need of some help as shown by its benchmarking.\n", + "# Upload files\n", + "with open(training_file_path, \"rb\") as f:\n", + " train_file = openai_client.files.create(file=f, purpose=\"fine-tune\")\n", "\n", - "We'll create a job _suffix_ that identifies our teacher model for posterity and include that\n", - "unique-enough key from before so if you want to run this notebook again you can have two\n", - "different fine-tuning jobs to compare.\n", + "with open(validation_file_path, \"rb\") as f:\n", + " val_file = openai_client.files.create(file=f, purpose=\"fine-tune\")\n", "\n", - "> ℹ️ Look at the use of `extra_body` as a parameter! We're telling Azure OpenAI to try\n", - "> using Global Training for our job. If you want to learn more about Global Training, check\n", - "> out the orginal [announcement](https://aka.ms/Build25/FTGlobalAndDev) from **Build 2025**.\n", + "print(f\"Training: {train_file.id}\")\n", + "print(f\"Validation: {val_file.id}\")\n", "\n", - "Note that we're not tuning hyper-parameters here for our job and just using defaults. That\n", - "is an exercise left for you, dear reader!" 
+ "openai_client.files.wait_for_processing(train_file.id)\n", + "openai_client.files.wait_for_processing(val_file.id)\n", + "print(\"✓ Files ready\")" ] }, { "cell_type": "code", - "execution_count": 21, - "id": "e7e8b7a4", + "execution_count": 23, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "👨‍🔬 Created fine-tuning job:\n", - "{\n", - " \"id\": \"ftjob-22fdc4d28e9e4b80b78b30daeb76cc00\",\n", - " \"created_at\": 1765377627,\n", - " \"hyperparameters\": {\n", - " \"batch_size\": -1,\n", - " \"learning_rate_multiplier\": 1.0,\n", - " \"n_epochs\": 50\n", - " },\n", - " \"model\": \"Ministral-3B\",\n", - " \"object\": \"fine_tuning.job\",\n", - " \"seed\": 170999531,\n", - " \"status\": \"pending\",\n", - " \"training_file\": \"file-8289de70b4af4de98f2136dcd694e0ec\",\n", - " \"validation_file\": \"file-8d808872611d45b1b1d38221d4e777ae\",\n", - " \"estimated_finish\": 1765468527,\n", - " \"method\": {\n", - " \"type\": \"supervised\",\n", - " \"supervised\": {\n", - " \"hyperparameters\": {\n", - " \"batch_size\": -1,\n", - " \"learning_rate_multiplier\": 1.0,\n", - " \"n_epochs\": 50\n", - " }\n", - " }\n", - " },\n", - " \"suffix\": \"sarcasm-DeepSeek-V31-90f2f57b\",\n", - " \"trainingType\": \"globalStandard\"\n", - "}\n" + "✓ Job: ftjob-cbccca9eb73d4da991b925d045133219\n", + " Status: pending\n" ] } ], "source": [ - "# Submit our Ministral-3B training job.\n", - "TEACHER_MODEL = winning_model\n", - "STUDENT_MODEL = \"ministral-3b\"\n", - "SUFFIX = f\"sarcasm-{TEACHER_MODEL}-{UNIQUE_ENOUGH_KEY}\".replace(\".\", \"\") # '.' 
is a reserved character 😜\n", - "\n", - "ministral_job = client.fine_tuning.jobs.create(\n", - " model=STUDENT_MODEL,\n", - " suffix=SUFFIX,\n", - " training_file=training_file.id,\n", - " validation_file=validation_file.id,\n", - " extra_body={ \"trainingType\": \"globalstandard\" },\n", + "# Create fine-tuning job\n", + "fine_tune_job = openai_client.fine_tuning.jobs.create(\n", + " model=base_model,\n", + " training_file=train_file.id,\n", + " validation_file=val_file.id,\n", " method={\n", " \"type\": \"supervised\",\n", " \"supervised\": {\n", " \"hyperparameters\": {\n", - " \"n_epochs\": 50, # Ministral-3B benefits from a high epochs value.\n", + " \"n_epochs\": 3,\n", + " \"batch_size\": 1,\n", + " \"learning_rate_multiplier\": 1.0\n", " }\n", " }\n", - " }\n", + " },\n", + " extra_body={\"trainingType\": \"Standard\"},\n", + " suffix=f\"sarcasm-{UNIQUE_KEY}\"\n", ")\n", - "print(f\"👨‍🔬 Created fine-tuning job:\\n{ministral_job.to_json(indent=2)}\")" + "\n", + "print(f\"✓ Job: {fine_tune_job.id}\")\n", + "print(f\" Status: {fine_tune_job.status}\")" ] }, { "cell_type": "markdown", - "id": "63eb381d", "metadata": {}, "source": [ - "For `gpt-4.1-nano`, we can take advantage of the new *Developer* training tier\n", - "that schedules our job on spot capacity at a 50% discount!" + "## 10. 
Monitor Training" ] }, { "cell_type": "code", - "execution_count": 22, - "id": "ec8f986e", + "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "👨‍🔬 Created fine-tuning job:\n", - "{\n", - " \"id\": \"ftjob-3c27f78ad2c141719573c550300bb791\",\n", - " \"created_at\": 1765377628,\n", - " \"hyperparameters\": {\n", - " \"batch_size\": -1,\n", - " \"learning_rate_multiplier\": 0.1,\n", - " \"n_epochs\": -1\n", - " },\n", - " \"model\": \"gpt-4.1-nano-2025-04-14\",\n", - " \"object\": \"fine_tuning.job\",\n", - " \"seed\": 1003399446,\n", - " \"status\": \"pending\",\n", - " \"training_file\": \"file-8289de70b4af4de98f2136dcd694e0ec\",\n", - " \"validation_file\": \"file-8d808872611d45b1b1d38221d4e777ae\",\n", - " \"estimated_finish\": 1765471227,\n", - " \"suffix\": \"sarcasm-DeepSeek-V31-90f2f57b\",\n", - " \"trainingType\": \"developerTier\"\n", - "}\n" + "Job: ftjob-71e2eea33ecd4d4e990c54ed09ada149\n", + "Status: succeeded\n", + "Model: gpt-4.1-mini-2025-04-14.ft-71e2eea33ecd4d4e990c54ed09ada149-sarcasm-bc6d797e\n" ] } ], "source": [ - "# Now we start training! Submit our fine-tuning job to teach 4.1-nano new tricks.\n", - "TEACHER_MODEL = winning_model\n", - "STUDENT_MODEL = \"gpt-4.1-nano-2025-04-14\"\n", - "SUFFIX = f\"sarcasm-{TEACHER_MODEL}-{UNIQUE_ENOUGH_KEY}\".replace(\".\", \"\") # '.' is a reserved character 😜\n", - "\n", - "nano_job = client.fine_tuning.jobs.create(\n", - " model=STUDENT_MODEL,\n", - " suffix=SUFFIX,\n", - " training_file=training_file.id,\n", - " validation_file=validation_file.id,\n", - " extra_body={ \"trainingType\": \"developerTier\" },\n", - " # We will use default hyperparameters for this job.\n", - ")\n", - "print(f\"👨‍🔬 Created fine-tuning job:\\n{nano_job.to_json(indent=2)}\")" - ] - }, - { - "cell_type": "markdown", - "id": "a85e5e6a", - "metadata": {}, - "source": [ - "Fine-tuning, like Evaluations, take time. 
Now's a good time to go catch up on emails ✉️,\n", - "walk your dog 🐕, or take a nap 😴. We can poll the training status to know when we can\n", - "proceed further in the notebook." + "# Check status\n", + "job_id = fine_tune_job.id\n", + "job = openai_client.fine_tuning.jobs.retrieve(job_id)\n", + "print(f\"Job: {job.id}\")\n", + "print(f\"Status: {job.status}\")\n", + "if job.fine_tuned_model:\n", + " print(f\"Model: {job.fine_tuned_model}\")" ] }, { "cell_type": "code", - "execution_count": 23, - "id": "88aa266f", + "execution_count": 30, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "👨‍🔬 gpt-4.1-nano job ftjob-3c27f78ad2c141719573c550300bb791: succeeded\n", - "👨‍🔬 Ministral-3B job ftjob-22fdc4d28e9e4b80b78b30daeb76cc00: succeeded\n", - "⏱️ Elapsed time: 57 minutes 53 seconds\n", - "🏁 Fine-tuning finished!\n" + "Waiting for fine-tuning...\n", + " [00:00] succeeded\n", + "\n", + "✓ Model: gpt-4.1-mini-2025-04-14.ft-71e2eea33ecd4d4e990c54ed09ada149-sarcasm-bc6d797e\n" ] } ], "source": [ - "# Wait for our FT job to complete. 
You may want to go work on some other tasks for now 😜\n", - "from IPython.display import clear_output\n", - "import time\n", - "\n", + "# Wait for completion (can take 10-30 min)\n", + "print(\"Waiting for fine-tuning...\")\n", "start_time = time.time()\n", "\n", - "while any([j.status not in [\"succeeded\", \"failed\", \"cancelled\"] for j in [nano_job, ministral_job]]):\n", - " time.sleep(10)\n", - " nano_job = client.fine_tuning.jobs.retrieve(nano_job.id)\n", - " ministral_job = client.fine_tuning.jobs.retrieve(ministral_job.id)\n", - " clear_output(wait=True)\n", - " print(f\"👨‍🔬 gpt-4.1-nano job {nano_job.id}: {nano_job.status}\")\n", - " print(f\"👨‍🔬 Ministral-3B job {ministral_job.id}: {ministral_job.status}\")\n", - " print(\"⏱️ Elapsed time: {} minutes {} seconds\".format(int((time.time() - start_time) // 60), int((time.time() - start_time) % 60)))\n", + "while True:\n", + " job = openai_client.fine_tuning.jobs.retrieve(job_id)\n", + " elapsed = int(time.time() - start_time)\n", + " print(f\" [{elapsed//60:02d}:{elapsed%60:02d}] {job.status}\")\n", + " \n", + " if job.status in [\"succeeded\", \"failed\", \"cancelled\"]:\n", + " break\n", + " time.sleep(30)\n", "\n", - "if nano_job.status == \"succeeded\" and ministral_job.status == \"succeeded\":\n", - " print(f\"🏁 Fine-tuning finished!\")\n", + "if job.status == \"succeeded\":\n", + " fine_tuned_model_id = job.fine_tuned_model\n", + " print(f\"\\n✓ Model: {fine_tuned_model_id}\")\n", "else:\n", - " raise RuntimeError(f\"Fine-tuning job did not complete successfully (status={status})\")" - ] - }, - { - "cell_type": "markdown", - "id": "91fc761d", - "metadata": {}, - "source": [ - "Once the jobs complete, we should eyeball the metrics. 
Here's what an example\n", - "should look like:\n", - "\n", - "`TODO: INSERT IMAGE`\n", - "\n", - "If our models are learning, you should see the _training loss_ decreasing.\n", - "This gives us a sense of if the model learned something, but it **doesn't** tell us\n", - "if the model is fit for production!" + " print(f\"\\n✗ {job.status}\")" ] }, { "cell_type": "markdown", - "id": "92f0660a", "metadata": {}, "source": [ - "## 6. 🧑‍⚖️ Judging our Students against their Peers\n", - "Shipping a new model to production right now would be truly a YOLO moment and I\n", - "cannot recommend it.\n", - "\n", - "What we really need to do now is go back and benchmark it against the original\n", - "`Ministral-3B` and `gpt-4.1-nano` base models.\n", - "\n", - "Remember how we split that initial Q&A dataset into two parts? The `qa_validation`\n", - "file contains unseen questions for our models that could not possible appear in the\n", - "training data, so it's the perfect check to answer the question if we actually\n", - "moved the needle.\n", - "\n", - "### 🚢 Deploying our New Models\n", - "Foundry requires you to \"deploy\" a model in order to have an endpoint to call it,\n", - "so let's do that.\n", - "\n", - "We need to reach for a different SDK, though, but no worries! The Azure Cognitive\n", - "Services SDK let's us talk to the Azure OpenAI control plane to deploy things\n", - "without having to leave this comfy notebook.\n", - "\n", - "> ⚠️ The code below uses a `DefaultAzureCredential`. The easiest way to make sure\n", - "> one exists is to have installed and authenticated the Azure CLI tooling.\n", - ">\n", - "> See: https://learn.microsoft.com/en-us/cli/azure/install-azure-cli?view=azure-cli-latest\n", - "\n", - "For `gpt-4.1-nano`, we'll use the `Developer` tier available for Azure OpenAI\n", - "Fine-Tuned models as it's purpose built for model candidate evaluation: you only pay\n", - "per token at the same base model rates! 
([Learn more](https://aka.ms/Build25/FTGlobalAndDev)!)\n", - "\n", - "For Ministral-3B, we'll use `Global Standard` as `Developer` isn't yet ready while\n", - "the model is in Public Preview." + "## 11. Deploy Model" ] }, { "cell_type": "code", - "execution_count": 24, - "id": "6dee6381", + "execution_count": 31, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "🛳️ Submitted deployment ministral-3b-distilled-90f2f57b\n", - "🛳️ Submitted deployment gpt-4.1-nano-distilled-90f2f57b\n" + "Deploying: gpt-4.1-mini-2025-04-14.ft-71e2eea33ecd4d4e990c54ed09ada149-sarcasm-bc6d797e\n", + "✓ Deployed: sarcasm-ft-4aa88349\n" ] } ], "source": [ - "# Now we need to deploy our fine-tuned model. We'll use Developer Tier to keep\n", - "# costs under control for evaluation.\n", - "\n", - "# We can't do this with the OpenAI SDK, so we need to reach for the Azure SDK.\n", - "from azure.identity import DefaultAzureCredential\n", "from azure.mgmt.cognitiveservices import CognitiveServicesManagementClient\n", - "from azure.mgmt.cognitiveservices.models import Deployment, DeploymentProperties, DeploymentModel\n", + "from azure.mgmt.cognitiveservices.models import Deployment, DeploymentProperties, DeploymentModel, Sku\n", "\n", - "cogsvc_client = CognitiveServicesManagementClient(\n", - " credential=DefaultAzureCredential(),\n", - " subscription_id=os.environ.get(\"AZURE_SUBSCRIPTION_ID\"),\n", - ")\n", + "subscription_id = os.environ.get(\"AZURE_SUBSCRIPTION_ID\")\n", + "resource_group = os.environ.get(\"AZURE_RESOURCE_GROUP\")\n", + "account_name = os.environ.get(\"AZURE_AOAI_ACCOUNT\")\n", "\n", - "# Define our Deployments. 
Note the use of SKU for specificy capacity and\n", - "# the name of the deployment tier.\n", - "DEPLOYMENT_CONFIGURATION = [\n", - " {\n", - " \"name\": f\"ministral-3b-distilled-{UNIQUE_ENOUGH_KEY}\",\n", - " \"format\": \"Ministral AI\",\n", - " \"model\": ministral_job.fine_tuned_model,\n", - " \"sku\": \"GlobalStandard\",\n", - " },\n", - " {\n", - " \"name\": f\"gpt-4.1-nano-distilled-{UNIQUE_ENOUGH_KEY}\",\n", - " \"format\": \"OpenAI\",\n", - " \"model\": nano_job.fine_tuned_model,\n", - " \"sku\": \"DeveloperTier\",\n", - " },\n", - "]\n", + "deployment_name = f\"sarcasm-ft-{UNIQUE_KEY}\"\n", "\n", - "DEPLOYMENTS = []\n", - "for d in DEPLOYMENT_CONFIGURATION:\n", - " deployment = cogsvc_client.deployments.begin_create_or_update(\n", - " resource_group_name=os.environ.get(\"AZURE_RESOURCE_GROUP\"),\n", - " account_name=os.environ.get(\"FOUNDRY_PARENT_RESOURCE\"),\n", - " deployment_name=d[\"name\"],\n", - " deployment={\n", - " \"properties\": {\n", - " \"model\": {\n", - " \"format\": d[\"format\"],\n", - " \"name\": d[\"model\"],\n", - " \"version\": \"1\",\n", - " },\n", - " },\n", - " \"sku\": {\"capacity\": 100, \"name\": d[\"sku\"]},\n", - " },\n", + "with CognitiveServicesManagementClient(credential=credential, subscription_id=subscription_id) as mgmt:\n", + " deployment_model = DeploymentModel(format=\"OpenAI\", name=fine_tuned_model_id, version=\"1\")\n", + " deployment_properties = DeploymentProperties(model=deployment_model)\n", + " deployment_sku = Sku(name=\"GlobalStandard\", capacity=50)\n", + " deployment_config = Deployment(properties=deployment_properties, sku=deployment_sku)\n", + " \n", + " print(f\"Deploying: {fine_tuned_model_id}\")\n", + " deployment = mgmt.deployments.begin_create_or_update(\n", + " resource_group_name=resource_group,\n", + " account_name=account_name,\n", + " deployment_name=deployment_name,\n", + " deployment=deployment_config,\n", " )\n", - " print(f\"🛳️ Submitted deployment {d['name']}\")\n", - " 
DEPLOYMENTS.append(deployment)" + " deployment.result()\n", + "\n", + "print(f\"✓ Deployed: {deployment_name}\")" ] }, { "cell_type": "markdown", - "id": "3c9affa8", "metadata": {}, "source": [ - "Deployments, like Evaluations and Training, are not instaneous, but they are typically\n", - "a lot faster! `Ministral-3B` typically will deploy in 3-5 minutes, so maybe don't walk\n", - "away from your desk just yet 😜." + "## 12. Compare Models" ] }, { "cell_type": "code", - "execution_count": 27, - "id": "02c4168a", + "execution_count": 32, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "🏁 Provisioning finished!\n" + "Generating comparison data...\n", + "✓ 20 comparisons\n" ] } ], "source": [ - "# Wait for our deployment to finish provisioning.\n", - "start_time = time.time()\n", - "\n", - "STATUS = [d.status() for d in DEPLOYMENTS]\n", - "while any([s not in [\"Succeeded\", \"Failed\"] for s in STATUS]):\n", - " print(f\"🛳️ Provisioning {STATUS}\")\n", - " print(\"⏱️Elapsed time: {} minutes {} seconds\".format(int((time.time() - start_time) // 60), int((time.time() - start_time) % 60)))\n", - " STATUS = [d.status() for d in DEPLOYMENTS]\n", - " time.sleep(5)\n", - " clear_output(wait=True)\n", - " \n", - "print(f\"🏁 Provisioning finished!\")\n" - ] - }, - { - "cell_type": "markdown", - "id": "88fdda88", - "metadata": {}, - "source": [ - "### ⬆️ Uploading the Validation Data\n", - "This should be old-hat by now! We'll take the in-memory validatation data (`qa_validation`)\n", - "and create a dataset via the Files API." 
- ] - }, - { - "cell_type": "code", - "execution_count": 28, - "id": "5788bf4d", - "metadata": {}, - "outputs": [], - "source": [ - "# Now we'll upload our post-training validation dataset and prepare our final Evaluation.\n", - "# We need to save the data to disk first, again for...reasons.\n", - "filename = f\"./sarcasm-posttraining-{UNIQUE_ENOUGH_KEY}.jsonl\"\n", + "# Generate comparison data\n", + "print(\"Generating comparison data...\")\n", + "comparison_data = []\n", + "test_questions = validation_questions[MAX_VALIDATION:MAX_VALIDATION+20]\n", "\n", - "with open(filename, \"w\") as f:\n", - " for row in qa_validation:\n", - " json.dump(row, f)\n", - " f.write(\"\\n\")\n", + "for item in test_questions:\n", + " q = item['question']\n", + " base_response = generate_response(q, model=base_model)\n", + " ft_response = generate_response(q, model=deployment_name)\n", + " comparison_data.append({\n", + " \"question\": q,\n", + " \"base_answer\": base_response,\n", + " \"ft_answer\": ft_response\n", + " })\n", "\n", - "posttraining_file = None\n", - "with open(filename, \"rb\") as f:\n", - " posttraining_file = client.files.create(purpose=\"evals\", file=f)\n", - " posttraining_file = client.files.wait_for_processing(posttraining_file.id)\n" + "print(f\"✓ {len(comparison_data)} comparisons\")" ] }, { - "cell_type": "markdown", - "id": "35180760", + "cell_type": "code", + "execution_count": 33, "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✓ Evaluation files saved\n" + ] + } + ], "source": [ - "### 🏃 Creating Runs to make our Students Compete with the Base Models\n", - "Here's where Evaluations really shine ✨!\n", - "\n", - "We _could_ reuse our baseline evaluation from before and just add new Runs so all the\n", - "results are in one place and easily viewable.\n", - "\n", - "In this case, we'll keep it simple and create a new Evaluation primarily because the\n", - "visualization code already has color-coding based 
on Eval, not Run and I'm too lazy\n", - "to rewrite it further 😜.\n", + "# Save for evaluation\n", + "base_eval_file = f\"base_eval_{UNIQUE_KEY}.jsonl\"\n", + "ft_eval_file = f\"ft_eval_{UNIQUE_KEY}.jsonl\"\n", "\n", - "Most of this is repeat, in fact we could have reused thigns like `DATA_SOURCE`,\n", - "`TESTING_CRITERIA`, etc. but restate them below for completeness. Same can be said\n", - "for the Runs.\n", + "with open(base_eval_file, \"w\") as f:\n", + " for item in comparison_data:\n", + " f.write(json.dumps({\"question\": item[\"question\"], \"answer\": item[\"base_answer\"]}) + \"\\n\")\n", "\n", - "*However*, we'll focus on just comparing our new models to their base models and also add\n", - "in the teacher as sort of a control. We're testing with totally new data here, unseen by\n", - "any of our models, so what we expect to see is:\n", + "with open(ft_eval_file, \"w\") as f:\n", + " for item in comparison_data:\n", + " f.write(json.dumps({\"question\": item[\"question\"], \"answer\": item[\"ft_answer\"]}) + \"\\n\")\n", "\n", - "1. The base `Ministral-3B` and `gpt-4.1-nano` perform poorly again.\n", - "2. The teacher performs well and on-par with the previous baseline.\n", - "3. 
The new student models outperform their respective base models.\n" + "print(\"✓ Evaluation files saved\")" ] }, { "cell_type": "code", - "execution_count": 31, - "id": "c125c08c", + "execution_count": 34, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Created eval eval_693996f5adac8191a606b12700d6118f\n", - "🏃‍➡️ Created run evalrun_693996f640588191a6d2723f8d53adb5 for eval_693996f5adac8191a606b12700d6118f\n", - "🏃‍➡️ Created run evalrun_693996f706ec8191b0096ec9a8305ffd for eval_693996f5adac8191a606b12700d6118f\n", - "🏃‍➡️ Created run evalrun_693996f7a04881918725f5a01dfc964b for eval_693996f5adac8191a606b12700d6118f\n", - "🏃‍➡️ Created run evalrun_693996f83c488191b358c15ae875ec33 for eval_693996f5adac8191a606b12700d6118f\n", - "🏃‍➡️ Created run evalrun_693996f95cb081918c16ae9f8926f64c for eval_693996f5adac8191a606b12700d6118f\n" + "Base run: evalrun_a716d715fdc24e0facffc8b9a90ce170\n", + "Fine-tuned run: evalrun_5180d075fd104d3e962db18db418176c\n" ] } ], "source": [ - "# Now we create a final Eval using our post-training dataset that doesn't overlap with the\n", - "# original distillation and training dataset. This lets us judge our new model based on\n", - "# data it hasn't seen before. We'll also through in one of our better performing base\n", - "# models as a control.\n", - "POST_EVAL_MODELS = [d[\"name\"] for d in DEPLOYMENT_CONFIGURATION] + [\"gpt-4.1-nano\", \"Ministral-3B\", TEACHER_MODEL]\n", + "# Upload and run evaluations\n", + "with open(base_eval_file, \"rb\") as f:\n", + " base_file = openai_client.files.create(purpose=\"evals\", file=f)\n", + " base_file = openai_client.files.wait_for_processing(base_file.id)\n", "\n", - "# SCHEMA, GRADER_MODEL, and INPUT are re-used from our previous Evaluation definition,\n", - "# but let's restate the source and testing criteria again.\n", - "DATA_SOURCE = {\n", - " \"item_schema\": SCHEMA, \n", - " \"include_sample_schema\": True, # Note this change! 
Needed for data gen.\n", - " \"type\": \"custom\"\n", - "}\n", - "TESTING_CRITERIA = {\n", - " \"name\": \"Auto Sarcasm Grader\",\n", - " \"type\": \"score_model\",\n", - " \"model\": GRADER_MODEL,\n", - " \"input\": INPUT,\n", - " \"range\": [1.0, 10.0],\n", - " \"pass_threshold\": 4.0,\n", - "}\n", - "posttraining_eval = client.evals.create(\n", - " name=f\"sarcasm-posttrain-evaluation-{UNIQUE_ENOUGH_KEY}\",\n", - " data_source_config=DATA_SOURCE,\n", - " testing_criteria=[TESTING_CRITERIA]\n", + "with open(ft_eval_file, \"rb\") as f:\n", + " ft_file = openai_client.files.create(purpose=\"evals\", file=f)\n", + " ft_file = openai_client.files.wait_for_processing(ft_file.id)\n", + "\n", + "base_run = openai_client.evals.runs.create(\n", + " eval_id=grader_eval.id,\n", + " name=f\"base-{UNIQUE_KEY}\",\n", + " data_source={\"type\": \"jsonl\", \"source\": {\"type\": \"file_id\", \"id\": base_file.id}}\n", ")\n", - "print(f\"Created eval {posttraining_eval.id}\")\n", "\n", - "# Now add our runs.\n", - "postraining_runs = []\n", - "for model in POST_EVAL_MODELS:\n", - " RUN_DATA_SOURCE = {\n", - " \"type\": \"completions\",\n", - " \"model\": model,\n", - " \"source\": { \"type\": \"file_id\", \"id\": posttraining_file.id },\n", - " \"input_messages\": {\n", - " \"type\": \"template\",\n", - " \"template\": [\n", - " { \n", - " \"type\": \"message\", \n", - " \"role\": \"system\", \n", - " \"content\": { \"type\": \"input_text\", \"text\": SYSTEM_PROMPT },\n", - " },\n", - " { \n", - " \"type\": \"message\", \n", - " \"role\": \"user\", \n", - " \"content\": { \"type\": \"input_text\", \"text\": \"{{item.question}}\" },\n", - " },\n", - " ],\n", - " },\n", - " }\n", - " run = client.evals.runs.create(\n", - " name=f\"{model}\", \n", - " eval_id=posttraining_eval.id,\n", - " data_source=RUN_DATA_SOURCE, \n", - " )\n", - " print(f\"🏃‍➡️ Created run {run.id} for {posttraining_eval.id}\")\n", - " postraining_runs.append(run)\n" - ] - }, - { - "cell_type": "markdown", - 
"id": "4de42c84", - "metadata": {}, - "source": [ - "Again, we wait! ⏱️" + "ft_run = openai_client.evals.runs.create(\n", + " eval_id=grader_eval.id,\n", + " name=f\"finetuned-{UNIQUE_KEY}\",\n", + " data_source={\"type\": \"jsonl\", \"source\": {\"type\": \"file_id\", \"id\": ft_file.id}}\n", + ")\n", + "\n", + "print(f\"Base run: {base_run.id}\")\n", + "print(f\"Fine-tuned run: {ft_run.id}\")" ] }, { "cell_type": "code", - "execution_count": 32, - "id": "67769d7d", + "execution_count": 35, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "🏃‍➡️ Run ministral-3b-distilled-90f2f57b: completed\n", - "🏃‍➡️ Run gpt-4.1-nano-distilled-90f2f57b: completed\n", - "🏃‍➡️ Run gpt-4.1-nano: completed\n", - "🏃‍➡️ Run Ministral-3B: completed\n", - "🏃‍➡️ Run DeepSeek-V3.1: completed\n", - "⏱️ Elapsed time: 1 minutes 44 seconds\n", - "🏁 All 5 runs completed!\n" + "Base: completed\n", + "Fine-tuned: completed\n", + "✓ Evaluations complete\n" ] } ], "source": [ - "# Again, we wait for our runs to finish.\n", - "start_time = time.time()\n", + "# Wait for both\n", + "for run_id, name in [(base_run.id, \"Base\"), (ft_run.id, \"Fine-tuned\")]:\n", + " while True:\n", + " run = openai_client.evals.runs.retrieve(run_id=run_id, eval_id=grader_eval.id)\n", + " if run.status in [\"completed\", \"failed\"]:\n", + " print(f\"{name}: {run.status}\")\n", + " break\n", + " time.sleep(5)\n", "\n", - "while any([r.status not in [\"completed\", \"failed\"] for r in postraining_runs]):\n", - " time.sleep(10)\n", - " clear_output(wait=True)\n", - "\n", - " for i in range(len(postraining_runs)):\n", - " postraining_runs[i] = client.evals.runs.retrieve(eval_id=posttraining_eval.id, run_id=postraining_runs[i].id)\n", - " print(f\"🏃‍➡️ Run {postraining_runs[i].name}: {postraining_runs[i].status}\")\n", - " \n", - " now = time.time()\n", - " print(\"⏱️ Elapsed time: {} minutes {} seconds\".format(int((now - start_time) // 60), int((now - start_time) % 60)))\n", - "\n", 
- "print(f\"🏁 All {len(postraining_runs)} runs completed!\")" + "print(\"✓ Evaluations complete\")" ] }, { "cell_type": "markdown", - "id": "e482042a", "metadata": {}, "source": [ - "### 📊 Interpreting the Post-Training Results\n", - "Let's first look at _just the new Evaluation_ to see if our new model outperforms its\n", - "competition (`4.1-nano` and `4.1`)." + "## 13. Results" ] }, { "cell_type": "code", - "execution_count": 33, - "id": "ae06ec92", + "execution_count": 36, "metadata": {}, "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Get Evaluation Runs: eval_693996f5adac8191a606b12700d6118f\n", - "\n", - "==================================================\n", - "Combined Evaluation Summary\n", - "==================================================\n", - " Run ID Run Name Model Status Pass Percentage (%) Error Percentage (%) Evaluation ID Evaluation Name\n", - "evalrun_693996f95cb081918c16ae9f8926f64c DeepSeek-V3.1 DeepSeek-V3.1 completed 97.000000 0.0 eval_693996f5adac8191a606b12700d6118f sarcasm-posttrain-evaluation-90f2f57b\n", - "evalrun_693996f706ec8191b0096ec9a8305ffd gpt-4.1-nano-distilled-90f2f57b gpt-4.1-nano-di completed 83.838384 1.0 eval_693996f5adac8191a606b12700d6118f sarcasm-posttrain-evaluation-90f2f57b\n", - "evalrun_693996f640588191a6d2723f8d53adb5 ministral-3b-distilled-90f2f57b ministral-3b-di completed 74.000000 0.0 eval_693996f5adac8191a606b12700d6118f sarcasm-posttrain-evaluation-90f2f57b\n", - "evalrun_693996f83c488191b358c15ae875ec33 Ministral-3B Ministral-3B completed 31.000000 0.0 eval_693996f5adac8191a606b12700d6118f sarcasm-posttrain-evaluation-90f2f57b\n", - "evalrun_693996f7a04881918725f5a01dfc964b gpt-4.1-nano gpt-4.1-nano completed 26.000000 0.0 eval_693996f5adac8191a606b12700d6118f sarcasm-posttrain-evaluation-90f2f57b\n", - "==================================================\n" - ] - }, - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, { "name": "stdout", "output_type": "stream", "text": [ "==================================================\n", - "Fetching scores...\n", + "RESULTS\n", "==================================================\n", + "Baseline (gold): 0.59\n", + "Base model: 0.82 (±0.21)\n", + "Fine-tuned: 0.94 (±0.06)\n", "\n", - "==================================================\n", - "Score Summary Table:\n", - " Model Evaluation Name Average Score Min Score Max Score 10th Percentile 25th Percentile 50th Percentile 75th Percentile 90th Percentile\n", - " DeepSeek-V3.1 sarcasm-posttrain-evaluation-90f2f57b 7.30 1.00 9.00 6.90 7.00 7.50 8.00 8.00\n", - "gpt-4.1-nano-di sarcasm-posttrain-evaluation-90f2f57b 6.26 1.00 8.00 1.00 6.00 7.00 8.00 8.00\n", - "ministral-3b-di sarcasm-posttrain-evaluation-90f2f57b 5.38 1.00 9.00 1.00 3.00 6.00 7.00 8.00\n", - " gpt-4.1-nano sarcasm-posttrain-evaluation-90f2f57b 2.81 1.00 8.00 1.00 1.00 3.00 4.00 6.00\n", - " Ministral-3B sarcasm-posttrain-evaluation-90f2f57b 2.74 1.00 8.00 1.00 1.00 1.00 5.00 7.00\n", - "==================================================\n" + "Improvement: +0.12\n" ] - }, - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" } ], "source": [ - "# Let's visualize our post-training evaluation. Fingers crossed!\n", - "display_evaluation_summary(client, [posttraining_eval.id], x_range=(1, 10))" + "def get_scores(eval_id, run_id):\n", + " items = list(openai_client.evals.runs.output_items.list(run_id=run_id, eval_id=eval_id))\n", + " return [r.score for item in items for r in item.results if r.score is not None]\n", + "\n", + "base_scores = get_scores(grader_eval.id, base_run.id)\n", + "ft_scores = get_scores(grader_eval.id, ft_run.id)\n", + "\n", + "print(\"=\" * 50)\n", + "print(\"RESULTS\")\n", + "print(\"=\" * 50)\n", + "print(f\"Baseline (gold): {np.mean(baseline_scores):.2f}\")\n", + "print(f\"Base model: {np.mean(base_scores):.2f} (±{np.std(base_scores):.2f})\")\n", + "print(f\"Fine-tuned: {np.mean(ft_scores):.2f} (±{np.std(ft_scores):.2f})\")\n", + "print(f\"\\nImprovement: {np.mean(ft_scores) - np.mean(base_scores):+.2f}\")" ] }, { "cell_type": "code", - "execution_count": 34, - "id": "b0bfe89e", + "execution_count": 38, "metadata": {}, "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Get Evaluation Runs: eval_693984204a7c8191b08abdc67b5d5e23\n", - "Get Evaluation Runs: eval_693996f5adac8191a606b12700d6118f\n", - "\n", - "==================================================\n", - "Combined Evaluation Summary\n", - "==================================================\n", - " Run ID Run Name Model Status Pass Percentage (%) Error Percentage (%) Evaluation ID Evaluation Name\n", - "evalrun_693996f95cb081918c16ae9f8926f64c DeepSeek-V3.1 DeepSeek-V3.1 completed 97.000000 0.00 eval_693996f5adac8191a606b12700d6118f sarcasm-posttrain-evaluation-90f2f57b\n", - "evalrun_69398420f92081919db1b05086cd59ce DeepSeek-V3.1-90f2f57b DeepSeek-V3.1 completed 93.750000 3.00 eval_693984204a7c8191b08abdc67b5d5e23 sacarsm-baseline-90f2f57b\n", - "evalrun_69398421b4308191bc33dfac7567f082 gpt-4.1-90f2f57b gpt-4.1 
completed 84.750000 5.25 eval_693984204a7c8191b08abdc67b5d5e23 sacarsm-baseline-90f2f57b\n", - "evalrun_693996f706ec8191b0096ec9a8305ffd gpt-4.1-nano-distilled-90f2f57b gpt-4.1-nano-di completed 83.838384 1.00 eval_693996f5adac8191a606b12700d6118f sarcasm-posttrain-evaluation-90f2f57b\n", - "evalrun_693984246064819180e8989ca09c597a gpt-4o-mini-90f2f57b gpt-4o-mini completed 80.000000 2.75 eval_693984204a7c8191b08abdc67b5d5e23 sacarsm-baseline-90f2f57b\n", - "evalrun_6939842251f08191b1c8a9e4c5d0b2b1 gpt-4.1-mini-90f2f57b gpt-4.1-mini completed 74.686717 4.00 eval_693984204a7c8191b08abdc67b5d5e23 sacarsm-baseline-90f2f57b\n", - "evalrun_69398423c1008191b859c069dbce521e gpt-4o-90f2f57b gpt-4o completed 74.500000 3.50 eval_693984204a7c8191b08abdc67b5d5e23 sacarsm-baseline-90f2f57b\n", - "evalrun_693996f640588191a6d2723f8d53adb5 ministral-3b-distilled-90f2f57b ministral-3b-di completed 74.000000 0.00 eval_693996f5adac8191a606b12700d6118f sarcasm-posttrain-evaluation-90f2f57b\n", - "evalrun_693996f83c488191b358c15ae875ec33 Ministral-3B Ministral-3B completed 31.000000 0.00 eval_693996f5adac8191a606b12700d6118f sarcasm-posttrain-evaluation-90f2f57b\n", - "evalrun_6939842511a48191a67a392e04471fc1 Ministral-3B-90f2f57b Ministral-3B completed 29.500000 1.50 eval_693984204a7c8191b08abdc67b5d5e23 sacarsm-baseline-90f2f57b\n", - "evalrun_69398422f7508191a52bfb35d10a2542 gpt-4.1-nano-90f2f57b gpt-4.1-nano completed 27.000000 2.75 eval_693984204a7c8191b08abdc67b5d5e23 sacarsm-baseline-90f2f57b\n", - "evalrun_693996f7a04881918725f5a01dfc964b gpt-4.1-nano gpt-4.1-nano completed 26.000000 0.00 eval_693996f5adac8191a606b12700d6118f sarcasm-posttrain-evaluation-90f2f57b\n", - "==================================================\n" - ] - }, - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "==================================================\n", - "Fetching scores...\n", - "==================================================\n", - "\n", - "==================================================\n", - "Score Summary Table:\n", - " Model Evaluation Name Average Score Min Score Max Score 10th Percentile 25th Percentile 50th Percentile 75th Percentile 90th Percentile\n", - " DeepSeek-V3.1 sacarsm-baseline-90f2f57b 7.07 0.00 10.00 5.00 7.00 8.00 8.00 8.00\n", - " gpt-4.1 sacarsm-baseline-90f2f57b 5.80 0.00 9.00 3.00 5.00 6.00 7.00 8.00\n", - " gpt-4o-mini sacarsm-baseline-90f2f57b 5.54 0.00 9.00 3.00 4.00 6.00 7.00 8.00\n", - " gpt-4o sacarsm-baseline-90f2f57b 5.26 0.00 9.00 3.00 3.00 6.00 7.00 8.00\n", - " gpt-4.1-mini sacarsm-baseline-90f2f57b 5.18 0.00 9.00 3.00 3.00 6.00 7.00 8.00\n", - " gpt-4.1-nano sacarsm-baseline-90f2f57b 2.85 0.00 8.00 1.00 1.00 2.00 4.00 6.00\n", - " Ministral-3B sacarsm-baseline-90f2f57b 2.70 0.00 8.00 1.00 1.00 1.00 5.00 7.00\n", - " DeepSeek-V3.1 sarcasm-posttrain-evaluation-90f2f57b 7.30 1.00 9.00 6.90 7.00 7.50 8.00 8.00\n", - "gpt-4.1-nano-di sarcasm-posttrain-evaluation-90f2f57b 6.26 1.00 8.00 1.00 6.00 7.00 8.00 8.00\n", - "ministral-3b-di sarcasm-posttrain-evaluation-90f2f57b 5.38 1.00 9.00 1.00 3.00 6.00 7.00 8.00\n", - " gpt-4.1-nano sarcasm-posttrain-evaluation-90f2f57b 2.81 1.00 8.00 1.00 1.00 3.00 4.00 6.00\n", - " Ministral-3B sarcasm-posttrain-evaluation-90f2f57b 2.74 1.00 8.00 1.00 1.00 1.00 5.00 7.00\n", - "==================================================\n" - ] - }, { "data": { - "image/png": "", + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAxYAAAHpCAYAAAAf5apCAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjgsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvwVt1zgAAAAlwSFlzAAAPYQAAD2EBqD+naQAAPz1JREFUeJzt3XlcFuX+//H3DQqCIijIoiG4o5WQmOZ21COFS5Cap9RKJTVLOZq06clc0sTqZGaa5oLaaXHJtDqapqZlqbmFfi3DPUgFFxJcQWF+f/jjPt6Byu0AN+Tr+XjcD++55rpmPoOM8HbmmttiGIYhAAAAADDBydEFAAAAACj7CBYAAAAATCNYAAAAADCNYAEAAADANIIFAAAAANMIFgAAAABMI1gAAAAAMI1gAQAAAMA0ggUAAAAA0wgWAAAAAEwjWAAAANzmFi5cqCZNmsjNzU1Vq1ZVjx49dPDgwRuOOXnypIYNG6Y6deqoQoUKCg4O1siRI5WVlVVCVaO0sRiGYTi6CAAAADjG3LlzNWDAAElSrVq1dPr0aWVmZsrX11e7du2Sv79/vjFZWVkKDQ1VUlKSXF1dFRISoqSkJF26dEldu3bVsmXLSvowUApwxQIAAOA2lZ2drREjRkiSHn74YR06dEh79+6Vh4eHTpw4oYkTJxY4bt26dUpKSpIkLV26VImJifriiy8kScuXL9emTZtK5gBQqhAsAAAAblPbtm3TqVOnJF0NFpJUvXp13XfffZKkVatWFTguNzfX+t7JycnmT0lau3ZtsdSL0q2cowsAAACAY6SkpFjf+/r6Wt/7+flJkpKTkwsc17p1awUEBOj48ePq3r279VaoPEePHi2milGaccUCAAAANm42BdfLy0tr165VVFSUKlasqCNHjqhr167y8vKSJJUvX74EqkRpwxULAACA21RgYKD1/YkTJ/K9r1mz5nXHNmrUyDqvQpKOHTumTz75RJLUoEGDoi4VZQBXLAAAAG5T9957r7y9vSVdnYQtXQ0IW7ZskSR17NhRkhQSEqKQkBBNmzbNOnbLli3WR8tevHhR//znPyVdvVrRvXv3EjsGlB4ECwAAgNuUi4uL9clPS5cuVe3atdWwYUOdPXtWPj4+1idGJSUlKSkpyTrRW5ImTJggHx8fNW7cWAEBAfrss88kSW+++aZq1KhR8gcDhyNYAAAA3MaeeuopffjhhwoLC9OxY8dksVjUvXt3bdq0SdWrV7/uuLZt28rf31/79+/XlStX1Lp1ay1btkzDhg0rwepRmvABeQAAAABM44oFAAAAANMIFgAAAABMI1gAAAAAMI1gAQAAAMA0ggUAAAAA0wgWAAAAAEwjWAAAAAAwjWABAAAAwDSCBQAAAADTCBYAAAAATCNYAAAAADCNYAEAAADANIIFAAAAANMIFgAAAABMI1gAAAAAMI1gAQAAAMA0ggUAAAAA0wgWAAAAAEwjWAAAAAAwjWABAAAAwDSCBQAAAADTCBYAAAAATCNYAAAAADCNYAEAAADANIIFAAAAANMIFgAAAABMI1gAAAAAMI1gAQAAAMA0ggUAAAAA0wgWAPAXZLFYNHbsWLvHHTlyRBaLRfPnzy/ymv6KgoOD1a9fP0eXAQClAsECAIrJ/PnzZbFYZLFY9P333+dbbxiGAgMDZbFY9OCDDzqgQvPS0tL0/PPPKyQkRO7u7qpYsaLCw8M1YcIEnTlzxtHlAQBKUDlHFwAAf3UVKlTQxx9/rNatW9u0f/vtt/r999/l6urqoMrM2bZtmzp37qxz587p8ccfV3h4uCRp+/btmjRpkr777jt9/fXXDq6yeCUlJcnJif+jAwCJYAEAxa5z585asmSJpk6dqnLl/vfP7scff6zw8HCdOnXKgdXdmjNnzqhbt25ydnbWTz/9pJCQEJv1r732mmbPnu2g6oqXYRi6dOmS3NzcymwoBIDiwH+zAEAx69Wrl06fPq01a9ZY27Kzs/Xpp5+qd+/eBY45f/6
8nnvuOQUGBsrV1VUNGjTQv//9bxmGYdMvKytLw4cPV7Vq1eTh4aHo6Gj9/vvvBW7z6NGjevLJJ+Xn5ydXV1fdeeedSkhIuKVjev/993X06FFNnjw5X6iQJD8/P40aNcq6vHDhQgUGBsrJyUkWi0Vubm56/PHH890u1a5dO911113avXu3WrZsqfLly1tfVapUUUhIiGrXri03Nzc1aNBAa9eutRk/duxYWSwW/frrr3rkkUdUuXJleXt7a9iwYbp06ZJN33nz5unvf/+7fH195erqqkaNGmnGjBn5jiU4OFgPPvigVq9eraZNm8rNzU3vv/++dd21cywuX76scePGqV69eqpQoYK8vb3VunVrm797Sfrmm2/Upk0bVaxYUV5eXnrooYe0d+/eAo/lwIED6tevn7y8vOTp6amYmBhduHDh+n85AOAgXLEAgGIWHBysFi1a6JNPPlGnTp0kSV999ZUyMjLUs2dPTZ061aa/YRiKjo7W+vXr1b9/f4WFhWn16tV64YUXdPToUb399tvWvgMGDNCHH36o3r17q2XLlvrmm2/UpUuXfDWkpaXpvvvuk8ViUWxsrKpVq6avvvpK/fv3V2Zmpp599lm7jumLL76Qm5ubevTocdO+c+fO1YABAyRdvS1Mki5duqSPPvpIP//8s7Zu3ary5ctb+//xxx968MEHZRiGrly5Ym3Pzc1VUlKS9bi/+uor9ejRQykpKfLw8LDZ5yOPPKLg4GDFx8dry5Ytmjp1qv744w998MEH1j4zZszQnXfeqejoaJUrV05ffvmlBg8erNzcXA0ZMsRme0lJSerVq5cGDRqkgQMHqkGDBgUe69ixYxUfH68BAwaoWbNmyszM1Pbt27Vz507df//9kqS1a9eqU6dOql27tsaOHauLFy/q3XffVatWrbRz504FBwfnO5ZatWopPj5eO3fu1Jw5c+Tr66vXX3/9pl97AChRBgCgWMybN8+QZGzbts2YNm2a4eHhYVy4cMEwDMP4xz/+YbRv394wDMMICgoyunTpYh23fPlyQ5IxYcIEm+316NHDsFgsxoEDBwzDMIzExERDkjF48GCbfr179zYkGWPGjLG29e/f3wgICDBOnTpl07dnz56Gp6enta7Dhw8bkox58+bd8NiqVKlihIaG3vRrkJWVZVStWtWQZPj6+ho5OTnG0aNHDQ8PD0OSIclISEiw9m/btq0hyfjoo4+M8uXLG5KMnj17GpIMJycn65h3333XWL16db5ax4wZY0gyoqOjbeoYPHiwIcnYtWuXtS3vmK8VGRlp1K5d26YtKCjIkGSsWrUqX/+goCCjb9++1uXQ0FCbv8uChIWFGb6+vsbp06etbbt27TKcnJyMPn365DuWJ5980mZ8t27dDG9v7xvuAwAcgVuhAKAEPPLII7p48aL++9//6uzZs/rvf/973dugVq5cKWdnZw0dOtSm/bnnnpNhGPrqq6+s/STl6/fnqw+GYWjp0qWKioqSYRg6deqU9RUZGamMjAzt3LnTruPJzMzMd5WgINu2bVN6erokqV+/fnJyclL16tV13333SZKcnJy0YsUKmzGVKlVSr1691KpVK0nSkiVL5OzsbH3CVnR0tPr166fmzZtLkg4dOpRvv3++4vDPf/5T0v++ZpLk5uZmfZ+RkaFTp06pbdu2OnTokDIyMmzG16pVS5GRkTc9Xi8vL/3888/av39/geuPHz+uxMRE9evXT1WrVrW2N27cWPfff79NfXmefvppm+U2bdro9OnTyszMvGk9AFCSCBYAUAKqVaumiIgIffzxx/rss8+Uk5Nz3duIfvvtN1WvXj3fL+4NGza0rs/708nJSXXq1LHp9+fbdE6ePKkzZ85o1qxZqlatms0rJiZGknTixAm7jqdy5co6e/bsTfulpKRY3991113W935+fpKuhp6848lzxx13yGKxaNmyZYqMjFROTo71ValSJd1zzz1yd3eXp6enpKu3Tv1ZvXr1bJbr1Kk
jJycnHTlyxNr2ww8/KCIiwjrPoVq1avrXv/4lSQUGi8J49dVXdebMGdWvX1933323XnjhBe3evdu6Pu9YC7qVqmHDhjp16pTOnz9v016zZk2b5SpVqkgq+LgBwJGYYwEAJaR3794aOHCgUlNT1alTJ3l5eZXIfnNzcyVJjz/+uPr27Vtgn8aNG9u1zZCQECUmJio7O1suLi5212T8aRL6tZydnSVJI0eO1OrVq9WjRw9t3bpVNWrU0I4dOzRu3Dh5eXlZr8zcaFt5LBaLzfLBgwfVoUMHhYSEaPLkyQoMDJSLi4tWrlypt99+2/o1y3Pt1Y0b+dvf/qaDBw/q888/19dff605c+bo7bff1syZM63zTOyV9/X4s8IcNwCUJK5YAEAJ6datm5ycnLRly5br3gYlSUFBQTp27Fi+KwK//vqrdX3en7m5uTp48KBNv7wJznnynhiVk5OjiIiIAl++vr52HUtUVJQuXryopUuX3rBfYGCg9f2ePXus7/OukFgsFuvxXGv//v2aOXOmpKuBzGKxqGrVqtYnUP35aVAFjb/WgQMHlJuba50Y/eWXXyorK0tffPGFBg0apM6dOysiIqLQAeJGqlatqpiYGH3yySdKSUlR48aNrZ+Cnnesf/47kq7+/fr4+KhixYqmawAARyBYAEAJqVSpkmbMmKGxY8cqKirquv06d+6snJwcTZs2zab97bfflsVisT5ZKu/PPz9VasqUKTbLzs7Oevjhh7V06VKbX+7znDx50u5jefrppxUQEKDnnntO+/bty7f+xIkTmjBhgu69917rrTsLFiyQYRg6duyYtmzZIunq1ZQuXbooJCREISEhOnr0qCTbW5G2b98u6eojevNuZbrZL9/Tp0+3WX733Xcl/e9rlncV4Nr/9c/IyNC8efMK9wW4jtOnT9ssV6pUSXXr1lVWVpYkKSAgQGFhYVqwYIHNo3b37Nmjr7/+Wp07dza1fwBwJG6FAoASdL1bka4VFRWl9u3b6+WXX9aRI0cUGhqqr7/+Wp9//rmeffZZ65yKsLAw9erVS++9954yMjLUsmVLrVu3TgcOHMi3zUmTJmn9+vVq3ry5Bg4cqEaNGik9PV07d+7U2rVrrROsC6tKlSpatmyZOnfurLCwMJtP3t65c6c++eQTtWjRQqNGjdKkSZM0aNAgpaWlqWLFitYPmMs7hscff1xPPvmkpKv/o+/q6qrQ0FDVqVNHBw8e1MSJE1W+fHkdO3ZMly9fliT16dPnhvUdPnxY0dHR6tixozZv3mx9JG9oaKgk6YEHHpCLi4uioqI0aNAgnTt3TrNnz5avr6+OHz9u19fiWo0aNVK7du0UHh6uqlWravv27fr0008VGxtr7fPmm2+qU6dOatGihfr372993Kynp6f1ygYAlEmOehwVAPzVXfu42Rv58+NmDcMwzp49awwfPtyoXr26Ub58eaNevXrGm2++aeTm5tr0u3jxojF06FDD29vbqFixohEVFWWkpKTke9ysYRhGWlqaMWTIECMwMNAoX7684e/vb3To0MGYNWuWtU9hHzeb59ixY8bw4cON+vXrGxUqVDDc3d2N8PBw47XXXjMyMjKs/T788EOjRo0ahsViMSQZrq6uRu/evY0//vjDMAzD+hjZoKAg48477zQMwzBSUlKMp59+2qhVq5YhyXBxcTHatWtnrFy50rpdScaQIUOsy3mPaP3ll1+MHj16GB4eHkaVKlWM2NhY4+LFiza1f/HFF0bjxo2NChUqGMHBwcbrr79uJCQkGJKMw4cP3/Dv59p11z5udsKECUazZs0MLy8vw83NzQgJCTFee+01Izs722bc2rVrjVatWhlubm5G5cqVjaioKOOXX36x6ZN3LCdPnrRpz/u+urZGACgNLIbB7C8AwF/D2LFjNW7cOJ08eVI+Pj6OLgcAbivMsQAAAABgGsECAAAAgGkECwAAAACmOTRYfPfdd4qKilL16tVlsVi0fPnym47ZsGGDmjR
pIldXV9WtW1fz588v9joBAGXD2LFjZRgG8ysAwAEcGizOnz+v0NDQfM8bv57Dhw+rS5cuat++vRITE/Xss89qwIABWr16dTFXCgAAAOBGSs1ToSwWi5YtW6auXbtet89LL72kFStW2HzAU8+ePXXmzBmtWrWqwDFZWVnWDyaSrn4YU3p6ury9vWWxWIqsfgAAAOCvxjAMnT17VtWrV5eT042vSZSpD8jbvHmzIiIibNoiIyP17LPPXndMfHy8xo0bV8yVAQAAAH9dKSkpuuOOO27Yp0wFi9TUVPn5+dm0+fn5KTMzUxcvXpSbm1u+MSNHjlRcXJx1OSMjQzVr1lRKSooqV65c7DUDAAAAZVVmZqYCAwPl4eFx075lKljcCldXV7m6uuZrr1y5MsECAAAAKITCTCEoU4+b9ff3V1pamk1bWlqaKleuXODVCgAAAAAlo0wFixYtWmjdunU2bWvWrFGLFi0cVBEAAAAAycHB4ty5c0pMTFRiYqKkq4+TTUxMVHJysqSr8yP69Olj7f/000/r0KFDevHFF/Xrr7/qvffe0+LFizV8+HBHlA8AAADg/3NosNi+fbvuuece3XPPPZKkuLg43XPPPRo9erQk6fjx49aQIUm1atXSihUrtGbNGoWGhuqtt97SnDlzFBkZ6ZD6AQAAAFxVaj7HoqRkZmbK09NTGRkZTN4GAAAAbsCe353L1BwLAAAAAKUTwQIAAACAaQQLAAAAAKYRLAAAAACYRrAAAAAAYBrBAgAAAIBpBAsAAAAAphEsAAAAAJhGsAAAAABgGsECAAAAgGkECwAAAACmESwAAAAAmEawAAAAAGAawQIAAACAaQQLAAAAAKYRLAAAAACYRrAAAAAAYFo5RxcAAABwK0J3xDm6BKDY7Qqf7OgSCo0rFgAAAABMI1gAAAAAMI1gAQAAAMA0ggUAAAAA0wgWAAAAAEwjWAAAAAAwjWABAAAAwDSCBQAAAADTCBYAAAAATCNYAAAAADCNYAEAAADANIIFAAAAANMIFgAAAABMI1gAAAAAMI1gAQAAAMA0ggUAAAAA0wgWAAAAAEwjWAAAAAAwjWABAAAAwDSCBQAAAADTCBYAAAAATCNYAAAAADCNYAEAAADANIIFAAAAANMIFgAAAABMI1gAAAAAMI1gAQAAAMA0ggUAAAAA0wgWAAAAAEwjWAAAAAAwjWABAAAAwDSCBQAAAADTCBYAAAAATCNYAAAAADCNYAEAAADANIIFAAAAANMIFgAAAABMI1gAAAAAMI1gAQAAAMA0ggUAAAAA0wgWAAAAAEwjWAAAAAAwjWABAAAAwLRyji4AAEqje98/4OgSgBKxbVBdR5cA4C+CKxb4S1i4cKGaNGkiNzc3Va1aVT169NDBgwdvOu7w4cPq16+fAgIC5OLiIj8/P3Xp0kUZGRklUDUAAMBfB1csUObNnTtXAwYMkCTVqlVLp0+f1tKlS7Vx40bt2rVL/v7+BY7bt2+fWrZsqdOnT8vd3V0NGzZUdna21qxZo7Nnz8rT07MkDwMAAKBM44oFyrTs7GyNGDFCkvTwww/r0KFD2rt3rzw8PHTixAlNnDjxumOHDh2q06dPq3379jp69Kh27dqlvXv3KiMj47phBAAAAAVzeLCYPn26goODVaFCBTVv3lxbt269Yf8pU6aoQYMGcnNzU2BgoIYPH65Lly6VULUobbZt26ZTp05JuhosJKl69eq67777JEmrVq0qcNwff/yhr7/+WpJUpUoVNW3aVB4eHrrvvvv0/fffq1w5LuYBAADYw6HBYtGiRYqLi9OYMWO0c+dOhYaGKjIyUidOnCiw/8cff6wRI0ZozJgx2rt3r+bOnatFixbpX//6VwlXjtIiJSXF+t7X19f63s/PT5KUnJxc4Lj9+/fLMAxJ0meffabc3FxVqFBBP/74ozp16qQff/yxGKsGAAD463FosJg8ebIGDhyomJgYNWrUSDNnzpS7u7sSEhIK7L9p0ya1atV
KvXv3VnBwsB544AH16tXrplc5cPvJCw3Xc+XKFev7iIgIHTx4UAcOHFDVqlWVk5OjGTNmFHeJAAAAfykOCxbZ2dnasWOHIiIi/leMk5MiIiK0efPmAse0bNlSO3bssAaJQ4cOaeXKlercufN195OVlaXMzEybF/46AgMDre+vvdKV975mzZoFjqtRo4b1fdOmTWWxWOTp6an69etLko4cOVIM1QIAAPx1OSxYnDp1Sjk5OdZbVvL4+fkpNTW1wDG9e/fWq6++qtatW6t8+fKqU6eO2rVrd8NboeLj4+Xp6Wl9XfuLKMq+e++9V97e3pKkpUuXSpKOHTumLVu2SJI6duwoSQoJCVFISIimTZsmSQoKClK9evUkSTt27JBhGMrMzNS+ffskyboOAAAAhePwydv22LBhgyZOnKj33ntPO3fu1GeffaYVK1Zo/Pjx1x0zcuRIZWRkWF/X3pOPss/FxcX65KelS5eqdu3aatiwoc6ePSsfHx/rE6OSkpKUlJRknegtSZMmTZLFYtGaNWtUt25d1a1bV+np6apYsaLi4uIccjwAAABllcOChY+Pj5ydnZWWlmbTnpaWdt1Hfb7yyit64oknNGDAAN19993q1q2bJk6cqPj4eOXm5hY4xtXVVZUrV7Z54a/lqaee0ocffqiwsDAdO3ZMFotF3bt316ZNm1S9evXrjuvevbuWL1+ue++9V8eOHZOTk5O6du2q7du3q2HDhiV4BAAAAGWfw56p6eLiovDwcK1bt05du3aVJOXm5mrdunWKjY0tcMyFCxfk5GSbhZydnSXdfLIu/toee+wxPfbYY9ddf73vj+joaEVHRxdXWQAAALcNhz6sPy4uTn379lXTpk3VrFkzTZkyRefPn1dMTIwkqU+fPqpRo4bi4+MlSVFRUZo8ebLuueceNW/eXAcOHNArr7yiqKgoa8AAAAAAUPIcGiweffRRnTx5UqNHj1ZqaqrCwsK0atUqm88guPYKxahRo2SxWDRq1CgdPXpU1apVU1RUlF577TVHHQIAAAAASRbjNruHKDMzU56ensrIyGC+BYDruvf9A44uASgR2wbVdXQJtyx0Bw/awF/frvDJDt2/Pb87l6mnQgEAAAAonQgWAAAAAEwjWAAAAAAwjWABAAAAwDSCBQAAAADTCBYAAAAATCNYAAAAADCNYAEAAADANIIFAAAAANMIFgAAAABMI1gAAAAAMI1gAQAAAMA0ggUAAAAA0wgWAAAAAEwjWAAAAAAwjWABAAAAwDSCBQAAAADTCBYAAAAATCNYAAAAADCNYAEAAADANIIFAAAAANMIFgAAAABMI1gAAAAAMI1gAQAAAMA0ggUAAAAA0wgWAAAAAEwjWAAAAAAwjWABAAAAwDSCBQAAAADTCBYAAAAATCNYAAAAADCNYAEAAADANIIFAAAAANMIFgAAAABMI1gAAAAAMI1gAQAAAMA0ggUAAAAA0wgWAAAAAEwjWAAAAAAwjWABAAAAwDSCBQAAAADTCBYAAAAATCNYAAAAADCNYAEAAADANIIFAAAAANMIFgAAAABMI1gAAAAAMI1gAQAAAMA0ggUAAAAA0wgWAAAAAEwjWAAAAAAwjWABAAAAwDSCBQAAAADTCBYAAAAATCNYAAAAADCNYAEAAADANIIFAAAAANMIFgAAAABMu+VgceDAAa1evVoXL16UJBmGUWRFAQAAAChb7A4Wp0+fVkREhOrXr6/OnTvr+PHjkqT+/fvrueeeK/ICAQAAAJR+dgeL4cOHq1y5ckpOTpa7u7u1/dFHH9WqVauKtDgAAAAAZUM5ewd8/fXXWr16te644w6b9nr16um3334rssIAAAAAlB12X7E4f/68zZWKPOnp6XJ1dS2SogAAAACULXYHizZt2uiDDz6wLlssFuXm5uqNN95Q+/bti7Q4AAAAAGWD3bdCvfHGG+rQoYO2b9+u7Oxsvfjii/r555+Vnp6uH374oThqBAAAAFDK2X3F4q677tK
+ffvUunVrPfTQQzp//ry6d++un376SXXq1CmOGgEAAACUcnYFi8uXL6tDhw46ceKEXn75ZS1evFgrV67UhAkTFBAQcEsFTJ8+XcHBwapQoYKaN2+urVu33rD/mTNnNGTIEAUEBMjV1VX169fXypUrb2nfAAAAAIqGXbdClS9fXrt37y6ynS9atEhxcXGaOXOmmjdvrilTpigyMlJJSUny9fXN1z87O1v333+/fH199emnn6pGjRr67bff5OXlVWQ1AQAAALCf3bdCPf7445o7d26R7Hzy5MkaOHCgYmJi1KhRI82cOVPu7u5KSEgosH9CQoLS09O1fPlytWrVSsHBwWrbtq1CQ0OLpB4AAAAAt8buydtXrlxRQkKC1q5dq/DwcFWsWNFm/eTJkwu1nezsbO3YsUMjR460tjk5OSkiIkKbN28ucMwXX3yhFi1aaMiQIfr8889VrVo19e7dWy+99JKcnZ0LHJOVlaWsrCzrcmZmZqHqAwAAAFB4dgeLPXv2qEmTJpKkffv22ayzWCyF3s6pU6eUk5MjPz8/m3Y/Pz/9+uuvBY45dOiQvvnmGz322GNauXKlDhw4oMGDB+vy5csaM2ZMgWPi4+M1bty4QtcFAAAAwH52B4v169cXRx2FkpubK19fX82aNUvOzs4KDw/X0aNH9eabb143WIwcOVJxcXHW5czMTAUGBpZUydeVGtXG0SUAJcL/y42OLgEAAJQAu4PFtX7//XdJ0h133GH3WB8fHzk7OystLc2mPS0tTf7+/gWOCQgIUPny5W1ue2rYsKFSU1OVnZ0tFxeXfGNcXV35RHAAAACgmNk9eTs3N1evvvqqPD09FRQUpKCgIHl5eWn8+PHKzc0t9HZcXFwUHh6udevW2Wx73bp1atGiRYFjWrVqpQMHDtjsZ9++fQoICCgwVAAAAAAoGXYHi5dfflnTpk3TpEmT9NNPP+mnn37SxIkT9e677+qVV16xa1txcXGaPXu2FixYoL179+qZZ57R+fPnFRMTI0nq06ePzeTuZ555Runp6Ro2bJj27dunFStWaOLEiRoyZIi9hwEAAACgCNl9K9SCBQs0Z84cRUdHW9saN26sGjVqaPDgwXrttdcKva1HH31UJ0+e1OjRo5WamqqwsDCtWrXKOqE7OTlZTk7/yz6BgYFavXq1hg8fbt3nsGHD9NJLL9l7GAAAAACKkN3BIj09XSEhIfnaQ0JClJ6ebncBsbGxio2NLXDdhg0b8rW1aNFCW7ZssXs/AAAAAIqP3bdChYaGatq0afnap02bxgfVAQAAALcpu69YvPHGG+rSpYvWrl1rnWS9efNmpaSkaOXKlUVeIAAAAIDSz+4rFm3btlVSUpK6deumM2fO6MyZM+revbuSkpLUpg2fzQAAAADcjm7pcyxq1Khh1yRtAAAAAH9tdl+xmDdvnpYsWZKvfcmSJVqwYEGRFAUAAACgbLE7WMTHx8vHxydfu6+vryZOnFgkRQEAAAAoW+wOFsnJyapVq1a+9qCgICUnJxdJUQAAAADKFruDha+vr3bv3p2vfdeuXfL29i6SogAAAACULXYHi169emno0KFav369cnJylJOTo2+++UbDhg1Tz549i6NGAAAAAKWc3U+FGj9+vI4cOaIOHTqoXLmrw3Nzc9WnTx/mWAAAAAC3KbuDhYuLixYtWqQJEyYoMTFRbm5uuvvuuxUUFFQc9QEAAAAoA27pcywkqV69eqpXr56uXLmiS5cuFWVNAAAAAMqYQs+x+PLLLzV//nybttdee02VKlWSl5eXHnjgAf3xxx9FXR8AAACAMqDQwWLy5Mk6f/68dXnTpk0aPXq0XnnlFS1evFgpKSkaP358sRQJAAAAoHQrdLD4+eef1bJlS+vyp59+qvvvv18vv/yyunfvrrfeektffvllsRQJAAAAoHQrdLA4e/aszedUfP/99+rQoYN1+c4779SxY8eKtjoAAAAAZUKhg0WNGjW0d+9eSdK5c+e0a9c
umysYp0+flru7e9FXCAAAAKDUK3Sw+Mc//qFnn31W//nPfzRw4ED5+/vrvvvus67fvn27GjRoUCxFAgAAACjdCv242dGjR+vo0aMaOnSo/P399eGHH8rZ2dm6/pNPPlFUVFSxFAkAAACgdCt0sHBzc9MHH3xw3fXr168vkoIAAAAAlD2FvhUKAAAAAK6HYAEAAADANIIFAAAAANMIFgAAAABMI1gAAAAAMK3QT4W61rZt27R+/XqdOHFCubm5NusmT55cJIUBAAAAKDvsDhYTJ07UqFGj1KBBA/n5+clisVjXXfseAAAAwO3D7mDxzjvvKCEhQf369SuGcgAAAACURXbPsXByclKrVq2KoxYAAAAAZZTdwWL48OGaPn16cdQCAAAAoIyy+1ao559/Xl26dFGdOnXUqFEjlS9f3mb9Z599VmTFAQAAACgb7A4WQ4cO1fr169W+fXt5e3szYRsAAACA/cFiwYIFWrp0qbp06VIc9QAAAAAog+yeY1G1alXVqVOnOGoBAAAAUEbZHSzGjh2rMWPG6MKFC8VRDwAAAIAyyO5boaZOnaqDBw/Kz89PwcHB+SZv79y5s8iKAwAAAFA22B0sunbtWgxlAAAAACjL7A4WY8aMKY46AAAAAJRhds+xSElJ0e+//25d3rp1q5599lnNmjWrSAsDAAAAUHbYHSx69+6t9evXS5JSU1MVERGhrVu36uWXX9arr75a5AUCAAAAKP3sDhZ79uxRs2bNJEmLFy/W3XffrU2bNumjjz7S/Pnzi7o+AAAAAGWA3cHi8uXLcnV1lSStXbtW0dHRkqSQkBAdP368aKsDAAAAUCbYHSzuvPNOzZw5Uxs3btSaNWvUsWNHSdKxY8fk7e1d5AUCAAAAKP3sDhavv/663n//fbVr1069evVSaGioJOmLL76w3iIFAAAA4PZi9+Nm27Vrp1OnTikzM1NVqlSxtj/11FNyd3cv0uIAAAAAlA12BwtJcnZ2tgkVkhQcHFwU9QAAAAAog24pWHz66adavHixkpOTlZ2dbbNu586dRVIYAAAAgLLD7jkWU6dOVUxMjPz8/PTTTz+pWbNm8vb21qFDh9SpU6fiqBEAAABAKWd3sHjvvfc0a9Ysvfvuu3JxcdGLL76oNWvWaOjQocrIyCiOGgEAAACUcnYHi+TkZLVs2VKS5ObmprNnz0qSnnjiCX3yySdFWx0AAACAMsHuYOHv76/09HRJUs2aNbVlyxZJ0uHDh2UYRtFWBwAAAKBMsDtY/P3vf9cXX3whSYqJidHw4cN1//3369FHH1W3bt2KvEAAAAAApZ/dT4WaNWuWcnNzJUlDhgyRt7e3Nm3apOjoaA0aNKjICwQAAABQ+tkdLJycnOTk9L8LHT179lTPnj2LtCgAAAAAZYvdt0LNmzdPS5Ysyde+ZMkSLViwoEiKAgAAAFC22B0s4uPj5ePjk6/d19dXEydOLJKiAAAAAJQtt/S42Vq1auVrDwoKUnJycpEUBQAAAKBssTtY+Pr6avfu3fnad+3aJW9v7yIpCgAAAEDZYnew6NWrl4YOHar169crJydHOTk5+uabbzRs2DAmcQMAAAC3KbufCjV+/HgdOXJEHTp0ULlyV4fn5uaqT58+zLEAAAAAblN2BQvDMJSamqr58+drwoQJSkxMlJubm+6++24FBQUVV40AAAAASjm7g0XdunX1888/q169eqpXr15x1QUAAACgDLFrjoWTk5Pq1aun06dPF1c9AAAAAMoguydvT5o0SS+88IL27NlTHPUAAAAAKIPsnrzdp08fXbhwQaGhoXJxcZGbm5vN+vT09CIrDgAAAEDZYHewmDJlSpEXMX36dL355ptKTU1VaGio3n33XTVr1uym4xYuXKhevXrpoYce0vLly4u8LgAAAACFY3ew6Nu3b5EWsGjRIsXFxWnmzJlq3ry5pkyZosjISCUlJcnX1/e6444cOaLnn39ebdq0KdJ6AAAAANjP7jkW17p06ZI
yMzNtXvaaPHmyBg4cqJiYGDVq1EgzZ86Uu7u7EhISrjsmJydHjz32mMaNG6fatWubOQQAAAAARcDuYHH+/HnFxsbK19dXFStWVJUqVWxe9sjOztaOHTsUERHxv4KcnBQREaHNmzdfd9yrr74qX19f9e/f/6b7yMrKMh1+AAAAANyY3cHixRdf1DfffKMZM2bI1dVVc+bM0bhx41S9enV98MEHdm3r1KlTysnJkZ+fn027n5+fUlNTCxzz/fffa+7cuZo9e3ah9hEfHy9PT0/rKzAw0K4aAQAAANyc3cHiyy+/1HvvvaeHH35Y5cqVU5s2bTRq1ChNnDhRH330UXHUaHX27Fk98cQTmj17tnx8fAo1ZuTIkcrIyLC+UlJSirVGAAAA4HZk9+Tt9PR067yGypUrWx8v27p1az3zzDN2bcvHx0fOzs5KS0uzaU9LS5O/v3++/gcPHtSRI0cUFRVlbcvNzZUklStXTklJSapTp47NGFdXV7m6utpVFwAAAAD72H3Fonbt2jp8+LAkKSQkRIsXL5Z09UqGl5eXXdtycXFReHi41q1bZ23Lzc3VunXr1KJFi3z9Q0JC9H//939KTEy0vqKjo9W+fXslJiZymxMAAADgIHZfsYiJidGuXbvUtm1bjRgxQlFRUZo2bZouX76syZMn211AXFyc+vbtq6ZNm6pZs2aaMmWKzp8/r5iYGElXP5CvRo0aio+PV4UKFXTXXXfZjM8LM39uBwAAAFBy7A4Ww4cPt76PiIjQ3r17tXPnTtWtW1eNGze2u4BHH31UJ0+e1OjRo5WamqqwsDCtWrXKOqE7OTlZTk6mnooLAAAAoJjZHSz+LDg4WMHBwaa2ERsbq9jY2ALXbdiw4YZj58+fb2rfAAAAAMwr9KWAzZs367///a9N2wcffKBatWrJ19dXTz31lLKysoq8QAAAAAClX6GDxauvvqqff/7Zuvx///d/6t+/vyIiIjRixAh9+eWXio+PL5YiAQAAAJRuhQ4WiYmJ6tChg3V54cKFat68uWbPnq24uDhNnTrV+oQoAAAAALeXQgeLP/74w+YTsr/99lt16tTJunzvvffy4XMAAADAbarQwcLPz8/6+RXZ2dnauXOn7rvvPuv6s2fPqnz58kVfIQAAAIBSr9DBonPnzhoxYoQ2btyokSNHyt3dXW3atLGu3717d75PvQYAAABweyj042bHjx+v7t27q23btqpUqZIWLFggFxcX6/qEhAQ98MADxVIkAAAAgNKt0MHCx8dH3333nTIyMlSpUiU5OzvbrF+yZIkqVapU5AUCAAAAKP3s/oA8T0/PAturVq1quhgAAAAAZVOh51gAAAAAwPUQLAAAAACYRrAAAAAAYBrBAgAAAIBpBAsAAAAAphEsAAAAAJhGsAAAAABgGsECAAAAgGkECwAAAACmESwAAAAAmEawAAAAAGAawQIAAACAaQQLAAAAAKYRLAAAAACYRrAAAAAAYBrBAgAAAIBpBAsAAAAAphEsAAAAAJhGsAAAAABgGsECAAAAgGkECwAAAACmESwAAAAAmEawAAAAAGAawQIAAACAaQQLAAAAAKYRLAAAAACYRrAAAAAAYBrBAgAAAIBpBAsAAAAAphEsAAAAAJhGsAAAAABgGsECAAAAgGkECwAAAACmESwAAAAAmEawAAAAAGAawQIAAACAaQQLAAAAAKYRLAAAAACYRrAAAAAAYBrBAgAAAIBpBAsAAAAAphEsAAAAAJhGsAAAAABgGsECAAAAgGkECwAAAACmESwAAAAAmEawAAAAAGAawQIAAACAaQQLAAAAAKYRLAAAAACYRrAAAAAAYBrBAgAAAIBpBAsAAAAAphEsAAAAAJhGsAAAAABgGsECAAAAgGkECwAAAACmlYpgMX36dAUHB6tChQpq3ry5tm7det2+s2fPVps2bVSlShVVqVJFERERN+wPAAAAoPg5PFgsWrRIcXFxGjNmjHbu3KnQ0FBFRkbqxIkTBfb
fsGGDevXqpfXr12vz5s0KDAzUAw88oKNHj5Zw5QAAAADyODxYTJ48WQMHDlRMTIwaNWqkmTNnyt3dXQkJCQX2/+ijjzR48GCFhYUpJCREc+bMUW5urtatW1fClQMAAADI49BgkZ2drR07digiIsLa5uTkpIiICG3evLlQ27hw4YIuX76sqlWrFrg+KytLmZmZNi8AAAAARcuhweLUqVPKycmRn5+fTbufn59SU1MLtY2XXnpJ1atXtwkn14qPj5enp6f1FRgYaLpuAAAAALYcfiuUGZMmTdLChQu1bNkyVahQocA+I0eOVEZGhvWVkpJSwlUCAAAAf33lHLlzHx8fOTs7Ky0tzaY9LS1N/v7+Nxz773//W5MmTdLatWvVuHHj6/ZzdXWVq6trkdQLAAAAoGAOvWLh4uKi8PBwm4nXeROxW7Rocd1xb7zxhsaPH69Vq1apadOmJVEqAAAAgBtw6BULSYqLi1Pfvn3VtGlTNWvWTFOmTNH58+cVExMjSerTp49q1Kih+Ph4SdLrr7+u0aNH6+OPP1ZwcLB1LkalSpVUqVIlhx0HAAAAcDtzeLB49NFHdfLkSY0ePVqpqakKCwvTqlWrrBO6k5OT5eT0vwsrM2bMUHZ2tnr06GGznTFjxmjs2LElWToAAACA/8/hwUKSYmNjFRsbW+C6DRs22CwfOXKk+AsCAAAAYJcy/VQoAAAAAKUDwQIAAACAaQQLAAAAAKYRLAAAAACYRrAAAAAAYBrBAgAAAIBpBAsAAAAAphEsAAAAAJhGsAAAAABgGsECAAAAgGkECwAAAACmESwAAAAAmEawAAAAAGAawQIAAACAaQQLAAAAAKYRLAAAAACYRrAAAAAAYBrBAgAAAIBpBAsAAAAAphEsAAAAAJhGsAAAAABgGsECAAAAgGkECwAAAACmESwAAAAAmEawAAAAAGAawQIAAACAaQQLAAAAAKYRLAAAAACYRrAAAAAAYBrBAgAAAIBpBAsAAAAAphEsAAAAAJhGsAAAAABgGsECAAAAgGkECwAAAACmESwAAAAAmEawAAAAAGAawQIAAACAaQQLAAAAAKYRLAAAAACYRrAAAAAAYBrBAgAAAIBpBAsAAAAAphEsAAAAAJhGsAAAAABgGsECAAAAgGkECwAAAACmESwAAAAAmEawAAAAAGAawQIAAACAaQQLAAAAAKYRLAAAAACYRrAAAAAAYBrBAgAAAIBpBAsAAAAAphEsAAAAAJhGsAAAAABgGsECAAAAgGkECwAAAACmESwAAAAAmEawAAAAAGAawQIAAACAaQQLAAAAAKYRLAAAAACYRrAAAAAAYFqpCBbTp09XcHCwKlSooObNm2vr1q037L9kyRKFhISoQoUKuvvuu7Vy5coSqhQAAABAQRweLBYtWqS4uDiNGTNGO3fuVGhoqCIjI3XixIkC+2/atEm9evVS//799dNPP6lr167q2rWr9uzZU8KVAwAAAMjj8GAxefJkDRw4UDExMWrUqJFmzpwpd3d3JSQkFNj/nXfeUceOHfXCCy+oYcOGGj9+vJo0aaJp06aVcOUAAAAA8pRz5M6zs7O1Y8cOjRw50trm5OSkiIgIbd68ucAxmzdvVlxcnE1bZGSkli9fXmD/rKwsZWVlWZczMjIkSZmZmSarN+fs5SsO3T9QUtwdfK7dqpyLZx1dAlAiHP3z0Iycc1k37wSUcY4+R/P2bxjGTfs6NFicOnVKOTk58vPzs2n38/PTr7/+WuCY1NTUAvunpqYW2D8+Pl7jxo3L1x4YGHiLVQOwi6enoysAcAOewx1dAYAb8dR7ji5BknT27Fl53uRnukODRUkYOXKkzRWO3Nxcpaeny9vbWxaLxYGVoSRlZmYqMDBQKSkpqly5sqPLAVAAzlOgdOMcvT0ZhqGzZ8+qevXqN+3r0GDh4+MjZ2dnpaWl2bSnpaXJ39+/wDH+/v529Xd1dZWrq6tNm5eX160XjTKtcuXK/GMIlHKcp0Dpxjl
6+7nZlYo8Dp287eLiovDwcK1bt87alpubq3Xr1qlFixYFjmnRooVNf0las2bNdfsDAAAAKH4OvxUqLi5Offv2VdOmTdWsWTNNmTJF58+fV0xMjCSpT58+qlGjhuLj4yVJw4YNU9u2bfXWW2+pS5cuWrhwobZv365Zs2Y58jAAAACA25rDg8Wjjz6qkydPavTo0UpNTVVYWJhWrVplnaCdnJwsJ6f/XVhp2bKlPv74Y40aNUr/+te/VK9ePS1fvlx33XWXow4BZYCrq6vGjBmT77Y4AKUH5ylQunGO4mYsRmGeHQUAAAAAN+DwD8gDAAAAUPYRLAAAAACYRrAAAAAAYBrBAgAAAIBpBAvc1oKDgzVlyhTrssVi0fLlyx1WD4DiZ+953q9fP3Xt2rXY6gFuVbt27fTss886uowSxzlZehEs4DD9+vWTxWKxvry9vdWxY0ft3r3bYTUdP35cnTp1ctj+gZJQGs89SZo/f74sFosaNmyYb92SJUtksVgUHBxc8oUBDvbnczbv9cYbb2j8+PHFss8NGzYUuM9rXxs2bCiWfaPsIljAoTp27Kjjx4/r+PHjWrduncqVK6cHH3zQYfX4+/vzfG7cFkrbuZenYsWKOnHihDZv3mzTPnfuXNWsWdNBVQGOd+05m/cKDw+Xh4dHseyvZcuWNvt65JFH8tXQsmXLYtk3yi6CBRzK1dVV/v7+8vf3V1hYmEaMGKGUlBSdPHlSkvTSSy+pfv36cnd3V+3atfXKK6/o8uXL1vG7du1S+/bt5eHhocqVKys8PFzbt2+3rv/+++/Vpk0bubm5KTAwUEOHDtX58+evW8+1t0gcOXJEFotFn332mdq3by93d3eFhobm+4XH3n0ApcHNzj2p5M8/SSpXrpx69+6thIQEa9vvv/+uDRs2qHfv3vn6z5gxQ3Xq1JGLi4saNGig//znPzbr9+/fr7/97W+qUKGCGjVqpDVr1uTbRkpKih555BF5eXmpatWqeuihh3TkyJGbfg2BknTtOZv36tChg82tUMHBwZo4caKefPJJeXh4qGbNmpo1a5bNdgr7/e7i4mKzLzc3N5saevbsqRdffNFmTNeuXdWvX78irScnJ0dxcXHy8vKSt7e3XnzxRfERbKUXwQKlxrlz5/Thhx+qbt268vb2liR5eHho/vz5+uWXX/TOO+9o9uzZevvtt61jHnvsMd1xxx3atm2bduzYoREjRqh8+fKSpIMHD6pjx456+OGHtXv3bi1atEjff/+9YmNj7arr5Zdf1vPPP6/ExETVr19fvXr10pUrV4p0H4AjFXTuSY47/5588kktXrxYFy5ckHT1FqmOHTvKz8/Ppt+yZcs0bNgwPffcc9qzZ48GDRqkmJgYrV+/XpKUm5ur7t27y8XFRT/++KNmzpypl156yWYbly9fVmRkpDw8PLRx40b98MMPqlSpkjp27Kjs7Oxb+4ICDvTWW2+padOm+umnnzR48GA988wzSkpKkuSY73ez9bz11luaP3++EhIS9P333ys9PV3Lli0rllpRBAzAQfr27Ws4OzsbFStWNCpWrGhIMgICAowdO3Zcd8ybb75phIeHW5c9PDyM+fPnF9i3f//+xlNPPWXTtnHjRsPJycm4ePGiYRiGERQUZLz99tvW9ZKMZcuWGYZhGIcPHzYkGXPmzLGu//nnnw1Jxt69ewu9D6C0uZVzzzCK/vz7s3nz5hmenp6GYRhGWFiYsWDBAiM3N9eoU6eO8fnnnxtvv/22ERQUZO3fsmVLY+DAgTbb+Mc//mF07tzZMAzDWL16tVGuXDnj6NGj1vVfffWVzXn+n//8x2jQoIGRm5tr7ZOVlWW4ubkZq1evNgzj6tfroYceuv4XBihmfz5nK1asaPTo0cNo27atMWzYMGu/oKAg4/HHH7cu5+bmGr6+vsaMGTMMwyjc9/uNarj2PPjzvg3DMB566CGjb9++RVpPQECA8cYbb1jXX7582bjjjjs4J0sprljAodq
3b6/ExEQlJiZq69atioyMVKdOnfTbb79JkhYtWqRWrVrJ399flSpV0qhRo5ScnGwdHxcXpwEDBigiIkKTJk3SwYMHret27dql+fPnq1KlStZXZGSkcnNzdfjw4ULX2LhxY+v7gIAASdKJEyeKdB9ASbvZuSc59vx78sknNW/ePH377bc6f/68OnfunK/P3r171apVK5u2Vq1aae/evdb1gYGBql69unV9ixYtbPrv2rVLBw4ckIeHh7XOqlWr6tKlSzbHAzjatedsYmKipk6dWmC/a39mWSwW+fv72/zMutH3+8aNG23O2Y8++sh03WbqycjI0PHjx9W8eXPrNsqVK6emTZuargvFo5yjC8DtrWLFiqpbt651ec6cOfL09NTs2bPVpUsXPfbYYxo3bpwiIyPl6emphQsX6q233rL2Hzt2rHr37q0VK1boq6++0pgxY7Rw4UJ169ZN586d06BBgzR06NB8+7VnEmjerR3S1X8Upau3WEgqsn0AJe1G596ECRO0efNmh55/jz32mF588UWNHTtWTzzxhMqVK54fV+fOnVN4eHiBv0BVq1atWPYJ3Io/n7PXc+3PLOnqz61rf2bd6PvdxcVFiYmJ1rY/3354LScnp3xzHa6dg1UU9aDsIVigVLFYLHJyctLFixe1adMmBQUF6eWXX7auv/Z/U/PUr19f9evX1/Dhw9WrVy/NmzdP3bp1U5MmTfTLL78U6h/iW1US+wBKwrXnniSHn39Vq1ZVdHS0Fi9erJkzZxbYp2HDhvrhhx/Ut29fa9sPP/ygRo0aWdenpKTo+PHj1quNW7ZssdlGkyZNtGjRIvn6+qpy5cq3VCtQVhTm+72w52y1atV0/Phx63JOTo727Nmj9u3bF2k9AQEB+vHHH/W3v/1NknTlyhXt2LFDTZo0KfR+UHK4FQoOlZWVpdTUVKWmpmrv3r365z//qXPnzikqKkr16tVTcnKyFi5cqIMHD2rq1Kk2E7YuXryo2NhYbdiwQb/99pt++OEHbdu2zfoM/JdeekmbNm1SbGysEhMTtX//fn3++edFOrG6JPYBFIcbnXuSSsX5N3/+fJ06dUohISEFrn/hhRc0f/58zZgxQ/v379fkyZP12Wef6fnnn5ckRUREqH79+urbt6927dqljRs32gQl6eqVER8fHz300EPauHGjDh8+rA0bNmjo0KH6/fff7fqaAqVdUX6///3vf9eKFSu0YsUK/frrr3rmmWd05syZIq9n2LBhmjRpkpYvX65ff/1VgwcPtns/KDkECzjUqlWrFBAQoICAADVv3lzbtm3TkiVL1K5dO0VHR2v48OGKjY1VWFiYNm3apFdeecU61tnZWadPn1afPn1Uv359PfLII+rUqZPGjRsn6ep9nd9++6327dunNm3a6J577tHo0aNt7rc2qyT2ARSHG517kkrF+efm5mbzlKo/69q1q9555x39+9//1p133qn3339f8+bNsx6Dk5OTli1bposXL6pZs2YaMGCAXnvtNZttuLu767vvvlPNmjXVvXt3NWzYUP3799elS5e4goG/nKL8fn/yySfVt29f9enTR23btlXt2rXtulpR2Hqee+45PfHEE+rbt69atGghDw8PdevWza79oORYjD/fIAcAAAAAduKKBQAAAADTCBYAAAAATCNYAAAAADCNYAEAAADANIIFAAAAANMIFgAAAABMI1gAAAAAMI1gAQAAAMA0ggUAAAAA0wgWAAAAAEwjWAAAAAAw7f8Bv3Nn2bTg4k8AAAAASUVORK5CYII=", "text/plain": [ - "
" + "
" ] }, "metadata": {}, @@ -2183,107 +977,64 @@ } ], "source": [ - "# And now let's put it all together.\n", - "# Let's visualize our post-training evaluation. Fingers crossed!\n", - "display_evaluation_summary(client, [baseline_eval.id, posttraining_eval.id], x_range=(1, 10))" - ] - }, - { - "cell_type": "markdown", - "id": "d3af014a", - "metadata": {}, - "source": [ - "## 🥳 7. Celebrate or Cry\n", - "It _should_ be time to celebrate if all went well!\n", - "\n", - "We can clearly see our new models not only outperforming their base models, but\n", - "also competing with larger OpenAI models and DeepSeek!\n" - ] - }, - { - "cell_type": "markdown", - "id": "ea99b11c", - "metadata": {}, - "source": [ - "## 🔚 8. Conclusion\n", - "In this notebook we demonstrated **distillation** using Azure OpenAI *Evaluations*\n", - "and *Fine Tuning* features.\n", - "\n", - "We used an objective of *adjusting the tone* of a model to meet our needs, in this\n", - "case making its responses sarcastic, while preserving accuracy in results, and\n", - "*distilled* the native capabilities of a state-of-the-art reasoning model (`o3`)\n", - "into a much smaller, non-reasoning model (`4.1-nano`) to let our agent or app\n", - "use the smallest model possible while:\n", + "# Visualize\n", + "fig, ax = plt.subplots(figsize=(8, 5))\n", "\n", - "- 🤑 minimizing per-token costs\n", - "- 🏎️ improve performance (latency)\n", + "models = ['Baseline', 'Base Model', 'Fine-Tuned']\n", + "avgs = [np.mean(baseline_scores), np.mean(base_scores), np.mean(ft_scores)]\n", + "colors = ['#e74c3c', '#3498db', '#2ecc71']\n", "\n", - "We did all this:\n", + "bars = ax.bar(models, avgs, color=colors)\n", + "ax.set_ylabel('Sarcasm Score')\n", + "ax.set_title('Model Comparison')\n", + "ax.set_ylim(0, 1)\n", "\n", - "- without creating training data directly\n", - "- without knowing the ideal student model\n", - "- only by knowing how to define our Grader\n", + "for bar, v in zip(bars, avgs):\n", + " 
ax.text(bar.get_x() + bar.get_width()/2, v + 0.2, f'{v:.1f}', ha='center', fontweight='bold')\n", "\n", - "So to wrap it all up:\n", - "\n", - "1. We described the ideal state to our complex reasoning model in the form of\n", - " a few samples we feel are ideal.\n", - "2. We described to the reasoning model (grader) how to judge those examples\n", - " to measure their quality.\n", - "3. We let Evaluations and Fine Tuning do the rest!\n", - "\n", - "\n" + "plt.tight_layout()\n", + "plt.show()" ] }, { "cell_type": "markdown", - "id": "080fdd24", "metadata": {}, "source": [ - "## 🧹 9. Cleanup\n", - "If you want to clean up everthing from this notebook, set `cleanup = True` below." + "## 14. Cleanup" ] }, { "cell_type": "code", - "execution_count": null, - "id": "fad63a77", + "execution_count": 39, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "deleting ministral-3b-distilled-90f2f57b...\n", - "deleted!\n", - "deleting gpt-4.1-nano-distilled-90f2f57b...\n", - "deleted!\n" + "Deleted: training_sarcasm_4aa88349.jsonl\n", + "Deleted: validation_sarcasm_4aa88349.jsonl\n", + "Deleted: base_eval_4aa88349.jsonl\n", + "Deleted: ft_eval_4aa88349.jsonl\n", + "\n", + "✓ Cleanup complete\n" ] } ], "source": [ - "# Want to clean up your mess? 
\n", - "cleanup = True\n", + "# Clean up local files\n", + "for f in [training_file_path, validation_file_path, base_eval_file, ft_eval_file]:\n", + " if os.path.exists(f):\n", + " os.remove(f)\n", + " print(f\"Deleted: {f}\")\n", "\n", - "if cleanup:\n", - " # for now, we delete the model deployments as they may incur charges\n", - " for name in [d[\"name\"] for d in DEPLOYMENT_CONFIGURATION]:\n", - " print(f\"deleting {name}...\")\n", - " poller = cogsvc_client.deployments.begin_delete(\n", - " resource_group_name=os.environ.get(\"AZURE_RESOURCE_GROUP\"),\n", - " account_name=os.environ.get(\"FOUNDRY_PARENT_RESOURCE\"),\n", - " deployment_name=name,\n", - " )\n", - " while not poller.done():\n", - " time.sleep(5)\n", - " print(f\"deleted!\")\n", - " # todo: delete training data files" + "print(\"\\n✓ Cleanup complete\")" ] } ], "metadata": { "kernelspec": { - "display_name": ".venv", + "display_name": ".venv (3.12.3)", "language": "python", "name": "python3" }, @@ -2297,9 +1048,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.13.9" + "version": "3.12.3" } }, "nbformat": 4, - "nbformat_minor": 5 + "nbformat_minor": 4 } diff --git a/Demos/DistillingSarcasm/scripts/eval_utils.py b/Demos/DistillingSarcasm/scripts/eval_utils.py deleted file mode 100644 index 14e32f9..0000000 --- a/Demos/DistillingSarcasm/scripts/eval_utils.py +++ /dev/null @@ -1,254 +0,0 @@ -# Original version from: -# https://raw.githubusercontent.com/azure-ai-foundry/build-2025-demos/refs/heads/main/Azure%20AI%20Model%20Customization/DistillationDemo/scripts/eval_utils.py -import openai -import pandas as pd -import matplotlib.pyplot as plt -import numpy as np # Import numpy for percentile calculations - -from concurrent.futures import ThreadPoolExecutor, as_completed - -def get_eval_runs_list(client: openai.Client, eval_id: str) -> list: - """ - Fetch the list of evaluation runs for a given evaluation ID. 
- - Args: - eval_id (str): The evaluation ID. - - Returns: - list: A list of evaluation runs with their details. - """ - runs = client.evals.runs.list(eval_id) - - print(f"Get Evaluation Runs: {eval_id}") - list_runs = [] - - if runs: - for run in runs: - r = { - 'id': run.id, - 'name': run.name, - 'status': run.status, - 'model': run.model, - 'pass_percentage': 0, - 'error_percentage': 0, - } - result = run.result_counts - if result: - pass_percentage = (result.passed * 100) / (result.passed + result.failed) if (result.passed + result.failed) > 0 else 0 - error_percentage = (result.errored * 100) / result.total if result.total > 0 else 0 - r['pass_percentage'] = pass_percentage - r['error_percentage'] = error_percentage - - list_runs.append(r) - - return list_runs - - -def get_eval_details(client: openai.Client, eval_id: str) -> dict: - """ - Fetch the details of a specific evaluation. - - Args: - eval_id (str): The evaluation ID. - - Returns: - dict: A dictionary containing evaluation details, including the name. - """ - try: - eval = client.evals.retrieve(eval_id) - return eval.to_dict() - except Exception as e: - print(f"Failed to fetch evaluation details for ID: {eval_id}. Error: {e}") - return {"name": f"Unknown Evaluation ({eval_id})"} - - -def display_evaluation_summary(client: openai.Client, eval_ids: list, x_range = (0, 7)): - """ - Fetch and display a summary of evaluation runs for a list of evaluation IDs, including a horizontal bar chart, - average score, and score distribution for all runs in a single chart with a maximum of 4 graphs per row. - - Args: - eval_ids (list): A list of evaluation IDs. 
- """ - all_eval_runs = [] - eval_id_to_name = {} - eval_id_to_color = {} - - # Assign unique colors for each evaluation ID - colors = plt.cm.tab10.colors # Use a colormap for distinct colors - for i, eval_id in enumerate(eval_ids): - eval_id_to_color[eval_id] = colors[i % len(colors)] - - # Fetch evaluation runs and details for each evaluation ID - for eval_id in eval_ids: - eval_runs = get_eval_runs_list(client, eval_id) - - # Fetch evaluation details using the helper method - eval_details = get_eval_details(client, eval_id) - eval_name = eval_details.get('name', f'Unknown Evaluation ({eval_id})') - eval_id_to_name[eval_id] = eval_name - - # Add evaluation ID to each run for color coding - for run in eval_runs: - run['eval_id'] = eval_id - all_eval_runs.append(run) - - # Combine all evaluation runs into a single DataFrame - if all_eval_runs: - df = pd.DataFrame(all_eval_runs) - df = df[['id', 'name', 'model', 'status', 'pass_percentage', 'error_percentage', 'eval_id']] # Select relevant columns - df['eval_name'] = df['eval_id'].map(eval_id_to_name) # Map eval_id to eval_name - df['model'] = df['model'].str[:15] # Truncate model names to 15 characters - df = df.sort_values(by=['pass_percentage'], ascending=[False]) # Sort by pass_percentage descending - - print("\n" + "=" * 50) - print("Combined Evaluation Summary") - print("=" * 50) - print(df.to_string(index=False, header=["Run ID", "Run Name", "Model", "Status", "Pass Percentage (%)", "Error Percentage (%)", "Evaluation ID", "Evaluation Name"])) - print("=" * 50) - - # Dynamically adjust the figure height based on the number of rows - num_rows = len(df) - fig_height = max(3, num_rows * 0.5) # Set a minimum height of 6 and scale with 0.5 per row - - - # Create a horizontal bar chart with rows sorted by pass percentage across all eval_ids - plt.figure(figsize=(12, fig_height)) - - df['display_label'] = df['model'].where( - (df['model'].str.strip() != '') & (df['model'] != 'None') & (df['model'].notna()), - 
df['name'] - ) - - plt.barh( - df['display_label'], - df['pass_percentage'], - color=[eval_id_to_color[eval_id] for eval_id in df['eval_id']], - edgecolor='black' - ) - plt.xlabel('Pass Percentage (%)') - plt.ylabel('Model') - plt.title("Pass Percentage by Model Across Evaluations") - plt.xlim(0, 100) # Set x-axis scale explicitly to 0-100 - plt.gca().invert_yaxis() # Invert y-axis to show the highest percentage at the top - plt.grid(axis='x', linestyle='--', alpha=0.7) - plt.tight_layout() - plt.show() - - # Process each run to calculate and collect scores for distribution - # (This part can be slow as we have to page over results for each run, so we parallelize this.) - all_scores = [] - run_labels = [] - score_summary = [] # To store data for the summary table - - print("=" * 50) - print("Fetching scores...") - print("=" * 50) - - futures = {} # dict of "future: (model, eval_id)" so we can easily access which model powered the run. - with ThreadPoolExecutor(thread_name_prefix="eval-run-fetcher") as pool: - for _, row in df.iterrows(): - run_id = row['id'] - eval_id = row['eval_id'] - future = pool.submit(get_eval_run_output_items, client, eval_id, run_id) - futures.update({ future: (row['model'] , eval_id)}) - - for f in as_completed(futures.keys()): - try: - model, eval_id = futures[f] - scores = f.result() - except Exception as e: - print(f"exception fetching future result: {e}") - scores = None - if scores: - avg_score = sum(scores) / len(scores) - min_score = min(scores) - max_score = max(scores) - p10 = np.percentile(scores, 10) # 10th percentile - p25 = np.percentile(scores, 25) # 25th percentile - p50 = np.percentile(scores, 50) # 50th percentile (median) - p75 = np.percentile(scores, 75) # 75th percentile - p90 = np.percentile(scores, 90) # 90th percentile - - # Collect scores and labels for the combined chart - all_scores.append((scores, eval_id_to_color[eval_id])) # Include color for the subplot - run_labels.append(f"{model} 
({eval_id_to_name[eval_id]})") # Include eval name in the label - - # Add data to the summary table - score_summary.append({ - "Model": model, - "Evaluation Name": eval_id_to_name[eval_id], - "Average Score": f"{avg_score:.2f}", - "Min Score": f"{min_score:.2f}", - "Max Score": f"{max_score:.2f}", - "10th Percentile": f"{p10:.2f}", - "25th Percentile": f"{p25:.2f}", - "50th Percentile": f"{p50:.2f}", - "75th Percentile": f"{p75:.2f}", - "90th Percentile": f"{p90:.2f}" - }) - - # Display the score summary as a table - if score_summary: - score_df = pd.DataFrame(score_summary) - score_df = score_df.sort_values(by=['Evaluation Name', 'Average Score'], ascending=[True, False]) # Sort by eval_name and avg_score - print("\n" + "=" * 50) - print("Score Summary Table:") - print(score_df.to_string(index=False)) - print("=" * 50) - - # Plot all score distributions in a single chart with a maximum of 4 graphs per row - if all_scores: - num_runs = len(all_scores) - max_cols = 4 # Maximum number of graphs per row - num_rows = (num_runs + max_cols - 1) // max_cols # Calculate the number of rows - - _, axes = plt.subplots(num_rows, max_cols, figsize=(5 * max_cols, 4 * num_rows), sharey=True) - axes = axes.flatten() # Flatten the axes array for easier indexing - - for i, ((scores, color), label) in enumerate(zip(all_scores, run_labels)): - ax = axes[i] - ax.hist(scores, bins=10, color=color, edgecolor='black') # Use color for the histogram - ax.set_title(label, fontsize=10) # Include model and evaluation name - ax.set_xlabel("Score") - ax.set_ylabel("Frequency") - ax.set_xlim(x_range[0], x_range[1]) # Fix the x-axis range between 0 and 7 - ax.grid(axis='y', linestyle='--', alpha=0.7) - - # Hide any unused subplots - for j in range(len(all_scores), len(axes)): - axes[j].axis('off') - - plt.tight_layout() - plt.suptitle("Score Distributions for each Model", fontsize=16, y=1.02) - plt.show() - else: - print("\n" + "=" * 50) - print("No evaluation runs found for the provided 
Evaluation IDs.") - print("=" * 50) - - -def get_eval_run_output_items(client: openai.Client, eval_id: str, run_id: str) -> list: - """ - Fetch the output items for a specific evaluation run and extract the result scores. - - Args: - eval_id (str): The evaluation ID. - run_id (str): The run ID. - - Returns: - list: A list of scores for the output items. - """ - scores = [] - - try: - response = client.evals.runs.output_items.list(run_id=run_id, eval_id=eval_id) - for page in response.iter_pages(): - for item in page.data: - for result in item.results: - if result.score is not None: - scores.append(result.score) - except Exception as e: - print(f"Failed to fetch output items for run {run_id}. Error: {e}") - - return scores diff --git a/Demos/DistillingSarcasm_Foundry/.gitignore b/Demos/DistillingSarcasm_Foundry/.gitignore deleted file mode 100644 index b9a371e..0000000 --- a/Demos/DistillingSarcasm_Foundry/.gitignore +++ /dev/null @@ -1,5 +0,0 @@ -.env -.venv/ -__pycache__/ -*.pyc -.ipynb_checkpoints/ diff --git a/Demos/DistillingSarcasm_Foundry/README.md b/Demos/DistillingSarcasm_Foundry/README.md deleted file mode 100644 index cf0d904..0000000 --- a/Demos/DistillingSarcasm_Foundry/README.md +++ /dev/null @@ -1,152 +0,0 @@ -# Distilling Sarcasm - Foundry SDK Version - -This is an alternative implementation of the Sarcasm distillation demo using the **Azure AI Foundry SDK** instead of raw API keys. - -## Overview - -This demo teaches language models to generate sarcastic responses through distillation: -1. A **teacher model** (gpt-4.1) generates sarcastic training data -2. A **student model** (gpt-4.1-mini) is fine-tuned on this data -3. 
Evaluators measure sarcasm quality before and after training - -## SDK Comparison - -| Aspect | Original (API Key) | This Version (Foundry SDK) | -|--------|-------------------|---------------------------| -| **Package** | `openai` | `azure-ai-projects` + `openai` | -| **Auth** | API Key | `DefaultAzureCredential` | -| **Inference** | `client.chat.completions.create()` | `openai_client.chat.completions.create()` | -| **Evaluations** | `client.evals.*` | `openai_client.evals.*` (same!) | -| **Fine-tuning** | `client.fine_tuning.jobs.*` | `openai_client.fine_tuning.jobs.*` (same!) | - -**Key Insight**: The APIs are nearly identical! The main difference is how you get the client: - -```python -# Original (API Key) -client = OpenAI(base_url=..., api_key=...) - -# Foundry SDK -project_client = AIProjectClient(endpoint=..., credential=DefaultAzureCredential()) -openai_client = project_client.get_openai_client() # Same API from here! -``` - -## Prerequisites - -1. **Azure Subscription** with access to Azure AI Foundry -2. **Python 3.10+** -3. **Azure CLI** logged in (`az login`) - -## Setup - -1. Create and activate a virtual environment: - ```bash - python -m venv .venv - source .venv/bin/activate # Linux/Mac - # or: .venv\Scripts\activate # Windows - ``` - -2. Install dependencies: - ```bash - pip install -r requirements.txt - ``` - -3. Copy the environment template and fill in your values: - ```bash - cp .env.template .env - ``` - -4. 
Configure `.env`: - ``` - MICROSOFT_FOUNDRY_PROJECT_ENDPOINT=https://.services.ai.azure.com/api/projects/ - AZURE_INFERENCE_ENDPOINT=https://..models.ai.azure.com - AZURE_OPENAI_DEPLOYMENT=gpt-4.1 - AZURE_SUBSCRIPTION_ID= - AZURE_RESOURCE_GROUP= - AZURE_AOAI_ACCOUNT= - BASE_MODEL=gpt-4.1-mini - TEACHER_MODEL=gpt-4.1 - ``` - -## Running the Demo - -Open `sarcasm_foundry.ipynb` in Jupyter and execute cells in order: - -```bash -jupyter notebook sarcasm_foundry.ipynb -``` - -## Key Differences from Original - -### Authentication -```python -# Original (API Key) -client = OpenAI( - base_url=f"https://{resource}.openai.azure.com/openai/v1/", - api_key=os.environ.get("FOUNDRY_API_KEY"), -) - -# Foundry SDK (Azure Credential) -credential = DefaultAzureCredential() -project_client = AIProjectClient(endpoint=endpoint, credential=credential) -openai_client = project_client.get_openai_client() -``` - -### Inference -```python -# Original (OpenAI SDK) -response = client.chat.completions.create( - model="gpt-4.1", - messages=[{"role": "user", "content": "Hello"}] -) - -# Foundry SDK (azure-ai-inference) -from azure.ai.inference import ChatCompletionsClient -from azure.ai.inference.models import UserMessage - -client = ChatCompletionsClient(endpoint=..., credential=credential) -response = client.complete(messages=[UserMessage(content="Hello")]) -``` - -### Evaluations (Same API!) -```python -# Both demos use identical evals API -eval = openai_client.evals.create(name="sarcasm-grader", ...) -run = openai_client.evals.runs.create(eval_id=eval.id, ...) 
-``` - -### Trade-offs - -**Foundry SDK Advantages:** -- ✅ No API keys to manage (uses Azure managed identity) -- ✅ Better security with DefaultAzureCredential -- ✅ Consistent with other Azure AI services -- ✅ Unified project management - -**Original (API Key) Advantages:** -- ✅ Simpler setup (just one API key) -- ✅ Works outside Azure environments -- ✅ Familiar OpenAI SDK patterns - -## Troubleshooting - -### Authentication Issues -- Ensure you're logged in: `az login` -- Check your role assignments include "Azure AI User" on the Foundry resource - -### Model Not Found -- Verify the model deployment names in your `.env` match your Azure deployments - -### Rate Limiting -- Reduce `sample_size` in evaluation cells if hitting rate limits - -## Files - -- `sarcasm_foundry.ipynb` - Main notebook -- `requirements.txt` - Python dependencies -- `.env.template` - Environment variable template -- `README.md` - This file - -## Related - -- [Original Sarcasm Demo](../DistillingSarcasm/) - Uses OpenAI SDK with API keys -- [CNN DailyMail Demo](../SFT_CNN_DailyMail/) - SFT example using Foundry SDK diff --git a/Demos/DistillingSarcasm_Foundry/requirements.txt b/Demos/DistillingSarcasm_Foundry/requirements.txt deleted file mode 100644 index 677bc62..0000000 --- a/Demos/DistillingSarcasm_Foundry/requirements.txt +++ /dev/null @@ -1,19 +0,0 @@ -# Azure AI Foundry SDK -azure-ai-projects>=2.0.0b1 - -# Azure Authentication -azure-identity - -# Azure Cognitive Services for model deployment -azure-mgmt-cognitiveservices - -# OpenAI SDK (obtained via Foundry client) -openai - -# Data manipulation and visualization -pandas -matplotlib -numpy - -# Core utilities -python-dotenv diff --git a/Demos/DistillingSarcasm_Foundry/sarcasm_foundry.ipynb b/Demos/DistillingSarcasm_Foundry/sarcasm_foundry.ipynb deleted file mode 100644 index 027a3a7..0000000 --- a/Demos/DistillingSarcasm_Foundry/sarcasm_foundry.ipynb +++ /dev/null @@ -1,1056 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - 
"metadata": {}, - "source": [ - "# Distilling Sarcasm (Foundry SDK Version)\n", - "\n", - "This notebook demonstrates fine-tuning language models to generate sarcastic responses using the **Azure AI Foundry SDK**.\n", - "\n", - "## Comparison with Original Demo\n", - "\n", - "| Aspect | Original | Foundry SDK |\n", - "|--------|----------|-------------|\n", - "| Auth | API Key | DefaultAzureCredential |\n", - "| Inference | `client.chat.completions` | `openai_client.responses` |\n", - "| Evaluations | `client.evals.*` | `openai_client.evals.*` (same!) |\n", - "| Fine-tuning | `client.fine_tuning.jobs.*` | `openai_client.fine_tuning.jobs.*` (same!) |\n", - "\n", - "**Key Insight**: The APIs are nearly identical! The main difference is authentication:\n", - "- Original: `OpenAI(base_url=..., api_key=...)`\n", - "- Foundry: `AIProjectClient(endpoint, credential).get_openai_client()`\n", - "\n", - "**Note**: Foundry project endpoints use `responses.create()` API for inference." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 1. 
Setup" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Note: you may need to restart the kernel to use updated packages.\n" - ] - } - ], - "source": [ - "%pip install -r requirements.txt -q" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Libraries imported successfully\n" - ] - } - ], - "source": [ - "import os\n", - "import json\n", - "import uuid\n", - "import time\n", - "from random import shuffle\n", - "\n", - "import pandas as pd\n", - "import matplotlib.pyplot as plt\n", - "import numpy as np\n", - "\n", - "from dotenv import load_dotenv\n", - "from azure.identity import DefaultAzureCredential\n", - "from azure.ai.projects import AIProjectClient\n", - "\n", - "print(\"Libraries imported successfully\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 2. 
Connect to Microsoft Foundry" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Run identifier: 4aa88349\n", - "Project: https://omi-ignite-demo-resource.services.ai.azure.com/api/projects/omi-ignite-demo\n" - ] - } - ], - "source": [ - "load_dotenv(override=True)\n", - "\n", - "# Required for Azure OpenAI client\n", - "os.environ.setdefault(\"OPENAI_API_VERSION\", \"2025-03-01-preview\")\n", - "\n", - "project_endpoint = os.environ.get(\"MICROSOFT_FOUNDRY_PROJECT_ENDPOINT\")\n", - "model_deployment = os.environ.get(\"AZURE_OPENAI_DEPLOYMENT\", \"gpt-4.1\")\n", - "base_model = os.environ.get(\"BASE_MODEL\", \"gpt-4.1-mini\")\n", - "teacher_model = os.environ.get(\"TEACHER_MODEL\", \"gpt-4.1\")\n", - "\n", - "UNIQUE_KEY = str(uuid.uuid4()).split(\"-\")[0]\n", - "print(f\"Run identifier: {UNIQUE_KEY}\")\n", - "print(f\"Project: {project_endpoint}\")" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "✓ Connected to Microsoft Foundry\n" - ] - } - ], - "source": [ - "# Create clients - this is the KEY difference from original demo\n", - "credential = DefaultAzureCredential()\n", - "project_client = AIProjectClient(endpoint=project_endpoint, credential=credential)\n", - "openai_client = project_client.get_openai_client()\n", - "\n", - "print(\"✓ Connected to Microsoft Foundry\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 3. 
Load Baseline Data" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Loaded 20 baseline examples\n", - "\n", - "Sample:\n", - "Q: Who discovered Antarctica?\n", - "A: Some chaps named Fabian Gottlieb von Bellingshausen and Mikhail Lazarev, as if they don't teach that in every school!\n" - ] - } - ], - "source": [ - "baseline_data = []\n", - "with open(\"../DistillingSarcasm/baseline.jsonl\", \"r\") as f:\n", - " for line in f:\n", - " baseline_data.append(json.loads(line))\n", - "\n", - "print(f\"Loaded {len(baseline_data)} baseline examples\")\n", - "print(f\"\\nSample:\")\n", - "print(f\"Q: {baseline_data[0]['question']}\")\n", - "print(f\"A: {baseline_data[0]['answer']}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 4. Define Sarcasm Grader" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Grader prompt defined\n" - ] - } - ], - "source": [ - "GRADER_PROMPT = \"\"\"\n", - "You are a connoisseur of finely crafted sarcasm. 
Your job is to evaluate a\n", - "question and answer pair and score it based on how sarcastic the answer is.\n", - "\n", - "# Guidelines\n", - " * Output ONLY a score.\n", - " * A score of 0 means the answer is not sarcastic at all.\n", - " * A score of 5 means the answer is moderately sarcastic.\n", - " * A score of 10 means the answer is dripping with sarcasm.\n", - "\n", - "# Scoring Features\n", - " * Exaggeration or overstatement of the obvious\n", - " * Rhetorical questions that mock the original question\n", - " * Use of phrases like \"Oh really?\", \"Shocking!\", \"Who knew?\"\n", - " * Intentional understatement for comedic effect\n", - " * Irony where literal meaning differs from intended meaning\n", - " * The answer must still be factually correct\n", - "\n", - "# Output Format\n", - "Provide ONLY a single decimal number between 0 and 1.\n", - "\"\"\"\n", - "\n", - "USER_PROMPT = \"\"\"\n", - "Q: {{item.question}}\n", - "A: {{item.answer}}\n", - "\"\"\"\n", - "\n", - "print(\"Grader prompt defined\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 5. 
Create Evaluation (Same API as Original!)" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "✓ Eval file: file-185f7d1d42464c4f91996a2442505ccb\n" - ] - } - ], - "source": [ - "# Upload baseline for evaluation\n", - "with open(\"../DistillingSarcasm/baseline.jsonl\", \"rb\") as f:\n", - " grader_eval_file = openai_client.files.create(purpose=\"evals\", file=f)\n", - " grader_eval_file = openai_client.files.wait_for_processing(grader_eval_file.id)\n", - "\n", - "print(f\"✓ Eval file: {grader_eval_file.id}\")" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Evaluation criteria defined\n" - ] - } - ], - "source": [ - "# Define evaluation schema\n", - "data_source_config = {\n", - " \"type\": \"custom\",\n", - " \"item_schema\": {\n", - " \"type\": \"object\",\n", - " \"properties\": {\n", - " \"question\": {\"type\": \"string\"},\n", - " \"answer\": {\"type\": \"string\"}\n", - " },\n", - " \"required\": [\"question\", \"answer\"]\n", - " }\n", - "}\n", - "\n", - "# Define grader as testing criteria\n", - "testing_criteria = [\n", - " {\n", - " \"type\": \"score_model\",\n", - " \"name\": \"sarcasm_score\",\n", - " \"model\": model_deployment,\n", - " \"input\": [\n", - " {\"role\": \"system\", \"content\": GRADER_PROMPT},\n", - " {\"role\": \"user\", \"content\": USER_PROMPT}\n", - " ],\n", - " \"pass_threshold\": 0.5\n", - " }\n", - "]\n", - "\n", - "print(\"Evaluation criteria defined\")" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "✓ Evaluation: eval_d341bcecf17245e48c9131906b3b73fc\n" - ] - } - ], - "source": [ - "# Create evaluation\n", - "grader_eval = openai_client.evals.create(\n", - " name=f\"sarcasm-grader-{UNIQUE_KEY}\",\n", 
- " data_source_config=data_source_config,\n", - " testing_criteria=testing_criteria\n", - ")\n", - "\n", - "print(f\"✓ Evaluation: {grader_eval.id}\")" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "✓ Eval run: evalrun_730150a66c8343c8a62edbbf95b75a4d\n" - ] - } - ], - "source": [ - "# Run evaluation on baseline\n", - "grader_run = openai_client.evals.runs.create(\n", - " eval_id=grader_eval.id,\n", - " name=f\"baseline-{UNIQUE_KEY}\",\n", - " data_source={\n", - " \"type\": \"jsonl\",\n", - " \"source\": {\"type\": \"file_id\", \"id\": grader_eval_file.id}\n", - " }\n", - ")\n", - "\n", - "print(f\"✓ Eval run: {grader_run.id}\")" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Waiting for evaluation...\n", - " Status: in_progress\n", - " Status: in_progress\n", - " Status: in_progress\n", - " Status: in_progress\n", - " Status: in_progress\n", - " Status: in_progress\n", - " Status: in_progress\n", - " Status: in_progress\n", - " Status: in_progress\n", - " Status: completed\n", - "\n", - "✓ completed!\n", - " Passed: 19/20\n" - ] - } - ], - "source": [ - "# Wait for completion\n", - "print(\"Waiting for evaluation...\")\n", - "while True:\n", - " run = openai_client.evals.runs.retrieve(run_id=grader_run.id, eval_id=grader_eval.id)\n", - " print(f\" Status: {run.status}\")\n", - " if run.status in [\"completed\", \"failed\"]:\n", - " break\n", - " time.sleep(5)\n", - "\n", - "print(f\"\\n✓ {run.status}!\")\n", - "if run.result_counts:\n", - " print(f\" Passed: {run.result_counts.passed}/{run.result_counts.total}\")" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Baseline Sarcasm Scores:\n", - " Average: 0.59\n", - " Std Dev: 
0.15\n" - ] - } - ], - "source": [ - "# Get scores\n", - "output_items = list(openai_client.evals.runs.output_items.list(run_id=grader_run.id, eval_id=grader_eval.id))\n", - "baseline_scores = [r.score for item in output_items for r in item.results if r.score is not None]\n", - "\n", - "print(f\"Baseline Sarcasm Scores:\")\n", - "print(f\" Average: {np.mean(baseline_scores):.2f}\")\n", - "print(f\" Std Dev: {np.std(baseline_scores):.2f}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 6. Test Inference" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Q: What is the capital of France?\n", - "A: Oh, what a stumper! The capital of France? Let me consult my ancient scrolls... Oh wait, it’s Paris. You know, the city with the Eiffel Tower, baguettes, and a population of people who perfected the art of looking unimpressed. Paris is the answer you’re searching for, unless France has pulled a fast one overnight.\n" - ] - } - ], - "source": [ - "SARCASM_SYSTEM_PROMPT = \"\"\"You are a sarcastically witty assistant. Answer questions with \n", - "biting wit while remaining factually correct. Channel your inner comedian who's \n", - "slightly annoyed at obvious questions.\"\"\"\n", - "\n", - "def generate_response(question: str, model: str = None) -> str:\n", - " \"\"\"Generate a sarcastic response using responses API.\"\"\"\n", - " response = openai_client.responses.create(\n", - " model=model or teacher_model,\n", - " instructions=SARCASM_SYSTEM_PROMPT,\n", - " input=question,\n", - " temperature=0.7,\n", - " max_output_tokens=150\n", - " )\n", - " return response.output_text\n", - "\n", - "# Test\n", - "test_q = \"What is the capital of France?\"\n", - "print(f\"Q: {test_q}\")\n", - "print(f\"A: {generate_response(test_q)}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 7. 
Load Q&A Data" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Training: 250, Validation: 250\n" - ] - } - ], - "source": [ - "qa_data = []\n", - "with open(\"../DistillingSarcasm/qa.jsonl\", \"r\") as f:\n", - " for line in f:\n", - " qa_data.append(json.loads(line))\n", - "\n", - "shuffle(qa_data)\n", - "split_idx = len(qa_data) // 2\n", - "training_questions = qa_data[:split_idx]\n", - "validation_questions = qa_data[split_idx:]\n", - "\n", - "print(f\"Training: {len(training_questions)}, Validation: {len(validation_questions)}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 8. Generate Training Data" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Generating 50 training examples...\n", - " 10/50...\n", - " 20/50...\n", - " 30/50...\n", - " 40/50...\n", - " 50/50...\n", - "\n", - "Generating 25 validation examples...\n", - "\n", - "✓ Generated 50 training, 25 validation\n" - ] - } - ], - "source": [ - "MAX_TRAINING = 50\n", - "MAX_VALIDATION = 25\n", - "\n", - "print(f\"Generating {MAX_TRAINING} training examples...\")\n", - "training_data = []\n", - "for i, item in enumerate(training_questions[:MAX_TRAINING]):\n", - " response = generate_response(item['question'])\n", - " training_data.append({\n", - " \"messages\": [\n", - " {\"role\": \"system\", \"content\": SARCASM_SYSTEM_PROMPT},\n", - " {\"role\": \"user\", \"content\": item['question']},\n", - " {\"role\": \"assistant\", \"content\": response}\n", - " ]\n", - " })\n", - " if (i + 1) % 10 == 0:\n", - " print(f\" {i + 1}/{MAX_TRAINING}...\")\n", - "\n", - "print(f\"\\nGenerating {MAX_VALIDATION} validation examples...\")\n", - "validation_data = []\n", - "for item in validation_questions[:MAX_VALIDATION]:\n", - " response = 
generate_response(item['question'])\n", - " validation_data.append({\n", - " \"messages\": [\n", - " {\"role\": \"system\", \"content\": SARCASM_SYSTEM_PROMPT},\n", - " {\"role\": \"user\", \"content\": item['question']},\n", - " {\"role\": \"assistant\", \"content\": response}\n", - " ]\n", - " })\n", - "\n", - "print(f\"\\n✓ Generated {len(training_data)} training, {len(validation_data)} validation\")" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "✓ training_sarcasm_4aa88349.jsonl\n", - "✓ validation_sarcasm_4aa88349.jsonl\n" - ] - } - ], - "source": [ - "# Save files\n", - "training_file_path = f\"training_sarcasm_{UNIQUE_KEY}.jsonl\"\n", - "validation_file_path = f\"validation_sarcasm_{UNIQUE_KEY}.jsonl\"\n", - "\n", - "with open(training_file_path, \"w\") as f:\n", - " for item in training_data:\n", - " f.write(json.dumps(item) + \"\\n\")\n", - "\n", - "with open(validation_file_path, \"w\") as f:\n", - " for item in validation_data:\n", - " f.write(json.dumps(item) + \"\\n\")\n", - "\n", - "print(f\"✓ {training_file_path}\")\n", - "print(f\"✓ {validation_file_path}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 9. 
Fine-Tune Model" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Training: file-e457cb500c77408db3a563f2dcfa986f\n", - "Validation: file-b7d2cf189fb9422eb9e57a5c454d7531\n", - "✓ Files ready\n" - ] - } - ], - "source": [ - "# Upload files\n", - "with open(training_file_path, \"rb\") as f:\n", - " train_file = openai_client.files.create(file=f, purpose=\"fine-tune\")\n", - "\n", - "with open(validation_file_path, \"rb\") as f:\n", - " val_file = openai_client.files.create(file=f, purpose=\"fine-tune\")\n", - "\n", - "print(f\"Training: {train_file.id}\")\n", - "print(f\"Validation: {val_file.id}\")\n", - "\n", - "openai_client.files.wait_for_processing(train_file.id)\n", - "openai_client.files.wait_for_processing(val_file.id)\n", - "print(\"✓ Files ready\")" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "✓ Job: ftjob-cbccca9eb73d4da991b925d045133219\n", - " Status: pending\n" - ] - } - ], - "source": [ - "# Create fine-tuning job\n", - "fine_tune_job = openai_client.fine_tuning.jobs.create(\n", - " model=base_model,\n", - " training_file=train_file.id,\n", - " validation_file=val_file.id,\n", - " method={\n", - " \"type\": \"supervised\",\n", - " \"supervised\": {\n", - " \"hyperparameters\": {\n", - " \"n_epochs\": 3,\n", - " \"batch_size\": 1,\n", - " \"learning_rate_multiplier\": 1.0\n", - " }\n", - " }\n", - " },\n", - " extra_body={\"trainingType\": \"Standard\"},\n", - " suffix=f\"sarcasm-{UNIQUE_KEY}\"\n", - ")\n", - "\n", - "print(f\"✓ Job: {fine_tune_job.id}\")\n", - "print(f\" Status: {fine_tune_job.status}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 10. 
Monitor Training" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Job: ftjob-71e2eea33ecd4d4e990c54ed09ada149\n", - "Status: succeeded\n", - "Model: gpt-4.1-mini-2025-04-14.ft-71e2eea33ecd4d4e990c54ed09ada149-sarcasm-bc6d797e\n" - ] - } - ], - "source": [ - "# Check status\n", - "job_id = fine_tune_job.id\n", - "job = openai_client.fine_tuning.jobs.retrieve(job_id)\n", - "print(f\"Job: {job.id}\")\n", - "print(f\"Status: {job.status}\")\n", - "if job.fine_tuned_model:\n", - " print(f\"Model: {job.fine_tuned_model}\")" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Waiting for fine-tuning...\n", - " [00:00] succeeded\n", - "\n", - "✓ Model: gpt-4.1-mini-2025-04-14.ft-71e2eea33ecd4d4e990c54ed09ada149-sarcasm-bc6d797e\n" - ] - } - ], - "source": [ - "# Wait for completion (can take 10-30 min)\n", - "print(\"Waiting for fine-tuning...\")\n", - "start_time = time.time()\n", - "\n", - "while True:\n", - " job = openai_client.fine_tuning.jobs.retrieve(job_id)\n", - " elapsed = int(time.time() - start_time)\n", - " print(f\" [{elapsed//60:02d}:{elapsed%60:02d}] {job.status}\")\n", - " \n", - " if job.status in [\"succeeded\", \"failed\", \"cancelled\"]:\n", - " break\n", - " time.sleep(30)\n", - "\n", - "if job.status == \"succeeded\":\n", - " fine_tuned_model_id = job.fine_tuned_model\n", - " print(f\"\\n✓ Model: {fine_tuned_model_id}\")\n", - "else:\n", - " print(f\"\\n✗ {job.status}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 11. 
# Deploy Model
#
# Creates (or updates) an Azure OpenAI deployment named `sarcasm-ft-<UNIQUE_KEY>`
# that serves the fine-tuned model. Relies on `credential`, `UNIQUE_KEY`,
# `fine_tuned_model_id` and `os` from earlier cells.

# %% Deploy the fine-tuned model
from azure.mgmt.cognitiveservices import CognitiveServicesManagementClient
from azure.mgmt.cognitiveservices.models import Deployment, DeploymentProperties, DeploymentModel, Sku

# Fail fast with a readable message when the .env settings are incomplete;
# otherwise the unset values reach the management client as None.
_required_settings = ("AZURE_SUBSCRIPTION_ID", "AZURE_RESOURCE_GROUP", "AZURE_AOAI_ACCOUNT")
_missing_settings = [name for name in _required_settings if not os.environ.get(name)]
if _missing_settings:
    raise RuntimeError(
        "Missing required environment variables: " + ", ".join(_missing_settings)
    )

subscription_id = os.environ["AZURE_SUBSCRIPTION_ID"]
resource_group = os.environ["AZURE_RESOURCE_GROUP"]
account_name = os.environ["AZURE_AOAI_ACCOUNT"]

deployment_name = f"sarcasm-ft-{UNIQUE_KEY}"

with CognitiveServicesManagementClient(credential=credential, subscription_id=subscription_id) as mgmt:
    deployment_model = DeploymentModel(format="OpenAI", name=fine_tuned_model_id, version="1")
    deployment_properties = DeploymentProperties(model=deployment_model)
    deployment_sku = Sku(name="GlobalStandard", capacity=50)
    deployment_config = Deployment(properties=deployment_properties, sku=deployment_sku)

    print(f"Deploying: {fine_tuned_model_id}")
    deployment = mgmt.deployments.begin_create_or_update(
        resource_group_name=resource_group,
        account_name=account_name,
        deployment_name=deployment_name,
        deployment=deployment_config,
    )
    # begin_create_or_update returns a poller; result() blocks until the
    # operation finishes, so it must run while the client is still open.
    deployment.result()

print(f"✓ Deployed: {deployment_name}")
# Compare Models
#
# Answers the same held-out questions with the base model and with the
# fine-tuned deployment, writes one JSONL file per model, and grades both files
# with the existing grader eval. Relies on `validation_questions`,
# `MAX_VALIDATION`, `generate_response`, `base_model`, `deployment_name`,
# `grader_eval`, `openai_client`, `UNIQUE_KEY`, `json` and `time` from earlier
# cells.

# %% Generate comparison data
print("Generating comparison data...")
comparison_data = []
# Take the 20 questions that follow the first MAX_VALIDATION entries.
test_questions = validation_questions[MAX_VALIDATION:MAX_VALIDATION + 20]

for item in test_questions:
    q = item["question"]
    comparison_data.append({
        "question": q,
        "base_answer": generate_response(q, model=base_model),
        "ft_answer": generate_response(q, model=deployment_name),
    })

print(f"✓ {len(comparison_data)} comparisons")

# %% Save for evaluation
base_eval_file = f"base_eval_{UNIQUE_KEY}.jsonl"
ft_eval_file = f"ft_eval_{UNIQUE_KEY}.jsonl"


def _write_eval_jsonl(path, answer_key):
    """Write one ``{"question", "answer"}`` JSON line per comparison to *path*.

    Args:
        path: Destination file; overwritten if it already exists.
        answer_key: Key in each ``comparison_data`` entry holding the answer
            to export (``"base_answer"`` or ``"ft_answer"``).
    """
    with open(path, "w", encoding="utf-8") as out:
        for row in comparison_data:
            out.write(json.dumps({"question": row["question"], "answer": row[answer_key]}) + "\n")


_write_eval_jsonl(base_eval_file, "base_answer")
_write_eval_jsonl(ft_eval_file, "ft_answer")

print("✓ Evaluation files saved")

# %% Upload and run evaluations
def _upload_eval_file(path):
    """Upload *path* with purpose ``evals`` and return the processed file object.

    Raises:
        RuntimeError: If processing ends in any state other than ``processed``.
    """
    with open(path, "rb") as f:
        uploaded = openai_client.files.create(purpose="evals", file=f)
    # wait_for_processing() also returns when processing ended in error, so
    # the final status has to be checked explicitly.
    processed = openai_client.files.wait_for_processing(uploaded.id)
    if processed.status != "processed":
        raise RuntimeError(
            f"Evaluation file {path} was not processed successfully "
            f"(status: {processed.status})"
        )
    return processed


def _start_eval_run(run_name, file_id):
    """Start a run of the grader eval over the uploaded JSONL file *file_id*."""
    return openai_client.evals.runs.create(
        eval_id=grader_eval.id,
        name=run_name,
        data_source={"type": "jsonl", "source": {"type": "file_id", "id": file_id}},
    )


base_file = _upload_eval_file(base_eval_file)
ft_file = _upload_eval_file(ft_eval_file)

base_run = _start_eval_run(f"base-{UNIQUE_KEY}", base_file.id)
ft_run = _start_eval_run(f"finetuned-{UNIQUE_KEY}", ft_file.id)

print(f"Base run: {base_run.id}")
print(f"Fine-tuned run: {ft_run.id}")

# %% Wait for both
# Treat cancellation as terminal as well (both spellings), otherwise a
# cancelled run would keep this loop polling forever.
terminal_run_states = ("completed", "failed", "canceled", "cancelled")
unfinished_runs = []

for run_id, name in [(base_run.id, "Base"), (ft_run.id, "Fine-tuned")]:
    while True:
        run = openai_client.evals.runs.retrieve(run_id=run_id, eval_id=grader_eval.id)
        if run.status in terminal_run_states:
            print(f"{name}: {run.status}")
            break
        time.sleep(5)
    if run.status != "completed":
        unfinished_runs.append(f"{name} ({run.status})")

# Only report success when every run actually completed.
if unfinished_runs:
    print("✗ Evaluation runs did not complete: " + ", ".join(unfinished_runs))
else:
    print("✓ Evaluations complete")
# Results
#
# Collects the grader scores of both evaluation runs, prints a summary, plots
# the averages and removes the local JSONL files. Relies on `openai_client`,
# `grader_eval`, `base_run`, `ft_run`, `baseline_scores`, `np`, `plt`, `os` and
# the four file-path variables from earlier cells.

# %% Collect scores and print the summary
def get_scores(eval_id, run_id):
    """Return every non-null score recorded for one evaluation run.

    Args:
        eval_id: Id of the eval the run belongs to.
        run_id: Id of the run whose output items are read.

    Returns:
        A flat list with the ``score`` of each entry in each output item's
        ``results``, skipping entries whose score is ``None``.
    """
    items = openai_client.evals.runs.output_items.list(run_id=run_id, eval_id=eval_id)
    return [r.score for item in items for r in item.results if r.score is not None]


base_scores = get_scores(grader_eval.id, base_run.id)
ft_scores = get_scores(grader_eval.id, ft_run.id)

# An empty score list would make np.mean/np.std return nan and produce a
# meaningless summary and chart, so stop with an explicit message instead.
for _label, _scores in (("base model", base_scores), ("fine-tuned model", ft_scores)):
    if not _scores:
        raise RuntimeError(
            f"No scores were returned for the {_label} run; "
            "check that the evaluation run completed."
        )

base_mean = np.mean(base_scores)
ft_mean = np.mean(ft_scores)

print("=" * 50)
print("RESULTS")
print("=" * 50)
print(f"Baseline (gold): {np.mean(baseline_scores):.2f}")
print(f"Base model: {base_mean:.2f} (±{np.std(base_scores):.2f})")
print(f"Fine-tuned: {ft_mean:.2f} (±{np.std(ft_scores):.2f})")
print(f"\nImprovement: {ft_mean - base_mean:+.2f}")
# Output recorded in the committed notebook run:
#   Baseline (gold): 0.59 / Base model: 0.82 (±0.21) / Fine-tuned: 0.94 (±0.06)
#   Improvement: +0.12

# %% Visualize
fig, ax = plt.subplots(figsize=(8, 5))

models = ['Baseline', 'Base Model', 'Fine-Tuned']
avgs = [np.mean(baseline_scores), base_mean, ft_mean]
colors = ['#e74c3c', '#3498db', '#2ecc71']

bars = ax.bar(models, avgs, color=colors)
ax.set_ylabel('Sarcasm Score')
ax.set_title('Model Comparison')
# The axis is drawn for a 0-1 score range; the extra 0.1 leaves room for the
# value label above a bar that is close to 1.0.
ax.set_ylim(0, 1.1)

for bar, v in zip(bars, avgs):
    # Small offset relative to the 0-1 scale, and two decimals to match the
    # printed summary (one decimal would show 0.82 and 0.94 as 0.8 and 0.9).
    ax.text(
        bar.get_x() + bar.get_width() / 2,
        v + 0.02,
        f'{v:.2f}',
        ha='center',
        va='bottom',
        fontweight='bold',
    )

plt.tight_layout()
plt.show()

# %% Cleanup
# Clean up local files. Files that are already gone are skipped silently.
for path in [training_file_path, validation_file_path, base_eval_file, ft_eval_file]:
    try:
        os.remove(path)
    except FileNotFoundError:
        continue
    print(f"Deleted: {path}")

print("\n✓ Cleanup complete")

# Notebook metadata recorded in the source file: kernelspec "python3"
# (display name ".venv (3.12.3)"), Python 3.12.3, nbformat 4.4.