diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5579dc1..96adae6 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -1,63 +1,31 @@ -name: CI +name: Check on: - workflow_dispatch: - pull_request: push: branches: - main - 'renovate/**' + pull_request: jobs: - check: - name: Code Quality Checks + python: runs-on: ubuntu-latest - steps: - - name: Clone the code - uses: 'actions/checkout@v5' + - name: Checkout + uses: 'actions/checkout@v6' - name: Install uv - uses: 'astral-sh/setup-uv@v6' + uses: 'astral-sh/setup-uv@v7' with: - version: "0.8.17" + version: "0.9.26" enable-cache: true - - name: Setup Python - uses: 'actions/setup-python@v6' - with: - python-version-file: ".python-version" - - name: Install Dependencies run: uv sync - name: Run Checks run: uv run poe check - test: - name: Unit Tests - runs-on: ubuntu-latest - permissions: - contents: 'read' - - steps: - - name: Clone the code - uses: 'actions/checkout@v5' - - - name: Install uv - uses: 'astral-sh/setup-uv@v6' - with: - version: "0.8.17" - enable-cache: true - - - name: Setup Python - uses: 'actions/setup-python@v6' - with: - python-version-file: ".python-version" - - - name: Install Dependencies - run: uv sync - - name: Run Tests run: uv run poe test @@ -83,6 +51,20 @@ jobs: with: github_token: ${{ secrets.GITHUB_TOKEN }} + helm: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: 'actions/checkout@v6' + + - name: Install Helm + uses: azure/setup-helm@v4 + with: + version: 'v3.14.0' + + - name: Lint Helm Chart + run: helm lint chart/ + test-e2e: name: E2E Tests runs-on: ubuntu-latest @@ -92,8 +74,8 @@ jobs: GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }} steps: - - name: Clone the code - uses: 'actions/checkout@v5' + - name: Checkout + uses: 'actions/checkout@v6' - name: Extract image tag id: extract-tag diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index c51fc06..2201ba6 100644 --- a/.github/workflows/release.yml +++ 
b/.github/workflows/release.yml @@ -12,11 +12,10 @@ jobs: permissions: contents: write packages: write - id-token: write steps: - - name: 'Checkout' - uses: 'actions/checkout@v5' + - name: Checkout + uses: 'actions/checkout@v6' with: fetch-depth: 0 @@ -50,3 +49,75 @@ jobs: --title="${tag#v}" \ --generate-notes \ install.yaml + + helm-chart: + runs-on: ubuntu-latest + permissions: + contents: read + packages: write + + steps: + - name: Checkout + uses: 'actions/checkout@v6' + with: + fetch-depth: 0 + + - name: Install Helm + uses: azure/setup-helm@v4 + with: + version: 'v3.14.0' + + - name: Determine Chart Version + id: version + env: + # Expose GitHub context via env vars; never interpolate it into the script body (a crafted ref name could inject shell commands) + GIT_REF: ${{ github.ref }} + GIT_REF_NAME: ${{ github.ref_name }} + GIT_SHA: ${{ github.sha }} + run: | + if [[ "$GIT_REF" == refs/tags/v* ]]; then + # For tags: use semver (e.g., v1.2.3 → 1.2.3) + VERSION="${GIT_REF_NAME}" + VERSION="${VERSION#v}" # Remove 'v' prefix + else + # For branches/PRs: use pre-release version (e.g., 0.0.0-main-abc1234) + GIT_BRANCH="${GIT_REF_NAME}" + # Replace slashes with dashes for branch names (e.g., feature/foo → feature-foo) + GIT_BRANCH="${GIT_BRANCH//\//-}" + VERSION="0.0.0-${GIT_BRANCH}-${GIT_SHA:0:7}" + fi + echo "version=$VERSION" >> "$GITHUB_OUTPUT" + echo "Chart Version: $VERSION" + + - name: Update Chart Version + env: + CHART_VERSION: ${{ steps.version.outputs.version }} + run: | + # Update Chart.yaml with dynamic version + sed -i "s|^version:.*|version: ${CHART_VERSION}|" chart/Chart.yaml + # Keep appVersion quoted so YAML always parses it as a string (e.g., 1.20 must not become 1.2) + sed -i "s|^appVersion:.*|appVersion: \"${CHART_VERSION}\"|" chart/Chart.yaml + + - name: Update Image Tags for Release + env: + CHART_VERSION: ${{ steps.version.outputs.version }} + run: | + # Update image tags in values.yaml to use the versioned images + # This assumes Docker images were built with the same git tag + sed -i "s|tag: latest|tag: ${CHART_VERSION}|g" chart/values.yaml + + - name: Package Helm Chart + run: | + helm package chart/ --destination . 
+ + - name: Log into Docker Registry + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Push Helm Chart to OCI Registry + if: startsWith(github.ref, 'refs/tags/v') + run: | + helm push *.tgz oci://ghcr.io/${{ github.repository_owner }}/charts diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index b85083d..56ac341 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -5,6 +5,7 @@ repos: hooks: - id: check-yaml args: [--allow-multiple-documents] + exclude: ^chart/templates/ - id: check-ast - id: check-case-conflict - id: check-json @@ -17,13 +18,6 @@ repos: - id: requirements-txt-fixer - id: mixed-line-ending - - repo: https://github.com/astral-sh/uv-pre-commit - # uv version. - rev: 0.9.11 - hooks: - # Update the uv lockfile - - id: uv-lock - - repo: local hooks: - id: python-lint @@ -32,9 +26,17 @@ repos: entry: "uv run poe lint" types_or: [ python ] pass_filenames: false + - id: python-ruff name: python-ruff language: system entry: "uv run poe ruff" types_or: [ python ] pass_filenames: false + + - id: helm-lint + name: Helm Lint + entry: helm lint chart/ + language: system + pass_filenames: false + files: ^chart/ diff --git a/Tiltfile b/Tiltfile index d67eba0..990a212 100644 --- a/Tiltfile +++ b/Tiltfile @@ -38,11 +38,22 @@ helm_resource( 'testkube', 'oci://docker.io/kubeshop/testkube', namespace='testkube', - flags=['--version=2.5.3', '--create-namespace', '--values=deploy/local/testkube/values.yaml', '--wait', + flags=['--version=2.5.3', '--values=deploy/local/testkube/values.yaml', '--wait', '--wait-for-jobs', '--timeout=10m'], ) -# Apply Kubernetes manifests +# Deploy testbench Helm chart +k8s_yaml(helm( + 'chart', + name='testbench', + namespace='testkube', + values=['chart/values.yaml'], + set=[ + 'image.repository=testworkflows', + ], +)) + +# Apply local development manifests k8s_yaml(kustomize('deploy/local')) 
k8s_resource('ai-gateway-litellm', port_forwards=['11001:4000']) diff --git a/chart/.helmignore b/chart/.helmignore new file mode 100644 index 0000000..0e8a0eb --- /dev/null +++ b/chart/.helmignore @@ -0,0 +1,23 @@ +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. +.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/chart/Chart.yaml b/chart/Chart.yaml new file mode 100644 index 0000000..5d62f56 --- /dev/null +++ b/chart/Chart.yaml @@ -0,0 +1,18 @@ +apiVersion: v2 +name: testbench +description: A Helm chart for RAGAS-based agent evaluation via Testkube workflows +type: application +version: 0.1.0 +appVersion: "0.1.0" +keywords: + - ragas + - testkube + - evaluation + - testing + - a2a +maintainers: + - name: Agentic Layer Team + url: https://docs.agentic-layer.ai/ +home: https://github.com/agentic-layer/testbench +sources: + - https://github.com/agentic-layer/testbench diff --git a/chart/dashboards/evaluation-dashboard.json b/chart/dashboards/evaluation-dashboard.json new file mode 100644 index 0000000..46b95a7 --- /dev/null +++ b/chart/dashboards/evaluation-dashboard.json @@ -0,0 +1,1033 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 5, + "links": [], + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + 
"axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 15, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.3.1", + "targets": [ + { + "editorMode": "builder", + "expr": "max by(type) (avg by(execution_number, type) (testbench_evaluation_token_usage{workflow_name=\"$workflow\"}))", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Token Usage Over Time", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "fillOpacity": 80, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineWidth": 1, + "scaleDistribution": { + "type": "linear" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "mappings": [], + "thresholds": { + 
"mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "id": 16, + "options": { + "barRadius": 0, + "barWidth": 0.97, + "fullHighlight": false, + "groupWidth": 0.7, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "orientation": "auto", + "showValue": "auto", + "stacking": "none", + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + }, + "xTickLabelRotation": 0, + "xTickLabelSpacing": 0 + }, + "pluginVersion": "12.3.1", + "targets": [ + { + "editorMode": "builder", + "expr": "avg by(execution_number) (testbench_evaluation_cost{workflow_name=\"$workflow\"})", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Cost Per Test (USD)", + "transformations": [ + { + "id": "labelsToFields", + "options": {} + }, + { + "id": "groupBy", + "options": { + "fields": { + "Value": { + "aggregations": [ + "lastNotNull" + ], + "operation": "aggregate" + }, + "execution_number": { + "aggregations": [], + "operation": "groupby" + } + } + } + }, + { + "id": "merge", + "options": {} + } + ], + "type": "barchart" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "series", + "axisLabel": "", + "axisPlacement": "auto", + "fillOpacity": 80, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineWidth": 1, + "scaleDistribution": { + "type": "linear" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "fieldMinMax": false, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } 
+ }, + "overrides": [ + { + "__systemRef": "hideSeriesFrom", + "matcher": { + "id": "byNames", + "options": { + "mode": "exclude", + "names": [ + "Value (lastNotNull) input_tokens" + ], + "prefix": "All except:", + "readOnly": true + } + }, + "properties": [ + { + "id": "custom.hideFrom", + "value": { + "legend": false, + "tooltip": true, + "viz": true + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 16 + }, + "id": 18, + "options": { + "barRadius": 0, + "barWidth": 0.97, + "colorByField": "execution_number", + "fullHighlight": false, + "groupWidth": 0.7, + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "orientation": "auto", + "showValue": "auto", + "stacking": "none", + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + }, + "xTickLabelRotation": 0, + "xTickLabelSpacing": 0 + }, + "pluginVersion": "12.3.1", + "targets": [ + { + "editorMode": "builder", + "expr": "avg by(execution_number, type) (testbench_evaluation_token_usage{workflow_name=\"$workflow\", type=\"input_tokens\"})", + "legendFormat": "__auto", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "builder", + "expr": "avg by(execution_number, type) (testbench_evaluation_token_usage{workflow_name=\"$workflow\", type=\"output_tokens\"})", + "hide": false, + "legendFormat": "__auto", + "range": true, + "refId": "B" + } + ], + "title": "Tokens per Test", + "transformations": [ + { + "id": "labelsToFields", + "options": {} + }, + { + "id": "merge", + "options": {} + }, + { + "id": "sortBy", + "options": { + "fields": {}, + "sort": [ + { + "field": "execution_number" + } + ] + } + }, + { + "id": "groupBy", + "options": { + "fields": { + "Value": { + "aggregations": [ + "lastNotNull" + ], + "operation": "aggregate" + }, + "execution_number": { + "aggregations": [], + "operation": "groupby" + }, + "type": { + "aggregations": 
[], + "operation": "groupby" + } + } + } + }, + { + "id": "groupingToMatrix", + "options": { + "columnField": "type", + "rowField": "execution_number", + "valueField": "Value (lastNotNull)" + } + } + ], + "type": "barchart" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 24 + }, + "id": 4, + "panels": [], + "repeat": "metric", + "title": "Evaluation of $metric", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 25 + }, + "id": 11, + "options": { + "legend": { + "calcs": [ + "last" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": false, + "sortBy": "Name", + "sortDesc": false + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "12.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "avg by(name) (last_over_time(testbench_evaluation_metric{name=\"$metric\", 
workflow_name=\"$workflow\"}[$__interval]))", + "hide": false, + "instant": false, + "legendFormat": "Average", + "range": true, + "refId": "B" + } + ], + "title": "Average $metric over time", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "fillOpacity": 50, + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "pointShape": "square", + "pointSize": { + "fixed": 7 + }, + "pointStrokeWidth": 1, + "scaleDistribution": { + "type": "linear" + }, + "show": "points+lines" + }, + "fieldMinMax": false, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 25 + }, + "id": 14, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "mapping": "auto", + "series": [ + { + "frame": { + "matcher": { + "id": "byIndex", + "options": 0 + } + }, + "x": { + "matcher": { + "id": "byName", + "options": "execution_number" + } + }, + "y": { + "matcher": { + "id": "byName", + "options": "Value" + } + } + } + ], + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "builder", + "expr": "avg by(execution_number) (testbench_evaluation_metric{workflow_name=\"$workflow\", name=~\"$metric\"})", + "hide": false, + "instant": false, + "legendFormat": "Average", + "range": true, + "refId": "A" + } + ], + "title": "Average $metric per test", + "transformations": [ + { + "id": 
"labelsToFields", + "options": { + "mode": "columns" + } + }, + { + "id": "convertFieldType", + "options": { + "conversions": [ + { + "destinationType": "number", + "enumConfig": { + "text": [ + "62" + ] + }, + "targetField": "execution_number" + } + ], + "fields": {} + } + }, + { + "id": "merge", + "options": {} + } + ], + "type": "xychart" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 33 + }, + "id": 8, + "options": { + "legend": { + "calcs": [ + "last" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Name", + "sortDesc": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "12.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "max by(sample_hash, user_input_truncated) (last_over_time(testbench_evaluation_metric{name=\"$metric\", workflow_name=\"$workflow\"}[$__interval]))", + "hide": false, + "instant": 
false, + "legendFormat": "Question: \"{{user_input_truncated}}\"", + "range": true, + "refId": "A" + } + ], + "title": "Individual $metric results over time", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "fillOpacity": 50, + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "pointShape": "square", + "pointSize": { + "fixed": 7 + }, + "pointStrokeWidth": 1, + "scaleDistribution": { + "type": "linear" + }, + "show": "points+lines" + }, + "fieldMinMax": false, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 33 + }, + "id": 13, + "options": { + "legend": { + "calcs": [ + "last" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Name", + "sortDesc": true + }, + "mapping": "auto", + "series": [ + { + "x": { + "matcher": { + "id": "byName", + "options": "execution_number (lastNotNull)" + } + }, + "y": { + "matcher": { + "id": "byName", + "options": "Value (lastNotNull)" + } + } + } + ], + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "builder", + "expr": "avg by(sample_hash, trace_id, user_input_truncated, execution_number) (testbench_evaluation_metric{workflow_name=\"$workflow\", name=~\"$metric\"})", + "hide": false, + "instant": false, + "legendFormat": "Question: \"{{user_input_truncated}}\"", + "range": true, + "refId": "A" + } + ], + "title": "Individual $metric results 
per test", + "transformations": [ + { + "id": "labelsToFields", + "options": { + "mode": "columns" + } + }, + { + "id": "convertFieldType", + "options": { + "conversions": [ + { + "destinationType": "number", + "targetField": "execution_number" + } + ], + "fields": {} + } + }, + { + "id": "groupBy", + "options": { + "fields": { + "Time": { + "aggregations": [ + "last" + ], + "operation": "aggregate" + }, + "Value": { + "aggregations": [ + "lastNotNull" + ], + "operation": "aggregate" + }, + "execution_number": { + "aggregations": [ + "lastNotNull" + ], + "operation": "aggregate" + }, + "sample_hash": { + "aggregations": [] + }, + "trace_id": { + "aggregations": [ + "lastNotNull" + ], + "operation": "aggregate" + }, + "user_input_truncated": { + "aggregations": [], + "operation": "groupby" + } + } + } + }, + { + "id": "merge", + "options": {} + }, + { + "id": "seriesToRows", + "options": {} + }, + { + "id": "partitionByValues", + "options": { + "fields": [ + "user_input_truncated" + ], + "keepFields": false, + "naming": { + "asLabels": false + } + } + }, + { + "id": "renameByRegex", + "options": { + "regex": "(.*?) 
Value \\(lastNotNull\\)", + "renamePattern": "$1" + } + } + ], + "type": "xychart" + } + ], + "preload": false, + "schemaVersion": 42, + "tags": [], + "templating": { + "list": [ + { + "current": { + "text": "multi-turn-workflow", + "value": "multi-turn-workflow" + }, + "definition": "label_values(workflow_name)", + "description": "name of the TestWorkflow CR", + "label": "Workflow Name", + "name": "workflow", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(workflow_name)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "sort": 5, + "type": "query" + }, + { + "allowCustomValue": false, + "current": { + "text": "All", + "value": "$__all" + }, + "definition": "label_values(testbench_evaluation_metric,name)", + "includeAll": true, + "name": "metric", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(testbench_evaluation_metric,name)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "type": "query" + } + ] + }, + "time": { + "from": "now-15m", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "Workflow Evaluations", + "uid": "g9lx76", + "version": 1 +} diff --git a/chart/dashboards/testkube-dashboard.json b/chart/dashboards/testkube-dashboard.json new file mode 100644 index 0000000..85cf3fd --- /dev/null +++ b/chart/dashboards/testkube-dashboard.json @@ -0,0 +1,316 @@ +{ + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "8.1.2" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "stat", + "name": "Stat", + "version": "" + }, + { + "type": "panel", + "id": "timeseries", + "name": "Time series", + "version": "" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + 
"name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "id": null, + "links": [], + "panels": [ + { + "datasource": {"type": "prometheus"}, + "description": "Executions distributions", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": -1, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 3, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "always", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 2, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "exemplar": true, + "expr": "testkube_testworkflow_executions_count_total{result=\"passed\"}", + "hide": false, + "instant": false, + "interval": "1", + "legendFormat": "{{name}}", + "refId": "A" + }, + { + "hide": false, + "refId": "B" + } + ], + "title": "SUCCESSFUL workflow executions", + "transparent": true, + "type": "timeseries" + }, + { + "datasource": {"type": "prometheus"}, + "description": "Executions distributions", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": -1, + "drawStyle": "line", + "fillOpacity": 0, + 
"gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 3, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "always", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 4, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "exemplar": true, + "expr": "testkube_testworkflow_executions_count_total{result=\"failed\"}", + "hide": false, + "instant": false, + "interval": "1", + "legendFormat": "{{name}}", + "refId": "A" + }, + { + "hide": false, + "refId": "B" + } + ], + "title": "FAILED workflow executions", + "transparent": true, + "type": "timeseries" + }, + { + "datasource": {"type": "prometheus"}, + "description": "Executions distributions", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 9 + }, + "id": 3, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "8.1.2", + "targets": [ + { + "exemplar": true, + "expr": "testkube_testworkflow_executions_count_total{}", + "format": "table", + "hide": false, + "instant": false, + 
"interval": "1", + "intervalFactor": 10, + "legendFormat": "{{type}} created", + "refId": "A" + }, + { + "hide": false, + "refId": "B" + } + ], + "title": "Total workflow executions", + "transparent": true, + "type": "stat" + } + ], + "schemaVersion": 30, + "style": "dark", + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Testkube", + "uid": "jMtrP3nnz", + "version": 2 +} diff --git a/chart/templates/NOTES.txt b/chart/templates/NOTES.txt new file mode 100644 index 0000000..fc8e018 --- /dev/null +++ b/chart/templates/NOTES.txt @@ -0,0 +1,31 @@ +Thank you for installing {{ .Chart.Name }}! + +Your release is named: {{ .Release.Name }} +Installed in namespace: {{ include "testbench.namespace" . }} + +TestWorkflow Templates have been created: + - ragas-setup-template + - ragas-run-template + - ragas-evaluate-template + - ragas-publish-template + - ragas-visualize-template + +To view the templates: + kubectl get testworkflowtemplates -n {{ include "testbench.namespace" . }} + +To run a complete evaluation workflow: + kubectl testkube run testworkflow <your-workflow-name> \ + --config bucket="your-bucket" \ + --config key="your-dataset.csv" \ + --config agentUrl="http://agent-service:8000" \ + --config model="gemini-2.5-flash-lite" \ + -n {{ include "testbench.namespace" . }} + +To watch workflow execution: + kubectl testkube watch testworkflow -n {{ include "testbench.namespace" . }} + +Grafana dashboards have been deployed to: {{ .Values.grafana.dashboardNamespace }} + ConfigMap: {{ .Values.grafana.dashboardConfigMapName }} + +For more information, visit: + {{ .Chart.Home }} diff --git a/chart/templates/_helpers.tpl b/chart/templates/_helpers.tpl new file mode 100644 index 0000000..b0554ce --- /dev/null +++ b/chart/templates/_helpers.tpl @@ -0,0 +1,72 @@ +{{/* +Expand the name of the chart. 
+*/}}
+{{- define "testbench.name" -}}
+{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }}
+{{- end }}
+
+{{/*
+Create a default fully qualified app name (fullnameOverride, else release name combined with chart name), capped at 63 characters.
+*/}}
+{{- define "testbench.fullname" -}}
+{{- if .Values.fullnameOverride }}
+{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }}
+{{- else }}
+{{- $name := default .Chart.Name .Values.nameOverride }}
+{{- if contains $name .Release.Name }}
+{{- .Release.Name | trunc 63 | trimSuffix "-" }}
+{{- else }}
+{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }}
+{{- end }}
+{{- end }}
+{{- end }}
+
+{{/*
+Create chart name and version as used by the chart label ("+" is replaced by "_" and the result is capped at 63 characters).
+*/}}
+{{- define "testbench.chart" -}}
+{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }}
+{{- end }}
+
+{{/*
+Common labels: chart label, selector labels, optional app version, and the managing service.
+*/}}
+{{- define "testbench.labels" -}}
+helm.sh/chart: {{ include "testbench.chart" . }}
+{{ include "testbench.selectorLabels" . }}
+{{- if .Chart.AppVersion }}
+app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
+{{- end }}
+app.kubernetes.io/managed-by: {{ .Release.Service }}
+{{- end }}
+
+{{/*
+Selector labels: chart name plus release name.
+*/}}
+{{- define "testbench.selectorLabels" -}}
+app.kubernetes.io/name: {{ include "testbench.name" . }}
+app.kubernetes.io/instance: {{ .Release.Name }}
+{{- end }}
+
+{{/*
+Testkube workflow labels added to every TestWorkflowTemplate of this chart.
+*/}}
+{{- define "testbench.workflowLabels" -}}
+testkube.io/test-category: ragas-evaluation
+app: testworkflows
+{{- end }}
+
+{{/*
+Namespace for chart resources: .Values.namespace.name, or the release namespace when that value is empty.
+*/}}
+{{- define "testbench.namespace" -}}
+{{- default .Release.Namespace .Values.namespace.name }}
+{{- end }}
+
+{{/*
+Image reference as "repository:tag"; the tag falls back to the chart appVersion when image.tag is empty.
+*/}}
+{{- define "testbench.image" -}}
+{{- $tag := .Values.image.tag | default .Chart.AppVersion }}
+{{- printf "%s:%s" .Values.image.repository $tag }}
+{{- end }}
diff --git a/chart/templates/evaluate-template.yaml b/chart/templates/evaluate-template.yaml
new file mode 100644
index 0000000..4ff1f57
--- /dev/null
+++ b/chart/templates/evaluate-template.yaml
@@ -0,0 +1,42 @@
+apiVersion: testworkflows.testkube.io/v1
+kind: TestWorkflowTemplate
+metadata:
+  name: ragas-evaluate-template
+  namespace: {{ include "testbench.namespace" . | quote }}
+  labels:
+    {{- include "testbench.labels" . | nindent 4 }}
+    {{- include "testbench.workflowLabels" . | nindent 4 }}
+spec:
+  # Configuration parameters that can be overridden
+  config:
+    model:
+      type: string
+      description: "Model name to use for evaluation (e.g., gemini-2.5-flash-lite)"
+    openApiBasePath:
+      type: string
+      description: "Base path for OpenAI API"
+      default: "http://ai-gateway-litellm.ai-gateway:4000"
+
+  # Steps to execute (the shell wrapper fails the step when the scores file is missing)
+  steps:
+    - name: evaluate-results
+      artifacts:
+        paths:
+          - "data/results/evaluation_scores.json"
+      run:
+        command:
+          - sh
+          - -c
+        args:
+          - |
+            uv run python3 evaluate.py "{{`{{ config.model }}`}}" --metrics-config "/app/config/metrics.yaml" && \
+            if [ -f data/results/evaluation_scores.json ]; then
+              echo "✓ Evaluation completed"
+              cat data/results/evaluation_scores.json
+            else
+              echo "✗ Error: Results file not created"
+              exit 1
+            fi
+        env:
+          - name: OPENAI_API_BASE
+            value: "{{`{{ config.openApiBasePath }}`}}"
diff --git a/chart/templates/grafana-dashboards.yaml b/chart/templates/grafana-dashboards.yaml
new file mode 100644
index 0000000..f4fb221
--- /dev/null
+++ b/chart/templates/grafana-dashboards.yaml
@@ -0,0 +1,14 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: {{ .Values.grafana.dashboardConfigMapName | quote }}
+  namespace: {{ .Values.grafana.dashboardNamespace | quote }}
+  labels:
+    {{- include "testbench.labels" . | nindent 4 }}
+    grafana_dashboard: "1"
+# Each key embeds a JSON file read from the chart's dashboards/ directory, indented to sit under the block scalar.
+data:
+  evaluation-dashboard.json: |
+{{ .Files.Get "dashboards/evaluation-dashboard.json" | indent 4 }}
+  testkube-dashboard.json: |
+{{ .Files.Get "dashboards/testkube-dashboard.json" | indent 4 }}
diff --git a/chart/templates/namespace.yaml b/chart/templates/namespace.yaml
new file mode 100644
index 0000000..a39e415
--- /dev/null
+++ b/chart/templates/namespace.yaml
@@ -0,0 +1,8 @@
+{{- if .Values.namespace.create }}
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: {{ include "testbench.namespace" . | quote }}
+  labels:
+    {{- include "testbench.labels" . | nindent 4 }}
+{{- end }}
diff --git a/chart/templates/publish-template.yaml b/chart/templates/publish-template.yaml
new file mode 100644
index 0000000..40b6a26
--- /dev/null
+++ b/chart/templates/publish-template.yaml
@@ -0,0 +1,29 @@
+apiVersion: testworkflows.testkube.io/v1
+kind: TestWorkflowTemplate
+metadata:
+  name: ragas-publish-template
+  namespace: {{ include "testbench.namespace" . | quote }}
+  labels:
+    {{- include "testbench.labels" . | nindent 4 }}
+    {{- include "testbench.workflowLabels" . | nindent 4 }}
+
+spec:
+
+  config:
+    otelExporterOtlpEndpoint:
+      type: string
+      description: "OTel Collector endpoint for exporting traces and metrics"
+      default: "http://lgtm.monitoring:4318"
+
+  # Steps to execute
+  steps:
+    - name: publish-metrics
+      run:
+        args:
+          - publish.py
+          - "{{`{{ workflow.name }}`}}"
+          - "{{`{{ execution.id }}`}}"
+          - "{{`{{ execution.number }}`}}"
+        env:
+          - name: OTEL_EXPORTER_OTLP_ENDPOINT
+            value: "{{`{{ config.otelExporterOtlpEndpoint }}`}}"
diff --git a/chart/templates/run-template.yaml b/chart/templates/run-template.yaml
new file mode 100644
index 0000000..728fea3
--- /dev/null
+++ b/chart/templates/run-template.yaml
@@ -0,0 +1,33 @@
+apiVersion: testworkflows.testkube.io/v1
+kind: TestWorkflowTemplate
+metadata:
+  name: ragas-run-template
+  namespace: {{ include "testbench.namespace" . | quote }}
+  labels:
+    {{- include "testbench.labels" . | nindent 4 }}
+    {{- include "testbench.workflowLabels" . | nindent 4 }}
+
+spec:
+  # Configuration parameters that can be overridden
+  config:
+    agentUrl:
+      type: string
+      description: "URL to the agent endpoint (A2A protocol)"
+    otelExporterOtlpEndpoint:
+      type: string
+      description: "OTel Collector endpoint for exporting traces and metrics"
+      default: "http://lgtm.monitoring:4318"
+
+  # Steps to execute
+  steps:
+    - name: run
+      artifacts:
+        paths:
+          - "data/experiments/ragas_experiment.jsonl"
+      run:
+        args:
+          - run.py
+          - "{{`{{ config.agentUrl }}`}}"
+        env:
+          - name: OTEL_EXPORTER_OTLP_ENDPOINT
+            value: "{{`{{ config.otelExporterOtlpEndpoint }}`}}"
diff --git a/chart/templates/setup-template.yaml b/chart/templates/setup-template.yaml
new file mode 100644
index 0000000..6af624b
--- /dev/null
+++ b/chart/templates/setup-template.yaml
@@ -0,0 +1,30 @@
+apiVersion: testworkflows.testkube.io/v1
+kind: TestWorkflowTemplate
+metadata:
+  name: ragas-setup-template
+  namespace: {{ include "testbench.namespace" . | quote }}
+  labels:
+    {{- include "testbench.labels" . | nindent 4 }}
+    {{- include "testbench.workflowLabels" . | nindent 4 }}
+
+spec:
+  # Configuration parameters that can be overridden
+  config:
+    bucket:
+      type: string
+      description: "S3/MinIO bucket name containing the dataset"
+    key:
+      type: string
+      description: "S3/MinIO object key (path to dataset file in .csv / .json / .parquet format)"
+
+  # Steps to execute
+  steps:
+    - name: setup
+      artifacts:
+        paths:
+          - "data/datasets/ragas_dataset.jsonl"
+      run:
+        args:
+          - setup.py
+          - "{{`{{ config.bucket }}`}}"
+          - "{{`{{ config.key }}`}}"
diff --git a/chart/templates/visualize-template.yaml b/chart/templates/visualize-template.yaml
new file mode 100644
index 0000000..0b0140a
--- /dev/null
+++ b/chart/templates/visualize-template.yaml
@@ -0,0 +1,22 @@
+apiVersion: testworkflows.testkube.io/v1
+kind: TestWorkflowTemplate
+metadata:
+  name: ragas-visualize-template
+  namespace: {{ include "testbench.namespace" . | quote }}
+  labels:
+    {{- include "testbench.labels" . | nindent 4 }}
+    {{- include "testbench.workflowLabels" . | nindent 4 }}
+
+spec:
+  # Steps to execute
+  steps:
+    - name: visualize-metrics
+      artifacts:
+        paths:
+          - "data/results/evaluation_report.html"
+      run:
+        args:
+          - visualize.py
+          - "{{`{{ workflow.name }}`}}"
+          - "{{`{{ execution.id }}`}}"
+          - "{{`{{ execution.number }}`}}"
diff --git a/chart/values.yaml b/chart/values.yaml
new file mode 100644
index 0000000..2f97780
--- /dev/null
+++ b/chart/values.yaml
@@ -0,0 +1,20 @@
+# Default values for testbench
+
+namespace:
+  # Render a Namespace object for the namespace named below
+  create: true
+  # Namespace for the chart's resources; when empty, the release namespace is used
+  name: testkube
+
+image:
+  repository: ghcr.io/agentic-layer/testbench/testworkflows
+  # Overrides the image tag whose default is the chart appVersion
+  tag: ""
+  pullPolicy: IfNotPresent
+
+# Grafana dashboard configuration
+grafana:
+  # ConfigMap name for dashboards
+  dashboardConfigMapName: grafana-testkube-dashboard
+  # Target namespace for Grafana dashboards
+  dashboardNamespace: monitoring
diff --git a/deploy/local/kustomization.yaml b/deploy/local/kustomization.yaml
index f58bdc0..a83a2bc 100644
--- a/deploy/local/kustomization.yaml
+++ b/deploy/local/kustomization.yaml
@@ -4,7 +4,6 @@ resources:
   - lgtm/
   - weather-agent.yaml
   - dataset.yaml
-  - ../base
   - multi-turn-metrics-configmap.yaml
   - multi-turn-workflow.yaml
-  - multi-turn-workflow-trigger.yaml
\ No newline at end of file
+  - multi-turn-workflow-trigger.yaml