From 19feab3405c788354079ef13fa89b6548f928522 Mon Sep 17 00:00:00 2001 From: Joshua Blanch Date: Thu, 7 Aug 2025 05:07:54 +0000 Subject: [PATCH 01/17] qa/clyso/upgrade: upgrade testing to ces image - wip upgrade testing between different versions Signed-off-by: Joshua Blanch --- qa/suites/orch/cephadm/clyso/upgrade/+ | 0 .../cephadm/clyso/upgrade/1-start-ces.yaml | 45 ++++++ .../clyso/upgrade/2-create-baseline.yaml | 43 +++++ .../clyso/upgrade/3-downgrade-upstream.yaml | 149 ++++++++++++++++++ qa/suites/orch/cephadm/clyso/upgradeMatrix/% | 0 .../cross-distro-pairs/ces-to-upstream.yaml | 9 ++ .../cross-distro-pairs/upstream-to-ces.yaml | 9 ++ .../cephadm/clyso/upgradeMatrix/workflow/+ | 0 .../upgradeMatrix/workflow/1-bootstrap.yaml | 44 ++++++ .../upgradeMatrix/workflow/3-upgrade.yaml | 149 ++++++++++++++++++ .../upgradeMatrix/workflow/5-downgrade.yaml | 149 ++++++++++++++++++ 11 files changed, 597 insertions(+) create mode 100644 qa/suites/orch/cephadm/clyso/upgrade/+ create mode 100644 qa/suites/orch/cephadm/clyso/upgrade/1-start-ces.yaml create mode 100644 qa/suites/orch/cephadm/clyso/upgrade/2-create-baseline.yaml create mode 100644 qa/suites/orch/cephadm/clyso/upgrade/3-downgrade-upstream.yaml create mode 100644 qa/suites/orch/cephadm/clyso/upgradeMatrix/% create mode 100644 qa/suites/orch/cephadm/clyso/upgradeMatrix/cross-distro-pairs/ces-to-upstream.yaml create mode 100644 qa/suites/orch/cephadm/clyso/upgradeMatrix/cross-distro-pairs/upstream-to-ces.yaml create mode 100644 qa/suites/orch/cephadm/clyso/upgradeMatrix/workflow/+ create mode 100644 qa/suites/orch/cephadm/clyso/upgradeMatrix/workflow/1-bootstrap.yaml create mode 100644 qa/suites/orch/cephadm/clyso/upgradeMatrix/workflow/3-upgrade.yaml create mode 100644 qa/suites/orch/cephadm/clyso/upgradeMatrix/workflow/5-downgrade.yaml diff --git a/qa/suites/orch/cephadm/clyso/upgrade/+ b/qa/suites/orch/cephadm/clyso/upgrade/+ new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/qa/suites/orch/cephadm/clyso/upgrade/1-start-ces.yaml b/qa/suites/orch/cephadm/clyso/upgrade/1-start-ces.yaml new file mode 100644 index 0000000000000..15e2ece1e7071 --- /dev/null +++ b/qa/suites/orch/cephadm/clyso/upgrade/1-start-ces.yaml @@ -0,0 +1,45 @@ +roles: +- - host.a + - mon.a + - mgr.a + - osd.0 +- - host.b + - osd.1 + - mgr.b + - client.0 +- - host.b + - osd.2 + - mgr.c + +tasks: +# Install system packages via pexec (avoids CEPH RPM installation) +- pexec: + all: + - sudo dnf install s3cmd curl jq -y + +- cephadm: + +- cephadm.shell: + host.a: + - ceph status + - ceph orch ps + - ceph version + - echo "Starting with CES version (from override YAML)" + +openstack: +- volumes: + count: 4 + size: 20 + +overrides: + ceph: + conf: + osd: + osd shutdown pgref assert: true + log-only-match: + - CEPHADM_ + log-ignorelist: + - CEPHADM_DAEMON_PLACE_FAIL + - CEPHADM_FAILED_DAEMON + - CEPHADM_STRAY_DAEMON + - CEPHADM_AGENT_DOWN diff --git a/qa/suites/orch/cephadm/clyso/upgrade/2-create-baseline.yaml b/qa/suites/orch/cephadm/clyso/upgrade/2-create-baseline.yaml new file mode 100644 index 0000000000000..fbf225d91b312 --- /dev/null +++ b/qa/suites/orch/cephadm/clyso/upgrade/2-create-baseline.yaml @@ -0,0 +1,43 @@ +tasks: +- cephadm.apply: + specs: + - service_type: rgw + service_id: foo + placement: + host_pattern: "*" + spec: + rgw_frontend_port: 8080 + +# it will be named rgw.foo for some reason +- cephadm.wait_for_service: + service: rgw.foo + +- cephadm.shell: + host.a: + - ceph status + +# - cephadm.shell: +# host.a: +# - ceph_test_rgw_obj + +# 
- workunit: +# clients: +# client.0: +# - rgw/test_rgw_obj.sh + + # client.0: + # - | + # cat > /tmp/s3cfg << 'EOF' + # [default] + # access_key = ceskey + # secret_key = cessecret + # host_base = host.b:8080 + # host_bucket = host.b:8080 + # use_https = False + # signature_v2 = True + # EOF + # - s3cmd -c /tmp/s3cfg mb s3://ces-baseline-bucket + # - echo "CES S3 baseline data - must survive downgrade!" > /tmp/ces-s3-baseline.txt + # - s3cmd -c /tmp/s3cfg put /tmp/ces-s3-baseline.txt s3://ces-baseline-bucket/ces-baseline-s3-object.txt + # - s3cmd -c /tmp/s3cfg ls s3://ces-baseline-bucket/ + # - echo "CES S3 baseline data created successfully" diff --git a/qa/suites/orch/cephadm/clyso/upgrade/3-downgrade-upstream.yaml b/qa/suites/orch/cephadm/clyso/upgrade/3-downgrade-upstream.yaml new file mode 100644 index 0000000000000..5b794a3f6c18c --- /dev/null +++ b/qa/suites/orch/cephadm/clyso/upgrade/3-downgrade-upstream.yaml @@ -0,0 +1,149 @@ +tasks: +- cephadm.shell: + host.a: + - echo "PRE-DOWNGRADE CES VERSION:" + - ceph version + - ceph orch ps + + - echo "Starting downgrade from CES to upstream CEPH v18.2.7..." + - ceph orch upgrade start --image quay.io/ceph/ceph:v18.2.7 + - sleep 30 + + - | + cat > /tmp/upgrade_monitor.sh << 'EOF' + #!/bin/bash + + # Upgrade monitoring script for cephadm upgrade tests + # Monitors upgrade/downgrade completion by checking both upgrade status and daemon versions + + set -e + + TARGET_VERSION="$1" + OPERATION="$2" # "upgrade" or "downgrade" + BASE_IMAGE_NAME="${3:-base}" + TARGET_IMAGE_NAME="${4:-target}" + TIMEOUT="${5:-2400}" + + if [ -z "$TARGET_VERSION" ] || [ -z "$OPERATION" ]; then + echo "Usage: $0 [base_image_name] [target_image_name] [timeout_seconds]" + exit 1 + fi + + echo "=== CEPH Upgrade Monitor Started ===" + echo "Base image: $BASE_IMAGE_NAME" + echo "Target image: $TARGET_IMAGE_NAME" + echo "Operation: $OPERATION to $TARGET_VERSION" + echo "Timeout: ${TIMEOUT}s" + echo "Start time: $(date)" + + echo "=== Capturing Baseline Version ===" + ceph versions + baseline_version=$(ceph versions --format json | jq -r ".overall | keys[0]") + echo "Baseline version: $baseline_version" + + echo "=== Starting Upgrade Monitoring ===" + start_time=$(date +%s) + + while true; do + current_time=$(date +%s) + elapsed=$((current_time - start_time)) + + echo "" + echo "=== Upgrade Status (Elapsed: ${elapsed}s) ===" + echo "Time: $(date)" + + echo "--- Orchestrator Upgrade Status ---" + upgrade_status=$(ceph orch upgrade status --format json) + echo "$upgrade_status" + + echo "--- Daemon Versions ---" + ceph versions + + in_progress=$(echo "$upgrade_status" | jq -r ".in_progress") + version_count=$(ceph versions --format json | jq ".overall | length") + + echo "Upgrade in progress: $in_progress" + echo "Number of different versions running: $version_count" + + if [ "$in_progress" = "false" ] && [ "$version_count" -eq 1 ]; then + current_version=$(ceph versions --format json | jq -r ".overall | keys[0]") + echo "All daemons now on: $current_version" + + if [ "$current_version" != "$baseline_version" ]; then + echo "" + echo "=== SUCCESS: Upgrade Completed ===" + echo "From: $baseline_version" + echo "To: $current_version" + echo "Base image: $BASE_IMAGE_NAME" + echo "Target image: $TARGET_IMAGE_NAME" + echo "Total time: ${elapsed}s" + echo "End time: $(date)" + break + else + echo "" + echo "=== SUCCESS: Already on Target Version ===" + echo "Current version: $current_version" + echo "Base image: $BASE_IMAGE_NAME" + echo "Target image: $TARGET_IMAGE_NAME" + echo 
"Total time: ${elapsed}s" + echo "End time: $(date)" + break + fi + else + echo "Upgrade still in progress or daemons on mixed versions" + if [ "$version_count" -gt 1 ]; then + echo "--- Version Breakdown ---" + ceph versions --format json | jq ".overall" + fi + fi + + if echo "$upgrade_status" | jq -r ".message" | grep -q -i "error\|fail"; then + echo "" + echo "=== ERROR: Upgrade Failed ===" + echo "Upgrade status shows error or failure" + echo "$upgrade_status" + exit 1 + fi + + if [ $elapsed -ge $TIMEOUT ]; then + echo "" + echo "=== ERROR: Upgrade Timeout ===" + echo "Upgrade did not complete within $TIMEOUT seconds" + echo "Current status:" + echo "$upgrade_status" + ceph versions + exit 1 + fi + + echo "Waiting 60 seconds before next check..." + sleep 60 + done + + echo "" + echo "=== Final Verification ===" + ceph health detail + ceph orch ps + ceph status + + echo "" + echo "=== Upgrade Monitor Completed Successfully ===" + EOF + + chmod +x /tmp/upgrade_monitor.sh + /tmp/upgrade_monitor.sh "v18.2.7" "downgrade" "CES" "Upstream v18.2.7" "1800" + + - echo "POST-DOWNGRADE UPSTREAM VERSION:" + - ceph version + - ceph orch ps + - ceph -s + + +overrides: + ceph: + log-ignorelist: + - CEPHADM_STRAY_DAEMON + - CEPHADM_FAILED_DAEMON + - CEPHADM_AGENT_DOWN + - CEPHADM_DAEMON_PLACE_FAIL + log-only-match: + - CEPHADM_ diff --git a/qa/suites/orch/cephadm/clyso/upgradeMatrix/% b/qa/suites/orch/cephadm/clyso/upgradeMatrix/% new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/qa/suites/orch/cephadm/clyso/upgradeMatrix/cross-distro-pairs/ces-to-upstream.yaml b/qa/suites/orch/cephadm/clyso/upgradeMatrix/cross-distro-pairs/ces-to-upstream.yaml new file mode 100644 index 0000000000000..ea0b2b526a525 --- /dev/null +++ b/qa/suites/orch/cephadm/clyso/upgradeMatrix/cross-distro-pairs/ces-to-upstream.yaml @@ -0,0 +1,9 @@ +# Cross-distro test: Start with CES, upgrade to Upstream, downgrade back to CES +base_image: "harbor.clyso.com/ces/ceph/ceph:ces-v25.03.2-rc.4" +target_image: "quay.io/ceph/ceph:v18.2.7" +base_image_name: "CES-v25.03.2-rc.4" +target_image_name: "Upstream-18.2.7" + +overrides: + ceph: + image: "harbor.clyso.com/ces/ceph/ceph:ces-v25.03.2-rc.4" diff --git a/qa/suites/orch/cephadm/clyso/upgradeMatrix/cross-distro-pairs/upstream-to-ces.yaml b/qa/suites/orch/cephadm/clyso/upgradeMatrix/cross-distro-pairs/upstream-to-ces.yaml new file mode 100644 index 0000000000000..fa221779078ca --- /dev/null +++ b/qa/suites/orch/cephadm/clyso/upgradeMatrix/cross-distro-pairs/upstream-to-ces.yaml @@ -0,0 +1,9 @@ +# Cross-distro test: Start with Upstream, upgrade to CES, downgrade back to Upstream +base_image: "quay.io/ceph/ceph:v18.2.7" +target_image: "harbor.clyso.com/ces/ceph/ceph:ces-v25.03.2-rc.4" +base_image_name: "Upstream-18.2.7" +target_image_name: "CES-v25.03.2-rc.4" + +overrides: + ceph: + image: "quay.io/ceph/ceph:v18.2.7" diff --git a/qa/suites/orch/cephadm/clyso/upgradeMatrix/workflow/+ b/qa/suites/orch/cephadm/clyso/upgradeMatrix/workflow/+ new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/qa/suites/orch/cephadm/clyso/upgradeMatrix/workflow/1-bootstrap.yaml b/qa/suites/orch/cephadm/clyso/upgradeMatrix/workflow/1-bootstrap.yaml new file mode 100644 index 0000000000000..baaf340d552d1 --- /dev/null +++ b/qa/suites/orch/cephadm/clyso/upgradeMatrix/workflow/1-bootstrap.yaml @@ -0,0 +1,44 @@ +roles: +- - host.a + - mon.a + - mgr.a + - osd.0 +- - host.b + - mgr.b + - osd.1 + - client.0 + +tasks: +# Install system packages via pexec (avoids CEPH RPM installation) +- 
pexec: + all: + - sudo dnf install s3cmd curl jq -y + +- cephadm: + +- cephadm.shell: + host.a: + - echo "=== BOOTSTRAP COMPLETE ===" + - ceph orch status + - ceph orch ps + - ceph version + - ceph -s + - ceph orch device ls + +openstack: +- volumes: + count: 4 + size: 20 + +overrides: + ceph: + conf: + osd: + osd shutdown pgref assert: true + log-only-match: + - CEPHADM_ + log-ignorelist: + - CEPHADM_DAEMON_PLACE_FAIL + - CEPHADM_FAILED_DAEMON + - CEPHADM_STRAY_DAEMON + - CEPHADM_AGENT_DOWN diff --git a/qa/suites/orch/cephadm/clyso/upgradeMatrix/workflow/3-upgrade.yaml b/qa/suites/orch/cephadm/clyso/upgradeMatrix/workflow/3-upgrade.yaml new file mode 100644 index 0000000000000..f94b75375051d --- /dev/null +++ b/qa/suites/orch/cephadm/clyso/upgradeMatrix/workflow/3-upgrade.yaml @@ -0,0 +1,149 @@ +tasks: +- cephadm.shell: + host.a: + - echo "PRE-UPGRADE STATE:" + - ceph version + - ceph orch ps + - ceph -s + + - echo "Starting upgrade from CES-v25.03.2-rc.4 to reef-18.2.7..." + - ceph orch upgrade start --image "quay.io/ceph/ceph:v18.2.7" + - sleep 30 + + - | + cat > /tmp/upgrade_monitor.sh << 'EOF' + #!/bin/bash + + # Upgrade monitoring script for cephadm upgrade tests + # Monitors upgrade/downgrade completion by checking both upgrade status and daemon versions + + set -e + + TARGET_VERSION="$1" + OPERATION="$2" # "upgrade" or "downgrade" + BASE_IMAGE_NAME="${3:-base}" + TARGET_IMAGE_NAME="${4:-target}" + TIMEOUT="${5:-2400}" + + if [ -z "$TARGET_VERSION" ] || [ -z "$OPERATION" ]; then + echo "Usage: $0 [base_image_name] [target_image_name] [timeout_seconds]" + exit 1 + fi + + echo "=== CEPH Upgrade Monitor Started ===" + echo "Base image: $BASE_IMAGE_NAME" + echo "Target image: $TARGET_IMAGE_NAME" + echo "Operation: $OPERATION to $TARGET_VERSION" + echo "Timeout: ${TIMEOUT}s" + echo "Start time: $(date)" + + echo "=== Capturing Baseline Version ===" + ceph versions + baseline_version=$(ceph versions --format json | jq -r ".overall | keys[0]") + echo "Baseline version: $baseline_version" + + echo "=== Starting Upgrade Monitoring ===" + start_time=$(date +%s) + + while true; do + current_time=$(date +%s) + elapsed=$((current_time - start_time)) + + echo "" + echo "=== Upgrade Status (Elapsed: ${elapsed}s) ===" + echo "Time: $(date)" + + echo "--- Orchestrator Upgrade Status ---" + upgrade_status=$(ceph orch upgrade status --format json) + echo "$upgrade_status" + + echo "--- Daemon Versions ---" + ceph versions + + in_progress=$(echo "$upgrade_status" | jq -r ".in_progress") + version_count=$(ceph versions --format json | jq ".overall | length") + + echo "Upgrade in progress: $in_progress" + echo "Number of different versions running: $version_count" + + if [ "$in_progress" = "false" ] && [ "$version_count" -eq 1 ]; then + current_version=$(ceph versions --format json | jq -r ".overall | keys[0]") + echo "All daemons now on: $current_version" + + if [ "$current_version" != "$baseline_version" ]; then + echo "" + echo "=== SUCCESS: Upgrade Completed ===" + echo "From: $baseline_version" + echo "To: $current_version" + echo "Base image: $BASE_IMAGE_NAME" + echo "Target image: $TARGET_IMAGE_NAME" + echo "Total time: ${elapsed}s" + echo "End time: $(date)" + break + else + echo "" + echo "=== SUCCESS: Already on Target Version ===" + echo "Current version: $current_version" + echo "Base image: $BASE_IMAGE_NAME" + echo "Target image: $TARGET_IMAGE_NAME" + echo "Total time: ${elapsed}s" + echo "End time: $(date)" + break + fi + else + echo "Upgrade still in progress or daemons on mixed versions" 
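+            # more than one key under "overall" in `ceph versions` output means
+            # some daemons are still running the old image, so keep polling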
+ if [ "$version_count" -gt 1 ]; then + echo "--- Version Breakdown ---" + ceph versions --format json | jq ".overall" + fi + fi + + if echo "$upgrade_status" | jq -r ".message" | grep -q -i "error\|fail"; then + echo "" + echo "=== ERROR: Upgrade Failed ===" + echo "Upgrade status shows error or failure" + echo "$upgrade_status" + exit 1 + fi + + if [ $elapsed -ge $TIMEOUT ]; then + echo "" + echo "=== ERROR: Upgrade Timeout ===" + echo "Upgrade did not complete within $TIMEOUT seconds" + echo "Current status:" + echo "$upgrade_status" + ceph versions + exit 1 + fi + + echo "Waiting 60 seconds before next check..." + sleep 60 + done + + echo "" + echo "=== Final Verification ===" + ceph health detail + ceph orch ps + ceph status + + echo "" + echo "=== Upgrade Monitor Completed Successfully ===" + EOF + + chmod +x /tmp/upgrade_monitor.sh + /tmp/upgrade_monitor.sh "18.2.7" "upgrade" "CES-v25.03.2-rc.4" "reef-18.2.7" "2400" + + - echo "POST-UPGRADE STATE:" + - ceph version + - ceph orch ps + - ceph -s + +overrides: + ceph: + log-ignorelist: + - CEPHADM_STRAY_DAEMON + - CEPHADM_FAILED_DAEMON + - CEPHADM_AGENT_DOWN + - CEPHADM_DAEMON_PLACE_FAIL + log-only-match: + - CEPHADM_ diff --git a/qa/suites/orch/cephadm/clyso/upgradeMatrix/workflow/5-downgrade.yaml b/qa/suites/orch/cephadm/clyso/upgradeMatrix/workflow/5-downgrade.yaml new file mode 100644 index 0000000000000..25dd72bddca3b --- /dev/null +++ b/qa/suites/orch/cephadm/clyso/upgradeMatrix/workflow/5-downgrade.yaml @@ -0,0 +1,149 @@ +tasks: +- cephadm.shell: + host.a: + - echo "PRE-DOWNGRADE STATE:" + - ceph version + - ceph orch ps + - ceph -s + + - echo "Starting downgrade from reef-18.2.7 back to CES-v25.03.2-rc.4..." + - ceph orch upgrade start --image "harbor.clyso.com/ces/ceph/ceph:ces-v25.03.2-rc.4" + - sleep 30 + + - | + cat > /tmp/upgrade_monitor.sh << 'EOF' + #!/bin/bash + + # Upgrade monitoring script for cephadm upgrade tests + # Monitors upgrade/downgrade completion by checking both upgrade status and daemon versions + + set -e + + TARGET_VERSION="$1" + OPERATION="$2" # "upgrade" or "downgrade" + BASE_IMAGE_NAME="${3:-base}" + TARGET_IMAGE_NAME="${4:-target}" + TIMEOUT="${5:-2400}" + + if [ -z "$TARGET_VERSION" ] || [ -z "$OPERATION" ]; then + echo "Usage: $0 [base_image_name] [target_image_name] [timeout_seconds]" + exit 1 + fi + + echo "=== CEPH Upgrade Monitor Started ===" + echo "Base image: $BASE_IMAGE_NAME" + echo "Target image: $TARGET_IMAGE_NAME" + echo "Operation: $OPERATION to $TARGET_VERSION" + echo "Timeout: ${TIMEOUT}s" + echo "Start time: $(date)" + + echo "=== Capturing Baseline Version ===" + ceph versions + baseline_version=$(ceph versions --format json | jq -r ".overall | keys[0]") + echo "Baseline version: $baseline_version" + + echo "=== Starting Upgrade Monitoring ===" + start_time=$(date +%s) + + while true; do + current_time=$(date +%s) + elapsed=$((current_time - start_time)) + + echo "" + echo "=== Upgrade Status (Elapsed: ${elapsed}s) ===" + echo "Time: $(date)" + + echo "--- Orchestrator Upgrade Status ---" + upgrade_status=$(ceph orch upgrade status --format json) + echo "$upgrade_status" + + echo "--- Daemon Versions ---" + ceph versions + + in_progress=$(echo "$upgrade_status" | jq -r ".in_progress") + version_count=$(ceph versions --format json | jq ".overall | length") + + echo "Upgrade in progress: $in_progress" + echo "Number of different versions running: $version_count" + + if [ "$in_progress" = "false" ] && [ "$version_count" -eq 1 ]; then + current_version=$(ceph versions --format json | jq 
-r ".overall | keys[0]") + echo "All daemons now on: $current_version" + + if [ "$current_version" != "$baseline_version" ]; then + echo "" + echo "=== SUCCESS: Upgrade Completed ===" + echo "From: $baseline_version" + echo "To: $current_version" + echo "Base image: $BASE_IMAGE_NAME" + echo "Target image: $TARGET_IMAGE_NAME" + echo "Total time: ${elapsed}s" + echo "End time: $(date)" + break + else + echo "" + echo "=== SUCCESS: Already on Target Version ===" + echo "Current version: $current_version" + echo "Base image: $BASE_IMAGE_NAME" + echo "Target image: $TARGET_IMAGE_NAME" + echo "Total time: ${elapsed}s" + echo "End time: $(date)" + break + fi + else + echo "Upgrade still in progress or daemons on mixed versions" + if [ "$version_count" -gt 1 ]; then + echo "--- Version Breakdown ---" + ceph versions --format json | jq ".overall" + fi + fi + + if echo "$upgrade_status" | jq -r ".message" | grep -q -i "error\|fail"; then + echo "" + echo "=== ERROR: Upgrade Failed ===" + echo "Upgrade status shows error or failure" + echo "$upgrade_status" + exit 1 + fi + + if [ $elapsed -ge $TIMEOUT ]; then + echo "" + echo "=== ERROR: Upgrade Timeout ===" + echo "Upgrade did not complete within $TIMEOUT seconds" + echo "Current status:" + echo "$upgrade_status" + ceph versions + exit 1 + fi + + echo "Waiting 60 seconds before next check..." + sleep 60 + done + + echo "" + echo "=== Final Verification ===" + ceph health detail + ceph orch ps + ceph status + + echo "" + echo "=== Upgrade Monitor Completed Successfully ===" + EOF + + chmod +x /tmp/upgrade_monitor.sh + /tmp/upgrade_monitor.sh "25.03.2-rc.4" "downgrade" "reef-18.2.7" "CES-v25.03.2-rc.4" "2400" + + - echo "POST-DOWNGRADE STATE:" + - ceph version + - ceph orch ps + - ceph -s + +overrides: + ceph: + log-ignorelist: + - CEPHADM_STRAY_DAEMON + - CEPHADM_FAILED_DAEMON + - CEPHADM_AGENT_DOWN + - CEPHADM_DAEMON_PLACE_FAIL + log-only-match: + - CEPHADM_ From aca9f23efdf6f93267299434f6092d3fbcdd4689 Mon Sep 17 00:00:00 2001 From: Joshua Blanch Date: Thu, 7 Aug 2025 21:49:00 +0000 Subject: [PATCH 02/17] qa/cephadm_s3_bridge.py: test be able to run s3test repo from cephadm Signed-off-by: Joshua Blanch --- .../cephadm/s3tests-bridge/basic-s3tests.yaml | 48 ++ .../s3tests-bridge/bridge-test-only.yaml | 114 +++++ .../cephadm/s3tests-bridge/minimal-test.yaml | 47 ++ qa/tasks/cephadm_s3_bridge.py | 441 ++++++++++++++++++ qa/tasks/python.py | 9 +- 5 files changed, 658 insertions(+), 1 deletion(-) create mode 100644 qa/suites/orch/cephadm/s3tests-bridge/basic-s3tests.yaml create mode 100644 qa/suites/orch/cephadm/s3tests-bridge/bridge-test-only.yaml create mode 100644 qa/suites/orch/cephadm/s3tests-bridge/minimal-test.yaml create mode 100644 qa/tasks/cephadm_s3_bridge.py diff --git a/qa/suites/orch/cephadm/s3tests-bridge/basic-s3tests.yaml b/qa/suites/orch/cephadm/s3tests-bridge/basic-s3tests.yaml new file mode 100644 index 0000000000000..53d7dfd3844f0 --- /dev/null +++ b/qa/suites/orch/cephadm/s3tests-bridge/basic-s3tests.yaml @@ -0,0 +1,48 @@ +roles: +- [host.a, mon.a, mgr.a, osd.0, osd.1, osd.2, client.0] + +overrides: + ceph: + log-to-file: true + conf: + global: + log to file: true + mon cluster log to file: true + mon: + mon_warn_on_insecure_global_id_reclaim_allowed: false + +tasks: +- cephadm: + +- cephadm.apply: + specs: + - service_type: rgw + service_id: s3test + placement: + host_pattern: "*" + spec: + rgw_frontend_port: 8080 + +- cephadm.wait_for_service: + service: rgw.s3test + +- cephadm_s3_bridge: + client.0: + discover_from_cephadm: true 
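+
+# the bridge above is expected to publish ctx.rgw.role_endpoints (discovered via
+# 'ceph orch ps'), so the stock tox/s3tests tasks below can target the
+# cephadm-deployed RGW without a separate rgw task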
+ +# note: tox is needed +- tox: [client.0] + +- s3tests: + client.0: + rgw_server: client.0 + force-branch: master + conf: + DEFAULT: + is_secure: false + port: 8080 + calling_format: ordinary + fixtures: + bucket prefix: test-{random}- + # Only run 3 basic tests to verify bridge works + filter: "test_bucket_list_empty or test_bucket_create_naming_good_long_255" diff --git a/qa/suites/orch/cephadm/s3tests-bridge/bridge-test-only.yaml b/qa/suites/orch/cephadm/s3tests-bridge/bridge-test-only.yaml new file mode 100644 index 0000000000000..d1cd13f3664d2 --- /dev/null +++ b/qa/suites/orch/cephadm/s3tests-bridge/bridge-test-only.yaml @@ -0,0 +1,114 @@ +roles: +- [host.a, mon.a, mgr.a, osd.0, osd.1, osd.2, client.0] + +tasks: +- cephadm: + +- cephadm.apply: + specs: + - service_type: rgw + service_id: test + placement: + host_pattern: "*" + spec: + rgw_frontend_port: 8080 + +- cephadm.wait_for_service: + service: rgw.test + +- cephadm_s3_bridge: + client.0: + discover_from_cephadm: true + +- exec: + client.0: + - echo "Bridge task completed - ctx.rgw should now be available for s3tests" + - echo "Testing RGW accessibility..." + +# This uses the same execution path that s3tests uses, so it will trigger our monkey patch +- python: + client.0: + - | + import logging + from io import StringIO + log = logging.getLogger(__name__) + + log.info("=== Testing radosgw-admin monkey patch ===") + + # Get the remote connection (same way s3tests does it) + remote = list(ctx.cluster.only('client.0').remotes.keys())[0] + + log.info("Testing user creation via remote.run() - this should trigger monkey patch") + try: + # This should be intercepted by our monkey patch and converted to cephadm shell + result = remote.run( + args=[ + 'radosgw-admin', 'user', 'create', + '--uid=testuser', + '--display-name=Test User', + '--access-key=testkey', + '--secret-key=testsecret' + ], + stdout=StringIO() + ) + log.info("✅ SUCCESS: radosgw-admin user create executed through monkey patch!") + output = result.stdout.getvalue() if hasattr(result.stdout, 'getvalue') else str(result.stdout) + log.info(f"Command output length: {len(output)} chars") + if 'testuser' in output or 'access_key' in output: + log.info("✅ User creation output looks correct") + else: + log.warning(f"⚠️ Unexpected output: {output[:200]}...") + except Exception as e: + log.error(f"❌ FAILED: radosgw-admin user create failed: {e}") + raise + + log.info("Testing user info retrieval via remote.run()") + try: + result = remote.run( + args=[ + 'radosgw-admin', 'user', 'info', + '--uid=testuser' + ], + stdout=StringIO() + ) + log.info("✅ SUCCESS: radosgw-admin user info executed through monkey patch!") + output = result.stdout.getvalue() if hasattr(result.stdout, 'getvalue') else str(result.stdout) + if 'testuser' in output and 'access_key' in output: + log.info("✅ User info output contains expected fields") + else: + log.warning(f"⚠️ User info output: {output[:200]}...") + except Exception as e: + log.error(f"❌ FAILED: radosgw-admin user info failed: {e}") + raise + + log.info("=== ✅ Monkey patch test completed successfully! ===") + log.info("This confirms that s3tests radosgw-admin commands will work with cephadm") + +# Test S3 endpoint accessibility +- exec: + client.0: + - echo "Testing S3 endpoint accessibility..." 
+ - 'response=$(curl -s http://localhost:8080/ 2>/dev/null || echo "CONNECTION_FAILED")' + - echo "RGW Response: $response" + - 'if echo "$response" | grep -q "ListBucketResult\|Error\|xml\|ACCESS_DENIED"; then echo "✅ RGW is responding with valid S3 response"; else echo "✗ RGW not responding correctly"; fi' + +# Cleanup test user using monkey patch (another test) +- python: + client.0: + - | + import logging + log = logging.getLogger(__name__) + + log.info("=== Testing cleanup via monkey patch ===") + remote = list(ctx.cluster.only('client.0').remotes.keys())[0] + + try: + result = remote.run(args=[ + 'radosgw-admin', 'user', 'rm', + '--uid=testuser' + ]) + log.info("✅ SUCCESS: User cleanup via monkey patch worked!") + except Exception as e: + log.warning(f"⚠️ Cleanup failed (this is often expected): {e}") + + log.info("=== Bridge functionality test completed ===") diff --git a/qa/suites/orch/cephadm/s3tests-bridge/minimal-test.yaml b/qa/suites/orch/cephadm/s3tests-bridge/minimal-test.yaml new file mode 100644 index 0000000000000..f1424fcb6874a --- /dev/null +++ b/qa/suites/orch/cephadm/s3tests-bridge/minimal-test.yaml @@ -0,0 +1,47 @@ +roles: +- [host.a, mon.a, mgr.a, osd.0, osd.1, osd.2, client.0] + +tasks: +- cephadm: + +- cephadm.apply: + specs: + - service_type: rgw + service_id: test + placement: + host_pattern: "*" + spec: + rgw_frontend_port: 8080 + +- cephadm.wait_for_service: + service: rgw.test + +- cephadm_s3_bridge: + client.0: + discover_from_cephadm: true + dns_name: client.0 + +- tox: [client.0] + +- s3tests: + client.0: + rgw_server: client.0 + force-branch: master + conf: + DEFAULT: + is_secure: false + port: 8080 + calling_format: ordinary + fixtures: + bucket prefix: s3test-{random}- + # POC run + exclude: + - test_bucket_policy* + - test_lifecycle* + - test_encryption* + - test_multipart_upload_size_too_small + - test_cors* + - test_website* + - test_logging* + - test_versioning* + filter: "test_bucket_list_empty or test_bucket_create or test_object_write or test_object_read" diff --git a/qa/tasks/cephadm_s3_bridge.py b/qa/tasks/cephadm_s3_bridge.py new file mode 100644 index 0000000000000..d870679fc207f --- /dev/null +++ b/qa/tasks/cephadm_s3_bridge.py @@ -0,0 +1,441 @@ +""" +Bridge task to make cephadm-deployed RGW compatible with s3tests. + +This task discovers RGW endpoints deployed via cephadm orchestrator +and creates the ctx.rgw.role_endpoints structure that s3tests expects. +""" + +import json +import logging +import time +from io import StringIO + +from teuthology.orchestra import run +from teuthology import misc as teuthology +from teuthology.exceptions import ConfigError +import teuthology.orchestra.remote + +import sys +import os + +qa_dir = os.path.dirname(os.path.abspath(__file__)) +sys.path.insert(0, qa_dir) +from rgw import RGWEndpoint + +log = logging.getLogger(__name__) + + +def detect_cephadm_deployment(ctx): + """Detect if we're in a cephadm environment with bridge active""" + return ( + hasattr(ctx, "rgw") + and hasattr(ctx.rgw, "cephadm_bridge_active") + and ctx.rgw.cephadm_bridge_active + ) + + +def patch_s3tests_radosgw_admin(ctx): + """ + Monkey patch teuthology remote execution to make radosgw-admin commands + work inside cephadm containers when running s3tests. 
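+
+    For illustration, a bare ['radosgw-admin', 'user', 'create', ...] invocation
+    is rewritten to roughly:
+
+        ['sudo', 'cephadm', '--image', <image>, 'shell',
+         '-c', '/etc/ceph/<cluster>.conf',
+         '-k', '/etc/ceph/<cluster>.client.admin.keyring',
+         '--fsid', <fsid>, '--', 'radosgw-admin', 'user', 'create', ...]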
+ """ + log.info("convert radosgw-admin to cephadm command") + + original_run = teuthology.orchestra.remote.Remote.run + + def cephadm_aware_run(self, **kwargs): + args = kwargs.get("args", []) + + if args and len(args) > 0 and args[0] == "radosgw-admin": + if detect_cephadm_deployment(ctx): + log.info(f"Intercepting radosgw-admin command: {args}") + + try: + cluster_name = ( + list(ctx.ceph.keys())[0] if hasattr(ctx, "ceph") else "ceph" + ) + image = ctx.ceph[cluster_name].image + + cephadm_args = [ + "sudo", + "cephadm", + "--image", + image, + "shell", + "-c", + f"/etc/ceph/{cluster_name}.conf", + "-k", + f"/etc/ceph/{cluster_name}.client.admin.keyring", + "--fsid", + ctx.ceph[cluster_name].fsid, + "--", + ] + args + + log.info(f"Converted to cephadm shell command: {cephadm_args}") + kwargs["args"] = cephadm_args + + except Exception as e: + log.error(f"Failed to convert radosgw-admin to cephadm shell: {e}") + pass + + return original_run(self, **kwargs) + + + teuthology.orchestra.remote.Remote.run = cephadm_aware_run + + +def restore_original_remote_run(): + """Restore original remote run method (for cleanup)""" + # TODO: In practice, this is tricky to implement cleanly since we don't + # store the original reference. The monkey patch will remain active + # for the duration of the test run, which is typically desired. + log.info("Note: Monkey patch cleanup not implemented - patch remains active") + + +def discover_cephadm_rgw_endpoints(ctx): + """ + Discover RGW endpoints from cephadm orchestrator using cephadm shell. + Returns dict mapping service names to endpoint info. + """ + log.info("Discovering cephadm RGW endpoints via 'ceph orch ps'") + + cluster_roles = list(ctx.cluster.remotes.keys()) + if not cluster_roles: + raise ConfigError("No cluster nodes available for ceph commands") + + remote = cluster_roles[0] + + try: + # Get cluster name (usually 'ceph') + cluster_name = list(ctx.ceph.keys())[0] if hasattr(ctx, "ceph") else "ceph" + + result = remote.run( + args=[ + "sudo", + ctx.cephadm, + "--image", + ctx.ceph[cluster_name].image, + "shell", + "-c", + f"/etc/ceph/{cluster_name}.conf", + "-k", + f"/etc/ceph/{cluster_name}.client.admin.keyring", + "--fsid", + ctx.ceph[cluster_name].fsid, + "--", + "ceph", + "orch", + "ps", + "--daemon_type", + "rgw", + "--format", + "json", + ], + stdout=StringIO(), + ) + except AttributeError as e: + log.error(f"Missing cephadm context attributes: {e}") + log.error( + "Available ctx.cephadm attributes: " + str(dir(ctx.cephadm)) + if hasattr(ctx, "cephadm") + else "No ctx.cephadm found" + ) + raise ConfigError(f"cephadm context not properly initialized: {e}") + except Exception as e: + log.error(f"Failed to run ceph orch ps command: {e}") + raise ConfigError(f"RGW endpoint discovery failed: {e}") + + services_json = result.stdout.getvalue() + log.info(f"Raw ceph orch ps output: {services_json}") + + if not services_json.strip(): + log.warning("No RGW services found via 'ceph orch ps'") + return {} + + try: + services = json.loads(services_json) + log.info(f"Parsed RGW services: {services}") + except json.JSONDecodeError as e: + log.error(f"Failed to parse JSON from ceph orch ps: {e}") + log.error(f"Raw output was: {services_json}") + raise ConfigError(f"Invalid JSON from ceph orch ps: {e}") + + endpoints = {} + for service in services: + service_name = service.get("service_name", "") + hostname = service.get("hostname", "") + ports = service.get("ports", []) + status = service.get("status_desc", "") + + if not service_name.startswith("rgw."): + 
continue + + if status != "running": + log.warning(f"RGW service {service_name} is not running: {status}") + continue + + if not ports: + log.warning(f"No ports found for RGW service {service_name}") + continue + + # Extract port number (ports is typically ['8080/tcp'] format) + port = None + for port_spec in ports: + if isinstance(port_spec, str) and "/" in port_spec: + port = int(port_spec.split("/")[0]) + break + elif isinstance(port_spec, int): + port = port_spec + break + + if port is None: + log.warning(f"Could not parse port for RGW service {service_name}: {ports}") + continue + + endpoints[service_name] = { + "hostname": hostname, + "port": port, + "service_name": service_name, + "status": status, + } + + log.info(f"Discovered RGW endpoints: {endpoints}") + return endpoints + + +def map_roles_to_endpoints(ctx, config, discovered_endpoints): + """ + Map teuthology roles to discovered RGW endpoints. + """ + role_endpoints = {} + + for role, client_config in config.items(): + if not client_config.get("discover_from_cephadm"): + continue + + log.info(f"Mapping role {role} to cephadm RGW endpoint") + + target_service = client_config.get("rgw_service") + if target_service and target_service in discovered_endpoints: + endpoint_info = discovered_endpoints[target_service] + log.info(f"Using explicit service mapping: {role} -> {target_service}") + else: + if not discovered_endpoints: + raise ConfigError(f"No RGW endpoints discovered for role {role}") + + service_name = list(discovered_endpoints.keys())[0] + endpoint_info = discovered_endpoints[service_name] + log.info(f"Using first available RGW service: {role} -> {service_name}") + + hostname = endpoint_info["hostname"] + port = endpoint_info["port"] + + dns_name = client_config.get("dns_name", hostname) + + rgw_endpoint = RGWEndpoint( + hostname=hostname, + port=port, + cert=None, + dns_name=dns_name, + website_dns_name=None, + ) + + role_endpoints[role] = rgw_endpoint + log.info(f"Created endpoint for {role}: {hostname}:{port} (dns: {dns_name})") + + return role_endpoints + + +def wait_for_rgw_accessibility(ctx, role_endpoints, timeout=60): + """ + Wait for RGW endpoints to be accessible via HTTP. + """ + log.info("Verifying RGW endpoint accessibility") + + cluster_roles = list(ctx.cluster.remotes.keys()) + test_remote = cluster_roles[0] + + for role, endpoint in role_endpoints.items(): + log.info( + f"Testing accessibility of {role} at {endpoint.hostname}:{endpoint.port}" + ) + + start_time = time.time() + accessible = False + + while time.time() - start_time < timeout: + try: + result = test_remote.run( + args=[ + "curl", + "-s", + "-o", + "/dev/null", + "-w", + "%{http_code}", + "--connect-timeout", + "5", + f"http://{endpoint.hostname}:{endpoint.port}/", + ], + stdout=StringIO(), + check_status=False, + ) + + http_code = result.stdout.getvalue().strip() + log.info(f"HTTP response from {role}: {http_code}") + + if http_code and http_code.isdigit(): + accessible = True + break + + except Exception as e: + log.debug(f"Accessibility test failed for {role}: {e}") + + log.info(f"Waiting for {role} to become accessible...") + time.sleep(2) + + if not accessible: + raise ConfigError(f"RGW endpoint {role} not accessible after {timeout}s") + + log.info(f"RGW endpoint {role} is accessible") + + +def task(ctx, config): + """ + Bridge task to make cephadm-deployed RGW compatible with s3tests. 
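+
+    Discovers running rgw daemons via 'ceph orch ps', maps them to the configured
+    client roles, waits for the endpoints to answer HTTP, and then publishes
+    ctx.rgw.role_endpoints (plus a cephadm_bridge_active flag) for downstream
+    tasks such as s3tests.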
+ + Example usage: + - cephadm_s3_bridge: + client.0: + discover_from_cephadm: true + dns_name: rgw.example.com # optional + rgw_service: rgw.myservice # optional, defaults to first found + """ + if config is None: + config = {} + + log.info("🚀 STARTING cephadm s3tests bridge task") + log.info(f"🔍 DEBUG: Bridge task config: {config}") + + # Extensive context debugging + log.info("🔍 DEBUG: Checking available context attributes...") + log.info(f"🔍 DEBUG: hasattr(ctx, 'ceph') = {hasattr(ctx, 'ceph')}") + log.info(f"🔍 DEBUG: hasattr(ctx, 'cephadm') = {hasattr(ctx, 'cephadm')}") + log.info(f"🔍 DEBUG: hasattr(ctx, 'cluster') = {hasattr(ctx, 'cluster')}") + log.info(f"🔍 DEBUG: hasattr(ctx, 'rgw') = {hasattr(ctx, 'rgw')} (should be False initially)") + + if hasattr(ctx, 'ceph'): + log.info(f"🔍 DEBUG: ctx.ceph keys: {list(ctx.ceph.keys())}") + for cluster_name in ctx.ceph.keys(): + log.info(f"🔍 DEBUG: ctx.ceph[{cluster_name}] attributes: {dir(ctx.ceph[cluster_name])}") + if hasattr(ctx.ceph[cluster_name], 'image'): + log.info(f"🔍 DEBUG: ctx.ceph[{cluster_name}].image = {ctx.ceph[cluster_name].image}") + if hasattr(ctx.ceph[cluster_name], 'fsid'): + log.info(f"🔍 DEBUG: ctx.ceph[{cluster_name}].fsid = {ctx.ceph[cluster_name].fsid}") + else: + log.error("❌ ERROR: ctx.ceph not found - this is critical!") + + if hasattr(ctx, 'cephadm'): + log.info(f"🔍 DEBUG: type(ctx.cephadm) = {type(ctx.cephadm)}") + log.info(f"🔍 DEBUG: ctx.cephadm = {ctx.cephadm}") + else: + log.error("❌ ERROR: ctx.cephadm not found") + + try: + log.info("🔍 Phase 1: Attempting RGW endpoint discovery...") + discovered_endpoints = discover_cephadm_rgw_endpoints(ctx) + log.info(f"✅ SUCCESS: Discovered {len(discovered_endpoints)} RGW endpoints") + except Exception as e: + log.error(f"❌ CRITICAL: RGW endpoint discovery failed: {e}") + log.error(f"❌ Bridge task cannot continue - ctx.rgw will NOT be created!") + raise e + + if not discovered_endpoints: + log.error("❌ CRITICAL: No RGW services found via cephadm orchestrator") + log.error("❌ Bridge task cannot continue - ctx.rgw will NOT be created!") + raise ConfigError("No RGW services found via cephadm orchestrator") + + log.info("🔍 Phase 2: Mapping roles to endpoints...") + role_endpoints = map_roles_to_endpoints(ctx, config, discovered_endpoints) + + if not role_endpoints: + log.error("❌ CRITICAL: No roles configured for RGW endpoint mapping") + log.error("❌ Bridge task cannot continue - ctx.rgw will NOT be created!") + log.error("❌ Check your bridge task configuration - you need at least one role with 'discover_from_cephadm: true'") + return + + log.info(f"✅ SUCCESS: Mapped {len(role_endpoints)} roles to endpoints") + for role, endpoint in role_endpoints.items(): + log.info(f" 🔗 {role} -> {endpoint.hostname}:{endpoint.port}") + + log.info("🔍 Phase 3: Testing RGW endpoint accessibility...") + try: + wait_for_rgw_accessibility(ctx, role_endpoints) + log.info("✅ SUCCESS: All RGW endpoints are accessible") + except Exception as e: + log.error(f"❌ ERROR: RGW accessibility test failed: {e}") + log.error("❌ Continuing anyway - ctx.rgw will still be created") + + log.info("🔍 Phase 4: Creating ctx.rgw structure for s3tests compatibility...") + + # Store original state for debugging + original_rgw_exists = hasattr(ctx, 'rgw') + log.info(f"🔍 DEBUG: Before creation - hasattr(ctx, 'rgw') = {original_rgw_exists}") + + # Phase 4: Create ctx.rgw structure for s3tests compatibility + # Using simple class instead of dynamic type creation for better compatibility + class RGWContext: + pass + + ctx.rgw = 
RGWContext() + ctx.rgw.role_endpoints = role_endpoints + + log.info(f"🔍 DEBUG: After creation - hasattr(ctx, 'rgw') = {hasattr(ctx, 'rgw')}") + log.info(f"🔍 DEBUG: type(ctx.rgw) = {type(ctx.rgw)}") + log.info(f"🔍 DEBUG: hasattr(ctx.rgw, 'role_endpoints') = {hasattr(ctx.rgw, 'role_endpoints')}") + log.info(f"🔍 DEBUG: len(ctx.rgw.role_endpoints) = {len(ctx.rgw.role_endpoints)}") + + log.info(f"✅ SUCCESS: Created ctx.rgw.role_endpoints with {len(role_endpoints)} endpoints") + for role, endpoint in role_endpoints.items(): + log.info(f" 🔗 {role} -> {endpoint.hostname}:{endpoint.port}") + + # Phase 5: Store discovery info and activate bridge + ctx.rgw.cephadm_discovered_endpoints = discovered_endpoints + ctx.rgw.cephadm_bridge_active = True + + log.info(f"🔍 DEBUG: Set ctx.rgw.cephadm_bridge_active = {ctx.rgw.cephadm_bridge_active}") + + # Phase 6: Patch radosgw-admin commands for cephadm compatibility + log.info("🔍 Phase 5: Setting up radosgw-admin monkey patching...") + try: + patch_s3tests_radosgw_admin(ctx) + log.info("✅ SUCCESS: Monkey patch for radosgw-admin commands activated") + except Exception as e: + log.error(f"❌ ERROR: Monkey patch setup failed: {e}") + raise e + + # Final verification for s3tests compatibility + log.info("🔍 FINAL VERIFICATION: Checking s3tests compatibility...") + log.info(f"✅ hasattr(ctx, 'rgw') = {hasattr(ctx, 'rgw')}") + log.info(f"✅ type(ctx.rgw) = {type(ctx.rgw)}") + log.info(f"✅ hasattr(ctx.rgw, 'role_endpoints') = {hasattr(ctx.rgw, 'role_endpoints')}") + log.info(f"✅ ctx.rgw.cephadm_bridge_active = {getattr(ctx.rgw, 'cephadm_bridge_active', 'MISSING')}") + log.info(f"✅ len(ctx.rgw.role_endpoints) = {len(getattr(ctx.rgw, 'role_endpoints', {}))}") + + log.info("🎉 SUCCESS: cephadm s3tests bridge task completed successfully!") + log.info("🎉 ctx.rgw is now ready for s3tests - the assertion should PASS!") + + try: + yield + finally: + # Cleanup logging with more detail + log.info("🔄 BRIDGE CLEANUP: Starting bridge task cleanup...") + log.info("🔄 Note: Monkey patch remains active for test duration (this is expected)") + log.info(f"🔄 Final state: hasattr(ctx, 'rgw') = {hasattr(ctx, 'rgw')}") + if hasattr(ctx, 'rgw'): + log.info(f"🔄 Final state: hasattr(ctx.rgw, 'cephadm_bridge_active') = {hasattr(ctx.rgw, 'cephadm_bridge_active')}") + log.info(f"🔄 Final state: len(ctx.rgw.role_endpoints) = {len(getattr(ctx.rgw, 'role_endpoints', {}))}") + log.info(f"🔄 Final state: ctx.rgw.cephadm_bridge_active = {getattr(ctx.rgw, 'cephadm_bridge_active', 'MISSING')}") + else: + log.error("🔄 ❌ CRITICAL: ctx.rgw was lost during test execution!") + log.info("🔄 Bridge task cleanup completed") diff --git a/qa/tasks/python.py b/qa/tasks/python.py index 4ddb14f714538..aacb01ee45426 100644 --- a/qa/tasks/python.py +++ b/qa/tasks/python.py @@ -35,11 +35,18 @@ def task(ctx, config): (remote,) = ctx.cluster.only(role).remotes.keys() log.info('Running python on role %s host %s', role, remote.name) log.info(code) + + # Handle both string and list input for code + if isinstance(code, list): + code_str = '\n'.join(code) + else: + code_str = code + args=[ 'TESTDIR={tdir}'.format(tdir=testdir), 'python3', ] if sudo: args = ['sudo'] + args - remote.run(args=args, stdin=subst_vip(ctx, code)) + remote.run(args=args, stdin=subst_vip(ctx, code_str)) From 0857e18563208584d63b257baddab169e60fb6b8 Mon Sep 17 00:00:00 2001 From: Joshua Blanch Date: Fri, 8 Aug 2025 04:26:47 +0000 Subject: [PATCH 03/17] qa/s3bridge: asserts for ctx properties Signed-off-by: Joshua Blanch --- qa/tasks/cephadm_s3_bridge.py | 112 
+++++++--------------------------- 1 file changed, 22 insertions(+), 90 deletions(-) diff --git a/qa/tasks/cephadm_s3_bridge.py b/qa/tasks/cephadm_s3_bridge.py index d870679fc207f..f4dcdca69365b 100644 --- a/qa/tasks/cephadm_s3_bridge.py +++ b/qa/tasks/cephadm_s3_bridge.py @@ -314,128 +314,60 @@ def task(ctx, config): if config is None: config = {} - log.info("🚀 STARTING cephadm s3tests bridge task") - log.info(f"🔍 DEBUG: Bridge task config: {config}") - - # Extensive context debugging - log.info("🔍 DEBUG: Checking available context attributes...") - log.info(f"🔍 DEBUG: hasattr(ctx, 'ceph') = {hasattr(ctx, 'ceph')}") - log.info(f"🔍 DEBUG: hasattr(ctx, 'cephadm') = {hasattr(ctx, 'cephadm')}") - log.info(f"🔍 DEBUG: hasattr(ctx, 'cluster') = {hasattr(ctx, 'cluster')}") - log.info(f"🔍 DEBUG: hasattr(ctx, 'rgw') = {hasattr(ctx, 'rgw')} (should be False initially)") - - if hasattr(ctx, 'ceph'): - log.info(f"🔍 DEBUG: ctx.ceph keys: {list(ctx.ceph.keys())}") - for cluster_name in ctx.ceph.keys(): - log.info(f"🔍 DEBUG: ctx.ceph[{cluster_name}] attributes: {dir(ctx.ceph[cluster_name])}") - if hasattr(ctx.ceph[cluster_name], 'image'): - log.info(f"🔍 DEBUG: ctx.ceph[{cluster_name}].image = {ctx.ceph[cluster_name].image}") - if hasattr(ctx.ceph[cluster_name], 'fsid'): - log.info(f"🔍 DEBUG: ctx.ceph[{cluster_name}].fsid = {ctx.ceph[cluster_name].fsid}") - else: - log.error("❌ ERROR: ctx.ceph not found - this is critical!") - - if hasattr(ctx, 'cephadm'): - log.info(f"🔍 DEBUG: type(ctx.cephadm) = {type(ctx.cephadm)}") - log.info(f"🔍 DEBUG: ctx.cephadm = {ctx.cephadm}") - else: - log.error("❌ ERROR: ctx.cephadm not found") + # Critical context assertions - fail fast if something is missing + assert hasattr(ctx, 'ceph'), 'ctx.ceph not found - cephadm bridge requires ceph context' + assert hasattr(ctx, 'cephadm'), 'ctx.cephadm not found - cephadm bridge requires cephadm context' + assert hasattr(ctx, 'cluster'), 'ctx.cluster not found - cephadm bridge requires cluster context' + assert not hasattr(ctx, 'rgw'), 'ctx.rgw already exists - bridge should run before rgw task' try: - log.info("🔍 Phase 1: Attempting RGW endpoint discovery...") discovered_endpoints = discover_cephadm_rgw_endpoints(ctx) - log.info(f"✅ SUCCESS: Discovered {len(discovered_endpoints)} RGW endpoints") except Exception as e: - log.error(f"❌ CRITICAL: RGW endpoint discovery failed: {e}") - log.error(f"❌ Bridge task cannot continue - ctx.rgw will NOT be created!") + log.error(f"RGW endpoint discovery failed: {e}") raise e if not discovered_endpoints: - log.error("❌ CRITICAL: No RGW services found via cephadm orchestrator") - log.error("❌ Bridge task cannot continue - ctx.rgw will NOT be created!") raise ConfigError("No RGW services found via cephadm orchestrator") - log.info("🔍 Phase 2: Mapping roles to endpoints...") role_endpoints = map_roles_to_endpoints(ctx, config, discovered_endpoints) if not role_endpoints: - log.error("❌ CRITICAL: No roles configured for RGW endpoint mapping") - log.error("❌ Bridge task cannot continue - ctx.rgw will NOT be created!") - log.error("❌ Check your bridge task configuration - you need at least one role with 'discover_from_cephadm: true'") + log.error("No roles configured for RGW endpoint mapping") + log.error("Check your bridge task configuration - you need at least one role with 'discover_from_cephadm: true'") return - log.info(f"✅ SUCCESS: Mapped {len(role_endpoints)} roles to endpoints") - for role, endpoint in role_endpoints.items(): - log.info(f" 🔗 {role} -> {endpoint.hostname}:{endpoint.port}") - - 
log.info("🔍 Phase 3: Testing RGW endpoint accessibility...") try: wait_for_rgw_accessibility(ctx, role_endpoints) - log.info("✅ SUCCESS: All RGW endpoints are accessible") except Exception as e: - log.error(f"❌ ERROR: RGW accessibility test failed: {e}") - log.error("❌ Continuing anyway - ctx.rgw will still be created") - - log.info("🔍 Phase 4: Creating ctx.rgw structure for s3tests compatibility...") - - # Store original state for debugging - original_rgw_exists = hasattr(ctx, 'rgw') - log.info(f"🔍 DEBUG: Before creation - hasattr(ctx, 'rgw') = {original_rgw_exists}") + log.error(f"RGW accessibility test failed: {e}") + log.error("Continuing anyway - ctx.rgw will still be created") - # Phase 4: Create ctx.rgw structure for s3tests compatibility - # Using simple class instead of dynamic type creation for better compatibility + # Create ctx.rgw structure for s3tests compatibility class RGWContext: pass ctx.rgw = RGWContext() ctx.rgw.role_endpoints = role_endpoints - - log.info(f"🔍 DEBUG: After creation - hasattr(ctx, 'rgw') = {hasattr(ctx, 'rgw')}") - log.info(f"🔍 DEBUG: type(ctx.rgw) = {type(ctx.rgw)}") - log.info(f"🔍 DEBUG: hasattr(ctx.rgw, 'role_endpoints') = {hasattr(ctx.rgw, 'role_endpoints')}") - log.info(f"🔍 DEBUG: len(ctx.rgw.role_endpoints) = {len(ctx.rgw.role_endpoints)}") - - log.info(f"✅ SUCCESS: Created ctx.rgw.role_endpoints with {len(role_endpoints)} endpoints") - for role, endpoint in role_endpoints.items(): - log.info(f" 🔗 {role} -> {endpoint.hostname}:{endpoint.port}") - - # Phase 5: Store discovery info and activate bridge ctx.rgw.cephadm_discovered_endpoints = discovered_endpoints ctx.rgw.cephadm_bridge_active = True - - log.info(f"🔍 DEBUG: Set ctx.rgw.cephadm_bridge_active = {ctx.rgw.cephadm_bridge_active}") - # Phase 6: Patch radosgw-admin commands for cephadm compatibility - log.info("🔍 Phase 5: Setting up radosgw-admin monkey patching...") + # Setup radosgw-admin monkey patching try: patch_s3tests_radosgw_admin(ctx) - log.info("✅ SUCCESS: Monkey patch for radosgw-admin commands activated") except Exception as e: - log.error(f"❌ ERROR: Monkey patch setup failed: {e}") + log.error(f"Monkey patch setup failed: {e}") raise e - # Final verification for s3tests compatibility - log.info("🔍 FINAL VERIFICATION: Checking s3tests compatibility...") - log.info(f"✅ hasattr(ctx, 'rgw') = {hasattr(ctx, 'rgw')}") - log.info(f"✅ type(ctx.rgw) = {type(ctx.rgw)}") - log.info(f"✅ hasattr(ctx.rgw, 'role_endpoints') = {hasattr(ctx.rgw, 'role_endpoints')}") - log.info(f"✅ ctx.rgw.cephadm_bridge_active = {getattr(ctx.rgw, 'cephadm_bridge_active', 'MISSING')}") - log.info(f"✅ len(ctx.rgw.role_endpoints) = {len(getattr(ctx.rgw, 'role_endpoints', {}))}") - - log.info("🎉 SUCCESS: cephadm s3tests bridge task completed successfully!") - log.info("🎉 ctx.rgw is now ready for s3tests - the assertion should PASS!") + # Final verification assertions + assert hasattr(ctx, 'rgw'), 'ctx.rgw was not created successfully' + assert hasattr(ctx.rgw, 'role_endpoints'), 'ctx.rgw.role_endpoints was not created' + assert hasattr(ctx.rgw, 'cephadm_bridge_active'), 'ctx.rgw.cephadm_bridge_active was not set' + assert ctx.rgw.cephadm_bridge_active, 'ctx.rgw.cephadm_bridge_active is not True' + assert len(ctx.rgw.role_endpoints) > 0, 'ctx.rgw.role_endpoints is empty' try: yield finally: - # Cleanup logging with more detail - log.info("🔄 BRIDGE CLEANUP: Starting bridge task cleanup...") - log.info("🔄 Note: Monkey patch remains active for test duration (this is expected)") - log.info(f"🔄 Final state: hasattr(ctx, 
'rgw') = {hasattr(ctx, 'rgw')}") - if hasattr(ctx, 'rgw'): - log.info(f"🔄 Final state: hasattr(ctx.rgw, 'cephadm_bridge_active') = {hasattr(ctx.rgw, 'cephadm_bridge_active')}") - log.info(f"🔄 Final state: len(ctx.rgw.role_endpoints) = {len(getattr(ctx.rgw, 'role_endpoints', {}))}") - log.info(f"🔄 Final state: ctx.rgw.cephadm_bridge_active = {getattr(ctx.rgw, 'cephadm_bridge_active', 'MISSING')}") - else: - log.error("🔄 ❌ CRITICAL: ctx.rgw was lost during test execution!") - log.info("🔄 Bridge task cleanup completed") + # Verify ctx.rgw survived test execution + assert hasattr(ctx, 'rgw'), 'ctx.rgw was lost during test execution' + assert hasattr(ctx.rgw, 'cephadm_bridge_active'), 'ctx.rgw.cephadm_bridge_active was lost' From 917432095f1d62002e9713e9293cc9815f2de648 Mon Sep 17 00:00:00 2001 From: Joshua Blanch Date: Fri, 8 Aug 2025 04:34:11 +0000 Subject: [PATCH 04/17] qa/s3bridge: more asserts Signed-off-by: Joshua Blanch --- qa/tasks/cephadm_s3_bridge.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/qa/tasks/cephadm_s3_bridge.py b/qa/tasks/cephadm_s3_bridge.py index f4dcdca69365b..a935fd3fdec8e 100644 --- a/qa/tasks/cephadm_s3_bridge.py +++ b/qa/tasks/cephadm_s3_bridge.py @@ -318,7 +318,10 @@ def task(ctx, config): assert hasattr(ctx, 'ceph'), 'ctx.ceph not found - cephadm bridge requires ceph context' assert hasattr(ctx, 'cephadm'), 'ctx.cephadm not found - cephadm bridge requires cephadm context' assert hasattr(ctx, 'cluster'), 'ctx.cluster not found - cephadm bridge requires cluster context' - assert not hasattr(ctx, 'rgw'), 'ctx.rgw already exists - bridge should run before rgw task' + + # Allow ctx.rgw to exist from cephadm tasks, but ensure it doesn't have role_endpoints + if hasattr(ctx, 'rgw') and hasattr(ctx.rgw, 'role_endpoints'): + raise ConfigError('ctx.rgw.role_endpoints already exists - bridge should run before other rgw configuration tasks') try: discovered_endpoints = discover_cephadm_rgw_endpoints(ctx) @@ -343,10 +346,11 @@ def task(ctx, config): log.error("Continuing anyway - ctx.rgw will still be created") # Create ctx.rgw structure for s3tests compatibility - class RGWContext: - pass - - ctx.rgw = RGWContext() + if not hasattr(ctx, 'rgw'): + class RGWContext: + pass + ctx.rgw = RGWContext() + ctx.rgw.role_endpoints = role_endpoints ctx.rgw.cephadm_discovered_endpoints = discovered_endpoints ctx.rgw.cephadm_bridge_active = True From a53ad01feaa45cd2326efc6e8cc80520a2989ffa Mon Sep 17 00:00:00 2001 From: Joshua Blanch Date: Fri, 8 Aug 2025 04:42:24 +0000 Subject: [PATCH 05/17] s3bridge: debug logs Signed-off-by: Joshua Blanch --- qa/tasks/cephadm_s3_bridge.py | 62 +++++++++++++++++++++-------------- 1 file changed, 37 insertions(+), 25 deletions(-) diff --git a/qa/tasks/cephadm_s3_bridge.py b/qa/tasks/cephadm_s3_bridge.py index a935fd3fdec8e..e4be2a24fed25 100644 --- a/qa/tasks/cephadm_s3_bridge.py +++ b/qa/tasks/cephadm_s3_bridge.py @@ -80,16 +80,12 @@ def cephadm_aware_run(self, **kwargs): return original_run(self, **kwargs) - teuthology.orchestra.remote.Remote.run = cephadm_aware_run def restore_original_remote_run(): """Restore original remote run method (for cleanup)""" - # TODO: In practice, this is tricky to implement cleanly since we don't - # store the original reference. The monkey patch will remain active - # for the duration of the test run, which is typically desired. 
- log.info("Note: Monkey patch cleanup not implemented - patch remains active") + log.info("not implemented - patch remains active") def discover_cephadm_rgw_endpoints(ctx): @@ -314,14 +310,25 @@ def task(ctx, config): if config is None: config = {} - # Critical context assertions - fail fast if something is missing - assert hasattr(ctx, 'ceph'), 'ctx.ceph not found - cephadm bridge requires ceph context' - assert hasattr(ctx, 'cephadm'), 'ctx.cephadm not found - cephadm bridge requires cephadm context' - assert hasattr(ctx, 'cluster'), 'ctx.cluster not found - cephadm bridge requires cluster context' - + log.info("Starting cephadm s3tests bridge task") + + assert hasattr(ctx, "ceph"), ( + "ctx.ceph not found - cephadm bridge requires ceph context" + ) + assert hasattr(ctx, "cephadm"), ( + "ctx.cephadm not found - cephadm bridge requires cephadm context" + ) + assert hasattr(ctx, "cluster"), ( + "ctx.cluster not found - cephadm bridge requires cluster context" + ) + + log.info("Context assertions passed, checking for existing ctx.rgw...") + # Allow ctx.rgw to exist from cephadm tasks, but ensure it doesn't have role_endpoints - if hasattr(ctx, 'rgw') and hasattr(ctx.rgw, 'role_endpoints'): - raise ConfigError('ctx.rgw.role_endpoints already exists - bridge should run before other rgw configuration tasks') + if hasattr(ctx, "rgw") and hasattr(ctx.rgw, "role_endpoints"): + raise ConfigError( + "ctx.rgw.role_endpoints already exists - bridge should run before other rgw configuration tasks" + ) try: discovered_endpoints = discover_cephadm_rgw_endpoints(ctx) @@ -336,7 +343,9 @@ def task(ctx, config): if not role_endpoints: log.error("No roles configured for RGW endpoint mapping") - log.error("Check your bridge task configuration - you need at least one role with 'discover_from_cephadm: true'") + log.error( + "Check your bridge task configuration - you need at least one role with 'discover_from_cephadm: true'" + ) return try: @@ -346,32 +355,35 @@ def task(ctx, config): log.error("Continuing anyway - ctx.rgw will still be created") # Create ctx.rgw structure for s3tests compatibility - if not hasattr(ctx, 'rgw'): + if not hasattr(ctx, "rgw"): + class RGWContext: pass + ctx.rgw = RGWContext() - + ctx.rgw.role_endpoints = role_endpoints ctx.rgw.cephadm_discovered_endpoints = discovered_endpoints ctx.rgw.cephadm_bridge_active = True - # Setup radosgw-admin monkey patching try: patch_s3tests_radosgw_admin(ctx) except Exception as e: log.error(f"Monkey patch setup failed: {e}") raise e - # Final verification assertions - assert hasattr(ctx, 'rgw'), 'ctx.rgw was not created successfully' - assert hasattr(ctx.rgw, 'role_endpoints'), 'ctx.rgw.role_endpoints was not created' - assert hasattr(ctx.rgw, 'cephadm_bridge_active'), 'ctx.rgw.cephadm_bridge_active was not set' - assert ctx.rgw.cephadm_bridge_active, 'ctx.rgw.cephadm_bridge_active is not True' - assert len(ctx.rgw.role_endpoints) > 0, 'ctx.rgw.role_endpoints is empty' + assert hasattr(ctx, "rgw"), "ctx.rgw was not created successfully" + assert hasattr(ctx.rgw, "role_endpoints"), "ctx.rgw.role_endpoints was not created" + assert hasattr(ctx.rgw, "cephadm_bridge_active"), ( + "ctx.rgw.cephadm_bridge_active was not set" + ) + assert ctx.rgw.cephadm_bridge_active, "ctx.rgw.cephadm_bridge_active is not True" + assert len(ctx.rgw.role_endpoints) > 0, "ctx.rgw.role_endpoints is empty" try: yield finally: - # Verify ctx.rgw survived test execution - assert hasattr(ctx, 'rgw'), 'ctx.rgw was lost during test execution' - assert hasattr(ctx.rgw, 
'cephadm_bridge_active'), 'ctx.rgw.cephadm_bridge_active was lost' + assert hasattr(ctx, "rgw"), "ctx.rgw was lost during test execution" + assert hasattr(ctx.rgw, "cephadm_bridge_active"), ( + "ctx.rgw.cephadm_bridge_active was lost" + ) From db85d1e3cff13591c7a1844fff31a5d3c2cb57cf Mon Sep 17 00:00:00 2001 From: Joshua Blanch Date: Fri, 8 Aug 2025 06:00:07 +0000 Subject: [PATCH 06/17] s3bridge: finds 'radosgw-admin' anywhere in args, splits prefix/admin_and_rest, builds new_args = prefix + cephadm_prefix + admin_and_rest, Signed-off-by: Joshua Blanch --- qa/tasks/cephadm_s3_bridge.py | 74 +++++++++++++++++++++-------------- 1 file changed, 44 insertions(+), 30 deletions(-) diff --git a/qa/tasks/cephadm_s3_bridge.py b/qa/tasks/cephadm_s3_bridge.py index e4be2a24fed25..b22245ffe2138 100644 --- a/qa/tasks/cephadm_s3_bridge.py +++ b/qa/tasks/cephadm_s3_bridge.py @@ -38,45 +38,59 @@ def patch_s3tests_radosgw_admin(ctx): """ Monkey patch teuthology remote execution to make radosgw-admin commands work inside cephadm containers when running s3tests. + + Many teuthology tasks (eg. s3tests, rgw helpers) invoke radosgw-admin with + wrapper prefixes like ["adjust-ulimits", "ceph-coverage", , ... , + "radosgw-admin", ...]. The original patch only matched when args[0] was + "radosgw-admin" which missed these cases. Here we detect radosgw-admin at + any position, split the prefix, and wrap only the radosgw-admin portion + inside a 'sudo shell -c ... -k ... -- ' call. """ - log.info("convert radosgw-admin to cephadm command") + log.info("Enabling cephadm-aware radosgw-admin monkey patch for s3tests") original_run = teuthology.orchestra.remote.Remote.run def cephadm_aware_run(self, **kwargs): args = kwargs.get("args", []) - if args and len(args) > 0 and args[0] == "radosgw-admin": - if detect_cephadm_deployment(ctx): + try: + # Locate the radosgw-admin binary within args (not just at index 0) + admin_idx = -1 + for i, a in enumerate(args): + if isinstance(a, str) and a == "radosgw-admin": + admin_idx = i + break + + if admin_idx != -1 and detect_cephadm_deployment(ctx): log.info(f"Intercepting radosgw-admin command: {args}") - try: - cluster_name = ( - list(ctx.ceph.keys())[0] if hasattr(ctx, "ceph") else "ceph" - ) - image = ctx.ceph[cluster_name].image - - cephadm_args = [ - "sudo", - "cephadm", - "--image", - image, - "shell", - "-c", - f"/etc/ceph/{cluster_name}.conf", - "-k", - f"/etc/ceph/{cluster_name}.client.admin.keyring", - "--fsid", - ctx.ceph[cluster_name].fsid, - "--", - ] + args - - log.info(f"Converted to cephadm shell command: {cephadm_args}") - kwargs["args"] = cephadm_args - - except Exception as e: - log.error(f"Failed to convert radosgw-admin to cephadm shell: {e}") - pass + cluster_name = list(ctx.ceph.keys())[0] if hasattr(ctx, "ceph") else "ceph" + image = ctx.ceph[cluster_name].image + fsid = ctx.ceph[cluster_name].fsid + cephadm_bin = getattr(ctx, "cephadm", "cephadm") + + # Everything before radosgw-admin should remain as-is + prefix = list(args[:admin_idx]) + admin_and_rest = list(args[admin_idx:]) + + cephadm_prefix = [ + "sudo", + cephadm_bin, + "--image", image, + "shell", + "-c", f"/etc/ceph/{cluster_name}.conf", + "-k", f"/etc/ceph/{cluster_name}.client.admin.keyring", + "--fsid", fsid, + "--", + ] + + new_args = prefix + cephadm_prefix + admin_and_rest + log.info(f"Converted to cephadm shell command: {new_args}") + kwargs["args"] = new_args + + except Exception as e: + # On any failure, fall back to original behavior + log.error(f"cephadm radosgw-admin monkey patch error: 
{e}") return original_run(self, **kwargs) From 8f2fa488e6b397c36f9725e86c7fe15f69371e4d Mon Sep 17 00:00:00 2001 From: Joshua Blanch Date: Fri, 8 Aug 2025 06:30:53 +0000 Subject: [PATCH 07/17] bump Signed-off-by: Joshua Blanch --- .../s3tests-bridge/bridge-test-only.yaml | 92 ++++--------------- qa/tasks/cephadm_s3_bridge.py | 44 ++++++--- 2 files changed, 45 insertions(+), 91 deletions(-) diff --git a/qa/suites/orch/cephadm/s3tests-bridge/bridge-test-only.yaml b/qa/suites/orch/cephadm/s3tests-bridge/bridge-test-only.yaml index d1cd13f3664d2..80c42d2339fdb 100644 --- a/qa/suites/orch/cephadm/s3tests-bridge/bridge-test-only.yaml +++ b/qa/suites/orch/cephadm/s3tests-bridge/bridge-test-only.yaml @@ -25,64 +25,17 @@ tasks: - echo "Bridge task completed - ctx.rgw should now be available for s3tests" - echo "Testing RGW accessibility..." -# This uses the same execution path that s3tests uses, so it will trigger our monkey patch -- python: +# Test the monkey patch by directly executing radosgw-admin commands +# The bridge should intercept these and run them inside cephadm containers +- exec: client.0: - - | - import logging - from io import StringIO - log = logging.getLogger(__name__) - - log.info("=== Testing radosgw-admin monkey patch ===") - - # Get the remote connection (same way s3tests does it) - remote = list(ctx.cluster.only('client.0').remotes.keys())[0] - - log.info("Testing user creation via remote.run() - this should trigger monkey patch") - try: - # This should be intercepted by our monkey patch and converted to cephadm shell - result = remote.run( - args=[ - 'radosgw-admin', 'user', 'create', - '--uid=testuser', - '--display-name=Test User', - '--access-key=testkey', - '--secret-key=testsecret' - ], - stdout=StringIO() - ) - log.info("✅ SUCCESS: radosgw-admin user create executed through monkey patch!") - output = result.stdout.getvalue() if hasattr(result.stdout, 'getvalue') else str(result.stdout) - log.info(f"Command output length: {len(output)} chars") - if 'testuser' in output or 'access_key' in output: - log.info("✅ User creation output looks correct") - else: - log.warning(f"⚠️ Unexpected output: {output[:200]}...") - except Exception as e: - log.error(f"❌ FAILED: radosgw-admin user create failed: {e}") - raise - - log.info("Testing user info retrieval via remote.run()") - try: - result = remote.run( - args=[ - 'radosgw-admin', 'user', 'info', - '--uid=testuser' - ], - stdout=StringIO() - ) - log.info("✅ SUCCESS: radosgw-admin user info executed through monkey patch!") - output = result.stdout.getvalue() if hasattr(result.stdout, 'getvalue') else str(result.stdout) - if 'testuser' in output and 'access_key' in output: - log.info("✅ User info output contains expected fields") - else: - log.warning(f"⚠️ User info output: {output[:200]}...") - except Exception as e: - log.error(f"❌ FAILED: radosgw-admin user info failed: {e}") - raise - - log.info("=== ✅ Monkey patch test completed successfully! ===") - log.info("This confirms that s3tests radosgw-admin commands will work with cephadm") + - echo "=== Testing radosgw-admin monkey patch ===" + - echo "Creating test user (this should be intercepted by bridge)..." + - radosgw-admin user create --uid=testuser --display-name="Test User" --access-key=testkey --secret-key=testsecret + - echo "✅ User creation succeeded!" + - echo "Retrieving user info..." + - radosgw-admin user info --uid=testuser + - echo "✅ User info retrieval succeeded!" 
# Test S3 endpoint accessibility - exec: @@ -92,23 +45,10 @@ tasks: - echo "RGW Response: $response" - 'if echo "$response" | grep -q "ListBucketResult\|Error\|xml\|ACCESS_DENIED"; then echo "✅ RGW is responding with valid S3 response"; else echo "✗ RGW not responding correctly"; fi' -# Cleanup test user using monkey patch (another test) -- python: +# Cleanup test user (another test of the monkey patch) +- exec: client.0: - - | - import logging - log = logging.getLogger(__name__) - - log.info("=== Testing cleanup via monkey patch ===") - remote = list(ctx.cluster.only('client.0').remotes.keys())[0] - - try: - result = remote.run(args=[ - 'radosgw-admin', 'user', 'rm', - '--uid=testuser' - ]) - log.info("✅ SUCCESS: User cleanup via monkey patch worked!") - except Exception as e: - log.warning(f"⚠️ Cleanup failed (this is often expected): {e}") - - log.info("=== Bridge functionality test completed ===") + - echo "=== Testing cleanup via monkey patch ===" + - echo "Removing test user..." + - radosgw-admin user rm --uid=testuser || echo "⚠️ Cleanup failed (this is often expected if user doesn't exist)" + - echo "=== Bridge functionality test completed ===" diff --git a/qa/tasks/cephadm_s3_bridge.py b/qa/tasks/cephadm_s3_bridge.py index b22245ffe2138..2d6ea09d021ac 100644 --- a/qa/tasks/cephadm_s3_bridge.py +++ b/qa/tasks/cephadm_s3_bridge.py @@ -177,30 +177,36 @@ def discover_cephadm_rgw_endpoints(ctx): ports = service.get("ports", []) status = service.get("status_desc", "") + log.info(f"Processing service: {service_name}, hostname: {hostname}, ports: {ports}, status: {status}") + if not service_name.startswith("rgw."): + log.debug(f"Skipping non-RGW service: {service_name}") continue - if status != "running": + if "running" not in status.lower(): log.warning(f"RGW service {service_name} is not running: {status}") - continue - - if not ports: - log.warning(f"No ports found for RGW service {service_name}") - continue + # Allow non-running services through for now, s3tests might still work + log.info(f"Continuing with non-running service {service_name} - s3tests might still work") # Extract port number (ports is typically ['8080/tcp'] format) port = None - for port_spec in ports: - if isinstance(port_spec, str) and "/" in port_spec: - port = int(port_spec.split("/")[0]) - break - elif isinstance(port_spec, int): - port = port_spec - break + if ports: + for port_spec in ports: + if isinstance(port_spec, str) and "/" in port_spec: + try: + port = int(port_spec.split("/")[0]) + break + except ValueError: + continue + elif isinstance(port_spec, int): + port = port_spec + break if port is None: log.warning(f"Could not parse port for RGW service {service_name}: {ports}") - continue + # Fall back to default RGW port 8080 + port = 8080 + log.info(f"Using default port {port} for {service_name}") endpoints[service_name] = { "hostname": hostname, @@ -209,6 +215,8 @@ def discover_cephadm_rgw_endpoints(ctx): "status": status, } + log.info(f"Added endpoint: {service_name} -> {hostname}:{port} (status: {status})") + log.info(f"Discovered RGW endpoints: {endpoints}") return endpoints @@ -351,7 +359,13 @@ def task(ctx, config): raise e if not discovered_endpoints: - raise ConfigError("No RGW services found via cephadm orchestrator") + log.error("No RGW services found via cephadm orchestrator") + log.error("This usually means:") + log.error(" 1. No RGW services have been deployed yet") + log.error(" 2. RGW services haven't started yet (check with 'ceph orch ps')") + log.error(" 3. 
cephadm bridge is running before RGW deployment") + log.error("Make sure to run cephadm.apply (with RGW service) and cephadm.wait_for_service before this bridge") + raise ConfigError("No RGW services found via cephadm orchestrator - see logs for troubleshooting steps") role_endpoints = map_roles_to_endpoints(ctx, config, discovered_endpoints) From 377a31174b93c5ac2ca85e12e3c0d1433ebb402e Mon Sep 17 00:00:00 2001 From: Joshua Blanch Date: Fri, 8 Aug 2025 06:56:19 +0000 Subject: [PATCH 08/17] get rid of tox Signed-off-by: Joshua Blanch --- qa/suites/orch/cephadm/s3tests-bridge/basic-s3tests.yaml | 3 --- qa/suites/orch/cephadm/s3tests-bridge/minimal-test.yaml | 2 -- 2 files changed, 5 deletions(-) diff --git a/qa/suites/orch/cephadm/s3tests-bridge/basic-s3tests.yaml b/qa/suites/orch/cephadm/s3tests-bridge/basic-s3tests.yaml index 53d7dfd3844f0..732eb76745b81 100644 --- a/qa/suites/orch/cephadm/s3tests-bridge/basic-s3tests.yaml +++ b/qa/suites/orch/cephadm/s3tests-bridge/basic-s3tests.yaml @@ -30,9 +30,6 @@ tasks: client.0: discover_from_cephadm: true -# note: tox is needed -- tox: [client.0] - - s3tests: client.0: rgw_server: client.0 diff --git a/qa/suites/orch/cephadm/s3tests-bridge/minimal-test.yaml b/qa/suites/orch/cephadm/s3tests-bridge/minimal-test.yaml index f1424fcb6874a..ba9c8b1b16eee 100644 --- a/qa/suites/orch/cephadm/s3tests-bridge/minimal-test.yaml +++ b/qa/suites/orch/cephadm/s3tests-bridge/minimal-test.yaml @@ -21,8 +21,6 @@ tasks: discover_from_cephadm: true dns_name: client.0 -- tox: [client.0] - - s3tests: client.0: rgw_server: client.0 From 83c65d82431fe43373a88eb6ef0485c713f7faf8 Mon Sep 17 00:00:00 2001 From: Joshua Blanch Date: Fri, 8 Aug 2025 14:39:38 +0000 Subject: [PATCH 09/17] bump Signed-off-by: Joshua Blanch --- .../cephadm/s3tests-bridge/basic-s3tests.yaml | 26 +++++------ .../s3tests-bridge/bridge-test-only.yaml | 34 +-------------- .../cephadm/s3tests-bridge/minimal-test.yaml | 43 ++++++++++--------- qa/tasks/cephadm_s3_bridge.py | 2 +- 4 files changed, 39 insertions(+), 66 deletions(-) diff --git a/qa/suites/orch/cephadm/s3tests-bridge/basic-s3tests.yaml b/qa/suites/orch/cephadm/s3tests-bridge/basic-s3tests.yaml index 732eb76745b81..784cdf4d8476c 100644 --- a/qa/suites/orch/cephadm/s3tests-bridge/basic-s3tests.yaml +++ b/qa/suites/orch/cephadm/s3tests-bridge/basic-s3tests.yaml @@ -30,16 +30,16 @@ tasks: client.0: discover_from_cephadm: true -- s3tests: - client.0: - rgw_server: client.0 - force-branch: master - conf: - DEFAULT: - is_secure: false - port: 8080 - calling_format: ordinary - fixtures: - bucket prefix: test-{random}- - # Only run 3 basic tests to verify bridge works - filter: "test_bucket_list_empty or test_bucket_create_naming_good_long_255" +# - s3tests: +# client.0: +# rgw_server: client.0 +# force-branch: master +# conf: +# DEFAULT: +# is_secure: false +# port: 8080 +# calling_format: ordinary +# fixtures: +# bucket prefix: test-{random}- +# # Only run 3 basic tests to verify bridge works +# filter: "test_bucket_list_empty or test_bucket_create_naming_good_long_255" diff --git a/qa/suites/orch/cephadm/s3tests-bridge/bridge-test-only.yaml b/qa/suites/orch/cephadm/s3tests-bridge/bridge-test-only.yaml index 80c42d2339fdb..58c99d7eee03f 100644 --- a/qa/suites/orch/cephadm/s3tests-bridge/bridge-test-only.yaml +++ b/qa/suites/orch/cephadm/s3tests-bridge/bridge-test-only.yaml @@ -19,36 +19,6 @@ tasks: - cephadm_s3_bridge: client.0: discover_from_cephadm: true - -- exec: - client.0: - - echo "Bridge task completed - ctx.rgw should now be available for 
s3tests" - - echo "Testing RGW accessibility..." - -# Test the monkey patch by directly executing radosgw-admin commands -# The bridge should intercept these and run them inside cephadm containers -- exec: - client.0: - - echo "=== Testing radosgw-admin monkey patch ===" - - echo "Creating test user (this should be intercepted by bridge)..." - - radosgw-admin user create --uid=testuser --display-name="Test User" --access-key=testkey --secret-key=testsecret - - echo "✅ User creation succeeded!" - - echo "Retrieving user info..." - - radosgw-admin user info --uid=testuser - - echo "✅ User info retrieval succeeded!" - -# Test S3 endpoint accessibility -- exec: - client.0: - - echo "Testing S3 endpoint accessibility..." - - 'response=$(curl -s http://localhost:8080/ 2>/dev/null || echo "CONNECTION_FAILED")' - - echo "RGW Response: $response" - - 'if echo "$response" | grep -q "ListBucketResult\|Error\|xml\|ACCESS_DENIED"; then echo "✅ RGW is responding with valid S3 response"; else echo "✗ RGW not responding correctly"; fi' - -# Cleanup test user (another test of the monkey patch) -- exec: +- rgw: client.0: - - echo "=== Testing cleanup via monkey patch ===" - - echo "Removing test user..." - - radosgw-admin user rm --uid=testuser || echo "⚠️ Cleanup failed (this is often expected if user doesn't exist)" - - echo "=== Bridge functionality test completed ===" + diff --git a/qa/suites/orch/cephadm/s3tests-bridge/minimal-test.yaml b/qa/suites/orch/cephadm/s3tests-bridge/minimal-test.yaml index ba9c8b1b16eee..5b8ccef88a806 100644 --- a/qa/suites/orch/cephadm/s3tests-bridge/minimal-test.yaml +++ b/qa/suites/orch/cephadm/s3tests-bridge/minimal-test.yaml @@ -16,6 +16,9 @@ tasks: - cephadm.wait_for_service: service: rgw.test +- rgw: + client.0: + - cephadm_s3_bridge: client.0: discover_from_cephadm: true @@ -23,23 +26,23 @@ tasks: - s3tests: client.0: - rgw_server: client.0 - force-branch: master - conf: - DEFAULT: - is_secure: false - port: 8080 - calling_format: ordinary - fixtures: - bucket prefix: s3test-{random}- - # POC run - exclude: - - test_bucket_policy* - - test_lifecycle* - - test_encryption* - - test_multipart_upload_size_too_small - - test_cors* - - test_website* - - test_logging* - - test_versioning* - filter: "test_bucket_list_empty or test_bucket_create or test_object_write or test_object_read" +# rgw_server: client.0 +# force-branch: master +# conf: +# DEFAULT: +# is_secure: false +# port: 8080 +# calling_format: ordinary +# fixtures: +# bucket prefix: s3test-{random}- +# # POC run +# exclude: +# - test_bucket_policy* +# - test_lifecycle* +# - test_encryption* +# - test_multipart_upload_size_too_small +# - test_cors* +# - test_website* +# - test_logging* +# - test_versioning* +# filter: "test_bucket_list_empty or test_bucket_create or test_object_write or test_object_read" diff --git a/qa/tasks/cephadm_s3_bridge.py b/qa/tasks/cephadm_s3_bridge.py index 2d6ea09d021ac..8d563f9c5d104 100644 --- a/qa/tasks/cephadm_s3_bridge.py +++ b/qa/tasks/cephadm_s3_bridge.py @@ -36,7 +36,7 @@ def detect_cephadm_deployment(ctx): def patch_s3tests_radosgw_admin(ctx): """ - Monkey patch teuthology remote execution to make radosgw-admin commands + patch teuthology remote execution to make radosgw-admin commands work inside cephadm containers when running s3tests. Many teuthology tasks (eg. 
s3tests, rgw helpers) invoke radosgw-admin with From 0fc86f6a504717f3c6e84bb3139c17c0514fe089 Mon Sep 17 00:00:00 2001 From: Joshua Blanch Date: Fri, 8 Aug 2025 15:09:45 +0000 Subject: [PATCH 10/17] bump Signed-off-by: Joshua Blanch --- qa/suites/orch/cephadm/s3tests-bridge/bridge-test-only.yaml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/qa/suites/orch/cephadm/s3tests-bridge/bridge-test-only.yaml b/qa/suites/orch/cephadm/s3tests-bridge/bridge-test-only.yaml index 58c99d7eee03f..6f1bcdc03d395 100644 --- a/qa/suites/orch/cephadm/s3tests-bridge/bridge-test-only.yaml +++ b/qa/suites/orch/cephadm/s3tests-bridge/bridge-test-only.yaml @@ -19,6 +19,11 @@ tasks: - cephadm_s3_bridge: client.0: discover_from_cephadm: true +- cephadm.shell: + host.a: + - ceph osd pool ls detail + - ceph orch ls + - ceph orch ps - rgw: client.0: From 3e332fed6ac821be9ba33ff3e82fa93ec5a15280 Mon Sep 17 00:00:00 2001 From: Joshua Blanch Date: Fri, 8 Aug 2025 15:13:30 +0000 Subject: [PATCH 11/17] bump Signed-off-by: Joshua Blanch --- qa/tasks/cephadm_s3_bridge.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/qa/tasks/cephadm_s3_bridge.py b/qa/tasks/cephadm_s3_bridge.py index 8d563f9c5d104..330f0a0def67d 100644 --- a/qa/tasks/cephadm_s3_bridge.py +++ b/qa/tasks/cephadm_s3_bridge.py @@ -329,6 +329,7 @@ def task(ctx, config): dns_name: rgw.example.com # optional rgw_service: rgw.myservice # optional, defaults to first found """ + log.info(f"Config received: {config}") if config is None: config = {} @@ -368,7 +369,7 @@ def task(ctx, config): raise ConfigError("No RGW services found via cephadm orchestrator - see logs for troubleshooting steps") role_endpoints = map_roles_to_endpoints(ctx, config, discovered_endpoints) - + log.info(f"Available roles: {config.keys() if config else 'No config'}") if not role_endpoints: log.error("No roles configured for RGW endpoint mapping") log.error( From 3a5df518b804451e21ae655292063e489231e2ec Mon Sep 17 00:00:00 2001 From: Joshua Blanch Date: Fri, 8 Aug 2025 15:16:34 +0000 Subject: [PATCH 12/17] contextlib decorator for task Signed-off-by: Joshua Blanch --- qa/tasks/cephadm_s3_bridge.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/qa/tasks/cephadm_s3_bridge.py b/qa/tasks/cephadm_s3_bridge.py index 330f0a0def67d..6c91a2c61a525 100644 --- a/qa/tasks/cephadm_s3_bridge.py +++ b/qa/tasks/cephadm_s3_bridge.py @@ -14,6 +14,7 @@ from teuthology import misc as teuthology from teuthology.exceptions import ConfigError import teuthology.orchestra.remote +import contextlib import sys import os @@ -318,6 +319,7 @@ def wait_for_rgw_accessibility(ctx, role_endpoints, timeout=60): log.info(f"RGW endpoint {role} is accessible") +@contextlib.contextmanager def task(ctx, config): """ Bridge task to make cephadm-deployed RGW compatible with s3tests. 
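The interception logic introduced in PATCH 06/17 boils down to locating 'radosgw-admin' anywhere in the argument vector, keeping whatever wrapper prefix precedes it, and splicing the cephadm shell invocation in between. A minimal sketch of that splitting step, using a hypothetical helper name and a placeholder cephadm prefix (the real code derives the image, fsid, conf and keyring paths from ctx):

def wrap_radosgw_admin(args, cephadm_prefix):
    """Wrap the radosgw-admin portion of args with a cephadm shell prefix.

    Wrapper commands in front of radosgw-admin (e.g. adjust-ulimits,
    ceph-coverage) stay on the host; only radosgw-admin and its arguments
    end up running inside the container.
    """
    if 'radosgw-admin' not in args:
        return args  # not a radosgw-admin command; leave untouched
    idx = args.index('radosgw-admin')
    return list(args[:idx]) + list(cephadm_prefix) + list(args[idx:])

# Example:
#   wrap_radosgw_admin(
#       ['adjust-ulimits', 'radosgw-admin', 'user', 'list'],
#       ['sudo', 'cephadm', 'shell', '--'],
#   )
#   -> ['adjust-ulimits', 'sudo', 'cephadm', 'shell', '--',
#       'radosgw-admin', 'user', 'list']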
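PATCH 12/17 decorates task() with contextlib.contextmanager. A function wrapped this way must yield exactly once: everything before the yield runs as setup when teuthology enters the task, and everything after it runs as teardown when the run unwinds. A minimal sketch of that shape, with hypothetical names and logging only (the bridge's real setup is the endpoint discovery and remote.run patching shown earlier in the series):

import contextlib
import logging

log = logging.getLogger(__name__)

@contextlib.contextmanager
def task(ctx, config):
    config = config or {}
    log.info("bridge example: setup, config=%s", config)
    try:
        # setup work would go here (discover endpoints, patch remote.run, ...)
        yield
    finally:
        # teardown runs once the tasks that follow this one have finished
        log.info("bridge example: teardown")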
From ee5aa3953a32e255e3bfc02c2e990fdcf2acb327 Mon Sep 17 00:00:00 2001 From: Joshua Blanch Date: Fri, 8 Aug 2025 15:17:19 +0000 Subject: [PATCH 13/17] bump Signed-off-by: Joshua Blanch --- .../cephadm/s3tests-bridge/basic-s3tests.yaml | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/qa/suites/orch/cephadm/s3tests-bridge/basic-s3tests.yaml b/qa/suites/orch/cephadm/s3tests-bridge/basic-s3tests.yaml index 784cdf4d8476c..732eb76745b81 100644 --- a/qa/suites/orch/cephadm/s3tests-bridge/basic-s3tests.yaml +++ b/qa/suites/orch/cephadm/s3tests-bridge/basic-s3tests.yaml @@ -30,16 +30,16 @@ tasks: client.0: discover_from_cephadm: true -# - s3tests: -# client.0: -# rgw_server: client.0 -# force-branch: master -# conf: -# DEFAULT: -# is_secure: false -# port: 8080 -# calling_format: ordinary -# fixtures: -# bucket prefix: test-{random}- -# # Only run 3 basic tests to verify bridge works -# filter: "test_bucket_list_empty or test_bucket_create_naming_good_long_255" +- s3tests: + client.0: + rgw_server: client.0 + force-branch: master + conf: + DEFAULT: + is_secure: false + port: 8080 + calling_format: ordinary + fixtures: + bucket prefix: test-{random}- + # Only run 3 basic tests to verify bridge works + filter: "test_bucket_list_empty or test_bucket_create_naming_good_long_255" From e869d9eb94a6270c98c1ae97976c4aacedb7c34b Mon Sep 17 00:00:00 2001 From: Joshua Blanch Date: Fri, 8 Aug 2025 15:41:53 +0000 Subject: [PATCH 14/17] add back tox Signed-off-by: Joshua Blanch --- .../cephadm/s3tests-bridge/basic-s3tests.yaml | 1 + .../cephadm/s3tests-bridge/bridge-test-only.yaml | 16 ++++++++++++++-- .../cephadm/s3tests-bridge/minimal-test.yaml | 1 + 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/qa/suites/orch/cephadm/s3tests-bridge/basic-s3tests.yaml b/qa/suites/orch/cephadm/s3tests-bridge/basic-s3tests.yaml index 732eb76745b81..3540a36025670 100644 --- a/qa/suites/orch/cephadm/s3tests-bridge/basic-s3tests.yaml +++ b/qa/suites/orch/cephadm/s3tests-bridge/basic-s3tests.yaml @@ -30,6 +30,7 @@ tasks: client.0: discover_from_cephadm: true +- tox: [client.0] - s3tests: client.0: rgw_server: client.0 diff --git a/qa/suites/orch/cephadm/s3tests-bridge/bridge-test-only.yaml b/qa/suites/orch/cephadm/s3tests-bridge/bridge-test-only.yaml index 6f1bcdc03d395..8ac134f5170ce 100644 --- a/qa/suites/orch/cephadm/s3tests-bridge/bridge-test-only.yaml +++ b/qa/suites/orch/cephadm/s3tests-bridge/bridge-test-only.yaml @@ -24,6 +24,18 @@ tasks: - ceph osd pool ls detail - ceph orch ls - ceph orch ps -- rgw: +- tox: [client.0] + +- s3tests: client.0: - + rgw_server: client.0 + force-branch: master + conf: + DEFAULT: + is_secure: false + port: 8080 + calling_format: ordinary + fixtures: + bucket prefix: test-{random}- + # Only run 3 basic tests to verify bridge works + filter: "test_bucket_list_empty or test_bucket_create_naming_good_long_255" diff --git a/qa/suites/orch/cephadm/s3tests-bridge/minimal-test.yaml b/qa/suites/orch/cephadm/s3tests-bridge/minimal-test.yaml index 5b8ccef88a806..cc1967540d8c7 100644 --- a/qa/suites/orch/cephadm/s3tests-bridge/minimal-test.yaml +++ b/qa/suites/orch/cephadm/s3tests-bridge/minimal-test.yaml @@ -24,6 +24,7 @@ tasks: discover_from_cephadm: true dns_name: client.0 +- tox: [client.0] - s3tests: client.0: # rgw_server: client.0 From 6a2dd0238e51d7e3afcf260090d9090871ce81ac Mon Sep 17 00:00:00 2001 From: Joshua Blanch Date: Fri, 8 Aug 2025 18:59:11 +0000 Subject: [PATCH 15/17] remove adjust-ulimits and ceph-coverage Signed-off-by: Joshua 
Blanch --- qa/tasks/s3tests.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/qa/tasks/s3tests.py b/qa/tasks/s3tests.py index afef388fb8460..daae50ab5d824 100644 --- a/qa/tasks/s3tests.py +++ b/qa/tasks/s3tests.py @@ -133,8 +133,6 @@ def create_users(ctx, config, s3tests_conf): # create user ctx.cluster.only(client).run( args=[ - 'adjust-ulimits', - 'ceph-coverage', '{tdir}/archive/coverage'.format(tdir=testdir), 'radosgw-admin', '-n', client_with_id, @@ -152,8 +150,6 @@ def create_users(ctx, config, s3tests_conf): if not ctx.dbstore_variable: ctx.cluster.only(client).run( args=[ - 'adjust-ulimits', - 'ceph-coverage', '{tdir}/archive/coverage'.format(tdir=testdir), 'radosgw-admin', '-n', client_with_id, @@ -172,8 +168,6 @@ def create_users(ctx, config, s3tests_conf): if section=='iam': ctx.cluster.only(client).run( args=[ - 'adjust-ulimits', - 'ceph-coverage', '{tdir}/archive/coverage'.format(tdir=testdir), 'radosgw-admin', '-n', client_with_id, @@ -185,8 +179,6 @@ def create_users(ctx, config, s3tests_conf): ) ctx.cluster.only(client).run( args=[ - 'adjust-ulimits', - 'ceph-coverage', '{tdir}/archive/coverage'.format(tdir=testdir), 'radosgw-admin', '-n', client_with_id, @@ -220,8 +212,6 @@ def create_users(ctx, config, s3tests_conf): client_with_id = daemon_type + '.' + client_id ctx.cluster.only(client).run( args=[ - 'adjust-ulimits', - 'ceph-coverage', '{tdir}/archive/coverage'.format(tdir=testdir), 'radosgw-admin', '-n', client_with_id, From 00209f19d7ca44527133d91c0ee1e94cd8635e07 Mon Sep 17 00:00:00 2001 From: Joshua Blanch Date: Fri, 8 Aug 2025 20:31:33 +0000 Subject: [PATCH 16/17] removed coverage script from s3test Signed-off-by: Joshua Blanch --- qa/tasks/s3tests.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/qa/tasks/s3tests.py b/qa/tasks/s3tests.py index daae50ab5d824..c56545ab0385b 100644 --- a/qa/tasks/s3tests.py +++ b/qa/tasks/s3tests.py @@ -133,7 +133,6 @@ def create_users(ctx, config, s3tests_conf): # create user ctx.cluster.only(client).run( args=[ - '{tdir}/archive/coverage'.format(tdir=testdir), 'radosgw-admin', '-n', client_with_id, 'user', 'create', @@ -150,7 +149,6 @@ def create_users(ctx, config, s3tests_conf): if not ctx.dbstore_variable: ctx.cluster.only(client).run( args=[ - '{tdir}/archive/coverage'.format(tdir=testdir), 'radosgw-admin', '-n', client_with_id, 'mfa', 'create', @@ -168,7 +166,6 @@ def create_users(ctx, config, s3tests_conf): if section=='iam': ctx.cluster.only(client).run( args=[ - '{tdir}/archive/coverage'.format(tdir=testdir), 'radosgw-admin', '-n', client_with_id, 'caps', 'add', @@ -179,7 +176,6 @@ def create_users(ctx, config, s3tests_conf): ) ctx.cluster.only(client).run( args=[ - '{tdir}/archive/coverage'.format(tdir=testdir), 'radosgw-admin', '-n', client_with_id, 'caps', 'add', @@ -212,7 +208,6 @@ def create_users(ctx, config, s3tests_conf): client_with_id = daemon_type + '.' 
+ client_id ctx.cluster.only(client).run( args=[ - '{tdir}/archive/coverage'.format(tdir=testdir), 'radosgw-admin', '-n', client_with_id, 'user', 'rm', From 887bd0a3d30339ad1e6a5ce74e9c42566b991126 Mon Sep 17 00:00:00 2001 From: Joshua Blanch Date: Sat, 9 Aug 2025 05:03:23 +0000 Subject: [PATCH 17/17] delete other tests Signed-off-by: Joshua Blanch --- .../s3tests-bridge/bridge-test-only.yaml | 41 ---------------- .../cephadm/s3tests-bridge/minimal-test.yaml | 49 ------------------- 2 files changed, 90 deletions(-) delete mode 100644 qa/suites/orch/cephadm/s3tests-bridge/bridge-test-only.yaml delete mode 100644 qa/suites/orch/cephadm/s3tests-bridge/minimal-test.yaml diff --git a/qa/suites/orch/cephadm/s3tests-bridge/bridge-test-only.yaml b/qa/suites/orch/cephadm/s3tests-bridge/bridge-test-only.yaml deleted file mode 100644 index 8ac134f5170ce..0000000000000 --- a/qa/suites/orch/cephadm/s3tests-bridge/bridge-test-only.yaml +++ /dev/null @@ -1,41 +0,0 @@ -roles: -- [host.a, mon.a, mgr.a, osd.0, osd.1, osd.2, client.0] - -tasks: -- cephadm: - -- cephadm.apply: - specs: - - service_type: rgw - service_id: test - placement: - host_pattern: "*" - spec: - rgw_frontend_port: 8080 - -- cephadm.wait_for_service: - service: rgw.test - -- cephadm_s3_bridge: - client.0: - discover_from_cephadm: true -- cephadm.shell: - host.a: - - ceph osd pool ls detail - - ceph orch ls - - ceph orch ps -- tox: [client.0] - -- s3tests: - client.0: - rgw_server: client.0 - force-branch: master - conf: - DEFAULT: - is_secure: false - port: 8080 - calling_format: ordinary - fixtures: - bucket prefix: test-{random}- - # Only run 3 basic tests to verify bridge works - filter: "test_bucket_list_empty or test_bucket_create_naming_good_long_255" diff --git a/qa/suites/orch/cephadm/s3tests-bridge/minimal-test.yaml b/qa/suites/orch/cephadm/s3tests-bridge/minimal-test.yaml deleted file mode 100644 index cc1967540d8c7..0000000000000 --- a/qa/suites/orch/cephadm/s3tests-bridge/minimal-test.yaml +++ /dev/null @@ -1,49 +0,0 @@ -roles: -- [host.a, mon.a, mgr.a, osd.0, osd.1, osd.2, client.0] - -tasks: -- cephadm: - -- cephadm.apply: - specs: - - service_type: rgw - service_id: test - placement: - host_pattern: "*" - spec: - rgw_frontend_port: 8080 - -- cephadm.wait_for_service: - service: rgw.test - -- rgw: - client.0: - -- cephadm_s3_bridge: - client.0: - discover_from_cephadm: true - dns_name: client.0 - -- tox: [client.0] -- s3tests: - client.0: -# rgw_server: client.0 -# force-branch: master -# conf: -# DEFAULT: -# is_secure: false -# port: 8080 -# calling_format: ordinary -# fixtures: -# bucket prefix: s3test-{random}- -# # POC run -# exclude: -# - test_bucket_policy* -# - test_lifecycle* -# - test_encryption* -# - test_multipart_upload_size_too_small -# - test_cors* -# - test_website* -# - test_logging* -# - test_versioning* -# filter: "test_bucket_list_empty or test_bucket_create or test_object_write or test_object_read"
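With the adjust-ulimits/ceph-coverage wrappers dropped in PATCHes 15 and 16, s3tests.py now issues bare radosgw-admin commands, so 'radosgw-admin' sits at the front of the argument list and is matched directly by the bridge's interception. A hypothetical helper (not part of the series) sketching the shape of those calls:

def user_create_args(client_with_id, uid, display_name, access_key, secret_key):
    # Bare invocation, no coverage wrapper: the bridge can rewrite this into
    # a `cephadm shell -- radosgw-admin ...` call before it reaches the host.
    return [
        'radosgw-admin',
        '-n', client_with_id,
        'user', 'create',
        '--uid', uid,
        '--display-name', display_name,
        '--access-key', access_key,
        '--secret-key', secret_key,
    ]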