diff --git a/qa/suites/orch/cephadm/clyso/upgrade/+ b/qa/suites/orch/cephadm/clyso/upgrade/+ new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/qa/suites/orch/cephadm/clyso/upgrade/1-start-ces.yaml b/qa/suites/orch/cephadm/clyso/upgrade/1-start-ces.yaml new file mode 100644 index 0000000000000..15e2ece1e7071 --- /dev/null +++ b/qa/suites/orch/cephadm/clyso/upgrade/1-start-ces.yaml @@ -0,0 +1,45 @@ +roles: +- - host.a + - mon.a + - mgr.a + - osd.0 +- - host.b + - osd.1 + - mgr.b + - client.0 +- - host.b + - osd.2 + - mgr.c + +tasks: +# Install system packages via pexec (avoids CEPH RPM installation) +- pexec: + all: + - sudo dnf install s3cmd curl jq -y + +- cephadm: + +- cephadm.shell: + host.a: + - ceph status + - ceph orch ps + - ceph version + - echo "Starting with CES version (from override YAML)" + +openstack: +- volumes: + count: 4 + size: 20 + +overrides: + ceph: + conf: + osd: + osd shutdown pgref assert: true + log-only-match: + - CEPHADM_ + log-ignorelist: + - CEPHADM_DAEMON_PLACE_FAIL + - CEPHADM_FAILED_DAEMON + - CEPHADM_STRAY_DAEMON + - CEPHADM_AGENT_DOWN diff --git a/qa/suites/orch/cephadm/clyso/upgrade/2-create-baseline.yaml b/qa/suites/orch/cephadm/clyso/upgrade/2-create-baseline.yaml new file mode 100644 index 0000000000000..fbf225d91b312 --- /dev/null +++ b/qa/suites/orch/cephadm/clyso/upgrade/2-create-baseline.yaml @@ -0,0 +1,43 @@ +tasks: +- cephadm.apply: + specs: + - service_type: rgw + service_id: foo + placement: + host_pattern: "*" + spec: + rgw_frontend_port: 8080 + +# it will be named rgw.foo for some reason +- cephadm.wait_for_service: + service: rgw.foo + +- cephadm.shell: + host.a: + - ceph status + +# - cephadm.shell: +# host.a: +# - ceph_test_rgw_obj + +# - workunit: +# clients: +# client.0: +# - rgw/test_rgw_obj.sh + + # client.0: + # - | + # cat > /tmp/s3cfg << 'EOF' + # [default] + # access_key = ceskey + # secret_key = cessecret + # host_base = host.b:8080 + # host_bucket = host.b:8080 + # use_https = False + # signature_v2 = True + # EOF + # - s3cmd -c /tmp/s3cfg mb s3://ces-baseline-bucket + # - echo "CES S3 baseline data - must survive downgrade!" > /tmp/ces-s3-baseline.txt + # - s3cmd -c /tmp/s3cfg put /tmp/ces-s3-baseline.txt s3://ces-baseline-bucket/ces-baseline-s3-object.txt + # - s3cmd -c /tmp/s3cfg ls s3://ces-baseline-bucket/ + # - echo "CES S3 baseline data created successfully" diff --git a/qa/suites/orch/cephadm/clyso/upgrade/3-downgrade-upstream.yaml b/qa/suites/orch/cephadm/clyso/upgrade/3-downgrade-upstream.yaml new file mode 100644 index 0000000000000..5b794a3f6c18c --- /dev/null +++ b/qa/suites/orch/cephadm/clyso/upgrade/3-downgrade-upstream.yaml @@ -0,0 +1,149 @@ +tasks: +- cephadm.shell: + host.a: + - echo "PRE-DOWNGRADE CES VERSION:" + - ceph version + - ceph orch ps + + - echo "Starting downgrade from CES to upstream CEPH v18.2.7..." 
+ - ceph orch upgrade start --image quay.io/ceph/ceph:v18.2.7 + - sleep 30 + + - | + cat > /tmp/upgrade_monitor.sh << 'EOF' + #!/bin/bash + + # Upgrade monitoring script for cephadm upgrade tests + # Monitors upgrade/downgrade completion by checking both upgrade status and daemon versions + + set -e + + TARGET_VERSION="$1" + OPERATION="$2" # "upgrade" or "downgrade" + BASE_IMAGE_NAME="${3:-base}" + TARGET_IMAGE_NAME="${4:-target}" + TIMEOUT="${5:-2400}" + + if [ -z "$TARGET_VERSION" ] || [ -z "$OPERATION" ]; then + echo "Usage: $0 [base_image_name] [target_image_name] [timeout_seconds]" + exit 1 + fi + + echo "=== CEPH Upgrade Monitor Started ===" + echo "Base image: $BASE_IMAGE_NAME" + echo "Target image: $TARGET_IMAGE_NAME" + echo "Operation: $OPERATION to $TARGET_VERSION" + echo "Timeout: ${TIMEOUT}s" + echo "Start time: $(date)" + + echo "=== Capturing Baseline Version ===" + ceph versions + baseline_version=$(ceph versions --format json | jq -r ".overall | keys[0]") + echo "Baseline version: $baseline_version" + + echo "=== Starting Upgrade Monitoring ===" + start_time=$(date +%s) + + while true; do + current_time=$(date +%s) + elapsed=$((current_time - start_time)) + + echo "" + echo "=== Upgrade Status (Elapsed: ${elapsed}s) ===" + echo "Time: $(date)" + + echo "--- Orchestrator Upgrade Status ---" + upgrade_status=$(ceph orch upgrade status --format json) + echo "$upgrade_status" + + echo "--- Daemon Versions ---" + ceph versions + + in_progress=$(echo "$upgrade_status" | jq -r ".in_progress") + version_count=$(ceph versions --format json | jq ".overall | length") + + echo "Upgrade in progress: $in_progress" + echo "Number of different versions running: $version_count" + + if [ "$in_progress" = "false" ] && [ "$version_count" -eq 1 ]; then + current_version=$(ceph versions --format json | jq -r ".overall | keys[0]") + echo "All daemons now on: $current_version" + + if [ "$current_version" != "$baseline_version" ]; then + echo "" + echo "=== SUCCESS: Upgrade Completed ===" + echo "From: $baseline_version" + echo "To: $current_version" + echo "Base image: $BASE_IMAGE_NAME" + echo "Target image: $TARGET_IMAGE_NAME" + echo "Total time: ${elapsed}s" + echo "End time: $(date)" + break + else + echo "" + echo "=== SUCCESS: Already on Target Version ===" + echo "Current version: $current_version" + echo "Base image: $BASE_IMAGE_NAME" + echo "Target image: $TARGET_IMAGE_NAME" + echo "Total time: ${elapsed}s" + echo "End time: $(date)" + break + fi + else + echo "Upgrade still in progress or daemons on mixed versions" + if [ "$version_count" -gt 1 ]; then + echo "--- Version Breakdown ---" + ceph versions --format json | jq ".overall" + fi + fi + + if echo "$upgrade_status" | jq -r ".message" | grep -q -i "error\|fail"; then + echo "" + echo "=== ERROR: Upgrade Failed ===" + echo "Upgrade status shows error or failure" + echo "$upgrade_status" + exit 1 + fi + + if [ $elapsed -ge $TIMEOUT ]; then + echo "" + echo "=== ERROR: Upgrade Timeout ===" + echo "Upgrade did not complete within $TIMEOUT seconds" + echo "Current status:" + echo "$upgrade_status" + ceph versions + exit 1 + fi + + echo "Waiting 60 seconds before next check..." 
+ sleep 60 + done + + echo "" + echo "=== Final Verification ===" + ceph health detail + ceph orch ps + ceph status + + echo "" + echo "=== Upgrade Monitor Completed Successfully ===" + EOF + + chmod +x /tmp/upgrade_monitor.sh + /tmp/upgrade_monitor.sh "v18.2.7" "downgrade" "CES" "Upstream v18.2.7" "1800" + + - echo "POST-DOWNGRADE UPSTREAM VERSION:" + - ceph version + - ceph orch ps + - ceph -s + + +overrides: + ceph: + log-ignorelist: + - CEPHADM_STRAY_DAEMON + - CEPHADM_FAILED_DAEMON + - CEPHADM_AGENT_DOWN + - CEPHADM_DAEMON_PLACE_FAIL + log-only-match: + - CEPHADM_ diff --git a/qa/suites/orch/cephadm/clyso/upgradeMatrix/% b/qa/suites/orch/cephadm/clyso/upgradeMatrix/% new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/qa/suites/orch/cephadm/clyso/upgradeMatrix/cross-distro-pairs/ces-to-upstream.yaml b/qa/suites/orch/cephadm/clyso/upgradeMatrix/cross-distro-pairs/ces-to-upstream.yaml new file mode 100644 index 0000000000000..ea0b2b526a525 --- /dev/null +++ b/qa/suites/orch/cephadm/clyso/upgradeMatrix/cross-distro-pairs/ces-to-upstream.yaml @@ -0,0 +1,9 @@ +# Cross-distro test: Start with CES, upgrade to Upstream, downgrade back to CES +base_image: "harbor.clyso.com/ces/ceph/ceph:ces-v25.03.2-rc.4" +target_image: "quay.io/ceph/ceph:v18.2.7" +base_image_name: "CES-v25.03.2-rc.4" +target_image_name: "Upstream-18.2.7" + +overrides: + ceph: + image: "harbor.clyso.com/ces/ceph/ceph:ces-v25.03.2-rc.4" diff --git a/qa/suites/orch/cephadm/clyso/upgradeMatrix/cross-distro-pairs/upstream-to-ces.yaml b/qa/suites/orch/cephadm/clyso/upgradeMatrix/cross-distro-pairs/upstream-to-ces.yaml new file mode 100644 index 0000000000000..fa221779078ca --- /dev/null +++ b/qa/suites/orch/cephadm/clyso/upgradeMatrix/cross-distro-pairs/upstream-to-ces.yaml @@ -0,0 +1,9 @@ +# Cross-distro test: Start with Upstream, upgrade to CES, downgrade back to Upstream +base_image: "quay.io/ceph/ceph:v18.2.7" +target_image: "harbor.clyso.com/ces/ceph/ceph:ces-v25.03.2-rc.4" +base_image_name: "Upstream-18.2.7" +target_image_name: "CES-v25.03.2-rc.4" + +overrides: + ceph: + image: "quay.io/ceph/ceph:v18.2.7" diff --git a/qa/suites/orch/cephadm/clyso/upgradeMatrix/workflow/+ b/qa/suites/orch/cephadm/clyso/upgradeMatrix/workflow/+ new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/qa/suites/orch/cephadm/clyso/upgradeMatrix/workflow/1-bootstrap.yaml b/qa/suites/orch/cephadm/clyso/upgradeMatrix/workflow/1-bootstrap.yaml new file mode 100644 index 0000000000000..baaf340d552d1 --- /dev/null +++ b/qa/suites/orch/cephadm/clyso/upgradeMatrix/workflow/1-bootstrap.yaml @@ -0,0 +1,44 @@ +roles: +- - host.a + - mon.a + - mgr.a + - osd.0 +- - host.b + - mgr.b + - osd.1 + - client.0 + +tasks: +# Install system packages via pexec (avoids CEPH RPM installation) +- pexec: + all: + - sudo dnf install s3cmd curl jq -y + +- cephadm: + +- cephadm.shell: + host.a: + - echo "=== BOOTSTRAP COMPLETE ===" + - ceph orch status + - ceph orch ps + - ceph version + - ceph -s + - ceph orch device ls + +openstack: +- volumes: + count: 4 + size: 20 + +overrides: + ceph: + conf: + osd: + osd shutdown pgref assert: true + log-only-match: + - CEPHADM_ + log-ignorelist: + - CEPHADM_DAEMON_PLACE_FAIL + - CEPHADM_FAILED_DAEMON + - CEPHADM_STRAY_DAEMON + - CEPHADM_AGENT_DOWN diff --git a/qa/suites/orch/cephadm/clyso/upgradeMatrix/workflow/3-upgrade.yaml b/qa/suites/orch/cephadm/clyso/upgradeMatrix/workflow/3-upgrade.yaml new file mode 100644 index 0000000000000..f94b75375051d --- /dev/null +++ 
b/qa/suites/orch/cephadm/clyso/upgradeMatrix/workflow/3-upgrade.yaml @@ -0,0 +1,149 @@ +tasks: +- cephadm.shell: + host.a: + - echo "PRE-UPGRADE STATE:" + - ceph version + - ceph orch ps + - ceph -s + + - echo "Starting upgrade from CES-v25.03.2-rc.4 to reef-18.2.7..." + - ceph orch upgrade start --image "quay.io/ceph/ceph:v18.2.7" + - sleep 30 + + - | + cat > /tmp/upgrade_monitor.sh << 'EOF' + #!/bin/bash + + # Upgrade monitoring script for cephadm upgrade tests + # Monitors upgrade/downgrade completion by checking both upgrade status and daemon versions + + set -e + + TARGET_VERSION="$1" + OPERATION="$2" # "upgrade" or "downgrade" + BASE_IMAGE_NAME="${3:-base}" + TARGET_IMAGE_NAME="${4:-target}" + TIMEOUT="${5:-2400}" + + if [ -z "$TARGET_VERSION" ] || [ -z "$OPERATION" ]; then + echo "Usage: $0 [base_image_name] [target_image_name] [timeout_seconds]" + exit 1 + fi + + echo "=== CEPH Upgrade Monitor Started ===" + echo "Base image: $BASE_IMAGE_NAME" + echo "Target image: $TARGET_IMAGE_NAME" + echo "Operation: $OPERATION to $TARGET_VERSION" + echo "Timeout: ${TIMEOUT}s" + echo "Start time: $(date)" + + echo "=== Capturing Baseline Version ===" + ceph versions + baseline_version=$(ceph versions --format json | jq -r ".overall | keys[0]") + echo "Baseline version: $baseline_version" + + echo "=== Starting Upgrade Monitoring ===" + start_time=$(date +%s) + + while true; do + current_time=$(date +%s) + elapsed=$((current_time - start_time)) + + echo "" + echo "=== Upgrade Status (Elapsed: ${elapsed}s) ===" + echo "Time: $(date)" + + echo "--- Orchestrator Upgrade Status ---" + upgrade_status=$(ceph orch upgrade status --format json) + echo "$upgrade_status" + + echo "--- Daemon Versions ---" + ceph versions + + in_progress=$(echo "$upgrade_status" | jq -r ".in_progress") + version_count=$(ceph versions --format json | jq ".overall | length") + + echo "Upgrade in progress: $in_progress" + echo "Number of different versions running: $version_count" + + if [ "$in_progress" = "false" ] && [ "$version_count" -eq 1 ]; then + current_version=$(ceph versions --format json | jq -r ".overall | keys[0]") + echo "All daemons now on: $current_version" + + if [ "$current_version" != "$baseline_version" ]; then + echo "" + echo "=== SUCCESS: Upgrade Completed ===" + echo "From: $baseline_version" + echo "To: $current_version" + echo "Base image: $BASE_IMAGE_NAME" + echo "Target image: $TARGET_IMAGE_NAME" + echo "Total time: ${elapsed}s" + echo "End time: $(date)" + break + else + echo "" + echo "=== SUCCESS: Already on Target Version ===" + echo "Current version: $current_version" + echo "Base image: $BASE_IMAGE_NAME" + echo "Target image: $TARGET_IMAGE_NAME" + echo "Total time: ${elapsed}s" + echo "End time: $(date)" + break + fi + else + echo "Upgrade still in progress or daemons on mixed versions" + if [ "$version_count" -gt 1 ]; then + echo "--- Version Breakdown ---" + ceph versions --format json | jq ".overall" + fi + fi + + if echo "$upgrade_status" | jq -r ".message" | grep -q -i "error\|fail"; then + echo "" + echo "=== ERROR: Upgrade Failed ===" + echo "Upgrade status shows error or failure" + echo "$upgrade_status" + exit 1 + fi + + if [ $elapsed -ge $TIMEOUT ]; then + echo "" + echo "=== ERROR: Upgrade Timeout ===" + echo "Upgrade did not complete within $TIMEOUT seconds" + echo "Current status:" + echo "$upgrade_status" + ceph versions + exit 1 + fi + + echo "Waiting 60 seconds before next check..." 
+ sleep 60 + done + + echo "" + echo "=== Final Verification ===" + ceph health detail + ceph orch ps + ceph status + + echo "" + echo "=== Upgrade Monitor Completed Successfully ===" + EOF + + chmod +x /tmp/upgrade_monitor.sh + /tmp/upgrade_monitor.sh "18.2.7" "upgrade" "CES-v25.03.2-rc.4" "reef-18.2.7" "2400" + + - echo "POST-UPGRADE STATE:" + - ceph version + - ceph orch ps + - ceph -s + +overrides: + ceph: + log-ignorelist: + - CEPHADM_STRAY_DAEMON + - CEPHADM_FAILED_DAEMON + - CEPHADM_AGENT_DOWN + - CEPHADM_DAEMON_PLACE_FAIL + log-only-match: + - CEPHADM_ diff --git a/qa/suites/orch/cephadm/clyso/upgradeMatrix/workflow/5-downgrade.yaml b/qa/suites/orch/cephadm/clyso/upgradeMatrix/workflow/5-downgrade.yaml new file mode 100644 index 0000000000000..25dd72bddca3b --- /dev/null +++ b/qa/suites/orch/cephadm/clyso/upgradeMatrix/workflow/5-downgrade.yaml @@ -0,0 +1,149 @@ +tasks: +- cephadm.shell: + host.a: + - echo "PRE-DOWNGRADE STATE:" + - ceph version + - ceph orch ps + - ceph -s + + - echo "Starting downgrade from reef-18.2.7 back to CES-v25.03.2-rc.4..." + - ceph orch upgrade start --image "harbor.clyso.com/ces/ceph/ceph:ces-v25.03.2-rc.4" + - sleep 30 + + - | + cat > /tmp/upgrade_monitor.sh << 'EOF' + #!/bin/bash + + # Upgrade monitoring script for cephadm upgrade tests + # Monitors upgrade/downgrade completion by checking both upgrade status and daemon versions + + set -e + + TARGET_VERSION="$1" + OPERATION="$2" # "upgrade" or "downgrade" + BASE_IMAGE_NAME="${3:-base}" + TARGET_IMAGE_NAME="${4:-target}" + TIMEOUT="${5:-2400}" + + if [ -z "$TARGET_VERSION" ] || [ -z "$OPERATION" ]; then + echo "Usage: $0 [base_image_name] [target_image_name] [timeout_seconds]" + exit 1 + fi + + echo "=== CEPH Upgrade Monitor Started ===" + echo "Base image: $BASE_IMAGE_NAME" + echo "Target image: $TARGET_IMAGE_NAME" + echo "Operation: $OPERATION to $TARGET_VERSION" + echo "Timeout: ${TIMEOUT}s" + echo "Start time: $(date)" + + echo "=== Capturing Baseline Version ===" + ceph versions + baseline_version=$(ceph versions --format json | jq -r ".overall | keys[0]") + echo "Baseline version: $baseline_version" + + echo "=== Starting Upgrade Monitoring ===" + start_time=$(date +%s) + + while true; do + current_time=$(date +%s) + elapsed=$((current_time - start_time)) + + echo "" + echo "=== Upgrade Status (Elapsed: ${elapsed}s) ===" + echo "Time: $(date)" + + echo "--- Orchestrator Upgrade Status ---" + upgrade_status=$(ceph orch upgrade status --format json) + echo "$upgrade_status" + + echo "--- Daemon Versions ---" + ceph versions + + in_progress=$(echo "$upgrade_status" | jq -r ".in_progress") + version_count=$(ceph versions --format json | jq ".overall | length") + + echo "Upgrade in progress: $in_progress" + echo "Number of different versions running: $version_count" + + if [ "$in_progress" = "false" ] && [ "$version_count" -eq 1 ]; then + current_version=$(ceph versions --format json | jq -r ".overall | keys[0]") + echo "All daemons now on: $current_version" + + if [ "$current_version" != "$baseline_version" ]; then + echo "" + echo "=== SUCCESS: Upgrade Completed ===" + echo "From: $baseline_version" + echo "To: $current_version" + echo "Base image: $BASE_IMAGE_NAME" + echo "Target image: $TARGET_IMAGE_NAME" + echo "Total time: ${elapsed}s" + echo "End time: $(date)" + break + else + echo "" + echo "=== SUCCESS: Already on Target Version ===" + echo "Current version: $current_version" + echo "Base image: $BASE_IMAGE_NAME" + echo "Target image: $TARGET_IMAGE_NAME" + echo "Total time: ${elapsed}s" + 
echo "End time: $(date)" + break + fi + else + echo "Upgrade still in progress or daemons on mixed versions" + if [ "$version_count" -gt 1 ]; then + echo "--- Version Breakdown ---" + ceph versions --format json | jq ".overall" + fi + fi + + if echo "$upgrade_status" | jq -r ".message" | grep -q -i "error\|fail"; then + echo "" + echo "=== ERROR: Upgrade Failed ===" + echo "Upgrade status shows error or failure" + echo "$upgrade_status" + exit 1 + fi + + if [ $elapsed -ge $TIMEOUT ]; then + echo "" + echo "=== ERROR: Upgrade Timeout ===" + echo "Upgrade did not complete within $TIMEOUT seconds" + echo "Current status:" + echo "$upgrade_status" + ceph versions + exit 1 + fi + + echo "Waiting 60 seconds before next check..." + sleep 60 + done + + echo "" + echo "=== Final Verification ===" + ceph health detail + ceph orch ps + ceph status + + echo "" + echo "=== Upgrade Monitor Completed Successfully ===" + EOF + + chmod +x /tmp/upgrade_monitor.sh + /tmp/upgrade_monitor.sh "25.03.2-rc.4" "downgrade" "reef-18.2.7" "CES-v25.03.2-rc.4" "2400" + + - echo "POST-DOWNGRADE STATE:" + - ceph version + - ceph orch ps + - ceph -s + +overrides: + ceph: + log-ignorelist: + - CEPHADM_STRAY_DAEMON + - CEPHADM_FAILED_DAEMON + - CEPHADM_AGENT_DOWN + - CEPHADM_DAEMON_PLACE_FAIL + log-only-match: + - CEPHADM_ diff --git a/qa/suites/orch/cephadm/s3tests-bridge/basic-s3tests.yaml b/qa/suites/orch/cephadm/s3tests-bridge/basic-s3tests.yaml new file mode 100644 index 0000000000000..3540a36025670 --- /dev/null +++ b/qa/suites/orch/cephadm/s3tests-bridge/basic-s3tests.yaml @@ -0,0 +1,46 @@ +roles: +- [host.a, mon.a, mgr.a, osd.0, osd.1, osd.2, client.0] + +overrides: + ceph: + log-to-file: true + conf: + global: + log to file: true + mon cluster log to file: true + mon: + mon_warn_on_insecure_global_id_reclaim_allowed: false + +tasks: +- cephadm: + +- cephadm.apply: + specs: + - service_type: rgw + service_id: s3test + placement: + host_pattern: "*" + spec: + rgw_frontend_port: 8080 + +- cephadm.wait_for_service: + service: rgw.s3test + +- cephadm_s3_bridge: + client.0: + discover_from_cephadm: true + +- tox: [client.0] +- s3tests: + client.0: + rgw_server: client.0 + force-branch: master + conf: + DEFAULT: + is_secure: false + port: 8080 + calling_format: ordinary + fixtures: + bucket prefix: test-{random}- + # Only run 3 basic tests to verify bridge works + filter: "test_bucket_list_empty or test_bucket_create_naming_good_long_255" diff --git a/qa/tasks/cephadm_s3_bridge.py b/qa/tasks/cephadm_s3_bridge.py new file mode 100644 index 0000000000000..6c91a2c61a525 --- /dev/null +++ b/qa/tasks/cephadm_s3_bridge.py @@ -0,0 +1,420 @@ +""" +Bridge task to make cephadm-deployed RGW compatible with s3tests. + +This task discovers RGW endpoints deployed via cephadm orchestrator +and creates the ctx.rgw.role_endpoints structure that s3tests expects. 
+""" + +import json +import logging +import time +from io import StringIO + +from teuthology.orchestra import run +from teuthology import misc as teuthology +from teuthology.exceptions import ConfigError +import teuthology.orchestra.remote +import contextlib + +import sys +import os + +qa_dir = os.path.dirname(os.path.abspath(__file__)) +sys.path.insert(0, qa_dir) +from rgw import RGWEndpoint + +log = logging.getLogger(__name__) + + +def detect_cephadm_deployment(ctx): + """Detect if we're in a cephadm environment with bridge active""" + return ( + hasattr(ctx, "rgw") + and hasattr(ctx.rgw, "cephadm_bridge_active") + and ctx.rgw.cephadm_bridge_active + ) + + +def patch_s3tests_radosgw_admin(ctx): + """ + patch teuthology remote execution to make radosgw-admin commands + work inside cephadm containers when running s3tests. + + Many teuthology tasks (eg. s3tests, rgw helpers) invoke radosgw-admin with + wrapper prefixes like ["adjust-ulimits", "ceph-coverage", , ... , + "radosgw-admin", ...]. The original patch only matched when args[0] was + "radosgw-admin" which missed these cases. Here we detect radosgw-admin at + any position, split the prefix, and wrap only the radosgw-admin portion + inside a 'sudo shell -c ... -k ... -- ' call. + """ + log.info("Enabling cephadm-aware radosgw-admin monkey patch for s3tests") + + original_run = teuthology.orchestra.remote.Remote.run + + def cephadm_aware_run(self, **kwargs): + args = kwargs.get("args", []) + + try: + # Locate the radosgw-admin binary within args (not just at index 0) + admin_idx = -1 + for i, a in enumerate(args): + if isinstance(a, str) and a == "radosgw-admin": + admin_idx = i + break + + if admin_idx != -1 and detect_cephadm_deployment(ctx): + log.info(f"Intercepting radosgw-admin command: {args}") + + cluster_name = list(ctx.ceph.keys())[0] if hasattr(ctx, "ceph") else "ceph" + image = ctx.ceph[cluster_name].image + fsid = ctx.ceph[cluster_name].fsid + cephadm_bin = getattr(ctx, "cephadm", "cephadm") + + # Everything before radosgw-admin should remain as-is + prefix = list(args[:admin_idx]) + admin_and_rest = list(args[admin_idx:]) + + cephadm_prefix = [ + "sudo", + cephadm_bin, + "--image", image, + "shell", + "-c", f"/etc/ceph/{cluster_name}.conf", + "-k", f"/etc/ceph/{cluster_name}.client.admin.keyring", + "--fsid", fsid, + "--", + ] + + new_args = prefix + cephadm_prefix + admin_and_rest + log.info(f"Converted to cephadm shell command: {new_args}") + kwargs["args"] = new_args + + except Exception as e: + # On any failure, fall back to original behavior + log.error(f"cephadm radosgw-admin monkey patch error: {e}") + + return original_run(self, **kwargs) + + teuthology.orchestra.remote.Remote.run = cephadm_aware_run + + +def restore_original_remote_run(): + """Restore original remote run method (for cleanup)""" + log.info("not implemented - patch remains active") + + +def discover_cephadm_rgw_endpoints(ctx): + """ + Discover RGW endpoints from cephadm orchestrator using cephadm shell. + Returns dict mapping service names to endpoint info. 
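+
+    A hypothetical return value, assuming a single healthy rgw.s3test daemon
+    listening on port 8080 (the dict shape mirrors what the parsing code
+    below builds; the hostname is illustrative):
+
+        {
+            "rgw.s3test": {
+                "hostname": "host-a",
+                "port": 8080,
+                "service_name": "rgw.s3test",
+                "status": "running",
+            }
+        }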
+ """ + log.info("Discovering cephadm RGW endpoints via 'ceph orch ps'") + + cluster_roles = list(ctx.cluster.remotes.keys()) + if not cluster_roles: + raise ConfigError("No cluster nodes available for ceph commands") + + remote = cluster_roles[0] + + try: + # Get cluster name (usually 'ceph') + cluster_name = list(ctx.ceph.keys())[0] if hasattr(ctx, "ceph") else "ceph" + + result = remote.run( + args=[ + "sudo", + ctx.cephadm, + "--image", + ctx.ceph[cluster_name].image, + "shell", + "-c", + f"/etc/ceph/{cluster_name}.conf", + "-k", + f"/etc/ceph/{cluster_name}.client.admin.keyring", + "--fsid", + ctx.ceph[cluster_name].fsid, + "--", + "ceph", + "orch", + "ps", + "--daemon_type", + "rgw", + "--format", + "json", + ], + stdout=StringIO(), + ) + except AttributeError as e: + log.error(f"Missing cephadm context attributes: {e}") + log.error( + "Available ctx.cephadm attributes: " + str(dir(ctx.cephadm)) + if hasattr(ctx, "cephadm") + else "No ctx.cephadm found" + ) + raise ConfigError(f"cephadm context not properly initialized: {e}") + except Exception as e: + log.error(f"Failed to run ceph orch ps command: {e}") + raise ConfigError(f"RGW endpoint discovery failed: {e}") + + services_json = result.stdout.getvalue() + log.info(f"Raw ceph orch ps output: {services_json}") + + if not services_json.strip(): + log.warning("No RGW services found via 'ceph orch ps'") + return {} + + try: + services = json.loads(services_json) + log.info(f"Parsed RGW services: {services}") + except json.JSONDecodeError as e: + log.error(f"Failed to parse JSON from ceph orch ps: {e}") + log.error(f"Raw output was: {services_json}") + raise ConfigError(f"Invalid JSON from ceph orch ps: {e}") + + endpoints = {} + for service in services: + service_name = service.get("service_name", "") + hostname = service.get("hostname", "") + ports = service.get("ports", []) + status = service.get("status_desc", "") + + log.info(f"Processing service: {service_name}, hostname: {hostname}, ports: {ports}, status: {status}") + + if not service_name.startswith("rgw."): + log.debug(f"Skipping non-RGW service: {service_name}") + continue + + if "running" not in status.lower(): + log.warning(f"RGW service {service_name} is not running: {status}") + # Allow non-running services through for now, s3tests might still work + log.info(f"Continuing with non-running service {service_name} - s3tests might still work") + + # Extract port number (ports is typically ['8080/tcp'] format) + port = None + if ports: + for port_spec in ports: + if isinstance(port_spec, str) and "/" in port_spec: + try: + port = int(port_spec.split("/")[0]) + break + except ValueError: + continue + elif isinstance(port_spec, int): + port = port_spec + break + + if port is None: + log.warning(f"Could not parse port for RGW service {service_name}: {ports}") + # Fall back to default RGW port 8080 + port = 8080 + log.info(f"Using default port {port} for {service_name}") + + endpoints[service_name] = { + "hostname": hostname, + "port": port, + "service_name": service_name, + "status": status, + } + + log.info(f"Added endpoint: {service_name} -> {hostname}:{port} (status: {status})") + + log.info(f"Discovered RGW endpoints: {endpoints}") + return endpoints + + +def map_roles_to_endpoints(ctx, config, discovered_endpoints): + """ + Map teuthology roles to discovered RGW endpoints. 
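+
+    `config` is this bridge task's per-role configuration; a hypothetical
+    example (only the keys read below are meaningful):
+
+        {
+            "client.0": {
+                "discover_from_cephadm": True,
+                "rgw_service": "rgw.s3test",    # optional, defaults to the first service found
+                "dns_name": "rgw.example.com",  # optional, defaults to the discovered hostname
+            }
+        }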
+ """ + role_endpoints = {} + + for role, client_config in config.items(): + if not client_config.get("discover_from_cephadm"): + continue + + log.info(f"Mapping role {role} to cephadm RGW endpoint") + + target_service = client_config.get("rgw_service") + if target_service and target_service in discovered_endpoints: + endpoint_info = discovered_endpoints[target_service] + log.info(f"Using explicit service mapping: {role} -> {target_service}") + else: + if not discovered_endpoints: + raise ConfigError(f"No RGW endpoints discovered for role {role}") + + service_name = list(discovered_endpoints.keys())[0] + endpoint_info = discovered_endpoints[service_name] + log.info(f"Using first available RGW service: {role} -> {service_name}") + + hostname = endpoint_info["hostname"] + port = endpoint_info["port"] + + dns_name = client_config.get("dns_name", hostname) + + rgw_endpoint = RGWEndpoint( + hostname=hostname, + port=port, + cert=None, + dns_name=dns_name, + website_dns_name=None, + ) + + role_endpoints[role] = rgw_endpoint + log.info(f"Created endpoint for {role}: {hostname}:{port} (dns: {dns_name})") + + return role_endpoints + + +def wait_for_rgw_accessibility(ctx, role_endpoints, timeout=60): + """ + Wait for RGW endpoints to be accessible via HTTP. + """ + log.info("Verifying RGW endpoint accessibility") + + cluster_roles = list(ctx.cluster.remotes.keys()) + test_remote = cluster_roles[0] + + for role, endpoint in role_endpoints.items(): + log.info( + f"Testing accessibility of {role} at {endpoint.hostname}:{endpoint.port}" + ) + + start_time = time.time() + accessible = False + + while time.time() - start_time < timeout: + try: + result = test_remote.run( + args=[ + "curl", + "-s", + "-o", + "/dev/null", + "-w", + "%{http_code}", + "--connect-timeout", + "5", + f"http://{endpoint.hostname}:{endpoint.port}/", + ], + stdout=StringIO(), + check_status=False, + ) + + http_code = result.stdout.getvalue().strip() + log.info(f"HTTP response from {role}: {http_code}") + + if http_code and http_code.isdigit(): + accessible = True + break + + except Exception as e: + log.debug(f"Accessibility test failed for {role}: {e}") + + log.info(f"Waiting for {role} to become accessible...") + time.sleep(2) + + if not accessible: + raise ConfigError(f"RGW endpoint {role} not accessible after {timeout}s") + + log.info(f"RGW endpoint {role} is accessible") + + +@contextlib.contextmanager +def task(ctx, config): + """ + Bridge task to make cephadm-deployed RGW compatible with s3tests. 
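+
+    Run this after cephadm.apply (with an RGW spec) and cephadm.wait_for_service,
+    and before the tox/s3tests tasks; it populates ctx.rgw.role_endpoints and
+    enables the radosgw-admin monkey patch that s3tests relies on.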
+ + Example usage: + - cephadm_s3_bridge: + client.0: + discover_from_cephadm: true + dns_name: rgw.example.com # optional + rgw_service: rgw.myservice # optional, defaults to first found + """ + log.info(f"Config received: {config}") + if config is None: + config = {} + + log.info("Starting cephadm s3tests bridge task") + + assert hasattr(ctx, "ceph"), ( + "ctx.ceph not found - cephadm bridge requires ceph context" + ) + assert hasattr(ctx, "cephadm"), ( + "ctx.cephadm not found - cephadm bridge requires cephadm context" + ) + assert hasattr(ctx, "cluster"), ( + "ctx.cluster not found - cephadm bridge requires cluster context" + ) + + log.info("Context assertions passed, checking for existing ctx.rgw...") + + # Allow ctx.rgw to exist from cephadm tasks, but ensure it doesn't have role_endpoints + if hasattr(ctx, "rgw") and hasattr(ctx.rgw, "role_endpoints"): + raise ConfigError( + "ctx.rgw.role_endpoints already exists - bridge should run before other rgw configuration tasks" + ) + + try: + discovered_endpoints = discover_cephadm_rgw_endpoints(ctx) + except Exception as e: + log.error(f"RGW endpoint discovery failed: {e}") + raise e + + if not discovered_endpoints: + log.error("No RGW services found via cephadm orchestrator") + log.error("This usually means:") + log.error(" 1. No RGW services have been deployed yet") + log.error(" 2. RGW services haven't started yet (check with 'ceph orch ps')") + log.error(" 3. cephadm bridge is running before RGW deployment") + log.error("Make sure to run cephadm.apply (with RGW service) and cephadm.wait_for_service before this bridge") + raise ConfigError("No RGW services found via cephadm orchestrator - see logs for troubleshooting steps") + + role_endpoints = map_roles_to_endpoints(ctx, config, discovered_endpoints) + log.info(f"Available roles: {config.keys() if config else 'No config'}") + if not role_endpoints: + log.error("No roles configured for RGW endpoint mapping") + log.error( + "Check your bridge task configuration - you need at least one role with 'discover_from_cephadm: true'" + ) + return + + try: + wait_for_rgw_accessibility(ctx, role_endpoints) + except Exception as e: + log.error(f"RGW accessibility test failed: {e}") + log.error("Continuing anyway - ctx.rgw will still be created") + + # Create ctx.rgw structure for s3tests compatibility + if not hasattr(ctx, "rgw"): + + class RGWContext: + pass + + ctx.rgw = RGWContext() + + ctx.rgw.role_endpoints = role_endpoints + ctx.rgw.cephadm_discovered_endpoints = discovered_endpoints + ctx.rgw.cephadm_bridge_active = True + + try: + patch_s3tests_radosgw_admin(ctx) + except Exception as e: + log.error(f"Monkey patch setup failed: {e}") + raise e + + assert hasattr(ctx, "rgw"), "ctx.rgw was not created successfully" + assert hasattr(ctx.rgw, "role_endpoints"), "ctx.rgw.role_endpoints was not created" + assert hasattr(ctx.rgw, "cephadm_bridge_active"), ( + "ctx.rgw.cephadm_bridge_active was not set" + ) + assert ctx.rgw.cephadm_bridge_active, "ctx.rgw.cephadm_bridge_active is not True" + assert len(ctx.rgw.role_endpoints) > 0, "ctx.rgw.role_endpoints is empty" + + try: + yield + finally: + assert hasattr(ctx, "rgw"), "ctx.rgw was lost during test execution" + assert hasattr(ctx.rgw, "cephadm_bridge_active"), ( + "ctx.rgw.cephadm_bridge_active was lost" + ) diff --git a/qa/tasks/python.py b/qa/tasks/python.py index 4ddb14f714538..aacb01ee45426 100644 --- a/qa/tasks/python.py +++ b/qa/tasks/python.py @@ -35,11 +35,18 @@ def task(ctx, config): (remote,) = ctx.cluster.only(role).remotes.keys() 
log.info('Running python on role %s host %s', role, remote.name) log.info(code) + + # Handle both string and list input for code + if isinstance(code, list): + code_str = '\n'.join(code) + else: + code_str = code + args=[ 'TESTDIR={tdir}'.format(tdir=testdir), 'python3', ] if sudo: args = ['sudo'] + args - remote.run(args=args, stdin=subst_vip(ctx, code)) + remote.run(args=args, stdin=subst_vip(ctx, code_str)) diff --git a/qa/tasks/s3tests.py b/qa/tasks/s3tests.py index afef388fb8460..c56545ab0385b 100644 --- a/qa/tasks/s3tests.py +++ b/qa/tasks/s3tests.py @@ -133,9 +133,6 @@ def create_users(ctx, config, s3tests_conf): # create user ctx.cluster.only(client).run( args=[ - 'adjust-ulimits', - 'ceph-coverage', - '{tdir}/archive/coverage'.format(tdir=testdir), 'radosgw-admin', '-n', client_with_id, 'user', 'create', @@ -152,9 +149,6 @@ def create_users(ctx, config, s3tests_conf): if not ctx.dbstore_variable: ctx.cluster.only(client).run( args=[ - 'adjust-ulimits', - 'ceph-coverage', - '{tdir}/archive/coverage'.format(tdir=testdir), 'radosgw-admin', '-n', client_with_id, 'mfa', 'create', @@ -172,9 +166,6 @@ def create_users(ctx, config, s3tests_conf): if section=='iam': ctx.cluster.only(client).run( args=[ - 'adjust-ulimits', - 'ceph-coverage', - '{tdir}/archive/coverage'.format(tdir=testdir), 'radosgw-admin', '-n', client_with_id, 'caps', 'add', @@ -185,9 +176,6 @@ def create_users(ctx, config, s3tests_conf): ) ctx.cluster.only(client).run( args=[ - 'adjust-ulimits', - 'ceph-coverage', - '{tdir}/archive/coverage'.format(tdir=testdir), 'radosgw-admin', '-n', client_with_id, 'caps', 'add', @@ -220,9 +208,6 @@ def create_users(ctx, config, s3tests_conf): client_with_id = daemon_type + '.' + client_id ctx.cluster.only(client).run( args=[ - 'adjust-ulimits', - 'ceph-coverage', - '{tdir}/archive/coverage'.format(tdir=testdir), 'radosgw-admin', '-n', client_with_id, 'user', 'rm',
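Taken together, the s3tests.py hunks above drop the adjust-ulimits/ceph-coverage
wrapper from the radosgw-admin calls, while the bridge's monkey patch in
qa/tasks/cephadm_s3_bridge.py still handles any invocation that keeps such a
prefix. A minimal Python sketch of the rewrite that cephadm_aware_run applies;
the coverage path, image, and fsid values below are placeholders, not values
taken from a real run:

    # Before: a wrapped radosgw-admin invocation as built by teuthology tasks.
    args = [
        'adjust-ulimits', 'ceph-coverage', '{tdir}/archive/coverage',  # formatted testdir path
        'radosgw-admin', '-n', 'client.0', 'user', 'create', '--uid', 'foo',
    ]
    # The patch finds 'radosgw-admin' (here at index 3), keeps everything before
    # it, and splices a "sudo cephadm shell ... --" prefix in front of the rest.
    admin_idx = args.index('radosgw-admin')
    rewritten = args[:admin_idx] + [
        'sudo', 'cephadm',                                   # ctx.cephadm in practice
        '--image', 'quay.io/ceph/ceph:v18.2.7',              # ctx.ceph['ceph'].image
        'shell',
        '-c', '/etc/ceph/ceph.conf',
        '-k', '/etc/ceph/ceph.client.admin.keyring',
        '--fsid', '00000000-0000-0000-0000-000000000000',    # ctx.ceph['ceph'].fsid
        '--',
    ] + args[admin_idx:]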