From 14173ff66476312eeb50e53fbc94047161217c46 Mon Sep 17 00:00:00 2001 From: Sreeja Chintalapati Date: Thu, 8 Jan 2026 15:59:16 +0530 Subject: [PATCH 1/4] =?UTF-8?q?HDDS-14108.=20Provide=20option=20in=20?= =?UTF-8?q?=E2=80=98scm=20safemode=20status=E2=80=99=20to=20show=20status?= =?UTF-8?q?=20of=20all=20SCM=20nodes?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../hadoop/hdds/scm/client/ScmClient.java | 16 +++ .../StorageContainerLocationProtocol.java | 28 ++++ ...ocationProtocolClientSideTranslatorPB.java | 48 +++++++ .../proxy/SCMFailoverProxyProviderBase.java | 11 ++ ...ocationProtocolServerSideTranslatorPB.java | 4 +- .../scm/server/SCMClientProtocolServer.java | 18 +++ .../scm/cli/ContainerOperationClient.java | 10 ++ .../hdds/scm/cli/SafeModeCheckSubcommand.java | 130 +++++++++++++++++- .../apache/hadoop/hdds/scm/cli/ScmOption.java | 4 + .../hadoop/hdds/scm/cli/ScmSubcommand.java | 4 + 10 files changed, 267 insertions(+), 6 deletions(-) diff --git a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/client/ScmClient.java b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/client/ScmClient.java index fbf1f46a9702..a57f6f9d7f4e 100644 --- a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/client/ScmClient.java +++ b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/client/ScmClient.java @@ -334,6 +334,22 @@ Map> getSafeModeRuleStatuses() */ boolean forceExitSafeMode() throws IOException; + /** + * Check if a specific SCM node is in safe mode. + * @param nodeId SCM node ID to query + * @return true if the node is in safe mode, false otherwise + * @throws IOException + */ + boolean inSafeModeForNode(String nodeId) throws IOException; + + /** + * Get safe mode rule statuses from a specific SCM node. + * @param nodeId SCM node ID to query + * @return Map of rule name to rule status + * @throws IOException + */ + Map> getSafeModeRuleStatusesForNode(String nodeId) throws IOException; + /** * Start ReplicationManager. */ diff --git a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/protocol/StorageContainerLocationProtocol.java b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/protocol/StorageContainerLocationProtocol.java index 92ddfa7eb8dc..4a62d667e0ec 100644 --- a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/protocol/StorageContainerLocationProtocol.java +++ b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/protocol/StorageContainerLocationProtocol.java @@ -73,6 +73,14 @@ public interface StorageContainerLocationProtocol extends Closeable { Type.StopReplicationManager, Type.ForceExitSafeMode)); + /** + * Read-only commands that can execute on followers without leader check. + * These commands respect the --scm parameter and query the specified SCM. + */ + Set FOLLOWER_READABLE_COMMAND_TYPE = Collections.unmodifiableSet(EnumSet.of( + Type.InSafeMode, + Type.GetSafeModeRuleStatuses)); + /** * Asks SCM where a container should be allocated. SCM responds with the * set of datanodes that should be used creating this container. @@ -390,6 +398,26 @@ Map> getSafeModeRuleStatuses() */ boolean forceExitSafeMode() throws IOException; + /** + * Check if a specific SCM node is in safe mode. + * In HA clusters, queries the specified node. + * + * @param nodeId SCM node ID to query + * @return true if the node is in safe mode, false otherwise + * @throws IOException + */ + boolean inSafeModeForNode(String nodeId) throws IOException; + + /** + * Get safe mode rule statuses from a specific SCM node. + * In HA clusters, queries the specified node. + * + * @param nodeId SCM node ID to query + * @return Map of rule name to rule status + * @throws IOException + */ + Map> getSafeModeRuleStatusesForNode(String nodeId) throws IOException; + /** * Start ReplicationManager. */ diff --git a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/protocolPB/StorageContainerLocationProtocolClientSideTranslatorPB.java b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/protocolPB/StorageContainerLocationProtocolClientSideTranslatorPB.java index 94b2230e68ba..699b426e43ec 100644 --- a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/protocolPB/StorageContainerLocationProtocolClientSideTranslatorPB.java +++ b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/protocolPB/StorageContainerLocationProtocolClientSideTranslatorPB.java @@ -145,6 +145,8 @@ import org.apache.hadoop.ozone.upgrade.UpgradeFinalization.StatusAndMessages; import org.apache.hadoop.ozone.util.ProtobufUtils; import org.apache.hadoop.security.token.Token; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * This class is the client-side translator to translate the requests made on @@ -162,6 +164,8 @@ public final class StorageContainerLocationProtocolClientSideTranslatorPB private final StorageContainerLocationProtocolPB rpcProxy; private final SCMContainerLocationFailoverProxyProvider fpp; + private static final Logger LOG = + LoggerFactory.getLogger(StorageContainerLocationProtocolClientSideTranslatorPB.class); /** * Creates a new StorageContainerLocationProtocolClientSideTranslatorPB. @@ -870,6 +874,50 @@ public boolean forceExitSafeMode() throws IOException { } + @Override + public boolean inSafeModeForNode(String nodeId) throws IOException { + InSafeModeRequestProto request = InSafeModeRequestProto.getDefaultInstance(); + + try { + StorageContainerLocationProtocolPB proxy = fpp.getProxyForNode(nodeId); + ScmContainerLocationRequest wrapper = ScmContainerLocationRequest.newBuilder() + .setCmdType(Type.InSafeMode) + .setVersion(ClientVersion.CURRENT_VERSION) + .setTraceID(TracingUtil.exportCurrentSpan()) + .setInSafeModeRequest(request) + .build(); + ScmContainerLocationResponse response = proxy.submitRequest(NULL_RPC_CONTROLLER, wrapper); + return response.getInSafeModeResponse().getInSafeMode(); + } catch (Exception e) { + throw new IOException("Failed to get safe mode status from SCM node " + nodeId, e); + } + } + + @Override + public Map> getSafeModeRuleStatusesForNode(String nodeId) throws IOException { + GetSafeModeRuleStatusesRequestProto request = GetSafeModeRuleStatusesRequestProto.getDefaultInstance(); + + try { + StorageContainerLocationProtocolPB proxy = fpp.getProxyForNode(nodeId); + ScmContainerLocationRequest wrapper = ScmContainerLocationRequest.newBuilder() + .setCmdType(Type.GetSafeModeRuleStatuses) + .setVersion(ClientVersion.CURRENT_VERSION) + .setTraceID(TracingUtil.exportCurrentSpan()) + .setGetSafeModeRuleStatusesRequest(request) + .build(); + ScmContainerLocationResponse response = proxy.submitRequest(NULL_RPC_CONTROLLER, wrapper); + + Map> ruleStatuses = new HashMap<>(); + for (SafeModeRuleStatusProto statusProto : + response.getGetSafeModeRuleStatusesResponse().getSafeModeRuleStatusesProtoList()) { + ruleStatuses.put(statusProto.getRuleName(), Pair.of(statusProto.getValidate(), statusProto.getStatusText())); + } + return ruleStatuses; + } catch (Exception e) { + throw new IOException("Failed to get safe mode rule statuses from SCM node " + nodeId, e); + } + } + @Override public void startReplicationManager() throws IOException { diff --git a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/proxy/SCMFailoverProxyProviderBase.java b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/proxy/SCMFailoverProxyProviderBase.java index 0dfa95a95246..272db7a04ae4 100644 --- a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/proxy/SCMFailoverProxyProviderBase.java +++ b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/proxy/SCMFailoverProxyProviderBase.java @@ -193,6 +193,17 @@ public synchronized List getProxies() { .map(proxyInfo -> proxyInfo.proxy).collect(Collectors.toList()); } + public synchronized T getProxyForNode(String nodeId) throws IOException { + ProxyInfo proxyInfo = scmProxies.get(nodeId); + if (proxyInfo == null) { + if (!scmProxyInfoMap.containsKey(nodeId)) { + throw new IOException("Unknown SCM node ID: " + nodeId); + } + proxyInfo = createSCMProxy(nodeId); + } + return proxyInfo.proxy; + } + @Override public synchronized void performFailover(T newLeader) { if (updatedLeaderNodeID != null) { diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/protocol/StorageContainerLocationProtocolServerSideTranslatorPB.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/protocol/StorageContainerLocationProtocolServerSideTranslatorPB.java index 3b061aa10c01..996fbe330525 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/protocol/StorageContainerLocationProtocolServerSideTranslatorPB.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/protocol/StorageContainerLocationProtocolServerSideTranslatorPB.java @@ -29,6 +29,7 @@ import static org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.Type.ListContainer; import static org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.Type.ListPipelines; import static org.apache.hadoop.hdds.scm.protocol.StorageContainerLocationProtocol.ADMIN_COMMAND_TYPE; +import static org.apache.hadoop.hdds.scm.protocol.StorageContainerLocationProtocol.FOLLOWER_READABLE_COMMAND_TYPE; import com.google.protobuf.ProtocolMessageEnum; import com.google.protobuf.RpcController; @@ -212,7 +213,8 @@ public ScmContainerLocationResponse submitRequest(RpcController controller, ScmContainerLocationRequest request) throws ServiceException { // not leader or not belong to admin command. if (!scm.checkLeader() - && !ADMIN_COMMAND_TYPE.contains(request.getCmdType())) { + && !ADMIN_COMMAND_TYPE.contains(request.getCmdType()) + && !FOLLOWER_READABLE_COMMAND_TYPE.contains(request.getCmdType())) { RatisUtil.checkRatisException( scm.getScmHAManager().getRatisServer().triggerNotLeaderException(), scm.getClientRpcPort(), scm.getScmId(), scm.getHostname(), ROLE_TYPE); diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMClientProtocolServer.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMClientProtocolServer.java index 3bd5214ea0ec..832bc0caf66b 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMClientProtocolServer.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMClientProtocolServer.java @@ -1033,6 +1033,24 @@ public Map> getSafeModeRuleStatuses() } } + @Override + public boolean inSafeModeForNode(String nodeId) throws IOException { + boolean result = inSafeMode(); + AUDIT.logReadSuccess( + buildAuditMessageForSuccess(SCMAction.IN_SAFE_MODE, null) + ); + return result; + } + + @Override + public Map> getSafeModeRuleStatusesForNode(String nodeId) throws IOException { + Map> result = getSafeModeRuleStatuses(); + AUDIT.logReadSuccess( + buildAuditMessageForSuccess(SCMAction.GET_SAFE_MODE_RULE_STATUSES, null) + ); + return result; + } + /** * Force SCM out of Safe mode. * diff --git a/hadoop-ozone/cli-admin/src/main/java/org/apache/hadoop/hdds/scm/cli/ContainerOperationClient.java b/hadoop-ozone/cli-admin/src/main/java/org/apache/hadoop/hdds/scm/cli/ContainerOperationClient.java index 61c0f4150c34..2c7c0f9a57b0 100644 --- a/hadoop-ozone/cli-admin/src/main/java/org/apache/hadoop/hdds/scm/cli/ContainerOperationClient.java +++ b/hadoop-ozone/cli-admin/src/main/java/org/apache/hadoop/hdds/scm/cli/ContainerOperationClient.java @@ -477,6 +477,16 @@ public boolean forceExitSafeMode() throws IOException { return storageContainerLocationClient.forceExitSafeMode(); } + @Override + public boolean inSafeModeForNode(String nodeId) throws IOException { + return storageContainerLocationClient.inSafeModeForNode(nodeId); + } + + @Override + public Map> getSafeModeRuleStatusesForNode(String nodeId) throws IOException { + return storageContainerLocationClient.getSafeModeRuleStatusesForNode(nodeId); + } + @Override public void startReplicationManager() throws IOException { storageContainerLocationClient.startReplicationManager(); diff --git a/hadoop-ozone/cli-admin/src/main/java/org/apache/hadoop/hdds/scm/cli/SafeModeCheckSubcommand.java b/hadoop-ozone/cli-admin/src/main/java/org/apache/hadoop/hdds/scm/cli/SafeModeCheckSubcommand.java index d15be56410f8..ff294dab9a8d 100644 --- a/hadoop-ozone/cli-admin/src/main/java/org/apache/hadoop/hdds/scm/cli/SafeModeCheckSubcommand.java +++ b/hadoop-ozone/cli-admin/src/main/java/org/apache/hadoop/hdds/scm/cli/SafeModeCheckSubcommand.java @@ -18,10 +18,19 @@ package org.apache.hadoop.hdds.scm.cli; import java.io.IOException; +import java.net.InetSocketAddress; +import java.util.List; import java.util.Map; +import java.util.stream.Collectors; +import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.tuple.Pair; +import org.apache.hadoop.hdds.HddsUtils; import org.apache.hadoop.hdds.cli.HddsVersionProvider; +import org.apache.hadoop.hdds.conf.OzoneConfiguration; import org.apache.hadoop.hdds.scm.client.ScmClient; +import org.apache.hadoop.hdds.scm.ha.SCMNodeInfo; +import org.apache.hadoop.net.NetUtils; +import picocli.CommandLine; import picocli.CommandLine.Command; /** @@ -33,9 +42,26 @@ mixinStandardHelpOptions = true, versionProvider = HddsVersionProvider.class) public class SafeModeCheckSubcommand extends ScmSubcommand { + @CommandLine.Option(names = {"--all", "-a"}, + description = "Show safe mode status for all SCM nodes in the service. " + + "When multiple SCM service IDs are configured, --service-id must be specified.") + private boolean allNodes; @Override public void execute(ScmClient scmClient) throws IOException { + final OzoneConfiguration conf = getOzoneConf(); + String serviceId = HddsUtils.getScmServiceId(conf); + + if (allNodes) { + executeForAllNodes(scmClient); + } else if (StringUtils.isNotEmpty(getScmOption().getScm()) && serviceId != null) { + executeForSpecificNodeInHA(scmClient, serviceId); + } else { + executeForSingleNode(scmClient); + } + } + + private void executeForSingleNode(ScmClient scmClient) throws IOException { boolean execReturn = scmClient.inSafeMode(); // Output data list @@ -45,12 +71,106 @@ public void execute(ScmClient scmClient) throws IOException { System.out.println("SCM is out of safe mode."); } if (isVerbose()) { - for (Map.Entry> entry : - scmClient.getSafeModeRuleStatuses().entrySet()) { - Pair value = entry.getValue(); - System.out.printf("validated:%s, %s, %s%n", - value.getLeft(), entry.getKey(), value.getRight()); + printSafeModeRules(scmClient.getSafeModeRuleStatuses()); + } + } + + private void executeForSpecificNodeInHA(ScmClient scmClient, String serviceId) throws IOException { + String scmAddress = getScmOption().getScm(); + + System.out.println("Service ID: " + serviceId); + + final OzoneConfiguration conf = getOzoneConf(); + + List nodes = SCMNodeInfo.buildNodeInfo(conf); + + // Find the node matching the --scm address + List matchedNodes = nodes.stream() + .filter(node -> matchesAddress(node, scmAddress)) + .collect(Collectors.toList()); + + if (matchedNodes.isEmpty()) { + throw new IOException("Specified --scm address " + scmAddress + + " does not match any node in service " + serviceId + + ". Available nodes: " + nodes.stream() + .map(n -> n.getScmClientAddress() + " [" + n.getNodeId() + "]") + .collect(Collectors.joining(", "))); + } + + queryNode(scmClient, matchedNodes.get(0)); + } + + private void executeForAllNodes(ScmClient scmClient) throws IOException { + final OzoneConfiguration conf = getOzoneConf(); + String serviceId = HddsUtils.getScmServiceId(conf); + + if (serviceId == null) { + executeForSingleNode(scmClient); + return; + } + + System.out.println("Service ID: " + serviceId); + List nodes = SCMNodeInfo.buildNodeInfo(conf); + + for (SCMNodeInfo node : nodes) { + queryNode(scmClient, node); + } + } + + private void queryNode(ScmClient scmClient, SCMNodeInfo node) { + String nodeId = node.getNodeId(); + + try { + boolean inSafeMode = scmClient.inSafeModeForNode(nodeId); + + System.out.printf("%s [%s]: %s%n", + node.getScmClientAddress(), + nodeId, + inSafeMode ? "IN SAFE MODE" : "OUT OF SAFE MODE"); + + if (isVerbose()) { + Map> rules = scmClient.getSafeModeRuleStatusesForNode(nodeId); + if (rules != null && !rules.isEmpty()) { + printSafeModeRules(rules); + } } + } catch (Exception e) { + System.out.printf("%s [%s]: ERROR: %s%n", + node.getScmClientAddress(), nodeId, e.getMessage()); + } + } + + /** + * Check if the given SCMNodeInfo matches the target address. + * Tries to match by direct string comparison and by resolved address. + */ + private boolean matchesAddress(SCMNodeInfo node, String targetAddress) { + String nodeAddress = node.getScmClientAddress(); + + // Direct match + if (nodeAddress.equals(targetAddress)) { + return true; + } + + // Try normalizing both addresses and comparing + try { + InetSocketAddress target = NetUtils.createSocketAddr(targetAddress); + InetSocketAddress nodeAddr = NetUtils.createSocketAddr(nodeAddress); + + // Match by resolved IP and port + return target.getPort() == nodeAddr.getPort() && + target.getAddress().equals(nodeAddr.getAddress()); + } catch (Exception e) { + // If address resolution fails, no match + return false; + } + } + + private void printSafeModeRules(Map> rules) { + for (Map.Entry> entry : rules.entrySet()) { + Pair value = entry.getValue(); + System.out.printf("validated:%s, %s, %s%n", + value.getLeft(), entry.getKey(), value.getRight()); } } } diff --git a/hadoop-ozone/cli-admin/src/main/java/org/apache/hadoop/hdds/scm/cli/ScmOption.java b/hadoop-ozone/cli-admin/src/main/java/org/apache/hadoop/hdds/scm/cli/ScmOption.java index 640433c99b3f..8b8d4b9f0f4b 100644 --- a/hadoop-ozone/cli-admin/src/main/java/org/apache/hadoop/hdds/scm/cli/ScmOption.java +++ b/hadoop-ozone/cli-admin/src/main/java/org/apache/hadoop/hdds/scm/cli/ScmOption.java @@ -88,4 +88,8 @@ public SCMSecurityProtocol createScmSecurityClient() { "Can't create SCM Security client", ex); } } + + public String getScm() { + return scm; + } } diff --git a/hadoop-ozone/cli-admin/src/main/java/org/apache/hadoop/hdds/scm/cli/ScmSubcommand.java b/hadoop-ozone/cli-admin/src/main/java/org/apache/hadoop/hdds/scm/cli/ScmSubcommand.java index 6681a4894dbe..c81a2d4c2e78 100644 --- a/hadoop-ozone/cli-admin/src/main/java/org/apache/hadoop/hdds/scm/cli/ScmSubcommand.java +++ b/hadoop-ozone/cli-admin/src/main/java/org/apache/hadoop/hdds/scm/cli/ScmSubcommand.java @@ -33,6 +33,10 @@ public abstract class ScmSubcommand extends AbstractSubcommand implements Callab protected abstract void execute(ScmClient client) throws IOException; + protected ScmOption getScmOption() { + return scmOption; + } + @Override public final Void call() throws Exception { try (ScmClient scmClient = scmOption.createScmClient()) { From 5958634280117d254053c6ee33be6ce534dcb167 Mon Sep 17 00:00:00 2001 From: Sreeja Chintalapati Date: Mon, 12 Jan 2026 15:42:59 +0530 Subject: [PATCH 2/4] Refactored code for better readability and reusability --- ...ocationProtocolClientSideTranslatorPB.java | 88 +++++++++++-------- ...ocationProtocolServerSideTranslatorPB.java | 4 +- .../hdds/scm/cli/SafeModeCheckSubcommand.java | 2 +- 3 files changed, 53 insertions(+), 41 deletions(-) diff --git a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/protocolPB/StorageContainerLocationProtocolClientSideTranslatorPB.java b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/protocolPB/StorageContainerLocationProtocolClientSideTranslatorPB.java index 699b426e43ec..f29c7e940fa6 100644 --- a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/protocolPB/StorageContainerLocationProtocolClientSideTranslatorPB.java +++ b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/protocolPB/StorageContainerLocationProtocolClientSideTranslatorPB.java @@ -204,6 +204,34 @@ private ScmContainerLocationResponse submitRequest( return response; } + /** + * Helper method to wrap the request and send the message to a specific SCM node. + * This is used for operations that need to query a specific SCM node in an HA cluster. + * + * @param nodeId the SCM node ID to send the request to + * @param type the request type + * @param builderConsumer consumer to populate the request specific fields + * @return the response from the specified SCM node + */ + private ScmContainerLocationResponse submitRequestToNode( + String nodeId, + StorageContainerLocationProtocolProtos.Type type, + Consumer builderConsumer) throws IOException { + try { + StorageContainerLocationProtocolPB proxy = fpp.getProxyForNode(nodeId); + Builder builder = ScmContainerLocationRequest.newBuilder() + .setCmdType(type) + .setVersion(ClientVersion.CURRENT_VERSION) + .setTraceID(TracingUtil.exportCurrentSpan()); + builderConsumer.accept(builder); + ScmContainerLocationRequest wrapper = builder.build(); + + return proxy.submitRequest(NULL_RPC_CONTROLLER, wrapper); + } catch (ServiceException ex) { + throw ProtobufHelper.getRemoteException(ex); + } + } + private ScmContainerLocationResponse submitRpcRequest( ScmContainerLocationRequest wrapper) throws ServiceException { if (!ADMIN_COMMAND_TYPE.contains(wrapper.getCmdType())) { @@ -847,13 +875,21 @@ public Map> getSafeModeRuleStatuses() submitRequest(Type.GetSafeModeRuleStatuses, builder -> builder.setGetSafeModeRuleStatusesRequest(request)) .getGetSafeModeRuleStatusesResponse(); - Map> map = new HashMap(); - for (SafeModeRuleStatusProto statusProto : - response.getSafeModeRuleStatusesProtoList()) { - map.put(statusProto.getRuleName(), + return buildSafeModeRuleStatusesMap(response); + } + + /** + * Helper method to build a map from GetSafeModeRuleStatusesResponseProto. + * Extracts rule names and their status information. + */ + private Map> buildSafeModeRuleStatusesMap( + GetSafeModeRuleStatusesResponseProto response) { + Map> ruleStatuses = new HashMap<>(); + for (SafeModeRuleStatusProto statusProto : response.getSafeModeRuleStatusesProtoList()) { + ruleStatuses.put(statusProto.getRuleName(), Pair.of(statusProto.getValidate(), statusProto.getStatusText())); } - return map; + return ruleStatuses; } /** @@ -877,45 +913,19 @@ public boolean forceExitSafeMode() throws IOException { @Override public boolean inSafeModeForNode(String nodeId) throws IOException { InSafeModeRequestProto request = InSafeModeRequestProto.getDefaultInstance(); - - try { - StorageContainerLocationProtocolPB proxy = fpp.getProxyForNode(nodeId); - ScmContainerLocationRequest wrapper = ScmContainerLocationRequest.newBuilder() - .setCmdType(Type.InSafeMode) - .setVersion(ClientVersion.CURRENT_VERSION) - .setTraceID(TracingUtil.exportCurrentSpan()) - .setInSafeModeRequest(request) - .build(); - ScmContainerLocationResponse response = proxy.submitRequest(NULL_RPC_CONTROLLER, wrapper); - return response.getInSafeModeResponse().getInSafeMode(); - } catch (Exception e) { - throw new IOException("Failed to get safe mode status from SCM node " + nodeId, e); - } + return submitRequestToNode(nodeId, Type.InSafeMode, + builder -> builder.setInSafeModeRequest(request)) + .getInSafeModeResponse().getInSafeMode(); } @Override public Map> getSafeModeRuleStatusesForNode(String nodeId) throws IOException { GetSafeModeRuleStatusesRequestProto request = GetSafeModeRuleStatusesRequestProto.getDefaultInstance(); - - try { - StorageContainerLocationProtocolPB proxy = fpp.getProxyForNode(nodeId); - ScmContainerLocationRequest wrapper = ScmContainerLocationRequest.newBuilder() - .setCmdType(Type.GetSafeModeRuleStatuses) - .setVersion(ClientVersion.CURRENT_VERSION) - .setTraceID(TracingUtil.exportCurrentSpan()) - .setGetSafeModeRuleStatusesRequest(request) - .build(); - ScmContainerLocationResponse response = proxy.submitRequest(NULL_RPC_CONTROLLER, wrapper); - - Map> ruleStatuses = new HashMap<>(); - for (SafeModeRuleStatusProto statusProto : - response.getGetSafeModeRuleStatusesResponse().getSafeModeRuleStatusesProtoList()) { - ruleStatuses.put(statusProto.getRuleName(), Pair.of(statusProto.getValidate(), statusProto.getStatusText())); - } - return ruleStatuses; - } catch (Exception e) { - throw new IOException("Failed to get safe mode rule statuses from SCM node " + nodeId, e); - } + GetSafeModeRuleStatusesResponseProto response = + submitRequestToNode(nodeId, Type.GetSafeModeRuleStatuses, + builder -> builder.setGetSafeModeRuleStatusesRequest(request)) + .getGetSafeModeRuleStatusesResponse(); + return buildSafeModeRuleStatusesMap(response); } @Override diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/protocol/StorageContainerLocationProtocolServerSideTranslatorPB.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/protocol/StorageContainerLocationProtocolServerSideTranslatorPB.java index 996fbe330525..b319805d0a8c 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/protocol/StorageContainerLocationProtocolServerSideTranslatorPB.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/protocol/StorageContainerLocationProtocolServerSideTranslatorPB.java @@ -211,7 +211,9 @@ public StorageContainerLocationProtocolServerSideTranslatorPB( @Override public ScmContainerLocationResponse submitRequest(RpcController controller, ScmContainerLocationRequest request) throws ServiceException { - // not leader or not belong to admin command. + // Trigger not leader exception unless: + // This is the leader node, or this is an admin command, + // or this is a follower-readable command. if (!scm.checkLeader() && !ADMIN_COMMAND_TYPE.contains(request.getCmdType()) && !FOLLOWER_READABLE_COMMAND_TYPE.contains(request.getCmdType())) { diff --git a/hadoop-ozone/cli-admin/src/main/java/org/apache/hadoop/hdds/scm/cli/SafeModeCheckSubcommand.java b/hadoop-ozone/cli-admin/src/main/java/org/apache/hadoop/hdds/scm/cli/SafeModeCheckSubcommand.java index ff294dab9a8d..6fd09100a9cb 100644 --- a/hadoop-ozone/cli-admin/src/main/java/org/apache/hadoop/hdds/scm/cli/SafeModeCheckSubcommand.java +++ b/hadoop-ozone/cli-admin/src/main/java/org/apache/hadoop/hdds/scm/cli/SafeModeCheckSubcommand.java @@ -135,7 +135,7 @@ private void queryNode(ScmClient scmClient, SCMNodeInfo node) { } } } catch (Exception e) { - System.out.printf("%s [%s]: ERROR: %s%n", + System.out.printf("%s [%s]: ERROR: Failed to get safe mode status - %s%n", node.getScmClientAddress(), nodeId, e.getMessage()); } } From 4b95b952fa262e9312ab68d45479b95b8c1682f2 Mon Sep 17 00:00:00 2001 From: Sreeja Chintalapati Date: Fri, 16 Jan 2026 15:33:55 +0530 Subject: [PATCH 3/4] Should return leader node status when no option provided --- .../hdds/scm/cli/SafeModeCheckSubcommand.java | 85 ++++++++++++++----- 1 file changed, 64 insertions(+), 21 deletions(-) diff --git a/hadoop-ozone/cli-admin/src/main/java/org/apache/hadoop/hdds/scm/cli/SafeModeCheckSubcommand.java b/hadoop-ozone/cli-admin/src/main/java/org/apache/hadoop/hdds/scm/cli/SafeModeCheckSubcommand.java index 6fd09100a9cb..9b008b21eff9 100644 --- a/hadoop-ozone/cli-admin/src/main/java/org/apache/hadoop/hdds/scm/cli/SafeModeCheckSubcommand.java +++ b/hadoop-ozone/cli-admin/src/main/java/org/apache/hadoop/hdds/scm/cli/SafeModeCheckSubcommand.java @@ -47,43 +47,90 @@ public class SafeModeCheckSubcommand extends ScmSubcommand { "When multiple SCM service IDs are configured, --service-id must be specified.") private boolean allNodes; + private String serviceId; + private List nodes; + @Override public void execute(ScmClient scmClient) throws IOException { - final OzoneConfiguration conf = getOzoneConf(); - String serviceId = HddsUtils.getScmServiceId(conf); + OzoneConfiguration conf = getOzoneConf(); + serviceId = HddsUtils.getScmServiceId(conf); + String scmAddress = getScmOption().getScm(); + if (serviceId != null) { + nodes = SCMNodeInfo.buildNodeInfo(conf); + } if (allNodes) { executeForAllNodes(scmClient); - } else if (StringUtils.isNotEmpty(getScmOption().getScm()) && serviceId != null) { - executeForSpecificNodeInHA(scmClient, serviceId); + } else if (StringUtils.isNotEmpty(scmAddress)) { + executeForSpecificNode(scmClient, scmAddress); } else { executeForSingleNode(scmClient); } } private void executeForSingleNode(ScmClient scmClient) throws IOException { - boolean execReturn = scmClient.inSafeMode(); - - // Output data list - if (execReturn) { + boolean inSafeMode; + Map> rules = null; + String leaderNodeId; + + // If SCM HA mode, query the leader node. + if (serviceId != null) { + leaderNodeId = findLeaderNodeId(scmClient); + inSafeMode = scmClient.inSafeModeForNode(leaderNodeId); + if (isVerbose()) { + rules = scmClient.getSafeModeRuleStatusesForNode(leaderNodeId); + } + } else { + // Non-HA mode + inSafeMode = scmClient.inSafeMode(); + if (isVerbose()) { + rules = scmClient.getSafeModeRuleStatuses(); + } + } + + if (inSafeMode) { System.out.println("SCM is in safe mode."); } else { System.out.println("SCM is out of safe mode."); } - if (isVerbose()) { - printSafeModeRules(scmClient.getSafeModeRuleStatuses()); + if (isVerbose() && rules != null) { + printSafeModeRules(rules); } } - private void executeForSpecificNodeInHA(ScmClient scmClient, String serviceId) throws IOException { - String scmAddress = getScmOption().getScm(); - - System.out.println("Service ID: " + serviceId); + /** + * Find the leader node ID from SCM roles. + * @param scmClient the SCM client + * @return the leader node ID, or null if not found + */ + private String findLeaderNodeId(ScmClient scmClient) throws IOException { + try { + List roles = scmClient.getScmRoles(); + for (String role : roles) { + String[] parts = role.split(":"); + if (parts.length >= 3 && "LEADER".equalsIgnoreCase(parts[2])) { + String leaderHostname = parts[0]; + for (SCMNodeInfo node : nodes) { + String nodeHostname = node.getScmClientAddress().split(":")[0]; + if (nodeHostname.equalsIgnoreCase(leaderHostname)) { + return node.getNodeId(); + } + } + } + } + return null; + } catch (IOException e) { + throw new IOException("Could not determine leader node for service: " + serviceId, e); + } + } - final OzoneConfiguration conf = getOzoneConf(); + private void executeForSpecificNode(ScmClient scmClient, String scmAddress) throws IOException { + if (serviceId == null) { + executeForSingleNode(scmClient); + return; + } - List nodes = SCMNodeInfo.buildNodeInfo(conf); - + System.out.println("Service ID: " + serviceId); // Find the node matching the --scm address List matchedNodes = nodes.stream() .filter(node -> matchesAddress(node, scmAddress)) @@ -101,16 +148,12 @@ private void executeForSpecificNodeInHA(ScmClient scmClient, String serviceId) t } private void executeForAllNodes(ScmClient scmClient) throws IOException { - final OzoneConfiguration conf = getOzoneConf(); - String serviceId = HddsUtils.getScmServiceId(conf); - if (serviceId == null) { executeForSingleNode(scmClient); return; } System.out.println("Service ID: " + serviceId); - List nodes = SCMNodeInfo.buildNodeInfo(conf); for (SCMNodeInfo node : nodes) { queryNode(scmClient, node); From dc935e68e5c9fcf20ff1544a3512cab4870628fd Mon Sep 17 00:00:00 2001 From: Sreeja Chintalapati Date: Fri, 16 Jan 2026 15:55:54 +0530 Subject: [PATCH 4/4] Addressed review comments wrt logs --- .../scm/server/SCMClientProtocolServer.java | 38 ++++++++++++++----- .../hdds/scm/cli/SafeModeCheckSubcommand.java | 3 ++ 2 files changed, 31 insertions(+), 10 deletions(-) diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMClientProtocolServer.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMClientProtocolServer.java index 832bc0caf66b..ee8ebafd2d82 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMClientProtocolServer.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMClientProtocolServer.java @@ -1035,20 +1035,38 @@ public Map> getSafeModeRuleStatuses() @Override public boolean inSafeModeForNode(String nodeId) throws IOException { - boolean result = inSafeMode(); - AUDIT.logReadSuccess( - buildAuditMessageForSuccess(SCMAction.IN_SAFE_MODE, null) - ); - return result; + Map auditMap = Maps.newHashMap(); + auditMap.put("nodeId", nodeId); + try { + boolean result = inSafeMode(); + AUDIT.logReadSuccess( + buildAuditMessageForSuccess(SCMAction.IN_SAFE_MODE, auditMap) + ); + return result; + } catch (Exception ex) { + AUDIT.logReadFailure( + buildAuditMessageForFailure(SCMAction.IN_SAFE_MODE, auditMap, ex) + ); + throw ex; + } } @Override public Map> getSafeModeRuleStatusesForNode(String nodeId) throws IOException { - Map> result = getSafeModeRuleStatuses(); - AUDIT.logReadSuccess( - buildAuditMessageForSuccess(SCMAction.GET_SAFE_MODE_RULE_STATUSES, null) - ); - return result; + Map auditMap = Maps.newHashMap(); + auditMap.put("nodeId", nodeId); + try { + Map> result = getSafeModeRuleStatuses(); + AUDIT.logReadSuccess( + buildAuditMessageForSuccess(SCMAction.GET_SAFE_MODE_RULE_STATUSES, auditMap) + ); + return result; + } catch (Exception ex) { + AUDIT.logReadFailure( + buildAuditMessageForFailure(SCMAction.GET_SAFE_MODE_RULE_STATUSES, auditMap, ex) + ); + throw ex; + } } /** diff --git a/hadoop-ozone/cli-admin/src/main/java/org/apache/hadoop/hdds/scm/cli/SafeModeCheckSubcommand.java b/hadoop-ozone/cli-admin/src/main/java/org/apache/hadoop/hdds/scm/cli/SafeModeCheckSubcommand.java index 9b008b21eff9..3a550f184d27 100644 --- a/hadoop-ozone/cli-admin/src/main/java/org/apache/hadoop/hdds/scm/cli/SafeModeCheckSubcommand.java +++ b/hadoop-ozone/cli-admin/src/main/java/org/apache/hadoop/hdds/scm/cli/SafeModeCheckSubcommand.java @@ -205,6 +205,9 @@ private boolean matchesAddress(SCMNodeInfo node, String targetAddress) { target.getAddress().equals(nodeAddr.getAddress()); } catch (Exception e) { // If address resolution fails, no match + if (isVerbose()) { + System.err.println("Warning: Failed to resolve address: " + e.getMessage()); + } return false; } }