diff --git a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/client/ScmClient.java b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/client/ScmClient.java index fbf1f46a9702..a57f6f9d7f4e 100644 --- a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/client/ScmClient.java +++ b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/client/ScmClient.java @@ -334,6 +334,22 @@ Map> getSafeModeRuleStatuses() */ boolean forceExitSafeMode() throws IOException; + /** + * Check if a specific SCM node is in safe mode. + * @param nodeId SCM node ID to query + * @return true if the node is in safe mode, false otherwise + * @throws IOException + */ + boolean inSafeModeForNode(String nodeId) throws IOException; + + /** + * Get safe mode rule statuses from a specific SCM node. + * @param nodeId SCM node ID to query + * @return Map of rule name to rule status + * @throws IOException + */ + Map> getSafeModeRuleStatusesForNode(String nodeId) throws IOException; + /** * Start ReplicationManager. */ diff --git a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/protocol/StorageContainerLocationProtocol.java b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/protocol/StorageContainerLocationProtocol.java index 92ddfa7eb8dc..4a62d667e0ec 100644 --- a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/protocol/StorageContainerLocationProtocol.java +++ b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/protocol/StorageContainerLocationProtocol.java @@ -73,6 +73,14 @@ public interface StorageContainerLocationProtocol extends Closeable { Type.StopReplicationManager, Type.ForceExitSafeMode)); + /** + * Read-only commands that can execute on followers without leader check. + * These commands respect the --scm parameter and query the specified SCM. + */ + Set FOLLOWER_READABLE_COMMAND_TYPE = Collections.unmodifiableSet(EnumSet.of( + Type.InSafeMode, + Type.GetSafeModeRuleStatuses)); + /** * Asks SCM where a container should be allocated. SCM responds with the * set of datanodes that should be used creating this container. @@ -390,6 +398,26 @@ Map> getSafeModeRuleStatuses() */ boolean forceExitSafeMode() throws IOException; + /** + * Check if a specific SCM node is in safe mode. + * In HA clusters, queries the specified node. + * + * @param nodeId SCM node ID to query + * @return true if the node is in safe mode, false otherwise + * @throws IOException + */ + boolean inSafeModeForNode(String nodeId) throws IOException; + + /** + * Get safe mode rule statuses from a specific SCM node. + * In HA clusters, queries the specified node. + * + * @param nodeId SCM node ID to query + * @return Map of rule name to rule status + * @throws IOException + */ + Map> getSafeModeRuleStatusesForNode(String nodeId) throws IOException; + /** * Start ReplicationManager. */ diff --git a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/protocolPB/StorageContainerLocationProtocolClientSideTranslatorPB.java b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/protocolPB/StorageContainerLocationProtocolClientSideTranslatorPB.java index 94b2230e68ba..f29c7e940fa6 100644 --- a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/protocolPB/StorageContainerLocationProtocolClientSideTranslatorPB.java +++ b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/protocolPB/StorageContainerLocationProtocolClientSideTranslatorPB.java @@ -145,6 +145,8 @@ import org.apache.hadoop.ozone.upgrade.UpgradeFinalization.StatusAndMessages; import org.apache.hadoop.ozone.util.ProtobufUtils; import org.apache.hadoop.security.token.Token; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * This class is the client-side translator to translate the requests made on @@ -162,6 +164,8 @@ public final class StorageContainerLocationProtocolClientSideTranslatorPB private final StorageContainerLocationProtocolPB rpcProxy; private final SCMContainerLocationFailoverProxyProvider fpp; + private static final Logger LOG = + LoggerFactory.getLogger(StorageContainerLocationProtocolClientSideTranslatorPB.class); /** * Creates a new StorageContainerLocationProtocolClientSideTranslatorPB. @@ -200,6 +204,34 @@ private ScmContainerLocationResponse submitRequest( return response; } + /** + * Helper method to wrap the request and send the message to a specific SCM node. + * This is used for operations that need to query a specific SCM node in an HA cluster. + * + * @param nodeId the SCM node ID to send the request to + * @param type the request type + * @param builderConsumer consumer to populate the request specific fields + * @return the response from the specified SCM node + */ + private ScmContainerLocationResponse submitRequestToNode( + String nodeId, + StorageContainerLocationProtocolProtos.Type type, + Consumer builderConsumer) throws IOException { + try { + StorageContainerLocationProtocolPB proxy = fpp.getProxyForNode(nodeId); + Builder builder = ScmContainerLocationRequest.newBuilder() + .setCmdType(type) + .setVersion(ClientVersion.CURRENT_VERSION) + .setTraceID(TracingUtil.exportCurrentSpan()); + builderConsumer.accept(builder); + ScmContainerLocationRequest wrapper = builder.build(); + + return proxy.submitRequest(NULL_RPC_CONTROLLER, wrapper); + } catch (ServiceException ex) { + throw ProtobufHelper.getRemoteException(ex); + } + } + private ScmContainerLocationResponse submitRpcRequest( ScmContainerLocationRequest wrapper) throws ServiceException { if (!ADMIN_COMMAND_TYPE.contains(wrapper.getCmdType())) { @@ -843,13 +875,21 @@ public Map> getSafeModeRuleStatuses() submitRequest(Type.GetSafeModeRuleStatuses, builder -> builder.setGetSafeModeRuleStatusesRequest(request)) .getGetSafeModeRuleStatusesResponse(); - Map> map = new HashMap(); - for (SafeModeRuleStatusProto statusProto : - response.getSafeModeRuleStatusesProtoList()) { - map.put(statusProto.getRuleName(), + return buildSafeModeRuleStatusesMap(response); + } + + /** + * Helper method to build a map from GetSafeModeRuleStatusesResponseProto. + * Extracts rule names and their status information. + */ + private Map> buildSafeModeRuleStatusesMap( + GetSafeModeRuleStatusesResponseProto response) { + Map> ruleStatuses = new HashMap<>(); + for (SafeModeRuleStatusProto statusProto : response.getSafeModeRuleStatusesProtoList()) { + ruleStatuses.put(statusProto.getRuleName(), Pair.of(statusProto.getValidate(), statusProto.getStatusText())); } - return map; + return ruleStatuses; } /** @@ -870,6 +910,24 @@ public boolean forceExitSafeMode() throws IOException { } + @Override + public boolean inSafeModeForNode(String nodeId) throws IOException { + InSafeModeRequestProto request = InSafeModeRequestProto.getDefaultInstance(); + return submitRequestToNode(nodeId, Type.InSafeMode, + builder -> builder.setInSafeModeRequest(request)) + .getInSafeModeResponse().getInSafeMode(); + } + + @Override + public Map> getSafeModeRuleStatusesForNode(String nodeId) throws IOException { + GetSafeModeRuleStatusesRequestProto request = GetSafeModeRuleStatusesRequestProto.getDefaultInstance(); + GetSafeModeRuleStatusesResponseProto response = + submitRequestToNode(nodeId, Type.GetSafeModeRuleStatuses, + builder -> builder.setGetSafeModeRuleStatusesRequest(request)) + .getGetSafeModeRuleStatusesResponse(); + return buildSafeModeRuleStatusesMap(response); + } + @Override public void startReplicationManager() throws IOException { diff --git a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/proxy/SCMFailoverProxyProviderBase.java b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/proxy/SCMFailoverProxyProviderBase.java index 0dfa95a95246..272db7a04ae4 100644 --- a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/proxy/SCMFailoverProxyProviderBase.java +++ b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/proxy/SCMFailoverProxyProviderBase.java @@ -193,6 +193,17 @@ public synchronized List getProxies() { .map(proxyInfo -> proxyInfo.proxy).collect(Collectors.toList()); } + public synchronized T getProxyForNode(String nodeId) throws IOException { + ProxyInfo proxyInfo = scmProxies.get(nodeId); + if (proxyInfo == null) { + if (!scmProxyInfoMap.containsKey(nodeId)) { + throw new IOException("Unknown SCM node ID: " + nodeId); + } + proxyInfo = createSCMProxy(nodeId); + } + return proxyInfo.proxy; + } + @Override public synchronized void performFailover(T newLeader) { if (updatedLeaderNodeID != null) { diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/protocol/StorageContainerLocationProtocolServerSideTranslatorPB.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/protocol/StorageContainerLocationProtocolServerSideTranslatorPB.java index 3b061aa10c01..b319805d0a8c 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/protocol/StorageContainerLocationProtocolServerSideTranslatorPB.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/protocol/StorageContainerLocationProtocolServerSideTranslatorPB.java @@ -29,6 +29,7 @@ import static org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.Type.ListContainer; import static org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.Type.ListPipelines; import static org.apache.hadoop.hdds.scm.protocol.StorageContainerLocationProtocol.ADMIN_COMMAND_TYPE; +import static org.apache.hadoop.hdds.scm.protocol.StorageContainerLocationProtocol.FOLLOWER_READABLE_COMMAND_TYPE; import com.google.protobuf.ProtocolMessageEnum; import com.google.protobuf.RpcController; @@ -210,9 +211,12 @@ public StorageContainerLocationProtocolServerSideTranslatorPB( @Override public ScmContainerLocationResponse submitRequest(RpcController controller, ScmContainerLocationRequest request) throws ServiceException { - // not leader or not belong to admin command. + // Trigger not leader exception unless: + // This is the leader node, or this is an admin command, + // or this is a follower-readable command. if (!scm.checkLeader() - && !ADMIN_COMMAND_TYPE.contains(request.getCmdType())) { + && !ADMIN_COMMAND_TYPE.contains(request.getCmdType()) + && !FOLLOWER_READABLE_COMMAND_TYPE.contains(request.getCmdType())) { RatisUtil.checkRatisException( scm.getScmHAManager().getRatisServer().triggerNotLeaderException(), scm.getClientRpcPort(), scm.getScmId(), scm.getHostname(), ROLE_TYPE); diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMClientProtocolServer.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMClientProtocolServer.java index 3bd5214ea0ec..ee8ebafd2d82 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMClientProtocolServer.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMClientProtocolServer.java @@ -1033,6 +1033,42 @@ public Map> getSafeModeRuleStatuses() } } + @Override + public boolean inSafeModeForNode(String nodeId) throws IOException { + Map auditMap = Maps.newHashMap(); + auditMap.put("nodeId", nodeId); + try { + boolean result = inSafeMode(); + AUDIT.logReadSuccess( + buildAuditMessageForSuccess(SCMAction.IN_SAFE_MODE, auditMap) + ); + return result; + } catch (Exception ex) { + AUDIT.logReadFailure( + buildAuditMessageForFailure(SCMAction.IN_SAFE_MODE, auditMap, ex) + ); + throw ex; + } + } + + @Override + public Map> getSafeModeRuleStatusesForNode(String nodeId) throws IOException { + Map auditMap = Maps.newHashMap(); + auditMap.put("nodeId", nodeId); + try { + Map> result = getSafeModeRuleStatuses(); + AUDIT.logReadSuccess( + buildAuditMessageForSuccess(SCMAction.GET_SAFE_MODE_RULE_STATUSES, auditMap) + ); + return result; + } catch (Exception ex) { + AUDIT.logReadFailure( + buildAuditMessageForFailure(SCMAction.GET_SAFE_MODE_RULE_STATUSES, auditMap, ex) + ); + throw ex; + } + } + /** * Force SCM out of Safe mode. * diff --git a/hadoop-ozone/cli-admin/src/main/java/org/apache/hadoop/hdds/scm/cli/ContainerOperationClient.java b/hadoop-ozone/cli-admin/src/main/java/org/apache/hadoop/hdds/scm/cli/ContainerOperationClient.java index 61c0f4150c34..2c7c0f9a57b0 100644 --- a/hadoop-ozone/cli-admin/src/main/java/org/apache/hadoop/hdds/scm/cli/ContainerOperationClient.java +++ b/hadoop-ozone/cli-admin/src/main/java/org/apache/hadoop/hdds/scm/cli/ContainerOperationClient.java @@ -477,6 +477,16 @@ public boolean forceExitSafeMode() throws IOException { return storageContainerLocationClient.forceExitSafeMode(); } + @Override + public boolean inSafeModeForNode(String nodeId) throws IOException { + return storageContainerLocationClient.inSafeModeForNode(nodeId); + } + + @Override + public Map> getSafeModeRuleStatusesForNode(String nodeId) throws IOException { + return storageContainerLocationClient.getSafeModeRuleStatusesForNode(nodeId); + } + @Override public void startReplicationManager() throws IOException { storageContainerLocationClient.startReplicationManager(); diff --git a/hadoop-ozone/cli-admin/src/main/java/org/apache/hadoop/hdds/scm/cli/SafeModeCheckSubcommand.java b/hadoop-ozone/cli-admin/src/main/java/org/apache/hadoop/hdds/scm/cli/SafeModeCheckSubcommand.java index d15be56410f8..3a550f184d27 100644 --- a/hadoop-ozone/cli-admin/src/main/java/org/apache/hadoop/hdds/scm/cli/SafeModeCheckSubcommand.java +++ b/hadoop-ozone/cli-admin/src/main/java/org/apache/hadoop/hdds/scm/cli/SafeModeCheckSubcommand.java @@ -18,10 +18,19 @@ package org.apache.hadoop.hdds.scm.cli; import java.io.IOException; +import java.net.InetSocketAddress; +import java.util.List; import java.util.Map; +import java.util.stream.Collectors; +import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.tuple.Pair; +import org.apache.hadoop.hdds.HddsUtils; import org.apache.hadoop.hdds.cli.HddsVersionProvider; +import org.apache.hadoop.hdds.conf.OzoneConfiguration; import org.apache.hadoop.hdds.scm.client.ScmClient; +import org.apache.hadoop.hdds.scm.ha.SCMNodeInfo; +import org.apache.hadoop.net.NetUtils; +import picocli.CommandLine; import picocli.CommandLine.Command; /** @@ -33,24 +42,181 @@ mixinStandardHelpOptions = true, versionProvider = HddsVersionProvider.class) public class SafeModeCheckSubcommand extends ScmSubcommand { + @CommandLine.Option(names = {"--all", "-a"}, + description = "Show safe mode status for all SCM nodes in the service. " + + "When multiple SCM service IDs are configured, --service-id must be specified.") + private boolean allNodes; + + private String serviceId; + private List nodes; @Override public void execute(ScmClient scmClient) throws IOException { - boolean execReturn = scmClient.inSafeMode(); + OzoneConfiguration conf = getOzoneConf(); + serviceId = HddsUtils.getScmServiceId(conf); + String scmAddress = getScmOption().getScm(); + if (serviceId != null) { + nodes = SCMNodeInfo.buildNodeInfo(conf); + } + + if (allNodes) { + executeForAllNodes(scmClient); + } else if (StringUtils.isNotEmpty(scmAddress)) { + executeForSpecificNode(scmClient, scmAddress); + } else { + executeForSingleNode(scmClient); + } + } + + private void executeForSingleNode(ScmClient scmClient) throws IOException { + boolean inSafeMode; + Map> rules = null; + String leaderNodeId; - // Output data list - if (execReturn) { + // If SCM HA mode, query the leader node. + if (serviceId != null) { + leaderNodeId = findLeaderNodeId(scmClient); + inSafeMode = scmClient.inSafeModeForNode(leaderNodeId); + if (isVerbose()) { + rules = scmClient.getSafeModeRuleStatusesForNode(leaderNodeId); + } + } else { + // Non-HA mode + inSafeMode = scmClient.inSafeMode(); + if (isVerbose()) { + rules = scmClient.getSafeModeRuleStatuses(); + } + } + + if (inSafeMode) { System.out.println("SCM is in safe mode."); } else { System.out.println("SCM is out of safe mode."); } - if (isVerbose()) { - for (Map.Entry> entry : - scmClient.getSafeModeRuleStatuses().entrySet()) { - Pair value = entry.getValue(); - System.out.printf("validated:%s, %s, %s%n", - value.getLeft(), entry.getKey(), value.getRight()); + if (isVerbose() && rules != null) { + printSafeModeRules(rules); + } + } + + /** + * Find the leader node ID from SCM roles. + * @param scmClient the SCM client + * @return the leader node ID, or null if not found + */ + private String findLeaderNodeId(ScmClient scmClient) throws IOException { + try { + List roles = scmClient.getScmRoles(); + for (String role : roles) { + String[] parts = role.split(":"); + if (parts.length >= 3 && "LEADER".equalsIgnoreCase(parts[2])) { + String leaderHostname = parts[0]; + for (SCMNodeInfo node : nodes) { + String nodeHostname = node.getScmClientAddress().split(":")[0]; + if (nodeHostname.equalsIgnoreCase(leaderHostname)) { + return node.getNodeId(); + } + } + } } + return null; + } catch (IOException e) { + throw new IOException("Could not determine leader node for service: " + serviceId, e); + } + } + + private void executeForSpecificNode(ScmClient scmClient, String scmAddress) throws IOException { + if (serviceId == null) { + executeForSingleNode(scmClient); + return; + } + + System.out.println("Service ID: " + serviceId); + // Find the node matching the --scm address + List matchedNodes = nodes.stream() + .filter(node -> matchesAddress(node, scmAddress)) + .collect(Collectors.toList()); + + if (matchedNodes.isEmpty()) { + throw new IOException("Specified --scm address " + scmAddress + + " does not match any node in service " + serviceId + + ". Available nodes: " + nodes.stream() + .map(n -> n.getScmClientAddress() + " [" + n.getNodeId() + "]") + .collect(Collectors.joining(", "))); + } + + queryNode(scmClient, matchedNodes.get(0)); + } + + private void executeForAllNodes(ScmClient scmClient) throws IOException { + if (serviceId == null) { + executeForSingleNode(scmClient); + return; + } + + System.out.println("Service ID: " + serviceId); + + for (SCMNodeInfo node : nodes) { + queryNode(scmClient, node); + } + } + + private void queryNode(ScmClient scmClient, SCMNodeInfo node) { + String nodeId = node.getNodeId(); + + try { + boolean inSafeMode = scmClient.inSafeModeForNode(nodeId); + + System.out.printf("%s [%s]: %s%n", + node.getScmClientAddress(), + nodeId, + inSafeMode ? "IN SAFE MODE" : "OUT OF SAFE MODE"); + + if (isVerbose()) { + Map> rules = scmClient.getSafeModeRuleStatusesForNode(nodeId); + if (rules != null && !rules.isEmpty()) { + printSafeModeRules(rules); + } + } + } catch (Exception e) { + System.out.printf("%s [%s]: ERROR: Failed to get safe mode status - %s%n", + node.getScmClientAddress(), nodeId, e.getMessage()); + } + } + + /** + * Check if the given SCMNodeInfo matches the target address. + * Tries to match by direct string comparison and by resolved address. + */ + private boolean matchesAddress(SCMNodeInfo node, String targetAddress) { + String nodeAddress = node.getScmClientAddress(); + + // Direct match + if (nodeAddress.equals(targetAddress)) { + return true; + } + + // Try normalizing both addresses and comparing + try { + InetSocketAddress target = NetUtils.createSocketAddr(targetAddress); + InetSocketAddress nodeAddr = NetUtils.createSocketAddr(nodeAddress); + + // Match by resolved IP and port + return target.getPort() == nodeAddr.getPort() && + target.getAddress().equals(nodeAddr.getAddress()); + } catch (Exception e) { + // If address resolution fails, no match + if (isVerbose()) { + System.err.println("Warning: Failed to resolve address: " + e.getMessage()); + } + return false; + } + } + + private void printSafeModeRules(Map> rules) { + for (Map.Entry> entry : rules.entrySet()) { + Pair value = entry.getValue(); + System.out.printf("validated:%s, %s, %s%n", + value.getLeft(), entry.getKey(), value.getRight()); } } } diff --git a/hadoop-ozone/cli-admin/src/main/java/org/apache/hadoop/hdds/scm/cli/ScmOption.java b/hadoop-ozone/cli-admin/src/main/java/org/apache/hadoop/hdds/scm/cli/ScmOption.java index 640433c99b3f..8b8d4b9f0f4b 100644 --- a/hadoop-ozone/cli-admin/src/main/java/org/apache/hadoop/hdds/scm/cli/ScmOption.java +++ b/hadoop-ozone/cli-admin/src/main/java/org/apache/hadoop/hdds/scm/cli/ScmOption.java @@ -88,4 +88,8 @@ public SCMSecurityProtocol createScmSecurityClient() { "Can't create SCM Security client", ex); } } + + public String getScm() { + return scm; + } } diff --git a/hadoop-ozone/cli-admin/src/main/java/org/apache/hadoop/hdds/scm/cli/ScmSubcommand.java b/hadoop-ozone/cli-admin/src/main/java/org/apache/hadoop/hdds/scm/cli/ScmSubcommand.java index 6681a4894dbe..c81a2d4c2e78 100644 --- a/hadoop-ozone/cli-admin/src/main/java/org/apache/hadoop/hdds/scm/cli/ScmSubcommand.java +++ b/hadoop-ozone/cli-admin/src/main/java/org/apache/hadoop/hdds/scm/cli/ScmSubcommand.java @@ -33,6 +33,10 @@ public abstract class ScmSubcommand extends AbstractSubcommand implements Callab protected abstract void execute(ScmClient client) throws IOException; + protected ScmOption getScmOption() { + return scmOption; + } + @Override public final Void call() throws Exception { try (ScmClient scmClient = scmOption.createScmClient()) {