From c50d043b4399762f17b20e105cc6611eb69f720d Mon Sep 17 00:00:00 2001 From: Scott Wittenburg Date: Thu, 6 Nov 2025 10:43:56 -0700 Subject: [PATCH 01/37] bp5writer: defer writer subfile map generation --- source/adios2/engine/bp5/BP5Writer.cpp | 54 +++++++++++++------------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/source/adios2/engine/bp5/BP5Writer.cpp b/source/adios2/engine/bp5/BP5Writer.cpp index bda55bea49..5b7b165a37 100644 --- a/source/adios2/engine/bp5/BP5Writer.cpp +++ b/source/adios2/engine/bp5/BP5Writer.cpp @@ -1020,6 +1020,33 @@ void BP5Writer::EndStep() m_Profiler.Stop("ES_AWD"); + if (m_Parameters.verbose > 2) + { + std::cout << "Rank " << m_Comm.Rank() << " deciding whether new writer map is needed" + << std::endl; + std::cout << " m_WriterStep: " << m_WriterStep << std::endl; + std::cout << " m_AppendWriterCount: " << m_AppendWriterCount + << ", m_Comm.Size(): " << m_Comm.Size() << std::endl; + std::cout << " m_AppendAggregatorCount: " << m_AppendAggregatorCount + << ", m_Aggregator->m_NumAggregators: " << m_Aggregator->m_NumAggregators + << std::endl; + std::cout << " m_AppendSubfileCount: " << m_AppendSubfileCount + << ", m_Aggregator->m_SubStreams: " << m_Aggregator->m_SubStreams << std::endl; + } + + if (!m_WriterStep || m_AppendWriterCount != static_cast(m_Comm.Size()) || + m_AppendAggregatorCount != static_cast(m_Aggregator->m_NumAggregators) || + m_AppendSubfileCount != static_cast(m_Aggregator->m_SubStreams)) + { + // new Writer Map is needed + if (m_Parameters.verbose > 2) + { + std::cout << "Rank " << m_Comm.Rank() << " new writer map needed" << std::endl; + } + const uint64_t a = static_cast(m_Aggregator->m_SubStreamIndex); + m_WriterSubfileMap = m_Comm.GatherValues(a, 0); + } + if (m_Parameters.UseSelectiveMetadataAggregation) { SelectiveAggregationMetadata(TSInfo); @@ -1963,33 +1990,6 @@ void BP5Writer::InitBPBuffer() { m_WriterDataPos.resize(m_Comm.Size()); } - - if (m_Parameters.verbose > 2) - { - std::cout << "Rank " << m_Comm.Rank() << " deciding whether new writer map is needed" - << std::endl; - std::cout << " m_WriterStep: " << m_WriterStep << std::endl; - std::cout << " m_AppendWriterCount: " << m_AppendWriterCount - << ", m_Comm.Size(): " << m_Comm.Size() << std::endl; - std::cout << " m_AppendAggregatorCount: " << m_AppendAggregatorCount - << ", m_Aggregator->m_NumAggregators: " << m_Aggregator->m_NumAggregators - << std::endl; - std::cout << " m_AppendSubfileCount: " << m_AppendSubfileCount - << ", m_Aggregator->m_SubStreams: " << m_Aggregator->m_SubStreams << std::endl; - } - - if (!m_WriterStep || m_AppendWriterCount != static_cast(m_Comm.Size()) || - m_AppendAggregatorCount != static_cast(m_Aggregator->m_NumAggregators) || - m_AppendSubfileCount != static_cast(m_Aggregator->m_SubStreams)) - { - // new Writer Map is needed, generate now, write later - if (m_Parameters.verbose > 2) - { - std::cout << "Rank " << m_Comm.Rank() << " new writer map needed" << std::endl; - } - const uint64_t a = static_cast(m_Aggregator->m_SubStreamIndex); - m_WriterSubfileMap = m_Comm.GatherValues(a, 0); - } } void BP5Writer::EnterComputationBlock() noexcept From 4f47ee3bd34376138d384630f14a2f43620971e4 Mon Sep 17 00:00:00 2001 From: Scott Wittenburg Date: Thu, 6 Nov 2025 10:46:08 -0700 Subject: [PATCH 02/37] bp5writer: InitTransports() allow opening a file later Break out the bits of InitTransports that generate the filename and open the file, and put them into a new function that can be called either with or without an mpi communicator. This supports the rerouting aggregator, currently under development, by allowing rerouted ranks to change their target subfile near the end of the writing process, after opening a different file at the beginning. --- source/adios2/engine/bp5/BP5Writer.cpp | 95 +++++++++++++++----------- source/adios2/engine/bp5/BP5Writer.h | 1 + 2 files changed, 56 insertions(+), 40 deletions(-) diff --git a/source/adios2/engine/bp5/BP5Writer.cpp b/source/adios2/engine/bp5/BP5Writer.cpp index 5b7b165a37..54c59e15fe 100644 --- a/source/adios2/engine/bp5/BP5Writer.cpp +++ b/source/adios2/engine/bp5/BP5Writer.cpp @@ -1636,29 +1636,9 @@ void BP5Writer::InitMetadataTransports() void BP5Writer::InitTransports() { - std::string cacheKey = GetCacheKey(m_Aggregator); - auto search = m_AggregatorSpecifics.find(cacheKey); - bool cacheHit = false; - - if (search != m_AggregatorSpecifics.end()) - { - if (m_Parameters.verbose > 2) - { - std::cout << "Rank " << m_Comm.Rank() << " cache hit for aggregator key " << cacheKey - << std::endl; - } - cacheHit = true; - } - else - { - // Didn't have one in the cache, add it now - m_AggregatorSpecifics.emplace(std::make_pair(cacheKey, AggTransportData(m_IO, m_Comm))); - } - - AggTransportData &aggData = m_AggregatorSpecifics.at(cacheKey); + OpenSubfile(); - // /path/name.bp.dir/name.bp.rank - aggData.m_SubStreamNames = GetBPSubStreamNames({m_Name}, m_Aggregator->m_SubStreamIndex); + AggTransportData &aggData = m_AggregatorSpecifics.at(GetCacheKey(m_Aggregator)); if (m_IAmDraining) { @@ -1685,9 +1665,49 @@ void BP5Writer::InitTransports() aggData.m_DrainSubStreamNames, m_IO.m_TransportsParameters, m_Parameters.NodeLocal); } + if (m_IAmDraining) + { + if (m_DrainBB) + { + for (const auto &name : aggData.m_DrainSubStreamNames) + { + m_FileDrainer.AddOperationOpen(name, m_OpenMode); + } + } + } + + InitBPBuffer(); +} + +void BP5Writer::OpenSubfile(bool useComm) +{ + std::string cacheKey = GetCacheKey(m_Aggregator); + auto search = m_AggregatorSpecifics.find(cacheKey); + bool cacheHit = false; + + if (search != m_AggregatorSpecifics.end()) + { + if (m_Parameters.verbose > 2) + { + std::cout << "Rank " << m_Comm.Rank() << " cache hit for aggregator key " << cacheKey + << std::endl; + } + cacheHit = true; + } + else + { + // Didn't have one in the cache, add it now + m_AggregatorSpecifics.emplace(std::make_pair(cacheKey, AggTransportData(m_IO, m_Comm))); + } + + AggTransportData &aggData = m_AggregatorSpecifics.at(cacheKey); + + // /path/name.bp.dir/name.bp.rank + aggData.m_SubStreamNames = GetBPSubStreamNames({m_Name}, m_Aggregator->m_SubStreamIndex); + helper::Comm openSyncComm; - if (m_Parameters.AggregationType == (int)AggregationType::DataSizeBased) + if (m_Parameters.AggregationType == (int)AggregationType::DataSizeBased && useComm) { // Split my writer chain so only ranks that actually need to open a // file can do so in an ordered fashion. @@ -1718,29 +1738,24 @@ void BP5Writer::InitTransports() { std::cout << "Rank " << m_Comm.Rank() << " opening data file" << std::endl; } - aggData.m_FileDataManager.OpenFiles(aggData.m_SubStreamNames, mode, - m_IO.m_TransportsParameters, true, - *DataWritingComm); + if (useComm) + { + aggData.m_FileDataManager.OpenFiles(aggData.m_SubStreamNames, mode, + m_IO.m_TransportsParameters, true, + *DataWritingComm); + } + else + { + aggData.m_FileDataManager.OpenFiles(aggData.m_SubStreamNames, mode, + m_IO.m_TransportsParameters, true); + } } } - if (m_Parameters.AggregationType == (int)AggregationType::DataSizeBased) + if (m_Parameters.AggregationType == (int)AggregationType::DataSizeBased && useComm) { openSyncComm.Free(); } - - if (m_IAmDraining) - { - if (m_DrainBB) - { - for (const auto &name : aggData.m_DrainSubStreamNames) - { - m_FileDrainer.AddOperationOpen(name, m_OpenMode); - } - } - } - - this->InitBPBuffer(); } /*generate the header for the metadata index file*/ diff --git a/source/adios2/engine/bp5/BP5Writer.h b/source/adios2/engine/bp5/BP5Writer.h index 4ded24e8d2..0f18356422 100644 --- a/source/adios2/engine/bp5/BP5Writer.h +++ b/source/adios2/engine/bp5/BP5Writer.h @@ -78,6 +78,7 @@ class BP5Writer : public BP5Engine, public core::Engine std::map m_AggregatorSpecifics; helper::RankPartition GetPartitionInfo(const uint64_t rankDataSize, const int subStreams, helper::Comm const &parentComm); + void OpenSubfile(const bool useComm = true); /** Single object controlling BP buffering */ format::BP5Serializer m_BP5Serializer; From db61893e92eb02bab3d19abfce8c348dbb1fdf63 Mon Sep 17 00:00:00 2001 From: Scott Wittenburg Date: Fri, 26 Sep 2025 15:44:50 -0600 Subject: [PATCH 03/37] Add a RerouteMessage class with send/recv capability --- source/adios2/CMakeLists.txt | 1 + source/adios2/helper/adiosComm.h | 8 ++ source/adios2/helper/adiosComm.inl | 18 ++- source/adios2/helper/adiosCommMPI.cpp | 22 +++- source/adios2/helper/adiosRerouting.cpp | 86 ++++++++++++++ source/adios2/helper/adiosRerouting.h | 65 +++++++++++ .../adios2/engine/bp/TestBPRerouteMessage.cpp | 108 ++++++++++++++++++ 7 files changed, 297 insertions(+), 11 deletions(-) create mode 100644 source/adios2/helper/adiosRerouting.cpp create mode 100644 source/adios2/helper/adiosRerouting.h create mode 100644 testing/adios2/engine/bp/TestBPRerouteMessage.cpp diff --git a/source/adios2/CMakeLists.txt b/source/adios2/CMakeLists.txt index daa921003b..2db06ff410 100644 --- a/source/adios2/CMakeLists.txt +++ b/source/adios2/CMakeLists.txt @@ -33,6 +33,7 @@ add_library(adios2_core helper/adiosNetwork.cpp helper/adiosPartitioner.cpp helper/adiosPluginManager.cpp + helper/adiosRerouting.cpp helper/adiosString.cpp helper/adiosString.tcc helper/adiosSystem.cpp helper/adiosType.cpp diff --git a/source/adios2/helper/adiosComm.h b/source/adios2/helper/adiosComm.h index a7e35e20d0..cd2f7a1865 100644 --- a/source/adios2/helper/adiosComm.h +++ b/source/adios2/helper/adiosComm.h @@ -60,6 +60,14 @@ class Comm Shared }; + /** + * @brief Various constants + */ + enum class Constants + { + CommRecvAny = -10 + }; + /** * @brief Default constructor. Produces an empty communicator. * diff --git a/source/adios2/helper/adiosComm.inl b/source/adios2/helper/adiosComm.inl index dac093af2e..035d22823e 100644 --- a/source/adios2/helper/adiosComm.inl +++ b/source/adios2/helper/adiosComm.inl @@ -259,9 +259,12 @@ Comm::Status Comm::Recv(T *buf, size_t count, int source, int tag, { if (source < 0 || source > m_Impl->Size() - 1) { - throw std::runtime_error( - "Invalid MPI source rank in Recv: " + std::to_string(source) + - " for a communicator of size " + std::to_string(m_Impl->Size())); + if (source != static_cast(Comm::Constants::CommRecvAny)) + { + throw std::runtime_error( + "Invalid MPI source rank in Recv: " + std::to_string(source) + + " for a communicator of size " + std::to_string(m_Impl->Size())); + } } return m_Impl->Recv(buf, count, CommImpl::GetDatatype(), source, tag, hint); @@ -296,9 +299,12 @@ Comm::Req Comm::Irecv(T *buffer, const size_t count, int source, int tag, { if (source < 0 || source > m_Impl->Size() - 1) { - throw std::runtime_error( - "Invalid MPI source rank in Irecv: " + std::to_string(source) + - " for a communicator of size " + std::to_string(m_Impl->Size())); + if (source != static_cast(Comm::Constants::CommRecvAny)) + { + throw std::runtime_error( + "Invalid MPI source rank in Irecv: " + std::to_string(source) + + " for a communicator of size " + std::to_string(m_Impl->Size())); + } } return m_Impl->Irecv(buffer, count, CommImpl::GetDatatype(), source, tag, hint); diff --git a/source/adios2/helper/adiosCommMPI.cpp b/source/adios2/helper/adiosCommMPI.cpp index 7e6d8b1bd1..09933af786 100644 --- a/source/adios2/helper/adiosCommMPI.cpp +++ b/source/adios2/helper/adiosCommMPI.cpp @@ -19,6 +19,7 @@ #include "adios2/common/ADIOSTypes.h" #include +#include namespace adios2 { @@ -68,6 +69,17 @@ const MPI_Datatype DatatypeToMPI[] = { MPI_SHORT_INT, }; +int GetMPISource(int source) +{ + if (source == static_cast(Comm::Constants::CommRecvAny)) + { + std::cout << "Source override" << std::endl; + return MPI_ANY_SOURCE; + } + + return source; +} + MPI_Datatype ToMPI(CommImpl::Datatype dt) { return DatatypeToMPI[int(dt)]; } void CheckMPIReturn(const int value, const std::string &hint) @@ -388,8 +400,8 @@ Comm::Status CommImplMPI::Recv(void *buf, size_t count, Datatype datatype, int s { MPI_Status mpiStatus; CheckMPIReturn( - MPI_Recv(buf, static_cast(count), ToMPI(datatype), source, tag, m_MPIComm, &mpiStatus), - hint); + MPI_Recv(buf, static_cast(count), ToMPI(datatype), GetMPISource(source), tag, + m_MPIComm, &mpiStatus), hint); Comm::Status status; status.Source = mpiStatus.MPI_SOURCE; @@ -470,7 +482,7 @@ Comm::Req CommImplMPI::Irecv(void *buffer, size_t count, Datatype datatype, int int batchSize = static_cast(DefaultMaxFileBatchSize); MPI_Request mpiReq; CheckMPIReturn(MPI_Irecv(static_cast(buffer) + position, batchSize, - ToMPI(datatype), source, tag, m_MPIComm, &mpiReq), + ToMPI(datatype), GetMPISource(source), tag, m_MPIComm, &mpiReq), "in call to Irecv batch " + std::to_string(b) + " " + hint + "\n"); req->m_MPIReqs.emplace_back(mpiReq); @@ -483,7 +495,7 @@ Comm::Req CommImplMPI::Irecv(void *buffer, size_t count, Datatype datatype, int int batchSize = static_cast(remainder); MPI_Request mpiReq; CheckMPIReturn(MPI_Irecv(static_cast(buffer) + position, batchSize, - ToMPI(datatype), source, tag, m_MPIComm, &mpiReq), + ToMPI(datatype), GetMPISource(source), tag, m_MPIComm, &mpiReq), "in call to Irecv remainder batch " + hint + "\n"); req->m_MPIReqs.emplace_back(mpiReq); } @@ -493,7 +505,7 @@ Comm::Req CommImplMPI::Irecv(void *buffer, size_t count, Datatype datatype, int int batchSize = static_cast(count); MPI_Request mpiReq; CheckMPIReturn( - MPI_Irecv(buffer, batchSize, ToMPI(datatype), source, tag, m_MPIComm, &mpiReq), + MPI_Irecv(buffer, batchSize, ToMPI(datatype), GetMPISource(source), tag, m_MPIComm, &mpiReq), " in call to Isend with single batch " + hint + "\n"); req->m_MPIReqs.emplace_back(mpiReq); } diff --git a/source/adios2/helper/adiosRerouting.cpp b/source/adios2/helper/adiosRerouting.cpp new file mode 100644 index 0000000000..db552069bc --- /dev/null +++ b/source/adios2/helper/adiosRerouting.cpp @@ -0,0 +1,86 @@ +/* + * Distributed under the OSI-approved Apache License, Version 2.0. See + * accompanying file Copyright.txt for details. + * + * adiosRerouting.cpp + * + * Created on: Sept 26, 2025 + * Author: Scott Wittenburg scott.wittenburg@kitware.com + */ + +#include "adiosRerouting.h" +#include "adios2/helper/adiosComm.h" +#include "adios2/helper/adiosMemory.h" + +/// \cond EXCLUDE_FROM_DOXYGEN +#include +#include +#include +#include +#include +#include +/// \endcond + + +namespace adios2 +{ +namespace helper +{ + +void RerouteMessage::ToBuffer(std::vector &buffer) +{ + size_t pos = 0; + buffer.resize(REROUTE_MESSAGE_SIZE); + helper::CopyToBuffer(buffer, pos, &this->m_MsgType); + helper::CopyToBuffer(buffer, pos, &this->m_SrcRank); + helper::CopyToBuffer(buffer, pos, &this->m_DestRank); + helper::CopyToBuffer(buffer, pos, &this->m_SubStreamIdx); + helper::CopyToBuffer(buffer, pos, &this->m_Offset); + helper::CopyToBuffer(buffer, pos, &this->m_Size); +} + + +void RerouteMessage::FromBuffer(const std::vector &buffer) +{ + size_t pos = 0; + helper::CopyFromBuffer(buffer.data(), pos, &this->m_MsgType); + helper::CopyFromBuffer(buffer.data(), pos, &this->m_SrcRank); + helper::CopyFromBuffer(buffer.data(), pos, &this->m_DestRank); + helper::CopyFromBuffer(buffer.data(), pos, &this->m_SubStreamIdx); + helper::CopyFromBuffer(buffer.data(), pos, &this->m_Offset); + helper::CopyFromBuffer(buffer.data(), pos, &this->m_Size); +} + +void RerouteMessage::SendTo(helper::Comm& comm, int destRank) +{ + std::vector sendBuf; + this->ToBuffer(sendBuf); + + // You'd think send and/or recv should be non-blocking for this to work, + // especially when sending to ourselves, but somehow blocking send and recv + // seems to work. + + // Non-blocking send + // comm.Isend(sendBuf.data(), sendBuf.size(), desinationtRank, 0); + + // "Blocking" send + comm.Send(sendBuf.data(), sendBuf.size(), destRank, 0); +} + +void RerouteMessage::RecvFrom(helper::Comm& comm, int srcRank) +{ + std::vector recvBuf; + recvBuf.resize(REROUTE_MESSAGE_SIZE); + + // Non-blocking receive + // helper::Comm::Req req = comm.Irecv(recvBuf.data(), msgSize, recvFlag, 0); + // helper::Comm::Status status = req.Wait(); + + // "Blocking" receive + helper::Comm::Status status = comm.Recv(recvBuf.data(), REROUTE_MESSAGE_SIZE, srcRank, 0); + + this->FromBuffer(recvBuf); +} + +} // end namespace helper +} // end namespace adios2 diff --git a/source/adios2/helper/adiosRerouting.h b/source/adios2/helper/adiosRerouting.h new file mode 100644 index 0000000000..26ffeb6048 --- /dev/null +++ b/source/adios2/helper/adiosRerouting.h @@ -0,0 +1,65 @@ +/* + * Distributed under the OSI-approved Apache License, Version 2.0. See + * accompanying file Copyright.txt for details. + * + * adiosRerouting.h helpers for BP5 rerouting aggregation + * + * Created on: Sept 26, 2025 + * Author: Scott Wittenburg scott.wittenburg@kitware.com + */ + +#ifndef ADIOS2_HELPER_ADIOSREROUTING_H_ +#define ADIOS2_HELPER_ADIOSREROUTING_H_ + +#include "adios2/helper/adiosComm.h" + +/// \cond EXCLUDE_FROM_DOXYGEN +#include +#include +#include +/// \endcond + +namespace adios2 +{ + +namespace helper +{ + +class RerouteMessage +{ +public: + enum class MessageType + { + WRITER_IDLE, + REROUTE_REQUEST, + REROUTE_ACK, + REROUTE_REJECT, + }; + + // Store a RerouteMsg so it can sent/received over mpi + void ToBuffer(std::vector &buffer); + + // Reconstitute member fields from buffer + void FromBuffer(const std::vector &buffer); + + // Send the contents of this message to another rank + void SendTo(helper::Comm& comm, int destRank); + + // Receive a message from another rank to populate this message + void RecvFrom(helper::Comm& comm, int srcRank); + + MessageType m_MsgType; + int m_SrcRank; + int m_DestRank; + unsigned long m_SubStreamIdx; + unsigned long m_Offset; + unsigned long m_Size; + + static const size_t REROUTE_MESSAGE_SIZE = sizeof(MessageType) + sizeof(int) + + sizeof(int) + sizeof(unsigned long) + sizeof(unsigned long) + sizeof(unsigned long); +}; + +} // end namespace helper +} // end namespace adios2 + +#endif /* ADIOS2_HELPER_ADIOSREROUTING_H_ */ diff --git a/testing/adios2/engine/bp/TestBPRerouteMessage.cpp b/testing/adios2/engine/bp/TestBPRerouteMessage.cpp new file mode 100644 index 0000000000..c4c2b1c916 --- /dev/null +++ b/testing/adios2/engine/bp/TestBPRerouteMessage.cpp @@ -0,0 +1,108 @@ +/* + * Distributed under the OSI-approved Apache License, Version 2.0. See + * accompanying file Copyright.txt for details. + */ + +#include +#include +#include +#include +#include +#include +#include + +using namespace adios2; + +namespace +{ +int worldRank, worldSize; + +void SendAndReceiveMessage(helper::Comm &comm, int destRank, int srcRank) +{ + std::cout << "Sending to " << destRank << " and expecting to receive from: " << srcRank << std::endl; + + // Send a message to another rank + adios2::helper::RerouteMessage origMsg; + origMsg.m_MsgType = adios2::helper::RerouteMessage::MessageType::WRITER_IDLE; + origMsg.m_SrcRank = worldRank; + origMsg.m_DestRank = destRank; + origMsg.m_Offset = 2138; + origMsg.m_Size = 1213; + origMsg.SendTo(comm, destRank); + + // Receive a message from another (any) rank + adios2::helper::RerouteMessage receivedMsg; + receivedMsg.RecvFrom(comm, static_cast(helper::Comm::Constants::CommRecvAny)); + + ASSERT_EQ(receivedMsg.m_MsgType, origMsg.m_MsgType); + ASSERT_EQ(receivedMsg.m_SrcRank, srcRank); + ASSERT_EQ(receivedMsg.m_DestRank, worldRank); + ASSERT_EQ(receivedMsg.m_Offset, origMsg.m_Offset); + ASSERT_EQ(receivedMsg.m_Size, origMsg.m_Size); +} +} + +class RerouteTest : public ::testing::Test +{ +public: + RerouteTest() = default; +}; + + +TEST_F(RerouteTest, TestMessageBuffer) +{ + adios2::helper::RerouteMessage origMsg; + + origMsg.m_MsgType = adios2::helper::RerouteMessage::MessageType::WRITER_IDLE; + origMsg.m_SrcRank = 3; + origMsg.m_DestRank = 0; + origMsg.m_SubStreamIdx = 1; + origMsg.m_Offset = 2138; + origMsg.m_Size = 1213; + + std::vector buf; + origMsg.ToBuffer(buf); + + adios2::helper::RerouteMessage newMsg; + newMsg.FromBuffer(buf); + + ASSERT_EQ(newMsg.m_MsgType, origMsg.m_MsgType); + ASSERT_EQ(newMsg.m_SrcRank, origMsg.m_SrcRank); + ASSERT_EQ(newMsg.m_DestRank, origMsg.m_DestRank); + ASSERT_EQ(newMsg.m_SubStreamIdx, origMsg.m_SubStreamIdx); + ASSERT_EQ(newMsg.m_Offset, origMsg.m_Offset); + ASSERT_EQ(newMsg.m_Size, origMsg.m_Size); +} + +TEST_F(RerouteTest, TestSendReceiveRoundRobin) +{ + helper::Comm comm = adios2::helper::CommDupMPI(MPI_COMM_WORLD); + + int destRank = worldRank >= worldSize - 1 ? 0 : worldRank + 1; + int expectedSender = worldRank <= 0 ? worldSize - 1 : worldRank - 1; + + SendAndReceiveMessage(comm, destRank, expectedSender); +} + +TEST_F(RerouteTest, TestSendReceiveSelf) +{ + helper::Comm comm = adios2::helper::CommDupMPI(MPI_COMM_WORLD); + + int destRank = worldRank; + int expectedSender = worldRank; + + SendAndReceiveMessage(comm, destRank, expectedSender); +} + +int main(int argc, char **argv) +{ + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &worldRank); + MPI_Comm_size(MPI_COMM_WORLD, &worldSize); + ::testing::InitGoogleTest(&argc, argv); + + int result = RUN_ALL_TESTS(); + + MPI_Finalize(); + return result; +} From 624cb905a543e4015bf431589aa4534df0024493 Mon Sep 17 00:00:00 2001 From: Scott Wittenburg Date: Fri, 26 Sep 2025 15:46:09 -0600 Subject: [PATCH 04/37] Start working on the rerouting code path --- source/adios2/CMakeLists.txt | 1 + source/adios2/engine/bp5/BP5Engine.h | 1 + source/adios2/engine/bp5/BP5Writer.cpp | 11 ++++- source/adios2/engine/bp5/BP5Writer.h | 1 + .../engine/bp5/BP5Writer_WithRerouting.cpp | 41 +++++++++++++++++++ testing/adios2/engine/bp/CMakeLists.txt | 5 +++ 6 files changed, 59 insertions(+), 1 deletion(-) create mode 100644 source/adios2/engine/bp5/BP5Writer_WithRerouting.cpp diff --git a/source/adios2/CMakeLists.txt b/source/adios2/CMakeLists.txt index 2db06ff410..542085b050 100644 --- a/source/adios2/CMakeLists.txt +++ b/source/adios2/CMakeLists.txt @@ -58,6 +58,7 @@ add_library(adios2_core engine/bp5/BP5Writer.tcc engine/bp5/BP5Writer_TwoLevelShm_Async.cpp engine/bp5/BP5Writer_TwoLevelShm.cpp + engine/bp5/BP5Writer_WithRerouting.cpp engine/timeseries/TimeSeriesReader.cpp engine/timeseries/TimeSeriesReader.tcc diff --git a/source/adios2/engine/bp5/BP5Engine.h b/source/adios2/engine/bp5/BP5Engine.h index 1f72f10a0a..3c1bbe81a4 100644 --- a/source/adios2/engine/bp5/BP5Engine.h +++ b/source/adios2/engine/bp5/BP5Engine.h @@ -151,6 +151,7 @@ class BP5Engine MACRO(DirectIOAlignOffset, UInt, unsigned int, 512) \ MACRO(DirectIOAlignBuffer, UInt, unsigned int, 0) \ MACRO(AggregationType, AggregationType, int, (int)AggregationType::TwoLevelShm) \ + MACRO(EnableWriterRerouting, Bool, bool, false) \ MACRO(AsyncOpen, Bool, bool, true) \ MACRO(AsyncWrite, AsyncWrite, int, (int)AsyncWrite::Sync) \ MACRO(GrowthFactor, Float, float, DefaultBufferGrowthFactor) \ diff --git a/source/adios2/engine/bp5/BP5Writer.cpp b/source/adios2/engine/bp5/BP5Writer.cpp index 54c59e15fe..62c231b6df 100644 --- a/source/adios2/engine/bp5/BP5Writer.cpp +++ b/source/adios2/engine/bp5/BP5Writer.cpp @@ -391,7 +391,16 @@ void BP5Writer::WriteData(format::BufferV *Data) break; case (int)AggregationType::EveryoneWritesSerial: case (int)AggregationType::DataSizeBased: - WriteData_EveryoneWrites(Data, true); + // For rerouting to be useful, there must be multiple writers sending + // data to multiple subfiles. + if (m_Parameters.EnableWriterRerouting && m_Comm.Size() > 1 && m_Aggregator->m_SubStreams > 1) + { + WriteData_WithRerouting(Data); + } + else + { + WriteData_EveryoneWrites(Data, true); + } break; case (int)AggregationType::TwoLevelShm: WriteData_TwoLevelShm(Data); diff --git a/source/adios2/engine/bp5/BP5Writer.h b/source/adios2/engine/bp5/BP5Writer.h index 0f18356422..b61bd0fa2d 100644 --- a/source/adios2/engine/bp5/BP5Writer.h +++ b/source/adios2/engine/bp5/BP5Writer.h @@ -197,6 +197,7 @@ class BP5Writer : public BP5Engine, public core::Engine /** Write Data to disk, in an aggregator chain */ void WriteData(format::BufferV *Data); void WriteData_EveryoneWrites(format::BufferV *Data, bool SerializedWriters); + void WriteData_WithRerouting(format::BufferV *Data); void WriteData_EveryoneWrites_Async(format::BufferV *Data, bool SerializedWriters); void WriteData_TwoLevelShm(format::BufferV *Data); void WriteData_TwoLevelShm_Async(format::BufferV *Data); diff --git a/source/adios2/engine/bp5/BP5Writer_WithRerouting.cpp b/source/adios2/engine/bp5/BP5Writer_WithRerouting.cpp new file mode 100644 index 0000000000..30ab9e8b66 --- /dev/null +++ b/source/adios2/engine/bp5/BP5Writer_WithRerouting.cpp @@ -0,0 +1,41 @@ +/* + * Distributed under the OSI-approved Apache License, Version 2.0. See + * accompanying file Copyright.txt for details. + * + * BP5Writer.cpp + * + */ + +#include "BP5Writer.h" + +#include "adios2/common/ADIOSMacros.h" +#include "adios2/core/IO.h" +#include "adios2/helper/adiosFunctions.h" //CheckIndexRange +#include "adios2/toolkit/format/buffer/chunk/ChunkV.h" +#include "adios2/toolkit/format/buffer/malloc/MallocV.h" +#include "adios2/toolkit/transport/file/FileFStream.h" +#include + +#include // max +#include +#include + +namespace adios2 +{ +namespace core +{ +namespace engine +{ + +using namespace adios2::format; + + + +void BP5Writer::WriteData_WithRerouting(format::BufferV *Data) +{ + +} + +} // end namespace engine +} // end namespace core +} // end namespace adios2 diff --git a/testing/adios2/engine/bp/CMakeLists.txt b/testing/adios2/engine/bp/CMakeLists.txt index 840817518d..2d8bd6ed6a 100644 --- a/testing/adios2/engine/bp/CMakeLists.txt +++ b/testing/adios2/engine/bp/CMakeLists.txt @@ -247,6 +247,11 @@ macro(bp5agg_gtest_add_tests testname mpi) WORKING_DIRECTORY ${BP5_DIR} EXTRA_ARGS "BP5" "DataSizeBased") endmacro() +if (ADIOS2_HAVE_MPI) + gtest_add_tests_helper(RerouteMessage MPI_ONLY BP Engine.BPReroute. .BP5.RR + WORKING_DIRECTORY ${BP5_DIR} EXTRA_ARGS "BP5") +endif() + # BP5 only for now, so test all the aggregators while we're at it bp5agg_gtest_add_tests(ParameterSelectSteps MPI_ALLOW) bp5agg_gtest_add_tests(AppendAfterSteps MPI_ALLOW) From afdc1c1e98933dcee55eb2e22afafec16182ce6c Mon Sep 17 00:00:00 2001 From: Scott Wittenburg Date: Wed, 1 Oct 2025 19:00:50 -0600 Subject: [PATCH 05/37] Work on rerouting aggregator Just trying to get a threaded communication scheme going, without the actual rerouting to start (just have every group write to the expected file). Add some api to adiosComm to support the Probe/Iprobe api and make sure it's working by using it the test. --- source/adios2/engine/bp5/BP5Writer.cpp | 26 ++-- source/adios2/engine/bp5/BP5Writer.h | 6 + .../engine/bp5/BP5Writer_WithRerouting.cpp | 118 ++++++++++++++++++ source/adios2/helper/adiosComm.cpp | 28 +++++ source/adios2/helper/adiosComm.h | 8 ++ source/adios2/helper/adiosCommDummy.cpp | 16 +++ source/adios2/helper/adiosCommMPI.cpp | 24 ++++ source/adios2/helper/adiosRerouting.h | 2 + testing/adios2/engine/bp/CMakeLists.txt | 5 +- .../engine/bp/TestBPDataSizeAggregate.cpp | 16 ++- .../adios2/engine/bp/TestBPRerouteMessage.cpp | 7 ++ 11 files changed, 241 insertions(+), 15 deletions(-) diff --git a/source/adios2/engine/bp5/BP5Writer.cpp b/source/adios2/engine/bp5/BP5Writer.cpp index 62c231b6df..9bfba6288a 100644 --- a/source/adios2/engine/bp5/BP5Writer.cpp +++ b/source/adios2/engine/bp5/BP5Writer.cpp @@ -390,7 +390,20 @@ void BP5Writer::WriteData(format::BufferV *Data) WriteData_EveryoneWrites(Data, false); break; case (int)AggregationType::EveryoneWritesSerial: + WriteData_EveryoneWrites(Data, true); + break; case (int)AggregationType::DataSizeBased: + // First initialize aggregator and transports if we haven't done it yet this step + if (!m_AggregatorInitializedThisStep) + { + // We can't allow ranks to change subfiles between calls to Put(), so we only + // do this initialization once per timestep. Consequently, partition decision + // could be based on incomplete step data. + InitAggregator(Data->Size()); + InitTransports(); + m_AggregatorInitializedThisStep = true; + } + // For rerouting to be useful, there must be multiple writers sending // data to multiple subfiles. if (m_Parameters.EnableWriterRerouting && m_Comm.Size() > 1 && m_Aggregator->m_SubStreams > 1) @@ -419,19 +432,6 @@ void BP5Writer::WriteData(format::BufferV *Data) void BP5Writer::WriteData_EveryoneWrites(format::BufferV *Data, bool SerializedWriters) { - if (m_Parameters.AggregationType == (int)AggregationType::DataSizeBased) - { - if (!m_AggregatorInitializedThisStep) - { - // We can't allow ranks to change subfiles between calls to Put(), so we only - // do this initialization once per timestep. Consequently, partition decision - // could be based on incomplete step data. - InitAggregator(Data->Size()); - InitTransports(); - m_AggregatorInitializedThisStep = true; - } - } - const aggregator::MPIChain *a = dynamic_cast(m_Aggregator); // new step writing starts at offset m_DataPos on aggregator diff --git a/source/adios2/engine/bp5/BP5Writer.h b/source/adios2/engine/bp5/BP5Writer.h index b61bd0fa2d..97518bfb58 100644 --- a/source/adios2/engine/bp5/BP5Writer.h +++ b/source/adios2/engine/bp5/BP5Writer.h @@ -304,6 +304,12 @@ class BP5Writer : public BP5Engine, public core::Engine */ uint64_t CountStepsInMetadataIndex(format::BufferSTL &bufferSTL); + // Thread function for inter-rank communication when rerouting aggregation + // is enabled + void ReroutingCommunicationLoop(); + size_t m_TargetIndex; + std::atomic m_readyToWrite; + /* Async write's future */ std::future m_WriteFuture; // variables to delay writing to index file diff --git a/source/adios2/engine/bp5/BP5Writer_WithRerouting.cpp b/source/adios2/engine/bp5/BP5Writer_WithRerouting.cpp index 30ab9e8b66..d35ede70ed 100644 --- a/source/adios2/engine/bp5/BP5Writer_WithRerouting.cpp +++ b/source/adios2/engine/bp5/BP5Writer_WithRerouting.cpp @@ -19,6 +19,8 @@ #include // max #include #include +#include +#include namespace adios2 { @@ -28,12 +30,128 @@ namespace engine { using namespace adios2::format; +using namespace adios2::helper; +void BP5Writer::ReroutingCommunicationLoop() +{ + // Sleep for a random amount of time between 0 and 6 seconds + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_int_distribution<> distrib(0, 6000); + int sleepMillis = distrib(gen); + std::this_thread::sleep_for(std::chrono::milliseconds(sleepMillis)); + + int subCoord = m_Aggregator->m_AggregatorRank; + std::queue writerQueue; + int writingRank = -1; + uint64_t currentFilePos; + + // Now start the + std::cout << " Rank " << m_Comm.Rank() << " ReroutingCommunicationLoop()" << std::endl; + std::cout << " SC: " << subCoord << std::endl; + std::cout << " GC: " << m_Comm.Size() - 1 << std::endl; + std::cout << " subfile index: " << m_Aggregator->m_SubStreamIndex << std::endl; + std::cout << " total subfiles: " << m_Aggregator->m_SubStreams << std::endl; + + if (m_Rank == subCoord && m_DataPosShared == true) + { + // We are a subcoordinator and have shared data pos after a previous timestep, + // we should update our notion of m_DataPos + currentFilePos = m_SubstreamDataPos[a->m_SubStreamIndex]; + m_DataPosShared = false; + } + + // align to PAGE_SIZE + m_DataPos += helper::PaddingToAlignOffset(m_DataPos, m_Parameters.StripeSize); + m_StartDataPos = m_DataPos; + + // First send a message to the SC to get added to their writing queue + adios2::helper::RerouteMessage submitMsg; + submitMsg.m_MsgType = RerouteMessage::MessageType::WRITE_SUBMISSION; + submitMsg.m_SrcRank = m_RankMPI; + submitMsg.m_DestRank = subCoord; + submitMsg.SendTo(m_Comm, subCoord); + + while (true) + { + int msgReady = 0; + + m_Comm.Iprobe(static_cast(helper::Comm::Constants::CommRecvAny), 0, &msgReady); + + if (msgReady) + { + // If there is a message ready, receive and handle it + adios2::helper::RerouteMessage message; + message.RecvFrom(m_Comm, static_cast(helper::Comm::Constants::CommRecvAny)); + + switch (message.m_MsgType) + { + case (int)RerouteMessage::MessageType::WRITE_SUBMISSION: + writerQueue.push(message.m_SrcRank); + break; + case (int)RerouteMessage::MessageType::DO_WRITE: + m_TargetIndex = message.m_SubStreamIdx; + m_DataPos = message.m_Offset; + break; + default: + break; + } + } + + // Check if writing has finished, and alert the target SC + // Check if anyone is writing right now, and if not, ask the next writer to start + if (writingRank == -1) + { + if (!writerQueue.empty()) + { + int nextWriter = writerQueue.pop(); + adios2::helper::RerouteMessage writeMsg; + writeMsg.m_MsgType = adios2::helper::RerouteMessage::MessageType::DO_WRITE; + writeMsg.m_SrcRank = m_RankMPI; + writeMsg.m_DestRank = nextWriter; + writeMsg.m_SubStreamIdx = m_Aggregator->m_SubStreamIndex; + writeMsg.m_Offset = currentFilePos; + writeMsg.SendTo(comm, nextWriter); + writingRank = nextWriter; + } + + } + } + + + +} void BP5Writer::WriteData_WithRerouting(format::BufferV *Data) { + // - start the communcation loop running in a thread + // - begin polling the write barrier variable, once it flips. write: + // - variables set by comm thread indicate subfile and offset + // - attempt to join the comm thread + // - return + + // The communcation function: + // + // - Send a message to your SC, requesting to write + // - enter the message loop where you: + // - check if a message is ready, and if so: + // - receive the message + // - handle the message, based on your role and the sender + // - if no message is ready, check if the writing is finished, and if so: + // - send a message to SC telling them: + // - you're done + // - the new offset (or the amount you wrote) + + m_readyToWrite = false; + + std::thread commThread(&BP5Writer::ReroutingCommunicationLoop, this); + + std::cout << "Background thread for rank " << m_RankMPI << " is now running" << std::endl; + + commThread.join(); + std::cout << "Background thread for rank " << m_RankMPI << " is now finished" << std::endl; } } // end namespace engine diff --git a/source/adios2/helper/adiosComm.cpp b/source/adios2/helper/adiosComm.cpp index 9109a35212..1aa970e755 100644 --- a/source/adios2/helper/adiosComm.cpp +++ b/source/adios2/helper/adiosComm.cpp @@ -91,6 +91,34 @@ std::string Comm::BroadcastFile(const std::string &fileName, const std::string h return fileContents; } +Comm::Status Comm::Probe(int source, int tag, const std::string &hint) const +{ + if (source < 0 || source > m_Impl->Size() - 1) + { + if (source != static_cast(Comm::Constants::CommRecvAny)) + { + throw std::runtime_error( + "Invalid MPI source rank in Probe: " + std::to_string(source) + + " for a communicator of size " + std::to_string(m_Impl->Size())); + } + } + return m_Impl->Probe(source, tag, hint); +} + +Comm::Status Comm::Iprobe(int source, int tag, int *flag, const std::string &hint) const +{ + if (source < 0 || source > m_Impl->Size() - 1) + { + if (source != static_cast(Comm::Constants::CommRecvAny)) + { + throw std::runtime_error( + "Invalid MPI source rank in Iprobe: " + std::to_string(source) + + " for a communicator of size " + std::to_string(m_Impl->Size())); + } + } + return m_Impl->Iprobe(source, tag, flag, hint); +} + std::vector Comm::GetGathervDisplacements(const size_t *counts, const size_t countsSize) { std::vector displacements(countsSize); diff --git a/source/adios2/helper/adiosComm.h b/source/adios2/helper/adiosComm.h index cd2f7a1865..86f5cdf65e 100644 --- a/source/adios2/helper/adiosComm.h +++ b/source/adios2/helper/adiosComm.h @@ -263,6 +263,10 @@ class Comm Req Irecv(T *buffer, const size_t count, int source, int tag, const std::string &hint = std::string()) const; + Status Probe(int source, int tag, const std::string &hint = std::string()) const; + + Status Iprobe(int source, int tag, int *flag, const std::string &hint = std::string()) const; + Win Win_allocate_shared(size_t size, int disp_unit, void *baseptr, const std::string &hint = std::string()); int Win_shared_query(Win &win, int rank, size_t *size, int *disp_unit, void *baseptr, @@ -493,6 +497,10 @@ class CommImpl virtual Comm::Req Irecv(void *buffer, size_t count, Datatype datatype, int source, int tag, const std::string &hint) const = 0; + virtual Comm::Status Probe(int source, int tag, const std::string &hint) const = 0; + + virtual Comm::Status Iprobe(int source, int tag, int *flag, const std::string &hint) const = 0; + virtual Comm::Win Win_allocate_shared(size_t size, int disp_unit, void *baseptr, const std::string &hint) const = 0; virtual int Win_shared_query(Comm::Win &win, int rank, size_t *size, int *disp_unit, diff --git a/source/adios2/helper/adiosCommDummy.cpp b/source/adios2/helper/adiosCommDummy.cpp index f7453ca5a7..af03911f32 100644 --- a/source/adios2/helper/adiosCommDummy.cpp +++ b/source/adios2/helper/adiosCommDummy.cpp @@ -111,6 +111,10 @@ class CommImplDummy : public CommImpl Comm::Req Irecv(void *buffer, size_t count, Datatype datatype, int source, int tag, const std::string &hint) const override; + Comm::Status Probe(int source, int tag, const std::string &hint) const override; + + Comm::Status Iprobe(int source, int tag, int *flag, const std::string &hint) const override; + Comm::Win Win_allocate_shared(size_t size, int disp_unit, void *baseptr, const std::string &hint) const override; int Win_shared_query(Comm::Win &win, int rank, size_t *size, int *disp_unit, void *baseptr, @@ -280,6 +284,18 @@ Comm::Req CommImplDummy::Irecv(void *, size_t, Datatype, int, int, const std::st return MakeReq(std::move(req)); } +Comm::Status CommImplDummy::Probe(int source, int tag, const std::string &hint) const +{ + Comm::Status status; + return status; +} + +Comm::Status CommImplDummy::Iprobe(int source, int tag, int *flag, const std::string &hint) const +{ + Comm::Status status; + return status; +} + Comm::Win CommImplDummy::Win_allocate_shared(size_t size, int disp_unit, void *baseptr, const std::string &) const { diff --git a/source/adios2/helper/adiosCommMPI.cpp b/source/adios2/helper/adiosCommMPI.cpp index 09933af786..2499cd8c86 100644 --- a/source/adios2/helper/adiosCommMPI.cpp +++ b/source/adios2/helper/adiosCommMPI.cpp @@ -202,6 +202,10 @@ class CommImplMPI : public CommImpl Comm::Req Irecv(void *buffer, size_t count, Datatype datatype, int source, int tag, const std::string &hint) const override; + Comm::Status Probe(int source, int tag, const std::string &hint) const override; + + Comm::Status Iprobe(int source, int tag, int *flag, const std::string &hint) const override; + Comm::Win Win_allocate_shared(size_t size, int disp_unit, void *baseptr, const std::string &hint) const override; int Win_shared_query(Comm::Win &win, int rank, size_t *size, int *disp_unit, void *baseptr, @@ -513,6 +517,26 @@ Comm::Req CommImplMPI::Irecv(void *buffer, size_t count, Datatype datatype, int return MakeReq(std::move(req)); } +Comm::Status CommImplMPI::Probe(int source, int tag, const std::string &hint) const +{ + MPI_Status mpiStatus; + CheckMPIReturn(MPI_Probe(GetMPISource(source), tag, m_MPIComm, &mpiStatus), hint); + Comm::Status status; + status.Source = mpiStatus.MPI_SOURCE; + status.Tag = mpiStatus.MPI_TAG; + return status; +} + +Comm::Status CommImplMPI::Iprobe(int source, int tag, int *flag, const std::string &hint) const +{ + MPI_Status mpiStatus; + CheckMPIReturn(MPI_Iprobe(GetMPISource(source), tag, m_MPIComm, flag, &mpiStatus), hint); + Comm::Status status; + status.Source = mpiStatus.MPI_SOURCE; + status.Tag = mpiStatus.MPI_TAG; + return status; +} + Comm::Win CommImplMPI::Win_allocate_shared(size_t size, int disp_unit, void *baseptr, const std::string &hint) const { diff --git a/source/adios2/helper/adiosRerouting.h b/source/adios2/helper/adiosRerouting.h index 26ffeb6048..302d2c44b0 100644 --- a/source/adios2/helper/adiosRerouting.h +++ b/source/adios2/helper/adiosRerouting.h @@ -30,7 +30,9 @@ class RerouteMessage public: enum class MessageType { + DO_WRITE, WRITER_IDLE, + WRITE_SUBMISSION, REROUTE_REQUEST, REROUTE_ACK, REROUTE_REJECT, diff --git a/testing/adios2/engine/bp/CMakeLists.txt b/testing/adios2/engine/bp/CMakeLists.txt index 2d8bd6ed6a..6acb762ed4 100644 --- a/testing/adios2/engine/bp/CMakeLists.txt +++ b/testing/adios2/engine/bp/CMakeLists.txt @@ -126,10 +126,13 @@ bp_gtest_add_tests_helper(LargeMetadata MPI_ALLOW) bp5_gtest_add_tests_helper(WriteStatsOnly MPI_ALLOW) if (ADIOS2_HAVE_MPI) - # Extra arguments: aggregation type, num subfiles, num timesteps, verbose level + # Extra arguments: aggregation type, num subfiles, num timesteps, verbose level, rerouting? gtest_add_tests_helper(DataSizeAggregate MPI_ONLY BP Engine.BP. .BP5.DSB WORKING_DIRECTORY ${BP5_DIR} EXTRA_ARGS "DataSizeBased" "3" "5" "2" ) + gtest_add_tests_helper(DataSizeAggregate MPI_ONLY BP Engine.BP. .BP5.DSB.Rerouting + WORKING_DIRECTORY ${BP5_DIR} EXTRA_ARGS "DataSizeBased" "2" "1" "5" "WithRerouting" + ) endif() set(BP5LargeMeta "Engine.BP.BPLargeMetadata.BPWrite1D_LargeMetadata.BP5.Serial") diff --git a/testing/adios2/engine/bp/TestBPDataSizeAggregate.cpp b/testing/adios2/engine/bp/TestBPDataSizeAggregate.cpp index f47ea254ff..1054e5f12f 100644 --- a/testing/adios2/engine/bp/TestBPDataSizeAggregate.cpp +++ b/testing/adios2/engine/bp/TestBPDataSizeAggregate.cpp @@ -18,7 +18,8 @@ uint64_t nSteps = 1; std::string aggregationType = "DataSizeBased"; // comes from command line std::string numberOfSubFiles = "2"; // comes from command line std::string numberOfSteps = "1"; // comes from command line -std::string verbose = "0"; +std::string verbose = "0"; // comes from command line +bool rerouting = false; // comes from command line uint64_t sumFirstN(const std::vector &vec, uint64_t n) { @@ -100,6 +101,11 @@ TEST_F(DSATest, TestWriteUnbalancedData) bpIO.SetParameter("NumSubFiles", numberOfSubFiles); bpIO.SetParameter("verbose", verbose); + if (rerouting) + { + bpIO.SetParameter("EnableWriterRerouting", "true"); + } + adios2::Variable varGlobalArray = bpIO.DefineVariable("GlobalArray", {globalNx, globalNy}); EXPECT_TRUE(varGlobalArray); @@ -214,6 +220,14 @@ int main(int argc, char **argv) { verbose = std::string(argv[4]); } + if (argc > 5) + { + std::string lastArg = std::string(argv[5]); + if (lastArg.compare("WithRerouting") == 0) + { + rerouting = true; + } + } try { diff --git a/testing/adios2/engine/bp/TestBPRerouteMessage.cpp b/testing/adios2/engine/bp/TestBPRerouteMessage.cpp index c4c2b1c916..0a65719e2a 100644 --- a/testing/adios2/engine/bp/TestBPRerouteMessage.cpp +++ b/testing/adios2/engine/bp/TestBPRerouteMessage.cpp @@ -30,6 +30,13 @@ void SendAndReceiveMessage(helper::Comm &comm, int destRank, int srcRank) origMsg.m_Size = 1213; origMsg.SendTo(comm, destRank); + int ready = 0; + + while (!ready) + { + comm.Iprobe(static_cast(helper::Comm::Constants::CommRecvAny), 0, &ready); + } + // Receive a message from another (any) rank adios2::helper::RerouteMessage receivedMsg; receivedMsg.RecvFrom(comm, static_cast(helper::Comm::Constants::CommRecvAny)); From ac74c54878430ce65d3ffccdfb7b4159dfb91cca Mon Sep 17 00:00:00 2001 From: Scott Wittenburg Date: Thu, 2 Oct 2025 16:55:41 -0600 Subject: [PATCH 06/37] Test passes except when it hangs, no rerouting yet --- source/adios2/engine/bp5/BP5Writer.h | 7 +- .../engine/bp5/BP5Writer_WithRerouting.cpp | 131 +++++++++++++----- source/adios2/helper/adiosCommMPI.cpp | 1 - source/adios2/helper/adiosRerouting.cpp | 12 ++ source/adios2/helper/adiosRerouting.h | 23 +++ 5 files changed, 135 insertions(+), 39 deletions(-) diff --git a/source/adios2/engine/bp5/BP5Writer.h b/source/adios2/engine/bp5/BP5Writer.h index 97518bfb58..d9675f2ede 100644 --- a/source/adios2/engine/bp5/BP5Writer.h +++ b/source/adios2/engine/bp5/BP5Writer.h @@ -308,7 +308,12 @@ class BP5Writer : public BP5Engine, public core::Engine // is enabled void ReroutingCommunicationLoop(); size_t m_TargetIndex; - std::atomic m_readyToWrite; + int m_TargetCoordinator; + std::mutex m_WriteMutex; + std::mutex m_NotifMutex; + std::condition_variable m_WriteCV; + bool m_ReadyToWrite; + bool m_FinishedWriting; /* Async write's future */ std::future m_WriteFuture; diff --git a/source/adios2/engine/bp5/BP5Writer_WithRerouting.cpp b/source/adios2/engine/bp5/BP5Writer_WithRerouting.cpp index d35ede70ed..0a14321e4a 100644 --- a/source/adios2/engine/bp5/BP5Writer_WithRerouting.cpp +++ b/source/adios2/engine/bp5/BP5Writer_WithRerouting.cpp @@ -10,16 +10,15 @@ #include "adios2/common/ADIOSMacros.h" #include "adios2/core/IO.h" +#include "adios2/helper/adiosRerouting.h" #include "adios2/helper/adiosFunctions.h" //CheckIndexRange #include "adios2/toolkit/format/buffer/chunk/ChunkV.h" #include "adios2/toolkit/format/buffer/malloc/MallocV.h" #include "adios2/toolkit/transport/file/FileFStream.h" #include -#include // max #include #include -#include #include namespace adios2 @@ -30,21 +29,20 @@ namespace engine { using namespace adios2::format; -using namespace adios2::helper; void BP5Writer::ReroutingCommunicationLoop() { - // Sleep for a random amount of time between 0 and 6 seconds - std::random_device rd; - std::mt19937 gen(rd()); - std::uniform_int_distribution<> distrib(0, 6000); - int sleepMillis = distrib(gen); - std::this_thread::sleep_for(std::chrono::milliseconds(sleepMillis)); + using RerouteMessage = typename adios2::helper::RerouteMessage; int subCoord = m_Aggregator->m_AggregatorRank; + bool iAmSubCoord = m_RankMPI == subCoord; + // Arbitrarily make the last rank the global coordinator + // TODO: should the global coordinator role be assigned to a subcoordinator? + bool iAmGlobalCoord = m_RankMPI == m_Comm.Size() - 1; std::queue writerQueue; int writingRank = -1; - uint64_t currentFilePos; + uint64_t currentFilePos = 0; + bool sentFinished = false; // Now start the std::cout << " Rank " << m_Comm.Rank() << " ReroutingCommunicationLoop()" << std::endl; @@ -53,20 +51,16 @@ void BP5Writer::ReroutingCommunicationLoop() std::cout << " subfile index: " << m_Aggregator->m_SubStreamIndex << std::endl; std::cout << " total subfiles: " << m_Aggregator->m_SubStreams << std::endl; - if (m_Rank == subCoord && m_DataPosShared == true) + if (iAmSubCoord && m_DataPosShared == true) { // We are a subcoordinator and have shared data pos after a previous timestep, // we should update our notion of m_DataPos - currentFilePos = m_SubstreamDataPos[a->m_SubStreamIndex]; + currentFilePos = m_SubstreamDataPos[m_Aggregator->m_SubStreamIndex]; m_DataPosShared = false; } - // align to PAGE_SIZE - m_DataPos += helper::PaddingToAlignOffset(m_DataPos, m_Parameters.StripeSize); - m_StartDataPos = m_DataPos; - // First send a message to the SC to get added to their writing queue - adios2::helper::RerouteMessage submitMsg; + RerouteMessage submitMsg; submitMsg.m_MsgType = RerouteMessage::MessageType::WRITE_SUBMISSION; submitMsg.m_SrcRank = m_RankMPI; submitMsg.m_DestRank = subCoord; @@ -75,52 +69,89 @@ void BP5Writer::ReroutingCommunicationLoop() while (true) { int msgReady = 0; + helper::Comm::Status status = m_Comm.Iprobe( + static_cast(helper::Comm::Constants::CommRecvAny), 0, &msgReady); - m_Comm.Iprobe(static_cast(helper::Comm::Constants::CommRecvAny), 0, &msgReady); - - if (msgReady) + // If there is a message ready, receive and handle it + // if (msgReady) + while (msgReady) { - // If there is a message ready, receive and handle it - adios2::helper::RerouteMessage message; - message.RecvFrom(m_Comm, static_cast(helper::Comm::Constants::CommRecvAny)); + RerouteMessage message; + message.RecvFrom(m_Comm, status.Source); - switch (message.m_MsgType) + switch ((RerouteMessage::MessageType) message.m_MsgType) { - case (int)RerouteMessage::MessageType::WRITE_SUBMISSION: + case RerouteMessage::MessageType::WRITE_SUBMISSION: writerQueue.push(message.m_SrcRank); break; - case (int)RerouteMessage::MessageType::DO_WRITE: - m_TargetIndex = message.m_SubStreamIdx; - m_DataPos = message.m_Offset; + case RerouteMessage::MessageType::DO_WRITE: + { + std::unique_lock lck(m_WriteMutex); + m_TargetIndex = message.m_SubStreamIdx; + m_DataPos = message.m_Offset; + m_TargetCoordinator = message.m_SrcRank; + m_ReadyToWrite = true; + m_WriteCV.notify_one(); + } + break; + case RerouteMessage::MessageType::WRITE_COMPLETION: + currentFilePos = message.m_Offset; + writingRank = -1; break; default: break; } + + msgReady = 0; + status = m_Comm.Iprobe(static_cast(helper::Comm::Constants::CommRecvAny), 0, &msgReady); } // Check if writing has finished, and alert the target SC + { + std::lock_guard lck(m_NotifMutex); + if (m_FinishedWriting && !sentFinished) + { + adios2::helper::RerouteMessage writeCompleteMsg; + writeCompleteMsg.m_MsgType = RerouteMessage::MessageType::WRITE_COMPLETION; + writeCompleteMsg.m_SrcRank = m_RankMPI; + writeCompleteMsg.m_DestRank = m_TargetCoordinator; + writeCompleteMsg.m_SubStreamIdx = m_TargetIndex; + writeCompleteMsg.m_Offset = m_DataPos; + writeCompleteMsg.SendTo(m_Comm, m_TargetCoordinator); + sentFinished = true; + + if (!iAmSubCoord /*&& !iAmGlobalCoord*/ ) + { + // My only role was to write (no communication responsibility) so I am + // done at this point. + break; + } + } + } // Check if anyone is writing right now, and if not, ask the next writer to start - if (writingRank == -1) + if (iAmSubCoord && writingRank == -1) { if (!writerQueue.empty()) { - int nextWriter = writerQueue.pop(); + int nextWriter = writerQueue.front(); + writerQueue.pop(); adios2::helper::RerouteMessage writeMsg; - writeMsg.m_MsgType = adios2::helper::RerouteMessage::MessageType::DO_WRITE; + writeMsg.m_MsgType = RerouteMessage::MessageType::DO_WRITE; writeMsg.m_SrcRank = m_RankMPI; writeMsg.m_DestRank = nextWriter; writeMsg.m_SubStreamIdx = m_Aggregator->m_SubStreamIndex; writeMsg.m_Offset = currentFilePos; - writeMsg.SendTo(comm, nextWriter); writingRank = nextWriter; + writeMsg.SendTo(m_Comm, nextWriter); + } + else + { + // TODO: Send WRITE_IDLE to the GC instead of ending the loop here + break; } - } } - - - } void BP5Writer::WriteData_WithRerouting(format::BufferV *Data) @@ -143,12 +174,38 @@ void BP5Writer::WriteData_WithRerouting(format::BufferV *Data) // - you're done // - the new offset (or the amount you wrote) - m_readyToWrite = false; + m_ReadyToWrite = false; + m_FinishedWriting = false; std::thread commThread(&BP5Writer::ReroutingCommunicationLoop, this); std::cout << "Background thread for rank " << m_RankMPI << " is now running" << std::endl; + // wait until communication thread indicates it's our turn to write + { + std::unique_lock lck(m_WriteMutex); + m_WriteCV.wait(lck, [this]{ return m_ReadyToWrite; }); + } + + // Do the writing + + // align to PAGE_SIZE + m_DataPos += helper::PaddingToAlignOffset(m_DataPos, m_Parameters.StripeSize); + m_StartDataPos = m_DataPos; + + std::vector DataVec = Data->DataVec(); + // TODO: use m_TargetIndex here instead of being locked into aggregator's index + AggTransportData &aggData = m_AggregatorSpecifics.at(GetCacheKey(m_Aggregator)); + aggData.m_FileDataManager.WriteFileAt(DataVec.data(), DataVec.size(), m_StartDataPos); + + m_DataPos += Data->Size(); + + // Now signal the communication thread that this rank has finished writing + { + std::lock_guard lck(m_NotifMutex); + m_FinishedWriting = true; + } + commThread.join(); std::cout << "Background thread for rank " << m_RankMPI << " is now finished" << std::endl; diff --git a/source/adios2/helper/adiosCommMPI.cpp b/source/adios2/helper/adiosCommMPI.cpp index 2499cd8c86..fc3e4a852c 100644 --- a/source/adios2/helper/adiosCommMPI.cpp +++ b/source/adios2/helper/adiosCommMPI.cpp @@ -73,7 +73,6 @@ int GetMPISource(int source) { if (source == static_cast(Comm::Constants::CommRecvAny)) { - std::cout << "Source override" << std::endl; return MPI_ANY_SOURCE; } diff --git a/source/adios2/helper/adiosRerouting.cpp b/source/adios2/helper/adiosRerouting.cpp index db552069bc..f0e9c78c51 100644 --- a/source/adios2/helper/adiosRerouting.cpp +++ b/source/adios2/helper/adiosRerouting.cpp @@ -53,6 +53,9 @@ void RerouteMessage::FromBuffer(const std::vector &buffer) void RerouteMessage::SendTo(helper::Comm& comm, int destRank) { + std::cout << "Rank " << comm.Rank() << " sending " + << this->GetTypeString(this->m_MsgType) << " to " << destRank << std::endl; + std::vector sendBuf; this->ToBuffer(sendBuf); @@ -80,6 +83,15 @@ void RerouteMessage::RecvFrom(helper::Comm& comm, int srcRank) helper::Comm::Status status = comm.Recv(recvBuf.data(), REROUTE_MESSAGE_SIZE, srcRank, 0); this->FromBuffer(recvBuf); + + std::cout << "Rank " << comm.Rank() << " received " + << this->GetTypeString(this->m_MsgType) << " from " << status.Source + << std::endl; + + if (status.Source != this->m_SrcRank) + { + std::cout << " GAHHHHHHH!" << std::endl; + } } } // end namespace helper diff --git a/source/adios2/helper/adiosRerouting.h b/source/adios2/helper/adiosRerouting.h index 302d2c44b0..e36a3756c0 100644 --- a/source/adios2/helper/adiosRerouting.h +++ b/source/adios2/helper/adiosRerouting.h @@ -33,11 +33,34 @@ class RerouteMessage DO_WRITE, WRITER_IDLE, WRITE_SUBMISSION, + WRITE_COMPLETION, REROUTE_REQUEST, REROUTE_ACK, REROUTE_REJECT, }; + std::string GetTypeString(MessageType mtype) + { + switch (mtype) + { + case MessageType::DO_WRITE: + return std::string("DO_WRITE"); + break; + case MessageType::WRITER_IDLE: + return std::string("WRITER_IDLE"); + break; + case MessageType::WRITE_SUBMISSION: + return std::string("WRITE_SUBMISSION"); + break; + case MessageType::WRITE_COMPLETION: + return std::string("WRITE_COMPLETION"); + break; + default: + return std::string("UNKNOWN"); + break; + } + } + // Store a RerouteMsg so it can sent/received over mpi void ToBuffer(std::vector &buffer); From 2bc1d771e867e0b115a29288e857750b17324df5 Mon Sep 17 00:00:00 2001 From: Scott Wittenburg Date: Fri, 3 Oct 2025 11:00:17 -0600 Subject: [PATCH 07/37] Fix race condition Cause of race condition was that rank 0 would, maybe due to its extra pre-write duties, arrive to the party and send its WRITE_SUBMISSION msg after the SC had already finished writing and exited the comm loop. Since all ranks generate the complete partitioning information, we can make that available so that SCs can pre-populate their writer queues from that and do away with the WRITE_SUBMISSION message altogether. This reduces the total number of messages needed, as well as avoids the race condtion that caused intermittent hangs of the test. --- source/adios2/engine/bp5/BP5Writer.cpp | 6 +-- source/adios2/engine/bp5/BP5Writer.h | 1 + .../engine/bp5/BP5Writer_WithRerouting.cpp | 38 +++++++++---------- 3 files changed, 21 insertions(+), 24 deletions(-) diff --git a/source/adios2/engine/bp5/BP5Writer.cpp b/source/adios2/engine/bp5/BP5Writer.cpp index 9bfba6288a..29b6199e57 100644 --- a/source/adios2/engine/bp5/BP5Writer.cpp +++ b/source/adios2/engine/bp5/BP5Writer.cpp @@ -115,15 +115,15 @@ helper::RankPartition BP5Writer::GetPartitionInfo(const uint64_t rankDataSize, c m_Profiler.AddTimerWatch("PartitionRanks"); m_Profiler.Start("PartitionRanks"); - helper::Partitioning partitioning = helper::PartitionRanks(allsizes, numPartitions); + m_Partitioning = helper::PartitionRanks(allsizes, numPartitions); m_Profiler.Stop("PartitionRanks"); if (parentRank == 0 && m_Parameters.verbose > 0) { - partitioning.PrintSummary(); + m_Partitioning.PrintSummary(); } - return partitioning.FindPartition(parentRank); + return m_Partitioning.FindPartition(parentRank); } StepStatus BP5Writer::BeginStep(StepMode mode, const float timeoutSeconds) diff --git a/source/adios2/engine/bp5/BP5Writer.h b/source/adios2/engine/bp5/BP5Writer.h index d9675f2ede..76bdb9a882 100644 --- a/source/adios2/engine/bp5/BP5Writer.h +++ b/source/adios2/engine/bp5/BP5Writer.h @@ -79,6 +79,7 @@ class BP5Writer : public BP5Engine, public core::Engine helper::RankPartition GetPartitionInfo(const uint64_t rankDataSize, const int subStreams, helper::Comm const &parentComm); void OpenSubfile(const bool useComm = true); + helper::Partitioning m_Partitioning; /** Single object controlling BP buffering */ format::BP5Serializer m_BP5Serializer; diff --git a/source/adios2/engine/bp5/BP5Writer_WithRerouting.cpp b/source/adios2/engine/bp5/BP5Writer_WithRerouting.cpp index 0a14321e4a..5d4f3eb8c6 100644 --- a/source/adios2/engine/bp5/BP5Writer_WithRerouting.cpp +++ b/source/adios2/engine/bp5/BP5Writer_WithRerouting.cpp @@ -51,20 +51,23 @@ void BP5Writer::ReroutingCommunicationLoop() std::cout << " subfile index: " << m_Aggregator->m_SubStreamIndex << std::endl; std::cout << " total subfiles: " << m_Aggregator->m_SubStreams << std::endl; - if (iAmSubCoord && m_DataPosShared == true) + if (iAmSubCoord) { - // We are a subcoordinator and have shared data pos after a previous timestep, - // we should update our notion of m_DataPos - currentFilePos = m_SubstreamDataPos[m_Aggregator->m_SubStreamIndex]; - m_DataPosShared = false; - } + // Pre-populate my queue with the ranks in my group/partition + const std::vector &groupRanks = m_Partitioning.m_Partitions[m_Aggregator->m_SubStreamIndex]; + for (auto rank : groupRanks) + { + writerQueue.push(static_cast(rank)); + } - // First send a message to the SC to get added to their writing queue - RerouteMessage submitMsg; - submitMsg.m_MsgType = RerouteMessage::MessageType::WRITE_SUBMISSION; - submitMsg.m_SrcRank = m_RankMPI; - submitMsg.m_DestRank = subCoord; - submitMsg.SendTo(m_Comm, subCoord); + if (m_DataPosShared) + { + // We have shared data pos after a previous timestep, we should update our + // notion of m_DataPos + currentFilePos = m_SubstreamDataPos[m_Aggregator->m_SubStreamIndex]; + m_DataPosShared = false; + } + } while (true) { @@ -73,17 +76,13 @@ void BP5Writer::ReroutingCommunicationLoop() static_cast(helper::Comm::Constants::CommRecvAny), 0, &msgReady); // If there is a message ready, receive and handle it - // if (msgReady) - while (msgReady) + if (msgReady) { RerouteMessage message; message.RecvFrom(m_Comm, status.Source); switch ((RerouteMessage::MessageType) message.m_MsgType) { - case RerouteMessage::MessageType::WRITE_SUBMISSION: - writerQueue.push(message.m_SrcRank); - break; case RerouteMessage::MessageType::DO_WRITE: { std::unique_lock lck(m_WriteMutex); @@ -101,9 +100,6 @@ void BP5Writer::ReroutingCommunicationLoop() default: break; } - - msgReady = 0; - status = m_Comm.Iprobe(static_cast(helper::Comm::Constants::CommRecvAny), 0, &msgReady); } // Check if writing has finished, and alert the target SC @@ -157,7 +153,7 @@ void BP5Writer::ReroutingCommunicationLoop() void BP5Writer::WriteData_WithRerouting(format::BufferV *Data) { // - start the communcation loop running in a thread - // - begin polling the write barrier variable, once it flips. write: + // - wait to be signalled by the communication thread, then write: // - variables set by comm thread indicate subfile and offset // - attempt to join the comm thread // - return From bb4f632ca52c95bd900ca24661e0b50bdeccb03b Mon Sep 17 00:00:00 2001 From: Scott Wittenburg Date: Fri, 3 Oct 2025 15:48:13 -0600 Subject: [PATCH 08/37] Fix failing tests when using the new rerouting feature * Before leaving the comm loop, SC must update its m_DataPos from the variable that has been tracking it for its group, or else it shares the wrong file positions with the other ranks. This was causing all tests with more than 1 timestep to fail. * When SC comm thread first starts up, it needs to set the current file position variable from m_DataPos, which was updated correctly in append mode. That was causing tests that append to fail. --- source/adios2/engine/bp5/BP5Writer_WithRerouting.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/source/adios2/engine/bp5/BP5Writer_WithRerouting.cpp b/source/adios2/engine/bp5/BP5Writer_WithRerouting.cpp index 5d4f3eb8c6..a12df82f72 100644 --- a/source/adios2/engine/bp5/BP5Writer_WithRerouting.cpp +++ b/source/adios2/engine/bp5/BP5Writer_WithRerouting.cpp @@ -60,6 +60,8 @@ void BP5Writer::ReroutingCommunicationLoop() writerQueue.push(static_cast(rank)); } + currentFilePos = m_DataPos; + if (m_DataPosShared) { // We have shared data pos after a previous timestep, we should update our @@ -144,6 +146,7 @@ void BP5Writer::ReroutingCommunicationLoop() else { // TODO: Send WRITE_IDLE to the GC instead of ending the loop here + m_DataPos = currentFilePos; break; } } From bc221bb1d73d11448fe04f2105019c42372e32b1 Mon Sep 17 00:00:00 2001 From: Scott Wittenburg Date: Fri, 3 Oct 2025 16:32:28 -0600 Subject: [PATCH 09/37] Update bp5 only tests for rerouting, and try rerouting by default --- source/adios2/engine/bp5/BP5Engine.h | 4 +-- testing/adios2/engine/bp/CMakeLists.txt | 2 ++ .../engine/bp/TestBPAppendAfterSteps.cpp | 20 +++++++++++-- .../engine/bp/TestBPDataSizeAggregate.cpp | 12 ++++---- testing/adios2/engine/bp/TestBPDirectIO.cpp | 15 +++++++++- .../engine/bp/TestBPNewFileAppendMode.cpp | 15 ++++++++++ .../engine/bp/TestBPParameterSelectSteps.cpp | 30 +++++++++++++++---- .../engine/bp/TestBPReadMultithreaded.cpp | 19 ++++++++++-- .../engine/bp/TestBPStepsFileGlobalArray.cpp | 30 +++++++++++++++++-- 9 files changed, 124 insertions(+), 23 deletions(-) diff --git a/source/adios2/engine/bp5/BP5Engine.h b/source/adios2/engine/bp5/BP5Engine.h index 3c1bbe81a4..6d18689081 100644 --- a/source/adios2/engine/bp5/BP5Engine.h +++ b/source/adios2/engine/bp5/BP5Engine.h @@ -150,8 +150,8 @@ class BP5Engine MACRO(DirectIO, Bool, bool, false) \ MACRO(DirectIOAlignOffset, UInt, unsigned int, 512) \ MACRO(DirectIOAlignBuffer, UInt, unsigned int, 0) \ - MACRO(AggregationType, AggregationType, int, (int)AggregationType::TwoLevelShm) \ - MACRO(EnableWriterRerouting, Bool, bool, false) \ + MACRO(AggregationType, AggregationType, int, (int)AggregationType::DataSizeBased) \ + MACRO(EnableWriterRerouting, Bool, bool, true) \ MACRO(AsyncOpen, Bool, bool, true) \ MACRO(AsyncWrite, AsyncWrite, int, (int)AsyncWrite::Sync) \ MACRO(GrowthFactor, Float, float, DefaultBufferGrowthFactor) \ diff --git a/testing/adios2/engine/bp/CMakeLists.txt b/testing/adios2/engine/bp/CMakeLists.txt index 6acb762ed4..0d0af774e5 100644 --- a/testing/adios2/engine/bp/CMakeLists.txt +++ b/testing/adios2/engine/bp/CMakeLists.txt @@ -248,6 +248,8 @@ macro(bp5agg_gtest_add_tests testname mpi) WORKING_DIRECTORY ${BP5_DIR} EXTRA_ARGS "BP5" "TwoLevelShm") gtest_add_tests_helper(${testname} ${mpi} BP Engine.BPAGG. .BP5.DSB WORKING_DIRECTORY ${BP5_DIR} EXTRA_ARGS "BP5" "DataSizeBased") + gtest_add_tests_helper(${testname} ${mpi} BP Engine.BPAGG. .BP5.DSB.RR + WORKING_DIRECTORY ${BP5_DIR} EXTRA_ARGS "BP5" "DataSizeBased" "WithRerouting") endmacro() if (ADIOS2_HAVE_MPI) diff --git a/testing/adios2/engine/bp/TestBPAppendAfterSteps.cpp b/testing/adios2/engine/bp/TestBPAppendAfterSteps.cpp index 58e91332dd..f6ced86cba 100644 --- a/testing/adios2/engine/bp/TestBPAppendAfterSteps.cpp +++ b/testing/adios2/engine/bp/TestBPAppendAfterSteps.cpp @@ -22,6 +22,7 @@ std::string engineName; // comes from command line std::string aggType = "TwoLevelShm"; // overridden on command line +bool rerouting = false; // overridden on command line const std::size_t Nx = 10; using DataArray = std::array; @@ -88,9 +89,9 @@ TEST_P(BPAppendAfterStepsP, Test) << " steps, then appending " << nSteps << " steps again with parameter " << nAppendAfterSteps << std::endl; - std::string filename = "AppendAfterSteps_agg_" + aggType + "_N" + std::to_string(mpiSize) + - "_Steps" + std::to_string(nSteps) + "_Append_" + - std::to_string(nAppendAfterSteps) + ".bp"; + std::string filename = "AppendAfterSteps_agg_" + aggType + "_RR" + (rerouting ? "Y" : "N") + + "_N" + std::to_string(mpiSize) + "_Steps" + std::to_string(nSteps) + + "_Append_" + std::to_string(nAppendAfterSteps) + ".bp"; size_t totalNSteps = 0; { @@ -98,6 +99,10 @@ TEST_P(BPAppendAfterStepsP, Test) adios2::IO ioWrite = adios.DeclareIO("TestIOWrite"); ioWrite.SetEngine(engineName); ioWrite.SetParameter("AggregationType", aggType); + + const char* rr = (rerouting ? "true" : "false"); + ioWrite.SetParameter("EnableWriterRerouting", rr); + adios2::Engine engine = ioWrite.Open(filename, adios2::Mode::Write); adios2::Dims shape{static_cast(mpiSize * Nx)}; adios2::Dims start{static_cast(mpiRank * Nx)}; @@ -222,6 +227,15 @@ int main(int argc, char **argv) aggType = std::string(argv[2]); } + if (argc > 3) + { + std::string lastArg = std::string(argv[3]); + if (lastArg.compare("WithRerouting") == 0) + { + rerouting = true; + } + } + result = RUN_ALL_TESTS(); #if ADIOS2_USE_MPI diff --git a/testing/adios2/engine/bp/TestBPDataSizeAggregate.cpp b/testing/adios2/engine/bp/TestBPDataSizeAggregate.cpp index 1054e5f12f..391d331fb1 100644 --- a/testing/adios2/engine/bp/TestBPDataSizeAggregate.cpp +++ b/testing/adios2/engine/bp/TestBPDataSizeAggregate.cpp @@ -94,6 +94,8 @@ TEST_F(DSATest, TestWriteUnbalancedData) uint64_t globalNy = sumFirstN(columnsPerRank, columnsPerRank.size()); uint64_t largestValue = (globalNx * globalNy) - 1; + std::string filename = std::string("unbalanced_output") + "_RR" + (rerouting ? "Y" : "N"); + { adios2::IO bpIO = adios.DeclareIO("WriteIO"); bpIO.SetEngine("BPFile"); @@ -101,16 +103,14 @@ TEST_F(DSATest, TestWriteUnbalancedData) bpIO.SetParameter("NumSubFiles", numberOfSubFiles); bpIO.SetParameter("verbose", verbose); - if (rerouting) - { - bpIO.SetParameter("EnableWriterRerouting", "true"); - } + const char* rr = (rerouting ? "true" : "false"); + bpIO.SetParameter("EnableWriterRerouting", rr); adios2::Variable varGlobalArray = bpIO.DefineVariable("GlobalArray", {globalNx, globalNy}); EXPECT_TRUE(varGlobalArray); - adios2::Engine bpWriter = bpIO.Open("unbalanced_output.bp", adios2::Mode::Write); + adios2::Engine bpWriter = bpIO.Open(filename, adios2::Mode::Write); for (size_t step = 0; step < nSteps; ++step) { @@ -159,7 +159,7 @@ TEST_F(DSATest, TestWriteUnbalancedData) adios2::IO io = adios.DeclareIO("ReadIO"); io.SetEngine("BPFile"); - adios2::Engine bpReader = io.Open("unbalanced_output.bp", adios2::Mode::ReadRandomAccess); + adios2::Engine bpReader = io.Open(filename, adios2::Mode::ReadRandomAccess); auto var_array = io.InquireVariable("GlobalArray"); EXPECT_TRUE(var_array); diff --git a/testing/adios2/engine/bp/TestBPDirectIO.cpp b/testing/adios2/engine/bp/TestBPDirectIO.cpp index a9390eb6f9..2428c7e604 100644 --- a/testing/adios2/engine/bp/TestBPDirectIO.cpp +++ b/testing/adios2/engine/bp/TestBPDirectIO.cpp @@ -15,6 +15,7 @@ std::string engineName; // comes from command line std::string aggType = "TwoLevelShm"; // comes from command line +bool rerouting = false; // overridden on command line class ADIOSReadDirectIOTest : public ::testing::Test { @@ -31,7 +32,7 @@ TEST_F(ADIOSReadDirectIOTest, BufferResize) int mpiRank = 0, mpiSize = 1; - std::string filename = "ADIOSDirectIO.agg-" + aggType; + std::string filename = "ADIOSDirectIO.agg-" + aggType + "_RR" + (rerouting ? "Y" : "N"); #if ADIOS2_USE_MPI MPI_Comm_rank(MPI_COMM_WORLD, &mpiRank); @@ -59,6 +60,9 @@ TEST_F(ADIOSReadDirectIOTest, BufferResize) // BufferChunkSize should be adjusted to 2*4096 by engine // StripeSize should be adjusted to 3*4096 by engine + const char* rr = (rerouting ? "true" : "false"); + ioWrite.SetParameter("EnableWriterRerouting", rr); + adios2::Engine engine = ioWrite.Open(filename, adios2::Mode::Write); // Number of elements per process const std::size_t Nx = 2000; @@ -145,6 +149,15 @@ int main(int argc, char **argv) aggType = std::string(argv[2]); } + if (argc > 3) + { + std::string lastArg = std::string(argv[3]); + if (lastArg.compare("WithRerouting") == 0) + { + rerouting = true; + } + } + int result = RUN_ALL_TESTS(); #if ADIOS2_USE_MPI MPI_Finalize(); diff --git a/testing/adios2/engine/bp/TestBPNewFileAppendMode.cpp b/testing/adios2/engine/bp/TestBPNewFileAppendMode.cpp index 4ba9b07b91..d1201384ca 100644 --- a/testing/adios2/engine/bp/TestBPNewFileAppendMode.cpp +++ b/testing/adios2/engine/bp/TestBPNewFileAppendMode.cpp @@ -14,6 +14,7 @@ std::string engineName; // comes from command line std::string aggregationType = "EveryoneWritesSerial"; // comes from command line +bool rerouting = false; // overridden on command line class BPNewFileAppendMode : public ::testing::Test { @@ -43,6 +44,8 @@ TEST_F(BPNewFileAppendMode, ADIOS2BPNewFileAppendMode) fname += "-EW.bp"; } + fname += std::string("_RR") + (rerouting ? "Y" : "N"); + const size_t Nx = 6; adios2::ADIOS adios; @@ -64,6 +67,9 @@ TEST_F(BPNewFileAppendMode, ADIOS2BPNewFileAppendMode) io.SetParameter("AggregationType", aggregationType); io.SetParameter("NumAggregators", "0"); + const char* rr = (rerouting ? "true" : "false"); + io.SetParameter("EnableWriterRerouting", rr); + adios2::Engine engine = io.Open(fname, adios2::Mode::Append); engine.Close(); @@ -84,6 +90,15 @@ int main(int argc, char **argv) aggregationType = std::string(argv[2]); } + if (argc > 3) + { + std::string lastArg = std::string(argv[3]); + if (lastArg.compare("WithRerouting") == 0) + { + rerouting = true; + } + } + result = RUN_ALL_TESTS(); return result; } \ No newline at end of file diff --git a/testing/adios2/engine/bp/TestBPParameterSelectSteps.cpp b/testing/adios2/engine/bp/TestBPParameterSelectSteps.cpp index b467eb7ca4..bb81422018 100644 --- a/testing/adios2/engine/bp/TestBPParameterSelectSteps.cpp +++ b/testing/adios2/engine/bp/TestBPParameterSelectSteps.cpp @@ -22,6 +22,7 @@ std::string engineName; // comes from command line std::string aggType = "TwoLevelShm"; // overridden on command line +bool rerouting = false; // overridden on command line int streamingFileId = 0; constexpr std::size_t NSteps = 10; const std::size_t Nx = 10; @@ -80,12 +81,16 @@ class BPParameterSelectSteps : public ::testing::Test #else adios2::ADIOS adios; #endif - OutputFileName = "ParameterSelectSteps_agg_" + aggType + "_id_" + - std::to_string(streamingFileId++) + "_size_" + std::to_string(mpiSize) + - ".bp"; + OutputFileName = "ParameterSelectSteps_agg_" + aggType + "_RR" + (rerouting ? "Y" : "N") + + "_id_" + std::to_string(streamingFileId++) + "_size_" + + std::to_string(mpiSize) + ".bp"; adios2::IO ioWrite = adios.DeclareIO("TestIOWrite"); ioWrite.SetEngine(engineName); ioWrite.SetParameter("AggregationType", aggType); + + const char* rr = (rerouting ? "true" : "false"); + ioWrite.SetParameter("EnableWriterRerouting", rr); + adios2::Engine engine = ioWrite.Open(OutputFileName, adios2::Mode::Write); // Number of elements per process const std::size_t Nx = 10; @@ -200,12 +205,16 @@ TEST_P(BPParameterSelectStepsP, Stream) adios2::ADIOS adios; #endif - std::string filename = "ParameterSelectStepsStream_agg_" + aggType + "_id_" + - std::to_string(streamingFileId++) + "_size_" + std::to_string(mpiSize) + - ".bp"; + std::string filename = "ParameterSelectStepsStream_agg_" + aggType + "_RR" + + (rerouting ? "Y" : "N") + "_id_" + std::to_string(streamingFileId++) + + "_size_" + std::to_string(mpiSize) + ".bp"; adios2::IO ioWrite = adios.DeclareIO("TestIOWrite"); ioWrite.SetEngine(engineName); ioWrite.SetParameter("AggregationType", aggType); + + const char* rr = (rerouting ? "true" : "false"); + ioWrite.SetParameter("EnableWriterRerouting", rr); + adios2::Engine writer = ioWrite.Open(filename, adios2::Mode::Write); adios2::IO ioRead = adios.DeclareIO("TestIORead"); @@ -320,6 +329,15 @@ int main(int argc, char **argv) aggType = std::string(argv[2]); } + if (argc > 3) + { + std::string lastArg = std::string(argv[3]); + if (lastArg.compare("WithRerouting") == 0) + { + rerouting = true; + } + } + result = RUN_ALL_TESTS(); #if ADIOS2_USE_MPI diff --git a/testing/adios2/engine/bp/TestBPReadMultithreaded.cpp b/testing/adios2/engine/bp/TestBPReadMultithreaded.cpp index 6c4140e249..082b23b289 100644 --- a/testing/adios2/engine/bp/TestBPReadMultithreaded.cpp +++ b/testing/adios2/engine/bp/TestBPReadMultithreaded.cpp @@ -22,6 +22,7 @@ std::string engineName; // comes from command line std::string aggType = "TwoLevelShm"; // overridden on command line +bool rerouting = false; // overridden on command line constexpr std::size_t NSteps = 4; const std::size_t Nx = 10; using DataArray = std::array; @@ -85,22 +86,27 @@ class BPReadMultithreadedTest : public ::testing::Test #else adios2::ADIOS adios; #endif + std::string rr(std::string("_RR") + (rerouting ? "Y" : "N")); std::string filename; if (stream) { - StreamOutputFileName = "BPReadMultithreaded_agg_" + aggType + "_size_" + + StreamOutputFileName = "BPReadMultithreaded_agg_" + aggType + rr + "_size_" + std::to_string(mpiSize) + "_Stream.bp"; filename = StreamOutputFileName; } else { - FileOutputFileName = "BPReadMultithreaded_agg_" + aggType + "_size_" + + FileOutputFileName = "BPReadMultithreaded_agg_" + aggType + rr + "_size_" + std::to_string(mpiSize) + "_File.bp"; filename = FileOutputFileName; } adios2::IO ioWrite = adios.DeclareIO("TestIOWrite"); ioWrite.SetEngine(engineName); ioWrite.SetParameter("AggregationType", aggType); + + const char* rrParam = (rerouting ? "true" : "false"); + ioWrite.SetParameter("EnableWriterRerouting", rrParam); + adios2::Engine engine = ioWrite.Open(filename, adios2::Mode::Write); // Number of elements per process const std::size_t Nx = 10; @@ -302,6 +308,15 @@ int main(int argc, char **argv) aggType = std::string(argv[2]); } + if (argc > 3) + { + std::string lastArg = std::string(argv[3]); + if (lastArg.compare("WithRerouting") == 0) + { + rerouting = true; + } + } + result = RUN_ALL_TESTS(); #if ADIOS2_USE_MPI diff --git a/testing/adios2/engine/bp/TestBPStepsFileGlobalArray.cpp b/testing/adios2/engine/bp/TestBPStepsFileGlobalArray.cpp index 8f53e734ff..dd95d77314 100644 --- a/testing/adios2/engine/bp/TestBPStepsFileGlobalArray.cpp +++ b/testing/adios2/engine/bp/TestBPStepsFileGlobalArray.cpp @@ -18,6 +18,7 @@ std::string engineName; // comes from command line std::string aggType = ""; // overridden on command line +bool rerouting = false; // overridden on command line // Number of elements per process const std::size_t Nx = 10; @@ -121,7 +122,7 @@ TEST_P(BPStepsFileGlobalArrayReaders, EveryStep) { const ReadMode readMode = GetReadMode(); std::string fname_prefix = "BPStepsFileGlobalArray.EveryStep." + ReadModeToString(readMode) + - AggregationTypeAlias(aggType); + AggregationTypeAlias(aggType) + "_RR" + (rerouting ? "Y" : "N"); int mpiRank = 0, mpiSize = 1; const std::size_t NSteps = 4; @@ -160,6 +161,10 @@ TEST_P(BPStepsFileGlobalArrayReaders, EveryStep) { io.SetParameter("AggregationType", aggType); } + + const char* rr = (rerouting ? "true" : "false"); + io.SetParameter("EnableWriterRerouting", rr); + adios2::Engine engine = io.Open(fname, adios2::Mode::Write); auto var_i32 = io.DefineVariable("i32", shape, start, count); @@ -380,7 +385,8 @@ TEST_P(BPStepsFileGlobalArrayReaders, NewVarPerStep) { const ReadMode readMode = GetReadMode(); std::string fname_prefix = "BPStepsFileGlobalArray.NewVarPerStep." + - ReadModeToString(readMode) + AggregationTypeAlias(aggType); + ReadModeToString(readMode) + AggregationTypeAlias(aggType) + + "_RR" + (rerouting ? "Y" : "N"); int mpiRank = 0, mpiSize = 1; const std::size_t NSteps = 4; @@ -421,6 +427,10 @@ TEST_P(BPStepsFileGlobalArrayReaders, NewVarPerStep) { io.SetParameter("AggregationType", aggType); } + + const char* rr = (rerouting ? "true" : "false"); + io.SetParameter("EnableWriterRerouting", rr); + adios2::Engine engine = io.Open(fname, adios2::Mode::Write); for (int step = 0; step < static_cast(NSteps); ++step) @@ -667,7 +677,8 @@ TEST_P(BPStepsFileGlobalArrayParameters, EveryOtherStep) const ReadMode readMode = GetReadMode(); std::string fname_prefix = "BPStepsFileGlobalArray.EveryOtherStep.Steps" + std::to_string(NSteps) + ".Oddity" + std::to_string(Oddity) + "." + - ReadModeToString(readMode) + AggregationTypeAlias(aggType); + ReadModeToString(readMode) + AggregationTypeAlias(aggType) + + "_RR" + (rerouting ? "Y" : "N"); int mpiRank = 0, mpiSize = 1; #if ADIOS2_USE_MPI @@ -709,6 +720,10 @@ TEST_P(BPStepsFileGlobalArrayParameters, EveryOtherStep) { io.SetParameter("AggregationType", aggType); } + + const char* rr = (rerouting ? "true" : "false"); + io.SetParameter("EnableWriterRerouting", rr); + adios2::Engine engine = io.Open(fname, adios2::Mode::Write); auto var_i32 = io.DefineVariable("i32", shape, start, count); @@ -982,6 +997,15 @@ int main(int argc, char **argv) aggType = std::string(argv[2]); } + if (argc > 3) + { + std::string lastArg = std::string(argv[3]); + if (lastArg.compare("WithRerouting") == 0) + { + rerouting = true; + } + } + result = RUN_ALL_TESTS(); #if ADIOS2_USE_MPI From cd3521789ca32a654a76f0363591bfa29f74d831 Mon Sep 17 00:00:00 2001 From: Scott Wittenburg Date: Fri, 3 Oct 2025 16:42:08 -0600 Subject: [PATCH 10/37] run clang format --- source/adios2/engine/bp5/BP5Writer.cpp | 3 +- .../engine/bp5/BP5Writer_WithRerouting.cpp | 34 +++++++++---------- source/adios2/helper/adiosComm.cpp | 6 ++-- source/adios2/helper/adiosCommMPI.cpp | 20 ++++++----- source/adios2/helper/adiosRerouting.cpp | 15 ++++---- source/adios2/helper/adiosRerouting.h | 9 ++--- .../engine/bp/TestBPAppendAfterSteps.cpp | 2 +- .../engine/bp/TestBPDataSizeAggregate.cpp | 2 +- testing/adios2/engine/bp/TestBPDirectIO.cpp | 2 +- .../engine/bp/TestBPNewFileAppendMode.cpp | 2 +- .../engine/bp/TestBPParameterSelectSteps.cpp | 4 +-- .../engine/bp/TestBPReadMultithreaded.cpp | 2 +- .../adios2/engine/bp/TestBPRerouteMessage.cpp | 4 +-- .../engine/bp/TestBPStepsFileGlobalArray.cpp | 14 ++++---- 14 files changed, 60 insertions(+), 59 deletions(-) diff --git a/source/adios2/engine/bp5/BP5Writer.cpp b/source/adios2/engine/bp5/BP5Writer.cpp index 29b6199e57..807578373f 100644 --- a/source/adios2/engine/bp5/BP5Writer.cpp +++ b/source/adios2/engine/bp5/BP5Writer.cpp @@ -406,7 +406,8 @@ void BP5Writer::WriteData(format::BufferV *Data) // For rerouting to be useful, there must be multiple writers sending // data to multiple subfiles. - if (m_Parameters.EnableWriterRerouting && m_Comm.Size() > 1 && m_Aggregator->m_SubStreams > 1) + if (m_Parameters.EnableWriterRerouting && m_Comm.Size() > 1 && + m_Aggregator->m_SubStreams > 1) { WriteData_WithRerouting(Data); } diff --git a/source/adios2/engine/bp5/BP5Writer_WithRerouting.cpp b/source/adios2/engine/bp5/BP5Writer_WithRerouting.cpp index a12df82f72..39049c89b8 100644 --- a/source/adios2/engine/bp5/BP5Writer_WithRerouting.cpp +++ b/source/adios2/engine/bp5/BP5Writer_WithRerouting.cpp @@ -10,8 +10,8 @@ #include "adios2/common/ADIOSMacros.h" #include "adios2/core/IO.h" -#include "adios2/helper/adiosRerouting.h" #include "adios2/helper/adiosFunctions.h" //CheckIndexRange +#include "adios2/helper/adiosRerouting.h" #include "adios2/toolkit/format/buffer/chunk/ChunkV.h" #include "adios2/toolkit/format/buffer/malloc/MallocV.h" #include "adios2/toolkit/transport/file/FileFStream.h" @@ -54,7 +54,8 @@ void BP5Writer::ReroutingCommunicationLoop() if (iAmSubCoord) { // Pre-populate my queue with the ranks in my group/partition - const std::vector &groupRanks = m_Partitioning.m_Partitions[m_Aggregator->m_SubStreamIndex]; + const std::vector &groupRanks = + m_Partitioning.m_Partitions[m_Aggregator->m_SubStreamIndex]; for (auto rank : groupRanks) { writerQueue.push(static_cast(rank)); @@ -74,8 +75,8 @@ void BP5Writer::ReroutingCommunicationLoop() while (true) { int msgReady = 0; - helper::Comm::Status status = m_Comm.Iprobe( - static_cast(helper::Comm::Constants::CommRecvAny), 0, &msgReady); + helper::Comm::Status status = + m_Comm.Iprobe(static_cast(helper::Comm::Constants::CommRecvAny), 0, &msgReady); // If there is a message ready, receive and handle it if (msgReady) @@ -83,18 +84,17 @@ void BP5Writer::ReroutingCommunicationLoop() RerouteMessage message; message.RecvFrom(m_Comm, status.Source); - switch ((RerouteMessage::MessageType) message.m_MsgType) + switch ((RerouteMessage::MessageType)message.m_MsgType) { - case RerouteMessage::MessageType::DO_WRITE: - { - std::unique_lock lck(m_WriteMutex); - m_TargetIndex = message.m_SubStreamIdx; - m_DataPos = message.m_Offset; - m_TargetCoordinator = message.m_SrcRank; - m_ReadyToWrite = true; - m_WriteCV.notify_one(); - } - break; + case RerouteMessage::MessageType::DO_WRITE: { + std::unique_lock lck(m_WriteMutex); + m_TargetIndex = message.m_SubStreamIdx; + m_DataPos = message.m_Offset; + m_TargetCoordinator = message.m_SrcRank; + m_ReadyToWrite = true; + m_WriteCV.notify_one(); + } + break; case RerouteMessage::MessageType::WRITE_COMPLETION: currentFilePos = message.m_Offset; writingRank = -1; @@ -118,7 +118,7 @@ void BP5Writer::ReroutingCommunicationLoop() writeCompleteMsg.SendTo(m_Comm, m_TargetCoordinator); sentFinished = true; - if (!iAmSubCoord /*&& !iAmGlobalCoord*/ ) + if (!iAmSubCoord /*&& !iAmGlobalCoord*/) { // My only role was to write (no communication responsibility) so I am // done at this point. @@ -183,7 +183,7 @@ void BP5Writer::WriteData_WithRerouting(format::BufferV *Data) // wait until communication thread indicates it's our turn to write { std::unique_lock lck(m_WriteMutex); - m_WriteCV.wait(lck, [this]{ return m_ReadyToWrite; }); + m_WriteCV.wait(lck, [this] { return m_ReadyToWrite; }); } // Do the writing diff --git a/source/adios2/helper/adiosComm.cpp b/source/adios2/helper/adiosComm.cpp index 1aa970e755..9b0bd918d9 100644 --- a/source/adios2/helper/adiosComm.cpp +++ b/source/adios2/helper/adiosComm.cpp @@ -97,9 +97,9 @@ Comm::Status Comm::Probe(int source, int tag, const std::string &hint) const { if (source != static_cast(Comm::Constants::CommRecvAny)) { - throw std::runtime_error( - "Invalid MPI source rank in Probe: " + std::to_string(source) + - " for a communicator of size " + std::to_string(m_Impl->Size())); + throw std::runtime_error("Invalid MPI source rank in Probe: " + std::to_string(source) + + " for a communicator of size " + + std::to_string(m_Impl->Size())); } } return m_Impl->Probe(source, tag, hint); diff --git a/source/adios2/helper/adiosCommMPI.cpp b/source/adios2/helper/adiosCommMPI.cpp index fc3e4a852c..5012157395 100644 --- a/source/adios2/helper/adiosCommMPI.cpp +++ b/source/adios2/helper/adiosCommMPI.cpp @@ -18,8 +18,8 @@ #include "adios2/common/ADIOSTypes.h" -#include #include +#include namespace adios2 { @@ -402,9 +402,9 @@ Comm::Status CommImplMPI::Recv(void *buf, size_t count, Datatype datatype, int s const std::string &hint) const { MPI_Status mpiStatus; - CheckMPIReturn( - MPI_Recv(buf, static_cast(count), ToMPI(datatype), GetMPISource(source), tag, - m_MPIComm, &mpiStatus), hint); + CheckMPIReturn(MPI_Recv(buf, static_cast(count), ToMPI(datatype), GetMPISource(source), + tag, m_MPIComm, &mpiStatus), + hint); Comm::Status status; status.Source = mpiStatus.MPI_SOURCE; @@ -485,7 +485,8 @@ Comm::Req CommImplMPI::Irecv(void *buffer, size_t count, Datatype datatype, int int batchSize = static_cast(DefaultMaxFileBatchSize); MPI_Request mpiReq; CheckMPIReturn(MPI_Irecv(static_cast(buffer) + position, batchSize, - ToMPI(datatype), GetMPISource(source), tag, m_MPIComm, &mpiReq), + ToMPI(datatype), GetMPISource(source), tag, m_MPIComm, + &mpiReq), "in call to Irecv batch " + std::to_string(b) + " " + hint + "\n"); req->m_MPIReqs.emplace_back(mpiReq); @@ -498,7 +499,8 @@ Comm::Req CommImplMPI::Irecv(void *buffer, size_t count, Datatype datatype, int int batchSize = static_cast(remainder); MPI_Request mpiReq; CheckMPIReturn(MPI_Irecv(static_cast(buffer) + position, batchSize, - ToMPI(datatype), GetMPISource(source), tag, m_MPIComm, &mpiReq), + ToMPI(datatype), GetMPISource(source), tag, m_MPIComm, + &mpiReq), "in call to Irecv remainder batch " + hint + "\n"); req->m_MPIReqs.emplace_back(mpiReq); } @@ -507,9 +509,9 @@ Comm::Req CommImplMPI::Irecv(void *buffer, size_t count, Datatype datatype, int { int batchSize = static_cast(count); MPI_Request mpiReq; - CheckMPIReturn( - MPI_Irecv(buffer, batchSize, ToMPI(datatype), GetMPISource(source), tag, m_MPIComm, &mpiReq), - " in call to Isend with single batch " + hint + "\n"); + CheckMPIReturn(MPI_Irecv(buffer, batchSize, ToMPI(datatype), GetMPISource(source), tag, + m_MPIComm, &mpiReq), + " in call to Isend with single batch " + hint + "\n"); req->m_MPIReqs.emplace_back(mpiReq); } diff --git a/source/adios2/helper/adiosRerouting.cpp b/source/adios2/helper/adiosRerouting.cpp index f0e9c78c51..f0b65ec0d0 100644 --- a/source/adios2/helper/adiosRerouting.cpp +++ b/source/adios2/helper/adiosRerouting.cpp @@ -21,7 +21,6 @@ #include /// \endcond - namespace adios2 { namespace helper @@ -39,7 +38,6 @@ void RerouteMessage::ToBuffer(std::vector &buffer) helper::CopyToBuffer(buffer, pos, &this->m_Size); } - void RerouteMessage::FromBuffer(const std::vector &buffer) { size_t pos = 0; @@ -51,10 +49,10 @@ void RerouteMessage::FromBuffer(const std::vector &buffer) helper::CopyFromBuffer(buffer.data(), pos, &this->m_Size); } -void RerouteMessage::SendTo(helper::Comm& comm, int destRank) +void RerouteMessage::SendTo(helper::Comm &comm, int destRank) { - std::cout << "Rank " << comm.Rank() << " sending " - << this->GetTypeString(this->m_MsgType) << " to " << destRank << std::endl; + std::cout << "Rank " << comm.Rank() << " sending " << this->GetTypeString(this->m_MsgType) + << " to " << destRank << std::endl; std::vector sendBuf; this->ToBuffer(sendBuf); @@ -70,7 +68,7 @@ void RerouteMessage::SendTo(helper::Comm& comm, int destRank) comm.Send(sendBuf.data(), sendBuf.size(), destRank, 0); } -void RerouteMessage::RecvFrom(helper::Comm& comm, int srcRank) +void RerouteMessage::RecvFrom(helper::Comm &comm, int srcRank) { std::vector recvBuf; recvBuf.resize(REROUTE_MESSAGE_SIZE); @@ -84,9 +82,8 @@ void RerouteMessage::RecvFrom(helper::Comm& comm, int srcRank) this->FromBuffer(recvBuf); - std::cout << "Rank " << comm.Rank() << " received " - << this->GetTypeString(this->m_MsgType) << " from " << status.Source - << std::endl; + std::cout << "Rank " << comm.Rank() << " received " << this->GetTypeString(this->m_MsgType) + << " from " << status.Source << std::endl; if (status.Source != this->m_SrcRank) { diff --git a/source/adios2/helper/adiosRerouting.h b/source/adios2/helper/adiosRerouting.h index e36a3756c0..3ad40ea3c8 100644 --- a/source/adios2/helper/adiosRerouting.h +++ b/source/adios2/helper/adiosRerouting.h @@ -68,10 +68,10 @@ class RerouteMessage void FromBuffer(const std::vector &buffer); // Send the contents of this message to another rank - void SendTo(helper::Comm& comm, int destRank); + void SendTo(helper::Comm &comm, int destRank); // Receive a message from another rank to populate this message - void RecvFrom(helper::Comm& comm, int srcRank); + void RecvFrom(helper::Comm &comm, int srcRank); MessageType m_MsgType; int m_SrcRank; @@ -80,8 +80,9 @@ class RerouteMessage unsigned long m_Offset; unsigned long m_Size; - static const size_t REROUTE_MESSAGE_SIZE = sizeof(MessageType) + sizeof(int) + - sizeof(int) + sizeof(unsigned long) + sizeof(unsigned long) + sizeof(unsigned long); + static const size_t REROUTE_MESSAGE_SIZE = sizeof(MessageType) + sizeof(int) + sizeof(int) + + sizeof(unsigned long) + sizeof(unsigned long) + + sizeof(unsigned long); }; } // end namespace helper diff --git a/testing/adios2/engine/bp/TestBPAppendAfterSteps.cpp b/testing/adios2/engine/bp/TestBPAppendAfterSteps.cpp index f6ced86cba..046d7c6940 100644 --- a/testing/adios2/engine/bp/TestBPAppendAfterSteps.cpp +++ b/testing/adios2/engine/bp/TestBPAppendAfterSteps.cpp @@ -100,7 +100,7 @@ TEST_P(BPAppendAfterStepsP, Test) ioWrite.SetEngine(engineName); ioWrite.SetParameter("AggregationType", aggType); - const char* rr = (rerouting ? "true" : "false"); + const char *rr = (rerouting ? "true" : "false"); ioWrite.SetParameter("EnableWriterRerouting", rr); adios2::Engine engine = ioWrite.Open(filename, adios2::Mode::Write); diff --git a/testing/adios2/engine/bp/TestBPDataSizeAggregate.cpp b/testing/adios2/engine/bp/TestBPDataSizeAggregate.cpp index 391d331fb1..12877d6800 100644 --- a/testing/adios2/engine/bp/TestBPDataSizeAggregate.cpp +++ b/testing/adios2/engine/bp/TestBPDataSizeAggregate.cpp @@ -103,7 +103,7 @@ TEST_F(DSATest, TestWriteUnbalancedData) bpIO.SetParameter("NumSubFiles", numberOfSubFiles); bpIO.SetParameter("verbose", verbose); - const char* rr = (rerouting ? "true" : "false"); + const char *rr = (rerouting ? "true" : "false"); bpIO.SetParameter("EnableWriterRerouting", rr); adios2::Variable varGlobalArray = diff --git a/testing/adios2/engine/bp/TestBPDirectIO.cpp b/testing/adios2/engine/bp/TestBPDirectIO.cpp index 2428c7e604..0822af53ba 100644 --- a/testing/adios2/engine/bp/TestBPDirectIO.cpp +++ b/testing/adios2/engine/bp/TestBPDirectIO.cpp @@ -60,7 +60,7 @@ TEST_F(ADIOSReadDirectIOTest, BufferResize) // BufferChunkSize should be adjusted to 2*4096 by engine // StripeSize should be adjusted to 3*4096 by engine - const char* rr = (rerouting ? "true" : "false"); + const char *rr = (rerouting ? "true" : "false"); ioWrite.SetParameter("EnableWriterRerouting", rr); adios2::Engine engine = ioWrite.Open(filename, adios2::Mode::Write); diff --git a/testing/adios2/engine/bp/TestBPNewFileAppendMode.cpp b/testing/adios2/engine/bp/TestBPNewFileAppendMode.cpp index d1201384ca..7390a1af7d 100644 --- a/testing/adios2/engine/bp/TestBPNewFileAppendMode.cpp +++ b/testing/adios2/engine/bp/TestBPNewFileAppendMode.cpp @@ -67,7 +67,7 @@ TEST_F(BPNewFileAppendMode, ADIOS2BPNewFileAppendMode) io.SetParameter("AggregationType", aggregationType); io.SetParameter("NumAggregators", "0"); - const char* rr = (rerouting ? "true" : "false"); + const char *rr = (rerouting ? "true" : "false"); io.SetParameter("EnableWriterRerouting", rr); adios2::Engine engine = io.Open(fname, adios2::Mode::Append); diff --git a/testing/adios2/engine/bp/TestBPParameterSelectSteps.cpp b/testing/adios2/engine/bp/TestBPParameterSelectSteps.cpp index bb81422018..1c9e0ac677 100644 --- a/testing/adios2/engine/bp/TestBPParameterSelectSteps.cpp +++ b/testing/adios2/engine/bp/TestBPParameterSelectSteps.cpp @@ -88,7 +88,7 @@ class BPParameterSelectSteps : public ::testing::Test ioWrite.SetEngine(engineName); ioWrite.SetParameter("AggregationType", aggType); - const char* rr = (rerouting ? "true" : "false"); + const char *rr = (rerouting ? "true" : "false"); ioWrite.SetParameter("EnableWriterRerouting", rr); adios2::Engine engine = ioWrite.Open(OutputFileName, adios2::Mode::Write); @@ -212,7 +212,7 @@ TEST_P(BPParameterSelectStepsP, Stream) ioWrite.SetEngine(engineName); ioWrite.SetParameter("AggregationType", aggType); - const char* rr = (rerouting ? "true" : "false"); + const char *rr = (rerouting ? "true" : "false"); ioWrite.SetParameter("EnableWriterRerouting", rr); adios2::Engine writer = ioWrite.Open(filename, adios2::Mode::Write); diff --git a/testing/adios2/engine/bp/TestBPReadMultithreaded.cpp b/testing/adios2/engine/bp/TestBPReadMultithreaded.cpp index 082b23b289..056407c33f 100644 --- a/testing/adios2/engine/bp/TestBPReadMultithreaded.cpp +++ b/testing/adios2/engine/bp/TestBPReadMultithreaded.cpp @@ -104,7 +104,7 @@ class BPReadMultithreadedTest : public ::testing::Test ioWrite.SetEngine(engineName); ioWrite.SetParameter("AggregationType", aggType); - const char* rrParam = (rerouting ? "true" : "false"); + const char *rrParam = (rerouting ? "true" : "false"); ioWrite.SetParameter("EnableWriterRerouting", rrParam); adios2::Engine engine = ioWrite.Open(filename, adios2::Mode::Write); diff --git a/testing/adios2/engine/bp/TestBPRerouteMessage.cpp b/testing/adios2/engine/bp/TestBPRerouteMessage.cpp index 0a65719e2a..3684aa1df4 100644 --- a/testing/adios2/engine/bp/TestBPRerouteMessage.cpp +++ b/testing/adios2/engine/bp/TestBPRerouteMessage.cpp @@ -19,7 +19,8 @@ int worldRank, worldSize; void SendAndReceiveMessage(helper::Comm &comm, int destRank, int srcRank) { - std::cout << "Sending to " << destRank << " and expecting to receive from: " << srcRank << std::endl; + std::cout << "Sending to " << destRank << " and expecting to receive from: " << srcRank + << std::endl; // Send a message to another rank adios2::helper::RerouteMessage origMsg; @@ -55,7 +56,6 @@ class RerouteTest : public ::testing::Test RerouteTest() = default; }; - TEST_F(RerouteTest, TestMessageBuffer) { adios2::helper::RerouteMessage origMsg; diff --git a/testing/adios2/engine/bp/TestBPStepsFileGlobalArray.cpp b/testing/adios2/engine/bp/TestBPStepsFileGlobalArray.cpp index dd95d77314..461ea0d53c 100644 --- a/testing/adios2/engine/bp/TestBPStepsFileGlobalArray.cpp +++ b/testing/adios2/engine/bp/TestBPStepsFileGlobalArray.cpp @@ -162,7 +162,7 @@ TEST_P(BPStepsFileGlobalArrayReaders, EveryStep) io.SetParameter("AggregationType", aggType); } - const char* rr = (rerouting ? "true" : "false"); + const char *rr = (rerouting ? "true" : "false"); io.SetParameter("EnableWriterRerouting", rr); adios2::Engine engine = io.Open(fname, adios2::Mode::Write); @@ -385,8 +385,8 @@ TEST_P(BPStepsFileGlobalArrayReaders, NewVarPerStep) { const ReadMode readMode = GetReadMode(); std::string fname_prefix = "BPStepsFileGlobalArray.NewVarPerStep." + - ReadModeToString(readMode) + AggregationTypeAlias(aggType) + - "_RR" + (rerouting ? "Y" : "N"); + ReadModeToString(readMode) + AggregationTypeAlias(aggType) + "_RR" + + (rerouting ? "Y" : "N"); int mpiRank = 0, mpiSize = 1; const std::size_t NSteps = 4; @@ -428,7 +428,7 @@ TEST_P(BPStepsFileGlobalArrayReaders, NewVarPerStep) io.SetParameter("AggregationType", aggType); } - const char* rr = (rerouting ? "true" : "false"); + const char *rr = (rerouting ? "true" : "false"); io.SetParameter("EnableWriterRerouting", rr); adios2::Engine engine = io.Open(fname, adios2::Mode::Write); @@ -677,8 +677,8 @@ TEST_P(BPStepsFileGlobalArrayParameters, EveryOtherStep) const ReadMode readMode = GetReadMode(); std::string fname_prefix = "BPStepsFileGlobalArray.EveryOtherStep.Steps" + std::to_string(NSteps) + ".Oddity" + std::to_string(Oddity) + "." + - ReadModeToString(readMode) + AggregationTypeAlias(aggType) + - "_RR" + (rerouting ? "Y" : "N"); + ReadModeToString(readMode) + AggregationTypeAlias(aggType) + "_RR" + + (rerouting ? "Y" : "N"); int mpiRank = 0, mpiSize = 1; #if ADIOS2_USE_MPI @@ -721,7 +721,7 @@ TEST_P(BPStepsFileGlobalArrayParameters, EveryOtherStep) io.SetParameter("AggregationType", aggType); } - const char* rr = (rerouting ? "true" : "false"); + const char *rr = (rerouting ? "true" : "false"); io.SetParameter("EnableWriterRerouting", rr); adios2::Engine engine = io.Open(fname, adios2::Mode::Write); From 69cf99bce9853a48fae46b2b21ad1aae2745ead3 Mon Sep 17 00:00:00 2001 From: Scott Wittenburg Date: Fri, 3 Oct 2025 17:04:26 -0600 Subject: [PATCH 11/37] fix a couple warnings --- source/adios2/engine/bp5/BP5Writer_WithRerouting.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/source/adios2/engine/bp5/BP5Writer_WithRerouting.cpp b/source/adios2/engine/bp5/BP5Writer_WithRerouting.cpp index 39049c89b8..ad8cd0d5d5 100644 --- a/source/adios2/engine/bp5/BP5Writer_WithRerouting.cpp +++ b/source/adios2/engine/bp5/BP5Writer_WithRerouting.cpp @@ -32,13 +32,13 @@ using namespace adios2::format; void BP5Writer::ReroutingCommunicationLoop() { - using RerouteMessage = typename adios2::helper::RerouteMessage; + using RerouteMessage = adios2::helper::RerouteMessage; int subCoord = m_Aggregator->m_AggregatorRank; bool iAmSubCoord = m_RankMPI == subCoord; // Arbitrarily make the last rank the global coordinator // TODO: should the global coordinator role be assigned to a subcoordinator? - bool iAmGlobalCoord = m_RankMPI == m_Comm.Size() - 1; + // bool iAmGlobalCoord = m_RankMPI == m_Comm.Size() - 1; std::queue writerQueue; int writingRank = -1; uint64_t currentFilePos = 0; From ba5ea7f3930a7e9a65e4639c1f385d616604bc30 Mon Sep 17 00:00:00 2001 From: Scott Wittenburg Date: Mon, 6 Oct 2025 13:52:30 -0600 Subject: [PATCH 12/37] fix some type mismatches, comment debug stmts --- source/adios2/engine/bp5/BP5Writer.h | 2 +- .../engine/bp5/BP5Writer_WithRerouting.cpp | 13 ++----- source/adios2/helper/adiosRerouting.cpp | 34 +++++-------------- source/adios2/helper/adiosRerouting.h | 10 +++--- 4 files changed, 18 insertions(+), 41 deletions(-) diff --git a/source/adios2/engine/bp5/BP5Writer.h b/source/adios2/engine/bp5/BP5Writer.h index 76bdb9a882..ad9c1e658f 100644 --- a/source/adios2/engine/bp5/BP5Writer.h +++ b/source/adios2/engine/bp5/BP5Writer.h @@ -308,7 +308,7 @@ class BP5Writer : public BP5Engine, public core::Engine // Thread function for inter-rank communication when rerouting aggregation // is enabled void ReroutingCommunicationLoop(); - size_t m_TargetIndex; + int m_TargetIndex; int m_TargetCoordinator; std::mutex m_WriteMutex; std::mutex m_NotifMutex; diff --git a/source/adios2/engine/bp5/BP5Writer_WithRerouting.cpp b/source/adios2/engine/bp5/BP5Writer_WithRerouting.cpp index ad8cd0d5d5..cbb32fa853 100644 --- a/source/adios2/engine/bp5/BP5Writer_WithRerouting.cpp +++ b/source/adios2/engine/bp5/BP5Writer_WithRerouting.cpp @@ -44,13 +44,6 @@ void BP5Writer::ReroutingCommunicationLoop() uint64_t currentFilePos = 0; bool sentFinished = false; - // Now start the - std::cout << " Rank " << m_Comm.Rank() << " ReroutingCommunicationLoop()" << std::endl; - std::cout << " SC: " << subCoord << std::endl; - std::cout << " GC: " << m_Comm.Size() - 1 << std::endl; - std::cout << " subfile index: " << m_Aggregator->m_SubStreamIndex << std::endl; - std::cout << " total subfiles: " << m_Aggregator->m_SubStreams << std::endl; - if (iAmSubCoord) { // Pre-populate my queue with the ranks in my group/partition @@ -138,7 +131,7 @@ void BP5Writer::ReroutingCommunicationLoop() writeMsg.m_MsgType = RerouteMessage::MessageType::DO_WRITE; writeMsg.m_SrcRank = m_RankMPI; writeMsg.m_DestRank = nextWriter; - writeMsg.m_SubStreamIdx = m_Aggregator->m_SubStreamIndex; + writeMsg.m_SubStreamIdx = static_cast(m_Aggregator->m_SubStreamIndex); writeMsg.m_Offset = currentFilePos; writingRank = nextWriter; writeMsg.SendTo(m_Comm, nextWriter); @@ -178,7 +171,7 @@ void BP5Writer::WriteData_WithRerouting(format::BufferV *Data) std::thread commThread(&BP5Writer::ReroutingCommunicationLoop, this); - std::cout << "Background thread for rank " << m_RankMPI << " is now running" << std::endl; + // std::cout << "Background thread for rank " << m_RankMPI << " is now running" << std::endl; // wait until communication thread indicates it's our turn to write { @@ -207,7 +200,7 @@ void BP5Writer::WriteData_WithRerouting(format::BufferV *Data) commThread.join(); - std::cout << "Background thread for rank " << m_RankMPI << " is now finished" << std::endl; + // std::cout << "Background thread for rank " << m_RankMPI << " is now finished" << std::endl; } } // end namespace engine diff --git a/source/adios2/helper/adiosRerouting.cpp b/source/adios2/helper/adiosRerouting.cpp index f0b65ec0d0..7d063eca17 100644 --- a/source/adios2/helper/adiosRerouting.cpp +++ b/source/adios2/helper/adiosRerouting.cpp @@ -51,44 +51,28 @@ void RerouteMessage::FromBuffer(const std::vector &buffer) void RerouteMessage::SendTo(helper::Comm &comm, int destRank) { - std::cout << "Rank " << comm.Rank() << " sending " << this->GetTypeString(this->m_MsgType) - << " to " << destRank << std::endl; + // std::cout << "Rank " << comm.Rank() << " sending " << this->GetTypeString(this->m_MsgType) + // << " to " << destRank << std::endl; std::vector sendBuf; this->ToBuffer(sendBuf); - - // You'd think send and/or recv should be non-blocking for this to work, - // especially when sending to ourselves, but somehow blocking send and recv - // seems to work. - - // Non-blocking send - // comm.Isend(sendBuf.data(), sendBuf.size(), desinationtRank, 0); - - // "Blocking" send - comm.Send(sendBuf.data(), sendBuf.size(), destRank, 0); + comm.Isend(sendBuf.data(), sendBuf.size(), destRank, 0); } void RerouteMessage::RecvFrom(helper::Comm &comm, int srcRank) { std::vector recvBuf; recvBuf.resize(REROUTE_MESSAGE_SIZE); - - // Non-blocking receive - // helper::Comm::Req req = comm.Irecv(recvBuf.data(), msgSize, recvFlag, 0); - // helper::Comm::Status status = req.Wait(); - - // "Blocking" receive helper::Comm::Status status = comm.Recv(recvBuf.data(), REROUTE_MESSAGE_SIZE, srcRank, 0); - this->FromBuffer(recvBuf); - std::cout << "Rank " << comm.Rank() << " received " << this->GetTypeString(this->m_MsgType) - << " from " << status.Source << std::endl; + // std::cout << "Rank " << comm.Rank() << " received " << this->GetTypeString(this->m_MsgType) + // << " from " << status.Source << std::endl; - if (status.Source != this->m_SrcRank) - { - std::cout << " GAHHHHHHH!" << std::endl; - } + // if (status.Source != this->m_SrcRank) + // { + // std::cout << " GAHHHHHHH!" << std::endl; + // } } } // end namespace helper diff --git a/source/adios2/helper/adiosRerouting.h b/source/adios2/helper/adiosRerouting.h index 3ad40ea3c8..9efa11ae3d 100644 --- a/source/adios2/helper/adiosRerouting.h +++ b/source/adios2/helper/adiosRerouting.h @@ -76,13 +76,13 @@ class RerouteMessage MessageType m_MsgType; int m_SrcRank; int m_DestRank; - unsigned long m_SubStreamIdx; - unsigned long m_Offset; - unsigned long m_Size; + int m_SubStreamIdx; + uint64_t m_Offset; + uint64_t m_Size; static const size_t REROUTE_MESSAGE_SIZE = sizeof(MessageType) + sizeof(int) + sizeof(int) + - sizeof(unsigned long) + sizeof(unsigned long) + - sizeof(unsigned long); + sizeof(int) + sizeof(uint64_t) + + sizeof(uint64_t); }; } // end namespace helper From a915a70b770368f5330c084af09cd908b1f98fbf Mon Sep 17 00:00:00 2001 From: Scott Wittenburg Date: Mon, 6 Oct 2025 13:53:23 -0600 Subject: [PATCH 13/37] run clang format --- source/adios2/helper/adiosRerouting.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/source/adios2/helper/adiosRerouting.h b/source/adios2/helper/adiosRerouting.h index 9efa11ae3d..6521c02f2f 100644 --- a/source/adios2/helper/adiosRerouting.h +++ b/source/adios2/helper/adiosRerouting.h @@ -81,8 +81,7 @@ class RerouteMessage uint64_t m_Size; static const size_t REROUTE_MESSAGE_SIZE = sizeof(MessageType) + sizeof(int) + sizeof(int) + - sizeof(int) + sizeof(uint64_t) + - sizeof(uint64_t); + sizeof(int) + sizeof(uint64_t) + sizeof(uint64_t); }; } // end namespace helper From bf8db40de0907183576e7a40a3951d9f524adc28 Mon Sep 17 00:00:00 2001 From: Scott Wittenburg Date: Mon, 6 Oct 2025 14:04:08 -0600 Subject: [PATCH 14/37] fix unused variable warning --- source/adios2/helper/adiosRerouting.cpp | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/source/adios2/helper/adiosRerouting.cpp b/source/adios2/helper/adiosRerouting.cpp index 7d063eca17..55f249c442 100644 --- a/source/adios2/helper/adiosRerouting.cpp +++ b/source/adios2/helper/adiosRerouting.cpp @@ -51,9 +51,6 @@ void RerouteMessage::FromBuffer(const std::vector &buffer) void RerouteMessage::SendTo(helper::Comm &comm, int destRank) { - // std::cout << "Rank " << comm.Rank() << " sending " << this->GetTypeString(this->m_MsgType) - // << " to " << destRank << std::endl; - std::vector sendBuf; this->ToBuffer(sendBuf); comm.Isend(sendBuf.data(), sendBuf.size(), destRank, 0); @@ -63,16 +60,8 @@ void RerouteMessage::RecvFrom(helper::Comm &comm, int srcRank) { std::vector recvBuf; recvBuf.resize(REROUTE_MESSAGE_SIZE); - helper::Comm::Status status = comm.Recv(recvBuf.data(), REROUTE_MESSAGE_SIZE, srcRank, 0); + comm.Recv(recvBuf.data(), REROUTE_MESSAGE_SIZE, srcRank, 0); this->FromBuffer(recvBuf); - - // std::cout << "Rank " << comm.Rank() << " received " << this->GetTypeString(this->m_MsgType) - // << " from " << status.Source << std::endl; - - // if (status.Source != this->m_SrcRank) - // { - // std::cout << " GAHHHHHHH!" << std::endl; - // } } } // end namespace helper From 9aaa3ace50208e311e737a1b8f5450250d7a67b2 Mon Sep 17 00:00:00 2001 From: Scott Wittenburg Date: Tue, 7 Oct 2025 10:35:30 -0600 Subject: [PATCH 15/37] style fix --- source/adios2/engine/bp5/BP5Writer_WithRerouting.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/source/adios2/engine/bp5/BP5Writer_WithRerouting.cpp b/source/adios2/engine/bp5/BP5Writer_WithRerouting.cpp index cbb32fa853..fc0e201836 100644 --- a/source/adios2/engine/bp5/BP5Writer_WithRerouting.cpp +++ b/source/adios2/engine/bp5/BP5Writer_WithRerouting.cpp @@ -79,7 +79,8 @@ void BP5Writer::ReroutingCommunicationLoop() switch ((RerouteMessage::MessageType)message.m_MsgType) { - case RerouteMessage::MessageType::DO_WRITE: { + case RerouteMessage::MessageType::DO_WRITE: + { std::unique_lock lck(m_WriteMutex); m_TargetIndex = message.m_SubStreamIdx; m_DataPos = message.m_Offset; From 908be4a80bec88db6ff19690e61c6fbfaace8878 Mon Sep 17 00:00:00 2001 From: Scott Wittenburg Date: Tue, 7 Oct 2025 14:55:51 -0600 Subject: [PATCH 16/37] Try to fix issues sending to and receiving from self Avoid the situation with non-blocking sends where the buffer containing the data to be sent is destroyed before the send completes. --- .../engine/bp5/BP5Writer_WithRerouting.cpp | 46 +++++++++++++++++-- source/adios2/helper/adiosRerouting.cpp | 24 +++------- source/adios2/helper/adiosRerouting.h | 10 +--- .../adios2/engine/bp/TestBPRerouteMessage.cpp | 32 ++----------- 4 files changed, 57 insertions(+), 55 deletions(-) diff --git a/source/adios2/engine/bp5/BP5Writer_WithRerouting.cpp b/source/adios2/engine/bp5/BP5Writer_WithRerouting.cpp index fc0e201836..ddebf223b4 100644 --- a/source/adios2/engine/bp5/BP5Writer_WithRerouting.cpp +++ b/source/adios2/engine/bp5/BP5Writer_WithRerouting.cpp @@ -21,6 +21,38 @@ #include #include +namespace +{ +class BufferPool +{ +public: + BufferPool(int size) + { + m_Pool.resize(size); + } + + ~BufferPool() = default; + + std::vector &GetNextBuffer() + { + size_t bufferIdx = m_CurrentBufferIdx; + + if (m_CurrentBufferIdx < m_Pool.size() - 1) + { + m_CurrentBufferIdx += 1; + } + else { + m_CurrentBufferIdx = 0; + } + + return m_Pool[bufferIdx]; + } + + size_t m_CurrentBufferIdx = 0; + std::vector> m_Pool; +}; +} + namespace adios2 { namespace core @@ -40,6 +72,14 @@ void BP5Writer::ReroutingCommunicationLoop() // TODO: should the global coordinator role be assigned to a subcoordinator? // bool iAmGlobalCoord = m_RankMPI == m_Comm.Size() - 1; std::queue writerQueue; + + // Sends are non-blocking. We use the pool to avoid the situation where the + // buffer is destructed before the send is complete. If we start seeing + // many Sends pending for a long time, that could cause us to exhaust the + // pool of buffers, at which point we would start overwriting buffers in the + // pool, potentially creating errors which are difficult to debug/diagnose. + BufferPool sendBuffers((iAmSubCoord ? 100 : 1)); + std::vector recvBuffer; int writingRank = -1; uint64_t currentFilePos = 0; bool sentFinished = false; @@ -75,7 +115,7 @@ void BP5Writer::ReroutingCommunicationLoop() if (msgReady) { RerouteMessage message; - message.RecvFrom(m_Comm, status.Source); + message.BlockingRecvFrom(m_Comm, status.Source, recvBuffer); switch ((RerouteMessage::MessageType)message.m_MsgType) { @@ -109,7 +149,7 @@ void BP5Writer::ReroutingCommunicationLoop() writeCompleteMsg.m_DestRank = m_TargetCoordinator; writeCompleteMsg.m_SubStreamIdx = m_TargetIndex; writeCompleteMsg.m_Offset = m_DataPos; - writeCompleteMsg.SendTo(m_Comm, m_TargetCoordinator); + writeCompleteMsg.NonBlockingSendTo(m_Comm, m_TargetCoordinator, sendBuffers.GetNextBuffer()); sentFinished = true; if (!iAmSubCoord /*&& !iAmGlobalCoord*/) @@ -135,7 +175,7 @@ void BP5Writer::ReroutingCommunicationLoop() writeMsg.m_SubStreamIdx = static_cast(m_Aggregator->m_SubStreamIndex); writeMsg.m_Offset = currentFilePos; writingRank = nextWriter; - writeMsg.SendTo(m_Comm, nextWriter); + writeMsg.NonBlockingSendTo(m_Comm, nextWriter, sendBuffers.GetNextBuffer()); } else { diff --git a/source/adios2/helper/adiosRerouting.cpp b/source/adios2/helper/adiosRerouting.cpp index 55f249c442..b4a350e9fa 100644 --- a/source/adios2/helper/adiosRerouting.cpp +++ b/source/adios2/helper/adiosRerouting.cpp @@ -26,7 +26,7 @@ namespace adios2 namespace helper { -void RerouteMessage::ToBuffer(std::vector &buffer) +void RerouteMessage::NonBlockingSendTo(helper::Comm &comm, int destRank, std::vector &buffer) { size_t pos = 0; buffer.resize(REROUTE_MESSAGE_SIZE); @@ -36,10 +36,15 @@ void RerouteMessage::ToBuffer(std::vector &buffer) helper::CopyToBuffer(buffer, pos, &this->m_SubStreamIdx); helper::CopyToBuffer(buffer, pos, &this->m_Offset); helper::CopyToBuffer(buffer, pos, &this->m_Size); + + comm.Isend(buffer.data(), buffer.size(), destRank, 0); } -void RerouteMessage::FromBuffer(const std::vector &buffer) +void RerouteMessage::BlockingRecvFrom(helper::Comm &comm, int srcRank, std::vector &buffer) { + buffer.resize(REROUTE_MESSAGE_SIZE); + comm.Recv(buffer.data(), REROUTE_MESSAGE_SIZE, srcRank, 0); + size_t pos = 0; helper::CopyFromBuffer(buffer.data(), pos, &this->m_MsgType); helper::CopyFromBuffer(buffer.data(), pos, &this->m_SrcRank); @@ -49,20 +54,5 @@ void RerouteMessage::FromBuffer(const std::vector &buffer) helper::CopyFromBuffer(buffer.data(), pos, &this->m_Size); } -void RerouteMessage::SendTo(helper::Comm &comm, int destRank) -{ - std::vector sendBuf; - this->ToBuffer(sendBuf); - comm.Isend(sendBuf.data(), sendBuf.size(), destRank, 0); -} - -void RerouteMessage::RecvFrom(helper::Comm &comm, int srcRank) -{ - std::vector recvBuf; - recvBuf.resize(REROUTE_MESSAGE_SIZE); - comm.Recv(recvBuf.data(), REROUTE_MESSAGE_SIZE, srcRank, 0); - this->FromBuffer(recvBuf); -} - } // end namespace helper } // end namespace adios2 diff --git a/source/adios2/helper/adiosRerouting.h b/source/adios2/helper/adiosRerouting.h index 6521c02f2f..1cc1bae082 100644 --- a/source/adios2/helper/adiosRerouting.h +++ b/source/adios2/helper/adiosRerouting.h @@ -61,17 +61,11 @@ class RerouteMessage } } - // Store a RerouteMsg so it can sent/received over mpi - void ToBuffer(std::vector &buffer); - - // Reconstitute member fields from buffer - void FromBuffer(const std::vector &buffer); - // Send the contents of this message to another rank - void SendTo(helper::Comm &comm, int destRank); + void NonBlockingSendTo(helper::Comm &comm, int destRank, std::vector &buffer); // Receive a message from another rank to populate this message - void RecvFrom(helper::Comm &comm, int srcRank); + void BlockingRecvFrom(helper::Comm &comm, int srcRank, std::vector &buffer); MessageType m_MsgType; int m_SrcRank; diff --git a/testing/adios2/engine/bp/TestBPRerouteMessage.cpp b/testing/adios2/engine/bp/TestBPRerouteMessage.cpp index 3684aa1df4..1196df43f0 100644 --- a/testing/adios2/engine/bp/TestBPRerouteMessage.cpp +++ b/testing/adios2/engine/bp/TestBPRerouteMessage.cpp @@ -22,6 +22,9 @@ void SendAndReceiveMessage(helper::Comm &comm, int destRank, int srcRank) std::cout << "Sending to " << destRank << " and expecting to receive from: " << srcRank << std::endl; + std::vector sendBuffer; + std::vector recvBuffer; + // Send a message to another rank adios2::helper::RerouteMessage origMsg; origMsg.m_MsgType = adios2::helper::RerouteMessage::MessageType::WRITER_IDLE; @@ -29,7 +32,7 @@ void SendAndReceiveMessage(helper::Comm &comm, int destRank, int srcRank) origMsg.m_DestRank = destRank; origMsg.m_Offset = 2138; origMsg.m_Size = 1213; - origMsg.SendTo(comm, destRank); + origMsg.NonBlockingSendTo(comm, destRank, sendBuffer); int ready = 0; @@ -40,7 +43,7 @@ void SendAndReceiveMessage(helper::Comm &comm, int destRank, int srcRank) // Receive a message from another (any) rank adios2::helper::RerouteMessage receivedMsg; - receivedMsg.RecvFrom(comm, static_cast(helper::Comm::Constants::CommRecvAny)); + receivedMsg.BlockingRecvFrom(comm, static_cast(helper::Comm::Constants::CommRecvAny), recvBuffer); ASSERT_EQ(receivedMsg.m_MsgType, origMsg.m_MsgType); ASSERT_EQ(receivedMsg.m_SrcRank, srcRank); @@ -56,31 +59,6 @@ class RerouteTest : public ::testing::Test RerouteTest() = default; }; -TEST_F(RerouteTest, TestMessageBuffer) -{ - adios2::helper::RerouteMessage origMsg; - - origMsg.m_MsgType = adios2::helper::RerouteMessage::MessageType::WRITER_IDLE; - origMsg.m_SrcRank = 3; - origMsg.m_DestRank = 0; - origMsg.m_SubStreamIdx = 1; - origMsg.m_Offset = 2138; - origMsg.m_Size = 1213; - - std::vector buf; - origMsg.ToBuffer(buf); - - adios2::helper::RerouteMessage newMsg; - newMsg.FromBuffer(buf); - - ASSERT_EQ(newMsg.m_MsgType, origMsg.m_MsgType); - ASSERT_EQ(newMsg.m_SrcRank, origMsg.m_SrcRank); - ASSERT_EQ(newMsg.m_DestRank, origMsg.m_DestRank); - ASSERT_EQ(newMsg.m_SubStreamIdx, origMsg.m_SubStreamIdx); - ASSERT_EQ(newMsg.m_Offset, origMsg.m_Offset); - ASSERT_EQ(newMsg.m_Size, origMsg.m_Size); -} - TEST_F(RerouteTest, TestSendReceiveRoundRobin) { helper::Comm comm = adios2::helper::CommDupMPI(MPI_COMM_WORLD); From cda5fe420d2cd0467694e9a08f734f8281745a07 Mon Sep 17 00:00:00 2001 From: Scott Wittenburg Date: Tue, 7 Oct 2025 14:58:23 -0600 Subject: [PATCH 17/37] run clang format --- .../adios2/engine/bp5/BP5Writer_WithRerouting.cpp | 14 ++++++-------- testing/adios2/engine/bp/TestBPRerouteMessage.cpp | 3 ++- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/source/adios2/engine/bp5/BP5Writer_WithRerouting.cpp b/source/adios2/engine/bp5/BP5Writer_WithRerouting.cpp index ddebf223b4..cb137651a2 100644 --- a/source/adios2/engine/bp5/BP5Writer_WithRerouting.cpp +++ b/source/adios2/engine/bp5/BP5Writer_WithRerouting.cpp @@ -26,10 +26,7 @@ namespace class BufferPool { public: - BufferPool(int size) - { - m_Pool.resize(size); - } + BufferPool(int size) { m_Pool.resize(size); } ~BufferPool() = default; @@ -41,7 +38,8 @@ class BufferPool { m_CurrentBufferIdx += 1; } - else { + else + { m_CurrentBufferIdx = 0; } @@ -119,8 +117,7 @@ void BP5Writer::ReroutingCommunicationLoop() switch ((RerouteMessage::MessageType)message.m_MsgType) { - case RerouteMessage::MessageType::DO_WRITE: - { + case RerouteMessage::MessageType::DO_WRITE: { std::unique_lock lck(m_WriteMutex); m_TargetIndex = message.m_SubStreamIdx; m_DataPos = message.m_Offset; @@ -149,7 +146,8 @@ void BP5Writer::ReroutingCommunicationLoop() writeCompleteMsg.m_DestRank = m_TargetCoordinator; writeCompleteMsg.m_SubStreamIdx = m_TargetIndex; writeCompleteMsg.m_Offset = m_DataPos; - writeCompleteMsg.NonBlockingSendTo(m_Comm, m_TargetCoordinator, sendBuffers.GetNextBuffer()); + writeCompleteMsg.NonBlockingSendTo(m_Comm, m_TargetCoordinator, + sendBuffers.GetNextBuffer()); sentFinished = true; if (!iAmSubCoord /*&& !iAmGlobalCoord*/) diff --git a/testing/adios2/engine/bp/TestBPRerouteMessage.cpp b/testing/adios2/engine/bp/TestBPRerouteMessage.cpp index 1196df43f0..4e262548be 100644 --- a/testing/adios2/engine/bp/TestBPRerouteMessage.cpp +++ b/testing/adios2/engine/bp/TestBPRerouteMessage.cpp @@ -43,7 +43,8 @@ void SendAndReceiveMessage(helper::Comm &comm, int destRank, int srcRank) // Receive a message from another (any) rank adios2::helper::RerouteMessage receivedMsg; - receivedMsg.BlockingRecvFrom(comm, static_cast(helper::Comm::Constants::CommRecvAny), recvBuffer); + receivedMsg.BlockingRecvFrom(comm, static_cast(helper::Comm::Constants::CommRecvAny), + recvBuffer); ASSERT_EQ(receivedMsg.m_MsgType, origMsg.m_MsgType); ASSERT_EQ(receivedMsg.m_SrcRank, srcRank); From 8bd0feb697770362567c0506a2e17d66c332836a Mon Sep 17 00:00:00 2001 From: Scott Wittenburg Date: Tue, 7 Oct 2025 16:54:16 -0600 Subject: [PATCH 18/37] Try to fix round-robin send/recv on OpenMPI --- testing/adios2/engine/bp/TestBPRerouteMessage.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/testing/adios2/engine/bp/TestBPRerouteMessage.cpp b/testing/adios2/engine/bp/TestBPRerouteMessage.cpp index 4e262548be..63ea5f14de 100644 --- a/testing/adios2/engine/bp/TestBPRerouteMessage.cpp +++ b/testing/adios2/engine/bp/TestBPRerouteMessage.cpp @@ -35,16 +35,16 @@ void SendAndReceiveMessage(helper::Comm &comm, int destRank, int srcRank) origMsg.NonBlockingSendTo(comm, destRank, sendBuffer); int ready = 0; + helper::Comm::Status status; while (!ready) { - comm.Iprobe(static_cast(helper::Comm::Constants::CommRecvAny), 0, &ready); + status = comm.Iprobe(static_cast(helper::Comm::Constants::CommRecvAny), 0, &ready); } // Receive a message from another (any) rank adios2::helper::RerouteMessage receivedMsg; - receivedMsg.BlockingRecvFrom(comm, static_cast(helper::Comm::Constants::CommRecvAny), - recvBuffer); + receivedMsg.BlockingRecvFrom(comm, status.Source, recvBuffer); ASSERT_EQ(receivedMsg.m_MsgType, origMsg.m_MsgType); ASSERT_EQ(receivedMsg.m_SrcRank, srcRank); From c7ff85812f0300e4126992ce35dee6ecc145389e Mon Sep 17 00:00:00 2001 From: Scott Wittenburg Date: Tue, 7 Oct 2025 17:35:10 -0600 Subject: [PATCH 19/37] avoid changing filename for BP3 invocation --- .../engine/bp/TestBPStepsFileGlobalArray.cpp | 37 ++++++++++++------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/testing/adios2/engine/bp/TestBPStepsFileGlobalArray.cpp b/testing/adios2/engine/bp/TestBPStepsFileGlobalArray.cpp index 461ea0d53c..97261c4e57 100644 --- a/testing/adios2/engine/bp/TestBPStepsFileGlobalArray.cpp +++ b/testing/adios2/engine/bp/TestBPStepsFileGlobalArray.cpp @@ -110,6 +110,18 @@ std::string AggregationTypeAlias(const std::string &aggregationType) } } +std::string ReroutingPart(const std::string &aggregationType, bool rr) +{ + if (aggregationType == "") + { + return ""; + } + + std::string ret("_RR"); + ret += (rerouting ? "Y" : "N"); + return ret; +} + class BPStepsFileGlobalArrayReaders : public BPStepsFileGlobalArray, public ::testing::WithParamInterface { @@ -122,7 +134,7 @@ TEST_P(BPStepsFileGlobalArrayReaders, EveryStep) { const ReadMode readMode = GetReadMode(); std::string fname_prefix = "BPStepsFileGlobalArray.EveryStep." + ReadModeToString(readMode) + - AggregationTypeAlias(aggType) + "_RR" + (rerouting ? "Y" : "N"); + AggregationTypeAlias(aggType) + ReroutingPart(aggType, rerouting); int mpiRank = 0, mpiSize = 1; const std::size_t NSteps = 4; @@ -160,11 +172,10 @@ TEST_P(BPStepsFileGlobalArrayReaders, EveryStep) if (aggType != "") { io.SetParameter("AggregationType", aggType); + const char *rr = (rerouting ? "true" : "false"); + io.SetParameter("EnableWriterRerouting", rr); } - const char *rr = (rerouting ? "true" : "false"); - io.SetParameter("EnableWriterRerouting", rr); - adios2::Engine engine = io.Open(fname, adios2::Mode::Write); auto var_i32 = io.DefineVariable("i32", shape, start, count); @@ -385,8 +396,8 @@ TEST_P(BPStepsFileGlobalArrayReaders, NewVarPerStep) { const ReadMode readMode = GetReadMode(); std::string fname_prefix = "BPStepsFileGlobalArray.NewVarPerStep." + - ReadModeToString(readMode) + AggregationTypeAlias(aggType) + "_RR" + - (rerouting ? "Y" : "N"); + ReadModeToString(readMode) + AggregationTypeAlias(aggType) + + ReroutingPart(aggType, rerouting); int mpiRank = 0, mpiSize = 1; const std::size_t NSteps = 4; @@ -426,11 +437,10 @@ TEST_P(BPStepsFileGlobalArrayReaders, NewVarPerStep) if (aggType != "") { io.SetParameter("AggregationType", aggType); + const char *rr = (rerouting ? "true" : "false"); + io.SetParameter("EnableWriterRerouting", rr); } - const char *rr = (rerouting ? "true" : "false"); - io.SetParameter("EnableWriterRerouting", rr); - adios2::Engine engine = io.Open(fname, adios2::Mode::Write); for (int step = 0; step < static_cast(NSteps); ++step) @@ -677,8 +687,8 @@ TEST_P(BPStepsFileGlobalArrayParameters, EveryOtherStep) const ReadMode readMode = GetReadMode(); std::string fname_prefix = "BPStepsFileGlobalArray.EveryOtherStep.Steps" + std::to_string(NSteps) + ".Oddity" + std::to_string(Oddity) + "." + - ReadModeToString(readMode) + AggregationTypeAlias(aggType) + "_RR" + - (rerouting ? "Y" : "N"); + ReadModeToString(readMode) + AggregationTypeAlias(aggType) + + ReroutingPart(aggType, rerouting); int mpiRank = 0, mpiSize = 1; #if ADIOS2_USE_MPI @@ -719,11 +729,10 @@ TEST_P(BPStepsFileGlobalArrayParameters, EveryOtherStep) if (aggType != "") { io.SetParameter("AggregationType", aggType); + const char *rr = (rerouting ? "true" : "false"); + io.SetParameter("EnableWriterRerouting", rr); } - const char *rr = (rerouting ? "true" : "false"); - io.SetParameter("EnableWriterRerouting", rr); - adios2::Engine engine = io.Open(fname, adios2::Mode::Write); auto var_i32 = io.DefineVariable("i32", shape, start, count); From 130fd9e98af302921d918546c5eb630773d13315 Mon Sep 17 00:00:00 2001 From: Scott Wittenburg Date: Tue, 7 Oct 2025 18:09:42 -0600 Subject: [PATCH 20/37] WIP: see if it is just the message type --- testing/adios2/engine/bp/TestBPRerouteMessage.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/testing/adios2/engine/bp/TestBPRerouteMessage.cpp b/testing/adios2/engine/bp/TestBPRerouteMessage.cpp index 63ea5f14de..630da444c8 100644 --- a/testing/adios2/engine/bp/TestBPRerouteMessage.cpp +++ b/testing/adios2/engine/bp/TestBPRerouteMessage.cpp @@ -46,7 +46,7 @@ void SendAndReceiveMessage(helper::Comm &comm, int destRank, int srcRank) adios2::helper::RerouteMessage receivedMsg; receivedMsg.BlockingRecvFrom(comm, status.Source, recvBuffer); - ASSERT_EQ(receivedMsg.m_MsgType, origMsg.m_MsgType); + // ASSERT_EQ(receivedMsg.m_MsgType, origMsg.m_MsgType); ASSERT_EQ(receivedMsg.m_SrcRank, srcRank); ASSERT_EQ(receivedMsg.m_DestRank, worldRank); ASSERT_EQ(receivedMsg.m_Offset, origMsg.m_Offset); From 52346028a6e286bbdaab0b1f57b7bf69066749e0 Mon Sep 17 00:00:00 2001 From: Scott Wittenburg Date: Wed, 8 Oct 2025 17:04:40 -0600 Subject: [PATCH 21/37] WIP: add debug stmts --- source/adios2/helper/adiosRerouting.cpp | 23 +++++++++++ testing/adios2/engine/bp/CMakeLists.txt | 2 +- .../adios2/engine/bp/TestBPRerouteMessage.cpp | 41 ++++++++++++++++++- 3 files changed, 64 insertions(+), 2 deletions(-) diff --git a/source/adios2/helper/adiosRerouting.cpp b/source/adios2/helper/adiosRerouting.cpp index b4a350e9fa..863c4b51fc 100644 --- a/source/adios2/helper/adiosRerouting.cpp +++ b/source/adios2/helper/adiosRerouting.cpp @@ -17,6 +17,7 @@ #include #include #include +#include #include #include /// \endcond @@ -37,6 +38,17 @@ void RerouteMessage::NonBlockingSendTo(helper::Comm &comm, int destRank, std::ve helper::CopyToBuffer(buffer, pos, &this->m_Offset); helper::CopyToBuffer(buffer, pos, &this->m_Size); + std::stringstream ss; + + ss << "Rank " << comm.Rank() << " SEND buffer of length " << buffer.size() << ": ["; + for (int i = 0; i < buffer.size(); ++i) + { + ss << " " << static_cast(buffer[i]); + } + ss << " ]\n"; + + std::cout << ss.str(); + comm.Isend(buffer.data(), buffer.size(), destRank, 0); } @@ -45,6 +57,17 @@ void RerouteMessage::BlockingRecvFrom(helper::Comm &comm, int srcRank, std::vect buffer.resize(REROUTE_MESSAGE_SIZE); comm.Recv(buffer.data(), REROUTE_MESSAGE_SIZE, srcRank, 0); + std::stringstream ss; + + ss << "Rank " << comm.Rank() << " RECV buffer of length " << buffer.size() << ": ["; + for (int i = 0; i < buffer.size(); ++i) + { + ss << " " << static_cast(buffer[i]); + } + ss << " ]\n"; + + std::cout << ss.str(); + size_t pos = 0; helper::CopyFromBuffer(buffer.data(), pos, &this->m_MsgType); helper::CopyFromBuffer(buffer.data(), pos, &this->m_SrcRank); diff --git a/testing/adios2/engine/bp/CMakeLists.txt b/testing/adios2/engine/bp/CMakeLists.txt index 0d0af774e5..a3e877f839 100644 --- a/testing/adios2/engine/bp/CMakeLists.txt +++ b/testing/adios2/engine/bp/CMakeLists.txt @@ -131,7 +131,7 @@ if (ADIOS2_HAVE_MPI) WORKING_DIRECTORY ${BP5_DIR} EXTRA_ARGS "DataSizeBased" "3" "5" "2" ) gtest_add_tests_helper(DataSizeAggregate MPI_ONLY BP Engine.BP. .BP5.DSB.Rerouting - WORKING_DIRECTORY ${BP5_DIR} EXTRA_ARGS "DataSizeBased" "2" "1" "5" "WithRerouting" + WORKING_DIRECTORY ${BP5_DIR} EXTRA_ARGS "DataSizeBased" "3" "5" "5" "WithRerouting" ) endif() diff --git a/testing/adios2/engine/bp/TestBPRerouteMessage.cpp b/testing/adios2/engine/bp/TestBPRerouteMessage.cpp index 630da444c8..7f18190e87 100644 --- a/testing/adios2/engine/bp/TestBPRerouteMessage.cpp +++ b/testing/adios2/engine/bp/TestBPRerouteMessage.cpp @@ -5,10 +5,12 @@ #include #include +#include "adios2/helper/adiosMemory.h" #include #include #include #include +#include #include using namespace adios2; @@ -46,7 +48,18 @@ void SendAndReceiveMessage(helper::Comm &comm, int destRank, int srcRank) adios2::helper::RerouteMessage receivedMsg; receivedMsg.BlockingRecvFrom(comm, status.Source, recvBuffer); - // ASSERT_EQ(receivedMsg.m_MsgType, origMsg.m_MsgType); + std::stringstream ss; + + ss << "m_MsgType, orig = " << static_cast(origMsg.m_MsgType) + << ", rcvd = " << static_cast(receivedMsg.m_MsgType) << "\n"; + ss << "m_SrcRank, orig = " << srcRank << ", rcvd = " << receivedMsg.m_SrcRank << "\n"; + ss << "m_DestRank, orig = " << worldRank << ", rcvd = " << receivedMsg.m_DestRank << "\n"; + ss << "m_Offset, orig = " << origMsg.m_Offset << ", rcvd = " << receivedMsg.m_Offset << "\n"; + ss << "m_Size, orig = " << origMsg.m_Size << ", rcvd = " << receivedMsg.m_Size << "\n"; + + std::cout << ss.str(); + + ASSERT_EQ(receivedMsg.m_MsgType, origMsg.m_MsgType); ASSERT_EQ(receivedMsg.m_SrcRank, srcRank); ASSERT_EQ(receivedMsg.m_DestRank, worldRank); ASSERT_EQ(receivedMsg.m_Offset, origMsg.m_Offset); @@ -80,6 +93,32 @@ TEST_F(RerouteTest, TestSendReceiveSelf) SendAndReceiveMessage(comm, destRank, expectedSender); } +TEST_F(RerouteTest, TestSendReceiveBare) +{ + helper::Comm comm = adios2::helper::CommDupMPI(MPI_COMM_WORLD); + + int sendToRank = worldRank >= worldSize - 1 ? 0 : worldRank + 1; + int recvFromRank = worldRank <= 0 ? worldSize - 1 : worldRank - 1; + + std::vector sendBuffer; + + sendBuffer.push_back(1); + sendBuffer.push_back(2); + sendBuffer.push_back(3); + + comm.Isend(sendBuffer.data(), sendBuffer.size(), sendToRank, 0); + + std::vector recvBuffer; + recvBuffer.resize(sendBuffer.size()); + + comm.Recv(recvBuffer.data(), recvBuffer.size(), recvFromRank, 0); + + ASSERT_EQ(recvBuffer.size(), 3); + ASSERT_EQ(static_cast(recvBuffer[0]), 1); + ASSERT_EQ(static_cast(recvBuffer[1]), 2); + ASSERT_EQ(static_cast(recvBuffer[2]), 3); +} + int main(int argc, char **argv) { MPI_Init(&argc, &argv); From 74f1e69f1d39802173c8f9ecefc6ddf8f8ac9f7e Mon Sep 17 00:00:00 2001 From: Scott Wittenburg Date: Thu, 9 Oct 2025 14:25:07 -0600 Subject: [PATCH 22/37] remove one layer of casting in Isend --- source/adios2/helper/adiosCommMPI.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/source/adios2/helper/adiosCommMPI.cpp b/source/adios2/helper/adiosCommMPI.cpp index 5012157395..da6b0a40ca 100644 --- a/source/adios2/helper/adiosCommMPI.cpp +++ b/source/adios2/helper/adiosCommMPI.cpp @@ -462,7 +462,10 @@ Comm::Req CommImplMPI::Isend(const void *buffer, size_t count, Datatype datatype { int batchSize = static_cast(count); MPI_Request mpiReq; - CheckMPIReturn(MPI_Isend(static_cast(const_cast(buffer)), batchSize, + std::cout << "Isend " << count << " items of type " << static_cast(datatype) << std::endl; + // CheckMPIReturn(MPI_Isend(static_cast(const_cast(buffer)), batchSize, + // ToMPI(datatype), dest, tag, m_MPIComm, &mpiReq), + CheckMPIReturn(MPI_Isend(const_cast(buffer), batchSize, ToMPI(datatype), dest, tag, m_MPIComm, &mpiReq), " in call to Isend with single batch " + hint + "\n"); req->m_MPIReqs.emplace_back(mpiReq); From c05d03608d8b0a21295c80b994e6a9c4c2034926 Mon Sep 17 00:00:00 2001 From: Scott Wittenburg Date: Thu, 9 Oct 2025 14:25:25 -0600 Subject: [PATCH 23/37] simplify the send/recv bare test --- source/adios2/helper/adiosCommMPI.cpp | 4 +- .../adios2/engine/bp/TestBPRerouteMessage.cpp | 44 ++++++++++++++----- 2 files changed, 36 insertions(+), 12 deletions(-) diff --git a/source/adios2/helper/adiosCommMPI.cpp b/source/adios2/helper/adiosCommMPI.cpp index da6b0a40ca..1e99502fa6 100644 --- a/source/adios2/helper/adiosCommMPI.cpp +++ b/source/adios2/helper/adiosCommMPI.cpp @@ -402,6 +402,8 @@ Comm::Status CommImplMPI::Recv(void *buf, size_t count, Datatype datatype, int s const std::string &hint) const { MPI_Status mpiStatus; + int msrc = GetMPISource(source); + std::cout << "Recv " << count << " items of type " << static_cast(datatype) << " from " << msrc << std::endl; CheckMPIReturn(MPI_Recv(buf, static_cast(count), ToMPI(datatype), GetMPISource(source), tag, m_MPIComm, &mpiStatus), hint); @@ -462,7 +464,7 @@ Comm::Req CommImplMPI::Isend(const void *buffer, size_t count, Datatype datatype { int batchSize = static_cast(count); MPI_Request mpiReq; - std::cout << "Isend " << count << " items of type " << static_cast(datatype) << std::endl; + std::cout << "Isend " << count << " items" << std::endl; // CheckMPIReturn(MPI_Isend(static_cast(const_cast(buffer)), batchSize, // ToMPI(datatype), dest, tag, m_MPIComm, &mpiReq), CheckMPIReturn(MPI_Isend(const_cast(buffer), batchSize, diff --git a/testing/adios2/engine/bp/TestBPRerouteMessage.cpp b/testing/adios2/engine/bp/TestBPRerouteMessage.cpp index 7f18190e87..c308d34611 100644 --- a/testing/adios2/engine/bp/TestBPRerouteMessage.cpp +++ b/testing/adios2/engine/bp/TestBPRerouteMessage.cpp @@ -95,28 +95,50 @@ TEST_F(RerouteTest, TestSendReceiveSelf) TEST_F(RerouteTest, TestSendReceiveBare) { - helper::Comm comm = adios2::helper::CommDupMPI(MPI_COMM_WORLD); + helper::Comm comm = adios2::helper::CommWithMPI(MPI_COMM_WORLD); int sendToRank = worldRank >= worldSize - 1 ? 0 : worldRank + 1; int recvFromRank = worldRank <= 0 ? worldSize - 1 : worldRank - 1; - std::vector sendBuffer; - - sendBuffer.push_back(1); - sendBuffer.push_back(2); - sendBuffer.push_back(3); + std::vector sendBuffer = {1, 2, 3}; comm.Isend(sendBuffer.data(), sendBuffer.size(), sendToRank, 0); std::vector recvBuffer; recvBuffer.resize(sendBuffer.size()); - comm.Recv(recvBuffer.data(), recvBuffer.size(), recvFromRank, 0); + helper::Comm::Status status = comm.Recv(recvBuffer.data(), recvBuffer.size(), recvFromRank, 0); + + std::cout << "Rank " << comm.Rank() << " received " << status.Count << " elts" << std::endl; + + ASSERT_EQ(recvBuffer.size(), sendBuffer.size()); + + for (int i = 0; i < recvBuffer.size(); ++i) + { + ASSERT_EQ(static_cast(recvBuffer[i]), static_cast(sendBuffer[i])); + } +} - ASSERT_EQ(recvBuffer.size(), 3); - ASSERT_EQ(static_cast(recvBuffer[0]), 1); - ASSERT_EQ(static_cast(recvBuffer[1]), 2); - ASSERT_EQ(static_cast(recvBuffer[2]), 3); +TEST_F(RerouteTest, TestSendReceiveMoreBare) +{ + int sendToRank = worldRank >= worldSize - 1 ? 0 : worldRank + 1; + int recvFromRank = worldRank <= 0 ? worldSize - 1 : worldRank - 1; + int count = 3; + + // Send the buffer of chars (non-blocking) + MPI_Request request = MPI_REQUEST_NULL; + char sendBuffer[count] = {1, 2, 3}; + MPI_Isend(&sendBuffer, count, MPI_CHAR, sendToRank, 0, MPI_COMM_WORLD, &request); + + // Receive the buffer of chars (blocking) + MPI_Status status; + char recvBuffer[count]; + MPI_Recv(&recvBuffer, count, MPI_CHAR, recvFromRank, 0, MPI_COMM_WORLD, &status); + + for (int i = 0; i < count; ++i) + { + ASSERT_EQ(static_cast(recvBuffer[i]), static_cast(sendBuffer[i])); + } } int main(int argc, char **argv) From 8458353742e6c6627cd89a53ab22e81f32a4b321 Mon Sep 17 00:00:00 2001 From: Scott Wittenburg Date: Wed, 29 Oct 2025 10:47:41 -0600 Subject: [PATCH 24/37] clang format --- source/adios2/helper/adiosCommMPI.cpp | 7 ++++--- testing/adios2/engine/bp/TestBPRerouteMessage.cpp | 4 ++-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/source/adios2/helper/adiosCommMPI.cpp b/source/adios2/helper/adiosCommMPI.cpp index 1e99502fa6..959dbebb2d 100644 --- a/source/adios2/helper/adiosCommMPI.cpp +++ b/source/adios2/helper/adiosCommMPI.cpp @@ -403,7 +403,8 @@ Comm::Status CommImplMPI::Recv(void *buf, size_t count, Datatype datatype, int s { MPI_Status mpiStatus; int msrc = GetMPISource(source); - std::cout << "Recv " << count << " items of type " << static_cast(datatype) << " from " << msrc << std::endl; + std::cout << "Recv " << count << " items of type " << static_cast(datatype) << " from " + << msrc << std::endl; CheckMPIReturn(MPI_Recv(buf, static_cast(count), ToMPI(datatype), GetMPISource(source), tag, m_MPIComm, &mpiStatus), hint); @@ -467,8 +468,8 @@ Comm::Req CommImplMPI::Isend(const void *buffer, size_t count, Datatype datatype std::cout << "Isend " << count << " items" << std::endl; // CheckMPIReturn(MPI_Isend(static_cast(const_cast(buffer)), batchSize, // ToMPI(datatype), dest, tag, m_MPIComm, &mpiReq), - CheckMPIReturn(MPI_Isend(const_cast(buffer), batchSize, - ToMPI(datatype), dest, tag, m_MPIComm, &mpiReq), + CheckMPIReturn(MPI_Isend(const_cast(buffer), batchSize, ToMPI(datatype), dest, tag, + m_MPIComm, &mpiReq), " in call to Isend with single batch " + hint + "\n"); req->m_MPIReqs.emplace_back(mpiReq); } diff --git a/testing/adios2/engine/bp/TestBPRerouteMessage.cpp b/testing/adios2/engine/bp/TestBPRerouteMessage.cpp index c308d34611..0c89d6c01c 100644 --- a/testing/adios2/engine/bp/TestBPRerouteMessage.cpp +++ b/testing/adios2/engine/bp/TestBPRerouteMessage.cpp @@ -3,9 +3,9 @@ * accompanying file Copyright.txt for details. */ +#include "adios2/helper/adiosMemory.h" #include #include -#include "adios2/helper/adiosMemory.h" #include #include #include @@ -51,7 +51,7 @@ void SendAndReceiveMessage(helper::Comm &comm, int destRank, int srcRank) std::stringstream ss; ss << "m_MsgType, orig = " << static_cast(origMsg.m_MsgType) - << ", rcvd = " << static_cast(receivedMsg.m_MsgType) << "\n"; + << ", rcvd = " << static_cast(receivedMsg.m_MsgType) << "\n"; ss << "m_SrcRank, orig = " << srcRank << ", rcvd = " << receivedMsg.m_SrcRank << "\n"; ss << "m_DestRank, orig = " << worldRank << ", rcvd = " << receivedMsg.m_DestRank << "\n"; ss << "m_Offset, orig = " << origMsg.m_Offset << ", rcvd = " << receivedMsg.m_Offset << "\n"; From 2f58b3159d1777e61e8de196738acc711506a2f3 Mon Sep 17 00:00:00 2001 From: Scott Wittenburg Date: Wed, 29 Oct 2025 11:29:01 -0600 Subject: [PATCH 25/37] fix compile warnings and errors --- source/adios2/helper/adiosRerouting.cpp | 4 ++-- testing/adios2/engine/bp/TestBPRerouteMessage.cpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/source/adios2/helper/adiosRerouting.cpp b/source/adios2/helper/adiosRerouting.cpp index 863c4b51fc..9361571ae2 100644 --- a/source/adios2/helper/adiosRerouting.cpp +++ b/source/adios2/helper/adiosRerouting.cpp @@ -41,7 +41,7 @@ void RerouteMessage::NonBlockingSendTo(helper::Comm &comm, int destRank, std::ve std::stringstream ss; ss << "Rank " << comm.Rank() << " SEND buffer of length " << buffer.size() << ": ["; - for (int i = 0; i < buffer.size(); ++i) + for (size_t i = 0; i < buffer.size(); ++i) { ss << " " << static_cast(buffer[i]); } @@ -60,7 +60,7 @@ void RerouteMessage::BlockingRecvFrom(helper::Comm &comm, int srcRank, std::vect std::stringstream ss; ss << "Rank " << comm.Rank() << " RECV buffer of length " << buffer.size() << ": ["; - for (int i = 0; i < buffer.size(); ++i) + for (size_t i = 0; i < buffer.size(); ++i) { ss << " " << static_cast(buffer[i]); } diff --git a/testing/adios2/engine/bp/TestBPRerouteMessage.cpp b/testing/adios2/engine/bp/TestBPRerouteMessage.cpp index 0c89d6c01c..2eecd418ac 100644 --- a/testing/adios2/engine/bp/TestBPRerouteMessage.cpp +++ b/testing/adios2/engine/bp/TestBPRerouteMessage.cpp @@ -113,7 +113,7 @@ TEST_F(RerouteTest, TestSendReceiveBare) ASSERT_EQ(recvBuffer.size(), sendBuffer.size()); - for (int i = 0; i < recvBuffer.size(); ++i) + for (size_t i = 0; i < recvBuffer.size(); ++i) { ASSERT_EQ(static_cast(recvBuffer[i]), static_cast(sendBuffer[i])); } @@ -123,7 +123,7 @@ TEST_F(RerouteTest, TestSendReceiveMoreBare) { int sendToRank = worldRank >= worldSize - 1 ? 0 : worldRank + 1; int recvFromRank = worldRank <= 0 ? worldSize - 1 : worldRank - 1; - int count = 3; + const int count = 3; // Send the buffer of chars (non-blocking) MPI_Request request = MPI_REQUEST_NULL; From d5fff62ec695d0438fdfecebaf810db2f312a133 Mon Sep 17 00:00:00 2001 From: Scott Wittenburg Date: Tue, 4 Nov 2025 16:34:09 -0700 Subject: [PATCH 26/37] Introduce STATUS_INQUIRY and STATUS_REPLY messages --- .../engine/bp5/BP5Writer_WithRerouting.cpp | 114 +++++++++++++++--- source/adios2/helper/adiosRerouting.h | 3 + 2 files changed, 99 insertions(+), 18 deletions(-) diff --git a/source/adios2/engine/bp5/BP5Writer_WithRerouting.cpp b/source/adios2/engine/bp5/BP5Writer_WithRerouting.cpp index cb137651a2..d9052e8fd1 100644 --- a/source/adios2/engine/bp5/BP5Writer_WithRerouting.cpp +++ b/source/adios2/engine/bp5/BP5Writer_WithRerouting.cpp @@ -66,11 +66,21 @@ void BP5Writer::ReroutingCommunicationLoop() int subCoord = m_Aggregator->m_AggregatorRank; bool iAmSubCoord = m_RankMPI == subCoord; - // Arbitrarily make the last rank the global coordinator - // TODO: should the global coordinator role be assigned to a subcoordinator? - // bool iAmGlobalCoord = m_RankMPI == m_Comm.Size() - 1; + + // Arbitrarily decide that the sub coordinator of the first partition is also + // the global coordinator + int globalCoord = m_Partitioning.m_Partitions[0][0]; + bool iAmGlobalCoord = m_RankMPI == globalCoord; + + // Some containers only used by augmented roles: + + // sub coordinators maintain a queue of writers std::queue writerQueue; + // global coordinator keeps track of queue sizes of each subcoord + std::vector queueSizes; + std::vector subCoordRanks; + // Sends are non-blocking. We use the pool to avoid the situation where the // buffer is destructed before the send is complete. If we start seeing // many Sends pending for a long time, that could cause us to exhaust the @@ -81,6 +91,20 @@ void BP5Writer::ReroutingCommunicationLoop() int writingRank = -1; uint64_t currentFilePos = 0; bool sentFinished = false; + bool commThreadFinished = false; + bool firstIdleMsg = true; + + if (iAmGlobalCoord) + { + queueSizes.resize(m_CommAggregators.Size()); + subCoordRanks.resize(m_CommAggregators.Size()); + + for (size_t i = 0; i < m_Partitioning.m_Partitions.size(); ++i) + { + queueSizes[i] = -1; // indicates we don't know the size of that queue + subCoordRanks[i] = m_Partitioning.m_Partitions[i][0]; + } + } if (iAmSubCoord) { @@ -103,7 +127,7 @@ void BP5Writer::ReroutingCommunicationLoop() } } - while (true) + while (!commThreadFinished) { int msgReady = 0; helper::Comm::Status status = @@ -117,19 +141,65 @@ void BP5Writer::ReroutingCommunicationLoop() switch ((RerouteMessage::MessageType)message.m_MsgType) { - case RerouteMessage::MessageType::DO_WRITE: { - std::unique_lock lck(m_WriteMutex); - m_TargetIndex = message.m_SubStreamIdx; - m_DataPos = message.m_Offset; - m_TargetCoordinator = message.m_SrcRank; - m_ReadyToWrite = true; - m_WriteCV.notify_one(); - } - break; + case RerouteMessage::MessageType::DO_WRITE: + { + std::unique_lock lck(m_WriteMutex); + m_TargetIndex = message.m_SubStreamIdx; + m_DataPos = message.m_Offset; + m_TargetCoordinator = message.m_SrcRank; + m_ReadyToWrite = true; + m_WriteCV.notify_one(); + } + break; case RerouteMessage::MessageType::WRITE_COMPLETION: currentFilePos = message.m_Offset; writingRank = -1; break; + case RerouteMessage::MessageType::GROUP_CLOSE: + m_DataPos = currentFilePos; + commThreadFinished = true; + continue; + case RerouteMessage::MessageType::WRITER_IDLE: + { + int idleWriter = message.m_SrcRank; + size_t idleGroup = static_cast(message.m_SubStreamIdx); + queueSizes[idleGroup] = 0; + + if (firstIdleMsg) + { + for (size_t i = 0; i < subCoordRanks.size(); ++i) + { + int scRank = subCoordRanks[i]; + // No need to ask the sender of the WRITER_IDLE msg how big their queue is + if (scRank != idleWriter) + { + adios2::helper::RerouteMessage inquiryMsg; + inquiryMsg.m_MsgType = RerouteMessage::MessageType::STATUS_INQUIRY; + inquiryMsg.m_SrcRank = m_RankMPI; + inquiryMsg.m_DestRank = scRank; + inquiryMsg.NonBlockingSendTo(m_Comm, scRank, sendBuffers.GetNextBuffer()); + } + } + + firstIdleMsg = false; + } + } + break; + case RerouteMessage::MessageType::STATUS_INQUIRY: + adios2::helper::RerouteMessage replyMsg; + replyMsg.m_MsgType = RerouteMessage::MessageType::STATUS_REPLY; + replyMsg.m_SrcRank = m_RankMPI; + replyMsg.m_DestRank = globalCoord; + replyMsg.m_SubStreamIdx = static_cast(m_Aggregator->m_SubStreamIndex); + replyMsg.m_Size = static_cast(writerQueue.size()); + replyMsg.NonBlockingSendTo(m_Comm, globalCoord, sendBuffers.GetNextBuffer()); + break; + case RerouteMessage::MessageType::STATUS_REPLY: + { + size_t idx = static_cast(message.m_SubStreamIdx); + queueSizes[idx] = static_cast(message.m_Size); + } + break; default: break; } @@ -150,11 +220,12 @@ void BP5Writer::ReroutingCommunicationLoop() sendBuffers.GetNextBuffer()); sentFinished = true; - if (!iAmSubCoord /*&& !iAmGlobalCoord*/) + if (!iAmSubCoord && !iAmGlobalCoord) { // My only role was to write (no communication responsibility) so I am // done at this point. - break; + commThreadFinished = true; + continue; } } } @@ -164,6 +235,7 @@ void BP5Writer::ReroutingCommunicationLoop() { if (!writerQueue.empty()) { + // Pop the queue and send DO_WRITE int nextWriter = writerQueue.front(); writerQueue.pop(); adios2::helper::RerouteMessage writeMsg; @@ -177,9 +249,15 @@ void BP5Writer::ReroutingCommunicationLoop() } else { - // TODO: Send WRITE_IDLE to the GC instead of ending the loop here - m_DataPos = currentFilePos; - break; + // Writer queue now empty, send WRITE_IDLE to the GC + // TODO: If this group is already over the threshold ratio, send + // TODO: GROUP_CLOSE instead of WRITER_IDLE + adios2::helper::RerouteMessage idleMsg; + idleMsg.m_MsgType = RerouteMessage::MessageType::WRITER_IDLE; + idleMsg.m_SrcRank = m_RankMPI; + idleMsg.m_DestRank = globalCoord; + idleMsg.m_SubStreamIdx = static_cast(m_Aggregator->m_SubStreamIndex); + idleMsg.NonBlockingSendTo(m_Comm, globalCoord, sendBuffers.GetNextBuffer()); } } } diff --git a/source/adios2/helper/adiosRerouting.h b/source/adios2/helper/adiosRerouting.h index 1cc1bae082..7e9a5cdf06 100644 --- a/source/adios2/helper/adiosRerouting.h +++ b/source/adios2/helper/adiosRerouting.h @@ -37,6 +37,9 @@ class RerouteMessage REROUTE_REQUEST, REROUTE_ACK, REROUTE_REJECT, + GROUP_CLOSE, + STATUS_INQUIRY, + STATUS_REPLY, }; std::string GetTypeString(MessageType mtype) From e317ca5aa696ab71773571c34075600447e2d2c8 Mon Sep 17 00:00:00 2001 From: Scott Wittenburg Date: Tue, 4 Nov 2025 19:37:49 -0700 Subject: [PATCH 27/37] Abstractions to help global coord manage subcoord state --- .../engine/bp5/BP5Writer_WithRerouting.cpp | 76 +++++++++++++++++-- 1 file changed, 70 insertions(+), 6 deletions(-) diff --git a/source/adios2/engine/bp5/BP5Writer_WithRerouting.cpp b/source/adios2/engine/bp5/BP5Writer_WithRerouting.cpp index d9052e8fd1..012a8d02d2 100644 --- a/source/adios2/engine/bp5/BP5Writer_WithRerouting.cpp +++ b/source/adios2/engine/bp5/BP5Writer_WithRerouting.cpp @@ -49,6 +49,63 @@ class BufferPool size_t m_CurrentBufferIdx = 0; std::vector> m_Pool; }; + +struct WriterGroupState +{ + enum class Status + { + UNKNOWN, + WRITING, + IDLE, + PENDING, + CAPACITY, + }; + + Status m_currentStatus; + int m_queueSize; + size_t m_subFileIndex; + // We could also keep track of number of times rerouted to/from +}; + +/// Avoid visiting the same rerouting source or destination groups +/// repeatedly +struct StateTraversal +{ + StateTraversal() + { + m_currentIndex = 0; + } + + int GetNext(WriterGroupState::Status status, const std::vector &state) + { + if (state.size() == 0) + { + return -1; + } + + size_t checkedCount = 0; + + while (checkedCount < state.size()) + { + checkedCount++; + m_currentIndex++; + + if (m_currentIndex >= state.size()) + { + m_currentIndex = 0; + } + + if (state[m_currentIndex].m_currentStatus == status) + { + return m_currentIndex; + } + } + + return -1; + } + + size_t m_currentIndex; +}; } namespace adios2 @@ -77,8 +134,9 @@ void BP5Writer::ReroutingCommunicationLoop() // sub coordinators maintain a queue of writers std::queue writerQueue; - // global coordinator keeps track of queue sizes of each subcoord - std::vector queueSizes; + // global coordinator keeps track of state of each subcoord + std::vector groupState; + StateTraversal idlers, writers; std::vector subCoordRanks; // Sends are non-blocking. We use the pool to avoid the situation where the @@ -96,12 +154,15 @@ void BP5Writer::ReroutingCommunicationLoop() if (iAmGlobalCoord) { - queueSizes.resize(m_CommAggregators.Size()); + groupState.resize(m_CommAggregators.Size()); subCoordRanks.resize(m_CommAggregators.Size()); for (size_t i = 0; i < m_Partitioning.m_Partitions.size(); ++i) { - queueSizes[i] = -1; // indicates we don't know the size of that queue + // Status remains unknown until we hear something specific (i.e. idle + // message or status inquiry response) + groupState[i].m_currentStatus = WriterGroupState::Status::UNKNOWN; + groupState[i].m_subFileIndex = i; subCoordRanks[i] = m_Partitioning.m_Partitions[i][0]; } } @@ -163,7 +224,8 @@ void BP5Writer::ReroutingCommunicationLoop() { int idleWriter = message.m_SrcRank; size_t idleGroup = static_cast(message.m_SubStreamIdx); - queueSizes[idleGroup] = 0; + groupState[idleGroup].m_currentStatus = WriterGroupState::Status::IDLE; + groupState[idleGroup].m_queueSize = 0; if (firstIdleMsg) { @@ -197,7 +259,9 @@ void BP5Writer::ReroutingCommunicationLoop() case RerouteMessage::MessageType::STATUS_REPLY: { size_t idx = static_cast(message.m_SubStreamIdx); - queueSizes[idx] = static_cast(message.m_Size); + int qSize = static_cast(message.m_Size); + groupState[idx].m_queueSize = qSize; + groupState[idx].m_currentStatus = qSize > 0 ? WriterGroupState::Status::WRITING : WriterGroupState::Status::IDLE; } break; default: From 0d5a88524ba79a501f017f0fbe70e6cb036aa397 Mon Sep 17 00:00:00 2001 From: Scott Wittenburg Date: Wed, 5 Nov 2025 17:14:32 -0700 Subject: [PATCH 28/37] Add messages and state management needed for actual re-routing --- .../engine/bp5/BP5Writer_WithRerouting.cpp | 257 ++++++++++++++++-- source/adios2/helper/adiosRerouting.cpp | 4 +- source/adios2/helper/adiosRerouting.h | 9 +- 3 files changed, 240 insertions(+), 30 deletions(-) diff --git a/source/adios2/engine/bp5/BP5Writer_WithRerouting.cpp b/source/adios2/engine/bp5/BP5Writer_WithRerouting.cpp index 012a8d02d2..41590a54cc 100644 --- a/source/adios2/engine/bp5/BP5Writer_WithRerouting.cpp +++ b/source/adios2/engine/bp5/BP5Writer_WithRerouting.cpp @@ -67,44 +67,93 @@ struct WriterGroupState // We could also keep track of number of times rerouted to/from }; +bool IsFinished(const WriterGroupState &state) +{ + return state.m_currentStatus == WriterGroupState::Status::CAPACITY || state.m_currentStatus == WriterGroupState::Status::IDLE; +} + /// Avoid visiting the same rerouting source or destination groups /// repeatedly -struct StateTraversal +class StateTraversal { +public: + StateTraversal() { - m_currentIndex = 0; + m_idlerIndex = 0; + m_writerIndex = 0; } - int GetNext(WriterGroupState::Status status, const std::vector &state) + enum class SearchResult + { + NOT_FOUND, + FOUND, + FINISHED, + }; + + SearchResult FindNextPair(const std::vector &state, std::pair &nextPair) + { + auto isIdle = [](WriterGroupState s) { + return s.m_currentStatus == WriterGroupState::Status::IDLE; + }; + + SearchResult idleResult = GetNext(isIdle, state, m_idlerIndex, nextPair.first); + + if (idleResult != SearchResult::FOUND) + { + return idleResult; + } + + auto canOffload = [](WriterGroupState s) { + return s.m_currentStatus == WriterGroupState::Status::WRITING && s.m_queueSize > 0; + }; + + return GetNext(canOffload, state, m_writerIndex, nextPair.second); + } + +private: + + SearchResult GetNext(bool(*checkFn)(WriterGroupState), const std::vector &state, size_t &searchIndex, size_t &foundIndex) { if (state.size() == 0) { - return -1; + return SearchResult::NOT_FOUND; } size_t checkedCount = 0; + size_t finishedCount = 0; while (checkedCount < state.size()) { checkedCount++; - m_currentIndex++; + searchIndex++; - if (m_currentIndex >= state.size()) + if (searchIndex >= state.size()) { - m_currentIndex = 0; + searchIndex = 0; } - if (state[m_currentIndex].m_currentStatus == status) + if (checkFn(state[searchIndex])) + { + foundIndex = searchIndex; + return SearchResult::FOUND; + } + else if (IsFinished(state[searchIndex])) { - return m_currentIndex; + finishedCount++; } } - return -1; + if (finishedCount == state.size()) + { + return SearchResult::FINISHED; + } + + return SearchResult::NOT_FOUND; } - size_t m_currentIndex; + size_t m_idlerIndex; + size_t m_writerIndex; }; } @@ -136,21 +185,32 @@ void BP5Writer::ReroutingCommunicationLoop() // global coordinator keeps track of state of each subcoord std::vector groupState; - StateTraversal idlers, writers; + bool needStateCheck = false; + StateTraversal pairFinder; std::vector subCoordRanks; + bool firstIdleMsg = true; // Sends are non-blocking. We use the pool to avoid the situation where the // buffer is destructed before the send is complete. If we start seeing // many Sends pending for a long time, that could cause us to exhaust the // pool of buffers, at which point we would start overwriting buffers in the // pool, potentially creating errors which are difficult to debug/diagnose. + int bufferSize = 1; + if (iAmGlobalCoord) + { + bufferSize = subCoordRanks.size(); + } + else if (iAmSubCoord) + { + bufferSize = 100; + } BufferPool sendBuffers((iAmSubCoord ? 100 : 1)); std::vector recvBuffer; int writingRank = -1; uint64_t currentFilePos = 0; bool sentFinished = false; bool commThreadFinished = false; - bool firstIdleMsg = true; + if (iAmGlobalCoord) { @@ -194,6 +254,10 @@ void BP5Writer::ReroutingCommunicationLoop() helper::Comm::Status status = m_Comm.Iprobe(static_cast(helper::Comm::Constants::CommRecvAny), 0, &msgReady); + // Many messages may not result in a state change for the global coordinator, + // so set this flag if a check is needed + needStateCheck = false; + // If there is a message ready, receive and handle it if (msgReady) { @@ -203,9 +267,10 @@ void BP5Writer::ReroutingCommunicationLoop() switch ((RerouteMessage::MessageType)message.m_MsgType) { case RerouteMessage::MessageType::DO_WRITE: + // msg for all processes { std::unique_lock lck(m_WriteMutex); - m_TargetIndex = message.m_SubStreamIdx; + m_TargetIndex = message.m_WildCard; m_DataPos = message.m_Offset; m_TargetCoordinator = message.m_SrcRank; m_ReadyToWrite = true; @@ -213,17 +278,20 @@ void BP5Writer::ReroutingCommunicationLoop() } break; case RerouteMessage::MessageType::WRITE_COMPLETION: + // msg for sub coordinator currentFilePos = message.m_Offset; writingRank = -1; break; case RerouteMessage::MessageType::GROUP_CLOSE: + // msg for sub coordinator m_DataPos = currentFilePos; commThreadFinished = true; continue; case RerouteMessage::MessageType::WRITER_IDLE: + // msg for global coordinator { int idleWriter = message.m_SrcRank; - size_t idleGroup = static_cast(message.m_SubStreamIdx); + size_t idleGroup = static_cast(message.m_WildCard); groupState[idleGroup].m_currentStatus = WriterGroupState::Status::IDLE; groupState[idleGroup].m_queueSize = 0; @@ -245,30 +313,103 @@ void BP5Writer::ReroutingCommunicationLoop() firstIdleMsg = false; } + + needStateCheck = true; + } + break; + case RerouteMessage::MessageType::WRITER_CAPACITY: + // msg for global coordinator + { + int capacityGroup = message.m_WildCard; + groupState[capacityGroup].m_currentStatus = WriterGroupState::Status::CAPACITY; + needStateCheck = true; } break; case RerouteMessage::MessageType::STATUS_INQUIRY: + // msg for sub coordinator adios2::helper::RerouteMessage replyMsg; replyMsg.m_MsgType = RerouteMessage::MessageType::STATUS_REPLY; replyMsg.m_SrcRank = m_RankMPI; replyMsg.m_DestRank = globalCoord; - replyMsg.m_SubStreamIdx = static_cast(m_Aggregator->m_SubStreamIndex); + replyMsg.m_WildCard = static_cast(m_Aggregator->m_SubStreamIndex); replyMsg.m_Size = static_cast(writerQueue.size()); replyMsg.NonBlockingSendTo(m_Comm, globalCoord, sendBuffers.GetNextBuffer()); break; case RerouteMessage::MessageType::STATUS_REPLY: + // msg for global coordinator { - size_t idx = static_cast(message.m_SubStreamIdx); + size_t subStreamIdx = static_cast(message.m_WildCard); int qSize = static_cast(message.m_Size); - groupState[idx].m_queueSize = qSize; - groupState[idx].m_currentStatus = qSize > 0 ? WriterGroupState::Status::WRITING : WriterGroupState::Status::IDLE; + groupState[subStreamIdx].m_queueSize = qSize; + groupState[subStreamIdx].m_currentStatus = qSize > 0 ? WriterGroupState::Status::WRITING : WriterGroupState::Status::IDLE; + needStateCheck = true; + } + break; + case RerouteMessage::MessageType::REROUTE_REQUEST: + // msg for sub coordinator + if (writerQueue.empty()) + { + adios2::helper::RerouteMessage rejectMsg; + rejectMsg.m_MsgType = RerouteMessage::MessageType::REROUTE_REJECT; + rejectMsg.m_SrcRank = message.m_SrcRank; + rejectMsg.m_DestRank = message.m_DestRank; + rejectMsg.NonBlockingSendTo(m_Comm, globalCoord, sendBuffers.GetNextBuffer()); } + else + { + int reroutedRank = writerQueue.front(); + writerQueue.pop(); + + adios2::helper::RerouteMessage ackMsg; + ackMsg.m_MsgType = RerouteMessage::MessageType::REROUTE_ACK; + ackMsg.m_SrcRank = message.m_SrcRank; + ackMsg.m_DestRank = message.m_DestRank; + ackMsg.m_WildCard = reroutedRank; + ackMsg.NonBlockingSendTo(m_Comm, globalCoord, sendBuffers.GetNextBuffer()); + } + break; + case RerouteMessage::MessageType::REROUTE_REJECT: + // msg for global coordinator + + // Both the src and target subcoord states return from PENDING to their prior state + groupState[message.m_SrcRank].m_currentStatus = WriterGroupState::Status::WRITING; + groupState[message.m_DestRank].m_currentStatus = WriterGroupState::Status::IDLE; + + // The reason to check here is that global coord triggers at most + // one reroute sequence per iteration through the loop. Otherwise + // we probably don't need a state check upon receipt of rejection. + needStateCheck = true; + break; + case RerouteMessage::MessageType::REROUTE_ACK: + // msg for global coordinator + + // Send the lucky volunteer another writer + adios2::helper::RerouteMessage writeMoreMsg; + writeMoreMsg.m_MsgType = RerouteMessage::MessageType::WRITE_MORE; + writeMoreMsg.m_WildCard = message.m_WildCard; // i.e. the rerouted writer rank + writeMoreMsg.NonBlockingSendTo(m_Comm, message.m_DestRank, sendBuffers.GetNextBuffer()); + + // Src subcoord state is returned to writing, dest subcoord state is now writing as well + groupState[message.m_SrcRank].m_currentStatus = WriterGroupState::Status::WRITING; + groupState[message.m_SrcRank].m_queueSize -= 1; + groupState[message.m_DestRank].m_currentStatus = WriterGroupState::Status::WRITING; + groupState[message.m_DestRank].m_queueSize += 1; + + // The reason to check here is that global coord triggers at most + // one reroute sequence per iteration through the loop. Otherwise + // we probably don't need a state check upon receiving reroute ack. + needStateCheck = true; + break; + case RerouteMessage::MessageType::WRITE_MORE: + // msg for sub coordinator + writerQueue.push(message.m_WildCard); break; default: break; } } + // All processes // Check if writing has finished, and alert the target SC { std::lock_guard lck(m_NotifMutex); @@ -278,7 +419,7 @@ void BP5Writer::ReroutingCommunicationLoop() writeCompleteMsg.m_MsgType = RerouteMessage::MessageType::WRITE_COMPLETION; writeCompleteMsg.m_SrcRank = m_RankMPI; writeCompleteMsg.m_DestRank = m_TargetCoordinator; - writeCompleteMsg.m_SubStreamIdx = m_TargetIndex; + writeCompleteMsg.m_WildCard = m_TargetIndex; writeCompleteMsg.m_Offset = m_DataPos; writeCompleteMsg.NonBlockingSendTo(m_Comm, m_TargetCoordinator, sendBuffers.GetNextBuffer()); @@ -294,6 +435,7 @@ void BP5Writer::ReroutingCommunicationLoop() } } + // Subcoordinator processes // Check if anyone is writing right now, and if not, ask the next writer to start if (iAmSubCoord && writingRank == -1) { @@ -306,7 +448,7 @@ void BP5Writer::ReroutingCommunicationLoop() writeMsg.m_MsgType = RerouteMessage::MessageType::DO_WRITE; writeMsg.m_SrcRank = m_RankMPI; writeMsg.m_DestRank = nextWriter; - writeMsg.m_SubStreamIdx = static_cast(m_Aggregator->m_SubStreamIndex); + writeMsg.m_WildCard = static_cast(m_Aggregator->m_SubStreamIndex); writeMsg.m_Offset = currentFilePos; writingRank = nextWriter; writeMsg.NonBlockingSendTo(m_Comm, nextWriter, sendBuffers.GetNextBuffer()); @@ -314,14 +456,63 @@ void BP5Writer::ReroutingCommunicationLoop() else { // Writer queue now empty, send WRITE_IDLE to the GC - // TODO: If this group is already over the threshold ratio, send - // TODO: GROUP_CLOSE instead of WRITER_IDLE adios2::helper::RerouteMessage idleMsg; idleMsg.m_MsgType = RerouteMessage::MessageType::WRITER_IDLE; idleMsg.m_SrcRank = m_RankMPI; idleMsg.m_DestRank = globalCoord; - idleMsg.m_SubStreamIdx = static_cast(m_Aggregator->m_SubStreamIndex); + idleMsg.m_WildCard = static_cast(m_Aggregator->m_SubStreamIndex); idleMsg.NonBlockingSendTo(m_Comm, globalCoord, sendBuffers.GetNextBuffer()); + + // TODO: If this group is already over the threshold ratio, send + // TODO: WRITER_CAPACITY instead of WRITER_IDLE + + // adios2::helper::RerouteMessage capacityMsg; + // capacityMsg.m_MsgType = RerouteMessage::MessageType::WRITER_CAPACITY; + // // capacityMsg.m_SrcRank = m_RankMPI; + // // capacityMsg.m_DestRank = globalCoord; + // capacityMsg.m_WildCard = static_cast(m_Aggregator->m_SubStreamIndex); + // capacityMsg.NonBlockingSendTo(m_Comm, globalCoord, sendBuffers.GetNextBuffer()); + } + } + + // Global coordinator process + // Look for possible reroute-to / reroute-from pairs + if (iAmGlobalCoord && needStateCheck) + { + std::pair nextPair; + StateTraversal::SearchResult result = pairFinder.FindNextPair(groupState, nextPair); + + if (result == StateTraversal::SearchResult::FOUND) + { + // Finding a pair means there was both an idle group and a writing + // group with at least one writer in its queue. With these, we will + // initiate a reroute sequence. + size_t idleIdx = nextPair.first; + int idleSubcoordRank = subCoordRanks[idleIdx]; + size_t writerIdx = nextPair.second; + int writerSubcoordRank = subCoordRanks[writerIdx]; + + adios2::helper::RerouteMessage rerouteReqMsg; + rerouteReqMsg.m_MsgType = RerouteMessage::MessageType::REROUTE_REQUEST; + rerouteReqMsg.m_SrcRank = writerSubcoordRank; + rerouteReqMsg.m_DestRank = idleSubcoordRank; + rerouteReqMsg.NonBlockingSendTo(m_Comm, writerSubcoordRank, sendBuffers.GetNextBuffer()); + + groupState[idleIdx].m_currentStatus = WriterGroupState::Status::PENDING; + groupState[writerIdx].m_currentStatus = WriterGroupState::Status::PENDING; + } + else if (result == StateTraversal::SearchResult::FINISHED) + { + // If we didn't find a pair, it could be because all the groups are + // done writing (either idle or possibly at capacity). In that case, + // we need to release the subcoordinators (and ourself) from their + // comm loop. + for (size_t scIdx = 0; scIdx < subCoordRanks.size(); ++scIdx) + { + adios2::helper::RerouteMessage closeMsg; + closeMsg.m_MsgType = RerouteMessage::MessageType::GROUP_CLOSE; + closeMsg.NonBlockingSendTo(m_Comm, scIdx, sendBuffers.GetNextBuffer()); + } } } } @@ -362,6 +553,24 @@ void BP5Writer::WriteData_WithRerouting(format::BufferV *Data) // Do the writing + if (m_TargetIndex != m_Aggregator->m_SubStreamIndex) + { + // We were rerouted! + // Update this or metadata will be incorrect + m_Aggregator->m_SubStreamIndex = m_TargetIndex; + // Gah! We might need to open this file, but we already went through + // the InitAggregator(), InitTransports(), and InitBPBuffer() methods, + // which means we already shared that we were writing to a different + // subfile! + // Refactoring of those Init...() methods is required: + // - exchange subfile to populate m_WriterSubfileMap *after* + // everybody gets done writing, instead of before (we could + // maybe do this always, not just in DSB+Rerouting case) + // - need a path through those Init...() methods that doesn't + // invoke any collective calls (taken only by rerouted ranks in + // DSB+Rerouting case) + } + // align to PAGE_SIZE m_DataPos += helper::PaddingToAlignOffset(m_DataPos, m_Parameters.StripeSize); m_StartDataPos = m_DataPos; diff --git a/source/adios2/helper/adiosRerouting.cpp b/source/adios2/helper/adiosRerouting.cpp index 9361571ae2..5c13445e91 100644 --- a/source/adios2/helper/adiosRerouting.cpp +++ b/source/adios2/helper/adiosRerouting.cpp @@ -34,7 +34,7 @@ void RerouteMessage::NonBlockingSendTo(helper::Comm &comm, int destRank, std::ve helper::CopyToBuffer(buffer, pos, &this->m_MsgType); helper::CopyToBuffer(buffer, pos, &this->m_SrcRank); helper::CopyToBuffer(buffer, pos, &this->m_DestRank); - helper::CopyToBuffer(buffer, pos, &this->m_SubStreamIdx); + helper::CopyToBuffer(buffer, pos, &this->m_WildCard); helper::CopyToBuffer(buffer, pos, &this->m_Offset); helper::CopyToBuffer(buffer, pos, &this->m_Size); @@ -72,7 +72,7 @@ void RerouteMessage::BlockingRecvFrom(helper::Comm &comm, int srcRank, std::vect helper::CopyFromBuffer(buffer.data(), pos, &this->m_MsgType); helper::CopyFromBuffer(buffer.data(), pos, &this->m_SrcRank); helper::CopyFromBuffer(buffer.data(), pos, &this->m_DestRank); - helper::CopyFromBuffer(buffer.data(), pos, &this->m_SubStreamIdx); + helper::CopyFromBuffer(buffer.data(), pos, &this->m_WildCard); helper::CopyFromBuffer(buffer.data(), pos, &this->m_Offset); helper::CopyFromBuffer(buffer.data(), pos, &this->m_Size); } diff --git a/source/adios2/helper/adiosRerouting.h b/source/adios2/helper/adiosRerouting.h index 7e9a5cdf06..67aeae2301 100644 --- a/source/adios2/helper/adiosRerouting.h +++ b/source/adios2/helper/adiosRerouting.h @@ -31,8 +31,9 @@ class RerouteMessage enum class MessageType { DO_WRITE, + WRITER_CAPACITY, WRITER_IDLE, - WRITE_SUBMISSION, + WRITE_MORE, WRITE_COMPLETION, REROUTE_REQUEST, REROUTE_ACK, @@ -52,8 +53,8 @@ class RerouteMessage case MessageType::WRITER_IDLE: return std::string("WRITER_IDLE"); break; - case MessageType::WRITE_SUBMISSION: - return std::string("WRITE_SUBMISSION"); + case MessageType::WRITE_MORE: + return std::string("WRITE_MORE"); break; case MessageType::WRITE_COMPLETION: return std::string("WRITE_COMPLETION"); @@ -73,7 +74,7 @@ class RerouteMessage MessageType m_MsgType; int m_SrcRank; int m_DestRank; - int m_SubStreamIdx; + int m_WildCard; uint64_t m_Offset; uint64_t m_Size; From c6c1664985cf9c80d3e17c00f62026b5cee8f7bf Mon Sep 17 00:00:00 2001 From: Scott Wittenburg Date: Wed, 5 Nov 2025 17:21:04 -0700 Subject: [PATCH 29/37] run clang format --- .../engine/bp5/BP5Writer_WithRerouting.cpp | 32 ++++++++++++------- 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/source/adios2/engine/bp5/BP5Writer_WithRerouting.cpp b/source/adios2/engine/bp5/BP5Writer_WithRerouting.cpp index 41590a54cc..6732e46617 100644 --- a/source/adios2/engine/bp5/BP5Writer_WithRerouting.cpp +++ b/source/adios2/engine/bp5/BP5Writer_WithRerouting.cpp @@ -69,7 +69,8 @@ struct WriterGroupState bool IsFinished(const WriterGroupState &state) { - return state.m_currentStatus == WriterGroupState::Status::CAPACITY || state.m_currentStatus == WriterGroupState::Status::IDLE; + return state.m_currentStatus == WriterGroupState::Status::CAPACITY || + state.m_currentStatus == WriterGroupState::Status::IDLE; } /// Avoid visiting the same rerouting source or destination groups @@ -77,7 +78,6 @@ bool IsFinished(const WriterGroupState &state) class StateTraversal { public: - StateTraversal() { m_idlerIndex = 0; @@ -91,7 +91,8 @@ class StateTraversal FINISHED, }; - SearchResult FindNextPair(const std::vector &state, std::pair &nextPair) + SearchResult FindNextPair(const std::vector &state, + std::pair &nextPair) { auto isIdle = [](WriterGroupState s) { return s.m_currentStatus == WriterGroupState::Status::IDLE; @@ -112,8 +113,9 @@ class StateTraversal } private: - - SearchResult GetNext(bool(*checkFn)(WriterGroupState), const std::vector &state, size_t &searchIndex, size_t &foundIndex) + SearchResult GetNext(bool (*checkFn)(WriterGroupState), + const std::vector &state, size_t &searchIndex, + size_t &foundIndex) { if (state.size() == 0) { @@ -211,7 +213,6 @@ void BP5Writer::ReroutingCommunicationLoop() bool sentFinished = false; bool commThreadFinished = false; - if (iAmGlobalCoord) { groupState.resize(m_CommAggregators.Size()); @@ -300,14 +301,16 @@ void BP5Writer::ReroutingCommunicationLoop() for (size_t i = 0; i < subCoordRanks.size(); ++i) { int scRank = subCoordRanks[i]; - // No need to ask the sender of the WRITER_IDLE msg how big their queue is + // No need to ask the sender of the WRITER_IDLE msg how big their queue + // is if (scRank != idleWriter) { adios2::helper::RerouteMessage inquiryMsg; inquiryMsg.m_MsgType = RerouteMessage::MessageType::STATUS_INQUIRY; inquiryMsg.m_SrcRank = m_RankMPI; inquiryMsg.m_DestRank = scRank; - inquiryMsg.NonBlockingSendTo(m_Comm, scRank, sendBuffers.GetNextBuffer()); + inquiryMsg.NonBlockingSendTo(m_Comm, scRank, + sendBuffers.GetNextBuffer()); } } @@ -341,7 +344,9 @@ void BP5Writer::ReroutingCommunicationLoop() size_t subStreamIdx = static_cast(message.m_WildCard); int qSize = static_cast(message.m_Size); groupState[subStreamIdx].m_queueSize = qSize; - groupState[subStreamIdx].m_currentStatus = qSize > 0 ? WriterGroupState::Status::WRITING : WriterGroupState::Status::IDLE; + groupState[subStreamIdx].m_currentStatus = + qSize > 0 ? WriterGroupState::Status::WRITING + : WriterGroupState::Status::IDLE; needStateCheck = true; } break; @@ -387,9 +392,11 @@ void BP5Writer::ReroutingCommunicationLoop() adios2::helper::RerouteMessage writeMoreMsg; writeMoreMsg.m_MsgType = RerouteMessage::MessageType::WRITE_MORE; writeMoreMsg.m_WildCard = message.m_WildCard; // i.e. the rerouted writer rank - writeMoreMsg.NonBlockingSendTo(m_Comm, message.m_DestRank, sendBuffers.GetNextBuffer()); + writeMoreMsg.NonBlockingSendTo(m_Comm, message.m_DestRank, + sendBuffers.GetNextBuffer()); - // Src subcoord state is returned to writing, dest subcoord state is now writing as well + // Src subcoord state is returned to writing, dest subcoord state is now writing as + // well groupState[message.m_SrcRank].m_currentStatus = WriterGroupState::Status::WRITING; groupState[message.m_SrcRank].m_queueSize -= 1; groupState[message.m_DestRank].m_currentStatus = WriterGroupState::Status::WRITING; @@ -496,7 +503,8 @@ void BP5Writer::ReroutingCommunicationLoop() rerouteReqMsg.m_MsgType = RerouteMessage::MessageType::REROUTE_REQUEST; rerouteReqMsg.m_SrcRank = writerSubcoordRank; rerouteReqMsg.m_DestRank = idleSubcoordRank; - rerouteReqMsg.NonBlockingSendTo(m_Comm, writerSubcoordRank, sendBuffers.GetNextBuffer()); + rerouteReqMsg.NonBlockingSendTo(m_Comm, writerSubcoordRank, + sendBuffers.GetNextBuffer()); groupState[idleIdx].m_currentStatus = WriterGroupState::Status::PENDING; groupState[writerIdx].m_currentStatus = WriterGroupState::Status::PENDING; From d66bfe060ed2bd061bc85cc737be16a26095a115 Mon Sep 17 00:00:00 2001 From: Scott Wittenburg Date: Fri, 7 Nov 2025 13:51:22 -0700 Subject: [PATCH 30/37] Debug/fix cycles --- source/adios2/engine/bp5/BP5Engine.h | 2 +- .../engine/bp5/BP5Writer_WithRerouting.cpp | 139 ++++++++++++++---- source/adios2/helper/adiosCommMPI.cpp | 12 +- source/adios2/helper/adiosRerouting.cpp | 57 +++++-- source/adios2/helper/adiosRerouting.h | 1 + 5 files changed, 159 insertions(+), 52 deletions(-) diff --git a/source/adios2/engine/bp5/BP5Engine.h b/source/adios2/engine/bp5/BP5Engine.h index 6d18689081..9cb287af98 100644 --- a/source/adios2/engine/bp5/BP5Engine.h +++ b/source/adios2/engine/bp5/BP5Engine.h @@ -142,7 +142,7 @@ class BP5Engine MACRO(BurstBufferDrain, Bool, bool, true) \ MACRO(BurstBufferPath, String, std::string, "") \ MACRO(NodeLocal, Bool, bool, false) \ - MACRO(verbose, Int, int, 0) \ + MACRO(verbose, Int, int, 5) \ MACRO(NumAggregators, UInt, unsigned int, 0) \ MACRO(AggregatorRatio, UInt, unsigned int, 0) \ MACRO(NumSubFiles, UInt, unsigned int, 0) \ diff --git a/source/adios2/engine/bp5/BP5Writer_WithRerouting.cpp b/source/adios2/engine/bp5/BP5Writer_WithRerouting.cpp index 6732e46617..5a5540d368 100644 --- a/source/adios2/engine/bp5/BP5Writer_WithRerouting.cpp +++ b/source/adios2/engine/bp5/BP5Writer_WithRerouting.cpp @@ -172,6 +172,8 @@ void BP5Writer::ReroutingCommunicationLoop() { using RerouteMessage = adios2::helper::RerouteMessage; + std::cout << "Rank " << m_RankMPI << " Enter ReroutingCommunicationLoop" << std::endl; + int subCoord = m_Aggregator->m_AggregatorRank; bool iAmSubCoord = m_RankMPI == subCoord; @@ -211,7 +213,9 @@ void BP5Writer::ReroutingCommunicationLoop() int writingRank = -1; uint64_t currentFilePos = 0; bool sentFinished = false; - bool commThreadFinished = false; + bool receivedGroupClose = false; + bool expectingWriteCompletion = false; + bool sentIdle = false; if (iAmGlobalCoord) { @@ -249,7 +253,17 @@ void BP5Writer::ReroutingCommunicationLoop() } } - while (!commThreadFinished) + auto keepGoing = [&]() + { + if (iAmSubCoord) + { + return !receivedGroupClose || expectingWriteCompletion; + } + + return !receivedGroupClose; + }; + + while (keepGoing()) { int msgReady = 0; helper::Comm::Status status = @@ -268,30 +282,40 @@ void BP5Writer::ReroutingCommunicationLoop() switch ((RerouteMessage::MessageType)message.m_MsgType) { case RerouteMessage::MessageType::DO_WRITE: + std::cout << "Rank " << m_RankMPI << " received DO_WRITE" << std::endl; // msg for all processes { std::unique_lock lck(m_WriteMutex); m_TargetIndex = message.m_WildCard; m_DataPos = message.m_Offset; - m_TargetCoordinator = message.m_SrcRank; + // m_TargetCoordinator = message.m_SrcRank; + m_TargetCoordinator = status.Source; m_ReadyToWrite = true; m_WriteCV.notify_one(); } break; case RerouteMessage::MessageType::WRITE_COMPLETION: + std::cout << "Rank " << m_RankMPI << " received WRITE_COMPLETION from rank " + << status.Source << std::endl; // msg for sub coordinator currentFilePos = message.m_Offset; writingRank = -1; + expectingWriteCompletion = false; break; case RerouteMessage::MessageType::GROUP_CLOSE: + std::cout << "Rank " << m_RankMPI << " received GROUP_CLOSE from rank " + << status.Source << std::endl; // msg for sub coordinator m_DataPos = currentFilePos; - commThreadFinished = true; + receivedGroupClose = true; continue; case RerouteMessage::MessageType::WRITER_IDLE: + std::cout << "Rank " << m_RankMPI << " received WRITER_IDLE from rank " + << status.Source << std::endl; // msg for global coordinator { - int idleWriter = message.m_SrcRank; + // int idleWriter = message.m_SrcRank; + int idleWriter = status.Source; size_t idleGroup = static_cast(message.m_WildCard); groupState[idleGroup].m_currentStatus = WriterGroupState::Status::IDLE; groupState[idleGroup].m_queueSize = 0; @@ -305,6 +329,8 @@ void BP5Writer::ReroutingCommunicationLoop() // is if (scRank != idleWriter) { + std::cout << "GC (" << m_RankMPI << ") sending STATUS_INQUIRY to rank " + << scRank << std::endl; adios2::helper::RerouteMessage inquiryMsg; inquiryMsg.m_MsgType = RerouteMessage::MessageType::STATUS_INQUIRY; inquiryMsg.m_SrcRank = m_RankMPI; @@ -321,6 +347,8 @@ void BP5Writer::ReroutingCommunicationLoop() } break; case RerouteMessage::MessageType::WRITER_CAPACITY: + std::cout << "Rank " << m_RankMPI << " received WRITER_CAPACITY from rank " + << status.Source << std::endl; // msg for global coordinator { int capacityGroup = message.m_WildCard; @@ -329,6 +357,8 @@ void BP5Writer::ReroutingCommunicationLoop() } break; case RerouteMessage::MessageType::STATUS_INQUIRY: + std::cout << "Rank " << m_RankMPI << " received STATUS_INQUIRY from rank " + << status.Source << std::endl; // msg for sub coordinator adios2::helper::RerouteMessage replyMsg; replyMsg.m_MsgType = RerouteMessage::MessageType::STATUS_REPLY; @@ -339,6 +369,8 @@ void BP5Writer::ReroutingCommunicationLoop() replyMsg.NonBlockingSendTo(m_Comm, globalCoord, sendBuffers.GetNextBuffer()); break; case RerouteMessage::MessageType::STATUS_REPLY: + std::cout << "Rank " << m_RankMPI << " received STATUS_REPLY from rank " + << status.Source << std::endl; // msg for global coordinator { size_t subStreamIdx = static_cast(message.m_WildCard); @@ -351,9 +383,13 @@ void BP5Writer::ReroutingCommunicationLoop() } break; case RerouteMessage::MessageType::REROUTE_REQUEST: + std::cout << "Rank " << m_RankMPI << " received REROUTE_REQUEST from rank " + << status.Source << std::endl; // msg for sub coordinator if (writerQueue.empty()) { + std::cout << "Rank " << m_RankMPI << " sending REROUTE_REJECT to rank " + << globalCoord << std::endl; adios2::helper::RerouteMessage rejectMsg; rejectMsg.m_MsgType = RerouteMessage::MessageType::REROUTE_REJECT; rejectMsg.m_SrcRank = message.m_SrcRank; @@ -362,6 +398,8 @@ void BP5Writer::ReroutingCommunicationLoop() } else { + std::cout << "Rank " << m_RankMPI << " sending REROUTE_ACK to rank " + << globalCoord << std::endl; int reroutedRank = writerQueue.front(); writerQueue.pop(); @@ -374,11 +412,21 @@ void BP5Writer::ReroutingCommunicationLoop() } break; case RerouteMessage::MessageType::REROUTE_REJECT: + std::cout << "Rank " << m_RankMPI << " received REROUTE_REJECT from rank " + << status.Source << std::endl; // msg for global coordinator // Both the src and target subcoord states return from PENDING to their prior state - groupState[message.m_SrcRank].m_currentStatus = WriterGroupState::Status::WRITING; - groupState[message.m_DestRank].m_currentStatus = WriterGroupState::Status::IDLE; + if (groupState[message.m_SrcRank].m_currentStatus == WriterGroupState::Status::PENDING) + { + groupState[message.m_SrcRank].m_currentStatus = WriterGroupState::Status::WRITING; + groupState[message.m_SrcRank].m_queueSize = 0; + } + + if (groupState[message.m_DestRank].m_currentStatus == WriterGroupState::Status::PENDING) + { + groupState[message.m_DestRank].m_currentStatus = WriterGroupState::Status::IDLE; + } // The reason to check here is that global coord triggers at most // one reroute sequence per iteration through the loop. Otherwise @@ -386,8 +434,13 @@ void BP5Writer::ReroutingCommunicationLoop() needStateCheck = true; break; case RerouteMessage::MessageType::REROUTE_ACK: + std::cout << "Rank " << m_RankMPI << " received REROUTE_ACK from rank " + << status.Source << std::endl; // msg for global coordinator + std::cout << "Rank " << m_RankMPI << " sending WRITE_MORE to rank " + << message.m_DestRank << std::endl; + // Send the lucky volunteer another writer adios2::helper::RerouteMessage writeMoreMsg; writeMoreMsg.m_MsgType = RerouteMessage::MessageType::WRITE_MORE; @@ -400,7 +453,7 @@ void BP5Writer::ReroutingCommunicationLoop() groupState[message.m_SrcRank].m_currentStatus = WriterGroupState::Status::WRITING; groupState[message.m_SrcRank].m_queueSize -= 1; groupState[message.m_DestRank].m_currentStatus = WriterGroupState::Status::WRITING; - groupState[message.m_DestRank].m_queueSize += 1; + // groupState[message.m_DestRank].m_queueSize += 1; // The reason to check here is that global coord triggers at most // one reroute sequence per iteration through the loop. Otherwise @@ -408,8 +461,11 @@ void BP5Writer::ReroutingCommunicationLoop() needStateCheck = true; break; case RerouteMessage::MessageType::WRITE_MORE: + std::cout << "Rank " << m_RankMPI << " received WRITE_MORE from rank " + << status.Source << std::endl; // msg for sub coordinator writerQueue.push(message.m_WildCard); + sentIdle = false; break; default: break; @@ -428,17 +484,30 @@ void BP5Writer::ReroutingCommunicationLoop() writeCompleteMsg.m_DestRank = m_TargetCoordinator; writeCompleteMsg.m_WildCard = m_TargetIndex; writeCompleteMsg.m_Offset = m_DataPos; - writeCompleteMsg.NonBlockingSendTo(m_Comm, m_TargetCoordinator, - sendBuffers.GetNextBuffer()); - sentFinished = true; if (!iAmSubCoord && !iAmGlobalCoord) { + std::cout << "Rank " << m_RankMPI << " notifying SC (" << m_TargetCoordinator + << ") of write completion -- BLOCKING" << std::endl; // My only role was to write (no communication responsibility) so I am - // done at this point. - commThreadFinished = true; + // done at this point. However, I need to do a blocking send because I + // am about to return from this function, at which point my buffer pool + // goes away. + writeCompleteMsg.BlockingSendTo(m_Comm, m_TargetCoordinator, + sendBuffers.GetNextBuffer()); + + receivedGroupClose = true; continue; } + else + { + std::cout << "Rank " << m_RankMPI << " notifying SC (" << m_TargetCoordinator + << ") of write completion -- NONBLOCKING" << std::endl; + writeCompleteMsg.NonBlockingSendTo(m_Comm, m_TargetCoordinator, + sendBuffers.GetNextBuffer()); + } + + sentFinished = true; } } @@ -450,6 +519,7 @@ void BP5Writer::ReroutingCommunicationLoop() { // Pop the queue and send DO_WRITE int nextWriter = writerQueue.front(); + std::cout << "Rank " << m_RankMPI << " sending DO_WRITE to " << nextWriter << std::endl; writerQueue.pop(); adios2::helper::RerouteMessage writeMsg; writeMsg.m_MsgType = RerouteMessage::MessageType::DO_WRITE; @@ -458,10 +528,12 @@ void BP5Writer::ReroutingCommunicationLoop() writeMsg.m_WildCard = static_cast(m_Aggregator->m_SubStreamIndex); writeMsg.m_Offset = currentFilePos; writingRank = nextWriter; + expectingWriteCompletion = true; writeMsg.NonBlockingSendTo(m_Comm, nextWriter, sendBuffers.GetNextBuffer()); } - else + else if (!sentIdle) { + std::cout << "Rank " << m_RankMPI << " sending WRITER_IDLE to gc (" << globalCoord << ")" << std::endl; // Writer queue now empty, send WRITE_IDLE to the GC adios2::helper::RerouteMessage idleMsg; idleMsg.m_MsgType = RerouteMessage::MessageType::WRITER_IDLE; @@ -469,6 +541,7 @@ void BP5Writer::ReroutingCommunicationLoop() idleMsg.m_DestRank = globalCoord; idleMsg.m_WildCard = static_cast(m_Aggregator->m_SubStreamIndex); idleMsg.NonBlockingSendTo(m_Comm, globalCoord, sendBuffers.GetNextBuffer()); + sentIdle = true; // TODO: If this group is already over the threshold ratio, send // TODO: WRITER_CAPACITY instead of WRITER_IDLE @@ -486,11 +559,14 @@ void BP5Writer::ReroutingCommunicationLoop() // Look for possible reroute-to / reroute-from pairs if (iAmGlobalCoord && needStateCheck) { + std:: cout << "GC (" << m_RankMPI << ") looking for reroute candidate pair" << std::endl; std::pair nextPair; StateTraversal::SearchResult result = pairFinder.FindNextPair(groupState, nextPair); if (result == StateTraversal::SearchResult::FOUND) { + std:: cout << "GC (" << m_RankMPI << ") found reroute candidate pair (" << nextPair.first + << ", " << nextPair.second << "), sending REROUTE_REQUEST" << std::endl; // Finding a pair means there was both an idle group and a writing // group with at least one writer in its queue. With these, we will // initiate a reroute sequence. @@ -511,6 +587,12 @@ void BP5Writer::ReroutingCommunicationLoop() } else if (result == StateTraversal::SearchResult::FINISHED) { + std:: cout << "GC (" << m_RankMPI << ") all work finished, sending GROUP_CLOSE" << std::endl; + for (size_t i = 0; i < groupState.size(); ++i) + { + std::cout << " group " << i << " status: " << static_cast(groupState[i].m_currentStatus) + << ", queue size: " << groupState[i].m_queueSize << std::endl; + } // If we didn't find a pair, it could be because all the groups are // done writing (either idle or possibly at capacity). In that case, // we need to release the subcoordinators (and ourself) from their @@ -524,6 +606,8 @@ void BP5Writer::ReroutingCommunicationLoop() } } } + + std::cout << "Rank " << m_RankMPI << " Exit ReroutingCommunicationLoop" << std::endl; } void BP5Writer::WriteData_WithRerouting(format::BufferV *Data) @@ -551,7 +635,7 @@ void BP5Writer::WriteData_WithRerouting(format::BufferV *Data) std::thread commThread(&BP5Writer::ReroutingCommunicationLoop, this); - // std::cout << "Background thread for rank " << m_RankMPI << " is now running" << std::endl; + std::cout << "Background thread for rank " << m_RankMPI << " is now running" << std::endl; // wait until communication thread indicates it's our turn to write { @@ -561,22 +645,19 @@ void BP5Writer::WriteData_WithRerouting(format::BufferV *Data) // Do the writing + std::cout << "Rank " << m_RankMPI << " signaled to write" << std::endl; + + // Check if we need to update which file we are writing to if (m_TargetIndex != m_Aggregator->m_SubStreamIndex) { - // We were rerouted! - // Update this or metadata will be incorrect + // We were rerouted! Our aggregator subfile index is later exchanged with other + // ranks to be written to metadata, so update it here or the metadata will be + // wrong. m_Aggregator->m_SubStreamIndex = m_TargetIndex; - // Gah! We might need to open this file, but we already went through - // the InitAggregator(), InitTransports(), and InitBPBuffer() methods, - // which means we already shared that we were writing to a different - // subfile! - // Refactoring of those Init...() methods is required: - // - exchange subfile to populate m_WriterSubfileMap *after* - // everybody gets done writing, instead of before (we could - // maybe do this always, not just in DSB+Rerouting case) - // - need a path through those Init...() methods that doesn't - // invoke any collective calls (taken only by rerouted ranks in - // DSB+Rerouting case) + + // Open the subfile without doing any collective communications, since the global + // coordinator ensures only one rerouted rank opens this file at a time. + OpenSubfile(false); } // align to PAGE_SIZE @@ -584,7 +665,7 @@ void BP5Writer::WriteData_WithRerouting(format::BufferV *Data) m_StartDataPos = m_DataPos; std::vector DataVec = Data->DataVec(); - // TODO: use m_TargetIndex here instead of being locked into aggregator's index + AggTransportData &aggData = m_AggregatorSpecifics.at(GetCacheKey(m_Aggregator)); aggData.m_FileDataManager.WriteFileAt(DataVec.data(), DataVec.size(), m_StartDataPos); diff --git a/source/adios2/helper/adiosCommMPI.cpp b/source/adios2/helper/adiosCommMPI.cpp index 959dbebb2d..eafef64df2 100644 --- a/source/adios2/helper/adiosCommMPI.cpp +++ b/source/adios2/helper/adiosCommMPI.cpp @@ -402,9 +402,9 @@ Comm::Status CommImplMPI::Recv(void *buf, size_t count, Datatype datatype, int s const std::string &hint) const { MPI_Status mpiStatus; - int msrc = GetMPISource(source); - std::cout << "Recv " << count << " items of type " << static_cast(datatype) << " from " - << msrc << std::endl; + // int msrc = GetMPISource(source); + // std::cout << "Recv " << count << " items of type " << static_cast(datatype) << " from " + // << msrc << std::endl; CheckMPIReturn(MPI_Recv(buf, static_cast(count), ToMPI(datatype), GetMPISource(source), tag, m_MPIComm, &mpiStatus), hint); @@ -465,9 +465,9 @@ Comm::Req CommImplMPI::Isend(const void *buffer, size_t count, Datatype datatype { int batchSize = static_cast(count); MPI_Request mpiReq; - std::cout << "Isend " << count << " items" << std::endl; - // CheckMPIReturn(MPI_Isend(static_cast(const_cast(buffer)), batchSize, - // ToMPI(datatype), dest, tag, m_MPIComm, &mpiReq), + // std::cout << "Isend " << count << " items" << std::endl; + // // CheckMPIReturn(MPI_Isend(static_cast(const_cast(buffer)), batchSize, + // // ToMPI(datatype), dest, tag, m_MPIComm, &mpiReq), CheckMPIReturn(MPI_Isend(const_cast(buffer), batchSize, ToMPI(datatype), dest, tag, m_MPIComm, &mpiReq), " in call to Isend with single batch " + hint + "\n"); diff --git a/source/adios2/helper/adiosRerouting.cpp b/source/adios2/helper/adiosRerouting.cpp index 5c13445e91..1f6ab322ed 100644 --- a/source/adios2/helper/adiosRerouting.cpp +++ b/source/adios2/helper/adiosRerouting.cpp @@ -38,35 +38,60 @@ void RerouteMessage::NonBlockingSendTo(helper::Comm &comm, int destRank, std::ve helper::CopyToBuffer(buffer, pos, &this->m_Offset); helper::CopyToBuffer(buffer, pos, &this->m_Size); - std::stringstream ss; + // std::stringstream ss; - ss << "Rank " << comm.Rank() << " SEND buffer of length " << buffer.size() << ": ["; - for (size_t i = 0; i < buffer.size(); ++i) - { - ss << " " << static_cast(buffer[i]); - } - ss << " ]\n"; + // ss << "Rank " << comm.Rank() << " SEND buffer of length " << buffer.size() << ": ["; + // for (size_t i = 0; i < buffer.size(); ++i) + // { + // ss << " " << static_cast(buffer[i]); + // } + // ss << " ]\n"; - std::cout << ss.str(); + // std::cout << ss.str(); comm.Isend(buffer.data(), buffer.size(), destRank, 0); } +void RerouteMessage::BlockingSendTo(helper::Comm &comm, int destRank, std::vector &buffer) +{ + size_t pos = 0; + buffer.resize(REROUTE_MESSAGE_SIZE); + helper::CopyToBuffer(buffer, pos, &this->m_MsgType); + helper::CopyToBuffer(buffer, pos, &this->m_SrcRank); + helper::CopyToBuffer(buffer, pos, &this->m_DestRank); + helper::CopyToBuffer(buffer, pos, &this->m_WildCard); + helper::CopyToBuffer(buffer, pos, &this->m_Offset); + helper::CopyToBuffer(buffer, pos, &this->m_Size); + + // std::stringstream ss; + + // ss << "Rank " << comm.Rank() << " SEND buffer of length " << buffer.size() << ": ["; + // for (size_t i = 0; i < buffer.size(); ++i) + // { + // ss << " " << static_cast(buffer[i]); + // } + // ss << " ]\n"; + + // std::cout << ss.str(); + + comm.Send(buffer.data(), buffer.size(), destRank, 0); +} + void RerouteMessage::BlockingRecvFrom(helper::Comm &comm, int srcRank, std::vector &buffer) { buffer.resize(REROUTE_MESSAGE_SIZE); comm.Recv(buffer.data(), REROUTE_MESSAGE_SIZE, srcRank, 0); - std::stringstream ss; + // std::stringstream ss; - ss << "Rank " << comm.Rank() << " RECV buffer of length " << buffer.size() << ": ["; - for (size_t i = 0; i < buffer.size(); ++i) - { - ss << " " << static_cast(buffer[i]); - } - ss << " ]\n"; + // ss << "Rank " << comm.Rank() << " RECV buffer of length " << buffer.size() << ": ["; + // for (size_t i = 0; i < buffer.size(); ++i) + // { + // ss << " " << static_cast(buffer[i]); + // } + // ss << " ]\n"; - std::cout << ss.str(); + // std::cout << ss.str(); size_t pos = 0; helper::CopyFromBuffer(buffer.data(), pos, &this->m_MsgType); diff --git a/source/adios2/helper/adiosRerouting.h b/source/adios2/helper/adiosRerouting.h index 67aeae2301..a5d1e65a45 100644 --- a/source/adios2/helper/adiosRerouting.h +++ b/source/adios2/helper/adiosRerouting.h @@ -67,6 +67,7 @@ class RerouteMessage // Send the contents of this message to another rank void NonBlockingSendTo(helper::Comm &comm, int destRank, std::vector &buffer); + void BlockingSendTo(helper::Comm &comm, int destRank, std::vector &buffer); // Receive a message from another rank to populate this message void BlockingRecvFrom(helper::Comm &comm, int srcRank, std::vector &buffer); From ad0c1ad8a450f1db9a0df8decc62e1b120b956e4 Mon Sep 17 00:00:00 2001 From: Scott Wittenburg Date: Fri, 7 Nov 2025 18:03:24 -0700 Subject: [PATCH 31/37] More debug/fix cycles --- .../engine/bp5/BP5Writer_WithRerouting.cpp | 101 +++++++++++++++--- source/adios2/helper/adiosRerouting.h | 1 + 2 files changed, 89 insertions(+), 13 deletions(-) diff --git a/source/adios2/engine/bp5/BP5Writer_WithRerouting.cpp b/source/adios2/engine/bp5/BP5Writer_WithRerouting.cpp index 5a5540d368..bfa1600ca3 100644 --- a/source/adios2/engine/bp5/BP5Writer_WithRerouting.cpp +++ b/source/adios2/engine/bp5/BP5Writer_WithRerouting.cpp @@ -59,6 +59,7 @@ struct WriterGroupState IDLE, PENDING, CAPACITY, + CLOSED, }; Status m_currentStatus; @@ -193,6 +194,9 @@ void BP5Writer::ReroutingCommunicationLoop() StateTraversal pairFinder; std::vector subCoordRanks; bool firstIdleMsg = true; + std::set closeAcksNeeded; + std::set groupIdlesNeeded; + bool waitingForCloseAcks = false; // Sends are non-blocking. We use the pool to avoid the situation where the // buffer is destructed before the send is complete. If we start seeing @@ -229,6 +233,8 @@ void BP5Writer::ReroutingCommunicationLoop() groupState[i].m_currentStatus = WriterGroupState::Status::UNKNOWN; groupState[i].m_subFileIndex = i; subCoordRanks[i] = m_Partitioning.m_Partitions[i][0]; + closeAcksNeeded.insert(subCoordRanks[i]); + groupIdlesNeeded.insert(subCoordRanks[i]); } } @@ -257,7 +263,7 @@ void BP5Writer::ReroutingCommunicationLoop() { if (iAmSubCoord) { - return !receivedGroupClose || expectingWriteCompletion; + return !receivedGroupClose || expectingWriteCompletion || waitingForCloseAcks; } return !receivedGroupClose; @@ -308,7 +314,42 @@ void BP5Writer::ReroutingCommunicationLoop() // msg for sub coordinator m_DataPos = currentFilePos; receivedGroupClose = true; - continue; + + std::cout << "Rank " << m_RankMPI << " sending GROUP_CLOSE_ACK to rank " + << globalCoord << std::endl; + adios2::helper::RerouteMessage closeAckMsg; + closeAckMsg.m_MsgType = RerouteMessage::MessageType::GROUP_CLOSE_ACK; + closeAckMsg.m_SrcRank = m_RankMPI; + closeAckMsg.m_DestRank = globalCoord; + closeAckMsg.m_WildCard = m_Aggregator->m_SubStreamIndex; + closeAckMsg.BlockingSendTo(m_Comm, globalCoord, + sendBuffers.GetNextBuffer()); + break; + case RerouteMessage::MessageType::GROUP_CLOSE_ACK: + // msg for global coordinator + { + std::cout << "Rank " << m_RankMPI << " received GROUP_CLOSE_ACK from rank " + << status.Source << std::endl; + + size_t ackGroupIdx = static_cast(message.m_WildCard); + groupState[ackGroupIdx].m_currentStatus = WriterGroupState::Status::CLOSED; + closeAcksNeeded.erase(status.Source); + + if (!closeAcksNeeded.empty()) + { + std::stringstream ss; + + ss << "Rank " << m_RankMPI << " still awaiting close acks from: [ "; + for (int n : closeAcksNeeded) + { + ss << n << " "; + } + ss << " ]\n"; + + std::cout << ss.str(); + } + } + break; case RerouteMessage::MessageType::WRITER_IDLE: std::cout << "Rank " << m_RankMPI << " received WRITER_IDLE from rank " << status.Source << std::endl; @@ -319,6 +360,7 @@ void BP5Writer::ReroutingCommunicationLoop() size_t idleGroup = static_cast(message.m_WildCard); groupState[idleGroup].m_currentStatus = WriterGroupState::Status::IDLE; groupState[idleGroup].m_queueSize = 0; + groupIdlesNeeded.erase(idleWriter); if (firstIdleMsg) { @@ -448,6 +490,8 @@ void BP5Writer::ReroutingCommunicationLoop() writeMoreMsg.NonBlockingSendTo(m_Comm, message.m_DestRank, sendBuffers.GetNextBuffer()); + groupIdlesNeeded.insert(message.m_DestRank); + // Src subcoord state is returned to writing, dest subcoord state is now writing as // well groupState[message.m_SrcRank].m_currentStatus = WriterGroupState::Status::WRITING; @@ -474,9 +518,11 @@ void BP5Writer::ReroutingCommunicationLoop() // All processes // Check if writing has finished, and alert the target SC + if (!sentFinished) { + // std::cout << "Rank " << m_RankMPI << " attempting to get lock on m_NotifMutex" << std::endl; std::lock_guard lck(m_NotifMutex); - if (m_FinishedWriting && !sentFinished) + if (m_FinishedWriting) { adios2::helper::RerouteMessage writeCompleteMsg; writeCompleteMsg.m_MsgType = RerouteMessage::MessageType::WRITE_COMPLETION; @@ -488,13 +534,13 @@ void BP5Writer::ReroutingCommunicationLoop() if (!iAmSubCoord && !iAmGlobalCoord) { std::cout << "Rank " << m_RankMPI << " notifying SC (" << m_TargetCoordinator - << ") of write completion -- BLOCKING" << std::endl; + << ") of write completion -- BLOCKING" << std::endl; // My only role was to write (no communication responsibility) so I am // done at this point. However, I need to do a blocking send because I // am about to return from this function, at which point my buffer pool // goes away. writeCompleteMsg.BlockingSendTo(m_Comm, m_TargetCoordinator, - sendBuffers.GetNextBuffer()); + sendBuffers.GetNextBuffer()); receivedGroupClose = true; continue; @@ -502,9 +548,9 @@ void BP5Writer::ReroutingCommunicationLoop() else { std::cout << "Rank " << m_RankMPI << " notifying SC (" << m_TargetCoordinator - << ") of write completion -- NONBLOCKING" << std::endl; + << ") of write completion -- NONBLOCKING" << std::endl; writeCompleteMsg.NonBlockingSendTo(m_Comm, m_TargetCoordinator, - sendBuffers.GetNextBuffer()); + sendBuffers.GetNextBuffer()); } sentFinished = true; @@ -557,7 +603,7 @@ void BP5Writer::ReroutingCommunicationLoop() // Global coordinator process // Look for possible reroute-to / reroute-from pairs - if (iAmGlobalCoord && needStateCheck) + if (iAmGlobalCoord && needStateCheck && !waitingForCloseAcks) { std:: cout << "GC (" << m_RankMPI << ") looking for reroute candidate pair" << std::endl; std::pair nextPair; @@ -585,9 +631,9 @@ void BP5Writer::ReroutingCommunicationLoop() groupState[idleIdx].m_currentStatus = WriterGroupState::Status::PENDING; groupState[writerIdx].m_currentStatus = WriterGroupState::Status::PENDING; } - else if (result == StateTraversal::SearchResult::FINISHED) + else if (result == StateTraversal::SearchResult::FINISHED && groupIdlesNeeded.empty()) { - std:: cout << "GC (" << m_RankMPI << ") all work finished, sending GROUP_CLOSE" << std::endl; + for (size_t i = 0; i < groupState.size(); ++i) { std::cout << " group " << i << " status: " << static_cast(groupState[i].m_currentStatus) @@ -599,10 +645,39 @@ void BP5Writer::ReroutingCommunicationLoop() // comm loop. for (size_t scIdx = 0; scIdx < subCoordRanks.size(); ++scIdx) { - adios2::helper::RerouteMessage closeMsg; - closeMsg.m_MsgType = RerouteMessage::MessageType::GROUP_CLOSE; - closeMsg.NonBlockingSendTo(m_Comm, scIdx, sendBuffers.GetNextBuffer()); + if (subCoordRanks[scIdx] != globalCoord) + { + std:: cout << "Rank " << m_RankMPI << " sending GROUP_CLOSE to rank " << subCoordRanks[scIdx] << std::endl; + adios2::helper::RerouteMessage closeMsg; + closeMsg.m_MsgType = RerouteMessage::MessageType::GROUP_CLOSE; + closeMsg.NonBlockingSendTo(m_Comm, subCoordRanks[scIdx], sendBuffers.GetNextBuffer()); + } + } + + std::cout << "Rank " << m_RankMPI << " marking my own close ack as received" << std::endl; + receivedGroupClose = true; + closeAcksNeeded.erase(globalCoord); + + waitingForCloseAcks = true; + } + } + + if (iAmGlobalCoord) + { + if (waitingForCloseAcks && closeAcksNeeded.empty()) + { + std::cout << "Rank " << m_RankMPI << " got all my close acks" << std::endl; + // global coordinator received the final close ack, now it can leave + waitingForCloseAcks = false; + } + else + { + std::cout << "Rank " << m_RankMPI << " still need " << closeAcksNeeded.size() << " close acks [ "; + for (int n : closeAcksNeeded) + { + std::cout << " " << n; } + std::cout << " ]" << std::endl; } } } diff --git a/source/adios2/helper/adiosRerouting.h b/source/adios2/helper/adiosRerouting.h index a5d1e65a45..fdf59e4c67 100644 --- a/source/adios2/helper/adiosRerouting.h +++ b/source/adios2/helper/adiosRerouting.h @@ -39,6 +39,7 @@ class RerouteMessage REROUTE_ACK, REROUTE_REJECT, GROUP_CLOSE, + GROUP_CLOSE_ACK, STATUS_INQUIRY, STATUS_REPLY, }; From 4b039ec88fc4b7615010755a7f227ff6db4ceaa0 Mon Sep 17 00:00:00 2001 From: Scott Wittenburg Date: Fri, 7 Nov 2025 18:04:08 -0700 Subject: [PATCH 32/37] run clang-format --- .../engine/bp5/BP5Writer_WithRerouting.cpp | 59 +++++++++++-------- 1 file changed, 35 insertions(+), 24 deletions(-) diff --git a/source/adios2/engine/bp5/BP5Writer_WithRerouting.cpp b/source/adios2/engine/bp5/BP5Writer_WithRerouting.cpp index bfa1600ca3..4b8f0d6bff 100644 --- a/source/adios2/engine/bp5/BP5Writer_WithRerouting.cpp +++ b/source/adios2/engine/bp5/BP5Writer_WithRerouting.cpp @@ -259,8 +259,7 @@ void BP5Writer::ReroutingCommunicationLoop() } } - auto keepGoing = [&]() - { + auto keepGoing = [&]() { if (iAmSubCoord) { return !receivedGroupClose || expectingWriteCompletion || waitingForCloseAcks; @@ -316,14 +315,13 @@ void BP5Writer::ReroutingCommunicationLoop() receivedGroupClose = true; std::cout << "Rank " << m_RankMPI << " sending GROUP_CLOSE_ACK to rank " - << globalCoord << std::endl; + << globalCoord << std::endl; adios2::helper::RerouteMessage closeAckMsg; closeAckMsg.m_MsgType = RerouteMessage::MessageType::GROUP_CLOSE_ACK; closeAckMsg.m_SrcRank = m_RankMPI; closeAckMsg.m_DestRank = globalCoord; closeAckMsg.m_WildCard = m_Aggregator->m_SubStreamIndex; - closeAckMsg.BlockingSendTo(m_Comm, globalCoord, - sendBuffers.GetNextBuffer()); + closeAckMsg.BlockingSendTo(m_Comm, globalCoord, sendBuffers.GetNextBuffer()); break; case RerouteMessage::MessageType::GROUP_CLOSE_ACK: // msg for global coordinator @@ -371,8 +369,9 @@ void BP5Writer::ReroutingCommunicationLoop() // is if (scRank != idleWriter) { - std::cout << "GC (" << m_RankMPI << ") sending STATUS_INQUIRY to rank " - << scRank << std::endl; + std::cout << "GC (" << m_RankMPI + << ") sending STATUS_INQUIRY to rank " << scRank + << std::endl; adios2::helper::RerouteMessage inquiryMsg; inquiryMsg.m_MsgType = RerouteMessage::MessageType::STATUS_INQUIRY; inquiryMsg.m_SrcRank = m_RankMPI; @@ -459,13 +458,16 @@ void BP5Writer::ReroutingCommunicationLoop() // msg for global coordinator // Both the src and target subcoord states return from PENDING to their prior state - if (groupState[message.m_SrcRank].m_currentStatus == WriterGroupState::Status::PENDING) + if (groupState[message.m_SrcRank].m_currentStatus == + WriterGroupState::Status::PENDING) { - groupState[message.m_SrcRank].m_currentStatus = WriterGroupState::Status::WRITING; + groupState[message.m_SrcRank].m_currentStatus = + WriterGroupState::Status::WRITING; groupState[message.m_SrcRank].m_queueSize = 0; } - if (groupState[message.m_DestRank].m_currentStatus == WriterGroupState::Status::PENDING) + if (groupState[message.m_DestRank].m_currentStatus == + WriterGroupState::Status::PENDING) { groupState[message.m_DestRank].m_currentStatus = WriterGroupState::Status::IDLE; } @@ -520,7 +522,8 @@ void BP5Writer::ReroutingCommunicationLoop() // Check if writing has finished, and alert the target SC if (!sentFinished) { - // std::cout << "Rank " << m_RankMPI << " attempting to get lock on m_NotifMutex" << std::endl; + // std::cout << "Rank " << m_RankMPI << " attempting to get lock on m_NotifMutex" << + // std::endl; std::lock_guard lck(m_NotifMutex); if (m_FinishedWriting) { @@ -534,7 +537,7 @@ void BP5Writer::ReroutingCommunicationLoop() if (!iAmSubCoord && !iAmGlobalCoord) { std::cout << "Rank " << m_RankMPI << " notifying SC (" << m_TargetCoordinator - << ") of write completion -- BLOCKING" << std::endl; + << ") of write completion -- BLOCKING" << std::endl; // My only role was to write (no communication responsibility) so I am // done at this point. However, I need to do a blocking send because I // am about to return from this function, at which point my buffer pool @@ -548,9 +551,9 @@ void BP5Writer::ReroutingCommunicationLoop() else { std::cout << "Rank " << m_RankMPI << " notifying SC (" << m_TargetCoordinator - << ") of write completion -- NONBLOCKING" << std::endl; + << ") of write completion -- NONBLOCKING" << std::endl; writeCompleteMsg.NonBlockingSendTo(m_Comm, m_TargetCoordinator, - sendBuffers.GetNextBuffer()); + sendBuffers.GetNextBuffer()); } sentFinished = true; @@ -565,7 +568,8 @@ void BP5Writer::ReroutingCommunicationLoop() { // Pop the queue and send DO_WRITE int nextWriter = writerQueue.front(); - std::cout << "Rank " << m_RankMPI << " sending DO_WRITE to " << nextWriter << std::endl; + std::cout << "Rank " << m_RankMPI << " sending DO_WRITE to " << nextWriter + << std::endl; writerQueue.pop(); adios2::helper::RerouteMessage writeMsg; writeMsg.m_MsgType = RerouteMessage::MessageType::DO_WRITE; @@ -579,7 +583,8 @@ void BP5Writer::ReroutingCommunicationLoop() } else if (!sentIdle) { - std::cout << "Rank " << m_RankMPI << " sending WRITER_IDLE to gc (" << globalCoord << ")" << std::endl; + std::cout << "Rank " << m_RankMPI << " sending WRITER_IDLE to gc (" << globalCoord + << ")" << std::endl; // Writer queue now empty, send WRITE_IDLE to the GC adios2::helper::RerouteMessage idleMsg; idleMsg.m_MsgType = RerouteMessage::MessageType::WRITER_IDLE; @@ -605,14 +610,15 @@ void BP5Writer::ReroutingCommunicationLoop() // Look for possible reroute-to / reroute-from pairs if (iAmGlobalCoord && needStateCheck && !waitingForCloseAcks) { - std:: cout << "GC (" << m_RankMPI << ") looking for reroute candidate pair" << std::endl; + std::cout << "GC (" << m_RankMPI << ") looking for reroute candidate pair" << std::endl; std::pair nextPair; StateTraversal::SearchResult result = pairFinder.FindNextPair(groupState, nextPair); if (result == StateTraversal::SearchResult::FOUND) { - std:: cout << "GC (" << m_RankMPI << ") found reroute candidate pair (" << nextPair.first - << ", " << nextPair.second << "), sending REROUTE_REQUEST" << std::endl; + std::cout << "GC (" << m_RankMPI << ") found reroute candidate pair (" + << nextPair.first << ", " << nextPair.second + << "), sending REROUTE_REQUEST" << std::endl; // Finding a pair means there was both an idle group and a writing // group with at least one writer in its queue. With these, we will // initiate a reroute sequence. @@ -636,7 +642,8 @@ void BP5Writer::ReroutingCommunicationLoop() for (size_t i = 0; i < groupState.size(); ++i) { - std::cout << " group " << i << " status: " << static_cast(groupState[i].m_currentStatus) + std::cout << " group " << i + << " status: " << static_cast(groupState[i].m_currentStatus) << ", queue size: " << groupState[i].m_queueSize << std::endl; } // If we didn't find a pair, it could be because all the groups are @@ -647,14 +654,17 @@ void BP5Writer::ReroutingCommunicationLoop() { if (subCoordRanks[scIdx] != globalCoord) { - std:: cout << "Rank " << m_RankMPI << " sending GROUP_CLOSE to rank " << subCoordRanks[scIdx] << std::endl; + std::cout << "Rank " << m_RankMPI << " sending GROUP_CLOSE to rank " + << subCoordRanks[scIdx] << std::endl; adios2::helper::RerouteMessage closeMsg; closeMsg.m_MsgType = RerouteMessage::MessageType::GROUP_CLOSE; - closeMsg.NonBlockingSendTo(m_Comm, subCoordRanks[scIdx], sendBuffers.GetNextBuffer()); + closeMsg.NonBlockingSendTo(m_Comm, subCoordRanks[scIdx], + sendBuffers.GetNextBuffer()); } } - std::cout << "Rank " << m_RankMPI << " marking my own close ack as received" << std::endl; + std::cout << "Rank " << m_RankMPI << " marking my own close ack as received" + << std::endl; receivedGroupClose = true; closeAcksNeeded.erase(globalCoord); @@ -672,7 +682,8 @@ void BP5Writer::ReroutingCommunicationLoop() } else { - std::cout << "Rank " << m_RankMPI << " still need " << closeAcksNeeded.size() << " close acks [ "; + std::cout << "Rank " << m_RankMPI << " still need " << closeAcksNeeded.size() + << " close acks [ "; for (int n : closeAcksNeeded) { std::cout << " " << n; From 67f86a67bb33791f0a1b624e1154e4d7231c1529 Mon Sep 17 00:00:00 2001 From: Scott Wittenburg Date: Mon, 1 Dec 2025 17:57:26 -0700 Subject: [PATCH 33/37] Fix a few bugs and adjust debug logs - One certain issue was storing globalState using subcoordinator rank ids, rather than the index of the groups they coordinate. Fix that by adding a mapping from rank id to global state index. - When rerouting happens on time step zero, opening without append could result in a zero block appearing where the rerouted rank wrote. Fix that by allowing to specify forceAppend when rerouted ranks open their files. --- source/adios2/engine/bp5/BP5Writer.cpp | 4 +- source/adios2/engine/bp5/BP5Writer.h | 2 +- .../engine/bp5/BP5Writer_WithRerouting.cpp | 166 ++++++++++-------- 3 files changed, 99 insertions(+), 73 deletions(-) diff --git a/source/adios2/engine/bp5/BP5Writer.cpp b/source/adios2/engine/bp5/BP5Writer.cpp index 807578373f..8ce1ecd4e2 100644 --- a/source/adios2/engine/bp5/BP5Writer.cpp +++ b/source/adios2/engine/bp5/BP5Writer.cpp @@ -1689,7 +1689,7 @@ void BP5Writer::InitTransports() InitBPBuffer(); } -void BP5Writer::OpenSubfile(bool useComm) +void BP5Writer::OpenSubfile(bool useComm, bool forceAppend) { std::string cacheKey = GetCacheKey(m_Aggregator); auto search = m_AggregatorSpecifics.find(cacheKey); @@ -1736,7 +1736,7 @@ void BP5Writer::OpenSubfile(bool useComm) if (m_Parameters.AggregationType == (int)AggregationType::DataSizeBased) { - if (m_WriterStep > 0) + if (m_WriterStep > 0 || forceAppend) { // override the mode to be append if we're opening a file that // was already opened by another rank. diff --git a/source/adios2/engine/bp5/BP5Writer.h b/source/adios2/engine/bp5/BP5Writer.h index ad9c1e658f..594a113561 100644 --- a/source/adios2/engine/bp5/BP5Writer.h +++ b/source/adios2/engine/bp5/BP5Writer.h @@ -78,7 +78,7 @@ class BP5Writer : public BP5Engine, public core::Engine std::map m_AggregatorSpecifics; helper::RankPartition GetPartitionInfo(const uint64_t rankDataSize, const int subStreams, helper::Comm const &parentComm); - void OpenSubfile(const bool useComm = true); + void OpenSubfile(const bool useComm = true, const bool forceAppend = false); helper::Partitioning m_Partitioning; /** Single object controlling BP buffering */ diff --git a/source/adios2/engine/bp5/BP5Writer_WithRerouting.cpp b/source/adios2/engine/bp5/BP5Writer_WithRerouting.cpp index 4b8f0d6bff..4c647128cc 100644 --- a/source/adios2/engine/bp5/BP5Writer_WithRerouting.cpp +++ b/source/adios2/engine/bp5/BP5Writer_WithRerouting.cpp @@ -193,6 +193,7 @@ void BP5Writer::ReroutingCommunicationLoop() bool needStateCheck = false; StateTraversal pairFinder; std::vector subCoordRanks; + std::map scRankToIndex; bool firstIdleMsg = true; std::set closeAcksNeeded; std::set groupIdlesNeeded; @@ -233,6 +234,7 @@ void BP5Writer::ReroutingCommunicationLoop() groupState[i].m_currentStatus = WriterGroupState::Status::UNKNOWN; groupState[i].m_subFileIndex = i; subCoordRanks[i] = m_Partitioning.m_Partitions[i][0]; + scRankToIndex[m_Partitioning.m_Partitions[i][0]] = i; closeAcksNeeded.insert(subCoordRanks[i]); groupIdlesNeeded.insert(subCoordRanks[i]); } @@ -332,20 +334,6 @@ void BP5Writer::ReroutingCommunicationLoop() size_t ackGroupIdx = static_cast(message.m_WildCard); groupState[ackGroupIdx].m_currentStatus = WriterGroupState::Status::CLOSED; closeAcksNeeded.erase(status.Source); - - if (!closeAcksNeeded.empty()) - { - std::stringstream ss; - - ss << "Rank " << m_RankMPI << " still awaiting close acks from: [ "; - for (int n : closeAcksNeeded) - { - ss << n << " "; - } - ss << " ]\n"; - - std::cout << ss.str(); - } } break; case RerouteMessage::MessageType::WRITER_IDLE: @@ -455,56 +443,65 @@ void BP5Writer::ReroutingCommunicationLoop() case RerouteMessage::MessageType::REROUTE_REJECT: std::cout << "Rank " << m_RankMPI << " received REROUTE_REJECT from rank " << status.Source << std::endl; - // msg for global coordinator - - // Both the src and target subcoord states return from PENDING to their prior state - if (groupState[message.m_SrcRank].m_currentStatus == - WriterGroupState::Status::PENDING) { - groupState[message.m_SrcRank].m_currentStatus = - WriterGroupState::Status::WRITING; - groupState[message.m_SrcRank].m_queueSize = 0; - } + // msg for global coordinator - if (groupState[message.m_DestRank].m_currentStatus == - WriterGroupState::Status::PENDING) - { - groupState[message.m_DestRank].m_currentStatus = WriterGroupState::Status::IDLE; - } + size_t srcIdx = scRankToIndex[message.m_SrcRank]; + size_t destIdx = scRankToIndex[message.m_DestRank]; + + // Both the src and target subcoord states return from PENDING to their prior state + if (groupState[srcIdx].m_currentStatus == + WriterGroupState::Status::PENDING) + { + groupState[srcIdx].m_currentStatus = + WriterGroupState::Status::WRITING; + groupState[srcIdx].m_queueSize = 0; + } - // The reason to check here is that global coord triggers at most - // one reroute sequence per iteration through the loop. Otherwise - // we probably don't need a state check upon receipt of rejection. - needStateCheck = true; + if (groupState[destIdx].m_currentStatus == + WriterGroupState::Status::PENDING) + { + groupState[destIdx].m_currentStatus = WriterGroupState::Status::IDLE; + } + + // The reason to check here is that global coord triggers at most + // one reroute sequence per iteration through the loop. Otherwise + // we probably don't need a state check upon receipt of rejection. + needStateCheck = true; + } break; case RerouteMessage::MessageType::REROUTE_ACK: std::cout << "Rank " << m_RankMPI << " received REROUTE_ACK from rank " << status.Source << std::endl; // msg for global coordinator + { + + std::cout << "Rank " << m_RankMPI << " sending WRITE_MORE to rank " + << message.m_DestRank << std::endl; - std::cout << "Rank " << m_RankMPI << " sending WRITE_MORE to rank " - << message.m_DestRank << std::endl; - - // Send the lucky volunteer another writer - adios2::helper::RerouteMessage writeMoreMsg; - writeMoreMsg.m_MsgType = RerouteMessage::MessageType::WRITE_MORE; - writeMoreMsg.m_WildCard = message.m_WildCard; // i.e. the rerouted writer rank - writeMoreMsg.NonBlockingSendTo(m_Comm, message.m_DestRank, - sendBuffers.GetNextBuffer()); - - groupIdlesNeeded.insert(message.m_DestRank); - - // Src subcoord state is returned to writing, dest subcoord state is now writing as - // well - groupState[message.m_SrcRank].m_currentStatus = WriterGroupState::Status::WRITING; - groupState[message.m_SrcRank].m_queueSize -= 1; - groupState[message.m_DestRank].m_currentStatus = WriterGroupState::Status::WRITING; - // groupState[message.m_DestRank].m_queueSize += 1; - - // The reason to check here is that global coord triggers at most - // one reroute sequence per iteration through the loop. Otherwise - // we probably don't need a state check upon receiving reroute ack. - needStateCheck = true; + // Send the lucky volunteer another writer + adios2::helper::RerouteMessage writeMoreMsg; + writeMoreMsg.m_MsgType = RerouteMessage::MessageType::WRITE_MORE; + writeMoreMsg.m_WildCard = message.m_WildCard; // i.e. the rerouted writer rank + writeMoreMsg.NonBlockingSendTo(m_Comm, message.m_DestRank, + sendBuffers.GetNextBuffer()); + + groupIdlesNeeded.insert(message.m_DestRank); + + // Src subcoord state is returned to writing, dest subcoord state is now writing as + // well + size_t srcIdx = scRankToIndex[message.m_SrcRank]; + size_t destIdx = scRankToIndex[message.m_DestRank]; + groupState[srcIdx].m_currentStatus = WriterGroupState::Status::WRITING; + groupState[srcIdx].m_queueSize -= 1; + groupState[destIdx].m_currentStatus = WriterGroupState::Status::WRITING; + // groupState[destIdx].m_queueSize += 1; + + // The reason to check here is that global coord triggers at most + // one reroute sequence per iteration through the loop. Otherwise + // we probably don't need a state check upon receiving reroute ack. + needStateCheck = true; + } break; case RerouteMessage::MessageType::WRITE_MORE: std::cout << "Rank " << m_RankMPI << " received WRITE_MORE from rank " @@ -608,7 +605,7 @@ void BP5Writer::ReroutingCommunicationLoop() // Global coordinator process // Look for possible reroute-to / reroute-from pairs - if (iAmGlobalCoord && needStateCheck && !waitingForCloseAcks) + if (iAmGlobalCoord /*&& needStateCheck*/ && !waitingForCloseAcks) { std::cout << "GC (" << m_RankMPI << ") looking for reroute candidate pair" << std::endl; std::pair nextPair; @@ -670,25 +667,51 @@ void BP5Writer::ReroutingCommunicationLoop() waitingForCloseAcks = true; } + else + { + std::cout << "candidate pair search returned "; + + switch (result) + { + case StateTraversal::SearchResult::NOT_FOUND: + std::cout << "NOT FOUND" << std::endl; + std::cout << " states: [ "; + for (size_t i = 0; i < groupState.size(); ++i) + { + std::cout << static_cast(groupState[i].m_currentStatus) << " "; + } + std::cout << "]" << std::endl; + break; + case StateTraversal::SearchResult::FOUND: + std::cout << "FOUND" << std::endl; + break; + case StateTraversal::SearchResult::FINISHED: + std::cout << "FINISHED" << std::endl; + break; + } + } } if (iAmGlobalCoord) { - if (waitingForCloseAcks && closeAcksNeeded.empty()) - { - std::cout << "Rank " << m_RankMPI << " got all my close acks" << std::endl; - // global coordinator received the final close ack, now it can leave - waitingForCloseAcks = false; - } - else + if (waitingForCloseAcks) { - std::cout << "Rank " << m_RankMPI << " still need " << closeAcksNeeded.size() - << " close acks [ "; - for (int n : closeAcksNeeded) + if (closeAcksNeeded.empty()) { - std::cout << " " << n; + std::cout << "Rank " << m_RankMPI << " got all my close acks" << std::endl; + // global coordinator received the final close ack, now it can leave + waitingForCloseAcks = false; + } + else + { + std::cout << "Rank " << m_RankMPI << " still need " << closeAcksNeeded.size() + << " close acks [ "; + for (int n : closeAcksNeeded) + { + std::cout << " " << n; + } + std::cout << " ]" << std::endl; } - std::cout << " ]" << std::endl; } } } @@ -742,8 +765,11 @@ void BP5Writer::WriteData_WithRerouting(format::BufferV *Data) m_Aggregator->m_SubStreamIndex = m_TargetIndex; // Open the subfile without doing any collective communications, since the global - // coordinator ensures only one rerouted rank opens this file at a time. - OpenSubfile(false); + // coordinator ensures only one rerouted rank opens this file at a time. Also, + // be sure to open in append mode because another rank already wrote to this file, + // and open without append mode in that case can result in a block of zeros getting + // written. + OpenSubfile(false, true); } // align to PAGE_SIZE From 87aaec63c1389c2275892bccce5ece1be16b0883 Mon Sep 17 00:00:00 2001 From: Scott Wittenburg Date: Tue, 2 Dec 2025 13:21:32 -0700 Subject: [PATCH 34/37] Fix subtle bug that could cause blocks to disappear This could happen when global coordinator re-routed another rank to it's own subfile. Because the update of m_DataPos from the local tracking varibale, currentFilePos, was happening upon receipt of the GROUP_CLOSE message, and the global coordinator does not send itself that message, it wasn't capturing the actual file offset to be shared with other ranks later in EndStep(). This moves the variable update to the end of the thread method, just before returning, so that all subcoordinators do it, regardless of whether they're the global coordinator also. --- source/adios2/engine/bp5/BP5Writer_WithRerouting.cpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/source/adios2/engine/bp5/BP5Writer_WithRerouting.cpp b/source/adios2/engine/bp5/BP5Writer_WithRerouting.cpp index 4c647128cc..4f73a12013 100644 --- a/source/adios2/engine/bp5/BP5Writer_WithRerouting.cpp +++ b/source/adios2/engine/bp5/BP5Writer_WithRerouting.cpp @@ -313,7 +313,6 @@ void BP5Writer::ReroutingCommunicationLoop() std::cout << "Rank " << m_RankMPI << " received GROUP_CLOSE from rank " << status.Source << std::endl; // msg for sub coordinator - m_DataPos = currentFilePos; receivedGroupClose = true; std::cout << "Rank " << m_RankMPI << " sending GROUP_CLOSE_ACK to rank " @@ -716,6 +715,13 @@ void BP5Writer::ReroutingCommunicationLoop() } } + // Before leaving this method, subcoordinators need to update the variable tracking + // the current file position for their particular subfile + if (iAmSubCoord) + { + m_DataPos = currentFilePos; + } + std::cout << "Rank " << m_RankMPI << " Exit ReroutingCommunicationLoop" << std::endl; } From 4c7a6452595c7f9d1724000d572c70cecf08ee85 Mon Sep 17 00:00:00 2001 From: Scott Wittenburg Date: Tue, 2 Dec 2025 14:19:21 -0700 Subject: [PATCH 35/37] run clang format --- .../engine/bp5/BP5Writer_WithRerouting.cpp | 22 +++++++++---------- 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/source/adios2/engine/bp5/BP5Writer_WithRerouting.cpp b/source/adios2/engine/bp5/BP5Writer_WithRerouting.cpp index 4f73a12013..e19912859b 100644 --- a/source/adios2/engine/bp5/BP5Writer_WithRerouting.cpp +++ b/source/adios2/engine/bp5/BP5Writer_WithRerouting.cpp @@ -448,17 +448,15 @@ void BP5Writer::ReroutingCommunicationLoop() size_t srcIdx = scRankToIndex[message.m_SrcRank]; size_t destIdx = scRankToIndex[message.m_DestRank]; - // Both the src and target subcoord states return from PENDING to their prior state - if (groupState[srcIdx].m_currentStatus == - WriterGroupState::Status::PENDING) + // Both the src and target subcoord states return from PENDING to their prior + // state + if (groupState[srcIdx].m_currentStatus == WriterGroupState::Status::PENDING) { - groupState[srcIdx].m_currentStatus = - WriterGroupState::Status::WRITING; + groupState[srcIdx].m_currentStatus = WriterGroupState::Status::WRITING; groupState[srcIdx].m_queueSize = 0; } - if (groupState[destIdx].m_currentStatus == - WriterGroupState::Status::PENDING) + if (groupState[destIdx].m_currentStatus == WriterGroupState::Status::PENDING) { groupState[destIdx].m_currentStatus = WriterGroupState::Status::IDLE; } @@ -476,19 +474,19 @@ void BP5Writer::ReroutingCommunicationLoop() { std::cout << "Rank " << m_RankMPI << " sending WRITE_MORE to rank " - << message.m_DestRank << std::endl; + << message.m_DestRank << std::endl; // Send the lucky volunteer another writer adios2::helper::RerouteMessage writeMoreMsg; writeMoreMsg.m_MsgType = RerouteMessage::MessageType::WRITE_MORE; writeMoreMsg.m_WildCard = message.m_WildCard; // i.e. the rerouted writer rank writeMoreMsg.NonBlockingSendTo(m_Comm, message.m_DestRank, - sendBuffers.GetNextBuffer()); + sendBuffers.GetNextBuffer()); groupIdlesNeeded.insert(message.m_DestRank); - // Src subcoord state is returned to writing, dest subcoord state is now writing as - // well + // Src subcoord state is returned to writing, dest subcoord state is now writing + // as well size_t srcIdx = scRankToIndex[message.m_SrcRank]; size_t destIdx = scRankToIndex[message.m_DestRank]; groupState[srcIdx].m_currentStatus = WriterGroupState::Status::WRITING; @@ -704,7 +702,7 @@ void BP5Writer::ReroutingCommunicationLoop() else { std::cout << "Rank " << m_RankMPI << " still need " << closeAcksNeeded.size() - << " close acks [ "; + << " close acks [ "; for (int n : closeAcksNeeded) { std::cout << " " << n; From 37bd2246ded0b5a1fc183d693e613525146e2a9b Mon Sep 17 00:00:00 2001 From: Scott Wittenburg Date: Tue, 2 Dec 2025 17:39:02 -0700 Subject: [PATCH 36/37] Fix some compiler warnings --- .../engine/bp5/BP5Writer_WithRerouting.cpp | 38 +++---------------- 1 file changed, 6 insertions(+), 32 deletions(-) diff --git a/source/adios2/engine/bp5/BP5Writer_WithRerouting.cpp b/source/adios2/engine/bp5/BP5Writer_WithRerouting.cpp index e19912859b..f36cf36560 100644 --- a/source/adios2/engine/bp5/BP5Writer_WithRerouting.cpp +++ b/source/adios2/engine/bp5/BP5Writer_WithRerouting.cpp @@ -190,7 +190,6 @@ void BP5Writer::ReroutingCommunicationLoop() // global coordinator keeps track of state of each subcoord std::vector groupState; - bool needStateCheck = false; StateTraversal pairFinder; std::vector subCoordRanks; std::map scRankToIndex; @@ -204,16 +203,7 @@ void BP5Writer::ReroutingCommunicationLoop() // many Sends pending for a long time, that could cause us to exhaust the // pool of buffers, at which point we would start overwriting buffers in the // pool, potentially creating errors which are difficult to debug/diagnose. - int bufferSize = 1; - if (iAmGlobalCoord) - { - bufferSize = subCoordRanks.size(); - } - else if (iAmSubCoord) - { - bufferSize = 100; - } - BufferPool sendBuffers((iAmSubCoord ? 100 : 1)); + BufferPool sendBuffers(100); std::vector recvBuffer; int writingRank = -1; uint64_t currentFilePos = 0; @@ -276,10 +266,6 @@ void BP5Writer::ReroutingCommunicationLoop() helper::Comm::Status status = m_Comm.Iprobe(static_cast(helper::Comm::Constants::CommRecvAny), 0, &msgReady); - // Many messages may not result in a state change for the global coordinator, - // so set this flag if a check is needed - needStateCheck = false; - // If there is a message ready, receive and handle it if (msgReady) { @@ -370,8 +356,6 @@ void BP5Writer::ReroutingCommunicationLoop() firstIdleMsg = false; } - - needStateCheck = true; } break; case RerouteMessage::MessageType::WRITER_CAPACITY: @@ -381,7 +365,6 @@ void BP5Writer::ReroutingCommunicationLoop() { int capacityGroup = message.m_WildCard; groupState[capacityGroup].m_currentStatus = WriterGroupState::Status::CAPACITY; - needStateCheck = true; } break; case RerouteMessage::MessageType::STATUS_INQUIRY: @@ -407,7 +390,6 @@ void BP5Writer::ReroutingCommunicationLoop() groupState[subStreamIdx].m_currentStatus = qSize > 0 ? WriterGroupState::Status::WRITING : WriterGroupState::Status::IDLE; - needStateCheck = true; } break; case RerouteMessage::MessageType::REROUTE_REQUEST: @@ -460,11 +442,6 @@ void BP5Writer::ReroutingCommunicationLoop() { groupState[destIdx].m_currentStatus = WriterGroupState::Status::IDLE; } - - // The reason to check here is that global coord triggers at most - // one reroute sequence per iteration through the loop. Otherwise - // we probably don't need a state check upon receipt of rejection. - needStateCheck = true; } break; case RerouteMessage::MessageType::REROUTE_ACK: @@ -493,11 +470,6 @@ void BP5Writer::ReroutingCommunicationLoop() groupState[srcIdx].m_queueSize -= 1; groupState[destIdx].m_currentStatus = WriterGroupState::Status::WRITING; // groupState[destIdx].m_queueSize += 1; - - // The reason to check here is that global coord triggers at most - // one reroute sequence per iteration through the loop. Otherwise - // we probably don't need a state check upon receiving reroute ack. - needStateCheck = true; } break; case RerouteMessage::MessageType::WRITE_MORE: @@ -602,7 +574,7 @@ void BP5Writer::ReroutingCommunicationLoop() // Global coordinator process // Look for possible reroute-to / reroute-from pairs - if (iAmGlobalCoord /*&& needStateCheck*/ && !waitingForCloseAcks) + if (iAmGlobalCoord && !waitingForCloseAcks) { std::cout << "GC (" << m_RankMPI << ") looking for reroute candidate pair" << std::endl; std::pair nextPair; @@ -760,13 +732,15 @@ void BP5Writer::WriteData_WithRerouting(format::BufferV *Data) std::cout << "Rank " << m_RankMPI << " signaled to write" << std::endl; + size_t substreamIdx = static_cast(m_TargetIndex); + // Check if we need to update which file we are writing to - if (m_TargetIndex != m_Aggregator->m_SubStreamIndex) + if (substreamIdx != m_Aggregator->m_SubStreamIndex) { // We were rerouted! Our aggregator subfile index is later exchanged with other // ranks to be written to metadata, so update it here or the metadata will be // wrong. - m_Aggregator->m_SubStreamIndex = m_TargetIndex; + m_Aggregator->m_SubStreamIndex = substreamIdx; // Open the subfile without doing any collective communications, since the global // coordinator ensures only one rerouted rank opens this file at a time. Also, From 8586d6df00921e775527eb6ce240d0c75ac4d88d Mon Sep 17 00:00:00 2001 From: Scott Wittenburg Date: Tue, 2 Dec 2025 19:04:52 -0700 Subject: [PATCH 37/37] Debug stmts break tests, disable them --- source/adios2/engine/bp5/BP5Engine.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/adios2/engine/bp5/BP5Engine.h b/source/adios2/engine/bp5/BP5Engine.h index 9cb287af98..6d18689081 100644 --- a/source/adios2/engine/bp5/BP5Engine.h +++ b/source/adios2/engine/bp5/BP5Engine.h @@ -142,7 +142,7 @@ class BP5Engine MACRO(BurstBufferDrain, Bool, bool, true) \ MACRO(BurstBufferPath, String, std::string, "") \ MACRO(NodeLocal, Bool, bool, false) \ - MACRO(verbose, Int, int, 5) \ + MACRO(verbose, Int, int, 0) \ MACRO(NumAggregators, UInt, unsigned int, 0) \ MACRO(AggregatorRatio, UInt, unsigned int, 0) \ MACRO(NumSubFiles, UInt, unsigned int, 0) \