From 4dc402e67a832b91eee9d58fb11903e3f2c33b21 Mon Sep 17 00:00:00 2001
From: Laurent Wirz
Date: Mon, 18 Mar 2024 19:50:43 +0100
Subject: [PATCH 01/64] set init flag

---
 integrations/pytorch_ddp/test/test-coyote.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/integrations/pytorch_ddp/test/test-coyote.py b/integrations/pytorch_ddp/test/test-coyote.py
index 6cf1927e..c3b63422 100644
--- a/integrations/pytorch_ddp/test/test-coyote.py
+++ b/integrations/pytorch_ddp/test/test-coyote.py
@@ -161,10 +161,10 @@ def start_test(simulator: bool):
              for i in range(size)]
 
     if simulator:
-        accl.create_simulate_process_group(ranks, bufsize=rxbufsize)
+        accl.create_simulate_process_group(ranks, bufsize=rxbufsize, initialize=True)
     else:
         accl.create_process_group_coyote(ranks, accl.ACCLDesign.cyt_rdma,
-                                         bufsize=rxbufsize)
+                                         bufsize=rxbufsize, initialize=True)
     dist.init_process_group("ACCL", rank=rank, world_size=size)
 
     with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],

From a8d16195b0b131b11ef3c3e6f30cd3b0eea9618e Mon Sep 17 00:00:00 2001
From: Laurent Wirz
Date: Sat, 23 Mar 2024 14:17:58 +0100
Subject: [PATCH 02/64] Adapt to constructor interface changes and removal of
 ROCE

added debug flag
Only broadcast seems to work atm
---
 integrations/pytorch_ddp/install.py      | 19 +++++++++++--------
 .../pytorch_ddp/src/ProcessGroupACCL.cpp |  6 +-----
 2 files changed, 12 insertions(+), 13 deletions(-)

diff --git a/integrations/pytorch_ddp/install.py b/integrations/pytorch_ddp/install.py
index b97ab21f..6a515cda 100755
--- a/integrations/pytorch_ddp/install.py
+++ b/integrations/pytorch_ddp/install.py
@@ -121,7 +121,7 @@ def install_accl_driver(accl_driver_path: Path):
     subprocess.run(['make'], cwd=accl_driver_path, check=True)
 
 
-def install_accl_process_group(rocm: bool = False, cuda: bool = False):
+def install_accl_process_group(rocm: bool = False, cuda: bool = False, debug: bool = False):
     if not accl_driver_path.exists():
         clone_accl()
     if not accl_driver.exists():
@@ -131,12 +131,13 @@ def install_accl_process_group(rocm: bool = False, cuda: bool = False):
     env = os.environ.copy()
     env['USE_ROCM'] = '1' if rocm else '0'
     env['USE_CUDA'] = '1' if cuda else '0'
+    env['ACCL_DEBUG'] = '1' if debug else '0'
     subprocess.run([python, '-m', 'pip', '-v', 'install', '.'], env=env,
                    cwd=root, check=True)
 
 
 def main(rocm: bool = False, cuda: bool = False,
-         force_accl_process_group: bool = False, force_pytorch: bool = False):
+         force_accl_process_group: bool = False, force_pytorch: bool = False, debug: bool = False):
     packages = test_packages()
 
     if force_pytorch and torch_dir.exists():
@@ -152,7 +153,7 @@ def main(rocm: bool = False, cuda: bool = False,
 
     if not packages['accl-process-group'] or force_accl_process_group:
         print("ACCL Process Group not found, installing...")
-        install_accl_process_group(rocm, cuda)
+        install_accl_process_group(rocm, cuda, debug)
 
 
 if __name__ == '__main__':
@@ -165,21 +166,23 @@ def main(rocm: bool = False, cuda: bool = False,
                 'ProcessGroup in the current virtual environment.\nWill also install '
                 'PyTorch if it isn\'t installed already.')
     gpu_support = parser.add_mutually_exclusive_group()
-    gpu_support.add_argument('--rocm', action='store_true',
+    gpu_support.add_argument('-r','--rocm', action='store_true',
                              help='Installs the Process Group with ROCm '
                              'support.')
-    gpu_support.add_argument('--cuda', action='store_true',
+    gpu_support.add_argument('-c','--cuda', action='store_true',
                              help='Installs the Process Group with CUDA '
                              'support.')
-    parser.add_argument('--force-accl-process-group', action='store_true',
+    parser.add_argument('-a','--force-accl-process-group', action='store_true',
                         help='Force a reinstall of the ACCL Process Group')
-    parser.add_argument('--force-pytorch', action='store_true',
+    parser.add_argument('-t','--force-pytorch', action='store_true',
                         help='Force a reinstall of PyTorch '
                         f'{CURRENT_PYTORCH_VERSION} with the correct CXX11 ABI'
                         ' settings applied.')
     parser.add_argument('-f', '--force', action='store_true',
                         help='Enables both --force-accl-process-group and '
                         '--force-pytorch.')
+    parser.add_argument('-d', '--debug', action='store_true',
+                        help='Will print ACCL debugging info using ACCL_DEBUG=1')
 
     args = parser.parse_args()
 
     if args.force:
@@ -188,7 +191,7 @@ def main(rocm: bool = False, cuda: bool = False,
 
     try:
         main(args.rocm, args.cuda, args.force_accl_process_group,
-             args.force_pytorch)
+             args.force_pytorch, args.debug)
     except KeyboardInterrupt:
         print("Cancelled installation")
         exit(1)

diff --git a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp
index e0c3e596..9d109bfe 100644
--- a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp
+++ b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp
@@ -651,8 +651,7 @@ void ProcessGroupACCL::initialize() {
       throw std::runtime_error("Coyote configure not implemented");
     }
 
-    accl = std::make_unique(cyt_device, ranks_, rank_, size_ + 2,
-                            bufsize, bufsize, 8388608UL);
+    accl = std::make_unique(cyt_device);
     ACCL::debug(std::string("[ACCL coyote] communicator: ") + accl->dump_communicator());
   } else {
     if (!simulator_) {
@@ -1890,9 +1889,6 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
             "TCP ACCL backend; uses EasyNet network kernel on hardware")
      .value("udp", accl_network_utils::acclDesign::UDP,
             "UDP ACCL backend; uses VNx network kernel on hardware")
-     .value("roce", accl_network_utils::acclDesign::ROCE,
-            "Only applicable for hardware; uses UDP ACCL backend and RoCE "
-            "network kernel")
      .value("cyt_tcp", accl_network_utils::acclDesign::CYT_TCP,
             "Only applicable for hardware; uses coyote ACCL backend with a "
             "TCP network kernel")

From 7b6a18b56f0302795de76bc40a5081ece664b220 Mon Sep 17 00:00:00 2001
From: Laurent Wirz
Date: Sun, 24 Mar 2024 18:13:32 +0100
Subject: [PATCH 03/64] initialize private design_ parameter

---
 integrations/pytorch_ddp/src/ProcessGroupACCL.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp
index 9d109bfe..7304c11c 100644
--- a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp
+++ b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp
@@ -605,11 +605,12 @@ ProcessGroupACCL::ProcessGroupACCL(
   }
 
   ranks_ = convert_ranks(ranks);
+  design_ = design;
 
   if (coyote_enabled) {
-    if (design == accl_network_utils::acclDesign::CYT_TCP) {
+    if (design_ == accl_network_utils::acclDesign::CYT_TCP) {
       cyt_device = new ACCL::CoyoteDevice();
-    } else if (design == accl_network_utils::acclDesign::CYT_RDMA) {
+    } else if (design_ == accl_network_utils::acclDesign::CYT_RDMA) {
       cyt_device = new ACCL::CoyoteDevice(size_);
       cyt::setup_cyt_rdma(ibvQpConn_vec, ranks_, rank_, *cyt_device);
     } else {

From 748655090d4fca81b2040ae453a21f78f34dd854 Mon Sep 17 00:00:00 2001
From: Laurent Wirz
Date: Mon, 25 Mar 2024 00:59:00 +0100
Subject: [PATCH 04/64] created generic(xrt+coyote) test script (not
 functional yet)

---
 .../process_group_wrapper.py                  |   9 +-
 integrations/pytorch_ddp/test/test-generic.py | 200 ++++++++++++++++++
 2 files changed, 203 insertions(+), 6 deletions(-)
 create mode 100644 integrations/pytorch_ddp/test/test-generic.py

diff --git a/integrations/pytorch_ddp/accl_process_group/process_group_wrapper.py b/integrations/pytorch_ddp/accl_process_group/process_group_wrapper.py
index 06979add..c6de9a21 100644
--- a/integrations/pytorch_ddp/accl_process_group/process_group_wrapper.py
+++ b/integrations/pytorch_ddp/accl_process_group/process_group_wrapper.py
@@ -27,15 +27,13 @@ def create_process_group(
-        ranks: list[Rank],
-        xclbin: str, device_index: int, design: ACCLDesign,
+        ranks: list[Rank], design: ACCLDesign,
         *, nbufs: int = 16, bufsize: int = 1024,
         compression: Optional[dict[DataType, DataType]] = None,
         p2p_enabled: bool = False, profiling_ranks: Optional[list[int]] = None,
         profiling_timeout: float = 0.0, rsfec: bool = False,
+        simulation: bool = False,
         initialize: bool = True) -> ProcessGroup:
-    if design == ACCLDesign.cyt_rdma or design == ACCLDesign.cyt_tcp:
-        raise RuntimeError(f"{design} is an incompatible design for XRT")
 
     if compression is None:
         compression = {}
@@ -54,8 +52,7 @@ def create_process_group_wrapper(store, rank, size, _timeout):
             raise RuntimeError("ACCL ProcessGroup already created, "
                                "can only create one.")
 
-        pg = ProcessGroupACCL(store, rank, size, ranks, False, design,
-                              xclbin=xclbin, device_index=device_index,
+        pg = ProcessGroupACCL(store, rank, size, ranks, simulation, design,
                               bufsize=bufsize, rsfec=rsfec, nbufs=nbufs,
                               compression=compression,
                               p2p_enabled=p2p_enabled,

diff --git a/integrations/pytorch_ddp/test/test-generic.py b/integrations/pytorch_ddp/test/test-generic.py
new file mode 100644
index 00000000..87dbe1d1
--- /dev/null
+++ b/integrations/pytorch_ddp/test/test-generic.py
@@ -0,0 +1,200 @@
+# /*****************************************************************************
+#  Copyright (C) 2023 Advanced Micro Devices, Inc
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+# *****************************************************************************/
+
+from __future__ import annotations
+from typing import Optional
+import numpy as np
+import os
+from mpi4py.MPI import COMM_WORLD as mpi
+
+import torch
+import torch.distributed as dist
+from torch.profiler import profile, ProfilerActivity
+import accl_process_group as accl
+
+rank = 0
+size = 0
+
+count = 1024
+rxbufsize = 1500 * 4
+
+
+def test_broadcast():
+    if rank == 0:
+        x = torch.ones(count)
+    else:
+        x = torch.zeros(count)
+
+    dist.broadcast(x, 0)
+
+    np.testing.assert_allclose(x, torch.ones(count))
+    print("Test broadcast finished!")
+
+
+def test_sendrcv():
+    x = torch.full((count,), float(rank))
+
+    y = torch.empty(count)
+
+    prev_rank = (rank - 1) % size
+    next_rank = (rank + 1) % size
+
+    if rank % 2:
+        dist.send(x, next_rank)
+        dist.recv(y, prev_rank)
+    else:
+        dist.recv(y, prev_rank)
+        dist.send(x, next_rank)
+
+    np.testing.assert_allclose(y, torch.full((count,), prev_rank))
+    print("Test sendrcv finished!")
+
+
+def test_scatter():
+    if rank == 0:
+        x = [torch.full((count,), float(i)) for i in range(size)]
+    else:
+        x = None
+    y = torch.empty(count)
+
+    dist.scatter(y, x, 0)
+
+    np.testing.assert_allclose(y, torch.full((count,), float(rank)))
+    print("Test scatter finished!")
+
+
+def test_gather():
+    x = torch.full((count,), float(rank))
+
+    if rank == 0:
+        y = [torch.empty(count) for _ in range(size)]
+    else:
+        y = None
+
+    dist.gather(x, y, 0)
+
+    if rank == 0:
+        for i, c in enumerate(y):
+            np.testing.assert_allclose(c, torch.full((count,), float(i)))
+    print("Test gather finished!")
+
+
+def test_allgather():
+    x = torch.full((count,), float(rank))
+    y = [torch.empty(count) for _ in range(size)]
+
+    dist.all_gather(y, x)
+
+    for i, c in enumerate(y):
+        np.testing.assert_allclose(c, torch.full((count,), float(i)))
+    print("Test allgather finished!")
+
+
+def test_reduce():
+    x = torch.ones(count)
+
+    dist.reduce(x, 0, dist.ReduceOp.SUM)
+
+    if rank == 0:
+        np.testing.assert_allclose(x, [size for _ in range(count)])
+    print("Test reduce finished!")
+
+
+def test_allreduce():
+    x = torch.ones(count)
+
+    dist.all_reduce(x, dist.ReduceOp.SUM)
+
+    np.testing.assert_allclose(x, [size for _ in range(count)])
+    print("Test allreduce finished!")
+
+
+def exchange_qp(first_rank, second_rank, rank, ranks):
+    if rank == first_rank:
+        mpi.send(accl.get_local_qp(second_rank), dest=second_rank, tag=23)
+    elif rank == second_rank:
+        accl.set_remote_qp(first_rank, mpi.recv(source=first_rank, tag=23))
+
+    mpi.barrier()
+
+    if rank == second_rank:
+        mpi.send(accl.get_local_qp(first_rank), dest=first_rank, tag=24)
+    elif rank == first_rank:
+        accl.set_remote_qp(second_rank, mpi.recv(source=second_rank, tag=24))
+
+    mpi.barrier()
+
+
+def configure_cyt_rdma(ranks):
+    global rank, size
+    for first_rank in range(0, size):
+        for second_rank in range(first_rank + 1, size):
+            exchange_qp(first_rank, second_rank, rank, ranks)
+    accl.initialize()
+    mpi.barrier()
+
+
+def start_test(comms: str, simulator: bool):
+    global rank, size
+    if 'MASTER_ADDR' not in os.environ:
+        os.environ['MASTER_ADDR'] = 'localhost'
+    if 'MASTER_PORT' not in os.environ:
+        os.environ['MASTER_PORT'] = '30500'
+    rank = mpi.Get_rank()
+    size = mpi.Get_size()
+    ranks = [accl.Rank("127.0.0.1", 5500 + i, i, rxbufsize)
+             for i in range(size)]
+
+    accl.create_process_group(ranks, accl.ACCLDesign.cyt_rdma, bufsize=rxbufsize, initialize=True)
+    dist.init_process_group("ACCL", rank=rank, world_size=size)
+
+    with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
+                 profile_memory=True, record_shapes=True) as prof:
+        mpi.Barrier()
+        test_broadcast()
+        mpi.Barrier()
+        test_sendrcv()
+        mpi.Barrier()
+        test_scatter()
+        mpi.Barrier()
+        test_gather()
+        mpi.Barrier()
+        test_allgather()
+        mpi.Barrier()
+        test_reduce()
+        mpi.Barrier()
+        test_allreduce()
+
+    print(prof.key_averages(group_by_input_shape=True)
+          .table(sort_by="cpu_time_total", row_limit=15))
+
+
+if __name__ == '__main__':
+    import argparse
+    parser = argparse.ArgumentParser(description='Coyote tests for ACCL ProcessGroup')
+    parser.add_argument('-s', '--simulation', action='store_true',
+                        default=False, help='Use simulation instead of '
+                        'hardware')
+    parser.add_argument('-c', '--comms', choices=['udp', 'tcp', 'cyt_rdma'], default='tcp',
+                        help='Run tests over specied communication backend')
+    args = parser.parse_args()
+
+    #if args.comms != 'cyt_rdma' or not args.simulation:
+    if args.comms != 'cyt_rdma':
+        sys.exit('Currently only supports -c cyt_rdma and -s flags')
+    start_test(args.comms, args.simulation)

From 7da1f6b36a8da94ae258b7a6a30c997170e1df52 Mon Sep 17 00:00:00 2001
From: Laurent Wirz
Date: Tue, 26 Mar 2024 00:10:18 +0100
Subject: [PATCH 05/64] test-generic fix passes up to receive test in -c tcp
 -s

---
 .../accl_process_group/__init__.py            |   2 +-
 .../process_group_wrapper.py                  | 146 +++++++++---------
 .../pytorch_ddp/include/ProcessGroupACCL.hpp  |   1 +
 .../pytorch_ddp/src/ProcessGroupACCL.cpp      |  38 +++--
 integrations/pytorch_ddp/test/test-generic.py |  52 ++++---
 5 files changed, 127 insertions(+), 112 deletions(-)

diff --git a/integrations/pytorch_ddp/accl_process_group/__init__.py b/integrations/pytorch_ddp/accl_process_group/__init__.py
index cac971eb..55098ff2 100644
--- a/integrations/pytorch_ddp/accl_process_group/__init__.py
+++ b/integrations/pytorch_ddp/accl_process_group/__init__.py
@@ -17,5 +17,5 @@
 
 from ._c.ProcessGroupACCL import ProcessGroupACCL, Rank, DataType, ACCLDesign
 from .process_group_wrapper import create_process_group, \
-    create_process_group_coyote, create_simulate_process_group, initialize, \
+    initialize, \
     set_compression, get_compression, get_local_qp, set_remote_qp

diff --git a/integrations/pytorch_ddp/accl_process_group/process_group_wrapper.py b/integrations/pytorch_ddp/accl_process_group/process_group_wrapper.py
index c6de9a21..dfa6de8e 100644
--- a/integrations/pytorch_ddp/accl_process_group/process_group_wrapper.py
+++ b/integrations/pytorch_ddp/accl_process_group/process_group_wrapper.py
@@ -67,79 +67,79 @@ def create_process_group_wrapper(store, rank, size, _timeout):
 
     Backend.register_backend("ACCL", create_process_group_wrapper)
 
-def create_simulate_process_group(ranks: list[Rank], *,
-                                  nbufs: int = 16, udp: bool = False,
-                                  compression: Optional[dict[DataType,
-                                                             DataType]] = None,
-                                  bufsize: int = 1024,
-                                  initialize: bool = True) -> ProcessGroup:
-    if compression is None:
-        compression = {}
-    else:
-        # Copy compression since it will be used later in the lambda function
-        compression = compression.copy()
-
-    def create_process_group_wrapper(store, rank, size, _timeout):
-        global process_group
-        if process_group is not None:
-            raise RuntimeError("ACCL ProcessGroup already created, "
-                               "can only create one.")
-
-        design = ACCLDesign.udp if udp else ACCLDesign.tcp
-
-        pg = ProcessGroupACCL(store, rank, size, ranks, True, design,
-                              compression=compression, nbufs=nbufs,
-                              bufsize=bufsize)
-
-        process_group = pg
-        if initialize:
-            pg.initialize()
-
-        return pg
-
-    Backend.register_backend("ACCL", create_process_group_wrapper)
-
-def create_process_group_coyote(
-        ranks: list[Rank], design: ACCLDesign,
-        *, nbufs: int = 16, bufsize: int = 1024,
-        compression: Optional[dict[DataType, DataType]] = None,
-        p2p_enabled: bool = False, profiling_ranks: Optional[list[int]] = None,
-        profiling_timeout: float = 0.0, rsfec: bool = False,
-        initialize: bool = False) -> ProcessGroup:
-    if design != ACCLDesign.cyt_rdma and design != ACCLDesign.cyt_tcp:
-        raise RuntimeError(f"{design} is an incompatible design for coyote")
-
-    if compression is None:
-        compression = {}
-    else:
-        # Copy compression since it will be used later in the lambda function
-        compression = compression.copy()
-
-    if profiling_ranks is None:
-        profiling_ranks = []
-    else:
-        profiling_ranks = profiling_ranks.copy()
-
-    def create_process_group_wrapper(store, rank, size, _timeout):
-        global process_group
-        if process_group is not None:
-            raise RuntimeError("ACCL ProcessGroup already created, "
-                               "can only create one.")
-
-        pg = ProcessGroupACCL(store, rank, size, ranks, False, design,
-                              bufsize=bufsize, rsfec=rsfec, nbufs=nbufs,
-                              compression=compression,
-                              p2p_enabled=p2p_enabled,
-                              profiling_ranks=profiling_ranks,
-                              profiling_timeout=profiling_timeout)
-
-        process_group = pg
-        if initialize:
-            pg.initialize()
-
-        return pg
-
-    Backend.register_backend("ACCL", create_process_group_wrapper)
+# def create_simulate_process_group(ranks: list[Rank], *,
+#                                   nbufs: int = 16, udp: bool = False,
+#                                   compression: Optional[dict[DataType,
+#                                                              DataType]] = None,
+#                                   bufsize: int = 1024,
+#                                   initialize: bool = True) -> ProcessGroup:
+#     if compression is None:
+#         compression = {}
+#     else:
+#         # Copy compression since it will be used later in the lambda function
+#         compression = compression.copy()
+
+#     def create_process_group_wrapper(store, rank, size, _timeout):
+#         global process_group
+#         if process_group is not None:
+#             raise RuntimeError("ACCL ProcessGroup already created, "
+#                                "can only create one.")
+
+#         design = ACCLDesign.udp if udp else ACCLDesign.tcp
+
+#         pg = ProcessGroupACCL(store, rank, size, ranks, True, design,
+#                               compression=compression, nbufs=nbufs,
+#                               bufsize=bufsize)
+
+#         process_group = pg
+#         if initialize:
+#             pg.initialize()
+
+#         return pg
+
+#     Backend.register_backend("ACCL", create_process_group_wrapper)
+
+# def create_process_group_coyote(
+#         ranks: list[Rank], design: ACCLDesign,
+#         *, nbufs: int = 16, bufsize: int = 1024,
+#         compression: Optional[dict[DataType, DataType]] = None,
+#         p2p_enabled: bool = False, profiling_ranks: Optional[list[int]] = None,
+#         profiling_timeout: float = 0.0, rsfec: bool = False,
+#         initialize: bool = False) -> ProcessGroup:
+#     if design != ACCLDesign.cyt_rdma and design != ACCLDesign.cyt_tcp:
+#         raise RuntimeError(f"{design} is an incompatible design for coyote")
+
+#     if compression is None:
+#         compression = {}
+#     else:
+#         # Copy compression since it will be used later in the lambda function
+#         compression = compression.copy()
+
+#     if profiling_ranks is None:
+#         profiling_ranks = []
+#     else:
+#         profiling_ranks = profiling_ranks.copy()
+
+#     def create_process_group_wrapper(store, rank, size, _timeout):
+#         global process_group
+#         if process_group is not None:
+#             raise RuntimeError("ACCL ProcessGroup already created, "
+#                                "can only create one.")
+
+#         pg = ProcessGroupACCL(store, rank, size, ranks, False, design,
+#                               bufsize=bufsize, rsfec=rsfec, nbufs=nbufs,
+#                               compression=compression,
+#                               p2p_enabled=p2p_enabled,
+#                               profiling_ranks=profiling_ranks,
+#                               profiling_timeout=profiling_timeout)
+#         process_group = pg
+#         if initialize:
+#             pg.initialize()
+
+#         return pg
+
+#     Backend.register_backend("ACCL", create_process_group_wrapper)
 
 def initialize() -> None:
     if process_group is None:

diff --git a/integrations/pytorch_ddp/include/ProcessGroupACCL.hpp b/integrations/pytorch_ddp/include/ProcessGroupACCL.hpp
index 4218ad36..ae9944ca 100644
--- a/integrations/pytorch_ddp/include/ProcessGroupACCL.hpp
+++ b/integrations/pytorch_ddp/include/ProcessGroupACCL.hpp
@@ -309,6 +309,7 @@ class TORCH_API ProcessGroupACCL : public ProcessGroup {
 
   ACCL::CoyoteDevice *cyt_device;
   std::vector ibvQpConn_vec;
+  xrt::device xrt_device;
 
   std::unique_ptr accl;
   uint64_t bufsize;

diff --git a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp
index 7304c11c..495adebe 100644
--- a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp
+++ b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp
@@ -606,15 +606,21 @@ ProcessGroupACCL::ProcessGroupACCL(
 
   ranks_ = convert_ranks(ranks);
   design_ = design;
-
-  if (coyote_enabled) {
-    if (design_ == accl_network_utils::acclDesign::CYT_TCP) {
-      cyt_device = new ACCL::CoyoteDevice();
-    } else if (design_ == accl_network_utils::acclDesign::CYT_RDMA) {
-      cyt_device = new ACCL::CoyoteDevice(size_);
-      cyt::setup_cyt_rdma(ibvQpConn_vec, ranks_, rank_, *cyt_device);
-    } else {
-      throw std::runtime_error("Undefined ACCL design");
+
+  if (!simulator){
+    if (coyote_enabled) {
+      if (design_ == accl_network_utils::acclDesign::CYT_TCP) {
+        cyt_device = new ACCL::CoyoteDevice();
+      } else if (design_ == accl_network_utils::acclDesign::CYT_RDMA) {
+        cyt_device = new ACCL::CoyoteDevice(size_);
+        cyt::setup_cyt_rdma(ibvQpConn_vec, ranks_, rank_, *cyt_device);
+      } else {
+        throw std::runtime_error("Undefined ACCL design");
+      }
+    }
+    // use xrt
+    else{
+      xrt_device = xrt::device(device_index);
     }
   }
 }
@@ -640,7 +646,6 @@ void ProcessGroupACCL::set_remote_qp(unsigned int rank, std::vector
     accl = std::make_unique(cyt_device);
     ACCL::debug(std::string("[ACCL coyote] communicator: ") + accl->dump_communicator());
   } else {
-    if (!simulator_) {
-      device = xrt::device(device_index_);
-    }
-
     accl = accl_network_utils::initialize_accl(ranks_, rank_,
-                                               simulator_, design_, device,
+                                               simulator_, design_, xrt_device,
                                                xclbin_, nbufs_, bufsize, 0, rsfec_);
+    accl->set_timeout(1e6);
+    accl->set_rendezvous_threshold(16*1024);
+
     int devicemem = accl->devicemem();
     if (!simulator_) {
       // Initialize cache buffers
-      buf0 = xrt::bo(device, bufsize, devicemem);
-      buf1 = xrt::bo(device, bufsize, devicemem);
+      buf0 = xrt::bo(xrt_device, bufsize, devicemem);
+      buf1 = xrt::bo(xrt_device, bufsize, devicemem);
     }
   }

diff --git a/integrations/pytorch_ddp/test/test-generic.py b/integrations/pytorch_ddp/test/test-generic.py
index 87dbe1d1..c11536b2 100644
--- a/integrations/pytorch_ddp/test/test-generic.py
+++ b/integrations/pytorch_ddp/test/test-generic.py
@@ -19,6 +19,7 @@
 from typing import Optional
 import numpy as np
 import os
+import sys
 from mpi4py.MPI import COMM_WORLD as mpi
 
 import torch
@@ -123,29 +124,29 @@ def test_allreduce():
     print("Test allreduce finished!")
 
 
-def exchange_qp(first_rank, second_rank, rank, ranks):
-    if rank == first_rank:
-        mpi.send(accl.get_local_qp(second_rank), dest=second_rank, tag=23)
-    elif rank == second_rank:
-        accl.set_remote_qp(first_rank, mpi.recv(source=first_rank, tag=23))
+# def exchange_qp(first_rank, second_rank, rank, ranks):
+#     if rank == first_rank:
+#         mpi.send(accl.get_local_qp(second_rank), dest=second_rank, tag=23)
+#     elif rank == second_rank:
+#         accl.set_remote_qp(first_rank, mpi.recv(source=first_rank, tag=23))
 
-    mpi.barrier()
+#     mpi.barrier()
 
-    if rank == second_rank:
-        mpi.send(accl.get_local_qp(first_rank), dest=first_rank, tag=24)
-    elif rank == first_rank:
-        accl.set_remote_qp(second_rank, mpi.recv(source=second_rank, tag=24))
+#     if rank == second_rank:
+#         mpi.send(accl.get_local_qp(first_rank), dest=first_rank, tag=24)
+#     elif rank == first_rank:
+#         accl.set_remote_qp(second_rank, mpi.recv(source=second_rank, tag=24))
 
-    mpi.barrier()
+#     mpi.barrier()
 
 
-def configure_cyt_rdma(ranks):
-    global rank, size
-    for first_rank in range(0, size):
-        for second_rank in range(first_rank + 1, size):
-            exchange_qp(first_rank, second_rank, rank, ranks)
-    accl.initialize()
-    mpi.barrier()
+#def configure_cyt_rdma(ranks):
+#    global rank, size
+#    for first_rank in range(0, size):
+#        for second_rank in range(first_rank + 1, size):
+#            exchange_qp(first_rank, second_rank, rank, ranks)
+#    accl.initialize()
+#    mpi.barrier()
 
 
 
@@ -160,7 +161,16 @@ def start_test(comms: str, simulator: bool):
     rank = mpi.Get_rank()
     size = mpi.Get_size()
     ranks = [accl.Rank("127.0.0.1", 5500 + i, i, rxbufsize)
              for i in range(size)]
 
-    accl.create_process_group(ranks, accl.ACCLDesign.cyt_rdma, bufsize=rxbufsize, initialize=True)
+    if comms == 'udp':
+        design = accl.ACCLDesign.udp
+    elif comms == 'tcp':
+        design = accl.ACCLDesign.tcp
+    elif comms == 'cyt_rdma':
+        design = accl.ACCLDesign.cyt_rdma
+    else:
+        sys.exit('Design "' + comms + '" currently not supported')
+
+    accl.create_process_group(ranks, design, bufsize=rxbufsize, initialize=True, simulation=simulator)
     dist.init_process_group("ACCL", rank=rank, world_size=size)
 
     with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
@@ -195,6 +205,6 @@
     args = parser.parse_args()
 
     #if args.comms != 'cyt_rdma' or not args.simulation:
-    if args.comms != 'cyt_rdma':
-        sys.exit('Currently only supports -c cyt_rdma and -s flags')
+    #if args.comms != 'cyt_rdma':
+    #    sys.exit('Currently only supports -c cyt_rdma and -s flags')
     start_test(args.comms, args.simulation)

From 5d37c0fe4752357718e08217261c3ed5a85f35ad Mon Sep 17 00:00:00 2001
From: Laurent Wirz
Date: Sat, 20 Apr 2024 16:26:20 +0200
Subject: [PATCH 06/64] Set up runscripts

---
 .../process_group_wrapper.py                  |   5 +
 integrations/pytorch_ddp/install.py           |   6 +-
 integrations/pytorch_ddp/run.sh               | 108 ++++++++++++++++++
 .../pytorch_ddp/src/ProcessGroupACCL.cpp      |  11 +-
 integrations/pytorch_ddp/src/coyote_init.cpp  |   1 +
 integrations/pytorch_ddp/test/run.sh          | 106 +++++++++++++++++
 integrations/pytorch_ddp/test/test-generic.py |  15 ++-
 7 files changed, 248 insertions(+), 4 deletions(-)
 create mode 100755 integrations/pytorch_ddp/run.sh
 create mode 100755 integrations/pytorch_ddp/test/run.sh

diff --git a/integrations/pytorch_ddp/accl_process_group/process_group_wrapper.py b/integrations/pytorch_ddp/accl_process_group/process_group_wrapper.py
index dfa6de8e..9ef09c81 100644
--- a/integrations/pytorch_ddp/accl_process_group/process_group_wrapper.py
+++ b/integrations/pytorch_ddp/accl_process_group/process_group_wrapper.py
@@ -19,6 +19,7 @@ from typing import Optional
 
 from . import ProcessGroupACCL, Rank, DataType, ACCLDesign
 
 import torch
+import logging
 from torch.distributed import Backend
 from torch.distributed.distributed_c10d import ProcessGroup, Store
 
@@ -52,6 +53,10 @@ def create_process_group_wrapper(store, rank, size, _timeout):
             raise RuntimeError("ACCL ProcessGroup already created, "
                                "can only create one.")
 
+        # if simulation:
+        #overwrite the design choice in simulation
+        # design = ACCLDesign.udp
+
         pg = ProcessGroupACCL(store, rank, size, ranks, simulation, design,
                               bufsize=bufsize, rsfec=rsfec, nbufs=nbufs,
                               compression=compression,

diff --git a/integrations/pytorch_ddp/install.py b/integrations/pytorch_ddp/install.py
index 6a515cda..4e0ee3cc 100755
--- a/integrations/pytorch_ddp/install.py
+++ b/integrations/pytorch_ddp/install.py
@@ -151,10 +151,14 @@ def main(rocm: bool = False, cuda: bool = False,
               "please rerun with the --force-pytorch flag enabled.")
         exit(1)
 
-    if not packages['accl-process-group'] or force_accl_process_group:
+    if not packages['accl-process-group']:
         print("ACCL Process Group not found, installing...")
         install_accl_process_group(rocm, cuda, debug)
 
+    if force_accl_process_group:
+        print("Forced reinstall of ACCL Process Group ")
+        install_accl_process_group(rocm, cuda, debug)
+
 
 if __name__ == '__main__':
     import argparse

diff --git a/integrations/pytorch_ddp/run.sh b/integrations/pytorch_ddp/run.sh
new file mode 100755
index 00000000..b97b10a8
--- /dev/null
+++ b/integrations/pytorch_ddp/run.sh
@@ -0,0 +1,108 @@
+#!/bin/bash
+
+#check working directory
+if [[ $(pwd) != *pytorch_ddp ]]; then
+    echo "ERROR: this script should only be run in the pytorch_ddp dir of the repo!"
+    exit 1
+fi
+
+# state variables
+mkdir -p "$(pwd)/accl_log"
+# BUILD_DIR=../build
+# point this to python venv, which has the relevant libraries installed
+VENV_ACTIVATE=$(pwd)/venv/bin/activate
+SETUP_SH=$(pwd)/setup.sh
+SCRIPT=$(pwd)/test/test-generic.py
+HOST_FILE=./accl_log/host
+FPGA_FILE=./accl_log/fpga
+
+#enter venv and run script
+EXEC="bash -c \"source $VENV_ACTIVATE && source $SETUP_SH && python $SCRIPT"
+# EXEC="python $SCRIPT"
+
+
+#---------------Setting up vars-------------
+if [[ $ACCL_SIM -eq 1 ]]; then
+    echo "Starting in simulator mode. Make sure to start the emulator beforehand"
+    ARG="-s "
+
+    if [[ -v ACCL_NP ]]; then
+        NUM_PROCESS="$ACCL_NP"
+    else
+        echo "Variable ACCL_NP not set. Enter num of processes:"
+        read -a NUM_PROCESS
+    fi
+
+else
+    echo "Starting in hw mode. Make sure to run flow_u55c beforehand."
+    if [[ -v U55C_IDS ]]; then
+        IFS=' ' read -r -a SERVID <<< "$U55C_IDS"
+    else
+        # read server ids from user
+        echo "Variable U55C_IDS not set. Enter u55c machine ids (space separated):"
+        read -a SERVID
+    fi
+    RANK_PORT="30501"
+    # create ip files
+    rm -f $HOST_FILE $FPGA_FILE
+    NUM_PROCESS=0
+    for ID in ${SERVID[@]}; do
+        echo "10.253.74.$(((ID-1) * 4 + 66))">>$HOST_FILE
+        echo "10.253.74.$(((ID-1) * 4 + 68))">>$FPGA_FILE
+        NUM_PROCESS=$((NUM_PROCESS+1))
+        HOST_LIST+="alveo-u55c-$(printf "%02d" $ID) "
+        HOST_PORT_LIST+="alveo-u55c-$(printf "%02d" $ID):$RANK_PORT "
+    done
+
+    echo "HOST_LIST: ${HOST_LIST[*]}"
+
+    #set master address
+    MASTER_IP="10.253.74.$(((${SERVID[0]}-1) * 4 + 66))"
+    MASTER_PORT="30501"
+
+    echo "Master node set to: $MASTER_IP:$MASTER_PORT"
+
+    MPI_ARGS="-f $HOST_FILE --iface ens4f0"
+fi
+
+ARG="$ARG -c cyt_rdma\""
+
+#---------------Running it-------------
+
+echo "Run command: $EXEC $ARG"
+
+echo "Running with $NUM_PROCESS Processes"
+
+rm -f $(pwd)/accl_log/rank*
+
+C="mpirun -n $NUM_PROCESS $MPI_ARGS -outfile-pattern \"$(pwd)/accl_log/rank_%r_M_${MODE}_N_${N}_H_${H}_P_${P}_stdout\" -errfile-pattern \"$(pwd)/accl_log/rank_%r_M_${MODE}_N_${N}_H_${H}_P_${P}_stderr\" $EXEC $ARG"
+# C="mpirun -n $NUM_PROCESS -f $HOST_FILE --iface ens4f0 $EXEC $ARG &"
+echo $C
+
+/bin/sh -c "$C"
+
+if ! [[ $ACCL_SIM -eq 1 ]]; then
+    SLEEPTIME=8
+    echo "Sleep for $SLEEPTIMEs"
+    sleep $SLEEPTIME
+    parallel-ssh -H "$HOST_LIST" "killall -9 test-generic.py"
+    parallel-ssh -H "$HOST_LIST" "dmesg | grep "fpga_tlb_miss_isr" >$(pwd)/accl_log/tlb_miss.log"
+    # done
+
+    mkdir -p "$(pwd)/accl_results"
+    # Loop through accl log files in the source directory and append to accl_results folder
+    for source_log in "$(pwd)/accl"*.log; do
+        # Extract the log number from the source log file name (assuming the format is acclX.log)
+        log_number=$(basename "${source_log}" | sed 's/accl\([0-9]*\)\.log/\1/')
+        # Create the destination log file path
+        destination_log="$(pwd)/accl_results/accl${log_number}.log"
+        # Append the content of the source log to the destination log
+        cat "${source_log}" >> "${destination_log}"
+        # Remove the tmp log
+        rm ${source_log}
+    done
+fi

diff --git a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp
index 495adebe..1de00c0f 100644
--- a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp
+++ b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp
@@ -606,6 +606,14 @@ ProcessGroupACCL::ProcessGroupACCL(
   ranks_ = convert_ranks(ranks);
   design_ = design;
+  if(simulator_){
+    ACCL::debug("running on simulator\n");
+    std::cout << "running on simulator\n";
+  }
+  else{
+    ACCL::debug("not running on simulator\n");
+    std::cout << "not running on simulator\n";
+  }
 
   if (!simulator){
     if (coyote_enabled) {

diff --git a/integrations/pytorch_ddp/src/coyote_init.cpp b/integrations/pytorch_ddp/src/coyote_init.cpp
--- a/integrations/pytorch_ddp/src/coyote_init.cpp
+++ b/integrations/pytorch_ddp/src/coyote_init.cpp
@@ -88,6 +88,7 @@ void configure_cyt_rdma(std::vector &ibvQpConn_vec,
                         std::vector &ranks, int local_rank) {
+  std::cout << "[ACCL Coyote] Test3..." << std::endl;
   std::cout << "[ACCL Coyote] Exchanging QP..." << std::endl;

diff --git a/integrations/pytorch_ddp/test/run.sh b/integrations/pytorch_ddp/test/run.sh
new file mode 100755
index 00000000..039aa66b
--- /dev/null
+++ b/integrations/pytorch_ddp/test/run.sh
@@ -0,0 +1,106 @@
+#!/bin/bash
+
+#check working directory
+if [[ $(pwd) != *pytorch_ddp/test ]]; then
+    echo "ERROR: this script should only be run in the pytorch_ddp/test dir of the repo!"
+    exit 1
+fi
+
+# state variables
+mkdir -p "$(pwd)/accl_log"
+# BUILD_DIR=../build
+# point this to python venv, which has the relevant libraries installed
+VENV_ACTIVATE=$(pwd)/../venv/bin/activate
+SETUP_SH=$(pwd)/../setup.sh
+SCRIPT=$(pwd)/test-generic.py
+HOST_FILE=./accl_log/host
+FPGA_FILE=./accl_log/fpga
+
+#enter venv and run script
+EXEC="bash -c \"source $VENV_ACTIVATE && source $SETUP_SH && python $SCRIPT"
+# EXEC="python $SCRIPT"
+
+
+#---------------Setting up vars-------------
+if [[ $ACCL_SIM -eq 1 ]]; then
+    echo "Starting in simulator mode. Make sure to start the emulator beforehand"
+    ARG="-s "
+
+    if [[ -v ACCL_NP ]]; then
+        NUM_PROCESS="$ACCL_NP"
+    else
+        echo "Variable ACCL_NP not set. Enter num of processes:"
+        read -a NUM_PROCESS
+    fi
+
+else
+    echo "Starting in hw mode. Make sure to run flow_u55c beforehand."
+    if [[ -v U55C_IDS ]]; then
+        IFS=' ' read -r -a SERVID <<< "$U55C_IDS"
+    else
+        # read server ids from user
+        echo "Variable U55C_IDS not set. Enter u55c machine ids (space separated):"
+        read -a SERVID
+    fi
+    RANK_PORT="30501"
+    # create ip files
+    rm -f $HOST_FILE $FPGA_FILE
+    NUM_PROCESS=0
+    for ID in ${SERVID[@]}; do
+        echo "10.253.74.$(((ID-1) * 4 + 66))">>$HOST_FILE
+        echo "10.253.74.$(((ID-1) * 4 + 68))">>$FPGA_FILE
+        NUM_PROCESS=$((NUM_PROCESS+1))
+        HOST_LIST+="alveo-u55c-$(printf "%02d" $ID) "
+        HOST_PORT_LIST+="alveo-u55c-$(printf "%02d" $ID):$RANK_PORT "
+    done
+
+    echo "HOST_LIST: ${HOST_LIST[*]}"
+
+    #set master address
+    MASTER_IP="10.253.74.$(((${SERVID[0]}-1) * 4 + 66))"
+    MASTER_PORT="30501"
+
+    echo "Master node set to: $MASTER_IP:$MASTER_PORT"
+
+    MPI_ARGS="-f $HOST_FILE --iface ens4f0"
+fi
+
+ARG="$ARG -c cyt_rdma\""
+
+echo "Run command: $EXEC $ARG"
+
+echo "Running with $NUM_PROCESS Processes"
+
+rm -f $(pwd)/accl_log/rank*
+
+C="mpirun -n $NUM_PROCESS $MPI_ARGS -outfile-pattern \"$(pwd)/accl_log/rank_%r_M_${MODE}_N_${N}_H_${H}_P_${P}_stdout\" -errfile-pattern \"$(pwd)/accl_log/rank_%r_M_${MODE}_N_${N}_H_${H}_P_${P}_stderr\" $EXEC $ARG"
+# C="mpirun -n $NUM_PROCESS -f $HOST_FILE --iface ens4f0 $EXEC $ARG &"
+echo $C
+
+/bin/sh -c "$C"
+
+if ! [[ $ACCL_SIM -eq 1 ]]; then
+    SLEEPTIME=8
+    echo "Sleep for $SLEEPTIMEs"
+    sleep $SLEEPTIME
+    parallel-ssh -H "$HOST_LIST" "killall -9 test-generic.py"
+    parallel-ssh -H "$HOST_LIST" "dmesg | grep "fpga_tlb_miss_isr" >$(pwd)/accl_log/tlb_miss.log"
+    # done
+
+    mkdir -p "$(pwd)/accl_results"
+    # Loop through accl log files in the source directory and append to accl_results folder
+    for source_log in "$(pwd)/accl"*.log; do
+        # Extract the log number from the source log file name (assuming the format is acclX.log)
+        log_number=$(basename "${source_log}" | sed 's/accl\([0-9]*\)\.log/\1/')
+        # Create the destination log file path
+        destination_log="$(pwd)/accl_results/accl${log_number}.log"
+        # Append the content of the source log to the destination log
+        cat "${source_log}" >> "${destination_log}"
+        # Remove the tmp log
+        rm ${source_log}
+    done
+fi
+
+
+

diff --git a/integrations/pytorch_ddp/test/test-generic.py b/integrations/pytorch_ddp/test/test-generic.py
index c11536b2..6935f1a7 100644
--- a/integrations/pytorch_ddp/test/test-generic.py
+++ b/integrations/pytorch_ddp/test/test-generic.py
@@ -20,6 +20,7 @@
 import numpy as np
 import os
 import sys
+import logging
 from mpi4py.MPI import COMM_WORLD as mpi
 
 import torch
@@ -27,6 +28,9 @@
 from torch.profiler import profile, ProfilerActivity
 import accl_process_group as accl
 
+#Configure, which logging messages to display
+logging.basicConfig(stream=sys.stderr, level=logging.DEBUG)
+
 rank = 0
 size = 0
 
@@ -42,6 +46,9 @@ def test_broadcast():
 
     dist.broadcast(x, 0)
 
+    logging.debug('Tensor after broadcast: ' + str(x))
+    print('Tensor after broadcast: ' + str(x))
+
     np.testing.assert_allclose(x, torch.ones(count))
     print("Test broadcast finished!")
 
@@ -158,6 +165,9 @@ def start_test(comms: str, simulator: bool):
         os.environ['MASTER_PORT'] = '30500'
     rank = mpi.Get_rank()
     size = mpi.Get_size()
+    # size = 2
+    print(f"Starting tests on rank {rank} with size {size}")
+
     ranks = [accl.Rank("127.0.0.1", 5500 + i, i, rxbufsize)
              for i in range(size)]
 
@@ -166,7 +176,8 @@ def start_test(comms: str, simulator: bool):
     elif comms == 'tcp':
         design = accl.ACCLDesign.tcp
     elif comms == 'cyt_rdma':
-        design = accl.ACCLDesign.cyt_rdma
+        # design = accl.ACCLDesign.cyt_rdma
+        design = accl.ACCLDesign.udp
     else:
         sys.exit('Design "' + comms + '" currently not supported')
 
@@ -201,7 +212,7 @@
                         default=False, help='Use simulation instead of '
                         'hardware')
     parser.add_argument('-c', '--comms', choices=['udp', 'tcp', 'cyt_rdma'], default='tcp',
-                        help='Run tests over specied communication backend')
+                        help='Run tests over specified communication backend')
     args = parser.parse_args()
 
     #if args.comms != 'cyt_rdma' or not args.simulation:

From e4b0904c12203da5ac4649934ce26286daa5a910 Mon Sep 17 00:00:00 2001
From: Laurent Wirz
Date: Tue, 23 Apr 2024 09:50:52 +0200
Subject: [PATCH 07/64] Neural net added to test cases + adapted runscript

---
 .../process_group_wrapper.py                  |   2 +-
 integrations/pytorch_ddp/run.sh               |  70 +++++++----
 .../pytorch_ddp/src/ProcessGroupACCL.cpp      |   2 -
 integrations/pytorch_ddp/test/run.sh          |  74 ++++++++----
 integrations/pytorch_ddp/test/test-generic.py | 113 ++++++++++++++++--
 5 files changed, 198 insertions(+), 63 deletions(-)

diff --git a/integrations/pytorch_ddp/accl_process_group/process_group_wrapper.py b/integrations/pytorch_ddp/accl_process_group/process_group_wrapper.py
index 9ef09c81..ccdf7090 100644
--- a/integrations/pytorch_ddp/accl_process_group/process_group_wrapper.py
+++ b/integrations/pytorch_ddp/accl_process_group/process_group_wrapper.py
@@ -70,7 +70,7 @@ def create_process_group_wrapper(store, rank, size, _timeout):
 
         return pg
 
-    Backend.register_backend("ACCL", create_process_group_wrapper)
+    Backend.register_backend("ACCL", create_process_group_wrapper, devices='cpu')
 
 # def create_simulate_process_group(ranks: list[Rank], *,
 #                                   nbufs: int = 16, udp: bool = False,

diff --git a/integrations/pytorch_ddp/run.sh b/integrations/pytorch_ddp/run.sh
index b97b10a8..58d137d6 100755
--- a/integrations/pytorch_ddp/run.sh
+++ b/integrations/pytorch_ddp/run.sh
@@ -6,13 +6,20 @@ if [[ $(pwd) != *pytorch_ddp ]]; then
     exit 1
 fi
 
+if [[ -v ACCL_SCRIPT ]]; then
+    SCRIPT_NAME="$ACCL_SCRIPT"
+else
+    SCRIPT_NAME=test-generic.py
+    echo "Variable ACCL_SCRIPT not set. Assuming $SCRIPT_NAME"
+fi
+
 # state variables
 mkdir -p "$(pwd)/accl_log"
 # BUILD_DIR=../build
 # point this to python venv, which has the relevant libraries installed
 VENV_ACTIVATE=$(pwd)/venv/bin/activate
 SETUP_SH=$(pwd)/setup.sh
-SCRIPT=$(pwd)/test/test-generic.py
+SCRIPT=$(pwd)/test/$SCRIPT_NAME
 HOST_FILE=./accl_log/host
 FPGA_FILE=./accl_log/fpga
 
@@ -26,6 +33,10 @@ if [[ $ACCL_SIM -eq 1 ]]; then
     echo "Starting in simulator mode. Make sure to start the emulator beforehand"
     ARG="-s "
 
+    ACCL_COMMS="udp"
+
+    echo "assuming udp comms in simulator"
+
     if [[ -v ACCL_NP ]]; then
         NUM_PROCESS="$ACCL_NP"
     else
@@ -33,6 +44,9 @@ if [[ $ACCL_SIM -eq 1 ]]; then
         read -a NUM_PROCESS
     fi
 
+    MASTER_IP="localhost"
+    MASTER_PORT="30501"
+
 else
     echo "Starting in hw mode. Make sure to run flow_u55c beforehand."
     if [[ -v U55C_IDS ]]; then
@@ -42,6 +56,12 @@ else
         echo "Variable U55C_IDS not set. Enter u55c machine ids (space separated):"
         read -a SERVID
     fi
+
+    if ! [[ -v ACCL_COMMS ]]; then
+        ACCL_COMMS="cyt_rdma"
+        echo "Assuming cyt_rdma comms in hardware"
+    fi
+
     RANK_PORT="30501"
     # create ip files
     rm -f $HOST_FILE $FPGA_FILE
@@ -58,14 +78,14 @@ else
 
     #set master address
     MASTER_IP="10.253.74.$(((${SERVID[0]}-1) * 4 + 66))"
-    MASTER_PORT="30501"
+    MASTER_PORT="30505"
 
     echo "Master node set to: $MASTER_IP:$MASTER_PORT"
 
     MPI_ARGS="-f $HOST_FILE --iface ens4f0"
 fi
 
-ARG="$ARG -c cyt_rdma\""
+ARG="$ARG -c $ACCL_COMMS -i $HOST_FILE -f $FPGA_FILE -a $MASTER_IP -p $MASTER_PORT\""
 
 #---------------Running it-------------
 
@@ -75,34 +95,40 @@ echo "Running with $NUM_PROCESS Processes"
 
 rm -f $(pwd)/accl_log/rank*
 
-C="mpirun -n $NUM_PROCESS $MPI_ARGS -outfile-pattern \"$(pwd)/accl_log/rank_%r_M_${MODE}_N_${N}_H_${H}_P_${P}_stdout\" -errfile-pattern \"$(pwd)/accl_log/rank_%r_M_${MODE}_N_${N}_H_${H}_P_${P}_stderr\" $EXEC $ARG"
+C="mpirun -n $NUM_PROCESS $MPI_ARGS -outfile-pattern \"$(pwd)/accl_log/rank_%r_M_${MODE}_N_${N}_H_${H}_P_${P}_stdout\" $EXEC $ARG &"
+# C="mpirun -n $NUM_PROCESS $MPI_ARGS -outfile-pattern \"$(pwd)/accl_log/rank_%r_M_${MODE}_N_${N}_H_${H}_P_${P}_stdout\" -errfile-pattern \"$(pwd)/accl_log/rank_%r_M_${MODE}_N_${N}_H_${H}_P_${P}_stderr\" $EXEC $ARG &"
 # C="mpirun -n $NUM_PROCESS -f $HOST_FILE --iface ens4f0 $EXEC $ARG &"
 echo $C
 
 /bin/sh -c "$C"
 
+if ! [[ -v SLEEPTIME ]]; then
+    SLEEPTIME="16"
+fi
+echo "Sleeping for $SLEEPTIME"
+sleep $SLEEPTIME
+
 if ! [[ $ACCL_SIM -eq 1 ]]; then
-    SLEEPTIME=8
-    echo "Sleep for $SLEEPTIMEs"
-    sleep $SLEEPTIME
-    parallel-ssh -H "$HOST_LIST" "killall -9 test-generic.py"
+    parallel-ssh -H "$HOST_LIST" "killall -9 $SCRIPT_NAME"
     parallel-ssh -H "$HOST_LIST" "dmesg | grep "fpga_tlb_miss_isr" >$(pwd)/accl_log/tlb_miss.log"
-    # done
-
-    mkdir -p "$(pwd)/accl_results"
-    # Loop through accl log files in the source directory and append to accl_results folder
-    for source_log in "$(pwd)/accl"*.log; do
-        # Extract the log number from the source log file name (assuming the format is acclX.log)
-        log_number=$(basename "${source_log}" | sed 's/accl\([0-9]*\)\.log/\1/')
-        # Create the destination log file path
-        destination_log="$(pwd)/accl_results/accl${log_number}.log"
-        # Append the content of the source log to the destination log
-        cat "${source_log}" >> "${destination_log}"
-        # Remove the tmp log
-        rm ${source_log}
-    done
+else
+    killall -9 $SCRIPT_NAME
+    dmesg | grep "fpga_tlb_miss_isr" >$(pwd)/accl_log/tlb_miss.log
 fi
 
+mkdir -p "$(pwd)/accl_results"
+# Loop through accl log files in the source directory and append to accl_results folder
+for source_log in "$(pwd)/accl"*.log; do
+    # Extract the log number from the source log file name (assuming the format is acclX.log)
+    log_number=$(basename "${source_log}" | sed 's/accl\([0-9]*\)\.log/\1/')
+    # Create the destination log file path
+    destination_log="$(pwd)/accl_results/accl${log_number}.log"
+    # Append the content of the source log to the destination log
+    cat "${source_log}" >> "${destination_log}"
+    # Remove the tmp log
+    rm ${source_log}
+done
+

diff --git a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp
index 1de00c0f..3abbada0 100644
--- a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp
+++ b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp
@@ -608,11 +608,9 @@ ProcessGroupACCL::ProcessGroupACCL(
   design_ = design;
   if(simulator_){
     ACCL::debug("running on simulator\n");
-    std::cout << "running on simulator\n";
   }
   else{
     ACCL::debug("not running on simulator\n");
-    std::cout << "not running on simulator\n";
   }
 
   if (!simulator){

diff --git a/integrations/pytorch_ddp/test/run.sh b/integrations/pytorch_ddp/test/run.sh
index 039aa66b..ca630839 100755
--- a/integrations/pytorch_ddp/test/run.sh
+++ b/integrations/pytorch_ddp/test/run.sh
@@ -6,13 +6,20 @@ if [[ $(pwd) != *pytorch_ddp/test ]]; then
     exit 1
 fi
 
+if [[ -v ACCL_SCRIPT ]]; then
+    SCRIPT_NAME="$ACCL_SCRIPT"
+else
+    SCRIPT_NAME=test-generic.py
+    echo "Variable ACCL_SCRIPT not set. Assuming $SCRIPT_NAME"
+fi
+
 # state variables
 mkdir -p "$(pwd)/accl_log"
 # BUILD_DIR=../build
 # point this to python venv, which has the relevant libraries installed
 VENV_ACTIVATE=$(pwd)/../venv/bin/activate
 SETUP_SH=$(pwd)/../setup.sh
-SCRIPT=$(pwd)/test-generic.py
+SCRIPT=$(pwd)/$SCRIPT_NAME
 HOST_FILE=./accl_log/host
 FPGA_FILE=./accl_log/fpga
 
@@ -26,6 +33,10 @@ if [[ $ACCL_SIM -eq 1 ]]; then
     echo "Starting in simulator mode. Make sure to start the emulator beforehand"
     ARG="-s "
 
+    ACCL_COMMS="udp"
+
+    echo "assuming udp comms in simulator"
+
     if [[ -v ACCL_NP ]]; then
         NUM_PROCESS="$ACCL_NP"
     else
@@ -33,6 +44,9 @@ if [[ $ACCL_SIM -eq 1 ]]; then
        read -a NUM_PROCESS
     fi
 
+    MASTER_IP="localhost"
+    MASTER_PORT="30505"
+
 else
     echo "Starting in hw mode. Make sure to run flow_u55c beforehand."
     if [[ -v U55C_IDS ]]; then
@@ -42,6 +56,12 @@ else
         echo "Variable U55C_IDS not set. Enter u55c machine ids (space separated):"
         read -a SERVID
     fi
+
+    if ! [[ -v ACCL_COMMS ]]; then
+        ACCL_COMMS="cyt_rdma"
+        echo "Assuming cyt_rdma comms in hardware"
+    fi
+
     RANK_PORT="30501"
     # create ip files
     rm -f $HOST_FILE $FPGA_FILE
@@ -58,14 +78,16 @@ else
 
     #set master address
     MASTER_IP="10.253.74.$(((${SERVID[0]}-1) * 4 + 66))"
-    MASTER_PORT="30501"
+    MASTER_PORT="30505"
 
     echo "Master node set to: $MASTER_IP:$MASTER_PORT"
 
     MPI_ARGS="-f $HOST_FILE --iface ens4f0"
 fi
 
-ARG="$ARG -c cyt_rdma\""
+ARG="$ARG -c $ACCL_COMMS -i $HOST_FILE -f $FPGA_FILE -a $MASTER_IP -p $MASTER_PORT\""
+
+#---------------Running it-------------
 
 echo "Run command: $EXEC $ARG"
 
@@ -73,34 +95,36 @@ echo "Running with $NUM_PROCESS Processes"
 
 rm -f $(pwd)/accl_log/rank*
 
-C="mpirun -n $NUM_PROCESS $MPI_ARGS -outfile-pattern \"$(pwd)/accl_log/rank_%r_M_${MODE}_N_${N}_H_${H}_P_${P}_stdout\" -errfile-pattern \"$(pwd)/accl_log/rank_%r_M_${MODE}_N_${N}_H_${H}_P_${P}_stderr\" $EXEC $ARG"
+C="mpirun -n $NUM_PROCESS $MPI_ARGS -outfile-pattern \"$(pwd)/accl_log/rank_%r_M_${MODE}_N_${N}_H_${H}_P_${P}_stdout\" $EXEC $ARG &"
+# C="mpirun -n $NUM_PROCESS $MPI_ARGS -outfile-pattern \"$(pwd)/accl_log/rank_%r_M_${MODE}_N_${N}_H_${H}_P_${P}_stdout\" -errfile-pattern \"$(pwd)/accl_log/rank_%r_M_${MODE}_N_${N}_H_${H}_P_${P}_stderr\" $EXEC $ARG &"
 # C="mpirun -n $NUM_PROCESS -f $HOST_FILE --iface ens4f0 $EXEC $ARG &"
 echo $C
 
 /bin/sh -c "$C"
 
+if ! [[ -v SLEEPTIME ]]; then
+    SLEEPTIME="16"
+fi
+echo "Sleeping for $SLEEPTIME"
+sleep $SLEEPTIME
+
 if ! [[ $ACCL_SIM -eq 1 ]]; then
-    SLEEPTIME=8
-    echo "Sleep for $SLEEPTIMEs"
-    sleep $SLEEPTIME
-    parallel-ssh -H "$HOST_LIST" "killall -9 test-generic.py"
+    parallel-ssh -H "$HOST_LIST" "killall -9 $SCRIPT_NAME"
     parallel-ssh -H "$HOST_LIST" "dmesg | grep "fpga_tlb_miss_isr" >$(pwd)/accl_log/tlb_miss.log"
-    # done
-
-    mkdir -p "$(pwd)/accl_results"
-    # Loop through accl log files in the source directory and append to accl_results folder
-    for source_log in "$(pwd)/accl"*.log; do
-        # Extract the log number from the source log file name (assuming the format is acclX.log)
-        log_number=$(basename "${source_log}" | sed 's/accl\([0-9]*\)\.log/\1/')
-        # Create the destination log file path
-        destination_log="$(pwd)/accl_results/accl${log_number}.log"
-        # Append the content of the source log to the destination log
-        cat "${source_log}" >> "${destination_log}"
-        # Remove the tmp log
-        rm ${source_log}
-    done
+else
+    killall -9 $SCRIPT_NAME
+    dmesg | grep "fpga_tlb_miss_isr" >$(pwd)/accl_log/tlb_miss.log
 fi
 
+# mkdir -p "$(pwd)/accl_results"
+# # Loop through accl log files in the source directory and append to accl_results folder
+# for source_log in "$(pwd)/accl"*.log; do
+#     # Extract the log number from the source log file name (assuming the format is acclX.log)
+#     log_number=$(basename "${source_log}" | sed 's/accl\([0-9]*\)\.log/\1/')
+#     # Create the destination log file path
+#     destination_log="$(pwd)/accl_results/accl${log_number}.log"
+#     # Append the content of the source log to the destination log
+#     cat "${source_log}" >> "${destination_log}"
+#     # Remove the tmp log
+#     rm ${source_log}
+# done

diff --git a/integrations/pytorch_ddp/test/test-generic.py b/integrations/pytorch_ddp/test/test-generic.py
index 6935f1a7..e4b7e346 100644
--- a/integrations/pytorch_ddp/test/test-generic.py
+++ b/integrations/pytorch_ddp/test/test-generic.py
@@ -28,6 +28,13 @@
 from torch.profiler import profile, ProfilerActivity
 import accl_process_group as accl
 
+from torch.nn.parallel import DistributedDataParallel as DDP
+import torch.nn as nn
+import torch.optim as optim
+
+from torch.utils.data import Dataset, DataLoader
+from torch.utils.data.distributed import DistributedSampler
+
 #Configure, which logging messages to display
 logging.basicConfig(stream=sys.stderr, level=logging.DEBUG)
 
@@ -131,6 +138,66 @@ def test_allreduce():
     print("Test allreduce finished!")
 
 
+class ToyModel(nn.Module):
+    def __init__(self):
+        super(ToyModel, self).__init__()
+        self.net1 = nn.Linear(10, 10)
+        self.relu = nn.ReLU()
+        self.net2 = nn.Linear(10, 5)
+
+    def forward(self, x):
+        return self.net2(self.relu(self.net1(x)))
+
+class MyTrainDataset(Dataset):
+    def __init__(self, size):
+        self.size = size
+        self.data = [(torch.rand(10), torch.rand(5)) for _ in range(size)]
+
+    def __len__(self):
+        return self.size
+
+    def __getitem__(self, index):
+        return self.data[index]
+
+
+def prepare_dataloader(dataset: Dataset, batch_size: int):
+    return DataLoader(
+        dataset,
+        batch_size=batch_size,
+        pin_memory=True,
+        shuffle=False,
+        sampler=DistributedSampler(dataset)
+    )
+
+def demo_basic(rank: int):
+    model = ToyModel()
+    ddp_model = DDP(model)
+
+    train_set = MyTrainDataset(2048)  # load your dataset
+    batch_size=64
+    train_data = prepare_dataloader(train_set, batch_size)
+
+    loss_fn = nn.MSELoss()
+    optimizer = optim.SGD(ddp_model.parameters(), lr=0.001)
+
+    max_epochs = 10
+    for epoch in range(max_epochs):
+        batch_size = len(next(iter(train_data))[0])
+        train_data.sampler.set_epoch(epoch)
+        for x, y in train_data:
+
+            optimizer.zero_grad()
+            outputs = ddp_model(x)
+            loss = loss_fn(outputs, y)
+            loss.backward()
+            optimizer.step()
+
+        print(f"Rank {rank}: Epoch {epoch} | Batchsize: {batch_size} | Steps: {len(train_data)} | Loss: {loss}")
+
+
+    print("finished training")
+    dist.destroy_process_group()
+
 # def exchange_qp(first_rank, second_rank, rank, ranks):
 #     if rank == first_rank:
 #         mpi.send(accl.get_local_qp(second_rank), dest=second_rank, tag=23)
@@ -157,30 +224,44 @@
 
 
-def start_test(comms: str, simulator: bool):
+def start_test(comms: str, simulator: bool, host_file: str, fpga_file: str, ma: str, mp: str):
     global rank, size
-    if 'MASTER_ADDR' not in os.environ:
-        os.environ['MASTER_ADDR'] = 'localhost'
-    if 'MASTER_PORT' not in os.environ:
-        os.environ['MASTER_PORT'] = '30500'
+    os.environ['MASTER_ADDR'] = ma
+    os.environ['MASTER_PORT'] = mp
     rank = mpi.Get_rank()
     size = mpi.Get_size()
     # size = 2
+    print(f"MASTER: {os.environ['MASTER_ADDR']}{os.environ['MASTER_PORT']} ")
     print(f"Starting tests on rank {rank} with size {size}")
+
+    start_port = 5005
 
-    ranks = [accl.Rank("127.0.0.1", 5500 + i, i, rxbufsize)
-             for i in range(size)]
+    if not simulator:
+        with open(host_file, 'r') as hf:
+            host_ips = hf.readlines()
+
+        with open(fpga_file, 'r') as ff:
+            fpga_ips = ff.readlines()
+
+        if comms == "cyt_rdma":
+            ranks = [accl.Rank(a, start_port, i, rxbufsize) for i, a in enumerate(fpga_ips)]
+        else:
+            ranks = [accl.Rank(a, start_port + i, 0, rxbufsize) for i, a in enumerate(fpga_ips)]
+    else:
+        ranks = [accl.Rank("127.0.0.1", 5500 + i, i, rxbufsize) for i in range(size)]
 
     if comms == 'udp':
         design = accl.ACCLDesign.udp
     elif comms == 'tcp':
         design = accl.ACCLDesign.tcp
-    elif comms == 'cyt_rdma':
-        # design = accl.ACCLDesign.cyt_rdma
-        design = accl.ACCLDesign.udp
+    elif comms == 'cyt_rdma' and not simulator:
+        design = accl.ACCLDesign.cyt_rdma
     else:
-        sys.exit('Design "' + comms + '" currently not supported')
+        if simulator:
+            sys.exit('Design "' + comms + '" currently not supported in simulator mode')
+        else:
+            sys.exit('Design "' + comms + '" currently not supported in hardware mode')
 
     accl.create_process_group(ranks, design, bufsize=rxbufsize, initialize=True, simulation=simulator)
     dist.init_process_group("ACCL", rank=rank, world_size=size)
 
@@ -200,6 +281,8 @@
         test_reduce()
         mpi.Barrier()
         test_allreduce()
+        mpi.Barrier()
+        demo_basic(rank)
 
     print(prof.key_averages(group_by_input_shape=True)
           .table(sort_by="cpu_time_total", row_limit=15))
@@ -213,9 +296,13 @@
     parser.add_argument('-c', '--comms', choices=['udp', 'tcp', 'cyt_rdma'], default='tcp',
                         help='Run tests over specified communication backend')
+    parser.add_argument('-i', '--host-file', type=str, required=True, help='Specify the file, where the host IPs are listed')
+    parser.add_argument('-f', '--fpga-file', type=str, required=True, help='Specify the file, where the FPGA IPs are listed')
+    parser.add_argument('-a','--master-address', type=str)
+    parser.add_argument('-p','--master-port', type=str)
     args = parser.parse_args()
 
     #if args.comms != 'cyt_rdma' or not args.simulation:
     #if args.comms != 'cyt_rdma':
     #    sys.exit('Currently only supports -c cyt_rdma and -s flags')
-    start_test(args.comms, args.simulation)
+    start_test(args.comms, args.simulation, args.host_file, args.fpga_file, args.master_address, args.master_port)

From b788417bf359a52415708f6c1bea79d1828d9fb6 Mon Sep 17 00:00:00 2001
From: Laurent Wirz
Date: Fri, 26 Apr 2024 11:21:38 +0200
Subject: [PATCH 08/64] Fixed initialization errors, added logging to python
 part

---
 .../process_group_wrapper.py                  | 98 ++++---------
 integrations/pytorch_ddp/install.py           |  5 +-
 .../pytorch_ddp/src/ProcessGroupACCL.cpp      | 22 +++-
 integrations/pytorch_ddp/src/coyote_init.cpp  |  2 +-
 integrations/pytorch_ddp/test/run.sh          | 22 +++-
 integrations/pytorch_ddp/test/test-generic.py | 78 +++++++------
 6 files changed, 88 insertions(+), 139 deletions(-)

diff --git a/integrations/pytorch_ddp/accl_process_group/process_group_wrapper.py b/integrations/pytorch_ddp/accl_process_group/process_group_wrapper.py
index ccdf7090..3948d728 100644
--- a/integrations/pytorch_ddp/accl_process_group/process_group_wrapper.py
+++ b/integrations/pytorch_ddp/accl_process_group/process_group_wrapper.py
@@ -22,10 +22,17 @@
 import logging
 from torch.distributed import Backend
 from torch.distributed.distributed_c10d import ProcessGroup, Store
-
+import sys
+import os
 
 process_group: Optional[ProcessGroupACCL] = None
 
+#Configure logging
+logger = logging.getLogger(__name__)
+if "ACCL_DEBUG" in os.environ and os.environ["ACCL_DEBUG"]=="1":
+    logger.setLevel(logging.DEBUG)
+else:
+    logger.setLevel(logging.WARNING)
 
 def create_process_group(
         ranks: list[Rank], design: ACCLDesign,
@@ -42,11 +49,15 @@ def create_process_group(
         # Copy compression since it will be used later in the lambda function
         compression = compression.copy()
 
+    logger.debug(f'Compression: {compression}')
+
     if profiling_ranks is None:
         profiling_ranks = []
     else:
         profiling_ranks = profiling_ranks.copy()
 
+    logger.debug(f'Profiling_ranks: {profiling_ranks}')
+
     def create_process_group_wrapper(store, rank, size, _timeout):
         global process_group
         if process_group is not None:
@@ -56,7 +67,9 @@ def create_process_group_wrapper(store, rank, size, _timeout):
 
         # if simulation:
         #overwrite the design choice in simulation
         # design = ACCLDesign.udp
-
+
+        logger.debug(f'Creating ProcessGroupACCL for: rank {rank}')
+
         pg = ProcessGroupACCL(store, rank, size, ranks, simulation, design,
                               bufsize=bufsize, rsfec=rsfec, nbufs=nbufs,
                               compression=compression,
                               p2p_enabled=p2p_enabled,
@@ -70,7 +83,9 @@ def create_process_group_wrapper(store, rank, size, _timeout):
 
         process_group = pg
         if initialize:
+            logger.debug('Initializing Process Group')
             pg.initialize()
-
         return pg
 
+    #CPU only for now
     Backend.register_backend("ACCL", create_process_group_wrapper, devices='cpu')
 
 def initialize() -> None:
+    logger.debug('Initialize called')
     if process_group is None:
         raise RuntimeError("Cannot initialize before ACCL ProcessGroup "
                            "is created.")
     process_group.initialize()
 
 def get_local_qp(rank: int) -> list[int]:
+    logger.debug('Get_local_qp called')
     if process_group is None:
         raise RuntimeError("Cannot get local qp before ACCL ProcessGroup "
                            "is created.")
     return process_group.get_local_qp(rank)
 
 def set_remote_qp(rank: int, qp: list[int]) -> None:
+    logger.debug('Set_remote_qp called')
     if process_group is None:
         raise RuntimeError("Cannot set remote qp before ACCL ProcessGroup "
                            "is created.")
     return process_group.set_remote_qp(rank, qp)
 
 def set_compression(compression: dict[DataType, DataType]):
+    logger.debug(f'Setting compression to {compression}')
     if process_group is None:
         raise RuntimeError("Cannot set compression before ACCL ProcessGroup "
                            "is initialized.")

diff --git a/integrations/pytorch_ddp/install.py b/integrations/pytorch_ddp/install.py
a/integrations/pytorch_ddp/install.py b/integrations/pytorch_ddp/install.py index 4e0ee3cc..e1366a7d 100755 --- a/integrations/pytorch_ddp/install.py +++ b/integrations/pytorch_ddp/install.py @@ -112,7 +112,7 @@ def install_pytorch(rocm: bool = False, cuda: bool = False): def install_accl_driver(accl_driver_path: Path): print("Installing accl driver...") - if 'ACCL_DEBUG' in os.environ: + if 'ACCL_DEBUG' in os.environ and os.environ["ACCL_DEBUG"]=="1": extra_args = ['-DACCL_DEBUG=1'] else: extra_args = [] @@ -131,7 +131,8 @@ def install_accl_process_group(rocm: bool = False, cuda: bool = False, debug: bo env = os.environ.copy() env['USE_ROCM'] = '1' if rocm else '0' env['USE_CUDA'] = '1' if cuda else '0' - env['ACCL_DEBUG'] = '1' if debug else '0' + if debug: + env['ACCL_DEBUG'] = '1' subprocess.run([python, '-m', 'pip', '-v', 'install', '.'], env=env, cwd=root, check=True) diff --git a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp index 3abbada0..0a94ad7c 100644 --- a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp +++ b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp @@ -606,13 +606,6 @@ ProcessGroupACCL::ProcessGroupACCL( ranks_ = convert_ranks(ranks); design_ = design; - if(simulator_){ - ACCL::debug("running on simulator\n"); - } - else{ - ACCL::debug("not running on simulator\n"); - } - if (!simulator){ if (coyote_enabled) { if (design_ == accl_network_utils::acclDesign::CYT_TCP) { @@ -665,6 +658,21 @@ void ProcessGroupACCL::initialize() { } accl = std::make_unique(cyt_device); + + // eager protocol for now + int protoc = 0; + // default from test.cpp + int segsize = 4096 * 1024; + + if (protoc == 0){ + std::cout<<"Eager Protocol"<initialize(ranks_, rank_, + size_+2, bufsize, segsize, 4096*1024*2); + } else { + std::cout<<"Rendezvous Protocol"<initialize(ranks_, rank_, size_, 64, 64, segsize); + } + ACCL::debug(std::string("[ACCL coyote] communicator: ") + accl->dump_communicator()); } else { accl = accl_network_utils::initialize_accl(ranks_, rank_, diff --git a/integrations/pytorch_ddp/src/coyote_init.cpp b/integrations/pytorch_ddp/src/coyote_init.cpp index db0f2150..602e6cc2 100644 --- a/integrations/pytorch_ddp/src/coyote_init.cpp +++ b/integrations/pytorch_ddp/src/coyote_init.cpp @@ -77,6 +77,7 @@ void setup_cyt_rdma(std::vector &ibvQpConn_vec, std::vector &ranks, int local_rank, ACCL::CoyoteDevice &device) { std::cout << "[ACCL Coyote] Initializing QP..." << std::endl; + ACCL::debug("Cyt setup on rank" + std::to_string(local_rank) + "\n"); // create single page dummy memory space for each qp uint32_t n_pages = 1; for (int i = 0; i < ranks.size(); i++) { @@ -88,7 +89,6 @@ void setup_cyt_rdma(std::vector &ibvQpConn_vec, void configure_cyt_rdma(std::vector &ibvQpConn_vec, std::vector &ranks, int local_rank) { - std::cout << "[ACCL Coyote] Test3..." << std::endl; std::cout << "[ACCL Coyote] Exchanging QP..." 
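// For orientation, a sketch of the schedule driven by the pair loop below:
// every unordered pair of ranks performs one exchange_qp() call, so N ranks
// need N * (N - 1) / 2 exchanges -- e.g. 4 ranks -> 6 exchanges.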
<< std::endl; for (int first_rank = 0; first_rank < ranks.size(); first_rank++) { for (int second_rank = first_rank + 1; second_rank < ranks.size(); diff --git a/integrations/pytorch_ddp/test/run.sh b/integrations/pytorch_ddp/test/run.sh index ca630839..4f2a4a03 100755 --- a/integrations/pytorch_ddp/test/run.sh +++ b/integrations/pytorch_ddp/test/run.sh @@ -83,6 +83,8 @@ else echo "Master node set to: $MASTER_IP:$MASTER_PORT" MPI_ARGS="-f $HOST_FILE --iface ens4f0" + # 09 and 10 have other interface names: + # MPI_ARGS="-f $HOST_FILE --iface ens4" fi ARG="$ARG -c $ACCL_COMMS -i $HOST_FILE -f $FPGA_FILE -a $MASTER_IP -p $MASTER_PORT\"" @@ -95,9 +97,9 @@ echo "Running with $NUM_PROCESS Processes" rm -f $(pwd)/accl_log/rank* -C="mpirun -n $NUM_PROCESS $MPI_ARGS -outfile-pattern \"$(pwd)/accl_log/rank_%r_M_${MODE}_N_${N}_H_${H}_P_${P}_stdout\" $EXEC $ARG &" -# C="mpirun -n $NUM_PROCESS $MPI_ARGS -outfile-pattern \"$(pwd)/accl_log/rank_%r_M_${MODE}_N_${N}_H_${H}_P_${P}_stdout\" -errfile-pattern \"$(pwd)/accl_log/rank_%r_M_${MODE}_N_${N}_H_${H}_P_${P}_stderr\" $EXEC $ARG &" -# C="mpirun -n $NUM_PROCESS -f $HOST_FILE --iface ens4f0 $EXEC $ARG &" +# C="mpirun -n $NUM_PROCESS $MPI_ARGS -outfile-pattern \"$(pwd)/accl_log/rank_%r_stdout\" $EXEC $ARG &" +C="mpirun -n $NUM_PROCESS $MPI_ARGS -outfile-pattern \"$(pwd)/accl_log/rank_%r_stdout\" -errfile-pattern \"$(pwd)/accl_log/rank_%r_stderr\" $EXEC $ARG &" +# C="mpirun -n $NUM_PROCESS $MPI_ARGS $EXEC $ARG &" echo $C /bin/sh -c "$C" @@ -108,13 +110,13 @@ fi echo "Sleeping for $SLEEPTIME" sleep $SLEEPTIME -if ! [[ $ACCL_SIM -eq 1 ]]; then - parallel-ssh -H "$HOST_LIST" "killall -9 $SCRIPT_NAME" - parallel-ssh -H "$HOST_LIST" "dmesg | grep "fpga_tlb_miss_isr" >$(pwd)/accl_log/tlb_miss.log" -else - killall -9 $SCRIPT_NAME - dmesg | grep "fpga_tlb_miss_isr" >$(pwd)/accl_log/tlb_miss.log -fi +# if ! 
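# With the -outfile-pattern/-errfile-pattern flags above, each rank's output
# lands in its own file; for a 2-process run the log directory would look
# like this (hypothetical listing):
#
#   accl_log/rank_0_stdout   accl_log/rank_0_stderr
#   accl_log/rank_1_stdout   accl_log/rank_1_stderr
#
# %r is expanded to the rank number by MPICH's Hydra launcher.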
[[ $ACCL_SIM -eq 1 ]]; then + # parallel-ssh -H "$HOST_LIST" "killall -9 $SCRIPT_NAME" + # parallel-ssh -H "$HOST_LIST" "dmesg | grep "fpga_tlb_miss_isr" >$(pwd)/accl_log/tlb_miss.log" +# else + # killall -9 $SCRIPT_NAME + # dmesg | grep "fpga_tlb_miss_isr" >$(pwd)/accl_log/tlb_miss.log +# fi # mkdir -p "$(pwd)/accl_results" # # Loop through accl log files in the source directory and append to accl_results folder diff --git a/integrations/pytorch_ddp/test/test-generic.py b/integrations/pytorch_ddp/test/test-generic.py index e4b7e346..7e979651 100644 --- a/integrations/pytorch_ddp/test/test-generic.py +++ b/integrations/pytorch_ddp/test/test-generic.py @@ -35,14 +35,22 @@ from torch.utils.data import Dataset, DataLoader from torch.utils.data.distributed import DistributedSampler -#Configure, which logging messages to display +#Configure logging logging.basicConfig(stream=sys.stderr, level=logging.DEBUG) +logger = logging.getLogger(__name__) + +if "ACCL_DEBUG" in os.environ and os.environ["ACCL_DEBUG"]=="1": + logger.setLevel(logging.DEBUG) +else: + logger.setLevel(logging.WARNING) + rank = 0 size = 0 count = 1024 -rxbufsize = 1500 * 4 +#As in test.cpp defaults +rxbufsize = 4096 * 1024 def test_broadcast(): @@ -53,7 +61,7 @@ def test_broadcast(): dist.broadcast(x, 0) - logging.debug('Tensor after broadcast: ' + str(x)) + logger.debug('Tensor after broadcast: ' + str(x)) print('Tensor after broadcast: ' + str(x)) np.testing.assert_allclose(x, torch.ones(count)) @@ -198,45 +206,27 @@ def demo_basic(rank: int): print("finished training") dist.destroy_process_group() -# def exchange_qp(first_rank, second_rank, rank, ranks): -# if rank == first_rank: -# mpi.send(accl.get_local_qp(second_rank), dest=second_rank, tag=23) -# elif rank == second_rank: -# accl.set_remote_qp(first_rank, mpi.recv(source=first_rank, tag=23)) - -# mpi.barrier() - -# if rank == second_rank: -# mpi.send(accl.get_local_qp(first_rank), dest=first_rank, tag=24) -# elif rank == first_rank: -# accl.set_remote_qp(second_rank, mpi.recv(source=second_rank, tag=24)) - -# mpi.barrier() - - -#def configure_cyt_rdma(ranks): -# global rank, size -# for first_rank in range(0, size): -# for second_rank in range(first_rank + 1, size): -# exchange_qp(first_rank, second_rank, rank, ranks) -# accl.initialize() -# mpi.barrier() - - - -def start_test(comms: str, simulator: bool, host_file: str, fpga_file: str, ma: str, mp: str): +def start_test(comms: str, simulator: bool, host_file: str=None, fpga_file: str=None, ma: str="localhost", mp: str="30505"): global rank, size + if ma==None: + ma = "localhost" + if mp==None: + mp = "30505" os.environ['MASTER_ADDR'] = ma os.environ['MASTER_PORT'] = mp rank = mpi.Get_rank() size = mpi.Get_size() - # size = 2 - print(f"MASTER: {os.environ['MASTER_ADDR']}{os.environ['MASTER_PORT']} ") - print(f"Starting tests on rank {rank} with size {size}") - start_port = 5005 + print(f"Starting tests with the following parameters:\n\ +Simulation: {simulator}, Communication Backend: {comms}\n\ +Rank: {rank}, World size: {size}\n\ +Host file: {host_file}, FPGA file: {fpga_file}\n\ +Master address: {ma}:{mp}, Start port for FPGA: {start_port}") + if not simulator: + if host_file==None or fpga_file==None: sys.exit('Host and FPGA file need to be specified in hardware mode') + with open(host_file, 'r') as hf: host_ips = hf.readlines() @@ -250,6 +240,8 @@ def start_test(comms: str, simulator: bool, host_file: str, fpga_file: str, ma: else: ranks = [accl.Rank("127.0.0.1", 5500 + i, i, rxbufsize) for i in range(size)] + 
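# The host/FPGA files are assumed to be plain text with one IPv4 address per
# line, where line i describes rank i -- e.g. (hypothetical contents):
#
#     $ cat hosts.txt          $ cat fpgas.txt
#     10.1.212.1               10.1.212.101
#     10.1.212.2               10.1.212.102
#
# For cyt_rdma the rank list is built from the FPGA addresses; note that
# readlines() keeps the trailing newline on each address, which a later
# patch replaces with read().splitlines().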
logger.debug(f'Ranks: {ranks}') + if comms == 'udp': design = accl.ACCLDesign.udp elif comms == 'tcp': @@ -261,6 +253,7 @@ def start_test(comms: str, simulator: bool, host_file: str, fpga_file: str, ma: sys.exit('Design "' + comms + '" currently not supported in simulator mode') else: sys.exit('Design "' + comms + '" currently not supported in hardware mode') + accl.create_process_group(ranks, design, bufsize=rxbufsize, initialize=True, simulation=simulator) dist.init_process_group("ACCL", rank=rank, world_size=size) @@ -278,12 +271,13 @@ def start_test(comms: str, simulator: bool, host_file: str, fpga_file: str, ma: mpi.Barrier() test_allgather() mpi.Barrier() - test_reduce() - mpi.Barrier() - test_allreduce() - mpi.Barrier() - demo_basic(rank) - + # test_reduce() + # mpi.Barrier() + # test_allreduce() + # mpi.Barrier() + # demo_basic(rank) + # mpi.Barrier() + print(prof.key_averages(group_by_input_shape=True) .table(sort_by="cpu_time_total", row_limit=15)) @@ -296,8 +290,8 @@ def start_test(comms: str, simulator: bool, host_file: str, fpga_file: str, ma: 'hardware') parser.add_argument('-c', '--comms', choices=['udp', 'tcp', 'cyt_rdma'], default='tcp', help='Run tests over specified communication backend') - parser.add_argument('-i', '--host-file', type=str, required=True, help='Specify the file, where the host IPs are listed') - parser.add_argument('-f', '--fpga-file', type=str, required=True, help='Specify the file, where the FPGA IPs are listed') + parser.add_argument('-i', '--host-file', type=str, help='Specify the file, where the host IPs are listed') + parser.add_argument('-f', '--fpga-file', type=str, help='Specify the file, where the FPGA IPs are listed') parser.add_argument('-a','--master-address', type=str) parser.add_argument('-p','--master-port', type=str) args = parser.parse_args() From c6075645f020066ae12642834e5a238990464e95 Mon Sep 17 00:00:00 2001 From: Laurent Wirz Date: Sat, 27 Apr 2024 11:16:57 +0200 Subject: [PATCH 09/64] Fixed rank initialization errors --- integrations/pytorch_ddp/src/ProcessGroupACCL.cpp | 11 ++++++++--- integrations/pytorch_ddp/test/run.sh | 2 +- integrations/pytorch_ddp/test/test-generic.py | 4 ++-- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp index 0a94ad7c..2a6dc6c2 100644 --- a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp +++ b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp @@ -41,6 +41,7 @@ namespace cyt = coyote_init; namespace py = pybind11; +using namespace ACCL; namespace c10d { @@ -660,7 +661,7 @@ void ProcessGroupACCL::initialize() { accl = std::make_unique(cyt_device); // eager protocol for now - int protoc = 0; + int protoc = 1; // default from test.cpp int segsize = 4096 * 1024; @@ -816,9 +817,11 @@ void ProcessGroupACCL::run_broadcast(at::Tensor tensor_original, if (!coyote_enabled && rank_ == opts.rootRank) { data->sync_to_device(); } - - accl->bcast(*data, tensor->numel(), opts.rootRank, ACCL::GLOBAL_COMM, true, + // auto start = std::chrono::high_resolution_clock::now(); + ACCL::ACCLRequest* req = accl.bcast(*data, tensor->numel(), opts.rootRank, ACCL::GLOBAL_COMM, true, true, get_compressed_type(tensor->scalar_type())); + accl.wait(req, 1000ms); + // auto end = std::chrono::high_resolution_clock::now(); int retcode = accl->get_retcode(); if (retcode) { TORCH_CHECK(false, ACCL_ERROR(retcode)); @@ -850,11 +853,13 @@ ProcessGroupACCL::broadcast(std::vector &tensors, // Segment data if necessary if 
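// Worked example of the segmentation arithmetic below (a sketch): with
// bufsize = 4096 * 1024 bytes and a float32 tensor (itemsize 4),
// n = bufsize / 4 = 1,048,576 elements per slice, so a 6,291,456-element
// tensor is broadcast as six back-to-back run_broadcast() calls.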
(tensor.nbytes() > bufsize) { size_t n = bufsize / tensor.itemsize(); + ACCL::debug("[Broadcast] Segmenting tensor of size " + std::to_string(tensor.nbytes()) + " into " + std::to_string(n) + "-sized elements "); for (size_t i = 0; i < tensor.numel(); i += n) { size_t end = std::min(i + n, static_cast(tensor.numel())); run_broadcast(tensor.slice(0, i, end), opts); } } else { + ACCL::debug("[Broadcast] Broadcasting entire tensor of size " + std::to_string(tensor.nbytes()) + " without segmentation."); run_broadcast(tensor, opts); } }; diff --git a/integrations/pytorch_ddp/test/run.sh b/integrations/pytorch_ddp/test/run.sh index 4f2a4a03..0dbf3d72 100755 --- a/integrations/pytorch_ddp/test/run.sh +++ b/integrations/pytorch_ddp/test/run.sh @@ -35,7 +35,7 @@ if [[ $ACCL_SIM -eq 1 ]]; then ACCL_COMMS="udp" - echo "assuming udp comms in simulator" + echo "assuming $ACCL_COMMS comms in simulator" if [[ -v ACCL_NP ]]; then NUM_PROCESS="$ACCL_NP" diff --git a/integrations/pytorch_ddp/test/test-generic.py b/integrations/pytorch_ddp/test/test-generic.py index 7e979651..4fdbd925 100644 --- a/integrations/pytorch_ddp/test/test-generic.py +++ b/integrations/pytorch_ddp/test/test-generic.py @@ -228,10 +228,10 @@ def start_test(comms: str, simulator: bool, host_file: str=None, fpga_file: str= if host_file==None or fpga_file==None: sys.exit('Host and FPGA file need to be specified in hardware mode') with open(host_file, 'r') as hf: - host_ips = hf.readlines() + host_ips = hf.read().splitlines() with open(fpga_file, 'r') as ff: - fpga_ips = ff.readlines() + fpga_ips = ff.read().splitlines() if comms == "cyt_rdma": ranks = [accl.Rank(a, start_port, i, rxbufsize) for i, a in enumerate(fpga_ips)] From cb092f439f301ff2fbaf473e3f46728b58273362 Mon Sep 17 00:00:00 2001 From: Laurent Wirz Date: Sat, 27 Apr 2024 19:19:45 +0200 Subject: [PATCH 10/64] Small pg fix --- integrations/pytorch_ddp/src/ProcessGroupACCL.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp index 2a6dc6c2..0de807b2 100644 --- a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp +++ b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp @@ -818,9 +818,9 @@ void ProcessGroupACCL::run_broadcast(at::Tensor tensor_original, data->sync_to_device(); } // auto start = std::chrono::high_resolution_clock::now(); - ACCL::ACCLRequest* req = accl.bcast(*data, tensor->numel(), opts.rootRank, ACCL::GLOBAL_COMM, true, + ACCL::ACCLRequest* req = accl->bcast(*data, tensor->numel(), opts.rootRank, ACCL::GLOBAL_COMM, true, true, get_compressed_type(tensor->scalar_type())); - accl.wait(req, 1000ms); + accl->wait(req, 1000ms); // auto end = std::chrono::high_resolution_clock::now(); int retcode = accl->get_retcode(); if (retcode) { From 84131ba39ab14b3449a6901f866484a2e5dfc9ec Mon Sep 17 00:00:00 2001 From: Laurent Wirz Date: Wed, 1 May 2024 13:09:00 +0200 Subject: [PATCH 11/64] Redefining exchange_qp fixes most errors --- integrations/pytorch_ddp/setup.py | 3 +- .../pytorch_ddp/src/ProcessGroupACCL.cpp | 51 ++++++-- integrations/pytorch_ddp/src/coyote_init.cpp | 120 ++++++++++++++---- integrations/pytorch_ddp/test/test-generic.py | 6 +- 4 files changed, 142 insertions(+), 38 deletions(-) diff --git a/integrations/pytorch_ddp/setup.py b/integrations/pytorch_ddp/setup.py index 4f16452e..7917b138 100755 --- a/integrations/pytorch_ddp/setup.py +++ b/integrations/pytorch_ddp/setup.py @@ -49,8 +49,9 @@ root / 'accl' / 'test' / 'model' / 'zmq', vnx_dir / 
'include', roce_dir / 'include', root / 'accl' / 'test' / 'refdesigns' / 'Coyote' / 'sw' / 'include', + '/pub/scratch/zhe/mpich/install/include', '/usr/include/jsoncpp'] -library_dirs = [driver_dir / 'xrt' / 'lib', xrt_dir / 'lib'] +library_dirs = [driver_dir / 'xrt' / 'lib', xrt_dir / 'lib', '/mnt/scratch/zhe/mpich/install/lib/libmpicxx.so'] libraries = ['accl', 'jsoncpp', 'zmq'] sources = [root / 'src' / 'ProcessGroupACCL.cpp', root / 'src' / 'coyote_init.cpp', diff --git a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp index 0de807b2..71361f98 100644 --- a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp +++ b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp @@ -660,15 +660,14 @@ void ProcessGroupACCL::initialize() { accl = std::make_unique(cyt_device); - // eager protocol for now + // Rendezvous protocol for now int protoc = 1; // default from test.cpp int segsize = 4096 * 1024; if (protoc == 0){ std::cout<<"Eager Protocol"<initialize(ranks_, rank_, - size_+2, bufsize, segsize, 4096*1024*2); + accl.get()->initialize(ranks_, rank_, size_+2, bufsize, segsize, 4096*1024*2); } else { std::cout<<"Rendezvous Protocol"<initialize(ranks_, rank_, size_, 64, 64, segsize); @@ -810,27 +809,58 @@ void ProcessGroupACCL::run_broadcast(at::Tensor tensor_original, } } + // int num_prints = std::max((int64_t) 3, tensor->numel()); + + // ACCL::debug("First " + std::to_string(num_prints) + " elements before bcast:"); + + // for(int i = 0; i < num_prints; i++){ + // ACCL::debug(std::to_string(data.get()->buffer()[i])); + // } + // Run broadcast - ACCL::debug("Starting broadcast of " + std::to_string(tensor->numel()) + - " items"); + //check wether this is needed, with hostmem if (!coyote_enabled && rank_ == opts.rootRank) { data->sync_to_device(); } + + ACCL::debug("[Broadcast] Entering pre-bcast barrier"); + + //add mpi barrier + accl->barrier(); + + ACCL::debug("Starting broadcast of " + std::to_string(tensor->numel()) + " items"); + // auto start = std::chrono::high_resolution_clock::now(); ACCL::ACCLRequest* req = accl->bcast(*data, tensor->numel(), opts.rootRank, ACCL::GLOBAL_COMM, true, true, get_compressed_type(tensor->scalar_type())); - accl->wait(req, 1000ms); + // not sure if this is supported in other modes + // ACCL::debug("First " + std::to_string(num_prints) + " elements after bcast:"); + ACCL::debug("After request"); + + + if (!coyote_enabled && rank_ != opts.rootRank) { + data->sync_from_device(); + } + + // for(int i = 0; i < num_prints; i++){ + // ACCL::debug(std::to_string(data.get()->buffer()[i])); + // } + + if(coyote_enabled){ + ACCL::debug("Waiting for request to complete."); + accl->wait(req, 1000ms); + } + ACCL::debug("After wait"); // auto end = std::chrono::high_resolution_clock::now(); int retcode = accl->get_retcode(); + + ACCL::debug("Returncode: " + std::to_string(retcode)); if (retcode) { + // add deconstruction TORCH_CHECK(false, ACCL_ERROR(retcode)); } - if (!coyote_enabled && rank_ != opts.rootRank) { - data->sync_from_device(); - } - // Copy results back to GPU if necessary if (p2p_applicable(*accl, tensor_original, p2p_enabled)) { copy_back_p2p_buffer(*data, tensor_original); @@ -846,6 +876,7 @@ void ProcessGroupACCL::run_broadcast(at::Tensor tensor_original, c10::intrusive_ptr ProcessGroupACCL::broadcast(std::vector &tensors, const BroadcastOptions &opts) { + debug(accl->dump_eager_rx_buffers(false)); checkSingleTensor(tensors); std::function &)> runFunc = [opts, this](std::unique_ptr &entry) { diff --git 
a/integrations/pytorch_ddp/src/coyote_init.cpp b/integrations/pytorch_ddp/src/coyote_init.cpp index 602e6cc2..8523cabf 100644 --- a/integrations/pytorch_ddp/src/coyote_init.cpp +++ b/integrations/pytorch_ddp/src/coyote_init.cpp @@ -18,6 +18,9 @@ #include "coyote_init.hpp" #include #include +#include + +using namespace ACCL; namespace { inline void swap_endianness(uint32_t *ip) { @@ -42,34 +45,99 @@ std::string ip_decode(uint32_t ip) { return std::string(buffer, INET_ADDRSTRLEN); } -void exchange_qp(unsigned int first_rank, unsigned int second_rank, - unsigned int local_rank, - std::vector &ibvQpConn_vec, - std::vector &ranks) { - // write established connection to hardware and perform arp lookup - if (local_rank == first_rank) { - int connection = - (ibvQpConn_vec[second_rank]->getQpairStruct()->local.qpn & 0xFFFF) | - ((ibvQpConn_vec[second_rank]->getQpairStruct()->remote.qpn & 0xFFFF) - << 16); - ibvQpConn_vec[second_rank]->setConnection(connection); - ibvQpConn_vec[second_rank]->writeContext(ranks[second_rank].port); - ibvQpConn_vec[second_rank]->doArpLookup(); - ranks[second_rank].session_id = - ibvQpConn_vec[second_rank]->getQpairStruct()->local.qpn; - } else if (local_rank == second_rank) { - int connection = - (ibvQpConn_vec[first_rank]->getQpairStruct()->local.qpn & 0xFFFF) | - ((ibvQpConn_vec[first_rank]->getQpairStruct()->remote.qpn & 0xFFFF) - << 16); - ibvQpConn_vec[first_rank]->setConnection(connection); - ibvQpConn_vec[first_rank]->writeContext(ranks[first_rank].port); - ibvQpConn_vec[first_rank]->doArpLookup(); - ranks[first_rank].session_id = - ibvQpConn_vec[first_rank]->getQpairStruct()->local.qpn; - } +void exchange_qp(unsigned int master_rank, unsigned int slave_rank, unsigned int local_rank, std::vector &ibvQpConn_vec, std::vector &ranks) +{ + + if (local_rank == master_rank) + { + std::cout<<"Local rank "<getQpairStruct()->local), sizeof(fpga::ibvQ), MPI_CHAR, slave_rank, 0, MPI_COMM_WORLD); + } + else if (local_rank == slave_rank) + { + std::cout<<"Local rank "<getQpairStruct()->remote = received_q; + } + + // Synchronize after the first exchange to avoid race conditions + MPI_Barrier(MPI_COMM_WORLD); + + if (local_rank == slave_rank) + { + std::cout<<"Local rank "<getQpairStruct()->local), sizeof(fpga::ibvQ), MPI_CHAR, master_rank, 0, MPI_COMM_WORLD); + } + else if (local_rank == master_rank) + { + std::cout<<"Local rank "<getQpairStruct()->remote = received_q; + } + + MPI_Barrier(MPI_COMM_WORLD); + + // write established connection to hardware and perform arp lookup + if (local_rank == master_rank) + { + int connection = (ibvQpConn_vec[slave_rank]->getQpairStruct()->local.qpn & 0xFFFF) | ((ibvQpConn_vec[slave_rank]->getQpairStruct()->remote.qpn & 0xFFFF) << 16); + ibvQpConn_vec[slave_rank]->getQpairStruct()->print(); + ibvQpConn_vec[slave_rank]->setConnection(connection); + ibvQpConn_vec[slave_rank]->writeContext(ranks[slave_rank].port); + ibvQpConn_vec[slave_rank]->doArpLookup(); + ranks[slave_rank].session_id = ibvQpConn_vec[slave_rank]->getQpairStruct()->local.qpn; + } else if (local_rank == slave_rank) + { + int connection = (ibvQpConn_vec[master_rank]->getQpairStruct()->local.qpn & 0xFFFF) | ((ibvQpConn_vec[master_rank]->getQpairStruct()->remote.qpn & 0xFFFF) << 16); + ibvQpConn_vec[master_rank]->getQpairStruct()->print(); + ibvQpConn_vec[master_rank]->setConnection(connection); + ibvQpConn_vec[master_rank]->writeContext(ranks[master_rank].port); + ibvQpConn_vec[master_rank]->doArpLookup(); + ranks[master_rank].session_id = 
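// In outline, the exchange above proceeds in three barrier-separated steps
// (a sketch of the flow, not additional code):
//   1. the master MPI_Send()s its local ibvQ struct; the slave MPI_Recv()s
//      it into the `remote` side of its queue pair;
//   2. the roles are swapped, so both sides now hold each other's queue
//      state;
//   3. both sides pack the local/remote qpn pair into `connection`, write
//      the context to hardware, do an ARP lookup, and record the local qpn
//      as the peer's session_id.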
ibvQpConn_vec[master_rank]->getQpairStruct()->local.qpn; + } + + MPI_Barrier(MPI_COMM_WORLD); } + +// void exchange_qp(unsigned int first_rank, unsigned int second_rank, +// unsigned int local_rank, +// std::vector &ibvQpConn_vec, +// std::vector &ranks) { +// // write established connection to hardware and perform arp lookup +// if (local_rank == first_rank) { +// int connection = +// (ibvQpConn_vec[second_rank]->getQpairStruct()->local.qpn & 0xFFFF) | +// ((ibvQpConn_vec[second_rank]->getQpairStruct()->remote.qpn & 0xFFFF) +// << 16); +// ibvQpConn_vec[second_rank]->setConnection(connection); +// ibvQpConn_vec[second_rank]->writeContext(ranks[second_rank].port); +// ibvQpConn_vec[second_rank]->doArpLookup(); +// ranks[second_rank].session_id = +// ibvQpConn_vec[second_rank]->getQpairStruct()->local.qpn; +// } else if (local_rank == second_rank) { +// int connection = +// (ibvQpConn_vec[first_rank]->getQpairStruct()->local.qpn & 0xFFFF) | +// ((ibvQpConn_vec[first_rank]->getQpairStruct()->remote.qpn & 0xFFFF) +// << 16); +// ibvQpConn_vec[first_rank]->setConnection(connection); +// ibvQpConn_vec[first_rank]->writeContext(ranks[first_rank].port); +// ibvQpConn_vec[first_rank]->doArpLookup(); +// ranks[first_rank].session_id = +// ibvQpConn_vec[first_rank]->getQpairStruct()->local.qpn; +// } +// } + } // namespace namespace coyote_init { diff --git a/integrations/pytorch_ddp/test/test-generic.py b/integrations/pytorch_ddp/test/test-generic.py index 4fdbd925..a666760f 100644 --- a/integrations/pytorch_ddp/test/test-generic.py +++ b/integrations/pytorch_ddp/test/test-generic.py @@ -48,7 +48,7 @@ rank = 0 size = 0 -count = 1024 +count = 512 #As in test.cpp defaults rxbufsize = 4096 * 1024 @@ -277,10 +277,14 @@ def start_test(comms: str, simulator: bool, host_file: str=None, fpga_file: str= # mpi.Barrier() # demo_basic(rank) # mpi.Barrier() + + + # add destroy print(prof.key_averages(group_by_input_shape=True) .table(sort_by="cpu_time_total", row_limit=15)) + dist.destroy_process_group() if __name__ == '__main__': import argparse From 9cc61bda35fae8a7cc0e69793bf476a437c33f0d Mon Sep 17 00:00:00 2001 From: Laurent Wirz Date: Thu, 2 May 2024 17:19:23 +0200 Subject: [PATCH 12/64] Added barriers, waits and removed get-retcode. 
fixes all collectives on cyt_rdma --- .../pytorch_ddp/src/ProcessGroupACCL.cpp | 240 ++++++++++-------- 1 file changed, 132 insertions(+), 108 deletions(-) diff --git a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp index 71361f98..8a83e004 100644 --- a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp +++ b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp @@ -809,56 +809,33 @@ void ProcessGroupACCL::run_broadcast(at::Tensor tensor_original, } } - // int num_prints = std::max((int64_t) 3, tensor->numel()); - - // ACCL::debug("First " + std::to_string(num_prints) + " elements before bcast:"); - - // for(int i = 0; i < num_prints; i++){ - // ACCL::debug(std::to_string(data.get()->buffer()[i])); - // } - - // Run broadcast - //check wether this is needed, with hostmem if (!coyote_enabled && rank_ == opts.rootRank) { data->sync_to_device(); } - ACCL::debug("[Broadcast] Entering pre-bcast barrier"); - - //add mpi barrier + ACCL::debug("[Broadcast] Entering barrier"); accl->barrier(); ACCL::debug("Starting broadcast of " + std::to_string(tensor->numel()) + " items"); - // auto start = std::chrono::high_resolution_clock::now(); ACCL::ACCLRequest* req = accl->bcast(*data, tensor->numel(), opts.rootRank, ACCL::GLOBAL_COMM, true, true, get_compressed_type(tensor->scalar_type())); - // not sure if this is supported in other modes - // ACCL::debug("First " + std::to_string(num_prints) + " elements after bcast:"); - ACCL::debug("After request"); - - if (!coyote_enabled && rank_ != opts.rootRank) { - data->sync_from_device(); - } - - // for(int i = 0; i < num_prints; i++){ - // ACCL::debug(std::to_string(data.get()->buffer()[i])); - // } - if(coyote_enabled){ ACCL::debug("Waiting for request to complete."); accl->wait(req, 1000ms); } - ACCL::debug("After wait"); - // auto end = std::chrono::high_resolution_clock::now(); - int retcode = accl->get_retcode(); + ACCL::debug("Finished waiting"); - ACCL::debug("Returncode: " + std::to_string(retcode)); - if (retcode) { + // ACCL::debug("Returncode: " + std::to_string(retcode)); + // if (retcode) { // add deconstruction - TORCH_CHECK(false, ACCL_ERROR(retcode)); + // TORCH_CHECK(false, ACCL_ERROR(retcode)); + // } + + if (!coyote_enabled && rank_ != opts.rootRank) { + data->sync_from_device(); } // Copy results back to GPU if necessary @@ -939,18 +916,25 @@ void ProcessGroupACCL::run_allreduce(at::Tensor tensor_original, } // Run allreduce - ACCL::debug("Starting allreduce of " + std::to_string(tensor->numel()) + - " items"); if (!coyote_enabled) { data->sync_to_device(); } - accl->allreduce(*data, *result, tensor->numel(), acclOp.at(opts.reduceOp), + + ACCL::debug("[AllReduce] Entering barrier"); + accl->barrier(); + + ACCL::debug("Starting allreduce of " + std::to_string(tensor->numel()) + + " items"); + ACCL::ACCLRequest* req = accl->allreduce(*data, *result, tensor->numel(), acclOp.at(opts.reduceOp), ACCL::GLOBAL_COMM, true, true, get_compressed_type(tensor->scalar_type())); - int retcode = accl->get_retcode(); - if (retcode) { - TORCH_CHECK(false, ACCL_ERROR(retcode)); + + if(coyote_enabled){ + ACCL::debug("Waiting for request to complete."); + accl->wait(req, 1000ms); } + + ACCL::debug("Finished waiting"); if (!coyote_enabled) { result->sync_from_device(); @@ -1055,18 +1039,24 @@ void ProcessGroupACCL::run_reduce(at::Tensor tensor_original, } // Run reduce - ACCL::debug("Starting reduce of " + std::to_string(tensor->numel()) + - " items"); if (!coyote_enabled) { data->sync_to_device(); } - 
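// Every collective in this patch is rewritten to the same shape (sketch;
// <collective> stands for bcast, reduce, allgather, etc.):
//
//   accl->barrier();                                  // align all ranks
//   ACCL::ACCLRequest *req = accl-><collective>(...); // issue the call
//   if (coyote_enabled)
//     accl->wait(req, 1000ms);                        // bounded completion
//
// replacing the previous synchronous call plus accl->get_retcode() check.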
accl->reduce(*data, *result, tensor->numel(), opts.rootRank, + + ACCL::debug("[Reduce] Entering barrier"); + accl->barrier(); + + ACCL::debug("Starting reduce of " + std::to_string(tensor->numel()) + + " items"); + ACCL::ACCLRequest* req = accl->reduce(*data, *result, tensor->numel(), opts.rootRank, acclOp.at(opts.reduceOp), ACCL::GLOBAL_COMM, true, true, get_compressed_type(tensor->scalar_type())); - int retcode = accl->get_retcode(); - if (retcode) { - TORCH_CHECK(false, ACCL_ERROR(retcode)); + + if(coyote_enabled){ + ACCL::debug("Waiting for request to complete."); + accl->wait(req, 1000ms); } + ACCL::debug("Finished waiting"); if (!coyote_enabled && rank_ == opts.rootRank) { result->sync_from_device(); @@ -1179,18 +1169,24 @@ void ProcessGroupACCL::run_allgather( } // Run allgather - ACCL::debug("Starting allgather of " + std::to_string(srctensor->numel()) + - " items"); if (!coyote_enabled) { srcdata->sync_to_device(); } - accl->allgather(*srcdata, *dstdata, srctensor->numel(), ACCL::GLOBAL_COMM, + + ACCL::debug("[Allgather] Entering barrier"); + accl->barrier(); + + ACCL::debug("Starting allgather of " + std::to_string(srctensor->numel()) + + " items"); + ACCL::ACCLRequest* req = accl->allgather(*srcdata, *dstdata, srctensor->numel(), ACCL::GLOBAL_COMM, true, true, get_compressed_type(srctensor->scalar_type())); - int retcode = accl->get_retcode(); - if (retcode) { - TORCH_CHECK(false, ACCL_ERROR(retcode)); + if(coyote_enabled){ + ACCL::debug("Waiting for request to complete."); + accl->wait(req, 1000ms); } + ACCL::debug("Finished waiting"); + if (!coyote_enabled) { dstdata->sync_from_device(); } @@ -1328,22 +1324,25 @@ void ProcessGroupACCL::run_gather(at::Tensor srctensor_original, } // Run gather - ACCL::debug("Starting gather of " + std::to_string(srctensor->numel()) + - " items"); - if (!coyote_enabled) { srcdata->sync_to_device(); } - accl->gather(*srcdata, *dstdata, srctensor->numel(), opts.rootRank, + ACCL::debug("[Gather] Entering barrier"); + accl->barrier(); + + ACCL::debug("Starting gather of " + std::to_string(srctensor->numel()) + + " items"); + ACCL::ACCLRequest* req = accl->gather(*srcdata, *dstdata, srctensor->numel(), opts.rootRank, ACCL::GLOBAL_COMM, true, true, get_compressed_type(srctensor->scalar_type())); - int retcode = accl->get_retcode(); - if (retcode) { - TORCH_CHECK(false, ACCL_ERROR(retcode)); + if(coyote_enabled){ + ACCL::debug("Waiting for request to complete."); + accl->wait(req, 1000ms); } - + ACCL::debug("Finished waiting"); + if (!coyote_enabled && rank_ == opts.rootRank) { dstdata->sync_from_device(); } @@ -1435,30 +1434,28 @@ void ProcessGroupACCL::run_scatter(std::vector &srctensorvec, // Create new input buffer, since srctensorvec is not continuous in memory if (rank_ == opts.rootRank) { at::Tensor srctensor; - if (rank_ == opts.rootRank) { - if (p2p_applicable(*accl, srctensorvec[0], p2p_enabled)) { - srcdata = create_buffer_p2p( - *accl, dsttensor->numel() * static_cast(size_), - dsttensor->scalar_type()); - } else if (coyote_enabled) { - srcdata = create_coyotebuffer(*accl, - dsttensor->numel() * static_cast(size_), - dsttensor->scalar_type()); - std::vector sizes = {static_cast(dsttensor->numel()) * - size_}; - srctensor = - torch::from_blob(srcdata->byte_array(), sizes, - dsttensor->options().device(c10::DeviceType::CPU)); - } else { - srcdata = create_buffer(*accl, - dsttensor->numel() * static_cast(size_), - dsttensor->scalar_type()); - std::vector sizes = {static_cast(dsttensor->numel()) * - size_}; - srctensor = - 
torch::from_blob(srcdata->byte_array(), sizes, - dsttensor->options().device(c10::DeviceType::CPU)); - } + if (p2p_applicable(*accl, srctensorvec[0], p2p_enabled)) { + srcdata = create_buffer_p2p( + *accl, dsttensor->numel() * static_cast(size_), + dsttensor->scalar_type()); + } else if (coyote_enabled) { + srcdata = create_coyotebuffer(*accl, + dsttensor->numel() * static_cast(size_), + dsttensor->scalar_type()); + std::vector sizes = {static_cast(dsttensor->numel()) * + size_}; + srctensor = + torch::from_blob(srcdata->byte_array(), sizes, + dsttensor->options().device(c10::DeviceType::CPU)); + } else { + srcdata = create_buffer(*accl, + dsttensor->numel() * static_cast(size_), + dsttensor->scalar_type()); + std::vector sizes = {static_cast(dsttensor->numel()) * + size_}; + srctensor = + torch::from_blob(srcdata->byte_array(), sizes, + dsttensor->options().device(c10::DeviceType::CPU)); } // Copy data to input buffer @@ -1504,26 +1501,34 @@ void ProcessGroupACCL::run_scatter(std::vector &srctensorvec, } } - // Run scatter - ACCL::debug("Starting scatter of " + std::to_string(dsttensor->numel()) + - " items"); - if (!coyote_enabled && rank_ == opts.rootRank) { srcdata->sync_to_device(); } - accl->scatter(*srcdata, *dstdata, dsttensor->numel(), opts.rootRank, + ACCL::debug("[Scatter] Entering barrier"); + accl->barrier(); + + + ACCL::debug("Starting scatter of " + std::to_string(dsttensor->numel()) + + " items"); + // Run scatter + ACCL::ACCLRequest* req = accl->scatter(*srcdata, *dstdata, dsttensor->numel(), opts.rootRank, ACCL::GLOBAL_COMM, true, true, get_compressed_type(dsttensor->scalar_type())); - int retcode = accl->get_retcode(); - if (retcode) { - TORCH_CHECK(false, ACCL_ERROR(retcode)); + + if(coyote_enabled){ + ACCL::debug("Waiting for request to complete."); + accl->wait(req, 1000ms); } + ACCL::debug("Finished wait"); + if (!coyote_enabled) { dstdata->sync_from_device(); } + + // Copy result back to GPU if necessary if (p2p_applicable(*accl, dsttensor_original, p2p_enabled)) { copy_back_p2p_buffer(*dstdata, dsttensor_original); @@ -1650,18 +1655,25 @@ void ProcessGroupACCL::run_alltoall(at::Tensor srctensor_original, } // Run alltoall - ACCL::debug("Starting alltoall of " + std::to_string(srctensor->numel()) + - " items"); if (!coyote_enabled) { srcdata->sync_to_device(); } - accl->alltoall(*srcdata, *dstdata, srctensor->numel(), + + ACCL::debug("[AlltoAll] Entering barrier"); + accl->barrier(); + + ACCL::debug("Starting alltoall of " + std::to_string(srctensor->numel()) + + " items"); + ACCL::ACCLRequest* req = accl->alltoall(*srcdata, *dstdata, srctensor->numel(), ACCL::GLOBAL_COMM, true, true, get_compressed_type(srctensor->scalar_type())); - int retcode = accl->get_retcode(); - if (retcode) { - TORCH_CHECK(false, ACCL_ERROR(retcode)); + + if(coyote_enabled){ + ACCL::debug("Waiting for request to complete."); + accl->wait(req, 1000ms); } + ACCL::debug("Finished waiting"); + if (!coyote_enabled) { dstdata->sync_from_device(); } @@ -1772,18 +1784,25 @@ void ProcessGroupACCL::run_send(at::Tensor tensor_original, int dstRank, } // Run send - ACCL::debug("Starting send of " + std::to_string(tensor->numel()) + - " items to " + std::to_string(dstRank)); if (!coyote_enabled) { data->sync_to_device(); } - accl->send(*data, tensor->numel(), dstRank, tag, ACCL::GLOBAL_COMM, true, + + ACCL::debug("[Send] Entering barrier"); + accl->barrier(); + + ACCL::debug("Starting send of " + std::to_string(tensor->numel()) + + " items to " + std::to_string(dstRank)); + + ACCL::ACCLRequest* req = 
accl->send(*data, tensor->numel(), dstRank, tag, ACCL::GLOBAL_COMM, true, get_compressed_type(tensor->scalar_type())); - int retcode = accl->get_retcode(); - if (retcode) { - TORCH_CHECK(false, ACCL_ERROR(retcode)); + if(coyote_enabled){ + ACCL::debug("Waiting for request to complete."); + accl->wait(req, 1000ms); } + ACCL::debug("Finished waiting"); + } c10::intrusive_ptr @@ -1848,16 +1867,21 @@ void ProcessGroupACCL::run_recv(at::Tensor tensor_original, int srcRank, } // Run recieve - ACCL::debug("Starting recieve of " + std::to_string(tensor->numel()) + + + ACCL::debug("[Receive] Entering barrier"); + accl->barrier(); + + ACCL::debug("Starting receive of " + std::to_string(tensor->numel()) + " items from " + std::to_string(srcRank)); - accl->recv(*data, tensor->numel(), srcRank, tag, ACCL::GLOBAL_COMM, true, + ACCL::ACCLRequest* req = accl->recv(*data, tensor->numel(), srcRank, tag, ACCL::GLOBAL_COMM, true, get_compressed_type(tensor->scalar_type())); - int retcode = accl->get_retcode(); - if (retcode) { - TORCH_CHECK(false, ACCL_ERROR(retcode)); + if(coyote_enabled){ + ACCL::debug("Waiting for request to complete."); + accl->wait(req, 1000ms); } - + ACCL::debug("Finished waiting"); + if (!coyote_enabled) { data->sync_from_device(); } From cb229c65cd40dffc286de72b9c383b99e5d914b9 Mon Sep 17 00:00:00 2001 From: Laurent Wirz Date: Fri, 3 May 2024 17:43:49 +0200 Subject: [PATCH 13/64] Restructured tests and incorporated torchvision test --- .../{ => old_testscripts}/test-compression.py | 0 .../test/{ => old_testscripts}/test-coyote.py | 0 .../{ => old_testscripts}/test-gpu-p2p.py | 0 .../test/{ => old_testscripts}/test-gpu.py | 0 .../test-segmentation.py | 0 .../test/{ => old_testscripts}/test-udp.py | 0 .../test/{ => old_testscripts}/test.py | 0 integrations/pytorch_ddp/test/test-generic.py | 9 +- .../test/torchvision/PennFudanDataset.py | 74 +++ .../pytorch_ddp/test/torchvision/coco_eval.py | 192 ++++++ .../test/torchvision/coco_eval.py.1 | 192 ++++++ .../test/torchvision/coco_utils.py | 234 +++++++ .../test/torchvision/coco_utils.py.1 | 234 +++++++ .../pytorch_ddp/test/torchvision/engine.py | 115 ++++ .../pytorch_ddp/test/torchvision/main.py | 100 +++ .../test/torchvision/transforms.py | 601 ++++++++++++++++++ .../test/torchvision/transforms.py.1 | 601 ++++++++++++++++++ .../pytorch_ddp/test/torchvision/utils.py | 282 ++++++++ .../pytorch_ddp/test/torchvision/utils.py.1 | 282 ++++++++ 19 files changed, 2912 insertions(+), 4 deletions(-) rename integrations/pytorch_ddp/test/{ => old_testscripts}/test-compression.py (100%) rename integrations/pytorch_ddp/test/{ => old_testscripts}/test-coyote.py (100%) rename integrations/pytorch_ddp/test/{ => old_testscripts}/test-gpu-p2p.py (100%) rename integrations/pytorch_ddp/test/{ => old_testscripts}/test-gpu.py (100%) rename integrations/pytorch_ddp/test/{ => old_testscripts}/test-segmentation.py (100%) rename integrations/pytorch_ddp/test/{ => old_testscripts}/test-udp.py (100%) rename integrations/pytorch_ddp/test/{ => old_testscripts}/test.py (100%) create mode 100644 integrations/pytorch_ddp/test/torchvision/PennFudanDataset.py create mode 100644 integrations/pytorch_ddp/test/torchvision/coco_eval.py create mode 100644 integrations/pytorch_ddp/test/torchvision/coco_eval.py.1 create mode 100644 integrations/pytorch_ddp/test/torchvision/coco_utils.py create mode 100644 integrations/pytorch_ddp/test/torchvision/coco_utils.py.1 create mode 100644 integrations/pytorch_ddp/test/torchvision/engine.py create mode 100644 
integrations/pytorch_ddp/test/torchvision/main.py create mode 100644 integrations/pytorch_ddp/test/torchvision/transforms.py create mode 100644 integrations/pytorch_ddp/test/torchvision/transforms.py.1 create mode 100644 integrations/pytorch_ddp/test/torchvision/utils.py create mode 100644 integrations/pytorch_ddp/test/torchvision/utils.py.1 diff --git a/integrations/pytorch_ddp/test/test-compression.py b/integrations/pytorch_ddp/test/old_testscripts/test-compression.py similarity index 100% rename from integrations/pytorch_ddp/test/test-compression.py rename to integrations/pytorch_ddp/test/old_testscripts/test-compression.py diff --git a/integrations/pytorch_ddp/test/test-coyote.py b/integrations/pytorch_ddp/test/old_testscripts/test-coyote.py similarity index 100% rename from integrations/pytorch_ddp/test/test-coyote.py rename to integrations/pytorch_ddp/test/old_testscripts/test-coyote.py diff --git a/integrations/pytorch_ddp/test/test-gpu-p2p.py b/integrations/pytorch_ddp/test/old_testscripts/test-gpu-p2p.py similarity index 100% rename from integrations/pytorch_ddp/test/test-gpu-p2p.py rename to integrations/pytorch_ddp/test/old_testscripts/test-gpu-p2p.py diff --git a/integrations/pytorch_ddp/test/test-gpu.py b/integrations/pytorch_ddp/test/old_testscripts/test-gpu.py similarity index 100% rename from integrations/pytorch_ddp/test/test-gpu.py rename to integrations/pytorch_ddp/test/old_testscripts/test-gpu.py diff --git a/integrations/pytorch_ddp/test/test-segmentation.py b/integrations/pytorch_ddp/test/old_testscripts/test-segmentation.py similarity index 100% rename from integrations/pytorch_ddp/test/test-segmentation.py rename to integrations/pytorch_ddp/test/old_testscripts/test-segmentation.py diff --git a/integrations/pytorch_ddp/test/test-udp.py b/integrations/pytorch_ddp/test/old_testscripts/test-udp.py similarity index 100% rename from integrations/pytorch_ddp/test/test-udp.py rename to integrations/pytorch_ddp/test/old_testscripts/test-udp.py diff --git a/integrations/pytorch_ddp/test/test.py b/integrations/pytorch_ddp/test/old_testscripts/test.py similarity index 100% rename from integrations/pytorch_ddp/test/test.py rename to integrations/pytorch_ddp/test/old_testscripts/test.py diff --git a/integrations/pytorch_ddp/test/test-generic.py b/integrations/pytorch_ddp/test/test-generic.py index a666760f..448e7259 100644 --- a/integrations/pytorch_ddp/test/test-generic.py +++ b/integrations/pytorch_ddp/test/test-generic.py @@ -275,15 +275,16 @@ def start_test(comms: str, simulator: bool, host_file: str=None, fpga_file: str= # mpi.Barrier() # test_allreduce() # mpi.Barrier() - # demo_basic(rank) - # mpi.Barrier() - + demo_basic(rank) + mpi.Barrier() - # add destroy + print("Finished testing") + logger.debug('Finished testing') print(prof.key_averages(group_by_input_shape=True) .table(sort_by="cpu_time_total", row_limit=15)) + logger.debug('Destroying ACCL Process Group') dist.destroy_process_group() if __name__ == '__main__': diff --git a/integrations/pytorch_ddp/test/torchvision/PennFudanDataset.py b/integrations/pytorch_ddp/test/torchvision/PennFudanDataset.py new file mode 100644 index 00000000..949bc93e --- /dev/null +++ b/integrations/pytorch_ddp/test/torchvision/PennFudanDataset.py @@ -0,0 +1,74 @@ +import os +import torch + +from torchvision.io import read_image +from torchvision.ops.boxes import masks_to_boxes +from torchvision import tv_tensors +from torchvision.transforms.v2 import functional as F + + +class PennFudanDataset(torch.utils.data.Dataset): + def 
__init__(self, root, transforms): + self.root = root + self.transforms = transforms + # load all image files, sorting them to + # ensure that they are aligned + self.imgs = list(sorted(os.listdir(os.path.join(root, "PNGImages")))) + self.masks = list(sorted(os.listdir(os.path.join(root, "PedMasks")))) + + def __getitem__(self, idx): + # load images and masks + img_path = os.path.join(self.root, "PNGImages", self.imgs[idx]) + mask_path = os.path.join(self.root, "PedMasks", self.masks[idx]) + img = read_image(img_path) + mask = read_image(mask_path) + # instances are encoded as different colors + obj_ids = torch.unique(mask) + # first id is the background, so remove it + obj_ids = obj_ids[1:] + num_objs = len(obj_ids) + + # split the color-encoded mask into a set + # of binary masks + masks = (mask == obj_ids[:, None, None]).to(dtype=torch.uint8) + + # get bounding box coordinates for each mask + boxes = masks_to_boxes(masks) + + # there is only one class + labels = torch.ones((num_objs,), dtype=torch.int64) + + image_id = idx + area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0]) + # suppose all instances are not crowd + iscrowd = torch.zeros((num_objs,), dtype=torch.int64) + + # Wrap sample and targets into torchvision tv_tensors: + img = tv_tensors.Image(img) + + target = {} + target["boxes"] = tv_tensors.BoundingBoxes(boxes, format="XYXY", canvas_size=F.get_size(img)) + target["masks"] = tv_tensors.Mask(masks) + target["labels"] = labels + target["image_id"] = image_id + target["area"] = area + target["iscrowd"] = iscrowd + + if self.transforms is not None: + img, target = self.transforms(img, target) + + return img, target + + def __len__(self): + return len(self.imgs) + + + +def get_transform(train): + transforms = [] + if train: + transforms.append(T.RandomHorizontalFlip(0.5)) + transforms.append(T.ToDtype(torch.float, scale=True)) + transforms.append(T.ToPureTensor()) + return T.Compose(transforms) + diff --git a/integrations/pytorch_ddp/test/torchvision/coco_eval.py b/integrations/pytorch_ddp/test/torchvision/coco_eval.py new file mode 100644 index 00000000..ba1359f8 --- /dev/null +++ b/integrations/pytorch_ddp/test/torchvision/coco_eval.py @@ -0,0 +1,192 @@ +import copy +import io +from contextlib import redirect_stdout + +import numpy as np +import pycocotools.mask as mask_util +import torch +import utils +from pycocotools.coco import COCO +from pycocotools.cocoeval import COCOeval + + +class CocoEvaluator: + def __init__(self, coco_gt, iou_types): + if not isinstance(iou_types, (list, tuple)): + raise TypeError(f"This constructor expects iou_types of type list or tuple, instead got {type(iou_types)}") + coco_gt = copy.deepcopy(coco_gt) + self.coco_gt = coco_gt + + self.iou_types = iou_types + self.coco_eval = {} + for iou_type in iou_types: + self.coco_eval[iou_type] = COCOeval(coco_gt, iouType=iou_type) + + self.img_ids = [] + self.eval_imgs = {k: [] for k in iou_types} + + def update(self, predictions): + img_ids = list(np.unique(list(predictions.keys()))) + self.img_ids.extend(img_ids) + + for iou_type in self.iou_types: + results = self.prepare(predictions, iou_type) + with redirect_stdout(io.StringIO()): + coco_dt = COCO.loadRes(self.coco_gt, results) if results else COCO() + coco_eval = self.coco_eval[iou_type] + + coco_eval.cocoDt = coco_dt + coco_eval.params.imgIds = list(img_ids) + img_ids, eval_imgs = evaluate(coco_eval) + + self.eval_imgs[iou_type].append(eval_imgs) + + def synchronize_between_processes(self): + for iou_type in self.iou_types: + 
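        # Typical driving sequence for this evaluator (a sketch following
        # the torchvision reference scripts; `coco_gt` and `predictions`
        # are assumed from the caller):
        #
        #     evaluator = CocoEvaluator(coco_gt, ["bbox", "segm"])
        #     evaluator.update(predictions)   # per batch: {image_id: output}
        #     evaluator.synchronize_between_processes()
        #     evaluator.accumulate()
        #     evaluator.summarize()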
self.eval_imgs[iou_type] = np.concatenate(self.eval_imgs[iou_type], 2) + create_common_coco_eval(self.coco_eval[iou_type], self.img_ids, self.eval_imgs[iou_type]) + + def accumulate(self): + for coco_eval in self.coco_eval.values(): + coco_eval.accumulate() + + def summarize(self): + for iou_type, coco_eval in self.coco_eval.items(): + print(f"IoU metric: {iou_type}") + coco_eval.summarize() + + def prepare(self, predictions, iou_type): + if iou_type == "bbox": + return self.prepare_for_coco_detection(predictions) + if iou_type == "segm": + return self.prepare_for_coco_segmentation(predictions) + if iou_type == "keypoints": + return self.prepare_for_coco_keypoint(predictions) + raise ValueError(f"Unknown iou type {iou_type}") + + def prepare_for_coco_detection(self, predictions): + coco_results = [] + for original_id, prediction in predictions.items(): + if len(prediction) == 0: + continue + + boxes = prediction["boxes"] + boxes = convert_to_xywh(boxes).tolist() + scores = prediction["scores"].tolist() + labels = prediction["labels"].tolist() + + coco_results.extend( + [ + { + "image_id": original_id, + "category_id": labels[k], + "bbox": box, + "score": scores[k], + } + for k, box in enumerate(boxes) + ] + ) + return coco_results + + def prepare_for_coco_segmentation(self, predictions): + coco_results = [] + for original_id, prediction in predictions.items(): + if len(prediction) == 0: + continue + + scores = prediction["scores"] + labels = prediction["labels"] + masks = prediction["masks"] + + masks = masks > 0.5 + + scores = prediction["scores"].tolist() + labels = prediction["labels"].tolist() + + rles = [ + mask_util.encode(np.array(mask[0, :, :, np.newaxis], dtype=np.uint8, order="F"))[0] for mask in masks + ] + for rle in rles: + rle["counts"] = rle["counts"].decode("utf-8") + + coco_results.extend( + [ + { + "image_id": original_id, + "category_id": labels[k], + "segmentation": rle, + "score": scores[k], + } + for k, rle in enumerate(rles) + ] + ) + return coco_results + + def prepare_for_coco_keypoint(self, predictions): + coco_results = [] + for original_id, prediction in predictions.items(): + if len(prediction) == 0: + continue + + boxes = prediction["boxes"] + boxes = convert_to_xywh(boxes).tolist() + scores = prediction["scores"].tolist() + labels = prediction["labels"].tolist() + keypoints = prediction["keypoints"] + keypoints = keypoints.flatten(start_dim=1).tolist() + + coco_results.extend( + [ + { + "image_id": original_id, + "category_id": labels[k], + "keypoints": keypoint, + "score": scores[k], + } + for k, keypoint in enumerate(keypoints) + ] + ) + return coco_results + + +def convert_to_xywh(boxes): + xmin, ymin, xmax, ymax = boxes.unbind(1) + return torch.stack((xmin, ymin, xmax - xmin, ymax - ymin), dim=1) + + +def merge(img_ids, eval_imgs): + all_img_ids = utils.all_gather(img_ids) + all_eval_imgs = utils.all_gather(eval_imgs) + + merged_img_ids = [] + for p in all_img_ids: + merged_img_ids.extend(p) + + merged_eval_imgs = [] + for p in all_eval_imgs: + merged_eval_imgs.append(p) + + merged_img_ids = np.array(merged_img_ids) + merged_eval_imgs = np.concatenate(merged_eval_imgs, 2) + + # keep only unique (and in sorted order) images + merged_img_ids, idx = np.unique(merged_img_ids, return_index=True) + merged_eval_imgs = merged_eval_imgs[..., idx] + + return merged_img_ids, merged_eval_imgs + + +def create_common_coco_eval(coco_eval, img_ids, eval_imgs): + img_ids, eval_imgs = merge(img_ids, eval_imgs) + img_ids = list(img_ids) + eval_imgs = 
list(eval_imgs.flatten()) + + coco_eval.evalImgs = eval_imgs + coco_eval.params.imgIds = img_ids + coco_eval._paramsEval = copy.deepcopy(coco_eval.params) + + +def evaluate(imgs): + with redirect_stdout(io.StringIO()): + imgs.evaluate() + return imgs.params.imgIds, np.asarray(imgs.evalImgs).reshape(-1, len(imgs.params.areaRng), len(imgs.params.imgIds)) diff --git a/integrations/pytorch_ddp/test/torchvision/coco_eval.py.1 b/integrations/pytorch_ddp/test/torchvision/coco_eval.py.1 new file mode 100644 index 00000000..ba1359f8 --- /dev/null +++ b/integrations/pytorch_ddp/test/torchvision/coco_eval.py.1 @@ -0,0 +1,192 @@ +import copy +import io +from contextlib import redirect_stdout + +import numpy as np +import pycocotools.mask as mask_util +import torch +import utils +from pycocotools.coco import COCO +from pycocotools.cocoeval import COCOeval + + +class CocoEvaluator: + def __init__(self, coco_gt, iou_types): + if not isinstance(iou_types, (list, tuple)): + raise TypeError(f"This constructor expects iou_types of type list or tuple, instead got {type(iou_types)}") + coco_gt = copy.deepcopy(coco_gt) + self.coco_gt = coco_gt + + self.iou_types = iou_types + self.coco_eval = {} + for iou_type in iou_types: + self.coco_eval[iou_type] = COCOeval(coco_gt, iouType=iou_type) + + self.img_ids = [] + self.eval_imgs = {k: [] for k in iou_types} + + def update(self, predictions): + img_ids = list(np.unique(list(predictions.keys()))) + self.img_ids.extend(img_ids) + + for iou_type in self.iou_types: + results = self.prepare(predictions, iou_type) + with redirect_stdout(io.StringIO()): + coco_dt = COCO.loadRes(self.coco_gt, results) if results else COCO() + coco_eval = self.coco_eval[iou_type] + + coco_eval.cocoDt = coco_dt + coco_eval.params.imgIds = list(img_ids) + img_ids, eval_imgs = evaluate(coco_eval) + + self.eval_imgs[iou_type].append(eval_imgs) + + def synchronize_between_processes(self): + for iou_type in self.iou_types: + self.eval_imgs[iou_type] = np.concatenate(self.eval_imgs[iou_type], 2) + create_common_coco_eval(self.coco_eval[iou_type], self.img_ids, self.eval_imgs[iou_type]) + + def accumulate(self): + for coco_eval in self.coco_eval.values(): + coco_eval.accumulate() + + def summarize(self): + for iou_type, coco_eval in self.coco_eval.items(): + print(f"IoU metric: {iou_type}") + coco_eval.summarize() + + def prepare(self, predictions, iou_type): + if iou_type == "bbox": + return self.prepare_for_coco_detection(predictions) + if iou_type == "segm": + return self.prepare_for_coco_segmentation(predictions) + if iou_type == "keypoints": + return self.prepare_for_coco_keypoint(predictions) + raise ValueError(f"Unknown iou type {iou_type}") + + def prepare_for_coco_detection(self, predictions): + coco_results = [] + for original_id, prediction in predictions.items(): + if len(prediction) == 0: + continue + + boxes = prediction["boxes"] + boxes = convert_to_xywh(boxes).tolist() + scores = prediction["scores"].tolist() + labels = prediction["labels"].tolist() + + coco_results.extend( + [ + { + "image_id": original_id, + "category_id": labels[k], + "bbox": box, + "score": scores[k], + } + for k, box in enumerate(boxes) + ] + ) + return coco_results + + def prepare_for_coco_segmentation(self, predictions): + coco_results = [] + for original_id, prediction in predictions.items(): + if len(prediction) == 0: + continue + + scores = prediction["scores"] + labels = prediction["labels"] + masks = prediction["masks"] + + masks = masks > 0.5 + + scores = prediction["scores"].tolist() + labels = 
prediction["labels"].tolist() + + rles = [ + mask_util.encode(np.array(mask[0, :, :, np.newaxis], dtype=np.uint8, order="F"))[0] for mask in masks + ] + for rle in rles: + rle["counts"] = rle["counts"].decode("utf-8") + + coco_results.extend( + [ + { + "image_id": original_id, + "category_id": labels[k], + "segmentation": rle, + "score": scores[k], + } + for k, rle in enumerate(rles) + ] + ) + return coco_results + + def prepare_for_coco_keypoint(self, predictions): + coco_results = [] + for original_id, prediction in predictions.items(): + if len(prediction) == 0: + continue + + boxes = prediction["boxes"] + boxes = convert_to_xywh(boxes).tolist() + scores = prediction["scores"].tolist() + labels = prediction["labels"].tolist() + keypoints = prediction["keypoints"] + keypoints = keypoints.flatten(start_dim=1).tolist() + + coco_results.extend( + [ + { + "image_id": original_id, + "category_id": labels[k], + "keypoints": keypoint, + "score": scores[k], + } + for k, keypoint in enumerate(keypoints) + ] + ) + return coco_results + + +def convert_to_xywh(boxes): + xmin, ymin, xmax, ymax = boxes.unbind(1) + return torch.stack((xmin, ymin, xmax - xmin, ymax - ymin), dim=1) + + +def merge(img_ids, eval_imgs): + all_img_ids = utils.all_gather(img_ids) + all_eval_imgs = utils.all_gather(eval_imgs) + + merged_img_ids = [] + for p in all_img_ids: + merged_img_ids.extend(p) + + merged_eval_imgs = [] + for p in all_eval_imgs: + merged_eval_imgs.append(p) + + merged_img_ids = np.array(merged_img_ids) + merged_eval_imgs = np.concatenate(merged_eval_imgs, 2) + + # keep only unique (and in sorted order) images + merged_img_ids, idx = np.unique(merged_img_ids, return_index=True) + merged_eval_imgs = merged_eval_imgs[..., idx] + + return merged_img_ids, merged_eval_imgs + + +def create_common_coco_eval(coco_eval, img_ids, eval_imgs): + img_ids, eval_imgs = merge(img_ids, eval_imgs) + img_ids = list(img_ids) + eval_imgs = list(eval_imgs.flatten()) + + coco_eval.evalImgs = eval_imgs + coco_eval.params.imgIds = img_ids + coco_eval._paramsEval = copy.deepcopy(coco_eval.params) + + +def evaluate(imgs): + with redirect_stdout(io.StringIO()): + imgs.evaluate() + return imgs.params.imgIds, np.asarray(imgs.evalImgs).reshape(-1, len(imgs.params.areaRng), len(imgs.params.imgIds)) diff --git a/integrations/pytorch_ddp/test/torchvision/coco_utils.py b/integrations/pytorch_ddp/test/torchvision/coco_utils.py new file mode 100644 index 00000000..f40dcdff --- /dev/null +++ b/integrations/pytorch_ddp/test/torchvision/coco_utils.py @@ -0,0 +1,234 @@ +import os + +import torch +import torch.utils.data +import torchvision +import transforms as T +from pycocotools import mask as coco_mask +from pycocotools.coco import COCO + + +def convert_coco_poly_to_mask(segmentations, height, width): + masks = [] + for polygons in segmentations: + rles = coco_mask.frPyObjects(polygons, height, width) + mask = coco_mask.decode(rles) + if len(mask.shape) < 3: + mask = mask[..., None] + mask = torch.as_tensor(mask, dtype=torch.uint8) + mask = mask.any(dim=2) + masks.append(mask) + if masks: + masks = torch.stack(masks, dim=0) + else: + masks = torch.zeros((0, height, width), dtype=torch.uint8) + return masks + + +class ConvertCocoPolysToMask: + def __call__(self, image, target): + w, h = image.size + + image_id = target["image_id"] + + anno = target["annotations"] + + anno = [obj for obj in anno if obj["iscrowd"] == 0] + + boxes = [obj["bbox"] for obj in anno] + # guard against no boxes via resizing + boxes = torch.as_tensor(boxes, 
dtype=torch.float32).reshape(-1, 4) + boxes[:, 2:] += boxes[:, :2] + boxes[:, 0::2].clamp_(min=0, max=w) + boxes[:, 1::2].clamp_(min=0, max=h) + + classes = [obj["category_id"] for obj in anno] + classes = torch.tensor(classes, dtype=torch.int64) + + segmentations = [obj["segmentation"] for obj in anno] + masks = convert_coco_poly_to_mask(segmentations, h, w) + + keypoints = None + if anno and "keypoints" in anno[0]: + keypoints = [obj["keypoints"] for obj in anno] + keypoints = torch.as_tensor(keypoints, dtype=torch.float32) + num_keypoints = keypoints.shape[0] + if num_keypoints: + keypoints = keypoints.view(num_keypoints, -1, 3) + + keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0]) + boxes = boxes[keep] + classes = classes[keep] + masks = masks[keep] + if keypoints is not None: + keypoints = keypoints[keep] + + target = {} + target["boxes"] = boxes + target["labels"] = classes + target["masks"] = masks + target["image_id"] = image_id + if keypoints is not None: + target["keypoints"] = keypoints + + # for conversion to coco api + area = torch.tensor([obj["area"] for obj in anno]) + iscrowd = torch.tensor([obj["iscrowd"] for obj in anno]) + target["area"] = area + target["iscrowd"] = iscrowd + + return image, target + + +def _coco_remove_images_without_annotations(dataset, cat_list=None): + def _has_only_empty_bbox(anno): + return all(any(o <= 1 for o in obj["bbox"][2:]) for obj in anno) + + def _count_visible_keypoints(anno): + return sum(sum(1 for v in ann["keypoints"][2::3] if v > 0) for ann in anno) + + min_keypoints_per_image = 10 + + def _has_valid_annotation(anno): + # if it's empty, there is no annotation + if len(anno) == 0: + return False + # if all boxes have close to zero area, there is no annotation + if _has_only_empty_bbox(anno): + return False + # keypoints task have a slight different criteria for considering + # if an annotation is valid + if "keypoints" not in anno[0]: + return True + # for keypoint detection tasks, only consider valid images those + # containing at least min_keypoints_per_image + if _count_visible_keypoints(anno) >= min_keypoints_per_image: + return True + return False + + ids = [] + for ds_idx, img_id in enumerate(dataset.ids): + ann_ids = dataset.coco.getAnnIds(imgIds=img_id, iscrowd=None) + anno = dataset.coco.loadAnns(ann_ids) + if cat_list: + anno = [obj for obj in anno if obj["category_id"] in cat_list] + if _has_valid_annotation(anno): + ids.append(ds_idx) + + dataset = torch.utils.data.Subset(dataset, ids) + return dataset + + +def convert_to_coco_api(ds): + coco_ds = COCO() + # annotation IDs need to start at 1, not 0, see torchvision issue #1530 + ann_id = 1 + dataset = {"images": [], "categories": [], "annotations": []} + categories = set() + for img_idx in range(len(ds)): + # find better way to get target + # targets = ds.get_annotations(img_idx) + img, targets = ds[img_idx] + image_id = targets["image_id"] + img_dict = {} + img_dict["id"] = image_id + img_dict["height"] = img.shape[-2] + img_dict["width"] = img.shape[-1] + dataset["images"].append(img_dict) + bboxes = targets["boxes"].clone() + bboxes[:, 2:] -= bboxes[:, :2] + bboxes = bboxes.tolist() + labels = targets["labels"].tolist() + areas = targets["area"].tolist() + iscrowd = targets["iscrowd"].tolist() + if "masks" in targets: + masks = targets["masks"] + # make masks Fortran contiguous for coco_mask + masks = masks.permute(0, 2, 1).contiguous().permute(0, 2, 1) + if "keypoints" in targets: + keypoints = targets["keypoints"] + keypoints = 
keypoints.reshape(keypoints.shape[0], -1).tolist() + num_objs = len(bboxes) + for i in range(num_objs): + ann = {} + ann["image_id"] = image_id + ann["bbox"] = bboxes[i] + ann["category_id"] = labels[i] + categories.add(labels[i]) + ann["area"] = areas[i] + ann["iscrowd"] = iscrowd[i] + ann["id"] = ann_id + if "masks" in targets: + ann["segmentation"] = coco_mask.encode(masks[i].numpy()) + if "keypoints" in targets: + ann["keypoints"] = keypoints[i] + ann["num_keypoints"] = sum(k != 0 for k in keypoints[i][2::3]) + dataset["annotations"].append(ann) + ann_id += 1 + dataset["categories"] = [{"id": i} for i in sorted(categories)] + coco_ds.dataset = dataset + coco_ds.createIndex() + return coco_ds + + +def get_coco_api_from_dataset(dataset): + # FIXME: This is... awful? + for _ in range(10): + if isinstance(dataset, torchvision.datasets.CocoDetection): + break + if isinstance(dataset, torch.utils.data.Subset): + dataset = dataset.dataset + if isinstance(dataset, torchvision.datasets.CocoDetection): + return dataset.coco + return convert_to_coco_api(dataset) + + +class CocoDetection(torchvision.datasets.CocoDetection): + def __init__(self, img_folder, ann_file, transforms): + super().__init__(img_folder, ann_file) + self._transforms = transforms + + def __getitem__(self, idx): + img, target = super().__getitem__(idx) + image_id = self.ids[idx] + target = dict(image_id=image_id, annotations=target) + if self._transforms is not None: + img, target = self._transforms(img, target) + return img, target + + +def get_coco(root, image_set, transforms, mode="instances", use_v2=False, with_masks=False): + anno_file_template = "{}_{}2017.json" + PATHS = { + "train": ("train2017", os.path.join("annotations", anno_file_template.format(mode, "train"))), + "val": ("val2017", os.path.join("annotations", anno_file_template.format(mode, "val"))), + # "train": ("val2017", os.path.join("annotations", anno_file_template.format(mode, "val"))) + } + + img_folder, ann_file = PATHS[image_set] + img_folder = os.path.join(root, img_folder) + ann_file = os.path.join(root, ann_file) + + if use_v2: + from torchvision.datasets import wrap_dataset_for_transforms_v2 + + dataset = torchvision.datasets.CocoDetection(img_folder, ann_file, transforms=transforms) + target_keys = ["boxes", "labels", "image_id"] + if with_masks: + target_keys += ["masks"] + dataset = wrap_dataset_for_transforms_v2(dataset, target_keys=target_keys) + else: + # TODO: handle with_masks for V1? 
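+        # Note: ConvertCocoPolysToMask below always fills target["masks"], so in
+        # this v1 branch masks are produced regardless of with_masks; honoring
+        # the flag here would mean dropping the "masks" key from the target.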
+        t = [ConvertCocoPolysToMask()]
+        if transforms is not None:
+            t.append(transforms)
+        transforms = T.Compose(t)
+
+        dataset = CocoDetection(img_folder, ann_file, transforms=transforms)
+
+    if image_set == "train":
+        dataset = _coco_remove_images_without_annotations(dataset)
+
+    # dataset = torch.utils.data.Subset(dataset, [i for i in range(500)])
+
+    return dataset
diff --git a/integrations/pytorch_ddp/test/torchvision/engine.py b/integrations/pytorch_ddp/test/torchvision/engine.py
new file mode 100644
index 00000000..0e9bfffd
--- /dev/null
+++ b/integrations/pytorch_ddp/test/torchvision/engine.py
@@ -0,0 +1,115 @@
+import math
+import sys
+import time
+
+import torch
+import torchvision.models.detection.mask_rcnn
+import utils
+from coco_eval import CocoEvaluator
+from coco_utils import get_coco_api_from_dataset
+
+
+def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq, scaler=None):
+    model.train()
+    metric_logger = utils.MetricLogger(delimiter="  ")
+    metric_logger.add_meter("lr", utils.SmoothedValue(window_size=1, fmt="{value:.6f}"))
+    header = f"Epoch: [{epoch}]"
+
+    lr_scheduler = None
+    if epoch == 0:
+        warmup_factor = 1.0 / 1000
+        warmup_iters = min(1000, len(data_loader) - 1)
+
+        lr_scheduler = torch.optim.lr_scheduler.LinearLR(
+            optimizer, start_factor=warmup_factor, total_iters=warmup_iters
+        )
+
+    for images, targets in metric_logger.log_every(data_loader, print_freq, header):
+        images = list(image.to(device) for image in images)
+        targets = [{k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in t.items()} for t in targets]
+        with torch.cuda.amp.autocast(enabled=scaler is not None):
+            loss_dict = model(images, targets)
+            losses = sum(loss for loss in loss_dict.values())
+
+        # reduce losses over all GPUs for logging purposes
+        loss_dict_reduced = utils.reduce_dict(loss_dict)
+        
losses_reduced = sum(loss for loss in loss_dict_reduced.values()) + + loss_value = losses_reduced.item() + + if not math.isfinite(loss_value): + print(f"Loss is {loss_value}, stopping training") + print(loss_dict_reduced) + sys.exit(1) + + optimizer.zero_grad() + if scaler is not None: + scaler.scale(losses).backward() + scaler.step(optimizer) + scaler.update() + else: + losses.backward() + optimizer.step() + + if lr_scheduler is not None: + lr_scheduler.step() + + metric_logger.update(loss=losses_reduced, **loss_dict_reduced) + metric_logger.update(lr=optimizer.param_groups[0]["lr"]) + + return metric_logger + + +def _get_iou_types(model): + model_without_ddp = model + if isinstance(model, torch.nn.parallel.DistributedDataParallel): + model_without_ddp = model.module + iou_types = ["bbox"] + if isinstance(model_without_ddp, torchvision.models.detection.MaskRCNN): + iou_types.append("segm") + if isinstance(model_without_ddp, torchvision.models.detection.KeypointRCNN): + iou_types.append("keypoints") + return iou_types + + +@torch.inference_mode() +def evaluate(model, data_loader, device): + n_threads = torch.get_num_threads() + # FIXME remove this and make paste_masks_in_image run on the GPU + torch.set_num_threads(1) + cpu_device = torch.device("cpu") + model.eval() + metric_logger = utils.MetricLogger(delimiter=" ") + header = "Test:" + + coco = get_coco_api_from_dataset(data_loader.dataset) + iou_types = _get_iou_types(model) + coco_evaluator = CocoEvaluator(coco, iou_types) + + for images, targets in metric_logger.log_every(data_loader, 100, header): + images = list(img.to(device) for img in images) + + if torch.cuda.is_available(): + torch.cuda.synchronize() + model_time = time.time() + outputs = model(images) + + outputs = [{k: v.to(cpu_device) for k, v in t.items()} for t in outputs] + model_time = time.time() - model_time + + res = {target["image_id"]: output for target, output in zip(targets, outputs)} + evaluator_time = time.time() + coco_evaluator.update(res) + evaluator_time = time.time() - evaluator_time + metric_logger.update(model_time=model_time, evaluator_time=evaluator_time) + + # gather the stats from all processes + metric_logger.synchronize_between_processes() + print("Averaged stats:", metric_logger) + coco_evaluator.synchronize_between_processes() + + # accumulate predictions from all images + coco_evaluator.accumulate() + coco_evaluator.summarize() + torch.set_num_threads(n_threads) + return coco_evaluator diff --git a/integrations/pytorch_ddp/test/torchvision/main.py b/integrations/pytorch_ddp/test/torchvision/main.py new file mode 100644 index 00000000..d6296500 --- /dev/null +++ b/integrations/pytorch_ddp/test/torchvision/main.py @@ -0,0 +1,100 @@ +import torchvision +from PennFudanDataset import PennFudanDataset, get_transform +import torch +from torchvision.models.detection.faster_rcnn import FastRCNNPredictor +from torchvision.models.detection.mask_rcnn import MaskRCNNPredictor +from engine import train_one_epoch, evaluate +import utils +from torchvision.transforms import v2 as T + + +def get_model_instance_segmentation(num_classes): + # load an instance segmentation model pre-trained on COCO + model = torchvision.models.detection.maskrcnn_resnet50_fpn(weights="DEFAULT") + + # get number of input features for the classifier + in_features = model.roi_heads.box_predictor.cls_score.in_features + # replace the pre-trained head with a new one + model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes) + + # now get the number of input features for 
the mask classifier + in_features_mask = model.roi_heads.mask_predictor.conv5_mask.in_channels + hidden_layer = 256 + # and replace the mask predictor with a new one + model.roi_heads.mask_predictor = MaskRCNNPredictor( + in_features_mask, + hidden_layer, + num_classes + ) + + return model + + + +# train on the GPU or on the CPU, if a GPU is not available +device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + + +# our dataset has two classes only - background and person +num_classes = 2 +# use our dataset and defined transformations +dataset = PennFudanDataset('data/PennFudanPed', get_transform(train=True)) +dataset_test = PennFudanDataset('data/PennFudanPed', get_transform(train=False)) + +# split the dataset in train and test set +indices = torch.randperm(len(dataset)).tolist() +dataset = torch.utils.data.Subset(dataset, indices[:-50]) +dataset_test = torch.utils.data.Subset(dataset_test, indices[-50:]) + + +# define training and validation data loaders +data_loader = torch.utils.data.DataLoader( + dataset, + batch_size=2, + shuffle=True, + num_workers=4, + collate_fn=utils.collate_fn +) + +data_loader_test = torch.utils.data.DataLoader( + dataset_test, + batch_size=1, + shuffle=False, + num_workers=4, + collate_fn=utils.collate_fn +) + +# get the model using our helper function +model = get_model_instance_segmentation(num_classes) + +# move model to the right device +model.to(device) + +# construct an optimizer +params = [p for p in model.parameters() if p.requires_grad] +optimizer = torch.optim.SGD( + params, + lr=0.005, + momentum=0.9, + weight_decay=0.0005 +) + +# and a learning rate scheduler +lr_scheduler = torch.optim.lr_scheduler.StepLR( + optimizer, + step_size=3, + gamma=0.1 +) + +# let's train it just for 2 epochs +num_epochs = 2 + +for epoch in range(num_epochs): + # train for one epoch, printing every 10 iterations + train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq=10) + # update the learning rate + lr_scheduler.step() + # evaluate on the test dataset + evaluate(model, data_loader_test, device=device) + +print("That's it!") diff --git a/integrations/pytorch_ddp/test/torchvision/transforms.py b/integrations/pytorch_ddp/test/torchvision/transforms.py new file mode 100644 index 00000000..e07ccfc9 --- /dev/null +++ b/integrations/pytorch_ddp/test/torchvision/transforms.py @@ -0,0 +1,601 @@ +from typing import Dict, List, Optional, Tuple, Union + +import torch +import torchvision +from torch import nn, Tensor +from torchvision import ops +from torchvision.transforms import functional as F, InterpolationMode, transforms as T + + +def _flip_coco_person_keypoints(kps, width): + flip_inds = [0, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 12, 11, 14, 13, 16, 15] + flipped_data = kps[:, flip_inds] + flipped_data[..., 0] = width - flipped_data[..., 0] + # Maintain COCO convention that if visibility == 0, then x, y = 0 + inds = flipped_data[..., 2] == 0 + flipped_data[inds] = 0 + return flipped_data + + +class Compose: + def __init__(self, transforms): + self.transforms = transforms + + def __call__(self, image, target): + for t in self.transforms: + image, target = t(image, target) + return image, target + + +class RandomHorizontalFlip(T.RandomHorizontalFlip): + def forward( + self, image: Tensor, target: Optional[Dict[str, Tensor]] = None + ) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]: + if torch.rand(1) < self.p: + image = F.hflip(image) + if target is not None: + _, _, width = F.get_dimensions(image) + target["boxes"][:, [0, 2]] = width - 
target["boxes"][:, [2, 0]] + if "masks" in target: + target["masks"] = target["masks"].flip(-1) + if "keypoints" in target: + keypoints = target["keypoints"] + keypoints = _flip_coco_person_keypoints(keypoints, width) + target["keypoints"] = keypoints + return image, target + + +class PILToTensor(nn.Module): + def forward( + self, image: Tensor, target: Optional[Dict[str, Tensor]] = None + ) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]: + image = F.pil_to_tensor(image) + return image, target + + +class ToDtype(nn.Module): + def __init__(self, dtype: torch.dtype, scale: bool = False) -> None: + super().__init__() + self.dtype = dtype + self.scale = scale + + def forward( + self, image: Tensor, target: Optional[Dict[str, Tensor]] = None + ) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]: + if not self.scale: + return image.to(dtype=self.dtype), target + image = F.convert_image_dtype(image, self.dtype) + return image, target + + +class RandomIoUCrop(nn.Module): + def __init__( + self, + min_scale: float = 0.3, + max_scale: float = 1.0, + min_aspect_ratio: float = 0.5, + max_aspect_ratio: float = 2.0, + sampler_options: Optional[List[float]] = None, + trials: int = 40, + ): + super().__init__() + # Configuration similar to https://github.com/weiliu89/caffe/blob/ssd/examples/ssd/ssd_coco.py#L89-L174 + self.min_scale = min_scale + self.max_scale = max_scale + self.min_aspect_ratio = min_aspect_ratio + self.max_aspect_ratio = max_aspect_ratio + if sampler_options is None: + sampler_options = [0.0, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0] + self.options = sampler_options + self.trials = trials + + def forward( + self, image: Tensor, target: Optional[Dict[str, Tensor]] = None + ) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]: + if target is None: + raise ValueError("The targets can't be None for this transform.") + + if isinstance(image, torch.Tensor): + if image.ndimension() not in {2, 3}: + raise ValueError(f"image should be 2/3 dimensional. 
Got {image.ndimension()} dimensions.") + elif image.ndimension() == 2: + image = image.unsqueeze(0) + + _, orig_h, orig_w = F.get_dimensions(image) + + while True: + # sample an option + idx = int(torch.randint(low=0, high=len(self.options), size=(1,))) + min_jaccard_overlap = self.options[idx] + if min_jaccard_overlap >= 1.0: # a value larger than 1 encodes the leave as-is option + return image, target + + for _ in range(self.trials): + # check the aspect ratio limitations + r = self.min_scale + (self.max_scale - self.min_scale) * torch.rand(2) + new_w = int(orig_w * r[0]) + new_h = int(orig_h * r[1]) + aspect_ratio = new_w / new_h + if not (self.min_aspect_ratio <= aspect_ratio <= self.max_aspect_ratio): + continue + + # check for 0 area crops + r = torch.rand(2) + left = int((orig_w - new_w) * r[0]) + top = int((orig_h - new_h) * r[1]) + right = left + new_w + bottom = top + new_h + if left == right or top == bottom: + continue + + # check for any valid boxes with centers within the crop area + cx = 0.5 * (target["boxes"][:, 0] + target["boxes"][:, 2]) + cy = 0.5 * (target["boxes"][:, 1] + target["boxes"][:, 3]) + is_within_crop_area = (left < cx) & (cx < right) & (top < cy) & (cy < bottom) + if not is_within_crop_area.any(): + continue + + # check at least 1 box with jaccard limitations + boxes = target["boxes"][is_within_crop_area] + ious = torchvision.ops.boxes.box_iou( + boxes, torch.tensor([[left, top, right, bottom]], dtype=boxes.dtype, device=boxes.device) + ) + if ious.max() < min_jaccard_overlap: + continue + + # keep only valid boxes and perform cropping + target["boxes"] = boxes + target["labels"] = target["labels"][is_within_crop_area] + target["boxes"][:, 0::2] -= left + target["boxes"][:, 1::2] -= top + target["boxes"][:, 0::2].clamp_(min=0, max=new_w) + target["boxes"][:, 1::2].clamp_(min=0, max=new_h) + image = F.crop(image, top, left, new_h, new_w) + + return image, target + + +class RandomZoomOut(nn.Module): + def __init__( + self, fill: Optional[List[float]] = None, side_range: Tuple[float, float] = (1.0, 4.0), p: float = 0.5 + ): + super().__init__() + if fill is None: + fill = [0.0, 0.0, 0.0] + self.fill = fill + self.side_range = side_range + if side_range[0] < 1.0 or side_range[0] > side_range[1]: + raise ValueError(f"Invalid canvas side range provided {side_range}.") + self.p = p + + @torch.jit.unused + def _get_fill_value(self, is_pil): + # type: (bool) -> int + # We fake the type to make it work on JIT + return tuple(int(x) for x in self.fill) if is_pil else 0 + + def forward( + self, image: Tensor, target: Optional[Dict[str, Tensor]] = None + ) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]: + if isinstance(image, torch.Tensor): + if image.ndimension() not in {2, 3}: + raise ValueError(f"image should be 2/3 dimensional. 
Got {image.ndimension()} dimensions.") + elif image.ndimension() == 2: + image = image.unsqueeze(0) + + if torch.rand(1) >= self.p: + return image, target + + _, orig_h, orig_w = F.get_dimensions(image) + + r = self.side_range[0] + torch.rand(1) * (self.side_range[1] - self.side_range[0]) + canvas_width = int(orig_w * r) + canvas_height = int(orig_h * r) + + r = torch.rand(2) + left = int((canvas_width - orig_w) * r[0]) + top = int((canvas_height - orig_h) * r[1]) + right = canvas_width - (left + orig_w) + bottom = canvas_height - (top + orig_h) + + if torch.jit.is_scripting(): + fill = 0 + else: + fill = self._get_fill_value(F._is_pil_image(image)) + + image = F.pad(image, [left, top, right, bottom], fill=fill) + if isinstance(image, torch.Tensor): + # PyTorch's pad supports only integers on fill. So we need to overwrite the colour + v = torch.tensor(self.fill, device=image.device, dtype=image.dtype).view(-1, 1, 1) + image[..., :top, :] = image[..., :, :left] = image[..., (top + orig_h) :, :] = image[ + ..., :, (left + orig_w) : + ] = v + + if target is not None: + target["boxes"][:, 0::2] += left + target["boxes"][:, 1::2] += top + + return image, target + + +class RandomPhotometricDistort(nn.Module): + def __init__( + self, + contrast: Tuple[float, float] = (0.5, 1.5), + saturation: Tuple[float, float] = (0.5, 1.5), + hue: Tuple[float, float] = (-0.05, 0.05), + brightness: Tuple[float, float] = (0.875, 1.125), + p: float = 0.5, + ): + super().__init__() + self._brightness = T.ColorJitter(brightness=brightness) + self._contrast = T.ColorJitter(contrast=contrast) + self._hue = T.ColorJitter(hue=hue) + self._saturation = T.ColorJitter(saturation=saturation) + self.p = p + + def forward( + self, image: Tensor, target: Optional[Dict[str, Tensor]] = None + ) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]: + if isinstance(image, torch.Tensor): + if image.ndimension() not in {2, 3}: + raise ValueError(f"image should be 2/3 dimensional. Got {image.ndimension()} dimensions.") + elif image.ndimension() == 2: + image = image.unsqueeze(0) + + r = torch.rand(7) + + if r[0] < self.p: + image = self._brightness(image) + + contrast_before = r[1] < 0.5 + if contrast_before: + if r[2] < self.p: + image = self._contrast(image) + + if r[3] < self.p: + image = self._saturation(image) + + if r[4] < self.p: + image = self._hue(image) + + if not contrast_before: + if r[5] < self.p: + image = self._contrast(image) + + if r[6] < self.p: + channels, _, _ = F.get_dimensions(image) + permutation = torch.randperm(channels) + + is_pil = F._is_pil_image(image) + if is_pil: + image = F.pil_to_tensor(image) + image = F.convert_image_dtype(image) + image = image[..., permutation, :, :] + if is_pil: + image = F.to_pil_image(image) + + return image, target + + +class ScaleJitter(nn.Module): + """Randomly resizes the image and its bounding boxes within the specified scale range. + The class implements the Scale Jitter augmentation as described in the paper + `"Simple Copy-Paste is a Strong Data Augmentation Method for Instance Segmentation" `_. + + Args: + target_size (tuple of ints): The target size for the transform provided in (height, weight) format. + scale_range (tuple of ints): scaling factor interval, e.g (a, b), then scale is randomly sampled from the + range a <= scale <= b. + interpolation (InterpolationMode): Desired interpolation enum defined by + :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``. 
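+        antialias (bool, optional): whether to apply antialiasing when resizing.
+            Default is ``True``.
+
+    Example (illustrative usage)::
+
+        jitter = ScaleJitter(target_size=(1024, 1024), scale_range=(0.1, 2.0))
+        image, target = jitter(image, target)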
+ """ + + def __init__( + self, + target_size: Tuple[int, int], + scale_range: Tuple[float, float] = (0.1, 2.0), + interpolation: InterpolationMode = InterpolationMode.BILINEAR, + antialias=True, + ): + super().__init__() + self.target_size = target_size + self.scale_range = scale_range + self.interpolation = interpolation + self.antialias = antialias + + def forward( + self, image: Tensor, target: Optional[Dict[str, Tensor]] = None + ) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]: + if isinstance(image, torch.Tensor): + if image.ndimension() not in {2, 3}: + raise ValueError(f"image should be 2/3 dimensional. Got {image.ndimension()} dimensions.") + elif image.ndimension() == 2: + image = image.unsqueeze(0) + + _, orig_height, orig_width = F.get_dimensions(image) + + scale = self.scale_range[0] + torch.rand(1) * (self.scale_range[1] - self.scale_range[0]) + r = min(self.target_size[1] / orig_height, self.target_size[0] / orig_width) * scale + new_width = int(orig_width * r) + new_height = int(orig_height * r) + + image = F.resize(image, [new_height, new_width], interpolation=self.interpolation, antialias=self.antialias) + + if target is not None: + target["boxes"][:, 0::2] *= new_width / orig_width + target["boxes"][:, 1::2] *= new_height / orig_height + if "masks" in target: + target["masks"] = F.resize( + target["masks"], + [new_height, new_width], + interpolation=InterpolationMode.NEAREST, + antialias=self.antialias, + ) + + return image, target + + +class FixedSizeCrop(nn.Module): + def __init__(self, size, fill=0, padding_mode="constant"): + super().__init__() + size = tuple(T._setup_size(size, error_msg="Please provide only two dimensions (h, w) for size.")) + self.crop_height = size[0] + self.crop_width = size[1] + self.fill = fill # TODO: Fill is currently respected only on PIL. Apply tensor patch. 
+ self.padding_mode = padding_mode + + def _pad(self, img, target, padding): + # Taken from the functional_tensor.py pad + if isinstance(padding, int): + pad_left = pad_right = pad_top = pad_bottom = padding + elif len(padding) == 1: + pad_left = pad_right = pad_top = pad_bottom = padding[0] + elif len(padding) == 2: + pad_left = pad_right = padding[0] + pad_top = pad_bottom = padding[1] + else: + pad_left = padding[0] + pad_top = padding[1] + pad_right = padding[2] + pad_bottom = padding[3] + + padding = [pad_left, pad_top, pad_right, pad_bottom] + img = F.pad(img, padding, self.fill, self.padding_mode) + if target is not None: + target["boxes"][:, 0::2] += pad_left + target["boxes"][:, 1::2] += pad_top + if "masks" in target: + target["masks"] = F.pad(target["masks"], padding, 0, "constant") + + return img, target + + def _crop(self, img, target, top, left, height, width): + img = F.crop(img, top, left, height, width) + if target is not None: + boxes = target["boxes"] + boxes[:, 0::2] -= left + boxes[:, 1::2] -= top + boxes[:, 0::2].clamp_(min=0, max=width) + boxes[:, 1::2].clamp_(min=0, max=height) + + is_valid = (boxes[:, 0] < boxes[:, 2]) & (boxes[:, 1] < boxes[:, 3]) + + target["boxes"] = boxes[is_valid] + target["labels"] = target["labels"][is_valid] + if "masks" in target: + target["masks"] = F.crop(target["masks"][is_valid], top, left, height, width) + + return img, target + + def forward(self, img, target=None): + _, height, width = F.get_dimensions(img) + new_height = min(height, self.crop_height) + new_width = min(width, self.crop_width) + + if new_height != height or new_width != width: + offset_height = max(height - self.crop_height, 0) + offset_width = max(width - self.crop_width, 0) + + r = torch.rand(1) + top = int(offset_height * r) + left = int(offset_width * r) + + img, target = self._crop(img, target, top, left, new_height, new_width) + + pad_bottom = max(self.crop_height - new_height, 0) + pad_right = max(self.crop_width - new_width, 0) + if pad_bottom != 0 or pad_right != 0: + img, target = self._pad(img, target, [0, 0, pad_right, pad_bottom]) + + return img, target + + +class RandomShortestSize(nn.Module): + def __init__( + self, + min_size: Union[List[int], Tuple[int], int], + max_size: int, + interpolation: InterpolationMode = InterpolationMode.BILINEAR, + ): + super().__init__() + self.min_size = [min_size] if isinstance(min_size, int) else list(min_size) + self.max_size = max_size + self.interpolation = interpolation + + def forward( + self, image: Tensor, target: Optional[Dict[str, Tensor]] = None + ) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]: + _, orig_height, orig_width = F.get_dimensions(image) + + min_size = self.min_size[torch.randint(len(self.min_size), (1,)).item()] + r = min(min_size / min(orig_height, orig_width), self.max_size / max(orig_height, orig_width)) + + new_width = int(orig_width * r) + new_height = int(orig_height * r) + + image = F.resize(image, [new_height, new_width], interpolation=self.interpolation) + + if target is not None: + target["boxes"][:, 0::2] *= new_width / orig_width + target["boxes"][:, 1::2] *= new_height / orig_height + if "masks" in target: + target["masks"] = F.resize( + target["masks"], [new_height, new_width], interpolation=InterpolationMode.NEAREST + ) + + return image, target + + +def _copy_paste( + image: torch.Tensor, + target: Dict[str, Tensor], + paste_image: torch.Tensor, + paste_target: Dict[str, Tensor], + blending: bool = True, + resize_interpolation: F.InterpolationMode = F.InterpolationMode.BILINEAR, +) 
-> Tuple[torch.Tensor, Dict[str, Tensor]]:
+
+    # Random paste targets selection:
+    num_masks = len(paste_target["masks"])
+
+    if num_masks < 1:
+        # Such a degenerate case with num_masks=0 can happen with LSJ
+        # Let's just return (image, target)
+        return image, target
+
+    # We have to please torch script by explicitly specifying dtype as torch.long
+    random_selection = torch.randint(0, num_masks, (num_masks,), device=paste_image.device)
+    random_selection = torch.unique(random_selection).to(torch.long)
+
+    paste_masks = paste_target["masks"][random_selection]
+    paste_boxes = paste_target["boxes"][random_selection]
+    paste_labels = paste_target["labels"][random_selection]
+
+    masks = target["masks"]
+
+    # We resize source and paste data if they have different sizes
+    # This is something we introduced here as originally the algorithm works
+    # on equal-sized data (for example, coming from LSJ data augmentations)
+    size1 = image.shape[-2:]
+    size2 = paste_image.shape[-2:]
+    if size1 != size2:
+        paste_image = F.resize(paste_image, size1, interpolation=resize_interpolation)
+        paste_masks = F.resize(paste_masks, size1, interpolation=F.InterpolationMode.NEAREST)
+        # resize bboxes:
+        ratios = torch.tensor((size1[1] / size2[1], size1[0] / size2[0]), device=paste_boxes.device)
+        paste_boxes = paste_boxes.view(-1, 2, 2).mul(ratios).view(paste_boxes.shape)
+
+    paste_alpha_mask = paste_masks.sum(dim=0) > 0
+
+    if blending:
+        paste_alpha_mask = F.gaussian_blur(
+            paste_alpha_mask.unsqueeze(0),
+            kernel_size=(5, 5),
+            sigma=[
+                2.0,
+            ],
+        )
+
+    # Copy-paste images:
+    image = (image * (~paste_alpha_mask)) + (paste_image * paste_alpha_mask)
+
+    # Copy-paste masks:
+    masks = masks * (~paste_alpha_mask)
+    non_all_zero_masks = masks.sum((-1, -2)) > 0
+    masks = masks[non_all_zero_masks]
+
+    # Do a shallow copy of the target dict
+    out_target = {k: v for k, v in target.items()}
+
+    out_target["masks"] = torch.cat([masks, paste_masks])
+
+    # Copy-paste boxes and labels
+    boxes = ops.masks_to_boxes(masks)
+    out_target["boxes"] = torch.cat([boxes, paste_boxes])
+
+    labels = target["labels"][non_all_zero_masks]
+    out_target["labels"] = torch.cat([labels, paste_labels])
+
+    # Update additional optional keys: area and iscrowd, if they exist
+    if "area" in target:
+        out_target["area"] = out_target["masks"].sum((-1, -2)).to(torch.float32)
+
+    if "iscrowd" in target and "iscrowd" in paste_target:
+        # target['iscrowd'] size can differ from mask size (non_all_zero_masks),
+        # for example if previous transforms geometrically modified
+        # masks/boxes/labels but did not update "iscrowd"
+        if len(target["iscrowd"]) == len(non_all_zero_masks):
+            iscrowd = target["iscrowd"][non_all_zero_masks]
+            paste_iscrowd = paste_target["iscrowd"][random_selection]
+            out_target["iscrowd"] = torch.cat([iscrowd, paste_iscrowd])
+
+    # Check for degenerate boxes and remove them
+    boxes = out_target["boxes"]
+    degenerate_boxes = boxes[:, 2:] <= boxes[:, :2]
+    if degenerate_boxes.any():
+        valid_targets = ~degenerate_boxes.any(dim=1)
+
+        out_target["boxes"] = boxes[valid_targets]
+        out_target["masks"] = out_target["masks"][valid_targets]
+        out_target["labels"] = out_target["labels"][valid_targets]
+
+        if "area" in out_target:
+            out_target["area"] = out_target["area"][valid_targets]
+        if "iscrowd" in out_target and len(out_target["iscrowd"]) == len(valid_targets):
+            out_target["iscrowd"] = out_target["iscrowd"][valid_targets]
+
+    return image, out_target
+
+
+class SimpleCopyPaste(torch.nn.Module):
+    def __init__(self, blending=True, resize_interpolation=F.InterpolationMode.BILINEAR):
+        super().__init__()
+        self.resize_interpolation = resize_interpolation
+        self.blending = blending
+
+    def forward(
+        self, images: List[torch.Tensor], targets: List[Dict[str, Tensor]]
+    ) -> Tuple[List[torch.Tensor], List[Dict[str, Tensor]]]:
+        torch._assert(
+            isinstance(images, (list, tuple)) and all([isinstance(v, torch.Tensor) for v in images]),
+            "images should be a list of tensors",
+        )
+        torch._assert(
+            isinstance(targets, (list, tuple)) and len(images) == len(targets),
+            "targets should be a list of the same size as images",
+        )
+        for target in targets:
+            # Can not check for instance type dict with inside torch.jit.script
+            # torch._assert(isinstance(target, dict), "targets item should be a dict")
+            for k in ["masks", "boxes", "labels"]:
+                torch._assert(k in target, f"Key {k} should be present in targets")
+                torch._assert(isinstance(target[k], torch.Tensor), f"Value for the key {k} should be a tensor")
+
+        # images = [t1, t2, ..., tN]
+        # Let's define paste_images as shifted list of input images
+        # paste_images = [t2, t3, ..., tN, t1]
+        # FYI: in TF they mix data on the dataset level
+        images_rolled = images[-1:] + images[:-1]
+        targets_rolled = targets[-1:] + targets[:-1]
+
+        output_images: List[torch.Tensor] = []
+        output_targets: List[Dict[str, Tensor]] = []
+
+        for image, target, paste_image, paste_target in zip(images, targets, images_rolled, targets_rolled):
+            output_image, output_data = _copy_paste(
+                image,
+                target,
+                paste_image,
+                paste_target,
+                blending=self.blending,
+                resize_interpolation=self.resize_interpolation,
+            )
+            output_images.append(output_image)
+            output_targets.append(output_data)
+
+        return output_images, output_targets
+
+    def __repr__(self) -> str:
+        s = f"{self.__class__.__name__}(blending={self.blending}, resize_interpolation={self.resize_interpolation})"
+        return s
diff --git a/integrations/pytorch_ddp/test/torchvision/utils.py b/integrations/pytorch_ddp/test/torchvision/utils.py
new file mode 100644
index 00000000..f7391558
--- /dev/null
+++ b/integrations/pytorch_ddp/test/torchvision/utils.py
@@ -0,0 +1,282 @@
+import datetime
+import errno
+import os
+import time
+from collections import defaultdict, deque
+
+import torch
+import torch.distributed as dist
+
+
+class SmoothedValue:
+    """Track a series of values and provide access to smoothed values over a
+    window or the global series average.
+    """
+
+    def __init__(self, window_size=20, fmt=None):
+        if fmt is None:
+            fmt = "{median:.4f} ({global_avg:.4f})"
+        self.deque = deque(maxlen=window_size)
+        self.total = 0.0
+        self.count = 0
+        self.fmt = fmt
+
+    def update(self, value, n=1):
+        self.deque.append(value)
+        self.count += n
+        self.total += value * n
+
+    def synchronize_between_processes(self):
+        """
+        Warning: does not synchronize the deque!
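+        Only count and total are all-reduced across ranks; the windowed deque
+        stays rank-local, so median/avg/max remain per-process statistics.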
+ """ + if not is_dist_avail_and_initialized(): + return + t = torch.tensor([self.count, self.total], dtype=torch.float64, device="cuda") + dist.barrier() + dist.all_reduce(t) + t = t.tolist() + self.count = int(t[0]) + self.total = t[1] + + @property + def median(self): + d = torch.tensor(list(self.deque)) + return d.median().item() + + @property + def avg(self): + d = torch.tensor(list(self.deque), dtype=torch.float32) + return d.mean().item() + + @property + def global_avg(self): + return self.total / self.count + + @property + def max(self): + return max(self.deque) + + @property + def value(self): + return self.deque[-1] + + def __str__(self): + return self.fmt.format( + median=self.median, avg=self.avg, global_avg=self.global_avg, max=self.max, value=self.value + ) + + +def all_gather(data): + """ + Run all_gather on arbitrary picklable data (not necessarily tensors) + Args: + data: any picklable object + Returns: + list[data]: list of data gathered from each rank + """ + world_size = get_world_size() + if world_size == 1: + return [data] + data_list = [None] * world_size + dist.all_gather_object(data_list, data) + return data_list + + +def reduce_dict(input_dict, average=True): + """ + Args: + input_dict (dict): all the values will be reduced + average (bool): whether to do average or sum + Reduce the values in the dictionary from all processes so that all processes + have the averaged results. Returns a dict with the same fields as + input_dict, after reduction. + """ + world_size = get_world_size() + if world_size < 2: + return input_dict + with torch.inference_mode(): + names = [] + values = [] + # sort the keys so that they are consistent across processes + for k in sorted(input_dict.keys()): + names.append(k) + values.append(input_dict[k]) + values = torch.stack(values, dim=0) + dist.all_reduce(values) + if average: + values /= world_size + reduced_dict = {k: v for k, v in zip(names, values)} + return reduced_dict + + +class MetricLogger: + def __init__(self, delimiter="\t"): + self.meters = defaultdict(SmoothedValue) + self.delimiter = delimiter + + def update(self, **kwargs): + for k, v in kwargs.items(): + if isinstance(v, torch.Tensor): + v = v.item() + assert isinstance(v, (float, int)) + self.meters[k].update(v) + + def __getattr__(self, attr): + if attr in self.meters: + return self.meters[attr] + if attr in self.__dict__: + return self.__dict__[attr] + raise AttributeError(f"'{type(self).__name__}' object has no attribute '{attr}'") + + def __str__(self): + loss_str = [] + for name, meter in self.meters.items(): + loss_str.append(f"{name}: {str(meter)}") + return self.delimiter.join(loss_str) + + def synchronize_between_processes(self): + for meter in self.meters.values(): + meter.synchronize_between_processes() + + def add_meter(self, name, meter): + self.meters[name] = meter + + def log_every(self, iterable, print_freq, header=None): + i = 0 + if not header: + header = "" + start_time = time.time() + end = time.time() + iter_time = SmoothedValue(fmt="{avg:.4f}") + data_time = SmoothedValue(fmt="{avg:.4f}") + space_fmt = ":" + str(len(str(len(iterable)))) + "d" + if torch.cuda.is_available(): + log_msg = self.delimiter.join( + [ + header, + "[{0" + space_fmt + "}/{1}]", + "eta: {eta}", + "{meters}", + "time: {time}", + "data: {data}", + "max mem: {memory:.0f}", + ] + ) + else: + log_msg = self.delimiter.join( + [header, "[{0" + space_fmt + "}/{1}]", "eta: {eta}", "{meters}", "time: {time}", "data: {data}"] + ) + MB = 1024.0 * 1024.0 + for obj in iterable: + 
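+            # `end` is the timestamp at which the previous iteration finished.
+            # The delta taken just below therefore measures how long the
+            # underlying iterable took to produce `obj` (data-loading time),
+            # while `iter_time`, updated after `yield` returns, additionally
+            # includes the consumer's processing time for this element.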
data_time.update(time.time() - end) + yield obj + iter_time.update(time.time() - end) + if i % print_freq == 0 or i == len(iterable) - 1: + eta_seconds = iter_time.global_avg * (len(iterable) - i) + eta_string = str(datetime.timedelta(seconds=int(eta_seconds))) + if torch.cuda.is_available(): + print( + log_msg.format( + i, + len(iterable), + eta=eta_string, + meters=str(self), + time=str(iter_time), + data=str(data_time), + memory=torch.cuda.max_memory_allocated() / MB, + ) + ) + else: + print( + log_msg.format( + i, len(iterable), eta=eta_string, meters=str(self), time=str(iter_time), data=str(data_time) + ) + ) + i += 1 + end = time.time() + total_time = time.time() - start_time + total_time_str = str(datetime.timedelta(seconds=int(total_time))) + print(f"{header} Total time: {total_time_str} ({total_time / len(iterable):.4f} s / it)") + + +def collate_fn(batch): + return tuple(zip(*batch)) + + +def mkdir(path): + try: + os.makedirs(path) + except OSError as e: + if e.errno != errno.EEXIST: + raise + + +def setup_for_distributed(is_master): + """ + This function disables printing when not in master process + """ + import builtins as __builtin__ + + builtin_print = __builtin__.print + + def print(*args, **kwargs): + force = kwargs.pop("force", False) + if is_master or force: + builtin_print(*args, **kwargs) + + __builtin__.print = print + + +def is_dist_avail_and_initialized(): + if not dist.is_available(): + return False + if not dist.is_initialized(): + return False + return True + + +def get_world_size(): + if not is_dist_avail_and_initialized(): + return 1 + return dist.get_world_size() + + +def get_rank(): + if not is_dist_avail_and_initialized(): + return 0 + return dist.get_rank() + + +def is_main_process(): + return get_rank() == 0 + + +def save_on_master(*args, **kwargs): + if is_main_process(): + torch.save(*args, **kwargs) + + +def init_distributed_mode(args): + if "RANK" in os.environ and "WORLD_SIZE" in os.environ: + args.rank = int(os.environ["RANK"]) + args.world_size = int(os.environ["WORLD_SIZE"]) + args.gpu = int(os.environ["LOCAL_RANK"]) + elif "SLURM_PROCID" in os.environ: + args.rank = int(os.environ["SLURM_PROCID"]) + args.gpu = args.rank % torch.cuda.device_count() + else: + print("Not using distributed mode") + args.distributed = False + return + + args.distributed = True + + torch.cuda.set_device(args.gpu) + args.dist_backend = "nccl" + print(f"| distributed init (rank {args.rank}): {args.dist_url}", flush=True) + torch.distributed.init_process_group( + backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size, rank=args.rank + ) + torch.distributed.barrier() + setup_for_distributed(args.rank == 0) diff --git a/integrations/pytorch_ddp/test/torchvision/utils.py.1 b/integrations/pytorch_ddp/test/torchvision/utils.py.1 new file mode 100644 index 00000000..f7391558 --- /dev/null +++ b/integrations/pytorch_ddp/test/torchvision/utils.py.1 @@ -0,0 +1,282 @@ +import datetime +import errno +import os +import time +from collections import defaultdict, deque + +import torch +import torch.distributed as dist + + +class SmoothedValue: + """Track a series of values and provide access to smoothed values over a + window or the global series average. 
+ """ + + def __init__(self, window_size=20, fmt=None): + if fmt is None: + fmt = "{median:.4f} ({global_avg:.4f})" + self.deque = deque(maxlen=window_size) + self.total = 0.0 + self.count = 0 + self.fmt = fmt + + def update(self, value, n=1): + self.deque.append(value) + self.count += n + self.total += value * n + + def synchronize_between_processes(self): + """ + Warning: does not synchronize the deque! + """ + if not is_dist_avail_and_initialized(): + return + t = torch.tensor([self.count, self.total], dtype=torch.float64, device="cuda") + dist.barrier() + dist.all_reduce(t) + t = t.tolist() + self.count = int(t[0]) + self.total = t[1] + + @property + def median(self): + d = torch.tensor(list(self.deque)) + return d.median().item() + + @property + def avg(self): + d = torch.tensor(list(self.deque), dtype=torch.float32) + return d.mean().item() + + @property + def global_avg(self): + return self.total / self.count + + @property + def max(self): + return max(self.deque) + + @property + def value(self): + return self.deque[-1] + + def __str__(self): + return self.fmt.format( + median=self.median, avg=self.avg, global_avg=self.global_avg, max=self.max, value=self.value + ) + + +def all_gather(data): + """ + Run all_gather on arbitrary picklable data (not necessarily tensors) + Args: + data: any picklable object + Returns: + list[data]: list of data gathered from each rank + """ + world_size = get_world_size() + if world_size == 1: + return [data] + data_list = [None] * world_size + dist.all_gather_object(data_list, data) + return data_list + + +def reduce_dict(input_dict, average=True): + """ + Args: + input_dict (dict): all the values will be reduced + average (bool): whether to do average or sum + Reduce the values in the dictionary from all processes so that all processes + have the averaged results. Returns a dict with the same fields as + input_dict, after reduction. 
+ """ + world_size = get_world_size() + if world_size < 2: + return input_dict + with torch.inference_mode(): + names = [] + values = [] + # sort the keys so that they are consistent across processes + for k in sorted(input_dict.keys()): + names.append(k) + values.append(input_dict[k]) + values = torch.stack(values, dim=0) + dist.all_reduce(values) + if average: + values /= world_size + reduced_dict = {k: v for k, v in zip(names, values)} + return reduced_dict + + +class MetricLogger: + def __init__(self, delimiter="\t"): + self.meters = defaultdict(SmoothedValue) + self.delimiter = delimiter + + def update(self, **kwargs): + for k, v in kwargs.items(): + if isinstance(v, torch.Tensor): + v = v.item() + assert isinstance(v, (float, int)) + self.meters[k].update(v) + + def __getattr__(self, attr): + if attr in self.meters: + return self.meters[attr] + if attr in self.__dict__: + return self.__dict__[attr] + raise AttributeError(f"'{type(self).__name__}' object has no attribute '{attr}'") + + def __str__(self): + loss_str = [] + for name, meter in self.meters.items(): + loss_str.append(f"{name}: {str(meter)}") + return self.delimiter.join(loss_str) + + def synchronize_between_processes(self): + for meter in self.meters.values(): + meter.synchronize_between_processes() + + def add_meter(self, name, meter): + self.meters[name] = meter + + def log_every(self, iterable, print_freq, header=None): + i = 0 + if not header: + header = "" + start_time = time.time() + end = time.time() + iter_time = SmoothedValue(fmt="{avg:.4f}") + data_time = SmoothedValue(fmt="{avg:.4f}") + space_fmt = ":" + str(len(str(len(iterable)))) + "d" + if torch.cuda.is_available(): + log_msg = self.delimiter.join( + [ + header, + "[{0" + space_fmt + "}/{1}]", + "eta: {eta}", + "{meters}", + "time: {time}", + "data: {data}", + "max mem: {memory:.0f}", + ] + ) + else: + log_msg = self.delimiter.join( + [header, "[{0" + space_fmt + "}/{1}]", "eta: {eta}", "{meters}", "time: {time}", "data: {data}"] + ) + MB = 1024.0 * 1024.0 + for obj in iterable: + data_time.update(time.time() - end) + yield obj + iter_time.update(time.time() - end) + if i % print_freq == 0 or i == len(iterable) - 1: + eta_seconds = iter_time.global_avg * (len(iterable) - i) + eta_string = str(datetime.timedelta(seconds=int(eta_seconds))) + if torch.cuda.is_available(): + print( + log_msg.format( + i, + len(iterable), + eta=eta_string, + meters=str(self), + time=str(iter_time), + data=str(data_time), + memory=torch.cuda.max_memory_allocated() / MB, + ) + ) + else: + print( + log_msg.format( + i, len(iterable), eta=eta_string, meters=str(self), time=str(iter_time), data=str(data_time) + ) + ) + i += 1 + end = time.time() + total_time = time.time() - start_time + total_time_str = str(datetime.timedelta(seconds=int(total_time))) + print(f"{header} Total time: {total_time_str} ({total_time / len(iterable):.4f} s / it)") + + +def collate_fn(batch): + return tuple(zip(*batch)) + + +def mkdir(path): + try: + os.makedirs(path) + except OSError as e: + if e.errno != errno.EEXIST: + raise + + +def setup_for_distributed(is_master): + """ + This function disables printing when not in master process + """ + import builtins as __builtin__ + + builtin_print = __builtin__.print + + def print(*args, **kwargs): + force = kwargs.pop("force", False) + if is_master or force: + builtin_print(*args, **kwargs) + + __builtin__.print = print + + +def is_dist_avail_and_initialized(): + if not dist.is_available(): + return False + if not dist.is_initialized(): + return False + return 
True + + +def get_world_size(): + if not is_dist_avail_and_initialized(): + return 1 + return dist.get_world_size() + + +def get_rank(): + if not is_dist_avail_and_initialized(): + return 0 + return dist.get_rank() + + +def is_main_process(): + return get_rank() == 0 + + +def save_on_master(*args, **kwargs): + if is_main_process(): + torch.save(*args, **kwargs) + + +def init_distributed_mode(args): + if "RANK" in os.environ and "WORLD_SIZE" in os.environ: + args.rank = int(os.environ["RANK"]) + args.world_size = int(os.environ["WORLD_SIZE"]) + args.gpu = int(os.environ["LOCAL_RANK"]) + elif "SLURM_PROCID" in os.environ: + args.rank = int(os.environ["SLURM_PROCID"]) + args.gpu = args.rank % torch.cuda.device_count() + else: + print("Not using distributed mode") + args.distributed = False + return + + args.distributed = True + + torch.cuda.set_device(args.gpu) + args.dist_backend = "nccl" + print(f"| distributed init (rank {args.rank}): {args.dist_url}", flush=True) + torch.distributed.init_process_group( + backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size, rank=args.rank + ) + torch.distributed.barrier() + setup_for_distributed(args.rank == 0) From 6feceed7684f842506cc3c2d4b7903cf810f7037 Mon Sep 17 00:00:00 2001 From: Laurent Wirz Date: Tue, 14 May 2024 15:04:55 +0200 Subject: [PATCH 14/64] Simulator fix --- .../process_group_wrapper.py | 1 + .../pytorch_ddp/src/ProcessGroupACCL.cpp | 143 +++++++++++++++--- integrations/pytorch_ddp/test/run.sh | 6 +- integrations/pytorch_ddp/test/test-generic.py | 61 +++++--- 4 files changed, 167 insertions(+), 44 deletions(-) diff --git a/integrations/pytorch_ddp/accl_process_group/process_group_wrapper.py b/integrations/pytorch_ddp/accl_process_group/process_group_wrapper.py index 3948d728..2c346f73 100644 --- a/integrations/pytorch_ddp/accl_process_group/process_group_wrapper.py +++ b/integrations/pytorch_ddp/accl_process_group/process_group_wrapper.py @@ -84,6 +84,7 @@ def create_process_group_wrapper(store, rank, size, _timeout): return pg #CPU only for now + logger.debug('Registering ACCL Backend') Backend.register_backend("ACCL", create_process_group_wrapper, devices='cpu') def initialize() -> None: diff --git a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp index 8a83e004..9cec3e90 100644 --- a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp +++ b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp @@ -23,6 +23,7 @@ #include #include #include +#include #ifdef ACCL_PROCESS_GROUP_HIP_ENABLED #include "hip/hip_runtime.h" @@ -56,6 +57,16 @@ namespace c10d { #error Cannot compile Process Group with both HIP and CUDA support #endif // ACCL_PROCESS_GROUP_HIP_ENABLED && ACCL_PROCESS_GROUP_CUDA_ENABLED +// Activate Parameter printing: +#define DO_PARA_PRINT + +#if defined(DO_PARA_PRINT) + #define PARA_PRINT(x) \ + ACCL::debug("#x size: " + std::to_string(x.numel()) + " of type: " + string_of_accl_datatype(convert_datatype_from_torch(x.scalar_type()))) +#else + #define PARA_PRINT(x) +#endif + namespace { /* Alternative for std::format from C++20 in C++17. 
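// Note on the PARA_PRINT macro added above: inside a string literal, "#x" is
// not treated as the stringize operator, so the debug line prints the literal
// text "#x size:" rather than the argument's name. A sketch of a stringizing
// variant, reusing the same helpers this patch introduces
// (string_of_accl_datatype, convert_datatype_from_torch):
//
//   #define PARA_PRINT(x)                                                    \
//     ACCL::debug(std::string(#x) + " size: " + std::to_string(x.numel()) +  \
//                 " of type: " +                                             \
//                 string_of_accl_datatype(                                   \
//                     convert_datatype_from_torch(x.scalar_type())))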
@@ -198,6 +209,25 @@ const char *convert_datatype_to_torch(ACCL::dataType torch_type) { } } +const char *string_of_accl_datatype(ACCL::dataType accl_type) { + switch (accl_type) { + case ACCL::dataType::float16: + return "ACCL::dataType::float16"; + case ACCL::dataType::float32: + return "ACCL::dataType::float32"; + case ACCL::dataType::float64: + return "ACCL::dataType::float64"; + case ACCL::dataType::int32: + return "ACCL::dataType::int32"; + case ACCL::dataType::int64: + return "ACCL::dataType::int64"; + default: + return "unknown"; + } +} + + + std::map convert_compression_from_dict( const std::map &dictionary) { std::map map; @@ -581,6 +611,23 @@ std::vector convert_ranks( return accl_ranks; } +// just for the sa_handler +std::unique_ptr<::ACCL::ACCL>* global_accl; + +void accl_sa_handler(int) +{ + static bool once = true; + if(once) { + global_accl->reset(); + // std::cout << "Error! Signal received. Finalizing MPI..." << std::endl; + // MPI_Finalize(); + // std::cout << "Done. Terminating..." << std::endl; + once = false; + } + exit(EXIT_FAILURE); +} + + // Initialize ACCL ProcessGroupACCL::ProcessGroupACCL( const c10::intrusive_ptr<::c10d::Store> &store, int rank, int size, @@ -599,6 +646,15 @@ ProcessGroupACCL::ProcessGroupACCL( || design == accl_network_utils::acclDesign::CYT_TCP), compression(compression), initialized(false) { + ACCL::debug("Process Group constructor called"); + + // struct sigaction sa; + // memset(&sa, 0, sizeof(sa)); + // sa.sa_handler = accl_sa_handler; + // sigfillset(&sa.sa_mask); + // sigaction(SIGINT,&sa,NULL); + // sigaction(SIGSEGV, &sa, NULL); + if (std::find(profiling_ranks.begin(), profiling_ranks.end(), rank) != profiling_ranks.end()) { std::this_thread::sleep_for( @@ -613,6 +669,7 @@ ProcessGroupACCL::ProcessGroupACCL( cyt_device = new ACCL::CoyoteDevice(); } else if (design_ == accl_network_utils::acclDesign::CYT_RDMA) { cyt_device = new ACCL::CoyoteDevice(size_); + ACCL::debug("Starting QP-exchange"); cyt::setup_cyt_rdma(ibvQpConn_vec, ranks_, rank_, *cyt_device); } else { throw std::runtime_error("Undefined ACCL design"); @@ -659,11 +716,13 @@ void ProcessGroupACCL::initialize() { } accl = std::make_unique(cyt_device); + // global_accl = &accl; // Rendezvous protocol for now int protoc = 1; // default from test.cpp int segsize = 4096 * 1024; + if (protoc == 0){ std::cout<<"Eager Protocol"<dump_communicator()); } else { + ACCL::debug(std::string("Performing standard initialization")); accl = accl_network_utils::initialize_accl(ranks_, rank_, simulator_, design_, xrt_device, xclbin_, nbufs_, bufsize, 0, rsfec_); + ACCL::debug(std::string("Setting timeout and Threshold")); accl->set_timeout(1e6); accl->set_rendezvous_threshold(16*1024); @@ -694,6 +755,7 @@ void ProcessGroupACCL::initialize() { // Start the worker thread accepting ACCL calls workerThread_ = std::thread(&ProcessGroupACCL::runLoop, this); initialized = true; + ACCL::debug(std::string("Finished Initialization")); } ProcessGroupACCL::~ProcessGroupACCL() { destroy(); } @@ -1618,6 +1680,9 @@ void ProcessGroupACCL::run_alltoall(at::Tensor srctensor_original, std::unique_ptr srcdata; std::unique_ptr dstdata; + ACCL::debug("Running alltoall"); + PARA_PRINT(srctensor_original); + // Reserve device c10::DeviceGuard guard(srctensor->device()); std::unique_lock globalLock(pgGlobalMutex_); @@ -1627,32 +1692,54 @@ void ProcessGroupACCL::run_alltoall(at::Tensor srctensor_original, if (p2p_applicable(*accl, srctensor_original, p2p_enabled)) { srcdata = create_and_copy_p2p_buffer(*accl, 
srctensor_original); } else { - if (accl->is_simulated() || coyote_enabled) { + if (coyote_enabled) { + srcdata = create_coyotebuffer(*accl, srctensor->numel(), srctensor->scalar_type()); + ACCL::debug("Copying data to CPU tensor of size " + + std::to_string(srctensor_original.numel())); + empty_srctensor = torch::from_blob( + srcdata->byte_array(), srctensor_original.sizes(), + srctensor_original.options().device(c10::DeviceType::CPU)); + srctensor = &empty_srctensor; + srctensor->copy_(srctensor_original); + } + else if (srctensor_original.device().type() != c10::DeviceType::CPU) { srcdata = create_buffer(*accl, srctensor->numel(), srctensor->scalar_type()); - } else { - srcdata = wrap_buffer(*accl, buf0, srctensor->numel(), srctensor->scalar_type()); + ACCL::debug("Copying data to CPU tensor of size " + + std::to_string(srctensor_original.numel())); + empty_srctensor = torch::from_blob( + srcdata->byte_array(), srctensor_original.sizes(), + srctensor_original.options().device(c10::DeviceType::CPU)); + srctensor = &empty_srctensor; + srctensor->copy_(srctensor_original); + } + else { + srcdata = create_buffer(*accl, *srctensor); } - ACCL::debug("Copying data to aligned CPU tensor of size " + - std::to_string(srctensor_original.numel())); - empty_srctensor = torch::from_blob( - srcdata->byte_array(), srctensor_original.sizes(), - srctensor_original.options().device(c10::DeviceType::CPU)); - srctensor = &empty_srctensor; - srctensor->copy_(srctensor_original); - ACCL::debug("Creating extra result buffer of size " + - std::to_string(srctensor_original.numel())); } - // Create output buffer if (p2p_applicable(*accl, dsttensor_original, p2p_enabled)) { - dstdata = create_and_copy_p2p_buffer(*accl, dsttensor_original); + dstdata = create_buffer_p2p(*accl, srctensor->numel(), srctensor->scalar_type()); + } else if (coyote_enabled) { + dstdata = create_coyotebuffer(*accl, srctensor->numel(),srctensor->scalar_type()); + torch::from_blob(dstdata->byte_array(), srctensor_original.sizes(), + srctensor_original.options().device(c10::DeviceType::CPU)); + dsttensor = &empty_dsttensor; } else { - if (accl->is_simulated() || coyote_enabled) { - dstdata = create_buffer(*accl, dsttensor->numel(), dsttensor->scalar_type()); - } else { - dstdata = wrap_buffer(*accl, buf0, dsttensor->numel(), dsttensor->scalar_type()); - } + dstdata = create_buffer(*accl, srctensor->numel(), srctensor->scalar_type()); + empty_dsttensor = torch::from_blob(dstdata->byte_array(), srctensor_original.sizes(), srctensor_original.options().device(c10::DeviceType::CPU)); + dsttensor = &empty_dsttensor; } + + // Create output buffer + // if (p2p_applicable(*accl, dsttensor_original, p2p_enabled)) { + // dstdata = create_and_copy_p2p_buffer(*accl, dsttensor_original); + // } else { + // if (accl->is_simulated() || coyote_enabled) { + // dstdata = create_buffer(*accl, dsttensor->numel(), dsttensor->scalar_type()); + // } else { + // dstdata = wrap_buffer(*accl, buf0, dsttensor->numel(), dsttensor->scalar_type()); + // } + // } // Run alltoall if (!coyote_enabled) { @@ -1664,14 +1751,17 @@ void ProcessGroupACCL::run_alltoall(at::Tensor srctensor_original, ACCL::debug("Starting alltoall of " + std::to_string(srctensor->numel()) + " items"); + + // ACCL::ACCLRequest* req = accl->bcast(*srcdata, srctensor->numel(), 0, ACCL::GLOBAL_COMM, true, + // true, get_compressed_type(srctensor->scalar_type())); ACCL::ACCLRequest* req = accl->alltoall(*srcdata, *dstdata, srctensor->numel(), ACCL::GLOBAL_COMM, true, true, 
get_compressed_type(srctensor->scalar_type())); - if(coyote_enabled){ - ACCL::debug("Waiting for request to complete."); - accl->wait(req, 1000ms); - } + // if(coyote_enabled){ + ACCL::debug("Waiting for request to complete."); + accl->wait(req, 1000ms); + // } ACCL::debug("Finished waiting"); if (!coyote_enabled) { @@ -1694,6 +1784,7 @@ c10::intrusive_ptr ProcessGroupACCL::alltoall_base( at::Tensor &outputTensor, at::Tensor &inputTensor, std::vector &outputSplitSizes, std::vector &inputSplitSizes, const AllToAllOptions &opts) { + ACCL::debug("alltoall base variant called"); if (outputSplitSizes.size() == 0 && inputSplitSizes.size() == 0) { // We can use alltoall TORCH_CHECK( @@ -1708,8 +1799,8 @@ c10::intrusive_ptr ProcessGroupACCL::alltoall_base( [opts, this](std::unique_ptr& entry) { auto srctensor = (entry->src)[0]; auto dsttensor = (entry->dst)[0]; - c10::DeviceGuard guard(srctensor.device()); - std::unique_lock globalLock(pgGlobalMutex_); + // c10::DeviceGuard guard(srctensor.device()); + // std::unique_lock globalLock(pgGlobalMutex_); // Segment data if necessary if (dsttensor.nbytes() > bufsize) { ACCL::debug("dsttensor to large!"); @@ -1721,6 +1812,7 @@ c10::intrusive_ptr ProcessGroupACCL::alltoall_base( run_alltoall(srctensor.slice(0, i, end), dsttensor.slice(0, i, end), opts); } } else { + ACCL::debug("Running without segmentation"); run_alltoall(srctensor, dsttensor, opts); } }; @@ -1741,6 +1833,7 @@ c10::intrusive_ptr ProcessGroupACCL::alltoall(std::vector &outputTensors, std::vector &inputTensors, const AllToAllOptions &opts) { + ACCL::debug("ProcessGroupACCL does not support alltoall"); TORCH_CHECK(false, "ProcessGroupACCL does not support alltoall"); } diff --git a/integrations/pytorch_ddp/test/run.sh b/integrations/pytorch_ddp/test/run.sh index 0dbf3d72..15d9bd67 100755 --- a/integrations/pytorch_ddp/test/run.sh +++ b/integrations/pytorch_ddp/test/run.sh @@ -98,10 +98,12 @@ echo "Running with $NUM_PROCESS Processes" rm -f $(pwd)/accl_log/rank* # C="mpirun -n $NUM_PROCESS $MPI_ARGS -outfile-pattern \"$(pwd)/accl_log/rank_%r_stdout\" $EXEC $ARG &" -C="mpirun -n $NUM_PROCESS $MPI_ARGS -outfile-pattern \"$(pwd)/accl_log/rank_%r_stdout\" -errfile-pattern \"$(pwd)/accl_log/rank_%r_stderr\" $EXEC $ARG &" -# C="mpirun -n $NUM_PROCESS $MPI_ARGS $EXEC $ARG &" +# C="mpirun -n $NUM_PROCESS $MPI_ARGS -outfile-pattern \"$(pwd)/accl_log/rank_%r_stdout\" -errfile-pattern \"$(pwd)/accl_log/rank_%r_stderr\" $EXEC $ARG &" +C="mpirun -n $NUM_PROCESS $MPI_ARGS $EXEC $ARG &" echo $C +exit 0 + /bin/sh -c "$C" if ! [[ -v SLEEPTIME ]]; then diff --git a/integrations/pytorch_ddp/test/test-generic.py b/integrations/pytorch_ddp/test/test-generic.py index 448e7259..faf7de29 100644 --- a/integrations/pytorch_ddp/test/test-generic.py +++ b/integrations/pytorch_ddp/test/test-generic.py @@ -50,7 +50,7 @@ count = 512 #As in test.cpp defaults -rxbufsize = 4096 * 1024 +rxbufsize = 4096# * 1024 def test_broadcast(): @@ -61,8 +61,8 @@ def test_broadcast(): dist.broadcast(x, 0) - logger.debug('Tensor after broadcast: ' + str(x)) - print('Tensor after broadcast: ' + str(x)) + # logger.debug('Tensor after broadcast: ' + str(x)) + # print('Tensor after broadcast: ' + str(x)) np.testing.assert_allclose(x, torch.ones(count)) print("Test broadcast finished!") @@ -145,6 +145,25 @@ def test_allreduce(): np.testing.assert_allclose(x, [size for _ in range(count)]) print("Test allreduce finished!") +def test_alltoall(): + input = torch.arange(4, dtype=torch.float) + float(rank) * 4. 
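+    # With the default equal splits, all_to_all_single sends input[j] to
+    # rank j and receives one element from every peer. Assuming a world
+    # size of 4, rank r starts with [4r, 4r+1, 4r+2, 4r+3] and should end
+    # up holding [r, 4+r, 8+r, 12+r] after the exchange.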
+ + logger.debug("All-to-all input:") + logger.debug(str(input)) + + output = torch.ones(4) + # output = torch.empty([4], dtype=torch.int64) + + logger.debug("All-to-all output:") + logger.debug(str(output)) + + dist.all_to_all_single(output, input) + + logger.debug("All-to-all output:") + logger.debug(str(output)) + + print("Test allreduce finished!") + class ToyModel(nn.Module): def __init__(self): @@ -204,7 +223,7 @@ def demo_basic(rank: int): print("finished training") - dist.destroy_process_group() + # dist.destroy_process_group() def start_test(comms: str, simulator: bool, host_file: str=None, fpga_file: str=None, ma: str="localhost", mp: str="30505"): global rank, size @@ -217,7 +236,7 @@ def start_test(comms: str, simulator: bool, host_file: str=None, fpga_file: str= rank = mpi.Get_rank() size = mpi.Get_size() start_port = 5005 - print(f"Starting tests with the following parameters:\n\ + logger.debug(f"Starting tests with the following parameters:\n\ Simulation: {simulator}, Communication Backend: {comms}\n\ Rank: {rank}, World size: {size}\n\ Host file: {host_file}, FPGA file: {fpga_file}\n\ @@ -225,6 +244,8 @@ def start_test(comms: str, simulator: bool, host_file: str=None, fpga_file: str= if not simulator: + #default from test.cpp + rxbufsize = 4096 * 1024 if host_file==None or fpga_file==None: sys.exit('Host and FPGA file need to be specified in hardware mode') with open(host_file, 'r') as hf: @@ -236,8 +257,10 @@ def start_test(comms: str, simulator: bool, host_file: str=None, fpga_file: str= if comms == "cyt_rdma": ranks = [accl.Rank(a, start_port, i, rxbufsize) for i, a in enumerate(fpga_ips)] else: - ranks = [accl.Rank(a, start_port + i, 0, rxbufsize) for i, a in enumerate(fpga_ips)] + ranks = [accl.Rank(a, start_port + i, 0, rxbufsize) for i, a in enumerate(fpga_ips)] else: + # Somehow the simulator gets stuck if I use the same rxbufsize + rxbufsize = 4096# * 1024 ranks = [accl.Rank("127.0.0.1", 5500 + i, i, rxbufsize) for i in range(size)] logger.debug(f'Ranks: {ranks}') @@ -246,13 +269,13 @@ def start_test(comms: str, simulator: bool, host_file: str=None, fpga_file: str= design = accl.ACCLDesign.udp elif comms == 'tcp': design = accl.ACCLDesign.tcp - elif comms == 'cyt_rdma' and not simulator: + elif comms == 'cyt_rdma': # and not simulator: design = accl.ACCLDesign.cyt_rdma - else: - if simulator: - sys.exit('Design "' + comms + '" currently not supported in simulator mode') - else: - sys.exit('Design "' + comms + '" currently not supported in hardware mode') + # else: + # if simulator: + # sys.exit('Design "' + comms + '" currently not supported in simulator mode') + # else: + # sys.exit('Design "' + comms + '" currently not supported in hardware mode') accl.create_process_group(ranks, design, bufsize=rxbufsize, initialize=True, simulation=simulator) @@ -271,13 +294,17 @@ def start_test(comms: str, simulator: bool, host_file: str=None, fpga_file: str= mpi.Barrier() test_allgather() mpi.Barrier() - # test_reduce() - # mpi.Barrier() - # test_allreduce() - # mpi.Barrier() + test_reduce() + mpi.Barrier() + test_allreduce() + mpi.Barrier() demo_basic(rank) mpi.Barrier() - + # run_training() + # mpi.Barrier() + # test_alltoall() + # mpi.Barrier() + print("Finished testing") logger.debug('Finished testing') From a94a5e2a6e888c62fa1c02dc0b0bed26465c77f1 Mon Sep 17 00:00:00 2001 From: Laurent Wirz Date: Fri, 17 May 2024 11:26:20 +0200 Subject: [PATCH 15/64] Introduced initialization helper functions --- .../pytorch_ddp/include/ProcessGroupACCL.hpp | 3 + 
.../pytorch_ddp/src/ProcessGroupACCL.cpp | 323 +++++++----------- integrations/pytorch_ddp/test/test-generic.py | 32 +- 3 files changed, 134 insertions(+), 224 deletions(-) diff --git a/integrations/pytorch_ddp/include/ProcessGroupACCL.hpp b/integrations/pytorch_ddp/include/ProcessGroupACCL.hpp index ae9944ca..97c4e013 100644 --- a/integrations/pytorch_ddp/include/ProcessGroupACCL.hpp +++ b/integrations/pytorch_ddp/include/ProcessGroupACCL.hpp @@ -292,6 +292,9 @@ class TORCH_API ProcessGroupACCL : public ProcessGroup { // Global states static void initACCLOnce(); static void acclExit(); + void init_input_tensor(at::Tensor &tensor_original, at::Tensor &tensor, std::unique_ptr &data, bool do_on_root, bool do_on_others, int opts_root_rank = 0); + void init_output_tensor(at::Tensor &tensor, at::Tensor &tensor_original, std::unique_ptr &dstdata, int out_tensor_size, bool do_on_root, bool do_on_others, int opts_root_rank = 0); + void copy_back_tensor(at::Tensor tensor_original, std::unique_ptr &data, bool do_on_root, bool do_on_others, int opts_root_rank = 0); static std::once_flag onceFlagInitACCL; static std::mutex pgGlobalMutex_; diff --git a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp index 9cec3e90..e1205fa9 100644 --- a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp +++ b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp @@ -24,6 +24,7 @@ #include #include #include +#include #ifdef ACCL_PROCESS_GROUP_HIP_ENABLED #include "hip/hip_runtime.h" @@ -67,6 +68,26 @@ namespace c10d { #define PARA_PRINT(x) #endif + +#define STANDARD_DECL \ + at::Tensor *tensor = &tensor_original; \ + std::unique_ptr data; \ + std::unique_ptr dstdata; \ + +#define DO_COND ((do_on_root && opts_root_rank == rank_) || (do_on_others && opts_root_rank != rank_)) + +#define PRE_REQUEST(opname, tensor) \ + ACCL::debug("[" #opname "] Entering barrier"); \ + accl->barrier(); \ + ACCL::debug("Starting " #opname " of " + std::to_string(tensor->numel()) + " items"); + +#define POST_REQUEST \ + if(coyote_enabled){ \ + ACCL::debug("Waiting for request to complete."); \ + accl->wait(req, 1000ms); \ + } \ + ACCL::debug("Finished waiting"); + namespace { /* Alternative for std::format from C++20 in C++17. 
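// Note on the helper macros above: DO_COND carries its own outer
// parentheses, which is what makes the bare `if DO_COND { ... }` spelling
// used throughout this patch legal C++. A guard written as
//
//   if DO_COND { ... }
//
// expands to
//
//   if ((do_on_root && opts_root_rank == rank_) ||
//       (do_on_others && opts_root_rank != rank_)) { ... }
//
// i.e. "run on the root rank if do_on_root, and on every other rank if
// do_on_others".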
@@ -625,8 +646,61 @@ void accl_sa_handler(int) once = false; } exit(EXIT_FAILURE); -} +} + +void ProcessGroupACCL::init_input_tensor(at::Tensor &tensor_original, at::Tensor &tensor, std::unique_ptr &data, bool do_on_root, bool do_on_others, int opts_root_rank) { + // at::Tensor empty_tensor; + if (p2p_applicable(*accl, tensor_original, p2p_enabled)) { + data = create_and_copy_p2p_buffer(*accl, tensor_original); + } else { + if (coyote_enabled) { + data = create_coyotebuffer(*accl, tensor.numel(), tensor.scalar_type()); + } else if (tensor_original.device().type() != c10::DeviceType::CPU) { + data = create_buffer(*accl, tensor.numel(), tensor.scalar_type()); + } else { + data = create_buffer(*accl, tensor); + } + if (coyote_enabled || tensor_original.device().type() != c10::DeviceType::CPU){ + ACCL::debug("Copying data to CPU tensor of size " + std::to_string(tensor_original.numel())); + tensor = torch::from_blob(data->byte_array(), tensor_original.sizes(), + tensor_original.options().device(c10::DeviceType::CPU)); + if DO_COND { + tensor.copy_(tensor_original); + } + } + } +} + void ProcessGroupACCL::init_output_tensor(at::Tensor &tensor, at::Tensor &tensor_original, std::unique_ptr &dstdata, int out_tensor_size, bool do_on_root, bool do_on_others, int opts_root_rank) { + if DO_COND { + if (p2p_applicable(*accl, tensor_original, p2p_enabled)) { + dstdata = create_buffer_p2p(*accl, tensor.numel(), tensor.scalar_type()); + } else { + if (coyote_enabled) { + dstdata = create_coyotebuffer(*accl, tensor.numel(), tensor.scalar_type()); + } else { + dstdata = create_buffer(*accl, tensor.numel(), tensor.scalar_type()); + } + } + } + } + + void ProcessGroupACCL::copy_back_tensor(at::Tensor tensor_original, std::unique_ptr &data, bool do_on_root, bool do_on_others, int opts_root_rank){ + if (!coyote_enabled && DO_COND) { + data->sync_from_device(); + } + if (p2p_applicable(*accl, tensor_original, p2p_enabled)) { + copy_back_p2p_buffer(*data, tensor_original); + } else { + ACCL::debug("Copying data back from CPU tensor of size " + + std::to_string(tensor_original.numel())); + if DO_COND { + tensor_original.copy_(torch::from_blob(data->byte_array(), tensor_original.sizes(), tensor_original.options().device(c10::DeviceType::CPU))); + } + } + } + + // Initialize ACCL ProcessGroupACCL::ProcessGroupACCL( @@ -648,26 +722,29 @@ ProcessGroupACCL::ProcessGroupACCL( ACCL::debug("Process Group constructor called"); - // struct sigaction sa; - // memset(&sa, 0, sizeof(sa)); - // sa.sa_handler = accl_sa_handler; - // sigfillset(&sa.sa_mask); - // sigaction(SIGINT,&sa,NULL); - // sigaction(SIGSEGV, &sa, NULL); + struct sigaction sa; + memset(&sa, 0, sizeof(sa)); + sa.sa_handler = accl_sa_handler; + sigfillset(&sa.sa_mask); + sigaction(SIGINT,&sa,NULL); + sigaction(SIGSEGV, &sa, NULL); if (std::find(profiling_ranks.begin(), profiling_ranks.end(), rank) != profiling_ranks.end()) { std::this_thread::sleep_for( std::chrono::duration(profiling_timeout)); } - + + ACCL::debug("Converting ranks"); ranks_ = convert_ranks(ranks); design_ = design; + MPI_Barrier(MPI_COMM_WORLD); if (!simulator){ if (coyote_enabled) { if (design_ == accl_network_utils::acclDesign::CYT_TCP) { cyt_device = new ACCL::CoyoteDevice(); } else if (design_ == accl_network_utils::acclDesign::CYT_RDMA) { + ACCL::debug("Creating CoyoteDevice"); cyt_device = new ACCL::CoyoteDevice(size_); ACCL::debug("Starting QP-exchange"); cyt::setup_cyt_rdma(ibvQpConn_vec, ranks_, rank_, *cyt_device); @@ -716,7 +793,7 @@ void ProcessGroupACCL::initialize() { } accl = 
std::make_unique(cyt_device); - // global_accl = &accl; + global_accl = &accl; // Rendezvous protocol for now int protoc = 1; @@ -830,86 +907,28 @@ c10::intrusive_ptr ProcessGroupACCL::enqueue( void ProcessGroupACCL::run_broadcast(at::Tensor tensor_original, const BroadcastOptions &opts) { - at::Tensor *tensor = &tensor_original; - at::Tensor empty_tensor; - std::unique_ptr data; + + STANDARD_DECL + + init_input_tensor(tensor_original, *tensor, data, true, false, opts.rootRank); // Reserve device c10::DeviceGuard guard(tensor->device()); std::unique_lock globalLock(pgGlobalMutex_); - // Copy data from GPU to FPGA if necessary - if (p2p_applicable(*accl, tensor_original, p2p_enabled)) { - data = create_and_copy_p2p_buffer(*accl, tensor_original); - } else { - if (coyote_enabled) { - // Copy tensor to CPU tensor first - data = create_coyotebuffer(*accl, tensor->numel(), tensor->scalar_type()); - ACCL::debug("Copying data to CPU tensor of size " + - std::to_string(tensor_original.numel())); - empty_tensor = torch::from_blob( - data->byte_array(), tensor_original.sizes(), - tensor_original.options().device(c10::DeviceType::CPU)); - tensor = &empty_tensor; - if (rank_ == opts.rootRank) { - tensor->copy_(tensor_original); - } - } else if (tensor_original.device().type() != c10::DeviceType::CPU) { - // Copy tensor to CPU tensor first - data = create_buffer(*accl, tensor->numel(), tensor->scalar_type()); - ACCL::debug("Copying data to CPU tensor of size " + - std::to_string(tensor_original.numel())); - empty_tensor = torch::from_blob( - data->byte_array(), tensor_original.sizes(), - tensor_original.options().device(c10::DeviceType::CPU)); - tensor = &empty_tensor; - if (rank_ == opts.rootRank) { - tensor->copy_(tensor_original); - } - } else { - data = create_buffer(*accl, *tensor); - } - } - //check wether this is needed, with hostmem if (!coyote_enabled && rank_ == opts.rootRank) { data->sync_to_device(); } - - ACCL::debug("[Broadcast] Entering barrier"); - accl->barrier(); - ACCL::debug("Starting broadcast of " + std::to_string(tensor->numel()) + " items"); + PRE_REQUEST(Broadcast,tensor) ACCL::ACCLRequest* req = accl->bcast(*data, tensor->numel(), opts.rootRank, ACCL::GLOBAL_COMM, true, true, get_compressed_type(tensor->scalar_type())); - if(coyote_enabled){ - ACCL::debug("Waiting for request to complete."); - accl->wait(req, 1000ms); - } - ACCL::debug("Finished waiting"); - - // ACCL::debug("Returncode: " + std::to_string(retcode)); - // if (retcode) { - // add deconstruction - // TORCH_CHECK(false, ACCL_ERROR(retcode)); - // } - - if (!coyote_enabled && rank_ != opts.rootRank) { - data->sync_from_device(); - } + POST_REQUEST - // Copy results back to GPU if necessary - if (p2p_applicable(*accl, tensor_original, p2p_enabled)) { - copy_back_p2p_buffer(*data, tensor_original); - } else if (coyote_enabled || tensor_original.device().type() != c10::DeviceType::CPU) { - ACCL::debug("Copying data back from CPU tensor of size " + - std::to_string(tensor_original.numel())); - if (rank_ != opts.rootRank) { - tensor_original.copy_(*tensor); - } - } + copy_back_tensor(tensor_original, data, false, true, opts.rootRank); } c10::intrusive_ptr @@ -941,77 +960,30 @@ ProcessGroupACCL::broadcast(std::vector &tensors, void ProcessGroupACCL::run_allreduce(at::Tensor tensor_original, const AllreduceOptions &opts) { - at::Tensor *tensor = &tensor_original; - at::Tensor empty_tensor; - std::unique_ptr data; - std::unique_ptr result; + + STANDARD_DECL + + init_input_tensor(tensor_original, *tensor, data, true, 
true); // Reserve device c10::DeviceGuard guard(tensor->device()); std::unique_lock globalLock(pgGlobalMutex_); - // Copy data from GPU to FPGA if necessary, and create a new result buffer, - // since ACCL doesn't support in-place allreduce - if (p2p_applicable(*accl, tensor_original, p2p_enabled)) { - data = create_and_copy_p2p_buffer(*accl, tensor_original); - result = create_buffer_p2p(*accl, tensor->numel(), tensor->scalar_type()); - } else { - if (accl->is_simulated() || coyote_enabled) { - data = create_buffer(*accl, tensor->numel(), tensor->scalar_type()); - } else { - data = wrap_buffer(*accl, buf0, tensor->numel(), tensor->scalar_type()); - } - ACCL::debug("Copying data to aligned CPU tensor of size " + - std::to_string(tensor_original.numel())); - empty_tensor = torch::from_blob( - data->byte_array(), tensor_original.sizes(), - tensor_original.options().device(c10::DeviceType::CPU)); - tensor = &empty_tensor; - tensor->copy_(tensor_original); - ACCL::debug("Creating extra result buffer of size " + - std::to_string(tensor_original.numel())); - if (accl->is_simulated() || coyote_enabled) { - result = create_buffer(*accl, tensor->numel(), tensor->scalar_type()); - } else { - result = wrap_buffer(*accl, buf1, tensor->numel(), tensor->scalar_type()); - } - } + init_output_tensor(*tensor, tensor_original, dstdata, tensor->numel(), true, true); - // Run allreduce if (!coyote_enabled) { data->sync_to_device(); } - ACCL::debug("[AllReduce] Entering barrier"); - accl->barrier(); - - ACCL::debug("Starting allreduce of " + std::to_string(tensor->numel()) + - " items"); - ACCL::ACCLRequest* req = accl->allreduce(*data, *result, tensor->numel(), acclOp.at(opts.reduceOp), + PRE_REQUEST(Allreduce,tensor) + + ACCL::ACCLRequest* req = accl->allreduce(*data, *dstdata, tensor->numel(), acclOp.at(opts.reduceOp), ACCL::GLOBAL_COMM, true, true, get_compressed_type(tensor->scalar_type())); - if(coyote_enabled){ - ACCL::debug("Waiting for request to complete."); - accl->wait(req, 1000ms); - } - - ACCL::debug("Finished waiting"); + POST_REQUEST - if (!coyote_enabled) { - result->sync_from_device(); - } - - // Copy result buffer back to original tensor - if (p2p_applicable(*accl, tensor_original, p2p_enabled)) { - copy_back_p2p_buffer(*result, tensor_original); - } else { - ACCL::debug("Copying result data back to original tensor of size " + - std::to_string(tensor_original.numel())); - tensor_original.copy_(torch::from_blob( - result->byte_array(), tensor_original.sizes(), - tensor_original.options().device(c10::DeviceType::CPU))); - } + copy_back_tensor(tensor_original, dstdata, true, true); } c10::intrusive_ptr @@ -1048,94 +1020,30 @@ ProcessGroupACCL::allreduce_coalesced(std::vector &tensors, void ProcessGroupACCL::run_reduce(at::Tensor tensor_original, const ReduceOptions &opts) { - at::Tensor *tensor = &tensor_original; - at::Tensor empty_tensor; - std::unique_ptr data; - std::unique_ptr result; + + STANDARD_DECL + // INIT_INPUT_TENSOR(tensor_original, tensor, empty_tensor, data) + init_input_tensor(tensor_original, *tensor, data, true, true); // Reserve device c10::DeviceGuard guard(tensor->device()); std::unique_lock globalLock(pgGlobalMutex_); - // Copy data from GPU to FPGA if necessary, and create a new result buffer, - // since ACCL doesn't support in-place reduce - if (p2p_applicable(*accl, tensor_original, p2p_enabled)) { - data = create_and_copy_p2p_buffer(*accl, tensor_original); - - if (rank_ == opts.rootRank) { - result = create_buffer_p2p(*accl, tensor->numel(), tensor->scalar_type()); - 
} - } else { - if (coyote_enabled) { - // Copy tensor to CPU tensor first - data = create_coyotebuffer(*accl, tensor->numel(), tensor->scalar_type()); - ACCL::debug("Copying data to CPU tensor of size " + - std::to_string(tensor_original.numel())); - empty_tensor = torch::from_blob( - data->byte_array(), tensor_original.sizes(), - tensor_original.options().device(c10::DeviceType::CPU)); - tensor = &empty_tensor; - tensor->copy_(tensor_original); - } else if (tensor_original.device().type() != c10::DeviceType::CPU) { - data = create_buffer(*accl, tensor->numel(), tensor->scalar_type()); - ACCL::debug("Copying data to CPU tensor of size " + - std::to_string(tensor_original.numel())); - empty_tensor = torch::from_blob( - data->byte_array(), tensor_original.sizes(), - tensor_original.options().device(c10::DeviceType::CPU)); - tensor = &empty_tensor; - tensor->copy_(tensor_original); - } else { - data = create_buffer(*accl, *tensor); - } - - if (rank_ == opts.rootRank) { - ACCL::debug("Creating extra result buffer of size " + - std::to_string(tensor_original.numel())); - if (coyote_enabled) { - result = create_coyotebuffer(*accl, tensor->numel(), tensor->scalar_type()); - } else { - result = create_buffer(*accl, tensor->numel(), tensor->scalar_type()); - } - } - } - - // Run reduce + init_output_tensor(*tensor, tensor_original, dstdata, tensor->numel(), true, false, opts.rootRank); + if (!coyote_enabled) { data->sync_to_device(); } - - ACCL::debug("[Reduce] Entering barrier"); - accl->barrier(); - ACCL::debug("Starting reduce of " + std::to_string(tensor->numel()) + - " items"); - ACCL::ACCLRequest* req = accl->reduce(*data, *result, tensor->numel(), opts.rootRank, + PRE_REQUEST(Reduce,tensor) + + ACCL::ACCLRequest* req = accl->reduce(*data, *dstdata, tensor->numel(), opts.rootRank, acclOp.at(opts.reduceOp), ACCL::GLOBAL_COMM, true, true, get_compressed_type(tensor->scalar_type())); - if(coyote_enabled){ - ACCL::debug("Waiting for request to complete."); - accl->wait(req, 1000ms); - } - ACCL::debug("Finished waiting"); + POST_REQUEST - if (!coyote_enabled && rank_ == opts.rootRank) { - result->sync_from_device(); - } - - // Copy result buffer back to original tensor - if (rank_ == opts.rootRank) { - if (p2p_applicable(*accl, tensor_original, p2p_enabled)) { - copy_back_p2p_buffer(*result, tensor_original); - } else { - ACCL::debug("Copying back results to original tensor of size " + - std::to_string(tensor_original.numel())); - tensor_original.copy_(torch::from_blob( - result->byte_array(), tensor_original.sizes(), - tensor_original.options().device(c10::DeviceType::CPU))); - } - } + copy_back_tensor(tensor_original, dstdata, true, false, opts.rootRank); } c10::intrusive_ptr @@ -1225,9 +1133,6 @@ void ProcessGroupACCL::run_allgather( srctensor->scalar_type()); std::vector sizes = {static_cast(srctensor->numel()) * size_}; - dsttensor = torch::from_blob( - dstdata->byte_array(), sizes, - srctensor_original.options().device(c10::DeviceType::CPU)); } // Run allgather diff --git a/integrations/pytorch_ddp/test/test-generic.py b/integrations/pytorch_ddp/test/test-generic.py index faf7de29..37a01b79 100644 --- a/integrations/pytorch_ddp/test/test-generic.py +++ b/integrations/pytorch_ddp/test/test-generic.py @@ -50,7 +50,7 @@ count = 512 #As in test.cpp defaults -rxbufsize = 4096# * 1024 +rxbufsize = 4096 * 1024 def test_broadcast(): @@ -277,6 +277,8 @@ def start_test(comms: str, simulator: bool, host_file: str=None, fpga_file: str= # else: # sys.exit('Design "' + comms + '" currently not supported 
in hardware mode') + # Sometimes ACCL gets stuck on the mpi import statement, so this is to avoid issues: + mpi.Barrier() accl.create_process_group(ranks, design, bufsize=rxbufsize, initialize=True, simulation=simulator) dist.init_process_group("ACCL", rank=rank, world_size=size) @@ -286,20 +288,20 @@ def start_test(comms: str, simulator: bool, host_file: str=None, fpga_file: str= mpi.Barrier() test_broadcast() mpi.Barrier() - test_sendrcv() - mpi.Barrier() - test_scatter() - mpi.Barrier() - test_gather() - mpi.Barrier() - test_allgather() - mpi.Barrier() - test_reduce() - mpi.Barrier() - test_allreduce() - mpi.Barrier() - demo_basic(rank) - mpi.Barrier() + # test_sendrcv() + # mpi.Barrier() + # test_scatter() + # mpi.Barrier() + # test_gather() + # mpi.Barrier() + # test_allgather() + # mpi.Barrier() + # test_reduce() + # mpi.Barrier() + # test_allreduce() + # mpi.Barrier() + # demo_basic(rank) + # mpi.Barrier() # run_training() # mpi.Barrier() # test_alltoall() From 756e34275ad8e4457d0093906dc30752ae01cd09 Mon Sep 17 00:00:00 2001 From: Laurent Wirz Date: Sat, 18 May 2024 22:24:15 +0200 Subject: [PATCH 16/64] Refactored Scatter, Gather and Allgather --- .../pytorch_ddp/include/ProcessGroupACCL.hpp | 28 +- .../pytorch_ddp/src/ProcessGroupACCL.cpp | 546 +++++++----------- integrations/pytorch_ddp/test/test-generic.py | 27 +- 3 files changed, 231 insertions(+), 370 deletions(-) diff --git a/integrations/pytorch_ddp/include/ProcessGroupACCL.hpp b/integrations/pytorch_ddp/include/ProcessGroupACCL.hpp index 97c4e013..ddc97327 100644 --- a/integrations/pytorch_ddp/include/ProcessGroupACCL.hpp +++ b/integrations/pytorch_ddp/include/ProcessGroupACCL.hpp @@ -266,17 +266,17 @@ class TORCH_API ProcessGroupACCL : public ProcessGroup { void run_send(at::Tensor tensor, int dstRank, int tag); void run_recv(at::Tensor tensor, int rcvRank, int tag); - void run_broadcast(at::Tensor tensor, const BroadcastOptions &opts); - void run_allreduce(at::Tensor tensor, const AllreduceOptions &opts); - void run_reduce(at::Tensor tensor, const ReduceOptions &opts); - void run_allgather(at::Tensor srctensor, + void run_broadcast(at::Tensor in_tensor, const BroadcastOptions &opts); + void run_allreduce(at::Tensor in_tensor, const AllreduceOptions &opts); + void run_reduce(at::Tensor in_tensor, const ReduceOptions &opts); + void run_allgather(at::Tensor in_tensor, const std::vector &dsttensors); - void run_gather(at::Tensor srctensor, + void run_gather(at::Tensor in_tensor, const std::vector &dsttensors, const GatherOptions &opts); - void run_scatter(std::vector &srctensors, at::Tensor dsttensor, + void run_scatter(std::vector &in_tensors, at::Tensor dsttensor, const ScatterOptions &opts); - void run_alltoall(at::Tensor srctensor, at::Tensor dsttensor, const AllToAllOptions &opts); + void run_alltoall(at::Tensor in_tensor, at::Tensor dsttensor, const AllToAllOptions &opts); ACCL::dataType get_compressed_type(c10::ScalarType datatype); @@ -292,9 +292,19 @@ class TORCH_API ProcessGroupACCL : public ProcessGroup { // Global states static void initACCLOnce(); static void acclExit(); - void init_input_tensor(at::Tensor &tensor_original, at::Tensor &tensor, std::unique_ptr &data, bool do_on_root, bool do_on_others, int opts_root_rank = 0); - void init_output_tensor(at::Tensor &tensor, at::Tensor &tensor_original, std::unique_ptr &dstdata, int out_tensor_size, bool do_on_root, bool do_on_others, int opts_root_rank = 0); + + void init_input_tensor(at::Tensor &tensor, std::unique_ptr &data, bool do_on_root, bool do_on_others, int 
opts_root_rank = 0); + + void init_input_data_vec(std::vector &tensor_vec, std::unique_ptr &data, const at::TensorOptions &options, bool do_on_root, bool do_on_others, int opts_root_rank = 0); + + void init_output_data(at::Tensor &tensor_original, std::unique_ptr &dstdata, int out_tensor_size, c10::ScalarType type, bool do_on_root, bool do_on_others, int opts_root_rank = 0); + + void init_output_tensor(const at::Tensor &tensor_original, at::Tensor &dsttensor, std::unique_ptr &dstdata, int out_tensor_size, c10::ScalarType type, bool do_on_root, bool do_on_others, int opts_root_rank = 0); + void copy_back_tensor(at::Tensor tensor_original, std::unique_ptr &data, bool do_on_root, bool do_on_others, int opts_root_rank = 0); + + void copy_back_tensorvec(const std::vector &dsttensorvec, std::unique_ptr &data, at::Tensor &dsttensor, int numel, bool do_on_root, bool do_on_others, int opts_root_rank = 0); + static std::once_flag onceFlagInitACCL; static std::mutex pgGlobalMutex_; diff --git a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp index e1205fa9..83ecbe0a 100644 --- a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp +++ b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp @@ -70,23 +70,32 @@ namespace c10d { #define STANDARD_DECL \ - at::Tensor *tensor = &tensor_original; \ std::unique_ptr data; \ std::unique_ptr dstdata; \ #define DO_COND ((do_on_root && opts_root_rank == rank_) || (do_on_others && opts_root_rank != rank_)) + +#define ROOT_RUN(call) \ +if(rank_ == opts.rootRank){ \ + call; \ +} + +#define NON_ROOT_RUN(call) \ +if(rank_ == opts.rootRank){ \ + call; \ +} #define PRE_REQUEST(opname, tensor) \ ACCL::debug("[" #opname "] Entering barrier"); \ accl->barrier(); \ - ACCL::debug("Starting " #opname " of " + std::to_string(tensor->numel()) + " items"); + ACCL::debug("Starting " #opname " of " + std::to_string(tensor.numel()) + " items"); #define POST_REQUEST \ - if(coyote_enabled){ \ - ACCL::debug("Waiting for request to complete."); \ - accl->wait(req, 1000ms); \ - } \ - ACCL::debug("Finished waiting"); +if(coyote_enabled){ \ + ACCL::debug("Waiting for request to complete."); \ + accl->wait(req, 1000ms); \ +} \ +ACCL::debug("Finished waiting"); namespace { @@ -648,58 +657,140 @@ void accl_sa_handler(int) exit(EXIT_FAILURE); } -void ProcessGroupACCL::init_input_tensor(at::Tensor &tensor_original, at::Tensor &tensor, std::unique_ptr &data, bool do_on_root, bool do_on_others, int opts_root_rank) { - // at::Tensor empty_tensor; - if (p2p_applicable(*accl, tensor_original, p2p_enabled)) { - data = create_and_copy_p2p_buffer(*accl, tensor_original); +void ProcessGroupACCL::init_input_tensor(at::Tensor &tensor, std::unique_ptr &data, bool do_on_root, bool do_on_others, int opts_root_rank) { + if DO_COND { + if (p2p_applicable(*accl, tensor, p2p_enabled)) { + data = create_and_copy_p2p_buffer(*accl, tensor); + } else { + if (coyote_enabled) { + data = create_coyotebuffer(*accl, tensor.numel(), tensor.scalar_type()); + } else if (tensor.device().type() != c10::DeviceType::CPU) { + data = create_buffer(*accl, tensor.numel(), tensor.scalar_type()); + } else { + data = create_buffer(*accl, tensor); + } + if (coyote_enabled || tensor.device().type() != c10::DeviceType::CPU){ + ACCL::debug("Copying data to CPU tensor of size " + std::to_string(tensor.numel())); + at::Tensor wrapper_tensor = torch::from_blob(data->byte_array(), tensor.sizes(), tensor.options().device(c10::DeviceType::CPU)); + wrapper_tensor.copy_(tensor); + } + } + // don't 
sync if no rank initializes, we will fill content and sync later + if (!coyote_enabled && (do_on_root || do_on_others)) { + data->sync_to_device(); + } } else { - if (coyote_enabled) { - data = create_coyotebuffer(*accl, tensor.numel(), tensor.scalar_type()); - } else if (tensor_original.device().type() != c10::DeviceType::CPU) { - data = create_buffer(*accl, tensor.numel(), tensor.scalar_type()); + data = std::unique_ptr>(nullptr); + } +} + + void ProcessGroupACCL::init_input_data_vec(std::vector &tensor_vec, std::unique_ptr &data, const at::TensorOptions &options, bool do_on_root, bool do_on_others, int opts_root_rank) { + if DO_COND { + int64_t tens_size = static_cast(tensor_vec[0].numel()); + int64_t total_size = tens_size * static_cast(size_); + std::vector sizes = {total_size}; + + if (p2p_applicable(*accl, tensor_vec[0], p2p_enabled)) { + data = create_buffer_p2p( *accl, total_size, tensor_vec[0].scalar_type()); + } else if (coyote_enabled) { + data = create_coyotebuffer(*accl, total_size, tensor_vec[0].scalar_type()); } else { - data = create_buffer(*accl, tensor); + data = create_buffer(*accl, total_size, tensor_vec[0].scalar_type()); + } + ACCL::debug("Copying data to CPU tensor of size " + std::to_string(total_size)); + at::Tensor wrapper_tensor = torch::from_blob(data->byte_array(), sizes, options); + for (const auto i : c10::irange(tensor_vec.size())) { + if (p2p_applicable(*accl, tensor_vec[0], p2p_enabled)) { + auto slice = data->slice(i * tens_size, (i + 1) * tens_size); + copy_to_p2p_buffer(*slice, tensor_vec[i]); + } else { + auto slice = wrapper_tensor.slice(0, i * tens_size, (i + 1) * tens_size); + slice.copy_(tensor_vec[i]); + } + } + if (!coyote_enabled) { + data->sync_to_device(); } - if (coyote_enabled || tensor_original.device().type() != c10::DeviceType::CPU){ - ACCL::debug("Copying data to CPU tensor of size " + std::to_string(tensor_original.numel())); - tensor = torch::from_blob(data->byte_array(), tensor_original.sizes(), - tensor_original.options().device(c10::DeviceType::CPU)); - if DO_COND { - tensor.copy_(tensor_original); + // } else { + // data = std::unique_ptr>(nullptr); + } +} + + // like init_output_tensor but without needlessly setting the tensor +void ProcessGroupACCL::init_output_data(at::Tensor &tensor_original, std::unique_ptr &dstdata, int out_tensor_size, c10::ScalarType type, bool do_on_root, bool do_on_others, int opts_root_rank) { + if DO_COND { + if (p2p_applicable(*accl, tensor_original, p2p_enabled)) { + dstdata = create_buffer_p2p(*accl, out_tensor_size, type); + } else { + if (coyote_enabled) { + dstdata = create_coyotebuffer(*accl, out_tensor_size, type); + } else { + dstdata = create_buffer(*accl, out_tensor_size, type); } - } + } + } else { + dstdata = std::unique_ptr>(nullptr); } } - - void ProcessGroupACCL::init_output_tensor(at::Tensor &tensor, at::Tensor &tensor_original, std::unique_ptr &dstdata, int out_tensor_size, bool do_on_root, bool do_on_others, int opts_root_rank) { + +void ProcessGroupACCL::init_output_tensor(const at::Tensor &tensor_original, at::Tensor &dsttensor, std::unique_ptr &dstdata, int out_tensor_size, c10::ScalarType type, bool do_on_root, bool do_on_others, int opts_root_rank) { if DO_COND { if (p2p_applicable(*accl, tensor_original, p2p_enabled)) { - dstdata = create_buffer_p2p(*accl, tensor.numel(), tensor.scalar_type()); + dstdata = create_buffer_p2p(*accl, out_tensor_size, type); } else { if (coyote_enabled) { - dstdata = create_coyotebuffer(*accl, tensor.numel(), tensor.scalar_type()); + dstdata = 
create_coyotebuffer(*accl, out_tensor_size, type); + std::vector sizes = {static_cast(out_tensor_size)}; + dsttensor = torch::from_blob(dstdata->byte_array(), sizes, tensor_original.options().device(c10::DeviceType::CPU)); + // This should not be necessary: + // dsttensor.copy_(tensor_original); } else { - dstdata = create_buffer(*accl, tensor.numel(), tensor.scalar_type()); + dstdata = create_buffer(*accl, out_tensor_size, type); + std::vector sizes = {static_cast(out_tensor_size)}; + dsttensor = torch::from_blob(dstdata->byte_array(), sizes, tensor_original.options().device(c10::DeviceType::CPU)); + // This should not be necessary: + // dsttensor.copy_(tensor_original); } } + } else { + dstdata = std::unique_ptr>(nullptr); + dsttensor = at::Tensor(nullptr); + } +} + +void ProcessGroupACCL::copy_back_tensor(at::Tensor tensor_original, std::unique_ptr &data, bool do_on_root, bool do_on_others, int opts_root_rank){ + if DO_COND { + if (!coyote_enabled) { + data->sync_from_device(); + } + if (p2p_applicable(*accl, tensor_original, p2p_enabled)) { + copy_back_p2p_buffer(*data, tensor_original); + } else { + ACCL::debug("Copying data back from CPU tensor of size " + + std::to_string(tensor_original.numel())); + if DO_COND { + tensor_original.copy_(torch::from_blob(data->byte_array(), tensor_original.sizes(), tensor_original.options().device(c10::DeviceType::CPU))); + } } } +} - void ProcessGroupACCL::copy_back_tensor(at::Tensor tensor_original, std::unique_ptr &data, bool do_on_root, bool do_on_others, int opts_root_rank){ - if (!coyote_enabled && DO_COND) { - data->sync_from_device(); - } - if (p2p_applicable(*accl, tensor_original, p2p_enabled)) { - copy_back_p2p_buffer(*data, tensor_original); - } else { - ACCL::debug("Copying data back from CPU tensor of size " + - std::to_string(tensor_original.numel())); - if DO_COND { - tensor_original.copy_(torch::from_blob(data->byte_array(), tensor_original.sizes(), tensor_original.options().device(c10::DeviceType::CPU))); + void ProcessGroupACCL::copy_back_tensorvec(const std::vector &dsttensorvec, std::unique_ptr &data, at::Tensor &dsttensor, int numel, bool do_on_root, bool do_on_others, int opts_root_rank){ + if DO_COND { + if (!coyote_enabled) { + data->sync_from_device(); + } + for (const auto i : c10::irange(dsttensorvec.size())) { + if (p2p_applicable(*accl, dsttensorvec[0], p2p_enabled)) { + auto slice = + data->slice(i * numel, (i + 1) * numel); + copy_back_p2p_buffer(*slice, dsttensorvec[i]); + } else { + dsttensorvec[i].copy_(dsttensor.slice(0, i * numel, (i + 1) * numel)); + } } } - } - +} // Initialize ACCL @@ -905,30 +996,24 @@ c10::intrusive_ptr ProcessGroupACCL::enqueue( return work; } -void ProcessGroupACCL::run_broadcast(at::Tensor tensor_original, +void ProcessGroupACCL::run_broadcast(at::Tensor in_tensor, const BroadcastOptions &opts) { STANDARD_DECL - init_input_tensor(tensor_original, *tensor, data, true, false, opts.rootRank); + init_input_tensor(in_tensor, data, true, true, opts.rootRank); // Reserve device - c10::DeviceGuard guard(tensor->device()); + c10::DeviceGuard guard(in_tensor.device()); std::unique_lock globalLock(pgGlobalMutex_); - //check wether this is needed, with hostmem - if (!coyote_enabled && rank_ == opts.rootRank) { - data->sync_to_device(); - } - - PRE_REQUEST(Broadcast,tensor) + PRE_REQUEST(Broadcast,in_tensor) - ACCL::ACCLRequest* req = accl->bcast(*data, tensor->numel(), opts.rootRank, ACCL::GLOBAL_COMM, true, - true, get_compressed_type(tensor->scalar_type())); + ACCL::ACCLRequest* req = 
accl->bcast(*data, in_tensor.numel(), opts.rootRank, ACCL::GLOBAL_COMM, true, true, get_compressed_type(in_tensor.scalar_type())); POST_REQUEST - copy_back_tensor(tensor_original, data, false, true, opts.rootRank); + copy_back_tensor(in_tensor, data, true, true, opts.rootRank); } c10::intrusive_ptr @@ -958,32 +1043,26 @@ ProcessGroupACCL::broadcast(std::vector &tensors, c10::optional>(tensors)); } -void ProcessGroupACCL::run_allreduce(at::Tensor tensor_original, +void ProcessGroupACCL::run_allreduce(at::Tensor in_tensor, const AllreduceOptions &opts) { STANDARD_DECL - init_input_tensor(tensor_original, *tensor, data, true, true); + init_input_tensor(in_tensor, data, true, true); // Reserve device - c10::DeviceGuard guard(tensor->device()); + c10::DeviceGuard guard(in_tensor.device()); std::unique_lock globalLock(pgGlobalMutex_); - init_output_tensor(*tensor, tensor_original, dstdata, tensor->numel(), true, true); - - if (!coyote_enabled) { - data->sync_to_device(); - } + init_output_data(in_tensor, dstdata, in_tensor.numel(), in_tensor.scalar_type(), true, true); - PRE_REQUEST(Allreduce,tensor) + PRE_REQUEST(Allreduce,in_tensor) - ACCL::ACCLRequest* req = accl->allreduce(*data, *dstdata, tensor->numel(), acclOp.at(opts.reduceOp), - ACCL::GLOBAL_COMM, true, true, - get_compressed_type(tensor->scalar_type())); + ACCL::ACCLRequest* req = accl->allreduce(*data, *dstdata, in_tensor.numel(), acclOp.at(opts.reduceOp), ACCL::GLOBAL_COMM, true, true, get_compressed_type(in_tensor.scalar_type())); POST_REQUEST - copy_back_tensor(tensor_original, dstdata, true, true); + copy_back_tensor(in_tensor, dstdata, true, true); } c10::intrusive_ptr @@ -1018,32 +1097,25 @@ ProcessGroupACCL::allreduce_coalesced(std::vector &tensors, "allreduce_coalesced is currently not supported with ACCL"); } -void ProcessGroupACCL::run_reduce(at::Tensor tensor_original, +void ProcessGroupACCL::run_reduce(at::Tensor in_tensor, const ReduceOptions &opts) { STANDARD_DECL - // INIT_INPUT_TENSOR(tensor_original, tensor, empty_tensor, data) - init_input_tensor(tensor_original, *tensor, data, true, true); + init_input_tensor(in_tensor, data, true, true); // Reserve device - c10::DeviceGuard guard(tensor->device()); + c10::DeviceGuard guard(in_tensor.device()); std::unique_lock globalLock(pgGlobalMutex_); - init_output_tensor(*tensor, tensor_original, dstdata, tensor->numel(), true, false, opts.rootRank); + init_output_data(in_tensor, dstdata, in_tensor.numel(), in_tensor.scalar_type(), true, false, opts.rootRank); - if (!coyote_enabled) { - data->sync_to_device(); - } - - PRE_REQUEST(Reduce,tensor) + PRE_REQUEST(Reduce,in_tensor) - ACCL::ACCLRequest* req = accl->reduce(*data, *dstdata, tensor->numel(), opts.rootRank, - acclOp.at(opts.reduceOp), ACCL::GLOBAL_COMM, true, true, - get_compressed_type(tensor->scalar_type())); + ACCL::ACCLRequest* req = accl->reduce(*data, *dstdata, in_tensor.numel(), opts.rootRank, acclOp.at(opts.reduceOp), ACCL::GLOBAL_COMM, true, true, get_compressed_type(in_tensor.scalar_type())); POST_REQUEST - copy_back_tensor(tensor_original, dstdata, true, false, opts.rootRank); + copy_back_tensor(in_tensor, dstdata, true, false, opts.rootRank); } c10::intrusive_ptr @@ -1072,103 +1144,29 @@ ProcessGroupACCL::reduce(std::vector &tensors, } void ProcessGroupACCL::run_allgather( - at::Tensor srctensor_original, + at::Tensor in_tensor, const std::vector &dsttensorvec) { - at::Tensor *srctensor = &srctensor_original; at::Tensor empty_srctensor; std::unique_ptr srcdata; at::Tensor dsttensor; std::unique_ptr dstdata; + 
init_input_tensor(in_tensor, srcdata, true, true); // Reserve device - c10::DeviceGuard guard(srctensor->device()); + c10::DeviceGuard guard(in_tensor.device()); std::unique_lock globalLock(pgGlobalMutex_); - // Copy data from GPU to FPGA if necessary - if (p2p_applicable(*accl, srctensor_original, p2p_enabled)) { - srcdata = create_and_copy_p2p_buffer(*accl, srctensor_original); - } else { - if (coyote_enabled) { - // Copy tensor to CPU tensor first - srcdata = create_coyotebuffer(*accl, srctensor->numel(), srctensor->scalar_type()); - ACCL::debug("Copying data to CPU tensor of size " + - std::to_string(srctensor_original.numel())); - empty_srctensor = torch::from_blob( - srcdata->byte_array(), srctensor_original.sizes(), - srctensor_original.options().device(c10::DeviceType::CPU)); - srctensor = &empty_srctensor; - srctensor->copy_(srctensor_original); - } else if (srctensor_original.device().type() != c10::DeviceType::CPU) { - srcdata = - create_buffer(*accl, srctensor->numel(), srctensor->scalar_type()); - ACCL::debug("Copying data to CPU tensor of size " + - std::to_string(srctensor_original.numel())); - empty_srctensor = torch::from_blob( - srcdata->byte_array(), srctensor_original.sizes(), - srctensor_original.options().device(c10::DeviceType::CPU)); - srctensor = &empty_srctensor; - srctensor->copy_(srctensor_original); - } else { - srcdata = create_buffer(*accl, *srctensor); - } - } - - // Create new output tensor, since dsttensorvec is not continuous in memory - if (p2p_applicable(*accl, dsttensorvec[0], p2p_enabled)) { - dstdata = create_buffer_p2p(*accl, - srctensor->numel() * static_cast(size_), - srctensor->scalar_type()); - } else if (coyote_enabled) { - dstdata = - create_coyotebuffer(*accl, srctensor->numel() * static_cast(size_), - srctensor->scalar_type()); - std::vector sizes = {static_cast(srctensor->numel()) * - size_}; - dsttensor = torch::from_blob( - dstdata->byte_array(), sizes, - srctensor_original.options().device(c10::DeviceType::CPU)); - } else { - dstdata = - create_buffer(*accl, srctensor->numel() * static_cast(size_), - srctensor->scalar_type()); - std::vector sizes = {static_cast(srctensor->numel()) * - size_}; - } - - // Run allgather - if (!coyote_enabled) { - srcdata->sync_to_device(); - } + init_output_tensor(dsttensorvec[0], dsttensor, dstdata, in_tensor.numel() * static_cast(size_), in_tensor.scalar_type(), true, true); + + PRE_REQUEST(Allgather,in_tensor) - ACCL::debug("[Allgather] Entering barrier"); - accl->barrier(); + ACCL::ACCLRequest* req = accl->allgather(*srcdata, *dstdata, in_tensor.numel(), ACCL::GLOBAL_COMM, + true, true, get_compressed_type(in_tensor.scalar_type())); - ACCL::debug("Starting allgather of " + std::to_string(srctensor->numel()) + - " items"); - ACCL::ACCLRequest* req = accl->allgather(*srcdata, *dstdata, srctensor->numel(), ACCL::GLOBAL_COMM, - true, true, get_compressed_type(srctensor->scalar_type())); - - if(coyote_enabled){ - ACCL::debug("Waiting for request to complete."); - accl->wait(req, 1000ms); - } - ACCL::debug("Finished waiting"); - - if (!coyote_enabled) { - dstdata->sync_from_device(); - } + POST_REQUEST - // Copy results back to dsttensorvec - for (const auto i : c10::irange(dsttensorvec.size())) { - if (p2p_applicable(*accl, dsttensorvec[0], p2p_enabled)) { - auto slice = - dstdata->slice(i * srctensor->numel(), (i + 1) * srctensor->numel()); - copy_back_p2p_buffer(*slice, dsttensorvec[i]); - } else { - dsttensorvec[i].copy_(dsttensor.slice(0, i * srctensor->numel(), - (i + 1) * srctensor->numel())); - } - } 
+ copy_back_tensorvec(dsttensorvec, dstdata, dsttensor, in_tensor.numel(), true, true); + } c10::intrusive_ptr @@ -1221,112 +1219,31 @@ c10::intrusive_ptr ProcessGroupACCL::allgather_coalesced( TORCH_CHECK(false, "ProcessGroupACCL does not support allgather_coalesced"); } -void ProcessGroupACCL::run_gather(at::Tensor srctensor_original, +void ProcessGroupACCL::run_gather(at::Tensor in_tensor, const std::vector &dsttensorvec, const GatherOptions &opts) { - at::Tensor *srctensor = &srctensor_original; at::Tensor empty_srctensor; std::unique_ptr srcdata; at::Tensor dsttensor; std::unique_ptr dstdata; + init_input_tensor(in_tensor, srcdata, true, true); // Reserve device - c10::DeviceGuard guard(srctensor->device()); + c10::DeviceGuard guard(in_tensor.device()); std::unique_lock globalLock(pgGlobalMutex_); - // Copy data from GPU to FPGA if necessary - if (p2p_applicable(*accl, srctensor_original, p2p_enabled)) { - srcdata = create_and_copy_p2p_buffer(*accl, srctensor_original); - } else { - if (coyote_enabled) { - srcdata = - create_coyotebuffer(*accl, srctensor->numel(), srctensor->scalar_type()); - ACCL::debug("Copying data to CPU tensor of size " + - std::to_string(srctensor_original.numel())); - empty_srctensor = torch::from_blob( - srcdata->byte_array(), srctensor_original.sizes(), - srctensor_original.options().device(c10::DeviceType::CPU)); - srctensor = &empty_srctensor; - srctensor->copy_(srctensor_original); - } else if (srctensor_original.device().type() != c10::DeviceType::CPU) { - srcdata = - create_buffer(*accl, srctensor->numel(), srctensor->scalar_type()); - ACCL::debug("Copying data to CPU tensor of size " + - std::to_string(srctensor_original.numel())); - empty_srctensor = torch::from_blob( - srcdata->byte_array(), srctensor_original.sizes(), - srctensor_original.options().device(c10::DeviceType::CPU)); - srctensor = &empty_srctensor; - srctensor->copy_(srctensor_original); - } else { - srcdata = create_buffer(*accl, *srctensor); - } - } + init_output_tensor(dsttensorvec[0], dsttensor, dstdata, in_tensor.numel() * static_cast(size_), in_tensor.scalar_type(), true, false, opts.rootRank); - // Create new output tensor, since dsttensorvec is not continuous in memory - if (rank_ == opts.rootRank) { - if (p2p_applicable(*accl, dsttensorvec[0], p2p_enabled)) { - dstdata = create_buffer_p2p( - *accl, srctensor->numel() * static_cast(size_), - srctensor->scalar_type()); - } else if (coyote_enabled) { - dstdata = - create_coyotebuffer(*accl, srctensor->numel() * static_cast(size_), - srctensor->scalar_type()); - std::vector sizes = {static_cast(srctensor->numel()) * - size_}; - dsttensor = - torch::from_blob(dstdata->byte_array(), sizes, - srctensor->options().device(c10::DeviceType::CPU)); - } else { - dstdata = - create_buffer(*accl, srctensor->numel() * static_cast(size_), - srctensor->scalar_type()); - std::vector sizes = {static_cast(srctensor->numel()) * - size_}; - dsttensor = - torch::from_blob(dstdata->byte_array(), sizes, - srctensor->options().device(c10::DeviceType::CPU)); - } - } + PRE_REQUEST(Gather, in_tensor) - // Run gather - if (!coyote_enabled) { - srcdata->sync_to_device(); - } - - ACCL::debug("[Gather] Entering barrier"); - accl->barrier(); - - ACCL::debug("Starting gather of " + std::to_string(srctensor->numel()) + - " items"); - ACCL::ACCLRequest* req = accl->gather(*srcdata, *dstdata, srctensor->numel(), opts.rootRank, + ACCL::ACCLRequest* req = accl->gather(*srcdata, *dstdata, in_tensor.numel(), opts.rootRank, ACCL::GLOBAL_COMM, true, true, - 
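Note: copy_back_tensorvec undoes the packing step. The collective writes every rank's result into one flat staging tensor, and each per-rank slice is then copied out into the non-contiguous destination list. The same step sketched in Python (names are illustrative):

    import torch

    def copy_back_vec(dst_list, staging, numel):
        # slice i of the flat staging tensor holds rank i's contribution
        for i, dst in enumerate(dst_list):
            dst.copy_(staging[i * numel:(i + 1) * numel].view_as(dst))

    staging = torch.arange(8, dtype=torch.float32)   # 2 ranks x 4 elements
    outs = [torch.empty(4), torch.empty(4)]
    copy_back_vec(outs, staging, 4)
    assert outs[1].tolist() == [4.0, 5.0, 6.0, 7.0]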
get_compressed_type(srctensor->scalar_type())); + get_compressed_type(in_tensor.scalar_type())); - if(coyote_enabled){ - ACCL::debug("Waiting for request to complete."); - accl->wait(req, 1000ms); - } - ACCL::debug("Finished waiting"); - - if (!coyote_enabled && rank_ == opts.rootRank) { - dstdata->sync_from_device(); - } + POST_REQUEST - // Copy results back to dsttensorvec - if (rank_ == opts.rootRank) { - for (const auto i : c10::irange(dsttensorvec.size())) { - if (p2p_applicable(*accl, dsttensorvec[0], p2p_enabled)) { - auto slice = dstdata->slice(i * srctensor->numel(), - (i + 1) * srctensor->numel()); - copy_back_p2p_buffer(*slice, dsttensorvec[i]); - } else { - dsttensorvec[i].copy_(dsttensor.slice(0, i * srctensor->numel(), - (i + 1) * srctensor->numel())); - } - } - } + copy_back_tensorvec(dsttensorvec, dstdata, dsttensor, in_tensor.numel(), true, false, opts.rootRank); + } c10::intrusive_ptr @@ -1360,8 +1277,7 @@ ProcessGroupACCL::gather(std::vector> &outputTensors, size_t n = bufsize / srctensor.itemsize(); for (size_t i = 0; i < srctensor.numel(); i += n) { size_t end = - std::min(i + n, static_cast(srctensor.numel())); - std::vector dsttensorslices; + std::min(i + n, static_cast(srctensor.numel())); std::vector dsttensorslices; dsttensorslices.reserve(dsttensors.size()); for (auto &dsttensor : dsttensors) { dsttensorslices.emplace_back(dsttensor.slice(0, i, end)); @@ -1386,102 +1302,33 @@ ProcessGroupACCL::gather(std::vector> &outputTensors, } } -void ProcessGroupACCL::run_scatter(std::vector &srctensorvec, - at::Tensor dsttensor_original, +void ProcessGroupACCL::run_scatter(std::vector &in_tensor_vec, + at::Tensor out_tensor, const ScatterOptions &opts) { - std::unique_ptr srcdata; - at::Tensor *dsttensor = &dsttensor_original; - at::Tensor empty_dsttensor; - std::unique_ptr dstdata; + std::unique_ptr in_data; + std::unique_ptr out_data; + at::Tensor dsttensor; // Reserve device - c10::DeviceGuard guard(dsttensor->device()); + c10::DeviceGuard guard(out_tensor.device()); std::unique_lock globalLock(pgGlobalMutex_); - // Create new input buffer, since srctensorvec is not continuous in memory - if (rank_ == opts.rootRank) { - at::Tensor srctensor; - if (p2p_applicable(*accl, srctensorvec[0], p2p_enabled)) { - srcdata = create_buffer_p2p( - *accl, dsttensor->numel() * static_cast(size_), - dsttensor->scalar_type()); - } else if (coyote_enabled) { - srcdata = create_coyotebuffer(*accl, - dsttensor->numel() * static_cast(size_), - dsttensor->scalar_type()); - std::vector sizes = {static_cast(dsttensor->numel()) * - size_}; - srctensor = - torch::from_blob(srcdata->byte_array(), sizes, - dsttensor->options().device(c10::DeviceType::CPU)); - } else { - srcdata = create_buffer(*accl, - dsttensor->numel() * static_cast(size_), - dsttensor->scalar_type()); - std::vector sizes = {static_cast(dsttensor->numel()) * - size_}; - srctensor = - torch::from_blob(srcdata->byte_array(), sizes, - dsttensor->options().device(c10::DeviceType::CPU)); - } + init_input_data_vec(in_tensor_vec, in_data, out_tensor.options().device(c10::DeviceType::CPU), true, false, opts.rootRank); - // Copy data to input buffer - for (const auto i : c10::irange(srctensorvec.size())) { - if (p2p_applicable(*accl, srctensorvec[0], p2p_enabled)) { - auto slice = srcdata->slice(i * dsttensor->numel(), - (i + 1) * dsttensor->numel()); - copy_to_p2p_buffer(*slice, srctensorvec[i]); - } else { - auto slice = srctensor.slice(0, i * dsttensor->numel(), - (i + 1) * dsttensor->numel()); - slice.copy_(srctensorvec[i]); - } - } - 
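Note: the gather() wrapper above falls back to slicing when a tensor does not fit in one rx buffer: n = bufsize / itemsize elements go out per call, and the destination tensors are sliced to match. The ranges it iterates can be sketched as:

    def chunk_ranges(numel, bufsize, itemsize):
        """Slice [0, numel) into pieces of at most bufsize // itemsize
        elements, mirroring the loop in ProcessGroupACCL::gather."""
        n = bufsize // itemsize
        return [(i, min(i + n, numel)) for i in range(0, numel, n)]

    assert chunk_ranges(10, 16, 4) == [(0, 4), (4, 8), (8, 10)]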
} - - // Create output buffer - if (p2p_applicable(*accl, dsttensor_original, p2p_enabled)) { - dstdata = create_and_copy_p2p_buffer(*accl, dsttensor_original); - } else { - if (coyote_enabled) { - dstdata = - create_coyotebuffer(*accl, dsttensor->numel(), dsttensor->scalar_type()); - ACCL::debug("Copying data to CPU tensor of size " + - std::to_string(dsttensor_original.numel())); - empty_dsttensor = torch::from_blob( - dstdata->byte_array(), dsttensor_original.sizes(), - dsttensor_original.options().device(c10::DeviceType::CPU)); - dsttensor = &empty_dsttensor; - dsttensor->copy_(dsttensor_original); - } else if (dsttensor_original.device().type() != c10::DeviceType::CPU) { - dstdata = - create_buffer(*accl, dsttensor->numel(), dsttensor->scalar_type()); - ACCL::debug("Copying data to CPU tensor of size " + - std::to_string(dsttensor_original.numel())); - empty_dsttensor = torch::from_blob( - dstdata->byte_array(), dsttensor_original.sizes(), - dsttensor_original.options().device(c10::DeviceType::CPU)); - dsttensor = &empty_dsttensor; - dsttensor->copy_(dsttensor_original); - } else { - dstdata = create_buffer(*accl, *dsttensor); - } - } - - if (!coyote_enabled && rank_ == opts.rootRank) { - srcdata->sync_to_device(); - } + ACCL::debug("[Scatter] mid"); + + init_output_tensor(out_tensor, dsttensor, out_data, out_tensor.numel(), out_tensor.scalar_type(), true, true, opts.rootRank); + ACCL::debug(std::to_string(rank_)); + ACCL::debug("[Scatter] Entering barrier"); accl->barrier(); - ACCL::debug("Starting scatter of " + std::to_string(dsttensor->numel()) + + ACCL::debug("Starting scatter of " + std::to_string(out_tensor.numel()) + " items"); // Run scatter - ACCL::ACCLRequest* req = accl->scatter(*srcdata, *dstdata, dsttensor->numel(), opts.rootRank, - ACCL::GLOBAL_COMM, true, true, - get_compressed_type(dsttensor->scalar_type())); + ACCL::ACCLRequest* req = accl->scatter(*in_data, *out_data, out_tensor.numel(), opts.rootRank, ACCL::GLOBAL_COMM, true, true, get_compressed_type(dsttensor.scalar_type())); if(coyote_enabled){ ACCL::debug("Waiting for request to complete."); @@ -1491,19 +1338,20 @@ void ProcessGroupACCL::run_scatter(std::vector &srctensorvec, ACCL::debug("Finished wait"); if (!coyote_enabled) { - dstdata->sync_from_device(); + out_data->sync_from_device(); } - - + // TODO delete + accl->barrier(); // Copy result back to GPU if necessary - if (p2p_applicable(*accl, dsttensor_original, p2p_enabled)) { - copy_back_p2p_buffer(*dstdata, dsttensor_original); - } else if (coyote_enabled || dsttensor_original.device().type() != c10::DeviceType::CPU) { + if (p2p_applicable(*accl, out_tensor, p2p_enabled)) { + copy_back_p2p_buffer(*out_data, in_tensor_vec[0]); + } else { ACCL::debug("Copying data back from CPU tensor of size " + - std::to_string(dsttensor_original.numel())); - dsttensor_original.copy_(*dsttensor); + std::to_string(out_tensor.numel())); + out_tensor.copy_(dsttensor); } + ACCL::debug("Rank: " + std::to_string(rank_)); } c10::intrusive_ptr diff --git a/integrations/pytorch_ddp/test/test-generic.py b/integrations/pytorch_ddp/test/test-generic.py index 37a01b79..bc98858d 100644 --- a/integrations/pytorch_ddp/test/test-generic.py +++ b/integrations/pytorch_ddp/test/test-generic.py @@ -48,7 +48,7 @@ rank = 0 size = 0 -count = 512 +count = 16 #As in test.cpp defaults rxbufsize = 4096 * 1024 @@ -92,10 +92,13 @@ def test_scatter(): x = [torch.full((count,), float(i)) for i in range(size)] else: x = None - y = torch.empty(count) + y = torch.full((count,), float(0)) dist.scatter(y, 
x, 0) + print(y) + print(rank) + np.testing.assert_allclose(y, torch.full((count,), float(rank))) print("Test scatter finished!") @@ -290,16 +293,16 @@ def start_test(comms: str, simulator: bool, host_file: str=None, fpga_file: str= mpi.Barrier() # test_sendrcv() # mpi.Barrier() - # test_scatter() - # mpi.Barrier() - # test_gather() - # mpi.Barrier() - # test_allgather() - # mpi.Barrier() - # test_reduce() - # mpi.Barrier() - # test_allreduce() - # mpi.Barrier() + test_scatter() + mpi.Barrier() + test_gather() + mpi.Barrier() + test_allgather() + mpi.Barrier() + test_reduce() + mpi.Barrier() + test_allreduce() + mpi.Barrier() # demo_basic(rank) # mpi.Barrier() # run_training() From c7cbb7e5f5af9bada40b94d8d3c68a04bf137c7a Mon Sep 17 00:00:00 2001 From: Laurent Wirz Date: Sun, 19 May 2024 12:43:55 +0200 Subject: [PATCH 17/64] Refactored rest(send, recv, alltoall) --- .../pytorch_ddp/src/ProcessGroupACCL.cpp | 283 +++--------------- integrations/pytorch_ddp/test/test-generic.py | 33 +- 2 files changed, 62 insertions(+), 254 deletions(-) diff --git a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp index 83ecbe0a..166e550e 100644 --- a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp +++ b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp @@ -75,16 +75,6 @@ namespace c10d { #define DO_COND ((do_on_root && opts_root_rank == rank_) || (do_on_others && opts_root_rank != rank_)) -#define ROOT_RUN(call) \ -if(rank_ == opts.rootRank){ \ - call; \ -} - -#define NON_ROOT_RUN(call) \ -if(rank_ == opts.rootRank){ \ - call; \ -} - #define PRE_REQUEST(opname, tensor) \ ACCL::debug("[" #opname "] Entering barrier"); \ accl->barrier(); \ @@ -318,6 +308,7 @@ std::unique_ptr create_coyotebuffer(ACCL::ACCL &accl, size_t l } } + //TODO delete // Create an ACCL Buffer with correct type std::unique_ptr wrap_buffer(ACCL::ACCL &accl, xrt::bo &bo, size_t length, @@ -768,9 +759,7 @@ void ProcessGroupACCL::copy_back_tensor(at::Tensor tensor_original, std::unique_ } else { ACCL::debug("Copying data back from CPU tensor of size " + std::to_string(tensor_original.numel())); - if DO_COND { - tensor_original.copy_(torch::from_blob(data->byte_array(), tensor_original.sizes(), tensor_original.options().device(c10::DeviceType::CPU))); - } + tensor_original.copy_(torch::from_blob(data->byte_array(), tensor_original.sizes(), tensor_original.options().device(c10::DeviceType::CPU))); } } } @@ -1000,7 +989,8 @@ void ProcessGroupACCL::run_broadcast(at::Tensor in_tensor, const BroadcastOptions &opts) { STANDARD_DECL - + + //Should be split to output on non-root sometime init_input_tensor(in_tensor, data, true, true, opts.rootRank); // Reserve device @@ -1315,43 +1305,17 @@ void ProcessGroupACCL::run_scatter(std::vector &in_tensor_vec, init_input_data_vec(in_tensor_vec, in_data, out_tensor.options().device(c10::DeviceType::CPU), true, false, opts.rootRank); - ACCL::debug("[Scatter] mid"); init_output_tensor(out_tensor, dsttensor, out_data, out_tensor.numel(), out_tensor.scalar_type(), true, true, opts.rootRank); - ACCL::debug(std::to_string(rank_)); + PRE_REQUEST(Scatter, dsttensor) - ACCL::debug("[Scatter] Entering barrier"); - accl->barrier(); - - - ACCL::debug("Starting scatter of " + std::to_string(out_tensor.numel()) + - " items"); // Run scatter ACCL::ACCLRequest* req = accl->scatter(*in_data, *out_data, out_tensor.numel(), opts.rootRank, ACCL::GLOBAL_COMM, true, true, get_compressed_type(dsttensor.scalar_type())); - if(coyote_enabled){ - ACCL::debug("Waiting for request to 
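Note: this patch replaces the hand-rolled barrier/debug/wait sequences around every collective with the PRE_REQUEST and POST_REQUEST macros. The shape of that bracketing, sketched in Python with hypothetical callables:

    def run_collective(name, barrier, issue, wait, timeout_s=1.0):
        print(f"[{name}] entering barrier")
        barrier()                  # PRE_REQUEST: align all ranks first
        request = issue()          # e.g. the accl->bcast(...) call
        wait(request, timeout_s)   # POST_REQUEST: bounded completion wait
        return request

Keeping the sequence in one place avoids the per-collective drift visible in the removed code, where some paths waited on the request and others did not.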
complete."); - accl->wait(req, 1000ms); - } - - ACCL::debug("Finished wait"); - - if (!coyote_enabled) { - out_data->sync_from_device(); - } + POST_REQUEST - // TODO delete - accl->barrier(); - // Copy result back to GPU if necessary - if (p2p_applicable(*accl, out_tensor, p2p_enabled)) { - copy_back_p2p_buffer(*out_data, in_tensor_vec[0]); - } else { - ACCL::debug("Copying data back from CPU tensor of size " + - std::to_string(out_tensor.numel())); - out_tensor.copy_(dsttensor); - } - ACCL::debug("Rank: " + std::to_string(rank_)); + copy_back_tensor(out_tensor, out_data, true, true, opts.rootRank); } c10::intrusive_ptr @@ -1424,113 +1388,38 @@ c10::intrusive_ptr ProcessGroupACCL::reduce_scatter( TORCH_CHECK(false, "ProcessGroupACCL does not support reduce_scatter"); } -void ProcessGroupACCL::run_alltoall(at::Tensor srctensor_original, - at::Tensor dsttensor_original, +void ProcessGroupACCL::run_alltoall(at::Tensor in_tensor, + at::Tensor out_tensor, const AllToAllOptions &opts) { - at::Tensor *srctensor = &srctensor_original; - at::Tensor *dsttensor = &dsttensor_original; - at::Tensor empty_srctensor, empty_dsttensor; std::unique_ptr srcdata; std::unique_ptr dstdata; - ACCL::debug("Running alltoall"); - PARA_PRINT(srctensor_original); + PARA_PRINT(in_tensor); + + init_input_tensor(in_tensor, srcdata, true, true); // Reserve device - c10::DeviceGuard guard(srctensor->device()); + c10::DeviceGuard guard(in_tensor.device()); std::unique_lock globalLock(pgGlobalMutex_); - // Copy data from GPU to FPGA if necessary, and create a new result buffer, - // since ACCL doesn't support in-place allreduce - if (p2p_applicable(*accl, srctensor_original, p2p_enabled)) { - srcdata = create_and_copy_p2p_buffer(*accl, srctensor_original); - } else { - if (coyote_enabled) { - srcdata = create_coyotebuffer(*accl, srctensor->numel(), srctensor->scalar_type()); - ACCL::debug("Copying data to CPU tensor of size " + - std::to_string(srctensor_original.numel())); - empty_srctensor = torch::from_blob( - srcdata->byte_array(), srctensor_original.sizes(), - srctensor_original.options().device(c10::DeviceType::CPU)); - srctensor = &empty_srctensor; - srctensor->copy_(srctensor_original); - } - else if (srctensor_original.device().type() != c10::DeviceType::CPU) { - srcdata = create_buffer(*accl, srctensor->numel(), srctensor->scalar_type()); - ACCL::debug("Copying data to CPU tensor of size " + - std::to_string(srctensor_original.numel())); - empty_srctensor = torch::from_blob( - srcdata->byte_array(), srctensor_original.sizes(), - srctensor_original.options().device(c10::DeviceType::CPU)); - srctensor = &empty_srctensor; - srctensor->copy_(srctensor_original); - } - else { - srcdata = create_buffer(*accl, *srctensor); - } - } + init_output_data(out_tensor, dstdata, out_tensor.numel(), out_tensor.scalar_type(), true, true); - if (p2p_applicable(*accl, dsttensor_original, p2p_enabled)) { - dstdata = create_buffer_p2p(*accl, srctensor->numel(), srctensor->scalar_type()); - } else if (coyote_enabled) { - dstdata = create_coyotebuffer(*accl, srctensor->numel(),srctensor->scalar_type()); - torch::from_blob(dstdata->byte_array(), srctensor_original.sizes(), - srctensor_original.options().device(c10::DeviceType::CPU)); - dsttensor = &empty_dsttensor; - } else { - dstdata = create_buffer(*accl, srctensor->numel(), srctensor->scalar_type()); - empty_dsttensor = torch::from_blob(dstdata->byte_array(), srctensor_original.sizes(), srctensor_original.options().device(c10::DeviceType::CPU)); - dsttensor = &empty_dsttensor; + 
for(int i = 0; i < in_tensor.numel(); i++){ + ACCL::debug(std::to_string(i) + ": " + std::to_string(((float *) srcdata.get())[i])); } - - // Create output buffer - // if (p2p_applicable(*accl, dsttensor_original, p2p_enabled)) { - // dstdata = create_and_copy_p2p_buffer(*accl, dsttensor_original); - // } else { - // if (accl->is_simulated() || coyote_enabled) { - // dstdata = create_buffer(*accl, dsttensor->numel(), dsttensor->scalar_type()); - // } else { - // dstdata = wrap_buffer(*accl, buf0, dsttensor->numel(), dsttensor->scalar_type()); - // } - // } - - // Run alltoall - if (!coyote_enabled) { - srcdata->sync_to_device(); + + for(int i = 0; ibarrier(); + PRE_REQUEST(AlltoAll, in_tensor) - ACCL::debug("Starting alltoall of " + std::to_string(srctensor->numel()) + - " items"); + ACCL::ACCLRequest* req = accl->alltoall(*srcdata, *dstdata, in_tensor.numel()/size_, ACCL::GLOBAL_COMM, true, true, get_compressed_type(in_tensor.scalar_type())); - // ACCL::ACCLRequest* req = accl->bcast(*srcdata, srctensor->numel(), 0, ACCL::GLOBAL_COMM, true, - // true, get_compressed_type(srctensor->scalar_type())); - ACCL::ACCLRequest* req = accl->alltoall(*srcdata, *dstdata, srctensor->numel(), - ACCL::GLOBAL_COMM, true, true, - get_compressed_type(srctensor->scalar_type())); + POST_REQUEST - // if(coyote_enabled){ - ACCL::debug("Waiting for request to complete."); - accl->wait(req, 1000ms); - // } - ACCL::debug("Finished waiting"); + copy_back_tensor(out_tensor, dstdata, true, true); - if (!coyote_enabled) { - dstdata->sync_from_device(); - } - - // Copy result buffer back to original tensor - if (p2p_applicable(*accl, dsttensor_original, p2p_enabled)) { - copy_back_p2p_buffer(*dstdata, dsttensor_original); - } else { - ACCL::debug("Copying result data back to original tensor of size " + - std::to_string(dsttensor_original.numel())); - dsttensor_original.copy_(torch::from_blob( - dstdata->byte_array(), dsttensor_original.sizes(), - dsttensor_original.options().device(c10::DeviceType::CPU))); - } } c10::intrusive_ptr ProcessGroupACCL::alltoall_base( @@ -1590,65 +1479,23 @@ ProcessGroupACCL::alltoall(std::vector &outputTensors, TORCH_CHECK(false, "ProcessGroupACCL does not support alltoall"); } -void ProcessGroupACCL::run_send(at::Tensor tensor_original, int dstRank, +void ProcessGroupACCL::run_send(at::Tensor in_tensor, int dstRank, int tag) { - at::Tensor *tensor = &tensor_original; - at::Tensor empty_tensor; - std::unique_ptr data; + STANDARD_DECL + // Reserve device - c10::DeviceGuard guard(tensor->device()); + c10::DeviceGuard guard(in_tensor.device()); std::unique_lock globalLock(pgGlobalMutex_); - // Copy data from GPU to FPGA if necessary - if (p2p_applicable(*accl, tensor_original, p2p_enabled)) { - data = create_and_copy_p2p_buffer(*accl, tensor_original); - } else { - if (coyote_enabled) { - // Copy tensor to CPU tensor first - data = create_coyotebuffer(*accl, tensor->numel(), tensor->scalar_type()); - ACCL::debug("Copying data to CPU tensor of size " + - std::to_string(tensor_original.numel())); - empty_tensor = torch::from_blob( - data->byte_array(), tensor_original.sizes(), - tensor_original.options().device(c10::DeviceType::CPU)); - tensor = &empty_tensor; - tensor->copy_(tensor_original); - } else if (tensor_original.device().type() != c10::DeviceType::CPU) { - // Copy tensor to CPU tensor first - data = create_buffer(*accl, tensor->numel(), tensor->scalar_type()); - ACCL::debug("Copying data to CPU tensor of size " + - std::to_string(tensor_original.numel())); - empty_tensor = 
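Note: the count passed to accl->alltoall is in_tensor.numel()/size_, i.e. the per-destination section length, not the whole tensor. A worked example of the resulting layout (size 2, count 4), matching what test_alltoall asserts:

    count, size = 4, 2
    section = count // size
    # rank r's input is arange(count) + r*count, as in test_alltoall
    inputs = [[r * count + e for e in range(count)] for r in range(size)]
    # output section s on rank r is input section r taken from rank s
    expected = [[s * count + r * section + e
                 for s in range(size) for e in range(section)]
                for r in range(size)]
    assert expected == [[0, 1, 4, 5], [2, 3, 6, 7]]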
torch::from_blob( - data->byte_array(), tensor_original.sizes(), - tensor_original.options().device(c10::DeviceType::CPU)); - tensor = &empty_tensor; - tensor->copy_(tensor_original); - } else { - data = create_buffer(*accl, *tensor); - } - } - - // Run send - if (!coyote_enabled) { - data->sync_to_device(); - } + init_input_tensor(in_tensor, data, true, true); - ACCL::debug("[Send] Entering barrier"); - accl->barrier(); + PRE_REQUEST(Send,in_tensor) - ACCL::debug("Starting send of " + std::to_string(tensor->numel()) + - " items to " + std::to_string(dstRank)); - - ACCL::ACCLRequest* req = accl->send(*data, tensor->numel(), dstRank, tag, ACCL::GLOBAL_COMM, true, - get_compressed_type(tensor->scalar_type())); + ACCL::ACCLRequest* req = accl->send(*data, in_tensor.numel(), dstRank, tag, ACCL::GLOBAL_COMM, true, + get_compressed_type(in_tensor.scalar_type())); - if(coyote_enabled){ - ACCL::debug("Waiting for request to complete."); - accl->wait(req, 1000ms); - } - ACCL::debug("Finished waiting"); - + POST_REQUEST } c10::intrusive_ptr @@ -1676,70 +1523,24 @@ ProcessGroupACCL::send(std::vector &tensors, int dstRank, int tag) { c10::optional>(tensors)); } -void ProcessGroupACCL::run_recv(at::Tensor tensor_original, int srcRank, +void ProcessGroupACCL::run_recv(at::Tensor out_tensor, int srcRank, int tag) { - at::Tensor *tensor = &tensor_original; - at::Tensor empty_tensor; - std::unique_ptr data; + STANDARD_DECL + // Reserve device - c10::DeviceGuard guard(tensor->device()); + c10::DeviceGuard guard(out_tensor.device()); std::unique_lock globalLock(pgGlobalMutex_); - // Create FPGA buffer - if (p2p_applicable(*accl, tensor_original, p2p_enabled)) { - data = create_buffer_p2p(*accl, tensor_original); - } else { - if (coyote_enabled) { - // Copy tensor to CPU tensor first - data = create_coyotebuffer(*accl, tensor->numel(), tensor->scalar_type()); - ACCL::debug("Creating CPU tensor of size " + - std::to_string(tensor_original.numel())); - empty_tensor = torch::from_blob( - data->byte_array(), tensor_original.sizes(), - tensor_original.options().device(c10::DeviceType::CPU)); - tensor = &empty_tensor; - } else if (tensor_original.device().type() != c10::DeviceType::CPU) { - data = create_buffer(*accl, tensor->numel(), tensor->scalar_type()); - ACCL::debug("Copying data to CPU tensor of size " + - std::to_string(tensor_original.numel())); - empty_tensor = torch::from_blob( - data->byte_array(), tensor_original.sizes(), - tensor_original.options().device(c10::DeviceType::CPU)); - tensor = &empty_tensor; - } else { - data = create_buffer(*accl, *tensor); - } - } + init_output_data(out_tensor, dstdata, out_tensor.numel(), out_tensor.scalar_type(), true, true); - // Run recieve - - ACCL::debug("[Receive] Entering barrier"); - accl->barrier(); - - ACCL::debug("Starting receive of " + std::to_string(tensor->numel()) + - " items from " + std::to_string(srcRank)); - ACCL::ACCLRequest* req = accl->recv(*data, tensor->numel(), srcRank, tag, ACCL::GLOBAL_COMM, true, - get_compressed_type(tensor->scalar_type())); - - if(coyote_enabled){ - ACCL::debug("Waiting for request to complete."); - accl->wait(req, 1000ms); - } - ACCL::debug("Finished waiting"); + PRE_REQUEST(Receive, out_tensor) - if (!coyote_enabled) { - data->sync_from_device(); - } + ACCL::ACCLRequest* req = accl->recv(*dstdata, out_tensor.numel(), srcRank, tag, ACCL::GLOBAL_COMM, true, get_compressed_type(out_tensor.scalar_type())); - // Copy data back to original tensor if necessary - if (p2p_applicable(*accl, tensor_original, p2p_enabled)) { - 
copy_back_p2p_buffer(*data, tensor_original); - } else if (coyote_enabled || tensor_original.device().type() != c10::DeviceType::CPU) { - ACCL::debug("Copying data back from CPU tensor of size " + - std::to_string(tensor_original.numel())); - tensor_original.copy_(*tensor); - } + POST_REQUEST + + copy_back_tensor(out_tensor, dstdata, true, true); } c10::intrusive_ptr diff --git a/integrations/pytorch_ddp/test/test-generic.py b/integrations/pytorch_ddp/test/test-generic.py index bc98858d..055e5cb2 100644 --- a/integrations/pytorch_ddp/test/test-generic.py +++ b/integrations/pytorch_ddp/test/test-generic.py @@ -149,21 +149,28 @@ def test_allreduce(): print("Test allreduce finished!") def test_alltoall(): - input = torch.arange(4, dtype=torch.float) + float(rank) * 4. + + input = torch.arange(count, dtype=torch.float) + float(rank) * count logger.debug("All-to-all input:") logger.debug(str(input)) - output = torch.ones(4) - # output = torch.empty([4], dtype=torch.int64) + output = torch.ones(count) - logger.debug("All-to-all output:") - logger.debug(str(output)) - dist.all_to_all_single(output, input) logger.debug("All-to-all output:") - logger.debug(str(output)) + logger.debug(str(output)) + + test = torch.zeros(count) + + section_size = int(count/size) + + for section in range(size): + for el in range(section_size): + test[section * section_size + el] = float(rank) * section_size + section * count + el + + np.testing.assert_allclose(output, test) print("Test allreduce finished!") @@ -291,8 +298,8 @@ def start_test(comms: str, simulator: bool, host_file: str=None, fpga_file: str= mpi.Barrier() test_broadcast() mpi.Barrier() - # test_sendrcv() - # mpi.Barrier() + test_sendrcv() + mpi.Barrier() test_scatter() mpi.Barrier() test_gather() @@ -303,12 +310,12 @@ def start_test(comms: str, simulator: bool, host_file: str=None, fpga_file: str= mpi.Barrier() test_allreduce() mpi.Barrier() - # demo_basic(rank) - # mpi.Barrier() + demo_basic(rank) + mpi.Barrier() # run_training() # mpi.Barrier() - # test_alltoall() - # mpi.Barrier() + test_alltoall() + mpi.Barrier() print("Finished testing") logger.debug('Finished testing') From 52b4433b5f0cc1bbb0d7c3fb06382f65b81a1621 Mon Sep 17 00:00:00 2001 From: Laurent Wirz Date: Sat, 25 May 2024 11:14:55 +0200 Subject: [PATCH 18/64] Added MNIST test --- .../pytorch_ddp/src/ProcessGroupACCL.cpp | 62 +---- integrations/pytorch_ddp/test/run.sh | 8 +- integrations/pytorch_ddp/test/test-generic.py | 78 +++--- integrations/pytorch_ddp/test/test-mnist.py | 224 ++++++++++++++++++ 4 files changed, 284 insertions(+), 88 deletions(-) create mode 100644 integrations/pytorch_ddp/test/test-mnist.py diff --git a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp index 166e550e..9519c716 100644 --- a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp +++ b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp @@ -63,7 +63,7 @@ namespace c10d { #if defined(DO_PARA_PRINT) #define PARA_PRINT(x) \ - ACCL::debug("#x size: " + std::to_string(x.numel()) + " of type: " + string_of_accl_datatype(convert_datatype_from_torch(x.scalar_type()))) + ACCL::debug(#x "size: " + std::to_string(x.numel()) + " of type: " + string_of_accl_datatype(convert_datatype_from_torch(x.scalar_type()))) #else #define PARA_PRINT(x) #endif @@ -308,33 +308,6 @@ std::unique_ptr create_coyotebuffer(ACCL::ACCL &accl, size_t l } } - //TODO delete -// Create an ACCL Buffer with correct type -std::unique_ptr wrap_buffer(ACCL::ACCL &accl, xrt::bo &bo, - size_t length, - 
c10::ScalarType type) { - size_t size; - if (type == at::kInt || type == at::kFloat) { - size = length * 4; - } else { - size = length * 8; - } - xrt::bo slice = xrt::bo(bo, size, static_cast(0)); - switch (type) { - case at::kInt: - return accl.create_buffer(slice, length, acclDatatype.at(type)); - case at::kLong: - return accl.create_buffer(slice, length, acclDatatype.at(type)); - case at::kFloat: - return accl.create_buffer(slice, length, acclDatatype.at(type)); - case at::kDouble: - return accl.create_buffer(slice, length, acclDatatype.at(type)); - default: - TORCH_CHECK(false, "Tensor has unsupported datatype"); - break; - } -} - // Create an ACCL P2P Buffer with correct type std::unique_ptr create_buffer_p2p(ACCL::ACCL &accl, size_t length, c10::ScalarType type) { @@ -728,20 +701,18 @@ void ProcessGroupACCL::init_output_tensor(const at::Tensor &tensor_original, at: if DO_COND { if (p2p_applicable(*accl, tensor_original, p2p_enabled)) { dstdata = create_buffer_p2p(*accl, out_tensor_size, type); - } else { - if (coyote_enabled) { + } else if (coyote_enabled) { dstdata = create_coyotebuffer(*accl, out_tensor_size, type); std::vector sizes = {static_cast(out_tensor_size)}; dsttensor = torch::from_blob(dstdata->byte_array(), sizes, tensor_original.options().device(c10::DeviceType::CPU)); // This should not be necessary: // dsttensor.copy_(tensor_original); - } else { + } else { dstdata = create_buffer(*accl, out_tensor_size, type); std::vector sizes = {static_cast(out_tensor_size)}; dsttensor = torch::from_blob(dstdata->byte_array(), sizes, tensor_original.options().device(c10::DeviceType::CPU)); // This should not be necessary: // dsttensor.copy_(tensor_original); - } } } else { dstdata = std::unique_ptr>(nullptr); @@ -770,13 +741,14 @@ void ProcessGroupACCL::copy_back_tensor(at::Tensor tensor_original, std::unique_ data->sync_from_device(); } for (const auto i : c10::irange(dsttensorvec.size())) { - if (p2p_applicable(*accl, dsttensorvec[0], p2p_enabled)) { - auto slice = - data->slice(i * numel, (i + 1) * numel); - copy_back_p2p_buffer(*slice, dsttensorvec[i]); - } else { + // TODO uncomment and correct + // if (p2p_applicable(*accl, dsttensorvec[0], p2p_enabled)) { + // auto slice = + // data->slice(i * numel, (i + 1) * numel); + // copy_back_p2p_buffer(*slice, dsttensorvec[i]); + // } else { dsttensorvec[i].copy_(dsttensor.slice(0, i * numel, (i + 1) * numel)); - } + // } } } } @@ -1222,7 +1194,7 @@ void ProcessGroupACCL::run_gather(at::Tensor in_tensor, c10::DeviceGuard guard(in_tensor.device()); std::unique_lock globalLock(pgGlobalMutex_); - init_output_tensor(dsttensorvec[0], dsttensor, dstdata, in_tensor.numel() * static_cast(size_), in_tensor.scalar_type(), true, false, opts.rootRank); + init_output_tensor(in_tensor, dsttensor, dstdata, in_tensor.numel() * static_cast(size_), in_tensor.scalar_type(), true, false, opts.rootRank); PRE_REQUEST(Gather, in_tensor) @@ -1232,7 +1204,7 @@ void ProcessGroupACCL::run_gather(at::Tensor in_tensor, POST_REQUEST - copy_back_tensorvec(dsttensorvec, dstdata, dsttensor, in_tensor.numel(), true, false, opts.rootRank); + copy_back_tensorvec(dsttensorvec, dstdata, dsttensor, in_tensor.numel(), true, false, opts.rootRank); } @@ -1394,7 +1366,7 @@ void ProcessGroupACCL::run_alltoall(at::Tensor in_tensor, std::unique_ptr srcdata; std::unique_ptr dstdata; - PARA_PRINT(in_tensor); + // PARA_PRINT(in_tensor); init_input_tensor(in_tensor, srcdata, true, true); @@ -1404,14 +1376,6 @@ void ProcessGroupACCL::run_alltoall(at::Tensor in_tensor, 
init_output_data(out_tensor, dstdata, out_tensor.numel(), out_tensor.scalar_type(), true, true); - for(int i = 0; i < in_tensor.numel(); i++){ - ACCL::debug(std::to_string(i) + ": " + std::to_string(((float *) srcdata.get())[i])); - } - - for(int i = 0; ialltoall(*srcdata, *dstdata, in_tensor.numel()/size_, ACCL::GLOBAL_COMM, true, true, get_compressed_type(in_tensor.scalar_type())); diff --git a/integrations/pytorch_ddp/test/run.sh b/integrations/pytorch_ddp/test/run.sh index 15d9bd67..627dfedd 100755 --- a/integrations/pytorch_ddp/test/run.sh +++ b/integrations/pytorch_ddp/test/run.sh @@ -9,7 +9,7 @@ fi if [[ -v ACCL_SCRIPT ]]; then SCRIPT_NAME="$ACCL_SCRIPT" else - SCRIPT_NAME=test-generic.py + SCRIPT_NAME=test-mnist.py echo "Variable ACCL_SCRIPT not set. Assuming $SCRIPT_NAME" fi @@ -31,7 +31,7 @@ EXEC="bash -c \"source $VENV_ACTIVATE && source $SETUP_SH && python $SCRIPT" #---------------Setting up vars------------- if [[ $ACCL_SIM -eq 1 ]]; then echo "Starting in simulator mode. Make sure to start the emulator beforehand" - ARG="-s " + ARG="-s -d True" ACCL_COMMS="udp" @@ -98,8 +98,8 @@ echo "Running with $NUM_PROCESS Processes" rm -f $(pwd)/accl_log/rank* # C="mpirun -n $NUM_PROCESS $MPI_ARGS -outfile-pattern \"$(pwd)/accl_log/rank_%r_stdout\" $EXEC $ARG &" -# C="mpirun -n $NUM_PROCESS $MPI_ARGS -outfile-pattern \"$(pwd)/accl_log/rank_%r_stdout\" -errfile-pattern \"$(pwd)/accl_log/rank_%r_stderr\" $EXEC $ARG &" -C="mpirun -n $NUM_PROCESS $MPI_ARGS $EXEC $ARG &" +C="mpirun -n $NUM_PROCESS $MPI_ARGS -outfile-pattern \"$(pwd)/accl_log/rank_%r_stdout\" -errfile-pattern \"$(pwd)/accl_log/rank_%r_stderr\" $EXEC $ARG &" +# C="mpirun -n $NUM_PROCESS $MPI_ARGS $EXEC $ARG &" echo $C exit 0 diff --git a/integrations/pytorch_ddp/test/test-generic.py b/integrations/pytorch_ddp/test/test-generic.py index 055e5cb2..13eb093d 100644 --- a/integrations/pytorch_ddp/test/test-generic.py +++ b/integrations/pytorch_ddp/test/test-generic.py @@ -96,8 +96,8 @@ def test_scatter(): dist.scatter(y, x, 0) - print(y) - print(rank) + # print(y) + # print(rank) np.testing.assert_allclose(y, torch.full((count,), float(rank))) print("Test scatter finished!") @@ -152,16 +152,11 @@ def test_alltoall(): input = torch.arange(count, dtype=torch.float) + float(rank) * count - logger.debug("All-to-all input:") - logger.debug(str(input)) output = torch.ones(count) dist.all_to_all_single(output, input) - logger.debug("All-to-all output:") - logger.debug(str(output)) - test = torch.zeros(count) section_size = int(count/size) @@ -188,7 +183,18 @@ def forward(self, x): class MyTrainDataset(Dataset): def __init__(self, size): self.size = size - self.data = [(torch.rand(10), torch.rand(5)) for _ in range(size)] + + self.data = [] + for i in range(size): + in_feature = torch.zeros(10) + out_feature = torch.zeros(5) + for j in range(10): + in_feature[j] = float((i^2 + j) % 5) + out_feature[j//2] = out_feature[j//2] + float(((i^2 + j) % 5) * 3 * ( -1 ** (j % 2))) + self.data.append((in_feature, out_feature)) + + + def __len__(self): return self.size @@ -215,9 +221,9 @@ def demo_basic(rank: int): train_data = prepare_dataloader(train_set, batch_size) loss_fn = nn.MSELoss() - optimizer = optim.SGD(ddp_model.parameters(), lr=0.001) + optimizer = optim.Adam(ddp_model.parameters(), lr=0.005) - max_epochs = 10 + max_epochs = 20 for epoch in range(max_epochs): batch_size = len(next(iter(train_data))[0]) train_data.sampler.set_epoch(epoch) @@ -233,6 +239,8 @@ def demo_basic(rank: int): print("finished training") + print("final params:") + 
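A caution on the synthetic dataset above: in Python, i^2 is bitwise XOR rather than a square, and -1 ** (j % 2) parses as -(1 ** (j % 2)), which is always -1. If a square and an alternating sign were intended, the expressions would need to be:

    i, j = 3, 4
    square = i ** 2                # 9; by contrast i ^ 2 == 1 (XOR)
    alternating = (-1) ** (j % 2)  # +1 for even j, -1 for odd j
    assert (square, alternating) == (9, 1)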
print(ddp_model) # dist.destroy_process_group() def start_test(comms: str, simulator: bool, host_file: str=None, fpga_file: str=None, ma: str="localhost", mp: str="30505"): @@ -293,35 +301,35 @@ def start_test(comms: str, simulator: bool, host_file: str=None, fpga_file: str= accl.create_process_group(ranks, design, bufsize=rxbufsize, initialize=True, simulation=simulator) dist.init_process_group("ACCL", rank=rank, world_size=size) - with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], - profile_memory=True, record_shapes=True) as prof: - mpi.Barrier() - test_broadcast() - mpi.Barrier() - test_sendrcv() - mpi.Barrier() - test_scatter() - mpi.Barrier() - test_gather() - mpi.Barrier() - test_allgather() - mpi.Barrier() - test_reduce() - mpi.Barrier() - test_allreduce() - mpi.Barrier() - demo_basic(rank) - mpi.Barrier() - # run_training() - # mpi.Barrier() - test_alltoall() - mpi.Barrier() + # with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], + # profile_memory=True, record_shapes=True) as prof: + # mpi.Barrier() + # test_broadcast() + # mpi.Barrier() + # test_sendrcv() + # mpi.Barrier() + # test_scatter() + # mpi.Barrier() + # test_gather() + # mpi.Barrier() + # test_allgather() + # mpi.Barrier() + # test_reduce() + # mpi.Barrier() + # test_allreduce() + mpi.Barrier() + # demo_basic(rank) + # mpi.Barrier() + # run_training() + # mpi.Barrier() + test_alltoall() + mpi.Barrier() print("Finished testing") logger.debug('Finished testing') - print(prof.key_averages(group_by_input_shape=True) - .table(sort_by="cpu_time_total", row_limit=15)) + # print(prof.key_averages(group_by_input_shape=True) + # .table(sort_by="cpu_time_total", row_limit=15)) logger.debug('Destroying ACCL Process Group') dist.destroy_process_group() diff --git a/integrations/pytorch_ddp/test/test-mnist.py b/integrations/pytorch_ddp/test/test-mnist.py new file mode 100644 index 00000000..c82650e7 --- /dev/null +++ b/integrations/pytorch_ddp/test/test-mnist.py @@ -0,0 +1,224 @@ +import torch +from torchvision import datasets +from torchvision.transforms import ToTensor +from torch.utils.data import DataLoader +import torch.nn as nn +from torch import optim +from torch.autograd import Variable +import torch.distributed as dist +import accl_process_group as accl + +from mpi4py.MPI import COMM_WORLD as mpi +from torch.nn.parallel import DistributedDataParallel as DDP +from torch.utils.data.distributed import DistributedSampler + +import argparse +import os +import sys +import logging + +logging.basicConfig(stream=sys.stderr, level=logging.DEBUG) + +logger = logging.getLogger(__name__) + +if "ACCL_DEBUG" in os.environ and os.environ["ACCL_DEBUG"]=="1": + logger.setLevel(logging.DEBUG) +else: + logger.setLevel(logging.WARNING) + +# Run via ACCL + +class CNN(nn.Module): + def __init__(self): + super(CNN, self).__init__() + self.conv1 = nn.Sequential( + nn.Conv2d( + in_channels=1, + out_channels=16, + kernel_size=5, + stride=1, + padding=2, + ), + nn.ReLU(), + nn.MaxPool2d(kernel_size=2), + ) + self.conv2 = nn.Sequential( + nn.Conv2d(16, 32, 5, 1, 2), + nn.ReLU(), + nn.MaxPool2d(2), + ) + # fully connected layer, output 10 classes + self.out = nn.Linear(32 * 7 * 7, 10) + def forward(self, x): + x = self.conv1(x) + x = self.conv2(x) + # flatten the output of conv2 to (batch_size, 32 * 7 * 7) + x = x.view(x.size(0), -1) + output = self.out(x) + return output, x # return x for visualization + +def train(num_epochs, cnn, loaders): + + cnn.train() + + # Train the model + total_step = 
len(loaders['train']) + + optimizer = optim.Adam(cnn.parameters(), lr = 0.01) + + for epoch in range(num_epochs): + for i, (images, labels) in enumerate(loaders['train']): + + # gives batch data, normalize x when iterate train_loader + b_x = Variable(images) # batch x + b_y = Variable(labels) # batch y + output = cnn(b_x)[0] + + loss = loss_func(output, b_y) + + # clear gradients for this training step + optimizer.zero_grad() + + # backpropagation, compute gradients + loss.backward() + # apply gradients + optimizer.step() + + # if (i+1) % 100 == 0: + # if True: + print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}' + .format(epoch + 1, num_epochs, i + 1, total_step, loss.item())) + + +def test(): + # Test the model + cnn.eval() + with torch.no_grad(): + correct = 0 + total = 0 + for images, labels in loaders['test']: + test_output, last_layer = cnn(images) + pred_y = torch.max(test_output, 1)[1].data.squeeze() + accuracy = (pred_y == labels).sum().item() / float(labels.size(0)) + print('Test Accuracy of the model on the 10000 test images: %.2f' % accuracy) + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + + parser.add_argument("-n", type=int, default=1) + parser.add_argument("-d", type=bool, default=None) + + + parser.add_argument('-s', '--simulator', action='store_true', + default=False, help='Use simulation instead of ' + 'hardware') + parser.add_argument('-c', '--comms', choices=['udp', 'tcp', 'cyt_rdma'], default='tcp', + help='Run tests over specified communication backend') + parser.add_argument('-i', '--host-file', type=str, help='Specify the file, where the host IPs are listed') + parser.add_argument('-f', '--fpga-file', type=str, help='Specify the file, where the FPGA IPs are listed') + parser.add_argument('-a','--master-address', type=str) + parser.add_argument('-p','--master-port', type=str) + + + args = parser.parse_args() + + if args.n == 1 and args.d == None : + print("only one machine specified. 
Assuming Non distributed setup") + args.d = False + elif args.n > 1 and args.d == None: + print("Assung DDP setup") + args.d = True + + + global rank, size + if args.master_address==None: + args.master_address = "localhost" + if args.master_port==None: + args.master_port = "30505" + os.environ['MASTER_ADDR'] = args.master_address + os.environ['MASTER_PORT'] = args.master_port + rank = mpi.Get_rank() + size = mpi.Get_size() + + rxbufsize = 4096 * 1024 + + if args.d: + if not args.simulator: + #default from test.cpp + rxbufsize = 4096 * 1024 + if host_file==None or fpga_file==None: sys.exit('Host and FPGA file need to be specified in hardware mode') + + with open(host_file, 'r') as hf: + host_ips = hf.read().splitlines() + + with open(fpga_file, 'r') as ff: + fpga_ips = ff.read().splitlines() + + if comms == "cyt_rdma": + ranks = [accl.Rank(a, start_port, i, rxbufsize) for i, a in enumerate(fpga_ips)] + else: + ranks = [accl.Rank(a, start_port + i, 0, rxbufsize) for i, a in enumerate(fpga_ips)] + else: + # Somehow the simulator gets stuck if I use the same rxbufsize + rxbufsize = 4096# * 1024 + ranks = [accl.Rank("127.0.0.1", 5500 + i, i, rxbufsize) for i in range(size)] + + logger.debug(f'Ranks: {ranks}') + + if args.comms == 'udp': + design = accl.ACCLDesign.udp + elif args.comms == 'tcp': + design = accl.ACCLDesign.tcp + elif args.comms == 'cyt_rdma': # and not simulator: + design = accl.ACCLDesign.cyt_rdma + + + mpi.Barrier() + + accl.create_process_group(ranks, design, bufsize=rxbufsize, initialize=True, simulation=args.simulator) + dist.init_process_group("ACCL", rank=rank, world_size=size) + + device = 'cpu' + + train_data = datasets.MNIST( + root = 'data', + train = True, + transform = ToTensor(), + download = True, + ) + test_data = datasets.MNIST( + root = 'data', + train = False, + transform = ToTensor() + ) + + if args.d : sampler = DistributedSampler + else : sampler = lambda x : None + + loaders = { + 'train' : torch.utils.data.DataLoader(train_data, + batch_size=100, + shuffle=False, + sampler=sampler(train_data)), + 'test' : torch.utils.data.DataLoader(test_data, + batch_size=100, + shuffle=False, + sampler=sampler(test_data)), + } + + cnn = CNN() + if args.d : cnn = DDP(cnn) + + loss_func = nn.CrossEntropyLoss() + + num_epochs = 10 + + mpi.Barrier() + + print("starting training") + + train(num_epochs, cnn, loaders) + + test() + From 6f95caa6ac7a1b8e1251191ca56f16620baec701 Mon Sep 17 00:00:00 2001 From: Laurent Wirz Date: Mon, 27 May 2024 15:24:33 +0200 Subject: [PATCH 19/64] Added test for multi-dimensional tensors --- .../pytorch_ddp/src/ProcessGroupACCL.cpp | 34 +++++--- integrations/pytorch_ddp/test/run.sh | 8 +- integrations/pytorch_ddp/test/test-generic.py | 80 +++++++++---------- 3 files changed, 65 insertions(+), 57 deletions(-) diff --git a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp index 9519c716..c460c8ef 100644 --- a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp +++ b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp @@ -652,7 +652,9 @@ void ProcessGroupACCL::init_input_tensor(at::Tensor &tensor, std::unique_ptr(tensor_vec[0].numel()); int64_t total_size = tens_size * static_cast(size_); - std::vector sizes = {total_size}; + std::vector sizes = tensor_vec[0].sizes().vec(); + // Prepend another dimension for vector length + sizes.insert(sizes.begin(), tensor_vec.size()); if (p2p_applicable(*accl, tensor_vec[0], p2p_enabled)) { data = create_buffer_p2p( *accl, total_size, tensor_vec[0].scalar_type()); 
@@ -668,7 +670,7 @@ void ProcessGroupACCL::init_input_tensor(at::Tensor &tensor, std::unique_ptrslice(i * tens_size, (i + 1) * tens_size); copy_to_p2p_buffer(*slice, tensor_vec[i]); } else { - auto slice = wrapper_tensor.slice(0, i * tens_size, (i + 1) * tens_size); + auto slice = wrapper_tensor[i]; slice.copy_(tensor_vec[i]); } } @@ -697,19 +699,26 @@ void ProcessGroupACCL::init_output_data(at::Tensor &tensor_original, std::unique } } -void ProcessGroupACCL::init_output_tensor(const at::Tensor &tensor_original, at::Tensor &dsttensor, std::unique_ptr &dstdata, int out_tensor_size, c10::ScalarType type, bool do_on_root, bool do_on_others, int opts_root_rank) { + void ProcessGroupACCL::init_output_tensor(const at::Tensor &tensor_original, at::Tensor &dsttensor, std::unique_ptr &dstdata, int num_tensors, c10::ScalarType type, bool do_on_root, bool do_on_others, int opts_root_rank) { if DO_COND { + int64_t num_tensors_s = static_cast(num_tensors); + std::vector sizes = tensor_original.sizes().vec(); + int64_t total_size = static_cast(tensor_original.numel()); + if (num_tensors != 0) { + // Prepend another dimension for vector length + sizes.insert(sizes.begin(), num_tensors_s); + total_size = total_size * num_tensors_s; + } if (p2p_applicable(*accl, tensor_original, p2p_enabled)) { - dstdata = create_buffer_p2p(*accl, out_tensor_size, type); + dstdata = create_buffer_p2p(*accl, total_size, type); } else if (coyote_enabled) { - dstdata = create_coyotebuffer(*accl, out_tensor_size, type); - std::vector sizes = {static_cast(out_tensor_size)}; + dstdata = create_coyotebuffer(*accl, total_size, type); + // std::vector sizes = {static_cast(out_tensor_size)}; dsttensor = torch::from_blob(dstdata->byte_array(), sizes, tensor_original.options().device(c10::DeviceType::CPU)); // This should not be necessary: // dsttensor.copy_(tensor_original); } else { - dstdata = create_buffer(*accl, out_tensor_size, type); - std::vector sizes = {static_cast(out_tensor_size)}; + dstdata = create_buffer(*accl, total_size, type); dsttensor = torch::from_blob(dstdata->byte_array(), sizes, tensor_original.options().device(c10::DeviceType::CPU)); // This should not be necessary: // dsttensor.copy_(tensor_original); @@ -747,7 +756,7 @@ void ProcessGroupACCL::copy_back_tensor(at::Tensor tensor_original, std::unique_ // data->slice(i * numel, (i + 1) * numel); // copy_back_p2p_buffer(*slice, dsttensorvec[i]); // } else { - dsttensorvec[i].copy_(dsttensor.slice(0, i * numel, (i + 1) * numel)); + dsttensorvec[i].copy_(dsttensor[i]); // } } } @@ -981,7 +990,6 @@ void ProcessGroupACCL::run_broadcast(at::Tensor in_tensor, c10::intrusive_ptr ProcessGroupACCL::broadcast(std::vector &tensors, const BroadcastOptions &opts) { - debug(accl->dump_eager_rx_buffers(false)); checkSingleTensor(tensors); std::function &)> runFunc = [opts, this](std::unique_ptr &entry) { @@ -1118,7 +1126,7 @@ void ProcessGroupACCL::run_allgather( c10::DeviceGuard guard(in_tensor.device()); std::unique_lock globalLock(pgGlobalMutex_); - init_output_tensor(dsttensorvec[0], dsttensor, dstdata, in_tensor.numel() * static_cast(size_), in_tensor.scalar_type(), true, true); + init_output_tensor(in_tensor, dsttensor, dstdata, size_, in_tensor.scalar_type(), true, true); PRE_REQUEST(Allgather,in_tensor) @@ -1194,7 +1202,7 @@ void ProcessGroupACCL::run_gather(at::Tensor in_tensor, c10::DeviceGuard guard(in_tensor.device()); std::unique_lock globalLock(pgGlobalMutex_); - init_output_tensor(in_tensor, dsttensor, dstdata, in_tensor.numel() * static_cast(size_), 
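Note: with this patch the staging tensor's shape is the original tensor shape with the vector length (or world size) prepended, so indexing the first dimension yields one rank's contribution with its multi-dimensional shape intact. In Python terms:

    import torch

    world, shape = 2, (4, 5)
    flat = torch.arange(world * 4 * 5, dtype=torch.float32)
    staging = flat.reshape(world, *shape)  # dimension prepended, as in the C++
    assert staging[1].shape == torch.Size([4, 5])
    assert staging[1][0, 0] == 20.0        # rank 1's slice starts after 4*5 items

This is what lets copy_back_tensorvec use dsttensor[i] instead of computing flat slice offsets by hand.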
in_tensor.scalar_type(), true, false, opts.rootRank); + init_output_tensor(in_tensor, dsttensor, dstdata, size_, in_tensor.scalar_type(), true, false, opts.rootRank); PRE_REQUEST(Gather, in_tensor) @@ -1278,7 +1286,7 @@ void ProcessGroupACCL::run_scatter(std::vector &in_tensor_vec, init_input_data_vec(in_tensor_vec, in_data, out_tensor.options().device(c10::DeviceType::CPU), true, false, opts.rootRank); - init_output_tensor(out_tensor, dsttensor, out_data, out_tensor.numel(), out_tensor.scalar_type(), true, true, opts.rootRank); + init_output_tensor(out_tensor, dsttensor, out_data, 0, out_tensor.scalar_type(), true, true, opts.rootRank); PRE_REQUEST(Scatter, dsttensor) diff --git a/integrations/pytorch_ddp/test/run.sh b/integrations/pytorch_ddp/test/run.sh index 627dfedd..554c3757 100755 --- a/integrations/pytorch_ddp/test/run.sh +++ b/integrations/pytorch_ddp/test/run.sh @@ -9,7 +9,8 @@ fi if [[ -v ACCL_SCRIPT ]]; then SCRIPT_NAME="$ACCL_SCRIPT" else - SCRIPT_NAME=test-mnist.py + # SCRIPT_NAME=test-mnist.py # MNIST + SCRIPT_NAME=test-generic.py echo "Variable ACCL_SCRIPT not set. Assuming $SCRIPT_NAME" fi @@ -31,8 +32,9 @@ EXEC="bash -c \"source $VENV_ACTIVATE && source $SETUP_SH && python $SCRIPT" #---------------Setting up vars------------- if [[ $ACCL_SIM -eq 1 ]]; then echo "Starting in simulator mode. Make sure to start the emulator beforehand" - ARG="-s -d True" - + # ARG="-s -d True" #MNIST + ARG="-s " + ACCL_COMMS="udp" echo "assuming $ACCL_COMMS comms in simulator" diff --git a/integrations/pytorch_ddp/test/test-generic.py b/integrations/pytorch_ddp/test/test-generic.py index 13eb093d..e05e5691 100644 --- a/integrations/pytorch_ddp/test/test-generic.py +++ b/integrations/pytorch_ddp/test/test-generic.py @@ -49,29 +49,31 @@ size = 0 count = 16 +shape = (6,4) +num_el = 6 * 4 #As in test.cpp defaults rxbufsize = 4096 * 1024 def test_broadcast(): if rank == 0: - x = torch.ones(count) + x = torch.ones(shape) else: - x = torch.zeros(count) + x = torch.zeros(shape) dist.broadcast(x, 0) # logger.debug('Tensor after broadcast: ' + str(x)) # print('Tensor after broadcast: ' + str(x)) - np.testing.assert_allclose(x, torch.ones(count)) + np.testing.assert_allclose(x, torch.ones(shape)) print("Test broadcast finished!") def test_sendrcv(): - x = torch.full((count,), float(rank)) + x = torch.full(shape, float(rank)) - y = torch.empty(count) + y = torch.empty(shape) prev_rank = (rank - 1) % size next_rank = (rank + 1) % size @@ -83,31 +85,29 @@ def test_sendrcv(): dist.recv(y, prev_rank) dist.send(x, next_rank) - np.testing.assert_allclose(y, torch.full((count,), prev_rank)) + np.testing.assert_allclose(y, torch.full(shape, prev_rank)) print("Test sendrcv finished!") def test_scatter(): if rank == 0: - x = [torch.full((count,), float(i)) for i in range(size)] + x = [torch.full(shape, float(i+1)) for i in range(size)] else: x = None - y = torch.full((count,), float(0)) + y = torch.full(shape, float(0)) + dist.scatter(y, x, 0) - - # print(y) - # print(rank) - np.testing.assert_allclose(y, torch.full((count,), float(rank))) + np.testing.assert_allclose(y, torch.full(shape, float(rank+1))) print("Test scatter finished!") def test_gather(): - x = torch.full((count,), float(rank)) + x = torch.full(shape, float(rank)) if rank == 0: - y = [torch.empty(count) for _ in range(size)] + y = [torch.empty(shape) for _ in range(size)] else: y = None @@ -115,37 +115,37 @@ def test_gather(): if rank == 0: for i, c in enumerate(y): - np.testing.assert_allclose(c, torch.full((count,), float(i))) + 
np.testing.assert_allclose(c, torch.full(shape, float(i))) print("Test gather finished!") def test_allgather(): - x = torch.full((count,), float(rank)) - y = [torch.empty(count) for _ in range(size)] + x = torch.full(shape, float(rank)) + y = [torch.empty(shape) for _ in range(size)] dist.all_gather(y, x) for i, c in enumerate(y): - np.testing.assert_allclose(c, torch.full((count,), float(i))) + np.testing.assert_allclose(c, torch.full(shape, float(i))) print("Test allgather finished!") def test_reduce(): - x = torch.ones(count) + x = torch.ones(shape) dist.reduce(x, 0, dist.ReduceOp.SUM) if rank == 0: - np.testing.assert_allclose(x, [size for _ in range(count)]) + np.testing.assert_allclose(x, torch.full(shape, float(size))) print("Test reduce finished!") def test_allreduce(): - x = torch.ones(count) + x = torch.ones(shape) dist.all_reduce(x, dist.ReduceOp.SUM) - np.testing.assert_allclose(x, [size for _ in range(count)]) + np.testing.assert_allclose(x, torch.full(shape, float(size))) print("Test allreduce finished!") def test_alltoall(): @@ -167,7 +167,7 @@ def test_alltoall(): np.testing.assert_allclose(output, test) - print("Test allreduce finished!") + print("Test alltoall finished!") class ToyModel(nn.Module): @@ -239,8 +239,8 @@ def demo_basic(rank: int): print("finished training") - print("final params:") - print(ddp_model) + # print("final params:") + # print(ddp_model) # dist.destroy_process_group() def start_test(comms: str, simulator: bool, host_file: str=None, fpga_file: str=None, ma: str="localhost", mp: str="30505"): @@ -303,25 +303,23 @@ def start_test(comms: str, simulator: bool, host_file: str=None, fpga_file: str= # with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], # profile_memory=True, record_shapes=True) as prof: - # mpi.Barrier() - # test_broadcast() - # mpi.Barrier() - # test_sendrcv() - # mpi.Barrier() - # test_scatter() - # mpi.Barrier() - # test_gather() - # mpi.Barrier() - # test_allgather() - # mpi.Barrier() - # test_reduce() - # mpi.Barrier() - # test_allreduce() + mpi.Barrier() + test_broadcast() + mpi.Barrier() + test_sendrcv() + mpi.Barrier() + test_scatter() + mpi.Barrier() + test_gather() + mpi.Barrier() + test_allgather() + mpi.Barrier() + test_reduce() + mpi.Barrier() + test_allreduce() mpi.Barrier() # demo_basic(rank) # mpi.Barrier() - # run_training() - # mpi.Barrier() test_alltoall() mpi.Barrier() From f930995b4f07edf1935c4baa745f2ce13081624e Mon Sep 17 00:00:00 2001 From: Laurent Wirz Date: Fri, 31 May 2024 11:29:14 +0200 Subject: [PATCH 20/64] Compatibility changes for new dev branch --- integrations/pytorch_ddp/setup.py | 1 + integrations/pytorch_ddp/src/ProcessGroupACCL.cpp | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/integrations/pytorch_ddp/setup.py b/integrations/pytorch_ddp/setup.py index 7917b138..10034004 100755 --- a/integrations/pytorch_ddp/setup.py +++ b/integrations/pytorch_ddp/setup.py @@ -63,6 +63,7 @@ '-Wno-sign-compare', '-Wno-unused-but-set-variable', '-DACCL_HARDWARE_SUPPORT', + '-DACCL_NETWORK_UTILS_MPI', '-std=c++17', '-g'] diff --git a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp index c460c8ef..0ca9dff9 100644 --- a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp +++ b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp @@ -879,7 +879,7 @@ void ProcessGroupACCL::initialize() { rsfec_); ACCL::debug(std::string("Setting timeout and Threshold")); accl->set_timeout(1e6); - accl->set_rendezvous_threshold(16*1024); + // 
accl->set_rendezvous_threshold(16*1024); int devicemem = accl->devicemem(); if (!simulator_) { From fd62eefd013c46473f5ed1d4df80d38ea611a449 Mon Sep 17 00:00:00 2001 From: Laurent Wirz Date: Tue, 4 Jun 2024 09:33:10 +0200 Subject: [PATCH 21/64] Adapted tests to not stop on error --- integrations/pytorch_ddp/test/test-generic.py | 132 +++++++++++++----- 1 file changed, 96 insertions(+), 36 deletions(-) diff --git a/integrations/pytorch_ddp/test/test-generic.py b/integrations/pytorch_ddp/test/test-generic.py index e05e5691..ff5a29d3 100644 --- a/integrations/pytorch_ddp/test/test-generic.py +++ b/integrations/pytorch_ddp/test/test-generic.py @@ -49,13 +49,14 @@ size = 0 count = 16 -shape = (6,4) -num_el = 6 * 4 +shape = (4, 5) +num_el = 4 * 5 #As in test.cpp defaults rxbufsize = 4096 * 1024 def test_broadcast(): + global num_errors if rank == 0: x = torch.ones(shape) else: @@ -65,12 +66,18 @@ def test_broadcast(): # logger.debug('Tensor after broadcast: ' + str(x)) # print('Tensor after broadcast: ' + str(x)) - - np.testing.assert_allclose(x, torch.ones(shape)) - print("Test broadcast finished!") + try: + np.testing.assert_allclose(x, torch.ones(shape)) + except AssertionError as e: + num_errors = num_errors + 1 + logger.debug("Test Broadcast failed") + logger.debug(str(e)) + else: + logger.debug("Test broadcast finished!") def test_sendrcv(): + global num_errors x = torch.full(shape, float(rank)) y = torch.empty(shape) @@ -85,25 +92,39 @@ def test_sendrcv(): dist.recv(y, prev_rank) dist.send(x, next_rank) - np.testing.assert_allclose(y, torch.full(shape, prev_rank)) - print("Test sendrcv finished!") + try: + np.testing.assert_allclose(y, torch.full(shape, prev_rank)) + except AssertionError as e: + num_errors = num_errors + 1 + logger.debug("Test Sendrcv failed") + logger.debug(str(e)) + else: + logger.debug("Test Sendrcv finished!") def test_scatter(): + global num_errors if rank == 0: x = [torch.full(shape, float(i+1)) for i in range(size)] else: x = None y = torch.full(shape, float(0)) - dist.scatter(y, x, 0) + + try: + np.testing.assert_allclose(y, torch.full(shape, float(rank+1))) + except AssertionError as e: + num_errors = num_errors + 1 + logger.debug("Test Scatter failed") + logger.debug(str(e)) + else: + logger.debug("Test Scatter finished!") - np.testing.assert_allclose(y, torch.full(shape, float(rank+1))) - print("Test scatter finished!") def test_gather(): + global num_errors x = torch.full(shape, float(rank)) if rank == 0: @@ -115,40 +136,70 @@ def test_gather(): if rank == 0: for i, c in enumerate(y): - np.testing.assert_allclose(c, torch.full(shape, float(i))) - print("Test gather finished!") - + try: + np.testing.assert_allclose(c, torch.full(shape, float(i))) + except AssertionError as e: + num_errors = num_errors + 1 + logger.debug("Test Gather failed") + logger.debug(str(e)) + else: + logger.debug("Test Gather finished!") + def test_allgather(): + global num_errors x = torch.full(shape, float(rank)) y = [torch.empty(shape) for _ in range(size)] dist.all_gather(y, x) for i, c in enumerate(y): - np.testing.assert_allclose(c, torch.full(shape, float(i))) - print("Test allgather finished!") + try: + np.testing.assert_allclose(c, torch.full(shape, float(i))) + except AssertionError as e: + num_errors = num_errors + 1 + logger.debug("Test AllGather failed") + logger.debug(str(e)) + else: + logger.debug("Test AllGather finished!") + def test_reduce(): + global num_errors x = torch.ones(shape) dist.reduce(x, 0, dist.ReduceOp.SUM) if rank == 0: - np.testing.assert_allclose(x, 
torch.full(shape, float(size))) - print("Test reduce finished!") - + try: + np.testing.assert_allclose(x, torch.full(shape, float(size))) + except AssertionError as e: + num_errors = num_errors + 1 + logger.debug("Test Reduce failed") + logger.debug(str(e)) + else: + logger.debug("Test Reduce finished!") + def test_allreduce(): + global num_errors x = torch.ones(shape) dist.all_reduce(x, dist.ReduceOp.SUM) - np.testing.assert_allclose(x, torch.full(shape, float(size))) - print("Test allreduce finished!") - + try: + np.testing.assert_allclose(x, torch.full(shape, float(size))) + except AssertionError as e: + num_errors = num_errors + 1 + logger.debug("Test AllReduce failed") + logger.debug(str(e)) + else: + logger.debug("Test AllReduce finished!") + + def test_alltoall(): + global num_errors input = torch.arange(count, dtype=torch.float) + float(rank) * count @@ -165,10 +216,15 @@ def test_alltoall(): for el in range(section_size): test[section * section_size + el] = float(rank) * section_size + section * count + el - np.testing.assert_allclose(output, test) - - print("Test alltoall finished!") - + try: + np.testing.assert_allclose(output, test) + except AssertionError as e: + num_errors = num_errors + 1 + logger.debug("Test AlltoAll failed") + logger.debug(str(e)) + else: + logger.debug("Test AlltoAll finished!") + class ToyModel(nn.Module): def __init__(self): @@ -278,7 +334,7 @@ def start_test(comms: str, simulator: bool, host_file: str=None, fpga_file: str= ranks = [accl.Rank(a, start_port + i, 0, rxbufsize) for i, a in enumerate(fpga_ips)] else: # Somehow the simulator gets stuck if I use the same rxbufsize - rxbufsize = 4096# * 1024 + rxbufsize = 4096 #* 1024 ranks = [accl.Rank("127.0.0.1", 5500 + i, i, rxbufsize) for i in range(size)] logger.debug(f'Ranks: {ranks}') @@ -300,7 +356,8 @@ def start_test(comms: str, simulator: bool, host_file: str=None, fpga_file: str= accl.create_process_group(ranks, design, bufsize=rxbufsize, initialize=True, simulation=simulator) dist.init_process_group("ACCL", rank=rank, world_size=size) - + global num_errors + num_errors = 0 # with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], # profile_memory=True, record_shapes=True) as prof: mpi.Barrier() @@ -310,22 +367,25 @@ def start_test(comms: str, simulator: bool, host_file: str=None, fpga_file: str= mpi.Barrier() test_scatter() mpi.Barrier() - test_gather() - mpi.Barrier() + # test_gather() + # mpi.Barrier() test_allgather() mpi.Barrier() + test_alltoall() + mpi.Barrier() test_reduce() mpi.Barrier() test_allreduce() - mpi.Barrier() - # demo_basic(rank) # mpi.Barrier() - test_alltoall() + # demo_basic(rank) mpi.Barrier() - - print("Finished testing") - logger.debug('Finished testing') - + + if num_errors == 0: + print("======== Successfully Finished testing======") + logger.debug("======== Successfully Finished testing======") + else: + print(f"!!!!!!!! - {num_errors} Errors found - !!!!!!!!!") + logger.debug(f"!!!!!!!! 
- {num_errors} Errors found - !!!!!!!!!") # print(prof.key_averages(group_by_input_shape=True) # .table(sort_by="cpu_time_total", row_limit=15)) From f70049d57b8cc39c9bc750b1796bb9a820045b30 Mon Sep 17 00:00:00 2001 From: Laurent Wirz Date: Fri, 7 Jun 2024 16:06:21 +0200 Subject: [PATCH 22/64] Added Resnet50 imagenet testcase --- .../pytorch_ddp/test/test-imagenet.py | 275 ++++++++++++++++++ 1 file changed, 275 insertions(+) create mode 100644 integrations/pytorch_ddp/test/test-imagenet.py diff --git a/integrations/pytorch_ddp/test/test-imagenet.py b/integrations/pytorch_ddp/test/test-imagenet.py new file mode 100644 index 00000000..3fdd1dbc --- /dev/null +++ b/integrations/pytorch_ddp/test/test-imagenet.py @@ -0,0 +1,275 @@ +import torch +import torch.nn as nn +from torch import optim +from torch.optim import lr_scheduler +from torchvision import datasets +from torchvision.transforms import ToTensor +from torch.utils.data import DataLoader +from torch.autograd import Variable +import torchvision +from torchvision import datasets, models, transforms +import torch.distributed as dist +import accl_process_group as accl + +from mpi4py.MPI import COMM_WORLD as mpi +from torch.nn.parallel import DistributedDataParallel as DDP +from torch.utils.data.distributed import DistributedSampler + + +import numpy as np +import matplotlib.pyplot as plt +import time +import argparse +import os +import sys +import logging +from PIL import Image +from tempfile import TemporaryDirectory + +logging.basicConfig(stream=sys.stderr, level=logging.DEBUG) + +logger = logging.getLogger(__name__) + +if "ACCL_DEBUG" in os.environ and os.environ["ACCL_DEBUG"]=="1": + logger.setLevel(logging.DEBUG) +else: + logger.setLevel(logging.WARNING) + +# Run via ACCL + +class CNN(nn.Module): + def __init__(self): + super(CNN, self).__init__() + self.conv1 = nn.Sequential( + nn.Conv2d( + in_channels=1, + out_channels=16, + kernel_size=5, + stride=1, + padding=2, + ), + nn.ReLU(), + nn.MaxPool2d(kernel_size=2), + ) + self.conv2 = nn.Sequential( + nn.Conv2d(16, 32, 5, 1, 2), + nn.ReLU(), + nn.MaxPool2d(2), + ) + # fully connected layer, output 10 classes + self.out = nn.Linear(32 * 7 * 7, 10) + def forward(self, x): + x = self.conv1(x) + x = self.conv2(x) + # flatten the output of conv2 to (batch_size, 32 * 7 * 7) + x = x.view(x.size(0), -1) + output = self.out(x) + return output, x # return x for visualization + +def train(model, criterion, optimizer, scheduler, num_epochs=25): + since = time.time() + + # Create a temporary directory to save training checkpoints + with TemporaryDirectory() as tempdir: + best_model_params_path = os.path.join(tempdir, 'best_model_params.pt') + + torch.save(model.state_dict(), best_model_params_path) + best_acc = 0.0 + + for epoch in range(num_epochs): + print(f'Epoch {epoch}/{num_epochs - 1}') + print('-' * 10) + + # Each epoch has a training and validation phase + for phase in ['train', 'val']: + if phase == 'train': + model.train() # Set model to training mode + else: + model.eval() # Set model to evaluate mode + + running_loss = 0.0 + running_corrects = 0 + + # Iterate over data. 
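                # Note (added comment, not in the original patch): when run distributed
                # (args.d), dataloaders[phase] below uses a DistributedSampler, which
                # shuffles by default; calling dataloaders[phase].sampler.set_epoch(epoch)
                # at the top of each epoch would reshuffle per epoch - without it, every
                # epoch iterates the data in the same order.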
+ for inputs, labels in dataloaders[phase]: + inputs = inputs.to(device) + labels = labels.to(device) + + # zero the parameter gradients + optimizer.zero_grad() + + # forward + # track history if only in train + with torch.set_grad_enabled(phase == 'train'): + outputs = model(inputs) + _, preds = torch.max(outputs, 1) + loss = criterion(outputs, labels) + + # backward + optimize only if in training phase + if phase == 'train': + loss.backward() + optimizer.step() + + # statistics + running_loss += loss.item() * inputs.size(0) + running_corrects += torch.sum(preds == labels.data) + if phase == 'train': + scheduler.step() + + epoch_loss = running_loss / dataset_sizes[phase] + epoch_acc = running_corrects.double() / dataset_sizes[phase] + + print(f'{phase} Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f}') + + # deep copy the model + if phase == 'val' and epoch_acc > best_acc: + best_acc = epoch_acc + torch.save(model.state_dict(), best_model_params_path) + + print() + + time_elapsed = time.time() - since + print(f'Training complete in {time_elapsed // 60:.0f}m {time_elapsed % 60:.0f}s') + print(f'Best val Acc: {best_acc:4f}') + + # load best model weights + model.load_state_dict(torch.load(best_model_params_path)) + return model + + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + + parser.add_argument("-n", type=int, default=1) + parser.add_argument("-d", type=bool, default=None) + + + parser.add_argument('-s', '--simulator', action='store_true', + default=False, help='Use simulation instead of ' + 'hardware') + parser.add_argument('-c', '--comms', choices=['udp', 'tcp', 'cyt_rdma'], default='tcp', + help='Run tests over specified communication backend') + parser.add_argument('-i', '--host-file', type=str, help='Specify the file, where the host IPs are listed') + parser.add_argument('-f', '--fpga-file', type=str, help='Specify the file, where the FPGA IPs are listed') + parser.add_argument('-a','--master-address', type=str) + parser.add_argument('-p','--master-port', type=str) + + + args = parser.parse_args() + + if args.n == 1 and args.d == None : + print("only one machine specified. 
Assuming non-distributed setup")
        args.d = False
    elif args.n > 1 and args.d == None:
        print("Assuming DDP setup")
        args.d = True

    global rank, size
    if args.master_address == None:
        args.master_address = "localhost"
    if args.master_port == None:
        args.master_port = "30505"
    os.environ['MASTER_ADDR'] = args.master_address
    os.environ['MASTER_PORT'] = args.master_port
    rank = mpi.Get_rank()
    size = mpi.Get_size()

    host_file = args.host_file
    fpga_file = args.fpga_file
    comms = args.comms
    start_port = 5005

    rxbufsize = 4096 * 1024

    if args.d:
        if not args.simulator:
            # default from test.cpp
            rxbufsize = 4096 * 1024
            if host_file == None or fpga_file == None: sys.exit('Host and FPGA file need to be specified in hardware mode')

            with open(host_file, 'r') as hf:
                host_ips = hf.read().splitlines()

            with open(fpga_file, 'r') as ff:
                fpga_ips = ff.read().splitlines()

            if comms == "cyt_rdma":
                ranks = [accl.Rank(a, start_port, i, rxbufsize) for i, a in enumerate(fpga_ips)]
            else:
                ranks = [accl.Rank(a, start_port + i, 0, rxbufsize) for i, a in enumerate(fpga_ips)]
        else:
            rxbufsize = 4096 * 1024
            ranks = [accl.Rank("127.0.0.1", 5500 + i, i, rxbufsize) for i in range(size)]

        logger.debug(f'Ranks: {ranks}')

        if args.comms == 'udp':
            design = accl.ACCLDesign.udp
        elif args.comms == 'tcp':
            design = accl.ACCLDesign.tcp
        elif args.comms == 'cyt_rdma': # and not simulator:
            design = accl.ACCLDesign.cyt_rdma

        mpi.Barrier()

        accl.create_process_group(ranks, design, bufsize=rxbufsize, initialize=True, simulation=args.simulator)
        dist.init_process_group("ACCL", rank=rank, world_size=size)

    device = 'cpu'

    data_transforms = {
        'train': transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ]),
        'val': transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ]),
    }

    data_dir = 'imagenet-data/hymenoptera_data'
    image_datasets = {x: datasets.ImageFolder(os.path.join(data_dir, x),
                                              data_transforms[x])
                      for x in ['train', 'val']}

    if args.d : sampler = DistributedSampler
    else : sampler = lambda x : None

    dataloaders = {x: torch.utils.data.DataLoader(image_datasets[x], batch_size=4,
                                                  shuffle=False, num_workers=4, sampler=sampler(image_datasets[x]))
                   for x in ['train', 'val']}
    dataset_sizes = {x: len(image_datasets[x]) for x in ['train', 'val']}
    class_names = image_datasets['train'].classes

    model_ft = models.resnet50(weights='IMAGENET1K_V1')

    num_ftrs = model_ft.fc.in_features

    model_ft.fc = nn.Linear(num_ftrs, 2)

    if args.d : model_ft = DDP(model_ft, bucket_cap_mb=4)

    criterion = nn.CrossEntropyLoss()

    # Observe that all parameters are being optimized
    optimizer_ft = optim.SGD(model_ft.parameters(), lr=0.001, momentum=0.9)

    # Decay LR by a factor of 0.1 every 7 epochs
    exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=0.1)

    loss_func = nn.CrossEntropyLoss()

    model_ft = train(model_ft, criterion, optimizer_ft, exp_lr_scheduler,
                     num_epochs=25)

From fe7e507a0c641bae8dee996c6452fd1ca60230cc Mon Sep 17 00:00:00 2001
From: Laurent Wirz
Date: Sun, 16 Jun 2024 17:07:55 +0200
Subject: [PATCH 23/64] Added option for sidestepping using OpenMPI (gather,
 scatter, bcast)

---
 .../pytorch_ddp/src/ProcessGroupACCL.cpp | 130
+++++++++++++++++ 1 file changed, 123 insertions(+), 7 deletions(-)

diff --git a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp
index 0ca9dff9..ac218a91 100644
--- a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp
+++ b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp
@@ -47,6 +47,43 @@ using namespace ACCL;

 namespace c10d {

+// Toggles to run collectives via OpenMPI instead (to sidestep any issues with them in ACCL)
+#define BROADCAST_SIDESTEP
+#define SCATTER_SIDESTEP
+#define GATHER_SIDESTEP
+
+// Used in sidestepping
+#define MPI_CHECK(cmd) \
+  do { \
+    int mpiStatus = cmd; \
+    if (mpiStatus != MPI_SUCCESS) { \
+      std::string err = "MPI error in: " + std::string(__FILE__) + ":" + \
+                        std::to_string(__LINE__) + \
+                        ", with error code: " + std::to_string(mpiStatus); \
+      TORCH_CHECK(false, err); \
+    } \
+  } while (0)
+
+// Used in sidestepping
+// Op mapping
+std::map<ReduceOp, MPI_Op> mpiOp = {
+    {ReduceOp::MIN, MPI_MIN},
+    {ReduceOp::MAX, MPI_MAX},
+    {ReduceOp::SUM, MPI_SUM},
+    {ReduceOp::PRODUCT, MPI_PROD},
+};
+// Used in sidestepping
+// Type mapping
+std::map<at::ScalarType, MPI_Datatype> mpiDatatype = {
+    {at::kByte, MPI_UNSIGNED_CHAR},
+    {at::kChar, MPI_CHAR},
+    {at::kDouble, MPI_DOUBLE},
+    {at::kFloat, MPI_FLOAT},
+    {at::kInt, MPI_INT},
+    {at::kLong, MPI_LONG},
+    {at::kShort, MPI_SHORT},
+};
+
 #define CEIL_DIV(x, y) ((x) / (y) + ((x) % (y) != 0))

 #define ACCL_ERROR(status) \
@@ -993,19 +1030,34 @@ ProcessGroupACCL::broadcast(std::vector<at::Tensor> &tensors,
   checkSingleTensor(tensors);
   std::function<void(std::unique_ptr<WorkEntry> &)> runFunc =
       [opts, this](std::unique_ptr<WorkEntry> &entry) {
-        at::Tensor &tensor = (entry->src)[0];
+        #ifdef BROADCAST_SIDESTEP
+        auto data = (entry->src)[0];
+        ACCL::debug("[Broadcast] -- Sidestepped using OpenMPI --");
+        c10::DeviceGuard guard(data.device());
+        std::unique_lock<std::mutex> globalLock(pgGlobalMutex_);
+        MPI_CHECK(MPI_Bcast(
+            data.data_ptr(),
+            data.numel(),
+            mpiDatatype.at(data.scalar_type()),
+            opts.rootRank,
+            MPI_COMM_WORLD));
+        #else
+        at::Tensor &tensor = (entry->src)[0];
         // Segment data if necessary
         if (tensor.nbytes() > bufsize) {
-          size_t n = bufsize / tensor.itemsize();
+          size_t non_zero_dim_count = tensor.numel() / tensor.size(0);
+          size_t n = bufsize / tensor.itemsize() / non_zero_dim_count;
           ACCL::debug("[Broadcast] Segmenting tensor of size " + std::to_string(tensor.nbytes()) + " into " + std::to_string(n) + "-sized elements ");
-          for (size_t i = 0; i < tensor.numel(); i += n) {
-            size_t end = std::min(i + n, static_cast<size_t>(tensor.numel()));
+          for (size_t i = 0; i < tensor.size(0); i += n) {
+            ACCL::debug("part " + std::to_string(i) + "!");
+            size_t end = std::min(i + n, static_cast<size_t>(tensor.size(0)));
             run_broadcast(tensor.slice(0, i, end), opts);
           }
         } else {
           ACCL::debug("[Broadcast] Broadcasting entire tensor of size " + std::to_string(tensor.nbytes()) + " without segmentation.");
           run_broadcast(tensor, opts);
         }
+        #endif
       };
   auto entry = std::make_unique<WorkEntry>(&tensors, &tensors, std::move(runFunc));
@@ -1240,6 +1292,38 @@ ProcessGroupACCL::gather(std::vector<std::vector<at::Tensor>> &outputTensors,

   std::function<void(std::unique_ptr<WorkEntry> &)> runFunc =
       [opts, this](std::unique_ptr<WorkEntry> &entry) {
+        #ifdef GATHER_SIDESTEP
+        ACCL::debug("[Gather] -- Sidestepped using OpenMPI --");
+        auto data = (entry->src)[0];
+        void* recvbuf = nullptr;
+        at::Tensor flatOutputTensor;
+
+        std::vector<at::Tensor> dstdata = entry->dst;
+        if (rank_ == opts.rootRank) {
+          flatOutputTensor = newLikeFlat(dstdata);
+          recvbuf = flatOutputTensor.data_ptr();
+        }
+
+        c10::DeviceGuard guard(data.device());
+        std::unique_lock<std::mutex> globalLock(pgGlobalMutex_);
+        MPI_CHECK(MPI_Gather(
+            data.data_ptr(),
+            data.numel(),
+            mpiDatatype.at(data.scalar_type()),
+            recvbuf,
+            data.numel(),
+            mpiDatatype.at(data.scalar_type()),
+            opts.rootRank,
+            MPI_COMM_WORLD));
+
+        if (rank_ == opts.rootRank) {
+          const std::vector<at::Tensor>& outputDataVec = entry->dst;
+          // copy the flattened output tensors to the outputs
+          for (const auto i : c10::irange(outputDataVec.size())) {
+            outputDataVec.at(i).copy_(flatOutputTensor[i]);
+          }
+        }
+        #else
         auto srctensor = (entry->src)[0];
         auto &dsttensors = entry->dst;
         // Segment data if necessary
@@ -1257,6 +1341,7 @@ ProcessGroupACCL::gather(std::vector<std::vector<at::Tensor>> &outputTensors,
         } else {
           run_gather(srctensor, dsttensors, opts);
         }
+        #endif
       };

   if (rank_ == opts.rootRank) {
@@ -1322,16 +1407,46 @@ ProcessGroupACCL::scatter(std::vector<at::Tensor> &outputTensors,

   std::function<void(std::unique_ptr<WorkEntry> &)> runFunc =
       [opts, this](std::unique_ptr<WorkEntry> &entry) {
+        #ifdef SCATTER_SIDESTEP
+        ACCL::debug("[Scatter] -- Sidestepped using OpenMPI --");
+        auto data = (entry->dst)[0];
+        void* sendbuf = nullptr;
+        at::Tensor flatInputTensor;
+
+        if (rank_ == opts.rootRank) {
+          std::vector<at::Tensor>& inputDataVec = entry->src;
+          flatInputTensor = newLikeFlat(inputDataVec);
+          sendbuf = flatInputTensor.data_ptr();
+
+          // copy the input tensors to the flattened large send buffer
+          for (const auto i : c10::irange(inputDataVec.size())) {
+            flatInputTensor[i].copy_(inputDataVec.at(i));
+          }
+        }
+
+        c10::DeviceGuard guard(data.device());
+        std::unique_lock<std::mutex> globalLock(pgGlobalMutex_);
+        MPI_CHECK(MPI_Scatter(
+            sendbuf,
+            data.numel(),
+            mpiDatatype.at(data.scalar_type()),
+            data.data_ptr(),
+            data.numel(),
+            mpiDatatype.at(data.scalar_type()),
+            opts.rootRank,
+            MPI_COMM_WORLD));
+        #else
         auto &srctensors = entry->src;
         auto dsttensor = (entry->dst)[0];
         // Segment data if necessary
         if (dsttensor.nbytes() > bufsize) {
           ACCL::debug("dsttensor too large!");
-          size_t n = bufsize / dsttensor.itemsize();
-          for (size_t i = 0; i < dsttensor.numel(); i += n) {
+          size_t non_zero_dim_count = dsttensor.numel() / dsttensor.size(0);
+          size_t n = bufsize / dsttensor.itemsize() / non_zero_dim_count;
+          for (size_t i = 0; i < dsttensor.size(0); i += n) {
             ACCL::debug("part " + std::to_string(i) + "!");
             size_t end =
-                std::min(i + n, static_cast<size_t>(dsttensor.numel()));
+                std::min(i + n, static_cast<size_t>(dsttensor.size(0)));
             std::vector<at::Tensor> srctensorslices;
             srctensorslices.reserve(srctensors.size());
             for (auto &srctensor : srctensors) {
@@ -1342,6 +1457,7 @@ ProcessGroupACCL::scatter(std::vector<at::Tensor> &outputTensors,
         } else {
           run_scatter(srctensors, dsttensor, opts);
         }
+        #endif
       };

   if (rank_ == opts.rootRank) {

From e78e2fea2815143cee66d39cdd19453e68b91985 Mon Sep 17 00:00:00 2001
From: Laurent Wirz
Date: Sun, 16 Jun 2024 17:11:51 +0200
Subject: [PATCH 24/64] Added Performance measurements

---
 .../pytorch_ddp/src/ProcessGroupACCL.cpp      |  23 +-
 integrations/pytorch_ddp/test/test-generic.py | 269 +++++++++++-------
 2 files changed, 185 insertions(+), 107 deletions(-)

diff --git a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp
index ac218a91..a6d9f58b 100644
--- a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp
+++ b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp
@@ -115,14 +115,31 @@ std::map<at::ScalarType, MPI_Datatype> mpiDatatype = {
 #define PRE_REQUEST(opname, tensor) \
   ACCL::debug("[" #opname "] Entering barrier"); \
   accl->barrier(); \
-  ACCL::debug("Starting " #opname " of " + std::to_string(tensor.numel()) + " items");
+  ACCL::debug("Starting " #opname " of " + std::to_string(tensor.numel()) + " items"); \
+  auto start =
std::chrono::high_resolution_clock::now(); #define POST_REQUEST \ if(coyote_enabled){ \ + double durationUs = 0.0; \ ACCL::debug("Waiting for request to complete."); \ - accl->wait(req, 1000ms); \ -} \ + bool ret = accl->wait(req, 20000ms); \ + if(ret == false){ \ + ACCL::debug("!!!!!!! Timeout !!!!!!!"); \ + } \ + auto end = std::chrono::high_resolution_clock::now(); \ + durationUs = (std::chrono::duration_cast(end-start).count() / 1000.0); \ + ACCL::debug("host measured durationUs:" + std::to_string(durationUs)); \ + std::this_thread::sleep_for(10ms); \ + durationUs = (double)accl->get_duration(req)/1000.0; \ + if(durationUs > 1.0){ \ + ACCL::debug("ACCL measured durationUs:" + std::to_string(durationUs)); \ + } \ +} \ ACCL::debug("Finished waiting"); + +// Better logging +// accl_log(mpi_rank, format_log("bcast", options, durationUs, 0)); \ + namespace { diff --git a/integrations/pytorch_ddp/test/test-generic.py b/integrations/pytorch_ddp/test/test-generic.py index ff5a29d3..e81c7864 100644 --- a/integrations/pytorch_ddp/test/test-generic.py +++ b/integrations/pytorch_ddp/test/test-generic.py @@ -49,25 +49,51 @@ size = 0 count = 16 -shape = (4, 5) -num_el = 4 * 5 +shape = (64,) +num_el = 64 #As in test.cpp defaults rxbufsize = 4096 * 1024 -def test_broadcast(): - global num_errors - if rank == 0: - x = torch.ones(shape) +def test_broadcast_segment(): + with torch.profiler.record_function("test bcast segmented"): + global num_errors + shape_segment = (1024 * 1024,) + if rank == 0: + x = torch.ones(shape_segment, dtype=torch.float) + else: + x = torch.zeros(shape_segment, dtype=torch.float) + + dist.broadcast(x, 0) + + mpi.Barrier() + # logger.debug('Tensor after broadcast: ' + str(x)) + # print('Tensor after broadcast: ' + str(x)) + try: + np.testing.assert_allclose(x, torch.ones(shape_segment, dtype=torch.float)) + except AssertionError as e: + num_errors = num_errors + 1 + logger.debug("Test Broadcast failed") + logger.debug(str(e)) else: - x = torch.zeros(shape) + logger.debug("Test broadcast finished!") - dist.broadcast(x, 0) +def test_broadcast(): + with torch.profiler.record_function("test bcast double prec"): + global num_errors + if rank == 0: + x = torch.ones(shape, dtype=torch.double) + else: + x = torch.zeros(shape, dtype=torch.double) + dist.broadcast(x, 0) + + mpi.Barrier() + # logger.debug('Tensor after broadcast: ' + str(x)) # print('Tensor after broadcast: ' + str(x)) try: - np.testing.assert_allclose(x, torch.ones(shape)) + np.testing.assert_allclose(x, torch.ones(shape, dtype=torch.double)) except AssertionError as e: num_errors = num_errors + 1 logger.debug("Test Broadcast failed") @@ -75,23 +101,48 @@ def test_broadcast(): else: logger.debug("Test broadcast finished!") +def test_broadcast_2(): + with torch.profiler.record_function("test bcast float prec"): + test_type = torch.float + shape_2 = (204, 2) + global num_errors + if rank == 0: + x = torch.ones(shape_2, dtype=test_type) + else: + x = torch.zeros(shape_2, dtype=test_type) -def test_sendrcv(): - global num_errors - x = torch.full(shape, float(rank)) + dist.broadcast(x, 0) + mpi.Barrier() - y = torch.empty(shape) + # logger.debug('Tensor after broadcast: ' + str(x)) + # print('Tensor after broadcast: ' + str(x)) + try: + np.testing.assert_allclose(x, torch.ones(shape_2, dtype=test_type)) + except AssertionError as e: + num_errors = num_errors + 1 + logger.debug("Test Broadcast failed") + logger.debug(str(e)) + else: + logger.debug("Test broadcast finished!") + + +def test_sendrcv(): + with 
torch.profiler.record_function("test_sendrcv"): + global num_errors + x = torch.full(shape, float(rank)) - prev_rank = (rank - 1) % size - next_rank = (rank + 1) % size + y = torch.empty(shape) - if rank % 2: - dist.send(x, next_rank) - dist.recv(y, prev_rank) - else: - dist.recv(y, prev_rank) - dist.send(x, next_rank) + prev_rank = (rank - 1) % size + next_rank = (rank + 1) % size + if rank % 2: + dist.send(x, next_rank) + dist.recv(y, prev_rank) + else: + dist.recv(y, prev_rank) + dist.send(x, next_rank) + mpi.Barrier() try: np.testing.assert_allclose(y, torch.full(shape, prev_rank)) except AssertionError as e: @@ -103,15 +154,16 @@ def test_sendrcv(): def test_scatter(): - global num_errors - if rank == 0: - x = [torch.full(shape, float(i+1)) for i in range(size)] - else: - x = None - y = torch.full(shape, float(0)) - - dist.scatter(y, x, 0) + with torch.profiler.record_function("test_scatter"): + global num_errors + if rank == 0: + x = [torch.full(shape, float(i+1)) for i in range(size)] + else: + x = None + y = torch.full(shape, float(0)) + dist.scatter(y, x, 0) + mpi.Barrier() try: np.testing.assert_allclose(y, torch.full(shape, float(rank+1))) except AssertionError as e: @@ -124,16 +176,17 @@ def test_scatter(): def test_gather(): - global num_errors - x = torch.full(shape, float(rank)) - - if rank == 0: - y = [torch.empty(shape) for _ in range(size)] - else: - y = None + with torch.profiler.record_function("test_gather"): + global num_errors + x = torch.full(shape, float(rank)) - dist.gather(x, y, 0) + if rank == 0: + y = [torch.empty(shape) for _ in range(size)] + else: + y = None + dist.gather(x, y, 0) + mpi.Barrier() if rank == 0: for i, c in enumerate(y): try: @@ -147,15 +200,17 @@ def test_gather(): def test_allgather(): - global num_errors - x = torch.full(shape, float(rank)) - y = [torch.empty(shape) for _ in range(size)] - - dist.all_gather(y, x) - + with torch.profiler.record_function("test_allgather"): + global num_errors + shape_gather = (1,) + x = torch.full(shape_gather, float(rank), dtype=torch.double) + y = [torch.empty(shape_gather, dtype=torch.double) for _ in range(size)] + + dist.all_gather(y, x) + mpi.Barrier() for i, c in enumerate(y): try: - np.testing.assert_allclose(c, torch.full(shape, float(i))) + np.testing.assert_allclose(c, torch.full(shape_gather, float(i), dtype=torch.double)) except AssertionError as e: num_errors = num_errors + 1 logger.debug("Test AllGather failed") @@ -166,11 +221,12 @@ def test_allgather(): def test_reduce(): - global num_errors - x = torch.ones(shape) - - dist.reduce(x, 0, dist.ReduceOp.SUM) + with torch.profiler.record_function("test_reduce"): + global num_errors + x = torch.ones(shape) + dist.reduce(x, 0, dist.ReduceOp.SUM) + mpi.Barrier() if rank == 0: try: np.testing.assert_allclose(x, torch.full(shape, float(size))) @@ -183,11 +239,13 @@ def test_reduce(): def test_allreduce(): - global num_errors - x = torch.ones(shape) - - dist.all_reduce(x, dist.ReduceOp.SUM) + with torch.profiler.record_function("test_allreduce"): + global num_errors + x = torch.ones(shape) + dist.all_reduce(x, dist.ReduceOp.SUM) + mpi.Barrier() + try: np.testing.assert_allclose(x, torch.full(shape, float(size))) except AssertionError as e: @@ -199,15 +257,18 @@ def test_allreduce(): def test_alltoall(): - global num_errors - - input = torch.arange(count, dtype=torch.float) + float(rank) * count + with torch.profiler.record_function("test_alltoall"): + global num_errors + input = torch.arange(count, dtype=torch.float) + float(rank) * count - output = 
torch.ones(count) - dist.all_to_all_single(output, input) - + output = torch.ones(count) + + dist.all_to_all_single(output, input) + + mpi.Barrier() + test = torch.zeros(count) section_size = int(count/size) @@ -269,32 +330,36 @@ def prepare_dataloader(dataset: Dataset, batch_size: int): ) def demo_basic(rank: int): - model = ToyModel() - ddp_model = DDP(model) - train_set = MyTrainDataset(2048) # load your dataset - batch_size=64 - train_data = prepare_dataloader(train_set, batch_size) - - loss_fn = nn.MSELoss() - optimizer = optim.Adam(ddp_model.parameters(), lr=0.005) - - max_epochs = 20 - for epoch in range(max_epochs): - batch_size = len(next(iter(train_data))[0]) - train_data.sampler.set_epoch(epoch) - for x, y in train_data: - - optimizer.zero_grad() - outputs = ddp_model(x) - loss = loss_fn(outputs, y) - loss.backward() - optimizer.step() - - print(f"Rank {rank}: Epoch {epoch} | Batchsize: {batch_size} | Steps: {len(train_data)} | Loss: {loss}") + with torch.profiler.record_function("basic 2 Layer NN"): + model = ToyModel() + ddp_model = DDP(model) + # ddp_model = DDP(model, bucket_cap_mb=4, broadcast_buffers=False) + train_set = MyTrainDataset(2048) # load your dataset + batch_size=64 + train_data = prepare_dataloader(train_set, batch_size) + + loss_fn = nn.MSELoss() + optimizer = optim.Adam(ddp_model.parameters(), lr=0.005) + + max_epochs = 20 + for epoch in range(max_epochs): + batch_size = len(next(iter(train_data))[0]) + train_data.sampler.set_epoch(epoch) + for x, y in train_data: + + optimizer.zero_grad() + outputs = ddp_model(x) + loss = loss_fn(outputs, y) + loss.backward() + optimizer.step() + + print(f"Rank {rank}: Epoch {epoch} | Batchsize: {batch_size} | Steps: {len(train_data)} | Loss: {loss}") + - print("finished training") + print("finished training") + mpi.Barrier() # print("final params:") # print(ddp_model) # dist.destroy_process_group() @@ -358,27 +423,23 @@ def start_test(comms: str, simulator: bool, host_file: str=None, fpga_file: str= dist.init_process_group("ACCL", rank=rank, world_size=size) global num_errors num_errors = 0 - # with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], - # profile_memory=True, record_shapes=True) as prof: - mpi.Barrier() - test_broadcast() - mpi.Barrier() - test_sendrcv() - mpi.Barrier() - test_scatter() - mpi.Barrier() - # test_gather() - # mpi.Barrier() - test_allgather() - mpi.Barrier() - test_alltoall() - mpi.Barrier() - test_reduce() - mpi.Barrier() - test_allreduce() - # mpi.Barrier() - # demo_basic(rank) - mpi.Barrier() + with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], + profile_memory=True, record_shapes=True) as prof: + + # test_allgather() + # test_broadcast_segment() + test_broadcast() + # test_broadcast() + # test_broadcast_2() + # test_sendrcv() + test_scatter() + test_gather() + # test_allgather() + # test_alltoall() + # test_reduce() + # test_allreduce() + # demo_basic(rank) + # mpi.Barrier() if num_errors == 0: print("======== Successfully Finished testing======") @@ -386,8 +447,8 @@ def start_test(comms: str, simulator: bool, host_file: str=None, fpga_file: str= else: print(f"!!!!!!!! - {num_errors} Errors found - !!!!!!!!!") logger.debug(f"!!!!!!!! 
- {num_errors} Errors found - !!!!!!!!!")
-    # print(prof.key_averages(group_by_input_shape=True)
-    #       .table(sort_by="cpu_time_total", row_limit=15))
+    print(prof.key_averages(group_by_input_shape=True)
+          .table(sort_by="cpu_time_total", row_limit=15))

     logger.debug('Destroying ACCL Process Group')
     dist.destroy_process_group()

From 3b9465d998054ef65b34d2d5eb778a2f0c2c40e5 Mon Sep 17 00:00:00 2001
From: Laurent Wirz
Date: Wed, 19 Jun 2024 15:16:06 +0200
Subject: [PATCH 25/64] Added Allgather and Allreduce Sidestep

---
 .../pytorch_ddp/src/ProcessGroupACCL.cpp | 44 +++++++++++++++++--
 1 file changed, 41 insertions(+), 3 deletions(-)

diff --git a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp
index a6d9f58b..1702876a 100644
--- a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp
+++ b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp
@@ -48,9 +48,12 @@ using namespace ACCL;

 namespace c10d {

 // Toggles to run collectives via OpenMPI instead (to sidestep any issues with them in ACCL)
-#define BROADCAST_SIDESTEP
-#define SCATTER_SIDESTEP
-#define GATHER_SIDESTEP
+// The sidestep code is copied from ProcessGroupMPI
+// #define BROADCAST_SIDESTEP
+// #define SCATTER_SIDESTEP
+// #define GATHER_SIDESTEP
+// #define ALLGATHER_SIDESTEP
+// #define ALLREDUCE_SIDESTEP

 // Used in sidestepping
 #define MPI_CHECK(cmd) \
@@ -1111,6 +1114,18 @@ ProcessGroupACCL::allreduce(std::vector<at::Tensor> &tensors,

   std::function<void(std::unique_ptr<WorkEntry> &)> runFunc =
       [opts, this](std::unique_ptr<WorkEntry> &entry) {
+        #ifdef ALLREDUCE_SIDESTEP
+        auto data = (entry->src)[0];
+        c10::DeviceGuard guard(data.device());
+        std::unique_lock<std::mutex> globalLock(pgGlobalMutex_);
+        MPI_CHECK(MPI_Allreduce(
+            MPI_IN_PLACE,
+            data.data_ptr(),
+            data.numel(),
+            mpiDatatype.at(data.scalar_type()),
+            mpiOp.at(opts.reduceOp),
+            MPI_COMM_WORLD));
+        #else
         auto tensor = (entry->src)[0];
         // Segment data if necessary
         if (tensor.nbytes() > bufsize) {
@@ -1122,6 +1137,7 @@ ProcessGroupACCL::allreduce(std::vector<at::Tensor> &tensors,
         } else {
           run_allreduce(tensor, opts);
         }
+        #endif
       };
   auto entry = std::make_unique<WorkEntry>(&tensors, &tensors, std::move(runFunc));
@@ -1226,6 +1242,27 @@ ProcessGroupACCL::allgather(std::vector<std::vector<at::Tensor>> &outputTensors,

   std::function<void(std::unique_ptr<WorkEntry> &)> runFunc =
       [this](std::unique_ptr<WorkEntry> &entry) {
+        #ifdef ALLGATHER_SIDESTEP
+        ACCL::debug("[AllGather] -- Sidestepped using OpenMPI --");
+        auto data = (entry->src)[0];
+        std::vector<at::Tensor> outputDataVec = entry->dst;
+        auto flatOutputTensor = newLikeFlat(outputDataVec);
+
+        c10::DeviceGuard guard(data.device());
+        std::unique_lock<std::mutex> globalLock(pgGlobalMutex_);
+        MPI_CHECK(MPI_Allgather(
+            data.data_ptr(),
+            data.numel(),
+            mpiDatatype.at(data.scalar_type()),
+            flatOutputTensor.data_ptr(),
+            data.numel(),
+            mpiDatatype.at(data.scalar_type()),
+            MPI_COMM_WORLD));
+
+        for (const auto i : c10::irange(outputDataVec.size())) {
+          outputDataVec[i].copy_(flatOutputTensor[i]);
+        }
+        #else
         auto srctensor = (entry->src)[0];
         auto &dsttensors = entry->dst;
         // Segment data if necessary
@@ -1244,6 +1281,7 @@ ProcessGroupACCL::allgather(std::vector<std::vector<at::Tensor>> &outputTensors,
         } else {
           run_allgather(srctensor, dsttensors);
         }
+        #endif
       };
   auto entry = std::make_unique<WorkEntry>(&inputTensors, &outputTensors[0], std::move(runFunc));

From fa3202a4ce6abf13db0e5acf24e651114fe0b411 Mon Sep 17 00:00:00 2001
From: Laurent Wirz
Date: Wed, 19 Jun 2024 15:17:34 +0200
Subject: [PATCH 26/64] Added Waiting to SIM

---
 integrations/pytorch_ddp/src/ProcessGroupACCL.cpp | 14 +++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git
a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp
index 1702876a..52be864c 100644
--- a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp
+++ b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp
@@ -122,13 +122,13 @@ std::map<at::ScalarType, MPI_Datatype> mpiDatatype = {
   auto start = std::chrono::high_resolution_clock::now();

 #define POST_REQUEST \
-if(coyote_enabled){ \
-  double durationUs = 0.0; \
-  ACCL::debug("Waiting for request to complete."); \
-  bool ret = accl->wait(req, 20000ms); \
-  if(ret == false){ \
-    ACCL::debug("!!!!!!! Timeout !!!!!!!"); \
-  } \
+double durationUs = 0.0; \
+ACCL::debug("Waiting for request to complete."); \
+bool ret = accl->wait(req, 20000ms); \
+if(ret == false){ \
+  ACCL::debug("!!!!!!! Timeout !!!!!!!"); \
+} \
+if(coyote_enabled){ \
   auto end = std::chrono::high_resolution_clock::now(); \
   durationUs = (std::chrono::duration_cast<std::chrono::nanoseconds>(end-start).count() / 1000.0); \
   ACCL::debug("host measured durationUs:" + std::to_string(durationUs)); \

From f9623d2472ad2294d9fe1134f3e0291ddc85da2a Mon Sep 17 00:00:00 2001
From: Laurent Wirz
Date: Sun, 23 Jun 2024 22:12:11 +0200
Subject: [PATCH 27/64] Updated README

---
 integrations/pytorch_ddp/README.md | 24 ++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/integrations/pytorch_ddp/README.md b/integrations/pytorch_ddp/README.md
index cc998d45..2462383e 100644
--- a/integrations/pytorch_ddp/README.md
+++ b/integrations/pytorch_ddp/README.md
@@ -42,8 +42,32 @@ source venv/bin/activate

 ## Running the plugin
+
 Make sure to source the `setup.sh` script in this directory to load the ACCL plugin before starting a Python script.

 Example usage can be found in the various test files under [`test/`](test). Do make sure not to run python from within the root directory of `pytorch_ddp`, because Python will try to import the local incomplete [`accl_process_group/`](accl_process_group) folder instead of the actual installation.
+
+The provided test/run.sh will launch a test script via mpirun
+
+## Setup overview
+
+- The whole ProcessGroup is wrapped in OpenMPI, which is used for initialization
+- You can use the OpenMPI implementation of certain collectives using the "sidestep" flags in ProcessGroupACCL.cpp
+- Compilation using `./install` or `pip install .` can be very slow; you can run `python setup.py build_ext --inplace` and then copy the binary or other files directly: `cp accl_process_group/_c/ProcessGroupACCL.cpython-38-x86_64-linux-gnu.so ~/.local/lib/python3.8/site-packages/accl_process_group/_c/`
+- The `install.py` script will not reinstall the driver in case of ACCL updates. You will need to rebuild it yourself
+- Set `ACCL_DEBUG=1` if you want more output (also set it during the build). Stdout is sometimes not complete (in the simulator), so best log most things to stderr
+- ACCL only supports sizes up to 4 MB; if you give it larger tensors, the PG will try to segment them along the first dim (see the sketch below). Not all collectives correctly handle multi-dimensional tensors yet.
+- Setting up the simulator with 4 MB buffers takes a long time, so better set the size lower.
+
+### How to install torchvision
+
+- install torch using the script
+- clone vision, go to the fitting version v0.16.0
+- clone libpng, configure with prefix set to a local directory
+- add the bin to the path
+- not sure if needed: supply the path of the library and include to torchvision as in their development doc
+- disable the version check in torchvision setup.py, because it doesn't correctly parse the version.
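The first-dim segmentation mentioned in the setup overview above is easiest to see in a small host-side sketch (an illustration only, not code from this repo; `bufsize` stands for the rx buffer size the process group was created with, and `op` for any single-tensor collective such as `dist.broadcast`):

```python
import torch

def segmented_op(tensor: torch.Tensor, bufsize: int, op):
    # Mirrors the C++ path: split along dim 0 so every slice fits one rx buffer.
    # Assumes a contiguous tensor whose single dim-0 row already fits bufsize.
    if tensor.numel() * tensor.element_size() <= bufsize:
        op(tensor)
        return
    per_row = tensor.numel() // tensor.size(0)           # elements per dim-0 row
    rows = bufsize // (tensor.element_size() * per_row)  # rows per segment
    for i in range(0, tensor.size(0), rows):
        op(tensor[i:i + rows])                           # contiguous dim-0 slice
```

With `op = lambda t: dist.broadcast(t, 0)` this reproduces the segment sizes the `[Broadcast] Segmenting tensor ...` debug lines report.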
+- run vision setup.py with debug, include, library and use png flags

From c166a1f17b7b8cb21118ed46f3601373f03c0115 Mon Sep 17 00:00:00 2001
From: Laurent Wirz
Date: Tue, 25 Jun 2024 12:46:09 +0200
Subject: [PATCH 28/64] Further README additions

---
 integrations/pytorch_ddp/README.md |  16 +++-
 integrations/pytorch_ddp/run.sh    | 134 -----------------------------
 2 files changed, 13 insertions(+), 137 deletions(-)
 delete mode 100755 integrations/pytorch_ddp/run.sh

diff --git a/integrations/pytorch_ddp/README.md b/integrations/pytorch_ddp/README.md
index 2462383e..27532913 100644
--- a/integrations/pytorch_ddp/README.md
+++ b/integrations/pytorch_ddp/README.md
@@ -49,17 +49,20 @@
 Example usage can be found in the various test files under [`test/`](test). Do make sure not to run python from within the root directory of `pytorch_ddp`, because Python will try to import the local incomplete [`accl_process_group/`](accl_process_group) folder instead of the actual installation.

-The provided test/run.sh will launch a test script via mpirun
+The provided `test/run.sh` will launch a test script via mpirun

 ## Setup overview

 - The whole ProcessGroup is wrapped in OpenMPI, which is used for initialization
 - You can use the OpenMPI implementation of certain collectives using the "sidestep" flags in ProcessGroupACCL.cpp (see the sketch below)
-- Compilation using `./install` or `pip install .` can be very slow; you can run `python setup.py build_ext --inplace` and then copy the binary or other files directly: `cp accl_process_group/_c/ProcessGroupACCL.cpython-38-x86_64-linux-gnu.so ~/.local/lib/python3.8/site-packages/accl_process_group/_c/`
+- Recompilation using `./install` or `pip install .` can be very slow; you can run `python setup.py build_ext --inplace` and then copy the binary or other files directly: `cp accl_process_group/_c/ProcessGroupACCL.cpython-38-x86_64-linux-gnu.so ~/.local/lib/python3.8/site-packages/accl_process_group/_c/`
 - The `install.py` script will not reinstall the driver in case of ACCL updates. You will need to rebuild it yourself
 - Set `ACCL_DEBUG=1` if you want more output (also set it during the build). Stdout is sometimes not complete (in the simulator), so best log most things to stderr
+- The run script currently just outputs the command to be run (better not use the `&` at the end), which you then run manually. This is because I had bad experiences with the missing output (maybe coinciding with the issues mentioned above) and with termination on multiple machines, but it should also work if you comment out the `exit 0` and the `&` at the end of mpirun. Don't forget that you should still run the script to clear the log files.
 - ACCL only supports sizes up to 4 MB; if you give it larger tensors, the PG will try to segment them along the first dim (see the sketch below). Not all collectives correctly handle multi-dimensional tensors yet.
-- Setting up the simulator with 4 MB buffers takes a long time, so better set the size lower.
+- Setting up the simulator with 4 MB buffers takes a long time, so better set the size lower for quick tests.
+- You can init the process group as if it were udp and run on a `cyt_rdma` simulator
+- There is no reason not to support the rdma + SIM initialization. It just hasn't been implemented yet. Certain case splits assume no-sim if cyt_rdma is given...

 ### How to install torchvision

 - install torch using the script
 - clone vision, go to the fitting version v0.16.0
 - clone libpng, configure with prefix set to a local directory
 - add the bin to the path
 - not sure if needed: supply the path of the library and include to torchvision as in their development doc
 - disable the version check in torchvision setup.py, because it doesn't correctly parse the version.
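Since the test scripts already initialize mpi4py next to the ACCL group, the effect of the sidestep toggles mentioned above can also be approximated from Python without recompiling, by cross-checking a collective against its OpenMPI equivalent. A sketch under that assumption (`check_allreduce` is a hypothetical helper, not part of the repo; it expects the ACCL process group to be initialized as in `test/test-generic.py`):

```python
import numpy as np
import torch
import torch.distributed as dist
from mpi4py.MPI import COMM_WORLD as mpi

def check_allreduce(shape=(64,)):
    x = torch.full(shape, float(mpi.Get_rank()))
    accl_out = x.clone()
    dist.all_reduce(accl_out, dist.ReduceOp.SUM)  # goes through the ACCL process group
    ref = np.zeros(x.numel(), dtype=np.float32)
    mpi.Allreduce(x.numpy().ravel(), ref)         # OpenMPI reference (default op is SUM)
    np.testing.assert_allclose(accl_out.numpy().ravel(), ref)
```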
- run vision setup.py with debug, include, library and use png flags +### Tests available +Check `test/run.sh` for ACCL_SCRIPT examples + +- `test-generic.py` tests everything in isolation + a small dual layer model learning a linear function +- `test-mnist.py` should be able to be run non-distributed as well(check arguments) +- `test-imagenet.py` does finetuning of Resnet50 according to: and should alse be able to be run non-distributed +- For DLRM you will need to use a small fork of the DLRM-repo with ACCL-support hosted at . It contains a `run.sh` diff --git a/integrations/pytorch_ddp/run.sh b/integrations/pytorch_ddp/run.sh deleted file mode 100755 index 58d137d6..00000000 --- a/integrations/pytorch_ddp/run.sh +++ /dev/null @@ -1,134 +0,0 @@ -#!/bin/bash - -#check working directory -if [[ $(pwd) != *pytorch_ddp ]]; then - echo "ERROR: this script should only be run in the pytorch_ddp dir of the repo!" - exit 1 -fi - -if [[ -v ACCL_SCRIPT ]]; then - SCRIPT_NAME="$ACCL_SCRIPT" -else - SCRIPT_NAME=test-generic.py - echo "Variable ACCL_SCRIPT not set. Assuming $SCRIPT_NAME" -fi - -# state variables -mkdir -p "$(pwd)/accl_log" -# BUILD_DIR=../build -# point this to python venv, which has the relevant libraries installed -VENV_ACTIVATE=$(pwd)/venv/bin/activate -SETUP_SH=$(pwd)/setup.sh -SCRIPT=$(pwd)/test/$SCRIPT_NAME -HOST_FILE=./accl_log/host -FPGA_FILE=./accl_log/fpga - -#enter venv and run script -EXEC="bash -c \"source $VENV_ACTIVATE && source $SETUP_SH && python $SCRIPT" -# EXEC="python $SCRIPT" - - -#---------------Setting up vars------------- -if [[ $ACCL_SIM -eq 1 ]]; then - echo "Starting in simulator mode. Make sure to start the emulator beforehand" - ARG="-s " - - ACCL_COMMS="udp" - - echo "assuming udp comms in simulator" - - if [[ -v ACCL_NP ]]; then - NUM_PROCESS="$ACCL_NP" - else - echo "Variable ACCL_NP not set. Enter num of processes:" - read -a NUM_PROCESS - fi - - MASTER_IP="localhost" - MASTER_PORT="30501" - -else - echo "Starting in hw mode. Make sure to run flow_u55c beforehand." - if [[ -v U55C_IDS ]]; then - IFS=' ' read -r -a SERVID <<< "$U55C_IDS" - else - # read server ids from user - echo "Variable U55C_IDS not set. Enter u55c machine ids (space separated):" - read -a SERVID - fi - - if ! 
[[ -v ACCL_COMMS ]]; then - ACCL_COMMS="cyt_rdma" - echo "Assuming cyt_rdma comms in hardware" - fi - - RANK_PORT="30501" - # create ip files - rm -f $HOST_FILE $FPGA_FILE - NUM_PROCESS=0 - for ID in ${SERVID[@]}; do - echo "10.253.74.$(((ID-1) * 4 + 66))">>$HOST_FILE - echo "10.253.74.$(((ID-1) * 4 + 68))">>$FPGA_FILE - NUM_PROCESS=$((NUM_PROCESS+1)) - HOST_LIST+="alveo-u55c-$(printf "%02d" $ID) " - HOST_PORT_LIST+="alveo-u55c-$(printf "%02d" $ID):$RANK_PORT " - done - - echo "HOST_LIST: ${HOST_LIST[*]}" - - #set master address - MASTER_IP="10.253.74.$(((${SERVID[0]}-1) * 4 + 66))" - MASTER_PORT="30505" - - echo "Master node set to: $MASTER_IP:$MASTER_PORT" - - MPI_ARGS="-f $HOST_FILE --iface ens4f0" -fi - -ARG="$ARG -c $ACCL_COMMS -i $HOST_FILE -f $FPGA_FILE -a $MASTER_IP -p $MASTER_PORT\"" - -#---------------Running it------------- - -echo "Run command: $EXEC $ARG" - -echo "Running with $NUM_PROCESS Processes" - -rm -f $(pwd)/accl_log/rank* - -C="mpirun -n $NUM_PROCESS $MPI_ARGS -outfile-pattern \"$(pwd)/accl_log/rank_%r_M_${MODE}_N_${N}_H_${H}_P_${P}_stdout\" $EXEC $ARG &" -# C="mpirun -n $NUM_PROCESS $MPI_ARGS -outfile-pattern \"$(pwd)/accl_log/rank_%r_M_${MODE}_N_${N}_H_${H}_P_${P}_stdout\" -errfile-pattern \"$(pwd)/accl_log/rank_%r_M_${MODE}_N_${N}_H_${H}_P_${P}_stderr\" $EXEC $ARG &" -# C="mpirun -n $NUM_PROCESS -f $HOST_FILE --iface ens4f0 $EXEC $ARG &" -echo $C - -/bin/sh -c "$C" - -if ! [[ -v SLEEPTIME ]]; then - SLEEPTIME="16" -fi -echo "Sleeping for $SLEEPTIME" -sleep $SLEEPTIME - -if ! [[ $ACCL_SIM -eq 1 ]]; then - parallel-ssh -H "$HOST_LIST" "killall -9 $SCRIPT_NAME" - parallel-ssh -H "$HOST_LIST" "dmesg | grep "fpga_tlb_miss_isr" >$(pwd)/accl_log/tlb_miss.log" -else - killall -9 $SCRIPT_NAME - dmesg | grep "fpga_tlb_miss_isr" >$(pwd)/accl_log/tlb_miss.log -fi - -mkdir -p "$(pwd)/accl_results" -# Loop through accl log files in the source directory and append to accl_results folder -for source_log in "$(pwd)/accl"*.log; do - # Extract the log number from the source log file name (assuming the format is acclX.log) - log_number=$(basename "${source_log}" | sed 's/accl\([0-9]*\)\.log/\1/') - # Create the destination log file path - destination_log="$(pwd)/accl_results/accl${log_number}.log" - # Append the content of the source log to the destination log - cat "${source_log}" >> "${destination_log}" - # Remove the tmp log - rm ${source_log} -done - - - - From 75da95e865c34bb14a1b25357fd121eef5dc00c5 Mon Sep 17 00:00:00 2001 From: Laurent Wirz Date: Fri, 28 Jun 2024 13:23:24 +0200 Subject: [PATCH 29/64] Added Benchmarking --- .../pytorch_ddp/src/ProcessGroupACCL.cpp | 73 ++++++-- integrations/pytorch_ddp/test/test-generic.py | 177 +++++++++++------- 2 files changed, 166 insertions(+), 84 deletions(-) diff --git a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp index 52be864c..e85168f0 100644 --- a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp +++ b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp @@ -119,9 +119,12 @@ std::map mpiDatatype = { ACCL::debug("[" #opname "] Entering barrier"); \ accl->barrier(); \ ACCL::debug("Starting " #opname " of " + std::to_string(tensor.numel()) + " items"); \ - auto start = std::chrono::high_resolution_clock::now(); + std::chrono::time_point start; \ + if(coyote_enabled){ \ + start = std::chrono::high_resolution_clock::now(); \ + } -#define POST_REQUEST \ +#define POST_REQUEST(opname, n_bytes) \ double durationUs = 0.0; \ ACCL::debug("Waiting for request to complete."); \ bool ret = 
accl->wait(req, 20000ms); \ @@ -136,10 +139,13 @@ if(coyote_enabled){ \ durationUs = (double)accl->get_duration(req)/1000.0; \ if(durationUs > 1.0){ \ ACCL::debug("ACCL measured durationUs:" + std::to_string(durationUs)); \ + accl_pg_log(rank_, format_log(opname, size_, rank_, durationUs, n_bytes)); \ } \ } \ ACCL::debug("Finished waiting"); +#define TIMER_WRAP() + // Better logging // accl_log(mpi_rank, format_log("bcast", options, durationUs, 0)); \ @@ -205,6 +211,26 @@ std::map acclDatatype = { {at::kShort, ACCL::dataType::int32}, }; + +std::string format_log(std::string collective, int world_size, int rank, double time, int n_bytes) +{ + std::string log_str = collective + "," + std::to_string(world_size) + "," + std::to_string(rank) + "," + std::to_string(time) + "," + std::to_string(n_bytes); + return log_str; +} + +#define ACCL_PG_LOG_FILE(i) \ + (std::string("accl_log/accl_pg_") + i + std::string(".log")) + +void accl_pg_log(int rank, const std::string &message) { + std::string str_rank = std::to_string(rank); + std::string filename = ACCL_PG_LOG_FILE(str_rank); + std::ofstream outfile; + outfile.open(filename, std::ios::out | std::ios_base::app); + outfile << message << std::endl; + outfile.close(); +} + + // Checking the input tensor's validity void checkSingleTensorHelper(const at::Tensor &tensor) { if (!tensor.is_contiguous()) { @@ -1029,8 +1055,21 @@ void ProcessGroupACCL::run_broadcast(at::Tensor in_tensor, STANDARD_DECL //Should be split to output on non-root sometime - init_input_tensor(in_tensor, data, true, true, opts.rootRank); + // init_input_tensor(in_tensor, data, true, true, opts.rootRank); + // This case split is necessary, because otherwise data will be set to a nullptr + std::chrono::time_point start_init = std::chrono::high_resolution_clock::now(); + + if (opts.rootRank == rank_){ + init_input_tensor(in_tensor, data, true, false, opts.rootRank); + } + else{ + init_output_data(in_tensor, data, in_tensor.numel(), in_tensor.scalar_type(), false, true, opts.rootRank); + } + auto end_init = std::chrono::high_resolution_clock::now(); + double durationUs_init = (std::chrono::duration_cast(end_init-start_init).count() / 1000.0); + ACCL::debug("init tensor durationUs:" + std::to_string(durationUs_init)); + // Reserve device c10::DeviceGuard guard(in_tensor.device()); std::unique_lock globalLock(pgGlobalMutex_); @@ -1039,9 +1078,15 @@ void ProcessGroupACCL::run_broadcast(at::Tensor in_tensor, ACCL::ACCLRequest* req = accl->bcast(*data, in_tensor.numel(), opts.rootRank, ACCL::GLOBAL_COMM, true, true, get_compressed_type(in_tensor.scalar_type())); - POST_REQUEST - + POST_REQUEST("bcast", in_tensor.nbytes()) + + std::chrono::time_point start_copy = std::chrono::high_resolution_clock::now(); copy_back_tensor(in_tensor, data, true, true, opts.rootRank); + auto end_copy = std::chrono::high_resolution_clock::now(); + double durationUs_copy = (std::chrono::duration_cast(end_copy-start_copy).count() / 1000.0); + ACCL::debug("Copy tensor durationUs:" + std::to_string(durationUs_copy)); + + } c10::intrusive_ptr @@ -1067,7 +1112,7 @@ ProcessGroupACCL::broadcast(std::vector &tensors, if (tensor.nbytes() > bufsize) { size_t non_zero_dim_count = tensor.numel() / tensor.size(0); size_t n = bufsize / tensor.itemsize() / non_zero_dim_count; - ACCL::debug("[Broadcast] Segmenting tensor of size " + std::to_string(tensor.nbytes()) + " into " + std::to_string(n) + "-sized elements "); + ACCL::debug("[Broadcast] Segmenting tensor of size " + std::to_string(tensor.nbytes()) + " into " + 
std::to_string(n * non_zero_dim_count) + "-sized elements "); for (size_t i = 0; i < tensor.size(0); i += n) { ACCL::debug("part " + std::to_string(i) + "!"); size_t end = std::min(i + n, static_cast(tensor.size(0))); @@ -1102,7 +1147,7 @@ void ProcessGroupACCL::run_allreduce(at::Tensor in_tensor, ACCL::ACCLRequest* req = accl->allreduce(*data, *dstdata, in_tensor.numel(), acclOp.at(opts.reduceOp), ACCL::GLOBAL_COMM, true, true, get_compressed_type(in_tensor.scalar_type())); - POST_REQUEST + POST_REQUEST("allreduce", in_tensor.nbytes()) copy_back_tensor(in_tensor, dstdata, true, true); } @@ -1168,7 +1213,7 @@ void ProcessGroupACCL::run_reduce(at::Tensor in_tensor, ACCL::ACCLRequest* req = accl->reduce(*data, *dstdata, in_tensor.numel(), opts.rootRank, acclOp.at(opts.reduceOp), ACCL::GLOBAL_COMM, true, true, get_compressed_type(in_tensor.scalar_type())); - POST_REQUEST + POST_REQUEST("reduce", in_tensor.nbytes()) copy_back_tensor(in_tensor, dstdata, true, false, opts.rootRank); } @@ -1218,7 +1263,7 @@ void ProcessGroupACCL::run_allgather( ACCL::ACCLRequest* req = accl->allgather(*srcdata, *dstdata, in_tensor.numel(), ACCL::GLOBAL_COMM, true, true, get_compressed_type(in_tensor.scalar_type())); - POST_REQUEST + POST_REQUEST("allgather", in_tensor.nbytes()) copy_back_tensorvec(dsttensorvec, dstdata, dsttensor, in_tensor.numel(), true, true); @@ -1317,7 +1362,7 @@ void ProcessGroupACCL::run_gather(at::Tensor in_tensor, ACCL::GLOBAL_COMM, true, true, get_compressed_type(in_tensor.scalar_type())); - POST_REQUEST + POST_REQUEST("gather", in_tensor.nbytes()) copy_back_tensorvec(dsttensorvec, dstdata, dsttensor, in_tensor.numel(), true, false, opts.rootRank); @@ -1433,7 +1478,7 @@ void ProcessGroupACCL::run_scatter(std::vector &in_tensor_vec, // Run scatter ACCL::ACCLRequest* req = accl->scatter(*in_data, *out_data, out_tensor.numel(), opts.rootRank, ACCL::GLOBAL_COMM, true, true, get_compressed_type(dsttensor.scalar_type())); - POST_REQUEST + POST_REQUEST("scatter", out_tensor.nbytes()) copy_back_tensor(out_tensor, out_data, true, true, opts.rootRank); } @@ -1559,7 +1604,7 @@ void ProcessGroupACCL::run_alltoall(at::Tensor in_tensor, ACCL::ACCLRequest* req = accl->alltoall(*srcdata, *dstdata, in_tensor.numel()/size_, ACCL::GLOBAL_COMM, true, true, get_compressed_type(in_tensor.scalar_type())); - POST_REQUEST + POST_REQUEST("alltoall", in_tensor.nbytes()/size_) copy_back_tensor(out_tensor, dstdata, true, true); @@ -1638,7 +1683,7 @@ void ProcessGroupACCL::run_send(at::Tensor in_tensor, int dstRank, ACCL::ACCLRequest* req = accl->send(*data, in_tensor.numel(), dstRank, tag, ACCL::GLOBAL_COMM, true, get_compressed_type(in_tensor.scalar_type())); - POST_REQUEST + POST_REQUEST("send", in_tensor.nbytes()) } c10::intrusive_ptr @@ -1681,7 +1726,7 @@ void ProcessGroupACCL::run_recv(at::Tensor out_tensor, int srcRank, ACCL::ACCLRequest* req = accl->recv(*dstdata, out_tensor.numel(), srcRank, tag, ACCL::GLOBAL_COMM, true, get_compressed_type(out_tensor.scalar_type())); - POST_REQUEST + POST_REQUEST("recv", out_tensor.nbytes()) copy_back_tensor(out_tensor, dstdata, true, true); } diff --git a/integrations/pytorch_ddp/test/test-generic.py b/integrations/pytorch_ddp/test/test-generic.py index e81c7864..c241b2d0 100644 --- a/integrations/pytorch_ddp/test/test-generic.py +++ b/integrations/pytorch_ddp/test/test-generic.py @@ -21,6 +21,7 @@ import os import sys import logging +import time from mpi4py.MPI import COMM_WORLD as mpi import torch @@ -49,21 +50,22 @@ size = 0 count = 16 -shape = (64,) -num_el = 64 
+num_el = 256 +shape = (num_el,) #As in test.cpp defaults rxbufsize = 4096 * 1024 def test_broadcast_segment(): - with torch.profiler.record_function("test bcast segmented"): - global num_errors - shape_segment = (1024 * 1024,) - if rank == 0: - x = torch.ones(shape_segment, dtype=torch.float) - else: - x = torch.zeros(shape_segment, dtype=torch.float) + global num_errors + shape_segment = (1024 * 1024,) + if rank == 0: + x = torch.ones(shape_segment, dtype=torch.float) + else: + x = torch.zeros(shape_segment, dtype=torch.float) + with torch.profiler.record_function("test bcast segmented"): + dist.broadcast(x, 0) mpi.Barrier() @@ -79,21 +81,37 @@ def test_broadcast_segment(): logger.debug("Test broadcast finished!") def test_broadcast(): - with torch.profiler.record_function("test bcast double prec"): - global num_errors - if rank == 0: - x = torch.ones(shape, dtype=torch.double) - else: - x = torch.zeros(shape, dtype=torch.double) + global num_errors + if rank == 0: + x = torch.ones(shape) + else: + x = torch.zeros(shape) - dist.broadcast(x, 0) + for i in range(10): + with torch.profiler.record_function("test bcast " + str(i)): + + start_time = time.perf_counter() + + dist.broadcast(x, 0) + + end_time = time.perf_counter() + + measured_time = (end_time - start_time) * 1000000 + + logger.debug("Directly measured time us 1:" + str(measured_time)) + + mpi.Barrier() + + end_time = time.perf_counter() + + measured_time = (end_time - start_time) * 1000000 + + logger.debug("Directly measured time us 2:" + str(measured_time)) - mpi.Barrier() - # logger.debug('Tensor after broadcast: ' + str(x)) # print('Tensor after broadcast: ' + str(x)) try: - np.testing.assert_allclose(x, torch.ones(shape, dtype=torch.double)) + np.testing.assert_allclose(x, torch.ones(shape)) except AssertionError as e: num_errors = num_errors + 1 logger.debug("Test Broadcast failed") @@ -102,15 +120,15 @@ def test_broadcast(): logger.debug("Test broadcast finished!") def test_broadcast_2(): - with torch.profiler.record_function("test bcast float prec"): - test_type = torch.float - shape_2 = (204, 2) - global num_errors - if rank == 0: - x = torch.ones(shape_2, dtype=test_type) - else: - x = torch.zeros(shape_2, dtype=test_type) + test_type = torch.float + shape_2 = (204, 2) + global num_errors + if rank == 0: + x = torch.ones(shape_2, dtype=test_type) + else: + x = torch.zeros(shape_2, dtype=test_type) + with torch.profiler.record_function("test bcast float prec"): dist.broadcast(x, 0) mpi.Barrier() @@ -127,14 +145,16 @@ def test_broadcast_2(): def test_sendrcv(): - with torch.profiler.record_function("test_sendrcv"): - global num_errors - x = torch.full(shape, float(rank)) + global num_errors + x = torch.full(shape, float(rank)) + + y = torch.empty(shape) - y = torch.empty(shape) + prev_rank = (rank - 1) % size + next_rank = (rank + 1) % size - prev_rank = (rank - 1) % size - next_rank = (rank + 1) % size + + with torch.profiler.record_function("test_sendrcv"): if rank % 2: dist.send(x, next_rank) @@ -154,14 +174,15 @@ def test_sendrcv(): def test_scatter(): - with torch.profiler.record_function("test_scatter"): - global num_errors - if rank == 0: - x = [torch.full(shape, float(i+1)) for i in range(size)] - else: - x = None - y = torch.full(shape, float(0)) + global num_errors + if rank == 0: + x = [torch.full(shape, float(i+1)) for i in range(size)] + else: + x = None + y = torch.full(shape, float(0)) + with torch.profiler.record_function("test_scatter"): + dist.scatter(y, x, 0) mpi.Barrier() try: @@ -176,15 +197,16 @@ def 
test_scatter(): def test_gather(): - with torch.profiler.record_function("test_gather"): - global num_errors - x = torch.full(shape, float(rank)) + global num_errors + x = torch.full(shape, float(rank)) - if rank == 0: - y = [torch.empty(shape) for _ in range(size)] - else: - y = None + if rank == 0: + y = [torch.empty(shape) for _ in range(size)] + else: + y = None + with torch.profiler.record_function("test_gather"): + dist.gather(x, y, 0) mpi.Barrier() if rank == 0: @@ -200,12 +222,14 @@ def test_gather(): def test_allgather(): - with torch.profiler.record_function("test_allgather"): - global num_errors - shape_gather = (1,) - x = torch.full(shape_gather, float(rank), dtype=torch.double) - y = [torch.empty(shape_gather, dtype=torch.double) for _ in range(size)] + global num_errors + shape_gather = (1,) + x = torch.full(shape_gather, float(rank), dtype=torch.double) + y = [torch.empty(shape_gather, dtype=torch.double) for _ in range(size)] + with torch.profiler.record_function("test_allgather"): + + dist.all_gather(y, x) mpi.Barrier() for i, c in enumerate(y): @@ -221,9 +245,10 @@ def test_allgather(): def test_reduce(): + global num_errors + x = torch.ones(shape) + with torch.profiler.record_function("test_reduce"): - global num_errors - x = torch.ones(shape) dist.reduce(x, 0, dist.ReduceOp.SUM) mpi.Barrier() @@ -239,10 +264,11 @@ def test_reduce(): def test_allreduce(): - with torch.profiler.record_function("test_allreduce"): - global num_errors - x = torch.ones(shape) + global num_errors + x = torch.ones(shape) + with torch.profiler.record_function("test_allreduce"): + dist.all_reduce(x, dist.ReduceOp.SUM) mpi.Barrier() @@ -257,14 +283,14 @@ def test_allreduce(): def test_alltoall(): - with torch.profiler.record_function("test_alltoall"): - global num_errors - - input = torch.arange(count, dtype=torch.float) + float(rank) * count + global num_errors + input = torch.arange(count, dtype=torch.float) + float(rank) * count - output = torch.ones(count) + output = torch.ones(count) + with torch.profiler.record_function("test_alltoall"): + dist.all_to_all_single(output, input) mpi.Barrier() @@ -307,6 +333,7 @@ def __init__(self, size): out_feature = torch.zeros(5) for j in range(10): in_feature[j] = float((i^2 + j) % 5) + # try to learn a linear function of the input, to make sure it's parameterizable out_feature[j//2] = out_feature[j//2] + float(((i^2 + j) % 5) * 3 * ( -1 ** (j % 2))) self.data.append((in_feature, out_feature)) @@ -333,7 +360,7 @@ def demo_basic(rank: int): with torch.profiler.record_function("basic 2 Layer NN"): model = ToyModel() - ddp_model = DDP(model) + ddp_model = DDP(model, bucket_cap_mb=4) # ddp_model = DDP(model, bucket_cap_mb=4, broadcast_buffers=False) train_set = MyTrainDataset(2048) # load your dataset @@ -343,7 +370,7 @@ def demo_basic(rank: int): loss_fn = nn.MSELoss() optimizer = optim.Adam(ddp_model.parameters(), lr=0.005) - max_epochs = 20 + max_epochs = 200 for epoch in range(max_epochs): batch_size = len(next(iter(train_data))[0]) train_data.sampler.set_epoch(epoch) @@ -420,6 +447,7 @@ def start_test(comms: str, simulator: bool, host_file: str=None, fpga_file: str= mpi.Barrier() accl.create_process_group(ranks, design, bufsize=rxbufsize, initialize=True, simulation=simulator) + # dist.init_process_group("mpi", rank=rank, world_size=size) dist.init_process_group("ACCL", rank=rank, world_size=size) global num_errors num_errors = 0 @@ -428,18 +456,27 @@ def start_test(comms: str, simulator: bool, host_file: str=None, fpga_file: str= # test_allgather() # 
test_broadcast_segment() - test_broadcast() + # test_broadcast() + # test_broadcast() + # test_broadcast() + # test_broadcast() # test_broadcast() # test_broadcast_2() # test_sendrcv() - test_scatter() - test_gather() + # test_scatter() + # test_gather() # test_allgather() # test_alltoall() - # test_reduce() # test_allreduce() - # demo_basic(rank) - # mpi.Barrier() + # test_allreduce() + # test_allreduce() + # test_allreduce() + + test_reduce() + demo_basic(rank) + + + mpi.Barrier() if num_errors == 0: print("======== Successfully Finished testing======") From d3a8d5c6ff81adb6aab00fa82b8205d43f1feaaf Mon Sep 17 00:00:00 2001 From: Laurent Wirz Date: Fri, 28 Jun 2024 13:27:34 +0200 Subject: [PATCH 30/64] Added multidimensional support for Gather and Allgather --- integrations/pytorch_ddp/src/ProcessGroupACCL.cpp | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp index e85168f0..e59ee2ca 100644 --- a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp +++ b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp @@ -1312,10 +1312,11 @@ ProcessGroupACCL::allgather(std::vector> &outputTensors, auto &dsttensors = entry->dst; // Segment data if necessary if (srctensor.nbytes() > bufsize) { - size_t n = bufsize / srctensor.itemsize(); - for (size_t i = 0; i < srctensor.numel(); i += n) { + size_t non_zero_dim_count = srctensor.numel() / srctensor.size(0); + size_t n = bufsize / srctensor.itemsize() / non_zero_dim_count; + for (size_t i = 0; i < srctensor.size(0); i += n) { size_t end = - std::min(i + n, static_cast(srctensor.numel())); + std::min(i + n, static_cast(srctensor.size(0))); std::vector dsttensorslices; dsttensorslices.reserve(dsttensors.size()); for (auto &dsttensor : dsttensors) { @@ -1428,10 +1429,12 @@ ProcessGroupACCL::gather(std::vector> &outputTensors, auto &dsttensors = entry->dst; // Segment data if necessary if (srctensor.nbytes() > bufsize) { - size_t n = bufsize / srctensor.itemsize(); - for (size_t i = 0; i < srctensor.numel(); i += n) { + size_t non_zero_dim_count = srctensor.numel() / srctensor.size(0); + size_t n = bufsize / srctensor.itemsize() / non_zero_dim_count; + ACCL::debug("[Gather] Segmenting tensor of size " + std::to_string(tensor.nbytes()) + " into " + std::to_string(n * non_zero_dim_count) + "-sized elements "); + for (size_t i = 0; i < srctensor.size(0); i += n) { size_t end = - std::min(i + n, static_cast(srctensor.numel())); std::vector dsttensorslices; + std::min(i + n, static_cast(srctensor.size(0))); std::vector dsttensorslices; dsttensorslices.reserve(dsttensors.size()); for (auto &dsttensor : dsttensors) { dsttensorslices.emplace_back(dsttensor.slice(0, i, end)); From d3537644233f4e3cbc59f8a20467b6ff514c90d8 Mon Sep 17 00:00:00 2001 From: Laurent Wirz Date: Sat, 29 Jun 2024 10:27:14 +0200 Subject: [PATCH 31/64] MNIST fixes --- integrations/pytorch_ddp/test/test-mnist.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/integrations/pytorch_ddp/test/test-mnist.py b/integrations/pytorch_ddp/test/test-mnist.py index c82650e7..b26bff82 100644 --- a/integrations/pytorch_ddp/test/test-mnist.py +++ b/integrations/pytorch_ddp/test/test-mnist.py @@ -85,7 +85,7 @@ def train(num_epochs, cnn, loaders): optimizer.step() # if (i+1) % 100 == 0: - # if True: + if True: print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}' .format(epoch + 1, num_epochs, i + 1, total_step, loss.item())) @@ -99,8 +99,12 @@ def test(): 
        for images, labels in loaders['test']:
             test_output, last_layer = cnn(images)
             pred_y = torch.max(test_output, 1)[1].data.squeeze()
-            accuracy = (pred_y == labels).sum().item() / float(labels.size(0))
-    print('Test Accuracy of the model on the 10000 test images: %.2f' % accuracy)
+            correct_current = (pred_y == labels).sum().item()
+            total += labels.size(0)
+            correct += correct_current
+
+            print(f'Test Batch accuracy: {correct_current}/{labels.size(0)} {correct_current/float(labels.size(0))}')
+    print(f'Total accuracy: {correct}/{total} {correct/float(total)}')
 
 
 if __name__ == "__main__":
@@ -161,7 +165,7 @@ def test():
         ranks = [accl.Rank(a, start_port + i, 0, rxbufsize) for i, a in enumerate(fpga_ips)]
     else:
         # Somehow the simulator gets stuck if I use the same rxbufsize
-        rxbufsize = 4096# * 1024
+        rxbufsize = 4096 * 1024
         ranks = [accl.Rank("127.0.0.1", 5500 + i, i, rxbufsize) for i in range(size)]
     logger.debug(f'Ranks: {ranks}')
@@ -208,7 +212,7 @@ def test():
     }
 
     cnn = CNN()
-    if args.d : cnn = DDP(cnn)
+    if args.d : cnn = DDP(cnn, bucket_cap_mb=4)
 
     loss_func = nn.CrossEntropyLoss()

From fe43a32896ec93ae8ddda47e33b0710d1063fcf3 Mon Sep 17 00:00:00 2001
From: Laurent Wirz
Date: Mon, 1 Jul 2024 10:25:06 +0200
Subject: [PATCH 32/64] Removed ACCL measured duration to avoid wait time

---
 integrations/pytorch_ddp/src/ProcessGroupACCL.cpp | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp
index e59ee2ca..2ead44d3 100644
--- a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp
+++ b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp
@@ -135,12 +135,6 @@ if(coyote_enabled){ \
   auto end = std::chrono::high_resolution_clock::now(); \
   durationUs = (std::chrono::duration_cast(end-start).count() / 1000.0); \
   ACCL::debug("host measured durationUs:" + std::to_string(durationUs)); \
-  std::this_thread::sleep_for(10ms); \
-  durationUs = (double)accl->get_duration(req)/1000.0; \
-  if(durationUs > 1.0){ \
-    ACCL::debug("ACCL measured durationUs:" + std::to_string(durationUs)); \
-    accl_pg_log(rank_, format_log(opname, size_, rank_, durationUs, n_bytes)); \
-  } \
 } \
 ACCL::debug("Finished waiting");

From 496c13b157d4512aa6c67b36f8fb8176610a17cc Mon Sep 17 00:00:00 2001
From: Laurent Wirz
Date: Mon, 1 Jul 2024 10:31:34 +0200
Subject: [PATCH 33/64] Added multidim tensor segmentation for AlltoAll

---
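An oversized all-to-all can no longer be sliced as one flat buffer, because a
flat slice mixes rows that belong to different destination ranks. The hunks
below therefore slice each rank's section of dimension 0 in lockstep. Not part
of the change itself: a minimal Python sketch of that index arithmetic, with
made-up sizes (n, entry_size and bufpos mirror the variables in alltoall_base;
a real run uses bufsize = 4096 * 1024):

    bufsize = 16 * 1024        # deliberately small so segmentation kicks in
    world_size = 4             # size_
    rows, cols = 1024, 16      # dsttensor.size(0) and flattened trailing dims
    itemsize = 4               # float32

    non_zero_dim_count = cols                    # numel // size(0)
    n = bufsize // itemsize // world_size // non_zero_dim_count
    entry_size = rows // world_size              # rows in each rank's section

    calls = []
    for i in range(0, entry_size, n):
        end = min(i + n, entry_size)
        # one segmented alltoall per iteration: rows [i, end) of every section
        calls.append([(j * entry_size + i, j * entry_size + end)
                      for j in range(world_size)])

    print(len(calls), "segmented calls")  # 4 with these sizes
    print(calls[0])                       # [(0, 64), (256, 320), (512, 576), (768, 832)]

Each segmented call still hands ACCL one contiguous block per peer, which is
the layout dist.all_to_all_single assumes.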
 .../pytorch_ddp/include/ProcessGroupACCL.hpp  |  4 ++
 .../pytorch_ddp/src/ProcessGroupACCL.cpp      | 61 ++++++++++++++++---
 integrations/pytorch_ddp/test/test-generic.py | 34 ++++++-----
 3 files changed, 78 insertions(+), 21 deletions(-)

diff --git a/integrations/pytorch_ddp/include/ProcessGroupACCL.hpp b/integrations/pytorch_ddp/include/ProcessGroupACCL.hpp
index ddc97327..b1cd7582 100644
--- a/integrations/pytorch_ddp/include/ProcessGroupACCL.hpp
+++ b/integrations/pytorch_ddp/include/ProcessGroupACCL.hpp
@@ -276,7 +276,11 @@ class TORCH_API ProcessGroupACCL : public ProcessGroup {
                   const GatherOptions &opts);
   void run_scatter(std::vector &in_tensors,
                    at::Tensor dsttensor, const ScatterOptions &opts);
+
   void run_alltoall(at::Tensor in_tensor,
                     at::Tensor dsttensor, const AllToAllOptions &opts);
+
+  void run_alltoall_vec(std::vector &in_tensor_vec,
+                    std::vector &out_tensor_vec, const AllToAllOptions &opts);
 
   ACCL::dataType get_compressed_type(c10::ScalarType datatype);
 
diff --git a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp
index 2ead44d3..7e05c1e2 100644
--- a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp
+++ b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp
@@ -1425,10 +1425,11 @@ ProcessGroupACCL::gather(std::vector> &outputTensors,
         if (srctensor.nbytes() > bufsize) {
           size_t non_zero_dim_count = srctensor.numel() / srctensor.size(0);
           size_t n = bufsize / srctensor.itemsize() / non_zero_dim_count;
-          ACCL::debug("[Gather] Segmenting tensor of size " + std::to_string(tensor.nbytes()) + " into " + std::to_string(n * non_zero_dim_count) + "-sized elements ");
+          ACCL::debug("[Gather] Segmenting tensor of size " + std::to_string(srctensor.nbytes()) + " into " + std::to_string(n * non_zero_dim_count) + "-sized elements ");
           for (size_t i = 0; i < srctensor.size(0); i += n) {
             size_t end =
-                std::min(i + n, static_cast(srctensor.size(0))); std::vector dsttensorslices;
+                std::min(i + n, static_cast(srctensor.size(0)));
+            std::vector dsttensorslices;
             dsttensorslices.reserve(dsttensors.size());
             for (auto &dsttensor : dsttensors) {
               dsttensorslices.emplace_back(dsttensor.slice(0, i, end));
@@ -1607,6 +1608,32 @@ void ProcessGroupACCL::run_alltoall(at::Tensor in_tensor,
 
 }
 
+
+void ProcessGroupACCL::run_alltoall_vec(std::vector &in_tensor_vec,
+                                        std::vector &out_tensor_vec,
+                                        const AllToAllOptions &opts) {
+  std::unique_ptr in_data;
+  std::unique_ptr out_data;
+  at::Tensor dsttensor;
+
+  // Reserve device
+  c10::DeviceGuard guard(in_tensor_vec[0].device());
+  std::unique_lock globalLock(pgGlobalMutex_);
+
+  init_input_data_vec(in_tensor_vec, in_data, out_tensor_vec[0].options().device(c10::DeviceType::CPU), true, true);
+
+  init_output_tensor(in_tensor_vec[0], dsttensor, out_data, size_, in_tensor_vec[0].scalar_type(), true, true);
+
+  PRE_REQUEST(AlltoAll, in_tensor_vec[0])
+
+  ACCL::ACCLRequest* req = accl->alltoall(*in_data, *out_data, in_tensor_vec[0].numel(), ACCL::GLOBAL_COMM, true, true, get_compressed_type(in_tensor_vec[0].scalar_type()));
+
+  POST_REQUEST("alltoall", in_tensor_vec[0].nbytes())
+
+  copy_back_tensorvec(out_tensor_vec, out_data, dsttensor, in_tensor_vec[0].numel(), true, true);
+
+}
+
 c10::intrusive_ptr ProcessGroupACCL::alltoall_base(
     at::Tensor &outputTensor, at::Tensor &inputTensor,
     std::vector &outputSplitSizes,
@@ -1626,17 +1653,37 @@ c10::intrusive_ptr ProcessGroupACCL::alltoall_base(
       [opts, this](std::unique_ptr& entry) {
         auto srctensor = (entry->src)[0];
         auto dsttensor = (entry->dst)[0];
+
+
         // c10::DeviceGuard guard(srctensor.device());
         // std::unique_lock globalLock(pgGlobalMutex_);
         // Segment data if necessary
         if (dsttensor.nbytes() > bufsize) {
           ACCL::debug("dsttensor to large!");
-          size_t n = bufsize / dsttensor.itemsize();
-          for (size_t i = 0; i < dsttensor.numel(); i += n) {
+
+          // Split individual entries
+          size_t non_zero_dim_count = dsttensor.numel() / dsttensor.size(0);
+          size_t n = bufsize / dsttensor.itemsize() / size_ / non_zero_dim_count;
+          size_t entry_size = dsttensor.numel() / size_ / non_zero_dim_count;
+          for (size_t i = 0; i < entry_size; i += n) {
             ACCL::debug("part " + std::to_string(i) + "!");
-            size_t end =
-                std::min(i + n, static_cast(dsttensor.numel()));
-            run_alltoall(srctensor.slice(0, i, end), dsttensor.slice(0, i, end), opts);
+            size_t end = std::min(i + n, static_cast(entry_size));
+
+            std::vector srctensorslices;
+            srctensorslices.reserve(size_);
+            ACCL::debug("srctensorslices:");
+            for (int j = 0; j < size_; j++) {
+              int bufpos = j * entry_size;
+              srctensorslices.emplace_back(srctensor.slice(0, i + bufpos, end + bufpos));
+            }
+            std::vector dsttensorslices;
+            dsttensorslices.reserve(size_);
+            ACCL::debug("dsttensorslices:");
+            for (int j = 0; j < size_; j++) {
+              int bufpos = j * entry_size;
+              dsttensorslices.emplace_back(dsttensor.slice(0, i + bufpos, end + bufpos));
+            }
+            run_alltoall_vec(srctensorslices, dsttensorslices, opts);
           }
         } else {
           ACCL::debug("Running without segmentation");
diff --git a/integrations/pytorch_ddp/test/test-generic.py b/integrations/pytorch_ddp/test/test-generic.py
index c241b2d0..7dd1d815 100644
--- a/integrations/pytorch_ddp/test/test-generic.py
+++ b/integrations/pytorch_ddp/test/test-generic.py
@@ -49,9 +49,9 @@
 rank = 0
 size = 0
 
-count = 16
-num_el = 256
-shape = (num_el,)
+count = 256 * 1024
+num_el = 256 * 1024
+shape = (256,1024)
 
 #As in test.cpp defaults
 rxbufsize = 4096 * 1024
@@ -285,26 +285,31 @@ def test_allreduce():
 
 def test_alltoall():
     global num_errors
 
-    input = torch.arange(count, dtype=torch.float) + float(rank) * count
+    input = torch.arange(num_el, dtype=torch.float) + float(rank) * num_el
 
-    output = torch.ones(count)
+    input_shaped = input.reshape(shape)
+
+    output = torch.ones(num_el)
+
+    output_shaped = output.reshape(shape)
 
     with torch.profiler.record_function("test_alltoall"):
 
-        dist.all_to_all_single(output, input)
+        dist.all_to_all_single(output_shaped, input_shaped)
         mpi.Barrier()
 
-    test = torch.zeros(count)
+    test = torch.zeros(num_el)
 
-    section_size = int(count/size)
+    section_size = int(num_el/size)
     for section in range(size):
         for el in range(section_size):
-            test[section * section_size + el] = float(rank) * section_size + section * count + el
+            test[section * section_size + el] = float(rank) * section_size + section * num_el + el
 
+    test_shaped = test.reshape(shape)
     try:
-        np.testing.assert_allclose(output, test)
+        np.testing.assert_allclose(output_shaped, test_shaped)
     except AssertionError as e:
         num_errors = num_errors + 1
         logger.debug("Test AlltoAll failed")
@@ -312,7 +317,6 @@ def test_alltoall():
     else:
         logger.debug("Test AlltoAll finished!")
 
-
 class ToyModel(nn.Module):
     def __init__(self):
         super(ToyModel, self).__init__()

From 90e80b2eba0d64372421699ad4fefe5099c38be3 Mon Sep 17 00:00:00 2001
From: Laurent Wirz
Date: Mon, 1 Jul 2024 10:33:37 +0200
Subject: [PATCH 34/64] Further timestamping on bcast

---
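The hunks below bracket each stage of run_broadcast with host-side clocks, so
every broadcast reports init, copy-back and total times through ACCL::debug as
"... durationUs:" lines. A small Python sketch (not part of the change) for
summarizing those lines offline, assuming the debug output lands in the
per-rank stderr files that run.sh collects:

    import re
    from collections import defaultdict

    stages = defaultdict(list)
    pattern = re.compile(r"(init tensor|Copy tensor|Inner total tensor|Total bcast)"
                         r" durationUs:([0-9.]+)")

    # Path taken from run.sh's -errfile-pattern; adjust per rank.
    with open("accl_log/rank_0_stderr") as f:
        for line in f:
            m = pattern.search(line)
            if m:
                stages[m.group(1)].append(float(m.group(2)))

    for stage, values in stages.items():
        print(f"{stage}: n={len(values)} avg={sum(values)/len(values):.1f} us")

The per-stage breakdown makes it easy to see whether time goes into buffer
initialization, the collective itself, or the copy back to the tensor.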
ACCL::debug("dsttensorslices:"); + for (int j = 0; j < size_; j++) { + int bufpos = j * entry_size; + dsttensorslices.emplace_back(dsttensor.slice(0, i + bufpos, end + bufpos)); + } + run_alltoall_vec(srctensorslices, dsttensorslices, opts); } } else { ACCL::debug("Running without segmentation"); diff --git a/integrations/pytorch_ddp/test/test-generic.py b/integrations/pytorch_ddp/test/test-generic.py index c241b2d0..7dd1d815 100644 --- a/integrations/pytorch_ddp/test/test-generic.py +++ b/integrations/pytorch_ddp/test/test-generic.py @@ -49,9 +49,9 @@ rank = 0 size = 0 -count = 16 -num_el = 256 -shape = (num_el,) +count = 256 * 1024 +num_el = 256 * 1024 +shape = (256,1024) #As in test.cpp defaults rxbufsize = 4096 * 1024 @@ -285,26 +285,31 @@ def test_allreduce(): def test_alltoall(): global num_errors - input = torch.arange(count, dtype=torch.float) + float(rank) * count + input = torch.arange(num_el, dtype=torch.float) + float(rank) * num_el - output = torch.ones(count) + input_shaped = input.reshape(shape) + + output = torch.ones(num_el) + + output_shaped = output.reshape(shape) with torch.profiler.record_function("test_alltoall"): - dist.all_to_all_single(output, input) + dist.all_to_all_single(output_shaped, input_shaped) mpi.Barrier() - test = torch.zeros(count) + test = torch.zeros(num_el) - section_size = int(count/size) + section_size = int(num_el/size) for section in range(size): for el in range(section_size): - test[section * section_size + el] = float(rank) * section_size + section * count + el + test[section * section_size + el] = float(rank) * section_size + section * num_el + el + test_shaped = test.reshape(shape) try: - np.testing.assert_allclose(output, test) + np.testing.assert_allclose(output_shaped, test_shaped) except AssertionError as e: num_errors = num_errors + 1 logger.debug("Test AlltoAll failed") @@ -312,7 +317,6 @@ def test_alltoall(): else: logger.debug("Test AlltoAll finished!") - class ToyModel(nn.Module): def __init__(self): super(ToyModel, self).__init__() @@ -456,7 +460,7 @@ def start_test(comms: str, simulator: bool, host_file: str=None, fpga_file: str= # test_allgather() # test_broadcast_segment() - # test_broadcast() + test_broadcast() # test_broadcast() # test_broadcast() # test_broadcast() @@ -472,8 +476,10 @@ def start_test(comms: str, simulator: bool, host_file: str=None, fpga_file: str= # test_allreduce() # test_allreduce() - test_reduce() - demo_basic(rank) + # test_reduce() + + + # demo_basic(rank) mpi.Barrier() From 90e80b2eba0d64372421699ad4fefe5099c38be3 Mon Sep 17 00:00:00 2001 From: Laurent Wirz Date: Mon, 1 Jul 2024 10:33:37 +0200 Subject: [PATCH 34/64] Furter timestamping on bcast --- integrations/pytorch_ddp/src/ProcessGroupACCL.cpp | 12 ++++++++++++ integrations/pytorch_ddp/test/run.sh | 12 +++++++----- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp index 7e05c1e2..fc4f6847 100644 --- a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp +++ b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp @@ -742,6 +742,7 @@ void ProcessGroupACCL::init_input_tensor(at::Tensor &tensor, std::unique_ptrbyte_array(), sizes, options); + for (const auto i : c10::irange(tensor_vec.size())) { if (p2p_applicable(*accl, tensor_vec[0], p2p_enabled)) { auto slice = data->slice(i * tens_size, (i + 1) * tens_size); @@ -786,6 +787,7 @@ void ProcessGroupACCL::init_output_data(at::Tensor &tensor_original, std::unique sizes.insert(sizes.begin(), 
num_tensors_s); total_size = total_size * num_tensors_s; } + if (p2p_applicable(*accl, tensor_original, p2p_enabled)) { dstdata = create_buffer_p2p(*accl, total_size, type); } else if (coyote_enabled) { @@ -1046,6 +1048,8 @@ c10::intrusive_ptr ProcessGroupACCL::enqueue( void ProcessGroupACCL::run_broadcast(at::Tensor in_tensor, const BroadcastOptions &opts) { + std::chrono::time_point start_inner = std::chrono::high_resolution_clock::now(); + STANDARD_DECL //Should be split to output on non-root sometime @@ -1065,6 +1069,8 @@ void ProcessGroupACCL::run_broadcast(at::Tensor in_tensor, ACCL::debug("init tensor durationUs:" + std::to_string(durationUs_init)); // Reserve device + + std::chrono::time_point start_lock = std::chrono::high_resolution_clock::now(); c10::DeviceGuard guard(in_tensor.device()); std::unique_lock globalLock(pgGlobalMutex_); @@ -1081,6 +1087,9 @@ void ProcessGroupACCL::run_broadcast(at::Tensor in_tensor, ACCL::debug("Copy tensor durationUs:" + std::to_string(durationUs_copy)); + auto end_inner = std::chrono::high_resolution_clock::now(); + double durationUs_inner = (std::chrono::duration_cast(end_inner-start_inner).count() / 1000.0); + ACCL::debug("Inner total tensor durationUs:" + std::to_string(durationUs_inner)); } c10::intrusive_ptr @@ -1116,6 +1125,9 @@ ProcessGroupACCL::broadcast(std::vector &tensors, ACCL::debug("[Broadcast] Broadcasting entire tensor of size " + std::to_string(tensor.nbytes()) + " without segmentation."); run_broadcast(tensor, opts); } + auto end = std::chrono::high_resolution_clock::now(); + double durationUs = (std::chrono::duration_cast(end-start).count() / 1000.0); + ACCL::debug("Total bcast durationUs:" + std::to_string(durationUs)); #endif }; auto entry = diff --git a/integrations/pytorch_ddp/test/run.sh b/integrations/pytorch_ddp/test/run.sh index 554c3757..be8b36db 100755 --- a/integrations/pytorch_ddp/test/run.sh +++ b/integrations/pytorch_ddp/test/run.sh @@ -9,7 +9,8 @@ fi if [[ -v ACCL_SCRIPT ]]; then SCRIPT_NAME="$ACCL_SCRIPT" else - # SCRIPT_NAME=test-mnist.py # MNIST + # SCRIPT_NAME="test-mnist.py -d True -n 2" # MNIST + # SCRIPT_NAME="test-imagenet.py -d True" SCRIPT_NAME=test-generic.py echo "Variable ACCL_SCRIPT not set. Assuming $SCRIPT_NAME" fi @@ -71,7 +72,7 @@ else for ID in ${SERVID[@]}; do echo "10.253.74.$(((ID-1) * 4 + 66))">>$HOST_FILE echo "10.253.74.$(((ID-1) * 4 + 68))">>$FPGA_FILE - NUM_PROCESS=$((NUM_PROCESS+1)) + NUM_PROCESS=$((NUM_PROCESS+1)) HOST_LIST+="alveo-u55c-$(printf "%02d" $ID) " HOST_PORT_LIST+="alveo-u55c-$(printf "%02d" $ID):$RANK_PORT " done @@ -84,9 +85,9 @@ else echo "Master node set to: $MASTER_IP:$MASTER_PORT" - MPI_ARGS="-f $HOST_FILE --iface ens4f0" # 09 and 10 have other interface names: - # MPI_ARGS="-f $HOST_FILE --iface ens4" + # MPI_ARGS="-f $HOST_FILE --iface ens4f0" + MPI_ARGS="-f $HOST_FILE --iface ens4" fi ARG="$ARG -c $ACCL_COMMS -i $HOST_FILE -f $FPGA_FILE -a $MASTER_IP -p $MASTER_PORT\"" @@ -98,6 +99,7 @@ echo "Run command: $EXEC $ARG" echo "Running with $NUM_PROCESS Processes" rm -f $(pwd)/accl_log/rank* +rm -f $(pwd)/accl_log/accl_pg_* # C="mpirun -n $NUM_PROCESS $MPI_ARGS -outfile-pattern \"$(pwd)/accl_log/rank_%r_stdout\" $EXEC $ARG &" C="mpirun -n $NUM_PROCESS $MPI_ARGS -outfile-pattern \"$(pwd)/accl_log/rank_%r_stdout\" -errfile-pattern \"$(pwd)/accl_log/rank_%r_stderr\" $EXEC $ARG &" @@ -109,7 +111,7 @@ exit 0 /bin/sh -c "$C" if ! 
[[ -v SLEEPTIME ]]; then - SLEEPTIME="16" + SLEEPTIME="32" fi echo "Sleeping for $SLEEPTIME" sleep $SLEEPTIME From f5a9508032662065130847c758238defeea03ebe Mon Sep 17 00:00:00 2001 From: Laurent Wirz Date: Sun, 21 Jul 2024 16:57:52 +0200 Subject: [PATCH 35/64] Using new change buffer type to reuse buffers --- .../pytorch_ddp/include/ProcessGroupACCL.hpp | 7 +- integrations/pytorch_ddp/setup.py | 5 +- .../pytorch_ddp/src/ProcessGroupACCL.cpp | 180 +++--- integrations/pytorch_ddp/test/run.sh | 8 +- integrations/pytorch_ddp/test/test-generic.py | 82 +-- .../test/torchvision/PennFudanDataset.py | 74 --- .../pytorch_ddp/test/torchvision/coco_eval.py | 192 ------ .../test/torchvision/coco_eval.py.1 | 192 ------ .../test/torchvision/coco_utils.py | 234 ------- .../test/torchvision/coco_utils.py.1 | 234 ------- .../pytorch_ddp/test/torchvision/engine.py | 115 ---- .../pytorch_ddp/test/torchvision/main.py | 100 --- .../test/torchvision/transforms.py | 601 ------------------ .../test/torchvision/transforms.py.1 | 601 ------------------ .../pytorch_ddp/test/torchvision/utils.py | 282 -------- .../pytorch_ddp/test/torchvision/utils.py.1 | 282 -------- 16 files changed, 134 insertions(+), 3055 deletions(-) delete mode 100644 integrations/pytorch_ddp/test/torchvision/PennFudanDataset.py delete mode 100644 integrations/pytorch_ddp/test/torchvision/coco_eval.py delete mode 100644 integrations/pytorch_ddp/test/torchvision/coco_eval.py.1 delete mode 100644 integrations/pytorch_ddp/test/torchvision/coco_utils.py delete mode 100644 integrations/pytorch_ddp/test/torchvision/coco_utils.py.1 delete mode 100644 integrations/pytorch_ddp/test/torchvision/engine.py delete mode 100644 integrations/pytorch_ddp/test/torchvision/main.py delete mode 100644 integrations/pytorch_ddp/test/torchvision/transforms.py delete mode 100644 integrations/pytorch_ddp/test/torchvision/transforms.py.1 delete mode 100644 integrations/pytorch_ddp/test/torchvision/utils.py delete mode 100644 integrations/pytorch_ddp/test/torchvision/utils.py.1 diff --git a/integrations/pytorch_ddp/include/ProcessGroupACCL.hpp b/integrations/pytorch_ddp/include/ProcessGroupACCL.hpp index b1cd7582..52c3223e 100644 --- a/integrations/pytorch_ddp/include/ProcessGroupACCL.hpp +++ b/integrations/pytorch_ddp/include/ProcessGroupACCL.hpp @@ -298,7 +298,9 @@ class TORCH_API ProcessGroupACCL : public ProcessGroup { static void acclExit(); void init_input_tensor(at::Tensor &tensor, std::unique_ptr &data, bool do_on_root, bool do_on_others, int opts_root_rank = 0); - + + void init_input_tensor_new(at::Tensor &tensor, ACCL::BaseBuffer *data, bool do_on_root, bool do_on_others, int opts_root_rank = 0); + void init_input_data_vec(std::vector &tensor_vec, std::unique_ptr &data, const at::TensorOptions &options, bool do_on_root, bool do_on_others, int opts_root_rank = 0); void init_output_data(at::Tensor &tensor_original, std::unique_ptr &dstdata, int out_tensor_size, c10::ScalarType type, bool do_on_root, bool do_on_others, int opts_root_rank = 0); @@ -336,6 +338,9 @@ class TORCH_API ProcessGroupACCL : public ProcessGroup { bool initialized; xrt::bo buf0; xrt::bo buf1; + + std::unique_ptr in_buf; + std::unique_ptr out_buf; }; } // namespace c10d diff --git a/integrations/pytorch_ddp/setup.py b/integrations/pytorch_ddp/setup.py index 10034004..5433a974 100755 --- a/integrations/pytorch_ddp/setup.py +++ b/integrations/pytorch_ddp/setup.py @@ -41,13 +41,11 @@ accl_utils_dir = driver_dir / 'utils' / 'accl_network_utils' vnx_dir = root / 'accl' / 'test' / 'refdesigns' / 
'xup_vitis_network_example' \ / 'xrt_host_api' -roce_dir = root / 'accl' / 'test' / 'refdesigns' / 'HiveNet' \ - / 'network' / 'roce_v2' / 'xrt_utils' include_dirs = [root / 'include', driver_dir / 'xrt' / 'include', accl_utils_dir / 'include', xrt_dir / 'include', root / 'accl' / 'test' / 'model' / 'zmq', - vnx_dir / 'include', roce_dir / 'include', + vnx_dir / 'include', root / 'accl' / 'test' / 'refdesigns' / 'Coyote' / 'sw' / 'include', '/pub/scratch/zhe/mpich/install/include', '/usr/include/jsoncpp'] @@ -56,7 +54,6 @@ sources = [root / 'src' / 'ProcessGroupACCL.cpp', root / 'src' / 'coyote_init.cpp', vnx_dir / 'src' / 'cmac.cpp', vnx_dir / 'src' / 'networklayer.cpp', - roce_dir / 'src' / 'cmac.cpp', roce_dir / 'src' / 'hivenet.cpp', accl_utils_dir / 'src' / 'accl_network_utils.cpp'] compile_args = ['-Wno-reorder', diff --git a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp index fc4f6847..2656bfb3 100644 --- a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp +++ b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp @@ -116,6 +116,8 @@ std::map mpiDatatype = { #define DO_COND ((do_on_root && opts_root_rank == rank_) || (do_on_others && opts_root_rank != rank_)) #define PRE_REQUEST(opname, tensor) \ + in_buf->change_type(convert_datatype_from_torch(tensor.scalar_type())); \ + out_buf->change_type(convert_datatype_from_torch(tensor.scalar_type())); \ ACCL::debug("[" #opname "] Entering barrier"); \ accl->barrier(); \ ACCL::debug("Starting " #opname " of " + std::to_string(tensor.numel()) + " items"); \ @@ -698,31 +700,18 @@ void accl_sa_handler(int) exit(EXIT_FAILURE); } +// TODO delete when not needed anymore void ProcessGroupACCL::init_input_tensor(at::Tensor &tensor, std::unique_ptr &data, bool do_on_root, bool do_on_others, int opts_root_rank) { if DO_COND { - if (p2p_applicable(*accl, tensor, p2p_enabled)) { - data = create_and_copy_p2p_buffer(*accl, tensor); - } else { - if (coyote_enabled) { - data = create_coyotebuffer(*accl, tensor.numel(), tensor.scalar_type()); - } else if (tensor.device().type() != c10::DeviceType::CPU) { - data = create_buffer(*accl, tensor.numel(), tensor.scalar_type()); - } else { - data = create_buffer(*accl, tensor); - } - if (coyote_enabled || tensor.device().type() != c10::DeviceType::CPU){ ACCL::debug("Copying data to CPU tensor of size " + std::to_string(tensor.numel())); at::Tensor wrapper_tensor = torch::from_blob(data->byte_array(), tensor.sizes(), tensor.options().device(c10::DeviceType::CPU)); wrapper_tensor.copy_(tensor); - } - } - // don't sync if no rank initializes, we will fill content and sync later - if (!coyote_enabled && (do_on_root || do_on_others)) { + + //TODO check if necessary in coyote data->sync_to_device(); + } - } else { - data = std::unique_ptr>(nullptr); - } + // don't sync if no rank initializes, we will fill content and sync later } void ProcessGroupACCL::init_input_data_vec(std::vector &tensor_vec, std::unique_ptr &data, const at::TensorOptions &options, bool do_on_root, bool do_on_others, int opts_root_rank) { @@ -733,13 +722,6 @@ void ProcessGroupACCL::init_input_tensor(at::Tensor &tensor, std::unique_ptrbyte_array(), sizes, options); @@ -761,6 +743,7 @@ void ProcessGroupACCL::init_input_tensor(at::Tensor &tensor, std::unique_ptr &dstdata, int out_tensor_size, c10::ScalarType type, bool do_on_root, bool do_on_others, int opts_root_rank) { if DO_COND { if (p2p_applicable(*accl, tensor_original, p2p_enabled)) { @@ -791,19 +774,16 @@ void 
ProcessGroupACCL::init_output_data(at::Tensor &tensor_original, std::unique if (p2p_applicable(*accl, tensor_original, p2p_enabled)) { dstdata = create_buffer_p2p(*accl, total_size, type); } else if (coyote_enabled) { - dstdata = create_coyotebuffer(*accl, total_size, type); // std::vector sizes = {static_cast(out_tensor_size)}; dsttensor = torch::from_blob(dstdata->byte_array(), sizes, tensor_original.options().device(c10::DeviceType::CPU)); // This should not be necessary: // dsttensor.copy_(tensor_original); } else { - dstdata = create_buffer(*accl, total_size, type); dsttensor = torch::from_blob(dstdata->byte_array(), sizes, tensor_original.options().device(c10::DeviceType::CPU)); // This should not be necessary: // dsttensor.copy_(tensor_original); } } else { - dstdata = std::unique_ptr>(nullptr); dsttensor = at::Tensor(nullptr); } } @@ -891,6 +871,9 @@ ProcessGroupACCL::ProcessGroupACCL( } else { throw std::runtime_error("Undefined ACCL design"); } + // create the two buffers, which are gonna be reused during calls + // We use float32, but they are gonna be filled arbitrarily + } // use xrt else{ @@ -950,6 +933,10 @@ void ProcessGroupACCL::initialize() { } ACCL::debug(std::string("[ACCL coyote] communicator: ") + accl->dump_communicator()); + + in_buf = accl->create_coyotebuffer(bufsize/sizeof(float), ACCL::dataType::float32); + out_buf = accl->create_coyotebuffer(bufsize/sizeof(float), ACCL::dataType::float32); + } else { ACCL::debug(std::string("Performing standard initialization")); accl = accl_network_utils::initialize_accl(ranks_, rank_, @@ -961,13 +948,18 @@ void ProcessGroupACCL::initialize() { // accl->set_rendezvous_threshold(16*1024); int devicemem = accl->devicemem(); - if (!simulator_) { - // Initialize cache buffers - buf0 = xrt::bo(xrt_device, bufsize, devicemem); - buf1 = xrt::bo(xrt_device, bufsize, devicemem); + + in_buf = accl->create_buffer(bufsize/sizeof(float), ACCL::dataType::float32); + out_buf = accl->create_buffer(bufsize/sizeof(float), ACCL::dataType::float32); + + // Not sure if this is needed: + // Initialize cache buffers + if (!simulator_){ + buf0 = xrt::bo(xrt_device, bufsize, devicemem); + buf1 = xrt::bo(xrt_device, bufsize, devicemem); } + } - accl->set_timeout(1e8); // Start the worker thread accepting ACCL calls workerThread_ = std::thread(&ProcessGroupACCL::runLoop, this); @@ -981,6 +973,14 @@ void ProcessGroupACCL::destroy() { std::unique_lock lock(pgMutex_); queueConsumeCV_.wait(lock, [&] { return queue_.empty(); }); + //TODO free other buffer types + if (!simulator_) { + // if(coyote_enabled){ + in_buf->free_buffer(); + out_buf->free_buffer(); + // } + } + // Queue is empty, signal stop stop_ = true; @@ -1050,20 +1050,16 @@ void ProcessGroupACCL::run_broadcast(at::Tensor in_tensor, std::chrono::time_point start_inner = std::chrono::high_resolution_clock::now(); - STANDARD_DECL - - //Should be split to output on non-root sometime - // init_input_tensor(in_tensor, data, true, true, opts.rootRank); // This case split is necessary, because otherwise data will be set to a nullptr std::chrono::time_point start_init = std::chrono::high_resolution_clock::now(); if (opts.rootRank == rank_){ - init_input_tensor(in_tensor, data, true, false, opts.rootRank); - } - else{ - init_output_data(in_tensor, data, in_tensor.numel(), in_tensor.scalar_type(), false, true, opts.rootRank); + init_input_tensor(in_tensor, in_buf, true, false, opts.rootRank); } + // else{ + // init_output_data(in_tensor, in_buf, in_tensor.numel(), in_tensor.scalar_type(), false, true, 
opts.rootRank); + // } auto end_init = std::chrono::high_resolution_clock::now(); double durationUs_init = (std::chrono::duration_cast(end_init-start_init).count() / 1000.0); ACCL::debug("init tensor durationUs:" + std::to_string(durationUs_init)); @@ -1076,12 +1072,17 @@ void ProcessGroupACCL::run_broadcast(at::Tensor in_tensor, PRE_REQUEST(Broadcast,in_tensor) - ACCL::ACCLRequest* req = accl->bcast(*data, in_tensor.numel(), opts.rootRank, ACCL::GLOBAL_COMM, true, true, get_compressed_type(in_tensor.scalar_type())); + ACCL::ACCLRequest* req = accl->bcast(*in_buf, in_tensor.numel(), opts.rootRank, ACCL::GLOBAL_COMM, true, true, get_compressed_type(in_tensor.scalar_type())); POST_REQUEST("bcast", in_tensor.nbytes()) + + in_buf->sync_from_device(); + // for(int i = 0; ibyte_array())[i])); + // } std::chrono::time_point start_copy = std::chrono::high_resolution_clock::now(); - copy_back_tensor(in_tensor, data, true, true, opts.rootRank); + copy_back_tensor(in_tensor, in_buf, true, true, opts.rootRank); auto end_copy = std::chrono::high_resolution_clock::now(); double durationUs_copy = (std::chrono::duration_cast(end_copy-start_copy).count() / 1000.0); ACCL::debug("Copy tensor durationUs:" + std::to_string(durationUs_copy)); @@ -1110,6 +1111,7 @@ ProcessGroupACCL::broadcast(std::vector &tensors, opts.rootRank, MPI_COMM_WORLD)); #else + std::chrono::time_point start = std::chrono::high_resolution_clock::now(); at::Tensor &tensor = (entry->src)[0]; // Segment data if necessary if (tensor.nbytes() > bufsize) { @@ -1141,21 +1143,19 @@ void ProcessGroupACCL::run_allreduce(at::Tensor in_tensor, STANDARD_DECL - init_input_tensor(in_tensor, data, true, true); + init_input_tensor(in_tensor, in_buf, true, true); // Reserve device c10::DeviceGuard guard(in_tensor.device()); std::unique_lock globalLock(pgGlobalMutex_); - init_output_data(in_tensor, dstdata, in_tensor.numel(), in_tensor.scalar_type(), true, true); - PRE_REQUEST(Allreduce,in_tensor) - ACCL::ACCLRequest* req = accl->allreduce(*data, *dstdata, in_tensor.numel(), acclOp.at(opts.reduceOp), ACCL::GLOBAL_COMM, true, true, get_compressed_type(in_tensor.scalar_type())); + ACCL::ACCLRequest* req = accl->allreduce(*in_buf, *out_buf, in_tensor.numel(), acclOp.at(opts.reduceOp), ACCL::GLOBAL_COMM, true, true, get_compressed_type(in_tensor.scalar_type())); POST_REQUEST("allreduce", in_tensor.nbytes()) - copy_back_tensor(in_tensor, dstdata, true, true); + copy_back_tensor(in_tensor, out_buf, true, true); } c10::intrusive_ptr @@ -1206,22 +1206,19 @@ ProcessGroupACCL::allreduce_coalesced(std::vector &tensors, void ProcessGroupACCL::run_reduce(at::Tensor in_tensor, const ReduceOptions &opts) { - STANDARD_DECL - init_input_tensor(in_tensor, data, true, true); + init_input_tensor(in_tensor, in_buf, true, true); // Reserve device c10::DeviceGuard guard(in_tensor.device()); std::unique_lock globalLock(pgGlobalMutex_); - init_output_data(in_tensor, dstdata, in_tensor.numel(), in_tensor.scalar_type(), true, false, opts.rootRank); - PRE_REQUEST(Reduce,in_tensor) - ACCL::ACCLRequest* req = accl->reduce(*data, *dstdata, in_tensor.numel(), opts.rootRank, acclOp.at(opts.reduceOp), ACCL::GLOBAL_COMM, true, true, get_compressed_type(in_tensor.scalar_type())); + ACCL::ACCLRequest* req = accl->reduce(*in_buf, *out_buf, in_tensor.numel(), opts.rootRank, acclOp.at(opts.reduceOp), ACCL::GLOBAL_COMM, true, true, get_compressed_type(in_tensor.scalar_type())); POST_REQUEST("reduce", in_tensor.nbytes()) - copy_back_tensor(in_tensor, dstdata, true, false, opts.rootRank); + 
copy_back_tensor(in_tensor, out_buf, true, false, opts.rootRank); } c10::intrusive_ptr @@ -1252,26 +1249,25 @@ ProcessGroupACCL::reduce(std::vector &tensors, void ProcessGroupACCL::run_allgather( at::Tensor in_tensor, const std::vector &dsttensorvec) { - at::Tensor empty_srctensor; - std::unique_ptr srcdata; + + at::Tensor dsttensor; - std::unique_ptr dstdata; - init_input_tensor(in_tensor, srcdata, true, true); + init_input_tensor(in_tensor, in_buf, true, true); // Reserve device c10::DeviceGuard guard(in_tensor.device()); std::unique_lock globalLock(pgGlobalMutex_); - init_output_tensor(in_tensor, dsttensor, dstdata, size_, in_tensor.scalar_type(), true, true); + init_output_tensor(in_tensor, dsttensor, out_buf, size_, in_tensor.scalar_type(), true, true); PRE_REQUEST(Allgather,in_tensor) - ACCL::ACCLRequest* req = accl->allgather(*srcdata, *dstdata, in_tensor.numel(), ACCL::GLOBAL_COMM, + ACCL::ACCLRequest* req = accl->allgather(*in_buf, *out_buf, in_tensor.numel(), ACCL::GLOBAL_COMM, true, true, get_compressed_type(in_tensor.scalar_type())); POST_REQUEST("allgather", in_tensor.nbytes()) - copy_back_tensorvec(dsttensorvec, dstdata, dsttensor, in_tensor.numel(), true, true); + copy_back_tensorvec(dsttensorvec, out_buf, dsttensor, in_tensor.numel(), true, true); } @@ -1351,27 +1347,25 @@ c10::intrusive_ptr ProcessGroupACCL::allgather_coalesced( void ProcessGroupACCL::run_gather(at::Tensor in_tensor, const std::vector &dsttensorvec, const GatherOptions &opts) { - at::Tensor empty_srctensor; - std::unique_ptr srcdata; at::Tensor dsttensor; - std::unique_ptr dstdata; - init_input_tensor(in_tensor, srcdata, true, true); // Reserve device c10::DeviceGuard guard(in_tensor.device()); std::unique_lock globalLock(pgGlobalMutex_); + + init_input_tensor(in_tensor, in_buf, true, true); - init_output_tensor(in_tensor, dsttensor, dstdata, size_, in_tensor.scalar_type(), true, false, opts.rootRank); + init_output_tensor(in_tensor, dsttensor, out_buf, size_, in_tensor.scalar_type(), true, false, opts.rootRank); PRE_REQUEST(Gather, in_tensor) - ACCL::ACCLRequest* req = accl->gather(*srcdata, *dstdata, in_tensor.numel(), opts.rootRank, + ACCL::ACCLRequest* req = accl->gather(*in_buf, *out_buf, in_tensor.numel(), opts.rootRank, ACCL::GLOBAL_COMM, true, true, get_compressed_type(in_tensor.scalar_type())); POST_REQUEST("gather", in_tensor.nbytes()) - copy_back_tensorvec(dsttensorvec, dstdata, dsttensor, in_tensor.numel(), true, false, opts.rootRank); + copy_back_tensorvec(dsttensorvec, out_buf, dsttensor, in_tensor.numel(), true, false, opts.rootRank); } @@ -1470,27 +1464,24 @@ ProcessGroupACCL::gather(std::vector> &outputTensors, void ProcessGroupACCL::run_scatter(std::vector &in_tensor_vec, at::Tensor out_tensor, const ScatterOptions &opts) { - std::unique_ptr in_data; - std::unique_ptr out_data; at::Tensor dsttensor; // Reserve device c10::DeviceGuard guard(out_tensor.device()); std::unique_lock globalLock(pgGlobalMutex_); - init_input_data_vec(in_tensor_vec, in_data, out_tensor.options().device(c10::DeviceType::CPU), true, false, opts.rootRank); - + init_input_data_vec(in_tensor_vec, in_buf, out_tensor.options().device(c10::DeviceType::CPU), true, false, opts.rootRank); - init_output_tensor(out_tensor, dsttensor, out_data, 0, out_tensor.scalar_type(), true, true, opts.rootRank); + init_output_tensor(out_tensor, dsttensor, out_buf, 0, out_tensor.scalar_type(), true, true, opts.rootRank); PRE_REQUEST(Scatter, dsttensor) // Run scatter - ACCL::ACCLRequest* req = accl->scatter(*in_data, *out_data, 
out_tensor.numel(), opts.rootRank, ACCL::GLOBAL_COMM, true, true, get_compressed_type(dsttensor.scalar_type())); + ACCL::ACCLRequest* req = accl->scatter(*in_buf, *out_buf, out_tensor.numel(), opts.rootRank, ACCL::GLOBAL_COMM, true, true, get_compressed_type(dsttensor.scalar_type())); POST_REQUEST("scatter", out_tensor.nbytes()) - copy_back_tensor(out_tensor, out_data, true, true, opts.rootRank); + copy_back_tensor(out_tensor, out_buf, true, true, opts.rootRank); } c10::intrusive_ptr @@ -1597,26 +1588,21 @@ c10::intrusive_ptr ProcessGroupACCL::reduce_scatter( void ProcessGroupACCL::run_alltoall(at::Tensor in_tensor, at::Tensor out_tensor, const AllToAllOptions &opts) { - std::unique_ptr srcdata; - std::unique_ptr dstdata; - - // PARA_PRINT(in_tensor); - - init_input_tensor(in_tensor, srcdata, true, true); + init_input_tensor(in_tensor, in_buf, true, true); // Reserve device c10::DeviceGuard guard(in_tensor.device()); std::unique_lock globalLock(pgGlobalMutex_); - init_output_data(out_tensor, dstdata, out_tensor.numel(), out_tensor.scalar_type(), true, true); + // init_output_data(out_tensor, dstdata, out_tensor.numel(), out_tensor.scalar_type(), true, true); PRE_REQUEST(AlltoAll, in_tensor) - ACCL::ACCLRequest* req = accl->alltoall(*srcdata, *dstdata, in_tensor.numel()/size_, ACCL::GLOBAL_COMM, true, true, get_compressed_type(in_tensor.scalar_type())); + ACCL::ACCLRequest* req = accl->alltoall(*in_buf, *out_buf, in_tensor.numel()/size_, ACCL::GLOBAL_COMM, true, true, get_compressed_type(in_tensor.scalar_type())); POST_REQUEST("alltoall", in_tensor.nbytes()/size_) - copy_back_tensor(out_tensor, dstdata, true, true); + copy_back_tensor(out_tensor, out_buf, true, true); } @@ -1624,25 +1610,23 @@ void ProcessGroupACCL::run_alltoall(at::Tensor in_tensor, void ProcessGroupACCL::run_alltoall_vec(std::vector &in_tensor_vec, std::vector &out_tensor_vec, const AllToAllOptions &opts) { - std::unique_ptr in_data; - std::unique_ptr out_data; at::Tensor dsttensor; // Reserve device c10::DeviceGuard guard(in_tensor_vec[0].device()); std::unique_lock globalLock(pgGlobalMutex_); - init_input_data_vec(in_tensor_vec, in_data, out_tensor_vec[0].options().device(c10::DeviceType::CPU), true, true); + init_input_data_vec(in_tensor_vec, in_buf, out_tensor_vec[0].options().device(c10::DeviceType::CPU), true, true); - init_output_tensor(in_tensor_vec[0], dsttensor, out_data, size_, in_tensor_vec[0].scalar_type(), true, true); + init_output_tensor(in_tensor_vec[0], dsttensor, out_buf, size_, in_tensor_vec[0].scalar_type(), true, true); PRE_REQUEST(AlltoAll, in_tensor_vec[0]) - ACCL::ACCLRequest* req = accl->alltoall(*in_data, *out_data, in_tensor_vec[0].numel(), ACCL::GLOBAL_COMM, true, true, get_compressed_type(in_tensor_vec[0].scalar_type())); + ACCL::ACCLRequest* req = accl->alltoall(*in_buf, *out_buf, in_tensor_vec[0].numel(), ACCL::GLOBAL_COMM, true, true, get_compressed_type(in_tensor_vec[0].scalar_type())); POST_REQUEST("alltoall", in_tensor_vec[0].nbytes()) - copy_back_tensorvec(out_tensor_vec, out_data, dsttensor, in_tensor_vec[0].numel(), true, true); + copy_back_tensorvec(out_tensor_vec, out_buf, dsttensor, in_tensor_vec[0].numel(), true, true); } @@ -1655,7 +1639,7 @@ c10::intrusive_ptr ProcessGroupACCL::alltoall_base( // We can use alltoall TORCH_CHECK( outputTensor.numel() == inputTensor.numel() && - outputTensor.type() == inputTensor.type(), + outputTensor.scalar_type() == inputTensor.scalar_type(), "Tensors are not equal in size or data type"); TORCH_CHECK( outputTensor.size(0) % size_ == 0, @@ 
-1726,17 +1710,15 @@ ProcessGroupACCL::alltoall(std::vector &outputTensors, void ProcessGroupACCL::run_send(at::Tensor in_tensor, int dstRank, int tag) { - STANDARD_DECL - // Reserve device c10::DeviceGuard guard(in_tensor.device()); std::unique_lock globalLock(pgGlobalMutex_); - init_input_tensor(in_tensor, data, true, true); + init_input_tensor(in_tensor, in_buf, true, true); PRE_REQUEST(Send,in_tensor) - ACCL::ACCLRequest* req = accl->send(*data, in_tensor.numel(), dstRank, tag, ACCL::GLOBAL_COMM, true, + ACCL::ACCLRequest* req = accl->send(*in_buf, in_tensor.numel(), dstRank, tag, ACCL::GLOBAL_COMM, true, get_compressed_type(in_tensor.scalar_type())); POST_REQUEST("send", in_tensor.nbytes()) @@ -1770,21 +1752,19 @@ ProcessGroupACCL::send(std::vector &tensors, int dstRank, int tag) { void ProcessGroupACCL::run_recv(at::Tensor out_tensor, int srcRank, int tag) { - STANDARD_DECL - // Reserve device c10::DeviceGuard guard(out_tensor.device()); std::unique_lock globalLock(pgGlobalMutex_); - init_output_data(out_tensor, dstdata, out_tensor.numel(), out_tensor.scalar_type(), true, true); + // init_output_data(out_tensor, dstdata, out_tensor.numel(), out_tensor.scalar_type(), true, true); PRE_REQUEST(Receive, out_tensor) - ACCL::ACCLRequest* req = accl->recv(*dstdata, out_tensor.numel(), srcRank, tag, ACCL::GLOBAL_COMM, true, get_compressed_type(out_tensor.scalar_type())); + ACCL::ACCLRequest* req = accl->recv(*out_buf, out_tensor.numel(), srcRank, tag, ACCL::GLOBAL_COMM, true, get_compressed_type(out_tensor.scalar_type())); POST_REQUEST("recv", out_tensor.nbytes()) - copy_back_tensor(out_tensor, dstdata, true, true); + copy_back_tensor(out_tensor, out_buf, true, true); } c10::intrusive_ptr diff --git a/integrations/pytorch_ddp/test/run.sh b/integrations/pytorch_ddp/test/run.sh index be8b36db..ebabfe7f 100755 --- a/integrations/pytorch_ddp/test/run.sh +++ b/integrations/pytorch_ddp/test/run.sh @@ -10,8 +10,8 @@ if [[ -v ACCL_SCRIPT ]]; then SCRIPT_NAME="$ACCL_SCRIPT" else # SCRIPT_NAME="test-mnist.py -d True -n 2" # MNIST - # SCRIPT_NAME="test-imagenet.py -d True" - SCRIPT_NAME=test-generic.py + SCRIPT_NAME="test-imagenet.py -d True" + # SCRIPT_NAME=test-generic.py echo "Variable ACCL_SCRIPT not set. 
Assuming $SCRIPT_NAME" fi @@ -86,8 +86,8 @@ else echo "Master node set to: $MASTER_IP:$MASTER_PORT" # 09 and 10 have other interface names: - # MPI_ARGS="-f $HOST_FILE --iface ens4f0" - MPI_ARGS="-f $HOST_FILE --iface ens4" + MPI_ARGS="-f $HOST_FILE --iface ens4f0" + # MPI_ARGS="-f $HOST_FILE --iface ens4" fi ARG="$ARG -c $ACCL_COMMS -i $HOST_FILE -f $FPGA_FILE -a $MASTER_IP -p $MASTER_PORT\"" diff --git a/integrations/pytorch_ddp/test/test-generic.py b/integrations/pytorch_ddp/test/test-generic.py index 7dd1d815..0a1a0044 100644 --- a/integrations/pytorch_ddp/test/test-generic.py +++ b/integrations/pytorch_ddp/test/test-generic.py @@ -49,16 +49,16 @@ rank = 0 size = 0 -count = 256 * 1024 -num_el = 256 * 1024 -shape = (256,1024) +count = 16 * 1 +num_el = 16 * 1 +shape = (16 , 1) #As in test.cpp defaults rxbufsize = 4096 * 1024 def test_broadcast_segment(): global num_errors - shape_segment = (1024 * 1024,) + shape_segment = (1024 * 1,) if rank == 0: x = torch.ones(shape_segment, dtype=torch.float) else: @@ -87,7 +87,7 @@ def test_broadcast(): else: x = torch.zeros(shape) - for i in range(10): + for i in range(1): with torch.profiler.record_function("test bcast " + str(i)): start_time = time.perf_counter() @@ -120,8 +120,8 @@ def test_broadcast(): logger.debug("Test broadcast finished!") def test_broadcast_2(): - test_type = torch.float - shape_2 = (204, 2) + test_type = torch.double + shape_2 = (2, 2) global num_errors if rank == 0: x = torch.ones(shape_2, dtype=test_type) @@ -224,8 +224,8 @@ def test_gather(): def test_allgather(): global num_errors shape_gather = (1,) - x = torch.full(shape_gather, float(rank), dtype=torch.double) - y = [torch.empty(shape_gather, dtype=torch.double) for _ in range(size)] + x = torch.full(shape_gather, float(rank), dtype=torch.float) + y = [torch.empty(shape_gather, dtype=torch.float) for _ in range(size)] with torch.profiler.record_function("test_allgather"): @@ -234,7 +234,7 @@ def test_allgather(): mpi.Barrier() for i, c in enumerate(y): try: - np.testing.assert_allclose(c, torch.full(shape_gather, float(i), dtype=torch.double)) + np.testing.assert_allclose(c, torch.full(shape_gather, float(i), dtype=torch.float)) except AssertionError as e: num_errors = num_errors + 1 logger.debug("Test AllGather failed") @@ -374,7 +374,7 @@ def demo_basic(rank: int): loss_fn = nn.MSELoss() optimizer = optim.Adam(ddp_model.parameters(), lr=0.005) - max_epochs = 200 + max_epochs = 10 for epoch in range(max_epochs): batch_size = len(next(iter(train_data))[0]) train_data.sampler.set_epoch(epoch) @@ -430,7 +430,7 @@ def start_test(comms: str, simulator: bool, host_file: str=None, fpga_file: str= ranks = [accl.Rank(a, start_port + i, 0, rxbufsize) for i, a in enumerate(fpga_ips)] else: # Somehow the simulator gets stuck if I use the same rxbufsize - rxbufsize = 4096 #* 1024 + rxbufsize = 4096 # * 1024 ranks = [accl.Rank("127.0.0.1", 5500 + i, i, rxbufsize) for i in range(size)] logger.debug(f'Ranks: {ranks}') @@ -449,40 +449,44 @@ def start_test(comms: str, simulator: bool, host_file: str=None, fpga_file: str= # Sometimes ACCL gets stuck on the mpi import statement, so this is to avoid issues: mpi.Barrier() + + + # dist.init_process_group("mpi", rank=rank, world_size=size) + accl.create_process_group(ranks, design, bufsize=rxbufsize, initialize=True, simulation=simulator) - # dist.init_process_group("mpi", rank=rank, world_size=size) dist.init_process_group("ACCL", rank=rank, world_size=size) + global num_errors num_errors = 0 - with 
profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], - profile_memory=True, record_shapes=True) as prof: - - # test_allgather() - # test_broadcast_segment() - test_broadcast() - # test_broadcast() - # test_broadcast() - # test_broadcast() - # test_broadcast() - # test_broadcast_2() - # test_sendrcv() - # test_scatter() - # test_gather() - # test_allgather() - # test_alltoall() - # test_allreduce() - # test_allreduce() - # test_allreduce() - # test_allreduce() - - # test_reduce() + # with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], + # profile_memory=True, record_shapes=True) as prof: + + test_broadcast_segment() + # test_broadcast() + # test_broadcast() + test_broadcast() + # test_broadcast() + # test_broadcast() + test_broadcast_2() + test_sendrcv() + test_scatter() + test_gather() + test_allgather() + test_alltoall() + test_allreduce() + test_allgather() + # test_allreduce() + # test_allreduce() + # test_allreduce() + + # test_reduce() - # demo_basic(rank) + demo_basic(rank) - mpi.Barrier() + mpi.Barrier() if num_errors == 0: print("======== Successfully Finished testing======") @@ -490,8 +494,8 @@ def start_test(comms: str, simulator: bool, host_file: str=None, fpga_file: str= else: print(f"!!!!!!!! - {num_errors} Errors found - !!!!!!!!!") logger.debug(f"!!!!!!!! - {num_errors} Errors found - !!!!!!!!!") - print(prof.key_averages(group_by_input_shape=True) - .table(sort_by="cpu_time_total", row_limit=15)) + # print(prof.key_averages(group_by_input_shape=True) + # .table(sort_by="cpu_time_total", row_limit=15)) logger.debug('Destroying ACCL Process Group') dist.destroy_process_group() diff --git a/integrations/pytorch_ddp/test/torchvision/PennFudanDataset.py b/integrations/pytorch_ddp/test/torchvision/PennFudanDataset.py deleted file mode 100644 index 949bc93e..00000000 --- a/integrations/pytorch_ddp/test/torchvision/PennFudanDataset.py +++ /dev/null @@ -1,74 +0,0 @@ -import os -import torch - -from torchvision.io import read_image -from torchvision.ops.boxes import masks_to_boxes -from torchvision import tv_tensors -from torchvision.transforms.v2 import functional as F - - -class PennFudanDataset(torch.utils.data.Dataset): - def __init__(self, root, transforms): - self.root = root - self.transforms = transforms - # load all image files, sorting them to - # ensure that they are aligned - self.imgs = list(sorted(os.listdir(os.path.join(root, "PNGImages")))) - self.masks = list(sorted(os.listdir(os.path.join(root, "PedMasks")))) - - def __getitem__(self, idx): - # load images and masks - img_path = os.path.join(self.root, "PNGImages", self.imgs[idx]) - mask_path = os.path.join(self.root, "PedMasks", self.masks[idx]) - img = read_image(img_path) - mask = read_image(mask_path) - # instances are encoded as different colors - obj_ids = torch.unique(mask) - # first id is the background, so remove it - obj_ids = obj_ids[1:] - num_objs = len(obj_ids) - - # split the color-encoded mask into a set - # of binary masks - masks = (mask == obj_ids[:, None, None]).to(dtype=torch.uint8) - - # get bounding box coordinates for each mask - boxes = masks_to_boxes(masks) - - # there is only one class - labels = torch.ones((num_objs,), dtype=torch.int64) - - image_id = idx - area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0]) - # suppose all instances are not crowd - iscrowd = torch.zeros((num_objs,), dtype=torch.int64) - - # Wrap sample and targets into torchvision tv_tensors: - img = tv_tensors.Image(img) - - target = {} - target["boxes"] = 
tv_tensors.BoundingBoxes(boxes, format="XYXY", canvas_size=F.get_size(img)) - target["masks"] = tv_tensors.Mask(masks) - target["labels"] = labels - target["image_id"] = image_id - target["area"] = area - target["iscrowd"] = iscrowd - - if self.transforms is not None: - img, target = self.transforms(img, target) - - return img, target - - def __len__(self): - return len(self.imgs) - - - -def get_transform(train): - transforms = [] - if train: - transforms.append(T.RandomHorizontalFlip(0.5)) - transforms.append(T.ToDtype(torch.float, scale=True)) - transforms.append(T.ToPureTensor()) - return T.Compose(transforms) - diff --git a/integrations/pytorch_ddp/test/torchvision/coco_eval.py b/integrations/pytorch_ddp/test/torchvision/coco_eval.py deleted file mode 100644 index ba1359f8..00000000 --- a/integrations/pytorch_ddp/test/torchvision/coco_eval.py +++ /dev/null @@ -1,192 +0,0 @@ -import copy -import io -from contextlib import redirect_stdout - -import numpy as np -import pycocotools.mask as mask_util -import torch -import utils -from pycocotools.coco import COCO -from pycocotools.cocoeval import COCOeval - - -class CocoEvaluator: - def __init__(self, coco_gt, iou_types): - if not isinstance(iou_types, (list, tuple)): - raise TypeError(f"This constructor expects iou_types of type list or tuple, instead got {type(iou_types)}") - coco_gt = copy.deepcopy(coco_gt) - self.coco_gt = coco_gt - - self.iou_types = iou_types - self.coco_eval = {} - for iou_type in iou_types: - self.coco_eval[iou_type] = COCOeval(coco_gt, iouType=iou_type) - - self.img_ids = [] - self.eval_imgs = {k: [] for k in iou_types} - - def update(self, predictions): - img_ids = list(np.unique(list(predictions.keys()))) - self.img_ids.extend(img_ids) - - for iou_type in self.iou_types: - results = self.prepare(predictions, iou_type) - with redirect_stdout(io.StringIO()): - coco_dt = COCO.loadRes(self.coco_gt, results) if results else COCO() - coco_eval = self.coco_eval[iou_type] - - coco_eval.cocoDt = coco_dt - coco_eval.params.imgIds = list(img_ids) - img_ids, eval_imgs = evaluate(coco_eval) - - self.eval_imgs[iou_type].append(eval_imgs) - - def synchronize_between_processes(self): - for iou_type in self.iou_types: - self.eval_imgs[iou_type] = np.concatenate(self.eval_imgs[iou_type], 2) - create_common_coco_eval(self.coco_eval[iou_type], self.img_ids, self.eval_imgs[iou_type]) - - def accumulate(self): - for coco_eval in self.coco_eval.values(): - coco_eval.accumulate() - - def summarize(self): - for iou_type, coco_eval in self.coco_eval.items(): - print(f"IoU metric: {iou_type}") - coco_eval.summarize() - - def prepare(self, predictions, iou_type): - if iou_type == "bbox": - return self.prepare_for_coco_detection(predictions) - if iou_type == "segm": - return self.prepare_for_coco_segmentation(predictions) - if iou_type == "keypoints": - return self.prepare_for_coco_keypoint(predictions) - raise ValueError(f"Unknown iou type {iou_type}") - - def prepare_for_coco_detection(self, predictions): - coco_results = [] - for original_id, prediction in predictions.items(): - if len(prediction) == 0: - continue - - boxes = prediction["boxes"] - boxes = convert_to_xywh(boxes).tolist() - scores = prediction["scores"].tolist() - labels = prediction["labels"].tolist() - - coco_results.extend( - [ - { - "image_id": original_id, - "category_id": labels[k], - "bbox": box, - "score": scores[k], - } - for k, box in enumerate(boxes) - ] - ) - return coco_results - - def prepare_for_coco_segmentation(self, predictions): - coco_results = [] - for 
original_id, prediction in predictions.items(): - if len(prediction) == 0: - continue - - scores = prediction["scores"] - labels = prediction["labels"] - masks = prediction["masks"] - - masks = masks > 0.5 - - scores = prediction["scores"].tolist() - labels = prediction["labels"].tolist() - - rles = [ - mask_util.encode(np.array(mask[0, :, :, np.newaxis], dtype=np.uint8, order="F"))[0] for mask in masks - ] - for rle in rles: - rle["counts"] = rle["counts"].decode("utf-8") - - coco_results.extend( - [ - { - "image_id": original_id, - "category_id": labels[k], - "segmentation": rle, - "score": scores[k], - } - for k, rle in enumerate(rles) - ] - ) - return coco_results - - def prepare_for_coco_keypoint(self, predictions): - coco_results = [] - for original_id, prediction in predictions.items(): - if len(prediction) == 0: - continue - - boxes = prediction["boxes"] - boxes = convert_to_xywh(boxes).tolist() - scores = prediction["scores"].tolist() - labels = prediction["labels"].tolist() - keypoints = prediction["keypoints"] - keypoints = keypoints.flatten(start_dim=1).tolist() - - coco_results.extend( - [ - { - "image_id": original_id, - "category_id": labels[k], - "keypoints": keypoint, - "score": scores[k], - } - for k, keypoint in enumerate(keypoints) - ] - ) - return coco_results - - -def convert_to_xywh(boxes): - xmin, ymin, xmax, ymax = boxes.unbind(1) - return torch.stack((xmin, ymin, xmax - xmin, ymax - ymin), dim=1) - - -def merge(img_ids, eval_imgs): - all_img_ids = utils.all_gather(img_ids) - all_eval_imgs = utils.all_gather(eval_imgs) - - merged_img_ids = [] - for p in all_img_ids: - merged_img_ids.extend(p) - - merged_eval_imgs = [] - for p in all_eval_imgs: - merged_eval_imgs.append(p) - - merged_img_ids = np.array(merged_img_ids) - merged_eval_imgs = np.concatenate(merged_eval_imgs, 2) - - # keep only unique (and in sorted order) images - merged_img_ids, idx = np.unique(merged_img_ids, return_index=True) - merged_eval_imgs = merged_eval_imgs[..., idx] - - return merged_img_ids, merged_eval_imgs - - -def create_common_coco_eval(coco_eval, img_ids, eval_imgs): - img_ids, eval_imgs = merge(img_ids, eval_imgs) - img_ids = list(img_ids) - eval_imgs = list(eval_imgs.flatten()) - - coco_eval.evalImgs = eval_imgs - coco_eval.params.imgIds = img_ids - coco_eval._paramsEval = copy.deepcopy(coco_eval.params) - - -def evaluate(imgs): - with redirect_stdout(io.StringIO()): - imgs.evaluate() - return imgs.params.imgIds, np.asarray(imgs.evalImgs).reshape(-1, len(imgs.params.areaRng), len(imgs.params.imgIds)) diff --git a/integrations/pytorch_ddp/test/torchvision/coco_eval.py.1 b/integrations/pytorch_ddp/test/torchvision/coco_eval.py.1 deleted file mode 100644 index ba1359f8..00000000 --- a/integrations/pytorch_ddp/test/torchvision/coco_eval.py.1 +++ /dev/null @@ -1,192 +0,0 @@ -import copy -import io -from contextlib import redirect_stdout - -import numpy as np -import pycocotools.mask as mask_util -import torch -import utils -from pycocotools.coco import COCO -from pycocotools.cocoeval import COCOeval - - -class CocoEvaluator: - def __init__(self, coco_gt, iou_types): - if not isinstance(iou_types, (list, tuple)): - raise TypeError(f"This constructor expects iou_types of type list or tuple, instead got {type(iou_types)}") - coco_gt = copy.deepcopy(coco_gt) - self.coco_gt = coco_gt - - self.iou_types = iou_types - self.coco_eval = {} - for iou_type in iou_types: - self.coco_eval[iou_type] = COCOeval(coco_gt, iouType=iou_type) - - self.img_ids = [] - self.eval_imgs = {k: [] for k in iou_types} 
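#
# Editor's note (not part of the original file): the evaluator above keeps one
# COCOeval object per IoU type ("bbox", "segm", "keypoints"). update() turns
# per-image predictions into COCO result dicts and stores the raw per-image
# eval arrays; synchronize_between_processes() then concatenates and merges
# those arrays across ranks before accumulate()/summarize() compute metrics.
#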
- - def update(self, predictions): - img_ids = list(np.unique(list(predictions.keys()))) - self.img_ids.extend(img_ids) - - for iou_type in self.iou_types: - results = self.prepare(predictions, iou_type) - with redirect_stdout(io.StringIO()): - coco_dt = COCO.loadRes(self.coco_gt, results) if results else COCO() - coco_eval = self.coco_eval[iou_type] - - coco_eval.cocoDt = coco_dt - coco_eval.params.imgIds = list(img_ids) - img_ids, eval_imgs = evaluate(coco_eval) - - self.eval_imgs[iou_type].append(eval_imgs) - - def synchronize_between_processes(self): - for iou_type in self.iou_types: - self.eval_imgs[iou_type] = np.concatenate(self.eval_imgs[iou_type], 2) - create_common_coco_eval(self.coco_eval[iou_type], self.img_ids, self.eval_imgs[iou_type]) - - def accumulate(self): - for coco_eval in self.coco_eval.values(): - coco_eval.accumulate() - - def summarize(self): - for iou_type, coco_eval in self.coco_eval.items(): - print(f"IoU metric: {iou_type}") - coco_eval.summarize() - - def prepare(self, predictions, iou_type): - if iou_type == "bbox": - return self.prepare_for_coco_detection(predictions) - if iou_type == "segm": - return self.prepare_for_coco_segmentation(predictions) - if iou_type == "keypoints": - return self.prepare_for_coco_keypoint(predictions) - raise ValueError(f"Unknown iou type {iou_type}") - - def prepare_for_coco_detection(self, predictions): - coco_results = [] - for original_id, prediction in predictions.items(): - if len(prediction) == 0: - continue - - boxes = prediction["boxes"] - boxes = convert_to_xywh(boxes).tolist() - scores = prediction["scores"].tolist() - labels = prediction["labels"].tolist() - - coco_results.extend( - [ - { - "image_id": original_id, - "category_id": labels[k], - "bbox": box, - "score": scores[k], - } - for k, box in enumerate(boxes) - ] - ) - return coco_results - - def prepare_for_coco_segmentation(self, predictions): - coco_results = [] - for original_id, prediction in predictions.items(): - if len(prediction) == 0: - continue - - scores = prediction["scores"] - labels = prediction["labels"] - masks = prediction["masks"] - - masks = masks > 0.5 - - scores = prediction["scores"].tolist() - labels = prediction["labels"].tolist() - - rles = [ - mask_util.encode(np.array(mask[0, :, :, np.newaxis], dtype=np.uint8, order="F"))[0] for mask in masks - ] - for rle in rles: - rle["counts"] = rle["counts"].decode("utf-8") - - coco_results.extend( - [ - { - "image_id": original_id, - "category_id": labels[k], - "segmentation": rle, - "score": scores[k], - } - for k, rle in enumerate(rles) - ] - ) - return coco_results - - def prepare_for_coco_keypoint(self, predictions): - coco_results = [] - for original_id, prediction in predictions.items(): - if len(prediction) == 0: - continue - - boxes = prediction["boxes"] - boxes = convert_to_xywh(boxes).tolist() - scores = prediction["scores"].tolist() - labels = prediction["labels"].tolist() - keypoints = prediction["keypoints"] - keypoints = keypoints.flatten(start_dim=1).tolist() - - coco_results.extend( - [ - { - "image_id": original_id, - "category_id": labels[k], - "keypoints": keypoint, - "score": scores[k], - } - for k, keypoint in enumerate(keypoints) - ] - ) - return coco_results - - -def convert_to_xywh(boxes): - xmin, ymin, xmax, ymax = boxes.unbind(1) - return torch.stack((xmin, ymin, xmax - xmin, ymax - ymin), dim=1) - - -def merge(img_ids, eval_imgs): - all_img_ids = utils.all_gather(img_ids) - all_eval_imgs = utils.all_gather(eval_imgs) - - merged_img_ids = [] - for p in all_img_ids: - 
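# Editor's note (added comment): merge() uses utils.all_gather to collect
# every rank's image ids and per-image eval arrays, flattens them, and then
# deduplicates with np.unique(..., return_index=True) so an image evaluated
# on several ranks is only counted once, in sorted order.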
merged_img_ids.extend(p) - - merged_eval_imgs = [] - for p in all_eval_imgs: - merged_eval_imgs.append(p) - - merged_img_ids = np.array(merged_img_ids) - merged_eval_imgs = np.concatenate(merged_eval_imgs, 2) - - # keep only unique (and in sorted order) images - merged_img_ids, idx = np.unique(merged_img_ids, return_index=True) - merged_eval_imgs = merged_eval_imgs[..., idx] - - return merged_img_ids, merged_eval_imgs - - -def create_common_coco_eval(coco_eval, img_ids, eval_imgs): - img_ids, eval_imgs = merge(img_ids, eval_imgs) - img_ids = list(img_ids) - eval_imgs = list(eval_imgs.flatten()) - - coco_eval.evalImgs = eval_imgs - coco_eval.params.imgIds = img_ids - coco_eval._paramsEval = copy.deepcopy(coco_eval.params) - - -def evaluate(imgs): - with redirect_stdout(io.StringIO()): - imgs.evaluate() - return imgs.params.imgIds, np.asarray(imgs.evalImgs).reshape(-1, len(imgs.params.areaRng), len(imgs.params.imgIds)) diff --git a/integrations/pytorch_ddp/test/torchvision/coco_utils.py b/integrations/pytorch_ddp/test/torchvision/coco_utils.py deleted file mode 100644 index f40dcdff..00000000 --- a/integrations/pytorch_ddp/test/torchvision/coco_utils.py +++ /dev/null @@ -1,234 +0,0 @@ -import os - -import torch -import torch.utils.data -import torchvision -import transforms as T -from pycocotools import mask as coco_mask -from pycocotools.coco import COCO - - -def convert_coco_poly_to_mask(segmentations, height, width): - masks = [] - for polygons in segmentations: - rles = coco_mask.frPyObjects(polygons, height, width) - mask = coco_mask.decode(rles) - if len(mask.shape) < 3: - mask = mask[..., None] - mask = torch.as_tensor(mask, dtype=torch.uint8) - mask = mask.any(dim=2) - masks.append(mask) - if masks: - masks = torch.stack(masks, dim=0) - else: - masks = torch.zeros((0, height, width), dtype=torch.uint8) - return masks - - -class ConvertCocoPolysToMask: - def __call__(self, image, target): - w, h = image.size - - image_id = target["image_id"] - - anno = target["annotations"] - - anno = [obj for obj in anno if obj["iscrowd"] == 0] - - boxes = [obj["bbox"] for obj in anno] - # guard against no boxes via resizing - boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4) - boxes[:, 2:] += boxes[:, :2] - boxes[:, 0::2].clamp_(min=0, max=w) - boxes[:, 1::2].clamp_(min=0, max=h) - - classes = [obj["category_id"] for obj in anno] - classes = torch.tensor(classes, dtype=torch.int64) - - segmentations = [obj["segmentation"] for obj in anno] - masks = convert_coco_poly_to_mask(segmentations, h, w) - - keypoints = None - if anno and "keypoints" in anno[0]: - keypoints = [obj["keypoints"] for obj in anno] - keypoints = torch.as_tensor(keypoints, dtype=torch.float32) - num_keypoints = keypoints.shape[0] - if num_keypoints: - keypoints = keypoints.view(num_keypoints, -1, 3) - - keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0]) - boxes = boxes[keep] - classes = classes[keep] - masks = masks[keep] - if keypoints is not None: - keypoints = keypoints[keep] - - target = {} - target["boxes"] = boxes - target["labels"] = classes - target["masks"] = masks - target["image_id"] = image_id - if keypoints is not None: - target["keypoints"] = keypoints - - # for conversion to coco api - area = torch.tensor([obj["area"] for obj in anno]) - iscrowd = torch.tensor([obj["iscrowd"] for obj in anno]) - target["area"] = area - target["iscrowd"] = iscrowd - - return image, target - - -def _coco_remove_images_without_annotations(dataset, cat_list=None): - def _has_only_empty_bbox(anno): - return 
all(any(o <= 1 for o in obj["bbox"][2:]) for obj in anno) - - def _count_visible_keypoints(anno): - return sum(sum(1 for v in ann["keypoints"][2::3] if v > 0) for ann in anno) - - min_keypoints_per_image = 10 - - def _has_valid_annotation(anno): - # if it's empty, there is no annotation - if len(anno) == 0: - return False - # if all boxes have close to zero area, there is no annotation - if _has_only_empty_bbox(anno): - return False - # keypoints task have a slight different criteria for considering - # if an annotation is valid - if "keypoints" not in anno[0]: - return True - # for keypoint detection tasks, only consider valid images those - # containing at least min_keypoints_per_image - if _count_visible_keypoints(anno) >= min_keypoints_per_image: - return True - return False - - ids = [] - for ds_idx, img_id in enumerate(dataset.ids): - ann_ids = dataset.coco.getAnnIds(imgIds=img_id, iscrowd=None) - anno = dataset.coco.loadAnns(ann_ids) - if cat_list: - anno = [obj for obj in anno if obj["category_id"] in cat_list] - if _has_valid_annotation(anno): - ids.append(ds_idx) - - dataset = torch.utils.data.Subset(dataset, ids) - return dataset - - -def convert_to_coco_api(ds): - coco_ds = COCO() - # annotation IDs need to start at 1, not 0, see torchvision issue #1530 - ann_id = 1 - dataset = {"images": [], "categories": [], "annotations": []} - categories = set() - for img_idx in range(len(ds)): - # find better way to get target - # targets = ds.get_annotations(img_idx) - img, targets = ds[img_idx] - image_id = targets["image_id"] - img_dict = {} - img_dict["id"] = image_id - img_dict["height"] = img.shape[-2] - img_dict["width"] = img.shape[-1] - dataset["images"].append(img_dict) - bboxes = targets["boxes"].clone() - bboxes[:, 2:] -= bboxes[:, :2] - bboxes = bboxes.tolist() - labels = targets["labels"].tolist() - areas = targets["area"].tolist() - iscrowd = targets["iscrowd"].tolist() - if "masks" in targets: - masks = targets["masks"] - # make masks Fortran contiguous for coco_mask - masks = masks.permute(0, 2, 1).contiguous().permute(0, 2, 1) - if "keypoints" in targets: - keypoints = targets["keypoints"] - keypoints = keypoints.reshape(keypoints.shape[0], -1).tolist() - num_objs = len(bboxes) - for i in range(num_objs): - ann = {} - ann["image_id"] = image_id - ann["bbox"] = bboxes[i] - ann["category_id"] = labels[i] - categories.add(labels[i]) - ann["area"] = areas[i] - ann["iscrowd"] = iscrowd[i] - ann["id"] = ann_id - if "masks" in targets: - ann["segmentation"] = coco_mask.encode(masks[i].numpy()) - if "keypoints" in targets: - ann["keypoints"] = keypoints[i] - ann["num_keypoints"] = sum(k != 0 for k in keypoints[i][2::3]) - dataset["annotations"].append(ann) - ann_id += 1 - dataset["categories"] = [{"id": i} for i in sorted(categories)] - coco_ds.dataset = dataset - coco_ds.createIndex() - return coco_ds - - -def get_coco_api_from_dataset(dataset): - # FIXME: This is... awful? 
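# Editor's note (added comment): the loop below unwraps up to ten nested
# torch.utils.data.Subset layers; if it finds a CocoDetection dataset it
# reuses that dataset's COCO ground-truth object directly, and otherwise it
# falls back to rebuilding one in memory via convert_to_coco_api().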
- for _ in range(10): - if isinstance(dataset, torchvision.datasets.CocoDetection): - break - if isinstance(dataset, torch.utils.data.Subset): - dataset = dataset.dataset - if isinstance(dataset, torchvision.datasets.CocoDetection): - return dataset.coco - return convert_to_coco_api(dataset) - - -class CocoDetection(torchvision.datasets.CocoDetection): - def __init__(self, img_folder, ann_file, transforms): - super().__init__(img_folder, ann_file) - self._transforms = transforms - - def __getitem__(self, idx): - img, target = super().__getitem__(idx) - image_id = self.ids[idx] - target = dict(image_id=image_id, annotations=target) - if self._transforms is not None: - img, target = self._transforms(img, target) - return img, target - - -def get_coco(root, image_set, transforms, mode="instances", use_v2=False, with_masks=False): - anno_file_template = "{}_{}2017.json" - PATHS = { - "train": ("train2017", os.path.join("annotations", anno_file_template.format(mode, "train"))), - "val": ("val2017", os.path.join("annotations", anno_file_template.format(mode, "val"))), - # "train": ("val2017", os.path.join("annotations", anno_file_template.format(mode, "val"))) - } - - img_folder, ann_file = PATHS[image_set] - img_folder = os.path.join(root, img_folder) - ann_file = os.path.join(root, ann_file) - - if use_v2: - from torchvision.datasets import wrap_dataset_for_transforms_v2 - - dataset = torchvision.datasets.CocoDetection(img_folder, ann_file, transforms=transforms) - target_keys = ["boxes", "labels", "image_id"] - if with_masks: - target_keys += ["masks"] - dataset = wrap_dataset_for_transforms_v2(dataset, target_keys=target_keys) - else: - # TODO: handle with_masks for V1? - t = [ConvertCocoPolysToMask()] - if transforms is not None: - t.append(transforms) - transforms = T.Compose(t) - - dataset = CocoDetection(img_folder, ann_file, transforms=transforms) - - if image_set == "train": - dataset = _coco_remove_images_without_annotations(dataset) - - # dataset = torch.utils.data.Subset(dataset, [i for i in range(500)]) - - return dataset diff --git a/integrations/pytorch_ddp/test/torchvision/coco_utils.py.1 b/integrations/pytorch_ddp/test/torchvision/coco_utils.py.1 deleted file mode 100644 index f40dcdff..00000000 --- a/integrations/pytorch_ddp/test/torchvision/coco_utils.py.1 +++ /dev/null @@ -1,234 +0,0 @@ -import os - -import torch -import torch.utils.data -import torchvision -import transforms as T -from pycocotools import mask as coco_mask -from pycocotools.coco import COCO - - -def convert_coco_poly_to_mask(segmentations, height, width): - masks = [] - for polygons in segmentations: - rles = coco_mask.frPyObjects(polygons, height, width) - mask = coco_mask.decode(rles) - if len(mask.shape) < 3: - mask = mask[..., None] - mask = torch.as_tensor(mask, dtype=torch.uint8) - mask = mask.any(dim=2) - masks.append(mask) - if masks: - masks = torch.stack(masks, dim=0) - else: - masks = torch.zeros((0, height, width), dtype=torch.uint8) - return masks - - -class ConvertCocoPolysToMask: - def __call__(self, image, target): - w, h = image.size - - image_id = target["image_id"] - - anno = target["annotations"] - - anno = [obj for obj in anno if obj["iscrowd"] == 0] - - boxes = [obj["bbox"] for obj in anno] - # guard against no boxes via resizing - boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4) - boxes[:, 2:] += boxes[:, :2] - boxes[:, 0::2].clamp_(min=0, max=w) - boxes[:, 1::2].clamp_(min=0, max=h) - - classes = [obj["category_id"] for obj in anno] - classes = torch.tensor(classes, 
dtype=torch.int64) - - segmentations = [obj["segmentation"] for obj in anno] - masks = convert_coco_poly_to_mask(segmentations, h, w) - - keypoints = None - if anno and "keypoints" in anno[0]: - keypoints = [obj["keypoints"] for obj in anno] - keypoints = torch.as_tensor(keypoints, dtype=torch.float32) - num_keypoints = keypoints.shape[0] - if num_keypoints: - keypoints = keypoints.view(num_keypoints, -1, 3) - - keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0]) - boxes = boxes[keep] - classes = classes[keep] - masks = masks[keep] - if keypoints is not None: - keypoints = keypoints[keep] - - target = {} - target["boxes"] = boxes - target["labels"] = classes - target["masks"] = masks - target["image_id"] = image_id - if keypoints is not None: - target["keypoints"] = keypoints - - # for conversion to coco api - area = torch.tensor([obj["area"] for obj in anno]) - iscrowd = torch.tensor([obj["iscrowd"] for obj in anno]) - target["area"] = area - target["iscrowd"] = iscrowd - - return image, target - - -def _coco_remove_images_without_annotations(dataset, cat_list=None): - def _has_only_empty_bbox(anno): - return all(any(o <= 1 for o in obj["bbox"][2:]) for obj in anno) - - def _count_visible_keypoints(anno): - return sum(sum(1 for v in ann["keypoints"][2::3] if v > 0) for ann in anno) - - min_keypoints_per_image = 10 - - def _has_valid_annotation(anno): - # if it's empty, there is no annotation - if len(anno) == 0: - return False - # if all boxes have close to zero area, there is no annotation - if _has_only_empty_bbox(anno): - return False - # keypoints task have a slight different criteria for considering - # if an annotation is valid - if "keypoints" not in anno[0]: - return True - # for keypoint detection tasks, only consider valid images those - # containing at least min_keypoints_per_image - if _count_visible_keypoints(anno) >= min_keypoints_per_image: - return True - return False - - ids = [] - for ds_idx, img_id in enumerate(dataset.ids): - ann_ids = dataset.coco.getAnnIds(imgIds=img_id, iscrowd=None) - anno = dataset.coco.loadAnns(ann_ids) - if cat_list: - anno = [obj for obj in anno if obj["category_id"] in cat_list] - if _has_valid_annotation(anno): - ids.append(ds_idx) - - dataset = torch.utils.data.Subset(dataset, ids) - return dataset - - -def convert_to_coco_api(ds): - coco_ds = COCO() - # annotation IDs need to start at 1, not 0, see torchvision issue #1530 - ann_id = 1 - dataset = {"images": [], "categories": [], "annotations": []} - categories = set() - for img_idx in range(len(ds)): - # find better way to get target - # targets = ds.get_annotations(img_idx) - img, targets = ds[img_idx] - image_id = targets["image_id"] - img_dict = {} - img_dict["id"] = image_id - img_dict["height"] = img.shape[-2] - img_dict["width"] = img.shape[-1] - dataset["images"].append(img_dict) - bboxes = targets["boxes"].clone() - bboxes[:, 2:] -= bboxes[:, :2] - bboxes = bboxes.tolist() - labels = targets["labels"].tolist() - areas = targets["area"].tolist() - iscrowd = targets["iscrowd"].tolist() - if "masks" in targets: - masks = targets["masks"] - # make masks Fortran contiguous for coco_mask - masks = masks.permute(0, 2, 1).contiguous().permute(0, 2, 1) - if "keypoints" in targets: - keypoints = targets["keypoints"] - keypoints = keypoints.reshape(keypoints.shape[0], -1).tolist() - num_objs = len(bboxes) - for i in range(num_objs): - ann = {} - ann["image_id"] = image_id - ann["bbox"] = bboxes[i] - ann["category_id"] = labels[i] - categories.add(labels[i]) - ann["area"] = 
areas[i] - ann["iscrowd"] = iscrowd[i] - ann["id"] = ann_id - if "masks" in targets: - ann["segmentation"] = coco_mask.encode(masks[i].numpy()) - if "keypoints" in targets: - ann["keypoints"] = keypoints[i] - ann["num_keypoints"] = sum(k != 0 for k in keypoints[i][2::3]) - dataset["annotations"].append(ann) - ann_id += 1 - dataset["categories"] = [{"id": i} for i in sorted(categories)] - coco_ds.dataset = dataset - coco_ds.createIndex() - return coco_ds - - -def get_coco_api_from_dataset(dataset): - # FIXME: This is... awful? - for _ in range(10): - if isinstance(dataset, torchvision.datasets.CocoDetection): - break - if isinstance(dataset, torch.utils.data.Subset): - dataset = dataset.dataset - if isinstance(dataset, torchvision.datasets.CocoDetection): - return dataset.coco - return convert_to_coco_api(dataset) - - -class CocoDetection(torchvision.datasets.CocoDetection): - def __init__(self, img_folder, ann_file, transforms): - super().__init__(img_folder, ann_file) - self._transforms = transforms - - def __getitem__(self, idx): - img, target = super().__getitem__(idx) - image_id = self.ids[idx] - target = dict(image_id=image_id, annotations=target) - if self._transforms is not None: - img, target = self._transforms(img, target) - return img, target - - -def get_coco(root, image_set, transforms, mode="instances", use_v2=False, with_masks=False): - anno_file_template = "{}_{}2017.json" - PATHS = { - "train": ("train2017", os.path.join("annotations", anno_file_template.format(mode, "train"))), - "val": ("val2017", os.path.join("annotations", anno_file_template.format(mode, "val"))), - # "train": ("val2017", os.path.join("annotations", anno_file_template.format(mode, "val"))) - } - - img_folder, ann_file = PATHS[image_set] - img_folder = os.path.join(root, img_folder) - ann_file = os.path.join(root, ann_file) - - if use_v2: - from torchvision.datasets import wrap_dataset_for_transforms_v2 - - dataset = torchvision.datasets.CocoDetection(img_folder, ann_file, transforms=transforms) - target_keys = ["boxes", "labels", "image_id"] - if with_masks: - target_keys += ["masks"] - dataset = wrap_dataset_for_transforms_v2(dataset, target_keys=target_keys) - else: - # TODO: handle with_masks for V1? 
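# Editor's note (added comment): on the v1 path, ConvertCocoPolysToMask runs
# before any user-supplied transforms, so downstream transforms always see a
# target whose "boxes", "labels" and "masks" are already plain tensors.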
- t = [ConvertCocoPolysToMask()] - if transforms is not None: - t.append(transforms) - transforms = T.Compose(t) - - dataset = CocoDetection(img_folder, ann_file, transforms=transforms) - - if image_set == "train": - dataset = _coco_remove_images_without_annotations(dataset) - - # dataset = torch.utils.data.Subset(dataset, [i for i in range(500)]) - - return dataset diff --git a/integrations/pytorch_ddp/test/torchvision/engine.py b/integrations/pytorch_ddp/test/torchvision/engine.py deleted file mode 100644 index 0e9bfffd..00000000 --- a/integrations/pytorch_ddp/test/torchvision/engine.py +++ /dev/null @@ -1,115 +0,0 @@ -import math -import sys -import time - -import torch -import torchvision.models.detection.mask_rcnn -import utils -from coco_eval import CocoEvaluator -from coco_utils import get_coco_api_from_dataset - - -def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq, scaler=None): - model.train() - metric_logger = utils.MetricLogger(delimiter=" ") - metric_logger.add_meter("lr", utils.SmoothedValue(window_size=1, fmt="{value:.6f}")) - header = f"Epoch: [{epoch}]" - - lr_scheduler = None - if epoch == 0: - warmup_factor = 1.0 / 1000 - warmup_iters = min(1000, len(data_loader) - 1) - - lr_scheduler = torch.optim.lr_scheduler.LinearLR( - optimizer, start_factor=warmup_factor, total_iters=warmup_iters - ) - - for images, targets in metric_logger.log_every(data_loader, print_freq, header): - images = list(image.to(device) for image in images) - targets = [{k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in t.items()} for t in targets] - with torch.cuda.amp.autocast(enabled=scaler is not None): - loss_dict = model(images, targets) - losses = sum(loss for loss in loss_dict.values()) - - # reduce losses over all GPUs for logging purposes - loss_dict_reduced = utils.reduce_dict(loss_dict) - losses_reduced = sum(loss for loss in loss_dict_reduced.values()) - - loss_value = losses_reduced.item() - - if not math.isfinite(loss_value): - print(f"Loss is {loss_value}, stopping training") - print(loss_dict_reduced) - sys.exit(1) - - optimizer.zero_grad() - if scaler is not None: - scaler.scale(losses).backward() - scaler.step(optimizer) - scaler.update() - else: - losses.backward() - optimizer.step() - - if lr_scheduler is not None: - lr_scheduler.step() - - metric_logger.update(loss=losses_reduced, **loss_dict_reduced) - metric_logger.update(lr=optimizer.param_groups[0]["lr"]) - - return metric_logger - - -def _get_iou_types(model): - model_without_ddp = model - if isinstance(model, torch.nn.parallel.DistributedDataParallel): - model_without_ddp = model.module - iou_types = ["bbox"] - if isinstance(model_without_ddp, torchvision.models.detection.MaskRCNN): - iou_types.append("segm") - if isinstance(model_without_ddp, torchvision.models.detection.KeypointRCNN): - iou_types.append("keypoints") - return iou_types - - -@torch.inference_mode() -def evaluate(model, data_loader, device): - n_threads = torch.get_num_threads() - # FIXME remove this and make paste_masks_in_image run on the GPU - torch.set_num_threads(1) - cpu_device = torch.device("cpu") - model.eval() - metric_logger = utils.MetricLogger(delimiter=" ") - header = "Test:" - - coco = get_coco_api_from_dataset(data_loader.dataset) - iou_types = _get_iou_types(model) - coco_evaluator = CocoEvaluator(coco, iou_types) - - for images, targets in metric_logger.log_every(data_loader, 100, header): - images = list(img.to(device) for img in images) - - if torch.cuda.is_available(): - torch.cuda.synchronize() - 
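# Editor's note (added comment): CUDA kernels launch asynchronously, so the
# synchronize() above is what makes the wall-clock model_time measured below
# reflect actual GPU inference work instead of just kernel launch overhead.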
model_time = time.time() - outputs = model(images) - - outputs = [{k: v.to(cpu_device) for k, v in t.items()} for t in outputs] - model_time = time.time() - model_time - - res = {target["image_id"]: output for target, output in zip(targets, outputs)} - evaluator_time = time.time() - coco_evaluator.update(res) - evaluator_time = time.time() - evaluator_time - metric_logger.update(model_time=model_time, evaluator_time=evaluator_time) - - # gather the stats from all processes - metric_logger.synchronize_between_processes() - print("Averaged stats:", metric_logger) - coco_evaluator.synchronize_between_processes() - - # accumulate predictions from all images - coco_evaluator.accumulate() - coco_evaluator.summarize() - torch.set_num_threads(n_threads) - return coco_evaluator diff --git a/integrations/pytorch_ddp/test/torchvision/main.py b/integrations/pytorch_ddp/test/torchvision/main.py deleted file mode 100644 index d6296500..00000000 --- a/integrations/pytorch_ddp/test/torchvision/main.py +++ /dev/null @@ -1,100 +0,0 @@ -import torchvision -from PennFudanDataset import PennFudanDataset, get_transform -import torch -from torchvision.models.detection.faster_rcnn import FastRCNNPredictor -from torchvision.models.detection.mask_rcnn import MaskRCNNPredictor -from engine import train_one_epoch, evaluate -import utils -from torchvision.transforms import v2 as T - - -def get_model_instance_segmentation(num_classes): - # load an instance segmentation model pre-trained on COCO - model = torchvision.models.detection.maskrcnn_resnet50_fpn(weights="DEFAULT") - - # get number of input features for the classifier - in_features = model.roi_heads.box_predictor.cls_score.in_features - # replace the pre-trained head with a new one - model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes) - - # now get the number of input features for the mask classifier - in_features_mask = model.roi_heads.mask_predictor.conv5_mask.in_channels - hidden_layer = 256 - # and replace the mask predictor with a new one - model.roi_heads.mask_predictor = MaskRCNNPredictor( - in_features_mask, - hidden_layer, - num_classes - ) - - return model - - - -# train on the GPU or on the CPU, if a GPU is not available -device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') - - -# our dataset has two classes only - background and person -num_classes = 2 -# use our dataset and defined transformations -dataset = PennFudanDataset('data/PennFudanPed', get_transform(train=True)) -dataset_test = PennFudanDataset('data/PennFudanPed', get_transform(train=False)) - -# split the dataset in train and test set -indices = torch.randperm(len(dataset)).tolist() -dataset = torch.utils.data.Subset(dataset, indices[:-50]) -dataset_test = torch.utils.data.Subset(dataset_test, indices[-50:]) - - -# define training and validation data loaders -data_loader = torch.utils.data.DataLoader( - dataset, - batch_size=2, - shuffle=True, - num_workers=4, - collate_fn=utils.collate_fn -) - -data_loader_test = torch.utils.data.DataLoader( - dataset_test, - batch_size=1, - shuffle=False, - num_workers=4, - collate_fn=utils.collate_fn -) - -# get the model using our helper function -model = get_model_instance_segmentation(num_classes) - -# move model to the right device -model.to(device) - -# construct an optimizer -params = [p for p in model.parameters() if p.requires_grad] -optimizer = torch.optim.SGD( - params, - lr=0.005, - momentum=0.9, - weight_decay=0.0005 -) - -# and a learning rate scheduler -lr_scheduler = 
torch.optim.lr_scheduler.StepLR( - optimizer, - step_size=3, - gamma=0.1 -) - -# let's train it just for 2 epochs -num_epochs = 2 - -for epoch in range(num_epochs): - # train for one epoch, printing every 10 iterations - train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq=10) - # update the learning rate - lr_scheduler.step() - # evaluate on the test dataset - evaluate(model, data_loader_test, device=device) - -print("That's it!") diff --git a/integrations/pytorch_ddp/test/torchvision/transforms.py b/integrations/pytorch_ddp/test/torchvision/transforms.py deleted file mode 100644 index e07ccfc9..00000000 --- a/integrations/pytorch_ddp/test/torchvision/transforms.py +++ /dev/null @@ -1,601 +0,0 @@ -from typing import Dict, List, Optional, Tuple, Union - -import torch -import torchvision -from torch import nn, Tensor -from torchvision import ops -from torchvision.transforms import functional as F, InterpolationMode, transforms as T - - -def _flip_coco_person_keypoints(kps, width): - flip_inds = [0, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 12, 11, 14, 13, 16, 15] - flipped_data = kps[:, flip_inds] - flipped_data[..., 0] = width - flipped_data[..., 0] - # Maintain COCO convention that if visibility == 0, then x, y = 0 - inds = flipped_data[..., 2] == 0 - flipped_data[inds] = 0 - return flipped_data - - -class Compose: - def __init__(self, transforms): - self.transforms = transforms - - def __call__(self, image, target): - for t in self.transforms: - image, target = t(image, target) - return image, target - - -class RandomHorizontalFlip(T.RandomHorizontalFlip): - def forward( - self, image: Tensor, target: Optional[Dict[str, Tensor]] = None - ) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]: - if torch.rand(1) < self.p: - image = F.hflip(image) - if target is not None: - _, _, width = F.get_dimensions(image) - target["boxes"][:, [0, 2]] = width - target["boxes"][:, [2, 0]] - if "masks" in target: - target["masks"] = target["masks"].flip(-1) - if "keypoints" in target: - keypoints = target["keypoints"] - keypoints = _flip_coco_person_keypoints(keypoints, width) - target["keypoints"] = keypoints - return image, target - - -class PILToTensor(nn.Module): - def forward( - self, image: Tensor, target: Optional[Dict[str, Tensor]] = None - ) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]: - image = F.pil_to_tensor(image) - return image, target - - -class ToDtype(nn.Module): - def __init__(self, dtype: torch.dtype, scale: bool = False) -> None: - super().__init__() - self.dtype = dtype - self.scale = scale - - def forward( - self, image: Tensor, target: Optional[Dict[str, Tensor]] = None - ) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]: - if not self.scale: - return image.to(dtype=self.dtype), target - image = F.convert_image_dtype(image, self.dtype) - return image, target - - -class RandomIoUCrop(nn.Module): - def __init__( - self, - min_scale: float = 0.3, - max_scale: float = 1.0, - min_aspect_ratio: float = 0.5, - max_aspect_ratio: float = 2.0, - sampler_options: Optional[List[float]] = None, - trials: int = 40, - ): - super().__init__() - # Configuration similar to https://github.com/weiliu89/caffe/blob/ssd/examples/ssd/ssd_coco.py#L89-L174 - self.min_scale = min_scale - self.max_scale = max_scale - self.min_aspect_ratio = min_aspect_ratio - self.max_aspect_ratio = max_aspect_ratio - if sampler_options is None: - sampler_options = [0.0, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0] - self.options = sampler_options - self.trials = trials - - def forward( - self, image: Tensor, target: Optional[Dict[str, 
Tensor]] = None - ) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]: - if target is None: - raise ValueError("The targets can't be None for this transform.") - - if isinstance(image, torch.Tensor): - if image.ndimension() not in {2, 3}: - raise ValueError(f"image should be 2/3 dimensional. Got {image.ndimension()} dimensions.") - elif image.ndimension() == 2: - image = image.unsqueeze(0) - - _, orig_h, orig_w = F.get_dimensions(image) - - while True: - # sample an option - idx = int(torch.randint(low=0, high=len(self.options), size=(1,))) - min_jaccard_overlap = self.options[idx] - if min_jaccard_overlap >= 1.0: # a value larger than 1 encodes the leave as-is option - return image, target - - for _ in range(self.trials): - # check the aspect ratio limitations - r = self.min_scale + (self.max_scale - self.min_scale) * torch.rand(2) - new_w = int(orig_w * r[0]) - new_h = int(orig_h * r[1]) - aspect_ratio = new_w / new_h - if not (self.min_aspect_ratio <= aspect_ratio <= self.max_aspect_ratio): - continue - - # check for 0 area crops - r = torch.rand(2) - left = int((orig_w - new_w) * r[0]) - top = int((orig_h - new_h) * r[1]) - right = left + new_w - bottom = top + new_h - if left == right or top == bottom: - continue - - # check for any valid boxes with centers within the crop area - cx = 0.5 * (target["boxes"][:, 0] + target["boxes"][:, 2]) - cy = 0.5 * (target["boxes"][:, 1] + target["boxes"][:, 3]) - is_within_crop_area = (left < cx) & (cx < right) & (top < cy) & (cy < bottom) - if not is_within_crop_area.any(): - continue - - # check at least 1 box with jaccard limitations - boxes = target["boxes"][is_within_crop_area] - ious = torchvision.ops.boxes.box_iou( - boxes, torch.tensor([[left, top, right, bottom]], dtype=boxes.dtype, device=boxes.device) - ) - if ious.max() < min_jaccard_overlap: - continue - - # keep only valid boxes and perform cropping - target["boxes"] = boxes - target["labels"] = target["labels"][is_within_crop_area] - target["boxes"][:, 0::2] -= left - target["boxes"][:, 1::2] -= top - target["boxes"][:, 0::2].clamp_(min=0, max=new_w) - target["boxes"][:, 1::2].clamp_(min=0, max=new_h) - image = F.crop(image, top, left, new_h, new_w) - - return image, target - - -class RandomZoomOut(nn.Module): - def __init__( - self, fill: Optional[List[float]] = None, side_range: Tuple[float, float] = (1.0, 4.0), p: float = 0.5 - ): - super().__init__() - if fill is None: - fill = [0.0, 0.0, 0.0] - self.fill = fill - self.side_range = side_range - if side_range[0] < 1.0 or side_range[0] > side_range[1]: - raise ValueError(f"Invalid canvas side range provided {side_range}.") - self.p = p - - @torch.jit.unused - def _get_fill_value(self, is_pil): - # type: (bool) -> int - # We fake the type to make it work on JIT - return tuple(int(x) for x in self.fill) if is_pil else 0 - - def forward( - self, image: Tensor, target: Optional[Dict[str, Tensor]] = None - ) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]: - if isinstance(image, torch.Tensor): - if image.ndimension() not in {2, 3}: - raise ValueError(f"image should be 2/3 dimensional. 
Got {image.ndimension()} dimensions.") - elif image.ndimension() == 2: - image = image.unsqueeze(0) - - if torch.rand(1) >= self.p: - return image, target - - _, orig_h, orig_w = F.get_dimensions(image) - - r = self.side_range[0] + torch.rand(1) * (self.side_range[1] - self.side_range[0]) - canvas_width = int(orig_w * r) - canvas_height = int(orig_h * r) - - r = torch.rand(2) - left = int((canvas_width - orig_w) * r[0]) - top = int((canvas_height - orig_h) * r[1]) - right = canvas_width - (left + orig_w) - bottom = canvas_height - (top + orig_h) - - if torch.jit.is_scripting(): - fill = 0 - else: - fill = self._get_fill_value(F._is_pil_image(image)) - - image = F.pad(image, [left, top, right, bottom], fill=fill) - if isinstance(image, torch.Tensor): - # PyTorch's pad supports only integers on fill. So we need to overwrite the colour - v = torch.tensor(self.fill, device=image.device, dtype=image.dtype).view(-1, 1, 1) - image[..., :top, :] = image[..., :, :left] = image[..., (top + orig_h) :, :] = image[ - ..., :, (left + orig_w) : - ] = v - - if target is not None: - target["boxes"][:, 0::2] += left - target["boxes"][:, 1::2] += top - - return image, target - - -class RandomPhotometricDistort(nn.Module): - def __init__( - self, - contrast: Tuple[float, float] = (0.5, 1.5), - saturation: Tuple[float, float] = (0.5, 1.5), - hue: Tuple[float, float] = (-0.05, 0.05), - brightness: Tuple[float, float] = (0.875, 1.125), - p: float = 0.5, - ): - super().__init__() - self._brightness = T.ColorJitter(brightness=brightness) - self._contrast = T.ColorJitter(contrast=contrast) - self._hue = T.ColorJitter(hue=hue) - self._saturation = T.ColorJitter(saturation=saturation) - self.p = p - - def forward( - self, image: Tensor, target: Optional[Dict[str, Tensor]] = None - ) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]: - if isinstance(image, torch.Tensor): - if image.ndimension() not in {2, 3}: - raise ValueError(f"image should be 2/3 dimensional. Got {image.ndimension()} dimensions.") - elif image.ndimension() == 2: - image = image.unsqueeze(0) - - r = torch.rand(7) - - if r[0] < self.p: - image = self._brightness(image) - - contrast_before = r[1] < 0.5 - if contrast_before: - if r[2] < self.p: - image = self._contrast(image) - - if r[3] < self.p: - image = self._saturation(image) - - if r[4] < self.p: - image = self._hue(image) - - if not contrast_before: - if r[5] < self.p: - image = self._contrast(image) - - if r[6] < self.p: - channels, _, _ = F.get_dimensions(image) - permutation = torch.randperm(channels) - - is_pil = F._is_pil_image(image) - if is_pil: - image = F.pil_to_tensor(image) - image = F.convert_image_dtype(image) - image = image[..., permutation, :, :] - if is_pil: - image = F.to_pil_image(image) - - return image, target - - -class ScaleJitter(nn.Module): - """Randomly resizes the image and its bounding boxes within the specified scale range. - The class implements the Scale Jitter augmentation as described in the paper - `"Simple Copy-Paste is a Strong Data Augmentation Method for Instance Segmentation" `_. - - Args: - target_size (tuple of ints): The target size for the transform provided in (height, weight) format. - scale_range (tuple of ints): scaling factor interval, e.g (a, b), then scale is randomly sampled from the - range a <= scale <= b. - interpolation (InterpolationMode): Desired interpolation enum defined by - :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``. 
- """ - - def __init__( - self, - target_size: Tuple[int, int], - scale_range: Tuple[float, float] = (0.1, 2.0), - interpolation: InterpolationMode = InterpolationMode.BILINEAR, - antialias=True, - ): - super().__init__() - self.target_size = target_size - self.scale_range = scale_range - self.interpolation = interpolation - self.antialias = antialias - - def forward( - self, image: Tensor, target: Optional[Dict[str, Tensor]] = None - ) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]: - if isinstance(image, torch.Tensor): - if image.ndimension() not in {2, 3}: - raise ValueError(f"image should be 2/3 dimensional. Got {image.ndimension()} dimensions.") - elif image.ndimension() == 2: - image = image.unsqueeze(0) - - _, orig_height, orig_width = F.get_dimensions(image) - - scale = self.scale_range[0] + torch.rand(1) * (self.scale_range[1] - self.scale_range[0]) - r = min(self.target_size[1] / orig_height, self.target_size[0] / orig_width) * scale - new_width = int(orig_width * r) - new_height = int(orig_height * r) - - image = F.resize(image, [new_height, new_width], interpolation=self.interpolation, antialias=self.antialias) - - if target is not None: - target["boxes"][:, 0::2] *= new_width / orig_width - target["boxes"][:, 1::2] *= new_height / orig_height - if "masks" in target: - target["masks"] = F.resize( - target["masks"], - [new_height, new_width], - interpolation=InterpolationMode.NEAREST, - antialias=self.antialias, - ) - - return image, target - - -class FixedSizeCrop(nn.Module): - def __init__(self, size, fill=0, padding_mode="constant"): - super().__init__() - size = tuple(T._setup_size(size, error_msg="Please provide only two dimensions (h, w) for size.")) - self.crop_height = size[0] - self.crop_width = size[1] - self.fill = fill # TODO: Fill is currently respected only on PIL. Apply tensor patch. 
- self.padding_mode = padding_mode - - def _pad(self, img, target, padding): - # Taken from the functional_tensor.py pad - if isinstance(padding, int): - pad_left = pad_right = pad_top = pad_bottom = padding - elif len(padding) == 1: - pad_left = pad_right = pad_top = pad_bottom = padding[0] - elif len(padding) == 2: - pad_left = pad_right = padding[0] - pad_top = pad_bottom = padding[1] - else: - pad_left = padding[0] - pad_top = padding[1] - pad_right = padding[2] - pad_bottom = padding[3] - - padding = [pad_left, pad_top, pad_right, pad_bottom] - img = F.pad(img, padding, self.fill, self.padding_mode) - if target is not None: - target["boxes"][:, 0::2] += pad_left - target["boxes"][:, 1::2] += pad_top - if "masks" in target: - target["masks"] = F.pad(target["masks"], padding, 0, "constant") - - return img, target - - def _crop(self, img, target, top, left, height, width): - img = F.crop(img, top, left, height, width) - if target is not None: - boxes = target["boxes"] - boxes[:, 0::2] -= left - boxes[:, 1::2] -= top - boxes[:, 0::2].clamp_(min=0, max=width) - boxes[:, 1::2].clamp_(min=0, max=height) - - is_valid = (boxes[:, 0] < boxes[:, 2]) & (boxes[:, 1] < boxes[:, 3]) - - target["boxes"] = boxes[is_valid] - target["labels"] = target["labels"][is_valid] - if "masks" in target: - target["masks"] = F.crop(target["masks"][is_valid], top, left, height, width) - - return img, target - - def forward(self, img, target=None): - _, height, width = F.get_dimensions(img) - new_height = min(height, self.crop_height) - new_width = min(width, self.crop_width) - - if new_height != height or new_width != width: - offset_height = max(height - self.crop_height, 0) - offset_width = max(width - self.crop_width, 0) - - r = torch.rand(1) - top = int(offset_height * r) - left = int(offset_width * r) - - img, target = self._crop(img, target, top, left, new_height, new_width) - - pad_bottom = max(self.crop_height - new_height, 0) - pad_right = max(self.crop_width - new_width, 0) - if pad_bottom != 0 or pad_right != 0: - img, target = self._pad(img, target, [0, 0, pad_right, pad_bottom]) - - return img, target - - -class RandomShortestSize(nn.Module): - def __init__( - self, - min_size: Union[List[int], Tuple[int], int], - max_size: int, - interpolation: InterpolationMode = InterpolationMode.BILINEAR, - ): - super().__init__() - self.min_size = [min_size] if isinstance(min_size, int) else list(min_size) - self.max_size = max_size - self.interpolation = interpolation - - def forward( - self, image: Tensor, target: Optional[Dict[str, Tensor]] = None - ) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]: - _, orig_height, orig_width = F.get_dimensions(image) - - min_size = self.min_size[torch.randint(len(self.min_size), (1,)).item()] - r = min(min_size / min(orig_height, orig_width), self.max_size / max(orig_height, orig_width)) - - new_width = int(orig_width * r) - new_height = int(orig_height * r) - - image = F.resize(image, [new_height, new_width], interpolation=self.interpolation) - - if target is not None: - target["boxes"][:, 0::2] *= new_width / orig_width - target["boxes"][:, 1::2] *= new_height / orig_height - if "masks" in target: - target["masks"] = F.resize( - target["masks"], [new_height, new_width], interpolation=InterpolationMode.NEAREST - ) - - return image, target - - -def _copy_paste( - image: torch.Tensor, - target: Dict[str, Tensor], - paste_image: torch.Tensor, - paste_target: Dict[str, Tensor], - blending: bool = True, - resize_interpolation: F.InterpolationMode = F.InterpolationMode.BILINEAR, -) 
-> Tuple[torch.Tensor, Dict[str, Tensor]]: - - # Random paste targets selection: - num_masks = len(paste_target["masks"]) - - if num_masks < 1: - # Such degerante case with num_masks=0 can happen with LSJ - # Let's just return (image, target) - return image, target - - # We have to please torch script by explicitly specifying dtype as torch.long - random_selection = torch.randint(0, num_masks, (num_masks,), device=paste_image.device) - random_selection = torch.unique(random_selection).to(torch.long) - - paste_masks = paste_target["masks"][random_selection] - paste_boxes = paste_target["boxes"][random_selection] - paste_labels = paste_target["labels"][random_selection] - - masks = target["masks"] - - # We resize source and paste data if they have different sizes - # This is something we introduced here as originally the algorithm works - # on equal-sized data (for example, coming from LSJ data augmentations) - size1 = image.shape[-2:] - size2 = paste_image.shape[-2:] - if size1 != size2: - paste_image = F.resize(paste_image, size1, interpolation=resize_interpolation) - paste_masks = F.resize(paste_masks, size1, interpolation=F.InterpolationMode.NEAREST) - # resize bboxes: - ratios = torch.tensor((size1[1] / size2[1], size1[0] / size2[0]), device=paste_boxes.device) - paste_boxes = paste_boxes.view(-1, 2, 2).mul(ratios).view(paste_boxes.shape) - - paste_alpha_mask = paste_masks.sum(dim=0) > 0 - - if blending: - paste_alpha_mask = F.gaussian_blur( - paste_alpha_mask.unsqueeze(0), - kernel_size=(5, 5), - sigma=[ - 2.0, - ], - ) - - # Copy-paste images: - image = (image * (~paste_alpha_mask)) + (paste_image * paste_alpha_mask) - - # Copy-paste masks: - masks = masks * (~paste_alpha_mask) - non_all_zero_masks = masks.sum((-1, -2)) > 0 - masks = masks[non_all_zero_masks] - - # Do a shallow copy of the target dict - out_target = {k: v for k, v in target.items()} - - out_target["masks"] = torch.cat([masks, paste_masks]) - - # Copy-paste boxes and labels - boxes = ops.masks_to_boxes(masks) - out_target["boxes"] = torch.cat([boxes, paste_boxes]) - - labels = target["labels"][non_all_zero_masks] - out_target["labels"] = torch.cat([labels, paste_labels]) - - # Update additional optional keys: area and iscrowd if exist - if "area" in target: - out_target["area"] = out_target["masks"].sum((-1, -2)).to(torch.float32) - - if "iscrowd" in target and "iscrowd" in paste_target: - # target['iscrowd'] size can be differ from mask size (non_all_zero_masks) - # For example, if previous transforms geometrically modifies masks/boxes/labels but - # does not update "iscrowd" - if len(target["iscrowd"]) == len(non_all_zero_masks): - iscrowd = target["iscrowd"][non_all_zero_masks] - paste_iscrowd = paste_target["iscrowd"][random_selection] - out_target["iscrowd"] = torch.cat([iscrowd, paste_iscrowd]) - - # Check for degenerated boxes and remove them - boxes = out_target["boxes"] - degenerate_boxes = boxes[:, 2:] <= boxes[:, :2] - if degenerate_boxes.any(): - valid_targets = ~degenerate_boxes.any(dim=1) - - out_target["boxes"] = boxes[valid_targets] - out_target["masks"] = out_target["masks"][valid_targets] - out_target["labels"] = out_target["labels"][valid_targets] - - if "area" in out_target: - out_target["area"] = out_target["area"][valid_targets] - if "iscrowd" in out_target and len(out_target["iscrowd"]) == len(valid_targets): - out_target["iscrowd"] = out_target["iscrowd"][valid_targets] - - return image, out_target - - -class SimpleCopyPaste(torch.nn.Module): - def __init__(self, blending=True, 
resize_interpolation=F.InterpolationMode.BILINEAR): - super().__init__() - self.resize_interpolation = resize_interpolation - self.blending = blending - - def forward( - self, images: List[torch.Tensor], targets: List[Dict[str, Tensor]] - ) -> Tuple[List[torch.Tensor], List[Dict[str, Tensor]]]: - torch._assert( - isinstance(images, (list, tuple)) and all([isinstance(v, torch.Tensor) for v in images]), - "images should be a list of tensors", - ) - torch._assert( - isinstance(targets, (list, tuple)) and len(images) == len(targets), - "targets should be a list of the same size as images", - ) - for target in targets: - # Can not check for instance type dict with inside torch.jit.script - # torch._assert(isinstance(target, dict), "targets item should be a dict") - for k in ["masks", "boxes", "labels"]: - torch._assert(k in target, f"Key {k} should be present in targets") - torch._assert(isinstance(target[k], torch.Tensor), f"Value for the key {k} should be a tensor") - - # images = [t1, t2, ..., tN] - # Let's define paste_images as shifted list of input images - # paste_images = [t2, t3, ..., tN, t1] - # FYI: in TF they mix data on the dataset level - images_rolled = images[-1:] + images[:-1] - targets_rolled = targets[-1:] + targets[:-1] - - output_images: List[torch.Tensor] = [] - output_targets: List[Dict[str, Tensor]] = [] - - for image, target, paste_image, paste_target in zip(images, targets, images_rolled, targets_rolled): - output_image, output_data = _copy_paste( - image, - target, - paste_image, - paste_target, - blending=self.blending, - resize_interpolation=self.resize_interpolation, - ) - output_images.append(output_image) - output_targets.append(output_data) - - return output_images, output_targets - - def __repr__(self) -> str: - s = f"{self.__class__.__name__}(blending={self.blending}, resize_interpolation={self.resize_interpolation})" - return s diff --git a/integrations/pytorch_ddp/test/torchvision/transforms.py.1 b/integrations/pytorch_ddp/test/torchvision/transforms.py.1 deleted file mode 100644 index e07ccfc9..00000000 --- a/integrations/pytorch_ddp/test/torchvision/transforms.py.1 +++ /dev/null @@ -1,601 +0,0 @@ -from typing import Dict, List, Optional, Tuple, Union - -import torch -import torchvision -from torch import nn, Tensor -from torchvision import ops -from torchvision.transforms import functional as F, InterpolationMode, transforms as T - - -def _flip_coco_person_keypoints(kps, width): - flip_inds = [0, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 12, 11, 14, 13, 16, 15] - flipped_data = kps[:, flip_inds] - flipped_data[..., 0] = width - flipped_data[..., 0] - # Maintain COCO convention that if visibility == 0, then x, y = 0 - inds = flipped_data[..., 2] == 0 - flipped_data[inds] = 0 - return flipped_data - - -class Compose: - def __init__(self, transforms): - self.transforms = transforms - - def __call__(self, image, target): - for t in self.transforms: - image, target = t(image, target) - return image, target - - -class RandomHorizontalFlip(T.RandomHorizontalFlip): - def forward( - self, image: Tensor, target: Optional[Dict[str, Tensor]] = None - ) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]: - if torch.rand(1) < self.p: - image = F.hflip(image) - if target is not None: - _, _, width = F.get_dimensions(image) - target["boxes"][:, [0, 2]] = width - target["boxes"][:, [2, 0]] - if "masks" in target: - target["masks"] = target["masks"].flip(-1) - if "keypoints" in target: - keypoints = target["keypoints"] - keypoints = _flip_coco_person_keypoints(keypoints, width) - 
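# Editor's note (added comment): the helper above swaps the left/right COCO
# keypoint indices, mirrors x to width - x, and zeroes out any keypoint whose
# visibility flag is 0, preserving the COCO convention that invisible
# keypoints are stored as (0, 0).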
target["keypoints"] = keypoints - return image, target - - -class PILToTensor(nn.Module): - def forward( - self, image: Tensor, target: Optional[Dict[str, Tensor]] = None - ) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]: - image = F.pil_to_tensor(image) - return image, target - - -class ToDtype(nn.Module): - def __init__(self, dtype: torch.dtype, scale: bool = False) -> None: - super().__init__() - self.dtype = dtype - self.scale = scale - - def forward( - self, image: Tensor, target: Optional[Dict[str, Tensor]] = None - ) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]: - if not self.scale: - return image.to(dtype=self.dtype), target - image = F.convert_image_dtype(image, self.dtype) - return image, target - - -class RandomIoUCrop(nn.Module): - def __init__( - self, - min_scale: float = 0.3, - max_scale: float = 1.0, - min_aspect_ratio: float = 0.5, - max_aspect_ratio: float = 2.0, - sampler_options: Optional[List[float]] = None, - trials: int = 40, - ): - super().__init__() - # Configuration similar to https://github.com/weiliu89/caffe/blob/ssd/examples/ssd/ssd_coco.py#L89-L174 - self.min_scale = min_scale - self.max_scale = max_scale - self.min_aspect_ratio = min_aspect_ratio - self.max_aspect_ratio = max_aspect_ratio - if sampler_options is None: - sampler_options = [0.0, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0] - self.options = sampler_options - self.trials = trials - - def forward( - self, image: Tensor, target: Optional[Dict[str, Tensor]] = None - ) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]: - if target is None: - raise ValueError("The targets can't be None for this transform.") - - if isinstance(image, torch.Tensor): - if image.ndimension() not in {2, 3}: - raise ValueError(f"image should be 2/3 dimensional. Got {image.ndimension()} dimensions.") - elif image.ndimension() == 2: - image = image.unsqueeze(0) - - _, orig_h, orig_w = F.get_dimensions(image) - - while True: - # sample an option - idx = int(torch.randint(low=0, high=len(self.options), size=(1,))) - min_jaccard_overlap = self.options[idx] - if min_jaccard_overlap >= 1.0: # a value larger than 1 encodes the leave as-is option - return image, target - - for _ in range(self.trials): - # check the aspect ratio limitations - r = self.min_scale + (self.max_scale - self.min_scale) * torch.rand(2) - new_w = int(orig_w * r[0]) - new_h = int(orig_h * r[1]) - aspect_ratio = new_w / new_h - if not (self.min_aspect_ratio <= aspect_ratio <= self.max_aspect_ratio): - continue - - # check for 0 area crops - r = torch.rand(2) - left = int((orig_w - new_w) * r[0]) - top = int((orig_h - new_h) * r[1]) - right = left + new_w - bottom = top + new_h - if left == right or top == bottom: - continue - - # check for any valid boxes with centers within the crop area - cx = 0.5 * (target["boxes"][:, 0] + target["boxes"][:, 2]) - cy = 0.5 * (target["boxes"][:, 1] + target["boxes"][:, 3]) - is_within_crop_area = (left < cx) & (cx < right) & (top < cy) & (cy < bottom) - if not is_within_crop_area.any(): - continue - - # check at least 1 box with jaccard limitations - boxes = target["boxes"][is_within_crop_area] - ious = torchvision.ops.boxes.box_iou( - boxes, torch.tensor([[left, top, right, bottom]], dtype=boxes.dtype, device=boxes.device) - ) - if ious.max() < min_jaccard_overlap: - continue - - # keep only valid boxes and perform cropping - target["boxes"] = boxes - target["labels"] = target["labels"][is_within_crop_area] - target["boxes"][:, 0::2] -= left - target["boxes"][:, 1::2] -= top - target["boxes"][:, 0::2].clamp_(min=0, max=new_w) - 
target["boxes"][:, 1::2].clamp_(min=0, max=new_h) - image = F.crop(image, top, left, new_h, new_w) - - return image, target - - -class RandomZoomOut(nn.Module): - def __init__( - self, fill: Optional[List[float]] = None, side_range: Tuple[float, float] = (1.0, 4.0), p: float = 0.5 - ): - super().__init__() - if fill is None: - fill = [0.0, 0.0, 0.0] - self.fill = fill - self.side_range = side_range - if side_range[0] < 1.0 or side_range[0] > side_range[1]: - raise ValueError(f"Invalid canvas side range provided {side_range}.") - self.p = p - - @torch.jit.unused - def _get_fill_value(self, is_pil): - # type: (bool) -> int - # We fake the type to make it work on JIT - return tuple(int(x) for x in self.fill) if is_pil else 0 - - def forward( - self, image: Tensor, target: Optional[Dict[str, Tensor]] = None - ) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]: - if isinstance(image, torch.Tensor): - if image.ndimension() not in {2, 3}: - raise ValueError(f"image should be 2/3 dimensional. Got {image.ndimension()} dimensions.") - elif image.ndimension() == 2: - image = image.unsqueeze(0) - - if torch.rand(1) >= self.p: - return image, target - - _, orig_h, orig_w = F.get_dimensions(image) - - r = self.side_range[0] + torch.rand(1) * (self.side_range[1] - self.side_range[0]) - canvas_width = int(orig_w * r) - canvas_height = int(orig_h * r) - - r = torch.rand(2) - left = int((canvas_width - orig_w) * r[0]) - top = int((canvas_height - orig_h) * r[1]) - right = canvas_width - (left + orig_w) - bottom = canvas_height - (top + orig_h) - - if torch.jit.is_scripting(): - fill = 0 - else: - fill = self._get_fill_value(F._is_pil_image(image)) - - image = F.pad(image, [left, top, right, bottom], fill=fill) - if isinstance(image, torch.Tensor): - # PyTorch's pad supports only integers on fill. So we need to overwrite the colour - v = torch.tensor(self.fill, device=image.device, dtype=image.dtype).view(-1, 1, 1) - image[..., :top, :] = image[..., :, :left] = image[..., (top + orig_h) :, :] = image[ - ..., :, (left + orig_w) : - ] = v - - if target is not None: - target["boxes"][:, 0::2] += left - target["boxes"][:, 1::2] += top - - return image, target - - -class RandomPhotometricDistort(nn.Module): - def __init__( - self, - contrast: Tuple[float, float] = (0.5, 1.5), - saturation: Tuple[float, float] = (0.5, 1.5), - hue: Tuple[float, float] = (-0.05, 0.05), - brightness: Tuple[float, float] = (0.875, 1.125), - p: float = 0.5, - ): - super().__init__() - self._brightness = T.ColorJitter(brightness=brightness) - self._contrast = T.ColorJitter(contrast=contrast) - self._hue = T.ColorJitter(hue=hue) - self._saturation = T.ColorJitter(saturation=saturation) - self.p = p - - def forward( - self, image: Tensor, target: Optional[Dict[str, Tensor]] = None - ) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]: - if isinstance(image, torch.Tensor): - if image.ndimension() not in {2, 3}: - raise ValueError(f"image should be 2/3 dimensional. 
Got {image.ndimension()} dimensions.") - elif image.ndimension() == 2: - image = image.unsqueeze(0) - - r = torch.rand(7) - - if r[0] < self.p: - image = self._brightness(image) - - contrast_before = r[1] < 0.5 - if contrast_before: - if r[2] < self.p: - image = self._contrast(image) - - if r[3] < self.p: - image = self._saturation(image) - - if r[4] < self.p: - image = self._hue(image) - - if not contrast_before: - if r[5] < self.p: - image = self._contrast(image) - - if r[6] < self.p: - channels, _, _ = F.get_dimensions(image) - permutation = torch.randperm(channels) - - is_pil = F._is_pil_image(image) - if is_pil: - image = F.pil_to_tensor(image) - image = F.convert_image_dtype(image) - image = image[..., permutation, :, :] - if is_pil: - image = F.to_pil_image(image) - - return image, target - - -class ScaleJitter(nn.Module): - """Randomly resizes the image and its bounding boxes within the specified scale range. - The class implements the Scale Jitter augmentation as described in the paper - `"Simple Copy-Paste is a Strong Data Augmentation Method for Instance Segmentation" `_. - - Args: - target_size (tuple of ints): The target size for the transform provided in (height, weight) format. - scale_range (tuple of ints): scaling factor interval, e.g (a, b), then scale is randomly sampled from the - range a <= scale <= b. - interpolation (InterpolationMode): Desired interpolation enum defined by - :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``. - """ - - def __init__( - self, - target_size: Tuple[int, int], - scale_range: Tuple[float, float] = (0.1, 2.0), - interpolation: InterpolationMode = InterpolationMode.BILINEAR, - antialias=True, - ): - super().__init__() - self.target_size = target_size - self.scale_range = scale_range - self.interpolation = interpolation - self.antialias = antialias - - def forward( - self, image: Tensor, target: Optional[Dict[str, Tensor]] = None - ) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]: - if isinstance(image, torch.Tensor): - if image.ndimension() not in {2, 3}: - raise ValueError(f"image should be 2/3 dimensional. Got {image.ndimension()} dimensions.") - elif image.ndimension() == 2: - image = image.unsqueeze(0) - - _, orig_height, orig_width = F.get_dimensions(image) - - scale = self.scale_range[0] + torch.rand(1) * (self.scale_range[1] - self.scale_range[0]) - r = min(self.target_size[1] / orig_height, self.target_size[0] / orig_width) * scale - new_width = int(orig_width * r) - new_height = int(orig_height * r) - - image = F.resize(image, [new_height, new_width], interpolation=self.interpolation, antialias=self.antialias) - - if target is not None: - target["boxes"][:, 0::2] *= new_width / orig_width - target["boxes"][:, 1::2] *= new_height / orig_height - if "masks" in target: - target["masks"] = F.resize( - target["masks"], - [new_height, new_width], - interpolation=InterpolationMode.NEAREST, - antialias=self.antialias, - ) - - return image, target - - -class FixedSizeCrop(nn.Module): - def __init__(self, size, fill=0, padding_mode="constant"): - super().__init__() - size = tuple(T._setup_size(size, error_msg="Please provide only two dimensions (h, w) for size.")) - self.crop_height = size[0] - self.crop_width = size[1] - self.fill = fill # TODO: Fill is currently respected only on PIL. Apply tensor patch. 
- self.padding_mode = padding_mode - - def _pad(self, img, target, padding): - # Taken from the functional_tensor.py pad - if isinstance(padding, int): - pad_left = pad_right = pad_top = pad_bottom = padding - elif len(padding) == 1: - pad_left = pad_right = pad_top = pad_bottom = padding[0] - elif len(padding) == 2: - pad_left = pad_right = padding[0] - pad_top = pad_bottom = padding[1] - else: - pad_left = padding[0] - pad_top = padding[1] - pad_right = padding[2] - pad_bottom = padding[3] - - padding = [pad_left, pad_top, pad_right, pad_bottom] - img = F.pad(img, padding, self.fill, self.padding_mode) - if target is not None: - target["boxes"][:, 0::2] += pad_left - target["boxes"][:, 1::2] += pad_top - if "masks" in target: - target["masks"] = F.pad(target["masks"], padding, 0, "constant") - - return img, target - - def _crop(self, img, target, top, left, height, width): - img = F.crop(img, top, left, height, width) - if target is not None: - boxes = target["boxes"] - boxes[:, 0::2] -= left - boxes[:, 1::2] -= top - boxes[:, 0::2].clamp_(min=0, max=width) - boxes[:, 1::2].clamp_(min=0, max=height) - - is_valid = (boxes[:, 0] < boxes[:, 2]) & (boxes[:, 1] < boxes[:, 3]) - - target["boxes"] = boxes[is_valid] - target["labels"] = target["labels"][is_valid] - if "masks" in target: - target["masks"] = F.crop(target["masks"][is_valid], top, left, height, width) - - return img, target - - def forward(self, img, target=None): - _, height, width = F.get_dimensions(img) - new_height = min(height, self.crop_height) - new_width = min(width, self.crop_width) - - if new_height != height or new_width != width: - offset_height = max(height - self.crop_height, 0) - offset_width = max(width - self.crop_width, 0) - - r = torch.rand(1) - top = int(offset_height * r) - left = int(offset_width * r) - - img, target = self._crop(img, target, top, left, new_height, new_width) - - pad_bottom = max(self.crop_height - new_height, 0) - pad_right = max(self.crop_width - new_width, 0) - if pad_bottom != 0 or pad_right != 0: - img, target = self._pad(img, target, [0, 0, pad_right, pad_bottom]) - - return img, target - - -class RandomShortestSize(nn.Module): - def __init__( - self, - min_size: Union[List[int], Tuple[int], int], - max_size: int, - interpolation: InterpolationMode = InterpolationMode.BILINEAR, - ): - super().__init__() - self.min_size = [min_size] if isinstance(min_size, int) else list(min_size) - self.max_size = max_size - self.interpolation = interpolation - - def forward( - self, image: Tensor, target: Optional[Dict[str, Tensor]] = None - ) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]: - _, orig_height, orig_width = F.get_dimensions(image) - - min_size = self.min_size[torch.randint(len(self.min_size), (1,)).item()] - r = min(min_size / min(orig_height, orig_width), self.max_size / max(orig_height, orig_width)) - - new_width = int(orig_width * r) - new_height = int(orig_height * r) - - image = F.resize(image, [new_height, new_width], interpolation=self.interpolation) - - if target is not None: - target["boxes"][:, 0::2] *= new_width / orig_width - target["boxes"][:, 1::2] *= new_height / orig_height - if "masks" in target: - target["masks"] = F.resize( - target["masks"], [new_height, new_width], interpolation=InterpolationMode.NEAREST - ) - - return image, target - - -def _copy_paste( - image: torch.Tensor, - target: Dict[str, Tensor], - paste_image: torch.Tensor, - paste_target: Dict[str, Tensor], - blending: bool = True, - resize_interpolation: F.InterpolationMode = F.InterpolationMode.BILINEAR, -) 
-> Tuple[torch.Tensor, Dict[str, Tensor]]: - - # Random paste targets selection: - num_masks = len(paste_target["masks"]) - - if num_masks < 1: - # Such degerante case with num_masks=0 can happen with LSJ - # Let's just return (image, target) - return image, target - - # We have to please torch script by explicitly specifying dtype as torch.long - random_selection = torch.randint(0, num_masks, (num_masks,), device=paste_image.device) - random_selection = torch.unique(random_selection).to(torch.long) - - paste_masks = paste_target["masks"][random_selection] - paste_boxes = paste_target["boxes"][random_selection] - paste_labels = paste_target["labels"][random_selection] - - masks = target["masks"] - - # We resize source and paste data if they have different sizes - # This is something we introduced here as originally the algorithm works - # on equal-sized data (for example, coming from LSJ data augmentations) - size1 = image.shape[-2:] - size2 = paste_image.shape[-2:] - if size1 != size2: - paste_image = F.resize(paste_image, size1, interpolation=resize_interpolation) - paste_masks = F.resize(paste_masks, size1, interpolation=F.InterpolationMode.NEAREST) - # resize bboxes: - ratios = torch.tensor((size1[1] / size2[1], size1[0] / size2[0]), device=paste_boxes.device) - paste_boxes = paste_boxes.view(-1, 2, 2).mul(ratios).view(paste_boxes.shape) - - paste_alpha_mask = paste_masks.sum(dim=0) > 0 - - if blending: - paste_alpha_mask = F.gaussian_blur( - paste_alpha_mask.unsqueeze(0), - kernel_size=(5, 5), - sigma=[ - 2.0, - ], - ) - - # Copy-paste images: - image = (image * (~paste_alpha_mask)) + (paste_image * paste_alpha_mask) - - # Copy-paste masks: - masks = masks * (~paste_alpha_mask) - non_all_zero_masks = masks.sum((-1, -2)) > 0 - masks = masks[non_all_zero_masks] - - # Do a shallow copy of the target dict - out_target = {k: v for k, v in target.items()} - - out_target["masks"] = torch.cat([masks, paste_masks]) - - # Copy-paste boxes and labels - boxes = ops.masks_to_boxes(masks) - out_target["boxes"] = torch.cat([boxes, paste_boxes]) - - labels = target["labels"][non_all_zero_masks] - out_target["labels"] = torch.cat([labels, paste_labels]) - - # Update additional optional keys: area and iscrowd if exist - if "area" in target: - out_target["area"] = out_target["masks"].sum((-1, -2)).to(torch.float32) - - if "iscrowd" in target and "iscrowd" in paste_target: - # target['iscrowd'] size can be differ from mask size (non_all_zero_masks) - # For example, if previous transforms geometrically modifies masks/boxes/labels but - # does not update "iscrowd" - if len(target["iscrowd"]) == len(non_all_zero_masks): - iscrowd = target["iscrowd"][non_all_zero_masks] - paste_iscrowd = paste_target["iscrowd"][random_selection] - out_target["iscrowd"] = torch.cat([iscrowd, paste_iscrowd]) - - # Check for degenerated boxes and remove them - boxes = out_target["boxes"] - degenerate_boxes = boxes[:, 2:] <= boxes[:, :2] - if degenerate_boxes.any(): - valid_targets = ~degenerate_boxes.any(dim=1) - - out_target["boxes"] = boxes[valid_targets] - out_target["masks"] = out_target["masks"][valid_targets] - out_target["labels"] = out_target["labels"][valid_targets] - - if "area" in out_target: - out_target["area"] = out_target["area"][valid_targets] - if "iscrowd" in out_target and len(out_target["iscrowd"]) == len(valid_targets): - out_target["iscrowd"] = out_target["iscrowd"][valid_targets] - - return image, out_target - - -class SimpleCopyPaste(torch.nn.Module): - def __init__(self, blending=True, 
resize_interpolation=F.InterpolationMode.BILINEAR): - super().__init__() - self.resize_interpolation = resize_interpolation - self.blending = blending - - def forward( - self, images: List[torch.Tensor], targets: List[Dict[str, Tensor]] - ) -> Tuple[List[torch.Tensor], List[Dict[str, Tensor]]]: - torch._assert( - isinstance(images, (list, tuple)) and all([isinstance(v, torch.Tensor) for v in images]), - "images should be a list of tensors", - ) - torch._assert( - isinstance(targets, (list, tuple)) and len(images) == len(targets), - "targets should be a list of the same size as images", - ) - for target in targets: - # Can not check for instance type dict with inside torch.jit.script - # torch._assert(isinstance(target, dict), "targets item should be a dict") - for k in ["masks", "boxes", "labels"]: - torch._assert(k in target, f"Key {k} should be present in targets") - torch._assert(isinstance(target[k], torch.Tensor), f"Value for the key {k} should be a tensor") - - # images = [t1, t2, ..., tN] - # Let's define paste_images as shifted list of input images - # paste_images = [t2, t3, ..., tN, t1] - # FYI: in TF they mix data on the dataset level - images_rolled = images[-1:] + images[:-1] - targets_rolled = targets[-1:] + targets[:-1] - - output_images: List[torch.Tensor] = [] - output_targets: List[Dict[str, Tensor]] = [] - - for image, target, paste_image, paste_target in zip(images, targets, images_rolled, targets_rolled): - output_image, output_data = _copy_paste( - image, - target, - paste_image, - paste_target, - blending=self.blending, - resize_interpolation=self.resize_interpolation, - ) - output_images.append(output_image) - output_targets.append(output_data) - - return output_images, output_targets - - def __repr__(self) -> str: - s = f"{self.__class__.__name__}(blending={self.blending}, resize_interpolation={self.resize_interpolation})" - return s diff --git a/integrations/pytorch_ddp/test/torchvision/utils.py b/integrations/pytorch_ddp/test/torchvision/utils.py deleted file mode 100644 index f7391558..00000000 --- a/integrations/pytorch_ddp/test/torchvision/utils.py +++ /dev/null @@ -1,282 +0,0 @@ -import datetime -import errno -import os -import time -from collections import defaultdict, deque - -import torch -import torch.distributed as dist - - -class SmoothedValue: - """Track a series of values and provide access to smoothed values over a - window or the global series average. - """ - - def __init__(self, window_size=20, fmt=None): - if fmt is None: - fmt = "{median:.4f} ({global_avg:.4f})" - self.deque = deque(maxlen=window_size) - self.total = 0.0 - self.count = 0 - self.fmt = fmt - - def update(self, value, n=1): - self.deque.append(value) - self.count += n - self.total += value * n - - def synchronize_between_processes(self): - """ - Warning: does not synchronize the deque! 
- """ - if not is_dist_avail_and_initialized(): - return - t = torch.tensor([self.count, self.total], dtype=torch.float64, device="cuda") - dist.barrier() - dist.all_reduce(t) - t = t.tolist() - self.count = int(t[0]) - self.total = t[1] - - @property - def median(self): - d = torch.tensor(list(self.deque)) - return d.median().item() - - @property - def avg(self): - d = torch.tensor(list(self.deque), dtype=torch.float32) - return d.mean().item() - - @property - def global_avg(self): - return self.total / self.count - - @property - def max(self): - return max(self.deque) - - @property - def value(self): - return self.deque[-1] - - def __str__(self): - return self.fmt.format( - median=self.median, avg=self.avg, global_avg=self.global_avg, max=self.max, value=self.value - ) - - -def all_gather(data): - """ - Run all_gather on arbitrary picklable data (not necessarily tensors) - Args: - data: any picklable object - Returns: - list[data]: list of data gathered from each rank - """ - world_size = get_world_size() - if world_size == 1: - return [data] - data_list = [None] * world_size - dist.all_gather_object(data_list, data) - return data_list - - -def reduce_dict(input_dict, average=True): - """ - Args: - input_dict (dict): all the values will be reduced - average (bool): whether to do average or sum - Reduce the values in the dictionary from all processes so that all processes - have the averaged results. Returns a dict with the same fields as - input_dict, after reduction. - """ - world_size = get_world_size() - if world_size < 2: - return input_dict - with torch.inference_mode(): - names = [] - values = [] - # sort the keys so that they are consistent across processes - for k in sorted(input_dict.keys()): - names.append(k) - values.append(input_dict[k]) - values = torch.stack(values, dim=0) - dist.all_reduce(values) - if average: - values /= world_size - reduced_dict = {k: v for k, v in zip(names, values)} - return reduced_dict - - -class MetricLogger: - def __init__(self, delimiter="\t"): - self.meters = defaultdict(SmoothedValue) - self.delimiter = delimiter - - def update(self, **kwargs): - for k, v in kwargs.items(): - if isinstance(v, torch.Tensor): - v = v.item() - assert isinstance(v, (float, int)) - self.meters[k].update(v) - - def __getattr__(self, attr): - if attr in self.meters: - return self.meters[attr] - if attr in self.__dict__: - return self.__dict__[attr] - raise AttributeError(f"'{type(self).__name__}' object has no attribute '{attr}'") - - def __str__(self): - loss_str = [] - for name, meter in self.meters.items(): - loss_str.append(f"{name}: {str(meter)}") - return self.delimiter.join(loss_str) - - def synchronize_between_processes(self): - for meter in self.meters.values(): - meter.synchronize_between_processes() - - def add_meter(self, name, meter): - self.meters[name] = meter - - def log_every(self, iterable, print_freq, header=None): - i = 0 - if not header: - header = "" - start_time = time.time() - end = time.time() - iter_time = SmoothedValue(fmt="{avg:.4f}") - data_time = SmoothedValue(fmt="{avg:.4f}") - space_fmt = ":" + str(len(str(len(iterable)))) + "d" - if torch.cuda.is_available(): - log_msg = self.delimiter.join( - [ - header, - "[{0" + space_fmt + "}/{1}]", - "eta: {eta}", - "{meters}", - "time: {time}", - "data: {data}", - "max mem: {memory:.0f}", - ] - ) - else: - log_msg = self.delimiter.join( - [header, "[{0" + space_fmt + "}/{1}]", "eta: {eta}", "{meters}", "time: {time}", "data: {data}"] - ) - MB = 1024.0 * 1024.0 - for obj in iterable: - 
data_time.update(time.time() - end) - yield obj - iter_time.update(time.time() - end) - if i % print_freq == 0 or i == len(iterable) - 1: - eta_seconds = iter_time.global_avg * (len(iterable) - i) - eta_string = str(datetime.timedelta(seconds=int(eta_seconds))) - if torch.cuda.is_available(): - print( - log_msg.format( - i, - len(iterable), - eta=eta_string, - meters=str(self), - time=str(iter_time), - data=str(data_time), - memory=torch.cuda.max_memory_allocated() / MB, - ) - ) - else: - print( - log_msg.format( - i, len(iterable), eta=eta_string, meters=str(self), time=str(iter_time), data=str(data_time) - ) - ) - i += 1 - end = time.time() - total_time = time.time() - start_time - total_time_str = str(datetime.timedelta(seconds=int(total_time))) - print(f"{header} Total time: {total_time_str} ({total_time / len(iterable):.4f} s / it)") - - -def collate_fn(batch): - return tuple(zip(*batch)) - - -def mkdir(path): - try: - os.makedirs(path) - except OSError as e: - if e.errno != errno.EEXIST: - raise - - -def setup_for_distributed(is_master): - """ - This function disables printing when not in master process - """ - import builtins as __builtin__ - - builtin_print = __builtin__.print - - def print(*args, **kwargs): - force = kwargs.pop("force", False) - if is_master or force: - builtin_print(*args, **kwargs) - - __builtin__.print = print - - -def is_dist_avail_and_initialized(): - if not dist.is_available(): - return False - if not dist.is_initialized(): - return False - return True - - -def get_world_size(): - if not is_dist_avail_and_initialized(): - return 1 - return dist.get_world_size() - - -def get_rank(): - if not is_dist_avail_and_initialized(): - return 0 - return dist.get_rank() - - -def is_main_process(): - return get_rank() == 0 - - -def save_on_master(*args, **kwargs): - if is_main_process(): - torch.save(*args, **kwargs) - - -def init_distributed_mode(args): - if "RANK" in os.environ and "WORLD_SIZE" in os.environ: - args.rank = int(os.environ["RANK"]) - args.world_size = int(os.environ["WORLD_SIZE"]) - args.gpu = int(os.environ["LOCAL_RANK"]) - elif "SLURM_PROCID" in os.environ: - args.rank = int(os.environ["SLURM_PROCID"]) - args.gpu = args.rank % torch.cuda.device_count() - else: - print("Not using distributed mode") - args.distributed = False - return - - args.distributed = True - - torch.cuda.set_device(args.gpu) - args.dist_backend = "nccl" - print(f"| distributed init (rank {args.rank}): {args.dist_url}", flush=True) - torch.distributed.init_process_group( - backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size, rank=args.rank - ) - torch.distributed.barrier() - setup_for_distributed(args.rank == 0) diff --git a/integrations/pytorch_ddp/test/torchvision/utils.py.1 b/integrations/pytorch_ddp/test/torchvision/utils.py.1 deleted file mode 100644 index f7391558..00000000 --- a/integrations/pytorch_ddp/test/torchvision/utils.py.1 +++ /dev/null @@ -1,282 +0,0 @@ -import datetime -import errno -import os -import time -from collections import defaultdict, deque - -import torch -import torch.distributed as dist - - -class SmoothedValue: - """Track a series of values and provide access to smoothed values over a - window or the global series average. 
- """ - - def __init__(self, window_size=20, fmt=None): - if fmt is None: - fmt = "{median:.4f} ({global_avg:.4f})" - self.deque = deque(maxlen=window_size) - self.total = 0.0 - self.count = 0 - self.fmt = fmt - - def update(self, value, n=1): - self.deque.append(value) - self.count += n - self.total += value * n - - def synchronize_between_processes(self): - """ - Warning: does not synchronize the deque! - """ - if not is_dist_avail_and_initialized(): - return - t = torch.tensor([self.count, self.total], dtype=torch.float64, device="cuda") - dist.barrier() - dist.all_reduce(t) - t = t.tolist() - self.count = int(t[0]) - self.total = t[1] - - @property - def median(self): - d = torch.tensor(list(self.deque)) - return d.median().item() - - @property - def avg(self): - d = torch.tensor(list(self.deque), dtype=torch.float32) - return d.mean().item() - - @property - def global_avg(self): - return self.total / self.count - - @property - def max(self): - return max(self.deque) - - @property - def value(self): - return self.deque[-1] - - def __str__(self): - return self.fmt.format( - median=self.median, avg=self.avg, global_avg=self.global_avg, max=self.max, value=self.value - ) - - -def all_gather(data): - """ - Run all_gather on arbitrary picklable data (not necessarily tensors) - Args: - data: any picklable object - Returns: - list[data]: list of data gathered from each rank - """ - world_size = get_world_size() - if world_size == 1: - return [data] - data_list = [None] * world_size - dist.all_gather_object(data_list, data) - return data_list - - -def reduce_dict(input_dict, average=True): - """ - Args: - input_dict (dict): all the values will be reduced - average (bool): whether to do average or sum - Reduce the values in the dictionary from all processes so that all processes - have the averaged results. Returns a dict with the same fields as - input_dict, after reduction. 
- """ - world_size = get_world_size() - if world_size < 2: - return input_dict - with torch.inference_mode(): - names = [] - values = [] - # sort the keys so that they are consistent across processes - for k in sorted(input_dict.keys()): - names.append(k) - values.append(input_dict[k]) - values = torch.stack(values, dim=0) - dist.all_reduce(values) - if average: - values /= world_size - reduced_dict = {k: v for k, v in zip(names, values)} - return reduced_dict - - -class MetricLogger: - def __init__(self, delimiter="\t"): - self.meters = defaultdict(SmoothedValue) - self.delimiter = delimiter - - def update(self, **kwargs): - for k, v in kwargs.items(): - if isinstance(v, torch.Tensor): - v = v.item() - assert isinstance(v, (float, int)) - self.meters[k].update(v) - - def __getattr__(self, attr): - if attr in self.meters: - return self.meters[attr] - if attr in self.__dict__: - return self.__dict__[attr] - raise AttributeError(f"'{type(self).__name__}' object has no attribute '{attr}'") - - def __str__(self): - loss_str = [] - for name, meter in self.meters.items(): - loss_str.append(f"{name}: {str(meter)}") - return self.delimiter.join(loss_str) - - def synchronize_between_processes(self): - for meter in self.meters.values(): - meter.synchronize_between_processes() - - def add_meter(self, name, meter): - self.meters[name] = meter - - def log_every(self, iterable, print_freq, header=None): - i = 0 - if not header: - header = "" - start_time = time.time() - end = time.time() - iter_time = SmoothedValue(fmt="{avg:.4f}") - data_time = SmoothedValue(fmt="{avg:.4f}") - space_fmt = ":" + str(len(str(len(iterable)))) + "d" - if torch.cuda.is_available(): - log_msg = self.delimiter.join( - [ - header, - "[{0" + space_fmt + "}/{1}]", - "eta: {eta}", - "{meters}", - "time: {time}", - "data: {data}", - "max mem: {memory:.0f}", - ] - ) - else: - log_msg = self.delimiter.join( - [header, "[{0" + space_fmt + "}/{1}]", "eta: {eta}", "{meters}", "time: {time}", "data: {data}"] - ) - MB = 1024.0 * 1024.0 - for obj in iterable: - data_time.update(time.time() - end) - yield obj - iter_time.update(time.time() - end) - if i % print_freq == 0 or i == len(iterable) - 1: - eta_seconds = iter_time.global_avg * (len(iterable) - i) - eta_string = str(datetime.timedelta(seconds=int(eta_seconds))) - if torch.cuda.is_available(): - print( - log_msg.format( - i, - len(iterable), - eta=eta_string, - meters=str(self), - time=str(iter_time), - data=str(data_time), - memory=torch.cuda.max_memory_allocated() / MB, - ) - ) - else: - print( - log_msg.format( - i, len(iterable), eta=eta_string, meters=str(self), time=str(iter_time), data=str(data_time) - ) - ) - i += 1 - end = time.time() - total_time = time.time() - start_time - total_time_str = str(datetime.timedelta(seconds=int(total_time))) - print(f"{header} Total time: {total_time_str} ({total_time / len(iterable):.4f} s / it)") - - -def collate_fn(batch): - return tuple(zip(*batch)) - - -def mkdir(path): - try: - os.makedirs(path) - except OSError as e: - if e.errno != errno.EEXIST: - raise - - -def setup_for_distributed(is_master): - """ - This function disables printing when not in master process - """ - import builtins as __builtin__ - - builtin_print = __builtin__.print - - def print(*args, **kwargs): - force = kwargs.pop("force", False) - if is_master or force: - builtin_print(*args, **kwargs) - - __builtin__.print = print - - -def is_dist_avail_and_initialized(): - if not dist.is_available(): - return False - if not dist.is_initialized(): - return False - return 
True - - -def get_world_size(): - if not is_dist_avail_and_initialized(): - return 1 - return dist.get_world_size() - - -def get_rank(): - if not is_dist_avail_and_initialized(): - return 0 - return dist.get_rank() - - -def is_main_process(): - return get_rank() == 0 - - -def save_on_master(*args, **kwargs): - if is_main_process(): - torch.save(*args, **kwargs) - - -def init_distributed_mode(args): - if "RANK" in os.environ and "WORLD_SIZE" in os.environ: - args.rank = int(os.environ["RANK"]) - args.world_size = int(os.environ["WORLD_SIZE"]) - args.gpu = int(os.environ["LOCAL_RANK"]) - elif "SLURM_PROCID" in os.environ: - args.rank = int(os.environ["SLURM_PROCID"]) - args.gpu = args.rank % torch.cuda.device_count() - else: - print("Not using distributed mode") - args.distributed = False - return - - args.distributed = True - - torch.cuda.set_device(args.gpu) - args.dist_backend = "nccl" - print(f"| distributed init (rank {args.rank}): {args.dist_url}", flush=True) - torch.distributed.init_process_group( - backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size, rank=args.rank - ) - torch.distributed.barrier() - setup_for_distributed(args.rank == 0) From 1c422db526d6c40c72d54947213566359d4671fb Mon Sep 17 00:00:00 2001 From: Laurent Wirz Date: Sun, 21 Jul 2024 17:05:21 +0200 Subject: [PATCH 36/64] Added model saving to the resnet50 test --- .../pytorch_ddp/test/test-imagenet.py | 24 +++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/integrations/pytorch_ddp/test/test-imagenet.py b/integrations/pytorch_ddp/test/test-imagenet.py index 3fdd1dbc..adcd5aab 100644 --- a/integrations/pytorch_ddp/test/test-imagenet.py +++ b/integrations/pytorch_ddp/test/test-imagenet.py @@ -36,6 +36,10 @@ logger.setLevel(logging.WARNING) # Run via ACCL +global best_model_params_path + +best_model_params_path = './best_model_params.pt' + class CNN(nn.Module): def __init__(self): @@ -69,9 +73,9 @@ def forward(self, x): def train(model, criterion, optimizer, scheduler, num_epochs=25): since = time.time() + global rank # Create a temporary directory to save training checkpoints with TemporaryDirectory() as tempdir: - best_model_params_path = os.path.join(tempdir, 'best_model_params.pt') torch.save(model.state_dict(), best_model_params_path) best_acc = 0.0 @@ -91,6 +95,8 @@ def train(model, criterion, optimizer, scheduler, num_epochs=25): running_corrects = 0 # Iterate over data. 
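A minimal sketch of the checkpoint cadence the lines added below introduce, pulled out of the diff for readability. `model`, `rank`, `dataloaders`, and `best_model_params_path` are the test script's own names; `step_fn` is a hypothetical stand-in for the forward/backward/optimizer code and is invented here.

```
import torch

def run_phase(model, dataloader, rank, best_model_params_path, step_fn):
    count = 0
    for inputs, labels in dataloader:
        step_fn(inputs, labels)  # forward/backward/optimizer, as in the diff
        # Every fifth batch, rank 0 snapshots the (possibly DDP-wrapped)
        # model so a crash mid-epoch does not lose the whole run.
        if count % 5 == 0 and rank == 0:
            torch.save(model.state_dict(), best_model_params_path)
        count += 1
```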
+ count = 0 + for inputs, labels in dataloaders[phase]: inputs = inputs.to(device) labels = labels.to(device) @@ -113,6 +119,16 @@ def train(model, criterion, optimizer, scheduler, num_epochs=25): # statistics running_loss += loss.item() * inputs.size(0) running_corrects += torch.sum(preds == labels.data) + + print(f'{phase} RunningLoss: {running_loss:.4f}') + logger.debug(f'{phase} RunningLoss: {running_loss:.4f}') + + if count % 5 == 0 and rank == 0: + print("saving model to " + best_model_params_path) + torch.save(model.state_dict(), best_model_params_path) + + count += 1 + if phase == 'train': scheduler.step() @@ -120,7 +136,7 @@ def train(model, criterion, optimizer, scheduler, num_epochs=25): epoch_acc = running_corrects.double() / dataset_sizes[phase] print(f'{phase} Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f}') - + logger.debug(f'{phase} Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f}') # deep copy the model if phase == 'val' and epoch_acc > best_acc: best_acc = epoch_acc @@ -269,6 +285,10 @@ def train(model, criterion, optimizer, scheduler, num_epochs=25): loss_func = nn.CrossEntropyLoss() + best_model_params_path = './best_model_params.pt' + + # model_ft.load_state_dict(torch.load(best_model_params_path)) + model_ft = train(model_ft, criterion, optimizer_ft, exp_lr_scheduler, num_epochs=25) From b9b2410d9af91bf6be85e76cb40df6317afa958c Mon Sep 17 00:00:00 2001 From: Laurent Wirz Date: Mon, 29 Jul 2024 14:07:38 +0200 Subject: [PATCH 37/64] Switch to network_utils initialization fixes error with uneven counts --- integrations/pytorch_ddp/src/ProcessGroupACCL.cpp | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp index 2656bfb3..2223c11d 100644 --- a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp +++ b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp @@ -863,11 +863,11 @@ ProcessGroupACCL::ProcessGroupACCL( if (coyote_enabled) { if (design_ == accl_network_utils::acclDesign::CYT_TCP) { cyt_device = new ACCL::CoyoteDevice(); + accl_network_utils::configure_cyt_tcp(ranks_, rank_, cyt_device); } else if (design_ == accl_network_utils::acclDesign::CYT_RDMA) { ACCL::debug("Creating CoyoteDevice"); cyt_device = new ACCL::CoyoteDevice(size_); - ACCL::debug("Starting QP-exchange"); - cyt::setup_cyt_rdma(ibvQpConn_vec, ranks_, rank_, *cyt_device); + accl_network_utils::configure_cyt_rdma(ranks_, rank_, cyt_device); } else { throw std::runtime_error("Undefined ACCL design"); } @@ -909,11 +909,6 @@ void ProcessGroupACCL::initialize() { } if (coyote_enabled && !simulator_) { - if (design_ == accl_network_utils::acclDesign::CYT_RDMA) { - cyt::configure_cyt_rdma(ibvQpConn_vec, ranks_, rank_); - } else { - throw std::runtime_error("Coyote configure not implemented"); - } accl = std::make_unique(cyt_device); global_accl = &accl; @@ -929,13 +924,12 @@ void ProcessGroupACCL::initialize() { accl.get()->initialize(ranks_, rank_, size_+2, bufsize, segsize, 4096*1024*2); } else { std::cout<<"Rendezvous Protocol"<initialize(ranks_, rank_, size_, 64, 64, segsize); + accl.get()->initialize(ranks_, rank_, 16, 1024, RDVZ_THRESHOLD); } ACCL::debug(std::string("[ACCL coyote] communicator: ") + accl->dump_communicator()); - in_buf = accl->create_coyotebuffer(bufsize/sizeof(float), ACCL::dataType::float32); - out_buf = accl->create_coyotebuffer(bufsize/sizeof(float), ACCL::dataType::float32); + } else { ACCL::debug(std::string("Performing standard initialization")); From 
736c5820eeac5d5e7d0fe1d9da89a868c5a3e2a7 Mon Sep 17 00:00:00 2001
From: Laurent Wirz
Date: Mon, 29 Jul 2024 14:10:03 +0200
Subject: [PATCH 38/64] Enabled sidestepping of eager allreduce

---
 .../pytorch_ddp/src/ProcessGroupACCL.cpp      | 53 ++++++++++---------
 1 file changed, 29 insertions(+), 24 deletions(-)

diff --git a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp
index 2223c11d..3c7e0808 100644
--- a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp
+++ b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp
@@ -54,6 +54,10 @@ namespace c10d {
 // #define GATHER_SIDESTEP
 // #define ALLGATHER_SIDESTEP
 // #define ALLREDUCE_SIDESTEP
+#define ALLREDUCE_SIDESTEP false
+// #define ALLREDUCE_SIDESTEP true
+
+#define RDVZ_THRESHOLD 64

 // Used in sidestepping
 #define MPI_CHECK(cmd)                                                         \
@@ -1159,30 +1163,31 @@ ProcessGroupACCL::allreduce(std::vector<at::Tensor> &tensors,
   std::function<void(std::unique_ptr<WorkEntry> &)> runFunc =
       [opts, this](std::unique_ptr<WorkEntry> &entry) {
-        #ifdef ALLREDUCE_SIDESTEP
-        auto data = (entry->src)[0];
-        c10::DeviceGuard guard(data.device());
-        std::unique_lock<std::mutex> globalLock(pgGlobalMutex_);
-        MPI_CHECK(MPI_Allreduce(
-            MPI_IN_PLACE,
-            data.data_ptr(),
-            data.numel(),
-            mpiDatatype.at(data.scalar_type()),
-            mpiOp.at(opts.reduceOp),
-            MPI_COMM_WORLD));
-        #else
-        auto tensor = (entry->src)[0];
-        // Segment data if necessary
-        if (tensor.nbytes() > bufsize) {
-          size_t n = bufsize / tensor.itemsize();
-          for (size_t i = 0; i < tensor.numel(); i += n) {
-            size_t end = std::min(i + n, static_cast<size_t>(tensor.numel()));
-            run_allreduce(tensor.slice(0, i, end), opts);
-          }
-        } else {
-          run_allreduce(tensor, opts);
-        }
-        #endif
+        // sidestep eager allreduce
+        if (((entry->src)[0]).nbytes() <= RDVZ_THRESHOLD || ALLREDUCE_SIDESTEP){
+          auto data = (entry->src)[0];
+          c10::DeviceGuard guard(data.device());
+          std::unique_lock<std::mutex> globalLock(pgGlobalMutex_);
+          MPI_CHECK(MPI_Allreduce(
+              MPI_IN_PLACE,
+              data.data_ptr(),
+              data.numel(),
+              mpiDatatype.at(data.scalar_type()),
+              mpiOp.at(opts.reduceOp),
+              MPI_COMM_WORLD));
+        } else {
+          auto tensor = (entry->src)[0];
+          // Segment data if necessary
+          if (tensor.nbytes() > bufsize) {
+            size_t n = bufsize / tensor.itemsize();
+            for (size_t i = 0; i < tensor.numel(); i += n) {
+              size_t end = std::min(i + n, static_cast<size_t>(tensor.numel()));
+              run_allreduce(tensor.slice(0, i, end), opts);
+            }
+          } else {
+            run_allreduce(tensor, opts);
+          }
+        }
       };
   auto entry =
       std::make_unique<WorkEntry>(&tensors, &tensors, std::move(runFunc));

From b3eb6cbfec7526a43ae409c951fc889e0cf2d300 Mon Sep 17 00:00:00 2001
From: Laurent Wirz
Date: Mon, 29 Jul 2024 14:17:49 +0200
Subject: [PATCH 39/64] Adapted PG to simplified ACCL interface

---
 .../pytorch_ddp/src/ProcessGroupACCL.cpp      | 66 ++++++++++---------
 integrations/pytorch_ddp/test/test-generic.py | 64 ++++++++++--------
 .../pytorch_ddp/test/test-imagenet.py         |  8 ++-
 integrations/pytorch_ddp/test/test-mnist.py   |  7 +-
 4 files changed, 82 insertions(+), 63 deletions(-)

diff --git a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp
index 3c7e0808..fe66072c 100644
--- a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp
+++ b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp
@@ -49,11 +49,11 @@ namespace c10d {

 // Toggles to run Collectives via OpenMPI instead(To sidestep any issues with them in ACCL)
 // The sidestep-code is copied from the ProcessGroupMPI
-// #define BROADCAST_SIDESTEP
-// #define SCATTER_SIDESTEP
-// #define GATHER_SIDESTEP
-// #define ALLGATHER_SIDESTEP
-// #define ALLREDUCE_SIDESTEP
+#define BROADCAST_SIDESTEP +#define SCATTER_SIDESTEP +#define GATHER_SIDESTEP +#define ALLGATHER_SIDESTEP + #define ALLREDUCE_SIDESTEP false // #define ALLREDUCE_SIDESTEP true @@ -133,10 +133,6 @@ std::map mpiDatatype = { #define POST_REQUEST(opname, n_bytes) \ double durationUs = 0.0; \ ACCL::debug("Waiting for request to complete."); \ -bool ret = accl->wait(req, 20000ms); \ -if(ret == false){ \ - ACCL::debug("!!!!!!! Timeout !!!!!!!"); \ -} \ if(coyote_enabled){ \ auto end = std::chrono::high_resolution_clock::now(); \ durationUs = (std::chrono::duration_cast(end-start).count() / 1000.0); \ @@ -712,7 +708,9 @@ void ProcessGroupACCL::init_input_tensor(at::Tensor &tensor, std::unique_ptrsync_to_device(); + if (!coyote_enabled) { + data->sync_to_device(); + } } // don't sync if no rank initializes, we will fill content and sync later @@ -947,17 +945,19 @@ void ProcessGroupACCL::initialize() { int devicemem = accl->devicemem(); - in_buf = accl->create_buffer(bufsize/sizeof(float), ACCL::dataType::float32); - out_buf = accl->create_buffer(bufsize/sizeof(float), ACCL::dataType::float32); - // Not sure if this is needed: // Initialize cache buffers - if (!simulator_){ - buf0 = xrt::bo(xrt_device, bufsize, devicemem); - buf1 = xrt::bo(xrt_device, bufsize, devicemem); - } + // if (!simulator_){ + // buf0 = xrt::bo(xrt_device, bufsize, devicemem); + // buf1 = xrt::bo(xrt_device, bufsize, devicemem); + // } + } + + in_buf = accl->create_buffer_host(bufsize/sizeof(float), ACCL::dataType::float32); + out_buf = accl->create_buffer_host(bufsize/sizeof(float), ACCL::dataType::float32); + accl->set_timeout(1e8); // Start the worker thread accepting ACCL calls workerThread_ = std::thread(&ProcessGroupACCL::runLoop, this); @@ -1070,11 +1070,11 @@ void ProcessGroupACCL::run_broadcast(at::Tensor in_tensor, PRE_REQUEST(Broadcast,in_tensor) - ACCL::ACCLRequest* req = accl->bcast(*in_buf, in_tensor.numel(), opts.rootRank, ACCL::GLOBAL_COMM, true, true, get_compressed_type(in_tensor.scalar_type())); + accl->bcast(*in_buf, in_tensor.numel(), opts.rootRank); POST_REQUEST("bcast", in_tensor.nbytes()) - in_buf->sync_from_device(); + // in_buf->sync_from_device(); // for(int i = 0; ibyte_array())[i])); // } @@ -1148,8 +1148,14 @@ void ProcessGroupACCL::run_allreduce(at::Tensor in_tensor, std::unique_lock globalLock(pgGlobalMutex_); PRE_REQUEST(Allreduce,in_tensor) + + + // It seems to have issues with non-even numbers, so we round to 256 + // int rounded_count = (in_tensor.numel() + 255) & ~255; + // int rounded_count = 32768; + - ACCL::ACCLRequest* req = accl->allreduce(*in_buf, *out_buf, in_tensor.numel(), acclOp.at(opts.reduceOp), ACCL::GLOBAL_COMM, true, true, get_compressed_type(in_tensor.scalar_type())); + accl->allreduce(*in_buf, *out_buf, in_tensor.numel(), acclOp.at(opts.reduceOp)); POST_REQUEST("allreduce", in_tensor.nbytes()) @@ -1213,7 +1219,7 @@ void ProcessGroupACCL::run_reduce(at::Tensor in_tensor, PRE_REQUEST(Reduce,in_tensor) - ACCL::ACCLRequest* req = accl->reduce(*in_buf, *out_buf, in_tensor.numel(), opts.rootRank, acclOp.at(opts.reduceOp), ACCL::GLOBAL_COMM, true, true, get_compressed_type(in_tensor.scalar_type())); + accl->reduce(*in_buf, *out_buf, in_tensor.numel(), opts.rootRank, acclOp.at(opts.reduceOp)); POST_REQUEST("reduce", in_tensor.nbytes()) @@ -1261,8 +1267,7 @@ void ProcessGroupACCL::run_allgather( PRE_REQUEST(Allgather,in_tensor) - ACCL::ACCLRequest* req = accl->allgather(*in_buf, *out_buf, in_tensor.numel(), ACCL::GLOBAL_COMM, - true, true, 
get_compressed_type(in_tensor.scalar_type())); + accl->allgather(*in_buf, *out_buf, in_tensor.numel()); POST_REQUEST("allgather", in_tensor.nbytes()) @@ -1358,9 +1363,7 @@ void ProcessGroupACCL::run_gather(at::Tensor in_tensor, PRE_REQUEST(Gather, in_tensor) - ACCL::ACCLRequest* req = accl->gather(*in_buf, *out_buf, in_tensor.numel(), opts.rootRank, - ACCL::GLOBAL_COMM, true, true, - get_compressed_type(in_tensor.scalar_type())); + accl->gather(*in_buf, *out_buf, in_tensor.numel(), opts.rootRank); POST_REQUEST("gather", in_tensor.nbytes()) @@ -1476,7 +1479,7 @@ void ProcessGroupACCL::run_scatter(std::vector &in_tensor_vec, PRE_REQUEST(Scatter, dsttensor) // Run scatter - ACCL::ACCLRequest* req = accl->scatter(*in_buf, *out_buf, out_tensor.numel(), opts.rootRank, ACCL::GLOBAL_COMM, true, true, get_compressed_type(dsttensor.scalar_type())); + accl->scatter(*in_buf, *out_buf, out_tensor.numel(), opts.rootRank); POST_REQUEST("scatter", out_tensor.nbytes()) @@ -1597,7 +1600,7 @@ void ProcessGroupACCL::run_alltoall(at::Tensor in_tensor, PRE_REQUEST(AlltoAll, in_tensor) - ACCL::ACCLRequest* req = accl->alltoall(*in_buf, *out_buf, in_tensor.numel()/size_, ACCL::GLOBAL_COMM, true, true, get_compressed_type(in_tensor.scalar_type())); + accl->alltoall(*in_buf, *out_buf, in_tensor.numel()/size_); POST_REQUEST("alltoall", in_tensor.nbytes()/size_) @@ -1621,7 +1624,7 @@ void ProcessGroupACCL::run_alltoall_vec(std::vector &in_tensor_vec, PRE_REQUEST(AlltoAll, in_tensor_vec[0]) - ACCL::ACCLRequest* req = accl->alltoall(*in_buf, *out_buf, in_tensor_vec[0].numel(), ACCL::GLOBAL_COMM, true, true, get_compressed_type(in_tensor_vec[0].scalar_type())); + accl->alltoall(*in_buf, *out_buf, in_tensor_vec[0].numel()); POST_REQUEST("alltoall", in_tensor_vec[0].nbytes()) @@ -1717,8 +1720,7 @@ void ProcessGroupACCL::run_send(at::Tensor in_tensor, int dstRank, PRE_REQUEST(Send,in_tensor) - ACCL::ACCLRequest* req = accl->send(*in_buf, in_tensor.numel(), dstRank, tag, ACCL::GLOBAL_COMM, true, - get_compressed_type(in_tensor.scalar_type())); + ACCL::ACCLRequest* req = accl->send(*in_buf, in_tensor.numel(), dstRank, tag); POST_REQUEST("send", in_tensor.nbytes()) } @@ -1759,7 +1761,7 @@ void ProcessGroupACCL::run_recv(at::Tensor out_tensor, int srcRank, PRE_REQUEST(Receive, out_tensor) - ACCL::ACCLRequest* req = accl->recv(*out_buf, out_tensor.numel(), srcRank, tag, ACCL::GLOBAL_COMM, true, get_compressed_type(out_tensor.scalar_type())); + ACCL::ACCLRequest* req = accl->recv(*out_buf, out_tensor.numel(), srcRank, tag); POST_REQUEST("recv", out_tensor.nbytes()) diff --git a/integrations/pytorch_ddp/test/test-generic.py b/integrations/pytorch_ddp/test/test-generic.py index 0a1a0044..e6a88d31 100644 --- a/integrations/pytorch_ddp/test/test-generic.py +++ b/integrations/pytorch_ddp/test/test-generic.py @@ -49,9 +49,12 @@ rank = 0 size = 0 -count = 16 * 1 -num_el = 16 * 1 -shape = (16 , 1) +x = 128 +y = 2 + +count = x * y +num_el = x * y +shape = (x , y) #As in test.cpp defaults rxbufsize = 4096 * 1024 @@ -264,22 +267,27 @@ def test_reduce(): def test_allreduce(): - global num_errors - x = torch.ones(shape) - with torch.profiler.record_function("test_allreduce"): - - dist.all_reduce(x, dist.ReduceOp.SUM) - mpi.Barrier() - - try: - np.testing.assert_allclose(x, torch.full(shape, float(size))) - except AssertionError as e: - num_errors = num_errors + 1 - logger.debug("Test AllReduce failed") - logger.debug(str(e)) - else: - logger.debug("Test AllReduce finished!") + for i in range(8,21): + # if True: + + shape = (28938,) + global 
num_errors + x = torch.ones(shape) + + with torch.profiler.record_function("test_allreduce"): + + dist.all_reduce(x, dist.ReduceOp.SUM) + mpi.Barrier() + + try: + np.testing.assert_allclose(x, torch.full(shape, float(size))) + except AssertionError as e: + num_errors = num_errors + 1 + logger.debug("Test AllReduce failed") + logger.debug(str(e)) + else: + logger.debug("Test AllReduce finished!") def test_alltoall(): @@ -462,20 +470,20 @@ def start_test(comms: str, simulator: bool, host_file: str=None, fpga_file: str= # with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], # profile_memory=True, record_shapes=True) as prof: - test_broadcast_segment() + # test_broadcast_segment() + # test_broadcast() # test_broadcast() # test_broadcast() - test_broadcast() # test_broadcast() # test_broadcast() - test_broadcast_2() - test_sendrcv() - test_scatter() - test_gather() - test_allgather() - test_alltoall() + # test_broadcast_2() + # test_sendrcv() + # test_scatter() + # test_gather() + # test_allgather() + # test_alltoall() test_allreduce() - test_allgather() + # test_allgather() # test_allreduce() # test_allreduce() # test_allreduce() @@ -483,7 +491,7 @@ def start_test(comms: str, simulator: bool, host_file: str=None, fpga_file: str= # test_reduce() - demo_basic(rank) + # demo_basic(rank) mpi.Barrier() diff --git a/integrations/pytorch_ddp/test/test-imagenet.py b/integrations/pytorch_ddp/test/test-imagenet.py index adcd5aab..eda571e6 100644 --- a/integrations/pytorch_ddp/test/test-imagenet.py +++ b/integrations/pytorch_ddp/test/test-imagenet.py @@ -80,6 +80,9 @@ def train(model, criterion, optimizer, scheduler, num_epochs=25): torch.save(model.state_dict(), best_model_params_path) best_acc = 0.0 + print("train len: " + str(len(dataloaders['train']))) + print("val len: " + str(len(dataloaders['val']))) + for epoch in range(num_epochs): print(f'Epoch {epoch}/{num_epochs - 1}') print('-' * 10) @@ -96,6 +99,7 @@ def train(model, criterion, optimizer, scheduler, num_epochs=25): # Iterate over data. count = 0 + for inputs, labels in dataloaders[phase]: inputs = inputs.to(device) @@ -120,8 +124,8 @@ def train(model, criterion, optimizer, scheduler, num_epochs=25): running_loss += loss.item() * inputs.size(0) running_corrects += torch.sum(preds == labels.data) - print(f'{phase} RunningLoss: {running_loss:.4f}') - logger.debug(f'{phase} RunningLoss: {running_loss:.4f}') + print(f'{phase} batch Loss: {loss.item():.4f}') + logger.debug(f'{phase} batch Loss: {loss.item():.4f}') if count % 5 == 0 and rank == 0: print("saving model to " + best_model_params_path) diff --git a/integrations/pytorch_ddp/test/test-mnist.py b/integrations/pytorch_ddp/test/test-mnist.py index b26bff82..5335b936 100644 --- a/integrations/pytorch_ddp/test/test-mnist.py +++ b/integrations/pytorch_ddp/test/test-mnist.py @@ -134,6 +134,11 @@ def test(): print("Assung DDP setup") args.d = True + + host_file = args.host_file + fpga_file = args.fpga_file + comms = args.comms + start_port = 5005 global rank, size if args.master_address==None: @@ -212,7 +217,7 @@ def test(): } cnn = CNN() - if args.d : cnn = DDP(cnn, bucket_cap_mb=4) + if args.d : cnn = DDP(cnn, bucket_cap_mb=2) loss_func = nn.CrossEntropyLoss() From 8665bc6630c4a4d284c5b05ace1f15316de9dc9b Mon Sep 17 00:00:00 2001 From: Laurent Wirz Date: Fri, 2 Aug 2024 11:22:51 +0200 Subject: [PATCH 40/64] Disabled bcast and ar sidestepping. 
added ar segmenting --- .../pytorch_ddp/src/ProcessGroupACCL.cpp | 32 ++++++++++++------- .../pytorch_ddp/test/test-imagenet.py | 2 +- 2 files changed, 22 insertions(+), 12 deletions(-) diff --git a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp index fe66072c..0673d2ec 100644 --- a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp +++ b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp @@ -49,10 +49,13 @@ namespace c10d { // Toggles to run Collectives via OpenMPI instead(To sidestep any issues with them in ACCL) // The sidestep-code is copied from the ProcessGroupMPI -#define BROADCAST_SIDESTEP #define SCATTER_SIDESTEP #define GATHER_SIDESTEP -#define ALLGATHER_SIDESTEP +// #define ALLGATHER_SIDESTEP + +#define BROADCAST_SIDESTEP false +// #define BROADCAST_SIDESTEP true + #define ALLREDUCE_SIDESTEP false // #define ALLREDUCE_SIDESTEP true @@ -1097,9 +1100,11 @@ ProcessGroupACCL::broadcast(std::vector &tensors, checkSingleTensor(tensors); std::function &)> runFunc = [opts, this](std::unique_ptr &entry) { - #ifdef BROADCAST_SIDESTEP + // if (((entry->src)[0]).numel() <= RDVZ_THRESHOLD || BROADCAST_SIDESTEP){ + if (BROADCAST_SIDESTEP){ + auto data = (entry->src)[0]; - ACCL::debug("[Broadcast] -- Sidestepped using OpenMPI --"); + ACCL::debug("[Broadcast] -- Sidestepped using OpenMPI -- size " + std::to_string(data.numel())); c10::DeviceGuard guard(data.device()); std::unique_lock globalLock(pgGlobalMutex_); MPI_CHECK(MPI_Bcast( @@ -1108,7 +1113,7 @@ ProcessGroupACCL::broadcast(std::vector &tensors, mpiDatatype.at(data.scalar_type()), opts.rootRank, MPI_COMM_WORLD)); - #else + } else { std::chrono::time_point start = std::chrono::high_resolution_clock::now(); at::Tensor &tensor = (entry->src)[0]; // Segment data if necessary @@ -1128,7 +1133,7 @@ ProcessGroupACCL::broadcast(std::vector &tensors, auto end = std::chrono::high_resolution_clock::now(); double durationUs = (std::chrono::duration_cast(end-start).count() / 1000.0); ACCL::debug("Total bcast durationUs:" + std::to_string(durationUs)); - #endif + } }; auto entry = std::make_unique(&tensors, &tensors, std::move(runFunc)); @@ -1170,8 +1175,10 @@ ProcessGroupACCL::allreduce(std::vector &tensors, std::function &)> runFunc = [opts, this](std::unique_ptr &entry) { // sidestep eager allreduce - if (((entry->src)[0]).nbytes() <= RDVZ_THRESHOLD || ALLREDUCE_SIDESTEP){ + // if (((entry->src)[0]).numel() <= RDVZ_THRESHOLD || ALLREDUCE_SIDESTEP){ + if (ALLREDUCE_SIDESTEP){ auto data = (entry->src)[0]; + ACCL::debug("[Allreduce] -- Sidestepped using OpenMPI -- size " + std::to_string(data.numel())); c10::DeviceGuard guard(data.device()); std::unique_lock globalLock(pgGlobalMutex_); MPI_CHECK(MPI_Allreduce( @@ -1184,10 +1191,13 @@ ProcessGroupACCL::allreduce(std::vector &tensors, } else { auto tensor = (entry->src)[0]; // Segment data if necessary - if (tensor.nbytes() > bufsize) { - size_t n = bufsize / tensor.itemsize(); - for (size_t i = 0; i < tensor.numel(); i += n) { - size_t end = std::min(i + n, static_cast(tensor.numel())); + if (tensor.nbytes() > bufsize/2) { + size_t non_zero_dim_count = tensor.numel() / tensor.size(0); + size_t n = bufsize / 2 / tensor.itemsize() / non_zero_dim_count; + ACCL::debug("[Allreduce] Segmenting tensor of size " + std::to_string(tensor.nbytes()) + " into " + std::to_string(n * non_zero_dim_count) + "-sized elements "); + for (size_t i = 0; i < tensor.size(0); i += n) { + ACCL::debug("part " + std::to_string(i) + "!"); + size_t end = std::min(i + n, 
static_cast(tensor.size(0))); run_allreduce(tensor.slice(0, i, end), opts); } } else { diff --git a/integrations/pytorch_ddp/test/test-imagenet.py b/integrations/pytorch_ddp/test/test-imagenet.py index eda571e6..5e725035 100644 --- a/integrations/pytorch_ddp/test/test-imagenet.py +++ b/integrations/pytorch_ddp/test/test-imagenet.py @@ -277,7 +277,7 @@ def train(model, criterion, optimizer, scheduler, num_epochs=25): model_ft.fc = nn.Linear(num_ftrs, 2) - if args.d : model_ft = DDP(model_ft, bucket_cap_mb=4) + if args.d : model_ft = DDP(model_ft, bucket_cap_mb=2) criterion = nn.CrossEntropyLoss() From 8730249d3f4b8e36051d666c8bdbd6028efa09ee Mon Sep 17 00:00:00 2001 From: Laurent Wirz Date: Sun, 4 Aug 2024 00:22:48 +0200 Subject: [PATCH 41/64] Added rounding to allreduce. dlrm works --- integrations/pytorch_ddp/src/ProcessGroupACCL.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp index 0673d2ec..15dbf2c7 100644 --- a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp +++ b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp @@ -1156,11 +1156,10 @@ void ProcessGroupACCL::run_allreduce(at::Tensor in_tensor, // It seems to have issues with non-even numbers, so we round to 256 - // int rounded_count = (in_tensor.numel() + 255) & ~255; - // int rounded_count = 32768; + int rounded_count = (in_tensor.numel() + 255) & ~255; - accl->allreduce(*in_buf, *out_buf, in_tensor.numel(), acclOp.at(opts.reduceOp)); + accl->allreduce(*in_buf, *out_buf, rounded_count, acclOp.at(opts.reduceOp)); POST_REQUEST("allreduce", in_tensor.nbytes()) From c6782f43ee0b14d60175b0fe22b58df86574eb90 Mon Sep 17 00:00:00 2001 From: Laurent Wirz Date: Tue, 6 Aug 2024 10:11:46 +0200 Subject: [PATCH 42/64] Added more fine-grained benchmarking to ar and a2a --- .../pytorch_ddp/src/ProcessGroupACCL.cpp | 116 +++++++++++------- 1 file changed, 74 insertions(+), 42 deletions(-) diff --git a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp index 15dbf2c7..54e693e9 100644 --- a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp +++ b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp @@ -61,7 +61,35 @@ namespace c10d { // #define ALLREDUCE_SIDESTEP true #define RDVZ_THRESHOLD 64 - + +#define MICRO_BENCH_FINE 1 + +#define MICRO_BENCH_COARSE 1 + +#if MICRO_BENCH_FINE +#define START_FINE(name) \ + std::chrono::time_point start_##name = std::chrono::high_resolution_clock::now(); +#define STOP_FINE(name) \ + auto end_##name = std::chrono::high_resolution_clock::now(); \ + double durationUs_##name = (std::chrono::duration_cast(end_##name-start_##name).count() / 1000.0); \ + ACCL::debug(#name "_tensor durationUs:" + std::to_string(durationUs_##name)); +#else +#define START_FINE(name) +#define STOP_FINE(name) +#endif + +#if MICRO_BENCH_COARSE +#define START_COARSE(name) \ + std::chrono::time_point start_##name = std::chrono::high_resolution_clock::now(); +#define STOP_COARSE(name) \ + auto end_##name = std::chrono::high_resolution_clock::now(); \ + double durationUs_##name = (std::chrono::duration_cast(end_##name-start_##name).count() / 1000.0); \ + ACCL::debug(#name "_tensor durationUs:" + std::to_string(durationUs_##name)); +#else +#define START_COARSE(name) +#define STOP_COARSE(name) +#endif + // Used in sidestepping #define MPI_CHECK(cmd) \ do { \ @@ -123,25 +151,19 @@ std::map mpiDatatype = { #define DO_COND ((do_on_root && opts_root_rank == rank_) || (do_on_others 
&& opts_root_rank != rank_)) #define PRE_REQUEST(opname, tensor) \ - in_buf->change_type(convert_datatype_from_torch(tensor.scalar_type())); \ - out_buf->change_type(convert_datatype_from_torch(tensor.scalar_type())); \ - ACCL::debug("[" #opname "] Entering barrier"); \ + START_FINE(type) \ + in_buf->change_type(convert_datatype_from_torch(tensor.scalar_type())); \ + out_buf->change_type(convert_datatype_from_torch(tensor.scalar_type())); \ + STOP_FINE(type) \ + ACCL::debug("[" #opname "] Entering barrier"); \ + START_FINE(barrier) \ accl->barrier(); \ - ACCL::debug("Starting " #opname " of " + std::to_string(tensor.numel()) + " items"); \ - std::chrono::time_point start; \ - if(coyote_enabled){ \ - start = std::chrono::high_resolution_clock::now(); \ - } + STOP_FINE(barrier) \ + ACCL::debug("Performing " #opname " of " + std::to_string(tensor.numel()) + " items"); \ + START_FINE(lib) #define POST_REQUEST(opname, n_bytes) \ -double durationUs = 0.0; \ -ACCL::debug("Waiting for request to complete."); \ -if(coyote_enabled){ \ - auto end = std::chrono::high_resolution_clock::now(); \ - durationUs = (std::chrono::duration_cast(end-start).count() / 1000.0); \ - ACCL::debug("host measured durationUs:" + std::to_string(durationUs)); \ -} \ -ACCL::debug("Finished waiting"); +STOP_FINE(lib) #define TIMER_WRAP() @@ -1053,7 +1075,7 @@ void ProcessGroupACCL::run_broadcast(at::Tensor in_tensor, // This case split is necessary, because otherwise data will be set to a nullptr - std::chrono::time_point start_init = std::chrono::high_resolution_clock::now(); + START_FINE(init) if (opts.rootRank == rank_){ init_input_tensor(in_tensor, in_buf, true, false, opts.rootRank); @@ -1061,16 +1083,17 @@ void ProcessGroupACCL::run_broadcast(at::Tensor in_tensor, // else{ // init_output_data(in_tensor, in_buf, in_tensor.numel(), in_tensor.scalar_type(), false, true, opts.rootRank); // } - auto end_init = std::chrono::high_resolution_clock::now(); - double durationUs_init = (std::chrono::duration_cast(end_init-start_init).count() / 1000.0); - ACCL::debug("init tensor durationUs:" + std::to_string(durationUs_init)); + + STOP_FINE(init) // Reserve device - std::chrono::time_point start_lock = std::chrono::high_resolution_clock::now(); + + START_FINE(lock) c10::DeviceGuard guard(in_tensor.device()); std::unique_lock globalLock(pgGlobalMutex_); - + STOP_FINE(lock) + PRE_REQUEST(Broadcast,in_tensor) accl->bcast(*in_buf, in_tensor.numel(), opts.rootRank); @@ -1081,17 +1104,9 @@ void ProcessGroupACCL::run_broadcast(at::Tensor in_tensor, // for(int i = 0; ibyte_array())[i])); // } - - std::chrono::time_point start_copy = std::chrono::high_resolution_clock::now(); + START_FINE(copy) copy_back_tensor(in_tensor, in_buf, true, true, opts.rootRank); - auto end_copy = std::chrono::high_resolution_clock::now(); - double durationUs_copy = (std::chrono::duration_cast(end_copy-start_copy).count() / 1000.0); - ACCL::debug("Copy tensor durationUs:" + std::to_string(durationUs_copy)); - - - auto end_inner = std::chrono::high_resolution_clock::now(); - double durationUs_inner = (std::chrono::duration_cast(end_inner-start_inner).count() / 1000.0); - ACCL::debug("Inner total tensor durationUs:" + std::to_string(durationUs_inner)); + STOP_FINE(copy) } c10::intrusive_ptr @@ -1100,6 +1115,7 @@ ProcessGroupACCL::broadcast(std::vector &tensors, checkSingleTensor(tensors); std::function &)> runFunc = [opts, this](std::unique_ptr &entry) { + ACCL::debug("Starting Broadcast"); // if (((entry->src)[0]).numel() <= RDVZ_THRESHOLD || 
BROADCAST_SIDESTEP){ if (BROADCAST_SIDESTEP){ @@ -1114,7 +1130,7 @@ ProcessGroupACCL::broadcast(std::vector &tensors, opts.rootRank, MPI_COMM_WORLD)); } else { - std::chrono::time_point start = std::chrono::high_resolution_clock::now(); + START_COARSE(total) at::Tensor &tensor = (entry->src)[0]; // Segment data if necessary if (tensor.nbytes() > bufsize) { @@ -1130,9 +1146,7 @@ ProcessGroupACCL::broadcast(std::vector &tensors, ACCL::debug("[Broadcast] Broadcasting entire tensor of size " + std::to_string(tensor.nbytes()) + " without segmentation."); run_broadcast(tensor, opts); } - auto end = std::chrono::high_resolution_clock::now(); - double durationUs = (std::chrono::duration_cast(end-start).count() / 1000.0); - ACCL::debug("Total bcast durationUs:" + std::to_string(durationUs)); + STOP_COARSE(total) } }; auto entry = @@ -1144,13 +1158,18 @@ ProcessGroupACCL::broadcast(std::vector &tensors, void ProcessGroupACCL::run_allreduce(at::Tensor in_tensor, const AllreduceOptions &opts) { - STANDARD_DECL + START_FINE(init) + + init_input_tensor(in_tensor, in_buf, true, true); + + STOP_FINE(init) - init_input_tensor(in_tensor, in_buf, true, true); + START_FINE(lock) // Reserve device c10::DeviceGuard guard(in_tensor.device()); std::unique_lock globalLock(pgGlobalMutex_); + STOP_FINE(lock) PRE_REQUEST(Allreduce,in_tensor) @@ -1163,7 +1182,9 @@ void ProcessGroupACCL::run_allreduce(at::Tensor in_tensor, POST_REQUEST("allreduce", in_tensor.nbytes()) + START_FINE(copy) copy_back_tensor(in_tensor, out_buf, true, true); + STOP_FINE(copy) } c10::intrusive_ptr @@ -1173,6 +1194,7 @@ ProcessGroupACCL::allreduce(std::vector &tensors, std::function &)> runFunc = [opts, this](std::unique_ptr &entry) { + ACCL::debug("Starting Allreduce"); // sidestep eager allreduce // if (((entry->src)[0]).numel() <= RDVZ_THRESHOLD || ALLREDUCE_SIDESTEP){ if (ALLREDUCE_SIDESTEP){ @@ -1188,6 +1210,7 @@ ProcessGroupACCL::allreduce(std::vector &tensors, mpiOp.at(opts.reduceOp), MPI_COMM_WORLD)); } else { + START_COARSE(total) auto tensor = (entry->src)[0]; // Segment data if necessary if (tensor.nbytes() > bufsize/2) { @@ -1202,6 +1225,7 @@ ProcessGroupACCL::allreduce(std::vector &tensors, } else { run_allreduce(tensor, opts); } + STOP_COARSE(total) } }; auto entry = @@ -1599,11 +1623,16 @@ c10::intrusive_ptr ProcessGroupACCL::reduce_scatter( void ProcessGroupACCL::run_alltoall(at::Tensor in_tensor, at::Tensor out_tensor, const AllToAllOptions &opts) { - init_input_tensor(in_tensor, in_buf, true, true); + + START_FINE(init) + init_input_tensor(in_tensor, in_buf, true, true); + STOP_FINE(init) // Reserve device + START_FINE(lock) c10::DeviceGuard guard(in_tensor.device()); std::unique_lock globalLock(pgGlobalMutex_); + STOP_FINE(lock) // init_output_data(out_tensor, dstdata, out_tensor.numel(), out_tensor.scalar_type(), true, true); @@ -1613,8 +1642,9 @@ void ProcessGroupACCL::run_alltoall(at::Tensor in_tensor, POST_REQUEST("alltoall", in_tensor.nbytes()/size_) + START_FINE(copy) copy_back_tensor(out_tensor, out_buf, true, true); - + STOP_FINE(copy) } @@ -1645,7 +1675,7 @@ c10::intrusive_ptr ProcessGroupACCL::alltoall_base( at::Tensor &outputTensor, at::Tensor &inputTensor, std::vector &outputSplitSizes, std::vector &inputSplitSizes, const AllToAllOptions &opts) { - ACCL::debug("alltoall base variant called"); + ACCL::debug("Starting AlltoAll"); if (outputSplitSizes.size() == 0 && inputSplitSizes.size() == 0) { // We can use alltoall TORCH_CHECK( @@ -1658,6 +1688,7 @@ c10::intrusive_ptr ProcessGroupACCL::alltoall_base( 
std::function&)> runFunc =
        [opts, this](std::unique_ptr& entry) {
+	  START_COARSE(total)
          auto srctensor = (entry->src)[0];
          auto dsttensor = (entry->dst)[0];
 
@@ -1696,6 +1727,7 @@ c10::intrusive_ptr ProcessGroupACCL::alltoall_base(
             ACCL::debug("Running without segmentation");
             run_alltoall(srctensor, dsttensor, opts);
           }
+	  STOP_COARSE(total)
         };
     std::vector inputTensors = {inputTensor};
     std::vector outputTensors = {outputTensor};

From 747961c9963670d70df3fb1269a9c412e7af086d Mon Sep 17 00:00:00 2001
From: Laurent Wirz
Date: Mon, 12 Aug 2024 01:33:07 +0200
Subject: [PATCH 43/64] Added sidestep bcast with allreduce for resnet50

---
 integrations/pytorch_ddp/README.md            |   2 +
 .../pytorch_ddp/src/ProcessGroupACCL.cpp      |  50 +++-
 integrations/pytorch_ddp/test/run.sh          |   4 +-
 integrations/pytorch_ddp/test/test-generic.py |  94 ++++---
 integrations/pytorch_ddp/test/test-mnist.py   |  55 +++-
 .../pytorch_ddp/test/test-resnet50.py         | 251 ++++++++++++++++++
 6 files changed, 403 insertions(+), 53 deletions(-)
 create mode 100644 integrations/pytorch_ddp/test/test-resnet50.py

diff --git a/integrations/pytorch_ddp/README.md b/integrations/pytorch_ddp/README.md
index 27532913..94a0d07d 100644
--- a/integrations/pytorch_ddp/README.md
+++ b/integrations/pytorch_ddp/README.md
@@ -12,6 +12,8 @@ python3 -m venv venv
 source venv/bin/activate
 ```
 
+Activate an XRT 2021 (21.x) release before installing; newer XRT versions have caused issues with this setup in the past.
+
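As context for the ProcessGroupACCL.cpp changes below: the "sidestep" replaces `bcast(root)` with a SUM-allreduce in which every non-root rank contributes zeros, so the element-wise sum reproduces the root's buffer on all ranks. A minimal self-contained model of that identity (plain Python, no ACCL or MPI required; sizes and values are illustrative):

```python
# Each "rank" contributes a buffer; only the root's is non-zero.
world_size, root, n = 4, 1, 8
contributions = [[0.0] * n for _ in range(world_size)]
contributions[root] = [0.5 * i for i in range(n)]  # the root's payload

# What allreduce(SUM) would leave in every rank's output buffer:
result = [sum(col) for col in zip(*contributions)]
assert result == contributions[root]  # broadcast achieved via summation
print(result)
```

The trade-off is bandwidth: every rank now sends data instead of only the root, which is presumably acceptable here as a stopgap.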
Installation without GPU support To install the plugin without GPU support, simply run the following from within the venv: diff --git a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp index 54e693e9..f74aed8c 100644 --- a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp +++ b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp @@ -60,6 +60,8 @@ namespace c10d { #define ALLREDUCE_SIDESTEP false // #define ALLREDUCE_SIDESTEP true +#define SIDESTEP_BCAST_WITH_ALLREDUCE + #define RDVZ_THRESHOLD 64 #define MICRO_BENCH_FINE 1 @@ -959,7 +961,7 @@ void ProcessGroupACCL::initialize() { } else { - ACCL::debug(std::string("Performing standard initialization")); + // ACCL::debug(std::string("Error XRT initialization deprecated")); accl = accl_network_utils::initialize_accl(ranks_, rank_, simulator_, design_, xrt_device, xclbin_, nbufs_, bufsize, 0, @@ -1074,6 +1076,41 @@ void ProcessGroupACCL::run_broadcast(at::Tensor in_tensor, std::chrono::time_point start_inner = std::chrono::high_resolution_clock::now(); // This case split is necessary, because otherwise data will be set to a nullptr + #ifdef SIDESTEP_BCAST_WITH_ALLREDUCE + START_FINE(init) + + if (opts.rootRank == rank_){ + init_input_tensor(in_tensor, in_buf, true, false, opts.rootRank); + } + else{ + auto zero_tensor = torch::zeros({in_tensor.numel()}, in_tensor.scalar_type()); + init_input_tensor(zero_tensor, in_buf, false, true, opts.rootRank); + } + + STOP_FINE(init) + + START_FINE(lock) + // Reserve device + c10::DeviceGuard guard(in_tensor.device()); + std::unique_lock globalLock(pgGlobalMutex_); + STOP_FINE(lock) + + PRE_REQUEST(Broadcast,in_tensor) + + + // It seems to have issues with non-even numbers, so we round to 256 + int rounded_count = (in_tensor.numel() + 1023) & ~1023; + + + accl->allreduce(*in_buf, *out_buf, rounded_count, ACCL::reduceFunction::SUM); + + POST_REQUEST("allreduce", in_tensor.nbytes()) + + START_FINE(copy) + copy_back_tensor(in_tensor, out_buf, true, true); + STOP_FINE(copy) + + #else START_FINE(init) @@ -1096,7 +1133,9 @@ void ProcessGroupACCL::run_broadcast(at::Tensor in_tensor, PRE_REQUEST(Broadcast,in_tensor) - accl->bcast(*in_buf, in_tensor.numel(), opts.rootRank); + int rounded_count = (in_tensor.numel() + 1023) & ~1023; + + accl->bcast(*in_buf, rounded_count, opts.rootRank); POST_REQUEST("bcast", in_tensor.nbytes()) @@ -1107,6 +1146,7 @@ void ProcessGroupACCL::run_broadcast(at::Tensor in_tensor, START_FINE(copy) copy_back_tensor(in_tensor, in_buf, true, true, opts.rootRank); STOP_FINE(copy) + #endif } c10::intrusive_ptr @@ -1133,9 +1173,9 @@ ProcessGroupACCL::broadcast(std::vector &tensors, START_COARSE(total) at::Tensor &tensor = (entry->src)[0]; // Segment data if necessary - if (tensor.nbytes() > bufsize) { + if (tensor.nbytes() > bufsize / 2) { size_t non_zero_dim_count = tensor.numel() / tensor.size(0); - size_t n = bufsize / tensor.itemsize() / non_zero_dim_count; + size_t n = bufsize / 2 / tensor.itemsize() / non_zero_dim_count; ACCL::debug("[Broadcast] Segmenting tensor of size " + std::to_string(tensor.nbytes()) + " into " + std::to_string(n * non_zero_dim_count) + "-sized elements "); for (size_t i = 0; i < tensor.size(0); i += n) { ACCL::debug("part " + std::to_string(i) + "!"); @@ -1175,7 +1215,7 @@ void ProcessGroupACCL::run_allreduce(at::Tensor in_tensor, // It seems to have issues with non-even numbers, so we round to 256 - int rounded_count = (in_tensor.numel() + 255) & ~255; + int rounded_count = (in_tensor.numel() + 1023) & ~1023; 
accl->allreduce(*in_buf, *out_buf, rounded_count, acclOp.at(opts.reduceOp)); diff --git a/integrations/pytorch_ddp/test/run.sh b/integrations/pytorch_ddp/test/run.sh index ebabfe7f..78688b9d 100755 --- a/integrations/pytorch_ddp/test/run.sh +++ b/integrations/pytorch_ddp/test/run.sh @@ -10,8 +10,9 @@ if [[ -v ACCL_SCRIPT ]]; then SCRIPT_NAME="$ACCL_SCRIPT" else # SCRIPT_NAME="test-mnist.py -d True -n 2" # MNIST - SCRIPT_NAME="test-imagenet.py -d True" + SCRIPT_NAME="test-resnet50.py -d True -n 2" # MNIST # SCRIPT_NAME=test-generic.py + # SCRIPT_NAME="test-imagenet.py -d True" echo "Variable ACCL_SCRIPT not set. Assuming $SCRIPT_NAME" fi @@ -100,6 +101,7 @@ echo "Running with $NUM_PROCESS Processes" rm -f $(pwd)/accl_log/rank* rm -f $(pwd)/accl_log/accl_pg_* +rm -rf $(pwd)/accl_log/profiler_log # C="mpirun -n $NUM_PROCESS $MPI_ARGS -outfile-pattern \"$(pwd)/accl_log/rank_%r_stdout\" $EXEC $ARG &" C="mpirun -n $NUM_PROCESS $MPI_ARGS -outfile-pattern \"$(pwd)/accl_log/rank_%r_stdout\" -errfile-pattern \"$(pwd)/accl_log/rank_%r_stderr\" $EXEC $ARG &" diff --git a/integrations/pytorch_ddp/test/test-generic.py b/integrations/pytorch_ddp/test/test-generic.py index e6a88d31..f514b69a 100644 --- a/integrations/pytorch_ddp/test/test-generic.py +++ b/integrations/pytorch_ddp/test/test-generic.py @@ -49,8 +49,8 @@ rank = 0 size = 0 -x = 128 -y = 2 +x = 28939 +y = 1 count = x * y num_el = x * y @@ -84,14 +84,21 @@ def test_broadcast_segment(): logger.debug("Test broadcast finished!") def test_broadcast(): + shape = (256,) + global num_errors - if rank == 0: - x = torch.ones(shape) - else: - x = torch.zeros(shape) - for i in range(1): - with torch.profiler.record_function("test bcast " + str(i)): + + # for i in range(10): + if True: + + if rank == 0: + x = torch.ones(shape) + else: + x = torch.zeros(shape) + + + with torch.profiler.record_function("test bcast "): start_time = time.perf_counter() @@ -123,8 +130,8 @@ def test_broadcast(): logger.debug("Test broadcast finished!") def test_broadcast_2(): - test_type = torch.double - shape_2 = (2, 2) + test_type = torch.float + shape_2 = (1048576,) global num_errors if rank == 0: x = torch.ones(shape_2, dtype=test_type) @@ -226,7 +233,7 @@ def test_gather(): def test_allgather(): global num_errors - shape_gather = (1,) + shape_gather = (2,) x = torch.full(shape_gather, float(rank), dtype=torch.float) y = [torch.empty(shape_gather, dtype=torch.float) for _ in range(size)] @@ -268,10 +275,11 @@ def test_reduce(): def test_allreduce(): - for i in range(8,21): - # if True: + # for i in range(10): + if True: - shape = (28938,) + shape = (256,) + # shape = (320001,) global num_errors x = torch.ones(shape) @@ -293,6 +301,10 @@ def test_allreduce(): def test_alltoall(): global num_errors + num_el = 26624 + + shape = (num_el,) + input = torch.arange(num_el, dtype=torch.float) + float(rank) * num_el input_shaped = input.reshape(shape) @@ -467,31 +479,35 @@ def start_test(comms: str, simulator: bool, host_file: str=None, fpga_file: str= global num_errors num_errors = 0 - # with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], - # profile_memory=True, record_shapes=True) as prof: - - # test_broadcast_segment() - # test_broadcast() - # test_broadcast() - # test_broadcast() - # test_broadcast() - # test_broadcast() - # test_broadcast_2() - # test_sendrcv() - # test_scatter() - # test_gather() - # test_allgather() - # test_alltoall() - test_allreduce() - # test_allgather() - # test_allreduce() - # test_allreduce() - # test_allreduce() - - # test_reduce() - 
- - # demo_basic(rank) + # with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], profile_memory=True, record_shapes=True) as prof: + + # for i in range(10): + if True: + + # test_broadcast_2() + test_broadcast() + test_allreduce() + # test_allgather() + # test_broadcast_segment() + # test_broadcast() + # test_broadcast() + # test_broadcast() + # test_broadcast() + # test_broadcast() + # test_sendrcv() + # test_scatter() + # test_gather() + # test_allgather() + # test_alltoall() + # test_allreduce() + # test_allgather() + # test_allreduce() + # test_allreduce() + + # test_reduce() + + + # demo_basic(rank) mpi.Barrier() diff --git a/integrations/pytorch_ddp/test/test-mnist.py b/integrations/pytorch_ddp/test/test-mnist.py index 5335b936..ad1c3158 100644 --- a/integrations/pytorch_ddp/test/test-mnist.py +++ b/integrations/pytorch_ddp/test/test-mnist.py @@ -2,6 +2,7 @@ from torchvision import datasets from torchvision.transforms import ToTensor from torch.utils.data import DataLoader +from torch.profiler import profile, ProfilerActivity import torch.nn as nn from torch import optim from torch.autograd import Variable @@ -16,6 +17,7 @@ import os import sys import logging +import time logging.basicConfig(stream=sys.stderr, level=logging.DEBUG) @@ -57,7 +59,9 @@ def forward(self, x): output = self.out(x) return output, x # return x for visualization -def train(num_epochs, cnn, loaders): +def train(num_epochs, cnn, loaders, p): + + start_time_train = time.perf_counter() cnn.train() @@ -68,7 +72,8 @@ def train(num_epochs, cnn, loaders): for epoch in range(num_epochs): for i, (images, labels) in enumerate(loaders['train']): - + p.step() + start_time = time.perf_counter() # gives batch data, normalize x when iterate train_loader b_x = Variable(images) # batch x b_y = Variable(labels) # batch y @@ -86,24 +91,39 @@ def train(num_epochs, cnn, loaders): # if (i+1) % 100 == 0: if True: - print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}' - .format(epoch + 1, num_epochs, i + 1, total_step, loss.item())) + end_time = time.perf_counter() + measured_time = (end_time - start_time) * 1000000 + print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Time(us): {}' + .format(epoch + 1, num_epochs, i + 1, total_step, loss.item(), measured_time)) + + end_time_train = time.perf_counter() + measured_time_train = (end_time_train - start_time_train) * 1000000 + + print('Total train time: ' + str(measured_time_train)) -def test(): +def test(p): # Test the model + start_time_test = time.perf_counter() cnn.eval() with torch.no_grad(): correct = 0 total = 0 for images, labels in loaders['test']: + p.step() test_output, last_layer = cnn(images) pred_y = torch.max(test_output, 1)[1].data.squeeze() correct_current = (pred_y == labels).sum().item() total += labels.size(0) correct += correct_current - print(f'Test Batch accuracy: {correct_current}/{labels.size(0)} {correct_current/float(label.size(0))}') + print(f'Test Batch accuracy: {correct_current}/{labels.size(0)} {correct_current/float(labels.size(0))}') + + + end_time_test = time.perf_counter() + measured_time_test = (end_time_test - start_time_test) * 1000000 + + print('Total test time: ' + str(measured_time_test)) print(f'Total accuracy: {correct}/{total} {correct/float(total)}') if __name__ == "__main__": @@ -226,8 +246,27 @@ def test(): mpi.Barrier() print("starting training") + + schedule = torch.profiler.schedule( + wait=1, + warmup=1, + active=10, + repeat=3 + ) - train(num_epochs, cnn, loaders) + with torch.profiler.profile( + 
activities=[torch.profiler.ProfilerActivity.CPU], + schedule=schedule, + on_trace_ready=torch.profiler.tensorboard_trace_handler('./accl_log/profiler_log'), + record_shapes=True, + with_stack=True + ) as p: + + + train(num_epochs, cnn, loaders, p) + + test(p) - test() + p.stop() + print(p.key_averages().table(sort_by="self_cpu_time_total", row_limit=100)) diff --git a/integrations/pytorch_ddp/test/test-resnet50.py b/integrations/pytorch_ddp/test/test-resnet50.py new file mode 100644 index 00000000..deb97b57 --- /dev/null +++ b/integrations/pytorch_ddp/test/test-resnet50.py @@ -0,0 +1,251 @@ +import torch +import torchvision +from torchvision import datasets +from torchvision import models +from torchvision import transforms +from torchvision.transforms import ToTensor +from torch.utils.data import DataLoader +from torch.profiler import profile, ProfilerActivity +import torch.nn as nn +from torch import optim +from torch.autograd import Variable +import torch.distributed as dist +import accl_process_group as accl + +from mpi4py.MPI import COMM_WORLD as mpi +from torch.nn.parallel import DistributedDataParallel as DDP +from torch.utils.data.distributed import DistributedSampler + +import argparse +import os +import sys +import logging +import time + +logging.basicConfig(stream=sys.stderr, level=logging.DEBUG) + +logger = logging.getLogger(__name__) + +if "ACCL_DEBUG" in os.environ and os.environ["ACCL_DEBUG"]=="1": + logger.setLevel(logging.DEBUG) +else: + logger.setLevel(logging.WARNING) + +# Run via ACCL + +def train(num_epochs, model, loaders, criterion, p): + + start_time_train = time.perf_counter() + + model.train() + + total_step = len(loaders['train']) + + optimizer = optim.Adam(model.parameters(), lr = 0.001) + + for epoch in range(num_epochs): + model.train() + running_loss = 0.0 + for i, (inputs, labels) in enumerate(loaders['train']): + p.step() + start_time = time.perf_counter() + + optimizer.zero_grad() + outputs = model(inputs) + loss = criterion(outputs, labels) + loss.backward() + optimizer.step() + running_loss += loss.item() + + # if (i+1) % 10 == 0: + # break + if True: + end_time = time.perf_counter() + measured_time = (end_time - start_time) * 1000000 + print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Time(us): {}' + .format(epoch + 1, num_epochs, i + 1, total_step, loss.item(), measured_time)) + + end_time_train = time.perf_counter() + measured_time_train = (end_time_train - start_time_train) * 1000000 + + print('Total train time: ' + str(measured_time_train)) + + +def test(num_epochs, model, loaders, criterion, p): + # Test the model + start_time_test = time.perf_counter() + model.eval() + with torch.no_grad(): + correct = 0 + total = 0 + val_loss = 0 + for i, (inputs, labels) in enumerate(loaders['test']): + p.step() + test_output = model(inputs) + loss = criterion(test_output, labels) + val_loss += loss.item() + + _, predicted = torch.max(test_output, 1) + correct_current = (predicted == labels).sum().item() + total += labels.size(0) + correct += correct_current + + print(f'Test Batch accuracy: {correct_current}/{labels.size(0)} {correct_current/float(labels.size(0))}') + + + end_time_test = time.perf_counter() + measured_time_test = (end_time_test - start_time_test) * 1000000 + + print('Total test time: ' + str(measured_time_test)) + print(f'Total accuracy: {correct}/{total} {correct/float(total)}') + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + + parser.add_argument("-n", type=int, default=1) + parser.add_argument("-d", type=bool, 
default=None) + + + parser.add_argument('-s', '--simulator', action='store_true', + default=False, help='Use simulation instead of ' + 'hardware') + parser.add_argument('-c', '--comms', choices=['udp', 'tcp', 'cyt_rdma'], default='tcp', + help='Run tests over specified communication backend') + parser.add_argument('-i', '--host-file', type=str, help='Specify the file, where the host IPs are listed') + parser.add_argument('-f', '--fpga-file', type=str, help='Specify the file, where the FPGA IPs are listed') + parser.add_argument('-a','--master-address', type=str) + parser.add_argument('-p','--master-port', type=str) + + + args = parser.parse_args() + + if args.n == 1 and args.d == None : + print("only one machine specified. Assuming Non distributed setup") + args.d = False + elif args.n > 1 and args.d == None: + print("Assung DDP setup") + args.d = True + + + host_file = args.host_file + fpga_file = args.fpga_file + comms = args.comms + start_port = 5005 + + global rank, size + if args.master_address==None: + args.master_address = "localhost" + if args.master_port==None: + args.master_port = "30505" + os.environ['MASTER_ADDR'] = args.master_address + os.environ['MASTER_PORT'] = args.master_port + rank = mpi.Get_rank() + size = mpi.Get_size() + + rxbufsize = 4096 * 1024 + + if args.d: + if not args.simulator: + #default from test.cpp + rxbufsize = 4096 * 1024 + if host_file==None or fpga_file==None: sys.exit('Host and FPGA file need to be specified in hardware mode') + + with open(host_file, 'r') as hf: + host_ips = hf.read().splitlines() + + with open(fpga_file, 'r') as ff: + fpga_ips = ff.read().splitlines() + + if comms == "cyt_rdma": + ranks = [accl.Rank(a, start_port, i, rxbufsize) for i, a in enumerate(fpga_ips)] + else: + ranks = [accl.Rank(a, start_port + i, 0, rxbufsize) for i, a in enumerate(fpga_ips)] + else: + # Somehow the simulator gets stuck if I use the same rxbufsize + rxbufsize = 4096 * 1024 + ranks = [accl.Rank("127.0.0.1", 5500 + i, i, rxbufsize) for i in range(size)] + + logger.debug(f'Ranks: {ranks}') + + if args.comms == 'udp': + design = accl.ACCLDesign.udp + elif args.comms == 'tcp': + design = accl.ACCLDesign.tcp + elif args.comms == 'cyt_rdma': # and not simulator: + design = accl.ACCLDesign.cyt_rdma + + + mpi.Barrier() + + accl.create_process_group(ranks, design, bufsize=rxbufsize, initialize=True, simulation=args.simulator) + dist.init_process_group("ACCL", rank=rank, world_size=size) + + device = 'cpu' + + transform = transforms.Compose([ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + transforms.Normalize( + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225] + ) + ]) + + train_dataset = datasets.CIFAR10(root='cifar10_data', train=True, download=True, transform=transform) + val_dataset = datasets.CIFAR10(root='cifar10_data', train=False, download=True, transform=transform) + + if args.d : sampler = DistributedSampler + else : sampler = lambda x : None + + loaders = { + 'train' : torch.utils.data.DataLoader(train_dataset, + batch_size=32, + shuffle=False, + num_workers=4, + sampler=sampler(train_dataset)), + 'test' : torch.utils.data.DataLoader(val_dataset, + batch_size=32, + shuffle=False, + num_workers=4, + sampler=sampler(val_dataset)), + } + + model = models.resnet50(pretrained=True) + + if args.d : model = DDP(model, bucket_cap_mb=2, broadcast_buffers=False) + + loss_func = nn.CrossEntropyLoss() + + criterion = nn.CrossEntropyLoss() + + num_epochs = 1 + + mpi.Barrier() + + print("starting training") + + schedule = 
torch.profiler.schedule( + wait=1, + warmup=1, + active=10, + repeat=3 + ) + + + with torch.profiler.profile( + activities=[torch.profiler.ProfilerActivity.CPU], + schedule=schedule, + on_trace_ready=torch.profiler.tensorboard_trace_handler('./accl_log/profiler_log'), + record_shapes=True, + ) as p: + + + train(num_epochs, model, loaders, criterion, p) + + test(num_epochs, model, loaders, criterion, p) + + p.stop() + + print(p.key_averages().table(sort_by="self_cpu_time_total", row_limit=100)) From e8ffe0e66030a8231eb98bd0e6a54e4a0221ed51 Mon Sep 17 00:00:00 2001 From: Laurent Wirz Date: Mon, 12 Aug 2024 01:37:04 +0200 Subject: [PATCH 44/64] Replaced tensor copies with memcpy(solves performance issues) --- .../pytorch_ddp/src/ProcessGroupACCL.cpp | 17 ++++++++++------- integrations/pytorch_ddp/test/test-mnist.py | 3 +++ 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp index f74aed8c..73bf9ddf 100644 --- a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp +++ b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp @@ -60,7 +60,7 @@ namespace c10d { #define ALLREDUCE_SIDESTEP false // #define ALLREDUCE_SIDESTEP true -#define SIDESTEP_BCAST_WITH_ALLREDUCE +// #define SIDESTEP_BCAST_WITH_ALLREDUCE #define RDVZ_THRESHOLD 64 @@ -153,10 +153,10 @@ std::map mpiDatatype = { #define DO_COND ((do_on_root && opts_root_rank == rank_) || (do_on_others && opts_root_rank != rank_)) #define PRE_REQUEST(opname, tensor) \ - START_FINE(type) \ + START_FINE(type) \ in_buf->change_type(convert_datatype_from_torch(tensor.scalar_type())); \ out_buf->change_type(convert_datatype_from_torch(tensor.scalar_type())); \ - STOP_FINE(type) \ + STOP_FINE(type) \ ACCL::debug("[" #opname "] Entering barrier"); \ START_FINE(barrier) \ accl->barrier(); \ @@ -170,7 +170,7 @@ STOP_FINE(lib) #define TIMER_WRAP() // Better logging -// accl_log(mpi_rank, format_log("bcast", options, durationUs, 0)); \ +// accl_log(mpi_rank, format_log("bcast", options, durationUs, 0)); namespace { @@ -731,8 +731,9 @@ void accl_sa_handler(int) void ProcessGroupACCL::init_input_tensor(at::Tensor &tensor, std::unique_ptr &data, bool do_on_root, bool do_on_others, int opts_root_rank) { if DO_COND { ACCL::debug("Copying data to CPU tensor of size " + std::to_string(tensor.numel())); - at::Tensor wrapper_tensor = torch::from_blob(data->byte_array(), tensor.sizes(), tensor.options().device(c10::DeviceType::CPU)); - wrapper_tensor.copy_(tensor); + // at::Tensor wrapper_tensor = torch::from_blob(data->byte_array(), tensor.sizes(), tensor.options().device(c10::DeviceType::CPU)); + // wrapper_tensor.copy_(tensor); + std::memcpy(data->byte_array(), tensor.data_ptr(), tensor.numel() * tensor.element_size()); //TODO check if necessary in coyote if (!coyote_enabled) { @@ -827,7 +828,9 @@ void ProcessGroupACCL::copy_back_tensor(at::Tensor tensor_original, std::unique_ } else { ACCL::debug("Copying data back from CPU tensor of size " + std::to_string(tensor_original.numel())); - tensor_original.copy_(torch::from_blob(data->byte_array(), tensor_original.sizes(), tensor_original.options().device(c10::DeviceType::CPU))); + std::memcpy(tensor_original.data_ptr(), data->byte_array(), tensor_original.numel() * tensor_original.element_size()); + // tensor_original.copy_(torch::from_blob(data->byte_array(), tensor_original.sizes(), tensor_original.options().device(c10::DeviceType::CPU))); + ACCL::debug("Finished Copying "); } } } diff --git 
a/integrations/pytorch_ddp/test/test-mnist.py b/integrations/pytorch_ddp/test/test-mnist.py index ad1c3158..b667a08d 100644 --- a/integrations/pytorch_ddp/test/test-mnist.py +++ b/integrations/pytorch_ddp/test/test-mnist.py @@ -270,3 +270,6 @@ def test(p): p.stop() print(p.key_averages().table(sort_by="self_cpu_time_total", row_limit=100)) + + + dist.destroy_process_group() From 8aaae9856ee4a49ed1575b64b1eb2f1bbfa3e1d2 Mon Sep 17 00:00:00 2001 From: Laurent Wirz Date: Sun, 18 Aug 2024 19:05:27 +0200 Subject: [PATCH 45/64] Implemented constant message sizes for resnet50 --- .../pytorch_ddp/include/ProcessGroupACCL.hpp | 2 +- .../pytorch_ddp/src/ProcessGroupACCL.cpp | 235 +++++++++++++++--- .../pytorch_ddp/test/test-resnet50.py | 116 +++++++-- 3 files changed, 294 insertions(+), 59 deletions(-) diff --git a/integrations/pytorch_ddp/include/ProcessGroupACCL.hpp b/integrations/pytorch_ddp/include/ProcessGroupACCL.hpp index 52c3223e..e72d0db4 100644 --- a/integrations/pytorch_ddp/include/ProcessGroupACCL.hpp +++ b/integrations/pytorch_ddp/include/ProcessGroupACCL.hpp @@ -309,7 +309,7 @@ class TORCH_API ProcessGroupACCL : public ProcessGroup { void copy_back_tensor(at::Tensor tensor_original, std::unique_ptr &data, bool do_on_root, bool do_on_others, int opts_root_rank = 0); - void copy_back_tensorvec(const std::vector &dsttensorvec, std::unique_ptr &data, at::Tensor &dsttensor, int numel, bool do_on_root, bool do_on_others, int opts_root_rank = 0); + void copy_back_tensorvec(const std::vector &dsttensorvec, std::unique_ptr &data, at::Tensor &dsttensor, int numel, int offset, bool do_on_root, bool do_on_others, int opts_root_rank = 0); static std::once_flag onceFlagInitACCL; diff --git a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp index 73bf9ddf..5a6a22de 100644 --- a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp +++ b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp @@ -49,8 +49,8 @@ namespace c10d { // Toggles to run Collectives via OpenMPI instead(To sidestep any issues with them in ACCL) // The sidestep-code is copied from the ProcessGroupMPI -#define SCATTER_SIDESTEP -#define GATHER_SIDESTEP +// #define SCATTER_SIDESTEP +// #define GATHER_SIDESTEP // #define ALLGATHER_SIDESTEP #define BROADCAST_SIDESTEP false @@ -60,13 +60,15 @@ namespace c10d { #define ALLREDUCE_SIDESTEP false // #define ALLREDUCE_SIDESTEP true -// #define SIDESTEP_BCAST_WITH_ALLREDUCE +#define SIDESTEP_BCAST_WITH_ALLREDUCE #define RDVZ_THRESHOLD 64 -#define MICRO_BENCH_FINE 1 +#define MICRO_BENCH_FINE 0 -#define MICRO_BENCH_COARSE 1 +#define MICRO_BENCH_COARSE 0 + +#define ACCL_MSG_SIZE 256 #if MICRO_BENCH_FINE #define START_FINE(name) \ @@ -352,7 +354,22 @@ const char *string_of_accl_datatype(ACCL::dataType accl_type) { } } - +const char *string_of_torch_datatype(c10::ScalarType torch_type) { + switch (torch_type) { + case at::kHalf: + return "torch.float16"; + case at::kFloat: + return "torch.float32"; + case at::kDouble: + return "torch.float64"; + case at::kInt: + return "torch.int32"; + case at::kLong: + return "torch.int64"; + default: + return "unknown"; + } +} std::map convert_compression_from_dict( const std::map &dictionary) { @@ -835,7 +852,7 @@ void ProcessGroupACCL::copy_back_tensor(at::Tensor tensor_original, std::unique_ } } - void ProcessGroupACCL::copy_back_tensorvec(const std::vector &dsttensorvec, std::unique_ptr &data, at::Tensor &dsttensor, int numel, bool do_on_root, bool do_on_others, int opts_root_rank){ +void 
ProcessGroupACCL::copy_back_tensorvec(const std::vector &dsttensorvec, std::unique_ptr &data, at::Tensor &dsttensor, int numel, int offset, bool do_on_root, bool do_on_others, int opts_root_rank){ if DO_COND { if (!coyote_enabled) { data->sync_from_device(); @@ -847,7 +864,8 @@ void ProcessGroupACCL::copy_back_tensor(at::Tensor tensor_original, std::unique_ // data->slice(i * numel, (i + 1) * numel); // copy_back_p2p_buffer(*slice, dsttensorvec[i]); // } else { - dsttensorvec[i].copy_(dsttensor[i]); + std::memcpy(dsttensorvec[i].data_ptr(), data->byte_array() + i * offset * dsttensor.element_size(), numel * dsttensor.element_size()); + // dsttensorvec[i].copy_(dsttensor[i]); // } } } @@ -1078,17 +1096,37 @@ void ProcessGroupACCL::run_broadcast(at::Tensor in_tensor, std::chrono::time_point start_inner = std::chrono::high_resolution_clock::now(); - // This case split is necessary, because otherwise data will be set to a nullptr #ifdef SIDESTEP_BCAST_WITH_ALLREDUCE - START_FINE(init) + + // It seems to have issues with non-even numbers, so we round to ACCL_MSG_SIZE + int rounded_count = ACCL_MSG_SIZE / in_tensor.element_size(); + + // if (in_tensor.scalar_type() == at::kInt || in_tensor.scalar_type() == at::kLong){ + // return; + } + + ACCL::debug("rounded count:" + std::to_string(rounded_count)); + ACCL::debug("rootRank:" + std::to_string(opts.rootRank)); + int imaginary_count = rounded_count; + if (in_tensor.scalar_type() == at::kDouble || in_tensor.scalar_type() == at::kLong){ + imaginary_count = imaginary_count * 2; + + } + + ACCL::debug("imaginary count:" + std::to_string(imaginary_count)); + + + START_FINE(init) + + auto zero_tensor = torch::zeros({imaginary_count}, at::kInt); if (opts.rootRank == rank_){ init_input_tensor(in_tensor, in_buf, true, false, opts.rootRank); } else{ - auto zero_tensor = torch::zeros({in_tensor.numel()}, in_tensor.scalar_type()); init_input_tensor(zero_tensor, in_buf, false, true, opts.rootRank); } + init_input_tensor(zero_tensor, out_buf, true, false, opts.rootRank); STOP_FINE(init) @@ -1098,16 +1136,33 @@ void ProcessGroupACCL::run_broadcast(at::Tensor in_tensor, std::unique_lock globalLock(pgGlobalMutex_); STOP_FINE(lock) - PRE_REQUEST(Broadcast,in_tensor) + in_buf->change_type(convert_datatype_from_torch(at::kInt)); + out_buf->change_type(convert_datatype_from_torch(at::kInt)); + accl->barrier(); + - // It seems to have issues with non-even numbers, so we round to 256 - int rounded_count = (in_tensor.numel() + 1023) & ~1023; + if (in_tensor.scalar_type() == at::kInt || in_tensor.scalar_type() == at::kLong){ + ACCL::debug("input:"); + for(int i = 0; i<(in_tensor.numel() * (in_tensor.element_size() / 4)); i++){ + ACCL::debug(std::to_string(((int *) in_buf->byte_array())[i])); + } + } - accl->allreduce(*in_buf, *out_buf, rounded_count, ACCL::reduceFunction::SUM); + + accl->allreduce(*in_buf, *out_buf, imaginary_count, ACCL::reduceFunction::SUM); - POST_REQUEST("allreduce", in_tensor.nbytes()) + if (in_tensor.scalar_type() == at::kInt || in_tensor.scalar_type() == at::kLong){ + ACCL::debug("result:"); + for(int i = 0; i<(in_tensor.numel() * (in_tensor.element_size() / 4)); i++){ + ACCL::debug(std::to_string(((int *) out_buf->byte_array())[i])); + } + } + + + + POST_REQUEST("broadcast", in_tensor.nbytes()) START_FINE(copy) copy_back_tensor(in_tensor, out_buf, true, true); @@ -1115,14 +1170,27 @@ void ProcessGroupACCL::run_broadcast(at::Tensor in_tensor, #else - START_FINE(init) + // if (opts.rootRank != 0){ + // ACCL::debug("Can't run on non-zero root rank"); 
+ // return; + // } + + int rounded_count = ACCL_MSG_SIZE / in_tensor.element_size(); + auto zero_tensor = torch::zeros({rounded_count}, in_tensor.scalar_type()); + + + ACCL::debug("rounded count:" + std::to_string(rounded_count)); + ACCL::debug("rootRank:" + std::to_string(opts.rootRank)); + + START_FINE(init) + if (opts.rootRank == rank_){ init_input_tensor(in_tensor, in_buf, true, false, opts.rootRank); } - // else{ - // init_output_data(in_tensor, in_buf, in_tensor.numel(), in_tensor.scalar_type(), false, true, opts.rootRank); - // } + else{ + init_input_tensor(zero_tensor, in_buf, false, true, opts.rootRank); + } STOP_FINE(init) @@ -1134,12 +1202,69 @@ void ProcessGroupACCL::run_broadcast(at::Tensor in_tensor, std::unique_lock globalLock(pgGlobalMutex_); STOP_FINE(lock) - PRE_REQUEST(Broadcast,in_tensor) - int rounded_count = (in_tensor.numel() + 1023) & ~1023; + START_FINE(lib) + + + in_buf->change_type(convert_datatype_from_torch(at::kInt)); + out_buf->change_type(convert_datatype_from_torch(at::kInt)); - accl->bcast(*in_buf, rounded_count, opts.rootRank); + int imaginary_count = rounded_count; + ACCL::debug("imaginary count:" + std::to_string(imaginary_count)); + + if (in_tensor.scalar_type() == at::kDouble || in_tensor.scalar_type() == at::kLong){ + imaginary_count = imaginary_count * 2; + + } + + + accl->barrier(); + /* + if(rank_ == opts.rootRank){ + + for(int i = 0; i < size_; i++){ + if(i != opts.rootRank){ + if (in_tensor.scalar_type() == at::kInt || in_tensor.scalar_type() == at::kLong){ + ACCL::debug("sending:") + for(int i = 0; i<(in_tensor.numel() * (in_tensor.element_size() / 4)); i++){ + ACCL::debug(std::to_string(((int *) in_buf->byte_array())[i])); + } + } + ACCL::ACCLRequest* req = accl->send(*in_buf, rounded_count, i, 203); + } + + } + } + else{ + ACCL::ACCLRequest* req = accl->recv(*in_buf, rounded_count, opts.rootRank, 203); + if (in_tensor.scalar_type() == at::kInt || in_tensor.scalar_type() == at::kLong){ + ACCL::debug("received:") + for(int i = 0; i<(in_tensor.numel() * (in_tensor.element_size() / 4)); i++){ + ACCL::debug(std::to_string(((int *) in_buf->byte_array())[i])); + } + } + } + */ + + + if (in_tensor.scalar_type() == at::kInt || in_tensor.scalar_type() == at::kLong){ + ACCL::debug("input:"); + for(int i = 0; i<(in_tensor.numel() * (in_tensor.element_size() / 4)); i++){ + ACCL::debug(std::to_string(((int *) in_buf->byte_array())[i])); + } + } + + + accl->bcast(*in_buf, imaginary_count, opts.rootRank); + + if (in_tensor.scalar_type() == at::kInt || in_tensor.scalar_type() == at::kLong){ + ACCL::debug("result:"); + for(int i = 0; i<(in_tensor.numel() * (in_tensor.element_size() / 4)); i++){ + ACCL::debug(std::to_string(((int *) in_buf->byte_array())[i])); + } + } + POST_REQUEST("bcast", in_tensor.nbytes()) // in_buf->sync_from_device(); @@ -1158,11 +1283,15 @@ ProcessGroupACCL::broadcast(std::vector &tensors, checkSingleTensor(tensors); std::function &)> runFunc = [opts, this](std::unique_ptr &entry) { - ACCL::debug("Starting Broadcast"); + std::cerr << "Starting Broadcast" << std::endl; // if (((entry->src)[0]).numel() <= RDVZ_THRESHOLD || BROADCAST_SIDESTEP){ if (BROADCAST_SIDESTEP){ auto data = (entry->src)[0]; + std::cerr << "before" << std::endl; + if(data.scalar_type() == at::kInt || data.scalar_type() == at::kLong){ + std::cerr << data << std::endl; + } ACCL::debug("[Broadcast] -- Sidestepped using OpenMPI -- size " + std::to_string(data.numel())); c10::DeviceGuard guard(data.device()); std::unique_lock globalLock(pgGlobalMutex_); @@ -1172,13 
+1301,28 @@ ProcessGroupACCL::broadcast(std::vector &tensors, mpiDatatype.at(data.scalar_type()), opts.rootRank, MPI_COMM_WORLD)); + std::cerr << "after" << std::endl; + if(data.scalar_type() == at::kInt || data.scalar_type() == at::kLong){ + std::cerr << data << std::endl; + } } else { START_COARSE(total) at::Tensor &tensor = (entry->src)[0]; + if(tensor.scalar_type() == at::kInt || tensor.scalar_type() == at::kLong){ + std::cerr << tensor << std::endl; + } + ACCL::debug(string_of_torch_datatype(tensor.scalar_type())); // Segment data if necessary - if (tensor.nbytes() > bufsize / 2) { + if (tensor.nbytes() > ACCL_MSG_SIZE) { + ACCL::debug("nbytes: " + std::to_string(tensor.nbytes())); + ACCL::debug("bufsize: " + std::to_string(bufsize)); + ACCL::debug("numel: " + std::to_string(tensor.numel())); + ACCL::debug("tensor.size(0): " + std::to_string(tensor.size(0))); size_t non_zero_dim_count = tensor.numel() / tensor.size(0); - size_t n = bufsize / 2 / tensor.itemsize() / non_zero_dim_count; + ACCL::debug("non_zero_dim_count: " + std::to_string(non_zero_dim_count)); + ACCL::debug("tensor.itemsize(): " + std::to_string(tensor.itemsize())); + size_t n = ACCL_MSG_SIZE / tensor.itemsize() / non_zero_dim_count; + ACCL::debug("n: " + std::to_string(n)); ACCL::debug("[Broadcast] Segmenting tensor of size " + std::to_string(tensor.nbytes()) + " into " + std::to_string(n * non_zero_dim_count) + "-sized elements "); for (size_t i = 0; i < tensor.size(0); i += n) { ACCL::debug("part " + std::to_string(i) + "!"); @@ -1189,6 +1333,10 @@ ProcessGroupACCL::broadcast(std::vector &tensors, ACCL::debug("[Broadcast] Broadcasting entire tensor of size " + std::to_string(tensor.nbytes()) + " without segmentation."); run_broadcast(tensor, opts); } + if(tensor.scalar_type() == at::kInt || tensor.scalar_type() == at::kLong){ + std::cerr << tensor << std::endl; + } + STOP_COARSE(total) } }; @@ -1217,9 +1365,8 @@ void ProcessGroupACCL::run_allreduce(at::Tensor in_tensor, PRE_REQUEST(Allreduce,in_tensor) - // It seems to have issues with non-even numbers, so we round to 256 - int rounded_count = (in_tensor.numel() + 1023) & ~1023; - + int rounded_count = ACCL_MSG_SIZE / in_tensor.element_size(); + ACCL::debug("rounded count:" + std::to_string(rounded_count)); accl->allreduce(*in_buf, *out_buf, rounded_count, acclOp.at(opts.reduceOp)); @@ -1237,7 +1384,7 @@ ProcessGroupACCL::allreduce(std::vector &tensors, std::function &)> runFunc = [opts, this](std::unique_ptr &entry) { - ACCL::debug("Starting Allreduce"); + std::cerr << "Starting AllReduce" << std::endl; // sidestep eager allreduce // if (((entry->src)[0]).numel() <= RDVZ_THRESHOLD || ALLREDUCE_SIDESTEP){ if (ALLREDUCE_SIDESTEP){ @@ -1256,9 +1403,17 @@ ProcessGroupACCL::allreduce(std::vector &tensors, START_COARSE(total) auto tensor = (entry->src)[0]; // Segment data if necessary - if (tensor.nbytes() > bufsize/2) { + ACCL::debug(string_of_torch_datatype(tensor.scalar_type())); + if (tensor.nbytes() > (ACCL_MSG_SIZE)) { + ACCL::debug("nbytes: " + std::to_string(tensor.nbytes())); + ACCL::debug("bufsize: " + std::to_string(bufsize)); + ACCL::debug("numel: " + std::to_string(tensor.numel())); + ACCL::debug("tensor.size(0): " + std::to_string(tensor.size(0))); size_t non_zero_dim_count = tensor.numel() / tensor.size(0); - size_t n = bufsize / 2 / tensor.itemsize() / non_zero_dim_count; + ACCL::debug("non_zero_dim_count: " + std::to_string(non_zero_dim_count)); + ACCL::debug("tensor.itemsize(): " + std::to_string(tensor.itemsize())); + size_t n = ACCL_MSG_SIZE / 
(tensor.itemsize() * non_zero_dim_count); + ACCL::debug("n: " + std::to_string(n)); ACCL::debug("[Allreduce] Segmenting tensor of size " + std::to_string(tensor.nbytes()) + " into " + std::to_string(n * non_zero_dim_count) + "-sized elements "); for (size_t i = 0; i < tensor.size(0); i += n) { ACCL::debug("part " + std::to_string(i) + "!"); @@ -1343,11 +1498,13 @@ void ProcessGroupACCL::run_allgather( PRE_REQUEST(Allgather,in_tensor) - accl->allgather(*in_buf, *out_buf, in_tensor.numel()); + int rounded_count = (in_tensor.numel() + 1023) & ~1023; + + accl->allgather(*in_buf, *out_buf, rounded_count); POST_REQUEST("allgather", in_tensor.nbytes()) - copy_back_tensorvec(dsttensorvec, out_buf, dsttensor, in_tensor.numel(), true, true); + copy_back_tensorvec(dsttensorvec, out_buf, dsttensor, in_tensor.numel(), rounded_count, true, true); } @@ -1369,6 +1526,7 @@ ProcessGroupACCL::allgather(std::vector> &outputTensors, std::function &)> runFunc = [this](std::unique_ptr &entry) { + ACCL::debug("Starting AllGather"); #ifdef ALLGATHER_SIDESTEP ACCL::debug("[AllGather] -- Sidestepped using OpenMPI --"); auto data = (entry->src)[0]; @@ -1393,6 +1551,7 @@ ProcessGroupACCL::allgather(std::vector> &outputTensors, auto srctensor = (entry->src)[0]; auto &dsttensors = entry->dst; // Segment data if necessary + ACCL::debug(string_of_torch_datatype(dsttensors[0].scalar_type())); if (srctensor.nbytes() > bufsize) { size_t non_zero_dim_count = srctensor.numel() / srctensor.size(0); size_t n = bufsize / srctensor.itemsize() / non_zero_dim_count; @@ -1443,7 +1602,7 @@ void ProcessGroupACCL::run_gather(at::Tensor in_tensor, POST_REQUEST("gather", in_tensor.nbytes()) - copy_back_tensorvec(dsttensorvec, out_buf, dsttensor, in_tensor.numel(), true, false, opts.rootRank); + copy_back_tensorvec(dsttensorvec, out_buf, dsttensor, in_tensor.numel(), in_tensor.numel(), true, false, opts.rootRank); } @@ -1618,10 +1777,10 @@ ProcessGroupACCL::scatter(std::vector &outputTensors, auto &srctensors = entry->src; auto dsttensor = (entry->dst)[0]; // Segment data if necessary - if (dsttensor.nbytes() > bufsize) { + if (dsttensor.nbytes() > bufsize / 4) { ACCL::debug("dsttensor to large!"); size_t non_zero_dim_count = dsttensor.numel() / dsttensor.size(0); - size_t n = bufsize / dsttensor.itemsize() / non_zero_dim_count; + size_t n = bufsize / 4 / dsttensor.itemsize() / non_zero_dim_count; for (size_t i = 0; i < dsttensor.size(0); i += n) { ACCL::debug("part " + std::to_string(i) + "!"); size_t end = @@ -1710,7 +1869,7 @@ void ProcessGroupACCL::run_alltoall_vec(std::vector &in_tensor_vec, POST_REQUEST("alltoall", in_tensor_vec[0].nbytes()) - copy_back_tensorvec(out_tensor_vec, out_buf, dsttensor, in_tensor_vec[0].numel(), true, true); + copy_back_tensorvec(out_tensor_vec, out_buf, dsttensor, in_tensor_vec[0].numel(), in_tensor_vec[0].numel(), true, true); } diff --git a/integrations/pytorch_ddp/test/test-resnet50.py b/integrations/pytorch_ddp/test/test-resnet50.py index deb97b57..4f8b101d 100644 --- a/integrations/pytorch_ddp/test/test-resnet50.py +++ b/integrations/pytorch_ddp/test/test-resnet50.py @@ -17,11 +17,15 @@ from torch.utils.data.distributed import DistributedSampler import argparse +import numpy as np import os import sys import logging import time +seed = 42 +torch.manual_seed(seed) + logging.basicConfig(stream=sys.stderr, level=logging.DEBUG) logger = logging.getLogger(__name__) @@ -33,7 +37,7 @@ # Run via ACCL -def train(num_epochs, model, loaders, criterion, p): +def train(num_epochs, model, loaders, criterion): 
start_time_train = time.perf_counter() @@ -47,7 +51,6 @@ def train(num_epochs, model, loaders, criterion, p): model.train() running_loss = 0.0 for i, (inputs, labels) in enumerate(loaders['train']): - p.step() start_time = time.perf_counter() optimizer.zero_grad() @@ -57,8 +60,8 @@ def train(num_epochs, model, loaders, criterion, p): optimizer.step() running_loss += loss.item() - # if (i+1) % 10 == 0: - # break + if (i+1) % 10 == 0: + break if True: end_time = time.perf_counter() measured_time = (end_time - start_time) * 1000000 @@ -71,7 +74,7 @@ def train(num_epochs, model, loaders, criterion, p): print('Total train time: ' + str(measured_time_train)) -def test(num_epochs, model, loaders, criterion, p): +def test(num_epochs, model, loaders, criterion): # Test the model start_time_test = time.perf_counter() model.eval() @@ -80,7 +83,6 @@ def test(num_epochs, model, loaders, criterion, p): total = 0 val_loss = 0 for i, (inputs, labels) in enumerate(loaders['test']): - p.step() test_output = model(inputs) loss = criterion(test_output, labels) val_loss += loss.item() @@ -98,6 +100,67 @@ def test(num_epochs, model, loaders, criterion, p): print('Total test time: ' + str(measured_time_test)) print(f'Total accuracy: {correct}/{total} {correct/float(total)}') + + +def test_allreduce(numel, testtype): + + shape = (numel,) + + + if testtype == torch.int64 or testtype == torch.int32: + rand_torch = torch.randint(torch.iinfo(testtype).min/size, torch.iinfo(testtype).max/size,shape, dtype=testtype) + else: + rand_torch = torch.rand(shape, dtype=testtype) + + # for i in range(10): + if True: + + # shape = (320001,) + x = rand_torch.clone() + + dist.all_reduce(x, dist.ReduceOp.SUM) + mpi.Barrier() + + try: + np.testing.assert_allclose(x, rand_torch * size) + except AssertionError as e: + logger.debug("Test AllReduce failed") + logger.debug(str(e)) + else: + logger.debug("Test AllReduce finished!") + +def test_broadcast(numel, testtype): + shape = (numel,) + + # testtype = torch.float32 + if testtype == torch.int64 or testtype == torch.int32: + rand_torch = torch.randint(torch.iinfo(testtype).min, torch.iinfo(testtype).max,shape, dtype=testtype) + # rand_torch = torch.ones(shape, dtype=testtype) + else: + rand_torch = torch.rand(shape, dtype=testtype) + + # for i in range(10): + if True: + + if rank == 1: + x = rand_torch.clone() + else: + x = torch.ones(shape, dtype=testtype) + + dist.broadcast(x, 1) + + mpi.Barrier() + + # logger.debug('Tensor after broadcast: ' + str(x)) + # print('Tensor after broadcast: ' + str(x)) + try: + np.testing.assert_allclose(x, rand_torch) + except AssertionError as e: + logger.debug("Test Broadcast failed") + logger.debug(str(e)) + else: + logger.debug("Test broadcast finished!") + if __name__ == "__main__": @@ -124,7 +187,7 @@ def test(num_epochs, model, loaders, criterion, p): print("only one machine specified. 
Assuming Non distributed setup") args.d = False elif args.n > 1 and args.d == None: - print("Assung DDP setup") + print("Assuming DDP setup") args.d = True @@ -180,7 +243,16 @@ def test(num_epochs, model, loaders, criterion, p): accl.create_process_group(ranks, design, bufsize=rxbufsize, initialize=True, simulation=args.simulator) dist.init_process_group("ACCL", rank=rank, world_size=size) - + + + test_allreduce(256, torch.float32) + test_broadcast(256, torch.float32) + + test_broadcast(162, torch.int32) + # if args.d : dist.destroy_process_group() + + # sys.exit(0) + device = 'cpu' transform = transforms.Compose([ @@ -214,7 +286,7 @@ def test(num_epochs, model, loaders, criterion, p): model = models.resnet50(pretrained=True) - if args.d : model = DDP(model, bucket_cap_mb=2, broadcast_buffers=False) + if args.d : model = DDP(model, bucket_cap_mb=2, broadcast_buffers=False, find_unused_parameters=True) loss_func = nn.CrossEntropyLoss() @@ -234,18 +306,22 @@ def test(num_epochs, model, loaders, criterion, p): ) - with torch.profiler.profile( - activities=[torch.profiler.ProfilerActivity.CPU], - schedule=schedule, - on_trace_ready=torch.profiler.tensorboard_trace_handler('./accl_log/profiler_log'), - record_shapes=True, - ) as p: - + # with torch.profiler.profile( + # activities=[torch.profiler.ProfilerActivity.CPU], + # schedule=schedule, + # on_trace_ready=torch.profiler.tensorboard_trace_handler('./accl_log/profiler_log'), + # record_shapes=True, + # ) as p: + + if True: + - train(num_epochs, model, loaders, criterion, p) + train(num_epochs, model, loaders, criterion) + + # test(num_epochs, model, loaders, criterion) - test(num_epochs, model, loaders, criterion, p) + # p.stop() - p.stop() + # print(p.key_averages().table(sort_by="self_cpu_time_total", row_limit=100)) - print(p.key_averages().table(sort_by="self_cpu_time_total", row_limit=100)) + if args.d : dist.destroy_process_group() From 3f00f33b136e7eba500fcf897ea7082baa2ddf36 Mon Sep 17 00:00:00 2001 From: Laurent Wirz Date: Tue, 20 Aug 2024 18:26:26 +0200 Subject: [PATCH 46/64] Parametrized splitting --- .../pytorch_ddp/src/ProcessGroupACCL.cpp | 86 +++++----- integrations/pytorch_ddp/test/test-generic.py | 153 +++++++++++++----- .../pytorch_ddp/test/test-resnet50.py | 6 +- 3 files changed, 156 insertions(+), 89 deletions(-) diff --git a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp index 5a6a22de..12c3cac3 100644 --- a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp +++ b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp @@ -60,15 +60,17 @@ namespace c10d { #define ALLREDUCE_SIDESTEP false // #define ALLREDUCE_SIDESTEP true -#define SIDESTEP_BCAST_WITH_ALLREDUCE +// #define SIDESTEP_BCAST_WITH_ALLREDUCE #define RDVZ_THRESHOLD 64 -#define MICRO_BENCH_FINE 0 +#define MICRO_BENCH_FINE 1 -#define MICRO_BENCH_COARSE 0 +#define MICRO_BENCH_COARSE 1 -#define ACCL_MSG_SIZE 256 +#define ACCL_MSG_SIZE 1048576 + +#define ROUND_NR 256 #if MICRO_BENCH_FINE #define START_FINE(name) \ @@ -1099,19 +1101,17 @@ void ProcessGroupACCL::run_broadcast(at::Tensor in_tensor, #ifdef SIDESTEP_BCAST_WITH_ALLREDUCE // It seems to have issues with non-even numbers, so we round to ACCL_MSG_SIZE - int rounded_count = ACCL_MSG_SIZE / in_tensor.element_size(); - + int rounded_count = (in_tensor.numel() + ROUND_NR) & ~ROUND_NR; // if (in_tensor.scalar_type() == at::kInt || in_tensor.scalar_type() == at::kLong){ // return; - } + // } ACCL::debug("rounded count:" + std::to_string(rounded_count)); 
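An aside on the `(in_tensor.numel() + ROUND_NR) & ~ROUND_NR` padding above: the usual power-of-two round-up idiom adds `k - 1` and masks with `~(k - 1)`. A self-contained comparison of the two forms (plain Python; `K` stands in for `ROUND_NR`):

```python
def round_up(n: int, k: int) -> int:
    """Round n up to the next multiple of k (k a power of two)."""
    return (n + k - 1) & ~(k - 1)

K = 256
for n in (1, 255, 256, 257, 5000):
    # (n + K) & ~K clears only bit 8, so it need not yield a multiple of K:
    print(n, round_up(n, K), (n + K) & ~K)
# round_up column: 256, 256, 256, 512, 5120 -- always a multiple of 256.
```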
ACCL::debug("rootRank:" + std::to_string(opts.rootRank)); int imaginary_count = rounded_count; if (in_tensor.scalar_type() == at::kDouble || in_tensor.scalar_type() == at::kLong){ - imaginary_count = imaginary_count * 2; - + imaginary_count = (in_tensor.numel()*2 + ROUND_NR) & ~ROUND_NR; } ACCL::debug("imaginary count:" + std::to_string(imaginary_count)); @@ -1175,8 +1175,8 @@ void ProcessGroupACCL::run_broadcast(at::Tensor in_tensor, // return; // } - int rounded_count = ACCL_MSG_SIZE / in_tensor.element_size(); - + int rounded_count = (in_tensor.numel() + ROUND_NR) & ~ROUND_NR; + auto zero_tensor = torch::zeros({rounded_count}, in_tensor.scalar_type()); @@ -1188,9 +1188,9 @@ void ProcessGroupACCL::run_broadcast(at::Tensor in_tensor, if (opts.rootRank == rank_){ init_input_tensor(in_tensor, in_buf, true, false, opts.rootRank); } - else{ - init_input_tensor(zero_tensor, in_buf, false, true, opts.rootRank); - } + // else{ + // init_input_tensor(zero_tensor, in_buf, false, true, opts.rootRank); + // } STOP_FINE(init) @@ -1203,23 +1203,14 @@ void ProcessGroupACCL::run_broadcast(at::Tensor in_tensor, STOP_FINE(lock) - START_FINE(lib) - + // in_buf->change_type(convert_datatype_from_torch(at::kInt)); + // out_buf->change_type(convert_datatype_from_torch(at::kInt)); - in_buf->change_type(convert_datatype_from_torch(at::kInt)); - out_buf->change_type(convert_datatype_from_torch(at::kInt)); - - int imaginary_count = rounded_count; - - ACCL::debug("imaginary count:" + std::to_string(imaginary_count)); - - if (in_tensor.scalar_type() == at::kDouble || in_tensor.scalar_type() == at::kLong){ - imaginary_count = imaginary_count * 2; - - } - + // int imaginary_count = rounded_count; + // if (in_tensor.scalar_type() == at::kDouble || in_tensor.scalar_type() == at::kLong){ + // imaginary_count = (in_tensor.numel()*2 + ROUND_NR) & ~ROUND_NR; + // } - accl->barrier(); /* if(rank_ == opts.rootRank){ @@ -1248,22 +1239,24 @@ void ProcessGroupACCL::run_broadcast(at::Tensor in_tensor, */ - if (in_tensor.scalar_type() == at::kInt || in_tensor.scalar_type() == at::kLong){ - ACCL::debug("input:"); - for(int i = 0; i<(in_tensor.numel() * (in_tensor.element_size() / 4)); i++){ - ACCL::debug(std::to_string(((int *) in_buf->byte_array())[i])); - } - } - + // if (in_tensor.scalar_type() == at::kInt || in_tensor.scalar_type() == at::kLong){ + // ACCL::debug("input:"); + // for(int i = 0; i<(in_tensor.numel() * (in_tensor.element_size() / 4)); i++){ + // ACCL::debug(std::to_string(((int *) in_buf->byte_array())[i])); + // } + // } - accl->bcast(*in_buf, imaginary_count, opts.rootRank); - if (in_tensor.scalar_type() == at::kInt || in_tensor.scalar_type() == at::kLong){ - ACCL::debug("result:"); - for(int i = 0; i<(in_tensor.numel() * (in_tensor.element_size() / 4)); i++){ - ACCL::debug(std::to_string(((int *) in_buf->byte_array())[i])); - } - } + PRE_REQUEST(Broadcast, in_tensor) + + accl->bcast(*in_buf, rounded_count, opts.rootRank); + + // if (in_tensor.scalar_type() == at::kInt || in_tensor.scalar_type() == at::kLong){ + // ACCL::debug("result:"); + // for(int i = 0; i<(in_tensor.numel() * (in_tensor.element_size() / 4)); i++){ + // ACCL::debug(std::to_string(((int *) in_buf->byte_array())[i])); + // } + // } POST_REQUEST("bcast", in_tensor.nbytes()) @@ -1288,8 +1281,8 @@ ProcessGroupACCL::broadcast(std::vector &tensors, if (BROADCAST_SIDESTEP){ auto data = (entry->src)[0]; - std::cerr << "before" << std::endl; if(data.scalar_type() == at::kInt || data.scalar_type() == at::kLong){ + std::cerr << "before" << std::endl; 
std::cerr << data << std::endl; } ACCL::debug("[Broadcast] -- Sidestepped using OpenMPI -- size " + std::to_string(data.numel())); @@ -1301,8 +1294,8 @@ ProcessGroupACCL::broadcast(std::vector &tensors, mpiDatatype.at(data.scalar_type()), opts.rootRank, MPI_COMM_WORLD)); - std::cerr << "after" << std::endl; if(data.scalar_type() == at::kInt || data.scalar_type() == at::kLong){ + std::cerr << "after" << std::endl; std::cerr << data << std::endl; } } else { @@ -1365,7 +1358,8 @@ void ProcessGroupACCL::run_allreduce(at::Tensor in_tensor, PRE_REQUEST(Allreduce,in_tensor) - int rounded_count = ACCL_MSG_SIZE / in_tensor.element_size(); + int rounded_count = (in_tensor.numel() + ROUND_NR) & ~ROUND_NR; + ACCL::debug("rounded count:" + std::to_string(rounded_count)); accl->allreduce(*in_buf, *out_buf, rounded_count, acclOp.at(opts.reduceOp)); diff --git a/integrations/pytorch_ddp/test/test-generic.py b/integrations/pytorch_ddp/test/test-generic.py index f514b69a..929a0b84 100644 --- a/integrations/pytorch_ddp/test/test-generic.py +++ b/integrations/pytorch_ddp/test/test-generic.py @@ -49,9 +49,12 @@ rank = 0 size = 0 -x = 28939 +x = 5000 y = 1 +seed = 48 +torch.manual_seed(seed) + count = x * y num_el = x * y shape = (x , y) @@ -83,20 +86,27 @@ def test_broadcast_segment(): else: logger.debug("Test broadcast finished!") -def test_broadcast(): - shape = (256,) - +def test_broadcast(numel, testtype): + shape = (numel,) + + # testtype = torch.float32 global num_errors - + if testtype == torch.int64 or testtype == torch.int32: + rand_torch = torch.randint(torch.iinfo(testtype).min, torch.iinfo(testtype).max,shape, dtype=testtype) + # rand_torch = torch.ones(shape, dtype=testtype) + else: + rand_torch = torch.rand(shape, dtype=testtype) + # for i in range(10): if True: if rank == 0: - x = torch.ones(shape) + x = rand_torch.clone() else: - x = torch.zeros(shape) + x = torch.ones(shape, dtype=testtype) + mpi.Barrier() with torch.profiler.record_function("test bcast "): @@ -106,22 +116,20 @@ def test_broadcast(): end_time = time.perf_counter() - measured_time = (end_time - start_time) * 1000000 + measured_time = (end_time - start_time) * 1000000 - logger.debug("Directly measured time us 1:" + str(measured_time)) + logger.debug("Directly measured time us 1:" + str(measured_time)) - mpi.Barrier() + mpi.Barrier() - end_time = time.perf_counter() + end_time = time.perf_counter() - measured_time = (end_time - start_time) * 1000000 + measured_time = (end_time - start_time) * 1000000 - logger.debug("Directly measured time us 2:" + str(measured_time)) + logger.debug("Directly measured time us 2:" + str(measured_time)) - # logger.debug('Tensor after broadcast: ' + str(x)) - # print('Tensor after broadcast: ' + str(x)) try: - np.testing.assert_allclose(x, torch.ones(shape)) + np.testing.assert_allclose(x, rand_torch) except AssertionError as e: num_errors = num_errors + 1 logger.debug("Test Broadcast failed") @@ -233,9 +241,14 @@ def test_gather(): def test_allgather(): global num_errors - shape_gather = (2,) - x = torch.full(shape_gather, float(rank), dtype=torch.float) - y = [torch.empty(shape_gather, dtype=torch.float) for _ in range(size)] + testtype = torch.int64 + shape_gather = (1,) + if testtype == torch.int64 or testtype == torch.int32: + rand_torch = torch.randint(torch.iinfo(testtype).min, torch.iinfo(testtype).max,shape_gather, dtype=testtype) + else: + rand_torch = torch.rand(shape_gather, dtype=testtype) + x = rand_torch.clone() + y = [torch.full(shape_gather, 0, dtype=testtype) for _ in range(size)] 
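For reference, the contract `dist.all_gather` enforces in this test: the output is a pre-allocated list with one tensor slot per rank, filled in rank order. A minimal single-process sketch (the `gloo` backend and the address/port are illustrative stand-ins for the ACCL setup used here):

```python
import os
import torch
import torch.distributed as dist

os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29501")
dist.init_process_group("gloo", rank=0, world_size=1)

x = torch.full((2,), float(dist.get_rank()))
y = [torch.zeros(2) for _ in range(dist.get_world_size())]
dist.all_gather(y, x)   # afterwards y[r] holds rank r's contribution
assert torch.equal(y[0], x)
dist.destroy_process_group()
```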
with torch.profiler.record_function("test_allgather"): @@ -244,7 +257,7 @@ def test_allgather(): mpi.Barrier() for i, c in enumerate(y): try: - np.testing.assert_allclose(c, torch.full(shape_gather, float(i), dtype=torch.float)) + np.testing.assert_allclose(c, rand_torch) except AssertionError as e: num_errors = num_errors + 1 logger.debug("Test AllGather failed") @@ -273,23 +286,43 @@ def test_reduce(): logger.debug("Test Reduce finished!") -def test_allreduce(): +def test_allreduce(numel, testtype): + + global num_errors + shape = (numel,) + + + if testtype == torch.int64 or testtype == torch.int32: + rand_torch = torch.randint(torch.iinfo(testtype).min//size, torch.iinfo(testtype).max//size,shape, dtype=testtype) + else: + rand_torch = torch.rand(shape, dtype=testtype) + # for i in range(10): if True: - shape = (256,) # shape = (320001,) - global num_errors - x = torch.ones(shape) + x = rand_torch.clone() + + mpi.Barrier() + + start_time = time.perf_counter() + with torch.profiler.record_function("test_allreduce"): dist.all_reduce(x, dist.ReduceOp.SUM) - mpi.Barrier() + + end_time = time.perf_counter() + + measured_time = (end_time - start_time) * 1000000 + + logger.debug("Directly measured time us 1:" + str(measured_time)) + + mpi.Barrier() try: - np.testing.assert_allclose(x, torch.full(shape, float(size))) + np.testing.assert_allclose(x, rand_torch * size) except AssertionError as e: num_errors = num_errors + 1 logger.debug("Test AllReduce failed") @@ -301,9 +334,9 @@ def test_allreduce(): def test_alltoall(): global num_errors - num_el = 26624 + # num_el = 26624 - shape = (num_el,) + # shape = (num_el,) input = torch.arange(num_el, dtype=torch.float) + float(rank) * num_el @@ -471,22 +504,51 @@ def start_test(comms: str, simulator: bool, host_file: str=None, fpga_file: str= mpi.Barrier() - # dist.init_process_group("mpi", rank=rank, world_size=size) + dist.init_process_group("mpi", rank=rank, world_size=size) - accl.create_process_group(ranks, design, bufsize=rxbufsize, initialize=True, simulation=simulator) - dist.init_process_group("ACCL", rank=rank, world_size=size) + # accl.create_process_group(ranks, design, bufsize=rxbufsize, initialize=True, simulation=simulator) + # dist.init_process_group("ACCL", rank=rank, world_size=size) global num_errors num_errors = 0 - # with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], profile_memory=True, record_shapes=True) as prof: + test_allreduce(256, torch.float32) + test_broadcast(256, torch.float32) + + schedule = torch.profiler.schedule( + wait=1, + warmup=2, + active=5, + ) + + with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], profile_memory=True, schedule=schedule, record_shapes=True) as prof: + for i in range(30): + test_broadcast(128 * 1024, torch.float32) + test_allreduce(128 * 1024, torch.float32) + prof.step() + # for i in range(10): - if True: + # if True: + # test_allreduce(256, torch.int32) + # test_allreduce(256, torch.int64) + # test_broadcast(256, torch.float32) + + # test_allgather() # test_broadcast_2() - test_broadcast() - test_allreduce() + # test_broadcast(642, torch.int64) + # test_broadcast(25610152, torch.float32) + # test_broadcast(53, torch.int64) + # test_broadcast(53120, torch.float32) + # test_broadcast(53, torch.int64) + # test_allreduce(25557032, torch.float32) + # test_broadcast(162, torch.int32) + # test_broadcast(25, torch.int32) + # test_broadcast(53120, torch.float32) + # test_broadcast(53, torch.int64) + # test_allreduce(2049000, torch.float32) + # test_allreduce() # 
test_allgather() # test_broadcast_segment() # test_broadcast() @@ -496,11 +558,20 @@ def start_test(comms: str, simulator: bool, host_file: str=None, fpga_file: str= # test_broadcast() # test_sendrcv() # test_scatter() + # for i in range(10): # test_gather() - # test_allgather() # test_alltoall() - # test_allreduce() - # test_allgather() + # test_allreduce(1000, torch.float32) + # test_allreduce(2052096, torch.float32) + # test_allreduce(1049600, torch.float32) + # test_broadcast(256 * 1024, torch.float32) + # test_allreduce(256 * 1024, torch.float32) + # test_broadcast(53, torch.int64) + # test_broadcast(53120, torch.float32) + # test_broadcast(53, torch.int64) + # test_broadcast(162, torch.int32) + # test_broadcast(25, torch.int32) + # test_allreduce(8196000, torch.float32) # test_allreduce() # test_allreduce() @@ -518,9 +589,11 @@ def start_test(comms: str, simulator: bool, host_file: str=None, fpga_file: str= else: print(f"!!!!!!!! - {num_errors} Errors found - !!!!!!!!!") logger.debug(f"!!!!!!!! - {num_errors} Errors found - !!!!!!!!!") - # print(prof.key_averages(group_by_input_shape=True) - # .table(sort_by="cpu_time_total", row_limit=15)) + print(prof.key_averages(group_by_input_shape=True) + .table(sort_by="cpu_time_total", row_limit=15)) + + logger.debug('Destroying ACCL Process Group') dist.destroy_process_group() diff --git a/integrations/pytorch_ddp/test/test-resnet50.py b/integrations/pytorch_ddp/test/test-resnet50.py index 4f8b101d..ec74d469 100644 --- a/integrations/pytorch_ddp/test/test-resnet50.py +++ b/integrations/pytorch_ddp/test/test-resnet50.py @@ -23,7 +23,7 @@ import logging import time -seed = 42 +seed = 43 torch.manual_seed(seed) logging.basicConfig(stream=sys.stderr, level=logging.DEBUG) @@ -60,7 +60,7 @@ def train(num_epochs, model, loaders, criterion): optimizer.step() running_loss += loss.item() - if (i+1) % 10 == 0: + if (i+1) % 100 == 0: break if True: end_time = time.perf_counter() @@ -286,7 +286,7 @@ def test_broadcast(numel, testtype): model = models.resnet50(pretrained=True) - if args.d : model = DDP(model, bucket_cap_mb=2, broadcast_buffers=False, find_unused_parameters=True) + if args.d : model = DDP(model, bucket_cap_mb=2, broadcast_buffers=True, find_unused_parameters=True) loss_func = nn.CrossEntropyLoss() From 2533a4ed526ef674fe375b6c3459ac93cab1165f Mon Sep 17 00:00:00 2001 From: Laurent Wirz Date: Sun, 25 Aug 2024 13:06:35 +0200 Subject: [PATCH 47/64] Improved Microbenchmarking --- .../pytorch_ddp/src/ProcessGroupACCL.cpp | 119 +++++++++++------- integrations/pytorch_ddp/test/test-generic.py | 27 ++-- 2 files changed, 87 insertions(+), 59 deletions(-) diff --git a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp index 12c3cac3..6c7da5ba 100644 --- a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp +++ b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp @@ -68,17 +68,24 @@ namespace c10d { #define MICRO_BENCH_COARSE 1 -#define ACCL_MSG_SIZE 1048576 +#define ACCL_MSG_SIZE 2097152 -#define ROUND_NR 256 +#define ROUND_NR 256 + +#define COLL_NAME UNNAMED + +#define x_MAKE_STRING(s) MAKE_STRING(s) +#define MAKE_STRING(s) #s + + #if MICRO_BENCH_FINE #define START_FINE(name) \ std::chrono::time_point start_##name = std::chrono::high_resolution_clock::now(); -#define STOP_FINE(name) \ +#define STOP_FINE(name, accl_nbytes) \ auto end_##name = std::chrono::high_resolution_clock::now(); \ double durationUs_##name = (std::chrono::duration_cast(end_##name-start_##name).count() / 1000.0); \ - 
ACCL::debug(#name "_tensor durationUs:" + std::to_string(durationUs_##name)); + ACCL::debug(#name "_" + std::string(x_MAKE_STRING(COLL_NAME)) + "_" + std::to_string(accl_nbytes) + " durationUs: " + std::to_string(durationUs_##name)); #else #define START_FINE(name) #define STOP_FINE(name) @@ -87,10 +94,10 @@ namespace c10d { #if MICRO_BENCH_COARSE #define START_COARSE(name) \ std::chrono::time_point start_##name = std::chrono::high_resolution_clock::now(); -#define STOP_COARSE(name) \ +#define STOP_COARSE(name, accl_nbytes) \ auto end_##name = std::chrono::high_resolution_clock::now(); \ double durationUs_##name = (std::chrono::duration_cast(end_##name-start_##name).count() / 1000.0); \ - ACCL::debug(#name "_tensor durationUs:" + std::to_string(durationUs_##name)); + ACCL::debug(#name "_" + std::string(x_MAKE_STRING(COLL_NAME)) + "_" + std::to_string(accl_nbytes) + " durationUs: " + std::to_string(durationUs_##name)); #else #define START_COARSE(name) #define STOP_COARSE(name) @@ -160,16 +167,12 @@ std::map mpiDatatype = { START_FINE(type) \ in_buf->change_type(convert_datatype_from_torch(tensor.scalar_type())); \ out_buf->change_type(convert_datatype_from_torch(tensor.scalar_type())); \ - STOP_FINE(type) \ - ACCL::debug("[" #opname "] Entering barrier"); \ - START_FINE(barrier) \ - accl->barrier(); \ - STOP_FINE(barrier) \ + STOP_FINE(type, tensor.nbytes()) \ ACCL::debug("Performing " #opname " of " + std::to_string(tensor.numel()) + " items"); \ START_FINE(lib) -#define POST_REQUEST(opname, n_bytes) \ -STOP_FINE(lib) +#define POST_REQUEST(opname, nbytes) \ + STOP_FINE(lib, nbytes) #define TIMER_WRAP() @@ -1092,7 +1095,8 @@ c10::intrusive_ptr ProcessGroupACCL::enqueue( queueProduceCV_.notify_one(); return work; } - +#undef COLL_NAME +#define COLL_NAME Broadcast void ProcessGroupACCL::run_broadcast(at::Tensor in_tensor, const BroadcastOptions &opts) { @@ -1128,19 +1132,17 @@ void ProcessGroupACCL::run_broadcast(at::Tensor in_tensor, } init_input_tensor(zero_tensor, out_buf, true, false, opts.rootRank); - STOP_FINE(init) + STOP_FINE(init, in_tensor.nbytes()) START_FINE(lock) // Reserve device c10::DeviceGuard guard(in_tensor.device()); std::unique_lock globalLock(pgGlobalMutex_); - STOP_FINE(lock) + STOP_FINE(lock, in_tensor.nbytes()) in_buf->change_type(convert_datatype_from_torch(at::kInt)); out_buf->change_type(convert_datatype_from_torch(at::kInt)); - accl->barrier(); - if (in_tensor.scalar_type() == at::kInt || in_tensor.scalar_type() == at::kLong){ ACCL::debug("input:"); @@ -1166,7 +1168,7 @@ void ProcessGroupACCL::run_broadcast(at::Tensor in_tensor, START_FINE(copy) copy_back_tensor(in_tensor, out_buf, true, true); - STOP_FINE(copy) + STOP_FINE(copy, in_tensor.nbytes()) #else @@ -1192,7 +1194,7 @@ void ProcessGroupACCL::run_broadcast(at::Tensor in_tensor, // init_input_tensor(zero_tensor, in_buf, false, true, opts.rootRank); // } - STOP_FINE(init) + STOP_FINE(init, in_tensor.nbytes()) // Reserve device @@ -1200,7 +1202,7 @@ void ProcessGroupACCL::run_broadcast(at::Tensor in_tensor, START_FINE(lock) c10::DeviceGuard guard(in_tensor.device()); std::unique_lock globalLock(pgGlobalMutex_); - STOP_FINE(lock) + STOP_FINE(lock, in_tensor.nbytes()) // in_buf->change_type(convert_datatype_from_torch(at::kInt)); @@ -1266,7 +1268,7 @@ void ProcessGroupACCL::run_broadcast(at::Tensor in_tensor, // } START_FINE(copy) copy_back_tensor(in_tensor, in_buf, true, true, opts.rootRank); - STOP_FINE(copy) + STOP_FINE(copy, in_tensor.nbytes()) #endif } @@ -1319,8 +1321,8 @@ 
ProcessGroupACCL::broadcast(std::vector &tensors, ACCL::debug("[Broadcast] Segmenting tensor of size " + std::to_string(tensor.nbytes()) + " into " + std::to_string(n * non_zero_dim_count) + "-sized elements "); for (size_t i = 0; i < tensor.size(0); i += n) { ACCL::debug("part " + std::to_string(i) + "!"); - size_t end = std::min(i + n, static_cast(tensor.size(0))); - run_broadcast(tensor.slice(0, i, end), opts); + size_t end = std::min(n, static_cast(tensor.size(0)) - i); + run_broadcast(tensor.narrow(0, i, end), opts); } } else { ACCL::debug("[Broadcast] Broadcasting entire tensor of size " + std::to_string(tensor.nbytes()) + " without segmentation."); @@ -1330,7 +1332,7 @@ ProcessGroupACCL::broadcast(std::vector &tensors, std::cerr << tensor << std::endl; } - STOP_COARSE(total) + STOP_COARSE(total, ((entry->src)[0]).nbytes()) } }; auto entry = @@ -1339,6 +1341,9 @@ ProcessGroupACCL::broadcast(std::vector &tensors, c10::optional>(tensors)); } +#undef COLL_NAME +#define COLL_NAME Allreduce + void ProcessGroupACCL::run_allreduce(at::Tensor in_tensor, const AllreduceOptions &opts) { @@ -1346,14 +1351,14 @@ void ProcessGroupACCL::run_allreduce(at::Tensor in_tensor, init_input_tensor(in_tensor, in_buf, true, true); - STOP_FINE(init) + STOP_FINE(init, in_tensor.nbytes()) START_FINE(lock) // Reserve device c10::DeviceGuard guard(in_tensor.device()); std::unique_lock globalLock(pgGlobalMutex_); - STOP_FINE(lock) + STOP_FINE(lock, in_tensor.nbytes()) PRE_REQUEST(Allreduce,in_tensor) @@ -1368,7 +1373,7 @@ void ProcessGroupACCL::run_allreduce(at::Tensor in_tensor, START_FINE(copy) copy_back_tensor(in_tensor, out_buf, true, true); - STOP_FINE(copy) + STOP_FINE(copy, in_tensor.nbytes()) } c10::intrusive_ptr @@ -1410,14 +1415,16 @@ ProcessGroupACCL::allreduce(std::vector &tensors, ACCL::debug("n: " + std::to_string(n)); ACCL::debug("[Allreduce] Segmenting tensor of size " + std::to_string(tensor.nbytes()) + " into " + std::to_string(n * non_zero_dim_count) + "-sized elements "); for (size_t i = 0; i < tensor.size(0); i += n) { + START_FINE(loop) ACCL::debug("part " + std::to_string(i) + "!"); - size_t end = std::min(i + n, static_cast(tensor.size(0))); - run_allreduce(tensor.slice(0, i, end), opts); + size_t end = std::min(n, static_cast(tensor.size(0)) - i); + run_allreduce(tensor.narrow(0, i, end), opts); + STOP_FINE(loop, tensor.nbytes()) } } else { run_allreduce(tensor, opts); } - STOP_COARSE(total) + STOP_COARSE(total, ((entry->src)[0]).nbytes()) } }; auto entry = @@ -1426,6 +1433,9 @@ ProcessGroupACCL::allreduce(std::vector &tensors, c10::optional>(tensors)); } +#undef COLL_NAME +#define COLL_NAME UNNAMED + c10::intrusive_ptr ProcessGroupACCL::allreduce_coalesced(std::vector &tensors, const AllreduceCoalescedOptions &opts) { @@ -1464,7 +1474,7 @@ ProcessGroupACCL::reduce(std::vector &tensors, size_t n = bufsize / tensor.itemsize(); for (size_t i = 0; i < tensor.numel(); i += n) { size_t end = std::min(i + n, static_cast(tensor.numel())); - run_reduce(tensor.slice(0, i, end), opts); + run_reduce(tensor.narrow(0, i, end), opts); } } else { run_reduce(tensor, opts); @@ -1555,9 +1565,9 @@ ProcessGroupACCL::allgather(std::vector> &outputTensors, std::vector dsttensorslices; dsttensorslices.reserve(dsttensors.size()); for (auto &dsttensor : dsttensors) { - dsttensorslices.emplace_back(dsttensor.slice(0, i, end)); + dsttensorslices.emplace_back(dsttensor.narrow(0, i, end)); } - run_allgather(srctensor.slice(0, i, end), dsttensorslices); + run_allgather(srctensor.narrow(0, i, end), dsttensorslices); } 
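The slice-to-narrow switch above also changes what `end` has to mean: Tensor::slice takes an exclusive end index, while Tensor::narrow takes a length, which is why the bound becomes std::min(n, size - i) instead of std::min(i + n, size). The arithmetic is easy to sanity-check from plain PyTorch, independent of ACCL:

    import torch

    t = torch.arange(10)
    n = 4  # chunk length
    for i in range(0, t.size(0), n):
        length = min(n, t.size(0) - i)       # what narrow() expects
        assert torch.equal(t.narrow(0, i, length), t[i:i + length])
    # Passing an end index, min(i + n, t.size(0)), as narrow()'s length
    # argument would reach past the end of the tensor on the last chunk.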
} else { run_allgather(srctensor, dsttensors); @@ -1669,9 +1679,9 @@ ProcessGroupACCL::gather(std::vector> &outputTensors, std::vector dsttensorslices; dsttensorslices.reserve(dsttensors.size()); for (auto &dsttensor : dsttensors) { - dsttensorslices.emplace_back(dsttensor.slice(0, i, end)); + dsttensorslices.emplace_back(dsttensor.narrow(0, i, end)); } - run_gather(srctensor.slice(0, i, end), dsttensorslices, opts); + run_gather(srctensor.narrow(0, i, end), dsttensorslices, opts); } } else { run_gather(srctensor, dsttensors, opts); @@ -1782,9 +1792,9 @@ ProcessGroupACCL::scatter(std::vector &outputTensors, std::vector srctensorslices; srctensorslices.reserve(srctensors.size()); for (auto &srctensor : srctensors) { - srctensorslices.emplace_back(srctensor.slice(0, i, end)); + srctensorslices.emplace_back(srctensor.narrow(0, i, end)); } - run_scatter(srctensorslices, dsttensor.slice(0, i, end), opts); + run_scatter(srctensorslices, dsttensor.narrow(0, i, end), opts); } } else { run_scatter(srctensors, dsttensor, opts); @@ -1816,19 +1826,24 @@ c10::intrusive_ptr ProcessGroupACCL::reduce_scatter( TORCH_CHECK(false, "ProcessGroupACCL does not support reduce_scatter"); } +#undef COLL_NAME +#define COLL_NAME AlltoAll + void ProcessGroupACCL::run_alltoall(at::Tensor in_tensor, at::Tensor out_tensor, const AllToAllOptions &opts) { + int a2a_nbytes = in_tensor.nbytes()/size_; + START_FINE(init) init_input_tensor(in_tensor, in_buf, true, true); - STOP_FINE(init) + STOP_FINE(init, a2a_nbytes) // Reserve device START_FINE(lock) c10::DeviceGuard guard(in_tensor.device()); std::unique_lock globalLock(pgGlobalMutex_); - STOP_FINE(lock) + STOP_FINE(lock, a2a_nbytes) // init_output_data(out_tensor, dstdata, out_tensor.numel(), out_tensor.scalar_type(), true, true); @@ -1836,11 +1851,11 @@ void ProcessGroupACCL::run_alltoall(at::Tensor in_tensor, accl->alltoall(*in_buf, *out_buf, in_tensor.numel()/size_); - POST_REQUEST("alltoall", in_tensor.nbytes()/size_) + POST_REQUEST("alltoall", a2a_nbytes) START_FINE(copy) copy_back_tensor(out_tensor, out_buf, true, true); - STOP_FINE(copy) + STOP_FINE(copy, a2a_nbytes) } @@ -1850,6 +1865,9 @@ void ProcessGroupACCL::run_alltoall_vec(std::vector &in_tensor_vec, at::Tensor dsttensor; // Reserve device + + int a2a_nbytes = in_tensor_vec[0].nbytes(); + c10::DeviceGuard guard(in_tensor_vec[0].device()); std::unique_lock globalLock(pgGlobalMutex_); @@ -1861,7 +1879,7 @@ void ProcessGroupACCL::run_alltoall_vec(std::vector &in_tensor_vec, accl->alltoall(*in_buf, *out_buf, in_tensor_vec[0].numel()); - POST_REQUEST("alltoall", in_tensor_vec[0].nbytes()) + POST_REQUEST("alltoall", a2a_nbytes) copy_back_tensorvec(out_tensor_vec, out_buf, dsttensor, in_tensor_vec[0].numel(), in_tensor_vec[0].numel(), true, true); @@ -1908,14 +1926,14 @@ c10::intrusive_ptr ProcessGroupACCL::alltoall_base( ACCL::debug("dsttensorslices:"); for (int j = 0; j < size_; j++) { int bufpos = j * entry_size; - srctensorslices.emplace_back(srctensor.slice(0, i + bufpos, end + bufpos)); + srctensorslices.emplace_back(srctensor.narrow(0, i + bufpos, end + bufpos)); } std::vector dsttensorslices; dsttensorslices.reserve(size_); ACCL::debug("dsttensorslices:"); for (int j = 0; j < size_; j++) { int bufpos = j * entry_size; - dsttensorslices.emplace_back(dsttensor.slice(0, i + bufpos, end + bufpos)); + dsttensorslices.emplace_back(dsttensor.narrow(0, i + bufpos, end + bufpos)); } run_alltoall_vec(srctensorslices, dsttensorslices, opts); } @@ -1923,7 +1941,7 @@ c10::intrusive_ptr ProcessGroupACCL::alltoall_base( 
ACCL::debug("Running without segmentation"); run_alltoall(srctensor, dsttensor, opts); } - STOP_COARSE(total) + STOP_COARSE(total, ((((entry->src)[0]).nbytes())/size_)) }; std::vector inputTensors = {inputTensor}; std::vector outputTensors = {outputTensor}; @@ -1946,6 +1964,10 @@ ProcessGroupACCL::alltoall(std::vector &outputTensors, TORCH_CHECK(false, "ProcessGroupACCL does not support alltoall"); } +#undef COLL_NAME +#define COLL_NAME UNNAMED + + void ProcessGroupACCL::run_send(at::Tensor in_tensor, int dstRank, int tag) { @@ -1962,6 +1984,7 @@ void ProcessGroupACCL::run_send(at::Tensor in_tensor, int dstRank, POST_REQUEST("send", in_tensor.nbytes()) } + c10::intrusive_ptr ProcessGroupACCL::send(std::vector &tensors, int dstRank, int tag) { checkSingleTensor(tensors); @@ -1974,7 +1997,7 @@ ProcessGroupACCL::send(std::vector &tensors, int dstRank, int tag) { size_t n = bufsize / tensor.itemsize(); for (size_t i = 0; i < tensor.numel(); i += n) { size_t end = std::min(i + n, static_cast(tensor.numel())); - run_send(tensor.slice(0, i, end), dstRank, tag); + run_send(tensor.narrow(0, i, end), dstRank, tag); } } else { run_send(tensor, dstRank, tag); @@ -2017,7 +2040,7 @@ ProcessGroupACCL::recv(std::vector &tensors, int srcRank, int tag) { size_t n = bufsize / tensor.itemsize(); for (size_t i = 0; i < tensor.numel(); i += n) { size_t end = std::min(i + n, static_cast(tensor.numel())); - run_recv(tensor.slice(0, i, end), srcRank, tag); + run_recv(tensor.narrow(0, i, end), srcRank, tag); } } else { run_recv(tensor, srcRank, tag); diff --git a/integrations/pytorch_ddp/test/test-generic.py b/integrations/pytorch_ddp/test/test-generic.py index 929a0b84..69564969 100644 --- a/integrations/pytorch_ddp/test/test-generic.py +++ b/integrations/pytorch_ddp/test/test-generic.py @@ -117,7 +117,9 @@ def test_broadcast(numel, testtype): end_time = time.perf_counter() measured_time = (end_time - start_time) * 1000000 - + + print("pytorch_Broadcast_" + str(x.nbytes) + " durationUs: " + str(measured_time), file=sys.stderr) + logger.debug("Directly measured time us 1:" + str(measured_time)) mpi.Barrier() @@ -317,6 +319,8 @@ def test_allreduce(numel, testtype): measured_time = (end_time - start_time) * 1000000 + print("pytorch_Allreduce_" + str(x.nbytes) + " durationUs: " + str(measured_time), file=sys.stderr) + logger.debug("Directly measured time us 1:" + str(measured_time)) mpi.Barrier() @@ -504,11 +508,11 @@ def start_test(comms: str, simulator: bool, host_file: str=None, fpga_file: str= mpi.Barrier() - dist.init_process_group("mpi", rank=rank, world_size=size) + # dist.init_process_group("mpi", rank=rank, world_size=size) - # accl.create_process_group(ranks, design, bufsize=rxbufsize, initialize=True, simulation=simulator) - # dist.init_process_group("ACCL", rank=rank, world_size=size) + accl.create_process_group(ranks, design, bufsize=rxbufsize, initialize=True, simulation=simulator) + dist.init_process_group("ACCL", rank=rank, world_size=size) global num_errors num_errors = 0 @@ -522,11 +526,12 @@ def start_test(comms: str, simulator: bool, host_file: str=None, fpga_file: str= active=5, ) - with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], profile_memory=True, schedule=schedule, record_shapes=True) as prof: - for i in range(30): - test_broadcast(128 * 1024, torch.float32) - test_allreduce(128 * 1024, torch.float32) - prof.step() + # with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], profile_memory=True, schedule=schedule, record_shapes=True) as prof: + for n in 
range(4,14): + for i in range(5): + test_broadcast(2**n, torch.float32) + test_allreduce(2**n, torch.float32) + # prof.step() # for i in range(10): # if True: @@ -590,8 +595,8 @@ def start_test(comms: str, simulator: bool, host_file: str=None, fpga_file: str= print(f"!!!!!!!! - {num_errors} Errors found - !!!!!!!!!") logger.debug(f"!!!!!!!! - {num_errors} Errors found - !!!!!!!!!") - print(prof.key_averages(group_by_input_shape=True) - .table(sort_by="cpu_time_total", row_limit=15)) + # print(prof.key_averages(group_by_input_shape=True) + # .table(sort_by="cpu_time_total", row_limit=15)) logger.debug('Destroying ACCL Process Group') From 30e8febb5cfa7ca094867d9bcc68b2aa215fea3a Mon Sep 17 00:00:00 2001 From: Laurent Wirz Date: Sun, 25 Aug 2024 14:17:56 +0200 Subject: [PATCH 48/64] Added plot scripts --- .../pytorch_ddp/test/plot_composition.py | 80 +++++++++++++++++++ integrations/pytorch_ddp/test/plot_small.py | 58 ++++++++++++++ 2 files changed, 138 insertions(+) create mode 100644 integrations/pytorch_ddp/test/plot_composition.py create mode 100644 integrations/pytorch_ddp/test/plot_small.py diff --git a/integrations/pytorch_ddp/test/plot_composition.py b/integrations/pytorch_ddp/test/plot_composition.py new file mode 100644 index 00000000..fca2dd95 --- /dev/null +++ b/integrations/pytorch_ddp/test/plot_composition.py @@ -0,0 +1,80 @@ +import re +import matplotlib.pyplot as plt +import seaborn as sns +import numpy as np + + +# keywords = ["Broadcast", "Allreduce", "AlltoAll", ] +# keywords = ["Broadcast", "Allreduce" ] +keywords = ["Allreduce", "AlltoAll", ] +# parts = ["lib","barrier","total"] +parts = ["lib","barrier","copy","lock","init", "type", "total"] + +part_pattern = re.compile(r"(.*)_tensor durationUs:.*") + +measurement_pattern = re.compile(r".*_tensor durationUs:(.*)") + +# keyword_pattern = re.compile(r"Starting (Broadcast|Allreduce|AlltoAll)") +keyword_pattern = re.compile(r"Starting (Allreduce|AlltoAll)") +# keyword_pattern = re.compile(r"Starting (Broadcast|Allreduce)") + +log_file_path = './accl_log/rank_0_stderr' + +with open(log_file_path, 'r') as log_file: + lines = log_file.readlines() + +current_keyword = None +# results = { "Broadcast": {}, "Allreduce": {}, "AlltoAll": {} } +# results = { "Broadcast": {}, "Allreduce": {}} +results = { "Allreduce": {}, "AlltoAll": {} } + +# averages = { "Broadcast": {}, "Allreduce": {}} +# averages = { "Broadcast": {}, "Allreduce": {}, "AlltoAll": {} } +averages = { "Allreduce": {}, "AlltoAll": {} } + +for op in results: + for part in parts: + results[op][part] = [] + +for line in lines: + keyword_match = keyword_pattern.search(line) + if keyword_match: + current_keyword = keyword_match.group(1) + continue + + if current_keyword: + part_match = part_pattern.search(line) + if not part_match: + continue + part = part_match.group(1).strip() + if part in parts: + measurement_match = measurement_pattern.search(line) + measurement = measurement_match.group(1).strip() + results[current_keyword][part].append(float(measurement)) + if part=='total': + current_keyword = None + +for op, parts in results.items(): + for part, values in parts.items(): + test_sum = 0 + count = 0 + for el in values: + test_sum += el + count += 1 + averages[op][part] = test_sum / count + +for op, part in averages.items(): + labels = [key for key in part if key != 'total' and key != 'barrier'] + times = [part[key] for key in labels] + total_time = part['total'] - part['barrier'] + other_time = total_time - sum(times) + + if other_time > 0: + labels.append('Other') + 
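The composition pie assembled around this point subtracts the barrier time from the averaged total, attributes the rest to the instrumented phases, and collects the remainder into an 'Other' slice. The same bookkeeping in isolation, with made-up numbers:

    phases = {"init": 12.0, "copy": 8.5, "lib": 40.0}  # averaged phase times, us
    total, barrier = 70.0, 5.0                         # averaged totals, us

    accounted = total - barrier              # the time the pie should cover
    other = accounted - sum(phases.values())
    if other > 0:                            # 4.5 us of untracked time here
        phases["Other"] = other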
times.append(other_time) + + plt.figure() + plt.pie(times, labels=labels, autopct=lambda p: f'{p * total_time / 100:.2f}us') + plt.title(f'Runtime Distribution for {op}') + + plt.savefig('composition_' + op + '_plot.png') diff --git a/integrations/pytorch_ddp/test/plot_small.py b/integrations/pytorch_ddp/test/plot_small.py new file mode 100644 index 00000000..a5d4f2d9 --- /dev/null +++ b/integrations/pytorch_ddp/test/plot_small.py @@ -0,0 +1,58 @@ +import re +import matplotlib.pyplot as plt +import seaborn as sns +import numpy as np + + +keywords = ["Broadcast", "Allreduce", "AlltoAll" ] +# keywords = ["Broadcast", "Allreduce" ] +# parts = ["lib","barrier","total"] +parts = ["lib","barrier","copy","lock","init", "type", "total"] + +part_pattern = re.compile(r"(.*)_tensor durationUs:.*") + +measurement_pattern = re.compile(r".*_tensor durationUs:(.*)") + +keyword_pattern = re.compile(r"Starting (Broadcast|Allreduce|AlltoAll)") +# keyword_pattern = re.compile(r"Starting (Broadcast|Allreduce)") + +log_file_path = './accl_log/rank_0_stderr' + +with open(log_file_path, 'r') as log_file: + lines = log_file.readlines() + +current_keyword = None +results = { "Broadcast": {}, "Allreduce": {}, "AlltoAll": {} } +# results = { "Broadcast": {}, "Allreduce": {}} + +for op in results: + for part in parts: + results[op][part] = [] + +for line in lines: + keyword_match = keyword_pattern.search(line) + if keyword_match: + current_keyword = keyword_match.group(1) + continue + + if current_keyword: + part_match = part_pattern.search(line) + if not part_match: + continue + part = part_match.group(1).strip() + if part in parts: + measurement_match = measurement_pattern.search(line) + measurement = measurement_match.group(1).strip() + results[current_keyword][part].append(float(measurement)) + +fig, axes = plt.subplots(len(keywords), len(parts), figsize=(5 * len(parts), 5 * len(keywords) )) + +for i, (dict_name, sub_dict) in enumerate(results.items()): + for j, (key, values) in enumerate(sub_dict.items()): + sns.histplot(values, ax=axes[i, j], bins=20, stat='percent', kde=True) + axes[i, j].set_title(f'{dict_name} - {key}') + +plt.tight_layout() +plt.savefig("fullplot.png") + + From 36efec49c392605d1593a09873279149c59423b6 Mon Sep 17 00:00:00 2001 From: Laurent Wirz Date: Sun, 25 Aug 2024 14:19:36 +0200 Subject: [PATCH 49/64] Added ACCL device measurement to bench --- .../pytorch_ddp/src/ProcessGroupACCL.cpp | 26 +++++++++++-------- integrations/pytorch_ddp/test/test-generic.py | 4 +-- 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp index 6c7da5ba..64a6a802 100644 --- a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp +++ b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp @@ -171,9 +171,13 @@ std::map mpiDatatype = { ACCL::debug("Performing " #opname " of " + std::to_string(tensor.numel()) + " items"); \ START_FINE(lib) -#define POST_REQUEST(opname, nbytes) \ - STOP_FINE(lib, nbytes) +#define POST_REQUEST(name, nbytes) \ + STOP_FINE(lib, nbytes) \ + std::this_thread::sleep_for(10ms); \ + double durationUs_accl_##COLL_NAME = (double)accl->get_duration(req)/1000.0; \ + ACCL::debug("device_" + std::string(x_MAKE_STRING(COLL_NAME)) + "_" + std::to_string(nbytes) + " durationUs: " + std::to_string(durationUs_accl_##COLL_NAME)); + #define TIMER_WRAP() // Better logging @@ -1153,7 +1157,7 @@ void ProcessGroupACCL::run_broadcast(at::Tensor in_tensor, - accl->allreduce(*in_buf, *out_buf, 
imaginary_count, ACCL::reduceFunction::SUM); + auto req = accl->allreduce(*in_buf, *out_buf, imaginary_count, ACCL::reduceFunction::SUM); if (in_tensor.scalar_type() == at::kInt || in_tensor.scalar_type() == at::kLong){ ACCL::debug("result:"); @@ -1251,7 +1255,7 @@ void ProcessGroupACCL::run_broadcast(at::Tensor in_tensor, PRE_REQUEST(Broadcast, in_tensor) - accl->bcast(*in_buf, rounded_count, opts.rootRank); + auto req = accl->bcast(*in_buf, rounded_count, opts.rootRank); // if (in_tensor.scalar_type() == at::kInt || in_tensor.scalar_type() == at::kLong){ // ACCL::debug("result:"); @@ -1367,7 +1371,7 @@ void ProcessGroupACCL::run_allreduce(at::Tensor in_tensor, ACCL::debug("rounded count:" + std::to_string(rounded_count)); - accl->allreduce(*in_buf, *out_buf, rounded_count, acclOp.at(opts.reduceOp)); + auto req = accl->allreduce(*in_buf, *out_buf, rounded_count, acclOp.at(opts.reduceOp)); POST_REQUEST("allreduce", in_tensor.nbytes()) @@ -1454,7 +1458,7 @@ void ProcessGroupACCL::run_reduce(at::Tensor in_tensor, PRE_REQUEST(Reduce,in_tensor) - accl->reduce(*in_buf, *out_buf, in_tensor.numel(), opts.rootRank, acclOp.at(opts.reduceOp)); + auto req = accl->reduce(*in_buf, *out_buf, in_tensor.numel(), opts.rootRank, acclOp.at(opts.reduceOp)); POST_REQUEST("reduce", in_tensor.nbytes()) @@ -1504,7 +1508,7 @@ void ProcessGroupACCL::run_allgather( int rounded_count = (in_tensor.numel() + 1023) & ~1023; - accl->allgather(*in_buf, *out_buf, rounded_count); + auto req = accl->allgather(*in_buf, *out_buf, rounded_count); POST_REQUEST("allgather", in_tensor.nbytes()) @@ -1602,7 +1606,7 @@ void ProcessGroupACCL::run_gather(at::Tensor in_tensor, PRE_REQUEST(Gather, in_tensor) - accl->gather(*in_buf, *out_buf, in_tensor.numel(), opts.rootRank); + auto req = accl->gather(*in_buf, *out_buf, in_tensor.numel(), opts.rootRank); POST_REQUEST("gather", in_tensor.nbytes()) @@ -1718,7 +1722,7 @@ void ProcessGroupACCL::run_scatter(std::vector &in_tensor_vec, PRE_REQUEST(Scatter, dsttensor) // Run scatter - accl->scatter(*in_buf, *out_buf, out_tensor.numel(), opts.rootRank); + auto req = accl->scatter(*in_buf, *out_buf, out_tensor.numel(), opts.rootRank); POST_REQUEST("scatter", out_tensor.nbytes()) @@ -1849,7 +1853,7 @@ void ProcessGroupACCL::run_alltoall(at::Tensor in_tensor, PRE_REQUEST(AlltoAll, in_tensor) - accl->alltoall(*in_buf, *out_buf, in_tensor.numel()/size_); + auto req = accl->alltoall(*in_buf, *out_buf, in_tensor.numel()/size_); POST_REQUEST("alltoall", a2a_nbytes) @@ -1877,7 +1881,7 @@ void ProcessGroupACCL::run_alltoall_vec(std::vector &in_tensor_vec, PRE_REQUEST(AlltoAll, in_tensor_vec[0]) - accl->alltoall(*in_buf, *out_buf, in_tensor_vec[0].numel()); + auto req = accl->alltoall(*in_buf, *out_buf, in_tensor_vec[0].numel()); POST_REQUEST("alltoall", a2a_nbytes) diff --git a/integrations/pytorch_ddp/test/test-generic.py b/integrations/pytorch_ddp/test/test-generic.py index 69564969..de3c22bc 100644 --- a/integrations/pytorch_ddp/test/test-generic.py +++ b/integrations/pytorch_ddp/test/test-generic.py @@ -527,8 +527,8 @@ def start_test(comms: str, simulator: bool, host_file: str=None, fpga_file: str= ) # with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], profile_memory=True, schedule=schedule, record_shapes=True) as prof: - for n in range(4,14): - for i in range(5): + for n in range(4,22): + for i in range(20): test_broadcast(2**n, torch.float32) test_allreduce(2**n, torch.float32) # prof.step() From 39d00e5a5ddc247738b12fe282391d09beb49dfe Mon Sep 17 00:00:00 2001 From: Laurent Wirz 
Date: Sun, 25 Aug 2024 16:42:59 +0200 Subject: [PATCH 50/64] Added sleep measurement --- .../pytorch_ddp/src/ProcessGroupACCL.cpp | 2 + integrations/pytorch_ddp/test/plot_small.py | 132 +++++++++++++----- 2 files changed, 101 insertions(+), 33 deletions(-) diff --git a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp index 64a6a802..f7ca1f8c 100644 --- a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp +++ b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp @@ -173,7 +173,9 @@ std::map mpiDatatype = { #define POST_REQUEST(name, nbytes) \ STOP_FINE(lib, nbytes) \ + START_COARSE(sleep) \ std::this_thread::sleep_for(10ms); \ + STOP_COARSE(sleep, nbytes) \ double durationUs_accl_##COLL_NAME = (double)accl->get_duration(req)/1000.0; \ ACCL::debug("device_" + std::string(x_MAKE_STRING(COLL_NAME)) + "_" + std::to_string(nbytes) + " durationUs: " + std::to_string(durationUs_accl_##COLL_NAME)); diff --git a/integrations/pytorch_ddp/test/plot_small.py b/integrations/pytorch_ddp/test/plot_small.py index a5d4f2d9..3bd1708e 100644 --- a/integrations/pytorch_ddp/test/plot_small.py +++ b/integrations/pytorch_ddp/test/plot_small.py @@ -2,19 +2,20 @@ import matplotlib.pyplot as plt import seaborn as sns import numpy as np +import matplotlib.ticker as mticker - -keywords = ["Broadcast", "Allreduce", "AlltoAll" ] -# keywords = ["Broadcast", "Allreduce" ] +keywords = ["Broadcast", "Allreduce" ] # parts = ["lib","barrier","total"] -parts = ["lib","barrier","copy","lock","init", "type", "total"] +parts = ["lib", "copy","init", "total", "device", "pytorch", "sleep"] +parts_plot = ["init", "device", "lib_oh", "copy", "total_oh", "pytorch_oh"] + -part_pattern = re.compile(r"(.*)_tensor durationUs:.*") +part_pattern = re.compile(r"(.*)_.*_.* durationUs: .*") +op_pattern = re.compile(r".*_(.*)_.* durationUs: .*") +count_pattern = re.compile(r".*_.*_(.*) durationUs: .*") -measurement_pattern = re.compile(r".*_tensor durationUs:(.*)") +measurement_pattern = re.compile(r".*_.*_.* durationUs: (.*)") -keyword_pattern = re.compile(r"Starting (Broadcast|Allreduce|AlltoAll)") -# keyword_pattern = re.compile(r"Starting (Broadcast|Allreduce)") log_file_path = './accl_log/rank_0_stderr' @@ -22,37 +23,102 @@ lines = log_file.readlines() current_keyword = None -results = { "Broadcast": {}, "Allreduce": {}, "AlltoAll": {} } +results = { "Broadcast": {}, "Allreduce": {}} +averages = { "Broadcast": {}, "Allreduce": {}} + # results = { "Broadcast": {}, "Allreduce": {}} +sizes = [] + for op in results: for part in parts: - results[op][part] = [] - + results[op][part] = {} + averages[op][part] = {} + for part in parts_plot: + averages[op][part] = {} for line in lines: - keyword_match = keyword_pattern.search(line) - if keyword_match: - current_keyword = keyword_match.group(1) + part_match = part_pattern.search(line) + op_match = op_pattern.search(line) + count_match = count_pattern.search(line) + if (not part_match) or (not op_match) or (not count_match): continue + part = part_match.group(1).strip() + op = op_match.group(1).strip() + cnt = int(count_match.group(1).strip()) + if cnt > 2097152: + continue + if op not in keywords: + continue + + if cnt not in sizes: + sizes.append(cnt) + if part in parts: + measurement_match = measurement_pattern.search(line) + measurement = measurement_match.group(1).strip() + if cnt not in results[op][part].keys(): + results[op][part][cnt] = [] + results[op][part][cnt].append(float(measurement)) + + +for op, parts in results.items(): + for part, 
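The four patterns above all re-scan the same `<part>_<op>_<nbytes> durationUs: <value>` layout that the STOP_FINE/STOP_COARSE macros emit. A hedged alternative, assuming exactly that layout, is a single expression with named groups: one match yields all four fields, and the explicit character classes avoid greedy `.*` backtracking across underscores:

    import re

    line_re = re.compile(
        r"(?P<part>[A-Za-z]+)_(?P<op>[A-Za-z]+)_(?P<nbytes>\d+)"
        r" durationUs: (?P<us>[0-9.]+)")

    m = line_re.search("device_Broadcast_4096 durationUs: 12.34")
    if m:
        part, op = m.group("part"), m.group("op")        # 'device', 'Broadcast'
        nbytes, us = int(m.group("nbytes")), float(m.group("us"))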
cnts in parts.items(): + for cnt, mes in cnts.items(): + test_sum = 0 + count = 0 + for el in mes: + test_sum += el + count += 1 + averages[op][part][cnt] = test_sum / count + +for op, parts in averages.items(): + for cnt in sizes: + averages[op]['lib_oh'][cnt] = parts['lib'][cnt] - parts['device'][cnt] + averages[op]['total_oh'][cnt] = parts['total'][cnt] - parts['sleep'][cnt] - parts['lib'][cnt] - parts['init'][cnt] - parts['copy'][cnt] + averages[op]['pytorch_oh'][cnt] = parts['pytorch'][cnt] - (parts['total'][cnt]) + + averages[op].pop('lib') + averages[op].pop('total') + averages[op].pop('sleep') + averages[op].pop('pytorch') + + +sizes.sort() + +av_lists = {} +for word in keywords: + av_lists[word] = {} + for part in parts_plot: + av_lists[word][part] = [] + for size in sizes: + av_lists[word][part].append(averages[word][part][size]) + + + +# print(av_lists['Allreduce']) +# print(av_lists['Allreduce'].values().shape) +# print(sizes) +# print(sizes.shape) + +for op in keywords: + fig, ax = plt.subplots() + ax.stackplot(sizes, av_lists[op].values(), + labels=av_lists[op].keys(), alpha=0.8) + ax.legend(loc='upper left', reverse=True) + plt.gca().set_xscale('log', base=2) + ax.set_title('Execution time composition' + ) + ax.set_xlabel('size[B]') + ax.set_ylabel('Latency us') + # add tick at every 200 million people + # ax.yaxis.set_minor_locator(mticker.MultipleLocator(.2)) + + plt.savefig(op + '_composition.png') + +# for i, (dict_name, sub_dict) in enumerate(results.items()): + # for j, (key, values) in enumerate(sub_dict.items()): + # sns.histplot(values, ax=axes[i, j], bins=20, stat='percent', kde=True) + # axes[i, j].set_title(f'{dict_name} - {key}') - if current_keyword: - part_match = part_pattern.search(line) - if not part_match: - continue - part = part_match.group(1).strip() - if part in parts: - measurement_match = measurement_pattern.search(line) - measurement = measurement_match.group(1).strip() - results[current_keyword][part].append(float(measurement)) - -fig, axes = plt.subplots(len(keywords), len(parts), figsize=(5 * len(parts), 5 * len(keywords) )) - -for i, (dict_name, sub_dict) in enumerate(results.items()): - for j, (key, values) in enumerate(sub_dict.items()): - sns.histplot(values, ax=axes[i, j], bins=20, stat='percent', kde=True) - axes[i, j].set_title(f'{dict_name} - {key}') - -plt.tight_layout() -plt.savefig("fullplot.png") +# plt.tight_layout() +# plt.savefig("fullplot.png") From 0e3cf79074a094a8b5a0113b1cfba25fa1c299b1 Mon Sep 17 00:00:00 2001 From: Laurent Wirz Date: Mon, 26 Aug 2024 14:11:08 +0200 Subject: [PATCH 51/64] Initialization fix and cleanup --- .../pytorch_ddp/src/ProcessGroupACCL.cpp | 56 +------------------ integrations/pytorch_ddp/test/plot_small.py | 2 +- 2 files changed, 3 insertions(+), 55 deletions(-) diff --git a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp index f7ca1f8c..d6089179 100644 --- a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp +++ b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp @@ -985,7 +985,7 @@ void ProcessGroupACCL::initialize() { accl.get()->initialize(ranks_, rank_, size_+2, bufsize, segsize, 4096*1024*2); } else { std::cout<<"Rendezvous Protocol"<initialize(ranks_, rank_, 16, 1024, RDVZ_THRESHOLD); + accl.get()->initialize(ranks_, rank_, 16, 1024, RDVZ_THRESHOLD, 4096*1024); } ACCL::debug(std::string("[ACCL coyote] communicator: ") + accl->dump_communicator()); @@ -1185,8 +1185,6 @@ void ProcessGroupACCL::run_broadcast(at::Tensor in_tensor, int 
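One thing worth double-checking in the run_broadcast context below: with ROUND_NR = 256, the expression (in_tensor.numel() + ROUND_NR) & ~ROUND_NR clears only bit 8 of the sum, not the low eight bits, so the result is not in general a multiple of 256. If the intent is to round the element count up to a multiple of ROUND_NR, the usual power-of-two idiom is (n + m - 1) & ~(m - 1); a quick comparison:

    def round_up(n, m):             # usual round-up idiom, m a power of two
        return (n + m - 1) & ~(m - 1)

    def round_as_written(n, m):     # (numel + ROUND_NR) & ~ROUND_NR
        return (n + m) & ~m

    print(round_up(1, 256))            # 256
    print(round_as_written(1, 256))    # 1    (257 with bit 8 cleared)
    print(round_up(300, 256))          # 512
    print(round_as_written(300, 256))  # 556  (bit 8 of 556 is already 0)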
rounded_count = (in_tensor.numel() + ROUND_NR) & ~ROUND_NR; - auto zero_tensor = torch::zeros({rounded_count}, in_tensor.scalar_type()); - ACCL::debug("rounded count:" + std::to_string(rounded_count)); ACCL::debug("rootRank:" + std::to_string(opts.rootRank)); @@ -1211,61 +1209,11 @@ void ProcessGroupACCL::run_broadcast(at::Tensor in_tensor, STOP_FINE(lock, in_tensor.nbytes()) - // in_buf->change_type(convert_datatype_from_torch(at::kInt)); - // out_buf->change_type(convert_datatype_from_torch(at::kInt)); - - // int imaginary_count = rounded_count; - // if (in_tensor.scalar_type() == at::kDouble || in_tensor.scalar_type() == at::kLong){ - // imaginary_count = (in_tensor.numel()*2 + ROUND_NR) & ~ROUND_NR; - // } - - /* - if(rank_ == opts.rootRank){ - - for(int i = 0; i < size_; i++){ - if(i != opts.rootRank){ - if (in_tensor.scalar_type() == at::kInt || in_tensor.scalar_type() == at::kLong){ - ACCL::debug("sending:") - for(int i = 0; i<(in_tensor.numel() * (in_tensor.element_size() / 4)); i++){ - ACCL::debug(std::to_string(((int *) in_buf->byte_array())[i])); - } - } - ACCL::ACCLRequest* req = accl->send(*in_buf, rounded_count, i, 203); - } - - } - } - else{ - ACCL::ACCLRequest* req = accl->recv(*in_buf, rounded_count, opts.rootRank, 203); - if (in_tensor.scalar_type() == at::kInt || in_tensor.scalar_type() == at::kLong){ - ACCL::debug("received:") - for(int i = 0; i<(in_tensor.numel() * (in_tensor.element_size() / 4)); i++){ - ACCL::debug(std::to_string(((int *) in_buf->byte_array())[i])); - } - } - } - */ - - - // if (in_tensor.scalar_type() == at::kInt || in_tensor.scalar_type() == at::kLong){ - // ACCL::debug("input:"); - // for(int i = 0; i<(in_tensor.numel() * (in_tensor.element_size() / 4)); i++){ - // ACCL::debug(std::to_string(((int *) in_buf->byte_array())[i])); - // } - // } - PRE_REQUEST(Broadcast, in_tensor) auto req = accl->bcast(*in_buf, rounded_count, opts.rootRank); - // if (in_tensor.scalar_type() == at::kInt || in_tensor.scalar_type() == at::kLong){ - // ACCL::debug("result:"); - // for(int i = 0; i<(in_tensor.numel() * (in_tensor.element_size() / 4)); i++){ - // ACCL::debug(std::to_string(((int *) in_buf->byte_array())[i])); - // } - // } - POST_REQUEST("bcast", in_tensor.nbytes()) // in_buf->sync_from_device(); @@ -1273,7 +1221,7 @@ void ProcessGroupACCL::run_broadcast(at::Tensor in_tensor, // ACCL::debug(std::to_string(((double *) in_buf->byte_array())[i])); // } START_FINE(copy) - copy_back_tensor(in_tensor, in_buf, true, true, opts.rootRank); + copy_back_tensor(in_tensor, in_buf, false, true, opts.rootRank); STOP_FINE(copy, in_tensor.nbytes()) #endif } diff --git a/integrations/pytorch_ddp/test/plot_small.py b/integrations/pytorch_ddp/test/plot_small.py index 3bd1708e..43ec456e 100644 --- a/integrations/pytorch_ddp/test/plot_small.py +++ b/integrations/pytorch_ddp/test/plot_small.py @@ -105,7 +105,7 @@ labels=av_lists[op].keys(), alpha=0.8) ax.legend(loc='upper left', reverse=True) plt.gca().set_xscale('log', base=2) - ax.set_title('Execution time composition' + ) + ax.set_title('Execution time composition' + op) ax.set_xlabel('size[B]') ax.set_ylabel('Latency us') # add tick at every 200 million people From 98a7527c020169d2648cb3242ba54a9c50703501 Mon Sep 17 00:00:00 2001 From: Laurent Wirz Date: Thu, 29 Aug 2024 10:58:15 +0200 Subject: [PATCH 52/64] Cleanup of debug statements --- .../pytorch_ddp/src/ProcessGroupACCL.cpp | 113 +++--------------- 1 file changed, 15 insertions(+), 98 deletions(-) diff --git a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp 
b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp index d6089179..1f24ce6a 100644 --- a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp +++ b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp @@ -180,12 +180,6 @@ std::map mpiDatatype = { ACCL::debug("device_" + std::string(x_MAKE_STRING(COLL_NAME)) + "_" + std::to_string(nbytes) + " durationUs: " + std::to_string(durationUs_accl_##COLL_NAME)); -#define TIMER_WRAP() - -// Better logging -// accl_log(mpi_rank, format_log("bcast", options, durationUs, 0)); - - namespace { /* Alternative for std::format from C++20 in C++17. @@ -758,7 +752,7 @@ void accl_sa_handler(int) // TODO delete when not needed anymore void ProcessGroupACCL::init_input_tensor(at::Tensor &tensor, std::unique_ptr &data, bool do_on_root, bool do_on_others, int opts_root_rank) { if DO_COND { - ACCL::debug("Copying data to CPU tensor of size " + std::to_string(tensor.numel())); + // ACCL::debug("Copying data to CPU tensor of size " + std::to_string(tensor.numel())); // at::Tensor wrapper_tensor = torch::from_blob(data->byte_array(), tensor.sizes(), tensor.options().device(c10::DeviceType::CPU)); // wrapper_tensor.copy_(tensor); std::memcpy(data->byte_array(), tensor.data_ptr(), tensor.numel() * tensor.element_size()); @@ -780,7 +774,7 @@ void ProcessGroupACCL::init_input_tensor(at::Tensor &tensor, std::unique_ptrbyte_array(), sizes, options); for (const auto i : c10::irange(tensor_vec.size())) { @@ -854,11 +848,11 @@ void ProcessGroupACCL::copy_back_tensor(at::Tensor tensor_original, std::unique_ if (p2p_applicable(*accl, tensor_original, p2p_enabled)) { copy_back_p2p_buffer(*data, tensor_original); } else { - ACCL::debug("Copying data back from CPU tensor of size " + - std::to_string(tensor_original.numel())); + // ACCL::debug("Copying data back from CPU tensor of size " + + // std::to_string(tensor_original.numel())); std::memcpy(tensor_original.data_ptr(), data->byte_array(), tensor_original.numel() * tensor_original.element_size()); // tensor_original.copy_(torch::from_blob(data->byte_array(), tensor_original.sizes(), tensor_original.options().device(c10::DeviceType::CPU))); - ACCL::debug("Finished Copying "); + // ACCL::debug("Finished Copying "); } } } @@ -1112,12 +1106,6 @@ void ProcessGroupACCL::run_broadcast(at::Tensor in_tensor, // It seems to have issues with non-even numbers, so we round to ACCL_MSG_SIZE int rounded_count = (in_tensor.numel() + ROUND_NR) & ~ROUND_NR; - // if (in_tensor.scalar_type() == at::kInt || in_tensor.scalar_type() == at::kLong){ - // return; - // } - - ACCL::debug("rounded count:" + std::to_string(rounded_count)); - ACCL::debug("rootRank:" + std::to_string(opts.rootRank)); int imaginary_count = rounded_count; if (in_tensor.scalar_type() == at::kDouble || in_tensor.scalar_type() == at::kLong){ @@ -1150,26 +1138,8 @@ void ProcessGroupACCL::run_broadcast(at::Tensor in_tensor, in_buf->change_type(convert_datatype_from_torch(at::kInt)); out_buf->change_type(convert_datatype_from_torch(at::kInt)); - if (in_tensor.scalar_type() == at::kInt || in_tensor.scalar_type() == at::kLong){ - ACCL::debug("input:"); - for(int i = 0; i<(in_tensor.numel() * (in_tensor.element_size() / 4)); i++){ - ACCL::debug(std::to_string(((int *) in_buf->byte_array())[i])); - } - } - - - auto req = accl->allreduce(*in_buf, *out_buf, imaginary_count, ACCL::reduceFunction::SUM); - if (in_tensor.scalar_type() == at::kInt || in_tensor.scalar_type() == at::kLong){ - ACCL::debug("result:"); - for(int i = 0; i<(in_tensor.numel() * (in_tensor.element_size() / 4)); i++){ - 
ACCL::debug(std::to_string(((int *) out_buf->byte_array())[i])); - } - } - - - POST_REQUEST("broadcast", in_tensor.nbytes()) START_FINE(copy) @@ -1186,9 +1156,6 @@ void ProcessGroupACCL::run_broadcast(at::Tensor in_tensor, int rounded_count = (in_tensor.numel() + ROUND_NR) & ~ROUND_NR; - ACCL::debug("rounded count:" + std::to_string(rounded_count)); - ACCL::debug("rootRank:" + std::to_string(opts.rootRank)); - START_FINE(init) if (opts.rootRank == rank_){ @@ -1200,9 +1167,6 @@ void ProcessGroupACCL::run_broadcast(at::Tensor in_tensor, STOP_FINE(init, in_tensor.nbytes()) - // Reserve device - - START_FINE(lock) c10::DeviceGuard guard(in_tensor.device()); std::unique_lock globalLock(pgGlobalMutex_); @@ -1216,10 +1180,6 @@ void ProcessGroupACCL::run_broadcast(at::Tensor in_tensor, POST_REQUEST("bcast", in_tensor.nbytes()) - // in_buf->sync_from_device(); - // for(int i = 0; ibyte_array())[i])); - // } START_FINE(copy) copy_back_tensor(in_tensor, in_buf, false, true, opts.rootRank); STOP_FINE(copy, in_tensor.nbytes()) @@ -1232,15 +1192,11 @@ ProcessGroupACCL::broadcast(std::vector &tensors, checkSingleTensor(tensors); std::function &)> runFunc = [opts, this](std::unique_ptr &entry) { - std::cerr << "Starting Broadcast" << std::endl; + // std::cerr << "Starting Broadcast" << std::endl; // if (((entry->src)[0]).numel() <= RDVZ_THRESHOLD || BROADCAST_SIDESTEP){ if (BROADCAST_SIDESTEP){ auto data = (entry->src)[0]; - if(data.scalar_type() == at::kInt || data.scalar_type() == at::kLong){ - std::cerr << "before" << std::endl; - std::cerr << data << std::endl; - } ACCL::debug("[Broadcast] -- Sidestepped using OpenMPI -- size " + std::to_string(data.numel())); c10::DeviceGuard guard(data.device()); std::unique_lock globalLock(pgGlobalMutex_); @@ -1250,28 +1206,13 @@ ProcessGroupACCL::broadcast(std::vector &tensors, mpiDatatype.at(data.scalar_type()), opts.rootRank, MPI_COMM_WORLD)); - if(data.scalar_type() == at::kInt || data.scalar_type() == at::kLong){ - std::cerr << "after" << std::endl; - std::cerr << data << std::endl; - } } else { START_COARSE(total) at::Tensor &tensor = (entry->src)[0]; - if(tensor.scalar_type() == at::kInt || tensor.scalar_type() == at::kLong){ - std::cerr << tensor << std::endl; - } - ACCL::debug(string_of_torch_datatype(tensor.scalar_type())); // Segment data if necessary if (tensor.nbytes() > ACCL_MSG_SIZE) { - ACCL::debug("nbytes: " + std::to_string(tensor.nbytes())); - ACCL::debug("bufsize: " + std::to_string(bufsize)); - ACCL::debug("numel: " + std::to_string(tensor.numel())); - ACCL::debug("tensor.size(0): " + std::to_string(tensor.size(0))); size_t non_zero_dim_count = tensor.numel() / tensor.size(0); - ACCL::debug("non_zero_dim_count: " + std::to_string(non_zero_dim_count)); - ACCL::debug("tensor.itemsize(): " + std::to_string(tensor.itemsize())); size_t n = ACCL_MSG_SIZE / tensor.itemsize() / non_zero_dim_count; - ACCL::debug("n: " + std::to_string(n)); ACCL::debug("[Broadcast] Segmenting tensor of size " + std::to_string(tensor.nbytes()) + " into " + std::to_string(n * non_zero_dim_count) + "-sized elements "); for (size_t i = 0; i < tensor.size(0); i += n) { ACCL::debug("part " + std::to_string(i) + "!"); @@ -1279,13 +1220,8 @@ ProcessGroupACCL::broadcast(std::vector &tensors, run_broadcast(tensor.narrow(0, i, end), opts); } } else { - ACCL::debug("[Broadcast] Broadcasting entire tensor of size " + std::to_string(tensor.nbytes()) + " without segmentation."); run_broadcast(tensor, opts); } - if(tensor.scalar_type() == at::kInt || tensor.scalar_type() == at::kLong){ 
- std::cerr << tensor << std::endl; - } - STOP_COARSE(total, ((entry->src)[0]).nbytes()) } }; @@ -1319,8 +1255,6 @@ void ProcessGroupACCL::run_allreduce(at::Tensor in_tensor, int rounded_count = (in_tensor.numel() + ROUND_NR) & ~ROUND_NR; - ACCL::debug("rounded count:" + std::to_string(rounded_count)); - auto req = accl->allreduce(*in_buf, *out_buf, rounded_count, acclOp.at(opts.reduceOp)); POST_REQUEST("allreduce", in_tensor.nbytes()) @@ -1337,7 +1271,6 @@ ProcessGroupACCL::allreduce(std::vector &tensors, std::function &)> runFunc = [opts, this](std::unique_ptr &entry) { - std::cerr << "Starting AllReduce" << std::endl; // sidestep eager allreduce // if (((entry->src)[0]).numel() <= RDVZ_THRESHOLD || ALLREDUCE_SIDESTEP){ if (ALLREDUCE_SIDESTEP){ @@ -1356,21 +1289,13 @@ ProcessGroupACCL::allreduce(std::vector &tensors, START_COARSE(total) auto tensor = (entry->src)[0]; // Segment data if necessary - ACCL::debug(string_of_torch_datatype(tensor.scalar_type())); if (tensor.nbytes() > (ACCL_MSG_SIZE)) { - ACCL::debug("nbytes: " + std::to_string(tensor.nbytes())); - ACCL::debug("bufsize: " + std::to_string(bufsize)); - ACCL::debug("numel: " + std::to_string(tensor.numel())); - ACCL::debug("tensor.size(0): " + std::to_string(tensor.size(0))); size_t non_zero_dim_count = tensor.numel() / tensor.size(0); - ACCL::debug("non_zero_dim_count: " + std::to_string(non_zero_dim_count)); - ACCL::debug("tensor.itemsize(): " + std::to_string(tensor.itemsize())); size_t n = ACCL_MSG_SIZE / (tensor.itemsize() * non_zero_dim_count); - ACCL::debug("n: " + std::to_string(n)); ACCL::debug("[Allreduce] Segmenting tensor of size " + std::to_string(tensor.nbytes()) + " into " + std::to_string(n * non_zero_dim_count) + "-sized elements "); for (size_t i = 0; i < tensor.size(0); i += n) { START_FINE(loop) - ACCL::debug("part " + std::to_string(i) + "!"); + // ACCL::debug("part " + std::to_string(i) + "!"); size_t end = std::min(n, static_cast(tensor.size(0)) - i); run_allreduce(tensor.narrow(0, i, end), opts); STOP_FINE(loop, tensor.nbytes()) @@ -1509,7 +1434,6 @@ ProcessGroupACCL::allgather(std::vector> &outputTensors, auto srctensor = (entry->src)[0]; auto &dsttensors = entry->dst; // Segment data if necessary - ACCL::debug(string_of_torch_datatype(dsttensors[0].scalar_type())); if (srctensor.nbytes() > bufsize) { size_t non_zero_dim_count = srctensor.numel() / srctensor.size(0); size_t n = bufsize / srctensor.itemsize() / non_zero_dim_count; @@ -1736,7 +1660,6 @@ ProcessGroupACCL::scatter(std::vector &outputTensors, auto dsttensor = (entry->dst)[0]; // Segment data if necessary if (dsttensor.nbytes() > bufsize / 4) { - ACCL::debug("dsttensor to large!"); size_t non_zero_dim_count = dsttensor.numel() / dsttensor.size(0); size_t n = bufsize / 4 / dsttensor.itemsize() / non_zero_dim_count; for (size_t i = 0; i < dsttensor.size(0); i += n) { @@ -1787,17 +1710,15 @@ void ProcessGroupACCL::run_alltoall(at::Tensor in_tensor, at::Tensor out_tensor, const AllToAllOptions &opts) { - int a2a_nbytes = in_tensor.nbytes()/size_; - START_FINE(init) init_input_tensor(in_tensor, in_buf, true, true); - STOP_FINE(init, a2a_nbytes) + STOP_FINE(init, in_tensor.nbytes()) // Reserve device START_FINE(lock) c10::DeviceGuard guard(in_tensor.device()); std::unique_lock globalLock(pgGlobalMutex_); - STOP_FINE(lock, a2a_nbytes) + STOP_FINE(lock, in_tensor.nbytes()) // init_output_data(out_tensor, dstdata, out_tensor.numel(), out_tensor.scalar_type(), true, true); @@ -1805,11 +1726,11 @@ void ProcessGroupACCL::run_alltoall(at::Tensor in_tensor, 
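In the alltoall_base segmentation just below, the third argument to narrow() is again a length, so dropping the + bufpos from it is what keeps each chunk inside its rank's segment of the flat buffer; with the old end + bufpos length, later ranks' chunks would spill past their segments. The tiling can be checked in plain PyTorch (world size, entry length, and chunk cap are made-up values):

    import torch

    world, entry, n = 4, 10, 4       # ranks, per-rank entry length, chunk cap
    t = torch.arange(world * entry)

    chunks = []
    for i in range(0, entry, n):
        length = min(n, entry - i)   # chunk length within one entry
        for j in range(world):
            bufpos = j * entry       # start of rank j's segment
            chunks.append(t.narrow(0, i + bufpos, length))
    # Every element is covered exactly once across all chunks:
    assert torch.equal(torch.cat(chunks).sort().values, t)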
auto req = accl->alltoall(*in_buf, *out_buf, in_tensor.numel()/size_); - POST_REQUEST("alltoall", a2a_nbytes) + POST_REQUEST("alltoall", in_tensor.nbytes()) START_FINE(copy) copy_back_tensor(out_tensor, out_buf, true, true); - STOP_FINE(copy, a2a_nbytes) + STOP_FINE(copy, in_tensor.nbytes()) } @@ -1865,7 +1786,6 @@ c10::intrusive_ptr ProcessGroupACCL::alltoall_base( // std::unique_lock globalLock(pgGlobalMutex_); // Segment data if necessary if (dsttensor.nbytes() > bufsize) { - ACCL::debug("dsttensor to large!"); // Split individual entries size_t non_zero_dim_count = dsttensor.numel() / dsttensor.size(0); @@ -1873,29 +1793,26 @@ c10::intrusive_ptr ProcessGroupACCL::alltoall_base( size_t entry_size = dsttensor.numel() / size_ / non_zero_dim_count; for (size_t i = 0; i < entry_size; i += n) { ACCL::debug("part " + std::to_string(i) + "!"); - size_t end = std::min(i + n, static_cast(entry_size)); + size_t end = std::min(n, static_cast(entry_size) - i); std::vector srctensorslices; srctensorslices.reserve(size_); - ACCL::debug("dsttensorslices:"); for (int j = 0; j < size_; j++) { int bufpos = j * entry_size; - srctensorslices.emplace_back(srctensor.narrow(0, i + bufpos, end + bufpos)); + srctensorslices.emplace_back(srctensor.narrow(0, i + bufpos, end)); } std::vector dsttensorslices; dsttensorslices.reserve(size_); - ACCL::debug("dsttensorslices:"); for (int j = 0; j < size_; j++) { int bufpos = j * entry_size; - dsttensorslices.emplace_back(dsttensor.narrow(0, i + bufpos, end + bufpos)); + dsttensorslices.emplace_back(dsttensor.narrow(0, i + bufpos, end)); } run_alltoall_vec(srctensorslices, dsttensorslices, opts); } } else { - ACCL::debug("Running without segmentation"); run_alltoall(srctensor, dsttensor, opts); } - STOP_COARSE(total, ((((entry->src)[0]).nbytes())/size_)) + STOP_COARSE(total, ((((entry->src)[0]).nbytes()))) }; std::vector inputTensors = {inputTensor}; std::vector outputTensors = {outputTensor}; From 16ea0f67fbce0d039cd91d50726a4c8306d8ed85 Mon Sep 17 00:00:00 2001 From: Laurent Wirz Date: Fri, 30 Aug 2024 18:34:04 +0200 Subject: [PATCH 53/64] Added more naming to bench --- .../pytorch_ddp/src/ProcessGroupACCL.cpp | 22 ++++++++++++++----- 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp index 1f24ce6a..5babe9eb 100644 --- a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp +++ b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp @@ -1312,16 +1312,14 @@ ProcessGroupACCL::allreduce(std::vector &tensors, c10::optional>(tensors)); } -#undef COLL_NAME -#define COLL_NAME UNNAMED - c10::intrusive_ptr ProcessGroupACCL::allreduce_coalesced(std::vector &tensors, const AllreduceCoalescedOptions &opts) { TORCH_CHECK(false, "allreduce_coalesced is currently not supported with ACCL"); } - +#undef COLL_NAME +#define COLL_NAME Reduce void ProcessGroupACCL::run_reduce(at::Tensor in_tensor, const ReduceOptions &opts) { @@ -1365,6 +1363,8 @@ ProcessGroupACCL::reduce(std::vector &tensors, c10::optional>(tensors)); } +#undef COLL_NAME +#define COLL_NAME Allgather void ProcessGroupACCL::run_allgather( at::Tensor in_tensor, const std::vector &dsttensorvec) { @@ -1465,6 +1465,9 @@ c10::intrusive_ptr ProcessGroupACCL::allgather_coalesced( TORCH_CHECK(false, "ProcessGroupACCL does not support allgather_coalesced"); } +#undef COLL_NAME +#define COLL_NAME Gather + void ProcessGroupACCL::run_gather(at::Tensor in_tensor, const std::vector &dsttensorvec, const GatherOptions &opts) { @@ 
-1580,6 +1583,8 @@ ProcessGroupACCL::gather(std::vector> &outputTensors, } } +#undef COLL_NAME +#define COLL_NAME Scatter void ProcessGroupACCL::run_scatter(std::vector &in_tensor_vec, at::Tensor out_tensor, const ScatterOptions &opts) { @@ -1836,8 +1841,7 @@ ProcessGroupACCL::alltoall(std::vector &outputTensors, } #undef COLL_NAME -#define COLL_NAME UNNAMED - +#define COLL_NAME Send void ProcessGroupACCL::run_send(at::Tensor in_tensor, int dstRank, int tag) { @@ -1881,6 +1885,9 @@ ProcessGroupACCL::send(std::vector &tensors, int dstRank, int tag) { c10::optional>(tensors)); } +#undef COLL_NAME +#define COLL_NAME Recv + void ProcessGroupACCL::run_recv(at::Tensor out_tensor, int srcRank, int tag) { @@ -1924,6 +1931,9 @@ ProcessGroupACCL::recv(std::vector &tensors, int srcRank, int tag) { c10::optional>(tensors)); } +#undef COLL_NAME +#define COLL_NAME Unnamed + c10::intrusive_ptr ProcessGroupACCL::recvAnysource(std::vector &tensors, int tag) { TORCH_CHECK(false, "ProcessGroupACCL does not support recvAnysource"); From 4b91f0ece792d297835d7d94fcf4f140d5244b86 Mon Sep 17 00:00:00 2001 From: Laurent Wirz Date: Fri, 30 Aug 2024 18:36:50 +0200 Subject: [PATCH 54/64] Added pytorch side bench to all collectives --- integrations/pytorch_ddp/test/test-generic.py | 157 +++++++++++++----- 1 file changed, 114 insertions(+), 43 deletions(-) diff --git a/integrations/pytorch_ddp/test/test-generic.py b/integrations/pytorch_ddp/test/test-generic.py index de3c22bc..7fa3ed1e 100644 --- a/integrations/pytorch_ddp/test/test-generic.py +++ b/integrations/pytorch_ddp/test/test-generic.py @@ -49,7 +49,7 @@ rank = 0 size = 0 -x = 5000 +x = 1024 y = 1 seed = 48 @@ -118,7 +118,7 @@ def test_broadcast(numel, testtype): measured_time = (end_time - start_time) * 1000000 - print("pytorch_Broadcast_" + str(x.nbytes) + " durationUs: " + str(measured_time), file=sys.stderr) + print(str(rank) + "_pytorch_Broadcast_" + str(x.nbytes) + " durationUs: " + str(measured_time), file=sys.stderr) logger.debug("Directly measured time us 1:" + str(measured_time)) @@ -164,8 +164,10 @@ def test_broadcast_2(): logger.debug("Test broadcast finished!") -def test_sendrcv(): +def test_sendrcv(numel): global num_errors + + shape = (numel,) x = torch.full(shape, float(rank)) y = torch.empty(shape) @@ -175,13 +177,34 @@ def test_sendrcv(): with torch.profiler.record_function("test_sendrcv"): - if rank % 2: + mpi.Barrier() + start_time = time.perf_counter() dist.send(x, next_rank) + end_time = time.perf_counter() + measured_time = (end_time - start_time) * 1000000 + print(str(rank) + "_pytorch_Send_" + str(x.nbytes) + " durationUs: " + str(measured_time), file=sys.stderr) + + mpi.Barrier() + start_time = time.perf_counter() dist.recv(y, prev_rank) + end_time = time.perf_counter() + measured_time = (end_time - start_time) * 1000000 + print(str(rank) + "_pytorch_Recv_" + str(x.nbytes) + " durationUs: " + str(measured_time), file=sys.stderr) else: + mpi.Barrier() + start_time = time.perf_counter() dist.recv(y, prev_rank) + end_time = time.perf_counter() + measured_time = (end_time - start_time) * 1000000 + print(str(rank) + "_pytorch_Recv_" + str(x.nbytes) + " durationUs: " + str(measured_time), file=sys.stderr) + + mpi.Barrier() + start_time = time.perf_counter() dist.send(x, next_rank) + end_time = time.perf_counter() + measured_time = (end_time - start_time) * 1000000 + print(str(rank) + "_pytorch_Send_" + str(x.nbytes) + " durationUs: " + str(measured_time), file=sys.stderr) mpi.Barrier() try: np.testing.assert_allclose(y, torch.full(shape, 
prev_rank)) @@ -193,18 +216,27 @@ def test_sendrcv(): logger.debug("Test Sendrcv finished!") -def test_scatter(): +def test_scatter(numel): global num_errors + + shape = (numel,) if rank == 0: x = [torch.full(shape, float(i+1)) for i in range(size)] else: x = None y = torch.full(shape, float(0)) + mpi.Barrier() + start_time = time.perf_counter() + with torch.profiler.record_function("test_scatter"): dist.scatter(y, x, 0) - mpi.Barrier() + + end_time = time.perf_counter() + measured_time = (end_time - start_time) * 1000000 + print(str(rank) + "_pytorch_Scatter_" + str(y.nbytes) + " durationUs: " + str(measured_time), file=sys.stderr) + try: np.testing.assert_allclose(y, torch.full(shape, float(rank+1))) except AssertionError as e: @@ -216,8 +248,10 @@ def test_scatter(): -def test_gather(): +def test_gather(numel): global num_errors + + shape = (numel,) x = torch.full(shape, float(rank)) if rank == 0: @@ -225,10 +259,17 @@ def test_gather(): else: y = None + mpi.Barrier() + start_time = time.perf_counter() + with torch.profiler.record_function("test_gather"): dist.gather(x, y, 0) - mpi.Barrier() + + end_time = time.perf_counter() + measured_time = (end_time - start_time) * 1000000 + print(str(rank) + "_pytorch_Gather_" + str(x.nbytes) + " durationUs: " + str(measured_time), file=sys.stderr) + if rank == 0: for i, c in enumerate(y): try: @@ -241,22 +282,32 @@ def test_gather(): logger.debug("Test Gather finished!") -def test_allgather(): +def test_allgather(numel, testtype): global num_errors - testtype = torch.int64 - shape_gather = (1,) + + shape = (numel,) if testtype == torch.int64 or testtype == torch.int32: - rand_torch = torch.randint(torch.iinfo(testtype).min, torch.iinfo(testtype).max,shape_gather, dtype=testtype) + rand_torch = torch.randint(torch.iinfo(testtype).min, torch.iinfo(testtype).max,shape, dtype=testtype) else: - rand_torch = torch.rand(shape_gather, dtype=testtype) + rand_torch = torch.rand(shape, dtype=testtype) x = rand_torch.clone() - y = [torch.full(shape_gather, 0, dtype=testtype) for _ in range(size)] + y = [torch.full(shape, 0, dtype=testtype) for _ in range(size)] + mpi.Barrier() + start_time = time.perf_counter() + + print('len y:' + str(len(y))) + with torch.profiler.record_function("test_allgather"): + dist.all_gather(y, x) + + end_time = time.perf_counter() + measured_time = (end_time - start_time) * 1000000 + print(str(rank) + "_pytorch_Allgather_" + str(x.nbytes) + " durationUs: " + str(measured_time), file=sys.stderr) + mpi.Barrier() + - dist.all_gather(y, x) - mpi.Barrier() for i, c in enumerate(y): try: np.testing.assert_allclose(c, rand_torch) @@ -269,14 +320,24 @@ def test_allgather(): -def test_reduce(): +def test_reduce(numel): global num_errors + + + shape = (numel,) x = torch.ones(shape) + mpi.Barrier() + start_time = time.perf_counter() with torch.profiler.record_function("test_reduce"): dist.reduce(x, 0, dist.ReduceOp.SUM) - mpi.Barrier() + mpi.Barrier() + + end_time = time.perf_counter() + measured_time = (end_time - start_time) * 1000000 + print(str(rank) + "_pytorch_Reduce_" + str(x.nbytes) + " durationUs: " + str(measured_time), file=sys.stderr) + if rank == 0: try: np.testing.assert_allclose(x, torch.full(shape, float(size))) @@ -316,10 +377,8 @@ def test_allreduce(numel, testtype): dist.all_reduce(x, dist.ReduceOp.SUM) end_time = time.perf_counter() - measured_time = (end_time - start_time) * 1000000 - - print("pytorch_Allreduce_" + str(x.nbytes) + " durationUs: " + str(measured_time), file=sys.stderr) + print(str(rank) + 
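Each test_* function above repeats the same measurement dance by hand: barrier, perf_counter around the collective, then a stderr line in the format the plot scripts parse. A possible consolidation, shown only as a sketch (timed_collective is a hypothetical helper, not part of the test script; it uses the module-level mpi and rank the way the tests do):

    import sys
    import time

    def timed_collective(name, nbytes, fn):
        mpi.Barrier()                          # line all ranks up first
        start = time.perf_counter()
        fn()                                   # run the collective under test
        us = (time.perf_counter() - start) * 1000000
        # Same "<rank>_pytorch_<Coll>_<nbytes> durationUs: <t>" line format:
        print(str(rank) + "_pytorch_" + name + "_" + str(nbytes)
              + " durationUs: " + str(us), file=sys.stderr)

A call like timed_collective("Allreduce", x.nbytes, lambda: dist.all_reduce(x, dist.ReduceOp.SUM)) would then replace the inlined copies.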
"_pytorch_Allreduce_" + str(x.nbytes) + " durationUs: " + str(measured_time), file=sys.stderr) logger.debug("Directly measured time us 1:" + str(measured_time)) @@ -335,34 +394,40 @@ def test_allreduce(numel, testtype): logger.debug("Test AllReduce finished!") -def test_alltoall(): +def test_alltoall(numel): global num_errors # num_el = 26624 - # shape = (num_el,) + shape = (numel,) - input = torch.arange(num_el, dtype=torch.float) + float(rank) * num_el + input = torch.arange(numel, dtype=torch.float) + float(rank) * numel input_shaped = input.reshape(shape) - output = torch.ones(num_el) + output = torch.ones(numel) output_shaped = output.reshape(shape) + start_time = time.perf_counter() + with torch.profiler.record_function("test_alltoall"): dist.all_to_all_single(output_shaped, input_shaped) - mpi.Barrier() + end_time = time.perf_counter() - test = torch.zeros(num_el) + measured_time = (end_time - start_time) * 1000000 + + print(str(rank) + "_pytorch_AlltoAll_" + str(input.nbytes) + " durationUs: " + str(measured_time), file=sys.stderr) + + test = torch.zeros(numel) - section_size = int(num_el/size) + section_size = int(numel/size) for section in range(size): for el in range(section_size): - test[section * section_size + el] = float(rank) * section_size + section * num_el + el + test[section * section_size + el] = float(rank) * section_size + section * numel + el test_shaped = test.reshape(shape) try: @@ -460,6 +525,7 @@ def start_test(comms: str, simulator: bool, host_file: str=None, fpga_file: str= mp = "30505" os.environ['MASTER_ADDR'] = ma os.environ['MASTER_PORT'] = mp + rank = mpi.Get_rank() size = mpi.Get_size() start_port = 5005 @@ -527,14 +593,25 @@ def start_test(comms: str, simulator: bool, host_file: str=None, fpga_file: str= ) # with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], profile_memory=True, schedule=schedule, record_shapes=True) as prof: - for n in range(4,22): - for i in range(20): - test_broadcast(2**n, torch.float32) - test_allreduce(2**n, torch.float32) + + n = 19 + + if True: + for i in range(40): + num = 2**n * 3 + test_broadcast(num, torch.float32) + test_allreduce(num, torch.float32) + test_alltoall(num) + test_allgather(num, torch.float32) + test_sendrcv(num) + test_scatter(num) + test_gather(num) + test_reduce(num) + # prof.step() # for i in range(10): - # if True: + if False: # test_allreduce(256, torch.int32) # test_allreduce(256, torch.int64) # test_broadcast(256, torch.float32) @@ -542,30 +619,25 @@ def start_test(comms: str, simulator: bool, host_file: str=None, fpga_file: str= # test_allgather() # test_broadcast_2() - # test_broadcast(642, torch.int64) + test_broadcast(1024, torch.float32) # test_broadcast(25610152, torch.float32) # test_broadcast(53, torch.int64) # test_broadcast(53120, torch.float32) # test_broadcast(53, torch.int64) - # test_allreduce(25557032, torch.float32) + test_allreduce(1024, torch.float32) # test_broadcast(162, torch.int32) # test_broadcast(25, torch.int32) # test_broadcast(53120, torch.float32) # test_broadcast(53, torch.int64) # test_allreduce(2049000, torch.float32) # test_allreduce() - # test_allgather() # test_broadcast_segment() # test_broadcast() # test_broadcast() # test_broadcast() # test_broadcast() # test_broadcast() - # test_sendrcv() - # test_scatter() - # for i in range(10): - # test_gather() - # test_alltoall() + test_alltoall() # test_allreduce(1000, torch.float32) # test_allreduce(2052096, torch.float32) # test_allreduce(1049600, torch.float32) @@ -580,7 +652,6 @@ def start_test(comms: 
str, simulator: bool, host_file: str=None, fpga_file: str= # test_allreduce() # test_allreduce() - # test_reduce() # demo_basic(rank) From c8cbb74344d1916ac44198e76abea7e375513108 Mon Sep 17 00:00:00 2001 From: Laurent Wirz Date: Fri, 30 Aug 2024 18:50:53 +0200 Subject: [PATCH 55/64] plotting support for all collectives --- integrations/pytorch_ddp/test/plot_small.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/integrations/pytorch_ddp/test/plot_small.py b/integrations/pytorch_ddp/test/plot_small.py index 43ec456e..f1575671 100644 --- a/integrations/pytorch_ddp/test/plot_small.py +++ b/integrations/pytorch_ddp/test/plot_small.py @@ -4,7 +4,7 @@ import numpy as np import matplotlib.ticker as mticker -keywords = ["Broadcast", "Allreduce" ] +keywords = ["Broadcast", "Allreduce", "AlltoAll" ] # parts = ["lib","barrier","total"] parts = ["lib", "copy","init", "total", "device", "pytorch", "sleep"] parts_plot = ["init", "device", "lib_oh", "copy", "total_oh", "pytorch_oh"] @@ -23,8 +23,8 @@ lines = log_file.readlines() current_keyword = None -results = { "Broadcast": {}, "Allreduce": {}} -averages = { "Broadcast": {}, "Allreduce": {}} +results = { "Broadcast": {}, "Allreduce": {}, "AlltoAll": {}} +averages = { "Broadcast": {}, "Allreduce": {}, "AlltoAll": {}} # results = { "Broadcast": {}, "Allreduce": {}} @@ -70,8 +70,14 @@ count += 1 averages[op][part][cnt] = test_sum / count +sizes.sort() + for op, parts in averages.items(): for cnt in sizes: + if cnt == 32: + print(averages[op]['lib_oh']) + print(parts['lib']) + print(parts['device']) averages[op]['lib_oh'][cnt] = parts['lib'][cnt] - parts['device'][cnt] averages[op]['total_oh'][cnt] = parts['total'][cnt] - parts['sleep'][cnt] - parts['lib'][cnt] - parts['init'][cnt] - parts['copy'][cnt] averages[op]['pytorch_oh'][cnt] = parts['pytorch'][cnt] - (parts['total'][cnt]) @@ -82,14 +88,14 @@ averages[op].pop('pytorch') -sizes.sort() - av_lists = {} for word in keywords: av_lists[word] = {} for part in parts_plot: av_lists[word][part] = [] for size in sizes: + # if size == 32: + # continue av_lists[word][part].append(averages[word][part][size]) @@ -105,7 +111,7 @@ labels=av_lists[op].keys(), alpha=0.8) ax.legend(loc='upper left', reverse=True) plt.gca().set_xscale('log', base=2) - ax.set_title('Execution time composition' + op) + ax.set_title(op + ' Execution time composition') ax.set_xlabel('size[B]') ax.set_ylabel('Latency us') # add tick at every 200 million people From fcad6ca90aae2ac671ed866188b7abddcc063869 Mon Sep 17 00:00:00 2001 From: Laurent Wirz Date: Fri, 30 Aug 2024 19:00:17 +0200 Subject: [PATCH 56/64] attempt to fix resnet --- .../pytorch_ddp/test/test-resnet34.py | 253 ++++++++++++++++++ .../pytorch_ddp/test/test-resnet50.py | 2 + 2 files changed, 255 insertions(+) create mode 100644 integrations/pytorch_ddp/test/test-resnet34.py diff --git a/integrations/pytorch_ddp/test/test-resnet34.py b/integrations/pytorch_ddp/test/test-resnet34.py new file mode 100644 index 00000000..fa42963f --- /dev/null +++ b/integrations/pytorch_ddp/test/test-resnet34.py @@ -0,0 +1,253 @@ +import torch +import torchvision +from torchvision import datasets +from torchvision import models +from torchvision import transforms +from torchvision.transforms import ToTensor +from torch.utils.data import DataLoader +from torch.profiler import profile, ProfilerActivity +import torch.nn as nn +from torch import optim +from torch.autograd import Variable +import torch.distributed as dist +import accl_process_group as accl + 
+from mpi4py.MPI import COMM_WORLD as mpi +from torch.nn.parallel import DistributedDataParallel as DDP +from torch.utils.data.distributed import DistributedSampler + +import argparse +import os +import sys +import logging +import time + +logging.basicConfig(stream=sys.stderr, level=logging.DEBUG) + +logger = logging.getLogger(__name__) + +if "ACCL_DEBUG" in os.environ and os.environ["ACCL_DEBUG"]=="1": + logger.setLevel(logging.DEBUG) +else: + logger.setLevel(logging.WARNING) + +# Run via ACCL + +def train(num_epochs, model, loaders, criterion, p): + + start_time_train = time.perf_counter() + + model.train() + + total_step = len(loaders['train']) + + optimizer = optim.Adam(model.parameters(), lr = 0.001) + + for epoch in range(num_epochs): + model.train() + running_loss = 0.0 + for i, (inputs, labels) in enumerate(loaders['train']): + p.step() + start_time = time.perf_counter() + + optimizer.zero_grad() + outputs = model(inputs) + loss = criterion(outputs, labels) + loss.backward() + optimizer.step() + running_loss += loss.item() + + if (i+1) % 100 == 0: + break + if True: + end_time = time.perf_counter() + measured_time = (end_time - start_time) * 1000000 + print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Time(us): {}' + .format(epoch + 1, num_epochs, i + 1, total_step, loss.item(), measured_time)) + + end_time_train = time.perf_counter() + measured_time_train = (end_time_train - start_time_train) * 1000000 + + print('Total train time: ' + str(measured_time_train)) + + +def test(num_epochs, model, loaders, criterion, p): + # Test the model + start_time_test = time.perf_counter() + model.eval() + with torch.no_grad(): + correct = 0 + total = 0 + val_loss = 0 + for i, (inputs, labels) in enumerate(loaders['test']): + p.step() + test_output = model(inputs) + loss = criterion(test_output, labels) + val_loss += loss.item() + + _, predicted = torch.max(test_output, 1) + correct_current = (predicted == labels).sum().item() + total += labels.size(0) + correct += correct_current + + print(f'Test Batch accuracy: {correct_current}/{labels.size(0)} {correct_current/float(labels.size(0))}') + + + end_time_test = time.perf_counter() + measured_time_test = (end_time_test - start_time_test) * 1000000 + + print('Total test time: ' + str(measured_time_test)) + print(f'Total accuracy: {correct}/{total} {correct/float(total)}') + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + + parser.add_argument("-n", type=int, default=1) + parser.add_argument("-d", type=bool, default=None) + + + parser.add_argument('-s', '--simulator', action='store_true', + default=False, help='Use simulation instead of ' + 'hardware') + parser.add_argument('-c', '--comms', choices=['udp', 'tcp', 'cyt_rdma'], default='tcp', + help='Run tests over specified communication backend') + parser.add_argument('-i', '--host-file', type=str, help='Specify the file, where the host IPs are listed') + parser.add_argument('-f', '--fpga-file', type=str, help='Specify the file, where the FPGA IPs are listed') + parser.add_argument('-a','--master-address', type=str) + parser.add_argument('-p','--master-port', type=str) + + + args = parser.parse_args() + + if args.n == 1 and args.d == None : + print("only one machine specified. 
Assuming Non distributed setup") + args.d = False + elif args.n > 1 and args.d == None: + print("Assuming DDP setup") + args.d = True + + + host_file = args.host_file + fpga_file = args.fpga_file + comms = args.comms + start_port = 5005 + + global rank, size + if args.master_address==None: + args.master_address = "localhost" + if args.master_port==None: + args.master_port = "30505" + os.environ['MASTER_ADDR'] = args.master_address + os.environ['MASTER_PORT'] = args.master_port + rank = mpi.Get_rank() + size = mpi.Get_size() + + rxbufsize = 4096 * 1024 + + if args.d: + if not args.simulator: + #default from test.cpp + rxbufsize = 4096 * 1024 + if host_file==None or fpga_file==None: sys.exit('Host and FPGA file need to be specified in hardware mode') + + with open(host_file, 'r') as hf: + host_ips = hf.read().splitlines() + + with open(fpga_file, 'r') as ff: + fpga_ips = ff.read().splitlines() + + if comms == "cyt_rdma": + ranks = [accl.Rank(a, start_port, i, rxbufsize) for i, a in enumerate(fpga_ips)] + else: + ranks = [accl.Rank(a, start_port + i, 0, rxbufsize) for i, a in enumerate(fpga_ips)] + else: + # Somehow the simulator gets stuck if I use the same rxbufsize + rxbufsize = 4096 * 1024 + ranks = [accl.Rank("127.0.0.1", 5500 + i, i, rxbufsize) for i in range(size)] + + logger.debug(f'Ranks: {ranks}') + + if args.comms == 'udp': + design = accl.ACCLDesign.udp + elif args.comms == 'tcp': + design = accl.ACCLDesign.tcp + elif args.comms == 'cyt_rdma': # and not simulator: + design = accl.ACCLDesign.cyt_rdma + + + mpi.Barrier() + + accl.create_process_group(ranks, design, bufsize=rxbufsize, initialize=True, simulation=args.simulator) + dist.init_process_group("ACCL", rank=rank, world_size=size) + + device = 'cpu' + + transform = transforms.Compose([ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + transforms.Normalize( + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225] + ) + ]) + + train_dataset = datasets.CIFAR10(root='cifar10_data', train=True, download=True, transform=transform) + val_dataset = datasets.CIFAR10(root='cifar10_data', train=False, download=True, transform=transform) + + if args.d : sampler = DistributedSampler + else : sampler = lambda x : None + + loaders = { + 'train' : torch.utils.data.DataLoader(train_dataset, + batch_size=32, + shuffle=False, + num_workers=4, + sampler=sampler(train_dataset)), + 'test' : torch.utils.data.DataLoader(val_dataset, + batch_size=32, + shuffle=False, + num_workers=4, + sampler=sampler(val_dataset)), + } + + model = models.resnet34(pretrained=True) + + if args.d : model = DDP(model, bucket_cap_mb=2, broadcast_buffers=True, find_unused_parameters=True) + + loss_func = nn.CrossEntropyLoss() + + criterion = nn.CrossEntropyLoss() + + num_epochs = 1 + + mpi.Barrier() + + print("starting training") + + schedule = torch.profiler.schedule( + wait=1, + warmup=1, + active=10, + repeat=3 + ) + + + with torch.profiler.profile( + activities=[torch.profiler.ProfilerActivity.CPU], + schedule=schedule, + on_trace_ready=torch.profiler.tensorboard_trace_handler('./accl_log/profiler_log'), + record_shapes=True, + ) as p: + + + train(num_epochs, model, loaders, criterion, p) + + test(num_epochs, model, loaders, criterion, p) + + p.stop() + + print(p.key_averages().table(sort_by="self_cpu_time_total", row_limit=100)) + + if args.d : dist.destroy_process_group() diff --git a/integrations/pytorch_ddp/test/test-resnet50.py b/integrations/pytorch_ddp/test/test-resnet50.py index ec74d469..efb4daac 100644 --- 
a/integrations/pytorch_ddp/test/test-resnet50.py +++ b/integrations/pytorch_ddp/test/test-resnet50.py @@ -244,6 +244,8 @@ def test_broadcast(numel, testtype): accl.create_process_group(ranks, design, bufsize=rxbufsize, initialize=True, simulation=args.simulator) dist.init_process_group("ACCL", rank=rank, world_size=size) + # dist.init_process_group("mpi", rank=rank, world_size=size) + test_allreduce(256, torch.float32) test_broadcast(256, torch.float32) From 9185e605ee5874eb5410b0063b32a9ab2ec19f75 Mon Sep 17 00:00:00 2001 From: Laurent Wirz Date: Sat, 31 Aug 2024 13:28:19 +0200 Subject: [PATCH 57/64] fixed segmentation bug --- integrations/pytorch_ddp/src/ProcessGroupACCL.cpp | 14 +++++++------- integrations/pytorch_ddp/test/run.sh | 4 ++-- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp index 5babe9eb..56ce37ab 100644 --- a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp +++ b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp @@ -1350,7 +1350,7 @@ ProcessGroupACCL::reduce(std::vector &tensors, if (tensor.nbytes() > bufsize) { size_t n = bufsize / tensor.itemsize(); for (size_t i = 0; i < tensor.numel(); i += n) { - size_t end = std::min(i + n, static_cast(tensor.numel())); + size_t end = std::min(n, static_cast(tensor.numel()) - i); run_reduce(tensor.narrow(0, i, end), opts); } } else { @@ -1439,7 +1439,7 @@ ProcessGroupACCL::allgather(std::vector> &outputTensors, size_t n = bufsize / srctensor.itemsize() / non_zero_dim_count; for (size_t i = 0; i < srctensor.size(0); i += n) { size_t end = - std::min(i + n, static_cast(srctensor.size(0))); + std::min(n, static_cast(srctensor.size(0) - i)); std::vector dsttensorslices; dsttensorslices.reserve(dsttensors.size()); for (auto &dsttensor : dsttensors) { @@ -1556,7 +1556,7 @@ ProcessGroupACCL::gather(std::vector> &outputTensors, ACCL::debug("[Gather] Segmenting tensor of size " + std::to_string(srctensor.nbytes()) + " into " + std::to_string(n * non_zero_dim_count) + "-sized elements "); for (size_t i = 0; i < srctensor.size(0); i += n) { size_t end = - std::min(i + n, static_cast(srctensor.size(0))); + std::min(n, static_cast(srctensor.size(0)) - i); std::vector dsttensorslices; dsttensorslices.reserve(dsttensors.size()); for (auto &dsttensor : dsttensors) { @@ -1670,7 +1670,7 @@ ProcessGroupACCL::scatter(std::vector &outputTensors, for (size_t i = 0; i < dsttensor.size(0); i += n) { ACCL::debug("part " + std::to_string(i) + "!"); size_t end = - std::min(i + n, static_cast(dsttensor.size(0))); + std::min(n, static_cast(dsttensor.size(0)) - i); std::vector srctensorslices; srctensorslices.reserve(srctensors.size()); for (auto &srctensor : srctensors) { @@ -1871,7 +1871,7 @@ ProcessGroupACCL::send(std::vector &tensors, int dstRank, int tag) { if (tensor.nbytes() > bufsize) { size_t n = bufsize / tensor.itemsize(); for (size_t i = 0; i < tensor.numel(); i += n) { - size_t end = std::min(i + n, static_cast(tensor.numel())); + size_t end = std::min(n, static_cast(tensor.numel()) - i); run_send(tensor.narrow(0, i, end), dstRank, tag); } } else { @@ -1917,7 +1917,7 @@ ProcessGroupACCL::recv(std::vector &tensors, int srcRank, int tag) { if (tensor.nbytes() > bufsize) { size_t n = bufsize / tensor.itemsize(); for (size_t i = 0; i < tensor.numel(); i += n) { - size_t end = std::min(i + n, static_cast(tensor.numel())); + size_t end = std::min(n, static_cast(tensor.numel()) - i); run_recv(tensor.narrow(0, i, end), srcRank, tag); } } else 
{ @@ -1941,7 +1941,7 @@ ProcessGroupACCL::recvAnysource(std::vector &tensors, int tag) { c10::intrusive_ptr ProcessGroupACCL::barrier(const BarrierOptions &opts) { - TORCH_CHECK(false, "ProcessGroupACCL does not support barrier"); + accl->barrier(); } c10::intrusive_ptr diff --git a/integrations/pytorch_ddp/test/run.sh b/integrations/pytorch_ddp/test/run.sh index 78688b9d..6482f9d1 100755 --- a/integrations/pytorch_ddp/test/run.sh +++ b/integrations/pytorch_ddp/test/run.sh @@ -10,8 +10,8 @@ if [[ -v ACCL_SCRIPT ]]; then SCRIPT_NAME="$ACCL_SCRIPT" else # SCRIPT_NAME="test-mnist.py -d True -n 2" # MNIST - SCRIPT_NAME="test-resnet50.py -d True -n 2" # MNIST - # SCRIPT_NAME=test-generic.py + # SCRIPT_NAME="test-resnet50.py -d True -n 2" # MNIST + SCRIPT_NAME=test-generic.py # SCRIPT_NAME="test-imagenet.py -d True" echo "Variable ACCL_SCRIPT not set. Assuming $SCRIPT_NAME" fi From 3424514935fbd8b20d1d9725c3afbcf6ae19dd52 Mon Sep 17 00:00:00 2001 From: Laurent Wirz Date: Sat, 31 Aug 2024 18:28:27 +0200 Subject: [PATCH 58/64] Added measurements to all collectives --- .../pytorch_ddp/src/ProcessGroupACCL.cpp | 65 ++++++++++++++++--- 1 file changed, 55 insertions(+), 10 deletions(-) diff --git a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp index 56ce37ab..95b9183c 100644 --- a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp +++ b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp @@ -1323,19 +1323,26 @@ ProcessGroupACCL::allreduce_coalesced(std::vector &tensors, void ProcessGroupACCL::run_reduce(at::Tensor in_tensor, const ReduceOptions &opts) { - init_input_tensor(in_tensor, in_buf, true, true); + + START_FINE(init) + init_input_tensor(in_tensor, in_buf, true, true); + STOP_FINE(init, in_tensor.nbytes()) // Reserve device c10::DeviceGuard guard(in_tensor.device()); std::unique_lock globalLock(pgGlobalMutex_); - PRE_REQUEST(Reduce,in_tensor) + PRE_REQUEST(Reduce,in_tensor) + + int rounded_count = (in_tensor.numel() + ROUND_NR) & ~ROUND_NR; - auto req = accl->reduce(*in_buf, *out_buf, in_tensor.numel(), opts.rootRank, acclOp.at(opts.reduceOp)); + auto req = accl->reduce(*in_buf, *out_buf, rounded_count, opts.rootRank, acclOp.at(opts.reduceOp)); POST_REQUEST("reduce", in_tensor.nbytes()) + START_FINE(copy) copy_back_tensor(in_tensor, out_buf, true, false, opts.rootRank); + STOP_FINE(copy, in_tensor.nbytes()) } c10::intrusive_ptr @@ -1345,6 +1352,7 @@ ProcessGroupACCL::reduce(std::vector &tensors, std::function &)> runFunc = [opts, this](std::unique_ptr &entry) { + START_COARSE(total) auto tensor = (entry->src)[0]; // Segment data if necessary if (tensor.nbytes() > bufsize) { @@ -1356,6 +1364,7 @@ ProcessGroupACCL::reduce(std::vector &tensors, } else { run_reduce(tensor, opts); } + STOP_COARSE(total, ((entry->src)[0]).nbytes()) }; auto entry = std::make_unique(&tensors, &tensors, std::move(runFunc)); @@ -1372,13 +1381,18 @@ void ProcessGroupACCL::run_allgather( at::Tensor dsttensor; + START_FINE(init) init_input_tensor(in_tensor, in_buf, true, true); // Reserve device + + init_output_tensor(in_tensor, dsttensor, out_buf, size_, in_tensor.scalar_type(), true, true); + STOP_FINE(init, in_tensor.nbytes()) + + c10::DeviceGuard guard(in_tensor.device()); std::unique_lock globalLock(pgGlobalMutex_); - init_output_tensor(in_tensor, dsttensor, out_buf, size_, in_tensor.scalar_type(), true, true); - + PRE_REQUEST(Allgather,in_tensor) int rounded_count = (in_tensor.numel() + 1023) & ~1023; @@ -1386,8 +1400,10 @@ void ProcessGroupACCL::run_allgather( auto 
req = accl->allgather(*in_buf, *out_buf, rounded_count); POST_REQUEST("allgather", in_tensor.nbytes()) - + + START_FINE(copy) copy_back_tensorvec(dsttensorvec, out_buf, dsttensor, in_tensor.numel(), rounded_count, true, true); + STOP_FINE(copy, in_tensor.nbytes()) } @@ -1431,6 +1447,7 @@ ProcessGroupACCL::allgather(std::vector> &outputTensors, outputDataVec[i].copy_(flatOutputTensor[i]); } #else + START_COARSE(total) auto srctensor = (entry->src)[0]; auto &dsttensors = entry->dst; // Segment data if necessary @@ -1450,6 +1467,7 @@ ProcessGroupACCL::allgather(std::vector> &outputTensors, } else { run_allgather(srctensor, dsttensors); } + STOP_COARSE(total, ((entry->src)[0]).nbytes()) #endif }; auto entry = std::make_unique(&inputTensors, &outputTensors[0], @@ -1476,18 +1494,23 @@ void ProcessGroupACCL::run_gather(at::Tensor in_tensor, // Reserve device c10::DeviceGuard guard(in_tensor.device()); std::unique_lock globalLock(pgGlobalMutex_); + + START_FINE(init) init_input_tensor(in_tensor, in_buf, true, true); init_output_tensor(in_tensor, dsttensor, out_buf, size_, in_tensor.scalar_type(), true, false, opts.rootRank); + STOP_FINE(init, in_tensor.nbytes()) + PRE_REQUEST(Gather, in_tensor) auto req = accl->gather(*in_buf, *out_buf, in_tensor.numel(), opts.rootRank); POST_REQUEST("gather", in_tensor.nbytes()) - + START_FINE(copy) copy_back_tensorvec(dsttensorvec, out_buf, dsttensor, in_tensor.numel(), in_tensor.numel(), true, false, opts.rootRank); + STOP_FINE(copy, in_tensor.nbytes()) } @@ -1547,6 +1570,7 @@ ProcessGroupACCL::gather(std::vector> &outputTensors, } } #else + START_COARSE(total) auto srctensor = (entry->src)[0]; auto &dsttensors = entry->dst; // Segment data if necessary @@ -1567,6 +1591,7 @@ ProcessGroupACCL::gather(std::vector> &outputTensors, } else { run_gather(srctensor, dsttensors, opts); } + STOP_COARSE(total, ((entry->src)[0]).nbytes()) #endif }; @@ -1594,18 +1619,24 @@ void ProcessGroupACCL::run_scatter(std::vector &in_tensor_vec, c10::DeviceGuard guard(out_tensor.device()); std::unique_lock globalLock(pgGlobalMutex_); + + START_FINE(init) init_input_data_vec(in_tensor_vec, in_buf, out_tensor.options().device(c10::DeviceType::CPU), true, false, opts.rootRank); init_output_tensor(out_tensor, dsttensor, out_buf, 0, out_tensor.scalar_type(), true, true, opts.rootRank); - PRE_REQUEST(Scatter, dsttensor) + STOP_FINE(init, out_tensor.nbytes()) + + PRE_REQUEST(Scatter, out_tensor) // Run scatter auto req = accl->scatter(*in_buf, *out_buf, out_tensor.numel(), opts.rootRank); POST_REQUEST("scatter", out_tensor.nbytes()) + START_FINE(copy) copy_back_tensor(out_tensor, out_buf, true, true, opts.rootRank); + STOP_FINE(copy, out_tensor.nbytes()) } c10::intrusive_ptr @@ -1661,10 +1692,11 @@ ProcessGroupACCL::scatter(std::vector &outputTensors, opts.rootRank, MPI_COMM_WORLD)); #else + START_COARSE(total) auto &srctensors = entry->src; auto dsttensor = (entry->dst)[0]; // Segment data if necessary - if (dsttensor.nbytes() > bufsize / 4) { + if (dsttensor.nbytes() > bufsize) { size_t non_zero_dim_count = dsttensor.numel() / dsttensor.size(0); size_t n = bufsize / 4 / dsttensor.itemsize() / non_zero_dim_count; for (size_t i = 0; i < dsttensor.size(0); i += n) { @@ -1681,6 +1713,7 @@ ProcessGroupACCL::scatter(std::vector &outputTensors, } else { run_scatter(srctensors, dsttensor, opts); } + STOP_COARSE(total, ((entry->src)[0]).nbytes()) #endif }; @@ -1850,13 +1883,17 @@ void ProcessGroupACCL::run_send(at::Tensor in_tensor, int dstRank, c10::DeviceGuard guard(in_tensor.device()); 
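// A sketch of the timing macros this patch relies on: START_FINE(step) and
// STOP_FINE(step, nbytes) wrap a code section in std::chrono timestamps and
// emit a line like "init_Gather_4096 durationUs: 12.3" through ACCL::debug(),
// which is the format the log-parsing plot script keys on; START_COARSE /
// STOP_COARSE do the same around a whole WorkEntry. The definitions live at
// the top of ProcessGroupACCL.cpp (they are visible again where PATCH 64
// deletes them); the template arguments below are inferred, so treat this as
// an approximation:
//
//   #define START_FINE(name) \
//     auto start_##name = std::chrono::high_resolution_clock::now();
//
//   #define STOP_FINE(name, accl_nbytes) \
//     auto end_##name = std::chrono::high_resolution_clock::now(); \
//     double durationUs_##name = \
//         std::chrono::duration_cast<std::chrono::nanoseconds>( \
//             end_##name - start_##name).count() / 1000.0; \
//     ACCL::debug(#name "_" + std::string(x_MAKE_STRING(COLL_NAME)) + "_" + \
//                 std::to_string(accl_nbytes) + \
//                 " durationUs: " + std::to_string(durationUs_##name));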
std::unique_lock globalLock(pgGlobalMutex_); + START_FINE(init) init_input_tensor(in_tensor, in_buf, true, true); + STOP_FINE(init, in_tensor.nbytes()) PRE_REQUEST(Send,in_tensor) ACCL::ACCLRequest* req = accl->send(*in_buf, in_tensor.numel(), dstRank, tag); POST_REQUEST("send", in_tensor.nbytes()) + + ACCL::debug("copy_" + std::string(x_MAKE_STRING(COLL_NAME)) + "_" + std::to_string(in_tensor.nbytes()) + " durationUs: " + std::to_string(0)); } @@ -1866,6 +1903,7 @@ ProcessGroupACCL::send(std::vector &tensors, int dstRank, int tag) { std::function &)> runFunc = [dstRank, tag, this](std::unique_ptr &entry) { + START_COARSE(total) at::Tensor &tensor = (entry->src)[0]; // Segment data if necessary if (tensor.nbytes() > bufsize) { @@ -1877,6 +1915,7 @@ ProcessGroupACCL::send(std::vector &tensors, int dstRank, int tag) { } else { run_send(tensor, dstRank, tag); } + STOP_COARSE(total, ((entry->src)[0]).nbytes()) }; auto entry = @@ -1892,6 +1931,8 @@ void ProcessGroupACCL::run_recv(at::Tensor out_tensor, int srcRank, int tag) { // Reserve device + + ACCL::debug("init_" + std::string(x_MAKE_STRING(COLL_NAME)) + "_" + std::to_string(out_tensor.nbytes()) + " durationUs: " + std::to_string(0)); c10::DeviceGuard guard(out_tensor.device()); std::unique_lock globalLock(pgGlobalMutex_); @@ -1903,7 +1944,9 @@ void ProcessGroupACCL::run_recv(at::Tensor out_tensor, int srcRank, POST_REQUEST("recv", out_tensor.nbytes()) - copy_back_tensor(out_tensor, out_buf, true, true); + START_FINE(copy) + copy_back_tensor(out_tensor, out_buf, true, true); + STOP_FINE(copy, out_tensor.nbytes()) } c10::intrusive_ptr @@ -1913,6 +1956,7 @@ ProcessGroupACCL::recv(std::vector &tensors, int srcRank, int tag) { std::function &)> runFunc = [srcRank, tag, this](std::unique_ptr &entry) { const at::Tensor &tensor = (entry->dst)[0]; + START_COARSE(total) // Segment data if necessary if (tensor.nbytes() > bufsize) { size_t n = bufsize / tensor.itemsize(); @@ -1923,6 +1967,7 @@ ProcessGroupACCL::recv(std::vector &tensors, int srcRank, int tag) { } else { run_recv(tensor, srcRank, tag); } + STOP_COARSE(total, ((entry->src)[0]).nbytes()) }; auto entry = From 9c4ef138ad893ea738b4a98926270d652f9b07ca Mon Sep 17 00:00:00 2001 From: Laurent Wirz Date: Sun, 1 Sep 2024 12:04:02 +0200 Subject: [PATCH 59/64] Attempt to replace tensor copies --- .../pytorch_ddp/src/ProcessGroupACCL.cpp | 92 +++---------------- integrations/pytorch_ddp/test/test-generic.py | 11 ++- 2 files changed, 21 insertions(+), 82 deletions(-) diff --git a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp index 95b9183c..6cdc111c 100644 --- a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp +++ b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp @@ -769,22 +769,23 @@ void ProcessGroupACCL::init_input_tensor(at::Tensor &tensor, std::unique_ptr &tensor_vec, std::unique_ptr &data, const at::TensorOptions &options, bool do_on_root, bool do_on_others, int opts_root_rank) { if DO_COND { int64_t tens_size = static_cast(tensor_vec[0].numel()); - int64_t total_size = tens_size * static_cast(size_); - std::vector sizes = tensor_vec[0].sizes().vec(); + // int64_t total_size = tens_size * static_cast(size_); + // std::vector sizes = tensor_vec[0].sizes().vec(); // Prepend another dimension for vector length - sizes.insert(sizes.begin(), tensor_vec.size()); + // sizes.insert(sizes.begin(), tensor_vec.size()); // ACCL::debug("Copying data to CPU tensor of size " + std::to_string(total_size)); - at::Tensor wrapper_tensor = 
torch::from_blob(data->byte_array(), sizes, options); + // at::Tensor wrapper_tensor = torch::from_blob(data->byte_array(), sizes, options); for (const auto i : c10::irange(tensor_vec.size())) { - if (p2p_applicable(*accl, tensor_vec[0], p2p_enabled)) { - auto slice = data->slice(i * tens_size, (i + 1) * tens_size); - copy_to_p2p_buffer(*slice, tensor_vec[i]); - } else { - auto slice = wrapper_tensor[i]; - slice.copy_(tensor_vec[i]); - } + std::memcpy(data->byte_array() + i * tens_size * tensor_vec[0].element_size(), tensor_vec[i].data_ptr(), tens_size * tensor_vec[0].element_size()); + // if (p2p_applicable(*accl, tensor_vec[0], p2p_enabled)) { + // auto slice = data->slice(i * tens_size, (i + 1) * tens_size); + // copy_to_p2p_buffer(*slice, tensor_vec[i]); + // } else { + // auto slice = wrapper_tensor[i]; + // slice.copy_(tensor_vec[i]); + // } } if (!coyote_enabled) { data->sync_to_device(); @@ -794,52 +795,6 @@ void ProcessGroupACCL::init_input_tensor(at::Tensor &tensor, std::unique_ptr &dstdata, int out_tensor_size, c10::ScalarType type, bool do_on_root, bool do_on_others, int opts_root_rank) { - if DO_COND { - if (p2p_applicable(*accl, tensor_original, p2p_enabled)) { - dstdata = create_buffer_p2p(*accl, out_tensor_size, type); - } else { - if (coyote_enabled) { - dstdata = create_coyotebuffer(*accl, out_tensor_size, type); - } else { - dstdata = create_buffer(*accl, out_tensor_size, type); - } - } - } else { - dstdata = std::unique_ptr>(nullptr); - } -} - - void ProcessGroupACCL::init_output_tensor(const at::Tensor &tensor_original, at::Tensor &dsttensor, std::unique_ptr &dstdata, int num_tensors, c10::ScalarType type, bool do_on_root, bool do_on_others, int opts_root_rank) { - if DO_COND { - int64_t num_tensors_s = static_cast(num_tensors); - std::vector sizes = tensor_original.sizes().vec(); - int64_t total_size = static_cast(tensor_original.numel()); - if (num_tensors != 0) { - // Prepend another dimension for vector length - sizes.insert(sizes.begin(), num_tensors_s); - total_size = total_size * num_tensors_s; - } - - if (p2p_applicable(*accl, tensor_original, p2p_enabled)) { - dstdata = create_buffer_p2p(*accl, total_size, type); - } else if (coyote_enabled) { - // std::vector sizes = {static_cast(out_tensor_size)}; - dsttensor = torch::from_blob(dstdata->byte_array(), sizes, tensor_original.options().device(c10::DeviceType::CPU)); - // This should not be necessary: - // dsttensor.copy_(tensor_original); - } else { - dsttensor = torch::from_blob(dstdata->byte_array(), sizes, tensor_original.options().device(c10::DeviceType::CPU)); - // This should not be necessary: - // dsttensor.copy_(tensor_original); - } - } else { - dsttensor = at::Tensor(nullptr); - } -} - void ProcessGroupACCL::copy_back_tensor(at::Tensor tensor_original, std::unique_ptr &data, bool do_on_root, bool do_on_others, int opts_root_rank){ if DO_COND { if (!coyote_enabled) { @@ -1379,13 +1334,10 @@ void ProcessGroupACCL::run_allgather( const std::vector &dsttensorvec) { - at::Tensor dsttensor; - START_FINE(init) init_input_tensor(in_tensor, in_buf, true, true); // Reserve device - init_output_tensor(in_tensor, dsttensor, out_buf, size_, in_tensor.scalar_type(), true, true); STOP_FINE(init, in_tensor.nbytes()) @@ -1402,7 +1354,7 @@ void ProcessGroupACCL::run_allgather( POST_REQUEST("allgather", in_tensor.nbytes()) START_FINE(copy) - copy_back_tensorvec(dsttensorvec, out_buf, dsttensor, in_tensor.numel(), rounded_count, true, true); + copy_back_tensorvec(dsttensorvec, out_buf, in_tensor, in_tensor.numel(), 
rounded_count, true, true); STOP_FINE(copy, in_tensor.nbytes()) } @@ -1489,8 +1441,6 @@ c10::intrusive_ptr ProcessGroupACCL::allgather_coalesced( void ProcessGroupACCL::run_gather(at::Tensor in_tensor, const std::vector &dsttensorvec, const GatherOptions &opts) { - at::Tensor dsttensor; - // Reserve device c10::DeviceGuard guard(in_tensor.device()); std::unique_lock globalLock(pgGlobalMutex_); @@ -1499,8 +1449,6 @@ void ProcessGroupACCL::run_gather(at::Tensor in_tensor, init_input_tensor(in_tensor, in_buf, true, true); - init_output_tensor(in_tensor, dsttensor, out_buf, size_, in_tensor.scalar_type(), true, false, opts.rootRank); - STOP_FINE(init, in_tensor.nbytes()) PRE_REQUEST(Gather, in_tensor) @@ -1509,7 +1457,7 @@ void ProcessGroupACCL::run_gather(at::Tensor in_tensor, POST_REQUEST("gather", in_tensor.nbytes()) START_FINE(copy) - copy_back_tensorvec(dsttensorvec, out_buf, dsttensor, in_tensor.numel(), in_tensor.numel(), true, false, opts.rootRank); + copy_back_tensorvec(dsttensorvec, out_buf, in_tensor, in_tensor.numel(), in_tensor.numel(), true, false, opts.rootRank); STOP_FINE(copy, in_tensor.nbytes()) } @@ -1613,8 +1561,6 @@ ProcessGroupACCL::gather(std::vector> &outputTensors, void ProcessGroupACCL::run_scatter(std::vector &in_tensor_vec, at::Tensor out_tensor, const ScatterOptions &opts) { - at::Tensor dsttensor; - // Reserve device c10::DeviceGuard guard(out_tensor.device()); std::unique_lock globalLock(pgGlobalMutex_); @@ -1623,8 +1569,6 @@ void ProcessGroupACCL::run_scatter(std::vector &in_tensor_vec, START_FINE(init) init_input_data_vec(in_tensor_vec, in_buf, out_tensor.options().device(c10::DeviceType::CPU), true, false, opts.rootRank); - init_output_tensor(out_tensor, dsttensor, out_buf, 0, out_tensor.scalar_type(), true, true, opts.rootRank); - STOP_FINE(init, out_tensor.nbytes()) PRE_REQUEST(Scatter, out_tensor) @@ -1775,10 +1719,6 @@ void ProcessGroupACCL::run_alltoall(at::Tensor in_tensor, void ProcessGroupACCL::run_alltoall_vec(std::vector &in_tensor_vec, std::vector &out_tensor_vec, const AllToAllOptions &opts) { - at::Tensor dsttensor; - - // Reserve device - int a2a_nbytes = in_tensor_vec[0].nbytes(); c10::DeviceGuard guard(in_tensor_vec[0].device()); @@ -1786,15 +1726,13 @@ void ProcessGroupACCL::run_alltoall_vec(std::vector &in_tensor_vec, init_input_data_vec(in_tensor_vec, in_buf, out_tensor_vec[0].options().device(c10::DeviceType::CPU), true, true); - init_output_tensor(in_tensor_vec[0], dsttensor, out_buf, size_, in_tensor_vec[0].scalar_type(), true, true); - PRE_REQUEST(AlltoAll, in_tensor_vec[0]) auto req = accl->alltoall(*in_buf, *out_buf, in_tensor_vec[0].numel()); POST_REQUEST("alltoall", a2a_nbytes) - copy_back_tensorvec(out_tensor_vec, out_buf, dsttensor, in_tensor_vec[0].numel(), in_tensor_vec[0].numel(), true, true); + copy_back_tensorvec(out_tensor_vec, out_buf, in_tensor_vec[0], in_tensor_vec[0].numel(), in_tensor_vec[0].numel(), true, true); } diff --git a/integrations/pytorch_ddp/test/test-generic.py b/integrations/pytorch_ddp/test/test-generic.py index 7fa3ed1e..89b7c6d3 100644 --- a/integrations/pytorch_ddp/test/test-generic.py +++ b/integrations/pytorch_ddp/test/test-generic.py @@ -594,17 +594,18 @@ def start_test(comms: str, simulator: bool, host_file: str=None, fpga_file: str= # with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], profile_memory=True, schedule=schedule, record_shapes=True) as prof: - n = 19 + static_n = 10 if True: - for i in range(40): - num = 2**n * 3 + # for n in range(6,19): + for i in range(1): + num = 
2**static_n test_broadcast(num, torch.float32) test_allreduce(num, torch.float32) test_alltoall(num) test_allgather(num, torch.float32) - test_sendrcv(num) - test_scatter(num) + # test_sendrcv(num) + # test_scatter(num) test_gather(num) test_reduce(num) From ae9072a39798073492ca814188dd57368d44b834 Mon Sep 17 00:00:00 2001 From: Laurent Wirz Date: Wed, 11 Sep 2024 09:14:03 +0200 Subject: [PATCH 60/64] Cleaned up test-generic --- integrations/pytorch_ddp/test/test-generic.py | 124 ++++-------------- 1 file changed, 25 insertions(+), 99 deletions(-) diff --git a/integrations/pytorch_ddp/test/test-generic.py b/integrations/pytorch_ddp/test/test-generic.py index 7fa3ed1e..963044f2 100644 --- a/integrations/pytorch_ddp/test/test-generic.py +++ b/integrations/pytorch_ddp/test/test-generic.py @@ -62,29 +62,6 @@ rxbufsize = 4096 * 1024 -def test_broadcast_segment(): - global num_errors - shape_segment = (1024 * 1,) - if rank == 0: - x = torch.ones(shape_segment, dtype=torch.float) - else: - x = torch.zeros(shape_segment, dtype=torch.float) - - with torch.profiler.record_function("test bcast segmented"): - - dist.broadcast(x, 0) - - mpi.Barrier() - # logger.debug('Tensor after broadcast: ' + str(x)) - # print('Tensor after broadcast: ' + str(x)) - try: - np.testing.assert_allclose(x, torch.ones(shape_segment, dtype=torch.float)) - except AssertionError as e: - num_errors = num_errors + 1 - logger.debug("Test Broadcast failed") - logger.debug(str(e)) - else: - logger.debug("Test broadcast finished!") def test_broadcast(numel, testtype): shape = (numel,) @@ -120,15 +97,12 @@ def test_broadcast(numel, testtype): print(str(rank) + "_pytorch_Broadcast_" + str(x.nbytes) + " durationUs: " + str(measured_time), file=sys.stderr) - logger.debug("Directly measured time us 1:" + str(measured_time)) - mpi.Barrier() end_time = time.perf_counter() measured_time = (end_time - start_time) * 1000000 - logger.debug("Directly measured time us 2:" + str(measured_time)) try: np.testing.assert_allclose(x, rand_torch) @@ -139,31 +113,6 @@ def test_broadcast(numel, testtype): else: logger.debug("Test broadcast finished!") -def test_broadcast_2(): - test_type = torch.float - shape_2 = (1048576,) - global num_errors - if rank == 0: - x = torch.ones(shape_2, dtype=test_type) - else: - x = torch.zeros(shape_2, dtype=test_type) - - with torch.profiler.record_function("test bcast float prec"): - dist.broadcast(x, 0) - mpi.Barrier() - - # logger.debug('Tensor after broadcast: ' + str(x)) - # print('Tensor after broadcast: ' + str(x)) - try: - np.testing.assert_allclose(x, torch.ones(shape_2, dtype=test_type)) - except AssertionError as e: - num_errors = num_errors + 1 - logger.debug("Test Broadcast failed") - logger.debug(str(e)) - else: - logger.debug("Test broadcast finished!") - - def test_sendrcv(numel): global num_errors @@ -594,9 +543,8 @@ def start_test(comms: str, simulator: bool, host_file: str=None, fpga_file: str= # with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], profile_memory=True, schedule=schedule, record_shapes=True) as prof: - n = 19 - - if True: + # generic testing + for n in range(9,20) for i in range(40): num = 2**n * 3 test_broadcast(num, torch.float32) @@ -609,52 +557,30 @@ def start_test(comms: str, simulator: bool, host_file: str=None, fpga_file: str= test_reduce(num) # prof.step() - + + # to simulate resnet behaviour(check to make sure it's the same as in your resnet config) # for i in range(10): - if False: - # test_allreduce(256, torch.int32) - # test_allreduce(256, torch.int64) - 
# test_broadcast(256, torch.float32) - - # test_allgather() - - # test_broadcast_2() - test_broadcast(1024, torch.float32) - # test_broadcast(25610152, torch.float32) - # test_broadcast(53, torch.int64) - # test_broadcast(53120, torch.float32) - # test_broadcast(53, torch.int64) - test_allreduce(1024, torch.float32) - # test_broadcast(162, torch.int32) - # test_broadcast(25, torch.int32) - # test_broadcast(53120, torch.float32) - # test_broadcast(53, torch.int64) - # test_allreduce(2049000, torch.float32) - # test_allreduce() - # test_broadcast_segment() - # test_broadcast() - # test_broadcast() - # test_broadcast() - # test_broadcast() - # test_broadcast() - test_alltoall() - # test_allreduce(1000, torch.float32) - # test_allreduce(2052096, torch.float32) - # test_allreduce(1049600, torch.float32) - # test_broadcast(256 * 1024, torch.float32) - # test_allreduce(256 * 1024, torch.float32) - # test_broadcast(53, torch.int64) - # test_broadcast(53120, torch.float32) - # test_broadcast(53, torch.int64) - # test_broadcast(162, torch.int32) - # test_broadcast(25, torch.int32) - # test_allreduce(8196000, torch.float32) - # test_allreduce() - # test_allreduce() - - - - # demo_basic(rank) + test_resnet_sim = False + if test_resnet_sim: + test_allreduce(256, torch.int32) + test_allreduce(256, torch.int64) + test_broadcast(256, torch.float32) + for i in range(5): + test_allreduce(1000, torch.float32) + test_allreduce(2052096, torch.float32) + test_allreduce(1049600, torch.float32) + test_broadcast(256 * 1024, torch.float32) + test_allreduce(256 * 1024, torch.float32) + test_broadcast(53, torch.int64) + test_broadcast(53120, torch.float32) + test_broadcast(53, torch.int64) + test_broadcast(162, torch.int32) + test_broadcast(25, torch.int32) + test_allreduce(8196000, torch.float32) + + test_NN = False + if test_NN: + demo_basic(rank) mpi.Barrier() From 142a437b26488267489bea954007162c713a0af0 Mon Sep 17 00:00:00 2001 From: Laurent Wirz Date: Wed, 11 Sep 2024 09:26:21 +0200 Subject: [PATCH 61/64] Main code cleanup --- .../pytorch_ddp/src/ProcessGroupACCL.cpp | 60 ++----------------- 1 file changed, 4 insertions(+), 56 deletions(-) diff --git a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp index 56ce37ab..62074214 100644 --- a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp +++ b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp @@ -752,29 +752,21 @@ void accl_sa_handler(int) // TODO delete when not needed anymore void ProcessGroupACCL::init_input_tensor(at::Tensor &tensor, std::unique_ptr &data, bool do_on_root, bool do_on_others, int opts_root_rank) { if DO_COND { - // ACCL::debug("Copying data to CPU tensor of size " + std::to_string(tensor.numel())); - // at::Tensor wrapper_tensor = torch::from_blob(data->byte_array(), tensor.sizes(), tensor.options().device(c10::DeviceType::CPU)); - // wrapper_tensor.copy_(tensor); std::memcpy(data->byte_array(), tensor.data_ptr(), tensor.numel() * tensor.element_size()); - - //TODO check if necessary in coyote if (!coyote_enabled) { data->sync_to_device(); } } - // don't sync if no rank initializes, we will fill content and sync later } - +// This should also be implemented as a memory copy as the other ones. Check the branch pytorch_ddp_only_memcpy. 
maybe directly run on buffers without any tensor void ProcessGroupACCL::init_input_data_vec(std::vector &tensor_vec, std::unique_ptr &data, const at::TensorOptions &options, bool do_on_root, bool do_on_others, int opts_root_rank) { if DO_COND { int64_t tens_size = static_cast(tensor_vec[0].numel()); int64_t total_size = tens_size * static_cast(size_); std::vector sizes = tensor_vec[0].sizes().vec(); - // Prepend another dimension for vector length sizes.insert(sizes.begin(), tensor_vec.size()); - // ACCL::debug("Copying data to CPU tensor of size " + std::to_string(total_size)); at::Tensor wrapper_tensor = torch::from_blob(data->byte_array(), sizes, options); for (const auto i : c10::irange(tensor_vec.size())) { @@ -789,13 +781,10 @@ void ProcessGroupACCL::init_input_tensor(at::Tensor &tensor, std::unique_ptrsync_to_device(); } - // } else { - // data = std::unique_ptr>(nullptr); } } - // like init_output_tensor but without needlessly setting the tensor - // TODO: remove once all collectives reuse the buffer +// This should also be implemented as a memory copy as the other ones. Check the branch pytorch_ddp_only_memcpy void ProcessGroupACCL::init_output_data(at::Tensor &tensor_original, std::unique_ptr &dstdata, int out_tensor_size, c10::ScalarType type, bool do_on_root, bool do_on_others, int opts_root_rank) { if DO_COND { if (p2p_applicable(*accl, tensor_original, p2p_enabled)) { @@ -811,7 +800,7 @@ void ProcessGroupACCL::init_output_data(at::Tensor &tensor_original, std::unique dstdata = std::unique_ptr>(nullptr); } } - +// This function could and should be removed if we adapt the structure in the calls void ProcessGroupACCL::init_output_tensor(const at::Tensor &tensor_original, at::Tensor &dsttensor, std::unique_ptr &dstdata, int num_tensors, c10::ScalarType type, bool do_on_root, bool do_on_others, int opts_root_rank) { if DO_COND { int64_t num_tensors_s = static_cast(num_tensors); @@ -826,14 +815,7 @@ void ProcessGroupACCL::init_output_data(at::Tensor &tensor_original, std::unique if (p2p_applicable(*accl, tensor_original, p2p_enabled)) { dstdata = create_buffer_p2p(*accl, total_size, type); } else if (coyote_enabled) { - // std::vector sizes = {static_cast(out_tensor_size)}; dsttensor = torch::from_blob(dstdata->byte_array(), sizes, tensor_original.options().device(c10::DeviceType::CPU)); - // This should not be necessary: - // dsttensor.copy_(tensor_original); - } else { - dsttensor = torch::from_blob(dstdata->byte_array(), sizes, tensor_original.options().device(c10::DeviceType::CPU)); - // This should not be necessary: - // dsttensor.copy_(tensor_original); } } else { dsttensor = at::Tensor(nullptr); @@ -845,15 +827,7 @@ void ProcessGroupACCL::copy_back_tensor(at::Tensor tensor_original, std::unique_ if (!coyote_enabled) { data->sync_from_device(); } - if (p2p_applicable(*accl, tensor_original, p2p_enabled)) { - copy_back_p2p_buffer(*data, tensor_original); - } else { - // ACCL::debug("Copying data back from CPU tensor of size " + - // std::to_string(tensor_original.numel())); - std::memcpy(tensor_original.data_ptr(), data->byte_array(), tensor_original.numel() * tensor_original.element_size()); - // tensor_original.copy_(torch::from_blob(data->byte_array(), tensor_original.sizes(), tensor_original.options().device(c10::DeviceType::CPU))); - // ACCL::debug("Finished Copying "); - } + std::memcpy(tensor_original.data_ptr(), data->byte_array(), tensor_original.numel() * tensor_original.element_size()); } } @@ -863,15 +837,7 @@ void ProcessGroupACCL::copy_back_tensorvec(const 
std::vector &dstten data->sync_from_device(); } for (const auto i : c10::irange(dsttensorvec.size())) { - // TODO uncomment and correct - // if (p2p_applicable(*accl, dsttensorvec[0], p2p_enabled)) { - // auto slice = - // data->slice(i * numel, (i + 1) * numel); - // copy_back_p2p_buffer(*slice, dsttensorvec[i]); - // } else { std::memcpy(dsttensorvec[i].data_ptr(), data->byte_array() + i * offset * dsttensor.element_size(), numel * dsttensor.element_size()); - // dsttensorvec[i].copy_(dsttensor[i]); - // } } } } @@ -926,11 +892,7 @@ ProcessGroupACCL::ProcessGroupACCL( } else { throw std::runtime_error("Undefined ACCL design"); } - // create the two buffers, which are gonna be reused during calls - // We use float32, but they are gonna be filled arbitrarily - } - // use xrt else{ xrt_device = xrt::device(device_index); } @@ -1148,22 +1110,13 @@ void ProcessGroupACCL::run_broadcast(at::Tensor in_tensor, #else - // if (opts.rootRank != 0){ - // ACCL::debug("Can't run on non-zero root rank"); - // return; - // } - int rounded_count = (in_tensor.numel() + ROUND_NR) & ~ROUND_NR; - START_FINE(init) if (opts.rootRank == rank_){ init_input_tensor(in_tensor, in_buf, true, false, opts.rootRank); } - // else{ - // init_input_tensor(zero_tensor, in_buf, false, true, opts.rootRank); - // } STOP_FINE(init, in_tensor.nbytes()) @@ -1192,8 +1145,6 @@ ProcessGroupACCL::broadcast(std::vector &tensors, checkSingleTensor(tensors); std::function &)> runFunc = [opts, this](std::unique_ptr &entry) { - // std::cerr << "Starting Broadcast" << std::endl; - // if (((entry->src)[0]).numel() <= RDVZ_THRESHOLD || BROADCAST_SIDESTEP){ if (BROADCAST_SIDESTEP){ auto data = (entry->src)[0]; @@ -1271,8 +1222,6 @@ ProcessGroupACCL::allreduce(std::vector &tensors, std::function &)> runFunc = [opts, this](std::unique_ptr &entry) { - // sidestep eager allreduce - // if (((entry->src)[0]).numel() <= RDVZ_THRESHOLD || ALLREDUCE_SIDESTEP){ if (ALLREDUCE_SIDESTEP){ auto data = (entry->src)[0]; ACCL::debug("[Allreduce] -- Sidestepped using OpenMPI -- size " + std::to_string(data.numel())); @@ -1600,7 +1549,6 @@ void ProcessGroupACCL::run_scatter(std::vector &in_tensor_vec, PRE_REQUEST(Scatter, dsttensor) - // Run scatter auto req = accl->scatter(*in_buf, *out_buf, out_tensor.numel(), opts.rootRank); POST_REQUEST("scatter", out_tensor.nbytes()) From e95ac2208d891c6557fcf522baaeb75af1009cd7 Mon Sep 17 00:00:00 2001 From: Laurent Wirz Date: Wed, 11 Sep 2024 09:38:18 +0200 Subject: [PATCH 62/64] Removed old coyote initialization --- .../pytorch_ddp/include/coyote_init.hpp | 12 -- integrations/pytorch_ddp/setup.py | 1 - .../pytorch_ddp/src/ProcessGroupACCL.cpp | 2 - integrations/pytorch_ddp/src/coyote_init.cpp | 172 ------------------ 4 files changed, 187 deletions(-) delete mode 100644 integrations/pytorch_ddp/include/coyote_init.hpp delete mode 100644 integrations/pytorch_ddp/src/coyote_init.cpp diff --git a/integrations/pytorch_ddp/include/coyote_init.hpp b/integrations/pytorch_ddp/include/coyote_init.hpp deleted file mode 100644 index 61989a15..00000000 --- a/integrations/pytorch_ddp/include/coyote_init.hpp +++ /dev/null @@ -1,12 +0,0 @@ -#pragma once -#include - -#include - -namespace coyote_init { -void setup_cyt_rdma(std::vector &ibvQpConn_vec, - std::vector &ranks, int local_rank, - ACCL::CoyoteDevice &device); -void configure_cyt_rdma(std::vector &ibvQpConn_vec, - std::vector &ranks, int local_rank); -} // namespace coyote_init diff --git a/integrations/pytorch_ddp/setup.py b/integrations/pytorch_ddp/setup.py index 
5433a974..d7998451 100755 --- a/integrations/pytorch_ddp/setup.py +++ b/integrations/pytorch_ddp/setup.py @@ -52,7 +52,6 @@ library_dirs = [driver_dir / 'xrt' / 'lib', xrt_dir / 'lib', '/mnt/scratch/zhe/mpich/install/lib/libmpicxx.so'] libraries = ['accl', 'jsoncpp', 'zmq'] sources = [root / 'src' / 'ProcessGroupACCL.cpp', - root / 'src' / 'coyote_init.cpp', vnx_dir / 'src' / 'cmac.cpp', vnx_dir / 'src' / 'networklayer.cpp', accl_utils_dir / 'src' / 'accl_network_utils.cpp'] diff --git a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp index 62074214..46d2f66f 100644 --- a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp +++ b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp @@ -39,9 +39,7 @@ #include #include -#include "coyote_init.hpp" -namespace cyt = coyote_init; namespace py = pybind11; using namespace ACCL; diff --git a/integrations/pytorch_ddp/src/coyote_init.cpp b/integrations/pytorch_ddp/src/coyote_init.cpp deleted file mode 100644 index 8523cabf..00000000 --- a/integrations/pytorch_ddp/src/coyote_init.cpp +++ /dev/null @@ -1,172 +0,0 @@ -/***************************************************************************** - Copyright (C) 2023 Advanced Micro Devices, Inc - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
- -*****************************************************************************/ - -#include "coyote_init.hpp" -#include -#include -#include - -using namespace ACCL; - -namespace { -inline void swap_endianness(uint32_t *ip) { - uint8_t *ip_bytes = reinterpret_cast(ip); - *ip = (ip_bytes[3] << 0) | (ip_bytes[2] << 8) | (ip_bytes[1] << 16) | - (ip_bytes[0] << 24); -} - -uint32_t _ip_encode(std::string ip) { - struct sockaddr_in sa; - inet_pton(AF_INET, ip.c_str(), &(sa.sin_addr)); - swap_endianness(&sa.sin_addr.s_addr); - return sa.sin_addr.s_addr; -} - -std::string ip_decode(uint32_t ip) { - char buffer[INET_ADDRSTRLEN]; - struct in_addr sa; - sa.s_addr = ip; - swap_endianness(&sa.s_addr); - inet_ntop(AF_INET, &sa, buffer, INET_ADDRSTRLEN); - return std::string(buffer, INET_ADDRSTRLEN); -} - -void exchange_qp(unsigned int master_rank, unsigned int slave_rank, unsigned int local_rank, std::vector &ibvQpConn_vec, std::vector &ranks) -{ - - if (local_rank == master_rank) - { - std::cout<<"Local rank "<getQpairStruct()->local), sizeof(fpga::ibvQ), MPI_CHAR, slave_rank, 0, MPI_COMM_WORLD); - } - else if (local_rank == slave_rank) - { - std::cout<<"Local rank "<getQpairStruct()->remote = received_q; - } - - // Synchronize after the first exchange to avoid race conditions - MPI_Barrier(MPI_COMM_WORLD); - - if (local_rank == slave_rank) - { - std::cout<<"Local rank "<getQpairStruct()->local), sizeof(fpga::ibvQ), MPI_CHAR, master_rank, 0, MPI_COMM_WORLD); - } - else if (local_rank == master_rank) - { - std::cout<<"Local rank "<getQpairStruct()->remote = received_q; - } - - MPI_Barrier(MPI_COMM_WORLD); - - // write established connection to hardware and perform arp lookup - if (local_rank == master_rank) - { - int connection = (ibvQpConn_vec[slave_rank]->getQpairStruct()->local.qpn & 0xFFFF) | ((ibvQpConn_vec[slave_rank]->getQpairStruct()->remote.qpn & 0xFFFF) << 16); - ibvQpConn_vec[slave_rank]->getQpairStruct()->print(); - ibvQpConn_vec[slave_rank]->setConnection(connection); - ibvQpConn_vec[slave_rank]->writeContext(ranks[slave_rank].port); - ibvQpConn_vec[slave_rank]->doArpLookup(); - ranks[slave_rank].session_id = ibvQpConn_vec[slave_rank]->getQpairStruct()->local.qpn; - } else if (local_rank == slave_rank) - { - int connection = (ibvQpConn_vec[master_rank]->getQpairStruct()->local.qpn & 0xFFFF) | ((ibvQpConn_vec[master_rank]->getQpairStruct()->remote.qpn & 0xFFFF) << 16); - ibvQpConn_vec[master_rank]->getQpairStruct()->print(); - ibvQpConn_vec[master_rank]->setConnection(connection); - ibvQpConn_vec[master_rank]->writeContext(ranks[master_rank].port); - ibvQpConn_vec[master_rank]->doArpLookup(); - ranks[master_rank].session_id = ibvQpConn_vec[master_rank]->getQpairStruct()->local.qpn; - } - - MPI_Barrier(MPI_COMM_WORLD); -} - - -// void exchange_qp(unsigned int first_rank, unsigned int second_rank, -// unsigned int local_rank, -// std::vector &ibvQpConn_vec, -// std::vector &ranks) { -// // write established connection to hardware and perform arp lookup -// if (local_rank == first_rank) { -// int connection = -// (ibvQpConn_vec[second_rank]->getQpairStruct()->local.qpn & 0xFFFF) | -// ((ibvQpConn_vec[second_rank]->getQpairStruct()->remote.qpn & 0xFFFF) -// << 16); -// ibvQpConn_vec[second_rank]->setConnection(connection); -// ibvQpConn_vec[second_rank]->writeContext(ranks[second_rank].port); -// ibvQpConn_vec[second_rank]->doArpLookup(); -// ranks[second_rank].session_id = -// ibvQpConn_vec[second_rank]->getQpairStruct()->local.qpn; -// } else if (local_rank == second_rank) { -// int 
connection = -// (ibvQpConn_vec[first_rank]->getQpairStruct()->local.qpn & 0xFFFF) | -// ((ibvQpConn_vec[first_rank]->getQpairStruct()->remote.qpn & 0xFFFF) -// << 16); -// ibvQpConn_vec[first_rank]->setConnection(connection); -// ibvQpConn_vec[first_rank]->writeContext(ranks[first_rank].port); -// ibvQpConn_vec[first_rank]->doArpLookup(); -// ranks[first_rank].session_id = -// ibvQpConn_vec[first_rank]->getQpairStruct()->local.qpn; -// } -// } - -} // namespace - -namespace coyote_init { -void setup_cyt_rdma(std::vector &ibvQpConn_vec, - std::vector &ranks, int local_rank, - ACCL::CoyoteDevice &device) { - std::cout << "[ACCL Coyote] Initializing QP..." << std::endl; - ACCL::debug("Cyt setup on rank" + std::to_string(local_rank) + "\n"); - // create single page dummy memory space for each qp - uint32_t n_pages = 1; - for (int i = 0; i < ranks.size(); i++) { - fpga::ibvQpConn *qpConn = new fpga::ibvQpConn( - device.coyote_qProc_vec[i], ranks[local_rank].ip, n_pages); - ibvQpConn_vec.push_back(qpConn); - } -} - -void configure_cyt_rdma(std::vector &ibvQpConn_vec, - std::vector &ranks, int local_rank) { - std::cout << "[ACCL Coyote] Exchanging QP..." << std::endl; - for (int first_rank = 0; first_rank < ranks.size(); first_rank++) { - for (int second_rank = first_rank + 1; second_rank < ranks.size(); - second_rank++) { - exchange_qp(first_rank, second_rank, local_rank, ibvQpConn_vec, ranks); - this_thread::sleep_for(500ms); - } - } - - this_thread::sleep_for(3s); - std::cout << "[ACCL Coyote] Finished exchanging QP!" << std::endl; -} -} // namespace coyote_init From 3ce51858af7046945050e9648e419259a7be3a26 Mon Sep 17 00:00:00 2001 From: Laurent Wirz Date: Wed, 11 Sep 2024 10:26:27 +0200 Subject: [PATCH 63/64] Removed some comments --- integrations/pytorch_ddp/src/ProcessGroupACCL.cpp | 9 --------- 1 file changed, 9 deletions(-) diff --git a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp index a42b3757..2a23f846 100644 --- a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp +++ b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp @@ -757,7 +757,6 @@ void ProcessGroupACCL::init_input_tensor(at::Tensor &tensor, std::unique_ptr &tensor_vec, std::unique_ptr &data, const at::TensorOptions &options, bool do_on_root, bool do_on_others, int opts_root_rank) { if DO_COND { int64_t tens_size = static_cast(tensor_vec[0].numel()); @@ -769,13 +768,6 @@ void ProcessGroupACCL::init_input_tensor(at::Tensor &tensor, std::unique_ptrbyte_array() + i * tens_size * tensor_vec[0].element_size(), tensor_vec[i].data_ptr(), tens_size * tensor_vec[0].element_size()); - // if (p2p_applicable(*accl, tensor_vec[0], p2p_enabled)) { - // auto slice = data->slice(i * tens_size, (i + 1) * tens_size); - // copy_to_p2p_buffer(*slice, tensor_vec[i]); - // } else { - // auto slice = wrapper_tensor[i]; - // slice.copy_(tensor_vec[i]); - // } } if (!coyote_enabled) { data->sync_to_device(); @@ -783,7 +775,6 @@ void ProcessGroupACCL::init_input_tensor(at::Tensor &tensor, std::unique_ptr &dstdata, int out_tensor_size, c10::ScalarType type, bool do_on_root, bool do_on_others, int opts_root_rank) { if DO_COND { if (p2p_applicable(*accl, tensor_original, p2p_enabled)) { From 17c8c56b79b1ccf52856592698c2753646297208 Mon Sep 17 00:00:00 2001 From: Laurent Wirz Date: Sun, 29 Sep 2024 18:19:19 +0200 Subject: [PATCH 64/64] cleanup + documentation --- integrations/pytorch_ddp/DEVELOPMENT.md | 53 ++ .../pytorch_ddp/include/ProcessGroupACCL.hpp | 4 - .../pytorch_ddp/src/ProcessGroupACCL.cpp 
| 602 +----------------- integrations/pytorch_ddp/test/test-mnist.py | 50 +- 4 files changed, 102 insertions(+), 607 deletions(-) create mode 100644 integrations/pytorch_ddp/DEVELOPMENT.md diff --git a/integrations/pytorch_ddp/DEVELOPMENT.md b/integrations/pytorch_ddp/DEVELOPMENT.md new file mode 100644 index 00000000..658f057e --- /dev/null +++ b/integrations/pytorch_ddp/DEVELOPMENT.md @@ -0,0 +1,53 @@ +This document explains, what the state of development is at and tries to document some of the decisions made + +## Structure + +Consists of + +- wrapper, bindings and helper functionality found in ./accl_process_group +- main C++ files in ./src +- The ACCL repo the process group itself builds on top will be in ./accl . This is replicated such that you can try different versions +- ./test testscripts + +## Build process + +Check the ./install.py helper for dependency versions + +./setup.py sets up the build + +See the section in the README on how to avoid the long build using pip + +## Basics + +- Currently only runs via Coyote RDMA. XRT and GPU support was dropped. Simulator still runs over XRT UDP though +- Needs MPI Library to work. Set in setup.py. Tested only with MPICH +- The test setup in run.sh is for the HACC cluster +- use ACCL_DEBUG=1 both during build and runs +- Everything runs in rendezvous mode +- if you call collectives directly they are run synchronously, but eg allreduce used internally in DDP is executed async +- The PG allocates 2 buffers and reuses them to avoid reallocation. This is supposed to be replaced with a host buffer constructor which takes an existing memory region. To change buffer type you need to use the change_buffer_type branch(maybe already pulled) at https://github.com/lawirz/ACCL +- The torch profiler can see the overall execution time, but setting it up to measure sub-operation within the workerthread was attempted but failed. + +## ProcessGroupACCL.cpp + +### ProcessGroup structure + +A lot of the design comes from the ProcessGroupMPI. There is a concept of WorkEntries, which schedule Work on a separate worker thread. This is currently done using a single Worker thread as is the case with the MPI PG. There is still a lock, probably only relevant in case of a few management operations from the DDP side. With async execution in ACCL, we could try a different structure with AsyncWork as is done on Gloo PG I think. + +### Collectives + +- There are small wrappers, which do a few checks mostly copied from MPI PG, do the sidestep then setup the WorkEntry +- The WorkEntries manage the Segmentation, which is not yet correctly implemented everywhere. Some collectives still use a version which relies on the input to have one-dimensional shape. Others, which require multiple Segmentations such as Scatter have similar limitations +- Input is copied to the pre-allocated buffer. Generally copies using memcpy seem to be much faster, than using tensor.copy_ for some reason +- ACCL does a host-to-host call. The driver figures out, that it's host to host using the buffer type. The compressed type should be added as an argument to make that work again +- copy back + +## Hardware issues + +A lot of collectives still fail in hardware. The following can produce issues + +- Mixing datatypes especially ints +- High variablity in length +- MPI sidestepping(can't explain why this causes issues) + +If you run test-resnet50, you will encounter them. 
diff --git a/integrations/pytorch_ddp/include/ProcessGroupACCL.hpp b/integrations/pytorch_ddp/include/ProcessGroupACCL.hpp index e72d0db4..04d6f2c7 100644 --- a/integrations/pytorch_ddp/include/ProcessGroupACCL.hpp +++ b/integrations/pytorch_ddp/include/ProcessGroupACCL.hpp @@ -303,10 +303,6 @@ class TORCH_API ProcessGroupACCL : public ProcessGroup { void init_input_data_vec(std::vector &tensor_vec, std::unique_ptr &data, const at::TensorOptions &options, bool do_on_root, bool do_on_others, int opts_root_rank = 0); - void init_output_data(at::Tensor &tensor_original, std::unique_ptr &dstdata, int out_tensor_size, c10::ScalarType type, bool do_on_root, bool do_on_others, int opts_root_rank = 0); - - void init_output_tensor(const at::Tensor &tensor_original, at::Tensor &dsttensor, std::unique_ptr &dstdata, int out_tensor_size, c10::ScalarType type, bool do_on_root, bool do_on_others, int opts_root_rank = 0); - void copy_back_tensor(at::Tensor tensor_original, std::unique_ptr &data, bool do_on_root, bool do_on_others, int opts_root_rank = 0); void copy_back_tensorvec(const std::vector &dsttensorvec, std::unique_ptr &data, at::Tensor &dsttensor, int numel, int offset, bool do_on_root, bool do_on_others, int opts_root_rank = 0); diff --git a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp index 2a23f846..8ee9f012 100644 --- a/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp +++ b/integrations/pytorch_ddp/src/ProcessGroupACCL.cpp @@ -62,45 +62,18 @@ namespace c10d { #define RDVZ_THRESHOLD 64 -#define MICRO_BENCH_FINE 1 - -#define MICRO_BENCH_COARSE 1 - +// This is the maximal message size. larger sizes get segmented #define ACCL_MSG_SIZE 2097152 +// counts are rounded up to this number for stability reasons #define ROUND_NR 256 +// This is intended for debugging, you can refer to the name of the collective using this #define COLL_NAME UNNAMED #define x_MAKE_STRING(s) MAKE_STRING(s) #define MAKE_STRING(s) #s - - -#if MICRO_BENCH_FINE -#define START_FINE(name) \ - std::chrono::time_point start_##name = std::chrono::high_resolution_clock::now(); -#define STOP_FINE(name, accl_nbytes) \ - auto end_##name = std::chrono::high_resolution_clock::now(); \ - double durationUs_##name = (std::chrono::duration_cast(end_##name-start_##name).count() / 1000.0); \ - ACCL::debug(#name "_" + std::string(x_MAKE_STRING(COLL_NAME)) + "_" + std::to_string(accl_nbytes) + " durationUs: " + std::to_string(durationUs_##name)); -#else -#define START_FINE(name) -#define STOP_FINE(name) -#endif - -#if MICRO_BENCH_COARSE -#define START_COARSE(name) \ - std::chrono::time_point start_##name = std::chrono::high_resolution_clock::now(); -#define STOP_COARSE(name, accl_nbytes) \ - auto end_##name = std::chrono::high_resolution_clock::now(); \ - double durationUs_##name = (std::chrono::duration_cast(end_##name-start_##name).count() / 1000.0); \ - ACCL::debug(#name "_" + std::string(x_MAKE_STRING(COLL_NAME)) + "_" + std::to_string(accl_nbytes) + " durationUs: " + std::to_string(durationUs_##name)); -#else -#define START_COARSE(name) -#define STOP_COARSE(name) -#endif - // Used in sidestepping #define MPI_CHECK(cmd) \ do { \ @@ -135,49 +108,18 @@ std::map mpiDatatype = { #define CEIL_DIV(x, y) ((x) / (y) + ((x) % (y) != 0)) -#define ACCL_ERROR(status) \ - ("ACCL error in: " + std::string(__FILE__) + ":" + \ - std::to_string(__LINE__) + ", with error code: " + std::to_string(status)) - #if defined(ACCL_PROCESS_GROUP_HIP_ENABLED) && \ defined(ACCL_PROCESS_GROUP_CUDA_ENABLED) #error 
Cannot compile Process Group with both HIP and CUDA support #endif // ACCL_PROCESS_GROUP_HIP_ENABLED && ACCL_PROCESS_GROUP_CUDA_ENABLED -// Activate Parameter printing: -#define DO_PARA_PRINT - -#if defined(DO_PARA_PRINT) - #define PARA_PRINT(x) \ - ACCL::debug(#x "size: " + std::to_string(x.numel()) + " of type: " + string_of_accl_datatype(convert_datatype_from_torch(x.scalar_type()))) -#else - #define PARA_PRINT(x) -#endif - - -#define STANDARD_DECL \ - std::unique_ptr data; \ - std::unique_ptr dstdata; \ - #define DO_COND ((do_on_root && opts_root_rank == rank_) || (do_on_others && opts_root_rank != rank_)) #define PRE_REQUEST(opname, tensor) \ - START_FINE(type) \ in_buf->change_type(convert_datatype_from_torch(tensor.scalar_type())); \ out_buf->change_type(convert_datatype_from_torch(tensor.scalar_type())); \ - STOP_FINE(type, tensor.nbytes()) \ - ACCL::debug("Performing " #opname " of " + std::to_string(tensor.numel()) + " items"); \ - START_FINE(lib) - -#define POST_REQUEST(name, nbytes) \ - STOP_FINE(lib, nbytes) \ - START_COARSE(sleep) \ - std::this_thread::sleep_for(10ms); \ - STOP_COARSE(sleep, nbytes) \ - double durationUs_accl_##COLL_NAME = (double)accl->get_duration(req)/1000.0; \ - ACCL::debug("device_" + std::string(x_MAKE_STRING(COLL_NAME)) + "_" + std::to_string(nbytes) + " durationUs: " + std::to_string(durationUs_accl_##COLL_NAME)); + ACCL::debug("Performing " #opname " of " + std::to_string(tensor.numel()) + " items") - namespace { /* Alternative for std::format from C++20 in C++17. @@ -196,34 +138,6 @@ std::string string_format(const std::string &format, Args... args) { buf.get() + size - 1); // We don't want the '\0' inside } -template -std::string format_array(val_t *data, std::size_t size, std::size_t breakval = 3) { - std::ostringstream buffer; - buffer << "["; - if (size <= breakval * 2 + 1) { - for (std::size_t i = 0; i < size; ++i) { - buffer << data[i]; - if (i + 1 != size) { - buffer << ", "; - } - } - } else { - for (std::size_t i = 0; i < breakval; ++i) { - buffer << data[i] << ", "; - } - buffer << "..., "; - for (std::size_t i = size - breakval; i < size; ++i) { - buffer << data[i]; - if (i + 1 != size) { - buffer << ", "; - } - } - } - buffer << "]"; - - return buffer.str(); -} - // Op mapping std::map acclOp = { {ReduceOp::SUM, ACCL::reduceFunction::SUM}, @@ -240,25 +154,6 @@ std::map acclDatatype = { }; -std::string format_log(std::string collective, int world_size, int rank, double time, int n_bytes) -{ - std::string log_str = collective + "," + std::to_string(world_size) + "," + std::to_string(rank) + "," + std::to_string(time) + "," + std::to_string(n_bytes); - return log_str; -} - -#define ACCL_PG_LOG_FILE(i) \ - (std::string("accl_log/accl_pg_") + i + std::string(".log")) - -void accl_pg_log(int rank, const std::string &message) { - std::string str_rank = std::to_string(rank); - std::string filename = ACCL_PG_LOG_FILE(str_rank); - std::ofstream outfile; - outfile.open(filename, std::ios::out | std::ios_base::app); - outfile << message << std::endl; - outfile.close(); -} - - // Checking the input tensor's validity void checkSingleTensorHelper(const at::Tensor &tensor) { if (!tensor.is_contiguous()) { @@ -398,114 +293,6 @@ std::map convert_compression_to_dict( return dictionary; } -// Create an ACCL Buffer with correct type -std::unique_ptr create_buffer(ACCL::ACCL &accl, size_t length, - c10::ScalarType type) { - switch (type) { - case at::kInt: - return accl.create_buffer(length, acclDatatype.at(type)); - case at::kLong: - return 
accl.create_buffer(length, acclDatatype.at(type)); - case at::kFloat: - return accl.create_buffer(length, acclDatatype.at(type)); - case at::kDouble: - return accl.create_buffer(length, acclDatatype.at(type)); - default: - TORCH_CHECK(false, "Tensor has unsupported datatype"); - break; - } -} - -// Create an ACCL Buffer with correct type -std::unique_ptr create_coyotebuffer(ACCL::ACCL &accl, size_t length, - c10::ScalarType type) { - switch (type) { - case at::kInt: - return accl.create_coyotebuffer(length, acclDatatype.at(type)); - case at::kLong: - return accl.create_coyotebuffer(length, acclDatatype.at(type)); - case at::kFloat: - return accl.create_coyotebuffer(length, acclDatatype.at(type)); - case at::kDouble: - return accl.create_coyotebuffer(length, acclDatatype.at(type)); - default: - TORCH_CHECK(false, "Tensor has unsupported datatype"); - break; - } -} - -// Create an ACCL P2P Buffer with correct type -std::unique_ptr -create_buffer_p2p(ACCL::ACCL &accl, size_t length, c10::ScalarType type) { - switch (type) { - case at::kInt: - return accl.create_buffer_p2p(length, acclDatatype.at(type)); - case at::kLong: - return accl.create_buffer_p2p(length, acclDatatype.at(type)); - case at::kFloat: - return accl.create_buffer_p2p(length, acclDatatype.at(type)); - case at::kDouble: - return accl.create_buffer_p2p(length, acclDatatype.at(type)); - default: - TORCH_CHECK(false, "Tensor has unsupported datatype"); - break; - } -} - -std::unique_ptr create_buffer_p2p(ACCL::ACCL &accl, - const at::Tensor &tensor) { - return create_buffer_p2p(accl, tensor.numel(), tensor.scalar_type()); -} - -// Create an ACCL Buffer with correct type from Tensor -std::unique_ptr create_buffer(ACCL::ACCL &accl, - const at::Tensor &tensor) { - std::unique_ptr buffer; - switch (tensor.scalar_type()) { - case at::kInt: - buffer = accl.create_buffer(static_cast(tensor.data_ptr()), - tensor.numel(), - acclDatatype.at(tensor.scalar_type())); - - ACCL::debug("Creating int32 buffer at 0x" + - ACCL::debug_hex(buffer->address()) + " of " + - std::to_string(buffer->size()) + "B."); - break; - case at::kLong: - buffer = accl.create_buffer(static_cast(tensor.data_ptr()), - tensor.numel(), - acclDatatype.at(tensor.scalar_type())); - - ACCL::debug("Creating int64 buffer at 0x" + - ACCL::debug_hex(buffer->address()) + " of " + - std::to_string(buffer->size()) + "B."); - break; - case at::kFloat: - buffer = accl.create_buffer(static_cast(tensor.data_ptr()), - tensor.numel(), - acclDatatype.at(tensor.scalar_type())); - - ACCL::debug("Creating float32 buffer at 0x" + - ACCL::debug_hex(buffer->address()) + " of " + - std::to_string(buffer->size()) + "B."); - - break; - case at::kDouble: - buffer = accl.create_buffer(static_cast(tensor.data_ptr()), - tensor.numel(), - acclDatatype.at(tensor.scalar_type())); - - ACCL::debug("Creating float64 buffer at 0x" + - ACCL::debug_hex(buffer->address()) + " of " + - std::to_string(buffer->size()) + "B."); - break; - default: - TORCH_CHECK(false, "Tensor has unsupported datatype"); - break; - } - - return buffer; -} // Check if process is compiled with HIP support inline bool hip_enabled() { @@ -525,149 +312,6 @@ inline bool cuda_enabled() { #endif } -// Check if tensor is a GPU tensor, the ProcessGroup is compiled with GPU -// support, ACCL is not running in simulation mode, and the ProcessGroup was -// initialized with p2p_enabled -bool p2p_applicable(ACCL::ACCL &accl, const at::Tensor &tensor, - bool p2p_enabled) { - auto type = tensor.device().type(); - if (type != c10::DeviceType::CPU && 
p2p_enabled && !accl.is_simulated()) { - if (type == c10::DeviceType::HIP) { - return hip_enabled(); - } else if (type == c10::DeviceType::CUDA) { - // HIP tensors will identify themselves as CUDA tensor depending on the - // initialization, so we have to see CUDA tensors as HIP tensors if - // ProcessGroup is compiled with HIP support -#ifdef ACCL_PROCESS_GROUP_HIP_ENABLED - return hip_enabled(); -#else - return cuda_enabled(); -#endif - } - } - return false; -} - -// Copy a GPU tensor to a P2P FPGA buffer -void copy_to_p2p_buffer(ACCL::BaseBuffer &buffer, const at::Tensor &tensor) { - if (tensor.device().type() == c10::DeviceType::HIP) { - ACCL::debug("Syncing HIP GPU buffer to FPGA"); -#ifdef ACCL_PROCESS_GROUP_HIP_ENABLED - hipMemcpy(buffer.byte_array(), tensor.data_ptr(), tensor.nbytes(), - hipMemcpyDeviceToHost); -#else - TORCH_CHECK(false, "ACCL ProcessGroup is build without HIP support"); -#endif - } else if (tensor.device().type() == c10::DeviceType::CUDA) { -#ifdef ACCL_PROCESS_GROUP_HIP_ENABLED - ACCL::debug("Syncing HIP GPU buffer to FPGA"); - hipMemcpy(buffer.byte_array(), tensor.data_ptr(), tensor.nbytes(), - hipMemcpyDeviceToHost); -#else - ACCL::debug("Syncing CUDA GPU buffer to FPGA"); -#ifdef ACCL_PROCESS_GROUP_CUDA_ENABLED - cudaMemcpy(buffer.byte_array(), tensor.data_ptr(), tensor.nbytes(), - cudaMemcpyDeviceToHost); -#else - TORCH_CHECK(false, "ACCL ProcessGroup is build without CUDA support"); -#endif // ACCL_PROCESS_GROUP_CUDA_ENABLED -#endif // ACCL_PROCESS_GROUP_HIP_ENABLED - } -} - -// Create a new FPGA P2P buffer and copy contents of GPU tensor -inline std::unique_ptr -create_and_copy_p2p_buffer(ACCL::ACCL &accl, const at::Tensor &tensor) { - ACCL::debug("Creating p2p buffer of size " + std::to_string(tensor.nbytes())); - std::unique_ptr buffer = - create_buffer_p2p(accl, tensor.numel(), tensor.scalar_type()); - copy_to_p2p_buffer(*buffer, tensor); - return buffer; -} - -// Copy results from an FPGA P2P buffer back to the GPU tensor -void copy_back_p2p_buffer(ACCL::BaseBuffer &buffer, const at::Tensor &tensor) { - if (tensor.device().type() == c10::DeviceType::HIP) { - ACCL::debug("Syncing HIP GPU buffer from FPGA"); -#ifdef ACCL_PROCESS_GROUP_HIP_ENABLED - hipMemcpy(tensor.data_ptr(), buffer.byte_array(), tensor.nbytes(), - hipMemcpyHostToDevice); -#else - TORCH_CHECK(false, "ACCL ProcessGroup is build without HIP support"); -#endif - } else if (tensor.device().type() == c10::DeviceType::CUDA) { -#ifdef ACCL_PROCESS_GROUP_HIP_ENABLED - ACCL::debug("Syncing HIP GPU buffer from FPGA"); - hipMemcpy(tensor.data_ptr(), buffer.byte_array(), tensor.nbytes(), - hipMemcpyHostToDevice); -#else - ACCL::debug("Syncing CUDA GPU buffer from FPGA"); -#ifdef ACCL_PROCESS_GROUP_CUDA_ENABLED - cudaMemcpy(tensor.data_ptr(), buffer.byte_array(), tensor.nbytes(), - cudaMemcpyHostToDevice); -#else - TORCH_CHECK(false, "ACCL ProcessGroup is build without CUDA support"); -#endif // ACCL_PROCESS_GROUP_CUDA_ENABLED -#endif // ACCL_PROCESS_GROUP_HIP_ENABLED - } -} - -bool check_arp(vnx::Networklayer &network_layer, - std::vector &ranks, int rank, int size) { - std::map ranks_checked; - for (unsigned i = 0; i < static_cast(size); ++i) { - ranks_checked[i] = false; - } - - bool sanity_check = true; - const std::map> arp = - network_layer.read_arp_table(size); - - std::ostringstream ss_arp; - ss_arp << "ARP table:"; - - for (const std::pair> &elem : - arp) { - const unsigned index = elem.first; - const std::pair &entry = elem.second; - const std::string &mac = entry.first; - const 
std::string &ip = entry.second; - ss_arp << "\n(" << index << ") " << mac << ": " << ip; - - for (unsigned i = 0; i < static_cast(size); ++i) { - if (ranks[i].ip == ip) { - if (ranks_checked[i]) { - std::cerr << "Double entry for " << ip << " in arp table!" - << std::endl; - sanity_check = false; - } else { - ranks_checked[i] = true; - } - } - } - } - - ACCL::debug(ss_arp.str()); - - if (!sanity_check) { - return false; - } - - unsigned hosts = 0; - for (unsigned i = 0; i < static_cast(size); ++i) { - if (ranks_checked[i]) { - hosts += 1; - } - } - if (hosts < static_cast(size) - 1) { - std::cerr << "Found only " << hosts << " hosts out of " << size - 1 << "!" - << std::endl; - return false; - } - - return true; -} - } // namespace ACCL::dataType ProcessGroupACCL::get_compressed_type(c10::ScalarType datatype) { @@ -747,25 +391,20 @@ void accl_sa_handler(int) exit(EXIT_FAILURE); } -// TODO delete when not needed anymore void ProcessGroupACCL::init_input_tensor(at::Tensor &tensor, std::unique_ptr &data, bool do_on_root, bool do_on_others, int opts_root_rank) { if DO_COND { std::memcpy(data->byte_array(), tensor.data_ptr(), tensor.numel() * tensor.element_size()); if (!coyote_enabled) { data->sync_to_device(); } - } } - void ProcessGroupACCL::init_input_data_vec(std::vector &tensor_vec, std::unique_ptr &data, const at::TensorOptions &options, bool do_on_root, bool do_on_others, int opts_root_rank) { + +void ProcessGroupACCL::init_input_data_vec(std::vector &tensor_vec, std::unique_ptr &data, const at::TensorOptions &options, bool do_on_root, bool do_on_others, int opts_root_rank) { if DO_COND { int64_t tens_size = static_cast(tensor_vec[0].numel()); int64_t total_size = tens_size * static_cast(size_); - std::vector sizes = tensor_vec[0].sizes().vec(); - sizes.insert(sizes.begin(), tensor_vec.size()); - at::Tensor wrapper_tensor = torch::from_blob(data->byte_array(), sizes, options); - for (const auto i : c10::irange(tensor_vec.size())) { std::memcpy(data->byte_array() + i * tens_size * tensor_vec[0].element_size(), tensor_vec[i].data_ptr(), tens_size * tensor_vec[0].element_size()); } @@ -774,44 +413,7 @@ void ProcessGroupACCL::init_input_tensor(at::Tensor &tensor, std::unique_ptr &dstdata, int out_tensor_size, c10::ScalarType type, bool do_on_root, bool do_on_others, int opts_root_rank) { - if DO_COND { - if (p2p_applicable(*accl, tensor_original, p2p_enabled)) { - dstdata = create_buffer_p2p(*accl, out_tensor_size, type); - } else { - if (coyote_enabled) { - dstdata = create_coyotebuffer(*accl, out_tensor_size, type); - } else { - dstdata = create_buffer(*accl, out_tensor_size, type); - } - } - } else { - dstdata = std::unique_ptr>(nullptr); - } -} -// This function could and should be removed if we adapt the structure in the calls - void ProcessGroupACCL::init_output_tensor(const at::Tensor &tensor_original, at::Tensor &dsttensor, std::unique_ptr &dstdata, int num_tensors, c10::ScalarType type, bool do_on_root, bool do_on_others, int opts_root_rank) { - if DO_COND { - int64_t num_tensors_s = static_cast(num_tensors); - std::vector sizes = tensor_original.sizes().vec(); - int64_t total_size = static_cast(tensor_original.numel()); - if (num_tensors != 0) { - // Prepend another dimension for vector length - sizes.insert(sizes.begin(), num_tensors_s); - total_size = total_size * num_tensors_s; - } - - if (p2p_applicable(*accl, tensor_original, p2p_enabled)) { - dstdata = create_buffer_p2p(*accl, total_size, type); - } else if (coyote_enabled) { - dsttensor = 
torch::from_blob(dstdata->byte_array(), sizes, tensor_original.options().device(c10::DeviceType::CPU)); - } - } else { - dsttensor = at::Tensor(nullptr); - } -} - + void ProcessGroupACCL::copy_back_tensor(at::Tensor tensor_original, std::unique_ptr &data, bool do_on_root, bool do_on_others, int opts_root_rank){ if DO_COND { if (!coyote_enabled) { @@ -889,26 +491,6 @@ ProcessGroupACCL::ProcessGroupACCL( } } -std::vector ProcessGroupACCL::get_local_qp(unsigned int rank) { - std::vector qp; - char *data = (char *) &ibvQpConn_vec[rank]->getQpairStruct()->local; - for (std::size_t i = 0; i < sizeof(fpga::ibvQ); ++i) { - qp.push_back(data[i]); - } - - return qp; -} - -void ProcessGroupACCL::set_remote_qp(unsigned int rank, std::vector &qp) { - fpga::ibvQ remote_qp; - char *data = (char *) &remote_qp; - for (std::size_t i = 0; i < sizeof(fpga::ibvQ); ++i) { - data[i] = qp[i]; - } - - ibvQpConn_vec[rank]->getQpairStruct()->remote = remote_qp; -} - void ProcessGroupACCL::initialize() { std::cout << "PG initialize called\n"; if (initialized) { @@ -921,18 +503,10 @@ void ProcessGroupACCL::initialize() { global_accl = &accl; // Rendezvous protocol for now - int protoc = 1; - // default from test.cpp int segsize = 4096 * 1024; - if (protoc == 0){ - std::cout<<"Eager Protocol"<initialize(ranks_, rank_, size_+2, bufsize, segsize, 4096*1024*2); - } else { - std::cout<<"Rendezvous Protocol"<initialize(ranks_, rank_, 16, 1024, RDVZ_THRESHOLD, 4096*1024); - } + accl.get()->initialize(ranks_, rank_, 16, 1024, RDVZ_THRESHOLD, 4096*1024); ACCL::debug(std::string("[ACCL coyote] communicator: ") + accl->dump_communicator()); @@ -950,14 +524,6 @@ void ProcessGroupACCL::initialize() { int devicemem = accl->devicemem(); - // Not sure if this is needed: - // Initialize cache buffers - // if (!simulator_){ - // buf0 = xrt::bo(xrt_device, bufsize, devicemem); - // buf1 = xrt::bo(xrt_device, bufsize, devicemem); - // } - - } in_buf = accl->create_buffer_host(bufsize/sizeof(float), ACCL::dataType::float32); @@ -976,12 +542,10 @@ void ProcessGroupACCL::destroy() { std::unique_lock lock(pgMutex_); queueConsumeCV_.wait(lock, [&] { return queue_.empty(); }); - //TODO free other buffer types + // TODO free other buffer types if (!simulator_) { - // if(coyote_enabled){ - in_buf->free_buffer(); - out_buf->free_buffer(); - // } + in_buf->free_buffer(); + out_buf->free_buffer(); } // Queue is empty, signal stop @@ -1054,8 +618,8 @@ void ProcessGroupACCL::run_broadcast(at::Tensor in_tensor, std::chrono::time_point start_inner = std::chrono::high_resolution_clock::now(); + // This is very experimental #ifdef SIDESTEP_BCAST_WITH_ALLREDUCE - // It seems to have issues with non-even numbers, so we round to ACCL_MSG_SIZE int rounded_count = (in_tensor.numel() + ROUND_NR) & ~ROUND_NR; @@ -1064,11 +628,6 @@ void ProcessGroupACCL::run_broadcast(at::Tensor in_tensor, imaginary_count = (in_tensor.numel()*2 + ROUND_NR) & ~ROUND_NR; } - ACCL::debug("imaginary count:" + std::to_string(imaginary_count)); - - - START_FINE(init) - auto zero_tensor = torch::zeros({imaginary_count}, at::kInt); if (opts.rootRank == rank_){ init_input_tensor(in_tensor, in_buf, true, false, opts.rootRank); @@ -1078,54 +637,32 @@ void ProcessGroupACCL::run_broadcast(at::Tensor in_tensor, } init_input_tensor(zero_tensor, out_buf, true, false, opts.rootRank); - STOP_FINE(init, in_tensor.nbytes()) - - START_FINE(lock) // Reserve device c10::DeviceGuard guard(in_tensor.device()); std::unique_lock globalLock(pgGlobalMutex_); - STOP_FINE(lock, in_tensor.nbytes()) - - - 
in_buf->change_type(convert_datatype_from_torch(at::kInt)); - out_buf->change_type(convert_datatype_from_torch(at::kInt)); + PRE_REQUEST(Broadcast, in_tensor); + auto req = accl->allreduce(*in_buf, *out_buf, imaginary_count, ACCL::reduceFunction::SUM); - POST_REQUEST("broadcast", in_tensor.nbytes()) - - START_FINE(copy) copy_back_tensor(in_tensor, out_buf, true, true); - STOP_FINE(copy, in_tensor.nbytes()) #else int rounded_count = (in_tensor.numel() + ROUND_NR) & ~ROUND_NR; - START_FINE(init) - if (opts.rootRank == rank_){ init_input_tensor(in_tensor, in_buf, true, false, opts.rootRank); } - STOP_FINE(init, in_tensor.nbytes()) - - START_FINE(lock) c10::DeviceGuard guard(in_tensor.device()); std::unique_lock globalLock(pgGlobalMutex_); - STOP_FINE(lock, in_tensor.nbytes()) - - - PRE_REQUEST(Broadcast, in_tensor) + PRE_REQUEST(Broadcast, in_tensor); auto req = accl->bcast(*in_buf, rounded_count, opts.rootRank); - POST_REQUEST("bcast", in_tensor.nbytes()) - - START_FINE(copy) copy_back_tensor(in_tensor, in_buf, false, true, opts.rootRank); - STOP_FINE(copy, in_tensor.nbytes()) #endif } @@ -1148,7 +685,6 @@ ProcessGroupACCL::broadcast(std::vector &tensors, opts.rootRank, MPI_COMM_WORLD)); } else { - START_COARSE(total) at::Tensor &tensor = (entry->src)[0]; // Segment data if necessary if (tensor.nbytes() > ACCL_MSG_SIZE) { @@ -1163,7 +699,6 @@ ProcessGroupACCL::broadcast(std::vector &tensors, } else { run_broadcast(tensor, opts); } - STOP_COARSE(total, ((entry->src)[0]).nbytes()) } }; auto entry = @@ -1178,31 +713,17 @@ ProcessGroupACCL::broadcast(std::vector &tensors, void ProcessGroupACCL::run_allreduce(at::Tensor in_tensor, const AllreduceOptions &opts) { - START_FINE(init) - init_input_tensor(in_tensor, in_buf, true, true); - STOP_FINE(init, in_tensor.nbytes()) - - - START_FINE(lock) // Reserve device c10::DeviceGuard guard(in_tensor.device()); std::unique_lock globalLock(pgGlobalMutex_); - STOP_FINE(lock, in_tensor.nbytes()) - - PRE_REQUEST(Allreduce,in_tensor) - - + PRE_REQUEST(Allreduce,in_tensor); int rounded_count = (in_tensor.numel() + ROUND_NR) & ~ROUND_NR; auto req = accl->allreduce(*in_buf, *out_buf, rounded_count, acclOp.at(opts.reduceOp)); - POST_REQUEST("allreduce", in_tensor.nbytes()) - - START_FINE(copy) copy_back_tensor(in_tensor, out_buf, true, true); - STOP_FINE(copy, in_tensor.nbytes()) } c10::intrusive_ptr @@ -1225,7 +746,6 @@ ProcessGroupACCL::allreduce(std::vector &tensors, mpiOp.at(opts.reduceOp), MPI_COMM_WORLD)); } else { - START_COARSE(total) auto tensor = (entry->src)[0]; // Segment data if necessary if (tensor.nbytes() > (ACCL_MSG_SIZE)) { @@ -1233,16 +753,13 @@ ProcessGroupACCL::allreduce(std::vector &tensors, size_t n = ACCL_MSG_SIZE / (tensor.itemsize() * non_zero_dim_count); ACCL::debug("[Allreduce] Segmenting tensor of size " + std::to_string(tensor.nbytes()) + " into " + std::to_string(n * non_zero_dim_count) + "-sized elements "); for (size_t i = 0; i < tensor.size(0); i += n) { - START_FINE(loop) // ACCL::debug("part " + std::to_string(i) + "!"); size_t end = std::min(n, static_cast(tensor.size(0)) - i); run_allreduce(tensor.narrow(0, i, end), opts); - STOP_FINE(loop, tensor.nbytes()) } } else { run_allreduce(tensor, opts); } - STOP_COARSE(total, ((entry->src)[0]).nbytes()) } }; auto entry = @@ -1263,25 +780,19 @@ void ProcessGroupACCL::run_reduce(at::Tensor in_tensor, const ReduceOptions &opts) { - START_FINE(init) init_input_tensor(in_tensor, in_buf, true, true); - STOP_FINE(init, in_tensor.nbytes()) // Reserve device c10::DeviceGuard 
guard(in_tensor.device()); std::unique_lock globalLock(pgGlobalMutex_); - PRE_REQUEST(Reduce,in_tensor) + PRE_REQUEST(Reduce,in_tensor); int rounded_count = (in_tensor.numel() + ROUND_NR) & ~ROUND_NR; auto req = accl->reduce(*in_buf, *out_buf, rounded_count, opts.rootRank, acclOp.at(opts.reduceOp)); - POST_REQUEST("reduce", in_tensor.nbytes()) - - START_FINE(copy) copy_back_tensor(in_tensor, out_buf, true, false, opts.rootRank); - STOP_FINE(copy, in_tensor.nbytes()) } c10::intrusive_ptr @@ -1291,7 +802,6 @@ ProcessGroupACCL::reduce(std::vector &tensors, std::function &)> runFunc = [opts, this](std::unique_ptr &entry) { - START_COARSE(total) auto tensor = (entry->src)[0]; // Segment data if necessary if (tensor.nbytes() > bufsize) { @@ -1303,7 +813,6 @@ ProcessGroupACCL::reduce(std::vector &tensors, } else { run_reduce(tensor, opts); } - STOP_COARSE(total, ((entry->src)[0]).nbytes()) }; auto entry = std::make_unique(&tensors, &tensors, std::move(runFunc)); @@ -1316,31 +825,19 @@ ProcessGroupACCL::reduce(std::vector &tensors, void ProcessGroupACCL::run_allgather( at::Tensor in_tensor, const std::vector &dsttensorvec) { - - START_FINE(init) init_input_tensor(in_tensor, in_buf, true, true); - // Reserve device - - STOP_FINE(init, in_tensor.nbytes()) - - c10::DeviceGuard guard(in_tensor.device()); std::unique_lock globalLock(pgGlobalMutex_); - PRE_REQUEST(Allgather,in_tensor) + PRE_REQUEST(Allgather,in_tensor); int rounded_count = (in_tensor.numel() + 1023) & ~1023; auto req = accl->allgather(*in_buf, *out_buf, rounded_count); - POST_REQUEST("allgather", in_tensor.nbytes()) - - START_FINE(copy) copy_back_tensorvec(dsttensorvec, out_buf, in_tensor, in_tensor.numel(), rounded_count, true, true); - STOP_FINE(copy, in_tensor.nbytes()) - } c10::intrusive_ptr @@ -1383,7 +880,6 @@ ProcessGroupACCL::allgather(std::vector> &outputTensors, outputDataVec[i].copy_(flatOutputTensor[i]); } #else - START_COARSE(total) auto srctensor = (entry->src)[0]; auto &dsttensors = entry->dst; // Segment data if necessary @@ -1403,7 +899,6 @@ ProcessGroupACCL::allgather(std::vector> &outputTensors, } else { run_allgather(srctensor, dsttensors); } - STOP_COARSE(total, ((entry->src)[0]).nbytes()) #endif }; auto entry = std::make_unique(&inputTensors, &outputTensors[0], @@ -1429,21 +924,13 @@ void ProcessGroupACCL::run_gather(at::Tensor in_tensor, c10::DeviceGuard guard(in_tensor.device()); std::unique_lock globalLock(pgGlobalMutex_); - START_FINE(init) - init_input_tensor(in_tensor, in_buf, true, true); - STOP_FINE(init, in_tensor.nbytes()) - - PRE_REQUEST(Gather, in_tensor) + PRE_REQUEST(Gather, in_tensor); auto req = accl->gather(*in_buf, *out_buf, in_tensor.numel(), opts.rootRank); - POST_REQUEST("gather", in_tensor.nbytes()) - START_FINE(copy) copy_back_tensorvec(dsttensorvec, out_buf, in_tensor, in_tensor.numel(), in_tensor.numel(), true, false, opts.rootRank); - STOP_FINE(copy, in_tensor.nbytes()) - } c10::intrusive_ptr @@ -1502,7 +989,6 @@ ProcessGroupACCL::gather(std::vector> &outputTensors, } } #else - START_COARSE(total) auto srctensor = (entry->src)[0]; auto &dsttensors = entry->dst; // Segment data if necessary @@ -1523,7 +1009,6 @@ ProcessGroupACCL::gather(std::vector> &outputTensors, } else { run_gather(srctensor, dsttensors, opts); } - STOP_COARSE(total, ((entry->src)[0]).nbytes()) #endif }; @@ -1550,20 +1035,13 @@ void ProcessGroupACCL::run_scatter(std::vector &in_tensor_vec, std::unique_lock globalLock(pgGlobalMutex_); - START_FINE(init) init_input_data_vec(in_tensor_vec, in_buf, 
out_tensor.options().device(c10::DeviceType::CPU), true, false, opts.rootRank); - STOP_FINE(init, out_tensor.nbytes()) - - PRE_REQUEST(Scatter, out_tensor) + PRE_REQUEST(Scatter, out_tensor); auto req = accl->scatter(*in_buf, *out_buf, out_tensor.numel(), opts.rootRank); - POST_REQUEST("scatter", out_tensor.nbytes()) - - START_FINE(copy) copy_back_tensor(out_tensor, out_buf, true, true, opts.rootRank); - STOP_FINE(copy, out_tensor.nbytes()) } c10::intrusive_ptr @@ -1619,7 +1097,6 @@ ProcessGroupACCL::scatter(std::vector &outputTensors, opts.rootRank, MPI_COMM_WORLD)); #else - START_COARSE(total) auto &srctensors = entry->src; auto dsttensor = (entry->dst)[0]; // Segment data if necessary @@ -1640,7 +1117,6 @@ ProcessGroupACCL::scatter(std::vector &outputTensors, } else { run_scatter(srctensors, dsttensor, opts); } - STOP_COARSE(total, ((entry->src)[0]).nbytes()) #endif }; @@ -1675,27 +1151,19 @@ void ProcessGroupACCL::run_alltoall(at::Tensor in_tensor, at::Tensor out_tensor, const AllToAllOptions &opts) { - START_FINE(init) init_input_tensor(in_tensor, in_buf, true, true); - STOP_FINE(init, in_tensor.nbytes()) // Reserve device - START_FINE(lock) c10::DeviceGuard guard(in_tensor.device()); std::unique_lock globalLock(pgGlobalMutex_); - STOP_FINE(lock, in_tensor.nbytes()) // init_output_data(out_tensor, dstdata, out_tensor.numel(), out_tensor.scalar_type(), true, true); - PRE_REQUEST(AlltoAll, in_tensor) + PRE_REQUEST(AlltoAll, in_tensor); auto req = accl->alltoall(*in_buf, *out_buf, in_tensor.numel()/size_); - POST_REQUEST("alltoall", in_tensor.nbytes()) - - START_FINE(copy) copy_back_tensor(out_tensor, out_buf, true, true); - STOP_FINE(copy, in_tensor.nbytes()) } @@ -1709,12 +1177,10 @@ void ProcessGroupACCL::run_alltoall_vec(std::vector &in_tensor_vec, init_input_data_vec(in_tensor_vec, in_buf, out_tensor_vec[0].options().device(c10::DeviceType::CPU), true, true); - PRE_REQUEST(AlltoAll, in_tensor_vec[0]) + PRE_REQUEST(AlltoAll, in_tensor_vec[0]); auto req = accl->alltoall(*in_buf, *out_buf, in_tensor_vec[0].numel()); - POST_REQUEST("alltoall", a2a_nbytes) - copy_back_tensorvec(out_tensor_vec, out_buf, in_tensor_vec[0], in_tensor_vec[0].numel(), in_tensor_vec[0].numel(), true, true); } @@ -1736,7 +1202,6 @@ c10::intrusive_ptr ProcessGroupACCL::alltoall_base( std::function&)> runFunc = [opts, this](std::unique_ptr& entry) { - START_COARSE(total) auto srctensor = (entry->src)[0]; auto dsttensor = (entry->dst)[0]; @@ -1771,7 +1236,6 @@ c10::intrusive_ptr ProcessGroupACCL::alltoall_base( } else { run_alltoall(srctensor, dsttensor, opts); } - STOP_COARSE(total, ((((entry->src)[0]).nbytes()))) }; std::vector inputTensors = {inputTensor}; std::vector outputTensors = {outputTensor}; @@ -1804,17 +1268,12 @@ void ProcessGroupACCL::run_send(at::Tensor in_tensor, int dstRank, c10::DeviceGuard guard(in_tensor.device()); std::unique_lock globalLock(pgGlobalMutex_); - START_FINE(init) init_input_tensor(in_tensor, in_buf, true, true); - STOP_FINE(init, in_tensor.nbytes()) - PRE_REQUEST(Send,in_tensor) + PRE_REQUEST(Send,in_tensor); ACCL::ACCLRequest* req = accl->send(*in_buf, in_tensor.numel(), dstRank, tag); - POST_REQUEST("send", in_tensor.nbytes()) - - ACCL::debug("copy_" + std::string(x_MAKE_STRING(COLL_NAME)) + "_" + std::to_string(in_tensor.nbytes()) + " durationUs: " + std::to_string(0)); } @@ -1824,7 +1283,6 @@ ProcessGroupACCL::send(std::vector &tensors, int dstRank, int tag) { std::function &)> runFunc = [dstRank, tag, this](std::unique_ptr &entry) { - START_COARSE(total) at::Tensor &tensor = 
(entry->src)[0]; // Segment data if necessary if (tensor.nbytes() > bufsize) { @@ -1836,7 +1294,6 @@ ProcessGroupACCL::send(std::vector &tensors, int dstRank, int tag) { } else { run_send(tensor, dstRank, tag); } - STOP_COARSE(total, ((entry->src)[0]).nbytes()) }; auto entry = @@ -1851,23 +1308,14 @@ ProcessGroupACCL::send(std::vector &tensors, int dstRank, int tag) { void ProcessGroupACCL::run_recv(at::Tensor out_tensor, int srcRank, int tag) { - // Reserve device - - ACCL::debug("init_" + std::string(x_MAKE_STRING(COLL_NAME)) + "_" + std::to_string(out_tensor.nbytes()) + " durationUs: " + std::to_string(0)); c10::DeviceGuard guard(out_tensor.device()); std::unique_lock globalLock(pgGlobalMutex_); - // init_output_data(out_tensor, dstdata, out_tensor.numel(), out_tensor.scalar_type(), true, true); - - PRE_REQUEST(Receive, out_tensor) + PRE_REQUEST(Receive, out_tensor); ACCL::ACCLRequest* req = accl->recv(*out_buf, out_tensor.numel(), srcRank, tag); - POST_REQUEST("recv", out_tensor.nbytes()) - - START_FINE(copy) copy_back_tensor(out_tensor, out_buf, true, true); - STOP_FINE(copy, out_tensor.nbytes()) } c10::intrusive_ptr @@ -1877,7 +1325,6 @@ ProcessGroupACCL::recv(std::vector &tensors, int srcRank, int tag) { std::function &)> runFunc = [srcRank, tag, this](std::unique_ptr &entry) { const at::Tensor &tensor = (entry->dst)[0]; - START_COARSE(total) // Segment data if necessary if (tensor.nbytes() > bufsize) { size_t n = bufsize / tensor.itemsize(); @@ -1888,7 +1335,6 @@ ProcessGroupACCL::recv(std::vector &tensors, int srcRank, int tag) { } else { run_recv(tensor, srcRank, tag); } - STOP_COARSE(total, ((entry->src)[0]).nbytes()) }; auto entry = diff --git a/integrations/pytorch_ddp/test/test-mnist.py b/integrations/pytorch_ddp/test/test-mnist.py index b667a08d..0256b017 100644 --- a/integrations/pytorch_ddp/test/test-mnist.py +++ b/integrations/pytorch_ddp/test/test-mnist.py @@ -9,7 +9,6 @@ import torch.distributed as dist import accl_process_group as accl -from mpi4py.MPI import COMM_WORLD as mpi from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.data.distributed import DistributedSampler @@ -59,7 +58,7 @@ def forward(self, x): output = self.out(x) return output, x # return x for visualization -def train(num_epochs, cnn, loaders, p): +def train(num_epochs, cnn, loaders): start_time_train = time.perf_counter() @@ -72,7 +71,7 @@ def train(num_epochs, cnn, loaders, p): for epoch in range(num_epochs): for i, (images, labels) in enumerate(loaders['train']): - p.step() + # p.step() start_time = time.perf_counter() # gives batch data, normalize x when iterate train_loader b_x = Variable(images) # batch x @@ -102,7 +101,7 @@ def train(num_epochs, cnn, loaders, p): print('Total train time: ' + str(measured_time_train)) -def test(p): +def test(): # Test the model start_time_test = time.perf_counter() cnn.eval() @@ -110,7 +109,7 @@ def test(p): correct = 0 total = 0 for images, labels in loaders['test']: - p.step() + # p.step() test_output, last_layer = cnn(images) pred_y = torch.max(test_output, 1)[1].data.squeeze() correct_current = (pred_y == labels).sum().item() @@ -167,8 +166,11 @@ def test(p): args.master_port = "30505" os.environ['MASTER_ADDR'] = args.master_address os.environ['MASTER_PORT'] = args.master_port - rank = mpi.Get_rank() - size = mpi.Get_size() + + dist.init_process_group("mpi") + rank = dist.get_rank() + size = dist.get_world_size() + rxbufsize = 4096 * 1024 @@ -203,10 +205,11 @@ def test(p): design = accl.ACCLDesign.cyt_rdma - mpi.Barrier() - 
accl.create_process_group(ranks, design, bufsize=rxbufsize, initialize=True, simulation=args.simulator) - dist.init_process_group("ACCL", rank=rank, world_size=size) + # dist.init_process_group("mpi", rank=rank, world_size=size) + + # accl.create_process_group(ranks, design, bufsize=rxbufsize, initialize=True, simulation=args.simulator) + # dist.init_process_group("ACCL", rank=rank, world_size=size) device = 'cpu' @@ -243,10 +246,11 @@ def test(p): num_epochs = 10 - mpi.Barrier() - print("starting training") + print(rank) + print(size) + schedule = torch.profiler.schedule( wait=1, warmup=1, @@ -254,22 +258,18 @@ def test(p): repeat=3 ) - with torch.profiler.profile( - activities=[torch.profiler.ProfilerActivity.CPU], - schedule=schedule, - on_trace_ready=torch.profiler.tensorboard_trace_handler('./accl_log/profiler_log'), - record_shapes=True, - with_stack=True - ) as p: + # with torch.profiler.profile( + # activities=[torch.profiler.ProfilerActivity.CPU], + # schedule=schedule, + # on_trace_ready=torch.profiler.tensorboard_trace_handler('./accl_log/profiler_log'), + # record_shapes=True, + # with_stack=True + # ) as p: - train(num_epochs, cnn, loaders, p) - - test(p) - - p.stop() + train(num_epochs, cnn, loaders) - print(p.key_averages().table(sort_by="self_cpu_time_total", row_limit=100)) + test() dist.destroy_process_group()