diff --git a/.gitignore b/.gitignore index 46f1b628..09c459e2 100644 --- a/.gitignore +++ b/.gitignore @@ -21,3 +21,11 @@ driver/xrt/src/m2m driver/xrt/build .vscode +coyote_build* +*xcu55c-fsvh2892-2L-e* +*_prj +*.gen +*.ip_user_files +*.cache +*.srcs +**/fpga_ips.txt \ No newline at end of file diff --git a/driver/xrt/CMakeLists.txt b/driver/xrt/CMakeLists.txt index f6ed4236..c6b62a9b 100644 --- a/driver/xrt/CMakeLists.txt +++ b/driver/xrt/CMakeLists.txt @@ -74,6 +74,7 @@ set(ACCL_DOCS_RST ) set(EN_COYOTE ON) +set(EN_AVX 1 CACHE STRING "AVX environment.") if(EN_COYOTE) message("Enable Coyote") set(ACCL_HEADERS @@ -88,6 +89,7 @@ if(EN_COYOTE) file(GLOB COYOTE_SOURCE "${COYOTE_SOURCE_PATH}/*.cpp") if(EN_AVX) + add_definitions(-DEN_AVX) set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread -mavx -march=native -O3") else() set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread -march=native -O1") diff --git a/driver/xrt/CMakeLists_GPU.txt b/driver/xrt/CMakeLists_GPU.txt new file mode 100644 index 00000000..a93c09b6 --- /dev/null +++ b/driver/xrt/CMakeLists_GPU.txt @@ -0,0 +1,251 @@ +# /******************************************************************************* +# Copyright (C) 2022 Xilinx, Inc +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# *******************************************************************************/ + +cmake_minimum_required(VERSION 3.9) +project(accl VERSION 0.1.0 DESCRIPTION "ACCL") + +set(CMAKE_CXX_STANDARD 17) + + +set(EN_GPU 1) +if(NOT DEFINED ROCM_PATH) +if(DEFINED ENV{ROCM_PATH}) + set(ROCM_PATH $ENV{ROCM_PATH} CACHE PATH "Path to which ROCM has been installed") +elseif(DEFINED ENV{HIP_PATH}) + set(ROCM_PATH "$ENV{HIP_PATH}/.." CACHE PATH "Path to which ROCM has been installed") +else() + set(ROCM_PATH "/opt/rocm" CACHE PATH "Path to which ROCM has been installed") +endif() +endif() + +file(STRINGS "${ROCM_PATH}/.info/version" ROCM_VERSION) +message("-- Found ROCm: ${ROCM_VERSION}") + +if (NOT DEFINED CMAKE_CXX_COMPILER) + set(CMAKE_CXX_COMPILER ${ROCM_PATH}/bin/hipcc) +endif() + +if(NOT DEFINED HIP_PATH) + if(NOT DEFINED ENV{HIP_PATH}) + set(HIP_PATH "/opt/rocm/hip" CACHE PATH "Path to which HIP has been installed") + else() + set(HIP_PATH $ENV{HIP_PATH} CACHE PATH "Path to which HIP has been installed") + endif() +endif() + +if(NOT DEFINED HCC_PATH) + if(DEFINED ENV{HCC_PATH}) + set(HCC_PATH $ENV{HCC_PATH} CACHE PATH "Path to which HCC has been installed") + else() + set(HCC_PATH "${ROCM_PATH}/hcc" CACHE PATH "Path to which HCC has been installed") + endif() + set(HCC_HOME "${HCC_PATH}") +endif() + +if(NOT DEFINED HIP_CLANG_PATH) + if(NOT DEFINED ENV{HIP_CLANG_PATH}) + set(HIP_CLANG_PATH "${ROCM_PATH}/llvm/bin" CACHE PATH "Path to which HIP compatible clang binaries have been installed") + else() + set(HIP_CLANG_PATH $ENV{HIP_CLANG_PATH} CACHE PATH "Path to which HIP compatible clang binaries have been installed") + endif() +endif() + +find_package(HIP QUIET) +if(HIP_FOUND) + message(STATUS "Found HIP: " ${HIP_VERSION}) +else() + message(FATAL_ERROR "Could not find HIP. 
Ensure that HIP is either installed in /opt/rocm/hip or the variable HIP_PATH is set to point to the right location.") +endif() +find_package(hip REQUIRED) + +set(CYT_LANG ${CYT_LANG} HIP) + +if (NOT CMAKE_LIBRARY_OUTPUT_DIRECTORY) + set(CMAKE_LIBRARY_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/lib") +endif() + +if (NOT CMAKE_RUNTIME_OUTPUT_DIRECTORY) + set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/bin") +endif() + +# Consider switching to PROJECT_IS_TOP_LEVEL from CMake 3.21 (2021) +# (https://cmake.org/cmake/help/latest/variable/PROJECT_IS_TOP_LEVEL.html) +get_directory_property(HAS_PARENT PARENT_DIRECTORY) + +set(ACCL_SOURCE_PATH ${CMAKE_CURRENT_LIST_DIR}/src) +set(ACCL_HEADER_PATH ${CMAKE_CURRENT_LIST_DIR}/include) +set(ACCL_DOCS_PATH ${CMAKE_CURRENT_LIST_DIR}/docs) +set(ACCL_REPO_ROOT ${CMAKE_CURRENT_LIST_DIR}/../../) +set(ZMQ_INTF_DIR ${ACCL_REPO_ROOT}/test/model/zmq) +set(COYOTE_SOURCE_PATH ${CMAKE_CURRENT_LIST_DIR}/../../test/refdesigns/Coyote/sw/src) +set(COYOTE_HEADER_PATH ${CMAKE_CURRENT_LIST_DIR}/../../test/refdesigns/Coyote/sw/include) + +set(ACCL_HEADERS + ${ACCL_HEADER_PATH}/accl.hpp + ${ACCL_HEADER_PATH}/common.hpp + ${ACCL_HEADER_PATH}/communicator.hpp + ${ACCL_HEADER_PATH}/constants.hpp + ${ACCL_HEADER_PATH}/simdevice.hpp + ${ACCL_HEADER_PATH}/simbuffer.hpp + ${ACCL_HEADER_PATH}/xrtdevice.hpp + ${ACCL_HEADER_PATH}/acclrequest.hpp +) + +set(ACCL_SOURCES + ${ACCL_SOURCE_PATH}/accl.cpp + ${ACCL_SOURCE_PATH}/common.cpp + ${ACCL_SOURCE_PATH}/communicator.cpp + ${ACCL_SOURCE_PATH}/constants.cpp + ${ACCL_SOURCE_PATH}/simdevice.cpp + ${ACCL_SOURCE_PATH}/simbuffer.cpp + ${ACCL_SOURCE_PATH}/xrtdevice.cpp + ${ZMQ_INTF_DIR}/zmq_client.cpp + ${ZMQ_INTF_DIR}/zmq_common.cpp +) + +set(ACCL_DOCS_RST + ${ACCL_DOCS_PATH}/index.rst + ${ACCL_DOCS_PATH}/Cpp_reference/index.rst + ${ACCL_DOCS_PATH}/Cpp_reference/accl.rst + ${ACCL_DOCS_PATH}/Cpp_reference/buffer.rst + ${ACCL_DOCS_PATH}/Cpp_reference/cclo.rst + ${ACCL_DOCS_PATH}/Cpp_reference/communicator.rst + 
${ACCL_DOCS_PATH}/Cpp_reference/misc.rst +) + +set(EN_COYOTE ON) +set(EN_AVX 1 CACHE STRING "AVX environment.") +if(EN_COYOTE) + message("Enable Coyote") + set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} /mnt/scratch/jooertli/ACCL_CYT_V2_EGR_HOST_GPU/test/refdesigns/Coyote/cmake) + find_package(CoyoteSW REQUIRED) + set(ACCL_HEADERS + ${ACCL_HEADERS} + ${ACCL_HEADER_PATH}/coyotebuffer.hpp + ${ACCL_HEADER_PATH}/coyotedevice.hpp + ) + set(ACCL_SOURCES + ${ACCL_SOURCES} + ${ACCL_SOURCE_PATH}/coyotedevice.cpp + ) + file(GLOB COYOTE_SOURCE "${COYOTE_SOURCE_PATH}/*.cpp") + + if(EN_AVX) + add_definitions(-DEN_AVX) + set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread -mavx -march=native -O3") + else() + set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread -march=native -O1") + endif() +endif(EN_COYOTE) + + +set(ACCL_INCLUDE_PATH ${CMAKE_CURRENT_LIST_DIR}/include ${ZMQ_INTF_DIR}) +if (HAS_PARENT) + set(ACCL_INCLUDE_PATH ${CMAKE_CURRENT_LIST_DIR}/include ${ZMQ_INTF_DIR} PARENT_SCOPE) +endif (HAS_PARENT) + + +if(EN_COYOTE) + set(ACCL_SOURCES ${ACCL_SOURCES} ${COYOTE_SOURCE}) + set(ACCL_INCLUDE_PATH ${ACCL_INCLUDE_PATH} ${COYOTE_HEADER_PATH}) +endif(EN_COYOTE) + + +add_library(accl SHARED ${ACCL_SOURCES}) +target_include_directories(accl PUBLIC ${ACCL_INCLUDE_PATH}) + +# XRT +if (NOT EXISTS $ENV{XILINX_XRT}) + message(FATAL_ERROR "Xilinx XRT not found, make sure to source setup.sh") +endif () + +target_link_directories(accl PUBLIC $ENV{XILINX_XRT}/lib) +target_link_libraries(accl PUBLIC xilinxopencl xrt_coreutil xrt_core) + +target_include_directories(accl PUBLIC /opt/rocm/include /opt/rocm/include/hsa) +target_link_libraries(accl PUBLIC hip::device numa pthread drm drm_amdgpu rt dl hsa-runtime64 hsakmt) + +target_include_directories(accl PUBLIC $ENV{XILINX_XRT}/include) + +# ZMQ +target_link_libraries(accl PUBLIC zmq pthread) + +# Json +find_package(jsoncpp REQUIRED) +target_link_libraries(accl PUBLIC jsoncpp_lib) +get_target_property(JSON_INC_PATH jsoncpp_lib 
INTERFACE_INCLUDE_DIRECTORIES) +target_include_directories(accl PUBLIC ${JSON_INC_PATH}) + + +if (ACCL_DEBUG) + target_compile_definitions(accl PUBLIC ACCL_DEBUG) + message("Defining ACCL_DEBUG") +endif (ACCL_DEBUG) + +set_target_properties(accl PROPERTIES + VERSION ${PROJECT_VERSION} + SOVERSION 1 + PUBLIC_HEADER include/accl.hpp +) + +# docs +find_package(Doxygen) + +# Add the cmake folder so the FindSphinx module is found +set(CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake" ${CMAKE_MODULE_PATH}) +find_package(Sphinx) + +if (HAS_PARENT) + set(DOC_TARGET accl_docs) +else (HAS_PARENT) + set(DOC_TARGET docs) +endif (HAS_PARENT) + +if (DOXYGEN_FOUND AND SPHINX_FOUND) + set(DOXYGEN_INDEX_FILE ${CMAKE_CURRENT_LIST_DIR}/docs/xml/index.xml) + set(SPHINX_INDEX_FILE ${CMAKE_CURRENT_LIST_DIR}/docs/sphinx/index.html) + add_custom_command(OUTPUT ${DOXYGEN_INDEX_FILE} + COMMAND ${DOXYGEN_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/Doxyfile + WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR} + BYPRODUCTS + ${CMAKE_CURRENT_LIST_DIR}/docs/xml + ${CMAKE_CURRENT_LIST_DIR}/docs/latex + ${CMAKE_CURRENT_LIST_DIR}/docs/html + DEPENDS ${ACCL_HEADERS} + MAIN_DEPENDENCY ${CMAKE_CURRENT_LIST_DIR}/Doxyfile + COMMENT "Generating API documentation with Doxygen" + VERBATIM ) + + add_custom_target(${DOC_TARGET}_doxygen ALL DEPENDS ${DOXYGEN_INDEX_FILE}) + + add_custom_command(OUTPUT ${SPHINX_INDEX_FILE} + COMMAND ${SPHINX_EXECUTABLE} -b html ${CMAKE_CURRENT_LIST_DIR}/docs ${CMAKE_CURRENT_LIST_DIR}/docs/sphinx + WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR} + BYPRODUCTS + ${CMAKE_CURRENT_LIST_DIR}/docs/sphinx + DEPENDS ${ACCL_DOCS_RST} + MAIN_DEPENDENCY ${DOXYGEN_INDEX_FILE} + COMMENT "Generating API documentation with Sphinx" + VERBATIM + ) + + add_custom_target(${DOC_TARGET} ALL DEPENDS ${SPHINX_INDEX_FILE}) + + set_target_properties(${DOC_TARGET}_doxygen ${DOC_TARGET} PROPERTIES EXCLUDE_FROM_ALL 1 EXCLUDE_FROM_DEFAULT_BUILD 1) +else() + message("Doxygen needs to be installed to generate the doxygen 
documentation") +endif() diff --git a/driver/xrt/include/accl.hpp b/driver/xrt/include/accl.hpp index a98667fd..cbe7ffe8 100644 --- a/driver/xrt/include/accl.hpp +++ b/driver/xrt/include/accl.hpp @@ -101,7 +101,7 @@ class ACCL { */ void initialize(const std::vector &ranks, int local_rank, int n_egr_rx_bufs = 16, addr_t egr_rx_buf_size = 1024, - addr_t max_egr_size = 1024, addr_t max_rndzv_size = 32*1024); + addr_t max_egr_size = 1024, addr_t max_rndzv_size = 32*1024, bool rxEager_host = false); /** * Get the return code of the last ACCL call. @@ -1101,7 +1101,7 @@ ACCLRequest *barrier(communicatorId comm_id = GLOBAL_COMM, void configure_arithmetic(); void setup_eager_rx_buffers(size_t n_egr_rx_bufs, addr_t egr_rx_buf_size, - const std::vector &devicemem); + const std::vector &devicemem, bool host=false); void setup_eager_rx_buffers(size_t n_egr_rx_bufs, addr_t egr_rx_buf_size, int devicemem) { std::vector mems = {devicemem}; return setup_eager_rx_buffers(n_egr_rx_bufs, egr_rx_buf_size, mems); diff --git a/driver/xrt/include/accl/coyotebuffer.hpp b/driver/xrt/include/accl/coyotebuffer.hpp index c082a856..6405590a 100644 --- a/driver/xrt/include/accl/coyotebuffer.hpp +++ b/driver/xrt/include/accl/coyotebuffer.hpp @@ -19,12 +19,17 @@ #pragma once #include "buffer.hpp" #include "common.hpp" -#include "cProcess.hpp" +#include "cThread.hpp" #include "cDefs.hpp" #include "coyotedevice.hpp" #include #include #include +#define GPU_EN 0 +#if GPU_EN == 1 + #include + #define DEFAULT_GPU_ID 0 +#endif /** @file coyotebuffer.hpp */ @@ -57,14 +62,18 @@ template class CoyoteBuffer : public Buffer { size_t page_size = 1ULL << 21; this->buffer_size = length * sizeof(dtype); this->n_pages = (buffer_size + page_size - 1) / page_size; - std::cerr << "CoyoteBuffer contructor called! 
page_size:"<aligned_buffer = (dtype *)this->device->coyote_proc->getMem({fpga::CoyoteAlloc::HUGE_2M, n_pages}); + #if GPU_EN == 1 + if (hipSetDevice(DEFAULT_GPU_ID)) { throw std::runtime_error("Couldn't select GPU!"); } + this->aligned_buffer = (dtype *)this->device->coyote_proc->getMem({coyote::CoyoteAllocType::GPU, static_cast(this->buffer_size), true, DEFAULT_GPU_ID}); + #else + this->aligned_buffer = (dtype *)this->device->coyote_proc->getMem({coyote::CoyoteAllocType::HPF, this->buffer_size, true}); + #endif this->update_buffer(this->aligned_buffer, (addr_t)this->aligned_buffer); std::cerr << "Allocation successful! Allocated buffer: "<aligned_buffer << std::setbase(10) <<", Size: " << this->_size << std::endl; + //buffers in coyote per default on host memory host_flag = true; @@ -112,22 +121,30 @@ template class CoyoteBuffer : public Buffer { */ void sync_from_device() override { - std::cerr << "calling sync: " << std::setbase(16) << (uint64_t)this->aligned_buffer << ", size: " << std::setbase(10) << this->size() << std::endl; + std::cerr << "sync_from_device at address: " << std::setbase(16) << (uint64_t)this->aligned_buffer << ", size: " << std::setbase(10) << this->size() << std::endl; - this->device->coyote_proc->invoke({fpga::CoyoteOper::SYNC, this->aligned_buffer, (uint32_t)this->_size, true, true, 0, false}); + coyote::syncSg sg; + memset(&sg, 0, sizeof(coyote::syncSg)); + sg.addr = this->aligned_buffer; + sg.len = this->size(); + this->device->coyote_proc->invoke(coyote::CoyoteOper::LOCAL_SYNC, sg); this->host_flag = true; } /** * Sync the data from the host to the device. 
- * + * */ void sync_to_device() override { - std::cerr << "calling offload: " << std::setbase(16) << (uint64_t)this->aligned_buffer << ", size: " << std::setbase(10) << this->size() << std::endl; + std::cerr << "sync_to_device at address: " << std::setbase(16) << (uint64_t)this->aligned_buffer << ", size: " << std::setbase(10) << this->size() << std::endl; - this->device->coyote_proc->invoke({fpga::CoyoteOper::OFFLOAD, this->aligned_buffer, (uint32_t)this->_size, true, true, 0, false}); + coyote::syncSg sg; + memset(&sg, 0, sizeof(coyote::syncSg)); + sg.addr = this->aligned_buffer; + sg.len = this->size(); + this->device->coyote_proc->invoke(coyote::CoyoteOper::LOCAL_OFFLOAD, sg); this->host_flag = false; } @@ -144,7 +161,7 @@ template class CoyoteBuffer : public Buffer { // } // } - std::cerr << "Free user buffer from cProc cPid:"<< std::setbase(10)<device->coyote_proc->getCpid()<<", buffer_size:"<aligned_buffer<device->coyote_proc->getCtid()<<", buffer_size:"<aligned_buffer<device->coyote_proc->freeMem(this->aligned_buffer); return; } diff --git a/driver/xrt/include/accl/coyotedevice.hpp b/driver/xrt/include/accl/coyotedevice.hpp index 7d3e1689..5c5795cd 100644 --- a/driver/xrt/include/accl/coyotedevice.hpp +++ b/driver/xrt/include/accl/coyotedevice.hpp @@ -20,9 +20,9 @@ #include "acclrequest.hpp" #include "cclo.hpp" #include "constants.hpp" -#include "cProcess.hpp" -#include "ibvQpConn.hpp" -#include "ibvStructs.hpp" +#include "cThread.hpp" +// #include "ibvQpConn.hpp" +// #include "ibvStructs.hpp" #include #include #include @@ -108,7 +108,7 @@ class CoyoteDevice : public CCLO { void printDebug() override; - fpga::cProcess* get_device(){ + coyote::cThread* get_device(){ return coyote_proc; } @@ -120,13 +120,13 @@ class CoyoteDevice : public CCLO { val_t get_retcode(ACCLRequest *request) override; - fpga::cProcess* coyote_proc; + coyote::cThread* coyote_proc; // RDMA related // RDMA requires multiple processes to establish queue pairs // The CCLO kernel is still 
managed by coyote_proc unsigned int num_qp; - std::vector coyote_qProc_vec; + std::vector coyote_qProc_vec; private: const size_t OFFSET_CCLO = 0x0; diff --git a/driver/xrt/src/accl.cpp b/driver/xrt/src/accl.cpp index 9e98efb7..fe1406ef 100644 --- a/driver/xrt/src/accl.cpp +++ b/driver/xrt/src/accl.cpp @@ -128,12 +128,11 @@ ACCLRequest *ACCL::send(BaseBuffer &srcbuf, unsigned int count, if (from_fpga == false) { srcbuf.sync_to_device(); } - options.scenario = operation::send; options.comm = communicators[comm_id].communicators_addr(); options.addr_0 = &srcbuf; options.count = count; - options.root_src_dst = dst; + options.root_src_dst = dst; options.tag = tag; options.compress_dtype = compress_dtype; options.waitfor = waitfor; @@ -143,7 +142,6 @@ ACCLRequest *ACCL::send(BaseBuffer &srcbuf, unsigned int count, wait(handle); check_return_value("send", handle); } - return handle; } @@ -244,7 +242,6 @@ ACCLRequest *ACCL::recv(BaseBuffer &dstbuf, unsigned int count, "sync_from_device() after waiting" << std::endl; } - options.scenario = operation::recv; options.comm = communicators[comm_id].communicators_addr(); options.addr_2 = &dstbuf; @@ -262,7 +259,6 @@ ACCLRequest *ACCL::recv(BaseBuffer &dstbuf, unsigned int count, } check_return_value("recv", handle); } - return handle; } @@ -287,7 +283,6 @@ ACCLRequest *ACCL::recv(dataType dst_data_type, unsigned int count, wait(handle); check_return_value("recv", handle); } - return handle; } @@ -302,11 +297,9 @@ ACCLRequest *ACCL::copy(BaseBuffer *srcbuf, BaseBuffer *dstbuf, unsigned int cou "sync_from_device() after waiting" << std::endl; } - if (from_fpga == false) { srcbuf->sync_to_device(); } - options.scenario = operation::copy; options.addr_0 = srcbuf; options.addr_2 = dstbuf; @@ -318,13 +311,13 @@ ACCLRequest *ACCL::copy(BaseBuffer *srcbuf, BaseBuffer *dstbuf, unsigned int cou ACCLRequest *handle = call_async(options); if (!run_async) { - wait(handle); + std::chrono::milliseconds timeout(1000); + wait(handle, timeout); 
if (to_fpga == false) { dstbuf->sync_from_device(); } check_return_value("copy", handle); } - return handle; } @@ -1011,6 +1004,8 @@ std::string ACCL::dump_eager_rx_buffers(size_t n_egr_rx_bufs, bool dump_data) { address += 4; val_t addrh = cclo->read(address); address += 4; + val_t max_len = cclo->read(address); + address += 4; val_t rxtag = cclo->read(address); address += 4; val_t rxlen = cclo->read(address); @@ -1018,24 +1013,33 @@ std::string ACCL::dump_eager_rx_buffers(size_t n_egr_rx_bufs, bool dump_data) { val_t rxsrc = cclo->read(address); address += 4; val_t seq = cclo->read(address); + address += 4; + val_t hostBit = cclo->read(address); stream << "Spare RX Buffer " << i << ":\t address: 0x" << std::hex << addrh * (1UL << 32) + addrl << std::dec << " \t status: " << status << " \t occupancy: " << rxlen << "/" << maxsize << " \t MPI tag: " << std::hex << rxtag << std::dec - << " \t seq: " << seq << " \t src: " << rxsrc; + << " \t seq: " << seq << " \t src: " << rxsrc + << " \t hostBit: " << hostBit; if(dump_data) { - eager_rx_buffers[i]->sync_from_device(); + if(!(hostBit && cclo->get_device_type() == CCLO::coyote_device)){ + eager_rx_buffers[i]->sync_from_device(); + } stream << " \t data: " << std::hex << "["; for (size_t j = 0; j < eager_rx_buffers[i]->size(); ++j) { - stream << "0x" - << static_cast(static_cast( - eager_rx_buffers[i]->byte_array())[j]); - if (j != eager_rx_buffers[i]->size() - 1) { - stream << ", "; - } + if(static_cast(static_cast( + eager_rx_buffers[i]->byte_array())[j]) != 0){ + + stream << "0x" + << static_cast(static_cast( + eager_rx_buffers[i]->byte_array())[j]); + if (j != eager_rx_buffers[i]->size() - 1) { + stream << ", "; + } + } } stream << "]" << std::dec << std::endl; } else { @@ -1065,7 +1069,7 @@ void ACCL::parse_hwid(){ void ACCL::initialize(const std::vector &ranks, int local_rank, int n_egr_rx_bufs, addr_t egr_rx_buf_size, - addr_t max_egr_size, addr_t max_rndzv_size) { + addr_t max_egr_size, addr_t max_rndzv_size, 
bool rxEager_host) { parse_hwid(); @@ -1077,8 +1081,7 @@ void ACCL::initialize(const std::vector &ranks, int local_rank, } debug("Configuring Eager RX Buffers"); - setup_eager_rx_buffers(n_egr_rx_bufs, egr_rx_buf_size, rxbufmem); - + setup_eager_rx_buffers(n_egr_rx_bufs, egr_rx_buf_size, rxbufmem, rxEager_host); debug("Configuring Rendezvous Spare Buffers"); setup_rendezvous_spare_buffers(max_rndzv_size, rxbufmem); @@ -1129,7 +1132,7 @@ addr_t ACCL::get_arithmetic_config_addr(std::pair id) { } void ACCL::setup_eager_rx_buffers(size_t n_egr_rx_bufs, addr_t egr_rx_buf_size, - const std::vector &devicemem) { + const std::vector &devicemem, bool host) { addr_t address = CCLO_ADDR::EGR_RX_BUF_SIZE_OFFSET; eager_rx_buffer_size = egr_rx_buf_size; for (size_t i = 0; i < n_egr_rx_bufs; ++i) { @@ -1137,15 +1140,32 @@ void ACCL::setup_eager_rx_buffers(size_t n_egr_rx_bufs, addr_t egr_rx_buf_size, Buffer *buf; if (sim_mode) { - buf = new SimBuffer(new int8_t[eager_rx_buffer_size](), eager_rx_buffer_size, dataType::int8, + if(host){ + buf = new SimBuffer(new int8_t[eager_rx_buffer_size](), eager_rx_buffer_size, dataType::int8, + static_cast(cclo)->get_context(), true, ACCL_SIM_DEFAULT_BANK); + }else{ + buf = new SimBuffer(new int8_t[eager_rx_buffer_size](), eager_rx_buffer_size, dataType::int8, static_cast(cclo)->get_context()); + } } else if(cclo->get_device_type() == CCLO::xrt_device ){ - buf = new XRTBuffer(eager_rx_buffer_size, dataType::int8, *(static_cast(cclo)->get_device()), devicemem[i % devicemem.size()]); + if(host){ + //TODO: how to define host buffers in XRT? 
+ buf = new XRTBuffer(eager_rx_buffer_size, dataType::int8, *(static_cast(cclo)->get_device()), devicemem[i % devicemem.size()]); + }else{ + buf = new XRTBuffer(eager_rx_buffer_size, dataType::int8, *(static_cast(cclo)->get_device()), devicemem[i % devicemem.size()]); + } } else if(cclo->get_device_type() == CCLO::coyote_device){ - buf = new CoyoteBuffer(eager_rx_buffer_size, dataType::int8, static_cast(cclo)); + if(host){ + //buffers in coyote per default on host + buf = new CoyoteBuffer(eager_rx_buffer_size, dataType::int8, static_cast(cclo)); + }else{ + buf = new CoyoteBuffer(eager_rx_buffer_size, dataType::int8, static_cast(cclo)); + } + } + //add if else as well, test for coyote backend + eager on host + if(!(host && cclo->get_device_type() == CCLO::coyote_device)){ + buf->sync_to_device(); } - - buf->sync_to_device(); eager_rx_buffers.emplace_back(buf); // program this buffer into the accelerator address += 4; @@ -1155,10 +1175,18 @@ void ACCL::setup_eager_rx_buffers(size_t n_egr_rx_bufs, addr_t egr_rx_buf_size, address += 4; cclo->write(address, (buf->address() >> 32) & 0xffffffff); // clear remaining 4 fields - for (size_t j = 0; j < 4; ++j) { + for (size_t j = 0; j < 5; ++j) { address += 4; cclo->write(address, 0); } + //set the host flag + // NOTE: the host flag is set to true if the buffer is a host buffer + address += 4; + if(host){ + cclo->write(address, 1); // set host flag + }else{ + cclo->write(address, 0); // set host flag + } } //write buffer len @@ -1168,7 +1196,6 @@ void ACCL::setup_eager_rx_buffers(size_t n_egr_rx_bufs, addr_t egr_rx_buf_size, cclo->write(CCLO_ADDR::NUM_EGR_RX_BUFS_OFFSET, n_egr_rx_bufs); current_config_address = address + 4; - } void ACCL::setup_rendezvous_spare_buffers(addr_t rndzv_spare_buf_size, const std::vector &devicemem){ @@ -1184,7 +1211,8 @@ void ACCL::setup_rendezvous_spare_buffers(addr_t rndzv_spare_buf_size, const std } else if(cclo->get_device_type() == CCLO::coyote_device){ buf = new 
CoyoteBuffer(max_rndzv_msg_size, dataType::int8, static_cast(cclo)); } - buf->sync_to_device(); + //TODO: make mem residience configurable as well for nocard configuration + //buf->sync_to_device(); utility_spares.emplace_back(buf); } cclo->write(CCLO_ADDR::SPARE1_OFFSET, utility_spares.at(0)->address() & 0xffffffff); diff --git a/driver/xrt/src/coyotedevice.cpp b/driver/xrt/src/coyotedevice.cpp index fd96b904..7c761fb1 100644 --- a/driver/xrt/src/coyotedevice.cpp +++ b/driver/xrt/src/coyotedevice.cpp @@ -18,9 +18,10 @@ #include "accl/coyotedevice.hpp" #include "accl/common.hpp" -#include "cProcess.hpp" +#include "cThread.hpp" #include #include +#include static void finish_coyote_request(ACCL::CoyoteRequest *req) { req->wait_kernel(); @@ -45,8 +46,14 @@ void CoyoteRequest::start() { } else { function = static_cast(options.reduce_function); } + /*std::cout << "options.scenario " << static_cast(options.scenario) << " options.count " << options.count << " options.comm " << options.comm << + " options.root_src_dest " << options.root_src_dst << " options.cfg_function " << static_cast(options.cfg_function) << " options.reduce_function " << static_cast(options.reduce_function) << + " options.tag " << options.tag << " options.arithcfg_addr " << options.arithcfg_addr << " options.compress_dtype " << static_cast(options.compress_dtype) << + " options.compression_flags " << std::bitset<32>(static_cast(options.compression_flags)) << " options.stream_flags " << std::bitset<32>(static_cast(options.stream_flags)) << + " options.host_flags " << std::bitset<32>(static_cast(options.host_flags)) << " address0 " << options.addr_0->address() << " address1 " << options.addr_1->address() << + " address2 " << options.addr_2->address() << " options.data_type_io_0 " << static_cast(options.data_type_io_0) << " options.data_type_io_1 " << static_cast(options.data_type_io_1) << + " options.data_type_io_2 " << static_cast(options.data_type_io_2) << std::endl;*/ uint32_t flags = 
static_cast(options.host_flags) << 8 | static_cast(options.stream_flags); - auto coyote_proc = reinterpret_cast(cclo())->get_device(); if ((coyote_proc->getCSR((OFFSET_HOSTCTRL + HOSTCTRL_ADDR::AP_CTRL)>>2) & 0x4) == 0) { // read AP_CTRL and check bit 3 (the idle bit) @@ -251,6 +258,7 @@ void CoyoteRequest::start() { } case ACCL::operation::config:{ coyote_proc->setCSR(static_cast(options.scenario), (OFFSET_HOSTCTRL + HOSTCTRL_ADDR::SCEN)>>2); + coyote_proc->setCSR(static_cast(options.count), (OFFSET_HOSTCTRL + HOSTCTRL_ADDR::LEN)>>2); coyote_proc->setCSR(static_cast(function), (OFFSET_HOSTCTRL + HOSTCTRL_ADDR::FUNCTION_R)>>2); //coyote_proc->setCSR(static_cast(flags), (OFFSET_HOSTCTRL + HOSTCTRL_ADDR::STREAM_FLAGS)>>2); //safe to delete? break; @@ -276,28 +284,28 @@ void CoyoteRequest::wait_kernel() { } CoyoteDevice::CoyoteDevice(): num_qp(0) { - this->coyote_proc = new fpga::cProcess(targetRegion, getpid()); - std::cerr << "ACLL DEBUG: aquiring cProc: targetRegion: " << targetRegion << ", cPid: " << coyote_proc->getCpid() << std::endl; + this->coyote_proc = new coyote::cThread(targetRegion, getpid(), 0); + std::cerr << "ACLL DEBUG: aquiring cProc: targetRegion: " << targetRegion << ", cPid: " << coyote_proc->getCtid() << std::endl; } CoyoteDevice::CoyoteDevice(unsigned int num_qp): num_qp(num_qp) { for (unsigned int i=0; i<(num_qp+1); i++) { - fpga::cProcess* cproc = new fpga::cProcess(targetRegion, getpid()); + coyote::cThread* cproc = new coyote::cThread(targetRegion, getpid(), 0); coyote_qProc_vec.push_back(cproc); } for (unsigned int i=0; igetCpid() == 0){ + if(i == 0){ this->coyote_proc = coyote_qProc_vec[i]; - std::cerr << "ACLL DEBUG: aquiring cProc: targetRegion: " << targetRegion << ", cPid: " << coyote_proc->getCpid() << std::endl; + std::cerr << "ACLL DEBUG: aquiring cProc: targetRegion: " << targetRegion << ", cPid: " << coyote_proc->getCtid() << std::endl; coyote_qProc_vec.erase(coyote_qProc_vec.begin() + i); break; } } - if(coyote_proc == NULL || 
coyote_proc->getCpid() != 0){ + if(coyote_proc == NULL){ std::cerr << "cProc initialization error!"<getCpid() << std::endl; + std::cerr << "ACLL DEBUG: aquiring qProc: targetRegion: " << targetRegion << ", cPid: " << coyote_qProc_vec[i]->getCtid() << std::endl; } } @@ -369,7 +377,7 @@ CCLO::deviceType CoyoteDevice::get_device_type() void CoyoteDevice::printDebug(){ coyote_proc->printDebug(); - std::ifstream inputFile("/sys/kernel/coyote_cnfg/cyt_attr_nstats_q0"); + std::ifstream inputFile("/sys/kernel/coyote_sysfs_0/cyt_attr_nstats"); if (!inputFile.is_open()) { std::cerr << "Failed to open net sts file." << std::endl; diff --git a/kernels/cclo/Makefile b/kernels/cclo/Makefile index dc76095c..48180f4c 100644 --- a/kernels/cclo/Makefile +++ b/kernels/cclo/Makefile @@ -15,13 +15,13 @@ # # *******************************************************************************/ -PLATFORM ?= xilinx_u280_xdma_201920_3 +PLATFORM ?= xilinx_u55c_gen3x16_xdma_3_202210_1 HW_DEBUG ?= none -STACK_TYPE ?= UDP +STACK_TYPE ?= RDMA MODE ?= xo EN_DMA ?= 1 EN_ARITH ?= 1 -EN_COMPRESS ?= 1 +EN_COMPRESS ?= 0 EN_EXT_KRNL ?= 1 MB_DEBUG_LEVEL ?= 0 SIM_MEM_SIZE_LOG ?= 28 diff --git a/kernels/cclo/fw/sw_apps/ccl_offload_control/src/ccl_offload_control.h b/kernels/cclo/fw/sw_apps/ccl_offload_control/src/ccl_offload_control.h index d523b4ff..3bb1cee3 100644 --- a/kernels/cclo/fw/sw_apps/ccl_offload_control/src/ccl_offload_control.h +++ b/kernels/cclo/fw/sw_apps/ccl_offload_control/src/ccl_offload_control.h @@ -273,16 +273,19 @@ typedef struct { unsigned int rx_len; unsigned int rx_src; unsigned int sequence_number; + unsigned int host; //host address for the packet } rx_buffer; #define STATUS_OFFSET 0 #define ADDRL_OFFSET 1 #define ADDRH_OFFSET 2 -#define RX_TAG_OFFSET 3 -#define RX_LEN_OFFSET 4 -#define RX_SRC_OFFSET 5 -#define SEQUENCE_NUMBER_OFFSET 6 -#define SPARE_BUFFER_FIELDS 7 +#define MAX_LEN_OFFSET 3 +#define RX_TAG_OFFSET 4 +#define RX_LEN_OFFSET 5 +#define RX_SRC_OFFSET 6 +#define 
SEQUENCE_NUMBER_OFFSET 7 +#define HOST_OFFSET 8 //host address offset for the buffer +#define SPARE_BUFFER_FIELDS 9 #define STATUS_IDLE 0x00 #define STATUS_ENQUEUED 0x01 diff --git a/kernels/cclo/hdl/ccl_offload.v b/kernels/cclo/hdl/ccl_offload.v index 8fbea965..8b6b17e9 100644 --- a/kernels/cclo/hdl/ccl_offload.v +++ b/kernels/cclo/hdl/ccl_offload.v @@ -622,5 +622,46 @@ module ccl_offload .s_axi_control_wstrb(s_axi_control_wstrb), .s_axi_control_wvalid(s_axi_control_wvalid) ); +//ila to capture commands on cclo interf +/*ila_top inst_ila_top( + .clk(ap_clk), + .probe0(s_axis_call_req_tdata), //32 + .probe1(s_axis_call_req_tready), + .probe2(s_axis_call_req_tvalid), + .probe3(s_axis_call_req_tlast), + .probe4(s_axis_eth_rx_data_tdata), //512 + .probe5(s_axis_eth_rx_data_tdest), //8 + .probe6(s_axis_eth_rx_data_tready), + .probe7(s_axis_eth_rx_data_tvalid), + .probe8(m_axis_dma0_s2mm_tdata), //512 + .probe9(m_axis_dma0_s2mm_tdest), //8 + .probe10(m_axis_dma0_s2mm_tready), + .probe11(m_axis_dma0_s2mm_tvalid), + .probe12(s_axis_dma0_mm2s_tdata),//512 + .probe13(s_axis_dma0_mm2s_tdest), //8 + .probe14(s_axis_dma0_mm2s_tready), + .probe15(s_axis_dma0_mm2s_tvalid), + .probe16(m_axis_dma1_s2mm_tdata),//512 + .probe17(m_axis_dma1_s2mm_tdest), //8 + .probe18(m_axis_dma1_s2mm_tready), + .probe19(m_axis_dma1_s2mm_tvalid), + .probe20(s_axis_dma1_mm2s_tdata),//512 + .probe21(s_axis_dma1_mm2s_tdest), //8 + .probe22(s_axis_dma1_mm2s_tready), + .probe23(s_axis_dma1_mm2s_tvalid), + .probe24(s_axis_eth_notification_tdata), //64 + .probe25(s_axis_eth_notification_tready), + .probe26(s_axis_eth_notification_tvalid), + .probe27(m_axis_eth_tx_meta_tdata), //32 + .probe28(m_axis_eth_tx_meta_tready), + .probe29(m_axis_eth_tx_meta_tvalid), + .probe30(s_axis_eth_tx_status_tdata), //64 + .probe31(s_axis_eth_tx_status_tready), + .probe32(s_axis_eth_tx_status_tvalid), + .probe33(m_axis_eth_tx_data_tdata), //512 + .probe34(m_axis_eth_tx_data_tready), + .probe35(m_axis_eth_tx_data_tvalid), + 
.probe36(m_axis_eth_tx_data_tdest) //8 + );*/ endmodule diff --git a/kernels/cclo/hls/dma_mover/dma_mover.cpp b/kernels/cclo/hls/dma_mover/dma_mover.cpp index bede2f43..27cd75b1 100644 --- a/kernels/cclo/hls/dma_mover/dma_mover.cpp +++ b/kernels/cclo/hls/dma_mover/dma_mover.cpp @@ -594,6 +594,7 @@ void instruction_decode( dm1_rd.total_bytes = seek_res.len; bytes_remaining -= seek_res.len; dm1_rd.last = (bytes_remaining <= 0); + dm1_rd.mem_id = seek_res.host ? 1 : 0; //instruct to release this buffer once the DMA movement is complete STREAM_WRITE(rxbuf_release_idx, seek_res.index); ack_insn.release_count++; diff --git a/kernels/cclo/hls/rxbuf_offload/rxbuf_enqueue.cpp b/kernels/cclo/hls/rxbuf_offload/rxbuf_enqueue.cpp index 643a11b1..209c1833 100644 --- a/kernels/cclo/hls/rxbuf_offload/rxbuf_enqueue.cpp +++ b/kernels/cclo/hls/rxbuf_offload/rxbuf_enqueue.cpp @@ -50,8 +50,10 @@ void rxbuf_enqueue( //iterate until you run out of spare buffers for(int i=0; i < nbufs; i++){ ap_uint<32> status; + ap_uint<32> host; ap_uint<64> addr; status = rx_buffers[(i * SPARE_BUFFER_FIELDS) + STATUS_OFFSET]; + host = rx_buffers[(i * SPARE_BUFFER_FIELDS) + HOST_OFFSET]; addr(31, 0) = rx_buffers[(i * SPARE_BUFFER_FIELDS) + ADDRL_OFFSET]; addr(63, 32) = rx_buffers[(i * SPARE_BUFFER_FIELDS) + ADDRH_OFFSET]; @@ -64,7 +66,7 @@ void rxbuf_enqueue( cmd.tag = tag++; cmd_word.data = cmd; cmd_word.last = 1;//unused for now - cmd_word.dest = 0;//unused for now + cmd_word.dest = host;//unused for now STREAM_WRITE(dma_cmd, cmd_word); //update spare buffer status rx_buffers[(i * SPARE_BUFFER_FIELDS) + STATUS_OFFSET] = STATUS_ENQUEUED; diff --git a/kernels/cclo/hls/rxbuf_offload/rxbuf_offload.h b/kernels/cclo/hls/rxbuf_offload/rxbuf_offload.h index 4b2b443e..35c68201 100644 --- a/kernels/cclo/hls/rxbuf_offload/rxbuf_offload.h +++ b/kernels/cclo/hls/rxbuf_offload/rxbuf_offload.h @@ -20,6 +20,7 @@ typedef struct { ap_uint<32> index; ap_uint<32> len; bool valid; + bool host; } rxbuf_seek_result; typedef 
struct { diff --git a/kernels/cclo/hls/rxbuf_offload/rxbuf_seek.cpp b/kernels/cclo/hls/rxbuf_offload/rxbuf_seek.cpp index 78812821..d2c8fc78 100644 --- a/kernels/cclo/hls/rxbuf_offload/rxbuf_seek.cpp +++ b/kernels/cclo/hls/rxbuf_offload/rxbuf_seek.cpp @@ -59,6 +59,7 @@ void rxbuf_seek( pending_notif.signature.src == seek_sig.src && pending_notif.signature.seqn == seek_sig.seqn){ seek_res.addr(31,0) = rx_buffers[(RX_BUFFER_METADATA_OFFSET/4) + pending_notif.index * SPARE_BUFFER_FIELDS + ADDRL_OFFSET]; seek_res.addr(63,32) = rx_buffers[(RX_BUFFER_METADATA_OFFSET/4) + pending_notif.index * SPARE_BUFFER_FIELDS + ADDRH_OFFSET]; + seek_res.host = (rx_buffers[(RX_BUFFER_METADATA_OFFSET/4) + pending_notif.index * SPARE_BUFFER_FIELDS + HOST_OFFSET] == 1) ? true : false; seek_res.len = pending_notif.signature.len; seek_res.index = pending_notif.index; seek_res.valid = true; diff --git a/kernels/cclo/hls/rxbuf_offload/rxbuf_session.cpp b/kernels/cclo/hls/rxbuf_offload/rxbuf_session.cpp index 55751558..f7063e2f 100644 --- a/kernels/cclo/hls/rxbuf_offload/rxbuf_session.cpp +++ b/kernels/cclo/hls/rxbuf_offload/rxbuf_session.cpp @@ -84,7 +84,7 @@ void rxbuf_session_command( cmd.length = notif.length; cmd_word.data = cmd; cmd_word.last = 1;//always last, each command is a single word - cmd_word.dest = 0;//always write RX data to device (not host) + cmd_word.dest = desc.mem_index;//always write RX data to device (not host) STREAM_WRITE(fragment_dma_cmd, cmd_word); } else { //if EOF update address in descriptor diff --git a/kernels/cclo/tcl/generate_kernel.tcl b/kernels/cclo/tcl/generate_kernel.tcl index 0ccd5835..17402eba 100644 --- a/kernels/cclo/tcl/generate_kernel.tcl +++ b/kernels/cclo/tcl/generate_kernel.tcl @@ -70,6 +70,28 @@ generate_target all [get_files ./ccl_offload_ex/ccl_offload_ex.srcs/sources_1/b #build a .xsa file handoff write_hw_platform -fixed -force -file $xsafile + +#create ila_top for commands on cclo interf +#create_ip -name ila -vendor xilinx.com -library ip 
-version 6.2 -module_name ila_top +#set_property -dict [ list CONFIG.C_PROBE0_WIDTH {32} ] [get_ips ila_top] +#set_property -dict [ list CONFIG.C_PROBE4_WIDTH {512} ] [get_ips ila_top] +#set_property -dict [ list CONFIG.C_PROBE5_WIDTH {8} ] [get_ips ila_top] +#set_property -dict [ list CONFIG.C_PROBE8_WIDTH {512} ] [get_ips ila_top] +#set_property -dict [ list CONFIG.C_PROBE9_WIDTH {8} ] [get_ips ila_top] +#set_property -dict [ list CONFIG.C_PROBE12_WIDTH {512} ] [get_ips ila_top] +#set_property -dict [ list CONFIG.C_PROBE13_WIDTH {8} ] [get_ips ila_top] +#set_property -dict [ list CONFIG.C_PROBE16_WIDTH {512} ] [get_ips ila_top] +#set_property -dict [ list CONFIG.C_PROBE17_WIDTH {8} ] [get_ips ila_top] +#set_property -dict [ list CONFIG.C_PROBE20_WIDTH {512} ] [get_ips ila_top] +#set_property -dict [ list CONFIG.C_PROBE21_WIDTH {8} ] [get_ips ila_top] +#set_property -dict [ list CONFIG.C_PROBE24_WIDTH {64} ] [get_ips ila_top] +#set_property -dict [ list CONFIG.C_PROBE27_WIDTH {32} ] [get_ips ila_top] +#set_property -dict [ list CONFIG.C_PROBE30_WIDTH {64} ] [get_ips ila_top] +#set_property -dict [ list CONFIG.C_PROBE33_WIDTH {512} ] [get_ips ila_top] +#set_property -dict [ list CONFIG.C_PROBE36_WIDTH {8} ] [get_ips ila_top] +#set_property -dict [ list CONFIG.C_NUM_OF_PROBES {37} ] [get_ips ila_top] +#set_property -dict [ list CONFIG.C_EN_STRG_QUAL {1} ] [get_ips ila_top] +#set_property -dict [ list CONFIG.C_DATA_DEPTH {1024} ] [get_ips ila_top] # close and exit close_project exit diff --git a/kernels/plugins/Makefile b/kernels/plugins/Makefile index da39ea18..c176941d 100644 --- a/kernels/plugins/Makefile +++ b/kernels/plugins/Makefile @@ -16,8 +16,8 @@ # # *******************************************************************************/ -PERIPHERAL_IPS = hostctrl loopback reduce_ops hp_compression dummy_tcp_stack client_arbiter vadd_put cyt_adapter external_dma dummy_cyt_rdma_stack dummy_cyt_dma tcp_session_handler -DEVICE=xcu280-fsvh2892-2L-e +PERIPHERAL_IPS = 
hostctrl loopback reduce_ops hp_compression dummy_tcp_stack client_arbiter vadd_put cyt_adapter external_dma tcp_session_handler +DEVICE=xcu55c-fsvh2892-2L-e TARGET=ip all: $(PERIPHERAL_IPS) diff --git a/kernels/plugins/cyt_adapter/Makefile b/kernels/plugins/cyt_adapter/Makefile index be1a85c2..2db4e41d 100644 --- a/kernels/plugins/cyt_adapter/Makefile +++ b/kernels/plugins/cyt_adapter/Makefile @@ -16,19 +16,24 @@ # *******************************************************************************/ TARGET=ip -DEVICE=xcu250-figd2104-2L-e -CYT_DMA_ADAPTER=cyt_dma_adapter_$(DEVICE).xo +DEVICE=xcu55c-fsvh2892-2L-e +CYT_DMA_SQ_ADAPTER=cyt_dma_sq_adapter_$(DEVICE).xo CYT_RDMA_ARBITER=cyt_rdma_arbiter_$(DEVICE).xo -CYT_RDMA_MUX=cyt_rdma_mux_$(DEVICE).xo +CCLO_SQ_ADAPTER=cclo_sq_adapter_$(DEVICE).xo +CYT_CQ_DM_STS_CONVERTER=cyt_cq_dm_sts_converter_$(DEVICE).xo -all: $(CYT_DMA_ADAPTER) $(CYT_RDMA_ARBITER) $(CYT_RDMA_MUX) -$(CYT_DMA_ADAPTER): build_cyt_dma_adapter.tcl cyt_dma_adapter.cpp +all: $(CYT_RDMA_ARBITER) $(CCLO_SQ_ADAPTER) $(CYT_DMA_SQ_ADAPTER) $(CYT_CQ_DM_STS_CONVERTER) + +$(CYT_CQ_DM_STS_CONVERTER): build_cyt_cq_dm_sts_converter.tcl cyt_cq_dm_sts_converter.cpp + vitis_hls $< -tclargs $(TARGET) $(DEVICE) + +$(CYT_DMA_SQ_ADAPTER): build_cyt_dma_sq_adapter.tcl cyt_dma_sq_adapter.cpp vitis_hls $< -tclargs $(TARGET) $(DEVICE) $(CYT_RDMA_ARBITER): build_cyt_rdma_arbiter.tcl cyt_rdma_arbiter.cpp vitis_hls $< -tclargs $(TARGET) $(DEVICE) -$(CYT_RDMA_MUX): build_cyt_rdma_mux.tcl cyt_rdma_mux.cpp +$(CCLO_SQ_ADAPTER): build_cclo_sq_adapter.tcl cclo_sq_adapter.cpp vitis_hls $< -tclargs $(TARGET) $(DEVICE) diff --git a/kernels/plugins/cyt_adapter/build_cyt_rdma_mux.tcl b/kernels/plugins/cyt_adapter/build_cclo_sq_adapter.tcl similarity index 81% rename from kernels/plugins/cyt_adapter/build_cyt_rdma_mux.tcl rename to kernels/plugins/cyt_adapter/build_cclo_sq_adapter.tcl index ac0e6dde..c4d915ea 100644 --- a/kernels/plugins/cyt_adapter/build_cyt_rdma_mux.tcl +++ 
b/kernels/plugins/cyt_adapter/build_cclo_sq_adapter.tcl @@ -51,16 +51,15 @@ switch $command { } -open_project build_cyt_rdma_mux.${device} +open_project build_cclo_sq_adapter.${device} +add_files cclo_sq_adapter.cpp -cflags "-std=c++14 -I. -I../../cclo/hls/eth_intf -I../../../driver/hls/ -DACCL_SYNTHESIS" -add_files cyt_rdma_mux.cpp -cflags "-std=c++14 -I. -I../../cclo/hls/eth_intf -I../../../driver/hls/ -DACCL_SYNTHESIS" - -set_top cyt_rdma_mux +set_top cclo_sq_adapter open_solution sol1 -config_export -format xo -library ACCL -output [pwd]/cyt_rdma_mux_${device}.xo +config_export -format xo -library ACCL -output [pwd]/cclo_sq_adapter_${device}.xo if {$do_sim} { csim_design -clean diff --git a/kernels/plugins/cyt_adapter/build_cyt_dma_adapter.tcl b/kernels/plugins/cyt_adapter/build_cyt_cq_dm_sts_converter.tcl similarity index 75% rename from kernels/plugins/cyt_adapter/build_cyt_dma_adapter.tcl rename to kernels/plugins/cyt_adapter/build_cyt_cq_dm_sts_converter.tcl index 342b3400..1b868a48 100644 --- a/kernels/plugins/cyt_adapter/build_cyt_dma_adapter.tcl +++ b/kernels/plugins/cyt_adapter/build_cyt_cq_dm_sts_converter.tcl @@ -17,7 +17,6 @@ set command [lindex $argv 0] set device [lindex $argv 1] -set stack [lindex $argv 2] set do_sim 0 set do_syn 0 @@ -52,19 +51,15 @@ switch $command { } -open_project build_cyt_dma_adapter.${device} +open_project build_cyt_cq_dm_sts_converter.${device} -if {$stack eq "RDMA"} { - add_files cyt_dma_adapter.cpp -cflags "-std=c++14 -I. -I../../../driver/hls/ -DACCL_SYNTHESIS -DACCL_RDMA" -} else { - add_files cyt_dma_adapter.cpp -cflags "-std=c++14 -I. -I../../../driver/hls/ -DACCL_SYNTHESIS" -} +add_files cyt_cq_dm_sts_converter.cpp -cflags "-std=c++14 -I. 
-I../../cclo/hls/eth_intf -I../../../driver/hls/ -DACCL_SYNTHESIS" -set_top cyt_dma_adapter +set_top cyt_cq_dm_sts_converter open_solution sol1 -config_export -format xo -library ACCL -output [pwd]/cyt_dma_adapter_$device.xo +config_export -format xo -library ACCL -output [pwd]/cyt_cq_dm_sts_converter_${device}.xo if {$do_sim} { csim_design -clean diff --git a/kernels/plugins/cyt_adapter/build_cyt_dma_sq_adapter.tcl b/kernels/plugins/cyt_adapter/build_cyt_dma_sq_adapter.tcl new file mode 100644 index 00000000..e8e851a7 --- /dev/null +++ b/kernels/plugins/cyt_adapter/build_cyt_dma_sq_adapter.tcl @@ -0,0 +1,82 @@ +# /******************************************************************************* +# Copyright (C) 2023 Advanced Micro Devices, Inc +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# *******************************************************************************/ + +set command [lindex $argv 0] +set device [lindex $argv 1] + +set do_sim 0 +set do_syn 0 +set do_export 0 +set do_cosim 0 + +switch $command { + "sim" { + set do_sim 1 + } + "syn" { + set do_syn 1 + } + "ip" { + set do_syn 1 + set do_export 1 + } + "cosim" { + set do_syn 1 + set do_cosim 1 + } + "all" { + set do_sim 1 + set do_syn 1 + set do_export 1 + set do_cosim 1 + } + default { + puts "Unrecognized command" + exit + } +} + + +open_project build_cyt_dma_sq_adapter.${device} + +add_files cyt_dma_sq_adapter.cpp -cflags "-std=c++14 -I. 
-I../../cclo/hls/eth_intf -I../../../driver/hls/ -DACCL_SYNTHESIS" + + +set_top cyt_dma_sq_adapter + +open_solution sol1 +config_export -format xo -library ACCL -output [pwd]/cyt_dma_sq_adapter_${device}.xo + +if {$do_sim} { + csim_design -clean +} + +if {$do_syn} { + set_part $device + create_clock -period 4 -name default + csynth_design +} + +if {$do_export} { + export_design +} + +if ${do_cosim} { + cosim_design +} + +exit diff --git a/kernels/plugins/cyt_adapter/cclo_sq_adapter.cpp b/kernels/plugins/cyt_adapter/cclo_sq_adapter.cpp new file mode 100644 index 00000000..cb15cbba --- /dev/null +++ b/kernels/plugins/cyt_adapter/cclo_sq_adapter.cpp @@ -0,0 +1,128 @@ +/******************************************************************************* +# Copyright (C) 2023 Advanced Micro Devices, Inc +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# *******************************************************************************/ + +#include "cyt.h" +#include "eth_intf.h" + +using namespace std; + +// convert the cclo sq (rdma) to cyt sq (rdma) +// currently the cclo sq only contains WRITE/SEND rdma command +// we keep the conversion to cyt_sq_rd output just for consistency of interfaces and future extension +// the m_axis_cyt data stream corresponds to rreq_send and we use the dest to indicate whether host/device +// the s_axis_cyt data stream corresponds to rreq_recv and we simply consume it +void cclo_sq_adapter( + hls::stream& cclo_sq, + hls::stream >& s_axis_cclo, + hls::stream& cyt_sq_wr, + hls::stream& cyt_sq_rd, + hls::stream >& m_axis_cyt, + hls::stream >& s_axis_cyt + ) +{ +#pragma HLS INTERFACE axis register port=cclo_sq +#pragma HLS INTERFACE axis register port=s_axis_cclo +#pragma HLS INTERFACE axis register port=cyt_sq_wr +#pragma HLS INTERFACE axis register port=cyt_sq_rd +#pragma HLS INTERFACE axis register port=m_axis_cyt +#pragma HLS INTERFACE axis register port=s_axis_cyt +#pragma HLS aggregate variable=cclo_sq compact=bit +#pragma HLS aggregate variable=cyt_sq_wr compact=bit +#pragma HLS aggregate variable=cyt_sq_rd compact=bit + +#pragma HLS INTERFACE ap_ctrl_none port=return +#pragma HLS PIPELINE II=1 + + enum fsmStateType {META, WR_STREAM, RD_STREAM}; + static fsmStateType fsmState = META; + + static rdma_req_t cclo_req; + static cyt_req_t cyt_req; + static ap_axiu<512, 0, 0, 8> currWord; + static ap_uint<32> pkt_word; + static ap_uint<32> word_cnt = 0; + + switch (fsmState) + { + case META: + if(!STREAM_IS_EMPTY(cclo_sq)){ + cclo_req = STREAM_READ(cclo_sq); + + cyt_req.rsrvd = 0; + cyt_req.offs = 0; + cyt_req.host = 0; + cyt_req.actv = 0; + cyt_req.len = cclo_req.len; + cyt_req.vaddr = cclo_req.vaddr; + cyt_req.last = 1; // always assert last + cyt_req.dest = cclo_req.host; // 0-device memory, 1-host memory; + cyt_req.pid = cclo_req.qpn(CYT_PID_BITS-1,0); //qpn lowest bits are pid 
+ cyt_req.vfid = cclo_req.qpn(CYT_DEST_BITS+CYT_PID_BITS-1,CYT_PID_BITS); + cyt_req.remote = 0; + cyt_req.rdma = 0; + cyt_req.mode = 0; // always PARSE + cyt_req.strm = CYT_STRM_RDMA; + cyt_req.opcode = cclo_req.opcode; + + pkt_word = (cyt_req.len + 63) >> 6; + + if(cyt_req.opcode == CYT_RDMA_WRITE || cyt_req.opcode == CYT_RDMA_SEND || cyt_req.opcode == CYT_RDMA_IMMED){ + STREAM_WRITE(cyt_sq_wr, cyt_req); + fsmState = WR_STREAM; + } else if (cyt_req.opcode == CYT_RDMA_READ) { + STREAM_WRITE(cyt_sq_rd, cyt_req); + fsmState = RD_STREAM; + } + } + break; + // move s_axis_cclo to m_axis_cyt and adjust the dest field + case WR_STREAM: + if (!STREAM_IS_EMPTY(s_axis_cclo)) + { + currWord = STREAM_READ(s_axis_cclo); + ap_axiu<512, 0, 0, 8> outWord; + + outWord.data = currWord.data; + outWord.keep = currWord.keep; + outWord.last = currWord.last; + outWord.dest = cyt_req.dest; // use the dest flag to indicate whether it is to host or device + word_cnt++; + + if (word_cnt == pkt_word) + { + word_cnt = 0; + outWord.last = 1; + fsmState = META; + } + STREAM_WRITE(m_axis_cyt, outWord); + } + break; + // just consume all the data + case RD_STREAM: + if(!STREAM_IS_EMPTY(s_axis_cyt)){ + currWord = STREAM_READ(s_axis_cyt); + word_cnt++; + if (word_cnt == pkt_word) + { + word_cnt = 0; + fsmState = META; + } + } + break; + } + +} \ No newline at end of file diff --git a/kernels/plugins/cyt_adapter/cyt.h b/kernels/plugins/cyt_adapter/cyt.h index ae8d428b..39f7bcf2 100644 --- a/kernels/plugins/cyt_adapter/cyt.h +++ b/kernels/plugins/cyt_adapter/cyt.h @@ -24,70 +24,15 @@ using namespace std; +#define CYT_OFFS_BITS 6 #define CYT_VADDR_BITS 48 #define CYT_LEN_BITS 28 #define CYT_DEST_BITS 4 #define CYT_PID_BITS 6 -#define CYT_N_REGIONS_BITS 1 -#define CYT_RSRVD_BITS 96-4-CYT_N_REGIONS_BITS-CYT_VADDR_BITS-CYT_LEN_BITS-CYT_DEST_BITS-CYT_PID_BITS - -struct cyt_req_t{ - ap_uint rsrvd; - ap_uint vfid; - ap_uint pid; - ap_uint dest; - ap_uint<1> host; - ap_uint<1> ctl; - ap_uint<1> sync; - 
ap_uint<1> stream; - ap_uint len; - ap_uint vaddr; - - cyt_req_t() : rsrvd(0), vfid(0), pid(0), dest(0), host(0), ctl(0), sync(0), stream(0), len(0), vaddr(0) {} - - cyt_req_t(ap_uint rsrvd_arg, ap_uint vfid_arg, ap_uint pid_arg, - ap_uint dest_arg, ap_uint<1> host_arg, ap_uint<1> ctl_arg, ap_uint<1> sync_arg, - ap_uint<1> stream_arg, ap_uint len_arg, ap_uint vaddr_arg) - : rsrvd(rsrvd_arg), - vfid(vfid_arg), - pid(pid_arg), - dest(dest_arg), - host(host_arg), - ctl(ctl_arg), - sync(sync_arg), - stream(stream_arg), - len(len_arg), - vaddr(vaddr_arg) {} - - cyt_req_t(ap_uint<96> in) { - rsrvd = in(CYT_RSRVD_BITS - 1, 0); - vfid = in(CYT_RSRVD_BITS+CYT_N_REGIONS_BITS-1, CYT_RSRVD_BITS); - pid = in(CYT_RSRVD_BITS+CYT_N_REGIONS_BITS+CYT_PID_BITS-1,CYT_RSRVD_BITS+CYT_N_REGIONS_BITS); - dest = in(CYT_RSRVD_BITS+CYT_N_REGIONS_BITS+CYT_PID_BITS+CYT_DEST_BITS-1,CYT_RSRVD_BITS+CYT_N_REGIONS_BITS+CYT_PID_BITS); - host = in(CYT_RSRVD_BITS+CYT_N_REGIONS_BITS+CYT_PID_BITS+CYT_DEST_BITS,CYT_RSRVD_BITS+CYT_N_REGIONS_BITS+CYT_PID_BITS+CYT_DEST_BITS); - ctl = in(CYT_RSRVD_BITS+CYT_N_REGIONS_BITS+CYT_PID_BITS+CYT_DEST_BITS+1,CYT_RSRVD_BITS+CYT_N_REGIONS_BITS+CYT_PID_BITS+CYT_DEST_BITS+1); - sync = in(CYT_RSRVD_BITS+CYT_N_REGIONS_BITS+CYT_PID_BITS+CYT_DEST_BITS+2,CYT_RSRVD_BITS+CYT_N_REGIONS_BITS+CYT_PID_BITS+CYT_DEST_BITS+2); - stream = in(CYT_RSRVD_BITS+CYT_N_REGIONS_BITS+CYT_PID_BITS+CYT_DEST_BITS+3,CYT_RSRVD_BITS+CYT_N_REGIONS_BITS+CYT_PID_BITS+CYT_DEST_BITS+3); - len = in(CYT_RSRVD_BITS+CYT_N_REGIONS_BITS+CYT_PID_BITS+CYT_DEST_BITS+4+CYT_LEN_BITS-1,CYT_RSRVD_BITS+CYT_N_REGIONS_BITS+CYT_PID_BITS+CYT_DEST_BITS+4); - vaddr = in(CYT_RSRVD_BITS+CYT_N_REGIONS_BITS+CYT_PID_BITS+CYT_DEST_BITS+4+CYT_LEN_BITS+CYT_VADDR_BITS-1,CYT_RSRVD_BITS+CYT_N_REGIONS_BITS+CYT_PID_BITS+CYT_DEST_BITS+4+CYT_LEN_BITS); - } - - operator ap_uint<96>() { - ap_uint<96> ret; - 
ret(CYT_RSRVD_BITS+CYT_N_REGIONS_BITS+CYT_PID_BITS+CYT_DEST_BITS+4+CYT_LEN_BITS+CYT_VADDR_BITS-1,CYT_RSRVD_BITS+CYT_N_REGIONS_BITS+CYT_PID_BITS+CYT_DEST_BITS+4+CYT_LEN_BITS) = vaddr; //vaddr - ret(CYT_RSRVD_BITS+CYT_N_REGIONS_BITS+CYT_PID_BITS+CYT_DEST_BITS+4+CYT_LEN_BITS-1,CYT_RSRVD_BITS+CYT_N_REGIONS_BITS+CYT_PID_BITS+CYT_DEST_BITS+4) = len; //len - ret(CYT_RSRVD_BITS+CYT_N_REGIONS_BITS+CYT_PID_BITS+CYT_DEST_BITS+3,CYT_RSRVD_BITS+CYT_N_REGIONS_BITS+CYT_PID_BITS+CYT_DEST_BITS+3) = stream; //stream - ret(CYT_RSRVD_BITS+CYT_N_REGIONS_BITS+CYT_PID_BITS+CYT_DEST_BITS+2,CYT_RSRVD_BITS+CYT_N_REGIONS_BITS+CYT_PID_BITS+CYT_DEST_BITS+2) = sync; //sync - ret(CYT_RSRVD_BITS+CYT_N_REGIONS_BITS+CYT_PID_BITS+CYT_DEST_BITS+1,CYT_RSRVD_BITS+CYT_N_REGIONS_BITS+CYT_PID_BITS+CYT_DEST_BITS+1) = ctl; //ctl - ret(CYT_RSRVD_BITS+CYT_N_REGIONS_BITS+CYT_PID_BITS+CYT_DEST_BITS,CYT_RSRVD_BITS+CYT_N_REGIONS_BITS+CYT_PID_BITS+CYT_DEST_BITS) = host; //host - ret(CYT_RSRVD_BITS+CYT_N_REGIONS_BITS+CYT_PID_BITS+CYT_DEST_BITS-1,CYT_RSRVD_BITS+CYT_N_REGIONS_BITS+CYT_PID_BITS) = dest; //dest - ret(CYT_RSRVD_BITS+CYT_N_REGIONS_BITS+CYT_PID_BITS-1,CYT_RSRVD_BITS+CYT_N_REGIONS_BITS) = pid; //pid - ret(CYT_RSRVD_BITS+CYT_N_REGIONS_BITS-1,CYT_RSRVD_BITS) = vfid; //vfid - ret(CYT_RSRVD_BITS-1,0) = rsrvd; //rsrvd, disregard - return ret; - } -}; - +#define CYT_STRM_BITS 2 +#define CYT_OPCODE_BITS 5 +#define CYT_REQ_RSRVD_BITS (128 - CYT_OFFS_BITS - 2 - CYT_VADDR_BITS - CYT_LEN_BITS - 1 - 2 * CYT_DEST_BITS - CYT_PID_BITS - 3 - CYT_STRM_BITS - CYT_OPCODE_BITS) +#define CYT_ACK_RSRVD_BITS (32 - CYT_OPCODE_BITS - CYT_STRM_BITS - 2 - CYT_DEST_BITS - CYT_PID_BITS - CYT_DEST_BITS) // Coyote RDMA Opcode #define CYT_RDMA_READ 0 @@ -95,105 +40,158 @@ struct cyt_req_t{ #define CYT_RDMA_SEND 2 #define CYT_RDMA_IMMED 3 -// Coyote cyt_rdma_req_t structs -#define CYT_RDMA_OPCODE_BITS 5 -#define CYT_RDMA_MSG_BITS 448 -#define CYT_RDMA_OFFS_BITS 4 -#define CYT_RDMA_QPN_BITS 10 -#define CYT_RDMA_MSN_BITS 24 -#define 
CYT_RDMA_RSRVD_BITS 17 -#define CYT_RDMA_REQ_BITS CYT_RDMA_RSRVD_BITS+CYT_RDMA_MSG_BITS+CYT_RDMA_OFFS_BITS+CYT_RDMA_MSN_BITS+4+CYT_RDMA_QPN_BITS+CYT_RDMA_OPCODE_BITS - -#define CYT_RDMA_VADDR_BITS 64 -#define CYT_RDMA_LEN_BITS 32 -#define CYT_RDMA_PARAMS_BITS 288 - -struct cyt_rdma_req_t{ - ap_uint rsrvd; - ap_uint msg; - ap_uint offs; - ap_uint ssn; - ap_uint<1> cmplt; - ap_uint<1> last; - ap_uint<1> mode; - ap_uint<1> host; - ap_uint qpn; - ap_uint opcode; - - cyt_rdma_req_t() : rsrvd(0), msg(0), offs(0), ssn(0), cmplt(0), last(0), mode(0), host(0), qpn(0), opcode(0) {} - cyt_rdma_req_t(ap_uint in) { - rsrvd = in(CYT_RDMA_RSRVD_BITS - 1, 0); - msg = in(CYT_RDMA_RSRVD_BITS + CYT_RDMA_MSG_BITS - 1, CYT_RDMA_RSRVD_BITS); - offs = in(CYT_RDMA_RSRVD_BITS + CYT_RDMA_MSG_BITS + CYT_RDMA_OFFS_BITS - 1, CYT_RDMA_RSRVD_BITS + CYT_RDMA_MSG_BITS); - ssn = in(CYT_RDMA_RSRVD_BITS + CYT_RDMA_MSG_BITS + CYT_RDMA_OFFS_BITS + CYT_RDMA_MSN_BITS - 1, CYT_RDMA_RSRVD_BITS + CYT_RDMA_MSG_BITS + CYT_RDMA_OFFS_BITS); - cmplt = in(CYT_RDMA_RSRVD_BITS + CYT_RDMA_MSG_BITS + CYT_RDMA_OFFS_BITS + CYT_RDMA_MSN_BITS, CYT_RDMA_RSRVD_BITS + CYT_RDMA_MSG_BITS + CYT_RDMA_OFFS_BITS + CYT_RDMA_MSN_BITS); - last = in(CYT_RDMA_RSRVD_BITS + CYT_RDMA_MSG_BITS + CYT_RDMA_OFFS_BITS + CYT_RDMA_MSN_BITS + 1, CYT_RDMA_RSRVD_BITS + CYT_RDMA_MSG_BITS + CYT_RDMA_OFFS_BITS + CYT_RDMA_MSN_BITS + 1); - mode = in(CYT_RDMA_RSRVD_BITS + CYT_RDMA_MSG_BITS + CYT_RDMA_OFFS_BITS + CYT_RDMA_MSN_BITS + 2, CYT_RDMA_RSRVD_BITS + CYT_RDMA_MSG_BITS + CYT_RDMA_OFFS_BITS + CYT_RDMA_MSN_BITS + 2); - host = in(CYT_RDMA_RSRVD_BITS + CYT_RDMA_MSG_BITS + CYT_RDMA_OFFS_BITS + CYT_RDMA_MSN_BITS + 3, CYT_RDMA_RSRVD_BITS + CYT_RDMA_MSG_BITS + CYT_RDMA_OFFS_BITS + CYT_RDMA_MSN_BITS + 3); - qpn = in(CYT_RDMA_RSRVD_BITS + CYT_RDMA_MSG_BITS + CYT_RDMA_OFFS_BITS + CYT_RDMA_MSN_BITS + 4 + CYT_RDMA_QPN_BITS - 1, CYT_RDMA_RSRVD_BITS + CYT_RDMA_MSG_BITS + CYT_RDMA_OFFS_BITS + CYT_RDMA_MSN_BITS + 4); - opcode = in(CYT_RDMA_RSRVD_BITS + 
CYT_RDMA_MSG_BITS + CYT_RDMA_OFFS_BITS + CYT_RDMA_MSN_BITS + 4 + CYT_RDMA_QPN_BITS + CYT_RDMA_OPCODE_BITS - 1, CYT_RDMA_RSRVD_BITS + CYT_RDMA_MSG_BITS + CYT_RDMA_OFFS_BITS + CYT_RDMA_MSN_BITS + 4 + CYT_RDMA_QPN_BITS); - } - operator ap_uint() { - ap_uint ret; - ret(CYT_RDMA_RSRVD_BITS - 1, 0) = rsrvd; - ret(CYT_RDMA_RSRVD_BITS + CYT_RDMA_MSG_BITS - 1, CYT_RDMA_RSRVD_BITS) = msg; - ret(CYT_RDMA_RSRVD_BITS + CYT_RDMA_MSG_BITS + CYT_RDMA_OFFS_BITS - 1, CYT_RDMA_RSRVD_BITS + CYT_RDMA_MSG_BITS) = offs; - ret(CYT_RDMA_RSRVD_BITS + CYT_RDMA_MSG_BITS + CYT_RDMA_OFFS_BITS + CYT_RDMA_MSN_BITS - 1, CYT_RDMA_RSRVD_BITS + CYT_RDMA_MSG_BITS + CYT_RDMA_OFFS_BITS) = ssn; - ret(CYT_RDMA_RSRVD_BITS + CYT_RDMA_MSG_BITS + CYT_RDMA_OFFS_BITS + CYT_RDMA_MSN_BITS, CYT_RDMA_RSRVD_BITS + CYT_RDMA_MSG_BITS + CYT_RDMA_OFFS_BITS + CYT_RDMA_MSN_BITS) = cmplt; - ret(CYT_RDMA_RSRVD_BITS + CYT_RDMA_MSG_BITS + CYT_RDMA_OFFS_BITS + CYT_RDMA_MSN_BITS + 1, CYT_RDMA_RSRVD_BITS + CYT_RDMA_MSG_BITS + CYT_RDMA_OFFS_BITS + CYT_RDMA_MSN_BITS + 1) = last; - ret(CYT_RDMA_RSRVD_BITS + CYT_RDMA_MSG_BITS + CYT_RDMA_OFFS_BITS + CYT_RDMA_MSN_BITS + 2, CYT_RDMA_RSRVD_BITS + CYT_RDMA_MSG_BITS + CYT_RDMA_OFFS_BITS + CYT_RDMA_MSN_BITS + 2) = mode; - ret(CYT_RDMA_RSRVD_BITS + CYT_RDMA_MSG_BITS + CYT_RDMA_OFFS_BITS + CYT_RDMA_MSN_BITS + 3, CYT_RDMA_RSRVD_BITS + CYT_RDMA_MSG_BITS + CYT_RDMA_OFFS_BITS + CYT_RDMA_MSN_BITS + 3) = host; - ret(CYT_RDMA_RSRVD_BITS + CYT_RDMA_MSG_BITS + CYT_RDMA_OFFS_BITS + CYT_RDMA_MSN_BITS + 4 + CYT_RDMA_QPN_BITS - 1, CYT_RDMA_RSRVD_BITS + CYT_RDMA_MSG_BITS + CYT_RDMA_OFFS_BITS + CYT_RDMA_MSN_BITS + 4) = qpn; - ret(CYT_RDMA_RSRVD_BITS + CYT_RDMA_MSG_BITS + CYT_RDMA_OFFS_BITS + CYT_RDMA_MSN_BITS + 4 + CYT_RDMA_QPN_BITS + CYT_RDMA_OPCODE_BITS - 1, CYT_RDMA_RSRVD_BITS + CYT_RDMA_MSG_BITS + CYT_RDMA_OFFS_BITS + CYT_RDMA_MSN_BITS + 4 + CYT_RDMA_QPN_BITS) = opcode; - return ret; - } -} ; +#define RC_SEND_FIRST 0 +#define RC_SEND_MIDDLE 1 +#define RC_SEND_LAST 2 +#define RC_SEND_ONLY 4 +#define 
RC_RDMA_WRITE_FIRST 6 +#define RC_RDMA_WRITE_MIDDLE 7 +#define RC_RDMA_WRITE_LAST 8 +#define RC_RDMA_WRITE_LAST_WITH_IMD 9 +#define RC_RDMA_WRITE_ONLY 10 +#define RC_RDMA_WRITE_ONLY_WIT_IMD 11 +#define RC_RDMA_READ_REQUEST 12 +#define RC_RDMA_READ_RESP_FIRST 13 +#define RC_RDMA_READ_RESP_MIDDLE 14 +#define RC_RDMA_READ_RESP_LAST 15 +#define RC_RDMA_READ_RESP_ONLY 16 +#define RC_ACK 17 + +// Coyote STRM Opcode +#define CYT_STRM_CARD 0 +#define CYT_STRM_HOST 1 +#define CYT_STRM_RDMA 2 +#define CYT_STRM_TCP 3 +struct cyt_req_t{ + ap_uint rsrvd; // 19 bits + ap_uint offs; // 6 bits + ap_uint<1> host; // 1 bit + ap_uint<1> actv; // 1 bit + + ap_uint len; // 28 bits + ap_uint vaddr; // 48 bits + + ap_uint<1> last; // 1 bit -struct cyt_rdma_req_msg_t{ - ap_uint lvaddr; - ap_uint rvaddr; - ap_uint len; - ap_uint params; + ap_uint dest; // 4 bits + ap_uint pid; // 6 bits + ap_uint vfid; // 4 bits + + ap_uint<1> remote; // 1 bit + ap_uint<1> rdma; // 1 bit + ap_uint<1> mode; // 1 bit + ap_uint strm; // 2 bits + ap_uint opcode; // 5 bits - cyt_rdma_req_msg_t() : lvaddr(0), rvaddr(0), len(0), params(0) {} - cyt_rdma_req_msg_t(ap_uint in) { - lvaddr = in(CYT_RDMA_VADDR_BITS - 1, 0); - rvaddr = in(2 * CYT_RDMA_VADDR_BITS - 1, CYT_RDMA_VADDR_BITS); - len = in(2 * CYT_RDMA_VADDR_BITS + CYT_RDMA_LEN_BITS - 1, 2 * CYT_RDMA_VADDR_BITS); - params = in(CYT_RDMA_MSG_BITS - 1, 2 * CYT_RDMA_VADDR_BITS + CYT_RDMA_LEN_BITS); + // Default constructor + cyt_req_t() + : rsrvd(0), offs(0), host(0), actv(0), len(0), vaddr(0), last(0), + dest(0), pid(0), vfid(0), remote(0), rdma(0), mode(0), strm(0), opcode(0) {} + + // Parameterized constructor + cyt_req_t(ap_uint rsrvd_arg, ap_uint offs_arg, ap_uint<1> host_arg, ap_uint<1> actv_arg, + ap_uint len_arg, ap_uint vaddr_arg, ap_uint<1> last_arg, + ap_uint dest_arg, ap_uint pid_arg, ap_uint vfid_arg, + ap_uint<1> remote_arg, ap_uint<1> rdma_arg, ap_uint<1> mode_arg, ap_uint strm_arg, ap_uint opcode_arg) + : rsrvd(rsrvd_arg), offs(offs_arg), 
host(host_arg), actv(actv_arg), len(len_arg), vaddr(vaddr_arg), + last(last_arg), dest(dest_arg), pid(pid_arg), vfid(vfid_arg), remote(remote_arg), rdma(rdma_arg), + mode(mode_arg), strm(strm_arg), opcode(opcode_arg) {} + + // Constructor from a single ap_uint<128> argument + cyt_req_t(ap_uint<128> in) { + rsrvd = in(CYT_REQ_RSRVD_BITS - 1, 0); + offs = in(CYT_REQ_RSRVD_BITS + CYT_OFFS_BITS - 1, CYT_REQ_RSRVD_BITS); + host = in(CYT_REQ_RSRVD_BITS + CYT_OFFS_BITS, CYT_REQ_RSRVD_BITS + CYT_OFFS_BITS); + actv = in(CYT_REQ_RSRVD_BITS + CYT_OFFS_BITS + 1, CYT_REQ_RSRVD_BITS + CYT_OFFS_BITS + 1); + len = in(CYT_REQ_RSRVD_BITS + CYT_OFFS_BITS + 1 + CYT_LEN_BITS - 1, CYT_REQ_RSRVD_BITS + CYT_OFFS_BITS + 2); + vaddr = in(CYT_REQ_RSRVD_BITS + CYT_OFFS_BITS + 2 + CYT_LEN_BITS + CYT_VADDR_BITS - 1, CYT_REQ_RSRVD_BITS + CYT_OFFS_BITS + 2 + CYT_LEN_BITS); + last = in(CYT_REQ_RSRVD_BITS + CYT_OFFS_BITS + 2 + CYT_LEN_BITS + CYT_VADDR_BITS, CYT_REQ_RSRVD_BITS + CYT_OFFS_BITS + 2 + CYT_LEN_BITS + CYT_VADDR_BITS); + dest = in(CYT_REQ_RSRVD_BITS + CYT_OFFS_BITS + 2 + CYT_LEN_BITS + CYT_VADDR_BITS + CYT_DEST_BITS - 1, CYT_REQ_RSRVD_BITS + CYT_OFFS_BITS + 2 + CYT_LEN_BITS + CYT_VADDR_BITS + 1); + pid = in(CYT_REQ_RSRVD_BITS + CYT_OFFS_BITS + 2 + CYT_LEN_BITS + CYT_VADDR_BITS + CYT_DEST_BITS + CYT_PID_BITS - 1, CYT_REQ_RSRVD_BITS + CYT_OFFS_BITS + 2 + CYT_LEN_BITS + CYT_VADDR_BITS + CYT_DEST_BITS); + vfid = in(CYT_REQ_RSRVD_BITS + CYT_OFFS_BITS + 2 + CYT_LEN_BITS + CYT_VADDR_BITS + CYT_DEST_BITS + CYT_PID_BITS + CYT_DEST_BITS - 1, CYT_REQ_RSRVD_BITS + CYT_OFFS_BITS + 2 + CYT_LEN_BITS + CYT_VADDR_BITS + CYT_DEST_BITS + CYT_PID_BITS); + remote = in(CYT_REQ_RSRVD_BITS + CYT_OFFS_BITS + 2 + CYT_LEN_BITS + CYT_VADDR_BITS + CYT_DEST_BITS + CYT_PID_BITS + CYT_DEST_BITS, CYT_REQ_RSRVD_BITS + CYT_OFFS_BITS + 2 + CYT_LEN_BITS + CYT_VADDR_BITS + CYT_DEST_BITS + CYT_PID_BITS + CYT_DEST_BITS); + rdma = in(CYT_REQ_RSRVD_BITS + CYT_OFFS_BITS + 2 + CYT_LEN_BITS + CYT_VADDR_BITS + CYT_DEST_BITS + 
CYT_PID_BITS + CYT_DEST_BITS + 1, CYT_REQ_RSRVD_BITS + CYT_OFFS_BITS + 2 + CYT_LEN_BITS + CYT_VADDR_BITS + CYT_DEST_BITS + CYT_PID_BITS + CYT_DEST_BITS + 1); + mode = in(CYT_REQ_RSRVD_BITS + CYT_OFFS_BITS + 2 + CYT_LEN_BITS + CYT_VADDR_BITS + CYT_DEST_BITS + CYT_PID_BITS + CYT_DEST_BITS + 2, CYT_REQ_RSRVD_BITS + CYT_OFFS_BITS + 2 + CYT_LEN_BITS + CYT_VADDR_BITS + CYT_DEST_BITS + CYT_PID_BITS + CYT_DEST_BITS + 2); + strm = in(CYT_REQ_RSRVD_BITS + CYT_OFFS_BITS + 2 + CYT_LEN_BITS + CYT_VADDR_BITS + CYT_DEST_BITS + CYT_PID_BITS + CYT_DEST_BITS + 3 + CYT_STRM_BITS - 1, CYT_REQ_RSRVD_BITS + CYT_OFFS_BITS + 2 + CYT_LEN_BITS + CYT_VADDR_BITS + CYT_DEST_BITS + CYT_PID_BITS + CYT_DEST_BITS + 3); + opcode = in(CYT_REQ_RSRVD_BITS + CYT_OFFS_BITS + 2 + CYT_LEN_BITS + CYT_VADDR_BITS + CYT_DEST_BITS + CYT_PID_BITS + CYT_DEST_BITS + 3 + CYT_STRM_BITS + CYT_OPCODE_BITS - 1, CYT_REQ_RSRVD_BITS + CYT_OFFS_BITS + 2 + CYT_LEN_BITS + CYT_VADDR_BITS + CYT_DEST_BITS + CYT_PID_BITS + CYT_DEST_BITS + 3 + CYT_STRM_BITS); } - operator ap_uint() { - ap_uint ret; - ret(CYT_RDMA_VADDR_BITS - 1, 0) = lvaddr; - ret(2 * CYT_RDMA_VADDR_BITS - 1, CYT_RDMA_VADDR_BITS) = rvaddr; - ret(2 * CYT_RDMA_VADDR_BITS + CYT_RDMA_LEN_BITS - 1, 2 * CYT_RDMA_VADDR_BITS) = len; - ret(CYT_RDMA_MSG_BITS - 1, 2 * CYT_RDMA_VADDR_BITS + CYT_RDMA_LEN_BITS) = params; + + operator ap_uint<128>() { + ap_uint<128> ret; + + // Assigning fields to the appropriate bit positions in the 128-bit return value. 
+ ret(CYT_REQ_RSRVD_BITS + CYT_OFFS_BITS + 2 + CYT_LEN_BITS + CYT_VADDR_BITS + CYT_DEST_BITS + CYT_PID_BITS + CYT_DEST_BITS + 3 + CYT_STRM_BITS + CYT_OPCODE_BITS - 1, CYT_REQ_RSRVD_BITS + CYT_OFFS_BITS + 2 + CYT_LEN_BITS + CYT_VADDR_BITS + CYT_DEST_BITS + CYT_PID_BITS + CYT_DEST_BITS + 3 + CYT_STRM_BITS) = opcode; // opcode + ret(CYT_REQ_RSRVD_BITS + CYT_OFFS_BITS + 2 + CYT_LEN_BITS + CYT_VADDR_BITS + CYT_DEST_BITS + CYT_PID_BITS + CYT_DEST_BITS + 3 + CYT_STRM_BITS - 1, CYT_REQ_RSRVD_BITS + CYT_OFFS_BITS + 2 + CYT_LEN_BITS + CYT_VADDR_BITS + CYT_DEST_BITS + CYT_PID_BITS + CYT_DEST_BITS + 3) = strm; // strm + ret(CYT_REQ_RSRVD_BITS + CYT_OFFS_BITS + 2 + CYT_LEN_BITS + CYT_VADDR_BITS + CYT_DEST_BITS + CYT_PID_BITS + CYT_DEST_BITS + 2, CYT_REQ_RSRVD_BITS + CYT_OFFS_BITS + 2 + CYT_LEN_BITS + CYT_VADDR_BITS + CYT_DEST_BITS + CYT_PID_BITS + CYT_DEST_BITS + 2) = mode; // mode + ret(CYT_REQ_RSRVD_BITS + CYT_OFFS_BITS + 2 + CYT_LEN_BITS + CYT_VADDR_BITS + CYT_DEST_BITS + CYT_PID_BITS + CYT_DEST_BITS + 1, CYT_REQ_RSRVD_BITS + CYT_OFFS_BITS + 2 + CYT_LEN_BITS + CYT_VADDR_BITS + CYT_DEST_BITS + CYT_PID_BITS + CYT_DEST_BITS + 1) = rdma; // rdma + ret(CYT_REQ_RSRVD_BITS + CYT_OFFS_BITS + 2 + CYT_LEN_BITS + CYT_VADDR_BITS + CYT_DEST_BITS + CYT_PID_BITS + CYT_DEST_BITS, CYT_REQ_RSRVD_BITS + CYT_OFFS_BITS + 2 + CYT_LEN_BITS + CYT_VADDR_BITS + CYT_DEST_BITS + CYT_PID_BITS + CYT_DEST_BITS) = remote; // remote + ret(CYT_REQ_RSRVD_BITS + CYT_OFFS_BITS + 2 + CYT_LEN_BITS + CYT_VADDR_BITS + CYT_DEST_BITS + CYT_PID_BITS + CYT_DEST_BITS - 1, CYT_REQ_RSRVD_BITS + CYT_OFFS_BITS + 2 + CYT_LEN_BITS + CYT_VADDR_BITS + CYT_DEST_BITS + CYT_PID_BITS) = vfid; // vfid + ret(CYT_REQ_RSRVD_BITS + CYT_OFFS_BITS + 2 + CYT_LEN_BITS + CYT_VADDR_BITS + CYT_DEST_BITS + CYT_PID_BITS - 1, CYT_REQ_RSRVD_BITS + CYT_OFFS_BITS + 2 + CYT_LEN_BITS + CYT_VADDR_BITS + CYT_DEST_BITS) = pid; // pid + ret(CYT_REQ_RSRVD_BITS + CYT_OFFS_BITS + 2 + CYT_LEN_BITS + CYT_VADDR_BITS + CYT_DEST_BITS - 1, CYT_REQ_RSRVD_BITS + 
CYT_OFFS_BITS + 2 + CYT_LEN_BITS + CYT_VADDR_BITS + 1) = dest; // dest + ret(CYT_REQ_RSRVD_BITS + CYT_OFFS_BITS + 2 + CYT_LEN_BITS + CYT_VADDR_BITS, CYT_REQ_RSRVD_BITS + CYT_OFFS_BITS + 2 + CYT_LEN_BITS + CYT_VADDR_BITS) = last; // last + ret(CYT_REQ_RSRVD_BITS + CYT_OFFS_BITS + 2 + CYT_LEN_BITS + CYT_VADDR_BITS - 1, CYT_REQ_RSRVD_BITS + CYT_OFFS_BITS + 2 + CYT_LEN_BITS) = vaddr; // vaddr + ret(CYT_REQ_RSRVD_BITS + CYT_OFFS_BITS + 2 + CYT_LEN_BITS - 1, CYT_REQ_RSRVD_BITS + CYT_OFFS_BITS + 2) = len; // len + ret(CYT_REQ_RSRVD_BITS + CYT_OFFS_BITS + 1, CYT_REQ_RSRVD_BITS + CYT_OFFS_BITS + 1) = actv; // actv + ret(CYT_REQ_RSRVD_BITS + CYT_OFFS_BITS, CYT_REQ_RSRVD_BITS + CYT_OFFS_BITS) = host; // host + ret(CYT_REQ_RSRVD_BITS + CYT_OFFS_BITS - 1, CYT_REQ_RSRVD_BITS) = offs; // offs + ret(CYT_REQ_RSRVD_BITS - 1, 0) = rsrvd; // rsrvd, disregard + return ret; } }; +struct cyt_ack_t { + ap_uint rsrvd; // 9 bits + ap_uint vfid; // 4 bits + ap_uint pid; // 6 bits + ap_uint dest; // 4 bits + ap_uint<1> host; // 1 bit + ap_uint<1> remote; // 1 bit + ap_uint strm; // 2 bits + ap_uint opcode; // 5 bits + + // Default constructor + cyt_ack_t() + : rsrvd(0), vfid(0), pid(0), dest(0), host(0), remote(0), strm(0), opcode(0) {} + + // Parameterized constructor + cyt_ack_t(ap_uint rsrvd_arg, + ap_uint vfid_arg, + ap_uint pid_arg, + ap_uint dest_arg, + ap_uint<1> host_arg, + ap_uint<1> remote_arg, + ap_uint strm_arg, + ap_uint opcode_arg) + : rsrvd(rsrvd_arg), vfid(vfid_arg), pid(pid_arg), dest(dest_arg), + host(host_arg), remote(remote_arg), strm(strm_arg), opcode(opcode_arg) {} + + // Constructor from a single ap_uint<32> argument + cyt_ack_t(ap_uint<32> in) { + opcode = in(31, 31 - CYT_OPCODE_BITS + 1); + strm = in(31 - CYT_OPCODE_BITS, 31 - CYT_OPCODE_BITS - CYT_STRM_BITS + 1); + remote = in(31 - CYT_OPCODE_BITS - CYT_STRM_BITS - 1, 31 - CYT_OPCODE_BITS - CYT_STRM_BITS - 1); + host = in(31 - CYT_OPCODE_BITS - CYT_STRM_BITS - 2, 31 - CYT_OPCODE_BITS - CYT_STRM_BITS - 2); + dest = 
in(31 - CYT_OPCODE_BITS - CYT_STRM_BITS - 3, 31 - CYT_OPCODE_BITS - CYT_STRM_BITS - CYT_DEST_BITS - 2); + pid = in(31 - CYT_OPCODE_BITS - CYT_STRM_BITS - CYT_DEST_BITS - 3, 31 - CYT_OPCODE_BITS - CYT_STRM_BITS - CYT_DEST_BITS - CYT_PID_BITS - 2); + vfid = in(31 - CYT_OPCODE_BITS - CYT_STRM_BITS - CYT_DEST_BITS - CYT_PID_BITS - 3, 31 - CYT_OPCODE_BITS - CYT_STRM_BITS - 2 * CYT_DEST_BITS - CYT_PID_BITS - 2); + rsrvd = in(CYT_ACK_RSRVD_BITS - 1, 0); // Remaining bits for reserved + } + + // Conversion operator to ap_uint<32> + operator ap_uint<32>() { + ap_uint<32> ret; + ret(31, 31 - CYT_OPCODE_BITS + 1) = opcode; + ret(31 - CYT_OPCODE_BITS, 31 - CYT_OPCODE_BITS - CYT_STRM_BITS + 1) = strm; + ret(31 - CYT_OPCODE_BITS - CYT_STRM_BITS - 1, 31 - CYT_OPCODE_BITS - CYT_STRM_BITS - 1) = remote; + ret(31 - CYT_OPCODE_BITS - CYT_STRM_BITS - 2, 31 - CYT_OPCODE_BITS - CYT_STRM_BITS - 2) = host; + ret(31 - CYT_OPCODE_BITS - CYT_STRM_BITS - 3, 31 - CYT_OPCODE_BITS - CYT_STRM_BITS - CYT_DEST_BITS - 2) = dest; + ret(31 - CYT_OPCODE_BITS - CYT_STRM_BITS - CYT_DEST_BITS - 3, 31 - CYT_OPCODE_BITS - CYT_STRM_BITS - CYT_DEST_BITS - CYT_PID_BITS - 2) = pid; + ret(31 - CYT_OPCODE_BITS - CYT_STRM_BITS - CYT_DEST_BITS - CYT_PID_BITS - 3, 31 - CYT_OPCODE_BITS - CYT_STRM_BITS - 2 * CYT_DEST_BITS - CYT_PID_BITS - 2) = vfid; + ret(CYT_ACK_RSRVD_BITS - 1, 0) = rsrvd; -void cyt_dma_adapter( - //DM command streams - hls::stream> &dma0_s2mm_cmd, - hls::stream> &dma1_s2mm_cmd, - hls::stream> &dma0_mm2s_cmd, - hls::stream> &dma1_mm2s_cmd, - //DM status streams - hls::stream> &dma0_s2mm_sts, - hls::stream> &dma1_s2mm_sts, - hls::stream> &dma0_mm2s_sts, - hls::stream> &dma1_mm2s_sts, -#ifdef ACCL_RDMA - //RDMA rd_req and wr_req - hls::stream> & rdma_wr_req, - hls::stream> & rdma_rd_req, -#endif - //Coyote Bypass interface command and status - hls::stream> &cyt_byp_wr_cmd, - hls::stream> &cyt_byp_wr_sts, - hls::stream> &cyt_byp_rd_cmd, - hls::stream> &cyt_byp_rd_sts -); \ No newline at end of file + 
return ret; + } +}; diff --git a/kernels/plugins/cyt_adapter/cyt_cq_dm_sts_converter.cpp b/kernels/plugins/cyt_adapter/cyt_cq_dm_sts_converter.cpp new file mode 100644 index 00000000..758f119d --- /dev/null +++ b/kernels/plugins/cyt_adapter/cyt_cq_dm_sts_converter.cpp @@ -0,0 +1,112 @@ +/******************************************************************************* +# Copyright (C) 2023 Advanced Micro Devices, Inc +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# *******************************************************************************/ + +#include "cyt.h" + +using namespace std; + + +void cyt_cq_dm_sts_converter(hls::stream & cq_sts, + hls::stream> & dm0_sts, + hls::stream> & dm1_sts, + hls::stream>& dm0_meta, + hls::stream>& dm1_meta) +{ +#pragma HLS INTERFACE axis register port=cq_sts +#pragma HLS INTERFACE axis register port=dm0_sts +#pragma HLS INTERFACE axis register port=dm1_sts +#pragma HLS INTERFACE axis register port=dm0_meta +#pragma HLS INTERFACE axis register port=dm1_meta +#pragma HLS aggregate variable=cq_sts compact=bit + +#pragma HLS INTERFACE ap_ctrl_none port=return +#pragma HLS PIPELINE II=1 + + enum fsmStateType {CYT_STS, DM_STS_0, DM_STS_1}; + static fsmStateType fsmState = CYT_STS; + + static cyt_ack_t cq_sts_word; + + static ap_axiu<32,0,0,0> dm_sts_word; + static ap_uint<1+4+23> dm_meta_word; + + switch (fsmState) + { + // no workaround anymore + case CYT_STS: + if (!STREAM_IS_EMPTY(cq_sts)) + { + cq_sts_word = STREAM_READ(cq_sts); + 
+ // only process status if it is local memory completion status + // only send back ack when the cq_sts stems from kernel issued bypass commands with host == 0 + // if dest == 2, this comes from wr_req/rd_req, no need to forward to data mover + if((cq_sts_word.opcode == CYT_STRM_CARD || cq_sts_word.opcode == CYT_STRM_HOST) && cq_sts_word.host == 0 && (cq_sts_word.dest == 0 || cq_sts_word.dest == 1)) + { + if (cq_sts_word.dest == 0) { + fsmState = DM_STS_0; + } else if (cq_sts_word.dest == 1) { + fsmState = DM_STS_1; + } + } + else{ + fsmState = CYT_STS; + } + } + break; + case DM_STS_0: + if(!STREAM_IS_EMPTY(dm0_meta)){ + + dm_meta_word = STREAM_READ(dm0_meta); + + dm_sts_word.data.range(3,0) = dm_meta_word(26,23); //tag + dm_sts_word.data.range(4,4) = 0; // internal error + dm_sts_word.data.range(5,5) = 0; // decode error + dm_sts_word.data.range(6,6) = 0; // slave error + dm_sts_word.data.range(7,7) = 1; // OK + dm_sts_word.data.range(30,8) = dm_meta_word(22,0); // bytes received + dm_sts_word.data.range(31,31) = dm_meta_word(27,27); // EOP + dm_sts_word.last = 1; + + STREAM_WRITE(dm0_sts, dm_sts_word); + + fsmState = CYT_STS; // todo: add the check of eop flag + } + break; + case DM_STS_1: + if(!STREAM_IS_EMPTY(dm1_meta)){ + + dm_meta_word = STREAM_READ(dm1_meta); + + dm_sts_word.data.range(3,0) = dm_meta_word(26,23); //tag + dm_sts_word.data.range(4,4) = 0; // internal error + dm_sts_word.data.range(5,5) = 0; // decode error + dm_sts_word.data.range(6,6) = 0; // slave error + dm_sts_word.data.range(7,7) = 1; // OK + dm_sts_word.data.range(30,8) = dm_meta_word(22,0); // bytes received + dm_sts_word.data.range(31,31) = dm_meta_word(27,27); // EOP + dm_sts_word.last = 1; + + STREAM_WRITE(dm1_sts, dm_sts_word); + + fsmState = CYT_STS; // todo: add the check of eop flag + } + break; + + } +} + diff --git a/kernels/plugins/cyt_adapter/cyt_dma_adapter.cpp b/kernels/plugins/cyt_adapter/cyt_dma_adapter.cpp deleted file mode 100644 index 300e991a..00000000 --- 
a/kernels/plugins/cyt_adapter/cyt_dma_adapter.cpp +++ /dev/null @@ -1,238 +0,0 @@ -/******************************************************************************* -# Copyright (C) 2023 Advanced Micro Devices, Inc -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# *******************************************************************************/ - -#include "cyt.h" - -using namespace std; - -template -void dm_byp_cmd_converter(hls::stream>& dm_cmd, - hls::stream& byp_cmd, - hls::stream>& dm_meta - ) -{ -#pragma HLS inline off -#pragma HLS pipeline II=1 - - if (!STREAM_IS_EMPTY(dm_cmd)) - { - ap_axiu<104,0,0,DEST_WIDTH> dm_cmd_with_dest = STREAM_READ(dm_cmd); - ap_uint<104> dm_cmd_word = dm_cmd_with_dest.data; - - ap_uint<23> btt = dm_cmd_word(22,0); - ap_uint<64> saddr = dm_cmd_word(95,32); - ap_uint<4> tag = dm_cmd_word(99,96); - ap_uint<1> strm = dm_cmd_with_dest.dest(2,0); // 1 if targeting host memory, 0 if targeting card memory - ap_uint<1> ctl = dm_cmd_word(30,30); // ctl field determines if a TLAST must be asserted at the end of the data stream - - cyt_req_t req(0, 0, 0, DMA_CHANNEL, 0, ctl, 0, strm, btt, saddr); - STREAM_WRITE(byp_cmd, req); - - ap_uint<1+4+23> dm_meta_word; - dm_meta_word(22,0) = btt; - dm_meta_word(26,23) = tag; - dm_meta_word(27,27) = ctl; - STREAM_WRITE(dm_meta, dm_meta_word); - } -} - -template -void rdma_req_byp_cmd_converter( - hls::stream& rdma_req, - hls::stream& byp_cmd -) -{ -#pragma HLS inline off -#pragma HLS pipeline II=1 - - 
if(!STREAM_IS_EMPTY(rdma_req)){ - cyt_req_t req = STREAM_READ(rdma_req); - // TODO: - // Better mechanism of buffer & proc mapping - // Currently has to set the pid to 0, corresponding to coyote_proc instead of any coyote_qproc - // Every coyote_qproc has a unique physical address in device - cyt_req_t cmd(req.rsrvd, req.vfid, 0 /*req.pid*/, DMA_CHANNEL, 0, 1, 0, req.stream, req.len, req.vaddr); - STREAM_WRITE(byp_cmd, cmd); - } - -} - -void multiplexor(hls::stream& in0, - hls::stream& in1, - hls::stream& in2, - hls::stream& out) -{ -#pragma HLS inline off -#pragma HLS pipeline II=1 - - cyt_req_t currWord; - - if (!STREAM_IS_EMPTY(in0)) - { - currWord = STREAM_READ(in0); - STREAM_WRITE(out, currWord); - } - else if(!STREAM_IS_EMPTY(in1)) - { - currWord = STREAM_READ(in1); - STREAM_WRITE(out, currWord); - } - else if(!STREAM_IS_EMPTY(in2)) - { - currWord = STREAM_READ(in2); - STREAM_WRITE(out, currWord); - } - -} - - -void byp_dm_sts_converter(hls::stream> & byp_sts, - hls::stream> & dm0_sts, - hls::stream> & dm1_sts, - hls::stream>& dm0_meta, - hls::stream>& dm1_meta) -{ -#pragma HLS inline off -#pragma HLS pipeline II=1 - - if (!STREAM_IS_EMPTY(byp_sts)) - { - ap_uint<16> byp_sts_word = STREAM_READ(byp_sts); - // PID in LSB according to Coyote dma_rsp_t: - ap_uint pid = byp_sts_word(CYT_PID_BITS-1,0); - ap_uint dest = byp_sts_word(CYT_DEST_BITS+CYT_PID_BITS-1,CYT_PID_BITS); - ap_uint<1> strm = byp_sts_word(CYT_DEST_BITS+CYT_PID_BITS,CYT_DEST_BITS+CYT_PID_BITS); - ap_uint<1> host = byp_sts_word(CYT_DEST_BITS+CYT_PID_BITS+1,CYT_DEST_BITS+CYT_PID_BITS+1); - - ap_axiu<32,0,0,0> dm_sts_word; - ap_uint<1+4+23> dm_meta_word; - - // only send back ack when the byp_sts stems from kernel issued bypass commands - // if dest == 2, this comes from wr_req/rd_req, no need to forward to data mover - if(host == 0) - { - do{ - if(dest == 0){ - dm_meta_word = STREAM_READ(dm0_meta); - } else if (dest == 1){ - dm_meta_word = STREAM_READ(dm1_meta); - } - dm_sts_word.data.range(3,0) = 
dm_meta_word(26,23); //tag - dm_sts_word.data.range(4,4) = 0; // internal error - dm_sts_word.data.range(5,5) = 0; // decode error - dm_sts_word.data.range(6,6) = 0; // slave error - dm_sts_word.data.range(7,7) = 1; // OK - dm_sts_word.data.range(30,8) = dm_meta_word(22,0); // bytes received - dm_sts_word.data.range(31,31) = dm_meta_word(27,27); // EOP - dm_sts_word.last = 1; - if(dest == 0){ - STREAM_WRITE(dm0_sts, dm_sts_word); - } else if (dest == 1){ - STREAM_WRITE(dm1_sts, dm_sts_word); - } - } while(dm_meta_word(27,27) == 0); - } - } - -} - -// The cyt bypass commands have 3 sources if RDMA is enabled -// 2 DMA channels from the CCLO and the rdma req interface -void cyt_dma_adapter( - //DM command streams - hls::stream> &dma0_s2mm_cmd, - hls::stream> &dma1_s2mm_cmd, - hls::stream> &dma0_mm2s_cmd, - hls::stream> &dma1_mm2s_cmd, - //DM status streams - hls::stream> &dma0_s2mm_sts, - hls::stream> &dma1_s2mm_sts, - hls::stream> &dma0_mm2s_sts, - hls::stream> &dma1_mm2s_sts, - - //RDMA rd_req and wr_req - hls::stream & rdma_wr_req, - hls::stream & rdma_rd_req, - - //Coyote Bypass interface command and status - hls::stream &cyt_byp_wr_cmd, - hls::stream> &cyt_byp_wr_sts, - hls::stream &cyt_byp_rd_cmd, - hls::stream> &cyt_byp_rd_sts -) { -#pragma HLS INTERFACE axis port=dma0_s2mm_cmd -#pragma HLS INTERFACE axis port=dma1_s2mm_cmd -#pragma HLS INTERFACE axis port=dma0_mm2s_cmd -#pragma HLS INTERFACE axis port=dma1_mm2s_cmd -#pragma HLS INTERFACE axis port=dma0_s2mm_sts -#pragma HLS INTERFACE axis port=dma1_s2mm_sts -#pragma HLS INTERFACE axis port=dma0_mm2s_sts -#pragma HLS INTERFACE axis port=dma1_mm2s_sts -#pragma HLS INTERFACE axis port=cyt_byp_rd_cmd -#pragma HLS INTERFACE axis port=cyt_byp_rd_sts -#pragma HLS INTERFACE axis port=cyt_byp_wr_cmd -#pragma HLS INTERFACE axis port=cyt_byp_wr_sts -#pragma HLS INTERFACE ap_ctrl_none port=return -#pragma HLS DATAFLOW disable_start_propagation - -#pragma HLS aggregate variable=cyt_byp_wr_cmd compact=bit -#pragma HLS 
aggregate variable=cyt_byp_rd_cmd compact=bit - -#pragma HLS INTERFACE axis port=rdma_wr_req -#pragma HLS INTERFACE axis port=rdma_rd_req -#pragma HLS aggregate variable=rdma_wr_req compact=bit -#pragma HLS aggregate variable=rdma_rd_req compact=bit - - static hls::stream byp_wr_cmd_0; - #pragma HLS stream variable=byp_wr_cmd_0 depth=16 - static hls::stream byp_wr_cmd_1; - #pragma HLS stream variable=byp_wr_cmd_1 depth=16 - static hls::stream byp_rd_cmd_0; - #pragma HLS stream variable=byp_rd_cmd_0 depth=16 - static hls::stream byp_rd_cmd_1; - #pragma HLS stream variable=byp_rd_cmd_1 depth=16 - - static hls::stream> dma0_mm2s_meta; - #pragma HLS stream variable=dma0_mm2s_meta depth=16 - static hls::stream> dma1_mm2s_meta; - #pragma HLS stream variable=dma1_mm2s_meta depth=16 - static hls::stream> dma0_s2mm_meta; - #pragma HLS stream variable=dma0_s2mm_meta depth=16 - static hls::stream> dma1_s2mm_meta; - #pragma HLS stream variable=dma1_s2mm_meta depth=16 - - static hls::stream byp_wr_cmd_2; - #pragma HLS stream variable=byp_wr_cmd_2 depth=16 - static hls::stream byp_rd_cmd_2; - #pragma HLS stream variable=byp_rd_cmd_2 depth=16 - - dm_byp_cmd_converter<0>(dma0_s2mm_cmd, byp_wr_cmd_0, dma0_s2mm_meta); - dm_byp_cmd_converter<1>(dma1_s2mm_cmd, byp_wr_cmd_1, dma1_s2mm_meta); - rdma_req_byp_cmd_converter<2>(rdma_wr_req, byp_wr_cmd_2); - multiplexor(byp_wr_cmd_0,byp_wr_cmd_1,byp_wr_cmd_2,cyt_byp_wr_cmd); - - - dm_byp_cmd_converter<0>(dma0_mm2s_cmd,byp_rd_cmd_0, dma0_mm2s_meta); - dm_byp_cmd_converter<1>(dma1_mm2s_cmd,byp_rd_cmd_1, dma1_mm2s_meta); - rdma_req_byp_cmd_converter<2>(rdma_rd_req, byp_rd_cmd_2); - multiplexor(byp_rd_cmd_0,byp_rd_cmd_1,byp_rd_cmd_2,cyt_byp_rd_cmd); - - byp_dm_sts_converter(cyt_byp_wr_sts, dma0_s2mm_sts, dma1_s2mm_sts, dma0_s2mm_meta, dma1_s2mm_meta); - byp_dm_sts_converter(cyt_byp_rd_sts, dma0_mm2s_sts, dma1_mm2s_sts, dma0_mm2s_meta, dma1_mm2s_meta); - - -} diff --git a/kernels/plugins/cyt_adapter/cyt_dma_sq_adapter.cpp 
b/kernels/plugins/cyt_adapter/cyt_dma_sq_adapter.cpp new file mode 100644 index 00000000..abb76505 --- /dev/null +++ b/kernels/plugins/cyt_adapter/cyt_dma_sq_adapter.cpp @@ -0,0 +1,210 @@ +/******************************************************************************* +# Copyright (C) 2023 Advanced Micro Devices, Inc +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# *******************************************************************************/ + +#include "cyt.h" + +using namespace std; + +// convert the data mover command (dma) to the cyt_req_t (dma) +// currently all the memory accesses initialized by the CCLO is associated with pid 0 (coyote_proc) +// also we assume a vfid 0 for single cyt region +// the dest field of the dm cmd indicates the host/card accesses +// the dest field is converted to strm flag in the cyt_sq_cmd +// DMA Channel is used to select axis streams, channel 0 and 1 are reserved + +template +void dm_sq_cmd_converter(hls::stream>& dm_cmd, + hls::stream& cyt_sq_cmd, + hls::stream>& dm_meta + ) +{ +#pragma HLS inline off +#pragma HLS pipeline II=1 + + if (!STREAM_IS_EMPTY(dm_cmd)) + { + ap_axiu<104,0,0,DEST_WIDTH> dm_cmd_with_dest = STREAM_READ(dm_cmd); + ap_uint<104> dm_cmd_word = dm_cmd_with_dest.data; + + ap_uint<23> btt = dm_cmd_word(22,0); + ap_uint<64> saddr = dm_cmd_word(95,32); + ap_uint<4> tag = dm_cmd_word(99,96); + ap_uint<1> strm = dm_cmd_with_dest.dest(2,0); // 1 if targeting host memory, 0 if targeting card memory + ap_uint<1> 
ctl = dm_cmd_word(30,30); // ctl field determines if a TLAST must be asserted at the end of the data stream + + cyt_req_t req(0/*rsrvd_arg*/, 0 /*offs_arg*/, 0 /*host_arg*/, 0 /*actv_arg*/, + btt/*len_arg*/, saddr /*vaddr_arg*/, ctl /*last_arg*/, + DMA_CHANNEL /*dest_arg*/, 0 /*pid_arg*/, 0 /*vfid_arg*/, + 0 /*remote_arg*/, 0 /*rdma_arg*/, 0 /*mode_arg*/, strm /*strm_arg*/, 0 /*opcode_arg*/); + + STREAM_WRITE(cyt_sq_cmd, req); + + ap_uint<1+4+23> dm_meta_word; + dm_meta_word(22,0) = btt; + dm_meta_word(26,23) = tag; + dm_meta_word(27,27) = ctl; + STREAM_WRITE(dm_meta, dm_meta_word); + } +} + +// convert the cyt_rq (rdma) to cyt_sq (dma) +// Channel 2 of the host/card axis stream is reserved for cyt_rq command +// the rq dest field is used to indicate whether this is host/device access, it should be converted to strm field here +// the sq opcode is not relevant as it is targeting dma +template +void cyt_rq_sq_cmd_converter( + hls::stream& cyt_rq_cmd, + hls::stream& cyt_sq_cmd +) +{ +#pragma HLS inline off +#pragma HLS pipeline II=1 + + if(!STREAM_IS_EMPTY(cyt_rq_cmd)){ + cyt_req_t req = STREAM_READ(cyt_rq_cmd); + + // Currently has to set the pid to 0, corresponding to coyote_proc instead of any coyote_qproc + // Because all the buffer allocation within the ACCL driver is associated with the coyote_proc + // And every coyote_qproc has a unique physical address in device which is different than the coyote_proc + // Also mark the host flag in the new output command to 0 to indicate the command is issued from the kernel instead of host + // However, the cq of this command is not processed in the cq_dm_sts_converter as the dest channel is 2 + cyt_req_t cmd(req.rsrvd/*rsrvd_arg*/, req.offs /*offs_arg*/, 0/*host_arg*/, req.actv /*actv_arg*/, + req.len/*len_arg*/, req.vaddr /*vaddr_arg*/, req.last /*last_arg*/, + DMA_CHANNEL /*dest_arg*/, 0 /*pid_arg*/, req.vfid /*vfid_arg*/, + req.remote /*remote_arg*/, req.rdma /*rdma_arg*/, req.mode /*mode_arg*/, req.dest /*strm_arg*/, 
req.opcode /*opcode_arg*/); + + STREAM_WRITE(cyt_sq_cmd, cmd); + } + +} + +void multiplexor(hls::stream& in0, + hls::stream& in1, + hls::stream& in2, + hls::stream& in3, + hls::stream& out) +{ +#pragma HLS inline off +#pragma HLS pipeline II=1 + + cyt_req_t currWord; + + if (!STREAM_IS_EMPTY(in0)) + { + currWord = STREAM_READ(in0); + STREAM_WRITE(out, currWord); + } + else if(!STREAM_IS_EMPTY(in1)) + { + currWord = STREAM_READ(in1); + STREAM_WRITE(out, currWord); + } + else if(!STREAM_IS_EMPTY(in2)) + { + currWord = STREAM_READ(in2); + STREAM_WRITE(out, currWord); + } + else if(!STREAM_IS_EMPTY(in3)) + { + currWord = STREAM_READ(in3); + STREAM_WRITE(out, currWord); + } + +} + + + +// The cyt sq commands have 4 sources if RDMA is enabled +// 2 DMA channels from the CCLO, CCLO sq command, and the Cyt rq interface +void cyt_dma_sq_adapter( + //DM command streams + hls::stream> &dma0_s2mm_cmd, + hls::stream> &dma1_s2mm_cmd, + hls::stream> &dma0_mm2s_cmd, + hls::stream> &dma1_mm2s_cmd, + + //Coyote rq rd_req and wr_req + hls::stream & cyt_rq_wr_cmd, + hls::stream & cyt_rq_rd_cmd, + + //CCLO sq command + hls::stream& cclo_sq_wr_cmd, + hls::stream& cclo_sq_rd_cmd, + + //Coyote sq interface command + hls::stream &cyt_sq_wr_cmd, + hls::stream &cyt_sq_rd_cmd, + + //DM command meta + hls::stream> & dma0_s2mm_meta, + hls::stream> & dma1_s2mm_meta, + hls::stream> & dma0_mm2s_meta, + hls::stream> & dma1_mm2s_meta +) { +#pragma HLS INTERFACE axis port=dma0_s2mm_cmd +#pragma HLS INTERFACE axis port=dma1_s2mm_cmd +#pragma HLS INTERFACE axis port=dma0_mm2s_cmd +#pragma HLS INTERFACE axis port=dma1_mm2s_cmd +#pragma HLS INTERFACE axis port=dma0_s2mm_meta +#pragma HLS INTERFACE axis port=dma1_s2mm_meta +#pragma HLS INTERFACE axis port=dma0_mm2s_meta +#pragma HLS INTERFACE axis port=dma1_mm2s_meta +#pragma HLS INTERFACE axis port=cyt_sq_rd_cmd +#pragma HLS INTERFACE axis port=cyt_sq_wr_cmd +#pragma HLS INTERFACE ap_ctrl_none port=return +#pragma HLS DATAFLOW disable_start_propagation + 
+#pragma HLS aggregate variable=cyt_sq_wr_cmd compact=bit +#pragma HLS aggregate variable=cyt_sq_rd_cmd compact=bit + +#pragma HLS INTERFACE axis port=cyt_rq_wr_cmd +#pragma HLS INTERFACE axis port=cyt_rq_rd_cmd +#pragma HLS aggregate variable=cyt_rq_wr_cmd compact=bit +#pragma HLS aggregate variable=cyt_rq_rd_cmd compact=bit + +#pragma HLS INTERFACE axis port=cclo_sq_wr_cmd +#pragma HLS aggregate variable=cclo_sq_wr_cmd compact=bit +#pragma HLS INTERFACE axis port=cclo_sq_rd_cmd +#pragma HLS aggregate variable=cclo_sq_rd_cmd compact=bit + + static hls::stream sq_wr_cmd_0; + #pragma HLS stream variable=sq_wr_cmd_0 depth=16 + static hls::stream sq_wr_cmd_1; + #pragma HLS stream variable=sq_wr_cmd_1 depth=16 + static hls::stream sq_rd_cmd_0; + #pragma HLS stream variable=sq_rd_cmd_0 depth=16 + static hls::stream sq_rd_cmd_1; + #pragma HLS stream variable=sq_rd_cmd_1 depth=16 + + static hls::stream sq_wr_cmd_2; + #pragma HLS stream variable=sq_wr_cmd_2 depth=16 + static hls::stream sq_rd_cmd_2; + #pragma HLS stream variable=sq_rd_cmd_2 depth=16 + + dm_sq_cmd_converter<0>(dma0_s2mm_cmd, sq_wr_cmd_0, dma0_s2mm_meta); + dm_sq_cmd_converter<1>(dma1_s2mm_cmd, sq_wr_cmd_1, dma1_s2mm_meta); + cyt_rq_sq_cmd_converter<2>(cyt_rq_wr_cmd, sq_wr_cmd_2); + multiplexor(cclo_sq_wr_cmd, sq_wr_cmd_0,sq_wr_cmd_1,sq_wr_cmd_2, cyt_sq_wr_cmd); + + + dm_sq_cmd_converter<0>(dma0_mm2s_cmd,sq_rd_cmd_0, dma0_mm2s_meta); + dm_sq_cmd_converter<1>(dma1_mm2s_cmd,sq_rd_cmd_1, dma1_mm2s_meta); + cyt_rq_sq_cmd_converter<2>(cyt_rq_rd_cmd, sq_rd_cmd_2); + multiplexor(cclo_sq_rd_cmd, sq_rd_cmd_0,sq_rd_cmd_1,sq_rd_cmd_2, cyt_sq_rd_cmd); + + +} diff --git a/kernels/plugins/cyt_adapter/cyt_rdma_arbiter.cpp b/kernels/plugins/cyt_adapter/cyt_rdma_arbiter.cpp index da59244c..8b478808 100644 --- a/kernels/plugins/cyt_adapter/cyt_rdma_arbiter.cpp +++ b/kernels/plugins/cyt_adapter/cyt_rdma_arbiter.cpp @@ -38,23 +38,25 @@ void cyt_rdma_arbiter_meta( if (!STREAM_IS_EMPTY(s_meta)){ reqWord = STREAM_READ(s_meta); - 
if (reqWord.host == 0){ + // if (reqWord.strm == CYT_STRM_RDMA && reqWord.opcode == CYT_RDMA_SEND){ + if(reqWord.opcode == RC_SEND_FIRST || reqWord.opcode == RC_SEND_MIDDLE || reqWord.opcode == RC_SEND_LAST || reqWord.opcode == RC_SEND_ONLY ){ meta_notif.type = 0; //don't care meta_notif.session_id(CYT_PID_BITS-1,0) = reqWord.pid; - meta_notif.session_id(CYT_PID_BITS+CYT_DEST_BITS-1,CYT_PID_BITS) = reqWord.dest; + meta_notif.session_id(CYT_PID_BITS+CYT_DEST_BITS-1,CYT_PID_BITS) = reqWord.vfid; //TODO: check this meta_notif.length = reqWord.len; STREAM_WRITE(m_meta_0, meta_notif); - meta_internal(15,0) = reqWord.host; - meta_internal(31,16) = reqWord.stream; + meta_internal(15,0) = reqWord.dest; + meta_internal(31,16) = CYT_RDMA_SEND; meta_internal(63,32) = reqWord.len; STREAM_WRITE(meta_int, meta_internal); - } else if (reqWord.host == 1) { + // } else if (reqWord.strm == CYT_STRM_RDMA && reqWord.opcode == CYT_RDMA_WRITE) { + } else if (reqWord.opcode == RC_RDMA_WRITE_FIRST || reqWord.opcode == RC_RDMA_WRITE_MIDDLE || reqWord.opcode == RC_RDMA_WRITE_LAST || reqWord.opcode == RC_RDMA_WRITE_LAST_WITH_IMD || reqWord.opcode == RC_RDMA_WRITE_ONLY || reqWord.opcode == RC_RDMA_WRITE_ONLY_WIT_IMD) { + // simply forward the rq_wr and the conversion from rq_wr to sq_wr is done downstream STREAM_WRITE(m_meta_1, reqWord); - - meta_internal(15,0) = reqWord.host; - meta_internal(31,16) = reqWord.stream; + meta_internal(15,0) = reqWord.dest; + meta_internal(31,16) = CYT_RDMA_WRITE; meta_internal(63,32) = reqWord.len; STREAM_WRITE(meta_int, meta_internal); } @@ -68,7 +70,8 @@ void cyt_rdma_arbiter_meta( // We also append the last signal for WRITE data stream for each packet as the cyt adapter set the ctl bits always to 1 void cyt_rdma_arbiter_data( hls::stream >& meta_int, - hls::stream >& s_axis, + hls::stream >& s_axis_0, + hls::stream >& s_axis_1, hls::stream >& m_axis_0, hls::stream >& m_axis_1 ) @@ -76,14 +79,14 @@ void cyt_rdma_arbiter_data( #pragma HLS PIPELINE II=1 #pragma 
HLS INLINE off - enum fsmStateType {META, SEND_STREAM, WRITE_STREAM}; + enum fsmStateType {META, SEND_STREAM, WRITE_STREAM_CARD, WRITE_STREAM_HOST}; static fsmStateType fsmState = META; static ap_axiu<512, 0, 0, 8> currWord; static ap_uint<64> meta_internal; static ap_uint<16> meta_internal_host; - static ap_uint<16> meta_internal_stream; + static ap_uint<16> meta_internal_opcode; static ap_uint<32> meta_internal_len; static ap_uint<32> pkt_word; static ap_uint<32> word_cnt = 0; @@ -95,22 +98,24 @@ void cyt_rdma_arbiter_data( { meta_internal = STREAM_READ(meta_int); meta_internal_host = meta_internal(15,0); - meta_internal_stream = meta_internal(31,16); + meta_internal_opcode = meta_internal(31,16); meta_internal_len = meta_internal(63,32); pkt_word = (meta_internal_len + 63) >> 6; - if (meta_internal_host == 0){ + if (meta_internal_opcode == CYT_RDMA_SEND){ fsmState = SEND_STREAM; - } else if (meta_internal_host == 1){ - fsmState = WRITE_STREAM; + } else if (meta_internal_opcode == CYT_RDMA_WRITE && meta_internal_host == CYT_STRM_CARD){ + fsmState = WRITE_STREAM_CARD; + } else if (meta_internal_opcode == CYT_RDMA_WRITE && meta_internal_host == CYT_STRM_HOST){ + fsmState = WRITE_STREAM_HOST; } } break; case SEND_STREAM: - if (!s_axis.empty()) + if (!s_axis_0.empty()) { - currWord = STREAM_READ(s_axis); + currWord = STREAM_READ(s_axis_0); word_cnt++; if (word_cnt == pkt_word) { @@ -121,22 +126,43 @@ void cyt_rdma_arbiter_data( STREAM_WRITE(m_axis_0, currWord); } break; - case WRITE_STREAM: - if (!s_axis.empty()) + case WRITE_STREAM_CARD: + if (!s_axis_0.empty()) { - currWord = STREAM_READ(s_axis); + currWord = STREAM_READ(s_axis_0); ap_axiu<512, 0, 0, 8> outWord; outWord.data = currWord.data; outWord.keep = currWord.keep; outWord.last = currWord.last; - outWord.dest = meta_internal_stream; + outWord.dest = meta_internal_host; // use the host flag to indicate whether it is to host or device word_cnt++; if (word_cnt == pkt_word) { word_cnt = 0; - currWord.last = 1; + 
outWord.last = 1; + fsmState = META; + } + STREAM_WRITE(m_axis_1, outWord); + } + break; + case WRITE_STREAM_HOST: + if (!s_axis_1.empty()) + { + currWord = STREAM_READ(s_axis_1); + ap_axiu<512, 0, 0, 8> outWord; + + outWord.data = currWord.data; + outWord.keep = currWord.keep; + outWord.last = currWord.last; + outWord.dest = meta_internal_host; // use the host flag to indicate whether it is to host or device + word_cnt++; + + if (word_cnt == pkt_word) + { + word_cnt = 0; + outWord.last = 1; fsmState = META; } STREAM_WRITE(m_axis_1, outWord); @@ -145,15 +171,17 @@ void cyt_rdma_arbiter_data( } } -// check the host bit of the s_meta, which corresponds to the wr_req -// if host bit equals 0, this is a SEND Verb, route meta to eth notification and route data stream to channel 0 -// if host bit equals 1, this is an WRITE Verb, route meta and data to channel 1 -// if data routes to channel 1, set the meta_internal field according to the stream flag in the cyt_req_t to indicate host/card +// check the command type of the s_meta, which corresponds to the rq_wr +// the input data stream can come from either s_axis_0/s_axis_1, corresponding to rrsp_recv_0/1. It should be selected based on the dest field of the s_meta +// if the dest is CYT_STRM_CARD, then consume from s_axis_0, and if dest is CYT_STRM_HOST, consume from s_axis_1 +// if strm flag is CYT_STRM_RDMA, and the opcode is CYT_RDMA_SEND, this is a SEND Verb, route meta to eth notification and route data stream to channel 0 +// if strm flag is CYT_STRM_RDMA, and the opcode is CYT_RDMA_WRITE, this is a WRITE Verb, route meta and data to channel 1 +// if data routes to channel 1 (CYT_RDMA_WRITE), set the meta_internal field according to the dest flag in the cyt_req_t to indicate host/card // compact bit pragma required for cyt_req_t as this interfaces with Coyote. 
- void cyt_rdma_arbiter( hls::stream& s_meta, - hls::stream >& s_axis, + hls::stream >& s_axis_0, + hls::stream >& s_axis_1, hls::stream& m_meta_0, hls::stream >& m_axis_0, hls::stream& m_meta_1, @@ -161,7 +189,8 @@ void cyt_rdma_arbiter( ) { #pragma HLS INTERFACE axis register port=s_meta -#pragma HLS INTERFACE axis register port=s_axis +#pragma HLS INTERFACE axis register port=s_axis_1 +#pragma HLS INTERFACE axis register port=s_axis_0 #pragma HLS INTERFACE axis register port=m_meta_0 #pragma HLS INTERFACE axis register port=m_axis_0 #pragma HLS INTERFACE axis register port=m_meta_1 @@ -186,7 +215,8 @@ void cyt_rdma_arbiter( cyt_rdma_arbiter_data( meta_int, - s_axis, + s_axis_0, + s_axis_1, m_axis_0, m_axis_1 ); diff --git a/kernels/plugins/cyt_adapter/cyt_rdma_mux.cpp b/kernels/plugins/cyt_adapter/cyt_rdma_mux.cpp deleted file mode 100644 index 0febd79c..00000000 --- a/kernels/plugins/cyt_adapter/cyt_rdma_mux.cpp +++ /dev/null @@ -1,179 +0,0 @@ -/******************************************************************************* -# Copyright (C) 2023 Advanced Micro Devices, Inc -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -# *******************************************************************************/ - -#include "cyt.h" -#include "eth_intf.h" - -using namespace std; - - -void cyt_rdma_mux_meta( - hls::stream& s_meta_0, - hls::stream& s_meta_1, - hls::stream& m_meta_0, - hls::stream& m_meta_1, - hls::stream >& meta_int - ) -{ - -#pragma HLS PIPELINE II=1 -#pragma HLS INLINE off - - static rdma_req_t s_metaWord_0; - static cyt_rdma_req_t m_metaWord_0; - static cyt_req_t s_metaWord_1; - static cyt_rdma_req_msg_t rdma_req_msg; - static ap_uint<8> dest = 0; - - // if there is a rdma_sq cmd - // sq command comes from CCLO only has WRITE and SEND Verb - if (!STREAM_IS_EMPTY(s_meta_0)){ - s_metaWord_0 = STREAM_READ(s_meta_0); - m_metaWord_0.opcode = s_metaWord_0.opcode; - m_metaWord_0.qpn = s_metaWord_0.qpn; - m_metaWord_0.host = 0; // data always managed by CCLO - m_metaWord_0.mode = 0; // always PARSE - m_metaWord_0.last = 1; // always assert last - m_metaWord_0.cmplt = 0; // no need to ack - m_metaWord_0.ssn = 0; - m_metaWord_0.offs = 0; - m_metaWord_0.rsrvd = 0; - - rdma_req_msg.lvaddr = 0; // we don't care about local vaddr - rdma_req_msg.rvaddr(47,0) = s_metaWord_0.vaddr; - rdma_req_msg.rvaddr(52,52) = s_metaWord_0.host; - rdma_req_msg.len = s_metaWord_0.len; - rdma_req_msg.params = 0; - - m_metaWord_0.msg = (ap_uint)rdma_req_msg; - - STREAM_WRITE(m_meta_0, m_metaWord_0); - dest = 0; - STREAM_WRITE(meta_int, dest); - } - else if (!STREAM_IS_EMPTY(s_meta_1)){ - s_metaWord_1 = STREAM_READ(s_meta_1); - STREAM_WRITE(m_meta_1, s_metaWord_1); - dest = 1; - STREAM_WRITE(meta_int, dest); - } -} - -void cyt_rdma_mux_data( - hls::stream >& meta_int, - hls::stream >& s_axis_0, - hls::stream >& s_axis_1, - hls::stream >& m_axis -) -{ - #pragma HLS PIPELINE II=1 - #pragma HLS INLINE off - - enum fsmStateType {META, STREAM_0, STREAM_1}; - static fsmStateType fsmState = META; - - static ap_axiu<512, 0, 0, 8> currWord; - - switch (fsmState) - { - case META: - if (!STREAM_IS_EMPTY(meta_int)) - 
{ - ap_uint<8> dest = STREAM_READ(meta_int); - if (dest == 0){ - fsmState = STREAM_0; - } else { - fsmState = STREAM_1; - } - } - break; - case STREAM_0: - if (!STREAM_IS_EMPTY(s_axis_0)) - { - currWord = STREAM_READ(s_axis_0); - STREAM_WRITE(m_axis, currWord); - if (currWord.last) // TODO: check by cnt instead of last - { - fsmState = META; - } - } - break; - case STREAM_1: - if (!STREAM_IS_EMPTY(s_axis_1)) - { - currWord = STREAM_READ(s_axis_1); - STREAM_WRITE(m_axis, currWord); - if (currWord.last) // TODO: check by cnt instead of last - { - fsmState = META; - } - } - break; - } -} - - -// cyt rdma mux will arbitrate the data stream according to the accepted command signal -// the command can be either rdma_sq or the rd_req -// the data stream can be data stream coming from the cclo or from the host/card data stream -// these two streams are mux into single rdma m_axis data stream - -void cyt_rdma_mux( - hls::stream& s_meta_0, - hls::stream >& s_axis_0, - hls::stream& s_meta_1, - hls::stream >& s_axis_1, - hls::stream& m_meta_0, - hls::stream& m_meta_1, - hls::stream >& m_axis - ) -{ -#pragma HLS INTERFACE axis register port=s_meta_0 -#pragma HLS INTERFACE axis register port=s_axis_0 -#pragma HLS INTERFACE axis register port=s_meta_1 -#pragma HLS INTERFACE axis register port=s_axis_1 -#pragma HLS INTERFACE axis register port=m_meta_0 -#pragma HLS INTERFACE axis register port=m_meta_1 -#pragma HLS INTERFACE axis register port=m_axis -#pragma HLS aggregate variable=s_meta_0 compact=bit -#pragma HLS aggregate variable=s_meta_1 compact=bit -#pragma HLS aggregate variable=m_meta_0 compact=bit -#pragma HLS aggregate variable=m_meta_1 compact=bit - -#pragma HLS INTERFACE ap_ctrl_none port=return - -#pragma HLS DATAFLOW disable_start_propagation - - static hls::stream > meta_int; - #pragma HLS STREAM depth=4 variable=meta_int - - cyt_rdma_mux_meta( - s_meta_0, - s_meta_1, - m_meta_0, - m_meta_1, - meta_int - ); - - cyt_rdma_mux_data( - meta_int, - s_axis_0, - s_axis_1, 
- m_axis - ); - -} \ No newline at end of file diff --git a/test/host/Coyote/CMakeLists.txt b/test/host/Coyote/CMakeLists.txt index 7c6b9842..78b7ca5c 100644 --- a/test/host/Coyote/CMakeLists.txt +++ b/test/host/Coyote/CMakeLists.txt @@ -16,6 +16,8 @@ set(EN_AVX 1 CACHE STRING "AVX environment.") add_subdirectory(${CMAKE_SOURCE_DIR}/../../../driver/xrt/ ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/xrt/) +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) # Exec set(EXEC accl_on_coyote) diff --git a/test/host/Coyote/CMakeLists_GPU.txt b/test/host/Coyote/CMakeLists_GPU.txt new file mode 100644 index 00000000..fb0130d5 --- /dev/null +++ b/test/host/Coyote/CMakeLists_GPU.txt @@ -0,0 +1,113 @@ +cmake_minimum_required(VERSION 3.16) +project(accl_on_coyote) + +# set(COYOTE_DRIVER_DIR ${CMAKE_SOURCE_DIR}/../../refdesigns/Coyote/sw/) +set(ACCL_DRIVER_DIR ${CMAKE_SOURCE_DIR}/../../../driver/xrt/) + +set(ACCL_DEBUG 1) + +# Sources +# file(GLOB SOURCES ${COYOTE_DRIVER_DIR}/src/*.cpp ${ACCL_DRIVER_DIR}/src/*.cpp ${CMAKE_SOURCE_DIR}/*.cpp) + +# Enable Coyote +# set(EN_COYOTE 1 CACHE STRING "Enable Coyote") +# AVX support (Disable on Enzian) +set(EN_AVX 1 CACHE STRING "AVX environment.") + +add_subdirectory(${CMAKE_SOURCE_DIR}/../../../driver/xrt/ ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/xrt/) + + +set(EN_GPU 1) +if(NOT DEFINED ROCM_PATH) +if(DEFINED ENV{ROCM_PATH}) + set(ROCM_PATH $ENV{ROCM_PATH} CACHE PATH "Path to which ROCM has been installed") +elseif(DEFINED ENV{HIP_PATH}) + set(ROCM_PATH "$ENV{HIP_PATH}/.." 
CACHE PATH "Path to which ROCM has been installed") +else() + set(ROCM_PATH "/opt/rocm" CACHE PATH "Path to which ROCM has been installed") +endif() +endif() + +file(STRINGS "${ROCM_PATH}/.info/version" ROCM_VERSION) +message("-- Found ROCm: ${ROCM_VERSION}") + +if (NOT DEFINED CMAKE_CXX_COMPILER) + set(CMAKE_CXX_COMPILER ${ROCM_PATH}/bin/hipcc) +endif() + +if(NOT DEFINED HIP_PATH) + if(NOT DEFINED ENV{HIP_PATH}) + set(HIP_PATH "/opt/rocm/hip" CACHE PATH "Path to which HIP has been installed") + else() + set(HIP_PATH $ENV{HIP_PATH} CACHE PATH "Path to which HIP has been installed") + endif() +endif() + +if(NOT DEFINED HCC_PATH) + if(DEFINED ENV{HCC_PATH}) + set(HCC_PATH $ENV{HCC_PATH} CACHE PATH "Path to which HCC has been installed") + else() + set(HCC_PATH "${ROCM_PATH}/hcc" CACHE PATH "Path to which HCC has been installed") + endif() + set(HCC_HOME "${HCC_PATH}") +endif() + +if(NOT DEFINED HIP_CLANG_PATH) + if(NOT DEFINED ENV{HIP_CLANG_PATH}) + set(HIP_CLANG_PATH "${ROCM_PATH}/llvm/bin" CACHE PATH "Path to which HIP compatible clang binaries have been installed") + else() + set(HIP_CLANG_PATH $ENV{HIP_CLANG_PATH} CACHE PATH "Path to which HIP compatible clang binaries have been installed") + endif() +endif() + +find_package(HIP QUIET) +if(HIP_FOUND) + message(STATUS "Found HIP: " ${HIP_VERSION}) +else() + message(FATAL_ERROR "Could not find HIP. 
Ensure that HIP is either installed in /opt/rocm/hip or the variable HIP_PATH is set to point to the right location.") +endif() +find_package(hip REQUIRED) + +set(CYT_LANG ${CYT_LANG} HIP) + + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +# Exec +set(EXEC accl_on_coyote) +add_executable(${EXEC} ${CMAKE_SOURCE_DIR}/test.cpp) + +# accl +target_include_directories(${EXEC} PUBLIC ${ACCL_DRIVER_DIR}/include) +target_link_libraries(${EXEC} PUBLIC accl) + +# MPI +find_package(MPI REQUIRED) +message(STATUS "MPI Include Path: ${MPI_CXX_INCLUDE_PATH}") + +target_include_directories(${EXEC} PUBLIC ${MPI_CXX_INCLUDE_PATH}) +target_link_libraries(${EXEC} PUBLIC MPI::MPI_CXX) + +# XRT +if (NOT EXISTS $ENV{XILINX_XRT}) + message(FATAL_ERROR "Xilinx XRT not found, make sure to source setup.sh") +endif () + +target_link_directories(${EXEC} PUBLIC $ENV{XILINX_XRT}/lib) +target_link_libraries(${EXEC} PUBLIC xilinxopencl xrt_coreutil xrt_core) +target_include_directories(${EXEC} PUBLIC $ENV{XILINX_XRT}/include) + +# ZMQ +target_link_libraries(${EXEC} PUBLIC zmq pthread) + +target_include_directories(${EXEC} PUBLIC /opt/rocm/include /opt/rocm/include/hsa) +target_link_libraries(${EXEC} PUBLIC hip::device numa pthread drm drm_amdgpu rt dl hsa-runtime64 hsakmt) + +# # Json +# find_package(jsoncpp REQUIRED) +# target_link_libraries(${EXEC} PUBLIC jsoncpp_lib) +# get_target_property(JSON_INC_PATH jsoncpp_lib INTERFACE_INCLUDE_DIRECTORIES) +# target_include_directories(${EXEC} PUBLIC ${JSON_INC_PATH}) + + diff --git a/test/host/Coyote/run_scripts/flow_u55c.sh b/test/host/Coyote/run_scripts/flow_u55c.sh index 8976bb7a..f58dfcb7 100755 --- a/test/host/Coyote/run_scripts/flow_u55c.sh +++ b/test/host/Coyote/run_scripts/flow_u55c.sh @@ -2,7 +2,7 @@ # parameters SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) -FPGA_BIT_PATH=$SCRIPT_DIR/../../../refdesigns/Coyote/hw/build_RDMA/lynx/lynx.runs/impl_1/cyt_top 
+FPGA_BIT_PATH=$SCRIPT_DIR/../../../refdesigns/coyote_build_RDMA_u55c/bitstreams/cyt_top # FPGA_BIT_PATH=$SCRIPT_DIR/../../../refdesigns/Coyote/hw/build_TCP/lynx/lynx.runs/impl_1/cyt_top DRIVER_PATH=$SCRIPT_DIR/../../../refdesigns/Coyote/driver/ @@ -81,12 +81,12 @@ if [ $HOT_RESET -eq 1 ]; then for servid in "${SERVID[@]}"; do boardidx=$(expr $servid - 1) host="alveo-u55c-$(printf "%02d" $servid)" - ssh -q -tt $host "sudo insmod $DRIVER_PATH/coyote_drv.ko ip_addr_q0=${IPADDR[boardidx]} mac_addr_q0=${MACADDR[boardidx]}" & + ssh -q -tt $host "sudo insmod $DRIVER_PATH/coyote_drv.ko ip_addr=${IPADDR[boardidx]} mac_addr=${MACADDR[boardidx]}" & done wait echo "Driver loaded." - echo "Getting permissions for fpga..." - parallel-ssh -H "$hostlist" -x '-tt' "sudo /opt/sgrt/cli/program/fpga_chmod 0" + # echo "Getting permissions for fpga..." + # parallel-ssh -H "$hostlist" -x '-tt' "sudo /opt/sgrt/cli/program/fpga_chmod 0" echo "Done." fi diff --git a/test/host/Coyote/run_scripts/run.sh b/test/host/Coyote/run_scripts/run.sh index 59b628a7..808adbb9 100755 --- a/test/host/Coyote/run_scripts/run.sh +++ b/test/host/Coyote/run_scripts/run.sh @@ -8,7 +8,7 @@ fi # state variables mkdir -p "$(pwd)/accl_log" -BUILD_DIR=../build +BUILD_DIR=.. 
EXEC=$BUILD_DIR/accl_on_coyote HOST_FILE=./accl_log/host FPGA_FILE=./accl_log/fpga @@ -40,25 +40,25 @@ done #define ACCL_BARRIER 12 ARG=" -d -f -r" # debug, hardware, and tcp/rdma flags -TEST_MODE=(10) -N_ELEMENTS=(512) # 128 256 512 1024 2048 4096 8192 16384 32768 65536 131072 262144 524288 1048576 +TEST_MODE=(0) +N_ELEMENTS=(64) # 128 256 512 1024 2048 4096 8192 16384 32768 65536 131072 262144 524288 1048576 NRUN=(1) # number of runs HOST=(1) -PROTOC=(1) # eager=0, rendezevous=1 +PROTOC=(0) # eager=0, rendezevous=1 echo "Run command: $EXEC $ARG -y $TEST_MODE -c 1024 -l $FPGA_FILE" rm -f $(pwd)/accl_log/rank* -for NP in `seq 4 $NUM_PROCESS`; do +for NP in `seq $NUM_PROCESS $NUM_PROCESS`; do for MODE in ${TEST_MODE[@]}; do for N_ELE in ${N_ELEMENTS[@]}; do for H in ${HOST[@]}; do for P in ${PROTOC[@]}; do N=$N_ELE echo "mpirun -n $NP -f $HOST_FILE --iface ens4 $EXEC $ARG -z $H -y $MODE -c $N -l $FPGA_FILE -p $P -n $NRUN &" - mpirun -n $NP -f $HOST_FILE --iface ens4f0 -outfile-pattern "./accl_log/rank_%r_M_${MODE}_N_${N}_H_${H}_P_${P}_stdout" -errfile-pattern "./accl_log/rank_%r_M_${MODE}_N_${N}_H_${H}_P_${P}_stdout" $EXEC $ARG -z $H -y $MODE -c $N -l $FPGA_FILE -p $P -n $NRUN & - SLEEPTIME=2 + /mnt/scratch/zhe/mpich/install/bin/mpirun -n $NP -f $HOST_FILE --iface enp65s0f0np0 -outfile-pattern "./accl_log/rank_%r_M_${MODE}_N_${N}_H_${H}_P_${P}_stdout" -errfile-pattern "./accl_log/rank_%r_M_${MODE}_N_${N}_H_${H}_P_${P}_stdout" $EXEC $ARG -z $H -y $MODE -c $N -l $FPGA_FILE -p $P -n $NRUN -e & + SLEEPTIME=10 sleep $SLEEPTIME parallel-ssh -H "$HOST_LIST" "kill -9 \$(ps -aux | grep accl_on_coyote | awk '{print \$2}')" parallel-ssh -H "$HOST_LIST" "dmesg | grep "fpga_tlb_miss_isr" >$(pwd)/accl_log/tlb_miss.log" diff --git a/test/host/Coyote/run_scripts/run_eval.sh b/test/host/Coyote/run_scripts/run_eval.sh new file mode 100755 index 00000000..8f2f4c1a --- /dev/null +++ b/test/host/Coyote/run_scripts/run_eval.sh @@ -0,0 +1,90 @@ +#!/bin/bash + +#check working directory +if 
[[ $(pwd) != */test/host/Coyote/run_scripts ]]; then + echo "ERROR: this script should only be run in the /test/host/Coyote/run_scripts of the repo!" + exit 1 +fi + +# state variables +mkdir -p "$(pwd)/accl_log" +BUILD_DIR=.. +EXEC=$BUILD_DIR/accl_on_coyote +HOST_FILE=./accl_log/host +FPGA_FILE=./accl_log/fpga + +# read server ids from user +echo "Enter u55c machine ids (space separated):" +read -a SERVID + +# create ip files +rm -f $HOST_FILE $FPGA_FILE +NUM_PROCESS=0 +for ID in ${SERVID[@]}; do + echo "10.253.74.$(((ID-1) * 4 + 66))">>$HOST_FILE + echo "10.253.74.$(((ID-1) * 4 + 68))">>$FPGA_FILE + NUM_PROCESS=$((NUM_PROCESS+1)) + HOST_LIST+="alveo-u55c-$(printf "%02d" $ID) " +done + + +# Test Mode +#define ALL 0 +#define ACCL_SEND 3 +#define ACCL_BCAST 5 +#define ACCL_SCATTER 6 +#define ACCL_GATHER 7 +#define ACCL_REDUCE 8 +#define ACCL_ALLGATHER 9 +#define ACCL_ALLREDUCE 10 +#define ACCL_BARRIER 12 + +# read N_ELEMENTS as argument or use default +if [[ -z "$1" ]]; then + echo "No N_ELEMENTS passed as argument, using default value: 16384" + N_ELEMENTS=(16384) +else + echo "Using provided N_ELEMENTS: $1" + N_ELEMENTS=($1) +fi + +ARG=" -d -f -r" # debug, hardware, and tcp/rdma flags +TEST_MODE=(6) +NRUN=(1) # number of runs +HOST=(0) +PROTOC=(0) # eager=0, rendezevous=1 + +echo "Run command: $EXEC $ARG -y $TEST_MODE -c 1024 -l $FPGA_FILE" + +rm -f $(pwd)/accl_log/rank* + +for NP in `seq $NUM_PROCESS $NUM_PROCESS`; do + for MODE in ${TEST_MODE[@]}; do + for N_ELE in ${N_ELEMENTS[@]}; do + for H in ${HOST[@]}; do + for P in ${PROTOC[@]}; do + N=$N_ELE + echo "mpirun -n $NP -f $HOST_FILE --iface ens4 $EXEC $ARG -z $H -y $MODE -c $N -l $FPGA_FILE -p $P -n $NRUN &" + /mnt/scratch/zhe/mpich/install/bin/mpirun -n $NP -f $HOST_FILE --iface enp65s0f0np0 -outfile-pattern "./accl_log/rank_%r_M_${MODE}_N_${N}_H_${H}_P_${P}_stdout" -errfile-pattern "./accl_log/rank_%r_M_${MODE}_N_${N}_H_${H}_P_${P}_stdout" $EXEC $ARG -z $H -y $MODE -c $N -l $FPGA_FILE -p $P -n $NRUN -e & + 
SLEEPTIME=15 + sleep $SLEEPTIME + parallel-ssh -H "$HOST_LIST" "kill -9 \$(ps -aux | grep accl_on_coyote | awk '{print \$2}')" + parallel-ssh -H "$HOST_LIST" "dmesg | grep "fpga_tlb_miss_isr" >$(pwd)/accl_log/tlb_miss.log" + done + done + done + done +done + +mkdir -p "$(pwd)/accl_results" +# Loop through accl log files in the source directory and append to accl_results folder +for source_log in "$(pwd)/accl"*.log; do + # Extract the log number from the source log file name (assuming the format is acclX.log) + log_number=$(basename "${source_log}" | sed 's/accl\([0-9]*\)\.log/\1/') + # Create the destination log file path + destination_log="$(pwd)/accl_results/accl${log_number}.log" + # Append the content of the source log to the destination log + cat "${source_log}" >> "${destination_log}" + # Remove the tmp log + rm ${source_log} +done diff --git a/test/host/Coyote/run_scripts/run_the_run.sh b/test/host/Coyote/run_scripts/run_the_run.sh new file mode 100644 index 00000000..e1f6a7d9 --- /dev/null +++ b/test/host/Coyote/run_scripts/run_the_run.sh @@ -0,0 +1,27 @@ +#!/bin/bash +# Paths (adjust these if needed) +UTIL_PATH="/mnt/scratch/jooertli/ACCL_CYT_V2_EGR_HOST/test/refdesigns/Coyote/util" +BITSTREAM_PATH="/mnt/scratch/jooertli/ACCL_CYT_V2_EGR_HOST/test/refdesigns/coyote_build_RDMA_u55c_norrsp_nocomp_opt_tlb2_eager_host/bitstreams/cyt_top.bit" +DRIVER_PATH="/mnt/scratch/jooertli/ACCL_CYT_V2_EGR_HOST/test/refdesigns/Coyote/driver/build/coyote_driver.ko" + +# Get FPGA numbers from the user for programming +read -p "Enter FPGA numbers (separated by space, e.g., '1 2'): " FPGA_INPUTS + +NRUN=10 +N_ELEMENTS=(4096 16384 65536 262144 524288) #4096 16384 65536 262144 524288 1048576 +for N_ELE in ${N_ELEMENTS[@]}; do + for (( run=1; run<=NRUN; run++ )); do + echo "Run #$run for N_ELEMENTS=$N_ELE" + # Run the programming script + echo "Programming FPGAs: $FPGA_INPUTS ..." 
+ bash "$UTIL_PATH/program_hacc_remote.sh" "$BITSTREAM_PATH" "$DRIVER_PATH" <<< "$FPGA_INPUTS" + SLEEPTIME=6 + sleep $SLEEPTIME + # Run run_eval.sh with current N_ELE and same FPGA_INPUTS + echo "Now running run_eval.sh with N_ELEMENTS=$N_ELE (run #$run)..." + bash "$(dirname "$0")/run_eval.sh" "$N_ELE" <<< "$FPGA_INPUTS" + SLEEPTIME=3 + sleep $SLEEPTIME + done +done +echo "Done!" \ No newline at end of file diff --git a/test/host/Coyote/test.cpp b/test/host/Coyote/test.cpp index c7c21216..7f29fe8f 100644 --- a/test/host/Coyote/test.cpp +++ b/test/host/Coyote/test.cpp @@ -65,6 +65,7 @@ struct options_t unsigned int protoc; std::string xclbin; std::string fpgaIP; + bool eagerRx_host; }; struct timestamp_t @@ -276,6 +277,7 @@ options_t parse_options(int argc, char *argv[]) "i", "device-index", "device index of FPGA if hardware mode is used", false, 0, "positive integer"); cmd.add(device_index_arg); + TCLAP::SwitchArg eager_arg("e", "eager_host", "Eager Buffers on host", cmd, false); cmd.parse(argc, argv); if (hardware_arg.getValue()) { @@ -337,6 +339,7 @@ options_t parse_options(int argc, char *argv[]) opts.xclbin = xclbin_arg.getValue(); opts.fpgaIP = fpgaIP_arg.getValue(); opts.protoc = protoc_arg.getValue(); + opts.eagerRx_host = eager_arg.getValue(); std::cout << "count:" << opts.count << " rxbuf_size:" << opts.rxbuf_size << " seg_size:" << opts.seg_size << " num_rxbufmem:" << opts.num_rxbufmem << std::endl; return opts; @@ -354,24 +357,24 @@ options_t parse_options(int argc, char *argv[]) } -void exchange_qp(unsigned int master_rank, unsigned int slave_rank, unsigned int local_rank, std::vector &ibvQpConn_vec, std::vector &ranks) +void exchange_qp(unsigned int master_rank, unsigned int slave_rank, unsigned int local_rank, std::vector &ranks, ACCL::CoyoteDevice* device) { if (local_rank == master_rank) { std::cout<<"Local rank "<getQpairStruct()->local), sizeof(fpga::ibvQ), MPI_CHAR, slave_rank, 0, MPI_COMM_WORLD); + 
MPI_Send(&(device->coyote_qProc_vec[slave_rank]->getQpair()->local), sizeof(coyote::ibvQ), MPI_CHAR, slave_rank, 0, MPI_COMM_WORLD); } else if (local_rank == slave_rank) { std::cout<<"Local rank "<getQpairStruct()->remote = received_q; + device->coyote_qProc_vec[master_rank]->getQpair()->remote = received_q; } // Synchronize after the first exchange to avoid race conditions @@ -381,17 +384,17 @@ void exchange_qp(unsigned int master_rank, unsigned int slave_rank, unsigned int { std::cout<<"Local rank "<getQpairStruct()->local), sizeof(fpga::ibvQ), MPI_CHAR, master_rank, 0, MPI_COMM_WORLD); + MPI_Send(&(device->coyote_qProc_vec[master_rank]->getQpair()->local), sizeof(coyote::ibvQ), MPI_CHAR, master_rank, 0, MPI_COMM_WORLD); } else if (local_rank == master_rank) { std::cout<<"Local rank "<getQpairStruct()->remote = received_q; + device->coyote_qProc_vec[slave_rank]->getQpair()->remote = received_q; } MPI_Barrier(MPI_COMM_WORLD); @@ -399,20 +402,22 @@ void exchange_qp(unsigned int master_rank, unsigned int slave_rank, unsigned int // write established connection to hardware and perform arp lookup if (local_rank == master_rank) { - int connection = (ibvQpConn_vec[slave_rank]->getQpairStruct()->local.qpn & 0xFFFF) | ((ibvQpConn_vec[slave_rank]->getQpairStruct()->remote.qpn & 0xFFFF) << 16); - ibvQpConn_vec[slave_rank]->getQpairStruct()->print(); - ibvQpConn_vec[slave_rank]->setConnection(connection); - ibvQpConn_vec[slave_rank]->writeContext(ranks[slave_rank].port); - ibvQpConn_vec[slave_rank]->doArpLookup(); - ranks[slave_rank].session_id = ibvQpConn_vec[slave_rank]->getQpairStruct()->local.qpn; + int connection = (device->coyote_qProc_vec[slave_rank]->getQpair()->local.qpn & 0xFFFF) | ((device->coyote_qProc_vec[slave_rank]->getQpair()->remote.qpn & 0xFFFF) << 16); + device->coyote_qProc_vec[slave_rank]->getQpair()->local.print("Local "); + device->coyote_qProc_vec[slave_rank]->getQpair()->remote.print("Remote"); + 
//device->coyote_qProc_vec[slave_rank]->setConnection(connection); + device->coyote_qProc_vec[slave_rank]->writeQpContext(ranks[slave_rank].port); + device->coyote_qProc_vec[slave_rank]->doArpLookup(device->coyote_qProc_vec[slave_rank]->getQpair()->remote.ip_addr); + ranks[slave_rank].session_id = device->coyote_qProc_vec[slave_rank]->getQpair()->local.qpn; } else if (local_rank == slave_rank) { - int connection = (ibvQpConn_vec[master_rank]->getQpairStruct()->local.qpn & 0xFFFF) | ((ibvQpConn_vec[master_rank]->getQpairStruct()->remote.qpn & 0xFFFF) << 16); - ibvQpConn_vec[master_rank]->getQpairStruct()->print(); - ibvQpConn_vec[master_rank]->setConnection(connection); - ibvQpConn_vec[master_rank]->writeContext(ranks[master_rank].port); - ibvQpConn_vec[master_rank]->doArpLookup(); - ranks[master_rank].session_id = ibvQpConn_vec[master_rank]->getQpairStruct()->local.qpn; + int connection = (device->coyote_qProc_vec[master_rank]->getQpair()->local.qpn & 0xFFFF) | ((device->coyote_qProc_vec[master_rank]->getQpair()->remote.qpn & 0xFFFF) << 16); + device->coyote_qProc_vec[master_rank]->getQpair()->local.print("Local "); + device->coyote_qProc_vec[master_rank]->getQpair()->remote.print("Remote"); + //device->coyote_qProc_vec[master_rank]->setConnection(connection); + device->coyote_qProc_vec[master_rank]->writeQpContext(ranks[master_rank].port); + device->coyote_qProc_vec[master_rank]->doArpLookup(device->coyote_qProc_vec[slave_rank]->getQpair()->remote.ip_addr); + ranks[master_rank].session_id = device->coyote_qProc_vec[master_rank]->getQpair()->local.qpn; } MPI_Barrier(MPI_COMM_WORLD); @@ -422,63 +427,129 @@ void exchange_qp(unsigned int master_rank, unsigned int slave_rank, unsigned int void configure_cyt_rdma(std::vector &ranks, int local_rank, ACCL::CoyoteDevice* device) { - std::cout<<"Initializing QP connections..."< ibvQpConn_vec; - // create single page dummy memory space for each qp - uint32_t n_pages = 1; - for(int i=0; icoyote_qProc_vec[i], 
ranks[local_rank].ip, n_pages); - ibvQpConn_vec.push_back(qpConn); - // qpConn->getQpairStruct()->print(); - } + // std::cout<<"Initializing QP connections..."< ibvQp_vec; + // // create single page dummy memory space for each qp + // uint32_t n_pages = 1; + // for(int i=0; icoyote_qProc_vec[i], ranks[local_rank].ip, n_pages); + // ibvQp_vec.push_back(qpConn); + // // qpConn->getQpair()->print(); + // } std::cout<<"Exchanging QP..."< &ranks, int local_rank, ACCL::CoyoteDevice* device) -{ - std::cout<<"Configuring Coyote TCP..."<get_device()->doArpLookup(_ip_encode(ranks[i].ip)); - } - } - - //open port - for (int i=0; iget_device()->tcpOpenPort(dstPort); - } - - std::this_thread::sleep_for(10ms); - - //open con - for (int i=0; iget_device()->tcpOpenCon(dstIp, dstPort, &session); - ranks[i].session_id = session; - } - } +// void configure_cyt_tcp(std::vector &ranks, int local_rank, ACCL::CoyoteDevice* device) +// { +// std::cout<<"Configuring Coyote TCP..."<get_device()->doArpLookup(_ip_encode(ranks[i].ip)); +// } +// } + +// //open port +// for (int i=0; iget_device()->tcpOpenPort(dstPort); +// } + +// std::this_thread::sleep_for(10ms); + +// //open con +// for (int i=0; iget_device()->tcpOpenCon(dstIp, dstPort, &session); +// ranks[i].session_id = session; +// } +// } + +// } + +void test_copy(ACCL::ACCL &accl, options_t &options){ + std::cout << "Start copy test..." 
<< std::endl<(count, dataType::float32); + auto res_buf = accl.create_coyotebuffer(count, dataType::float32); + int errors = 0; -} + if (options.count*sizeof(dataType::float32) > options.rxbuf_size){ + std::cout<<"experiment size larger than buffer size, exiting..."<buffer()[i] = (float)i; + res_buf.get()->buffer()[i] = -999.0f; + } + op_buf.get()->buffer()[0] = (float) 5; + if (options.host == 0){ op_buf->sync_to_device(); } + if (options.host == 0){ res_buf->sync_to_device(); } + // Print buffer addresses for debugging + //std::cout << "Source buffer address: " << op_buf.get()->buffer() << std::endl; + //std::cout << "Result buffer address: " << res_buf.get()->buffer() << std::endl; + + for (int n = 0; n < options.nruns; n++) + { + //copy + MPI_Barrier(MPI_COMM_WORLD); + + double durationUs = 0.0; + double tput = 0.0; + auto start = std::chrono::high_resolution_clock::now(); + ACCL::ACCLRequest* req; + req = accl.copy(*op_buf, *res_buf, count, true, true, false); + accl.wait(req, 1000ms); + + auto end = std::chrono::high_resolution_clock::now(); + durationUs = (std::chrono::duration_cast(end-start).count() / 1000.0); + tput = (options.count*sizeof(dataType::int32)*8.0)/(durationUs*1000.0); + durationUs = (double)accl.get_duration(req)/1000.0; + tput = (options.count*sizeof(dataType::int32)*8.0)/(durationUs*1000.0); + if(durationUs > 1.0){ + accl_log(mpi_rank, format_log("copy", options, durationUs, tput)); + } + std::cout << std::endl; + //compare results + if (options.host == 0){ op_buf->sync_from_device(); } + if (options.host == 0){ res_buf->sync_from_device(); } + for (int i = 0; i < count; i++) { + if (res_buf.get()->buffer()[i] != op_buf.get()->buffer()[i]) { + std::cout << std::to_string(i + 1) + "th item is incorrect!" << std::endl; + errors += 1; + } + } + if (errors > 0) { + std::cout << "Copy test failed with " << errors << " errors out of " << count << " elements!" << std::endl; + failed_tests++; + } else { + std::cout << "Copy test successful!" 
<< std::endl; + } + } + // Free buffers + std::cout << "Freeing buffers..." << std::endl; + op_buf->free_buffer(); + res_buf->free_buffer(); +} void test_sendrcv(ACCL::ACCL &accl, options_t &options) { std::cout << "Start send recv test..." << std::endl<(bufsize, dataType::int32); + //std::cout << "Buffer address: " << op_buf.get()->buffer() << std::endl; for (int n = 0; n < options.nruns; n++) { std::cout << "Repetition " <buffer()[i] = (mpi_rank == 0) ? i : -1; + op_buf.get()->buffer()[0] = (int) 5; if (options.host == 0){ op_buf->sync_to_device(); } @@ -538,12 +610,14 @@ void test_sendrcv(ACCL::ACCL &accl, options_t &options) { int errors = 0; if (options.host == 0){ op_buf->sync_from_device(); } - if (mpi_rank == 1) { for (int i = 0; i < bufsize; i++) { - unsigned int res = op_buf.get()->buffer()[i]; - unsigned int ref = i; + float res = op_buf.get()->buffer()[i]; + float ref = i + n; + if(i == 0){ + ref = 5 + n; + } if (res != ref) { std::cout << std::to_string(i + 1) + "th item is incorrect! (" + std::to_string(res) + " != " + std::to_string(ref) + ")" @@ -559,6 +633,7 @@ void test_sendrcv(ACCL::ACCL &accl, options_t &options) { } else { std::cout << "Test is successful!" 
<< std::endl; } + debug(accl.dump_eager_rx_buffers(false)); } op_buf->free_buffer(); @@ -1081,8 +1156,10 @@ void test_accl_base(options_t options) MPI_Barrier(MPI_COMM_WORLD); if (options.tcp){ - device = new ACCL::CoyoteDevice(); - configure_cyt_tcp(ranks, local_rank, device); + std::cout<<"ACCL with Coyote V2 TCP not supported"<(device); if (options.protoc == 0){ + bool eagerBufs = false; + if(options.eagerRx_host){ + eagerBufs = true; + } std::cout<<"Eager Protocol"<initialize(ranks, mpi_rank, - mpi_size+2, options.rxbuf_size, options.seg_size, 4096*1024*2); + mpi_size+3, options.rxbuf_size, 4096*1024, 4096*1024*2, eagerBufs); } else if (options.protoc == 1){ std::cout<<"Rendezvous Protocol"<initialize(ranks, mpi_rank, mpi_size, 64, 64, options.seg_size); @@ -1141,7 +1222,8 @@ void test_accl_base(options_t options) MPI_Barrier(MPI_COMM_WORLD); - + test_copy(*accl, options); + if(options.test_mode == ACCL_SEND || options.test_mode == 0){ debug(accl->dump_eager_rx_buffers(false)); MPI_Barrier(MPI_COMM_WORLD); @@ -1177,7 +1259,8 @@ void test_accl_base(options_t options) debug(accl->dump_communicator()); debug(accl->dump_eager_rx_buffers(false)); } - if(options.test_mode == ACCL_REDUCE || options.test_mode == 0){ + //comment out non working tests + /*if(options.test_mode == ACCL_REDUCE || options.test_mode == 0){ debug(accl->dump_eager_rx_buffers(false)); MPI_Barrier(MPI_COMM_WORLD); int root = 0; @@ -1191,7 +1274,7 @@ void test_accl_base(options_t options) test_allreduce(*accl, options, reduceFunction::SUM); debug(accl->dump_communicator()); debug(accl->dump_eager_rx_buffers(false)); - } + }*/ if(options.test_mode == ACCL_BARRIER){ std::cout << "Start barrier test..."<< std::endl; for (int n = 0; n < options.nruns; n++) @@ -1208,7 +1291,7 @@ void test_accl_base(options_t options) } } - + MPI_Barrier(MPI_COMM_WORLD); if (failed_tests == 0){ std::cout << "\nACCL base functionality test completed successfully!\n" << std::endl; } diff --git 
a/test/host/Coyote/test_gpu.cpp b/test/host/Coyote/test_gpu.cpp new file mode 100644 index 00000000..10dfde7f --- /dev/null +++ b/test/host/Coyote/test_gpu.cpp @@ -0,0 +1,1356 @@ +/******************************************************************************* +# Copyright (C) 2022 Xilinx, Inc +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# +*******************************************************************************/ + +#include "accl.hpp" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +//IMPORTANT: compile with cmakeLists_GPU and gpu_en in coyote buffer + +// Set the tolerance for compressed datatypes high enough, since we do currently +// not replicate the float32 -> float16 conversion for our reference results +#define FLOAT16RTOL 0.005 +#define FLOAT16ATOL 0.05 + +#define FREQ 250 +#define MAX_PKT_SIZE 4096 + +int mpi_rank, mpi_size; +unsigned failed_tests; +unsigned skipped_tests; + +// leave options be for now to avoid any argument parsing issues + +struct options_t +{ + int start_port; + unsigned int rxbuf_size; + unsigned int seg_size; + unsigned int count; + unsigned int nruns; + unsigned int device_index; + unsigned int num_rxbufmem; + unsigned int test_mode; + bool debug; + bool hardware; + bool axis3; + bool udp; + bool tcp; + bool rdma; + unsigned int host; + unsigned int protoc; + std::string xclbin; + std::string fpgaIP; + bool eagerRx_host; +}; + +struct timestamp_t +{ + uint64_t 
cmdSeq; + uint64_t scenario; + uint64_t len; + uint64_t comm; + uint64_t root_src_dst; + uint64_t function; + uint64_t msg_tag; + uint64_t datapath_cfg; + uint64_t compression_flags; + uint64_t stream_flags; + uint64_t addra_l; + uint64_t addra_h; + uint64_t addrb_l; + uint64_t addrb_h; + uint64_t addrc_l; + uint64_t addrc_h; + uint64_t cmdTimestamp; + uint64_t cmdEnd; + uint64_t stsSeq; + uint64_t sts; + uint64_t stsTimestamp; + uint64_t stsEnd; +}; + +//****************************** +//** XCC Operations ** +//****************************** +// Housekeeping +#define ACCL_CONFIG 0 +// Primitives +#define ACCL_COPY 1 +#define ACCL_COMBINE 2 +#define ACCL_SEND 3 +#define ACCL_RECV 4 +// Collectives +#define ACCL_BCAST 5 +#define ACCL_SCATTER 6 +#define ACCL_GATHER 7 +#define ACCL_REDUCE 8 +#define ACCL_ALLGATHER 9 +#define ACCL_ALLREDUCE 10 +#define ACCL_REDUCE_SCATTER 11 +#define ACCL_BARRIER 12 +#define ACCL_ALLTOALL 13 + +// ACCL_CONFIG SUBFUNCTIONS +#define HOUSEKEEP_SWRST 0 +#define HOUSEKEEP_PKTEN 1 +#define HOUSEKEEP_TIMEOUT 2 +#define HOUSEKEEP_OPEN_PORT 3 +#define HOUSEKEEP_OPEN_CON 4 +#define HOUSEKEEP_SET_STACK_TYPE 5 +#define HOUSEKEEP_SET_MAX_SEGMENT_SIZE 6 +#define HOUSEKEEP_CLOSE_CON 7 + +std::string format_log(std::string collective, options_t options, double time, double tput) +{ + std::string host_str; + std::string protoc_str; + std::string stack_str; + if(options.host == 1){ + host_str = "host"; + } else{ + host_str = "device"; + } + if(options.protoc == 0){ + protoc_str = "eager"; + } else if (options.protoc == 1){ + protoc_str = "rndzvs"; + } + if(options.tcp){ + stack_str = "tcp"; + } else if (options.rdma) { + stack_str = "rdma"; + } + std::string log_str = collective + "," + std::to_string(mpi_size) + "," + std::to_string(mpi_rank) + "," + std::to_string(options.num_rxbufmem) + "," + std::to_string(options.count * sizeof(float)) + "," + std::to_string(options.rxbuf_size) + "," + std::to_string(options.rxbuf_size) + "," + 
std::to_string(MAX_PKT_SIZE) + "," + std::to_string(time) + "," + std::to_string(tput) + "," + host_str + "," + protoc_str + "," + stack_str; + return log_str; +} + +inline void swap_endianness(uint32_t *ip) +{ + uint8_t *ip_bytes = reinterpret_cast(ip); + *ip = (ip_bytes[3] << 0) | (ip_bytes[2] << 8) | (ip_bytes[1] << 16) | + (ip_bytes[0] << 24); +} + +uint32_t _ip_encode(std::string ip) +{ + struct sockaddr_in sa; + inet_pton(AF_INET, ip.c_str(), &(sa.sin_addr)); + swap_endianness(&sa.sin_addr.s_addr); + return sa.sin_addr.s_addr; +} + +std::string ip_decode(uint32_t ip) +{ + char buffer[INET_ADDRSTRLEN]; + struct in_addr sa; + sa.s_addr = ip; + swap_endianness(&sa.s_addr); + inet_ntop(AF_INET, &sa, buffer, INET_ADDRSTRLEN); + return std::string(buffer, INET_ADDRSTRLEN); +} + +void test_debug(std::string message, options_t &options) +{ + if (options.debug) + { + std::cerr << message << std::endl; + } +} + +void check_usage(int argc, char *argv[]) {} + +std::string prepend_process() +{ + return "[process " + std::to_string(mpi_rank) + "] "; +} + +template +bool is_close(T a, T b, double rtol = 1e-5, double atol = 1e-8) +{ + // std::cout << abs(a - b) << " <= " << (atol + rtol * abs(b)) << "? 
" << + // (abs(a - b) <= (atol + rtol * abs(b))) << std::endl; + return abs(a - b) <= (atol + rtol * abs(b)); +} + +template +static void random_array(T *data, size_t count) +{ + std::uniform_real_distribution distribution(-1000, 1000); + std::mt19937 engine; + auto generator = std::bind(distribution, engine); + for (size_t i = 0; i < count; ++i) + { + data[i] = generator(); + } +} + +template +std::unique_ptr random_array(size_t count) +{ + std::unique_ptr data(new T[count]); + random_array(data.get(), count); + return data; +} + + +options_t parse_options(int argc, char *argv[]) +{ + try + { + TCLAP::CmdLine cmd("Test ACCL C++ driver"); + TCLAP::ValueArg nruns_arg("n", "nruns", + "How many times to run each test", + false, 1, "positive integer"); + cmd.add(nruns_arg); + TCLAP::ValueArg start_port_arg( + "s", "start-port", "Start of range of ports usable for sim", false, 5005, + "positive integer"); + cmd.add(start_port_arg); + TCLAP::ValueArg count_arg("c", "count", "How many element per buffer", + false, 16, "positive integer"); + cmd.add(count_arg); + TCLAP::ValueArg bufsize_arg("b", "rxbuf-size", + "How many KB per RX buffer", false, 4096, + "positive integer"); + cmd.add(bufsize_arg); + TCLAP::ValueArg seg_arg("g", "max_segment_size", + "Maximum segmentation size in KB (should be samller than Max DMA transaction)", false, 4096, + "positive integer"); + cmd.add(seg_arg); + TCLAP::ValueArg num_rxbufmem_arg("m", "num_rxbufmem", + "Number of memory banks used for rxbuf", false, 2, + "positive integer"); + cmd.add(num_rxbufmem_arg); + TCLAP::ValueArg test_mode_arg("y", "test_mode", + "Test mode, by default run all the collective tests", false, 0, + "integer"); + cmd.add(test_mode_arg); + TCLAP::ValueArg host_arg("z", "host_buffer", + "Enable host buffer mode with 1", false, 0, + "integer"); + cmd.add(host_arg); + TCLAP::ValueArg protoc_arg("p", "protocol", + "Eager Protocol with 0 and Rendezvous with 1", false, 0, + "integer"); + cmd.add(protoc_arg); + 
TCLAP::SwitchArg debug_arg("d", "debug", "Enable debug mode", cmd, false); + TCLAP::SwitchArg hardware_arg("f", "hardware", "enable hardware mode", cmd, false); + TCLAP::SwitchArg axis3_arg("a", "axis3", "Use axis3 hardware setup", cmd, false); + TCLAP::SwitchArg udp_arg("u", "udp", "Use UDP hardware setup", cmd, false); + TCLAP::SwitchArg tcp_arg("t", "tcp", "Use TCP hardware setup", cmd, false); + TCLAP::SwitchArg rdma_arg("r", "rdma", "Use RDMA hardware setup", cmd, false); + TCLAP::SwitchArg userkernel_arg("k", "userkernel", "Enable user kernel(by default vadd kernel)", cmd, false); + TCLAP::ValueArg xclbin_arg( + "x", "xclbin", "xclbin of accl driver if hardware mode is used", false, + "accl.xclbin", "file"); + cmd.add(xclbin_arg); + TCLAP::ValueArg fpgaIP_arg( + "l", "ipList", "ip list of FPGAs if hardware mode is used", false, + "fpga", "file"); + cmd.add(fpgaIP_arg); + TCLAP::ValueArg device_index_arg( + "i", "device-index", "device index of FPGA if hardware mode is used", + false, 0, "positive integer"); + cmd.add(device_index_arg); + TCLAP::SwitchArg eager_arg("e", "eager_host", "Eager Buffers on host", cmd, false); + cmd.parse(argc, argv); + if (hardware_arg.getValue()) + { + if (axis3_arg.getValue()) + { + if (udp_arg.getValue() || tcp_arg.getValue() || rdma_arg.getValue()) + { + throw std::runtime_error("When using hardware axis3 mode, tcp or rdma or udp can not be used."); + } + std::cout << "Hardware axis3 mode" << std::endl; + } + if (udp_arg.getValue()) + { + if (axis3_arg.getValue() || tcp_arg.getValue() || rdma_arg.getValue()) + { + throw std::runtime_error("When using hardware udp mode, tcp or rdma or axis3 can not be used."); + } + std::cout << "Hardware udp mode" << std::endl; + } + if (tcp_arg.getValue()) + { + if (axis3_arg.getValue() || udp_arg.getValue() || rdma_arg.getValue()) + { + throw std::runtime_error("When using hardware tcp mode, udp or rdma or axis3 can not be used."); + } + std::cout << "Hardware tcp mode" << std::endl; + } + if 
(rdma_arg.getValue()) + { + if (axis3_arg.getValue() || udp_arg.getValue() || tcp_arg.getValue()) + { + throw std::runtime_error("When using hardware rdma mode, udp or tcp or axis3 can not be used."); + } + std::cout << "Hardware rdma mode" << std::endl; + } + if ((axis3_arg.getValue() || udp_arg.getValue() || tcp_arg.getValue() || rdma_arg.getValue()) == false) + { + throw std::runtime_error("When using hardware, specify either axis3 or tcp or" + "udp or rdma mode."); + } + } + + options_t opts; + opts.start_port = start_port_arg.getValue(); + opts.count = count_arg.getValue(); + opts.rxbuf_size = bufsize_arg.getValue() * 1024; // convert to bytes + opts.seg_size = seg_arg.getValue() * 1024; // convert to bytes + opts.num_rxbufmem = num_rxbufmem_arg.getValue(); + opts.nruns = nruns_arg.getValue(); + opts.debug = debug_arg.getValue(); + opts.host = host_arg.getValue(); + opts.hardware = hardware_arg.getValue(); + opts.axis3 = axis3_arg.getValue(); + opts.udp = udp_arg.getValue(); + opts.tcp = tcp_arg.getValue(); + opts.rdma = rdma_arg.getValue(); + opts.test_mode = test_mode_arg.getValue(); + opts.device_index = device_index_arg.getValue(); + opts.xclbin = xclbin_arg.getValue(); + opts.fpgaIP = fpgaIP_arg.getValue(); + opts.protoc = protoc_arg.getValue(); + opts.eagerRx_host = eager_arg.getValue(); + + std::cout << "count:" << opts.count << " rxbuf_size:" << opts.rxbuf_size << " seg_size:" << opts.seg_size << " num_rxbufmem:" << opts.num_rxbufmem << std::endl; + return opts; + } + catch (std::exception &e) + { + if (mpi_rank == 0) + { + std::cout << "Error: " << e.what() << std::endl; + } + + MPI_Finalize(); + exit(1); + } +} + + +void exchange_qp(unsigned int master_rank, unsigned int slave_rank, unsigned int local_rank, std::vector &ranks, ACCL::CoyoteDevice* device) +{ + + if (local_rank == master_rank) + { + std::cout<<"Local rank "<coyote_qProc_vec[slave_rank]->getQpair()->local), sizeof(coyote::ibvQ), MPI_CHAR, slave_rank, 0, MPI_COMM_WORLD); + } + else if 
(local_rank == slave_rank) + { + std::cout<<"Local rank "<coyote_qProc_vec[master_rank]->getQpair()->remote = received_q; + } + + // Synchronize after the first exchange to avoid race conditions + MPI_Barrier(MPI_COMM_WORLD); + + if (local_rank == slave_rank) + { + std::cout<<"Local rank "<coyote_qProc_vec[master_rank]->getQpair()->local), sizeof(coyote::ibvQ), MPI_CHAR, master_rank, 0, MPI_COMM_WORLD); + } + else if (local_rank == master_rank) + { + std::cout<<"Local rank "<coyote_qProc_vec[slave_rank]->getQpair()->remote = received_q; + } + + MPI_Barrier(MPI_COMM_WORLD); + + // write established connection to hardware and perform arp lookup + if (local_rank == master_rank) + { + int connection = (device->coyote_qProc_vec[slave_rank]->getQpair()->local.qpn & 0xFFFF) | ((device->coyote_qProc_vec[slave_rank]->getQpair()->remote.qpn & 0xFFFF) << 16); + device->coyote_qProc_vec[slave_rank]->getQpair()->local.print("Local "); + device->coyote_qProc_vec[slave_rank]->getQpair()->remote.print("Remote"); + //device->coyote_qProc_vec[slave_rank]->setConnection(connection); + device->coyote_qProc_vec[slave_rank]->writeQpContext(ranks[slave_rank].port); + device->coyote_qProc_vec[slave_rank]->doArpLookup(device->coyote_qProc_vec[slave_rank]->getQpair()->remote.ip_addr); + ranks[slave_rank].session_id = device->coyote_qProc_vec[slave_rank]->getQpair()->local.qpn; + } else if (local_rank == slave_rank) + { + int connection = (device->coyote_qProc_vec[master_rank]->getQpair()->local.qpn & 0xFFFF) | ((device->coyote_qProc_vec[master_rank]->getQpair()->remote.qpn & 0xFFFF) << 16); + device->coyote_qProc_vec[master_rank]->getQpair()->local.print("Local "); + device->coyote_qProc_vec[master_rank]->getQpair()->remote.print("Remote"); + //device->coyote_qProc_vec[master_rank]->setConnection(connection); + device->coyote_qProc_vec[master_rank]->writeQpContext(ranks[master_rank].port); + 
device->coyote_qProc_vec[master_rank]->doArpLookup(device->coyote_qProc_vec[slave_rank]->getQpair()->remote.ip_addr); + ranks[master_rank].session_id = device->coyote_qProc_vec[master_rank]->getQpair()->local.qpn; + } + + MPI_Barrier(MPI_COMM_WORLD); +} + + +void configure_cyt_rdma(std::vector &ranks, int local_rank, ACCL::CoyoteDevice* device) +{ + + // std::cout<<"Initializing QP connections..."< ibvQp_vec; + // // create single page dummy memory space for each qp + // uint32_t n_pages = 1; + // for(int i=0; icoyote_qProc_vec[i], ranks[local_rank].ip, n_pages); + // ibvQp_vec.push_back(qpConn); + // // qpConn->getQpair()->print(); + // } + + std::cout<<"Exchanging QP..."< &ranks, int local_rank, ACCL::CoyoteDevice* device) +// { +// std::cout<<"Configuring Coyote TCP..."<get_device()->doArpLookup(_ip_encode(ranks[i].ip)); +// } +// } + +// //open port +// for (int i=0; iget_device()->tcpOpenPort(dstPort); +// } + +// std::this_thread::sleep_for(10ms); + +// //open con +// for (int i=0; iget_device()->tcpOpenCon(dstIp, dstPort, &session); +// ranks[i].session_id = session; +// } +// } + +// } + +void test_copy(ACCL::ACCL &accl, options_t &options){ + std::cout << "Start copy test..." 
<< std::endl<(count, ACCL::dataType::float32); + auto res_buf = accl.create_coyotebuffer(count, ACCL::dataType::float32); + int errors = 0; + if (options.count*sizeof(ACCL::dataType::float32) > options.rxbuf_size){ + std::cout<<"experiment size larger than buffer size, exiting..."<buffer()[i] = (float)i; + res_buf.get()->buffer()[i] = -999.0f; + } + op_buf.get()->buffer()[0] = (float) 5; + if (options.host == 0){ op_buf->sync_to_device(); } + if (options.host == 0){ res_buf->sync_to_device(); } + // Print buffer addresses for debugging + //std::cout << "Source buffer address: " << op_buf.get()->buffer() << std::endl; + //std::cout << "Result buffer address: " << res_buf.get()->buffer() << std::endl; + //copy + ACCL::ACCLRequest* req; + req = accl.copy(*op_buf, *res_buf, count, true, true, false); + accl.wait(req, 1000ms); + + //compare results + if (options.host == 0){ op_buf->sync_from_device(); } + if (options.host == 0){ res_buf->sync_from_device(); } + for (int i = 0; i < count; i++) { + if (res_buf.get()->buffer()[i] != op_buf.get()->buffer()[i]) { + std::cout << std::to_string(i + 1) + "th item is incorrect!" << std::endl; + errors += 1; + } + } + if (errors > 0) { + std::cout << "Copy test failed with " << errors << " errors out of " << count << " elements!" << std::endl; + failed_tests++; + } else { + std::cout << "Copy test successful!" << std::endl; + } + + // Free buffers + std::cout << "Freeing buffers..." << std::endl; + op_buf->free_buffer(); + res_buf->free_buffer(); +} + +void test_sendrcv(ACCL::ACCL &accl, options_t &options) { + std::cout << "Start send recv test..." << std::endl< options.rxbuf_size){ + std::cout<<"experiment size larger than buffer size, exiting..."<(bufsize, ACCL::dataType::int32); + std::cout << "Buffer address: " << op_buf.get()->buffer() << std::endl; + + for (int n = 0; n < options.nruns; n++) + { + std::cout << "Repetition " <buffer()[i] = (mpi_rank == 0) ? 
(i + n) : -1; + op_buf.get()->buffer()[0] = (int) (5 + n); + + if (options.host == 0){ op_buf->sync_to_device(); } + + MPI_Barrier(MPI_COMM_WORLD); + + double durationUs = 0.0; + double tput = 0.0; + auto start = std::chrono::high_resolution_clock::now(); + + ACCL::ACCLRequest* req; + if (mpi_rank == 0) { + // send + req = accl.send(*op_buf, bufsize, 1, ACCL::TAG_ANY, ACCL::GLOBAL_COMM, true, ACCL::dataType::none, true); // most default send from 0 to 1 + accl.wait(req, 1000ms); + + } else if (mpi_rank == 1) { + // receive + req = accl.recv(*op_buf, bufsize, 0, ACCL::TAG_ANY, ACCL::GLOBAL_COMM, true, ACCL::dataType::none, true); // most default recv to 1 from 0 + accl.wait(req, 1000ms); + } + + auto end = std::chrono::high_resolution_clock::now(); + durationUs = (std::chrono::duration_cast(end-start).count() / 1000.0); + tput = (options.count*sizeof(ACCL::dataType::int32)*8.0)/(durationUs*1000.0); + std::cout<<"host measured durationUs:"< 1.0){ + ACCL::accl_log(mpi_rank, format_log("sendrecv", options, durationUs, tput)); + } + } + + int errors = 0; + + if (options.host == 0){ op_buf->sync_from_device(); } + if (mpi_rank == 1) + { + for (int i = 0; i < bufsize; i++) { + unsigned int res = op_buf.get()->buffer()[i]; + unsigned int ref = i + n; + if(i == 0){ + ref = 5 + n; + } + if (res != ref) { + std::cout << std::to_string(i + 1) + "th item is incorrect! (" + + std::to_string(res) + " != " + std::to_string(ref) + ")" + << std::endl; + errors += 1; + } + } + } + + if (errors > 0) { + std::cout << std::to_string(errors) + " errors!" << std::endl; + failed_tests++; + } else { + std::cout << "Test is successful!" << std::endl; + } + ACCL::debug(accl.dump_eager_rx_buffers(false)); + } + + op_buf->free_buffer(); +} + + +void test_bcast(ACCL::ACCL &accl, options_t &options, int root) { + std::cout << "Start bcast test with root " + std::to_string(root) + " ..." 
+ << std::endl< options.rxbuf_size){ + std::cout<<"experiment size larger than buffer size, exiting..."<(count, ACCL::dataType::int32); + + for (int n = 0; n < options.nruns; n++) + { + std::cout << "Repetition " <buffer()[i] = (mpi_rank == root) ? i : -1; + + if (options.host == 0){ op_buf->sync_to_device(); } + + if (mpi_rank == root) { + test_debug("Broadcasting data from " + std::to_string(mpi_rank) + "...", + options); + } else { + test_debug("Getting broadcast data from " + std::to_string(root) + "...", + options); + } + + MPI_Barrier(MPI_COMM_WORLD); + + double durationUs = 0.0; + accl.barrier(); + std::cout<<"Pass accl barrier"<(end-start).count() / 1000.0); + std::cout<<"host measured durationUs:"< 1.0){ + ACCL::accl_log(mpi_rank, format_log("bcast", options, durationUs, 0)); + } + + if (options.host == 0){ op_buf->sync_from_device(); } + + if (mpi_rank != root) { + int errors = 0; + for (int i = 0; i < count; i++) { + unsigned int res = op_buf.get()->buffer()[i]; + unsigned int ref = i; + if (res != ref) { + std::cout << std::to_string(i + 1) + "th item is incorrect! (" + + std::to_string(res) + " != " + std::to_string(ref) + + ")" + << std::endl; + errors += 1; + } + } + + if (errors > 0) { + std::cout << std::to_string(errors) + " errors!" << std::endl; + failed_tests++; + } else { + std::cout << "Test is successful!" << std::endl; + } + } + } + + op_buf->free_buffer(); + +} + +void test_scatter(ACCL::ACCL &accl, options_t &options, int root) { + std::cout << "Start scatter test with root " + std::to_string(root) + " ..." 
+ << std::endl; + unsigned int count = options.count; + + if (options.count*mpi_size*sizeof(ACCL::dataType::int32) > options.rxbuf_size){ + std::cout<<"experiment size larger than buffer size, exiting..."<(count * mpi_size, ACCL::dataType::int32); + auto res_buf = accl.create_coyotebuffer(count, ACCL::dataType::int32); + + for (int n = 0; n < options.nruns; n++) + { + std::cout << "Repetition " <buffer()[i] = i; + for (int i = 0; i < count; i++) res_buf.get()->buffer()[i] = -1; + + if (options.host == 0){ op_buf->sync_to_device(); } + if (options.host == 0){ res_buf->sync_to_device(); } + + test_debug("Scatter data from " + std::to_string(root) + "...", options); + + MPI_Barrier(MPI_COMM_WORLD); + double durationUs = 0.0; + accl.barrier(); + std::cout<<"Pass accl barrier"<(end-start).count() / 1000.0); + std::cout<<"host measured durationUs:"< 1.0){ + ACCL::accl_log(mpi_rank, format_log("scatter", options, durationUs, 0)); + } + + std::this_thread::sleep_for(10ms); + if (options.host == 0){ op_buf->sync_from_device(); } + if (options.host == 0){ res_buf->sync_from_device(); } + + int errors = 0; + for (unsigned int i = 0; i < count; ++i) { + unsigned int res = res_buf.get()->buffer()[i]; + unsigned int ref = op_buf.get()->buffer()[i + mpi_rank * count]; + if (res != ref) { + std::cout << std::to_string(i + 1) + "th item is incorrect! (" + + std::to_string(res) + " != " + std::to_string(ref) + ")" + << std::endl; + errors += 1; + } + } + + if (errors > 0) { + std::cout << std::to_string(errors) + " errors!" << std::endl; + failed_tests++; + } else { + std::cout << "Test is successful!" << std::endl; + } + } + + op_buf->free_buffer(); + res_buf->free_buffer(); + +} + + +void test_gather(ACCL::ACCL &accl, options_t &options, int root) { + std::cout << "Start gather test with root " + std::to_string(root) + "..." 
+ << std::endl; + unsigned int count = options.count; + + if (options.count*mpi_size*sizeof(ACCL::dataType::float32) > options.rxbuf_size){ + std::cout<<"experiment size larger than buffer size, exiting..."<(count, ACCL::dataType::float32); + + std::unique_ptr> res_buf; + if (mpi_rank == root) { + res_buf = accl.create_coyotebuffer(count * mpi_size, ACCL::dataType::float32); + } else { + //res_buf = std::unique_ptr>(nullptr); + res_buf = accl.create_coyotebuffer(count * mpi_size, ACCL::dataType::float32); + } + + for (int n = 0; n < options.nruns; n++) + { + std::cout << "Repetition " <buffer()[i] = mpi_rank*count + i; + if (options.host == 0){ op_buf->sync_to_device(); } + if (mpi_rank == root) { + for (int i = 0; i < count * mpi_size; i++) res_buf.get()->buffer()[i] = 0; + if (options.host == 0){ res_buf->sync_to_device(); } + } + + test_debug("Gather data from " + std::to_string(mpi_rank) + "...", options); + + MPI_Barrier(MPI_COMM_WORLD); + double durationUs = 0.0; + accl.barrier(); + std::cout<<"Pass accl barrier"<(end-start).count() / 1000.0); + std::cout<<"host measured durationUs:"< 1.0){ + ACCL::accl_log(mpi_rank, format_log("gather", options, durationUs, 0)); + } + + std::this_thread::sleep_for(10ms); + if (options.host == 0){ op_buf->sync_from_device(); } + if (mpi_rank == root){ + if (options.host == 0){ res_buf->sync_from_device(); } + } + + if (mpi_rank == root) { + int errors = 0; + for (unsigned int j = 0; j < mpi_size; ++j) { + for (size_t i = 0; i < count; i++) + { + float res = res_buf.get()->buffer()[j*count+i]; + float ref = j*count+i; + if (res != ref) { + std::cout << std::to_string(i + 1) + "th item is incorrect! (" + + std::to_string(res) + " != " + std::to_string(ref) + + ")" + << std::endl; + errors += 1; + } + } + } + + if (errors > 0) { + std::cout << std::to_string(errors) + " errors!" << std::endl; + failed_tests++; + } else { + std::cout << "Test is successful!" 
<< std::endl; + } + } + } + + op_buf->free_buffer(); + if (mpi_rank == root) { + res_buf->free_buffer(); + } +} + + +void test_allgather(ACCL::ACCL &accl, options_t &options) { + std::cout << "Start allgather test..." << std::endl; + unsigned int count = options.count; + + if (options.count*mpi_size*sizeof(ACCL::dataType::int32) > options.rxbuf_size){ + std::cout<<"experiment size larger than buffer size, exiting..."<(count, ACCL::dataType::float32); + auto res_buf = accl.create_coyotebuffer(count * mpi_size, ACCL::dataType::float32); + + for (int n = 0; n < options.nruns; n++) + { + std::cout << "Repetition " <buffer()[i] = mpi_rank*count + i; + for (int i = 0; i < count * mpi_size; i++) res_buf.get()->buffer()[i] = 0; + + if (options.host == 0){ op_buf->sync_to_device(); } + if (options.host == 0){ res_buf->sync_to_device(); } + + test_debug("Gathering data...", options); + + MPI_Barrier(MPI_COMM_WORLD); + double durationUs = 0.0; + accl.barrier(); + std::cout<<"Pass accl barrier"<(end-start).count() / 1000.0); + std::cout<<"host measured durationUs:"< 1.0){ + ACCL::accl_log(mpi_rank, format_log("allgather", options, durationUs, 0)); + } + + std::this_thread::sleep_for(10ms); + + if (options.host == 0){ op_buf->sync_from_device(); } + if (options.host == 0){ res_buf->sync_from_device(); } + + int errors = 0; + for (unsigned int j = 0; j < mpi_size; ++j) { + for (size_t i = 0; i < count; i++) + { + float res = res_buf.get()->buffer()[j*count+i]; + float ref = j*count+i; + if (res != ref) { + std::cout << std::to_string(i + 1) + "th item is incorrect! (" + + std::to_string(res) + " != " + std::to_string(ref) + + ")" + << std::endl; + errors += 1; + } + } + } + + if (errors > 0) { + std::cout << std::to_string(errors) + " errors!" << std::endl; + failed_tests++; + } else { + std::cout << "Test is successful!" 
<< std::endl; + } + } + + op_buf->free_buffer(); + res_buf->free_buffer(); + +} + +void test_reduce(ACCL::ACCL &accl, options_t &options, int root, + ACCL::reduceFunction function) { + std::cout << "Start reduce test with root " + std::to_string(root) + + " and reduce function " + + std::to_string(static_cast(function)) + "..." + << std::endl; + unsigned int count = options.count; + + if (options.count*sizeof(ACCL::dataType::int32) > options.rxbuf_size){ + std::cout<<"experiment size larger than buffer size, exiting..."<(count, ACCL::dataType::float32); + auto res_buf = accl.create_coyotebuffer(count, ACCL::dataType::float32); + + for (int n = 0; n < options.nruns; n++) + { + std::cout << "Repetition " <buffer()[i] = i; + for (int i = 0; i < count; i++) res_buf.get()->buffer()[i] = 0; + + if (options.host == 0){ op_buf->sync_to_device(); } + if (options.host == 0){ res_buf->sync_to_device(); } + + test_debug("Reduce data to " + std::to_string(root) + "...", options); + + MPI_Barrier(MPI_COMM_WORLD); + double durationUs = 0.0; + accl.barrier(); + std::cout<<"Pass accl barrier"<(end-start).count() / 1000.0); + std::cout<<"host measured durationUs:"< 1.0){ + ACCL::accl_log(mpi_rank, format_log("reduce", options, durationUs, 0)); + } + + std::this_thread::sleep_for(10ms); + if (options.host == 0){ op_buf->sync_from_device(); } + if (options.host == 0){ res_buf->sync_from_device(); } + + if (mpi_rank == root) { + int errors = 0; + + for (unsigned int i = 0; i < count; ++i) { + float res = res_buf.get()->buffer()[i]; + float ref = i * mpi_size; + + if (res != ref) { + std::cout << std::to_string(i + 1) + "th item is incorrect! (" + + std::to_string(res) + " != " + std::to_string(ref) + + ")" + << std::endl; + errors += 1; + } + } + + if (errors > 0) { + std::cout << std::to_string(errors) + " errors!" << std::endl; + failed_tests++; + } else { + std::cout << "Test is successful!" 
<< std::endl; + } + } + } + + op_buf->free_buffer(); + res_buf->free_buffer(); +} + +void test_allreduce(ACCL::ACCL &accl, options_t &options, + ACCL::reduceFunction function) { + std::cout << "Start allreduce test and reduce function " + + std::to_string(static_cast(function)) + "..." + << std::endl; + unsigned int count = options.count; + + if (options.count*sizeof(ACCL::dataType::int32) > options.rxbuf_size){ + std::cout<<"experiment size larger than buffer size, exiting..."<(count, ACCL::dataType::int32); + auto res_buf = accl.create_coyotebuffer(count, ACCL::dataType::int32); + + for (int n = 0; n < options.nruns; n++) + { + std::cout << "Repetition " <buffer()[i] = i; + for (int i = 0; i < count; i++) res_buf.get()->buffer()[i] = 0; + + if (options.host == 0){ op_buf->sync_to_device(); } + if (options.host == 0){ res_buf->sync_to_device(); } + + test_debug("Reducing data...", options); + + MPI_Barrier(MPI_COMM_WORLD); + double durationUs = 0.0; + accl.barrier(); + std::cout<<"Pass accl barrier"<(end-start).count() / 1000.0); + std::cout<<"host measured durationUs:"< 1.0){ + ACCL::accl_log(mpi_rank, format_log("allreduce", options, durationUs, 0)); + } + + std::this_thread::sleep_for(10ms); + if (options.host == 0){ op_buf->sync_from_device(); } + if (options.host == 0){ res_buf->sync_from_device(); } + + int errors = 0; + + for (unsigned int i = 0; i < count; ++i) { + float res = res_buf.get()->buffer()[i]; + float ref = i * mpi_size; + + if (res != ref) { + std::cout << std::to_string(i + 1) + "th item is incorrect! (" + + std::to_string(res) + " != " + std::to_string(ref) + ")" + << std::endl; + errors += 1; + } + } + + if (errors > 0) { + std::cout << std::to_string(errors) + " errors!" << std::endl; + failed_tests++; + } else { + std::cout << "Test is successful!" 
<< std::endl; + } + } + + op_buf->free_buffer(); + res_buf->free_buffer(); + +} + +std::unique_ptr<::ACCL::ACCL> accl; + +void test_accl_base(options_t options) +{ + std::cout << "Testing ACCL base functionality..." << std::endl; + + // initialize ACCL + std::vector ranks; + int local_rank = mpi_rank; + failed_tests = 0; + + // load ip addresses for targets + std::ifstream myfile; + myfile.open(options.fpgaIP); + if (!myfile.is_open()) + { + perror("Error open fpgaIP file"); + exit(EXIT_FAILURE); + } + std::vector ipList; + for (int i = 0; i < mpi_size; ++i) + { + std::string ip; + if (options.hardware && !options.axis3) + { + ip = "10.10.10." + std::to_string(i); + getline(myfile, ip); + std::cout << ip << std::endl; + ipList.push_back(ip); + } + else + { + ip = "127.0.0.1"; + } + + if(options.hardware && options.rdma) { + ACCL::rank_t new_rank = {ip, options.start_port, i, options.rxbuf_size}; + ranks.emplace_back(new_rank); + } else { + ACCL::rank_t new_rank = {ip, options.start_port + i, 0, options.rxbuf_size}; + ranks.emplace_back(new_rank); + } + + } + + std::unique_ptr accl; + // construct CoyoteDevice out here already, since it is necessary for creating buffers + // before the ACCL instance exists. 
+ ACCL::CoyoteDevice* device; + + MPI_Barrier(MPI_COMM_WORLD); + + if (options.tcp){ + std::cout<<"ACCL with Coyote V2 TCP not supported"<(device); + if (options.protoc == 0){ + bool eagerBufs = false; + if(options.eagerRx_host){ + eagerBufs = true; + } + std::cout<<"Eager Protocol"<initialize(ranks, mpi_rank, + mpi_size+3, options.rxbuf_size, 4096*1024, 4096*1024*2, eagerBufs); + } else if (options.protoc == 1){ + std::cout<<"Rendezvous Protocol"<initialize(ranks, mpi_rank, mpi_size, 64, 64, options.seg_size); + } + + ACCL::debug(accl->dump_communicator()); + + MPI_Barrier(MPI_COMM_WORLD); + + } else { + ACCL::debug("unsupported situation!!!"); + exit(1); + } + + + + MPI_Barrier(MPI_COMM_WORLD); + + double durationUs = 0.0; + auto start = std::chrono::high_resolution_clock::now(); + ACCL::ACCLRequest* req = accl->nop(true); + accl->wait(req); + auto end = std::chrono::high_resolution_clock::now(); + durationUs = (std::chrono::duration_cast(end-start).count() / 1000.0); + uint64_t durationNs = accl->get_duration(req); + std::cout << "sw nop time [us]:"<dump_eager_rx_buffers(false)); + MPI_Barrier(MPI_COMM_WORLD); + test_sendrcv(*accl, options); + ACCL::debug(accl->dump_communicator()); + ACCL::debug(accl->dump_eager_rx_buffers(false)); + } + if(options.test_mode == ACCL_BCAST || options.test_mode == 0){ + ACCL::debug(accl->dump_eager_rx_buffers(false)); + MPI_Barrier(MPI_COMM_WORLD); + test_bcast(*accl, options, 0); + ACCL::debug(accl->dump_communicator()); + ACCL::debug(accl->dump_eager_rx_buffers(false)); + } + if(options.test_mode == ACCL_SCATTER || options.test_mode == 0){ + ACCL::debug(accl->dump_eager_rx_buffers(false)); + MPI_Barrier(MPI_COMM_WORLD); + test_scatter(*accl, options, 0); + ACCL::debug(accl->dump_communicator()); + ACCL::debug(accl->dump_eager_rx_buffers(false)); + } + if(options.test_mode == ACCL_GATHER || options.test_mode == 0){ + ACCL::debug(accl->dump_eager_rx_buffers(false)); + MPI_Barrier(MPI_COMM_WORLD); + test_gather(*accl, options, 0); 
+ ACCL::debug(accl->dump_communicator()); + ACCL::debug(accl->dump_eager_rx_buffers(false)); + } + if(options.test_mode == ACCL_ALLGATHER || options.test_mode == 0){ + ACCL::debug(accl->dump_eager_rx_buffers(false)); + MPI_Barrier(MPI_COMM_WORLD); + test_allgather(*accl, options); + ACCL::debug(accl->dump_communicator()); + ACCL::debug(accl->dump_eager_rx_buffers(false)); + } + /*if(options.test_mode == ACCL_REDUCE || options.test_mode == 0){ + ACCL::debug(accl->dump_eager_rx_buffers(false)); + MPI_Barrier(MPI_COMM_WORLD); + int root = 0; + test_reduce(*accl, options, root, ACCL::reduceFunction::SUM); + ACCL::debug(accl->dump_communicator()); + ACCL::debug(accl->dump_eager_rx_buffers(false)); + } + if(options.test_mode == ACCL_ALLREDUCE || options.test_mode == 0){ + ACCL::debug(accl->dump_eager_rx_buffers(false)); + MPI_Barrier(MPI_COMM_WORLD); + test_allreduce(*accl, options, ACCL::reduceFunction::SUM); + ACCL::debug(accl->dump_communicator()); + ACCL::debug(accl->dump_eager_rx_buffers(false)); + }*/ + if(options.test_mode == ACCL_BARRIER){ + std::cout << "Start barrier test..."<< std::endl; + for (int n = 0; n < options.nruns; n++) + { + std::cout << "Repetition " <barrier(); + auto end = std::chrono::high_resolution_clock::now(); + durationUs = (std::chrono::duration_cast(end-start).count() / 1000.0); + std::cout<<"barrier durationUs:"< +struct aligned_allocator { + using value_type = T; + T* allocate(std::size_t num) { + void* ptr = nullptr; + if (posix_memalign(&ptr,4096,num*sizeof(T))) + throw std::bad_alloc(); + return reinterpret_cast(ptr); + } + void deallocate(T* p, std::size_t num) { + free(p); + } +}; + +void accl_sa_handler(int) +{ + static bool once = true; + if(once) { + accl.reset(); + std::cout << "Error! Signal received. Finalizing MPI..." << std::endl; + MPI_Finalize(); + std::cout << "Done. Terminating..." 
<< std::endl; + once = false; + } + exit(EXIT_FAILURE); +} + +int main(int argc, char *argv[]) +{ + struct sigaction sa; + memset( &sa, 0, sizeof(sa) ); + sa.sa_handler = accl_sa_handler; + sigfillset(&sa.sa_mask); + sigaction(SIGINT,&sa,NULL); + sigaction(SIGSEGV, &sa, NULL); + + std::cout << "Arguments: "; + for (int i = 0; i < argc; i++) std::cout << "'" << argv[i] << "' "; std::cout << std::endl; + std::cout << "Running ACCL test in coyote..." << std::endl; + std::cout << "Initializing MPI..." << std::endl; + MPI_Init(&argc, &argv); + + std::cout << "Reading MPI rank and size values..." << std::endl; + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); + MPI_Comm_size(MPI_COMM_WORLD, &mpi_size); + + std::cout << "Parsing options" << std::endl; + options_t options = parse_options(argc, argv); + + std::cout << "Getting MPI Processor name..." << std::endl; + int len; + char name[MPI_MAX_PROCESSOR_NAME]; + MPI_Get_processor_name(name, &len); + + std::ostringstream stream; + stream << prepend_process() << "rank " << mpi_rank << " size " << mpi_size << " " << name + << std::endl; + std::cout << stream.str(); + + MPI_Barrier(MPI_COMM_WORLD); + + test_accl_base(options); + + MPI_Barrier(MPI_COMM_WORLD); + + std::cout << "Finalizing MPI..." << std::endl; + MPI_Finalize(); + std::cout << "Done. Terminating..." 
<< std::endl; + return 0; +} diff --git a/test/refdesigns/Coyote b/test/refdesigns/Coyote index ef4853fc..4f5b7d5b 160000 --- a/test/refdesigns/Coyote +++ b/test/refdesigns/Coyote @@ -1 +1 @@ -Subproject commit ef4853fc4eefc2768213179abe1b9278834bd2d2 +Subproject commit 4f5b7d5b2a19be7dda3e454d10e642c19538b5c3 diff --git a/test/refdesigns/Makefile b/test/refdesigns/Makefile index a5ff4ed6..0d7fe2cb 100644 --- a/test/refdesigns/Makefile +++ b/test/refdesigns/Makefile @@ -77,10 +77,11 @@ COMPRESSION_XO=../../kernels/plugins/hp_compression/hp_compression_$(FPGAPART).x LOOPBACK_XO=../../kernels/plugins/loopback/loopback_$(FPGAPART).xo TCP_SESS_XO=../../kernels/plugins/tcp_session_handler/tcp_session_handler_$(FPGAPART).xo VADD_XO=../../kernels/plugins/vadd_put/vadd_put_$(FPGAPART).xo -CYT_DMA_ADAPTER_XO=../../kernels/plugins/cyt_adapter/cyt_dma_adapter_$(FPGAPART).xo +CYT_DMA_SQ_ADAPTER_XO=../../kernels/plugins/cyt_adapter/cyt_dma_sq_adapter_$(FPGAPART).xo +CYT_CQ_DM_STS_CONVERTER_XO=../../kernels/plugins/cyt_adapter/cyt_cq_dm_sts_converter_$(FPGAPART).xo CYT_RDMA_ARBITER_XO=../../kernels/plugins/cyt_adapter/cyt_rdma_arbiter_$(FPGAPART).xo -CYT_RDMA_MUX_XO=../../kernels/plugins/cyt_adapter/cyt_rdma_mux_$(FPGAPART).xo -CYT_ADAPTER_XO = $(CYT_DMA_ADAPTER_XO) $(CYT_RDMA_ARBITER_XO) $(CYT_RDMA_MUX_XO) +CCLO_SQ_ADAPTER_XO=../../kernels/plugins/cyt_adapter/cclo_sq_adapter_$(FPGAPART).xo +CYT_ADAPTER_XO = $(CYT_DMA_SQ_ADAPTER_XO) $(CYT_RDMA_ARBITER_XO) $(CCLO_SQ_ADAPTER_XO) $(CYT_CQ_DM_STS_CONVERTER_XO) HWEMU_MST_XO=$$XILINX_VITIS/data/emulation/XO/sim_ipc_axis_master_512.xo HWEMU_SLV_XO=$$XILINX_VITIS/data/emulation/XO/sim_ipc_axis_slave_512.xo @@ -193,7 +194,7 @@ else ifeq (coyote_rdma, $(MODE)) N_DDR_CHAN = 2 endif OTHER_XO += $(CYT_ADAPTER_XO) - COYOTE_CONFIG = -DFDEV_NAME=$(BOARD) -DEN_MEM=1 -DEN_STRM=1 -DEN_BPSS=1 -DEN_RDMA_0=1 -DEN_RPC=1 -DN_STRM_AXI=3 -DN_CARD_AXI=3 -DEN_HLS=0 -DACLK_F=250 -DTLBL_A=12 -DN_DDR_CHAN=$(N_DDR_CHAN) + COYOTE_CONFIG = -DFDEV_NAME=$(BOARD) 
-DEXAMPLE=rdma -DSHELL_PROBE=8 -DN_REGIONS=1 -DEN_MEM=1 -DEN_STRM=1 -DEN_RDMA=1 -DN_RDMA_AXI=2 -DN_STRM_AXI=3 -DN_CARD_AXI=3 -DACLK_F=250 -DTLBL_A=2 CCLO_STACK_TYPE = RDMA OUTPUT_PRODUCT = $(CYT_BIT) USE_HOSTMEM = 1 @@ -201,7 +202,7 @@ else $(error Unsupported MODE) endif -CCLO_XO = ../../kernels/cclo/$(CCLO_STACK_TYPE)_1111$(CCLO_MB_DEBUG_LEVEL)_$(FPGAPART)/ccl_offload.xo +CCLO_XO = ../../kernels/cclo/$(CCLO_STACK_TYPE)_1101$(CCLO_MB_DEBUG_LEVEL)_$(FPGAPART)/ccl_offload.xo OTHER_XO += $(CCLO_XO) ifeq (1, $(USE_HOSTMEM)) @@ -232,19 +233,19 @@ $(VNX)/NetLayers/_x.%/networklayer.xo: $(MAKE) -C xup_vitis_network_example/NetLayers DEVICE=$* all .PHONY: coyote_shell -coyote_shell: $(CYT_BUILD_DIR)/lynx/lynx.xpr +coyote_shell: $(CYT_BUILD_DIR)/test_config_0/user_c0_0/test.xpr -$(CYT_BUILD_DIR)/lynx/lynx.xpr: - mkdir $(CYT_BUILD_DIR) && cd $(CYT_BUILD_DIR) && cmake ../Coyote/hw $(COYOTE_CONFIG) - $(MAKE) -C $(CYT_BUILD_DIR) shell +$(CYT_BUILD_DIR)/test_config_0/user_c0_0/test.xpr: + mkdir $(CYT_BUILD_DIR) && cd $(CYT_BUILD_DIR) && cmake ../hdl/ $(COYOTE_CONFIG) + $(MAKE) -C $(CYT_BUILD_DIR) project $(CYT_BIT): coyote_shell $(MAKE) -C ../../kernels/cclo PLATFORM=$(PLATFORM) STACK_TYPE=$(CCLO_STACK_TYPE) MB_DEBUG_LEVEL=$(CCLO_MB_DEBUG_LEVEL) $(MAKE) -C ../../kernels/plugins DEVICE=$(FPGAPART) cp -rf $(OTHER_XO) $(CYT_BUILD_DIR)/iprepo && cd $(CYT_BUILD_DIR)/iprepo && unzip -n '*.xo' vivado -mode tcl -source tcl/coyote.tcl -tclargs $(CCLO_STACK_TYPE) $(CYT_BUILD_DIR) - cp hdl/$(MODE)_top.sv $(CYT_BUILD_DIR)/lynx/hdl/config_0/user_logic_c0_0.sv - $(MAKE) -C $(CYT_BUILD_DIR) compile + cp hdl/$(MODE)_top.sv $(CYT_BUILD_DIR)/test_config_0/user_c0_0/hdl/wrappers/user_logic_c0_0.sv + $(MAKE) -C $(CYT_BUILD_DIR) bitgen .PHONY: tcp_stack tcp_stack: $(TCP_XO) $(CMAC_TCP_XO) diff --git a/test/refdesigns/hdl/CMakeLists.txt b/test/refdesigns/hdl/CMakeLists.txt new file mode 100644 index 00000000..3af5d880 --- /dev/null +++ b/test/refdesigns/hdl/CMakeLists.txt @@ -0,0 +1,32 @@ 
+cmake_minimum_required(VERSION 3.0) +project(test) + +set(CYT_DIR ${CMAKE_SOURCE_DIR}/../Coyote) +set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${CYT_DIR}/cmake) + +find_package(CoyoteHW REQUIRED) + +# +# ACCL Test +# +if(EXAMPLE STREQUAL "rdma") + message("** ACCL-RDMA Test") + set(N_REGIONS 1) + set(EN_STRM 1) + set(EN_RDMA 1) + set(N_STRM_AXI 3) + set(N_CARD_AXI 3) + set(N_RDMA_AXI 2) + set(ACLK_F 250) + set(TLBL_A 2) + set(EN_MEM 1) + set(BUILD_OPT 1) + + validation_checks_hw() + + # load_apps ( + # VFPGA_C0_0 "coyote_rdma_top.sv" + # ) + + create_hw() +endif() diff --git a/test/refdesigns/hdl/coyote_rdma_top.sv b/test/refdesigns/hdl/coyote_rdma_top.sv index 6b333a53..56adec77 100644 --- a/test/refdesigns/hdl/coyote_rdma_top.sv +++ b/test/refdesigns/hdl/coyote_rdma_top.sv @@ -30,39 +30,32 @@ module design_user_logic_c0_0 ( // AXI4L CONTROL AXI4L.s axi_ctrl, - // DESCRIPTOR BYPASS - metaIntf.m bpss_rd_req, - metaIntf.m bpss_wr_req, - metaIntf.s bpss_rd_done, - metaIntf.s bpss_wr_done, - - // AXI4S HOST STREAMS - AXI4SR.s axis_host_0_sink, - AXI4SR.m axis_host_0_src, - AXI4SR.s axis_host_1_sink, - AXI4SR.m axis_host_1_src, - AXI4SR.s axis_host_2_sink, - AXI4SR.m axis_host_2_src, - - // AXI4S CARD STREAMS - AXI4SR.s axis_card_0_sink, - AXI4SR.m axis_card_0_src, - AXI4SR.s axis_card_1_sink, - AXI4SR.m axis_card_1_src, - AXI4SR.s axis_card_2_sink, - AXI4SR.m axis_card_2_src, - - // RDMA QSFP0 CMD - metaIntf.s rdma_0_rd_req, - metaIntf.s rdma_0_wr_req, - - // AXI4S RDMA QSFP0 STREAMS - AXI4SR.s axis_rdma_0_sink, - AXI4SR.m axis_rdma_0_src, - - // RDMA QSFP0 SQ and RQ - metaIntf.m rdma_0_sq, - metaIntf.s rdma_0_ack, + // NOTIFY + metaIntf.m notify, + + // DESCRIPTORS + metaIntf.m sq_rd, + metaIntf.m sq_wr, + metaIntf.s cq_rd, + metaIntf.s cq_wr, + metaIntf.s rq_rd, + metaIntf.s rq_wr, + + // HOST DATA STREAMS + AXI4SR.s axis_host_recv [N_STRM_AXI], + AXI4SR.m axis_host_send [N_STRM_AXI], + + // CARD DATA STREAMS + AXI4SR.s axis_card_recv [N_CARD_AXI], + AXI4SR.m 
axis_card_send [N_CARD_AXI], + + // RDMA DATA STREAMS REQUESTER + AXI4SR.s axis_rreq_recv [N_RDMA_AXI], + AXI4SR.m axis_rreq_send [N_RDMA_AXI], + + // RDMA DATA STREAMS RESPONDER + AXI4SR.s axis_rrsp_recv [N_RDMA_AXI], + AXI4SR.m axis_rrsp_send [N_RDMA_AXI], // Clock and reset input wire aclk, @@ -70,14 +63,12 @@ module design_user_logic_c0_0 ( ); /* -- Tie-off unused interfaces and signals ----------------------------- */ -// always_comb axis_host_0_sink.tie_off_s(); -// always_comb axis_host_0_src_s.tie_off_m(); -// always_comb axis_card_0_sink.tie_off_s(); -// always_comb axis_card_0_src_s.tie_off_m(); -// always_comb axis_host_1_sink.tie_off_s(); -// always_comb axis_host_1_src.tie_off_m(); -// always_comb axis_card_1_sink.tie_off_s(); -// always_comb axis_card_1_src_s.tie_off_m(); +always_comb notify.tie_off_m(); +always_comb axis_rrsp_send[0].tie_off_m(); +always_comb axis_rrsp_send[1].tie_off_m(); +always_comb axis_card_recv[2].tie_off_s(); +always_comb axis_host_recv[2].tie_off_s(); + /* -- USER LOGIC -------------------------------------------------------- */ @@ -85,39 +76,6 @@ module design_user_logic_c0_0 ( localparam integer COYOTE_AXIL_ADDR_LSB = $clog2(AXIL_DATA_BITS/8); localparam integer COYOTE_AXIL_ADDR_MSB = 16; -// Master Data Stream -AXI4SR axis_host_0_src_s (); -AXI4SR axis_host_1_src_s (); -AXI4SR axis_host_2_src_s (); -AXI4SR axis_card_0_src_s (); -AXI4SR axis_card_1_src_s (); -AXI4SR axis_card_2_src_s (); - -// register slices -axisr_reg_array #(.N_STAGES(4)) (.aclk(aclk), .aresetn(aresetn), .s_axis(axis_host_0_src_s), .m_axis(axis_host_0_src)); -axisr_reg_array #(.N_STAGES(4)) (.aclk(aclk), .aresetn(aresetn), .s_axis(axis_host_1_src_s), .m_axis(axis_host_1_src)); -axisr_reg_array #(.N_STAGES(4)) (.aclk(aclk), .aresetn(aresetn), .s_axis(axis_host_2_src_s), .m_axis(axis_host_2_src)); -axisr_reg_array #(.N_STAGES(4)) (.aclk(aclk), .aresetn(aresetn), .s_axis(axis_card_0_src_s), .m_axis(axis_card_0_src)); -axisr_reg_array #(.N_STAGES(4)) 
(.aclk(aclk), .aresetn(aresetn), .s_axis(axis_card_1_src_s), .m_axis(axis_card_1_src)); -axisr_reg_array #(.N_STAGES(4)) (.aclk(aclk), .aresetn(aresetn), .s_axis(axis_card_2_src_s), .m_axis(axis_card_2_src)); - -// Slave Data Stream -AXI4SR axis_host_0_sink_s (); -AXI4SR axis_host_1_sink_s (); -AXI4SR axis_host_2_sink_s (); -AXI4SR axis_card_0_sink_s (); -AXI4SR axis_card_1_sink_s (); -AXI4SR axis_card_2_sink_s (); - -// register slices -axisr_reg_array #(.N_STAGES(4)) (.aclk(aclk), .aresetn(aresetn), .s_axis(axis_host_0_sink), .m_axis(axis_host_0_sink_s)); -axisr_reg_array #(.N_STAGES(4)) (.aclk(aclk), .aresetn(aresetn), .s_axis(axis_host_1_sink), .m_axis(axis_host_1_sink_s)); -axisr_reg_array #(.N_STAGES(4)) (.aclk(aclk), .aresetn(aresetn), .s_axis(axis_host_2_sink), .m_axis(axis_host_2_sink_s)); -axisr_reg_array #(.N_STAGES(4)) (.aclk(aclk), .aresetn(aresetn), .s_axis(axis_card_0_sink), .m_axis(axis_card_0_sink_s)); -axisr_reg_array #(.N_STAGES(4)) (.aclk(aclk), .aresetn(aresetn), .s_axis(axis_card_1_sink), .m_axis(axis_card_1_sink_s)); -axisr_reg_array #(.N_STAGES(4)) (.aclk(aclk), .aresetn(aresetn), .s_axis(axis_card_2_sink), .m_axis(axis_card_2_sink_s)); - - // ACCL Block Design accl_bd_wrapper accl_system( .ap_clk_0(aclk), @@ -143,138 +101,201 @@ accl_bd_wrapper accl_system( .S00_AXI_0_wstrb(axi_ctrl.wstrb), .S00_AXI_0_wvalid(axi_ctrl.wvalid), - .cyt_byp_rd_cmd_0_tdata(bpss_rd_req.data), - .cyt_byp_rd_cmd_0_tready(bpss_rd_req.ready), - .cyt_byp_rd_cmd_0_tvalid(bpss_rd_req.valid), + .cyt_sq_rd_cmd_tdata(sq_rd.data), + .cyt_sq_rd_cmd_tready(sq_rd.ready), + .cyt_sq_rd_cmd_tvalid(sq_rd.valid), + + .cyt_cq_rd_sts_0_tdata(cq_rd.data), + .cyt_cq_rd_sts_0_tready(cq_rd.ready), + .cyt_cq_rd_sts_0_tvalid(cq_rd.valid), + + .cyt_sq_wr_cmd_tdata(sq_wr.data), + .cyt_sq_wr_cmd_tready(sq_wr.ready), + .cyt_sq_wr_cmd_tvalid(sq_wr.valid), - .cyt_byp_rd_sts_0_tdata(bpss_rd_done.data), - .cyt_byp_rd_sts_0_tready(bpss_rd_done.ready), - .cyt_byp_rd_sts_0_tvalid(bpss_rd_done.valid), 
+ .cyt_cq_wr_sts_0_tdata(cq_wr.data), + .cyt_cq_wr_sts_0_tready(cq_wr.ready), + .cyt_cq_wr_sts_0_tvalid(cq_wr.valid), - .cyt_byp_wr_cmd_0_tdata(bpss_wr_req.data), - .cyt_byp_wr_cmd_0_tready(bpss_wr_req.ready), - .cyt_byp_wr_cmd_0_tvalid(bpss_wr_req.valid), + .cyt_rq_rd_tdata(rq_rd.data), + .cyt_rq_rd_tready(rq_rd.ready), + .cyt_rq_rd_tvalid(rq_rd.valid), - .cyt_byp_wr_sts_0_tdata(bpss_wr_done.data), - .cyt_byp_wr_sts_0_tready(bpss_wr_done.ready), - .cyt_byp_wr_sts_0_tvalid(bpss_wr_done.valid), + .cyt_rq_wr_tdata(rq_wr.data), + .cyt_rq_wr_tready(rq_wr.ready), + .cyt_rq_wr_tvalid(rq_wr.valid), - .m_axis_host_0_tdata(axis_host_0_src_s.tdata), - .m_axis_host_0_tkeep(axis_host_0_src_s.tkeep), - .m_axis_host_0_tlast(axis_host_0_src_s.tlast), - .m_axis_host_0_tready(axis_host_0_src_s.tready), - .m_axis_host_0_tvalid(axis_host_0_src_s.tvalid), + .m_axis_host_0_tdata(axis_host_send[0].tdata), + .m_axis_host_0_tkeep(axis_host_send[0].tkeep), + .m_axis_host_0_tlast(axis_host_send[0].tlast), + .m_axis_host_0_tready(axis_host_send[0].tready), + .m_axis_host_0_tvalid(axis_host_send[0].tvalid), .m_axis_host_0_tdest(), - .m_axis_host_1_tdata(axis_host_1_src_s.tdata), - .m_axis_host_1_tkeep(axis_host_1_src_s.tkeep), - .m_axis_host_1_tlast(axis_host_1_src_s.tlast), - .m_axis_host_1_tready(axis_host_1_src_s.tready), - .m_axis_host_1_tvalid(axis_host_1_src_s.tvalid), + .m_axis_host_1_tdata(axis_host_send[1].tdata), + .m_axis_host_1_tkeep(axis_host_send[1].tkeep), + .m_axis_host_1_tlast(axis_host_send[1].tlast), + .m_axis_host_1_tready(axis_host_send[1].tready), + .m_axis_host_1_tvalid(axis_host_send[1].tvalid), .m_axis_host_1_tdest(), - .m_axis_host_2_tdata(axis_host_2_src_s.tdata), - .m_axis_host_2_tkeep(axis_host_2_src_s.tkeep), - .m_axis_host_2_tlast(axis_host_2_src_s.tlast), - .m_axis_host_2_tready(axis_host_2_src_s.tready), - .m_axis_host_2_tvalid(axis_host_2_src_s.tvalid), + .m_axis_host_2_tdata(axis_host_send[2].tdata), + .m_axis_host_2_tkeep(axis_host_send[2].tkeep), + 
.m_axis_host_2_tlast(axis_host_send[2].tlast), + .m_axis_host_2_tready(axis_host_send[2].tready), + .m_axis_host_2_tvalid(axis_host_send[2].tvalid), .m_axis_host_2_tdest(), - .m_axis_card_0_tdata(axis_card_0_src_s.tdata), - .m_axis_card_0_tkeep(axis_card_0_src_s.tkeep), - .m_axis_card_0_tlast(axis_card_0_src_s.tlast), - .m_axis_card_0_tready(axis_card_0_src_s.tready), - .m_axis_card_0_tvalid(axis_card_0_src_s.tvalid), + .m_axis_card_0_tdata(axis_card_send[0].tdata), + .m_axis_card_0_tkeep(axis_card_send[0].tkeep), + .m_axis_card_0_tlast(axis_card_send[0].tlast), + .m_axis_card_0_tready(axis_card_send[0].tready), + .m_axis_card_0_tvalid(axis_card_send[0].tvalid), .m_axis_card_0_tdest(), - .m_axis_card_1_tdata(axis_card_1_src_s.tdata), - .m_axis_card_1_tkeep(axis_card_1_src_s.tkeep), - .m_axis_card_1_tlast(axis_card_1_src_s.tlast), - .m_axis_card_1_tready(axis_card_1_src_s.tready), - .m_axis_card_1_tvalid(axis_card_1_src_s.tvalid), + .m_axis_card_1_tdata(axis_card_send[1].tdata), + .m_axis_card_1_tkeep(axis_card_send[1].tkeep), + .m_axis_card_1_tlast(axis_card_send[1].tlast), + .m_axis_card_1_tready(axis_card_send[1].tready), + .m_axis_card_1_tvalid(axis_card_send[1].tvalid), .m_axis_card_1_tdest(), - .m_axis_card_2_tdata(axis_card_2_src_s.tdata), - .m_axis_card_2_tkeep(axis_card_2_src_s.tkeep), - .m_axis_card_2_tlast(axis_card_2_src_s.tlast), - .m_axis_card_2_tready(axis_card_2_src_s.tready), - .m_axis_card_2_tvalid(axis_card_2_src_s.tvalid), + .m_axis_card_2_tdata(axis_card_send[2].tdata), + .m_axis_card_2_tkeep(axis_card_send[2].tkeep), + .m_axis_card_2_tlast(axis_card_send[2].tlast), + .m_axis_card_2_tready(axis_card_send[2].tready), + .m_axis_card_2_tvalid(axis_card_send[2].tvalid), .m_axis_card_2_tdest(), - .s_axis_host_0_tdata(axis_host_0_sink_s.tdata), - .s_axis_host_0_tkeep(axis_host_0_sink_s.tkeep), - .s_axis_host_0_tlast(axis_host_0_sink_s.tlast), - .s_axis_host_0_tready(axis_host_0_sink_s.tready), - .s_axis_host_0_tvalid(axis_host_0_sink_s.tvalid), - - 
.s_axis_host_1_tdata(axis_host_1_sink_s.tdata), - .s_axis_host_1_tkeep(axis_host_1_sink_s.tkeep), - .s_axis_host_1_tlast(axis_host_1_sink_s.tlast), - .s_axis_host_1_tready(axis_host_1_sink_s.tready), - .s_axis_host_1_tvalid(axis_host_1_sink_s.tvalid), - - .s_axis_host_2_tdata(axis_host_2_sink_s.tdata), - .s_axis_host_2_tkeep(axis_host_2_sink_s.tkeep), - .s_axis_host_2_tlast(axis_host_2_sink_s.tlast), - .s_axis_host_2_tready(axis_host_2_sink_s.tready), - .s_axis_host_2_tvalid(axis_host_2_sink_s.tvalid), - - .s_axis_card_0_tdata(axis_card_0_sink_s.tdata), - .s_axis_card_0_tkeep(axis_card_0_sink_s.tkeep), - .s_axis_card_0_tlast(axis_card_0_sink_s.tlast), - .s_axis_card_0_tready(axis_card_0_sink_s.tready), - .s_axis_card_0_tvalid(axis_card_0_sink_s.tvalid), - - .s_axis_card_1_tdata(axis_card_1_sink_s.tdata), - .s_axis_card_1_tkeep(axis_card_1_sink_s.tkeep), - .s_axis_card_1_tlast(axis_card_1_sink_s.tlast), - .s_axis_card_1_tready(axis_card_1_sink_s.tready), - .s_axis_card_1_tvalid(axis_card_1_sink_s.tvalid), - - .s_axis_card_2_tdata(axis_card_2_sink_s.tdata), - .s_axis_card_2_tkeep(axis_card_2_sink_s.tkeep), - .s_axis_card_2_tlast(axis_card_2_sink_s.tlast), - .s_axis_card_2_tready(axis_card_2_sink_s.tready), - .s_axis_card_2_tvalid(axis_card_2_sink_s.tvalid), - - .s_axis_eth_rx_data_tdata(axis_rdma_0_sink.tdata), - .s_axis_eth_rx_data_tdest(axis_rdma_0_sink.tid), - .s_axis_eth_rx_data_tkeep(axis_rdma_0_sink.tkeep), - .s_axis_eth_rx_data_tlast(axis_rdma_0_sink.tlast), - .s_axis_eth_rx_data_tready(axis_rdma_0_sink.tready), - .s_axis_eth_rx_data_tvalid(axis_rdma_0_sink.tvalid), - - .m_axis_eth_tx_data_tdata(axis_rdma_0_src.tdata), - .m_axis_eth_tx_data_tdest(axis_rdma_0_src.tid), // not driven, default 0 - .m_axis_eth_tx_data_tkeep(axis_rdma_0_src.tkeep), - .m_axis_eth_tx_data_tlast(axis_rdma_0_src.tlast), - .m_axis_eth_tx_data_tready(axis_rdma_0_src.tready), - .m_axis_eth_tx_data_tvalid(axis_rdma_0_src.tvalid), - - .s_axis_rdma_wr_req_tdata(rdma_0_wr_req.data), - 
.s_axis_rdma_wr_req_tvalid(rdma_0_wr_req.valid), - .s_axis_rdma_wr_req_tready(rdma_0_wr_req.ready), - - .s_axis_rdma_rd_req_tdata(rdma_0_rd_req.data), - .s_axis_rdma_rd_req_tvalid(rdma_0_rd_req.valid), - .s_axis_rdma_rd_req_tready(rdma_0_rd_req.ready), - - .m_axis_rdma_sq_tdata(rdma_0_sq.data), - .m_axis_rdma_sq_tvalid(rdma_0_sq.valid), - .m_axis_rdma_sq_tready(rdma_0_sq.ready) + .s_axis_host_0_tdata(axis_host_recv[0].tdata), + .s_axis_host_0_tkeep(axis_host_recv[0].tkeep), + .s_axis_host_0_tlast(axis_host_recv[0].tlast), + .s_axis_host_0_tready(axis_host_recv[0].tready), + .s_axis_host_0_tvalid(axis_host_recv[0].tvalid), + + .s_axis_host_1_tdata(axis_host_recv[1].tdata), + .s_axis_host_1_tkeep(axis_host_recv[1].tkeep), + .s_axis_host_1_tlast(axis_host_recv[1].tlast), + .s_axis_host_1_tready(axis_host_recv[1].tready), + .s_axis_host_1_tvalid(axis_host_recv[1].tvalid), + + .s_axis_card_0_tdata(axis_card_recv[0].tdata), + .s_axis_card_0_tkeep(axis_card_recv[0].tkeep), + .s_axis_card_0_tlast(axis_card_recv[0].tlast), + .s_axis_card_0_tready(axis_card_recv[0].tready), + .s_axis_card_0_tvalid(axis_card_recv[0].tvalid), + + .s_axis_card_1_tdata(axis_card_recv[1].tdata), + .s_axis_card_1_tkeep(axis_card_recv[1].tkeep), + .s_axis_card_1_tlast(axis_card_recv[1].tlast), + .s_axis_card_1_tready(axis_card_recv[1].tready), + .s_axis_card_1_tvalid(axis_card_recv[1].tvalid), + + .cyt_rreq_recv_0_tdata(axis_rreq_recv[0].tdata), + .cyt_rreq_recv_0_tkeep(axis_rreq_recv[0].tkeep), + .cyt_rreq_recv_0_tlast(axis_rreq_recv[0].tlast), + .cyt_rreq_recv_0_tready(axis_rreq_recv[0].tready), + .cyt_rreq_recv_0_tvalid(axis_rreq_recv[0].tvalid), + + .cyt_rreq_recv_1_tdata(axis_rreq_recv[1].tdata), + .cyt_rreq_recv_1_tkeep(axis_rreq_recv[1].tkeep), + .cyt_rreq_recv_1_tlast(axis_rreq_recv[1].tlast), + .cyt_rreq_recv_1_tready(axis_rreq_recv[1].tready), + .cyt_rreq_recv_1_tvalid(axis_rreq_recv[1].tvalid), + + .cyt_rreq_send_0_tdata(axis_rreq_send[0].tdata), + .cyt_rreq_send_0_tdest(), + 
.cyt_rreq_send_0_tkeep(axis_rreq_send[0].tkeep), + .cyt_rreq_send_0_tlast(axis_rreq_send[0].tlast), + .cyt_rreq_send_0_tready(axis_rreq_send[0].tready), + .cyt_rreq_send_0_tstrb(), + .cyt_rreq_send_0_tvalid(axis_rreq_send[0].tvalid), + + .cyt_rreq_send_1_tdata(axis_rreq_send[1].tdata), + .cyt_rreq_send_1_tdest(), + .cyt_rreq_send_1_tkeep(axis_rreq_send[1].tkeep), + .cyt_rreq_send_1_tlast(axis_rreq_send[1].tlast), + .cyt_rreq_send_1_tready(axis_rreq_send[1].tready), + .cyt_rreq_send_1_tstrb(), + .cyt_rreq_send_1_tvalid(axis_rreq_send[1].tvalid), + + .cyt_rrsp_recv_0_tdata(axis_rrsp_recv[0].tdata), + .cyt_rrsp_recv_0_tkeep(axis_rrsp_recv[0].tkeep), + .cyt_rrsp_recv_0_tlast(axis_rrsp_recv[0].tlast), + .cyt_rrsp_recv_0_tready(axis_rrsp_recv[0].tready), + .cyt_rrsp_recv_0_tvalid(axis_rrsp_recv[0].tvalid), + + .cyt_rrsp_recv_1_tdata(axis_rrsp_recv[1].tdata), + .cyt_rrsp_recv_1_tkeep(axis_rrsp_recv[1].tkeep), + .cyt_rrsp_recv_1_tlast(axis_rrsp_recv[1].tlast), + .cyt_rrsp_recv_1_tready(axis_rrsp_recv[1].tready), + .cyt_rrsp_recv_1_tvalid(axis_rrsp_recv[1].tvalid) ); -assign axis_host_0_src_s.tid = 0; -assign axis_host_1_src_s.tid = 0; -assign axis_host_2_src_s.tid = 0; - -assign axis_card_0_src_s.tid = 0; -assign axis_card_1_src_s.tid = 0; -assign axis_card_2_src_s.tid = 0; - -assign rdma_0_ack.ready = 1'b1; - +/*ila_top inst_ila_top( + .clk(aclk), + .probe0(sq_wr.valid), //1 + .probe1(sq_wr.ready), //1 + .probe2(sq_wr.data), //128 + .probe3(sq_rd.valid), //1 + .probe4(sq_rd.ready), //1 + .probe5(sq_rd.data), //128 + .probe6(rq_wr.valid), //1 + .probe7(rq_wr.ready), //1 + .probe8(rq_wr.data), //128 + .probe9(rq_rd.valid), //1 + .probe10(rq_rd.ready), //1 + .probe11(rq_rd.data), //128 + .probe12(cq_rd.data), //32 + .probe13(cq_rd.valid), + .probe14(cq_rd.ready), + .probe15(cq_wr.data), //32 + .probe16(cq_wr.valid), + .probe17(cq_wr.ready), + .probe18(axis_host_send[0].tvalid), + .probe19(axis_host_send[0].tready), + .probe20(axis_host_send[0].tdata), + 
.probe21(axis_host_send[1].tvalid), + .probe22(axis_host_send[1].tready), + .probe23(axis_host_send[1].tdata), + .probe24(axis_rrsp_recv[0].tvalid), + .probe25(axis_rrsp_recv[0].tready), + .probe26(axis_rrsp_recv[0].tdata), + .probe27(axis_rrsp_recv[1].tvalid), + .probe28(axis_rrsp_recv[1].tready), + .probe29(axis_rrsp_recv[1].tdata), + .probe30(axis_host_recv[0].tvalid), + .probe31(axis_host_recv[0].tready), + .probe32(axis_host_recv[0].tdata), + .probe33(axis_host_recv[1].tvalid), + .probe34(axis_host_recv[1].tready), + .probe35(axis_host_recv[1].tdata), + .probe36(axis_rreq_send[0].tvalid), + .probe37(axis_rreq_send[0].tready), + .probe38(axis_rreq_send[0].tdata), + .probe39(axis_rreq_send[1].tvalid), + .probe40(axis_rreq_send[1].tready), + .probe41(axis_rreq_send[1].tdata), + .probe42(axis_rrsp_recv[0].tlast), + .probe43(axis_rrsp_recv[1].tlast), + .probe44(axis_rreq_send[0].tlast), + .probe45(axis_rreq_send[1].tlast) + );*/ + + + + +assign axis_host_send[0].tid = 0; +assign axis_host_send[1].tid = 0; +assign axis_host_send[2].tid = 0; + +assign axis_card_send[0].tid = 0; +assign axis_card_send[1].tid = 0; +assign axis_card_send[2].tid = 0; + +assign axis_rreq_send[0].tid = 0; +assign axis_rreq_send[1].tid = 0; endmodule \ No newline at end of file diff --git a/test/refdesigns/tcl/coyote.tcl b/test/refdesigns/tcl/coyote.tcl index 5225362d..5168e57e 100644 --- a/test/refdesigns/tcl/coyote.tcl +++ b/test/refdesigns/tcl/coyote.tcl @@ -1,3 +1,4 @@ + # /******************************************************************************* # Copyright (C) 2023 Advanced Micro Devices, Inc # @@ -17,234 +18,854 @@ set nettype [lindex $::argv 0] set build_dir [lindex $::argv 1] -open_project "$build_dir/lynx/lynx.xpr" +open_project "$build_dir/test_config_0/user_c0_0/test.xpr" update_compile_order -fileset sources_1 create_bd_design "accl_bd" update_compile_order -fileset sources_1 update_ip_catalog -create_bd_cell -type ip -vlnv Xilinx:ACCL:ccl_offload:1.0 ccl_offload_0 
-create_bd_cell -type ip -vlnv xilinx.com:ACCL:cyt_dma_adapter:1.0 cyt_dma_adapter_0 - -connect_bd_intf_net [get_bd_intf_pins ccl_offload_0/m_axis_dma0_mm2s_cmd] [get_bd_intf_pins cyt_dma_adapter_0/dma0_mm2s_cmd] -connect_bd_intf_net [get_bd_intf_pins ccl_offload_0/m_axis_dma1_mm2s_cmd] [get_bd_intf_pins cyt_dma_adapter_0/dma1_mm2s_cmd] -connect_bd_intf_net [get_bd_intf_pins ccl_offload_0/m_axis_dma1_s2mm_cmd] [get_bd_intf_pins cyt_dma_adapter_0/dma1_s2mm_cmd] -connect_bd_intf_net [get_bd_intf_pins ccl_offload_0/m_axis_dma0_s2mm_cmd] [get_bd_intf_pins cyt_dma_adapter_0/dma0_s2mm_cmd] -connect_bd_intf_net [get_bd_intf_pins cyt_dma_adapter_0/dma0_s2mm_sts] [get_bd_intf_pins ccl_offload_0/s_axis_dma0_s2mm_sts] -connect_bd_intf_net [get_bd_intf_pins cyt_dma_adapter_0/dma1_s2mm_sts] [get_bd_intf_pins ccl_offload_0/s_axis_dma1_s2mm_sts] -connect_bd_intf_net [get_bd_intf_pins cyt_dma_adapter_0/dma0_mm2s_sts] [get_bd_intf_pins ccl_offload_0/s_axis_dma0_mm2s_sts] -connect_bd_intf_net [get_bd_intf_pins cyt_dma_adapter_0/dma1_mm2s_sts] [get_bd_intf_pins ccl_offload_0/s_axis_dma1_mm2s_sts] -make_bd_pins_external [get_bd_pins ccl_offload_0/ap_clk] -make_bd_pins_external [get_bd_pins ccl_offload_0/ap_rst_n] -connect_bd_net [get_bd_ports ap_clk_0] [get_bd_pins cyt_dma_adapter_0/ap_clk] -connect_bd_net [get_bd_ports ap_rst_n_0] [get_bd_pins cyt_dma_adapter_0/ap_rst_n] -make_bd_intf_pins_external [get_bd_intf_pins cyt_dma_adapter_0/cyt_byp_wr_sts] -make_bd_intf_pins_external [get_bd_intf_pins cyt_dma_adapter_0/cyt_byp_rd_sts] -make_bd_intf_pins_external [get_bd_intf_pins cyt_dma_adapter_0/cyt_byp_wr_cmd] -make_bd_intf_pins_external [get_bd_intf_pins cyt_dma_adapter_0/cyt_byp_rd_cmd] - -create_bd_cell -type ip -vlnv xilinx.com:ACCL:reduce_ops:1.0 reduce_ops_0 -connect_bd_intf_net [get_bd_intf_pins ccl_offload_0/m_axis_arith_op0] [get_bd_intf_pins reduce_ops_0/in0] -connect_bd_intf_net [get_bd_intf_pins ccl_offload_0/m_axis_arith_op1] [get_bd_intf_pins reduce_ops_0/in1] 
-connect_bd_intf_net [get_bd_intf_pins reduce_ops_0/out_r] [get_bd_intf_pins ccl_offload_0/s_axis_arith_res] -connect_bd_net [get_bd_ports ap_clk_0] [get_bd_pins reduce_ops_0/ap_clk] -connect_bd_net [get_bd_ports ap_rst_n_0] [get_bd_pins reduce_ops_0/ap_rst_n] - -create_bd_cell -type ip -vlnv xilinx.com:ACCL:hostctrl:1.0 hostctrl_0 -connect_bd_intf_net [get_bd_intf_pins hostctrl_0/cmd] [get_bd_intf_pins ccl_offload_0/s_axis_call_req] -connect_bd_intf_net [get_bd_intf_pins ccl_offload_0/m_axis_call_ack] [get_bd_intf_pins hostctrl_0/sts] -connect_bd_net [get_bd_ports ap_clk_0] [get_bd_pins hostctrl_0/ap_clk] -connect_bd_net [get_bd_ports ap_rst_n_0] [get_bd_pins hostctrl_0/ap_rst_n] - -# direct loopback for compression and kernel streams -connect_bd_intf_net [get_bd_intf_pins ccl_offload_0/m_axis_krnl] [get_bd_intf_pins ccl_offload_0/s_axis_krnl] -connect_bd_intf_net [get_bd_intf_pins ccl_offload_0/m_axis_compression0] [get_bd_intf_pins ccl_offload_0/s_axis_compression0] -connect_bd_intf_net [get_bd_intf_pins ccl_offload_0/m_axis_compression1] [get_bd_intf_pins ccl_offload_0/s_axis_compression1] -connect_bd_intf_net [get_bd_intf_pins ccl_offload_0/m_axis_compression2] [get_bd_intf_pins ccl_offload_0/s_axis_compression2] - -# create axis switch -create_bd_cell -type ip -vlnv xilinx.com:ip:axis_switch:1.1 axis_switch_2_to_1_inst_0 -set_property -dict [list CONFIG.NUM_SI {2} CONFIG.TDATA_NUM_BYTES {64} CONFIG.HAS_TKEEP {1} CONFIG.HAS_TLAST {1} CONFIG.ARB_ON_TLAST {1} CONFIG.NUM_MI {1} CONFIG.DECODER_REG {0} CONFIG.ARB_ON_MAX_XFERS {0} CONFIG.Component_Name {axis_switch_2_to_1_inst_0}] [get_bd_cells axis_switch_2_to_1_inst_0] - -create_bd_cell -type ip -vlnv xilinx.com:ip:axis_switch:1.1 axis_switch_2_to_1_inst_1 -set_property -dict [list CONFIG.NUM_SI {2} CONFIG.TDATA_NUM_BYTES {64} CONFIG.HAS_TKEEP {1} CONFIG.HAS_TLAST {1} CONFIG.ARB_ON_TLAST {1} CONFIG.NUM_MI {1} CONFIG.DECODER_REG {0} CONFIG.ARB_ON_MAX_XFERS {0} CONFIG.Component_Name {axis_switch_2_to_1_inst_1}] 
[get_bd_cells axis_switch_2_to_1_inst_1] - -create_bd_cell -type ip -vlnv xilinx.com:ip:axis_switch:1.1 axis_switch_1_to_2_inst_0 -set_property -dict [list CONFIG.NUM_SI {1} CONFIG.NUM_MI {2} CONFIG.TDATA_NUM_BYTES {64} CONFIG.HAS_TKEEP {1} CONFIG.HAS_TLAST {1} CONFIG.TDEST_WIDTH {8} CONFIG.DECODER_REG {1} CONFIG.Component_Name {axis_switch_1_to_2_inst_0}] [get_bd_cells axis_switch_1_to_2_inst_0] - -create_bd_cell -type ip -vlnv xilinx.com:ip:axis_switch:1.1 axis_switch_1_to_2_inst_1 -set_property -dict [list CONFIG.NUM_SI {1} CONFIG.NUM_MI {2} CONFIG.TDATA_NUM_BYTES {64} CONFIG.HAS_TKEEP {1} CONFIG.HAS_TLAST {1} CONFIG.TDEST_WIDTH {8} CONFIG.DECODER_REG {1} CONFIG.Component_Name {axis_switch_1_to_2_inst_1}] [get_bd_cells axis_switch_1_to_2_inst_1] - -switch $nettype { - "TCP" { - # externalize TCP streams - make_bd_intf_pins_external [get_bd_intf_pins ccl_offload_0/s_axis_eth_rx_data] - make_bd_intf_pins_external [get_bd_intf_pins ccl_offload_0/m_axis_eth_tx_data] - make_bd_intf_pins_external [get_bd_intf_pins ccl_offload_0/s_axis_eth_tx_status] - make_bd_intf_pins_external [get_bd_intf_pins ccl_offload_0/m_axis_eth_read_pkg] - make_bd_intf_pins_external [get_bd_intf_pins ccl_offload_0/s_axis_eth_rx_meta] - make_bd_intf_pins_external [get_bd_intf_pins ccl_offload_0/s_axis_eth_notification] - make_bd_intf_pins_external [get_bd_intf_pins ccl_offload_0/m_axis_eth_tx_meta] - } - "RDMA" { - # externalize RDMA streams - # data streams - set m_axis_eth_tx_data [ create_bd_intf_port -mode Master -vlnv xilinx.com:interface:axis_rtl:1.0 m_axis_eth_tx_data ] - set_property -dict [ list CONFIG.FREQ_HZ {250000000} ] $m_axis_eth_tx_data - set s_axis_eth_rx_data [ create_bd_intf_port -mode Slave -vlnv xilinx.com:interface:axis_rtl:1.0 s_axis_eth_rx_data ] - set_property -dict [ list CONFIG.FREQ_HZ {250000000} CONFIG.HAS_TKEEP {1} CONFIG.HAS_TLAST {1} CONFIG.HAS_TREADY {1} CONFIG.HAS_TSTRB {1} CONFIG.LAYERED_METADATA {undef} CONFIG.TDATA_NUM_BYTES {64} CONFIG.TDEST_WIDTH {8} 
CONFIG.TID_WIDTH {0} CONFIG.TUSER_WIDTH {0} ] $s_axis_eth_rx_data - - # RDMA sq and rq - set m_axis_rdma_sq [ create_bd_intf_port -mode Master -vlnv xilinx.com:interface:axis_rtl:1.0 m_axis_rdma_sq ] - set_property -dict [ list CONFIG.FREQ_HZ {250000000} ] $m_axis_rdma_sq - - # RDMA extra pair of host/card streams - set m_axis_host_2 [ create_bd_intf_port -mode Master -vlnv xilinx.com:interface:axis_rtl:1.0 m_axis_host_2 ] - set_property -dict [ list CONFIG.FREQ_HZ {250000000} ] $m_axis_host_2 - set m_axis_card_2 [ create_bd_intf_port -mode Master -vlnv xilinx.com:interface:axis_rtl:1.0 m_axis_card_2 ] - set_property -dict [ list CONFIG.FREQ_HZ {250000000} ] $m_axis_card_2 - set s_axis_host_2 [ create_bd_intf_port -mode Slave -vlnv xilinx.com:interface:axis_rtl:1.0 s_axis_host_2 ] - set_property -dict [ list CONFIG.FREQ_HZ {250000000} CONFIG.HAS_TKEEP {1} CONFIG.HAS_TLAST {1} CONFIG.HAS_TREADY {1} CONFIG.HAS_TSTRB {0} CONFIG.LAYERED_METADATA {undef} CONFIG.TDATA_NUM_BYTES {64} CONFIG.TDEST_WIDTH {0} CONFIG.TID_WIDTH {0} CONFIG.TUSER_WIDTH {0} ] $s_axis_host_2 - set s_axis_card_2 [ create_bd_intf_port -mode Slave -vlnv xilinx.com:interface:axis_rtl:1.0 s_axis_card_2 ] - set_property -dict [ list CONFIG.FREQ_HZ {250000000} CONFIG.HAS_TKEEP {1} CONFIG.HAS_TLAST {1} CONFIG.HAS_TREADY {1} CONFIG.HAS_TSTRB {0} CONFIG.LAYERED_METADATA {undef} CONFIG.TDATA_NUM_BYTES {64} CONFIG.TDEST_WIDTH {0} CONFIG.TID_WIDTH {0} CONFIG.TUSER_WIDTH {0} ] $s_axis_card_2 - - # RDMA wr_req and rd_req - set s_axis_rdma_wr_req [ create_bd_intf_port -mode Slave -vlnv xilinx.com:interface:axis_rtl:1.0 s_axis_rdma_wr_req ] - set_property -dict [ list CONFIG.FREQ_HZ {250000000} CONFIG.HAS_TKEEP {0} CONFIG.HAS_TLAST {0} CONFIG.HAS_TREADY {1} CONFIG.HAS_TSTRB {0} CONFIG.LAYERED_METADATA {undef} CONFIG.TDATA_NUM_BYTES {12} CONFIG.TDEST_WIDTH {0} CONFIG.TID_WIDTH {0} CONFIG.TUSER_WIDTH {0} ] $s_axis_rdma_wr_req - set s_axis_rdma_rd_req [ create_bd_intf_port -mode Slave -vlnv 
xilinx.com:interface:axis_rtl:1.0 s_axis_rdma_rd_req ] - set_property -dict [ list CONFIG.FREQ_HZ {250000000} CONFIG.HAS_TKEEP {0} CONFIG.HAS_TLAST {0} CONFIG.HAS_TREADY {1} CONFIG.HAS_TSTRB {0} CONFIG.LAYERED_METADATA {undef} CONFIG.TDATA_NUM_BYTES {12} CONFIG.TDEST_WIDTH {0} CONFIG.TID_WIDTH {0} CONFIG.TUSER_WIDTH {0} ] $s_axis_rdma_rd_req - - # connections for rdma_arbiter and the axi 1-to-2 switch - create_bd_cell -type ip -vlnv xilinx.com:ACCL:cyt_rdma_arbiter:1.0 cyt_rdma_arbiter_0 - create_bd_cell -type ip -vlnv xilinx.com:ip:axis_switch:1.1 axis_switch_1_to_2_inst_2 - set_property -dict [list CONFIG.NUM_SI {1} CONFIG.NUM_MI {2} CONFIG.TDATA_NUM_BYTES {64} CONFIG.HAS_TKEEP {1} CONFIG.HAS_TLAST {1} CONFIG.TDEST_WIDTH {8} CONFIG.DECODER_REG {1} CONFIG.Component_Name {axis_switch_1_to_2_inst_2}] [get_bd_cells axis_switch_1_to_2_inst_2] - connect_bd_net [get_bd_ports ap_clk_0] [get_bd_pins cyt_rdma_arbiter_0/ap_clk] - connect_bd_net [get_bd_ports ap_rst_n_0] [get_bd_pins cyt_rdma_arbiter_0/ap_rst_n] - connect_bd_net [get_bd_ports ap_clk_0] [get_bd_pins axis_switch_1_to_2_inst_2/aclk] - connect_bd_net [get_bd_ports ap_rst_n_0] [get_bd_pins axis_switch_1_to_2_inst_2/aresetn] - - connect_bd_intf_net [get_bd_intf_ports s_axis_eth_rx_data] [get_bd_intf_pins cyt_rdma_arbiter_0/s_axis] - connect_bd_intf_net [get_bd_intf_ports s_axis_rdma_wr_req] [get_bd_intf_pins cyt_rdma_arbiter_0/s_meta] - connect_bd_intf_net [get_bd_intf_pins cyt_rdma_arbiter_0/m_meta_0] [get_bd_intf_pins ccl_offload_0/s_axis_eth_notification] - connect_bd_intf_net [get_bd_intf_pins cyt_rdma_arbiter_0/m_axis_0] [get_bd_intf_pins ccl_offload_0/s_axis_eth_rx_data] - connect_bd_intf_net [get_bd_intf_pins cyt_rdma_arbiter_0/m_meta_1] [get_bd_intf_pins cyt_dma_adapter_0/rdma_wr_req] - connect_bd_intf_net [get_bd_intf_pins cyt_rdma_arbiter_0/m_axis_1] [get_bd_intf_pins axis_switch_1_to_2_inst_2/S00_AXIS] - connect_bd_intf_net [get_bd_intf_pins axis_switch_1_to_2_inst_2/M00_AXIS] [get_bd_intf_ports 
m_axis_card_2] - connect_bd_intf_net [get_bd_intf_pins axis_switch_1_to_2_inst_2/M01_AXIS] [get_bd_intf_ports m_axis_host_2] - - # connections for rdma_mux and the axi 2-to-1 switch - create_bd_cell -type ip -vlnv xilinx.com:ACCL:cyt_rdma_mux:1.0 cyt_rdma_mux_0 - create_bd_cell -type ip -vlnv xilinx.com:ip:axis_switch:1.1 axis_switch_2_to_1_inst_2 - set_property -dict [list CONFIG.NUM_SI {2} CONFIG.TDATA_NUM_BYTES {64} CONFIG.HAS_TKEEP {1} CONFIG.HAS_TLAST {1} CONFIG.ARB_ON_TLAST {1} CONFIG.NUM_MI {1} CONFIG.DECODER_REG {0} CONFIG.ARB_ON_MAX_XFERS {0} CONFIG.Component_Name {axis_switch_2_to_1_inst_2}] [get_bd_cells axis_switch_2_to_1_inst_2] - connect_bd_net [get_bd_ports ap_clk_0] [get_bd_pins cyt_rdma_mux_0/ap_clk] - connect_bd_net [get_bd_ports ap_rst_n_0] [get_bd_pins cyt_rdma_mux_0/ap_rst_n] - connect_bd_net [get_bd_ports ap_clk_0] [get_bd_pins axis_switch_2_to_1_inst_2/aclk] - connect_bd_net [get_bd_ports ap_rst_n_0] [get_bd_pins axis_switch_2_to_1_inst_2/aresetn] - create_bd_cell -type ip -vlnv xilinx.com:ip:xlconstant:1.1 xlconstant_2 - set_property -dict [list CONFIG.CONST_WIDTH {2}] [get_bd_cells xlconstant_2] - set_property -dict [list CONFIG.CONST_VAL {0}] [get_bd_cells xlconstant_2] - connect_bd_net [get_bd_pins xlconstant_2/dout] [get_bd_pins axis_switch_2_to_1_inst_2/s_req_suppress] - - connect_bd_intf_net [get_bd_intf_pins cyt_rdma_mux_0/s_meta_0] [get_bd_intf_pins ccl_offload_0/m_axis_rdma_sq] - connect_bd_intf_net [get_bd_intf_pins cyt_rdma_mux_0/s_axis_0] [get_bd_intf_pins ccl_offload_0/m_axis_eth_tx_data] - connect_bd_intf_net [get_bd_intf_pins cyt_rdma_mux_0/s_meta_1] [get_bd_intf_ports s_axis_rdma_rd_req] - connect_bd_intf_net [get_bd_intf_pins cyt_rdma_mux_0/s_axis_1] [get_bd_intf_pins axis_switch_2_to_1_inst_2/M00_AXIS] - connect_bd_intf_net [get_bd_intf_pins cyt_rdma_mux_0/m_meta_0] [get_bd_intf_ports m_axis_rdma_sq] - connect_bd_intf_net [get_bd_intf_pins cyt_rdma_mux_0/m_meta_1] [get_bd_intf_pins cyt_dma_adapter_0/rdma_rd_req] - 
connect_bd_intf_net [get_bd_intf_pins cyt_rdma_mux_0/m_axis] [get_bd_intf_ports m_axis_eth_tx_data] - connect_bd_intf_net [get_bd_intf_ports s_axis_host_2] [get_bd_intf_pins axis_switch_2_to_1_inst_2/S00_AXIS] - connect_bd_intf_net [get_bd_intf_ports s_axis_card_2] [get_bd_intf_pins axis_switch_2_to_1_inst_2/S01_AXIS] - - - } - default { - puts "Unrecognized network backend" - exit - } -} - - -# externalize DMA data streams - -set m_axis_host_0 [ create_bd_intf_port -mode Master -vlnv xilinx.com:interface:axis_rtl:1.0 m_axis_host_0 ] -set_property -dict [ list CONFIG.FREQ_HZ {250000000} ] $m_axis_host_0 -set m_axis_host_1 [ create_bd_intf_port -mode Master -vlnv xilinx.com:interface:axis_rtl:1.0 m_axis_host_1 ] -set_property -dict [ list CONFIG.FREQ_HZ {250000000} ] $m_axis_host_1 -set m_axis_card_0 [ create_bd_intf_port -mode Master -vlnv xilinx.com:interface:axis_rtl:1.0 m_axis_card_0 ] -set_property -dict [ list CONFIG.FREQ_HZ {250000000} ] $m_axis_card_0 -set m_axis_card_1 [ create_bd_intf_port -mode Master -vlnv xilinx.com:interface:axis_rtl:1.0 m_axis_card_1 ] -set_property -dict [ list CONFIG.FREQ_HZ {250000000} ] $m_axis_card_1 - -set s_axis_host_0 [ create_bd_intf_port -mode Slave -vlnv xilinx.com:interface:axis_rtl:1.0 s_axis_host_0 ] -set_property -dict [ list CONFIG.FREQ_HZ {250000000} CONFIG.HAS_TKEEP {1} CONFIG.HAS_TLAST {1} CONFIG.HAS_TREADY {1} CONFIG.HAS_TSTRB {0} CONFIG.LAYERED_METADATA {undef} CONFIG.TDATA_NUM_BYTES {64} CONFIG.TDEST_WIDTH {0} CONFIG.TID_WIDTH {0} CONFIG.TUSER_WIDTH {0} ] $s_axis_host_0 -set s_axis_host_1 [ create_bd_intf_port -mode Slave -vlnv xilinx.com:interface:axis_rtl:1.0 s_axis_host_1 ] -set_property -dict [ list CONFIG.FREQ_HZ {250000000} CONFIG.HAS_TKEEP {1} CONFIG.HAS_TLAST {1} CONFIG.HAS_TREADY {1} CONFIG.HAS_TSTRB {0} CONFIG.LAYERED_METADATA {undef} CONFIG.TDATA_NUM_BYTES {64} CONFIG.TDEST_WIDTH {0} CONFIG.TID_WIDTH {0} CONFIG.TUSER_WIDTH {0} ] $s_axis_host_1 -set s_axis_card_0 [ create_bd_intf_port -mode Slave -vlnv 
xilinx.com:interface:axis_rtl:1.0 s_axis_card_0 ] -set_property -dict [ list CONFIG.FREQ_HZ {250000000} CONFIG.HAS_TKEEP {1} CONFIG.HAS_TLAST {1} CONFIG.HAS_TREADY {1} CONFIG.HAS_TSTRB {0} CONFIG.LAYERED_METADATA {undef} CONFIG.TDATA_NUM_BYTES {64} CONFIG.TDEST_WIDTH {0} CONFIG.TID_WIDTH {0} CONFIG.TUSER_WIDTH {0} ] $s_axis_card_0 -set s_axis_card_1 [ create_bd_intf_port -mode Slave -vlnv xilinx.com:interface:axis_rtl:1.0 s_axis_card_1 ] -set_property -dict [ list CONFIG.FREQ_HZ {250000000} CONFIG.HAS_TKEEP {1} CONFIG.HAS_TLAST {1} CONFIG.HAS_TREADY {1} CONFIG.HAS_TSTRB {0} CONFIG.LAYERED_METADATA {undef} CONFIG.TDATA_NUM_BYTES {64} CONFIG.TDEST_WIDTH {0} CONFIG.TID_WIDTH {0} CONFIG.TUSER_WIDTH {0} ] $s_axis_card_1 - - -# s_axis_host_0 and s_axis_card_0 multiplexed to single s_axis_dma0_mm2s stream, round-robin by tlast -connect_bd_intf_net [get_bd_intf_ports s_axis_host_0] [get_bd_intf_pins axis_switch_2_to_1_inst_0/S00_AXIS] -connect_bd_intf_net [get_bd_intf_ports s_axis_card_0] [get_bd_intf_pins axis_switch_2_to_1_inst_0/S01_AXIS] -connect_bd_intf_net [get_bd_intf_pins axis_switch_2_to_1_inst_0/M00_AXIS] [get_bd_intf_pins ccl_offload_0/s_axis_dma0_mm2s] -connect_bd_net [get_bd_ports ap_clk_0] [get_bd_pins axis_switch_2_to_1_inst_0/aclk] -connect_bd_net [get_bd_ports ap_rst_n_0] [get_bd_pins axis_switch_2_to_1_inst_0/aresetn] - -create_bd_cell -type ip -vlnv xilinx.com:ip:xlconstant:1.1 xlconstant_0 -set_property -dict [list CONFIG.CONST_WIDTH {2}] [get_bd_cells xlconstant_0] -set_property -dict [list CONFIG.CONST_VAL {0}] [get_bd_cells xlconstant_0] -connect_bd_net [get_bd_pins xlconstant_0/dout] [get_bd_pins axis_switch_2_to_1_inst_0/s_req_suppress] - -# s_axis_host_1 and s_axis_card_1 multiplexed to single s_axis_dma1_mm2s stream, round-robin by tlast -connect_bd_intf_net [get_bd_intf_ports s_axis_host_1] [get_bd_intf_pins axis_switch_2_to_1_inst_1/S00_AXIS] -connect_bd_intf_net [get_bd_intf_ports s_axis_card_1] [get_bd_intf_pins 
axis_switch_2_to_1_inst_1/S01_AXIS] -connect_bd_intf_net [get_bd_intf_pins axis_switch_2_to_1_inst_1/M00_AXIS] [get_bd_intf_pins ccl_offload_0/s_axis_dma1_mm2s] -connect_bd_net [get_bd_ports ap_clk_0] [get_bd_pins axis_switch_2_to_1_inst_1/aclk] -connect_bd_net [get_bd_ports ap_rst_n_0] [get_bd_pins axis_switch_2_to_1_inst_1/aresetn] - -create_bd_cell -type ip -vlnv xilinx.com:ip:xlconstant:1.1 xlconstant_1 -set_property -dict [list CONFIG.CONST_WIDTH {2}] [get_bd_cells xlconstant_1] -set_property -dict [list CONFIG.CONST_VAL {0}] [get_bd_cells xlconstant_1] -connect_bd_net [get_bd_pins xlconstant_1/dout] [get_bd_pins axis_switch_2_to_1_inst_1/s_req_suppress] - -# m_axis_dma0_s2mm multiplex to m_axis_host_0 and m_axis_card_0 according to the strm flag encoded in m_axis_dma0_s2mm tdest -connect_bd_intf_net [get_bd_intf_pins ccl_offload_0/m_axis_dma0_s2mm] [get_bd_intf_pins axis_switch_1_to_2_inst_0/S00_AXIS] -connect_bd_intf_net [get_bd_intf_ports m_axis_card_0] [get_bd_intf_pins axis_switch_1_to_2_inst_0/M00_AXIS] -connect_bd_intf_net [get_bd_intf_ports m_axis_host_0] [get_bd_intf_pins axis_switch_1_to_2_inst_0/M01_AXIS] -connect_bd_net [get_bd_ports ap_clk_0] [get_bd_pins axis_switch_1_to_2_inst_0/aclk] -connect_bd_net [get_bd_ports ap_rst_n_0] [get_bd_pins axis_switch_1_to_2_inst_0/aresetn] - -# m_axis_dma1_s2mm multiplex to m_axis_host_1 and m_axis_card_1 according to the strm flag encoded in m_axis_dma1_s2mm tdest -connect_bd_intf_net [get_bd_intf_pins ccl_offload_0/m_axis_dma1_s2mm] [get_bd_intf_pins axis_switch_1_to_2_inst_1/S00_AXIS] -connect_bd_intf_net [get_bd_intf_ports m_axis_card_1] [get_bd_intf_pins axis_switch_1_to_2_inst_1/M00_AXIS] -connect_bd_intf_net [get_bd_intf_ports m_axis_host_1] [get_bd_intf_pins axis_switch_1_to_2_inst_1/M01_AXIS] -connect_bd_net [get_bd_ports ap_clk_0] [get_bd_pins axis_switch_1_to_2_inst_1/aclk] -connect_bd_net [get_bd_ports ap_rst_n_0] [get_bd_pins axis_switch_1_to_2_inst_1/aresetn] - - - -# connect up AXI lite 
-create_bd_cell -type ip -vlnv xilinx.com:ip:smartconnect:1.0 smartconnect_0 -set_property -dict [list CONFIG.NUM_MI {2} CONFIG.NUM_SI {1}] [get_bd_cells smartconnect_0] -connect_bd_net [get_bd_ports ap_clk_0] [get_bd_pins smartconnect_0/aclk] -connect_bd_net [get_bd_ports ap_rst_n_0] [get_bd_pins smartconnect_0/aresetn] -connect_bd_intf_net [get_bd_intf_pins hostctrl_0/s_axi_control] [get_bd_intf_pins smartconnect_0/M00_AXI] -connect_bd_intf_net [get_bd_intf_pins ccl_offload_0/s_axi_control] [get_bd_intf_pins smartconnect_0/M01_AXI] -make_bd_intf_pins_external [get_bd_intf_pins smartconnect_0/S00_AXI] -set_property -dict [list CONFIG.ADDR_WIDTH {16}] [get_bd_intf_ports S00_AXI_0] - -# Create address segments -assign_bd_address -offset 0x00000000 -range 0x00002000 -target_address_space [get_bd_addr_spaces S00_AXI_0] [get_bd_addr_segs ccl_offload_0/s_axi_control/reg0] -force -assign_bd_address -offset 0x00002000 -range 0x00002000 -target_address_space [get_bd_addr_spaces S00_AXI_0] [get_bd_addr_segs hostctrl_0/s_axi_control/Reg] -force - -set_property CONFIG.PROTOCOL AXI4LITE [get_bd_intf_ports /S00_AXI_0] -set_property -dict [list CONFIG.HAS_BURST {0} CONFIG.HAS_CACHE {0} CONFIG.HAS_LOCK {0} CONFIG.HAS_QOS {0} CONFIG.HAS_REGION {0}] [get_bd_intf_ports S00_AXI_0] + + + # Create interface ports + set S00_AXI_0 [ create_bd_intf_port -mode Slave -vlnv xilinx.com:interface:aximm_rtl:1.0 S00_AXI_0 ] + set_property -dict [ list \ + CONFIG.ADDR_WIDTH {14} \ + CONFIG.ARUSER_WIDTH {0} \ + CONFIG.AWUSER_WIDTH {0} \ + CONFIG.BUSER_WIDTH {0} \ + CONFIG.DATA_WIDTH {32} \ + CONFIG.FREQ_HZ {250000000} \ + CONFIG.HAS_BRESP {1} \ + CONFIG.HAS_BURST {0} \ + CONFIG.HAS_CACHE {0} \ + CONFIG.HAS_LOCK {0} \ + CONFIG.HAS_PROT {1} \ + CONFIG.HAS_QOS {0} \ + CONFIG.HAS_REGION {0} \ + CONFIG.HAS_RRESP {1} \ + CONFIG.HAS_WSTRB {1} \ + CONFIG.ID_WIDTH {0} \ + CONFIG.MAX_BURST_LENGTH {1} \ + CONFIG.NUM_READ_OUTSTANDING {1} \ + CONFIG.NUM_READ_THREADS {1} \ + CONFIG.NUM_WRITE_OUTSTANDING {1} \ + 
CONFIG.NUM_WRITE_THREADS {1} \ + CONFIG.PROTOCOL {AXI4LITE} \ + CONFIG.READ_WRITE_MODE {READ_WRITE} \ + CONFIG.RUSER_BITS_PER_BYTE {0} \ + CONFIG.RUSER_WIDTH {0} \ + CONFIG.SUPPORTS_NARROW_BURST {0} \ + CONFIG.WUSER_BITS_PER_BYTE {0} \ + CONFIG.WUSER_WIDTH {0} \ + ] $S00_AXI_0 + + set cyt_cq_rd_sts_0 [ create_bd_intf_port -mode Slave -vlnv xilinx.com:interface:axis_rtl:1.0 cyt_cq_rd_sts_0 ] + set_property -dict [ list \ + CONFIG.FREQ_HZ {250000000} \ + CONFIG.HAS_TKEEP {0} \ + CONFIG.HAS_TLAST {0} \ + CONFIG.HAS_TREADY {1} \ + CONFIG.HAS_TSTRB {0} \ + CONFIG.LAYERED_METADATA {undef} \ + CONFIG.TDATA_NUM_BYTES {4} \ + CONFIG.TDEST_WIDTH {0} \ + CONFIG.TID_WIDTH {0} \ + CONFIG.TUSER_WIDTH {0} \ + ] $cyt_cq_rd_sts_0 + + set cyt_cq_wr_sts_0 [ create_bd_intf_port -mode Slave -vlnv xilinx.com:interface:axis_rtl:1.0 cyt_cq_wr_sts_0 ] + set_property -dict [ list \ + CONFIG.FREQ_HZ {250000000} \ + CONFIG.HAS_TKEEP {0} \ + CONFIG.HAS_TLAST {0} \ + CONFIG.HAS_TREADY {1} \ + CONFIG.HAS_TSTRB {0} \ + CONFIG.LAYERED_METADATA {undef} \ + CONFIG.TDATA_NUM_BYTES {4} \ + CONFIG.TDEST_WIDTH {0} \ + CONFIG.TID_WIDTH {0} \ + CONFIG.TUSER_WIDTH {0} \ + ] $cyt_cq_wr_sts_0 + + set cyt_rq_rd [ create_bd_intf_port -mode Slave -vlnv xilinx.com:interface:axis_rtl:1.0 cyt_rq_rd ] + set_property -dict [ list \ + CONFIG.FREQ_HZ {250000000} \ + CONFIG.HAS_TKEEP {0} \ + CONFIG.HAS_TLAST {0} \ + CONFIG.HAS_TREADY {1} \ + CONFIG.HAS_TSTRB {0} \ + CONFIG.LAYERED_METADATA {undef} \ + CONFIG.TDATA_NUM_BYTES {16} \ + CONFIG.TDEST_WIDTH {0} \ + CONFIG.TID_WIDTH {0} \ + CONFIG.TUSER_WIDTH {0} \ + ] $cyt_rq_rd + + set cyt_rq_wr [ create_bd_intf_port -mode Slave -vlnv xilinx.com:interface:axis_rtl:1.0 cyt_rq_wr ] + set_property -dict [ list \ + CONFIG.FREQ_HZ {250000000} \ + CONFIG.HAS_TKEEP {0} \ + CONFIG.HAS_TLAST {0} \ + CONFIG.HAS_TREADY {1} \ + CONFIG.HAS_TSTRB {0} \ + CONFIG.LAYERED_METADATA {undef} \ + CONFIG.TDATA_NUM_BYTES {16} \ + CONFIG.TDEST_WIDTH {0} \ + CONFIG.TID_WIDTH {0} \ + 
CONFIG.TUSER_WIDTH {0} \ + ] $cyt_rq_wr + + set cyt_rreq_recv_0 [ create_bd_intf_port -mode Slave -vlnv xilinx.com:interface:axis_rtl:1.0 cyt_rreq_recv_0 ] + set_property -dict [ list \ + CONFIG.FREQ_HZ {250000000} \ + CONFIG.HAS_TKEEP {1} \ + CONFIG.HAS_TLAST {1} \ + CONFIG.HAS_TREADY {0} \ + CONFIG.HAS_TSTRB {0} \ + CONFIG.LAYERED_METADATA {undef} \ + CONFIG.TDATA_NUM_BYTES {64} \ + CONFIG.TDEST_WIDTH {0} \ + CONFIG.TID_WIDTH {0} \ + CONFIG.TUSER_WIDTH {0} \ + ] $cyt_rreq_recv_0 + + set cyt_rreq_recv_1 [ create_bd_intf_port -mode Slave -vlnv xilinx.com:interface:axis_rtl:1.0 cyt_rreq_recv_1 ] + set_property -dict [ list \ + CONFIG.FREQ_HZ {250000000} \ + CONFIG.HAS_TKEEP {1} \ + CONFIG.HAS_TLAST {1} \ + CONFIG.HAS_TREADY {1} \ + CONFIG.HAS_TSTRB {0} \ + CONFIG.LAYERED_METADATA {undef} \ + CONFIG.TDATA_NUM_BYTES {64} \ + CONFIG.TDEST_WIDTH {0} \ + CONFIG.TID_WIDTH {0} \ + CONFIG.TUSER_WIDTH {0} \ + ] $cyt_rreq_recv_1 + + set cyt_rreq_send_0 [ create_bd_intf_port -mode Master -vlnv xilinx.com:interface:axis_rtl:1.0 cyt_rreq_send_0 ] + set_property -dict [ list \ + CONFIG.FREQ_HZ {250000000} \ + ] $cyt_rreq_send_0 + + set cyt_rreq_send_1 [ create_bd_intf_port -mode Master -vlnv xilinx.com:interface:axis_rtl:1.0 cyt_rreq_send_1 ] + set_property -dict [ list \ + CONFIG.FREQ_HZ {250000000} \ + ] $cyt_rreq_send_1 + + set cyt_rrsp_recv_0 [ create_bd_intf_port -mode Slave -vlnv xilinx.com:interface:axis_rtl:1.0 cyt_rrsp_recv_0 ] + set_property -dict [ list \ + CONFIG.FREQ_HZ {250000000} \ + CONFIG.HAS_TKEEP {1} \ + CONFIG.HAS_TLAST {1} \ + CONFIG.HAS_TREADY {1} \ + CONFIG.HAS_TSTRB {0} \ + CONFIG.LAYERED_METADATA {undef} \ + CONFIG.TDATA_NUM_BYTES {64} \ + CONFIG.TDEST_WIDTH {0} \ + CONFIG.TID_WIDTH {0} \ + CONFIG.TUSER_WIDTH {0} \ + ] $cyt_rrsp_recv_0 + + set cyt_rrsp_recv_1 [ create_bd_intf_port -mode Slave -vlnv xilinx.com:interface:axis_rtl:1.0 cyt_rrsp_recv_1 ] + set_property -dict [ list \ + CONFIG.FREQ_HZ {250000000} \ + CONFIG.HAS_TKEEP {1} \ + CONFIG.HAS_TLAST 
{1} \ + CONFIG.HAS_TREADY {1} \ + CONFIG.HAS_TSTRB {0} \ + CONFIG.LAYERED_METADATA {undef} \ + CONFIG.TDATA_NUM_BYTES {64} \ + CONFIG.TDEST_WIDTH {0} \ + CONFIG.TID_WIDTH {0} \ + CONFIG.TUSER_WIDTH {0} \ + ] $cyt_rrsp_recv_1 + + set cyt_sq_rd_cmd [ create_bd_intf_port -mode Master -vlnv xilinx.com:interface:axis_rtl:1.0 cyt_sq_rd_cmd ] + set_property -dict [ list \ + CONFIG.FREQ_HZ {250000000} \ + ] $cyt_sq_rd_cmd + + set cyt_sq_wr_cmd [ create_bd_intf_port -mode Master -vlnv xilinx.com:interface:axis_rtl:1.0 cyt_sq_wr_cmd ] + set_property -dict [ list \ + CONFIG.FREQ_HZ {250000000} \ + ] $cyt_sq_wr_cmd + + set m_axis_card_0 [ create_bd_intf_port -mode Master -vlnv xilinx.com:interface:axis_rtl:1.0 m_axis_card_0 ] + set_property -dict [ list \ + CONFIG.FREQ_HZ {250000000} \ + ] $m_axis_card_0 + + set m_axis_card_1 [ create_bd_intf_port -mode Master -vlnv xilinx.com:interface:axis_rtl:1.0 m_axis_card_1 ] + set_property -dict [ list \ + CONFIG.FREQ_HZ {250000000} \ + ] $m_axis_card_1 + + set m_axis_card_2 [ create_bd_intf_port -mode Master -vlnv xilinx.com:interface:axis_rtl:1.0 m_axis_card_2 ] + set_property -dict [ list \ + CONFIG.FREQ_HZ {250000000} \ + ] $m_axis_card_2 + + set m_axis_host_0 [ create_bd_intf_port -mode Master -vlnv xilinx.com:interface:axis_rtl:1.0 m_axis_host_0 ] + set_property -dict [ list \ + CONFIG.FREQ_HZ {250000000} \ + ] $m_axis_host_0 + + set m_axis_host_1 [ create_bd_intf_port -mode Master -vlnv xilinx.com:interface:axis_rtl:1.0 m_axis_host_1 ] + set_property -dict [ list \ + CONFIG.FREQ_HZ {250000000} \ + ] $m_axis_host_1 + + set m_axis_host_2 [ create_bd_intf_port -mode Master -vlnv xilinx.com:interface:axis_rtl:1.0 m_axis_host_2 ] + set_property -dict [ list \ + CONFIG.FREQ_HZ {250000000} \ + ] $m_axis_host_2 + + set s_axis_card_0 [ create_bd_intf_port -mode Slave -vlnv xilinx.com:interface:axis_rtl:1.0 s_axis_card_0 ] + set_property -dict [ list \ + CONFIG.FREQ_HZ {250000000} \ + CONFIG.HAS_TKEEP {1} \ + CONFIG.HAS_TLAST {1} \ + 
CONFIG.HAS_TREADY {1} \ + CONFIG.HAS_TSTRB {0} \ + CONFIG.LAYERED_METADATA {undef} \ + CONFIG.TDATA_NUM_BYTES {64} \ + CONFIG.TDEST_WIDTH {0} \ + CONFIG.TID_WIDTH {0} \ + CONFIG.TUSER_WIDTH {0} \ + ] $s_axis_card_0 + + set s_axis_card_1 [ create_bd_intf_port -mode Slave -vlnv xilinx.com:interface:axis_rtl:1.0 s_axis_card_1 ] + set_property -dict [ list \ + CONFIG.FREQ_HZ {250000000} \ + CONFIG.HAS_TKEEP {1} \ + CONFIG.HAS_TLAST {1} \ + CONFIG.HAS_TREADY {1} \ + CONFIG.HAS_TSTRB {0} \ + CONFIG.LAYERED_METADATA {undef} \ + CONFIG.TDATA_NUM_BYTES {64} \ + CONFIG.TDEST_WIDTH {0} \ + CONFIG.TID_WIDTH {0} \ + CONFIG.TUSER_WIDTH {0} \ + ] $s_axis_card_1 + + set s_axis_host_0 [ create_bd_intf_port -mode Slave -vlnv xilinx.com:interface:axis_rtl:1.0 s_axis_host_0 ] + set_property -dict [ list \ + CONFIG.FREQ_HZ {250000000} \ + CONFIG.HAS_TKEEP {1} \ + CONFIG.HAS_TLAST {1} \ + CONFIG.HAS_TREADY {1} \ + CONFIG.HAS_TSTRB {0} \ + CONFIG.LAYERED_METADATA {undef} \ + CONFIG.TDATA_NUM_BYTES {64} \ + CONFIG.TDEST_WIDTH {0} \ + CONFIG.TID_WIDTH {0} \ + CONFIG.TUSER_WIDTH {0} \ + ] $s_axis_host_0 + + set s_axis_host_1 [ create_bd_intf_port -mode Slave -vlnv xilinx.com:interface:axis_rtl:1.0 s_axis_host_1 ] + set_property -dict [ list \ + CONFIG.FREQ_HZ {250000000} \ + CONFIG.HAS_TKEEP {1} \ + CONFIG.HAS_TLAST {1} \ + CONFIG.HAS_TREADY {1} \ + CONFIG.HAS_TSTRB {0} \ + CONFIG.LAYERED_METADATA {undef} \ + CONFIG.TDATA_NUM_BYTES {64} \ + CONFIG.TDEST_WIDTH {0} \ + CONFIG.TID_WIDTH {0} \ + CONFIG.TUSER_WIDTH {0} \ + ] $s_axis_host_1 + + + # Create ports + set ap_clk_0 [ create_bd_port -dir I -type clk -freq_hz 250000000 ap_clk_0 ] + set_property -dict [ list \ + CONFIG.ASSOCIATED_BUSIF 
{cyt_cq_wr_sts_0:cyt_cq_rd_sts_0:cyt_sq_wr_cmd:cyt_sq_rd_cmd:m_axis_host_2:m_axis_card_2:cyt_rq_wr:m_axis_host_0:m_axis_host_1:m_axis_card_0:m_axis_card_1:s_axis_host_0:s_axis_host_1:s_axis_card_0:s_axis_card_1:S00_AXI_0:cyt_rreq_send_0:cyt_rreq_send_1:cyt_rrsp_recv_0:cyt_rrsp_recv_1:cyt_rq_rd:cyt_rreq_recv_0:cyt_rreq_recv_1} \ + ] $ap_clk_0 + set ap_rst_n_0 [ create_bd_port -dir I -type rst ap_rst_n_0 ] + + # Create instance: axis_data_fifo_0, and set properties + set axis_data_fifo_0 [ create_bd_cell -type ip -vlnv xilinx.com:ip:axis_data_fifo:2.0 axis_data_fifo_0 ] + set_property -dict [ list \ + CONFIG.FIFO_DEPTH {16} \ + ] $axis_data_fifo_0 + + # Create instance: axis_data_fifo_1, and set properties + set axis_data_fifo_1 [ create_bd_cell -type ip -vlnv xilinx.com:ip:axis_data_fifo:2.0 axis_data_fifo_1 ] + set_property -dict [ list \ + CONFIG.FIFO_DEPTH {16} \ + ] $axis_data_fifo_1 + + # Create instance: axis_data_fifo_2, and set properties + set axis_data_fifo_2 [ create_bd_cell -type ip -vlnv xilinx.com:ip:axis_data_fifo:2.0 axis_data_fifo_2 ] + set_property -dict [ list \ + CONFIG.FIFO_DEPTH {16} \ + ] $axis_data_fifo_2 + + # Create instance: axis_data_fifo_3, and set properties + set axis_data_fifo_3 [ create_bd_cell -type ip -vlnv xilinx.com:ip:axis_data_fifo:2.0 axis_data_fifo_3 ] + set_property -dict [ list \ + CONFIG.FIFO_DEPTH {16} \ + ] $axis_data_fifo_3 + + # Create instance: axis_data_fifo_4, and set properties + set axis_data_fifo_4 [ create_bd_cell -type ip -vlnv xilinx.com:ip:axis_data_fifo:2.0 axis_data_fifo_4 ] + set_property -dict [ list \ + CONFIG.FIFO_DEPTH {16} \ + ] $axis_data_fifo_4 + + # Create instance: axis_data_fifo_5, and set properties + set axis_data_fifo_5 [ create_bd_cell -type ip -vlnv xilinx.com:ip:axis_data_fifo:2.0 axis_data_fifo_5 ] + set_property -dict [ list \ + CONFIG.FIFO_DEPTH {16} \ + ] $axis_data_fifo_5 + + # Create instance: axis_data_fifo_6, and set properties + set axis_data_fifo_6 [ create_bd_cell -type ip -vlnv 
xilinx.com:ip:axis_data_fifo:2.0 axis_data_fifo_6 ] + set_property -dict [ list \ + CONFIG.FIFO_DEPTH {16} \ + ] $axis_data_fifo_6 + + # Create instance: axis_data_fifo_7, and set properties + set axis_data_fifo_7 [ create_bd_cell -type ip -vlnv xilinx.com:ip:axis_data_fifo:2.0 axis_data_fifo_7 ] + set_property -dict [ list \ + CONFIG.FIFO_DEPTH {16} \ + ] $axis_data_fifo_7 + + + # Create instance: axis_register_slice_0, and set properties + set axis_register_slice_0 [ create_bd_cell -type ip -vlnv xilinx.com:ip:axis_register_slice:1.1 axis_register_slice_0 ] + set_property -dict [ list \ + CONFIG.NUM_SLR_CROSSINGS {0} \ + CONFIG.REG_CONFIG {16} \ + ] $axis_register_slice_0 + + # Create instance: axis_register_slice_1, and set properties + set axis_register_slice_1 [ create_bd_cell -type ip -vlnv xilinx.com:ip:axis_register_slice:1.1 axis_register_slice_1 ] + set_property -dict [ list \ + CONFIG.NUM_SLR_CROSSINGS {0} \ + CONFIG.REG_CONFIG {16} \ + ] $axis_register_slice_1 + + # Create instance: axis_register_slice_2, and set properties + set axis_register_slice_2 [ create_bd_cell -type ip -vlnv xilinx.com:ip:axis_register_slice:1.1 axis_register_slice_2 ] + set_property -dict [ list \ + CONFIG.NUM_SLR_CROSSINGS {0} \ + CONFIG.REG_CONFIG {16} \ + ] $axis_register_slice_2 + + # Create instance: axis_register_slice_3, and set properties + set axis_register_slice_3 [ create_bd_cell -type ip -vlnv xilinx.com:ip:axis_register_slice:1.1 axis_register_slice_3 ] + set_property -dict [ list \ + CONFIG.NUM_SLR_CROSSINGS {0} \ + CONFIG.REG_CONFIG {16} \ + ] $axis_register_slice_3 + + # Create instance: axis_register_slice_4, and set properties + set axis_register_slice_4 [ create_bd_cell -type ip -vlnv xilinx.com:ip:axis_register_slice:1.1 axis_register_slice_4 ] + set_property -dict [ list \ + CONFIG.NUM_SLR_CROSSINGS {0} \ + CONFIG.REG_CONFIG {16} \ + ] $axis_register_slice_4 + + # Create instance: axis_register_slice_5, and set properties + set axis_register_slice_5 [ 
create_bd_cell -type ip -vlnv xilinx.com:ip:axis_register_slice:1.1 axis_register_slice_5 ] + set_property -dict [ list \ + CONFIG.NUM_SLR_CROSSINGS {0} \ + CONFIG.REG_CONFIG {16} \ + ] $axis_register_slice_5 + + # Create instance: axis_register_slice_6, and set properties + set axis_register_slice_6 [ create_bd_cell -type ip -vlnv xilinx.com:ip:axis_register_slice:1.1 axis_register_slice_6 ] + set_property -dict [ list \ + CONFIG.NUM_SLR_CROSSINGS {0} \ + CONFIG.REG_CONFIG {16} \ + ] $axis_register_slice_6 + + # Create instance: axis_register_slice_7, and set properties + set axis_register_slice_7 [ create_bd_cell -type ip -vlnv xilinx.com:ip:axis_register_slice:1.1 axis_register_slice_7 ] + set_property -dict [ list \ + CONFIG.NUM_SLR_CROSSINGS {0} \ + CONFIG.REG_CONFIG {16} \ + ] $axis_register_slice_7 + + # Create instance: axis_register_slice_8, and set properties + set axis_register_slice_8 [ create_bd_cell -type ip -vlnv xilinx.com:ip:axis_register_slice:1.1 axis_register_slice_8 ] + set_property -dict [ list \ + CONFIG.NUM_SLR_CROSSINGS {0} \ + CONFIG.REG_CONFIG {16} \ + ] $axis_register_slice_8 + + # Create instance: axis_register_slice_9, and set properties + set axis_register_slice_9 [ create_bd_cell -type ip -vlnv xilinx.com:ip:axis_register_slice:1.1 axis_register_slice_9 ] + set_property -dict [ list \ + CONFIG.NUM_SLR_CROSSINGS {0} \ + CONFIG.REG_CONFIG {16} \ + ] $axis_register_slice_9 + + # Create instance: axis_register_slice_10, and set properties + set axis_register_slice_10 [ create_bd_cell -type ip -vlnv xilinx.com:ip:axis_register_slice:1.1 axis_register_slice_10 ] + set_property -dict [ list \ + CONFIG.NUM_SLR_CROSSINGS {0} \ + CONFIG.REG_CONFIG {16} \ + ] $axis_register_slice_10 + + # Create instance: axis_register_slice_11, and set properties + set axis_register_slice_11 [ create_bd_cell -type ip -vlnv xilinx.com:ip:axis_register_slice:1.1 axis_register_slice_11 ] + set_property -dict [ list \ + CONFIG.NUM_SLR_CROSSINGS {0} \ + 
CONFIG.REG_CONFIG {16} \ + ] $axis_register_slice_11 + + # Create instance: axis_register_slice_12, and set properties + set axis_register_slice_12 [ create_bd_cell -type ip -vlnv xilinx.com:ip:axis_register_slice:1.1 axis_register_slice_12 ] + set_property -dict [ list \ + CONFIG.NUM_SLR_CROSSINGS {0} \ + CONFIG.REG_CONFIG {16} \ + ] $axis_register_slice_12 + + # Create instance: axis_register_slice_13, and set properties + set axis_register_slice_13 [ create_bd_cell -type ip -vlnv xilinx.com:ip:axis_register_slice:1.1 axis_register_slice_13 ] + set_property -dict [ list \ + CONFIG.NUM_SLR_CROSSINGS {0} \ + CONFIG.REG_CONFIG {16} \ + ] $axis_register_slice_13 + + # Create instance: axis_register_slice_14, and set properties + set axis_register_slice_14 [ create_bd_cell -type ip -vlnv xilinx.com:ip:axis_register_slice:1.1 axis_register_slice_14 ] + set_property -dict [ list \ + CONFIG.NUM_SLR_CROSSINGS {0} \ + CONFIG.REG_CONFIG {16} \ + ] $axis_register_slice_14 + + # Create instance: axis_switch_1_to_2_inst_0, and set properties + set axis_switch_1_to_2_inst_0 [ create_bd_cell -type ip -vlnv xilinx.com:ip:axis_switch:1.1 axis_switch_1_to_2_inst_0 ] + set_property -dict [ list \ + CONFIG.DECODER_REG {1} \ + CONFIG.HAS_TKEEP {1} \ + CONFIG.HAS_TLAST {1} \ + CONFIG.NUM_MI {2} \ + CONFIG.NUM_SI {1} \ + CONFIG.TDATA_NUM_BYTES {64} \ + CONFIG.TDEST_WIDTH {8} \ + ] $axis_switch_1_to_2_inst_0 + + # Create instance: axis_switch_1_to_2_inst_1, and set properties + set axis_switch_1_to_2_inst_1 [ create_bd_cell -type ip -vlnv xilinx.com:ip:axis_switch:1.1 axis_switch_1_to_2_inst_1 ] + set_property -dict [ list \ + CONFIG.DECODER_REG {1} \ + CONFIG.HAS_TKEEP {1} \ + CONFIG.HAS_TLAST {1} \ + CONFIG.NUM_MI {2} \ + CONFIG.NUM_SI {1} \ + CONFIG.TDATA_NUM_BYTES {64} \ + CONFIG.TDEST_WIDTH {8} \ + ] $axis_switch_1_to_2_inst_1 + + # Create instance: axis_switch_1_to_2_inst_2, and set properties + set axis_switch_1_to_2_inst_2 [ create_bd_cell -type ip -vlnv 
xilinx.com:ip:axis_switch:1.1 axis_switch_1_to_2_inst_2 ] + set_property -dict [ list \ + CONFIG.DECODER_REG {1} \ + CONFIG.HAS_TKEEP {1} \ + CONFIG.HAS_TLAST {1} \ + CONFIG.NUM_MI {2} \ + CONFIG.NUM_SI {1} \ + CONFIG.TDATA_NUM_BYTES {64} \ + CONFIG.TDEST_WIDTH {8} \ + ] $axis_switch_1_to_2_inst_2 + + # Create instance: axis_switch_1_to_2_inst_3, and set properties + set axis_switch_1_to_2_inst_3 [ create_bd_cell -type ip -vlnv xilinx.com:ip:axis_switch:1.1 axis_switch_1_to_2_inst_3 ] + set_property -dict [ list \ + CONFIG.DECODER_REG {1} \ + CONFIG.HAS_TKEEP {1} \ + CONFIG.HAS_TLAST {1} \ + CONFIG.NUM_MI {2} \ + CONFIG.NUM_SI {1} \ + CONFIG.TDATA_NUM_BYTES {64} \ + CONFIG.TDEST_WIDTH {8} \ + ] $axis_switch_1_to_2_inst_3 + + # Create instance: axis_switch_2_to_1_inst_0, and set properties + set axis_switch_2_to_1_inst_0 [ create_bd_cell -type ip -vlnv xilinx.com:ip:axis_switch:1.1 axis_switch_2_to_1_inst_0 ] + set_property -dict [ list \ + CONFIG.ARB_ON_MAX_XFERS {0} \ + CONFIG.ARB_ON_TLAST {1} \ + CONFIG.DECODER_REG {0} \ + CONFIG.HAS_TKEEP {1} \ + CONFIG.HAS_TLAST {1} \ + CONFIG.NUM_MI {1} \ + CONFIG.NUM_SI {2} \ + CONFIG.TDATA_NUM_BYTES {64} \ + ] $axis_switch_2_to_1_inst_0 + + # Create instance: axis_switch_2_to_1_inst_1, and set properties + set axis_switch_2_to_1_inst_1 [ create_bd_cell -type ip -vlnv xilinx.com:ip:axis_switch:1.1 axis_switch_2_to_1_inst_1 ] + set_property -dict [ list \ + CONFIG.ARB_ON_MAX_XFERS {0} \ + CONFIG.ARB_ON_TLAST {1} \ + CONFIG.DECODER_REG {0} \ + CONFIG.HAS_TKEEP {1} \ + CONFIG.HAS_TLAST {1} \ + CONFIG.NUM_MI {1} \ + CONFIG.NUM_SI {2} \ + CONFIG.TDATA_NUM_BYTES {64} \ + ] $axis_switch_2_to_1_inst_1 + + # Create instance: axis_switch_2_to_1_inst_2, and set properties + set axis_switch_2_to_1_inst_2 [ create_bd_cell -type ip -vlnv xilinx.com:ip:axis_switch:1.1 axis_switch_2_to_1_inst_2 ] + set_property -dict [ list \ + CONFIG.ARB_ON_MAX_XFERS {0} \ + CONFIG.ARB_ON_TLAST {1} \ + CONFIG.DECODER_REG {0} \ + CONFIG.HAS_TKEEP {1} \ + 
CONFIG.HAS_TLAST {1} \ + CONFIG.NUM_MI {1} \ + CONFIG.NUM_SI {2} \ + CONFIG.TDATA_NUM_BYTES {64} \ + ] $axis_switch_2_to_1_inst_2 + + # Create instance: ccl_offload_0, and set properties + set ccl_offload_0 [ create_bd_cell -type ip -vlnv Xilinx:ACCL:ccl_offload:1.0 ccl_offload_0 ] + + # Create instance: cclo_sq_adapter_0, and set properties + set cclo_sq_adapter_0 [ create_bd_cell -type ip -vlnv xilinx.com:ACCL:cclo_sq_adapter:1.0 cclo_sq_adapter_0 ] + + # Create instance: cyt_cq_dm_sts_conver_0, and set properties + set cyt_cq_dm_sts_conver_0 [ create_bd_cell -type ip -vlnv xilinx.com:ACCL:cyt_cq_dm_sts_converter:1.0 cyt_cq_dm_sts_conver_0 ] + + # Create instance: cyt_cq_dm_sts_conver_1, and set properties + set cyt_cq_dm_sts_conver_1 [ create_bd_cell -type ip -vlnv xilinx.com:ACCL:cyt_cq_dm_sts_converter:1.0 cyt_cq_dm_sts_conver_1 ] + + # Create instance: cyt_dma_sq_adapter_0, and set properties + set cyt_dma_sq_adapter_0 [ create_bd_cell -type ip -vlnv xilinx.com:ACCL:cyt_dma_sq_adapter:1.0 cyt_dma_sq_adapter_0 ] + + # Create instance: cyt_rdma_arbiter_0, and set properties + set cyt_rdma_arbiter_0 [ create_bd_cell -type ip -vlnv xilinx.com:ACCL:cyt_rdma_arbiter:1.0 cyt_rdma_arbiter_0 ] + + # Create instance: hostctrl_0, and set properties + set hostctrl_0 [ create_bd_cell -type ip -vlnv xilinx.com:ACCL:hostctrl:1.0 hostctrl_0 ] + + # Create instance: reduce_ops_0, and set properties + set reduce_ops_0 [ create_bd_cell -type ip -vlnv xilinx.com:ACCL:reduce_ops:1.0 reduce_ops_0 ] + + # Create instance: rst_ap_clk_0_250M, and set properties + set rst_ap_clk_0_250M [ create_bd_cell -type ip -vlnv xilinx.com:ip:proc_sys_reset:5.0 rst_ap_clk_0_250M ] + + # Create instance: smartconnect_0, and set properties + set smartconnect_0 [ create_bd_cell -type ip -vlnv xilinx.com:ip:smartconnect:1.0 smartconnect_0 ] + set_property -dict [ list \ + CONFIG.NUM_MI {2} \ + CONFIG.NUM_SI {1} \ + ] $smartconnect_0 + + # Create instance: system_ila_0, and set properties + set 
system_ila_0 [ create_bd_cell -type ip -vlnv xilinx.com:ip:system_ila:1.1 system_ila_0 ] + set_property -dict [ list \ + CONFIG.C_INPUT_PIPE_STAGES {2} \ + CONFIG.C_MON_TYPE {INTERFACE} \ + CONFIG.C_NUM_MONITOR_SLOTS {16} \ + CONFIG.C_SLOT_0_APC_EN {0} \ + CONFIG.C_SLOT_0_AXI_AR_SEL_DATA {1} \ + CONFIG.C_SLOT_0_AXI_AR_SEL_TRIG {1} \ + CONFIG.C_SLOT_0_AXI_AW_SEL_DATA {1} \ + CONFIG.C_SLOT_0_AXI_AW_SEL_TRIG {1} \ + CONFIG.C_SLOT_0_AXI_B_SEL_DATA {1} \ + CONFIG.C_SLOT_0_AXI_B_SEL_TRIG {1} \ + CONFIG.C_SLOT_0_AXI_DATA_SEL {1} \ + CONFIG.C_SLOT_0_AXI_R_SEL_DATA {1} \ + CONFIG.C_SLOT_0_AXI_R_SEL_TRIG {1} \ + CONFIG.C_SLOT_0_AXI_TRIG_SEL {1} \ + CONFIG.C_SLOT_0_AXI_W_SEL_DATA {1} \ + CONFIG.C_SLOT_0_AXI_W_SEL_TRIG {1} \ + CONFIG.C_SLOT_0_INTF_TYPE {xilinx.com:interface:axis_rtl:1.0} \ + CONFIG.C_SLOT_0_TYPE {0} \ + CONFIG.C_SLOT_10_APC_EN {0} \ + CONFIG.C_SLOT_10_AXI_DATA_SEL {1} \ + CONFIG.C_SLOT_10_AXI_TRIG_SEL {1} \ + CONFIG.C_SLOT_10_INTF_TYPE {xilinx.com:interface:axis_rtl:1.0} \ + CONFIG.C_SLOT_10_TYPE {0} \ + CONFIG.C_SLOT_11_APC_EN {0} \ + CONFIG.C_SLOT_11_AXI_DATA_SEL {1} \ + CONFIG.C_SLOT_11_AXI_TRIG_SEL {1} \ + CONFIG.C_SLOT_11_INTF_TYPE {xilinx.com:interface:axis_rtl:1.0} \ + CONFIG.C_SLOT_11_TYPE {0} \ + CONFIG.C_SLOT_12_APC_EN {0} \ + CONFIG.C_SLOT_12_AXI_DATA_SEL {1} \ + CONFIG.C_SLOT_12_AXI_TRIG_SEL {1} \ + CONFIG.C_SLOT_12_INTF_TYPE {xilinx.com:interface:axis_rtl:1.0} \ + CONFIG.C_SLOT_12_TYPE {0} \ + CONFIG.C_SLOT_13_APC_EN {0} \ + CONFIG.C_SLOT_13_AXI_DATA_SEL {1} \ + CONFIG.C_SLOT_13_AXI_TRIG_SEL {1} \ + CONFIG.C_SLOT_13_INTF_TYPE {xilinx.com:interface:axis_rtl:1.0} \ + CONFIG.C_SLOT_13_TYPE {0} \ + CONFIG.C_SLOT_14_APC_EN {0} \ + CONFIG.C_SLOT_14_AXI_DATA_SEL {1} \ + CONFIG.C_SLOT_14_AXI_TRIG_SEL {1} \ + CONFIG.C_SLOT_14_INTF_TYPE {xilinx.com:interface:axis_rtl:1.0} \ + CONFIG.C_SLOT_14_TYPE {0} \ + CONFIG.C_SLOT_15_APC_EN {0} \ + CONFIG.C_SLOT_15_AXI_DATA_SEL {1} \ + CONFIG.C_SLOT_15_AXI_TRIG_SEL {1} \ + CONFIG.C_SLOT_15_INTF_TYPE 
{xilinx.com:interface:axis_rtl:1.0} \ + CONFIG.C_SLOT_1_APC_EN {0} \ + CONFIG.C_SLOT_1_AXI_AR_SEL_DATA {1} \ + CONFIG.C_SLOT_1_AXI_AR_SEL_TRIG {1} \ + CONFIG.C_SLOT_1_AXI_AW_SEL_DATA {1} \ + CONFIG.C_SLOT_1_AXI_AW_SEL_TRIG {1} \ + CONFIG.C_SLOT_1_AXI_B_SEL_DATA {1} \ + CONFIG.C_SLOT_1_AXI_B_SEL_TRIG {1} \ + CONFIG.C_SLOT_1_AXI_DATA_SEL {1} \ + CONFIG.C_SLOT_1_AXI_R_SEL_DATA {1} \ + CONFIG.C_SLOT_1_AXI_R_SEL_TRIG {1} \ + CONFIG.C_SLOT_1_AXI_TRIG_SEL {1} \ + CONFIG.C_SLOT_1_AXI_W_SEL_DATA {1} \ + CONFIG.C_SLOT_1_AXI_W_SEL_TRIG {1} \ + CONFIG.C_SLOT_1_INTF_TYPE {xilinx.com:interface:axis_rtl:1.0} \ + CONFIG.C_SLOT_1_TYPE {0} \ + CONFIG.C_SLOT_2_APC_EN {0} \ + CONFIG.C_SLOT_2_AXI_AR_SEL_DATA {1} \ + CONFIG.C_SLOT_2_AXI_AR_SEL_TRIG {1} \ + CONFIG.C_SLOT_2_AXI_AW_SEL_DATA {1} \ + CONFIG.C_SLOT_2_AXI_AW_SEL_TRIG {1} \ + CONFIG.C_SLOT_2_AXI_B_SEL_DATA {1} \ + CONFIG.C_SLOT_2_AXI_B_SEL_TRIG {1} \ + CONFIG.C_SLOT_2_AXI_DATA_SEL {1} \ + CONFIG.C_SLOT_2_AXI_R_SEL_DATA {1} \ + CONFIG.C_SLOT_2_AXI_R_SEL_TRIG {1} \ + CONFIG.C_SLOT_2_AXI_TRIG_SEL {1} \ + CONFIG.C_SLOT_2_AXI_W_SEL_DATA {1} \ + CONFIG.C_SLOT_2_AXI_W_SEL_TRIG {1} \ + CONFIG.C_SLOT_2_INTF_TYPE {xilinx.com:interface:axis_rtl:1.0} \ + CONFIG.C_SLOT_2_TYPE {0} \ + CONFIG.C_SLOT_3_APC_EN {0} \ + CONFIG.C_SLOT_3_AXI_DATA_SEL {1} \ + CONFIG.C_SLOT_3_AXI_TRIG_SEL {1} \ + CONFIG.C_SLOT_3_INTF_TYPE {xilinx.com:interface:axis_rtl:1.0} \ + CONFIG.C_SLOT_3_TYPE {0} \ + CONFIG.C_SLOT_4_APC_EN {0} \ + CONFIG.C_SLOT_4_AXI_DATA_SEL {1} \ + CONFIG.C_SLOT_4_AXI_TRIG_SEL {1} \ + CONFIG.C_SLOT_4_INTF_TYPE {xilinx.com:interface:axis_rtl:1.0} \ + CONFIG.C_SLOT_4_TYPE {0} \ + CONFIG.C_SLOT_5_APC_EN {0} \ + CONFIG.C_SLOT_5_AXI_DATA_SEL {1} \ + CONFIG.C_SLOT_5_AXI_TRIG_SEL {1} \ + CONFIG.C_SLOT_5_INTF_TYPE {xilinx.com:interface:axis_rtl:1.0} \ + CONFIG.C_SLOT_5_TYPE {0} \ + CONFIG.C_SLOT_6_APC_EN {0} \ + CONFIG.C_SLOT_6_AXI_DATA_SEL {1} \ + CONFIG.C_SLOT_6_AXI_TRIG_SEL {1} \ + CONFIG.C_SLOT_6_INTF_TYPE {xilinx.com:interface:axis_rtl:1.0} \ + 
CONFIG.C_SLOT_6_TYPE {0} \ + CONFIG.C_SLOT_7_APC_EN {0} \ + CONFIG.C_SLOT_7_AXI_DATA_SEL {1} \ + CONFIG.C_SLOT_7_AXI_TRIG_SEL {1} \ + CONFIG.C_SLOT_7_INTF_TYPE {xilinx.com:interface:axis_rtl:1.0} \ + CONFIG.C_SLOT_7_TYPE {0} \ + CONFIG.C_SLOT_8_APC_EN {0} \ + CONFIG.C_SLOT_8_AXI_DATA_SEL {1} \ + CONFIG.C_SLOT_8_AXI_TRIG_SEL {1} \ + CONFIG.C_SLOT_8_INTF_TYPE {xilinx.com:interface:axis_rtl:1.0} \ + CONFIG.C_SLOT_8_TYPE {0} \ + CONFIG.C_SLOT_9_APC_EN {0} \ + CONFIG.C_SLOT_9_AXI_DATA_SEL {1} \ + CONFIG.C_SLOT_9_AXI_TRIG_SEL {1} \ + CONFIG.C_SLOT_9_INTF_TYPE {xilinx.com:interface:axis_rtl:1.0} \ + CONFIG.C_SLOT_9_TYPE {0} \ + ] $system_ila_0 + + # Create instance: xlconstant_0, and set properties + set xlconstant_0 [ create_bd_cell -type ip -vlnv xilinx.com:ip:xlconstant:1.1 xlconstant_0 ] + set_property -dict [ list \ + CONFIG.CONST_VAL {0} \ + CONFIG.CONST_WIDTH {2} \ + ] $xlconstant_0 + + # Create instance: xlconstant_1, and set properties + set xlconstant_1 [ create_bd_cell -type ip -vlnv xilinx.com:ip:xlconstant:1.1 xlconstant_1 ] + set_property -dict [ list \ + CONFIG.CONST_VAL {0} \ + CONFIG.CONST_WIDTH {2} \ + ] $xlconstant_1 + + # Create instance: xlconstant_2, and set properties + set xlconstant_2 [ create_bd_cell -type ip -vlnv xilinx.com:ip:xlconstant:1.1 xlconstant_2 ] + set_property -dict [ list \ + CONFIG.CONST_VAL {0} \ + CONFIG.CONST_WIDTH {2} \ + ] $xlconstant_2 + + # Create interface connections + connect_bd_intf_net -intf_net S00_AXI_0_1 [get_bd_intf_ports S00_AXI_0] [get_bd_intf_pins smartconnect_0/S00_AXI] + connect_bd_intf_net -intf_net axis_data_fifo_0_M_AXIS [get_bd_intf_pins axis_data_fifo_0/M_AXIS] [get_bd_intf_pins cyt_dma_sq_adapter_0/cyt_rq_wr_cmd] + connect_bd_intf_net -intf_net axis_data_fifo_1_M_AXIS [get_bd_intf_pins axis_data_fifo_1/M_AXIS] [get_bd_intf_pins cyt_dma_sq_adapter_0/cyt_rq_rd_cmd] + connect_bd_intf_net -intf_net axis_data_fifo_2_M_AXIS [get_bd_intf_pins axis_data_fifo_2/M_AXIS] [get_bd_intf_pins 
cyt_dma_sq_adapter_0/cclo_sq_wr_cmd] + connect_bd_intf_net -intf_net axis_data_fifo_3_M_AXIS [get_bd_intf_pins axis_data_fifo_3/M_AXIS] [get_bd_intf_pins cyt_dma_sq_adapter_0/cclo_sq_rd_cmd] + connect_bd_intf_net -intf_net axis_data_fifo_4_M_AXIS [get_bd_intf_pins axis_data_fifo_4/M_AXIS] [get_bd_intf_pins cyt_cq_dm_sts_conver_1/dm1_meta] +connect_bd_intf_net -intf_net [get_bd_intf_nets axis_data_fifo_4_M_AXIS] [get_bd_intf_pins axis_data_fifo_4/M_AXIS] [get_bd_intf_pins system_ila_0/SLOT_0_AXIS] + set_property HDL_ATTRIBUTE.DEBUG {true} [get_bd_intf_nets axis_data_fifo_4_M_AXIS] + connect_bd_intf_net -intf_net axis_data_fifo_5_M_AXIS [get_bd_intf_pins axis_data_fifo_5/M_AXIS] [get_bd_intf_pins cyt_cq_dm_sts_conver_1/dm0_meta] +connect_bd_intf_net -intf_net [get_bd_intf_nets axis_data_fifo_5_M_AXIS] [get_bd_intf_pins axis_data_fifo_5/M_AXIS] [get_bd_intf_pins system_ila_0/SLOT_1_AXIS] + set_property HDL_ATTRIBUTE.DEBUG {true} [get_bd_intf_nets axis_data_fifo_5_M_AXIS] + connect_bd_intf_net -intf_net axis_data_fifo_6_M_AXIS [get_bd_intf_pins axis_data_fifo_6/M_AXIS] [get_bd_intf_pins cyt_cq_dm_sts_conver_0/dm1_meta] +connect_bd_intf_net -intf_net [get_bd_intf_nets axis_data_fifo_6_M_AXIS] [get_bd_intf_pins axis_data_fifo_6/M_AXIS] [get_bd_intf_pins system_ila_0/SLOT_6_AXIS] + set_property HDL_ATTRIBUTE.DEBUG {true} [get_bd_intf_nets axis_data_fifo_6_M_AXIS] + connect_bd_intf_net -intf_net axis_data_fifo_7_M_AXIS [get_bd_intf_pins axis_data_fifo_7/M_AXIS] [get_bd_intf_pins cyt_cq_dm_sts_conver_0/dm0_meta] +connect_bd_intf_net -intf_net [get_bd_intf_nets axis_data_fifo_7_M_AXIS] [get_bd_intf_pins axis_data_fifo_7/M_AXIS] [get_bd_intf_pins system_ila_0/SLOT_7_AXIS] + set_property HDL_ATTRIBUTE.DEBUG {true} [get_bd_intf_nets axis_data_fifo_7_M_AXIS] + connect_bd_intf_net -intf_net axis_register_slice_0_M_AXIS [get_bd_intf_pins axis_register_slice_0/M_AXIS] [get_bd_intf_pins cyt_rdma_arbiter_0/s_axis_0] + connect_bd_intf_net -intf_net axis_register_slice_10_M_AXIS 
[get_bd_intf_ports cyt_sq_rd_cmd] [get_bd_intf_pins axis_register_slice_10/M_AXIS] + connect_bd_intf_net -intf_net axis_register_slice_11_M_AXIS [get_bd_intf_pins axis_register_slice_11/M_AXIS] [get_bd_intf_pins axis_switch_1_to_2_inst_3/S00_AXIS] + connect_bd_intf_net -intf_net axis_register_slice_12_M_AXIS [get_bd_intf_pins axis_register_slice_12/M_AXIS] [get_bd_intf_pins axis_switch_1_to_2_inst_0/S00_AXIS] + connect_bd_intf_net -intf_net axis_register_slice_13_M_AXIS [get_bd_intf_pins axis_register_slice_13/M_AXIS] [get_bd_intf_pins axis_switch_1_to_2_inst_1/S00_AXIS] + connect_bd_intf_net -intf_net axis_register_slice_14_M_AXIS [get_bd_intf_pins axis_register_slice_14/M_AXIS] [get_bd_intf_pins axis_switch_1_to_2_inst_2/S00_AXIS] + connect_bd_intf_net -intf_net axis_register_slice_1_M_AXIS [get_bd_intf_pins axis_register_slice_1/M_AXIS] [get_bd_intf_pins cyt_rdma_arbiter_0/s_axis_1] + connect_bd_intf_net -intf_net axis_register_slice_2_M_AXIS [get_bd_intf_pins axis_register_slice_2/M_AXIS] [get_bd_intf_pins cyt_rdma_arbiter_0/s_meta] + connect_bd_intf_net -intf_net axis_register_slice_3_M_AXIS [get_bd_intf_pins axis_register_slice_3/M_AXIS] [get_bd_intf_pins cyt_cq_dm_sts_conver_0/cq_sts] +connect_bd_intf_net -intf_net [get_bd_intf_nets axis_register_slice_3_M_AXIS] [get_bd_intf_pins axis_register_slice_3/M_AXIS] [get_bd_intf_pins system_ila_0/SLOT_8_AXIS] + set_property HDL_ATTRIBUTE.DEBUG {true} [get_bd_intf_nets axis_register_slice_3_M_AXIS] + connect_bd_intf_net -intf_net axis_register_slice_4_M_AXIS [get_bd_intf_pins axis_register_slice_4/M_AXIS] [get_bd_intf_pins cyt_cq_dm_sts_conver_1/cq_sts] +connect_bd_intf_net -intf_net [get_bd_intf_nets axis_register_slice_4_M_AXIS] [get_bd_intf_pins axis_register_slice_4/M_AXIS] [get_bd_intf_pins system_ila_0/SLOT_9_AXIS] + set_property HDL_ATTRIBUTE.DEBUG {true} [get_bd_intf_nets axis_register_slice_4_M_AXIS] + connect_bd_intf_net -intf_net axis_register_slice_5_M_AXIS [get_bd_intf_pins axis_data_fifo_1/S_AXIS] 
[get_bd_intf_pins axis_register_slice_5/M_AXIS] + connect_bd_intf_net -intf_net axis_register_slice_6_M_AXIS [get_bd_intf_pins axis_register_slice_6/M_AXIS] [get_bd_intf_pins cclo_sq_adapter_0/s_axis_cyt] + connect_bd_intf_net -intf_net axis_register_slice_7_M_AXIS [get_bd_intf_pins axis_register_slice_7/M_AXIS] [get_bd_intf_pins ccl_offload_0/s_axis_dma0_mm2s] + connect_bd_intf_net -intf_net axis_register_slice_8_M_AXIS [get_bd_intf_pins axis_register_slice_8/M_AXIS] [get_bd_intf_pins ccl_offload_0/s_axis_dma1_mm2s] + connect_bd_intf_net -intf_net axis_register_slice_9_M_AXIS [get_bd_intf_ports cyt_sq_wr_cmd] [get_bd_intf_pins axis_register_slice_9/M_AXIS] + connect_bd_intf_net -intf_net axis_switch_1_to_2_inst_0_M00_AXIS [get_bd_intf_ports m_axis_card_0] [get_bd_intf_pins axis_switch_1_to_2_inst_0/M00_AXIS] + connect_bd_intf_net -intf_net axis_switch_1_to_2_inst_0_M01_AXIS [get_bd_intf_ports m_axis_host_0] [get_bd_intf_pins axis_switch_1_to_2_inst_0/M01_AXIS] + connect_bd_intf_net -intf_net axis_switch_1_to_2_inst_1_M00_AXIS [get_bd_intf_ports m_axis_card_1] [get_bd_intf_pins axis_switch_1_to_2_inst_1/M00_AXIS] + connect_bd_intf_net -intf_net axis_switch_1_to_2_inst_1_M01_AXIS [get_bd_intf_ports m_axis_host_1] [get_bd_intf_pins axis_switch_1_to_2_inst_1/M01_AXIS] + connect_bd_intf_net -intf_net axis_switch_1_to_2_inst_2_M00_AXIS [get_bd_intf_ports m_axis_card_2] [get_bd_intf_pins axis_switch_1_to_2_inst_2/M00_AXIS] + connect_bd_intf_net -intf_net axis_switch_1_to_2_inst_2_M01_AXIS [get_bd_intf_ports m_axis_host_2] [get_bd_intf_pins axis_switch_1_to_2_inst_2/M01_AXIS] + connect_bd_intf_net -intf_net axis_switch_1_to_2_inst_3_M00_AXIS [get_bd_intf_ports cyt_rreq_send_0] [get_bd_intf_pins axis_switch_1_to_2_inst_3/M00_AXIS] + connect_bd_intf_net -intf_net axis_switch_1_to_2_inst_3_M01_AXIS [get_bd_intf_ports cyt_rreq_send_1] [get_bd_intf_pins axis_switch_1_to_2_inst_3/M01_AXIS] + connect_bd_intf_net -intf_net axis_switch_2_to_1_inst_0_M00_AXIS [get_bd_intf_pins 
axis_register_slice_7/S_AXIS] [get_bd_intf_pins axis_switch_2_to_1_inst_0/M00_AXIS] + connect_bd_intf_net -intf_net axis_switch_2_to_1_inst_1_M00_AXIS [get_bd_intf_pins axis_register_slice_8/S_AXIS] [get_bd_intf_pins axis_switch_2_to_1_inst_1/M00_AXIS] + connect_bd_intf_net -intf_net axis_switch_2_to_1_inst_2_M00_AXIS [get_bd_intf_pins axis_register_slice_6/S_AXIS] [get_bd_intf_pins axis_switch_2_to_1_inst_2/M00_AXIS] + connect_bd_intf_net -intf_net ccl_offload_0_m_axis_arith_op0 [get_bd_intf_pins ccl_offload_0/m_axis_arith_op0] [get_bd_intf_pins reduce_ops_0/in0] + connect_bd_intf_net -intf_net ccl_offload_0_m_axis_arith_op1 [get_bd_intf_pins ccl_offload_0/m_axis_arith_op1] [get_bd_intf_pins reduce_ops_0/in1] + connect_bd_intf_net -intf_net ccl_offload_0_m_axis_call_ack [get_bd_intf_pins ccl_offload_0/m_axis_call_ack] [get_bd_intf_pins hostctrl_0/sts] + #disable compression + #connect_bd_intf_net -intf_net ccl_offload_0_m_axis_compression0 [get_bd_intf_pins ccl_offload_0/m_axis_compression0] [get_bd_intf_pins ccl_offload_0/s_axis_compression0] + #connect_bd_intf_net -intf_net ccl_offload_0_m_axis_compression1 [get_bd_intf_pins ccl_offload_0/m_axis_compression1] [get_bd_intf_pins ccl_offload_0/s_axis_compression1] + #connect_bd_intf_net -intf_net ccl_offload_0_m_axis_compression2 [get_bd_intf_pins ccl_offload_0/m_axis_compression2] [get_bd_intf_pins ccl_offload_0/s_axis_compression2] + connect_bd_intf_net -intf_net ccl_offload_0_m_axis_dma0_mm2s_cmd [get_bd_intf_pins ccl_offload_0/m_axis_dma0_mm2s_cmd] [get_bd_intf_pins cyt_dma_sq_adapter_0/dma0_mm2s_cmd] +connect_bd_intf_net -intf_net [get_bd_intf_nets ccl_offload_0_m_axis_dma0_mm2s_cmd] [get_bd_intf_pins cyt_dma_sq_adapter_0/dma0_mm2s_cmd] [get_bd_intf_pins system_ila_0/SLOT_2_AXIS] + set_property HDL_ATTRIBUTE.DEBUG {true} [get_bd_intf_nets ccl_offload_0_m_axis_dma0_mm2s_cmd] + connect_bd_intf_net -intf_net ccl_offload_0_m_axis_dma0_s2mm [get_bd_intf_pins axis_register_slice_12/S_AXIS] [get_bd_intf_pins 
ccl_offload_0/m_axis_dma0_s2mm] + connect_bd_intf_net -intf_net ccl_offload_0_m_axis_dma0_s2mm_cmd [get_bd_intf_pins ccl_offload_0/m_axis_dma0_s2mm_cmd] [get_bd_intf_pins cyt_dma_sq_adapter_0/dma0_s2mm_cmd] +connect_bd_intf_net -intf_net [get_bd_intf_nets ccl_offload_0_m_axis_dma0_s2mm_cmd] [get_bd_intf_pins cyt_dma_sq_adapter_0/dma0_s2mm_cmd] [get_bd_intf_pins system_ila_0/SLOT_3_AXIS] + set_property HDL_ATTRIBUTE.DEBUG {true} [get_bd_intf_nets ccl_offload_0_m_axis_dma0_s2mm_cmd] + connect_bd_intf_net -intf_net ccl_offload_0_m_axis_dma1_mm2s_cmd [get_bd_intf_pins ccl_offload_0/m_axis_dma1_mm2s_cmd] [get_bd_intf_pins cyt_dma_sq_adapter_0/dma1_mm2s_cmd] +connect_bd_intf_net -intf_net [get_bd_intf_nets ccl_offload_0_m_axis_dma1_mm2s_cmd] [get_bd_intf_pins cyt_dma_sq_adapter_0/dma1_mm2s_cmd] [get_bd_intf_pins system_ila_0/SLOT_4_AXIS] + set_property HDL_ATTRIBUTE.DEBUG {true} [get_bd_intf_nets ccl_offload_0_m_axis_dma1_mm2s_cmd] + connect_bd_intf_net -intf_net ccl_offload_0_m_axis_dma1_s2mm [get_bd_intf_pins axis_register_slice_13/S_AXIS] [get_bd_intf_pins ccl_offload_0/m_axis_dma1_s2mm] + connect_bd_intf_net -intf_net ccl_offload_0_m_axis_dma1_s2mm_cmd [get_bd_intf_pins ccl_offload_0/m_axis_dma1_s2mm_cmd] [get_bd_intf_pins cyt_dma_sq_adapter_0/dma1_s2mm_cmd] +connect_bd_intf_net -intf_net [get_bd_intf_nets ccl_offload_0_m_axis_dma1_s2mm_cmd] [get_bd_intf_pins cyt_dma_sq_adapter_0/dma1_s2mm_cmd] [get_bd_intf_pins system_ila_0/SLOT_5_AXIS] + set_property HDL_ATTRIBUTE.DEBUG {true} [get_bd_intf_nets ccl_offload_0_m_axis_dma1_s2mm_cmd] + connect_bd_intf_net -intf_net ccl_offload_0_m_axis_eth_tx_data [get_bd_intf_pins ccl_offload_0/m_axis_eth_tx_data] [get_bd_intf_pins cclo_sq_adapter_0/s_axis_cclo] + connect_bd_intf_net -intf_net ccl_offload_0_m_axis_krnl [get_bd_intf_pins ccl_offload_0/m_axis_krnl] [get_bd_intf_pins ccl_offload_0/s_axis_krnl] + connect_bd_intf_net -intf_net ccl_offload_0_m_axis_rdma_sq [get_bd_intf_pins ccl_offload_0/m_axis_rdma_sq] [get_bd_intf_pins 
cclo_sq_adapter_0/cclo_sq] + connect_bd_intf_net -intf_net cclo_sq_adapter_0_cyt_sq_rd [get_bd_intf_pins axis_data_fifo_3/S_AXIS] [get_bd_intf_pins cclo_sq_adapter_0/cyt_sq_rd] + connect_bd_intf_net -intf_net cclo_sq_adapter_0_cyt_sq_wr [get_bd_intf_pins axis_data_fifo_2/S_AXIS] [get_bd_intf_pins cclo_sq_adapter_0/cyt_sq_wr] + connect_bd_intf_net -intf_net cclo_sq_adapter_0_m_axis_cyt [get_bd_intf_pins axis_register_slice_11/S_AXIS] [get_bd_intf_pins cclo_sq_adapter_0/m_axis_cyt] + connect_bd_intf_net -intf_net cyt_cq_dm_sts_conver_0_dm0_sts [get_bd_intf_pins ccl_offload_0/s_axis_dma0_s2mm_sts] [get_bd_intf_pins cyt_cq_dm_sts_conver_0/dm0_sts] +connect_bd_intf_net -intf_net [get_bd_intf_nets cyt_cq_dm_sts_conver_0_dm0_sts] [get_bd_intf_pins cyt_cq_dm_sts_conver_0/dm0_sts] [get_bd_intf_pins system_ila_0/SLOT_12_AXIS] + set_property HDL_ATTRIBUTE.DEBUG {true} [get_bd_intf_nets cyt_cq_dm_sts_conver_0_dm0_sts] + connect_bd_intf_net -intf_net cyt_cq_dm_sts_conver_0_dm1_sts [get_bd_intf_pins ccl_offload_0/s_axis_dma1_s2mm_sts] [get_bd_intf_pins cyt_cq_dm_sts_conver_0/dm1_sts] +connect_bd_intf_net -intf_net [get_bd_intf_nets cyt_cq_dm_sts_conver_0_dm1_sts] [get_bd_intf_pins cyt_cq_dm_sts_conver_0/dm1_sts] [get_bd_intf_pins system_ila_0/SLOT_13_AXIS] + set_property HDL_ATTRIBUTE.DEBUG {true} [get_bd_intf_nets cyt_cq_dm_sts_conver_0_dm1_sts] + connect_bd_intf_net -intf_net cyt_cq_dm_sts_conver_1_dm0_sts [get_bd_intf_pins ccl_offload_0/s_axis_dma0_mm2s_sts] [get_bd_intf_pins cyt_cq_dm_sts_conver_1/dm0_sts] +connect_bd_intf_net -intf_net [get_bd_intf_nets cyt_cq_dm_sts_conver_1_dm0_sts] [get_bd_intf_pins cyt_cq_dm_sts_conver_1/dm0_sts] [get_bd_intf_pins system_ila_0/SLOT_14_AXIS] + set_property HDL_ATTRIBUTE.DEBUG {true} [get_bd_intf_nets cyt_cq_dm_sts_conver_1_dm0_sts] + connect_bd_intf_net -intf_net cyt_cq_dm_sts_conver_1_dm1_sts [get_bd_intf_pins ccl_offload_0/s_axis_dma1_mm2s_sts] [get_bd_intf_pins cyt_cq_dm_sts_conver_1/dm1_sts] +connect_bd_intf_net -intf_net 
[get_bd_intf_nets cyt_cq_dm_sts_conver_1_dm1_sts] [get_bd_intf_pins cyt_cq_dm_sts_conver_1/dm1_sts] [get_bd_intf_pins system_ila_0/SLOT_15_AXIS] + set_property HDL_ATTRIBUTE.DEBUG {true} [get_bd_intf_nets cyt_cq_dm_sts_conver_1_dm1_sts] + connect_bd_intf_net -intf_net cyt_cq_rd_sts_0_1 [get_bd_intf_ports cyt_cq_rd_sts_0] [get_bd_intf_pins axis_register_slice_4/S_AXIS] + connect_bd_intf_net -intf_net cyt_cq_wr_sts_0_1 [get_bd_intf_ports cyt_cq_wr_sts_0] [get_bd_intf_pins axis_register_slice_3/S_AXIS] + connect_bd_intf_net -intf_net cyt_dma_sq_adapter_0_cyt_sq_rd_cmd [get_bd_intf_pins axis_register_slice_10/S_AXIS] [get_bd_intf_pins cyt_dma_sq_adapter_0/cyt_sq_rd_cmd] +connect_bd_intf_net -intf_net [get_bd_intf_nets cyt_dma_sq_adapter_0_cyt_sq_rd_cmd] [get_bd_intf_pins axis_register_slice_10/S_AXIS] [get_bd_intf_pins system_ila_0/SLOT_10_AXIS] + set_property HDL_ATTRIBUTE.DEBUG {true} [get_bd_intf_nets cyt_dma_sq_adapter_0_cyt_sq_rd_cmd] + connect_bd_intf_net -intf_net cyt_dma_sq_adapter_0_cyt_sq_wr_cmd [get_bd_intf_pins axis_register_slice_9/S_AXIS] [get_bd_intf_pins cyt_dma_sq_adapter_0/cyt_sq_wr_cmd] +connect_bd_intf_net -intf_net [get_bd_intf_nets cyt_dma_sq_adapter_0_cyt_sq_wr_cmd] [get_bd_intf_pins axis_register_slice_9/S_AXIS] [get_bd_intf_pins system_ila_0/SLOT_11_AXIS] + set_property HDL_ATTRIBUTE.DEBUG {true} [get_bd_intf_nets cyt_dma_sq_adapter_0_cyt_sq_wr_cmd] + connect_bd_intf_net -intf_net cyt_dma_sq_adapter_0_dma0_mm2s_meta [get_bd_intf_pins axis_data_fifo_5/S_AXIS] [get_bd_intf_pins cyt_dma_sq_adapter_0/dma0_mm2s_meta] + connect_bd_intf_net -intf_net cyt_dma_sq_adapter_0_dma0_s2mm_meta [get_bd_intf_pins axis_data_fifo_7/S_AXIS] [get_bd_intf_pins cyt_dma_sq_adapter_0/dma0_s2mm_meta] + connect_bd_intf_net -intf_net cyt_dma_sq_adapter_0_dma1_mm2s_meta [get_bd_intf_pins axis_data_fifo_4/S_AXIS] [get_bd_intf_pins cyt_dma_sq_adapter_0/dma1_mm2s_meta] + connect_bd_intf_net -intf_net cyt_dma_sq_adapter_0_dma1_s2mm_meta [get_bd_intf_pins 
axis_data_fifo_6/S_AXIS] [get_bd_intf_pins cyt_dma_sq_adapter_0/dma1_s2mm_meta] + connect_bd_intf_net -intf_net cyt_rdma_arbiter_0_m_axis_0 [get_bd_intf_pins ccl_offload_0/s_axis_eth_rx_data] [get_bd_intf_pins cyt_rdma_arbiter_0/m_axis_0] + connect_bd_intf_net -intf_net cyt_rdma_arbiter_0_m_axis_1 [get_bd_intf_pins axis_register_slice_14/S_AXIS] [get_bd_intf_pins cyt_rdma_arbiter_0/m_axis_1] + connect_bd_intf_net -intf_net cyt_rdma_arbiter_0_m_meta_0 [get_bd_intf_pins ccl_offload_0/s_axis_eth_notification] [get_bd_intf_pins cyt_rdma_arbiter_0/m_meta_0] + connect_bd_intf_net -intf_net cyt_rdma_arbiter_0_m_meta_1 [get_bd_intf_pins axis_data_fifo_0/S_AXIS] [get_bd_intf_pins cyt_rdma_arbiter_0/m_meta_1] + connect_bd_intf_net -intf_net cyt_rq_rd_1 [get_bd_intf_ports cyt_rq_rd] [get_bd_intf_pins axis_register_slice_5/S_AXIS] + connect_bd_intf_net -intf_net cyt_rq_wr_1 [get_bd_intf_ports cyt_rq_wr] [get_bd_intf_pins axis_register_slice_2/S_AXIS] + connect_bd_intf_net -intf_net cyt_rreq_recv_0_1 [get_bd_intf_ports cyt_rreq_recv_0] [get_bd_intf_pins axis_switch_2_to_1_inst_2/S00_AXIS] + connect_bd_intf_net -intf_net cyt_rreq_recv_1_1 [get_bd_intf_ports cyt_rreq_recv_1] [get_bd_intf_pins axis_switch_2_to_1_inst_2/S01_AXIS] + connect_bd_intf_net -intf_net cyt_rrsp_recv_0_1 [get_bd_intf_ports cyt_rrsp_recv_0] [get_bd_intf_pins axis_register_slice_0/S_AXIS] + connect_bd_intf_net -intf_net cyt_rrsp_recv_1_1 [get_bd_intf_ports cyt_rrsp_recv_1] [get_bd_intf_pins axis_register_slice_1/S_AXIS] + connect_bd_intf_net -intf_net hostctrl_0_cmd [get_bd_intf_pins ccl_offload_0/s_axis_call_req] [get_bd_intf_pins hostctrl_0/cmd] + connect_bd_intf_net -intf_net reduce_ops_0_out_r [get_bd_intf_pins ccl_offload_0/s_axis_arith_res] [get_bd_intf_pins reduce_ops_0/out_r] + connect_bd_intf_net -intf_net s_axis_card_0_1 [get_bd_intf_ports s_axis_card_0] [get_bd_intf_pins axis_switch_2_to_1_inst_0/S01_AXIS] + connect_bd_intf_net -intf_net s_axis_card_1_1 [get_bd_intf_ports s_axis_card_1] 
[get_bd_intf_pins axis_switch_2_to_1_inst_1/S01_AXIS] + connect_bd_intf_net -intf_net s_axis_host_0_1 [get_bd_intf_ports s_axis_host_0] [get_bd_intf_pins axis_switch_2_to_1_inst_0/S00_AXIS] + connect_bd_intf_net -intf_net s_axis_host_1_1 [get_bd_intf_ports s_axis_host_1] [get_bd_intf_pins axis_switch_2_to_1_inst_1/S00_AXIS] + connect_bd_intf_net -intf_net smartconnect_0_M00_AXI [get_bd_intf_pins hostctrl_0/s_axi_control] [get_bd_intf_pins smartconnect_0/M00_AXI] + connect_bd_intf_net -intf_net smartconnect_0_M01_AXI [get_bd_intf_pins ccl_offload_0/s_axi_control] [get_bd_intf_pins smartconnect_0/M01_AXI] + + # Create port connections + connect_bd_net -net ap_clk_0_1 [get_bd_ports ap_clk_0] [get_bd_pins axis_data_fifo_0/s_axis_aclk] [get_bd_pins axis_data_fifo_1/s_axis_aclk] [get_bd_pins axis_data_fifo_2/s_axis_aclk] [get_bd_pins axis_data_fifo_3/s_axis_aclk] [get_bd_pins axis_data_fifo_4/s_axis_aclk] [get_bd_pins axis_data_fifo_5/s_axis_aclk] [get_bd_pins axis_data_fifo_6/s_axis_aclk] [get_bd_pins axis_data_fifo_7/s_axis_aclk] [get_bd_pins axis_register_slice_0/aclk] [get_bd_pins axis_register_slice_1/aclk] [get_bd_pins axis_register_slice_10/aclk] [get_bd_pins axis_register_slice_11/aclk] [get_bd_pins axis_register_slice_12/aclk] [get_bd_pins axis_register_slice_13/aclk] [get_bd_pins axis_register_slice_14/aclk] [get_bd_pins axis_register_slice_2/aclk] [get_bd_pins axis_register_slice_3/aclk] [get_bd_pins axis_register_slice_4/aclk] [get_bd_pins axis_register_slice_5/aclk] [get_bd_pins axis_register_slice_6/aclk] [get_bd_pins axis_register_slice_7/aclk] [get_bd_pins axis_register_slice_8/aclk] [get_bd_pins axis_register_slice_9/aclk] [get_bd_pins axis_switch_1_to_2_inst_0/aclk] [get_bd_pins axis_switch_1_to_2_inst_1/aclk] [get_bd_pins axis_switch_1_to_2_inst_2/aclk] [get_bd_pins axis_switch_1_to_2_inst_3/aclk] [get_bd_pins axis_switch_2_to_1_inst_0/aclk] [get_bd_pins axis_switch_2_to_1_inst_1/aclk] [get_bd_pins axis_switch_2_to_1_inst_2/aclk] [get_bd_pins 
ccl_offload_0/ap_clk] [get_bd_pins cclo_sq_adapter_0/ap_clk] [get_bd_pins cyt_cq_dm_sts_conver_0/ap_clk] [get_bd_pins cyt_cq_dm_sts_conver_1/ap_clk] [get_bd_pins cyt_dma_sq_adapter_0/ap_clk] [get_bd_pins cyt_rdma_arbiter_0/ap_clk] [get_bd_pins hostctrl_0/ap_clk] [get_bd_pins reduce_ops_0/ap_clk] [get_bd_pins rst_ap_clk_0_250M/slowest_sync_clk] [get_bd_pins smartconnect_0/aclk] [get_bd_pins system_ila_0/clk] + connect_bd_net -net ap_rst_n_0_1 [get_bd_ports ap_rst_n_0] [get_bd_pins axis_data_fifo_4/s_axis_aresetn] [get_bd_pins axis_data_fifo_5/s_axis_aresetn] [get_bd_pins axis_data_fifo_6/s_axis_aresetn] [get_bd_pins axis_data_fifo_7/s_axis_aresetn] [get_bd_pins axis_register_slice_0/aresetn] [get_bd_pins axis_register_slice_1/aresetn] [get_bd_pins axis_register_slice_10/aresetn] [get_bd_pins axis_register_slice_11/aresetn] [get_bd_pins axis_register_slice_12/aresetn] [get_bd_pins axis_register_slice_13/aresetn] [get_bd_pins axis_register_slice_14/aresetn] [get_bd_pins axis_register_slice_7/aresetn] [get_bd_pins axis_register_slice_8/aresetn] [get_bd_pins axis_register_slice_9/aresetn] [get_bd_pins axis_switch_1_to_2_inst_0/aresetn] [get_bd_pins axis_switch_1_to_2_inst_1/aresetn] [get_bd_pins axis_switch_1_to_2_inst_2/aresetn] [get_bd_pins axis_switch_1_to_2_inst_3/aresetn] [get_bd_pins axis_switch_2_to_1_inst_0/aresetn] [get_bd_pins axis_switch_2_to_1_inst_1/aresetn] [get_bd_pins ccl_offload_0/ap_rst_n] [get_bd_pins cclo_sq_adapter_0/ap_rst_n] [get_bd_pins cyt_cq_dm_sts_conver_0/ap_rst_n] [get_bd_pins cyt_cq_dm_sts_conver_1/ap_rst_n] [get_bd_pins cyt_dma_sq_adapter_0/ap_rst_n] [get_bd_pins cyt_rdma_arbiter_0/ap_rst_n] [get_bd_pins hostctrl_0/ap_rst_n] [get_bd_pins reduce_ops_0/ap_rst_n] [get_bd_pins rst_ap_clk_0_250M/ext_reset_in] [get_bd_pins smartconnect_0/aresetn] [get_bd_pins system_ila_0/resetn] + connect_bd_net -net rst_ap_clk_0_250M_peripheral_aresetn [get_bd_pins axis_data_fifo_0/s_axis_aresetn] [get_bd_pins axis_data_fifo_1/s_axis_aresetn] [get_bd_pins 
axis_data_fifo_2/s_axis_aresetn] [get_bd_pins axis_data_fifo_3/s_axis_aresetn] [get_bd_pins axis_register_slice_2/aresetn] [get_bd_pins axis_register_slice_3/aresetn] [get_bd_pins axis_register_slice_4/aresetn] [get_bd_pins axis_register_slice_5/aresetn] [get_bd_pins axis_register_slice_6/aresetn] [get_bd_pins axis_switch_2_to_1_inst_2/aresetn] [get_bd_pins rst_ap_clk_0_250M/peripheral_aresetn] + connect_bd_net -net xlconstant_0_dout [get_bd_pins axis_switch_2_to_1_inst_0/s_req_suppress] [get_bd_pins xlconstant_0/dout] + connect_bd_net -net xlconstant_1_dout [get_bd_pins axis_switch_2_to_1_inst_1/s_req_suppress] [get_bd_pins xlconstant_1/dout] + connect_bd_net -net xlconstant_2_dout [get_bd_pins axis_switch_2_to_1_inst_2/s_req_suppress] [get_bd_pins xlconstant_2/dout] + + # Create address segments + assign_bd_address -offset 0x00000000 -range 0x00002000 -target_address_space [get_bd_addr_spaces S00_AXI_0] [get_bd_addr_segs ccl_offload_0/s_axi_control/reg0] -force + assign_bd_address -offset 0x00002000 -range 0x00002000 -target_address_space [get_bd_addr_spaces S00_AXI_0] [get_bd_addr_segs hostctrl_0/s_axi_control/Reg] -force + +# create some hierarchies +group_bd_cells cclo [get_bd_cells hostctrl_0] [get_bd_cells smartconnect_0] [get_bd_cells reduce_ops_0] [get_bd_cells ccl_offload_0] +group_bd_cells rrsp_arbitration [get_bd_cells axis_switch_1_to_2_inst_2] [get_bd_cells axis_register_slice_0] [get_bd_cells axis_register_slice_1] [get_bd_cells cyt_rdma_arbiter_0] [get_bd_cells axis_register_slice_14] [get_bd_cells axis_register_slice_2] +group_bd_cells completion_conversion [get_bd_cells axis_register_slice_3] [get_bd_cells axis_data_fifo_4] [get_bd_cells axis_data_fifo_5] [get_bd_cells axis_register_slice_4] [get_bd_cells axis_data_fifo_6] [get_bd_cells cyt_cq_dm_sts_conver_0] [get_bd_cells axis_data_fifo_7] [get_bd_cells cyt_cq_dm_sts_conver_1] +group_bd_cells request_conversion [get_bd_cells axis_register_slice_9] [get_bd_cells axis_data_fifo_0] [get_bd_cells 
axis_data_fifo_1] [get_bd_cells axis_data_fifo_2] [get_bd_cells axis_data_fifo_3] [get_bd_cells cyt_dma_sq_adapter_0] [get_bd_cells axis_register_slice_10] +group_bd_cells sq_conversion [get_bd_cells axis_switch_2_to_1_inst_2] [get_bd_cells axis_switch_1_to_2_inst_3] [get_bd_cells axis_register_slice_11] [get_bd_cells axis_register_slice_6] [get_bd_cells cclo_sq_adapter_0] [get_bd_cells xlconstant_2] +group_bd_cells local_dma_input_muxing [get_bd_cells axis_switch_2_to_1_inst_0] [get_bd_cells axis_switch_2_to_1_inst_1] [get_bd_cells xlconstant_0] [get_bd_cells axis_register_slice_7] [get_bd_cells axis_register_slice_8] [get_bd_cells xlconstant_1] +group_bd_cells local_dma_output_demuxing [get_bd_cells axis_switch_1_to_2_inst_0] [get_bd_cells axis_register_slice_12] [get_bd_cells axis_register_slice_13] [get_bd_cells axis_switch_1_to_2_inst_1] + +#create ila_top +#create_ip -name ila -vendor xilinx.com -library ip -version 6.2 -module_name ila_top +#set_property -dict [ list CONFIG.C_PROBE2_WIDTH {128} ] [get_ips ila_top] +#set_property -dict [ list CONFIG.C_PROBE5_WIDTH {128} ] [get_ips ila_top] +#set_property -dict [ list CONFIG.C_PROBE8_WIDTH {128} ] [get_ips ila_top] +#set_property -dict [ list CONFIG.C_PROBE11_WIDTH {128} ] [get_ips ila_top] +#set_property -dict [ list CONFIG.C_PROBE12_WIDTH {32} ] [get_ips ila_top] +#set_property -dict [ list CONFIG.C_PROBE15_WIDTH {32} ] [get_ips ila_top] +#set_property -dict [ list CONFIG.C_PROBE20_WIDTH {512} ] [get_ips ila_top] +#set_property -dict [ list CONFIG.C_PROBE23_WIDTH {512} ] [get_ips ila_top] +#set_property -dict [ list CONFIG.C_PROBE26_WIDTH {512} ] [get_ips ila_top] +#set_property -dict [ list CONFIG.C_PROBE29_WIDTH {512} ] [get_ips ila_top] +#set_property -dict [ list CONFIG.C_PROBE32_WIDTH {512} ] [get_ips ila_top] +#set_property -dict [ list CONFIG.C_PROBE35_WIDTH {512} ] [get_ips ila_top] +#set_property -dict [ list CONFIG.C_PROBE38_WIDTH {512} ] [get_ips ila_top] +#set_property -dict [ list 
CONFIG.C_PROBE41_WIDTH {512} ] [get_ips ila_top] +#set_property -dict [ list CONFIG.C_NUM_OF_PROBES {46} ] [get_ips ila_top] +#set_property -dict [ list CONFIG.C_EN_STRG_QUAL {1} ] [get_ips ila_top] +#set_property -dict [ list CONFIG.C_INPUT_PIPE_STAGES {2} ] [get_ips ila_top] +#set_property -dict [ list CONFIG.C_DATA_DEPTH {2048} ] [get_ips ila_top] + +#set_clock_uncertainty 0.2 + validate_bd_design save_bd_design -make_wrapper -files [get_files "$build_dir/lynx/lynx.srcs/sources_1/bd/accl_bd/accl_bd.bd"] -top -add_files -norecurse "$build_dir/lynx/lynx.gen/sources_1/bd/accl_bd/hdl/accl_bd_wrapper.v" +make_wrapper -files [get_files "$build_dir/test_config_0/user_c0_0/test.srcs/sources_1/bd/accl_bd/accl_bd.bd"] -top +add_files -norecurse "$build_dir/test_config_0/user_c0_0/test.srcs/sources_1/bd/accl_bd/hdl/accl_bd_wrapper.v" update_compile_order -fileset sources_1 exit + +