
Commit 5dffc46

fix: resolve review comments
1 parent 7be7741 commit 5dffc46

6 files changed: +25 -28 lines

infini_train/include/nn/parallel/process_group.h

Lines changed: 13 additions & 10 deletions
@@ -34,22 +34,25 @@ class ProcessGroup {
 
     // Asynchronous communication APIs (Compute / Communication stream decoupled)
     virtual std::shared_ptr<Work> AllReduce(const std::shared_ptr<Tensor> &tensor,
-                                            const function::AllreduceOptions &opts) const
+                                            function::ReduceOpType reduce_op = function::ReduceOpType::kSum,
+                                            bool async_op = false) const
         = 0;
 
     virtual std::shared_ptr<Work> AllGather(const std::shared_ptr<Tensor> &output, const std::shared_ptr<Tensor> &input,
-                                            bool async_op) const
+                                            bool async_op = false) const
         = 0;
 
-    virtual std::shared_ptr<Work> ReduceScatter(const std::shared_ptr<Tensor> &output,
-                                                const std::shared_ptr<Tensor> &input,
-                                                const function::AllreduceOptions &opts) const
+    virtual std::shared_ptr<Work>
+    ReduceScatter(const std::shared_ptr<Tensor> &output, const std::shared_ptr<Tensor> &input,
+                  function::ReduceOpType reduce_op = function::ReduceOpType::kSum, bool async_op = false) const
         = 0;
 
-    virtual std::shared_ptr<Work> Send(std::vector<std::shared_ptr<Tensor>> tensors, int dest_rank, bool async_op) const
+    virtual std::shared_ptr<Work> Send(std::vector<std::shared_ptr<Tensor>> tensors, int dest_rank,
+                                       bool async_op = false) const
         = 0;
 
-    virtual std::shared_ptr<Work> Recv(std::vector<std::shared_ptr<Tensor>> tensors, int src_rank, bool async_op) const
+    virtual std::shared_ptr<Work> Recv(std::vector<std::shared_ptr<Tensor>> tensors, int src_rank,
+                                       bool async_op = false) const
         = 0;
 
     // Legacy communication APIs (Single-stream)
@@ -90,14 +93,14 @@ class ProcessGroupNCCL final : public ProcessGroup {
     ~ProcessGroupNCCL();
 
     // Asynchronous communication APIs (Compute / Communication stream decoupled)
-    std::shared_ptr<Work> AllReduce(const std::shared_ptr<Tensor> &tensor,
-                                    const function::AllreduceOptions &opts) const override;
+    std::shared_ptr<Work> AllReduce(const std::shared_ptr<Tensor> &tensor, function::ReduceOpType reduce_op,
+                                    bool async_op) const override;
 
     std::shared_ptr<Work> AllGather(const std::shared_ptr<Tensor> &output, const std::shared_ptr<Tensor> &input,
                                     bool async_op) const override;
 
     std::shared_ptr<Work> ReduceScatter(const std::shared_ptr<Tensor> &output, const std::shared_ptr<Tensor> &input,
-                                        const function::AllreduceOptions &opts) const override;
+                                        function::ReduceOpType reduce_op, bool async_op) const override;
 
     std::shared_ptr<Work> Send(std::vector<std::shared_ptr<Tensor>> tensors, int dest_rank,
                                bool async_op) const override;
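
The asynchronous API now takes an explicit function::ReduceOpType and an async_op flag (defaulting to kSum and blocking behaviour) instead of a function::AllreduceOptions struct. A minimal caller-side sketch of the new interface; the surrounding setup is hypothetical (includes and namespace qualifiers omitted), and only the process-group calls themselves come from this commit:

void ExampleAllReduce(const std::shared_ptr<Tensor> &tensor) {
    // Default process group, as used by the functional wrappers in parallel_functional.cc.
    auto pg = ProcessGroupFactory::Instance()->GetDefaultProcessGroup();

    // Defaults apply: reduce_op = ReduceOpType::kSum, async_op = false (blocks until done).
    pg->AllReduce(tensor);

    // Non-default arguments: average asynchronously and keep the Work handle.
    auto work = pg->AllReduce(tensor, function::ReduceOpType::kAvg, /*async_op=*/true);
    // ... overlap other compute here ...
    work->WaitNonBlocking(); // synchronize once the reduced values are needed
}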

infini_train/include/nn/parallel/reduce_op_type.h

Lines changed: 0 additions & 5 deletions
@@ -11,9 +11,4 @@ enum class ReduceOpType : int8_t {
     kAvg,
 };
 
-struct AllreduceOptions {
-    ReduceOpType reduce_op_type = ReduceOpType::kSum;
-    bool async_op = false;
-};
-
 } // namespace infini_train::nn::parallel::function

infini_train/src/nn/parallel/parallel_functional.cc

Lines changed: 2 additions & 2 deletions
@@ -19,7 +19,7 @@ std::shared_ptr<Work> AllReduce(const std::shared_ptr<Tensor> &tensor, ReduceOpT
     if (pg == nullptr) {
         pg = ProcessGroupFactory::Instance()->GetDefaultProcessGroup();
     }
-    return pg->AllReduce(tensor, {reduce_op, async_op});
+    return pg->AllReduce(tensor, reduce_op, async_op);
 }
 
 std::shared_ptr<Work> AllGather(const std::shared_ptr<Tensor> &output, const std::shared_ptr<Tensor> &input,
@@ -37,7 +37,7 @@ std::shared_ptr<Work> ReduceScatter(const std::shared_ptr<Tensor> &output, const
     if (pg == nullptr) {
         pg = ProcessGroupFactory::Instance()->GetDefaultProcessGroup();
     }
-    return pg->ReduceScatter(output, input, {reduce_op, async_op});
+    return pg->ReduceScatter(output, input, reduce_op, async_op);
 }
 
 std::vector<std::vector<std::shared_ptr<Tensor>>> Scatter(const std::vector<std::shared_ptr<Tensor>> &input_tensors,
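
The free functions in function:: forward to the process group (falling back to the default group when pg is null), so call sites no longer build a brace-initialized options struct. A hedged usage sketch; the wrapper signatures are truncated in the hunk headers above, so the parameter order shown here is an assumption to be checked against the header:

// Assumed order: (tensors..., reduce_op, async_op[, pg]); tensor/output/input are placeholders.
auto sum_work = function::AllReduce(tensor, function::ReduceOpType::kSum, /*async_op=*/false);
auto rs_work  = function::ReduceScatter(output, input, function::ReduceOpType::kSum, /*async_op=*/true);
rs_work->WaitNonBlocking(); // only needed on the async path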

infini_train/src/nn/parallel/process_group.cc

Lines changed: 6 additions & 7 deletions
@@ -188,7 +188,7 @@ void ProcessGroupNCCL::InitStreams() {
 }
 
 std::shared_ptr<Work> ProcessGroupNCCL::AllReduce(const std::shared_ptr<Tensor> &tensor,
-                                                  const function::AllreduceOptions &opts) const {
+                                                  function::ReduceOpType reduce_op, bool async_op) const {
     void *buffer = tensor->DataPtr();
     const auto *device = dynamic_cast<const CudaDevice *>(tensor->GetDevice());
     device->SetDevice();
@@ -208,11 +208,11 @@ std::shared_ptr<Work> ProcessGroupNCCL::AllReduce(const std::shared_ptr<Tensor>
 
     // Perform NcclAllReduce on comm stream
     NCCL_CHECK(ncclAllReduce(buffer, buffer, tensor->NumElements(), kNcclDtypeMap.at(tensor->Dtype()),
-                             kNcclReduceOpMap.at(opts.reduce_op_type), comm, comm_stream));
+                             kNcclReduceOpMap.at(reduce_op), comm, comm_stream));
 
     CUDA_CHECK(cudaEventRecord(done_event, comm_stream));
 
-    if (opts.async_op) {
+    if (async_op) {
         return std::move(work);
     } else {
         work->WaitNonBlocking();
@@ -253,7 +253,7 @@ std::shared_ptr<Work> ProcessGroupNCCL::AllGather(const std::shared_ptr<Tensor>
 
 std::shared_ptr<Work> ProcessGroupNCCL::ReduceScatter(const std::shared_ptr<Tensor> &output,
                                                       const std::shared_ptr<Tensor> &input,
-                                                      const function::AllreduceOptions &opts) const {
+                                                      function::ReduceOpType reduce_op, bool async_op) const {
     const auto *device = dynamic_cast<const CudaDevice *>(input->GetDevice());
     auto comm = device_comm_map_.at(device);
 
@@ -271,12 +271,11 @@ std::shared_ptr<Work> ProcessGroupNCCL::ReduceScatter(const std::shared_ptr<Tens
     CUDA_CHECK(cudaStreamWaitEvent(comm_stream, ready_event, 0));
 
     NCCL_CHECK(ncclReduceScatter(input->DataPtr(), output->DataPtr(), output->NumElements(),
-                                 kNcclDtypeMap.at(input->Dtype()), kNcclReduceOpMap.at(opts.reduce_op_type), comm,
-                                 comm_stream));
+                                 kNcclDtypeMap.at(input->Dtype()), kNcclReduceOpMap.at(reduce_op), comm, comm_stream));
 
     CUDA_CHECK(cudaEventRecord(done_event, comm_stream));
 
-    if (opts.async_op) {
+    if (async_op) {
         return std::move(work);
     } else {
         work->WaitNonBlocking();
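
In the NCCL implementation, async_op now flows in directly as a parameter and decides what the caller gets back: the collective is enqueued on the communication stream after it waits on the compute stream's ready event, a done event is recorded for the Work object, and then either the Work is returned immediately or the call blocks. Restated as a sketch (the comments are interpretation; only the uncommented structure appears in the hunks above):

CUDA_CHECK(cudaStreamWaitEvent(comm_stream, ready_event, 0));   // comm stream waits for pending compute-stream work
NCCL_CHECK(/* collective enqueued on comm_stream, e.g. ncclAllReduce(...) */);
CUDA_CHECK(cudaEventRecord(done_event, comm_stream));           // completion marker tracked by the Work object
if (async_op) {
    return std::move(work);      // hand the Work back immediately; caller overlaps compute and syncs later
} else {
    work->WaitNonBlocking();     // blocking path: wait here before returning
}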

infini_train/src/nn/parallel/reducer.cc

Lines changed: 1 addition & 1 deletion
@@ -419,7 +419,7 @@ void Reducer::FinalizeBucketDense(size_t bucket_index) {
         // FIXME(zbl): support custom hook later
         LOG(FATAL) << "Custom hook is not supported now";
     } else {
-        bucket.work = ddp_pg->AllReduce(bucket.contents, {function::ReduceOpType::kAvg, true});
+        bucket.work = ddp_pg->AllReduce(bucket.contents, function::ReduceOpType::kAvg, true);
     }
 }

infini_train/src/nn/parallel/tensor_parallel.cc

Lines changed: 3 additions & 3 deletions
@@ -103,7 +103,7 @@ std::shared_ptr<Tensor> Reduce(const std::shared_ptr<Tensor> &tensor) {
 
     auto output = std::make_shared<Tensor>(*tensor);
 
-    tp_group->AllReduce(output, {function::ReduceOpType::kSum, false});
+    tp_group->AllReduce(output, function::ReduceOpType::kSum, false);
     return output;
 }
 
@@ -125,7 +125,7 @@ std::shared_ptr<Tensor> ReduceScatterAlongFirstDim(const std::shared_ptr<Tensor>
 
     auto output = std::make_shared<Tensor>(output_shape, tensor->Dtype(), device);
 
-    tp_group->ReduceScatter(output, tensor, {function::ReduceOpType::kSum, false});
+    tp_group->ReduceScatter(output, tensor, function::ReduceOpType::kSum, false);
 
     return output;
 }
@@ -465,7 +465,7 @@ VocabParallelCrossEntropy::Forward(const std::vector<std::shared_ptr<Tensor>> &i
     auto local_max = logits_masked->Max(-1);
    auto global_max = local_max;
     if (tp_size > 1) {
-        tp_group->AllReduce(global_max, {function::ReduceOpType::kMax, false});
+        tp_group->AllReduce(global_max, function::ReduceOpType::kMax, false);
     }
     auto shifted = logits_masked->Sub(global_max->Unsqueeze(-1));
