diff --git a/mlperf_logging/compliance_checker/training_6.0.0/closed_common.yaml b/mlperf_logging/compliance_checker/training_6.0.0/closed_common.yaml
index c17d1432..7b5c4b12 100755
--- a/mlperf_logging/compliance_checker/training_6.0.0/closed_common.yaml
+++ b/mlperf_logging/compliance_checker/training_6.0.0/closed_common.yaml
@@ -2,7 +2,7 @@
 - KEY:
     NAME:  submission_benchmark
     REQ:   EXACTLY_ONE
-    CHECK: " v['value'] in ['flux1', 'dlrm_dcnv2', 'llama31_8b', 'llama2_70b_lora', 'llama31_405b'] "
+    CHECK: " v['value'] in ['flux1', 'dlrm_dcnv2', 'llama31_8b', 'llama2_70b_lora', 'llama31_405b', 'deepseek_v3_671b'] "
     POST:  " enqueue_config('training_6.0.0/closed_{}.yaml'.format(v['value'])) "
 
 - KEY:
diff --git a/mlperf_logging/compliance_checker/training_6.0.0/closed_deepseek_v3_671b.yaml b/mlperf_logging/compliance_checker/training_6.0.0/closed_deepseek_v3_671b.yaml
new file mode 100644
index 00000000..07059d7e
--- /dev/null
+++ b/mlperf_logging/compliance_checker/training_6.0.0/closed_deepseek_v3_671b.yaml
@@ -0,0 +1,74 @@
+- KEY:
+    NAME: global_batch_size
+    REQ: EXACTLY_ONE
+    POST: >
+        s['global_batch_size'] = v['value']
+
+- KEY:
+    NAME: max_sequence_length
+    REQ: EXACTLY_ONE
+    CHECK: " v['value'] == 4096 "
+
+- KEY:
+    NAME: opt_name
+    REQ: EXACTLY_ONE
+    CHECK: " v['value'] == 'adamw' "
+
+- KEY:
+    NAME: opt_base_learning_rate
+    REQ: EXACTLY_ONE
+
+- KEY:
+    NAME: opt_learning_rate_warmup_steps
+    REQ: EXACTLY_ONE
+
+- KEY:
+    NAME: opt_learning_rate_decay_steps
+    REQ: EXACTLY_ONE
+
+- KEY:
+    NAME: opt_learning_rate_decay_schedule
+    REQ: EXACTLY_ONE
+    CHECK: " v['value'] == 'cosine with linear warmup' "
+
+- KEY:
+    NAME: opt_adamw_beta_1
+    REQ: EXACTLY_ONE
+    CHECK: " v['value'] == 0.9 "
+
+- KEY:
+    NAME: opt_adamw_beta_2
+    REQ: EXACTLY_ONE
+    CHECK: " v['value'] == 0.95 "
+
+- KEY:
+    NAME: opt_adamw_epsilon
+    REQ: EXACTLY_ONE
+    CHECK: " v['value'] == 1e-08 "
+
+- KEY:
+    NAME: opt_adamw_weight_decay
+    REQ: EXACTLY_ONE
+    CHECK: " v['value'] == 0.1 "
+
+- KEY:
+    NAME: opt_gradient_clip_norm
+    REQ: EXACTLY_ONE
+    CHECK: " v['value'] == 1.0 "
+
+- KEY:
+    NAME: gradient_accumulation_steps
+    REQ: EXACTLY_ONE
+    CHECK: " v['value'] > 0 "
+
+- KEY:
+    NAME: eval_samples
+    REQ: EXACTLY_ONE
+    CHECK: " v['value'] == 1024 "
+
+- KEY:
+    NAME: eval_accuracy
+    REQ: AT_LEAST_ONE
+    CHECK:
+        - "'samples_count' in v['metadata']"
+    ATLEAST_ONE_CHECK: "(v['value'] <= 2.7) and v['value'] > 0.0" # TODO(dfridman): Update this once we have the exact value
diff --git a/mlperf_logging/compliance_checker/training_6.0.0/open_common.yaml b/mlperf_logging/compliance_checker/training_6.0.0/open_common.yaml
index ab82d076..8b0b43a6 100644
--- a/mlperf_logging/compliance_checker/training_6.0.0/open_common.yaml
+++ b/mlperf_logging/compliance_checker/training_6.0.0/open_common.yaml
@@ -2,5 +2,5 @@
 - KEY:
     NAME:  submission_benchmark
     REQ:   EXACTLY_ONE
-    CHECK: " v['value'] in ['flux1', 'dlrm_dcnv2', 'llama31_8b', 'llama2_70b_lora', 'llama31_405b'] "
+    CHECK: " v['value'] in ['flux1', 'dlrm_dcnv2', 'llama31_8b', 'llama2_70b_lora', 'llama31_405b', 'deepseek_v3_671b'] "
     POST:  " enqueue_config('training_6.0.0/open_{}.yaml'.format(v['value'])) "
diff --git a/mlperf_logging/compliance_checker/training_6.0.0/open_deepseek_v3_671b.yaml b/mlperf_logging/compliance_checker/training_6.0.0/open_deepseek_v3_671b.yaml
new file mode 100644
index 00000000..a9f73830
--- /dev/null
+++ b/mlperf_logging/compliance_checker/training_6.0.0/open_deepseek_v3_671b.yaml
@@ -0,0 +1,65 @@
+- KEY:
+    NAME: global_batch_size
+    REQ: EXACTLY_ONE
+    POST: >
+        s['global_batch_size'] = v['value']
+
+- KEY:
+    NAME: max_sequence_length
+    REQ: EXACTLY_ONE
+    CHECK: " v['value'] == 4096 "
+
+- KEY:
+    NAME: opt_name
+    REQ: EXACTLY_ONE
+    CHECK: " v['value'] == 'adamw' "
+
+- KEY:
+    NAME: opt_base_learning_rate
+    REQ: EXACTLY_ONE
+
+- KEY:
+    NAME: opt_learning_rate_warmup_steps
+    REQ: EXACTLY_ONE
+
+- KEY:
+    NAME: opt_learning_rate_decay_schedule
+    REQ: EXACTLY_ONE
+
+- KEY:
+    NAME: opt_adamw_beta_1
+    REQ: EXACTLY_ONE
+
+- KEY:
+    NAME: opt_adamw_beta_2
+    REQ: EXACTLY_ONE
+
+- KEY:
+    NAME: opt_adamw_epsilon
+    REQ: EXACTLY_ONE
+
+- KEY:
+    NAME: opt_adamw_weight_decay
+    REQ: EXACTLY_ONE
+
+- KEY:
+    NAME: opt_gradient_clip_norm
+    REQ: EXACTLY_ONE
+
+- KEY:
+    NAME: gradient_accumulation_steps
+    REQ: EXACTLY_ONE
+    CHECK: " v['value'] > 0 "
+
+- KEY:
+    NAME: eval_samples
+    REQ: EXACTLY_ONE
+    CHECK: " v['value'] == 1024 "
+
+- KEY:
+    NAME: eval_accuracy
+    REQ: AT_LEAST_ONE
+    CHECK:
+        - "'epoch_num' in v['metadata']"
+    ATLEAST_ONE_CHECK: "(v['value'] <= 2.7) and v['value'] > 0.0" # TODO(dfridman): Update this once we have the exact value
+
diff --git a/mlperf_logging/mllog/constants.py b/mlperf_logging/mllog/constants.py
index 57972a6f..3016f0a3 100644
--- a/mlperf_logging/mllog/constants.py
+++ b/mlperf_logging/mllog/constants.py
@@ -57,6 +57,7 @@
 LLAMA31_405B = "llama31_405b"
 LLAMA31_8B = "llama31_8b"
 FLUX1 = "flux1"
+DEEPSEEK_V3_671B = "deepseek_v3_671b"
 
 # Constant values - model info
 ADAGRAD = "adagrad"
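
For reviewers, here is a minimal sketch (not part of this diff) of a log producer that would satisfy the new `closed_deepseek_v3_671b.yaml` rules. The `mllog.get_mllogger()` / `mllogger.event()` calls and the `constants` module are the package's existing API; the string keys mirror the `NAME` fields checked above, and the `eval_accuracy` value is an arbitrary in-range placeholder, not a real measurement.

```python
# Sketch only: emit events that the new closed-division checks would accept.
from mlperf_logging import mllog
from mlperf_logging.mllog import constants

mllogger = mllog.get_mllogger()

# Benchmark name uses the constant added to mllog/constants.py in this diff.
mllogger.event(key=constants.SUBMISSION_BENCHMARK,
               value=constants.DEEPSEEK_V3_671B)

# Values pinned by the closed_deepseek_v3_671b.yaml CHECK expressions.
mllogger.event(key="opt_name", value="adamw")
mllogger.event(key="opt_adamw_beta_1", value=0.9)
mllogger.event(key="opt_adamw_beta_2", value=0.95)
mllogger.event(key="opt_adamw_epsilon", value=1e-08)
mllogger.event(key="opt_adamw_weight_decay", value=0.1)
mllogger.event(key="opt_gradient_clip_norm", value=1.0)
mllogger.event(key="max_sequence_length", value=4096)
mllogger.event(key="eval_samples", value=1024)

# eval_accuracy must carry 'samples_count' metadata (closed division) and
# fall in (0.0, 2.7]; 2.5 is a placeholder value.
mllogger.event(key="eval_accuracy", value=2.5,
               metadata={"samples_count": 1024})
```

Assuming the checker's existing entry point, a log containing these events could then be validated against the new rules with something like `python3 -m mlperf_logging.compliance_checker --usage training --ruleset 6.0.0 <logfile>` once the 6.0.0 ruleset ships.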