diff --git a/mlperf_logging/benchmark_meta.py b/mlperf_logging/benchmark_meta.py index cf7c3e3b..593c91d7 100644 --- a/mlperf_logging/benchmark_meta.py +++ b/mlperf_logging/benchmark_meta.py @@ -155,7 +155,14 @@ 'llama2_70b_lora', 'rgat', 'llama31_405b' - ] + ], + '6.0': [ + 'llama31_8b', + 'dlrm_dcnv2', + 'flux1', + 'llama2_70b_lora', + 'llama31_405b' + ] }, 'hpc': { diff --git a/mlperf_logging/compliance_checker/README.md b/mlperf_logging/compliance_checker/README.md index 48c6ed52..df66f132 100644 --- a/mlperf_logging/compliance_checker/README.md +++ b/mlperf_logging/compliance_checker/README.md @@ -10,9 +10,9 @@ To check a log file for compliance: python -m mlperf_logging.compliance_checker [--config YAML] [--usage training/hpc] [--ruleset MLPERF_EDITION] FILENAME -By default, 5.1.0 training edition rules are used and the default config is set to `5.1.0/common.yaml`. +By default, 6.0.0 training edition rules are used and the default config is set to `6.0.0/common.yaml`. This config will check all common keys and enqueue benchmark specific config to be checked as well. -Old training editions, still supported are 5.0.0, 4.1.0, 4.0.0, 3.1.0, 3.0.0, 2.1.0, 2.0.0, 1.1.0, 1.0.0, 0.7.0 and 0.6.0 +Old training editions, still supported are 5.1.0, 5.0.0, 4.1.0, 4.0.0, 3.1.0, 3.0.0, 2.1.0, 2.0.0, 1.1.0, 1.0.0, 0.7.0 and 0.6.0 To check hpc compliance rules (only 1.0.0 ruleset is supported), set --usage hpc --ruleset 1.0.0. @@ -22,23 +22,19 @@ As log examples use [NVIDIA's training logs](https://github.com/mlperf/training_ ### Existing config files for training submissions - 5.1.0/common.yaml - currently the default config file, checks common fields complience and equeues benchmark-specific config file - 5.1.0/closed_common.yaml - the common rules file for closed submissions. These rules apply to all benchmarks - 5.1.0/open_common.yaml - the common rules file for open submissions. 
These rules apply to all benchmarks - 5.1.0/closed_retinanet.yaml - Per-benchmark rules, closed submissions. - 5.1.0/closed_llama31_8b.yaml - 5.1.0/closed_llama31_405b.yaml - 5.1.0/closed_dlrm_dcnv2.yaml - 5.1.0/closed_rgat.yaml - 5.1.0/closed_llama2_70b_lora.yaml - 5.1.0/closed_flux1.yaml - 5.1.0/open_retinanet.yaml - Per-benchmark rules, open submissions. - 5.1.0/open_llama31_8b.yaml - 5.1.0/open_llama31_405b.yaml - 5.1.0/open_dlrm_dcnv2.yaml - 5.1.0/open_rgat.yaml - 5.1.0/open_llama2_70b_lora.yaml - 5.1.0/open_flux1.yaml + 6.0.0/common.yaml - currently the default config file, checks common fields compliance and enqueues benchmark-specific config file + 6.0.0/closed_common.yaml - the common rules file for closed submissions. These rules apply to all benchmarks + 6.0.0/open_common.yaml - the common rules file for open submissions. These rules apply to all benchmarks + 6.0.0/closed_llama31_8b.yaml + 6.0.0/closed_llama31_405b.yaml + 6.0.0/closed_dlrm_dcnv2.yaml + 6.0.0/closed_llama2_70b_lora.yaml + 6.0.0/closed_flux1.yaml + 6.0.0/open_llama31_8b.yaml + 6.0.0/open_llama31_405b.yaml + 6.0.0/open_dlrm_dcnv2.yaml + 6.0.0/open_llama2_70b_lora.yaml + 6.0.0/open_flux1.yaml ### Existing config files for HPC submissions diff --git a/mlperf_logging/compliance_checker/mlp_compliance.py b/mlperf_logging/compliance_checker/mlp_compliance.py index ad1f030d..23241c00 100644 --- a/mlperf_logging/compliance_checker/mlp_compliance.py +++ b/mlperf_logging/compliance_checker/mlp_compliance.py @@ -315,7 +315,7 @@ def get_parser(): parser.add_argument('--usage', type=str, default='training', choices=usage_choices(), help='what WG do the benchmarks come from') - parser.add_argument('--ruleset', type=str, default='5.1.0', + parser.add_argument('--ruleset', type=str, default='6.0.0', choices=rule_choices(), help='what version of rules to check the log against') parser.add_argument('--config', type=str, diff --git a/mlperf_logging/compliance_checker/mlp_parser/__init__.py 
b/mlperf_logging/compliance_checker/mlp_parser/__init__.py index 46a85b35..0f4d989a 100644 --- a/mlperf_logging/compliance_checker/mlp_parser/__init__.py +++ b/mlperf_logging/compliance_checker/mlp_parser/__init__.py @@ -10,6 +10,7 @@ from .ruleset_410 import parse_file as parse_file_410 from .ruleset_500 import parse_file as parse_file_500 from .ruleset_510 import parse_file as parse_file_510 +from .ruleset_600 import parse_file as parse_file_600 def parse_file(filename, ruleset='0.6.0'): if ruleset == '0.6.0': @@ -36,5 +37,7 @@ def parse_file(filename, ruleset='0.6.0'): return parse_file_500(filename) elif ruleset == '5.1.0': return parse_file_510(filename) + elif ruleset == '6.0.0': + return parse_file_600(filename) else: raise Exception(f'Ruleset "{ruleset}" is not supported') diff --git a/mlperf_logging/compliance_checker/mlp_parser/ruleset_600.py b/mlperf_logging/compliance_checker/mlp_parser/ruleset_600.py new file mode 100644 index 00000000..e30b08d2 --- /dev/null +++ b/mlperf_logging/compliance_checker/mlp_parser/ruleset_600.py @@ -0,0 +1,105 @@ +''' +Parses a text MLPerf log into a structured format. +''' + +from __future__ import print_function + +import collections +import json +import re +import sys +from dataclasses import dataclass + +from io import open + +@dataclass +class LogLine: + """Class for keeping track of an item in inventory.""" + full_string: str + timestamp: float + key: str + value: str + lineno: int + +TOKEN = ':::MLLOG ' + + +def parse_line(line): + if not line.startswith(TOKEN): + return None + + return json.loads(line[len(TOKEN):]) + + +def string_to_logline(lineno, string): + ''' Returns a LogLine or raises a ValueError ''' + m = parse_line(string) + + if m is None: + raise ValueError('does not match regex') + + args = [] + args.append(string) # full string + + ts = float(m['time_ms']) # may raise error, e.g. 
"1.2.3" + # TODO check for weird values + args.append(ts) + + args.append(m['key']) # key + + j = { 'value': m['value'], 'metadata': m['metadata'] } + args.append(j) + + args.append(lineno) + return LogLine(*args) + + +def parse_file(filename): + ''' Reads a file by name and returns list of loglines and list of errors''' + with open(filename, encoding='latin-1') as f: + return parse_generator(f) + + +def strip_and_dedup(gen): + lines = [] + for l in gen: + if TOKEN not in l: + continue + lines.append(re.sub(".*"+TOKEN, TOKEN, l)) + return lines + + + +def parse_generator(gen): + ''' Reads a generator of lines and returns (loglines, errors) + The list of errors are any parsing issues as a tuple (str_line, error_msg) + ''' + loglines = [] + failed = [] + for lineno, line in enumerate(strip_and_dedup(gen)): + line = line.strip() + try: + ll = string_to_logline(lineno, line) + loglines.append(ll) + except ValueError as e: + failed.append((line, str(e))) + return loglines, failed + + +if __name__ == '__main__': + if len(sys.argv) != 2: + print('usage: mlp_parser.py FILENAME') + print(' tests parsing on the file.') + sys.exit(1) + + filename = sys.argv[1] + lines, errors = parse_file(filename) + + print('Parsed {} log lines with {} errors.'.format(len(lines), len(errors))) + + if len(errors) > 0: + print('Lines which failed to parse:') + for line, error in errors: + print(' Following line failed: {}'.format(error)) + print(line) + diff --git a/mlperf_logging/compliance_checker/training_6.0.0/closed_common.yaml b/mlperf_logging/compliance_checker/training_6.0.0/closed_common.yaml new file mode 100755 index 00000000..c17d1432 --- /dev/null +++ b/mlperf_logging/compliance_checker/training_6.0.0/closed_common.yaml @@ -0,0 +1,11 @@ + +- KEY: + NAME: submission_benchmark + REQ: EXACTLY_ONE + CHECK: " v['value'] in ['flux1', 'dlrm_dcnv2', 'llama31_8b', 'llama2_70b_lora', 'llama31_405b'] " + POST: " enqueue_config('training_6.0.0/closed_{}.yaml'.format(v['value'])) " + +- KEY: + 
NAME: gradient_accumulation_steps + REQ: EXACTLY_ONE + CHECK: " v['value'] > 0 " diff --git a/mlperf_logging/compliance_checker/training_6.0.0/closed_dlrm_dcnv2.yaml b/mlperf_logging/compliance_checker/training_6.0.0/closed_dlrm_dcnv2.yaml new file mode 100644 index 00000000..45344bd2 --- /dev/null +++ b/mlperf_logging/compliance_checker/training_6.0.0/closed_dlrm_dcnv2.yaml @@ -0,0 +1,59 @@ +- KEY: + NAME: global_batch_size + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_name + REQ: EXACTLY_ONE + CHECK: " v['value'] == 'adagrad' " + +- KEY: + NAME: opt_base_learning_rate + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_adagrad_learning_rate_decay + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0 " + +- KEY: + NAME: opt_weight_decay + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0 " + +- KEY: + NAME: opt_adagrad_initial_accumulator_value + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0 " + +- KEY: + NAME: opt_adagrad_epsilon + REQ: EXACTLY_ONE + CHECK: " v['value'] == 1e-8 " + +- KEY: + NAME: opt_learning_rate_warmup_steps + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0 " + +- KEY: + NAME: opt_learning_rate_decay_start_step + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0 " + +- KEY: + NAME: opt_learning_rate_decay_steps + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0 " + +- KEY: + NAME: eval_accuracy + REQ: AT_LEAST_ONE + CHECK: + - "'epoch_num' in v['metadata']" + ATLEAST_ONE_CHECK: "v['value'] >= 0.80275 and v['value'] <= 1.0" + +- KEY: + NAME: eval_samples + REQ: EXACTLY_ONE + CHECK: " v['value'] == 89137319 " diff --git a/mlperf_logging/compliance_checker/training_6.0.0/closed_flux1.yaml b/mlperf_logging/compliance_checker/training_6.0.0/closed_flux1.yaml new file mode 100644 index 00000000..49f60bdb --- /dev/null +++ b/mlperf_logging/compliance_checker/training_6.0.0/closed_flux1.yaml @@ -0,0 +1,56 @@ +- KEY: + NAME: global_batch_size + REQ: AT_LEAST_ONE + CHECK: " v['value'] >= 0 " + +- KEY: + NAME: evaluation_frequency + REQ: EXACTLY_ONE + CHECK: " v['value'] == 262144" + +- KEY: + NAME: 
opt_name + REQ: EXACTLY_ONE + CHECK: " v['value'] == 'adamw' " + +- KEY: + NAME: opt_adamw_beta_1 + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0.9 " + +- KEY: + NAME: opt_adamw_beta_2 + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0.95 " + +- KEY: + NAME: opt_adamw_epsilon + REQ: EXACTLY_ONE + CHECK: " v['value'] == 1e-08 " + +- KEY: + NAME: opt_adamw_weight_decay + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0.1 " + +- KEY: + NAME: opt_base_learning_rate + REQ: EXACTLY_ONE + CHECK: " v['value'] >= 0.0 " + +- KEY: + NAME: opt_learning_rate_warmup_steps + REQ: EXACTLY_ONE + CHECK: " v['value'] >= 0 " + +- KEY: + NAME: opt_gradient_clip_norm + REQ: EXACTLY_ONE + CHECK: " v['value'] == 1.0 " + +- KEY: + NAME: eval_accuracy + REQ: AT_LEAST_ONE + CHECK: + - "'samples_count' in v['metadata']" + ATLEAST_ONE_CHECK: "v['value'] <= 0.586 and v['value'] > 0.0" diff --git a/mlperf_logging/compliance_checker/training_6.0.0/closed_llama2_70b_lora.yaml b/mlperf_logging/compliance_checker/training_6.0.0/closed_llama2_70b_lora.yaml new file mode 100755 index 00000000..46de03ef --- /dev/null +++ b/mlperf_logging/compliance_checker/training_6.0.0/closed_llama2_70b_lora.yaml @@ -0,0 +1,42 @@ +- KEY: + NAME: global_batch_size + REQ: EXACTLY_ONE + POST: > + s['global_batch_size'] = v['value'] + +- KEY: + NAME: opt_base_learning_rate + REQ: EXACTLY_ONE + + +- KEY: + NAME: opt_learning_rate_training_steps + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_gradient_clip_norm + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_adamw_weight_decay + REQ: EXACTLY_ONE + +- KEY: + NAME: gradient_accumulation_steps + REQ: EXACTLY_ONE + +- KEY: + NAME: lora_alpha + REQ: EXACTLY_ONE + +- KEY: + NAME: lora_rank + REQ: EXACTLY_ONE + CHECK: " v['value'] == 16" + +- KEY: + NAME: eval_accuracy + REQ: AT_LEAST_ONE + CHECK: + - "'samples_count' in v['metadata']" + ATLEAST_ONE_CHECK: "(v['value'] <= 0.925) and v['value'] > 0.0" diff --git a/mlperf_logging/compliance_checker/training_6.0.0/closed_llama31_405b.yaml 
b/mlperf_logging/compliance_checker/training_6.0.0/closed_llama31_405b.yaml new file mode 100644 index 00000000..90e2d45a --- /dev/null +++ b/mlperf_logging/compliance_checker/training_6.0.0/closed_llama31_405b.yaml @@ -0,0 +1,88 @@ +- KEY: + NAME: global_batch_size + REQ: EXACTLY_ONE + POST: > + s['global_batch_size'] = v['value'] + +- KEY: + NAME: max_sequence_length + REQ: EXACTLY_ONE + CHECK: " v['value'] == 8192 " + +- KEY: + NAME: opt_name + REQ: EXACTLY_ONE + CHECK: " v['value'] == 'adamw' " + +- KEY: + NAME: opt_base_learning_rate + REQ: EXACTLY_ONE + CHECK: " v['value'] * 1152 == s['global_batch_size'] * 8e-5 " + +- KEY: + NAME: opt_end_learning_rate + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_learning_rate_warmup_steps + REQ: EXACTLY_ONE + POST: > + s['opt_learning_rate_warmup_steps'] = math.ceil(8000 * 1152 / s['global_batch_size'] ) + +- KEY: + NAME: opt_learning_rate_decay_steps + REQ: EXACTLY_ONE + CHECK: " v['value'] == math.ceil(1_200_000 * 1152 / s['global_batch_size'] ) - s['opt_learning_rate_warmup_steps'] " + +- KEY: + NAME: opt_learning_rate_decay_schedule + REQ: EXACTLY_ONE + CHECK: " v['value'] == 'cosine with linear warmup' " + +- KEY: + NAME: opt_adamw_beta_1 + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0.9 " + +- KEY: + NAME: opt_adamw_beta_2 + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0.95 " + +- KEY: + NAME: opt_adamw_epsilon + REQ: EXACTLY_ONE + CHECK: " v['value'] == 1e-05 " + +- KEY: + NAME: opt_adamw_weight_decay + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0.1 " + +- KEY: + NAME: opt_gradient_clip_norm + REQ: EXACTLY_ONE + CHECK: " v['value'] == 1.0 " + +- KEY: + NAME: gradient_accumulation_steps + REQ: EXACTLY_ONE + CHECK: " v['value'] > 0 " + +- KEY: + NAME: eval_samples + REQ: EXACTLY_ONE + CHECK: " v['value'] == 5760 " + +- KEY: + NAME: eval_accuracy + REQ: AT_LEAST_ONE + CHECK: + - "'samples_count' in v['metadata']" + ATLEAST_ONE_CHECK: "(v['value'] <= 5.6) and v['value'] > 0.0" + +- KEY: + NAME: init_checkpoint_step + REQ: 
EXACTLY_ONE + CHECK: " v['value'] == 0 " + diff --git a/mlperf_logging/compliance_checker/training_6.0.0/closed_llama31_8b.yaml b/mlperf_logging/compliance_checker/training_6.0.0/closed_llama31_8b.yaml new file mode 100644 index 00000000..d12bf9c8 --- /dev/null +++ b/mlperf_logging/compliance_checker/training_6.0.0/closed_llama31_8b.yaml @@ -0,0 +1,87 @@ +- KEY: + NAME: global_batch_size + REQ: EXACTLY_ONE + POST: > + s['global_batch_size'] = v['value'] + +- KEY: + NAME: max_sequence_length + REQ: EXACTLY_ONE + CHECK: " v['value'] == 8192 " + +- KEY: + NAME: opt_name + REQ: EXACTLY_ONE + CHECK: " v['value'] == 'adamw' " + +- KEY: + NAME: opt_base_learning_rate + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_end_learning_rate + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_learning_rate_warmup_steps + REQ: EXACTLY_ONE + POST: > + s['opt_learning_rate_warmup_steps'] = v['value'] + +- KEY: + NAME: opt_learning_rate_decay_steps + REQ: EXACTLY_ONE + CHECK: " v['value'] == 1_200_000 - s['opt_learning_rate_warmup_steps'] " + +- KEY: + NAME: opt_learning_rate_decay_schedule + REQ: EXACTLY_ONE + CHECK: " v['value'] == 'cosine with linear warmup' " + +- KEY: + NAME: opt_adamw_beta_1 + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0.9 " + +- KEY: + NAME: opt_adamw_beta_2 + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0.95 " + +- KEY: + NAME: opt_adamw_epsilon + REQ: EXACTLY_ONE + CHECK: " v['value'] == 1e-05 " + +- KEY: + NAME: opt_adamw_weight_decay + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0.1 " + +- KEY: + NAME: opt_gradient_clip_norm + REQ: EXACTLY_ONE + CHECK: " v['value'] == 1.0 " + +- KEY: + NAME: gradient_accumulation_steps + REQ: EXACTLY_ONE + CHECK: " v['value'] > 0 " + +- KEY: + NAME: eval_samples + REQ: EXACTLY_ONE + CHECK: " v['value'] == 1024 " + +- KEY: + NAME: eval_accuracy + REQ: AT_LEAST_ONE + CHECK: + - "'samples_count' in v['metadata']" + ATLEAST_ONE_CHECK: "(v['value'] <= 3.3) and v['value'] > 0.0" + +- KEY: + NAME: max_steps + REQ: EXACTLY_ONE + CHECK: " v['value'] == 
1200000 " + diff --git a/mlperf_logging/compliance_checker/training_6.0.0/common.yaml b/mlperf_logging/compliance_checker/training_6.0.0/common.yaml new file mode 100755 index 00000000..43b410ac --- /dev/null +++ b/mlperf_logging/compliance_checker/training_6.0.0/common.yaml @@ -0,0 +1,146 @@ +# This file lists all the KEYs to be checked. Every line that matches mlperf logging regex (::MLL...) will be checked against these rules. +# In the order of the appearance in the log, for each line will execute the code specified under CHECK for the KEY in that line. +# The code will be launched using local state 'v' which is the content of value field in log line, and global state 's'. +# Global state 's' exists to allow cross-line checks, like start/stop pairs etc. To initialize 's' use BEGIN record which CODE will +# be executed before any checks. +# In addition, occurrence of each key will be counted and at the end if a requirement regarding the number of occurrences is defined it will +# be confirmed. This could be implemented using global state, but since this is a common thing to do it is natively supported. 
+# +# KEY record: +# NAME +# REQ - optional - {EXACTLY_ONE, AT_LEAST_ONE} +# PRE - optional - code to be executed before CHECK +# CHECK - optional - expression to be evaluated to verify correctness +# POST - optional - code to be executed after CHECK + +- BEGIN: + CODE: > + s.update({ + 'init_started': False, + 'init_stopped' : False, + 'run_started' : False, + 'run_stopped' : False, + 'in_epoch' : False, + 'last_epoch' : 0, + 'in_block' : False, + 'block_first_epoch' : -1, + 'first_init_start': 9e99, + 'compile_time_mins': 0, + }) + +- KEY: + NAME: submission_org + REQ: EXACTLY_ONE + CHECK: " v['value'] != '' " + +- KEY: + NAME: submission_platform + REQ: EXACTLY_ONE + CHECK: " v['value'] != '' " + +- KEY: + NAME: submission_division + REQ: EXACTLY_ONE + CHECK: " v['value'] in ['closed', 'open'] " + POST: " enqueue_config('training_6.0.0/{}_common.yaml'.format(v['value'])); s['compile_time_mins'] = 240 if v['value'] == 'open' else 30 " + +# at least one record should be found, but any found records must pass the test +- KEY: + NAME: cache_clear + REQ: AT_LEAST_ONE + CHECK: + - "'value' in v" + +# frequency not checked +- KEY: + NAME: init_start + REQ: AT_LEAST_ONE + CHECK: + - "not s['init_stopped']" + - "not s['run_started']" + POST: " s['init_started'] = True; s['first_init_start']=min(s['first_init_start'], ll.timestamp) " + +# confirm less than 20min since the very first init_start +- KEY: + NAME: init_stop + REQ: EXACTLY_ONE + CHECK: + - "s['init_started']" + - "not s['run_started']" + - "ll.timestamp - s['first_init_start'] < (s['compile_time_mins']*60*1e3)" + POST: " s['init_stopped'] = True" + +- KEY: + NAME: run_start + REQ: EXACTLY_ONE + CHECK: " ( s['init_stopped'] == True )" + POST: " s['run_started'] = True " + +# status can also be aborted, but not allowing it here for now +# if eval is inside epoch and we decide to terminate, we can lack epoch_stop, it is ok +- KEY: + NAME: run_stop + REQ: EXACTLY_ONE + CHECK: + - "s['run_started']" + - "'status' in 
v['metadata']" + POST: " s['run_stopped'] = True " + +# FIXME: check epoch_count value match +- KEY: + NAME: block_start + REQ: AT_LEAST_ONE_OR(epoch_start) + CHECK: + - "s['run_started']" + - "('epoch_count' in v['metadata']) | ('samples_count' in v['metadata'])" + - "'first_epoch_num' in v['metadata'] if 'epoch_count' in v['metadata'] else True" + - "v['metadata']['epoch_count'] > 0 if 'epoch_count' in v['metadata'] else True" + - "v['metadata']['samples_count'] >= 0 if 'samples_count' in v['metadata'] else True" + +- KEY: + NAME: block_stop + REQ: AT_LEAST_ONE_OR(epoch_stop) + CHECK: + - "('first_epoch_num' in v['metadata']) | ('samples_count' in v['metadata'])" + +- KEY: + NAME: epoch_start + REQ: AT_LEAST_ONE_OR(block_start) + CHECK: + - "('epoch_num' in v['metadata']) | ('samples_count' in v['metadata'])" + +- KEY: + NAME: epoch_stop + REQ: AT_LEAST_ONE_OR(block_stop) + CHECK: + - "('epoch_num' in v['metadata']) | ('samples_count' in v['metadata'])" + +# making sure previous eval did print it's accuracy result +- KEY: + NAME: eval_start + REQ: AT_LEAST_ONE_OR(block_start) + CHECK: + - "('epoch_num' in v['metadata']) | ('samples_count' in v['metadata'])" + +- KEY: + NAME: eval_stop + REQ: AT_LEAST_ONE_OR(block_stop) + CHECK: + - "('epoch_num' in v['metadata']) | ('samples_count' in v['metadata'])" + +- KEY: + NAME: eval_accuracy + REQ: AT_LEAST_ONE + CHECK: + - "('epoch_num' in v['metadata']) | ('samples_count' in v['metadata'])" + +- KEY: + NAME: train_samples + REQ: EXACTLY_ONE + CHECK: " v['value'] != '' " + +- KEY: + NAME: eval_samples + REQ: EXACTLY_ONE + CHECK: " v['value'] != '' " + diff --git a/mlperf_logging/compliance_checker/training_6.0.0/open_common.yaml b/mlperf_logging/compliance_checker/training_6.0.0/open_common.yaml new file mode 100644 index 00000000..ab82d076 --- /dev/null +++ b/mlperf_logging/compliance_checker/training_6.0.0/open_common.yaml @@ -0,0 +1,6 @@ + +- KEY: + NAME: submission_benchmark + REQ: EXACTLY_ONE + CHECK: " v['value'] in 
['flux1', 'dlrm_dcnv2', 'llama31_8b', 'llama2_70b_lora', 'llama31_405b'] " + POST: " enqueue_config('training_6.0.0/open_{}.yaml'.format(v['value'])) " diff --git a/mlperf_logging/compliance_checker/training_6.0.0/open_dlrm_dcnv2.yaml b/mlperf_logging/compliance_checker/training_6.0.0/open_dlrm_dcnv2.yaml new file mode 100644 index 00000000..7f70c0c3 --- /dev/null +++ b/mlperf_logging/compliance_checker/training_6.0.0/open_dlrm_dcnv2.yaml @@ -0,0 +1,7 @@ + +- KEY: + NAME: eval_accuracy + REQ: AT_LEAST_ONE + CHECK: + - "'epoch_num' in v['metadata']" + ATLEAST_ONE_CHECK: "v['value'] <= 1.0" diff --git a/mlperf_logging/compliance_checker/training_6.0.0/open_flux1.yaml b/mlperf_logging/compliance_checker/training_6.0.0/open_flux1.yaml new file mode 100644 index 00000000..19ee8dea --- /dev/null +++ b/mlperf_logging/compliance_checker/training_6.0.0/open_flux1.yaml @@ -0,0 +1,13 @@ +# Stable diffusion uses two metrics, FID and CLIP. +# These metrics can be calculated offline, using different scripts +# and logged separately. 
Therefore, we create a virtual key +# called aggregated_eval_accuracy, which aggregates +# both metrics into a single log line + +# TODO: Update with official metric name +- KEY: + NAME: averaged_validation_loss + REQ: AT_LEAST_ONE + CHECK: + - "'samples_count' in v['metadata']" + ATLEAST_ONE_CHECK: "v['value'] <= 0.586 and v['value'] > 0.0" diff --git a/mlperf_logging/compliance_checker/training_6.0.0/open_llama2_70b_lora.yaml b/mlperf_logging/compliance_checker/training_6.0.0/open_llama2_70b_lora.yaml new file mode 100755 index 00000000..14c4176d --- /dev/null +++ b/mlperf_logging/compliance_checker/training_6.0.0/open_llama2_70b_lora.yaml @@ -0,0 +1,7 @@ + +- KEY: + NAME: eval_accuracy + REQ: AT_LEAST_ONE + CHECK: + - "'epoch_num' in v['metadata']" + ATLEAST_ONE_CHECK: "v['value'] < 1.0" diff --git a/mlperf_logging/compliance_checker/training_6.0.0/open_llama31_405b.yaml b/mlperf_logging/compliance_checker/training_6.0.0/open_llama31_405b.yaml new file mode 100644 index 00000000..0a29e8b2 --- /dev/null +++ b/mlperf_logging/compliance_checker/training_6.0.0/open_llama31_405b.yaml @@ -0,0 +1,78 @@ +- KEY: + NAME: global_batch_size + REQ: EXACTLY_ONE + POST: > + s['global_batch_size'] = v['value'] + +- KEY: + NAME: max_sequence_length + REQ: EXACTLY_ONE + CHECK: " v['value'] == 8192 " + +- KEY: + NAME: opt_name + REQ: EXACTLY_ONE + CHECK: " v['value'] == 'adamw' " + +- KEY: + NAME: opt_base_learning_rate + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_end_learning_rate + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_learning_rate_decay_steps + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_learning_rate_warmup_steps + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_learning_rate_decay_schedule + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_adamw_beta_1 + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_adamw_beta_2 + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_adamw_epsilon + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_adamw_weight_decay + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_gradient_clip_norm + REQ: EXACTLY_ONE + +- 
KEY: + NAME: gradient_accumulation_steps + REQ: EXACTLY_ONE + CHECK: " v['value'] > 0 " + +- KEY: + NAME: eval_samples + REQ: EXACTLY_ONE + CHECK: " v['value'] == 5760 " + +- KEY: + NAME: eval_accuracy + REQ: AT_LEAST_ONE + CHECK: + - "'epoch_num' in v['metadata']" + ATLEAST_ONE_CHECK: "(v['value'] <= 5.6) and v['value'] > 0.0" + +- KEY: + NAME: init_checkpoint_step + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0 " + diff --git a/mlperf_logging/compliance_checker/training_6.0.0/open_llama31_8b.yaml b/mlperf_logging/compliance_checker/training_6.0.0/open_llama31_8b.yaml new file mode 100644 index 00000000..ff3f2043 --- /dev/null +++ b/mlperf_logging/compliance_checker/training_6.0.0/open_llama31_8b.yaml @@ -0,0 +1,8 @@ + +# TODO: Update with official compliance requirements +- KEY: + NAME: eval_accuracy + REQ: AT_LEAST_ONE + CHECK: + - "'epoch_num' in v['metadata']" + ATLEAST_ONE_CHECK: "v['value'] < 1.0" diff --git a/mlperf_logging/package_checker/README.md b/mlperf_logging/package_checker/README.md index b084d875..6d6e92d2 100644 --- a/mlperf_logging/package_checker/README.md +++ b/mlperf_logging/package_checker/README.md @@ -10,7 +10,7 @@ To check an organization's submission package for compliance: python3 -m mlperf_logging.package_checker FOLDER USAGE RULESET ``` -Currently, USAGE in ["training"] and RULESET in ["0.6.0", "0.7.0", "1.0.0", "1.1.0", "2.0.0", "2.1.0", "3.0.0", "3.1.0", "4.0.0", "4.1.0", "5.0.0", "5.1.0"] are supported. +Currently, USAGE in ["training"] and RULESET in ["0.6.0", "0.7.0", "1.0.0", "1.1.0", "2.0.0", "2.1.0", "3.0.0", "3.1.0", "4.0.0", "4.1.0", "5.0.0", "5.1.0", "6.0.0"] are supported. The package checker checks: 1. The number of result files for each benchmark matches the required count. 
If diff --git a/mlperf_logging/package_checker/package_checker.py b/mlperf_logging/package_checker/package_checker.py index 12f10ffb..45eecb8a 100644 --- a/mlperf_logging/package_checker/package_checker.py +++ b/mlperf_logging/package_checker/package_checker.py @@ -191,14 +191,14 @@ def check_training_result_files(folder, usage, ruleset, quiet, werror, logging.error(" %d files do not comply, directory cannot be accepted", len(error_list)) # Check if each run use unique seeds. - if ruleset in {'1.0.0', '1.1.0', '2.0.0', '2.1.0', '3.0.0', '3.1.0', '4.0.0', '4.1.0', '5.0.0', '5.1.0'} and division == 'closed': + if ruleset in {'1.0.0', '1.1.0', '2.0.0', '2.1.0', '3.0.0', '3.1.0', '4.0.0', '4.1.0', '5.0.0', '5.1.0', '6.0.0'} and division == 'closed': seed_checker_bypass = (global_seed_checker_bypass or system_seed_checker_bypass or result_seed_checker_bypass) if not seed_checker.check_seeds(result_files, seed_checker_bypass): too_many_errors = True logging.error('Seed checker failed') # Run RCP checker for >= 1.0.0 - if ruleset in {'1.0.0', '1.1.0', '2.0.0', '2.1.0', '3.0.0', '3.1.0', '4.0.0', '4.1.0', '5.0.0', '5.1.0'} and division == 'closed' and benchmark != 'minigo': + if ruleset in {'1.0.0', '1.1.0', '2.0.0', '2.1.0', '3.0.0', '3.1.0', '4.0.0', '4.1.0', '5.0.0', '5.1.0', '6.0.0'} and division == 'closed' and benchmark != 'minigo': # Now go again through result files to do RCP checks rcp_bypass = (global_rcp_bypass or system_rcp_bypass or result_rcp_bypass) rcp_pass, rcp_msg, _ = rcp_checker.check_directory( @@ -252,7 +252,7 @@ def check_training_package(folder, usage, ruleset, quiet, werror, rcp_bypass, rc ruleset: The ruleset such as 0.6.0, 0.7.0, 1.0.0, etc. 
""" too_many_errors = False - if ruleset in {'1.0.0', '1.1.0', '2.0.0', '2.1.0', '3.0.0', '3.1.0', '4.0.0', '4.1.0', '5.0.0', '5.1.0'}: + if ruleset in {'1.0.0', '1.1.0', '2.0.0', '2.1.0', '3.0.0', '3.1.0', '4.0.0', '4.1.0', '5.0.0', '5.1.0', '6.0.0'}: logging.info(' Checking System Description Files') system_description_pass = check_systems(folder, usage, ruleset) too_many_errors = too_many_errors or not system_description_pass diff --git a/mlperf_logging/rcp_checker/README.md b/mlperf_logging/rcp_checker/README.md index 412b7605..966e49cc 100644 --- a/mlperf_logging/rcp_checker/README.md +++ b/mlperf_logging/rcp_checker/README.md @@ -8,10 +8,10 @@ Run Reference Convergence Point checks for a submission directory. This consists of testing whether a submission does not converge statistically faster than the reference. -For training, RCPs are loaded from directory mlperf_logging/rcp_checker/training_5.1.0/*.json +For training, RCPs are loaded from directory mlperf_logging/rcp_checker/training_6.0.0/*.json The RCP checker supports only the 1.0.0 version onwards. -The current training version is 5.1.0. +The current training version is 6.0.0. ## Usage diff --git a/mlperf_logging/rcp_checker/rcp_checker.py b/mlperf_logging/rcp_checker/rcp_checker.py index 55405eb7..cdb383f7 100644 --- a/mlperf_logging/rcp_checker/rcp_checker.py +++ b/mlperf_logging/rcp_checker/rcp_checker.py @@ -13,6 +13,31 @@ import scipy.stats import sys + +def is_version_at_least(version, min_version): + """ + Check if a version string is at least the minimum version. 
+ + Args: + version: Version string to check (e.g., "5.1.0") + min_version: Minimum version string (e.g., "5.0.0") + + Returns: + True if version >= min_version, False otherwise + + Example: + is_version_at_least("5.1.0", "5.0.0") # True + is_version_at_least("4.1.0", "5.0.0") # False + """ + version_parts = tuple(int(x) for x in version.split('.')) + min_parts = tuple(int(x) for x in min_version.split('.')) + # Pad shorter version with zeros for comparison + max_len = max(len(version_parts), len(min_parts)) + version_parts = version_parts + (0,) * (max_len - len(version_parts)) + min_parts = min_parts + (0,) * (max_len - len(min_parts)) + return version_parts >= min_parts + + # Number of submission runs for each benchmark # References need 2x of these runs # We use olympic scoring for statistics, so we reject @@ -166,8 +191,8 @@ def get_submission_epochs(result_files, ruleset, bert_train_samples): class RCP_Checker: def __init__(self, usage, ruleset, benchmark, verbose, rcp_file=None): - if ruleset not in {'1.0.0', "1.1.0", "2.0.0", "2.1.0", "3.0.0", "3.1.0", "4.0.0", "4.1.0", "5.0.0", "5.1.0"}: - raise Exception('RCP Checker only supported in 1.0.0, 1.1.0, 2.0.0, 2.1.0, 3.0.0, 3.1.0, 4.0.0, 4.1.0, 5.0.0 and 5.1.0') + if ruleset not in {'1.0.0', "1.1.0", "2.0.0", "2.1.0", "3.0.0", "3.1.0", "4.0.0", "4.1.0", "5.0.0", "5.1.0", "6.0.0"}: + raise Exception('RCP Checker only supported in 1.0.0, 1.1.0, 2.0.0, 2.1.0, 3.0.0, 3.1.0, 4.0.0, 4.1.0, 5.0.0, 5.1.0 and 6.0.0') self.usage = usage self.ruleset = ruleset self.benchmark = benchmark @@ -447,7 +472,7 @@ def _reset_results_scaling(self, results_dir): def _eval_submission_record(self, rcp_record, subm_epochs, results_dir): '''Compare reference and submission convergence.''' - if self.ruleset in ["5.0.0", "5.1.0"] and self.benchmark == "llama31_405b": + if is_version_at_least(self.ruleset, "5.0.0") and self.benchmark == "llama31_405b": rcp_record['Max Speedup'] = rcp_record['RCP Mean'] / (rcp_record['Min Epochs'] - 
46080) subm_epochs.sort() @@ -550,7 +575,7 @@ def get_parser(): parser.add_argument('--rcp_usage', type=str, default='training', choices=['training', 'hpc'], help='what WG does the benchmark come from to check the log against') - parser.add_argument('--rcp_version', type=str, default='5.1.0', + parser.add_argument('--rcp_version', type=str, default='6.0.0', help='what version of rules to check the log against') parser.add_argument('--verbose', action='store_true') parser.add_argument('--bert_train_samples', action='store_true', diff --git a/mlperf_logging/rcp_checker/training_5.1.0/rcps_flux1.json b/mlperf_logging/rcp_checker/training_5.1.0/rcps_flux1.json index c612b94b..5da82690 100644 --- a/mlperf_logging/rcp_checker/training_5.1.0/rcps_flux1.json +++ b/mlperf_logging/rcp_checker/training_5.1.0/rcps_flux1.json @@ -16,9 +16,9 @@ "opt_gradient_clip_norm": 1.0 }, "samples to converge": [ - 7340032, 7077888, 7077888, 7340032, 7077888, 7340032, 7340032, 7340032, - 7340032, 6815744, 7340032, 7602176, 7077888, 7340032, 7077888, 7077888, - 6815744, 7077888, 7077888, 6815744 + 8388608, 8388608, 8388608, 8126464, 7864320, 8126464, 7864320, 7602176, + 8650752, 8126464, 8650752, 8126464, 8388608, 8126464, 8126464, 8126464, + 8126464, 8126464, 8126464, 7864320 ] }, "flux_ref_1024": { @@ -33,14 +33,14 @@ "opt_adamw_beta_2": 0.95, "opt_adamw_epsilon": 1e-8, "opt_adamw_weight_decay": 0.1, - "opt_base_learning_rate": 2.5e-4, - "opt_learning_rate_warmup_steps": 800, + "opt_base_learning_rate": 2.0e-4, + "opt_learning_rate_warmup_steps": 0, "opt_gradient_clip_norm": 1.0 }, "samples to converge": [ - 9175040, 8388608, 8388608, 8126464, 8650752, 8650752, 8126464, 9175040, - 8650752, 8912896, 8388608, 8912896, 8388608, 8388608, 8388608, 8126464, - 8650752, 8650752, 8388608, 8650752 + 8912896, 8650752, 9437184, 8126464, 8388608, 9175040, 8650752, 8126464, + 8388608, 9961472, 7864320, 8126464, 9699328, 8650752, 9437184, 8912896, + 8388608, 9175040, 8126464, 9175040 ] }, 
"flux_ref_2048": { @@ -60,9 +60,9 @@ "opt_gradient_clip_norm": 1.0 }, "samples to converge": [ - 10485760, 10223616, 10485760, 10747904, 10485760, 9961472, 11534336, - 10485760, 10485760, 11272192, 9961472, 9699328, 10747904, 9175040, - 9699328, 10485760, 9437184, 10223616, 9699328, 11010048 + 11272192, 10223616, 11534336, 10747904, 9699328, 10485760, 11010048, + 10223616, 11796480, 10485760, 10747904, 11272192, 9699328, 10485760, + 11534336, 9961472, 10485760, 10485760, 11272192, 11272192 ] }, "flux_ref_4096": { @@ -82,9 +82,9 @@ "opt_gradient_clip_norm": 1.0 }, "samples to converge": [ - 15204352, 15990784, 15466496, 15728640, 15204352, 15466496, 15990784, - 15204352, 14942208, 15204352, 15466496, 16252928, 14680064, 14942208, - 13893632, 15466496, 15466496, 15728640, 15466496, 15204352 + 15466496, 15728640, 15990784, 15466496, 15728640, 15466496, 14942208, + 14680064, 15728640, 15990784, 15990784, 15728640, 15728640, 16252928, + 14942208, 15728640, 16252928, 15204352, 16515072, 14942208 ] } } diff --git a/mlperf_logging/rcp_checker/training_6.0.0/rcps_dlrm_dcnv2.json b/mlperf_logging/rcp_checker/training_6.0.0/rcps_dlrm_dcnv2.json new file mode 100644 index 00000000..3a71eff5 --- /dev/null +++ b/mlperf_logging/rcp_checker/training_6.0.0/rcps_dlrm_dcnv2.json @@ -0,0 +1,162 @@ +{ + + "dlrm_dcnv2_ref_32768": { + "Benchmark": "dlrm_dcnv2", + "Creator": "NVIDIA", + "When": "Prior to 3.0 submission", + "Platform": "DGX-A100", + "Precision": "FP32", + "BS": 32768, + "Hyperparams": { + "opt_name": "adagrad", + "opt_base_learning_rate": 0.004, + "opt_adagrad_learning_rate_decay": 0.0, + "opt_adagrad_initial_accumulator_value": 0.0, + "opt_adagrad_epsilon": 1e-08, + "opt_weight_decay": 0.0, + "opt_learning_rate_warmup_steps": 0, + "opt_learning_rate_decay_start_step": 0, + "opt_learning_rate_decay_steps": 0 + }, + "Epochs to converge": [ + 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, + 0.75, 0.7, 0.7, 0.7, 0.75, 0.75, 0.75, 0.7, 0.7, 0.7, + 0.7, 0.7, 0.75, 0.7, 0.65, 
0.7, 0.7, 0.7, 0.7, 0.7 + ] + }, + + "dlrm_dcnv2_ref_55296": { + "Benchmark": "dlrm_dcnv2", + "Creator": "NVIDIA", + "When": "At 3.0 submission", + "Platform": "DGX-A100", + "Precision": "FP32", + "BS": 55296, + "Hyperparams": { + "opt_name": "adagrad", + "opt_base_learning_rate": 0.004, + "opt_adagrad_learning_rate_decay": 0.0, + "opt_adagrad_initial_accumulator_value": 0.0, + "opt_adagrad_epsilon": 1e-08, + "opt_weight_decay": 0.0, + "opt_learning_rate_warmup_steps": 0, + "opt_learning_rate_decay_start_step": 0, + "opt_learning_rate_decay_steps": 0 + }, + "Epochs to converge": [ + 0.75, 0.75, 0.75, 0.7, 0.8, 0.75, 0.75, 0.75, 0.75, 0.75, + 0.9, 0.7, 0.75, 0.8, 0.7, 0.8, 0.7, 0.7, 0.75, 0.7, + 0.7, 0.9, 0.75, 0.7, 0.8, 0.75, 0.75, 0.8, 0.75, 0.8, + 0.9, 0.75, 0.8, 0.75, 0.8, 0.75, 0.75, 0.75, 0.7, 0.75, + 0.75, 0.8, 0.75, 0.8, 0.8, 0.9, 0.75, 0.75, 0.7, 0.75 + ] + }, + + "dlrm_dcnv2_ref_65536": { + "Benchmark": "dlrm_dcnv2", + "Creator": "NVIDIA", + "When": "Prior to 3.0 submission", + "Platform": "DGX-A100", + "Precision": "FP32", + "BS": 65536, + "Hyperparams": { + "opt_name": "adagrad", + "opt_base_learning_rate": 0.004, + "opt_adagrad_learning_rate_decay": 0.0, + "opt_adagrad_initial_accumulator_value": 0.0, + "opt_adagrad_epsilon": 1e-08, + "opt_weight_decay": 0.0, + "opt_learning_rate_warmup_steps": 0, + "opt_learning_rate_decay_start_step": 0, + "opt_learning_rate_decay_steps": 0 + }, + "Epochs to converge": [ + 0.75, 0.8, 0.75, 0.75, 0.8, 0.75, 0.8, 0.9, 0.95, 0.75, + 0.75, 0.75, 0.85, 0.85, 0.7, 0.75, 0.75, 0.9, 0.85, 0.8, + 0.7, 0.75, 0.75, 0.75, 0.8, 0.9, 0.75, 0.8, 0.85, 0.8 + ] + }, + + "dlrm_dcnv2_ref_102400": { + "Benchmark": "dlrm_dcnv2", + "Creator": "NVIDIA", + "When": "Prior to 3.0 submission", + "Platform": "DGX-A100", + "Precision": "FP32", + "BS": 102400, + "Hyperparams": { + "opt_name": "adagrad", + "opt_base_learning_rate": 0.004, + "opt_adagrad_learning_rate_decay": 0.0, + "opt_adagrad_initial_accumulator_value": 0.0, + 
"opt_adagrad_epsilon": 1e-08, + "opt_weight_decay": 0.0, + "opt_learning_rate_warmup_steps": 0, + "opt_learning_rate_decay_start_step": 0, + "opt_learning_rate_decay_steps": 0 + }, + "Epochs to converge": [ + 0.85, 0.95, 0.95, 0.85, 0.9, 0.8, 0.85, 0.9, 0.9, 0.9, + 0.95, 0.9, 0.9, 0.9, 0.9, 0.9, 0.85, 0.85, 0.9, 0.9, + 0.8, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.85, 0.9, 0.9, + 0.9, 0.95, 0.85, 0.9, 0.9, 0.9, 0.85, 0.9, 0.95, 0.9, + 0.85, 0.95, 0.9, 0.9, 0.8, 0.9, 0.9, 0.9, 0.85, 0.9 + ] + }, + + "dlrm_dcnv2_ref_135168": { + "Benchmark": "dlrm_dcnv2", + "Creator": "NVIDIA", + "When": "At 3.0 submission", + "Platform": "DGX-A100", + "Precision": "FP32", + "BS": 135168, + "Hyperparams": { + "opt_name": "adagrad", + "opt_base_learning_rate": 0.0034, + "opt_adagrad_learning_rate_decay": 0.0, + "opt_adagrad_initial_accumulator_value": 0.0, + "opt_adagrad_epsilon": 1e-08, + "opt_weight_decay": 0.0, + "opt_learning_rate_warmup_steps": 0, + "opt_learning_rate_decay_start_step": 0, + "opt_learning_rate_decay_steps": 0 + }, + "Epochs to converge": [ + 0.95, 0.9, 0.9, 0.9, 0.9, 0.95, 0.9, 0.95, 0.95, 0.9, + 0.95, 0.95, 0.95, 1.0, 0.85, 0.9, 0.9, 0.95, 0.95, 0.95, + 0.95, 0.9, 0.9, 0.9, 0.95, 0.95, 1.0, 0.9, 0.95, 0.95, + 0.85, 0.95, 0.95, 0.95, 0.9, 0.95, 0.9, 0.9, 1.0, 0.9, + 0.95, 0.9, 0.95, 0.95, 0.95, 0.95, 0.95, 0.9, 0.9, 0.9, + 0.9, 0.9, 0.9, 0.9, 0.95, 0.85, 0.95, 0.95, 0.9, 0.95, + 0.95, 0.95, 0.95, 1.0, 0.9, 0.95, 0.9, 1.0, 0.85, 0.9, + 0.9, 0.95, 0.95, 0.9, 0.95, 0.9, 0.95, 0.85, 0.95, 0.95, + 0.95, 0.9, 0.9, 0.95, 0.9, 0.95, 0.9, 1.0 + ] + }, + + "dlrm_dcnv2_ref_160000": { + "Benchmark": "dlrm_dcnv2", + "Creator": "Cisco", + "When": "At 5.1 submission", + "Platform": "DGX-H100", + "Precision": "FP32", + "BS": 160000, + "Hyperparams": { + "opt_name": "adagrad", + "opt_base_learning_rate": 0.004, + "opt_adagrad_learning_rate_decay": 0.0, + "opt_adagrad_initial_accumulator_value": 0.0, + "opt_adagrad_epsilon": 1e-08, + "opt_weight_decay": 0.0, + 
"opt_learning_rate_warmup_steps": 0, + "opt_learning_rate_decay_start_step": 0, + "opt_learning_rate_decay_steps": 0 + }, + "Epochs to converge": [ + 0.95, 0.95, 1, 1, 0.95, 1, 1, 0.95, 0.95, 0.90, + 0.90, 1, 0.90, 0.95, 0.90, 1, 0.95, 0.95, 0.95, 1 + ] + } + +} diff --git a/mlperf_logging/rcp_checker/training_6.0.0/rcps_flux1.json b/mlperf_logging/rcp_checker/training_6.0.0/rcps_flux1.json new file mode 100644 index 00000000..e071b7db --- /dev/null +++ b/mlperf_logging/rcp_checker/training_6.0.0/rcps_flux1.json @@ -0,0 +1,90 @@ +{ + "flux_ref_512": { + "Benchmark": "flux1", + "Creator": "NVIDIA", + "When": "Reference RCPs before v6.0", + "Platform": "8xDGX-B200", + "Precision": "BF16", + "BS": 512, + "Hyperparams": { + "opt_adamw_beta_1": 0.9, + "opt_adamw_beta_2": 0.95, + "opt_adamw_epsilon": 1e-8, + "opt_adamw_weight_decay": 0.1, + "opt_base_learning_rate": 2.0e-4, + "opt_learning_rate_warmup_steps": 1600, + "opt_gradient_clip_norm": 1.0 + }, + "samples to converge": [ + 7077888, 7340032, 7077888, 7077888, 7340032, 7340032, 7602176, 7340032, + 7077888, 7340032, 7077888, 7340032, 7340032, 7077888, 7077888, 7077888, + 7340032, 7340032, 7077888, 7340032 + ] + }, + "flux_ref_1024": { + "Benchmark": "flux1", + "Creator": "NVIDIA", + "When": "Reference RCPs before v6.0", + "Platform": "8xDGX-B200", + "Precision": "BF16", + "BS": 1024, + "Hyperparams": { + "opt_adamw_beta_1": 0.9, + "opt_adamw_beta_2": 0.95, + "opt_adamw_epsilon": 1e-8, + "opt_adamw_weight_decay": 0.1, + "opt_base_learning_rate": 2.5e-4, + "opt_learning_rate_warmup_steps": 800, + "opt_gradient_clip_norm": 1.0 + }, + "samples to converge": [ + 8650752, 8650752, 8126464, 8650752, 8650752, 8912896, 8126464, 8388608, + 8650752, 8126464, 8126464, 8650752, 8388608, 8388608, 8650752, 8388608, + 8388608, 8388608, 8912896, 8650752 + ] + }, + "flux_ref_2048": { + "Benchmark": "flux1", + "Creator": "NVIDIA", + "When": "Reference RCPs before v6.0", + "Platform": "8xDGX-B200", + "Precision": "BF16", + "BS": 2048, + 
"Hyperparams": { + "opt_adamw_beta_1": 0.9, + "opt_adamw_beta_2": 0.95, + "opt_adamw_epsilon": 1e-8, + "opt_adamw_weight_decay": 0.1, + "opt_base_learning_rate": 2.5e-4, + "opt_learning_rate_warmup_steps": 0, + "opt_gradient_clip_norm": 1.0 + }, + "samples to converge": [ + 9437184, 10223616, 10485760, 11010048, 10747904, 12320768, 10485760, + 9961472, 10485760, 9437184, 9699328, 11534336, 9699328, 9699328, 10747904, + 9961472, 10485760, 10747904, 9961472, 9961472 + ] + }, + "flux_ref_4096": { + "Benchmark": "flux1", + "Creator": "NVIDIA", + "When": "Reference RCPs before v6.0", + "Platform": "8xDGX-B200", + "Precision": "BF16", + "BS": 4096, + "Hyperparams": { + "opt_adamw_beta_1": 0.9, + "opt_adamw_beta_2": 0.95, + "opt_adamw_epsilon": 1e-8, + "opt_adamw_weight_decay": 0.1, + "opt_base_learning_rate": 4.0e-4, + "opt_learning_rate_warmup_steps": 100, + "opt_gradient_clip_norm": 1.0 + }, + "samples to converge": [ + 15204352, 15990784, 15466496, 15728640, 15204352, 15466496, 15990784, + 15204352, 14942208, 15204352, 15466496, 16252928, 14680064, 14942208, + 13893632, 15466496, 15466496, 15728640, 15466496, 15204352 + ] + } +} diff --git a/mlperf_logging/rcp_checker/training_6.0.0/rcps_llama2_70b_lora.json b/mlperf_logging/rcp_checker/training_6.0.0/rcps_llama2_70b_lora.json new file mode 100644 index 00000000..86630bdf --- /dev/null +++ b/mlperf_logging/rcp_checker/training_6.0.0/rcps_llama2_70b_lora.json @@ -0,0 +1,95 @@ +{ + "llama2_70b_lora_ref_8": + { + "Benchmark": "llama2_70b_lora", + "Creator": "NVIDIA", + "When": "Prior to 4.0 submission", + "Platform": "TBD", + "Precision": "BF16", + "BS": 8, + "Hyperparams": { + "opt_base_learning_rate": 4e-4, + "opt_max_grad_norm": 0.3, + "opt_learning_rate_warmup_epochs": 0, + "opt_learning_rate_decay_boundary_epochs": [], + "gradient_accumulation_steps": 1, + "lora_r": 16, + "lora_alpha": 32, + "max_steps": 1024 + }, + "samples to converge": [ + 3072,2688,3456,3072,3072,3072,3456,3456,3072,2688, + 
3456,3072,3072,3072,3840,3456,2688,3072,3456,3456 + ] + }, + + "llama2_70b_lora_ref_16": + { + "Benchmark": "llama2_70b_lora", + "Creator": "NVIDIA", + "When": "Prior to 4.0 submission", + "Platform": "TBD", + "Precision": "BF16", + "BS": 16, + "Hyperparams": { + "opt_base_learning_rate": 4e-4, + "opt_max_grad_norm": 0.3, + "opt_learning_rate_warmup_epochs": 0, + "opt_learning_rate_decay_boundary_epochs": [], + "gradient_accumulation_steps": 1, + "lora_r": 16, + "lora_alpha": 32, + "max_steps": 1024 + }, + "samples to converge": [ + 3840,3840,4224,3840,3840,3840,4608,3840,4608,3840, + 4992,3840,3840,3840,4992,3840,3840,4224,3840,3456 + ] + }, + "llama2_70b_lora_ref_32": + { + "Benchmark": "llama2_70b_lora", + "Creator": "NVIDIA", + "When": "Prior to 4.0 submission", + "Platform": "TBD", + "Precision": "BF16", + "BS": 32, + "Hyperparams": { + "opt_base_learning_rate": 4e-4, + "opt_max_grad_norm": 0.3, + "opt_learning_rate_warmup_epochs": 0, + "opt_learning_rate_decay_boundary_epochs": [], + "gradient_accumulation_steps": 1, + "lora_r": 16, + "lora_alpha": 32, + "max_steps": 1024 + }, + "samples to converge": [ + 5760,6528,6144,6528,5376,6528,5760,6144,6144,6528, + 6144,6144,6144,5760,5760,5760,5760,5760,6144,5760 + ] + }, + "llama2_70b_lora_ref_128": + { + "Benchmark": "llama2_70b_lora", + "Creator": "NVIDIA", + "When": "Prior to 4.0 submission", + "Platform": "TBD", + "Precision": "BF16", + "BS": 128, + "Hyperparams": { + "opt_base_learning_rate": 1e-3, + "opt_max_grad_norm": 0.3, + "opt_learning_rate_warmup_epochs": 0, + "opt_learning_rate_decay_boundary_epochs": [], + "gradient_accumulation_steps": 1, + "lora_r": 16, + "lora_alpha": 32, + "max_steps": 1024 + }, + "samples to converge": [ + 11520,13056,10752,12672,12288,11136,10752,13056, 10752,9984, + 11136,11136,11136,10752,11520,11136,11136,10752,11136,9984 + ] + } +} diff --git a/mlperf_logging/rcp_checker/training_6.0.0/rcps_llama31_405b.json b/mlperf_logging/rcp_checker/training_6.0.0/rcps_llama31_405b.json 
new file mode 100644 index 00000000..d1a7620d --- /dev/null +++ b/mlperf_logging/rcp_checker/training_6.0.0/rcps_llama31_405b.json @@ -0,0 +1,60 @@ +{ + "llama31_405b_ref_1152": + { + "Benchmark": "llama31_405b", + "Creator": "NVIDIA", + "When": "Reference RCPs after 5.0 submission", + "Platform": "288xDGX-H100", + "Precision": "BF16", + "BS": 1152, + "Hyperparams": { + "opt_base_learning_rate": 8e-05, + "opt_learning_rate_warmup_steps": 8000, + "gradient_accumulation_steps": 144 + }, + "Epochs to converge": [ + 313344,313344,313344, + 331776,313344,294912 + ] + }, + + "llama31_405b_ref_2304": + { + "Benchmark": "llama31_405b", + "Creator": "NVIDIA", + "When": "Reference RCPs after 5.0 submission", + "Platform": "288xDGX-H100", + "Precision": "BF16", + "BS": 2304, + "Hyperparams": { + "opt_base_learning_rate": 16e-05, + "opt_learning_rate_warmup_steps": 4000, + "gradient_accumulation_steps": 288 + }, + "Epochs to converge": [ + 368640,350208,387072, + 368640,368640,368640 + ] + }, + + "llama31_405b_ref_4608": + { + "Benchmark": "llama31_405b", + "Creator": "NVIDIA", + "When": "Reference RCPs after 5.0 submission", + "Platform": "288xDGX-H100", + "Precision": "BF16", + "BS": 4608, + "Hyperparams": { + "opt_base_learning_rate": 32e-05, + "opt_learning_rate_warmup_steps": 2000, + "gradient_accumulation_steps": 576 + }, + "Epochs to converge": [ + 497664,497664,460800, + 497664,479232,497664 + ] + } + } + + diff --git a/mlperf_logging/rcp_checker/training_6.0.0/rcps_llama31_8b.json b/mlperf_logging/rcp_checker/training_6.0.0/rcps_llama31_8b.json new file mode 100644 index 00000000..1b7143f1 --- /dev/null +++ b/mlperf_logging/rcp_checker/training_6.0.0/rcps_llama31_8b.json @@ -0,0 +1,112 @@ +{ + "llama31_8b_ref_16": + { + "Benchmark": "llama31_8b", + "Creator": "NVIDIA", + "When": "Reference RCPs before 5.1 submission", + "Platform": "2xDGX-B200", + "Precision": "BF16", + "BS": 16, + "Hyperparams": { + "opt_base_learning_rate": 4e-04, + 
"opt_learning_rate_warmup_samples": 256, + "gradient_accumulation_steps": 1 + }, + "Epochs to converge": [ + 159744, 159744, 159744, 159744, 159744, + 159744, 172032, 159744, 172032, 159744, + 172032, 159744, 159744, 159744, 159744, + 159744, 159744, 159744, 159744, 159744 + ] + }, + + "llama31_8b_ref_32": + { + "Benchmark": "llama31_8b", + "Creator": "NVIDIA", + "When": "Reference RCPs before 5.1 submission", + "Platform": "4xDGX-B200", + "Precision": "BF16", + "BS": 32, + "Hyperparams": { + "opt_base_learning_rate": 8e-04, + "opt_learning_rate_warmup_samples": 4096, + "gradient_accumulation_steps": 1 + }, + "Epochs to converge": [ + 196608, 172032, 184320, 184320, 172032, + 172032, 184320, 184320, 184320, 172032, + 172032, 172032, 184320, 184320, 184320, + 172032, 172032, 172032, 184320, 184320 + ] + }, + + "llama31_8b_ref_64": + { + "Benchmark": "llama31_8b", + "Creator": "NVIDIA", + "When": "Reference RCPs before 5.1 submission", + "Platform": "4xDGX-B200", + "Precision": "BF16", + "BS": 64, + "Hyperparams": { + "opt_base_learning_rate": 8e-04, + "opt_learning_rate_warmup_samples": 6144, + "gradient_accumulation_steps": 2 + }, + "Epochs to converge": [ + 233472, 208896, 208896, 233472, 233472, + 233472, 233472, 233472, 208896, 233472, + 233472, 233472, 245760, 221184, 208896, + 233472, 233472, 221184, 221184, 221184 + ] + }, + + "llama31_8b_ref_96": + { + "Benchmark": "llama31_8b", + "Creator": "NVIDIA", + "When": "Reference RCPs before 5.1 submission", + "Platform": "2xDGX-B200", + "Precision": "BF16", + "BS": 96, + "Hyperparams": { + "opt_base_learning_rate": 1e-03, + "opt_learning_rate_warmup_samples": 16348, + "gradient_accumulation_steps": 6 + }, + "Epochs to converge": [ + 297216, 284832, 272448, 272448, 272448, + 272448, 297216, 272448, 297216, 272448, + 297216, 260064, 272448, 272448, 272448, + 284832, 260064, 284832, 284832, 272448 + ] + }, + + "llama31_8b_ref_128": + { + "Benchmark": "llama31_8b", + "Creator": "NVIDIA", + "When": "Reference RCPs 
before 5.1 submission", + "Platform": "4xDGX-B200", + "Precision": "BF16", + "BS": 128, + "Hyperparams": { + "opt_base_learning_rate": 2e-03, + "opt_learning_rate_warmup_samples": 32768, + "gradient_accumulation_steps": 4 + }, + "Epochs to converge": [ + 368640, 344064, 356352, 344064, 368640, + 368640, 405504, 344064, 331776, 307200, + 331776, 380928, 307200, 344064, 319488, + 356352, 331776, 319488, 356352, 331776 + ] + } +} + + + + + + diff --git a/mlperf_logging/rcp_checker/visualization_scripts/rcp_viewer.py b/mlperf_logging/rcp_checker/visualization_scripts/rcp_viewer.py index cba24d70..50f6cf9b 100755 --- a/mlperf_logging/rcp_checker/visualization_scripts/rcp_viewer.py +++ b/mlperf_logging/rcp_checker/visualization_scripts/rcp_viewer.py @@ -22,7 +22,7 @@ def main(): parser.add_argument('--usage', type=str, default='training', choices=['training', 'hpc'], help="the WG that produced the benchmark") - parser.add_argument('--version', type=str, default='5.1.0', + parser.add_argument('--version', type=str, default='6.0.0', help='what version of the ruleset') parser.add_argument('--verbose', action='store_true') parser.add_argument('--unpruned', action='store_true', diff --git a/mlperf_logging/repo_checker/README.md b/mlperf_logging/repo_checker/README.md index 39e974b7..4ee01c55 100644 --- a/mlperf_logging/repo_checker/README.md +++ b/mlperf_logging/repo_checker/README.md @@ -12,7 +12,7 @@ review process. python3 -m mlperf_logging.repo_checker FOLDER USAGE RULESET ``` -Currently, USAGE in ["training", "hpc"] and RULESETS 2.0.0, 2.1.0, 3.0.0, 3.1.0, 4.0.0, 4.1.0, 5.0.0 and 5.1.0 are supported. +Currently, USAGE in ["training", "hpc"] and RULESETS 2.0.0, 2.1.0, 3.0.0, 3.1.0, 4.0.0, 4.1.0, 5.0.0, 5.1.0 and 6.0.0 are supported. The repo checker checks: 1. Whether the repo contains filenames that github does not like, e.g. 
files with spaces, diff --git a/mlperf_logging/repo_checker/repo_checker.py b/mlperf_logging/repo_checker/repo_checker.py index 2efb2e1e..140bff92 100644 --- a/mlperf_logging/repo_checker/repo_checker.py +++ b/mlperf_logging/repo_checker/repo_checker.py @@ -127,8 +127,8 @@ def get_parser(): parser.add_argument( 'ruleset', type=str, - choices=['2.0.0', '2.1.0', '3.0.0', '3.1.0', '4.0.0', '4.1.0', '5.0.0', '5.1.0'], - help='the ruleset. 2.0.0, 2.1.0, 3.0.0, 3.1.0, 4.0.0, 4.1.0, 5.0.0 and 5.1.0 are currently supported.' + choices=['2.0.0', '2.1.0', '3.0.0', '3.1.0', '4.0.0', '4.1.0', '5.0.0', '5.1.0', '6.0.0'], + help='the ruleset. 2.0.0, 2.1.0, 3.0.0, 3.1.0, 4.0.0, 4.1.0, 5.0.0, 5.1.0 and 6.0.0 are currently supported.' ) parser.add_argument( '--log_output', diff --git a/mlperf_logging/result_summarizer/compute_score/README.md b/mlperf_logging/result_summarizer/compute_score/README.md index 9c35f77f..8999ceec 100644 --- a/mlperf_logging/result_summarizer/compute_score/README.md +++ b/mlperf_logging/result_summarizer/compute_score/README.md @@ -17,7 +17,7 @@ python3 -m mlperf_logging.result_summarizer.compute_score --benchmark BENCHMARK **SYSTEM_NAME:** Optional system name. **BENCHMARK_FOLDER:** Folder containing all the results files of the benchmark. **USAGE:** Either "training" or "hpc", -**RULESET:** Version of the rules that applies one of "1.0.0", "1.1.0", "2.0.0", "2.1.0", "3.0.0", "3.1.0", "4.0.0", "4.1.0", "5.0.0", "5.1.0". +**RULESET:** Version of the rules that applies one of "1.0.0", "1.1.0", "2.0.0", "2.1.0", "3.0.0", "3.1.0", "4.0.0", "4.1.0", "5.0.0", "5.1.0", "6.0.0". **[--is_weak_scaling]:** Is the benchmark weak scaling (only applies to HPC). **[--scale]:** Compute the scaling.json file (only if the folder does not contain it already). **[--has_power]:** Have the results power measurements . 
diff --git a/mlperf_logging/result_summarizer/config.yaml b/mlperf_logging/result_summarizer/config.yaml index 897d29a9..111921f0 100644 --- a/mlperf_logging/result_summarizer/config.yaml +++ b/mlperf_logging/result_summarizer/config.yaml @@ -102,6 +102,13 @@ columns: llama31_8b: ["Benchmark results (minutes)", "Small LLM", "C4", "Llama31-8b"] llama31_405b: ["Benchmark results (minutes)", "LLM", "C4", "Llama31-405B"] default: [" ", " ", " "] + "6.0.0": + dlrm_dcnv2: ["Benchmark results (minutes)", "Recommendation", "1TB Multihot Clickthrough", "DLRM DCNv2"] + flux1: ["Benchmark results (minutes)", "Text to image", "CC12M and Coco-2014 for eval", "Flux1"] + llama2_70b_lora: ["Benchmark results (minutes)", "LLM-Finetune", "SCROLSS Gov Report", "LLama2-70B-LoRA"] + llama31_8b: ["Benchmark results (minutes)", "Small LLM", "C4", "Llama31-8b"] + llama31_405b: ["Benchmark results (minutes)", "LLM", "C4", "Llama31-405B"] + default: [" ", " ", " "] hpc: "2.0.0": diff --git a/scripts/verify_for_v6.0_training.sh b/scripts/verify_for_v6.0_training.sh new file mode 100755 index 00000000..7b90b4aa --- /dev/null +++ b/scripts/verify_for_v6.0_training.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +set -e + +# rcp_bypass and rcp_bert_train_samples package checker params +# need to be retrieved at package_checker_params file at top-level submission dir. +PACKAGE_CHECKER_PARAMS="" +PACKAGE_CHECKER_PARAMS_FILE="$1/package_checker_params" +if test -f "$PACKAGE_CHECKER_PARAMS_FILE"; then + while IFS= read -r line + do + PACKAGE_CHECKER_PARAMS="$PACKAGE_CHECKER_PARAMS --$line" + done < "$PACKAGE_CHECKER_PARAMS_FILE" +fi + +python3 -m mlperf_logging.package_checker $1 training 6.0.0 $PACKAGE_CHECKER_PARAMS +python3 -m mlperf_logging.result_summarizer $1 training 6.0.0 +python3 -m mlperf_logging.repo_checker $1 training 6.0.0