Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion mlperf_logging/benchmark_meta.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,14 @@
'llama2_70b_lora',
'rgat',
'llama31_405b'
]
],
'6.0': [
'llama31_8b',
'dlrm_dcnv2',
'flux1',
'llama2_70b_lora',
'llama31_405b'
]
},

'hpc': {
Expand Down
34 changes: 15 additions & 19 deletions mlperf_logging/compliance_checker/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,9 @@ To check a log file for compliance:

python -m mlperf_logging.compliance_checker [--config YAML] [--usage training/hpc] [--ruleset MLPERF_EDITION] FILENAME

By default, 5.1.0 training edition rules are used and the default config is set to `5.1.0/common.yaml`.
By default, 6.0.0 training edition rules are used and the default config is set to `6.0.0/common.yaml`.
This config will check all common keys and enqueue benchmark specific config to be checked as well.
Old training editions, still supported are 5.0.0, 4.1.0, 4.0.0, 3.1.0, 3.0.0, 2.1.0, 2.0.0, 1.1.0, 1.0.0, 0.7.0 and 0.6.0
Old training editions, still supported, are 5.1.0, 5.0.0, 4.1.0, 4.0.0, 3.1.0, 3.0.0, 2.1.0, 2.0.0, 1.1.0, 1.0.0, 0.7.0 and 0.6.0

To check hpc compliance rules (only 1.0.0 ruleset is supported), set --usage hpc --ruleset 1.0.0.

Expand All @@ -22,23 +22,19 @@ As log examples use [NVIDIA's training logs](https://github.com/mlperf/training_

### Existing config files for training submissions

5.1.0/common.yaml - currently the default config file, checks common fields complience and equeues benchmark-specific config file
5.1.0/closed_common.yaml - the common rules file for closed submissions. These rules apply to all benchmarks
5.1.0/open_common.yaml - the common rules file for open submissions. These rules apply to all benchmarks
5.1.0/closed_retinanet.yaml - Per-benchmark rules, closed submissions.
5.1.0/closed_llama31_8b.yaml
5.1.0/closed_llama31_405b.yaml
5.1.0/closed_dlrm_dcnv2.yaml
5.1.0/closed_rgat.yaml
5.1.0/closed_llama2_70b_lora.yaml
5.1.0/closed_flux1.yaml
5.1.0/open_retinanet.yaml - Per-benchmark rules, open submissions.
5.1.0/open_llama31_8b.yaml
5.1.0/open_llama31_405b.yaml
5.1.0/open_dlrm_dcnv2.yaml
5.1.0/open_rgat.yaml
5.1.0/open_llama2_70b_lora.yaml
5.1.0/open_flux1.yaml
6.0.0/common.yaml - currently the default config file, checks common fields compliance and enqueues benchmark-specific config file
6.0.0/closed_common.yaml - the common rules file for closed submissions. These rules apply to all benchmarks
6.0.0/open_common.yaml - the common rules file for open submissions. These rules apply to all benchmarks
6.0.0/closed_llama31_8b.yaml
6.0.0/closed_llama31_405b.yaml
6.0.0/closed_dlrm_dcnv2.yaml
6.0.0/closed_llama2_70b_lora.yaml
6.0.0/closed_flux1.yaml
6.0.0/open_llama31_8b.yaml
6.0.0/open_llama31_405b.yaml
6.0.0/open_dlrm_dcnv2.yaml
6.0.0/open_llama2_70b_lora.yaml
6.0.0/open_flux1.yaml

### Existing config files for HPC submissions

Expand Down
2 changes: 1 addition & 1 deletion mlperf_logging/compliance_checker/mlp_compliance.py
Original file line number Diff line number Diff line change
Expand Up @@ -315,7 +315,7 @@ def get_parser():
parser.add_argument('--usage', type=str, default='training',
choices=usage_choices(),
help='what WG do the benchmarks come from')
parser.add_argument('--ruleset', type=str, default='5.1.0',
parser.add_argument('--ruleset', type=str, default='6.0.0',
choices=rule_choices(),
help='what version of rules to check the log against')
parser.add_argument('--config', type=str,
Expand Down
3 changes: 3 additions & 0 deletions mlperf_logging/compliance_checker/mlp_parser/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from .ruleset_410 import parse_file as parse_file_410
from .ruleset_500 import parse_file as parse_file_500
from .ruleset_510 import parse_file as parse_file_510
from .ruleset_600 import parse_file as parse_file_600

def parse_file(filename, ruleset='0.6.0'):
if ruleset == '0.6.0':
Expand All @@ -36,5 +37,7 @@ def parse_file(filename, ruleset='0.6.0'):
return parse_file_500(filename)
elif ruleset == '5.1.0':
return parse_file_510(filename)
elif ruleset == '6.0.0':
return parse_file_600(filename)
else:
raise Exception(f'Ruleset "{ruleset}" is not supported')
105 changes: 105 additions & 0 deletions mlperf_logging/compliance_checker/mlp_parser/ruleset_600.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
'''
Parses a text MLPerf log into a structured format.
'''

from __future__ import print_function

import collections
import json
import re
import sys
from dataclasses import dataclass

from io import open

@dataclass
class LogLine:
    """A single parsed MLLOG record from a training log."""
    full_string: str   # the raw (stripped) log line as read from the file
    timestamp: float   # value of the record's 'time_ms' field
    key: str           # MLLOG key, e.g. 'global_batch_size'
    value: dict        # {'value': ..., 'metadata': ...} taken from the record
    lineno: int        # index of the line among the filtered MLLOG lines

TOKEN = ':::MLLOG '


def parse_line(line):
    """Decode one raw log line.

    Returns the JSON payload (a dict) if *line* starts with the MLLOG
    token, otherwise None.  Raises json.JSONDecodeError (a subclass of
    ValueError) if the payload is not valid JSON.
    """
    if not line.startswith(TOKEN):
        return None

    return json.loads(line[len(TOKEN):])


def string_to_logline(lineno, string):
    """Convert one raw log line into a LogLine.

    Raises ValueError if the line does not start with the MLLOG token,
    its payload is not valid JSON, or a required field is missing.
    """
    m = parse_line(string)

    if m is None:
        raise ValueError('does not start with MLLOG token')

    try:
        ts = float(m['time_ms'])  # may raise ValueError, e.g. "1.2.3"
        # TODO check for weird timestamp values
        key = m['key']
        payload = {'value': m['value'], 'metadata': m['metadata']}
    except (KeyError, TypeError) as e:
        # Normalize to ValueError: parse_generator only catches ValueError,
        # so a bare KeyError/TypeError from a malformed record would abort
        # the whole parse instead of being recorded as one bad line.
        raise ValueError('malformed MLLOG record: {}'.format(e)) from e

    return LogLine(string, ts, key, payload, lineno)


def parse_file(filename):
    """Read the log file *filename* and return (loglines, errors)."""
    # latin-1 never fails to decode, so arbitrary bytes in the log are tolerated.
    with open(filename, encoding='latin-1') as logfile:
        return parse_generator(logfile)


def strip_and_dedup(gen):
    """Keep only MLLOG lines, discarding any prefix before the MLLOG token.

    NOTE(review): despite the name, no de-duplication happens here — lines
    are only filtered and stripped; confirm the name against other rulesets.
    """
    return [re.sub(".*" + TOKEN, TOKEN, raw) for raw in gen if TOKEN in raw]



def parse_generator(gen):
    """Parse an iterable of raw lines into (loglines, errors).

    *errors* is a list of (line, error_message) tuples for MLLOG lines
    that failed to parse.
    """
    loglines = []
    errors = []
    for lineno, raw in enumerate(strip_and_dedup(gen)):
        stripped = raw.strip()
        try:
            loglines.append(string_to_logline(lineno, stripped))
        except ValueError as err:
            errors.append((stripped, str(err)))
    return loglines, errors


if __name__ == '__main__':
    # Manual smoke test: parse a log file and report how well parsing went.
    if len(sys.argv) != 2:
        print('usage: mlp_parser.py FILENAME')
        print(' tests parsing on the file.')
        sys.exit(1)

    parsed, parse_errors = parse_file(sys.argv[1])

    print(f'Parsed {len(parsed)} log lines with {len(parse_errors)} errors.')

    if len(parse_errors) > 0:
        print('Lines which failed to parse:')
        for bad_line, message in parse_errors:
            print(f' Following line failed: {message}')
            print(bad_line)

Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@

# NOTE(review): appears to be the 6.0.0 closed-division common rules file
# (indentation restored; it was lost in the paste — verify against repo).

# The benchmark name must appear exactly once and be one of the 6.0 benchmarks;
# on match, the benchmark-specific closed config is enqueued for checking.
- KEY:
    NAME: submission_benchmark
    REQ: EXACTLY_ONE
    CHECK: " v['value'] in ['flux1', 'dlrm_dcnv2', 'llama31_8b', 'llama2_70b_lora', 'llama31_405b'] "
    POST: " enqueue_config('training_6.0.0/closed_{}.yaml'.format(v['value'])) "

# Gradient accumulation must be logged once and be a positive step count.
- KEY:
    NAME: gradient_accumulation_steps
    REQ: EXACTLY_ONE
    CHECK: " v['value'] > 0 "
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
# NOTE(review): adagrad optimizer plus eval_samples == 89137319 suggests this
# is the closed-division dlrm_dcnv2 rules file (indentation restored; it was
# lost in the paste — verify against repo).

- KEY:
    NAME: global_batch_size
    REQ: EXACTLY_ONE

# Closed division fixes the optimizer to adagrad.
- KEY:
    NAME: opt_name
    REQ: EXACTLY_ONE
    CHECK: " v['value'] == 'adagrad' "

- KEY:
    NAME: opt_base_learning_rate
    REQ: EXACTLY_ONE

# The following adagrad hyperparameters must be logged and pinned to fixed values.
- KEY:
    NAME: opt_adagrad_learning_rate_decay
    REQ: EXACTLY_ONE
    CHECK: " v['value'] == 0 "

- KEY:
    NAME: opt_weight_decay
    REQ: EXACTLY_ONE
    CHECK: " v['value'] == 0 "

- KEY:
    NAME: opt_adagrad_initial_accumulator_value
    REQ: EXACTLY_ONE
    CHECK: " v['value'] == 0 "

- KEY:
    NAME: opt_adagrad_epsilon
    REQ: EXACTLY_ONE
    CHECK: " v['value'] == 1e-8 "

- KEY:
    NAME: opt_learning_rate_warmup_steps
    REQ: EXACTLY_ONE
    CHECK: " v['value'] == 0 "

- KEY:
    NAME: opt_learning_rate_decay_start_step
    REQ: EXACTLY_ONE
    CHECK: " v['value'] == 0 "

- KEY:
    NAME: opt_learning_rate_decay_steps
    REQ: EXACTLY_ONE
    CHECK: " v['value'] == 0 "

# Every eval_accuracy record must carry epoch_num metadata, and at least one
# must reach the quality target.
- KEY:
    NAME: eval_accuracy
    REQ: AT_LEAST_ONE
    CHECK:
        - "'epoch_num' in v['metadata']"
    ATLEAST_ONE_CHECK: "v['value'] >= 0.80275 and v['value'] <= 1.0"

# Fixed evaluation set size.
- KEY:
    NAME: eval_samples
    REQ: EXACTLY_ONE
    CHECK: " v['value'] == 89137319 "
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
# NOTE(review): adamw hyperparameters, evaluation_frequency 262144 and an
# upper-bound eval target suggest this is the closed-division llama31_8b
# rules file (indentation restored; it was lost in the paste — verify).

- KEY:
    NAME: global_batch_size
    REQ: AT_LEAST_ONE
    CHECK: " v['value'] >= 0 "

# Evaluation cadence (in samples) is fixed for closed division.
- KEY:
    NAME: evaluation_frequency
    REQ: EXACTLY_ONE
    CHECK: " v['value'] == 262144"

# Closed division fixes the optimizer to adamw with pinned betas/epsilon/decay.
- KEY:
    NAME: opt_name
    REQ: EXACTLY_ONE
    CHECK: " v['value'] == 'adamw' "

- KEY:
    NAME: opt_adamw_beta_1
    REQ: EXACTLY_ONE
    CHECK: " v['value'] == 0.9 "

- KEY:
    NAME: opt_adamw_beta_2
    REQ: EXACTLY_ONE
    CHECK: " v['value'] == 0.95 "

- KEY:
    NAME: opt_adamw_epsilon
    REQ: EXACTLY_ONE
    CHECK: " v['value'] == 1e-08 "

- KEY:
    NAME: opt_adamw_weight_decay
    REQ: EXACTLY_ONE
    CHECK: " v['value'] == 0.1 "

# Learning-rate schedule values only need to be non-negative.
- KEY:
    NAME: opt_base_learning_rate
    REQ: EXACTLY_ONE
    CHECK: " v['value'] >= 0.0 "

- KEY:
    NAME: opt_learning_rate_warmup_steps
    REQ: EXACTLY_ONE
    CHECK: " v['value'] >= 0 "

- KEY:
    NAME: opt_gradient_clip_norm
    REQ: EXACTLY_ONE
    CHECK: " v['value'] == 1.0 "

# Every eval_accuracy record must carry samples_count metadata; the target
# here is an upper bound (lower is better — presumably a loss-style metric).
- KEY:
    NAME: eval_accuracy
    REQ: AT_LEAST_ONE
    CHECK:
        - "'samples_count' in v['metadata']"
    ATLEAST_ONE_CHECK: "v['value'] <= 0.586 and v['value'] > 0.0"
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# NOTE(review): lora_alpha / lora_rank keys suggest this is the
# closed-division llama2_70b_lora rules file (indentation restored; it was
# lost in the paste — verify against repo).

# Batch size is recorded into the running state for later cross-checks.
- KEY:
    NAME: global_batch_size
    REQ: EXACTLY_ONE
    POST: >
        s['global_batch_size'] = v['value']

- KEY:
    NAME: opt_base_learning_rate
    REQ: EXACTLY_ONE


- KEY:
    NAME: opt_learning_rate_training_steps
    REQ: EXACTLY_ONE

- KEY:
    NAME: opt_gradient_clip_norm
    REQ: EXACTLY_ONE

- KEY:
    NAME: opt_adamw_weight_decay
    REQ: EXACTLY_ONE

- KEY:
    NAME: gradient_accumulation_steps
    REQ: EXACTLY_ONE

- KEY:
    NAME: lora_alpha
    REQ: EXACTLY_ONE

# LoRA rank is pinned for closed division.
- KEY:
    NAME: lora_rank
    REQ: EXACTLY_ONE
    CHECK: " v['value'] == 16"

# Every eval_accuracy record must carry samples_count metadata; the target
# is an upper bound (lower is better — presumably a loss-style metric).
- KEY:
    NAME: eval_accuracy
    REQ: AT_LEAST_ONE
    CHECK:
        - "'samples_count' in v['metadata']"
    ATLEAST_ONE_CHECK: "(v['value'] <= 0.925) and v['value'] > 0.0"
Loading