diff --git a/tools/submission/submission_checker.py b/tools/submission/submission_checker.py
index 445ce97f3..2acfdb838 100755
--- a/tools/submission/submission_checker.py
+++ b/tools/submission/submission_checker.py
@@ -98,16 +98,16 @@
             "dlrm-v2-99.9": ("AUC", 80.31 * 0.999),
             "3d-unet-99": ("DICE", 0.86170 * 0.99),
             "3d-unet-99.9": ("DICE", 0.86170 * 0.999),
-            "gptj-99" : ("ROUGE1", 42.9865 * 0.99, "ROUGE2", 20.1235 * 0.99, "ROUGEL", 29.9881 * 0.99, "GEN_LEN", 4016878*0.9),
-            "gptj-99.9" : ("ROUGE1", 42.9865 * 0.999, "ROUGE2", 20.1235 * 0.999, "ROUGEL", 29.9881 * 0.999, "GEN_LEN", 4016878*0.9),
-            "llama2-70b-99" : ("ROUGE1", 44.4312 * 0.99, "ROUGE2", 22.0352 * 0.99, "ROUGEL", 28.6162 * 0.99, "TOKENS_PER_SAMPLE", 294.45*0.9),
-            "llama2-70b-99.9" : ("ROUGE1", 44.4312 * 0.999, "ROUGE2", 22.0352 * 0.999, "ROUGEL", 28.6162 * 0.999, "TOKENS_PER_SAMPLE", 294.45*0.9),
+            "gptj-99": ("ROUGE1", 42.9865 * 0.99, "ROUGE2", 20.1235 * 0.99, "ROUGEL", 29.9881 * 0.99, "GEN_LEN", 4016878 * 0.9),
+            "gptj-99.9": ("ROUGE1", 42.9865 * 0.999, "ROUGE2", 20.1235 * 0.999, "ROUGEL", 29.9881 * 0.999, "GEN_LEN", 4016878 * 0.9),
+            "llama2-70b-99": ("ROUGE1", 44.4312 * 0.99, "ROUGE2", 22.0352 * 0.99, "ROUGEL", 28.6162 * 0.99, "TOKENS_PER_SAMPLE", 294.45 * 0.9),
+            "llama2-70b-99.9": ("ROUGE1", 44.4312 * 0.999, "ROUGE2", 22.0352 * 0.999, "ROUGEL", 28.6162 * 0.999, "TOKENS_PER_SAMPLE", 294.45 * 0.9),
             "stable-diffusion-xl": ("CLIP_SCORE", 31.68631873, "FID_SCORE", 23.01085758)
         },
         "accuracy-upper-limit": {
             "stable-diffusion-xl": ("CLIP_SCORE", 31.81331801, "FID_SCORE", 23.95007626),
-            "llama2-70b-99" : ("TOKENS_PER_SAMPLE", 294.45*1.1),
-            "llama2-70b-99.9" : ("TOKENS_PER_SAMPLE", 294.45*1.1)
+            "llama2-70b-99": ("TOKENS_PER_SAMPLE", 294.45 * 1.1),
+            "llama2-70b-99.9": ("TOKENS_PER_SAMPLE", 294.45 * 1.1)
         },
         "accuracy-delta-perc": {
             "stable-diffusion-xl": {
@@ -163,7 +163,7 @@
             "gptj-99.9": {"Server": 20000000000},
             "llama2-70b-99": {"Server": 20000000000},
             "llama2-70b-99.9": {"Server": 20000000000},
-            "stable-diffusion-xl" : {"Server": 20000000000}
+            "stable-diffusion-xl": {"Server": 20000000000}
         },
         "min-queries": {
             "resnet": {
@@ -263,18 +263,18 @@
             "dlrm-v2-99.9": ("AUC", 80.31 * 0.999),
             "3d-unet-99": ("DICE", 0.86170 * 0.99),
             "3d-unet-99.9": ("DICE", 0.86170 * 0.999),
-            "gptj-99" : ("ROUGE1", 42.9865 * 0.99, "ROUGE2", 20.1235 * 0.99, "ROUGEL", 29.9881 * 0.99, "GEN_LEN", 4016878*0.9),
-            "gptj-99.9" : ("ROUGE1", 42.9865 * 0.999, "ROUGE2", 20.1235 * 0.999, "ROUGEL", 29.9881 * 0.999, "GEN_LEN", 4016878*0.9),
-            "llama2-70b-99" : ("ROUGE1", 44.4312 * 0.99, "ROUGE2", 22.0352 * 0.99, "ROUGEL", 28.6162 * 0.99, "TOKENS_PER_SAMPLE", 294.45*0.9),
-            "llama2-70b-99.9" : ("ROUGE1", 44.4312 * 0.999, "ROUGE2", 22.0352 * 0.999, "ROUGEL", 28.6162 * 0.999, "TOKENS_PER_SAMPLE", 294.45*0.9),
+            "gptj-99": ("ROUGE1", 42.9865 * 0.99, "ROUGE2", 20.1235 * 0.99, "ROUGEL", 29.9881 * 0.99, "GEN_LEN", 4016878 * 0.9),
+            "gptj-99.9": ("ROUGE1", 42.9865 * 0.999, "ROUGE2", 20.1235 * 0.999, "ROUGEL", 29.9881 * 0.999, "GEN_LEN", 4016878 * 0.9),
+            "llama2-70b-99": ("ROUGE1", 44.4312 * 0.99, "ROUGE2", 22.0352 * 0.99, "ROUGEL", 28.6162 * 0.99, "TOKENS_PER_SAMPLE", 294.45 * 0.9),
+            "llama2-70b-99.9": ("ROUGE1", 44.4312 * 0.999, "ROUGE2", 22.0352 * 0.999, "ROUGEL", 28.6162 * 0.999, "TOKENS_PER_SAMPLE", 294.45 * 0.9),
             "stable-diffusion-xl": ("CLIP_SCORE", 31.68631873, "FID_SCORE", 23.01085758),
-            "mixtral-8x7b" : ("ROUGE1", 45.4911 * 0.99, "ROUGE2", 23.2829 * 0.99, "ROUGEL", 30.3615 * 0.99, "TOKENS_PER_SAMPLE", 145.9 * 0.9, "gsm8k_accuracy", 73.78*0.99, "mbxp_accuracy", 60.12 * 0.99),
+            "mixtral-8x7b": ("ROUGE1", 45.4911 * 0.99, "ROUGE2", 23.2829 * 0.99, "ROUGEL", 30.3615 * 0.99, "TOKENS_PER_SAMPLE", 145.9 * 0.9, "gsm8k_accuracy", 73.78 * 0.99, "mbxp_accuracy", 60.12 * 0.99),
         },
         "accuracy-upper-limit": {
             "stable-diffusion-xl": ("CLIP_SCORE", 31.81331801, "FID_SCORE", 23.95007626),
-            "llama2-70b-99" : ("TOKENS_PER_SAMPLE", 294.45*1.1),
-            "llama2-70b-99.9" : ("TOKENS_PER_SAMPLE", 294.45*1.1),
-            "mixtral-8x7b" : ("TOKENS_PER_SAMPLE", 145.9 * 1.1)
+            "llama2-70b-99": ("TOKENS_PER_SAMPLE", 294.45 * 1.1),
+            "llama2-70b-99.9": ("TOKENS_PER_SAMPLE", 294.45 * 1.1),
+            "mixtral-8x7b": ("TOKENS_PER_SAMPLE", 145.9 * 1.1)
         },
         "accuracy-delta-perc": {
             "stable-diffusion-xl": {
@@ -329,7 +329,7 @@
             "gptj-99.9": {"Server": 20000000000},
             "llama2-70b-99": {"Server": 20000000000},
             "llama2-70b-99.9": {"Server": 20000000000},
-            "stable-diffusion-xl" : {"Server": 20000000000}
+            "stable-diffusion-xl": {"Server": 20000000000}
             # TODO: Mixtral metrics
             # "mixtral-8x7b" : {"Server": 20000000000}
         },
@@ -514,19 +514,19 @@
 }

 LLM_LATENCY_LIMITS = {
-    "llama2-70b-99":{
+    "llama2-70b-99": {
         "conversational": {
             "ttft": 2000 * 1000000,
             "tpot": 200 * 1000000
         }
     },
-    "llama2-70b-99.9":{
+    "llama2-70b-99.9": {
         "conversational": {
             "ttft": 2000 * 1000000,
             "tpot": 200 * 1000000
         }
     },
-    "mixtral-8x7b":{
+    "mixtral-8x7b": {
         "conversational": {
             "ttft": 2000 * 1000000,
             "tpot": 200 * 1000000
@@ -755,7 +755,7 @@ def get_accuracy_target(self, model):
         if model not in self.accuracy_target:
             raise ValueError("model not known: " + model)
         return self.accuracy_target[model]
-
+
     def get_accuracy_upper_limit(self, model):
         return self.accuracy_upper_limit.get(model, None)

@@ -781,12 +781,12 @@ def get_min_query_count(self, model, scenario):
         if model not in self.min_queries:
             raise ValueError("model not known: " + model)
         return self.min_queries[model].get(scenario)
-
+
     def get_delta_perc(self, model, metric):
         if model in self.accuracy_delta_perc:
             if metric in self.accuracy_delta_perc[model]:
                 return self.accuracy_delta_perc[model][metric]
-
+
         more_accurate = model.find("99.9")
         if more_accurate == -1:
             required_delta_perc = 1
@@ -797,12 +797,11 @@ def get_delta_perc(self, model, metric):
         return required_delta_perc
     def has_new_logging_format(self):
         return True
-
     def uses_early_stopping(self, scenario):
         return (
             scenario in ["Server", "SingleStream", "MultiStream"]
         )
-
+
     def requires_equal_issue(self, model, division):
         return (
             division in ["closed", "network"] and
@@ -812,7 +811,7 @@ def requires_equal_issue(self, model, division):
                 "gptj-99",
                 "gptj-99.9",
                 "llama2-70b-99",
-                "llama2-70b-99.9",
+                "llama2-70b-99.9",
                 "mixtral-8x7b"
             ]
             and self.version in ["v4.1"]
@@ -830,7 +829,10 @@ def get_args():
         help="mlperf version",
     )
     parser.add_argument("--submitter", help="filter to submitter")
-    parser.add_argument("--csv", default="summary.csv", help="csv file with results")
+    parser.add_argument(
+        "--csv",
+        default="summary.csv",
+        help="csv file with results")
     parser.add_argument(
         "--skip_compliance",
         action="store_true",
@@ -841,7 +843,10 @@ def get_args():
         help="File containing extra custom model mapping. It is assumed to be inside the folder open/",
         default="model_mapping.json",
     )
-    parser.add_argument("--debug", action="store_true", help="extra debug output")
+    parser.add_argument(
+        "--debug",
+        action="store_true",
+        help="extra debug output")
     parser.add_argument(
         "--submission-exceptions",
         action="store_true",
@@ -883,17 +888,20 @@ def get_args():

 def list_dir(*path):
     path = os.path.join(*path)
-    return [f for f in os.listdir(path) if os.path.isdir(os.path.join(path, f))]
+    return [f for f in os.listdir(
+        path) if os.path.isdir(os.path.join(path, f))]


 def list_files(*path):
     path = os.path.join(*path)
-    return [f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))]
+    return [f for f in os.listdir(
+        path) if os.path.isfile(os.path.join(path, f))]


 def list_empty_dirs_recursively(*path):
     path = os.path.join(*path)
-    return [dirpath for dirpath, dirs, files in os.walk(path) if not dirs and not files]
+    return [dirpath for dirpath, dirs, files in os.walk(
+        path) if not dirs and not files]


 def list_dirs_recursively(*path):
@@ -919,19 +927,23 @@ def check_extra_files(path, target_files):
             check_pass = False
             missing_files.append(os.path.join(path, dir))
         else:
-            files = [f.split(".")[0] for f in list_files(os.path.join(path, dir))]
+            files = [f.split(".")[0]
+                     for f in list_files(os.path.join(path, dir))]
             for target_file in target_files[dir]:
                 if target_file not in files:
                     check_pass = False
-                    missing_files.append(f"{os.path.join(path, dir, target_file)}.png")
+                    missing_files.append(
+                        f"{os.path.join(path, dir, target_file)}.png")
             if "captions" not in files:
-                missing_files.append(f"{os.path.join(path, dir, 'captions.txt')}")
+                missing_files.append(
+                    f"{os.path.join(path, dir, 'captions.txt')}")
     return check_pass, missing_files


 def split_path(m):
     return m.replace("\\", "/").split("/")

+
 def get_boolean(s):
     if s is None:
         return False
@@ -942,7 +954,8 @@ def get_boolean(s):
     elif isinstance(s, int):
         return bool(s)
     else:
-        raise TypeError(f"Variable should be bool, string or int, got {type(s)} instead")
+        raise TypeError(
+            f"Variable should be bool, string or int, got {type(s)} instead")


 def find_error_in_detail_log(config, fname):
@@ -987,19 +1000,20 @@ def check_accuracy_dir(config, model, path, verbose):
         up_patterns = []
         acc_limit_check = True
         for i in range(0, len(acc_upper_limit), 2):
-            acc_type, acc_target = acc_upper_limit[i:i+2]
+            acc_type, acc_target = acc_upper_limit[i:i + 2]
             acc_limits.append(acc_target)
             up_patterns.append(ACC_PATTERN[acc_type])

     for i in range(0, len(target), 2):
-        acc_type, acc_target = target[i:i+2]
+        acc_type, acc_target = target[i:i + 2]
         patterns.append(ACC_PATTERN[acc_type])
         acc_targets.append(acc_target)
         acc_types.append(acc_type)
     acc_seen = [False for _ in acc_targets]
     with open(os.path.join(path, "accuracy.txt"), "r", encoding="utf-8") as f:
         for line in f:
-            for i, (pattern, acc_target, acc_type) in enumerate(zip(patterns, acc_targets, acc_types)):
+            for i, (pattern, acc_target, acc_type) in enumerate(
+                    zip(patterns, acc_targets, acc_types)):
                 m = re.match(pattern, line)
                 if m:
                     acc = m.group(1)
@@ -1011,24 +1025,34 @@ def check_accuracy_dir(config, model, path, verbose):
                         acc_seen[i] = True
                     elif acc is not None:
                         all_accuracy_valid = False
-                        log.warning("%s accuracy not met: expected=%f, found=%s", path, acc_target, acc)
+                        log.warning(
+                            "%s accuracy not met: expected=%f, found=%s",
+                            path,
+                            acc_target,
+                            acc)
                     if acc:
                         result_acc[acc_type] = acc
                     acc = None
             if acc_upper_limit is not None:
-                for i, (pattern, acc_limit) in enumerate(zip(up_patterns, acc_limits)):
+                for i, (pattern, acc_limit) in enumerate(
+                        zip(up_patterns, acc_limits)):
                     m = re.match(pattern, line)
                     if m:
                         acc = m.group(1)
            m = re.match(r"^hash=([\w\d]+)$", line)
            if m:
                hash_val = m.group(1)
-            if acc is not None and acc_upper_limit is not None and float(acc) > acc_limit:
+            if acc is not None and acc_upper_limit is not None and float(
+                    acc) > acc_limit:
                acc_limit_check = False
-                log.warning("%s accuracy not met: upper limit=%f, found=%s", path, acc_limit, acc)
+                log.warning(
+                    "%s accuracy not met: upper limit=%f, found=%s",
+                    path,
+                    acc_limit,
+                    acc)
            acc = None
            if all(acc_seen) and hash_val:
-                break;
+                break
     is_valid = all_accuracy_valid & all(acc_seen)
     if acc_upper_limit is not None:
         is_valid &= acc_limit_check
@@ -1062,20 +1086,23 @@
             return None, True
         else:
             for constraint, limits in LLM_LATENCY_LIMITS[model].items():
-                if mlperf_log["result_first_token_99.00_percentile_latency_ns"] < limits["ttft"] and mlperf_log["result_time_per_output_token_99.00_percentile_ns"] < limits["tpot"]:
+                if mlperf_log["result_first_token_99.00_percentile_latency_ns"] < limits[
+                        "ttft"] and mlperf_log["result_time_per_output_token_99.00_percentile_ns"] < limits["tpot"]:
                     return constraint, True
     else:
-        log.error(f'use_token_latencies flag needs to be enabled for Llama2 benchmark')
+        log.error(
+            f'use_token_latencies flag needs to be enabled for Llama2 benchmark')
        return None, False
-    log.error(f'Failed Llama2 extra check for TTFT and TPOT. TTFT 99-tile: {mlperf_log["result_first_token_99.00_percentile_latency_ns"]}, TPOT 99-tile: {mlperf_log["result_time_per_output_token_99.00_percentile_ns"]}')
+    log.error(
+        f'Failed Llama2 extra check for TTFT and TPOT. TTFT 99-tile: {mlperf_log["result_first_token_99.00_percentile_latency_ns"]}, TPOT 99-tile: {mlperf_log["result_time_per_output_token_99.00_percentile_ns"]}')
     return None, False
-
+

 def get_performance_metric(
     config, model, path, scenario_fixed, division, system_json, has_power=False
 ):
-    #Assumes new logging format
+    # Assumes new logging format
     version = config.version

     fname = os.path.join(path, "mlperf_log_detail.txt")
@@ -1088,15 +1115,19 @@
     scenario = mlperf_log["effective_scenario"]

     res = float(mlperf_log[RESULT_FIELD_NEW[version][scenario]])
-    if version in RESULT_FIELD_BENCHMARK_OVERWRITE and model in RESULT_FIELD_BENCHMARK_OVERWRITE[version] and scenario in RESULT_FIELD_BENCHMARK_OVERWRITE[version][model]:
-        res = float(mlperf_log[RESULT_FIELD_BENCHMARK_OVERWRITE[version][model][scenario]])
+    if version in RESULT_FIELD_BENCHMARK_OVERWRITE and model in RESULT_FIELD_BENCHMARK_OVERWRITE[
+            version] and scenario in RESULT_FIELD_BENCHMARK_OVERWRITE[version][model]:
+        res = float(
+            mlperf_log[RESULT_FIELD_BENCHMARK_OVERWRITE[version][model][scenario]])

     inferred = False
     if scenario_fixed != scenario:
-        inferred, res = get_inferred_result(scenario_fixed, scenario, res, mlperf_log, config, False)
+        inferred, res = get_inferred_result(
+            scenario_fixed, scenario, res, mlperf_log, config, False)

     return res

+
 def check_performance_dir(
     config, model, path, scenario_fixed, division, system_json, has_power=False
 ):
@@ -1119,12 +1150,14 @@
     scenario = mlperf_log["effective_scenario"]

     res = float(mlperf_log[RESULT_FIELD_NEW[version][scenario]])
-    if version in RESULT_FIELD_BENCHMARK_OVERWRITE and model in RESULT_FIELD_BENCHMARK_OVERWRITE[version] and scenario in RESULT_FIELD_BENCHMARK_OVERWRITE[version][model]:
-        res = float(mlperf_log[RESULT_FIELD_BENCHMARK_OVERWRITE[version][model][scenario]])
+    if version in RESULT_FIELD_BENCHMARK_OVERWRITE and model in RESULT_FIELD_BENCHMARK_OVERWRITE[
+            version] and scenario in RESULT_FIELD_BENCHMARK_OVERWRITE[version][model]:
+        res = float(
+            mlperf_log[RESULT_FIELD_BENCHMARK_OVERWRITE[version][model][scenario]])
-
     if model in ["llama2-70b-99", "llama2-70b-99.9", "mixtral-8x7b"]:
-        llama_constraint, is_valid = extra_check_llm(mlperf_log, scenario_fixed, model)
+        llama_constraint, is_valid = extra_check_llm(
+            mlperf_log, scenario_fixed, model)

     latency_99_percentile = mlperf_log["result_99.00_percentile_latency_ns"]
     latency_mean = mlperf_log["result_mean_latency_ns"]
@@ -1136,7 +1169,8 @@
     min_query_count = mlperf_log["effective_min_query_count"]
     samples_per_query = mlperf_log["effective_samples_per_query"]
     min_duration = mlperf_log["effective_min_duration_ms"]
-    equal_issue_used_check = (mlperf_log["effective_sample_concatenate_permutation"] == "true")
+    equal_issue_used_check = (
+        mlperf_log["effective_sample_concatenate_permutation"] == "true")
     if not config.requires_equal_issue(model, division):
         equal_issue_used_check = True
     sut_name = mlperf_log["sut_name"]
@@ -1146,7 +1180,8 @@
     if not find_error_in_detail_log(config, fname):
         is_valid = False

-    required_performance_sample_count = config.get_performance_sample_count(model)
+    required_performance_sample_count = config.get_performance_sample_count(
+        model)
     if performance_sample_count < required_performance_sample_count:
         log.error(
             "%s performance_sample_count, found %d, needs to be >= %d",
@@ -1184,7 +1219,6 @@
     if scenario == "SingleStream" or scenario == "MultiStream":
         res /= MS_TO_NS

-
     # Check if the current scenario uses early stopping
     uses_early_stopping = config.uses_early_stopping(scenario)

@@ -1201,7 +1235,8 @@
         # If the scenario has a target latency (Server scenario), check
         # that the target latency that was passed to the early stopping
         # is less than the target latency.
-        target_latency = config.latency_constraint.get(model, dict()).get(scenario)
+        target_latency = config.latency_constraint.get(
+            model, dict()).get(scenario)
         if target_latency:
             early_stopping_latency_ns = mlperf_log["effective_target_latency_ns"]
             log.info(
@@ -1221,7 +1256,8 @@
     else:

         # check if the benchmark meets latency constraint
-        target_latency = config.latency_constraint.get(model, dict()).get(scenario)
+        target_latency = config.latency_constraint.get(
+            model, dict()).get(scenario)
         log.info(
             "Target latency: %s, Latency: %s, Scenario: %s",
             target_latency,
@@ -1252,7 +1288,8 @@
             )
             is_valid = False

-    if scenario == "Offline" and (samples_per_query < OFFLINE_MIN_SPQ_SINCE_V4[model]):
+    if scenario == "Offline" and (
+            samples_per_query < OFFLINE_MIN_SPQ_SINCE_V4[model]):
         log.error(
             "%s Required minimum samples per query not met by user config, Expected=%s, Found=%s",
             fname,
@@ -1275,14 +1312,16 @@

     inferred = False
     if scenario_fixed != scenario:
-        inferred, res = get_inferred_result(scenario_fixed, scenario, res, mlperf_log, config, True)
+        inferred, res = get_inferred_result(
+            scenario_fixed, scenario, res, mlperf_log, config, True)

     is_network_system, is_network_mode_valid = is_system_over_network(
         division, system_json, path
     )
     is_valid &= is_network_mode_valid
     if is_network_system:
-        # for network mode verify the SUT name is valid, according to the rules (must include "Network SUT" in name)
+        # for network mode verify the SUT name is valid, according to the rules
+        # (must include "Network SUT" in name)
         if NETWORK_MODE_REQUIRED_SUBSTRING_IN_SUT_NAME not in sut_name:
             log.error(
                 f"{fname} invalid sut name for network mode. expecting the substring '{NETWORK_MODE_REQUIRED_SUBSTRING_IN_SUT_NAME}' got '{sut_name}'"
@@ -1291,7 +1330,9 @@
     return is_valid, res, inferred, equal_issue_used_check


-def get_inferred_result(scenario_fixed, scenario, res, mlperf_log, config, log_error=False):
+
+def get_inferred_result(scenario_fixed, scenario, res,
+                        mlperf_log, config, log_error=False):
     inferred = False

     # Check if current scenario (and version) uses early stopping
@@ -1305,7 +1346,8 @@ def get_inferred_result(scenario_fixed, scenario, res, mlperf_log, config, log_e
         latency_mean = mlperf_log["result_mean_query_latency_ns"]
         samples_per_query = mlperf_log["effective_samples_per_query"]
     if scenario == "SingleStream":
-        # qps_wo_loadgen_overhead is only used for inferring Offline from SingleStream; only for old submissions
+        # qps_wo_loadgen_overhead is only used for inferring Offline from
+        # SingleStream; only for old submissions
         qps_wo_loadgen_overhead = mlperf_log["result_qps_without_loadgen_overhead"]

     # special case for results inferred from different scenario
@@ -1338,11 +1380,12 @@
             res = (latency_99_percentile * samples_per_query) / MS_TO_NS
     return inferred, res

+
 def get_power_metric(config, scenario_fixed, log_path, is_valid, res):
     # parse the power logs
     server_timezone = datetime.timedelta(0)
     client_timezone = datetime.timedelta(0)
-
+
     detail_log_fname = os.path.join(log_path, "mlperf_log_detail.txt")
     mlperf_log = MLPerfLog(detail_log_fname)
     datetime_format = "%m-%d-%Y %H:%M:%S.%f"
@@ -1356,7 +1399,7 @@
     )
     # Obtain the scenario also from logs to check if power is inferred
     scenario = mlperf_log["effective_scenario"]
-
+
     spl_fname = os.path.join(log_path, "spl.txt")
     power_list = []
     with open(spl_fname) as f:
@@ -1389,7 +1432,8 @@
         avg_power_efficiency = res / avg_power

     else:
-        # In SingleStream and MultiStream scenarios, the power metric is in mJ/query.
+        # In SingleStream and MultiStream scenarios, the power metric is in
+        # mJ/query.
         assert scenario_fixed in [
             "MultiStream",
             "SingleStream",
@@ -1432,11 +1476,17 @@

     # check if all the required files are present
     required_files = REQUIRED_PERF_FILES + REQUIRED_PERF_POWER_FILES
-    diff = files_diff(list_files(testing_path), required_files, OPTIONAL_PERF_FILES)
+    diff = files_diff(
+        list_files(testing_path),
+        required_files,
+        OPTIONAL_PERF_FILES)
     if diff:
         log.error("%s has file list mismatch (%s)", testing_path, diff)
         is_valid = False
-    diff = files_diff(list_files(ranging_path), required_files, OPTIONAL_PERF_FILES)
+    diff = files_diff(
+        list_files(ranging_path),
+        required_files,
+        OPTIONAL_PERF_FILES)
     if diff:
         log.error("%s has file list mismatch (%s)", ranging_path, diff)
         is_valid = False
@@ -1474,7 +1524,9 @@
         sys.stdout.flush()
         sys.stderr.flush()
         if check_power_result != 0:
-            log.error("Power WG power_checker.py did not pass for: %s", perf_path)
+            log.error(
+                "Power WG power_checker.py did not pass for: %s",
+                perf_path)
             is_valid = False

     return is_valid, power_metric, power_efficiency_testing
@@ -1620,19 +1672,19 @@ def log_result(
             "Offline": "Tokens/s",
             "Server": "Tokens/s",
         },
-        "llama2-70b-99" : {
+        "llama2-70b-99": {
             "SingleStream": "Latency (ms)",
             "MultiStream": "Latency (ms)",
             "Offline": "Tokens/s",
             "Server": "Tokens/s",
         },
-        "llama2-70b-99.9" : {
+        "llama2-70b-99.9": {
             "SingleStream": "Latency (ms)",
             "MultiStream": "Latency (ms)",
             "Offline": "Tokens/s",
             "Server": "Tokens/s",
         },
-        "mixtral-8x7b" : {
+        "mixtral-8x7b": {
             "SingleStream": "Latency (ms)",
             "MultiStream": "Latency (ms)",
             "Offline": "Tokens/s",
@@ -1657,8 +1709,8 @@
     unit = special_unit_dict.get(model_name, unit_dict)[scenario_fixed]
     power_unit = power_unit_dict[scenario_fixed]

-
-    if (power_metric <= 0) or (not get_boolean(system_json.get("system_power_only"))):
+    if (power_metric <= 0) or (not get_boolean(
+            system_json.get("system_power_only"))):
         csv.write(
             fmt.format(
                 submitter,
@@ -1690,7 +1742,7 @@
                 unit,
                 '"' + weight_data_types + '"',
             )
-            )
+        )

     if power_metric > 0:
         csv.write(
@@ -1770,7 +1822,7 @@
             if not os.path.exists(results_path):
                 continue

-            ## Apply folder checks
+            # Apply folder checks
             dirs = list_dirs_recursively(division, submitter)
             files = list_files_recursively(division, submitter)

@@ -1871,7 +1923,9 @@
                     extra_model_mapping = json.load(fp)

            for system_desc in list_dir(results_path):
-                # we are looking at ./$division/$submitter/results/$system_desc, ie ./closed/mlperf_org/results/t4-ort
+                # we are looking at
+                # ./$division/$submitter/results/$system_desc, ie
+                # ./closed/mlperf_org/results/t4-ort
                #
                # check if system_id is good.

@@ -1900,7 +1954,8 @@
                if config.version not in ["v0.5"]:
                    valid_system_types = ["datacenter", "edge"]
                    if config.version not in ["v0.7"]:
-                        valid_system_types += ["datacenter,edge", "edge,datacenter"]
+                        valid_system_types += ["datacenter,edge",
+                                               "edge,datacenter"]
                    if system_type not in valid_system_types:
                        log.error(
                            "%s has invalid system type (%s)",
@@ -1934,7 +1989,8 @@

                if is_closed_or_network and mlperf_model not in config.models:
                    # for closed/network divisions we want the model name to match.
-                    # for open division the model_name might be different than the task
+                    # for open division the model_name might be different
+                    # than the task
                    log.error(
                        "%s has an invalid model %s for closed/network division",
                        name,
@@ -1962,9 +2018,12 @@
                        list(required_scenarios) + list(config.get_optional(mlperf_model))
                    )

-                    for scenario in list_dir(results_path, system_desc, model_name):
-                        # some submissions in v0.5 use lower case scenarios - map them for now
-                        scenario_fixed = SCENARIO_MAPPING.get(scenario, scenario)
+                    for scenario in list_dir(
+                            results_path, system_desc, model_name):
+                        # some submissions in v0.5 use lower case scenarios -
+                        # map them for now
+                        scenario_fixed = SCENARIO_MAPPING.get(
+                            scenario, scenario)

                        # Skip scenario for debug purposes
                        if scenario in scenarios_to_skip:
@@ -2016,7 +2075,8 @@
                            scenario,
                        )
                        if not os.path.exists(measurement_dir):
-                            log.error("no measurement_dir for %s", measurement_dir)
+                            log.error(
+                                "no measurement_dir for %s", measurement_dir)
                            results[measurement_dir] = None
                            errors += 1
                            continue
@@ -2046,7 +2106,8 @@
                        # check accuracy
                        accuracy_is_valid = False
                        acc_path = os.path.join(name, "accuracy")
-                        if not os.path.exists(os.path.join(acc_path, "accuracy.txt")):
+                        if not os.path.exists(
+                                os.path.join(acc_path, "accuracy.txt")):
                            log.error(
                                "%s has no accuracy.txt. Generate it with accuracy-imagenet.py or accuracy-coco.py or "
                                "process_accuracy.py",
@@ -2055,7 +2116,8 @@
                            errors += 1
                            continue
                        elif scenario not in scenarios_to_skip:
-                            diff = files_diff(list_files(acc_path), REQUIRED_ACC_FILES)
+                            diff = files_diff(
+                                list_files(acc_path), REQUIRED_ACC_FILES)
                            if diff:
                                log.error(
                                    "%s has file list mismatch (%s)", acc_path, diff
@@ -2068,10 +2130,19 @@
                                acc_path,
                                debug or is_closed_or_network,
                            )
-                            acc = json.dumps(acc).replace(",", " ").replace('"', "").replace("{", "").replace("}", "")
+                            acc = json.dumps(acc).replace(
+                                ",",
+                                " ").replace(
+                                '"',
+                                "").replace(
+                                "{",
+                                "").replace(
+                                "}",
+                                "")
                            if mlperf_model in REQUIRED_ACC_BENCHMARK:
                                if config.version in REQUIRED_ACC_BENCHMARK[mlperf_model]:
-                                    extra_files_pass, missing_files = check_extra_files(acc_path, REQUIRED_ACC_BENCHMARK[mlperf_model][config.version])
+                                    extra_files_pass, missing_files = check_extra_files(
+                                        acc_path, REQUIRED_ACC_BENCHMARK[mlperf_model][config.version])
                                    if not extra_files_pass:
                                        log.error(
                                            "%s expected to have the following extra files (%s)", acc_path, missing_files
@@ -2085,7 +2156,8 @@
                            )
                            accuracy_is_valid = True
                        if not accuracy_is_valid:
-                            # a little below we'll not copy this into the results csv
+                            # a little below we'll not copy this into the
+                            # results csv
                            errors += 1
                            log.error("%s, accuracy not valid", acc_path)

@@ -2278,14 +2350,16 @@
                                )
                            else:
                                results[name] = None
-                                log.error("%s is OK but accuracy has issues", name)
+                                log.error(
+                                    "%s is OK but accuracy has issues", name)

                # Discard scenarios that we want to skip
                for scenario in scenarios_to_skip:
                    required_scenarios.discard(scenario)

                if required_scenarios:
-                    name = os.path.join(results_path, system_desc, model_name)
+                    name = os.path.join(
+                        results_path, system_desc, model_name)
                    if is_closed_or_network:
                        results[name] = None
                        log.error(
@@ -2313,9 +2387,8 @@ def check_system_desc_id(
 ):
     is_valid = True
     # check all required fields
-
-    required_fields = SYSTEM_DESC_REQUIRED_FIELDS.copy()
+    required_fields = SYSTEM_DESC_REQUIRED_FIELDS.copy()

     is_network_system, is_network_mode_valid = is_system_over_network(
         division, systems_json, fname
     )
@@ -2340,7 +2413,6 @@
                 "%s, field %s requires a meaningful response but is empty", fname, k
             )

-
     # SYSTEM_DESC_REQUIRED_FIELDS_POWER should be mandatory when a submission has power logs, but since we
     # check power submission in check_results_dir, the information is not available yet at this stage and we do
     # this check later
@@ -2428,21 +2500,45 @@ def check_measurement_dir(
     if has_power and not skip_check_power_measure_files:
         path = measurement_dir
-        all_files_1 = [os.path.join(path, f) for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))]
+        all_files_1 = [
+            os.path.join(
+                path,
+                f) for f in os.listdir(path) if os.path.isfile(
+                os.path.join(
+                    path,
+                    f))]
         path = os.path.join(path, "..")
-        all_files_2 = [os.path.join(path, f) for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))]
+        all_files_2 = [
+            os.path.join(
+                path,
+                f) for f in os.listdir(path) if os.path.isfile(
+                os.path.join(
+                    path,
+                    f))]
         path = os.path.join(path, "..")
-        all_files_3 = [os.path.join(path, f) for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))]
+        all_files_3 = [
+            os.path.join(
+                path,
+                f) for f in os.listdir(path) if os.path.isfile(
+                os.path.join(
+                    path,
+                    f))]
         path = os.path.join(path, "..")
-        all_files_4 = [os.path.join(path, f) for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))]
+        all_files_4 = [
+            os.path.join(
+                path,
+                f) for f in os.listdir(path) if os.path.isfile(
+                os.path.join(
+                    path,
+                    f))]
         all_files = all_files_1 + all_files_2 + all_files_3 + all_files_4

         for i in REQUIRED_POWER_MEASURE_FILES:
             found = False
             for file in all_files:
                 if re.match(i, os.path.basename(file)):
-                        found = True
-                        file_path = file
+                    found = True
+                    file_path = file
             if not found:
                 log.error("%s is missing %s", measurement_dir, i)
                 is_valid = False
@@ -2482,9 +2578,10 @@
                     log.error("%s, field %s is missing", fname, k)
                 elif check_empty_fields and not j[k]:
                     is_valid = False
-                    log.error("%s, field %s is missing meaningful value", fname, k)
+                    log.error(
+                        "%s, field %s is missing meaningful value", fname, k)

-            impl = system_file[len(system_desc) + 1 : -end]
+            impl = system_file[len(system_desc) + 1: -end]
             code_dir = os.path.join(root, "code", model)
             if os.path.isfile(code_dir):
                 with open(code_dir, "r") as f:
@@ -2498,7 +2595,7 @@
             if not os.path.exists(os.path.dirname(code_dir)):
                 log.error("%s is missing code_dir %s", fname, code_dir)
                 is_valid = False
-
+
     else:
         log.error("%s is missing %s*.json", fname, system_desc)
         is_valid = False
@@ -2521,7 +2618,9 @@
            is_valid = True
            break
    if is_valid == False:
-        log.error("Compliance test performance check in %s failed", test_dir)
+        log.error(
+            "Compliance test performance check in %s failed",
+            test_dir)

    # Check performance dir
    test_perf_path = os.path.join(test_dir, "performance", "run_1")
@@ -2535,7 +2634,10 @@
            ["mlperf_log_accuracy.json"],
        )
        if diff:
-            log.error("%s has file list mismatch (%s)", test_perf_path, diff)
+            log.error(
+                "%s has file list mismatch (%s)",
+                test_perf_path,
+                diff)
            is_valid = False

    return is_valid
@@ -2577,14 +2679,17 @@
            else REQUIRED_TEST01_ACC_FILES,
        )
        if diff:
-            log.error("%s has file list mismatch (%s)", test_acc_path, diff)
+            log.error(
+                "%s has file list mismatch (%s)",
+                test_acc_path,
+                diff)
            is_valid = False
        elif not acc_passed:
            target = config.get_accuracy_target(model)
            patterns = []
            acc_types = []
            for i in range(0, len(target), 2):
-                acc_type = target[i:i+2]
+                acc_type = target[i:i + 2]
                acc_types.append(acc_type)
                patterns.append(ACC_PATTERN[acc_type[0]])
            acc_seen = [False for _ in acc_type]
@@ -2613,14 +2718,17 @@
                    for acc_type, pattern in zip(acc_types, patterns):
                        m = re.match(pattern, line)
                        if m:
-                            acc_compliance[acc_type] = float(m.group(1))
+                            acc_compliance[acc_type] = float(
+                                m.group(1))
            for acc_type in acc_types:
                if acc_baseline[acc_type] == 0 or acc_compliance[acc_type] == 0:
                    is_valid = False
                    break
                else:
-                    required_delta_perc = config.get_delta_perc(model, acc_type[0])
-                    delta_perc = abs(1 - acc_baseline[acc_type] / acc_compliance[acc_type]) * 100
+                    required_delta_perc = config.get_delta_perc(
+                        model, acc_type[0])
+                    delta_perc = abs(
+                        1 - acc_baseline[acc_type] / acc_compliance[acc_type]) * 100
                    if delta_perc <= required_delta_perc:
                        is_valid = True
                    else:
@@ -2641,9 +2749,11 @@
            length_check_pass = "Sample length check pass: True" in lines
            is_valid = first_token_pass and eos_pass and length_check_pass
            if not is_valid:
-                log.error(f"TEST06 accuracy check failed. first_token_check: {first_token_pass} eos_check: {eos_pass} length_check: {length_check_pass}.")
+                log.error(
+                    f"TEST06 accuracy check failed. first_token_check: {first_token_pass} eos_check: {eos_pass} length_check: {length_check_pass}.")
    else:
-        raise NotImplemented(f"{test_dir} is neither TEST01 and TEST06, which doesn't require accuracy check")
+        raise NotImplemented(
+            f"{test_dir} is neither TEST01 and TEST06, which doesn't require accuracy check")

    return is_valid

@@ -2694,17 +2804,16 @@

    if model in [
        "stable-diffusion-xl"
-    ] and config.version in [ "v4.0" ]:
+    ] and config.version in ["v4.0"]:
        test_list.remove("TEST01")
        test_list.remove("TEST04")

-
    if model in [
        "llama2-70b-99",
        "llama2-70b-99.9",
        "mixtral-8x7b"
    ]:
-            test_list.append("TEST06")
+        test_list.append("TEST06")

    if test_list and not os.path.exists(compliance_dir):
        log.error("no compliance dir for %s: %s", name, compliance_dir)
@@ -2728,7 +2837,10 @@
                config, model, compliance_perf_dir, scenario, division, system_json
            )
            if is_inferred:
-                log.info("%s has inferred results, qps=%s", compliance_perf_dir, r)
+                log.info(
+                    "%s has inferred results, qps=%s",
+                    compliance_perf_dir,
+                    r)
        except Exception as e:
            log.error(
                "%s caused exception in check_performance_dir: %s",
@@ -2742,7 +2854,7 @@
            and compliance_perf_valid
        )

-    compliance_acc_pass= True
+    compliance_acc_pass = True
    for test in ["TEST01", "TEST06"]:
        if test in test_list:
            # Check accuracy for TEST01
@@ -2750,7 +2862,6 @@
                os.path.join(compliance_dir, test), model, config
            )

-
    return compliance_perf_pass and compliance_acc_pass and compliance_perf_dir_pass

@@ -2836,7 +2947,8 @@ def merge_two_dict(x, y):
    unique_closed_systems = merge_two_dict(
        closed_power_systems, closed_non_power_systems
    )
-    unique_open_systems = merge_two_dict(open_power_systems, open_non_power_systems)
+    unique_open_systems = merge_two_dict(
+        open_power_systems, open_non_power_systems)
    unique_network_systems = merge_two_dict(
        network_power_systems, network_non_power_systems
    )
@@ -2845,8 +2957,10 @@
    unique_systems = merge_two_dict(unique_systems, unique_network_systems)

    # power systems can be repeating in open, closed and network
-    unique_power_systems = merge_two_dict(closed_power_systems, open_power_systems)
-    unique_power_systems = merge_two_dict(unique_power_systems, network_power_systems)
+    unique_power_systems = merge_two_dict(
+        closed_power_systems, open_power_systems)
+    unique_power_systems = merge_two_dict(
+        unique_power_systems, network_power_systems)

    number_systems = len(unique_systems)
    number_power_systems = len(unique_power_systems)
@@ -2867,7 +2981,8 @@ def sum_dict_values(x):
    count_open_results = count_open_power_results + count_open_non_power_results

    count_network_power_results = sum_dict_values(network_power_systems)
-    count_network_non_power_results = sum_dict_values(network_non_power_systems)
+    count_network_non_power_results = sum_dict_values(
+        network_non_power_systems)
    count_network_results = (
        count_network_power_results + count_network_non_power_results
    )
@@ -2905,7 +3020,10 @@
    )

    log.info("---")
-    log.info("Systems=%d, Power Systems=%d", number_systems, number_power_systems)
+    log.info(
+        "Systems=%d, Power Systems=%d",
+        number_systems,
+        number_power_systems)
    log.info(
        "Closed Systems=%d, Closed Power Systems=%d",
        number_closed_systems,