diff --git a/.flake8 b/.flake8 new file mode 100644 index 0000000..1d2f728 --- /dev/null +++ b/.flake8 @@ -0,0 +1,5 @@ +[flake8] +extend-ignore = E203 +exclude = .git,__pycache__,batches,resources +max-complexity = 10 +max-line-length = 88 diff --git a/.gitignore b/.gitignore index 37ca6ef..ed7863a 100644 --- a/.gitignore +++ b/.gitignore @@ -1,20 +1,10 @@ -/logs*.d/**/* .env /Cargo.lock -/tasks.d/**/* -/jobs.d/**/* -/results*.d/**/* -/scripts*.d/**/* -/inventories*.d/**/* -/backup/**/* -/test_results/**/* -/jobs*.yaml -/menage.sh -/resources -/*.tar.* /*ipynb* -/batches/** +/batches /.ssh_g5k.pub +/__pycache__ +/*.png # Added by cargo diff --git a/analysis/data_analysis.py b/analysis/data_analysis.py new file mode 100644 index 0000000..5d658b7 --- /dev/null +++ b/analysis/data_analysis.py @@ -0,0 +1,286 @@ +# IMPORTS +import os +import sys +import polars as pl +import schemas +import extract +import load +import rq1 +import rq2 +import rq3 +import rq34 +import visualization + + +vendor_generation_map = { + "E5-2620 v4": { + "architecture": "Broadwell-E", + "vendor": "Intel", + "generation": 6, + "launch_date": "Q1 2016", + }, + "E5-2630L v4": { + "architecture": "Broadwell-E", + "vendor": "Intel", + "generation": 6, + "launch_date": "Q1 2016", + }, + "E5-2698 v4": { + "architecture": "Broadwell-E", + "vendor": "Intel", + "generation": 6, + "launch_date": "Q1 2016", + }, + "E5-2630 v3": { + "architecture": "Haswell-E", + "vendor": "Intel", + "generation": 5, + "launch_date": "Q3 2014", + }, + "Gold 5220": { + "architecture": "Cascade Lake-SP", + "vendor": "Intel", + "generation": 10, + "launch_date": "Q2 2019", + }, + "Gold 5218": { + "architecture": "Cascade Lake-SP", + "vendor": "Intel", + "generation": 10, + "launch_date": "Q2 2019", + }, + "i7-9750H": { + "architecture": "Coffee Lake", + "vendor": "Intel", + "generation": 9, + "launch_date": "Q2 2019", + }, + "Silver 4314": { + "architecture": "Ice Lake-SP", + "vendor": "Intel", + "generation": 10, + "launch_date": "Q2 2021", + }, + "Gold 5320": { + "architecture": "Ice Lake-SP", + "vendor": "Intel", + "generation": 10, + "launch_date": "Q2 2021", + }, + "Gold 6126": { + "architecture": "Skylake-SP", + "vendor": "Intel", + "generation": 6, + "launch_date": "Q3 2017", + }, + "Gold 6130": { + "architecture": "Skylake-SP", + "vendor": "Intel", + "generation": 6, + "launch_date": "Q3 2017", + }, + "E5-2620": { + "architecture": "Sandy Bridge-EP", + "vendor": "Intel", + "generation": 3, + "launch_date": "Q1 2012", + }, + "E5-2630": { + "architecture": "Sandy Bridge-EP", + "vendor": "Intel", + "generation": 3, + "launch_date": "Q1 2012", + }, + "E5-2630L": { + "architecture": "Sandy Bridge-EP", + "vendor": "Intel", + "generation": 3, + "launch_date": "Q1 2012", + }, + "E5-2660": { + "architecture": "Sandy Bridge-EP", + "vendor": "Intel", + "generation": 3, + "launch_date": "Q1 2012", + }, + "7301": { + "architecture": "Zen", + "vendor": "AMD", + "generation": 1, + "launch_date": "Q2 2017", + }, + "7352": { + "architecture": "Zen 2", + "vendor": "AMD", + "generation": 2, + "launch_date": "Q3 2019", + }, + "7452": { + "architecture": "Zen 2", + "vendor": "AMD", + "generation": 2, + "launch_date": "Q3 2019", + }, + "7642": { + "architecture": "Zen 2", + "vendor": "AMD", + "generation": 2, + "launch_date": "Q3 2019", + }, + "7742": { + "architecture": "Zen 2", + "vendor": "AMD", + "generation": 2, + "launch_date": "Q3 2019", + }, +} + + +def main(): + + test = sys.argv[1] + if test == "test": + test = True + else: + test = False + + 
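+    # Cached per-OS statistics: each energy_for_os call below parses the raw
+    # results once and caches them as CSV. The regex argument is matched
+    # against result file paths and is expected to capture
+    # (site, cluster, node, task) in that order; the trailing group separates
+    # hwpc and perf runs that executed together ("and") from isolated runs.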
+    debian11_energy_stats_df = energy_for_os(
+        "debian11-5.10-0",
+        r"batches/debian11-5\.10-0\.d/results-debian11-5\.10-0\.d/([^/]+)/([^/]+)/([^/]+)/[^_]*_([^_]+).*",
+        test,
+    )
+    ubuntu2404_energy_stats_df = energy_for_os(
+        "ubuntu2404nfs-6.8-0",
+        r"batches/ubuntu2404nfs-6\.8-0\.d/results-ubuntu2404nfs-6\.8-0\.d/([^/]+)/([^/]+)/([^/]+)/[^_]*_([^_]+).*",
+        test,
+    )
+
+    powerapi_energy_stats_df = energy_for_os(
+        "powerapi",
+        r"batches/powerapi\.d/results-powerapi\.d/([^/]+)/([^/]+)/([^/]+)/[^_]*_([^_]+).*",
+        test,
+    )
+
+
+    rq3.correlation_perf_perf_hwpc_hwpc_cv_os(ubuntu2404_energy_stats_df, debian11_energy_stats_df, "alone")
+    rq1.correlation_perf_hwpc_cv(debian11_energy_stats_df, "alone", "debian11 Kernel 5.10")
+    rq1.correlation_perf_hwpc_cv(debian11_energy_stats_df, "not_alone", "debian11 Kernel 5.10")
+    rq1.correlation_perf_hwpc_cv(ubuntu2404_energy_stats_df, "alone", "ubuntu2404 Kernel 6.8")
+    rq1.correlation_perf_hwpc_cv(ubuntu2404_energy_stats_df, "not_alone", "ubuntu2404 Kernel 6.8")
+
+    rq2.boxplots_perf_hwpc_cv_processor(debian11_energy_stats_df, "processor_detail", "pkg_coefficient_of_variation", "job", "25 000 Operations")
+
+
+    joined_df = ubuntu2404_energy_stats_df.join(
+        debian11_energy_stats_df,
+        on=["node", "nb_ops_per_core", "nb_core", "job"],
+        suffix="_debian",
+    )
+
+    # Drop the 25 ops-per-core runs as they may be irrelevant
+    joined_df = joined_df.sql("SELECT * FROM self WHERE nb_ops_per_core > 25")
+
+
+    # RQ3/4
+    rq34.os_comparison_boxplots_processor_versions_pkg_all(
+        [debian11_energy_stats_df, ubuntu2404_energy_stats_df]
+    )
+    rq34.os_comparison_boxplots_processor_versions_ram_all(
+        [debian11_energy_stats_df, ubuntu2404_energy_stats_df]
+    )
+    print("Heatmaps pkg perf alone")
+    rq34.os_comparison_heatmap_processor_versions_pkg_nb_ops(joined_df.sql("SELECT * FROM self WHERE job = 'perf_alone'"), "PERF")
+    print("Heatmaps pkg hwpc alone")
+    rq34.os_comparison_heatmap_processor_versions_pkg_nb_ops(joined_df.sql("SELECT * FROM self WHERE job = 'hwpc_alone'"), "HWPC")
+    print("Heatmaps ram perf alone")
+    rq34.os_comparison_heatmap_processor_versions_ram_nb_ops(joined_df.sql("SELECT * FROM self WHERE job = 'perf_alone'"), "PERF")
+    print("Heatmaps ram hwpc alone")
+    rq34.os_comparison_heatmap_processor_versions_ram_nb_ops(joined_df.sql("SELECT * FROM self WHERE job = 'hwpc_alone'"), "HWPC")
+    rq34.os_comparison_heatmap_processor_versions_pkg_percent_used(joined_df)
+    rq34.os_comparison_heatmap_processor_versions_ram_percent_used(joined_df)
+
+    rq34.debian_facetgrid_processor_versions_pkg_cv_nb_ops(debian11_energy_stats_df.sql("SELECT * FROM self WHERE nb_ops_per_core > 25"))
+    rq34.debian_facetgrid_processor_versions_ram_cv_nb_ops(debian11_energy_stats_df.sql("SELECT * FROM self WHERE nb_ops_per_core > 25"))
+    rq34.ubuntu_facetgrid_processor_versions_pkg_cv_nb_ops(ubuntu2404_energy_stats_df.sql("SELECT * FROM self WHERE nb_ops_per_core > 25"))
+    rq34.ubuntu_facetgrid_processor_versions_ram_cv_nb_ops(ubuntu2404_energy_stats_df.sql("SELECT * FROM self WHERE nb_ops_per_core > 25"))
+
+
+def energy_for_os(os_flavor, results_directory_match, test):
+    if test:
+        energy_stats_csv_file = (
+            f"batches/{os_flavor}.d/{os_flavor}_energy_stats_sample.csv"
+        )
+    else:
+        energy_stats_csv_file = f"batches/{os_flavor}.d/{os_flavor}_energy_stats.csv"
+    if os.path.exists(energy_stats_csv_file):
+        return pl.read_csv(energy_stats_csv_file)
+    results_directory: str = f"batches/{os_flavor}.d/results-{os_flavor}.d/"
+    inventories_directory: str = f"batches/{os_flavor}.d/inventories-{os_flavor}.d/"
+    (hwpc_files, perf_files) = extract.extract_csv_files(results_directory)
+
+    nodes_df = extract.extract_json_files(
+        directory=inventories_directory, schema=schemas.nodes_configuration_columns
+    )
+
+    nodes_df = nodes_df.with_columns(
+        [
+            # (pl.col("processor_version").map_elements(lambda x: f"{x}\nGen: {vendor_generation_map[x]['architecture']}\nRelease: {vendor_generation_map[x]['launch_date']}", return_dtype=pl.String).alias("processor_detail")),
+            (
+                pl.col("processor_version")
+                .map_elements(
+                    lambda x: f"{x}\n{vendor_generation_map[x]['architecture']}",
+                    return_dtype=pl.String,
+                )
+                .alias("processor_detail")
+            ),
+            (
+                pl.col("processor_version")
+                .map_elements(
+                    lambda x: vendor_generation_map[x]["generation"],
+                    return_dtype=pl.Int64,
+                )
+                .alias("processor_generation")
+            ),
+            (
+                pl.col("processor_version")
+                .map_elements(
+                    lambda x: vendor_generation_map[x]["vendor"], return_dtype=pl.String
+                )
+                .alias("processor_vendor")
+            ),
+        ]
+    )
+
+    print("Nodes Configuration glimpse:\n", nodes_df.head())
+
+    # Data Exploration
+    (hwpc_results, perf_results) = load.load_results(
+        hwpc_files, perf_files, results_directory_match, test
+    )
+    print(
+        "HWPC Results glimpse:\n",
+        hwpc_results.head(),
+        "\nHWPC Results stats:\n",
+        hwpc_results.describe(),
+    )
+    print(hwpc_results.sql("select energy_pkg from self").describe())
+    print(
+        "Perf Results glimpse:\n",
+        perf_results.head(),
+        "\nPerf Results stats:\n",
+        perf_results.describe(),
+    )
+
+    energy_stats_df = load.load_energy(hwpc_results, perf_results, nodes_df, os_flavor)
+    energy_stats_df.write_csv(energy_stats_csv_file, separator=",")
+
+    return energy_stats_df
+
+
+if __name__ == "__main__":
+    main()
diff --git a/analysis/execution_time.py b/analysis/execution_time.py
new file mode 100644
index 0000000..e110f8d
--- /dev/null
+++ b/analysis/execution_time.py
@@ -0,0 +1,46 @@
+import os
+import glob
+import pandas as pd
+
+
+def compute_mean_std(directory, nb_ops):
+    # Collect every perf CSV produced for this workload size
+    pattern = os.path.join(directory, f"**/perf_*_{nb_ops}.csv")
+    files = glob.glob(pattern, recursive=True)
+
+    if not files:
+        print(f"No files found for NB_OPS={nb_ops}")
+        return
+
+    time_elapsed_values = []
+
+    for file in files:
+        try:
+            df = pd.read_csv(file)
+            time_elapsed_values.extend(df["time_elapsed"].dropna())
+        except Exception as e:
+            print(f"Error reading file {file}: {e}")
+
+    if not time_elapsed_values:
+        print(f"No valid time_elapsed values found in files for NB_OPS={nb_ops}")
+        return
+
+    # Mean and population standard deviation of the execution times
+    mean_time = sum(time_elapsed_values) / len(time_elapsed_values)
+    std_dev_time = (sum((x - mean_time) ** 2 for x in time_elapsed_values) / len(time_elapsed_values)) ** 0.5
+
+    print(f"Results for NB_OPS={nb_ops}:")
+    print(f"  Mean time_elapsed: {mean_time:.6f} seconds")
+    print(f"  Standard deviation: {std_dev_time:.6f} seconds")
+
+
+# Compute the statistics for every workload size in each results directory
+directories = {
+    "Ubuntu": "./batches/ubuntu2404nfs-6.8-0.d/results-ubuntu2404nfs-6.8-0.d/",
+    "Debian": "./batches/debian11-5.10-0.d/results-debian11-5.10-0.d/",
+    "Powerapi": "./results_powerapi2u",
+}
+for label, directory in directories.items():
+    print(f"For {label}")
+    for nb_ops in (25, 250, 2500, 25000):
+        compute_mean_std(directory, nb_ops)
diff --git a/analysis/extract.py b/analysis/extract.py
new file mode 100644
index 0000000..25abefd
--- /dev/null
+++ b/analysis/extract.py
@@ -0,0 +1,147 @@
+import os
+import json
+import csv
+import re
+from typing import Tuple, List
+import polars as pl
+
+
+# Extract CSV to Polars DataFrames
+# Extract HWPC & PERF CSVs
+def extract_csv_files(directory: str) -> Tuple[List[str], List[str]]:
+    hwpc_files = []
+    perf_files = []
+    for site in os.scandir(directory):
+        for cluster in os.scandir(site.path):
+            for node in os.scandir(cluster.path):
+                if node.is_dir():
+                    for filename in os.scandir(node.path):
+
+                        if filename.path.endswith(".csv"):
+                            if filename.name.startswith("hwpc"):
+                                hwpc_files.append(filename.path)
+                            elif filename.name.startswith("perf"):
+                                perf_files.append(filename.path)
+    return hwpc_files, perf_files
+
+
+def read_hwpc_csv(file_path: str, results_directory_match: str):
+    (site, cluster, node, task) = re.match(results_directory_match, file_path).groups()
+    with_perf = False
+    if task == "and":
+        with_perf = True
+    rows = []
+    with open(file_path, "r") as csv_file:
+        reader = csv.reader(csv_file)
+        next(reader)  # Skip header
+        for row in reader:
+            parsed_row = (
+                int(row[0]),
+                row[1],
+                row[2],
+                int(row[3]),
+                int(row[4]),
+                int(row[5]) if row[5] else None,
+                int(row[6]) if row[6] else None,
+                int(row[7]) if row[7] else None,
+                int(row[8]),
+                int(row[9]),
+                int(row[10]),
+                int(row[11]),
+                int(row[12]),
+                with_perf,
+                site,
+                cluster,
+                node,
+            )
+            rows.append(parsed_row)
+
+    return rows
+
+
+def read_perf_csv(file_path: str, results_directory_match: str):
+    (site, clstr, node, task) = re.match(results_directory_match, file_path).groups()
+    with_hwpc = False
+    if task == "and":
+        with_hwpc = True
+    rows = []
+    with open(file_path, "r") as csv_file:
+        reader = csv.DictReader(csv_file)
+        for row in reader:
+            try:
+                power_energy_pkg = float(row["power_energy_pkg"])
+            except ValueError:
+                power_energy_pkg = 0.0
+            try:
+                power_energy_ram = float(row["power_energy_ram"])
+            except ValueError:
+                power_energy_ram = 0.0
+            try:
+                power_energy_cores = float(row["power_energy_cores"])
+            except ValueError:
+                power_energy_cores = 0.0
+            parsed_row = (
+                float(power_energy_pkg),
+                float(power_energy_ram),
+                float(power_energy_cores),
+                float(row["time_elapsed"]),
+                int(row["nb_core"]),
+                int(row["nb_ops_per_core"]),
+                
int(row["iteration"]), + bool(with_hwpc), + site, + clstr, + node, + ) + rows.append(parsed_row) + return rows + + +# Extract JSON nodes information + + +def extract_json_files(directory: str, schema: str): + + nodes_df = pl.DataFrame(schema=schema, strict=True) + + for site in os.scandir(directory): + for cluster in os.scandir(site.path): + + for node in os.scandir(cluster.path): + if node.name.endswith(".json"): + with open(node.path, "r") as json_file: + data = json.load(json_file) + # Assuming proper parsing and casting here + node = ( + data["uid"], + data["cluster"], + bool(data["exotic"]), + int(data["architecture"]["nb_cores"]), + int(data["architecture"]["nb_threads"]), + data["processor"]["vendor"], + int(data["processor"]["clock_speed"]), + data["processor"]["instruction_set"], + bool(data["processor"]["ht_capable"]), + data["processor"]["microarchitecture"], + data["processor"]["microcode"], + data["processor"]["model"], + data["processor"]["version"], + data["operating_system"]["cstate_driver"], + data["operating_system"]["cstate_governor"], + data["operating_system"]["pstate_driver"], + data["operating_system"]["pstate_governor"], + bool(data["operating_system"]["turboboost_enabled"]), + ) + + nodes_df = pl.concat( + [ + nodes_df, + pl.from_records( + schema=schema, + data=[node], + strict=True, + orient="row", + ), + ] + ) + return nodes_df diff --git a/analysis/load.py b/analysis/load.py new file mode 100644 index 0000000..76449d7 --- /dev/null +++ b/analysis/load.py @@ -0,0 +1,205 @@ +import schemas +import extract +from typing import * +import polars as pl +from tqdm import tqdm +from math import ldexp + + +def load_hwpc_results(hwpc_df): + print(hwpc_df.sql("select rapl_energy_pkg from self").describe()) + hwpc_results = pl.sql( + """ + SELECT node, nb_core, nb_ops_per_core, iteration, alone, + SUM(rapl_energy_pkg) as energy_pkg_int, + SUM(rapl_energy_cores) as energy_cores_int, + SUM(rapl_energy_dram) as energy_ram_int + FROM hwpc_df + GROUP BY sensor, target, socket, cpu, node, nb_core, + nb_ops_per_core, iteration, alone + """ + ).collect() + + hwpc_results = hwpc_results.with_columns( + pl.col("energy_pkg_int") + .map_elements(lambda x: ldexp(x, -32) * 10e6, return_dtype=pl.Float64) + .alias("energy_pkg"), + ) + + hwpc_results = hwpc_results.with_columns( + pl.col("energy_cores_int") + .map_elements(lambda x: ldexp(x, -32) * 10e6, return_dtype=pl.Float64) + .alias("energy_cores"), + ) + + hwpc_results = hwpc_results.with_columns( + pl.col("energy_ram_int") + .map_elements(lambda x: ldexp(x, -32) * 10e6, return_dtype=pl.Float64) + .alias("energy_ram"), + ) + + hwpc_results = hwpc_results.drop( + ["energy_pkg_int", "energy_cores_int", "energy_ram_int"] + ) + + task = pl.Series("task", ["hwpc" for i in range(hwpc_results.shape[0])]) + hwpc_results.insert_column(1, task) + + return hwpc_results + + +def load_perf_results(perf_df): + perf_results = pl.sql( + """ + SELECT node, nb_core, nb_ops_per_core, iteration, alone, + power_energy_pkg as energy_pkg, + power_energy_cores as energy_cores, + power_energy_ram as energy_ram FROM perf_df + """ + ).collect() + perf_results = perf_results.with_columns(pl.col("energy_pkg") * 10e6) + perf_results = perf_results.with_columns(pl.col("energy_cores") * 10e6) + perf_results = perf_results.with_columns(pl.col("energy_ram") * 10e6) + task = pl.Series("task", ["perf" for i in range(perf_results.shape[0])]) + perf_results.insert_column(1, task) + + return perf_results + + +def load_results(hwpc_files, perf_files, 
results_directory_match, test):
+    hwpc_df = pl.DataFrame(schema=schemas.hwpc_columns, strict=True)
+
+    perf_df = pl.DataFrame(schema=schemas.perf_columns, strict=True)
+
+    if test:
+        count = 0
+    for hwpc_file, perf_file in tqdm(zip(hwpc_files, perf_files)):
+        if test:
+            count += 1
+            if count == 100:
+                break
+        hwpc_df = pl.concat(
+            [
+                hwpc_df,
+                pl.from_records(
+                    schema=schemas.hwpc_columns,
+                    data=extract.read_hwpc_csv(hwpc_file, results_directory_match),
+                    strict=True,
+                    orient="row",
+                ),
+            ]
+        )
+        perf_df = pl.concat(
+            [
+                perf_df,
+                pl.from_records(
+                    schema=schemas.perf_columns,
+                    data=extract.read_perf_csv(perf_file, results_directory_match),
+                    strict=True,
+                    orient="row",
+                ),
+            ]
+        )
+
+    hwpc_results = load_hwpc_results(hwpc_df)
+    perf_results = load_perf_results(perf_df)
+
+    return (hwpc_results, perf_results)
+
+
+def load_energy(hwpc_results, perf_results, nodes_df, os):
+    energy_df = pl.concat([hwpc_results, perf_results])
+    energy_df = pl.DataFrame(schema=schemas.energy_columns, data=energy_df)
+
+    energy_stats_df = energy_df.sql(
+        """
+        SELECT
+        node,
+        task,
+        nb_core,
+        nb_ops_per_core,
+        alone,
+        avg(energy_pkg) as pkg_average,
+        median(energy_pkg) as pkg_median,
+        min(energy_pkg) as pkg_minimum,
+        max(energy_pkg) as pkg_maximum,
+        stddev(energy_pkg) as pkg_standard_deviation,
+        quantile_cont(energy_pkg, 0.25) as pkg_quantile_25,
+        quantile_cont(energy_pkg, 0.75) as pkg_quantile_75,
+        (stddev(energy_pkg) / avg(energy_pkg)) as pkg_coefficient_of_variation,
+        avg(energy_cores) as cores_average,
+        median(energy_cores) as cores_median,
+        min(energy_cores) as cores_minimum,
+        max(energy_cores) as cores_maximum,
+        stddev(energy_cores) as cores_standard_deviation,
+        quantile_cont(energy_cores, 0.25) as cores_quantile_25,
+        quantile_cont(energy_cores, 0.75) as cores_quantile_75,
+        (stddev(energy_cores)/avg(energy_cores)) as cores_coefficient_of_variation,
+        avg(energy_ram) as ram_average,
+        median(energy_ram) as ram_median,
+        min(energy_ram) as ram_minimum,
+        max(energy_ram) as ram_maximum,
+        stddev(energy_ram) as ram_standard_deviation,
+        quantile_cont(energy_ram, 0.25) as ram_quantile_25,
+        quantile_cont(energy_ram, 0.75) as ram_quantile_75,
+        (stddev(energy_ram) / avg(energy_ram)) as ram_coefficient_of_variation
+        FROM self
+        GROUP BY node, task, nb_core, nb_ops_per_core, alone
+    """
+    )
+    energy_stats_df = pl.DataFrame(energy_stats_df, schema=schemas.stats_columns)
+    energy_stats_df = energy_stats_df.join(
+        other=nodes_df, left_on="node", right_on="uid", how="left", validate="m:1"
+    )
+    energy_stats_df = energy_stats_df.with_columns([
+        (pl.col("nb_core") / pl.col("architecture_nb_cores")).alias("percent_cores_used"),
+        (pl.col("nb_core") / pl.col("architecture_nb_threads")).alias("percent_threads_used"),
+
+    ])
+    print("New columns :", energy_stats_df.sql("SELECT percent_cores_used, percent_threads_used FROM self").describe())
+
+    ranges = {
+        "10": (0, 0.1),
+        "25": (0.1, 0.25),
+        "50": (0.25, 0.5),
+        "75": (0.5, 0.75),
+        "90": (0.75, 0.9),
+        "100": (0.9, 1.0),
+        "110": (1.0, 1.1),
+    }
+
+    def assign_category(value):
+        for label, (low, high) in ranges.items():
+            if low <= value < high:
+                return int(label)
+        return None
+
+    energy_stats_df = energy_stats_df.with_columns(
+        pl.col("percent_cores_used")
+        .map_elements(assign_category, return_dtype=pl.Int64)
+        .alias("percent_cores_used_category")
+    )
+
+    energy_stats_df = energy_stats_df.with_columns(
+        pl.col("percent_threads_used")
+        .map_elements(assign_category, return_dtype=pl.Int64)
+        .alias("percent_threads_used_category")
+    )
+
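+    # Worked example: a run using 16 of 32 cores has percent_cores_used = 0.5,
+    # which falls in the [0.5, 0.75) bucket and is labelled 75; utilisations of
+    # 1.1 or more (beyond the last bucket) come back as None.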
+    jobs = {
+        "hwpc_true": "hwpc_alone",
+        "hwpc_false": "hwpc_with_perf",
+        "perf_true": "perf_alone",
+        "perf_false": "perf_with_hwpc",
+    }
+
+    energy_stats_df = energy_stats_df.with_columns(
+        pl.concat_str(["task", "alone"], separator="_").alias("job")
+    )
+
+    energy_stats_df = energy_stats_df.with_columns(pl.col("job").replace_strict(jobs))
+
+    print("New columns :", energy_stats_df.sql("SELECT percent_cores_used, percent_threads_used, percent_cores_used_category, percent_threads_used_category FROM self").describe())
+
+    return energy_stats_df
diff --git a/analysis/rq1.py b/analysis/rq1.py
new file mode 100644
index 0000000..64a3810
--- /dev/null
+++ b/analysis/rq1.py
@@ -0,0 +1,59 @@
+import visualization
+import matplotlib.pyplot as plt
+import seaborn as sns
+import pandas as pd
+import polars as pl
+
+def correlation_perf_hwpc_cv(df, job, os):
+
+    if job == "alone":
+        df_perf = df.sql(f"SELECT * FROM self WHERE job = 'perf_{job}'")
+        df_hwpc = df.sql(f"SELECT * FROM self WHERE job = 'hwpc_{job}'")
+        title = f"Scatterplot of PERF coefficient of variation related to HWPC, PKG domain, measurement tools isolated\n{os}"
+    else:
+        df_perf = df.sql("SELECT * FROM self WHERE job = 'perf_with_hwpc'")
+        df_hwpc = df.sql("SELECT * FROM self WHERE job = 'hwpc_with_perf'")
+        title = f"Scatterplot of PERF coefficient of variation related to HWPC, PKG domain, measurement tools running together\n{os}"
+
+    joined = df_hwpc.join(
+        other=df_perf, on=["node", "nb_ops_per_core", "nb_core", "alone"], how="left", validate="1:1", suffix="_perf"
+    )
+
+    sns.set_theme(style="whitegrid")
+    f, ax = plt.subplots(figsize=(12,8))
+    sns.despine(f, left=True, bottom=True)
+    plotted_df = joined.sql("SELECT * FROM self WHERE nb_ops_per_core = 25000 and processor_version != 'Gold 5320'").drop_nulls(subset=["pkg_coefficient_of_variation", "pkg_coefficient_of_variation_perf"]).drop_nans(subset=["pkg_coefficient_of_variation", "pkg_coefficient_of_variation_perf"])
+
+    max_perf = plotted_df["pkg_coefficient_of_variation_perf"].max()
+    max_hwpc = plotted_df["pkg_coefficient_of_variation"].max()
+    max_both = max(max_perf, max_hwpc)
+
+    corr = plotted_df.select(pl.corr("pkg_coefficient_of_variation_perf", "pkg_coefficient_of_variation")).item()
+    correlations = (
+        plotted_df.group_by("processor_detail")
+        .agg(pl.corr("pkg_coefficient_of_variation_perf", "pkg_coefficient_of_variation").alias("corr"))
+    )
+    corr_dict = dict(zip(correlations["processor_detail"], correlations["corr"]))
+    scatter = sns.scatterplot(data=plotted_df,
+                    x="pkg_coefficient_of_variation_perf",
+                    y="pkg_coefficient_of_variation",
+                    hue="processor_detail",
+                    style="processor_vendor"
+    )
+    sns.lineplot(x=[0, max_both], y=[0, max_both], color="red", linestyle="dashed", label="f(x) = x")
+    plt.title(title)
+    plt.xlabel("Coefficient of variation of PERF for PKG domain")
+    plt.ylabel("Coefficient of variation of HWPC for PKG domain")
+    plt.text(0.05, 0.95, f"Correlation: {corr:.2f}", transform=plt.gca().transAxes,
+             fontsize=12, verticalalignment='top', bbox=dict(boxstyle="round,pad=0.3",
+             edgecolor='black',
+             facecolor='white')
+    )
+    handles, labels = scatter.get_legend_handles_labels()
+    # Pair handles with labels before filtering so legend entries stay aligned
+    kept = [(h, l) for h, l in zip(handles, labels) if l in corr_dict]
+    new_labels = [f"{l} (corr: {corr_dict[l]:.2f})" for _, l in kept]
+    plt.legend([h for h, _ in kept], new_labels, loc="lower right")
+    plt.tight_layout()
+    plt.show()
+
diff --git a/analysis/rq2.py b/analysis/rq2.py
new file mode 100644
index 0000000..c9923d6
--- /dev/null
+++ b/analysis/rq2.py
@@ -0,0 +1,35 @@
+import visualization
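+# RQ2: distribution of the PKG coefficient of variation across processor
+# versions at the largest workload size, one box per job (tool and isolation mode).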
+import matplotlib.pyplot as plt +import seaborn as sns +import pandas as pd +import polars as pl +import re + + +def boxplots_perf_hwpc_cv_processor(df, x, y, hue, prefix, save=True, show=True): + plt.figure(figsize=(12, 6)) + df = df.sql("SELECT * FROM self WHERE nb_ops_per_core = 25000") + sns.boxplot( + data=df, + x=x, + y=y, + hue=hue, + showfliers=False + ) + + title = f"{prefix} - PKG Coefficient of Variation by {hue} and {x}" + plt.title(title) + plt.xticks(rotation=90, ha="right") + plt.xlabel("Processor version and generation") + plt.ylabel("PKG Coefficient of Variation") + safe_title = re.sub(r'[^\w\s-]', '', title) # Remove invalid characters + safe_title = safe_title.replace(" ", "_") + safe_title = safe_title.replace("\n", "_") + plt.tight_layout() + if save: + plt.savefig(f'{safe_title}.png', dpi=500) + if show: + plt.show() + diff --git a/analysis/rq3.py b/analysis/rq3.py new file mode 100644 index 0000000..e48826d --- /dev/null +++ b/analysis/rq3.py @@ -0,0 +1,68 @@ +import visualization +import matplotlib.pyplot as plt +import seaborn as sns +import pandas as pd +import polars as pl + +def correlation_perf_perf_hwpc_hwpc_cv_os(df1, df2, job): + + if job == "alone": + df1_perf = df1.sql(f"SELECT * FROM self WHERE job = 'perf_{job}'") + df2_perf = df2.sql(f"SELECT * FROM self WHERE job = 'perf_{job}'") + df1_hwpc = df1.sql(f"SELECT * FROM self WHERE job = 'hwpc_{job}'") + df2_hwpc = df2.sql(f"SELECT * FROM self WHERE job = 'hwpc_{job}'") + title = f"Scatterplot of Ubuntu PERF coefficient of variation related to Debian, PKG domain, measurement tools isolated" + else: + df1_perf = df1.sql(f"SELECT * FROM self WHERE job = 'perf_with_hwpc'") + df2_perf = df2.sql(f"SELECT * FROM self WHERE job = 'perf_with_hwpc'") + df1_hwpc = df1.sql(f"SELECT * FROM self WHERE job = 'hwpc_with_perf'") + df2_hwpc = df2.sql(f"SELECT * FROM self WHERE job = 'hwpc_with_perf'") + title = f"Scatterplot of Ubuntu PERF coefficient of variation related to Debian, PKG domain, measurement tools running together" + + joined_perf = df1_perf.join( + other=df2_perf, on=["node", "nb_ops_per_core", "nb_core", "alone"], how="left", validate="1:1", suffix="_debian" + ) + joined_hwpc = df1_hwpc.join( + other=df2_hwpc, on=["node", "nb_ops_per_core", "nb_core", "alone"], how="left", validate="1:1", suffix="_debian" + ) + + sns.set_theme(style="whitegrid") + f, ax = plt.subplots(figsize=(12,8)) + sns.despine(f, left=True, bottom=True) + plotted_df_perf = joined_perf.sql("SELECT * FROM self WHERE nb_ops_per_core = 25000 and processor_version != 'Gold 5320'").drop_nulls(subset=["pkg_coefficient_of_variation_debian", "pkg_coefficient_of_variation"]).drop_nans(subset=["pkg_coefficient_of_variation_debian", "pkg_coefficient_of_variation"]) + plotted_df_hwpc = joined_hwpc.sql("SELECT * FROM self WHERE nb_ops_per_core = 25000 and processor_version != 'Gold 5320'").drop_nulls(subset=["pkg_coefficient_of_variation_debian", "pkg_coefficient_of_variation"]).drop_nans(subset=["pkg_coefficient_of_variation_debian", "pkg_coefficient_of_variation"]) + + max_perf_1 = plotted_df_perf["pkg_coefficient_of_variation"].max() + max_perf_2 = plotted_df_perf["pkg_coefficient_of_variation_debian"].max() + max_perf_both = max(max_perf_1, max_perf_2) + max_hwpc_1 = plotted_df_hwpc["pkg_coefficient_of_variation"].max() + max_hwpc_2 = plotted_df_hwpc["pkg_coefficient_of_variation_debian"].max() + max_hwpc_both = max(max_hwpc_1, max_hwpc_2) + + corr = plotted_df_perf.select(pl.corr("pkg_coefficient_of_variation", 
"pkg_coefficient_of_variation_debian")).item() + correlations = ( + plotted_df_perf.group_by("processor_detail") + .agg(pl.corr("pkg_coefficient_of_variation", "pkg_coefficient_of_variation_debian").alias("corr")) + ) + corr_dict = dict(zip(correlations["processor_detail"], correlations["corr"])) + scatter = sns.scatterplot(data=plotted_df_perf, + x="pkg_coefficient_of_variation", + y="pkg_coefficient_of_variation_debian", + hue="node", + style="processor_vendor" + ) + sns.lineplot(x=[0, max_perf_both], y=[0, max_perf_both], color="red", linestyle="dashed", label="f(x) = x") + plt.title(title) + plt.xlabel("Coefficient of variation of PERF for PKG domain - Ubuntu2404 - Kernel 6.8") + plt.ylabel("Coefficient of variation of HWPC for PKG domain - Debian11 - Kernel 5.10") + plt.text(0.05, 0.95, f"Correlation: {corr:.2f}", transform=plt.gca().transAxes, + fontsize=12, verticalalignment='top', bbox=dict(boxstyle="round,pad=0.3", + edgecolor='black', + facecolor='white') + ) + #handles, labels = scatter.get_legend_handles_labels() + #new_labels = [f"{label} (corr: {corr_dict.get(label, 'N/A'):.2f})" for label in labels if label in corr_dict] + #plt.legend(handles, new_labels, loc="lower right") + plt.tight_layout() + plt.show() + diff --git a/analysis/rq34.py b/analysis/rq34.py new file mode 100644 index 0000000..faf0a24 --- /dev/null +++ b/analysis/rq34.py @@ -0,0 +1,42 @@ +import visualization + + +def os_comparison_boxplots_processor_versions_pkg_all(dfs, save=False, show=False): + visualization.plot_boxplots( + dfs, + "processor_detail", + "pkg_coefficient_of_variation", + "job", + "All Measurements", + ) + +def os_comparison_boxplots_processor_versions_ram_all(dfs, save=False, show=False): + visualization.plot_boxplots( + dfs, + "processor_detail", + "ram_coefficient_of_variation", + "job", + "All Measurements", + ) + + +def os_comparison_heatmap_processor_versions_pkg_nb_ops(joined_df, tool, save=False, show=False): + visualization.plot_os_degradation_nb_ops(joined_df, "pkg", tool) + +def os_comparison_heatmap_processor_versions_ram_nb_ops(joined_df, tool, save=False, show=False): + visualization.plot_os_degradation_nb_ops(joined_df, "ram", tool) + +def os_comparison_heatmap_processor_versions_pkg_percent_used(joined_df, save=False, show=False): + visualization.plot_os_degradation_percent_used(joined_df, "pkg") + +def os_comparison_heatmap_processor_versions_ram_percent_used(joined_df, save=False, show=False): + visualization.plot_os_degradation_percent_used(joined_df, "ram") + +def debian_facetgrid_processor_versions_pkg_cv_nb_ops(debian_df, save=True, show=True): + visualization.plot_facet_grid_nb_ops_per_core_versions_domain_cv(debian_df, "pkg", "debian11 5.10") +def debian_facetgrid_processor_versions_ram_cv_nb_ops(debian_df, save=True, show=True): + visualization.plot_facet_grid_nb_ops_per_core_versions_domain_cv(debian_df, "ram", "debian11 5.10") +def ubuntu_facetgrid_processor_versions_pkg_cv_nb_ops(ubuntu_df, save=True, show=True): + visualization.plot_facet_grid_nb_ops_per_core_versions_domain_cv(ubuntu_df, "pkg", "ubuntu2404 6.8") +def ubuntu_facetgrid_processor_versions_ram_cv_nb_ops(ubuntu_df, save=True, show=True): + visualization.plot_facet_grid_nb_ops_per_core_versions_domain_cv(ubuntu_df, "ram", "ubuntu2404 6.8") diff --git a/analysis/schemas.py b/analysis/schemas.py new file mode 100644 index 0000000..8250db4 --- /dev/null +++ b/analysis/schemas.py @@ -0,0 +1,99 @@ +# Schemas declaration +hwpc_columns = [ + ("timestamp", int), + ("sensor", str), + ("target", str), + 
("socket", int), + ("cpu", int), + ("rapl_energy_pkg", int), + ("rapl_energy_dram", int), + ("rapl_energy_cores", int), + ("time_enabled", int), + ("time_running", int), + ("nb_core", int), + ("nb_ops_per_core", int), + ("iteration", int), + ("alone", bool), + ("site", str), + ("clstr", str), + ("node", str), +] + +perf_columns = [ + ("power_energy_pkg", float), + ("power_energy_ram", float), + ("power_energy_cores", float), + ("time_elapsed", float), + ("nb_core", int), + ("nb_ops_per_core", int), + ("iteration", int), + ("alone", bool), + ("site", str), + ("clstr", str), + ("node", str), +] + +energy_columns = [ + ("node", str), + ("task", str), + ("nb_core", int), + ("nb_ops_per_core", int), + ("iteration", int), + ("alone", bool), + ("energy_pkg", float), + ("energy_cores", float), + ("energy_ram", float), +] + +stats_columns = [ + ("node", str), + ("task", str), + ("nb_core", int), + ("nb_ops_per_core", int), + ("alone", bool), + ("pkg_minimum", float), + ("pkg_maximum", float), + ("pkg_average", float), + ("pkg_median", float), + ("pkg_standard_deviation", float), + ("pkg_quantile_25", float), + ("pkg_quantile_75", float), + ("pkg_coefficient_of_variation", float), + ("cores_minimum", float), + ("cores_maximum", float), + ("cores_average", float), + ("cores_median", float), + ("cores_standard_deviation", float), + ("cores_quantile_25", float), + ("cores_quantile_75", float), + ("cores_coefficient_of_variation", float), + ("ram_minimum", float), + ("ram_maximum", float), + ("ram_average", float), + ("ram_median", float), + ("ram_standard_deviation", float), + ("ram_quantile_25", float), + ("ram_quantile_75", float), + ("ram_coefficient_of_variation", float), +] + +nodes_configuration_columns = [ + ("uid", str), + ("clstr", str), + ("exotic", bool), + ("architecture_nb_cores", int), + ("architecture_nb_threads", int), + ("processor_vendor", str), + ("processor_clock_speed", int), + ("processor_instruction_set", str), + ("processor_ht_capable", bool), + ("processor_microarchitecture", str), + ("processor_microcode", str), + ("processor_model", str), + ("processor_version", str), + ("os_cstate_driver", str), + ("os_cstate_governor", str), + ("os_pstate_driver", str), + ("os_pstate_governor", str), + ("os_turboboost_enabled", bool), +] diff --git a/analysis/visualization.py b/analysis/visualization.py new file mode 100644 index 0000000..7d70a1f --- /dev/null +++ b/analysis/visualization.py @@ -0,0 +1,315 @@ +import matplotlib.pyplot as plt +import numpy as np +import re +import seaborn as sns +import polars as pl + +palette = { + "hwpc_alone": "#1f77b4", + "hwpc_with_perf": "#17becf", + "perf_alone": "#d62728", + "perf_with_hwpc": "#ff7f0e", + } + +def plot_violinplot(dfs, x, y, hue, save=True, show=True): + fig, axs = plt.subplots(nrows=1, ncols=2, sharey=True) + plt.ylim(0, 1) + + sns.violinplot(ax=axs[0], data=dfs[0], x=x, y=y, hue=hue, inner="quart", gap=0.1) + + sns.violinplot( + ax=axs[1], data=dfs[1], x=x, y=y, hue=hue, inner="quart", gap=0.1, cut=0.1 + ) + plt.title(f"{y} for {x} by {hue}") + if show: + plt.show() + + +def plot_boxplot(df, x, y, hue, prefix, save=True, show=True): + plt.figure(figsize=(12, 6)) + plt.ylim(0, .1) + df = df.sql("SELECT * FROM self WHERE nb_ops_per_core > 25") + sns.boxplot(data=df, x=x, y=y, hue=hue) + # sns.boxplot( + # data=df, + # x=x, + # y=y, + # hue=hue, + # ) + + title = f"{prefix} - HWPC Coefficient of Variation\n{y} for {x} by {hue}" + safe_title = re.sub(r'[^\w\s-]', '', title) # Remove invalid characters + safe_title = safe_title.replace(" 
", "_") + safe_title = safe_title.replace("\n", "_") + if save: + plt.savefig(f'{safe_title}.png', dpi=600) + if show: + plt.show() + +def plot_facet_grid_nb_ops_per_core_versions_domain_cv(df, domain, os, save=True, show=True): + df = df.to_pandas() + df = df.sort_values(by=["processor_vendor", "processor_generation"]) + g = sns.FacetGrid( + df, + col="nb_ops_per_core", # Each grid is for a unique nb_ops_per_core value + sharey=True, # Share the y-axis across all grids + margin_titles=True, + aspect=2, # Adjust the aspect ratio of the grid + height=7, # Set the height of each subplot + palette=palette, + col_wrap=3, + ) + + plt.ylim(0, 1) + # Map the boxplot to each grid + g.map( + sns.boxplot, + "processor_detail", # X-axis + f"{domain}_coefficient_of_variation", # Y-axis + "job", # Hue for grouping + palette=palette, + showfliers=False, + ) + g.set_axis_labels("Processor Detail", f"{domain} coefficient of variation") + g.set_titles(col_template="Ops per Core: {col_name}") + g.add_legend(title="Job") + g.legend.set_bbox_to_anchor((0.85, 0.75)) # (x, y) coordinates relative to the first subplot + g.legend.set_frame_on(True) + # Rotate x-axis labels for better readability + for ax in g.axes.flat: + ax.tick_params(axis="x", rotation=90) + title = f"Boxplots of {domain} measurements CV by nb_ops_per_core and processor versions - {os}" + safe_title = re.sub(r'[^\w\s-]', '', title) # Remove invalid characters + safe_title = safe_title.replace(" ", "_") + safe_title = safe_title.replace("\n", "_") + plt.suptitle(title) + plt.tight_layout() + if save: + plt.savefig(f'{safe_title}.png', dpi=600) + if show: + plt.show() + +def plot_boxplots(dfs, x, y, hue, prefix, save=True, show=True): + fig, axs = plt.subplots(nrows=1, ncols=2,figsize=(16,7) ,sharey=True) + dfs[0] = dfs[0].sort(x) + dfs[1] = dfs[1].sort(x) + + plt.ylim(0, 1) + + sns.boxplot( + ax=axs[0], + data=dfs[0], + x=x, + y=y, + hue=hue, + gap=0.1, + palette=palette, + showfliers=False, + ) + axs[0].set_title("Debian11 - Kernel 5.10 - HWPC Coefficient of Variation") + axs[0].set_xticklabels(axs[0].get_xticklabels(), rotation=90, ha="right") + sns.boxplot( + ax=axs[1], + data=dfs[1], + x=x, + y=y, + hue=hue, + gap=0.1, + palette=palette, + showfliers=False, + ) + axs[1].set_title("Ubuntu2404nfs - Kernel 6.8 - HWPC Coefficient of Variation") + axs[1].set_xticklabels(axs[1].get_xticklabels(), rotation=90, ha="right") + title = f"{prefix}\n{y} for {x} by {hue}" + safe_title = re.sub(r'[^\w\s-]', '', title) # Remove invalid characters + safe_title = safe_title.replace(" ", "_") + safe_title = safe_title.replace("\n", "_") + plt.title(title) + plt.tight_layout() + if save: + plt.savefig(f'{safe_title}.png', dpi=600) + if show: + plt.show() + +def plot_os_degradation_nb_ops(joined_df, domain, tool, save=True, show=True): + joined_df = joined_df.with_columns( + ( + ( + pl.col(f"{domain}_coefficient_of_variation") + - pl.col(f"{domain}_coefficient_of_variation_debian") + ).alias(f"{domain}_diff") + ) + ) + + joined_df = joined_df.with_columns( + ( + ( + pl.col(f"{domain}_coefficient_of_variation") + / pl.col(f"{domain}_coefficient_of_variation_debian") + ).alias(f"{domain}_ratio") + ) + ) + + + aggregated = joined_df.group_by(["processor_detail", "nb_ops_per_core"]).agg( + pl.col(f"{domain}_ratio").median().alias(f"{domain}_median_ratio"), + pl.col(f"{domain}_diff").median().alias(f"{domain}_median_diff"), + pl.col("processor_vendor").min().alias("processor_vendor"), + pl.col("processor_generation").min().alias("processor_generation"), + ) + + 
df_pandas = aggregated.to_pandas() + df_pandas = df_pandas.sort_values(by=["processor_vendor", "processor_generation"]) + + plt.figure(figsize=(12, 5)) + ratio_cmap = sns.diverging_palette(220, 20, l=65, center="light", as_cmap=True) + pivot_table = df_pandas.pivot( + index="nb_ops_per_core", + columns="processor_detail", + values=f"{domain}_median_ratio", + ) + + sns.heatmap( + pivot_table, + annot=True, + fmt=".2f", + cmap=ratio_cmap, + vmin=0, + vmax=2, + ) + title = f"Heatmap of median ratio of {domain} measurements CV (ubuntu/debian) by vendor\nfor {tool} tool" + safe_title = re.sub(r'[^\w\s-]', '', title) # Remove invalid characters + safe_title = safe_title.replace(" ", "_") + safe_title = safe_title.replace("\n", "_") + plt.title(title) + plt.xlabel("Processor version and generation") + plt.ylabel("Number of operations per core") + plt.tight_layout() + if save: + plt.savefig(f'{safe_title}.png', dpi=600) + if show: + plt.show() + + diff_cmap = sns.diverging_palette(220, 20, l=65, center="light", as_cmap=True) + plt.figure(figsize=(15, 6)) + pivot_table = df_pandas.pivot( + index="nb_ops_per_core", + columns="processor_detail", + values=f"{domain}_median_diff", + ) + q1 = np.nanpercentile(pivot_table.values, 25) + q3 = np.nanpercentile(pivot_table.values, 75) + iqr = q3 - q1 + vmax = q3 + 1.5 * iqr + sns.heatmap( + pivot_table, + annot=True, + fmt=".2f", + cmap=diff_cmap, + vmin=-vmax, + vmax=vmax, + ) + plt.xlabel("Processor Details") + plt.xticks(rotation=90, ha="right") + plt.ylabel("Number of Operations Per Core") + + title = f"Heatmap of median diff for {domain} measurements CV (ubuntu - debian) by vendor\nfor {tool} tool" + safe_title = re.sub(r'[^\w\s-]', '', title) # Remove invalid characters + safe_title = safe_title.replace(" ", "_") + safe_title = safe_title.replace("\n", "_") + plt.title(title) + plt.tight_layout() + if save: + plt.savefig(f'{safe_title}.png', dpi=600) + if show: + plt.show() + + +def plot_os_degradation_percent_used(joined_df, domain, save=True, show=True): + joined_df = joined_df.with_columns( + ( + ( + pl.col(f"{domain}_coefficient_of_variation") + - pl.col(f"{domain}_coefficient_of_variation_debian") + ).alias(f"{domain}_diff") + ) + ) + + joined_df = joined_df.with_columns( + ( + ( + pl.col(f"{domain}_coefficient_of_variation") + / pl.col(f"{domain}_coefficient_of_variation_debian") + ).alias(f"{domain}_ratio") + ) + ) + + aggregated = joined_df.group_by(["processor_detail", "percent_cores_used_category"]).agg( + pl.col(f"{domain}_ratio").median().alias(f"{domain}_median_ratio"), + pl.col(f"{domain}_diff").median().alias(f"{domain}_median_diff"), + ) + + df_pandas = aggregated.to_pandas() + df_pandas = df_pandas.sort_values(by=["processor_detail"]) + + plt.figure(figsize=(12, 5)) + ratio_cmap = sns.diverging_palette(220, 20, l=65, center="light", as_cmap=True) + pivot_table = df_pandas.pivot( + index="percent_cores_used_category", + columns="processor_detail", + values=f"{domain}_median_ratio", + ) + + sns.heatmap( + pivot_table, + annot=True, + fmt=".2f", + cmap=ratio_cmap, + vmin=0, + vmax=2, + ) + title = f"Heatmap of median ratio of HWPC {domain} measurements CV (ubuntu/debian) by vendor" + safe_title = re.sub(r'[^\w\s-]', '', title) # Remove invalid characters + safe_title = safe_title.replace(" ", "_") + safe_title = safe_title.replace("\n", "_") + plt.title(title) + plt.tight_layout() + if save: + plt.savefig(f'{safe_title}.png', dpi=600) + if show: + plt.show() + + diff_cmap = sns.diverging_palette(220, 20, l=65, center="light", 
as_cmap=True)
+    plt.figure(figsize=(15, 6))
+    pivot_table = df_pandas.pivot(
+        index="percent_cores_used_category",
+        columns="processor_detail",
+        values=f"{domain}_median_diff",
+    )
+    q1 = np.nanpercentile(pivot_table.values, 25)
+    q3 = np.nanpercentile(pivot_table.values, 75)
+    iqr = q3 - q1
+    vmax = q3 + 1.5 * iqr
+    sns.heatmap(
+        pivot_table,
+        annot=True,
+        fmt=".2f",
+        cmap=diff_cmap,
+        vmin=-vmax,
+        vmax=vmax,
+    )
+    plt.xlabel("Processor Details")
+    plt.xticks(rotation=90, ha="right")
+    plt.ylabel("Percent core used")
+
+    title = f"Heatmap of median diff for HWPC {domain} measurements CV (ubuntu - debian) by vendor"
+    safe_title = re.sub(r'[^\w\s-]', '', title)  # Remove invalid characters
+    safe_title = safe_title.replace(" ", "_")
+    safe_title = safe_title.replace("\n", "_")
+    plt.title(title)
+    plt.tight_layout()
+    if save:
+        plt.savefig(f'{safe_title}.png', dpi=600)
+    if show:
+        plt.show()
diff --git a/src/results.rs b/src/results.rs
index 8ddf25a..2cd1820 100644
--- a/src/results.rs
+++ b/src/results.rs
@@ -1,5 +1,5 @@
 use thiserror::Error;
-use log::{debug, warn};
+use log::{debug, warn, error};
 use serde::{Serialize, Deserialize};
 use std::path::{Path, PathBuf};
 use std::fs::File;
@@ -224,6 +224,12 @@ fn aggregate_hwpc(
     let (output_parent, output_basename) = (raw_results_dir_path.parent().unwrap(), raw_results_dir_path.file_name().unwrap());
     let output_path = &format!("{}/{}.csv", output_parent.to_str().unwrap(), output_basename.to_str().unwrap());
 
+    if Path::new(output_path).exists() {
+        match std::fs::remove_file(output_path) {
+            Ok(_) => debug!("File '{}' was deleted successfully.", output_path),
+            Err(e) => error!("Failed to delete file '{}': {}", output_path, e),
+        }
+    }
     let mut raw_results_subdirs = Vec::new();