aggregate metrics separates normal vs malicious accuracy (#136)
joyce-yuan authored Nov 2, 2024
1 parent 3bc064a commit 59b5bc6
Showing 1 changed file with 62 additions and 14 deletions.
src/utils/post_hoc_plot_utils.py — 76 changes: 62 additions & 14 deletions
@@ -16,6 +16,15 @@ def get_all_nodes(logs_dir: str) -> List[str]:
"""Return all node directories in the log folder"""
return [d for d in os.listdir(logs_dir) if (os.path.isdir(os.path.join(logs_dir, d)) and d != "node_0" and d.startswith('node'))]

def get_node_type(node_id: str, logs_dir: str) -> str:
    """Determine if the node is malicious or normal based on its config file."""
    config_path = os.path.join(logs_dir, f'node_{node_id}', 'config.json')
    with open(config_path, 'r') as file:
        config = json.load(file)

    # Check if node has a malicious type
    return config.get("malicious_type", "normal")
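
For illustration (not part of this commit), a minimal sketch of how get_node_type resolves a node's type, assuming the logs layout implied above (logs_dir/node_<id>/config.json); the import path and the 'label_flip' value are assumptions:

import json
import os
import tempfile

from src.utils.post_hoc_plot_utils import get_node_type  # import path assumed

# Build a throwaway logs directory with one flagged and one unflagged node.
logs_dir = tempfile.mkdtemp()
os.makedirs(os.path.join(logs_dir, 'node_1'))
with open(os.path.join(logs_dir, 'node_1', 'config.json'), 'w') as f:
    json.dump({'malicious_type': 'label_flip'}, f)  # hypothetical type value
os.makedirs(os.path.join(logs_dir, 'node_2'))
with open(os.path.join(logs_dir, 'node_2', 'config.json'), 'w') as f:
    json.dump({}, f)  # no "malicious_type" key, so the node counts as normal

print(get_node_type('1', logs_dir))  # -> 'label_flip'
print(get_node_type('2', logs_dir))  # -> 'normal'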

# Calculate Metrics Per User
def calculate_auc(df: pd.DataFrame, metric: str = 'acc') -> float:
"""Calculate AUC for the given dataframe's accuracy or loss."""
@@ -50,9 +59,11 @@ def compute_per_user_metrics(node_id: str, logs_dir: str) -> Dict[str, float]:
    return metrics

def aggregate_metrics_across_users(logs_dir: str, output_dir: Optional[str] = None) -> Tuple[pd.Series, pd.Series, pd.DataFrame]:
"""Aggregate metrics across all users and save the results to CSV files."""
"""Aggregate metrics across all users, categorize nodes, and save the results to CSV files."""
nodes = get_all_nodes(logs_dir)
all_metrics: List[Dict[str, float]] = []
normal_metrics = []
malicious_metrics = []

# Ensure the output directory exists
if not output_dir:
@@ -63,25 +74,62 @@ def aggregate_metrics_across_users(logs_dir: str, output_dir: Optional[str] = No
        node_id = node.split('_')[-1]
        metrics = compute_per_user_metrics(node_id, logs_dir)
        metrics['node'] = node

        # Get node type and categorize metrics
        node_type = get_node_type(node_id, logs_dir)
        if node_type == "normal":
            normal_metrics.append(metrics)
        else:  # Any non-"normal" type counts as malicious
            malicious_metrics.append(metrics)

        all_metrics.append(metrics)

    # Convert to DataFrame for easier processing
    df_metrics = pd.DataFrame(all_metrics)
    # Select only numeric columns before calculating mean and std
    numeric_columns = df_metrics.select_dtypes(include=[np.number])

    # Calculate average and standard deviation
    avg_metrics = numeric_columns.mean()
    std_metrics = numeric_columns.std()

    # Save the DataFrame with per-user metrics
    df_metrics.to_csv(os.path.join(output_dir, 'per_user_metrics.csv'), index=False)

    # Save the average and standard deviation statistics
    summary_stats = pd.DataFrame({'Average': avg_metrics, 'Standard Deviation': std_metrics})
    summary_stats.to_csv(os.path.join(output_dir, 'summary_statistics.csv'))

    return avg_metrics, std_metrics, df_metrics
    # Calculate overall average and standard deviation
    overall_avg = numeric_columns.mean()
    overall_std = numeric_columns.std()

    # Convert normal and malicious metrics to DataFrames
    df_normal = pd.DataFrame(normal_metrics)
    df_malicious = pd.DataFrame(malicious_metrics)

    # Calculate normal node statistics
    normal_avg = df_normal.mean(numeric_only=True)
    normal_std = df_normal.std(numeric_only=True)

    # Calculate malicious node statistics (if there are malicious nodes)
    if not df_malicious.empty:
        malicious_avg = df_malicious.mean(numeric_only=True)
        malicious_std = df_malicious.std(numeric_only=True)
    else:
        # If no malicious nodes, fill with NaN values for clarity
        malicious_avg = pd.Series([np.nan] * len(normal_avg), index=normal_avg.index)
        malicious_std = pd.Series([np.nan] * len(normal_std), index=normal_std.index)

    # Compile all metrics into a DataFrame with descriptive row names
    summary_stats_data = {
        f"{metric}_overall": [overall_avg[metric], overall_std[metric]]
        for metric in overall_avg.index
    }
    summary_stats_data.update({
        f"{metric}_normal": [normal_avg[metric], normal_std[metric]]
        for metric in normal_avg.index
    })
    summary_stats_data.update({
        f"{metric}_malicious": [malicious_avg[metric], malicious_std[metric]]
        for metric in malicious_avg.index
    })

    # Create a DataFrame from the dictionary and specify columns
    summary_stats_df = pd.DataFrame.from_dict(summary_stats_data, orient='index', columns=["Average", "Standard Deviation"])

    # Save the summary statistics to a single CSV file
    summary_stats_df.to_csv(os.path.join(output_dir, 'summary_statistics.csv'))

    return overall_avg, overall_std, df_metrics
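
With this change, summary_statistics.csv carries three rows per numeric metric (<metric>_overall, <metric>_normal, <metric>_malicious), each with Average and Standard Deviation columns. For illustration (not part of this commit), reading the file back; the output path and the metric name 'test_acc_auc' are assumed:

import pandas as pd

summary = pd.read_csv('logs/plots/summary_statistics.csv', index_col=0)  # path assumed

# 'test_acc_auc' is a hypothetical metric name; substitute one from the index.
print(summary.loc['test_acc_auc_normal', 'Average'],
      summary.loc['test_acc_auc_malicious', 'Average'])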

def compute_per_user_round_data(node_id: str, logs_dir: str, metrics_map: Optional[Dict[str, str]] = None) -> Dict[str, np.ndarray]:
"""Extract per-round data (accuracy and loss) for train/test from the logs."""
