# debug_dataset_stats.py
import torch
import os
import sys
# Ensure src can be imported: append the current working directory to the
# module search path so project-local packages resolve when the script is
# launched from the repository root.
# NOTE(review): nothing visible in this file imports from `src`; confirm this
# is still needed.
sys.path.append(os.getcwd())
def check_dataset_stats(dataset_path="data/processed/training_dataset.pt"):
    """Print label-distribution statistics for a processed training dataset.

    The dataset file is loaded with ``torch.load`` and is expected to be an
    iterable of mappings, each holding a ``'labels'`` tensor with one entry
    per code line. A line counts as positive (a log line) when its label
    equals 1.

    Args:
        dataset_path: Path of the ``.pt`` file to inspect. Defaults to the
            location the script has always used.

    Returns:
        None. All results are printed to stdout; a missing file is reported
        with an error message rather than an exception.
    """
    if not os.path.exists(dataset_path):
        print(f"Error: Dataset not found at {dataset_path}")
        return

    print(f"Loading {dataset_path} ...")
    # NOTE: torch.load unpickles the file, so only use it on trusted datasets.
    # map_location="cpu" keeps this debug report usable on machines without
    # the accelerator the tensors were saved from.
    data = torch.load(dataset_path, map_location="cpu")

    total_samples = len(data)
    total_lines = 0
    total_positives = 0
    func_with_logs = 0  # functions that contain at least one positive line

    for item in data:
        labels = item['labels']  # Tensor shape (Seq_Len,)
        # Count entries equal to 1 instead of summing raw values, so that
        # non-binary entries (e.g. padding / ignore-index values) cannot skew
        # the total, and the count stays an exact int.
        num_pos = int((labels == 1).sum().item())

        total_lines += labels.shape[0]
        total_positives += num_pos
        if num_pos > 0:
            func_with_logs += 1

    print("-" * 30)
    print(f"Total Functions: {total_samples}")
    print(f"Total Code Lines: {total_lines}")
    print(f"Total Log Lines (1): {total_positives}")
    print("-" * 30)

    if total_samples == 0:
        # An empty dataset is a different failure than a keyword mismatch.
        print("❌ CRITICAL ERROR: Dataset is empty (contains no functions).")
        return

    if total_positives == 0:
        print("❌ CRITICAL ERROR: Dataset contains ZERO positive labels.")
        print(" Reason: Your keywords list likely doesn't match the logs in your code.")
        return

    # total_positives > 0 implies total_lines > 0, so the division is safe.
    ratio = (total_positives / total_lines) * 100
    print(f"Positive Ratio: {ratio:.4f}%")
    print(f"Funcs containing logs: {func_with_logs} / {total_samples}")

    # Only claim plausibility when the ratio really is in the typical range.
    typical_min, typical_max = 1.0, 5.0
    if typical_min <= ratio <= typical_max:
        print("✅ Data distribution looks plausible (typically 1% - 5%).")
    else:
        print("⚠️ Positive ratio is outside the typical 1% - 5% range; double-check the labeling.")
# Run the report only when this file is executed as a script, not on import.
if __name__ == "__main__":
    check_dataset_stats()