# LogGen/debug_dataset_stats.py at main · IntelligentDDS/LogGen · GitHub
import torch
import os
import sys

# Add the current working directory to the module search path so project
# packages (e.g. ``src``) can be imported when the script is launched from the
# repository root.
# NOTE(review): nothing in this file imports from ``src`` directly; presumably
# this is needed so that loading the dataset can resolve project classes --
# confirm.
sys.path.append(os.getcwd())


def check_dataset_stats(
    dataset_path: str = "data/processed/training_dataset.pt",
) -> None:
    """Print label-distribution statistics for a serialized training dataset.

    The file is loaded with ``torch.load`` and iterated item by item. Each
    item must expose a ``'labels'`` tensor; its first dimension is counted as
    the number of code lines and the sum of its entries as the number of log
    lines (presumably labels are 0/1 per line -- confirm against the dataset
    builder).

    Args:
        dataset_path: Location of the ``.pt`` file to inspect. Defaults to the
            processed training dataset relative to the working directory.

    Returns:
        None. All results are reported on standard output.
    """
    if not os.path.exists(dataset_path):
        print(f"Error: Dataset not found at {dataset_path}")
        return

    print(f"Loading {dataset_path} ...")
    # NOTE(security): torch.load unpickles the file, so only point this at
    # datasets you produced yourself or otherwise trust.
    # map_location="cpu" keeps the script usable on machines without a GPU
    # even when the tensors were saved from a CUDA device.
    data = torch.load(dataset_path, map_location="cpu")

    total_samples = len(data)
    total_lines = 0
    total_positives = 0

    # Number of functions that contain at least one positive (log) line.
    func_with_logs = 0

    for item in data:
        labels = item['labels']  # Tensor shape (Seq_Len,)

        num_lines = labels.shape[0]
        num_pos = torch.sum(labels).item()

        total_lines += num_lines
        total_positives += num_pos

        if num_pos > 0:
            func_with_logs += 1

    print("-" * 30)
    print(f"Total Functions:      {total_samples}")
    print(f"Total Code Lines:     {total_lines}")
    print(f"Total Log Lines (1):  {int(total_positives)}")
    print("-" * 30)

    # An empty dataset is a different failure from "no keyword matched":
    # report it on its own and avoid dividing by zero further down.
    if total_samples == 0 or total_lines == 0:
        print("❌ CRITICAL ERROR: Dataset contains no code lines.")
        print("   Reason: The dataset is empty, so no ratio can be computed.")
        return

    if total_positives == 0:
        print("❌ CRITICAL ERROR: Dataset contains ZERO positive labels.")
        print("   Reason: Your keywords list likely doesn't match the logs in your code.")
        return

    ratio = (total_positives / total_lines) * 100
    print(f"Positive Ratio:       {ratio:.4f}%")
    print(f"Funcs containing logs: {func_with_logs} / {total_samples}")

    # Only claim plausibility when the ratio really is in the typical band.
    if 1.0 <= ratio <= 5.0:
        print("✅ Data distribution looks plausible (typically 1% - 5%).")
    else:
        print("⚠️  Positive ratio is outside the typical 1% - 5% range; "
              "double-check the labeling.")


# Run the statistics report only when executed as a script, not on import.
if __name__ == "__main__":
    check_dataset_stats()