-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathutils.py
More file actions
124 lines (103 loc) · 4.07 KB
/
utils.py
File metadata and controls
124 lines (103 loc) · 4.07 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import gzip
import shutil
import random
import torch
import numpy as np
import matplotlib.pyplot as plt
import os
import subprocess
import time
from sklearn.manifold import TSNE
import re
def timestamp():
    """Return the current local time formatted for filenames, e.g. ``2024-01-31_23-59-59``."""
    now = time.localtime()
    return time.strftime("%Y-%m-%d_%H-%M-%S", now)
def timestamp_readable():
    """Return a human-readable local-time log prefix, e.g. ``[2024-01-31 23:59:59] ``."""
    now = time.localtime()
    return time.strftime("[%Y-%m-%d %H:%M:%S] ", now)
def decompress(filename):
    '''Decompress a single gzip file (e.g. a ``pprof.pb.gz`` profile).

    The decompressed copy is written next to the original, at the input
    path with its trailing ``.gz`` removed; the original file is kept.

    Args:
        filename: path to a file whose name ends in ``.gz``.

    Raises:
        ValueError: if *filename* does not end in ``.gz`` — previously the
            bare slice ``filename[:-3]`` would silently write to a wrong,
            truncated path for such inputs.
    '''
    if not filename.endswith('.gz'):
        raise ValueError(f"expected a .gz file, got: {filename}")
    # Single `with` manages both handles; copyfileobj streams in chunks
    # so large profiles are never loaded into memory at once.
    with gzip.open(filename, 'rb') as f_in, open(filename[:-3], 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)
def decompress_all(data_dir):
    '''Recursively decompress every gzip file under *data_dir* in place.

    Shells out to ``gzip -d -r`` (argument list, no shell), which replaces
    each ``*.gz`` file with its decompressed form.

    Args:
        data_dir: directory to walk recursively.

    Returns:
        The ``subprocess.CompletedProcess`` so callers can inspect
        ``returncode``/``stderr``. (Backward compatible: the old version
        returned ``None`` and callers ignored it.)
    '''
    result = subprocess.run(['gzip', '-d', '-r', data_dir],
                            capture_output=True, text=True)
    if result.returncode != 0:
        # Previously failures were silently discarded; surface the message
        # (gzip exits non-zero e.g. on already-decompressed files) without
        # turning this best-effort helper into a crash.
        print(f"decompress_all: gzip -d -r exited {result.returncode}: "
              f"{result.stderr.strip()}")
    return result
def setup_seed(seed):
    """Seed every RNG used in this project for reproducible runs.

    Covers Python's ``random``, NumPy, and torch (CPU and all CUDA
    devices), and forces deterministic cuDNN kernel selection.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
################## Visualization ##################
def visualize_tsne(emb, labels):
    """Project embeddings to 2-D with t-SNE and show a labeled scatter plot.

    Args:
        emb: 2-D array-like of embeddings, one row per sample.
        labels: per-sample labels; samples with label ``1`` are drawn red,
            all others blue.
    """
    tsne = TSNE(n_components=2, random_state=0)
    emb_2d = tsne.fit_transform(emb)
    plt.figure(figsize=(5, 5))
    # One vectorized scatter call instead of one call per point (the old
    # per-point loop also passed an unused `label=` kwarg — no legend is
    # ever drawn — and was O(n) matplotlib calls).
    colors = ['red' if lab == 1 else 'blue' for lab in labels]
    plt.scatter(emb_2d[:, 0], emb_2d[:, 1], c=colors)
    plt.xlabel('t-SNE component 1')
    plt.ylabel('t-SNE component 2')
    plt.title('t-SNE visualization of embeddings')
    plt.show()
################## Training debug utilities ##################
def check_gradients(model, threshold=100.0):
    """Scan a model's parameter gradients for exploding/vanishing values.

    Args:
        model: a ``torch.nn.Module`` whose ``.grad`` fields have been
            populated (i.e. after ``backward()``).
        threshold: gradient-norm value above which a gradient is flagged
            as ``'too_large'``; norms below ``1e-10`` are ``'too_small'``.

    Returns:
        Dict mapping parameter name -> stats dict with keys
        ``norm``/``status``/``max``/``min``/``mean`` for every flagged
        parameter. Parameters with ``grad is None`` or in-range norms are
        omitted; an empty dict means all gradients look healthy.
    """

    def _grad_stats(grad, norm, status):
        # One place builds the stats record; the old code duplicated this
        # dict literal in both the too-large and too-small branches.
        return {
            'norm': norm,
            'status': status,
            'max': grad.max().item(),
            'min': grad.min().item(),
            'mean': grad.mean().item(),
        }

    grad_info = {}
    for name, param in model.named_parameters():
        if param.grad is None:
            continue
        grad_norm = param.grad.norm().item()
        if grad_norm > threshold:
            grad_info[name] = _grad_stats(param.grad, grad_norm, 'too_large')
        elif grad_norm < 1e-10:
            grad_info[name] = _grad_stats(param.grad, grad_norm, 'too_small')
    return grad_info
def log_tensor_stats(tensor, name, log_file):
    """Write summary statistics for *tensor* to *log_file* iff it contains NaN/Inf.

    Args:
        tensor: the tensor to inspect.
        name: label used in the log output.
        log_file: open writable text file (or file-like object).

    Returns:
        True if the tensor contained NaN or Inf values and stats were
        written; False otherwise (nothing is written for healthy tensors).
    """
    # Compute each mask once — the original re-ran torch.isnan/isinf over
    # the full tensor up to five times.
    nan_mask = torch.isnan(tensor)
    inf_mask = torch.isinf(tensor)
    if not (nan_mask.any() or inf_mask.any()):
        return False
    all_nan = bool(nan_mask.all())
    stats = {
        'name': name,
        'shape': tuple(tensor.shape),
        # min/max/mean are meaningless when every element is NaN.
        'min': tensor.min().item() if not all_nan else float('nan'),
        'max': tensor.max().item() if not all_nan else float('nan'),
        'mean': tensor.mean().item() if not all_nan else float('nan'),
        'nan_count': nan_mask.sum().item(),
        'inf_count': inf_mask.sum().item(),
        'zero_count': (tensor == 0).sum().item()
    }
    log_file.write(f"\nTensor Statistics for {name}:\n")
    for key, value in stats.items():
        log_file.write(f"  {key}: {value}\n")
    return True
################## LLM output cleanup ##################
def clean_llm_output(text):
    """Strip ``<think>...</think>`` reasoning sections from raw LLM output.

    Generation is considered failed — and ``""`` is returned — when the
    text has no ``<think>`` tag at all, has no complete ``<think></think>``
    pair, or is empty once the think sections are removed. Otherwise the
    remaining text is returned with surrounding whitespace stripped.
    """
    # Fast path: no opening tag means the model skipped its reasoning step.
    if '<think>' not in text:
        return ""
    pattern = re.compile(r'<think>.*?</think>', re.DOTALL)
    think_matches = pattern.findall(text)
    print(f"LLM思考think输出: {think_matches}")
    # An opening tag without a matching close counts as a failed generation.
    if not think_matches:
        return ""
    # Drop every complete think section; a blank remainder strips to "",
    # which doubles as the failure sentinel.
    return pattern.sub('', text).strip()