import sys
sys.path.append('../')
from loglizer.models import PCA
from loglizer import dataloader, preprocessing
from collections import Counter
import pandas as pd

# Event-count matrix used to fit the model (a .npy file, as load_Linux requires).
struct_log = '../../Dataset_ML/Linux_matrix/log_matrix.npy'
# Event-count matrix that is scored for anomalies.
mal_struct_log = '../../Dataset_ML/Linux_mal_matrix/mal_matrix.npy'
# CSV without header; column 0 / column 1 are read below as the start / end
# log index of each sliding window.
sliding_file_path = '../../Dataset_ML/Linux_mal_sliding_24h_3h.csv'


if __name__ == '__main__':
    # 1. Load the training matrix and turn it into weighted feature vectors.
    (x_train, _), (_, _) = dataloader.load_Linux(struct_log, window='sliding',
                                                 split_type='sequential', save_csv=True)
    feature_extractor = preprocessing.FeatureExtractor()
    x_train = feature_extractor.fit_transform(x_train, term_weighting='tf-idf',
                                              normalization='zero-mean')

    # 2. Train an unsupervised model.
    print("Train phase")
    # Model hyper-parameters may be sensitive to log data; the defaults are
    # used here for the demo.
    model = PCA()
    model.fit(x_train)
    # Kept for manual inspection of the training predictions.
    y_train = model.predict(x_train)

    # 3. Use the trained model for anomaly detection on the second matrix.
    print("Test phase:")
    # Both matrices must share the same event columns: the same position has
    # to mean the same event, otherwise the vectors cannot be compared.
    (x_test, _), (_, _) = dataloader.load_Linux(mal_struct_log, window='sliding',
                                                split_type='sequential')
    # Keep the raw counts; transform() below replaces them with features.
    x_test_original = x_test.copy()
    x_test = feature_extractor.transform(x_test)
    y_test = model.predict(x_test)

    # Trace dict: "<raw row>,<row index>" -> predicted label of that row.
    # (The previous version stored the whole y_test array under every key.)
    x_y_dict = {}
    for i, (x, y) in enumerate(zip(x_test_original, y_test)):
        x_y_dict[str(x) + ',' + str(i)] = y

    anomaly_sequence_index = [i for i, y in enumerate(y_test) if y == 1]
    print("the index of anomaly sequence is:", anomaly_sequence_index)

    # NOTE(review): load_Linux is called with its default train_ratio and the
    # first split is used; _split_data shuffles that split, so row i here may
    # not be row i of the sliding-window file -- verify before trusting the
    # ranges printed below.
    if anomaly_sequence_index:
        # Read the window table once instead of once per anomaly.
        windows = pd.read_csv(sliding_file_path, header=None)
        for index in anomaly_sequence_index:
            start_index = windows.iloc[index, 0]
            end_index = windows.iloc[index, 1]
            print("please check log csv indexes between {} and {}".format(start_index, end_index))

    # Keys of all rows predicted as anomalous.
    anomaly_sequence = [key for key, label in x_y_dict.items() if label == 1]

    print("the lables are:", Counter(y_test))
    print("the counter is {} and the anomaly rate is: {}".format(
        Counter(y_test), len(anomaly_sequence) / x_test.shape[0]))

'''
For HDFS:
the result is: [0. 0. 0. ... 0. 0. 0.]
the labels are: Counter({0.0: 3951, 1.0: 19}) --- there are 19 anomalies
For Linux_logs:
Counter({0.0: 163, 1.0: 3/5}) 0.0184 --- 0.0307
For Linux_mali_logs:
Counter({0.0: 127, 1.0: 25}) 0.1969
'''
raw_data.shape[0] - sliding_file_path = para['save_path']+'sliding_'+str(para['window_size'])+'h_'+str(para['step_size'])+'h.csv' + sliding_file_path = para['save_path']+'_sliding_'+str(para['window_size'])+'h_'+str(para['step_size'])+'h.csv' #=============divide into sliding windows=========# start_end_index_list = [] # list of tuples, tuple contains two number, which represent the start and end of sliding time window - label_data, time_data = raw_data[:,0], raw_data[:, 1] + # get the list of label data and the list of time data + label_data, time_data = raw_data[:,0], raw_data[:,1] if not os.path.exists(sliding_file_path): # split into sliding window + # get the first value in the time_data list start_time = time_data[0] + print("the start_time is:",start_time) + print("the type of time is:",type(start_time)) + # the index points at the index in the time_data list start_index = 0 end_index = 0 # get the first start, end index, end time for cur_time in time_data: - if cur_time < start_time + para['window_size']*3600: + # the start_time + para['window_size']: + ## start_time is the first value in the time_data list + ## get the data scope using the window size + ## cur_time < the result means it is in the scope of window size + print("the current time is:",cur_time) + # if cur_time < start_time + para['window_size']*3600: + if int(cur_time) < int(start_time) + para['window_size'] * 3600: end_index += 1 end_time = cur_time else: @@ -181,15 +194,19 @@ def bgl_preprocess_data(para, raw_data, event_mapping_data): break # move the start and end index until next sliding window while end_index < log_size: - start_time = start_time + para['step_size']*3600 - end_time = end_time + para['step_size']*3600 + # start_time = start_time + para['step_size']*3600 + # end_time = end_time + para['step_size']*3600 + start_time = int(start_time) + para['step_size']*3600 + end_time = int(end_time) + para['step_size']*3600 for i in range(start_index,end_index): - if time_data[i] < 
start_time: + # if time_data[i] < start_time: + if int(time_data[i]) < start_time: i+=1 else: break for j in range(end_index, log_size): - if time_data[j] < end_time: + # if time_data[j] < end_time: + if int(time_data[j]) < end_time: j+=1 else: break @@ -199,7 +216,7 @@ def bgl_preprocess_data(para, raw_data, event_mapping_data): start_end_index_list.append(start_end_pair) inst_number = len(start_end_index_list) print('there are %d instances (sliding windows) in this dataset\n'%inst_number) - np.savetxt(sliding_file_path,start_end_index_list,delimiter=',',fmt='%d') + np.savetxt(sliding_file_path, start_end_index_list, delimiter=',', fmt='%d') else: print('Loading start_end_index_list from file') start_end_index_list = pd.read_csv(sliding_file_path, header=None).values @@ -218,16 +235,24 @@ def bgl_preprocess_data(para, raw_data, event_mapping_data): expanded_indexes_list[i].append(l) event_mapping_data = [row[0] for row in event_mapping_data] + print("the event_mapping_data is:", event_mapping_data) event_num = len(list(set(event_mapping_data))) print('There are %d log events'%event_num) #=============get labels and event count of each sliding window =========# labels = [] + # inst_number --- row, every row is a log sequence(windows sliding) + # event_num --- column, every column is a event, the number is the occurrence of a corresponding event event_count_matrix = np.zeros((inst_number,event_num)) for j in range(inst_number): label = 0 #0 represent success, 1 represent failure for k in expanded_indexes_list[j]: + print("the length of expanded_indexes_list is:",len(expanded_indexes_list[j])) + print("the k value is:",k) event_index = event_mapping_data[k] + print("the event_index is:", event_index) + # the index is not different from the eventId + event_index = event_index-1 event_count_matrix[j, event_index] += 1 if label_data[k]: label = 1 @@ -237,3 +262,171 @@ def bgl_preprocess_data(para, raw_data, event_mapping_data): print("Among all instances, %d are 
def load_Linux(log_file, label_file=None, window='sliding', time_interval=None,
               stepping_size=None, train_ratio=0.5, split_type='sequential',
               save_csv=False):
    """Load a pre-computed Linux event-count matrix and split it.

    Args:
        log_file: path of a ``.npy`` file holding the event-count matrix.
        label_file: must be None; labelled loading is not implemented.
        window: must be ``'sliding'``.
        time_interval: accepted for signature parity, not used here.
        stepping_size: accepted for signature parity, not used here.
        train_ratio: share of the rows handed to ``_split_data`` as training
            data.
        split_type: ``'sequential'``; ``'uniform'`` is downgraded to
            ``'sequential'`` with a warning because there are no labels.
        save_csv: accepted for signature parity, not used here.

    Returns:
        ``((x_train, None), (x_test, None))``.

    Raises:
        NotImplementedError: if ``log_file`` is not a ``.npy`` file or a
            ``label_file`` is given.
        AssertionError: if ``window`` is not ``'sliding'``.
    """
    print('========== Input data summary==========')
    if not log_file.endswith('.npy'):
        raise NotImplementedError('load_Linux() only support npy files')

    # The message used to say "window=session" although 'sliding' is required.
    assert window == 'sliding', 'Only window=sliding is supported for Linux dataset'

    if label_file is not None:
        # Previously this case fell through and returned None, which only
        # surfaced later as an unpacking error in the caller.
        raise NotImplementedError('load_Linux() does not support label files')

    if split_type == 'uniform':
        # The old code assigned the tuple ('sequential', 'Warning: ...') here,
        # handing _split_data an invalid split_type.
        print('Warning: Only split type=sequential is supported')
        split_type = 'sequential'

    x_data = np.load(log_file)
    (x_train, _), (x_test, _) = _split_data(x_data, train_ratio=train_ratio,
                                            split_type=split_type)
    print('Total: {} instances, train: {} instances, test: {} instances'.format(
        x_data.shape[0], x_train.shape[0], x_test.shape[0]))

    return (x_train, None), (x_test, None)
def _linux_sliding_windows(time_data, window_seconds, step_seconds):
    """Return the (start_index, end_index) pair of every sliding window.

    The first window takes every leading log whose time is below
    ``time_data[0] + window_seconds``.  Each later window moves both time
    bounds forward by ``step_seconds``; the end bound starts from the time of
    the last log of the first window, exactly as the original scan did.
    """
    log_size = len(time_data)
    start_time = time_data[0]
    end_time = start_time
    end_index = 0
    while end_index < log_size and time_data[end_index] < start_time + window_seconds:
        end_time = time_data[end_index]
        end_index += 1

    # Always record the first window.  It used to be recorded only when some
    # log fell outside of it, so a data set that fits into a single window
    # produced no instance at all.
    windows = [(0, end_index)]

    start_index = 0
    while end_index < log_size:
        start_time = start_time + step_seconds
        end_time = end_time + step_seconds
        # The start may only advance up to the previous end.
        while start_index < end_index and time_data[start_index] < start_time:
            start_index += 1
        while end_index < log_size and time_data[end_index] < end_time:
            end_index += 1
        windows.append((start_index, end_index))
    return windows


def Linux_preprocess_data(para, raw_data, event_mapping_data):
    """Split logs into sliding windows and build an event count matrix.

    Args:
        para: dict with ``'save_path'`` (directory prefix for the cached
            window file), ``'window_size'`` and ``'step_size'`` (both in
            hours, both must be positive).
        raw_data: one time value in seconds per log line; the scan assumes
            ascending order.
        event_mapping_data: one row per log line whose first item is the
            1-based event id of that line.

    Returns:
        Array of shape (number of windows, number of distinct event ids);
        entry ``[w, e - 1]`` counts how often event ``e`` occurs in window
        ``w``.

    Raises:
        ValueError: if ``window_size`` or ``step_size`` is not positive.
        IndexError: if an event id lies outside ``1..number of distinct ids``
            or a window refers to a log line without an event id.

    Note:
        The window boundaries are cached in
        ``save_path + '_sliding_<window>h_<step>h.csv'`` and re-used whenever
        that file exists, even if ``raw_data`` has changed since.
    """
    window_seconds = para['window_size'] * 3600
    step_seconds = para['step_size'] * 3600
    if window_seconds <= 0 or step_seconds <= 0:
        # A non-positive step would never move the window forward.
        raise ValueError('window_size and step_size must be positive')

    time_data = np.asarray(raw_data)
    log_size = time_data.shape[0]

    event_ids = np.asarray([row[0] for row in event_mapping_data], dtype=int)
    event_num = len(set(event_ids.tolist()))
    if event_ids.size and (event_ids.min() < 1 or event_ids.max() > event_num):
        # Id 0 would silently be counted in the last column via index -1.
        raise IndexError('event ids must lie in 1..%d' % event_num)

    if log_size == 0:
        return np.zeros((0, event_num))

    # Directory for the cached (start_index, end_index) table.
    if not os.path.exists(para['save_path']):
        os.makedirs(para['save_path'])
    sliding_file_path = (para['save_path'] + '_sliding_' + str(para['window_size'])
                         + 'h_' + str(para['step_size']) + 'h.csv')
    print("the sliding_file_path is:", sliding_file_path)

    # ============= divide into sliding windows ============
    if os.path.exists(sliding_file_path):
        print("Loading start_end_index_list from file")
        start_end_index_list = pd.read_csv(sliding_file_path, header=None).values
    else:
        start_end_index_list = _linux_sliding_windows(time_data, window_seconds,
                                                      step_seconds)
        np.savetxt(sliding_file_path, start_end_index_list, delimiter=',', fmt='%d')
    inst_number = len(start_end_index_list)
    print("there are %d instances (sliding windows) in this dataset" % (inst_number))

    if inst_number and max(end for _, end in start_end_index_list) > event_ids.size:
        raise IndexError('a sliding window refers to a log line without an event id')

    print("the event number is:", event_num)

    # ============ get event count of each sliding window =============
    event_count_matrix = np.zeros((inst_number, event_num))
    for row, (start_index, end_index) in enumerate(start_end_index_list):
        # Event ids are 1-based, matrix columns 0-based.
        event_count_matrix[row] = np.bincount(event_ids[start_index:end_index] - 1,
                                              minlength=event_num)

    return event_count_matrix
import os
import pandas as pd
import numpy as np
from collections import Counter
import re
import optparse

# joblib and dataloader are imported inside the functions that need them so
# that the pure helpers below stay importable on their own.

_MONTH_NUMBERS = {
    'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6,
    'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12,
}

# Days that precede the first day of each month in a non-leap year.
_DAYS_BEFORE_MONTH = (0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334)


def _lookup_month(text):
    """Return the month number for the first three letters of *text*, or None."""
    return _MONTH_NUMBERS.get(text.strip()[:3].capitalize())


def month_string_to_number(string):
    """Convert a month name such as ``'Jun'`` to its number (1-12).

    Only the first three letters are used and case is ignored.  A leading
    ``<...>`` tag (for example ``'<13>Jun'``) is skipped.

    Raises:
        ValueError: if no month name can be recognised.
    """
    number = _lookup_month(string)
    if number is None:
        tagged = re.match(r'<.*>(.*)', string.strip())
        if tagged is not None:
            number = _lookup_month(tagged.group(1))
    if number is None:
        raise ValueError('Not a month: {!r}'.format(string))
    return number


def _day_of_year(month, day):
    """Return the 1-based day of a non-leap year for a month/day pair."""
    month = int(month)
    if not 1 <= month <= 12:
        raise ValueError('Not a month number: {!r}'.format(month))
    return _DAYS_BEFORE_MONTH[month - 1] + int(day)


def trans_seconds(month_list, day_list, time_list):
    """Convert month/day/time triples to seconds relative to the first day.

    Args:
        month_list: month numbers (1-12), one per log line.
        day_list: day of month, one per log line.
        time_list: ``[hours, minutes, seconds]`` triples, one per log line.

    Returns:
        One integer per log line: whole days elapsed since the first entry's
        date, in seconds, plus that line's own time of day.

    The logs carry no year, so real month lengths of a non-leap year are
    used.  (Treating every month as 30 days made Feb 1 precede Jan 31.)
    A roll-over from December to January is not detected.
    """
    if len(day_list) == 0:
        return []
    first_day = _day_of_year(month_list[0], day_list[0])
    seconds_list = []
    for month, day, clock in zip(month_list, day_list, time_list):
        elapsed_days = _day_of_year(month, day) - first_day
        seconds_list.append(elapsed_days * 24 * 3600
                            + int(clock[0]) * 3600 + int(clock[1]) * 60 + int(clock[2]))
    return seconds_list


def Event_Convert(fd, filename):
    """Number the distinct ``EventId`` values of *fd* and persist the mapping.

    Ids are assigned from 1 in order of first appearance, so the same input
    always yields the same mapping (``set`` order differs between runs).

    Returns:
        dict mapping ``str(number)`` to the original event id; the same dict
        is dumped to *filename* with joblib.
    """
    import joblib

    event_map = {str(number): event
                 for number, event in enumerate(dict.fromkeys(fd['EventId']), start=1)}
    joblib.dump(event_map, filename)
    return event_map


def _build_event_matrix(para, structured_log_filename, dict_filename,
                        structured_log_id_filename, matrix_filename):
    """Turn one structured log CSV into an event count matrix saved as .npy.

    Steps: load or create the event dictionary, replace each ``EventId`` by
    its number, write the renumbered CSV, reverse the row order, convert the
    Month/Date/Time columns to seconds and hand everything to
    ``Linux_preprocess_data``.
    """
    import joblib
    from dataloader import Linux_preprocess_data

    log_frame = pd.read_csv(structured_log_filename)

    if os.path.isfile(dict_filename):
        # NOTE: joblib.load unpickles the file; only load trusted dictionaries.
        event_map = joblib.load(dict_filename)
    else:
        event_map = Event_Convert(log_frame, dict_filename)
    number_by_event = {event: number for number, event in event_map.items()}

    numbered = log_frame['EventId'].map(number_by_event)
    if numbered.isna().any():
        unknown = sorted(set(log_frame.loc[numbered.isna(), 'EventId'].astype(str)))
        raise ValueError('events missing from {}: {}'.format(dict_filename, unknown))
    log_frame['EventId'] = numbered.astype(int)
    log_frame.to_csv(structured_log_id_filename, index=False)

    # Reverse the rows; presumably the input is newest-first and this yields
    # ascending time -- verify against the log source.
    ordered = log_frame.iloc[::-1].reset_index(drop=True)

    time_list = [clock.split(':') for clock in ordered['Time']]
    day_list = list(ordered['Date'])
    month_list = [month_string_to_number(name) for name in ordered['Month']]
    raw_data = np.array(trans_seconds(month_list, day_list, time_list))

    # Event ids are taken from the same reversed frame as the timestamps.
    # The training path used to read them from the un-reversed frame, which
    # paired every timestamp with the wrong event.
    event_mapping_data = [[int(event_id), template]
                          for event_id, template in zip(ordered['EventId'],
                                                        ordered['EventTemplate'])]

    event_count_matrix = Linux_preprocess_data(para, raw_data, event_mapping_data)
    print("the event_count_matrix is:", event_count_matrix)
    np.save(matrix_filename, event_count_matrix)
    return event_count_matrix


def main():
    """Build the training and the incoming event count matrices from the CLI."""
    parser = optparse.OptionParser('usage %prog --p1 --p2 --p3 --p4 --p5 --p6 --p7 --p8 ')
    parser.add_option('--p1', dest='structured_log_filename', type='string', help='Please input the structured log filename: ')
    parser.add_option('--p2', dest='dict_filename', type='string', help='Please input the dict filename for training data: ')
    parser.add_option('--p3', dest='structured_log_id_filename', type='string', help='Please input the structured log id filename: ')
    parser.add_option('--p4', dest='matrix', type='string', help='Please input the location where you want to save the matrix: ')
    parser.add_option('--p5', dest='structured_log_com_filename', type='string', help='Please input the coming structured log filename: ')
    parser.add_option('--p6', dest='dict_filename_com', type='string', help='Please input the dict filename for testing data')
    parser.add_option('--p7', dest='structured_log_id_com_filename', type='string', help='Please input the coming structured log id filename: ')
    parser.add_option('--p8', dest='matrix_com', type='string', help='Please input the location where you want to save the coming matrix: ')

    (options, args) = parser.parse_args()

    # Every option is a required path; fail early with the usage message.
    missing = [flag for flag, value in (
        ('--p1', options.structured_log_filename),
        ('--p2', options.dict_filename),
        ('--p3', options.structured_log_id_filename),
        ('--p4', options.matrix),
        ('--p5', options.structured_log_com_filename),
        ('--p6', options.dict_filename_com),
        ('--p7', options.structured_log_id_com_filename),
        ('--p8', options.matrix_com),
    ) if value is None]
    if missing:
        parser.error('missing option(s): ' + ', '.join(missing))

    # ===== event matrix for the training logs =====
    para = {}
    para['save_path'] = '../../Dataset_ML/Linux/Client/Client_train/'
    para['window_size'] = 0.5  # hours
    para['step_size'] = 0.2  # hours
    _build_event_matrix(para, options.structured_log_filename, options.dict_filename,
                        options.structured_log_id_filename, options.matrix)

    # ===== event matrix for the incoming logs =====
    para_com = {}
    para_com['save_path'] = '../../Dataset_ML/Linux/Client/Client_com/'
    para_com['window_size'] = 24  # 24 hours ---- one day
    para_com['step_size'] = 3  # 3 hours
    _build_event_matrix(para_com, options.structured_log_com_filename,
                        options.dict_filename_com,
                        options.structured_log_id_com_filename, options.matrix_com)


if __name__ == "__main__":
    main()