diff --git a/data/Linux/anomaly_lables.csv b/data/Linux/anomaly_lables.csv
new file mode 100644
index 0000000..c2b8c57
Binary files /dev/null and b/data/Linux/anomaly_lables.csv differ
diff --git a/data/Linux/log_matrix.npy b/data/Linux/log_matrix.npy
new file mode 100644
index 0000000..5e371a5
Binary files /dev/null and b/data/Linux/log_matrix.npy differ
diff --git a/data/Linux/mal_matrix.npy b/data/Linux/mal_matrix.npy
new file mode 100644
index 0000000..fbcc8f2
Binary files /dev/null and b/data/Linux/mal_matrix.npy differ
diff --git a/demo/PCA_demo_without_labels.py b/demo/PCA_demo_without_labels.py
index d54a1c0..4b1c0a2 100644
--- a/demo/PCA_demo_without_labels.py
+++ b/demo/PCA_demo_without_labels.py
@@ -14,35 +14,119 @@
 sys.path.append('../')
 from loglizer.models import PCA
 from loglizer import dataloader, preprocessing
+from collections import Counter
+import pandas as pd
+
+# struct_log = '../data/HDFS/HDFS_100k.log_structured.csv' # The structured log file
+struct_log = '../../Dataset_ML/Linux_matrix/log_matrix.npy'
+mal_struct_log = '../../Dataset_ML/Linux_mal_matrix/mal_matrix.npy'
 
-struct_log = '../data/HDFS/HDFS_100k.log_structured.csv' # The structured log file
 
 if __name__ == '__main__':
-    ## 1. Load strutured log file and extract feature vectors
-    # Save the raw event sequence file by setting save_csv=True
-    (x_train, _), (_, _) = dataloader.load_HDFS(struct_log, window='session', 
-                                                split_type='sequential', save_csv=True)
+    # # 1. Load structured log file and extract feature vectors
+    # # Save the raw event sequence file by setting save_csv=True
+    # (x_train, _), (_, _) = dataloader.load_HDFS(struct_log, window='session',
+    #                                             split_type='sequential', save_csv=True)
+    # feature_extractor = preprocessing.FeatureExtractor()
+    # x_train = feature_extractor.fit_transform(x_train, term_weighting='tf-idf',
+    #                                           normalization='zero-mean')
+    #
+    # ## 2. Train an unsupervised model
+    # print('Train phase:')
+    # # Initialize PCA, or other unsupervised models, LogClustering, InvariantsMiner
+    # model = PCA()
+    # # Model hyper-parameters may be sensitive to log data, here we use the default for demo
+    # model.fit(x_train)
+    # # Make predictions and manually check for correctness. Details may need to go into the raw logs
+    # y_train = model.predict(x_train)
+    #
+    # ## 3. Use the trained model for online anomaly detection
+    # print('Test phase:')
+    # # Load another new log file. Here we use struct_log for demo only
+    # (x_test, _), (_, _) = dataloader.load_HDFS(struct_log, window='session', split_type='sequential')
+    # # Go through the same feature extraction process with training, using transform() instead
+    # x_test = feature_extractor.transform(x_test)
+    # # Finally make predictions and alter on anomaly cases
+    # y_test = model.predict(x_test)
+    # print("the result is:",y_test)
+    # print("the labels are:",Counter(y_test))
+
+
+    # example without train_ratio
+    (x_train, _), (_, _) = dataloader.load_Linux(struct_log, window='sliding',split_type='sequential', save_csv = True)
     feature_extractor = preprocessing.FeatureExtractor()
-    x_train = feature_extractor.fit_transform(x_train, term_weighting='tf-idf', 
-                                              normalization='zero-mean')
-    
-    ## 2. Train an unsupervised model
-    print('Train phase:')
-    # Initialize PCA, or other unsupervised models, LogClustering, InvariantsMiner
-    model = PCA() 
-    # Model hyper-parameters may be sensitive to log data, here we use the default for demo
+    x_train = feature_extractor.fit_transform(x_train, term_weighting='tf-idf', normalization='zero-mean')
+
+    # 2.Train an unsupervised model
+    print("Train phase")
+    # Initialize PCA
+    model = PCA()
+    # model hyper-parameters may be sensitive to log data, here we use the default for demo
     model.fit(x_train)
-    # Make predictions and manually check for correctness. Details may need to go into the raw logs
-    y_train = model.predict(x_train) 
-
-    ## 3. Use the trained model for online anomaly detection
-    print('Test phase:')
-    # Load another new log file. Here we use struct_log for demo only
-    (x_test, _), (_, _) = dataloader.load_HDFS(struct_log, window='session', split_type='sequential')
-    # Go through the same feature extraction process with training, using transform() instead
-    x_test = feature_extractor.transform(x_test) 
+    # make predictions and manually check for correctness. Details may need to go into the raw logs
+    y_train = model.predict(x_train)
+
+    # 3. Use the trained model for online anomaly detection
+    print("Test phase:")
+    # load another new log file, here we should know the basic set should be large as much as possible
+    # cuz for every vector, the same position may have different meanings --- can not be compared
+    (x_test,_),(_,_) = dataloader.load_Linux(mal_struct_log, window = 'sliding', split_type = 'sequential')
+    # go through the same feature extraction process with training
+
+    x_test_original = x_test.copy()
+    # assert x_test == x_train, 'the training data is not the same with testing data'
+    x_test = feature_extractor.transform(x_test)
     # Finally make predictions and alter on anomaly cases
     y_test = model.predict(x_test)
-    
+    # build the tracing dict: key "<str(window vector)>,<row index>" -> predicted label
+    # of that row. The value has to be the row's own label y; assigning y_test
+    # stored the whole prediction array under every key.
+    # enumerate supplies the row counter that makes each key unique.
+    x_y_dict = {}
+    for i, (x, y) in enumerate(zip(x_test_original, y_test)):
+        x_y_dict[str(x)+','+str(i)] = y
+    # print("the result is:", len(y_test))
+    # print("the key names are:", x_y_dict.keys())
+    # get the indexs of anomaly sequences
+    anomaly_sequence_index = [i for i in range(len(y_test)) if y_test[i] == 1]
+    print("the index of anomaly sequence is:", anomaly_sequence_index)
+
+    # trace the index in the sliding_file_path
+    # NOTE(review): _split_data shuffles the rows it returns, so `index` may not
+    # be the window's row number in this file -- confirm before trusting the trace
+    sliding_file_path = '../../Dataset_ML/Linux_mal_sliding_24h_3h.csv'
+    # parse the (start, end) pairs once, and only when something was flagged
+    fd = pd.read_csv(sliding_file_path, header = None) if anomaly_sequence_index else None
+    for index in anomaly_sequence_index:
+        start_index = fd.iloc[index, 0]
+        end_index = fd.iloc[index, 1]
+        print("please check log csv indexes between {} and {}".format(start_index, end_index))
+
+    anomaly_sequence = []
+    for index in anomaly_sequence_index:
+        # anomaly_sequence = [var for var in x_y_dict.keys() if int(var.split(',')[-1]) == index]
+
+        for var in x_y_dict.keys():
+            # print("the var is:",var)
+            if int(var.split(',')[-1]) == index:
+                # print out the anomaly test_x sequence
+                # print(var)
+                anomaly_sequence.append(var)
+
+    # print("the anomaly sequence is:", len(anomaly_sequence))
+    print("the lables are:", Counter(y_test))
+    print("the counter is {} and the anomaly rate is: {}".format(Counter(y_test), len(anomaly_sequence)/x_test.shape[0]))
+
+'''
+For HDFS:
+the result is: [0. 0. 0. ... 0. 0. 0.]
+the labels are: Counter({0.0: 3951, 1.0: 19}) --- there are 19 anomalies
+For Linux_logs:
+Counter({0.0: 163, 1.0: 3/5})   0.0184 --- 0.0307
+For Linux_mali_logs:
+Counter({0.0: 127, 1.0: 25})    0.1969
+'''
+
+
 
 
diff --git a/loglizer/dataloader.py b/loglizer/dataloader.py
index 574617e..83b2f69 100644
--- a/loglizer/dataloader.py
+++ b/loglizer/dataloader.py
@@ -38,8 +38,9 @@ def _split_data(x_data, y_data=None, train_ratio=0, split_type='uniform'):
         else:
             y_train = y_data[0:num_train]
             y_test = y_data[num_train:]
-    # Random shuffle
-    indexes = shuffle(np.arange(x_train.shape[0]))
+
+    # fixed shuffle ---- in order to have the same result
+    indexes = shuffle(np.arange(x_train.shape[0]), random_state=7)
     x_train = x_train[indexes]
     if y_train is not None:
         y_train = y_train[indexes]
@@ -140,6 +141,7 @@ def load_BGL(log_file, label_file=None, window='sliding', time_interval=60, step
     """
 
 
+
 def bgl_preprocess_data(para, raw_data, event_mapping_data):
     """ split logs into sliding windows, built an event count matrix and get the corresponding label
 
@@ -159,20 +161,31 @@ def bgl_preprocess_data(para, raw_data, event_mapping_data):
     if not os.path.exists(para['save_path']):
         os.mkdir(para['save_path'])
     log_size = raw_data.shape[0]
-    sliding_file_path = para['save_path']+'sliding_'+str(para['window_size'])+'h_'+str(para['step_size'])+'h.csv'
+    sliding_file_path = para['save_path']+'_sliding_'+str(para['window_size'])+'h_'+str(para['step_size'])+'h.csv'
 
     #=============divide into sliding windows=========#
     start_end_index_list = [] # list of tuples, tuple contains two number, which represent the start and end of sliding time window
-    label_data, time_data = raw_data[:,0], raw_data[:, 1]
+    # get the list of label data and the list of time data
+    label_data, time_data = raw_data[:,0], raw_data[:,1]
     if not os.path.exists(sliding_file_path):
         # split into sliding window
+        # get the first value in the time_data list
         start_time = time_data[0]
+        print("the start_time is:",start_time)
+        print("the type of time is:",type(start_time))
+        # the index points at the index in the time_data list
         start_index = 0
         end_index = 0
 
         # get the first start, end index, end time
         for cur_time in time_data:
-            if  cur_time < start_time + para['window_size']*3600:
+            # the start_time + para['window_size']:
+            ## start_time is the first value in the time_data list
+            ## get the data scope using the window size
+            ## cur_time < the result means it is in the scope of window size
+            print("the current time is:",cur_time)
+            # if cur_time < start_time + para['window_size']*3600:
+            if int(cur_time) < int(start_time) + para['window_size'] * 3600:
                 end_index += 1
                 end_time = cur_time
             else:
@@ -181,15 +194,19 @@ def bgl_preprocess_data(para, raw_data, event_mapping_data):
                 break
         # move the start and end index until next sliding window
         while end_index < log_size:
-            start_time = start_time + para['step_size']*3600
-            end_time = end_time + para['step_size']*3600
+            # start_time = start_time + para['step_size']*3600
+            # end_time = end_time + para['step_size']*3600
+            start_time = int(start_time) + para['step_size']*3600
+            end_time = int(end_time) + para['step_size']*3600
             for i in range(start_index,end_index):
-                if time_data[i] < start_time:
+                # if time_data[i] < start_time:
+                if int(time_data[i]) < start_time:
                     i+=1
                 else:
                     break
             for j in range(end_index, log_size):
-                if time_data[j] < end_time:
+                # if time_data[j] < end_time:
+                if int(time_data[j]) < end_time:
                     j+=1
                 else:
                     break
@@ -199,7 +216,7 @@ def bgl_preprocess_data(para, raw_data, event_mapping_data):
             start_end_index_list.append(start_end_pair)
         inst_number = len(start_end_index_list)
         print('there are %d instances (sliding windows) in this dataset\n'%inst_number)
-        np.savetxt(sliding_file_path,start_end_index_list,delimiter=',',fmt='%d')
+        np.savetxt(sliding_file_path, start_end_index_list, delimiter=',', fmt='%d')
     else:
         print('Loading start_end_index_list from file')
         start_end_index_list = pd.read_csv(sliding_file_path, header=None).values
@@ -218,16 +235,24 @@ def bgl_preprocess_data(para, raw_data, event_mapping_data):
             expanded_indexes_list[i].append(l)
 
     event_mapping_data = [row[0] for row in event_mapping_data]
+    print("the event_mapping_data is:", event_mapping_data)
     event_num = len(list(set(event_mapping_data)))
     print('There are %d log events'%event_num)
 
     #=============get labels and event count of each sliding window =========#
     labels = []
+    # inst_number --- row, every row is a log sequence(windows sliding)
+    # event_num --- column, every column is a event, the number is the occurrence of a corresponding event
     event_count_matrix = np.zeros((inst_number,event_num))
     for j in range(inst_number):
         label = 0   #0 represent success, 1 represent failure
         for k in expanded_indexes_list[j]:
+            print("the length of expanded_indexes_list is:",len(expanded_indexes_list[j]))
+            print("the k value is:",k)
             event_index = event_mapping_data[k]
+            print("the event_index is:", event_index)
+            # the index is not different from the eventId
+            event_index = event_index-1
             event_count_matrix[j, event_index] += 1
             if label_data[k]:
                 label = 1
@@ -237,3 +262,171 @@ def bgl_preprocess_data(para, raw_data, event_mapping_data):
     print("Among all instances, %d are anomalies"%sum(labels))
     assert event_count_matrix.shape[0] == len(labels)
     return event_count_matrix, labels
+
+
+
+# this is a part of test for bgl_preprocess_data function
+# import os
+# import pandas as pd
+# import numpy as np
+# from collections import Counter
+#
+# para = {}
+# para['save_path'] = '../../logparser-master/logs/BGL/BGL_2k.log_matrix'
+# para['window_size'] = 24 # 24 hours ---- one day
+# para['step_size'] = 3 # 3 hours
+#
+# # list data, the element is tuple of (label, time)
+#
+# # System log Detection/Anomaly_Detection_Time.ipynb
+# df_raw_data = pd.read_csv('../../logparser-master/logs/BGL/BGL_2k.log_structured.csv')
+# raw_data = []
+# for label, time in zip(df_raw_data['Label'],df_raw_data['Timestamp']):
+#     raw_data.append((label, time))
+# # raw_data
+# raw_data = np.array(raw_data)
+#
+# df_map_event = pd.read_csv('../../logparser-master/logs/BGL/BGL_2k.log_structured.csv')
+# event_mapping_data = []
+# ids = []
+# ids = [int(x[1:]) for x in df_map_event['EventId']]
+#
+# for id, log in zip(ids, df_map_event['EventTemplate']):
+#     event_mapping_data.append([id,log])
+#
+#
+# event_count_matrix, labels = bgl_preprocess_data(para, raw_data, event_mapping_data)
+# print("the event_count_matrix is:", Counter(event_count_matrix[9]))
+# print("the labels are:", Counter(labels))
+
+
+def load_Linux(log_file, label_file=None, window ='sliding', time_interval = None,stepping_size = None, train_ratio = 0.5, split_type = 'sequential', save_csv=False):
+    """Load a pre-built event count matrix (.npy) and split it with _split_data.
+
+    Only unlabeled data with window='sliding' is handled; time_interval,
+    stepping_size and save_csv are accepted but unused.
+    """
+    print('========== Input data summary==========')
+    if not log_file.endswith('.npy'):
+        raise NotImplementedError('load_Linux() only support npy files')
+    assert window == 'sliding','Only window=sliding is supported for Linux dataset'
+    if label_file is not None:
+        raise NotImplementedError('load_Linux() does not support label files yet')
+    if split_type == 'uniform':
+        print('Warning: Only split type=sequential is supported')
+        split_type = 'sequential'
+    x_data = np.load(log_file)
+    (x_train,_),(x_test,_) = _split_data(x_data, train_ratio = train_ratio, split_type = split_type)
+    print('Total: {} instances, train: {} instances, test: {} instances'.format(x_data.shape[0], x_train.shape[0], x_test.shape[0]))
+    return (x_train, None), (x_test, None)
+
+# this is a part of test for linux_preprocess_data function --- get the event matrix
+
+
+def Linux_preprocess_data(para, raw_data, event_mapping_data):
+    """
+    Split logs into sliding time windows and build an event count matrix.
+
+    Args:
+    --------
+    para: dict with 'save_path', 'window_size' (hours) and 'step_size' (hours)
+    raw_data: array of log timestamps in seconds (assumed to be in increasing order)
+    event_mapping_data: one row per log line; row[0] is that line's 1-based event id
+
+    Returns:
+    --------
+    event_count_matrix: one row per window, one column per event id, holding counts
+    """
+
+    # make sure the save directory exists; the window index pairs are cached in a csv
+    if not os.path.exists(para['save_path']):
+        os.mkdir(para['save_path'])
+    log_size = raw_data.shape[0]
+    sliding_file_path = para['save_path']+'_sliding_'+str(para['window_size'])+'h_'+str(para['step_size'])+'h.csv'
+    print("the sliding_file_path is:", sliding_file_path)
+
+    # ============= divide into sliding windows ============
+
+    start_end_index_list = [] # list of tuples, tuple contains two number, which represent the start and end of sliding time window
+    # the raw data carries timestamps only (no label column)
+    time_data = raw_data
+    print("the time_data is:", time_data)
+    if not os.path.exists(sliding_file_path):
+        start_time = time_data[0]
+        print("the start_time is:",start_time)
+        start_index = 0
+        end_index = 0
+        # grow the first window until a log falls outside window_size hours
+        for cur_time in time_data:
+            if cur_time < start_time + para['window_size'] * 3600:
+                end_index += 1
+                end_time = cur_time
+            else:
+                break
+        # always record the first window: it used to be stored only on `break`,
+        # so a log set that fits in a single window produced zero instances
+        start_end_index_list.append((start_index, end_index))
+        # slide the window forward by step_size hours until end_index reaches log_size
+        while end_index < log_size:
+            # shift both window bounds by the step
+            start_time = start_time + para['step_size']*3600
+            end_time = end_time + para['step_size']*3600
+            for i in range(start_index, end_index):
+                if time_data[i] < start_time:
+                    i += 1
+                else:
+                    break
+            for j in range(end_index, log_size):
+                if time_data[j] < end_time:
+                    j += 1
+                else:
+                    break
+            start_index = i
+            end_index = j
+            # store the (start, end) pair of the new window
+            start_end_pair = tuple((start_index, end_index))
+            start_end_index_list.append(start_end_pair)
+        # compute how many sequence(lines) in total
+        inst_number = len(start_end_index_list)
+        print("there are %d instances (sliding windows) in this dataset"%(inst_number))
+        np.savetxt(sliding_file_path, start_end_index_list, delimiter=',', fmt='%d')
+    else:
+        print("Loading start_end_index_list from file")
+        start_end_index_list = pd.read_csv(sliding_file_path, header = None).values
+        inst_number = len(start_end_index_list)
+        print("there are %d instances (sliding windows) in this dataset"%(inst_number))
+
+    # get all the log indexes in each time window by ranging from start_index to end_index
+    # (one list of log row numbers per window, consumed by the counting loop below)
+    expanded_indexes_list = []
+    for t in range(inst_number):
+        # for every row(sequence), there should be a index_list
+        index_list = []
+        expanded_indexes_list.append(index_list)
+    for i in range(inst_number):
+        # get the index_list for every row
+        start_index = start_end_index_list[i][0]
+        end_index = start_end_index_list[i][1]
+        # add the indexes for a sequence to expanded_indexed_list
+        for l in range(start_index, end_index):
+            expanded_indexes_list[i].append(l)
+
+    event_mapping_data = [row[0] for row in event_mapping_data]
+    # number of distinct event ids; NOTE(review): column = id - 1 assumes ids are 1..event_num
+    event_num = len(list(set(event_mapping_data)))
+    print("the event number is:", event_num)
+
+    # ============ get event count of each sliding window =============
+    event_count_matrix = np.zeros((inst_number, event_num))
+    for j in range(inst_number):
+        for k in expanded_indexes_list[j]:
+            event_index = event_mapping_data[k]
+            # make the eventId suitable for list index
+            event_index = event_index - 1
+            event_count_matrix[j, event_index] += 1
+
+    return event_count_matrix
+
+
+
+
diff --git a/loglizer/exec.sh b/loglizer/exec.sh
new file mode 100644
index 0000000..e2d0e01
--- /dev/null
+++ b/loglizer/exec.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+# Build the event count matrices for the training and the incoming Linux client logs.
+path1='../../Dataset_ML/Linux/Client/Client_train/structured_log.csv'
+path2='../../Dataset_ML/Linux/Client/Client_train/Event_dict.pkl'
+path3='../../Dataset_ML/Linux/Client/Client_train/structured_log_id.csv'
+path4='../../Dataset_ML/Linux/Client/Client_train/Linux_matrix/log_matrix.npy'
+path5='../../Dataset_ML/Linux/Client/Client_com/structured_log.csv'
+path6='../../Dataset_ML/Linux/Client/Client_com/Event_dict.pkl'
+path7='../../Dataset_ML/Linux/Client/Client_com/structured_log_id.csv'
+path8='../../Dataset_ML/Linux/Client/Client_com/Linux_matrix/log_matrix.npy'
+
+python3 matrixgen_client.py --p1 "$path1" --p2 "$path2" --p3 "$path3" --p4 "$path4" --p5 "$path5" --p6 "$path6" --p7 "$path7" --p8 "$path8"
+# propagate the generator's exit status instead of always reporting success
+exit $?
diff --git a/loglizer/matrixgen.py b/loglizer/matrixgen.py
new file mode 100644
index 0000000..cfde401
--- /dev/null
+++ b/loglizer/matrixgen.py
@@ -0,0 +1,261 @@
+import os
+import pandas as pd
+import numpy as np
+from collections import Counter
+import re
+from dataloader import *
+import joblib
+import optparse
+
+# function to transform hours and minutes to seconds
+def trans_seconds(time_list):
+    """Convert [hour, minute, second] triples into seconds since midnight.
+
+    NOTE: a three-argument trans_seconds defined further down this module
+    rebinds the name, so this version is not the one the script ends up calling.
+    """
+    return [int(h_m_s[0]) * 3600 + int(h_m_s[1]) * 60 + int(h_m_s[2])
+            for h_m_s in time_list]
+
+# transformation between month name to numbers
+def month_string_to_number(string):
+    """Map an English month name or abbreviation to its number (1-12).
+
+    A name prefixed by a bracketed tag, e.g. '<N/ASCII>Jun', is also accepted.
+    Raises ValueError when no month can be recognised.
+    """
+    m = {
+        'Jan': 1,
+        'Feb': 2,
+        'Mar': 3,
+        'Apr': 4,
+        'May': 5,
+        'Jun': 6,
+        'Jul': 7,
+        'Aug': 8,
+        'Sep': 9,
+        'Oct': 10,
+        'Nov': 11,
+        'Dec': 12
+    }
+    s = string.strip()[:3]
+    if s not in m:
+        # process the special case with <N/ASCII>Jun: keep only what follows the tag
+        match = re.match(r'<.*>(.*)', string)
+        s = match.group(1).strip()[:3] if match else None
+    if s not in m:
+        raise ValueError('Not a month')
+    return m[s]
+
+# transform month, day to seconds
+def trans_seconds(month_list, day_list, time_list):
+    """Return each record's offset in seconds from 00:00 of the first record's day, assuming 30-day months."""
+    offsets = []
+    for idx, day in enumerate(day_list):
+        month_gap = int(month_list[idx]) - int(month_list[0])
+        day_gap = int(day) - int(day_list[0])
+        clock = time_list[idx]
+        clock_seconds = int(clock[0]) * 3600 + int(clock[1]) * 60 + int(clock[2])
+        offsets.append(month_gap * 30 * 24 * 3600 + day_gap * 24 * 3600 + clock_seconds)
+    return offsets
+
+# transform log key to eventID
+# def Event_Convert(fd):
+#     event_map = {}
+#     for i, event in enumerate(fd['EventId']):
+#         event_map['E' + str(i+1)] = event
+#
+#     return event_map
+def Event_Convert(fd, filename):
+    """Number the distinct EventId values of ``fd`` from 1 and persist the map.
+
+    Returns {'1': event, '2': event, ...}. Ids follow first appearance, so the
+    numbering is reproducible; iterating a set() of strings varies between runs.
+    """
+    unique_events = dict.fromkeys(fd['EventId'])
+    event_map = {str(i): event for i, event in enumerate(unique_events, start=1)}
+    joblib.dump(event_map, filename)
+    return event_map
+
+
+if __name__ == "__main__":
+
+    # define the window_size and step_size to get time sequence
+    para = {}
+    para['save_path'] = '../../Dataset_ML/Linux/Client/Client_train/'
+    para['window_size'] = 0.5 # window length in hours (0.5 h = 30 minutes)
+    para['step_size'] = 0.2 # window step in hours (0.2 h = 12 minutes)
+
+    # =============================== generate the event matrix for norcom linux logs =========================
+
+    # set the format of command input
+    parser = optparse.OptionParser('usage %prog --p1 <structured log filename for training data> \
+                                    --p2 <dict_filename> --p3 <structured id log filename for training data> --p4 <transformed matrix for training> \
+                                    --p5 <structured log filename for testing data> --p6 <dict_filename_com> --p7 <structured id log filename for testing data> \
+                                    --p8 <transformed matrix for testing>')
+    # set the elements for every parameter
+    parser.add_option('--p1', dest='structured_log_filename', type='string', help='Please input the structured log filename: ')
+    parser.add_option('--p2', dest='dict_filename', type='string', help='Please input the dict filename for training data: ')
+    parser.add_option('--p3', dest='structured_log_id_filename', type='string', help='Please input the structured log id filename: ')
+    parser.add_option('--p4', dest='matrix', type='string', help='Please input the location where you want to save the matrix: ')
+    parser.add_option('--p5', dest='structured_log_com_filename', type='string', help='Please input the coming structured log filename: ')
+    parser.add_option('--p6', dest='dict_filename_com', type='string', help='Please input the dict filename for testing data')
+    parser.add_option('--p7', dest='structured_log_id_com_filename', type='string', help='Please input the coming structured log id filename: ')
+    parser.add_option('--p8', dest='matrix_com', type='string', help='Please input the location where you want to save the coming matrix: ')
+
+
+    # parser arguments through the parse_args()
+    (options, args) = parser.parse_args()
+    # get the values from options
+    structured_log_filename = options.structured_log_filename
+    dict_filename = options.dict_filename
+    structured_log_id_filename = options.structured_log_id_filename
+    matrix = options.matrix
+    structured_log_com_filename = options.structured_log_com_filename
+    dict_filename_com = options.dict_filename_com
+    structured_log_id_com_filename = options.structured_log_id_com_filename
+    matrix_com = options.matrix_com
+
+    # get the linux dataframe
+    fd_linux = pd.read_csv(structured_log_filename)
+    # make a copy to avoid modifying the original data
+    fd_linux = fd_linux.copy()
+
+    # dict_filename has been given by parser
+    # check whether the dict_filename has existed
+    if os.path.isfile(dict_filename):
+        event_map = joblib.load(dict_filename)
+    else:
+        event_map = Event_Convert(fd_linux, dict_filename)
+    # shift the key and value of the dict
+    event_map = {val: key for (key, val) in event_map.items()}
+    
+    #for i in range(len(fd_linux['EventId'])):
+     #   for key, value in event_map.items():
+      #      # print("the key {} and value {}".format(key,  value))
+        #    if fd_linux['EventId'][i] == value:
+       #         # replace the hashed eventId into format like numerical id
+         #       fd_linux.is_copy = False
+          #      fd_linux['EventId'][i] = key
+           #     print("the replace eventId is:", fd_linux['EventId'][i])
+    
+
+    #fd_linux['EventId'].map(event_map).fillna(fd_linux['EventId'])
+    fd_linux['EventId'] = fd_linux['EventId'].map(event_map)
+
+    # structured_log_id_filename has been generated above
+    
+    
+    fd_linux.to_csv(structured_log_id_filename, index = False)
+    # read the saved csv
+    fd_linux_id = pd.read_csv(structured_log_id_filename)
+    # sort the dataframe from time increasing order
+    fd_linux_id_sort = fd_linux_id.copy()
+    fd_linux_id_sort.sort_index(axis=0, ascending=False, inplace=True)
+    # reset the index
+    fd_linux_id_sort = fd_linux_id_sort.reset_index(drop = True)
+    print(fd_linux_id_sort.head())
+    # part to transform the month, date, time into seconds
+    month_list, time_list, day_list, day_list = [], [], [], []
+
+    for i in range(len(fd_linux_id_sort['Time'])):
+        time_list.append(fd_linux_id_sort['Time'][i].split(':'))
+    for j in range(len(fd_linux_id_sort['Date'])):
+        day_list.append(fd_linux_id_sort['Date'][j])
+
+    month_number = 0
+    for k in range(len(fd_linux_id_sort['Month'])):
+        month_number = month_string_to_number(fd_linux_id_sort['Month'][k])
+        month_list.append(month_number)
+
+    seconds_list = trans_seconds(month_list, day_list, time_list)
+
+    raw_data = np.array(seconds_list)
+
+    # Build [event id, template] rows from the time-sorted frame so that row k
+    # describes the same log line as raw_data[k]; reading the unsorted
+    # fd_linux_id here paired the timestamps with events in the opposite order.
+    Event_ids = [int(x) for x in fd_linux_id_sort['EventId']]
+    event_mapping_data = [
+        [event_id, template]
+        for event_id, template in zip(Event_ids, fd_linux_id_sort['EventTemplate'])
+    ]
+
+
+    # create the event count matrix with the function of Linux_preprocess_data
+    event_count_matrix = Linux_preprocess_data(para, raw_data, event_mapping_data)
+    # print("the event_count_matrix is:", Counter(event_count_matrix[9]))
+    print("the event_count_matrix is:", event_count_matrix)
+    # matrix path has been generated above
+    np.save(matrix, event_count_matrix)
+
+
+
+    # =============================== generate the event matrix for comicious linux logs =========================
+
+    para_com = {}
+    para_com['save_path'] = '../../Dataset_ML/Linux/Client/Client_com/'
+    para_com['window_size'] = 24  # 24 hours ---- one day
+    para_com['step_size'] = 3  # 3 hours
+
+    # structured_log_com_filename has been give by parser
+    fd_linux_com = pd.read_csv(structured_log_com_filename)
+    fd_linux_com = fd_linux_com.copy()
+
+    # dict_filename_com has been given by parser
+    # check whether the dict_filename_com has existed
+    if os.path.isfile(dict_filename_com):
+        event_map_com = joblib.load(dict_filename_com)
+    else:
+        event_map_com = Event_Convert(fd_linux_com, dict_filename_com)
+
+    # Replace each hashed EventId with its numeric key in one vectorised pass,
+    # mirroring the training section; the element-wise chained assignment this
+    # replaces was quadratic and is not guaranteed to write back to the frame.
+    event_map_com = {val: key for (key, val) in event_map_com.items()}
+    fd_linux_com['EventId'] = fd_linux_com['EventId'].map(event_map_com)
+
+    # structured_log_com_filename
+    fd_linux_com.to_csv(structured_log_id_com_filename, index=False)
+
+    fd_linux_com_id = pd.read_csv(structured_log_id_com_filename)
+    fd_linux_com_id = fd_linux_com_id.copy()
+
+    fd_linux_com_id.sort_index(axis=0, ascending=False, inplace=True)
+
+    fd_linux_com_id = fd_linux_com_id.reset_index(drop = True)
+
+    fd_linux_com_id = fd_linux_com_id.copy()
+
+    # part to transform date time into seconds
+    month_list_com ,time_list_com, day_list_com, day_list_com = [],[],[], []
+
+    for i in range(len(fd_linux_com_id['Time'])):
+        time_list_com.append(fd_linux_com_id['Time'][i].split(':'))
+    for j in range(len(fd_linux_com_id['Date'])):
+        day_list_com.append(fd_linux_com_id['Date'][j])
+
+    month_number_com = 0
+    for k in range(len(fd_linux_com_id['Month'])):
+        # print("we are transferring the month:",fd_linux['Month'][k])
+        month_number_com = month_string_to_number(fd_linux_com_id['Month'][k])
+        month_list_com.append(month_number_com)
+
+    seconds_list_com = trans_seconds(month_list_com, day_list_com, time_list_com)
+
+    raw_data_com = np.array(seconds_list_com)
+
+    event_mapping_data_com = []
+    Event_ids_com = []
+    # get the digits part of eventID
+    Event_ids_com = [int(x) for x in fd_linux_com_id['EventId']]
+
+    for id, log in zip(Event_ids_com, fd_linux_com_id['EventTemplate']):
+        event_mapping_data_com.append([id, log])
+
+
+    event_count_matrix_com = Linux_preprocess_data(para_com, raw_data_com, event_mapping_data_com)
+    # print("the event_count_matrix is:", Counter(event_count_matrix[9]))
+    print("the event_count_matrix is:", event_count_matrix_com)
+    # matrix_com has been given by parser
+    np.save(matrix_com, event_count_matrix_com)
diff --git a/loglizer/models/PCA.py b/loglizer/models/PCA.py
index 6d6a437..22ebf5a 100644
--- a/loglizer/models/PCA.py
+++ b/loglizer/models/PCA.py
@@ -64,7 +64,7 @@ def fit(self, X):
                 variance += sigma[i]
                 if variance / total_variance >= n_components:
                     break
-            n_components = i + 1
+                n_components = i + 1
 
         P = U[:, :n_components]
         I = np.identity(num_events, int)