aigatech · aaravg04 · Sep 13, 2024 · Sep 17, 2024 · Sep 17, 2024 · Oct 2, 2024
diff --git a/.DS_Store b/.DS_Store
diff --git a/.gitignore b/.gitignore
@@ -19,4 +19,7 @@ cfg_constructor/out
 **/data
 **/bin
 **/.venv
-**/logdat
+**/logdat
+
+# built dgl lib
+**/dgl
diff --git a/.gitmodules b/.gitmodules
diff --git a/README.md b/README.md
@@ -11,28 +11,26 @@ Install dependencies
 Enter Poetry Shell
 `poetry shell`
 
-## Running the CFG Creator
+## Running the CFG Constructor Tool `(src/cfg_constructor)`
 
 To generate Control Flow Graphs (CFGs) from binary files, use the `cfg_creator.py` script. This script analyzes binary files and creates CFGs, which can be visualized or saved in different formats.
 
 ### Usage
 
 Run the script from the root directory using the following command:
 ```sh
-python cfg_constructor/cfg_creator.py --data_dir <path_to_binary_files> --vis_mode <visualization_mode> --job_id <job_id>
+python src/cfg_constructor/cfg_creator.py --data_dir <path_to_binary_files> --vis_mode <visualization_mode> --job_id <job_id>
 ```
 
 Args:
 - `--data_dir`: Path to the directory containing the binary files in the parent directory. (str, default='data')
 - `--vis_mode`: Visualization mode. 0 = visualize in window, 1 = save as HTML docs, 2 = save graphs w/o visualizing as edgelists and `csv` for node values. (int, default=2)
-- `--job_id`: int for job id for use for logging + avoiding reprocessing already processed data based on job_id, vis_mode, and data dir (int, default=0)
+- `--job-id`: Job ID used for logging and for avoiding reprocessing of already-processed data, based on job_id, vis_mode, and data dir. (int, default=0)
 
 Output will be stored dependent on the vis_mode:
 - vis_mode=0: no saved output, graphs will be displayed in GUI
 - vis_mode=1: HTML files in `cfg_constructor/out/out_html`
-- vis_mode=2: CSV files in `cfg_constructor/out/out_edgelists`
-
-For vis_mode = {1, 2}, clear the directory before rerunning our tool as it will empty the directory and overwrite it with files from the currently running job 
+- vis_mode=2: CSV files in `cfg_constructor/out/out_adjacency_matrices`
 
 Example usage from my machine: 
 ```
@@ -42,6 +40,7 @@ Example usage from my machine:
 
 This changes to the root directory of the repository, runs the constructor on a `data` dir (also in the root directory), visualizes each binary with mode `2` (saving adjacency lists to the directory specified above), and assigns job_id `0` for logging (so that if the program crashes, the job can easily be resumed).
 
+If a log file for the given job ID already exists, the script loads it and silently skips any files that the log marks as already processed.
 
 ## Methodology
 Using Static Analysis (deconstruction of binaries without execution) to extract Control Flow Graphs from a binary.

diff --git a/cfg_constructor/utils/log_utils.py b/cfg_constructor/utils/log_utils.py
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -18,7 +18,9 @@ wheel = "^0.44.0"
 setuptools-rust = "^1.10.1"
 iced-x86 = "^1.21.0"
 scipy = "^1.14.0"
-
+scikit-learn = "^1.5.2"
+torch = "^2.4.1"
+numpy = "^2.1.2"
 
 [build-system]
 requires = ["poetry-core"]

diff --git a/quantization_graph_sage_model.pth.pt b/quantization_graph_sage_model.pth.pt
diff --git a/src/.DS_Store b/src/.DS_Store
diff --git a/src/cfg_constructor/.DS_Store b/src/cfg_constructor/.DS_Store
diff --git a/cfg_constructor/cfg_creator.py → src/cfg_constructor/cfg_creator.py b/cfg_constructor/cfg_creator.py → src/cfg_constructor/cfg_creator.py
@@ -9,8 +9,22 @@
 @click.command()
 @click.option('--data_dir', default='data', help='Directory where the neutralized binaries are stored')
 @click.option('--vis_mode', default=2, type=int, help='Visualization mode: 0 = visualize in window, 1 = save as HTML docs, 2 = save graphs w/o visualizing as edgelists')
-@click.option('--job_id', default=0, type=int, help='Job ID for logging')
+@click.option('--job-id', default=0, type=int, help='Job ID for logging')
 def main(data_dir, vis_mode, job_id):
+    """
+    Main function to process binary files, generate their control flow graphs (CFGs),
+    visualize them based on the specified mode, and log the visualization details.
+
+    Parameters:
+    data_dir (str): The directory containing the neutralized binaries.
+
+    vis_mode (int): The mode of visualization 
+                    0: visualize in window, 
+                    1: save as HTML docs
+                    2: save graphs without visualizing
+
+    job_id (int): The unique identifier for the job, used for logging purposes.
+    """
     log_file = setup_logging(job_id)
 
     for infile in tqdm(glob.glob(os.path.join(data_dir, '*')), desc="Processing binaries"):

diff --git a/cfg_constructor/utils/bin_processing.py → src/cfg_constructor/utils/bin_processing.py b/cfg_constructor/utils/bin_processing.py → src/cfg_constructor/utils/bin_processing.py
@@ -3,22 +3,33 @@
 
 import iced_x86
 import networkx as nx
+import time
 
 FORMATTER = iced_x86.Formatter(iced_x86.FormatterSyntax.INTEL)
+
 OP_SET = {
     iced_x86.OpKind.IMMEDIATE8,
     iced_x86.OpKind.IMMEDIATE16,
     iced_x86.OpKind.IMMEDIATE32,
     iced_x86.OpKind.IMMEDIATE64
 }
+
 BRANCH_SET = {
     iced_x86.FlowControl.UNCONDITIONAL_BRANCH,
     iced_x86.FlowControl.CONDITIONAL_BRANCH
 }
 
 def analyze_and_save_binary(infile, vis_mode, log_file):
+    """
+    Analyze a binary file, generate its control flow graph (CFG), 
+    visualize it, and log the visualization details.
+
+    Parameters:
+    infile (str): The path to the binary file to analyze.
+    vis_mode (int): The mode of visualization to use.
+    log_file (str): The path to the log file for recording visualization details.
+    """
     if is_already_visualized(log_file, infile):
-        print(f"Skipping {infile} as it has already been visualized.")
         return 
 
     binary_data = open(infile, 'rb').read()
@@ -28,6 +39,8 @@ def analyze_and_save_binary(infile, vis_mode, log_file):
     instructions = list(decoder)
     cfg = nx.DiGraph()
 
+    start = time.time()
+
     for i, instr in enumerate(instructions):
         cfg.add_node(instr.ip, instruction=FORMATTER.format(instr))
 
@@ -41,5 +54,7 @@ def analyze_and_save_binary(infile, vis_mode, log_file):
                 if next_ip:
                     cfg.add_edge(instr.ip, next_ip)
 
+    duration = round((time.time() - start) * 1000, 4)
+
     vis(cfg, infile, vis_mode)
-    log_visualization(log_file, infile, vis_mode)
+    log_visualization(log_file, infile, vis_mode, duration)
diff --git a/src/cfg_constructor/utils/log_utils.py b/src/cfg_constructor/utils/log_utils.py
@@ -0,0 +1,73 @@
+import logging
+import os
+import json
+import time
+
+def setup_logging(job_id):
+    """
+    Set up logging for the visualization process with JSON output.
+
+    Parameters:
+    job_id (str): The unique identifier for the job.
+
+    Returns:
+    str: The path to the log file created.
+    """
+    log_dir = 'src/cfg_constructor/logs'
+    os.makedirs(log_dir, exist_ok=True)
+    log_file = os.path.join(log_dir, f'visualization_log_{job_id}.json')
+
+    # Create a custom JSON formatter
+    class JsonFormatter(logging.Formatter):
+        def format(self, record):
+            log_obj = {
+                'filename': record.filename,
+                'exename': record.exename,
+                'vismode': record.vismode,
+                'time': record.time
+            }
+            return json.dumps(log_obj)
+
+    # Set up logging
+    logger = logging.getLogger()
+    logger.setLevel(logging.INFO)
+
+    # Create a file handler
+    file_handler = logging.FileHandler(log_file)
+    file_handler.setFormatter(JsonFormatter())
+    logger.addHandler(file_handler)
+
+    return log_file
+
+def log_visualization(log_file, fname, vis_mode, duration):
+    """
+    Log the visualization details.
+
+    Parameters:
+    log_file (str): The path to the log file.
+    fname (str): The name of the file being visualized.
+    vis_mode (int): The mode of visualization used.
+    duration (float): The time taken to generate the CFG.
+    """
+
+    logging.info('', extra={'exename': fname, 'vismode': vis_mode, 'time': duration})
+
+def is_already_visualized(log_file, fname):
+    """
+    Check if a file has already been visualized by searching the log.
+
+    Parameters:
+    log_file (str): The path to the log file.
+    fname (str): The name of the file to check.
+
+    Returns:
+    bool: True if the file has been visualized, False otherwise.
+    """
+    with open(log_file, 'r') as f:
+        log_data = f.readlines()
+        for line in log_data:
+            obj = json.loads(line)
+            if fname == obj['exename']:
+                return True
+
+    return False
diff --git a/cfg_constructor/utils/vis_utils.py → src/cfg_constructor/utils/vis_utils.py b/cfg_constructor/utils/vis_utils.py → src/cfg_constructor/utils/vis_utils.py
@@ -5,6 +5,17 @@
 import os
 
 def vis(cfg, infile, vis_mode):
+    """
+    Visualize the control flow graph (CFG) based on the specified mode.
+
+    Parameters:
+    cfg (networkx.DiGraph): The control flow graph to visualize.
+    infile (str): The input file name for the CFG.
+    vis_mode (int): The mode of visualization (0: Matplotlib, 1: Pyvis, 2: Save to file).
+
+    Raises:
+    ValueError: If an invalid visualization mode is provided.
+    """
     f_id = os.path.basename(infile)
 
     if vis_mode == 0:
@@ -18,6 +29,13 @@ def vis(cfg, infile, vis_mode):
 
 
 def visualize_cfg(cfg, title):
+    """
+    Visualize the control flow graph using Matplotlib.
+
+    Parameters:
+    cfg (networkx.DiGraph): The control flow graph to visualize.
+    title (str): The title for the visualization.
+    """
     pos = nx.spring_layout(cfg)
     nx.draw(cfg, pos, with_labels=True, node_color='lightblue', node_size=500, font_size=8, arrows=True)
     nx.draw_networkx_labels(cfg, pos, {node: f"{node:x}" for node in cfg.nodes()})
@@ -26,6 +44,13 @@ def visualize_cfg(cfg, title):
     plt.show()
 
 def visualize_cfg_with_visnetwork(cfg, title):
+    """
+    Visualize the control flow graph in an HTML format for browser based viewing.
+
+    Parameters:
+    cfg (networkx.DiGraph): The control flow graph to visualize.
+    title (str): The title for the visualization.
+    """
     # Check if the graph is empty
     if len(cfg.nodes) == 0:
         print("The graph is empty!")
@@ -48,35 +73,33 @@ def visualize_cfg_with_visnetwork(cfg, title):
     # Display the network in a browser view
     output_dir = os.path.join('out', 'out_html')
     output_dir = os.path.join('cfg_constructor', output_dir)
+    output_dir = os.path.join('src', output_dir)
+
     os.makedirs(output_dir, exist_ok=True)
     net.save_graph(os.path.join(output_dir, f'{title}.html'))
 
 def save_cfg(cfg, title):
     """
-    Save the control flow graph (CFG) to an Edgelist file and node names to a separate file.
+    Save the control flow graph (CFG) to an adjacency matrix file and node names to a separate file.
 
     Parameters:
-    - cfg: NetworkX graph object representing the control flow graph
-    - title: String representing the title of the files (without extension)
+    cfg (networkx.DiGraph): The control flow graph to save.
+    title (str): The title for the output files (without extension).
     """
     # Check if the graph is empty
     if len(cfg.nodes) == 0:
         print("The graph is empty!")
         return
 
-    # Define directory for saving files
-    output_dir = os.path.join('out', 'out_edgelists')
-    output_dir = os.path.join('cfg_constructor', output_dir)
-    os.makedirs(output_dir, exist_ok=True)
-
-    # Define file paths
-    edgelist_path = os.path.join(output_dir, f'{title}.txt')
-    node_names_path = os.path.join(output_dir, f'{title}_node_names.csv')
-
-    # Save Edgelist
-    nx.write_edgelist(cfg, edgelist_path)
-
+    # Create adjacency matrix
+    adjacency_matrix = nx.to_numpy_array(cfg)
+
+    # Save Adjacency Matrix
+    adjacency_matrix_path = os.path.join('out', 'out_adjacency_matrices', f"{title}_adjacency_matrix.csv")
+    pd.DataFrame(adjacency_matrix).to_csv(adjacency_matrix_path, index=False, header=False)
+
     # Save Node Names
     node_names = {node: f"Node_{node}" for node in cfg.nodes}
     node_names_df = pd.DataFrame(list(node_names.items()), columns=['Node', 'Name'])
+    node_names_path = os.path.join('out', 'out_adjacency_matrices', f"{title}_node_names.csv")
     node_names_df.to_csv(node_names_path, index=False)
diff --git a/src/models/.DS_Store b/src/models/.DS_Store
diff --git a/src/models/GAE/README.md b/src/models/GAE/README.md
@@ -0,0 +1,3 @@
+# Graph Autoencoder (GAE)
+
+To be filled - will contain training, initialization, etc. for a GAE approach to malware classification. Specifically, this covers Encoder-Decoder training followed by stitching the Encoder to an FFN (or other model) for the classification task.
diff --git a/src/models/GAT/README.md b/src/models/GAT/README.md
@@ -0,0 +1,3 @@
+# Graph Attention Network (GAT)
+
+To be filled - will contain training, initialization, etc. for a GAT approach to malware classification
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		# Graph Autoencoder (GAE)

		To be filled - will contain training, initialization, etc. for a GAE approach to malware classification. Specifically, this covers Encoder-Decoder training followed by stitching the Encoder to an FFN (or other model) for the classification task.
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		# Graph Attention Network (GAT)

		To be filled - will contain training, initialization, etc. for a GAT approach to malware classification