UCdasec · GuillermoRached · Feb 27, 2024 · Apr 20, 2024 · Apr 20, 2024 · May 11, 2024
diff --git a/Analyzer.py b/Analyzer.py
@@ -19,13 +19,17 @@ def __init__(self, filename: str, parsed_data: Parser, total_lines: int, out_dir
         """
         self.filename = filename
         self.parsed_data = parsed_data
-        
+
         # ! Outdated branch pattern detection
         # self.branchV1_detector = BranchV1(filename, total_lines, directory_name)
-        self.branchV2_detector = BranchV2(filename, parsed_data.arch.name, parsed_data.opt, directory_name, sensitivity=4)
+        self.branchV2_detector = BranchV2(filename, parsed_data.arch.name, parsed_data.opt, directory_name, directory_name, sensitivity=4)
         self.constant_detector = ConstantCoding(filename, parsed_data.arch.name, parsed_data.opt, total_lines, directory_name, sensitivity=4)
         self.loop_detector = LoopCheck(filename, parsed_data.arch.name, parsed_data.opt, total_lines, directory_name)
         self.bypass_detector = Bypass(filename, parsed_data.arch.name, parsed_data.opt, total_lines, directory_name)
+        # TODO: Instantiate a list of detectors, then iterate on detectors for other functions
+        # Doing the above avoids the below condition on subsequent function calls
+        if parsed_data.arch.name  == "x86":       
+            self.bypass_detector = None
         # if self.create_directory(console):
         self.static_analysis()
 
@@ -54,7 +58,8 @@ def static_analysis(self) -> None:
                 self.branchV2_detector.analysis(line)
                 self.constant_detector.analysis(line)
                 self.loop_detector.analysis(line)
-                self.bypass_detector.analysis(line)
+                if self.bypass_detector:
+                    self.bypass_detector.analysis(line)
             elif type(line) == Location:
                 self.constant_detector.analysis(line)
                 self.loop_detector.analysis(line)
@@ -80,8 +85,9 @@ def print_analysis_results(self, console: Console) -> None:
         console.print(f"[Pattern] [bright_yellow]LoopCheck[/bright_yellow]\n")
         self.loop_detector.print_results(console)
 
-        console.print(f"[Pattern] [bright_yellow]Bypass[/bright_yellow]\n")
-        self.bypass_detector.print_results(console)
+        if self.bypass_detector:
+            console.print(f"[Pattern] [bright_yellow]Bypass[/bright_yellow]\n")
+            self.bypass_detector.print_results(console)
 
     def save_and_print_analysis_results(self, console: Console) -> None:
         """
@@ -101,14 +107,18 @@ def save_and_print_analysis_results(self, console: Console) -> None:
         self.loop_detector.save_and_print_results(console)
         console.print(f"Saved")
 
-        console.print(f"Saving Bypass...")
-        self.bypass_detector.save_and_print_results(console)
-        console.print(f"Saved")
+        if self.bypass_detector:
+            console.print(f"Saving Bypass...")
+            self.bypass_detector.save_and_print_results(console)
+            console.print(f"Saved")
 
     def print_total_vulnerable_lines(self, console: Console) -> None:
         # total number of vulnerable lines
         total_vulnerable_lines = (len(self.branchV2_detector.vulnerable_instructions) + len(self.constant_detector.vulnerable_instructions)
-                                  + len(self.loop_detector.vulnerable_instructions) + len(self.bypass_detector.vulnerable_set))
+                                  + len(self.loop_detector.vulnerable_instructions))
+
+        if self.bypass_detector:
+            total_vulnerable_lines += len(self.bypass_detector.vulnerable_set)
         print(f"Total number of vulnerable lines: {total_vulnerable_lines}")
 
         # total number of branch faults
@@ -124,9 +134,15 @@ def print_total_vulnerable_lines(self, console: Console) -> None:
         console.print(f"\tTotal number of Loop Check vulnerabilities: {total_loop_faults}")
 
         # total number of bypass faults
-        total_bypass_faults = len(self.bypass_detector.vulnerable_set)
-        console.print(f"\tTotal number of Bypass vulnerabilities: {total_bypass_faults}")
+        if self.bypass_detector:
+            total_bypass_faults = len(self.bypass_detector.vulnerable_set)
+            console.print(f"\tTotal number of Bypass vulnerabilities: {total_bypass_faults}")
 
     def get_total_vulnerable_lines(self) -> int:
-        return (len(self.branchV2_detector.vulnerable_instructions) + len(self.constant_detector.vulnerable_instructions)
-                + len(self.loop_detector.vulnerable_instructions) + len(self.bypass_detector.vulnerable_set))
+        total_lines = (len(self.branchV2_detector.vulnerable_instructions) + len(self.constant_detector.vulnerable_instructions)
+                + len(self.loop_detector.vulnerable_instructions))
+
+        if self.bypass_detector:
+            total_lines += len(self.bypass_detector.vulnerable_set)
+
+        return total_lines
diff --git a/Parser.py b/Parser.py
@@ -1,7 +1,10 @@
 import re
+from subprocess import check_output
+from capstone import *
 from rich.console import Console
+from elftools.elf.elffile import ELFFile
 
-from constants import optimization_levels
+from constants import Architectures, BinaryModes, optimization_levels
 
 class Register():
     def __init__(self, name: str):
@@ -97,7 +100,10 @@ def set_arguments(self, value: str):
         if indicator != -1:
             # The string has two args
             # The +2 here is because of the ', #' that will be at index "indicator"
-            args["offset"] = IntegerLiteral(int(value[value.index('#')+1:]))
+            try:
+                args["offset"] = IntegerLiteral(int(value[value.index('#')+1:]))
+            except ValueError:
+                args["offset"] = IntegerLiteral(int(value[value.index('#')+1:], 16))
 
         return args
 
@@ -157,15 +163,66 @@ def __init__(self, file: str, console: Console):
         self.total_lines: int = 0
 
         self.arch = Architecture(line=None, instruction=None)
-        self.opt : str
+        self.opt : str = "O0"
+        self.is_binary = False
 
         self.parseFile(console)
 
     def parseFile(self, console: Console):
         console.log(f"Reading file: {self.filename}")
-        with open(self.filename) as f:
-            lines: list[str] = f.readlines()
+        lines: list[str] = []
+
+            # Source file parsing
+        if self.__is_file_source(self.filename):
+            with open(self.filename, mode="r") as source_file:
+                lines = source_file.readlines()
+        else:
+            self.is_binary = True
+            with open(self.filename, mode="rb") as binary_file:
+                elf_file = ELFFile(binary_file)
+                text_section = elf_file.get_section_by_name(".text")
+                data_section = elf_file.get_section_by_name(".data")
+                rodata_section = elf_file.get_section_by_name(".rodata")
+                # Sections print for debugging
+                for section in elf_file.iter_sections():
+                    print(hex(section["sh_addr"]), section.name)
+
+                symtab = elf_file.get_section_by_name(".symtab")
+                for i in range(5):
+                    print("symbol #{} - {}".format(i, symtab.get_symbol(i).name))
+
+                main_offset = symtab.get_symbol_by_name("main")[0].entry["st_value"]
+                main_size = symtab.get_symbol_by_name("main")[0].entry["st_size"]  # st_size is the symbol's byte length; st_value (above) is its address
+
+                # Code
+                ops = text_section.data()
+                addr = text_section["sh_addr"]
+
+                # Global Vars
+                dops = data_section.data()
+                daddr = data_section["sh_addr"]
+
+                # Strings
+                rdops = rodata_section.data()
+                rdaddr = rodata_section["sh_addr"]
+
+                # Determine architecture and mode for Capstone
+                file_target_system = self.__determine_binary_architecture(elf_file)
+                self.arch.architecture_found(file_target_system.name.lower())
+                file_mode = None
+                if file_target_system == Architectures.ARM:
+                    file_mode = CS_MODE_ARM
+                else:
+                    file_mode = self.__determine_binary_mode(elf_file).value
+
+                md = Cs(file_target_system.value, file_mode)
+                # Disassemble and store lines
+                lines = []
+                for i in md.disasm(code=ops, offset=addr):
+                    # lines.append("{} {}".format(i.mnemonic, i.op_str))
+                    print("0x%x:\t%s\t%s" %(i.address, i.mnemonic, i.op_str))
         console.log(f"[green]File read successfully![/green]\n")
+
 
         console.log(f"Processing assembly data:")
         self.isolateSections(lines)
@@ -175,7 +232,9 @@ def isolateSections(self, lines: list[str]):
         program = []
         line_number = 1
 
-        global attribute_1, attribute_2 # For determining optimization level
+        # For determining optimization level
+        attribute_1 = None
+        attribute_2 = None
         for line in lines:
             s = line.strip()
             # Line is a location 
@@ -195,10 +254,10 @@ def isolateSections(self, lines: list[str]):
             elif s.startswith(".ident"):
                 break
             # if line starts with .eabi_attribute 30, we get 1st attribute for optimization level
-            elif s.startswith(".eabi_attribute 30"):
+            elif s.startswith(".eabi_attribute 30") and not self.is_binary:
                 attribute_1 = self.get_eabi_attribute(s)
             # if line starts with .eabi_attribute 23, we get 2nd attribute for optimization level
-            elif s.startswith(".eabi_attribute 23"):
+            elif s.startswith(".eabi_attribute 23") and not self.is_binary:
                 attribute_2 = self.get_eabi_attribute(s)
             # Line is an instruction
             else:
@@ -241,7 +300,11 @@ def parseArguments(self, line: str, line_number: int):
 
             # Check if a number
             if self.isNumber(arg):
-                arguments.append(IntegerLiteral(int(arg[1:] if arg.startswith('#') or arg.startswith('$') else arg)))
+                try:
+                    arguments.append(IntegerLiteral(int(arg[1:] if arg.startswith('#') or arg.startswith('$') else arg)))
+                except ValueError:
+                    arguments.append(IntegerLiteral(int(arg[1:] if arg.startswith('#') or arg.startswith('$') else arg, 16)))
+
             # ! This notation can also be used in ARM for LDR
             elif re.search(r"\.long|\.value", instruction) and self.isNumber(arg):
                 # in case its a global variable
@@ -282,3 +345,22 @@ def get_eabi_attribute(self, s: str):
                 return int(tag_match.group(2))
         else:
             return None
+
+    def __is_file_source(self, file_path: str) -> bool:
+        """Return True when the `file` command describes the path as text, i.e. assembly source.
+
+        Args:
+            file_path (str): Path to the file being analyzed
+        """
+        # --brief omits the leading "<path>:" so the path itself can never satisfy the match.
+        file_output = check_output(["file", "--brief", file_path]).decode()
+
+        # Match "text" as a word so descriptions such as "ASCII text, with very long lines"
+        # or "UTF-8 Unicode text" count as source too, not only "ASCII text" followed by whitespace.
+        return re.search(r"\btext\b", file_output) is not None
+
+    def __determine_binary_mode(self, elf_file: ELFFile):
+        return BinaryModes.from_elf_class(elf_file.elfclass)  # translate the ELF class into a BinaryModes member
+
+    def __determine_binary_architecture(self, elf_file: ELFFile):
+        return Architectures.from_elf_machine(elf_file.header.get("e_machine", ""))  # falls back to "" when the header has no e_machine entry
diff --git a/README.md b/README.md
@@ -73,6 +73,8 @@ The entrypoint to the program, `main.py`, serves as a central location to utiliz
 
 Main parsing module. Intended to parse assembly code. It combs through the source code and creates objects depending on what it encounters. Once the source code is transformed into a list of objects, it can be more easily worked with to discover patterns. It uses Python’s type hints to be more transparent.
 
+> If passed a compiled binary, the parser uses [capstone](https://www.capstone-engine.org/) in combination with [pyelftools](https://github.com/eliben/pyelftools) to disassemble and parse the binary.
+
 `Locations` are spots in the code that can be referenced and jumped to. Example: .LC0 and main.
 
 `IntegerLiterals` are integers. In 32-bit syntax, these are prefaced with a “#”

diff --git a/constants/Architectures.py b/constants/Architectures.py
@@ -0,0 +1,16 @@
+from enum import Enum
+
+
+class Architectures(Enum):
+    """CPU families the parser recognises; Parser passes `.value` to capstone's `Cs` as the arch id."""
+    ARM = 0
+    X86 = 3
+    UNKNOWN = 9  # returned when the machine name matches no supported family
+
+    @staticmethod
+    def from_elf_machine(elf_machine: str) -> "Architectures":
+        """Map an ELF `e_machine` name to a member; 32-bit x86 is "EM_386", so "386" also means X86."""
+        machine = str(elf_machine)  # tolerate a non-string e_machine instead of raising TypeError
+        if "ARM" in machine:
+            return Architectures.ARM
+        return Architectures.X86 if "X86" in machine or "386" in machine else Architectures.UNKNOWN
diff --git a/constants/BinaryModes.py b/constants/BinaryModes.py
@@ -0,0 +1,17 @@
+from enum import Enum
+
+
+class BinaryModes(Enum):
+    """Word size of a binary, resolved from its ELF class."""
+
+    # NOTE: These values match their Capstone Modes (CS_MODE) counterparts
+    B32 = 4
+    B64 = 8
+    UNKNOWN = 9
+
+    @staticmethod
+    def from_elf_class(elf_class: int):
+        """Return B32 for class 32, B64 for class 64, and UNKNOWN for anything else."""
+        if elf_class == 32:
+            return BinaryModes.B32
+        return BinaryModes.B64 if elf_class == 64 else BinaryModes.UNKNOWN
diff --git a/constants/__init__.py b/constants/__init__.py
@@ -1,2 +1,4 @@
 from .patterns import *
-from .trivialValues import *
+from .trivialValues import *
+from .Architectures import *
+from .BinaryModes import *
diff --git a/constants/patterns.py b/constants/patterns.py
@@ -1,8 +1,9 @@
 pattern_list = {
     "x86": {
-        "branch": ['cmpl', ['jne', 'je', 'jnz', 'jz']], #cmp??
+        "branch": [['cmpl'], ['jne', 'je', 'jnz', 'jz'], []], #cmp??
         "constant_coding": ['movl', 'movq', 'movw', '.value', ".long"],
         "loop_check": ['cmpl', 'cmpl', 'j'], #cmpb, cmp??
+        "bypass": [[], []]
     },
     "arm": {
         "branch": [['cmp', 'subs', 'rsbs'], ['beq', 'bne', 'bcs', 'bhs', 'bcc', 'blo', 'bmi', 'bpl', 'bvs', 'bvc', 'bhi', 'bls', 'bge',

diff --git a/docs/developer_notes/architecture_and_compilers.md b/docs/developer_notes/architecture_and_compilers.md
@@ -6,6 +6,7 @@
   - [Architectures](#architectures)
   - [Cross-compiling](#cross-compiling)
     - [ARM](#arm)
+    - [RISC-V](#risc-v)
     - [x86](#x86)
   - [Tool Chain Conventions](#tool-chain-conventions)
 
@@ -44,6 +45,10 @@ In order to compile the dataset, scripts will be provided in order to have both
 arm-none-eabi-gcc -S -o filename.s /path/to/filename.c
 ```
 
+```bash
+arm-none-eabi-gcc --specs=nosys.specs -o filename /path/to/filename.c
+```
+
 ### RISC-V
 The RISC-V GCC toolchain and its installation instructions can be found at this [link](https://github.com/riscv-collab/riscv-gnu-toolchain). Once you have installed the toolchain successfully, you can  create Assembly binaries with the following command:
 ```bash

diff --git a/requirements.txt b/requirements.txt
@@ -1,2 +1,4 @@
 capstone
-rich
+setuptools
+rich
+pyelftools