Skip to content
Open
6 changes: 4 additions & 2 deletions idaplugin/rematch/collectors/vectors/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,9 @@
from .assembly_hash import AssemblyHashVector
from .mnemonic_hash import MnemonicHashVector
from .mnemonic_hist import MnemonicHistVector

from .fnv_hash import FnvHashVector
from .apidom_hash import ApiDomintorHashVector

__all__ = ["Vector", "IdentityHashVector", "NameHashVector",
"AssemblyHashVector", "MnemonicHashVector", "MnemonicHistVector"]
"AssemblyHashVector", "MnemonicHashVector", "MnemonicHistVector",
"FnvHashVector", "ApiDomintorHashVector", ]
41 changes: 41 additions & 0 deletions idaplugin/rematch/collectors/vectors/apidom_hash.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
from ida_gdl import FlowChart
from ida_idp import is_call_insn
from idaapi import get_func

from . import vector
from collections import defaultdict


class ApiDomintorHashVector(vector.Vector):
    """Vector that groups a function's mnemonics by the call instructions
    that terminate them, one bucket per basic block.
    """
    type = 'apidom_hash'
    type_version = 0

    @classmethod
    def data(cls, offset):
        """Collect, per basic block, the mnemonic runs that end in a call.

        Args:
            offset: an address inside the function to process; it is handed
                to ``get_func`` to locate the function.

        Returns:
            A ``defaultdict(list)`` mapping a basic block's start address to
            a list of mnemonic lists. Each inner list holds the mnemonics
            seen since the previous call (or the block start), up to and
            including a call instruction. Blocks containing no call get no
            entry, and mnemonics following the last call of a block are
            dropped.
        """
        # GetMnem / NextHead are idc functions; they were previously used as
        # bare names that are not defined in this module's namespace.
        import idc

        # iterate over the function's basic blocks
        flowchart = FlowChart(get_func(offset))
        calls_by_block = defaultdict(list)

        for block in flowchart:
            start = block.startEA
            end = block.endEA

            # bucket the instructions of every basic block

            # TODO XXX
            # find a decent way to get imports
            # maybe a helper function instead
            # of inlining it here.

            mnemonics = []
            curr_ea = start
            while curr_ea < end:
                mnemonics.append(idc.GetMnem(curr_ea))

                if is_call_insn(curr_ea):
                    # a call closes the current run; start a fresh one
                    calls_by_block[start].append(mnemonics)
                    mnemonics = []

                curr_ea = idc.NextHead(curr_ea)

        return calls_by_block
55 changes: 55 additions & 0 deletions idaplugin/rematch/collectors/vectors/fnv_hash.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
import idautils
import idaapi
import idc


from . import vector


class FnvHashVector(vector.Vector):
    """Vector holding an FNV-1a style hash of a function's bytes.

    Operand bytes of items that carry code or data references are left out
    of the hash; only the first byte of such items is hashed.
    """
    # FNV-1a parameters for the 32-bit and 64-bit variants.
    FNV1_64A_PRIME = 0x100000001b3
    FNV1_32A_PRIME = 0x01000193
    FNV1_32A_INIT = 0x811c9dc5
    FNV1_32A_SIZE = 2**32
    FNV1_64A_INIT = 0xcbf29ce484222325
    FNV1_64A_SIZE = 2**64
    type = 'fnv_hash'
    type_version = 0

    @classmethod
    def fnv_64a(cls, data, val=None):
        """Fold one byte into a 64-bit FNV-1a hash.

        Args:
            data: the byte value (an integer) to mix in.
            val: the running hash so far; ``None`` starts a new hash from
                ``FNV1_64A_INIT``.

        Returns:
            The updated hash, reduced modulo ``FNV1_64A_SIZE``.
        """
        if val is None:
            val = cls.FNV1_64A_INIT
        val = val ^ data
        return (val * cls.FNV1_64A_PRIME) % cls.FNV1_64A_SIZE

    @classmethod
    def fnv_32a(cls, data, val=None):
        """Fold one byte into a 32-bit FNV-1a hash.

        Args:
            data: the byte value (an integer) to mix in.
            val: the running hash so far; ``None`` starts a new hash from
                ``FNV1_32A_INIT``.

        Returns:
            The updated hash, reduced modulo ``FNV1_32A_SIZE``.
        """
        if val is None:
            val = cls.FNV1_32A_INIT
        val = val ^ data
        return (val * cls.FNV1_32A_PRIME) % cls.FNV1_32A_SIZE

    @classmethod
    def data(cls, offset):
        """Hash the bytes of the function containing ``offset``.

        Args:
            offset: an address inside the function to hash; it is handed to
                ``idautils.FuncItems``.

        Returns:
            The accumulated hash as an integer, or ``None`` when the
            function has fewer than three items.
        """
        inf = idaapi.get_inf_structure()

        # assuming there is no 128-bit architecture yet...
        # also, a 16-bit database is hashed with the 32-bit variant, hoping
        # this does not skew results too much.
        if inf.is_64bit():
            fnv_fn = cls.fnv_64a
        else:
            fnv_fn = cls.fnv_32a

        # walk the function items once and reuse the list below
        items = list(idautils.FuncItems(offset))
        if len(items) < 3:
            return None

        h = None
        for ea in items:
            # the first byte of every item always takes part in the hash
            h = fnv_fn(idc.Byte(ea), h)

            # CodeRefsFrom/DataRefsFrom are generators and therefore always
            # truthy, so they have to be probed for an actual element.
            # flow=False: ordinary fall-through to the next instruction is
            # not a reference that should exclude the operand bytes.
            has_refs = (any(True for _ in idautils.CodeRefsFrom(ea, False)) or
                        any(True for _ in idautils.DataRefsFrom(ea)))
            if has_refs:
                continue

            for i in range(ea + 1, ea + idc.ItemSize(ea)):
                h = fnv_fn(idc.Byte(i), h)

        return h
30 changes: 30 additions & 0 deletions idaplugin/rematch/collectors/vectors/mdindx.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
from collections import defaultdict

import ida_gdl
import idaapi

from . import vector


class MDIndexVector(vector.Vector):
    """Vector describing the edge degrees of a function's basic blocks."""
    type = 'MDIndex_Hash'
    type_version = 0

    @classmethod
    def data(cls, offset):
        """Count incoming and outgoing flow edges of every basic block.

        Args:
            offset: an address inside the function to process; it is handed
                to ``idaapi.get_func``.

        Returns:
            A dict mapping each basic block's start address to a dict with
            the integer counters ``'in'`` and ``'out'``.
        """
        # NOTE(review): presumably the first step towards an MD-Index style
        # graph hash; only the per-block degrees are produced for now.

        # we're assuming offset is actually a function which has boundaries,
        # this assumption is reasonable as we assume the underlying framework
        # (IDA, Binja, r2), iterates only over functions.
        fn = idaapi.get_func(offset)
        flowchart = ida_gdl.FlowChart(fn)

        # keyed by start address rather than by BasicBlock object, so that a
        # block reached through succs() lands on the same entry as the block
        # produced by iterating the flow chart.
        degrees = defaultdict(lambda: {'in': 0, 'out': 0})

        for bb in flowchart:
            # touch the entry so blocks without any edge are still reported
            node = degrees[bb.startEA]

            # a block may have zero, one or several successors
            for succ in bb.succs():
                node['out'] += 1
                degrees[succ.startEA]['in'] += 1

        return dict(degrees)
3 changes: 2 additions & 1 deletion idaplugin/rematch/instances/function.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,5 +18,6 @@ def __init__(self, *args, **kwargs):
self.vectors |= {collectors.vectors.IdentityHashVector,
collectors.vectors.AssemblyHashVector,
collectors.vectors.MnemonicHashVector,
collectors.vectors.MnemonicHistVector}
collectors.vectors.MnemonicHistVector,
collectors.vectors.FnvHashVector, }
self.annotations |= {collectors.annotations.AssemblyAnnotation}
9 changes: 7 additions & 2 deletions server/collab/matchers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,15 @@
from .mnemonic_hash import MnemonicHashMatcher
from .name_hash import NameHashMatcher
from .mnemonic_hist import MnemonicHistogramMatcher
from .fnv_hash import FnvHashMatcher
from .apidom_hash import ApiDominatorMatcher
from .fuzzy_matcher import FuzzyHashMatcher


matchers_list = [IdentityHashMatcher, NameHashMatcher, AssemblyHashMatcher,
MnemonicHashMatcher, MnemonicHistogramMatcher]
MnemonicHashMatcher, MnemonicHistogramMatcher,
FnvHashMatcher, ApiDominatorMatcher, FuzzyHashMatcher, ]

__all__ = ['IdentityHashMatcher', 'AssemblyHashMatcher', 'MnemonicHashMatcher',
'NameHashMatcher', 'MnemonicHistogramMatcher', 'matchers_list']
'NameHashMatcher', 'MnemonicHistogramMatcher', 'matchers_list',
'FnvHashMatcher', 'ApiDominatorMatcher', 'FuzzyHashMatcher', ]
7 changes: 7 additions & 0 deletions server/collab/matchers/apidom_hash.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
from . import fuzzy_matcher


class ApiDominatorMatcher(fuzzy_matcher.FuzzyHashMatcher):
    """Fuzzy hash matcher configured for 'apidom_hash' vectors."""
    matcher_name = 'API Call Dominator Hash'
    match_type = 'apidom_hash'
    vector_type = 'apidom_hash'
7 changes: 7 additions & 0 deletions server/collab/matchers/fnv_hash.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
from . import fuzzy_matcher


class FnvHashMatcher(fuzzy_matcher.FuzzyHashMatcher):
    """Fuzzy hash matcher configured for 'fnv_hash' vectors."""
    matcher_name = 'FNV Hash'
    match_type = 'fnv_hash'
    vector_type = 'fnv_hash'
40 changes: 40 additions & 0 deletions server/collab/matchers/fuzzy_matcher.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
import itertools
import json
from operator import xor as xorred_fn

import numpy as np
import sklearn as skl
import sklearn.metrics # noqa flake8 importing as a different name
import sklearn.feature_extraction # noqa flake8 importing as a different name

from . import matcher


class FuzzyHashMatcher(matcher.Matcher):
    """Matcher scoring every source/target vector pair by pairwise distance."""

    @classmethod
    def match(cls, source, target):
        """Yield a similarity score for every source/target pair.

        Args:
            source: collection of source vectors exposing
                ``values_list('instance_id', 'data')``; presumably a Django
                queryset - verify against the caller.
            target: collection of target vectors with the same interface.

        Yields:
            ``(source_instance_id, target_instance_id, score)`` tuples where
            ``score`` is ``(1 - distance / max_distance) * 100``. Nothing is
            yielded when either side has no vectors.
        """
        source_rows = list(source.values_list('instance_id', 'data'))
        target_rows = list(target.values_list('instance_id', 'data'))
        if not source_rows or not target_rows:
            # nothing to pair up; unpacking below would fail on an empty side
            return

        # transpose the (instance_id, data) rows into two parallel tuples
        source_instance_ids, source_data = zip(*source_rows)
        target_instance_ids, target_data = zip(*target_rows)

        source_list = [json.loads(d) for d in source_data]
        target_list = [json.loads(d) for d in target_data]

        # fit the feature space on the source side, then project the target
        # side onto the same features
        dictvect = skl.feature_extraction.DictVectorizer()
        source_matrix = dictvect.fit_transform(source_list)
        target_matrix = dictvect.transform(target_list)

        # NOTE(review): operator.xor is kept as the pairwise metric from the
        # original code; scikit-learn expects a callable metric to return a
        # scalar distance for two row vectors - confirm xor is what is
        # intended here.
        distance_matrix = skl.metrics.pairwise_distances(source_matrix,
                                                         target_matrix,
                                                         metric=xorred_fn)
        max_distance = distance_matrix.max()
        if max_distance:
            score_matrix = (1 - (distance_matrix / max_distance)) * 100
        else:
            # every distance is zero: all pairs are identical, avoid 0 / 0
            score_matrix = np.full(distance_matrix.shape, 100.0)

        for source_i, target_i in np.ndindex(*distance_matrix.shape):
            source_instance_id = source_instance_ids[source_i]
            target_instance_id = target_instance_ids[target_i]

            score = score_matrix[source_i, target_i]
            yield (source_instance_id, target_instance_id, score)
3 changes: 1 addition & 2 deletions server/collab/matchers/hist_matcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,7 @@

import numpy as np
import sklearn as skl
import sklearn.metrics # noqa flake8 importing as a different name
import sklearn.preprocessing # noqa flake8 importing as a different name
import sklearn.metrics
import sklearn.feature_extraction # noqa flake8 importing as a different name

from . import matcher
Expand Down