nirizr · shiftre · Jun 11, 2017 · Jun 13, 2017 · Jun 14, 2017 · Jun 14, 2017
diff --git a/idaplugin/rematch/collectors/vectors/__init__.py b/idaplugin/rematch/collectors/vectors/__init__.py
@@ -4,7 +4,9 @@
 from .assembly_hash import AssemblyHashVector
 from .mnemonic_hash import MnemonicHashVector
 from .mnemonic_hist import MnemonicHistVector
-
+from .fnv_hash import FnvHashVector
+from .apidom_hash import ApiDomintorHashVector
 
 __all__ = ["Vector", "IdentityHashVector", "NameHashVector",
-           "AssemblyHashVector", "MnemonicHashVector", "MnemonicHistVector"]
+           "AssemblyHashVector", "MnemonicHashVector", "MnemonicHistVector",
+           "FnvHashVector", "ApiDomintorHashVector", ]
diff --git a/idaplugin/rematch/collectors/vectors/apidom_hash.py b/idaplugin/rematch/collectors/vectors/apidom_hash.py
@@ -0,0 +1,57 @@
+from collections import defaultdict
+
+from ida_gdl import FlowChart
+from ida_idp import is_call_insn
+from idaapi import get_func
+import idc
+
+from . import vector
+
+
+class ApiDomintorHashVector(vector.Vector):
+  """Record, per basic block, the mnemonic runs that end in a call.
+
+  Mnemonics of a block are accumulated until a call instruction is met; the
+  run (call included) is stored under the block's start address and a new
+  run begins. Instructions after a block's last call are not recorded.
+  """
+  type = 'apidom_hash'
+  type_version = 0
+
+  @classmethod
+  def data(cls, offset):
+    """Collect the call-terminated mnemonic runs of the function at offset.
+
+    offset is handed to get_func() unchanged. Returns a defaultdict(list)
+    mapping a basic block's start address to its list of mnemonic runs;
+    blocks without a call instruction get no entry.
+    """
+    # iterate over the function's basic blocks
+    flwchrt = FlowChart(get_func(offset))
+    bbcall = defaultdict(list)
+
+    for blck in flwchrt:
+      start = blck.startEA
+      curr_ea = start
+      end = blck.endEA
+
+      # bucketsize every basic block
+
+      # TODO XXX
+      # find a decent way to get imports
+      # maybe a helper function instead
+      # of inlining it here.
+
+      bbinsn = []
+      while curr_ea < end:
+        # GetMnem/NextHead are idc functions; a plugin module has to reach
+        # them through the imported module, they are not builtins here.
+        bbinsn.append(idc.GetMnem(curr_ea))
+
+        if is_call_insn(curr_ea):
+          bbcall[start].append(bbinsn)
+          bbinsn = []
+
+        curr_ea = idc.NextHead(curr_ea)
+
+    return bbcall
diff --git a/idaplugin/rematch/collectors/vectors/fnv_hash.py b/idaplugin/rematch/collectors/vectors/fnv_hash.py
@@ -0,0 +1,80 @@
+import idautils
+import idaapi
+import idc
+
+
+from . import vector
+
+
+class FnvHashVector(vector.Vector):
+  """FNV-1a hash of the bytes of a function's reference-free instructions."""
+  FNV1_64A_PRIME = 0x100000001b3
+  FNV1_32A_PRIME = 0x01000193
+  FNV1_32A_INIT = 0x811c9dc5
+  FNV1_32A_SIZE = 2**32
+  FNV1_64A_INIT = 0xcbf29ce484222325
+  FNV1_64A_SIZE = 2**64
+  type = 'fnv_hash'
+  type_version = 0
+
+  @classmethod
+  def fnv_64a(cls, data, val=None):
+    """Mix one byte value into a 64-bit FNV-1a hash and return the result.
+
+    val is the running hash from a previous call; when omitted hashing
+    starts from FNV1_64A_INIT.
+    """
+    if val is None:
+      val = cls.FNV1_64A_INIT
+    val = val ^ data
+    return (val * cls.FNV1_64A_PRIME) % cls.FNV1_64A_SIZE
+
+  @classmethod
+  def fnv_32a(cls, data, val=None):
+    """Mix one byte value into a 32-bit FNV-1a hash and return the result.
+
+    val is the running hash from a previous call; when omitted hashing
+    starts from FNV1_32A_INIT.
+    """
+    if val is None:
+      val = cls.FNV1_32A_INIT
+    val = val ^ data
+    return (val * cls.FNV1_32A_PRIME) % cls.FNV1_32A_SIZE
+
+  @classmethod
+  def data(cls, offset):
+    """Hash the function at offset, skipping instructions with references.
+
+    Returns None for functions with fewer than three items or when every
+    instruction was skipped, otherwise the accumulated hash as an integer.
+    """
+    inf = idaapi.get_inf_structure()
+
+    # assuming there is no 128-bit architecture yet...
+    # also if it's 16b we'll hash it as 32b, hoping this won't skew
+    # things too much.
+    if inf.is_64bit():
+      fnv_fn = cls.fnv_64a
+    else:
+      fnv_fn = cls.fnv_32a
+
+    items = list(idautils.FuncItems(offset))
+    if len(items) < 3:
+      return None
+
+    h = None
+    for ea in items:
+      # the xref helpers return generators, which are always truthy, so they
+      # are probed for a first element instead. Ordinary flow to the next
+      # instruction is left out (flow=False) as nearly every instruction
+      # has it.
+      has_refs = any(True for _ in idautils.CodeRefsFrom(ea, False)) or \
+                 any(True for _ in idautils.DataRefsFrom(ea))
+      if has_refs:
+        continue
+
+      # keep one running hash across all bytes of all kept instructions
+      for i in range(ea, ea + idc.ItemSize(ea)):
+        h = fnv_fn(idc.Byte(i), h)
+
+    return h
diff --git a/idaplugin/rematch/collectors/vectors/mdindx.py b/idaplugin/rematch/collectors/vectors/mdindx.py
@@ -0,0 +1,41 @@
+from collections import defaultdict
+import ida_gdl
+import idaapi
+
+from . import vector
+
+
+class MDIndexVector(vector.Vector):
+  """Collect the in/out degree of every basic block of a function."""
+  type = 'MDIndex_Hash'
+  type_version = 0
+
+  @classmethod
+  def data(cls, offset):
+    """Return {block start address: {'in': n, 'out': m}} for the function.
+
+    'out' counts the successors of a block, 'in' counts how many times the
+    block is listed as a successor of any block in the flow chart.
+    """
+    # we're assuming offset is actually a function which has boundaries,
+    # this assumption is reasonable as we assume the underlying framework
+    # (IDA, Binja, r2), iterates only over functions.
+    fn = idaapi.get_func(offset)
+    bbs = ida_gdl.FlowChart(fn)
+
+    # keyed by start address so that a block reached through succs() maps
+    # to the same entry as the block yielded by the flow chart itself.
+    bbset = defaultdict(lambda: {'in': 0, 'out': 0})
+
+    for bb in bbs:
+      # touch the entry so blocks without any edge are still listed
+      degrees = bbset[bb.startEA]
+
+      # blocks without successors simply contribute no edges
+      for succ in bb.succs():
+        degrees['out'] += 1
+        bbset[succ.startEA]['in'] += 1
+
+    # TODO: the MD-Index value itself still has to be derived from these
+    # degrees; for now the raw degree table is returned.
+    return dict(bbset)
diff --git a/idaplugin/rematch/instances/function.py b/idaplugin/rematch/instances/function.py
@@ -18,5 +18,6 @@ def __init__(self, *args, **kwargs):
     self.vectors |= {collectors.vectors.IdentityHashVector,
                      collectors.vectors.AssemblyHashVector,
                      collectors.vectors.MnemonicHashVector,
-                     collectors.vectors.MnemonicHistVector}
+                     collectors.vectors.MnemonicHistVector,
+                     collectors.vectors.FnvHashVector, }
     self.annotations |= {collectors.annotations.AssemblyAnnotation}
diff --git a/server/collab/matchers/__init__.py b/server/collab/matchers/__init__.py
@@ -3,10 +3,15 @@
 from .mnemonic_hash import MnemonicHashMatcher
 from .name_hash import NameHashMatcher
 from .mnemonic_hist import MnemonicHistogramMatcher
+from .fnv_hash import FnvHashMatcher
+from .apidom_hash import ApiDominatorMatcher
+from .fuzzy_matcher import FuzzyHashMatcher
 
 
 matchers_list = [IdentityHashMatcher, NameHashMatcher, AssemblyHashMatcher,
-                 MnemonicHashMatcher, MnemonicHistogramMatcher]
+                 MnemonicHashMatcher, MnemonicHistogramMatcher,
+                 FnvHashMatcher, ApiDominatorMatcher, FuzzyHashMatcher, ]
 
 __all__ = ['IdentityHashMatcher', 'AssemblyHashMatcher', 'MnemonicHashMatcher',
-           'NameHashMatcher', 'MnemonicHistogramMatcher', 'matchers_list']
+           'NameHashMatcher', 'MnemonicHistogramMatcher', 'matchers_list',
+           'FnvHashMatcher', 'ApiDominatorMatcher', 'FuzzyHashMatcher', ]
diff --git a/server/collab/matchers/apidom_hash.py b/server/collab/matchers/apidom_hash.py
@@ -0,0 +1,8 @@
+from . import fuzzy_matcher
+
+
+class ApiDominatorMatcher(fuzzy_matcher.FuzzyHashMatcher):
+  """FuzzyHashMatcher specialisation bound to 'apidom_hash' vectors."""
+  matcher_name = 'API Call Dominator Hash'
+  match_type = 'apidom_hash'
+  vector_type = 'apidom_hash'
diff --git a/server/collab/matchers/fnv_hash.py b/server/collab/matchers/fnv_hash.py
@@ -0,0 +1,8 @@
+from . import fuzzy_matcher
+
+
+class FnvHashMatcher(fuzzy_matcher.FuzzyHashMatcher):
+  """FuzzyHashMatcher specialisation bound to 'fnv_hash' vectors."""
+  matcher_name = 'FNV Hash'
+  match_type = 'fnv_hash'
+  vector_type = 'fnv_hash'
diff --git a/server/collab/matchers/fuzzy_matcher.py b/server/collab/matchers/fuzzy_matcher.py
@@ -0,0 +1,58 @@
+import itertools
+import json
+from operator import xor as xorred_fn
+
+import numpy as np
+import sklearn as skl
+import sklearn.metrics  # noqa flake8 importing as a different name
+import sklearn.feature_extraction  # noqa flake8 importing as a different name
+
+from . import matcher
+
+
+class FuzzyHashMatcher(matcher.Matcher):
+  """Score every source/target pair from a normalised pairwise distance."""
+
+  @classmethod
+  def match(cls, source, target):
+    """Yield (source_instance_id, target_instance_id, score) for each pair.
+
+    source and target are queried with values_list('instance_id', 'data');
+    each data value is parsed as JSON. A pair at distance zero scores 100
+    and the pair(s) at the largest distance score 0. Nothing is yielded
+    when either side has no rows.
+    """
+    source_rows = list(source.values_list('instance_id', 'data'))
+    target_rows = list(target.values_list('instance_id', 'data'))
+    # the tuple unpacking below needs at least one row on each side
+    if not source_rows or not target_rows:
+      return
+
+    source_instance_ids, source_data = itertools.izip(*source_rows)
+    target_instance_ids, target_data = itertools.izip(*target_rows)
+
+    source_list = [json.loads(d) for d in source_data]
+    target_list = [json.loads(d) for d in target_data]
+
+    dictvect = skl.feature_extraction.DictVectorizer()
+    source_matrix = dictvect.fit_transform(source_list)
+    target_matrix = dictvect.transform(target_list)
+
+    # NOTE(review): operator.xor is handed over as a callable metric; confirm
+    # it yields a scalar distance for the rows DictVectorizer produces.
+    distance_matrix = skl.metrics.pairwise_distances(source_matrix,
+                                                     target_matrix,
+                                                     xorred_fn)
+    max_distance = distance_matrix.max()
+    if max_distance:
+      score_matrix = (1 - (distance_matrix / max_distance)) * 100
+    else:
+      # all pairs are at distance zero; dividing would only produce NaNs
+      score_matrix = np.full(distance_matrix.shape, 100.0)
+
+    for source_i, target_i in np.ndindex(*distance_matrix.shape):
+      source_instance_id = source_instance_ids[source_i]
+      target_instance_id = target_instance_ids[target_i]
+
+      score = score_matrix[source_i][target_i]
+      yield (source_instance_id, target_instance_id, score)
diff --git a/server/collab/matchers/hist_matcher.py b/server/collab/matchers/hist_matcher.py
@@ -3,8 +3,7 @@
 
 import numpy as np
 import sklearn as skl
-import sklearn.metrics  # noqa flake8 importing as a different name
-import sklearn.preprocessing  # noqa flake8 importing as a different name
+import sklearn.metrics
 import sklearn.feature_extraction  # noqa flake8 importing as a different name
 
 from . import matcher