From 249e78f1e334b64f150c668aee05393f692b7a4c Mon Sep 17 00:00:00 2001
From: Gaurav Gupta <gaugup@microsoft.com>
Date: Thu, 20 Jan 2022 00:47:18 -0800
Subject: [PATCH] [WIP] Refactor common functions in public/private data
 interface

Signed-off-by: Gaurav Gupta <gaugup@microsoft.com>
---
 .../data_interfaces/base_data_interface.py    | 48 +++++++++++++++++++
 .../data_interfaces/private_data_interface.py | 36 --------------
 .../data_interfaces/public_data_interface.py  | 46 ------------------
 3 files changed, 48 insertions(+), 82 deletions(-)

diff --git a/dice_ml/data_interfaces/base_data_interface.py b/dice_ml/data_interfaces/base_data_interface.py
index f0abfa54..80997584 100644
--- a/dice_ml/data_interfaces/base_data_interface.py
+++ b/dice_ml/data_interfaces/base_data_interface.py
@@ -1,6 +1,8 @@
 """Module containing base class for data interfaces for dice-ml."""
 
 from abc import ABC, abstractmethod
+import numpy as np
+import pandas as pd
 
 
 class _BaseData(ABC):
@@ -27,6 +29,52 @@ def set_continuous_feature_indexes(self, query_instance):
         self.continuous_feature_indexes = [query_instance.columns.get_loc(name) for name in
                                            self.continuous_feature_names]
 
+    def from_dummies(self, data, prefix_sep='_'):
+        """Gets the original data from dummy encoded data with k levels.
+
+        Each categorical feature is rebuilt from the `<feature><prefix_sep><value>`
+        columns of `data` by taking, per row, the column holding the largest value;
+        those columns are dropped from the returned copy. `data` is not modified.
+        """
+        out = data.copy()
+        for feat in self.categorical_feature_names:
+            prefix = feat + prefix_sep
+            # Expected one-hot column names, derived from the values seen in data_df.
+            expected = {prefix + str(val) for val in self.data_df[feat].unique()}
+            cols = [c for c in data.columns if c in expected]
+            # Strip only the leading prefix: str.replace would also delete any later
+            # occurrence of the prefix inside the category value itself.
+            labs = [c[len(prefix):] for c in cols]
+            out[feat] = pd.Categorical(
+                np.array(labs)[np.argmax(data[cols].values, axis=1)])
+            out.drop(cols, axis=1, inplace=True)
+        return out
+
+    def one_hot_encode_data(self, data):
+        """Return `data` with every categorical feature expanded into all k dummy columns."""
+        return pd.get_dummies(data, columns=self.categorical_feature_names, drop_first=False)
+
+    def get_decoded_data(self, data, encoding='one-hot'):
+        """Gets the original data from encoded data.
+
+        'one-hot': `data` (DataFrame or ndarray) is decoded through from_dummies.
+        'label': `data` is wrapped in a DataFrame with self.feature_names as columns.
+        """
+        if len(data) == 0:
+            return data
+        if encoding not in ('one-hot', 'label'):
+            # Fail loudly: this case used to fall through and return None.
+            raise ValueError("encoding should be 'one-hot' or 'label'")
+        index = list(range(len(data)))
+        if encoding == 'label':
+            return pd.DataFrame(data=data, index=index, columns=self.feature_names)
+        if isinstance(data, np.ndarray):
+            data = pd.DataFrame(data=data, index=index,
+                                columns=self.ohe_encoded_feature_names)
+        elif not isinstance(data, pd.DataFrame):
+            raise ValueError("data should be a pandas dataframe or a numpy array")
+        return self.from_dummies(data)
+
     @abstractmethod
     def __init__(self, params):
         """The init method needs to be implemented by the inherting classes."""
diff --git a/dice_ml/data_interfaces/private_data_interface.py b/dice_ml/data_interfaces/private_data_interface.py
index 2960c17b..57c7e1c8 100644
--- a/dice_ml/data_interfaces/private_data_interface.py
+++ b/dice_ml/data_interfaces/private_data_interface.py
@@ -103,10 +103,6 @@ def _validate_and_set_mad(self, params):
         else:
             self.mad = {}
 
-    def one_hot_encode_data(self, data):
-        """One-hot-encodes the data."""
-        return pd.get_dummies(data, drop_first=False, columns=self.categorical_feature_names)
-
     def normalize_data(self, df, encoding='one-hot'):
         """Normalizes continuous features to make them fall in the range [0,1]."""
         result = df.copy()
@@ -254,17 +250,6 @@ def from_label(self, data):
                 out[column] = self.labelencoder[self.feature_names[column]].inverse_transform([round(out[column])])[0]
             return out
 
-    def from_dummies(self, data, prefix_sep='_'):
-        """Gets the original data from dummy encoded data with k levels."""
-        out = data.copy()
-        for feature_name in self.categorical_feature_names:
-            cols, labs = [[c.replace(
-                x, "") for c in data.columns if feature_name+prefix_sep in c] for x in ["", feature_name+prefix_sep]]
-            out[feature_name] = pd.Categorical(
-                np.array(labs)[np.argmax(data[cols].values, axis=1)])
-            out.drop(cols, axis=1, inplace=True)
-        return out
-
     def get_decimal_precisions(self):
         """"Gets the precision of continuous features in the data."""
         precisions = [0]*len(self.continuous_feature_names)
@@ -276,27 +261,6 @@ def get_decimal_precisions(self):
                 precisions[ix] = self.type_and_precision[feature_name][1]
         return precisions
 
-    def get_decoded_data(self, data, encoding='one-hot'):
-        """Gets the original data from encoded data."""
-        if len(data) == 0:
-            return data
-
-        index = [i for i in range(0, len(data))]
-        if encoding == 'one-hot':
-            if isinstance(data, pd.DataFrame):
-                return self.from_dummies(data)
-            elif isinstance(data, np.ndarray):
-                data = pd.DataFrame(data=data, index=index,
-                                    columns=self.ohe_encoded_feature_names)
-                return self.from_dummies(data)
-            else:
-                raise ValueError("data should be a pandas dataframe or a numpy array")
-
-        elif encoding == 'label':
-            data = pd.DataFrame(data=data, index=index,
-                                columns=self.feature_names)
-            return data
-
     def prepare_df_for_ohe_encoding(self):
         """Create base dataframe to do OHE for a single instance or a set of instances"""
         levels = []
diff --git a/dice_ml/data_interfaces/public_data_interface.py b/dice_ml/data_interfaces/public_data_interface.py
index ab3e5ed1..bb9a41bc 100644
--- a/dice_ml/data_interfaces/public_data_interface.py
+++ b/dice_ml/data_interfaces/public_data_interface.py
@@ -204,10 +204,6 @@ def get_data_type(self, col):
         else:
             raise ValueError("Unknown data type of feature %s: must be int or float" % col)
 
-    def one_hot_encode_data(self, data):
-        """One-hot-encodes the data."""
-        return pd.get_dummies(data, drop_first=False, columns=self.categorical_feature_names)
-
     def normalize_data(self, df):
         """Normalizes continuous features to make them fall in the range [0,1]."""
         result = df.copy()
@@ -436,27 +432,6 @@ def from_label(self, data):
                 out[c] = self.labelencoder[self.feature_names[c]].inverse_transform([round(out[c])])[0]
             return out
 
-    def from_dummies(self, data, prefix_sep='_'):
-        """Gets the original data from dummy encoded data with k levels."""
-        out = data.copy()
-        for feat in self.categorical_feature_names:
-            # first, derive column names in the one-hot-encoded data from the original data
-            cat_col_values = []
-            for val in list(self.data_df[feat].unique()):
-                cat_col_values.append(feat + prefix_sep + str(
-                    val))  # join original feature name and its unique values , ex: education_school
-            match_cols = [c for c in data.columns if
-                          c in cat_col_values]  # check for the above matching columns in the encoded data
-
-            # then, recreate original data by removing the suffixes - based on the GitHub issue comment:
-            # https://github.com/pandas-dev/pandas/issues/8745#issuecomment-417861271
-            cols, labs = [[c.replace(
-                x, "") for c in match_cols] for x in ["", feat + prefix_sep]]
-            out[feat] = pd.Categorical(
-                np.array(labs)[np.argmax(data[cols].values, axis=1)])
-            out.drop(cols, axis=1, inplace=True)
-        return out
-
     def get_decimal_precisions(self, output_type="list"):
         """"Gets the precision of continuous features in the data."""
         # if the precision of a continuous feature is not given, we use the maximum precision of the modes to capture the
@@ -481,27 +456,6 @@ def get_decimal_precisions(self, output_type="list"):
         elif output_type == "dict":
             return precisions_dict
 
-    def get_decoded_data(self, data, encoding='one-hot'):
-        """Gets the original data from encoded data."""
-        if len(data) == 0:
-            return data
-
-        index = [i for i in range(0, len(data))]
-        if encoding == 'one-hot':
-            if isinstance(data, pd.DataFrame):
-                return self.from_dummies(data)
-            elif isinstance(data, np.ndarray):
-                data = pd.DataFrame(data=data, index=index,
-                                    columns=self.ohe_encoded_feature_names)
-                return self.from_dummies(data)
-            else:
-                raise ValueError("data should be a pandas dataframe or a numpy array")
-
-        elif encoding == 'label':
-            data = pd.DataFrame(data=data, index=index,
-                                columns=self.feature_names)
-            return data
-
     def prepare_df_for_ohe_encoding(self):
         """Create base dataframe to do OHE for a single instance or a set of instances"""
         levels = []