From 249e78f1e334b64f150c668aee05393f692b7a4c Mon Sep 17 00:00:00 2001
From: Gaurav Gupta
Date: Thu, 20 Jan 2022 00:47:18 -0800
Subject: [PATCH] [WIP] Refactor common functions in public/private data interface

Signed-off-by: Gaurav Gupta
---
Notes (ignored when the patch is applied):
  * from_dummies in the base class uses `data_df` when the interface has one
    and otherwise matches dummy columns by prefix, which is what the removed
    private implementation did. Both subclasses keep their previous behaviour.
  * The post-image blob hash on the base_data_interface.py `index` line
    predates the from_dummies change above.

 .../data_interfaces/base_data_interface.py    | 56 +++++++++++++++++++
 .../data_interfaces/private_data_interface.py | 36 ------------
 .../data_interfaces/public_data_interface.py  | 46 ----------------
 3 files changed, 56 insertions(+), 82 deletions(-)

diff --git a/dice_ml/data_interfaces/base_data_interface.py b/dice_ml/data_interfaces/base_data_interface.py
index f0abfa54..80997584 100644
--- a/dice_ml/data_interfaces/base_data_interface.py
+++ b/dice_ml/data_interfaces/base_data_interface.py
@@ -1,6 +1,8 @@
 """Module containing base class for data interfaces for dice-ml."""
 
 from abc import ABC, abstractmethod
+import numpy as np
+import pandas as pd
 
 
 class _BaseData(ABC):
@@ -27,6 +29,60 @@ def set_continuous_feature_indexes(self, query_instance):
         self.continuous_feature_indexes = [query_instance.columns.get_loc(name) for name in
                                            self.continuous_feature_names]
 
+    def from_dummies(self, data, prefix_sep='_'):
+        """Gets the original data from dummy encoded data with k levels.
+
+        :param data: one-hot-encoded pandas dataframe.
+        :param prefix_sep: separator between a feature name and its level in the encoded column names.
+        :return: copy of data with each group of dummy columns replaced by one categorical column.
+        """
+        out = data.copy()
+        # not every data interface keeps the original dataframe around, so do not assume `data_df` exists
+        data_df = getattr(self, 'data_df', None)
+        for feat in self.categorical_feature_names:
+            prefix = feat + prefix_sep
+            if data_df is not None:
+                # derive column names in the one-hot-encoded data from the original data by joining
+                # the original feature name and its unique values, ex: education_school
+                cat_col_values = [prefix + str(val) for val in data_df[feat].unique()]
+                cols = [c for c in data.columns if c in cat_col_values]
+            else:
+                # without the original data, match the encoded columns on the feature-name prefix
+                cols = [c for c in data.columns if prefix in c]
+
+            # then, recreate original data by removing the prefixes - based on the GitHub issue comment:
+            # https://github.com/pandas-dev/pandas/issues/8745#issuecomment-417861271
+            labs = [c.replace(prefix, "") for c in cols]
+            out[feat] = pd.Categorical(
+                np.array(labs)[np.argmax(data[cols].values, axis=1)])
+            out.drop(cols, axis=1, inplace=True)
+        return out
+
+    def one_hot_encode_data(self, data):
+        """One-hot-encodes the data."""
+        return pd.get_dummies(data, drop_first=False, columns=self.categorical_feature_names)
+
+    def get_decoded_data(self, data, encoding='one-hot'):
+        """Gets the original data from encoded data."""
+        if len(data) == 0:
+            return data
+
+        index = [i for i in range(0, len(data))]
+        if encoding == 'one-hot':
+            if isinstance(data, pd.DataFrame):
+                return self.from_dummies(data)
+            elif isinstance(data, np.ndarray):
+                data = pd.DataFrame(data=data, index=index,
+                                    columns=self.ohe_encoded_feature_names)
+                return self.from_dummies(data)
+            else:
+                raise ValueError("data should be a pandas dataframe or a numpy array")
+
+        elif encoding == 'label':
+            data = pd.DataFrame(data=data, index=index,
+                                columns=self.feature_names)
+            return data
+
     @abstractmethod
     def __init__(self, params):
         """The init method needs to be implemented by the inherting classes."""
diff --git a/dice_ml/data_interfaces/private_data_interface.py b/dice_ml/data_interfaces/private_data_interface.py
index 2960c17b..57c7e1c8 100644
--- a/dice_ml/data_interfaces/private_data_interface.py
+++ b/dice_ml/data_interfaces/private_data_interface.py
@@ -103,10 +103,6 @@ def _validate_and_set_mad(self, params):
         else:
             self.mad = {}
 
-    def one_hot_encode_data(self, data):
-        """One-hot-encodes the data."""
-        return pd.get_dummies(data, drop_first=False, columns=self.categorical_feature_names)
-
     def normalize_data(self, df, encoding='one-hot'):
         """Normalizes continuous features to make them fall in the range [0,1]."""
         result = df.copy()
@@ -254,17 +250,6 @@ def from_label(self, data):
                 out[column] = self.labelencoder[self.feature_names[column]].inverse_transform([round(out[column])])[0]
             return out
 
-    def from_dummies(self, data, prefix_sep='_'):
-        """Gets the original data from dummy encoded data with k levels."""
-        out = data.copy()
-        for feature_name in self.categorical_feature_names:
-            cols, labs = [[c.replace(
-                x, "") for c in data.columns if feature_name+prefix_sep in c] for x in ["", feature_name+prefix_sep]]
-            out[feature_name] = pd.Categorical(
-                np.array(labs)[np.argmax(data[cols].values, axis=1)])
-            out.drop(cols, axis=1, inplace=True)
-        return out
-
     def get_decimal_precisions(self):
         """"Gets the precision of continuous features in the data."""
         precisions = [0]*len(self.continuous_feature_names)
@@ -276,27 +261,6 @@ def get_decimal_precisions(self):
                 precisions[ix] = self.type_and_precision[feature_name][1]
         return precisions
 
-    def get_decoded_data(self, data, encoding='one-hot'):
-        """Gets the original data from encoded data."""
-        if len(data) == 0:
-            return data
-
-        index = [i for i in range(0, len(data))]
-        if encoding == 'one-hot':
-            if isinstance(data, pd.DataFrame):
-                return self.from_dummies(data)
-            elif isinstance(data, np.ndarray):
-                data = pd.DataFrame(data=data, index=index,
-                                    columns=self.ohe_encoded_feature_names)
-                return self.from_dummies(data)
-            else:
-                raise ValueError("data should be a pandas dataframe or a numpy array")
-
-        elif encoding == 'label':
-            data = pd.DataFrame(data=data, index=index,
-                                columns=self.feature_names)
-            return data
-
     def prepare_df_for_ohe_encoding(self):
         """Create base dataframe to do OHE for a single instance or a set of instances"""
         levels = []
diff --git a/dice_ml/data_interfaces/public_data_interface.py b/dice_ml/data_interfaces/public_data_interface.py
index ab3e5ed1..bb9a41bc 100644
--- a/dice_ml/data_interfaces/public_data_interface.py
+++ b/dice_ml/data_interfaces/public_data_interface.py
@@ -204,10 +204,6 @@ def get_data_type(self, col):
         else:
             raise ValueError("Unknown data type of feature %s: must be int or float" % col)
 
-    def one_hot_encode_data(self, data):
-        """One-hot-encodes the data."""
-        return pd.get_dummies(data, drop_first=False, columns=self.categorical_feature_names)
-
     def normalize_data(self, df):
         """Normalizes continuous features to make them fall in the range [0,1]."""
         result = df.copy()
@@ -436,27 +432,6 @@ def from_label(self, data):
                 out[c] = self.labelencoder[self.feature_names[c]].inverse_transform([round(out[c])])[0]
             return out
 
-    def from_dummies(self, data, prefix_sep='_'):
-        """Gets the original data from dummy encoded data with k levels."""
-        out = data.copy()
-        for feat in self.categorical_feature_names:
-            # first, derive column names in the one-hot-encoded data from the original data
-            cat_col_values = []
-            for val in list(self.data_df[feat].unique()):
-                cat_col_values.append(feat + prefix_sep + str(
-                    val))  # join original feature name and its unique values , ex: education_school
-            match_cols = [c for c in data.columns if
-                          c in cat_col_values]  # check for the above matching columns in the encoded data
-
-            # then, recreate original data by removing the suffixes - based on the GitHub issue comment:
-            # https://github.com/pandas-dev/pandas/issues/8745#issuecomment-417861271
-            cols, labs = [[c.replace(
-                x, "") for c in match_cols] for x in ["", feat + prefix_sep]]
-            out[feat] = pd.Categorical(
-                np.array(labs)[np.argmax(data[cols].values, axis=1)])
-            out.drop(cols, axis=1, inplace=True)
-        return out
-
     def get_decimal_precisions(self, output_type="list"):
         """"Gets the precision of continuous features in the data."""
         # if the precision of a continuous feature is not given, we use the maximum precision of the modes to capture the
@@ -481,27 +456,6 @@ def get_decimal_precisions(self, output_type="list"):
         elif output_type == "dict":
             return precisions_dict
 
-    def get_decoded_data(self, data, encoding='one-hot'):
-        """Gets the original data from encoded data."""
-        if len(data) == 0:
-            return data
-
-        index = [i for i in range(0, len(data))]
-        if encoding == 'one-hot':
-            if isinstance(data, pd.DataFrame):
-                return self.from_dummies(data)
-            elif isinstance(data, np.ndarray):
-                data = pd.DataFrame(data=data, index=index,
-                                    columns=self.ohe_encoded_feature_names)
-                return self.from_dummies(data)
-            else:
-                raise ValueError("data should be a pandas dataframe or a numpy array")
-
-        elif encoding == 'label':
-            data = pd.DataFrame(data=data, index=index,
-                                columns=self.feature_names)
-            return data
-
     def prepare_df_for_ohe_encoding(self):
         """Create base dataframe to do OHE for a single instance or a set of instances"""
         levels = []