diff --git a/.idea/RobustInference.iml b/.idea/RobustInference.iml
index fa80a76..85c7612 100644
--- a/.idea/RobustInference.iml
+++ b/.idea/RobustInference.iml
@@ -4,7 +4,7 @@
-
+
diff --git a/.idea/misc.xml b/.idea/misc.xml
index a2e120d..dc9ea49 100644
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@@ -1,4 +1,4 @@
-
+
\ No newline at end of file
diff --git a/RIFLE/RobustImputer.py b/RIFLE/RobustImputer.py
index f6467b4..6334bb4 100644
--- a/RIFLE/RobustImputer.py
+++ b/RIFLE/RobustImputer.py
@@ -2,6 +2,9 @@
import numpy as np
from sklearn.preprocessing import StandardScaler
from math import sqrt
+import multiprocessing
+import time
+from preprocessing import PolyFeatures
class RobustImputer:
@@ -14,73 +17,122 @@ def __init__(self):
self.validation_data_proportion = 0.1
self.data = None
self.transformed_data = None
+ self.poly_transformed_data = None
self.confidence_matrix = None
self.imputed_data = None
+ self.cols = None
def read_and_scale(self, filename):
- self.data = pd.read_csv(filename)
+ self.data = pd.read_csv(filename, na_values='?')
+ self.cols = self.data.columns
sc = StandardScaler()
sc.fit(self.data)
-
transformed = sc.transform(self.data)
self.transformed_data = pd.DataFrame(transformed, columns=self.data.columns, index=self.data.index)
+ poly = PolyFeatures(2, include_bias=False)
+ poly.fit(self.data)
+ poly_data = poly.transform(self.data.to_numpy(dtype=float))
+ sc.fit(poly_data)
+ poly_transformed = sc.transform(poly_data)
+ self.poly_transformed_data = pd.DataFrame(data=poly_transformed,
+ index=self.data.index,
+ columns=poly.get_feature_names_out(self.data.columns))
+ print(self.poly_transformed_data)
+
def scale_data(self, data):
self.data = data
sc = StandardScaler()
sc.fit(self.data)
-
transformed = sc.transform(self.data)
self.transformed_data = pd.DataFrame(transformed, columns=data.columns, index=data.index)
- def estimate_confidence_intervals(self):
+ poly = PolyFeatures(2, include_bias=False)
+ poly.fit(self.data)
+ poly_data = poly.transform(self.data.to_numpy(dtype=float))
+ sc.fit(poly_data)
+ poly_transformed = sc.transform(poly_data)
+ self.poly_transformed_data = pd.DataFrame(data=poly_transformed,
+ index=self.data.index,
+ columns=poly.get_feature_names_out(data.columns))
- data = self.transformed_data
- dimension = data.shape[1]
- confidence_matrix = np.zeros(shape=(dimension, dimension))
+ def find_confidence_interval(self, feature_index1):
- cols = data.columns
+ # print starting point and features for each process
+ # print(f'starting find_confidence_interval with {feature_index1}')
- for i in range(dimension):
- for j in range(i, dimension):
- feature_i = cols[i]
- feature_j = cols[j]
- columns = data[[feature_i, feature_j]]
- intersections = columns[columns[[feature_i, feature_j]].notnull().all(axis=1)]
+ # data = self.transformed_data
+ data = self.poly_transformed_data
+ dimension = data.shape[1]
+ for feature_index2 in range(feature_index1, dimension):
+ cols = data.columns
+ feature_i = cols[feature_index1]
+ feature_j = cols[feature_index2]
+ columns = data[[feature_i, feature_j]]
+ intersections = columns[columns[[feature_i, feature_j]].notnull().all(axis=1)]
+
+ intersection_num = len(intersections)
+
+ sample_size = int(intersection_num * self.bootstrap_proportion)
+
+ if sample_size < 2:
+ max_vals = columns.max()
+ max1 = max_vals[feature_i]
+ max2 = max_vals[feature_j]
+ self.confidence_matrix[feature_index1][feature_index2] = max1 * max2
+
+ estimation_array = []
+ for ind in range(self.number_of_bootstrap_estimations):
+ # current_sample = np.array(intersections.sample(n=sample_size, replace=self.with_replacement))
+ # For debugging
+ current_sample = np.array(
+ intersections.sample(n=sample_size, replace=self.with_replacement, random_state=1))
+ f1 = current_sample[:, 0]
+ f2 = current_sample[:, 1]
+ inner_prod = np.inner(f1, f2) / sample_size
+ estimation_array.append(inner_prod)
+
+ self.confidence_matrix[feature_index1][feature_index2] = np.std(estimation_array)
+
+ # print ending point and features for each process
+ # print(f'finishing find_confidence_interval with {feature_index1, feature_index2}')
- intersection_num = len(intersections)
+ def estimate_confidence_intervals(self):
- sample_size = int(intersection_num * self.bootstrap_proportion)
+ # data = self.transformed_data
+ data = self.poly_transformed_data
+ dimension = data.shape[1]
+
+ # initialized confidence matrix so that we are not subscripting a NoneType object
+ self.confidence_matrix = np.zeros(shape=(dimension, dimension), dtype="float")
- if sample_size < 2:
- max_vals = columns.max()
- max1 = max_vals[feature_i]
- max2 = max_vals[feature_j]
- confidence_matrix[i][j] = max1 * max2
- continue
+ # start timer
+ start = time.time()
- estimation_array = []
- for ind in range(self.number_of_bootstrap_estimations):
- current_sample = np.array(intersections.sample(n=sample_size, replace=self.with_replacement))
- f1 = current_sample[:, 0]
- f2 = current_sample[:, 1]
- inner_prod = np.inner(f1, f2) / sample_size
- estimation_array.append(inner_prod)
+ pool = multiprocessing.Pool()
+ pool.map(self.find_confidence_interval, range(dimension))
+ pool.close()
- confidence_matrix[i][j] = np.std(estimation_array)
+ # end timer and output time taken
+ end = time.time()
+ print('Confidence done in {:.4f} seconds'.format(end - start))
- for j in range(dimension):
- for i in range(j + 1, dimension):
- confidence_matrix[i][j] = confidence_matrix[j][i]
+ #
+ # for j in range(dimension):
+ # for i in range(j + 1, dimension):
+ # confidence_matrix[i][j] = confidence_matrix[j][i]
- self.confidence_matrix = confidence_matrix
+ # self.confidence_matrix = confidence_matrix
def impute_data(self, column_index):
- data = self.transformed_data
+ print(f'starting impute_data with {column_index}')
+ # data = self.transformed_data
+ data = self.poly_transformed_data
confidence_intervals = self.confidence_matrix
- data_columns = data.columns
+ # data_columns = data.columns
+ data_columns = self.cols
y_column = data_columns[column_index]
X = data.drop([y_column], axis=1)
@@ -208,22 +260,41 @@ def impute_data(self, column_index):
y_predict = np.dot(data_i.T, theta)
predicts.append(y_predict[0][0])
- return predicts
+ res = (column_index, predicts)
+ return res
def impute(self):
+
+ start = time.time()
+
original_data = self.data
standard_deviations = original_data.std()
means = original_data.mean()
data_cols = original_data.columns
- for column_ind in range(original_data.shape[1]):
+ dimension = original_data.shape[1]
+ pool = multiprocessing.Pool()
+ predictions = pool.map(self.impute_data, range(dimension))
+ pool.close()
+
+ for pred_index in range(len(predictions)):
+ column_ind = predictions[pred_index][0]
print(data_cols[column_ind] + " is imputed.")
- predictions = self.impute_data(column_ind)
- predictions = [x * standard_deviations[column_ind] + means[column_ind] for x in predictions]
+ temp = [x * standard_deviations[column_ind] + means[column_ind] for x in predictions[pred_index][1]]
+
+ original_data[data_cols[column_ind]] = temp
- original_data[data_cols[column_ind]] = predictions
+ # for column_ind in range(original_data.shape[1]):
+ # print(data_cols[column_ind] + " is imputed.")
+ # predictions = self.impute_data(column_ind)
+ # predictions = [x * standard_deviations[column_ind] + means[column_ind] for x in predictions]
+ #
+ # original_data[data_cols[column_ind]] = predictions
+ #
self.imputed_data = original_data
+ end = time.time()
+ print('Impute done in {:.4f} seconds'.format(end - start))
    def write_to_csv(self, output_filename):
        """Write the imputed dataset to ``output_filename`` as CSV, without the index column."""
        self.imputed_data.to_csv(output_filename, index=False)
diff --git a/RIFLE/__pycache__/RobustImputer.cpython-39.pyc b/RIFLE/__pycache__/RobustImputer.cpython-39.pyc
new file mode 100644
index 0000000..a24b721
Binary files /dev/null and b/RIFLE/__pycache__/RobustImputer.cpython-39.pyc differ
diff --git a/RIFLE/preprocessing/__init__.py b/RIFLE/preprocessing/__init__.py
new file mode 100644
index 0000000..3d7fe1d
--- /dev/null
+++ b/RIFLE/preprocessing/__init__.py
@@ -0,0 +1,3 @@
+from ._polynomial import PolyFeatures
+
+__all__ = ["PolyFeatures"]
\ No newline at end of file
diff --git a/RIFLE/preprocessing/__pycache__/__init__.cpython-39.pyc b/RIFLE/preprocessing/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000..909dbc6
Binary files /dev/null and b/RIFLE/preprocessing/__pycache__/__init__.cpython-39.pyc differ
diff --git a/RIFLE/preprocessing/__pycache__/_polynomial.cpython-39.pyc b/RIFLE/preprocessing/__pycache__/_polynomial.cpython-39.pyc
new file mode 100644
index 0000000..dcc5f66
Binary files /dev/null and b/RIFLE/preprocessing/__pycache__/_polynomial.cpython-39.pyc differ
diff --git a/RIFLE/preprocessing/__pycache__/validation.cpython-39.pyc b/RIFLE/preprocessing/__pycache__/validation.cpython-39.pyc
new file mode 100644
index 0000000..741511b
Binary files /dev/null and b/RIFLE/preprocessing/__pycache__/validation.cpython-39.pyc differ
diff --git a/RIFLE/preprocessing/_polynomial.py b/RIFLE/preprocessing/_polynomial.py
new file mode 100644
index 0000000..85a54f4
--- /dev/null
+++ b/RIFLE/preprocessing/_polynomial.py
@@ -0,0 +1,254 @@
+import collections
+import numbers
+from itertools import chain
+from itertools import combinations_with_replacement as combinations_w_r
+import numpy as np
+from scipy.special import comb
+
+from .validation import _check_feature_names_in
+
+
class PolyFeatures:
    """Generate interaction and polynomial features.

    Altered version of sklearn.preprocessing.PolynomialFeatures that keeps
    NaN values intact: no input validation rejects NaNs, and NaNs propagate
    through the column products.

    Parameters
    ----------
    degree : int or tuple (min_degree, max_degree), default=2
        Maximum degree of the polynomial features, or an inclusive range of
        degrees.

    include_bias : bool, default=True
        If True, include the bias column, the feature in which all
        polynomial powers are zero (acts as an intercept term in a linear
        model).
    """

    def __init__(self, degree=2, *, include_bias=True):
        self.degree = degree
        self.include_bias = include_bias

    @staticmethod
    def _combinations(n_features, min_degree, max_degree, include_bias):
        # One tuple of input-feature indices per output column, in degree
        # order.  (Renamed locals: the original aliased scipy's `comb` and
        # shadowed the builtin `iter`.)
        start = max(1, min_degree)
        combos = chain.from_iterable(
            combinations_w_r(range(n_features), i)
            for i in range(start, max_degree + 1)
        )
        if include_bias:
            combos = chain(combinations_w_r(range(n_features), 0), combos)
        return combos

    @staticmethod
    def _num_combinations(n_features, min_degree, max_degree, include_bias):
        """Calculate the number of terms in the polynomial expansion."""
        combinations = comb(n_features + max_degree, max_degree, exact=True) - 1
        if min_degree > 0:
            d = min_degree - 1
            combinations -= comb(n_features + d, d, exact=True) - 1

        if include_bias:
            combinations += 1

        return combinations

    @property
    def powers_(self):
        """Exponent matrix: one row per output feature, one column per input."""
        combinations = self._combinations(
            n_features=self.n_features_in_,
            min_degree=self._min_degree,
            max_degree=self._max_degree,
            include_bias=self.include_bias,
        )
        return np.vstack(
            [np.bincount(c, minlength=self.n_features_in_) for c in combinations]
        )

    def get_feature_names_out(self, input_features=None):
        """Get output feature names for the transformation.

        Parameters
        ----------
        input_features : array of str objects or None, default=None
            Input features.

        Returns
        -------
        feature_names : ndarray of str objects
            Transformed feature names, e.g. "x0 x1" or "x0^2".
        """
        powers = self.powers_
        input_features = _check_feature_names_in(self, input_features)
        feature_names = []
        for row in powers:
            inds = np.where(row)[0]
            if len(inds):
                name = " ".join(
                    "%s^%d" % (input_features[ind], exp)
                    if exp != 1
                    else input_features[ind]
                    for ind, exp in zip(inds, row[inds])
                )
            else:
                name = "1"
            feature_names.append(name)
        return np.asarray(feature_names, dtype=object)

    def fit(self, X):
        """Compute the number of output features.

        Parameters
        ----------
        X : array-like matrix of shape (n_samples, n_features)
            The data.

        Returns
        -------
        self : object
            Fitted transformer.

        Raises
        ------
        ValueError
            If `degree` is negative, malformed, or would produce an empty
            output together with include_bias=False.
        """
        _, n_features = X.shape
        self.n_features_in_ = n_features
        if isinstance(self.degree, numbers.Integral):
            if self.degree < 0:
                raise ValueError(
                    f"degree must be a non-negative integer, got {self.degree}."
                )
            elif self.degree == 0 and not self.include_bias:
                raise ValueError(
                    "Setting degree to zero and include_bias to False would result in"
                    " an empty output array."
                )

            self._min_degree = 0
            self._max_degree = self.degree
        elif (
            isinstance(self.degree, collections.abc.Iterable) and len(self.degree) == 2
        ):
            self._min_degree, self._max_degree = self.degree
            if not (
                isinstance(self._min_degree, numbers.Integral)
                and isinstance(self._max_degree, numbers.Integral)
                and self._min_degree >= 0
                and self._min_degree <= self._max_degree
            ):
                raise ValueError(
                    "degree=(min_degree, max_degree) must "
                    "be non-negative integers that fulfil "
                    "min_degree <= max_degree, got "
                    f"{self.degree}."
                )
            elif self._max_degree == 0 and not self.include_bias:
                # Fixed typo in the message: "min_deree" -> "min_degree".
                raise ValueError(
                    "Setting both min_degree and max_degree to zero and include_bias to"
                    " False would result in an empty output array."
                )
        else:
            raise ValueError(
                "degree must be a non-negative int or tuple "
                "(min_degree, max_degree), got "
                f"{self.degree}."
            )

        self.n_output_features_ = self._num_combinations(
            n_features=n_features,
            min_degree=self._min_degree,
            max_degree=self._max_degree,
            include_bias=self.include_bias,
        )
        # Also record the number of output features as if _min_degree were 0;
        # transform() builds this full array and then trims it.
        self._n_out_full = self._num_combinations(
            n_features=n_features,
            min_degree=0,
            max_degree=self._max_degree,
            include_bias=self.include_bias,
        )

        return self

    def transform(self, X):
        """Transform data to polynomial features.

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features)
            The data to transform; NaNs propagate into the derived columns.

        Returns
        -------
        XP : ndarray of shape (n_samples, NP)
            The matrix of features, where NP is the number of polynomial
            features generated from the combination of inputs.
        """
        n_samples, n_features = X.shape
        # Compute as if _min_degree were 0 and cut the array down afterwards,
        # i.e. allocate _n_out_full columns instead of n_output_features_.
        XP = np.empty(shape=(n_samples, self._n_out_full),
                      dtype=X.dtype)

        # degree 0 term
        if self.include_bias:
            XP[:, 0] = 1
            current_col = 1
        else:
            current_col = 0

        if self._max_degree == 0:
            return XP

        # degree 1 terms: the raw features
        XP[:, current_col: current_col + n_features] = X
        index = list(range(current_col, current_col + n_features))
        current_col += n_features
        index.append(current_col)

        # degree >= 2 terms: multiply the previous degree's block by each
        # feature, starting at that feature's own column to avoid duplicates
        for _ in range(2, self._max_degree + 1):
            new_index = []
            end = index[-1]
            for feature_idx in range(n_features):
                start = index[feature_idx]
                new_index.append(current_col)
                next_col = current_col + end - start
                if next_col <= current_col:
                    break
                np.multiply(
                    XP[:, start:end],
                    X[:, feature_idx: feature_idx + 1],
                    out=XP[:, current_col:next_col],
                    casting="no",
                )
                current_col = next_col

            new_index.append(current_col)
            index = new_index

        if self._min_degree > 1:
            # Trim away the low-degree columns computed above.
            n_XP, n_Xout = self._n_out_full, self.n_output_features_
            if self.include_bias:
                # BUG FIX: the original passed order=self.order, but `order`
                # is never defined on this class, so this path raised
                # AttributeError.  Use the NumPy default C order.
                Xout = np.empty(
                    shape=(n_samples, n_Xout), dtype=XP.dtype, order="C"
                )
                Xout[:, 0] = 1
                Xout[:, 1:] = XP[:, n_XP - n_Xout + 1:]
            else:
                Xout = XP[:, n_XP - n_Xout:].copy()
            XP = Xout

        return XP
diff --git a/RIFLE/preprocessing/validation.py b/RIFLE/preprocessing/validation.py
new file mode 100644
index 0000000..8d1541c
--- /dev/null
+++ b/RIFLE/preprocessing/validation.py
@@ -0,0 +1,50 @@
+import numpy as np
+
+
+def _check_feature_names_in(estimator, input_features=None, *, generate_names=True):
+ """
+ Check `input_features` and generate names if needed.
+
+ Parameters
+ ----------
+ input_features : array-like of type str or None, default=None
+ Input features.
+
+ generate_names : bool, default=None
+ Whether to generate names when 'input_features' is 'None'.
+
+ Return
+ ------
+ feature_names_in : ndarray of str or 'None'
+ Feature names in.
+
+ """
+
+ feature_names_in_ = getattr(estimator, "feature_names_in_", None)
+ n_features_in_ = getattr(estimator, "n_features_in_", None)
+
+ if input_features is not None:
+ input_features = np.asarray(input_features, dtype=object)
+ if feature_names_in_ is not None and not np.array_equal(
+ feature_names_in_, input_features
+ ):
+ raise ValueError("input_features is not equal to feature_names_in_")
+
+ if n_features_in_ is not None and len(input_features) != n_features_in_:
+ raise ValueError(
+ "input_features should have length equal to number of "
+ f"features ({n_features_in_}), got {len(input_features)}"
+ )
+ return input_features
+
+ if feature_names_in_ is not None:
+ return feature_names_in_
+
+ if not generate_names:
+ return
+
+ # Generates feature names if `n_features_in_` is defined
+ if n_features_in_ is None:
+ raise ValueError("Unable to generate feature names without n_features_in_")
+
+ return np.asarray([f"x{i}" for i in range(n_features_in_)], dtype=object)
\ No newline at end of file
diff --git a/RIFLE/run.py b/RIFLE/run.py
index e904f70..6866245 100644
--- a/RIFLE/run.py
+++ b/RIFLE/run.py
@@ -1,12 +1,23 @@
from RobustImputer import RobustImputer
import sys
+import time
-missing, imputed = sys.argv[1:3]
-imputer = RobustImputer()
def run():
    """Command-line entry point: impute a CSV of missing values.

    Usage: python run.py <missing_csv> <imputed_csv>
    """
    # Explicit argument check: the bare `sys.argv[1:3]` unpack raised an
    # opaque ValueError when fewer than two arguments were given.
    if len(sys.argv) < 3:
        raise SystemExit('usage: python run.py <missing_csv> <imputed_csv>')
    missing, imputed = sys.argv[1:3]
    imputer = RobustImputer()

    imputer.read_and_scale(missing)
    imputer.estimate_confidence_intervals()
    imputer.impute()
    imputer.write_to_csv(imputed)


# The __main__ guard is required: multiprocessing workers re-import this
# module, and without the guard each worker would recursively spawn more
# workers, raising a runtime error.
if __name__ == '__main__':
    start = time.time()
    run()
    end = time.time()
    print('Done in {:.4f} seconds'.format(end - start))