diff --git a/.idea/RobustInference.iml b/.idea/RobustInference.iml index fa80a76..85c7612 100644 --- a/.idea/RobustInference.iml +++ b/.idea/RobustInference.iml @@ -4,7 +4,7 @@ - + diff --git a/.idea/misc.xml b/.idea/misc.xml index a2e120d..dc9ea49 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -1,4 +1,4 @@ - + \ No newline at end of file diff --git a/RIFLE/RobustImputer.py b/RIFLE/RobustImputer.py index f6467b4..6334bb4 100644 --- a/RIFLE/RobustImputer.py +++ b/RIFLE/RobustImputer.py @@ -2,6 +2,9 @@ import numpy as np from sklearn.preprocessing import StandardScaler from math import sqrt +import multiprocessing +import time +from preprocessing import PolyFeatures class RobustImputer: @@ -14,73 +17,122 @@ def __init__(self): self.validation_data_proportion = 0.1 self.data = None self.transformed_data = None + self.poly_transformed_data = None self.confidence_matrix = None self.imputed_data = None + self.cols = None def read_and_scale(self, filename): - self.data = pd.read_csv(filename) + self.data = pd.read_csv(filename, na_values='?') + self.cols = self.data.columns sc = StandardScaler() sc.fit(self.data) - transformed = sc.transform(self.data) self.transformed_data = pd.DataFrame(transformed, columns=self.data.columns, index=self.data.index) + poly = PolyFeatures(2, include_bias=False) + poly.fit(self.data) + poly_data = poly.transform(self.data.to_numpy(dtype=float)) + sc.fit(poly_data) + poly_transformed = sc.transform(poly_data) + self.poly_transformed_data = pd.DataFrame(data=poly_transformed, + index=self.data.index, + columns=poly.get_feature_names_out(self.data.columns)) + print(self.poly_transformed_data) + def scale_data(self, data): self.data = data sc = StandardScaler() sc.fit(self.data) - transformed = sc.transform(self.data) self.transformed_data = pd.DataFrame(transformed, columns=data.columns, index=data.index) - def estimate_confidence_intervals(self): + poly = PolyFeatures(2, include_bias=False) + poly.fit(self.data) + poly_data = poly.transform(self.data.to_numpy(dtype=float)) + sc.fit(poly_data) + poly_transformed = sc.transform(poly_data) + self.poly_transformed_data = pd.DataFrame(data=poly_transformed, + index=self.data.index, + columns=poly.get_feature_names_out(data.columns)) - data = self.transformed_data - dimension = data.shape[1] - confidence_matrix = np.zeros(shape=(dimension, dimension)) + def find_confidence_interval(self, feature_index1): - cols = data.columns + # print starting point and features for each process + # print(f'starting find_confidence_interval with {feature_index1}') - for i in range(dimension): - for j in range(i, dimension): - feature_i = cols[i] - feature_j = cols[j] - columns = data[[feature_i, feature_j]] - intersections = columns[columns[[feature_i, feature_j]].notnull().all(axis=1)] + # data = self.transformed_data + data = self.poly_transformed_data + dimension = data.shape[1] + for feature_index2 in range(feature_index1, dimension): + cols = data.columns + feature_i = cols[feature_index1] + feature_j = cols[feature_index2] + columns = data[[feature_i, feature_j]] + intersections = columns[columns[[feature_i, feature_j]].notnull().all(axis=1)] + + intersection_num = len(intersections) + + sample_size = int(intersection_num * self.bootstrap_proportion) + + if sample_size < 2: + max_vals = columns.max() + max1 = max_vals[feature_i] + max2 = max_vals[feature_j] + self.confidence_matrix[feature_index1][feature_index2] = max1 * max2 + + estimation_array = [] + for ind in range(self.number_of_bootstrap_estimations): + # current_sample = np.array(intersections.sample(n=sample_size, replace=self.with_replacement)) + # For debugging + current_sample = np.array( + intersections.sample(n=sample_size, replace=self.with_replacement, random_state=1)) + f1 = current_sample[:, 0] + f2 = current_sample[:, 1] + inner_prod = np.inner(f1, f2) / sample_size + estimation_array.append(inner_prod) + + self.confidence_matrix[feature_index1][feature_index2] = np.std(estimation_array) + + # print ending point and features for each process + # print(f'finishing find_confidence_interval with {feature_index1, feature_index2}') - intersection_num = len(intersections) + def estimate_confidence_intervals(self): - sample_size = int(intersection_num * self.bootstrap_proportion) + # data = self.transformed_data + data = self.poly_transformed_data + dimension = data.shape[1] + + # initialized confidence matrix so that we are not subscripting a NoneType object + self.confidence_matrix = np.zeros(shape=(dimension, dimension), dtype="float") - if sample_size < 2: - max_vals = columns.max() - max1 = max_vals[feature_i] - max2 = max_vals[feature_j] - confidence_matrix[i][j] = max1 * max2 - continue + # start timer + start = time.time() - estimation_array = [] - for ind in range(self.number_of_bootstrap_estimations): - current_sample = np.array(intersections.sample(n=sample_size, replace=self.with_replacement)) - f1 = current_sample[:, 0] - f2 = current_sample[:, 1] - inner_prod = np.inner(f1, f2) / sample_size - estimation_array.append(inner_prod) + pool = multiprocessing.Pool() + pool.map(self.find_confidence_interval, range(dimension)) + pool.close() - confidence_matrix[i][j] = np.std(estimation_array) + # end timer and output time taken + end = time.time() + print('Confidence done in {:.4f} seconds'.format(end - start)) - for j in range(dimension): - for i in range(j + 1, dimension): - confidence_matrix[i][j] = confidence_matrix[j][i] + # + # for j in range(dimension): + # for i in range(j + 1, dimension): + # confidence_matrix[i][j] = confidence_matrix[j][i] - self.confidence_matrix = confidence_matrix + # self.confidence_matrix = confidence_matrix def impute_data(self, column_index): - data = self.transformed_data + print(f'starting impute_data with {column_index}') + # data = self.transformed_data + data = self.poly_transformed_data confidence_intervals = self.confidence_matrix - data_columns = data.columns + # data_columns = data.columns + data_columns = self.cols y_column = data_columns[column_index] X = data.drop([y_column], axis=1) @@ -208,22 +260,41 @@ def impute_data(self, column_index): y_predict = np.dot(data_i.T, theta) predicts.append(y_predict[0][0]) - return predicts + res = (column_index, predicts) + return res def impute(self): + + start = time.time() + original_data = self.data standard_deviations = original_data.std() means = original_data.mean() data_cols = original_data.columns - for column_ind in range(original_data.shape[1]): + dimension = original_data.shape[1] + pool = multiprocessing.Pool() + predictions = pool.map(self.impute_data, range(dimension)) + pool.close() + + for pred_index in range(len(predictions)): + column_ind = predictions[pred_index][0] print(data_cols[column_ind] + " is imputed.") - predictions = self.impute_data(column_ind) - predictions = [x * standard_deviations[column_ind] + means[column_ind] for x in predictions] + temp = [x * standard_deviations[column_ind] + means[column_ind] for x in predictions[pred_index][1]] + + original_data[data_cols[column_ind]] = temp - original_data[data_cols[column_ind]] = predictions + # for column_ind in range(original_data.shape[1]): + # print(data_cols[column_ind] + " is imputed.") + # predictions = self.impute_data(column_ind) + # predictions = [x * standard_deviations[column_ind] + means[column_ind] for x in predictions] + # + # original_data[data_cols[column_ind]] = predictions + # self.imputed_data = original_data + end = time.time() + print('Impute done in {:.4f} seconds'.format(end - start)) def write_to_csv(self, output_filename): self.imputed_data.to_csv(output_filename, index=False) diff --git a/RIFLE/__pycache__/RobustImputer.cpython-39.pyc b/RIFLE/__pycache__/RobustImputer.cpython-39.pyc new file mode 100644 index 0000000..a24b721 Binary files /dev/null and b/RIFLE/__pycache__/RobustImputer.cpython-39.pyc differ diff --git a/RIFLE/preprocessing/__init__.py b/RIFLE/preprocessing/__init__.py new file mode 100644 index 0000000..3d7fe1d --- /dev/null +++ b/RIFLE/preprocessing/__init__.py @@ -0,0 +1,3 @@ +from ._polynomial import PolyFeatures + +__all__ = ["PolyFeatures"] \ No newline at end of file diff --git a/RIFLE/preprocessing/__pycache__/__init__.cpython-39.pyc b/RIFLE/preprocessing/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000..909dbc6 Binary files /dev/null and b/RIFLE/preprocessing/__pycache__/__init__.cpython-39.pyc differ diff --git a/RIFLE/preprocessing/__pycache__/_polynomial.cpython-39.pyc b/RIFLE/preprocessing/__pycache__/_polynomial.cpython-39.pyc new file mode 100644 index 0000000..dcc5f66 Binary files /dev/null and b/RIFLE/preprocessing/__pycache__/_polynomial.cpython-39.pyc differ diff --git a/RIFLE/preprocessing/__pycache__/validation.cpython-39.pyc b/RIFLE/preprocessing/__pycache__/validation.cpython-39.pyc new file mode 100644 index 0000000..741511b Binary files /dev/null and b/RIFLE/preprocessing/__pycache__/validation.cpython-39.pyc differ diff --git a/RIFLE/preprocessing/_polynomial.py b/RIFLE/preprocessing/_polynomial.py new file mode 100644 index 0000000..85a54f4 --- /dev/null +++ b/RIFLE/preprocessing/_polynomial.py @@ -0,0 +1,254 @@ +import collections +import numbers +from itertools import chain +from itertools import combinations_with_replacement as combinations_w_r +import numpy as np +from scipy.special import comb + +from .validation import _check_feature_names_in + + +class PolyFeatures: + """ Generate interaction and polynomial features. Altered version of + sklearn.preprocessing.PolynomialFeatures to preserve NaN values. + + Parameters + ---------- + degree : int, default=2 + Maximum degree of the polynomial features. + + include_bias : bool, default=True + If 'True', then include the bias column, the feature in which all + polynomial powers are zero (acts as an intercept term in a linear + model. + """ + + def __init__(self, degree=2, *, include_bias=True): + self.degree = degree + self.include_bias = include_bias + + @staticmethod + def _combinations(n_features, min_degree, max_degree, include_bias): + comb = combinations_w_r + start = max(1, min_degree) + iter = chain.from_iterable( + comb(range(n_features), i) for i in range(start, max_degree + 1) + ) + if include_bias: + iter = chain(comb(range(n_features), 0), iter) + return iter + + @staticmethod + def _num_combinations(n_features, min_degree, max_degree, include_bias): + """ + Calculate number of terms in polynomial expansion. + + """ + combinations = comb(n_features + max_degree, max_degree, exact=True) - 1 + if min_degree > 0: + d = min_degree - 1 + combinations -= comb(n_features + d, d, exact=True) - 1 + + if include_bias: + combinations += 1 + + return combinations + + @property + def powers_(self): + """ + Exponent for each of the inputs in the output. + + """ + combinations = self._combinations( + n_features=self.n_features_in_, + min_degree=self._min_degree, + max_degree=self._max_degree, + include_bias=self.include_bias, + ) + return np.vstack( + [np.bincount(c, minlength=self.n_features_in_) for c in combinations] + ) + + def get_feature_names_out(self, input_features=None): + """ + Get output feature names for transformation. + + Parameters + ---------- + input_features : array of str objects or None, default=None + Input features. + + Returns + ------- + feature_names : ndarray of str objects + Transformed feature names. + """ + powers = self.powers_ + input_features = _check_feature_names_in(self, input_features) + feature_names = [] + for row in powers: + inds = np.where(row)[0] + if len(inds): + name = " ".join( + "%s^%d" % (input_features[ind], exp) + if exp != 1 + else input_features[ind] + for ind, exp in zip(inds, row[inds]) + ) + else: + name = "1" + feature_names.append(name) + return np.asarray(feature_names, dtype=object) + + def fit(self, X): + """ + Compute number of output features. + + Parameters + ---------- + X : array-like matrix of shape (n_samples, n_features) + The data. + + Returns + ------- + self : object + Fitted transformer. + """ + _, n_features = X.shape + self.n_features_in_ = n_features + if isinstance(self.degree, numbers.Integral): + if self.degree < 0: + raise ValueError( + f"degree must be a non-negative integer, got {self.degree}." + ) + elif self.degree == 0 and not self.include_bias: + raise ValueError( + "Setting degree to zero and include_bias to False would result in" + " an empty output array." + ) + + self._min_degree = 0 + self._max_degree = self.degree + elif ( + isinstance(self.degree, collections.abc.Iterable) and len(self.degree) == 2 + ): + self._min_degree, self._max_degree = self.degree + if not ( + isinstance(self._min_degree, numbers.Integral) + and isinstance(self._max_degree, numbers.Integral) + and self._min_degree >= 0 + and self._min_degree <= self._max_degree + ): + raise ValueError( + "degree=(min_degree, max_degree) must " + "be non-negative integers that fulfil " + "min_degree <= max_degree, got " + f"{self.degree}." + ) + elif self._max_degree == 0 and not self.include_bias: + raise ValueError( + "Setting both min_deree and max_degree to zero and include_bias to" + " False would result in an empty output array." + ) + else: + raise ValueError( + "degree must be a non-negative int or tuple " + "(min_degree, max_degree), got " + f"{self.degree}." + ) + + self.n_output_features_ = self._num_combinations( + n_features=n_features, + min_degree=self._min_degree, + max_degree=self._max_degree, + include_bias=self.include_bias, + ) + # We also record the number of output features for + # _max_degree = 0 + self._n_out_full = self._num_combinations( + n_features=n_features, + min_degree=0, + max_degree=self._max_degree, + include_bias=self.include_bias, + ) + + return self + + def transform(self, X): + """ + Transform data to polynomial features. + + Parameters + ---------- + X : array-like matrix of shape (n_samples, n_features) + The data to transform. + + Returns + ------- + XP : ndarray matrix of shape (n_samples, NP) + The matrix of features, where NP is the number of polynomial features + generated from the combination of inputs. + """ + n_samples, n_features = X.shape + # Do as if _min_degree = 0 and cut down array after the + # computation, i.e. use _n_out_full instead of n_output_features_. + + XP = np.empty(shape=(n_samples, self._n_out_full), + dtype=X.dtype) + + # degree 0 term + if self.include_bias: + XP[:, 0] = 1 + current_col = 1 + else: + current_col = 0 + + if self._max_degree == 0: + return XP + + # degree 1 term + XP[:, current_col: current_col + n_features] = X + index = list(range(current_col, current_col + n_features)) + current_col += n_features + index.append(current_col) + + # loop over degree >= 2 terms + for _ in range(2, self._max_degree + 1): + new_index = [] + end = index[-1] + for feature_idx in range(n_features): + start = index[feature_idx] + new_index.append(current_col) + next_col = current_col + end - start + if next_col <= current_col: + break + # multiply + np.multiply( + XP[:, start:end], + X[:, feature_idx: feature_idx + 1], + out=XP[:, current_col:next_col], + casting="no", + ) + # print(XP[:, start:end]) + # print(X[:, feature_idx: feature_idx + 1]) + # print(XP[:, current_col:next_col]) + # print('-----') + current_col = next_col + + new_index.append(current_col) + index = new_index + + if self._min_degree > 1: + n_XP, n_Xout = self._n_out_full, self.n_output_features_ + if self.include_bias: + Xout = np.empty( + shape=(n_samples, n_Xout), dtype=XP.dtype, order=self.order + ) + Xout[:, 0] = 1 + Xout[:, 1:] = XP[:, n_XP - n_Xout + 1:] + else: + Xout = XP[:, n_XP - n_Xout:].copy() + XP = Xout + + return XP diff --git a/RIFLE/preprocessing/validation.py b/RIFLE/preprocessing/validation.py new file mode 100644 index 0000000..8d1541c --- /dev/null +++ b/RIFLE/preprocessing/validation.py @@ -0,0 +1,50 @@ +import numpy as np + + +def _check_feature_names_in(estimator, input_features=None, *, generate_names=True): + """ + Check `input_features` and generate names if needed. + + Parameters + ---------- + input_features : array-like of type str or None, default=None + Input features. + + generate_names : bool, default=None + Whether to generate names when 'input_features' is 'None'. + + Return + ------ + feature_names_in : ndarray of str or 'None' + Feature names in. + + """ + + feature_names_in_ = getattr(estimator, "feature_names_in_", None) + n_features_in_ = getattr(estimator, "n_features_in_", None) + + if input_features is not None: + input_features = np.asarray(input_features, dtype=object) + if feature_names_in_ is not None and not np.array_equal( + feature_names_in_, input_features + ): + raise ValueError("input_features is not equal to feature_names_in_") + + if n_features_in_ is not None and len(input_features) != n_features_in_: + raise ValueError( + "input_features should have length equal to number of " + f"features ({n_features_in_}), got {len(input_features)}" + ) + return input_features + + if feature_names_in_ is not None: + return feature_names_in_ + + if not generate_names: + return + + # Generates feature names if `n_features_in_` is defined + if n_features_in_ is None: + raise ValueError("Unable to generate feature names without n_features_in_") + + return np.asarray([f"x{i}" for i in range(n_features_in_)], dtype=object) \ No newline at end of file diff --git a/RIFLE/run.py b/RIFLE/run.py index e904f70..6866245 100644 --- a/RIFLE/run.py +++ b/RIFLE/run.py @@ -1,12 +1,23 @@ from RobustImputer import RobustImputer import sys +import time -missing, imputed = sys.argv[1:3] -imputer = RobustImputer() +def run(): + missing, imputed = sys.argv[1:3] + imputer = RobustImputer() -imputer.read_and_scale(missing) -imputer.estimate_confidence_intervals() -imputer.impute() + imputer.read_and_scale(missing) + imputer.estimate_confidence_intervals() + imputer.impute() + imputer.write_to_csv(imputed) -imputer.write_to_csv(imputed) +# This guard is necessary to avoid creating subprocesses recursively. +# Without it a runtime error is generated, but there is likely a more clever way to do this + + +if __name__ == '__main__': + start = time.time() + run() + end = time.time() + print('Done in {:.4f} seconds'.format(end - start)) \ No newline at end of file