2 changes: 1 addition & 1 deletion .idea/RobustInference.iml

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion .idea/misc.xml

Some generated files are not rendered by default.

153 changes: 112 additions & 41 deletions RIFLE/RobustImputer.py
@@ -2,6 +2,9 @@
 import numpy as np
 from sklearn.preprocessing import StandardScaler
 from math import sqrt
+import multiprocessing
+import time
+from preprocessing import PolyFeatures


 class RobustImputer:
@@ -14,73 +17,122 @@ def __init__(self):
         self.validation_data_proportion = 0.1
         self.data = None
         self.transformed_data = None
+        self.poly_transformed_data = None
         self.confidence_matrix = None
         self.imputed_data = None
+        self.cols = None

     def read_and_scale(self, filename):
-        self.data = pd.read_csv(filename)
+        self.data = pd.read_csv(filename, na_values='?')
+        self.cols = self.data.columns

         sc = StandardScaler()
         sc.fit(self.data)

         transformed = sc.transform(self.data)
         self.transformed_data = pd.DataFrame(transformed, columns=self.data.columns, index=self.data.index)

+        poly = PolyFeatures(2, include_bias=False)
+        poly.fit(self.data)
+        poly_data = poly.transform(self.data.to_numpy(dtype=float))
+        sc.fit(poly_data)
+        poly_transformed = sc.transform(poly_data)
+        self.poly_transformed_data = pd.DataFrame(data=poly_transformed,
+                                                  index=self.data.index,
+                                                  columns=poly.get_feature_names_out(self.data.columns))
+        print(self.poly_transformed_data)
+
     def scale_data(self, data):
         self.data = data
         sc = StandardScaler()
         sc.fit(self.data)

         transformed = sc.transform(self.data)
         self.transformed_data = pd.DataFrame(transformed, columns=data.columns, index=data.index)

+        poly = PolyFeatures(2, include_bias=False)
+        poly.fit(self.data)
+        poly_data = poly.transform(self.data.to_numpy(dtype=float))
+        sc.fit(poly_data)
+        poly_transformed = sc.transform(poly_data)
+        self.poly_transformed_data = pd.DataFrame(data=poly_transformed,
+                                                  index=self.data.index,
+                                                  columns=poly.get_feature_names_out(data.columns))
+
+    def find_confidence_interval(self, feature_index1):
+        # print starting point and features for each process
+        # print(f'starting find_confidence_interval with {feature_index1}')
+
+        # data = self.transformed_data
+        data = self.poly_transformed_data
+        dimension = data.shape[1]
+        cols = data.columns
+        for feature_index2 in range(feature_index1, dimension):
+            feature_i = cols[feature_index1]
+            feature_j = cols[feature_index2]
+            columns = data[[feature_i, feature_j]]
+            intersections = columns[columns[[feature_i, feature_j]].notnull().all(axis=1)]
+
+            intersection_num = len(intersections)
+            sample_size = int(intersection_num * self.bootstrap_proportion)
+
+            if sample_size < 2:
+                max_vals = columns.max()
+                max1 = max_vals[feature_i]
+                max2 = max_vals[feature_j]
+                self.confidence_matrix[feature_index1][feature_index2] = max1 * max2
+                continue  # too few overlapping rows to bootstrap
+
+            estimation_array = []
+            for ind in range(self.number_of_bootstrap_estimations):
+                # current_sample = np.array(intersections.sample(n=sample_size, replace=self.with_replacement))
+                # fixed random_state for debugging, so bootstrap samples are reproducible
+                current_sample = np.array(
+                    intersections.sample(n=sample_size, replace=self.with_replacement, random_state=1))
+                f1 = current_sample[:, 0]
+                f2 = current_sample[:, 1]
+                inner_prod = np.inner(f1, f2) / sample_size
+                estimation_array.append(inner_prod)
+
+            self.confidence_matrix[feature_index1][feature_index2] = np.std(estimation_array)
+
+            # print ending point and features for each process
+            # print(f'finishing find_confidence_interval with {feature_index1, feature_index2}')
+
+        # writes made inside a pool worker stay in that worker's copy of self,
+        # so the finished row is returned for the parent process to store
+        return feature_index1, self.confidence_matrix[feature_index1]

     def estimate_confidence_intervals(self):
-        data = self.transformed_data
+        # data = self.transformed_data
+        data = self.poly_transformed_data
         dimension = data.shape[1]
-        confidence_matrix = np.zeros(shape=(dimension, dimension))

-        cols = data.columns
+        # initialize the confidence matrix so we are not subscripting a NoneType object
+        self.confidence_matrix = np.zeros(shape=(dimension, dimension), dtype="float")

-        for i in range(dimension):
-            for j in range(i, dimension):
-                feature_i = cols[i]
-                feature_j = cols[j]
-                columns = data[[feature_i, feature_j]]
-                intersections = columns[columns[[feature_i, feature_j]].notnull().all(axis=1)]
-
-                intersection_num = len(intersections)
-
-                sample_size = int(intersection_num * self.bootstrap_proportion)
-
-                if sample_size < 2:
-                    max_vals = columns.max()
-                    max1 = max_vals[feature_i]
-                    max2 = max_vals[feature_j]
-                    confidence_matrix[i][j] = max1 * max2
-                    continue
-
-                estimation_array = []
-                for ind in range(self.number_of_bootstrap_estimations):
-                    current_sample = np.array(intersections.sample(n=sample_size, replace=self.with_replacement))
-                    f1 = current_sample[:, 0]
-                    f2 = current_sample[:, 1]
-                    inner_prod = np.inner(f1, f2) / sample_size
-                    estimation_array.append(inner_prod)
-
-                confidence_matrix[i][j] = np.std(estimation_array)
+        # start timer
+        start = time.time()
+
+        # one worker per feature; each returns its finished row of the matrix
+        pool = multiprocessing.Pool()
+        results = pool.map(self.find_confidence_interval, range(dimension))
+        pool.close()
+
+        for feature_index1, row in results:
+            self.confidence_matrix[feature_index1] = row
+
+        # end timer and output time taken
+        end = time.time()
+        print('Confidence done in {:.4f} seconds'.format(end - start))

         for j in range(dimension):
             for i in range(j + 1, dimension):
-                confidence_matrix[i][j] = confidence_matrix[j][i]
-
-        self.confidence_matrix = confidence_matrix
+                self.confidence_matrix[i][j] = self.confidence_matrix[j][i]

     def impute_data(self, column_index):
-        data = self.transformed_data
+        print(f'starting impute_data with {column_index}')
+        # data = self.transformed_data
+        data = self.poly_transformed_data
         confidence_intervals = self.confidence_matrix

-        data_columns = data.columns
+        # data_columns = data.columns
+        data_columns = self.cols

         y_column = data_columns[column_index]
         X = data.drop([y_column], axis=1)
@@ -208,22 +260,41 @@ def impute_data(self, column_index):
         y_predict = np.dot(data_i.T, theta)
         predicts.append(y_predict[0][0])

-        return predicts
+        # return the column index alongside the predictions so pool.map
+        # results can be matched back to their columns in impute()
+        res = (column_index, predicts)
+        return res

     def impute(self):
+        start = time.time()
+
         original_data = self.data
         standard_deviations = original_data.std()
         means = original_data.mean()
         data_cols = original_data.columns

-        for column_ind in range(original_data.shape[1]):
+        dimension = original_data.shape[1]
+        pool = multiprocessing.Pool()
+        predictions = pool.map(self.impute_data, range(dimension))
+        pool.close()
+
+        for pred_index in range(len(predictions)):
+            column_ind = predictions[pred_index][0]
             print(data_cols[column_ind] + " is imputed.")
-            predictions = self.impute_data(column_ind)
-            predictions = [x * standard_deviations[column_ind] + means[column_ind] for x in predictions]
-
-            original_data[data_cols[column_ind]] = predictions
+            # undo the standardization so imputed values land on the original scale
+            temp = [x * standard_deviations[column_ind] + means[column_ind] for x in predictions[pred_index][1]]
+
+            original_data[data_cols[column_ind]] = temp

         self.imputed_data = original_data
+        end = time.time()
+        print('Impute done in {:.4f} seconds'.format(end - start))

     def write_to_csv(self, output_filename):
         self.imputed_data.to_csv(output_filename, index=False)
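
For reference, a minimal driver sketch (not part of this PR) of how the updated class appears to be used end to end. The filenames 'data.csv' and 'imputed.csv' are hypothetical, and the __main__ guard is needed because estimation and imputation now spawn multiprocessing.Pool workers:

from RobustImputer import RobustImputer

if __name__ == '__main__':
    imputer = RobustImputer()
    imputer.read_and_scale('data.csv')       # '?' entries are parsed as NaN
    imputer.estimate_confidence_intervals()  # bootstrap std of pairwise inner products, one worker per feature
    imputer.impute()                         # one worker per original column, then unscale
    imputer.write_to_csv('imputed.csv')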
Binary file added RIFLE/__pycache__/RobustImputer.cpython-39.pyc
Binary file not shown.
3 changes: 3 additions & 0 deletions RIFLE/preprocessing/__init__.py
@@ -0,0 +1,3 @@
+from ._polynomial import PolyFeatures
+
+__all__ = ["PolyFeatures"]
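
The package only re-exports PolyFeatures; _polynomial.py itself is not shown in this diff. Judging by the calls RobustImputer makes (fit, transform, get_feature_names_out, a positional degree argument, an include_bias flag), it mirrors sklearn's PolynomialFeatures API, presumably reimplemented because sklearn's version rejects the NaN entries this imputer works with. A hypothetical degree-2 sketch under those assumptions:

# _polynomial.py -- hypothetical sketch; the real module is not in this diff.
import numpy as np


class PolyFeatures:
    """NaN-tolerant degree-2 polynomial feature expansion (sketch)."""

    def __init__(self, degree=2, include_bias=True):
        self.degree = degree  # only degree=2 is handled in this sketch
        self.include_bias = include_bias
        self.n_features_ = None

    def fit(self, X):
        # only the input width is needed to lay out the output columns
        self.n_features_ = np.asarray(X, dtype=float).shape[1]
        return self

    def transform(self, X):
        X = np.asarray(X, dtype=float)
        cols = [np.ones(len(X))] if self.include_bias else []
        cols += [X[:, i] for i in range(self.n_features_)]
        # degree-2 terms; NaNs simply propagate through the products
        for i in range(self.n_features_):
            for j in range(i, self.n_features_):
                cols.append(X[:, i] * X[:, j])
        return np.column_stack(cols)

    def get_feature_names_out(self, input_features):
        feats = list(input_features)
        names = ['1'] if self.include_bias else []
        names += feats
        names += [f'{feats[i]} {feats[j]}'
                  for i in range(len(feats)) for j in range(i, len(feats))]
        return names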