From a19cefa48347bc2dda9b260eaffb1969ade5c35c Mon Sep 17 00:00:00 2001 From: sinab Date: Thu, 23 Jun 2022 10:03:06 -0700 Subject: [PATCH 1/9] + Add Multi processing --- RIFLE/RobustImputer.py | 73 ++++++++++++++++++++++++------------------ 1 file changed, 41 insertions(+), 32 deletions(-) diff --git a/RIFLE/RobustImputer.py b/RIFLE/RobustImputer.py index f6467b4..387828d 100644 --- a/RIFLE/RobustImputer.py +++ b/RIFLE/RobustImputer.py @@ -2,6 +2,7 @@ import numpy as np from sklearn.preprocessing import StandardScaler from math import sqrt +import multiprocessing class RobustImputer: @@ -34,47 +35,55 @@ def scale_data(self, data): transformed = sc.transform(self.data) self.transformed_data = pd.DataFrame(transformed, columns=data.columns, index=data.index) + def find_confidence_interval(self, feature_index1, feature_index2): + data = self.transformed_data + cols = data.columns + feature_i = cols[feature_index1] + feature_j = cols[feature_index2] + columns = data[[feature_i, feature_j]] + intersections = columns[columns[[feature_i, feature_j]].notnull().all(axis=1)] + + intersection_num = len(intersections) + + sample_size = int(intersection_num * self.bootstrap_proportion) + + if sample_size < 2: + max_vals = columns.max() + max1 = max_vals[feature_i] + max2 = max_vals[feature_j] + self.confidence_matrix[feature_index1][feature_index2] = max1 * max2 + + estimation_array = [] + for ind in range(self.number_of_bootstrap_estimations): + # current_sample = np.array(intersections.sample(n=sample_size, replace=self.with_replacement)) + # For debugging + current_sample = np.array( + intersections.sample(n=sample_size, replace=self.with_replacement, random_state=1)) + f1 = current_sample[:, 0] + f2 = current_sample[:, 1] + inner_prod = np.inner(f1, f2) / sample_size + estimation_array.append(inner_prod) + + self.confidence_matrix[feature_index1][feature_index2] = np.std(estimation_array) + def estimate_confidence_intervals(self): data = self.transformed_data dimension = data.shape[1] - confidence_matrix = np.zeros(shape=(dimension, dimension)) - - cols = data.columns for i in range(dimension): for j in range(i, dimension): - feature_i = cols[i] - feature_j = cols[j] - columns = data[[feature_i, feature_j]] - intersections = columns[columns[[feature_i, feature_j]].notnull().all(axis=1)] - - intersection_num = len(intersections) - - sample_size = int(intersection_num * self.bootstrap_proportion) - - if sample_size < 2: - max_vals = columns.max() - max1 = max_vals[feature_i] - max2 = max_vals[feature_j] - confidence_matrix[i][j] = max1 * max2 - continue - - estimation_array = [] - for ind in range(self.number_of_bootstrap_estimations): - current_sample = np.array(intersections.sample(n=sample_size, replace=self.with_replacement)) - f1 = current_sample[:, 0] - f2 = current_sample[:, 1] - inner_prod = np.inner(f1, f2) / sample_size - estimation_array.append(inner_prod) - - confidence_matrix[i][j] = np.std(estimation_array) + p = multiprocessing.Process(target=find_confidence_interval, args=(i, j,)) + # 1) start and join the process + # 2) Check whether the code works properly + # 3) check whether the solution is acceptable (same as the previous case) - for j in range(dimension): - for i in range(j + 1, dimension): - confidence_matrix[i][j] = confidence_matrix[j][i] + # + # for j in range(dimension): + # for i in range(j + 1, dimension): + # confidence_matrix[i][j] = confidence_matrix[j][i] - self.confidence_matrix = confidence_matrix + # self.confidence_matrix = confidence_matrix def impute_data(self, column_index): data = self.transformed_data From fb815bb892f8a19d746686281301c185e1247893 Mon Sep 17 00:00:00 2001 From: gaiadennison Date: Thu, 23 Jun 2022 15:15:14 -0700 Subject: [PATCH 2/9] added multiprocessing for confidence intervals --- RIFLE/RobustImputer.py | 28 +++++++++++++++++- .../__pycache__/RobustImputer.cpython-39.pyc | Bin 0 -> 6144 bytes RIFLE/run.py | 18 +++++++---- 3 files changed, 39 insertions(+), 7 deletions(-) create mode 100644 RIFLE/__pycache__/RobustImputer.cpython-39.pyc diff --git a/RIFLE/RobustImputer.py b/RIFLE/RobustImputer.py index 387828d..503b818 100644 --- a/RIFLE/RobustImputer.py +++ b/RIFLE/RobustImputer.py @@ -3,6 +3,7 @@ from sklearn.preprocessing import StandardScaler from math import sqrt import multiprocessing +import time class RobustImputer: @@ -36,6 +37,10 @@ def scale_data(self, data): self.transformed_data = pd.DataFrame(transformed, columns=data.columns, index=data.index) def find_confidence_interval(self, feature_index1, feature_index2): + + # print starting point and features for each process + print(f'starting find_confidence_interval with {feature_index1, feature_index2}') + data = self.transformed_data cols = data.columns feature_i = cols[feature_index1] @@ -66,18 +71,39 @@ def find_confidence_interval(self, feature_index1, feature_index2): self.confidence_matrix[feature_index1][feature_index2] = np.std(estimation_array) + # print ending point and features for each process + print(f'finishing find_confidence_interval with {feature_index1, feature_index2}') + def estimate_confidence_intervals(self): data = self.transformed_data dimension = data.shape[1] + # initialized confidence matrix so that we are not subscripting a NoneType object + self.confidence_matrix = np.zeros(shape=(dimension, dimension)) + # start timer + start = time.time() + + # list to keep track of processes because all processes must be started before any can be joined + process_list = [] for i in range(dimension): for j in range(i, dimension): - p = multiprocessing.Process(target=find_confidence_interval, args=(i, j,)) + p = multiprocessing.Process(target=self.find_confidence_interval, args=(i, j,)) + p.start() + process_list.append(p) # 1) start and join the process # 2) Check whether the code works properly # 3) check whether the solution is acceptable (same as the previous case) + # join all processes and verify they have ended + for process in process_list: + process.join() + print(f'Process p is alive: {process.is_alive()}') + + # end timer and output time taken + end = time.time() + print('Done in {:.4f} seconds'.format(end-start)) + # # for j in range(dimension): # for i in range(j + 1, dimension): diff --git a/RIFLE/__pycache__/RobustImputer.cpython-39.pyc b/RIFLE/__pycache__/RobustImputer.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..336d4c7ce433e9605eb4dc6747cc87d3585a5d97 GIT binary patch literal 6144 zcmaJ_ONGQ z)F_HbWGn?~bBs^3zH!ebQVuyq$}Li)Na<6g9HKqvV&xF>{naz<48fUE|MmXsufP6R z{Xw~0F!1~NAAftdaNIEdL5=aBg~nBU@kao{5Ugczrc0AsD9x7LcDTc6%xbyq9M9=G zYkAz$^YT3ZJww>S`Gp~zCnhflcH5}CJHTh$th2OoI|(`>km7bDXoV6zcDyc=291yZ zOf;_Ii$4YkH5J?xOqjngK#3)6l-NbMC>@a#9!gi_MFC|_6h#T8C(5FNGB2jYG|GaQ z0WHwWEDsH_ti5$j6)fp zSqq{7t2$BK?VP#MZEO$)@tIp6e(>>kC*e5TS9*_A&-bHFl=yxT_+mn8F!z=J=Iq36 zt&Em5=$b}98x;qizP9h9k{AP~*fw3BnOIMZzNyRsuB>TpGiIRfWc9w47)@8uc4&alfsrL>$UY{)7|dea!PikO2sSYeQ1$ZZ?_qADr}vr8yllx0;?-cLFJc z$J~vBb`PF_OaSRuCzPoh^m<`Oq*k1WdQB?|udIeavLQoQ45;9_SK8;b7;zhl8uR?i zsk_x?Mr&D{-Z*S%!^x+|n?B4eFJ|QUakLq755)Vhq%pTqJE!4!KD}#t{p<`luHco% zhLo^;eug16SI^-)&kOoYWPtxirRP_%kgWg6?2a762{0pG0WcgAbr1g%TQohhVp5&{ zRnuk_%VyIrY;&5?zhrJMPpZSWp`UPhbRjN2tsGyY0ueF#u%1mONYKs1Y+4HTbJZl# zC%2b40|&xwL3HiJRi+l)_xld&4q7I}_*CCbau7A6nI{FpTI-of3NY<C+G8ev1TH;TDHeLiU;0X$5)$$-1C01cI z2uj8ESi$z#=IKe{_`et@oW*-UT*9X%iooc&#T#q@eAt0#P|g@ zH!=Q}f_!8%G~eIW=e{sHcmLu9S5B&n6P&22bK3Wa{yX5=SU0x2STk_jN6~Wx zBj?oxXxveCOcd2wgkPui=rKKdOuePv#wfLR`TxpJTlV^v#(oC6NSuFH>%zOF8<~C# zbzlJxzfu-b$-}UjI$V4gzrT-qcCKhw zH|+lAiadhR9g6_HdUhyy+adqGLx^`M%-x~LOJ1c~jR1MO9RUzsy*6cN1o9(l{uxVl z{`2C+i~7;bFH)eE1xmKS`;&V~m!hKH)A?#rP}8sIt&x;wxUkZTBC^b0r%+jnncNCZg=2^C2E0 z*IpBU#k{0n&BJ#e_a4LV(i1RcpQY>$cOPr#&GUDB1iwDc!i(868?itV^V-<+JGn;$ zcUyv&h7sb8u$2VJTIj-U2l0LXGhWo?9Y29z=2_<*p4Xjgct3Kl(S6`kEoIkJc9%QX z{dUyRMev9_ce54>w7i}SMbt=Q`5oebBfAg%h8g@gHjb~4smLPP>5H@4mRrZhS;uy#w-NB#=Tb1&RA6Xj3Ve^E`pctjJLeIIINGU zJg;lA9f!Dnw4LN$2p$#hAa2%22?e)k)Yfxc&FqNh<0R~5v3hOy7!gR$G9xcfdYyPR zI`(zq(Wr54eB>I9I9iHQcAw`TkG6j`9xG;v5&bTe;BowW0G+%rU4-b?PIoH=OMg5iC~}pz@lTil3AmmAD?GY z=wCpc#yxftao<6d1!`e^6(b%nWwU7o)fPeBEOwg4w}d@s&CP|$=y2?O6R(9)riD~1 zegJ^@LcbOUY}@$KRE(a!w3qUR&v_UmEmUcxu3AnseyGJHQy^o|5mj zzA(OEUzn>|ZqHNJ4f4u(q9ZY(S6njIPw6DZD7mK1;$~?nikF62s+9>!=^;zTk&-9z z62KaE`+S~WHcvIin(>qoEpx`N&2{_sSn2`K33KIa${szpcwyIKaHkz2XU6wXvW{|? zz6c*gnWT*+Y6_RgY&E*=-esK7_!qV|Hc!5ey(^bs>hQe0jLK-qD2wtYR^h>k4YRxn zJxjF3l@yN49}}1c7$%Zg9$j%CGYWK~lGm4aRfCnLUlNSlRO@KCh Y9VMxEDLdX}Fvp0=AJc7?@JDI=7pW8;aR2}S literal 0 HcmV?d00001 diff --git a/RIFLE/run.py b/RIFLE/run.py index e904f70..2e55759 100644 --- a/RIFLE/run.py +++ b/RIFLE/run.py @@ -2,11 +2,17 @@ import sys -missing, imputed = sys.argv[1:3] -imputer = RobustImputer() +def run(): + missing, imputed = sys.argv[1:3] + imputer = RobustImputer() -imputer.read_and_scale(missing) -imputer.estimate_confidence_intervals() -imputer.impute() + imputer.read_and_scale(missing) + imputer.estimate_confidence_intervals() + imputer.impute() + imputer.write_to_csv(imputed) -imputer.write_to_csv(imputed) + +# This guard is necessary to avoid creating subprocesses recursively. +# Without it a runtime error is generated, but there is likely a more clever way to do this +if __name__ == '__main__': + run() \ No newline at end of file From 2e0ecfa39be59b78668f4be69f283e030f135ae9 Mon Sep 17 00:00:00 2001 From: gaiadennison Date: Wed, 6 Jul 2022 15:17:37 -0700 Subject: [PATCH 3/9] impute parallel --- RIFLE/RobustImputer.py | 117 ++++++++++-------- .../__pycache__/RobustImputer.cpython-39.pyc | Bin 6144 -> 6353 bytes RIFLE/run.py | 6 +- 3 files changed, 69 insertions(+), 54 deletions(-) diff --git a/RIFLE/RobustImputer.py b/RIFLE/RobustImputer.py index 503b818..d016030 100644 --- a/RIFLE/RobustImputer.py +++ b/RIFLE/RobustImputer.py @@ -36,73 +36,64 @@ def scale_data(self, data): transformed = sc.transform(self.data) self.transformed_data = pd.DataFrame(transformed, columns=data.columns, index=data.index) - def find_confidence_interval(self, feature_index1, feature_index2): + def find_confidence_interval(self, feature_index1): # print starting point and features for each process - print(f'starting find_confidence_interval with {feature_index1, feature_index2}') + # print(f'starting find_confidence_interval with {feature_index1}') data = self.transformed_data - cols = data.columns - feature_i = cols[feature_index1] - feature_j = cols[feature_index2] - columns = data[[feature_i, feature_j]] - intersections = columns[columns[[feature_i, feature_j]].notnull().all(axis=1)] - - intersection_num = len(intersections) - - sample_size = int(intersection_num * self.bootstrap_proportion) - - if sample_size < 2: - max_vals = columns.max() - max1 = max_vals[feature_i] - max2 = max_vals[feature_j] - self.confidence_matrix[feature_index1][feature_index2] = max1 * max2 - - estimation_array = [] - for ind in range(self.number_of_bootstrap_estimations): - # current_sample = np.array(intersections.sample(n=sample_size, replace=self.with_replacement)) - # For debugging - current_sample = np.array( - intersections.sample(n=sample_size, replace=self.with_replacement, random_state=1)) - f1 = current_sample[:, 0] - f2 = current_sample[:, 1] - inner_prod = np.inner(f1, f2) / sample_size - estimation_array.append(inner_prod) - - self.confidence_matrix[feature_index1][feature_index2] = np.std(estimation_array) + dimension = data.shape[1] + for feature_index2 in range(feature_index1, dimension): + cols = data.columns + feature_i = cols[feature_index1] + feature_j = cols[feature_index2] + columns = data[[feature_i, feature_j]] + intersections = columns[columns[[feature_i, feature_j]].notnull().all(axis=1)] + + intersection_num = len(intersections) + + sample_size = int(intersection_num * self.bootstrap_proportion) + + if sample_size < 2: + max_vals = columns.max() + max1 = max_vals[feature_i] + max2 = max_vals[feature_j] + self.confidence_matrix[feature_index1][feature_index2] = max1 * max2 + + estimation_array = [] + for ind in range(self.number_of_bootstrap_estimations): + # current_sample = np.array(intersections.sample(n=sample_size, replace=self.with_replacement)) + # For debugging + current_sample = np.array( + intersections.sample(n=sample_size, replace=self.with_replacement, random_state=1)) + f1 = current_sample[:, 0] + f2 = current_sample[:, 1] + inner_prod = np.inner(f1, f2) / sample_size + estimation_array.append(inner_prod) + + self.confidence_matrix[feature_index1][feature_index2] = np.std(estimation_array) # print ending point and features for each process - print(f'finishing find_confidence_interval with {feature_index1, feature_index2}') + # print(f'finishing find_confidence_interval with {feature_index1, feature_index2}') def estimate_confidence_intervals(self): data = self.transformed_data dimension = data.shape[1] + # initialized confidence matrix so that we are not subscripting a NoneType object self.confidence_matrix = np.zeros(shape=(dimension, dimension)) # start timer start = time.time() - # list to keep track of processes because all processes must be started before any can be joined - process_list = [] - for i in range(dimension): - for j in range(i, dimension): - p = multiprocessing.Process(target=self.find_confidence_interval, args=(i, j,)) - p.start() - process_list.append(p) - # 1) start and join the process - # 2) Check whether the code works properly - # 3) check whether the solution is acceptable (same as the previous case) - - # join all processes and verify they have ended - for process in process_list: - process.join() - print(f'Process p is alive: {process.is_alive()}') + pool = multiprocessing.Pool() + pool.map(self.find_confidence_interval, range(dimension)) + pool.close() # end timer and output time taken end = time.time() - print('Done in {:.4f} seconds'.format(end-start)) + print('Confidence done in {:.4f} seconds'.format(end-start)) # # for j in range(dimension): @@ -112,6 +103,7 @@ def estimate_confidence_intervals(self): # self.confidence_matrix = confidence_matrix def impute_data(self, column_index): + print(f'starting impute_data with {column_index}') data = self.transformed_data confidence_intervals = self.confidence_matrix @@ -243,22 +235,41 @@ def impute_data(self, column_index): y_predict = np.dot(data_i.T, theta) predicts.append(y_predict[0][0]) - return predicts + res = (column_index, predicts) + return res def impute(self): + + start = time.time() + original_data = self.data standard_deviations = original_data.std() means = original_data.mean() data_cols = original_data.columns - for column_ind in range(original_data.shape[1]): + dimension = original_data.shape[1] + pool = multiprocessing.Pool() + predictions = pool.map(self.impute_data, range(dimension)) + pool.close() + + for pred_index in range(len(predictions)): + column_ind = predictions[pred_index][0] print(data_cols[column_ind] + " is imputed.") - predictions = self.impute_data(column_ind) - predictions = [x * standard_deviations[column_ind] + means[column_ind] for x in predictions] + temp = [x * standard_deviations[column_ind] + means[column_ind] for x in predictions[pred_index][1]] - original_data[data_cols[column_ind]] = predictions + original_data[data_cols[column_ind]] = temp + + # for column_ind in range(original_data.shape[1]): + # print(data_cols[column_ind] + " is imputed.") + # predictions = self.impute_data(column_ind) + # predictions = [x * standard_deviations[column_ind] + means[column_ind] for x in predictions] + # + # original_data[data_cols[column_ind]] = predictions + # self.imputed_data = original_data + end = time.time() + print('Impute done in {:.4f} seconds'.format(end - start)) def write_to_csv(self, output_filename): - self.imputed_data.to_csv(output_filename, index=False) + self.imputed_data.to_csv(output_filename, index=False) \ No newline at end of file diff --git a/RIFLE/__pycache__/RobustImputer.cpython-39.pyc b/RIFLE/__pycache__/RobustImputer.cpython-39.pyc index 336d4c7ce433e9605eb4dc6747cc87d3585a5d97..c9dcbcf780b287e9da19c75c43a1db232dbe70d3 100644 GIT binary patch delta 2722 zcmZ`*&2L*v6`z^=;p?yK?>Np!V>f9UKetJn_Vw!(D7>4zc+_M(X(F(eH`Dr`22Ow7_lx31&J>Ac!h;V zAc&K3BJQ$NzGH?aBt~eFMM`R;H_^vPkWI0Dfw4pCKchzGm~}0Wd*X%Cj|VozhbKHt z1TjirE2qdnB`N6{PnrAyIivWPg(*sr4fQno7DZ@pY?qJG?1NMFtPXg@}L#eZ4HnH zL~dy{KWNEdVh*rG^!@mi%KPB@c{EG_3?pwi!r^)D7|}oCw}-dDPvO(3T*YAm=5gZR zX5Rr4z=3r+qL_6>Xy}LqRytxWP$pUXh)B!Y|$i=SK3r<6KUWsB_C4-1n-p**1%uFQ%jVw0FVp5LWu?Tao zsVt~Gs5~x*`BZp3MV&e+s zaD@u5IWchZ`cf5uQjt?A!?# zf}!%r%yZo`EmW9t#+aW9e~l`m`Tr&3$Y6(mB3EgLi!d=pi!*;N1r=ICB3dGM>IPcU zPFSX8pb57^+tFjaOOfFUb{A-Q8bmnJ8P5z?@Krq))@X$ip1r`HvQr^nV(KEDK~}1k z!2w=1aF(b>wP^(H;o^cfM>QC*LRC*(XJPeJ9DXyX1j~wxHh#kIs7!el$SuLdXf=I- z$D;e`b3md%>sWT3Vsv(Dy|(?lC&CTS3F~wY2~vh{VOzvQ_snR8R!LD+TAi9O=0#S9 zX|x7Y8cL7Wse$nYtzV<_bV0{{yuj%EbPk+)F*r*X@4>1~+N3&8Ysg2mi4EThH%^Vi zM<@ac>Mh!OBeucOr$AQLrf$)Om%xITykztlcNa&h(v^=)3$K05@C~~5F=I<~Syuor zypCgC(#AKa(anuTkpan7a8@DNrYi{ORa)~>bP2Vjp`~kDx<*&&ZAcXbh=wza-q`BY zJp8U+ehZ|!Qd!pKlEBbtNNu* z@T97|sR$(dCfujFQG>p5^qr%UTaHGZI#!x*VRJ}0p`M*LFun|LAP-Ri2Suql0 z_Mz$x^jxJ_opw?9;eQjLsd9x!!|QTos&b{C53_QW2vG5-H+<)FNX1Aq02LSSLl&f; zknsTnX_RpVID5{X^XFpE@J!G8x_)UuhK=s33!>g0Vyb_e9X`OAJ#=U0#)Gtnq)Ktp znOv=WOJdd}9K0EgT&qpukc1pUMe-dj<&+%A|cB4Dx&Giju!5Ol+Qi?w0d64 gm-8T-R`2Lw@DA1ER~76^%gieO>XkK5-2yNwgid#$SK zJ{eR{sYInnZ-K-HyAnt&`Ro9R6*~}YSa<;v8^nSo5-ZA?vFXp&+_}G*d*|LecizuO z-d=TUsZ^Zd_s;iUU5w9t?*3|YbV(0?dCU&R3mAT!Ge)e*LY?#`4-GPikF(IktYNlD zCzGr#iQ6RqL#nIwSX1}7r`?hIPGFF8lZUauq!{+I@(fg*;+}q)i4QttckLbv6OPgHN<`g99H^Ov~=qAZ&rYH`s3zNnjy~o20WjyAw z_cYJklgtay^aa;u97CdgKY z*k~Uk11plDLrym-rR{5h*)uLO-< zv2&^J`;FFC)qZt=i&ZH$eQX|xWwolg(`fb0UD0R-eQR?^Y}bNHTm@DIt{U^OZOd=$ zw8ZRjtf(f3`<}nj@Pj^Y^m((-cdPc09kpXeXI_S)Ujks7#p7B|cXWd%xy$q1;jZQI zxDkCDJJmS}jUvb1a0xU?p~h^9jkB(S2s?Oqf(3!WWTXbx>dgp05u3u3anpJqr>Q#3QM*yDLtfG0Ey z%S%#^=4lD|veKWXgS4zl-5FtYaIgkBJQS4a&}lfiLMv3k;dF6^R$y>3T-ejwXUI~K zJxqrmD`jw~^pLBEX~BzO!&NUH<=jg+(i9#2-?i}ABaSc7u}7TE(-GBIz3@1WwW^$7 zqNOG(I?oWe8E_6jvPMTyzcV!JC1@3|(2|nQD(NgeK~F*|DTL^aVT>L#wP&>V)b@u_ z*)lpORAHTvg*GU*VL^?f^(Iy8WulBGB=GDqaU^P`{gri~f955S&Gt_Zpt1gZnu!Cz z#7mO=$%f$Hd-v{DW}Z;lBTeX%r7`>iZ&&d8P<^ihu zm7PmsIr=2?3O^bBo%!1oPlNsCU)c>lFLF{%fe~Q+L}PpRYEVD#)zG=Y@V^0I(YcYY zqlx@N#{hxm$>bM>e+@vjQBJ(ed+fG`8D9psK{xeK!&d=sn(}g}gsk6Y^3i*rAv#(C zJUnNtYriimUcbd|@mt!a?&+TKu0#5U{DQz&3Gu;-Q>?v6JVorhbN?la7Dw-L9{n)7 z_CzJ7-VkvJE>(@~I-)QJR6SCIS4=(t;!O!M02M=IqZ?zB&s7t=1$cJ+K^)FW zB^3;75R8Ao@MVe^xA3Ys+|qJ9!{dBRbG67We8}zSiz~5t hXU^#`+et?+7E63Fsu!0M$E5g42@L2IqA!Y5{{caqSS0`e diff --git a/RIFLE/run.py b/RIFLE/run.py index 2e55759..4e82d5c 100644 --- a/RIFLE/run.py +++ b/RIFLE/run.py @@ -1,5 +1,6 @@ from RobustImputer import RobustImputer import sys +import time def run(): @@ -15,4 +16,7 @@ def run(): # This guard is necessary to avoid creating subprocesses recursively. # Without it a runtime error is generated, but there is likely a more clever way to do this if __name__ == '__main__': - run() \ No newline at end of file + start = time.time() + run() + end = time.time() + print('Done in {:.4f} seconds'.format(end - start)) \ No newline at end of file From 3ef0782e5af4ad882e3731f83fb8e9ca03f41bea Mon Sep 17 00:00:00 2001 From: gaiadennison Date: Tue, 19 Jul 2022 12:17:25 -0700 Subject: [PATCH 4/9] polynomial features transform --- RIFLE/RobustImputer.py | 20 +- .../__pycache__/RobustImputer.cpython-39.pyc | Bin 6353 -> 6485 bytes RIFLE/preprocessing/__init__.py | 3 + .../__pycache__/__init__.cpython-39.pyc | Bin 0 -> 234 bytes .../__pycache__/_polynomial.cpython-39.pyc | Bin 0 -> 5065 bytes .../__pycache__/validation.cpython-39.pyc | Bin 0 -> 1109 bytes RIFLE/preprocessing/_polynomial.py | 218 ++++++++++++++++++ RIFLE/preprocessing/validation.py | 37 +++ 8 files changed, 270 insertions(+), 8 deletions(-) create mode 100644 RIFLE/preprocessing/__init__.py create mode 100644 RIFLE/preprocessing/__pycache__/__init__.cpython-39.pyc create mode 100644 RIFLE/preprocessing/__pycache__/_polynomial.cpython-39.pyc create mode 100644 RIFLE/preprocessing/__pycache__/validation.cpython-39.pyc create mode 100644 RIFLE/preprocessing/_polynomial.py create mode 100644 RIFLE/preprocessing/validation.py diff --git a/RIFLE/RobustImputer.py b/RIFLE/RobustImputer.py index d016030..e5bf328 100644 --- a/RIFLE/RobustImputer.py +++ b/RIFLE/RobustImputer.py @@ -4,7 +4,7 @@ from math import sqrt import multiprocessing import time - +from preprocessing import PolyFeatures class RobustImputer: @@ -16,17 +16,13 @@ def __init__(self): self.validation_data_proportion = 0.1 self.data = None self.transformed_data = None + self.poly_transformed_data = None self.confidence_matrix = None self.imputed_data = None def read_and_scale(self, filename): - self.data = pd.read_csv(filename) - - sc = StandardScaler() - sc.fit(self.data) - - transformed = sc.transform(self.data) - self.transformed_data = pd.DataFrame(transformed, columns=self.data.columns, index=self.data.index) + data = pd.read_csv(filename) + self.scale_data(data) def scale_data(self, data): self.data = data @@ -34,6 +30,14 @@ def scale_data(self, data): sc.fit(self.data) transformed = sc.transform(self.data) + + poly = PolyFeatures(3) + poly.fit(transformed) + poly_transformed = poly.transform(transformed) + self.poly_transformed_data = pd.DataFrame(data=poly_transformed, + index=self.data.index, + columns=poly.get_feature_names_out()) + self.transformed_data = pd.DataFrame(transformed, columns=data.columns, index=data.index) def find_confidence_interval(self, feature_index1): diff --git a/RIFLE/__pycache__/RobustImputer.cpython-39.pyc b/RIFLE/__pycache__/RobustImputer.cpython-39.pyc index c9dcbcf780b287e9da19c75c43a1db232dbe70d3..ca09ea47e348ed8f81630f0ec16419c3319ed170 100644 GIT binary patch delta 1582 zcmZuxOK%)S5T5RN?99HrCcA6n>;_{ejt5&x983bSm4pC^;9!9qUW3GFvNJYob{^`U zh1golNJiX{uv$on1BnkgaK$IMz#rh@0|Jc%cV7?}Bu+6^J;GQKJL+$%ySl2sdUWs4 z{d}S7RVqaVe&^^fx81Wl)fRbeXJ%pCSA4Bk3`f_l*w~-`J*3*(b}2UD(+e zuh}Fkza010ZvvQQFi~ea#r?17%HFE5NF`tN8R|6+@O9bwJjNFbcjZS4)z;wse4D z0!`$V>D|Z=?qr2d-0O!Cw<&AGGHc-@NZpmZ6fS0fyKz5dH8haUhd{ConI=iXp}gaij$8-6qhXz#8sf9=F~vh^ zWLT$@Fl50LV!&6XWonWld^j^6nA+aHcAF4G%p3D{JUY|)J6;lGI>fRXu(lW&t!Zq% zObxv|Y#1a~!YPdsNB0=>)*XX;VbTj&S$t=VHL<`>_1_R>2Ip8+{AnzZwwNt!myT6i zd{bzX8S!i357Pj%FA19vKbZCiGGP-4)96j#2w2S7u&7#7#yMP`6ql`536IJI19W|G0l^38Ub{Y|ZFbYu|VI+c{Du1sv+A$9SIRo|N& zuG7{-87Otbq@U(@FWg%`v5%!b(|Fa$_lXZ0mzq+P_pZ3u>+-Y{hshN~eBZeB<{Yw+ zH*^N6Xkh?UWtYm@U^>{72=fSUAV}g*p@i)7WITe>B>=ma_x00aqj}1h#pP$j7tP7m zWfUcmw@{L7jGX&l;Epm6!whQ_ITI3*(AH0B`p03Z5k_0tAvt5HzY!!(gG?}Ekd;ruCsPpZ`VH> zdm%wv?IDO;C87}$LPBbLYR{37xFB)kR(e24K5%Y1RB+=&I&WN}5Y;Pxelu_8z3+W9 zZ+}~OIA3wf<)XwNX8g3;o!+Xn(7Ri+?bd@8?-=H(> zebi>R&?M5?J@hU67|)_Hwuvt*VS_Xsi*E6;t-I1C={~+-;!NvAy?ziTnc|18w{8|Q z9J>~wp5xk`qWx?ST+u|L(;j1CStaMtMvWPnmL9LAR7Q0i=0 zYT^WO&!Z)FLYr)WmR4oWbJ9NbY;jEUw9G!$+Ni<4(>AO72sYW*q=}BPv*ZbZn{yx0 zqijhxCx8w;3NQ;Hl`lLRC3J)h^l5bgrl;9Gy;XvIxdtcM1HDn&6Q2F0PoWRC$BREI zEeK5y10aA`I6Q_z%o^3$qmlUHy>_ z?OH&KLLWm5B;X{Nl!#6UnIV&8DCy*|@S7z|jrl);wbBU--!2py|HNd30wEbymMWf{`^uo|1i9Q?mJc9rL literal 0 HcmV?d00001 diff --git a/RIFLE/preprocessing/__pycache__/_polynomial.cpython-39.pyc b/RIFLE/preprocessing/__pycache__/_polynomial.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f4948d4abbef309b9a7d1f40dd00fd95392fc8a0 GIT binary patch literal 5065 zcmb_g&2QYs73U0xTrRcxu;Z_!X+fk2nr#y~{YaalitE;Kf~2nMHm(~ik`mMoCGCn! zu7|UcwBTMU2XPJt(p%9l*uCl6L;ryM5h=i9d-5eH&|Ba{{k`GtYS&SaODPOBA8&?l z-n{pFzu9PU(YNrF?4KLYKWSP2rpD}Jqj4D}Z{Z~^$r7to{bwzPHcxoZZrP0bY}7@| zF?E)>ts447;`QpSI{F;xCVp?BwZN=bEm@P^ZA*Fwyw#BG4Xasyfc35QCM%uxcGO8x z`D#1sZFJH|cd|4OcRPAJRB=Cv+Ho&V^&u+-2|Qq!2kG-+dpmCLgqv}s2PzKJs2As9 zCvDTZvyVXIGK$$zY|$=STs|hbwExVckj`!FCl=C`wcFMKYdNTUvTo{bT$8?BFm+F! zk&ANaHfz=8vRt`swS0L>p2q(L*}#s@mW}IKvj0k@PyT@8n>V0n13D!G8HXF4C~s3Q zJ+uq5HMOsx;CMsJH>^Xp-sGh#<1H1(iVm)6mm-goP1Qid^c-4h3H48| zUHo+(tNh|t)QM!Arky-XFTR|$2V}hb;;&wN<=WM`cOS-8Sv$`2PP%n5>_hi7>vf`J zwZC6_AxzoPVR#B_=VU^QxomiLc89AInYI}x89XjZ4TUzBuj8ey_btsNE7$=a3C-`> z?^*A$O-=`J+tZ>D1()o+b--VS`R}ulQ`qWt&F@%R++hbeq61zynClb{%SZ|*hF|1(G3iRchu=p)bMpX+Cz_~aoO1EOzm#;owvd$q_4hBj; zow!tP46Bx@x4yO&r}182U3whN+hWuV94J%hbJ=z=BjT4kkt|O&OK{G40&tOia z`(ilzL0Zc=o5cn5*$QhMdA49@*zk!j-p%Jun2x}>?R8W%>|OBlD3VEj*OvALYa|Ms z+>TI}@t@jGA&i^@Ip1Rvxf5%j@krh+(XZyS?s$1qrbppD%$TG>FO zIb*FuVI1udqnxPfW6;tB+SWMKG}JAO!qU3Sf6s5xF*2CH4zEq(bW3jox_H_zJwSXr z8>G5iI*v4eP1xk>G?pz{`;e(c)MmoZUrRc<#_9Agy@ZkfqFBRc=4{;0b^(9kOx#VU zG;8W>wB9%HDi^|;I6`ASkkr#8R+X z&_#XaXn5u%1=pu3fF==AdMM`XH7W(pe#A8r!5tvt`_`5Enne zAg6P{)J@6nImRb!DS)SLfk-Qy$B*&UO_AhAZeic674Cfo9K4`;*DGvYm+oDTgQ^XG zJoTPeV?B0_V4PY5lUj{8X<%j!x9XWBJ>xqjv_4_MoVU=Xrm#$o4Osr>TYSVu98h8H z*ypDD1Nf^YJ3R_a!*F<6sowu#)K^K|~(r{%gi1@sw5u{)_OZN&W{E0>IKpMc!z z9KQQAl$^{gg!`F$bea!em^+%SSWl*oc?5!E#*-%4U$So#qDZ~pkDI=_h!qu`VC6N= z?slx=(uTX2qMLQn(jIpDr5pA8F*L13d8AadU%J^wH*RZqrSY!sVS&=E+zI$UOJ_=3 zWxD|NROY3iQ>eU<(p=?*XHV4pGRE@@cv+4MhzB6CB`Tuv(60&BV8f@reEV~5N~bu5 z{1=hO3f8r7k{r%y6I;>XEL~fRE)unpRF`bWQfu0kX5>#`i+`~X*zZ_+VLZy!4b*D+ z4af)@0Y$Jvk4Evop8j>brr%xYHn8e{^dgOc5I8$%QT;7qN|+JWjV#E7`TmEdRiCsV z=|LMueMmKw`VPxK!dG2d;}Om3kmTxPjMVavAyYFtpGM7Sb}S8vCE9znBFW48T>)K| zs}b5ww?5OqQh;+mp>X{!*5mjKj;l|9c;uw6r@@DTpDj7 z9^WB=*YQ@Yo(;A#9h7W!_?tIktwHaC@jRUc!&qfOlu8I6)6xI~uYj$@!EQE4WT4`F zkZ6cNGKhP9y+6SO+R8AU>|-O-+hbu+VzZCq57I9FFWVcw_N8_XkX!YjpTxlz?Y>f7eI1r+o&rJJ zj{0$xg8U9BH!@7xjt!45YZbN0OYb#0K@}yXe=|x3@l~ZVWw?oY7VD_z@G2MDS&{%i z=$cg8(MB87CWP@tl6s1yPN!kzO%r6xL*zcw8N%pEQyBOu*_$N`tHf_o+%em+H#_)*-j;N2@OB`?B_>9}$r(F0CxeFOveCi&#ZYA$J z=-~nwERZ63l`tQA9@S0jp;sHO%z+b>n1dp^Cz1DELCMQGDAKHp#Db8sH#q=`I1_Fh zXgg{RFa(Av?RU8pn|uuy*^z_1D8NgtE&vEYzfRl}SFo`h@+fX1=dE{*jPvIB7#23fC$J#fl9=dwp z70fCcBj?uNXTIl++*^Od?iVEA5g#!15m^qU!L8BaXbCk3z5f_nbqO{VU9#pPfX*H* z!=lTx7M*IiSS($Dea{2($2v{qz*?zi%}SqqM0zZFZ|S}@S}vBy_W*zR!zozam{5;E zMmnNU?@w>6CjlnN+f}%T`aZoVcdb|SpM2-R5gxG747WcUQBv00QBF7sYB8NuI+>EO zYA#n@h8Q{q3)T8`VY)U{Q7MRSnhTry5jsn+2L$c(ll>}ke@=oFP4p{9f4{j>CI4Sg zPdyFus0S>`tD7Ew^E!fjpcF_vZW~GIfC0pNitbtvj(0(q(FXSctyq{ahK}4*boVVi zTvW}xmPo@lueV&hvB{R8cf}x%gS(A zCk#tJ411XzBvfyN;jKZGR3nN2plkp>b*J5n^>!vJc5hJ6!%rH3S-oG8GXe?qOM12G zbqsJ*11O?}#OQOnFnfGOaOV3c%T)Yx-cOB0b5XrcE4+$VS*KgJ&axyomv&{-$axxZ z0ObAsiViO3?M{DxHSfo5{HQYiap9dP>A-&h-4$`9ub-q=>8(}UzeGP8m>VYDE*r+= I4c@5z4=7XpRsaA1 literal 0 HcmV?d00001 diff --git a/RIFLE/preprocessing/__pycache__/validation.cpython-39.pyc b/RIFLE/preprocessing/__pycache__/validation.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..741511b4ae5fa51acd10618180dafc3bbffa61b5 GIT binary patch literal 1109 zcmZ`&KW`H;6t{hMNt&jSkPuxMydV*Ql5Q+ji_{j0f;v#jPzj~Z*}Yu3_)^;!NOWCD z#Rd{{3xbiazz1RQ%EZiqSjzJ?X&VqPdN0rKzu(Wt78a@q==9gOPAx#_yE|MZ9}f3{ z^-T~Qah#xC%uAeb?-X;N2glwn;dmQ0%0D0iZ9Q#ZTaSbkiW$+CtS59xoCgR4Llaa2 zUIW%&K^T-nSx&~@kyq$mq5C=5GOR`hACMdykNa=Q5zfI2)=h&y`MHY0SA;Lj{o9+ew+X#_n+_jV-fB*L}1xYMNHLmw84Eq?U{HHan6C5vNKhR{O%e$?90Lg`k- zV#bA(u})=cJq@#-kVdzjJbdtYV@CFs=&LjoTE{YK4OkL$X5v&f`$Jn@PGW7swAWv` z1s#3@qtTsq=-`|}Z~;tnIY~p7=oL_=S&fPa&O`w=(59;TCtf3FW?9yZE1BU94UL zfl4)8BNbf7HSC}HUWHu8#Qn~ETsi}b?}OJTJ_aAI5VZ(74ZPLxYyc0UKU8%v6xJ)f W3p|PeD$Bi;XNg#GFl#q##rp${F*ULP literal 0 HcmV?d00001 diff --git a/RIFLE/preprocessing/_polynomial.py b/RIFLE/preprocessing/_polynomial.py new file mode 100644 index 0000000..2f58824 --- /dev/null +++ b/RIFLE/preprocessing/_polynomial.py @@ -0,0 +1,218 @@ +import collections +import numbers +from itertools import chain +from itertools import combinations_with_replacement as combinations_w_r +import numpy as np +from scipy.special import comb + +from .validation import _check_feature_names_in + + +# Simplified/altered version of PolynomialFeatures from sklearn.preprocessing +# to preserve NaN values + + +class PolyFeatures: + + def __init__(self, degree=2, *, include_bias=True): + self.degree = degree + self.include_bias = include_bias + + @staticmethod + def _combinations(n_features, min_degree, max_degree, include_bias): + comb = combinations_w_r + start = max(1, min_degree) + iter = chain.from_iterable( + comb(range(n_features), i) for i in range(start, max_degree + 1) + ) + if include_bias: + iter = chain(comb(range(n_features), 0), iter) + return iter + + @staticmethod + def _num_combinations(n_features, min_degree, max_degree, include_bias): + """ + Calculate number of terms in polynomial expansion. + + """ + combinations = comb(n_features + max_degree, max_degree, exact=True) - 1 + if min_degree > 0: + d = min_degree - 1 + combinations -= comb(n_features + d, d, exact=True) - 1 + + if include_bias: + combinations += 1 + + return combinations + + @property + def powers_(self): + """ + Exponent for each of the inputs in the output. + + """ + combinations = self._combinations( + n_features=self.n_features_in_, + min_degree=self._min_degree, + max_degree=self._max_degree, + include_bias=self.include_bias, + ) + return np.vstack( + [np.bincount(c, minlength=self.n_features_in_) for c in combinations] + ) + + def get_feature_names_out(self, input_features=None): + """ + Get output feature names for transformation. + + """ + powers = self.powers_ + input_features = _check_feature_names_in(self, input_features) + feature_names = [] + for row in powers: + inds = np.where(row)[0] + if len(inds): + name = " ".join( + "%s^%d" % (input_features[ind], exp) + if exp != 1 + else input_features[ind] + for ind, exp in zip(inds, row[inds]) + ) + else: + name = "1" + feature_names.append(name) + return np.asarray(feature_names, dtype=object) + + def fit(self, X): + """ + Compute number of output features. + + """ + + _, n_features = X.shape + self.n_features_in_ = n_features + if isinstance(self.degree, numbers.Integral): + if self.degree < 0: + raise ValueError( + f"degree must be a non-negative integer, got {self.degree}." + ) + elif self.degree == 0 and not self.include_bias: + raise ValueError( + "Setting degree to zero and include_bias to False would result in" + " an empty output array." + ) + + self._min_degree = 0 + self._max_degree = self.degree + elif ( + isinstance(self.degree, collections.abc.Iterable) and len(self.degree) == 2 + ): + self._min_degree, self._max_degree = self.degree + if not ( + isinstance(self._min_degree, numbers.Integral) + and isinstance(self._max_degree, numbers.Integral) + and self._min_degree >= 0 + and self._min_degree <= self._max_degree + ): + raise ValueError( + "degree=(min_degree, max_degree) must " + "be non-negative integers that fulfil " + "min_degree <= max_degree, got " + f"{self.degree}." + ) + elif self._max_degree == 0 and not self.include_bias: + raise ValueError( + "Setting both min_deree and max_degree to zero and include_bias to" + " False would result in an empty output array." + ) + else: + raise ValueError( + "degree must be a non-negative int or tuple " + "(min_degree, max_degree), got " + f"{self.degree}." + ) + + self.n_output_features_ = self._num_combinations( + n_features=n_features, + min_degree=self._min_degree, + max_degree=self._max_degree, + include_bias=self.include_bias, + ) + # We also record the number of output features for + # _max_degree = 0 + self._n_out_full = self._num_combinations( + n_features=n_features, + min_degree=0, + max_degree=self._max_degree, + include_bias=self.include_bias, + ) + + return self + + def transform(self, X): + """ + Transform data to polynomial features. + + """ + n_samples, n_features = X.shape + # Do as if _min_degree = 0 and cut down array after the + # computation, i.e. use _n_out_full instead of n_output_features_. + + XP = np.empty(shape=(n_samples, self._n_out_full), + dtype=X.dtype) + + # degree 0 term + if self.include_bias: + XP[:, 0] = 1 + current_col = 1 + else: + current_col = 0 + + if self._max_degree == 0: + return XP + + # degree 1 term + XP[:, current_col: current_col + n_features] = X + index = list(range(current_col, current_col + n_features)) + current_col += n_features + index.append(current_col) + + # loop over degree >= 2 terms + for _ in range(2, self._max_degree + 1): + new_index = [] + end = index[-1] + for feature_idx in range(n_features): + start = index[feature_idx] + new_index.append(current_col) + next_col = current_col + end - start + if next_col <= current_col: + break + # multiply + np.multiply( + XP[:, start:end], + X[:, feature_idx: feature_idx + 1], + out=XP[:, current_col:next_col], + casting="no", + ) + # print(XP[:, start:end]) + # print(X[:, feature_idx: feature_idx + 1]) + # print(XP[:, current_col:next_col]) + # print('-----') + current_col = next_col + + new_index.append(current_col) + index = new_index + + if self._min_degree > 1: + n_XP, n_Xout = self._n_out_full, self.n_output_features_ + if self.include_bias: + Xout = np.empty( + shape=(n_samples, n_Xout), dtype=XP.dtype, order=self.order + ) + Xout[:, 0] = 1 + Xout[:, 1:] = XP[:, n_XP - n_Xout + 1:] + else: + Xout = XP[:, n_XP - n_Xout:].copy() + XP = Xout + + return XP diff --git a/RIFLE/preprocessing/validation.py b/RIFLE/preprocessing/validation.py new file mode 100644 index 0000000..f559f24 --- /dev/null +++ b/RIFLE/preprocessing/validation.py @@ -0,0 +1,37 @@ +import numpy as np + + +def _check_feature_names_in(estimator, input_features=None, *, generate_names=True): + """ + Check `input_features` and generate names if needed. + + """ + + feature_names_in_ = getattr(estimator, "feature_names_in_", None) + n_features_in_ = getattr(estimator, "n_features_in_", None) + + if input_features is not None: + input_features = np.asarray(input_features, dtype=object) + if feature_names_in_ is not None and not np.array_equal( + feature_names_in_, input_features + ): + raise ValueError("input_features is not equal to feature_names_in_") + + if n_features_in_ is not None and len(input_features) != n_features_in_: + raise ValueError( + "input_features should have length equal to number of " + f"features ({n_features_in_}), got {len(input_features)}" + ) + return input_features + + if feature_names_in_ is not None: + return feature_names_in_ + + if not generate_names: + return + + # Generates feature names if `n_features_in_` is defined + if n_features_in_ is None: + raise ValueError("Unable to generate feature names without n_features_in_") + + return np.asarray([f"x{i}" for i in range(n_features_in_)], dtype=object) \ No newline at end of file From 608ef1aa74578c67f1c3563d6f915e23211b7480 Mon Sep 17 00:00:00 2001 From: gaiadennison Date: Tue, 19 Jul 2022 12:42:23 -0700 Subject: [PATCH 5/9] read and scale bug fixed --- RIFLE/RobustImputer.py | 19 +++++++++++++++--- .../__pycache__/RobustImputer.cpython-39.pyc | Bin 6485 -> 6695 bytes 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/RIFLE/RobustImputer.py b/RIFLE/RobustImputer.py index e5bf328..19c89fb 100644 --- a/RIFLE/RobustImputer.py +++ b/RIFLE/RobustImputer.py @@ -21,8 +21,21 @@ def __init__(self): self.imputed_data = None def read_and_scale(self, filename): - data = pd.read_csv(filename) - self.scale_data(data) + self.data = pd.read_csv(filename) + + sc = StandardScaler() + sc.fit(self.data) + + transformed = sc.transform(self.data) + + poly = PolyFeatures(3) + poly.fit(transformed) + poly_transformed = poly.transform(transformed) + self.poly_transformed_data = pd.DataFrame(data=poly_transformed, + index=self.data.index, + columns=poly.get_feature_names_out(self.data.columns)) + print(self.poly_transformed_data) + self.transformed_data = pd.DataFrame(transformed, columns=self.data.columns, index=self.data.index) def scale_data(self, data): self.data = data @@ -36,7 +49,7 @@ def scale_data(self, data): poly_transformed = poly.transform(transformed) self.poly_transformed_data = pd.DataFrame(data=poly_transformed, index=self.data.index, - columns=poly.get_feature_names_out()) + columns=poly.get_feature_names_out(data.columns)) self.transformed_data = pd.DataFrame(transformed, columns=data.columns, index=data.index) diff --git a/RIFLE/__pycache__/RobustImputer.cpython-39.pyc b/RIFLE/__pycache__/RobustImputer.cpython-39.pyc index ca09ea47e348ed8f81630f0ec16419c3319ed170..f2ca555606920b5919e43ac26f02456e817be651 100644 GIT binary patch delta 1023 zcmYjPOHUI~6ux&}olYMVE3`m)2&h!RM}tHG6$LRy<0H7SF6!7Bv`srRxic6b(-6bP zjjOveF6hcv2%7FqjB(*g6Jtmw{s49F!VP%OEkS2;zkK=5Ip3Uf=6=gPAB*d;n1)1yH!4ua|P4I^8|JX}YH@I+*unipa9+8XoXIZP`H`QDp?mbJPjvhED?>1rxbgv0an&0asxEy0^R4 zW+UVDb9e^dZ6vjKa)h;m2nLv--&?LIS)ge;n&`(D>5WAC4i~O5we%%~(9i9iot^9; zH|HagRw#h6^O8YqC$D;T!CI_3%OnS4$g-^!lh}F5woI2m!+E*nTDnD!(+`R11hhx}(s-X-7I2bIw%@uv2`T(`orFB8YO-p+X+HlHgwFt+1<(Pw>p5Ux z-E4OsuxSQajo&eHp8ja>QN}^OK%=RFZn!1QUt$2`zeFijokhJlEH xvGjcJA;>ulz;{0jjDP$-um^NCeFV4CSLsX)Zc1L>-3tAh9#(kIlT_-s`4`EnfuP>WD@Ecy*>0Lcqjb-@R8!G_a;8?a<4W7kX zP_-J4X&b7M^44+VYu2 zq|+SYM77}uPMl~$-3w!izhs5~i-HB#(bh<%d!X$n`o=MkAe>`+*}HTC=_u=Er*2&5 z^w$u=GJ?&azIt;|0^xHD%`ZuQ@u%18=`(PV9{_d$atI~%I5#vo$ElI@eD8$=zM=Y_ z69CFK67|5dJUGSPN-4H zG+lH76~cLh6@&)}BI^ZYnEFWHF=UrG=-SbkF0;@1ak_xVSJ;pIbm=Nm{A+lGFw7yQ zb+_3HokNL@7oL+b_O@_`oM1l-m5CAb!8wAMz%^u|_$D%wZ4~E;%A(>}M!cwm*K(U} QKC^elG8LMa*tg=wU-=ThV*mgE From 9aad45db1300f8dd1a3f8938cea1e9956e3fb51a Mon Sep 17 00:00:00 2001 From: sinab Date: Thu, 21 Jul 2022 10:34:56 -0700 Subject: [PATCH 6/9] + Impute Transformed data --- RIFLE/RobustImputer.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/RIFLE/RobustImputer.py b/RIFLE/RobustImputer.py index 19c89fb..25dc4da 100644 --- a/RIFLE/RobustImputer.py +++ b/RIFLE/RobustImputer.py @@ -6,6 +6,7 @@ import time from preprocessing import PolyFeatures + class RobustImputer: def __init__(self): @@ -19,9 +20,11 @@ def __init__(self): self.poly_transformed_data = None self.confidence_matrix = None self.imputed_data = None + self.cols = None def read_and_scale(self, filename): self.data = pd.read_csv(filename) + self.cols = self.data.columns sc = StandardScaler() sc.fit(self.data) @@ -58,7 +61,8 @@ def find_confidence_interval(self, feature_index1): # print starting point and features for each process # print(f'starting find_confidence_interval with {feature_index1}') - data = self.transformed_data + # data = self.transformed_data + data = self.poly_transformed_data dimension = data.shape[1] for feature_index2 in range(feature_index1, dimension): cols = data.columns @@ -95,7 +99,8 @@ def find_confidence_interval(self, feature_index1): def estimate_confidence_intervals(self): - data = self.transformed_data + # data = self.transformed_data + data = self.poly_transformed_data dimension = data.shape[1] # initialized confidence matrix so that we are not subscripting a NoneType object @@ -110,7 +115,7 @@ def estimate_confidence_intervals(self): # end timer and output time taken end = time.time() - print('Confidence done in {:.4f} seconds'.format(end-start)) + print('Confidence done in {:.4f} seconds'.format(end - start)) # # for j in range(dimension): @@ -121,10 +126,12 @@ def estimate_confidence_intervals(self): def impute_data(self, column_index): print(f'starting impute_data with {column_index}') - data = self.transformed_data + # data = self.transformed_data + data = self.poly_transformed_data confidence_intervals = self.confidence_matrix - data_columns = data.columns + # data_columns = data.columns + data_columns = self.cols y_column = data_columns[column_index] X = data.drop([y_column], axis=1) @@ -289,4 +296,4 @@ def impute(self): print('Impute done in {:.4f} seconds'.format(end - start)) def write_to_csv(self, output_filename): - self.imputed_data.to_csv(output_filename, index=False) \ No newline at end of file + self.imputed_data.to_csv(output_filename, index=False) From 7f49c0cfc079ce48c7963a7305c9c71cf73a47c1 Mon Sep 17 00:00:00 2001 From: gaiadennison Date: Wed, 31 Aug 2022 11:56:44 -0700 Subject: [PATCH 7/9] poly features preprocessing stage added --- RIFLE/RobustImputer.py | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/RIFLE/RobustImputer.py b/RIFLE/RobustImputer.py index 25dc4da..6334bb4 100644 --- a/RIFLE/RobustImputer.py +++ b/RIFLE/RobustImputer.py @@ -23,39 +23,40 @@ def __init__(self): self.cols = None def read_and_scale(self, filename): - self.data = pd.read_csv(filename) + self.data = pd.read_csv(filename, na_values='?') self.cols = self.data.columns sc = StandardScaler() sc.fit(self.data) - transformed = sc.transform(self.data) + self.transformed_data = pd.DataFrame(transformed, columns=self.data.columns, index=self.data.index) - poly = PolyFeatures(3) - poly.fit(transformed) - poly_transformed = poly.transform(transformed) + poly = PolyFeatures(2, include_bias=False) + poly.fit(self.data) + poly_data = poly.transform(self.data.to_numpy(dtype=float)) + sc.fit(poly_data) + poly_transformed = sc.transform(poly_data) self.poly_transformed_data = pd.DataFrame(data=poly_transformed, index=self.data.index, columns=poly.get_feature_names_out(self.data.columns)) print(self.poly_transformed_data) - self.transformed_data = pd.DataFrame(transformed, columns=self.data.columns, index=self.data.index) def scale_data(self, data): self.data = data sc = StandardScaler() sc.fit(self.data) - transformed = sc.transform(self.data) + self.transformed_data = pd.DataFrame(transformed, columns=data.columns, index=data.index) - poly = PolyFeatures(3) - poly.fit(transformed) - poly_transformed = poly.transform(transformed) + poly = PolyFeatures(2, include_bias=False) + poly.fit(self.data) + poly_data = poly.transform(self.data.to_numpy(dtype=float)) + sc.fit(poly_data) + poly_transformed = sc.transform(poly_data) self.poly_transformed_data = pd.DataFrame(data=poly_transformed, index=self.data.index, columns=poly.get_feature_names_out(data.columns)) - self.transformed_data = pd.DataFrame(transformed, columns=data.columns, index=data.index) - def find_confidence_interval(self, feature_index1): # print starting point and features for each process @@ -104,7 +105,7 @@ def estimate_confidence_intervals(self): dimension = data.shape[1] # initialized confidence matrix so that we are not subscripting a NoneType object - self.confidence_matrix = np.zeros(shape=(dimension, dimension)) + self.confidence_matrix = np.zeros(shape=(dimension, dimension), dtype="float") # start timer start = time.time() From d8fca9521b6fa13dd82e7bb1e807174a0149f929 Mon Sep 17 00:00:00 2001 From: gaiadennison Date: Wed, 31 Aug 2022 12:17:51 -0700 Subject: [PATCH 8/9] documentation --- RIFLE/preprocessing/_polynomial.py | 44 +++++++++++++++++++++++++++--- RIFLE/preprocessing/validation.py | 13 +++++++++ 2 files changed, 53 insertions(+), 4 deletions(-) diff --git a/RIFLE/preprocessing/_polynomial.py b/RIFLE/preprocessing/_polynomial.py index 2f58824..85a54f4 100644 --- a/RIFLE/preprocessing/_polynomial.py +++ b/RIFLE/preprocessing/_polynomial.py @@ -8,11 +8,20 @@ from .validation import _check_feature_names_in -# Simplified/altered version of PolynomialFeatures from sklearn.preprocessing -# to preserve NaN values +class PolyFeatures: + """ Generate interaction and polynomial features. Altered version of + sklearn.preprocessing.PolynomialFeatures to preserve NaN values. + Parameters + ---------- + degree : int, default=2 + Maximum degree of the polynomial features. -class PolyFeatures: + include_bias : bool, default=True + If 'True', then include the bias column, the feature in which all + polynomial powers are zero (acts as an intercept term in a linear + model. + """ def __init__(self, degree=2, *, include_bias=True): self.degree = degree @@ -65,6 +74,15 @@ def get_feature_names_out(self, input_features=None): """ Get output feature names for transformation. + Parameters + ---------- + input_features : array of str objects or None, default=None + Input features. + + Returns + ------- + feature_names : ndarray of str objects + Transformed feature names. """ powers = self.powers_ input_features = _check_feature_names_in(self, input_features) @@ -87,8 +105,16 @@ def fit(self, X): """ Compute number of output features. - """ + Parameters + ---------- + X : array-like matrix of shape (n_samples, n_features) + The data. + Returns + ------- + self : object + Fitted transformer. + """ _, n_features = X.shape self.n_features_in_ = n_features if isinstance(self.degree, numbers.Integral): @@ -153,6 +179,16 @@ def transform(self, X): """ Transform data to polynomial features. + Parameters + ---------- + X : array-like matrix of shape (n_samples, n_features) + The data to transform. + + Returns + ------- + XP : ndarray matrix of shape (n_samples, NP) + The matrix of features, where NP is the number of polynomial features + generated from the combination of inputs. """ n_samples, n_features = X.shape # Do as if _min_degree = 0 and cut down array after the diff --git a/RIFLE/preprocessing/validation.py b/RIFLE/preprocessing/validation.py index f559f24..8d1541c 100644 --- a/RIFLE/preprocessing/validation.py +++ b/RIFLE/preprocessing/validation.py @@ -5,6 +5,19 @@ def _check_feature_names_in(estimator, input_features=None, *, generate_names=Tr """ Check `input_features` and generate names if needed. + Parameters + ---------- + input_features : array-like of type str or None, default=None + Input features. + + generate_names : bool, default=None + Whether to generate names when 'input_features' is 'None'. + + Return + ------ + feature_names_in : ndarray of str or 'None' + Feature names in. + """ feature_names_in_ = getattr(estimator, "feature_names_in_", None) From 53e3e5088edc044411f4375f8f49ffee831e2080 Mon Sep 17 00:00:00 2001 From: gaiadennison Date: Wed, 31 Aug 2022 12:18:50 -0700 Subject: [PATCH 9/9] documentation --- .idea/RobustInference.iml | 2 +- .idea/misc.xml | 2 +- .../__pycache__/RobustImputer.cpython-39.pyc | Bin 6695 -> 6933 bytes .../__pycache__/_polynomial.cpython-39.pyc | Bin 5065 -> 5065 bytes RIFLE/run.py | 3 ++- 5 files changed, 4 insertions(+), 3 deletions(-) diff --git a/.idea/RobustInference.iml b/.idea/RobustInference.iml index fa80a76..85c7612 100644 --- a/.idea/RobustInference.iml +++ b/.idea/RobustInference.iml @@ -4,7 +4,7 @@ - + diff --git a/.idea/misc.xml b/.idea/misc.xml index a2e120d..dc9ea49 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -1,4 +1,4 @@ - + \ No newline at end of file diff --git a/RIFLE/__pycache__/RobustImputer.cpython-39.pyc b/RIFLE/__pycache__/RobustImputer.cpython-39.pyc index f2ca555606920b5919e43ac26f02456e817be651..a24b721c9cda3525831cb5b40ad4b8f3456ccc81 100644 GIT binary patch delta 1866 zcmZWp&2QX96!-XRz1}Y;*<{ls+m9rrm=q{&N=lU!T0&@P6@);Q*vrb<@uFS4_GauY z1np{(azr_e5C=d?gb+j^B$Nx{6!8zh*+OtaB6luSDB-mH5p6RvU*FK&5Horv33cp8gs$HpS@(;)q$?z}9kHfKA3%t-Ku^PHweOF?m zd{cfkn+4Nw3>^@i|14kYr-14(UW5_u14L4jbO|L?-jcy5jHcXFfT@wTDfQ&8yshKh z4j~foRKj}-*cn~em+S^iQ)LV7%qT-us!@GQ?x|h%n6I;Kxl%uIFvO{{BaIX!V z1MBI+wfJc>4G!|jPj(13#19IQ1v7@Yc$$uySoa@!wxoyFacl>#d4cg4CXqt zS_>Z}%ecZu5YS(&)EiNpj+o#5Nx*d2MRSMRzCFsDXOW7!T5r9I*c@dK1(`s0AJogmLg=YaIVi8l; zNfzD-8L>!Acy)NQhlaxU)w%qCGsZ15BK9UfdLJ}IO z+bFrzM3J?Df^zpC1r=nVpbB15L8+Mn1+(ZM^wWj+oKP*b$}n`+p8{NAW#p$3&H~u^ zq~2I%HjOZYa1LP>K?otl8q}Db$A!lbE+AL{SYYHr+b59CAzVaw65%Q77T}2ro1(Bn z4%``Hgm8Yaj_{71sI7UpH=_GQW>b7Kb%9v?tJFFv^0IM76RpDU8y{zm{ek~wlrxLy z_cHp``D~gIkAIXtqrwT2`FH8(Y;`-=1zEH zm(9y_H}U%_K&;igAavQ|$aQQ{C$jZpCAqqOeCXO^_wGcd~tfKAvu|HitjV~GwFeq>Ge?;*viA7Fk56rl0Kt%AgG54(?_KS3KF z93CAV*|Q=xp%oj#bv#@Zm9n6n=y@H#Q-0vT=dz%p2EH4LR4ZB|^c|mFL!%r(tW(!> zBbR?ZbjiZx*#ep>UQlC;{I{XkNO5mr_yEKKNF)#E zNmuqhE0$zEvEch-3uA(c!)PvgjWDVOt@e#co-4iaCbZL|c3T2gQCltqDY{h%hSx4z zMtBLKiEtHR1py;Y3W9}aF9X;1h+Z6oO6!=~n9v&-L-PN^%?c$-cuHP>jTAj{iEo+Arba(+HOk zM2p=;CQ@i4TSxfUlVBJY(*R5uqMZ!c$QdJsW!T1~(U&sDIk9f%V!aKQniGmsUeWNj s++aff(eQ2l^{Lq_qIg)~O0Lb^HVbNQ7&iR3#1$*N{O|W%nV2EU-|rTWM*si- delta 1651 zcmZXU&u<$=6vuaFe|WuXJ5HQ5j%gaG6WSF)ptLD1ZAw}SQUwYsMXK0~*?Kp{#@=;j z*3d+@e8`1MfimI(2jWwkODmLr0CDD*!~w}S4oH|AT#*vqn=Ni=ZOzxu^XAQu&wDdp zoVt55=Vh~&0-t;4uh$7D_G4((NF{9}+<*(M#*Cmp3iV|7dE(hl9$I!c}P2~oh-6T7R%hEKLgSL$IMcYWk`il=YRX%8s@+ zRAEDo%5v~C&}*R|+!~~+QKQ`qxkFhAy4eg4$qi`BcTE;}ex=GklmU!ll)7Gv2kDr3 zAzzPJlWEwQzU0N;>&$Bgx{MchwQ??*ZVab0SV8=#EnLmFqQ*vL6dd>r!6h<<6XoQJ zUg%eN)oTRP(8qCC3dv9dT85g$g0F$!JyW%axTZgsL+IEf^jqRH-RX@`(h*AaAxgb& zNR)_F+crC-p6-(oPV}nS2QBrGpN;$&?#9VHV!;kW2b)Df3hXJABPcUU4|EPq$uy7F zeNm#&;a^LdbgUTs7?U{9lMVBL5z z76tkxic@kBk3AL-v{n=~25JyKQ600NXJ+&7!B9Q{qCiDh>V){+zDelrwakicBO^vo zUKaiFHw*{OqWF9KXzw*Jhj`*4o^a@c)K)K!DNK%8!@obLz_GW#zwgY;oyv@{ad62j zx|+|TmTSgQC2Tia8w@9MQ(+Wt1}t*5C=9qfy{p%F=!NV>jLd=z3}1XQv7AQ~>;;Tc z8&Q>=6u(SdA%)$A!gqwY;*ZJh;(?T4oL$3Q1~UJf(80e2QAiFefbQOznx;k)A?{DV zbV!==8d>KWH6B-^X6t-G{4@Q|yJsORA-xQit2G1IpDaz8u5;*KK{=1&p~%g?f)?&L z0fN}sDu`nx$-5x#6^|I0J$6xiUo0QS8<-R^5@vEYMsf-&?3sN1ML%7JZD92D_!CgX29^nRB@1s28 zI)cRTl}U+u6`i%Q~U$*WoiC|gol9L bY>Ndg7F7ezYvGOLQNUH6SYG^9DiiuIAiQ2n diff --git a/RIFLE/preprocessing/__pycache__/_polynomial.cpython-39.pyc b/RIFLE/preprocessing/__pycache__/_polynomial.cpython-39.pyc index f4948d4abbef309b9a7d1f40dd00fd95392fc8a0..dcc5f6646b968e5884a50ab7908c2adb92a85f11 100644 GIT binary patch delta 20 acmX@9eo~z~k(ZZ?0SHP`A8h2_Cky~N`USTD delta 20 acmX@9eo~z~k(ZZ?0SIm}Uf;;QPZ$6?KLxM= diff --git a/RIFLE/run.py b/RIFLE/run.py index 4e82d5c..6866245 100644 --- a/RIFLE/run.py +++ b/RIFLE/run.py @@ -12,9 +12,10 @@ def run(): imputer.impute() imputer.write_to_csv(imputed) - # This guard is necessary to avoid creating subprocesses recursively. # Without it a runtime error is generated, but there is likely a more clever way to do this + + if __name__ == '__main__': start = time.time() run()