diff --git a/FDA.py b/FDA.py
new file mode 100644
index 0000000..bb983d7
--- /dev/null
+++ b/FDA.py
@@ -0,0 +1,155 @@
+import numpy as np
+import matplotlib.pyplot as plt
+from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
+from sklearn.decomposition import PCA
+from fileimport import FeatureFile
+from sklearn.model_selection import train_test_split, LeaveOneOut
+from sklearn.preprocessing import StandardScaler
+from sklearn.metrics import confusion_matrix, accuracy_score
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.svm import SVC
+from sklearn.linear_model import LogisticRegression
+
+
+class FDAPCA:
+    """Loads the Parkinson voice-feature data and exposes LDA/PCA projections of it."""
+
+    def __init__(self, filename="../Parkinson_Multiple_Sound_Recording/train_data.txt"):
+        file = FeatureFile(filename)
+        # Per-patient normalized statistics (see FeatureFile for the aggregation).
+        [self.X, self.y] = file.get_normalized_data_frame()
+        self.newsplit()
+
+    def newsplit(self):
+        """Draw a fresh 80/20 train/test split and standardize it (fit on train only)."""
+        sc = StandardScaler()
+        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
+            self.X, self.y, test_size=0.2)  # , random_state=0)
+        self.X_train = sc.fit_transform(self.X_train)
+        self.X_test = sc.transform(self.X_test)  # reuse the train statistics
+
+    def fda(self):
+        """Fit LDA on the full data set and return the projected data."""
+        self.lda = LDA()
+        # fit_transform already returns the projection; the original fitted and
+        # then transformed a second time for the same result.
+        return self.lda.fit_transform(self.X, self.y)
+
+    def pca(self, n_components):
+        """Fit PCA (n_components == 0 keeps every component) and return the projection."""
+        # Stored as pca_model so the fitted estimator does not shadow this method
+        # (the original overwrote self.pca, breaking any second call).
+        self.pca_model = PCA(n_components) if n_components else PCA()
+        return self.pca_model.fit_transform(self.X)
+
+    def testKNN(self, X_test, n_neighbors):
+        """Split X_test against self.y, fit logistic regression and print the accuracy.
+
+        The original assigned four classifiers in a row, so only the last one
+        (logistic regression) ever ran; the dead assignments are removed.
+        """
+        # Alternatives tried: KNeighborsClassifier(n_neighbors), SVC(kernel='linear'),
+        # RandomForestClassifier(n_estimators=n_neighbors).
+        clf = LogisticRegression(random_state=0, solver='lbfgs')
+
+        X_train, X_test, y_train, y_test = train_test_split(
+            X_test, self.y, test_size=0.2)  # , random_state=0)
+        plotFDA(X_train, y_train)
+        clf.fit(X_train, y_train)
+        y_pred = clf.predict(X_test)
+        print('Accuracy ' + str(accuracy_score(y_test, y_pred)))
+
+
+def plotPCA(X, y):
+    """Scatter two hard-coded feature columns (10 vs 20), colored by class."""
+    fig = plt.figure()
+    ax1 = fig.add_subplot(111)
+
+    Xtrue = X[y == 1]
+    Xfalse = X[y == 0]
+    xax = 10  # component indices chosen for display
+    yax = 20
+    ax1.scatter(Xtrue[:, xax], Xtrue[:, yax], s=10, c='b', marker="s", label='first')
+    ax1.scatter(Xfalse[:, xax], Xfalse[:, yax], s=10, c='r', marker="o", label='second')
+    plt.legend(loc='upper left')
+    plt.show()
+
+
+def plotFDA(X, y):
+    """Plot a 1-D projection: positives on the y=1 line, negatives on y=0."""
+    fig = plt.figure()
+    ax1 = fig.add_subplot(111)
+
+    Xtrue = X[y == 1]
+    Xfalse = X[y == 0]
+    ax1.scatter(Xtrue, np.ones(Xtrue.shape), s=10, c='b', marker="s", label='Positive')
+    ax1.scatter(Xfalse, np.zeros(Xfalse.shape), s=10, c='r', marker="o", label='Negative')
+    # plt.legend(loc='upper left')
+    plt.show()
+
+
+def LeaveOOwithPCAandLDA(X, y):
+    """Leave-one-out logistic regression on X, with and without an extra LDA step.
+
+    Returns (accuracies, accuraciesLDA): per-fold 0/1 scores on the raw
+    features and on the LDA-projected features respectively.
+    """
+    # Other classifiers tried: RandomForest, SVC (linear/rbf), KNN.
+    clf = LogisticRegression(random_state=0, solver='lbfgs')
+
+    loo = LeaveOneOut()
+    accuracies = []
+    accuraciesLDA = []
+    wrongpred = []
+    for train_index, test_index in loo.split(X):
+        X_train, X_test = X[train_index], X[test_index]
+        y_train, y_test = y[train_index], y[test_index]
+
+        # LDA is refitted inside every fold so the held-out sample never leaks in.
+        lda = LDA()
+        X_trainLDA = lda.fit_transform(X_train, y_train)
+        X_testLDA = lda.transform(X_test)
+
+        clf.fit(X_train, y_train)
+        accuracies.append(accuracy_score(y_test, clf.predict(X_test)))
+
+        clf.fit(X_trainLDA, y_train)
+        y_pred = clf.predict(X_testLDA)
+        if y_pred != y_test.values:
+            wrongpred.append(test_index)
+        accuraciesLDA.append(accuracy_score(y_test, y_pred))
+    # print(f"Predicted wrong on {wrongpred}")
+    return accuracies, accuraciesLDA
+
+
+def LeaveOO(X, y):
+    """Leave-one-out random-forest evaluation; returns the per-fold accuracy list."""
+    # n_estimators=n_neighbors makes the original positional argument explicit;
+    # other classifiers tried: KNN, SVC (rbf/linear), LogisticRegression.
+    clf = RandomForestClassifier(n_estimators=n_neighbors)
+
+    loo = LeaveOneOut()
+    accuracies = []
+    for train_index, test_index in loo.split(X):
+        print(test_index + 1)  # 1-based fold number for progress
+        X_train, X_test = X[train_index], X[test_index]
+        y_train, y_test = y[train_index], y[test_index]
+
+        clf.fit(X_train, y_train)
+        accuracies.append(accuracy_score(y_test, clf.predict(X_test)))
+    return accuracies
+
+
+n_neighbors = 2  # module level so LeaveOO can read it
+
+if __name__ == "__main__":
+    x = FDAPCA()
+    y = x.y
+    x.X = x.X.values
+    # Sweep the PCA dimensionality and report LOO accuracy with and without LDA.
+    for i in range(1, 38):
+        xpca = PCA(i).fit_transform(x.X)
+        accuracies, accuraciesLDA = LeaveOOwithPCAandLDA(xpca, y)
+        print(f"{i} PCA components: accuracy {np.mean(accuracies)}, "
+              f"LDA accuracy {np.mean(accuraciesLDA)}")
diff --git a/fileimport.py b/fileimport.py
new file mode 100644
index 0000000..cd1df1e
--- /dev/null
+++ b/fileimport.py
@@ -0,0 +1,91 @@
+# Libraries
+import pandas as pd
+from sklearn.preprocessing import StandardScaler
+
+
+class FeatureFile:
+    """Reads the Parkinson sound-recording file and derives per-patient feature statistics."""
+
+    def __init__(self, filename):
+        self.test_percent = 0.20  # percent of dataset withheld for validation
+
+        # Column names common to train and test files.
+        self.names = ['Subject ID',
+                      'Jitter (local)', 'Jitter (local, abs)', 'Jitter (rap)',
+                      'Jitter (ppq5)', 'Jitter (ddp)', 'Shimmer (local)',
+                      'Shimmer (local, dB)', 'Shimmer (apq3)', 'Shimmer (apq5)',
+                      'Shimmer (apq11)', 'Shimmer (dda)', 'AC', 'NTH', 'HTN',
+                      'Median Pitch', 'Mean Pitch', 'Std Dev Pitch', 'Min Pitch',
+                      'Max Pitch', 'Num Pulses', 'Num Periods', 'Mean Period',
+                      'Std Dev Periods', 'Frac Unvoiced Frames', 'Num Breaks',
+                      'Degree of Breaks']
+        # the training file additionally carries the UPDRS score and class label
+        train_names = self.names + ['UPDRS', 'class info']
+
+        self.df = pd.read_csv(filename, header=None, names=train_names)
+        # Each patient contributes 26 consecutive recordings; average them per patient.
+        patients = {}
+        for i in range(40):
+            patients[i] = self.df.iloc[i * 26:i * 26 + 26].agg(['mean'])
+        # DataFrame.append was removed in pandas 2.0 — build the merge with concat.
+        avg_df = pd.concat([patients[i] for i in range(40)])
+        # self.df = avg_df  # averaging intentionally left disabled
+
+    def get_data_frame(self):
+        """Return [X, Y]: raw features (labels and ID dropped) and the class labels."""
+        X = self.df.drop(['UPDRS', 'class info', 'Subject ID'], axis=1)
+        Y = self.df['class info']
+        return [X, Y]
+
+    def get_normalized_data_frame(self):
+        """Return [X, Y] built from per-patient normalized mean/std statistics."""
+        df = pd.concat(self.stats_norm_patients(self.df), axis=0)
+        X = df.drop(['UPDRS', 'class info', 'Subject ID'], axis=1)
+        Y = df['class info']
+        return [X, Y]
+
+    def stats_norm_patients(self, df):
+        """Per-patient mean/std statistics computed on normalized recordings."""
+        return self.stats_patients(df, pat_func=self.normalize_patients)
+
+    def stats_patients(self, df, pat_func):
+        """Collapse each patient's recordings (split by pat_func) into one stats row."""
+        p = pat_func(df)
+        s = {}
+        for (k, v) in p.items():
+            # Stats of ID/label columns are meaningless — drop them, then restore
+            # the patient's original ID and labels on the aggregated row.
+            s[k] = self.stats(v).drop(['Subject ID mean', 'UPDRS mean', 'class info mean',
+                                       'Subject ID std', 'UPDRS std', 'class info std', ], axis=1)
+            s[k]['Subject ID'] = v['Subject ID'].values[0]
+            s[k]['UPDRS'] = v['UPDRS'].values[0]
+            s[k]['class info'] = v['class info'].values[0]
+        return s
+
+    def normalize_patients(self, df):
+        """Standardize every feature column, then split the frame per patient."""
+        # remove labels and ID before scaling
+        data = df.drop(['Subject ID', 'UPDRS', 'class info'], axis=1)
+        scale = StandardScaler()
+        # fit and transform the data
+        normalized = pd.DataFrame(scale.fit_transform(data), columns=self.names[1:])
+        # put labels and ID back in
+        normalized['Subject ID'] = df['Subject ID']
+        normalized['UPDRS'] = df['UPDRS']
+        normalized['class info'] = df['class info']
+        return self.patients(normalized)
+
+    def patients(self, df):
+        """Split the frame into {patient_index: rows} keyed by zero-based subject ID."""
+        p = {}
+        for i in df['Subject ID'].unique():
+            p[i - 1] = df.loc[df['Subject ID'] == i]
+        return p
+
+    def stats(self, df):
+        """Return a one-row frame with the mean and std of every column of df."""
+        features = pd.DataFrame()
+        for c in df.columns:
+            features[c + ' mean'] = [df[c].mean(axis=0)]
+            features[c + ' std'] = [df[c].std(axis=0)]
+        return features