Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
165 changes: 165 additions & 0 deletions FDA.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
import numpy as np
import matplotlib.pyplot as plt
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.decomposition import PCA as PCA
from fileimport import FeatureFile
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix,accuracy_score
from sklearn.model_selection import LeaveOneOut

from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

class FDAPCA:
    """Load the Parkinson speech feature set and provide FDA/PCA projections.

    Holds the full per-patient feature frame in self.X / self.y and a
    standardized 80/20 train/test split in self.X_train etc.
    """

    def __init__(self, filename="../Parkinson_Multiple_Sound_Recording/train_data.txt"):
        """Read *filename* via FeatureFile and draw an initial split."""
        file = FeatureFile(filename)
        # per-patient normalized mean/std summary features
        [self.X, self.y] = file.get_normalized_data_frame()
        self.newsplit()

    def newsplit(self):
        """Draw a fresh random 80/20 split and z-score it (fit on train only)."""
        sc = StandardScaler()
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X, self.y, test_size=0.2)
        self.X_train = sc.fit_transform(self.X_train)
        self.X_test = sc.transform(self.X_test)

    def fda(self):
        """Fit LDA on the full data and return the discriminant projection."""
        self.lda = LDA()
        # fit_transform already returns the projected data; the original's
        # extra transform() call over the same X was redundant.
        return self.lda.fit_transform(self.X, self.y)

    def pca(self, n_components):
        """Fit PCA (all components when n_components == 0) and project self.X."""
        # BUG FIX: the original stored the model as `self.pca`, shadowing this
        # method so it could only ever be called once per instance.
        self.pca_model = PCA() if n_components == 0 else PCA(n_components)
        return self.pca_model.fit_transform(self.X)

    def testKNN(self, X_test, n_neighbors):
        """Split *X_test* against self.y, fit a classifier, print its accuracy.

        n_neighbors is kept for interface compatibility; the original
        overwrote clf four times and only the final LogisticRegression was
        ever used, so only that classifier remains.
        """
        clf = LogisticRegression(random_state=0, solver='lbfgs')

        X_tr, X_te, y_tr, y_te = train_test_split(X_test, self.y, test_size=0.2)
        plotFDA(X_tr, y_tr)
        clf.fit(X_tr, y_tr)
        y_pred = clf.predict(X_te)
        print('Accuracy ' + str(accuracy_score(y_te, y_pred)))

def plotPCA(X, y):
    """Scatter-plot feature columns 10 vs 20 of X, colored by binary label y."""
    x_col, y_col = 10, 20
    positives = X[y == 1]
    negatives = X[y == 0]

    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(positives[:, x_col], positives[:, y_col],
               s=10, c='b', marker="s", label='first')
    ax.scatter(negatives[:, x_col], negatives[:, y_col],
               s=10, c='r', marker="o", label='second')
    plt.legend(loc='upper left')
    plt.show()

def plotFDA(X, y):
    """Plot 1-D (FDA-projected) samples: positives at height 1, negatives at 0."""
    positives = X[y == 1]
    negatives = X[y == 0]

    ax = plt.figure().add_subplot(111)
    # positives sit on the y=1 line, negatives on y=0, so the classes separate
    # vertically even though the projection itself is one-dimensional
    ax.scatter(positives, np.ones(positives.shape),
               s=10, c='b', marker="s", label='Positive')
    ax.scatter(negatives, np.zeros(negatives.shape),
               s=10, c='r', marker="o", label='Negative')
    plt.show()

def LeaveOOwithPCAandLDA(X, y):
    """Leave-one-out logistic-regression accuracy on X, raw and after LDA.

    For each held-out sample the classifier is trained twice: once on the raw
    features and once on their per-fold LDA projection.

    Returns (accuracies, accuraciesLDA): two lists of per-fold 0/1 accuracies.
    """
    clf = LogisticRegression(random_state=0, solver='lbfgs')

    loo = LeaveOneOut()
    accuracies = []
    accuraciesLDA = []
    wrongpred = []  # indices the LDA-projected model got wrong (kept for debugging)
    for train_index, test_index in loo.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # LDA must be refit inside the fold to avoid leaking the test sample
        lda = LDA()
        X_trainLDA = lda.fit_transform(X_train, y_train)
        X_testLDA = lda.transform(X_test)

        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        accuracies.append(accuracy_score(y_test, y_pred))

        clf.fit(X_trainLDA, y_train)
        y_pred = clf.predict(X_testLDA)
        # BUG FIX: the original compared against y_test.values, which raises
        # AttributeError when y is a plain ndarray; np.asarray handles both
        # pandas Series and ndarray labels.
        if not np.array_equal(y_pred, np.asarray(y_test)):
            wrongpred.append(test_index)
        accuraciesLDA.append(accuracy_score(y_test, y_pred))
    return accuracies, accuraciesLDA

def LeaveOO(X, y, n_estimators=None):
    """Leave-one-out random-forest accuracy on (X, y).

    n_estimators: number of trees. Defaults to the module-level n_neighbors
    to stay backward compatible with the original implicit-global behavior.

    Returns a list of per-fold 0/1 accuracy scores.
    """
    if n_estimators is None:
        # BUG FIX: the original read the global n_neighbors silently inside
        # the constructor call; the dependency is now explicit and overridable.
        n_estimators = n_neighbors
    clf = RandomForestClassifier(n_estimators)

    loo = LeaveOneOut()
    accuracies = []
    for train_index, test_index in loo.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        accuracies.append(accuracy_score(y_test, y_pred))
    return accuracies




# Kept at module level: LeaveOO falls back to this global for its tree count.
n_neighbors = 2

if __name__ == "__main__":
    x = FDAPCA()
    y = x.y
    x.X = x.X.values  # work with the raw ndarray from here on

    # Sweep the number of retained PCA components and report leave-one-out
    # accuracy with and without a subsequent LDA projection.
    for i in range(1, 38):
        pca = PCA(i)
        xpca = pca.fit_transform(x.X)
        accuracies, accuraciesLDA = LeaveOOwithPCAandLDA(xpca, y)
        print(f"{i} PCA components: accuracy {np.mean(accuracies)}, "
              f"LDA accuracy {np.mean(accuraciesLDA)}")
105 changes: 105 additions & 0 deletions fileimport.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
# Libraries
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Globals
class FeatureFile:
def __init__(self, filename):
self.test_percent = 0.20 # percent of dataset withheld for validation

# Import Training Data
# column names
self.names = ['Subject ID',
'Jitter (local)', 'Jitter (local, abs)', 'Jitter (rap)',
'Jitter (ppq5)', 'Jitter (ddp)', 'Shimmer (local)',
'Shimmer (local, dB)', 'Shimmer (apq3)', 'Shimmer (apq5)',
'Shimmer (apq11)', 'Shimmer (dda)', 'AC', 'NTH', 'HTN',
'Median Pitch', 'Mean Pitch', 'Std Dev Pitch', 'Min Pitch',
'Max Pitch', 'Num Pulses', 'Num Periods', 'Mean Period',
'Std Dev Periods', 'Frac Unvoiced Frames', 'Num Breaks',
'Degree of Breaks']
# training column names
train_names = self.names + ['UPDRS', 'class info']

self.df = pd.read_csv(filename,
header=None,
names=train_names)
self.df.head()
# initialize patients arrary
patients = {}
for i in range(40):
patients[i] = self.df.iloc[i * 26:i * 26 + 26].agg(['mean'])
# remerge the averages
avg_df = patients[0]
for i in range(1, 40):
avg_df = avg_df.append(patients[i])
#self.df = avg_df

def get_data_frame(self):
# Get examples
X = self.df.drop(['UPDRS', 'class info', 'Subject ID'], axis=1)
# X = self.df.drop(['UPDRS', 'class info'], axis=1)
# Get labels
Y = self.df['class info']
return [X, Y]

def get_normalized_data_frame(self):
# Get examples
df = self.stats_norm_patients(self.df)
df = pd.concat(df, axis=0)
X = df.drop(['UPDRS', 'class info', 'Subject ID'], axis=1)
# X = self.df.drop(['UPDRS', 'class info'], axis=1)
# Get labels
Y = df['class info']
return [X, Y]

def stats_norm_patients(self, df):
return self.stats_patients(df, pat_func=self.normalize_patients)

def stats_patients(self, df, pat_func):
# get patients
p = pat_func(df)
# intialize stat based patients dictionary
s = {}
# for each patient
for (k, v) in p.items():
#print(k)
s[k] = self.stats(v).drop(['Subject ID mean', 'UPDRS mean', 'class info mean',
'Subject ID std', 'UPDRS std', 'class info std', ], axis=1)
s[k]['Subject ID'] = v['Subject ID'].values[0]
s[k]['UPDRS'] = v['UPDRS'].values[0]
s[k]['class info'] = v['class info'].values[0]
return s

def normalize_patients(self, df):
# remove labels and ID
data = df.drop(['Subject ID', 'UPDRS', 'class info'], axis=1)
# create Scaler
scale = StandardScaler()
# fit and transfrom the data
normalized = pd.DataFrame(scale.fit_transform(data), columns=self.names[1:])
# put labels and ID back in
normalized['Subject ID'] = df['Subject ID']
normalized['UPDRS'] = df['UPDRS']
normalized['class info'] = df['class info']

# break into patients and return
return self.patients(normalized)

def patients(self, df):
p = {}
for i in df['Subject ID'].unique():
p[i - 1] = df.loc[df['Subject ID'] == i]
return p

def stats(self, df):
# initialize features
features = pd.DataFrame()
# for each column in DataFrame
for c in df.columns:
# create a new feature of its mean
features[c + ' mean'] = [df[c].mean(axis=0)]
# create a new feature of its std
features[c + ' std'] = [df[c].std(axis=0)]
# return features
return features