diff --git a/scripts/tcga_fpkm_dendrogram/README.md b/scripts/tcga_fpkm_dendrogram/README.md
new file mode 100644
index 0000000..de346fc
--- /dev/null
+++ b/scripts/tcga_fpkm_dendrogram/README.md
@@ -0,0 +1,53 @@
+# TCGA_FPKM_dendrogram
+
+Generates dendrogram image and csv file holding TCGA patients and their assigned cluster colors.
+
+Additional optional functionality:
+- Generates csv file holding TCGA patients clustered into n clusters using dendrogram's comparison function
+- Generates interactive dendrogram using Plotly library
+- Generates scatter plot of 2-component PCA analysis using TCGA-FPKM data
+
+## Installation
+
+`virtualenv` is a way to isolate software dependencies for Python scripts.
+
+First install `virtualenv`:
+
+```bash
+pip3 install virtualenv
+```
+
+Then set up and activate the virtualenv:
+```bash
+cd tcga_fpkm_dendrogram
+python3 -m venv env
+source env/bin/activate
+```
+
+Then install the dependencies for this script:
+
+```bash
+pip install wheel
+pip install -r requirements.txt
+```
+
+Now you are ready to run the script.
+
+IMPORTANT: When you are done using the script, make sure to deactivate the `virtualenv` by running the `deactivate` command.
+
+```bash
+# Deactivates virtualenv and stops isolating dependencies.
+deactivate
+```
+
+Otherwise your other Python scripts may not work.
+
+## Usage
+
+Once you have activated the `virtualenv`, use the script as follows:
+
+```bash
+python3 gen_dendro.py path/to/input.csv
+```
+
+A folder called `results` will be created which will contain the following output files: `TCGA-FPKM_dendrogram-stat.png` and `TCGA-FPKM_cluster-colors.csv`.
diff --git a/scripts/tcga_fpkm_dendrogram/gen_dendro.py b/scripts/tcga_fpkm_dendrogram/gen_dendro.py
new file mode 100644
index 0000000..e461f40
--- /dev/null
+++ b/scripts/tcga_fpkm_dendrogram/gen_dendro.py
@@ -0,0 +1,136 @@
+from sklearn.cluster import AgglomerativeClustering
+from sklearn.decomposition import PCA
+import scipy.cluster.hierarchy as shc
+import plotly.figure_factory as ff
+import matplotlib.pyplot as plt
+import pandas as pd
+import numpy as np
+import shutil
+import sys
+import os
+
+# Map each dendrogram leaf (row index of the clustered data) to its color
+def _leaf_colors(result):
+    """Return {leaf index: color} for a scipy dendrogram result dict.
+
+    `result` is the dict returned by `scipy.cluster.hierarchy.dendrogram`.
+    """
+    leaves = result['leaves']
+
+    # Newer SciPy releases report one color per leaf, in plotting order.
+    if 'leaves_color_list' in result:
+        return dict(zip(leaves, result['leaves_color_list']))
+
+    # Older releases only report one color per link ('color_list'), which has
+    # one entry fewer than there are leaves and is not in leaf order. A leaf
+    # at plotting position k is drawn at x = 5 + 10 * k with height 0, so a
+    # link end point on such a coordinate identifies the leaf it attaches to.
+    colors = {}
+    for xs, ys, color in zip(result['icoord'], result['dcoord'],
+                             result['color_list']):
+        for x, y in ((xs[0], ys[0]), (xs[3], ys[3])):
+            position, offset = divmod(x - 5.0, 10.0)
+            if y == 0 and offset == 0:
+                # Keep the first hit so a later link cannot overwrite it.
+                colors.setdefault(leaves[int(position)], color)
+    return colors
+
+# Create and save TCGA-FPKM static dendrogram & patient assigned cluster colors
+def plot_dendro_static(data, labels):
+    """Plot a Ward-linkage dendrogram of `data` and save each patient's color.
+
+    data   -- table with one row per patient (the rows are clustered)
+    labels -- patient names, in the same order as the rows of `data`
+
+    Writes TCGA-FPKM_dendrogram-stat.png and TCGA-FPKM_cluster-colors.csv to
+    ./results, which must already exist.
+    """
+    plt.figure(figsize=(10, 7))
+    plt.title("TCGA-FPKM: Dendrogram")
+    result = shc.dendrogram(shc.linkage(data, method='ward'))
+    plt.savefig('./results/TCGA-FPKM_dendrogram-stat.png')
+    plt.show()
+
+    # Look the color up by row index so every patient gets exactly one row
+    leaf_colors = _leaf_colors(result)
+    df = pd.DataFrame({'Patient': list(labels),
+                       'Cluster_color': [leaf_colors[i]
+                                         for i in range(len(labels))]})
+    df.to_csv('./results/TCGA-FPKM_cluster-colors.csv', index=False)
+
+# Create and save TCGA-FPKM interactive dendrogram
+def plot_dendro_interactive(data, labels):
+    """Build a Plotly dendrogram of `data` labelled with `labels`.
+
+    Saves TCGA-FPKM_dendrogram-intr.png to ./results and opens the figure.
+    NOTE: `write_image` needs a Plotly image export engine (kaleido or
+    orca), which requirements.txt does not list.
+    """
+    fig = ff.create_dendrogram(data.to_numpy(), orientation='left',
+                               labels=list(labels))
+    fig.update_layout(width=1600, height=800)
+    fig.write_image('./results/TCGA-FPKM_dendrogram-intr.png')
+    fig.show()
+
+# Create and save TCGA-FPKM patient instances clustered into n clusters
+def gen_clusters(n, data, labels):
+    """Assign every patient to one of `n` Ward-linkage clusters.
+
+    Writes the (Patient, Cluster) pairs to ./results/TCGA-FPKM_Cluster.csv.
+    """
+    # Ward linkage only supports euclidean distances, which is the default;
+    # leaving the metric unset avoids the `affinity` keyword that newer
+    # scikit-learn releases no longer accept.
+    cluster = AgglomerativeClustering(n_clusters=n, linkage='ward')
+    clusters = cluster.fit_predict(data).tolist()
+
+    df = pd.DataFrame(list(zip(labels, clusters)),
+                      columns=['Patient', 'Cluster'])
+    df.to_csv('./results/TCGA-FPKM_Cluster.csv', index=False)
+
+# Create and save 2-component PCA analysis scatter plot of TCGA-FPKM data
+def gen_scatter(data):
+    """Project `data` onto its first two principal components and plot them.
+
+    Saves the scatter plot to ./results/TCGA-FPKM_pca_scatter.png.
+    """
+    pca = PCA(n_components=2)
+    principalComponents = pca.fit_transform(data.to_numpy())
+    principalDf = pd.DataFrame(data=principalComponents,
+                               columns=['principal component 1',
+                                        'principal component 2'])
+
+    principalDf.plot.scatter(x='principal component 1',
+                             y='principal component 2',
+                             title="TCGA-FPKM: 2 Component PCA")
+    plt.savefig('./results/TCGA-FPKM_pca_scatter.png')
+    plt.show(block=True)
+
+#################################### MAIN #####################################
+if __name__ == "__main__":
+    # Check command-line argument
+    if len(sys.argv) < 2:
+        # A string argument is printed to stderr and sets exit status 1
+        sys.exit('missing input file name argument')
+
+    # Check file validity
+    filename = sys.argv[1]
+    if os.path.splitext(filename)[1].lower() != '.csv':
+        sys.exit('input file is invalid type (not .csv)')
+
+    # Reading in TCGA data first, so that an unreadable input file does not
+    # wipe the results of an earlier run. Assumes 'sample' is the first
+    # column and that every other column is one patient.
+    tcga_df = pd.read_csv(filename)
+    patient_labels = tcga_df.columns[1:]
+    tcga_data = tcga_df.drop(columns=['sample']).transpose()
+
+    # Create directory for results
+    path = './results'
+    if os.path.isdir(path):
+        shutil.rmtree(path)
+    os.mkdir(path)
+
+    # Plotting static dendrogram
+    plot_dendro_static(tcga_data, patient_labels)
+
diff --git a/scripts/tcga_fpkm_dendrogram/requirements.txt b/scripts/tcga_fpkm_dendrogram/requirements.txt
new file mode 100644
index 0000000..40c5189
--- /dev/null
+++ b/scripts/tcga_fpkm_dendrogram/requirements.txt
@@ -0,0 +1,6 @@
+plotly==4.11.0
+scipy==1.4.1
+pandas==1.0.4
+numpy==1.18.5
+matplotlib==3.2.1
+scikit_learn==0.23.2