From fb55d2f0cfb27da7d4c6d9be0e95b3f7176b4ded Mon Sep 17 00:00:00 2001 From: rnmitchell Date: Tue, 24 Jun 2025 05:45:30 -0400 Subject: [PATCH 1/5] create empty marker plots if sample has no reads above the detection threshold --- lusSTR/workflows/strs.smk | 15 ++++++---- lusSTR/wrappers/filter.py | 60 ++++++++++++++++++++------------------- 2 files changed, 41 insertions(+), 34 deletions(-) diff --git a/lusSTR/workflows/strs.smk b/lusSTR/workflows/strs.smk index fb77fa4..e223d35 100644 --- a/lusSTR/workflows/strs.smk +++ b/lusSTR/workflows/strs.smk @@ -21,9 +21,7 @@ custom = config["custom_ranges"] def get_sample_IDs(input, a_software, output, software, separate): convert_out = f"{output}.txt" format_out = f"{output}.csv" - if (software == "efm" or software == "mpsproto") and separate is False: - ID_list = os.path.basename(output) - elif os.path.exists(convert_out): + if os.path.exists(convert_out): ID_list = get_existing_IDs(convert_out, "\t") elif os.path.exists(format_out): ID_list = get_existing_IDs(format_out, ",") @@ -93,6 +91,13 @@ def get_output(): return outname +def get_markerplot_name(output, custom): + if custom: + return f"{output}_custom_range" + else: + return output + + rule all: input: expand("{name}.csv", name=output_name), @@ -136,9 +141,9 @@ rule filter: rules.convert.output output: expand( - "{outdir}/{samplename}_{prof_t}_{data_t}.csv", outdir=output_name, + "MarkerPlots/{output_name}_{samplename}_marker_plots.pdf", output_name=get_markerplot_name(config["output"], config["custom_ranges"]), samplename=get_sample_IDs(input_name, config["analysis_software"], output_name, software, - separate), prof_t=prof, data_t=data + separate) ) params: output_type=config["output_type"], diff --git a/lusSTR/wrappers/filter.py b/lusSTR/wrappers/filter.py index 812713e..39e7c1b 100644 --- a/lusSTR/wrappers/filter.py +++ b/lusSTR/wrappers/filter.py @@ -313,39 +313,40 @@ def format_ref_table(new_rows, sample_data, datatype): return sort_df -def marker_plots(df, output_name, kit, wd="."): +def marker_plots(df, output_name, kit, sample_list, wd="."): Path(f"{wd}/MarkerPlots").mkdir(parents=True, exist_ok=True) filt_df = df[df["allele_type"] == "Typed"] - for sample_id in df["SampleID"].unique(): - if df[df["SampleID"] == sample_id].empty: - print(f"{sample_id} does not have any reads passing filter. Skipping to next sample.") - else: - with PdfPages(f"{wd}/MarkerPlots/{output_name}_{sample_id}_marker_plots.pdf") as pdf: - if not filt_df[filt_df["SampleID"] == sample_id].empty: - make_plot(filt_df, sample_id, output_name, kit, filters=True, at=False) - pdf.savefig() - make_plot(df, sample_id, output_name, kit) - pdf.savefig() - make_plot(df, sample_id, output_name, kit, sameyaxis=True) + for sample_id in sample_list: + # if df[df["SampleID"] == sample_id].empty: + # print(f"{sample_id} does not have any reads passing filter. Skipping to next sample.") + # else: + with PdfPages(f"{wd}/MarkerPlots/{output_name}_{sample_id}_marker_plots.pdf") as pdf: + if not filt_df[filt_df["SampleID"] == sample_id].empty: + make_plot(filt_df, sample_id, output_name, kit, filters=True, at=False) pdf.savefig() + make_plot(df, sample_id, output_name, kit) + pdf.savefig() + make_plot(df, sample_id, output_name, kit, sameyaxis=True) + pdf.savefig() def make_plot(df, sample_id, output_name, kit, sameyaxis=False, filters=False, at=True): sample_df = df[df["SampleID"] == sample_id].copy() - conditions = [ - sample_df["allele_type"].str.contains("Typed"), - sample_df["allele_type"].str.contains("BelowAT"), - sample_df["allele_type"].str.contains("stutter"), - sample_df["allele_type"].str.contains("Deleted"), - ] - values = ["Typed", "BelowAT", "Stutter", "Deleted"] - sample_df.loc[:, "Type"] = np.select(conditions, values) - max_reads = max(sample_df["Reads"]) - n = 100 if max_reads > 1000 else 10 - max_yvalue = (int(math.ceil(max_reads / n)) * n) + n - increase_value = int(math.ceil((max_yvalue / 5) / n)) * n + plot_loc = 0 fig = plt.figure(figsize=(30, 30)) - n = 0 + if not sample_df.empty: + conditions = [ + sample_df["allele_type"].str.contains("Typed"), + sample_df["allele_type"].str.contains("BelowAT"), + sample_df["allele_type"].str.contains("stutter"), + sample_df["allele_type"].str.contains("Deleted"), + ] + values = ["Typed", "BelowAT", "Stutter", "Deleted"] + sample_df.loc[:, "Type"] = np.select(conditions, values) + max_reads = max(sample_df["Reads"]) + n = 100 if max_reads > 1000 else 10 + max_yvalue = (int(math.ceil(max_reads / n)) * n) + n + increase_value = int(math.ceil((max_yvalue / 5) / n)) * n if kit == "powerseq": str_list = ( str_lists["powerseq_ystrs"] if "sexloci" in output_name else str_lists["powerseq_strs"] @@ -355,10 +356,10 @@ def make_plot(df, sample_id, output_name, kit, sameyaxis=False, filters=False, a str_lists["forenseq_ystrs"] if "sexloci" in output_name else str_lists["forenseq_strs"] ) for marker in str_list: - n += 1 + plot_loc += 1 colors = {"Typed": "green", "Stutter": "blue", "BelowAT": "red", "Deleted": "purple"} marker_df = sample_df[sample_df["Locus"] == marker].sort_values(by="CE_Allele") - ax = fig.add_subplot(6, 5, n) + ax = fig.add_subplot(6, 5, plot_loc) if not marker_df.empty: if marker == "AMELOGENIN": for i, row in marker_df.iterrows(): @@ -448,6 +449,7 @@ def process_input( info=True, ): full_df = pd.read_csv(f"{input_name}.txt", sep="\t") + sample_list = full_df["SampleID"].unique() if custom: seq_col = "Custom_Range_Sequence" brack_col = "Custom_Bracketed_Notation" @@ -460,7 +462,7 @@ def process_input( ) if nofiltering: full_df["allele_type"] = "Typed" - marker_plots(full_df, input_name, kit) + marker_plots(full_df, input_name, kit, sample_list) if output_type == "efm" or output_type == "mpsproto": EFM_output(full_df, outpath, profile_type, data_type, brack_col, sex, kit, separate) else: @@ -469,7 +471,7 @@ def process_input( dict_loc = {k: v for k, v in full_df.groupby(["SampleID", "Locus"])} final_df, flags_df = process_strs(dict_loc, data_type, seq_col, brack_col, kit) if final_df is not None: - marker_plots(final_df, input_name, kit) + marker_plots(final_df, input_name, kit, sample_list) if output_type == "efm" or output_type == "mpsproto": EFM_output( final_df, outpath, profile_type, data_type, brack_col, sex, kit, separate From 8e3cfa452b52f4cf2408b159a50201264fe96b34 Mon Sep 17 00:00:00 2001 From: rnmitchell Date: Wed, 25 Jun 2025 05:57:51 -0400 Subject: [PATCH 2/5] change how tabs were coded (removed streamlit_option_menu package) to work on windows [skip ci] --- lusSTR/cli/gui.py | 52 +++++++++++++++++++++++------------------------ 1 file changed, 25 insertions(+), 27 deletions(-) diff --git a/lusSTR/cli/gui.py b/lusSTR/cli/gui.py index 29b95ff..5c1b8c5 100644 --- a/lusSTR/cli/gui.py +++ b/lusSTR/cli/gui.py @@ -24,7 +24,6 @@ import plotly.express as px import plotly.graph_objs as go import streamlit as st -from streamlit_option_menu import option_menu import yaml import subprocess import os @@ -126,28 +125,21 @@ def main(): # Creating Navigation Bar - selected = option_menu( - menu_title=None, - options=["Home", "STRs", "SNPs", "How to Use", "Contact"], - icons=["house", "gear", "gear-fill", "book", "envelope"], - menu_icon="cast", - default_index=0, - orientation="horizontal", - ) + tab1, tab2, tab3, tab4, tab5 = st.tabs(["Home", "STRs", "SNPs", "How To Use", "Contact"]) - if selected == "Home": + with tab1: show_home_page() - elif selected == "STRs": + with tab2: show_STR_page() - elif selected == "SNPs": + with tab3: show_SNP_page() - elif selected == "How to Use": + with tab4: show_how_to_use_page() - elif selected == "Contact": + with tab5: show_contact_page() @@ -216,11 +208,14 @@ def interactive_plots_allmarkers(sample_df, flagged_df): col = cols[n] container = col.container(border=True) sample_locus = sample_df["SampleID"].unique() + "_" + marker - sample_df = np.where( - sample_df["Locus"] == "AMELOGENIN", - np.where(sample_df["CE_Allele"] == "X", 0, 1), - sample_df["CE_Allele"], - ) + #sample_df = np.where( + # sample_df["Locus"] == "AMELOGENIN", + # np.where(sample_df["CE_Allele"] == "X", "0.0", "1.0"), + # sample_df["CE_Allele"], + #) + for i, row in sample_df.iterrows(): + if row["Locus"] == "AMELOGENIN": + sample_df.loc[i, "CE_Allele"] = 0 if row.CE_Allele == "X" else 1 sample_df["CE_Allele"] = pd.to_numeric(sample_df["CE_Allele"]) marker_df = sample_df[sample_df["Locus"] == marker].sort_values( by=["CE_Allele", "allele_type"], ascending=[False, True] @@ -341,11 +336,14 @@ def interactive_setup(df1, file): interactive_plots_allmarkers(sample_df, flags) else: plot_df = sample_df - sample_df = np.where( - sample_df["Locus"] == "AMELOGENIN", - np.where(sample_df["CE_Allele"] == "X", 0, 1), - sample_df["CE_Allele"], - ) + #sample_df = np.where( + # sample_df["Locus"] == "AMELOGENIN", + # np.where(sample_df["CE_Allele"] == "X", 0, 1), + # sample_df["CE_Allele"], + #) + for i, row in sample_df.iterrows(): + if row["Locus"] == "AMELOGENIN": + sample_df.loc[i, "CE_Allele"] = 0 if row.CE_Allele == "X" else 1 plot_df["CE_Allele"] = pd.to_numeric(plot_df["CE_Allele"]) locus_key = f"{sample}_{locus}" if locus_key not in st.session_state: @@ -825,7 +823,7 @@ def show_SNP_page(): "Multiple Input Files" ) input_option = st.radio( - "Select Input Option:", ("Individual File", "Folder with Multiple Files") + "Select Input Option:", ("Individual File", "Folder with Multiple Files"), key="snps" ) # Initialize session state if not already initialized @@ -850,7 +848,7 @@ def show_SNP_page(): # Display The Selected Path if st.session_state.samp_input: - st.text_input("Location Of Your Input File(s):", st.session_state.samp_input) + st.text_input("Location Of Your Input File(s):", st.session_state.samp_input, key="input_snps") # Store Selected Path to Reference in Config samp_input = st.session_state.samp_input @@ -965,7 +963,7 @@ def show_SNP_page(): # Display selected path if st.session_state.wd_dirname: - st.text_input("Your Specified Output Folder:", st.session_state.wd_dirname) + st.text_input("Your Specified Output Folder:", st.session_state.wd_dirname, key="output_snps") ##################################################################### # SNP: Generate Config File Based on Settings # From 3504ae71ff9fdb37b1e1c59f772df775bedf979f Mon Sep 17 00:00:00 2001 From: rnmitchell Date: Fri, 18 Jul 2025 09:50:53 -0400 Subject: [PATCH 3/5] detects whether os is windows and creates log appropriately --- lusSTR/workflows/strs.smk | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/lusSTR/workflows/strs.smk b/lusSTR/workflows/strs.smk index e223d35..d10f6c9 100644 --- a/lusSTR/workflows/strs.smk +++ b/lusSTR/workflows/strs.smk @@ -74,13 +74,24 @@ def parse_sample_details(filename): def create_log(log): now = datetime.now() dt = now.strftime("%m%d%Y_%H_%M_%S") - shell("mkdir -p logs/{dt}/input/") - shell("cp '{log}' logs/{dt}/") - if os.path.isdir(input_name): - shell("cp '{input_name}'/*.* logs/{dt}/input/") + system = os.name + if system == "nt": + shell("md logs\\{dt}\\Input\\") + shell('copy "{log}" logs\\{dt}\\') + shell("copy config.yaml logs\\{dt}\\") + new_file = input_name.replace("/", "\\") + if os.path.isdir(input_name): + shell('xcopy "{new_file}" logs\\{dt}\\Input') + else: + shell('copy "{new_file}" logs\\{dt}\\Input\\') else: - shell("cp '{input_name}' logs/{dt}/input/") - shell("cp config.yaml logs/{dt}/") + shell("mkdir -p logs/{dt}/input/") + shell("cp '{log}' logs/{dt}/") + if os.path.isdir(input_name): + shell("cp '{input_name}'/*.* logs/{dt}/input/") + else: + shell("cp '{input_name}' logs/{dt}/input/") + shell("cp config.yaml logs/{dt}/") def get_output(): @@ -103,9 +114,9 @@ rule all: expand("{name}.csv", name=output_name), expand("{name}.txt", name=output_name), expand( - "{outdir}/{samplename}_{prof_t}_{data_t}.csv", outdir=output_name, + "MarkerPlots/{output_name}_{samplename}_marker_plots.pdf", output_name=get_markerplot_name(output_name, config["custom_ranges"]), samplename=get_sample_IDs(input_name, config["analysis_software"], output_name, software, - separate), prof_t=prof, data_t=data + separate) ) @@ -141,7 +152,7 @@ rule filter: rules.convert.output output: expand( - "MarkerPlots/{output_name}_{samplename}_marker_plots.pdf", output_name=get_markerplot_name(config["output"], config["custom_ranges"]), + "MarkerPlots/{output_name}_{samplename}_marker_plots.pdf", output_name=get_markerplot_name(output_name, config["custom_ranges"]), samplename=get_sample_IDs(input_name, config["analysis_software"], output_name, software, separate) ) From a1471fcf8b0256f87f5cb743e7cae2693f2d1f7c Mon Sep 17 00:00:00 2001 From: rnmitchell Date: Mon, 21 Jul 2025 05:26:48 -0400 Subject: [PATCH 4/5] removed hashed out code --- lusSTR/cli/gui.py | 10 ---------- lusSTR/wrappers/filter.py | 3 --- 2 files changed, 13 deletions(-) diff --git a/lusSTR/cli/gui.py b/lusSTR/cli/gui.py index 5c1b8c5..7dfa8cc 100644 --- a/lusSTR/cli/gui.py +++ b/lusSTR/cli/gui.py @@ -208,11 +208,6 @@ def interactive_plots_allmarkers(sample_df, flagged_df): col = cols[n] container = col.container(border=True) sample_locus = sample_df["SampleID"].unique() + "_" + marker - #sample_df = np.where( - # sample_df["Locus"] == "AMELOGENIN", - # np.where(sample_df["CE_Allele"] == "X", "0.0", "1.0"), - # sample_df["CE_Allele"], - #) for i, row in sample_df.iterrows(): if row["Locus"] == "AMELOGENIN": sample_df.loc[i, "CE_Allele"] = 0 if row.CE_Allele == "X" else 1 @@ -336,11 +331,6 @@ def interactive_setup(df1, file): interactive_plots_allmarkers(sample_df, flags) else: plot_df = sample_df - #sample_df = np.where( - # sample_df["Locus"] == "AMELOGENIN", - # np.where(sample_df["CE_Allele"] == "X", 0, 1), - # sample_df["CE_Allele"], - #) for i, row in sample_df.iterrows(): if row["Locus"] == "AMELOGENIN": sample_df.loc[i, "CE_Allele"] = 0 if row.CE_Allele == "X" else 1 diff --git a/lusSTR/wrappers/filter.py b/lusSTR/wrappers/filter.py index 39e7c1b..8edd3ee 100644 --- a/lusSTR/wrappers/filter.py +++ b/lusSTR/wrappers/filter.py @@ -317,9 +317,6 @@ def marker_plots(df, output_name, kit, sample_list, wd="."): Path(f"{wd}/MarkerPlots").mkdir(parents=True, exist_ok=True) filt_df = df[df["allele_type"] == "Typed"] for sample_id in sample_list: - # if df[df["SampleID"] == sample_id].empty: - # print(f"{sample_id} does not have any reads passing filter. Skipping to next sample.") - # else: with PdfPages(f"{wd}/MarkerPlots/{output_name}_{sample_id}_marker_plots.pdf") as pdf: if not filt_df[filt_df["SampleID"] == sample_id].empty: make_plot(filt_df, sample_id, output_name, kit, filters=True, at=False) From 75ba13d5feba34e9590020a9c6a14e8e2f9545b9 Mon Sep 17 00:00:00 2001 From: rnmitchell Date: Mon, 21 Jul 2025 09:52:31 -0400 Subject: [PATCH 5/5] copying log files using python instead of bash --- lusSTR/workflows/strs.smk | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/lusSTR/workflows/strs.smk b/lusSTR/workflows/strs.smk index d10f6c9..5d76a9e 100644 --- a/lusSTR/workflows/strs.smk +++ b/lusSTR/workflows/strs.smk @@ -6,6 +6,7 @@ import os import pandas as pd from pathlib import Path import re +import shutil configfile: "config.yaml" @@ -74,24 +75,17 @@ def parse_sample_details(filename): def create_log(log): now = datetime.now() dt = now.strftime("%m%d%Y_%H_%M_%S") - system = os.name - if system == "nt": - shell("md logs\\{dt}\\Input\\") - shell('copy "{log}" logs\\{dt}\\') - shell("copy config.yaml logs\\{dt}\\") - new_file = input_name.replace("/", "\\") - if os.path.isdir(input_name): - shell('xcopy "{new_file}" logs\\{dt}\\Input') - else: - shell('copy "{new_file}" logs\\{dt}\\Input\\') + input_name = Path(config["samp_input"]) + dtdir = Path("logs") / dt + logdir = dtdir / "input" + logdir.mkdir(parents=True, exist_ok=True) + shutil.copy(log, dtdir / "snakemake.log") + if input_name.is_dir(): + for path in input_name.glob("*.*"): + shutil.copy(path, logdir / path.name) else: - shell("mkdir -p logs/{dt}/input/") - shell("cp '{log}' logs/{dt}/") - if os.path.isdir(input_name): - shell("cp '{input_name}'/*.* logs/{dt}/input/") - else: - shell("cp '{input_name}' logs/{dt}/input/") - shell("cp config.yaml logs/{dt}/") + shutil.copy(input_name, logdir / input_name.name) + shutil.copy("config.yaml", dtdir / "config.yaml") def get_output():