Skip to content

implement quality control steps #6

@matbonfanti

Description

@matbonfanti

Description of feature

implement steps for QC metrics plot using this code from @SaraTerzol

# Quality control
def qc_from_h5ad(
    h5ad_path,
    sample_id,
    plots_dir="/home/jovyan/work/results/QC_plots",
    output_dir="/home/jovyan/work/results/object/H5AD",
    min_counts=100,
    min_genes=50,
    max_mt=20,
    novelty_thresh=0.8,
    min_cells=3,
    pdf_pages=None  # optional PdfPages object to save all plots in a single PDF
):
    """
    Perform full QC starting from a 016um H5AD file, including:
    - In-tissue filtering
    - Genes/UMI per spot
    - Mitochondrial, ribosomal, hemoglobin gene QC
    - Combined QC plots
    - Local outlier detection
    - Save filtered AnnData
    
    Parameters
    ----------
    h5ad_path : str
        Path to the 016um H5AD file.
    sample_id : str
        Sample identifier.
    plots_dir : str
        Directory where QC plots will be saved.
    output_dir : str
        Directory where the filtered AnnData will be saved.
    pdf_pages : PdfPages, optional
        Pass an open PdfPages object to save all plots in a single PDF.
        
    Returns
    -------
    dict
        Summary statistics: total spots, in-tissue spots, genes/UMI per spot.
    """

    # ------------------------
    # Create directories if needed
    # ------------------------
    os.makedirs(plots_dir, exist_ok=True)
    os.makedirs(output_dir, exist_ok=True)
    sns.set(style="whitegrid")
    
    # ------------------------
    # Start message
    # ------------------------
    print(f"\n=== Processing sample: {sample_id} ===")
    
    # ------------------------
    # Load 016um AnnData
    # ------------------------
    adata = sc.read_h5ad(h5ad_path)
    adata.obs['sample'] = sample_id
    
    # ------------------------
    # Count spots
    # ------------------------
    total_spots = len(adata.obs['in_tissue'])
    n_in_tissue = (adata.obs['in_tissue'] == 1).sum()
    n_out_tissue = (adata.obs['in_tissue'] == 0).sum()
    print(f"Total spots: {total_spots}, In tissue: {n_in_tissue}, Outside tissue: {n_out_tissue}")
    
    # ------------------------
    # Filter in-tissue spots
    # ------------------------
    adata.obs["in_tissue_str"] = ["In tissue" if x==1 else "Outside tissue" for x in adata.obs["in_tissue"]]
    adata = adata[adata.obs["in_tissue_str"]=="In tissue"].copy()
    
    if not sp.issparse(adata.X):
        adata.X = sp.csr_matrix(adata.X)
    
    # ------------------------
    # Compute genes and UMI per spot
    # ------------------------
    genes_per_spot = np.array((adata.X > 0).sum(axis=1)).flatten()
    umi_per_spot = np.array(adata.X.sum(axis=1)).flatten()
    mean_genes_per_spot = genes_per_spot.mean()
    mean_umi_per_spot = umi_per_spot.mean()
    
    print(f"Mean genes per spot: {mean_genes_per_spot:.2f}")
    print(f"Mean UMI per spot: {mean_umi_per_spot:.2f}")
    
    # ------------------------
    # Flag genes for QC
    # ------------------------
    adata.var["mt"] = adata.var_names.str.startswith("MT-")
    adata.var["ribo"] = adata.var_names.str.startswith(('RPS','RPL'))
    adata.var["hb"] = adata.var_names.str.startswith("HB")
    
    # ------------------------
    # Calculate QC metrics
    # ------------------------
    sc.pp.calculate_qc_metrics(
        adata,
        qc_vars=['mt','ribo','hb'],
        inplace=True,
        percent_top=[20],
        log1p=True
    )
    
    print(adata.obs)
    
    # ------------------------
    # Compute novelty (complexity) score
    # ------------------------
    
    # Novelty score
    novelty_score = genes_per_spot / umi_per_spot
    adata.obs["novelty_score"] = novelty_score
    adata.obs["low_complexity"] = novelty_score < novelty_thresh
    num_low_complexity = adata.obs["low_complexity"].sum()
    print(f"Num_low_complexity: {num_low_complexity}")
    

    # QC flags for global outliers
    adata.obs["qc_lib_size"] = adata.obs["total_counts"] < min_counts
    adata.obs["qc_detected"] = adata.obs["n_genes_by_counts"] < min_genes
    if "pct_counts_mt" in adata.obs.columns:
        adata.obs["qc_mito"] = adata.obs["pct_counts_mt"] > max_mt
    else:
        adata.obs["qc_mito"] = False
        
    # ------------------------
    # MT table
    # ------------------------
    if "pct_counts_mt" in adata.obs.columns:
        mt_table = pd.DataFrame({
            "Sample": [sample_id],
            "Total spots": [total_spots],
            "Mean % MT": [adata.obs["pct_counts_mt"].mean()],
            "Median % MT": [adata.obs["pct_counts_mt"].median()],
            "Max % MT": [adata.obs["pct_counts_mt"].max()],
            "Min % MT": [adata.obs["pct_counts_mt"].min()]
        })

        print("\n--- Mitochondrial counts summary ---")
        print(mt_table)


    # ------------------------
    # QC plots before filtering
    # ------------------------
    fig, axs = plt.subplots(1, 3, figsize=(18, 5))

    sns.histplot(adata.obs["total_counts"], kde=True, color="skyblue", ax=axs[0])
    axs[0].set_title(f"{sample_id} - Total Counts per Spot")
    axs[0].set_xlabel("Total counts")
    axs[0].set_ylabel("Number of spots")

    sns.histplot(adata.obs["n_genes_by_counts"], kde=True, bins=60, color="lightgreen", ax=axs[1])
    axs[1].set_title(f"{sample_id} - Genes Detected per Spot")
    axs[1].set_xlabel("Number of genes")
    axs[1].set_ylabel("Number of spots")

    if "pct_counts_mt" in adata.obs.columns:
        sns.histplot(adata.obs["pct_counts_mt"], kde=True, bins=60, color="salmon", ax=axs[2])
        axs[2].set_title(f"{sample_id} - Percent Mitochondrial Counts")
        axs[2].set_xlabel("% mt counts")
        axs[2].set_ylabel("Number of spots")

    plt.tight_layout()
    plt.savefig(os.path.join(plots_dir, f"{sample_id}_qc_before_filter.png"))

    if pdf_pages:
        pdf_pages.savefig(fig)
    plt.close(fig)
    
    
    # ------------------------
    # Global outlier detection
    # ------------------------
    # Combine global outliers
    adata.obs["global_outliers"] = (
        adata.obs["qc_lib_size"] |
        adata.obs["qc_detected"] |
        adata.obs["qc_mito"] 
        #adata.obs["low_complexity"]
    )
    print("Number of global outliers:")
    print(adata.obs["global_outliers"].value_counts())
    
    # ------------------------
    # Local outlier detection
    # ------------------------
    for metric in ["total_counts", "n_genes_by_counts","pct_counts_mt"]:
        lo.local_outliers(adata, metric=metric, sample_key="region",n_neighbors=36)
        outlier_col = f"{metric}_outliers"
        outliers_count = adata.obs[outlier_col].sum()
        print(f"Detected {outliers_count} local outliers for {metric}.")
        
    # Combine local outliers
    adata.obs["local_outliers"] = (
        adata.obs["total_counts_outliers"] |
        adata.obs["n_genes_by_counts_outliers"] |
        adata .obs["pct_counts_mt_outliers"]
    )
    print("Number of local outliers:")
    print(adata.obs["local_outliers"].value_counts())
    
    # ------------------------
    # Spatial QC plot 
    # ------------------------
    plt.rcParams["figure.figsize"] = (8, 8)
    sc.pl.spatial(
        adata,
        img_key="hires",
        color=["total_counts", "n_genes_by_counts","pct_counts_mt","global_outliers","total_counts_outliers","n_genes_by_counts_outliers","pct_counts_mt_outliers","local_outliers"],
        spot_size=16,
        cmap='viridis',
        alpha=0.8,
        show=False
    )
    plt.tight_layout()
    plt.savefig(os.path.join(plots_dir, f"{sample_id}_spatial_qc.png"))
    plt.close()
    
    # ------------------------
    # Collect QC summary
    # ------------------------
    summary_dict = {
        "Sample": sample_id,
        "Total spots": total_spots,
        "In-tissue spots": n_in_tissue,
        "Outside-tissue spots": n_out_tissue,
        "Mean genes per spot": mean_genes_per_spot,
        "Mean UMI per spot": mean_umi_per_spot,
        "Global outliers (True)": int(adata.obs["global_outliers"].sum()),
        "Global outliers (False)": int((~adata.obs["global_outliers"]).sum()),
        "Local outliers (True)": int(adata.obs["local_outliers"].sum()),
        "Local outliers (False)": int((~adata.obs["local_outliers"]).sum()),
    }

    # Add mitochondrial summary if available
    if "pct_counts_mt" in adata.obs.columns:
        summary_dict.update({
            "Mean % MT": adata.obs["pct_counts_mt"].mean(),
            "Median % MT": adata.obs["pct_counts_mt"].median()
        })

    # ------------------------
    # Save summary as CSV
    # ------------------------
    summary_df = pd.DataFrame([summary_dict])
    csv_path = os.path.join(plots_dir, f"{sample_id}_QC_summary.csv")
    summary_df.to_csv(csv_path, index=False)
    print(f"\nQC summary saved to: {csv_path}")
    
    # ------------------------
    # Save filtered AnnData
    # ------------------------
    output_h5ad = os.path.join(output_dir, f"{sample_id}_016um_qc.h5ad")
    adata.write(output_h5ad)
    print(f"Filtered AnnData saved to: {output_h5ad}")
    
    # ------------------------
    # Return summary statistics
    # ------------------------
    return {
        "sample_id": sample_id,
        "total_spots": total_spots,
        "spots_in_tissue": n_in_tissue,
        "spots_out_tissue": n_out_tissue,
        "mean_genes_per_spot": mean_genes_per_spot,
        "mean_umi_per_spot": mean_umi_per_spot
    }
Call functions
Load the AnnData object and initialize the SpatialData object with image and scale factors.
samples = [{"path": "/home/jovyan/data/2_Analysis/0_spaceranger/NFG012_018_CyLIB", "id": "SW03"}]
processed_samples = {}

for sample in samples:
    sample_path = sample["path"]
    sample_id = sample["id"]
    
    zarr_path, h5ad_path = load_and_save(
        NFG_path=sample_path,
        sample_id=sample_id,
        output_dir="/home/jovyan/work/results/object/"
    )
    
    processed_samples[sample_id] = {"zarr": zarr_path, "h5ad": h5ad_path}

Metadata

Metadata

Assignees

Labels

enhancementNew feature or request

Type

No type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions