From a1f7499b2e8b91739cd7df45bb11ca8c5d15b6e7 Mon Sep 17 00:00:00 2001 From: Ouma Ronald Date: Thu, 26 Feb 2026 12:36:20 +0300 Subject: [PATCH 1/2] updates to the docs --- README.md | 2 +- app/utils/config.py | 61 ++++++++++++++++++- docker-setup.sh | 2 +- docs/CURATOR_TABLE_DESIGN.md | 2 +- run_confusion_analysis.sh | 2 +- test_cli.py => scripts/dev/test_cli.py | 1 + .../eval/confusion_matrix_analysis.py | 1 + scripts/{archive => ops}/log_cleanup.py | 0 scripts/{archive => ops}/log_dashboard.py | 0 .../{archive => ops}/performance_monitor.py | 0 10 files changed, 66 insertions(+), 5 deletions(-) rename test_cli.py => scripts/dev/test_cli.py (99%) rename confusion_matrix_analysis.py => scripts/eval/confusion_matrix_analysis.py (99%) rename scripts/{archive => ops}/log_cleanup.py (100%) rename scripts/{archive => ops}/log_dashboard.py (100%) rename scripts/{archive => ops}/performance_monitor.py (100%) diff --git a/README.md b/README.md index 37e6b66..90f6868 100644 --- a/README.md +++ b/README.md @@ -197,7 +197,7 @@ BioAnalyzer includes a formal validation workflow to compare automated predictio - **Ground truth**: Expert annotations in `feedback.csv` for the six BugSigDB fields - **Predictions**: BioAnalyzer outputs in a predictions CSV (e.g. `analysis_results.csv` or `new.csv`) - **Alignment**: PMIDs are aligned with `align_pmids.py` -- **Evaluation**: `confusion_matrix_analysis.py` computes 3-class confusion matrices (`ABSENT`, `PARTIALLY_PRESENT`, `PRESENT`) and per-field accuracy +- **Evaluation**: `scripts/eval/confusion_matrix_analysis.py` computes 3-class confusion matrices (`ABSENT`, `PARTIALLY_PRESENT`, `PRESENT`) and per-field accuracy - **Outputs**: Metrics and PNG confusion matrices are written to `confusion_matrix_results/` For sharing/inspection, `create_validation_dataset.py` can generate a flat CSV: diff --git a/app/utils/config.py b/app/utils/config.py index 706f1aa..3971531 100644 --- a/app/utils/config.py +++ b/app/utils/config.py @@ -3,6 +3,7 @@ import logging from typing import List, Optional from .credential_masking import mask_exception_message, mask_string +from app.core.settings import get_settings try: from dotenv import load_dotenv # type: ignore @@ -326,5 +327,63 @@ def setup_logging() -> logging.Logger: return root_logger -# Initialize logging when module is imported logger = setup_logging() + +# --------------------------------------------------------------------------- +# Bridge to structured settings (app.core.settings) +# This keeps app.core.settings as the single source of truth while preserving +# existing config module attributes used across the codebase. +# --------------------------------------------------------------------------- +try: + _settings = get_settings() + + # API / timeout settings + API_TIMEOUT = _settings.api.timeout + ANALYSIS_TIMEOUT = _settings.api.analysis_timeout + GEMINI_TIMEOUT = _settings.api.gemini_timeout + FRONTEND_TIMEOUT = _settings.api.frontend_timeout + NCBI_RATE_LIMIT_DELAY = _settings.api.ncbi_rate_limit_delay + MAX_CONCURRENT_REQUESTS = _settings.api.max_concurrent_requests + + # LLM config + if _settings.llm.provider: + LLM_PROVIDER = _settings.llm.provider + if _settings.llm.model: + LLM_MODEL = _settings.llm.model + + # RAG settings + RAG_SUMMARY_PROVIDER = _settings.rag.summary_provider or RAG_SUMMARY_PROVIDER + RAG_SUMMARY_MODEL = _settings.rag.summary_model or RAG_SUMMARY_MODEL + RAG_SUMMARY_LENGTH = _settings.rag.summary_length.value + RAG_SUMMARY_QUALITY = _settings.rag.summary_quality.value + RAG_RERANK_METHOD = _settings.rag.rerank_method.value + RAG_USE_SUMMARY_CACHE = _settings.rag.use_cache + RAG_MAX_SUMMARY_KEY_POINTS = _settings.rag.max_summary_key_points + RAG_TOP_K_CHUNKS = _settings.rag.top_k_chunks + + # Cache settings + CACHE_VALIDITY_HOURS = _settings.cache.validity_hours + MAX_CACHE_SIZE = _settings.cache.max_size + + # Retrieval settings + USE_FULLTEXT = _settings.retrieval.use_fulltext + if _settings.retrieval.ncbi_api_key: + NCBI_API_KEY = _settings.retrieval.ncbi_api_key + if _settings.retrieval.email: + EMAIL = _settings.retrieval.email + + # Security / environment + ENVIRONMENT = _settings.environment.value + CORS_ORIGINS = _settings.security.cors_origins + ENABLE_REQUEST_ID = _settings.security.enable_request_id + + # Rate limiting + ENABLE_RATE_LIMITING = _settings.rate_limit.enabled + RATE_LIMIT_PER_MINUTE = _settings.rate_limit.requests_per_minute + + # Logging + LOG_LEVEL = _settings.logging.level.value +except Exception: + # Fail gracefully; fall back to env-based values defined above + pass + diff --git a/docker-setup.sh b/docker-setup.sh index 934005e..8164653 100755 --- a/docker-setup.sh +++ b/docker-setup.sh @@ -68,7 +68,7 @@ fi print_status "Testing Docker image..." # Test the image with a simple command -docker run --rm bioanalyzer-package python test_cli.py +docker run --rm bioanalyzer-package python scripts/dev/test_cli.py if [ $? -eq 0 ]; then print_success "Docker image test passed!" diff --git a/docs/CURATOR_TABLE_DESIGN.md b/docs/CURATOR_TABLE_DESIGN.md index a1bd351..9856087 100644 --- a/docs/CURATOR_TABLE_DESIGN.md +++ b/docs/CURATOR_TABLE_DESIGN.md @@ -92,4 +92,4 @@ We can start with **PMID, Title, Year, Journal, the 6 field statuses, and one co - **Design:** `docs/CURATOR_TABLE_DESIGN.md` (this file). - **App:** `curator_table/` (Streamlit app + README). -- **Data format:** Same as existing BioAnalyzer export (e.g. `analysis_results.csv` / validation dataset shape); see `create_validation_dataset.py` and `confusion_matrix_analysis.py` for column names. +- **Data format:** Same as existing BioAnalyzer export (e.g. `analysis_results.csv` / validation dataset shape); see `create_validation_dataset.py` and `scripts/eval/confusion_matrix_analysis.py` for column names. diff --git a/run_confusion_analysis.sh b/run_confusion_analysis.sh index 30defd0..99ffee5 100755 --- a/run_confusion_analysis.sh +++ b/run_confusion_analysis.sh @@ -55,7 +55,7 @@ docker run --rm \ -v "$SCRIPT_DIR:/app" \ -w /app \ bioanalyzer-package \ - python confusion_matrix_analysis.py "$PREDICTIONS_FILE" "$FEEDBACK_FILE" + python scripts/eval/confusion_matrix_analysis.py "$PREDICTIONS_FILE" "$FEEDBACK_FILE" echo "" echo "==========================================" diff --git a/test_cli.py b/scripts/dev/test_cli.py similarity index 99% rename from test_cli.py rename to scripts/dev/test_cli.py index f4aa6ef..f60d6bb 100644 --- a/test_cli.py +++ b/scripts/dev/test_cli.py @@ -121,3 +121,4 @@ def test_field_info(): print(" python3 cli.py analyze 12345678") else: print("\n❌ Package CLI structure has issues!") + diff --git a/confusion_matrix_analysis.py b/scripts/eval/confusion_matrix_analysis.py similarity index 99% rename from confusion_matrix_analysis.py rename to scripts/eval/confusion_matrix_analysis.py index 6feb7ea..102ac86 100644 --- a/confusion_matrix_analysis.py +++ b/scripts/eval/confusion_matrix_analysis.py @@ -248,3 +248,4 @@ def main(): if __name__ == "__main__": main() + diff --git a/scripts/archive/log_cleanup.py b/scripts/ops/log_cleanup.py similarity index 100% rename from scripts/archive/log_cleanup.py rename to scripts/ops/log_cleanup.py diff --git a/scripts/archive/log_dashboard.py b/scripts/ops/log_dashboard.py similarity index 100% rename from scripts/archive/log_dashboard.py rename to scripts/ops/log_dashboard.py diff --git a/scripts/archive/performance_monitor.py b/scripts/ops/performance_monitor.py similarity index 100% rename from scripts/archive/performance_monitor.py rename to scripts/ops/performance_monitor.py From f9ddd2b68221e5ba3020ab42cdcee9f57ae6dfec Mon Sep 17 00:00:00 2001 From: Ouma Ronald Date: Mon, 2 Mar 2026 16:47:46 +0300 Subject: [PATCH 2/2] refactor(curator): shorten and harden app.py for maintainability - Config: single CONFIG dict with FEEDBACK_DIR, USER, BIOANALYZER_VERSION from env - Schema: OPTIONS dict for valid_states/col_feedback/true_label; dynamic _feedback_schema() - Data: unified _load_data(path_or_upload, is_path) with BytesIO for uploads - Helpers: _safe_int, _normalize_status, _priority_score (comprehension); PMID link inline - Normalization: assign chains, try/except for PMID and year parsing - Feedback: load_feedback/save_feedback/upsert_feedback use _feedback_schema(); single loop for row pred/true/col_feedback - UI: render_filters dict comp for status filters; display_cols from single want list; compact_cols list comp - Robustness: logging for load/save/errors; PMID validation warning if not in dataset; explicit TypeError/ValueError in helpers - Removed: OPTIONAL_COLUMNS (unused); redundant docstrings; duplicate loops --- curator_table/app.py | 639 +++++++++++++------------------------------ 1 file changed, 189 insertions(+), 450 deletions(-) diff --git a/curator_table/app.py b/curator_table/app.py index 6c63c45..99346ca 100644 --- a/curator_table/app.py +++ b/curator_table/app.py @@ -3,63 +3,44 @@ BioAnalyzer Curator Table (with column-level validation + curator ground truth) ============================================================================= -A Streamlit dashboard for real-world validation of BioAnalyzer predictions. - -This app supports: -1) A sortable, searchable, filterable table of candidate curatable PubMed articles. -2) A curator feedback workflow aligned by PMID. -3) Column-level correctness checks for each BioAnalyzer field. -4) Curator-provided TRUE labels (ground truth) for each field. -5) Exportable feedback suitable for: - - confusion matrices - - per-field error analysis - - binary + multiclass evaluation - - MCC decisions on how to treat PARTIALLY_PRESENT - -Run: - streamlit run curator_table/app.py - -Input data: - CSV or Parquet with at minimum: - - PMID - Recommended: - - Title, Journal, Year, Summary - Expected BioAnalyzer outputs: - - Host Species Status - - Body Site Status - - Condition Status - - Sequencing Type Status - - Taxa Level Status - - Sample Size Status - -Status values expected: - ABSENT | PARTIALLY_PRESENT | PRESENT - -Feedback storage: - results/curator_feedback.csv - results/curator_feedback.parquet - -Design notes ------------- -- Feedback is upserted by (PMID, curator_id) to prevent duplicates. -- Feedback rows store BOTH: - (a) the BioAnalyzer predictions (pred__*) - (b) the curator's evaluation (col_feedback__*) - (c) the curator's ground truth labels (true__*) - This makes benchmarking reproducible even if the input dataset changes. +Streamlit dashboard for real-world validation of BioAnalyzer predictions: +sortable/searchable/filterable table, curator feedback by PMID, column-level +correctness + ground truth, exportable feedback for confusion matrices and MCC. + +Run: streamlit run curator_table/app.py +Input: CSV or Parquet with PMID (recommended: Title, Journal, Year, Summary). +Status values: ABSENT | PARTIALLY_PRESENT | PRESENT. +Feedback: results/curator_feedback.csv and .parquet (upserted by PMID + curator_id). """ from __future__ import annotations +import io +import logging import os from pathlib import Path -from typing import Optional, List, Dict +from typing import Optional import pandas as pd import streamlit as st # ----------------------------- -# Expected prediction columns +# Config (env overrides) +# ----------------------------- +CONFIG = { + "feedback_dir": Path(os.getenv("FEEDBACK_DIR", "results")), + "curator_id_default": os.getenv("USER", ""), + "bioanalyzer_version_default": os.getenv("BIOANALYZER_VERSION", ""), +} + +CONFIG["feedback_dir"].mkdir(exist_ok=True) +FEEDBACK_CSV = CONFIG["feedback_dir"] / "curator_feedback.csv" +FEEDBACK_PARQUET = CONFIG["feedback_dir"] / "curator_feedback.parquet" + +logger = logging.getLogger(__name__) + +# ----------------------------- +# Schema (single source of truth) # ----------------------------- STATUS_COLUMNS = [ "Host Species Status", @@ -69,312 +50,202 @@ "Taxa Level Status", "Sample Size Status", ] - -OPTIONAL_COLUMNS = [ - "Title", - "Journal", - "Summary", - "Processing Time", - "Year", - "Publication Date", +OPTIONS = { + "valid_states": ["ABSENT", "PARTIALLY_PRESENT", "PRESENT"], + "col_feedback": ["Not reviewed", "Correct", "Incorrect", "Unclear"], + "true_label": ["Not reviewed", "ABSENT", "PARTIALLY_PRESENT", "PRESENT"], +} + +_safe = lambda col: col.replace(" ", "_") +FEEDBACK_BASE_COLS = [ + "PMID", "curator_id", "overall_verdict", "comment", "timestamp", "bioanalyzer_version", ] +PRED_PREFIX, TRUE_PREFIX, COL_FB_PREFIX = "pred__", "true__", "col_feedback__" -VALID_STATES = ["ABSENT", "PARTIALLY_PRESENT", "PRESENT"] -# Column-level feedback options (curator judgement of correctness) -COL_FEEDBACK_OPTIONS = [ - "Not reviewed", - "Correct", - "Incorrect", - "Unclear", -] - -# True label options (ground truth) -TRUE_LABEL_OPTIONS = [ - "Not reviewed", - "ABSENT", - "PARTIALLY_PRESENT", - "PRESENT", -] - -# ----------------------------- -# Feedback persistence -# ----------------------------- -DEFAULT_FEEDBACK_DIR = Path("results") -DEFAULT_FEEDBACK_DIR.mkdir(exist_ok=True) - -FEEDBACK_CSV = DEFAULT_FEEDBACK_DIR / "curator_feedback.csv" -FEEDBACK_PARQUET = DEFAULT_FEEDBACK_DIR / "curator_feedback.parquet" +def _feedback_schema() -> list[str]: + """Full feedback column schema (dynamic from STATUS_COLUMNS).""" + derived = [ + f"{PRED_PREFIX}{_safe(c)}" for c in STATUS_COLUMNS + ] + [ + f"{TRUE_PREFIX}{_safe(c)}" for c in STATUS_COLUMNS + ] + [ + f"{COL_FB_PREFIX}{_safe(c)}" for c in STATUS_COLUMNS + ] + return FEEDBACK_BASE_COLS + derived # ----------------------------- # Helpers # ----------------------------- -def make_pmid_link(pmid) -> str: - """Return PubMed URL for a PMID.""" +def _make_pmid_link(pmid) -> str: try: - pid = str(int(float(pmid))) - return f"https://pubmed.ncbi.nlm.nih.gov/{pid}/" - except Exception: + return f"https://pubmed.ncbi.nlm.nih.gov/{int(float(pmid))}/" + except (TypeError, ValueError): return "" -def safe_int(x) -> Optional[int]: +def _safe_int(x) -> Optional[int]: try: return int(float(x)) - except Exception: + except (TypeError, ValueError): return None -def normalize_status_value(x: str) -> str: - """Normalize status values to one of ABSENT/PARTIALLY_PRESENT/PRESENT.""" +def _normalize_status(x: str) -> str: if pd.isna(x): return "" x = str(x).strip().upper() - if x in VALID_STATES: + if x in OPTIONS["valid_states"]: return x - - # Common variants if x in {"PARTIAL", "PARTIALLY", "PARTLY"}: return "PARTIALLY_PRESENT" if x in {"YES", "TRUE"}: return "PRESENT" if x in {"NO", "FALSE"}: return "ABSENT" - return x -def compute_priority_score(row: pd.Series) -> float: - """ - Rank candidates by how many fields are predicted present. - - - PRESENT contributes 1.0 - - PARTIALLY_PRESENT contributes 0.5 - """ - score = 0.0 - for col in STATUS_COLUMNS: - val = str(row.get(col, "")).strip().upper() - if val == "PRESENT": - score += 1.0 - elif val == "PARTIALLY_PRESENT": - score += 0.5 - return score - - -@st.cache_data(show_spinner=False) -def load_data_from_path(path: str) -> pd.DataFrame: - path = Path(path) - if not path.exists(): - return pd.DataFrame() - - suf = path.suffix.lower() - if suf == ".csv": - return pd.read_csv(path) - if suf in (".parquet", ".pq"): - return pd.read_parquet(path) - - raise ValueError(f"Unsupported format: {suf}. Use .csv or .parquet.") +def _priority_score(row: pd.Series) -> float: + weights = {"PRESENT": 1.0, "PARTIALLY_PRESENT": 0.5} + return sum( + weights.get(str(row.get(col, "")).strip().upper(), 0.0) + for col in STATUS_COLUMNS + ) +# ----------------------------- +# Data loading (unified) +# ----------------------------- @st.cache_data(show_spinner=False) -def load_data_from_upload(uploaded) -> pd.DataFrame: - if uploaded is None: +def _load_data(source, is_path: bool) -> pd.DataFrame: + """Load from file path (str/Path) or uploaded file; returns empty DataFrame on failure.""" + if source is None or (is_path and not source): return pd.DataFrame() - - name = uploaded.name.lower() - if name.endswith(".csv"): - return pd.read_csv(uploaded) - if name.endswith(".parquet") or name.endswith(".pq"): - return pd.read_parquet(uploaded) - - raise ValueError("Unsupported upload. Use .csv or .parquet.") + if is_path: + path = Path(source) + if not path.exists(): + logger.warning("Path does not exist: %s", path) + return pd.DataFrame() + buf, ext = path, path.suffix.lower() + else: + name = source.name.lower() + buf = io.BytesIO(source.getvalue()) + ext = ".csv" if name.endswith(".csv") else (".parquet" if name.endswith((".parquet", ".pq")) else "") + if ext == ".csv": + return pd.read_csv(buf) + if ext in (".parquet", ".pq"): + return pd.read_parquet(buf) + raise ValueError(f"Unsupported format. Use .csv or .parquet.") def normalize_dataset(df: pd.DataFrame) -> pd.DataFrame: - """Normalize dataset and derive helper columns.""" - if df.empty: - return df - - if "PMID" not in df.columns: - st.error("Data must contain a 'PMID' column.") + """Normalize PMID, statuses, derive year/link/priority. Returns empty DataFrame if invalid.""" + if df.empty or "PMID" not in df.columns: + if not df.empty: + st.error("Data must contain a 'PMID' column.") + return pd.DataFrame() + try: + df = ( + df.assign(PMID=df["PMID"].apply(_safe_int)) + .dropna(subset=["PMID"]) + .astype({"PMID": int}) + ) + except Exception as e: + logger.exception("PMID normalization failed: %s", e) return pd.DataFrame() - - # Normalize PMID - df["PMID"] = df["PMID"].apply(safe_int) - df = df.dropna(subset=["PMID"]).copy() - df["PMID"] = df["PMID"].astype(int) - - # Derive year if "Year" not in df.columns and "Publication Date" in df.columns: try: - df["Year"] = pd.to_datetime(df["Publication Date"], errors="coerce").dt.year + df = df.assign(Year=pd.to_datetime(df["Publication Date"], errors="coerce").dt.year) except Exception: pass - - # Normalize statuses for col in STATUS_COLUMNS: if col in df.columns: - df[col] = df[col].apply(normalize_status_value) - - # Priority score - df["Priority Score"] = df.apply(compute_priority_score, axis=1) - - # PubMed link - df["PubMed Link"] = df["PMID"].apply(make_pmid_link) - + df = df.assign(**{col: df[col].apply(_normalize_status)}) + df = df.assign( + **{"Priority Score": df.apply(_priority_score, axis=1)}, + **{"PubMed Link": df["PMID"].apply(_make_pmid_link)}, + ) return df -def _safe_field_name(col: str) -> str: - """Convert 'Host Species Status' -> 'Host_Species_Status'.""" - return col.replace(" ", "_") - - -def _default_feedback_columns() -> List[str]: - """Defines the full schema for feedback rows.""" - base = [ - "PMID", - "curator_id", - "overall_verdict", - "comment", - "timestamp", - "bioanalyzer_version", - ] - - # Predicted statuses - pred_cols = [] - for col in STATUS_COLUMNS: - safe = _safe_field_name(col) - pred_cols.append(f"pred__{safe}") - - # True labels (curator ground truth) - true_cols = [] - for col in STATUS_COLUMNS: - safe = _safe_field_name(col) - true_cols.append(f"true__{safe}") - - # Column-level feedback (Correct/Incorrect/etc.) - col_feedback_cols = [] - for col in STATUS_COLUMNS: - safe = _safe_field_name(col) - col_feedback_cols.append(f"col_feedback__{safe}") - - return base + pred_cols + true_cols + col_feedback_cols - - +# ----------------------------- +# Feedback persistence +# ----------------------------- def load_feedback() -> pd.DataFrame: - """Load feedback from parquet/csv or return empty with schema.""" - if FEEDBACK_PARQUET.exists(): - try: - df = pd.read_parquet(FEEDBACK_PARQUET) - return df - except Exception: - pass - - if FEEDBACK_CSV.exists(): - try: - df = pd.read_csv(FEEDBACK_CSV) - return df - except Exception: - pass - - return pd.DataFrame(columns=_default_feedback_columns()) + """Load feedback from parquet then csv; empty DataFrame with schema if missing.""" + for path, reader in [(FEEDBACK_PARQUET, pd.read_parquet), (FEEDBACK_CSV, pd.read_csv)]: + if path.exists(): + try: + return reader(path) + except Exception as e: + logger.warning("Failed to load %s: %s", path, e) + return pd.DataFrame(columns=_feedback_schema()) def save_feedback(df: pd.DataFrame) -> None: - """Save feedback in CSV and optionally parquet.""" - DEFAULT_FEEDBACK_DIR.mkdir(exist_ok=True) - - # Ensure schema - for col in _default_feedback_columns(): + """Persist feedback to CSV and Parquet; ensure schema.""" + CONFIG["feedback_dir"].mkdir(exist_ok=True) + for col in _feedback_schema(): if col not in df.columns: df[col] = "" - df.to_csv(FEEDBACK_CSV, index=False) - try: df.to_parquet(FEEDBACK_PARQUET, index=False) - except Exception: - # Parquet may fail if pyarrow isn't installed - pass + except Exception as e: + logger.warning("Parquet save skipped: %s", e) -def upsert_feedback(existing: pd.DataFrame, row: Dict) -> pd.DataFrame: +def upsert_feedback(existing: pd.DataFrame, row: dict) -> pd.DataFrame: """Upsert by PMID + curator_id.""" - if existing.empty: - return pd.DataFrame([row]) - - # Ensure schema - for col in _default_feedback_columns(): + for col in _feedback_schema(): if col not in existing.columns: existing[col] = "" - - mask = (existing["PMID"].astype(str) == str(row["PMID"])) & ( - existing["curator_id"].astype(str) == str(row["curator_id"]) - ) - - if mask.any(): - for k, v in row.items(): - existing.loc[mask, k] = v - return existing - + if not existing.empty: + mask = ( + (existing["PMID"].astype(str) == str(row["PMID"])) + & (existing["curator_id"].astype(str) == str(row["curator_id"])) + ) + if mask.any(): + for k, v in row.items(): + existing.loc[mask, k] = v + return existing return pd.concat([existing, pd.DataFrame([row])], ignore_index=True) # ----------------------------- -# UI rendering +# UI # ----------------------------- def render_filters(df: pd.DataFrame) -> pd.DataFrame: st.sidebar.header("Filters") - search = st.sidebar.text_input( "Search (PMID, title, journal, summary)", placeholder="e.g. obesity, 2019, Lactobacillus", ).strip().lower() - - status_filters = {} - for col in STATUS_COLUMNS: - if col in df.columns: - status_filters[col] = st.sidebar.multiselect( - col, - options=VALID_STATES, - default=[], - ) - + status_filters = { + col: st.sidebar.multiselect(col, options=OPTIONS["valid_states"], default=[]) + for col in STATUS_COLUMNS + if col in df.columns + } year_range = None if "Year" in df.columns: years = df["Year"].dropna() if not years.empty: - years = years.astype(int) min_y, max_y = int(years.min()), int(years.max()) - year_range = st.sidebar.slider( - "Year range", - min_value=min_y, - max_value=max_y, - value=(min_y, max_y), - ) - + year_range = st.sidebar.slider("Year range", min_y, max_y, (min_y, max_y)) out = df.copy() - - # Search if search: - mask = pd.Series(False, index=out.index) - mask |= out["PMID"].astype(str).str.contains(search, na=False) + mask = out["PMID"].astype(str).str.contains(search, na=False) for col in ["Title", "Journal", "Summary"]: if col in out.columns: - mask |= out[col].astype(str).str.lower().str.contains(search, na=False) + mask = mask | out[col].astype(str).str.lower().str.contains(search, na=False) out = out.loc[mask] - - # Status filters for col, allowed in status_filters.items(): if allowed: out = out[out[col].isin(allowed)] - - # Year filter if year_range and "Year" in out.columns: out = out[(out["Year"] >= year_range[0]) & (out["Year"] <= year_range[1])] - return out @@ -382,56 +253,30 @@ def render_table(df: pd.DataFrame) -> Optional[int]: if df.empty: st.warning("No rows match your filters.") return None - st.subheader("Candidate curatable articles") st.caption("Tip: Sort by Priority Score to review the most promising candidates first.") - sort_options = ["Priority Score", "PMID"] if "Title" in df.columns: sort_options.append("Title") - for c in STATUS_COLUMNS: - if c in df.columns: - sort_options.append(c) - + sort_options.extend(c for c in STATUS_COLUMNS if c in df.columns) sort_col = st.selectbox("Sort by", options=sort_options, index=0) ascending = st.checkbox("Ascending", value=False) - if sort_col in df.columns: df = df.sort_values(by=sort_col, ascending=ascending, na_position="last") - st.divider() max_rows = st.slider("Rows to display", 50, 2000, 300, 50) df_show = df.head(max_rows).copy() - - display_cols: List[str] = ["PMID"] - if "PubMed Link" in df_show.columns: - display_cols.append("PubMed Link") - - for c in ["Priority Score", "Title", "Journal", "Year"]: - if c in df_show.columns: - display_cols.append(c) - - for c in STATUS_COLUMNS: - if c in df_show.columns: - display_cols.append(c) - - if "Summary" in df_show.columns: - display_cols.append("Summary") - + want = ["PMID", "PubMed Link", "Priority Score", "Title", "Journal", "Year"] + list(STATUS_COLUMNS) + ["Summary"] + display_cols = [c for c in want if c in df_show.columns] df_show = df_show[display_cols] - st.dataframe( df_show, use_container_width=True, height=650, - column_config={ - "PubMed Link": st.column_config.LinkColumn("PubMed", display_text="Open"), - }, + column_config={"PubMed Link": st.column_config.LinkColumn("PubMed", display_text="Open")}, ) - st.metric("Rows after filtering", len(df)) st.metric("Rows displayed", len(df_show)) - st.divider() st.subheader("Quick select for feedback") selected = st.selectbox( @@ -439,127 +284,89 @@ def render_table(df: pd.DataFrame) -> Optional[int]: options=[""] + df_show["PMID"].astype(str).tolist(), index=0, ) - if selected: - return int(selected) - return None - - -def render_column_level_validation(selected_row: pd.Series) -> Dict[str, str]: - """ - Render per-column correctness + curator true label UI. - Returns a dict with keys: - - col_feedback__X - - true__X - """ - st.markdown("### Field-by-field validation (ground truth)") + return int(selected) if selected else None + +def render_column_level_validation(selected_row: pd.Series) -> dict[str, str]: + """Per-column correctness + curator true label UI. Returns col_feedback__* and true__*.""" + st.markdown("### Field-by-field validation (ground truth)") st.caption( - "For each field, provide the curator TRUE label (ground truth). " - "Optionally also mark whether BioAnalyzer's predicted status was correct." + "For each field, provide the curator TRUE label. " + "Optionally mark whether BioAnalyzer's predicted status was correct." ) - out = {} - left, right = st.columns(2) - halves = [STATUS_COLUMNS[:3], STATUS_COLUMNS[3:]] - - for pane, cols in zip([left, right], halves): + for pane, cols in zip([left, right], [STATUS_COLUMNS[:3], STATUS_COLUMNS[3:]]): with pane: for col in cols: if col not in selected_row.index: continue - - safe = _safe_field_name(col) + safe = _safe(col) pred = str(selected_row.get(col, "")).strip() label = col.replace(" Status", "") - st.markdown(f"**{label}**") st.write(f"BioAnalyzer predicted: `{pred}`") - - # Curator true label - true_key = f"true__{safe}" - true_choice = st.selectbox( + true_key = f"{TRUE_PREFIX}{safe}" + out[true_key] = st.selectbox( f"Curator TRUE label for {label}", - options=TRUE_LABEL_OPTIONS, + options=OPTIONS["true_label"], index=0, key=f"ui__{true_key}", ) - out[true_key] = true_choice - - # Optional correctness judgement - fb_key = f"col_feedback__{safe}" - fb_choice = st.selectbox( + fb_key = f"{COL_FB_PREFIX}{safe}" + out[fb_key] = st.selectbox( f"Was BioAnalyzer correct for {label}?", - options=COL_FEEDBACK_OPTIONS, + options=OPTIONS["col_feedback"], index=0, key=f"ui__{fb_key}", ) - out[fb_key] = fb_choice - st.divider() - return out def render_feedback_section(selected_pmid: Optional[int], dataset_df: pd.DataFrame) -> None: st.subheader("Curator feedback") st.caption("Feedback is stored locally in results/. Entries are upserted by PMID + curator_id.") - feedback_df = load_feedback() - selected_row = None - title_prefill = "" - if selected_pmid is not None: try: selected_row = dataset_df.loc[dataset_df["PMID"] == selected_pmid].iloc[0] except Exception: selected_row = None - - if selected_row is not None and "Title" in selected_row.index: - title_prefill = str(selected_row.get("Title", "")) + title_prefill = str(selected_row.get("Title", "")) if selected_row is not None and "Title" in selected_row.index else "" with st.form("feedback_form", clear_on_submit=False): curator_id = st.text_input( "Curator ID / initials", - value=os.getenv("USER", ""), + value=CONFIG["curator_id_default"], placeholder="e.g. Ronald Ouma", ).strip() - fb_pmid = st.text_input( "PMID", value=str(selected_pmid) if selected_pmid else "", placeholder="e.g. 31215600", ).strip() - if title_prefill: st.write(f"**Title:** {title_prefill}") - overall_verdict = st.selectbox( "Overall paper verdict", options=["Curatable", "Not curatable", "Uncertain", "Not reviewed"], index=0, ) - comment = st.text_area( "Comment (optional)", placeholder="Evidence, edge case, missing field, false positive reason, etc.", height=90, ) - bioanalyzer_version = st.text_input( "BioAnalyzer version (recommended)", - value=os.getenv("BIOANALYZER_VERSION", ""), + value=CONFIG["bioanalyzer_version_default"], placeholder="e.g. 1.0.0, commit SHA, docker tag", ).strip() - - # Field-by-field validation - field_validation = {} - if selected_row is not None: - field_validation = render_column_level_validation(selected_row) - else: + field_validation = render_column_level_validation(selected_row) if selected_row is not None else {} + if selected_row is None: st.info("Select a PMID above to enable field-level validation.") - submitted = st.form_submit_button("Save feedback") if submitted: @@ -569,12 +376,12 @@ def render_feedback_section(selected_pmid: Optional[int], dataset_df: pd.DataFra if not fb_pmid: st.error("Please provide a PMID.") return - - pid = safe_int(fb_pmid) + pid = _safe_int(fb_pmid) if pid is None: st.error("PMID must be numeric.") return - + if selected_pmid is not None and pid != selected_pmid and pid not in dataset_df["PMID"].values: + st.warning("PMID not in current dataset; feedback will still be saved.") row = { "PMID": int(pid), "curator_id": curator_id, @@ -583,62 +390,33 @@ def render_feedback_section(selected_pmid: Optional[int], dataset_df: pd.DataFra "timestamp": pd.Timestamp.now(tz="UTC").isoformat(), "bioanalyzer_version": bioanalyzer_version, } - - # Always store predictions into feedback (reproducibility) - for col in STATUS_COLUMNS: - safe = _safe_field_name(col) - pred_key = f"pred__{safe}" - - if selected_row is not None and col in selected_row.index: - row[pred_key] = str(selected_row.get(col, "")).strip() - else: - row[pred_key] = "" - - # Store curator true labels + correctness for col in STATUS_COLUMNS: - safe = _safe_field_name(col) - - true_key = f"true__{safe}" - fb_key = f"col_feedback__{safe}" - - row[true_key] = field_validation.get(true_key, "Not reviewed") - row[fb_key] = field_validation.get(fb_key, "Not reviewed") - + s = _safe(col) + row[f"{PRED_PREFIX}{s}"] = ( + str(selected_row.get(col, "")).strip() + if selected_row is not None and col in selected_row.index else "" + ) + row[f"{TRUE_PREFIX}{s}"] = field_validation.get(f"{TRUE_PREFIX}{s}", "Not reviewed") + row[f"{COL_FB_PREFIX}{s}"] = field_validation.get(f"{COL_FB_PREFIX}{s}", "Not reviewed") feedback_df = upsert_feedback(feedback_df, row) save_feedback(feedback_df) - + logger.info("Saved feedback for PMID %s (curator=%s)", pid, curator_id) st.success(f"Saved feedback for PMID {pid} (curator={curator_id}).") st.divider() st.subheader("Existing feedback") - if feedback_df.empty: st.info("No feedback recorded yet.") return - - compact_cols = [ - "PMID", - "curator_id", - "overall_verdict", - "timestamp", - "bioanalyzer_version", + compact_cols = [c for c in FEEDBACK_BASE_COLS if c in feedback_df.columns] + [ + k for c in STATUS_COLUMNS for k in (f"{PRED_PREFIX}{_safe(c)}", f"{TRUE_PREFIX}{_safe(c)}", f"{COL_FB_PREFIX}{_safe(c)}") + if k in feedback_df.columns ] - - # Add prediction + true + feedback columns in a readable order - for col in STATUS_COLUMNS: - safe = _safe_field_name(col) - for k in [f"pred__{safe}", f"true__{safe}", f"col_feedback__{safe}"]: - if k in feedback_df.columns: - compact_cols.append(k) - - compact_cols = [c for c in compact_cols if c in feedback_df.columns] - st.dataframe( feedback_df.sort_values("timestamp", ascending=False)[compact_cols], use_container_width=True, height=380, ) - st.download_button( "Download feedback CSV", data=feedback_df.to_csv(index=False), @@ -647,103 +425,64 @@ def render_feedback_section(selected_pmid: Optional[int], dataset_df: pd.DataFra ) -# ----------------------------- -# Main -# ----------------------------- -def main(): +def main() -> None: st.set_page_config(page_title="BioAnalyzer Curator Table", layout="wide") st.title("BioAnalyzer Curator Table") - st.markdown( """ -This dashboard provides a **sortable, searchable, filterable** table of BioAnalyzer predictions for -candidate curatable PubMed articles. - -### Why this is useful -- Lets curators review predictions in a real-world workflow -- Captures feedback aligned by PMID -- Stores: - - BioAnalyzer predicted statuses - - curator ground truth statuses - - curator correctness flags - -This makes the exported feedback suitable for: -- confusion matrices to evaluate accuracy -- per-field error profiling -- MCC decisions for PARTIALLY_PRESENT +This dashboard provides a **sortable, searchable, filterable** table of BioAnalyzer predictions. + +- Curators review predictions and capture feedback by PMID. +- Stored: predicted statuses, curator ground truth, correctness flags. +- Export suitable for confusion matrices, per-field error profiling, MCC for PARTIALLY_PRESENT. """ ) - st.sidebar.header("Data source") - data_source = st.sidebar.radio( "Choose input mode", options=["Upload CSV/Parquet", "Use file path"], index=0, ) - raw_df = pd.DataFrame() - if data_source == "Upload CSV/Parquet": - uploaded = st.sidebar.file_uploader( - "Upload dataset", - type=["csv", "parquet", "pq"], - ) + uploaded = st.sidebar.file_uploader("Upload dataset", type=["csv", "parquet", "pq"]) if uploaded: try: - raw_df = load_data_from_upload(uploaded) + raw_df = _load_data(uploaded, is_path=False) except Exception as e: st.error(f"Could not load file: {e}") + logger.exception("Upload load failed") return else: - path = st.sidebar.text_input( - "Path to CSV/Parquet", - placeholder="e.g. analysis_results.csv", - ).strip() + path = st.sidebar.text_input("Path to CSV/Parquet", placeholder="e.g. analysis_results.csv").strip() if path: try: - raw_df = load_data_from_path(path) + raw_df = _load_data(path, is_path=True) except Exception as e: st.error(str(e)) return - if raw_df.empty: st.info("Upload a dataset or provide a file path to begin.") st.stop() - df = normalize_dataset(raw_df) - if df.empty: st.error("Dataset loaded, but no valid rows found after normalization.") st.stop() - missing = [c for c in STATUS_COLUMNS if c not in df.columns] if missing: st.warning( - "Some expected status columns are missing. " - "Priority scoring and filtering will be partial.\n\n" - f"Missing columns: {missing}" + "Some expected status columns are missing. Priority and filtering will be partial.\n\n" + f"Missing: {missing}" ) - filtered_df = render_filters(df) selected_pmid = render_table(filtered_df) - st.divider() render_feedback_section(selected_pmid, filtered_df) - st.sidebar.divider() st.sidebar.header("Notes") st.sidebar.markdown( - f""" -Feedback files are saved to: - -- `{FEEDBACK_CSV}` -- `{FEEDBACK_PARQUET}` (if parquet supported) - -Tip: set an environment variable to track versions: - -- `BIOANALYZER_VERSION=commit_sha` - """ + f"Feedback: `{FEEDBACK_CSV}` and `{FEEDBACK_PARQUET}`. " + "Tip: set `BIOANALYZER_VERSION` or `FEEDBACK_DIR` in environment." )