diff --git a/README.md b/README.md index 37e6b66..90f6868 100644 --- a/README.md +++ b/README.md @@ -197,7 +197,7 @@ BioAnalyzer includes a formal validation workflow to compare automated predictio - **Ground truth**: Expert annotations in `feedback.csv` for the six BugSigDB fields - **Predictions**: BioAnalyzer outputs in a predictions CSV (e.g. `analysis_results.csv` or `new.csv`) - **Alignment**: PMIDs are aligned with `align_pmids.py` -- **Evaluation**: `confusion_matrix_analysis.py` computes 3-class confusion matrices (`ABSENT`, `PARTIALLY_PRESENT`, `PRESENT`) and per-field accuracy +- **Evaluation**: `scripts/eval/confusion_matrix_analysis.py` computes 3-class confusion matrices (`ABSENT`, `PARTIALLY_PRESENT`, `PRESENT`) and per-field accuracy - **Outputs**: Metrics and PNG confusion matrices are written to `confusion_matrix_results/` For sharing/inspection, `create_validation_dataset.py` can generate a flat CSV: diff --git a/app/utils/config.py b/app/utils/config.py index 706f1aa..3971531 100644 --- a/app/utils/config.py +++ b/app/utils/config.py @@ -3,6 +3,7 @@ import logging from typing import List, Optional from .credential_masking import mask_exception_message, mask_string +from app.core.settings import get_settings try: from dotenv import load_dotenv # type: ignore @@ -326,5 +327,63 @@ def setup_logging() -> logging.Logger: return root_logger -# Initialize logging when module is imported logger = setup_logging() + +# --------------------------------------------------------------------------- +# Bridge to structured settings (app.core.settings) +# This keeps app.core.settings as the single source of truth while preserving +# existing config module attributes used across the codebase. 
+# --------------------------------------------------------------------------- +try: + _settings = get_settings() + + # API / timeout settings + API_TIMEOUT = _settings.api.timeout + ANALYSIS_TIMEOUT = _settings.api.analysis_timeout + GEMINI_TIMEOUT = _settings.api.gemini_timeout + FRONTEND_TIMEOUT = _settings.api.frontend_timeout + NCBI_RATE_LIMIT_DELAY = _settings.api.ncbi_rate_limit_delay + MAX_CONCURRENT_REQUESTS = _settings.api.max_concurrent_requests + + # LLM config + if _settings.llm.provider: + LLM_PROVIDER = _settings.llm.provider + if _settings.llm.model: + LLM_MODEL = _settings.llm.model + + # RAG settings + RAG_SUMMARY_PROVIDER = _settings.rag.summary_provider or RAG_SUMMARY_PROVIDER + RAG_SUMMARY_MODEL = _settings.rag.summary_model or RAG_SUMMARY_MODEL + RAG_SUMMARY_LENGTH = _settings.rag.summary_length.value + RAG_SUMMARY_QUALITY = _settings.rag.summary_quality.value + RAG_RERANK_METHOD = _settings.rag.rerank_method.value + RAG_USE_SUMMARY_CACHE = _settings.rag.use_cache + RAG_MAX_SUMMARY_KEY_POINTS = _settings.rag.max_summary_key_points + RAG_TOP_K_CHUNKS = _settings.rag.top_k_chunks + + # Cache settings + CACHE_VALIDITY_HOURS = _settings.cache.validity_hours + MAX_CACHE_SIZE = _settings.cache.max_size + + # Retrieval settings + USE_FULLTEXT = _settings.retrieval.use_fulltext + if _settings.retrieval.ncbi_api_key: + NCBI_API_KEY = _settings.retrieval.ncbi_api_key + if _settings.retrieval.email: + EMAIL = _settings.retrieval.email + + # Security / environment + ENVIRONMENT = _settings.environment.value + CORS_ORIGINS = _settings.security.cors_origins + ENABLE_REQUEST_ID = _settings.security.enable_request_id + + # Rate limiting + ENABLE_RATE_LIMITING = _settings.rate_limit.enabled + RATE_LIMIT_PER_MINUTE = _settings.rate_limit.requests_per_minute + + # Logging + LOG_LEVEL = _settings.logging.level.value +except Exception: + # Fail gracefully; fall back to env-based values defined above + pass + diff --git a/curator_table/app.py b/curator_table/app.py 
index 6c63c45..99346ca 100644 --- a/curator_table/app.py +++ b/curator_table/app.py @@ -3,63 +3,44 @@ BioAnalyzer Curator Table (with column-level validation + curator ground truth) ============================================================================= -A Streamlit dashboard for real-world validation of BioAnalyzer predictions. - -This app supports: -1) A sortable, searchable, filterable table of candidate curatable PubMed articles. -2) A curator feedback workflow aligned by PMID. -3) Column-level correctness checks for each BioAnalyzer field. -4) Curator-provided TRUE labels (ground truth) for each field. -5) Exportable feedback suitable for: - - confusion matrices - - per-field error analysis - - binary + multiclass evaluation - - MCC decisions on how to treat PARTIALLY_PRESENT - -Run: - streamlit run curator_table/app.py - -Input data: - CSV or Parquet with at minimum: - - PMID - Recommended: - - Title, Journal, Year, Summary - Expected BioAnalyzer outputs: - - Host Species Status - - Body Site Status - - Condition Status - - Sequencing Type Status - - Taxa Level Status - - Sample Size Status - -Status values expected: - ABSENT | PARTIALLY_PRESENT | PRESENT - -Feedback storage: - results/curator_feedback.csv - results/curator_feedback.parquet - -Design notes ------------- -- Feedback is upserted by (PMID, curator_id) to prevent duplicates. -- Feedback rows store BOTH: - (a) the BioAnalyzer predictions (pred__*) - (b) the curator's evaluation (col_feedback__*) - (c) the curator's ground truth labels (true__*) - This makes benchmarking reproducible even if the input dataset changes. +Streamlit dashboard for real-world validation of BioAnalyzer predictions: +sortable/searchable/filterable table, curator feedback by PMID, column-level +correctness + ground truth, exportable feedback for confusion matrices and MCC. + +Run: streamlit run curator_table/app.py +Input: CSV or Parquet with PMID (recommended: Title, Journal, Year, Summary). 
+Status values: ABSENT | PARTIALLY_PRESENT | PRESENT. +Feedback: results/curator_feedback.csv and .parquet (upserted by PMID + curator_id). """ from __future__ import annotations +import io +import logging import os from pathlib import Path -from typing import Optional, List, Dict +from typing import Optional import pandas as pd import streamlit as st # ----------------------------- -# Expected prediction columns +# Config (env overrides) +# ----------------------------- +CONFIG = { + "feedback_dir": Path(os.getenv("FEEDBACK_DIR", "results")), + "curator_id_default": os.getenv("USER", ""), + "bioanalyzer_version_default": os.getenv("BIOANALYZER_VERSION", ""), +} + +CONFIG["feedback_dir"].mkdir(exist_ok=True) +FEEDBACK_CSV = CONFIG["feedback_dir"] / "curator_feedback.csv" +FEEDBACK_PARQUET = CONFIG["feedback_dir"] / "curator_feedback.parquet" + +logger = logging.getLogger(__name__) + +# ----------------------------- +# Schema (single source of truth) # ----------------------------- STATUS_COLUMNS = [ "Host Species Status", @@ -69,312 +50,202 @@ "Taxa Level Status", "Sample Size Status", ] - -OPTIONAL_COLUMNS = [ - "Title", - "Journal", - "Summary", - "Processing Time", - "Year", - "Publication Date", +OPTIONS = { + "valid_states": ["ABSENT", "PARTIALLY_PRESENT", "PRESENT"], + "col_feedback": ["Not reviewed", "Correct", "Incorrect", "Unclear"], + "true_label": ["Not reviewed", "ABSENT", "PARTIALLY_PRESENT", "PRESENT"], +} + +_safe = lambda col: col.replace(" ", "_") +FEEDBACK_BASE_COLS = [ + "PMID", "curator_id", "overall_verdict", "comment", "timestamp", "bioanalyzer_version", ] +PRED_PREFIX, TRUE_PREFIX, COL_FB_PREFIX = "pred__", "true__", "col_feedback__" -VALID_STATES = ["ABSENT", "PARTIALLY_PRESENT", "PRESENT"] -# Column-level feedback options (curator judgement of correctness) -COL_FEEDBACK_OPTIONS = [ - "Not reviewed", - "Correct", - "Incorrect", - "Unclear", -] - -# True label options (ground truth) -TRUE_LABEL_OPTIONS = [ - "Not reviewed", - "ABSENT", - 
"PARTIALLY_PRESENT", - "PRESENT", -] - -# ----------------------------- -# Feedback persistence -# ----------------------------- -DEFAULT_FEEDBACK_DIR = Path("results") -DEFAULT_FEEDBACK_DIR.mkdir(exist_ok=True) - -FEEDBACK_CSV = DEFAULT_FEEDBACK_DIR / "curator_feedback.csv" -FEEDBACK_PARQUET = DEFAULT_FEEDBACK_DIR / "curator_feedback.parquet" +def _feedback_schema() -> list[str]: + """Full feedback column schema (dynamic from STATUS_COLUMNS).""" + derived = [ + f"{PRED_PREFIX}{_safe(c)}" for c in STATUS_COLUMNS + ] + [ + f"{TRUE_PREFIX}{_safe(c)}" for c in STATUS_COLUMNS + ] + [ + f"{COL_FB_PREFIX}{_safe(c)}" for c in STATUS_COLUMNS + ] + return FEEDBACK_BASE_COLS + derived # ----------------------------- # Helpers # ----------------------------- -def make_pmid_link(pmid) -> str: - """Return PubMed URL for a PMID.""" +def _make_pmid_link(pmid) -> str: try: - pid = str(int(float(pmid))) - return f"https://pubmed.ncbi.nlm.nih.gov/{pid}/" - except Exception: + return f"https://pubmed.ncbi.nlm.nih.gov/{int(float(pmid))}/" + except (TypeError, ValueError): return "" -def safe_int(x) -> Optional[int]: +def _safe_int(x) -> Optional[int]: try: return int(float(x)) - except Exception: + except (TypeError, ValueError): return None -def normalize_status_value(x: str) -> str: - """Normalize status values to one of ABSENT/PARTIALLY_PRESENT/PRESENT.""" +def _normalize_status(x: str) -> str: if pd.isna(x): return "" x = str(x).strip().upper() - if x in VALID_STATES: + if x in OPTIONS["valid_states"]: return x - - # Common variants if x in {"PARTIAL", "PARTIALLY", "PARTLY"}: return "PARTIALLY_PRESENT" if x in {"YES", "TRUE"}: return "PRESENT" if x in {"NO", "FALSE"}: return "ABSENT" - return x -def compute_priority_score(row: pd.Series) -> float: - """ - Rank candidates by how many fields are predicted present. 
- - - PRESENT contributes 1.0 - - PARTIALLY_PRESENT contributes 0.5 - """ - score = 0.0 - for col in STATUS_COLUMNS: - val = str(row.get(col, "")).strip().upper() - if val == "PRESENT": - score += 1.0 - elif val == "PARTIALLY_PRESENT": - score += 0.5 - return score - - -@st.cache_data(show_spinner=False) -def load_data_from_path(path: str) -> pd.DataFrame: - path = Path(path) - if not path.exists(): - return pd.DataFrame() - - suf = path.suffix.lower() - if suf == ".csv": - return pd.read_csv(path) - if suf in (".parquet", ".pq"): - return pd.read_parquet(path) - - raise ValueError(f"Unsupported format: {suf}. Use .csv or .parquet.") +def _priority_score(row: pd.Series) -> float: + weights = {"PRESENT": 1.0, "PARTIALLY_PRESENT": 0.5} + return sum( + weights.get(str(row.get(col, "")).strip().upper(), 0.0) + for col in STATUS_COLUMNS + ) +# ----------------------------- +# Data loading (unified) +# ----------------------------- @st.cache_data(show_spinner=False) -def load_data_from_upload(uploaded) -> pd.DataFrame: - if uploaded is None: +def _load_data(source, is_path: bool) -> pd.DataFrame: + """Load from file path (str/Path) or uploaded file; returns empty DataFrame on failure.""" + if source is None or (is_path and not source): return pd.DataFrame() - - name = uploaded.name.lower() - if name.endswith(".csv"): - return pd.read_csv(uploaded) - if name.endswith(".parquet") or name.endswith(".pq"): - return pd.read_parquet(uploaded) - - raise ValueError("Unsupported upload. 
Use .csv or .parquet.") + if is_path: + path = Path(source) + if not path.exists(): + logger.warning("Path does not exist: %s", path) + return pd.DataFrame() + buf, ext = path, path.suffix.lower() + else: + name = source.name.lower() + buf = io.BytesIO(source.getvalue()) + ext = ".csv" if name.endswith(".csv") else (".parquet" if name.endswith((".parquet", ".pq")) else "") + if ext == ".csv": + return pd.read_csv(buf) + if ext in (".parquet", ".pq"): + return pd.read_parquet(buf) + raise ValueError(f"Unsupported format. Use .csv or .parquet.") def normalize_dataset(df: pd.DataFrame) -> pd.DataFrame: - """Normalize dataset and derive helper columns.""" - if df.empty: - return df - - if "PMID" not in df.columns: - st.error("Data must contain a 'PMID' column.") + """Normalize PMID, statuses, derive year/link/priority. Returns empty DataFrame if invalid.""" + if df.empty or "PMID" not in df.columns: + if not df.empty: + st.error("Data must contain a 'PMID' column.") + return pd.DataFrame() + try: + df = ( + df.assign(PMID=df["PMID"].apply(_safe_int)) + .dropna(subset=["PMID"]) + .astype({"PMID": int}) + ) + except Exception as e: + logger.exception("PMID normalization failed: %s", e) return pd.DataFrame() - - # Normalize PMID - df["PMID"] = df["PMID"].apply(safe_int) - df = df.dropna(subset=["PMID"]).copy() - df["PMID"] = df["PMID"].astype(int) - - # Derive year if "Year" not in df.columns and "Publication Date" in df.columns: try: - df["Year"] = pd.to_datetime(df["Publication Date"], errors="coerce").dt.year + df = df.assign(Year=pd.to_datetime(df["Publication Date"], errors="coerce").dt.year) except Exception: pass - - # Normalize statuses for col in STATUS_COLUMNS: if col in df.columns: - df[col] = df[col].apply(normalize_status_value) - - # Priority score - df["Priority Score"] = df.apply(compute_priority_score, axis=1) - - # PubMed link - df["PubMed Link"] = df["PMID"].apply(make_pmid_link) - + df = df.assign(**{col: df[col].apply(_normalize_status)}) + df = 
df.assign( + **{"Priority Score": df.apply(_priority_score, axis=1)}, + **{"PubMed Link": df["PMID"].apply(_make_pmid_link)}, + ) return df -def _safe_field_name(col: str) -> str: - """Convert 'Host Species Status' -> 'Host_Species_Status'.""" - return col.replace(" ", "_") - - -def _default_feedback_columns() -> List[str]: - """Defines the full schema for feedback rows.""" - base = [ - "PMID", - "curator_id", - "overall_verdict", - "comment", - "timestamp", - "bioanalyzer_version", - ] - - # Predicted statuses - pred_cols = [] - for col in STATUS_COLUMNS: - safe = _safe_field_name(col) - pred_cols.append(f"pred__{safe}") - - # True labels (curator ground truth) - true_cols = [] - for col in STATUS_COLUMNS: - safe = _safe_field_name(col) - true_cols.append(f"true__{safe}") - - # Column-level feedback (Correct/Incorrect/etc.) - col_feedback_cols = [] - for col in STATUS_COLUMNS: - safe = _safe_field_name(col) - col_feedback_cols.append(f"col_feedback__{safe}") - - return base + pred_cols + true_cols + col_feedback_cols - - +# ----------------------------- +# Feedback persistence +# ----------------------------- def load_feedback() -> pd.DataFrame: - """Load feedback from parquet/csv or return empty with schema.""" - if FEEDBACK_PARQUET.exists(): - try: - df = pd.read_parquet(FEEDBACK_PARQUET) - return df - except Exception: - pass - - if FEEDBACK_CSV.exists(): - try: - df = pd.read_csv(FEEDBACK_CSV) - return df - except Exception: - pass - - return pd.DataFrame(columns=_default_feedback_columns()) + """Load feedback from parquet then csv; empty DataFrame with schema if missing.""" + for path, reader in [(FEEDBACK_PARQUET, pd.read_parquet), (FEEDBACK_CSV, pd.read_csv)]: + if path.exists(): + try: + return reader(path) + except Exception as e: + logger.warning("Failed to load %s: %s", path, e) + return pd.DataFrame(columns=_feedback_schema()) def save_feedback(df: pd.DataFrame) -> None: - """Save feedback in CSV and optionally parquet.""" - 
DEFAULT_FEEDBACK_DIR.mkdir(exist_ok=True) - - # Ensure schema - for col in _default_feedback_columns(): + """Persist feedback to CSV and Parquet; ensure schema.""" + CONFIG["feedback_dir"].mkdir(exist_ok=True) + for col in _feedback_schema(): if col not in df.columns: df[col] = "" - df.to_csv(FEEDBACK_CSV, index=False) - try: df.to_parquet(FEEDBACK_PARQUET, index=False) - except Exception: - # Parquet may fail if pyarrow isn't installed - pass + except Exception as e: + logger.warning("Parquet save skipped: %s", e) -def upsert_feedback(existing: pd.DataFrame, row: Dict) -> pd.DataFrame: +def upsert_feedback(existing: pd.DataFrame, row: dict) -> pd.DataFrame: """Upsert by PMID + curator_id.""" - if existing.empty: - return pd.DataFrame([row]) - - # Ensure schema - for col in _default_feedback_columns(): + for col in _feedback_schema(): if col not in existing.columns: existing[col] = "" - - mask = (existing["PMID"].astype(str) == str(row["PMID"])) & ( - existing["curator_id"].astype(str) == str(row["curator_id"]) - ) - - if mask.any(): - for k, v in row.items(): - existing.loc[mask, k] = v - return existing - + if not existing.empty: + mask = ( + (existing["PMID"].astype(str) == str(row["PMID"])) + & (existing["curator_id"].astype(str) == str(row["curator_id"])) + ) + if mask.any(): + for k, v in row.items(): + existing.loc[mask, k] = v + return existing return pd.concat([existing, pd.DataFrame([row])], ignore_index=True) # ----------------------------- -# UI rendering +# UI # ----------------------------- def render_filters(df: pd.DataFrame) -> pd.DataFrame: st.sidebar.header("Filters") - search = st.sidebar.text_input( "Search (PMID, title, journal, summary)", placeholder="e.g. 
obesity, 2019, Lactobacillus", ).strip().lower() - - status_filters = {} - for col in STATUS_COLUMNS: - if col in df.columns: - status_filters[col] = st.sidebar.multiselect( - col, - options=VALID_STATES, - default=[], - ) - + status_filters = { + col: st.sidebar.multiselect(col, options=OPTIONS["valid_states"], default=[]) + for col in STATUS_COLUMNS + if col in df.columns + } year_range = None if "Year" in df.columns: years = df["Year"].dropna() if not years.empty: - years = years.astype(int) min_y, max_y = int(years.min()), int(years.max()) - year_range = st.sidebar.slider( - "Year range", - min_value=min_y, - max_value=max_y, - value=(min_y, max_y), - ) - + year_range = st.sidebar.slider("Year range", min_y, max_y, (min_y, max_y)) out = df.copy() - - # Search if search: - mask = pd.Series(False, index=out.index) - mask |= out["PMID"].astype(str).str.contains(search, na=False) + mask = out["PMID"].astype(str).str.contains(search, na=False) for col in ["Title", "Journal", "Summary"]: if col in out.columns: - mask |= out[col].astype(str).str.lower().str.contains(search, na=False) + mask = mask | out[col].astype(str).str.lower().str.contains(search, na=False) out = out.loc[mask] - - # Status filters for col, allowed in status_filters.items(): if allowed: out = out[out[col].isin(allowed)] - - # Year filter if year_range and "Year" in out.columns: out = out[(out["Year"] >= year_range[0]) & (out["Year"] <= year_range[1])] - return out @@ -382,56 +253,30 @@ def render_table(df: pd.DataFrame) -> Optional[int]: if df.empty: st.warning("No rows match your filters.") return None - st.subheader("Candidate curatable articles") st.caption("Tip: Sort by Priority Score to review the most promising candidates first.") - sort_options = ["Priority Score", "PMID"] if "Title" in df.columns: sort_options.append("Title") - for c in STATUS_COLUMNS: - if c in df.columns: - sort_options.append(c) - + sort_options.extend(c for c in STATUS_COLUMNS if c in df.columns) sort_col = 
st.selectbox("Sort by", options=sort_options, index=0) ascending = st.checkbox("Ascending", value=False) - if sort_col in df.columns: df = df.sort_values(by=sort_col, ascending=ascending, na_position="last") - st.divider() max_rows = st.slider("Rows to display", 50, 2000, 300, 50) df_show = df.head(max_rows).copy() - - display_cols: List[str] = ["PMID"] - if "PubMed Link" in df_show.columns: - display_cols.append("PubMed Link") - - for c in ["Priority Score", "Title", "Journal", "Year"]: - if c in df_show.columns: - display_cols.append(c) - - for c in STATUS_COLUMNS: - if c in df_show.columns: - display_cols.append(c) - - if "Summary" in df_show.columns: - display_cols.append("Summary") - + want = ["PMID", "PubMed Link", "Priority Score", "Title", "Journal", "Year"] + list(STATUS_COLUMNS) + ["Summary"] + display_cols = [c for c in want if c in df_show.columns] df_show = df_show[display_cols] - st.dataframe( df_show, use_container_width=True, height=650, - column_config={ - "PubMed Link": st.column_config.LinkColumn("PubMed", display_text="Open"), - }, + column_config={"PubMed Link": st.column_config.LinkColumn("PubMed", display_text="Open")}, ) - st.metric("Rows after filtering", len(df)) st.metric("Rows displayed", len(df_show)) - st.divider() st.subheader("Quick select for feedback") selected = st.selectbox( @@ -439,127 +284,89 @@ def render_table(df: pd.DataFrame) -> Optional[int]: options=[""] + df_show["PMID"].astype(str).tolist(), index=0, ) - if selected: - return int(selected) - return None - - -def render_column_level_validation(selected_row: pd.Series) -> Dict[str, str]: - """ - Render per-column correctness + curator true label UI. - Returns a dict with keys: - - col_feedback__X - - true__X - """ - st.markdown("### Field-by-field validation (ground truth)") + return int(selected) if selected else None + +def render_column_level_validation(selected_row: pd.Series) -> dict[str, str]: + """Per-column correctness + curator true label UI. 
Returns col_feedback__* and true__*.""" + st.markdown("### Field-by-field validation (ground truth)") st.caption( - "For each field, provide the curator TRUE label (ground truth). " - "Optionally also mark whether BioAnalyzer's predicted status was correct." + "For each field, provide the curator TRUE label. " + "Optionally mark whether BioAnalyzer's predicted status was correct." ) - out = {} - left, right = st.columns(2) - halves = [STATUS_COLUMNS[:3], STATUS_COLUMNS[3:]] - - for pane, cols in zip([left, right], halves): + for pane, cols in zip([left, right], [STATUS_COLUMNS[:3], STATUS_COLUMNS[3:]]): with pane: for col in cols: if col not in selected_row.index: continue - - safe = _safe_field_name(col) + safe = _safe(col) pred = str(selected_row.get(col, "")).strip() label = col.replace(" Status", "") - st.markdown(f"**{label}**") st.write(f"BioAnalyzer predicted: `{pred}`") - - # Curator true label - true_key = f"true__{safe}" - true_choice = st.selectbox( + true_key = f"{TRUE_PREFIX}{safe}" + out[true_key] = st.selectbox( f"Curator TRUE label for {label}", - options=TRUE_LABEL_OPTIONS, + options=OPTIONS["true_label"], index=0, key=f"ui__{true_key}", ) - out[true_key] = true_choice - - # Optional correctness judgement - fb_key = f"col_feedback__{safe}" - fb_choice = st.selectbox( + fb_key = f"{COL_FB_PREFIX}{safe}" + out[fb_key] = st.selectbox( f"Was BioAnalyzer correct for {label}?", - options=COL_FEEDBACK_OPTIONS, + options=OPTIONS["col_feedback"], index=0, key=f"ui__{fb_key}", ) - out[fb_key] = fb_choice - st.divider() - return out def render_feedback_section(selected_pmid: Optional[int], dataset_df: pd.DataFrame) -> None: st.subheader("Curator feedback") st.caption("Feedback is stored locally in results/. 
Entries are upserted by PMID + curator_id.") - feedback_df = load_feedback() - selected_row = None - title_prefill = "" - if selected_pmid is not None: try: selected_row = dataset_df.loc[dataset_df["PMID"] == selected_pmid].iloc[0] except Exception: selected_row = None - - if selected_row is not None and "Title" in selected_row.index: - title_prefill = str(selected_row.get("Title", "")) + title_prefill = str(selected_row.get("Title", "")) if selected_row is not None and "Title" in selected_row.index else "" with st.form("feedback_form", clear_on_submit=False): curator_id = st.text_input( "Curator ID / initials", - value=os.getenv("USER", ""), + value=CONFIG["curator_id_default"], placeholder="e.g. Ronald Ouma", ).strip() - fb_pmid = st.text_input( "PMID", value=str(selected_pmid) if selected_pmid else "", placeholder="e.g. 31215600", ).strip() - if title_prefill: st.write(f"**Title:** {title_prefill}") - overall_verdict = st.selectbox( "Overall paper verdict", options=["Curatable", "Not curatable", "Uncertain", "Not reviewed"], index=0, ) - comment = st.text_area( "Comment (optional)", placeholder="Evidence, edge case, missing field, false positive reason, etc.", height=90, ) - bioanalyzer_version = st.text_input( "BioAnalyzer version (recommended)", - value=os.getenv("BIOANALYZER_VERSION", ""), + value=CONFIG["bioanalyzer_version_default"], placeholder="e.g. 
1.0.0, commit SHA, docker tag", ).strip() - - # Field-by-field validation - field_validation = {} - if selected_row is not None: - field_validation = render_column_level_validation(selected_row) - else: + field_validation = render_column_level_validation(selected_row) if selected_row is not None else {} + if selected_row is None: st.info("Select a PMID above to enable field-level validation.") - submitted = st.form_submit_button("Save feedback") if submitted: @@ -569,12 +376,12 @@ def render_feedback_section(selected_pmid: Optional[int], dataset_df: pd.DataFra if not fb_pmid: st.error("Please provide a PMID.") return - - pid = safe_int(fb_pmid) + pid = _safe_int(fb_pmid) if pid is None: st.error("PMID must be numeric.") return - + if selected_pmid is not None and pid != selected_pmid and pid not in dataset_df["PMID"].values: + st.warning("PMID not in current dataset; feedback will still be saved.") row = { "PMID": int(pid), "curator_id": curator_id, @@ -583,62 +390,33 @@ def render_feedback_section(selected_pmid: Optional[int], dataset_df: pd.DataFra "timestamp": pd.Timestamp.now(tz="UTC").isoformat(), "bioanalyzer_version": bioanalyzer_version, } - - # Always store predictions into feedback (reproducibility) - for col in STATUS_COLUMNS: - safe = _safe_field_name(col) - pred_key = f"pred__{safe}" - - if selected_row is not None and col in selected_row.index: - row[pred_key] = str(selected_row.get(col, "")).strip() - else: - row[pred_key] = "" - - # Store curator true labels + correctness for col in STATUS_COLUMNS: - safe = _safe_field_name(col) - - true_key = f"true__{safe}" - fb_key = f"col_feedback__{safe}" - - row[true_key] = field_validation.get(true_key, "Not reviewed") - row[fb_key] = field_validation.get(fb_key, "Not reviewed") - + s = _safe(col) + row[f"{PRED_PREFIX}{s}"] = ( + str(selected_row.get(col, "")).strip() + if selected_row is not None and col in selected_row.index else "" + ) + row[f"{TRUE_PREFIX}{s}"] = field_validation.get(f"{TRUE_PREFIX}{s}", 
"Not reviewed") + row[f"{COL_FB_PREFIX}{s}"] = field_validation.get(f"{COL_FB_PREFIX}{s}", "Not reviewed") feedback_df = upsert_feedback(feedback_df, row) save_feedback(feedback_df) - + logger.info("Saved feedback for PMID %s (curator=%s)", pid, curator_id) st.success(f"Saved feedback for PMID {pid} (curator={curator_id}).") st.divider() st.subheader("Existing feedback") - if feedback_df.empty: st.info("No feedback recorded yet.") return - - compact_cols = [ - "PMID", - "curator_id", - "overall_verdict", - "timestamp", - "bioanalyzer_version", + compact_cols = [c for c in FEEDBACK_BASE_COLS if c in feedback_df.columns] + [ + k for c in STATUS_COLUMNS for k in (f"{PRED_PREFIX}{_safe(c)}", f"{TRUE_PREFIX}{_safe(c)}", f"{COL_FB_PREFIX}{_safe(c)}") + if k in feedback_df.columns ] - - # Add prediction + true + feedback columns in a readable order - for col in STATUS_COLUMNS: - safe = _safe_field_name(col) - for k in [f"pred__{safe}", f"true__{safe}", f"col_feedback__{safe}"]: - if k in feedback_df.columns: - compact_cols.append(k) - - compact_cols = [c for c in compact_cols if c in feedback_df.columns] - st.dataframe( feedback_df.sort_values("timestamp", ascending=False)[compact_cols], use_container_width=True, height=380, ) - st.download_button( "Download feedback CSV", data=feedback_df.to_csv(index=False), @@ -647,103 +425,64 @@ def render_feedback_section(selected_pmid: Optional[int], dataset_df: pd.DataFra ) -# ----------------------------- -# Main -# ----------------------------- -def main(): +def main() -> None: st.set_page_config(page_title="BioAnalyzer Curator Table", layout="wide") st.title("BioAnalyzer Curator Table") - st.markdown( """ -This dashboard provides a **sortable, searchable, filterable** table of BioAnalyzer predictions for -candidate curatable PubMed articles. 
- -### Why this is useful -- Lets curators review predictions in a real-world workflow -- Captures feedback aligned by PMID -- Stores: - - BioAnalyzer predicted statuses - - curator ground truth statuses - - curator correctness flags - -This makes the exported feedback suitable for: -- confusion matrices to evaluate accuracy -- per-field error profiling -- MCC decisions for PARTIALLY_PRESENT +This dashboard provides a **sortable, searchable, filterable** table of BioAnalyzer predictions. + +- Curators review predictions and capture feedback by PMID. +- Stored: predicted statuses, curator ground truth, correctness flags. +- Export suitable for confusion matrices, per-field error profiling, MCC for PARTIALLY_PRESENT. """ ) - st.sidebar.header("Data source") - data_source = st.sidebar.radio( "Choose input mode", options=["Upload CSV/Parquet", "Use file path"], index=0, ) - raw_df = pd.DataFrame() - if data_source == "Upload CSV/Parquet": - uploaded = st.sidebar.file_uploader( - "Upload dataset", - type=["csv", "parquet", "pq"], - ) + uploaded = st.sidebar.file_uploader("Upload dataset", type=["csv", "parquet", "pq"]) if uploaded: try: - raw_df = load_data_from_upload(uploaded) + raw_df = _load_data(uploaded, is_path=False) except Exception as e: st.error(f"Could not load file: {e}") + logger.exception("Upload load failed") return else: - path = st.sidebar.text_input( - "Path to CSV/Parquet", - placeholder="e.g. analysis_results.csv", - ).strip() + path = st.sidebar.text_input("Path to CSV/Parquet", placeholder="e.g. 
analysis_results.csv").strip() if path: try: - raw_df = load_data_from_path(path) + raw_df = _load_data(path, is_path=True) except Exception as e: st.error(str(e)) return - if raw_df.empty: st.info("Upload a dataset or provide a file path to begin.") st.stop() - df = normalize_dataset(raw_df) - if df.empty: st.error("Dataset loaded, but no valid rows found after normalization.") st.stop() - missing = [c for c in STATUS_COLUMNS if c not in df.columns] if missing: st.warning( - "Some expected status columns are missing. " - "Priority scoring and filtering will be partial.\n\n" - f"Missing columns: {missing}" + "Some expected status columns are missing. Priority and filtering will be partial.\n\n" + f"Missing: {missing}" ) - filtered_df = render_filters(df) selected_pmid = render_table(filtered_df) - st.divider() render_feedback_section(selected_pmid, filtered_df) - st.sidebar.divider() st.sidebar.header("Notes") st.sidebar.markdown( - f""" -Feedback files are saved to: - -- `{FEEDBACK_CSV}` -- `{FEEDBACK_PARQUET}` (if parquet supported) - -Tip: set an environment variable to track versions: - -- `BIOANALYZER_VERSION=commit_sha` - """ + f"Feedback: `{FEEDBACK_CSV}` and `{FEEDBACK_PARQUET}`. " + "Tip: set `BIOANALYZER_VERSION` or `FEEDBACK_DIR` in environment." ) diff --git a/docker-setup.sh b/docker-setup.sh index 934005e..8164653 100755 --- a/docker-setup.sh +++ b/docker-setup.sh @@ -68,7 +68,7 @@ fi print_status "Testing Docker image..." # Test the image with a simple command -docker run --rm bioanalyzer-package python test_cli.py +docker run --rm bioanalyzer-package python scripts/dev/test_cli.py if [ $? -eq 0 ]; then print_success "Docker image test passed!" 
diff --git a/docs/CURATOR_TABLE_DESIGN.md b/docs/CURATOR_TABLE_DESIGN.md index a1bd351..9856087 100644 --- a/docs/CURATOR_TABLE_DESIGN.md +++ b/docs/CURATOR_TABLE_DESIGN.md @@ -92,4 +92,4 @@ We can start with **PMID, Title, Year, Journal, the 6 field statuses, and one co - **Design:** `docs/CURATOR_TABLE_DESIGN.md` (this file). - **App:** `curator_table/` (Streamlit app + README). -- **Data format:** Same as existing BioAnalyzer export (e.g. `analysis_results.csv` / validation dataset shape); see `create_validation_dataset.py` and `confusion_matrix_analysis.py` for column names. +- **Data format:** Same as existing BioAnalyzer export (e.g. `analysis_results.csv` / validation dataset shape); see `create_validation_dataset.py` and `scripts/eval/confusion_matrix_analysis.py` for column names. diff --git a/run_confusion_analysis.sh b/run_confusion_analysis.sh index 30defd0..99ffee5 100755 --- a/run_confusion_analysis.sh +++ b/run_confusion_analysis.sh @@ -55,7 +55,7 @@ docker run --rm \ -v "$SCRIPT_DIR:/app" \ -w /app \ bioanalyzer-package \ - python confusion_matrix_analysis.py "$PREDICTIONS_FILE" "$FEEDBACK_FILE" + python scripts/eval/confusion_matrix_analysis.py "$PREDICTIONS_FILE" "$FEEDBACK_FILE" echo "" echo "==========================================" diff --git a/scripts/dev/test_cli.py b/scripts/dev/test_cli.py new file mode 100644 index 0000000..f60d6bb --- /dev/null +++ b/scripts/dev/test_cli.py @@ -0,0 +1,124 @@ +#!/usr/bin/env python3 +""" +BioAnalyzer CLI Test - Simple test without dependencies +""" + + +def test_cli_structure(): + """Test that the CLI structure is correct.""" + print("๐Ÿงช Testing BioAnalyzer Package CLI Structure...") + + # Test CLI file exists + import os + + cli_path = "cli.py" + if os.path.exists(cli_path): + print("โœ… CLI file exists") + else: + print("โŒ CLI file missing") + return False + + # Test main.py exists + main_path = "main.py" + if os.path.exists(main_path): + print("โœ… Main.py file exists") + else: + print("โŒ 
Main.py file missing") + return False + + # Test app directory structure + app_dirs = ["app", "app/api", "app/services", "app/models", "app/utils"] + for dir_path in app_dirs: + if os.path.exists(dir_path): + print(f"โœ… {dir_path} directory exists") + else: + print(f"โŒ {dir_path} directory missing") + return False + + # Test configuration files + config_files = ["config/requirements.txt", "setup.py", "README.md"] + for config_file in config_files: + if os.path.exists(config_file): + print(f"โœ… {config_file} exists") + else: + print(f"โŒ {config_file} missing") + return False + + print("\n๐ŸŽ‰ All structure tests passed!") + return True + + +def test_field_info(): + """Test field information display.""" + print("\n๐Ÿ“‹ BugSigDB Essential Fields:") + print("=" * 50) + + fields = { + "host_species": { + "name": "Host Species", + "description": "The host organism being studied (e.g., Human, Mouse, Rat)", + "required": True, + }, + "body_site": { + "name": "Body Site", + "description": "Where the microbiome sample was collected (e.g., Gut, Oral, Skin)", + "required": True, + }, + "condition": { + "name": "Condition", + "description": "What disease, treatment, or exposure is being studied", + "required": True, + }, + "sequencing_type": { + "name": "Sequencing Type", + "description": "What molecular method was used (e.g., 16S, metagenomics)", + "required": True, + }, + "taxa_level": { + "name": "Taxa Level", + "description": "What taxonomic level was analyzed (e.g., phylum, genus, species)", + "required": True, + }, + "sample_size": { + "name": "Sample Size", + "description": "Number of samples or participants analyzed", + "required": True, + }, + } + + for field_key, field_info in fields.items(): + print(f"\n{field_info['name']} ({field_key}):") + print(f" Description: {field_info['description']}") + print(f" Required: {'Yes' if field_info['required'] else 'No'}") + + print("\n\nField Status Values:") + print("-" * 30) + status_values = { + "PRESENT": "Information 
is complete and clear", + "PARTIALLY_PRESENT": "Some information available but incomplete", + "ABSENT": "Information is missing", + } + + for status, description in status_values.items(): + print(f" {status}: {description}") + + print("\n" + "=" * 50) + + +if __name__ == "__main__": + print("๐Ÿš€ BioAnalyzer Package CLI Test") + print("=" * 40) + + # Test structure + if test_cli_structure(): + # Test field info + test_field_info() + + print("\nโœ… Package CLI structure is correct!") + print("๐Ÿ“ To use the CLI, install dependencies:") + print(" pip install -r config/requirements.txt") + print(" python3 cli.py fields") + print(" python3 cli.py analyze 12345678") + else: + print("\nโŒ Package CLI structure has issues!") + diff --git a/confusion_matrix_analysis.py b/scripts/eval/confusion_matrix_analysis.py similarity index 99% rename from confusion_matrix_analysis.py rename to scripts/eval/confusion_matrix_analysis.py index 6feb7ea..102ac86 100644 --- a/confusion_matrix_analysis.py +++ b/scripts/eval/confusion_matrix_analysis.py @@ -248,3 +248,4 @@ def main(): if __name__ == "__main__": main() + diff --git a/scripts/ops/log_cleanup.py b/scripts/ops/log_cleanup.py new file mode 100644 index 0000000..14717e2 --- /dev/null +++ b/scripts/ops/log_cleanup.py @@ -0,0 +1,212 @@ +#!/usr/bin/env python3 +""" +Archived: Log Cleanup for BioAnalyzer +===================================== + +This script helps manage log files by cleaning up old entries and rotating logs. +Moved to scripts/ops/ to indicate it's a dev/ops utility and not part of the +core backend runtime. 
+""" + +import os +import sys +import argparse +from pathlib import Path +from datetime import datetime, timedelta +import shutil + + +class LogCleanup: + """Log file cleanup and management.""" + + def __init__(self, log_dir="logs"): + self.log_dir = Path(log_dir) + self.log_files = { + "main": self.log_dir / "bioanalyzer.log", + "performance": self.log_dir / "performance.log", + "errors": self.log_dir / "errors.log", + "api": self.log_dir / "api_calls.log", + } + + def cleanup_old_logs(self, days=7): + """Remove log files older than specified days.""" + cutoff_date = datetime.now() - timedelta(days=days) + removed_count = 0 + + print(f"๐Ÿงน Cleaning up logs older than {days} days...") + + for log_file in self.log_dir.glob("*.log.*"): + try: + # Check if it's a rotated log file + if log_file.name.endswith((".1", ".2", ".3", ".4", ".5")): + # Get file modification time + mtime = datetime.fromtimestamp(log_file.stat().st_mtime) + if mtime < cutoff_date: + log_file.unlink() + print(f"โœ… Removed old log: {log_file.name}") + removed_count += 1 + except Exception as e: + print(f"โŒ Error removing {log_file.name}: {e}") + + print(f"โœ… Cleanup complete. 
Removed {removed_count} old log files.") + + def rotate_logs(self): + """Manually rotate log files.""" + print("๐Ÿ”„ Rotating log files...") + + for log_type, log_file in self.log_files.items(): + if not log_file.exists(): + continue + + try: + # Create backup with timestamp + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + backup_name = f"{log_file.stem}_{timestamp}{log_file.suffix}" + backup_path = self.log_dir / backup_name + + # Copy current log to backup + shutil.copy2(log_file, backup_path) + + # Clear current log + with open(log_file, "w") as f: + f.write("") + + print(f"โœ… Rotated {log_type}: {backup_name}") + + except Exception as e: + print(f"โŒ Error rotating {log_type}: {e}") + + def compress_logs(self): + """Compress old log files to save space.""" + import gzip + + print("๐Ÿ—œ๏ธ Compressing old log files...") + compressed_count = 0 + + for log_file in self.log_dir.glob("*.log.*"): + if log_file.name.endswith((".1", ".2", ".3", ".4", ".5")): + try: + # Skip already compressed files + if log_file.suffix == ".gz": + continue + + # Compress file + with open(log_file, "rb") as f_in: + compressed_path = log_file.with_suffix(log_file.suffix + ".gz") + with gzip.open(compressed_path, "wb") as f_out: + shutil.copyfileobj(f_in, f_out) + + # Remove original file + log_file.unlink() + print(f"โœ… Compressed: {log_file.name} -> {compressed_path.name}") + compressed_count += 1 + + except Exception as e: + print(f"โŒ Error compressing {log_file.name}: {e}") + + print(f"โœ… Compression complete. 
Compressed {compressed_count} files.") + + def show_log_info(self): + """Show information about log files.""" + print("๐Ÿ“Š LOG FILES INFORMATION") + print("=" * 50) + + total_size = 0 + + for log_type, log_file in self.log_files.items(): + if log_file.exists(): + size = log_file.stat().st_size + size_mb = size / (1024 * 1024) + total_size += size + + # Count lines + try: + with open(log_file, "r", encoding="utf-8") as f: + line_count = sum(1 for _ in f) + except: + line_count = 0 + + print( + f"{log_type.upper():12} | {size_mb:6.2f} MB | {line_count:8d} lines" + ) + else: + print(f"{log_type.upper():12} | Not found") + + # Check for rotated logs + rotated_logs = list(self.log_dir.glob("*.log.*")) + if rotated_logs: + print(f"\n๐Ÿ”„ ROTATED LOGS ({len(rotated_logs)} files):") + for log_file in sorted(rotated_logs): + size = log_file.stat().st_size + size_kb = size / 1024 + mtime = datetime.fromtimestamp(log_file.stat().st_mtime) + age = datetime.now() - mtime + + if age.days > 0: + age_str = f"{age.days}d ago" + else: + age_str = f"{age.seconds // 3600}h ago" + + print(f" {log_file.name:30} | {size_kb:6.1f} KB | {age_str}") + + print(f"\n๐Ÿ’พ Total log size: {total_size / (1024 * 1024):.2f} MB") + + def reset_logs(self, confirm=True): + """Reset all log files (clear content).""" + if confirm: + response = input("Are you sure you want to reset ALL log files? 
(y/N): ") + if response.lower() != "y": + print("Log reset cancelled.") + return + + print("๐Ÿ”„ Resetting all log files...") + + for log_type, log_file in self.log_files.items(): + if log_file.exists(): + try: + with open(log_file, "w") as f: + f.write("") + print(f"โœ… Reset {log_type} log") + except Exception as e: + print(f"โŒ Error resetting {log_type}: {e}") + + print("โœ… All log files have been reset.") + + +def main(): + parser = argparse.ArgumentParser(description="BioAnalyzer Log Cleanup (Archived)") + parser.add_argument( + "--cleanup", type=int, metavar="DAYS", help="Clean up logs older than DAYS" + ) + parser.add_argument("--rotate", action="store_true", help="Rotate log files") + parser.add_argument( + "--compress", action="store_true", help="Compress old log files" + ) + parser.add_argument("--info", action="store_true", help="Show log file information") + parser.add_argument("--reset", action="store_true", help="Reset all log files") + parser.add_argument( + "--logs", default="logs", help="Log directory path (default: logs)" + ) + + args = parser.parse_args() + + cleanup = LogCleanup(args.logs) + + if args.cleanup: + cleanup.cleanup_old_logs(args.cleanup) + elif args.rotate: + cleanup.rotate_logs() + elif args.compress: + cleanup.compress_logs() + elif args.info: + cleanup.show_log_info() + elif args.reset: + cleanup.reset_logs() + else: + # Default: show info + cleanup.show_log_info() + + +if __name__ == "__main__": + main() + diff --git a/scripts/ops/log_dashboard.py b/scripts/ops/log_dashboard.py new file mode 100644 index 0000000..4b0d0d8 --- /dev/null +++ b/scripts/ops/log_dashboard.py @@ -0,0 +1,254 @@ +#!/usr/bin/env python3 +""" +Archived: Log Dashboard for BioAnalyzer +======================================= + +A simple dashboard to monitor logs in real-time with performance metrics. +Moved to scripts/ops/ to indicate it's a dev/ops utility and not part of the +core backend runtime. 
+""" + +import os +import sys +import time +import json +from pathlib import Path +from datetime import datetime, timedelta +import argparse +import re + + +class LogDashboard: + """Simple log monitoring dashboard.""" + + def __init__(self, log_dir="logs"): + self.log_dir = Path(log_dir) + self.performance_log = self.log_dir / "performance.log" + self.error_log = self.log_dir / "errors.log" + self.main_log = self.log_dir / "bioanalyzer.log" + + # Statistics + self.stats = { + "total_queries": 0, + "successful_queries": 0, + "failed_queries": 0, + "cached_queries": 0, + "avg_response_time": 0, + "errors": [], + "recent_activity": [], + } + + # Track file positions + self.file_positions = {} + for log_file in [self.performance_log, self.error_log, self.main_log]: + if log_file.exists(): + self.file_positions[log_file] = log_file.stat().st_size + else: + self.file_positions[log_file] = 0 + + def update_stats(self): + """Update statistics from log files.""" + self._update_performance_stats() + self._update_error_stats() + self._update_recent_activity() + + def _update_performance_stats(self): + """Update performance statistics.""" + if not self.performance_log.exists(): + return + + try: + with open(self.performance_log, "r", encoding="utf-8") as f: + lines = f.readlines() + + # Reset counters + self.stats["total_queries"] = 0 + self.stats["successful_queries"] = 0 + self.stats["failed_queries"] = 0 + self.stats["cached_queries"] = 0 + response_times = [] + + for line in lines: + if "PMID_QUERY_END" in line: + self.stats["total_queries"] += 1 + + # Parse status + if "Status: SUCCESS" in line: + self.stats["successful_queries"] += 1 + elif "Status: FAILED" in line: + self.stats["failed_queries"] += 1 + + # Parse cache status + if "Cache: CACHED" in line: + self.stats["cached_queries"] += 1 + + # Parse duration + duration_match = re.search(r"Duration: ([\d.]+)s", line) + if duration_match: + response_times.append(float(duration_match.group(1))) + + # Calculate average 
response time + if response_times: + self.stats["avg_response_time"] = sum(response_times) / len( + response_times + ) + + except Exception as e: + print(f"Error updating performance stats: {e}") + + def _update_error_stats(self): + """Update error statistics.""" + if not self.error_log.exists(): + return + + try: + with open(self.error_log, "r", encoding="utf-8") as f: + lines = f.readlines() + + # Get last 10 errors + recent_errors = [] + for line in lines[-10:]: + if line.strip(): + # Extract error summary + error_match = re.search( + r"ERROR - PMID: (\d+) \| Context: (.+?) \|", line + ) + if error_match: + pmid = error_match.group(1) + context = error_match.group(2) + recent_errors.append(f"PMID {pmid}: {context}") + + self.stats["errors"] = recent_errors[-5:] # Keep last 5 errors + + except Exception as e: + print(f"Error updating error stats: {e}") + + def _update_recent_activity(self): + """Update recent activity.""" + if not self.main_log.exists(): + return + + try: + with open(self.main_log, "r", encoding="utf-8") as f: + lines = f.readlines() + + # Get last 10 log entries + recent_lines = lines[-10:] + self.stats["recent_activity"] = [] + + for line in recent_lines: + if line.strip(): + # Extract timestamp and message + timestamp_match = re.search( + r"(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})", line + ) + if timestamp_match: + timestamp = timestamp_match.group(1) + # Extract meaningful part of the message + message = ( + line.split(" - ", 2)[-1] if " - " in line else line.strip() + ) + self.stats["recent_activity"].append(f"{timestamp}: {message}") + + except Exception as e: + print(f"Error updating recent activity: {e}") + + def display_dashboard(self): + """Display the dashboard.""" + os.system("clear" if os.name == "posix" else "cls") + + print("๐Ÿš€ BioAnalyzer Log Dashboard (Archived)") + print("=" * 60) + print(f"๐Ÿ“… Last Updated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") + print() + + # Performance Overview + print("๐Ÿ“Š PERFORMANCE OVERVIEW") 
+ print("-" * 30) + print(f"Total PMID Queries: {self.stats['total_queries']}") + print(f"Successful: {self.stats['successful_queries']} โœ…") + print(f"Failed: {self.stats['failed_queries']} โŒ") + print(f"Cached Results: {self.stats['cached_queries']} ๐Ÿ“‹") + print(f"Avg Response Time: {self.stats['avg_response_time']:.2f}s") + + # Success rate + if self.stats["total_queries"] > 0: + success_rate = ( + self.stats["successful_queries"] / self.stats["total_queries"] + ) * 100 + print(f"Success Rate: {success_rate:.1f}%") + print() + + # Recent Errors + if self.stats["errors"]: + print("โŒ RECENT ERRORS") + print("-" * 20) + for error in self.stats["errors"]: + print(f"โ€ข {error}") + print() + + # Recent Activity + if self.stats["recent_activity"]: + print("๐Ÿ“ RECENT ACTIVITY") + print("-" * 20) + for activity in self.stats["recent_activity"][-5:]: # Show last 5 + print(f"โ€ข {activity}") + print() + + # File Status + print("๐Ÿ“ LOG FILES STATUS") + print("-" * 20) + for log_name, log_file in [ + ("Main", self.main_log), + ("Performance", self.performance_log), + ("Errors", self.error_log), + ]: + if log_file.exists(): + size = log_file.stat().st_size + size_kb = size / 1024 + print(f"{log_name}: {size_kb:.1f} KB") + else: + print(f"{log_name}: Not found") + + print() + print("Press Ctrl+C to stop monitoring") + + def monitor(self, refresh_interval=5): + """Monitor logs with periodic updates.""" + print("Starting log monitoring (Archived script)...") + print("Press Ctrl+C to stop") + + try: + while True: + self.update_stats() + self.display_dashboard() + time.sleep(refresh_interval) + + except KeyboardInterrupt: + print("\n๐Ÿ‘‹ Monitoring stopped.") + + +def main(): + parser = argparse.ArgumentParser( + description="BioAnalyzer Log Dashboard (Archived)" + ) + parser.add_argument( + "--refresh", + "-r", + type=int, + default=5, + help="Refresh interval in seconds (default: 5)", + ) + parser.add_argument( + "--logs", default="logs", help="Log directory path 
(default: logs)" + ) + + args = parser.parse_args() + + dashboard = LogDashboard(args.logs) + dashboard.monitor(args.refresh) + + +if __name__ == "__main__": + main() + diff --git a/scripts/ops/performance_monitor.py b/scripts/ops/performance_monitor.py new file mode 100644 index 0000000..063d04c --- /dev/null +++ b/scripts/ops/performance_monitor.py @@ -0,0 +1,171 @@ +#!/usr/bin/env python3 +""" +Archived: Performance Monitor for BioAnalyzer +============================================ + +This script monitors the performance of PMID queries and helps identify +API bottlenecks. Moved to scripts/ops/ to indicate it's a dev/ops +utility and not part of the core backend runtime. +""" + +import requests +import time +import json +from datetime import datetime +import argparse + + +def test_pmid_query(pmid, base_url="http://localhost:8000"): + """Test a single PMID query and measure performance.""" + print(f"Testing PMID: {pmid}") + + start_time = time.time() + + try: + # Test health endpoint first + health_response = requests.get(f"{base_url}/health", timeout=10) + health_time = time.time() - start_time + + if health_response.status_code != 200: + print(f"โŒ Health check failed: {health_response.status_code}") + return False + + print(f"โœ… Health check: {health_time:.2f}s") + + # Test enhanced analysis endpoint + analysis_start = time.time() + response = requests.get( + f"{base_url}/enhanced_analysis/{pmid}", timeout=150 + ) # Increased timeout to 150 seconds + analysis_time = time.time() - analysis_start + + total_time = time.time() - start_time + + if response.status_code == 200: + print(f"โœ… Analysis successful: {analysis_time:.2f}s") + print(f"โœ… Total time: {total_time:.2f}s") + + # Check if result was cached + data = response.json() + if data.get("cached", False): + print("๐Ÿ“‹ Result served from cache") + else: + print("๐Ÿ”„ Result generated fresh") + + return True + else: + print(f"โŒ Analysis failed: {response.status_code}") + try: + error_data = 
response.json() + print(f"Error: {error_data.get('detail', 'Unknown error')}") + except: + print(f"Error: {response.text}") + return False + + except requests.exceptions.Timeout: + print("โŒ Request timed out") + return False + except Exception as e: + print(f"โŒ Error: {str(e)}") + return False + + +def test_multiple_pmids(pmids, base_url="http://localhost:8000"): + """Test multiple PMIDs and provide performance summary.""" + print(f"Testing {len(pmids)} PMIDs...") + print("=" * 50) + + results = [] + total_time = 0 + + for i, pmid in enumerate(pmids, 1): + print(f"\n[{i}/{len(pmids)}] ", end="") + start_time = time.time() + + success = test_pmid_query(pmid, base_url) + query_time = time.time() - start_time + + results.append({"pmid": pmid, "success": success, "time": query_time}) + + total_time += query_time + + # Add delay between requests to avoid overwhelming the server + if i < len(pmids): + time.sleep(1) + + # Print summary + print("\n" + "=" * 50) + print("PERFORMANCE SUMMARY") + print("=" * 50) + + successful = sum(1 for r in results if r["success"]) + failed = len(results) - successful + + print(f"Total PMIDs tested: {len(pmids)}") + print(f"Successful: {successful}") + print(f"Failed: {failed}") + print(f"Success rate: {(successful/len(pmids)*100):.1f}%") + print(f"Total time: {total_time:.2f}s") + print(f"Average time per PMID: {(total_time/len(pmids)):.2f}s") + + if successful > 0: + successful_times = [r["time"] for r in results if r["success"]] + print(f"Fastest query: {min(successful_times):.2f}s") + print(f"Slowest query: {max(successful_times):.2f}s") + + # Check cache performance + try: + metrics_response = requests.get(f"{base_url}/metrics", timeout=10) + if metrics_response.status_code == 200: + metrics = metrics_response.json() + cache_stats = metrics.get("cache", {}) + print(f"\nCache Statistics:") + print( + f" Total analyzed: {cache_stats.get('total_curation_analyzed', 'N/A')}" + ) + print( + f" Cache hit rate: 
{cache_stats.get('curation_readiness_rate', 'N/A'):.1%}" + ) + print( + f" Recent activity (24h): {cache_stats.get('recent_analysis_24h', 'N/A')}" + ) + except: + print("\nCould not retrieve cache statistics") + + +def main(): + parser = argparse.ArgumentParser( + description="Performance Monitor for BioAnalyzer (Archived)" + ) + parser.add_argument("--pmid", help="Single PMID to test") + parser.add_argument("--pmids", nargs="+", help="Multiple PMIDs to test") + parser.add_argument("--file", help="File containing PMIDs (one per line)") + parser.add_argument( + "--url", default="http://localhost:8000", help="Base URL of the API" + ) + + args = parser.parse_args() + + if args.pmid: + test_pmid_query(args.pmid, args.url) + elif args.pmids: + test_multiple_pmids(args.pmids, args.url) + elif args.file: + try: + with open(args.file, "r") as f: + pmids = [line.strip() for line in f if line.strip()] + test_multiple_pmids(pmids, args.url) + except FileNotFoundError: + print(f"File not found: {args.file}") + except Exception as e: + print(f"Error reading file: {str(e)}") + else: + # Test with some sample PMIDs + sample_pmids = ["12345", "67890", "11111"] + print("No PMIDs specified. Testing with sample PMIDs...") + test_multiple_pmids(sample_pmids, args.url) + + +if __name__ == "__main__": + main() +