diff --git a/.vscode/settings.json b/.vscode/settings.json index e548c7e..3ee8a20 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -18,5 +18,10 @@ } }, "python.terminal.activateEnvironment": false, - "python.terminal.activateEnvInCurrentTerminal": false + "python.terminal.activateEnvInCurrentTerminal": false, + "workbench.editorAssociations": { + "*.copilotmd": "vscode.markdown.preview.editor", + "*.docx": "default", + "file:/**/*.csv": "jupyter-data-wrangler" + } } \ No newline at end of file diff --git a/Readme.md b/Readme.md index 7349d8e..e08fcd9 100644 --- a/Readme.md +++ b/Readme.md @@ -53,7 +53,7 @@ A complete machine learning operations (ML Ops) system demonstrating the full li │ ▼ ┌──────────────────────────────────────────┐ - │ GATEWAY SERVICE (:8000) │ + │ GATEWAY SERVICE (:8080 → :8000) │ │ ┌────────────────────────────────────┐ │ │ │ 1. Whitelist Check (15 domains) │ │ │ │ ├─ google.com, github.com │ │ @@ -87,7 +87,7 @@ A complete machine learning operations (ML Ops) system demonstrating the full li │ ▼ ┌──────────────────────────────────────────┐ - │ MODEL SERVICE (:9000) │ + │ MODEL SERVICE (:8002) │ │ ┌────────────────────────────────────┐ │ │ │ • Feature Extraction (8 features) │ │ │ │ • XGBoost Inference │ │ @@ -854,7 +854,7 @@ suite.expect_column_to_exist("CharContinuationRate") **Multi-Stage Build:** ```dockerfile -# gateway.Dockerfile (BRANCH: feature/docker-slim-gateway) +# gateway.Dockerfile # ---- build stage: install runtime deps into a venv ---- FROM python:3.11-slim AS builder @@ -1067,71 +1067,109 @@ pip install -e ".[dev]" ### Running the Services +**Option 1: Using Docker Compose (Recommended)** +```bash +# Start all services +docker compose up -d + +# Check service health +docker ps + +# View logs +docker compose logs -f gateway +docker compose logs -f model-svc +``` + +**Option 2: Running Locally (Development)** + **Terminal 1: Model Service** ```bash -python -m model_svc.main +python -m src.model_svc.main # Wait 
for: ✓ Model Service Ready -# Listening on http://localhost:9000 +# Listening on http://localhost:8002 ``` **Terminal 2: Gateway Service** ```bash # Windows -set MODEL_SVC_URL=http://localhost:9000 +set MODEL_SVC_URL=http://localhost:8002 + # Linux/Mac -export MODEL_SVC_URL=http://localhost:9000 +export MODEL_SVC_URL=http://localhost:8002 -python -m gateway.main +python -m src.gateway.main # Listening on http://localhost:8000 ``` ### Testing the System +**When using Docker (recommended):** + **1. Whitelist Test** ```bash -curl -X POST http://localhost:8000/predict \ - -H "Content-Type: application/json" \ - -d '{"url":"https://github.com"}' +curl -X POST http://localhost:8080/predict \ + -H "Content-Type: application/json" \ + -d "{\"url\":\"https://github.com\"}" # → {"decision":"ALLOW","reason":"domain-whitelist","source":"whitelist"} ``` **2. Phishing Detection** ```bash -curl -X POST http://localhost:8000/predict \ - -H "Content-Type: application/json" \ - -d '{"url":"https://phishing.top"}' +curl -X POST http://localhost:8080/predict \ + -H "Content-Type: application/json" \ + -d "{\"url\":\"https://phishing.top\"}" # → {"p_malicious":1.0,"decision":"BLOCK","reason":"policy-band"} ``` **3. Short Domain Routing** ```bash -curl -X POST http://localhost:8000/predict \ - -H "Content-Type: application/json" \ - -d '{"url":"https://npm.org"}' +curl -X POST http://localhost:8080/predict \ + -H "Content-Type: application/json" \ + -d "{\"url\":\"https://npm.org\"}" # → {"decision":"ALLOW","reason":"judge-short-domain-lean-legit"} ``` **4. SHAP Dashboard** -Open browser: `http://localhost:8000/explain` +Open browser: `http://localhost:8080/explain` **5. 
Stats Monitoring** ```bash -curl http://localhost:8000/stats +curl http://localhost:8080/stats # → {"policy_decisions":{...},"final_decisions":{...},"judge_verdicts":{...}} ``` ### Docker Deployment +#### Using standalone Docker + ```bash # Build gateway image docker build -f docker/gateway.Dockerfile -t phishguard-gateway:latest . -# Run container -docker run --rm -p 8000:8000 \ - -e MODEL_SVC_URL=http://host.docker.internal:9000 \ +# Build model service image +docker build -f docker/model.Dockerfile -t phishguard-model:latest . + +# Run model service first +docker run -d --name phishguard-model -p 8002:8002 phishguard-model:latest + +# Run gateway service +docker run --rm -p 8080:8000 \ + -e MODEL_SVC_URL=http://host.docker.internal:8002 \ + -e THRESHOLDS_JSON=configs/dev/thresholds.json \ + -e JUDGE_BACKEND=stub \ phishguard-gateway:latest ``` +#### Using Docker Compose + +```bash +# Build and run all services +docker compose up -d + +# Or build and run specific service +docker compose up gateway -d +``` + --- ## 🗂️ Repository Structure diff --git a/models/dev/archive/model.pkl b/models/dev/archive/model.pkl deleted file mode 100644 index b605f95..0000000 Binary files a/models/dev/archive/model.pkl and /dev/null differ diff --git a/models/dev/archive/model_7feat.pkl b/models/dev/archive/model_7feat.pkl deleted file mode 100644 index 5f5dd30..0000000 Binary files a/models/dev/archive/model_7feat.pkl and /dev/null differ diff --git a/models/dev/archive/model_7feat_meta.json b/models/dev/archive/model_7feat_meta.json deleted file mode 100644 index 523ee5b..0000000 --- a/models/dev/archive/model_7feat_meta.json +++ /dev/null @@ -1,33 +0,0 @@ -{ - "feature_order": [ - "TLDLegitimateProb", - "CharContinuationRate", - "SpacialCharRatioInURL", - "URLCharProb", - "LetterRatioInURL", - "NoOfOtherSpecialCharsInURL", - "DomainLength" - ], - "class_mapping": { - "phish": 0, - "legit": 1 - }, - "phish_proba_col_index": 0, - "model_type": "CalibratedClassifierCV", - 
"calibration": "isotonic_cv5", - "training_date": "2025-10-14T00:34:13.019360", - "seed": 42, - "metrics": { - "pr_auc": 0.9987538019764365, - "f1_macro": 0.9939829924548755, - "brier": 0.0052004613854286915 - }, - "thresholds": { - "optimal_threshold": 0.46999999999999986, - "gray_zone_low": 0.011, - "gray_zone_high": 0.9980000000000003, - "gray_zone_rate": 0.11994973697101356, - "f1_score_at_optimal": 0.9940051502619371 - }, - "notes": "7-feature model without IsHTTPS - production candidate, 99.88% PR-AUC, robust to HTTPS phishing" -} \ No newline at end of file diff --git a/models/dev/archive/model_meta.json b/models/dev/archive/model_meta.json deleted file mode 100644 index 33ecbf1..0000000 --- a/models/dev/archive/model_meta.json +++ /dev/null @@ -1,19 +0,0 @@ -{ - "feature_order": [ - "url_len", - "url_digit_ratio", - "url_subdomains", - "NoOfOtherSpecialCharsInURL", - "SpacialCharRatioInURL", - "CharContinuationRate", - "URLCharProb", - "TLDLegitimateProb" - ], - "class_mapping": { - "phish": 0, - "legit": 1 - }, - "phish_proba_col_index": 0, - "model_type": "CalibratedClassifierCV", - "notes": "URL-only baseline; calibrated; saved for model_svc." -} \ No newline at end of file diff --git a/notebooks/archive/02_baseline_and_calibration.ipynb b/notebooks/archive/02_baseline_and_calibration.ipynb deleted file mode 100644 index f4073bb..0000000 --- a/notebooks/archive/02_baseline_and_calibration.ipynb +++ /dev/null @@ -1,461 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "c942e850", - "metadata": {}, - "source": [ - "## **Baseline-calibration** \n", - "- this notebook is designed to build trustworthy baseline models for phishing detection. It’s not about chasing state-of-the-art performance yet — it’s about laying a solid foundation we can reason about, debug, and improve later." 
- ] - }, - { - "cell_type": "markdown", - "id": "03f859c7", - "metadata": {}, - "source": [ - "### **Loads essential libraries for data manipulation, file handling, and visualization.**" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "7d03eb1a", - "metadata": {}, - "outputs": [], - "source": [ - "from pathlib import Path\n", - "import os, json, numpy as np, pandas as pd\n", - "from sklearn.model_selection import train_test_split, StratifiedKFold\n", - "from sklearn.preprocessing import StandardScaler\n", - "from sklearn.pipeline import Pipeline\n", - "from sklearn.linear_model import LogisticRegression\n", - "from sklearn.calibration import CalibratedClassifierCV\n", - "from sklearn.metrics import f1_score, average_precision_score, brier_score_loss\n", - "from xgboost import XGBClassifier\n", - "import mlflow\n", - "import yaml\n", - "from dotenv import load_dotenv\n" - ] - }, - { - "cell_type": "markdown", - "id": "37a54fcb", - "metadata": {}, - "source": [ - "### **Set working directory to root**" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "4682afd3", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'d:\\\\MLops\\\\NetworkSecurity'" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "%pwd\n", - "os.chdir(\"../\")\n", - "%pwd\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "id": "d0c96a68", - "metadata": {}, - "source": [ - "### **Config & paths**" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "4c37549b", - "metadata": {}, - "outputs": [], - "source": [ - "SEED = 42\n", - "RAW = Path(\"data/raw/PhiUSIIL_Phishing_URL_Dataset.csv\")\n", - "CLEAN = Path(\"data/processed/phiusiil_clean.csv\")\n", - "DATA = CLEAN if CLEAN.exists() else RAW\n", - "THRESH_PATH = Path(os.getenv(\"THRESHOLDS_JSON\", \"configs/dev/thresholds.json\"))\n", - "MLFLOW_URI = os.getenv(\"MLFLOW_TRACKING_URI\", \"http://localhost:5000\")\n", - 
"EXPERIMENT = os.getenv(\"MLFLOW_EXPERIMENT\", \"phiusiil_baselines\")\n", - "THRESH_PATH.parent.mkdir(parents=True, exist_ok=True)" - ] - }, - { - "cell_type": "markdown", - "id": "8b9d75d0", - "metadata": {}, - "source": [ - "## **train, calibrate, evaluate, choose thresholds**" - ] - }, - { - "cell_type": "markdown", - "id": "d3ed69ee", - "metadata": {}, - "source": [ - "### **Load & split**" - ] - }, - { - "cell_type": "markdown", - "id": "66643f1a", - "metadata": {}, - "source": [ - "#### Intent: Load & Split\n", - "\n", - "This block loads the raw or cleaned phishing dataset, identifies the label column, and prepares the features and labels for modeling. It also handles the URL column separately, ensuring only numeric features are used for training. Finally, it splits the data into training and validation sets using stratified sampling to preserve the class balance.\n", - "\n", - "The goal is to set up a clean, well-structured dataset so that subsequent modeling steps are reliable and reproducible. This step is crucial for ensuring that the model is trained and evaluated on representative data, minimizing bias and data leakage." 
- ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "107a5b3a", - "metadata": {}, - "outputs": [], - "source": [ - "df = pd.read_csv(DATA, encoding_errors = \"ignore\")\n", - "label_col = next((c for c in df.columns if c.lower() in {\"label\",\"result\",\"y\",\"target\"}), None)\n", - "assert label_col is not None, \"No label column found\"\n", - "\n", - "y = df[label_col].astype(int).values # 1=legit, 0=phish\n", - "X = df.drop(columns=[label_col], axis=1)\n", - "\n", - "\n", - "if \"URL\" in X.columns: # Keep url from the X columns\n", - " urls = X[\"URL\"].astype(str).values\n", - " X = X.drop(columns=[\"URL\"])\n", - "\n", - "else:\n", - " urls = np.array([\"\"] * len(X)) # Create placeholder URLs\n", - "\n", - "\n", - "# Keep only numeric values\n", - "X = X.select_dtypes(include=[\"number\"]).copy()\n", - "X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.20, stratify=y, random_state=SEED)\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "id": "3e61befe", - "metadata": {}, - "source": [ - "### **Define candidates (uncalibrated base)**" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "1b3c63f7", - "metadata": {}, - "outputs": [], - "source": [ - "logreg_base = Pipeline([\n", - " (\"scaler\", StandardScaler(with_mean=False)), # sparse-safe; no harm if dense\n", - " (\"clf\", LogisticRegression(max_iter=2000, class_weight=\"balanced\", random_state=SEED))\n", - "\n", - "])\n", - "\n", - "xgb_base = XGBClassifier(\n", - " n_estimators=300, max_depth=6, learning_rate=0.1, subsample=0.9, colsample_bytree=0.9,\n", - " reg_lambda=1.0, random_state=SEED, n_jobs=0, objective=\"binary:logistic\", verbose=False\n", - ")\n", - "\n", - "\n", - "candidates = {\n", - " \"logreg\": logreg_base,\n", - " \"xgb\": xgb_base,\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "6570c4ce", - "metadata": {}, - "source": [ - "### **Fit + calibrate + score**" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - 
"id": "2c9e0837", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "d:\\MLops\\NetworkSecurity\\venv\\Lib\\site-packages\\xgboost\\training.py:183: UserWarning: [15:02:47] WARNING: C:\\actions-runner\\_work\\xgboost\\xgboost\\src\\learner.cc:738: \n", - "Parameters: { \"verbose\" } are not used.\n", - "\n", - " bst.update(dtrain, iteration=i, fobj=obj)\n", - "d:\\MLops\\NetworkSecurity\\venv\\Lib\\site-packages\\xgboost\\training.py:183: UserWarning: [15:02:50] WARNING: C:\\actions-runner\\_work\\xgboost\\xgboost\\src\\learner.cc:738: \n", - "Parameters: { \"verbose\" } are not used.\n", - "\n", - " bst.update(dtrain, iteration=i, fobj=obj)\n", - "d:\\MLops\\NetworkSecurity\\venv\\Lib\\site-packages\\xgboost\\training.py:183: UserWarning: [15:02:52] WARNING: C:\\actions-runner\\_work\\xgboost\\xgboost\\src\\learner.cc:738: \n", - "Parameters: { \"verbose\" } are not used.\n", - "\n", - " bst.update(dtrain, iteration=i, fobj=obj)\n", - "d:\\MLops\\NetworkSecurity\\venv\\Lib\\site-packages\\xgboost\\training.py:183: UserWarning: [15:02:54] WARNING: C:\\actions-runner\\_work\\xgboost\\xgboost\\src\\learner.cc:738: \n", - "Parameters: { \"verbose\" } are not used.\n", - "\n", - " bst.update(dtrain, iteration=i, fobj=obj)\n", - "d:\\MLops\\NetworkSecurity\\venv\\Lib\\site-packages\\xgboost\\training.py:183: UserWarning: [15:02:56] WARNING: C:\\actions-runner\\_work\\xgboost\\xgboost\\src\\learner.cc:738: \n", - "Parameters: { \"verbose\" } are not used.\n", - "\n", - " bst.update(dtrain, iteration=i, fobj=obj)\n" - ] - } - ], - "source": [ - "def fit_calibrated(name, model):\n", - " # isotonic calibration with 5-fold CV (robust on tabular)\n", - " calib = CalibratedClassifierCV(model, \n", - " method=\"isotonic\", \n", - " cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)\n", - " )\n", - " calib.fit(X_train, y_train)\n", - " \n", - " # we need p_malicious = P(y=0). 
Most sklearn returns prob for class 1 -> P(y=1) (legit)\n", - " p_legit = calib.predict_proba(X_val)[:, 1]\n", - " p_mal = 1.0 - p_legit\n", - " \n", - " # core metrics\n", - " f1m = f1_score(y_val, (p_mal >= 0.5).astype(int), average=\"macro\") # temp decision at 0.5 on p_mal\n", - " prauc = average_precision_score((y_val==0).astype(int), p_mal) # AP wrt phishing as positive class\n", - " brier = brier_score_loss((y_val==0).astype(int), p_mal) # smaller=better\n", - " return calib, {\"f1_macro@0.5_on_p_mal\": float(f1m), \n", - " \"pr_auc_phish\": float(prauc), \n", - " \"brier_phish\": float(brier)}, p_mal\n", - "\n", - "results, calibrated, pvals = {}, {}, {}\n", - "for name, model in candidates.items():\n", - " cls, metrics, p_mal = fit_calibrated(name, model)\n", - " calibrated[name] = cls\n", - " pvals[name] = p_mal\n", - " results[name] = metrics" - ] - }, - { - "cell_type": "markdown", - "id": "b4e36fd2", - "metadata": {}, - "source": [ - "### **Pick best by PR-AUC (tie-break F1)**" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "517cb82d", - "metadata": {}, - "outputs": [], - "source": [ - "order = sorted(results.items(), key=lambda kv: (kv[1][\"pr_auc_phish\"], kv[1][\"f1_macro@0.5_on_p_mal\"]), reverse=True)\n", - "best_name, best_metrics = order[0]\n", - "best_model = calibrated[best_name]\n", - "p_mal = pvals[best_name]" - ] - }, - { - "cell_type": "markdown", - "id": "89af2e04", - "metadata": {}, - "source": [ - "### **Find single threshold (t) maximizing F1-macro**" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "49956973", - "metadata": {}, - "outputs": [], - "source": [ - "grid = np.linspace(0.05, 0.95, 19)\n", - "f1s = []\n", - "for t in grid:\n", - " y_hat = (p_mal >= t).astype(int) # 1=phish prediction if p_mal>=t\n", - " # but our y is 0=phish, 1=legit → map predictions to y-space:\n", - " y_pred = 1 - y_hat\n", - " f1s.append(f1_score(y_val, y_pred, average=\"macro\"))\n", - "t_star = 
float(grid[int(np.argmax(f1s))])" - ] - }, - { - "cell_type": "markdown", - "id": "fe117ae5", - "metadata": {}, - "source": [ - "### **Expand to gray-zone band around t targeting ~10–15%**" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "5dd5e4c3", - "metadata": {}, - "outputs": [], - "source": [ - "target_lo, target_hi = 0.10, 0.15\n", - "band_candidates = np.linspace(0.05, 0.40, 8) # half-widths\n", - "chosen = (t_star, max(0.0, t_star-0.10), min(1.0, t_star+0.10), 0.0) # default\n", - "for w in band_candidates:\n", - " low, high = max(0.0, t_star - w), min(1.0, t_star + w)\n", - " gray = ((p_mal >= low) & (p_mal < high)).mean()\n", - " if target_lo <= gray <= target_hi:\n", - " chosen = (t_star, float(low), float(high), float(gray)); break\n", - "t_star, low, high, gray_rate = chosen" - ] - }, - { - "cell_type": "markdown", - "id": "747f9cc6", - "metadata": {}, - "source": [ - "### **Final metrics (forced decision and gray-zone rate)**" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "81bb94d9", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Selection: xgb {'f1_macro@0.5_on_p_mal': 0.0, 'pr_auc_phish': 1.0, 'brier_phish': 4.261942182010472e-06}\n", - "Thresholds: {'t_star': 0.05, 'low': 0.0, 'high': 0.15000000000000002, 'gray_zone_rate': 0.0}\n" - ] - } - ], - "source": [ - "y_hat_star = (p_mal >= t_star).astype(int)\n", - "y_pred_star = 1 - y_hat_star\n", - "final_f1 = f1_score(y_val, y_pred_star, average=\"macro\")\n", - "final_pr = average_precision_score((y_val==0).astype(int), p_mal)\n", - "\n", - "summary = {\n", - " \"data_file\": str(DATA),\n", - " \"best_model\": best_name,\n", - " \"metrics_val\": {\n", - " \"pr_auc_phish\": final_pr,\n", - " \"f1_macro@t_star\": final_f1,\n", - " \"brier_phish\": brier_score_loss((y_val==0).astype(int), p_mal),\n", - " },\n", - " \"thresholds\": {\"t_star\": t_star, \"low\": low, \"high\": high, \"gray_zone_rate\": 
gray_rate},\n", - " \"class_mapping\": {\"phish\": 0, \"legit\": 1},\n", - " \"seed\": SEED,\n", - "}\n", - "print(\"Selection:\", best_name, best_metrics)\n", - "print(\"Thresholds:\", summary[\"thresholds\"])" - ] - }, - { - "cell_type": "markdown", - "id": "4adb1586", - "metadata": {}, - "source": [ - "## **log to MLflow + export thresholds.json**" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "cda20d70", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "🏃 View run xgb_calibrated at: http://localhost:5000/#/experiments/1/runs/35462fab62a54f1d97ed7b7fa910c0ef\n", - "🧪 View experiment at: http://localhost:5000/#/experiments/1\n", - "MLflow tracking URI: http://localhost:5000\n", - "Wrote thresholds → D:\\MLops\\NetworkSecurity\\configs\\dev\\thresholds.json\n" - ] - } - ], - "source": [ - "# log to MLflow + export thresholds.json\n", - "mlflow.set_tracking_uri(MLFLOW_URI)\n", - "mlflow.set_experiment(EXPERIMENT)\n", - "with mlflow.start_run(run_name=f\"{best_name}_calibrated\"):\n", - " mlflow.log_params({\n", - " \"model\": best_name,\n", - " \"calibration\": \"isotonic_cv5\",\n", - " \"seed\": SEED,\n", - " \"features\": X.shape[1],\n", - " \"train_rows\": int(len(X_train)),\n", - " \"val_rows\": int(len(X_val)),\n", - " \"data_file\": str(DATA),\n", - " })\n", - " mlflow.log_metrics({\n", - " \"val_pr_auc_phish\": summary[\"metrics_val\"][\"pr_auc_phish\"],\n", - " \"val_f1_macro_t_star\": summary[\"metrics_val\"][\"f1_macro@t_star\"],\n", - " \"val_brier_phish\": summary[\"metrics_val\"][\"brier_phish\"],\n", - " \"gray_zone_rate\": summary[\"thresholds\"][\"gray_zone_rate\"],\n", - " \"t_star\": summary[\"thresholds\"][\"t_star\"],\n", - " \"low\": summary[\"thresholds\"][\"low\"],\n", - " \"high\": summary[\"thresholds\"][\"high\"],\n", - " })\n", - " # Save/export thresholds for serving\n", - " with open(THRESH_PATH, \"w\", encoding=\"utf-8\") as f:\n", - " json.dump({\n", - " 
\"model\": best_name,\n", - " \"class_mapping\": summary[\"class_mapping\"],\n", - " \"calibration\": {\"method\": \"isotonic\", \"cv\": 5},\n", - " \"thresholds\": summary[\"thresholds\"],\n", - " \"data\": {\"file\": summary[\"data_file\"]},\n", - " \"seed\": summary[\"seed\"],\n", - " }, f, indent=2)\n", - " mlflow.log_artifact(THRESH_PATH)\n", - "\n", - "print(f\"MLflow tracking URI: {MLFLOW_URI}\")\n", - "print(f\"Wrote thresholds → {THRESH_PATH.resolve()}\")\n" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.11" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/notebooks/archive/03_ablation_url_only.ipynb b/notebooks/archive/03_ablation_url_only.ipynb deleted file mode 100644 index d59eb90..0000000 --- a/notebooks/archive/03_ablation_url_only.ipynb +++ /dev/null @@ -1,1210 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "6c04bf5e", - "metadata": {}, - "source": [ - "## **Ablation B — URL-only manifest**\n", - "- (keep TLDLegitimateProb, exclude HTML/Title features)" - ] - }, - { - "cell_type": "markdown", - "id": "3db06cd0", - "metadata": {}, - "source": [ - "### **Import Libraries**" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "5413dffd", - "metadata": {}, - "outputs": [], - "source": [ - "from pathlib import Path\n", - "import os\n", - "import json\n", - "import numpy as np\n", - "import pandas as pd\n", - "from sklearn.model_selection import train_test_split, StratifiedKFold\n", - "from sklearn.preprocessing import StandardScaler\n", - "from sklearn.pipeline import Pipeline\n", - "from sklearn.linear_model import LogisticRegression\n", - "from sklearn.calibration import 
CalibratedClassifierCV\n", - "from sklearn.metrics import f1_score, average_precision_score, brier_score_loss\n", - "from xgboost import XGBClassifier\n", - "import mlflow\n", - "import yaml\n", - "from dotenv import load_dotenv\n" - ] - }, - { - "cell_type": "markdown", - "id": "0ae2b2e8", - "metadata": {}, - "source": [ - "### **Set working directory**" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "dc490286", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "d:\\MLops\\NetworkSecurity\n" - ] - } - ], - "source": [ - "os.chdir(\"../\")\n", - "print(os.getcwd())" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "677b791d", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "python-dotenv could not parse statement starting at line 1\n", - "python-dotenv could not parse statement starting at line 3\n", - "python-dotenv could not parse statement starting at line 7\n", - "python-dotenv could not parse statement starting at line 10\n", - "python-dotenv could not parse statement starting at line 11\n", - "python-dotenv could not parse statement starting at line 12\n", - "python-dotenv could not parse statement starting at line 13\n", - "python-dotenv could not parse statement starting at line 14\n", - "python-dotenv could not parse statement starting at line 15\n" - ] - } - ], - "source": [ - "# Load environment variables from .env file\n", - "load_dotenv()\n", - "SEED = 42\n", - "THRESH_PATH = Path(\"configs/dev/thresholds.json\")\n", - "MLFLOW_URI = os.getenv(\"MLFLOW_TRACKING_URI\", \"http://localhost:5000\")\n", - "EXPERIMENT = os.getenv(\"MLFLOW_EXPERIMENT\", \"phiusiil_baselines\")\n", - "THRESH_PATH.parent.mkdir(parents=True, exist_ok=True)" - ] - }, - { - "cell_type": "markdown", - "id": "11b74e23", - "metadata": {}, - "source": [ - "### **Load dataset and yml files**" - ] - }, - { - "cell_type": "code", - "execution_count": 6, 
- "id": "f3582563", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Data fingerprint: {'rows': 235370, 'cols': 58, 'file': 'data\\\\processed\\\\phiusiil_clean_urlfeats.csv', 'md5': '30393b938e541b7b3cef650818740d20', 'added_features': ['url_len', 'url_digit_ratio', 'url_subdomains'], 'ranges': {'url_len': [14, 6097], 'url_digit_ratio': [0.0, 0.6842105263157895], 'url_subdomains': [0, 10]}}\n" - ] - } - ], - "source": [ - "DATA = Path(\"data/processed/phiusiil_clean_urlfeats.csv\")\n", - "MANIFEST = Path(\"configs/dev/features_url_only.yaml\")\n", - "\n", - "# Show the fingerprint we wrote, handy for MLflow tags\n", - "fp_path = Path(\"outputs/url_features_fingerprint.json\")\n", - "\n", - "if fp_path.exists():\n", - " print(\"Data fingerprint:\", json.loads(fp_path.read_text()))\n", - "else:\n", - " print(\"Fingerprint file not found; proceed anyway.\")\n", - "\n", - "assert DATA.exists(), f\"Missing processed data: {DATA}\"\n", - "assert MANIFEST.exists(), f\"Missing manifest: {MANIFEST}\"\n", - "\n", - "\n", - "cfg = yaml.safe_load(MANIFEST.read_text())\n", - "whitelist = cfg[\"include\"]\n", - "blacklist = set(cfg.get(\"exclude\", []))\n", - "\n", - "df = pd.read_csv(DATA, encoding_errors=\"ignore\")\n", - "label_col = next(\n", - " (c for c in df.columns if c.lower() in {\"label\", \"result\", \"y\", \"target\"}), None\n", - ")\n", - "assert label_col, \"No label column found in processed data\"" - ] - }, - { - "cell_type": "markdown", - "id": "9d6151c0", - "metadata": {}, - "source": [ - "### **Selects the features to include/exclude**" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "cfc6955d", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "URL-only manifest (present): ['url_len', 'url_digit_ratio', 'url_subdomains', 'NoOfOtherSpecialCharsInURL', 'SpacialCharRatioInURL', 'CharContinuationRate', 'URLCharProb', 'TLDLegitimateProb']\n" - ] 
- } - ], - "source": [ - "# Keep exactly the whitelist columns that actually exist; drop anything else\n", - "present = [c for c in whitelist if c in df.columns]\n", - "missing = [c for c in whitelist if c not in df.columns]\n", - "assert present, f\"No manifest features found. Missing from data: {missing}\"\n", - "\n", - "# Never allow blacklisted or non-numeric columns to slip in\n", - "X = df[present].select_dtypes(include=[\"number\"]).copy()\n", - "y = df[label_col].astype(int).values\n", - "\n", - "print(\"URL-only manifest (present):\", present)\n", - "if missing:\n", - " print(\"Note: these manifest features were not found and are skipped:\", missing)\n", - "\n", - "# Save the resolved feature list for audit + MLflow logging later\n", - "Path(\"outputs\").mkdir(exist_ok=True)\n", - "Path(\"outputs/feature_manifest_resolved.json\").write_text(\n", - " json.dumps({\"features\": present}, indent=2)\n", - ")\n", - "\n", - "# Optionally extract URLs if needed for later use\n", - "if \"URL\" in df.columns:\n", - " urls = df[\"URL\"].astype(str).values\n", - "else:\n", - " urls = np.array([\"\"] * len(df))\n", - "\n", - "# Train/val split\n", - "X_train, X_val, y_train, y_val = train_test_split(\n", - " X, y, test_size=0.20, stratify=y, random_state=SEED\n", - ")\n" - ] - }, - { - "cell_type": "markdown", - "id": "694d8e93", - "metadata": {}, - "source": [ - "### **Define candidates (uncalibrated base)**" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "7d2496aa", - "metadata": {}, - "outputs": [], - "source": [ - "logreg_base = Pipeline(\n", - " [\n", - " (\"scaler\", StandardScaler(with_mean=False)), # sparse-safe; no harm if dense\n", - " (\n", - " \"clf\",\n", - " LogisticRegression(\n", - " max_iter=2000, class_weight=\"balanced\", random_state=SEED\n", - " ),\n", - " ),\n", - " ]\n", - ")\n", - "\n", - "xgb_base = XGBClassifier(\n", - " n_estimators=300,\n", - " max_depth=6,\n", - " learning_rate=0.1,\n", - " subsample=0.9,\n", - " 
colsample_bytree=0.9,\n", - " reg_lambda=1.0,\n", - " random_state=SEED,\n", - " n_jobs=0,\n", - " objective=\"binary:logistic\",\n", - " verbose=False,\n", - ")\n", - "\n", - "\n", - "candidates = {\n", - " \"logreg\": logreg_base,\n", - " \"xgb\": xgb_base,\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "7c795cf1", - "metadata": {}, - "source": [ - "### **Fit + calibrate + score**" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "23d8d901", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "d:\\MLops\\NetworkSecurity\\venv\\Lib\\site-packages\\xgboost\\core.py:158: UserWarning: [11:52:46] WARNING: C:\\buildkite-agent\\builds\\buildkite-windows-cpu-autoscaling-group-i-08cbc0333d8d4aae1-1\\xgboost\\xgboost-ci-windows\\src\\learner.cc:740: \n", - "Parameters: { \"verbose\" } are not used.\n", - "\n", - " warnings.warn(smsg, UserWarning)\n", - "d:\\MLops\\NetworkSecurity\\venv\\Lib\\site-packages\\xgboost\\core.py:158: UserWarning: [11:52:48] WARNING: C:\\buildkite-agent\\builds\\buildkite-windows-cpu-autoscaling-group-i-08cbc0333d8d4aae1-1\\xgboost\\xgboost-ci-windows\\src\\learner.cc:740: \n", - "Parameters: { \"verbose\" } are not used.\n", - "\n", - " warnings.warn(smsg, UserWarning)\n", - "d:\\MLops\\NetworkSecurity\\venv\\Lib\\site-packages\\xgboost\\core.py:158: UserWarning: [11:52:50] WARNING: C:\\buildkite-agent\\builds\\buildkite-windows-cpu-autoscaling-group-i-08cbc0333d8d4aae1-1\\xgboost\\xgboost-ci-windows\\src\\learner.cc:740: \n", - "Parameters: { \"verbose\" } are not used.\n", - "\n", - " warnings.warn(smsg, UserWarning)\n", - "d:\\MLops\\NetworkSecurity\\venv\\Lib\\site-packages\\xgboost\\core.py:158: UserWarning: [11:52:53] WARNING: C:\\buildkite-agent\\builds\\buildkite-windows-cpu-autoscaling-group-i-08cbc0333d8d4aae1-1\\xgboost\\xgboost-ci-windows\\src\\learner.cc:740: \n", - "Parameters: { \"verbose\" } are not used.\n", - "\n", - " warnings.warn(smsg, 
UserWarning)\n", - "d:\\MLops\\NetworkSecurity\\venv\\Lib\\site-packages\\xgboost\\core.py:158: UserWarning: [11:52:55] WARNING: C:\\buildkite-agent\\builds\\buildkite-windows-cpu-autoscaling-group-i-08cbc0333d8d4aae1-1\\xgboost\\xgboost-ci-windows\\src\\learner.cc:740: \n", - "Parameters: { \"verbose\" } are not used.\n", - "\n", - " warnings.warn(smsg, UserWarning)\n" - ] - } - ], - "source": [ - "def fit_calibrated(name, model):\n", - " # isotonic calibration with 5-fold CV (robust on tabular)\n", - "\n", - " calib = CalibratedClassifierCV(\n", - " model,\n", - " method=\"isotonic\",\n", - " cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED),\n", - " )\n", - " calib.fit(X_train, y_train)\n", - "\n", - " # we need p_malicious = P(y=0). Most sklearn returns prob for class 1 -> P(y=1) (legit)\n", - " p_legit = calib.predict_proba(X_val)[:, 1]\n", - " p_mal = 1.0 - p_legit\n", - "\n", - " y_hat_phish = (p_mal >= 0.5).astype(int) # 1 means \"predict phish\"\n", - " y_pred = 1 - y_hat_phish # map back to y-space (1=legit, 0=phish)\n", - "\n", - " # core metrics\n", - " f1m = f1_score(y_val, y_pred, average=\"macro\") # temp decision at 0.5 on p_mal\n", - " prauc = average_precision_score(\n", - " (y_val == 0).astype(int), p_mal\n", - " ) # AP wrt phishing as positive class\n", - " brier = brier_score_loss((y_val == 0).astype(int), p_mal) # smaller=better\n", - " return (\n", - " calib,\n", - " {\n", - " \"f1_macro@0.5_on_p_mal\": float(f1m),\n", - " \"pr_auc_phish\": float(prauc),\n", - " \"brier_phish\": float(brier),\n", - " },\n", - " p_mal,\n", - " )\n", - "\n", - "\n", - "results, calibrated, pvals = {}, {}, {}\n", - "for name, model in candidates.items():\n", - " cls, metrics, p_mal = fit_calibrated(name, model)\n", - " calibrated[name] = cls\n", - " pvals[name] = p_mal\n", - " results[name] = metrics" - ] - }, - { - "cell_type": "markdown", - "id": "4263f55e", - "metadata": {}, - "source": [ - "### **Pick best by PR-AUC (tie-break F1)**" - ] - }, - 
{ - "cell_type": "code", - "execution_count": 10, - "id": "106aa1c6", - "metadata": {}, - "outputs": [], - "source": [ - "order = sorted(\n", - " results.items(),\n", - " key=lambda kv: (kv[1][\"pr_auc_phish\"], kv[1][\"f1_macro@0.5_on_p_mal\"]),\n", - " reverse=True,\n", - ")\n", - "best_name, best_metrics = order[0]\n", - "best_model = calibrated[best_name]\n", - "p_mal = pvals[best_name]" - ] - }, - { - "cell_type": "markdown", - "id": "1d969f08", - "metadata": {}, - "source": [ - "### **Find single threshold (t) maximizing F1-macro**" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "b4532fd6", - "metadata": {}, - "outputs": [], - "source": [ - "grid = np.linspace(0.05, 0.95, 19)\n", - "f1s = []\n", - "for t in grid:\n", - " y_hat = (p_mal >= t).astype(int) # 1=phish prediction if p_mal>=t\n", - " # but our y is 0=phish, 1=legit → map predictions to y-space:\n", - " y_pred = 1 - y_hat\n", - " f1s.append(f1_score(y_val, y_pred, average=\"macro\"))\n", - "t_star = float(grid[int(np.argmax(f1s))])" - ] - }, - { - "cell_type": "markdown", - "id": "45b91956", - "metadata": {}, - "source": [ - "### **Choose a symmetric band around t for a target gray-zone rate**" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "03f19779", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'t_star': 0.44999999999999996, 'low': 0.18828124999999996, 'high': 0.71171875, 'gray_zone_rate': 0.09958788290776224}\n" - ] - } - ], - "source": [ - "def pick_band_for_target(\n", - " p_mal: np.ndarray, t_star: float, target=0.10, tol=0.002, max_iters=40\n", - "):\n", - " lo, hi = 0.0, 0.5 # search bounds on half-width\n", - "\n", - " def gray(half_w):\n", - " low = max(0.0, t_star - half_w)\n", - " high = min(1.0, t_star + half_w)\n", - " return ((p_mal >= low) & (p_mal < high)).mean(), low, high\n", - "\n", - " for _ in range(max_iters):\n", - " half_w = (lo + hi) / 2\n", - " g, low, high = gray(half_w)\n", 
- " if g > target + tol: # too wide -> shrink\n", - " hi = half_w\n", - " elif g < target - tol: # too narrow -> widen\n", - " lo = half_w\n", - " else:\n", - " low_f, high_f = float(low), float(high)\n", - " return low_f, high_f, float(g)\n", - "\n", - " g, low, high = gray((lo + hi) / 2)\n", - " return float(low), float(high), float(g)\n", - "\n", - "\n", - "# Compute the band\n", - "low, high, gray = pick_band_for_target(p_mal, t_star=t_star, target=0.10)\n", - "print({\"t_star\": float(t_star), \"low\": low, \"high\": high, \"gray_zone_rate\": gray})" - ] - }, - { - "cell_type": "markdown", - "id": "fccab024", - "metadata": {}, - "source": [ - "### **Expand to gray-zone band around t targeting ~10–15%**" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "cd32ca0a", - "metadata": {}, - "outputs": [], - "source": [ - "target_lo, target_hi = 0.10, 0.15\n", - "band_candidates = np.linspace(0.05, 0.40, 8) # half-widths\n", - "chosen = (t_star, max(0.0, t_star - 0.10), min(1.0, t_star + 0.10), 0.0) # default\n", - "for w in band_candidates:\n", - " low, high = max(0.0, t_star - w), min(1.0, t_star + w)\n", - " gray = ((p_mal >= low) & (p_mal < high)).mean()\n", - " if target_lo <= gray <= target_hi:\n", - " chosen = (t_star, float(low), float(high), float(gray))\n", - " break\n", - "t_star, low, high, gray_rate = chosen" - ] - }, - { - "cell_type": "markdown", - "id": "e07e3e58", - "metadata": {}, - "source": [ - "### **Final metrics (forced decision and gray-zone rate)**" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "811c0936", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Selection: xgb {'f1_macro@0.5_on_p_mal': 0.9163419573449059, 'pr_auc_phish': 0.9577761979950676, 'brier_phish': 0.06481116607409061}\n", - "Thresholds: {'t_star': 0.44999999999999996, 'low': 0.14999999999999997, 'high': 0.75, 'gray_zone_rate': 0.14232909886561584}\n" - ] - } - ], - "source": [ - 
"y_hat_star = (p_mal >= t_star).astype(int)\n", - "y_pred_star = 1 - y_hat_star\n", - "final_f1 = f1_score(y_val, y_pred_star, average=\"macro\")\n", - "final_pr = average_precision_score((y_val == 0).astype(int), p_mal)\n", - "\n", - "summary = {\n", - " \"data_file\": str(DATA),\n", - " \"best_model\": best_name,\n", - " \"metrics_val\": {\n", - " \"pr_auc_phish\": final_pr,\n", - " \"f1_macro@t_star\": final_f1,\n", - " \"brier_phish\": brier_score_loss((y_val == 0).astype(int), p_mal),\n", - " },\n", - " \"thresholds\": {\n", - " \"t_star\": float(t_star),\n", - " \"low\": float(low),\n", - " \"high\": float(high),\n", - " \"gray_zone_rate\": float(gray),\n", - " },\n", - " \"class_mapping\": {\"phish\": 0, \"legit\": 1},\n", - " \"seed\": SEED,\n", - "}\n", - "print(\"Selection:\", best_name, best_metrics)\n", - "print(\"Thresholds:\", summary[\"thresholds\"])" - ] - }, - { - "cell_type": "markdown", - "id": "be95ce64", - "metadata": {}, - "source": [ - "## **log to MLflow + export thresholds.json**" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "e4f268e2", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2025/09/17 11:59:15 INFO mlflow.tracking.fluent: Experiment with name 'phiusiil_baselines' does not exist. 
Creating a new experiment.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "🏃 View run xgb_calibrated at: http://localhost:5000/#/experiments/1/runs/c80b541349ed467095e1f1155b9bf8b8\n", - "🧪 View experiment at: http://localhost:5000/#/experiments/1\n", - "MLflow tracking URI: http://localhost:5000\n", - "Wrote thresholds → D:\\MLops\\NetworkSecurity\\configs\\dev\\thresholds.json\n" - ] - } - ], - "source": [ - "# log to MLflow + export thresholds.json\n", - "mlflow.set_tracking_uri(MLFLOW_URI)\n", - "mlflow.set_experiment(EXPERIMENT)\n", - "with mlflow.start_run(run_name=f\"{best_name}_calibrated\"):\n", - " mlflow.log_params(\n", - " {\n", - " \"model\": best_name,\n", - " \"calibration\": \"isotonic_cv5\",\n", - " \"seed\": SEED,\n", - " \"features\": X.shape[1],\n", - " \"train_rows\": int(len(X_train)),\n", - " \"val_rows\": int(len(X_val)),\n", - " \"data_file\": str(DATA),\n", - " }\n", - " )\n", - " mlflow.log_metrics(\n", - " {\n", - " \"val_pr_auc_phish\": summary[\"metrics_val\"][\"pr_auc_phish\"],\n", - " \"val_f1_macro_t_star\": summary[\"metrics_val\"][\"f1_macro@t_star\"],\n", - " \"val_brier_phish\": summary[\"metrics_val\"][\"brier_phish\"],\n", - " \"gray_zone_rate\": summary[\"thresholds\"][\"gray_zone_rate\"],\n", - " \"t_star\": summary[\"thresholds\"][\"t_star\"],\n", - " \"low\": summary[\"thresholds\"][\"low\"],\n", - " \"high\": summary[\"thresholds\"][\"high\"],\n", - " }\n", - " )\n", - " # Save/export thresholds for serving\n", - " with open(THRESH_PATH, \"w\", encoding=\"utf-8\") as f:\n", - " json.dump(\n", - " {\n", - " \"model\": best_name,\n", - " \"class_mapping\": summary[\"class_mapping\"],\n", - " \"calibration\": {\"method\": \"isotonic\", \"cv\": 5},\n", - " \"thresholds\": summary[\"thresholds\"],\n", - " \"data\": {\"file\": summary[\"data_file\"]},\n", - " \"seed\": summary[\"seed\"],\n", - " },\n", - " f,\n", - " indent=2,\n", - " )\n", - " mlflow.log_artifact(THRESH_PATH)\n", - "\n", - 
"print(f\"MLflow tracking URI: {MLFLOW_URI}\")\n", - "print(f\"Wrote thresholds → {THRESH_PATH.resolve()}\")\n" - ] - }, - { - "cell_type": "markdown", - "id": "b061323d", - "metadata": {}, - "source": [ - "## **Persist trained model + metadata for model_svc**\n", - "\n", - "Save the calibrated model and metadata to `models/dev/` for serving." - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "42ea9f3b", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[artifact] model: models\\dev\\model.pkl md5: f022f9a181bd7bb406ce544f2947ad1f\n", - "[artifact] meta : models\\dev\\model_meta.json\n", - "[artifact] features: 8 → ['url_len', 'url_digit_ratio', 'url_subdomains', 'NoOfOtherSpecialCharsInURL', 'SpacialCharRatioInURL', 'CharContinuationRate', 'URLCharProb', 'TLDLegitimateProb'] ...\n" - ] - } - ], - "source": [ - "# === Persist trained model + metadata for model_svc ===\n", - "from __future__ import annotations\n", - "import json, hashlib\n", - "from pathlib import Path\n", - "import joblib\n", - "\n", - "# 1) Locate the fitted estimator (adjust the list if your var names differ)\n", - "candidates = [\n", - " globals().get(\"calibrated_clf\"),\n", - " globals().get(\"calibrated_model\"),\n", - " globals().get(\"best_model\"),\n", - " globals().get(\"best_clf\"),\n", - " globals().get(\"final_model\"),\n", - " globals().get(\"clf\"),\n", - "]\n", - "fitted = next((m for m in candidates if m is not None), None)\n", - "if fitted is None:\n", - " raise ValueError(\n", - " \"Could not find a fitted estimator (expected one of: calibrated_clf, calibrated_model, best_model, best_clf, final_model, clf)\"\n", - " )\n", - "\n", - "# 2) Infer feature order (priority: X_train -> X_val -> manifest include list)\n", - "feature_order = None\n", - "for X_name in (\"X_train\", \"X_val\", \"X_features\"):\n", - " if X_name in globals() and hasattr(globals()[X_name], \"columns\"):\n", - " feature_order = 
list(globals()[X_name].columns)\n", - " break\n", - "if feature_order is None:\n", - " # fallback to config manifest\n", - "\n", - " manifest_path = Path(\"configs/dev/features_url_only.yaml\")\n", - " if manifest_path.exists():\n", - " data = yaml.safe_load(manifest_path.read_text(encoding=\"utf-8\"))\n", - " feature_order = list(data.get(\"include\", []))\n", - "if not feature_order:\n", - " raise ValueError(\n", - " \"Feature order not found; please expose X_train/X_val with columns or ensure configs/dev/features_url_only.yaml has 'include'.\"\n", - " )\n", - "\n", - "# 3) Determine which class id corresponds to 'phish'\n", - "# Our convention so far: class 0 == 'phish', class 1 == 'legit'.\n", - "phish_class_id = 0\n", - "if hasattr(fitted, \"classes_\"):\n", - " classes = list(getattr(fitted, \"classes_\"))\n", - " if 0 in classes:\n", - " phish_class_id = classes.index(0) # index in predict_proba columns\n", - " else:\n", - " # If classes are strings or different mapping, prefer the first column but record classes\n", - " phish_class_id = 0\n", - "\n", - "# 4) Build metadata (small but sufficient for serving)\n", - "meta = {\n", - " \"feature_order\": feature_order,\n", - " \"class_mapping\": {\"phish\": 0, \"legit\": 1}, # training-time label mapping\n", - " \"phish_proba_col_index\": phish_class_id, # column index in predict_proba for P(phish)\n", - " \"model_type\": type(fitted).__name__,\n", - " \"notes\": \"URL-only baseline; calibrated; saved for model_svc.\",\n", - "}\n", - "\n", - "# 5) Output paths\n", - "ART_DIR = Path(\"models/dev\")\n", - "ART_DIR.mkdir(parents=True, exist_ok=True)\n", - "MODEL_PATH = ART_DIR / \"model.pkl\"\n", - "META_PATH = ART_DIR / \"model_meta.json\"\n", - "\n", - "# 6) Save\n", - "joblib.dump(fitted, MODEL_PATH)\n", - "META_PATH.write_text(json.dumps(meta, indent=2), encoding=\"utf-8\")\n", - "\n", - "# 7) Fingerprint for auditability\n", - "m = hashlib.md5(MODEL_PATH.read_bytes()).hexdigest()\n", - "print(\"[artifact] 
model:\", MODEL_PATH, \"md5:\", m)\n", - "print(\"[artifact] meta :\", META_PATH)\n", - "print(\"[artifact] features:\", len(feature_order), \"→\", feature_order[:8], \"...\")" - ] - }, - { - "cell_type": "markdown", - "id": "0bb91fbc", - "metadata": {}, - "source": [ - "---\n", - "---\n", - "# PART 2: Retrain with Shared Feature Extraction\n", - "\n", - "**Context:** After deploying the model, we discovered training/serving skew - the models were trained on features extracted by PhiUSIIL authors, but production used our `src/common/feature_extraction.py`. This caused all predictions to return 1.0.\n", - "\n", - "**Goal:** Retrain models using features extracted with our shared library to ensure training/serving consistency.\n", - "\n", - "**Validation Checkpoint:** Compare performance (PR-AUC, F1, Brier) with original models above." - ] - }, - { - "cell_type": "markdown", - "id": "fd90ca37", - "metadata": {}, - "source": [ - "## Load New Features" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8e15f10e", - "metadata": {}, - "outputs": [], - "source": [ - "# Load features extracted with shared library\n", - "DATA_V2 = Path(\"data/processed/phiusiil_features_v2.csv\")\n", - "\n", - "if not DATA_V2.exists():\n", - " raise FileNotFoundError(\n", - " f\"New features not found: {DATA_V2}\\n\"\n", - " f\"Run notebooks/feature_engineering.ipynb first\"\n", - " )\n", - "\n", - "df_v2 = pd.read_csv(DATA_V2)\n", - "\n", - "print(f\"Loaded: {DATA_V2}\")\n", - "print(f\"Shape: {df_v2.shape}\")\n", - "print(f\"Columns: {list(df_v2.columns)}\")" - ] - }, - { - "cell_type": "markdown", - "id": "d9684ad4", - "metadata": {}, - "source": [ - "## Feature Distribution Comparison" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e548282d", - "metadata": {}, - "outputs": [], - "source": [ - "# Compare distributions of key features\n", - "print(\"Feature distribution comparison (Old vs New):\\n\")\n", - "\n", - "feature_cols_v2 = [c for c 
in df_v2.columns if c not in [\"URL\", \"label\"]]\n", - "\n", - "comparison_data = []\n", - "for feat in feature_cols_v2:\n", - " if feat in df.columns: # If feature existed in old data\n", - " old_mean = df[feat].mean()\n", - " new_mean = df_v2[feat].mean()\n", - " diff = abs(old_mean - new_mean)\n", - " status = \"✓\" if diff < 0.1 else \"⚠️\"\n", - "\n", - " comparison_data.append(\n", - " {\n", - " \"Feature\": feat,\n", - " \"Old Mean\": old_mean,\n", - " \"New Mean\": new_mean,\n", - " \"Diff\": diff,\n", - " \"Status\": status,\n", - " }\n", - " )\n", - "\n", - " print(\n", - " f\"{status} {feat:35s} Old: {old_mean:7.4f} New: {new_mean:7.4f} Δ: {diff:7.4f}\"\n", - " )\n", - "\n", - "# Save comparison for later reference\n", - "df_comparison = pd.DataFrame(comparison_data)\n", - "df_comparison.to_csv(\"outputs/feature_comparison_v1_vs_v2.csv\", index=False)\n", - "print(f\"\\n✓ Saved comparison to outputs/feature_comparison_v1_vs_v2.csv\")" - ] - }, - { - "cell_type": "markdown", - "id": "125984d4", - "metadata": {}, - "source": [ - "## Prepare Data (Same 7 Features)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "437c2c70", - "metadata": {}, - "outputs": [], - "source": [ - "# Use same 7 features as original model\n", - "FEATURES_7 = [\n", - " \"TLDLegitimateProb\",\n", - " \"CharContinuationRate\",\n", - " \"SpacialCharRatioInURL\",\n", - " \"URLCharProb\",\n", - " \"LetterRatioInURL\",\n", - " \"NoOfOtherSpecialCharsInURL\",\n", - " \"DomainLength\",\n", - "]\n", - "\n", - "# Verify all features exist\n", - "missing = [f for f in FEATURES_7 if f not in df_v2.columns]\n", - "if missing:\n", - " raise ValueError(f\"Missing features in new data: {missing}\")\n", - "\n", - "X_v2 = df_v2[FEATURES_7].copy()\n", - "y_v2 = df_v2[\"label\"].astype(int).values\n", - "\n", - "print(f\"Feature matrix shape: {X_v2.shape}\")\n", - "print(f\"Label distribution:\\n{pd.Series(y_v2).value_counts()}\")\n", - "print(f\"\\nFeatures: {FEATURES_7}\")" - 
] - }, - { - "cell_type": "markdown", - "id": "1455af54", - "metadata": {}, - "source": [ - "## Train/Val Split" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "287105bc", - "metadata": {}, - "outputs": [], - "source": [ - "# Same split as original (20% validation, stratified, same seed)\n", - "X_train_v2, X_val_v2, y_train_v2, y_val_v2 = train_test_split(\n", - " X_v2, y_v2, test_size=0.20, stratify=y_v2, random_state=SEED\n", - ")\n", - "\n", - "print(f\"Train: {X_train_v2.shape}\")\n", - "print(f\"Val: {X_val_v2.shape}\")" - ] - }, - { - "cell_type": "markdown", - "id": "3b38a4bd", - "metadata": {}, - "source": [ - "## Train 7-Feature Model (v2)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "38cbaeb5", - "metadata": {}, - "outputs": [], - "source": [ - "# Use same model architecture as original\n", - "xgb_base_v2 = XGBClassifier(\n", - " n_estimators=300,\n", - " max_depth=6,\n", - " learning_rate=0.1,\n", - " subsample=0.9,\n", - " colsample_bytree=0.9,\n", - " reg_lambda=1.0,\n", - " random_state=SEED,\n", - " n_jobs=0,\n", - " objective=\"binary:logistic\",\n", - " verbose=False,\n", - ")\n", - "\n", - "# Calibrate with isotonic (same as original)\n", - "calib_v2 = CalibratedClassifierCV(\n", - " xgb_base_v2,\n", - " method=\"isotonic\",\n", - " cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED),\n", - ")\n", - "\n", - "print(\"Training 7-feature model (v2)...\")\n", - "calib_v2.fit(X_train_v2, y_train_v2)\n", - "print(\"✓ Training complete\")" - ] - }, - { - "cell_type": "markdown", - "id": "7e32ac02", - "metadata": {}, - "source": [ - "## Evaluate 7-Feature Model (v2)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2a192c46", - "metadata": {}, - "outputs": [], - "source": [ - "# Get predictions\n", - "p_legit_v2 = calib_v2.predict_proba(X_val_v2)[:, 1] # P(legit)\n", - "p_mal_v2 = 1.0 - p_legit_v2 # P(phish)\n", - "\n", - "# Calculate metrics\n", - "y_val_phish_v2 = 
(y_val_v2 == 0).astype(int) # 1=phish for PR-AUC\n", - "\n", - "pr_auc_v2 = average_precision_score(y_val_phish_v2, p_mal_v2)\n", - "brier_v2 = brier_score_loss(y_val_phish_v2, p_mal_v2)\n", - "\n", - "# F1 at optimal threshold\n", - "grid = np.linspace(0.05, 0.95, 19)\n", - "f1s_v2 = []\n", - "for t in grid:\n", - " y_hat = (p_mal_v2 >= t).astype(int)\n", - " y_pred = 1 - y_hat\n", - " f1s_v2.append(f1_score(y_val_v2, y_pred, average=\"macro\"))\n", - "f1_v2 = max(f1s_v2)\n", - "\n", - "print(\"\\n\" + \"=\" * 60)\n", - "print(\"7-FEATURE MODEL (V2) PERFORMANCE\")\n", - "print(\"=\" * 60)\n", - "print(f\"PR-AUC (phish): {pr_auc_v2:.6f}\")\n", - "print(f\"F1-Macro: {f1_v2:.6f}\")\n", - "print(f\"Brier Score: {brier_v2:.6f}\")\n", - "print(\"=\" * 60)" - ] - }, - { - "cell_type": "markdown", - "id": "7e6c806d", - "metadata": {}, - "source": [ - "## Compare with Original Model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "318db925", - "metadata": {}, - "outputs": [], - "source": [ - "# Get original model metrics (from Part 1 above)\n", - "try:\n", - " # These should be defined from Part 1\n", - " pr_auc_original = final_pr # or extract from summary dict\n", - " f1_original = final_f1\n", - " brier_original = summary[\"metrics_val\"][\"brier_phish\"]\n", - "\n", - " print(\"\\n\" + \"=\" * 60)\n", - " print(\"MODEL COMPARISON: ORIGINAL vs V2\")\n", - " print(\"=\" * 60)\n", - " print(f\"{'Metric':<20} {'Original':>12} {'V2':>12} {'Δ':>12} {'Status':>10}\")\n", - " print(\"-\" * 60)\n", - "\n", - " # PR-AUC comparison\n", - " pr_diff = pr_auc_v2 - pr_auc_original\n", - " pr_status = \"✓\" if pr_auc_v2 > 0.95 else \"⚠️\"\n", - " print(\n", - " f\"{'PR-AUC':<20} {pr_auc_original:>12.6f} {pr_auc_v2:>12.6f} {pr_diff:>+12.6f} {pr_status:>10}\"\n", - " )\n", - "\n", - " # F1 comparison\n", - " f1_diff = f1_v2 - f1_original\n", - " f1_status = \"✓\" if f1_v2 > 0.95 else \"⚠️\"\n", - " print(\n", - " f\"{'F1-Macro':<20} {f1_original:>12.6f} 
{f1_v2:>12.6f} {f1_diff:>+12.6f} {f1_status:>10}\"\n", - " )\n", - "\n", - " # Brier comparison\n", - " brier_diff = brier_v2 - brier_original\n", - " brier_status = \"✓\" if brier_v2 < 0.01 else \"⚠️\"\n", - " print(\n", - " f\"{'Brier Score':<20} {brier_original:>12.6f} {brier_v2:>12.6f} {brier_diff:>+12.6f} {brier_status:>10}\"\n", - " )\n", - "\n", - " print(\"=\" * 60)\n", - "\n", - " # Overall assessment\n", - " if pr_auc_v2 > 0.95 and f1_v2 > 0.95:\n", - " print(\"\\n✅ V2 MODEL PERFORMANCE: ACCEPTABLE\")\n", - " print(\" → Performance maintained with shared feature extraction\")\n", - " print(\" → Safe to deploy v2 models\")\n", - " else:\n", - " print(\"\\n⚠️ V2 MODEL PERFORMANCE: BELOW THRESHOLD\")\n", - " print(\" → Feature distribution shift impacted model quality\")\n", - " print(\" → Recommendation: Redo feature selection with new features\")\n", - "\n", - "except NameError:\n", - " print(\"⚠️ Original model metrics not found - run Part 1 first\")" - ] - }, - { - "cell_type": "markdown", - "id": "ae430aa3", - "metadata": {}, - "source": [ - "## Spot Check Predictions" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f3bcf404", - "metadata": {}, - "outputs": [], - "source": [ - "# Test on known URLs using our feature extraction\n", - "import sys\n", - "\n", - "sys.path.insert(0, str(Path.cwd() / \"src\"))\n", - "from common.feature_extraction import extract_features\n", - "\n", - "test_urls = [\n", - " (\"https://google.com\", \"Legitimate\", 0.02),\n", - " (\"https://github.com\", \"Legitimate\", 0.02),\n", - " (\"https://microsoft.com\", \"Legitimate\", 0.02),\n", - " (\"http://phishing.top/login\", \"Phishing\", 0.95),\n", - " (\"http://secure-bank-verify.tk/account\", \"Phishing\", 0.95),\n", - " (\"http://paypal-secure-login.ml/update\", \"Phishing\", 0.95),\n", - "]\n", - "\n", - "print(\"\\n\" + \"=\" * 60)\n", - "print(\"SPOT CHECK: KNOWN URLS\")\n", - "print(\"=\" * 60)\n", - "print(f\"{'URL':<45} {'Expected':<12} 
{'p_mal':>8} {'Status':>8}\")\n", - "print(\"-\" * 60)\n", - "\n", - "spot_check_results = []\n", - "\n", - "for url, expected_class, expected_score in test_urls:\n", - " try:\n", - " # Extract features\n", - " features = extract_features(url, include_https=False)\n", - "\n", - " # Create DataFrame with correct feature order\n", - " X_test = pd.DataFrame([features])[FEATURES_7]\n", - "\n", - " # Predict\n", - " p_legit = calib_v2.predict_proba(X_test)[0, 1]\n", - " p_mal = 1.0 - p_legit\n", - "\n", - " # Check if prediction is correct\n", - " if expected_class == \"Legitimate\":\n", - " status = \"✓\" if p_mal < 0.3 else \"✗\"\n", - " else: # Phishing\n", - " status = \"✓\" if p_mal > 0.7 else \"✗\"\n", - "\n", - " spot_check_results.append(\n", - " {\n", - " \"url\": url,\n", - " \"expected\": expected_class,\n", - " \"p_malicious\": p_mal,\n", - " \"correct\": status == \"✓\",\n", - " }\n", - " )\n", - "\n", - " print(f\"{url[:45]:<45} {expected_class:<12} {p_mal:>8.4f} {status:>8}\")\n", - "\n", - " except Exception as e:\n", - " print(f\"{url[:45]:<45} {'ERROR':<12} {'N/A':>8} {'✗':>8}\")\n", - " print(f\" Error: {e}\")\n", - "\n", - "print(\"=\" * 60)\n", - "\n", - "# Summary\n", - "correct_count = sum(r[\"correct\"] for r in spot_check_results)\n", - "total_count = len(spot_check_results)\n", - "accuracy = correct_count / total_count if total_count > 0 else 0\n", - "\n", - "print(f\"\\nSpot Check Accuracy: {correct_count}/{total_count} ({accuracy:.1%})\")\n", - "\n", - "if accuracy == 1.0:\n", - " print(\"✅ All spot checks passed!\")\n", - "elif accuracy >= 0.8:\n", - " print(\"⚠️ Most spot checks passed, but some failed\")\n", - "else:\n", - " print(\"❌ Many spot checks failed - model may not be working correctly\")" - ] - }, - { - "cell_type": "markdown", - "id": "469d6e94", - "metadata": {}, - "source": [ - "## Save V2 Model (If Performance Acceptable)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "445843fc", - "metadata": {}, - 
"outputs": [], - "source": [ - "# Only save if performance is acceptable\n", - "if pr_auc_v2 > 0.95 and f1_v2 > 0.95:\n", - " # Save model\n", - " MODEL_PATH_V2 = Path(\"models/dev/model_7feat_v2.pkl\")\n", - " joblib.dump(calib_v2, MODEL_PATH_V2)\n", - " print(f\"✓ Saved model to {MODEL_PATH_V2}\")\n", - "\n", - " # Save metadata\n", - " META_PATH_V2 = Path(\"models/dev/model_7feat_v2_meta.json\")\n", - " meta_v2 = {\n", - " \"feature_order\": FEATURES_7,\n", - " \"class_mapping\": {\"phish\": 0, \"legit\": 1},\n", - " \"phish_proba_col_index\": 0,\n", - " \"model_type\": \"CalibratedClassifierCV\",\n", - " \"calibration\": \"isotonic_cv5\",\n", - " \"training_date\": pd.Timestamp.now().isoformat(),\n", - " \"seed\": SEED,\n", - " \"metrics\": {\n", - " \"pr_auc\": float(pr_auc_v2),\n", - " \"f1_macro\": float(f1_v2),\n", - " \"brier\": float(brier_v2),\n", - " },\n", - " \"notes\": \"7-feature model trained with shared feature extraction library (v2)\",\n", - " \"data_source\": \"phiusiil_features_v2.csv\",\n", - " }\n", - "\n", - " META_PATH_V2.write_text(json.dumps(meta_v2, indent=2), encoding=\"utf-8\")\n", - " print(f\"✓ Saved metadata to {META_PATH_V2}\")\n", - "\n", - " print(\"\\n✅ V2 MODEL SAVED - Ready for deployment\")\n", - "else:\n", - " print(\"\\n⚠️ V2 MODEL NOT SAVED - Performance below threshold\")\n", - " print(\" Next steps: Redo feature selection with new features\")" - ] - }, - { - "cell_type": "markdown", - "id": "8ca5d068", - "metadata": {}, - "source": [ - "## Decision: Which Model to Use?" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "61857b5d", - "metadata": {}, - "outputs": [], - "source": [ - "print(\"\\n\" + \"=\" * 60)\n", - "print(\"DEPLOYMENT RECOMMENDATION\")\n", - "print(\"=\" * 60)\n", - "\n", - "if pr_auc_v2 > 0.95 and f1_v2 > 0.95:\n", - " print(\"\\n✅ DEPLOY V2 MODEL\")\n", - " print(\"\\nReasons:\")\n", - " print(\" 1. Performance maintained (PR-AUC > 0.95, F1 > 0.95)\")\n", - " print(\" 2. 
Training/serving consistency (same feature extraction)\")\n", - " print(\" 3. Spot checks pass on known URLs\")\n", - " print(\"\\nNext steps:\")\n", - " print(\" 1. Update model service to use model_7feat_v2.pkl\")\n", - " print(\" 2. Update config.yaml to point to v2 models\")\n", - " print(\" 3. Test end-to-end with model service\")\n", - " print(\" 4. Document the training/serving skew fix\")\n", - "else:\n", - " print(\"\\n⚠️ DO NOT DEPLOY V2 MODEL\")\n", - " print(\"\\nReasons:\")\n", - " print(f\" 1. Performance degraded: PR-AUC={pr_auc_v2:.4f} (target: >0.95)\")\n", - " print(\" 2. Feature distribution shift too large\")\n", - " print(\"\\nNext steps:\")\n", - " print(\" 1. Run feature selection on new features (phiusiil_features_v2.csv)\")\n", - " print(\" 2. Identify optimal feature set for NEW distributions\")\n", - " print(\" 3. Retrain with optimal features\")\n", - " print(\" 4. Re-evaluate performance\")\n", - "\n", - "print(\"=\" * 60)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.11" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/notebooks/archive/04_robustness_checks.ipynb b/notebooks/archive/04_robustness_checks.ipynb deleted file mode 100644 index ac796a6..0000000 --- a/notebooks/archive/04_robustness_checks.ipynb +++ /dev/null @@ -1,394 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "c8d2692e", - "metadata": {}, - "source": [ - "## **robustness_checker**" - ] - }, - { - "cell_type": "markdown", - "id": "ed2ec4c3", - "metadata": {}, - "source": [ - "### **Import libraries**" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "eb3e5fc2", - "metadata": {}, - "outputs": [], - 
"source": [ - "from pathlib import Path\n", - "import numpy as np, pandas as pd\n", - "from sklearn.model_selection import train_test_split, StratifiedKFold\n", - "from sklearn.metrics import average_precision_score, brier_score_loss\n", - "from xgboost import XGBClassifier\n", - "import matplotlib.pyplot as plt\n", - "import os" - ] - }, - { - "cell_type": "markdown", - "id": "3b397718", - "metadata": {}, - "source": [ - "### **Load the dataset**" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "a0c052dc", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "d:\\MLops\\NetworkSecurity\n" - ] - } - ], - "source": [ - "os.chdir(\"../\")\n", - "print(os.getcwd())" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "f144f7dd", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Checking for: D:\\MLops\\NetworkSecurity\\data\\processed\\phiusiil_clean.csv\n", - "Exists? True\n", - "Using dataset: data\\processed\\phiusiil_clean.csv\n" - ] - } - ], - "source": [ - "\n", - "\n", - "SEED = 42\n", - "CLEAN = Path(\"data/processed/phiusiil_clean.csv\")\n", - "RAW = Path(\"data/raw/PhiUSIIL_Phishing_URL_Dataset.csv\")\n", - "DATA = CLEAN if CLEAN.exists() else RAW\n", - "print(\"Checking for:\", CLEAN.resolve())\n", - "print(\"Exists?\", CLEAN.exists())\n", - "print(f\"Using dataset: {DATA}\")" - ] - }, - { - "cell_type": "markdown", - "id": "903ec957", - "metadata": {}, - "source": [ - "### **Load & basic prep**" - ] - }, - { - "cell_type": "markdown", - "id": "ef2e100b", - "metadata": {}, - "source": [ - "This code block loads the dataset, automatically locates the label column, and prepares the target variable and features for modeling. It ensures that only numeric columns are used as features, removes the label and URL columns, and prints out the number of numeric features available for analysis. 
The data is then split into training and validation sets with stratified sampling to maintain class balance, and a version of the validation labels is created to support precision-recall metrics focused on phishing detection." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ca4df2cf", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Features: 50 numeric columns\n" - ] - } - ], - "source": [ - "\n", - "df = pd.read_csv(DATA, encoding_errors=\"ignore\")\n", - "\n", - "label_col = next((c for c in df.columns if c.lower() in {\"label\",\"result\",\"y\",\"target\"}), None)\n", - "assert label_col, \"No label column found\"\n", - "\n", - "y = df[label_col].astype(int).values # 1=legit, 0=phish\n", - "y_phish = (y == 0).astype(int) # for PR-AUC where positive=phish\n", - "\n", - "urls = df[\"URL\"].astype(str).values if \"URL\" in df.columns else np.array([\"\"]*len(df))\n", - "X_all = df.drop(columns=[label_col])\n", - "if \"URL\" in X_all.columns:\n", - " X_all = X_all.drop(columns=[\"URL\"])\n", - "\n", - "X_all = X_all.select_dtypes(include=[\"number\"]).copy()\n", - "print(f\"Features: {X_all.shape[1]} numeric columns\")\n", - "\n", - "X_train, X_val, y_train, y_val, u_train, u_val = train_test_split(\n", - " X_all, y, urls, test_size=0.20, stratify=y, random_state=SEED\n", - ")\n", - "\n", - "y_val_phish = (y_val == 0).astype(int)" - ] - }, - { - "cell_type": "markdown", - "id": "e50dd38b", - "metadata": {}, - "source": [ - "### **Utility: fit a small XGB and return p_mal = P(phish)**" - ] - }, - { - "cell_type": "markdown", - "id": "0e772504", - "metadata": {}, - "source": [ - "This function trains an XGBoost classifier to predict the probability that a sample is phishing, using the provided training features and labels. 
It sets up the model with specific hyperparameters, converts the labels so that phishing is treated as the positive class, and returns the predicted probabilities for the test set. This utility helps quickly evaluate how well the model can distinguish phishing samples from legitimate ones." - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "bd159b06", - "metadata": {}, - "outputs": [], - "source": [ - "def fit_xgb(Xtr, ytr, Xte, seed=SEED, n_estimators=200):\n", - " clf = XGBClassifier(\n", - " n_estimators=n_estimators, max_depth=6, learning_rate=0.1,\n", - " subsample=0.9, colsample_bytree=0.9, reg_lambda=1.0,\n", - " random_state=seed, n_jobs=0, objective=\"binary:logistic\"\n", - " )\n", - " clf.fit(Xtr, (ytr==0).astype(int)) # train with phish=1 target\n", - " p_phish = clf.predict_proba(Xte)[:,1] # P(phish)\n", - " return p_phish" - ] - }, - { - "cell_type": "markdown", - "id": "d86194c7", - "metadata": {}, - "source": [ - "### **A1-** **Label-shuffle baseline**" - ] - }, - { - "cell_type": "markdown", - "id": "fb9773b7", - "metadata": {}, - "source": [ - "This code block tests the model’s robustness by randomly shuffling the phishing labels in the training set, breaking any real relationship between features and labels. It then trains the model on this shuffled data and evaluates its performance on the validation set, expecting metrics like average precision and Brier score to drop to chance levels (around 0.5 for AP). This helps confirm that the model is not learning shortcuts or artifacts from the data, and that high scores on real data are meaningful." 
- ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "c0926639", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[A1] Label-shuffle → AP=0.422 (≈0.5 expected), Brier=0.247\n" - ] - } - ], - "source": [ - "rng = np.random.default_rng(SEED)\n", - "y_train_shuf = rng.permutation((y_train==0).astype(int)) # shuffle phish labels\n", - "p_shuf = fit_xgb(X_train, 1 - y_train_shuf, X_val) # invert back to y-coding for fit helper\n", - "ap_shuf = average_precision_score(y_val_phish, p_shuf)\n", - "brier_shuf = brier_score_loss(y_val_phish, p_shuf)\n", - "print(f\"[A1] Label-shuffle → AP={ap_shuf:.3f} (≈0.5 expected), Brier={brier_shuf:.3f}\")" - ] - }, - { - "cell_type": "markdown", - "id": "8b02a786", - "metadata": {}, - "source": [ - "### **A2- Single-feature PR-AUC scanning**" - ] - }, - { - "cell_type": "markdown", - "id": "8a5eecf7", - "metadata": {}, - "source": [ - "This code scans each feature in the validation set to see how well it alone can separate phishing from legitimate samples, using average precision (AP) as the metric. It tests both the original and inverted values of each feature to account for sign direction, then records the best AP for each. Features with very high AP (>0.995) are flagged as potential shortcuts, meaning they might allow the model to easily distinguish classes due to data artifacts or policy-sensitive information." 
- ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "03c98316", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[A2] Top-10 single-feature AP (max over orientation):\n", - " feature ap_max\n", - " URLSimilarityIndex 0.995497\n", - " LineOfCode 0.988937\n", - " NoOfExternalRef 0.979716\n", - " NoOfImage 0.966242\n", - " NoOfJS 0.955238\n", - " NoOfSelfRef 0.936094\n", - " NoOfCSS 0.929496\n", - " LargestLineLength 0.853622\n", - "NoOfOtherSpecialCharsInURL 0.791699\n", - " SpacialCharRatioInURL 0.783111\n", - "[A2] Features with AP>0.995: 1 (flagged as policy-sensitive)\n" - ] - } - ], - "source": [ - "ap_rows = []\n", - "for col in X_all.columns:\n", - " s_val = X_val[col].values.astype(float)\n", - " # Use both orientations; take the best AP (since sign may be arbitrary)\n", - " ap1 = average_precision_score(y_val_phish, s_val)\n", - " ap2 = average_precision_score(y_val_phish, -s_val)\n", - " ap_rows.append((col, float(max(ap1, ap2))))\n", - "ap_df = pd.DataFrame(ap_rows, columns=[\"feature\", \"ap_max\"])\n", - "ap_df.sort_values(\"ap_max\", ascending=False, inplace=True)\n", - "top10 = ap_df.head(10)\n", - "n_shortcuts = (ap_df[\"ap_max\"] > 0.995).sum()\n", - "print(f\"[A2] Top-10 single-feature AP (max over orientation):\\n{top10.to_string(index=False)}\")\n", - "print(f\"[A2] Features with AP>0.995: {n_shortcuts} (flagged as policy-sensitive)\")" - ] - }, - { - "cell_type": "markdown", - "id": "c7fb820e", - "metadata": {}, - "source": [ - "### **A3- Degeneracy check on p_mal vals**" - ] - }, - { - "cell_type": "markdown", - "id": "96b88c57", - "metadata": {}, - "source": [ - "This code trains the model on the true labels and predicts phishing probabilities for the validation set. It then checks how many predictions are extremely close to 0 or 1, which can indicate overconfident or degenerate outputs. 
The average precision and Brier score are calculated to assess model performance, and a histogram visualizes the distribution of predicted probabilities." - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "76215a88", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[A3] True model → AP=1.000, Brier=0.000005\n", - "[A3] Prob mass near 0: 0.567 | near 1: 0.425\n" - ] - }, - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "p_mal = fit_xgb(X_train, y_train, X_val)\n", - "share_near_0 = (p_mal <= 1e-4).mean()\n", - "share_near_1 = (p_mal >= 1 - 1e-4).mean()\n", - "ap_true = average_precision_score(y_val_phish, p_mal)\n", - "brier_true = brier_score_loss(y_val_phish, p_mal)\n", - "print(f\"[A3] True model → AP={ap_true:.3f}, Brier={brier_true:.6f}\")\n", - "print(f\"[A3] Prob mass near 0: {share_near_0:.3f} | near 1: {share_near_1:.3f}\")\n", - "plt.figure(figsize=(6,3.5)); plt.hist(p_mal, bins=20); plt.title(\"p_malicious (val)\"); plt.xlabel(\"p_mal\"); plt.ylabel(\"count\"); plt.show()" - ] - }, - { - "cell_type": "markdown", - "id": "238c30db", - "metadata": {}, - "source": [ - "### **A4- Cross-split duplicate URL contamination**" - ] - }, - { - "cell_type": "markdown", - "id": "fecdfde9", - "metadata": {}, - "source": [ - "This code checks for any duplicate URLs that appear in both the training and validation sets, which could cause data leakage and inflate model performance. It then saves the single-feature average precision (AP) results to a CSV file for further review, making it easier to identify features that may act as shortcuts or have unusually high predictive power." 
- ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "64ef39d1", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[A4] Train/Val overlapping URLs: 0 (should be 0 after dedup)\n", - "[A2] Wrote single-feature AP table → outputs\\robustness\\single_feature_ap.csv\n" - ] - } - ], - "source": [ - "if urls.size and urls.dtype.kind in {\"U\",\"S\",\"O\"}:\n", - " inter = set(u_train).intersection(set(u_val))\n", - " print(f\"[A4] Train/Val overlapping URLs: {len(inter)} (should be 0 after dedup)\")\n", - "else:\n", - " print(\"[A4] URL column not available for overlap test\")\n", - " \n", - "# Save the single-feature AP table for review\n", - "out_dir = Path(\"outputs/robustness\"); out_dir.mkdir(parents=True, exist_ok=True)\n", - "ap_df.to_csv(out_dir/\"single_feature_ap.csv\", index=False)\n", - "print(f\"[A2] Wrote single-feature AP table → {out_dir/'single_feature_ap.csv'}\")" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.11" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/scripts/archive/materialize_url_features.py b/scripts/archive/materialize_url_features.py deleted file mode 100644 index 78ba859..0000000 --- a/scripts/archive/materialize_url_features.py +++ /dev/null @@ -1,101 +0,0 @@ -from __future__ import annotations - -import argparse -import hashlib -import json -from pathlib import Path - -import pandas as pd - - -def url_len(s: str) -> int: - return len(s) if isinstance(s, str) else 0 - - -def digit_ratio(s: str) -> float: - if not isinstance(s, str) or not s: - return 0.0 - d = sum(ch.isdigit() for ch in s) - return d / len(s) - - -def subdomain_count(s: 
str) -> int: - if not isinstance(s, str) or not s: - return 0 - host = s.split("://", 1)[-1].split("/", 1)[0] - return max(0, host.count(".") - 1) - - -def md5_file(p: Path) -> str: - h = hashlib.md5( - usedforsecurity=False - ) # nosec B324 - Used for data fingerprinting, not security - with p.open("rb") as f: - for chunk in iter(lambda: f.read(1 << 20), b""): - h.update(chunk) - return h.hexdigest() - - -def main(): - ap = argparse.ArgumentParser() - ap.add_argument( - "--infile", type=Path, default=Path("data/processed/phiusiil_clean.csv") - ) - ap.add_argument( - "--outfile", - type=Path, - default=Path("data/processed/phiusiil_clean_urlfeats.csv"), - ) - args = ap.parse_args() - - if not args.infile.exists(): - raise FileNotFoundError(f"Input not found: {args.infile}") - df = pd.read_csv(args.infile, encoding_errors="ignore") - if "URL" not in df.columns: - raise ValueError("Expected 'URL' column in processed data") - - # Add/overwrite deterministic URL-only features - df["url_len"] = df["URL"].map(url_len).astype("int64") - df["url_digit_ratio"] = df["URL"].map(digit_ratio).astype("float64") - df["url_subdomains"] = df["URL"].map(subdomain_count).astype("int64") - - # Quick invariants - if df["url_len"].isna().sum() != 0: - raise ValueError("url_len contains NaN values") - if df["url_subdomains"].isna().sum() != 0: - raise ValueError("url_subdomains contains NaN values") - if not df["url_digit_ratio"].between(0.0, 1.0, inclusive="both").all(): - raise ValueError("digit_ratio outside [0,1]") - if not ((df["url_len"] >= 0).all() and (df["url_subdomains"] >= 0).all()): - raise ValueError("negative length/subdomains?") - - # Write artifact - args.outfile.parent.mkdir(parents=True, exist_ok=True) - df.to_csv(args.outfile, index=False) - - # Fingerprint + summary (useful to tag MLflow runs) - fp = { - "rows": int(len(df)), - "cols": int(df.shape[1]), - "file": str(args.outfile), - "md5": md5_file(args.outfile), - "added_features": ["url_len", "url_digit_ratio", 
"url_subdomains"], - "ranges": { - "url_len": [int(df["url_len"].min()), int(df["url_len"].max())], - "url_digit_ratio": [ - float(df["url_digit_ratio"].min()), - float(df["url_digit_ratio"].max()), - ], - "url_subdomains": [ - int(df["url_subdomains"].min()), - int(df["url_subdomains"].max()), - ], - }, - } - Path("outputs").mkdir(exist_ok=True) - Path("outputs/url_features_fingerprint.json").write_text(json.dumps(fp, indent=2)) - print(json.dumps(fp, indent=2)) - - -if __name__ == "__main__": - main() diff --git a/src/common/__pycache__/stats.cpython-311.pyc b/src/common/__pycache__/stats.cpython-311.pyc index cdce4e9..598a118 100644 Binary files a/src/common/__pycache__/stats.cpython-311.pyc and b/src/common/__pycache__/stats.cpython-311.pyc differ diff --git a/src/common/__pycache__/thresholds.cpython-311.pyc b/src/common/__pycache__/thresholds.cpython-311.pyc index 777ec77..eb3a7a2 100644 Binary files a/src/common/__pycache__/thresholds.cpython-311.pyc and b/src/common/__pycache__/thresholds.cpython-311.pyc differ diff --git a/src/gateway/__pycache__/judge_wire.cpython-311.pyc b/src/gateway/__pycache__/judge_wire.cpython-311.pyc index e70254d..01ae2d9 100644 Binary files a/src/gateway/__pycache__/judge_wire.cpython-311.pyc and b/src/gateway/__pycache__/judge_wire.cpython-311.pyc differ diff --git a/src/gateway/__pycache__/main.cpython-311.pyc b/src/gateway/__pycache__/main.cpython-311.pyc index ba35224..ecd3b92 100644 Binary files a/src/gateway/__pycache__/main.cpython-311.pyc and b/src/gateway/__pycache__/main.cpython-311.pyc differ diff --git a/src/gateway/main.py b/src/gateway/main.py index 5d59426..696e05f 100644 --- a/src/gateway/main.py +++ b/src/gateway/main.py @@ -353,7 +353,17 @@ def explain_dashboard(): """ import pathlib - static_dir = pathlib.Path(__file__).parent / "static" + # In Docker, static files are copied to /app/src/gateway/static/ + # When running locally, they're relative to this file + docker_static_dir = 
pathlib.Path("/app/src/gateway/static") + local_static_dir = pathlib.Path(__file__).parent / "static" + + # Prefer Docker location if it exists, otherwise use local + if docker_static_dir.exists(): + static_dir = docker_static_dir + else: + static_dir = local_static_dir + html_file = static_dir / "explain.html" print(f"[DEBUG] Looking for dashboard at: {html_file.absolute()}") diff --git a/src/judge_svc/__pycache__/adapter.cpython-311.pyc b/src/judge_svc/__pycache__/adapter.cpython-311.pyc index ea6bf34..fcf6942 100644 Binary files a/src/judge_svc/__pycache__/adapter.cpython-311.pyc and b/src/judge_svc/__pycache__/adapter.cpython-311.pyc differ diff --git a/src/judge_svc/__pycache__/contracts.cpython-311.pyc b/src/judge_svc/__pycache__/contracts.cpython-311.pyc index de69e7b..ad76890 100644 Binary files a/src/judge_svc/__pycache__/contracts.cpython-311.pyc and b/src/judge_svc/__pycache__/contracts.cpython-311.pyc differ diff --git a/src/judge_svc/__pycache__/stub.cpython-311.pyc b/src/judge_svc/__pycache__/stub.cpython-311.pyc index 850ce59..8cd9d43 100644 Binary files a/src/judge_svc/__pycache__/stub.cpython-311.pyc and b/src/judge_svc/__pycache__/stub.cpython-311.pyc differ diff --git a/src/model_svc/__pycache__/main.cpython-311.pyc b/src/model_svc/__pycache__/main.cpython-311.pyc index 8c20536..97d1e63 100644 Binary files a/src/model_svc/__pycache__/main.cpython-311.pyc and b/src/model_svc/__pycache__/main.cpython-311.pyc differ