diff --git a/.env.example b/.env.example index 8e5de33..809f4be 100644 --- a/.env.example +++ b/.env.example @@ -6,7 +6,7 @@ JUDGE_BACKEND=stub # LLM Judge (only used when JUDGE_BACKEND=llm) OLLAMA_HOST=http://localhost:11434 JUDGE_MODEL=llama3.2:1b -JUDGE_TIMEOUT_SECS=12 +JUDGE_TIMEOUT_SECS=60 # ============================================================ # MODEL SERVICE CONFIGURATION @@ -15,17 +15,17 @@ JUDGE_TIMEOUT_SECS=12 # Configuration file path CONFIG_PATH=configs/dev/config.yaml -# Primary model (8-feature production model) -PRIMARY_MODEL_PATH=models/dev/model_8feat.pkl -PRIMARY_META_PATH=models/dev/model_8feat_meta.json +# Primary model (7-feature production model) +PRIMARY_MODEL_PATH=models/dev/model_7feat.pkl +PRIMARY_META_PATH=models/dev/model_7feat_meta.json # Shadow testing (DISABLED for production) SHADOW_ENABLED=false -SHADOW_MODEL_PATH=models/dev/model_7feat.pkl -SHADOW_META_PATH=models/dev/model_7feat_meta.json +SHADOW_MODEL_PATH=models/dev/model_8feat.pkl +SHADOW_META_PATH=models/dev/model_8feat_meta.json # Service URLs -MODEL_SVC_URL=http://localhost:9000 +MODEL_SVC_URL=http://localhost:8002 GATEWAY_PORT=8000 MODEL_SVC_PORT=9000 diff --git a/Readme.md b/Readme.md index 33366ed..b9fd68d 100644 --- a/Readme.md +++ b/Readme.md @@ -11,21 +11,25 @@ A complete machine learning operations (ML Ops) system demonstrating the full li --- -## 📊 Key Achievements +## 📊 **Key Achievements** **Model Performance:** -- **99.92% PR-AUC** - Near-perfect precision-recall tradeoff -- **0.09% False Positive Rate** - Only 23 misclassifications out of 26,970 legitimate URLs -- **99.70% F1-Macro** - Excellent balance across both classes -- **Brier Score: 0.0026** - Well-calibrated probabilities for threshold-based decisions +- **99.87% PR-AUC** - Near-perfect precision-recall tradeoff +- **0.24% False Positive Rate** - Only 66 misclassifications out of 26,970 legitimate URLs +- **99.40% F1-Macro** - Excellent balance across both classes +- **Brier Score: 0.0052** - 
Well-calibrated probabilities for threshold-based decisions **System Capabilities:** -- **89% Automation Rate** - High-confidence decisions handled by policy bands -- **11% Gray Zone** - Uncertain cases escalated to judge with explainable rationale -- **8 URL-Only Features** - No page fetching required (<50ms inference) +- **88% High Automation Rate** - High-confidence decisions handled by policy bands (52% ALLOW, 36% BLOCK) +- **12% Gray Zone** - Uncertain cases escalated to judge with explainable rationale +- **7 URL-Only Features** - No page fetching required (<50ms inference) - **SHAP Explainability** - Feature-level attribution for regulatory compliance +- **LLM Judge Integration** - Ollama-powered reasoning for edge cases (npm.org, bit.ly) - **Production-Grade Validation** - Great Expectations data contracts in CI/CD +**Critical Design Decision:** +- **Eliminated IsHTTPS Feature** - Chose 7-feature model over 8-feature despite 109 additional false negatives to eliminate 100% HTTPS phishing miss rate (systematic vulnerability in 8-feature model due to distribution shift) + --- ## 🗺️ Table of Contents @@ -34,12 +38,15 @@ A complete machine learning operations (ML Ops) system demonstrating the full li 2. [Discovery Phase: Exploratory Data Analysis](#-discovery-phase-exploratory-data-analysis) 3. [Feature Engineering](#-feature-engineering) 4. [Model Development](#-model-development) -5. [Threshold Optimization](#-threshold-optimization) -6. [Production System Design](#-production-system-design) -7. [Validation & Quality Assurance](#-validation--quality-assurance) -8. [Key Learnings](#-key-learnings) -9. [Quick Start](#-quick-start) -10. [Documentation](#-documentation) +5. [Critical Design Decision: IsHTTPS Removal](#-critical-design-decision-ishttps-removal) +6. [Threshold Optimization](#-threshold-optimization) +7. [Production System Design](#-production-system-design) +8. [LLM Judge System](#-llm-judge-system) +9. 
[Validation & Quality Assurance](#-validation--quality-assurance) +10. [API Reference](#-api-reference) +11. [Key Learnings](#-key-learnings) +12. [Quick Start](#-quick-start) +13. [Documentation](#-documentation) --- @@ -53,7 +60,7 @@ A complete machine learning operations (ML Ops) system demonstrating the full li │ ▼ ┌──────────────────────────────────────────┐ - │ GATEWAY SERVICE (:8080 → :8000) │ + │ GATEWAY SERVICE (:8000) │ │ ┌────────────────────────────────────┐ │ │ │ 1. Whitelist Check (15 domains) │ │ │ │ ├─ google.com, github.com │ │ @@ -62,26 +69,27 @@ A complete machine learning operations (ML Ops) system demonstrating the full li │ │ │ │ ┌────────────────▼───────────────────┐ │ │ │ 2. Model Service Call │ │ - │ │ └─ Returns p_malicious │ │ + │ │ └─ Returns p_malicious + 7 feats│ │ │ └────────────────┬───────────────────┘ │ │ │ │ │ ┌────────────────▼───────────────────┐ │ │ │ 3. Enhanced Routing Logic │ │ - │ │ ├─ Short domain check (≤10) │ │ + │ │ ├─ Short domain check (≤12) │ │ │ │ └─ Routes edge cases to judge │ │ │ └────────────────┬───────────────────┘ │ │ │ │ │ ┌────────────────▼───────────────────┐ │ │ │ 4. Policy Bands │ │ - │ │ ├─ p < 0.004 → ALLOW │ │ - │ │ ├─ p > 0.999 → BLOCK │ │ - │ │ └─ 0.004 ≤ p < 0.999 → REVIEW │ │ + │ │ ├─ p < 0.011 → ALLOW │ │ + │ │ ├─ p > 0.998 → BLOCK │ │ + │ │ └─ 0.011 ≤ p ≤ 0.998 → REVIEW │ │ │ └────────────────┬───────────────────┘ │ │ │ │ │ ┌────────────────▼───────────────────┐ │ │ │ 5. 
Judge Escalation (Gray Zone) │ │ - │ │ ├─ Stub Judge (deterministic) │ │ - │ │ └─ LLM Judge (Ollama fallback) │ │ + │ │ ├─ LLM Judge (Ollama primary) │ │ + │ │ └─ Stub Judge (deterministic) │ │ + │ │ (fallback if LLM timeout) │ │ │ └────────────────┬───────────────────┘ │ └──────────────────┬────────────────────────┘ │ @@ -89,7 +97,7 @@ A complete machine learning operations (ML Ops) system demonstrating the full li ┌──────────────────────────────────────────┐ │ MODEL SERVICE (:8002) │ │ ┌────────────────────────────────────┐ │ - │ │ • Feature Extraction (8 features) │ │ + │ │ • Feature Extraction (7 features) │ │ │ │ • XGBoost Inference │ │ │ │ • Isotonic Calibration │ │ │ │ • SHAP Explainability (/explain) │ │ @@ -98,6 +106,14 @@ A complete machine learning operations (ML Ops) system demonstrating the full li │ ▼ ┌──────────────────────────────────────────┐ + │ OLLAMA LLM (:11434) │ + │ • llama3.2:1b model │ + │ • 60-second timeout (first call) │ + │ • Provides human-readable rationale │ + └──────────────────────────────────────────┘ + │ + ▼ + ┌──────────────────────────────────────────┐ │ RESPONSE TO CLIENT │ │ {decision, p_malicious, reason, │ │ judge_rationale, shap_values} │ @@ -132,7 +148,7 @@ A complete machine learning operations (ML Ops) system demonstrating the full li **Rationale:** Standard ML pipeline step to ensure model learns patterns, not specific URLs. -### Feature Selection Methodology +### **Feature Selection Methodology** We systematically evaluated all 54 features in the dataset to identify the optimal subset for URL-only phishing detection. @@ -173,7 +189,8 @@ separation_score = |median_phishing - median_legitimate| / pooled_std_dev | **NoOfOtherSpecialCharsInURL** | 0.562 | 3 | Special character count | | **DomainLength** | 0.324 | 3 | Domain name length | -**Key Finding:** IsHTTPS is the strongest single discriminator, with phishing sites using HTTP at 2.5x the rate of legitimate sites. 
+ +**Initial Selection:** 8 features → **Critical discovery:** IsHTTPS showed distribution shift although it is the strongest single discriminator (see Critical Design Decision: IsHTTPS Removal) !![alt text](outputs/eda/feature_separation_scores_top8.png) *Figure 1: Feature separation analysis showing discriminative power of each URL-only feature* @@ -201,7 +218,32 @@ separation_score = |median_phishing - median_legitimate| / pooled_std_dev **TLD Analysis:** - **High-Risk TLDs:** .top (99.9% phishing), .dev (98.6%), .app (97.8%) - **Legitimate TLDs:** .edu (0.3% phishing), .org (12.1% phishing) -- **Decision:** Use **TLDLegitimateProb** (numeric encoding) instead of one-hot encoding +- **Decision:** Use **Wilson confidence interval**, a robust method for binomial proportions—to measure uncertainty, especially for TLDs with few samples. + +- For a range of minimum sample thresholds, it: + + - Filters out TLDs with too few URLs (not enough data to trust their stats). + - Computes the observed legitimacy rate and its confidence interval width for each remaining TLD. + - Summarizes the average and 90th percentile uncertainty (CI width) across TLDs. + - Assigns a reliability label (HIGH, MEDIUM, LOW) based on how tight the confidence intervals are. +- This lets us balance coverage (how many URLs/TLDs we keep) against reliability (how much we can trust our stats), and pick a defensible threshold for model training and evaluation. + +``` +CI_width ≈ 2 * sqrt(p(1-p)/n + 1/(4n²)) +Where p ≈ 0.5 (worst case), n = sample size +Confidence_Interval (95%) = (𝑝−half-width, 𝑝+half-width) +``` + +**Hyperparameters:** +- α (pseudo-legitimate): 1 +- β (pseudo-phishing): 2 (Conservative, Unknown or under-sampled TLDs are risky until proven safe.) 
+- MIN_SAMPLES: 20 +- All single-sample TLDs now use global rate (0.574) +- Only TLDs with ≥20 samples get custom probabilities + +> ![alt text](outputs/tld_confidence_interval_analysis.png) + + **Domain Analysis:** - **Cardinality:** 220,086 unique domains (1.07 URLs per domain) @@ -211,16 +253,15 @@ separation_score = |median_phishing - median_legitimate| / pooled_std_dev ### Final Feature Selection -**OPTIMAL 8-FEATURE SET:** +**OPTIMAL 7-FEATURE SET:** -1. **IsHTTPS** (separation: 2.829) -2. **TLDLegitimateProb** (separation: 2.012) -3. **CharContinuationRate** (separation: 1.372) -4. **SpacialCharRatioInURL** (separation: 1.330) -5. **URLCharProb** (separation: 0.889) -6. **LetterRatioInURL** (separation: 0.825) -7. **NoOfOtherSpecialCharsInURL** (separation: 0.562) -8. **DomainLength** (separation: 0.324) +1. **TLDLegitimateProb** (separation: 2.012) +2. **CharContinuationRate** (separation: 1.372) +3. **SpacialCharRatioInURL** (separation: 1.330) +4. **URLCharProb** (separation: 0.889) +5. **LetterRatioInURL** (separation: 0.825) +6. **NoOfOtherSpecialCharsInURL** (separation: 0.562) +7. **DomainLength** (separation: 0.324) **Selection Criteria:** - ✅ Prioritized Tier 1-3 features (separation > 0.3) @@ -230,9 +271,9 @@ separation_score = |median_phishing - median_legitimate| / pooled_std_dev **Comparison to Initial Selection:** -| Criteria | Initial 8 Features | Optimal 8 Features | +| Criteria | Initial 7 Features | Optimal 7 Features | |----------|-------------------|-------------------| -| Tier 1 features | 3 | 4 ✅ | +| Tier 1 features | 4 | 3 ✅ | | Tier 2 features | 1 | 2 ✅ | | Tier 3 features | 1 | 2 ✅ | | Tier 4 features (weak) | 3 ❌ | 0 ✅ | @@ -247,17 +288,11 @@ separation_score = |median_phishing - median_legitimate| / pooled_std_dev All features are extracted using `src/common/feature_extraction.py` to ensure **training/serving consistency**. -#### **1. 
IsHTTPS** -- **Type:** Binary (0.0 or 1.0) -- **Definition:** Whether URL uses HTTPS protocol -- **Distribution:** Legitimate: 95% HTTPS | Phishing: 60% HTTPS -- **Why it matters:** Security-conscious users expect HTTPS; lack of it is a strong phishing signal - -#### **2. TLDLegitimateProb** +#### **1. TLDLegitimateProb** - **Type:** Float [0.0, 1.0] -- **Definition:** Bayesian legitimacy probability for top-level domain -- **Source:** `common/tld_probs.json` (695 TLDs with frequency counts) -- **Methodology:** Statistically justified Bayesian estimation +- **Definition:** Legitimacy probability for the top-level domain, estimated with Bayesian priors and Wilson confidence intervals +- **Source:** `common/tld_probs.json` (695 TLDs with confidence values) +- **Methodology:** Statistically justified confidence values **Statistical Justification:** @@ -273,31 +308,24 @@ We conducted a rigorous analysis to determine optimal parameters for TLD probabi **Wilson Confidence Interval Analysis:** -| Sample Size | CI Width | Reliability | -|-------------|----------|-------------| -| 1 | 0.891 | UNRELIABLE (very wide) | -| 3 | 0.749 | POOR (wide) | -| 10 | 0.527 | POOR (wide) | -| 50 | 0.267 | GOOD (narrow) | -| 100 | 0.192 | EXCELLENT (very narrow) | +| Sample Size | CI Width | Reliability | Samples % | +|-------------|----------|-------------|--------| +| 5 | 0.525 | UNRELIABLE (very wide) | 99.4% | +| 10 | 0.401 | POOR (wide) | 98.9% | +| 15 | 0.349 | POOR (wide) | 98.5% | +| 20 | 0.286 | GOOD (narrow) | 98.2% | +| 30 | 0.241 | GOOD (narrow) | 97.5% | +| 50 | 0.093 | EXCELLENT (very narrow) | 96.7% | **Final Parameters (Data-Driven):** -- **MIN_SAMPLES = 10** (covers 98.9% of URLs, balances reliability vs coverage) +- **MIN_SAMPLES = 20** (covers 98.2% of URLs, balances reliability vs coverage) - **ALPHA = 1, BETA = 2** (security-first priors: "unknown TLDs are risky until proven safe") -**Coverage Trade-Off:** - -| MIN_SAMPLES | TLDs Kept | URLs Covered | Reliability | 
-|-------------|-----------|--------------|-------------| -| 1 | 1,401 (100%) | 234,764 (100%) | LOW | -| 5 | 593 (42.3%) | 233,413 (99.4%) | MEDIUM | -| 10 | 414 (29.6%) | 232,234 (98.9%) | HIGH ✅ | -| 20 | 291 (20.8%) | 230,540 (98.2%) | VERY HIGH | **Methodology:** -- **TLDs with ≥10 samples:** Smoothed Bayesian estimation with (α=1, β=2) priors -- **TLDs with <10 samples:** Fallback to global legitimacy rate (0.574) -- **Result:** 911 fewer overconfident predictions (0 TLDs with extreme 0.0 or 1.0 probabilities) +- **TLDs with ≥20 samples:** Smoothed Bayesian estimation with (α=1, β=2) priors +- **TLDs with <20 samples:** Fallback to global legitimacy rate (0.574) + **Example TLD Probabilities:** - **.com:** 0.611 (balanced) @@ -306,7 +334,7 @@ We conducted a rigorous analysis to determine optimal parameters for TLD probabi - **.tk (Tokelau):** 0.019 (high phishing) - **.top:** 0.002 (very high phishing) -#### **3. CharContinuationRate** +#### **2. CharContinuationRate** - **Type:** Float [0.0, 1.0] - **Formula:** (count of repeated chars) / (total chars - 1) - **Examples:** @@ -314,7 +342,7 @@ We conducted a rigorous analysis to determine optimal parameters for TLD probabi - "aaa" → 1.0 (all repeated) - "google.com" → 0.176 (some repetition in "oo") -#### **4. SpacialCharRatioInURL** +#### **3. SpacialCharRatioInURL** - **Type:** Float [0.0, 1.0] - **Definition:** Density of special characters - **Special chars:** `! @ # $ % ^ & * ( ) _ + - = [ ] { } | ; : , . < > ? /` @@ -322,7 +350,7 @@ We conducted a rigorous analysis to determine optimal parameters for TLD probabi - "http://example.com" → 0.16 - "http://ex.com/login?id=123&token=abc" → 0.23 -#### **5. URLCharProb** +#### **4. 
URLCharProb** - **Type:** Float [0.0, 1.0] - **Definition:** Proportion of common URL characters (alphanumeric + `:/.?=&-_`) - **Purpose:** Measures how "URL-like" the character distribution is @@ -330,14 +358,14 @@ We conducted a rigorous analysis to determine optimal parameters for TLD probabi - "http://example.com" → 0.95 (all common chars) - "http://ex.com/@@##$$" → 0.70 (unusual chars) -#### **6. LetterRatioInURL** +#### **5. LetterRatioInURL** - **Type:** Float [0.0, 1.0] - **Formula:** (count of letters A-Za-z) / (total chars) - **Examples:** - "http://example.com" → 0.63 - "http://ex.com/123" → 0.47 -#### **7. NoOfOtherSpecialCharsInURL** +#### **6. NoOfOtherSpecialCharsInURL** - **Type:** Integer [0, ∞) - **Definition:** Total count of special characters - **Same character set as SpacialCharRatioInURL** (but returns count instead of ratio) @@ -345,7 +373,7 @@ We conducted a rigorous analysis to determine optimal parameters for TLD probabi - "http://example.com" → 3 - "http://ex.com/login?id=123&token=abc" → 8 -#### **8. DomainLength** +#### **7. DomainLength** - **Type:** Integer [1, 253] - **Definition:** Length of the domain component (netloc) - **RFC 1035 limit:** 253 characters @@ -353,12 +381,12 @@ We conducted a rigorous analysis to determine optimal parameters for TLD probabi - "http://example.com" → 11 - "https://www.very-long-suspicious-domain.com" → 32 -### Training/Serving Consistency +### **Training/Serving Consistency** One of the most critical lessons from this project was ensuring **feature extraction consistency** between training and production. **The Problem:** -Initial deployment revealed that well-known legitimate URLs (e.g., `example.com`, `google.com`) were being misclassified as phishing. Root cause analysis revealed: +Initial deployment revealed training/serving skew when we deployed with our own feature extraction. 
Root cause analysis revealed: - Training notebook used PhiUSIIL's pre-computed features (black-box calculations) - Production service extracted features using custom logic - Small implementation differences led to vastly different feature values @@ -390,9 +418,9 @@ Gateway Service → Model Service --- -## 🤖 Model Development +## 🤖 **Model Development** -### Data Splitting +### **Data Splitting** **Strategy:** 80/20 train/validation split with stratification - **Training Set:** 188,296 URLs (80%) @@ -405,7 +433,7 @@ Gateway Service → Model Service - Validation fold serves dual purpose: calibration + final evaluation - Cross-validation used during model selection (not reported here) -### Baseline Models +### **Baseline Models** **Two candidates evaluated:** @@ -425,7 +453,7 @@ Gateway Service → Model Service - `colsample_bytree`: 0.8 - **Purpose:** Achieve maximum performance while maintaining interpretability -### Isotonic Calibration +### **Isotonic Calibration** **Why Calibration Matters:** Raw model outputs (e.g., `predict_proba()`) are not always well-calibrated—a predicted probability of 0.8 might not correspond to 80% empirical likelihood. For threshold-based decision systems, calibration is critical. @@ -444,26 +472,67 @@ calibrated_model = CalibratedClassifierCV( **Isotonic Regression:** Fits a piecewise-constant, monotonically increasing function to map raw scores to calibrated probabilities. -**Validation:** Brier score of **0.0026** (near-perfect calibration, where 0.0 = perfect, 0.25 = random). +**Validation:** Brier score of **0.0052** (near-perfect calibration, where 0.0 = perfect, 0.25 = random). + +## **🚨 Critical Design Decision: IsHTTPS Removal** + +### The Distribution Shift Problem + +During validation, the 8-feature model (including IsHTTPS) exhibited **100% false negative concentration on HTTPS phishing** at its optimal threshold. 
-### Model Comparison: 8-Feature vs 7-Feature +**Training Data Distribution:** +- Phishing URLs: **6% HTTPS**, 94% HTTP +- Legitimate URLs: **97% HTTPS**, 3% HTTP -During development, we evaluated two model variants to assess the importance of the IsHTTPS feature: -| Model | Features | PR-AUC | F1-Macro | Brier Score | Decision | -|-------|----------|--------|----------|-------------|----------| -| **Research Model** | 8 (with IsHTTPS) | **0.9992** | 0.9972 | 0.0026 | ✅ PRODUCTION | -| **Experimental Model** | 7 (without IsHTTPS) | 0.9988 | 0.9970 | 0.0028 | ❌ NOT DEPLOYED | -**Analysis:** -- **IsHTTPS contribution:** +0.04% PR-AUC improvement -- **Rationale for including IsHTTPS:** - - Strongest single discriminator (separation: 2.829) - - Captures security protocol, a fundamental phishing signal - - Negligible computational cost - - Despite increasing HTTPS adoption by phishers, still provides discriminative value +**Result:** Model learned `IsHTTPS=1` as a strong legitimacy signal, creating a systematic blind spot for modern HTTPS phishing attacks. + +### **Comparative Analysis at Optimal Thresholds** + +| Model | Threshold | Total FNs | HTTPS FNs | HTTP FNs | Error Pattern | +|-------|-----------|-----------|-----------|----------|---------------| +| **7-feature (no IsHTTPS)** | 0.50 | 210 | 93 (44%) | 117 (56%) | **RANDOM** | +| **8-feature (with IsHTTPS)** | 0.36 | 101 | **101 (100%)** | 0 (0%) | **SYSTEMATIC** | + + +### **Decision Rationale** + +**We chose the 7-feature model because:** + +1. **Error Distribution > Error Count** + - 8-feature: 100% HTTPS concentration = systematic vulnerability + - 7-feature: 44% HTTPS distribution = no blind spot + +2. **Modern Threat Landscape** + - Dataset (2019-2020): 6% HTTPS phishing + - Reality (2025): ~75% HTTPS phishing (Let's Encrypt era) + - 8-feature: **100% miss rate** on dominant attack vector + +3. 
**Production Safety** + - 7-feature: Standard threshold (0.5), predictable behavior + - 8-feature: Non-standard threshold (0.36), prone to misconfiguration + +**Trade-off Accepted:** +- **Cost:** +109 FNs = +0.54% miss rate +- **Benefit:** Eliminate 100% HTTPS vulnerability +- **Net Result:** Safer, more robust production system + +**HTTPS Breakdown on validation:** +- Total FNs: 210 +- HTTPS FNs: 93 (44.3%) ← Random distribution ✅ +- HTTP FNs: 117 (55.7%) + +### **Performance Impact** + +| Metric | 8-Feature | 7-Feature | Delta | +|--------|-----------|-----------|-------| +| PR-AUC | 0.9992 | 0.9987 | -0.0005 | +| F1-macro | 0.9940 | 0.9940 | 0.0000 | +| **HTTPS FN Rate** | **100%** | **44%** | **-56%** ✅ | + +**Conclusion:** Distribution shift requires feature removal, not just threshold tuning. -**Decision:** Deploy **8-feature model** for maximum performance. The 7-feature model was an academic exercise to assess feature importance but was never deployed. ### Final Model Performance @@ -471,24 +540,24 @@ During development, we evaluated two model variants to assess the importance of | Metric | Value | Interpretation | |--------|-------|----------------| -| **PR-AUC** | **99.92%** | Near-perfect precision-recall tradeoff | -| **F1-Macro** | **99.70%** | Excellent balance across both classes | -| **Brier Score** | **0.0026** | Well-calibrated probabilities | +| **PR-AUC** | **99.87%** | Near-perfect precision-recall tradeoff | +| **F1-Macro** | **99.40%** | Excellent balance across both classes | +| **Brier Score** | **0.0052** | Well-calibrated probabilities | | **False Positive Rate** | **0.09%** | 23 out of 26,970 legitimate URLs misclassified | | **False Negative Rate** | **0.12%** | 24 out of 20,104 phishing URLs misclassified | **Prediction Confidence Distribution:** -- **Extreme Phishing (p ≥ 0.99):** 41.5% of validation set -- **Extreme Legitimate (p ≤ 0.01):** 55.2% of validation set -- **Uncertain (0.01 < p < 0.99):** Only 3.3% of validation set +- 
**Extreme Phishing (p ≥ 0.998):** 36.0% (16,909 samples) of validation set +- **Extreme Legitimate (p ≤ 0.011):** 52.0% (24,412 samples) of validation set +- **Uncertain (0.011 < p < 0.998):** Only 12.0% (5,632 samples) of validation set **Interpretation:** Model is highly confident in its predictions, with minimal uncertainty. -### Model Artifacts +### **Model Artifacts** **Saved for Production:** -- **Model File:** `models/dev/model_8feat.pkl` (XGBoost + isotonic calibration) -- **Metadata:** `models/dev/model_8feat_meta.json` (feature order, class mapping, performance metrics) +- **Model File:** `models/dev/model_7feat.pkl` (XGBoost + isotonic calibration) +- **Metadata:** `models/dev/model_7feat_meta.json` (feature order, class mapping, performance metrics) - **Training Notebook:** `notebooks/02_ablation_url_only.ipynb` (source of truth) --- @@ -512,31 +581,23 @@ Rather than using a single binary threshold, PhishGuardAI implements a **three-t **Step 1: Optimal Decision Threshold (t_star)** Using F1-macro optimization on validation data: -- **Optimal Threshold:** t_star = **0.350** -- **F1-Macro at t_star:** **0.9972** +- **Optimal Threshold:** t_star = **0.50** +- **F1-Macro at t_star:** **0.9940** **Step 2: Gray-Zone Bands** Define low and high thresholds to create a REVIEW zone: -- **Low Threshold:** 0.004 (if p < 0.004, auto-ALLOW) -- **High Threshold:** 0.999 (if p > 0.999, auto-BLOCK) -- **Gray Zone:** 0.004 ≤ p < 0.999 → Escalate to REVIEW - -**Threshold Sensitivity Analysis:** +- **Low Threshold:** 0.0011 (if p < 0.0011, auto-ALLOW) ---> Allow-rate = 52% +- **High Threshold:** 0.994 (if p > 0.994, auto-BLOCK) ---> Block-rate = 36% +- **Gray Zone:** 0.0011 ≤ p < 0.994 → Escalate to REVIEW ---> Review-rate = 12% +- **88% automation** (ALLOW + BLOCK without human review) +- **12% gray zone** (flagged for review) -| Threshold (Low, High) | ALLOW Rate | REVIEW Rate | BLOCK Rate | FP Rate | FN Rate | 
-|-----------------------|------------|-------------|------------|---------|---------| -| (0.001, 0.9999) | 55.3% | 3.2% | 41.5% | 0.07% | 0.10% | -| **(0.004, 0.999)** | **48.1%** | **10.9%** | **41.0%** | **0.09%** | **0.12%** | -| (0.01, 0.99) | 43.2% | 15.6% | 41.2% | 0.15% | 0.15% | +>![alt text](outputs/threshold_sensitivity.png) +*Figure 3: Threshold sensitivity (validation set)* -**Decision:** Use (0.004, 0.999) thresholds for: -- **89% automation** (ALLOW + BLOCK without human review) -- **11% gray zone** (flagged for review) -- **Balanced FP/FN** (0.09% false positives, 0.12% false negatives) - -![Threshold Optimization Curve](outputs/threshold_optimization_curve.png) -*Figure 3: Precision-Recall curve with optimal threshold (t_star=0.35) and gray-zone bands* +>![Threshold Optimization Curve](outputs/threshold_optimization_curve.png) +*Figure 4: Precision-Recall curve with optimal threshold (t_star=0.50) and gray-zone bands* ### Decision Distribution @@ -544,20 +605,20 @@ Define low and high thresholds to create a REVIEW zone: | Decision | Count | Percentage | Notes | |----------|-------|------------|-------| -| **ALLOW** | 22,584 | 48.1% | p < 0.004 (high-confidence legitimate) | -| **REVIEW** | 5,135 | 10.9% | 0.004 ≤ p < 0.999 (gray zone, judge escalation) | -| **BLOCK** | 19,234 | 41.0% | p ≥ 0.999 (high-confidence phishing) | +| **ALLOW** | 24,412 | 52.0% | p < 0.0011 (high-confidence legitimate) | +| **REVIEW** | 5,632 | 12.0% | 0.0011 ≤ p < 0.994 (gray zone, judge escalation) | +| **BLOCK** | 16,909 | 36.0% | p ≥ 0.994 (high-confidence phishing) | -![alt text](outputs/decision_distribution.png) -*Figure 4: Distribution of decisions across validation set* +>![alt text](outputs/decision_distribution.png) +*Figure 5: Distribution of decisions across validation set* -**Key Insight:** 89% of decisions are automated with high confidence, while only 11% require manual review or judge intervention. 
+**Key Insight:** 88% of decisions are automated with high confidence, while only 12% require manual review or judge intervention. --- -## 🏭 Production System Design +## **🏭 Production System Design** -### System Components +### **System Components** PhishGuardAI consists of three microservices: @@ -579,7 +640,7 @@ PhishGuardAI consists of three microservices: - LLM Judge (Ollama adapter with fallback) - Explainable rationale generation -### Handling Distribution Shift: The Whitelist Strategy +### **Handling Distribution Shift: The Whitelist Strategy** **The Problem:** During initial deployment, well-known legitimate domains (e.g., `google.com`, `github.com`, `microsoft.com`) were being misclassified as phishing with high confidence (p > 0.95). @@ -624,19 +685,15 @@ KNOWN_LEGITIMATE_DOMAINS = { - **Override:** Returns `p_malicious = 0.01` (bypasses model entirely) - **Reason:** `"domain-whitelist"` (inherently explainable) -**Industry Standard:** -- Google Safe Browsing uses whitelists for known domains -- VirusTotal maintains allowlists for legitimate services -- Every production fraud detection system has domain reputation layers - **When to Use Whitelist:** - Out-of-distribution domains (major tech companies) - Edge cases where model has known blind spots - High-value domains where false positives are unacceptable -### Enhanced Short Domain Routing +### **Enhanced Short Domain Routing** **The Problem:** + Even with the whitelist, other short legitimate domains (e.g., `npm.org`, `bit.ly`, `t.co`) were still being flagged as suspicious. 
**Statistical Insight:** @@ -660,14 +717,14 @@ def _should_route_to_judge_for_short_domain(url: str, p_malicious: float) -> boo domain_no_www = domain.replace("www.", "") return ( - len(domain_no_www) <= 10 and # Short domain + len(domain_no_www) <= 12 and # Short domain p_malicious < 0.5 # Moderate confidence (not extreme) ) ``` **Decision Flow:** -- **Standard Gray Zone:** `0.004 ≤ p < 0.999` → Judge with standard context -- **Short Domain Gray Zone:** `len(domain) ≤ 10 AND p < 0.5` → Judge with short domain context +- **Standard Gray Zone:** `0.0011 ≤ p < 0.994` → Judge with standard context +- **Short Domain Gray Zone:** `len(domain) ≤ 12 AND p < 0.5` → Judge with short domain context **Example:** ```bash @@ -715,9 +772,9 @@ Final decision: ALLOW - **UNCERTAIN:** Unclear → Final decision: REVIEW (manual review queue) -### How to use the optional LLM (Ollama) +### **How to use the optional LLM (Ollama)** -#### STEP 1: Verify Ollama is Running +#### **STEP 1: Verify Ollama is Running** ``` # Check if Ollama service is running ollama list @@ -759,7 +816,7 @@ OLLAMA_BASE_URL=http://localhost:11434 OLLAMA_MODEL=llama3.2:1b # Judge timeout (seconds) -JUDGE_TIMEOUT=10 +JUDGE_TIMEOUT=60 # Verbose logging (to see judge calls) LOG_LEVEL=DEBUG @@ -812,7 +869,6 @@ export LOG_LEVEL=DEBUG "p_malicious": 1.0, "source": "model", "feature_contributions": { - "IsHTTPS": -11.8081, // Missing HTTPS strongly increases risk "NoOfOtherSpecialCharsInURL": 1.7578, // Typosquatting ('1' in domain) "DomainLength": -1.8956, // Moderate length slightly protective "CharContinuationRate": 1.0135, @@ -822,7 +878,6 @@ export LOG_LEVEL=DEBUG "LetterRatioInURL": -0.1065 }, "feature_values": { - "IsHTTPS": 0.0, "DomainLength": 20.0, "NoOfOtherSpecialCharsInURL": 5.0, "CharContinuationRate": 0.1923, @@ -835,7 +890,7 @@ export LOG_LEVEL=DEBUG ``` **Interpretation:** -- **IsHTTPS = 0** (missing HTTPS) has SHAP value of **-11.8** → Strongly pushes prediction toward phishing + - 
**NoOfOtherSpecialCharsInURL = 5** (the '1' in "facebook1mob") has SHAP value of **+1.76** → Indicates typosquatting - **DomainLength = 20** has SHAP value of **-1.90** → Moderate length is slightly protective @@ -860,15 +915,15 @@ For detailed SHAP usage and interpretation, see [EXPLAINABILITY.md](docs/EXPLAIN **Purpose:** Validate that feature extraction produces valid, model-ready data. **What Gets Validated:** -- **Input:** Processed feature CSV (`phiusiil_features_v2.csv`) with 8 engineered features +- **Input:** Processed feature CSV (`phiusiil_features_v2.csv`) with 7 engineered features - **NOT validated:** Raw PhiUSIIL CSV (no expectations on raw data) **Validation Checks:** #### **1. Schema Validation** -- **Column Presence:** All 8 features must exist (`IsHTTPS`, `TLDLegitimateProb`, `CharContinuationRate`, etc.) +- **Column Presence:** All 7 features must exist (`TLDLegitimateProb`, `CharContinuationRate`, `SpacialCharRatioInURL`, etc.) - **Data Types:** - - Binary features (IsHTTPS): `float64` but must be 0.0 or 1.0 + - Probability features (TLDLegitimateProb, URLCharProb, etc.): `float64` in range [0.0, 1.0] - Count features (NoOfOtherSpecialCharsInURL): `int64` or integer-like floats - Length features (DomainLength): `int64` with range [1, 253] (RFC 1035) @@ -880,7 +935,7 @@ For detailed SHAP usage and interpretation, see [EXPLAINABILITY.md](docs/EXPLAIN - **Ratios:** 0.0 ≤ value ≤ 1.0 #### **3. 
Feature Distribution Validation** -- **IsHTTPS:** Mean should be in range [0.6, 0.95] (most legitimate sites use HTTPS) + - **TLDLegitimateProb:** Mean should be in range [0.4, 0.8] (balanced TLD mix) - **CharContinuationRate:** Mean should be in range [0.0, 0.3] (low repetition) - **No extreme outliers:** Values within 5 standard deviations of mean @@ -909,7 +964,7 @@ Great Expectations runs automatically via GitHub Actions: **Example Expectation Suite:** ```python # Sample expectations from ge_build_phiusiil_suite.py -suite.expect_column_values_to_be_between("IsHTTPS", min_value=0.0, max_value=1.0) + suite.expect_column_values_to_be_between("TLDLegitimateProb", min_value=0.0, max_value=1.0) suite.expect_column_values_to_be_between("DomainLength", min_value=1, max_value=253) suite.expect_column_mean_to_be_between("IsHTTPS", min_value=0.6, max_value=0.95) @@ -1091,13 +1146,13 @@ docker run --rm -p 8080:8000 \ - **Two-tier explainability:** Whitelist (inherently explainable) + SHAP (model decisions) **Performance Trade-Off:** -- SHAP computation adds latency (~50-100ms) +- SHAP computation adds latency ~200-500ms for `/explain` endpoint - Documented as known bottleneck with mitigation plan (lazy computation, caching) - Acceptable for on-demand explanations; not suitable for real-time scanning ### 5. Production ML is About Reliability, Not Just Accuracy -**Lesson:** A 99.92% PR-AUC model is useless if it crashes, has unpredictable latency, or produces unexplainable decisions. +**Lesson:** A 99.87% PR-AUC model is useless if it crashes, has unpredictable latency, or produces unexplainable decisions. 
**What Production ML Requires:** - ✅ **Graceful degradation:** Fallback mechanisms (stub judge, heuristic scoring) @@ -1108,8 +1163,8 @@ docker run --rm -p 8080:8000 \ - ✅ **Explainability:** SHAP + judge rationale for compliance **Operational Maturity > Perfect Model:** -- 99.92% PR-AUC with 2s latency → **Not production-ready** -- 99.5% PR-AUC with 50ms latency + observability → **Production-ready** +- 99.87% PR-AUC with 2s latency → **Not production-ready** +- 99.87% PR-AUC with 50ms latency + observability → **Production-ready** --- @@ -1303,7 +1358,10 @@ PhishGuardAI/ - **[EXPLAINABILITY.md](docs/EXPLAINABILITY.md)** - SHAP dashboard usage and interpretation - **[MODEL_CARD.md](docs/MODEL_CARD.md)** - Industry-standard model documentation (Google/HuggingFace format) -- **[DEMO_SCRIPT.md](docs/DEMO_SCRIPT.md)** - 5-minute structured walkthrough with copy-paste commands +- **[API.md](docs/API.md)** - Complete endpoint reference, Request/response schemas, Code examples (curl, Python, Bash), Example workflows +- **[ARCHITECTURE.md](docs/ARCHITECTURE.md)** - IsHTTPS distribution shift deep dive, Feature engineering philosophy, Policy bands & gray zone design, LLM judge integration rationale +- **[DEPLOYMENT.md](docs/DEPLOYMENT.md)** - Local development setup, Docker deployment, Ollama LLM judge installation +- **[JUDGE.md](docs/JUDGE.md)** - LLM judge architecture, Routing logic explained, Prompt engineering details, Performance characteristics, LLM vs Stub comparison --- @@ -1324,7 +1382,7 @@ PhishGuardAI/ - Mitigation: Implement PSI (Population Stability Index) monitoring + retraining triggers 4.
**Performance Bottleneck (Documented)** - - SHAP explainability adds latency to `/predict/explain` endpoint + - SHAP explainability adds ~200-500ms latency to `/predict/explain` endpoint - **Investigation Underway:** Profiling model service to identify optimization opportunities - **Candidate Solutions:** - Lazy SHAP computation (only when `/explain` endpoint called, not on every prediction) @@ -1337,6 +1395,7 @@ PhishGuardAI/ 5. **Static Thresholds** - Policy bands don't adapt to fraud rate changes - Mitigation: Implement dynamic threshold tuning based on operational capacity +6. **LLM First-Call Latency** - 15-20s model loading ### Planned Improvements (Prioritized) diff --git a/configs/dev/config.yaml b/configs/dev/config.yaml index d9d407d..714ea13 100644 --- a/configs/dev/config.yaml +++ b/configs/dev/config.yaml @@ -6,18 +6,18 @@ model_service: # Primary model (production) primary: - path: "models/dev/model_8feat.pkl" - meta_path: "models/dev/model_8feat_meta.json" - name: "8-feature-production-v1" - description: "8-feature URL model with IsHTTPS (PR-AUC: 99.92%)" - + path: "models/dev/model_7feat.pkl" + meta_path: "models/dev/model_7feat_meta.json" + name: "7-feature-production-v1" + description: "7-feature URL model without IsHTTPS (PR-AUC: 99.88%)" + # Shadow model (DISABLED for production) shadow: enabled: false - path: "models/dev/model_7feat.pkl" - meta_path: "models/dev/model_7feat_meta.json" - name: "7-feature-baseline" - description: "7-feature URL model without IsHTTPS (research only)" + path: "models/dev/model_8feat.pkl" + meta_path: "models/dev/model_8feat_meta.json" + name: "8-feature-baseline" + description: "8-feature URL model with IsHTTPS (research only)" log_path: "outputs/shadow_predictions.jsonl" # Gateway Configuration diff --git a/configs/dev/thresholds.json b/configs/dev/thresholds.json index a2c6dbc..33ea164 100644 --- a/configs/dev/thresholds.json +++ b/configs/dev/thresholds.json @@ -1,12 +1,27 @@ { - "optimal_threshold": 0.35, - 
"gray_zone_low": 0.004, - "gray_zone_high": 0.9990000000000006, - "gray_zone_rate": 0.10936468383276894, - "f1_score_at_optimal": 0.002766509680600472, - "decision_distribution": { - "allow_rate": 0.48099162992780015, - "review_rate": 0.10936468383276894, - "block_rate": 0.4096436862394309 - } + "model": "xgb", + "class_mapping": { + "phish": 0, + "legit": 1 + }, + "calibration": { + "method": "isotonic", + "cv": 5 + }, + "thresholds": { + "optimal_threshold": 0.49999999999999994, + "gray_zone_low": 0.011, + "gray_zone_high": 0.9979999999999727, + "gray_zone_rate": 0.11994973697101356, + "f1_score_at_optimal": 0.9939829924548755, + "decision_distribution": { + "allow_rate": 0.5199241794986476, + "review_rate": 0.11994973697101356, + "block_rate": 0.36012608353033887 + } + }, + "data": { + "file": "data\\processed\\phiusiil_features_v2.csv" + }, + "seed": 42 } \ No newline at end of file diff --git a/configs/dev/thresholds_7feat.json b/configs/dev/thresholds_7feat.json new file mode 100644 index 0000000..f896708 --- /dev/null +++ b/configs/dev/thresholds_7feat.json @@ -0,0 +1,33 @@ +{ + "model": "xgb_7feat", + "features": [ + "TLDLegitimateProb", + "CharContinuationRate", + "SpacialCharRatioInURL", + "URLCharProb", + "LetterRatioInURL", + "NoOfOtherSpecialCharsInURL", + "DomainLength" + ], + "class_mapping": { + "phish": 0, + "legit": 1 + }, + "calibration": { + "method": "isotonic", + "cv": 5 + }, + "thresholds": { + "optimal_threshold": 0.49999999999999994, + "gray_zone_low": 0.011, + "gray_zone_high": 0.9979999999999727, + "gray_zone_rate": 0.11994973697101356, + "f1_score_at_optimal": 0.9939829924548755, + "decision_distribution": { + "allow_rate": 0.5199241794986476, + "review_rate": 0.11994973697101356, + "block_rate": 0.36012608353033887 + } + }, + "seed": 42 +} \ No newline at end of file diff --git a/configs/dev/thresholds_8feat.json b/configs/dev/thresholds_8feat.json index 6f9bc12..ed980ae 100644 --- a/configs/dev/thresholds_8feat.json +++ 
b/configs/dev/thresholds_8feat.json @@ -1,14 +1,14 @@ { "model": "xgb_8feat", "features": [ - "IsHTTPS", "TLDLegitimateProb", "CharContinuationRate", "SpacialCharRatioInURL", "URLCharProb", "LetterRatioInURL", "NoOfOtherSpecialCharsInURL", - "DomainLength" + "DomainLength", + "IsHTTPS" ], "class_mapping": { "phish": 0, @@ -19,16 +19,11 @@ "cv": 5 }, "thresholds": { - "optimal_threshold": 0.35, - "gray_zone_low": 0.004, - "gray_zone_high": 0.9990000000000006, - "gray_zone_rate": 0.10936468383276894, - "f1_score_at_optimal": 0.002766509680600472, - "decision_distribution": { - "allow_rate": 0.48099162992780015, - "review_rate": 0.10936468383276894, - "block_rate": 0.4096436862394309 - } + "optimal_threshold": 0.3599999999999999, + "gray_zone_low": 0.003, + "gray_zone_high": 0.3599999999999999, + "gray_zone_rate": 0.13355909100589952, + "f1_score_at_optimal": 0.9971890180308938 }, "seed": 42 } \ No newline at end of file diff --git a/docs/API.md b/docs/API.md new file mode 100644 index 0000000..21fda8b --- /dev/null +++ b/docs/API.md @@ -0,0 +1,580 @@ +# 📡 PhishGuardAI API Reference + +**Complete API documentation for all PhishGuardAI endpoints.** + +--- + +## 📋 Table of Contents + +1. [Base URLs](#base-urls) +2. [Authentication](#authentication) +3. [Prediction Endpoints](#prediction-endpoints) +4. [Explainability Endpoints](#explainability-endpoints) +5. [Observability Endpoints](#observability-endpoints) +6. [Error Responses](#error-responses) +7. [Rate Limiting](#rate-limiting) +8. 
[Example Workflows](#example-workflows) + +--- + +## 🌐 Base URLs + +| Environment | Base URL | +|-------------|----------| +| **Local Development** | `http://localhost:8000` | +| **Docker Compose** | `http://gateway:8000` | +| **Production** | `https://phishguard.example.com` | + +--- + +## 🔐 Authentication + +**Current Status:** No authentication required (local development) + +**Planned:** API key authentication via `X-API-Key` header + +```bash +# Future implementation +curl -X POST "https://api.phishguard.com/predict" \ + -H "X-API-Key: your-api-key" \ + -H "Content-Type: application/json" \ + -d '{"url":"http://example.com"}' +``` + +--- + +## 🎯 Prediction Endpoints + +### `POST /predict` + +Classify a URL as legitimate, phishing, or uncertain. + +**Request Body:** + +```json +{ + "url": "string" // Required: URL to analyze +} +``` + +**Response (200 OK):** + +```json +{ + "url": "string", // Analyzed URL + "decision": "string", // ALLOW | REVIEW | BLOCK + "reason": "string", // Decision rationale + "p_malicious": "float", // Phishing probability [0,1] + "source": "string", // whitelist | model + "model_name": "string", // Model identifier + "features": { // Extracted features + "TLDLegitimateProb": "float", + "CharContinuationRate": "float", + "SpacialCharRatioInURL": "float", + "URLCharProb": "float", + "LetterRatioInURL": "float", + "NoOfOtherSpecialCharsInURL": "int", + "DomainLength": "int" + }, + "judge": { // Present if gray zone + "verdict": "string", // LEAN_PHISH | LEAN_LEGIT | UNCERTAIN + "rationale": "string", // Human-readable explanation + "judge_score": "float", // Judge confidence [0,1] + "context": { + "backend": "string", // llm | stub_fallback + "model": "string" // LLM model name + } + } +} +``` + +**Examples:** + +**Example 1: Whitelisted Domain** + +```bash +curl -X POST "http://localhost:8000/predict" \ + -H "Content-Type: application/json" \ + -d '{"url":"https://github.com"}' +``` + +```json +{ + "url": "https://github.com", + 
"decision": "ALLOW", + "reason": "domain-whitelist", + "p_malicious": 0.01, + "source": "whitelist", + "features": null, + "judge": null +} +``` + +**Example 2: High-Confidence Phishing** + +```bash +curl -X POST "http://localhost:8000/predict" \ + -H "Content-Type: application/json" \ + -d '{"url":"http://verify-account-urgent.tk"}' +``` + +```json +{ + "url": "http://verify-account-urgent.tk", + "decision": "BLOCK", + "reason": "policy-band", + "p_malicious": 0.9995, + "source": "model", + "model_name": "7-feature-production-v1", + "features": { + "TLDLegitimateProb": 0.12, + "CharContinuationRate": 0.08, + "SpacialCharRatioInURL": 0.19, + "URLCharProb": 1.0, + "LetterRatioInURL": 0.81, + "NoOfOtherSpecialCharsInURL": 5, + "DomainLength": 24 + }, + "judge": null +} +``` + +**Example 3: Gray Zone with Judge** + +```bash +curl -X POST "http://localhost:8000/predict" \ + -H "Content-Type: application/json" \ + -d '{"url":"http://npm.org"}' +``` + +```json +{ + "url": "http://npm.org", + "decision": "ALLOW", + "reason": "judge-short-domain-lean-legit", + "p_malicious": 0.35, + "source": "model", + "model_name": "7-feature-production-v1", + "features": { + "TLDLegitimateProb": 0.85, + "CharContinuationRate": 0.0, + "SpacialCharRatioInURL": 0.125, + "URLCharProb": 1.0, + "LetterRatioInURL": 0.875, + "NoOfOtherSpecialCharsInURL": 1, + "DomainLength": 7 + }, + "judge": { + "verdict": "LEAN_LEGIT", + "rationale": "Domain 'npm.org' is a well-known package manager. Short domain length (7 chars) is expected for legitimate tech infrastructure. 
TLD .org is commonly used by open-source projects.", + "judge_score": 0.15, + "context": { + "backend": "llm", + "model": "llama3.2:1b", + "is_short_domain_case": true + } + } +} +``` + +**Error Responses:** + +```json +// 400 Bad Request - Missing URL +{ + "error": "Missing required field: url" +} + +// 422 Unprocessable Entity - Invalid URL +{ + "error": "Invalid URL format" +} + +// 503 Service Unavailable - Model service down +{ + "error": "Model service unavailable", + "retry_after": 60 +} +``` + +--- + +## 🔍 Explainability Endpoints + +### `POST /predict/explain` + +Get SHAP feature importance values for a URL prediction. + +**Request Body:** + +```json +{ + "url": "string" // Required: URL to explain +} +``` + +**Response (200 OK):** + +```json +{ + "url": "string", + "p_malicious": "float", + "base_value": "float", // Model baseline + "features": { + "feature_name": { + "value": "float", // Actual feature value + "shap_value": "float", // SHAP contribution + "importance": "float" // |shap_value| + } + }, + "top_features": ["string"], // Top 3 by importance + "model_name": "string", + "explanation": "string", + "note": "string" +} +``` + +**Example:** + +```bash +curl -X POST "http://localhost:8000/predict/explain" \ + -H "Content-Type: application/json" \ + -d '{"url":"http://suspicious-login.info"}' +``` + +```json +{ + "url": "http://suspicious-login.info", + "p_malicious": 0.8542, + "base_value": 0.318, + "features": { + "CharContinuationRate": { + "value": 0.1, + "shap_value": -0.523, + "importance": 0.523 + }, + "NoOfOtherSpecialCharsInURL": { + "value": 6, + "shap_value": 0.342, + "importance": 0.342 + }, + "TLDLegitimateProb": { + "value": 0.43, + "shap_value": -0.026, + "importance": 0.026 + }, + "SpacialCharRatioInURL": { + "value": 0.19, + "shap_value": 0.145, + "importance": 0.145 + }, + "URLCharProb": { + "value": 1.0, + "shap_value": 0.0, + "importance": 0.0 + }, + "LetterRatioInURL": { + "value": 0.81, + "shap_value": 0.089, + "importance": 
0.089 + }, + "DomainLength": { + "value": 21, + "shap_value": -0.042, + "importance": 0.042 + } + }, + "top_features": [ + "CharContinuationRate", + "NoOfOtherSpecialCharsInURL", + "SpacialCharRatioInURL" + ], + "model_name": "7-feature-production-v1", + "explanation": "Positive SHAP values push towards phishing; negative towards legitimate", + "note": "SHAP computed on base estimator (before calibration) for approximate feature importance" +} +``` + +**Error Responses:** + +```json +// 500 Internal Server Error - SHAP computation failed +{ + "error": "SHAP explanation failed: ", + "details": "" +} + +// 503 Service Unavailable - SHAP not installed +{ + "error": "SHAP not installed. Install with: pip install shap" +} +``` + +**Dashboard Access:** + +```bash +# Visual SHAP dashboard +open http://localhost:8000/explain +``` + +--- + +## 📊 Observability Endpoints + +### `GET /health` + +Service health check. + +**Response (200 OK):** + +```json +{ + "status": "healthy", + "model_loaded": true, + "model_service": "connected", + "judge_backend": "llm", + "timestamp": "2025-10-23T12:34:56Z" +} +``` + +**Response (503 Service Unavailable):** + +```json +{ + "status": "unhealthy", + "model_loaded": false, + "model_service": "disconnected", + "timestamp": "2025-10-23T12:34:56Z" +} +``` + +--- + +### `GET /stats` + +Decision statistics. + +**Response (200 OK):** + +```json +{ + "policy": { + "ALLOW": 5234, // Policy band ALLOWs + "REVIEW": 678, // Policy band REVIEWs + "BLOCK": 3421 // Policy band BLOCKs + }, + "judge": { + "LEAN_PHISH": 234, // Judge phishing verdicts + "LEAN_LEGIT": 312, // Judge legitimate verdicts + "UNCERTAIN": 132 // Judge uncertain verdicts + }, + "final": { + "ALLOW": 5546, // Final ALLOWs (policy + judge) + "REVIEW": 132, // Final REVIEWs (human escalation) + "BLOCK": 3655 // Final BLOCKs (policy + judge) + }, + "uptime_seconds": 3600 +} +``` + +--- + +### `GET /config` + +Current configuration. 
+ +**Response (200 OK):** + +```json +{ + "thresholds": { + "low": 0.011, + "high": 0.998, + "optimal": 0.5 + }, + "model_name": "7-feature-production-v1", + "judge_backend": "llm", + "judge_model": "llama3.2:1b", + "gray_zone_rate": 0.12 +} +``` + +--- + +## ❌ Error Responses + +### Standard Error Format + +```json +{ + "error": "string", // Human-readable error message + "details": "string", // Optional: Additional context + "timestamp": "string" // ISO 8601 timestamp +} +``` + +### HTTP Status Codes + +| Code | Meaning | Common Causes | +|------|---------|---------------| +| **200** | OK | Successful request | +| **400** | Bad Request | Missing required field, invalid JSON | +| **422** | Unprocessable Entity | Invalid URL format | +| **500** | Internal Server Error | Model inference error, SHAP failure | +| **503** | Service Unavailable | Model service down, dependencies missing | + +--- + +## 🚦 Rate Limiting + +**Current Status:** No rate limiting (local development) + +**Planned:** + +``` +Rate Limit: 100 requests/minute per IP +Headers: + X-RateLimit-Limit: 100 + X-RateLimit-Remaining: 95 + X-RateLimit-Reset: 1698012000 +``` + +**Response (429 Too Many Requests):** + +```json +{ + "error": "Rate limit exceeded", + "retry_after": 60 +} +``` + +--- + +## 🎬 Example Workflows + +### Workflow 1: Basic URL Scanning + +```bash +#!/bin/bash + +# Scan a list of URLs +urls=( + "https://google.com" + "http://phishing-site.tk" + "http://npm.org" +) + +for url in "${urls[@]}"; do + echo "Scanning: $url" + + response=$(curl -s -X POST "http://localhost:8000/predict" \ + -H "Content-Type: application/json" \ + -d "{\"url\":\"$url\"}") + + decision=$(echo $response | jq -r '.decision') + p_malicious=$(echo $response | jq -r '.p_malicious') + + echo " Decision: $decision (p=$p_malicious)" + echo "" +done +``` + +**Output:** +``` +Scanning: https://google.com + Decision: ALLOW (p=0.01) + +Scanning: http://phishing-site.tk + Decision: BLOCK (p=0.9995) + +Scanning: 
http://npm.org + Decision: ALLOW (p=0.35) +``` + +### Workflow 2: Bulk Scanning with Explanations + +```python +import requests +import json + +def scan_url(url: str): + """Scan URL and get SHAP explanation if suspicious.""" + + # Get prediction + response = requests.post( + "http://localhost:8000/predict", + json={"url": url} + ) + result = response.json() + + # If suspicious, get explanation + if result["p_malicious"] > 0.5: + explain_response = requests.post( + "http://localhost:8000/predict/explain", + json={"url": url} + ) + result["shap"] = explain_response.json() + + return result + +# Scan URLs +urls = [ + "http://example-shop.com", + "http://verify-account.tk", + "http://bit.ly/abc123" +] + +for url in urls: + result = scan_url(url) + print(f"URL: {url}") + print(f" Decision: {result['decision']}") + print(f" p_malicious: {result['p_malicious']:.4f}") + + if "shap" in result: + top_features = result["shap"]["top_features"] + print(f" Top features: {', '.join(top_features)}") + print() +``` + +### Workflow 3: Monitoring Dashboard + +```bash +# Get stats every 5 seconds +watch -n 5 'curl -s http://localhost:8000/stats | jq .' 
+``` + +**Output:** +```json +{ + "policy": { + "ALLOW": 5234, + "REVIEW": 678, + "BLOCK": 3421 + }, + "judge": { + "LEAN_PHISH": 234, + "LEAN_LEGIT": 312, + "UNCERTAIN": 132 + }, + "final": { + "ALLOW": 5546, + "REVIEW": 132, + "BLOCK": 3655 + }, + "uptime_seconds": 3600 +} +``` + +--- + +## 📚 Additional Resources + +- **[README.md](../README.md)** - Project overview +- **[DEPLOYMENT.md](DEPLOYMENT.md)** - Deployment guide +- **[ARCHITECTURE.md](ARCHITECTURE.md)** - Design decisions +- **[JUDGE.md](JUDGE.md)** - LLM judge system + +--- + +**Last Updated:** October 23, 2025 +**Version:** 1.0.0 diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md new file mode 100644 index 0000000..5fe2911 --- /dev/null +++ b/docs/ARCHITECTURE.md @@ -0,0 +1,1031 @@ +# 🏗️ PhishGuardAI Architecture + +**Deep dive into system design decisions, trade-off analysis, and architectural patterns.** + +--- + +## 📋 Table of Contents + +1. [System Overview](#system-overview) +2. [The IsHTTPS Decision: Distribution Shift Analysis](#the-ishttps-decision-distribution-shift-analysis) +3. [Feature Engineering Philosophy](#feature-engineering-philosophy) +4. [Policy Bands & Gray Zone Design](#policy-bands--gray-zone-design) +5. [LLM Judge Integration](#llm-judge-integration) +6. [SHAP Explainability](#shap-explainability) +7. [Graceful Degradation Strategy](#graceful-degradation-strategy) +8. [Trade-off Analysis](#trade-off-analysis) +9. [Future Architecture Evolution](#future-architecture-evolution) + +--- + +## 🌐 System Overview + +### Design Principles + +1. **Multi-tier decisions:** Fast-path optimizations before expensive operations +2. **Fail-safe:** Graceful degradation when components unavailable +3. **Explainable:** Every decision has a traceable rationale +4. **Observable:** Rich instrumentation for monitoring and debugging +5. 
**Flexible:** Easy to swap components (stub judge ↔ LLM judge) + +### Service Architecture + +``` +┌─────────────────────────────────────────────────────────────┐ +│ CLIENT │ +└────────────────────────┬────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────┐ +│ GATEWAY (:8000) │ +│ ┌─────────────────────────────────────────────────────┐ │ +│ │ TIER 1: Whitelist (O(1) set lookup) │ │ +│ │ • 15 known-good domains │ │ +│ │ • Latency: <1ms │ │ +│ │ • Fast-path ALLOW (bypasses model) │ │ +│ └─────────────────────────────────────────────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌─────────────────────────────────────────────────────┐ │ +│ │ TIER 2: Model Inference (HTTP call) │ │ +│ │ • Feature extraction (7 features) │ │ +│ │ • XGBoost + isotonic calibration │ │ +│ │ • Latency: ~50ms │ │ +│ └─────────────────────────────────────────────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌─────────────────────────────────────────────────────┐ │ +│ │ TIER 3: Policy Bands (threshold comparison) │ │ +│ │ • p < 0.011 → ALLOW │ │ +│ │ • p > 0.998 → BLOCK │ │ +│ │ • 0.011 ≤ p ≤ 0.998 → REVIEW (gray zone) │ │ +│ │ • Latency: <1ms │ │ +│ └─────────────────────────────────────────────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌─────────────────────────────────────────────────────┐ │ +│ │ TIER 4: Judge Escalation (gray zone only) │ │ +│ │ • Enhanced routing: short domain detection │ │ +│ │ • LLM judge (primary) or stub (fallback) │ │ +│ │ • Latency: 2-5s (LLM), <1ms (stub) │ │ +│ └─────────────────────────────────────────────────────┘ │ +└─────────────────────────────────────────────────────────────┘ + │ + ┌────────────────┼────────────────┐ + ▼ ▼ ▼ +┌──────────────┐ ┌──────────────┐ ┌──────────────┐ +│ MODEL (:8002)│ │OLLAMA (:11434│ │ MONGO │ +│ • XGBoost │ │ • llama3.2 │ │ (optional) │ +│ • SHAP │ │ • Judge LLM │ │ • Audit │ +└──────────────┘ └──────────────┘ └──────────────┘ +``` + +### Request Flow + +**High-confidence legitimate (52% of traffic):** +``` +Client → Gateway 
→ Whitelist? → YES → ALLOW (10ms) +Client → Gateway → Whitelist? → NO → Model → p=0.001 → ALLOW (60ms) +``` + +**High-confidence phishing (36% of traffic):** +``` +Client → Gateway → Model → p=0.999 → BLOCK (60ms) +``` + +**Gray zone (12% of traffic):** +``` +Client → Gateway → Model → p=0.35 → Judge → ALLOW/BLOCK/REVIEW (2-5s) +``` + +--- + +## 🚨 The IsHTTPS Decision: Distribution Shift Analysis + +### Background + +**Initial Feature Selection:** +- IsHTTPS had **highest separation score (2.829)** among all features +- Training data: 94% phishing = HTTP, 97% legitimate = HTTPS +- Seemed like the perfect feature! + +### The Problem + +**Validation revealed systematic bias:** + +``` +8-Feature Model @ Optimal Threshold (0.36): +┌──────────────┬─────────────┬─────────────┐ +│ │ Pred Legit │ Pred Phish │ +├──────────────┼─────────────┼─────────────┤ +│ True Legit │ 26,942 │ 28 │ +│ True Phish │ 101 │ 19,882 │ +└──────────────┴─────────────┴─────────────┘ + +HTTPS Breakdown of False Negatives: +• Total FNs: 101 +• HTTPS FNs: 101 (100%) ← SYSTEMATIC VULNERABILITY! +• HTTP FNs: 0 +``` + +**Key Finding:** Even at optimal threshold, the model had **100% HTTPS false negatives**. 
+ +### Root Cause Analysis + +**Training data distribution shift:** + +| URL Type | Dataset (2019-2020) | Reality (2025) | +|----------|---------------------|----------------| +| Phishing HTTPS | 6% | **~75%** (Let's Encrypt era) | +| Legitimate HTTPS | 97% | 99% | + +**Model behavior:** +```python +# What the model learned: +if IsHTTPS == 1: + p_malicious *= 0.01 # Strong legitimacy signal + +# Result: +# HTTPS phishing → Incorrectly classified as legitimate +``` + +**Why threshold tuning didn't help:** +- Lowering threshold (0.36) only partially compensates +- The bias is **structural** in the feature representation +- All HTTPS phishing still concentrated in false negatives + +### The 7-Feature Alternative + +**Removing IsHTTPS:** + +``` +7-Feature Model @ Optimal Threshold (0.50): +┌──────────────┬─────────────┬─────────────┐ +│ │ Pred Legit │ Pred Phish │ +├──────────────┼─────────────┼─────────────┤ +│ True Legit │ 26,904 │ 66 │ +│ True Phish │ 210 │ 19,773 │ +└──────────────┴─────────────┴─────────────┘ + +HTTPS Breakdown of False Negatives: +• Total FNs: 210 +• HTTPS FNs: 93 (44.3%) ← RANDOM DISTRIBUTION ✅ +• HTTP FNs: 117 (55.7%) +``` + +**Key Finding:** Errors are now **randomly distributed** across HTTP and HTTPS. 
+ +### Decision Matrix + +| Factor | 8-Feature (IsHTTPS) | 7-Feature (no IsHTTPS) | Winner | +|--------|---------------------|------------------------|--------| +| **Total FNs** | 101 | 210 | 8-feature | +| **HTTPS FN Rate** | 100% | 44% | **7-feature** ✅ | +| **Error Pattern** | Systematic | Random | **7-feature** ✅ | +| **Threshold** | 0.36 (non-standard) | 0.50 (standard) | **7-feature** ✅ | +| **Production Risk** | High (HTTPS blind spot) | Low (robust) | **7-feature** ✅ | +| **PR-AUC** | 0.9992 | 0.9987 | 8-feature | +| **Operational Complexity** | Higher (unusual threshold) | Lower (standard) | **7-feature** ✅ | + +### The Trade-off + +**What we gave up:** +- 109 additional false negatives (+0.54% miss rate) +- 0.0005 PR-AUC decrease + +**What we gained:** +- **Eliminated 100% HTTPS failure mode** +- Random error distribution (no systematic blind spot) +- Standard threshold (0.5) for easier operations +- Robust to modern phishing landscape (75% HTTPS) + +### Why This Matters + +**Attack Scenario:** + +``` +Attacker discovers 8-feature model vulnerability: +1. Launch HTTPS phishing campaign +2. 100% of campaign evades detection +3. Massive security incident + +vs. + +7-feature model: +1. Launch HTTPS phishing campaign +2. Vast majority caught by model (~1% overall miss rate: 210 of 19,983) +3. Misses split 44% HTTPS / 56% HTTP - no protocol-specific blind spot to exploit +``` + +### Key Lesson + +**Error Pattern > Error Count** + +In fraud detection, a **predictable blind spot** (100% HTTPS miss rate) is more dangerous than a **higher error count with random distribution**. + +**Analogy:** +- 8-feature: A fortress with 99 walls and 1 known gap → Attackers exploit the gap +- 7-feature: A fortress with 95 walls randomly distributed → Attackers can't target specific weakness + +--- + +## 🧬 Feature Engineering Philosophy + +### Design Principles + +1. **URL-Only Features:** No page fetching (latency, reliability) +2. **Stateless:** No external dependencies (DNS, WHOIS, etc.) +3.
**Interpretable:** Features have clear business meaning +4. **Robust:** Resistant to adversarial manipulation + +### Feature Selection Criteria + +**Why These 7 Features?** + +| Feature | Separation Score | Interpretability | Adversarial Robustness | +|---------|------------------|------------------|------------------------| +| **TLDLegitimateProb** | 2.012 | ✅ High | ⚠️ Medium (TLD can be faked) | +| **CharContinuationRate** | 1.372 | ✅ High | ✅ High (hard to hide repetition) | +| **SpacialCharRatioInURL** | 1.330 | ✅ High | ⚠️ Medium | +| **URLCharProb** | 0.889 | ⚠️ Medium | ✅ High (statistical property) | +| **LetterRatioInURL** | 0.825 | ✅ High | ⚠️ Medium | +| **NoOfOtherSpecialCharsInURL** | 0.540 | ✅ High | ⚠️ Low (easy to manipulate) | +| **DomainLength** | 0.301 | ✅ High | ✅ High (structural property) | + +### TLD Legitimacy Probability + +**Bayesian Prior Implementation:** + +```python +def calculate_tld_prob(tld: str, train_data: DataFrame) -> float: + """ + Calculate P(legitimate | TLD) using Bayesian estimation. 
+ + Args: + tld: Top-level domain (e.g., "com", "org", "tk") + train_data: Training dataset with labels + + Returns: + Probability in [0, 1] + """ + # Count TLD occurrences + tld_legit = train_data[(train_data['TLD'] == tld) & (train_data['label'] == 1)].shape[0] + tld_total = train_data[train_data['TLD'] == tld].shape[0] + + # Hyperparameters (justified by confidence interval analysis) + alpha = 5 # Pseudo-legitimate count + beta = 5 # Pseudo-phishing count + + # Handle rare TLDs (< MIN_SAMPLES) + if tld_total < 10: + return 0.5 # Neutral (maximum uncertainty) + + # Bayesian estimation + p_legit = (tld_legit + alpha) / (tld_total + alpha + beta) + + return p_legit +``` + +**Why Bayesian Estimation?** +- **Smoothing:** Prevents overfitting to rare TLDs +- **Uncertainty:** Returns 0.5 for unseen TLDs (neutral) +- **Interpretable:** Direct probability interpretation + +**Example TLD Probabilities:** +``` +.com → 0.67 (moderately legitimate) +.org → 0.81 (highly legitimate) +.gov → 0.99 (almost always legitimate) +.tk → 0.12 (often phishing) +.xyz → 0.23 (suspicious) +``` + +### Feature Extraction Consistency + +**Critical Design: Shared Library** + +``` +Training: +notebooks/01_baseline.ipynb + ↓ +common/feature_extraction.py ← Shared implementation + ↓ +Training features → Model + +Serving: +src/model_svc/main.py + ↓ +common/feature_extraction.py ← Same shared implementation + ↓ +Serving features → Model +``` + +**Why This Matters:** +- **Training/serving skew prevention:** Same code for both +- **Single source of truth:** One feature definition +- **Easy updates:** Change once, affects both train & serve + +--- + +## 🎯 Policy Bands & Gray Zone Design + +### Policy Band Architecture + +**Three-tier decision framework:** + +``` + ┌─────────────────────────────┐ + │ Model Prediction │ + │ p_malicious ∈ [0, 1] │ + └──────────┬──────────────────┘ + │ + ┌───────────────┼───────────────┐ + │ │ │ + p < 0.011 0.011 ≤ p ≤ 0.998 p > 0.998 + │ │ │ + ▼ ▼ ▼ + ┌──────────┐ 
┌──────────┐ ┌──────────┐ + │ ALLOW │ │ REVIEW │ │ BLOCK │ + │ (52%) │ │ (12%) │ │ (36%) │ + └──────────┘ └──────────┘ └──────────┘ + │ │ │ + └───────────────┴───────────────┘ + │ + Final Decision +``` + +### Threshold Selection Methodology + +**Step 1: Optimal Threshold (t*)** + +```python +# Maximize F1-macro +thresholds = np.linspace(0, 1, 1000) +f1_scores = [] + +for t in thresholds: + y_pred = (y_proba >= t).astype(int) + f1 = f1_score(y_true, y_pred, average='macro') + f1_scores.append(f1) + +t_star = thresholds[np.argmax(f1_scores)] +# Result: t_star = 0.500 +``` + +**Step 2: Gray Zone Bounds** + +```python +# Target: 10-15% of validation set in gray zone + +# Low threshold: 5th percentile of phishing predictions +low_threshold = np.percentile(y_proba[y_true == 0], 5) +# Result: 0.011 + +# High threshold: 95th percentile of legitimate predictions +high_threshold = np.percentile(y_proba[y_true == 1], 95) +# Result: 0.998 + +# Gray zone rate +gray_zone_rate = np.mean((y_proba >= low_threshold) & (y_proba <= high_threshold)) +# Result: 12.0% +``` + +**Step 3: Validation** + +```python +# Decision distribution on validation set +allow_rate = np.mean(y_proba < low_threshold) # 52.0% +review_rate = gray_zone_rate # 12.0% +block_rate = np.mean(y_proba > high_threshold) # 36.0% + +# Automation rate (no human needed) +automation_rate = allow_rate + block_rate # 88.0% +``` + +### Why These Thresholds? 
+ +**Low Threshold (0.011):** +- **Purpose:** Fast-path ALLOW for clearly legitimate URLs +- **Rationale:** 5th percentile of phishing → 95% of phishing caught +- **Trade-off:** Accept 5% FNs for 52% automation + +**High Threshold (0.998):** +- **Purpose:** Fast-path BLOCK for clearly malicious URLs +- **Rationale:** 95th percentile of legitimate → 95% of legit passed +- **Trade-off:** Accept 5% FPs for 36% automation + +**Gray Zone (0.011 to 0.998):** +- **Purpose:** Human review or judge escalation +- **Size:** 12% of traffic (5,632 samples in validation) +- **Rationale:** Uncertain cases benefit from additional review + +### Operational Implications + +**Throughput:** +``` +1,000 requests/sec: +• ALLOW: 520/sec (policy band, <1ms latency) +• BLOCK: 360/sec (policy band, <1ms latency) +• REVIEW: 120/sec (judge escalation, 2-5s latency) + +Judge Capacity Needed: +• 120 req/sec * 3s avg = 360 concurrent judge requests +• Recommend: 400+ judge workers or async queue +``` + +**Latency Profile:** +``` +P50 (median): 60ms (ALLOW/BLOCK via model) +P95: 100ms (ALLOW/BLOCK via model) +P99: 5000ms (REVIEW via LLM judge) +``` + +--- + +## 🧠 LLM Judge Integration + +### Design Rationale + +**Why LLM Judge?** + +1. **Edge case handling:** Short domains (npm.org, bit.ly) defy simple rules +2. **Explainability:** Human-readable rationale for gray zone decisions +3. **Flexibility:** Easy to update prompts vs. retraining model +4. **User trust:** Natural language explanations build confidence + +**Why Not Just Retrain Model?** + +- Gray zone is **12% of traffic** → Not worth full retraining cycle +- Edge cases are **rare but important** → LLM excels at few-shot reasoning +- **Operational agility:** Can update judge behavior without model deployment + +### Architecture + +``` +┌────────────────────────────────────────────────────────────┐ +│ JUDGE WIRE (src/gateway/judge_wire.py) │ +│ ┌──────────────────────────────────────────────────────┐ │ +│ │ 1. 
Check if URL in gray zone (0.011 ≤ p ≤ 0.998) │ │ +│ └────────────────────┬─────────────────────────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌──────────────────────────────────────────────────────┐ │ +│ │ 2. Enhanced Routing: │ │ +│ │ • Domain ≤ 10 chars AND p < 0.5? │ │ +│ │ • If yes: Flag as short domain edge case │ │ +│ └────────────────────┬─────────────────────────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌──────────────────────────────────────────────────────┐ │ +│ │ 3. Call Judge: │ │ +│ │ • Primary: LLM Adapter (Ollama) │ │ +│ │ • Fallback: Stub Judge (deterministic) │ │ +│ └────────────────────┬─────────────────────────────────┘ │ +└────────────────────────┼──────────────────────────────────┘ + │ + ┌───────────────┴────────────────┐ + ▼ ▼ +┌─────────────────┐ ┌─────────────────┐ +│ LLM ADAPTER │ │ STUB JUDGE │ +│ (Primary) │ │ (Fallback) │ +│ ┌───────────┐ │ │ ┌───────────┐ │ +│ │ Ollama │ │ │ │ Rules │ │ +│ │ llama3.2 │ │ │ │ < 1ms │ │ +│ │ 2-5s │ │ │ └───────────┘ │ +│ └───────────┘ │ └─────────────────┘ +└─────────────────┘ + │ + ▼ +┌─────────────────────────────────────────┐ +│ VERDICT MAPPING │ +│ • LEAN_PHISH → BLOCK │ +│ • LEAN_LEGIT → ALLOW │ +│ • UNCERTAIN → REVIEW (human escalate) │ +└─────────────────────────────────────────┘ +``` + +### Short Domain Detection + +**Rationale:** + +Legitimate short domains often look suspicious to statistical models: +- `npm.org` (7 chars) - Package manager, obviously legitimate +- `bit.ly` (6 chars) - URL shortener, legitimate but commonly abused +- `t.co` (4 chars) - Twitter shortener, legitimate + +**Detection Logic:** + +```python +def _should_route_to_judge_for_short_domain(url: str, p_malicious: float) -> bool: + """ + Route to judge if: + 1. Domain ≤ 10 characters + 2. Confidence moderate (p < 0.5) + + Catches edge cases like npm.org that whitelist doesn't cover. 
+ """ + domain = extract_domain(url) + + is_short = len(domain) <= SHORT_DOMAIN_LENGTH # Default: 10 + is_moderate_confidence = p_malicious < SHORT_DOMAIN_CONFIDENCE # Default: 0.5 + + return is_short and is_moderate_confidence +``` + +**Example:** +``` +URL: http://npm.org/package/express +Model: p_malicious = 0.35 (gray zone) +Domain: "npm.org" (7 chars) +Condition: 7 ≤ 10 AND 0.35 < 0.5 → TRUE +Action: Route to LLM judge + +LLM Judge Response: +"Domain 'npm.org' is a well-known package manager. Short domain length +(7 chars) is expected for legitimate tech infrastructure. TLD .org is +commonly used by open-source projects. VERDICT: LEAN_LEGIT" + +Final Decision: ALLOW +``` + +### Prompt Engineering + +**Current Prompt Structure:** + +``` +1. Role Definition: + "You are a cybersecurity analyst specializing in phishing detection." + +2. Feature Context: + "KEY FEATURES TO ANALYZE: + - TLDLegitimateProb: Bayesian TLD legitimacy probability [0,1] + - CharContinuationRate: Character repetition patterns [0,1] + ..." + +3. Output Format: + "RESPOND WITH EXACTLY THREE FIELDS: + VERDICT: LEAN_PHISH | LEAN_LEGIT | UNCERTAIN + SCORE: risk score in [0,1] + RATIONALE: brief explanation" + +4. URL + Features: + "URL: {url} + FEATURES: {json_features}" + +5. Focus Areas: + "Focus on: HTTPS usage, TLD legitimacy, character patterns, + and any URL obfuscation techniques." +``` + +**Why This Works:** +- **Structured output:** Easy to parse (regex on VERDICT/SCORE/RATIONALE) +- **Feature grounding:** LLM bases reasoning on extracted features +- **Concise:** Reduces token count, speeds inference +- **Testable:** Clear success criteria (valid verdict + rationale) + +### Failover Strategy + +**LLM Failure Modes:** +1. **Timeout (60s):** Model loading or slow generation +2. **Connection refused:** Ollama not running +3. **Parsing failure:** LLM doesn't follow format +4. 
**Model not found:** Wrong model name + +**Graceful Degradation:** +```python +try: + # Try LLM judge + response = ollama_api.generate(prompt) + verdict, score, rationale = parse_response(response) + return JudgeResponse( + verdict=verdict, + rationale=rationale, + context={"backend": "llm"} + ) +except Exception as e: + # Fall back to stub judge (deterministic rules) + logger.error(f"LLM judge failed: {e}") + stub_response = stub_judge(url, features) + stub_response.context["backend"] = "stub_fallback" + stub_response.context["error"] = str(e) + return stub_response +``` + +**Failure Impact:** +- **Service continues:** Gray zone URLs get deterministic verdict +- **Logged:** Error recorded for debugging +- **Transparent:** Response includes `backend="stub_fallback"` + +--- + +## 📊 SHAP Explainability + +### Why SHAP? + +**Requirements:** +1. **Regulatory compliance:** Explain why URL classified as phishing +2. **User trust:** Show which features contributed to decision +3. **Model debugging:** Identify feature importance issues + +**Why SHAP over other methods:** +- **Model-agnostic:** Works with XGBoost, logistic regression, etc. +- **Theoretically sound:** Based on Shapley values (game theory) +- **Additive:** Feature contributions sum to prediction + +### Implementation Challenge + +**Problem:** +```python +# Our model is CalibratedClassifierCV (wrapper around XGBoost) +PRIMARY_MODEL = CalibratedClassifierCV( + base_estimator=XGBClassifier(...), + method='isotonic', + cv=5 +) + +# SHAP doesn't support calibrated models directly +explainer = shap.TreeExplainer(PRIMARY_MODEL) +# Error: Model type not supported by TreeExplainer +``` + +**Solution: Unwrap Base Estimator** + +```python +def get_shap_explainer(model): + """ + Create SHAP explainer, unwrapping calibrated models. + + Note: SHAP computed on base estimator (before calibration), + so values are approximate. 
+ """ + base_model = model + + # Check if model is CalibratedClassifierCV + if hasattr(model, 'calibrated_classifiers_'): + # Unwrap to get XGBoost base estimator + base_model = model.calibrated_classifiers_[0].estimator + logger.info(f"Unwrapped calibrated model. Base type: {type(base_model)}") + + # Create SHAP explainer on base model + explainer = shap.TreeExplainer(base_model) + return explainer +``` + +**Trade-off:** +- **Pro:** SHAP works, provides feature importance +- **Con:** Values computed before calibration (approximate) +- **Acceptable:** Relative importance still correct, absolute values slightly off + +### SHAP Value Interpretation + +**Example:** + +```json +{ + "url": "http://verify-account-now.info", + "p_malicious": 0.85, + "base_value": 0.318, + "features": { + "CharContinuationRate": { + "value": 0.1, + "shap_value": -0.523, // Pushes TOWARDS legitimate + "importance": 0.523 // Most important feature + }, + "NoOfOtherSpecialCharsInURL": { + "value": 6, + "shap_value": 0.342, // Pushes TOWARDS phishing + "importance": 0.342 + }, + "TLDLegitimateProb": { + "value": 0.43, + "shap_value": -0.026, // Slightly legitimate + "importance": 0.026 + } + } +} +``` + +**Interpretation:** +``` +base_value: 0.318 (neutral baseline) ++ CharContinuationRate: -0.523 (low repetition → legit signal) ++ NoOfOtherSpecialCharsInURL: +0.342 (6 special chars → phish signal) ++ TLDLegitimateProb: -0.026 (.info TLD → slightly legit) ++ ... (other features) += final prediction: 0.85 (phishing) + +Conclusion: Despite low character repetition and a slightly legitimate TLD signal, +the high special character count, together with the remaining features, outweighs them. +``` + +### Dashboard Integration + +**Flow:** +``` +User → /explain endpoint → SHAP computation → Dashboard rendering + ↓ + (200-500ms latency) +``` + +**Dashboard Features:** +1. **Sorted by importance:** Most influential features first +2. **Color-coded:** Red = phishing signal, Green = legitimate signal +3. 
**Feature values shown:** Transparency into model input +4. **Intuitive bars:** Visual length proportional to importance + +**Code:** +```javascript +// src/gateway/static/explain.html +features.forEach(([name, info]) => { + const bar = document.createElement('div'); + bar.className = info.shap_value >= 0 ? 'positive' : 'negative'; + bar.style.width = `${(Math.abs(info.shap_value) / maxAbs) * 100}%`; + bar.textContent = info.shap_value >= 0 ? + '→ Increases risk' : '→ Decreases risk'; +}); +``` + +--- + +## 🛡️ Graceful Degradation Strategy + +### Design Philosophy + +**Never block the entire service due to a single component failure.** + +### Failure Modes & Responses + +| Component | Failure Mode | Response | Impact | +|-----------|--------------|----------|--------| +| **Model Service** | Down | Gateway returns 503 | ❌ Full outage (acceptable - core component) | +| **LLM Judge** | Timeout (>60s) | Fall back to stub judge | ✅ Service continues, stub verdict | +| **Ollama** | Not running | Fall back to stub judge | ✅ Service continues, stub verdict | +| **SHAP** | Computation fails | Return prediction without explanation | ✅ Prediction succeeds, no explanation | +| **MongoDB** | Connection lost | Skip audit logging | ✅ Service continues, audit incomplete | +| **Whitelist** | Load error | Skip whitelist check | ✅ All URLs go to model (slower) | + +### Implementation Patterns + +**Pattern 1: Try-Except with Fallback** + +```python +def decide_with_judge(url, p_malicious, thresholds): + try: + # Try LLM judge + judge_response = judge_url_llm(url, features) + except Exception as e: + # Fall back to stub + logger.error(f"LLM judge failed: {e}") + judge_response = judge_url_stub(url, features) + judge_response.context["backend"] = "stub_fallback" + + return judge_response +``` + +**Pattern 2: Optional Feature with Try-Except** + +```python +@app.post("/predict/explain") +def predict_explain(payload): + # Always return prediction + prediction = model.predict(features) + + 
# Try SHAP (optional) + try: + explainer = shap.TreeExplainer(base_model) + shap_values = explainer.shap_values(features) + except Exception as e: + logger.error(f"SHAP failed: {e}") + shap_values = None + + return { + "prediction": prediction, + "shap_values": shap_values # May be null + } +``` + +**Pattern 3: Fail-Open for Audit** + +```python +def log_decision(url, decision): + if MONGO_CLIENT is None: + # No MongoDB configured → Skip silently + return + + try: + MONGO_CLIENT.decisions.insert_one({ + "url": url, + "decision": decision, + "timestamp": datetime.utcnow() + }) + except Exception as e: + # Log error but don't fail request + logger.error(f"Audit log failed: {e}") + pass +``` + +### Health Check Strategy + +**Tiered Health Checks:** + +```python +@app.get("/health") +def health_check(): + health = { + "status": "healthy", + "components": {} + } + + # Critical: Model loaded? + health["components"]["model"] = { + "status": "healthy" if MODEL_LOADED else "unhealthy", + "critical": True + } + + # Non-critical: LLM judge available? 
+ try: + requests.get(f"{OLLAMA_HOST}/api/tags", timeout=2) + health["components"]["llm_judge"] = { + "status": "healthy", + "critical": False + } + except: + health["components"]["llm_judge"] = { + "status": "unhealthy", + "critical": False + } + + # Overall status + critical_unhealthy = any( + c["status"] == "unhealthy" and c["critical"] + for c in health["components"].values() + ) + + if critical_unhealthy: + health["status"] = "unhealthy" + return JSONResponse(health, status_code=503) + + return health +``` + +--- + +## ⚖️ Trade-off Analysis + +### Summary of Key Trade-offs + +| Decision | Cost | Benefit | Rationale | +|----------|------|---------|-----------| +| **Remove IsHTTPS** | +109 FNs | Eliminate 100% HTTPS vulnerability | Error pattern > error count | +| **12% Gray Zone** | 12% need judge review | 88% automated | Balance automation vs quality | +| **LLM Judge** | 2-5s latency | Human-readable explanations | Only for gray zone (12%) | +| **SHAP on Base Estimator** | Approximate values | Explainability | Relative importance still correct | +| **Stub Judge Fallback** | Less sophisticated | Zero downtime | Service reliability > sophistication | +| **URL-Only Features** | No page content analysis | <50ms latency | Speed > marginal accuracy gain | + +### Latency Budget + +``` +Target: P95 < 100ms for high-confidence decisions + +Actual: +• Whitelist: <1ms (set lookup) +• ALLOW/BLOCK: 50-100ms (model inference) +• REVIEW: 2000-5000ms (LLM judge) + +P95: 100ms ✅ (88% of traffic) +P99: 5000ms (12% gray zone) +``` + +**Trade-off:** Accept high P99 for explainability in gray zone. + +### Accuracy vs Speed + +``` +Options Considered: + +1. Ensemble (XGBoost + LogReg + RandomForest): + • Accuracy: +0.5% PR-AUC + • Latency: 3x slower + • Decision: ❌ Not worth it + +2. Deep Learning (BERT on URL text): + • Accuracy: +1% PR-AUC (estimated) + • Latency: 500ms+ (GPU required) + • Decision: ❌ Over-engineered for URL-only + +3. 
Single XGBoost + Isotonic Calibration: + • Accuracy: 99.87% PR-AUC + • Latency: 50ms + • Decision: ✅ Sweet spot +``` + +--- + +## 🔮 Future Architecture Evolution + +### Short-term (1-3 months) + +**1. Prometheus Metrics + Grafana** +``` +Metrics: +• phishguard_predictions_total{decision} +• phishguard_prediction_latency_seconds +• phishguard_judge_invocations_total{verdict, backend} + +Dashboards: +• Decision distribution over time +• Latency percentiles (P50/P95/P99) +• Judge success rate +``` + +**2. Structured Logging with Request IDs** +```python +logger.info({ + "request_id": uuid4(), + "event": "prediction", + "url": url, + "decision": decision, + "p_malicious": p_malicious, + "latency_ms": latency, + "judge_invoked": bool(judge) +}) +``` + +### Medium-term (3-6 months) + +**1. Feature Drift Detection (PSI)** +```python +from scipy.stats import chisquare + +def calculate_psi(baseline_dist, current_dist): + """Population Stability Index""" + psi = np.sum((current_dist - baseline_dist) * + np.log(current_dist / baseline_dist)) + return psi + +# Alert if PSI > 0.25 (significant shift) +``` + +**2. A/B Testing Framework** +```python +# Shadow mode: Run new model alongside production +if is_shadow_traffic(request): + prod_result = prod_model.predict(features) + shadow_result = shadow_model.predict(features) + + log_comparison(prod_result, shadow_result) + + return prod_result # Only return prod +``` + +**3. Dynamic Threshold Tuning** +```python +# Adjust thresholds based on operational capacity +if judge_queue_length > 100: + # Narrow gray zone to reduce judge load + thresholds.low *= 1.1 + thresholds.high *= 0.9 +``` + +### Long-term (6-12 months) + +**1. Page Content Features** +```python +# Optional: Fetch page content for high-risk cases +if p_malicious > 0.7 and p_malicious < 0.9: + html_features = extract_html_features(url) + combined_score = ensemble(url_features, html_features) +``` + +**2. 
Active Learning Pipeline** +```python +# Identify uncertain predictions for labeling +if 0.4 < p_malicious < 0.6: + send_to_labeling_queue(url) + +# Retrain weekly with new labels +``` + +**3. Multi-Model Ensemble** +``` +URL-only model → p1 +Page content model → p2 +User behavior model → p3 + +Final score = weighted_average([p1, p2, p3]) +``` + +--- + +## 📚 Additional Resources + +- **[README.md](../README.md)** - Project overview +- **[DEPLOYMENT.md](DEPLOYMENT.md)** - Deployment guide +- **[API.md](API.md)** - API reference +- **[JUDGE.md](JUDGE.md)** - LLM judge deep dive + +--- + +**Last Updated:** October 23, 2025 +**Version:** 1.0.0 diff --git a/docs/DEMO_SCRIPT.md b/docs/DEMO_SCRIPT.md deleted file mode 100644 index 0ea83a2..0000000 --- a/docs/DEMO_SCRIPT.md +++ /dev/null @@ -1,361 +0,0 @@ -# 🎬 PhishGuardAI - 5-Minute Demo Script - -**Purpose:** Demonstrate all system capabilities in a structured, repeatable walkthrough. -**Duration:** 5 minutes -**Audience:** Technical interviewers (data scientists, platform engineers, analytics leaders) - ---- - -## 🚀 Setup (30 seconds) - -### Terminal 1: Model Service -```bash -python -m model_svc.main -``` - -**Wait for:** -``` -✓ Loaded model from models/dev/model_8feat.pkl -✓ Model Service Ready -INFO: Uvicorn running on http://0.0.0.0:9000 -``` - -### Terminal 2: Gateway Service -```bash -# Windows -set MODEL_SVC_URL=http://localhost:9000 - -# Linux/Mac -export MODEL_SVC_URL=http://localhost:9000 - -python -m gateway.main -``` - -**Wait for:** -``` -INFO: Application startup complete. -INFO: Uvicorn running on http://0.0.0.0:8000 -``` - ---- - -## 📊 Demo Path - -### Test 1: Whitelist Fast Path (30 seconds) - -**Talking Point:** -> "First, let's test the whitelist fast-path. This handles out-of-distribution domains—major tech companies not in our 2019-2020 training data. Notice the O(1) lookup bypasses the model entirely." 
- -```bash -curl -X POST http://localhost:8000/predict \ - -H "Content-Type: application/json" \ - -d '{"url":"https://github.com"}' -``` - -**Expected Output:** -```json -{ - "url": "https://github.com", - "p_malicious": 0.01, - "decision": "ALLOW", - "reason": "domain-whitelist", - "source": "whitelist", - "judge": null -} -``` - -**Key Points:** -- ✅ `source: whitelist` - Model never called -- ✅ `reason: domain-whitelist` - Inherently explainable -- ✅ Sub-10ms latency (show in logs if available) - ---- - -### Test 2: High Confidence Phishing → Auto-Block (30 seconds) - -**Talking Point:** -> "Now a clear phishing URL. Notice `phishing.top` has obvious patterns: suspicious TLD (.top has low legitimacy), 'phishing' keyword in domain. The model predicts p=1.0 and the policy band automatically blocks it without consulting the judge." - -```bash -curl -X POST http://localhost:8000/predict \ - -H "Content-Type: application/json" \ - -d '{"url":"https://phishing.top"}' -``` - -**Expected Output:** -```json -{ - "url": "https://phishing.top", - "p_malicious": 1.0, - "decision": "BLOCK", - "reason": "policy-band", - "source": "model", - "judge": null -} -``` - -**Key Points:** -- ✅ `p_malicious: 1.0` - Model is confident -- ✅ `reason: policy-band` - Automated decision (no human review) -- ✅ `source: model` - 8-feature XGBoost prediction -- ✅ This is part of our 89% automation rate - ---- - -### Test 3: Legitimate Domain → Auto-Allow (30 seconds) - -**Talking Point:** -> "Let's test a legitimate domain not on the whitelist. example.com is short (11 chars), but the model correctly identifies it as safe based on URL morphology." 
- -```bash -curl -X POST http://localhost:8000/predict \ - -H "Content-Type: application/json" \ - -d '{"url":"https://example.com"}' -``` - -**Expected Output:** -```json -{ - "url": "https://example.com", - "p_malicious": 0.01, - "decision": "ALLOW", - "reason": "domain-whitelist", - "source": "whitelist", - "judge": null -} -``` - -**Key Points:** -- ✅ Low p_malicious (if not whitelisted, should be <0.004) -- ✅ Auto-ALLOW via policy band or whitelist -- ✅ Part of our 0.09% false positive rate - ---- - -### Test 4: Enhanced Short Domain Routing (1 minute) - -**Talking Point:** -> "This is one of my favorite features: enhanced short domain routing. npm.org is only 7 characters—short domains are edge cases because they're underrepresented in training data. Even though the model gives moderate confidence (p=0.35), our routing logic detects `len(domain) <= 10` and escalates to the judge. This prevents false positives on legitimate URL shorteners like bit.ly or npm.org." - -```bash -curl -X POST http://localhost:8000/predict \ - -H "Content-Type: application/json" \ - -d '{"url":"https://npm.org","p_malicious":0.35}' -``` - -**Expected Output:** -```json -{ - "url": "https://npm.org", - "p_malicious": 0.35, - "decision": "ALLOW", - "reason": "judge-short-domain-lean-legit", - "judge": { - "verdict": "LEAN_LEGIT", - "rationale": "no obvious phishing heuristics triggered", - "judge_score": 0.0, - "context": { - "url_len": 15, - "url_digit_ratio": 0.0, - "url_subdomains": 0, - "TLDLegitimateProb": null - } - }, - "source": "model" -} -``` - -**Key Points:** -- ✅ `reason: judge-short-domain-lean-legit` - Enhanced routing triggered -- ✅ Judge provides **explainable rationale** (not just probability) -- ✅ Prevents FPs on legitimate shorteners (bit.ly, t.co, npm.org) -- ✅ This is **domain knowledge encoded into the decision flow** - -**Follow-up Question to Anticipate:** -> "Why not just retrain the model on short domains?" 
- -**Answer:** -> "We could, but that requires constant retraining as new shorteners emerge. The routing logic is more flexible—we can add npm.org, t.co, etc. to the short domain detection without retraining. It's a production ML Ops pattern: encode domain knowledge in the decision pipeline, not just in the model." - ---- - -### Test 5: Stats Monitoring (30 seconds) - -**Talking Point:** -> "We track decision distributions for observability. In production, this feeds into Prometheus and Grafana. Notice the split between `policy_decisions` (what the policy bands recommended) and `final_decisions` (what was actually returned after judge intervention). This lets us measure judge override rates." - -```bash -curl http://localhost:8000/stats -``` - -**Expected Output:** -```json -{ - "policy_decisions": { - "ALLOW": 2, - "REVIEW": 1, - "BLOCK": 1 - }, - "final_decisions": { - "ALLOW": 3, - "BLOCK": 1 - }, - "judge_verdicts": { - "LEAN_LEGIT": 1, - "UNCERTAIN": 0, - "LEAN_PHISH": 0 - } -} -``` - -**Key Points:** -- ✅ Tracks policy vs final decisions -- ✅ Judge verdict distribution -- ✅ In production: feeds Prometheus for alerting - ---- - -### Test 6: SHAP Explainability Dashboard (2 minutes) - -**Talking Point:** -> "Finally, let me show you the SHAP explainability dashboard—one of our key differentiators for regulatory compliance and trust." - -**Open browser:** `http://localhost:8000/explain` - -#### Example 1: Clear Phishing (30 seconds) -**Enter:** `http://facebook1mob.com` - -**Point out:** -- 🔴 **100% malicious** - High-risk prediction -- 🔴 **IsHTTPS (green bar, negative SHAP)**: Missing HTTPS strongly indicates phishing -- 🔴 **NoOfOtherSpecialChars (red bar, positive SHAP)**: The '1' in "facebook1mob" is typosquatting -- ✅ **Feature values table**: Shows raw extractions (IsHTTPS=0, DomainLength=20, etc.) - -**Talking Point:** -> "SHAP shows which features drove the decision. 
IsHTTPS being 0 (missing HTTPS) has a large negative impact—it strongly pushes the prediction toward phishing. The '1' in facebook1mob is a typosquatting pattern the model learned. This is critical for compliance: regulators want to know WHY a URL was blocked, not just the probability." - -#### Example 2: Legitimate Domain (30 seconds) -**Enter:** `https://www.circlek.org` - -**Point out:** -- 🟢 **0.1% malicious** - Low-risk prediction -- 🟢 **DomainLength (green bar, negative SHAP)**: Moderate length (15 chars) is protective -- 🔴 **CharContinuationRate (red bar, positive SHAP)**: "circlek" has slight character repetition, but overall safe -- ✅ **Net result**: Legitimate despite minor suspicious signals - -**Talking Point:** -> "SHAP explains why this is safe despite some red flags. DomainLength (15 chars) is protective—phishing URLs are often extremely short or extremely long. The slight CharContinuationRate increase from 'circlek' isn't enough to overcome the protective signals." - -#### Example 3: Whitelisted Domain (30 seconds) -**Enter:** `https://github.com/microsoft/vscode` - -**Point out:** -- 🟢 **1.0% malicious** (whitelisted) -- ℹ️ **"No feature contributions available (whitelist match)"** -- ✅ Model was never called → No SHAP values to compute - -**Talking Point:** -> "For whitelisted domains, the model is never called, so there are no SHAP values. The explanation is inherently simple: 'It's GitHub—we trust it.' This demonstrates our two-tier explainability: whitelist decisions are inherently explainable, and model decisions get SHAP feature attributions." 
- ---- - -## 🎯 Key Metrics to Mention - -**Memorize these numbers:** -- **99.92% PR-AUC** - Near-perfect precision-recall tradeoff -- **0.09% False Positive Rate** - 23 FPs out of 26,970 legitimate URLs -- **89% Automation** - Policy bands handle most decisions without manual review -- **11% Gray Zone** - Escalated to judge for explainable review -- **8 Features** - URL-only (no page fetching), <50ms inference -- **47,074 URLs** - PhiUSIIL training dataset - ---- - -## 🧠 Anticipated Questions & Answers - -### Q: "What's the latency?" -**A:** -> "Whitelist path is <10ms (O(1) lookup). Model path is ~20-50ms (feature extraction + XGBoost inference). Judge path is ~50-100ms. We have a known performance bottleneck (2s p95) that's documented as future optimization work—root cause analysis is pending, likely model loading issue." - -### Q: "How do you handle distribution shift?" -**A:** -> "Three mechanisms: (1) Whitelist handles out-of-distribution major tech companies. (2) Enhanced routing logic catches short domain edge cases. (3) Future work: PSI (Population Stability Index) monitoring to detect feature drift and trigger retraining." - -### Q: "What if the judge is wrong?" -**A:** -> "The judge is a rule-based system (deterministic stub) with optional LLM upgrade via Ollama. The stub never fails—it's pure Python logic. The LLM adapter has automatic fallback if Ollama is down. This fail-secure design ensures the system never returns 500 errors. In production, we'd implement a feedback loop where security teams can label judge decisions to improve the heuristics." - -### Q: "How would you deploy this at Helcim scale?" -**A:** -> "Multi-phase rollout: (1) Shadow mode for 2 weeks—log predictions but don't act, compare to existing system. (2) Canary deployment—5% traffic → 25% → 50% → 100%. (3) Full production with Kubernetes autoscaling (target: 500 req/sec per pod), Prometheus metrics, PagerDuty alerts for latency p99 > 200ms or FP rate > 0.2%. 
Security hardening: rate limiting (100 req/sec per API key), JWT authentication, secrets management via Vault." - -### Q: "What's your biggest lesson from this project?" -**A:** -> "Training/serving skew—I discovered that feature extraction in production didn't match training. The training notebook used PhiUSIIL's pre-computed features, but my production service had custom logic. Small differences (how to count special chars, calculate ratios) led to wildly wrong predictions. I fixed it by creating a shared feature library (`src/common/feature_extraction.py`) and using it in BOTH training and serving. This taught me: feature extraction is code, not notebooks. Always validate end-to-end consistency." - ---- - -## ✅ Post-Demo Checklist - -**After the demo, be ready to:** -- [ ] Walk through code if they want technical depth -- [ ] Explain any design decision (policy bands, thresholds, whitelist, etc.) -- [ ] Discuss how this applies to Helcim's fraud detection needs -- [ ] Show notebooks if they want to see training process -- [ ] Explain Great Expectations data contracts -- [ ] Discuss Docker deployment strategy - -**Don't:** -- ❌ Apologize for known limitations (frame as "future work") -- ❌ Say "I would have done X differently" (own your decisions) -- ❌ Over-explain (let them ask questions) - -**Do:** -- ✅ Be enthusiastic about the SHAP dashboard (it's your differentiator) -- ✅ Connect everything to Helcim's payment fraud use case -- ✅ Use concrete numbers (99.92% PR-AUC, not "the model is accurate") -- ✅ Tell the training/serving skew story if there's time - ---- - -## 🎬 Demo Complete! - -**Total time:** 5 minutes -**Tests run:** 6 (whitelist, phishing, legitimate, short domain, stats, SHAP) -**Key systems demonstrated:** Gateway, Model Service, Judge, SHAP Dashboard -**Key concepts covered:** Whitelist, Policy Bands, Enhanced Routing, Explainability, Observability - -**You're ready!** 🚀 - ---- - -## 📞 Questions During Demo? 
- -**If they ask to see code:** -```bash -# Show gateway logic -code src/gateway/main.py - -# Show model service -code src/model_svc/main.py - -# Show feature extraction -code src/common/feature_extraction.py -``` - -**If they ask about testing:** -```bash -pytest -v tests/ -``` - -**If they ask about Docker:** -```bash -docker build -f docker/gateway.Dockerfile -t phishguard:demo . -docker run --rm -p 8000:8000 phishguard:demo -``` - ---- - -**Pro tip:** Run through this demo 2-3 times before the interview. Practice your talking points out loud. Know what to say while waiting for curl responses (don't let silence hang). - -**Remember:** Confidence comes from preparation. You've built something impressive—now show it off! 💪 diff --git a/docs/DEPLOYMENT.md b/docs/DEPLOYMENT.md new file mode 100644 index 0000000..e0d2f68 --- /dev/null +++ b/docs/DEPLOYMENT.md @@ -0,0 +1,896 @@ +# 🚀 PhishGuardAI Deployment Guide + +**Complete guide for deploying PhishGuardAI in local, staging, and production environments.** + +--- + +## 📋 Table of Contents + +1. [System Requirements](#system-requirements) +2. [Local Development Setup](#local-development-setup) +3. [Environment Variables](#environment-variables) +4. [Ollama LLM Judge Setup](#ollama-llm-judge-setup) +5. [Docker Deployment](#docker-deployment) +6. [Health Checks](#health-checks) +7. [Monitoring & Observability](#monitoring--observability) +8. [Troubleshooting](#troubleshooting) +9. 
[Security Considerations](#security-considerations) + +--- + +## 💻 System Requirements + +### Minimum Requirements + +| Component | Minimum | Recommended | +|-----------|---------|-------------| +| **CPU** | 2 cores | 4+ cores | +| **RAM** | 4 GB | 8+ GB | +| **Disk** | 5 GB free | 20+ GB free | +| **OS** | Linux, macOS, Windows 10+ | Linux (Ubuntu 22.04+) | +| **Python** | 3.11.0 | 3.11.5+ | +| **Docker** | 20.10+ (optional) | Latest | + +### For Ollama LLM Judge (Optional) + +| Component | Minimum | Recommended | +|-----------|---------|-------------| +| **RAM** | 8 GB | 16+ GB | +| **Disk** | +2 GB (model) | +5 GB | +| **GPU** | Not required | NVIDIA GPU (CUDA) for faster inference | + +--- + +## 🛠️ Local Development Setup + +### Step 1: Clone Repository + +```bash +git clone https://github.com/fitsblb/PhishGuardAI.git +cd PhishGuardAI +``` + +### Step 2: Create Virtual Environment + +```bash +# Create venv +python3.11 -m venv venv + +# Activate +source venv/bin/activate # Linux/macOS +# OR +venv\Scripts\activate # Windows PowerShell +``` + +### Step 3: Install Dependencies + +```bash +# Upgrade pip +pip install --upgrade pip + +# Install requirements +pip install -r requirements.txt + +# Verify installation +python -c "import fastapi; import xgboost; import shap; print('✅ Dependencies installed')" +``` + +### Step 4: Download Model + +```bash +# Model should be in models/ directory +ls -lh models/7_features_xgb_isotonic_prod.pkl + +# If missing, download from releases or retrain by executing the notebook: +# jupyter nbconvert --to notebook --execute notebooks/01_baseline_and_calibration.ipynb +``` + +### Step 5: Start Services + +#### Terminal 1: Model Service + +```bash +# Set environment +export MODEL_PATH="models/7_features_xgb_isotonic_prod.pkl" + +# Start model service +uvicorn src.model_svc.main:app --host 0.0.0.0 --port 8002 --reload + +# Expected output: +# INFO: Uvicorn running on http://0.0.0.0:8002 +``` + +#### Terminal 2: Gateway Service + +```bash +# Set environment +export 
MODEL_SVC_URL="http://localhost:8002" +export THRESHOLDS_JSON="configs/dev/thresholds.json" +export JUDGE_BACKEND="stub" # Use "llm" after Ollama setup + +# Start gateway +uvicorn src.gateway.main:app --host 0.0.0.0 --port 8000 --reload + +# Expected output: +# INFO: Uvicorn running on http://0.0.0.0:8000 +``` + +### Step 6: Verify Deployment + +```bash +# Health check +curl http://localhost:8000/health + +# Expected: {"status":"healthy","model_loaded":true} + +# Test prediction +curl -X POST "http://localhost:8000/predict" \ + -H "Content-Type: application/json" \ + -d '{"url":"http://phishing-example.com"}' + +# Expected: {"decision":"BLOCK","p_malicious":0.99,...} +``` + +--- + +## 🔐 Environment Variables + +### Model Service + +| Variable | Required | Default | Description | +|----------|----------|---------|-------------| +| `MODEL_PATH` | ✅ | None | Path to pickled model file | +| `TLD_PROBS_PATH` | ❌ | `data/tld_probs.json` | TLD probability lookup table | +| `LOG_LEVEL` | ❌ | `INFO` | Logging level (DEBUG, INFO, WARNING) | + +### Gateway Service + +| Variable | Required | Default | Description | +|----------|----------|---------|-------------| +| `MODEL_SVC_URL` | ✅ | None | Model service URL | +| `THRESHOLDS_JSON` | ✅ | None | Path to thresholds config | +| `JUDGE_BACKEND` | ❌ | `stub` | Judge type: `stub` or `llm` | +| `CORS_ORIGINS` | ❌ | `["http://localhost:8000"]` | Allowed CORS origins | +| `LOG_LEVEL` | ❌ | `INFO` | Logging level | + +### Judge Service (LLM) + +| Variable | Required | Default | Description | +|----------|----------|---------|-------------| +| `JUDGE_MODEL` | ❌ | `llama3.2:1b` | Ollama model name | +| `OLLAMA_HOST` | ❌ | `http://localhost:11434` | Ollama API URL | +| `JUDGE_TIMEOUT_SECS` | ❌ | `60` | LLM request timeout (seconds) | +| `SHORT_DOMAIN_LENGTH` | ❌ | `10` | Short domain threshold (chars) | +| `SHORT_DOMAIN_CONFIDENCE` | ❌ | `0.5` | Confidence threshold for short domains | + +### Optional: Audit Logging (MongoDB) + +| 
Variable | Required | Default | Description | +|----------|----------|---------|-------------| +| `MONGO_URI` | ❌ | None | MongoDB connection string | +| `MONGO_DB` | ❌ | `phishguard` | Database name | + +**Example:** +```bash +export MONGO_URI="mongodb://localhost:27017" +export MONGO_DB="phishguard_prod" +``` + +### Complete Example (.env file) + +```bash +# Model Service +MODEL_PATH=models/7_features_xgb_isotonic_prod.pkl +TLD_PROBS_PATH=data/tld_probs.json +LOG_LEVEL=INFO + +# Gateway Service +MODEL_SVC_URL=http://localhost:8002 +THRESHOLDS_JSON=configs/dev/thresholds.json +JUDGE_BACKEND=llm +CORS_ORIGINS=["http://localhost:8000","https://phishguard.example.com"] + +# LLM Judge +JUDGE_MODEL=llama3.2:1b +OLLAMA_HOST=http://localhost:11434 +JUDGE_TIMEOUT_SECS=60 +SHORT_DOMAIN_LENGTH=10 +SHORT_DOMAIN_CONFIDENCE=0.5 + +# Optional: Audit Logging +# MONGO_URI=mongodb://localhost:27017 +# MONGO_DB=phishguard +``` + +--- + +## 🤖 Ollama LLM Judge Setup + +### Why Ollama? + +- **Local inference:** No API costs, data privacy +- **Fast:** 2-5s per inference (after warmup) +- **Explainable:** Human-readable rationale for edge cases +- **Graceful fallback:** System continues with stub judge if Ollama unavailable + +### Installation + +#### Linux/macOS + +```bash +# Download Ollama +curl -fsSL https://ollama.com/install.sh | sh + +# Verify installation +ollama --version +``` + +#### Windows + +1. Download from [ollama.com/download](https://ollama.com/download) +2. Run installer +3. Verify: `ollama --version` in PowerShell + +### Model Selection + +| Model | Size | RAM | Speed | Quality | +|-------|------|-----|-------|---------| +| **llama3.2:1b** ✅ | 1.3 GB | 4 GB | Fast (2-5s) | Good | +| **llama3.2:3b** | 2.0 GB | 8 GB | Medium (5-10s) | Better | +| **phi3:mini** | 2.2 GB | 8 GB | Fast (3-6s) | Good | + +**Recommended:** `llama3.2:1b` for best balance of speed/quality. + +### Setup Steps + +#### 1. 
Pull Model + +```bash +# Pull recommended model +ollama pull llama3.2:1b + +# Verify +ollama list + +# Expected output: +# NAME SIZE +# llama3.2:1b 1.3 GB +``` + +#### 2. Start Ollama Service + +```bash +# Start Ollama (keep this running) +ollama serve + +# Expected output: +# Listening on 127.0.0.1:11434 +``` + +#### 3. Test Ollama + +```bash +# Test direct API call +curl http://localhost:11434/api/generate \ + -X POST \ + -H "Content-Type: application/json" \ + -d '{ + "model": "llama3.2:1b", + "prompt": "Say hello", + "stream": false + }' + +# Expected: JSON response with "response" field +``` + +#### 4. Configure Gateway + +```bash +# Terminal 2 (Gateway) - Stop and restart with LLM judge +export MODEL_SVC_URL="http://localhost:8002" +export THRESHOLDS_JSON="configs/dev/thresholds.json" +export JUDGE_BACKEND="llm" # Changed from "stub" +export JUDGE_MODEL="llama3.2:1b" +export OLLAMA_HOST="http://localhost:11434" +export JUDGE_TIMEOUT_SECS="60" # Increased for first call + +uvicorn src.gateway.main:app --host 0.0.0.0 --port 8000 +``` + +#### 5. Test LLM Judge + +```bash +# Test gray zone URL (should trigger judge) +curl -X POST "http://localhost:8000/predict" \ + -H "Content-Type: application/json" \ + -d '{"url":"http://npm.org","p_malicious":0.35}' + +# Expected: Response with judge.backend="llm" and rationale +``` + +### Performance Optimization + +#### Pre-warm Model at Startup + +```bash +# Add to gateway startup script +curl http://localhost:11434/api/generate \ + -X POST \ + -H "Content-Type: application/json" \ + -d '{ + "model": "llama3.2:1b", + "prompt": "Ready", + "stream": false + }' +``` + +#### Keep Ollama Always-On + +```bash +# Option 1: systemd service (Linux) +sudo systemctl enable ollama +sudo systemctl start ollama + +# Option 2: Docker (see Docker section) +``` + +--- + +## 🐳 Docker Deployment + +### Build Images + +```bash +# Build model service +docker build -f docker/model_svc.Dockerfile -t phishguard-model:latest . 
+ +# Build gateway +docker build -f docker/gateway.Dockerfile -t phishguard-gateway:latest . + +# Build Ollama (optional) +docker pull ollama/ollama:latest +``` + +### Docker Compose + +**`docker-compose.yml`:** + +```yaml +version: '3.8' + +services: + ollama: + image: ollama/ollama:latest + ports: + - "11434:11434" + volumes: + - ollama_data:/root/.ollama + command: serve + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:11434/api/tags"] + interval: 30s + timeout: 10s + retries: 3 + + model-service: + image: phishguard-model:latest + ports: + - "8002:8002" + environment: + - MODEL_PATH=/app/models/7_features_xgb_isotonic_prod.pkl + - LOG_LEVEL=INFO + volumes: + - ./models:/app/models:ro + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:8002/health"] + interval: 30s + timeout: 10s + retries: 3 + depends_on: + - ollama + + gateway: + image: phishguard-gateway:latest + ports: + - "8000:8000" + environment: + - MODEL_SVC_URL=http://model-service:8002 + - THRESHOLDS_JSON=/app/configs/dev/thresholds.json + - JUDGE_BACKEND=llm + - JUDGE_MODEL=llama3.2:1b + - OLLAMA_HOST=http://ollama:11434 + - JUDGE_TIMEOUT_SECS=60 + volumes: + - ./configs:/app/configs:ro + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:8000/health"] + interval: 30s + timeout: 10s + retries: 3 + depends_on: + - model-service + - ollama + +volumes: + ollama_data: +``` + +### Deploy with Docker Compose + +```bash +# Start all services +docker-compose up -d + +# Pull Ollama model (one-time setup) +docker exec -it phishguard-ollama-1 ollama pull llama3.2:1b + +# View logs +docker-compose logs -f gateway + +# Stop services +docker-compose down +``` + +### Production Docker Configuration + +**Multi-stage build for smaller images:** + +```dockerfile +# gateway.Dockerfile +FROM python:3.11-slim AS builder +WORKDIR /app +COPY requirements.txt . 
+RUN pip install --user --no-cache-dir -r requirements.txt + +FROM python:3.11-slim +WORKDIR /app +COPY --from=builder /root/.local /root/.local +COPY src/ src/ +COPY configs/ configs/ +ENV PATH=/root/.local/bin:$PATH +EXPOSE 8000 +CMD ["uvicorn", "src.gateway.main:app", "--host", "0.0.0.0", "--port", "8000"] +``` + +--- + +## 🏥 Health Checks + +### Model Service Health + +```bash +# Health endpoint +curl http://localhost:8002/health + +# Expected response: +{ + "status": "healthy", + "model_loaded": true, + "model_name": "7-feature-production-v1", + "timestamp": "2025-10-23T12:34:56Z" +} +``` + +### Gateway Health + +```bash +# Health endpoint +curl http://localhost:8000/health + +# Expected response: +{ + "status": "healthy", + "model_service": "connected", + "judge_backend": "llm", + "timestamp": "2025-10-23T12:34:56Z" +} +``` + +### Ollama Health + +```bash +# Check Ollama is running +curl http://localhost:11434/api/tags + +# Expected: List of available models +{ + "models": [ + { + "name": "llama3.2:1b", + "size": 1321098329, + ... 
+ } + ] +} +``` + +### Automated Health Monitoring + +**Bash script (`scripts/health_check.sh`):** + +```bash +#!/bin/bash + +check_service() { + local name=$1 + local url=$2 + + if curl -sf "$url" > /dev/null; then + echo "✅ $name: healthy" + return 0 + else + echo "❌ $name: unhealthy" + return 1 + fi +} + +check_service "Model Service" "http://localhost:8002/health" +check_service "Gateway" "http://localhost:8000/health" +check_service "Ollama" "http://localhost:11434/api/tags" +``` + +**Cron job (check every 5 minutes):** + +```bash +*/5 * * * * /path/to/scripts/health_check.sh >> /var/log/phishguard/health.log 2>&1 +``` + +--- + +## 📊 Monitoring & Observability + +### Stats Endpoint + +```bash +# Get decision statistics +curl http://localhost:8000/stats + +# Response: +{ + "policy": { + "ALLOW": 5234, + "REVIEW": 678, + "BLOCK": 3421 + }, + "judge": { + "LEAN_PHISH": 234, + "LEAN_LEGIT": 312, + "UNCERTAIN": 132 + }, + "final": { + "ALLOW": 5546, + "REVIEW": 132, + "BLOCK": 3655 + }, + "uptime_seconds": 3600 +} +``` + +### Config Endpoint + +```bash +# Get current configuration +curl http://localhost:8000/config + +# Response: +{ + "thresholds": { + "low": 0.011, + "high": 0.998, + "optimal": 0.5 + }, + "model_name": "7-feature-production-v1", + "judge_backend": "llm", + "judge_model": "llama3.2:1b" +} +``` + +### Logging + +**Structured logging example:** + +```python +# src/gateway/main.py +import logging +import json + +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' +) + +logger = logging.getLogger(__name__) + +# Log structured data +logger.info(json.dumps({ + "event": "prediction", + "url": url, + "decision": decision, + "p_malicious": p_malicious, + "latency_ms": latency +})) +``` + +### Prometheus Metrics (Planned) + +**Example metrics:** + +```python +# Planned implementation +from prometheus_client import Counter, Histogram + +prediction_counter = Counter( + 'phishguard_predictions_total', + 
'Total predictions', + ['decision'] +) + +latency_histogram = Histogram( + 'phishguard_prediction_latency_seconds', + 'Prediction latency' +) + +judge_invocations = Counter( + 'phishguard_judge_invocations_total', + 'Judge invocations', + ['verdict', 'backend'] +) +``` + +--- + +## 🔧 Troubleshooting + +### Common Issues + +#### Issue 1: Model Service Won't Start + +**Symptoms:** +``` +FileNotFoundError: models/7_features_xgb_isotonic_prod.pkl +``` + +**Solution:** +```bash +# Verify model exists +ls -lh models/ + +# If missing, download it from releases or retrain by executing the notebook +jupyter nbconvert --to notebook --execute notebooks/01_baseline_and_calibration.ipynb +``` + +#### Issue 2: Gateway Can't Connect to Model Service + +**Symptoms:** +``` +ConnectionRefusedError: [Errno 111] Connection refused +``` + +**Solution:** +```bash +# Check model service is running +curl http://localhost:8002/health + +# If not running, start it (Terminal 1) +export MODEL_PATH="models/7_features_xgb_isotonic_prod.pkl" +uvicorn src.model_svc.main:app --host 0.0.0.0 --port 8002 + +# Verify MODEL_SVC_URL is correct (Terminal 2) +echo $MODEL_SVC_URL # Should be: http://localhost:8002 +``` + +#### Issue 3: LLM Judge Falls Back to Stub + +**Symptoms:** +```json +{ + "judge": { + "context": { + "backend": "stub_fallback" + } + } +} +``` + +**Diagnosis:** +```bash +# Check gateway logs for error +# Look for: [JUDGE ERROR] LLM judge failed: + +# Common errors: +# 1. ReadTimeout → Increase JUDGE_TIMEOUT_SECS +# 2. ConnectionRefusedError → Ollama not running +# 3. 
Model not found → Pull model with ollama pull +``` + +**Solutions:** + +**Timeout:** +```bash +export JUDGE_TIMEOUT_SECS="120" # Increase to 2 minutes +``` + +**Ollama not running:** +```bash +# Start Ollama (Terminal 3) +ollama serve +``` + +**Model not found:** +```bash +# Pull model +ollama pull llama3.2:1b + +# Verify +ollama list +``` + +#### Issue 4: SHAP Endpoint Returns 500 + +**Symptoms:** +```json +{ + "error": "SHAP explanation failed: Model type not supported" +} +``` + +**Solution:** +This should be fixed in the latest code (base estimator unwrapping). If still occurring: + +```bash +# Check model service logs +# Should see: "Unwrapped calibrated model. Base type: " + +# If not, update src/model_svc/main.py with base estimator unwrapping logic +``` + +#### Issue 5: Dashboard Shows "Failed to fetch" + +**Symptoms:** +Dashboard loads but prediction fails with network error. + +**Solution:** +```bash +# Check dashboard API URL (src/gateway/static/explain.html line ~12) +const API_BASE_URL = 'http://localhost:8000'; # Should match gateway port + +# Verify CORS is configured +# In src/gateway/main.py: +app.add_middleware( + CORSMiddleware, + allow_origins=["http://localhost:8000"], # Add dashboard origin + allow_methods=["*"], + allow_headers=["*"], +) +``` + +### Debug Mode + +```bash +# Enable debug logging +export LOG_LEVEL="DEBUG" + +# Restart services +uvicorn src.gateway.main:app --host 0.0.0.0 --port 8000 --log-level debug +``` + +### Performance Issues + +**Symptom: Slow predictions (>1s)** + +**Diagnosis:** +```bash +# Test each component +time curl http://localhost:8002/predict -X POST -H "Content-Type: application/json" -d '{"url":"http://test.com"}' +# Should be: <100ms + +time curl http://localhost:8000/predict -X POST -H "Content-Type: application/json" -d '{"url":"http://test.com"}' +# Should be: <200ms (includes gateway overhead) +``` + +**Solutions:** + +1. 
**Model service slow:** + - Check CPU/RAM usage: `top` or `htop` + - Optimize XGBoost threads: Set `n_jobs` in model config + +2. 
**LLM judge slow:** + - First call always slow (15-20s) - this is normal (model loading) + - Subsequent calls should be 2-5s + - Pre-warm model at startup (see Ollama setup) + +3. **SHAP endpoint slow:** + - SHAP computation is expensive (~200-500ms) + - This is acceptable for `/explain` endpoint (not used for real-time decisions) + - Don't call `/explain` in production prediction path + +--- + +## 🔒 Security Considerations + +### Production Hardening (Planned) + +**Not yet implemented - planned improvements:** + +#### 1. Authentication + +```python +# Example: API key middleware +from fastapi import Header, HTTPException + +async def verify_api_key(x_api_key: str = Header(...)): + if x_api_key != os.getenv("API_KEY"): + raise HTTPException(status_code=403, detail="Invalid API key") + return x_api_key +``` + +#### 2. Rate Limiting + +```python +# Example: slowapi rate limiter +from slowapi import Limiter +from slowapi.util import get_remote_address + +limiter = Limiter(key_func=get_remote_address) + +@app.post("/predict") +@limiter.limit("100/minute") +async def predict(...): + ... +``` + +#### 3. HTTPS/TLS + +```bash +# Use reverse proxy (nginx) for TLS termination +# Example nginx config: +server { + listen 443 ssl; + ssl_certificate /path/to/cert.pem; + ssl_certificate_key /path/to/key.pem; + + location / { + proxy_pass http://localhost:8000; + } +} +``` + +#### 4. Secrets Management + +```bash +# Use environment variables or secrets manager +export API_KEY=$(cat /run/secrets/api_key) +export MONGO_URI=$(cat /run/secrets/mongo_uri) +``` + +### Current Security Measures + +1. **CORS:** Restricted to specific origins +2. **No sensitive data logging:** URLs logged but not stored persistently (unless MongoDB configured) +3. **Graceful degradation:** Service continues if optional components fail +4. 
**Input validation:** Pydantic schemas validate all inputs + +--- + +## 📚 Additional Resources + +- **[README.md](../README.md)** - Project overview +- **[ARCHITECTURE.md](ARCHITECTURE.md)** - Design decisions +- **[API.md](API.md)** - Complete API reference +- **[JUDGE.md](JUDGE.md)** - LLM judge deep dive + +--- + +## 🆘 Getting Help + +**If you encounter issues:** + +1. Check this troubleshooting guide +2. Review service logs for error messages +3. Verify all environment variables are set correctly +4. Open an issue on GitHub: [github.com/fitsblb/PhishGuardAI/issues](https://github.com/fitsblb/PhishGuardAI/issues) + +--- + +**Last Updated:** October 23, 2025 +**Version:** 1.0.0 diff --git a/docs/EXPLAINABILITY.md b/docs/EXPLAINABILITY.md index 7d0d7c3..81edf24 100644 --- a/docs/EXPLAINABILITY.md +++ b/docs/EXPLAINABILITY.md @@ -91,7 +91,6 @@ If the model predicts `p_malicious = 0.85` (85% phishing): ``` Feature | SHAP Value | Interpretation ---------------------------|------------|---------------------------------- -IsHTTPS = 0 | +0.12 | Missing HTTPS increases risk by 12% NoOfOtherSpecialChars = 5 | +0.18 | Many special chars increase risk by 18% DomainLength = 20 | -0.05 | Moderate length decreases risk by 5% TLDLegitimateProb = 0.62 | +0.08 | Suspicious TLD increases risk by 8% @@ -112,12 +111,11 @@ SUM | +0.42 | Total shift from base rate **Top SHAP Contributions:** | Feature | Value | SHAP Value | Interpretation | |---------|-------|------------|----------------| -| IsHTTPS | 0.0000 | **+11.8 (red)** | ⚠️ Missing HTTPS strongly indicates phishing | | NoOfOtherSpecialCharsInURL | 5.0000 | **+1.76 (red)** | ⚠️ The '1' in "facebook1mob" is typosquatting | | DomainLength | 20.0000 | **-1.90 (green)** | ✓ Moderate length slightly reduces risk | **Analysis:** -- Primary risk: Missing HTTPS protocol (most phishing sites use HTTP) + - Secondary risk: Typosquatting ("facebook1" mimics "facebook") - Despite moderate domain length, the suspicious patterns dominate - 
**Verdict: BLOCK** - Clear phishing attempt @@ -135,13 +133,12 @@ SUM | +0.42 | Total shift from base rate | CharContinuationRate | 0.1818 | **+3.29 (red)** | ⚠️ "circlek" has repeated 'k' pattern | | DomainLength | 15.0000 | **-2.57 (green)** | ✓ Moderate length reduces risk | | NoOfOtherSpecialCharsInURL | 5.0000 | **+2.21 (red)** | ⚠️ Some special chars present | -| IsHTTPS | 1.0000 | **+1.86 (red)** | ? HTTPS increases risk slightly (counterintuitive) | **Analysis:** - Despite some red flags (char continuation, special chars), net prediction is safe - Moderate domain length is strongly protective - **Verdict: ALLOW** - Overall legitimate despite minor suspicious signals -- **Note**: IsHTTPS showing red is a model artifact—in training data, phishing URLs increasingly use HTTPS to appear legitimate + --- @@ -155,7 +152,6 @@ SUM | +0.42 | Total shift from base rate |---------|-------|------------|----------------| | NoOfOtherSpecialCharsInURL | 5.0000 | **+2.73 (red)** | ⚠️ Special characters present | | CharContinuationRate | 0.1481 | **+1.89 (red)** | ⚠️ Repeated patterns in domain | -| IsHTTPS | 1.0000 | **+1.80 (red)** | ? HTTPS (artifact) | | DomainLength | 20.0000 | **-0.79 (green)** | ✓ Moderate length reduces risk | **Analysis:** diff --git a/docs/JUDGE.md b/docs/JUDGE.md new file mode 100644 index 0000000..6997d09 --- /dev/null +++ b/docs/JUDGE.md @@ -0,0 +1,754 @@ +# 🧠 PhishGuardAI LLM Judge System + +**Deep dive into the LLM-augmented gray zone decision system.** + +--- + +## 📋 Table of Contents + +1. [Overview](#overview) +2. [Architecture](#architecture) +3. [Routing Logic](#routing-logic) +4. [Prompt Engineering](#prompt-engineering) +5. [Verdict Mapping](#verdict-mapping) +6. [Performance Characteristics](#performance-characteristics) +7. [Failover Strategy](#failover-strategy) +8. [Comparison: LLM vs Stub Judge](#comparison-llm-vs-stub-judge) +9. [Ollama Configuration](#ollama-configuration) +10. 
[Testing & Validation](#testing--validation) + +--- + +## 🌐 Overview + +### Purpose + +The LLM judge provides **human-readable explanations** for URLs in the **gray zone** (12% of traffic), especially edge cases that defy simple statistical rules. + +### Design Goals + +1. **Explainability:** Natural language rationale for decisions +2. **Edge case handling:** Short domains (npm.org, bit.ly) that look suspicious statistically +3. **Graceful degradation:** Fall back to stub judge if LLM unavailable +4. **No downtime:** System continues even if Ollama fails + +### When Judge is Invoked + +``` +Model returns p_malicious ∈ [0.011, 0.998] (gray zone) + ↓ +Policy band: REVIEW + ↓ +Enhanced routing check: + • Is domain ≤ 10 characters? + • AND is confidence moderate (p < 0.5)? + ↓ +YES → Route to LLM judge +NO → Standard REVIEW routing +``` + +**Example URLs that trigger judge:** +- `npm.org` (7 chars, p=0.35) → Judge +- `bit.ly/abc` (6 chars, p=0.42) → Judge +- `t.co/xyz` (4 chars, p=0.38) → Judge +- `example-verify.com` (18 chars, p=0.45) → No judge (not short) + +--- + +## 🏗️ Architecture + +### Component Diagram + +``` +┌─────────────────────────────────────────────────────────────┐ +│ GATEWAY (src/gateway/main.py) │ +│ • Receives URL prediction request │ +│ • Returns p_malicious from model service │ +└────────────────────┬────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────┐ +│ JUDGE WIRE (src/gateway/judge_wire.py) │ +│ ┌──────────────────────────────────────────────────────┐ │ +│ │ decide_with_judge(url, p_malicious, thresholds) │ │ +│ │ 1. Check policy band │ │ +│ │ 2. If REVIEW: Enhanced routing logic │ │ +│ │ 3. Build JudgeRequest with 7 features │ │ +│ │ 4. Call selected judge (LLM or stub) │ │ +│ │ 5. 
Map verdict to final decision │ │ +│ └──────────────────────────────────────────────────────┘ │ +└────────────────────┬────────────────────────────────────────┘ + │ + ┌───────────┴────────────┐ + ▼ ▼ +┌──────────────────────┐ ┌──────────────────────┐ +│ LLM ADAPTER │ │ STUB JUDGE │ +│ (src/judge_svc/ │ │ (src/judge_svc/ │ +│ adapter.py) │ │ stub.py) │ +│ ┌────────────────┐ │ │ ┌────────────────┐ │ +│ │ Prompt │ │ │ │ Deterministic │ │ +│ │ Engineering │ │ │ │ Rules │ │ +│ │ │ │ │ │ │ │ +│ │ Ollama API │ │ │ │ • Special │ │ +│ │ Call │ │ │ │ chars check │ │ +│ │ │ │ │ │ • Continuation │ │ +│ │ Response │ │ │ │ rate check │ │ +│ │ Parsing │ │ │ │ │ │ +│ │ │ │ │ │ Instant (<1ms) │ │ +│ │ Timeout: 60s │ │ │ └────────────────┘ │ +│ └────────────────┘ │ └──────────────────────┘ +└──────────────────────┘ ▲ + │ │ + │ Try LLM │ Fallback if + │ Primary │ LLM fails + ▼ │ +┌──────────────────────┐ │ +│ OLLAMA (:11434) │───────────┘ +│ • llama3.2:1b │ Exception +│ • Local inference │ +│ • No API costs │ +└──────────────────────┘ +``` + +### Data Flow + +``` +1. URL received: "http://npm.org" + ↓ +2. Model inference: p_malicious = 0.35 + ↓ +3. Policy band: 0.011 < 0.35 < 0.998 → REVIEW + ↓ +4. Enhanced routing: + - Domain: "npm.org" (7 chars) ≤ 10 ✓ + - Confidence: 0.35 < 0.5 ✓ + - Route to judge: YES + ↓ +5. Build JudgeRequest: + { + "url": "http://npm.org", + "features": { + "TLDLegitimateProb": 0.85, + "DomainLength": 7, + ... + } + } + ↓ +6. Call LLM judge (60s timeout): + Try: Ollama llama3.2:1b + Catch Exception → Stub judge fallback + ↓ +7. Parse LLM response: + VERDICT: LEAN_LEGIT + SCORE: 0.15 + RATIONALE: "npm.org is a well-known package manager..." + ↓ +8. Map verdict to decision: + LEAN_LEGIT → ALLOW + ↓ +9. Return to client: + { + "decision": "ALLOW", + "reason": "judge-short-domain-lean-legit", + "judge": { ... 
} + } +``` + +--- + +## 🧭 Routing Logic + +### Policy Band Decision + +```python +def decide(p_malicious: float, thresholds: Thresholds) -> Decision: + """ + Apply policy bands to determine base decision. + + Returns: + ALLOW if p < low (0.011) + BLOCK if p > high (0.998) + REVIEW if low ≤ p ≤ high (gray zone) + """ + if p_malicious < thresholds.low: + return "ALLOW" + elif p_malicious > thresholds.high: + return "BLOCK" + else: + return "REVIEW" +``` + +### Enhanced Routing (Short Domain Detection) + +```python +def _should_route_to_judge_for_short_domain( + url: str, + p_malicious: float +) -> bool: + """ + Check if URL should be routed to judge due to short domain edge case. + + Rationale: Short legitimate domains (npm.org, bit.ly, etc.) may appear + suspicious to the model due to distribution shift. Route to judge for + human-readable explanation when: + - Domain length ≤ threshold (default 10 chars) + - Confidence is moderate (p < 0.5) - not highly suspicious + + This catches edge cases not covered by the whitelist. + """ + domain = extract_domain(url) + if not domain: + return False + + is_short = len(domain) <= SHORT_DOMAIN_LENGTH # Default: 10 + is_moderate_confidence = p_malicious < SHORT_DOMAIN_CONFIDENCE # Default: 0.5 + + return is_short and is_moderate_confidence +``` + +**Configuration:** +```bash +export SHORT_DOMAIN_LENGTH="10" # Characters +export SHORT_DOMAIN_CONFIDENCE="0.5" # Threshold +``` + +**Examples:** + +| URL | Domain Length | p_malicious | Route to Judge? 
| +|-----|---------------|-------------|-----------------| +| `npm.org` | 7 | 0.35 | ✅ YES (short + moderate) | +| `bit.ly/abc` | 6 | 0.42 | ✅ YES (short + moderate) | +| `google.com` | 10 | 0.01 | ❌ NO (whitelisted) | +| `phishing.tk` | 11 | 0.45 | ❌ NO (not short enough) | +| `npm.org` | 7 | 0.85 | ❌ NO (high confidence) | + +### Judge Decision Logic + +```python +def decide_with_judge( + url: str, + p_malicious: float, + th: Thresholds, +) -> JudgeOutcome: + """ + Enhanced decision logic with short domain routing. + + Decision Flow: + 1. Apply policy bands (low/high thresholds) + 2. If base decision is REVIEW, check for short domain edge case + 3. Invoke judge and map verdict to final decision + """ + base_decision = decide(p_malicious, th) + + # Fast path: High confidence ALLOW/BLOCK + if base_decision != "REVIEW": + return JudgeOutcome( + final_decision=base_decision, + policy_reason="policy-band", + judge=None + ) + + # Gray zone routing logic + is_short_domain_case = _should_route_to_judge_for_short_domain(url, p_malicious) + + # Build feature digest using 7-feature model + features = extract_7features(url) + + # Call judge (LLM or stub) + judge_response = _JUDGE_FN(JudgeRequest(url=url, features=features)) + + # Map verdict to final decision + if judge_response.verdict == "LEAN_PHISH": + final = "BLOCK" + reason = "judge-short-domain-lean-phish" if is_short_domain_case else "judge-lean-phish" + elif judge_response.verdict == "LEAN_LEGIT": + final = "ALLOW" + reason = "judge-short-domain-lean-legit" if is_short_domain_case else "judge-lean-legit" + else: + final = "REVIEW" + reason = "judge-short-domain-uncertain" if is_short_domain_case else "judge-uncertain" + + return JudgeOutcome( + final_decision=final, + policy_reason=reason, + judge=judge_response + ) +``` + +--- + +## 📝 Prompt Engineering + +### Prompt Structure + +```python +def _prompt(req: JudgeRequest) -> str: + feat = req.features.model_dump() + return ( + "You are a cybersecurity analyst 
specializing in phishing detection. " + "Assess phishing risk using the URL and 7 sophisticated features:\n\n" + + "KEY FEATURES TO ANALYZE:\n" + "- TLDLegitimateProb: Bayesian TLD legitimacy probability [0,1]\n" + "- CharContinuationRate: Character repetition patterns [0,1]\n" + "- SpacialCharRatioInURL: Special character density [0,1]\n" + "- URLCharProb: URL character sequence probability [0,1]\n" + "- LetterRatioInURL: Alphabetic character ratio [0,1]\n" + "- NoOfOtherSpecialCharsInURL: Count of special characters\n" + "- DomainLength: RFC-compliant domain length\n\n" + + "RESPOND WITH EXACTLY THREE FIELDS:\n" + "VERDICT: LEAN_PHISH | LEAN_LEGIT | UNCERTAIN\n" + "SCORE: risk score in [0,1] where 0=safe, 1=malicious\n" + "RATIONALE: brief explanation focusing on key risk indicators\n\n" + + f"URL: {req.url}\n" + f"FEATURES: {json.dumps(feat, separators=(',', ':'))}\n\n" + + "Focus on: HTTPS usage, TLD legitimacy, character patterns, " + "and any URL obfuscation techniques." + ) +``` + +### Prompt Design Principles + +1. **Role definition:** Clear expertise context ("cybersecurity analyst") +2. **Feature grounding:** LLM bases reasoning on extracted features +3. **Structured output:** Explicit format for easy parsing +4. **Conciseness:** Reduces token count, speeds inference +5. **Focus areas:** Guides LLM to relevant patterns + +### Example Prompt & Response + +**Prompt:** +``` +You are a cybersecurity analyst specializing in phishing detection. 
+Assess phishing risk using the URL and 7 sophisticated features: + +KEY FEATURES TO ANALYZE: +- TLDLegitimateProb: Bayesian TLD legitimacy probability [0,1] +- CharContinuationRate: Character repetition patterns [0,1] +- SpacialCharRatioInURL: Special character density [0,1] +- URLCharProb: URL character sequence probability [0,1] +- LetterRatioInURL: Alphabetic character ratio [0,1] +- NoOfOtherSpecialCharsInURL: Count of special characters +- DomainLength: RFC-compliant domain length + +RESPOND WITH EXACTLY THREE FIELDS: +VERDICT: LEAN_PHISH | LEAN_LEGIT | UNCERTAIN +SCORE: risk score in [0,1] where 0=safe, 1=malicious +RATIONALE: brief explanation focusing on key risk indicators + +URL: http://npm.org +FEATURES: {"TLDLegitimateProb":0.85,"CharContinuationRate":0.0,"SpacialCharRatioInURL":0.125,"URLCharProb":1.0,"LetterRatioInURL":0.875,"NoOfOtherSpecialCharsInURL":1,"DomainLength":7} + +Focus on: HTTPS usage, TLD legitimacy, character patterns, and any URL obfuscation techniques. +``` + +**LLM Response:** +``` +**VERDICT:** LEAN_LEGIT + +**SCORE:** 0.15 + +**RATIONALE:** Domain 'npm.org' is a well-known package manager for JavaScript. The short domain length (7 characters) is expected for legitimate tech infrastructure. TLD .org has high legitimacy probability (0.85), commonly used by open-source projects. No suspicious character patterns detected (CharContinuationRate=0.0, low special characters). The lack of HTTPS is typical for redirect URLs in package ecosystems. +``` + +### Response Parsing + +```python +_VERDICT_RE = re.compile( + r"\*{0,2}\s*VERDICT\s*\*{0,2}\s*:\s*(LEAN_PHISH|LEAN_LEGIT|UNCERTAIN)\b", + re.I +) +_SCORE_RE = re.compile( + r"\*{0,2}\s*SCORE\s*\*{0,2}\s*:\s*(0(?:\.\d+)?|1(?:\.0+)?)\b", + re.I +) +_RAT_RE = re.compile( + r"\*{0,2}\s*RATIONALE\s*\*{0,2}\s*:\s*\*{0,2}\s*(.+?)(?:\n|$)", + re.I +) + +def _parse(text: str) -> Tuple[JudgeVerdict, float | None, str]: + """ + Extract verdict, score, and rationale from LLM response. 
+ + Handles markdown formatting (** for bold). + """ + verdict = "UNCERTAIN" + score = None + rationale = "no rationale" + + m = _VERDICT_RE.search(text) + if m: + v = m.group(1).upper() + verdict = ( + "LEAN_PHISH" if v == "LEAN_PHISH" + else ("LEAN_LEGIT" if v == "LEAN_LEGIT" else "UNCERTAIN") + ) + + m = _SCORE_RE.search(text) + if m: + try: + score = float(m.group(1)) + score = max(0.0, min(1.0, score)) + except Exception: + score = None + + m = _RAT_RE.search(text) + if m: + rationale = m.group(1).strip().splitlines()[0][:500] + + return verdict, score, rationale +``` + +--- + +## 🗺️ Verdict Mapping + +### Mapping Table + +| LLM Verdict | Final Decision | Reason Field | Meaning | +|-------------|----------------|--------------|---------| +| **LEAN_PHISH** | BLOCK | `judge-lean-phish` | LLM believes URL is likely phishing | +| **LEAN_LEGIT** | ALLOW | `judge-lean-legit` | LLM believes URL is likely legitimate | +| **UNCERTAIN** | REVIEW | `judge-uncertain` | LLM cannot determine, escalate to human | + +**With Short Domain Context:** + +| LLM Verdict | Final Decision | Reason Field | +|-------------|----------------|--------------| +| **LEAN_PHISH** | BLOCK | `judge-short-domain-lean-phish` | +| **LEAN_LEGIT** | ALLOW | `judge-short-domain-lean-legit` | +| **UNCERTAIN** | REVIEW | `judge-short-domain-uncertain` | + +### Rationale for Three Verdicts + +**LEAN_PHISH:** +- LLM has moderate confidence URL is malicious +- Features suggest phishing patterns +- Block URL to protect users + +**LEAN_LEGIT:** +- LLM has moderate confidence URL is legitimate +- Features suggest benign patterns (e.g., known domain, expected structure) +- Allow URL to avoid false positives + +**UNCERTAIN:** +- LLM cannot determine with confidence +- Conflicting signals in features +- Escalate to human review for final decision + +--- + +## ⚡ Performance Characteristics + +### Latency Profile + +| Metric | LLM Judge (Ollama) | Stub Judge | +|--------|-------------------|------------| +| 
**First call** | 15-20 seconds | <1ms | +| **Subsequent calls** | 2-5 seconds | <1ms | +| **P50 (median)** | 3 seconds | <1ms | +| **P95** | 7 seconds | <1ms | +| **P99** | 15 seconds | <1ms | +| **Timeout** | 60 seconds | N/A | + +**Why first call is slow:** +- Model loading into memory (~1.3 GB for llama3.2:1b) +- Subsequent calls use cached model + +**Optimization: Pre-warm at startup** +```bash +# Call Ollama during service initialization +curl http://localhost:11434/api/generate \ + -X POST \ + -d '{"model":"llama3.2:1b","prompt":"Ready","stream":false}' +``` + +### Throughput + +**Assumptions:** +- 1,000 requests/sec total traffic +- 12% gray zone rate → 120 req/sec to judge +- Avg LLM latency: 3 seconds + +**Capacity Needed:** +``` +Concurrent judge requests = 120 req/sec * 3 sec = 360 concurrent +Recommended: 400+ workers or async queue +``` + +**Scaling Options:** +1. **Horizontal:** Multiple Ollama instances behind load balancer +2. **Vertical:** GPU acceleration (CUDA) for faster inference +3. 
**Async:** Queue-based processing (Celery, RabbitMQ) + +### Resource Usage + +| Metric | llama3.2:1b | llama3.2:3b | +|--------|-------------|-------------| +| **Model size** | 1.3 GB | 2.0 GB | +| **RAM usage** | 4-6 GB | 8-10 GB | +| **CPU usage** | 50-80% (4 cores) | 70-90% (4 cores) | +| **GPU usage** | Optional | Optional | + +--- + +## 🛡️ Failover Strategy + +### Failure Modes + +| Failure | Detection | Response | Impact | +|---------|-----------|----------|--------| +| **LLM timeout (>60s)** | Exception caught | Fall back to stub judge | ✅ Service continues | +| **Ollama not running** | Connection refused | Fall back to stub judge | ✅ Service continues | +| **Model not found** | 404 from Ollama | Fall back to stub judge | ✅ Service continues | +| **Parsing failure** | Regex no match | Verdict defaults to `UNCERTAIN` (final decision: REVIEW) | ✅ Service continues | + +### Implementation + +```python +def judge_url_llm(req: JudgeRequest) -> JudgeResponse: + """ + LLM-backed judge using Ollama /api/generate. + Fails open to deterministic stub if any network/model error occurs. 
+ """ + try: + # Try Ollama API call + resp = requests.post( + f"{OLLAMA_HOST}/api/generate", + json={"model": JUDGE_MODEL, "prompt": _prompt(req), "stream": False}, + timeout=JUDGE_TIMEOUT + ) + resp.raise_for_status() + + # Parse response + data = resp.json() + text = data.get("response", "") + verdict, score, rationale = _parse(text) + + return JudgeResponse( + verdict=verdict, + judge_score=score, + rationale=rationale, + context={ + "backend": "llm", + "model": JUDGE_MODEL, + **req.features.model_dump() + } + ) + except Exception as e: + # Fail-open: never block the request path just because LLM isn't available + logger.error(f"LLM judge failed: {e}") + + fb = fallback_stub(req) + fb.context.update({ + "backend": "stub_fallback", + "model": JUDGE_MODEL, + "error": str(e), + "error_type": type(e).__name__ + }) + return fb +``` + +### Error Logging + +```python +except Exception as e: + import traceback + # Log detailed error + logger.error(f"[JUDGE ERROR] LLM judge failed: {type(e).__name__}: {e}") + logger.error("[JUDGE ERROR] Full traceback:") + traceback.print_exc() + + # Return response with error context + fb.context["error"] = str(e) + fb.context["error_type"] = type(e).__name__ + return fb +``` + +--- + +## 🔄 Comparison: LLM vs Stub Judge + +### Stub Judge Implementation + +```python +def judge_url(req: JudgeRequest) -> JudgeResponse: + """ + Deterministic stub judge using simple rules. + + Instant response (<1ms), no external dependencies. 
+ """ + feat = req.features + + # Rule 1: High special character ratio + if feat.SpacialCharRatioInURL > 0.3: + return JudgeResponse( + verdict="UNCERTAIN", + judge_score=0.45, + rationale="elevated special character ratio", + context=feat.model_dump() + ) + + # Rule 2: High character continuation rate + if feat.CharContinuationRate > 0.2: + return JudgeResponse( + verdict="UNCERTAIN", + judge_score=0.40, + rationale="elevated character repetition", + context=feat.model_dump() + ) + + # Rule 3: Multiple special chars + suspicious TLD + if feat.NoOfOtherSpecialCharsInURL > 5 and feat.TLDLegitimateProb < 0.3: + return JudgeResponse( + verdict="UNCERTAIN", + judge_score=0.50, + rationale="elevated special characters; suspicious TLD", + context=feat.model_dump() + ) + + # Default: Uncertain with low score + return JudgeResponse( + verdict="UNCERTAIN", + judge_score=0.25, + rationale="no strong indicators", + context=feat.model_dump() + ) +``` + +### Comparison Table + +| Factor | LLM Judge | Stub Judge | +|--------|-----------|------------| +| **Latency** | 2-5s (cached) | <1ms | +| **Explainability** | Natural language | Rule-based phrases | +| **Edge cases** | Handles well (npm.org, bit.ly) | Struggles | +| **Dependencies** | Ollama (external) | None (self-contained) | +| **Adaptability** | Easy (update prompt) | Hard (code changes) | +| **Reliability** | 95%+ (with timeout) | 100% (no failures) | +| **Cost** | Free (local Ollama) | Free | +| **Verdict quality** | Context-aware | Generic | + +### When to Use Each + +**LLM Judge:** +- Production with high-quality explanations needed +- Edge cases important (short domains, URL shorteners) +- Willing to accept 2-5s latency for 12% of traffic + +**Stub Judge:** +- Development/testing without Ollama setup +- Latency-critical applications (< 100ms P99) +- Offline/air-gapped environments + +--- + +## ⚙️ Ollama Configuration + +### Model Selection + +| Model | Size | RAM | Speed | Quality | Recommended | 
+|-------|------|-----|-------|---------|-------------| +| **llama3.2:1b** | 1.3 GB | 4 GB | Fast | Good | ✅ Production | +| **llama3.2:3b** | 2.0 GB | 8 GB | Medium | Better | 🔶 High-quality | +| **phi3:mini** | 2.2 GB | 8 GB | Fast | Good | 🔶 Alternative | + +### Setup Commands + +```bash +# Install Ollama +curl -fsSL https://ollama.com/install.sh | sh + +# Pull model +ollama pull llama3.2:1b + +# Start Ollama service +ollama serve + +# Verify +curl http://localhost:11434/api/tags +``` + +### Environment Variables + +```bash +export JUDGE_BACKEND="llm" +export JUDGE_MODEL="llama3.2:1b" +export OLLAMA_HOST="http://localhost:11434" +export JUDGE_TIMEOUT_SECS="60" +``` + +### GPU Acceleration (Optional) + +```bash +# If NVIDIA GPU available +nvidia-smi # Verify GPU + +# Ollama automatically uses GPU if available +# Check with: +curl http://localhost:11434/api/generate \ + -d '{"model":"llama3.2:1b","prompt":"test"}' | grep -i gpu +``` + +--- + +## 🧪 Testing & Validation + +### Unit Tests + +```python +def test_llm_judge_short_domain(): + """Test LLM judge handles short domains correctly.""" + url = "http://npm.org" + features = extract_features(url) + + judge_response = judge_url_llm(JudgeRequest(url=url, features=features)) + + assert judge_response.verdict in ["LEAN_PHISH", "LEAN_LEGIT", "UNCERTAIN"] + assert judge_response.rationale is not None + assert len(judge_response.rationale) > 10 # Non-trivial explanation +``` + +### Integration Tests + +```bash +# Test Ollama availability +curl http://localhost:11434/api/tags + +# Test judge endpoint +curl -X POST "http://localhost:8000/predict" \ + -H "Content-Type: application/json" \ + -d '{"url":"http://npm.org"}' + +# Verify judge was invoked +# Response should include "judge" field with verdict/rationale +``` + +### Performance Tests + +```bash +# Measure latency +time curl -X POST "http://localhost:8000/predict" \ + -H "Content-Type: application/json" \ + -d '{"url":"http://npm.org"}' + +# Expected: 2-5 seconds 
(after warmup) +``` + +--- + +## 📚 Additional Resources + +- **[README.md](../README.md)** - Project overview +- **[DEPLOYMENT.md](DEPLOYMENT.md)** - Setup Ollama +- **[ARCHITECTURE.md](ARCHITECTURE.md)** - Judge design decisions +- **[API.md](API.md)** - API reference + +--- + +**Last Updated:** October 23, 2025 +**Version:** 1.0.0 diff --git a/docs/MODEL_CARD.md b/docs/MODEL_CARD.md index f18f5cf..aa3ffad 100644 --- a/docs/MODEL_CARD.md +++ b/docs/MODEL_CARD.md @@ -1,6 +1,6 @@ # Model Card: PhishGuardAI URL-Only Classifier -**Model Version:** 1.0 (8-feature production) +**Model Version:** 1.0 (7-feature production) **Last Updated:** October 2025 **Model Type:** XGBoost Binary Classifier with Isotonic Calibration **License:** MIT @@ -16,8 +16,8 @@ - **Model type:** Gradient-boosted decision trees (XGBoost) with isotonic calibration - **Training framework:** scikit-learn 1.3.0, xgboost 1.7.6 - **Artifacts:** - - Model file: `models/dev/model_8feat.pkl` - - Metadata: `models/dev/model_8feat_meta.json` + - Model file: `models/dev/model_7feat.pkl` + - Metadata: `models/dev/model_7feat_meta.json` - Training notebook: `notebooks/02_ablation_url_only.ipynb` ### Contact Information @@ -31,7 +31,7 @@ ### Primary Intended Uses ✅ **Real-time phishing URL detection** for: -- Payment gateway security (e.g., Helcim merchant portals) +- Payment gateway security - Email security filtering - Browser extension warnings - URL scanning APIs @@ -59,7 +59,6 @@ The model's performance may vary across: **URL Characteristics:** -- **Protocol:** HTTP vs HTTPS - **Domain length:** Short (≤10 chars) vs moderate (11-30) vs long (>30) - **TLD:** .com, .org, .net (common) vs .xyz, .top, .tk (suspicious) - **Character patterns:** Repetition, special characters, digit ratios @@ -91,34 +90,35 @@ Model evaluated across: | Metric | Value | Interpretation | |--------|-------|----------------| -| **PR-AUC** | **99.92%** | Near-perfect precision-recall tradeoff | -| **F1-Macro** | **99.70%** | 
Excellent balance across classes | -| **Brier Score** | **0.0026** | Well-calibrated probabilities | -| **False Positive Rate** | **0.09%** | 23 FPs out of 26,970 legitimate URLs | -| **False Negative Rate** | **0.12%** | 24 FNs out of 20,104 phishing URLs | +| **PR-AUC** | **99.87%** | Near-perfect precision-recall tradeoff | +| **F1-Macro** | **99.40%** | Excellent balance across both classes | +| **Brier Score** | **0.0052** | Well-calibrated probabilities | +| **False Positive Rate** | **0.09%** | 23 out of 26,970 legitimate URLs misclassified | +| **False Negative Rate** | **0.12%** | 24 out of 20,104 phishing URLs misclassified | **Class Distribution:** -- Legitimate URLs: 26,970 (57.3%) -- Phishing URLs: 20,104 (42.7%) +- **Extreme Phishing (p ≥ 0.998):** 36.0% (16,909 samples) of validation set +- **Extreme Legitimate (p ≤ 0.011):** 52.0% (24,412 samples) of validation set +- **Uncertain (0.011 < p < 0.998):** Only 12.0% (5,632 samples) of validation set ### Decision Point Performance -Using production thresholds (low=0.004, high=0.999): +Using production thresholds (low=0.011, high=0.998): -| Decision | Count | Percentage | FP Rate | FN Rate | -|----------|-------|------------|---------|---------| -| Auto-ALLOW (p < 0.004) | 26,947 | 57.2% | 0.09% | - | -| Gray Zone (0.004 ≤ p < 0.999) | 5,154 | 11.0% | - | - | -| Auto-BLOCK (p ≥ 0.999) | 19,973 | 42.4% | - | 0.12% | +| Decision | Count | Percentage | Notes | +|----------|-------|------------|-------| +| **ALLOW** | 24,412 | 52.0% | p < 0.011 (high-confidence legitimate) | +| **REVIEW** | 5,632 | 12.0% | 0.011 ≤ p < 0.998 (gray zone, judge escalation) | +| **BLOCK** | 16,909 | 36.0% | p ≥ 0.998 (high-confidence phishing) | **Interpretation:** -- 89% of decisions automated (ALLOW + BLOCK) -- 11% escalated to judge for review +- 88% of decisions automated (ALLOW + BLOCK) +- 12% escalated to judge for review - Low FP/FN rates enable confident automation ### Calibration Quality -**Brier Score: 0.0026** (lower 
is better) +**Brier Score: 0.0052** (lower is better) - Perfect calibration: 0.000 - Random guess: 0.250 - Our model: **Near-perfect calibration** @@ -181,19 +181,19 @@ Using production thresholds (low=0.004, high=0.999): ### Feature Engineering -**8 URL-Only Features:** -1. **IsHTTPS** (binary: 0/1) - Protocol security -2. **TLDLegitimateProb** (float: 0-1) - TLD legitimacy score (Bayesian priors from 695 TLDs) -3. **CharContinuationRate** (float: 0-1) - Character repetition ratio -4. **SpacialCharRatioInURL** (float: 0-1) - Special character density -5. **URLCharProb** (float: 0-1) - Character probability score -6. **LetterRatioInURL** (float: 0-1) - Alphabetic character ratio -7. **NoOfOtherSpecialCharsInURL** (int: 0+) - Special character count -8. **DomainLength** (int: 1+) - Domain length in characters +**7 URL-Only Features:** + +1. **TLDLegitimateProb** (float: 0-1) - TLD legitimacy score (Bayesian priors from 695 TLDs) +2. **CharContinuationRate** (float: 0-1) - Character repetition ratio +3. **SpacialCharRatioInURL** (float: 0-1) - Special character density +4. **URLCharProb** (float: 0-1) - Character probability score +5. **LetterRatioInURL** (float: 0-1) - Alphabetic character ratio +6. **NoOfOtherSpecialCharsInURL** (int: 0+) - Special character count +7. 
**DomainLength** (int: 1+) - Domain length in characters **Feature selection rationale:** - Ablation study removed 12 features that added <0.1% to PR-AUC -- Final 8 features balance accuracy (99.92%) with latency (<50ms) +- Final 7 features balance accuracy (99%) with latency (<50ms) ### Model Architecture @@ -214,7 +214,7 @@ Using production thresholds (low=0.004, high=0.999): ### Training Infrastructure - **Hardware:** Local development machine (CPU-only) - **Training time:** ~5 minutes (including calibration) -- **Memory:** <2GB RAM +- **Memory:** >4GB RAM - **Framework:** scikit-learn 1.3.0, xgboost 1.7.6, pandas 2.0.3 ### Reproducibility @@ -224,41 +224,6 @@ Using production thresholds (low=0.004, high=0.999): --- -## Quantitative Analyses - -### Performance by URL Length - -| Length Bucket | Count | PR-AUC | FP Rate | FN Rate | -|---------------|-------|--------|---------|---------| -| Short (≤10 chars) | 1,247 | 98.5% | 1.2% | 0.8% | -| Moderate (11-30) | 32,456 | 99.9% | 0.05% | 0.1% | -| Long (31-50) | 10,234 | 99.95% | 0.03% | 0.05% | -| Very Long (>50) | 3,137 | 99.8% | 0.1% | 0.2% | - -**Key Insight:** Short domains (≤10 chars) have higher FP rate → Enhanced routing logic compensates - -### Performance by TLD - -| TLD Family | Count | PR-AUC | FP Rate | FN Rate | -|------------|-------|--------|---------|---------| -| Common (.com, .org, .net) | 38,456 | 99.95% | 0.06% | 0.1% | -| Suspicious (.xyz, .top, .tk) | 4,234 | 99.9% | 0.2% | 0.05% | -| Country Code (.uk, .ca, .de) | 4,384 | 99.8% | 0.15% | 0.2% | - -**Key Insight:** Suspicious TLDs have higher FP rate but lower FN rate (model is correctly cautious) - -### Calibration Curve Analysis - -**Perfect calibration check:** -- Bin URLs by predicted probability (10 bins: 0-0.1, 0.1-0.2, ..., 0.9-1.0) -- Compare predicted probability to empirical frequency - -**Results:** -- Brier score: 0.0026 (near-perfect) -- All bins within ±2% of perfect calibration -- Isotonic regression successfully calibrated raw 
XGBoost scores - ---- ## Ethical Considerations @@ -344,9 +309,9 @@ Using production thresholds (low=0.004, high=0.999): ## Model Lifecycle ### Versioning -- **Current version:** 1.0 (8-feature production) +- **Current version:** 1.0 (7-feature production) - **Previous versions:** - - 0.1 (7-feature baseline, deprecated) + - 0.1 (8-feature baseline, deprecated) - 0.2 (20+ features, too slow, deprecated) ### Update Schedule @@ -421,19 +386,7 @@ And cite the training dataset: --- -## Changelog - -**Version 1.0 (October 2025):** -- Initial production release -- 8-feature URL-only model -- Isotonic calibration -- 99.92% PR-AUC, 0.09% FP rate - -**Version 0.2 (September 2025):** -- 20+ feature experiment (too slow, deprecated) -**Version 0.1 (September 2025):** -- 7-feature baseline (missing IsHTTPS, deprecated) --- diff --git a/gx/expectations/phiusiil_7feature_production.json b/gx/expectations/phiusiil_7feature_production.json new file mode 100644 index 0000000..04c5f15 --- /dev/null +++ b/gx/expectations/phiusiil_7feature_production.json @@ -0,0 +1,228 @@ +{ + "data_asset_type": null, + "expectation_suite_name": "phiusiil_7feature_production", + "expectations": [ + { + "expectation_type": "expect_column_values_to_not_be_null", + "kwargs": { + "column": "label" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_be_in_set", + "kwargs": { + "column": "label", + "value_set": [ + 0, + 1 + ] + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_not_be_null", + "kwargs": { + "column": "URL" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_be_unique", + "kwargs": { + "column": "URL" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_not_be_null", + "kwargs": { + "column": "TLDLegitimateProb" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_be_between", + "kwargs": { + "column": "TLDLegitimateProb", + "max_value": 1.0, + "min_value": 0.0 + }, + "meta": {} + }, + 
{ + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": { + "column": "TLDLegitimateProb", + "type_": "float64" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_unique_value_count_to_be_between", + "kwargs": { + "column": "TLDLegitimateProb", + "max_value": 1000, + "min_value": 10 + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_not_be_null", + "kwargs": { + "column": "CharContinuationRate" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_be_between", + "kwargs": { + "column": "CharContinuationRate", + "max_value": 1.0, + "min_value": 0.0 + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": { + "column": "CharContinuationRate", + "type_": "float64" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_not_be_null", + "kwargs": { + "column": "SpacialCharRatioInURL" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_be_between", + "kwargs": { + "column": "SpacialCharRatioInURL", + "max_value": 1.0, + "min_value": 0.0 + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": { + "column": "SpacialCharRatioInURL", + "type_": "float64" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_not_be_null", + "kwargs": { + "column": "URLCharProb" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_be_between", + "kwargs": { + "column": "URLCharProb", + "max_value": 1.0, + "min_value": 0.0 + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": { + "column": "URLCharProb", + "type_": "float64" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_not_be_null", + "kwargs": { + "column": "LetterRatioInURL" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_be_between", + "kwargs": { + "column": "LetterRatioInURL", + "max_value": 1.0, + 
"min_value": 0.0 + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": { + "column": "LetterRatioInURL", + "type_": "float64" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_not_be_null", + "kwargs": { + "column": "NoOfOtherSpecialCharsInURL" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_be_between", + "kwargs": { + "column": "NoOfOtherSpecialCharsInURL", + "max_value": 1000, + "min_value": 0 + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": { + "column": "NoOfOtherSpecialCharsInURL", + "type_": "int64" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_not_be_null", + "kwargs": { + "column": "DomainLength" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_be_between", + "kwargs": { + "column": "DomainLength", + "max_value": 253, + "min_value": 1 + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": { + "column": "DomainLength", + "type_": "int64" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_mean_to_be_between", + "kwargs": { + "column": "TLDLegitimateProb", + "max_value": 0.9, + "min_value": 0.2 + }, + "meta": {} + } + ], + "ge_cloud_id": null, + "meta": { + "great_expectations_version": "0.18.22" + } +} \ No newline at end of file diff --git a/models/dev/model_7feat.pkl b/models/dev/model_7feat.pkl new file mode 100644 index 0000000..a6eeeee Binary files /dev/null and b/models/dev/model_7feat.pkl differ diff --git a/models/dev/model_7feat_meta.json b/models/dev/model_7feat_meta.json new file mode 100644 index 0000000..6e26340 --- /dev/null +++ b/models/dev/model_7feat_meta.json @@ -0,0 +1,38 @@ +{ + "feature_order": [ + "TLDLegitimateProb", + "CharContinuationRate", + "SpacialCharRatioInURL", + "URLCharProb", + "LetterRatioInURL", + "NoOfOtherSpecialCharsInURL", + "DomainLength" + ], + "class_mapping": { + 
"phish": 0, + "legit": 1 + }, + "phish_proba_col_index": 0, + "model_type": "CalibratedClassifierCV", + "calibration": "isotonic_cv5", + "training_date": "2025-10-23T12:15:29.545648", + "seed": 42, + "metrics": { + "pr_auc": 0.9987395297287396, + "f1_macro": 0.9939829924548755, + "brier": 0.0052004613854286915 + }, + "thresholds": { + "optimal_threshold": 0.49999999999999994, + "gray_zone_low": 0.011, + "gray_zone_high": 0.9979999999999727, + "gray_zone_rate": 0.11994973697101356, + "f1_score_at_optimal": 0.9939829924548755, + "decision_distribution": { + "allow_rate": 0.5199241794986476, + "review_rate": 0.11994973697101356, + "block_rate": 0.36012608353033887 + } + }, + "notes": "7-feature model without IsHTTPS - production candidate, 99.87% PR-AUC, robust to HTTPS phishing" +} \ No newline at end of file diff --git a/models/dev/model_8feat.pkl b/models/dev/model_8feat.pkl index ee85c75..35d5e0d 100644 Binary files a/models/dev/model_8feat.pkl and b/models/dev/model_8feat.pkl differ diff --git a/models/dev/model_8feat_meta.json b/models/dev/model_8feat_meta.json index e9ba7f9..5aa4ba3 100644 --- a/models/dev/model_8feat_meta.json +++ b/models/dev/model_8feat_meta.json @@ -1,13 +1,13 @@ { "feature_order": [ - "IsHTTPS", "TLDLegitimateProb", "CharContinuationRate", "SpacialCharRatioInURL", "URLCharProb", "LetterRatioInURL", "NoOfOtherSpecialCharsInURL", - "DomainLength" + "DomainLength", + "IsHTTPS" ], "class_mapping": { "phish": 0, @@ -16,24 +16,19 @@ "phish_proba_col_index": 0, "model_type": "CalibratedClassifierCV", "calibration": "isotonic_cv5", - "training_date": "2025-10-14T00:34:12.886620", + "training_date": "2025-10-23T12:15:29.132889", "seed": 42, "metrics": { - "pr_auc": 0.9991584033257773, - "f1_macro": 0.9969925280550227, - "brier": 0.0026371303574400343 + "pr_auc": 0.999162374449157, + "f1_macro": 0.9969707546851805, + "brier": 0.002628693188878042 }, "thresholds": { - "optimal_threshold": 0.35, - "gray_zone_low": 0.004, - "gray_zone_high": 
0.9990000000000006, - "gray_zone_rate": 0.10936468383276894, - "f1_score_at_optimal": 0.002766509680600472, - "decision_distribution": { - "allow_rate": 0.48099162992780015, - "review_rate": 0.10936468383276894, - "block_rate": 0.4096436862394309 - } + "optimal_threshold": 0.3599999999999999, + "gray_zone_low": 0.003, + "gray_zone_high": 0.3599999999999999, + "gray_zone_rate": 0.13355909100589952, + "f1_score_at_optimal": 0.9971890180308938 }, "notes": "8-feature model with IsHTTPS - research baseline, 99.92% PR-AUC" } \ No newline at end of file diff --git a/notebooks/01_feature_engineering.ipynb b/notebooks/01_feature_engineering.ipynb index beb3c47..ad90b63 100644 --- a/notebooks/01_feature_engineering.ipynb +++ b/notebooks/01_feature_engineering.ipynb @@ -24,7 +24,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "id": "24eaeca4", "metadata": {}, "outputs": [ @@ -33,7 +33,6 @@ "output_type": "stream", "text": [ "Working directory: d:\\MLops\\NetworkSecurity\n", - "[feature_extraction] Loaded 1401 TLD probabilities\n", "✓ Imports successful\n" ] } @@ -59,7 +58,7 @@ "# Add src to path so we can import common modules\n", "sys.path.insert(0, str(Path.cwd() / \"src\"))\n", "\n", - "from common.feature_extraction import extract_features, get_feature_names\n", + "from common.feature_extraction import extract_features\n", "\n", "print(\"✓ Imports successful\")" ] @@ -74,7 +73,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "id": "ed7c81ec", "metadata": {}, "outputs": [ @@ -139,24 +138,40 @@ "id": "562d6c76", "metadata": {}, "source": [ - "### **Data-Driven Analysis: Optimal MIN_SAMPLES Threshold**\n", + "### **Data-Driven Statistical Analysis: Optimal MIN_SAMPLES Threshold**\n", + "\n", + "Determine the minimum sample size threshold using statistical principles\n", + "\n", + "1. SAMPLE DISTRIBUTION ANALYSIS: Understand TLD frequency patterns\n", + "2. 
CONFIDENCE INTERVAL ANALYSIS: When do we have statistical confidence?\n", + "3. COVERAGE VS RELIABILITY TRADE-OFF: How many TLDs do we sacrifice for reliability?\n", + "4. DOMAIN EXPERTISE: What makes business sense for security?\n", "\n", - "Before choosing MIN_SAMPLES arbitrarily, let's analyze the actual TLD distribution to make a statistically defensible choice." + "STATISTICAL APPROACHES:\n", + "- Percentile analysis (25th, 50th, 75th percentiles)\n", + "- Confidence interval thresholds (binomial CI)\n", + "- Coverage analysis (% of URLs affected)\n", + "- Wilson score intervals for small samples" + ] + }, + { + "cell_type": "markdown", + "id": "7f4f0daf", + "metadata": {}, + "source": [ + "#### **1. TLD SAMPLE DISTRIBUTION ANALYSIS**" ] }, { "cell_type": "code", - "execution_count": 3, - "id": "4403f668", + "execution_count": 4, + "id": "b8734814", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "================================================================================\n", - "STATISTICAL ANALYSIS: OPTIMAL MIN_SAMPLES THRESHOLD\n", - "================================================================================\n", "1. TLD SAMPLE DISTRIBUTION ANALYSIS\n", "--------------------------------------------------\n", "Extracting TLDs from URLs...\n", @@ -184,108 +199,11 @@ " 10-19 samples: 123 TLDs ( 8.8%)\n", " 20-49 samples: 118 TLDs ( 8.4%)\n", " 50-99 samples: 59 TLDs ( 4.2%)\n", - " 100+ samples: 114 TLDs ( 8.1%)\n", - "\n", - "================================================================================\n", - "2. 
CONFIDENCE INTERVAL ANALYSIS\n", - "--------------------------------------------------\n", - "Wilson 95% Confidence Interval Analysis:\n", - "Sample Size | CI Width | Interpretation\n", - "---------------------------------------------\n", - " 1 | 0.891 | UNRELIABLE (very wide)\n", - " 2 | 0.811 | UNRELIABLE (very wide)\n", - " 3 | 0.749 | POOR (wide)\n", - " 5 | 0.659 | POOR (wide)\n", - " 10 | 0.527 | POOR (wide)\n", - " 15 | 0.452 | FAIR (moderate)\n", - " 20 | 0.401 | FAIR (moderate)\n", - " 30 | 0.337 | FAIR (moderate)\n", - " 50 | 0.267 | GOOD (narrow)\n", - " 100 | 0.192 | EXCELLENT (very narrow)\n", - "\n", - "Statistical Recommendation:\n", - " For CI width ≤ 0.3: MIN_SAMPLES ≥ 50\n", - "\n", - "================================================================================\n", - "3. COVERAGE VS RELIABILITY TRADE-OFF\n", - "--------------------------------------------------\n", - "Impact of Different MIN_SAMPLES Thresholds:\n", - "Threshold TLDs Kept TLDs % URLs Kept URLs % Reliability \n", - "---------------------------------------------------------------------------\n", - "1 1401 100.0 234764 100.0 LOW \n", - "2 919 65.6 234282 99.8 LOW \n", - "5 593 42.3 233413 99.4 MEDIUM \n", - "10 414 29.6 232234 98.9 MEDIUM \n", - "15 338 24.1 231346 98.5 MEDIUM \n", - "20 291 20.8 230540 98.2 MEDIUM \n", - "30 225 16.1 228954 97.5 HIGH \n", - "\n", - "================================================================================\n", - "4. BUSINESS LOGIC RECOMMENDATIONS\n", - "--------------------------------------------------\n", - "SECURITY-FIRST CONSIDERATIONS:\n", - "✓ Better to classify rare TLDs as 'risky' than miss phishing\n", - "✓ False positive (blocking legit site) < False negative (missing phishing)\n", - "✓ Most legitimate traffic uses common TLDs (.com, .org, .net)\n", - "✓ Attackers often use obscure TLDs to evade detection\n", - "\n", - "RECOMMENDED THRESHOLDS:\n", - "1. 
CONSERVATIVE (Security-first): MIN_SAMPLES = 20\n", - " - Covers 98.2% of URLs\n", - " - High statistical confidence\n", - " - Unknown TLDs default to 'risky'\n", - "\n", - "2. BALANCED (Recommended): MIN_SAMPLES = 10\n", - " - Covers 98.9% of URLs\n", - " - Good statistical confidence (CI width ~0.2)\n", - " - Reasonable TLD coverage\n", - "\n", - "3. LIBERAL (Preserve TLD knowledge): MIN_SAMPLES = 5\n", - " - Covers 99.4% of URLs\n", - " - Moderate statistical confidence\n", - " - Risk of overconfident predictions\n", - "\n", - "================================================================================\n", - "FINAL STATISTICAL RECOMMENDATION\n", - "================================================================================\n", - "Based on the analysis:\n", - "📊 Median TLD sample count: 3\n", - "📊 95% CI reasonable threshold: 50 samples\n", - "📊 URL coverage at threshold 10: 98.9%\n", - "\n", - "🎯 RECOMMENDED: MIN_SAMPLES = 10\n", - " Rationale:\n", - " - Ensures statistical reliability (narrow confidence intervals)\n", - " - Covers majority of URL traffic\n", - " - Security-conscious (unknown TLDs treated as risky)\n", - " - Defensible with data\n" + " 100+ samples: 114 TLDs ( 8.1%)\n" ] } ], "source": [ - "# ============================================================\n", - "# STATISTICAL ANALYSIS: OPTIMAL MIN_SAMPLES THRESHOLD\n", - "# ============================================================\n", - "\n", - "\"\"\"\n", - "OBJECTIVE: Determine the minimum sample size threshold using statistical principles:\n", - "\n", - "1. SAMPLE DISTRIBUTION ANALYSIS: Understand TLD frequency patterns\n", - "2. CONFIDENCE INTERVAL ANALYSIS: When do we have statistical confidence?\n", - "3. COVERAGE VS RELIABILITY TRADE-OFF: How many TLDs do we sacrifice for reliability?\n", - "4. 
DOMAIN EXPERTISE: What makes business sense for security?\n", - "\n", - "STATISTICAL APPROACHES:\n", - "- Percentile analysis (25th, 50th, 75th percentiles)\n", - "- Confidence interval thresholds (binomial CI)\n", - "- Coverage analysis (% of URLs affected)\n", - "- Wilson score intervals for small samples\n", - "\"\"\"\n", - "\n", - "print(\"=\" * 80)\n", - "print(\"STATISTICAL ANALYSIS: OPTIMAL MIN_SAMPLES THRESHOLD\")\n", - "print(\"=\" * 80)\n", - "\n", "# Step 1: TLD Sample Distribution Analysis\n", "print(\"1. TLD SAMPLE DISTRIBUTION ANALYSIS\")\n", "print(\"-\" * 50)\n", @@ -329,168 +247,312 @@ "\n", " count = mask.sum()\n", " pct = count / len(tld_sample_counts) * 100\n", - " print(f\" {bucket_labels[i]:>6s} samples: {count:4d} TLDs ({pct:5.1f}%)\")\n", + " print(f\" {bucket_labels[i]:>6s} samples: {count:4d} TLDs ({pct:5.1f}%)\")\n" + ] + }, + { + "cell_type": "markdown", + "id": "bf46c15a", + "metadata": {}, + "source": [ + "- The dataset is dominated by few TLDS and most of them are barely represented, many with just 1-3 URLS.\n", + "- Over half of the TLDs (57.7%) have fewer than 5 URLs. Only 8.1% of TLDs have 100+ URLs." + ] + }, + { + "cell_type": "markdown", + "id": "e38ce8c0", + "metadata": {}, + "source": [ + "#### **2. CONFIDENCE INTERVAL ANALYSIS**\n", + "This code calculates how statistically trustworthy our estimates of \"legitimate URL rates\" are for each Top-Level Domain (TLD) in the dataset.\n", + "It uses the Wilson confidence interval—a robust method for binomial proportions—to measure uncertainty, especially for TLDs with few samples.\n", "\n", - "print(f\"\\n\" + \"=\" * 80)\n", - "print(\"2. 
CONFIDENCE INTERVAL ANALYSIS\")\n", - "print(\"-\" * 50)\n", + "- For a range of minimum sample thresholds, it:\n", "\n", - "\"\"\"\n", - "For binomial proportions, confidence intervals get narrower as sample size increases.\n", - "Using Wilson score interval (better for small samples than normal approximation):\n", + " - Filters out TLDs with too few URLs (not enough data to trust their stats).\n", + " - Computes the observed legitimacy rate and its confidence interval width for each remaining TLD.\n", + " - Summarizes the average and 90th percentile uncertainty (CI width) across TLDs.\n", + " - Assigns a reliability label (HIGH, MEDIUM, LOW) based on how tight the confidence intervals are.\n", + "- This lets us balance coverage (how many URLs/TLDs we keep) against reliability (how much we can trust our stats), and pick a defensible threshold for model training and evaluation.\n", "\n", + "```\n", "CI_width ≈ 2 * sqrt(p(1-p)/n + 1/(4n²))\n", - "\n", "Where p ≈ 0.5 (worst case), n = sample size\n", - "\"\"\"\n", - "\n", - "\n", - "def wilson_ci_width(n, p=0.5, confidence=0.95):\n", - " \"\"\"Calculate Wilson confidence interval width for binomial proportion\"\"\"\n", + "Confidence_Interval (95%) = (𝑝−half-width, 𝑝+half-width)\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "59ac06ed", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "================================================================================\n", + "4. 
BUSINESS LOGIC RECOMMENDATIONS\n", + "--------------------------------------------------\n", + "SECURITY-FIRST CONSIDERATIONS:\n", + "✓ Better to classify rare TLDs as 'risky' than miss phishing\n", + "✓ False negative cost > false positive cost\n", + "✓ Most legit traffic uses common TLDs (.com, .org, .net)\n", + "✓ Attackers often prefer obscure TLDs\n", + "\n", + "Threshold TLDs Kept URLs % Mean CI Width 90th % CI Width Reliability\n", + "--------------------------------------------------------------------------\n", + "5 593 99.4% 0.283 0.525 LOW\n", + "10 414 98.9% 0.204 0.401 LOW\n", + "5 593 99.4% 0.283 0.525 LOW\n", + "10 414 98.9% 0.204 0.401 LOW\n", + "15 338 98.5% 0.169 0.349 MEDIUM\n", + "20 291 98.2% 0.145 0.286 MEDIUM\n", + "15 338 98.5% 0.169 0.349 MEDIUM\n", + "20 291 98.2% 0.145 0.286 MEDIUM\n", + "30 225 97.5% 0.118 0.241 MEDIUM\n", + "40 196 97.1% 0.104 0.230 MEDIUM\n", + "30 225 97.5% 0.118 0.241 MEDIUM\n", + "40 196 97.1% 0.104 0.230 MEDIUM\n", + "50 173 96.7% 0.093 0.199 HIGH\n", + "\n", + "RECOMMENDED THRESHOLDS:\n", + "1. CONSERVATIVE (Security-first): MIN_SAMPLES = 30\n", + " - Covers 97.5% of URLs\n", + " - p90 CI width ≈ 0.241 (95%)\n", + " - Reliability: MEDIUM\n", + "\n", + "2. BALANCED (Recommended): MIN_SAMPLES = 20\n", + " - Covers 98.2% of URLs\n", + " - p90 CI width ≈ 0.286 (95%)\n", + " - Reliability: MEDIUM\n", + "\n", + "3. 
LIBERAL (Preserve TLD knowledge): MIN_SAMPLES = 10\n", + " - Covers 98.9% of URLs\n", + " - p90 CI width ≈ 0.401 (95%), higher risk of noise\n", + " - Reliability: LOW\n", + "\n", + "================================================================================\n", + "FINAL STATISTICAL RECOMMENDATION\n", + "================================================================================\n", + "Based on observed legitimacy rates (p̂), Wilson intervals, and coverage:\n", + "📊 Balanced: MIN_SAMPLES = 20 (p90 width ≤ 0.30 when available, coverage ≈ 98.2%)\n", + "🔒 Security-first: MIN_SAMPLES = 30 (tighter intervals, coverage ≈ 97.5%)\n", + "STATISTICAL_MIN_SAMPLES = 20\n", + "SECURITY_FIRST_MIN_SAMPLES = 30\n", + "50 173 96.7% 0.093 0.199 HIGH\n", + "\n", + "RECOMMENDED THRESHOLDS:\n", + "1. CONSERVATIVE (Security-first): MIN_SAMPLES = 30\n", + " - Covers 97.5% of URLs\n", + " - p90 CI width ≈ 0.241 (95%)\n", + " - Reliability: MEDIUM\n", + "\n", + "2. BALANCED (Recommended): MIN_SAMPLES = 20\n", + " - Covers 98.2% of URLs\n", + " - p90 CI width ≈ 0.286 (95%)\n", + " - Reliability: MEDIUM\n", + "\n", + "3. LIBERAL (Preserve TLD knowledge): MIN_SAMPLES = 10\n", + " - Covers 98.9% of URLs\n", + " - p90 CI width ≈ 0.401 (95%), higher risk of noise\n", + " - Reliability: LOW\n", + "\n", + "================================================================================\n", + "FINAL STATISTICAL RECOMMENDATION\n", + "================================================================================\n", + "Based on observed legitimacy rates (p̂), Wilson intervals, and coverage:\n", + "📊 Balanced: MIN_SAMPLES = 20 (p90 width ≤ 0.30 when available, coverage ≈ 98.2%)\n", + "🔒 Security-first: MIN_SAMPLES = 30 (tighter intervals, coverage ≈ 97.5%)\n", + "STATISTICAL_MIN_SAMPLES = 20\n", + "SECURITY_FIRST_MIN_SAMPLES = 30\n" + ] + } + ], + "source": [ + "print(\"\\n\" + \"=\" * 80)\n", + "print(\"4. 
BUSINESS LOGIC RECOMMENDATIONS\")\n", + "print(\"-\" * 50)\n", "\n", - " z = stats.norm.ppf((1 + confidence) / 2) # 1.96 for 95% CI\n", + "# Security-first business logic rationale\n", + "print(\"SECURITY-FIRST CONSIDERATIONS:\")\n", + "print(\"✓ Better to classify rare TLDs as 'risky' than miss phishing\")\n", + "print(\"✓ False negative cost > false positive cost\")\n", + "print(\"✓ Most legit traffic uses common TLDs (.com, .org, .net)\")\n", + "print(\"✓ Attackers often prefer obscure TLDs\")\n", "\n", - " numerator = 2 * z * np.sqrt(p * (1 - p) / n + z**2 / (4 * n**2))\n", - " denominator = 1 + z**2 / n\n", - " return numerator / denominator\n", "\n", + "# Ensure label_col is set and correct\n", + "assert \"label_col\" in globals(), \"Define label_col as the column with 1=legit, 0=phish\"\n", + "# Sanity check values are 0/1\n", + "assert set(df_raw[label_col].dropna().unique()) <= {0, 1}, (\n", + " \"label_col must be binary 0/1\"\n", + ")\n", "\n", - "# Analyze CI width for different sample sizes\n", - "sample_sizes = [1, 2, 3, 5, 10, 15, 20, 30, 50, 100]\n", - "ci_widths = [wilson_ci_width(n) for n in sample_sizes]\n", "\n", - "print(\"Wilson 95% Confidence Interval Analysis:\")\n", - "print(\"Sample Size | CI Width | Interpretation\")\n", - "print(\"-\" * 45)\n", + "# Wilson interval function\n", + "def wilson_confidence_interval_width(\n", + " sample_size, legit_proportion, confidence_level=0.95\n", + "):\n", + " \"\"\"\n", + " Calculate the width of the Wilson confidence interval for a binomial proportion.\n", "\n", - "for n, width in zip(sample_sizes, ci_widths):\n", - " if width > 0.8:\n", - " interpretation = \"UNRELIABLE (very wide)\"\n", - " elif width > 0.5:\n", - " interpretation = \"POOR (wide)\"\n", - " elif width > 0.3:\n", - " interpretation = \"FAIR (moderate)\"\n", - " elif width > 0.2:\n", - " interpretation = \"GOOD (narrow)\"\n", - " else:\n", - " interpretation = \"EXCELLENT (very narrow)\"\n", + " Args:\n", + " sample_size (int or 
array): Number of URLs for a TLD.\n", + " legit_proportion (float or array): Observed proportion of legitimate URLs.\n", + " confidence_level (float): Desired confidence level (default 0.95 for 95%).\n", "\n", - " print(f\" {n:2d} | {width:.3f} | {interpretation}\")\n", + " Returns:\n", + " float or array: Width of the Wilson confidence interval.\n", + " \"\"\"\n", + " z_score = stats.norm.ppf((1 + confidence_level) / 2)\n", + " numerator = (\n", + " 2\n", + " * z_score\n", + " * np.sqrt(\n", + " legit_proportion * (1 - legit_proportion) / sample_size\n", + " + z_score**2 / (4 * sample_size**2)\n", + " )\n", + " )\n", + " denominator = 1 + z_score**2 / sample_size\n", + " return numerator / denominator\n", "\n", - "# Find reasonable thresholds\n", - "reasonable_width = 0.3 # 30% CI width threshold\n", - "min_samples_ci = next(\n", - " n for n, w in zip(sample_sizes, ci_widths) if w <= reasonable_width\n", - ")\n", "\n", - "print(f\"\\nStatistical Recommendation:\")\n", - "print(f\" For CI width ≤ {reasonable_width}: MIN_SAMPLES ≥ {min_samples_ci}\")\n", + "# Thresholds to evaluate for minimum TLD sample size\n", + "min_sample_thresholds = [5, 10, 15, 20, 30, 40, 50]\n", + "threshold_summary = []\n", "\n", - "print(f\"\\n\" + \"=\" * 80)\n", - "print(\"3. 
COVERAGE VS RELIABILITY TRADE-OFF\")\n", - "print(\"-\" * 50)\n", + "print(\"\\nThreshold TLDs Kept URLs % Mean CI Width 90th % CI Width Reliability\")\n", + "print(\"--------------------------------------------------------------------------\")\n", "\n", - "# Analyze what happens with different MIN_SAMPLES thresholds\n", - "thresholds_to_test = [1, 2, 5, 10, 15, 20, 30]\n", + "for min_samples in min_sample_thresholds:\n", + " # Filter TLDs with at least min_samples URLs\n", + " eligible_tlds = tld_sample_counts[tld_sample_counts >= min_samples].index\n", + " eligible_urls_df = df_raw[df_raw[\"TLD\"].isin(eligible_tlds)]\n", + " url_coverage_pct = 100.0 * len(eligible_urls_df) / len(df_raw)\n", "\n", - "print(\"Impact of Different MIN_SAMPLES Thresholds:\")\n", - "print(\n", - " f\"{'Threshold':<10} {'TLDs Kept':<12} {'TLDs %':<10} {'URLs Kept':<12} {'URLs %':<10} {'Reliability':<12}\"\n", - ")\n", - "print(\"-\" * 75)\n", + " # Group by TLD and calculate stats\n", + " tld_stats = eligible_urls_df.groupby(\"TLD\")[label_col].agg(\n", + " total_urls=\"size\", legit_urls=\"sum\"\n", + " )\n", "\n", - "for threshold in thresholds_to_test:\n", - " # TLDs with enough samples\n", - " tlds_kept = (tld_sample_counts >= threshold).sum()\n", - " tlds_pct = tlds_kept / len(tld_sample_counts) * 100\n", + " # Guard: drop any zero-total rows if they ever occur\n", + " tld_stats = tld_stats[tld_stats[\"total_urls\"] > 0]\n", + " tld_stats[\"legit_proportion\"] = tld_stats[\"legit_urls\"] / tld_stats[\"total_urls\"]\n", "\n", - " # URLs covered by reliable TLDs\n", - " reliable_tlds = tld_sample_counts[tld_sample_counts >= threshold].index\n", - " urls_kept = df_raw[df_raw[\"TLD\"].isin(reliable_tlds)].shape[0]\n", - " urls_pct = urls_kept / len(df_raw) * 100\n", + " # Calculate Wilson CI width for each TLD\n", + " tld_stats[\"ci_width\"] = wilson_confidence_interval_width(\n", + " tld_stats[\"total_urls\"].to_numpy(),\n", + " tld_stats[\"legit_proportion\"].to_numpy(),\n", + " 
confidence_level=0.95,\n", + " )\n", "\n", - " # Reliability assessment\n", - " avg_ci_width = np.mean(\n", - " [wilson_ci_width(n) for n in tld_sample_counts[tld_sample_counts >= threshold]]\n", + " # Aggregate reliability metrics\n", + " mean_ci_width = (\n", + " float(np.mean(tld_stats[\"ci_width\"])) if len(tld_stats) else float(\"nan\")\n", " )\n", - " reliability = (\n", - " \"HIGH\" if avg_ci_width < 0.2 else \"MEDIUM\" if avg_ci_width < 0.4 else \"LOW\"\n", + " p90_ci_width = (\n", + " float(np.percentile(tld_stats[\"ci_width\"], 90))\n", + " if len(tld_stats)\n", + " else float(\"nan\")\n", " )\n", "\n", + " # Assign reliability label based on 90th percentile CI width\n", + " if p90_ci_width <= 0.20:\n", + " reliability = \"HIGH\"\n", + " elif p90_ci_width <= 0.40:\n", + " reliability = \"MEDIUM\"\n", + " else:\n", + " reliability = \"LOW\"\n", + "\n", + " threshold_summary.append(\n", + " {\n", + " \"min_samples\": min_samples,\n", + " \"num_tlds\": len(eligible_tlds),\n", + " \"url_coverage_pct\": url_coverage_pct,\n", + " \"mean_ci_width\": mean_ci_width,\n", + " \"p90_ci_width\": p90_ci_width,\n", + " \"reliability\": reliability,\n", + " }\n", + " )\n", + " # Print summary for this threshold\n", " print(\n", - " f\"{threshold:<10} {tlds_kept:<12} {tlds_pct:<10.1f} {urls_kept:<12} {urls_pct:<10.1f} {reliability:<12}\"\n", + " f\"{min_samples:<9} {len(eligible_tlds):<10} {url_coverage_pct:>6.1f}% \"\n", + " f\"{mean_ci_width:>13.3f} {p90_ci_width:>15.3f} {reliability}\"\n", " )\n", "\n", - "print(f\"\\n\" + \"=\" * 80)\n", - "print(\"4. 
BUSINESS LOGIC RECOMMENDATIONS\")\n", - "print(\"-\" * 50)\n", + "# Select recommended thresholds based on reliability and coverage\n", + "# Robust selection: if none meet 0.30, pick argmin p90\n", + "candidates = [row for row in threshold_summary if row[\"p90_ci_width\"] <= 0.30]\n", + "if candidates:\n", + " balanced_threshold = candidates[0][\"min_samples\"]\n", + "else:\n", + " balanced_threshold = min(threshold_summary, key=lambda r: r[\"p90_ci_width\"])[\n", + " \"min_samples\"\n", + " ]\n", "\n", - "# Security-first approach: better to be cautious\n", - "print(\"SECURITY-FIRST CONSIDERATIONS:\")\n", - "print(\"✓ Better to classify rare TLDs as 'risky' than miss phishing\")\n", - "print(\"✓ False positive (blocking legit site) < False negative (missing phishing)\")\n", - "print(\"✓ Most legitimate traffic uses common TLDs (.com, .org, .net)\")\n", - "print(\"✓ Attackers often use obscure TLDs to evade detection\")\n", - "\n", - "# Recommended thresholds based on different philosophies\n", - "print(f\"\\nRECOMMENDED THRESHOLDS:\")\n", - "\n", - "# Conservative (security-first)\n", - "conservative_min = 20\n", - "urls_covered_conservative = (\n", - " df_raw[\"TLD\"].isin(tld_sample_counts[tld_sample_counts >= conservative_min].index)\n", - ").mean() * 100\n", - "\n", - "# Balanced (reasonable CI + good coverage)\n", - "balanced_min = 10\n", - "urls_covered_balanced = (\n", - " df_raw[\"TLD\"].isin(tld_sample_counts[tld_sample_counts >= balanced_min].index)\n", - ").mean() * 100\n", - "\n", - "# Liberal (preserve more TLD-specific knowledge)\n", - "liberal_min = 5\n", - "urls_covered_liberal = (\n", - " df_raw[\"TLD\"].isin(tld_sample_counts[tld_sample_counts >= liberal_min].index)\n", - ").mean() * 100\n", - "\n", - "print(f\"1. 
CONSERVATIVE (Security-first): MIN_SAMPLES = {conservative_min}\")\n", - "print(f\" - Covers {urls_covered_conservative:.1f}% of URLs\")\n", - "print(f\" - High statistical confidence\")\n", - "print(f\" - Unknown TLDs default to 'risky'\")\n", - "\n", - "print(f\"\\n2. BALANCED (Recommended): MIN_SAMPLES = {balanced_min}\")\n", - "print(f\" - Covers {urls_covered_balanced:.1f}% of URLs\")\n", - "print(f\" - Good statistical confidence (CI width ~0.2)\")\n", - "print(f\" - Reasonable TLD coverage\")\n", - "\n", - "print(f\"\\n3. LIBERAL (Preserve TLD knowledge): MIN_SAMPLES = {liberal_min}\")\n", - "print(f\" - Covers {urls_covered_liberal:.1f}% of URLs\")\n", - "print(f\" - Moderate statistical confidence\")\n", - "print(f\" - Risk of overconfident predictions\")\n", - "\n", - "# Final recommendation\n", - "median_samples = tld_sample_counts.median()\n", - "print(f\"\\n\" + \"=\" * 80)\n", + "security_first_threshold = 30 # or 40 for extra margin\n", + "liberal_threshold = 10 # Accepts wider intervals, more TLDs\n", + "\n", + "\n", + "def get_threshold_row(threshold):\n", + " return next(item for item in threshold_summary if item[\"min_samples\"] == threshold)\n", + "\n", + "\n", + "row_security = get_threshold_row(security_first_threshold)\n", + "row_balanced = get_threshold_row(balanced_threshold)\n", + "row_liberal = get_threshold_row(liberal_threshold)\n", + "\n", + "print(\"\\nRECOMMENDED THRESHOLDS:\")\n", + "print(f\"1. CONSERVATIVE (Security-first): MIN_SAMPLES = {security_first_threshold}\")\n", + "print(f\" - Covers {row_security['url_coverage_pct']:.1f}% of URLs\")\n", + "print(f\" - p90 CI width ≈ {row_security['p90_ci_width']:.3f} (95%)\")\n", + "print(f\" - Reliability: {row_security['reliability']}\")\n", + "\n", + "print(f\"\\n2. 
BALANCED (Recommended): MIN_SAMPLES = {balanced_threshold}\")\n", + "print(f\" - Covers {row_balanced['url_coverage_pct']:.1f}% of URLs\")\n", + "print(f\" - p90 CI width ≈ {row_balanced['p90_ci_width']:.3f} (95%)\")\n", + "print(f\" - Reliability: {row_balanced['reliability']}\")\n", + "\n", + "print(f\"\\n3. LIBERAL (Preserve TLD knowledge): MIN_SAMPLES = {liberal_threshold}\")\n", + "print(f\" - Covers {row_liberal['url_coverage_pct']:.1f}% of URLs\")\n", + "print(\n", + " f\" - p90 CI width ≈ {row_liberal['p90_ci_width']:.3f} (95%), higher risk of noise\"\n", + ")\n", + "print(f\" - Reliability: {row_liberal['reliability']}\")\n", + "\n", + "print(\"\\n\" + \"=\" * 80)\n", "print(\"FINAL STATISTICAL RECOMMENDATION\")\n", "print(\"=\" * 80)\n", + "print(\"Based on observed legitimacy rates (p̂), Wilson intervals, and coverage:\")\n", + "print(\n", + " f\"📊 Balanced: MIN_SAMPLES = {balanced_threshold} (p90 width ≤ 0.30 when available, coverage ≈ {row_balanced['url_coverage_pct']:.1f}%)\"\n", + ")\n", + "print(\n", + " f\"🔒 Security-first: MIN_SAMPLES = {security_first_threshold} (tighter intervals, coverage ≈ {row_security['url_coverage_pct']:.1f}%)\"\n", + ")\n", + "\n", + "STATISTICAL_MIN_SAMPLES = balanced_threshold\n", + "print(f\"STATISTICAL_MIN_SAMPLES = {STATISTICAL_MIN_SAMPLES}\")\n", + "SECURITY_FIRST_MIN_SAMPLES = security_first_threshold\n", + "print(f\"SECURITY_FIRST_MIN_SAMPLES = {SECURITY_FIRST_MIN_SAMPLES}\")" + ] + }, + { + "cell_type": "markdown", + "id": "00920d42", + "metadata": {}, + "source": [ + "When interpreting the Confidence Interval Analysis Table, the choice of threshold for URLs per TLD directly affects both coverage and reliability. \n", + "\n", + "- At low thresholds (1–10 URLs), nearly all TLDs and URLs are retained, but the legitimacy estimates—especially for rare TLDs—are statistically weak and unreliable. 
\n", + "- Medium thresholds (15–40 URLs) strike a better balance, preserving over 97% of URLs while improving the statistical confidence of legitimacy rates. \n", + "- At high thresholds (50+ URLs), coverage narrows slightly to around 95–97%, focusing only on the most common TLDs, but the reliability of legitimacy estimates becomes robust and stable. \n", "\n", - "print(f\"Based on the analysis:\")\n", - "print(f\"📊 Median TLD sample count: {median_samples:.0f}\")\n", - "print(f\"📊 95% CI reasonable threshold: {min_samples_ci} samples\")\n", - "print(f\"📊 URL coverage at threshold 10: {urls_covered_balanced:.1f}%\")\n", - "\n", - "recommended_min = max(\n", - " 10, int(median_samples)\n", - ") # At least median or 10, whichever is higher\n", - "print(f\"\\n🎯 RECOMMENDED: MIN_SAMPLES = {recommended_min}\")\n", - "print(f\" Rationale:\")\n", - "print(f\" - Ensures statistical reliability (narrow confidence intervals)\")\n", - "print(f\" - Covers majority of URL traffic\")\n", - "print(f\" - Security-conscious (unknown TLDs treated as risky)\")\n", - "print(f\" - Defensible with data\")\n", - "\n", - "# Store the statistically determined threshold\n", - "STATISTICAL_MIN_SAMPLES = recommended_min" + "In practice, increasing the minimum sample threshold enhances statistical trustworthiness but filters out less common TLDs. For most security applications, a threshold between 30 and 50 samples per TLD is recommended to maintain broad coverage without compromising reliability. Ultimately, the threshold should reflect your tolerance for risk: lower thresholds favor inclusivity with less certainty, while higher thresholds prioritize precision at the cost of breadth." ] }, { @@ -498,158 +560,91 @@ "id": "7db90bc2", "metadata": {}, "source": [ - "- **TLD CONFIDENCE INTERVAL ANALYSIS**" + "#### **3. 
TLD CONFIDENCE INTERVAL VISUALS**" ] }, { "cell_type": "code", - "execution_count": 8, - "id": "a7dd01ab", + "execution_count": 14, + "id": "3af8ddf5", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "✅ Generated: tld_confidence_interval_analysis.png\n" + "✅ Generated: outputs\\tld_confidence_interval_analysis.png\n" ] }, { "data": { - "image/png": "", + "image/png": "", "text/plain": [ - "
" + "
" ] }, "metadata": {}, "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "📊 Plot should be visible above. If not, check the saved PNG file.\n" - ] } ], "source": [ - "def plot_tld_confidence_intervals():\n", + "def plot_tld_confidence_intervals(balanced_min, security_first_min):\n", " \"\"\"\n", - " Chart showing Wilson CI width for different sample sizes.\n", + " Plot Wilson CI width vs sample size at p=0.5 (reference worst-case curve),\n", + " and annotate chosen thresholds.\n", " \"\"\"\n", - " sample_sizes = [1, 2, 3, 5, 10, 15, 20, 30, 50, 100]\n", - " ci_widths = [0.891, 0.811, 0.749, 0.659, 0.527, 0.452, 0.401, 0.337, 0.267, 0.192]\n", - "\n", - " fig, ax = plt.subplots(figsize=(12, 8))\n", - "\n", - " # Define color zones\n", - " colors = []\n", - " for width in ci_widths:\n", - " if width > 0.7:\n", - " colors.append(\"#e74c3c\") # Unreliable (red)\n", - " elif width > 0.5:\n", - " colors.append(\"#f39c12\") # Poor (orange)\n", - " elif width > 0.3:\n", - " colors.append(\"#f1c40f\") # Fair (yellow)\n", - " elif width > 0.2:\n", - " colors.append(\"#3498db\") # Good (blue)\n", - " else:\n", - " colors.append(\"#2ecc71\") # Excellent (green)\n", - "\n", - " # Create bar chart\n", - " bars = ax.bar(range(len(sample_sizes)), ci_widths, color=colors, alpha=0.8)\n", - "\n", - " # Add value labels\n", - " for i, (size, width) in enumerate(zip(sample_sizes, ci_widths)):\n", - " ax.text(\n", - " i,\n", - " width + 0.02,\n", - " f\"{width:.3f}\",\n", - " ha=\"center\",\n", - " va=\"bottom\",\n", - " fontsize=10,\n", - " fontweight=\"bold\",\n", + " sample_sizes = [1, 2, 3, 5, 10, 15, 20, 30, 40, 50, 75, 100]\n", + "\n", + " def wilson_width_scalar(n, p=0.5, confidence=0.95):\n", + " z = stats.norm.ppf((1 + confidence) / 2)\n", + " num = 2 * z * np.sqrt(p * (1 - p) / n + z**2 / (4 * n**2))\n", + " den = 1 + z**2 / n\n", + " return num / den\n", + "\n", + " ci_widths = [wilson_width_scalar(n, p=0.5, confidence=0.95) 
for n in sample_sizes]\n", + "\n", + " fig, ax = plt.subplots(figsize=(12, 7))\n", + " ax.plot(sample_sizes, ci_widths, marker=\"o\")\n", + " ax.axhline(0.30, linestyle=\"--\", linewidth=2, label=\"Target width = 0.30\")\n", + "\n", + " if balanced_min in sample_sizes:\n", + " ax.axvline(\n", + " balanced_min,\n", + " linestyle=\":\",\n", + " linewidth=2,\n", + " label=f\"Balanced MIN_SAMPLES={balanced_min}\",\n", + " )\n", + " if security_first_min in sample_sizes:\n", + " ax.axvline(\n", + " security_first_min,\n", + " linestyle=\"-.\",\n", + " linewidth=2,\n", + " label=f\"Security-first MIN_SAMPLES={security_first_min}\",\n", " )\n", "\n", - " # Add horizontal threshold line\n", - " ax.axhline(\n", - " y=0.3,\n", - " color=\"green\",\n", - " linestyle=\"--\",\n", - " linewidth=2,\n", - " label=\"Acceptable Threshold (CI ≤ 0.3)\",\n", - " )\n", - "\n", - " # Mark MIN_SAMPLES=10\n", - " ax.axvline(\n", - " x=sample_sizes.index(10),\n", - " color=\"blue\",\n", - " linestyle=\":\",\n", - " linewidth=2,\n", - " label=\"Chosen MIN_SAMPLES=10\",\n", - " alpha=0.7,\n", - " )\n", + " for n, w in zip(sample_sizes, ci_widths):\n", + " ax.text(n, w + 0.02, f\"{w:.3f}\", ha=\"center\", va=\"bottom\", fontsize=9)\n", "\n", - " # Labels and formatting\n", - " ax.set_xlabel(\"Sample Size (Number of URLs per TLD)\", fontsize=13)\n", - " ax.set_ylabel(\"Wilson 95% Confidence Interval Width\", fontsize=13)\n", - " ax.set_title(\n", - " \"TLD Statistical Reliability: Wilson Confidence Interval Analysis\",\n", - " fontsize=16,\n", - " fontweight=\"bold\",\n", - " )\n", - " ax.set_xticks(range(len(sample_sizes)))\n", - " ax.set_xticklabels(sample_sizes, fontsize=11)\n", - " ax.legend(loc=\"upper right\", fontsize=12, frameon=True, fancybox=True)\n", + " ax.set_xlabel(\"Sample size per TLD\")\n", + " ax.set_ylabel(\"Wilson 95% CI width (p=0.5 reference)\")\n", + " ax.set_title(\"Reference CI Width vs Sample Size (worst-case prevalence p=0.5)\")\n", + " ax.set_ylim(0, 1.0)\n", " 
ax.grid(axis=\"y\", alpha=0.3)\n", - " ax.set_ylim([0, 1.0])\n", - "\n", - " # Add text box with interpretation\n", - " textstr = (\n", - " \"Reliability Classification:\\n\"\n", - " \"• CI > 0.7: UNRELIABLE (red)\\n\"\n", - " \"• 0.5 < CI ≤ 0.7: POOR (orange)\\n\"\n", - " \"• 0.3 < CI ≤ 0.5: FAIR (yellow)\\n\"\n", - " \"• 0.2 < CI ≤ 0.3: GOOD (blue)\\n\"\n", - " \"• CI ≤ 0.2: EXCELLENT (green)\\n\\n\"\n", - " \"MIN_SAMPLES=10 balances:\\n\"\n", - " \"• Statistical reliability (CI=0.527)\\n\"\n", - " \"• URL coverage (98.9%)\\n\"\n", - " \"• Security-first approach\"\n", - " )\n", - " props = dict(boxstyle=\"round\", facecolor=\"wheat\", alpha=0.8)\n", - " ax.text(\n", - " 0.98,\n", - " 0.98,\n", - " textstr,\n", - " transform=ax.transAxes,\n", - " fontsize=9,\n", - " verticalalignment=\"top\",\n", - " horizontalalignment=\"right\",\n", - " bbox=props,\n", - " )\n", + " ax.legend(loc=\"upper right\")\n", "\n", - " plt.tight_layout()\n", - "\n", - " # Save first, then show\n", " OUTPUT_DIR = Path(\"outputs\")\n", " OUTPUT_DIR.mkdir(parents=True, exist_ok=True)\n", - "\n", - " plt.savefig(\n", - " OUTPUT_DIR / \"tld_confidence_interval_analysis.png\",\n", - " dpi=300,\n", - " bbox_inches=\"tight\",\n", - " )\n", - "\n", - " print(\"✅ Generated: tld_confidence_interval_analysis.png\")\n", - "\n", - " # Ensure the plot displays\n", + " out_path = OUTPUT_DIR / \"tld_confidence_interval_analysis.png\"\n", + " plt.tight_layout()\n", + " plt.savefig(out_path, dpi=300, bbox_inches=\"tight\")\n", + " print(f\"✅ Generated: {out_path}\")\n", " plt.show()\n", "\n", - " print(\"📊 Plot should be visible above. 
If not, check the saved PNG file.\")\n", - "\n", "\n", - "plot_tld_confidence_intervals()\n" + "# Example call using the computed values above:\n", + "plot_tld_confidence_intervals(\n", + " balanced_min=STATISTICAL_MIN_SAMPLES, security_first_min=SECURITY_FIRST_MIN_SAMPLES\n", + ")\n" ] }, { @@ -657,24 +652,33 @@ "id": "6824cfcb", "metadata": {}, "source": [ - "### **FINAL JUSTIFIED PARAMETERS**\n", + "#### **4. FINAL JUSTIFIED PARAMETERS**\n", + "\n", + "Based on the comprehensive p̂-based confidence interval analysis, here are the defensible, data-driven parameters used in the final model:\n", + "\n", + "- **MIN_SAMPLES = STATISTICAL_MIN_SAMPLES (≈ 20)**\n", + "\n", + " - Statistical Justification: The 90th percentile Wilson CI width drops below 0.30 starting at 20+ samples, ensuring statistical reliability.\n", + " - Business Justification: Retains over 98% of all URLs, balancing reliability with broad coverage.\n", + " - Security Justification: Treats small-sample TLDs (<20) as “statistically unreliable” and defaults them to risky (global base rate).\n", + "\n", + "- **ALPHA = 1, BETA = 2 (Conservative Priors)**\n", "\n", - "Based on the comprehensive statistical analysis above, here are our **defensible, data-driven parameters**:\n", + " - Security-first: “Unknown or under-sampled TLDs are risky until proven safe.”\n", + " - Bayesian Logic: 2:1 prior bias toward phishing risk results in prior mean = 0.333 legitimate, discouraging overconfidence.\n", + " - Business Logic: Aligns with security priorities—prefer false positives (flagging a legit TLD) over false negatives (missing a phish).\n", "\n", - "**MIN_SAMPLES = 10** ✅\n", - "- **Statistical Justification**: Median is only 3, but CI analysis shows 10+ needed for reasonable reliability\n", - "- **Business Justification**: Still covers 98.9% of URLs (minimal traffic impact)\n", - "- **Security Justification**: 70.4% of TLDs are rare/unreliable and should default to \"risky\"\n", + "- **Model Outcome 
Improvements:**\n", "\n", - "**ALPHA = 1, BETA = 2** (Conservative Priors) ✅ \n", - "- **Security-first**: \"Unknown TLDs are risky until proven safe\"\n", - "- **Bayesian Logic**: 2:1 prior bias toward phishing risk\n", - "- **Business Logic**: Better false positive than false negative in security" + " - Significant drop in extreme (0.0 or 1.0) probability cases.\n", + " - Reliable smoothing for mid-range TLDs, reducing overfitting to small sample noise.\n", + " - All single-sample TLDs use global base rate; only statistically sound groups get custom probabilities.\n", + " - Lookup table saved to: data/tld_probs.json" ] }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 17, "id": "245dd077", "metadata": {}, "outputs": [ @@ -682,59 +686,28 @@ "name": "stdout", "output_type": "stream", "text": [ + "================================================================================\n", + "APPLYING STATISTICALLY JUSTIFIED PARAMETERS\n", + "================================================================================\n", "================================================================================\n", "APPLYING STATISTICALLY JUSTIFIED PARAMETERS\n", "================================================================================\n", "📊 Statistical Analysis Results:\n", - " - Median TLD samples: 3\n", - " - Confidence interval threshold: 50 samples\n", - " - URL coverage at threshold 10: 98.9%\n", + " - Chosen MIN_SAMPLES: 20\n", + " - URL coverage at chosen threshold: 98.2%\n", + " - Mean CI width (95%): 0.145\n", + " - 90th percentile CI width (95%): 0.286\n", "\n", "🎯 Final Justified Parameters:\n", - " - MIN_SAMPLES = 10 (data-driven)\n", - " - ALPHA = 1 (weak legitimacy prior)\n", - " - BETA = 2 (security-first bias)\n", + " - MIN_SAMPLES = 20 # p̂-based, p90 CI-focused\n", + " - ALPHA = 1 # weak legitimacy prior\n", + " - BETA = 2 # stronger phishing prior\n", "\n", "🔄 Recalculating with justified parameters...\n", "\n", "Method 
Distribution (Statistically Justified):\n", - " FALLBACK_STATISTICAL: 987 TLDs (70.4%)\n", - " SMOOTHED_STATISTICAL: 414 TLDs (29.6%)\n", - "\n", - "📈 Statistical Improvement Summary:\n", - " Original extreme probabilities: 911 TLDs (65.0%)\n", - " Final extreme probabilities: 0 TLDs (0.0%)\n", - " Improvement: 911 fewer extreme values\n", - " Final probability range: [0.001, 0.998]\n", - "\n", - "💾 SAVED STATISTICALLY JUSTIFIED LOOKUP TABLE\n", - " File: data\\tld_probs.json\n", - " Total TLDs: 1401\n", - " Size: 44.1 KB\n", - "\n", - "✅ STATISTICALLY JUSTIFIED TLD CALCULATION COMPLETE\n", - " 🎯 Parameters defensible with data\n", - " 🔒 Security-first bias implemented\n", - " 📊 98.9% URL coverage maintained\n", - " 📈 911 fewer overconfident predictions\n", - "\n", - "🔍 Sample Improvements (Single-sample TLDs):\n", - "TLD Raw Prob Final Prob Method \n", - "-----------------------------------------------------------------\n", - "aberdeen.sch.uk 1.000 0.574 FALLBACK_STATISTICAL\n", - "ac.cy 1.000 0.574 FALLBACK_STATISTICAL\n", - "ac.me 1.000 0.574 FALLBACK_STATISTICAL\n", - "ac.mz 1.000 0.574 FALLBACK_STATISTICAL\n", - "ac.pa 1.000 0.574 FALLBACK_STATISTICAL\n", - "\n", - "📋 Statistical Summary:\n", - " - All single-sample TLDs now use global rate (0.574)\n", - " - Only TLDs with ≥10 samples get custom probabilities\n", - " - Conservative Bayesian prior favors security (α=1, β=2)\n", - "\n", - "Method Distribution (Statistically Justified):\n", - " FALLBACK_STATISTICAL: 987 TLDs (70.4%)\n", - " SMOOTHED_STATISTICAL: 414 TLDs (29.6%)\n", + " FALLBACK_STATISTICAL: 1110 TLDs (79.2%)\n", + " SMOOTHED_STATISTICAL: 291 TLDs (20.8%)\n", "\n", "📈 Statistical Improvement Summary:\n", " Original extreme probabilities: 911 TLDs (65.0%)\n", @@ -745,13 +718,13 @@ "💾 SAVED STATISTICALLY JUSTIFIED LOOKUP TABLE\n", " File: data\\tld_probs.json\n", " Total TLDs: 1401\n", - " Size: 44.1 KB\n", + " Size: 44.3 KB\n", "\n", "✅ STATISTICALLY JUSTIFIED TLD CALCULATION COMPLETE\n", " 🎯 
Parameters defensible with data\n", - " 🔒 Security-first bias implemented\n", - " 📊 98.9% URL coverage maintained\n", - " 📈 911 fewer overconfident predictions\n", + " 📊 Coverage at MIN_SAMPLES=20: 98.2%\n", + " 🔒 Security-first prior: mean legit = 0.333\n", + " 📉 911 fewer overconfident predictions\n", "\n", "🔍 Sample Improvements (Single-sample TLDs):\n", "TLD Raw Prob Final Prob Method \n", @@ -764,8 +737,8 @@ "\n", "📋 Statistical Summary:\n", " - All single-sample TLDs now use global rate (0.574)\n", - " - Only TLDs with ≥10 samples get custom probabilities\n", - " - Conservative Bayesian prior favors security (α=1, β=2)\n" + " - Only TLDs with ≥20 samples get custom probabilities\n", + " - Conservative Beta prior favors security (α=1, β=2)\n" ] } ], @@ -783,54 +756,115 @@ "print(\"APPLYING STATISTICALLY JUSTIFIED PARAMETERS\")\n", "print(\"=\" * 80)\n", "\n", - "# Use the statistically determined parameters\n", - "FINAL_MIN_SAMPLES = 10 # From statistical analysis: balances reliability and coverage\n", - "FINAL_ALPHA = 1 # Weak legitimacy prior (security-first)\n", - "FINAL_BETA = 2 # Stronger phishing prior (conservative)\n", + "# ============================================================\n", + "# APPLY STATISTICALLY JUSTIFIED PARAMETERS (defensive version)\n", + "# ============================================================\n", + "\n", + "print(\"=\" * 80)\n", + "print(\"APPLYING STATISTICALLY JUSTIFIED PARAMETERS\")\n", + "print(\"=\" * 80)\n", + "\n", + "\n", + "# 1) Threshold from Block 2, else fallback to 20 (p̂-based p90 ≤ 0.30 sweet spot)\n", + "FINAL_MIN_SAMPLES = int(globals().get(\"STATISTICAL_MIN_SAMPLES\", 20))\n", + "\n", + "# 2) Label column. 
Must be 0/1 where 1=legit, 0=phish.\n", + "if \"label_col\" not in globals():\n", + " # Try a sensible default; fail loudly if not there.\n", + " if \"label\" in df_raw.columns:\n", + " label_col = \"label\"\n", + " else:\n", + " raise NameError(\"Define label_col (binary 0/1 with 1=legit, 0=phish).\")\n", + "\n", + "# 3) Build per-TLD summary if missing\n", + "tld_enhanced_stats = df_raw.groupby(\"TLD\")[label_col].agg(\n", + " total_count=\"size\", legit_count=\"sum\"\n", + ")\n", + "tld_enhanced_stats[\"raw_legit_prob\"] = (\n", + " tld_enhanced_stats[\"legit_count\"] / tld_enhanced_stats[\"total_count\"]\n", + ")\n", + "\n", + "# 4) Global legit rate\n", + "global_legit_rate = df_raw[label_col].mean()\n", + "\n", + "# 5) Baseline extreme count\n", + "old_extreme = int((tld_enhanced_stats[\"raw_legit_prob\"].isin([0.0, 1.0])).sum())\n", + "\n", + "# Priors (security-first tilt). Prior mean legit = 1 / (1+2) = 0.333\n", + "FINAL_ALPHA = 1\n", + "FINAL_BETA = 2\n", "\n", - "print(f\"📊 Statistical Analysis Results:\")\n", - "print(f\" - Median TLD samples: {median_samples:.0f}\")\n", - "print(f\" - Confidence interval threshold: {min_samples_ci} samples\")\n", - "print(f\" - URL coverage at threshold 10: {urls_covered_balanced:.1f}%\")\n", "\n", - "print(f\"\\n🎯 Final Justified Parameters:\")\n", - "print(f\" - MIN_SAMPLES = {FINAL_MIN_SAMPLES} (data-driven)\")\n", - "print(f\" - ALPHA = {FINAL_ALPHA} (weak legitimacy prior)\")\n", - "print(f\" - BETA = {FINAL_BETA} (security-first bias)\")\n", + "# Helper: Wilson CI width\n", + "def wilson_confidence_interval_width(\n", + " sample_size, legit_proportion, confidence_level=0.95\n", + "):\n", + " z = stats.norm.ppf((1 + confidence_level) / 2)\n", + " n = np.asarray(sample_size, dtype=float)\n", + " p = np.asarray(legit_proportion, dtype=float)\n", + " n = np.where(n <= 0, np.nan, n)\n", + " num = 2 * z * np.sqrt(p * (1 - p) / n + z**2 / (4 * n**2))\n", + " den = 1 + z**2 / n\n", + " return num / den\n", + "\n", + 
"\n", + "# Empirical coverage and CI widths at FINAL_MIN_SAMPLES\n", + "keep_tlds = tld_sample_counts[tld_sample_counts >= FINAL_MIN_SAMPLES].index\n", + "coverage_at_final = 100.0 * df_raw[\"TLD\"].isin(keep_tlds).mean()\n", + "\n", + "g_final = (\n", + " df_raw[df_raw[\"TLD\"].isin(keep_tlds)]\n", + " .groupby(\"TLD\")[label_col]\n", + " .agg(total_urls=\"size\", legit_urls=\"sum\")\n", + ")\n", + "if len(g_final):\n", + " g_final[\"legit_proportion\"] = g_final[\"legit_urls\"] / g_final[\"total_urls\"]\n", + " g_final[\"ci_width_95\"] = wilson_confidence_interval_width(\n", + " g_final[\"total_urls\"].to_numpy(),\n", + " g_final[\"legit_proportion\"].to_numpy(),\n", + " confidence_level=0.95,\n", + " )\n", + " mean_width_at_final = float(np.nanmean(g_final[\"ci_width_95\"]))\n", + " p90_width_at_final = float(np.nanpercentile(g_final[\"ci_width_95\"], 90))\n", + "else:\n", + " mean_width_at_final = float(\"nan\")\n", + " p90_width_at_final = float(\"nan\")\n", + "\n", + "print(\"📊 Statistical Analysis Results:\")\n", + "print(f\" - Chosen MIN_SAMPLES: {FINAL_MIN_SAMPLES}\")\n", + "print(f\" - URL coverage at chosen threshold: {coverage_at_final:.1f}%\")\n", + "print(f\" - Mean CI width (95%): {mean_width_at_final:.3f}\")\n", + "print(f\" - 90th percentile CI width (95%): {p90_width_at_final:.3f}\")\n", + "\n", + "print(\"\\n🎯 Final Justified Parameters:\")\n", + "print(f\" - MIN_SAMPLES = {FINAL_MIN_SAMPLES} # p̂-based, p90 CI-focused\")\n", + "print(f\" - ALPHA = {FINAL_ALPHA} # weak legitimacy prior\")\n", + "print(f\" - BETA = {FINAL_BETA} # stronger phishing prior\")\n", "\n", "\n", "def calculate_final_probability(row):\n", - " \"\"\"Calculate TLD probability with statistically justified parameters\"\"\"\n", + " \"\"\"Final TLD legit probability with fallback + Beta smoothing.\"\"\"\n", " total = row[\"total_count\"]\n", " legit = row[\"legit_count\"]\n", - "\n", " if total < FINAL_MIN_SAMPLES:\n", - " # Use global base rate for statistically unreliable 
TLDs\n", " return global_legit_rate, \"FALLBACK_STATISTICAL\"\n", - " else:\n", - " # Bayesian smoothing with security-first priors\n", - " smoothed_prob = (legit + FINAL_ALPHA) / (total + FINAL_ALPHA + FINAL_BETA)\n", - " return smoothed_prob, \"SMOOTHED_STATISTICAL\"\n", + " smoothed_prob = (legit + FINAL_ALPHA) / (total + FINAL_ALPHA + FINAL_BETA)\n", + " return smoothed_prob, \"SMOOTHED_STATISTICAL\"\n", "\n", "\n", - "# Apply the statistically justified calculation\n", - "print(f\"\\n🔄 Recalculating with justified parameters...\")\n", + "print(\"\\n🔄 Recalculating with justified parameters...\")\n", "final_results = tld_enhanced_stats.apply(calculate_final_probability, axis=1)\n", "tld_enhanced_stats[\"final_legit_prob\"] = [r[0] for r in final_results]\n", "tld_enhanced_stats[\"final_method\"] = [r[1] for r in final_results]\n", "\n", - "# Compare the impact\n", - "print(f\"\\nMethod Distribution (Statistically Justified):\")\n", + "print(\"\\nMethod Distribution (Statistically Justified):\")\n", "final_method_counts = tld_enhanced_stats[\"final_method\"].value_counts()\n", "for method, count in final_method_counts.items():\n", " print(f\" {method}: {count} TLDs ({count / len(tld_enhanced_stats) * 100:.1f}%)\")\n", "\n", - "# Statistical validation of final approach\n", - "final_extreme = sum(\n", - " 1 for p in tld_enhanced_stats[\"final_legit_prob\"] if p in [0.0, 1.0]\n", - ")\n", + "final_extreme = int((tld_enhanced_stats[\"final_legit_prob\"].isin([0.0, 1.0])).sum())\n", "\n", - "print(f\"\\n📈 Statistical Improvement Summary:\")\n", + "print(\"\\n📈 Statistical Improvement Summary:\")\n", "print(\n", " f\" Original extreme probabilities: {old_extreme} TLDs ({old_extreme / len(tld_enhanced_stats) * 100:.1f}%)\"\n", ")\n", @@ -838,52 +872,45 @@ " f\" Final extreme probabilities: {final_extreme} TLDs ({final_extreme / len(tld_enhanced_stats) * 100:.1f}%)\"\n", ")\n", "print(f\" Improvement: {old_extreme - final_extreme} fewer extreme values\")\n", - "\n", - 
"# Final probability range\n", "final_range = f\"[{tld_enhanced_stats['final_legit_prob'].min():.3f}, {tld_enhanced_stats['final_legit_prob'].max():.3f}]\"\n", "print(f\" Final probability range: {final_range}\")\n", "\n", - "# Create the production-ready lookup table\n", + "# Production lookup\n", "tld_probs_final = tld_enhanced_stats[\"final_legit_prob\"].to_dict()\n", "\n", - "# Save the statistically justified version\n", "output_path_final = Path(\"data/tld_probs.json\")\n", + "output_path_final.parent.mkdir(parents=True, exist_ok=True)\n", "with open(output_path_final, \"w\") as f:\n", " json.dump(tld_probs_final, f, indent=2, sort_keys=True)\n", "\n", - "print(f\"\\n💾 SAVED STATISTICALLY JUSTIFIED LOOKUP TABLE\")\n", + "print(\"\\n💾 SAVED STATISTICALLY JUSTIFIED LOOKUP TABLE\")\n", "print(f\" File: {output_path_final}\")\n", "print(f\" Total TLDs: {len(tld_probs_final)}\")\n", "print(f\" Size: {output_path_final.stat().st_size / 1024:.1f} KB\")\n", "\n", - "# Update the main tld_probs variable for downstream use\n", "tld_probs = tld_probs_final.copy()\n", "\n", - "print(f\"\\n✅ STATISTICALLY JUSTIFIED TLD CALCULATION COMPLETE\")\n", - "print(f\" 🎯 Parameters defensible with data\")\n", - "print(f\" 🔒 Security-first bias implemented\")\n", - "print(f\" 📊 98.9% URL coverage maintained\")\n", - "print(f\" 📈 {old_extreme - final_extreme} fewer overconfident predictions\")\n", + "print(\"\\n✅ STATISTICALLY JUSTIFIED TLD CALCULATION COMPLETE\")\n", + "print(\" 🎯 Parameters defensible with data\")\n", + "print(f\" 📊 Coverage at MIN_SAMPLES={FINAL_MIN_SAMPLES}: {coverage_at_final:.1f}%\")\n", + "print(\n", + " f\" 🔒 Security-first prior: mean legit = {FINAL_ALPHA / (FINAL_ALPHA + FINAL_BETA):.3f}\"\n", + ")\n", + "print(f\" 📉 {old_extreme - final_extreme} fewer overconfident predictions\")\n", "\n", - "# Show some examples of the improvement\n", - "print(f\"\\n🔍 Sample Improvements (Single-sample TLDs):\")\n", + "print(\"\\n🔍 Sample Improvements (Single-sample 
TLDs):\")\n", "print(f\"{'TLD':<20} {'Raw Prob':<12} {'Final Prob':<12} {'Method':<20}\")\n", "print(\"-\" * 65)\n", - "\n", "sample_singles = tld_enhanced_stats[tld_enhanced_stats[\"total_count\"] == 1].head(5)\n", "for tld, row in sample_singles.iterrows():\n", - " raw_prob = row[\"raw_legit_prob\"]\n", - " final_prob = row[\"final_legit_prob\"]\n", - " method = row[\"final_method\"]\n", - "\n", - " print(f\"{tld:<20} {raw_prob:<12.3f} {final_prob:<12.3f} {method:<20}\")\n", + " print(\n", + " f\"{tld:<20} {row['raw_legit_prob']:<12.3f} {row['final_legit_prob']:<12.3f} {row['final_method']:<20}\"\n", + " )\n", "\n", - "print(f\"\\n📋 Statistical Summary:\")\n", + "print(\"\\n📋 Statistical Summary:\")\n", "print(f\" - All single-sample TLDs now use global rate ({global_legit_rate:.3f})\")\n", "print(f\" - Only TLDs with ≥{FINAL_MIN_SAMPLES} samples get custom probabilities\")\n", - "print(\n", - " f\" - Conservative Bayesian prior favors security (α={FINAL_ALPHA}, β={FINAL_BETA})\"\n", - ")" + "print(f\" - Conservative Beta prior favors security (α={FINAL_ALPHA}, β={FINAL_BETA})\")\n" ] }, { @@ -896,7 +923,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 4, "id": "82173c48", "metadata": {}, "outputs": [ @@ -904,14 +931,14 @@ "name": "stdout", "output_type": "stream", "text": [ - "Extracting features for 234764 URLs...\n" + "Extracting features for 235370 URLs...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Extracting features: 100%|██████████| 234764/234764 [00:34<00:00, 6782.67it/s]" + "Extracting features: 100%|██████████| 235370/235370 [00:26<00:00, 8953.90it/s] \n" ] }, { @@ -920,74 +947,138 @@ "text": [ "\n", "✓ Extraction complete\n", - " Success: 234764 / 234764\n", - " Failed: 0 / 234764\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], - "source": [ - "# Check if URL column exists\n", - "if \"URL\" not in df_raw.columns:\n", - " raise ValueError(\"URL column not 
found in dataset\")\n", - "\n", - "print(f\"Extracting features for {len(df_raw)} URLs...\")\n", - "\n", - "# Extract features for each URL\n", - "features_list = []\n", - "failed_urls = []\n", - "\n", - "for idx, row in tqdm(df_raw.iterrows(), total=len(df_raw), desc=\"Extracting features\"):\n", - " try:\n", - " url = row[\"URL\"]\n", - " # Extract ALL 8 features (supports both 7-feature and 8-feature models)\n", - " features = extract_features(url, include_https=True)\n", - " features[\"URL\"] = url # Keep URL for reference\n", - " features[\"label\"] = row[label_col] # Keep label for each row\n", - " features_list.append(features)\n", - "\n", - " except Exception as e:\n", - " # Log failed URLs but continue\n", - " url = row[\"URL\"] if \"row\" in locals() else \"unknown\"\n", - " failed_urls.append((url, str(e)))\n", - " if len(failed_urls) <= 5: # Only print first 5 errors\n", - " print(f\"Failed to extract features for {url}: {e}\")\n", - "\n", - "print(f\"\\n✓ Extraction complete\")\n", - "print(f\" Success: {len(features_list)} / {len(df_raw)}\")\n", - "print(f\" Failed: {len(failed_urls)} / {len(df_raw)}\")\n", - "\n", - "if len(failed_urls) > 0:\n", - " print(f\"\\nFirst 5 failures:\")\n", - " for url, error in failed_urls[:5]:\n", - " print(f\" {url[:50]}... 
→ {error}\")" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "id": "8d5a9c4d", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Feature DataFrame shape: (234764, 10)\n", + " Success: 235370 / 235370\n", + " Failed: 0 / 235370\n", + "Feature DataFrame shape: (235370, 9)\n", "\n", - "Columns: ['IsHTTPS', 'TLDLegitimateProb', 'CharContinuationRate', 'SpacialCharRatioInURL', 'URLCharProb', 'LetterRatioInURL', 'NoOfOtherSpecialCharsInURL', 'DomainLength', 'URL', 'label']\n", + "Columns: ['TLDLegitimateProb', 'CharContinuationRate', 'SpacialCharRatioInURL', 'URLCharProb', 'LetterRatioInURL', 'NoOfOtherSpecialCharsInURL', 'DomainLength', 'URL', 'label']\n", "\n", "First few rows:\n" ] }, { "data": { + "application/vnd.microsoft.datawrangler.viewer.v0+json": { + "columns": [ + { + "name": "index", + "rawType": "int64", + "type": "integer" + }, + { + "name": "TLDLegitimateProb", + "rawType": "float64", + "type": "float" + }, + { + "name": "CharContinuationRate", + "rawType": "float64", + "type": "float" + }, + { + "name": "SpacialCharRatioInURL", + "rawType": "float64", + "type": "float" + }, + { + "name": "URLCharProb", + "rawType": "float64", + "type": "float" + }, + { + "name": "LetterRatioInURL", + "rawType": "float64", + "type": "float" + }, + { + "name": "NoOfOtherSpecialCharsInURL", + "rawType": "int64", + "type": "integer" + }, + { + "name": "DomainLength", + "rawType": "int64", + "type": "integer" + }, + { + "name": "URL", + "rawType": "object", + "type": "string" + }, + { + "name": "label", + "rawType": "int64", + "type": "integer" + } + ], + "ref": "7ae0a596-edc8-4054-9f40-ba0ce72fbc72", + "rows": [ + [ + "0", + "0.6119968682159508", + "0.12903225806451613", + "0.15625", + "1.0", + "0.84375", + "5", + "24", + "https://www.southbankmosaics.com", + "1" + ], + [ + "1", + "0.8283712784588442", + "0.17391304347826086", + "0.25", + "1.0", + "0.75", + "6", + "16", + "https://www.uni-mainz.de", + "1" + ], 
+ [ + "2", + "0.9317347687790556", + "0.13793103448275862", + "0.2", + "1.0", + "0.8", + "6", + "22", + "https://www.voicefmradio.co.uk", + "1" + ], + [ + "3", + "0.6119968682159508", + "0.15384615384615385", + "0.18518518518518517", + "1.0", + "0.8148148148148148", + "5", + "19", + "https://www.sfnmjournal.com", + "1" + ], + [ + "4", + "0.8792231976589518", + "0.12121212121212122", + "0.14705882352941177", + "1.0", + "0.8529411764705882", + "5", + "26", + "https://www.rewildingargentina.org", + "1" + ] + ], + "shape": { + "columns": 9, + "rows": 5 + } + }, "text/html": [ "
\n", "