From c1a5e84859c5932b19f26e245619d3c0a598861f Mon Sep 17 00:00:00 2001 From: bk86a <41694587+bk86a@users.noreply.github.com> Date: Mon, 23 Feb 2026 20:20:50 +0100 Subject: [PATCH] feat: relax MT regex and add country-level majority-vote fallback (#14) Make the separator between alpha prefix and digits optional in the MT postal code regex (space, dash, or nothing all accepted). Codes like MST1000 now get exact TERCET matches instead of falling to approximate. Add a new Tier 4 country-level fallback for countries where all postal codes share the same NUTS1/NUTS2 but NUTS3 has a dominant winner. Digit-only MT codes that previously returned 404 now resolve to MT0/MT00/MT001 with appropriate confidence scores. --- CHANGELOG.md | 10 +++++++ app/__init__.py | 2 +- app/data_loader.py | 60 ++++++++++++++++++++++++++++++++++++++-- app/postal_patterns.json | 4 +-- 4 files changed, 70 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 99bcb2d..dd16c5a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,16 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/). +## [0.12.0] - 2026-02-23 + +### Fixed + +- **MT regex** (#14): separator between alpha prefix and digits is now optional (`MST1000` accepted alongside `MST 1000` and `MST-1000`). Previously, codes without a space failed regex extraction and fell to approximate matching with lower confidence. + +### Added + +- **Country-level majority-vote fallback**: new Tier 4 in the lookup chain for countries where all postal codes map to the same NUTS1/NUTS2 but NUTS3 has a dominant winner. Returns `match_type: "approximate"` with NUTS1/NUTS2 confidence 1.0 and NUTS3 confidence based on agreement ratio (capped at 0.80). Naturally captures MT (MT0/MT00/MT001 at ~77%). Digit-only MT codes like `1043` that previously returned 404 now get a valid approximate result. + ## [0.11.0] - 2026-02-23 ### Added diff --git a/app/__init__.py b/app/__init__.py index ae6db5f..ea370a8 100644 --- a/app/__init__.py +++ b/app/__init__.py @@ -1 +1 @@ -__version__ = "0.11.0" +__version__ = "0.12.0" diff --git a/app/data_loader.py b/app/data_loader.py index a448aed..354685a 100644 --- a/app/data_loader.py +++ b/app/data_loader.py @@ -35,6 +35,10 @@ # Countries with a single NUTS3 region: country_code -> nuts3 code _single_nuts3: dict[str, str] = {} +# Country-level majority-vote fallback for countries where NUTS1/NUTS2 +# are unanimous but NUTS3 has a dominant winner (e.g. MT → MT0/MT00/MT001) +_country_fallback: dict[str, dict] = {} + # NUTS region names: nuts_id -> name_latn _nuts_names: dict[str, str] = {} @@ -619,6 +623,41 @@ def _build_prefix_index() -> None: if _single_nuts3: logger.info("Single-NUTS3 countries: %s", ", ".join(sorted(_single_nuts3))) + # Country-level majority-vote fallback for countries NOT in _single_nuts3 + # where NUTS1 and NUTS2 are unanimous but NUTS3 has a dominant winner + _country_fallback.clear() + caps = settings.approximate_confidence_caps + for cc, nuts3_set in country_nuts3.items(): + if cc in _single_nuts3: + continue + nuts1_set = {n[:3] for n in nuts3_set} + nuts2_set = {n[:4] for n in nuts3_set} + if len(nuts1_set) != 1 or len(nuts2_set) != 1: + continue + # Count postal codes per NUTS3 to find dominant region + nuts3_counts: Counter[str] = Counter() + for (c, _), n3 in _lookup.items(): + if c == cc: + nuts3_counts[n3] += 1 + total = sum(nuts3_counts.values()) + if total == 0: + continue + winner, winner_count = nuts3_counts.most_common(1)[0] + ratio = winner_count / total + _country_fallback[cc] = { + "nuts1": next(iter(nuts1_set)), + "nuts1_confidence": 1.0, + "nuts2": next(iter(nuts2_set)), + "nuts2_confidence": 1.0, + "nuts3": winner, + "nuts3_confidence": round(min(ratio, caps["nuts3"]), 2), + } + if _country_fallback: + logger.info( + "Country-level fallback: %s", + ", ".join(f"{cc}→{v['nuts3']}" for cc, v in sorted(_country_fallback.items())), + ) + def _estimate_by_prefix(cc: str, postal_code: str) -> dict | None: """Runtime estimation via longest prefix match + majority vote. @@ -910,11 +949,12 @@ def load_data() -> None: def lookup(country_code: str, postal_code: str) -> dict | None: """Look up NUTS codes for a given country + postal code. - Four-tier fall-through: + Five-tier fall-through: 1. Exact TERCET match → confidence 1.0 2. Pre-computed estimate → stored confidence per level 3. Runtime prefix-based estimation → calculated confidence - 4. Single-NUTS3 country fallback → confidence 1.0 (e.g. LI, CY, LU) + 4. Country-level majority vote → unanimous NUTS1/2, dominant NUTS3 (e.g. MT) + 5. Single-NUTS3 country fallback → confidence 1.0 (e.g. LI, CY, LU) Returns a dict with nuts1/2/3, match_type, and per-level confidence, or None. """ @@ -962,7 +1002,21 @@ def lookup(country_code: str, postal_code: str) -> dict | None: approx.update(_resolve_names(approx["nuts1"], approx["nuts2"], approx["nuts3"])) return approx - # Tier 4: Single-NUTS3 country fallback (e.g. LI → LI000) + # Tier 4: Country-level majority vote (unanimous NUTS1/2, dominant NUTS3) + fallback = _country_fallback.get(cc) + if fallback is not None: + return { + "match_type": "approximate", + "nuts1": fallback["nuts1"], + "nuts1_confidence": fallback["nuts1_confidence"], + "nuts2": fallback["nuts2"], + "nuts2_confidence": fallback["nuts2_confidence"], + "nuts3": fallback["nuts3"], + "nuts3_confidence": fallback["nuts3_confidence"], + **_resolve_names(fallback["nuts1"], fallback["nuts2"], fallback["nuts3"]), + } + + # Tier 5: Single-NUTS3 country fallback (e.g. LI → LI000) nuts3 = _single_nuts3.get(cc) if nuts3 is not None: return { diff --git a/app/postal_patterns.json b/app/postal_patterns.json index b791d6e..eaec6fd 100644 --- a/app/postal_patterns.json +++ b/app/postal_patterns.json @@ -116,8 +116,8 @@ "expected_digits": 4 }, "MT": { - "regex": "^([A-Z]{2,3}\\s\\d{2,4})$", - "example": "VLT 1010, FNT 1010, MSK 1234", + "regex": "^([A-Z]{2,3}[\\s\\-]?\\d{2,4})$", + "example": "VLT 1010, MST1000, FNT-1010", "tercet_map": "keep_alpha" }, "NL": {