diff --git a/CHANGELOG.md b/CHANGELOG.md index 99bcb2d..dd16c5a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,16 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/). +## [0.12.0] - 2026-02-23 + +### Fixed + +- **MT regex** (#14): separator between alpha prefix and digits is now optional (`MST1000` accepted alongside `MST 1000` and `MST-1000`). Previously, codes without a space failed regex extraction and fell to approximate matching with lower confidence. + +### Added + +- **Country-level majority-vote fallback**: new Tier 4 in the lookup chain for countries where all postal codes map to the same NUTS1/NUTS2 but NUTS3 has a dominant winner. Returns `match_type: "approximate"` with NUTS1/NUTS2 confidence 1.0 and NUTS3 confidence based on agreement ratio (capped at 0.80). Naturally captures MT (MT0/MT00/MT001 at ~77%). Digit-only MT codes like `1043` that previously returned 404 now get a valid approximate result. + ## [0.11.0] - 2026-02-23 ### Added diff --git a/app/__init__.py b/app/__init__.py index ae6db5f..ea370a8 100644 --- a/app/__init__.py +++ b/app/__init__.py @@ -1 +1 @@ -__version__ = "0.11.0" +__version__ = "0.12.0" diff --git a/app/data_loader.py b/app/data_loader.py index a448aed..354685a 100644 --- a/app/data_loader.py +++ b/app/data_loader.py @@ -35,6 +35,10 @@ # Countries with a single NUTS3 region: country_code -> nuts3 code _single_nuts3: dict[str, str] = {} +# Country-level majority-vote fallback for countries where NUTS1/NUTS2 +# are unanimous but NUTS3 has a dominant winner (e.g. MT → MT0/MT00/MT001) +_country_fallback: dict[str, dict] = {} + # NUTS region names: nuts_id -> name_latn _nuts_names: dict[str, str] = {} @@ -619,6 +623,41 @@ def _build_prefix_index() -> None: if _single_nuts3: logger.info("Single-NUTS3 countries: %s", ", ".join(sorted(_single_nuts3))) + # Country-level majority-vote fallback for countries NOT in _single_nuts3 + # where NUTS1 and NUTS2 are unanimous but NUTS3 has a dominant winner + _country_fallback.clear() + caps = settings.approximate_confidence_caps + for cc, nuts3_set in country_nuts3.items(): + if cc in _single_nuts3: + continue + nuts1_set = {n[:3] for n in nuts3_set} + nuts2_set = {n[:4] for n in nuts3_set} + if len(nuts1_set) != 1 or len(nuts2_set) != 1: + continue + # Count postal codes per NUTS3 to find dominant region + nuts3_counts: Counter[str] = Counter() + for (c, _), n3 in _lookup.items(): + if c == cc: + nuts3_counts[n3] += 1 + total = sum(nuts3_counts.values()) + if total == 0: + continue + winner, winner_count = nuts3_counts.most_common(1)[0] + ratio = winner_count / total + _country_fallback[cc] = { + "nuts1": next(iter(nuts1_set)), + "nuts1_confidence": 1.0, + "nuts2": next(iter(nuts2_set)), + "nuts2_confidence": 1.0, + "nuts3": winner, + "nuts3_confidence": round(min(ratio, caps["nuts3"]), 2), + } + if _country_fallback: + logger.info( + "Country-level fallback: %s", + ", ".join(f"{cc}→{v['nuts3']}" for cc, v in sorted(_country_fallback.items())), + ) + def _estimate_by_prefix(cc: str, postal_code: str) -> dict | None: """Runtime estimation via longest prefix match + majority vote. @@ -910,11 +949,12 @@ def load_data() -> None: def lookup(country_code: str, postal_code: str) -> dict | None: """Look up NUTS codes for a given country + postal code. - Four-tier fall-through: + Five-tier fall-through: 1. Exact TERCET match → confidence 1.0 2. Pre-computed estimate → stored confidence per level 3. Runtime prefix-based estimation → calculated confidence - 4. Single-NUTS3 country fallback → confidence 1.0 (e.g. LI, CY, LU) + 4. Country-level majority vote → unanimous NUTS1/2, dominant NUTS3 (e.g. MT) + 5. Single-NUTS3 country fallback → confidence 1.0 (e.g. LI, CY, LU) Returns a dict with nuts1/2/3, match_type, and per-level confidence, or None. """ @@ -962,7 +1002,21 @@ def lookup(country_code: str, postal_code: str) -> dict | None: approx.update(_resolve_names(approx["nuts1"], approx["nuts2"], approx["nuts3"])) return approx - # Tier 4: Single-NUTS3 country fallback (e.g. LI → LI000) + # Tier 4: Country-level majority vote (unanimous NUTS1/2, dominant NUTS3) + fallback = _country_fallback.get(cc) + if fallback is not None: + return { + "match_type": "approximate", + "nuts1": fallback["nuts1"], + "nuts1_confidence": fallback["nuts1_confidence"], + "nuts2": fallback["nuts2"], + "nuts2_confidence": fallback["nuts2_confidence"], + "nuts3": fallback["nuts3"], + "nuts3_confidence": fallback["nuts3_confidence"], + **_resolve_names(fallback["nuts1"], fallback["nuts2"], fallback["nuts3"]), + } + + # Tier 5: Single-NUTS3 country fallback (e.g. LI → LI000) nuts3 = _single_nuts3.get(cc) if nuts3 is not None: return { diff --git a/app/postal_patterns.json b/app/postal_patterns.json index b791d6e..eaec6fd 100644 --- a/app/postal_patterns.json +++ b/app/postal_patterns.json @@ -116,8 +116,8 @@ "expected_digits": 4 }, "MT": { - "regex": "^([A-Z]{2,3}\\s\\d{2,4})$", - "example": "VLT 1010, FNT 1010, MSK 1234", + "regex": "^([A-Z]{2,3}[\\s\\-]?\\d{2,4})$", + "example": "VLT 1010, MST1000, FNT-1010", "tercet_map": "keep_alpha" }, "NL": {