Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,16 @@ All notable changes to this project will be documented in this file.

The format is based on [Keep a Changelog](https://keepachangelog.com/).

## [0.12.0] - 2026-02-23

### Fixed

- **MT regex** (#14): the separator between the alpha prefix and the digits is now optional and may be whitespace or a hyphen (`MST1000`, `MST 1000`, and `MST-1000` all match the pattern). Previously, codes without a space failed regex extraction and fell back to approximate matching with lower confidence.

### Added

- **Country-level majority-vote fallback**: new Tier 4 in the lookup chain for countries where all postal codes map to the same NUTS1/NUTS2 but NUTS3 has a dominant winner. Returns `match_type: "approximate"` with NUTS1/NUTS2 confidence 1.0 and NUTS3 confidence based on the winner's agreement ratio (capped at 0.80). This naturally covers MT (MT0/MT00/MT001 at ~77% agreement). Digit-only MT codes like `1043`, which previously returned 404, now get a valid approximate result.

## [0.11.0] - 2026-02-23

### Added
Expand Down
2 changes: 1 addition & 1 deletion app/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.11.0"
__version__ = "0.12.0"
60 changes: 57 additions & 3 deletions app/data_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,10 @@
# Countries with a single NUTS3 region: country_code -> nuts3 code
_single_nuts3: dict[str, str] = {}

# Country-level majority-vote fallback for countries where NUTS1/NUTS2
# are unanimous but NUTS3 has a dominant winner (e.g. MT → MT0/MT00/MT001)
_country_fallback: dict[str, dict] = {}

# NUTS region names: nuts_id -> name_latn
_nuts_names: dict[str, str] = {}

Expand Down Expand Up @@ -619,6 +623,41 @@ def _build_prefix_index() -> None:
if _single_nuts3:
logger.info("Single-NUTS3 countries: %s", ", ".join(sorted(_single_nuts3)))

# Country-level majority-vote fallback for countries NOT in _single_nuts3
# where NUTS1 and NUTS2 are unanimous but NUTS3 has a dominant winner
_country_fallback.clear()
caps = settings.approximate_confidence_caps
for cc, nuts3_set in country_nuts3.items():
if cc in _single_nuts3:
continue
nuts1_set = {n[:3] for n in nuts3_set}
nuts2_set = {n[:4] for n in nuts3_set}
if len(nuts1_set) != 1 or len(nuts2_set) != 1:
continue
# Count postal codes per NUTS3 to find dominant region
nuts3_counts: Counter[str] = Counter()
for (c, _), n3 in _lookup.items():
if c == cc:
nuts3_counts[n3] += 1
total = sum(nuts3_counts.values())
if total == 0:
continue
winner, winner_count = nuts3_counts.most_common(1)[0]
ratio = winner_count / total
_country_fallback[cc] = {
"nuts1": next(iter(nuts1_set)),
"nuts1_confidence": 1.0,
"nuts2": next(iter(nuts2_set)),
"nuts2_confidence": 1.0,
"nuts3": winner,
"nuts3_confidence": round(min(ratio, caps["nuts3"]), 2),
}
if _country_fallback:
logger.info(
"Country-level fallback: %s",
", ".join(f"{cc}→{v['nuts3']}" for cc, v in sorted(_country_fallback.items())),
)


def _estimate_by_prefix(cc: str, postal_code: str) -> dict | None:
"""Runtime estimation via longest prefix match + majority vote.
Expand Down Expand Up @@ -910,11 +949,12 @@ def load_data() -> None:
def lookup(country_code: str, postal_code: str) -> dict | None:
"""Look up NUTS codes for a given country + postal code.

Four-tier fall-through:
Five-tier fall-through:
1. Exact TERCET match → confidence 1.0
2. Pre-computed estimate → stored confidence per level
3. Runtime prefix-based estimation → calculated confidence
4. Single-NUTS3 country fallback → confidence 1.0 (e.g. LI, CY, LU)
4. Country-level majority vote → unanimous NUTS1/2, dominant NUTS3 (e.g. MT)
5. Single-NUTS3 country fallback → confidence 1.0 (e.g. LI, CY, LU)

Returns a dict with nuts1/2/3, match_type, and per-level confidence, or None.
"""
Expand Down Expand Up @@ -962,7 +1002,21 @@ def lookup(country_code: str, postal_code: str) -> dict | None:
approx.update(_resolve_names(approx["nuts1"], approx["nuts2"], approx["nuts3"]))
return approx

# Tier 4: Single-NUTS3 country fallback (e.g. LI → LI000)
# Tier 4: Country-level majority vote (unanimous NUTS1/2, dominant NUTS3)
fallback = _country_fallback.get(cc)
if fallback is not None:
return {
"match_type": "approximate",
"nuts1": fallback["nuts1"],
"nuts1_confidence": fallback["nuts1_confidence"],
"nuts2": fallback["nuts2"],
"nuts2_confidence": fallback["nuts2_confidence"],
"nuts3": fallback["nuts3"],
"nuts3_confidence": fallback["nuts3_confidence"],
**_resolve_names(fallback["nuts1"], fallback["nuts2"], fallback["nuts3"]),
}

# Tier 5: Single-NUTS3 country fallback (e.g. LI → LI000)
nuts3 = _single_nuts3.get(cc)
if nuts3 is not None:
return {
Expand Down
4 changes: 2 additions & 2 deletions app/postal_patterns.json
Original file line number Diff line number Diff line change
Expand Up @@ -116,8 +116,8 @@
"expected_digits": 4
},
"MT": {
"regex": "^([A-Z]{2,3}\\s\\d{2,4})$",
"example": "VLT 1010, FNT 1010, MSK 1234",
"regex": "^([A-Z]{2,3}[\\s\\-]?\\d{2,4})$",
"example": "VLT 1010, MST1000, FNT-1010",
"tercet_map": "keep_alpha"
},
"NL": {
Expand Down
Loading