Files
alfred/alfred/infrastructure/knowledge/language_registry.py
T
francwa 5bcf22b408 refactor(shared): Language VO is strict; from_raw() factory for un-normalized input
object.__setattr__ inside __post_init__ on a frozen dataclass is a
code smell — it bypasses the immutability guarantee to mutate fields
mid-construction. Split the responsibilities:

* Direct constructor is strict — rejects un-normalized input (uppercase
  iso, whitespace in aliases, etc.) so once a Language exists in the
  system, its fields are guaranteed canonical.
* Language.from_raw() factory handles arbitrary YAML/user input — it
  lowercases the iso, dedups/normalizes aliases, then constructs.

Only caller that built from raw data (LanguageRegistry loading YAML)
moves to from_raw(). Test fixtures already pass normalized data so
they keep using the direct constructor.
2026-05-20 23:48:30 +02:00

130 lines
4.4 KiB
Python

"""LanguageRegistry — loads and queries the canonical language table from YAML.
Builtin entries live in ``alfred/knowledge/iso_languages.yaml`` (versioned).
Learned entries can be added to ``data/knowledge/iso_languages_learned.yaml``
(gitignored, instance-local) and are merged additively — they extend builtin
languages or add new ones, never remove builtin entries.
"""
import logging
from pathlib import Path
import yaml
import alfred as _alfred_pkg
from alfred.domain.shared.value_objects import Language
logger = logging.getLogger(__name__)
_BUILTIN_ROOT = Path(_alfred_pkg.__file__).parent / "knowledge"
_LEARNED_ROOT = Path(_alfred_pkg.__file__).parent.parent / "data" / "knowledge"
def _load_yaml(path: Path) -> dict:
    """Read a YAML file and return its top-level mapping, best-effort.

    Read and parse failures are swallowed so that a missing or broken
    knowledge file degrades to "no entries" instead of propagating an error.

    Args:
        path: Location of the YAML file to read.

    Returns:
        The parsed top-level mapping. An empty dict is returned when the file
        does not exist (silently), when it cannot be read or parsed (a warning
        is logged), when it is empty, or when its top-level node is not a
        mapping (a warning is logged).
    """
    try:
        with open(path, encoding="utf-8") as f:
            data = yaml.safe_load(f)
    except FileNotFoundError:
        # A missing file is treated as "no entries" without logging.
        return {}
    except Exception as e:
        # Deliberately broad: any read/parse failure degrades to "no entries".
        logger.warning("LanguageRegistry: could not load %s: %s", path, e)
        return {}

    if not data:
        # Empty document (safe_load gave None) or an empty container.
        return {}
    if not isinstance(data, dict):
        # Callers immediately call .get() on the result; a list or scalar at
        # top level would otherwise surface as an AttributeError there.
        logger.warning(
            "LanguageRegistry: ignoring %s: expected a mapping at top level, got %s",
            path,
            type(data).__name__,
        )
        return {}
    return data
def _merge_language_entries(base: dict, override: dict) -> dict:
    """Merge learned language entries into builtin entries.

    Neither input is mutated; a new top-level dict is returned and every
    merged entry is a fresh shallow copy.

    Args:
        base: Builtin table, mapping iso code -> entry mapping.
        override: Learned table with the same shape.

    Returns:
        A dict holding every iso from ``base`` (original order) followed by
        isos that only exist in ``override``. For isos present in both:

        * ``aliases`` lists are extended: override aliases are appended in
          order, skipping any already present in the base list and any
          repeated within the override list itself;
        * a null ``aliases`` override is ignored, so it cannot erase the
          builtin aliases;
        * every other field in override wins over base.

        An override entry that is not a mapping is ignored when the iso
        already exists, leaving the builtin entry untouched. Entries for
        isos absent from ``base`` are stored as given.
    """
    result = dict(base)
    for iso, override_entry in override.items():
        if iso not in result:
            result[iso] = override_entry
            continue
        if not isinstance(override_entry, dict):
            # e.g. "fra:" with no body in the learned file — nothing to merge.
            continue

        base_entry = result[iso]
        # A null/non-mapping builtin entry is treated as empty rather than
        # crashing on dict(None).
        merged = dict(base_entry) if isinstance(base_entry, dict) else {}

        for key, val in override_entry.items():
            if key == "aliases":
                if val is None:
                    continue
                if isinstance(val, list):
                    existing = merged.get("aliases", []) or []
                    additions: list = []
                    for alias in val:
                        # List membership (not a set) so unhashable values
                        # keep working exactly as before.
                        if alias not in existing and alias not in additions:
                            additions.append(alias)
                    merged["aliases"] = existing + additions
                    continue
            merged[key] = val
        result[iso] = merged
    return result
class LanguageRegistry:
    """Loads the canonical language table and provides lookup methods.

    The table is built once, at construction time, from the builtin YAML file
    merged with the optional learned YAML file.

    Usage::

        registry = LanguageRegistry()
        fr = registry.from_iso("fra")
        fr2 = registry.from_any("French")  # → same Language as `fr`
        fr3 = registry.from_any("fr")      # → same Language
        fr4 = registry.from_any("vostfr")  # → None (vostfr is subtitle-specific,
                                           #   lives in subtitles knowledge)
    """

    def __init__(self) -> None:
        """Create the registry and populate it from the YAML tables."""
        self._by_iso: dict[str, Language] = {}
        self._lookup: dict[str, Language] = {}  # any-form → Language
        self._load()

    @staticmethod
    def _normalize(raw: str) -> str:
        """Return the lookup key for ``raw``: lowercased, whitespace-trimmed.

        Shared by table construction and queries so both sides always agree
        on the key form.
        """
        return raw.lower().strip()

    def _load(self) -> None:
        """Read builtin + learned tables, merge them and index every language.

        Entries that are not mappings are skipped with a warning. On key
        collisions in the flat lookup table, the entry processed last wins.
        """
        builtin = (
            _load_yaml(_BUILTIN_ROOT / "iso_languages.yaml").get("languages", {}) or {}
        )
        learned = (
            _load_yaml(_LEARNED_ROOT / "iso_languages_learned.yaml").get(
                "languages", {}
            )
            or {}
        )
        merged = _merge_language_entries(builtin, learned)

        for iso, entry in merged.items():
            if not isinstance(entry, dict):
                # e.g. "xyz:" with no body — one malformed entry must not
                # prevent every other language from loading.
                logger.warning(
                    "LanguageRegistry: skipping malformed entry for %r", iso
                )
                continue

            language = Language.from_raw(
                iso=iso,
                english_name=entry.get("english_name", iso),
                native_name=entry.get("native_name", iso),
                aliases=tuple(entry.get("aliases", []) or []),
            )
            self._by_iso[language.iso] = language

            # Build the flat lookup table for from_any. Names go through the
            # same normalization as queries; iso and aliases are stored as
            # provided by the Language object.
            self._lookup[language.iso] = language
            self._lookup[self._normalize(language.english_name)] = language
            self._lookup[self._normalize(language.native_name)] = language
            for alias in language.aliases:
                self._lookup[alias] = language

        logger.info("LanguageRegistry: %d languages loaded", len(self._by_iso))

    def from_iso(self, code: str) -> Language | None:
        """Look up by canonical 639-2/T code (case-insensitive).

        Returns ``None`` for unknown codes and for non-string input.
        """
        if not isinstance(code, str):
            return None
        return self._by_iso.get(self._normalize(code))

    def from_any(self, raw: str) -> Language | None:
        """Look up by any known representation.

        Accepts the iso code, 639-1, 639-2/B variant, english name, native
        name, or any registered alias. Case-insensitive; surrounding
        whitespace is ignored. Returns ``None`` for unknown values and for
        non-string input.
        """
        if not isinstance(raw, str):
            return None
        return self._lookup.get(self._normalize(raw))

    def all(self) -> list[Language]:
        """Return all known languages, in load order."""
        return list(self._by_iso.values())

    def __contains__(self, raw: str) -> bool:
        """Return True when ``from_any(raw)`` resolves to a language."""
        return self.from_any(raw) is not None

    def __len__(self) -> int:
        """Return the number of distinct languages (one per iso code)."""
        return len(self._by_iso)