5bcf22b408
object.__setattr__ inside __post_init__ on a frozen dataclass is a code smell — it bypasses the immutability guarantee to mutate fields mid-construction. Split the responsibilities: * Direct constructor is strict — rejects un-normalized input (uppercase iso, whitespace in aliases, etc.) so once a Language exists in the system, its fields are guaranteed canonical. * Language.from_raw() factory handles arbitrary YAML/user input — it lowercases the iso, dedups/normalizes aliases, then constructs. Only caller that built from raw data (LanguageRegistry loading YAML) moves to from_raw(). Test fixtures already pass normalized data so they keep using the direct constructor.
130 lines
4.4 KiB
Python
130 lines
4.4 KiB
Python
"""LanguageRegistry — loads and queries the canonical language table from YAML.

Builtin entries live in ``alfred/knowledge/iso_languages.yaml`` (versioned).
Learned entries can be added to ``data/knowledge/iso_languages_learned.yaml``
(gitignored, instance-local) and are merged additively — they extend builtin
languages or add new ones, never remove builtin entries.
"""
|
|
|
|
import logging
|
|
from pathlib import Path
|
|
|
|
import yaml
|
|
|
|
import alfred as _alfred_pkg
|
|
|
|
from alfred.domain.shared.value_objects import Language
|
|
|
|
logger = logging.getLogger(__name__)

# Builtin (versioned) knowledge files: the ``knowledge`` directory inside the
# ``alfred`` package itself.
_BUILTIN_ROOT = Path(_alfred_pkg.__file__).parent.joinpath("knowledge")
# Learned (instance-local) knowledge files: ``data/knowledge`` next to the
# package directory, i.e. one level above it.
_LEARNED_ROOT = Path(_alfred_pkg.__file__).parent.parent.joinpath("data", "knowledge")
|
|
|
|
|
|
def _load_yaml(path: Path) -> dict:
|
|
try:
|
|
with open(path, encoding="utf-8") as f:
|
|
return yaml.safe_load(f) or {}
|
|
except FileNotFoundError:
|
|
return {}
|
|
except Exception as e:
|
|
logger.warning(f"LanguageRegistry: could not load {path}: {e}")
|
|
return {}
|
|
|
|
|
|
def _merge_language_entries(base: dict, override: dict) -> dict:
|
|
"""
|
|
Merge learned language entries into builtin entries.
|
|
|
|
For each language iso, aliases lists are extended (deduped, order preserved);
|
|
scalar fields in override win over base.
|
|
"""
|
|
result = dict(base)
|
|
for iso, override_entry in override.items():
|
|
if iso not in result:
|
|
result[iso] = override_entry
|
|
continue
|
|
merged = dict(result[iso])
|
|
for key, val in override_entry.items():
|
|
if key == "aliases" and isinstance(val, list):
|
|
existing = merged.get("aliases", []) or []
|
|
merged["aliases"] = existing + [v for v in val if v not in existing]
|
|
else:
|
|
merged[key] = val
|
|
result[iso] = merged
|
|
return result
|
|
|
|
|
|
class LanguageRegistry:
    """
    Loads the canonical language table and provides lookup methods.

    Two indexes are built at construction time: one keyed by each language's
    ``iso`` attribute, and a flat one keyed by every known spelling (iso,
    english name, native name, aliases). Lookups are case-insensitive and
    ignore surrounding whitespace.

    Usage::

        registry = LanguageRegistry()
        fr = registry.from_iso("fra")
        fr2 = registry.from_any("French")   # → same Language as `fr`
        fr3 = registry.from_any("fr")       # → same Language
        fr4 = registry.from_any("vostfr")   # → None (vostfr is subtitle-specific,
                                            #   lives in subtitles knowledge)
    """

    def __init__(self) -> None:
        self._by_iso: dict[str, Language] = {}
        self._lookup: dict[str, Language] = {}  # any-form → Language
        self._load()

    @staticmethod
    def _key(text: str) -> str:
        """Return the normalized form used for both index keys and queries."""
        return text.lower().strip()

    def _load(self) -> None:
        """
        Populate both indexes from the builtin table merged with learned entries.

        When two languages share a lookup key, the one loaded later wins.
        """
        builtin = (
            _load_yaml(_BUILTIN_ROOT / "iso_languages.yaml").get("languages") or {}
        )
        learned = (
            _load_yaml(_LEARNED_ROOT / "iso_languages_learned.yaml").get("languages")
            or {}
        )
        merged = _merge_language_entries(builtin, learned)

        for iso, entry in merged.items():
            # A key with no value in YAML (``fra:``) parses as None.
            entry = entry or {}
            language = Language.from_raw(
                iso=iso,
                # ``or iso`` also covers a key that is present but null/empty,
                # which a plain ``.get(key, iso)`` default would let through.
                english_name=entry.get("english_name") or iso,
                native_name=entry.get("native_name") or iso,
                aliases=tuple(entry.get("aliases") or ()),
            )
            self._by_iso[language.iso] = language

            # Build the flat lookup table for from_any. Names go through the
            # same normalization as queries so they can always be matched.
            self._lookup[language.iso] = language
            self._lookup[self._key(language.english_name)] = language
            self._lookup[self._key(language.native_name)] = language
            # Aliases are used as stored on the Language (presumably already
            # normalized by Language.from_raw — confirm against that factory).
            for alias in language.aliases:
                self._lookup[alias] = language

        logger.info("LanguageRegistry: %d languages loaded", len(self._by_iso))

    def from_iso(self, code: str) -> Language | None:
        """Look up by canonical 639-2/T code (case-insensitive)."""
        if not isinstance(code, str):
            return None
        return self._by_iso.get(self._key(code))

    def from_any(self, raw: str) -> Language | None:
        """
        Look up by any known representation: iso code, 639-1, 639-2/B variant,
        english name, native name, or any registered alias. Case-insensitive.

        Returns None for unknown values and for non-string input.
        """
        if not isinstance(raw, str):
            return None
        return self._lookup.get(self._key(raw))

    def all(self) -> list[Language]:
        """Return all known languages, in load order."""
        return list(self._by_iso.values())

    def __contains__(self, raw: str) -> bool:
        """Return True when ``raw`` resolves to a language via from_any."""
        return self.from_any(raw) is not None

    def __len__(self) -> int:
        """Return the number of languages in the iso index."""
        return len(self._by_iso)
|