Files
alfred/alfred/infrastructure/knowledge/release.py
T
francwa 0246f85ef8 refactor(release): move codec mappings from code to YAML knowledge
The three module-level dicts in enrich_from_probe (ffprobe codec name
to scene token, channel count to layout) were exactly the kind of
domain lookup table CLAUDE.md says belongs in YAML, not in Python.
Move them to alfred/knowledge/release/probe_mappings.yaml, load
through a new ReleaseKnowledge.probe_mappings port field, and add a
kb parameter to enrich_from_probe so the consumer reads the maps via
the same injection pattern as everything else.

- New knowledge file: alfred/knowledge/release/probe_mappings.yaml
- New loader: load_probe_mappings() in infrastructure/knowledge/release.py
  (normalizes channel-count keys back to int).
- Port: ReleaseKnowledge gains probe_mappings: dict.
- Adapter: YamlReleaseKnowledge populates it at __init__.
- Consumer: enrich_from_probe(parsed, info, kb) reads the three sub-maps
  from kb.probe_mappings; unknown codecs still fall back to uppercase
  raw value, same behaviour as before.
- Call sites updated: inspect_release passes kb through; the testing
  script gets its kb wiring (it was already broken since the
  ReleaseKnowledge refactor); all 22 enrich_from_probe call sites in
  tests/application/test_enrich_from_probe.py pass _KB.
2026-05-21 07:37:42 +02:00

234 lines
7.0 KiB
Python

"""Release knowledge loader.
Three-layer merge (lowest → highest priority):
1. Builtin — alfred/knowledge/release/
2. Sites — alfred/knowledge/release/sites/*.yaml (all trackers)
3. Learned — data/knowledge/release/ (user additions via the learn tool)
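Lists are extended additively; scalars from higher layers win.
``_load()`` merges the builtin and learned layers for a given file; site
files are merged with each other by ``_load_sites()`` and consumed by the
individual ``load_*`` functions that need them.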
"""
from pathlib import Path
import yaml
import alfred as _alfred_pkg
# Builtin knowledge directory, resolved relative to the imported ``alfred``
# package: <alfred package dir>/knowledge/release
_BUILTIN_ROOT = Path(_alfred_pkg.__file__).parent / "knowledge" / "release"
# Per-site YAML files live in the ``sites`` subdirectory of the builtin root.
_SITES_ROOT = _BUILTIN_ROOT / "sites"
# Release-group schema YAML files live in ``release_groups`` under the builtin root.
_GROUPS_ROOT = _BUILTIN_ROOT / "release_groups"
# Learned knowledge sits outside the package, one level above the package
# directory: <parent of alfred package dir>/data/knowledge/release
_LEARNED_ROOT = (
Path(_alfred_pkg.__file__).parent.parent / "data" / "knowledge" / "release"
)
# Learned release-group schemas use the same ``release_groups`` subdirectory name.
_LEARNED_GROUPS_ROOT = _LEARNED_ROOT / "release_groups"
def _merge(base: dict, overlay: dict) -> dict:
    """Return a new dict combining ``base`` with ``overlay``.

    When both sides hold a list under the same key, the overlay items that
    are not already in the base list are appended to it. In every other
    case the overlay value replaces the base value. ``base`` itself is not
    modified; only a shallow copy is made.
    """
    merged = dict(base)
    for name, incoming in overlay.items():
        current = merged.get(name)
        if isinstance(current, list) and isinstance(incoming, list):
            # De-duplication is against the base list only, so repeats inside
            # the overlay itself are kept.
            additions = [item for item in incoming if item not in current]
            merged[name] = current + additions
        else:
            merged[name] = incoming
    return merged
def _read(path: Path) -> dict:
    """Parse the YAML file at ``path`` with ``yaml.safe_load``.

    Returns an empty dict when the file does not exist or when the parsed
    document is falsy (for example an empty file).
    """
    try:
        with open(path, encoding="utf-8") as stream:
            parsed = yaml.safe_load(stream)
    except FileNotFoundError:
        return {}
    return parsed or {}
def _load(filename: str) -> dict:
    """Load ``filename`` from the builtin root and overlay the learned copy.

    The learned file (if any) is merged on top of the builtin one with
    ``_merge``, so learned lists extend builtin lists and other learned
    values replace builtin ones.
    """
    builtin = _read(_BUILTIN_ROOT / filename)
    learned = _read(_LEARNED_ROOT / filename)
    return _merge(builtin, learned)
def _load_sites() -> dict:
    """Fold every ``*.yaml`` file under the sites root into one dict.

    Files are processed in sorted path order and combined with ``_merge``.
    """
    combined: dict = {}
    site_files = sorted(_SITES_ROOT.glob("*.yaml"))
    for site_path in site_files:
        combined = _merge(combined, _read(site_path))
    return combined
def load_resolutions() -> set[str]:
    """Return the ``resolutions`` entries of ``resolutions.yaml`` as a set."""
    entries = _load("resolutions.yaml").get("resolutions", [])
    return set(entries)
def load_sources() -> set[str]:
    """Return the ``sources`` entries of ``sources.yaml`` as a set."""
    entries = _load("sources.yaml").get("sources", [])
    return set(entries)
def load_distributors() -> set[str]:
    """Return the streaming distributor tokens (NF, AMZN, DSNP, …), uppercased.

    These are kept separate from ``load_sources()``: a distributor token
    names the platform, whereas a source names the capture origin.
    """
    tokens = _load("distributors.yaml").get("distributors", [])
    return {token.upper() for token in tokens}
def load_codecs() -> set[str]:
    """Return the ``codecs`` entries of ``codecs.yaml`` as a set."""
    entries = _load("codecs.yaml").get("codecs", [])
    return set(entries)
def load_win_forbidden_chars() -> list[str]:
    """Return the ``win_forbidden_chars`` list from ``filesystem.yaml``.

    Returns:
        The configured list, or an empty list when the key is missing or is
        present without a value.
    """
    # ``dict.get(key, [])`` only falls back when the key is absent. A key that
    # exists with a null value would hand ``None`` to callers expecting a
    # list, so falsy values are normalised the same way ``load_separators``
    # does it.
    return _load("filesystem.yaml").get("win_forbidden_chars") or []
def load_video_extensions() -> set[str]:
    """Return the ``video`` entries of ``file_extensions.yaml`` as a set."""
    extensions = _load("file_extensions.yaml").get("video", [])
    return set(extensions)
def load_non_video_extensions() -> set[str]:
    """Return the ``non_video`` entries of ``file_extensions.yaml`` as a set."""
    extensions = _load("file_extensions.yaml").get("non_video", [])
    return set(extensions)
def load_metadata_extensions() -> set[str]:
    """Return the ``metadata`` entries of ``file_extensions.yaml`` as a set."""
    extensions = _load("file_extensions.yaml").get("metadata", [])
    return set(extensions)
def load_subtitle_extensions() -> set[str]:
    """Return the ``subtitle`` entries of ``file_extensions.yaml`` as a set."""
    extensions = _load("file_extensions.yaml").get("subtitle", [])
    return set(extensions)
def load_forbidden_chars() -> set[str]:
    """Return the ``forbidden_chars`` entries of ``release_format.yaml`` as a set."""
    chars = _load("release_format.yaml").get("forbidden_chars", [])
    return set(chars)
def load_language_tokens() -> set[str]:
    """Return every language token, uppercased.

    Combines the ``tokens`` list of ``languages.yaml`` with the ``languages``
    list found in the merged site files.
    """
    tokens = {tok.upper() for tok in _load("languages.yaml").get("tokens", [])}
    tokens.update(tok.upper() for tok in _load_sites().get("languages", []))
    return tokens
def load_audio() -> dict:
    """Return the full content of ``audio.yaml`` (builtin overlaid with learned)."""
    audio = _load("audio.yaml")
    return audio
def load_video() -> dict:
    """Return the full content of ``video.yaml`` (builtin overlaid with learned)."""
    video = _load("video.yaml")
    return video
def load_editions() -> dict:
    """Return the editions knowledge with site-specific tokens appended.

    Starts from ``editions.yaml`` and extends its ``tokens`` list with any
    ``editions.tokens`` entries from the merged site files that are not
    already present.

    Returns:
        The editions dict. Its ``tokens`` entry is only rewritten when the
        site files contribute at least one token.
    """
    editions = _load("editions.yaml")
    # ``.get(key, default)`` does not cover a key that exists with a null
    # value, so fall back with ``or`` to avoid calling ``.get`` on ``None``.
    site_section = _load_sites().get("editions") or {}
    site_tokens = site_section.get("tokens") or []
    if site_tokens:
        # Same guard for the builtin list: a null ``tokens`` would otherwise
        # fail on list concatenation.
        known = editions.get("tokens") or []
        editions["tokens"] = known + [tok for tok in site_tokens if tok not in known]
    return editions
def load_sources_extra() -> set[str]:
    """Return the extra source tokens declared under ``sources`` in site files."""
    extra = _load_sites().get("sources", [])
    return set(extra)
def load_hdr_extra() -> set[str]:
    """Return the extra HDR tokens declared under ``hdr`` in site files, uppercased."""
    extra = _load_sites().get("hdr", [])
    return {token.upper() for token in extra}
def load_media_type_tokens() -> dict:
    """Return site-specific media type tokens (doc, concert, collection, integrale).

    Returns:
        The ``media_type_tokens`` mapping from the merged site files, or an
        empty dict when the key is missing or is present without a value.
    """
    # ``.get(key, {})`` would pass a null-valued key through as ``None``;
    # ``or {}`` keeps the return value a dict in that case too.
    return _load_sites().get("media_type_tokens") or {}
def load_group_schemas() -> dict:
    """Collect release-group schemas, keyed by the uppercased ``name`` field.

    The builtin ``release_groups`` directory is scanned first, then the
    learned one, so a learned schema replaces a builtin schema with the same
    name. Directories that do not exist are skipped, as are files whose
    ``name`` is missing or empty.
    """
    schemas: dict = {}
    for directory in (_GROUPS_ROOT, _LEARNED_GROUPS_ROOT):
        schema_files = sorted(directory.glob("*.yaml")) if directory.is_dir() else []
        for schema_file in schema_files:
            schema = _read(schema_file)
            group = schema.get("name")
            if group:
                schemas[group.upper()] = schema
    return schemas
def load_scoring() -> dict:
    """Build the parse-scoring configuration from ``scoring.yaml``.

    The result always has the keys ``weights``, ``penalties`` and
    ``thresholds``. Each section starts from the defaults below and is then
    updated with whatever the YAML provides, so a missing or partial file
    leaves the remaining defaults in place.
    """
    raw = _load("scoring.yaml")
    defaults = {
        "weights": {
            "title": 30,
            "media_type": 20,
            "year": 15,
            "season": 10,
            "episode": 5,
            "resolution": 5,
            "source": 5,
            "codec": 5,
            "group": 5,
        },
        "penalties": {"unknown_token": 5, "max_unknown_penalty": 30},
        "thresholds": {"shitty_min": 60},
    }
    scoring: dict = {}
    for section, baked_in in defaults.items():
        values = dict(baked_in)
        # A section that is absent or null in the YAML contributes nothing.
        values.update(raw.get(section, {}) or {})
        scoring[section] = values
    return scoring
def load_probe_mappings() -> dict:
    """Load the ffprobe→scene-token translation tables from ``probe_mappings.yaml``.

    The returned dict has three keys:

    - ``video_codec``: mapping with lowercased keys
    - ``audio_codec``: mapping with lowercased keys
    - ``audio_channels``: mapping whose keys are converted to ``int``

    A section that is missing or null yields an empty dict. Channel entries
    whose key cannot be converted with ``int()`` are dropped.
    """
    raw = _load("probe_mappings.yaml")

    def _lowercased(section: str) -> dict:
        # Lowercase the keys of one codec section; absent/null → empty dict.
        entries = raw.get(section) or {}
        return {name.lower(): token for name, token in entries.items()}

    video = _lowercased("video_codec")
    audio = _lowercased("audio_codec")

    channels: dict[int, str] = {}
    for count, layout in (raw.get("audio_channels") or {}).items():
        try:
            channel_count = int(count)
        except (TypeError, ValueError):
            continue
        channels[channel_count] = layout

    return {
        "video_codec": video,
        "audio_codec": audio,
        "audio_channels": channels,
    }
def load_separators() -> list[str]:
    """Return the ``separators`` list from ``separators.yaml``.

    The canonical "." is prepended when the configured list lacks it, so a
    misconfigured file cannot leave the tokenizer without it. A missing or
    null key is treated as an empty list.
    """
    configured = _load("separators.yaml").get("separators", []) or []
    return configured if "." in configured else [".", *configured]