0246f85ef8
The three module-level dicts in enrich_from_probe (ffprobe codec name to scene token, channel count to layout) were exactly the kind of domain lookup table CLAUDE.md says belongs in YAML, not in Python. Move them to alfred/knowledge/release/probe_mappings.yaml, load through a new ReleaseKnowledge.probe_mappings port field, and add a kb parameter to enrich_from_probe so the consumer reads the maps via the same injection pattern as everything else. - New knowledge file: alfred/knowledge/release/probe_mappings.yaml - New loader: load_probe_mappings() in infrastructure/knowledge/release.py (normalizes channel-count keys back to int). - Port: ReleaseKnowledge gains probe_mappings: dict. - Adapter: YamlReleaseKnowledge populates it at __init__. - Consumer: enrich_from_probe(parsed, info, kb) reads the three sub-maps from kb.probe_mappings; unknown codecs still fall back to uppercase raw value, same behaviour as before. - Call sites updated: inspect_release passes kb through; the testing script gets its kb wiring (it was already broken since the ReleaseKnowledge refactor); all 22 enrich_from_probe call sites in tests/application/test_enrich_from_probe.py pass _KB.
234 lines
7.0 KiB
Python
234 lines
7.0 KiB
Python
"""Release knowledge loader.
|
|
|
|
Three-layer merge (lowest → highest priority):
|
|
1. Builtin — alfred/knowledge/release/
|
|
2. Sites — alfred/knowledge/release/sites/*.yaml (all trackers)
|
|
3. Learned — data/knowledge/release/ (user additions via the learn tool)
|
|
|
|
Lists are extended additively, scalars from higher layers win.
|
|
"""
|
|
|
|
from pathlib import Path
|
|
|
|
import yaml
|
|
|
|
import alfred as _alfred_pkg
|
|
|
|
# Builtin knowledge ships inside the installed ``alfred`` package.
_BUILTIN_ROOT = Path(_alfred_pkg.__file__).parent.joinpath("knowledge", "release")
_SITES_ROOT = _BUILTIN_ROOT.joinpath("sites")
_GROUPS_ROOT = _BUILTIN_ROOT.joinpath("release_groups")
# Learned knowledge lives under ``data/`` in the directory that contains the
# package directory (one level above ``alfred/``).
_LEARNED_ROOT = Path(_alfred_pkg.__file__).parent.parent.joinpath(
    "data", "knowledge", "release"
)
_LEARNED_GROUPS_ROOT = _LEARNED_ROOT.joinpath("release_groups")
|
|
|
|
|
|
def _merge(base: dict, overlay: dict) -> dict:
    """Merge *overlay* into a copy of *base* and return the copy.

    Rules, applied per key:

    - list + list: the base list is kept and overlay items not already
      present in it are appended, in overlay order;
    - dict + dict: merged recursively with these same rules, so an overlay
      that adds one entry under a mapping key does not discard the entries
      the lower layer already had there;
    - anything else: the overlay value replaces the base value.

    Neither argument is mutated. Values that are not merged are shared by
    reference with the inputs.
    """
    result = dict(base)
    for key, val in overlay.items():
        current = result.get(key)
        if isinstance(current, list) and isinstance(val, list):
            # Membership test (not a set) so unhashable items keep working.
            result[key] = current + [v for v in val if v not in current]
        elif isinstance(current, dict) and isinstance(val, dict):
            result[key] = _merge(current, val)
        else:
            result[key] = val
    return result
|
|
|
|
|
|
def _read(path: Path) -> dict:
    """Parse the YAML file at *path* and return its top-level mapping.

    Returns an empty dict when the file does not exist, or when the
    document is empty (or any other falsy value).

    Raises:
        ValueError: if the document's top level is something other than a
            mapping (for example a bare list or string). Without this check
            the value would leak out despite the ``dict`` return annotation
            and fail later, far from the offending file.
    """
    try:
        with open(path, encoding="utf-8") as f:
            loaded = yaml.safe_load(f) or {}
    except FileNotFoundError:
        # Every layer is optional: an absent file contributes nothing.
        return {}
    if not isinstance(loaded, dict):
        raise ValueError(
            f"{path}: expected a YAML mapping at the top level, "
            f"got {type(loaded).__name__}"
        )
    return loaded
|
|
|
|
|
|
def _load(filename: str) -> dict:
    """Read *filename* from the builtin root and overlay the learned copy.

    Both reads go through ``_read``, so a file missing from either root
    contributes an empty dict. The two results are combined with ``_merge``,
    the learned layer taking priority.
    """
    builtin_layer = _read(_BUILTIN_ROOT / filename)
    learned_layer = _read(_LEARNED_ROOT / filename)
    return _merge(builtin_layer, learned_layer)
|
|
|
|
|
|
def _load_sites() -> dict:
    """Fold every ``*.yaml`` file under the sites root into one dict.

    Files are visited in sorted path order and combined with ``_merge``, so
    a later file takes priority over an earlier one.
    """
    combined: dict = {}
    site_files = sorted(_SITES_ROOT.glob("*.yaml"))
    for yaml_path in site_files:
        combined = _merge(combined, _read(yaml_path))
    return combined
|
|
|
|
|
|
def load_resolutions() -> set[str]:
    """Return the entries under ``resolutions`` in ``resolutions.yaml`` as a set.

    A key that is present but has no value loads from YAML as ``None``; it
    is treated as an empty list so the result is an empty set rather than a
    ``TypeError``.
    """
    return set(_load("resolutions.yaml").get("resolutions") or [])
|
|
|
|
|
|
def load_sources() -> set[str]:
    """Return the entries under ``sources`` in ``sources.yaml`` as a set.

    A key that is present but has no value loads from YAML as ``None``; it
    is treated as an empty list so the result is an empty set rather than a
    ``TypeError``.
    """
    return set(_load("sources.yaml").get("sources") or [])
|
|
|
|
|
|
def load_distributors() -> set[str]:
    """Streaming distributor tokens (NF, AMZN, DSNP, …).

    Distinct from ``load_sources()`` — distributors are uppercase scene
    tags identifying the platform, not the capture origin.

    Tokens are upper-cased on load. A ``distributors`` key with no value
    (YAML null) is treated as an empty list instead of raising ``TypeError``.
    """
    tokens = _load("distributors.yaml").get("distributors") or []
    return {t.upper() for t in tokens}
|
|
|
|
|
|
def load_codecs() -> set[str]:
    """Return the entries under ``codecs`` in ``codecs.yaml`` as a set.

    A key that is present but has no value loads from YAML as ``None``; it
    is treated as an empty list so the result is an empty set rather than a
    ``TypeError``.
    """
    return set(_load("codecs.yaml").get("codecs") or [])
|
|
|
|
|
|
def load_win_forbidden_chars() -> list[str]:
    """Return the ``win_forbidden_chars`` list from ``filesystem.yaml``.

    Order is preserved as loaded. A missing key, or a key with no value
    (YAML null), yields an empty list so callers always receive a list as
    the annotation promises.
    """
    return _load("filesystem.yaml").get("win_forbidden_chars") or []
|
|
|
|
|
|
def load_video_extensions() -> set[str]:
    """Return the entries under ``video`` in ``file_extensions.yaml`` as a set.

    A key that is present but has no value loads from YAML as ``None``; it
    is treated as an empty list so the result is an empty set rather than a
    ``TypeError``.
    """
    return set(_load("file_extensions.yaml").get("video") or [])
|
|
|
|
|
|
def load_non_video_extensions() -> set[str]:
    """Return the entries under ``non_video`` in ``file_extensions.yaml`` as a set.

    A key that is present but has no value loads from YAML as ``None``; it
    is treated as an empty list so the result is an empty set rather than a
    ``TypeError``.
    """
    return set(_load("file_extensions.yaml").get("non_video") or [])
|
|
|
|
|
|
def load_metadata_extensions() -> set[str]:
    """Return the entries under ``metadata`` in ``file_extensions.yaml`` as a set.

    A key that is present but has no value loads from YAML as ``None``; it
    is treated as an empty list so the result is an empty set rather than a
    ``TypeError``.
    """
    return set(_load("file_extensions.yaml").get("metadata") or [])
|
|
|
|
|
|
def load_subtitle_extensions() -> set[str]:
    """Return the entries under ``subtitle`` in ``file_extensions.yaml`` as a set.

    A key that is present but has no value loads from YAML as ``None``; it
    is treated as an empty list so the result is an empty set rather than a
    ``TypeError``.
    """
    return set(_load("file_extensions.yaml").get("subtitle") or [])
|
|
|
|
|
|
def load_forbidden_chars() -> set[str]:
    """Return the entries under ``forbidden_chars`` in ``release_format.yaml`` as a set.

    A key that is present but has no value loads from YAML as ``None``; it
    is treated as an empty list so the result is an empty set rather than a
    ``TypeError``.
    """
    return set(_load("release_format.yaml").get("forbidden_chars") or [])
|
|
|
|
|
|
def load_language_tokens() -> set[str]:
    """Return the union of language tokens from ``languages.yaml`` and the site files.

    The base list is read from the ``tokens`` key of ``languages.yaml``; the
    site list from the ``languages`` key of the merged site files. Every
    token is upper-cased. A key with no value (YAML null) on either side is
    treated as an empty list instead of raising ``TypeError``.
    """
    base_tokens = _load("languages.yaml").get("tokens") or []
    site_tokens = _load_sites().get("languages") or []
    return {t.upper() for t in base_tokens} | {t.upper() for t in site_tokens}
|
|
|
|
|
|
def load_audio() -> dict:
    """Return the merged contents of ``audio.yaml`` as loaded by ``_load``."""
    audio_knowledge = _load("audio.yaml")
    return audio_knowledge
|
|
|
|
|
|
def load_video() -> dict:
    """Return the merged contents of ``video.yaml`` as loaded by ``_load``."""
    video_knowledge = _load("video.yaml")
    return video_knowledge
|
|
|
|
|
|
def load_editions() -> dict:
    """Return ``editions.yaml`` with site-provided edition tokens appended.

    Site tokens are read from ``editions.tokens`` in the merged site files
    and appended to the base ``tokens`` list, skipping any already present.
    When there are no site tokens the base dict is returned untouched.

    Null sections are tolerated: an ``editions`` or ``tokens`` key with no
    value (YAML null) on either side is treated as empty, where it would
    otherwise raise ``AttributeError`` / ``TypeError``.
    """
    editions = _load("editions.yaml")
    site_editions = _load_sites().get("editions") or {}
    site_tokens = site_editions.get("tokens") or []
    if site_tokens:
        known = editions.get("tokens") or []
        editions["tokens"] = known + [t for t in site_tokens if t not in known]
    return editions
|
|
|
|
|
|
def load_sources_extra() -> set[str]:
    """Additional source tokens from site files.

    Read from the ``sources`` key of the merged site files. A key with no
    value (YAML null) is treated as an empty list instead of raising
    ``TypeError``.
    """
    return set(_load_sites().get("sources") or [])
|
|
|
|
|
|
def load_hdr_extra() -> set[str]:
    """Additional HDR tokens from site files.

    Read from the ``hdr`` key of the merged site files and upper-cased. A
    key with no value (YAML null) is treated as an empty list instead of
    raising ``TypeError``.
    """
    hdr_tokens = _load_sites().get("hdr") or []
    return {t.upper() for t in hdr_tokens}
|
|
|
|
|
|
def load_media_type_tokens() -> dict:
    """Site-specific media type tokens (doc, concert, collection, integrale).

    Read from the ``media_type_tokens`` key of the merged site files. A
    missing key, or a key with no value (YAML null), yields an empty dict so
    callers always receive a dict as the annotation promises.
    """
    return _load_sites().get("media_type_tokens") or {}
|
|
|
|
|
|
def load_group_schemas() -> dict:
    """Load every release-group schema YAML keyed by uppercase group name.

    Builtin schemas in ``alfred/knowledge/release/release_groups/`` are
    merged with user-learned schemas in
    ``data/knowledge/release/release_groups/`` (the learned ones win on
    name collision).

    Files without a truthy ``name`` field are skipped. The name is coerced
    to ``str`` before upper-casing: YAML parses an unquoted all-digit name
    as an int, which would otherwise raise ``AttributeError`` and abort the
    whole load.
    """
    schemas: dict = {}
    # Builtin root first, learned root second: a later assignment to the
    # same key is what lets a learned schema replace a builtin one.
    for root in (_GROUPS_ROOT, _LEARNED_GROUPS_ROOT):
        if not root.is_dir():
            continue
        for path in sorted(root.glob("*.yaml")):
            data = _read(path)
            name = data.get("name")
            if not name:
                continue
            schemas[str(name).upper()] = data
    return schemas
|
|
|
|
|
|
def load_scoring() -> dict:
    """Build the parse-scoring configuration.

    The result always has the three sections ``weights``, ``penalties`` and
    ``thresholds``. Each section starts from the hard-coded defaults below
    and is then updated with whatever ``scoring.yaml`` provides for it, so
    an absent or partial YAML still yields a complete config.
    """
    raw = _load("scoring.yaml")
    defaults = {
        "weights": {
            "title": 30,
            "media_type": 20,
            "year": 15,
            "season": 10,
            "episode": 5,
            "resolution": 5,
            "source": 5,
            "codec": 5,
            "group": 5,
        },
        "penalties": {"unknown_token": 5, "max_unknown_penalty": 30},
        "thresholds": {"shitty_min": 60},
    }
    config: dict = {}
    for section, baseline in defaults.items():
        values = dict(baseline)
        # ``or {}`` covers a section that is present but empty in YAML.
        values.update(raw.get(section, {}) or {})
        config[section] = values
    return config
|
|
|
|
|
|
def load_probe_mappings() -> dict:
    """Load ffprobe→scene-token translation tables.

    Returns a dict with three keys:

    - ``video_codec``: ``{ffprobe_codec_lower: scene_token}``
    - ``audio_codec``: ``{ffprobe_codec_lower: scene_token}``
    - ``audio_channels``: ``{channel_count_int: layout_str}``

    Channel-count keys are normalized to ``int`` here so the consumer can
    look up ``track.channels`` directly. Missing sections fall back to
    empty dicts — the enrichment code degrades to its uppercase-fallback
    path when a mapping is absent.

    Malformed keys are skipped rather than guessed at: codec keys that are
    not strings, and channel keys that are booleans, fractional floats, or
    otherwise not convertible to ``int``.
    """
    raw = _load("probe_mappings.yaml")

    def _codec_table(section: str) -> dict:
        # Lower-case string keys only; a non-string key has no ``.lower()``.
        entries = raw.get(section) or {}
        return {k.lower(): v for k, v in entries.items() if isinstance(k, str)}

    audio_channels: dict[int, str] = {}
    for k, v in (raw.get("audio_channels") or {}).items():
        # ``bool`` is an ``int`` subclass and ``int()`` truncates floats, so
        # ``True`` would become 1 and ``5.1`` would become 5 — both would
        # map a real channel count to the wrong layout.
        if isinstance(k, bool) or (isinstance(k, float) and not k.is_integer()):
            continue
        try:
            audio_channels[int(k)] = v
        except (TypeError, ValueError):
            continue
    return {
        "video_codec": _codec_table("video_codec"),
        "audio_codec": _codec_table("audio_codec"),
        "audio_channels": audio_channels,
    }
|
|
|
|
|
|
def load_separators() -> list[str]:
    """Return the single-character separators for the release-name tokenizer.

    The values come from the ``separators`` key of ``separators.yaml``. The
    canonical "." is prepended whenever the configured value lacks it, so a
    misconfigured file cannot leave the parser without its main separator.
    """
    configured = _load("separators.yaml").get("separators", []) or []
    return configured if "." in configured else [".", *configured]
|