refactor(release): move codec mappings from code to YAML knowledge

The three module-level dicts in enrich_from_probe (ffprobe codec name to scene token, channel count to layout) were exactly the kind of domain lookup table CLAUDE.md says belongs in YAML, not in Python. Move them to alfred/knowledge/release/probe_mappings.yaml, load through a new ReleaseKnowledge.probe_mappings port field, and add a kb parameter to enrich_from_probe so the consumer reads the maps via the same injection pattern as everything else. - New knowledge file: alfred/knowledge/release/probe_mappings.yaml - New loader: load_probe_mappings() in infrastructure/knowledge/release.py (normalizes channel-count keys back to int). - Port: ReleaseKnowledge gains probe_mappings: dict. - Adapter: YamlReleaseKnowledge populates it at __init__. - Consumer: enrich_from_probe(parsed, info, kb) reads the three sub-maps from kb.probe_mappings; unknown codecs still fall back to uppercase raw value, same behaviour as before. - Call sites updated: inspect_release passes kb through; the testing script gets its kb wiring (it was already broken since the ReleaseKnowledge refactor); all 22 enrich_from_probe call sites in tests/application/test_enrich_from_probe.py pass _KB.
2026-05-21 07:37:42 +02:00
parent e62dc90bd1
commit 0246f85ef8
9 changed files with 154 additions and 64 deletions
@@ -57,6 +57,18 @@ callers).
 ### Changed
 - **`enrich_from_probe` codec mappings moved to YAML.** The three
  hard-coded module dicts (`_VIDEO_CODEC_MAP`, `_AUDIO_CODEC_MAP`,
  `_CHANNEL_MAP`) translating ffprobe output to scene tokens
  (`hevc → x265`, `eac3 → EAC3`, `8 → "7.1"`, …) now live in
  `alfred/knowledge/release/probe_mappings.yaml` and are loaded into
  `ReleaseKnowledge.probe_mappings` (new port field, populated by
  `YamlReleaseKnowledge`). `enrich_from_probe` gains a third `kb`
  parameter and reads the maps from there. Aligns with the CLAUDE.md
  rule that lookup tables of domain knowledge belong in YAML, not in
  Python — and opens the door to a future "learn new codec" pass.
  Callers updated: `inspect_release`, `testing/recognize_folders_in_downloads.py`,
  and all 22 sites in `tests/application/test_enrich_from_probe.py`.
 - **`ParsedRelease.tech_string` is now a derived `@property`**
  (`alfred/domain/release/value_objects.py`). It computes
  `quality.source.codec` joined by dots on every access, so it stays in
@@ -2,55 +2,36 @@
 from __future__ import annotations
 from alfred.domain.release.ports import ReleaseKnowledge
 from alfred.domain.release.value_objects import ParsedRelease
 from alfred.domain.shared.media import MediaInfo
 # Map ffprobe codec names to scene-style codec tokens
 _VIDEO_CODEC_MAP = {
    "hevc": "x265",
    "h264": "x264",
    "h265": "x265",
    "av1": "AV1",
    "vp9": "VP9",
    "mpeg4": "XviD",
 }
-# Map ffprobe audio codec names to scene-style tokens
+def enrich_from_probe(
-_AUDIO_CODEC_MAP = {
+    parsed: ParsedRelease, info: MediaInfo, kb: ReleaseKnowledge
-    "eac3": "EAC3",
+) -> None:
    "ac3": "AC3",
    "dts": "DTS",
    "truehd": "TrueHD",
    "aac": "AAC",
    "flac": "FLAC",
    "opus": "OPUS",
    "mp3": "MP3",
    "pcm_s16l": "PCM",
    "pcm_s24l": "PCM",
 }
 # Map channel count to standard layout string
 _CHANNEL_MAP = {
    8: "7.1",
    6: "5.1",
    2: "2.0",
    1: "1.0",
 }
 def enrich_from_probe(parsed: ParsedRelease, info: MediaInfo) -> None:
    """
    Fill None fields in parsed using data from ffprobe MediaInfo.
    Only overwrites fields that are currently None — token-level values
-    from the release name always take priority.
+    from the release name always take priority. Mutates parsed in place.
-    Mutates parsed in place.
+
    Translation tables (ffprobe codec name → scene token, channel count
    → layout) live in ``kb.probe_mappings`` (loaded from
    ``alfred/knowledge/release/probe_mappings.yaml``). When ffprobe
    reports a value with no mapping entry, the fallback is the uppercase
    raw value so unknown codecs still surface in a predictable form.
    """
    mappings = kb.probe_mappings
    video_codec_map: dict[str, str] = mappings.get("video_codec", {})
    audio_codec_map: dict[str, str] = mappings.get("audio_codec", {})
    channel_map: dict[int, str] = mappings.get("audio_channels", {})
    if parsed.quality is None and info.resolution:
        parsed.quality = info.resolution
    if parsed.codec is None and info.video_codec:
-        parsed.codec = _VIDEO_CODEC_MAP.get(
+        parsed.codec = video_codec_map.get(
            info.video_codec.lower(), info.video_codec.upper()
        )
@@ -64,12 +45,12 @@ def enrich_from_probe(parsed: ParsedRelease, info: MediaInfo) -> None:
    if track:
        if parsed.audio_codec is None and track.codec:
-            parsed.audio_codec = _AUDIO_CODEC_MAP.get(
+            parsed.audio_codec = audio_codec_map.get(
                track.codec.lower(), track.codec.upper()
            )
        if parsed.audio_channels is None and track.channels:
-            parsed.audio_channels = _CHANNEL_MAP.get(
+            parsed.audio_channels = channel_map.get(
                track.channels, f"{track.channels}ch"
            )
@@ -127,7 +127,7 @@ def inspect_release(
    if main_video is not None and parsed.media_type not in _NON_PROBABLE_MEDIA_TYPES:
        media_info = prober.probe(main_video)
        if media_info is not None:
-            enrich_from_probe(parsed, media_info)
+            enrich_from_probe(parsed, media_info, kb)
            probe_used = True
    return InspectedResult(
@@ -52,6 +52,18 @@ class ReleaseKnowledge(Protocol):
    scoring: dict
    # --- ffprobe → scene-token translation tables (consumed by
    #     ``application.release.enrich_from_probe``). Domain parsing itself
    #     doesn't touch these — exposed on the same KB to keep release
    #     knowledge in a single ownership point.
    #
    #     Shape:
    #       - ``video_codec``:    dict[str, str]   ffprobe lower → scene token
    #       - ``audio_codec``:    dict[str, str]   ffprobe lower → scene token
    #       - ``audio_channels``: dict[int, str]   channel count → layout ---
    probe_mappings: dict
    # --- File-extension sets (used by application/infra modules that work
    #     directly with filesystem paths, e.g. media-type detection, video
    #     lookup). Domain parsing itself doesn't touch these. ---
@@ -191,6 +191,36 @@ def load_scoring() -> dict:
    }
 def load_probe_mappings() -> dict:
    """Load ffprobe→scene-token translation tables.
    Returns a dict with three keys:
      - ``video_codec``: ``{ffprobe_codec_lower: scene_token}``
      - ``audio_codec``: ``{ffprobe_codec_lower: scene_token}``
      - ``audio_channels``: ``{channel_count_int: layout_str}``
    Channel-count keys are normalized to ``int`` here so the consumer can
    look up ``track.channels`` directly. Missing sections fall back to
    empty dicts — the enrichment code degrades to its uppercase-fallback
    path when a mapping is absent.
    """
    raw = _load("probe_mappings.yaml")
    video_codec = {k.lower(): v for k, v in (raw.get("video_codec") or {}).items()}
    audio_codec = {k.lower(): v for k, v in (raw.get("audio_codec") or {}).items()}
    audio_channels: dict[int, str] = {}
    for k, v in (raw.get("audio_channels") or {}).items():
        try:
            audio_channels[int(k)] = v
        except (TypeError, ValueError):
            continue
    return {
        "video_codec": video_codec,
        "audio_codec": audio_codec,
        "audio_channels": audio_channels,
    }
 def load_separators() -> list[str]:
    """Single-char token separators used by the release name tokenizer.
@@ -29,6 +29,7 @@ from .release import (
    load_media_type_tokens,
    load_metadata_extensions,
    load_non_video_extensions,
    load_probe_mappings,
    load_resolutions,
    load_scoring,
    load_separators,
@@ -89,6 +90,10 @@ class YamlReleaseKnowledge:
        # Parse-scoring config (weights / penalties / thresholds).
        self.scoring: dict = load_scoring()
        # ffprobe → scene-token mapping tables (consumed by
        # ``application.release.enrich_from_probe``).
        self.probe_mappings: dict = load_probe_mappings()
        # File-extension sets (used by application/infra modules, not by
        # the parser itself — kept here so there is a single ownership
        # point for release knowledge).
@@ -0,0 +1,45 @@
 # Translation table — ffprobe output → scene-style release tokens.
 #
 # Consumed by ``alfred.application.release.enrich_from_probe`` when filling
 # missing ParsedRelease fields from a probed MediaInfo. Token-level values
 # from the release name always win; these mappings only fire when the
 # corresponding ParsedRelease field is None.
 #
 # Lookup is case-insensitive on the key side (ffprobe sometimes emits
 # uppercase, sometimes lowercase). When no key matches, the fallback is
 # ``ffprobe_value.upper()`` so unknown codecs still surface in a
 # predictable form (and signal the gap to a future "learn" pass).
 #
 # Each section is a flat dict — values are the canonical scene tokens
 # Alfred uses everywhere (filename builders, ParsedRelease fields).
 # ffprobe video codec name → scene codec token
 video_codec:
  hevc: x265
  h264: x264
  h265: x265
  av1: AV1
  vp9: VP9
  mpeg4: XviD
 # ffprobe audio codec name → scene audio token
 audio_codec:
  eac3: EAC3
  ac3: AC3
  dts: DTS
  truehd: TrueHD
  aac: AAC
  flac: FLAC
  opus: OPUS
  mp3: MP3
  pcm_s16l: PCM
  pcm_s24l: PCM
 # Channel count (integer) → standard layout string.
 # Keys are strings here because YAML mappings prefer string keys; the
 # loader normalizes them back to int.
 audio_channels:
  "8": "7.1"
  "6": "5.1"
  "2": "2.0"
  "1": "1.0"
@@ -104,8 +104,10 @@ def main() -> None:
    from alfred.application.release.enrich_from_probe import enrich_from_probe
    from alfred.domain.release.services import parse_release
    from alfred.infrastructure.filesystem.find_video import find_video_file
    from alfred.infrastructure.knowledge.release_kb import YamlReleaseKnowledge
    from alfred.infrastructure.probe import FfprobeMediaProber
    _kb = YamlReleaseKnowledge()
    _prober = FfprobeMediaProber()
    entries = sorted(downloads.iterdir(), key=lambda p: p.name.lower())
@@ -123,14 +125,14 @@ def main() -> None:
        name = entry.name
        try:
-            p = parse_release(name)
+            p, _report = parse_release(name, _kb)
-            p.media_type = detect_media_type(p, entry)
+            p.media_type = detect_media_type(p, entry, _kb)
            if p.media_type not in ("unknown", "other"):
                video_file = find_video_file(entry)
                if video_file:
                    media_info = _prober.probe(video_file)
                    if media_info:
-                        enrich_from_probe(p, media_info)
+                        enrich_from_probe(p, media_info, _kb)
            warnings = _assess(p)
        except Exception as e:
            warnings = [f"parse error: {e}"]
@@ -21,6 +21,9 @@ from __future__ import annotations
 from alfred.application.release.enrich_from_probe import enrich_from_probe
 from alfred.domain.release.value_objects import ParsedRelease
 from alfred.domain.shared.media import AudioTrack, MediaInfo, VideoTrack
 from alfred.infrastructure.knowledge.release_kb import YamlReleaseKnowledge
 _KB = YamlReleaseKnowledge()
 def _info_with_video(*, width=None, height=None, codec=None, **rest) -> MediaInfo:
@@ -59,17 +62,17 @@ def _bare(**overrides) -> ParsedRelease:
 class TestQuality:
    def test_fills_when_none(self):
        p = _bare()
-        enrich_from_probe(p, _info_with_video(width=1920, height=1080))
+        enrich_from_probe(p, _info_with_video(width=1920, height=1080), _KB)
        assert p.quality == "1080p"
    def test_does_not_overwrite_existing(self):
        p = _bare(quality="2160p")
-        enrich_from_probe(p, _info_with_video(width=1920, height=1080))
+        enrich_from_probe(p, _info_with_video(width=1920, height=1080), _KB)
        assert p.quality == "2160p"
    def test_no_dims_leaves_none(self):
        p = _bare()
-        enrich_from_probe(p, MediaInfo())
+        enrich_from_probe(p, MediaInfo(), _KB)
        assert p.quality is None
@@ -81,27 +84,27 @@ class TestQuality:
 class TestVideoCodec:
    def test_hevc_to_x265(self):
        p = _bare()
-        enrich_from_probe(p, _info_with_video(codec="hevc"))
+        enrich_from_probe(p, _info_with_video(codec="hevc"), _KB)
        assert p.codec == "x265"
    def test_h264_to_x264(self):
        p = _bare()
-        enrich_from_probe(p, _info_with_video(codec="h264"))
+        enrich_from_probe(p, _info_with_video(codec="h264"), _KB)
        assert p.codec == "x264"
    def test_unknown_codec_uppercased(self):
        p = _bare()
-        enrich_from_probe(p, _info_with_video(codec="weird"))
+        enrich_from_probe(p, _info_with_video(codec="weird"), _KB)
        assert p.codec == "WEIRD"
    def test_does_not_overwrite_existing(self):
        p = _bare(codec="HEVC")
-        enrich_from_probe(p, _info_with_video(codec="h264"))
+        enrich_from_probe(p, _info_with_video(codec="h264"), _KB)
        assert p.codec == "HEVC"
    def test_no_codec_leaves_none(self):
        p = _bare()
-        enrich_from_probe(p, MediaInfo())
+        enrich_from_probe(p, MediaInfo(), _KB)
        assert p.codec is None
@@ -119,7 +122,7 @@ class TestAudio:
            ]
        )
        p = _bare()
-        enrich_from_probe(p, info)
+        enrich_from_probe(p, info, _KB)
        assert p.audio_codec == "EAC3"
        assert p.audio_channels == "5.1"
@@ -131,32 +134,32 @@ class TestAudio:
            ]
        )
        p = _bare()
-        enrich_from_probe(p, info)
+        enrich_from_probe(p, info, _KB)
        assert p.audio_codec == "AC3"
        assert p.audio_channels == "5.1"
    def test_channel_count_unknown_falls_back(self):
        info = MediaInfo(audio_tracks=[AudioTrack(0, "aac", 4, "quad", "eng")])
        p = _bare()
-        enrich_from_probe(p, info)
+        enrich_from_probe(p, info, _KB)
        assert p.audio_channels == "4ch"
    def test_unknown_audio_codec_uppercased(self):
        info = MediaInfo(audio_tracks=[AudioTrack(0, "newcodec", 2, "stereo", "eng")])
        p = _bare()
-        enrich_from_probe(p, info)
+        enrich_from_probe(p, info, _KB)
        assert p.audio_codec == "NEWCODEC"
    def test_no_audio_tracks(self):
        p = _bare()
-        enrich_from_probe(p, MediaInfo())
+        enrich_from_probe(p, MediaInfo(), _KB)
        assert p.audio_codec is None
        assert p.audio_channels is None
    def test_does_not_overwrite_existing_audio_fields(self):
        info = MediaInfo(audio_tracks=[AudioTrack(0, "ac3", 6, "5.1", "eng")])
        p = _bare(audio_codec="DTS-HD.MA", audio_channels="7.1")
-        enrich_from_probe(p, info)
+        enrich_from_probe(p, info, _KB)
        assert p.audio_codec == "DTS-HD.MA"
        assert p.audio_channels == "7.1"
@@ -175,7 +178,7 @@ class TestLanguages:
            ]
        )
        p = _bare()
-        enrich_from_probe(p, info)
+        enrich_from_probe(p, info, _KB)
        assert p.languages == ["eng", "fre"]
    def test_skips_und(self):
@@ -186,7 +189,7 @@ class TestLanguages:
            ]
        )
        p = _bare()
-        enrich_from_probe(p, info)
+        enrich_from_probe(p, info, _KB)
        assert p.languages == ["eng"]
    def test_dedup_against_existing_case_insensitive(self):
@@ -201,13 +204,13 @@ class TestLanguages:
        )
        p = _bare()
        p.languages = ["ENG"]
-        enrich_from_probe(p, info)
+        enrich_from_probe(p, info, _KB)
        # "eng" → upper "ENG" already present → skipped. "fre" → "FRE" new → kept.
        assert p.languages == ["ENG", "fre"]
    def test_no_audio_tracks_leaves_languages_empty(self):
        p = _bare()
-        enrich_from_probe(p, MediaInfo())
+        enrich_from_probe(p, MediaInfo(), _KB)
        assert p.languages == []
@@ -224,7 +227,7 @@ class TestTechString:
    def test_rebuilt_from_filled_quality_and_codec(self):
        p = _bare()
        enrich_from_probe(
-            p, _info_with_video(width=1920, height=1080, codec="hevc")
+            p, _info_with_video(width=1920, height=1080, codec="hevc"), _KB
        )
        assert p.quality == "1080p"
        assert p.codec == "x265"
@@ -234,7 +237,7 @@ class TestTechString:
        # Token-level source must stay; probe fills only None fields.
        p = _bare(source="BluRay")
        enrich_from_probe(
-            p, _info_with_video(width=1920, height=1080, codec="hevc")
+            p, _info_with_video(width=1920, height=1080, codec="hevc"), _KB
        )
        assert p.tech_string == "1080p.BluRay.x265"
@@ -242,10 +245,10 @@ class TestTechString:
        # No video info → nothing to fill → derived tech_string stays as it was.
        p = _bare(quality="2160p", source="WEB-DL", codec="x265")
        assert p.tech_string == "2160p.WEB-DL.x265"
-        enrich_from_probe(p, MediaInfo())
+        enrich_from_probe(p, MediaInfo(), _KB)
        assert p.tech_string == "2160p.WEB-DL.x265"
    def test_empty_when_nothing_known(self):
        p = _bare()
-        enrich_from_probe(p, MediaInfo())
+        enrich_from_probe(p, MediaInfo(), _KB)
        assert p.tech_string == ""