From 0246f85ef8a77ca42f8413167d37e945958b32bd Mon Sep 17 00:00:00 2001 From: Francwa Date: Thu, 21 May 2026 07:37:42 +0200 Subject: [PATCH] refactor(release): move codec mappings from code to YAML knowledge The three module-level dicts in enrich_from_probe (ffprobe codec name to scene token, channel count to layout) were exactly the kind of domain lookup table CLAUDE.md says belongs in YAML, not in Python. Move them to alfred/knowledge/release/probe_mappings.yaml, load through a new ReleaseKnowledge.probe_mappings port field, and add a kb parameter to enrich_from_probe so the consumer reads the maps via the same injection pattern as everything else. - New knowledge file: alfred/knowledge/release/probe_mappings.yaml - New loader: load_probe_mappings() in infrastructure/knowledge/release.py (normalizes channel-count keys back to int). - Port: ReleaseKnowledge gains probe_mappings: dict. - Adapter: YamlReleaseKnowledge populates it at __init__. - Consumer: enrich_from_probe(parsed, info, kb) reads the three sub-maps from kb.probe_mappings; unknown codecs still fall back to uppercase raw value, same behaviour as before. - Call sites updated: inspect_release passes kb through; the testing script gets its kb wiring (it was already broken since the ReleaseKnowledge refactor); all 22 enrich_from_probe call sites in tests/application/test_enrich_from_probe.py pass _KB. --- CHANGELOG.md | 12 ++++ .../application/release/enrich_from_probe.py | 57 +++++++------------ alfred/application/release/inspect.py | 2 +- alfred/domain/release/ports/knowledge.py | 12 ++++ alfred/infrastructure/knowledge/release.py | 30 ++++++++++ alfred/infrastructure/knowledge/release_kb.py | 5 ++ alfred/knowledge/release/probe_mappings.yaml | 45 +++++++++++++++ testing/recognize_folders_in_downloads.py | 8 ++- tests/application/test_enrich_from_probe.py | 47 ++++++++------- 9 files changed, 154 insertions(+), 64 deletions(-) create mode 100644 alfred/knowledge/release/probe_mappings.yaml diff --git a/CHANGELOG.md b/CHANGELOG.md index 04c6275..1444904 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -57,6 +57,18 @@ callers). ### Changed +- **`enrich_from_probe` codec mappings moved to YAML.** The three + hard-coded module dicts (`_VIDEO_CODEC_MAP`, `_AUDIO_CODEC_MAP`, + `_CHANNEL_MAP`) translating ffprobe output to scene tokens + (`hevc → x265`, `eac3 → EAC3`, `8 → "7.1"`, …) now live in + `alfred/knowledge/release/probe_mappings.yaml` and are loaded into + `ReleaseKnowledge.probe_mappings` (new port field, populated by + `YamlReleaseKnowledge`). `enrich_from_probe` gains a third `kb` + parameter and reads the maps from there. Aligns with the CLAUDE.md + rule that lookup tables of domain knowledge belong in YAML, not in + Python — and opens the door to a future "learn new codec" pass. + Callers updated: `inspect_release`, `testing/recognize_folders_in_downloads.py`, + and all 22 sites in `tests/application/test_enrich_from_probe.py`. - **`ParsedRelease.tech_string` is now a derived `@property`** (`alfred/domain/release/value_objects.py`). It computes `quality.source.codec` joined by dots on every access, so it stays in diff --git a/alfred/application/release/enrich_from_probe.py b/alfred/application/release/enrich_from_probe.py index 1779046..b26eada 100644 --- a/alfred/application/release/enrich_from_probe.py +++ b/alfred/application/release/enrich_from_probe.py @@ -2,55 +2,36 @@ from __future__ import annotations +from alfred.domain.release.ports import ReleaseKnowledge from alfred.domain.release.value_objects import ParsedRelease from alfred.domain.shared.media import MediaInfo -# Map ffprobe codec names to scene-style codec tokens -_VIDEO_CODEC_MAP = { - "hevc": "x265", - "h264": "x264", - "h265": "x265", - "av1": "AV1", - "vp9": "VP9", - "mpeg4": "XviD", -} -# Map ffprobe audio codec names to scene-style tokens -_AUDIO_CODEC_MAP = { - "eac3": "EAC3", - "ac3": "AC3", - "dts": "DTS", - "truehd": "TrueHD", - "aac": "AAC", - "flac": "FLAC", - "opus": "OPUS", - "mp3": "MP3", - "pcm_s16l": "PCM", - "pcm_s24l": "PCM", -} - -# Map channel count to standard layout string -_CHANNEL_MAP = { - 8: "7.1", - 6: "5.1", - 2: "2.0", - 1: "1.0", -} - - -def enrich_from_probe(parsed: ParsedRelease, info: MediaInfo) -> None: +def enrich_from_probe( + parsed: ParsedRelease, info: MediaInfo, kb: ReleaseKnowledge +) -> None: """ Fill None fields in parsed using data from ffprobe MediaInfo. Only overwrites fields that are currently None — token-level values - from the release name always take priority. - Mutates parsed in place. + from the release name always take priority. Mutates parsed in place. + + Translation tables (ffprobe codec name → scene token, channel count + → layout) live in ``kb.probe_mappings`` (loaded from + ``alfred/knowledge/release/probe_mappings.yaml``). When ffprobe + reports a value with no mapping entry, the fallback is the uppercase + raw value so unknown codecs still surface in a predictable form. """ + mappings = kb.probe_mappings + video_codec_map: dict[str, str] = mappings.get("video_codec", {}) + audio_codec_map: dict[str, str] = mappings.get("audio_codec", {}) + channel_map: dict[int, str] = mappings.get("audio_channels", {}) + if parsed.quality is None and info.resolution: parsed.quality = info.resolution if parsed.codec is None and info.video_codec: - parsed.codec = _VIDEO_CODEC_MAP.get( + parsed.codec = video_codec_map.get( info.video_codec.lower(), info.video_codec.upper() ) @@ -64,12 +45,12 @@ def enrich_from_probe(parsed: ParsedRelease, info: MediaInfo) -> None: if track: if parsed.audio_codec is None and track.codec: - parsed.audio_codec = _AUDIO_CODEC_MAP.get( + parsed.audio_codec = audio_codec_map.get( track.codec.lower(), track.codec.upper() ) if parsed.audio_channels is None and track.channels: - parsed.audio_channels = _CHANNEL_MAP.get( + parsed.audio_channels = channel_map.get( track.channels, f"{track.channels}ch" ) diff --git a/alfred/application/release/inspect.py b/alfred/application/release/inspect.py index 9361fa6..bb758cd 100644 --- a/alfred/application/release/inspect.py +++ b/alfred/application/release/inspect.py @@ -127,7 +127,7 @@ def inspect_release( if main_video is not None and parsed.media_type not in _NON_PROBABLE_MEDIA_TYPES: media_info = prober.probe(main_video) if media_info is not None: - enrich_from_probe(parsed, media_info) + enrich_from_probe(parsed, media_info, kb) probe_used = True return InspectedResult( diff --git a/alfred/domain/release/ports/knowledge.py b/alfred/domain/release/ports/knowledge.py index 183c3a0..eba323f 100644 --- a/alfred/domain/release/ports/knowledge.py +++ b/alfred/domain/release/ports/knowledge.py @@ -52,6 +52,18 @@ class ReleaseKnowledge(Protocol): scoring: dict + # --- ffprobe → scene-token translation tables (consumed by + # ``application.release.enrich_from_probe``). Domain parsing itself + # doesn't touch these — exposed on the same KB to keep release + # knowledge in a single ownership point. + # + # Shape: + # - ``video_codec``: dict[str, str] ffprobe lower → scene token + # - ``audio_codec``: dict[str, str] ffprobe lower → scene token + # - ``audio_channels``: dict[int, str] channel count → layout --- + + probe_mappings: dict + # --- File-extension sets (used by application/infra modules that work # directly with filesystem paths, e.g. media-type detection, video # lookup). Domain parsing itself doesn't touch these. --- diff --git a/alfred/infrastructure/knowledge/release.py b/alfred/infrastructure/knowledge/release.py index 05eb08b..08f7939 100644 --- a/alfred/infrastructure/knowledge/release.py +++ b/alfred/infrastructure/knowledge/release.py @@ -191,6 +191,36 @@ def load_scoring() -> dict: } +def load_probe_mappings() -> dict: + """Load ffprobe→scene-token translation tables. + + Returns a dict with three keys: + + - ``video_codec``: ``{ffprobe_codec_lower: scene_token}`` + - ``audio_codec``: ``{ffprobe_codec_lower: scene_token}`` + - ``audio_channels``: ``{channel_count_int: layout_str}`` + + Channel-count keys are normalized to ``int`` here so the consumer can + look up ``track.channels`` directly. Missing sections fall back to + empty dicts — the enrichment code degrades to its uppercase-fallback + path when a mapping is absent. + """ + raw = _load("probe_mappings.yaml") + video_codec = {k.lower(): v for k, v in (raw.get("video_codec") or {}).items()} + audio_codec = {k.lower(): v for k, v in (raw.get("audio_codec") or {}).items()} + audio_channels: dict[int, str] = {} + for k, v in (raw.get("audio_channels") or {}).items(): + try: + audio_channels[int(k)] = v + except (TypeError, ValueError): + continue + return { + "video_codec": video_codec, + "audio_codec": audio_codec, + "audio_channels": audio_channels, + } + + def load_separators() -> list[str]: """Single-char token separators used by the release name tokenizer. diff --git a/alfred/infrastructure/knowledge/release_kb.py b/alfred/infrastructure/knowledge/release_kb.py index 5ecb6ba..7f9d0b6 100644 --- a/alfred/infrastructure/knowledge/release_kb.py +++ b/alfred/infrastructure/knowledge/release_kb.py @@ -29,6 +29,7 @@ from .release import ( load_media_type_tokens, load_metadata_extensions, load_non_video_extensions, + load_probe_mappings, load_resolutions, load_scoring, load_separators, @@ -89,6 +90,10 @@ class YamlReleaseKnowledge: # Parse-scoring config (weights / penalties / thresholds). self.scoring: dict = load_scoring() + # ffprobe → scene-token mapping tables (consumed by + # ``application.release.enrich_from_probe``). + self.probe_mappings: dict = load_probe_mappings() + # File-extension sets (used by application/infra modules, not by # the parser itself — kept here so there is a single ownership # point for release knowledge). diff --git a/alfred/knowledge/release/probe_mappings.yaml b/alfred/knowledge/release/probe_mappings.yaml new file mode 100644 index 0000000..b5371f5 --- /dev/null +++ b/alfred/knowledge/release/probe_mappings.yaml @@ -0,0 +1,45 @@ +# Translation table — ffprobe output → scene-style release tokens. +# +# Consumed by ``alfred.application.release.enrich_from_probe`` when filling +# missing ParsedRelease fields from a probed MediaInfo. Token-level values +# from the release name always win; these mappings only fire when the +# corresponding ParsedRelease field is None. +# +# Lookup is case-insensitive on the key side (ffprobe sometimes emits +# uppercase, sometimes lowercase). When no key matches, the fallback is +# ``ffprobe_value.upper()`` so unknown codecs still surface in a +# predictable form (and signal the gap to a future "learn" pass). +# +# Each section is a flat dict — values are the canonical scene tokens +# Alfred uses everywhere (filename builders, ParsedRelease fields). + +# ffprobe video codec name → scene codec token +video_codec: + hevc: x265 + h264: x264 + h265: x265 + av1: AV1 + vp9: VP9 + mpeg4: XviD + +# ffprobe audio codec name → scene audio token +audio_codec: + eac3: EAC3 + ac3: AC3 + dts: DTS + truehd: TrueHD + aac: AAC + flac: FLAC + opus: OPUS + mp3: MP3 + pcm_s16l: PCM + pcm_s24l: PCM + +# Channel count (integer) → standard layout string. +# Keys are strings here because YAML mappings prefer string keys; the +# loader normalizes them back to int. +audio_channels: + "8": "7.1" + "6": "5.1" + "2": "2.0" + "1": "1.0" diff --git a/testing/recognize_folders_in_downloads.py b/testing/recognize_folders_in_downloads.py index 8dd2d41..459cf29 100644 --- a/testing/recognize_folders_in_downloads.py +++ b/testing/recognize_folders_in_downloads.py @@ -104,8 +104,10 @@ def main() -> None: from alfred.application.release.enrich_from_probe import enrich_from_probe from alfred.domain.release.services import parse_release from alfred.infrastructure.filesystem.find_video import find_video_file + from alfred.infrastructure.knowledge.release_kb import YamlReleaseKnowledge from alfred.infrastructure.probe import FfprobeMediaProber + _kb = YamlReleaseKnowledge() _prober = FfprobeMediaProber() entries = sorted(downloads.iterdir(), key=lambda p: p.name.lower()) @@ -123,14 +125,14 @@ def main() -> None: name = entry.name try: - p = parse_release(name) - p.media_type = detect_media_type(p, entry) + p, _report = parse_release(name, _kb) + p.media_type = detect_media_type(p, entry, _kb) if p.media_type not in ("unknown", "other"): video_file = find_video_file(entry) if video_file: media_info = _prober.probe(video_file) if media_info: - enrich_from_probe(p, media_info) + enrich_from_probe(p, media_info, _kb) warnings = _assess(p) except Exception as e: warnings = [f"parse error: {e}"] diff --git a/tests/application/test_enrich_from_probe.py b/tests/application/test_enrich_from_probe.py index 9893614..349d0e8 100644 --- a/tests/application/test_enrich_from_probe.py +++ b/tests/application/test_enrich_from_probe.py @@ -21,6 +21,9 @@ from __future__ import annotations from alfred.application.release.enrich_from_probe import enrich_from_probe from alfred.domain.release.value_objects import ParsedRelease from alfred.domain.shared.media import AudioTrack, MediaInfo, VideoTrack +from alfred.infrastructure.knowledge.release_kb import YamlReleaseKnowledge + +_KB = YamlReleaseKnowledge() def _info_with_video(*, width=None, height=None, codec=None, **rest) -> MediaInfo: @@ -59,17 +62,17 @@ def _bare(**overrides) -> ParsedRelease: class TestQuality: def test_fills_when_none(self): p = _bare() - enrich_from_probe(p, _info_with_video(width=1920, height=1080)) + enrich_from_probe(p, _info_with_video(width=1920, height=1080), _KB) assert p.quality == "1080p" def test_does_not_overwrite_existing(self): p = _bare(quality="2160p") - enrich_from_probe(p, _info_with_video(width=1920, height=1080)) + enrich_from_probe(p, _info_with_video(width=1920, height=1080), _KB) assert p.quality == "2160p" def test_no_dims_leaves_none(self): p = _bare() - enrich_from_probe(p, MediaInfo()) + enrich_from_probe(p, MediaInfo(), _KB) assert p.quality is None @@ -81,27 +84,27 @@ class TestQuality: class TestVideoCodec: def test_hevc_to_x265(self): p = _bare() - enrich_from_probe(p, _info_with_video(codec="hevc")) + enrich_from_probe(p, _info_with_video(codec="hevc"), _KB) assert p.codec == "x265" def test_h264_to_x264(self): p = _bare() - enrich_from_probe(p, _info_with_video(codec="h264")) + enrich_from_probe(p, _info_with_video(codec="h264"), _KB) assert p.codec == "x264" def test_unknown_codec_uppercased(self): p = _bare() - enrich_from_probe(p, _info_with_video(codec="weird")) + enrich_from_probe(p, _info_with_video(codec="weird"), _KB) assert p.codec == "WEIRD" def test_does_not_overwrite_existing(self): p = _bare(codec="HEVC") - enrich_from_probe(p, _info_with_video(codec="h264")) + enrich_from_probe(p, _info_with_video(codec="h264"), _KB) assert p.codec == "HEVC" def test_no_codec_leaves_none(self): p = _bare() - enrich_from_probe(p, MediaInfo()) + enrich_from_probe(p, MediaInfo(), _KB) assert p.codec is None @@ -119,7 +122,7 @@ class TestAudio: ] ) p = _bare() - enrich_from_probe(p, info) + enrich_from_probe(p, info, _KB) assert p.audio_codec == "EAC3" assert p.audio_channels == "5.1" @@ -131,32 +134,32 @@ class TestAudio: ] ) p = _bare() - enrich_from_probe(p, info) + enrich_from_probe(p, info, _KB) assert p.audio_codec == "AC3" assert p.audio_channels == "5.1" def test_channel_count_unknown_falls_back(self): info = MediaInfo(audio_tracks=[AudioTrack(0, "aac", 4, "quad", "eng")]) p = _bare() - enrich_from_probe(p, info) + enrich_from_probe(p, info, _KB) assert p.audio_channels == "4ch" def test_unknown_audio_codec_uppercased(self): info = MediaInfo(audio_tracks=[AudioTrack(0, "newcodec", 2, "stereo", "eng")]) p = _bare() - enrich_from_probe(p, info) + enrich_from_probe(p, info, _KB) assert p.audio_codec == "NEWCODEC" def test_no_audio_tracks(self): p = _bare() - enrich_from_probe(p, MediaInfo()) + enrich_from_probe(p, MediaInfo(), _KB) assert p.audio_codec is None assert p.audio_channels is None def test_does_not_overwrite_existing_audio_fields(self): info = MediaInfo(audio_tracks=[AudioTrack(0, "ac3", 6, "5.1", "eng")]) p = _bare(audio_codec="DTS-HD.MA", audio_channels="7.1") - enrich_from_probe(p, info) + enrich_from_probe(p, info, _KB) assert p.audio_codec == "DTS-HD.MA" assert p.audio_channels == "7.1" @@ -175,7 +178,7 @@ class TestLanguages: ] ) p = _bare() - enrich_from_probe(p, info) + enrich_from_probe(p, info, _KB) assert p.languages == ["eng", "fre"] def test_skips_und(self): @@ -186,7 +189,7 @@ class TestLanguages: ] ) p = _bare() - enrich_from_probe(p, info) + enrich_from_probe(p, info, _KB) assert p.languages == ["eng"] def test_dedup_against_existing_case_insensitive(self): @@ -201,13 +204,13 @@ class TestLanguages: ) p = _bare() p.languages = ["ENG"] - enrich_from_probe(p, info) + enrich_from_probe(p, info, _KB) # "eng" → upper "ENG" already present → skipped. "fre" → "FRE" new → kept. assert p.languages == ["ENG", "fre"] def test_no_audio_tracks_leaves_languages_empty(self): p = _bare() - enrich_from_probe(p, MediaInfo()) + enrich_from_probe(p, MediaInfo(), _KB) assert p.languages == [] @@ -224,7 +227,7 @@ class TestTechString: def test_rebuilt_from_filled_quality_and_codec(self): p = _bare() enrich_from_probe( - p, _info_with_video(width=1920, height=1080, codec="hevc") + p, _info_with_video(width=1920, height=1080, codec="hevc"), _KB ) assert p.quality == "1080p" assert p.codec == "x265" @@ -234,7 +237,7 @@ class TestTechString: # Token-level source must stay; probe fills only None fields. p = _bare(source="BluRay") enrich_from_probe( - p, _info_with_video(width=1920, height=1080, codec="hevc") + p, _info_with_video(width=1920, height=1080, codec="hevc"), _KB ) assert p.tech_string == "1080p.BluRay.x265" @@ -242,10 +245,10 @@ class TestTechString: # No video info → nothing to fill → derived tech_string stays as it was. p = _bare(quality="2160p", source="WEB-DL", codec="x265") assert p.tech_string == "2160p.WEB-DL.x265" - enrich_from_probe(p, MediaInfo()) + enrich_from_probe(p, MediaInfo(), _KB) assert p.tech_string == "2160p.WEB-DL.x265" def test_empty_when_nothing_known(self): p = _bare() - enrich_from_probe(p, MediaInfo()) + enrich_from_probe(p, MediaInfo(), _KB) assert p.tech_string == ""