diff --git a/CHANGELOG.md b/CHANGELOG.md index 04c6275..1444904 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -57,6 +57,18 @@ callers). ### Changed +- **`enrich_from_probe` codec mappings moved to YAML.** The three + hard-coded module dicts (`_VIDEO_CODEC_MAP`, `_AUDIO_CODEC_MAP`, + `_CHANNEL_MAP`) translating ffprobe output to scene tokens + (`hevc → x265`, `eac3 → EAC3`, `8 → "7.1"`, …) now live in + `alfred/knowledge/release/probe_mappings.yaml` and are loaded into + `ReleaseKnowledge.probe_mappings` (new port field, populated by + `YamlReleaseKnowledge`). `enrich_from_probe` gains a third `kb` + parameter and reads the maps from there. Aligns with the CLAUDE.md + rule that lookup tables of domain knowledge belong in YAML, not in + Python — and opens the door to a future "learn new codec" pass. + Callers updated: `inspect_release`, `testing/recognize_folders_in_downloads.py`, + and all 22 sites in `tests/application/test_enrich_from_probe.py`. - **`ParsedRelease.tech_string` is now a derived `@property`** (`alfred/domain/release/value_objects.py`). It computes `quality.source.codec` joined by dots on every access, so it stays in diff --git a/alfred/application/release/enrich_from_probe.py b/alfred/application/release/enrich_from_probe.py index 1779046..b26eada 100644 --- a/alfred/application/release/enrich_from_probe.py +++ b/alfred/application/release/enrich_from_probe.py @@ -2,55 +2,36 @@ from __future__ import annotations +from alfred.domain.release.ports import ReleaseKnowledge from alfred.domain.release.value_objects import ParsedRelease from alfred.domain.shared.media import MediaInfo -# Map ffprobe codec names to scene-style codec tokens -_VIDEO_CODEC_MAP = { - "hevc": "x265", - "h264": "x264", - "h265": "x265", - "av1": "AV1", - "vp9": "VP9", - "mpeg4": "XviD", -} -# Map ffprobe audio codec names to scene-style tokens -_AUDIO_CODEC_MAP = { - "eac3": "EAC3", - "ac3": "AC3", - "dts": "DTS", - "truehd": "TrueHD", - "aac": "AAC", - "flac": "FLAC", - "opus": "OPUS", - "mp3": "MP3", - "pcm_s16l": "PCM", - "pcm_s24l": "PCM", -} - -# Map channel count to standard layout string -_CHANNEL_MAP = { - 8: "7.1", - 6: "5.1", - 2: "2.0", - 1: "1.0", -} - - -def enrich_from_probe(parsed: ParsedRelease, info: MediaInfo) -> None: +def enrich_from_probe( + parsed: ParsedRelease, info: MediaInfo, kb: ReleaseKnowledge +) -> None: """ Fill None fields in parsed using data from ffprobe MediaInfo. Only overwrites fields that are currently None — token-level values - from the release name always take priority. - Mutates parsed in place. + from the release name always take priority. Mutates parsed in place. + + Translation tables (ffprobe codec name → scene token, channel count + → layout) live in ``kb.probe_mappings`` (loaded from + ``alfred/knowledge/release/probe_mappings.yaml``). When ffprobe + reports a value with no mapping entry, the fallback is the uppercase + raw value so unknown codecs still surface in a predictable form. """ + mappings = kb.probe_mappings + video_codec_map: dict[str, str] = mappings.get("video_codec", {}) + audio_codec_map: dict[str, str] = mappings.get("audio_codec", {}) + channel_map: dict[int, str] = mappings.get("audio_channels", {}) + if parsed.quality is None and info.resolution: parsed.quality = info.resolution if parsed.codec is None and info.video_codec: - parsed.codec = _VIDEO_CODEC_MAP.get( + parsed.codec = video_codec_map.get( info.video_codec.lower(), info.video_codec.upper() ) @@ -64,12 +45,12 @@ def enrich_from_probe(parsed: ParsedRelease, info: MediaInfo) -> None: if track: if parsed.audio_codec is None and track.codec: - parsed.audio_codec = _AUDIO_CODEC_MAP.get( + parsed.audio_codec = audio_codec_map.get( track.codec.lower(), track.codec.upper() ) if parsed.audio_channels is None and track.channels: - parsed.audio_channels = _CHANNEL_MAP.get( + parsed.audio_channels = channel_map.get( track.channels, f"{track.channels}ch" ) diff --git a/alfred/application/release/inspect.py b/alfred/application/release/inspect.py index 9361fa6..bb758cd 100644 --- a/alfred/application/release/inspect.py +++ b/alfred/application/release/inspect.py @@ -127,7 +127,7 @@ def inspect_release( if main_video is not None and parsed.media_type not in _NON_PROBABLE_MEDIA_TYPES: media_info = prober.probe(main_video) if media_info is not None: - enrich_from_probe(parsed, media_info) + enrich_from_probe(parsed, media_info, kb) probe_used = True return InspectedResult( diff --git a/alfred/domain/release/ports/knowledge.py b/alfred/domain/release/ports/knowledge.py index 183c3a0..eba323f 100644 --- a/alfred/domain/release/ports/knowledge.py +++ b/alfred/domain/release/ports/knowledge.py @@ -52,6 +52,18 @@ class ReleaseKnowledge(Protocol): scoring: dict + # --- ffprobe → scene-token translation tables (consumed by + # ``application.release.enrich_from_probe``). Domain parsing itself + # doesn't touch these — exposed on the same KB to keep release + # knowledge in a single ownership point. + # + # Shape: + # - ``video_codec``: dict[str, str] ffprobe lower → scene token + # - ``audio_codec``: dict[str, str] ffprobe lower → scene token + # - ``audio_channels``: dict[int, str] channel count → layout --- + + probe_mappings: dict + # --- File-extension sets (used by application/infra modules that work # directly with filesystem paths, e.g. media-type detection, video # lookup). Domain parsing itself doesn't touch these. --- diff --git a/alfred/infrastructure/knowledge/release.py b/alfred/infrastructure/knowledge/release.py index 05eb08b..08f7939 100644 --- a/alfred/infrastructure/knowledge/release.py +++ b/alfred/infrastructure/knowledge/release.py @@ -191,6 +191,36 @@ def load_scoring() -> dict: } +def load_probe_mappings() -> dict: + """Load ffprobe→scene-token translation tables. + + Returns a dict with three keys: + + - ``video_codec``: ``{ffprobe_codec_lower: scene_token}`` + - ``audio_codec``: ``{ffprobe_codec_lower: scene_token}`` + - ``audio_channels``: ``{channel_count_int: layout_str}`` + + Channel-count keys are normalized to ``int`` here so the consumer can + look up ``track.channels`` directly. Missing sections fall back to + empty dicts — the enrichment code degrades to its uppercase-fallback + path when a mapping is absent. + """ + raw = _load("probe_mappings.yaml") + video_codec = {k.lower(): v for k, v in (raw.get("video_codec") or {}).items()} + audio_codec = {k.lower(): v for k, v in (raw.get("audio_codec") or {}).items()} + audio_channels: dict[int, str] = {} + for k, v in (raw.get("audio_channels") or {}).items(): + try: + audio_channels[int(k)] = v + except (TypeError, ValueError): + continue + return { + "video_codec": video_codec, + "audio_codec": audio_codec, + "audio_channels": audio_channels, + } + + def load_separators() -> list[str]: """Single-char token separators used by the release name tokenizer. diff --git a/alfred/infrastructure/knowledge/release_kb.py b/alfred/infrastructure/knowledge/release_kb.py index 5ecb6ba..7f9d0b6 100644 --- a/alfred/infrastructure/knowledge/release_kb.py +++ b/alfred/infrastructure/knowledge/release_kb.py @@ -29,6 +29,7 @@ from .release import ( load_media_type_tokens, load_metadata_extensions, load_non_video_extensions, + load_probe_mappings, load_resolutions, load_scoring, load_separators, @@ -89,6 +90,10 @@ class YamlReleaseKnowledge: # Parse-scoring config (weights / penalties / thresholds). self.scoring: dict = load_scoring() + # ffprobe → scene-token mapping tables (consumed by + # ``application.release.enrich_from_probe``). + self.probe_mappings: dict = load_probe_mappings() + # File-extension sets (used by application/infra modules, not by # the parser itself — kept here so there is a single ownership # point for release knowledge). diff --git a/alfred/knowledge/release/probe_mappings.yaml b/alfred/knowledge/release/probe_mappings.yaml new file mode 100644 index 0000000..b5371f5 --- /dev/null +++ b/alfred/knowledge/release/probe_mappings.yaml @@ -0,0 +1,45 @@ +# Translation table — ffprobe output → scene-style release tokens. +# +# Consumed by ``alfred.application.release.enrich_from_probe`` when filling +# missing ParsedRelease fields from a probed MediaInfo. Token-level values +# from the release name always win; these mappings only fire when the +# corresponding ParsedRelease field is None. +# +# Lookup is case-insensitive on the key side (ffprobe sometimes emits +# uppercase, sometimes lowercase). When no key matches, the fallback is +# ``ffprobe_value.upper()`` so unknown codecs still surface in a +# predictable form (and signal the gap to a future "learn" pass). +# +# Each section is a flat dict — values are the canonical scene tokens +# Alfred uses everywhere (filename builders, ParsedRelease fields). + +# ffprobe video codec name → scene codec token +video_codec: + hevc: x265 + h264: x264 + h265: x265 + av1: AV1 + vp9: VP9 + mpeg4: XviD + +# ffprobe audio codec name → scene audio token +audio_codec: + eac3: EAC3 + ac3: AC3 + dts: DTS + truehd: TrueHD + aac: AAC + flac: FLAC + opus: OPUS + mp3: MP3 + pcm_s16l: PCM + pcm_s24l: PCM + +# Channel count (integer) → standard layout string. +# Keys are strings here because YAML mappings prefer string keys; the +# loader normalizes them back to int. +audio_channels: + "8": "7.1" + "6": "5.1" + "2": "2.0" + "1": "1.0" diff --git a/testing/recognize_folders_in_downloads.py b/testing/recognize_folders_in_downloads.py index 8dd2d41..459cf29 100644 --- a/testing/recognize_folders_in_downloads.py +++ b/testing/recognize_folders_in_downloads.py @@ -104,8 +104,10 @@ def main() -> None: from alfred.application.release.enrich_from_probe import enrich_from_probe from alfred.domain.release.services import parse_release from alfred.infrastructure.filesystem.find_video import find_video_file + from alfred.infrastructure.knowledge.release_kb import YamlReleaseKnowledge from alfred.infrastructure.probe import FfprobeMediaProber + _kb = YamlReleaseKnowledge() _prober = FfprobeMediaProber() entries = sorted(downloads.iterdir(), key=lambda p: p.name.lower()) @@ -123,14 +125,14 @@ def main() -> None: name = entry.name try: - p = parse_release(name) - p.media_type = detect_media_type(p, entry) + p, _report = parse_release(name, _kb) + p.media_type = detect_media_type(p, entry, _kb) if p.media_type not in ("unknown", "other"): video_file = find_video_file(entry) if video_file: media_info = _prober.probe(video_file) if media_info: - enrich_from_probe(p, media_info) + enrich_from_probe(p, media_info, _kb) warnings = _assess(p) except Exception as e: warnings = [f"parse error: {e}"] diff --git a/tests/application/test_enrich_from_probe.py b/tests/application/test_enrich_from_probe.py index 9893614..349d0e8 100644 --- a/tests/application/test_enrich_from_probe.py +++ b/tests/application/test_enrich_from_probe.py @@ -21,6 +21,9 @@ from __future__ import annotations from alfred.application.release.enrich_from_probe import enrich_from_probe from alfred.domain.release.value_objects import ParsedRelease from alfred.domain.shared.media import AudioTrack, MediaInfo, VideoTrack +from alfred.infrastructure.knowledge.release_kb import YamlReleaseKnowledge + +_KB = YamlReleaseKnowledge() def _info_with_video(*, width=None, height=None, codec=None, **rest) -> MediaInfo: @@ -59,17 +62,17 @@ def _bare(**overrides) -> ParsedRelease: class TestQuality: def test_fills_when_none(self): p = _bare() - enrich_from_probe(p, _info_with_video(width=1920, height=1080)) + enrich_from_probe(p, _info_with_video(width=1920, height=1080), _KB) assert p.quality == "1080p" def test_does_not_overwrite_existing(self): p = _bare(quality="2160p") - enrich_from_probe(p, _info_with_video(width=1920, height=1080)) + enrich_from_probe(p, _info_with_video(width=1920, height=1080), _KB) assert p.quality == "2160p" def test_no_dims_leaves_none(self): p = _bare() - enrich_from_probe(p, MediaInfo()) + enrich_from_probe(p, MediaInfo(), _KB) assert p.quality is None @@ -81,27 +84,27 @@ class TestQuality: class TestVideoCodec: def test_hevc_to_x265(self): p = _bare() - enrich_from_probe(p, _info_with_video(codec="hevc")) + enrich_from_probe(p, _info_with_video(codec="hevc"), _KB) assert p.codec == "x265" def test_h264_to_x264(self): p = _bare() - enrich_from_probe(p, _info_with_video(codec="h264")) + enrich_from_probe(p, _info_with_video(codec="h264"), _KB) assert p.codec == "x264" def test_unknown_codec_uppercased(self): p = _bare() - enrich_from_probe(p, _info_with_video(codec="weird")) + enrich_from_probe(p, _info_with_video(codec="weird"), _KB) assert p.codec == "WEIRD" def test_does_not_overwrite_existing(self): p = _bare(codec="HEVC") - enrich_from_probe(p, _info_with_video(codec="h264")) + enrich_from_probe(p, _info_with_video(codec="h264"), _KB) assert p.codec == "HEVC" def test_no_codec_leaves_none(self): p = _bare() - enrich_from_probe(p, MediaInfo()) + enrich_from_probe(p, MediaInfo(), _KB) assert p.codec is None @@ -119,7 +122,7 @@ class TestAudio: ] ) p = _bare() - enrich_from_probe(p, info) + enrich_from_probe(p, info, _KB) assert p.audio_codec == "EAC3" assert p.audio_channels == "5.1" @@ -131,32 +134,32 @@ class TestAudio: ] ) p = _bare() - enrich_from_probe(p, info) + enrich_from_probe(p, info, _KB) assert p.audio_codec == "AC3" assert p.audio_channels == "5.1" def test_channel_count_unknown_falls_back(self): info = MediaInfo(audio_tracks=[AudioTrack(0, "aac", 4, "quad", "eng")]) p = _bare() - enrich_from_probe(p, info) + enrich_from_probe(p, info, _KB) assert p.audio_channels == "4ch" def test_unknown_audio_codec_uppercased(self): info = MediaInfo(audio_tracks=[AudioTrack(0, "newcodec", 2, "stereo", "eng")]) p = _bare() - enrich_from_probe(p, info) + enrich_from_probe(p, info, _KB) assert p.audio_codec == "NEWCODEC" def test_no_audio_tracks(self): p = _bare() - enrich_from_probe(p, MediaInfo()) + enrich_from_probe(p, MediaInfo(), _KB) assert p.audio_codec is None assert p.audio_channels is None def test_does_not_overwrite_existing_audio_fields(self): info = MediaInfo(audio_tracks=[AudioTrack(0, "ac3", 6, "5.1", "eng")]) p = _bare(audio_codec="DTS-HD.MA", audio_channels="7.1") - enrich_from_probe(p, info) + enrich_from_probe(p, info, _KB) assert p.audio_codec == "DTS-HD.MA" assert p.audio_channels == "7.1" @@ -175,7 +178,7 @@ class TestLanguages: ] ) p = _bare() - enrich_from_probe(p, info) + enrich_from_probe(p, info, _KB) assert p.languages == ["eng", "fre"] def test_skips_und(self): @@ -186,7 +189,7 @@ class TestLanguages: ] ) p = _bare() - enrich_from_probe(p, info) + enrich_from_probe(p, info, _KB) assert p.languages == ["eng"] def test_dedup_against_existing_case_insensitive(self): @@ -201,13 +204,13 @@ class TestLanguages: ) p = _bare() p.languages = ["ENG"] - enrich_from_probe(p, info) + enrich_from_probe(p, info, _KB) # "eng" → upper "ENG" already present → skipped. "fre" → "FRE" new → kept. assert p.languages == ["ENG", "fre"] def test_no_audio_tracks_leaves_languages_empty(self): p = _bare() - enrich_from_probe(p, MediaInfo()) + enrich_from_probe(p, MediaInfo(), _KB) assert p.languages == [] @@ -224,7 +227,7 @@ class TestTechString: def test_rebuilt_from_filled_quality_and_codec(self): p = _bare() enrich_from_probe( - p, _info_with_video(width=1920, height=1080, codec="hevc") + p, _info_with_video(width=1920, height=1080, codec="hevc"), _KB ) assert p.quality == "1080p" assert p.codec == "x265" @@ -234,7 +237,7 @@ class TestTechString: # Token-level source must stay; probe fills only None fields. p = _bare(source="BluRay") enrich_from_probe( - p, _info_with_video(width=1920, height=1080, codec="hevc") + p, _info_with_video(width=1920, height=1080, codec="hevc"), _KB ) assert p.tech_string == "1080p.BluRay.x265" @@ -242,10 +245,10 @@ class TestTechString: # No video info → nothing to fill → derived tech_string stays as it was. p = _bare(quality="2160p", source="WEB-DL", codec="x265") assert p.tech_string == "2160p.WEB-DL.x265" - enrich_from_probe(p, MediaInfo()) + enrich_from_probe(p, MediaInfo(), _KB) assert p.tech_string == "2160p.WEB-DL.x265" def test_empty_when_nothing_known(self): p = _bare() - enrich_from_probe(p, MediaInfo()) + enrich_from_probe(p, MediaInfo(), _KB) assert p.tech_string == ""