refactor(release): move codec mappings from code to YAML knowledge

The three module-level dicts in enrich_from_probe (ffprobe codec name
to scene token, channel count to layout) were exactly the kind of
domain lookup table CLAUDE.md says belongs in YAML, not in Python.
Move them to alfred/knowledge/release/probe_mappings.yaml, load
through a new ReleaseKnowledge.probe_mappings port field, and add a
kb parameter to enrich_from_probe so the consumer reads the maps via
the same injection pattern as everything else.

- New knowledge file: alfred/knowledge/release/probe_mappings.yaml
- New loader: load_probe_mappings() in infrastructure/knowledge/release.py
  (normalizes channel-count keys back to int).
- Port: ReleaseKnowledge gains probe_mappings: dict.
- Adapter: YamlReleaseKnowledge populates it at __init__.
- Consumer: enrich_from_probe(parsed, info, kb) reads the three sub-maps
  from kb.probe_mappings; unknown codecs still fall back to uppercase
  raw value, same behaviour as before.
- Call sites updated: inspect_release passes kb through; the testing
  script gets its kb wiring (it had been broken since the
  ReleaseKnowledge refactor); all 22 enrich_from_probe call sites in
  tests/application/test_enrich_from_probe.py pass _KB.
This commit is contained in:
2026-05-21 07:37:42 +02:00
parent e62dc90bd1
commit 0246f85ef8
9 changed files with 154 additions and 64 deletions
+12
View File
@@ -57,6 +57,18 @@ callers).
### Changed
- **`enrich_from_probe` codec mappings moved to YAML.** The three
hard-coded module dicts (`_VIDEO_CODEC_MAP`, `_AUDIO_CODEC_MAP`,
`_CHANNEL_MAP`) translating ffprobe output to scene tokens
(`hevc → x265`, `eac3 → EAC3`, `8 → "7.1"`, …) now live in
`alfred/knowledge/release/probe_mappings.yaml` and are loaded into
`ReleaseKnowledge.probe_mappings` (new port field, populated by
`YamlReleaseKnowledge`). `enrich_from_probe` gains a third `kb`
parameter and reads the maps from there. Aligns with the CLAUDE.md
rule that lookup tables of domain knowledge belong in YAML, not in
Python — and opens the door to a future "learn new codec" pass.
Callers updated: `inspect_release`, `testing/recognize_folders_in_downloads.py`,
and all 22 sites in `tests/application/test_enrich_from_probe.py`.
- **`ParsedRelease.tech_string` is now a derived `@property`**
(`alfred/domain/release/value_objects.py`). It computes
`quality.source.codec` joined by dots on every access, so it stays in
+19 -38
View File
@@ -2,55 +2,36 @@
from __future__ import annotations from __future__ import annotations
from alfred.domain.release.ports import ReleaseKnowledge
from alfred.domain.release.value_objects import ParsedRelease from alfred.domain.release.value_objects import ParsedRelease
from alfred.domain.shared.media import MediaInfo from alfred.domain.shared.media import MediaInfo
# Map ffprobe codec names to scene-style codec tokens
_VIDEO_CODEC_MAP = {
"hevc": "x265",
"h264": "x264",
"h265": "x265",
"av1": "AV1",
"vp9": "VP9",
"mpeg4": "XviD",
}
# Map ffprobe audio codec names to scene-style tokens def enrich_from_probe(
_AUDIO_CODEC_MAP = { parsed: ParsedRelease, info: MediaInfo, kb: ReleaseKnowledge
"eac3": "EAC3", ) -> None:
"ac3": "AC3",
"dts": "DTS",
"truehd": "TrueHD",
"aac": "AAC",
"flac": "FLAC",
"opus": "OPUS",
"mp3": "MP3",
"pcm_s16l": "PCM",
"pcm_s24l": "PCM",
}
# Map channel count to standard layout string
_CHANNEL_MAP = {
8: "7.1",
6: "5.1",
2: "2.0",
1: "1.0",
}
def enrich_from_probe(parsed: ParsedRelease, info: MediaInfo) -> None:
""" """
Fill None fields in parsed using data from ffprobe MediaInfo. Fill None fields in parsed using data from ffprobe MediaInfo.
Only overwrites fields that are currently None — token-level values Only overwrites fields that are currently None — token-level values
from the release name always take priority. from the release name always take priority. Mutates parsed in place.
Mutates parsed in place.
Translation tables (ffprobe codec name → scene token, channel count
→ layout) live in ``kb.probe_mappings`` (loaded from
``alfred/knowledge/release/probe_mappings.yaml``). When ffprobe
reports a value with no mapping entry, the fallback is the uppercase
raw value so unknown codecs still surface in a predictable form.
""" """
mappings = kb.probe_mappings
video_codec_map: dict[str, str] = mappings.get("video_codec", {})
audio_codec_map: dict[str, str] = mappings.get("audio_codec", {})
channel_map: dict[int, str] = mappings.get("audio_channels", {})
if parsed.quality is None and info.resolution: if parsed.quality is None and info.resolution:
parsed.quality = info.resolution parsed.quality = info.resolution
if parsed.codec is None and info.video_codec: if parsed.codec is None and info.video_codec:
parsed.codec = _VIDEO_CODEC_MAP.get( parsed.codec = video_codec_map.get(
info.video_codec.lower(), info.video_codec.upper() info.video_codec.lower(), info.video_codec.upper()
) )
@@ -64,12 +45,12 @@ def enrich_from_probe(parsed: ParsedRelease, info: MediaInfo) -> None:
if track: if track:
if parsed.audio_codec is None and track.codec: if parsed.audio_codec is None and track.codec:
parsed.audio_codec = _AUDIO_CODEC_MAP.get( parsed.audio_codec = audio_codec_map.get(
track.codec.lower(), track.codec.upper() track.codec.lower(), track.codec.upper()
) )
if parsed.audio_channels is None and track.channels: if parsed.audio_channels is None and track.channels:
parsed.audio_channels = _CHANNEL_MAP.get( parsed.audio_channels = channel_map.get(
track.channels, f"{track.channels}ch" track.channels, f"{track.channels}ch"
) )
+1 -1
View File
@@ -127,7 +127,7 @@ def inspect_release(
if main_video is not None and parsed.media_type not in _NON_PROBABLE_MEDIA_TYPES: if main_video is not None and parsed.media_type not in _NON_PROBABLE_MEDIA_TYPES:
media_info = prober.probe(main_video) media_info = prober.probe(main_video)
if media_info is not None: if media_info is not None:
enrich_from_probe(parsed, media_info) enrich_from_probe(parsed, media_info, kb)
probe_used = True probe_used = True
return InspectedResult( return InspectedResult(
+12
View File
@@ -52,6 +52,18 @@ class ReleaseKnowledge(Protocol):
scoring: dict scoring: dict
# --- ffprobe → scene-token translation tables (consumed by
# ``application.release.enrich_from_probe``). Domain parsing itself
# doesn't touch these — exposed on the same KB to keep release
# knowledge in a single ownership point.
#
# Shape:
# - ``video_codec``: dict[str, str] ffprobe lower → scene token
# - ``audio_codec``: dict[str, str] ffprobe lower → scene token
# - ``audio_channels``: dict[int, str] channel count → layout ---
probe_mappings: dict
# --- File-extension sets (used by application/infra modules that work # --- File-extension sets (used by application/infra modules that work
# directly with filesystem paths, e.g. media-type detection, video # directly with filesystem paths, e.g. media-type detection, video
# lookup). Domain parsing itself doesn't touch these. --- # lookup). Domain parsing itself doesn't touch these. ---
@@ -191,6 +191,36 @@ def load_scoring() -> dict:
} }
def load_probe_mappings() -> dict:
    """Load ffprobe→scene-token translation tables.

    Reads ``probe_mappings.yaml`` and returns a dict with three keys:

    - ``video_codec``: ``{ffprobe_codec_lower: scene_token}``
    - ``audio_codec``: ``{ffprobe_codec_lower: scene_token}``
    - ``audio_channels``: ``{channel_count_int: layout_str}``

    Codec keys are stringified before lower-casing, so a key that YAML
    resolved to a non-string scalar (e.g. an unquoted number) cannot crash
    the loader. Channel-count keys are normalized to ``int`` so the
    consumer can look up ``track.channels`` directly; keys that are not a
    whole number (booleans, non-integral floats, unparsable strings) are
    skipped rather than silently coerced to a wrong count.

    Missing or empty sections — and an empty document — yield empty
    dicts, so the enrichment code degrades to its fallback path when a
    mapping is absent.
    """
    # ``or {}`` guards against an empty/null document.
    raw = _load("probe_mappings.yaml") or {}

    def _codec_section(name: str) -> dict[str, str]:
        # str() first: YAML may hand back int/bool keys, which have no .lower().
        section = raw.get(name) or {}
        return {str(key).lower(): token for key, token in section.items()}

    audio_channels: dict[int, str] = {}
    for key, layout in (raw.get("audio_channels") or {}).items():
        # bool is an int subclass: int(True) == 1 would invent a mono entry.
        if isinstance(key, bool):
            continue
        # int(5.1) == 5 would register a layout under the wrong count.
        if isinstance(key, float) and not key.is_integer():
            continue
        try:
            audio_channels[int(key)] = layout
        except (TypeError, ValueError):
            # Not a channel count (e.g. "stereo", None) — ignore the entry.
            continue

    return {
        "video_codec": _codec_section("video_codec"),
        "audio_codec": _codec_section("audio_codec"),
        "audio_channels": audio_channels,
    }
def load_separators() -> list[str]: def load_separators() -> list[str]:
"""Single-char token separators used by the release name tokenizer. """Single-char token separators used by the release name tokenizer.
@@ -29,6 +29,7 @@ from .release import (
load_media_type_tokens, load_media_type_tokens,
load_metadata_extensions, load_metadata_extensions,
load_non_video_extensions, load_non_video_extensions,
load_probe_mappings,
load_resolutions, load_resolutions,
load_scoring, load_scoring,
load_separators, load_separators,
@@ -89,6 +90,10 @@ class YamlReleaseKnowledge:
# Parse-scoring config (weights / penalties / thresholds). # Parse-scoring config (weights / penalties / thresholds).
self.scoring: dict = load_scoring() self.scoring: dict = load_scoring()
# ffprobe → scene-token mapping tables (consumed by
# ``application.release.enrich_from_probe``).
self.probe_mappings: dict = load_probe_mappings()
# File-extension sets (used by application/infra modules, not by # File-extension sets (used by application/infra modules, not by
# the parser itself — kept here so there is a single ownership # the parser itself — kept here so there is a single ownership
# point for release knowledge). # point for release knowledge).
@@ -0,0 +1,45 @@
# Translation table — ffprobe output → scene-style release tokens.
#
# Consumed by ``alfred.application.release.enrich_from_probe`` when filling
# missing ParsedRelease fields from a probed MediaInfo. Token-level values
# from the release name always win; these mappings only fire when the
# corresponding ParsedRelease field is None.
#
# Lookup is case-insensitive on the key side (ffprobe sometimes emits
# uppercase, sometimes lowercase). When no key matches, the fallback is
# ``ffprobe_value.upper()`` so unknown codecs still surface in a
# predictable form (and signal the gap to a future "learn" pass).
#
# Each section is a flat dict — values are the canonical scene tokens
# Alfred uses everywhere (filename builders, ParsedRelease fields).
# ffprobe video codec name → scene codec token
video_codec:
hevc: x265
h264: x264
h265: x265
av1: AV1
vp9: VP9
mpeg4: XviD
# ffprobe audio codec name → scene audio token
#
# PCM note: FFmpeg's PCM codec names carry an endianness suffix
# (``pcm_s16le``, ``pcm_s24le``), so those are the keys that match probe
# output. The shorter ``pcm_s16l`` / ``pcm_s24l`` spellings are kept so
# existing lookups behave exactly as before.
audio_codec:
  eac3: EAC3
  ac3: AC3
  dts: DTS
  truehd: TrueHD
  aac: AAC
  flac: FLAC
  opus: OPUS
  mp3: MP3
  pcm_s16le: PCM
  pcm_s24le: PCM
  pcm_s16l: PCM
  pcm_s24l: PCM
# Channel count (integer) → standard layout string.
# Keys are quoted so they all parse as strings; the loader normalizes
# them back to int.
audio_channels:
"8": "7.1"
"6": "5.1"
"2": "2.0"
"1": "1.0"
+5 -3
View File
@@ -104,8 +104,10 @@ def main() -> None:
from alfred.application.release.enrich_from_probe import enrich_from_probe from alfred.application.release.enrich_from_probe import enrich_from_probe
from alfred.domain.release.services import parse_release from alfred.domain.release.services import parse_release
from alfred.infrastructure.filesystem.find_video import find_video_file from alfred.infrastructure.filesystem.find_video import find_video_file
from alfred.infrastructure.knowledge.release_kb import YamlReleaseKnowledge
from alfred.infrastructure.probe import FfprobeMediaProber from alfred.infrastructure.probe import FfprobeMediaProber
_kb = YamlReleaseKnowledge()
_prober = FfprobeMediaProber() _prober = FfprobeMediaProber()
entries = sorted(downloads.iterdir(), key=lambda p: p.name.lower()) entries = sorted(downloads.iterdir(), key=lambda p: p.name.lower())
@@ -123,14 +125,14 @@ def main() -> None:
name = entry.name name = entry.name
try: try:
p = parse_release(name) p, _report = parse_release(name, _kb)
p.media_type = detect_media_type(p, entry) p.media_type = detect_media_type(p, entry, _kb)
if p.media_type not in ("unknown", "other"): if p.media_type not in ("unknown", "other"):
video_file = find_video_file(entry) video_file = find_video_file(entry)
if video_file: if video_file:
media_info = _prober.probe(video_file) media_info = _prober.probe(video_file)
if media_info: if media_info:
enrich_from_probe(p, media_info) enrich_from_probe(p, media_info, _kb)
warnings = _assess(p) warnings = _assess(p)
except Exception as e: except Exception as e:
warnings = [f"parse error: {e}"] warnings = [f"parse error: {e}"]
+25 -22
View File
@@ -21,6 +21,9 @@ from __future__ import annotations
from alfred.application.release.enrich_from_probe import enrich_from_probe from alfred.application.release.enrich_from_probe import enrich_from_probe
from alfred.domain.release.value_objects import ParsedRelease from alfred.domain.release.value_objects import ParsedRelease
from alfred.domain.shared.media import AudioTrack, MediaInfo, VideoTrack from alfred.domain.shared.media import AudioTrack, MediaInfo, VideoTrack
from alfred.infrastructure.knowledge.release_kb import YamlReleaseKnowledge
_KB = YamlReleaseKnowledge()
def _info_with_video(*, width=None, height=None, codec=None, **rest) -> MediaInfo: def _info_with_video(*, width=None, height=None, codec=None, **rest) -> MediaInfo:
@@ -59,17 +62,17 @@ def _bare(**overrides) -> ParsedRelease:
class TestQuality: class TestQuality:
def test_fills_when_none(self): def test_fills_when_none(self):
p = _bare() p = _bare()
enrich_from_probe(p, _info_with_video(width=1920, height=1080)) enrich_from_probe(p, _info_with_video(width=1920, height=1080), _KB)
assert p.quality == "1080p" assert p.quality == "1080p"
def test_does_not_overwrite_existing(self): def test_does_not_overwrite_existing(self):
p = _bare(quality="2160p") p = _bare(quality="2160p")
enrich_from_probe(p, _info_with_video(width=1920, height=1080)) enrich_from_probe(p, _info_with_video(width=1920, height=1080), _KB)
assert p.quality == "2160p" assert p.quality == "2160p"
def test_no_dims_leaves_none(self): def test_no_dims_leaves_none(self):
p = _bare() p = _bare()
enrich_from_probe(p, MediaInfo()) enrich_from_probe(p, MediaInfo(), _KB)
assert p.quality is None assert p.quality is None
@@ -81,27 +84,27 @@ class TestQuality:
class TestVideoCodec: class TestVideoCodec:
def test_hevc_to_x265(self): def test_hevc_to_x265(self):
p = _bare() p = _bare()
enrich_from_probe(p, _info_with_video(codec="hevc")) enrich_from_probe(p, _info_with_video(codec="hevc"), _KB)
assert p.codec == "x265" assert p.codec == "x265"
def test_h264_to_x264(self): def test_h264_to_x264(self):
p = _bare() p = _bare()
enrich_from_probe(p, _info_with_video(codec="h264")) enrich_from_probe(p, _info_with_video(codec="h264"), _KB)
assert p.codec == "x264" assert p.codec == "x264"
def test_unknown_codec_uppercased(self): def test_unknown_codec_uppercased(self):
p = _bare() p = _bare()
enrich_from_probe(p, _info_with_video(codec="weird")) enrich_from_probe(p, _info_with_video(codec="weird"), _KB)
assert p.codec == "WEIRD" assert p.codec == "WEIRD"
def test_does_not_overwrite_existing(self): def test_does_not_overwrite_existing(self):
p = _bare(codec="HEVC") p = _bare(codec="HEVC")
enrich_from_probe(p, _info_with_video(codec="h264")) enrich_from_probe(p, _info_with_video(codec="h264"), _KB)
assert p.codec == "HEVC" assert p.codec == "HEVC"
def test_no_codec_leaves_none(self): def test_no_codec_leaves_none(self):
p = _bare() p = _bare()
enrich_from_probe(p, MediaInfo()) enrich_from_probe(p, MediaInfo(), _KB)
assert p.codec is None assert p.codec is None
@@ -119,7 +122,7 @@ class TestAudio:
] ]
) )
p = _bare() p = _bare()
enrich_from_probe(p, info) enrich_from_probe(p, info, _KB)
assert p.audio_codec == "EAC3" assert p.audio_codec == "EAC3"
assert p.audio_channels == "5.1" assert p.audio_channels == "5.1"
@@ -131,32 +134,32 @@ class TestAudio:
] ]
) )
p = _bare() p = _bare()
enrich_from_probe(p, info) enrich_from_probe(p, info, _KB)
assert p.audio_codec == "AC3" assert p.audio_codec == "AC3"
assert p.audio_channels == "5.1" assert p.audio_channels == "5.1"
def test_channel_count_unknown_falls_back(self): def test_channel_count_unknown_falls_back(self):
info = MediaInfo(audio_tracks=[AudioTrack(0, "aac", 4, "quad", "eng")]) info = MediaInfo(audio_tracks=[AudioTrack(0, "aac", 4, "quad", "eng")])
p = _bare() p = _bare()
enrich_from_probe(p, info) enrich_from_probe(p, info, _KB)
assert p.audio_channels == "4ch" assert p.audio_channels == "4ch"
def test_unknown_audio_codec_uppercased(self): def test_unknown_audio_codec_uppercased(self):
info = MediaInfo(audio_tracks=[AudioTrack(0, "newcodec", 2, "stereo", "eng")]) info = MediaInfo(audio_tracks=[AudioTrack(0, "newcodec", 2, "stereo", "eng")])
p = _bare() p = _bare()
enrich_from_probe(p, info) enrich_from_probe(p, info, _KB)
assert p.audio_codec == "NEWCODEC" assert p.audio_codec == "NEWCODEC"
def test_no_audio_tracks(self): def test_no_audio_tracks(self):
p = _bare() p = _bare()
enrich_from_probe(p, MediaInfo()) enrich_from_probe(p, MediaInfo(), _KB)
assert p.audio_codec is None assert p.audio_codec is None
assert p.audio_channels is None assert p.audio_channels is None
def test_does_not_overwrite_existing_audio_fields(self): def test_does_not_overwrite_existing_audio_fields(self):
info = MediaInfo(audio_tracks=[AudioTrack(0, "ac3", 6, "5.1", "eng")]) info = MediaInfo(audio_tracks=[AudioTrack(0, "ac3", 6, "5.1", "eng")])
p = _bare(audio_codec="DTS-HD.MA", audio_channels="7.1") p = _bare(audio_codec="DTS-HD.MA", audio_channels="7.1")
enrich_from_probe(p, info) enrich_from_probe(p, info, _KB)
assert p.audio_codec == "DTS-HD.MA" assert p.audio_codec == "DTS-HD.MA"
assert p.audio_channels == "7.1" assert p.audio_channels == "7.1"
@@ -175,7 +178,7 @@ class TestLanguages:
] ]
) )
p = _bare() p = _bare()
enrich_from_probe(p, info) enrich_from_probe(p, info, _KB)
assert p.languages == ["eng", "fre"] assert p.languages == ["eng", "fre"]
def test_skips_und(self): def test_skips_und(self):
@@ -186,7 +189,7 @@ class TestLanguages:
] ]
) )
p = _bare() p = _bare()
enrich_from_probe(p, info) enrich_from_probe(p, info, _KB)
assert p.languages == ["eng"] assert p.languages == ["eng"]
def test_dedup_against_existing_case_insensitive(self): def test_dedup_against_existing_case_insensitive(self):
@@ -201,13 +204,13 @@ class TestLanguages:
) )
p = _bare() p = _bare()
p.languages = ["ENG"] p.languages = ["ENG"]
enrich_from_probe(p, info) enrich_from_probe(p, info, _KB)
# "eng" → upper "ENG" already present → skipped. "fre" → "FRE" new → kept. # "eng" → upper "ENG" already present → skipped. "fre" → "FRE" new → kept.
assert p.languages == ["ENG", "fre"] assert p.languages == ["ENG", "fre"]
def test_no_audio_tracks_leaves_languages_empty(self): def test_no_audio_tracks_leaves_languages_empty(self):
p = _bare() p = _bare()
enrich_from_probe(p, MediaInfo()) enrich_from_probe(p, MediaInfo(), _KB)
assert p.languages == [] assert p.languages == []
@@ -224,7 +227,7 @@ class TestTechString:
def test_rebuilt_from_filled_quality_and_codec(self): def test_rebuilt_from_filled_quality_and_codec(self):
p = _bare() p = _bare()
enrich_from_probe( enrich_from_probe(
p, _info_with_video(width=1920, height=1080, codec="hevc") p, _info_with_video(width=1920, height=1080, codec="hevc"), _KB
) )
assert p.quality == "1080p" assert p.quality == "1080p"
assert p.codec == "x265" assert p.codec == "x265"
@@ -234,7 +237,7 @@ class TestTechString:
# Token-level source must stay; probe fills only None fields. # Token-level source must stay; probe fills only None fields.
p = _bare(source="BluRay") p = _bare(source="BluRay")
enrich_from_probe( enrich_from_probe(
p, _info_with_video(width=1920, height=1080, codec="hevc") p, _info_with_video(width=1920, height=1080, codec="hevc"), _KB
) )
assert p.tech_string == "1080p.BluRay.x265" assert p.tech_string == "1080p.BluRay.x265"
@@ -242,10 +245,10 @@ class TestTechString:
# No video info → nothing to fill → derived tech_string stays as it was. # No video info → nothing to fill → derived tech_string stays as it was.
p = _bare(quality="2160p", source="WEB-DL", codec="x265") p = _bare(quality="2160p", source="WEB-DL", codec="x265")
assert p.tech_string == "2160p.WEB-DL.x265" assert p.tech_string == "2160p.WEB-DL.x265"
enrich_from_probe(p, MediaInfo()) enrich_from_probe(p, MediaInfo(), _KB)
assert p.tech_string == "2160p.WEB-DL.x265" assert p.tech_string == "2160p.WEB-DL.x265"
def test_empty_when_nothing_known(self): def test_empty_when_nothing_known(self):
p = _bare() p = _bare()
enrich_from_probe(p, MediaInfo()) enrich_from_probe(p, MediaInfo(), _KB)
assert p.tech_string == "" assert p.tech_string == ""