feat(release): parse_release returns (ParsedRelease, ParseReport)

Wire the scoring foundations into the parser entry point. parse_release now returns a tuple — the structural ParsedRelease and a diagnostic ParseReport carrying confidence (0-100), road (EASY / SHITTY / PATH_OF_PAIN), the residual UNKNOWN tokens, and the list of critical fields that couldn't be filled. EASY is decided structurally (a group schema matched), independently of the score. SHITTY vs PATH_OF_PAIN is decided by score against the 60 cutoff from scoring.yaml. Malformed names (forbidden chars) emit a zero-confidence PoP report and short-circuit to parse_path=AI as before. ParsePath stays as-is (DIRECT / SANITIZED / AI) — it records *how* we tokenized, not how confident we are. The two dimensions are now properly separated. Call sites propagated: - alfred/application/filesystem/resolve_destination.py (4 occurrences) - alfred/agent/tools/filesystem.py - tests/domain/test_release.py - tests/domain/test_release_fixtures.py - tests/application/test_detect_media_type.py New tests/domain/release/test_parser_v2_scoring.py (22 cases) locks ParseReport validation, compute_score arithmetic, decide_road thresholding, the collector helpers, and the end-to-end tuple contract.
2026-05-20 01:21:30 +02:00
parent 98c688f29b
commit b4c9efd13b
7 changed files with 336 additions and 22 deletions
@@ -0,0 +1,282 @@
+"""Phase A — parse-confidence scoring.
+
+These tests pin the score / road semantics without going through
+fixtures. They exercise the small pure functions in
+``alfred.domain.release.parser.scoring`` and the end-to-end contract
+that ``parse_release`` returns a ``(ParsedRelease, ParseReport)`` tuple.
+"""
+
+from __future__ import annotations
+
+import pytest
+
+from alfred.domain.release.parser.scoring import (
+    Road,
+    collect_missing_critical,
+    collect_unknown_tokens,
+    compute_score,
+    decide_road,
+)
+from alfred.domain.release.parser.tokens import Token, TokenRole
+from alfred.domain.release.services import parse_release
+from alfred.domain.release.value_objects import (
+    MediaTypeToken,
+    ParsedRelease,
+    ParsePath,
+    ParseReport,
+)
+from alfred.domain.shared.exceptions import ValidationError
+from alfred.infrastructure.knowledge.release_kb import YamlReleaseKnowledge
+
+_KB = YamlReleaseKnowledge()
+
+
+# --------------------------------------------------------------------- #
+# ParseReport VO                                                        #
+# --------------------------------------------------------------------- #
+
+
+class TestParseReport:
+    def test_construct_with_defaults(self) -> None:
+        report = ParseReport(confidence=80, road="easy")
+        assert report.confidence == 80
+        assert report.road == "easy"
+        assert report.unknown_tokens == ()
+        assert report.missing_critical == ()
+
+    def test_is_frozen(self) -> None:
+        report = ParseReport(confidence=50, road="shitty")
+        with pytest.raises(Exception):  # FrozenInstanceError
+            report.confidence = 99  # type: ignore[misc]
+
+    def test_confidence_lower_bound(self) -> None:
+        with pytest.raises(ValidationError):
+            ParseReport(confidence=-1, road="easy")
+
+    def test_confidence_upper_bound(self) -> None:
+        with pytest.raises(ValidationError):
+            ParseReport(confidence=101, road="easy")
+
+
+# --------------------------------------------------------------------- #
+# compute_score                                                         #
+# --------------------------------------------------------------------- #
+
+
+def _movie(year: int = 2020, **overrides) -> ParsedRelease:
+    """Build a populated movie ParsedRelease for scoring tests."""
+    base = dict(
+        raw="Inception.2010.1080p.BluRay.x264-GROUP",
+        normalised="Inception.2010.1080p.BluRay.x264-GROUP",
+        title="Inception",
+        title_sanitized="Inception",
+        year=year,
+        season=None,
+        episode=None,
+        episode_end=None,
+        quality="1080p",
+        source="BluRay",
+        codec="x264",
+        group="GROUP",
+        tech_string="1080p.BluRay.x264",
+        media_type=MediaTypeToken.MOVIE.value,
+        parse_path=ParsePath.DIRECT.value,
+    )
+    base.update(overrides)
+    return ParsedRelease(**base)
+
+
+def _all_annotated() -> list[Token]:
+    """Token stream where everything is annotated — zero penalty."""
+    return [
+        Token("Inception", 0, TokenRole.TITLE),
+        Token("2010", 1, TokenRole.YEAR),
+        Token("1080p", 2, TokenRole.RESOLUTION),
+        Token("BluRay", 3, TokenRole.SOURCE),
+        Token("x264", 4, TokenRole.CODEC),
+        Token("GROUP", 5, TokenRole.GROUP),
+    ]
+
+
+class TestComputeScore:
+    def test_fully_populated_movie_scores_high(self) -> None:
+        parsed = _movie()
+        score = compute_score(parsed, _all_annotated(), _KB)
+        # title 30 + media_type 20 + year 15 + resolution 5 + source 5
+        # + codec 5 + group 5 = 85
+        assert score == 85
+
+    def test_tv_show_gets_season_and_episode_weight(self) -> None:
+        parsed = ParsedRelease(
+            raw="Oz.S01E01.1080p.WEBRip.x265-KONTRAST",
+            normalised="Oz.S01E01.1080p.WEBRip.x265-KONTRAST",
+            title="Oz",
+            title_sanitized="Oz",
+            year=None,
+            season=1,
+            episode=1,
+            episode_end=None,
+            quality="1080p",
+            source="WEBRip",
+            codec="x265",
+            group="KONTRAST",
+            tech_string="1080p.WEBRip.x265",
+            media_type=MediaTypeToken.TV_SHOW.value,
+            parse_path=ParsePath.DIRECT.value,
+        )
+        tokens = [
+            Token("Oz", 0, TokenRole.TITLE),
+            Token("S01E01", 1, TokenRole.SEASON_EPISODE),
+            Token("1080p", 2, TokenRole.RESOLUTION),
+            Token("WEBRip", 3, TokenRole.SOURCE),
+            Token("x265", 4, TokenRole.CODEC),
+            Token("KONTRAST", 5, TokenRole.GROUP),
+        ]
+        score = compute_score(parsed, tokens, _KB)
+        # title 30 + media_type 20 + season 10 + episode 5 + resolution 5
+        # + source 5 + codec 5 + group 5 = 85 (no year)
+        assert score == 85
+
+    def test_unknown_tokens_subtract_penalty(self) -> None:
+        parsed = _movie()
+        tokens = _all_annotated() + [
+            Token("noise", 6, TokenRole.UNKNOWN),
+            Token("more", 7, TokenRole.UNKNOWN),
+        ]
+        score = compute_score(parsed, tokens, _KB)
+        # 85 baseline - 2*5 unknown tokens = 75
+        assert score == 75
+
+    def test_unknown_penalty_capped(self) -> None:
+        parsed = _movie()
+        # 20 unknown tokens × 5 = 100 raw, capped at 30
+        tokens = _all_annotated() + [
+            Token(f"t{i}", 6 + i, TokenRole.UNKNOWN) for i in range(20)
+        ]
+        score = compute_score(parsed, tokens, _KB)
+        assert score == 85 - 30
+
+    def test_score_clamped_to_zero(self) -> None:
+        # Empty-ish parse with lots of unknown tokens
+        parsed = _movie(year=None, quality=None, source=None, codec=None)
+        tokens = [Token(f"t{i}", i, TokenRole.UNKNOWN) for i in range(10)]
+        score = compute_score(parsed, tokens, _KB)
+        # title 30 + media_type 20 + group 5 = 55, -30 cap = 25
+        # Sanity: still clamped at 0 minimum even if math goes weird
+        assert 0 <= score <= 100
+
+    def test_unknown_media_type_does_not_count(self) -> None:
+        parsed = _movie(media_type=MediaTypeToken.UNKNOWN.value)
+        score = compute_score(parsed, _all_annotated(), _KB)
+        # Loses the 20 of media_type vs baseline
+        assert score == 85 - 20
+
+    def test_unknown_group_does_not_count(self) -> None:
+        parsed = _movie(group="UNKNOWN")
+        score = compute_score(parsed, _all_annotated(), _KB)
+        assert score == 85 - 5
+
+
+# --------------------------------------------------------------------- #
+# decide_road                                                           #
+# --------------------------------------------------------------------- #
+
+
+class TestDecideRoad:
+    def test_known_schema_is_easy_regardless_of_score(self) -> None:
+        # Even a terrible score returns EASY when a schema matched.
+        assert decide_road(score=0, has_schema=True, kb=_KB) is Road.EASY
+
+    def test_no_schema_high_score_is_shitty(self) -> None:
+        assert decide_road(score=80, has_schema=False, kb=_KB) is Road.SHITTY
+
+    def test_no_schema_low_score_is_pop(self) -> None:
+        assert decide_road(score=10, has_schema=False, kb=_KB) is Road.PATH_OF_PAIN
+
+    def test_threshold_boundary_is_inclusive(self) -> None:
+        threshold = _KB.scoring["thresholds"]["shitty_min"]
+        assert decide_road(threshold, has_schema=False, kb=_KB) is Road.SHITTY
+        assert (
+            decide_road(threshold - 1, has_schema=False, kb=_KB)
+            is Road.PATH_OF_PAIN
+        )
+
+
+# --------------------------------------------------------------------- #
+# Collectors                                                            #
+# --------------------------------------------------------------------- #
+
+
+class TestCollectors:
+    def test_collect_unknown_tokens_preserves_order(self) -> None:
+        tokens = [
+            Token("A", 0, TokenRole.TITLE),
+            Token("X", 1, TokenRole.UNKNOWN),
+            Token("B", 2, TokenRole.RESOLUTION),
+            Token("Y", 3, TokenRole.UNKNOWN),
+        ]
+        assert collect_unknown_tokens(tokens) == ("X", "Y")
+
+    def test_collect_missing_critical_full(self) -> None:
+        empty = ParsedRelease(
+            raw="x",
+            normalised="x",
+            title="",
+            title_sanitized="",
+            year=None,
+            season=None,
+            episode=None,
+            episode_end=None,
+            quality=None,
+            source=None,
+            codec=None,
+            group="UNKNOWN",
+            tech_string="",
+            media_type=MediaTypeToken.UNKNOWN.value,
+            parse_path=ParsePath.DIRECT.value,
+        )
+        assert set(collect_missing_critical(empty)) == {
+            "title",
+            "media_type",
+            "year",
+        }
+
+    def test_collect_missing_critical_none(self) -> None:
+        parsed = _movie()
+        assert collect_missing_critical(parsed) == ()
+
+
+# --------------------------------------------------------------------- #
+# End-to-end contract                                                   #
+# --------------------------------------------------------------------- #
+
+
+class TestParseReleaseReturnsReport:
+    def test_returns_tuple(self) -> None:
+        result = parse_release("Inception.2010.1080p.BluRay.x264-GROUP", _KB)
+        assert isinstance(result, tuple)
+        assert len(result) == 2
+        parsed, report = result
+        assert isinstance(parsed, ParsedRelease)
+        assert isinstance(report, ParseReport)
+
+    def test_known_group_is_easy_road(self) -> None:
+        # KONTRAST has a schema in release_groups/
+        _, report = parse_release(
+            "Oz.S03E01.1080p.WEBRip.x265-KONTRAST", _KB
+        )
+        assert report.road == Road.EASY.value
+        assert report.confidence > 0
+
+    def test_unknown_group_well_formed_is_shitty(self) -> None:
+        # No registered schema but well-formed scene name → SHITTY
+        _, report = parse_release(
+            "Inception.2010.1080p.BluRay.x264-NOSCHEMA", _KB
+        )
+        assert report.road == Road.SHITTY.value
+
+    def test_malformed_name_is_pop(self) -> None:
+        # Forbidden chars (@) — short-circuits to AI / PoP.
+        _, report = parse_release("garbage@#%name", _KB)
+        assert report.road == Road.PATH_OF_PAIN.value
+        assert report.confidence == 0