diff --git a/CHANGELOG.md b/CHANGELOG.md index 1444904..ace31ce 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -57,6 +57,17 @@ callers). ### Changed +- **`ParsePath` enum renamed to `TokenizationRoute`.** The old name + collided with `pathlib.Path` in code-reading mental models, and was + one letter from `parse_path` (the field that holds the value) — making + it harder than it needed to be to spot the type vs the attribute. + ``TokenizationRoute`` says what it actually captures (DIRECT / + SANITIZED / AI = how the name reached the tokenizer), and the class + docstring now spells out the orthogonality with ``Road`` (EASY / + SHITTY / PATH_OF_PAIN, which captures parser confidence on + ``ParseReport``). The ``parse_path`` field name stays unchanged — + string values too — so YAML fixtures, the ``analyze_release`` tool + spec, and any external consumer are untouched. - **`enrich_from_probe` codec mappings moved to YAML.** The three hard-coded module dicts (`_VIDEO_CODEC_MAP`, `_AUDIO_CODEC_MAP`, `_CHANNEL_MAP`) translating ffprobe output to scene tokens diff --git a/alfred/domain/release/parser/scoring.py b/alfred/domain/release/parser/scoring.py index 4e27fc3..e3a23da 100644 --- a/alfred/domain/release/parser/scoring.py +++ b/alfred/domain/release/parser/scoring.py @@ -34,7 +34,7 @@ from .tokens import Token, TokenRole class Road(str, Enum): """How the parser handled a given release name. - Distinct from :class:`~alfred.domain.release.value_objects.ParsePath`, + Distinct from :class:`~alfred.domain.release.value_objects.TokenizationRoute`, which records the tokenization route (DIRECT / SANITIZED / AI). Road is about confidence in the *result*, not the *method*. """ diff --git a/alfred/domain/release/services.py b/alfred/domain/release/services.py index 50351a5..bfcc3c4 100644 --- a/alfred/domain/release/services.py +++ b/alfred/domain/release/services.py @@ -21,7 +21,7 @@ from __future__ import annotations from .parser import pipeline as _v2 from .parser import scoring as _scoring from .ports import ReleaseKnowledge -from .value_objects import MediaTypeToken, ParsedRelease, ParsePath, ParseReport +from .value_objects import MediaTypeToken, ParsedRelease, ParseReport, TokenizationRoute def parse_release( @@ -44,7 +44,7 @@ def parse_release( 3. Otherwise run the v2 pipeline: tokenize → annotate (EASY when a group schema is known, SHITTY otherwise) → assemble → score. """ - parse_path = ParsePath.DIRECT + parse_path = TokenizationRoute.DIRECT # Apostrophes inside titles ("Don't", "L'avare") are common and should # not push the release through the AI fallback. Strip them up front so @@ -53,11 +53,11 @@ def parse_release( working_name = name if "'" in working_name: working_name = working_name.replace("'", "") - parse_path = ParsePath.SANITIZED + parse_path = TokenizationRoute.SANITIZED clean, site_tag = _v2.strip_site_tag(working_name) if site_tag is not None: - parse_path = ParsePath.SANITIZED + parse_path = TokenizationRoute.SANITIZED if not _is_well_formed(clean, kb): parsed = ParsedRelease( @@ -75,7 +75,7 @@ def parse_release( group="UNKNOWN", media_type=MediaTypeToken.UNKNOWN, site_tag=site_tag, - parse_path=ParsePath.AI, + parse_path=TokenizationRoute.AI, ) report = ParseReport( confidence=0, diff --git a/alfred/domain/release/value_objects.py b/alfred/domain/release/value_objects.py index dca2807..243ecdc 100644 --- a/alfred/domain/release/value_objects.py +++ b/alfred/domain/release/value_objects.py @@ -40,9 +40,21 @@ class MediaTypeToken(str, Enum): UNKNOWN = "unknown" -class ParsePath(str, Enum): - """How a ``ParsedRelease`` was produced. ``str``-backed for the same - reasons as :class:`MediaTypeToken`.""" +class TokenizationRoute(str, Enum): + """How a ``ParsedRelease`` was produced. + + Records the **tokenization route** — i.e. whether the release name + was tokenized as-is (``DIRECT``), after a sanitization pass like + site-tag stripping or apostrophe removal (``SANITIZED``), or whether + structural parsing failed and an LLM rebuild is needed (``AI``). + + This is **orthogonal** to :class:`~alfred.domain.release.parser.scoring.Road` + (EASY / SHITTY / PATH_OF_PAIN), which captures parser confidence and + is recorded on :class:`ParseReport`. Both can vary independently — + a SANITIZED name can still land on the EASY road if a group schema + matches the tokens after stripping. + + ``str``-backed for the same reasons as :class:`MediaTypeToken`.""" DIRECT = "direct" SANITIZED = "sanitized" @@ -127,7 +139,7 @@ class ParsedRelease: site_tag: str | None = ( None # site watermark stripped from name, e.g. "TGx", "OxTorrent.vc" ) - parse_path: ParsePath = ParsePath.DIRECT + parse_path: TokenizationRoute = TokenizationRoute.DIRECT languages: list[str] = field(default_factory=list) # ["MULTI", "VFF"], ["FRENCH"], … audio_codec: str | None = None # "DTS-HD.MA", "DDP", "EAC3", … audio_channels: str | None = None # "5.1", "7.1", "2.0", … @@ -168,9 +180,9 @@ class ParsedRelease: f"ParsedRelease.media_type must be a MediaTypeToken, " f"got {type(self.media_type).__name__}: {self.media_type!r}" ) - if not isinstance(self.parse_path, ParsePath): + if not isinstance(self.parse_path, TokenizationRoute): raise ValidationError( - f"ParsedRelease.parse_path must be a ParsePath, " + f"ParsedRelease.parse_path must be a TokenizationRoute, " f"got {type(self.parse_path).__name__}: {self.parse_path!r}" ) diff --git a/tests/domain/release/test_parser_v2_scoring.py b/tests/domain/release/test_parser_v2_scoring.py index 85a19d5..9dca58b 100644 --- a/tests/domain/release/test_parser_v2_scoring.py +++ b/tests/domain/release/test_parser_v2_scoring.py @@ -22,8 +22,8 @@ from alfred.domain.release.services import parse_release from alfred.domain.release.value_objects import ( MediaTypeToken, ParsedRelease, - ParsePath, ParseReport, + TokenizationRoute, ) from alfred.domain.shared.exceptions import ValidationError from alfred.infrastructure.knowledge.release_kb import YamlReleaseKnowledge @@ -79,7 +79,7 @@ def _movie(year: int = 2020, **overrides) -> ParsedRelease: codec="x264", group="GROUP", media_type=MediaTypeToken.MOVIE, - parse_path=ParsePath.DIRECT, + parse_path=TokenizationRoute.DIRECT, ) base.update(overrides) return ParsedRelease(**base) @@ -120,7 +120,7 @@ class TestComputeScore: codec="x265", group="KONTRAST", media_type=MediaTypeToken.TV_SHOW, - parse_path=ParsePath.DIRECT, + parse_path=TokenizationRoute.DIRECT, ) tokens = [ Token("Oz", 0, TokenRole.TITLE), @@ -230,7 +230,7 @@ class TestCollectors: codec=None, group="UNKNOWN", media_type=MediaTypeToken.UNKNOWN, - parse_path=ParsePath.DIRECT, + parse_path=TokenizationRoute.DIRECT, ) assert set(collect_missing_critical(empty)) == { "title",