refactor(release): rename ParsePath enum to TokenizationRoute
ParsePath collided with pathlib.Path in mental models, and was one
letter from the parse_path attribute that stores its value — confusion
on confusion. Road (EASY/SHITTY/PATH_OF_PAIN) is the parser-confidence
axis; TokenizationRoute (DIRECT/SANITIZED/AI) is the tokenization-method
axis. They're orthogonal and the new name makes that obvious.
Field name parse_path stays — it's the right name for the attribute
that *holds* the route. String values ("direct", "sanitized", "ai")
stay too, so YAML fixtures and the analyze_release tool spec are
unchanged. Only the type symbol changes:
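- value_objects.py: class rename + docstring spelling out orthogonality
with Road; the parse_path annotation/default, the isinstance guard and
its ValidationError message now name TokenizationRoute.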
- services.py: import + 4 call sites.
- scoring.py: docstring cross-reference updated.
- tests/domain/release/test_parser_v2_scoring.py: import + 3 call sites.
This commit is contained in:
@@ -57,6 +57,17 @@ callers).
|
||||
|
||||
### Changed
|
||||
|
||||
- **`ParsePath` enum renamed to `TokenizationRoute`.** The old name
|
||||
collided with `pathlib.Path` in code-reading mental models, and was
|
||||
one letter from `parse_path` (the field that holds the value) — making
|
||||
it harder than it needed to be to spot the type vs the attribute.
|
||||
`TokenizationRoute` says what it actually captures (DIRECT /
|
||||
SANITIZED / AI = how the name reached the tokenizer), and the class
|
||||
docstring now spells out the orthogonality with `Road` (EASY /
|
||||
SHITTY / PATH_OF_PAIN, which captures parser confidence on
|
||||
`ParseReport`). The `parse_path` field name stays unchanged —
|
||||
string values too — so YAML fixtures, the `analyze_release` tool
|
||||
spec, and any external consumer are untouched.
|
||||
- **`enrich_from_probe` codec mappings moved to YAML.** The three
|
||||
hard-coded module dicts (`_VIDEO_CODEC_MAP`, `_AUDIO_CODEC_MAP`,
|
||||
`_CHANNEL_MAP`) translating ffprobe output to scene tokens
|
||||
|
||||
@@ -34,7 +34,7 @@ from .tokens import Token, TokenRole
|
||||
class Road(str, Enum):
|
||||
"""How the parser handled a given release name.
|
||||
|
||||
Distinct from :class:`~alfred.domain.release.value_objects.ParsePath`,
|
||||
Distinct from :class:`~alfred.domain.release.value_objects.TokenizationRoute`,
|
||||
which records the tokenization route (DIRECT / SANITIZED / AI). Road
|
||||
is about confidence in the *result*, not the *method*.
|
||||
"""
|
||||
|
||||
@@ -21,7 +21,7 @@ from __future__ import annotations
|
||||
from .parser import pipeline as _v2
|
||||
from .parser import scoring as _scoring
|
||||
from .ports import ReleaseKnowledge
|
||||
from .value_objects import MediaTypeToken, ParsedRelease, ParsePath, ParseReport
|
||||
from .value_objects import MediaTypeToken, ParsedRelease, ParseReport, TokenizationRoute
|
||||
|
||||
|
||||
def parse_release(
|
||||
@@ -44,7 +44,7 @@ def parse_release(
|
||||
3. Otherwise run the v2 pipeline: tokenize → annotate (EASY when a
|
||||
group schema is known, SHITTY otherwise) → assemble → score.
|
||||
"""
|
||||
parse_path = ParsePath.DIRECT
|
||||
parse_path = TokenizationRoute.DIRECT
|
||||
|
||||
# Apostrophes inside titles ("Don't", "L'avare") are common and should
|
||||
# not push the release through the AI fallback. Strip them up front so
|
||||
@@ -53,11 +53,11 @@ def parse_release(
|
||||
working_name = name
|
||||
if "'" in working_name:
|
||||
working_name = working_name.replace("'", "")
|
||||
parse_path = ParsePath.SANITIZED
|
||||
parse_path = TokenizationRoute.SANITIZED
|
||||
|
||||
clean, site_tag = _v2.strip_site_tag(working_name)
|
||||
if site_tag is not None:
|
||||
parse_path = ParsePath.SANITIZED
|
||||
parse_path = TokenizationRoute.SANITIZED
|
||||
|
||||
if not _is_well_formed(clean, kb):
|
||||
parsed = ParsedRelease(
|
||||
@@ -75,7 +75,7 @@ def parse_release(
|
||||
group="UNKNOWN",
|
||||
media_type=MediaTypeToken.UNKNOWN,
|
||||
site_tag=site_tag,
|
||||
parse_path=ParsePath.AI,
|
||||
parse_path=TokenizationRoute.AI,
|
||||
)
|
||||
report = ParseReport(
|
||||
confidence=0,
|
||||
|
||||
@@ -40,9 +40,21 @@ class MediaTypeToken(str, Enum):
|
||||
UNKNOWN = "unknown"
|
||||
|
||||
|
||||
class ParsePath(str, Enum):
|
||||
"""How a ``ParsedRelease`` was produced. ``str``-backed for the same
|
||||
reasons as :class:`MediaTypeToken`."""
|
||||
class TokenizationRoute(str, Enum):
|
||||
"""How a ``ParsedRelease`` was produced.
|
||||
|
||||
Records the **tokenization route** — i.e. whether the release name
|
||||
was tokenized as-is (``DIRECT``), after a sanitization pass like
|
||||
site-tag stripping or apostrophe removal (``SANITIZED``), or whether
|
||||
structural parsing failed and an LLM rebuild is needed (``AI``).
|
||||
|
||||
This is **orthogonal** to :class:`~alfred.domain.release.parser.scoring.Road`
|
||||
(EASY / SHITTY / PATH_OF_PAIN), which captures parser confidence and
|
||||
is recorded on :class:`ParseReport`. Both can vary independently —
|
||||
a SANITIZED name can still land on the EASY road if a group schema
|
||||
matches the tokens after stripping.
|
||||
|
||||
``str``-backed for the same reasons as :class:`MediaTypeToken`."""
|
||||
|
||||
DIRECT = "direct"
|
||||
SANITIZED = "sanitized"
|
||||
@@ -127,7 +139,7 @@ class ParsedRelease:
|
||||
site_tag: str | None = (
|
||||
None # site watermark stripped from name, e.g. "TGx", "OxTorrent.vc"
|
||||
)
|
||||
parse_path: ParsePath = ParsePath.DIRECT
|
||||
parse_path: TokenizationRoute = TokenizationRoute.DIRECT
|
||||
languages: list[str] = field(default_factory=list) # ["MULTI", "VFF"], ["FRENCH"], …
|
||||
audio_codec: str | None = None # "DTS-HD.MA", "DDP", "EAC3", …
|
||||
audio_channels: str | None = None # "5.1", "7.1", "2.0", …
|
||||
@@ -168,9 +180,9 @@ class ParsedRelease:
|
||||
f"ParsedRelease.media_type must be a MediaTypeToken, "
|
||||
f"got {type(self.media_type).__name__}: {self.media_type!r}"
|
||||
)
|
||||
if not isinstance(self.parse_path, ParsePath):
|
||||
if not isinstance(self.parse_path, TokenizationRoute):
|
||||
raise ValidationError(
|
||||
f"ParsedRelease.parse_path must be a ParsePath, "
|
||||
f"ParsedRelease.parse_path must be a TokenizationRoute, "
|
||||
f"got {type(self.parse_path).__name__}: {self.parse_path!r}"
|
||||
)
|
||||
|
||||
|
||||
@@ -22,8 +22,8 @@ from alfred.domain.release.services import parse_release
|
||||
from alfred.domain.release.value_objects import (
|
||||
MediaTypeToken,
|
||||
ParsedRelease,
|
||||
ParsePath,
|
||||
ParseReport,
|
||||
TokenizationRoute,
|
||||
)
|
||||
from alfred.domain.shared.exceptions import ValidationError
|
||||
from alfred.infrastructure.knowledge.release_kb import YamlReleaseKnowledge
|
||||
@@ -79,7 +79,7 @@ def _movie(year: int = 2020, **overrides) -> ParsedRelease:
|
||||
codec="x264",
|
||||
group="GROUP",
|
||||
media_type=MediaTypeToken.MOVIE,
|
||||
parse_path=ParsePath.DIRECT,
|
||||
parse_path=TokenizationRoute.DIRECT,
|
||||
)
|
||||
base.update(overrides)
|
||||
return ParsedRelease(**base)
|
||||
@@ -120,7 +120,7 @@ class TestComputeScore:
|
||||
codec="x265",
|
||||
group="KONTRAST",
|
||||
media_type=MediaTypeToken.TV_SHOW,
|
||||
parse_path=ParsePath.DIRECT,
|
||||
parse_path=TokenizationRoute.DIRECT,
|
||||
)
|
||||
tokens = [
|
||||
Token("Oz", 0, TokenRole.TITLE),
|
||||
@@ -230,7 +230,7 @@ class TestCollectors:
|
||||
codec=None,
|
||||
group="UNKNOWN",
|
||||
media_type=MediaTypeToken.UNKNOWN,
|
||||
parse_path=ParsePath.DIRECT,
|
||||
parse_path=TokenizationRoute.DIRECT,
|
||||
)
|
||||
assert set(collect_missing_critical(empty)) == {
|
||||
"title",
|
||||
|
||||
Reference in New Issue
Block a user