refactor(release): rename ParsePath enum to TokenizationRoute
ParsePath collided with pathlib.Path in mental models, and differed only
in casing and an underscore from the parse_path attribute that stores its
value — confusion
on confusion. Road (EASY/SHITTY/PATH_OF_PAIN) is the parser-confidence
axis; TokenizationRoute (DIRECT/SANITIZED/AI) is the tokenization-method
axis. They're orthogonal and the new name makes that obvious.
Field name parse_path stays — it's the right name for the attribute
that *holds* the route. String values ("direct", "sanitized", "ai")
stay too, so YAML fixtures and the analyze_release tool spec are
unchanged. Only the type symbol changes:
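- value_objects.py: class rename + docstring spelling out orthogonality
with Road; ParsedRelease.parse_path annotation, default, and the
__post_init__-style type check (including its error message) updated.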
- services.py: import + 4 call sites.
- scoring.py: docstring cross-reference updated.
- tests/domain/release/test_parser_v2_scoring.py: import + 3 call sites.
This commit is contained in:
@@ -57,6 +57,17 @@ callers).
|
|||||||
|
|
||||||
### Changed
|
### Changed
|
||||||
|
|
||||||
|
- **`ParsePath` enum renamed to `TokenizationRoute`.** The old name
|
||||||
|
collided with `pathlib.Path` in code-reading mental models, and was
|
||||||
|
one letter from `parse_path` (the field that holds the value) — making
|
||||||
|
it harder than it needed to be to spot the type vs the attribute.
|
||||||
|
``TokenizationRoute`` says what it actually captures (DIRECT /
|
||||||
|
SANITIZED / AI = how the name reached the tokenizer), and the class
|
||||||
|
docstring now spells out the orthogonality with ``Road`` (EASY /
|
||||||
|
SHITTY / PATH_OF_PAIN, which captures parser confidence on
|
||||||
|
``ParseReport``). The ``parse_path`` field name stays unchanged —
|
||||||
|
string values too — so YAML fixtures, the ``analyze_release`` tool
|
||||||
|
spec, and any external consumer are untouched.
|
||||||
- **`enrich_from_probe` codec mappings moved to YAML.** The three
|
- **`enrich_from_probe` codec mappings moved to YAML.** The three
|
||||||
hard-coded module dicts (`_VIDEO_CODEC_MAP`, `_AUDIO_CODEC_MAP`,
|
hard-coded module dicts (`_VIDEO_CODEC_MAP`, `_AUDIO_CODEC_MAP`,
|
||||||
`_CHANNEL_MAP`) translating ffprobe output to scene tokens
|
`_CHANNEL_MAP`) translating ffprobe output to scene tokens
|
||||||
|
|||||||
@@ -34,7 +34,7 @@ from .tokens import Token, TokenRole
|
|||||||
class Road(str, Enum):
|
class Road(str, Enum):
|
||||||
"""How the parser handled a given release name.
|
"""How the parser handled a given release name.
|
||||||
|
|
||||||
Distinct from :class:`~alfred.domain.release.value_objects.ParsePath`,
|
Distinct from :class:`~alfred.domain.release.value_objects.TokenizationRoute`,
|
||||||
which records the tokenization route (DIRECT / SANITIZED / AI). Road
|
which records the tokenization route (DIRECT / SANITIZED / AI). Road
|
||||||
is about confidence in the *result*, not the *method*.
|
is about confidence in the *result*, not the *method*.
|
||||||
"""
|
"""
|
||||||
|
|||||||
@@ -21,7 +21,7 @@ from __future__ import annotations
|
|||||||
from .parser import pipeline as _v2
|
from .parser import pipeline as _v2
|
||||||
from .parser import scoring as _scoring
|
from .parser import scoring as _scoring
|
||||||
from .ports import ReleaseKnowledge
|
from .ports import ReleaseKnowledge
|
||||||
from .value_objects import MediaTypeToken, ParsedRelease, ParsePath, ParseReport
|
from .value_objects import MediaTypeToken, ParsedRelease, ParseReport, TokenizationRoute
|
||||||
|
|
||||||
|
|
||||||
def parse_release(
|
def parse_release(
|
||||||
@@ -44,7 +44,7 @@ def parse_release(
|
|||||||
3. Otherwise run the v2 pipeline: tokenize → annotate (EASY when a
|
3. Otherwise run the v2 pipeline: tokenize → annotate (EASY when a
|
||||||
group schema is known, SHITTY otherwise) → assemble → score.
|
group schema is known, SHITTY otherwise) → assemble → score.
|
||||||
"""
|
"""
|
||||||
parse_path = ParsePath.DIRECT
|
parse_path = TokenizationRoute.DIRECT
|
||||||
|
|
||||||
# Apostrophes inside titles ("Don't", "L'avare") are common and should
|
# Apostrophes inside titles ("Don't", "L'avare") are common and should
|
||||||
# not push the release through the AI fallback. Strip them up front so
|
# not push the release through the AI fallback. Strip them up front so
|
||||||
@@ -53,11 +53,11 @@ def parse_release(
|
|||||||
working_name = name
|
working_name = name
|
||||||
if "'" in working_name:
|
if "'" in working_name:
|
||||||
working_name = working_name.replace("'", "")
|
working_name = working_name.replace("'", "")
|
||||||
parse_path = ParsePath.SANITIZED
|
parse_path = TokenizationRoute.SANITIZED
|
||||||
|
|
||||||
clean, site_tag = _v2.strip_site_tag(working_name)
|
clean, site_tag = _v2.strip_site_tag(working_name)
|
||||||
if site_tag is not None:
|
if site_tag is not None:
|
||||||
parse_path = ParsePath.SANITIZED
|
parse_path = TokenizationRoute.SANITIZED
|
||||||
|
|
||||||
if not _is_well_formed(clean, kb):
|
if not _is_well_formed(clean, kb):
|
||||||
parsed = ParsedRelease(
|
parsed = ParsedRelease(
|
||||||
@@ -75,7 +75,7 @@ def parse_release(
|
|||||||
group="UNKNOWN",
|
group="UNKNOWN",
|
||||||
media_type=MediaTypeToken.UNKNOWN,
|
media_type=MediaTypeToken.UNKNOWN,
|
||||||
site_tag=site_tag,
|
site_tag=site_tag,
|
||||||
parse_path=ParsePath.AI,
|
parse_path=TokenizationRoute.AI,
|
||||||
)
|
)
|
||||||
report = ParseReport(
|
report = ParseReport(
|
||||||
confidence=0,
|
confidence=0,
|
||||||
|
|||||||
@@ -40,9 +40,21 @@ class MediaTypeToken(str, Enum):
|
|||||||
UNKNOWN = "unknown"
|
UNKNOWN = "unknown"
|
||||||
|
|
||||||
|
|
||||||
class ParsePath(str, Enum):
|
class TokenizationRoute(str, Enum):
|
||||||
"""How a ``ParsedRelease`` was produced. ``str``-backed for the same
|
"""How a ``ParsedRelease`` was produced.
|
||||||
reasons as :class:`MediaTypeToken`."""
|
|
||||||
|
Records the **tokenization route** — i.e. whether the release name
|
||||||
|
was tokenized as-is (``DIRECT``), after a sanitization pass like
|
||||||
|
site-tag stripping or apostrophe removal (``SANITIZED``), or whether
|
||||||
|
structural parsing failed and an LLM rebuild is needed (``AI``).
|
||||||
|
|
||||||
|
This is **orthogonal** to :class:`~alfred.domain.release.parser.scoring.Road`
|
||||||
|
(EASY / SHITTY / PATH_OF_PAIN), which captures parser confidence and
|
||||||
|
is recorded on :class:`ParseReport`. Both can vary independently —
|
||||||
|
a SANITIZED name can still land on the EASY road if a group schema
|
||||||
|
matches the tokens after stripping.
|
||||||
|
|
||||||
|
``str``-backed for the same reasons as :class:`MediaTypeToken`."""
|
||||||
|
|
||||||
DIRECT = "direct"
|
DIRECT = "direct"
|
||||||
SANITIZED = "sanitized"
|
SANITIZED = "sanitized"
|
||||||
@@ -127,7 +139,7 @@ class ParsedRelease:
|
|||||||
site_tag: str | None = (
|
site_tag: str | None = (
|
||||||
None # site watermark stripped from name, e.g. "TGx", "OxTorrent.vc"
|
None # site watermark stripped from name, e.g. "TGx", "OxTorrent.vc"
|
||||||
)
|
)
|
||||||
parse_path: ParsePath = ParsePath.DIRECT
|
parse_path: TokenizationRoute = TokenizationRoute.DIRECT
|
||||||
languages: list[str] = field(default_factory=list) # ["MULTI", "VFF"], ["FRENCH"], …
|
languages: list[str] = field(default_factory=list) # ["MULTI", "VFF"], ["FRENCH"], …
|
||||||
audio_codec: str | None = None # "DTS-HD.MA", "DDP", "EAC3", …
|
audio_codec: str | None = None # "DTS-HD.MA", "DDP", "EAC3", …
|
||||||
audio_channels: str | None = None # "5.1", "7.1", "2.0", …
|
audio_channels: str | None = None # "5.1", "7.1", "2.0", …
|
||||||
@@ -168,9 +180,9 @@ class ParsedRelease:
|
|||||||
f"ParsedRelease.media_type must be a MediaTypeToken, "
|
f"ParsedRelease.media_type must be a MediaTypeToken, "
|
||||||
f"got {type(self.media_type).__name__}: {self.media_type!r}"
|
f"got {type(self.media_type).__name__}: {self.media_type!r}"
|
||||||
)
|
)
|
||||||
if not isinstance(self.parse_path, ParsePath):
|
if not isinstance(self.parse_path, TokenizationRoute):
|
||||||
raise ValidationError(
|
raise ValidationError(
|
||||||
f"ParsedRelease.parse_path must be a ParsePath, "
|
f"ParsedRelease.parse_path must be a TokenizationRoute, "
|
||||||
f"got {type(self.parse_path).__name__}: {self.parse_path!r}"
|
f"got {type(self.parse_path).__name__}: {self.parse_path!r}"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -22,8 +22,8 @@ from alfred.domain.release.services import parse_release
|
|||||||
from alfred.domain.release.value_objects import (
|
from alfred.domain.release.value_objects import (
|
||||||
MediaTypeToken,
|
MediaTypeToken,
|
||||||
ParsedRelease,
|
ParsedRelease,
|
||||||
ParsePath,
|
|
||||||
ParseReport,
|
ParseReport,
|
||||||
|
TokenizationRoute,
|
||||||
)
|
)
|
||||||
from alfred.domain.shared.exceptions import ValidationError
|
from alfred.domain.shared.exceptions import ValidationError
|
||||||
from alfred.infrastructure.knowledge.release_kb import YamlReleaseKnowledge
|
from alfred.infrastructure.knowledge.release_kb import YamlReleaseKnowledge
|
||||||
@@ -79,7 +79,7 @@ def _movie(year: int = 2020, **overrides) -> ParsedRelease:
|
|||||||
codec="x264",
|
codec="x264",
|
||||||
group="GROUP",
|
group="GROUP",
|
||||||
media_type=MediaTypeToken.MOVIE,
|
media_type=MediaTypeToken.MOVIE,
|
||||||
parse_path=ParsePath.DIRECT,
|
parse_path=TokenizationRoute.DIRECT,
|
||||||
)
|
)
|
||||||
base.update(overrides)
|
base.update(overrides)
|
||||||
return ParsedRelease(**base)
|
return ParsedRelease(**base)
|
||||||
@@ -120,7 +120,7 @@ class TestComputeScore:
|
|||||||
codec="x265",
|
codec="x265",
|
||||||
group="KONTRAST",
|
group="KONTRAST",
|
||||||
media_type=MediaTypeToken.TV_SHOW,
|
media_type=MediaTypeToken.TV_SHOW,
|
||||||
parse_path=ParsePath.DIRECT,
|
parse_path=TokenizationRoute.DIRECT,
|
||||||
)
|
)
|
||||||
tokens = [
|
tokens = [
|
||||||
Token("Oz", 0, TokenRole.TITLE),
|
Token("Oz", 0, TokenRole.TITLE),
|
||||||
@@ -230,7 +230,7 @@ class TestCollectors:
|
|||||||
codec=None,
|
codec=None,
|
||||||
group="UNKNOWN",
|
group="UNKNOWN",
|
||||||
media_type=MediaTypeToken.UNKNOWN,
|
media_type=MediaTypeToken.UNKNOWN,
|
||||||
parse_path=ParsePath.DIRECT,
|
parse_path=TokenizationRoute.DIRECT,
|
||||||
)
|
)
|
||||||
assert set(collect_missing_critical(empty)) == {
|
assert set(collect_missing_critical(empty)) == {
|
||||||
"title",
|
"title",
|
||||||
|
|||||||
Reference in New Issue
Block a user