From 075a827b0ed82282afd1f24820ba2efe9fbdd161 Mon Sep 17 00:00:00 2001 From: Francwa Date: Wed, 20 May 2026 00:21:11 +0200 Subject: [PATCH] feat(release): wire v2 EASY path for known release groups MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The annotate-based v2 pipeline now handles releases ending in -KONTRAST, -ELiTE, or -RARBG. Unknown groups still fall through to the legacy SHITTY heuristic in services.py — nothing changes for them. Pipeline (alfred/domain/release/parser/pipeline.py): - tokenize(): string-ops separator split, strips [site.tag] first. - annotate(): right-to-left group detection (priority to codec-GROUP shape, fallback to any non-source dashed token), GroupSchema lookup via the kb port, then lockstep walk of tokens against schema chunks. Optional chunks skip on mismatch, mandatory mismatches return None so the caller falls back gracefully. CODEC pre-consumed by a codec-GROUP trailing token correctly skips the CODEC chunk in the body walk. - assemble(): folds annotated tokens into a ParsedRelease-compatible dict (title joined by '.', group from the codec-GROUP token's extras). Schema (alfred/domain/release/parser/schema.py): - GroupSchema + SchemaChunk frozen value objects. - TokenRole.GROUP added. Port + adapter: - ReleaseKnowledge.group_schema(name) lookup added (case-insensitive). - YamlReleaseKnowledge loads alfred/knowledge/release/release_groups/ *.yaml at construction time; learned overrides in data/knowledge/release/release_groups/ also picked up. Knowledge: - release_groups/kontrast.yaml, elite.yaml, rarbg.yaml declare the canonical chunk_order. ELiTE marks source as optional (Foundation.S02 has no WEBRip token). Services: - parse_release tries the v2 path first; on None falls through to the legacy implementation untouched. 
Tests: - tests/domain/release/test_parser_v2_easy.py (10 cases) cover group detection (codec-GROUP, dashed-source skip, no-dash → unknown), schema-driven annotation (movie, TV episode, season pack with optional source, unknown group returns None), and field assembly. - Existing tests/domain/test_release_fixtures.py (30 cases) stay green: 5 EASY fixtures now produced by v2, 25 SHITTY/PATH OF PAIN fixtures still produced by the legacy path. Verified via spy on v2.assemble. Suite: 1007 passed, 8 skipped. Refs: project_release_parser_v2_specs (memory) --- CHANGELOG.md | 34 +- alfred/domain/release/parser/__init__.py | 3 +- alfred/domain/release/parser/pipeline.py | 414 +++++++++++++++--- alfred/domain/release/parser/schema.py | 47 ++ alfred/domain/release/ports/knowledge.py | 16 +- alfred/domain/release/services.py | 18 + alfred/infrastructure/knowledge/release.py | 23 + alfred/infrastructure/knowledge/release_kb.py | 33 ++ .../release/release_groups/elite.yaml | 22 + .../release/release_groups/kontrast.yaml | 28 ++ .../release/release_groups/rarbg.yaml | 20 + tests/domain/release/test_parser_v2_easy.py | 142 ++++++ 12 files changed, 730 insertions(+), 70 deletions(-) create mode 100644 alfred/domain/release/parser/schema.py create mode 100644 alfred/knowledge/release/release_groups/elite.yaml create mode 100644 alfred/knowledge/release/release_groups/kontrast.yaml create mode 100644 alfred/knowledge/release/release_groups/rarbg.yaml create mode 100644 tests/domain/release/test_parser_v2_easy.py diff --git a/CHANGELOG.md b/CHANGELOG.md index a8d37ec..3420c02 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,15 +17,31 @@ callers). ### Added -- **Release parser v2 scaffolding** (`alfred/domain/release/parser/`): - new package laying the foundation for an annotate-based pipeline - (tokenize → annotate → assemble). 
Exposes `Token` (frozen VO with - `index` + `role` + `extra`), `TokenRole` enum (structural / technical / - meta families), and a `pipeline.py` module with working `strip_site_tag` - + `tokenize` and a documented `annotate` stub. Legacy `parse_release` - in `release.services` remains the live implementation until the - annotate step is wired in. Scaffolding tests in - `tests/domain/release/test_parser_v2_scaffolding.py`. +- **Release parser v2 — EASY path live** (`alfred/domain/release/parser/`): + new annotate-based pipeline (tokenize → annotate → assemble) drives + releases from known groups. Exposes `Token` (frozen VO with `index` + + `role` + `extra`), `TokenRole` enum (structural/technical/meta families), + and `GroupSchema` / `SchemaChunk` value objects. + - `pipeline.tokenize`: string-ops separator split (no regex), strips + a `[site.tag]` prefix/suffix first. + - `pipeline.annotate`: detects the trailing group right-to-left + (priority to `codec-GROUP` shape, fallback to any non-source dashed + token), looks up its `GroupSchema`, then walks tokens and schema + chunks in lockstep — optional chunks that don't match are skipped, + mandatory mismatches abort EASY and return `None` so the caller can + fall back to SHITTY. + - `pipeline.assemble`: folds annotated tokens into a + `ParsedRelease`-compatible dict. + - `parse_release` (in `release.services`) tries the v2 EASY path first + and falls through to the legacy SHITTY heuristic on `None`. Legacy + SHITTY/PATH OF PAIN behavior is unchanged. + - Knowledge: `alfred/knowledge/release/release_groups/{kontrast,elite, + rarbg}.yaml` declare the canonical chunk order per group, loaded via + new `ReleaseKnowledge.group_schema(name)` port method. + - Tests in `tests/domain/release/test_parser_v2_{scaffolding,easy}.py` + cover token VOs, site-tag stripping, group detection, schema-driven + annotation (movie, TV episode, season pack with optional source), + and field assembly. 
- **Real-world release fixtures** under `tests/fixtures/releases/{easy,shitty,path_of_pain}/`, each documenting an expected `ParsedRelease` plus the future `routing` diff --git a/alfred/domain/release/parser/__init__.py b/alfred/domain/release/parser/__init__.py index 24b33b2..37558c1 100644 --- a/alfred/domain/release/parser/__init__.py +++ b/alfred/domain/release/parser/__init__.py @@ -25,6 +25,7 @@ production until each piece of the v2 pipeline is wired in. from __future__ import annotations +from .schema import GroupSchema, SchemaChunk from .tokens import Token, TokenRole -__all__ = ["Token", "TokenRole"] +__all__ = ["GroupSchema", "SchemaChunk", "Token", "TokenRole"] diff --git a/alfred/domain/release/parser/pipeline.py b/alfred/domain/release/parser/pipeline.py index 97e3c21..2b63a25 100644 --- a/alfred/domain/release/parser/pipeline.py +++ b/alfred/domain/release/parser/pipeline.py @@ -1,43 +1,40 @@ -"""Annotate-based pipeline skeleton. +"""Annotate-based pipeline. -The pipeline is **declared here** in three named stages, but actual logic -is wired in incrementally — current state is intentional scaffolding. +Three stages: -Stages: +1. :func:`tokenize` — release name → ``list[Token]`` (all UNKNOWN), plus + a separately-returned site tag (e.g. ``[YTS.MX]``) that is never + tokenized. +2. :func:`annotate` — promote each token's :class:`TokenRole` using the + injected knowledge base. Group detection is right-to-left; if the + group has a registered :class:`GroupSchema` we run :func:`_annotate_easy` + (schema-driven, lockstep walk); otherwise we return the tokens with + only the group annotated and the caller falls back to SHITTY in + :func:`_legacy_assemble` (see :mod:`..services`). +3. :func:`assemble` — fold annotated tokens into a + :class:`~alfred.domain.release.value_objects.ParsedRelease`. -1. :func:`tokenize` — release name → ``list[Token]`` (all UNKNOWN). Also - pulls out a leading/trailing site tag (e.g. 
``[YTS.MX]``) which is - returned separately and never tokenized. -2. :func:`annotate` — walk the tokens, promote roles using - :class:`~alfred.domain.release.ports.knowledge.ReleaseKnowledge`. The - walk is **right-to-left for the group** (scene convention puts it - last) and **left-to-right for the title** (which is always leftmost). -3. :func:`assemble` — fold the annotated stream into a domain VO. Output - type still TBD: the migration target is the existing - :class:`~alfred.domain.release.value_objects.ParsedRelease`, but the - pipeline may grow an intermediate :class:`AnnotatedRelease` first to - keep the score / leftover-tokens information that ``ParsedRelease`` - doesn't carry today. - -Road dispatch (EASY / SHITTY / PATH OF PAIN) happens **inside** -:func:`annotate` — once the group is identified (or not), the annotator -picks the right strategy. EASY consults a per-group schema; SHITTY runs -the generic matcher loop; PATH OF PAIN is a return-state, not a -separate path — the caller (``application/release/inspect.py``) decides -what to do with a low-confidence result. +The pipeline is **pure**: no I/O, no TMDB, no probe. All knowledge +arrives through ``kb: ReleaseKnowledge``. """ from __future__ import annotations from ..ports.knowledge import ReleaseKnowledge -from .tokens import Token +from .schema import GroupSchema +from .tokens import Token, TokenRole + + +# --------------------------------------------------------------------------- +# Stage 1 — tokenize +# --------------------------------------------------------------------------- def strip_site_tag(name: str) -> tuple[str, str | None]: """Split off a ``[site.tag]`` prefix or suffix. - The bracketed substring is removed from ``name`` and returned as the - second element. If no tag is found, returns ``(name.strip(), None)``. + Returns ``(clean_name, tag)``. If no tag is found, returns + ``(name.strip(), None)``. 
""" s = name.strip() @@ -63,19 +60,12 @@ def strip_site_tag(name: str) -> tuple[str, str | None]: def tokenize(name: str, kb: ReleaseKnowledge) -> tuple[list[Token], str | None]: """Split ``name`` into tokens after stripping any site tag. - Returns ``(tokens, site_tag)``. All tokens start with role - :attr:`~.tokens.TokenRole.UNKNOWN` — promotion happens in - :func:`annotate`. - - The tokenizer is a pure character-class split on ``kb.separators``. - String-ops style: no regex (keeps the rule from CLAUDE.md), at the - cost of one pass per separator. The release names we parse are short - (<200 chars), so the constant factor is irrelevant. + String-ops style: replace every configured separator with a single + NUL byte then split. NUL cannot legally appear in a release name, so + it's a safe sentinel. """ clean, site_tag = strip_site_tag(name) - # Replace every separator with a single delimiter, then split. Using - # \x00 because it cannot legally appear in a release name. DELIM = "\x00" buf = clean for sep in kb.separators: @@ -87,29 +77,335 @@ def tokenize(name: str, kb: ReleaseKnowledge) -> tuple[list[Token], str | None]: return tokens, site_tag -def annotate(tokens: list[Token], kb: ReleaseKnowledge) -> list[Token]: - """Promote each token's role using ``kb``. +# --------------------------------------------------------------------------- +# Stage 2 — annotate +# --------------------------------------------------------------------------- - **Not implemented yet.** Returns the input unchanged so the package - is importable and the pipeline shape is visible. Will be filled in - by subsequent commits, one role family at a time. - The intended walk order, once implemented: +def _parse_season_episode(text: str) -> tuple[int, int | None, int | None] | None: + """Parse a single token as ``SxxExx`` / ``SxxExxExx`` / ``Sxx`` / ``NxNN``. - 1. **Group (right-to-left)** — find the trailing ``-GROUP`` token, - which also reveals the codec when shaped as ``codec-GROUP``. 
If - the group matches a schema in ``knowledge/release/release_groups/`` - → EASY path; otherwise SHITTY. - 2. **Season/episode** — single-token scan, ``S01E05`` / ``1x05``. - 3. **Year** — first 4-digit token in [1900, 2099] *after* index 0. - 4. **Tech tokens** — resolutions, sources, codecs, audio, video meta, - editions, languages. Multi-token sequences (``DTS.HD.MA``, - ``Directors.Cut``) handled first to avoid greedy single-token - claims swallowing a sequence prefix. - 5. **Title** — leftmost contiguous UNKNOWN tokens up to the first - structural/technical role boundary. + Returns ``(season, episode, episode_end)`` or ``None`` if the token + is not a season/episode marker. """ - # TODO(parser-v2): implement annotation. See module docstring for the - # walk order. Until then, the legacy parse_release in - # release.services is the live implementation. - return tokens + upper = text.upper() + + # SxxExx form + if len(upper) >= 3 and upper[0] == "S" and upper[1:3].isdigit(): + season = int(upper[1:3]) + rest = upper[3:] + + if not rest: + return season, None, None + + episodes: list[int] = [] + while rest.startswith("E") and len(rest) >= 3 and rest[1:3].isdigit(): + episodes.append(int(rest[1:3])) + rest = rest[3:] + + if not episodes: + return None + return season, episodes[0], episodes[1] if len(episodes) >= 2 else None + + # NxNN form + if "X" in upper: + parts = upper.split("X") + if len(parts) >= 2 and all(p.isdigit() and p for p in parts): + season = int(parts[0]) + episode = int(parts[1]) + episode_end = int(parts[2]) if len(parts) >= 3 else None + return season, episode, episode_end + + return None + + +def _is_year(text: str) -> bool: + """Return True if ``text`` is a 4-digit year in [1900, 2099].""" + return len(text) == 4 and text.isdigit() and 1900 <= int(text) <= 2099 + + +def _split_codec_group(text: str, kb: ReleaseKnowledge) -> tuple[str, str] | None: + """Split a ``codec-GROUP`` token into ``(codec, group)`` if it fits. 
+ + Returns ``None`` if the token doesn't match the ``codec-GROUP`` + shape. Handles the empty-group case (``x265-``) as ``(codec, "")``. + """ + if "-" not in text: + return None + head, _, tail = text.rpartition("-") + if head.lower() in kb.codecs: + return head, tail + return None + + +def _detect_group(tokens: list[Token], kb: ReleaseKnowledge) -> tuple[str, int | None]: + """Identify the release group by walking tokens right-to-left. + + Returns ``(group_name, token_index_carrying_group)`` — the index is + ``None`` when the group is missing entirely (no trailing ``-`` token + in the stream). + + Priority: + 1. Rightmost token of shape ``codec-GROUP`` (clearest signal). + 2. Rightmost token containing ``-`` whose head is *not* a known + source token (Web-DL etc. shouldn't be confused with a group). + """ + # Priority 1: codec-GROUP + for tok in reversed(tokens): + split = _split_codec_group(tok.text, kb) + if split is not None: + _, group = split + return (group or "UNKNOWN"), tok.index + + # Priority 2: rightmost dash, excluding known dashed sources + for tok in reversed(tokens): + if "-" not in tok.text: + continue + head, _, tail = tok.text.rpartition("-") + # Skip dashed-source tokens like "Web-DL" + if ( + head.lower() in kb.sources + or tok.text.lower().replace("-", "") in kb.sources + ): + continue + if tail: + return tail, tok.index + + return "UNKNOWN", None + + +def _annotate_easy( + tokens: list[Token], + kb: ReleaseKnowledge, + schema: GroupSchema, + group_token_index: int, +) -> list[Token] | None: + """Annotate tokens following a known group schema (EASY path). + + Returns the new token list on success, or ``None`` if the schema + walk fails — a mandatory chunk that doesn't match aborts EASY and + lets the caller fall back to SHITTY without crashing. + """ + result = list(tokens) + + # The codec-GROUP token is special: it carries TWO roles (CODEC + + # GROUP). 
We split it conceptually and tag it as CODEC here; the + # group itself is propagated via ``extra["group"]`` so the assemble + # step can recover both pieces from one token. When we do this, + # ``codec_pre_consumed`` is True so the schema walk knows to skip + # the CODEC chunk (it has nothing left to match in the body). + group_token = result[group_token_index] + cg_split = _split_codec_group(group_token.text, kb) + codec_pre_consumed = False + if cg_split is not None: + codec, group = cg_split + result[group_token_index] = group_token.with_role( + TokenRole.CODEC, codec=codec, group=group or "UNKNOWN" + ) + codec_pre_consumed = True + else: + # Group on a non-codec token (e.g. release without codec). + head, _, tail = group_token.text.rpartition("-") + result[group_token_index] = group_token.with_role( + TokenRole.GROUP, group=tail or "UNKNOWN", prefix=head + ) + + # Walk the schema left-to-right against tokens [0 .. group_token_index]. + # The codec-GROUP token at `group_token_index` already consumed CODEC + # + GROUP, so we walk up to (not including) it. + body = result[:group_token_index] + chunk_idx = 0 + tok_idx = 0 + + # 1) TITLE — special: consume contiguous UNKNOWN tokens until we hit + # a token whose text matches a non-title role. + while chunk_idx < len(schema.chunks) and schema.chunks[chunk_idx].role is TokenRole.TITLE: + title_end = _find_title_end(body, kb) + # All body tokens up to title_end are title parts. + for i in range(tok_idx, title_end): + result[i] = body[i].with_role(TokenRole.TITLE) + tok_idx = title_end + chunk_idx += 1 + + # 2) Remaining chunks. CODEC and GROUP that were pre-consumed by the + # codec-GROUP token at the end of the stream are skipped here. + for chunk in schema.chunks[chunk_idx:]: + if chunk.role is TokenRole.GROUP: + # Handled above via the trailing token. + continue + if chunk.role is TokenRole.CODEC and codec_pre_consumed: + # Already attached to the trailing token's extras. 
+ continue + + if tok_idx >= len(body): + if chunk.optional: + continue + return None + + tok = body[tok_idx] + matched_role = _match_role(tok.text, chunk.role, kb) + + if matched_role is None: + if chunk.optional: + continue + return None + + result[tok_idx] = tok.with_role(matched_role) + tok_idx += 1 + + # Body must be fully consumed for EASY to succeed. Leftover tokens + # would mean we missed a chunk (e.g. extra audio/HDR tokens not in + # the schema yet) — fall back to SHITTY rather than silently dropping. + if tok_idx < len(body): + return None + + return result + + +def _find_title_end(body: list[Token], kb: ReleaseKnowledge) -> int: + """Return the exclusive index where the title ends. + + The title is the leftmost run of tokens that don't match any known + structural/technical role. Stops at the first token that does. + """ + for i, tok in enumerate(body): + if _parse_season_episode(tok.text) is not None: + return i + if _is_year(tok.text): + return i + if tok.text.lower() in kb.resolutions: + return i + if tok.text.lower() in kb.sources: + return i + if tok.text.lower() in kb.codecs: + return i + return len(body) + + +def _match_role(text: str, role: TokenRole, kb: ReleaseKnowledge) -> TokenRole | None: + """Return ``role`` if ``text`` matches it under ``kb``, else ``None``. + + Used by the schema walk: each chunk requests a specific role, and + this checks whether the current token can play it. Optional chunks + that don't match are silently skipped. 
+ """ + lower = text.lower() + + if role is TokenRole.YEAR: + return TokenRole.YEAR if _is_year(text) else None + + if role is TokenRole.SEASON_EPISODE: + return ( + TokenRole.SEASON_EPISODE + if _parse_season_episode(text) is not None + else None + ) + + if role is TokenRole.RESOLUTION: + return TokenRole.RESOLUTION if lower in kb.resolutions else None + + if role is TokenRole.SOURCE: + return TokenRole.SOURCE if lower in kb.sources else None + + if role is TokenRole.CODEC: + return TokenRole.CODEC if lower in kb.codecs else None + + return None + + +def annotate(tokens: list[Token], kb: ReleaseKnowledge) -> list[Token] | None: + """Annotate token roles. Returns ``None`` when the EASY path fails. + + A ``None`` return means: the group is unknown, OR the schema walk + aborted on a mandatory mismatch. The caller (``services.parse_release``) + falls back to the legacy SHITTY heuristic in that case. + """ + group_name, group_index = _detect_group(tokens, kb) + if group_index is None: + return None + + schema = kb.group_schema(group_name) + if schema is None: + return None + + return _annotate_easy(tokens, kb, schema, group_index) + + +# --------------------------------------------------------------------------- +# Stage 3 — assemble +# --------------------------------------------------------------------------- + + +def assemble( + annotated: list[Token], + site_tag: str | None, + raw_name: str, + kb: ReleaseKnowledge, +) -> dict: + """Fold annotated tokens into a ``ParsedRelease``-compatible dict. + + Returns a dict (not a ``ParsedRelease`` instance) so the caller can + layer in additional fields (``parse_path``, etc.) before instantiation. + The dict's keys mirror the :class:`ParsedRelease` constructor + arguments. 
+ """ + title_parts = [t.text for t in annotated if t.role is TokenRole.TITLE] + title = ".".join(title_parts) if title_parts else ( + annotated[0].text if annotated else raw_name + ) + + year: int | None = None + season: int | None = None + episode: int | None = None + episode_end: int | None = None + quality: str | None = None + source: str | None = None + codec: str | None = None + group = "UNKNOWN" + + for tok in annotated: + if tok.role is TokenRole.YEAR: + year = int(tok.text) + elif tok.role is TokenRole.SEASON_EPISODE: + parsed = _parse_season_episode(tok.text) + if parsed is not None: + season, episode, episode_end = parsed + elif tok.role is TokenRole.RESOLUTION: + quality = tok.text + elif tok.role is TokenRole.SOURCE: + source = tok.text + elif tok.role is TokenRole.CODEC: + # CODEC token may also carry the group (codec-GROUP shape). + codec = tok.extra.get("codec", tok.text) + if "group" in tok.extra: + group = tok.extra["group"] or "UNKNOWN" + elif tok.role is TokenRole.GROUP: + group = tok.extra.get("group", tok.text) or "UNKNOWN" + + tech_parts = [p for p in (quality, source, codec) if p] + tech_string = ".".join(tech_parts) + + # Media type: TV if a season was parsed, otherwise movie if we have + # at least one tech marker, else unknown. 
+ if season is not None: + media_type = "tv_show" + elif any((quality, source, codec, year)): + media_type = "movie" + else: + media_type = "unknown" + + return { + "title": title, + "title_sanitized": kb.sanitize_for_fs(title), + "year": year, + "season": season, + "episode": episode, + "episode_end": episode_end, + "quality": quality, + "source": source, + "codec": codec, + "group": group, + "tech_string": tech_string, + "media_type": media_type, + "site_tag": site_tag, + } diff --git a/alfred/domain/release/parser/schema.py b/alfred/domain/release/parser/schema.py new file mode 100644 index 0000000..44e2328 --- /dev/null +++ b/alfred/domain/release/parser/schema.py @@ -0,0 +1,47 @@ +"""Group schema value objects. + +A :class:`GroupSchema` describes the canonical chunk layout of releases +from a known group (KONTRAST, RARBG, ELiTE, …). It is the EASY-road +contract: when a release ends in ``-GROUP`` and we know the group, +the annotator walks the schema instead of running the heuristic SHITTY +matchers. + +Schemas are loaded from ``knowledge/release/release_groups/*.yaml`` +by an infrastructure adapter and surfaced via the +:class:`~alfred.domain.release.ports.knowledge.ReleaseKnowledge` port. +""" + +from __future__ import annotations + +from dataclasses import dataclass + +from .tokens import TokenRole + + +@dataclass(frozen=True) +class SchemaChunk: + """One entry in a group's chunk order. + + ``role`` is the :class:`TokenRole` the chunk maps to. ``optional`` + is True for chunks that may be absent (e.g. ``year`` on TV releases, + ``source`` on bare ELiTE TV releases). + """ + + role: TokenRole + optional: bool = False + + +@dataclass(frozen=True) +class GroupSchema: + """Schema for a known release group. + + ``chunks`` is the left-to-right canonical order. 
The annotator walks + tokens and chunks in lockstep: an optional chunk that doesn't match + the current token is skipped (the chunk index advances, the token + index stays), a mandatory chunk that doesn't match aborts the EASY + path and falls back to SHITTY. + """ + + name: str + separator: str + chunks: tuple[SchemaChunk, ...] diff --git a/alfred/domain/release/ports/knowledge.py b/alfred/domain/release/ports/knowledge.py index 272e7ef..52200bf 100644 --- a/alfred/domain/release/ports/knowledge.py +++ b/alfred/domain/release/ports/knowledge.py @@ -10,7 +10,10 @@ object that satisfies this shape (e.g. a simple dataclass). from __future__ import annotations -from typing import Protocol +from typing import TYPE_CHECKING, Protocol + +if TYPE_CHECKING: + from ..parser.schema import GroupSchema class ReleaseKnowledge(Protocol): @@ -50,3 +53,14 @@ class ReleaseKnowledge(Protocol): def sanitize_for_fs(self, text: str) -> str: """Strip filesystem-forbidden characters from ``text``.""" ... + + # --- Release group schemas (EASY path) --- + + def group_schema(self, name: str) -> GroupSchema | None: + """Return the parsing schema for the named release group, or + ``None`` if the group is unknown (caller falls back to SHITTY). + + Lookup is case-insensitive: ``"KONTRAST"``, ``"kontrast"`` and + ``"Kontrast"`` all resolve to the same schema. + """ + ... 
diff --git a/alfred/domain/release/services.py b/alfred/domain/release/services.py index c2b943f..4f11711 100644 --- a/alfred/domain/release/services.py +++ b/alfred/domain/release/services.py @@ -4,6 +4,7 @@ from __future__ import annotations import re +from .parser import pipeline as _v2 from .ports import ReleaseKnowledge from .value_objects import MediaTypeToken, ParsedRelease, ParsePath @@ -34,6 +35,23 @@ def parse_release(name: str, kb: ReleaseKnowledge) -> ParsedRelease: if site_tag is not None: parse_path = ParsePath.SANITIZED.value + # --- v2 parser: EASY path for known groups ----------------------------- + # If the v2 pipeline recognizes the release group (KONTRAST, ELiTE, …) + # and the schema walk succeeds, return its result. On any mismatch + # (unknown group, schema abort) ``annotate`` returns None and we + # fall back to the legacy heuristic below. + v2_tokens, v2_tag = _v2.tokenize(name, kb) + v2_annotated = _v2.annotate(v2_tokens, kb) + if v2_annotated is not None: + fields = _v2.assemble(v2_annotated, v2_tag, name, kb) + return ParsedRelease( + raw=name, + normalised=clean, + parse_path=parse_path, + **fields, + ) + # --------------------------------------------------------------------- + if not _is_well_formed(clean, kb): return ParsedRelease( raw=name, diff --git a/alfred/infrastructure/knowledge/release.py b/alfred/infrastructure/knowledge/release.py index b6b61ff..4ea6375 100644 --- a/alfred/infrastructure/knowledge/release.py +++ b/alfred/infrastructure/knowledge/release.py @@ -16,9 +16,11 @@ import alfred as _alfred_pkg _BUILTIN_ROOT = Path(_alfred_pkg.__file__).parent / "knowledge" / "release" _SITES_ROOT = _BUILTIN_ROOT / "sites" +_GROUPS_ROOT = _BUILTIN_ROOT / "release_groups" _LEARNED_ROOT = ( Path(_alfred_pkg.__file__).parent.parent / "data" / "knowledge" / "release" ) +_LEARNED_GROUPS_ROOT = _LEARNED_ROOT / "release_groups" def _merge(base: dict, overlay: dict) -> dict: @@ -128,6 +130,27 @@ def load_media_type_tokens() -> dict: return 
_load_sites().get("media_type_tokens", {}) +def load_group_schemas() -> dict: + """Load every release-group schema YAML keyed by uppercase group name. + + Builtin schemas in ``alfred/knowledge/release/release_groups/`` are + merged with user-learned schemas in + ``data/knowledge/release/release_groups/`` (the learned ones win on + name collision). + """ + result: dict = {} + for root in (_GROUPS_ROOT, _LEARNED_GROUPS_ROOT): + if not root.is_dir(): + continue + for path in sorted(root.glob("*.yaml")): + data = _read(path) + name = data.get("name") + if not name: + continue + result[name.upper()] = data + return result + + def load_separators() -> list[str]: """Single-char token separators used by the release name tokenizer. diff --git a/alfred/infrastructure/knowledge/release_kb.py b/alfred/infrastructure/knowledge/release_kb.py index 5d4a790..980004f 100644 --- a/alfred/infrastructure/knowledge/release_kb.py +++ b/alfred/infrastructure/knowledge/release_kb.py @@ -14,11 +14,15 @@ filesystem-level concerns. from __future__ import annotations +from alfred.domain.release.parser.schema import GroupSchema, SchemaChunk +from alfred.domain.release.parser.tokens import TokenRole + from .release import ( load_audio, load_codecs, load_editions, load_forbidden_chars, + load_group_schemas, load_hdr_extra, load_language_tokens, load_media_type_tokens, @@ -35,6 +39,26 @@ from .release import ( ) +def _build_group_schema(data: dict) -> GroupSchema: + """Translate a raw YAML schema dict into a frozen :class:`GroupSchema`. + + Unknown roles raise ``ValueError`` early so a typo in a YAML file + surfaces at construction time, not on first parse. 
+ """ + chunks = tuple( + SchemaChunk( + role=TokenRole(entry["role"]), + optional=bool(entry.get("optional", False)), + ) + for entry in data.get("chunk_order", []) + ) + return GroupSchema( + name=data["name"], + separator=data.get("separator", "."), + chunks=chunks, + ) + + class YamlReleaseKnowledge: """Single object holding every parsed-release knowledge constant. @@ -78,6 +102,15 @@ class YamlReleaseKnowledge: "", "", "".join(load_win_forbidden_chars()) ) + # Group schemas, keyed by uppercase group name for fast lookup. + self._group_schemas: dict[str, GroupSchema] = { + key: _build_group_schema(data) + for key, data in load_group_schemas().items() + } + def sanitize_for_fs(self, text: str) -> str: """Strip Windows-forbidden characters from ``text``.""" return text.translate(self._win_forbidden_table) + + def group_schema(self, name: str) -> GroupSchema | None: + return self._group_schemas.get(name.upper()) diff --git a/alfred/knowledge/release/release_groups/elite.yaml b/alfred/knowledge/release/release_groups/elite.yaml new file mode 100644 index 0000000..0e04de5 --- /dev/null +++ b/alfred/knowledge/release/release_groups/elite.yaml @@ -0,0 +1,22 @@ +# ELiTE release naming schema. +# +# Examples seen in the wild: +# Foundation.S02.1080p.x265-ELiTE (TV season pack, no source) +# +# ELiTE often omits the source token entirely on TV releases (no WEBRip / +# BluRay), going straight from resolution to codec. + +name: ELiTE +separator: "." + +chunk_order: + - role: title + - role: year + optional: true + - role: season_episode + optional: true + - role: resolution + - role: source + optional: true # often absent on TV + - role: codec + - role: group diff --git a/alfred/knowledge/release/release_groups/kontrast.yaml b/alfred/knowledge/release/release_groups/kontrast.yaml new file mode 100644 index 0000000..52a3071 --- /dev/null +++ b/alfred/knowledge/release/release_groups/kontrast.yaml @@ -0,0 +1,28 @@ +# KONTRAST release naming schema. 
+# +# Examples seen in the wild: +# Back.in.Action.2025.1080p.WEBRip.x265-KONTRAST (movie) +# The.Long.Walk.2025.1080p.WEBRip.x265-KONTRAST (movie) +# Slow.Horses.S05E01.1080p.WEBRip.x265-KONTRAST (TV episode) +# Slow.Horses.S05.1080p.WEBRip.x265-KONTRAST (TV season pack) +# +# Schema is a left-to-right description of the canonical chunk order. +# Each entry is a role (matching TokenRole). Optional chunks are marked +# with `optional: true`. The parser consumes tokens greedily by role, +# skipping over optional chunks that don't match. + +name: KONTRAST +separator: "." + +# Canonical order of structural + technical chunks (left to right). +# `title` is special-cased as "everything up to the first non-title role". +chunk_order: + - role: title + - role: year + optional: true # absent on TV releases (S01E01 instead) + - role: season_episode + optional: true # absent on movies + - role: resolution # always present (1080p, 2160p, …) + - role: source # always present (WEBRip, BluRay, …) + - role: codec # always present (x265, x264, …) + - role: group # everything after the final `-` diff --git a/alfred/knowledge/release/release_groups/rarbg.yaml b/alfred/knowledge/release/release_groups/rarbg.yaml new file mode 100644 index 0000000..b312708 --- /dev/null +++ b/alfred/knowledge/release/release_groups/rarbg.yaml @@ -0,0 +1,20 @@ +# RARBG release naming schema. +# +# RARBG follows the canonical scene convention closely: +# Title.Year.Resolution.Source.Codec-RARBG +# For TV: +# Title.S01E01.Resolution.Source.Codec-RARBG + +name: RARBG +separator: "." 
+ +chunk_order: + - role: title + - role: year + optional: true + - role: season_episode + optional: true + - role: resolution + - role: source + - role: codec + - role: group diff --git a/tests/domain/release/test_parser_v2_easy.py b/tests/domain/release/test_parser_v2_easy.py new file mode 100644 index 0000000..1fc23bc --- /dev/null +++ b/tests/domain/release/test_parser_v2_easy.py @@ -0,0 +1,142 @@ +"""EASY-path tests for the v2 annotate-based pipeline. + +These tests assert that the **v2 pipeline itself** produces the correct +annotated stream and assembled fields for releases from known groups +(KONTRAST, ELiTE, …) — without going through ``parse_release``. The +fixtures suite (``tests/domain/test_release_fixtures.py``) already +locks the user-visible ``ParsedRelease`` contract; here we cover the +internal pipeline behavior so a future refactor of ``parse_release`` +can't quietly drop EASY without us noticing. +""" + +from __future__ import annotations + +from alfred.domain.release.parser import TokenRole +from alfred.domain.release.parser.pipeline import ( + _detect_group, + annotate, + assemble, + tokenize, +) +from alfred.infrastructure.knowledge.release_kb import YamlReleaseKnowledge + +_KB = YamlReleaseKnowledge() + + +class TestDetectGroup: + def test_codec_group(self) -> None: + tokens, _ = tokenize( + "Back.in.Action.2025.1080p.WEBRip.x265-KONTRAST", _KB + ) + name, idx = _detect_group(tokens, _KB) + assert name == "KONTRAST" + assert idx == 6 # x265-KONTRAST is the 7th token + + def test_unknown_when_no_dash(self) -> None: + tokens, _ = tokenize("Some.Movie.2020.1080p.WEBRip.x265.KONTRAST", _KB) + # No dash anywhere → no group detected. + name, idx = _detect_group(tokens, _KB) + assert idx is None + assert name == "UNKNOWN" + + def test_skips_dashed_source(self) -> None: + # "Web-DL" must not be mistaken for a group token. 
+ tokens, _ = tokenize("Movie.2020.1080p.Web-DL.x265-GRP", _KB) + name, idx = _detect_group(tokens, _KB) + assert name == "GRP" + + +class TestAnnotateEasy: + def test_kontrast_movie(self) -> None: + tokens, tag = tokenize( + "Back.in.Action.2025.1080p.WEBRip.x265-KONTRAST", _KB + ) + annotated = annotate(tokens, _KB) + assert annotated is not None, "KONTRAST should hit the EASY path" + + roles = [t.role for t in annotated] + assert roles == [ + TokenRole.TITLE, # Back + TokenRole.TITLE, # in + TokenRole.TITLE, # Action + TokenRole.YEAR, + TokenRole.RESOLUTION, + TokenRole.SOURCE, + TokenRole.CODEC, # x265-KONTRAST → CODEC with extra.group=KONTRAST + ] + assert annotated[-1].extra["group"] == "KONTRAST" + assert annotated[-1].extra["codec"] == "x265" + + def test_kontrast_tv_episode(self) -> None: + tokens, _ = tokenize( + "Slow.Horses.S05E01.1080p.WEBRip.x265-KONTRAST", _KB + ) + annotated = annotate(tokens, _KB) + assert annotated is not None + + # Year is optional and absent → skipped. Season_episode present. + roles = [t.role for t in annotated] + assert TokenRole.SEASON_EPISODE in roles + assert TokenRole.YEAR not in roles + + def test_elite_no_source(self) -> None: + # ELiTE schema marks source as optional — Foundation.S02 omits it. + tokens, _ = tokenize("Foundation.S02.1080p.x265-ELiTE", _KB) + annotated = annotate(tokens, _KB) + assert annotated is not None, "ELiTE optional source must be tolerated" + + roles = [t.role for t in annotated] + assert TokenRole.SOURCE not in roles + assert TokenRole.RESOLUTION in roles + assert TokenRole.CODEC in roles + + def test_unknown_group_returns_none(self) -> None: + tokens, _ = tokenize("Some.Movie.2020.1080p.WEBRip.x264-RANDOM", _KB) + # RANDOM is not in our release_groups/ → annotate returns None + # and the caller falls back to SHITTY. 
+ assert annotate(tokens, _KB) is None + + +class TestAssemble: + def test_kontrast_movie_fields(self) -> None: + name = "Back.in.Action.2025.1080p.WEBRip.x265-KONTRAST" + tokens, tag = tokenize(name, _KB) + annotated = annotate(tokens, _KB) + fields = assemble(annotated, tag, name, _KB) + + assert fields["title"] == "Back.in.Action" + assert fields["year"] == 2025 + assert fields["season"] is None + assert fields["quality"] == "1080p" + assert fields["source"] == "WEBRip" + assert fields["codec"] == "x265" + assert fields["group"] == "KONTRAST" + assert fields["tech_string"] == "1080p.WEBRip.x265" + assert fields["media_type"] == "movie" + assert fields["site_tag"] is None + + def test_kontrast_tv_fields(self) -> None: + name = "Slow.Horses.S05E01.1080p.WEBRip.x265-KONTRAST" + tokens, tag = tokenize(name, _KB) + annotated = annotate(tokens, _KB) + fields = assemble(annotated, tag, name, _KB) + + assert fields["title"] == "Slow.Horses" + assert fields["year"] is None + assert fields["season"] == 5 + assert fields["episode"] == 1 + assert fields["media_type"] == "tv_show" + assert fields["group"] == "KONTRAST" + + def test_elite_season_pack(self) -> None: + name = "Foundation.S02.1080p.x265-ELiTE" + tokens, tag = tokenize(name, _KB) + annotated = annotate(tokens, _KB) + fields = assemble(annotated, tag, name, _KB) + + assert fields["title"] == "Foundation" + assert fields["season"] == 2 + assert fields["episode"] is None # season pack + assert fields["source"] is None # ELiTE omits it + assert fields["tech_string"] == "1080p.x265" + assert fields["group"] == "ELiTE"