diff --git a/CHANGELOG.md b/CHANGELOG.md index a8d37ec..3420c02 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,15 +17,31 @@ callers). ### Added -- **Release parser v2 scaffolding** (`alfred/domain/release/parser/`): - new package laying the foundation for an annotate-based pipeline - (tokenize → annotate → assemble). Exposes `Token` (frozen VO with - `index` + `role` + `extra`), `TokenRole` enum (structural / technical / - meta families), and a `pipeline.py` module with working `strip_site_tag` - + `tokenize` and a documented `annotate` stub. Legacy `parse_release` - in `release.services` remains the live implementation until the - annotate step is wired in. Scaffolding tests in - `tests/domain/release/test_parser_v2_scaffolding.py`. +- **Release parser v2 — EASY path live** (`alfred/domain/release/parser/`): + new annotate-based pipeline (tokenize → annotate → assemble) drives + releases from known groups. Exposes `Token` (frozen VO with `index` + + `role` + `extra`), `TokenRole` enum (structural/technical/meta families), + and `GroupSchema` / `SchemaChunk` value objects. + - `pipeline.tokenize`: string-ops separator split (no regex), strips + a `[site.tag]` prefix/suffix first. + - `pipeline.annotate`: detects the trailing group right-to-left + (priority to `codec-GROUP` shape, fallback to any non-source dashed + token), looks up its `GroupSchema`, then walks tokens and schema + chunks in lockstep — optional chunks that don't match are skipped, + mandatory mismatches abort EASY and return `None` so the caller can + fall back to SHITTY. + - `pipeline.assemble`: folds annotated tokens into a + `ParsedRelease`-compatible dict. + - `parse_release` (in `release.services`) tries the v2 EASY path first + and falls through to the legacy SHITTY heuristic on `None`. Legacy + SHITTY/PATH OF PAIN behavior is unchanged. 
+ - Knowledge: `alfred/knowledge/release/release_groups/{kontrast,elite, + rarbg}.yaml` declare the canonical chunk order per group, loaded via + new `ReleaseKnowledge.group_schema(name)` port method. + - Tests in `tests/domain/release/test_parser_v2_{scaffolding,easy}.py` + cover token VOs, site-tag stripping, group detection, schema-driven + annotation (movie, TV episode, season pack with optional source), + and field assembly. - **Real-world release fixtures** under `tests/fixtures/releases/{easy,shitty,path_of_pain}/`, each documenting an expected `ParsedRelease` plus the future `routing` diff --git a/alfred/domain/release/parser/__init__.py b/alfred/domain/release/parser/__init__.py index 24b33b2..37558c1 100644 --- a/alfred/domain/release/parser/__init__.py +++ b/alfred/domain/release/parser/__init__.py @@ -25,6 +25,7 @@ production until each piece of the v2 pipeline is wired in. from __future__ import annotations +from .schema import GroupSchema, SchemaChunk from .tokens import Token, TokenRole -__all__ = ["Token", "TokenRole"] +__all__ = ["GroupSchema", "SchemaChunk", "Token", "TokenRole"] diff --git a/alfred/domain/release/parser/pipeline.py b/alfred/domain/release/parser/pipeline.py index 97e3c21..2b63a25 100644 --- a/alfred/domain/release/parser/pipeline.py +++ b/alfred/domain/release/parser/pipeline.py @@ -1,43 +1,40 @@ -"""Annotate-based pipeline skeleton. +"""Annotate-based pipeline. -The pipeline is **declared here** in three named stages, but actual logic -is wired in incrementally — current state is intentional scaffolding. +Three stages: -Stages: +1. :func:`tokenize` — release name → ``list[Token]`` (all UNKNOWN), plus + a separately-returned site tag (e.g. ``[YTS.MX]``) that is never + tokenized. +2. :func:`annotate` — promote each token's :class:`TokenRole` using the + injected knowledge base. 
Group detection is right-to-left; if the + group has a registered :class:`GroupSchema` we run :func:`_annotate_easy` + (schema-driven, lockstep walk); otherwise — or when that walk aborts — + :func:`annotate` returns ``None`` and ``parse_release`` falls back to + the legacy SHITTY heuristic (see :mod:`..services`). +3. :func:`assemble` — fold annotated tokens into a dict of + :class:`~alfred.domain.release.value_objects.ParsedRelease` constructor fields. -1. :func:`tokenize` — release name → ``list[Token]`` (all UNKNOWN). Also - pulls out a leading/trailing site tag (e.g. ``[YTS.MX]``) which is - returned separately and never tokenized. -2. :func:`annotate` — walk the tokens, promote roles using - :class:`~alfred.domain.release.ports.knowledge.ReleaseKnowledge`. The - walk is **right-to-left for the group** (scene convention puts it - last) and **left-to-right for the title** (which is always leftmost). -3. :func:`assemble` — fold the annotated stream into a domain VO. Output - type still TBD: the migration target is the existing - :class:`~alfred.domain.release.value_objects.ParsedRelease`, but the - pipeline may grow an intermediate :class:`AnnotatedRelease` first to - keep the score / leftover-tokens information that ``ParsedRelease`` - doesn't carry today. - -Road dispatch (EASY / SHITTY / PATH OF PAIN) happens **inside** -:func:`annotate` — once the group is identified (or not), the annotator -picks the right strategy. EASY consults a per-group schema; SHITTY runs -the generic matcher loop; PATH OF PAIN is a return-state, not a -separate path — the caller (``application/release/inspect.py``) decides -what to do with a low-confidence result. +The pipeline is **pure**: no I/O, no TMDB, no probe. All knowledge +arrives through ``kb: ReleaseKnowledge``.
""" from __future__ import annotations from ..ports.knowledge import ReleaseKnowledge -from .tokens import Token +from .schema import GroupSchema +from .tokens import Token, TokenRole + + +# --------------------------------------------------------------------------- +# Stage 1 — tokenize +# --------------------------------------------------------------------------- def strip_site_tag(name: str) -> tuple[str, str | None]: """Split off a ``[site.tag]`` prefix or suffix. - The bracketed substring is removed from ``name`` and returned as the - second element. If no tag is found, returns ``(name.strip(), None)``. + Returns ``(clean_name, tag)``. If no tag is found, returns + ``(name.strip(), None)``. """ s = name.strip() @@ -63,19 +60,12 @@ def strip_site_tag(name: str) -> tuple[str, str | None]: def tokenize(name: str, kb: ReleaseKnowledge) -> tuple[list[Token], str | None]: """Split ``name`` into tokens after stripping any site tag. - Returns ``(tokens, site_tag)``. All tokens start with role - :attr:`~.tokens.TokenRole.UNKNOWN` — promotion happens in - :func:`annotate`. - - The tokenizer is a pure character-class split on ``kb.separators``. - String-ops style: no regex (keeps the rule from CLAUDE.md), at the - cost of one pass per separator. The release names we parse are short - (<200 chars), so the constant factor is irrelevant. + String-ops style: replace every configured separator with a single + NUL byte then split. NUL cannot legally appear in a release name, so + it's a safe sentinel. """ clean, site_tag = strip_site_tag(name) - # Replace every separator with a single delimiter, then split. Using - # \x00 because it cannot legally appear in a release name. DELIM = "\x00" buf = clean for sep in kb.separators: @@ -87,29 +77,335 @@ def tokenize(name: str, kb: ReleaseKnowledge) -> tuple[list[Token], str | None]: return tokens, site_tag -def annotate(tokens: list[Token], kb: ReleaseKnowledge) -> list[Token]: - """Promote each token's role using ``kb``. 
+# --------------------------------------------------------------------------- +# Stage 2 — annotate +# --------------------------------------------------------------------------- - **Not implemented yet.** Returns the input unchanged so the package - is importable and the pipeline shape is visible. Will be filled in - by subsequent commits, one role family at a time. - The intended walk order, once implemented: +def _parse_season_episode(text: str) -> tuple[int, int | None, int | None] | None: + """Parse a single token as ``SxxExx`` / ``SxxExxExx`` / ``Sxx`` / ``NxNN``. - 1. **Group (right-to-left)** — find the trailing ``-GROUP`` token, - which also reveals the codec when shaped as ``codec-GROUP``. If - the group matches a schema in ``knowledge/release/release_groups/`` - → EASY path; otherwise SHITTY. - 2. **Season/episode** — single-token scan, ``S01E05`` / ``1x05``. - 3. **Year** — first 4-digit token in [1900, 2099] *after* index 0. - 4. **Tech tokens** — resolutions, sources, codecs, audio, video meta, - editions, languages. Multi-token sequences (``DTS.HD.MA``, - ``Directors.Cut``) handled first to avoid greedy single-token - claims swallowing a sequence prefix. - 5. **Title** — leftmost contiguous UNKNOWN tokens up to the first - structural/technical role boundary. + Returns ``(season, episode, episode_end)`` or ``None`` if the token + is not a season/episode marker. """ - # TODO(parser-v2): implement annotation. See module docstring for the - # walk order. Until then, the legacy parse_release in - # release.services is the live implementation. 
- return tokens + upper = text.upper() + + # SxxExx form + if len(upper) >= 3 and upper[0] == "S" and upper[1:3].isdigit(): + season = int(upper[1:3]) + rest = upper[3:] + + if not rest: + return season, None, None + + episodes: list[int] = [] + while rest.startswith("E") and len(rest) >= 3 and rest[1:3].isdigit(): + episodes.append(int(rest[1:3])) + rest = rest[3:] + + if not episodes: + return None + return season, episodes[0], episodes[1] if len(episodes) >= 2 else None + + # NxNN form + if "X" in upper: + parts = upper.split("X") + if len(parts) >= 2 and all(p.isdigit() and p for p in parts): + season = int(parts[0]) + episode = int(parts[1]) + episode_end = int(parts[2]) if len(parts) >= 3 else None + return season, episode, episode_end + + return None + + +def _is_year(text: str) -> bool: + """Return True if ``text`` is a 4-digit year in [1900, 2099].""" + return len(text) == 4 and text.isdigit() and 1900 <= int(text) <= 2099 + + +def _split_codec_group(text: str, kb: ReleaseKnowledge) -> tuple[str, str] | None: + """Split a ``codec-GROUP`` token into ``(codec, group)`` if it fits. + + Returns ``None`` if the token doesn't match the ``codec-GROUP`` + shape. Handles the empty-group case (``x265-``) as ``(codec, "")``. + """ + if "-" not in text: + return None + head, _, tail = text.rpartition("-") + if head.lower() in kb.codecs: + return head, tail + return None + + +def _detect_group(tokens: list[Token], kb: ReleaseKnowledge) -> tuple[str, int | None]: + """Identify the release group by walking tokens right-to-left. + + Returns ``(group_name, token_index_carrying_group)`` — the index is + ``None`` when the group is missing entirely (no trailing ``-`` token + in the stream). + + Priority: + 1. Rightmost token of shape ``codec-GROUP`` (clearest signal). + 2. Rightmost token containing ``-`` whose head is *not* a known + source token (Web-DL etc. shouldn't be confused with a group). 
+ """ + # Priority 1: codec-GROUP + for tok in reversed(tokens): + split = _split_codec_group(tok.text, kb) + if split is not None: + _, group = split + return (group or "UNKNOWN"), tok.index + + # Priority 2: rightmost dash, excluding known dashed sources + for tok in reversed(tokens): + if "-" not in tok.text: + continue + head, _, tail = tok.text.rpartition("-") + # Skip dashed-source tokens like "Web-DL" + if ( + head.lower() in kb.sources + or tok.text.lower().replace("-", "") in kb.sources + ): + continue + if tail: + return tail, tok.index + + return "UNKNOWN", None + + +def _annotate_easy( + tokens: list[Token], + kb: ReleaseKnowledge, + schema: GroupSchema, + group_token_index: int, +) -> list[Token] | None: + """Annotate tokens following a known group schema (EASY path). + + Returns the new token list on success, or ``None`` if the schema + walk fails — a mandatory chunk that doesn't match aborts EASY and + lets the caller fall back to SHITTY without crashing. + """ + result = list(tokens) + + # The codec-GROUP token is special: it carries TWO roles (CODEC + + # GROUP). We split it conceptually and tag it as CODEC here; the + # group itself is propagated via ``extra["group"]`` so the assemble + # step can recover both pieces from one token. When we do this, + # ``codec_pre_consumed`` is True so the schema walk knows to skip + # the CODEC chunk (it has nothing left to match in the body). + group_token = result[group_token_index] + cg_split = _split_codec_group(group_token.text, kb) + codec_pre_consumed = False + if cg_split is not None: + codec, group = cg_split + result[group_token_index] = group_token.with_role( + TokenRole.CODEC, codec=codec, group=group or "UNKNOWN" + ) + codec_pre_consumed = True + else: + # Group on a non-codec token (e.g. release without codec). 
def _annotate_easy(
    tokens: list[Token],
    kb: ReleaseKnowledge,
    schema: GroupSchema,
    group_token_index: int,
) -> list[Token] | None:
    """Annotate ``tokens`` by walking a known group's schema (EASY path).

    ``group_token_index`` is the position of the token that carries the
    group. Tokens before it form the "body" matched against
    ``schema.chunks``; tokens after it are left untouched.

    Returns a new list (the input list is not mutated) on success, or
    ``None`` when a mandatory chunk finds no matching token or when body
    tokens are left unconsumed. The caller then falls back to SHITTY.
    """
    annotated = list(tokens)
    trailing = annotated[group_token_index]

    # Step 1 — tag the group-carrying token. A ``codec-GROUP`` token plays
    # two roles at once: it is tagged CODEC and the group rides along in
    # its extras so ``assemble`` can recover both pieces.
    codec_and_group = _split_codec_group(trailing.text, kb)
    codec_pre_consumed = codec_and_group is not None
    if codec_pre_consumed:
        codec_text, group_text = codec_and_group
        annotated[group_token_index] = trailing.with_role(
            TokenRole.CODEC, codec=codec_text, group=group_text or "UNKNOWN"
        )
    else:
        # Group on a non-codec token: keep what preceded the dash as
        # ``prefix`` and tag the token as a plain GROUP.
        prefix, _, group_text = trailing.text.rpartition("-")
        annotated[group_token_index] = trailing.with_role(
            TokenRole.GROUP, group=group_text or "UNKNOWN", prefix=prefix
        )

    # Everything left of the group token is what the schema must explain.
    body = annotated[:group_token_index]
    chunks = schema.chunks
    cursor = 0  # next body token to consume

    # Step 2 — leading TITLE chunk(s): the title is the run of body tokens
    # before the first token that looks structural or technical.
    first_non_title = 0
    while (
        first_non_title < len(chunks)
        and chunks[first_non_title].role is TokenRole.TITLE
    ):
        title_end = _find_title_end(body, kb)
        for pos in range(cursor, title_end):
            annotated[pos] = body[pos].with_role(TokenRole.TITLE)
        cursor = title_end
        first_non_title += 1

    # Step 3 — remaining chunks, in lockstep with the body tokens.
    for chunk in chunks[first_non_title:]:
        handled_by_trailing_token = chunk.role is TokenRole.GROUP or (
            chunk.role is TokenRole.CODEC and codec_pre_consumed
        )
        if handled_by_trailing_token:
            continue

        role = None
        if cursor < len(body):
            role = _match_role(body[cursor].text, chunk.role, kb)

        if role is None:
            # No token left, or the current token cannot play this role.
            # An optional chunk is skipped (token stays); a mandatory one
            # aborts the EASY path.
            if chunk.optional:
                continue
            return None

        annotated[cursor] = body[cursor].with_role(role)
        cursor += 1

    # Unconsumed body tokens mean the schema did not describe this release;
    # fail rather than silently drop them.
    return annotated if cursor >= len(body) else None
def _find_title_end(body: list[Token], kb: ReleaseKnowledge) -> int:
    """Return the exclusive index where the title ends.

    Scans ``body`` left to right and stops at the first token that is a
    season/episode marker, a year, or (lowercased) a known resolution,
    source or codec. Returns ``len(body)`` when no token qualifies.
    """

    def _is_boundary(text: str) -> bool:
        # Structural markers are tested on the raw text.
        if _parse_season_episode(text) is not None or _is_year(text):
            return True
        # Technical vocabularies are matched lowercased.
        lowered = text.lower()
        return (
            lowered in kb.resolutions
            or lowered in kb.sources
            or lowered in kb.codecs
        )

    return next(
        (pos for pos, tok in enumerate(body) if _is_boundary(tok.text)),
        len(body),
    )


def _match_role(text: str, role: TokenRole, kb: ReleaseKnowledge) -> TokenRole | None:
    """Return ``role`` if ``text`` can play it under ``kb``, else ``None``.

    Supported roles are YEAR, SEASON_EPISODE, RESOLUTION, SOURCE and CODEC;
    any other role never matches. The schema walk calls this once per
    chunk to test the current token against the role the chunk requests.
    """
    lowered = text.lower()

    if role is TokenRole.YEAR:
        matched = _is_year(text)
    elif role is TokenRole.SEASON_EPISODE:
        matched = _parse_season_episode(text) is not None
    elif role is TokenRole.RESOLUTION:
        matched = lowered in kb.resolutions
    elif role is TokenRole.SOURCE:
        matched = lowered in kb.sources
    elif role is TokenRole.CODEC:
        matched = lowered in kb.codecs
    else:
        return None

    return role if matched else None
def annotate(tokens: list[Token], kb: ReleaseKnowledge) -> list[Token] | None:
    """Annotate token roles via the EASY path, or return ``None``.

    ``None`` means one of: no group-carrying token was found, the detected
    group has no schema in ``kb``, or the schema walk aborted. The caller
    falls back to the legacy heuristic in that case.
    """
    group_name, group_index = _detect_group(tokens, kb)

    # Only consult the knowledge base when a group token actually exists.
    schema = kb.group_schema(group_name) if group_index is not None else None
    if schema is None:
        return None

    return _annotate_easy(tokens, kb, schema, group_index)


# ---------------------------------------------------------------------------
# Stage 3 — assemble
# ---------------------------------------------------------------------------


def assemble(
    annotated: list[Token],
    site_tag: str | None,
    raw_name: str,
    kb: ReleaseKnowledge,
) -> dict:
    """Fold annotated tokens into a ``ParsedRelease``-compatible dict.

    A dict is returned, not a ``ParsedRelease`` instance, so the caller can
    add further fields before instantiation.

    The title is the TITLE tokens joined with ``"."``. If there are none it
    falls back to the first token's text, or to ``raw_name`` when the token
    list is empty. When several tokens share a role, the last one wins.
    """
    title_tokens = [tok.text for tok in annotated if tok.role is TokenRole.TITLE]
    if title_tokens:
        title = ".".join(title_tokens)
    elif annotated:
        title = annotated[0].text
    else:
        title = raw_name

    year: int | None = None
    season: int | None = None
    episode: int | None = None
    episode_end: int | None = None
    quality: str | None = None
    source: str | None = None
    codec: str | None = None
    group = "UNKNOWN"

    for tok in annotated:
        role = tok.role
        if role is TokenRole.YEAR:
            year = int(tok.text)
        elif role is TokenRole.SEASON_EPISODE:
            marker = _parse_season_episode(tok.text)
            if marker is not None:
                season, episode, episode_end = marker
        elif role is TokenRole.RESOLUTION:
            quality = tok.text
        elif role is TokenRole.SOURCE:
            source = tok.text
        elif role is TokenRole.CODEC:
            # A codec-GROUP token keeps the bare codec and the group in
            # its extras; a plain codec token has neither.
            codec = tok.extra.get("codec", tok.text)
            if "group" in tok.extra:
                group = tok.extra["group"] or "UNKNOWN"
        elif role is TokenRole.GROUP:
            group = tok.extra.get("group", tok.text) or "UNKNOWN"

    tech_string = ".".join(filter(None, (quality, source, codec)))

    # A parsed season means TV; otherwise any tech marker or a year is
    # enough to call it a movie.
    if season is not None:
        media_type = "tv_show"
    elif quality or source or codec or year:
        media_type = "movie"
    else:
        media_type = "unknown"

    return {
        "title": title,
        "title_sanitized": kb.sanitize_for_fs(title),
        "year": year,
        "season": season,
        "episode": episode,
        "episode_end": episode_end,
        "quality": quality,
        "source": source,
        "codec": codec,
        "group": group,
        "tech_string": tech_string,
        "media_type": media_type,
        "site_tag": site_tag,
    }
+ codec = tok.extra.get("codec", tok.text) + if "group" in tok.extra: + group = tok.extra["group"] or "UNKNOWN" + elif tok.role is TokenRole.GROUP: + group = tok.extra.get("group", tok.text) or "UNKNOWN" + + tech_parts = [p for p in (quality, source, codec) if p] + tech_string = ".".join(tech_parts) + + # Media type: TV if a season was parsed, otherwise movie if we have + # at least one tech marker, else unknown. + if season is not None: + media_type = "tv_show" + elif any((quality, source, codec, year)): + media_type = "movie" + else: + media_type = "unknown" + + return { + "title": title, + "title_sanitized": kb.sanitize_for_fs(title), + "year": year, + "season": season, + "episode": episode, + "episode_end": episode_end, + "quality": quality, + "source": source, + "codec": codec, + "group": group, + "tech_string": tech_string, + "media_type": media_type, + "site_tag": site_tag, + } diff --git a/alfred/domain/release/parser/schema.py b/alfred/domain/release/parser/schema.py new file mode 100644 index 0000000..44e2328 --- /dev/null +++ b/alfred/domain/release/parser/schema.py @@ -0,0 +1,47 @@ +"""Group schema value objects. + +A :class:`GroupSchema` describes the canonical chunk layout of releases +from a known group (KONTRAST, RARBG, ELiTE, …). It is the EASY-road +contract: when a release ends in ``-GROUP`` and we know the group, +the annotator walks the schema instead of running the heuristic SHITTY +matchers. + +Schemas are loaded from ``knowledge/release/release_groups/{group}.yaml`` +by an infrastructure adapter and surfaced via the +:class:`~alfred.domain.release.ports.knowledge.ReleaseKnowledge` port. +""" + +from __future__ import annotations + +from dataclasses import dataclass + +from .tokens import TokenRole + + +@dataclass(frozen=True) +class SchemaChunk: + """One entry in a group's chunk order. + + ``role`` is the :class:`TokenRole` the chunk maps to. ``optional`` + is True for chunks that may be absent (e.g.
@dataclass(frozen=True)
class SchemaChunk:
    """One entry in a group's chunk order.

    ``role`` is the :class:`TokenRole` the chunk maps to. ``optional``
    is True for chunks that may be absent (e.g. ``year`` on TV releases,
    ``source`` on bare ELiTE TV releases).
    """

    role: TokenRole
    optional: bool = False


@dataclass(frozen=True)
class GroupSchema:
    """Schema for a known release group.

    ``chunks`` is the left-to-right canonical order. The annotator walks
    tokens and chunks in lockstep: an optional chunk that doesn't match
    the current token is skipped (the chunk index advances, the token
    index stays), a mandatory chunk that doesn't match aborts the EASY
    path and falls back to SHITTY.

    Any iterable of :class:`SchemaChunk` is accepted for ``chunks`` and is
    stored as a tuple, so the instance stays immutable and hashable.
    """

    name: str
    separator: str
    chunks: tuple[SchemaChunk, ...]

    def __post_init__(self) -> None:
        # The dataclass is frozen, so plain attribute assignment is blocked
        # here; object.__setattr__ is the standard way to normalise a field
        # during initialisation.
        object.__setattr__(self, "chunks", tuple(self.chunks))
diff --git a/alfred/domain/release/services.py b/alfred/domain/release/services.py index c2b943f..4f11711 100644 --- a/alfred/domain/release/services.py +++ b/alfred/domain/release/services.py @@ -4,6 +4,7 @@ from __future__ import annotations import re +from .parser import pipeline as _v2 from .ports import ReleaseKnowledge from .value_objects import MediaTypeToken, ParsedRelease, ParsePath @@ -34,6 +35,23 @@ def parse_release(name: str, kb: ReleaseKnowledge) -> ParsedRelease: if site_tag is not None: parse_path = ParsePath.SANITIZED.value + # --- v2 parser: EASY path for known groups ----------------------------- + # If the v2 pipeline recognizes the release group (KONTRAST, ELiTE, …) + # and the schema walk succeeds, return its result. On any mismatch + # (unknown group, schema abort) ``annotate`` returns None and we + # fall back to the legacy heuristic below. + v2_tokens, v2_tag = _v2.tokenize(name, kb) + v2_annotated = _v2.annotate(v2_tokens, kb) + if v2_annotated is not None: + fields = _v2.assemble(v2_annotated, v2_tag, name, kb) + return ParsedRelease( + raw=name, + normalised=clean, + parse_path=parse_path, + **fields, + ) + # --------------------------------------------------------------------- + if not _is_well_formed(clean, kb): return ParsedRelease( raw=name, diff --git a/alfred/infrastructure/knowledge/release.py b/alfred/infrastructure/knowledge/release.py index b6b61ff..4ea6375 100644 --- a/alfred/infrastructure/knowledge/release.py +++ b/alfred/infrastructure/knowledge/release.py @@ -16,9 +16,11 @@ import alfred as _alfred_pkg _BUILTIN_ROOT = Path(_alfred_pkg.__file__).parent / "knowledge" / "release" _SITES_ROOT = _BUILTIN_ROOT / "sites" +_GROUPS_ROOT = _BUILTIN_ROOT / "release_groups" _LEARNED_ROOT = ( Path(_alfred_pkg.__file__).parent.parent / "data" / "knowledge" / "release" ) +_LEARNED_GROUPS_ROOT = _LEARNED_ROOT / "release_groups" def _merge(base: dict, overlay: dict) -> dict: @@ -128,6 +130,27 @@ def load_media_type_tokens() -> dict: return 
def load_group_schemas() -> dict:
    """Load every release-group schema YAML keyed by uppercase group name.

    Builtin schemas in ``_GROUPS_ROOT`` are read first, then user-learned
    schemas in ``_LEARNED_GROUPS_ROOT``. On a name collision the learned
    document replaces the builtin one wholesale (no key-level merge). A
    missing directory is skipped.

    Files that do not yield a mapping with a non-empty string ``name`` are
    ignored, so one malformed file cannot take down the whole knowledge
    base.
    """
    schemas: dict = {}
    for root in (_GROUPS_ROOT, _LEARNED_GROUPS_ROOT):
        if not root.is_dir():
            continue
        # Sorted for a deterministic winner when two files in the same
        # directory declare the same name.
        for path in sorted(root.glob("*.yaml")):
            data = _read(path)
            # NOTE(review): ``_read`` is defined elsewhere in this module.
            # If it can return a non-dict (e.g. ``None`` for an empty
            # document), ``.get`` below would raise — skip instead.
            if not isinstance(data, dict):
                continue
            name = data.get("name")
            # Reject missing, empty and non-string names (``name: 123``
            # would otherwise fail on ``.upper()``).
            if not isinstance(name, str) or not name:
                continue
            schemas[name.upper()] = data
    return schemas
def _build_group_schema(data: dict) -> GroupSchema:
    """Translate a raw YAML schema dict into a frozen :class:`GroupSchema`.

    ``data`` must carry ``name``. ``separator`` defaults to ``"."``, and a
    missing or null ``chunk_order`` yields a schema with no chunks. Each
    chunk entry must carry ``role``; ``optional`` defaults to False.

    Raises:
        ValueError: if a ``role`` value is not a valid :class:`TokenRole`,
            so a typo in a YAML file surfaces at construction time, not
            on first parse.
        KeyError: if ``name`` or an entry's ``role`` key is missing.
    """
    # ``chunk_order:`` with no value loads as None; treat it like an
    # absent key rather than failing on iteration.
    entries = data.get("chunk_order") or ()
    chunks = tuple(
        SchemaChunk(
            role=TokenRole(entry["role"]),
            optional=bool(entry.get("optional", False)),
        )
        for entry in entries
    )
    return GroupSchema(
        name=data["name"],
        separator=data.get("separator", "."),
        chunks=chunks,
    )
+# +# Examples seen in the wild: +# Back.in.Action.2025.1080p.WEBRip.x265-KONTRAST (movie) +# The.Long.Walk.2025.1080p.WEBRip.x265-KONTRAST (movie) +# Slow.Horses.S05E01.1080p.WEBRip.x265-KONTRAST (TV episode) +# Slow.Horses.S05.1080p.WEBRip.x265-KONTRAST (TV season pack) +# +# Schema is a left-to-right description of the canonical chunk order. +# Each entry is a role (matching TokenRole). Optional chunks are marked +# with `optional: true`. The parser consumes tokens greedily by role, +# skipping over optional chunks that don't match. + +name: KONTRAST +separator: "." + +# Canonical order of structural + technical chunks (left to right). +# `title` is special-cased as "everything up to the first non-title role". +chunk_order: + - role: title + - role: year + optional: true # absent on TV releases (S01E01 instead) + - role: season_episode + optional: true # absent on movies + - role: resolution # always present (1080p, 2160p, …) + - role: source # always present (WEBRip, BluRay, …) + - role: codec # always present (x265, x264, …) + - role: group # everything after the final `-` diff --git a/alfred/knowledge/release/release_groups/rarbg.yaml b/alfred/knowledge/release/release_groups/rarbg.yaml new file mode 100644 index 0000000..b312708 --- /dev/null +++ b/alfred/knowledge/release/release_groups/rarbg.yaml @@ -0,0 +1,20 @@ +# RARBG release naming schema. +# +# RARBG follows the canonical scene convention closely: +# Title.Year.Resolution.Source.Codec-RARBG +# For TV: +# Title.S01E01.Resolution.Source.Codec-RARBG + +name: RARBG +separator: "." 
class TestDetectGroup:
    """Right-to-left group detection on tokenized release names."""

    def test_codec_group(self) -> None:
        tokens, _ = tokenize(
            "Back.in.Action.2025.1080p.WEBRip.x265-KONTRAST", _KB
        )
        name, idx = _detect_group(tokens, _KB)
        assert name == "KONTRAST"
        assert idx == 6  # x265-KONTRAST is the 7th token

    def test_unknown_when_no_dash(self) -> None:
        tokens, _ = tokenize("Some.Movie.2020.1080p.WEBRip.x265.KONTRAST", _KB)
        # No dash anywhere → no group detected.
        name, idx = _detect_group(tokens, _KB)
        assert idx is None
        assert name == "UNKNOWN"

    def test_skips_dashed_source(self) -> None:
        # "Web-DL" must not be mistaken for a group token.
        tokens, _ = tokenize("Movie.2020.1080p.Web-DL.x265-GRP", _KB)
        name, idx = _detect_group(tokens, _KB)
        assert name == "GRP"
        # Pin the carrier too: x265-GRP is the 5th token, so the group was
        # not read off the Web-DL token at index 3.
        assert idx == 4
class TestAnnotateEasy:
    """Schema-driven annotation for releases from known groups."""

    @staticmethod
    def _run(name: str):
        """Tokenize ``name`` and return whatever ``annotate`` yields."""
        tokens, _ = tokenize(name, _KB)
        return annotate(tokens, _KB)

    def test_kontrast_movie(self) -> None:
        annotated = self._run("Back.in.Action.2025.1080p.WEBRip.x265-KONTRAST")
        assert annotated is not None, "KONTRAST should hit the EASY path"

        expected = [TokenRole.TITLE] * 3 + [  # Back / in / Action
            TokenRole.YEAR,
            TokenRole.RESOLUTION,
            TokenRole.SOURCE,
            TokenRole.CODEC,  # x265-KONTRAST → CODEC with extra.group=KONTRAST
        ]
        assert [t.role for t in annotated] == expected

        trailing = annotated[-1]
        assert trailing.extra["group"] == "KONTRAST"
        assert trailing.extra["codec"] == "x265"

    def test_kontrast_tv_episode(self) -> None:
        annotated = self._run("Slow.Horses.S05E01.1080p.WEBRip.x265-KONTRAST")
        assert annotated is not None

        # The optional year chunk is absent and skipped; the
        # season/episode chunk is present.
        roles = {t.role for t in annotated}
        assert TokenRole.SEASON_EPISODE in roles
        assert TokenRole.YEAR not in roles

    def test_elite_no_source(self) -> None:
        # The ELiTE schema marks source as optional; this season pack
        # goes straight from resolution to codec.
        annotated = self._run("Foundation.S02.1080p.x265-ELiTE")
        assert annotated is not None, "ELiTE optional source must be tolerated"

        roles = {t.role for t in annotated}
        assert TokenRole.SOURCE not in roles
        assert TokenRole.RESOLUTION in roles
        assert TokenRole.CODEC in roles

    def test_unknown_group_returns_none(self) -> None:
        # RANDOM has no schema in release_groups/, so annotate returns
        # None and the caller falls back to SHITTY.
        assert self._run("Some.Movie.2020.1080p.WEBRip.x264-RANDOM") is None
class TestAssemble:
    """Field assembly from an annotated EASY-path token stream."""

    @staticmethod
    def _fields(name: str) -> dict:
        """Run tokenize → annotate → assemble on ``name``.

        ``annotate`` returns ``None`` when the EASY path does not apply.
        Fail here with a clear message instead of passing ``None`` on to
        ``assemble``.
        """
        tokens, tag = tokenize(name, _KB)
        annotated = annotate(tokens, _KB)
        assert annotated is not None, f"{name} should hit the EASY path"
        return assemble(annotated, tag, name, _KB)

    def test_kontrast_movie_fields(self) -> None:
        fields = self._fields("Back.in.Action.2025.1080p.WEBRip.x265-KONTRAST")

        assert fields["title"] == "Back.in.Action"
        assert fields["year"] == 2025
        assert fields["season"] is None
        assert fields["quality"] == "1080p"
        assert fields["source"] == "WEBRip"
        assert fields["codec"] == "x265"
        assert fields["group"] == "KONTRAST"
        assert fields["tech_string"] == "1080p.WEBRip.x265"
        assert fields["media_type"] == "movie"
        assert fields["site_tag"] is None

    def test_kontrast_tv_fields(self) -> None:
        fields = self._fields("Slow.Horses.S05E01.1080p.WEBRip.x265-KONTRAST")

        assert fields["title"] == "Slow.Horses"
        assert fields["year"] is None
        assert fields["season"] == 5
        assert fields["episode"] == 1
        assert fields["media_type"] == "tv_show"
        assert fields["group"] == "KONTRAST"

    def test_elite_season_pack(self) -> None:
        fields = self._fields("Foundation.S02.1080p.x265-ELiTE")

        assert fields["title"] == "Foundation"
        assert fields["season"] == 2
        assert fields["episode"] is None  # season pack
        assert fields["source"] is None  # ELiTE omits it
        assert fields["tech_string"] == "1080p.x265"
        assert fields["group"] == "ELiTE"