From a2c917618f15a572ecfeef64401910e07042c45b Mon Sep 17 00:00:00 2001 From: Francwa Date: Wed, 20 May 2026 00:12:33 +0200 Subject: [PATCH] feat(release): scaffold v2 parser package (annotate-based pipeline) New package alfred/domain/release/parser/ lays the foundation for the release parser refactor (specs in memory). Exposes: - Token: frozen VO carrying text + stream index + TokenRole + extra dict. with_role() returns a new instance (no mutation). - TokenRole: str-backed enum split into structural (TITLE/YEAR/SEASON_EP/ GROUP), technical (RESOLUTION/SOURCE/CODEC/AUDIO_*/BIT_DEPTH/HDR/ EDITION/LANGUAGE), and meta (SITE_TAG/UNKNOWN) families. - pipeline.strip_site_tag(): pulls a [site.tag] prefix or suffix. - pipeline.tokenize(): release name -> list[Token] (all UNKNOWN), string-ops split on kb.separators (no regex, per CLAUDE.md). - pipeline.annotate(): documented stub. Walk order recorded in docstring (group right-to-left, then season/episode, year, tech, title). Legacy parse_release in release.services remains the live implementation until the annotate step lands. Scaffolding tests verify Token API, site-tag stripping (prefix/suffix), and tokenize output shape. 
Refs: project_release_parser_v2_specs (memory) --- CHANGELOG.md | 10 ++ alfred/domain/release/parser/__init__.py | 30 +++++ alfred/domain/release/parser/pipeline.py | 115 ++++++++++++++++++ alfred/domain/release/parser/tokens.py | 89 ++++++++++++++ tests/domain/release/__init__.py | 0 .../release/test_parser_v2_scaffolding.py | 79 ++++++++++++ 6 files changed, 323 insertions(+) create mode 100644 alfred/domain/release/parser/__init__.py create mode 100644 alfred/domain/release/parser/pipeline.py create mode 100644 alfred/domain/release/parser/tokens.py create mode 100644 tests/domain/release/__init__.py create mode 100644 tests/domain/release/test_parser_v2_scaffolding.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 575e567..a8d37ec 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,16 @@ callers). ### Added +- **Release parser v2 scaffolding** (`alfred/domain/release/parser/`): + new package laying the foundation for an annotate-based pipeline + (tokenize → annotate → assemble). Exposes `Token` (frozen VO with + `index` + `role` + `extra`), `TokenRole` enum (structural / technical / + meta families), and a `pipeline.py` module with working `strip_site_tag` + + `tokenize` and a documented `annotate` stub. Legacy `parse_release` + in `release.services` remains the live implementation until the + annotate step is wired in. Scaffolding tests in + `tests/domain/release/test_parser_v2_scaffolding.py`. + - **Real-world release fixtures** under `tests/fixtures/releases/{easy,shitty,path_of_pain}/`, each documenting an expected `ParsedRelease` plus the future `routing` (library / torrents / seed_hardlinks) for the upcoming `organize_media` diff --git a/alfred/domain/release/parser/__init__.py b/alfred/domain/release/parser/__init__.py new file mode 100644 index 0000000..24b33b2 --- /dev/null +++ b/alfred/domain/release/parser/__init__.py @@ -0,0 +1,30 @@ +"""Release parser v2 — annotate-based pipeline. + +This package is the future home of ``parse_release``. 
It restructures the +parsing logic around a **tokenize → annotate → assemble** pipeline: + +1. **tokenize**: split the release name into atomic tokens. +2. **annotate**: walk the tokens, assigning each one a + :class:`TokenRole` (TITLE, YEAR, SEASON_EPISODE, RESOLUTION, …) using the + injected :class:`~alfred.domain.release.ports.knowledge.ReleaseKnowledge`. +3. **assemble**: fold the annotated tokens into a :class:`ParsedRelease`. + +The pipeline has three internal paths driven by the detected release group: + +- **EASY**: known group (KONTRAST, RARBG, …) with a schema-driven layout + declared in ``knowledge/release/release_groups/<group>.yaml``. +- **SHITTY**: unknown group, best-effort matching against the global + knowledge sets, with a 0-100 confidence score. +- **PATH OF PAIN**: score below threshold OR critical chunks missing — + signaled to the caller, who decides whether to involve the LLM/user. + +Today the package exposes scaffolding only (token VOs and a thin pipeline +stub). The legacy ``parse_release`` in ``release.services`` keeps serving +production until each piece of the v2 pipeline is wired in. +""" + +from __future__ import annotations + +from .tokens import Token, TokenRole + +__all__ = ["Token", "TokenRole"] diff --git a/alfred/domain/release/parser/pipeline.py b/alfred/domain/release/parser/pipeline.py new file mode 100644 index 0000000..97e3c21 --- /dev/null +++ b/alfred/domain/release/parser/pipeline.py @@ -0,0 +1,115 @@ +"""Annotate-based pipeline skeleton. + +The pipeline is **declared here** in three named stages, but actual logic +is wired in incrementally — current state is intentional scaffolding. + +Stages: + +1. :func:`tokenize` — release name → ``list[Token]`` (all UNKNOWN). Also + pulls out a leading/trailing site tag (e.g. ``[YTS.MX]``) which is + returned separately and never tokenized. +2. :func:`annotate` — walk the tokens, promote roles using + :class:`~alfred.domain.release.ports.knowledge.ReleaseKnowledge`. 
The + walk is **right-to-left for the group** (scene convention puts it + last) and **left-to-right for the title** (which is always leftmost). +3. :func:`assemble` — fold the annotated stream into a domain VO. Output + type still TBD: the migration target is the existing + :class:`~alfred.domain.release.value_objects.ParsedRelease`, but the + pipeline may grow an intermediate :class:`AnnotatedRelease` first to + keep the score / leftover-tokens information that ``ParsedRelease`` + doesn't carry today. + +Road dispatch (EASY / SHITTY / PATH OF PAIN) happens **inside** +:func:`annotate` — once the group is identified (or not), the annotator +picks the right strategy. EASY consults a per-group schema; SHITTY runs +the generic matcher loop; PATH OF PAIN is a return-state, not a +separate path — the caller (``application/release/inspect.py``) decides +what to do with a low-confidence result. +""" + +from __future__ import annotations + +from ..ports.knowledge import ReleaseKnowledge +from .tokens import Token + + +def strip_site_tag(name: str) -> tuple[str, str | None]: + """Split off a ``[site.tag]`` prefix or suffix. + + The bracketed substring is removed from ``name`` and returned as the + second element. If no tag is found, returns ``(name.strip(), None)``. + """ + s = name.strip() + + if s.startswith("["): + close = s.find("]") + if close != -1: + tag = s[1:close].strip() + remainder = s[close + 1 :].strip() + if tag and remainder: + return remainder, tag + + if s.endswith("]"): + open_bracket = s.rfind("[") + if open_bracket != -1: + tag = s[open_bracket + 1 : -1].strip() + remainder = s[:open_bracket].strip() + if tag and remainder: + return remainder, tag + + return s, None + + +def tokenize(name: str, kb: ReleaseKnowledge) -> tuple[list[Token], str | None]: + """Split ``name`` into tokens after stripping any site tag. + + Returns ``(tokens, site_tag)``. All tokens start with role + :attr:`~.tokens.TokenRole.UNKNOWN` — promotion happens in + :func:`annotate`. 
+ + The tokenizer is a pure character-class split on ``kb.separators``. + String-ops style: no regex (keeps the rule from CLAUDE.md), at the + cost of one pass per separator. The release names we parse are short + (<200 chars), so the constant factor is irrelevant. + """ + clean, site_tag = strip_site_tag(name) + + # Replace every separator with a single delimiter, then split. Using + # \x00 because it cannot legally appear in a release name. + DELIM = "\x00" + buf = clean + for sep in kb.separators: + if sep != DELIM: + buf = buf.replace(sep, DELIM) + + pieces = [p for p in buf.split(DELIM) if p] + tokens = [Token(text=p, index=i) for i, p in enumerate(pieces)] + return tokens, site_tag + + +def annotate(tokens: list[Token], kb: ReleaseKnowledge) -> list[Token]: + """Promote each token's role using ``kb``. + + **Not implemented yet.** Returns the input unchanged so the package + is importable and the pipeline shape is visible. Will be filled in + by subsequent commits, one role family at a time. + + The intended walk order, once implemented: + + 1. **Group (right-to-left)** — find the trailing ``-GROUP`` token, + which also reveals the codec when shaped as ``codec-GROUP``. If + the group matches a schema in ``knowledge/release/release_groups/`` + → EASY path; otherwise SHITTY. + 2. **Season/episode** — single-token scan, ``S01E05`` / ``1x05``. + 3. **Year** — first 4-digit token in [1900, 2099] *after* index 0. + 4. **Tech tokens** — resolutions, sources, codecs, audio, video meta, + editions, languages. Multi-token sequences (``DTS.HD.MA``, + ``Directors.Cut``) handled first to avoid greedy single-token + claims swallowing a sequence prefix. + 5. **Title** — leftmost contiguous UNKNOWN tokens up to the first + structural/technical role boundary. + """ + # TODO(parser-v2): implement annotation. See module docstring for the + # walk order. Until then, the legacy parse_release in + # release.services is the live implementation. 
+ return tokens diff --git a/alfred/domain/release/parser/tokens.py b/alfred/domain/release/parser/tokens.py new file mode 100644 index 0000000..8eb3b44 --- /dev/null +++ b/alfred/domain/release/parser/tokens.py @@ -0,0 +1,89 @@ +"""Token value objects for the annotate-based parser. + +A :class:`Token` carries both the original substring and its position in +the original release name's token stream. A :class:`TokenRole` is the +semantic tag assigned by the annotator. + +Why VOs instead of bare ``str``: the annotate step needs to flag tokens +without consuming them (a token may carry residual info — e.g. a +``codec-GROUP`` token contributes both a CODEC and a GROUP role). Tracking +the index also lets later stages reason about *order* (year must come +after title, group must be rightmost, etc.) without re-scanning the list. +""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from enum import Enum + + +class TokenRole(str, Enum): + """Semantic role a token can take after annotation. + + A token starts as ``UNKNOWN`` and may be promoted by the annotator. + ``str``-backed for cheap comparisons and YAML/JSON interop. + + Roles split into three families: + + - **structural**: TITLE / YEAR / SEASON_EPISODE / GROUP — drive folder + and filename naming. + - **technical**: RESOLUTION / SOURCE / CODEC / AUDIO_CODEC / + AUDIO_CHANNELS / BIT_DEPTH / HDR / EDITION / LANGUAGE — feed + ``tech_string`` and metadata fields. + - **meta**: SITE_TAG (stripped pre-tokenize), SEPARATOR (kept for the + assemble step if a release uses spaces that need preservation in the + title), UNKNOWN (residual, contributes to the SHITTY score penalty). 
+ """ + + UNKNOWN = "unknown" + + # Structural + TITLE = "title" + YEAR = "year" + SEASON_EPISODE = "season_episode" + GROUP = "group" + + # Technical + RESOLUTION = "resolution" + SOURCE = "source" + CODEC = "codec" + AUDIO_CODEC = "audio_codec" + AUDIO_CHANNELS = "audio_channels" + BIT_DEPTH = "bit_depth" + HDR = "hdr" + EDITION = "edition" + LANGUAGE = "language" + + # Meta + SITE_TAG = "site_tag" + + +@dataclass(frozen=True) +class Token: + """An atomic token from a release name. + + ``text`` is the substring exactly as it appeared after tokenization + (case preserved — uppercase comparisons happen at match time). + ``index`` is the 0-based position in the tokenized stream, used by + downstream stages to enforce ordering invariants. + + ``role`` defaults to :attr:`TokenRole.UNKNOWN`. The annotator returns + new :class:`Token` instances with the role set rather than mutating + (the dataclass is frozen). ``extra`` carries role-specific payload + when the token text alone isn't enough (e.g. a ``codec-GROUP`` token + annotated as CODEC may record the group name in ``extra["group"]``). 
+ """ + + text: str + index: int + role: TokenRole = TokenRole.UNKNOWN + extra: dict[str, str] = field(default_factory=dict) + + def with_role(self, role: TokenRole, **extra: str) -> Token: + """Return a copy of this token with ``role`` (and optional ``extra``).""" + merged = {**self.extra, **extra} if extra else self.extra + return Token(text=self.text, index=self.index, role=role, extra=merged) + + @property + def is_annotated(self) -> bool: + return self.role is not TokenRole.UNKNOWN diff --git a/tests/domain/release/__init__.py b/tests/domain/release/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/domain/release/test_parser_v2_scaffolding.py b/tests/domain/release/test_parser_v2_scaffolding.py new file mode 100644 index 0000000..995c242 --- /dev/null +++ b/tests/domain/release/test_parser_v2_scaffolding.py @@ -0,0 +1,79 @@ +"""Scaffolding tests for the v2 parser package. + +These tests lock the **shape** of the new pipeline (token VOs, tokenize +output, site-tag stripping) before the annotate step is wired in. They +do not check parsed-release output yet — that comes once :func:`annotate` +is implemented and the fixtures-based suite switches over. 
from __future__ import annotations

from alfred.domain.release.parser import Token, TokenRole
from alfred.domain.release.parser.pipeline import strip_site_tag, tokenize
from alfred.infrastructure.knowledge.release_kb import YamlReleaseKnowledge

# Shared knowledge base instance, built once for the whole module.
_KB = YamlReleaseKnowledge()


class TestToken:
    """Contract of the ``Token`` value object."""

    def test_default_role_is_unknown(self) -> None:
        t = Token(text="1080p", index=3)
        assert t.role is TokenRole.UNKNOWN
        assert not t.is_annotated

    def test_with_role_returns_new_instance(self) -> None:
        t = Token(text="1080p", index=3)
        promoted = t.with_role(TokenRole.RESOLUTION)
        assert promoted is not t
        assert promoted.role is TokenRole.RESOLUTION
        assert promoted.is_annotated
        assert t.role is TokenRole.UNKNOWN  # original unchanged (frozen)

    def test_with_role_sets_extra(self) -> None:
        t = Token(text="x265-KONTRAST", index=5)
        promoted = t.with_role(TokenRole.CODEC, group="KONTRAST")
        assert promoted.role is TokenRole.CODEC
        assert promoted.extra == {"group": "KONTRAST"}

    def test_with_role_merges_extra(self) -> None:
        # Start from a non-empty payload so an actual merge is exercised.
        t = Token(text="x265-KONTRAST", index=5, extra={"source": "WEBRip"})
        promoted = t.with_role(TokenRole.CODEC, group="KONTRAST")
        assert promoted.extra == {"source": "WEBRip", "group": "KONTRAST"}
        # The source token's payload must not pick up the new key.
        assert t.extra == {"source": "WEBRip"}


class TestStripSiteTag:
    """Prefix / suffix ``[site.tag]`` extraction."""

    def test_no_tag(self) -> None:
        clean, tag = strip_site_tag("The.Movie.2020.1080p-GRP")
        assert tag is None
        assert clean == "The.Movie.2020.1080p-GRP"

    def test_suffix_tag(self) -> None:
        clean, tag = strip_site_tag("Sinners.2025.1080p-[YTS.MX]")
        assert tag == "YTS.MX"
        assert clean == "Sinners.2025.1080p-"

    def test_prefix_tag(self) -> None:
        clean, tag = strip_site_tag("[ OxTorrent.vc ] The.Title.S01E01")
        assert tag == "OxTorrent.vc"
        assert clean == "The.Title.S01E01"


class TestTokenize:
    """Shape of the ``tokenize`` output."""

    def test_simple_release(self) -> None:
        tokens, tag = tokenize("Back.in.Action.2025.1080p.WEBRip.x265-KONTRAST", _KB)
        assert tag is None
        texts = [t.text for t in tokens]
        # Dash is not a separator, so x265-KONTRAST stays glued.
        assert texts == [
            "Back", "in", "Action", "2025", "1080p", "WEBRip", "x265-KONTRAST",
        ]

    def test_all_tokens_start_unknown(self) -> None:
        tokens, _ = tokenize("Back.in.Action.2025.1080p.WEBRip.x265-KONTRAST", _KB)
        assert all(t.role is TokenRole.UNKNOWN for t in tokens)

    def test_indexes_are_contiguous(self) -> None:
        tokens, _ = tokenize("A.B.C.D", _KB)
        assert [t.index for t in tokens] == [0, 1, 2, 3]

    def test_strips_site_tag_before_tokenize(self) -> None:
        tokens, tag = tokenize(
            "Sinners.2025.1080p.WEBRip.x265.10bit.AAC5.1-[YTS.MX]", _KB
        )
        assert tag == "YTS.MX"
        # Site tag substring must not appear among tokens.
        assert not any("YTS" in t.text for t in tokens)