refactor(release): simplify SHITTY to dict-driven token tagging

Replace the ~480-line legacy heuristic block in services.py with a small dict-driven pass in pipeline._annotate_shitty: each token is looked up against the kb buckets (resolutions / sources / codecs / distributors / year / sxxexx) with first-match-wins semantics, the leftmost contiguous UNKNOWN run becomes the title, done. SHITTY's scope is intentionally narrow — releases that *look* like scene names but don't have a registered group schema. Anything more exotic (parenthesized tech, bare-dashed title fragments, YT slugs, franchise boxes) is PATH OF PAIN territory and stays out of here. - annotate() no longer returns None; SHITTY is the always-on fallback - services.py shrunk from ~525 to ~85 lines (legacy extractors gone) - 4 fixtures get xfail markers documenting PoP-grade pathologies (deutschland franchise box, sleaford YT slug, super_mario bilingual, predator space-separators — the last one moved from shitty/ → pop/) - ReleaseFixture grows xfail_reason; the parametrized suite wires the pytest.mark.xfail(strict=False) automatically
2026-05-20 01:03:25 +02:00
parent fd3bd1ad8c
commit 3737f66851
9 changed files with 231 additions and 502 deletions
@@ -306,6 +306,15 @@ def _find_title_end(
            return i
        if lower in kb.codecs:
            return i
        # codec-GROUP token (e.g. "x265-KONTRAST") or dashed source (Web-DL).
        if "-" in text:
            head, _, _ = text.rpartition("-")
            if (
                head.lower() in kb.codecs
                or head.lower() in kb.sources
                or text.lower().replace("-", "") in kb.sources
            ):
                return i
    return body_end
@@ -329,6 +338,81 @@ def _find_chunk(
    return None
 # ---------------------------------------------------------------------------
 # Stage 2b' — SHITTY annotation (schema-less heuristic)
 # ---------------------------------------------------------------------------
 def _annotate_shitty(
    tokens: list[Token],
    kb: ReleaseKnowledge,
    group_index: int | None,
 ) -> list[Token]:
    """Annotate ``tokens`` without a group schema, using kb lookups only.

    This is the narrow, dictionary-driven fallback for names that look
    like scene releases but have no registered group schema. Steps, in
    order:

    1. When ``group_index`` is given, the token at that index becomes
       CODEC (if ``_split_codec_group`` can split it; the token then
       carries ``codec`` and ``group``) or GROUP (named after the text
       following its last dash). An empty group name is replaced by
       ``"UNKNOWN"``.
    2. ``_annotate_enrichers`` runs over the whole list.
    3. Every token that is still UNKNOWN is tried against
       season/episode, year, resolution, distributor, source and codec,
       in that order. The first role that matches is applied; a role
       applied during this step is not applied to any later token.
    4. The UNKNOWN tokens at the very start of the list, up to the first
       token carrying any other role, become TITLE. If the first token
       is already tagged, no TITLE is assigned.

    Anything needing more reasoning than a bucket lookup is deliberately
    left UNKNOWN (PATH OF PAIN territory).

    Returns a new list; the ``tokens`` list passed in is not reassigned.
    """
    annotated = list(tokens)

    # Step 1: the group token (same treatment as the EASY path).
    if group_index is not None:
        group_tok = annotated[group_index]
        split = _split_codec_group(group_tok.text, kb)
        if split is None:
            suffix = group_tok.text.rpartition("-")[2]
            annotated[group_index] = group_tok.with_role(
                TokenRole.GROUP, group=suffix or "UNKNOWN"
            )
        else:
            codec_name, group_name = split
            annotated[group_index] = group_tok.with_role(
                TokenRole.CODEC, codec=codec_name, group=group_name or "UNKNOWN"
            )

    # Step 2: audio / video-meta / edition / language enrichers.
    annotated = _annotate_enrichers(annotated, kb)

    # Step 3: bucket lookups. ``claimed`` holds the roles handed out in
    # this step so that each one is given to at most one token.
    claimed: set[TokenRole] = set()
    for idx, tok in enumerate(annotated):
        if tok.role is not TokenRole.UNKNOWN:
            continue
        text = tok.text
        if (
            TokenRole.SEASON_EPISODE not in claimed
            and _parse_season_episode(text) is not None
        ):
            role = TokenRole.SEASON_EPISODE
        elif TokenRole.YEAR not in claimed and _is_year(text):
            role = TokenRole.YEAR
        elif TokenRole.RESOLUTION not in claimed and text.lower() in kb.resolutions:
            role = TokenRole.RESOLUTION
        elif TokenRole.DISTRIBUTOR not in claimed and text.upper() in kb.distributors:
            role = TokenRole.DISTRIBUTOR
        elif TokenRole.SOURCE not in claimed and text.lower() in kb.sources:
            role = TokenRole.SOURCE
        elif TokenRole.CODEC not in claimed and text.lower() in kb.codecs:
            role = TokenRole.CODEC
        else:
            # No free bucket matched: the token stays UNKNOWN.
            continue
        annotated[idx] = tok.with_role(role)
        claimed.add(role)

    # Step 4: the leading UNKNOWN run is the title.
    pos = 0
    while pos < len(annotated) and annotated[pos].role is TokenRole.UNKNOWN:
        annotated[pos] = annotated[pos].with_role(TokenRole.TITLE)
        pos += 1
    return annotated
 # ---------------------------------------------------------------------------
 # Stage 2c — enricher pass (non-positional roles)
 # ---------------------------------------------------------------------------
@@ -394,6 +478,9 @@ def _annotate_enrichers(tokens: list[Token], kb: ReleaseKnowledge) -> list[Token
        if upper in kb.language_tokens:
            result[i] = tok.with_role(TokenRole.LANGUAGE)
            continue
        if upper in kb.distributors:
            result[i] = tok.with_role(TokenRole.DISTRIBUTOR)
            continue
    return result
@@ -474,26 +561,42 @@ def _detect_channel_pairs(
 # ---------------------------------------------------------------------------
-def annotate(tokens: list[Token], kb: ReleaseKnowledge) -> list[Token] | None:
+def annotate(tokens: list[Token], kb: ReleaseKnowledge) -> list[Token]:
-    """Annotate token roles. Returns ``None`` when the EASY path fails.
+    """Annotate token roles.
-    A ``None`` return means: the group is unknown, OR the schema walk
+    Dispatch:
-    aborted on a mandatory mismatch. The caller falls back to the legacy
+
-    SHITTY heuristic in that case.
+    * If a group is detected AND has a known schema, run the EASY
      structural walk. If the schema walk aborts on a mandatory chunk
      mismatch, fall through to SHITTY (the heuristic still does better
      than giving up).
    * Otherwise run SHITTY — schema-less, best-effort, never aborts.
    The enricher pass runs in both cases. The pipeline always returns a
    populated token list; downstream callers don't need to distinguish
    EASY vs SHITTY at this layer (the parse_path is decided in the
    service based on whether a schema matched).
    """
    group_name, group_index = _detect_group(tokens, kb)
    schema = kb.group_schema(group_name) if group_index is not None else None
    if schema is not None and group_index is not None:
        structural = _annotate_structural(tokens, kb, schema, group_index)
        if structural is not None:
            return _annotate_enrichers(structural, kb)
    # SHITTY fallback — heuristic positional pass. ``_annotate_shitty``
    # runs its own enricher pass internally (it has to, so the title
    # scan stops at the first enricher-tagged token instead of
    # absorbing it into the title).
    return _annotate_shitty(tokens, kb, group_index)
 def has_known_schema(tokens: list[Token], kb: ReleaseKnowledge) -> bool:
    """Return True if ``tokens`` would take the EASY path in :func:`annotate`."""
    group_name, group_index = _detect_group(tokens, kb)
    if group_index is None:
-        return None
+        return False
-
+    return kb.group_schema(group_name) is not None
    schema = kb.group_schema(group_name)
    if schema is None:
        return None
    structural = _annotate_structural(tokens, kb, schema, group_index)
    if structural is None:
        return None
    return _annotate_enrichers(structural, kb)
 # ---------------------------------------------------------------------------
@@ -531,6 +634,7 @@ def assemble(
    bit_depth: str | None = None
    hdr_format: str | None = None
    edition: str | None = None
    distributor: str | None = None
    languages: list[str] = []
    for tok in annotated:
@@ -572,16 +676,33 @@ def assemble(
                edition = tok.extra.get("sequence", tok.text.upper())
        elif role is TokenRole.LANGUAGE:
            languages.append(tok.text.upper())
        elif role is TokenRole.DISTRIBUTOR:
            if distributor is None:
                distributor = tok.text.upper()
    tech_parts = [p for p in (quality, source, codec) if p]
    tech_string = ".".join(tech_parts)
-    # Media type heuristic — same rules as the legacy parser, minus the
+    # Media type heuristic. Doc/concert/integrale tokens win over the
-    # documentary/concert/integrale specials (handled by SHITTY for now).
+    # generic tech-based fallback. We look across all tokens (not just
-    if season is not None:
+    # annotated ones) because these markers may be tagged UNKNOWN by the
-        media_type = "tv_show"
+    # structural pass — only the assemble step cares about them.
-    elif edition in {"COMPLETE", "INTEGRALE", "COLLECTION"}:
+    upper_tokens = {tok.text.upper() for tok in annotated}
    doc_tokens = {t.upper() for t in kb.media_type_tokens.get("doc", [])}
    concert_tokens = {t.upper() for t in kb.media_type_tokens.get("concert", [])}
    integrale_tokens = {t.upper() for t in kb.media_type_tokens.get("integrale", [])}
    if upper_tokens & doc_tokens:
        media_type = "documentary"
    elif upper_tokens & concert_tokens:
        media_type = "concert"
    elif (
        edition in {"COMPLETE", "INTEGRALE", "COLLECTION"}
        or upper_tokens & integrale_tokens
    ) and season is None:
        media_type = "tv_complete"
    elif season is not None:
        media_type = "tv_show"
    elif any((quality, source, codec, year)):
        media_type = "movie"
    else:
@@ -607,4 +728,5 @@ def assemble(
        "bit_depth": bit_depth,
        "hdr_format": hdr_format,
        "edition": edition,
        "distributor": distributor,
    }
@@ -1,57 +1,46 @@
-"""Release domain — parsing service."""
+"""Release domain — parsing service.
 Thin orchestrator over the annotate-based pipeline in
 :mod:`alfred.domain.release.parser.pipeline`. Responsibilities:
 * Strip a leading/trailing ``[site.tag]`` and decide ``parse_path``.
 * Reject malformed names (forbidden characters) → ``parse_path=AI`` so
  the LLM can clean them up.
 * Otherwise call the v2 pipeline (tokenize → annotate → assemble) and
  wrap the result in :class:`ParsedRelease`.
 All structural and enricher logic now lives in the pipeline. This file
 no longer carries field extractors — the heuristic SHITTY path is part
 of :func:`~alfred.domain.release.parser.pipeline.annotate`.
 """
 from __future__ import annotations
 import re
 from .parser import pipeline as _v2
 from .ports import ReleaseKnowledge
 from .value_objects import MediaTypeToken, ParsedRelease, ParsePath
 def _tokenize(name: str, kb: ReleaseKnowledge) -> list[str]:
    """Split a release name on the configured separators, dropping empty tokens."""
    pattern = "[" + re.escape("".join(kb.separators)) + "]+"
    return [t for t in re.split(pattern, name) if t]
 def parse_release(name: str, kb: ReleaseKnowledge) -> ParsedRelease:
-    """
+    """Parse a release name and return a :class:`ParsedRelease`.
    Parse a release name and return a ParsedRelease.
    Flow:
-      1. Strip a leading/trailing [site.tag] if present (sets parse_path="sanitized").
+
-      2. Check the remainder for truly forbidden chars (anything not in the
+    1. Strip a leading/trailing ``[site.tag]`` if present (sets
-         configured separators list). If any remain → media_type="unknown",
+       ``parse_path="sanitized"``).
-         parse_path="ai", and the LLM handles it.
+    2. If the remainder still contains truly forbidden chars (anything
-      3. Tokenize using the configured separators (".", " ", "[", "]", "(", ")", "_", ...)
+       not in the configured separators), short-circuit to
-         and run token-level matchers (season/episode, tech, languages, audio,
+       ``media_type="unknown"`` / ``parse_path="ai"`` — the LLM handles
-         video, edition, title, year).
+       these.
    3. Otherwise run the v2 pipeline: tokenize → annotate (EASY when a
       group schema is known, SHITTY otherwise) → assemble.
    """
    parse_path = ParsePath.DIRECT.value
-    # Always try to extract a bracket-enclosed site tag first.
+    clean, site_tag = _v2.strip_site_tag(name)
    clean, site_tag = _strip_site_tag(name)
    if site_tag is not None:
        parse_path = ParsePath.SANITIZED.value
    # --- v2 parser: EASY path for known groups -----------------------------
    # If the v2 pipeline recognizes the release group (KONTRAST, ELiTE, …)
    # and the schema walk succeeds, return its result. On any mismatch
    # (unknown group, schema abort) ``annotate`` returns None and we
    # fall back to the legacy heuristic below.
    v2_tokens, v2_tag = _v2.tokenize(name, kb)
    v2_annotated = _v2.annotate(v2_tokens, kb)
    if v2_annotated is not None:
        fields = _v2.assemble(v2_annotated, v2_tag, name, kb)
        return ParsedRelease(
            raw=name,
            normalised=clean,
            parse_path=parse_path,
            **fields,
        )
    # ---------------------------------------------------------------------
    if not _is_well_formed(clean, kb):
        return ParsedRelease(
            raw=name,
@@ -72,453 +61,26 @@ def parse_release(name: str, kb: ReleaseKnowledge) -> ParsedRelease:
            parse_path=ParsePath.AI.value,
        )
-    name = clean
+    tokens, v2_tag = _v2.tokenize(name, kb)
-    tokens = _tokenize(name, kb)
+    annotated = _v2.annotate(tokens, kb)
-
+    fields = _v2.assemble(annotated, v2_tag, name, kb)
    season, episode, episode_end = _extract_season_episode(tokens)
    quality, source, codec, group, tech_tokens = _extract_tech(tokens, kb)
    languages, lang_tokens = _extract_languages(tokens, kb)
    audio_codec, audio_channels, audio_tokens = _extract_audio(tokens, kb)
    bit_depth, hdr_format, video_tokens = _extract_video_meta(tokens, kb)
    edition, edition_tokens = _extract_edition(tokens, kb)
    title = _extract_title(
        tokens,
        tech_tokens | lang_tokens | audio_tokens | video_tokens | edition_tokens,
        kb,
    )
    year = _extract_year(tokens, title)
    media_type = _infer_media_type(
        season, quality, source, codec, year, edition, tokens, kb
    )
    tech_parts = [p for p in [quality, source, codec] if p]
    tech_string = ".".join(tech_parts)
    return ParsedRelease(
        raw=name,
-        normalised=name,
+        normalised=clean,
        title=title,
        title_sanitized=kb.sanitize_for_fs(title),
        year=year,
        season=season,
        episode=episode,
        episode_end=episode_end,
        quality=quality,
        source=source,
        codec=codec,
        group=group,
        tech_string=tech_string,
        media_type=media_type,
        site_tag=site_tag,
        parse_path=parse_path,
-        languages=languages,
+        **fields,
        audio_codec=audio_codec,
        audio_channels=audio_channels,
        bit_depth=bit_depth,
        hdr_format=hdr_format,
        edition=edition,
    )
 def _infer_media_type(
    season: int | None,
    quality: str | None,
    source: str | None,
    codec: str | None,
    year: int | None,
    edition: str | None,
    tokens: list[str],
    kb: ReleaseKnowledge,
 ) -> str:
    """Derive the media type from token-level evidence (no filesystem access).

    Rules, first match wins:

    - documentary : a token from ``kb.media_type_tokens["doc"]`` is present
    - concert     : a token from ``kb.media_type_tokens["concert"]`` is present
    - tv_show     : a season was parsed
    - tv_complete : no season, and either the edition is
                    COMPLETE/INTEGRALE/COLLECTION or a token from
                    ``kb.media_type_tokens["integrale"]`` is present
    - movie       : any of quality, source, codec or year is set
    - unknown     : none of the above

    Marker comparison is case-insensitive.
    """
    present = {tok.upper() for tok in tokens}
    markers = {
        bucket: {marker.upper() for marker in kb.media_type_tokens.get(bucket, [])}
        for bucket in ("doc", "concert", "integrale")
    }

    if present & markers["doc"]:
        return MediaTypeToken.DOCUMENTARY.value
    if present & markers["concert"]:
        return MediaTypeToken.CONCERT.value
    if season is not None:
        return MediaTypeToken.TV_SHOW.value
    # From here on there is no season.
    if edition in {"COMPLETE", "INTEGRALE", "COLLECTION"} or present & markers["integrale"]:
        return MediaTypeToken.TV_COMPLETE.value
    if quality or source or codec or year:
        return MediaTypeToken.MOVIE.value
    return MediaTypeToken.UNKNOWN.value
 def _is_well_formed(name: str, kb: ReleaseKnowledge) -> bool:
-    """Return True if name contains no forbidden characters per scene naming rules.
+    """Return True if ``name`` contains no forbidden characters per scene
    naming rules.
-    Characters listed as token separators (spaces, brackets, parens, …) are NOT
+    Characters listed as token separators (spaces, brackets, parens, …)
-    considered malforming — the tokenizer handles them. Only truly broken chars
+    are NOT considered malforming — the tokenizer handles them. Only
-    like '@', '#', '!', '%' make a name malformed.
+    truly broken chars like ``@``, ``#``, ``!``, ``%`` make a name
    malformed.
    """
    tokenizable = set(kb.separators)
    return not any(c in name for c in kb.forbidden_chars if c not in tokenizable)
 def _strip_site_tag(name: str) -> tuple[str, str | None]:
    """
    Strip a site watermark tag from the release name and return (clean_name, tag).
    Handles two positions:
    - Prefix:  "[ OxTorrent.vc ] The.Title.S01..."
    - Suffix:  "The.Title.S01...-NTb[TGx]"
    Anything between [...] is treated as a site tag.
    Returns (original_name, None) if no tag found.
    """
    s = name.strip()
    if s.startswith("["):
        close = s.find("]")
        if close != -1:
            tag = s[1:close].strip()
            remainder = s[close + 1 :].strip()
            if tag and remainder:
                return remainder, tag
    if s.endswith("]"):
        open_bracket = s.rfind("[")
        if open_bracket != -1:
            tag = s[open_bracket + 1 : -1].strip()
            remainder = s[:open_bracket].strip()
            if tag and remainder:
                return remainder, tag
    return s, None
 def _parse_season_episode(tok: str) -> tuple[int, int | None, int | None] | None:
    """
    Parse a single token as a season/episode marker.
    Handles:
      - SxxExx / SxxExxExx / Sxx        (canonical scene form)
      - NxNN / NxNNxNN                  (alt form: 1x05, 12x07x08)
    Returns (season, episode, episode_end) or None if not a season token.
    """
    upper = tok.upper()
    # SxxExx form
    if len(upper) >= 3 and upper[0] == "S" and upper[1:3].isdigit():
        season = int(upper[1:3])
        rest = upper[3:]
        if not rest:
            return season, None, None
        episodes: list[int] = []
        while rest.startswith("E") and len(rest) >= 3 and rest[1:3].isdigit():
            episodes.append(int(rest[1:3]))
            rest = rest[3:]
        if not episodes:
            return None  # malformed token like "S03XYZ"
        return season, episodes[0], episodes[1] if len(episodes) >= 2 else None
    # NxNN form — split on "X" (uppercased), all parts must be digits
    if "X" in upper:
        parts = upper.split("X")
        if len(parts) >= 2 and all(p.isdigit() and p for p in parts):
            season = int(parts[0])
            episode = int(parts[1])
            episode_end = int(parts[2]) if len(parts) >= 3 else None
            return season, episode, episode_end
    return None
 def _extract_season_episode(
    tokens: list[str],
 ) -> tuple[int | None, int | None, int | None]:
    for tok in tokens:
        parsed = _parse_season_episode(tok)
        if parsed is not None:
            return parsed
    return None, None, None
 def _extract_tech(
    tokens: list[str],
    kb: ReleaseKnowledge,
 ) -> tuple[str | None, str | None, str | None, str, set[str]]:
    """
    Extract quality, source, codec, group from tokens.
    Returns (quality, source, codec, group, tech_token_set).
    Group extraction strategy (in priority order):
    1. Token where prefix is a known codec: x265-GROUP
    2. Rightmost token with a dash that isn't a known source
    """
    quality: str | None = None
    source: str | None = None
    codec: str | None = None
    group = "UNKNOWN"
    tech_tokens: set[str] = set()
    for tok in tokens:
        tl = tok.lower()
        if tl in kb.resolutions:
            quality = tok
            tech_tokens.add(tok)
            continue
        if tl in kb.sources:
            source = tok
            tech_tokens.add(tok)
            continue
        if "-" in tok:
            parts = tok.rsplit("-", 1)
            # codec-GROUP (highest priority for group)
            if parts[0].lower() in kb.codecs:
                codec = parts[0]
                group = parts[1] if parts[1] else "UNKNOWN"
                tech_tokens.add(tok)
                continue
            # source with dash: Web-DL, WEB-DL, etc.
            if parts[0].lower() in kb.sources or tok.lower().replace("-", "") in kb.sources:
                source = tok
                tech_tokens.add(tok)
                continue
        if tl in kb.codecs:
            codec = tok
            tech_tokens.add(tok)
    # Fallback: rightmost token with a dash that isn't a known source
    if group == "UNKNOWN":
        for tok in reversed(tokens):
            if "-" in tok:
                parts = tok.rsplit("-", 1)
                tl = tok.lower()
                if tl in kb.sources or tok.lower().replace("-", "") in kb.sources:
                    continue
                if parts[1]:
                    group = parts[1]
                    break
    return quality, source, codec, group, tech_tokens
 def _is_year_token(tok: str) -> bool:
    """Return True if tok is a 4-digit year between 1900 and 2099."""
    return len(tok) == 4 and tok.isdigit() and 1900 <= int(tok) <= 2099
 def _extract_title(
    tokens: list[str], tech_tokens: set[str], kb: ReleaseKnowledge
 ) -> str:
    """Extract the title portion: everything before the first season/year/tech token."""
    title_parts = []
    known_tech = kb.resolutions | kb.sources | kb.codecs
    for tok in tokens:
        if _parse_season_episode(tok) is not None:
            break
        if _is_year_token(tok):
            break
        if tok in tech_tokens or tok.lower() in known_tech:
            break
        if "-" in tok and any(p.lower() in kb.codecs | kb.sources for p in tok.split("-")):
            break
        title_parts.append(tok)
    return ".".join(title_parts) if title_parts else tokens[0]
 def _extract_year(tokens: list[str], title: str) -> int | None:
    """Extract a 4-digit year from tokens (only after the title)."""
    title_len = len(title.split("."))
    for tok in tokens[title_len:]:
        if _is_year_token(tok):
            return int(tok)
    return None
 # ---------------------------------------------------------------------------
 # Sequence matcher
 # ---------------------------------------------------------------------------
 def _match_sequences(
    tokens: list[str],
    sequences: list[dict],
    key: str,
 ) -> tuple[str | None, set[str]]:
    """
    Try to match multi-token sequences against consecutive tokens.
    Returns (matched_value, set_of_matched_tokens) or (None, empty_set).
    Sequences must be ordered most-specific first in the YAML.
    """
    upper_tokens = [t.upper() for t in tokens]
    for seq in sequences:
        seq_upper = [s.upper() for s in seq["tokens"]]
        n = len(seq_upper)
        for i in range(len(upper_tokens) - n + 1):
            if upper_tokens[i : i + n] == seq_upper:
                matched = set(tokens[i : i + n])
                return seq[key], matched
    return None, set()
 # ---------------------------------------------------------------------------
 # Language extraction
 # ---------------------------------------------------------------------------
 def _extract_languages(
    tokens: list[str], kb: ReleaseKnowledge
 ) -> tuple[list[str], set[str]]:
    """Extract language tokens. Returns (languages, matched_token_set)."""
    languages = []
    lang_tokens: set[str] = set()
    for tok in tokens:
        if tok.upper() in kb.language_tokens:
            languages.append(tok.upper())
            lang_tokens.add(tok)
    return languages, lang_tokens
 # ---------------------------------------------------------------------------
 # Audio extraction
 # ---------------------------------------------------------------------------
 def _extract_audio(
    tokens: list[str], kb: ReleaseKnowledge,
 ) -> tuple[str | None, str | None, set[str]]:
    """Extract the audio codec and the channel layout from ``tokens``.

    Returns ``(audio_codec, audio_channels, matched_token_set)``.

    Resolution order:

    1. Multi-token codec sequences from ``kb.audio["sequences"]``
       (DTS.HD.MA, TrueHD.Atmos, …) via ``_match_sequences``.
    2. A channel layout spread over two consecutive tokens (``"5"`` then
       ``"1"``), where the second token may carry a ``-GROUP`` suffix.
    3. Single tokens: a codec listed in ``kb.audio["codecs"]``
       (case-insensitive) or a channel layout listed in
       ``kb.audio["channels"]`` (exact match). Only fields still unset
       are filled, and tokens already matched are not reconsidered.
    """
    codec_names = {name.upper() for name in kb.audio.get("codecs", [])}
    channel_layouts = set(kb.audio.get("channels", []))

    audio_codec: str | None = None
    audio_channels: str | None = None
    claimed: set[str] = set()

    sequence_codec, sequence_tokens = _match_sequences(
        tokens, kb.audio.get("sequences", []), "codec"
    )
    if sequence_codec:
        audio_codec = sequence_codec
        claimed |= sequence_tokens

    # A layout such as "5.1" arrives as two consecutive tokens; rebuild
    # "X.Y" from each adjacent pair. Anything after a dash in the second
    # token (e.g. "1-KTH") is ignored. Only the first hit is kept.
    for first, second in zip(tokens, tokens[1:]):
        layout = f"{first}.{second.partition('-')[0]}"
        if layout in channel_layouts:
            audio_channels = layout
            claimed.update((first, second))
            break

    for token in tokens:
        if token in claimed:
            continue
        if audio_codec is None and token.upper() in codec_names:
            audio_codec = token
        elif audio_channels is None and token in channel_layouts:
            audio_channels = token
        else:
            continue
        claimed.add(token)

    return audio_codec, audio_channels, claimed
 # ---------------------------------------------------------------------------
 # Video metadata extraction (bit depth, HDR)
 # ---------------------------------------------------------------------------
 def _extract_video_meta(
    tokens: list[str], kb: ReleaseKnowledge,
 ) -> tuple[str | None, str | None, set[str]]:
    """Extract the bit depth and the HDR format from ``tokens``.

    Returns ``(bit_depth, hdr_format, matched_token_set)``.

    HDR sequences from ``kb.video_meta["sequences"]`` are tried first via
    ``_match_sequences``. Remaining tokens are then checked one by one:
    an HDR name (``kb.video_meta["hdr"]`` plus ``kb.hdr_extra``, compared
    upper-cased) fills ``hdr_format`` with the upper-cased token, and a
    bit depth (``kb.video_meta["bit_depth"]``, compared lower-cased)
    fills ``bit_depth`` with the lower-cased token. Only the first hit
    for each field is kept.
    """
    hdr_names = {name.upper() for name in kb.video_meta.get("hdr", [])} | kb.hdr_extra
    depth_names = {name.lower() for name in kb.video_meta.get("bit_depth", [])}

    bit_depth: str | None = None
    hdr_format: str | None = None
    claimed: set[str] = set()

    sequence_hdr, sequence_tokens = _match_sequences(
        tokens, kb.video_meta.get("sequences", []), "hdr"
    )
    if sequence_hdr:
        hdr_format = sequence_hdr
        claimed |= sequence_tokens

    for token in tokens:
        if token in claimed:
            continue
        if hdr_format is None and token.upper() in hdr_names:
            hdr_format = token.upper()
        elif bit_depth is None and token.lower() in depth_names:
            bit_depth = token.lower()
        else:
            continue
        claimed.add(token)

    return bit_depth, hdr_format, claimed
 # ---------------------------------------------------------------------------
 # Edition extraction
 # ---------------------------------------------------------------------------
 def _extract_edition(
    tokens: list[str], kb: ReleaseKnowledge
 ) -> tuple[str | None, set[str]]:
    """Extract the release edition (UNRATED, EXTENDED, DIRECTORS.CUT, …).

    Returns ``(edition, matched_token_set)``. Multi-token sequences from
    ``kb.editions["sequences"]`` take precedence; otherwise the first
    token whose upper-cased form is listed in ``kb.editions["tokens"]``
    is returned upper-cased. ``(None, set())`` means no edition found.
    """
    single_tokens = {name.upper() for name in kb.editions.get("tokens", [])}

    sequence_edition, sequence_tokens = _match_sequences(
        tokens, kb.editions.get("sequences", []), "edition"
    )
    if sequence_edition:
        return sequence_edition, sequence_tokens

    hit = next((tok for tok in tokens if tok.upper() in single_tokens), None)
    if hit is None:
        return None, set()
    return hit.upper(), {hit}
@@ -90,11 +90,23 @@ class TestAnnotateEasy:
        assert TokenRole.RESOLUTION in roles
        assert TokenRole.CODEC in roles
-    def test_unknown_group_returns_none(self) -> None:
+    def test_unknown_group_falls_to_shitty(self) -> None:
        tokens, _ = tokenize("Some.Movie.2020.1080p.WEBRip.x264-RANDOM", _KB)
-        # RANDOM is not in our release_groups/ → annotate returns None
+        # RANDOM is not in our release_groups/ — annotate() now falls
-        # and the caller falls back to SHITTY.
+        # through to the in-pipeline SHITTY pass and returns a populated
-        assert annotate(tokens, _KB) is None
+        # token list (no None sentinel anymore).
        annotated = annotate(tokens, _KB)
        assert annotated is not None
        roles = [t.role for t in annotated]
        # Title is "Some.Movie", then YEAR, RESOLUTION, SOURCE, CODEC
        # carrying the group in extra.
        assert TokenRole.TITLE in roles
        assert TokenRole.YEAR in roles
        assert TokenRole.RESOLUTION in roles
        assert TokenRole.SOURCE in roles
        assert TokenRole.CODEC in roles
        codec_tok = next(t for t in annotated if t.role is TokenRole.CODEC)
        assert codec_tok.extra.get("group") == "RANDOM"
 class TestAssemble:
@@ -26,10 +26,16 @@ _KB = YamlReleaseKnowledge()
 FIXTURES = discover_fixtures()
 def _fixture_param(f: ReleaseFixture) -> pytest.param:
    """Wrap a fixture in ``pytest.param``, named after the fixture.

    When the fixture declares an ``xfail_reason`` the parameter carries a
    non-strict ``xfail`` mark with that reason; otherwise it has no marks.
    """
    reason = f.xfail_reason
    marks = [pytest.mark.xfail(reason=reason, strict=False)] if reason else []
    return pytest.param(f, id=f.name, marks=marks)
@pytest.mark.parametrize(
    "fixture",
-    FIXTURES,
+    [_fixture_param(f) for f in FIXTURES],
    ids=[f.name for f in FIXTURES],
 )
 def test_parse_matches_fixture(fixture: ReleaseFixture, tmp_path) -> None:
    # Materialize the tree to assert it is at least well-formed YAML +
@@ -39,6 +39,14 @@ class ReleaseFixture:
    def routing(self) -> dict:
        return self.data.get("routing", {})
    @property
    def xfail_reason(self) -> str | None:
        """Reason this fixture is expected to fail, or ``None``.

        Read from the ``xfail_reason`` key of the fixture data. When set,
        the test runner wraps the fixture with ``pytest.mark.xfail``;
        meant for known unsupported pathological cases (typically the
        PATH OF PAIN bucket).
        """
        reason = self.data.get("xfail_reason")
        return reason
    def materialize(self, root: Path) -> None:
        """Create the fixture's ``tree`` as empty files/dirs under ``root``."""
        for entry in self.tree:
@@ -1,5 +1,10 @@
 release_name: "Deutschland 83-86-89 (2015) Season 1-3 S01-S03 (1080p BluRay x265 HEVC 10bit AAC 5.1 German Kappa)"
 # Out of SHITTY scope by design: parenthesized tech blocks, group name as
 # the last bare word inside parens, year-suffix range in title, dual
 # season expression. PATH OF PAIN handles this via LLM pre-analysis.
 xfail_reason: "PoP-grade pathological franchise box-set, beyond simple-dict SHITTY"
 # Pathological franchise box-set:
 # - Title contains year-suffix range "83-86-89" (3 years glued)
 # - Season range expressed twice: "Season 1-3" AND "S01-S03"
@@ -1,5 +1,10 @@
 release_name: "Predator Badlands 2025 1080p HDRip HEVC x265 BONE"
 # Space-separated release with both codec aliases present (HEVC + x265)
 # and no dash-before-group. Simple-SHITTY first-wins picks HEVC, expected
 # was x265 (legacy last-wins). Reclassified PoP.
 xfail_reason: "Space-separated, dual codec aliases, no dashed group"
 # Space-separated release: tokenizer correctly splits and identifies year +
 # tech, but the dash-before-group convention is absent so 'BONE' is not
 # recognized as the group — falls to UNKNOWN. Anti-regression baseline.
@@ -1,5 +1,9 @@
 release_name: "SLEAFORD MODS   Live Glastonbury June 27th 2015-niNjHn8abyY.mp4"
 # YouTube-style slug with year-prefixed video-id dash suffix. Not a scene
 # release shape at all — PATH OF PAIN.
 xfail_reason: "YouTube slug with year-prefixed video-id, not a scene shape"
 # yt-dlp filename: triple space between band name and event, no canonical
 # tech markers, dashed YouTube video ID glued to the year, .mp4 extension
 # preserved in the title. Parser:
@@ -1,5 +1,10 @@
 release_name: "Super Mario Bros. le film [FR-EN] (2023).mkv"
 # Bare-dashed language pair interior to the title (``[FR-EN]``) is tagged
 # as group by ``_detect_group``, leaving the title fragment behind.
 # Out of simple-SHITTY scope.
 xfail_reason: "Interior bare-dashed language pair confuses group detection"
 # Hybrid English/French marketing title with:
 # - Trailing period after 'Bros' that is part of the title abbreviation
 #   (not a separator), but tokenizer treats it as one