# alfred/alfred/domain/release/parser/tokens.py

"""Token value objects for the annotate-based parser.

A :class:`Token` carries both the original substring and its position in
the original release name's token stream. A :class:`TokenRole` is the
semantic tag assigned by the annotator.

Why value objects instead of bare ``str``: the annotate step needs to flag tokens
without consuming them (a token may carry residual info — e.g. a
``codec-GROUP`` token contributes both a CODEC and a GROUP role). Tracking
the index also lets later stages reason about *order* (year must come
after title, group must be rightmost, etc.) without re-scanning the list.
"""

from __future__ import annotations

from dataclasses import dataclass, field
from enum import Enum


class TokenRole(str, Enum):
    """Semantic role a token can take after annotation.

    A token starts as ``UNKNOWN`` and may be promoted by the annotator.
    ``str``-backed, so every member is also a plain string equal to its
    lowercase value (cheap comparisons and YAML/JSON interop).

    Roles split into three families:

    - **structural**: TITLE / YEAR / SEASON_EPISODE / GROUP — drive folder
      and filename naming.
    - **technical**: RESOLUTION / SOURCE / CODEC / AUDIO_CODEC /
      AUDIO_CHANNELS / BIT_DEPTH / HDR / EDITION / LANGUAGE — feed
      ``tech_string`` and metadata fields.
    - **meta**: SITE_TAG (stripped pre-tokenize) and UNKNOWN (residual,
      contributes to the SHITTY score penalty).

    NOTE(review): this docstring previously also listed a ``SEPARATOR`` meta
    role (kept for the assemble step if a release uses spaces that need
    preservation in the title), but no ``SEPARATOR`` member is defined in
    this enum — confirm whether it is still planned or handled elsewhere.
    """

    # Default role of a token that has not been annotated.
    UNKNOWN = "unknown"

    # Structural
    TITLE = "title"
    YEAR = "year"
    SEASON_EPISODE = "season_episode"
    GROUP = "group"

    # Technical
    RESOLUTION = "resolution"
    SOURCE = "source"
    CODEC = "codec"
    AUDIO_CODEC = "audio_codec"
    AUDIO_CHANNELS = "audio_channels"
    BIT_DEPTH = "bit_depth"
    HDR = "hdr"
    EDITION = "edition"
    LANGUAGE = "language"

    # Meta
    SITE_TAG = "site_tag"


@dataclass(frozen=True)
class Token:
    """An atomic token from a release name.

    ``text`` is the substring exactly as it appeared after tokenization
    (case preserved — uppercase comparisons happen at match time).
    ``index`` is the 0-based position in the tokenized stream, used by
    downstream stages to enforce ordering invariants.

    ``role`` defaults to :attr:`TokenRole.UNKNOWN`. The annotator returns
    new :class:`Token` instances with the role set rather than mutating
    (the dataclass is frozen). ``extra`` carries role-specific payload
    when the token text alone isn't enough (e.g. a ``codec-GROUP`` token
    annotated as CODEC may record the group name in ``extra["group"]``).

    Equality compares all four fields; the hash is derived from ``text``,
    ``index`` and ``role`` only, so tokens can be used in sets and as
    dictionary keys even though ``extra`` is a ``dict``.
    """

    text: str
    index: int
    role: TokenRole = TokenRole.UNKNOWN
    # Still part of ``__eq__`` but excluded from the generated ``__hash__``:
    # a ``dict`` is unhashable, so hashing it would make ``hash(token)``
    # raise ``TypeError`` for every token.
    extra: dict[str, str] = field(default_factory=dict, hash=False)

    def with_role(self, role: TokenRole, **extra: str) -> Token:
        """Return a copy of this token with ``role`` (and optional ``extra``).

        Args:
            role: The role the returned token carries.
            **extra: Payload entries merged over the current ``extra``;
                on a key collision the new value wins. Because the payload
                is passed as keyword arguments, a key named ``role`` cannot
                be supplied this way.

        Returns:
            A new :class:`Token` with the same ``text`` and ``index``. Its
            ``extra`` is always a fresh dict, never the one held by ``self``.
        """
        # Always copy, even when no new payload is given: handing the same
        # dict to the new token would let a mutation through one instance
        # leak into the other despite both being frozen.
        merged = {**self.extra, **extra}
        return Token(text=self.text, index=self.index, role=role, extra=merged)

    @property
    def is_annotated(self) -> bool:
        """Return ``True`` when the role is anything but ``TokenRole.UNKNOWN``."""
        return self.role is not TokenRole.UNKNOWN