# alfred/alfred/infrastructure/knowledge/release_kb.py

"""YamlReleaseKnowledge — concrete adapter for the ``ReleaseKnowledge``
domain port.

Loads every release-knowledge YAML once at construction time and exposes
the parsed snapshots as plain attributes. The application layer builds a
single instance at boot and passes it down to ``parse_release`` and to
``ParsedRelease`` builder methods.

A few extras (``video_extensions``, ``non_video_extensions``,
``subtitle_extensions``, ``metadata_extensions``) are not part of the
domain port — they are consumed by application/infra modules that handle
filesystem-level concerns.
"""

from __future__ import annotations

from alfred.domain.release.parser.schema import GroupSchema, SchemaChunk
from alfred.domain.release.parser.tokens import TokenRole

from .release import (
    load_audio,
    load_codecs,
    load_distributors,
    load_editions,
    load_forbidden_chars,
    load_group_schemas,
    load_hdr_extra,
    load_language_tokens,
    load_media_type_tokens,
    load_metadata_extensions,
    load_non_video_extensions,
    load_resolutions,
    load_scoring,
    load_separators,
    load_sources,
    load_sources_extra,
    load_subtitle_extensions,
    load_video,
    load_video_extensions,
    load_win_forbidden_chars,
)


def _build_group_schema(data: dict) -> GroupSchema:
    """Build a frozen :class:`GroupSchema` from one raw YAML schema mapping.

    ``data`` must contain ``name``. ``separator`` falls back to ``"."`` and
    ``chunk_order`` to an empty sequence when missing. Every ``chunk_order``
    entry must contain ``role``; its ``optional`` flag defaults to ``False``.

    Role strings are converted to :class:`TokenRole` right here, so an
    unknown role raises ``ValueError`` while the knowledge base is being
    constructed rather than on the first parse.
    """

    def _to_chunk(entry: dict) -> SchemaChunk:
        # Role first, then the optional flag — a bad role fails before
        # anything else about the entry is looked at.
        role = TokenRole(entry["role"])
        is_optional = bool(entry.get("optional", False))
        return SchemaChunk(role=role, optional=is_optional)

    # Chunks are resolved before name/separator are read from ``data``.
    ordered_chunks = tuple(map(_to_chunk, data.get("chunk_order", [])))

    schema_name = data["name"]
    schema_separator = data.get("separator", ".")
    return GroupSchema(
        name=schema_name,
        separator=schema_separator,
        chunks=ordered_chunks,
    )


class YamlReleaseKnowledge:
    """Single object holding every parsed-release knowledge constant.

    Built once at application boot. Read-only at runtime — call sites
    treat it as a snapshot. To pick up newly learned tokens without a
    restart, build a fresh instance and swap it in at the call sites.

    Every attribute is filled in ``__init__`` from the ``load_*`` helpers
    imported from the sibling ``release`` module; nothing is reloaded
    after construction.
    """

    def __init__(self) -> None:
        """Run every loader once and keep the results on the instance.

        Raises:
            ValueError: propagated from ``_build_group_schema`` when a
                group schema declares an unknown role.
        """
        # Domain-port surface
        self.resolutions: set[str] = load_resolutions()
        # Base and extra source tokens are merged into a single set.
        self.sources: set[str] = load_sources() | load_sources_extra()
        self.codecs: set[str] = load_codecs()
        self.distributors: set[str] = load_distributors()
        self.language_tokens: set[str] = load_language_tokens()
        self.forbidden_chars: set[str] = load_forbidden_chars()
        self.hdr_extra: set[str] = load_hdr_extra()

        self.audio: dict = load_audio()
        self.video_meta: dict = load_video()
        self.editions: dict = load_editions()
        self.media_type_tokens: dict = load_media_type_tokens()

        self.separators: list[str] = load_separators()

        # Parse-scoring config (weights / penalties / thresholds).
        self.scoring: dict = load_scoring()

        # File-extension sets (used by application/infra modules, not by
        # the parser itself — kept here so there is a single ownership
        # point for release knowledge).
        self.video_extensions: set[str] = load_video_extensions()
        self.non_video_extensions: set[str] = load_non_video_extensions()
        self.subtitle_extensions: set[str] = load_subtitle_extensions()
        # Metadata + subtitle extensions are both ignored when deciding
        # the media type of a folder (neither is a conclusive signal for
        # movie/tv/other), so we expose the union under the historical
        # name.
        self.metadata_extensions: set[str] = (
            load_metadata_extensions() | self.subtitle_extensions
        )

        # Translation table for stripping Windows-forbidden chars. The
        # loaded entries are joined into one string, so every individual
        # character they contain is mapped to "delete".
        self._win_forbidden_table = str.maketrans(
            "", "", "".join(load_win_forbidden_chars())
        )

        # Group schemas, keyed by uppercase group name for fast lookup.
        # ``group_schema`` upper-cases the requested name, so the keys are
        # normalised here as well instead of trusting the loader's casing;
        # this is a no-op when the loader already returns uppercase keys.
        # If two loaded keys differ only by case, the later one wins.
        self._group_schemas: dict[str, GroupSchema] = {
            key.upper(): _build_group_schema(data)
            for key, data in load_group_schemas().items()
        }

    def sanitize_for_fs(self, text: str) -> str:
        """Strip Windows-forbidden characters from ``text``.

        Characters are removed, not replaced; everything else is returned
        unchanged.
        """
        return text.translate(self._win_forbidden_table)

    def group_schema(self, name: str) -> GroupSchema | None:
        """Return the schema registered for release group ``name``.

        The lookup is case-insensitive: ``name`` is upper-cased before it
        is matched against the stored keys. Returns ``None`` when no
        schema is registered for that group.
        """
        return self._group_schemas.get(name.upper())