# alfred/alfred/infrastructure/knowledge/release.py

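"""Release knowledge loader.

Three knowledge layers are combined:
  1. Builtin   — alfred/knowledge/release/
  2. Sites     — alfred/knowledge/release/sites/*.yaml (all trackers)
  3. Learned   — data/knowledge/release/ (user additions via the learn tool)

Builtin and learned files of the same name are merged, the learned layer
having the higher priority: lists are extended additively, scalars from
the learned layer win. Site files are merged with one another using the
same rules and are consulted only by the loaders that need site-specific
tokens (languages, editions, sources, HDR, media types).
"""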

from pathlib import Path

import yaml

import alfred as _alfred_pkg

# Builtin knowledge lives inside the ``alfred`` package directory.
_BUILTIN_ROOT = Path(_alfred_pkg.__file__).parent.joinpath("knowledge", "release")
_SITES_ROOT = _BUILTIN_ROOT.joinpath("sites")
_GROUPS_ROOT = _BUILTIN_ROOT.joinpath("release_groups")
# Learned knowledge lives in a ``data`` directory next to the package directory.
_LEARNED_ROOT = Path(_alfred_pkg.__file__).parent.parent.joinpath(
    "data", "knowledge", "release"
)
_LEARNED_GROUPS_ROOT = _LEARNED_ROOT.joinpath("release_groups")


def _merge(base: dict, overlay: dict) -> dict:
    """Merge overlay into base — lists are extended, scalars from overlay win."""
    result = dict(base)
    for key, val in overlay.items():
        if key in result and isinstance(result[key], list) and isinstance(val, list):
            result[key] = result[key] + [v for v in val if v not in result[key]]
        else:
            result[key] = val
    return result


def _read(path: Path) -> dict:
    try:
        with open(path, encoding="utf-8") as f:
            return yaml.safe_load(f) or {}
    except FileNotFoundError:
        return {}


def _load(filename: str) -> dict:
    """Return the builtin copy of ``filename`` overlaid with the learned copy.

    The builtin file is read first and used as the merge base; the
    learned file of the same name is the overlay.
    """
    return _merge(
        _read(_BUILTIN_ROOT / filename),
        _read(_LEARNED_ROOT / filename),
    )


def _load_sites() -> dict:
    """Fold every ``*.yaml`` file of the sites directory into one dict.

    Files are visited in sorted path order and each one is merged on top
    of the accumulated result.
    """
    merged: dict = {}
    site_files = sorted(_SITES_ROOT.glob("*.yaml"))
    for yaml_path in site_files:
        merged = _merge(merged, _read(yaml_path))
    return merged


def load_resolutions() -> set[str]:
    """Return the resolution tokens listed in ``resolutions.yaml``.

    Reads the ``resolutions`` list. A missing key, or a key present with
    no value (which YAML loads as ``None``), yields an empty set rather
    than raising ``TypeError``.
    """
    return set(_load("resolutions.yaml").get("resolutions") or [])


def load_sources() -> set[str]:
    """Return the source tokens listed in ``sources.yaml``.

    Reads the ``sources`` list. A missing key, or a key present with no
    value (which YAML loads as ``None``), yields an empty set rather than
    raising ``TypeError``.
    """
    return set(_load("sources.yaml").get("sources") or [])


def load_distributors() -> set[str]:
    """Streaming distributor tokens (NF, AMZN, DSNP, …).

    Distinct from ``load_sources()`` — distributors are uppercase scene
    tags identifying the platform, not the capture origin.

    Tokens are read from the ``distributors`` list of
    ``distributors.yaml`` and uppercased. A missing key, or a key present
    with no value (which YAML loads as ``None``), yields an empty set
    rather than raising ``TypeError``.
    """
    tokens = _load("distributors.yaml").get("distributors") or []
    return {t.upper() for t in tokens}


def load_codecs() -> set[str]:
    """Return the codec tokens listed in ``codecs.yaml``.

    Reads the ``codecs`` list. A missing key, or a key present with no
    value (which YAML loads as ``None``), yields an empty set rather than
    raising ``TypeError``.
    """
    return set(_load("codecs.yaml").get("codecs") or [])


def load_win_forbidden_chars() -> list[str]:
    """Return the ``win_forbidden_chars`` list from ``filesystem.yaml``.

    A missing key, or a key present with no value (which YAML loads as
    ``None``), yields an empty list so the declared return type always
    holds.
    """
    return _load("filesystem.yaml").get("win_forbidden_chars") or []


def load_video_extensions() -> set[str]:
    """Return the ``video`` entries of ``file_extensions.yaml``.

    A missing key, or a key present with no value (which YAML loads as
    ``None``), yields an empty set rather than raising ``TypeError``.
    """
    return set(_load("file_extensions.yaml").get("video") or [])


def load_non_video_extensions() -> set[str]:
    """Return the ``non_video`` entries of ``file_extensions.yaml``.

    A missing key, or a key present with no value (which YAML loads as
    ``None``), yields an empty set rather than raising ``TypeError``.
    """
    return set(_load("file_extensions.yaml").get("non_video") or [])


def load_metadata_extensions() -> set[str]:
    """Return the ``metadata`` entries of ``file_extensions.yaml``.

    A missing key, or a key present with no value (which YAML loads as
    ``None``), yields an empty set rather than raising ``TypeError``.
    """
    return set(_load("file_extensions.yaml").get("metadata") or [])


def load_subtitle_extensions() -> set[str]:
    """Return the ``subtitle`` entries of ``file_extensions.yaml``.

    A missing key, or a key present with no value (which YAML loads as
    ``None``), yields an empty set rather than raising ``TypeError``.
    """
    return set(_load("file_extensions.yaml").get("subtitle") or [])


def load_forbidden_chars() -> set[str]:
    """Return the ``forbidden_chars`` entries of ``release_format.yaml``.

    A missing key, or a key present with no value (which YAML loads as
    ``None``), yields an empty set rather than raising ``TypeError``.
    """
    return set(_load("release_format.yaml").get("forbidden_chars") or [])


def load_language_tokens() -> set[str]:
    """Return every known language token, uppercased.

    Combines the ``tokens`` list of ``languages.yaml`` with the
    ``languages`` list of the merged site files. Either list may be
    missing or explicitly empty (YAML ``None``); both cases contribute
    nothing instead of raising ``TypeError``.
    """
    base = {t.upper() for t in (_load("languages.yaml").get("tokens") or [])}
    sites = {t.upper() for t in (_load_sites().get("languages") or [])}
    return base | sites


def load_audio() -> dict:
    """Return the merged content of ``audio.yaml`` as a dict."""
    audio_knowledge = _load("audio.yaml")
    return audio_knowledge


def load_video() -> dict:
    """Return the merged content of ``video.yaml`` as a dict."""
    video_knowledge = _load("video.yaml")
    return video_knowledge


def load_editions() -> dict:
    """Return the editions config with site-specific tokens appended.

    Starts from the merged ``editions.yaml`` and appends to its
    ``tokens`` list every token found under ``editions.tokens`` in the
    merged site files that is not already present.

    An ``editions`` section or a ``tokens`` list that is present but
    empty (YAML ``None``) is treated as absent, so a half-filled file
    cannot raise ``AttributeError``/``TypeError`` here.
    """
    base = _load("editions.yaml")
    site_editions = _load_sites().get("editions") or {}
    site_tokens = site_editions.get("tokens") or []
    if site_tokens:
        existing = base.get("tokens") or []
        base["tokens"] = existing + [t for t in site_tokens if t not in existing]
    return base


def load_sources_extra() -> set[str]:
    """Additional source tokens from site files.

    Reads the ``sources`` list of the merged site files. A missing key,
    or a key present with no value (which YAML loads as ``None``), yields
    an empty set rather than raising ``TypeError``.
    """
    return set(_load_sites().get("sources") or [])


def load_hdr_extra() -> set[str]:
    """Additional HDR tokens from site files.

    Reads the ``hdr`` list of the merged site files and uppercases each
    token. A missing key, or a key present with no value (which YAML
    loads as ``None``), yields an empty set rather than raising
    ``TypeError``.
    """
    return {t.upper() for t in (_load_sites().get("hdr") or [])}


def load_media_type_tokens() -> dict:
    """Site-specific media type tokens (doc, concert, collection, integrale).

    Reads the ``media_type_tokens`` section of the merged site files. A
    missing section, or one present with no value (which YAML loads as
    ``None``), yields an empty dict so the declared return type always
    holds.
    """
    return _load_sites().get("media_type_tokens") or {}


def load_group_schemas() -> dict:
    """Collect release-group schemas, keyed by the uppercased group name.

    The builtin directory (``alfred/knowledge/release/release_groups/``)
    is scanned first, then the learned one
    (``data/knowledge/release/release_groups/``); a learned schema
    therefore replaces a builtin schema of the same name. Directories
    that do not exist and files without a ``name`` value are skipped.
    """
    schemas: dict = {}
    for groups_dir in (_GROUPS_ROOT, _LEARNED_GROUPS_ROOT):
        if groups_dir.is_dir():
            for schema_file in sorted(groups_dir.glob("*.yaml")):
                schema = _read(schema_file)
                group_name = schema.get("name")
                if group_name:
                    # Later entries overwrite earlier ones on collision.
                    schemas[group_name.upper()] = schema
    return schemas


def load_scoring() -> dict:
    """Build the parse-scoring configuration.

    The result always has the three sections ``weights``, ``penalties``
    and ``thresholds``. Each section starts from hard-coded defaults and
    is then updated with whatever ``scoring.yaml`` provides for it, so an
    absent or incomplete YAML leaves the defaults in place.
    """
    raw = _load("scoring.yaml")
    defaults = {
        "weights": {
            "title": 30,
            "media_type": 20,
            "year": 15,
            "season": 10,
            "episode": 5,
            "resolution": 5,
            "source": 5,
            "codec": 5,
            "group": 5,
        },
        "penalties": {"unknown_token": 5, "max_unknown_penalty": 30},
        "thresholds": {"shitty_min": 60},
    }
    config = {}
    for section, values in defaults.items():
        # A section that is missing or empty in the YAML changes nothing.
        values.update(raw.get(section, {}) or {})
        config[section] = values
    return config


def load_probe_mappings() -> dict:
    """Load the ffprobe→scene-token translation tables.

    The returned dict has exactly three entries:

      - ``video_codec``: ``{ffprobe_codec_lower: scene_token}``
      - ``audio_codec``: ``{ffprobe_codec_lower: scene_token}``
      - ``audio_channels``: ``{channel_count_int: layout_str}``

    Codec keys are lowercased. Channel-count keys are converted to
    ``int``; entries whose key cannot be converted are dropped. A section
    that is missing or empty in ``probe_mappings.yaml`` becomes an empty
    dict.
    """
    raw = _load("probe_mappings.yaml")

    video_section = raw.get("video_codec") or {}
    video_codec = {codec.lower(): token for codec, token in video_section.items()}

    audio_section = raw.get("audio_codec") or {}
    audio_codec = {codec.lower(): token for codec, token in audio_section.items()}

    audio_channels: dict[int, str] = {}
    channel_section = raw.get("audio_channels") or {}
    for raw_count, layout in channel_section.items():
        try:
            count = int(raw_count)
        except (TypeError, ValueError):
            # Not an integer-like key: skip this entry.
            continue
        audio_channels[count] = layout

    return {
        "video_codec": video_codec,
        "audio_codec": audio_codec,
        "audio_channels": audio_channels,
    }


def load_separators() -> list[str]:
    """Return the ``separators`` entries of ``separators.yaml``.

    The canonical "." is prepended whenever the configured value does not
    contain it, so a misconfigured or missing file still yields a usable
    separator.
    """
    configured = _load("separators.yaml").get("separators", []) or []
    return configured if "." in configured else [".", *configured]