Files
alfred/alfred/infrastructure/knowledge/release_kb.py
T
francwa 98c688f29b feat(release): foundations for parse-confidence scoring
Add the building blocks for Phase A scoring without yet wiring them
into parse_release. Nothing changes at runtime — parse_release still
returns a single ParsedRelease — but the pieces needed to upgrade it
in a follow-up commit are now in place.

- alfred/knowledge/release/scoring.yaml: weights / penalties /
  thresholds. Title and media_type are heavy (30 / 20), structural
  fields medium (year 15, season 10), tech fields light (5 each).
  Unknown-token penalty 5 capped at -30. SHITTY/PoP cutoff at 60.
- load_scoring() loader with safe defaults baked in: a missing or
  partial YAML only de-tunes, never breaks.
- ReleaseKnowledge port grows a 'scoring: dict' field. YamlReleaseKnowledge
  populates it from load_scoring().
- New parser/scoring.py module with Road enum (EASY / SHITTY /
  PATH_OF_PAIN, distinct from ParsePath which records the tokenization
  route), and pure functions: compute_score, decide_road,
  collect_unknown_tokens, collect_missing_critical.
- ParseReport frozen VO in value_objects.py — exported alongside
  ParsedRelease.
2026-05-20 01:21:17 +02:00

123 lines
4.4 KiB
Python

"""YamlReleaseKnowledge — concrete adapter for the ``ReleaseKnowledge``
domain port.

Loads every release-knowledge YAML once at construction time and exposes
the parsed snapshots as plain attributes. The application layer builds a
single instance at boot and passes it down to ``parse_release`` and to
``ParsedRelease`` builder methods.

A few extras (``video_extensions``, ``non_video_extensions``,
``subtitle_extensions``, ``metadata_extensions``) are not part of the
domain port — they are consumed by application/infra modules that handle
filesystem-level concerns.
"""
from __future__ import annotations
from alfred.domain.release.parser.schema import GroupSchema, SchemaChunk
from alfred.domain.release.parser.tokens import TokenRole
from .release import (
load_audio,
load_codecs,
load_distributors,
load_editions,
load_forbidden_chars,
load_group_schemas,
load_hdr_extra,
load_language_tokens,
load_media_type_tokens,
load_metadata_extensions,
load_non_video_extensions,
load_resolutions,
load_scoring,
load_separators,
load_sources,
load_sources_extra,
load_subtitle_extensions,
load_video,
load_video_extensions,
load_win_forbidden_chars,
)
def _build_group_schema(data: dict) -> GroupSchema:
    """Convert one raw YAML schema mapping into a :class:`GroupSchema`.

    Args:
        data: Raw schema mapping. ``"name"`` is required; ``"separator"``
            falls back to ``"."`` and ``"chunk_order"`` to an empty list.
            Each ``chunk_order`` entry must carry a ``"role"`` key and may
            carry an ``"optional"`` flag (coerced with ``bool``, default
            ``False``).

    Returns:
        A ``GroupSchema`` whose ``chunks`` is a tuple of ``SchemaChunk``
        objects in the same order as ``chunk_order``.

    Raises:
        KeyError: If ``"name"`` or an entry's ``"role"`` is missing.
        ValueError: If a role string is not a known ``TokenRole``, so a
            typo in a YAML file surfaces at construction time rather
            than on first parse.
    """
    ordered: list[SchemaChunk] = []
    for spec in data.get("chunk_order", []):
        # Resolve the role first: an unknown role must fail before any
        # chunk object is built for this entry.
        role = TokenRole(spec["role"])
        is_optional = bool(spec.get("optional", False))
        ordered.append(SchemaChunk(role=role, optional=is_optional))

    return GroupSchema(
        name=data["name"],
        separator=data.get("separator", "."),
        chunks=tuple(ordered),
    )
class YamlReleaseKnowledge:
    """Container for every release-knowledge constant loaded from YAML.

    An instance is meant to be created once at application boot and then
    treated as a read-only snapshot by its call sites. Picking up newly
    learned tokens without a restart means building a fresh instance and
    swapping it in where the old one was used.
    """

    def __init__(self) -> None:
        """Run every loader once and store the results as attributes."""
        # ---- Domain-port surface -------------------------------------
        self.resolutions: set[str] = load_resolutions()

        # Base sources and the "extra" sources are exposed as one set.
        base_sources = load_sources()
        extra_sources = load_sources_extra()
        self.sources: set[str] = base_sources | extra_sources

        self.codecs: set[str] = load_codecs()
        self.distributors: set[str] = load_distributors()
        self.language_tokens: set[str] = load_language_tokens()
        self.forbidden_chars: set[str] = load_forbidden_chars()
        self.hdr_extra: set[str] = load_hdr_extra()
        self.audio: dict = load_audio()
        self.video_meta: dict = load_video()
        self.editions: dict = load_editions()
        self.media_type_tokens: dict = load_media_type_tokens()
        self.separators: list[str] = load_separators()

        # ---- Parse-scoring config (weights / penalties / thresholds) --
        self.scoring: dict = load_scoring()

        # ---- File-extension sets --------------------------------------
        # Consumed by application/infra modules rather than by the parser;
        # they live here so release knowledge has a single owner.
        self.video_extensions: set[str] = load_video_extensions()
        self.non_video_extensions: set[str] = load_non_video_extensions()
        self.subtitle_extensions: set[str] = load_subtitle_extensions()

        # Neither metadata nor subtitle files are a conclusive signal for
        # movie/tv/other when deciding a folder's media type, so both are
        # ignored together: the union keeps the historical attribute name.
        metadata_only = load_metadata_extensions()
        self.metadata_extensions: set[str] = (
            metadata_only | self.subtitle_extensions
        )

        # ---- Windows-forbidden character stripping --------------------
        # A deletion-only translation table: every listed character maps
        # to "removed" when passed to ``str.translate``.
        forbidden = "".join(load_win_forbidden_chars())
        self._win_forbidden_table = str.maketrans("", "", forbidden)

        # ---- Group schemas --------------------------------------------
        # NOTE(review): keys are stored exactly as the loader returns
        # them, while ``group_schema`` upper-cases its argument — the
        # loader is presumably expected to yield upper-case keys; confirm.
        schemas: dict[str, GroupSchema] = {}
        for group_key, raw_schema in load_group_schemas().items():
            schemas[group_key] = _build_group_schema(raw_schema)
        self._group_schemas: dict[str, GroupSchema] = schemas

    def sanitize_for_fs(self, text: str) -> str:
        """Return ``text`` with all Windows-forbidden characters removed."""
        table = self._win_forbidden_table
        return text.translate(table)

    def group_schema(self, name: str) -> GroupSchema | None:
        """Look up the schema registered for a release group.

        The lookup upper-cases ``name`` before querying the stored
        mapping. Returns ``None`` when no schema is stored under that
        upper-cased key.
        """
        lookup_key = name.upper()
        return self._group_schemas.get(lookup_key)