98c688f29b
Add the building blocks for Phase A scoring without yet wiring them into parse_release. Nothing changes at runtime — parse_release still returns a single ParsedRelease — but the pieces needed to upgrade it in a follow-up commit are now in place. - alfred/knowledge/release/scoring.yaml: weights / penalties / thresholds. Title and media_type are heavy (30 / 20), structural fields medium (year 15, season 10), tech fields light (5 each). Unknown-token penalty 5 capped at -30. SHITTY/PoP cutoff at 60. - load_scoring() loader with safe defaults baked in: a missing or partial YAML only de-tunes, never breaks. - ReleaseKnowledge port grows a 'scoring: dict' field. YamlReleaseKnowledge populates it from load_scoring(). - New parser/scoring.py module with Road enum (EASY / SHITTY / PATH_OF_PAIN, distinct from ParsePath which records the tokenization route), and pure functions: compute_score, decide_road, collect_unknown_tokens, collect_missing_critical. - ParseReport frozen VO in value_objects.py — exported alongside ParsedRelease.
123 lines
4.4 KiB
Python
123 lines
4.4 KiB
Python
"""YamlReleaseKnowledge — concrete adapter for the ``ReleaseKnowledge``
|
|
domain port.
|
|
|
|
Loads every release-knowledge YAML once at construction time and exposes
|
|
the parsed snapshots as plain attributes. The application layer builds a
|
|
single instance at boot and passes it down to ``parse_release`` and to
|
|
``ParsedRelease`` builder methods.
|
|
|
|
A few extras (``video_extensions``, ``non_video_extensions``,
|
|
``subtitle_extensions``, ``metadata_extensions``) are not part of the
|
|
domain port — they are consumed by application/infra modules that handle
|
|
filesystem-level concerns.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
from alfred.domain.release.parser.schema import GroupSchema, SchemaChunk
|
|
from alfred.domain.release.parser.tokens import TokenRole
|
|
|
|
from .release import (
|
|
load_audio,
|
|
load_codecs,
|
|
load_distributors,
|
|
load_editions,
|
|
load_forbidden_chars,
|
|
load_group_schemas,
|
|
load_hdr_extra,
|
|
load_language_tokens,
|
|
load_media_type_tokens,
|
|
load_metadata_extensions,
|
|
load_non_video_extensions,
|
|
load_resolutions,
|
|
load_scoring,
|
|
load_separators,
|
|
load_sources,
|
|
load_sources_extra,
|
|
load_subtitle_extensions,
|
|
load_video,
|
|
load_video_extensions,
|
|
load_win_forbidden_chars,
|
|
)
|
|
|
|
|
|
def _build_group_schema(data: dict) -> GroupSchema:
    """Build an immutable :class:`GroupSchema` from one raw YAML mapping.

    ``data`` is read as follows:

    * ``"name"`` — required; a missing key propagates as ``KeyError``.
    * ``"separator"`` — optional, falls back to ``"."``.
    * ``"chunk_order"`` — optional list of mappings, each carrying a
      required ``"role"`` and an optional ``"optional"`` flag (coerced
      with ``bool`` and defaulting to ``False``).

    Every role string goes through the ``TokenRole`` constructor, so an
    unknown role (typically a typo in the YAML) raises ``ValueError``
    while the knowledge object is being built rather than on first parse.
    """
    ordered_chunks: list[SchemaChunk] = []
    for spec in data.get("chunk_order", []):
        # Validate the role first: this is the early-failure point for
        # misspelled roles in the YAML file.
        role = TokenRole(spec["role"])
        is_optional = bool(spec.get("optional", False))
        ordered_chunks.append(SchemaChunk(role=role, optional=is_optional))

    # Chunks are resolved before the name/separator so role errors
    # surface before a missing-name error.
    schema_name = data["name"]
    field_separator = data.get("separator", ".")
    return GroupSchema(
        name=schema_name,
        separator=field_separator,
        chunks=tuple(ordered_chunks),
    )
|
|
|
|
|
|
class YamlReleaseKnowledge:
    """Snapshot of all release-parsing knowledge loaded from YAML.

    Every ``load_*`` helper is invoked exactly once, inside ``__init__``,
    and the results are stored as plain attributes. The instance is meant
    to be created once at application boot and then treated as read-only
    by its consumers. Picking up newly learned tokens without a restart
    means constructing a new instance and swapping it in at the call
    sites — nothing here reloads in place.
    """

    def __init__(self) -> None:
        """Load every knowledge file and populate the attributes."""
        # --- Domain-port surface: flat token sets -----------------------
        self.resolutions: set[str] = load_resolutions()
        # Core sources and the "extra" list are merged into a single set.
        core_sources = load_sources()
        self.sources: set[str] = core_sources | load_sources_extra()
        self.codecs: set[str] = load_codecs()
        self.distributors: set[str] = load_distributors()
        self.language_tokens: set[str] = load_language_tokens()
        self.forbidden_chars: set[str] = load_forbidden_chars()
        self.hdr_extra: set[str] = load_hdr_extra()

        # --- Domain-port surface: structured mappings -------------------
        self.audio: dict = load_audio()
        self.video_meta: dict = load_video()
        self.editions: dict = load_editions()
        self.media_type_tokens: dict = load_media_type_tokens()

        self.separators: list[str] = load_separators()

        # Weights / penalties / thresholds used when scoring a parse.
        self.scoring: dict = load_scoring()

        # --- File-extension sets ----------------------------------------
        # Not consumed by the parser itself; application/infra modules
        # read them. They live here so release knowledge has one owner.
        self.video_extensions: set[str] = load_video_extensions()
        self.non_video_extensions: set[str] = load_non_video_extensions()
        self.subtitle_extensions: set[str] = load_subtitle_extensions()
        # Neither metadata nor subtitle files are a conclusive signal for
        # a folder's media type (movie/tv/other), so both are ignored
        # together: the union is published under the historical
        # ``metadata_extensions`` name.
        pure_metadata = load_metadata_extensions()
        self.metadata_extensions: set[str] = (
            pure_metadata | self.subtitle_extensions
        )

        # Deletion-only translation table: every Windows-forbidden
        # character maps to "removed".
        win_forbidden = "".join(load_win_forbidden_chars())
        self._win_forbidden_table = str.maketrans("", "", win_forbidden)

        # Group schemas, stored under the keys the loader hands back.
        # NOTE(review): ``group_schema`` uppercases the requested name, so
        # this presumes ``load_group_schemas()`` already returns uppercase
        # keys — confirm against the loader.
        schemas_by_group: dict[str, GroupSchema] = {}
        for group_key, raw_schema in load_group_schemas().items():
            schemas_by_group[group_key] = _build_group_schema(raw_schema)
        self._group_schemas: dict[str, GroupSchema] = schemas_by_group

    def sanitize_for_fs(self, text: str) -> str:
        """Return ``text`` with all Windows-forbidden characters removed."""
        deletion_table = self._win_forbidden_table
        return text.translate(deletion_table)

    def group_schema(self, name: str) -> GroupSchema | None:
        """Return the schema stored for ``name``, or ``None`` if absent.

        ``name`` is uppercased before the lookup, so callers may pass the
        group name in any casing.
        """
        lookup_key = name.upper()
        return self._group_schemas.get(lookup_key)
|