98c688f29b
Add the building blocks for Phase A scoring without yet wiring them into parse_release. Nothing changes at runtime — parse_release still returns a single ParsedRelease — but the pieces needed to upgrade it in a follow-up commit are now in place. - alfred/knowledge/release/scoring.yaml: weights / penalties / thresholds. Title and media_type are heavy (30 / 20), structural fields medium (year 15, season 10), tech fields light (5 each). Unknown-token penalty 5 capped at -30. SHITTY/PoP cutoff at 60. - load_scoring() loader with safe defaults baked in: a missing or partial YAML only de-tunes, never breaks. - ReleaseKnowledge port grows a 'scoring: dict' field. YamlReleaseKnowledge populates it from load_scoring(). - New parser/scoring.py module with Road enum (EASY / SHITTY / PATH_OF_PAIN, distinct from ParsePath which records the tokenization route), and pure functions: compute_score, decide_road, collect_unknown_tokens, collect_missing_critical. - ParseReport frozen VO in value_objects.py — exported alongside ParsedRelease.
204 lines
5.9 KiB
Python
204 lines
5.9 KiB
Python
"""Release knowledge loader.
|
|
|
|
Three-layer merge (lowest → highest priority):

  1. Builtin — alfred/knowledge/release/
  2. Sites — alfred/knowledge/release/sites/*.yaml (all trackers)
  3. Learned — data/knowledge/release/ (user additions via the learn tool)

Lists are extended additively, scalars from higher layers win.
|
|
"""
|
|
|
|
from pathlib import Path
|
|
|
|
import yaml
|
|
|
|
import alfred as _alfred_pkg
|
|
|
|
# Knowledge files that ship inside the ``alfred`` package itself.
_BUILTIN_ROOT = Path(_alfred_pkg.__file__).parent.joinpath("knowledge", "release")
_SITES_ROOT = _BUILTIN_ROOT.joinpath("sites")
_GROUPS_ROOT = _BUILTIN_ROOT.joinpath("release_groups")

# Learned files live under ``data/`` one level above the package directory.
_LEARNED_ROOT = Path(_alfred_pkg.__file__).parent.parent.joinpath(
    "data", "knowledge", "release"
)
_LEARNED_GROUPS_ROOT = _LEARNED_ROOT.joinpath("release_groups")
|
|
|
|
|
|
def _merge(base: dict, overlay: dict) -> dict:
|
|
"""Merge overlay into base — lists are extended, scalars from overlay win."""
|
|
result = dict(base)
|
|
for key, val in overlay.items():
|
|
if key in result and isinstance(result[key], list) and isinstance(val, list):
|
|
result[key] = result[key] + [v for v in val if v not in result[key]]
|
|
else:
|
|
result[key] = val
|
|
return result
|
|
|
|
|
|
def _read(path: Path) -> dict:
|
|
try:
|
|
with open(path, encoding="utf-8") as f:
|
|
return yaml.safe_load(f) or {}
|
|
except FileNotFoundError:
|
|
return {}
|
|
|
|
|
|
def _load(filename: str) -> dict:
    """Read ``filename`` from the builtin and learned roots and merge them.

    The learned copy is passed to ``_merge`` as the overlay, so it takes
    priority over the builtin copy.
    """
    builtin_layer = _read(_BUILTIN_ROOT / filename)
    learned_layer = _read(_LEARNED_ROOT / filename)
    return _merge(builtin_layer, learned_layer)
|
|
|
|
|
|
def _load_sites() -> dict:
    """Fold every ``*.yaml`` file under the sites root into one dict.

    Files are merged in sorted path order, each one layered over the
    accumulated result of the previous ones.
    """
    combined: dict = {}
    site_files = sorted(_SITES_ROOT.glob("*.yaml"))
    for yaml_path in site_files:
        combined = _merge(combined, _read(yaml_path))
    return combined
|
|
|
|
|
|
def load_resolutions() -> set[str]:
    """Return the ``resolutions`` entries of ``resolutions.yaml`` as a set.

    A key that is missing or present but empty (YAML null) yields an empty
    set instead of raising ``TypeError``.
    """
    return set(_load("resolutions.yaml").get("resolutions") or [])
|
|
|
|
|
|
def load_sources() -> set[str]:
    """Return the ``sources`` entries of ``sources.yaml`` as a set.

    A key that is missing or present but empty (YAML null) yields an empty
    set instead of raising ``TypeError``.
    """
    return set(_load("sources.yaml").get("sources") or [])
|
|
|
|
|
|
def load_distributors() -> set[str]:
    """Streaming distributor tokens (NF, AMZN, DSNP, …).

    Distinct from ``load_sources()`` — distributors are uppercase scene
    tags identifying the platform, not the capture origin.

    Tokens are upper-cased on load. A ``distributors`` key that is missing
    or present but empty (YAML null) yields an empty set instead of raising
    ``TypeError``.
    """
    tokens = _load("distributors.yaml").get("distributors") or []
    return {t.upper() for t in tokens}
|
|
|
|
|
|
def load_codecs() -> set[str]:
    """Return the ``codecs`` entries of ``codecs.yaml`` as a set.

    A key that is missing or present but empty (YAML null) yields an empty
    set instead of raising ``TypeError``.
    """
    return set(_load("codecs.yaml").get("codecs") or [])
|
|
|
|
|
|
def load_win_forbidden_chars() -> list[str]:
    """Return the ``win_forbidden_chars`` list from ``filesystem.yaml``.

    A key that is missing or present but empty (YAML null) yields an empty
    list, so the declared return type always holds.
    """
    return _load("filesystem.yaml").get("win_forbidden_chars") or []
|
|
|
|
|
|
def load_video_extensions() -> set[str]:
    """Return the ``video`` entries of ``file_extensions.yaml`` as a set.

    A key that is missing or present but empty (YAML null) yields an empty
    set instead of raising ``TypeError``.
    """
    return set(_load("file_extensions.yaml").get("video") or [])
|
|
|
|
|
|
def load_non_video_extensions() -> set[str]:
    """Return the ``non_video`` entries of ``file_extensions.yaml`` as a set.

    A key that is missing or present but empty (YAML null) yields an empty
    set instead of raising ``TypeError``.
    """
    return set(_load("file_extensions.yaml").get("non_video") or [])
|
|
|
|
|
|
def load_metadata_extensions() -> set[str]:
    """Return the ``metadata`` entries of ``file_extensions.yaml`` as a set.

    A key that is missing or present but empty (YAML null) yields an empty
    set instead of raising ``TypeError``.
    """
    return set(_load("file_extensions.yaml").get("metadata") or [])
|
|
|
|
|
|
def load_subtitle_extensions() -> set[str]:
    """Return the ``subtitle`` entries of ``file_extensions.yaml`` as a set.

    A key that is missing or present but empty (YAML null) yields an empty
    set instead of raising ``TypeError``.
    """
    return set(_load("file_extensions.yaml").get("subtitle") or [])
|
|
|
|
|
|
def load_forbidden_chars() -> set[str]:
    """Return the ``forbidden_chars`` entries of ``release_format.yaml`` as a set.

    A key that is missing or present but empty (YAML null) yields an empty
    set instead of raising ``TypeError``.
    """
    return set(_load("release_format.yaml").get("forbidden_chars") or [])
|
|
|
|
|
|
def load_language_tokens() -> set[str]:
    """Return every language token, upper-cased.

    Combines the ``tokens`` list of ``languages.yaml`` with the ``languages``
    list gathered from the site files. Either list may be missing or present
    but empty (YAML null) without raising.
    """
    base_tokens = _load("languages.yaml").get("tokens") or []
    site_tokens = _load_sites().get("languages") or []
    base = {t.upper() for t in base_tokens}
    sites = {t.upper() for t in site_tokens}
    return base | sites
|
|
|
|
|
|
def load_audio() -> dict:
    """Return the merged (builtin + learned) content of ``audio.yaml``."""
    audio_config = _load("audio.yaml")
    return audio_config
|
|
|
|
|
|
def load_video() -> dict:
    """Return the merged (builtin + learned) content of ``video.yaml``."""
    video_config = _load("video.yaml")
    return video_config
|
|
|
|
|
|
def load_editions() -> dict:
    """Return the ``editions.yaml`` config with site edition tokens appended.

    Tokens found under ``editions.tokens`` in the site files are added to
    the ``tokens`` list when not already present. The ``editions`` mapping
    and both ``tokens`` lists may be missing or present but empty (YAML
    null) without raising.
    """
    base = _load("editions.yaml")
    site_editions = _load_sites().get("editions") or {}
    site_tokens = site_editions.get("tokens") or []
    if site_tokens:
        existing = base.get("tokens") or []
        # Build a new list rather than extending in place, so the list
        # object held by the loaded layer is never mutated.
        base["tokens"] = existing + [t for t in site_tokens if t not in existing]
    return base
|
|
|
|
|
|
def load_sources_extra() -> set[str]:
    """Additional source tokens from site files.

    A ``sources`` key that is missing or present but empty (YAML null)
    yields an empty set instead of raising ``TypeError``.
    """
    return set(_load_sites().get("sources") or [])
|
|
|
|
|
|
def load_hdr_extra() -> set[str]:
    """Additional HDR tokens from site files, upper-cased.

    An ``hdr`` key that is missing or present but empty (YAML null) yields
    an empty set instead of raising ``TypeError``.
    """
    tokens = _load_sites().get("hdr") or []
    return {t.upper() for t in tokens}
|
|
|
|
|
|
def load_media_type_tokens() -> dict:
    """Site-specific media type tokens (doc, concert, collection, integrale).

    A ``media_type_tokens`` key that is missing or present but empty (YAML
    null) yields an empty dict, so the declared return type always holds.
    """
    return _load_sites().get("media_type_tokens") or {}
|
|
|
|
|
|
def load_group_schemas() -> dict:
    """Collect release-group schema YAML files, keyed by uppercase name.

    The builtin ``release_groups`` directory is scanned first, then the
    learned one, so a learned schema with the same name replaces the
    builtin entry. Directories that do not exist are skipped, and so are
    files without a truthy ``name`` field.
    """
    schemas: dict = {}
    for directory in (_GROUPS_ROOT, _LEARNED_GROUPS_ROOT):
        if directory.is_dir():
            for schema_file in sorted(directory.glob("*.yaml")):
                schema = _read(schema_file)
                group_name = schema.get("name")
                if group_name:
                    # A later assignment to the same key replaces the earlier one.
                    schemas[group_name.upper()] = schema
    return schemas
|
|
|
|
|
|
def load_scoring() -> dict:
    """Build the parse-scoring configuration.

    The result always has the three sections ``weights``, ``penalties`` and
    ``thresholds``. Each section starts from the defaults below and is then
    updated with whatever ``scoring.yaml`` provides for it, so a missing or
    partial file only de-tunes the parser instead of breaking it.
    """
    raw = _load("scoring.yaml")
    defaults = {
        "weights": {
            "title": 30,
            "media_type": 20,
            "year": 15,
            "season": 10,
            "episode": 5,
            "resolution": 5,
            "source": 5,
            "codec": 5,
            "group": 5,
        },
        "penalties": {"unknown_token": 5, "max_unknown_penalty": 30},
        "thresholds": {"shitty_min": 60},
    }
    config: dict = {}
    for section, baseline in defaults.items():
        values = dict(baseline)
        # ``or {}`` covers a section that is present but left empty.
        values.update(raw.get(section, {}) or {})
        config[section] = values
    return config
|
|
|
|
|
|
def load_separators() -> list[str]:
    """Return the token separators listed in ``separators.yaml``.

    The canonical "." is always part of the result: when the configured
    list lacks it, it is put in front, so a misconfigured file cannot break
    the parser entirely.
    """
    configured = _load("separators.yaml").get("separators", []) or []
    return configured if "." in configured else [".", *configured]
|