fd3bd1ad8c
Introduce a separate dimension for streaming-platform tags (NF, AMZN, DSNP, HMAX, ATVP, …) so they stop polluting the encoding-source field. WEB-DL is the source; the platform that released it is the distributor. - new distributors.yaml knowledge file - ReleaseKnowledge port exposes distributors set - TokenRole.DISTRIBUTOR + ParsedRelease.distributor field - removed NF/AMZN/DSNP/HMAX/ATVP from sources.yaml - notre_planete fixture now records distributor: NF
173 lines
5.0 KiB
Python
173 lines
5.0 KiB
Python
"""Release knowledge loader.
|
|
|
|
Three-layer merge (lowest → highest priority):
  1. Builtin — alfred/knowledge/release/
  2. Sites   — alfred/knowledge/release/sites/*.yaml (all trackers)
  3. Learned — data/knowledge/release/ (user additions via the learn tool)

Lists are extended additively, scalars from higher layers win.
|
|
"""
|
|
|
|
from pathlib import Path
|
|
|
|
import yaml
|
|
|
|
import alfred as _alfred_pkg
|
|
|
|
# Knowledge files shipped inside the ``alfred`` package directory.
_BUILTIN_ROOT = Path(_alfred_pkg.__file__).parent.joinpath("knowledge", "release")
# Per-site YAML files, kept in a sub-directory of the builtin root.
_SITES_ROOT = _BUILTIN_ROOT.joinpath("sites")
# Builtin release-group schema files.
_GROUPS_ROOT = _BUILTIN_ROOT.joinpath("release_groups")
# Learned knowledge lives in ``data/knowledge/release`` beside the package
# directory (one level above it), not inside the package itself.
_LEARNED_ROOT = Path(_alfred_pkg.__file__).parent.parent.joinpath(
    "data", "knowledge", "release"
)
# Learned release-group schema files.
_LEARNED_GROUPS_ROOT = _LEARNED_ROOT.joinpath("release_groups")
|
|
|
|
|
|
def _merge(base: dict, overlay: dict) -> dict:
|
|
"""Merge overlay into base — lists are extended, scalars from overlay win."""
|
|
result = dict(base)
|
|
for key, val in overlay.items():
|
|
if key in result and isinstance(result[key], list) and isinstance(val, list):
|
|
result[key] = result[key] + [v for v in val if v not in result[key]]
|
|
else:
|
|
result[key] = val
|
|
return result
|
|
|
|
|
|
def _read(path: Path) -> dict:
|
|
try:
|
|
with open(path, encoding="utf-8") as f:
|
|
return yaml.safe_load(f) or {}
|
|
except FileNotFoundError:
|
|
return {}
|
|
|
|
|
|
def _load(filename: str) -> dict:
    """Load ``filename`` from the builtin root, overlaid by the learned root.

    Either file may be missing; ``_read`` then contributes an empty dict.
    """
    builtin_doc = _read(_BUILTIN_ROOT / filename)
    learned_doc = _read(_LEARNED_ROOT / filename)
    return _merge(builtin_doc, learned_doc)
|
|
|
|
|
|
def _load_sites() -> dict:
    """Fold every ``*.yaml`` file under the sites root into one dict.

    Files are processed in sorted path order, so a later file takes
    priority over an earlier one when ``_merge`` resolves a conflict.
    """
    combined: dict = {}
    for yaml_path in sorted(_SITES_ROOT.glob("*.yaml")):
        site_doc = _read(yaml_path)
        combined = _merge(combined, site_doc)
    return combined
|
|
|
|
|
|
def load_resolutions() -> set[str]:
    """Return the ``resolutions`` list of ``resolutions.yaml`` as a set.

    A missing key, or a key present with a null/empty value, yields an empty
    set instead of raising ``TypeError``.
    """
    return set(_load("resolutions.yaml").get("resolutions") or [])
|
|
|
|
|
|
def load_sources() -> set[str]:
    """Return the ``sources`` list of ``sources.yaml`` as a set.

    A missing key, or a key present with a null/empty value, yields an empty
    set instead of raising ``TypeError``.
    """
    return set(_load("sources.yaml").get("sources") or [])
|
|
|
|
|
|
def load_distributors() -> set[str]:
    """Streaming distributor tokens (NF, AMZN, DSNP, …).

    Distinct from ``load_sources()`` — distributors are uppercase scene
    tags identifying the platform, not the capture origin.

    Every entry of the ``distributors`` list in ``distributors.yaml`` is
    upper-cased. A missing key, or a key present with a null/empty value,
    yields an empty set instead of raising ``TypeError``.
    """
    tokens = _load("distributors.yaml").get("distributors") or []
    return {t.upper() for t in tokens}
|
|
|
|
|
|
def load_codecs() -> set[str]:
    """Return the ``codecs`` list of ``codecs.yaml`` as a set.

    A missing key, or a key present with a null/empty value, yields an empty
    set instead of raising ``TypeError``.
    """
    return set(_load("codecs.yaml").get("codecs") or [])
|
|
|
|
|
|
def load_win_forbidden_chars() -> list[str]:
    """Return the ``win_forbidden_chars`` list of ``filesystem.yaml``.

    A missing key, or a key present with a null/empty value, yields an empty
    list so the declared ``list[str]`` return type always holds.
    """
    return _load("filesystem.yaml").get("win_forbidden_chars") or []
|
|
|
|
|
|
def load_video_extensions() -> set[str]:
    """Return the ``video`` list of ``file_extensions.yaml`` as a set.

    A missing key, or a key present with a null/empty value, yields an empty
    set instead of raising ``TypeError``.
    """
    return set(_load("file_extensions.yaml").get("video") or [])
|
|
|
|
|
|
def load_non_video_extensions() -> set[str]:
    """Return the ``non_video`` list of ``file_extensions.yaml`` as a set.

    A missing key, or a key present with a null/empty value, yields an empty
    set instead of raising ``TypeError``.
    """
    return set(_load("file_extensions.yaml").get("non_video") or [])
|
|
|
|
|
|
def load_metadata_extensions() -> set[str]:
    """Return the ``metadata`` list of ``file_extensions.yaml`` as a set.

    A missing key, or a key present with a null/empty value, yields an empty
    set instead of raising ``TypeError``.
    """
    return set(_load("file_extensions.yaml").get("metadata") or [])
|
|
|
|
|
|
def load_subtitle_extensions() -> set[str]:
    """Return the ``subtitle`` list of ``file_extensions.yaml`` as a set.

    A missing key, or a key present with a null/empty value, yields an empty
    set instead of raising ``TypeError``.
    """
    return set(_load("file_extensions.yaml").get("subtitle") or [])
|
|
|
|
|
|
def load_forbidden_chars() -> set[str]:
    """Return the ``forbidden_chars`` list of ``release_format.yaml`` as a set.

    A missing key, or a key present with a null/empty value, yields an empty
    set instead of raising ``TypeError``.
    """
    return set(_load("release_format.yaml").get("forbidden_chars") or [])
|
|
|
|
|
|
def load_language_tokens() -> set[str]:
    """Return the upper-cased language tokens from all knowledge layers.

    Combines the ``tokens`` list of ``languages.yaml`` with the ``languages``
    list gathered from the site files. A missing key, or a key present with a
    null/empty value, contributes nothing instead of raising ``TypeError``.
    """
    base_tokens = _load("languages.yaml").get("tokens") or []
    site_tokens = _load_sites().get("languages") or []
    base = {t.upper() for t in base_tokens}
    sites = {t.upper() for t in site_tokens}
    return base | sites
|
|
|
|
|
|
def load_audio() -> dict:
    """Return the merged contents of ``audio.yaml`` as produced by ``_load``."""
    audio_knowledge = _load("audio.yaml")
    return audio_knowledge
|
|
|
|
|
|
def load_video() -> dict:
    """Return the merged contents of ``video.yaml`` as produced by ``_load``."""
    video_knowledge = _load("video.yaml")
    return video_knowledge
|
|
|
|
|
|
def load_editions() -> dict:
    """Return ``editions.yaml`` with site edition tokens appended.

    Tokens found under ``editions.tokens`` in the site files are appended to
    the ``tokens`` list of the loaded document, skipping tokens it already
    contains. When the site files provide no tokens the document is returned
    unchanged.

    Null values for ``editions``, ``editions.tokens`` or the base ``tokens``
    key are treated as empty, so a blank key in a YAML file does not raise.
    """
    base = _load("editions.yaml")
    site_editions = _load_sites().get("editions") or {}
    site_tokens = site_editions.get("tokens") or []
    if site_tokens:
        existing = base.get("tokens") or []
        base["tokens"] = existing + [t for t in site_tokens if t not in existing]
    return base
|
|
|
|
|
|
def load_sources_extra() -> set[str]:
    """Additional source tokens from site files.

    Reads the ``sources`` list of the merged site documents. A missing key,
    or a key present with a null/empty value, yields an empty set instead of
    raising ``TypeError``.
    """
    return set(_load_sites().get("sources") or [])
|
|
|
|
|
|
def load_hdr_extra() -> set[str]:
    """Additional HDR tokens from site files, upper-cased.

    Reads the ``hdr`` list of the merged site documents. A missing key, or a
    key present with a null/empty value, yields an empty set instead of
    raising ``TypeError``.
    """
    tokens = _load_sites().get("hdr") or []
    return {t.upper() for t in tokens}
|
|
|
|
|
|
def load_media_type_tokens() -> dict:
    """Site-specific media type tokens (doc, concert, collection, integrale).

    Returns the ``media_type_tokens`` mapping of the merged site documents.
    A missing key, or a key present with a null/empty value, yields an empty
    dict so the declared ``dict`` return type always holds.
    """
    return _load_sites().get("media_type_tokens") or {}
|
|
|
|
|
|
def load_group_schemas() -> dict:
    """Collect release-group schemas, keyed by upper-cased group name.

    The builtin directory ``alfred/knowledge/release/release_groups/`` is
    scanned first and the learned directory
    ``data/knowledge/release/release_groups/`` second, so a learned schema
    replaces a builtin one that has the same name. Directories that do not
    exist are skipped, as are files without a truthy ``name`` entry.
    """
    schemas: dict = {}
    for groups_dir in (_GROUPS_ROOT, _LEARNED_GROUPS_ROOT):
        if groups_dir.is_dir():
            for schema_path in sorted(groups_dir.glob("*.yaml")):
                schema = _read(schema_path)
                group_name = schema.get("name")
                if group_name:
                    # Later assignments overwrite earlier ones: learned wins.
                    schemas[group_name.upper()] = schema
    return schemas
|
|
|
|
|
|
def load_separators() -> list[str]:
    """Return the single-char separators used by the release name tokenizer.

    The canonical "." is prepended when the YAML list lacks it, so a
    misconfigured file cannot break the parser entirely.
    """
    configured = _load("separators.yaml").get("separators", []) or []
    return configured if "." in configured else [".", *configured]
|