ced72547f7
The domain layer no longer reads YAML files. All knowledge loaders move
from `alfred/domain/*/knowledge/` to `alfred/infrastructure/knowledge/`:
domain/release/knowledge.py
→ infrastructure/knowledge/release.py
domain/shared/knowledge/language_registry.py
→ infrastructure/knowledge/language_registry.py
domain/subtitles/knowledge/{loader,base}.py
→ infrastructure/knowledge/subtitles/{loader,base}.py
Callers in domain/release/{services,value_objects}.py,
domain/subtitles/{aggregates,services/*}.py, and
application/filesystem/manage_subtitles.py updated to absolute imports.
Re-exports of KnowledgeLoader/SubtitleKnowledgeBase from
domain/subtitles/__init__.py dropped (no shim per project convention).
Tests follow the moved targets.
184 lines
6.6 KiB
Python
184 lines
6.6 KiB
Python
"""SubtitleKnowledgeBase — parsed, typed view of the loaded knowledge."""
|
|
|
|
import logging
|
|
|
|
from alfred.infrastructure.knowledge.language_registry import LanguageRegistry
|
|
from alfred.domain.subtitles.value_objects import (
|
|
ScanStrategy,
|
|
SubtitleFormat,
|
|
SubtitleLanguage,
|
|
SubtitleMatchingRules,
|
|
SubtitlePattern,
|
|
SubtitleType,
|
|
TypeDetectionMethod,
|
|
)
|
|
from .loader import KnowledgeLoader
|
|
|
|
# Module-level logger: used below to report skipped patterns and reloads.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class SubtitleKnowledgeBase:
    """
    Typed access to subtitle knowledge (formats, types, languages, patterns).

    Built from KnowledgeLoader — call kb.reload() to pick up newly learned entries
    without restarting.

    Token lookups (languages and types) are case-insensitive: keys are
    lower-cased when the lookup tables are built and again when queried.
    """

    def __init__(
        self,
        loader: KnowledgeLoader | None = None,
        language_registry: LanguageRegistry | None = None,
    ):
        """
        Create the knowledge base and build all lookup tables immediately.

        Args:
            loader: Source of the raw subtitle knowledge. A default
                ``KnowledgeLoader()`` is created when omitted.
            language_registry: Canonical language registry. A default
                ``LanguageRegistry()`` is created when omitted.
        """
        self._loader = loader or KnowledgeLoader()
        self._language_registry = language_registry or LanguageRegistry()
        self._build()

    def _build(self) -> None:
        """(Re)build every lookup table from the loader and the language registry."""
        data = self._loader.subtitles()

        self._build_formats(data)
        self._build_languages(data)
        self._build_types(data)
        self._build_defaults(data)
        self._build_patterns()

    def _build_formats(self, data: dict) -> None:
        """Project the ``formats`` section into ``SubtitleFormat`` objects by id."""
        self._formats: dict[str, SubtitleFormat] = {}
        # ``or {}`` tolerates a section or entry that is present but empty
        # (loaded as None), the same way ``language_tokens`` is handled.
        for fid, fdata in (data.get("formats") or {}).items():
            fdata = fdata or {}
            self._formats[fid] = SubtitleFormat(
                id=fid,
                extensions=fdata.get("extensions", []),
                description=fdata.get("description", ""),
            )

    def _build_languages(self, data: dict) -> None:
        """Build the language table and the reverse token → language-code map."""
        # Languages are sourced primarily from the canonical LanguageRegistry
        # (alfred/knowledge/iso_languages.yaml — ISO 639-2/B). Subtitle-specific
        # tokens (VOSTFR, VF, VFF…) are merged on top from subtitles.yaml's
        # ``language_tokens`` section.
        subtitle_extras: dict[str, list[str]] = {
            code: list(tokens or [])
            for code, tokens in (data.get("language_tokens", {}) or {}).items()
        }

        self._languages: dict[str, SubtitleLanguage] = {}
        self._lang_token_map: dict[str, str] = {}

        for language in self._language_registry.all():
            # Order matters: ISO code first, then names, aliases, and finally
            # the subtitle-specific extras; duplicates are skipped.
            tokens: list[str] = [language.iso, language.english_name.lower()]
            if language.native_name.lower() not in tokens:
                tokens.append(language.native_name.lower())
            for alias in language.aliases:
                if alias not in tokens:
                    tokens.append(alias)
            for extra in subtitle_extras.get(language.iso, []):
                if extra.lower() not in tokens:
                    tokens.append(extra.lower())

            self._languages[language.iso] = SubtitleLanguage(
                code=language.iso,
                tokens=tokens,
            )
            for token in tokens:
                self._lang_token_map[token.lower()] = language.iso

        # Subtitle-specific tokens for languages NOT in the canonical registry
        # are still honored: register them as a minimal SubtitleLanguage.
        for code, extras in subtitle_extras.items():
            if code in self._languages:
                continue
            tokens = [code] + [e.lower() for e in extras]
            self._languages[code] = SubtitleLanguage(code=code, tokens=tokens)
            for token in tokens:
                self._lang_token_map[token.lower()] = code

    def _build_types(self, data: dict) -> None:
        """Build the reverse token → ``SubtitleType`` map from the ``types`` section."""
        self._type_token_map: dict[str, SubtitleType] = {}
        for type_id, tdata in (data.get("types") or {}).items():
            # Not guarded: an id that SubtitleType rejects propagates to the caller.
            stype = SubtitleType(type_id)
            for token in (tdata or {}).get("tokens") or []:
                self._type_token_map[token.lower()] = stype

    def _build_defaults(self, data: dict) -> None:
        """Build the default matching rules, falling back to built-in values."""
        d = data.get("defaults") or {}
        self._default_rules = SubtitleMatchingRules(
            preferred_languages=d.get("languages", ["fre", "eng"]),
            preferred_formats=d.get("formats", ["srt"]),
            allowed_types=d.get("types", ["standard", "forced"]),
            format_priority=d.get("format_priority", ["srt", "ass"]),
            min_confidence=d.get("min_confidence", 0.7),
        )

    def _build_patterns(self) -> None:
        """
        Build the pattern table from the loader's patterns.

        A pattern whose construction raises ``ValueError`` is logged and skipped
        so that one bad entry does not prevent the others from loading.
        """
        self._patterns: dict[str, SubtitlePattern] = {}
        for pid, pdata in self._loader.patterns().items():
            pdata = pdata or {}
            try:
                self._patterns[pid] = SubtitlePattern(
                    id=pid,
                    description=pdata.get("description", ""),
                    scan_strategy=ScanStrategy(pdata.get("scan_strategy", "adjacent")),
                    root_folder=pdata.get("root_folder"),
                    type_detection=TypeDetectionMethod(
                        (pdata.get("type_detection") or {}).get(
                            "method", "token_in_name"
                        )
                    ),
                    version=pdata.get("version", "1.0"),
                )
            except ValueError as e:
                logger.warning(
                    "SubtitleKnowledgeBase: skipping pattern '%s': %s", pid, e
                )

    def reload(self) -> None:
        """Replace the loader with a fresh one and rebuild all lookup tables."""
        # NOTE(review): this discards a loader injected via __init__ in favor of
        # a default KnowledgeLoader() — presumably so data is re-read; confirm.
        self._loader = KnowledgeLoader()
        self._build()
        logger.info("SubtitleKnowledgeBase: reloaded")

    # --- Defaults ---

    def default_rules(self) -> SubtitleMatchingRules:
        """Return the default matching rules built from the ``defaults`` section."""
        return self._default_rules

    # --- Formats ---

    def formats(self) -> dict[str, SubtitleFormat]:
        """Return the known formats keyed by format id (the internal dict itself)."""
        return self._formats

    def format_for_extension(self, ext: str) -> SubtitleFormat | None:
        """Return the first format whose ``matches_extension(ext)`` is true, else None."""
        for fmt in self._formats.values():
            if fmt.matches_extension(ext):
                return fmt
        return None

    def known_extensions(self) -> set[str]:
        """Return the union of the extensions of every known format."""
        exts: set[str] = set()
        for fmt in self._formats.values():
            exts.update(fmt.extensions)
        return exts

    # --- Languages ---

    def languages(self) -> dict[str, SubtitleLanguage]:
        """Return the known languages keyed by code (the internal dict itself)."""
        return self._languages

    def language_for_token(self, token: str) -> SubtitleLanguage | None:
        """Return the language registered for ``token`` (case-insensitive), else None."""
        code = self._lang_token_map.get(token.lower())
        return self._languages.get(code) if code else None

    def is_known_lang_token(self, token: str) -> bool:
        """Return True if ``token`` (case-insensitive) maps to a known language."""
        return token.lower() in self._lang_token_map

    # --- Types ---

    def type_for_token(self, token: str) -> SubtitleType | None:
        """Return the subtitle type registered for ``token`` (case-insensitive), else None."""
        return self._type_token_map.get(token.lower())

    def is_known_type_token(self, token: str) -> bool:
        """Return True if ``token`` (case-insensitive) maps to a known subtitle type."""
        return token.lower() in self._type_token_map

    # --- Patterns ---

    def patterns(self) -> dict[str, SubtitlePattern]:
        """Return the loaded patterns keyed by pattern id (the internal dict itself)."""
        return self._patterns

    def pattern(self, pattern_id: str) -> SubtitlePattern | None:
        """Return the pattern with the given id, or None if it is not loaded."""
        return self._patterns.get(pattern_id)

    def patterns_for_group(self, group_name: str) -> list[SubtitlePattern]:
        """
        Return the loaded patterns listed under a release group's ``known_patterns``.

        Returns an empty list when the loader has no entry for ``group_name``;
        pattern ids that are not loaded are silently skipped.
        """
        group = self._loader.release_group(group_name)
        if not group:
            return []
        return [
            self._patterns[pid]
            for pid in group.get("known_patterns", [])
            if pid in self._patterns
        ]
|