feat: release parser, media type detection, ffprobe integration

Replace the old domain/media release parser with a full rewrite under
domain/release/:
- ParsedRelease with media_type ("movie" | "tv_show" | "tv_complete" |
  "documentary" | "concert" | "other" | "unknown"), site_tag, parse_path,
  languages, audio_codec, audio_channels, bit_depth, hdr_format, edition
- Well-formedness check + sanitize pipeline (_is_well_formed, _sanitize,
  _strip_site_tag) before token-level parsing
- Multi-token sequence matching for audio (DTS-HD.MA, TrueHD.Atmos…),
  HDR (DV.HDR10…) and editions (DIRECTORS.CUT…)
- Knowledge YAML: file_extensions, release_format, languages, audio,
  video, editions, sites/c411

New infrastructure:
- ffprobe.py — single-pass probe returning MediaInfo (video, audio
  tracks, subtitle tracks)
- find_video.py — locate first video file in a release folder

New application helpers:
- detect_media_type — filesystem-based type refinement
- enrich_from_probe — fill missing ParsedRelease fields from MediaInfo

New agent tools:
- analyze_release — parse + detect type + ffprobe in one call
- probe_media — standalone ffprobe for a specific file

New domain value object:
- MediaInfo + AudioTrack + SubtitleTrack (domain/shared/media_info.py)

Testing CLIs:
- recognize_folders_in_downloads.py — full pipeline with colored output
- probe_video.py — display MediaInfo for a video file
This commit is contained in:
2026-05-12 16:14:20 +02:00
parent 249c5de76a
commit 1723b9fa53
32 changed files with 2323 additions and 562 deletions
+1 -1
View File
@@ -8,7 +8,7 @@ from typing import Any
from alfred.infrastructure.persistence import get_memory from alfred.infrastructure.persistence import get_memory
from alfred.settings import settings from alfred.settings import settings
from .prompts import PromptBuilder from .prompt import PromptBuilder
from .registry import Tool, make_tools from .registry import Tool, make_tools
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
-206
View File
@@ -1,206 +0,0 @@
"""Prompt builder for the agent system."""
import json
from typing import Any
from alfred.infrastructure.persistence import get_memory
from alfred.infrastructure.persistence.memory import MemoryRegistry
from .registry import Tool
class PromptBuilder:
    """Assemble the agent's system prompt and the tool spec sent to the LLM.

    The prompt is built from static instructions, the registered tools, the
    memory schema, the stored configuration and the current memory context.
    """

    def __init__(self, tools: dict[str, Tool]):
        self.tools = tools
        self._memory_registry = MemoryRegistry()

    def build_tools_spec(self) -> list[dict[str, Any]]:
        """Return one function-style spec dict per registered tool."""
        return [
            {
                "type": "function",
                "function": {
                    "name": tool.name,
                    "description": tool.description,
                    "parameters": tool.parameters,
                },
            }
            for tool in self.tools.values()
        ]

    def _format_tools_description(self) -> str:
        """Render each tool as a bullet with its JSON-encoded parameters."""
        if not self.tools:
            return ""

        entries = []
        for tool in self.tools.values():
            params = json.dumps(tool.parameters, ensure_ascii=False)
            entries.append(f"- {tool.name}: {tool.description}\n Parameters: {params}")
        return "\n".join(entries)

    def _format_episodic_context(self, memory) -> str:
        """Summarise episodic memory: search, question, downloads, errors, events."""
        episodic = memory.episodic
        out = []

        search = episodic.last_search_results
        if search:
            hits = search.get("results", [])
            out.append(f"\nLAST SEARCH: '{search.get('query')}' ({len(hits)} results)")
            # Only the first five results are listed; the rest are counted.
            out.extend(
                f" {rank}. {hit.get('name', 'Unknown')}"
                for rank, hit in enumerate(hits[:5], start=1)
            )
            if len(hits) > 5:
                out.append(f" ... and {len(hits) - 5} more")

        pending = episodic.pending_question
        if pending:
            out.append(f"\nPENDING QUESTION: {pending.get('question')}")
            out.append(f" Type: {pending.get('type')}")
            options = pending.get("options")
            if options:
                out.append(f" Options: {len(options)}")

        downloads = episodic.active_downloads
        if downloads:
            out.append(f"\nACTIVE DOWNLOADS: {len(downloads)}")
            out.extend(
                f" - {dl.get('name')}: {dl.get('progress', 0)}%" for dl in downloads[:3]
            )

        errors = episodic.recent_errors
        if errors:
            out.append("\nRECENT ERRORS (up to 3):")
            # The three most recent entries are the tail of the list.
            out.extend(
                f" - Action '{err.get('action')}' failed: {err.get('error')}"
                for err in errors[-3:]
            )

        unread = [ev for ev in episodic.background_events if not ev.get("read")]
        if unread:
            out.append(f"\nUNREAD EVENTS: {len(unread)}")
            out.extend(f" - {ev.get('type')}: {ev.get('data')}" for ev in unread[:3])

        return "\n".join(out)

    def _format_stm_context(self, memory) -> str:
        """Summarise short-term memory: workflow, topic, entities, language."""
        stm = memory.stm
        out = []

        workflow = stm.current_workflow
        if workflow:
            out.append(
                f"CURRENT WORKFLOW: {workflow.get('type')} (stage: {workflow.get('stage')})"
            )
            if workflow.get("target"):
                out.append(f" Target: {workflow.get('target')}")

        if stm.current_topic:
            out.append(f"CURRENT TOPIC: {stm.current_topic}")

        if stm.extracted_entities:
            out.append("EXTRACTED ENTITIES:")
            out.extend(f" - {key}: {value}" for key, value in stm.extracted_entities.items())

        if stm.language:
            out.append(f"CONVERSATION LANGUAGE: {stm.language}")

        return "\n".join(out)

    def _format_memory_schema(self) -> str:
        """List the memory components per tier as reported by the memory registry."""
        tier_labels = {
            "ltm": "LONG-TERM (persisted)",
            "stm": "SHORT-TERM (session)",
            "episodic": "EPISODIC (volatile)",
        }
        out = ["MEMORY COMPONENTS:"]
        for tier, components in self._memory_registry.schema().items():
            if components:
                out.append(f"\n [{tier_labels.get(tier, tier.upper())}]")
                for component in components:
                    access = component.get("access", "read")
                    out.append(
                        f" {component['name']} ({access}): {component['description']}"
                    )
                    for field_name, field_desc in component.get("fields", {}).items():
                        out.append(f" · {field_name}: {field_desc}")
        return "\n".join(out)

    def _format_config_context(self, memory) -> str:
        """List the configured workspace and library folders."""
        ltm = memory.ltm
        folders = {**ltm.workspace.as_dict(), **ltm.library_paths.to_dict()}

        out = ["CURRENT CONFIGURATION:"]
        if not folders:
            out.append(" (no configuration set)")
        else:
            out.extend(f" - {key}: {value}" for key, value in folders.items())
        return "\n".join(out)

    def build_system_prompt(self) -> str:
        """Build the complete system prompt from all sections, one per line."""
        # Memory is fetched once and shared by every context formatter.
        memory = get_memory()

        base = "You are a helpful AI assistant for managing a media library."
        language_instruction = (
            "Your first task is to determine the user's language from their message "
            "and use the `set_language` tool if it's different from the current one. "
            "After that, proceed to help the user."
        )

        tools_desc = self._format_tools_description()
        tools_section = f"\nAVAILABLE TOOLS:\n{tools_desc}" if tools_desc else ""

        memory_schema = self._format_memory_schema()

        # Non-empty sections get a leading blank line to separate them.
        config_section = self._format_config_context(memory)
        config_section = f"\n{config_section}" if config_section else config_section

        stm_context = self._format_stm_context(memory)
        stm_context = f"\n{stm_context}" if stm_context else stm_context

        episodic_context = self._format_episodic_context(memory)

        rules = (
            "\n"
            "IMPORTANT RULES:\n"
            "- Use tools to accomplish tasks\n"
            "- When search results are available, reference them by index "
            '(e.g., "add_torrent_by_index")\n'
            "- Always confirm actions with the user before executing destructive operations\n"
            "- Provide clear, concise responses\n"
        )
        examples = (
            "\n"
            "EXAMPLES:\n"
            '- User: "Find Inception" → Use find_media_imdb_id, then find_torrent\n'
            '- User: "download the 3rd one" → Use add_torrent_by_index with index=3\n'
            '- User: "List my downloads" → Use list_folder with folder_type="download"\n'
        )

        sections = [
            base,
            language_instruction,
            tools_section,
            memory_schema,
            config_section,
            stm_context,
            episodic_context,
            rules,
            examples,
        ]
        return "\n".join(sections) + "\n"
+2
View File
@@ -97,6 +97,8 @@ def make_tools(settings) -> dict[str, Tool]:
tool_functions = [ tool_functions = [
fs_tools.set_path_for_folder, fs_tools.set_path_for_folder,
fs_tools.list_folder, fs_tools.list_folder,
fs_tools.analyze_release,
fs_tools.probe_media,
fs_tools.resolve_destination, fs_tools.resolve_destination,
fs_tools.move_media, fs_tools.move_media,
fs_tools.manage_subtitles, fs_tools.manage_subtitles,
+122
View File
@@ -14,7 +14,11 @@ from alfred.application.filesystem import (
ResolveDestinationUseCase, ResolveDestinationUseCase,
SetFolderPathUseCase, SetFolderPathUseCase,
) )
from alfred.application.filesystem.detect_media_type import detect_media_type
from alfred.application.filesystem.enrich_from_probe import enrich_from_probe
from alfred.infrastructure.filesystem import FileManager from alfred.infrastructure.filesystem import FileManager
from alfred.infrastructure.filesystem.ffprobe import probe
from alfred.infrastructure.filesystem.find_video import find_video_file
_LEARNED_ROOT = Path(_alfred_pkg.__file__).parent.parent / "data" / "knowledge" _LEARNED_ROOT = Path(_alfred_pkg.__file__).parent.parent / "data" / "knowledge"
@@ -213,6 +217,124 @@ def set_path_for_folder(folder_name: str, path_value: str) -> dict[str, Any]:
return response.to_dict() return response.to_dict()
def analyze_release(release_name: str, source_path: str) -> dict[str, Any]:
    """
    Fully analyze a release: parse name, detect media type, probe video with ffprobe.

    Runs the release-name parser, refines the media type from the files found
    at source_path, then — unless the type is "unknown" or "other" — probes
    the first video file found and fills missing fields from the probe result.

    Args:
        release_name: Raw release folder or file name.
        source_path: Absolute path to the release folder or file on disk.

    Returns:
        Dict with status ("ok") followed by the parsed fields: media_type,
        parse_path, title, year, season, episode, episode_end, quality,
        source, codec, group, languages, audio_codec, audio_channels,
        bit_depth, hdr_format, edition, site_tag, is_season_pack,
        and probe_used (bool).
    """
    from alfred.domain.release.services import parse_release

    release_path = Path(source_path)
    parsed = parse_release(release_name)
    parsed.media_type = detect_media_type(parsed, release_path)

    probe_used = False
    if parsed.media_type not in ("unknown", "other"):
        video_file = find_video_file(release_path)
        media_info = probe(video_file) if video_file else None
        if media_info:
            enrich_from_probe(parsed, media_info)
            probe_used = True

    # Attributes copied verbatim from the parsed release, in output order.
    reported_fields = (
        "media_type",
        "parse_path",
        "title",
        "year",
        "season",
        "episode",
        "episode_end",
        "quality",
        "source",
        "codec",
        "group",
        "languages",
        "audio_codec",
        "audio_channels",
        "bit_depth",
        "hdr_format",
        "edition",
        "site_tag",
        "is_season_pack",
    )
    result: dict[str, Any] = {"status": "ok"}
    for field_name in reported_fields:
        result[field_name] = getattr(parsed, field_name)
    result["probe_used"] = probe_used
    return result
def probe_media(source_path: str) -> dict[str, Any]:
    """
    Run ffprobe on a video file and return detailed media information.

    Intended for inspecting one specific file, independently of release
    name parsing.

    Args:
        source_path: Absolute path to the video file.

    Returns:
        On success, a dict with status "ok", video (codec, resolution, width,
        height, duration_seconds, bitrate_kbps), audio_tracks, subtitle_tracks,
        audio_languages and is_multi_audio. If the path does not exist or the
        probe returns nothing, a dict with status "error", an error code
        ("not_found" or "probe_failed") and a message.
    """
    target = Path(source_path)
    if not target.exists():
        return {
            "status": "error",
            "error": "not_found",
            "message": f"{source_path} does not exist",
        }

    info = probe(target)
    if info is None:
        return {
            "status": "error",
            "error": "probe_failed",
            "message": "ffprobe failed to read the file",
        }

    video = {
        "codec": info.video_codec,
        "resolution": info.resolution,
        "width": info.width,
        "height": info.height,
        "duration_seconds": info.duration_seconds,
        "bitrate_kbps": info.bitrate_kbps,
    }
    audio_tracks = [
        {
            "index": audio.index,
            "codec": audio.codec,
            "channels": audio.channels,
            "channel_layout": audio.channel_layout,
            "language": audio.language,
            "is_default": audio.is_default,
        }
        for audio in info.audio_tracks
    ]
    subtitle_tracks = [
        {
            "index": sub.index,
            "codec": sub.codec,
            "language": sub.language,
            "is_default": sub.is_default,
            "is_forced": sub.is_forced,
        }
        for sub in info.subtitle_tracks
    ]
    return {
        "status": "ok",
        "video": video,
        "audio_tracks": audio_tracks,
        "subtitle_tracks": subtitle_tracks,
        "audio_languages": info.audio_languages,
        "is_multi_audio": info.is_multi_audio,
    }
def list_folder(folder_type: str, path: str = ".") -> dict[str, Any]: def list_folder(folder_type: str, path: str = ".") -> dict[str, Any]:
""" """
List contents of a configured folder. List contents of a configured folder.
@@ -0,0 +1,69 @@
"""
detect_media_type — filesystem-based media type refinement.
Enriches a ParsedRelease.media_type with evidence from the actual source path
(file or folder). Called after parse_release() to produce a final classification.
Classification logic:
1. If source_path is a file — check its extension directly.
2. If source_path is a folder — collect the extensions of the files directly
inside it (first level only; subfolders are not scanned).
3. Decision:
- Any non_video extension AND no video extension → "other"
- Any video extension → keep parsed media_type ("movie" | "tv_show" | "unknown")
- No conclusive extension found → keep parsed media_type as-is
- Mixed (video + non_video) → "unknown"
"""
from __future__ import annotations
from pathlib import Path
from alfred.domain.release.value_objects import (
ParsedRelease,
_METADATA_EXTENSIONS,
_NON_VIDEO_EXTENSIONS,
_VIDEO_EXTENSIONS,
)
def detect_media_type(parsed: ParsedRelease, source_path: Path) -> str:
    """
    Return the media_type to use for parsed, refined by the files at source_path.

    Metadata extensions are ignored. If non-video extensions are present the
    result is "unknown" when video is also present and "other" otherwise.
    In every other case the parsed media_type is returned unchanged.

    parsed is not mutated; the caller decides whether to store the result.
    """
    found = _collect_extensions(source_path)

    # Metadata files (.nfo, .srt, …) accompany most releases and say nothing
    # about the release type, so they are removed before deciding.
    relevant = found - _METADATA_EXTENSIONS

    video_present = not relevant.isdisjoint(_VIDEO_EXTENSIONS)
    other_present = not relevant.isdisjoint(_NON_VIDEO_EXTENSIONS)

    if other_present:
        return "unknown" if video_present else "other"

    # Video only, or nothing conclusive: trust the token-level inference.
    return parsed.media_type
def _collect_extensions(path: Path) -> set[str]:
    """
    Return the lowercase file extensions found at path.

    A missing path gives an empty set, a file gives its own suffix, and a
    folder gives the suffixes of the files directly inside it (subfolders are
    not descended into). Files without a suffix contribute the empty string.
    """
    if not path.exists():
        return set()
    if path.is_file():
        return {path.suffix.lower()}
    return {entry.suffix.lower() for entry in path.iterdir() if entry.is_file()}
@@ -0,0 +1,76 @@
"""enrich_from_probe — fill missing ParsedRelease fields from MediaInfo."""
from __future__ import annotations
from alfred.domain.release.value_objects import ParsedRelease
from alfred.domain.shared.media_info import MediaInfo
# Map ffprobe codec names to scene-style codec tokens
_VIDEO_CODEC_MAP = {
    "hevc": "x265",
    "h264": "x264",
    "h265": "x265",
    "av1": "AV1",
    "vp9": "VP9",
    "mpeg4": "XviD",
}

# Map ffprobe audio codec names to scene-style tokens.
# ffprobe names PCM codecs with an endianness suffix ("pcm_s16le"), so the
# keys must include it or the lookup never matches.
_AUDIO_CODEC_MAP = {
    "eac3": "EAC3",
    "ac3": "AC3",
    "dts": "DTS",
    "truehd": "TrueHD",
    "aac": "AAC",
    "flac": "FLAC",
    "opus": "OPUS",
    "mp3": "MP3",
    "pcm_s16le": "PCM",
    "pcm_s24le": "PCM",
}

# Map channel count to standard layout string
_CHANNEL_MAP = {
    8: "7.1",
    6: "5.1",
    2: "2.0",
    1: "1.0",
}


def enrich_from_probe(parsed: ParsedRelease, info: MediaInfo) -> None:
    """
    Fill None fields in parsed using data from ffprobe MediaInfo.

    Only fields that are currently None are written — values parsed from the
    release name always take priority. Languages are the exception: probe
    languages are appended to the existing list when not already present
    (compared case-insensitively).

    Args:
        parsed: Release to enrich; mutated in place.
        info: Probe result to read from.

    Returns:
        None.
    """
    if parsed.quality is None and info.resolution:
        parsed.quality = info.resolution

    if parsed.codec is None and info.video_codec:
        # Unknown codecs fall back to the upper-cased ffprobe name.
        parsed.codec = _VIDEO_CODEC_MAP.get(info.video_codec.lower(), info.video_codec.upper())

    # bit_depth is left untouched: ffprobe exposes it via pix_fmt, which is
    # not in MediaInfo yet.

    # Audio — use the default track, fall back to the first one.
    track = next((t for t in info.audio_tracks if t.is_default), None)
    if track is None and info.audio_tracks:
        track = info.audio_tracks[0]

    if track:
        if parsed.audio_codec is None and track.codec:
            parsed.audio_codec = _AUDIO_CODEC_MAP.get(track.codec.lower(), track.codec.upper())
        if parsed.audio_channels is None and track.channels:
            # Counts without a standard layout name are reported as "<n>ch".
            parsed.audio_channels = _CHANNEL_MAP.get(track.channels, f"{track.channels}ch")

    # Languages — merge ffprobe languages with token-level ones.
    if info.audio_languages:
        # Track what is present case-insensitively and update it as we append,
        # so repeated probe entries and repeated calls never add duplicates.
        seen = {existing.upper() for existing in parsed.languages}
        for lang in info.audio_languages:
            if not lang:
                continue
            key = lang.upper()
            # "und" = undetermined, not useful
            if key == "UND" or key in seen:
                continue
            parsed.languages.append(lang)
            seen.add(key)
@@ -16,7 +16,7 @@ import re
from dataclasses import dataclass, field from dataclasses import dataclass, field
from pathlib import Path from pathlib import Path
from alfred.domain.media.release_parser import ParsedRelease, parse_release from alfred.domain.release import ParsedRelease, parse_release
from alfred.infrastructure.persistence import get_memory from alfred.infrastructure.persistence import get_memory
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -109,11 +109,20 @@ class ResolveDestinationUseCase:
parsed = parse_release(release_name) parsed = parse_release(release_name)
ext = Path(source_file).suffix # ".mkv" ext = Path(source_file).suffix # ".mkv"
if parsed.is_movie: if parsed.media_type == "movie":
return self._resolve_movie(parsed, tmdb_title, tmdb_year, ext) return self._resolve_movie(parsed, tmdb_title, tmdb_year, ext)
if parsed.media_type == "tv_show":
return self._resolve_tvshow( return self._resolve_tvshow(
parsed, tmdb_title, tmdb_year, tmdb_episode_title, ext, confirmed_folder parsed, tmdb_title, tmdb_year, tmdb_episode_title, ext, confirmed_folder
) )
return ResolvedDestination(
status="error",
error="unsupported_media_type",
message=(
f"Cannot organize '{release_name}': detected as '{parsed.media_type}'. "
"Only movies and TV shows are supported."
),
)
# ------------------------------------------------------------------ # ------------------------------------------------------------------
# Movie # Movie
-5
View File
@@ -1,5 +0,0 @@
"""Media domain — shared naming and release parsing."""
from .release_parser import ParsedRelease, parse_release
__all__ = ["ParsedRelease", "parse_release"]
-306
View File
@@ -1,306 +0,0 @@
"""
release_parser.py — Parse a release name into structured components.
Handles both dot-separated and space-separated release names:
Oz.S03.1080p.WEBRip.x265-KONTRAST
Oz S03 1080p WEBRip x265-KONTRAST
Inception.2010.1080p.BluRay.x265-GROUP
"""
from __future__ import annotations
import re
from dataclasses import dataclass, field
# Known quality tokens
_QUALITIES = {"2160p", "1080p", "720p", "480p", "576p", "4k", "8k"}
# Known source tokens (case-insensitive match)
_SOURCES = {
"bluray", "blu-ray", "bdrip", "brrip",
"webrip", "web-rip", "webdl", "web-dl", "web",
"hdtv", "hdrip", "dvdrip", "dvd", "vodrip",
"amzn", "nf", "dsnp", "hmax", "atvp",
}
# Known codec tokens
_CODECS = {
"x264", "x265", "h264", "h265", "hevc", "avc",
"xvid", "divx", "av1", "vp9",
"h.264", "h.265",
}
# Windows-forbidden characters (we strip these from display names)
_WIN_FORBIDDEN = re.compile(r'[?:*"<>|\\]')
# Episode/season pattern: S01, S01E02, S01E02E03, 1x02, etc.
_SEASON_EP_RE = re.compile(
r"S(\d{1,2})(?:E(\d{2})(?:E(\d{2}))?)?",
re.IGNORECASE,
)
# Year pattern
_YEAR_RE = re.compile(r"\b(19\d{2}|20\d{2})\b")
@dataclass
class ParsedRelease:
"""Structured representation of a parsed release name."""
raw: str # original release name (untouched)
normalised: str # dots instead of spaces
title: str # show/movie title (dots, no year/season/tech)
year: int | None # movie year or show start year (from TMDB)
season: int | None # season number (None for movies)
episode: int | None # first episode number (None if season-pack)
episode_end: int | None # last episode for multi-ep (None otherwise)
quality: str | None # 1080p, 2160p, …
source: str | None # WEBRip, BluRay, …
codec: str | None # x265, HEVC, …
group: str # release group, "UNKNOWN" if missing
tech_string: str # quality.source.codec joined with dots
# -------------------------------------------------------------------------
# Derived helpers
# -------------------------------------------------------------------------
@property
def is_movie(self) -> bool:
return self.season is None
@property
def is_season_pack(self) -> bool:
return self.season is not None and self.episode is None
def show_folder_name(self, tmdb_title: str, tmdb_year: int) -> str:
"""
Build the series root folder name.
Format: {Title}.{Year}.{Tech}-{Group}
Example: Oz.1997.1080p.WEBRip.x265-KONTRAST
"""
title_part = _sanitise_for_fs(tmdb_title).replace(" ", ".")
tech = self.tech_string or "Unknown"
return f"{title_part}.{tmdb_year}.{tech}-{self.group}"
def season_folder_name(self) -> str:
"""
Build the season subfolder name = normalised release name (no episode).
Example: Oz.S03.1080p.WEBRip.x265-KONTRAST
For a single-episode release we still strip the episode token so the
folder can hold the whole season.
"""
return _strip_episode_from_normalised(self.normalised)
def episode_filename(self, tmdb_episode_title: str | None, ext: str) -> str:
"""
Build the episode filename.
Format: {Title}.{SxxExx}.{EpisodeTitle}.{Tech}-{Group}.{ext}
Example: Oz.S01E01.The.Routine.1080p.WEBRip.x265-KONTRAST.mkv
If tmdb_episode_title is None, omits the episode title segment.
"""
title_part = _sanitise_for_fs(self.title) # already dotted from normalised
s = f"S{self.season:02d}" if self.season is not None else ""
e = f"E{self.episode:02d}" if self.episode is not None else ""
se = s + e
ep_title = ""
if tmdb_episode_title:
ep_title = "." + _sanitise_for_fs(tmdb_episode_title).replace(" ", ".")
tech = self.tech_string or "Unknown"
ext_clean = ext.lstrip(".")
return f"{title_part}.{se}{ep_title}.{tech}-{self.group}.{ext_clean}"
def movie_folder_name(self, tmdb_title: str, tmdb_year: int) -> str:
"""
Build the movie folder name.
Format: {Title}.{Year}.{Tech}-{Group}
Example: Inception.2010.1080p.BluRay.x265-GROUP
"""
return self.show_folder_name(tmdb_title, tmdb_year)
def movie_filename(self, tmdb_title: str, tmdb_year: int, ext: str) -> str:
"""
Build the movie filename (same as folder name + extension).
Example: Inception.2010.1080p.BluRay.x265-GROUP.mkv
"""
ext_clean = ext.lstrip(".")
return f"{self.movie_folder_name(tmdb_title, tmdb_year)}.{ext_clean}"
# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------
def parse_release(name: str) -> ParsedRelease:
    """
    Parse a release name and return a ParsedRelease.

    Spaces are normalised to dots first, so dot-separated and
    space-separated names are both accepted.
    """
    normalised = _normalise(name)
    tokens = normalised.split(".")

    season, episode, episode_end = _extract_season_episode(tokens)
    quality, source, codec, group, tech_tokens = _extract_tech(tokens)
    title = _extract_title(tokens, season, episode, tech_tokens)
    year = _extract_year(tokens, title)

    # Only the parts that were actually found go into the tech string.
    tech_string = ".".join(filter(None, (quality, source, codec)))

    return ParsedRelease(
        raw=name,
        normalised=normalised,
        title=title,
        year=year,
        season=season,
        episode=episode,
        episode_end=episode_end,
        quality=quality,
        source=source,
        codec=codec,
        group=group,
        tech_string=tech_string,
    )
# ---------------------------------------------------------------------------
# Internal helpers
# ---------------------------------------------------------------------------
def _normalise(name: str) -> str:
    """Turn spaces into dots, then drop empty segments (repeated, leading or trailing dots)."""
    segments = name.replace(" ", ".").split(".")
    return ".".join(segment for segment in segments if segment)
def _sanitise_for_fs(text: str) -> str:
    """
    Remove characters that are unsafe in a file or folder name.

    Strips everything matched by _WIN_FORBIDDEN and also the forward slash,
    which that pattern does not cover. A slash is a path separator, so
    leaving it in (e.g. "Face/Off") would split a generated name into nested
    directories.
    """
    return _WIN_FORBIDDEN.sub("", text).replace("/", "")
def _extract_season_episode(tokens: list[str]) -> tuple[int | None, int | None, int | None]:
    """
    Find the first season/episode marker in the dot-joined tokens.

    Returns:
        (season, episode, episode_end). All None when no marker is found;
        episode is None for a season-only marker (S03); episode_end is the
        last episode of a multi-episode marker (S14E09E10E11 → 11) and None
        for a single episode.
    """
    joined = ".".join(tokens)
    m = _SEASON_EP_RE.search(joined)
    if not m:
        return None, None, None

    season = int(m.group(1))
    if m.group(2) is None:
        return season, None, None
    episode = int(m.group(2))

    # _SEASON_EP_RE captures at most two episode numbers, so re-scan the whole
    # "ExxExx…" run (starting at the first "E") to find the real last episode.
    run = re.match(r"(?:E\d{2})+", joined[m.start(2) - 1:], re.IGNORECASE)
    numbers = re.findall(r"\d{2}", run.group(0))
    episode_end = int(numbers[-1]) if len(numbers) > 1 else None

    return season, episode, episode_end
def _extract_tech(
tokens: list[str],
) -> tuple[str | None, str | None, str | None, str, set[str]]:
"""
Extract quality, source, codec and release group from tokens.
Matching is case-insensitive (tokens are lower-cased before lookup), but the
returned values keep the token's original spelling. When several tokens match
the same category, the last one in the list wins.
Returns (quality, source, codec, group, tech_token_set), where tech_token_set
holds every token that was recognised as quality, source or codec.
Group extraction strategy (in priority order):
1. Token whose part before the last dash is a known codec: x265-GROUP
2. Fallback: scanning from the right, a dashed token that is not a known
source; its part after the last dash becomes the group.
group stays "UNKNOWN" when neither strategy finds one.
"""
quality: str | None = None
source: str | None = None
codec: str | None = None
group = "UNKNOWN"
tech_tokens: set[str] = set()
for tok in tokens:
tl = tok.lower()
if tl in _QUALITIES:
quality = tok
tech_tokens.add(tok)
continue
# Dashed sources listed verbatim in _SOURCES (e.g. "web-dl") are caught here,
# before the generic dash handling below.
if tl in _SOURCES:
source = tok
tech_tokens.add(tok)
continue
if "-" in tok:
# Split on the LAST dash only: "<prefix>-<suffix>".
parts = tok.rsplit("-", 1)
# codec-GROUP (highest priority for group)
if parts[0].lower() in _CODECS:
codec = parts[0]
# A trailing dash with nothing after it ("x265-") leaves the group unknown.
group = parts[1] if parts[1] else "UNKNOWN"
tech_tokens.add(tok)
continue
# Source with dash: the prefix is a known source, or the token with its
# dashes removed is one. The whole token is kept as the source.
if parts[0].lower() in _SOURCES or tok.lower().replace("-", "") in _SOURCES:
source = tok
tech_tokens.add(tok)
continue
# Plain codec token without a group suffix.
if tl in _CODECS:
codec = tok
tech_tokens.add(tok)
# Fallback: if group still UNKNOWN, use the rightmost token with a dash
# that isn't a known source (handles "10bit-Protozoan", "AAC5.1-YTS", etc.)
if group == "UNKNOWN":
for tok in reversed(tokens):
if "-" in tok:
parts = tok.rsplit("-", 1)
tl = tok.lower()
# Dashed sources are not group carriers; keep scanning leftwards.
if tl in _SOURCES or tok.lower().replace("-", "") in _SOURCES:
continue
if parts[1]: # non-empty group part
group = parts[1]
break
return quality, source, codec, group, tech_tokens
def _extract_title(tokens: list[str], season: int | None, episode: int | None, tech_tokens: set[str]) -> str:
    """
    Return the title: the leading tokens up to the first season, year or tech token.

    Falls back to the first token when the name starts with such a token.
    season and episode are accepted but not used.
    """
    known_tech = _QUALITIES | _SOURCES | _CODECS
    codec_or_source = _CODECS | _SOURCES

    def ends_title(tok: str) -> bool:
        # Season marker (prefix match) or a bare four-digit year.
        if _SEASON_EP_RE.match(tok) or _YEAR_RE.fullmatch(tok):
            return True
        # Any token already recognised as, or known to be, a tech token.
        if tok in tech_tokens or tok.lower() in known_tech:
            return True
        # Dashed token with a codec/source piece (likely codec-GROUP).
        return "-" in tok and any(
            piece.lower() in codec_or_source for piece in tok.split("-")
        )

    kept = []
    for tok in tokens:
        if ends_title(tok):
            break
        kept.append(tok)

    return ".".join(kept) if kept else tokens[0]
def _extract_year(tokens: list[str], title: str) -> int | None:
    """Return the first token after the title that is exactly a 19xx/20xx year, else None."""
    # The title occupies as many leading tokens as it has dot-separated parts.
    skip = len(title.split("."))
    for candidate in tokens[skip:]:
        if _YEAR_RE.fullmatch(candidate):
            # A full match means the whole token is the year.
            return int(candidate)
    return None
def _strip_episode_from_normalised(normalised: str) -> str:
    """
    Drop every episode part (Exx) that follows a season marker, keeping Sxx.

    Oz.S03E01.1080p... → Oz.S03.1080p...
    Archer.S14E09E10E11.1080p... → Archer.S14.1080p...
    """
    season_with_episodes = re.compile(r"(S\d{2})(E\d{2})+", re.IGNORECASE)
    return season_with_episodes.sub(lambda found: found.group(1), normalised)
+6
View File
@@ -0,0 +1,6 @@
"""Release domain — release name parsing and naming conventions."""
from .services import parse_release
from .value_objects import ParsedRelease
__all__ = ["ParsedRelease", "parse_release"]
+121
View File
@@ -0,0 +1,121 @@
"""Release knowledge loader.
Three-layer merge (lowest → highest priority):
1. Builtin — alfred/knowledge/release/
2. Sites — alfred/knowledge/release/sites/*.yaml (all trackers)
3. Learned — data/knowledge/release/ (user additions via the learn tool)
Lists are extended additively, scalars from higher layers win.
"""
from pathlib import Path
import alfred as _alfred_pkg
import yaml
_BUILTIN_ROOT = Path(_alfred_pkg.__file__).parent / "knowledge" / "release"
_SITES_ROOT = _BUILTIN_ROOT / "sites"
_LEARNED_ROOT = Path(_alfred_pkg.__file__).parent.parent / "data" / "knowledge" / "release"
def _merge(base: dict, overlay: dict) -> dict:
    """
    Merge overlay into a copy of base and return the result.

    - list + list: overlay items not already present are appended.
    - dict + dict: merged recursively with the same rules, so nested sections
      (e.g. ``editions: {tokens: [...]}``) from several files accumulate
      instead of the last file replacing the earlier ones.
    - anything else: the overlay value wins.

    Neither base nor overlay is mutated.
    """
    result = dict(base)
    for key, val in overlay.items():
        current = result.get(key)
        if isinstance(current, list) and isinstance(val, list):
            result[key] = current + [v for v in val if v not in current]
        elif isinstance(current, dict) and isinstance(val, dict):
            result[key] = _merge(current, val)
        else:
            result[key] = val
    return result
def _read(path: Path) -> dict:
    """Parse the YAML file at path; a missing or empty file yields an empty dict."""
    try:
        text = path.read_text(encoding="utf-8")
    except FileNotFoundError:
        return {}
    return yaml.safe_load(text) or {}
def _load(filename: str) -> dict:
    """Load filename from the builtin root and merge the learned copy on top of it."""
    builtin = _read(_BUILTIN_ROOT / filename)
    learned = _read(_LEARNED_ROOT / filename)
    return _merge(builtin, learned)
def _load_sites() -> dict:
    """Merge every ``*.yaml`` file under the sites root, in sorted filename order."""
    combined: dict = {}
    site_files = sorted(_SITES_ROOT.glob("*.yaml"))
    for site_file in site_files:
        combined = _merge(combined, _read(site_file))
    return combined
def load_resolutions() -> set[str]:
    """Return the ``resolutions`` list of resolutions.yaml as a set."""
    data = _load("resolutions.yaml")
    return set(data.get("resolutions", []))
def load_sources() -> set[str]:
    """Return the ``sources`` list of sources.yaml as a set."""
    data = _load("sources.yaml")
    return set(data.get("sources", []))
def load_codecs() -> set[str]:
    """Return the ``codecs`` list of codecs.yaml as a set."""
    data = _load("codecs.yaml")
    return set(data.get("codecs", []))
def load_win_forbidden_chars() -> list[str]:
    """Return the ``win_forbidden_chars`` entry of filesystem.yaml (empty list if absent)."""
    data = _load("filesystem.yaml")
    return data.get("win_forbidden_chars", [])
def load_video_extensions() -> set[str]:
    """Return the ``video`` list of file_extensions.yaml as a set."""
    data = _load("file_extensions.yaml")
    return set(data.get("video", []))
def load_non_video_extensions() -> set[str]:
    """Return the ``non_video`` list of file_extensions.yaml as a set."""
    data = _load("file_extensions.yaml")
    return set(data.get("non_video", []))
def load_metadata_extensions() -> set[str]:
    """Return the ``metadata`` list of file_extensions.yaml as a set."""
    data = _load("file_extensions.yaml")
    return set(data.get("metadata", []))
def load_forbidden_chars() -> set[str]:
    """Return the ``forbidden_chars`` list of release_format.yaml as a set."""
    data = _load("release_format.yaml")
    return set(data.get("forbidden_chars", []))
def load_language_tokens() -> set[str]:
    """Return the upper-cased language tokens from languages.yaml plus the site files."""
    builtin = _load("languages.yaml").get("tokens", [])
    from_sites = _load_sites().get("languages", [])
    return {token.upper() for token in (*builtin, *from_sites)}
def load_audio() -> dict:
    """Return the merged content of audio.yaml."""
    audio = _load("audio.yaml")
    return audio
def load_video() -> dict:
    """Return the merged content of video.yaml."""
    video = _load("video.yaml")
    return video
def load_editions() -> dict:
    """Return editions.yaml with the site files' edition tokens appended to ``tokens``."""
    editions = _load("editions.yaml")
    extra = _load_sites().get("editions", {}).get("tokens", [])
    if not extra:
        return editions

    # Append only the site tokens that are not already known.
    known = editions.get("tokens", [])
    editions["tokens"] = known + [token for token in extra if token not in known]
    return editions
def load_sources_extra() -> set[str]:
    """Return the additional source tokens declared in the site files."""
    site_sources = _load_sites().get("sources", [])
    return set(site_sources)
def load_hdr_extra() -> set[str]:
    """Return the additional HDR tokens declared in the site files, upper-cased."""
    site_hdr = _load_sites().get("hdr", [])
    return {token.upper() for token in site_hdr}
def load_media_type_tokens() -> dict:
    """Return the site files' ``media_type_tokens`` mapping (doc, concert, collection, integrale)."""
    sites = _load_sites()
    return sites.get("media_type_tokens", {})
+484
View File
@@ -0,0 +1,484 @@
"""Release domain — parsing service."""
from __future__ import annotations
from .value_objects import (
ParsedRelease,
_AUDIO,
_CODECS,
_EDITIONS,
_FORBIDDEN_CHARS,
_HDR_EXTRA,
_LANGUAGE_TOKENS,
_MEDIA_TYPE_TOKENS,
_RESOLUTIONS,
_SOURCES,
_VIDEO_EXTENSIONS,
_VIDEO_META,
_NON_VIDEO_EXTENSIONS,
)
def parse_release(name: str) -> ParsedRelease:
    """
    Parse a release name and return a ParsedRelease.

    Well-formed names (no forbidden chars) go through full token-level parsing.
    Malformed names go through _sanitize() — strip site tags, replace spaces —
    and are checked again. A name that is still malformed after sanitization is
    returned with media_type="unknown" and parse_path="ai" so the AI can handle it.

    Args:
        name: release name as found on disk / on the tracker.

    Returns:
        A ParsedRelease whose ``raw`` is always the name exactly as received and
        whose ``normalised`` is the string the tokens were actually read from.
        ``parse_path`` is "direct", "sanitized" or "ai".
    """
    # Keep the caller's string: ``name`` is rebound to the sanitized form below,
    # and ``raw`` must not silently become that sanitized form.
    original = name
    site_tag = None
    parse_path = "direct"

    if not _is_well_formed(name):
        clean, site_tag = _sanitize(name)
        if not _is_well_formed(clean):
            # Unrecoverable with the simple rules: hand over to the AI path.
            return ParsedRelease(
                raw=original,
                normalised=clean,
                title=clean,
                year=None,
                season=None,
                episode=None,
                episode_end=None,
                quality=None,
                source=None,
                codec=None,
                group="UNKNOWN",
                tech_string="",
                media_type="unknown",
                site_tag=site_tag,
                parse_path="ai",
            )
        name = clean
        parse_path = "sanitized"

    tokens = name.split(".")

    season, episode, episode_end = _extract_season_episode(tokens)
    quality, source, codec, group, tech_tokens = _extract_tech(tokens)
    languages, lang_tokens = _extract_languages(tokens)
    audio_codec, audio_channels, audio_tokens = _extract_audio(tokens)
    bit_depth, hdr_format, video_tokens = _extract_video_meta(tokens)
    edition, edition_tokens = _extract_edition(tokens)

    # The title stops at the first token recognised by any extractor above.
    title = _extract_title(
        tokens,
        tech_tokens | lang_tokens | audio_tokens | video_tokens | edition_tokens,
    )
    year = _extract_year(tokens, title)
    media_type = _infer_media_type(season, quality, source, codec, year, edition, tokens)

    tech_parts = [p for p in [quality, source, codec] if p]
    tech_string = ".".join(tech_parts)

    return ParsedRelease(
        raw=original,
        normalised=name,
        title=title,
        year=year,
        season=season,
        episode=episode,
        episode_end=episode_end,
        quality=quality,
        source=source,
        codec=codec,
        group=group,
        tech_string=tech_string,
        media_type=media_type,
        site_tag=site_tag,
        parse_path=parse_path,
        languages=languages,
        audio_codec=audio_codec,
        audio_channels=audio_channels,
        bit_depth=bit_depth,
        hdr_format=hdr_format,
        edition=edition,
    )
def _infer_media_type(
    season: int | None,
    quality: str | None,
    source: str | None,
    codec: str | None,
    year: int | None,
    edition: str | None,
    tokens: list[str],
) -> str:
    """
    Classify a release from its tokens alone (the filesystem is never touched).

    Rules, first match wins:
    - "documentary" : a site "doc" token is present
    - "concert"     : a site "concert" token is present
    - "tv_show"     : a season was found
    - "tv_complete" : no season, and either the edition is COMPLETE / INTEGRALE /
                      COLLECTION or a site "integrale" token is present
    - "movie"       : no season, and at least one of quality / source / codec / year
    - "unknown"     : nothing conclusive
    """
    present = {t.upper() for t in tokens}

    def _has_marker(category: str) -> bool:
        markers = {t.upper() for t in _MEDIA_TYPE_TOKENS.get(category, [])}
        return bool(present & markers)

    if _has_marker("doc"):
        return "documentary"
    if _has_marker("concert"):
        return "concert"
    if season is not None:
        return "tv_show"
    # From here on there is no season.
    if edition in {"COMPLETE", "INTEGRALE", "COLLECTION"} or _has_marker("integrale"):
        return "tv_complete"
    if quality or source or codec or year:
        return "movie"
    return "unknown"
def _is_well_formed(name: str) -> bool:
    """Tell whether *name* is free of every entry of ``_FORBIDDEN_CHARS``."""
    for forbidden in _FORBIDDEN_CHARS:
        if forbidden in name:
            return False
    return True
def _sanitize(name: str) -> tuple[str, str | None]:
    """
    Try to turn a malformed release name into a parseable one.

    Two steps, in this order:
    1. remove a ``[...]`` site tag at the start or end of the name
    2. turn every space into a dot

    Returns (clean_name, site_tag); site_tag is None when no tag was removed.
    """
    without_tag, site_tag = _strip_site_tag(name)
    return without_tag.replace(" ", "."), site_tag
def _strip_site_tag(name: str) -> tuple[str, str | None]:
"""
Strip a site watermark tag from the release name and return (clean_name, tag).
Handles two positions:
- Prefix: "[ OxTorrent.vc ] The.Title.S01..."
- Suffix: "The.Title.S01...-NTb[TGx]"
Anything between [...] is treated as a site tag.
Returns (original_name, None) if no tag found.
"""
s = name.strip()
if s.startswith("["):
close = s.find("]")
if close != -1:
tag = s[1:close].strip()
remainder = s[close + 1:].strip()
if tag and remainder:
return remainder, tag
if s.endswith("]"):
open_bracket = s.rfind("[")
if open_bracket != -1:
tag = s[open_bracket + 1:-1].strip()
remainder = s[:open_bracket].strip()
if tag and remainder:
return remainder, tag
return s, None
def _normalize(name: str) -> str:
"""Replace spaces with dots, collapse multiple dots."""
s = name.replace(" ", ".")
while ".." in s:
s = s.replace("..", ".")
return s.strip(".")
def _parse_season_episode(tok: str) -> tuple[int, int | None, int | None] | None:
"""
Parse a single token as a season/episode marker.
Handles: S03, S03E01, S03E01E02
Returns (season, episode, episode_end) or None if not a season token.
"""
upper = tok.upper()
if not (len(upper) >= 3 and upper[0] == "S" and upper[1:3].isdigit()):
return None
season = int(upper[1:3])
rest = upper[3:] # everything after Sxx
if not rest:
return season, None, None
# Parse one or two Exx segments
episodes: list[int] = []
while rest.startswith("E") and len(rest) >= 3 and rest[1:3].isdigit():
episodes.append(int(rest[1:3]))
rest = rest[3:]
if not episodes:
return None # malformed token like "S03XYZ"
episode = episodes[0]
episode_end = episodes[1] if len(episodes) >= 2 else None
return season, episode, episode_end
def _extract_season_episode(tokens: list[str]) -> tuple[int | None, int | None, int | None]:
    """Return the (season, episode, episode_end) of the first season token, or three Nones."""
    candidates = (_parse_season_episode(tok) for tok in tokens)
    return next((hit for hit in candidates if hit is not None), (None, None, None))
def _extract_tech(
    tokens: list[str],
) -> tuple[str | None, str | None, str | None, str, set[str]]:
    """
    Pull the technical markers out of the token list.

    Returns (quality, source, codec, group, tech_token_set). When several
    tokens qualify for the same field, the last one wins. ``group`` is
    "UNKNOWN" when none can be determined.

    How the group is found:
    1. preferably from a "codec-GROUP" token (e.g. x265-GROUP)
    2. otherwise from the rightmost dashed token that is not a known source
    """
    quality: str | None = None
    source: str | None = None
    codec: str | None = None
    group = "UNKNOWN"
    tech_tokens: set[str] = set()

    for token in tokens:
        lowered = token.lower()
        if lowered in _RESOLUTIONS:
            quality = token
        elif lowered in _SOURCES:
            source = token
        else:
            head, dash, tail = token.rpartition("-")
            if dash and head.lower() in _CODECS:
                # codec-GROUP: the strongest evidence for the group name
                codec = head
                group = tail or "UNKNOWN"
            elif dash and (head.lower() in _SOURCES or lowered.replace("-", "") in _SOURCES):
                # dashed source spelling: Web-DL, WEB-DL, …
                source = token
            elif lowered in _CODECS:
                codec = token
            else:
                continue  # not a technical token
        tech_tokens.add(token)

    if group == "UNKNOWN":
        # Second chance: scan from the right for "something-GROUP".
        for token in reversed(tokens):
            _, dash, tail = token.rpartition("-")
            if not dash:
                continue
            lowered = token.lower()
            if lowered in _SOURCES or lowered.replace("-", "") in _SOURCES:
                continue
            if tail:
                group = tail
                break

    return quality, source, codec, group, tech_tokens
def _is_year_token(tok: str) -> bool:
"""Return True if tok is a 4-digit year between 1900 and 2099."""
return len(tok) == 4 and tok.isdigit() and 1900 <= int(tok) <= 2099
def _extract_title(tokens: list[str], tech_tokens: set[str]) -> str:
    """
    Return the leading tokens that form the title, joined with dots.

    Collection stops at the first token that is a season marker, a year, a
    member of *tech_tokens*, a known resolution/source/codec, or a dashed token
    with a codec or source among its dash-separated parts. When nothing is
    collected, the first token is returned as-is.
    """
    known_tech = _RESOLUTIONS | _SOURCES | _CODECS
    codec_or_source = _CODECS | _SOURCES

    kept: list[str] = []
    for token in tokens:
        ends_title = (
            _parse_season_episode(token) is not None
            or _is_year_token(token)
            or token in tech_tokens
            or token.lower() in known_tech
            or (
                "-" in token
                and any(part.lower() in codec_or_source for part in token.split("-"))
            )
        )
        if ends_title:
            break
        kept.append(token)

    if not kept:
        return tokens[0]
    return ".".join(kept)
def _extract_year(tokens: list[str], title: str) -> int | None:
    """Return the first year token located after the title tokens, or None.

    The number of tokens to skip is the number of dot-separated parts of *title*.
    """
    title_token_count = title.count(".") + 1
    after_title = tokens[title_token_count:]
    return next((int(tok) for tok in after_title if _is_year_token(tok)), None)
# ---------------------------------------------------------------------------
# Sequence matcher
# ---------------------------------------------------------------------------
def _match_sequences(
tokens: list[str],
sequences: list[dict],
key: str,
) -> tuple[str | None, set[str]]:
"""
Try to match multi-token sequences against consecutive tokens.
Returns (matched_value, set_of_matched_tokens) or (None, empty_set).
Sequences must be ordered most-specific first in the YAML.
"""
upper_tokens = [t.upper() for t in tokens]
for seq in sequences:
seq_upper = [s.upper() for s in seq["tokens"]]
n = len(seq_upper)
for i in range(len(upper_tokens) - n + 1):
if upper_tokens[i:i + n] == seq_upper:
matched = set(tokens[i:i + n])
return seq[key], matched
return None, set()
# ---------------------------------------------------------------------------
# Language extraction
# ---------------------------------------------------------------------------
def _extract_languages(tokens: list[str]) -> tuple[list[str], set[str]]:
    """
    Collect the language tokens of a release.

    Returns (languages, matched_token_set): the languages upper-cased in order
    of appearance (repeats kept), and the matching tokens in their original case.
    """
    hits = [tok for tok in tokens if tok.upper() in _LANGUAGE_TOKENS]
    return [tok.upper() for tok in hits], set(hits)
# ---------------------------------------------------------------------------
# Audio extraction
# ---------------------------------------------------------------------------
def _extract_audio(
    tokens: list[str],
) -> tuple[str | None, str | None, set[str]]:
    """
    Find the audio codec and the channel layout of a release.

    Returns (audio_codec, audio_channels, matched_token_set).

    Multi-token codec sequences take precedence; single codec tokens are only
    considered when no sequence produced a codec. Only the first codec and the
    first channel layout are kept.
    """
    codec_names = {name.upper() for name in _AUDIO.get("codecs", [])}
    layouts = set(_AUDIO.get("channels", []))

    # 1. Multi-token codec sequences
    codec, consumed = _match_sequences(tokens, _AUDIO.get("sequences", []), "codec")
    if not codec:
        codec, consumed = None, set()

    # 2. Channel layout. Splitting the name on dots tears "5.1" into "5" and "1",
    #    so look at neighbouring tokens; the right-hand one may still carry a
    #    "-GROUP" suffix ("1-KTH"), which is ignored for the comparison.
    channels: str | None = None
    for left, right in zip(tokens, tokens[1:]):
        layout = f"{left}.{right.split('-')[0]}"
        if layout in layouts:
            channels = layout
            consumed.update((left, right))
            break

    # 3. Single tokens, for whatever is still missing
    for token in tokens:
        if token in consumed:
            continue
        if codec is None and token.upper() in codec_names:
            codec = token
            consumed.add(token)
        elif channels is None and token in layouts:
            channels = token
            consumed.add(token)

    return codec, channels, consumed
# ---------------------------------------------------------------------------
# Video metadata extraction (bit depth, HDR)
# ---------------------------------------------------------------------------
def _extract_video_meta(
    tokens: list[str],
) -> tuple[str | None, str | None, set[str]]:
    """
    Find the bit depth and the HDR format of a release.

    Returns (bit_depth, hdr_format, matched_token_set). A multi-token HDR
    sequence takes precedence over single HDR tokens. Single-token results are
    normalised: HDR upper-cased, bit depth lower-cased.
    """
    hdr_names = {name.upper() for name in _VIDEO_META.get("hdr", [])} | _HDR_EXTRA
    depth_names = {name.lower() for name in _VIDEO_META.get("bit_depth", [])}

    # Multi-token HDR sequences first
    hdr, consumed = _match_sequences(tokens, _VIDEO_META.get("sequences", []), "hdr")
    if not hdr:
        hdr, consumed = None, set()

    depth: str | None = None
    for token in tokens:
        if token in consumed:
            continue
        as_upper, as_lower = token.upper(), token.lower()
        if hdr is None and as_upper in hdr_names:
            hdr = as_upper
            consumed.add(token)
        elif depth is None and as_lower in depth_names:
            depth = as_lower
            consumed.add(token)

    return depth, hdr, consumed
# ---------------------------------------------------------------------------
# Edition extraction
# ---------------------------------------------------------------------------
def _extract_edition(tokens: list[str]) -> tuple[str | None, set[str]]:
    """
    Find the release edition (UNRATED, EXTENDED, DIRECTORS.CUT, …).

    Returns (edition, matched_token_set). Multi-token editions are looked for
    first; failing that, the first single edition token is returned upper-cased.
    (None, empty set) when the release carries no edition.
    """
    edition, consumed = _match_sequences(tokens, _EDITIONS.get("sequences", []), "edition")
    if edition:
        return edition, consumed

    single_names = {name.upper() for name in _EDITIONS.get("tokens", [])}
    hit = next((tok for tok in tokens if tok.upper() in single_names), None)
    if hit is None:
        return None, set()
    return hit.upper(), {hit}
+166
View File
@@ -0,0 +1,166 @@
"""Release domain — value objects and token sets."""
from __future__ import annotations
from dataclasses import dataclass
from .knowledge import (
load_audio,
load_codecs,
load_editions,
load_forbidden_chars,
load_hdr_extra,
load_language_tokens,
load_media_type_tokens,
load_metadata_extensions,
load_non_video_extensions,
load_resolutions,
load_sources,
load_sources_extra,
load_video,
load_video_extensions,
load_win_forbidden_chars,
)
# Token sets — loaded once at import time from alfred/knowledge/release/
# Every name below is populated by calling its loader when this module is
# imported, so importing the module reads the knowledge files.
_RESOLUTIONS: set[str] = load_resolutions()
# Base source tokens merged with the extra sources contributed by site files.
_SOURCES: set[str] = load_sources() | load_sources_extra()
_CODECS: set[str] = load_codecs()
_VIDEO_EXTENSIONS: set[str] = load_video_extensions()
_NON_VIDEO_EXTENSIONS: set[str] = load_non_video_extensions()
_METADATA_EXTENSIONS: set[str] = load_metadata_extensions()
_FORBIDDEN_CHARS: set[str] = load_forbidden_chars()
_LANGUAGE_TOKENS: set[str] = load_language_tokens()
# Whole knowledge mappings (not flat token sets).
_AUDIO: dict = load_audio()
_VIDEO_META: dict = load_video()
_EDITIONS: dict = load_editions()
_HDR_EXTRA: set[str] = load_hdr_extra()
_MEDIA_TYPE_TOKENS: dict = load_media_type_tokens()
# Translation table for stripping Windows-forbidden characters
# (str.maketrans with a third argument maps each listed character to None,
# i.e. str.translate deletes it).
_WIN_FORBIDDEN_TABLE = str.maketrans("", "", "".join(load_win_forbidden_chars()))
def _sanitize_for_fs(text: str) -> str:
    """Return *text* with every character of the Windows-forbidden table deleted."""
    cleaned = text.translate(_WIN_FORBIDDEN_TABLE)
    return cleaned
def _strip_episode_from_normalized(normalized: str) -> str:
"""
Remove all episode parts (Exx) from a normalized release name, keeping Sxx.
Oz.S03E01.1080p... → Oz.S03.1080p...
Archer.S14E09E10E11.1080p... → Archer.S14.1080p...
"""
tokens = normalized.split(".")
result = []
for tok in tokens:
upper = tok.upper()
# Token is SxxExx... — keep only the Sxx part
if len(upper) >= 3 and upper[0] == "S" and upper[1:3].isdigit():
result.append(tok[:3]) # "S" + two digits
else:
result.append(tok)
return ".".join(result)
# Keep old names as aliases for backward compatibility during the US English migration
# (British-spelled names bound to the very same function objects).
_sanitise_for_fs = _sanitize_for_fs
_strip_episode_from_normalised = _strip_episode_from_normalized
@dataclass
class ParsedRelease:
    """Structured representation of a parsed release name.

    Besides the parsed fields, the class builds the folder and file names used
    when organising a release (see the ``*_folder_name`` / ``*_filename``
    methods).
    """

    raw: str  # original release name (untouched)
    normalised: str  # dots instead of spaces
    title: str  # show/movie title (dots, no year/season/tech)
    year: int | None  # year token found in the release name, if any
    season: int | None  # season number (None for movies)
    episode: int | None  # first episode number (None if season-pack)
    episode_end: int | None  # last episode for multi-ep (None otherwise)
    quality: str | None  # 1080p, 2160p, …
    source: str | None  # WEBRip, BluRay, …
    codec: str | None  # x265, HEVC, …
    group: str  # release group, "UNKNOWN" if missing
    tech_string: str  # quality.source.codec joined with dots
    # "movie" | "tv_show" | "tv_complete" | "documentary" | "concert" | "other" | "unknown"
    media_type: str = "unknown"
    site_tag: str | None = None  # site watermark stripped from name, e.g. "TGx", "OxTorrent.vc"
    parse_path: str = "direct"  # "direct" | "sanitized" | "ai"
    # ["MULTI", "VFF"], ["FRENCH"], … — None is only a sentinel: __post_init__
    # replaces it with a fresh list so instances never share one.
    languages: list[str] = None
    audio_codec: str | None = None  # "DTS-HD.MA", "DDP", "EAC3", …
    audio_channels: str | None = None  # "5.1", "7.1", "2.0", …
    bit_depth: str | None = None  # "10bit", "8bit", …
    hdr_format: str | None = None  # "DV", "HDR10", "DV.HDR10", …
    edition: str | None = None  # "UNRATED", "EXTENDED", "DIRECTORS.CUT", …

    def __post_init__(self) -> None:
        """Replace the ``languages=None`` sentinel with an empty list."""
        if self.languages is None:
            self.languages = []

    @property
    def is_movie(self) -> bool:
        """True when the release is classified as a movie (media_type == "movie")."""
        return self.media_type == "movie"

    @property
    def is_season_pack(self) -> bool:
        """True when a season is known but no episode is (whole-season release)."""
        return self.season is not None and self.episode is None

    def show_folder_name(self, tmdb_title: str, tmdb_year: int) -> str:
        """
        Build the series root folder name.

        Format: {Title}.{Year}.{Tech}-{Group}
        Example: Oz.1997.1080p.WEBRip.x265-KONTRAST

        The title is stripped of Windows-forbidden characters and its spaces
        become dots; "Unknown" stands in for an empty tech string.
        """
        title_part = _sanitize_for_fs(tmdb_title).replace(" ", ".")
        tech = self.tech_string or "Unknown"
        return f"{title_part}.{tmdb_year}.{tech}-{self.group}"

    def season_folder_name(self) -> str:
        """
        Build the season subfolder name = normalized release name (no episode).

        Example: Oz.S03.1080p.WEBRip.x265-KONTRAST

        For a single-episode release we still strip the episode token so the
        folder can hold the whole season.
        """
        return _strip_episode_from_normalized(self.normalised)

    def episode_filename(self, tmdb_episode_title: str | None, ext: str) -> str:
        """
        Build the episode filename.

        Format: {Title}.{SxxExx}.{EpisodeTitle}.{Tech}-{Group}.{ext}
        Example: Oz.S01E01.The.Routine.1080p.WEBRip.x265-KONTRAST.mkv

        Segments that turn out empty (no season/episode, no episode title) are
        left out altogether, so the name never contains an empty ".." segment.
        A leading dot on *ext* is accepted and ignored.
        """
        segments = [_sanitize_for_fs(self.title)]

        season_part = f"S{self.season:02d}" if self.season is not None else ""
        episode_part = f"E{self.episode:02d}" if self.episode is not None else ""
        if season_part or episode_part:
            segments.append(season_part + episode_part)

        if tmdb_episode_title:
            ep_title = _sanitize_for_fs(tmdb_episode_title).replace(" ", ".")
            if ep_title:
                segments.append(ep_title)

        tech = self.tech_string or "Unknown"
        segments.append(f"{tech}-{self.group}")

        ext_clean = ext.lstrip(".")
        return f"{'.'.join(segments)}.{ext_clean}"

    def movie_folder_name(self, tmdb_title: str, tmdb_year: int) -> str:
        """
        Build the movie folder name.

        Format: {Title}.{Year}.{Tech}-{Group}
        Example: Inception.2010.1080p.BluRay.x265-GROUP
        """
        return self.show_folder_name(tmdb_title, tmdb_year)

    def movie_filename(self, tmdb_title: str, tmdb_year: int, ext: str) -> str:
        """
        Build the movie filename (same as folder name + extension).

        Example: Inception.2010.1080p.BluRay.x265-GROUP.mkv
        """
        ext_clean = ext.lstrip(".")
        return f"{self.movie_folder_name(tmdb_title, tmdb_year)}.{ext_clean}"
+95
View File
@@ -0,0 +1,95 @@
"""MediaInfo — pure domain dataclass for file-level media metadata."""
from __future__ import annotations
from dataclasses import dataclass, field
@dataclass
class AudioTrack:
    """A single audio track as reported by ffprobe.

    Every descriptive field may be None when the information is not available
    for the stream.
    """

    index: int  # position of the stream within the probed file
    codec: str | None  # aac, ac3, eac3, dts, truehd, flac, …
    channels: int | None  # 2, 6 (5.1), 8 (7.1), …
    channel_layout: str | None  # stereo, 5.1, 7.1, …
    language: str | None  # ISO 639-2: fre, eng, und, …
    is_default: bool = False  # True when the stream is flagged as the default track
@dataclass
class SubtitleTrack:
    """A single subtitle track as reported by ffprobe.

    Codec and language may be None when the information is not available for
    the stream.
    """

    index: int  # position of the stream within the probed file
    codec: str | None  # subrip, ass, hdmv_pgs_subtitle, …
    language: str | None  # ISO 639-2: fre, eng, und, …
    is_default: bool = False  # True when the stream is flagged as the default track
    is_forced: bool = False  # True when the stream is flagged as forced
@dataclass
class MediaInfo:
    """
    File-level media metadata extracted by ffprobe.

    All fields are optional — ffprobe may not always report every value.
    """

    # Video
    width: int | None = None
    height: int | None = None
    video_codec: str | None = None  # h264, hevc, av1, …
    duration_seconds: float | None = None
    bitrate_kbps: int | None = None

    # Audio tracks (ordered by stream index)
    audio_tracks: list[AudioTrack] = field(default_factory=list)

    # Embedded subtitle tracks
    subtitle_tracks: list[SubtitleTrack] = field(default_factory=list)

    @property
    def resolution(self) -> str | None:
        """
        Best-effort resolution string: 2160p, 1080p, 720p, …

        A tier is granted as soon as EITHER dimension reaches it, so both
        cropped widescreen frames (1920×800 → 1080p) and narrow 4:3 or
        pillarboxed frames (1440×1080 → 1080p) land in the tier they belong to.

        Below the smallest tier the raw size is reported: "{height}p" when the
        height is usable, "{width}w" otherwise. Returns None when neither
        dimension is known.
        """
        width, height = self.width, self.height
        if width is None and height is None:
            return None

        # (label, minimum width, minimum height), highest tier first
        tiers = (
            ("2160p", 3840, 2160),
            ("1080p", 1920, 1080),
            ("720p", 1280, 720),
            ("576p", 720, 576),
            ("480p", 640, 480),
        )
        for label, min_width, min_height in tiers:
            wide_enough = width is not None and width >= min_width
            tall_enough = height is not None and height >= min_height
            if wide_enough or tall_enough:
                return label

        # Smaller than every tier: fall back to the raw dimension.
        if width is None or height:
            return f"{height}p"
        return f"{width}w"

    @property
    def audio_languages(self) -> list[str]:
        """Unique audio languages across all tracks (ISO 639-2), in first-seen order."""
        # dict.fromkeys de-duplicates while preserving insertion order.
        return list(
            dict.fromkeys(track.language for track in self.audio_tracks if track.language)
        )

    @property
    def is_multi_audio(self) -> bool:
        """True if more than one audio language is present."""
        return len(self.audio_languages) > 1
@@ -0,0 +1,98 @@
"""ffprobe — infrastructure adapter for extracting MediaInfo from a video file."""
from __future__ import annotations
import json
import logging
import subprocess
from pathlib import Path
from alfred.domain.shared.media_info import AudioTrack, MediaInfo, SubtitleTrack
logger = logging.getLogger(__name__)
# Fixed part of the ffprobe command line; the file to probe is appended by
# probe(). The output is requested as JSON with both the per-stream entries
# and the container-level "format" entry, which is what _parse() reads.
_FFPROBE_CMD = [
    "ffprobe",
    "-v", "quiet",  # silence ffprobe's own logging
    "-print_format", "json",
    "-show_streams",
    "-show_format",
]
def probe(path: Path) -> MediaInfo | None:
    """
    Run ffprobe on path and return a MediaInfo.

    Never raises for probing problems; each failure is logged as a warning and
    reported as None:
    - ffprobe cannot be started (not installed, not executable, …)
    - ffprobe does not finish within 30 seconds
    - ffprobe exits with a non-zero status
    - ffprobe's output is not valid JSON
    """
    try:
        result = subprocess.run(
            [*_FFPROBE_CMD, str(path)],
            capture_output=True,
            text=True,
            timeout=30,
        )
    except subprocess.TimeoutExpired:
        logger.warning("ffprobe timed out on %s", path)
        return None
    except OSError as exc:
        # Raised when the executable itself cannot be launched, e.g.
        # FileNotFoundError when ffprobe is not installed.
        logger.warning("ffprobe could not be executed for %s: %s", path, exc)
        return None

    if result.returncode != 0:
        logger.warning("ffprobe failed on %s: %s", path, result.stderr.strip())
        return None

    try:
        data = json.loads(result.stdout)
    except json.JSONDecodeError:
        logger.warning("ffprobe returned invalid JSON for %s", path)
        return None

    return _parse(data)
def _parse(data: dict) -> MediaInfo:
    """
    Convert ffprobe's JSON document into a MediaInfo.

    - duration and bit rate come from the "format" entry; values that cannot be
      converted to a number are ignored
    - video codec and dimensions come from the first real video stream
      (embedded cover art is skipped)
    - every audio and subtitle stream becomes one track, in stream order
    """
    streams = data.get("streams", [])
    fmt = data.get("format", {})

    info = MediaInfo()

    # Format-level
    if "duration" in fmt:
        try:
            info.duration_seconds = float(fmt["duration"])
        except (TypeError, ValueError):  # e.g. "N/A" or a null value
            pass

    if "bit_rate" in fmt:
        try:
            info.bitrate_kbps = int(fmt["bit_rate"]) // 1000
        except (TypeError, ValueError):
            pass

    for stream in streams:
        codec_type = stream.get("codec_type")
        disposition = stream.get("disposition", {})
        language = stream.get("tags", {}).get("language")

        if codec_type == "video":
            # Cover art is exposed by ffprobe as a video stream flagged
            # "attached_pic"; it must not be mistaken for the actual video.
            is_cover_art = disposition.get("attached_pic", 0) == 1
            if info.video_codec is None and not is_cover_art:
                info.video_codec = stream.get("codec_name")
                info.width = stream.get("width")
                info.height = stream.get("height")

        elif codec_type == "audio":
            info.audio_tracks.append(AudioTrack(
                # falls back to the running track count when ffprobe gives no index
                index=stream.get("index", len(info.audio_tracks)),
                codec=stream.get("codec_name"),
                channels=stream.get("channels"),
                channel_layout=stream.get("channel_layout"),
                language=language,
                is_default=disposition.get("default", 0) == 1,
            ))

        elif codec_type == "subtitle":
            info.subtitle_tracks.append(SubtitleTrack(
                index=stream.get("index", len(info.subtitle_tracks)),
                codec=stream.get("codec_name"),
                language=language,
                is_default=disposition.get("default", 0) == 1,
                is_forced=disposition.get("forced", 0) == 1,
            ))

    return info
@@ -0,0 +1,25 @@
"""find_video — locate the first video file in a release folder."""
from __future__ import annotations
from pathlib import Path
from alfred.domain.release.value_objects import _VIDEO_EXTENSIONS
def find_video_file(path: Path) -> Path | None:
    """
    Locate a video file at *path*.

    - *path* is a file: it is returned when its extension is a video
      extension, otherwise None.
    - otherwise *path* is searched recursively and the video file that comes
      first in sorted path order is returned (deterministic; S01E01 is picked
      before S01E02, etc.). None when no video file is found.
    """

    def _has_video_extension(entry: Path) -> bool:
        return entry.suffix.lower() in _VIDEO_EXTENSIONS

    if path.is_file():
        return path if _has_video_extension(path) else None

    ordered_entries = sorted(path.rglob("*"))
    return next(
        (entry for entry in ordered_entries if entry.is_file() and _has_video_extension(entry)),
        None,
    )
+43
View File
@@ -0,0 +1,43 @@
# Audio codec and channel tokens found in scene release names
#
# sequences: multi-token patterns matched left-to-right on consecutive tokens
# Order matters — longest/most specific first.
# codecs: single-token codec identifiers
# channels: single-token channel layout identifiers
sequences:
- tokens: [DTS, HD, MA]
codec: DTS-HD.MA
- tokens: [DTS, HD]
codec: DTS-HD
- tokens: [DTS, X]
codec: DTS-X
- tokens: [TrueHD, Atmos]
codec: TrueHD.Atmos
- tokens: [DD, Plus]
codec: DDP
- tokens: [DDP, Atmos]
codec: DDP.Atmos
- tokens: [EAC3, Atmos]
codec: EAC3.Atmos
codecs:
- DTS
- DDP # Dolby Digital Plus (alternate label)
- EAC3 # Dolby Digital Plus (codec name)
- AC3 # Dolby Digital
- DD # Dolby Digital (alternate label)
- TrueHD
- AAC
- FLAC
- OPUS
- MP3
- PCM
- LPCM
- ATMOS # sometimes appears standalone
channels:
- "7.1"
- "5.1"
- "2.0"
- "1.0"
+14
View File
@@ -0,0 +1,14 @@
# Known video codec tokens (case-insensitive match)
codecs:
- x264
- x265
- h264
- h265
- hevc
- avc
- xvid
- divx
- av1
- vp9
- h.264
- h.265
+28
View File
@@ -0,0 +1,28 @@
# Release edition and version tokens
# sequences: multi-token editions matched on consecutive tokens
# tokens: single-token edition identifiers
sequences:
- tokens: [DIRECTORS, CUT]
edition: DIRECTORS.CUT
- tokens: [EXTENDED, CUT]
edition: EXTENDED.CUT
- tokens: [THEATRICAL, CUT]
edition: THEATRICAL.CUT
tokens:
- UNRATED
- EXTENDED
- THEATRICAL
- REMASTERED
- PROPER # re-release fixing a technical flaw
- REPACK # re-release fixing packaging issue
- RERIP # re-ripped from source
- READNFO # see NFO for details
- LIMITED
- INTERNAL # group-internal release
- RETAIL
- COMPLETE
- INTEGRALE # French equivalent of COMPLETE (full series)
- COLLECTION # film pack/collection
@@ -0,0 +1,64 @@
# File extension classification for media type detection
#
# video — extensions that confirm a video media file
# non_video — extensions that definitively exclude video content (no metadata here)
# metadata — extensions always present alongside releases, ignored in type decision
video:
- .mkv
- .mp4
- .avi
- .mov
- .wmv
- .flv
- .m4v
- .ts
- .m2ts
- .vob
- .ogm
- .webm
- .divx
- .xvid
non_video:
# Disc images
- .iso
- .img
- .bin
- .cue
- .nrg
# Archives
- .rar
- .zip
- .7z
- .tar
- .gz
- .r00
- .r01
# Games / console ROMs
- .nsp
- .xci
- .pkg
- .xex
- .rpx
- .apk
# Executables / installers
- .exe
- .msi
- .dmg
- .deb
- .rpm
metadata:
# Release metadata — always ignored in type detection
- .nfo
- .txt
- .sfv
- .md5
- .jpg
- .png
- .srt
- .sub
- .idx
- .ass
- .ssa
+10
View File
@@ -0,0 +1,10 @@
# Characters forbidden in filenames on Windows (stripped from display names)
# "/" is included: it is reserved on Windows and is the path separator on
# POSIX systems, so a title such as "Face/Off" must not keep it.
win_forbidden_chars:
  - "?"
  - ":"
  - "*"
  - "\""
  - "<"
  - ">"
  - "|"
  - "\\"
  - "/"
+44
View File
@@ -0,0 +1,44 @@
# Audio/subtitle language tokens found in scene release names
# These are not always strictly scene-compliant — real-world torrent sites
# use additional tokens (VFF, VFQ, VF2, etc.) that are included here.
tokens:
# French variants
- FRENCH
- TRUEFRENCH
- VFF # Version Française Française (dubbed in France)
- VFQ # Version Française Québécoise (dubbed in Quebec)
- VF2 # Multi: VFF + VFQ
- VF # Version Française (generic)
- VOST # Version Originale Sous-Titrée
- VOSTFR # Version Originale Sous-Titrée Français
- VOSTSUB # Version Originale Sous-Titrée (alternate)
# Multi / dual
- MULTI # Multiple audio tracks (usually OV + local dub)
- DUAL # Two audio tracks
- BILINGUAL # Two audio tracks (alternate term)
# Original version
- VO # Version Originale
- VOF # Version Originale Française
# English
- ENG
- ENGLISH
# Other common languages
- SPA
- SPANISH
- GER
- GERMAN
- ITA
- ITALIAN
- POR
- PORTUGUESE
- JAP
- JAPANESE
- KOR
- KOREAN
- CHI
- CHINESE
@@ -0,0 +1,49 @@
# Scene release naming conventions
# Reference: standard warez scene naming rules
#
# A well-formed release name uses only the characters and structure defined here.
# Anything deviating from this is considered malformed and handed off to the AI.
# Characters allowed in a token (a-z, A-Z, 0-9)
token_chars: "[A-Za-z0-9]"
# Valid word separators (only one style per release — no mixing)
separators:
- "."
- "_"
# Dash is allowed only as a group separator at the end of a tech token: x265-GROUP
group_separator: "-"
# A release is malformed if it contains any of these
forbidden_chars:
- " " # spaces must be replaced by separator
- "["
- "]"
- "("
- ")"
- "{"
- "}"
- "@"
- "#"
- "!"
- "+"
- "="
- "~"
- "'"
- "%"
- "&"
- "$"
- "^"
- "`"
# Standard element order (informational — used by AI for context)
element_order:
- title
- year # optional for TV shows
- language # optional: FRENCH, MULTI, VOSTFR, TRUEFRENCH …
- season_episode # optional: S01E01, S01, …
- resolution # optional: 720p, 1080p, 2160p …
- source # optional: BDRip, WEB-DL, HDTV …
- codec # optional: x264, x265, XviD …
- group # after final dash: -NoGroup
@@ -0,0 +1,9 @@
# Known resolution/quality tokens (case-insensitive match)
resolutions:
- 2160p
- 1080p
- 720p
- 576p
- 480p
- 4k
- 8k
+39
View File
@@ -0,0 +1,39 @@
# c411.org site-specific release naming conventions
# Source: https://c411.org/wiki/nommage
#
# This file extends the base knowledge files with tokens and patterns
# specific to this tracker. Merged at runtime with the base knowledge.
languages:
- VFI # Version Française Internationale
- VOF # Version Originale Française
- FANSUB # Fan-subtitled release
sources:
- 4KLight # HDLight variant for 4K
- HDLight # Compressed BluRay (custom source)
- REMUX # Lossless remux from disc
- BDMV # Full Blu-ray disc structure
- UHD # UHD BluRay (used with BluRay)
hdr:
- HDR10PLUS # HDR10+ (alternate spelling without +)
editions:
tokens:
- IMAX
- UNCENSORED
- CUSTOM # custom color grading / encoding
- PROPER
- REPACK
# Site-specific media type tokens
media_type_tokens:
doc:
- DOC # Documentary marker
concert:
- CONCERT
collection:
- COLLECTION # Film pack/collection
integrale:
- INTEGRALE # Complete series (French term for COMPLETE)
+21
View File
@@ -0,0 +1,21 @@
# Known release source tokens (case-insensitive match)
sources:
- bluray
- blu-ray
- bdrip
- brrip
- webrip
- web-rip
- webdl
- web-dl
- web
- hdtv
- hdrip
- dvdrip
- dvd
- vodrip
- amzn
- nf
- dsnp
- hmax
- atvp
+29
View File
@@ -0,0 +1,29 @@
# Video encoding metadata tokens: bit depth, HDR formats
#
# sequences: multi-token HDR patterns, most specific first
# hdr: single-token HDR identifiers
# bit_depth: single-token bit depth identifiers
sequences:
- tokens: [DV, HDR10]
hdr: DV.HDR10
- tokens: [DV, HDR]
hdr: DV.HDR
- tokens: [HDR, HDR10Plus]
hdr: HDR10+
- tokens: [HDR10, Plus]
hdr: HDR10+
hdr:
- DV # Dolby Vision
- HDR10
- HDR10Plus
- HDR
- HLG # Hybrid Log-Gamma
bit_depth:
- 10bit
- 10Bit
- 8bit
- 8Bit
- 12bit
+229
View File
@@ -0,0 +1,229 @@
#!/usr/bin/env python3
"""
parse_release.py — Test ParsedRelease interactively or via CLI args.
Usage:
uv run testing/parse_release.py "Oz.S03.1080p.WEBRip.x265-KONTRAST"
uv run testing/parse_release.py "Oz.S03.1080p.WEBRip.x265-KONTRAST" --tmdb
uv run testing/parse_release.py "Inception.2010.1080p.BluRay.x265-GROUP" --tmdb-title "Inception" --tmdb-year 2010
uv run testing/parse_release.py --interactive
"""
import argparse
import sys
from pathlib import Path
# Make the project importable when this script is run directly: the directory
# one level above this file's folder is put at the front of sys.path (once).
_PROJECT_ROOT = Path(__file__).resolve().parents[1]
if str(_PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(_PROJECT_ROOT))
# ---------------------------------------------------------------------------
# Colours
# ---------------------------------------------------------------------------
# ANSI escape sequences used to style terminal output.
RESET = "\033[0m"
BOLD = "\033[1m"
DIM = "\033[2m"
GREEN = "\033[32m"
YELLOW = "\033[33m"
RED = "\033[31m"
CYAN = "\033[36m"
BLUE = "\033[34m"
# Global switch read by c(): when False, text is printed without any styling.
USE_COLOR = True
def c(text: str, *codes: str) -> str:
    """Wrap *text* in the given escape codes followed by RESET.

    With USE_COLOR off, the text is returned unstyled. *text* is always passed
    through str() first.
    """
    plain = str(text)
    return f"{''.join(codes)}{plain}{RESET}" if USE_COLOR else plain
def kv(key: str, val: str, color: str = CYAN) -> None:
    """Print one "key: value" line, the key in bold and the value in *color*."""
    label = c(key + ":", BOLD)
    value = c(val, color)
    print(f" {label} {value}")
def hr() -> None:
    """Print a dim horizontal rule, 64 characters wide."""
    # The rule character must be non-empty: repeating "" prints nothing at all.
    print(c("─" * 64, DIM))
# ---------------------------------------------------------------------------
# TMDB lookup
# ---------------------------------------------------------------------------
def _fetch_tmdb(title: str) -> tuple[str | None, int | None]:
    """
    Look *title* up through TMDBClient.search_media().

    Prints a one-line summary of the match and returns (canonical_title, year);
    the year is None when the match has no usable release date. Any failure is
    reported on the console and yields (None, None) — this helper never raises.
    """
    try:
        # Imported lazily so that an import problem is reported like any
        # other lookup failure.
        from alfred.infrastructure.api.tmdb import TMDBClient

        match = TMDBClient().search_media(title)

        year: int | None = None
        release_date = match.release_date
        if release_date:
            try:
                year = int(release_date[:4])
            except (ValueError, IndexError):
                pass

        summary = f" TMDB → {match.title} ({year}) [{match.media_type}] imdb={match.imdb_id}"
        print(c(summary, DIM))
        return match.title, year
    except Exception as e:
        print(c(f" TMDB lookup failed: {e}", YELLOW))
        return None, None
# ---------------------------------------------------------------------------
# Display
# ---------------------------------------------------------------------------
def _show(release_name: str, tmdb_title: str | None, tmdb_year: int | None,
          tmdb_episode_title: str | None, ext: str) -> None:
    """
    Parse *release_name* and print its fields, derived flags and generated names.

    When the TMDB title and year are not both supplied, _fetch_tmdb() is
    called with the parsed title (dots replaced by spaces) and fills in
    whichever of the two is missing; caller-supplied values are never
    overwritten.
    """
    from alfred.domain.release import parse_release

    p = parse_release(release_name)

    # Either value missing: query TMDB and keep what the caller already gave.
    if not tmdb_title or not tmdb_year:
        looked_up_title, looked_up_year = _fetch_tmdb(p.title.replace(".", " "))
        if not tmdb_title:
            tmdb_title = looked_up_title
        if not tmdb_year:
            tmdb_year = looked_up_year

    banner = c("" * 64, BOLD)
    absent = c("None", DIM)

    print()
    print(banner)
    print(c(f" ParsedRelease — {p.raw}", BOLD, CYAN))
    print(banner)

    # Core fields
    hr()
    kv("raw", p.raw)
    kv("normalised", p.normalised)
    kv("title", p.title)
    kv("year", str(p.year) if p.year else absent)
    # Numeric fields: 0 is a legitimate value, so only None counts as absent.
    for attr in ("season", "episode", "episode_end"):
        number = getattr(p, attr)
        kv(attr, absent if number is None else str(number))
    for attr in ("quality", "source", "codec"):
        kv(attr, getattr(p, attr) or absent)
    kv("group", p.group, YELLOW if p.group == "UNKNOWN" else GREEN)
    kv("tech_string", p.tech_string or c("(empty)", DIM))

    # Derived booleans
    hr()
    for attr in ("is_movie", "is_season_pack"):
        truth = getattr(p, attr)
        kv(attr, c(str(truth), GREEN if truth else DIM))

    # Generated names
    hr()
    display_title = tmdb_title or p.title.replace(".", " ")
    display_year = tmdb_year or p.year or 0
    if not p.is_movie:
        kv("show_folder_name", p.show_folder_name(display_title, display_year))
        kv("season_folder_name", p.season_folder_name())
        if p.is_season_pack:
            kv("episode_filename", c("(season pack — no episode filename)", DIM))
        else:
            kv("episode_filename", p.episode_filename(tmdb_episode_title, ext))
    else:
        kv("movie_folder_name", p.movie_folder_name(display_title, display_year))
        kv("movie_filename", p.movie_filename(display_title, display_year, ext))

    if tmdb_title or tmdb_year or tmdb_episode_title:
        hr()
        print(c(" TMDB data used:", DIM))
        if tmdb_title:
            kv(" tmdb_title", tmdb_title)
        if tmdb_year:
            kv(" tmdb_year", str(tmdb_year))
        if tmdb_episode_title:
            kv(" tmdb_episode_title", tmdb_episode_title)

    print(banner)
    print()
# ---------------------------------------------------------------------------
# Interactive mode
# ---------------------------------------------------------------------------
def _interactive() -> None:
    """
    Run a REPL that parses one release name per input line.

    A line may carry inline overrides after the release name, each introduced
    by ``::`` (``::title=Oz ::year=1997 ::ep=The.Routine ::ext=.mkv``). Only
    ``title``, ``year``, ``ep`` and ``ext`` are consumed here; other keys and
    flag-style overrides such as ``::tmdb`` are stored but not used.

    The loop ends on an empty line, on ``q`` / ``quit`` / ``exit``, or on
    EOF / Ctrl-C. A bad override or a parsing error is reported and the loop
    continues.
    """
    print(c("\n Alfred — Release Parser REPL", BOLD, CYAN))
    print(c(" Type a release name, or 'q' to quit.", DIM))
    print(c(" Inline overrides: ::title=Oz ::year=1997 ::ep=The.Routine ::ext=.mkv\n", DIM))

    while True:
        try:
            raw = input(c(" release> ", BOLD)).strip()
        except (EOFError, KeyboardInterrupt):
            print()
            break
        if not raw or raw.lower() in ("q", "quit", "exit"):
            break

        # Parse inline overrides: "Oz.S03E01... ::title=Oz ::year=1997 ::tmdb"
        release, *override_parts = raw.split("::")
        release = release.strip()
        overrides: dict[str, str] = {}
        for part in override_parts:
            part = part.strip()
            if "=" in part:
                k, _, v = part.partition("=")
                overrides[k.strip()] = v.strip()
            else:
                overrides[part] = "1"  # flag-style: ::tmdb

        # A non-numeric ::year must not take the whole REPL down.
        year_text = overrides.get("year")
        try:
            tmdb_year = int(year_text) if year_text is not None else None
        except ValueError:
            print(c(f" Error: invalid ::year value {year_text!r}", RED))
            continue

        tmdb_title = overrides.get("title")
        tmdb_episode_title = overrides.get("ep")
        ext = overrides.get("ext", ".mkv")

        try:
            _show(release, tmdb_title, tmdb_year, tmdb_episode_title, ext)
        except Exception as e:
            print(c(f" Error: {e}", RED))
# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------
def main() -> None:
    """
    Command-line entry point: parse one release name or start the REPL.

    Colour output is switched off when ``--no-color`` is given or stdout is
    not a TTY. Exits with status 1 when no release name is supplied (after
    printing the help) or when displaying the release raises.
    """
    global USE_COLOR

    parser = argparse.ArgumentParser(
        description="Test ParsedRelease from domain/release/release_parser.py",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("release", nargs="?", help="Release name to parse")
    parser.add_argument("-i", "--interactive", action="store_true",
                        help="Interactive REPL mode")
    # Declared explicitly: otherwise argparse reads "--tmdb" (shown in the
    # module usage text) as an ambiguous abbreviation of --tmdb-title /
    # --tmdb-year and aborts before anything is parsed.
    parser.add_argument("--tmdb", action="store_true",
                        help="Accepted for the documented usage; the TMDB lookup "
                             "already runs whenever title/year are not both given")
    parser.add_argument("--tmdb-title", metavar="TITLE",
                        help="Override TMDB title for name generation")
    parser.add_argument("--tmdb-year", metavar="YEAR", type=int,
                        help="Override TMDB year for name generation")
    parser.add_argument("--episode-title", metavar="TITLE",
                        help="TMDB episode title for episode_filename()")
    parser.add_argument("--ext", default=".mkv", metavar="EXT",
                        help="File extension for filename generation (default: .mkv)")
    parser.add_argument("--no-color", action="store_true")
    args = parser.parse_args()

    if args.no_color or not sys.stdout.isatty():
        USE_COLOR = False

    if args.interactive:
        _interactive()
        return

    if not args.release:
        parser.print_help()
        sys.exit(1)

    try:
        _show(args.release, args.tmdb_title, args.tmdb_year, args.episode_title, args.ext)
    except Exception as e:
        print(c(f"Error: {e}", RED), file=sys.stderr)
        sys.exit(1)
if __name__ == "__main__":
main()
+160
View File
@@ -0,0 +1,160 @@
#!/usr/bin/env python3
"""
probe_video.py — Display MediaInfo extracted by ffprobe for a video file.
Usage:
uv run testing/probe_video.py /path/to/video.mkv
uv run testing/probe_video.py /path/to/video.mkv --no-color
"""
import argparse
import sys
from pathlib import Path
_PROJECT_ROOT = Path(__file__).resolve().parents[1]
if str(_PROJECT_ROOT) not in sys.path:
sys.path.insert(0, str(_PROJECT_ROOT))
# ---------------------------------------------------------------------------
# Colours
# ---------------------------------------------------------------------------
RESET = "\033[0m"
BOLD = "\033[1m"
DIM = "\033[2m"
GREEN = "\033[32m"
YELLOW = "\033[33m"
RED = "\033[31m"
CYAN = "\033[36m"
BLUE = "\033[34m"
USE_COLOR = True
def c(text: str, *codes: str) -> str:
    """
    Colourise *text* with the given ANSI escape codes.

    Returns the plain string form of *text* when the module-level USE_COLOR
    switch is off; otherwise the codes are emitted first and RESET last.
    """
    if USE_COLOR:
        prefix = "".join(codes)
        return prefix + str(text) + RESET
    return str(text)
def kv(key: str, val: str, indent: int = 4, color: str = CYAN) -> None:
    """Print a ``key: value`` line indented by *indent* spaces, key in bold, value in *color*."""
    pad = ' ' * indent
    label = c(key + ':', BOLD)
    value = c(val, color)
    print(f"{pad}{label} {value}")
def section(title: str) -> None:
    """Print a blank line followed by *title* as a bold blue section heading."""
    heading = c("" + title, BOLD, BLUE)
    print()
    print(f" {heading}")
def hr() -> None:
    """Print a dim horizontal rule, 70 characters wide."""
    # An empty string repeated 70 times is still empty, so the rule needs a
    # real glyph to be visible.
    print(c("─" * 70, DIM))
# ---------------------------------------------------------------------------
# Formatting helpers
# ---------------------------------------------------------------------------
def fmt_duration(seconds: float) -> str:
    """
    Render a duration in seconds as ``"Hh MMm SSs"``, or ``"Mm SSs"`` under an hour.

    Fractional seconds are truncated, not rounded.
    """
    hours = int(seconds // 3600)
    minutes = int((seconds % 3600) // 60)
    tail = f"{int(seconds % 60):02d}s"
    if not hours:
        # Minutes are only zero-padded when an hour component precedes them.
        return f"{minutes}m {tail}"
    return f"{hours}h {minutes:02d}m {tail}"
def fmt_channels(channels: int | None, layout: str | None) -> str:
parts = []
if channels is not None:
parts.append(str(channels) + "ch")
if layout:
parts.append(f"({layout})")
return " ".join(parts) if parts else ""
def flag(val: bool) -> str:
    """Render a boolean as a green ``yes`` or a dim ``no``."""
    if val:
        return c("yes", GREEN)
    return c("no", DIM)
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def main() -> None:
    """
    Command-line entry point: probe one video file and print its MediaInfo.

    Prints the video stream summary, every audio track, every embedded
    subtitle track and a closing multi-audio / languages line. Exits with
    status 1 when the path does not exist or when probe() returns None.
    Colour output is switched off with ``--no-color`` or when stdout is not
    a TTY.
    """
    global USE_COLOR

    cli = argparse.ArgumentParser(description="Probe a video file with ffprobe")
    cli.add_argument("file", help="Path to the video file")
    cli.add_argument("--no-color", action="store_true")
    opts = cli.parse_args()

    if opts.no_color or not sys.stdout.isatty():
        USE_COLOR = False

    path = Path(opts.file)
    if not path.exists():
        print(c(f"Error: {path} does not exist", RED), file=sys.stderr)
        sys.exit(1)

    from alfred.infrastructure.filesystem.ffprobe import probe

    info = probe(path)
    if info is None:
        print(c("Error: ffprobe failed to probe the file", RED), file=sys.stderr)
        sys.exit(1)

    banner = c("" * 70, BOLD)
    print()
    print(banner)
    print(c(f" {path.name}", BOLD, CYAN))
    print(c(f" {path}", DIM))
    print(banner)

    # --- Video ---
    section("Video")
    kv("codec", info.video_codec or c("", DIM))
    kv("resolution", info.resolution or c("", DIM))
    if info.width and info.height:
        kv("dimensions", f"{info.width} × {info.height}")
    if info.duration_seconds is not None:
        kv("duration", fmt_duration(info.duration_seconds))
    if info.bitrate_kbps is not None:
        kv("bitrate", f"{info.bitrate_kbps} kbps")

    # --- Audio ---
    audio_tracks = info.audio_tracks
    audio_count = c(str(len(audio_tracks)) + ' track(s)', DIM)
    section(f"Audio {audio_count}")
    if not audio_tracks:
        print(f" {c('no audio tracks found', DIM)}")
    for track in audio_tracks:
        index_label = c(f'[{track.index}]', BOLD)
        lang_label = c(track.language or "und", YELLOW)
        default_marker = f" {c('default', GREEN, DIM)}" if track.is_default else ""
        print(f" {index_label} {lang_label}{default_marker}")
        kv("codec", track.codec or c("", DIM), indent=8)
        kv("channels", fmt_channels(track.channels, track.channel_layout), indent=8)

    # --- Subtitles ---
    subtitle_tracks = info.subtitle_tracks
    subtitle_count = c(str(len(subtitle_tracks)) + ' track(s)', DIM)
    section(f"Subtitles {subtitle_count}")
    if not subtitle_tracks:
        print(f" {c('no embedded subtitle tracks', DIM)}")
    for track in subtitle_tracks:
        badges = []
        if track.is_default:
            badges.append(c("default", GREEN, DIM))
        if track.is_forced:
            badges.append(c("forced", YELLOW, DIM))
        # Each badge is preceded by one space, so no badges means no suffix.
        badge_suffix = "".join(" " + badge for badge in badges)
        index_label = c(f'[{track.index}]', BOLD)
        lang_label = c(track.language or "und", YELLOW)
        print(f" {index_label} {lang_label}{badge_suffix}")
        kv("codec", track.codec or c("", DIM), indent=8)

    # --- Summary ---
    print()
    hr()
    multi = flag(info.is_multi_audio)
    if info.audio_languages:
        langs = ", ".join(info.audio_languages)
    else:
        langs = c("", DIM)
    print(f" {c('multi-audio:', BOLD)} {multi} {c('languages:', BOLD)} {c(langs, CYAN)}")
    hr()
    print()
if __name__ == "__main__":
main()
+203
View File
@@ -0,0 +1,203 @@
#!/usr/bin/env python3
"""
recognize_folders_in_downloads.py — Parse every folder/file in the downloads directory.
Usage:
uv run testing/recognize_folders_in_downloads.py
uv run testing/recognize_folders_in_downloads.py --path /mnt/testipool/downloads
uv run testing/recognize_folders_in_downloads.py --failures-only
uv run testing/recognize_folders_in_downloads.py --successes-only
"""
import argparse
import sys
from pathlib import Path
_PROJECT_ROOT = Path(__file__).resolve().parents[1]
if str(_PROJECT_ROOT) not in sys.path:
sys.path.insert(0, str(_PROJECT_ROOT))
# ---------------------------------------------------------------------------
# Colours
# ---------------------------------------------------------------------------
RESET = "\033[0m"
BOLD = "\033[1m"
DIM = "\033[2m"
GREEN = "\033[32m"
YELLOW = "\033[33m"
RED = "\033[31m"
CYAN = "\033[36m"
USE_COLOR = True
def c(text: str, *codes: str) -> str:
    """
    Colourise *text* with the given ANSI escape codes.

    Returns the plain string form of *text* when the module-level USE_COLOR
    switch is off; otherwise the codes are emitted first and RESET last.
    """
    if USE_COLOR:
        prefix = "".join(codes)
        return prefix + str(text) + RESET
    return str(text)
def kv(key: str, val: str, indent: int = 4, color: str = CYAN) -> None:
    """Print a ``key: value`` line indented by *indent* spaces, key in bold, value in *color*."""
    pad = ' ' * indent
    label = c(key + ':', BOLD)
    value = c(val, color)
    print(f"{pad}{label} {value}")
def hr() -> None:
    """Print a dim horizontal rule, 70 characters wide."""
    # An empty string repeated 70 times is still empty, so the rule needs a
    # real glyph to be visible.
    print(c("─" * 70, DIM))
# ---------------------------------------------------------------------------
# Parsing quality check
# ---------------------------------------------------------------------------
def _assess(p) -> list[str]:
    """
    Return warning strings for parsed fields that look wrong.

    Releases whose media type is "other" or "unknown" are never assessed and
    always yield an empty list. Otherwise one warning is produced, in a fixed
    order, for each of: unknown group, missing quality, missing codec, and a
    title that is empty or identical to the normalised name.
    """
    if p.media_type in ("other", "unknown"):
        return []

    checks = (
        (p.group == "UNKNOWN", "group not found"),
        (not p.quality, "resolution not found"),
        (not p.codec, "codec not found"),
        (not p.title or p.title == p.normalised, "title extraction likely wrong"),
    )
    return [message for failed, message in checks if failed]
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def main() -> None:
    """
    Command-line entry point: run the recognition pipeline on a downloads directory.

    For every entry, sorted case-insensitively by name, this parses the name,
    refines the media type with detect_media_type(), and, unless the type is
    "unknown" or "other", probes the first video file found and enriches the
    parsed release from it. Each entry is then printed with its warnings, and
    a summary line closes the report.

    ``--failures-only`` / ``--successes-only`` restrict which entries are
    printed; filtered entries are counted separately in the summary. Exits
    with status 1 when the path does not exist or is not a directory.
    """
    global USE_COLOR

    parser = argparse.ArgumentParser(description="Recognize release folders in downloads")
    parser.add_argument("--path", default="/mnt/testipool/downloads",
                        help="Downloads directory (default: /mnt/testipool/downloads)")
    parser.add_argument("--failures-only", action="store_true",
                        help="Show only entries with warnings")
    parser.add_argument("--successes-only", action="store_true",
                        help="Show only fully parsed entries")
    parser.add_argument("--no-color", action="store_true")
    args = parser.parse_args()

    if args.no_color or not sys.stdout.isatty():
        USE_COLOR = False

    downloads = Path(args.path)
    if not downloads.exists():
        print(c(f"Error: {downloads} does not exist", RED), file=sys.stderr)
        sys.exit(1)
    # iterdir() raises NotADirectoryError on a regular file; report it cleanly.
    if not downloads.is_dir():
        print(c(f"Error: {downloads} is not a directory", RED), file=sys.stderr)
        sys.exit(1)

    from alfred.domain.release.services import parse_release
    from alfred.application.filesystem.detect_media_type import detect_media_type
    from alfred.application.filesystem.enrich_from_probe import enrich_from_probe
    from alfred.infrastructure.filesystem.find_video import find_video_file
    from alfred.infrastructure.filesystem.ffprobe import probe

    entries = sorted(downloads.iterdir(), key=lambda p: p.name.lower())
    total = len(entries)
    ok_count = 0
    warn_count = 0

    print()
    print(c("" * 70, BOLD))
    print(c(f" Downloads — {downloads}", BOLD, CYAN))
    print(c(f" {total} entries", DIM))
    print(c("" * 70, BOLD))

    for entry in entries:
        name = entry.name
        try:
            p = parse_release(name)
            p.media_type = detect_media_type(p, entry)
            if p.media_type not in ("unknown", "other"):
                video_file = find_video_file(entry)
                if video_file:
                    media_info = probe(video_file)
                    if media_info:
                        enrich_from_probe(p, media_info)
            warnings = _assess(p)
        except Exception as e:
            # Any pipeline failure is reported as a warning on this entry so
            # that one bad folder does not abort the whole scan.
            warnings = [f"parse error: {e}"]
            p = None

        has_warnings = bool(warnings)
        if args.failures_only and not has_warnings:
            continue
        if args.successes_only and has_warnings:
            continue

        print()
        path_label = ""
        if p:
            path_label = {
                "direct": c("direct", GREEN, DIM),
                "sanitized": c("sanitized", YELLOW),
                "ai": c("ai", RED),
            }.get(p.parse_path, p.parse_path)

        # Counters only cover entries that are actually displayed; the
        # remainder is reported as "filtered" in the summary.
        if has_warnings:
            warn_count += 1
            print(f" {c('', YELLOW, BOLD)} {c(name, YELLOW)} {path_label}")
        else:
            ok_count += 1
            print(f" {c('', GREEN, BOLD)} {c(name, BOLD)} {path_label}")

        if p:
            kind = {
                "movie": "movie",
                "tv_show": "season pack" if p.is_season_pack else "episode",
                "tv_complete": c("tv complete", CYAN),
                "documentary": c("documentary", CYAN),
                "concert": c("concert", CYAN),
                "other": c("other", RED),
                "unknown": c("unknown", YELLOW),
            }.get(p.media_type, p.media_type)
            kv("type", kind)
            kv("title", p.title)
            if p.season is not None:
                ep = f"E{p.episode:02d}" if p.episode is not None else ""
                kv("season/ep", f"S{p.season:02d} / {ep}")
            if p.year:
                kv("year", str(p.year))
            if p.languages:
                kv("langs", " ".join(p.languages))
            kv("quality", p.quality or c("", DIM))
            kv("source", p.source or c("", DIM))
            kv("codec", p.codec or c("", DIM))
            if p.audio_codec:
                ch = f" {p.audio_channels}" if p.audio_channels else ""
                kv("audio", f"{p.audio_codec}{ch}")
            if p.bit_depth or p.hdr_format:
                hdr_parts = [x for x in [p.bit_depth, p.hdr_format] if x]
                kv("hdr/depth", " ".join(hdr_parts))
            if p.edition:
                kv("edition", p.edition, color=YELLOW)
            kv("group", p.group,
               color=YELLOW if p.group == "UNKNOWN" else GREEN)
            if p.site_tag:
                kv("site tag", p.site_tag, color=YELLOW)

        for w in warnings:
            print(f" {c('' + w, YELLOW)}")

    # Summary
    print()
    hr()
    skipped = total - ok_count - warn_count
    print(f" {c('Total:', BOLD)} {total} "
          f"{c(str(ok_count) + ' ok', GREEN, BOLD)} "
          f"{c(str(warn_count) + ' warnings', YELLOW, BOLD)}"
          + (f" {c(str(skipped) + ' filtered', DIM)}" if skipped else ""))
    hr()
    print()
if __name__ == "__main__":
main()
+89 -22
View File
@@ -79,22 +79,65 @@ def kv(key: str, val: str) -> None:
# Dry-run tool stubs # Dry-run tool stubs
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
def _dry_list_folder(folder_type: str, path: str = ".") -> dict[str, Any]: def _real_list_folder(folder_type: str, path: str = ".") -> dict[str, Any]:
"""Call the real list_folder (read-only, safe in dry-run)."""
# TODO: remove hardcoded fallback once download path is configured in LTM
_HARDCODED_DOWNLOAD_ROOT = "/mnt/testipool/downloads"
try:
from alfred.infrastructure.persistence import get_memory, init_memory
try:
get_memory()
except Exception:
init_memory()
from alfred.agent.tools.filesystem import list_folder
result = list_folder(folder_type=folder_type, path=path)
if result.get("status") == "error" and folder_type == "download":
raise RuntimeError(result.get("message", "not configured"))
return result
except Exception as e:
if folder_type == "download":
warn(f"list_folder: {e} — using hardcoded download root: {_HARDCODED_DOWNLOAD_ROOT}")
import os
resolved = os.path.join(_HARDCODED_DOWNLOAD_ROOT, path) if path != "." else _HARDCODED_DOWNLOAD_ROOT
try:
entries = sorted(os.listdir(resolved))
except OSError as oe:
return {"status": "error", "error": "os_error", "message": str(oe)}
return {
"status": "ok",
"folder_type": folder_type,
"path": resolved,
"entries": entries,
"count": len(entries),
}
warn(f"list_folder: filesystem unavailable ({e}), falling back to stub")
return { return {
"status": "ok", "status": "ok",
"folder_type": folder_type, "folder_type": folder_type,
"path": path, "path": path,
"entries": ["[dry-run — no real listing]"], "entries": ["[stub — filesystem unavailable]"],
"count": 1, "count": 1,
} }
def _dry_find_media_imdb_id(**kwargs) -> dict[str, Any]: def _real_find_media_imdb_id(media_title: str, **kwargs) -> dict[str, Any]:
"""Call the real TMDB API even in dry-run (read-only, no filesystem side effects)."""
try:
from alfred.infrastructure.persistence import get_memory, init_memory
try:
get_memory()
except Exception:
init_memory()
from alfred.agent.tools.api import find_media_imdb_id
return find_media_imdb_id(media_title=media_title)
except Exception as e:
warn(f"find_media_imdb_id: TMDB unavailable ({e}), falling back to stub")
return { return {
"status": "ok", "status": "ok",
"imdb_id": kwargs.get("imdb_id") or "tt0000000", "imdb_id": "tt0000000",
"title": "Dry Run Show", "title": media_title,
"type": "tv_show", "media_type": "tv_show",
"year": 2024, "year": 2024,
} }
@@ -107,7 +150,7 @@ def _dry_resolve_destination(
tmdb_episode_title: str | None = None, tmdb_episode_title: str | None = None,
confirmed_folder: str | None = None, confirmed_folder: str | None = None,
) -> dict[str, Any]: ) -> dict[str, Any]:
from alfred.domain.media.release_parser import parse_release from alfred.domain.release import parse_release
parsed = parse_release(release_name) parsed = parse_release(release_name)
ext = Path(source_file).suffix ext = Path(source_file).suffix
if parsed.is_movie: if parsed.is_movie:
@@ -170,8 +213,8 @@ def _dry_create_seed_links(library_file: str, original_download_folder: str) ->
DRY_RUN_TOOLS: dict[str, Any] = { DRY_RUN_TOOLS: dict[str, Any] = {
"list_folder": _dry_list_folder, "list_folder": _real_list_folder,
"find_media_imdb_id": _dry_find_media_imdb_id, "find_media_imdb_id": _real_find_media_imdb_id,
"resolve_destination": _dry_resolve_destination, "resolve_destination": _dry_resolve_destination,
"move_media": _dry_move_media, "move_media": _dry_move_media,
"manage_subtitles": _dry_manage_subtitles, "manage_subtitles": _dry_manage_subtitles,
@@ -316,10 +359,22 @@ class WorkflowRunner:
self.step_results.append({"id": step_id, "result": {"status": "error", "error": str(e)}}) self.step_results.append({"id": step_id, "result": {"status": "error", "error": str(e)}})
return return
self._print_result(result) self._print_result(result, tool_name=tool_name)
self.context[step_id] = result self.context[step_id] = result
self.step_results.append({"id": step_id, "result": result}) self.step_results.append({"id": step_id, "result": result})
# After list_downloads: confirm the requested media folder exists in downloads
if tool_name == "list_folder" and result.get("status") == "ok" and self.args.source:
folder_path = result.get("path", "")
entries = result.get("entries", [])
if self.args.source in entries:
media_folder = str(Path(folder_path) / self.args.source)
self.context["media_folder"] = media_folder
print()
print(f" {c('Dossier media trouvé:', BOLD, GREEN)} {c(media_folder, CYAN, BOLD)}")
else:
warn(f"Dossier '{self.args.source}' introuvable dans {folder_path}")
def _build_kwargs(self, tool_name: str, step: dict) -> dict[str, Any]: def _build_kwargs(self, tool_name: str, step: dict) -> dict[str, Any]:
"""Build tool kwargs from step params + CLI args + previous context.""" """Build tool kwargs from step params + CLI args + previous context."""
# Start from step-level params (static defaults from YAML) # Start from step-level params (static defaults from YAML)
@@ -335,12 +390,13 @@ class WorkflowRunner:
kwargs["imdb_id"] = a.imdb_id kwargs["imdb_id"] = a.imdb_id
elif tool_name == "resolve_destination": elif tool_name == "resolve_destination":
media_folder = self.context.get("media_folder")
if a.release: if a.release:
kwargs["release_name"] = a.release kwargs["release_name"] = a.release
elif a.source: elif a.source:
kwargs.setdefault("release_name", Path(a.source).parent.name) kwargs.setdefault("release_name", a.source)
if a.source: if media_folder:
kwargs["source_file"] = a.source kwargs["source_file"] = media_folder
if a.tmdb_title: if a.tmdb_title:
kwargs["tmdb_title"] = a.tmdb_title kwargs["tmdb_title"] = a.tmdb_title
if a.tmdb_year: if a.tmdb_year:
@@ -351,16 +407,18 @@ class WorkflowRunner:
elif tool_name == "move_media": elif tool_name == "move_media":
# If resolve_destination ran, use its library_file as destination # If resolve_destination ran, use its library_file as destination
resolved = self.context.get("resolve_destination", {}) resolved = self.context.get("resolve_destination", {})
if a.source: media_folder = self.context.get("media_folder")
kwargs["source"] = a.source if media_folder:
kwargs["source"] = media_folder
dest = a.dest or resolved.get("library_file") dest = a.dest or resolved.get("library_file")
if dest: if dest:
kwargs["destination"] = dest kwargs["destination"] = dest
elif tool_name == "manage_subtitles": elif tool_name == "manage_subtitles":
resolved = self.context.get("resolve_destination", {}) resolved = self.context.get("resolve_destination", {})
if a.source: media_folder = self.context.get("media_folder")
kwargs["source_video"] = a.source if media_folder:
kwargs["source_video"] = media_folder
dest = a.dest or resolved.get("library_file") dest = a.dest or resolved.get("library_file")
if dest: if dest:
kwargs["destination_video"] = dest kwargs["destination_video"] = dest
@@ -372,12 +430,16 @@ class WorkflowRunner:
kwargs["library_file"] = library_file kwargs["library_file"] = library_file
if a.download_folder: if a.download_folder:
kwargs["original_download_folder"] = a.download_folder kwargs["original_download_folder"] = a.download_folder
elif a.source: else:
kwargs.setdefault("original_download_folder", str(Path(a.source).parent)) # Use the resolved folder path from list_downloads context
list_result = self.context.get("list_downloads", {})
folder_path = list_result.get("path")
if folder_path:
kwargs.setdefault("original_download_folder", folder_path)
return kwargs return kwargs
def _print_result(self, result: dict) -> None: def _print_result(self, result: dict, tool_name: str = "") -> None:
status = result.get("status", "?") status = result.get("status", "?")
if status == "ok": if status == "ok":
ok(f"status={c('ok', GREEN)}") ok(f"status={c('ok', GREEN)}")
@@ -387,6 +449,11 @@ class WorkflowRunner:
err(f"status={c(status, RED)} error={result.get('error')} msg={result.get('message')}") err(f"status={c(status, RED)} error={result.get('error')} msg={result.get('message')}")
return return
# Highlight resolved folder path for list_folder
if tool_name == "list_folder" and result.get("path"):
print()
print(f" {c('Dossier résolu:', BOLD, GREEN)} {c(result['path'], CYAN, BOLD)}")
# Pretty-print notable fields # Pretty-print notable fields
skip = {"status", "error", "message"} skip = {"status", "error", "message"}
for k, v in result.items(): for k, v in result.items():
@@ -420,8 +487,8 @@ def parse_args() -> argparse.Namespace:
help="Simulate steps without executing tools (default)") help="Simulate steps without executing tools (default)")
parser.add_argument("--live", action="store_true", parser.add_argument("--live", action="store_true",
help="Actually execute tools against the real filesystem") help="Actually execute tools against the real filesystem")
parser.add_argument("--source", metavar="PATH", parser.add_argument("--source", metavar="FOLDER_NAME",
help="Source video file (in download folder)") help="Release folder name inside the download root (e.g. Oz.S03.1080p.WEBRip.x265-KONTRAST)")
parser.add_argument("--dest", metavar="PATH", parser.add_argument("--dest", metavar="PATH",
help="Destination video file (in library, overrides resolve_destination)") help="Destination video file (in library, overrides resolve_destination)")
parser.add_argument("--download-folder", metavar="PATH", parser.add_argument("--download-folder", metavar="PATH",
+4 -8
View File
@@ -1,5 +1,5 @@
""" """
Tests for alfred.domain.media.release_parser Tests for alfred.domain.release.release_parser
Real-data cases sourced from /mnt/testipool/downloads/. Real-data cases sourced from /mnt/testipool/downloads/.
Covers: parsing, normalisation, naming methods, edge cases. Covers: parsing, normalisation, naming methods, edge cases.
@@ -7,13 +7,9 @@ Covers: parsing, normalisation, naming methods, edge cases.
import pytest import pytest
from alfred.domain.media.release_parser import ( from alfred.domain.release import ParsedRelease, parse_release
ParsedRelease, from alfred.domain.release.services import _normalise
_normalise, from alfred.domain.release.value_objects import _sanitise_for_fs, _strip_episode_from_normalised
_sanitise_for_fs,
_strip_episode_from_normalised,
parse_release,
)
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------