diff --git a/CHANGELOG.md b/CHANGELOG.md index 77328d6..eb5ae81 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,37 @@ callers). ## [Unreleased] +### Fixed + +- **Multi-episode chain (e.g. `S14E09E10E11`) now collapses to a full + range.** The parser previously captured `episode=9, episode_end=10` + and dropped E11+. It now returns `episode=first, episode_end=last`, + with intermediate values implied. Fixture + `shitty/archer_multi_episode/` updated from anti-regression-of-bug + to anti-regression-of-fix. +- **Apostrophes in titles no longer push the release through the AI + fallback.** `Honey.Don't.2025.2160p.WEBRip.DSNP.DV.HDR.x265-Amen` + previously parsed with `parse_path="ai"` and everything UNKNOWN + because `'` is in the forbidden-chars list. Apostrophes are now + pre-stripped before the well-formed check, so the parse completes + normally (`title=Honey.Dont, year=2025, quality=2160p, ...`); only + the title text loses its apostrophe. `parse_path` becomes + `sanitized` to surface the cleanup. Side win: PoP fixture + `the_prodigy_full_chaos/` also moves from total failure to a + partially-correct parse (year, source, codec extracted). +- **Season-range markers (`Sxx-yy`) are now recognized as + `tv_complete`.** `Der.Tatortreiniger.S01-06.GERMAN...` previously + parsed as `media_type=movie` with `S01-06` glued onto the title. + The parser now recognizes the range, sets `season=first`, + `media_type=tv_complete`, and removes the marker from the title. + `is_season_pack` flips to `true`. +- **Pure-punctuation TITLE tokens are dropped at assembly.** Releases + with surrounding ` - ` separators (`Vinyl - 1x01 - FHD`) previously + produced `title="Vinyl.-"`. Such tokens (a stray dash, a wide pipe + `｜`, …) carry no title content and are now filtered out. Side + effect: PoP fixture `khruangbin_yt_wide_pipe/` also benefits — the + YouTube wide-pipe no longer leaks into the title. + ### Added - **`LanguageRepository` port** in `alfred.domain.shared.ports`. 
Structural diff --git a/alfred/domain/release/parser/pipeline.py b/alfred/domain/release/parser/pipeline.py index 68f8b55..25856aa 100644 --- a/alfred/domain/release/parser/pipeline.py +++ b/alfred/domain/release/parser/pipeline.py @@ -91,14 +91,17 @@ def tokenize(name: str, kb: ReleaseKnowledge) -> tuple[list[Token], str | None]: def _parse_season_episode(text: str) -> tuple[int, int | None, int | None] | None: - """Parse a single token as ``SxxExx`` / ``SxxExxExx`` / ``Sxx`` / ``NxNN``. + """Parse a single token as ``SxxExx`` / ``SxxExxExx`` / ``Sxx`` / + ``Sxx-yy`` (season range) / ``NxNN``. Returns ``(season, episode, episode_end)`` or ``None`` if the token - is not a season/episode marker. + is not a season/episode marker. For ``Sxx-yy``, returns the first + season with no episode info — the caller is expected to detect the + range form and promote ``media_type`` to ``tv_complete`` separately. """ upper = text.upper() - # SxxExx form + # SxxExx form (and Sxx, Sxx-yy) if len(upper) >= 3 and upper[0] == "S" and upper[1:3].isdigit(): season = int(upper[1:3]) rest = upper[3:] @@ -106,6 +109,15 @@ def _parse_season_episode(text: str) -> tuple[int, int | None, int | None] | Non if not rest: return season, None, None + # Sxx-yy season-range form: capture the first season, treat as a + # complete-series marker (no episode info). + if ( + len(rest) == 3 + and rest[0] == "-" + and rest[1:3].isdigit() + ): + return season, None, None + episodes: list[int] = [] while rest.startswith("E") and len(rest) >= 3 and rest[1:3].isdigit(): episodes.append(int(rest[1:3])) @@ -113,7 +125,9 @@ def _parse_season_episode(text: str) -> tuple[int, int | None, int | None] | Non if not episodes: return None - return season, episodes[0], episodes[1] if len(episodes) >= 2 else None + # For chained multi-episode markers (E09E10E11), the range is the + # first → last episode. Intermediate values are implied. 
+ return season, episodes[0], episodes[-1] if len(episodes) >= 2 else None # NxNN form if "X" in upper: @@ -616,7 +630,14 @@ def assemble( layer in additional fields (``parse_path``, ``raw``, …) before instantiation. """ - title_parts = [t.text for t in annotated if t.role is TokenRole.TITLE] + # Pure-punctuation tokens (e.g. a stray "-" left by ` - ` separators in + # human-friendly release names) carry no title content and would leak + # into the joined title as ``"Show.-.Episode"``. Drop them here. + title_parts = [ + t.text + for t in annotated + if t.role is TokenRole.TITLE and any(c.isalnum() for c in t.text) + ] title = ".".join(title_parts) if title_parts else ( annotated[0].text if annotated else raw_name ) @@ -636,6 +657,7 @@ def assemble( edition: str | None = None distributor: str | None = None languages: list[str] = [] + is_season_range = False for tok in annotated: # Skip non-primary members of a multi-token sequence. @@ -649,6 +671,16 @@ def assemble( parsed = _parse_season_episode(tok.text) if parsed is not None: season, episode, episode_end = parsed + # Detect Sxx-yy range form to flag it as a multi-season pack. 
+ upper = tok.text.upper() + if ( + len(upper) == 6 + and upper[0] == "S" + and upper[1:3].isdigit() + and upper[3] == "-" + and upper[4:6].isdigit() + ): + is_season_range = True elif role is TokenRole.RESOLUTION: quality = tok.text elif role is TokenRole.SOURCE: @@ -696,6 +728,8 @@ def assemble( media_type = "documentary" elif upper_tokens & concert_tokens: media_type = "concert" + elif is_season_range: + media_type = "tv_complete" elif ( edition in {"COMPLETE", "INTEGRALE", "COLLECTION"} or upper_tokens & integrale_tokens diff --git a/alfred/domain/release/services.py b/alfred/domain/release/services.py index d8ba8e3..a13b989 100644 --- a/alfred/domain/release/services.py +++ b/alfred/domain/release/services.py @@ -46,7 +46,16 @@ def parse_release( """ parse_path = ParsePath.DIRECT.value - clean, site_tag = _v2.strip_site_tag(name) + # Apostrophes inside titles ("Don't", "L'avare") are common and should + # not push the release through the AI fallback. Strip them up front so + # both strip_site_tag and tokenize see "Dont" / "Lavare", which is good + # enough for token-level matching. The raw name is preserved on the VO. 
+ working_name = name + if "'" in working_name: + working_name = working_name.replace("'", "") + parse_path = ParsePath.SANITIZED.value + + clean, site_tag = _v2.strip_site_tag(working_name) if site_tag is not None: parse_path = ParsePath.SANITIZED.value @@ -77,7 +86,7 @@ def parse_release( ) return parsed, report - tokens, v2_tag = _v2.tokenize(name, kb) + tokens, v2_tag = _v2.tokenize(working_name, kb) annotated = _v2.annotate(tokens, kb) fields = _v2.assemble(annotated, v2_tag, name, kb) diff --git a/tests/fixtures/releases/path_of_pain/khruangbin_yt_wide_pipe/expected.yaml b/tests/fixtures/releases/path_of_pain/khruangbin_yt_wide_pipe/expected.yaml index 8834621..63196f2 100644 --- a/tests/fixtures/releases/path_of_pain/khruangbin_yt_wide_pipe/expected.yaml +++ b/tests/fixtures/releases/path_of_pain/khruangbin_yt_wide_pipe/expected.yaml @@ -1,13 +1,15 @@ release_name: "Khruangbin | Austin City Limits Music Festival 2024 | Full Set [V_-7WWPPeBs].webm" # yt-dlp slug: UTF-8 wide pipe '|' (U+FF5C, not the ASCII '|'), trailing -# YouTube video ID in brackets, .webm extension. Parser extracts the year -# (2024) correctly but mistakes the YouTube ID '7WWPPeBs' for a release -# group, and the wide pipe survives the tokenizer (not a separator). +# YouTube video ID in brackets, .webm extension. The wide pipe survives +# the tokenizer (not a separator) but is now dropped at title assembly +# (pure-punctuation TITLE tokens carry no content). Year (2024) parses +# correctly; the YouTube ID '7WWPPeBs' is still mistaken for a release +# group (separate gap, see PoP backlog). # This is a concert recording — closer to "live music" than "movie", but # media_type=movie is the current degenerate best guess. 
parsed: - title: "Khruangbin.|.Austin.City.Limits.Music.Festival" + title: "Khruangbin.Austin.City.Limits.Music.Festival" year: 2024 season: null episode: null diff --git a/tests/fixtures/releases/path_of_pain/the_prodigy_full_chaos/expected.yaml b/tests/fixtures/releases/path_of_pain/the_prodigy_full_chaos/expected.yaml index 091e757..c943d90 100644 --- a/tests/fixtures/releases/path_of_pain/the_prodigy_full_chaos/expected.yaml +++ b/tests/fixtures/releases/path_of_pain/the_prodigy_full_chaos/expected.yaml @@ -1,28 +1,26 @@ release_name: "The Prodigy World's on Fire 2011 Blu-ray Remux 1080i AVC DTS-HD MA 5.1 - KRaLiMaRKo.mkv" -# Apocalypse case combining every horror: -# - Unescaped apostrophe ("World's") → forces parse_path="ai" fallback -# - Spaces AND dashes used as separators inconsistently -# - "Blu-ray" with a dash (vs. canonical BluRay) -# - "1080i" interlaced flag (not 1080p) -# - "DTS-HD MA 5.1" multi-word audio codec -# - " - GROUP.mkv" trailing format (space-dash-space before group) +# Apocalypse case combining every horror — partially tamed by the +# apostrophe fix. Remaining gaps (still PoP-worthy): +# - "1080i" interlaced flag (not in quality KB) +# - "Blu-ray" with a dash (vs. canonical BluRay) — recognized as source +# but with the dash form +# - "DTS-HD MA 5.1" multi-word audio codec — the trailing "HD" leaks +# into the group # - Trailing .mkv extension survives in title -# Result: total degeneration — UNKNOWN across the board, title=raw input. -# Once the apostrophe + multi-word-audio + 1080i are handled this fixture -# should be revisited. For now: anti-regression of the failure shape. 
+# - " - GROUP" trailing format (space-dash-space before group) parsed: - title: "The Prodigy World's on Fire 2011 Blu-ray Remux 1080i AVC DTS-HD MA 5.1 - KRaLiMaRKo.mkv" - year: null + title: "The.Prodigy.Worlds.on.Fire" + year: 2011 season: null episode: null quality: null - source: null - codec: null - group: "UNKNOWN" - tech_string: "" - media_type: "unknown" - parse_path: "ai" + source: "Blu-ray" + codec: "AVC" + group: "HD" + tech_string: "Blu-ray.AVC" + media_type: "movie" + parse_path: "sanitized" is_season_pack: false tree: diff --git a/tests/fixtures/releases/shitty/archer_multi_episode/expected.yaml b/tests/fixtures/releases/shitty/archer_multi_episode/expected.yaml index c63881d..75469f1 100644 --- a/tests/fixtures/releases/shitty/archer_multi_episode/expected.yaml +++ b/tests/fixtures/releases/shitty/archer_multi_episode/expected.yaml @@ -1,14 +1,13 @@ release_name: "Archer.S14E09E10E11.1080p.WEB.h264-ETHEL" -# Tech debt: triple-episode chain (E09E10E11) — current parser captures -# episode=9 and episode_end=10, but E11 is lost. Anti-regression: lock in -# the partial behavior so any future improvement is intentional. +# Triple-episode chain (E09E10E11) — the parser collapses the chain to a +# range (episode=first, episode_end=last). Intermediate values are implied. 
parsed: title: "Archer" year: null season: 14 episode: 9 - episode_end: 10 + episode_end: 11 quality: "1080p" source: "WEB" codec: "h264" diff --git a/tests/fixtures/releases/shitty/honey_uhd_hdr/expected.yaml b/tests/fixtures/releases/shitty/honey_uhd_hdr/expected.yaml index ff6f975..1b73c7c 100644 --- a/tests/fixtures/releases/shitty/honey_uhd_hdr/expected.yaml +++ b/tests/fixtures/releases/shitty/honey_uhd_hdr/expected.yaml @@ -1,21 +1,22 @@ release_name: "Honey.Don't.2025.2160p.WEBRip.DSNP.DV.HDR.x265.EAC3.5.1-Amen" -# Tech debt: the unescaped apostrophe in "Don't" pushes the whole release -# through the AI fallback path (parse_path="ai") and the parse degenerates to -# UNKNOWN across the board. Anti-regression here — once the tokenizer learns -# to handle apostrophes, this fixture should be revisited. +# Apostrophes inside titles ("Don't", "L'avare") used to push the release +# through the AI fallback (parse_path="ai", everything UNKNOWN). They are +# now pre-stripped before well-formed check and tokenize, so the parse +# completes normally — only the title text loses its apostrophe +# ("Honey.Dont"). 
parsed: - title: "Honey.Don't.2025.2160p.WEBRip.DSNP.DV.HDR.x265.EAC3.5.1-Amen" - year: null + title: "Honey.Dont" + year: 2025 season: null episode: null - quality: null - source: null - codec: null - group: "UNKNOWN" - tech_string: "" - media_type: "unknown" - parse_path: "ai" + quality: "2160p" + source: "WEBRip" + codec: "x265" + group: "Amen" + tech_string: "2160p.WEBRip.x265" + media_type: "movie" + parse_path: "sanitized" is_season_pack: false tree: diff --git a/tests/fixtures/releases/shitty/tatortreiniger_flat_multiseason/expected.yaml b/tests/fixtures/releases/shitty/tatortreiniger_flat_multiseason/expected.yaml index 80ca66f..b6a83c0 100644 --- a/tests/fixtures/releases/shitty/tatortreiniger_flat_multiseason/expected.yaml +++ b/tests/fixtures/releases/shitty/tatortreiniger_flat_multiseason/expected.yaml @@ -1,22 +1,22 @@ release_name: "Der.Tatortreiniger.S01-06.GERMAN.1080p.WEB.x264-WAYNE" -# Tech debt: range syntax 'S01-06' is not recognized as TV — falls through -# to media_type=movie with the range glued onto the title. Captured here so a -# future ranger-aware parser change is intentional. +# Range syntax 'S01-06' is now recognized as a season-range marker: +# season=1 (first of the range), media_type=tv_complete, and the token +# no longer leaks into the title. 
parsed: - title: "Der.Tatortreiniger.S01-06" + title: "Der.Tatortreiniger" year: null - season: null + season: 1 episode: null quality: "1080p" source: "WEB" codec: "x264" group: "WAYNE" tech_string: "1080p.WEB.x264" - media_type: "movie" + media_type: "tv_complete" languages: ["GERMAN"] parse_path: "direct" - is_season_pack: false + is_season_pack: true tree: - "Der.Tatortreiniger.S01-06.GERMAN.1080p.WEB.x264-WAYNE/" diff --git a/tests/fixtures/releases/shitty/vinyl_1x01_format/expected.yaml b/tests/fixtures/releases/shitty/vinyl_1x01_format/expected.yaml index 988552c..67ac313 100644 --- a/tests/fixtures/releases/shitty/vinyl_1x01_format/expected.yaml +++ b/tests/fixtures/releases/shitty/vinyl_1x01_format/expected.yaml @@ -1,11 +1,12 @@ release_name: "Vinyl - 1x01 - FHD" -# Tech debt: surrounding ' - ' separators leave a stray '-' token attached -# to the title ("Vinyl.-"). NxNN form correctly identifies S01E01; everything -# tech-side empty (no quality token in KB — "FHD" not yet known). Anti-regression -# the current degenerate title so a future fix is intentional. +# Surrounding ' - ' separators in human-friendly release names left stray +# '-' tokens attached to the title. They are now dropped at assembly time +# (pure-punctuation TITLE tokens carry no content). NxNN form correctly +# identifies S01E01; tech-side stays empty (no quality token in KB — "FHD" +# not yet known). parsed: - title: "Vinyl.-" + title: "Vinyl" year: null season: 1 episode: 1