From b1c7f35ffb9a90617cda9446bd14a4901dc832b5 Mon Sep 17 00:00:00 2001 From: Francwa Date: Wed, 20 May 2026 23:24:40 +0200 Subject: [PATCH] fix(release/parser): drop pure-punctuation TITLE tokens at assembly MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Releases using ' - ' as a separator (Vinyl - 1x01 - FHD) tokenize to ['Vinyl', '-', '1x01', '-', 'FHD'] — the standalone '-' tokens were ending up in title_parts and leaked into the joined title ('Vinyl.-'). We can't add '-' to the separator list (it would break codec-GROUP), so we filter at assembly: a TITLE token with no alphanumeric characters carries no title content. Side win: same logic eliminates the UTF-8 wide-pipe '｜' from the khruangbin_yt_wide_pipe fixture title. Fixtures updated: - shitty/vinyl_1x01_format/expected.yaml (title: Vinyl.- → Vinyl) - path_of_pain/khruangbin_yt_wide_pipe/expected.yaml (｜ dropped) --- CHANGELOG.md | 6 ++++++ alfred/domain/release/parser/pipeline.py | 9 ++++++++- .../khruangbin_yt_wide_pipe/expected.yaml | 10 ++++++---- .../releases/shitty/vinyl_1x01_format/expected.yaml | 11 ++++++----- 4 files changed, 26 insertions(+), 10 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a2c33fd..90f2316 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -23,6 +23,12 @@ callers). with intermediate values implied. Fixture `shitty/archer_multi_episode/` updated from anti-regression-of-bug to anti-regression-of-fix. +- **Pure-punctuation TITLE tokens are dropped at assembly.** Releases + with surrounding ` - ` separators (`Vinyl - 1x01 - FHD`) previously + produced `title="Vinyl.-"`. Such tokens (a stray dash, a wide pipe + `｜`, …) carry no title content and are now filtered out. Side + effect: PoP fixture `khruangbin_yt_wide_pipe/` also benefits — the + YouTube wide-pipe no longer leaks into the title. 
### Added diff --git a/alfred/domain/release/parser/pipeline.py b/alfred/domain/release/parser/pipeline.py index de0a883..2a96edb 100644 --- a/alfred/domain/release/parser/pipeline.py +++ b/alfred/domain/release/parser/pipeline.py @@ -618,7 +618,14 @@ def assemble( layer in additional fields (``parse_path``, ``raw``, …) before instantiation. """ - title_parts = [t.text for t in annotated if t.role is TokenRole.TITLE] + # Pure-punctuation tokens (e.g. a stray "-" left by ` - ` separators in + # human-friendly release names) carry no title content and would leak + # into the joined title as ``"Show.-.Episode"``. Drop them here. + title_parts = [ + t.text + for t in annotated + if t.role is TokenRole.TITLE and any(c.isalnum() for c in t.text) + ] title = ".".join(title_parts) if title_parts else ( annotated[0].text if annotated else raw_name ) diff --git a/tests/fixtures/releases/path_of_pain/khruangbin_yt_wide_pipe/expected.yaml b/tests/fixtures/releases/path_of_pain/khruangbin_yt_wide_pipe/expected.yaml index 8834621..63196f2 100644 --- a/tests/fixtures/releases/path_of_pain/khruangbin_yt_wide_pipe/expected.yaml +++ b/tests/fixtures/releases/path_of_pain/khruangbin_yt_wide_pipe/expected.yaml @@ -1,13 +1,15 @@ release_name: "Khruangbin ｜ Austin City Limits Music Festival 2024 ｜ Full Set [V_-7WWPPeBs].webm" # yt-dlp slug: UTF-8 wide pipe '｜' (U+FF5C, not the ASCII '|'), trailing -# YouTube video ID in brackets, .webm extension. Parser extracts the year -# (2024) correctly but mistakes the YouTube ID '7WWPPeBs' for a release -# group, and the wide pipe survives the tokenizer (not a separator). +# YouTube video ID in brackets, .webm extension. The wide pipe survives +# the tokenizer (not a separator) but is now dropped at title assembly +# (pure-punctuation TITLE tokens carry no content). Year (2024) parses +# correctly; the YouTube ID '7WWPPeBs' is still mistaken for a release +# group (separate gap, see PoP backlog). 
# This is a concert recording — closer to "live music" than "movie", but # media_type=movie is the current degenerate best guess. parsed: - title: "Khruangbin.｜.Austin.City.Limits.Music.Festival" + title: "Khruangbin.Austin.City.Limits.Music.Festival" year: 2024 season: null episode: null diff --git a/tests/fixtures/releases/shitty/vinyl_1x01_format/expected.yaml b/tests/fixtures/releases/shitty/vinyl_1x01_format/expected.yaml index 988552c..67ac313 100644 --- a/tests/fixtures/releases/shitty/vinyl_1x01_format/expected.yaml +++ b/tests/fixtures/releases/shitty/vinyl_1x01_format/expected.yaml @@ -1,11 +1,12 @@ release_name: "Vinyl - 1x01 - FHD" -# Tech debt: surrounding ' - ' separators leave a stray '-' token attached -# to the title ("Vinyl.-"). NxNN form correctly identifies S01E01; everything -# tech-side empty (no quality token in KB — "FHD" not yet known). Anti-regression -# the current degenerate title so a future fix is intentional. +# Surrounding ' - ' separators in human-friendly release names left stray +# '-' tokens attached to the title. They are now dropped at assembly time +# (pure-punctuation TITLE tokens carry no content). NxNN form correctly +# identifies S01E01; tech-side stays empty (no quality token in KB — "FHD" +# not yet known). parsed: - title: "Vinyl.-" + title: "Vinyl" year: null season: 1 episode: 1