refactor(release): simplify SHITTY to dict-driven token tagging

Replace the ~480-line legacy heuristic block in services.py with a
small dict-driven pass in pipeline._annotate_shitty: each token is
looked up against the kb buckets (resolutions / sources / codecs /
distributors / year / sxxexx) with first-match-wins semantics, the
leftmost contiguous UNKNOWN run becomes the title, done.

SHITTY's scope is intentionally narrow — releases that *look* like
scene names but don't have a registered group schema. Anything more
exotic (parenthesized tech, bare-dashed title fragments, YT slugs,
franchise boxes) is PATH OF PAIN territory and stays out of here.

- annotate() no longer returns None; SHITTY is the always-on fallback
- services.py shrunk from ~525 to ~85 lines (legacy extractors gone)
- 4 fixtures get xfail markers documenting PoP-grade pathologies
  (deutschland franchise box, sleaford YT slug, super_mario bilingual,
  predator space-separators — the last one moved from shitty/ → pop/)
- ReleaseFixture grows xfail_reason; the parametrized suite wires the
  pytest.mark.xfail(strict=False) automatically
This commit is contained in:
2026-05-20 01:03:25 +02:00
parent fd3bd1ad8c
commit 3737f66851
9 changed files with 231 additions and 502 deletions
+16 -4
View File
@@ -90,11 +90,23 @@ class TestAnnotateEasy:
assert TokenRole.RESOLUTION in roles
assert TokenRole.CODEC in roles
def test_unknown_group_returns_none(self) -> None:
def test_unknown_group_falls_to_shitty(self) -> None:
tokens, _ = tokenize("Some.Movie.2020.1080p.WEBRip.x264-RANDOM", _KB)
# RANDOM is not in our release_groups/ annotate returns None
# and the caller falls back to SHITTY.
assert annotate(tokens, _KB) is None
# RANDOM is not in our release_groups/ annotate() now falls
# through to the in-pipeline SHITTY pass and returns a populated
# token list (no None sentinel anymore).
annotated = annotate(tokens, _KB)
assert annotated is not None
roles = [t.role for t in annotated]
# Title is "Some.Movie", then YEAR, RESOLUTION, SOURCE, CODEC
# carrying the group in extra.
assert TokenRole.TITLE in roles
assert TokenRole.YEAR in roles
assert TokenRole.RESOLUTION in roles
assert TokenRole.SOURCE in roles
assert TokenRole.CODEC in roles
codec_tok = next(t for t in annotated if t.role is TokenRole.CODEC)
assert codec_tok.extra.get("group") == "RANDOM"
class TestAssemble:
+8 -2
View File
@@ -26,10 +26,16 @@ _KB = YamlReleaseKnowledge()
FIXTURES = discover_fixtures()
def _fixture_param(f: ReleaseFixture) -> pytest.param:
marks = []
if f.xfail_reason:
marks.append(pytest.mark.xfail(reason=f.xfail_reason, strict=False))
return pytest.param(f, id=f.name, marks=marks)
@pytest.mark.parametrize(
"fixture",
FIXTURES,
ids=[f.name for f in FIXTURES],
[_fixture_param(f) for f in FIXTURES],
)
def test_parse_matches_fixture(fixture: ReleaseFixture, tmp_path) -> None:
# Materialize the tree to assert it is at least well-formed YAML +