docs(runbook): add arr-stack downloads cleanup investigation and scripts

~16T freed on aya01 (92% → 57% mergerfs pool). Documents root cause (no hardlinks across mergerfs due to cross-device mounts), cleanup passes via Sonarr/Radarr API verification, and pending decisions (Bleach remux, 111 skipped Sonarr entries).
2026-04-23 08:06:27 +02:00
parent e87dcd06f3
commit 8239988a70
5 changed files with 943 additions and 0 deletions
--- a/docs/runbooks/arr-cleanup/cleanup-orphans.py
+++ b/docs/runbooks/arr-cleanup/cleanup-orphans.py
@@ -0,0 +1,259 @@
+#!/usr/bin/env python3
+"""
+Delete download entries from /media/downloads/sonarr that are NOT in Sonarr,
+logging every action (size, path, timestamp, outcome) to cleanup.log.
+
+Runs in three steps:
+  1. Tries hard to match each orphan against Sonarr (title + romaji + partial).
+     Anything that matches is skipped — only true non-matches are deleted.
+  2. For each confirmed non-match, checks whether a directory with that show
+     name exists in /media/series (belt-and-suspenders). If it does, skips.
+  3. Deletes remaining entries and logs every outcome.
+
+Usage:
+  python3 cleanup-orphans.py --dry-run     # show what would be deleted
+  python3 cleanup-orphans.py --yes         # delete without confirmation
+"""
+
+import argparse
+import json
+import os
+import re
+import shlex
+import subprocess
+import sys
+import urllib.request
+from datetime import datetime, timezone
+
+# Sonarr v3 API base URL and the SSH host the remote commands run on.
+SONARR_URL = "http://localhost:8989/api/v3"
+SSH_HOST = "aya01"
+
+# Remote directories: the Sonarr download folder and the series library.
+DL_ROOT = "/media/downloads/sonarr"
+SERIES_ROOT = "/media/series"
+
+# The log file sits next to this script; the API key file sits one level up.
+script_dir = os.path.dirname(os.path.abspath(__file__))
+LOG_FILE = os.path.join(script_dir, "cleanup.log")
+
+with open(os.path.join(script_dir, os.pardir, 'sonarr.api.env')) as f:
+    SONARR_KEY = f.read().strip()
+
+
+def api_get(url):
+    """Fetch *url* (30 s timeout) and return the decoded JSON body."""
+    response = urllib.request.urlopen(url, timeout=30)
+    with response:
+        payload = json.load(response)
+    return payload
+
+
+def norm(s):
+    """Lower-case *s* and keep only the ASCII characters a-z and 0-9."""
+    keep = "abcdefghijklmnopqrstuvwxyz0123456789"
+    return ''.join(ch for ch in s.lower() if ch in keep)
+
+
+def ssh_run(cmd):
+    """Run *cmd* on SSH_HOST via ssh and return its stdout, stripped.
+
+    The return value is the same whether or not the command succeeded, so
+    callers that rely on "empty output on failure" keep working. A non-zero
+    exit status is reported on stderr instead of being silently discarded,
+    so a failed listing is not mistaken for an empty one.
+    """
+    r = subprocess.run(['ssh', SSH_HOST, cmd], capture_output=True, text=True)
+    if r.returncode != 0:
+        detail = r.stderr.strip() or "(no stderr output)"
+        print(f"warning: ssh {SSH_HOST} exited with status {r.returncode}: {detail}",
+              file=sys.stderr)
+    return r.stdout.strip()
+
+
+def ssh_exists(path):
+    """Return True if *path* exists on the remote host.
+
+    The path is quoted with shlex.quote so the remote shell sees it
+    literally. JSON string escaping is not shell quoting: inside double
+    quotes the shell still expands `$` and backticks, and non-ASCII
+    characters are turned into \\uXXXX sequences that name a different path.
+    """
+    return ssh_run(f'[ -e {shlex.quote(path)} ] && echo yes || echo no') == 'yes'
+
+
+def ssh_size(path):
+    """Return the size of *path* in bytes, or 0 if it cannot be measured.
+
+    The path is passed through shlex.quote (not JSON escaping) so that
+    names containing `$`, backticks or non-ASCII characters reach `du`
+    unchanged; `--` keeps a leading dash from being read as an option.
+    """
+    out = ssh_run(f'du -sb -- {shlex.quote(path)} 2>/dev/null | cut -f1')
+    try:
+        return int(out)
+    except ValueError:
+        # Empty or non-numeric output: du failed or the path is missing.
+        return 0
+
+
+def ssh_delete(path):
+    """Recursively delete *path* on the remote host.
+
+    Returns a ``(ok, stderr)`` tuple where *ok* is True when the remote
+    command exited with status 0.
+
+    The path is quoted with shlex.quote. With JSON escaping the remote shell
+    would expand `$(...)` or backticks embedded in a download name, and a
+    non-ASCII name would be rewritten to \\uXXXX sequences; `rm -rf` on that
+    wrong path still exits 0, so the entry would be reported as deleted
+    while remaining on disk.
+    """
+    r = subprocess.run(['ssh', SSH_HOST, f'rm -rf -- {shlex.quote(path)}'],
+                       capture_output=True, text=True)
+    return r.returncode == 0, r.stderr.strip()
+
+
+def log(line):
+    """Print *line* with a UTC timestamp prefix and append it to LOG_FILE."""
+    stamp = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')
+    entry = "[{}] {}".format(stamp, line)
+    print(entry)
+    with open(LOG_FILE, 'a') as handle:
+        print(entry, file=handle)
+
+
+# Ordered (pattern, replacement) rewrite rules used by extract_title().
+# Order matters: each rule runs on the output of the previous one.
+_TITLE_RULES = (
+    (re.compile(r'\.(mkv|mp4|ts|avi)$', re.IGNORECASE), ''),   # file extension
+    (re.compile(r'^\[.*?\]\s*'), ''),                           # [Group] prefix
+    (re.compile(r'\s*\[.*?\]\s*'), ' '),                        # inline [tags]
+    (re.compile(r'[\.\s_\-]?[Ss]\d{1,2}[Ee]\d{1,2}.*$'), ''),   # S01E02 and after
+    (re.compile(r'[\.\s_\-]?[Ss]\d{1,2}[\.\s_\-].*$'), ''),     # S01 + separator
+    (re.compile(r'[\.\s_\-]?[Ss]\d{2}$'), ''),                  # trailing S01
+    (re.compile(r'[\.\s_\-]?(19|20)\d{2}.*$'), ''),             # year and after
+    (re.compile(r'[\.\s_\-]?\d{3,4}p.*$'), ''),                 # 1080p etc
+    (re.compile(r'[\.\-_]+'), ' '),                             # separators
+)
+
+
+def extract_title(name):
+    """Reduce a download name to a bare show title.
+
+    Applies the rewrite rules in order (extension, release-group and inline
+    tags, season/episode markers, year, resolution), turns the remaining
+    dot/dash/underscore separators into spaces and trims the result.
+    """
+    for pattern, replacement in _TITLE_RULES:
+        name = pattern.sub(replacement, name)
+    return name.strip()
+
+
+def build_sonarr_index(series):
+    """Map every normalised title variant of each series to its record.
+
+    For each series the variants are, in order: ``title``, ``titleSlug``,
+    ``sortTitle``, then the ``title`` of every ``alternateTitles`` entry.
+    Empty variants are skipped; a later series overwrites an earlier one
+    that normalises to the same key.
+    """
+    index = {}
+    for record in series:
+        variants = [
+            record['title'],
+            record.get('titleSlug', ''),
+            record.get('sortTitle', ''),
+        ]
+        variants.extend(
+            alt.get('title', '') for alt in record.get('alternateTitles', [])
+        )
+        for variant in variants:
+            if not variant:
+                continue
+            index[norm(variant)] = record
+    return index
+
+
+def find_in_sonarr(dl_name, idx):
+    """Look up a download name in the Sonarr index.
+
+    Returns ``(record, title)`` where *title* is the bare title extracted
+    from *dl_name* and *record* is the matching series, or None if there is
+    no match. An exact match on the normalised title is tried first; failing
+    that, the first index key that is a prefix of the title (or vice versa)
+    wins, provided both strings are at least 6 characters long.
+    """
+    title = extract_title(dl_name)
+    wanted = norm(title)
+
+    if wanted in idx:
+        return idx[wanted], title
+
+    if len(wanted) >= 6:
+        for key, record in idx.items():
+            if len(key) < 6:
+                continue
+            if wanted.startswith(key) or key.startswith(wanted):
+                return record, title
+
+    return None, title
+
+
+def confirm(prompt):
+    """Ask a yes/no question on stdin; only an answer of "y" (any case) is a yes."""
+    reply = input(f"{prompt} [y/N] ")
+    return reply.strip().lower() == 'y'
+
+
+def main():
+    """Delete download entries that Sonarr does not know, logging every outcome.
+
+    Command-line flags:
+      --dry-run   report and log what would be deleted; nothing is removed
+      --yes / -y  skip the interactive confirmation
+
+    Steps: match every entry of DL_ROOT against the Sonarr index, hold back
+    anything whose title resembles a directory under SERIES_ROOT, measure
+    what is left, ask for confirmation, then delete entry by entry.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--dry-run', action='store_true')
+    parser.add_argument('--yes', '-y', action='store_true')
+    args = parser.parse_args()
+
+    if args.dry_run:
+        print("DRY-RUN — nothing will be deleted\n")
+
+    log("=" * 60)
+    log(f"cleanup-orphans.py started (dry_run={args.dry_run})")
+
+    print("Fetching Sonarr series (including alternate titles)...")
+    series = api_get(f"{SONARR_URL}/series?apikey={SONARR_KEY}")
+    print(f"  {len(series)} series")
+    idx = build_sonarr_index(series)
+
+    # Collect series dirs on disk for secondary check
+    # Strip years, imdb tags, and punctuation so "Bleach (2004) {imdb-...}" matches "Bleach"
+    print("Fetching /media/series directory listing...")
+    series_on_disk_raw = ssh_run(f'ls {shlex.quote(SERIES_ROOT)}/').splitlines()
+
+    def norm_dir(d):
+        d = re.sub(r'\{.*?\}', '', d)          # remove {imdb-...}
+        d = re.sub(r'\(?\d{4}\)?', '', d)      # remove years
+        return norm(d)
+
+    series_on_disk_norm = {norm_dir(d) for d in series_on_disk_raw if d.strip()}
+    if not series_on_disk_norm:
+        # An empty listing (failed ssh/ls, unmounted pool) would silently
+        # disable the on-disk safety check and widen the delete set.
+        log(f"ABORTED | no series directories found under {SERIES_ROOT}; "
+            "cannot run the on-disk safety check")
+        return
+
+    print("Fetching download listing...")
+    dl_entries = ssh_run(f'ls {shlex.quote(DL_ROOT)}/').splitlines()
+    dl_entries = [e.strip() for e in dl_entries if e.strip()]
+    print(f"  {len(dl_entries)} entries in {DL_ROOT}")
+
+    # --- First pass: match against Sonarr ---
+    not_in_sonarr = []
+    in_sonarr     = []
+
+    for dl in dl_entries:
+        rec, extracted_title = find_in_sonarr(dl, idx)
+        if rec:
+            in_sonarr.append((dl, rec['title']))
+        else:
+            not_in_sonarr.append((dl, extracted_title))
+
+    print(f"\n  Matched to Sonarr:   {len(in_sonarr)}")
+    print(f"  NOT in Sonarr:       {len(not_in_sonarr)}")
+
+    # --- Second pass: check if series dir exists on disk anyway ---
+    skip_has_series_dir = []
+    to_delete = []
+
+    for dl, title in not_in_sonarr:
+        title_n = norm(title)
+        # Check if any series dir on disk has a similar name. A short or
+        # empty title matches broadly, which errs on the side of skipping.
+        has_dir = any(
+            d and len(d) >= 6 and (title_n.startswith(d) or d.startswith(title_n))
+            for d in series_on_disk_norm
+        )
+        dl_path = f"{DL_ROOT}/{dl}"
+        if has_dir:
+            skip_has_series_dir.append((dl, title, dl_path))
+        else:
+            to_delete.append((dl, title, dl_path))
+
+    if skip_has_series_dir:
+        print(f"\n  SKIPPED (series dir found on disk, needs manual review): {len(skip_has_series_dir)}")
+        for dl, title, _ in skip_has_series_dir:
+            print(f"    {title:40s} ← {dl[:60]}")
+
+    print(f"\n{'='*60}")
+    print(f"TO DELETE ({len(to_delete)} entries — not in Sonarr, no series dir on disk)")
+    print(f"{'='*60}")
+
+    # Measure all candidates in a single SSH round trip. du prints one
+    # "<bytes>\t<path>" line per argument it could measure, so sizes are
+    # keyed by path: an entry du cannot measure stays at 0 instead of
+    # shifting every later size onto the wrong entry.
+    print("\nMeasuring sizes...")
+    sizes = {dl: 0 for dl, _, _ in to_delete}
+    if to_delete:
+        dl_by_path = {path: dl for dl, _, path in to_delete}
+        quoted_paths = ' '.join(shlex.quote(path) for _, _, path in to_delete)
+        du_out = ssh_run(f'du -sb -- {quoted_paths} 2>/dev/null || true')
+        for line in du_out.splitlines():
+            size_field, _, reported_path = line.partition('\t')
+            dl = dl_by_path.get(reported_path)
+            if dl is not None and size_field.isdigit():
+                sizes[dl] = int(size_field)
+
+    total_bytes = sum(sizes.values())
+    for dl, title, path in sorted(to_delete, key=lambda x: x[1]):
+        sz = sizes.get(dl, 0)
+        print(f"  {sz/1e9:6.1f}G  {title:40s} ← {dl[:60]}")
+
+    print(f"\n  Total: {total_bytes/1e9:.1f}G across {len(to_delete)} entries")
+
+    if not to_delete:
+        log("Nothing to delete.")
+        return
+
+    if not args.dry_run and not args.yes:
+        if not confirm(f"\nDelete {len(to_delete)} entries?"):
+            log("Aborted by user.")
+            return
+
+    # --- Delete with logging ---
+    deleted_count = 0
+    deleted_bytes = 0
+    failed_count  = 0
+
+    for dl, title, path in sorted(to_delete, key=lambda x: x[1]):
+        sz = sizes.get(dl, 0)
+        if args.dry_run:
+            # In dry-run mode the counters report what would be deleted.
+            log(f"DRY-RUN | {sz/1e9:.2f}G | {title} | {path}")
+            deleted_count += 1
+            deleted_bytes += sz
+        else:
+            ok, err = ssh_delete(path)
+            if ok:
+                log(f"DELETED | {sz/1e9:.2f}G | {title} | {path}")
+                deleted_count += 1
+                deleted_bytes += sz
+            else:
+                log(f"FAILED  | {sz/1e9:.2f}G | {title} | {path} | {err}")
+                failed_count += 1
+
+    log(f"DONE | deleted={deleted_count} | freed={deleted_bytes/1e9:.1f}G | failed={failed_count}")
+
+
+# Run the cleanup only when executed as a script, not when imported.
+if __name__ == '__main__':
+    main()