#!/usr/bin/env python3
"""
Delete download entries from /media/downloads/sonarr that are NOT in Sonarr,
logging every action (size, path, timestamp, outcome) to cleanup.log.

Runs in three steps:
  1. Tries hard to match each download against Sonarr (title + romaji +
     partial). Anything that matches is skipped — only true non-matches are
     deleted.
  2. For each confirmed non-match, checks whether a directory with that show
     name exists in /media/series (belt-and-suspenders). If it does, skips.
  3. Deletes remaining entries and logs every outcome.

Usage:
  python3 cleanup-orphans.py --dry-run    # show what would be deleted
  python3 cleanup-orphans.py --yes        # delete without confirmation
"""

import argparse
import json
import os
import re
import shlex
import subprocess
import sys
import urllib.request
from datetime import datetime, timezone

SONARR_URL = "http://localhost:8989/api/v3"
SSH_HOST = "aya01"
DL_ROOT = "/media/downloads/sonarr"
SERIES_ROOT = "/media/series"

script_dir = os.path.dirname(os.path.abspath(__file__))
LOG_FILE = os.path.join(script_dir, "cleanup.log")
KEY_FILE = os.path.join(script_dir, '../../../..', 'sonarr.api.env')

# The key is still loaded at import time, but a missing/unreadable file is
# reported by main() with a clear message instead of an import-time traceback
# (which also used to break `--help`).
try:
    with open(KEY_FILE, encoding='utf-8') as f:
        SONARR_KEY = f.read().strip()
except OSError:
    SONARR_KEY = None


def api_get(url):
    """Fetch *url* and return the decoded JSON body (30 second timeout)."""
    with urllib.request.urlopen(url, timeout=30) as r:
        return json.load(r)


def norm(s):
    """Lower-case *s* and drop every character that is not a-z or 0-9."""
    return re.sub(r'[^a-z0-9]', '', s.lower())


def ssh_run(cmd, check=False):
    """Run *cmd* on SSH_HOST and return its stripped stdout.

    With ``check=True`` a non-zero exit status raises RuntimeError instead of
    being silently turned into (possibly empty) output. The default keeps the
    original best-effort behaviour.
    """
    r = subprocess.run(['ssh', SSH_HOST, cmd], capture_output=True, text=True)
    if check and r.returncode != 0:
        detail = r.stderr.strip() or cmd
        raise RuntimeError(
            f"ssh {SSH_HOST} failed (exit {r.returncode}): {detail}")
    return r.stdout.strip()


def ssh_exists(path):
    """Return True if *path* exists on SSH_HOST."""
    # shlex.quote, not json.dumps: JSON's double quotes still let the remote
    # shell expand `$`/backticks and turn non-ASCII into literal \uXXXX.
    return ssh_run(f'[ -e {shlex.quote(path)} ] && echo yes || echo no') == 'yes'


def ssh_size(path):
    """Return size in bytes, or 0 if path doesn't exist."""
    out = ssh_run(f'du -sb -- {shlex.quote(path)} 2>/dev/null | cut -f1')
    try:
        return int(out)
    except ValueError:
        return 0


def ssh_delete(path):
    """Run ``rm -rf`` on *path* on SSH_HOST.

    Returns ``(ok, stderr)`` where *ok* is True when rm exited with status 0.
    """
    r = subprocess.run(['ssh', SSH_HOST, f'rm -rf -- {shlex.quote(path)}'],
                       capture_output=True, text=True)
    return r.returncode == 0, r.stderr.strip()


def build_size_script(paths):
    """Return a shell script that prints exactly one size line per path.

    A path that `du` cannot measure yields ``0`` rather than no output, so
    line *i* of the result always belongs to ``paths[i]``.
    """
    return '; '.join(
        f's=$(du -sb -- {shlex.quote(p)} 2>/dev/null | cut -f1); '
        f'echo "${{s:-0}}"'
        for p in paths
    )


def parse_size_lines(names, lines):
    """Pair *names* with *lines* positionally and return ``{name: bytes}``.

    Missing or non-numeric lines count as 0.
    """
    sizes = {}
    for i, name in enumerate(names):
        try:
            sizes[name] = int(lines[i])
        except (ValueError, IndexError):
            sizes[name] = 0
    return sizes


def list_remote_dir(root):
    """Return the non-empty, stripped `ls` entries of *root* on SSH_HOST.

    Raises RuntimeError when the listing command fails.
    """
    out = ssh_run(f'ls -- {shlex.quote(root + "/")}', check=True)
    return [e.strip() for e in out.splitlines() if e.strip()]


def log(line):
    """Print *line* with a UTC timestamp and append it to LOG_FILE."""
    ts = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')
    entry = f"[{ts}] {line}"
    print(entry)
    with open(LOG_FILE, 'a', encoding='utf-8') as f:
        f.write(entry + '\n')


def extract_title(name):
    """Strip season/episode/quality tags to recover a bare show title."""
    name = re.sub(r'\.(mkv|mp4|ts|avi)$', '', name, flags=re.IGNORECASE)
    name = re.sub(r'^\[.*?\]\s*', '', name)       # [Group] prefix
    name = re.sub(r'\s*\[.*?\]\s*', ' ', name)    # inline [tags]
    name = re.sub(r'[\.\s_\-]?[Ss]\d{1,2}[Ee]\d{1,2}.*$', '', name)  # S01E02...
    name = re.sub(r'[\.\s_\-]?[Ss]\d{1,2}[\.\s_\-].*$', '', name)    # S01 + rest
    name = re.sub(r'[\.\s_\-]?[Ss]\d{2}$', '', name)                 # trailing S01
    name = re.sub(r'[\.\s_\-]?(19|20)\d{2}.*$', '', name)            # year + rest
    name = re.sub(r'[\.\s_\-]?\d{3,4}p.*$', '', name)                # 1080p etc
    name = re.sub(r'[\.\-_]+', ' ', name).strip()
    return name


def norm_dir(d):
    """Normalize a series directory name for comparison.

    Strips ``{...}`` tags and 4-digit numbers, then applies the same
    lower-case alphanumeric folding as norm(), so
    "Bleach (2004) {imdb-...}" becomes "bleach".
    """
    d = re.sub(r'\{.*?\}', '', d)          # remove {imdb-...}
    d = re.sub(r'\(?\d{4}\)?', '', d)      # remove years
    return re.sub(r'[^a-z0-9]', '', d.lower())


def build_sonarr_index(series):
    """Map every normalized title variant of each series to its record.

    Indexed variants: title, titleSlug, sortTitle and each alternate title.
    Later series overwrite earlier ones on key collisions.
    """
    idx = {}
    for s in series:
        for title_variant in (s['title'], s.get('titleSlug', ''),
                              s.get('sortTitle', '')):
            if title_variant:
                idx[norm(title_variant)] = s
        # Also index alternate titles if present (tolerate an explicit null).
        for alt in s.get('alternateTitles') or []:
            t = alt.get('title', '')
            if t:
                idx[norm(t)] = s
    return idx


def find_in_sonarr(dl_name, idx):
    """Look up a download name in the index built by build_sonarr_index().

    Returns ``(record, extracted_title)``; *record* is None when neither an
    exact nor a prefix match is found.
    """
    title = extract_title(dl_name)
    tn = norm(title)
    if tn in idx:
        return idx[tn], title
    # Partial: dl title starts with series title (or vice versa), min 6 chars
    # on both sides to avoid matching on very short names.
    if len(tn) >= 6:
        for k, rec in idx.items():
            if len(k) >= 6 and (tn.startswith(k) or k.startswith(tn)):
                return rec, title
    return None, title


def confirm(prompt):
    """Ask a yes/no question; only an explicit 'y' returns True."""
    try:
        answer = input(f"{prompt} [y/N] ").strip().lower()
    except EOFError:
        # No interactive stdin: treat as "no" rather than crashing.
        return False
    return answer == 'y'


def main():
    """Find downloads unknown to Sonarr, report them and (optionally) delete."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--dry-run', action='store_true')
    parser.add_argument('--yes', '-y', action='store_true')
    args = parser.parse_args()

    if not SONARR_KEY:
        sys.exit(f"Cannot read Sonarr API key from {KEY_FILE}")

    if args.dry_run:
        print("DRY-RUN — nothing will be deleted\n")

    log("=" * 60)
    log(f"cleanup-orphans.py started (dry_run={args.dry_run})")

    print("Fetching Sonarr series (including alternate titles)...")
    series = api_get(f"{SONARR_URL}/series?apikey={SONARR_KEY}")
    print(f" {len(series)} series")
    if not series:
        # With nothing to match against, every download would look orphaned.
        log("ABORTED | Sonarr returned no series")
        sys.exit(1)
    idx = build_sonarr_index(series)

    # Collect series dirs on disk for secondary check.
    # Strip years, imdb tags, and punctuation so "Bleach (2004) {imdb-...}"
    # matches "Bleach".
    try:
        print("Fetching /media/series directory listing...")
        series_on_disk_norm = {norm_dir(d) for d in list_remote_dir(SERIES_ROOT)}

        print("Fetching download listing...")
        dl_entries = list_remote_dir(DL_ROOT)
    except RuntimeError as exc:
        # A failed listing must not be mistaken for an empty directory: an
        # empty series listing would disable the on-disk safety check.
        log(f"ABORTED | {exc}")
        sys.exit(1)
    print(f" {len(dl_entries)} entries in {DL_ROOT}")
    if not series_on_disk_norm:
        log(f"WARNING | {SERIES_ROOT} listing is empty; "
            "on-disk check will not skip anything")

    # --- First pass: match against Sonarr ---
    not_in_sonarr = []
    in_sonarr = []
    for dl in dl_entries:
        rec, extracted_title = find_in_sonarr(dl, idx)
        if rec:
            in_sonarr.append((dl, rec['title']))
        else:
            not_in_sonarr.append((dl, extracted_title))

    print(f"\n Matched to Sonarr: {len(in_sonarr)}")
    print(f" NOT in Sonarr: {len(not_in_sonarr)}")

    # --- Second pass: check if series dir exists on disk anyway ---
    skip_has_series_dir = []
    to_delete = []
    for dl, title in not_in_sonarr:
        title_n = norm(title)
        # Check if any series dir on disk has a similar name. An empty
        # title_n is a prefix of everything, so names that normalize to
        # nothing land in the manual-review list instead of being deleted.
        has_dir = any(
            len(d) >= 6 and (title_n.startswith(d) or d.startswith(title_n))
            for d in series_on_disk_norm
        )
        dl_path = f"{DL_ROOT}/{dl}"
        if has_dir:
            skip_has_series_dir.append((dl, title, dl_path))
        else:
            to_delete.append((dl, title, dl_path))

    if skip_has_series_dir:
        print(f"\n SKIPPED (series dir found on disk, needs manual review): "
              f"{len(skip_has_series_dir)}")
        for dl, title, _ in skip_has_series_dir:
            print(f" {title:40s} ← {dl[:60]}")

    print(f"\n{'='*60}")
    print(f"TO DELETE ({len(to_delete)} entries — not in Sonarr, no series dir on disk)")
    print(f"{'='*60}")

    # Measure all sizes in a single SSH session (best effort: failures -> 0).
    print("\nMeasuring sizes...")
    if to_delete:
        size_script = build_size_script([path for _, _, path in to_delete])
        size_out = ssh_run(f'bash -c {shlex.quote(size_script)}').splitlines()
    else:
        size_out = []
    sizes = parse_size_lines([dl for dl, _, _ in to_delete], size_out)

    ordered = sorted(to_delete, key=lambda x: x[1])
    total_bytes = sum(sizes.values())
    for dl, title, path in ordered:
        sz = sizes.get(dl, 0)
        print(f" {sz/1e9:6.1f}G {title:40s} ← {dl[:60]}")
    print(f"\n Total: {total_bytes/1e9:.1f}G across {len(to_delete)} entries")

    if not to_delete:
        log("Nothing to delete.")
        return

    if not args.dry_run and not args.yes:
        if not confirm(f"\nDelete {len(to_delete)} entries?"):
            log("Aborted by user.")
            return

    # --- Delete with logging ---
    deleted_count = 0
    deleted_bytes = 0
    failed_count = 0

    for dl, title, path in ordered:
        sz = sizes.get(dl, 0)
        if args.dry_run:
            log(f"DRY-RUN | {sz/1e9:.2f}G | {title} | {path}")
            deleted_count += 1
            deleted_bytes += sz
            continue
        ok, err = ssh_delete(path)
        if ok:
            log(f"DELETED | {sz/1e9:.2f}G | {title} | {path}")
            deleted_count += 1
            deleted_bytes += sz
        else:
            log(f"FAILED | {sz/1e9:.2f}G | {title} | {path} | {err}")
            failed_count += 1

    log(f"DONE | deleted={deleted_count} | freed={deleted_bytes/1e9:.1f}G | failed={failed_count}")


if __name__ == '__main__':
    main()