docs(runbook): add arr-stack downloads cleanup investigation and scripts
~16T freed on aya01 (92% → 57% mergerfs pool). Documents root cause (no hardlinks across mergerfs due to cross-device mounts), cleanup passes via Sonarr/Radarr API verification, and pending decisions (Bleach remux, 111 skipped Sonarr entries).
This commit is contained in:
259
docs/runbooks/arr-cleanup/cleanup-orphans.py
Normal file
259
docs/runbooks/arr-cleanup/cleanup-orphans.py
Normal file
@@ -0,0 +1,259 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Delete download entries from /media/downloads/sonarr that are NOT in Sonarr,
|
||||
logging every action (size, path, timestamp, outcome) to cleanup.log.
|
||||
|
||||
Runs in three steps:
|
||||
1. Tries hard to match each orphan against Sonarr (title + romaji + partial).
|
||||
Anything that matches is skipped — only true non-matches are deleted.
|
||||
2. For each confirmed non-match, checks whether a directory with that show
|
||||
name exists in /media/series (belt-and-suspenders). If it does, skips.
|
||||
3. Deletes remaining entries and logs every outcome.
|
||||
|
||||
Usage:
|
||||
python3 cleanup-orphans.py --dry-run # show what would be deleted
|
||||
python3 cleanup-orphans.py --yes # delete without confirmation
|
||||
"""
|
||||
|
||||
import argparse
import json
import os
import re
import shlex
import subprocess
import sys
import urllib.request
from datetime import datetime, timezone
|
||||
|
||||
# Base URL of the Sonarr v3 HTTP API queried by main().
SONARR_URL = "http://localhost:8989/api/v3"

# Host name handed to `ssh` for every remote command.
SSH_HOST = "aya01"

# Directories on SSH_HOST: the download folder that gets cleaned, and the
# library folder consulted as a secondary "is this show known?" check.
DL_ROOT = "/media/downloads/sonarr"
SERIES_ROOT = "/media/series"

# The action log is written next to this script.
script_dir = os.path.dirname(os.path.abspath(__file__))
LOG_FILE = os.path.join(script_dir, "cleanup.log")

# The API key is the whole (whitespace-stripped) content of sonarr.api.env in
# the parent directory; it is read once, at import time.
with open(os.path.join(script_dir, '..', 'sonarr.api.env')) as f:
    SONARR_KEY = f.read().strip()
|
||||
|
||||
|
||||
def api_get(url):
    """Fetch *url* and return the response body parsed as JSON.

    The request uses a 30-second timeout. Errors raised by ``urllib`` or by
    the JSON decoder are not caught here and propagate to the caller.
    """
    response = urllib.request.urlopen(url, timeout=30)
    with response:
        return json.load(response)
|
||||
|
||||
|
||||
def norm(s):
    """Return *s* lowercased with everything but ASCII letters/digits removed.

    Used as the comparison key for titles, so punctuation, spacing and case
    differences do not prevent a match.
    """
    lowered = s.lower()
    return re.sub(r'[^a-z0-9]', '', lowered)
|
||||
|
||||
|
||||
def ssh_run(cmd):
    """Run *cmd* on SSH_HOST via ``ssh`` and return its stripped stdout.

    The return value never depends on the exit status, so callers that treat
    empty output as "nothing there" keep working. A non-zero exit status is
    reported on stderr, though, so that a failed connection or a failing
    remote command is not indistinguishable from genuinely empty output.
    """
    r = subprocess.run(['ssh', SSH_HOST, cmd], capture_output=True, text=True)
    if r.returncode != 0:
        detail = r.stderr.strip() or 'no stderr output'
        print(
            f"warning: ssh {SSH_HOST} exited with status {r.returncode}: {detail}",
            file=sys.stderr,
        )
    return r.stdout.strip()
|
||||
|
||||
|
||||
def ssh_exists(path):
    """Return True if *path* exists on the remote host, False otherwise."""
    # shlex.quote, not json.dumps: a JSON string is double-quoted, so the
    # remote shell would still expand `$`, backticks and backslashes in it,
    # and non-ASCII characters would arrive as literal \uXXXX escapes.
    quoted = shlex.quote(path)
    return ssh_run(f'[ -e {quoted} ] && echo yes || echo no') == 'yes'
|
||||
|
||||
|
||||
def ssh_size(path):
    """Return the size of remote *path* in bytes, or 0 if it can't be measured.

    The size comes from ``du -sb``; any output that is not a plain integer
    (including no output at all, e.g. when the path does not exist) yields 0.
    """
    # shlex.quote keeps the remote shell from expanding `$`/backticks in the
    # name and leaves non-ASCII characters intact (json.dumps did neither);
    # `--` stops a name starting with `-` from being read as an option.
    quoted = shlex.quote(path)
    out = ssh_run(f'du -sb -- {quoted} 2>/dev/null | cut -f1')
    try:
        return int(out)
    except ValueError:
        return 0
|
||||
|
||||
|
||||
def ssh_delete(path):
    """Recursively delete *path* on SSH_HOST.

    Returns a ``(ok, stderr)`` tuple: ``ok`` is True when the remote ``rm``
    exited with status 0, and ``stderr`` is its stripped error output.
    """
    # The path must reach `rm -rf` exactly as given. json.dumps is not shell
    # quoting: inside double quotes the remote shell still expands `$` and
    # backticks, and non-ASCII characters are turned into \uXXXX escapes, so
    # `rm -f` would "succeed" on a name that does not exist. shlex.quote
    # passes the bytes through untouched; `--` guards names starting with `-`.
    remote_cmd = f'rm -rf -- {shlex.quote(path)}'
    r = subprocess.run(['ssh', SSH_HOST, remote_cmd],
                       capture_output=True, text=True)
    return r.returncode == 0, r.stderr.strip()
|
||||
|
||||
|
||||
def log(line):
    """Echo *line* with a UTC timestamp prefix and append it to LOG_FILE."""
    stamp = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')
    entry = f"[{stamp}] {line}"
    print(entry)
    # Reopened in append mode on every call, so earlier entries are kept.
    with open(LOG_FILE, 'a') as fh:
        print(entry, file=fh)
|
||||
|
||||
|
||||
def extract_title(name):
    """Strip season/episode/quality tags to recover a bare show title.

    The substitutions below are applied in order; each one works on the
    result of the previous one, so the order is significant.
    """
    steps = (
        (r'\.(mkv|mp4|ts|avi)$', '', re.IGNORECASE),      # file extension
        (r'^\[.*?\]\s*', '', 0),                          # [Group] prefix
        (r'\s*\[.*?\]\s*', ' ', 0),                       # inline [tags]
        (r'[\.\s_\-]?[Ss]\d{1,2}[Ee]\d{1,2}.*$', '', 0),  # S01E02 and the rest
        (r'[\.\s_\-]?[Ss]\d{1,2}[\.\s_\-].*$', '', 0),    # S01<sep> and the rest
        (r'[\.\s_\-]?[Ss]\d{2}$', '', 0),                 # trailing S01
        (r'[\.\s_\-]?(19|20)\d{2}.*$', '', 0),            # year and the rest
        (r'[\.\s_\-]?\d{3,4}p.*$', '', 0),                # 1080p etc
        (r'[\.\-_]+', ' ', 0),                            # separators -> space
    )
    title = name
    for pattern, replacement, flags in steps:
        title = re.sub(pattern, replacement, title, flags=flags)
    return title.strip()
|
||||
|
||||
|
||||
def build_sonarr_index(series):
    """Map every normalised title variant of each series to its record.

    For each record the ``title``, ``titleSlug``, ``sortTitle`` and every
    ``alternateTitles[].title`` are indexed under ``norm(...)``. Empty
    variants are skipped; when two variants normalise to the same key the
    one seen last wins.
    """
    idx = {}
    for record in series:
        variants = [
            record['title'],
            record.get('titleSlug', ''),
            record.get('sortTitle', ''),
        ]
        # Alternate titles, if the record carries any, come after the main ones.
        variants.extend(
            alt.get('title', '') for alt in record.get('alternateTitles', [])
        )
        for variant in variants:
            if variant:
                idx[norm(variant)] = record
    return idx
|
||||
|
||||
|
||||
def find_in_sonarr(dl_name, idx):
    """Look up a download entry in the index built by build_sonarr_index().

    Returns ``(record, title)`` where ``title`` is the title extracted from
    *dl_name* and ``record`` is the matching series record, or None when
    neither an exact nor a prefix match is found.
    """
    title = extract_title(dl_name)
    key = norm(title)

    # Exact hit on the normalised title.
    if key in idx:
        return idx[key], title

    # Prefix match in either direction; both sides need at least 6
    # characters so that very short keys cannot match everything.
    if len(key) >= 6:
        for candidate, record in idx.items():
            if len(candidate) < 6:
                continue
            if key.startswith(candidate) or candidate.startswith(key):
                return record, title

    return None, title
|
||||
|
||||
|
||||
def confirm(prompt):
    """Ask *prompt* on stdin; return True only for an answer of "y" (any case)."""
    reply = input(f"{prompt} [y/N] ")
    return reply.strip().lower() == 'y'
|
||||
|
||||
|
||||
def _norm_dir(d):
    """Reduce a series directory name to a bare comparison key.

    Strips ``{...}`` tags and 4-digit years, then everything that is not an
    ASCII letter or digit, so "Bleach (2004) {imdb-...}" becomes "bleach".
    """
    d = re.sub(r'\{.*?\}', '', d)       # remove {imdb-...}
    d = re.sub(r'\(?\d{4}\)?', '', d)   # remove years
    return re.sub(r'[^a-z0-9]', '', d.lower())


def _remote_sizes(paths):
    """Return the byte size of each remote path, in order, via one SSH call.

    Every path yields exactly one output line ("0" when ``du`` prints
    nothing), so sizes cannot shift onto the wrong entry. If the number of
    lines still does not match, all sizes are reported as 0 (unknown).
    """
    if not paths:
        return []
    script = '; '.join(
        f's=$(du -sb -- {shlex.quote(p)} 2>/dev/null | cut -f1); echo "${{s:-0}}"'
        for p in paths
    )
    # shlex.quote at both levels: once per path inside the script, once for
    # the whole script handed to `bash -c` by the remote login shell.
    lines = ssh_run(f'bash -c {shlex.quote(script)}').splitlines()
    if len(lines) != len(paths):
        print(f"warning: expected {len(paths)} sizes, got {len(lines)}; "
              "sizes will be shown as 0", file=sys.stderr)
        return [0] * len(paths)
    sizes = []
    for line in lines:
        try:
            sizes.append(int(line))
        except ValueError:
            sizes.append(0)
    return sizes


def main():
    """Find download entries unknown to Sonarr and delete them, logging each step.

    Flow: fetch the Sonarr series list, list SERIES_ROOT and DL_ROOT over
    SSH, drop every download that matches a Sonarr title, drop every
    download whose title resembles a directory in SERIES_ROOT, then measure
    and delete the rest. ``--dry-run`` logs what would be deleted without
    deleting; ``--yes`` skips the interactive confirmation.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--dry-run', action='store_true')
    parser.add_argument('--yes', '-y', action='store_true')
    args = parser.parse_args()

    if args.dry_run:
        print("DRY-RUN — nothing will be deleted\n")

    log("=" * 60)
    log(f"cleanup-orphans.py started (dry_run={args.dry_run})")

    print("Fetching Sonarr series (including alternate titles)...")
    series = api_get(f"{SONARR_URL}/series?apikey={SONARR_KEY}")
    print(f" {len(series)} series")
    idx = build_sonarr_index(series)

    # Collect series dirs on disk for the secondary check.
    print("Fetching /media/series directory listing...")
    series_on_disk_raw = ssh_run(f'ls {shlex.quote(SERIES_ROOT)}/').splitlines()
    series_on_disk_raw = [d for d in series_on_disk_raw if d.strip()]
    if not series_on_disk_raw:
        # An empty listing (ssh failure, missing mount) would make the
        # on-disk check pass for every entry, so refuse to go on.
        log(f"Aborted: no directories listed under {SERIES_ROOT}; "
            "cannot run the on-disk safety check.")
        return
    series_on_disk_norm = {_norm_dir(d) for d in series_on_disk_raw}

    print("Fetching download listing...")
    dl_entries = ssh_run(f'ls {shlex.quote(DL_ROOT)}/').splitlines()
    dl_entries = [e.strip() for e in dl_entries if e.strip()]
    print(f" {len(dl_entries)} entries in {DL_ROOT}")

    # --- First pass: match against Sonarr ---
    not_in_sonarr = []
    in_sonarr = []
    for dl in dl_entries:
        rec, extracted_title = find_in_sonarr(dl, idx)
        if rec:
            in_sonarr.append((dl, rec['title']))
        else:
            not_in_sonarr.append((dl, extracted_title))

    print(f"\n Matched to Sonarr: {len(in_sonarr)}")
    print(f" NOT in Sonarr: {len(not_in_sonarr)}")

    # --- Second pass: check if a series dir exists on disk anyway ---
    skip_has_series_dir = []
    to_delete = []
    for dl, title in not_in_sonarr:
        title_n = norm(title)
        # Prefix match in either direction against dirs of 6+ characters.
        has_dir = any(
            d and len(d) >= 6 and (title_n.startswith(d) or d.startswith(title_n))
            for d in series_on_disk_norm
        )
        # Full remote path of the entry (not checked for existence here).
        dl_path = f"{DL_ROOT}/{dl}"
        if has_dir:
            skip_has_series_dir.append((dl, title, dl_path))
        else:
            to_delete.append((dl, title, dl_path))

    if skip_has_series_dir:
        print(f"\n SKIPPED (series dir found on disk, needs manual review): {len(skip_has_series_dir)}")
        for dl, title, _ in skip_has_series_dir:
            print(f" {title:40s} ← {dl[:60]}")

    print(f"\n{'='*60}")
    print(f"TO DELETE ({len(to_delete)} entries — not in Sonarr, no series dir on disk)")
    print(f"{'='*60}")

    # Measure all sizes in one batched SSH call.
    print("\nMeasuring sizes...")
    measured = _remote_sizes([path for _, _, path in to_delete])
    sizes = {dl: sz for (dl, _, _), sz in zip(to_delete, measured)}

    ordered = sorted(to_delete, key=lambda x: x[1])
    total_bytes = sum(sizes.values())
    for dl, title, path in ordered:
        sz = sizes.get(dl, 0)
        print(f" {sz/1e9:6.1f}G {title:40s} ← {dl[:60]}")

    print(f"\n Total: {total_bytes/1e9:.1f}G across {len(to_delete)} entries")

    if not to_delete:
        log("Nothing to delete.")
        return

    if not args.dry_run and not args.yes:
        if not confirm(f"\nDelete {len(to_delete)} entries?"):
            log("Aborted by user.")
            return

    # --- Delete with logging ---
    deleted_count = 0
    deleted_bytes = 0
    failed_count = 0

    for dl, title, path in ordered:
        sz = sizes.get(dl, 0)
        if args.dry_run:
            log(f"DRY-RUN | {sz/1e9:.2f}G | {title} | {path}")
            deleted_count += 1
            deleted_bytes += sz
            continue
        ok, err = ssh_delete(path)
        if ok:
            log(f"DELETED | {sz/1e9:.2f}G | {title} | {path}")
            deleted_count += 1
            deleted_bytes += sz
        else:
            log(f"FAILED | {sz/1e9:.2f}G | {title} | {path} | {err}")
            failed_count += 1

    log(f"DONE | deleted={deleted_count} | freed={deleted_bytes/1e9:.1f}G | failed={failed_count}")
|
||||
|
||||
|
||||
# Run the cleanup only when this file is executed as a script.
if __name__ == '__main__':
    main()
|
||||
Reference in New Issue
Block a user