Files
ansible/docs/runbooks/arr-cleanup/verify.py
Tuan-Dat Tran 5b44c46e10 docs(arr-cleanup): improve runbook and fix api key paths
Rewrites findings.md with how-to section, cleaner summary tables,
and more detailed per-pass results. Fixes relative path for
sonarr/radarr API key files after runbook moved deeper in repo.
2026-04-27 21:39:28 +02:00

247 lines
8.1 KiB
Python

#!/usr/bin/env python3
"""
Cross-reference /media/downloads/sonarr and /media/downloads/radarr against
the Sonarr/Radarr APIs, then verify reported file paths actually exist on disk.
Requirements:
- kubectl port-forwards active:
kubectl -n arr-stack port-forward svc/sonarr 8989:8989
kubectl -n arr-stack port-forward svc/radarr 7878:7878
- SSH access to aya01
- API keys in ../../../../sonarr.api.env and ../../../../radarr.api.env
Output:
/tmp/arr_verified.json — full structured results for use by cleanup.py
"""
import json
import os
import re
import shlex
import subprocess
import sys
import urllib.request
# Base URLs for the arr v3 APIs, reachable through the kubectl
# port-forwards described in the module docstring.
SONARR_URL = "http://localhost:8989/api/v3"
RADARR_URL = "http://localhost:7878/api/v3"
# SSH alias of the media host whose filesystem is verified.
SSH_HOST = "aya01"
# Directory containing this script; API key files are resolved relative to it.
script_dir = os.path.dirname(os.path.abspath(__file__))
def load_key(filename):
    """Read an API key file and return its contents, whitespace-stripped.

    The runbook lives four directories below the repo root, so *filename*
    is looked up at ../../../../<filename> relative to this script.
    """
    path = os.path.join(script_dir, '../../../..', filename)
    # Use a context manager so the handle is closed deterministically
    # (the original relied on garbage collection to close it).
    with open(path) as f:
        return f.read().strip()
# API keys live at the repo root, four levels above this runbook directory.
SONARR_KEY = load_key('sonarr.api.env')
RADARR_KEY = load_key('radarr.api.env')
def api_get(url):
    """GET *url* (30 s timeout) and return the parsed JSON body."""
    with urllib.request.urlopen(url, timeout=30) as resp:
        body = resp.read()
    return json.loads(body)
def norm(s):
    """Collapse *s* to its lowercase ASCII alphanumeric characters only."""
    keep = set('abcdefghijklmnopqrstuvwxyz0123456789')
    return ''.join(ch for ch in s.lower() if ch in keep)
def extract_title(name, is_movie):
    """Strip release tags from a download name to recover a bare title.

    Drops the container extension and any [bracketed] group tags, then cuts
    everything from the year (movies) or season/episode marker (series)
    onward, and finally normalises dot/dash/underscore runs to spaces.
    """
    stripped = re.sub(r'\.(mkv|mp4|avi|m4v)$', '', name, flags=re.IGNORECASE)
    stripped = re.sub(r'\[.*?\]', '', stripped)
    tail = (r'[\.\s_\-]?(19|20)\d{2}.*$' if is_movie
            else r'[\.\s_\-]?[Ss]\d{1,2}([Ee]\d{1,2})?.*$')
    stripped = re.sub(tail, '', stripped)
    return re.sub(r'[\.\-_]+', ' ', stripped).strip()
def build_index(records, key_fn):
    """Map every truthy key produced by ``key_fn(record)`` to its record.

    On key collisions the record seen later in *records* wins.
    """
    return {key: rec for rec in records for key in key_fn(rec) if key}
def find_match(dl_name, idx, is_movie):
    """Find the library record matching a download directory name.

    Tries an exact normalised-title lookup first, then falls back to a
    prefix match in either direction; keys of 5 characters or fewer are
    skipped to avoid spurious prefix hits. Returns the record or None.
    """
    wanted = norm(extract_title(dl_name, is_movie))
    if wanted in idx:
        return idx[wanted]
    for key, record in idx.items():
        if not key or len(key) <= 5:
            continue
        if wanted.startswith(key) or key.startswith(wanted):
            return record
    return None
def ssh_check_paths(paths):
    """Return ``(existing, missing)`` path sets, checked in one SSH call.

    Builds a single bash script that probes every path with ``[ -e ... ]``
    and tags each result, so only one round trip to SSH_HOST is needed.

    paths: iterable of absolute paths on the remote host.
    Returns a tuple of two sets of paths.
    Raises RuntimeError if the ssh invocation itself fails.
    """
    if not paths:
        return set(), set()
    # shlex.quote, not json.dumps: a double-quoted bash string would still
    # expand $, backticks and backslashes inside the path.
    cmds = '\n'.join(
        f'[ -e {shlex.quote(p)} ] && echo "EXISTS:{p}" || echo "MISSING:{p}"'
        for p in paths
    )
    r = subprocess.run(['ssh', SSH_HOST, 'bash', '-s'],
                       input=cmds, capture_output=True, text=True)
    if r.returncode != 0:
        # Fail loudly rather than silently classifying every path as missing.
        raise RuntimeError(f"ssh to {SSH_HOST} failed: {r.stderr.strip()}")
    existing, missing = set(), set()
    for line in r.stdout.splitlines():
        if line.startswith('EXISTS:'):
            existing.add(line[len('EXISTS:'):])
        elif line.startswith('MISSING:'):
            missing.add(line[len('MISSING:'):])
    return existing, missing
def main():
    """Cross-reference downloads with the arr APIs and verify paths on disk.

    Fetches the full movie/series lists, matches each download directory to
    a library record, batch-checks the reported paths over SSH, writes the
    structured results to /tmp/arr_verified.json (consumed by cleanup.py)
    and prints a human-readable summary.
    """
    print("Fetching Radarr movies...")
    radarr_movies = api_get(f"{RADARR_URL}/movie?apikey={RADARR_KEY}")
    print(f" {len(radarr_movies)} movies")
    print("Fetching Sonarr series...")
    sonarr_series = api_get(f"{SONARR_URL}/series?apikey={SONARR_KEY}")
    print(f" {len(sonarr_series)} series")

    # Radarr index: normalised title, with and without the release year.
    def radarr_keys(m):
        return [norm(m['title']), norm(f"{m['title']}{m.get('year','')}")]
    radarr_idx = build_index(radarr_movies, radarr_keys)

    # Enrich radarr records with host-side disk paths. Radarr reports
    # /movies/... inside its pod; on the host that is /media/movies/....
    for m in radarr_movies:
        mf = m.get('movieFile')
        m['_file_path'] = (
            mf['path'].replace('/movies/', '/media/movies/', 1) if mf and mf.get('path') else None
        )
        m['_dir_path'] = m.get('path', '').replace('/movies/', '/media/movies/', 1)

    # Sonarr index: normalised title only.
    def sonarr_keys(s):
        return [norm(s['title'])]
    sonarr_idx = build_index(sonarr_series, sonarr_keys)
    for s in sonarr_series:
        # Sonarr reports /tv/...; on the host that is /media/series/....
        s['_dir_path'] = s.get('path', '').replace('/tv/', '/media/series/', 1)

    # Download listings: one SSH call covers both apps.
    print(f"\nFetching download listings from {SSH_HOST}...")
    r = subprocess.run(
        ['ssh', SSH_HOST, 'ls /media/downloads/sonarr/ && echo "===RADARR===" && ls /media/downloads/radarr/'],
        capture_output=True, text=True
    )
    parts = r.stdout.split('===RADARR===\n')
    if len(parts) < 2:
        # Previously an IndexError: if ssh or either ls fails the marker
        # never appears; exit with a clear message instead.
        sys.exit(f"Could not list downloads on {SSH_HOST}: {r.stderr.strip()}")
    sonarr_dls = [l.strip() for l in parts[0].splitlines() if l.strip()]
    radarr_dls = [l.strip() for l in parts[1].splitlines() if l.strip()]
    print(f" Sonarr downloads: {len(sonarr_dls)}")
    print(f" Radarr downloads: {len(radarr_dls)}")

    # Match each download to a library record and collect paths to verify.
    radarr_matched, radarr_orphans = [], []
    for dl in radarr_dls:
        rec = find_match(dl, radarr_idx, is_movie=True)
        if rec is None:
            radarr_orphans.append(dl)
        else:
            # Prefer the imported file path; fall back to the movie dir.
            check_path = rec['_file_path'] or rec['_dir_path']
            radarr_matched.append({
                'dl': dl,
                'title': rec['title'],
                'year': rec.get('year'),
                'hasFile': rec.get('hasFile', False),
                'monitored': rec.get('monitored'),
                'check_path': check_path,
            })
    sonarr_matched, sonarr_orphans = [], []
    for dl in sonarr_dls:
        rec = find_match(dl, sonarr_idx, is_movie=False)
        if rec is None:
            sonarr_orphans.append(dl)
        else:
            stats = rec.get('statistics', {})
            sonarr_matched.append({
                'dl': dl,
                'title': rec['title'],
                'episodeFileCount': stats.get('episodeFileCount', 0),
                'totalEpisodeCount': stats.get('totalEpisodeCount', 0),
                'percentOfEpisodes': stats.get('percentOfEpisodes', 0),
                'monitored': rec.get('monitored'),
                # BUG FIX: the API's series status (e.g. "ended") was stored
                # under 'status' and then silently clobbered by the
                # classification pass below; keep it under its own key.
                'series_status': rec.get('status'),
                'check_path': rec['_dir_path'],
            })

    # Batch disk verification — one SSH round trip for all unique paths.
    all_paths = list(set(
        [m['check_path'] for m in radarr_matched if m['check_path']] +
        [m['check_path'] for m in sonarr_matched if m['check_path']]
    ))
    print(f"\nVerifying {len(all_paths)} paths on disk...")
    existing, missing = ssh_check_paths(all_paths)
    print(f" {len(existing)} exist, {len(missing)} missing")

    # Classification:
    #   not_imported — the arr app has no file on record for this match
    #   safe         — the arr-reported path exists on disk
    #   path_missing — a file is claimed but the path is gone (review!)
    def classify_radarr(m):
        if not m['hasFile'] or not m['check_path']:
            return 'not_imported'
        if m['check_path'] in existing:
            return 'safe'
        return 'path_missing'

    def classify_sonarr(m):
        if m['episodeFileCount'] == 0 or not m['check_path']:
            return 'not_imported'
        if m['check_path'] in existing:
            return 'safe'
        return 'path_missing'

    for m in radarr_matched:
        m['status'] = classify_radarr(m)
    for m in sonarr_matched:
        m['status'] = classify_sonarr(m)

    # Full structured results for cleanup.py.
    result = {
        'radarr_matched': radarr_matched,
        'radarr_orphans': radarr_orphans,
        'sonarr_matched': sonarr_matched,
        'sonarr_orphans': sonarr_orphans,
        'existing_paths': list(existing),
        'missing_paths': list(missing),
    }
    out_path = '/tmp/arr_verified.json'
    with open(out_path, 'w') as f:
        json.dump(result, f, indent=2)
    print(f"\nResults written to {out_path}")

    # Human-readable summary.
    r_safe = [m for m in radarr_matched if m['status'] == 'safe']
    r_miss = [m for m in radarr_matched if m['status'] == 'path_missing']
    r_noimp = [m for m in radarr_matched if m['status'] == 'not_imported']
    s_safe = [m for m in sonarr_matched if m['status'] == 'safe']
    s_miss = [m for m in sonarr_matched if m['status'] == 'path_missing']
    s_noimp = [m for m in sonarr_matched if m['status'] == 'not_imported']
    print("\n" + "="*60)
    print("SUMMARY")
    print("="*60)
    print(f"Radarr: {len(r_safe)} safe | {len(r_miss)} path missing | {len(r_noimp)} not imported | {len(radarr_orphans)} orphans")
    print(f"Sonarr: {len(s_safe)} safe | {len(s_miss)} path missing | {len(s_noimp)} not imported | {len(sonarr_orphans)} orphans")
    if r_miss:
        print("\nRadarr path_missing (review manually):")
        for m in r_miss:
            # ' -> ' separator added: title and path previously ran together.
            print(f" {m['title']} -> {m['check_path']}")
            print(f" DL: {m['dl']}")
    if s_miss:
        print("\nSonarr path_missing (review manually):")
        for m in s_miss:
            print(f" {m['title']} -> {m['check_path']}")
            print(f" DL: {m['dl']}")
# Allow importing this module (e.g. from cleanup.py) without side effects.
if __name__ == '__main__':
    main()