# bini-shorts-maker/backend/app/services/audio_separator.py

"""
Audio separation service using Demucs for vocal/music separation.
Also includes speech vs singing detection.
"""
import asyncio
import functools
import math
import os
import shutil
import subprocess
from pathlib import Path
from typing import Optional, Tuple

# Demucs is run from its own Python 3.11 virtualenv because of compatibility
# issues with the main environment. The venv directory ("venv_demucs") is
# resolved relative to this file: three directory levels up from it.
_THIS_FILE = os.path.abspath(__file__)
_VENV_PARENT = os.path.dirname(os.path.dirname(os.path.dirname(_THIS_FILE)))

DEMUCS_VENV_PATH = os.path.join(_VENV_PARENT, "venv_demucs")
# Interpreter inside the venv that is used to launch ``python -m demucs``.
DEMUCS_PYTHON = os.path.join(DEMUCS_VENV_PATH, "bin", "python")


async def separate_vocals(
    input_path: str,
    output_dir: str,
    model: str = "htdemucs"
) -> Tuple[bool, str, Optional[str], Optional[str]]:
    """
    Separate vocals from background music using Demucs.

    Demucs is launched as ``<DEMUCS_PYTHON> -m demucs --two-stems=vocals``.
    The blocking subprocess call is executed in the event loop's default
    executor so that other coroutines keep running while the separation
    (which may take up to the 10 minute timeout) is in progress.

    Args:
        input_path: Path to input audio/video file
        output_dir: Directory to save separated tracks
        model: Demucs model to use (htdemucs, htdemucs_ft, mdx_extra)

    Returns:
        Tuple of (success, message, vocals_path, no_vocals_path).
        On failure both paths are None. On success ``no_vocals_path`` is
        None when Demucs did not produce a ``no_vocals.wav`` file.
    """
    if not os.path.exists(input_path):
        return False, f"Input file not found: {input_path}", None, None

    # Report an unusable output directory through the normal result tuple
    # instead of letting the OSError escape to the caller.
    try:
        os.makedirs(output_dir, exist_ok=True)
    except OSError as e:
        return False, f"Separation error: {str(e)}", None, None

    # Check if Demucs venv exists
    if not os.path.exists(DEMUCS_PYTHON):
        return False, f"Demucs environment not found at {DEMUCS_VENV_PATH}. Run setup script.", None, None

    # Run Demucs for two-stem separation (vocals vs accompaniment)
    cmd = [
        DEMUCS_PYTHON, "-m", "demucs",
        "--two-stems=vocals",
        "-n", model,
        "-o", output_dir,
        input_path
    ]

    try:
        print(f"Running Demucs separation: {' '.join(cmd)}")
        run_demucs = functools.partial(
            subprocess.run,
            cmd,
            capture_output=True,
            text=True,
            # Never fail on undecodable bytes in the tool's output.
            errors="replace",
            timeout=600,  # 10 minute timeout
        )
        # subprocess.run blocks; run it in a worker thread so the event loop
        # stays responsive. Exceptions (e.g. TimeoutExpired) propagate here.
        loop = asyncio.get_running_loop()
        result = await loop.run_in_executor(None, run_demucs)

        if result.returncode != 0:
            error_msg = result.stderr[-500:] if result.stderr else "Unknown error"
            return False, f"Demucs error: {error_msg}", None, None

        # Find output files
        # Demucs outputs to: output_dir/model_name/track_name/vocals.wav, no_vocals.wav
        model_dir = os.path.join(output_dir, model)
        demucs_output = os.path.join(model_dir, Path(input_path).stem)

        vocals_path = os.path.join(demucs_output, "vocals.wav")
        no_vocals_path = os.path.join(demucs_output, "no_vocals.wav")

        if not os.path.exists(vocals_path):
            return False, "Vocals file not created", None, None

        # Move files to simpler location
        final_vocals = os.path.join(output_dir, "vocals.wav")
        shutil.move(vocals_path, final_vocals)

        # Only report an accompaniment path if the file was really produced;
        # otherwise callers would receive a path that does not exist (or that
        # points at a stale file from an earlier run).
        final_no_vocals: Optional[str] = None
        if os.path.exists(no_vocals_path):
            final_no_vocals = os.path.join(output_dir, "no_vocals.wav")
            shutil.move(no_vocals_path, final_no_vocals)

        # Clean up Demucs output directory
        shutil.rmtree(model_dir, ignore_errors=True)

        return True, "Vocals separated successfully", final_vocals, final_no_vocals

    except subprocess.TimeoutExpired:
        return False, "Separation timed out", None, None
    except FileNotFoundError:
        return False, "Demucs not installed. Run: pip install demucs", None, None
    except Exception as e:
        return False, f"Separation error: {str(e)}", None, None


async def analyze_vocal_type(
    vocals_path: str,
    speech_threshold: float = 0.7
) -> Tuple[str, float]:
    """
    Classify a vocal track as speech, singing, or a mix of both.

    Three measurements are collected from helper coroutines (silence ratio,
    a ZCR-variance estimate and a spectral score). Each one contributes a
    fixed number of points to a cumulative "speech score" depending on which
    of two cut-offs it exceeds; the total is then compared with the
    thresholds below.

    Args:
        vocals_path: Path to vocals audio file
        speech_threshold: Minimum speech score (0-1) to report "speech"

    Returns:
        Tuple of (vocal_type, confidence):
        - ("unknown", 0.0) if ``vocals_path`` does not exist
        - ("speech", score) if score >= ``speech_threshold``
        - ("singing", 1.0 - score) if score < 0.4
        - ("mixed", score) otherwise
    """
    if not os.path.exists(vocals_path):
        return "unknown", 0.0

    def _points(measure: float, high_cut: float, high_pts: float,
                low_cut: float, low_pts: float) -> float:
        # Two-tier scoring: full points above the high cut-off, partial
        # points above the low cut-off, nothing otherwise.
        if measure > high_cut:
            return high_pts
        if measure > low_cut:
            return low_pts
        return 0.0

    # Gather the three measurements (same order as they are scored below is
    # not required; they are independent of each other).
    silence_ratio = await _get_silence_ratio(vocals_path)
    zcr_variance = await _get_zcr_variance(vocals_path)
    spectral_score = await _get_spectral_analysis(vocals_path)

    speech_score = 0.0
    # Lots of silence suggests pauses between spoken sentences.
    speech_score += _points(silence_ratio, 0.25, 0.4, 0.15, 0.2)
    # A high spectral score counts towards speech.
    speech_score += _points(spectral_score, 0.5, 0.3, 0.3, 0.15)
    # So does a high ZCR-variance estimate.
    speech_score += _points(zcr_variance, 0.5, 0.3, 0.3, 0.15)

    # At or above speech_threshold: confident speech.
    if speech_score >= speech_threshold:
        return "speech", speech_score
    # From 0.4 up to the speech threshold: mixed / uncertain.
    if speech_score >= 0.4:
        return "mixed", speech_score
    # Below 0.4: most likely singing; confidence is the complement.
    return "singing", 1.0 - speech_score


async def _get_silence_ratio(audio_path: str, threshold_db: float = -35) -> float:
    """
    Get the fraction of the audio file that is silent.

    Runs FFmpeg's ``silencedetect`` filter (silences of at least 0.3 s below
    ``threshold_db``), sums every reported ``silence_duration`` and divides
    by the total duration obtained from ``_get_audio_duration``.

    Args:
        audio_path: Path to the audio file to analyse
        threshold_db: Noise floor in dB below which audio counts as silence

    Returns:
        Silence ratio clamped to at most 1.0. Returns 0.0 when the duration
        is unknown or non-positive, or when any step fails (best effort).
    """
    cmd = [
        "ffmpeg", "-i", audio_path,
        "-af", f"silencedetect=noise={threshold_db}dB:d=0.3",
        "-f", "null", "-"
    ]

    try:
        run_ffmpeg = functools.partial(
            subprocess.run,
            cmd,
            capture_output=True,
            text=True,
            # FFmpeg echoes file names/tags; never fail on odd bytes.
            errors="replace",
            timeout=60,
        )
        # subprocess.run blocks, so execute it in a worker thread to keep
        # the event loop responsive.
        loop = asyncio.get_running_loop()
        result = await loop.run_in_executor(None, run_ffmpeg)

        # Get total duration
        duration = await _get_audio_duration(audio_path)
        if not duration or duration <= 0:
            return 0.0

        # Sum every "silence_duration: <seconds>" entry from the filter log.
        total_silence = 0.0
        for line in result.stderr.splitlines():
            _, marker, tail = line.partition("silence_duration:")
            if not marker:
                continue
            try:
                total_silence += float(tail.split()[0])
            except (IndexError, ValueError):
                continue

        return min(total_silence / duration, 1.0)

    except Exception:
        # Deliberate best effort: analysis problems must not break callers.
        return 0.0


async def _get_zcr_variance(audio_path: str) -> float:
    """
    Estimate zero-crossing-rate variance (simplified estimation).

    The variance of the per-frame RMS level (in dB) is used as a proxy.
    FFmpeg's ``astats`` filter with ``metadata=1:reset=1`` attaches per-frame
    statistics to frame metadata; those values are NOT written to the log,
    so an ``ametadata=print`` stage is chained to print the
    ``lavfi.astats.Overall.RMS_level`` entry of every frame to stdout as
    ``key=value`` lines, which are parsed here.

    Args:
        audio_path: Path to the audio file to analyse

    Returns:
        Population variance of the finite RMS values divided by 100 and
        clamped to at most 1.0. Returns 0.3 (a moderate default) when fewer
        than two values are available or when any step fails.
    """
    rms_key = "lavfi.astats.Overall.RMS_level"
    cmd = [
        "ffmpeg", "-i", audio_path,
        "-af", f"astats=metadata=1:reset=1,ametadata=print:key={rms_key}:file=-",
        "-f", "null", "-"
    ]

    try:
        run_ffmpeg = functools.partial(
            subprocess.run,
            cmd,
            capture_output=True,
            text=True,
            errors="replace",
            timeout=60,
        )
        # subprocess.run blocks, so execute it in a worker thread to keep
        # the event loop responsive.
        loop = asyncio.get_running_loop()
        result = await loop.run_in_executor(None, run_ffmpeg)

        # Collect RMS level variations as proxy for ZCR variance
        rms_values = []
        for line in result.stdout.splitlines():
            _, marker, raw = line.partition(rms_key + "=")
            if not marker:
                continue
            try:
                val = float(raw.strip())
            except ValueError:
                continue
            # Silent frames report -inf (and NaN is possible); both would
            # poison the mean/variance, so keep finite values only.
            if math.isfinite(val):
                rms_values.append(val)

        if len(rms_values) > 1:
            mean_rms = sum(rms_values) / len(rms_values)
            variance = sum((x - mean_rms) ** 2 for x in rms_values) / len(rms_values)
            # Normalize to 0-1 range
            return min(variance / 100, 1.0)

        return 0.3  # Default moderate value

    except Exception:
        # Deliberate best effort: fall back to the moderate default.
        return 0.3


async def _get_spectral_analysis(audio_path: str) -> float:
    """
    Score the track's level dynamics as a stand-in for spectral analysis.

    Runs FFmpeg's ``volumedetect`` filter and compares the reported
    ``mean_volume`` and ``max_volume`` (both in dB). A larger gap between
    them is treated as more speech-like.

    Args:
        audio_path: Path to the audio file to analyse

    Returns:
        0.7 if the gap exceeds 20 dB, 0.5 if it exceeds 12 dB, else 0.2.
        Returns 0.3 when either value could not be read or any step fails.
    """
    cmd = [
        "ffmpeg", "-i", audio_path,
        "-af", "volumedetect",
        "-f", "null", "-"
    ]

    try:
        run_ffmpeg = functools.partial(
            subprocess.run,
            cmd,
            capture_output=True,
            text=True,
            errors="replace",
            timeout=60,
        )
        # subprocess.run blocks, so execute it in a worker thread to keep
        # the event loop responsive.
        loop = asyncio.get_running_loop()
        result = await loop.run_in_executor(None, run_ffmpeg)

        # Parse "<key>: <value> dB" anchored on the key itself so that other
        # colons in the log prefix cannot shift the field position.
        volumes = {}
        for line in result.stderr.splitlines():
            for key in ("mean_volume", "max_volume"):
                _, marker, raw = line.partition(key + ":")
                if not marker:
                    continue
                try:
                    volumes[key] = float(raw.replace("dB", "").strip())
                except ValueError:
                    pass
                break

        mean_vol = volumes.get("mean_volume")
        max_vol = volumes.get("max_volume")
        if mean_vol is None or max_vol is None:
            return 0.3

        # Large difference between mean and max indicates speech dynamics
        # (speech typically has 15-25dB dynamic range).
        diff = abs(max_vol - mean_vol)
        if diff > 20:
            return 0.7
        if diff > 12:
            return 0.5
        return 0.2

    except Exception:
        # Deliberate best effort: fall back to the neutral default.
        return 0.3


async def _get_audio_duration(audio_path: str) -> Optional[float]:
    """
    Get audio duration in seconds using ffprobe.

    Args:
        audio_path: Path to the media file to probe

    Returns:
        The container duration as a float, or None when ffprobe exits with a
        non-zero status, cannot be run, or prints something that is not a
        number.
    """
    cmd = [
        "ffprobe",
        "-v", "error",
        "-show_entries", "format=duration",
        "-of", "default=noprint_wrappers=1:nokey=1",
        audio_path
    ]

    try:
        run_ffprobe = functools.partial(
            subprocess.run,
            cmd,
            capture_output=True,
            text=True,
            errors="replace",
            timeout=30,
        )
        # subprocess.run blocks, so execute it in a worker thread to keep
        # the event loop responsive.
        loop = asyncio.get_running_loop()
        result = await loop.run_in_executor(None, run_ffprobe)
        if result.returncode == 0:
            return float(result.stdout.strip())
    except Exception:
        # Best effort: any failure is reported as "duration unknown".
        pass

    return None


async def check_demucs_available() -> bool:
    """
    Check if Demucs is installed in the dedicated environment.

    Returns:
        True when the interpreter at ``DEMUCS_PYTHON`` exists and
        ``python -m demucs --help`` exits with status 0 within 10 seconds;
        False otherwise (including on timeout or any other error).
    """
    if not os.path.exists(DEMUCS_PYTHON):
        return False

    try:
        probe = functools.partial(
            subprocess.run,
            [DEMUCS_PYTHON, "-m", "demucs", "--help"],
            capture_output=True,
            timeout=10,
        )
        # subprocess.run blocks, so execute it in a worker thread to keep
        # the event loop responsive while the probe runs.
        loop = asyncio.get_running_loop()
        result = await loop.run_in_executor(None, probe)
        return result.returncode == 0
    except Exception:
        return False