""" Audio separation service using Demucs for vocal/music separation. Also includes speech vs singing detection. """ import subprocess import os import shutil from typing import Optional, Tuple from pathlib import Path # Demucs runs in a separate Python 3.11 environment due to compatibility issues DEMUCS_VENV_PATH = os.path.join( os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))), "venv_demucs" ) DEMUCS_PYTHON = os.path.join(DEMUCS_VENV_PATH, "bin", "python") async def separate_vocals( input_path: str, output_dir: str, model: str = "htdemucs" ) -> Tuple[bool, str, Optional[str], Optional[str]]: """ Separate vocals from background music using Demucs. Args: input_path: Path to input audio/video file output_dir: Directory to save separated tracks model: Demucs model to use (htdemucs, htdemucs_ft, mdx_extra) Returns: Tuple of (success, message, vocals_path, no_vocals_path) """ if not os.path.exists(input_path): return False, f"Input file not found: {input_path}", None, None os.makedirs(output_dir, exist_ok=True) # Check if Demucs venv exists if not os.path.exists(DEMUCS_PYTHON): return False, f"Demucs environment not found at {DEMUCS_VENV_PATH}. Run setup script.", None, None # Run Demucs for two-stem separation (vocals vs accompaniment) cmd = [ DEMUCS_PYTHON, "-m", "demucs", "--two-stems=vocals", "-n", model, "-o", output_dir, input_path ] try: print(f"Running Demucs separation: {' '.join(cmd)}") result = subprocess.run( cmd, capture_output=True, text=True, timeout=600, # 10 minute timeout ) if result.returncode != 0: error_msg = result.stderr[-500:] if result.stderr else "Unknown error" return False, f"Demucs error: {error_msg}", None, None # Find output files # Demucs outputs to: output_dir/model_name/track_name/vocals.wav, no_vocals.wav input_name = Path(input_path).stem demucs_output = os.path.join(output_dir, model, input_name) vocals_path = os.path.join(demucs_output, "vocals.wav") no_vocals_path = os.path.join(demucs_output, "no_vocals.wav") if not os.path.exists(vocals_path): return False, "Vocals file not created", None, None # Move files to simpler location final_vocals = os.path.join(output_dir, "vocals.wav") final_no_vocals = os.path.join(output_dir, "no_vocals.wav") shutil.move(vocals_path, final_vocals) if os.path.exists(no_vocals_path): shutil.move(no_vocals_path, final_no_vocals) # Clean up Demucs output directory shutil.rmtree(os.path.join(output_dir, model), ignore_errors=True) return True, "Vocals separated successfully", final_vocals, final_no_vocals except subprocess.TimeoutExpired: return False, "Separation timed out", None, None except FileNotFoundError: return False, "Demucs not installed. Run: pip install demucs", None, None except Exception as e: return False, f"Separation error: {str(e)}", None, None async def analyze_vocal_type( vocals_path: str, speech_threshold: float = 0.7 ) -> Tuple[str, float]: """ Analyze if vocal track contains speech or singing. Uses multiple heuristics: 1. Speech has more silence gaps (pauses between words) 2. Speech has more varied pitch changes 3. Singing has more sustained notes Args: vocals_path: Path to vocals audio file speech_threshold: Threshold for speech detection (0-1) Returns: Tuple of (vocal_type, confidence) vocal_type: "speech", "singing", or "mixed" """ if not os.path.exists(vocals_path): return "unknown", 0.0 # Analyze silence ratio using FFmpeg # Speech typically has 30-50% silence, singing has less silence_ratio = await _get_silence_ratio(vocals_path) # Analyze zero-crossing rate (speech has higher ZCR variance) zcr_variance = await _get_zcr_variance(vocals_path) # Analyze spectral flatness (speech has higher flatness) spectral_score = await _get_spectral_analysis(vocals_path) # Combine scores speech_score = 0.0 # High silence ratio indicates speech (pauses between sentences) if silence_ratio > 0.25: speech_score += 0.4 elif silence_ratio > 0.15: speech_score += 0.2 # High spectral variance indicates speech if spectral_score > 0.5: speech_score += 0.3 elif spectral_score > 0.3: speech_score += 0.15 # ZCR variance if zcr_variance > 0.5: speech_score += 0.3 elif zcr_variance > 0.3: speech_score += 0.15 # Determine type # speech_threshold=0.7: High confidence speech # singing_threshold=0.4: Below this is likely singing (music) # Between 0.4-0.7: Mixed or uncertain if speech_score >= speech_threshold: return "speech", speech_score elif speech_score < 0.4: return "singing", 1.0 - speech_score else: # For mixed, lean towards singing if score is closer to lower bound # This helps avoid transcribing song lyrics as speech return "mixed", speech_score async def _get_silence_ratio(audio_path: str, threshold_db: float = -35) -> float: """Get ratio of silence in audio file.""" cmd = [ "ffmpeg", "-i", audio_path, "-af", f"silencedetect=noise={threshold_db}dB:d=0.3", "-f", "null", "-" ] try: result = subprocess.run(cmd, capture_output=True, text=True, timeout=60) stderr = result.stderr # Count silence periods silence_count = stderr.count("silence_end") # Get total duration duration = await _get_audio_duration(audio_path) if not duration or duration == 0: return 0.0 # Parse total silence duration total_silence = 0.0 lines = stderr.split('\n') for line in lines: if 'silence_duration' in line: try: dur = float(line.split('silence_duration:')[1].strip().split()[0]) total_silence += dur except (IndexError, ValueError): pass return min(total_silence / duration, 1.0) except Exception: return 0.0 async def _get_zcr_variance(audio_path: str) -> float: """Get zero-crossing rate variance (simplified estimation).""" # Use FFmpeg to analyze audio stats cmd = [ "ffmpeg", "-i", audio_path, "-af", "astats=metadata=1:reset=1", "-f", "null", "-" ] try: result = subprocess.run(cmd, capture_output=True, text=True, timeout=60) stderr = result.stderr # Look for RMS level variations as proxy for ZCR variance rms_values = [] for line in stderr.split('\n'): if 'RMS_level' in line: try: val = float(line.split(':')[1].strip().split()[0]) if val != float('-inf'): rms_values.append(val) except (IndexError, ValueError): pass if len(rms_values) > 1: mean_rms = sum(rms_values) / len(rms_values) variance = sum((x - mean_rms) ** 2 for x in rms_values) / len(rms_values) # Normalize to 0-1 range return min(variance / 100, 1.0) return 0.3 # Default moderate value except Exception: return 0.3 async def _get_spectral_analysis(audio_path: str) -> float: """Analyze spectral characteristics (speech has more flat spectrum).""" # Use volume detect as proxy for spectral analysis cmd = [ "ffmpeg", "-i", audio_path, "-af", "volumedetect", "-f", "null", "-" ] try: result = subprocess.run(cmd, capture_output=True, text=True, timeout=60) stderr = result.stderr mean_vol = None max_vol = None for line in stderr.split('\n'): if 'mean_volume' in line: try: mean_vol = float(line.split(':')[1].strip().replace(' dB', '')) except (IndexError, ValueError): pass elif 'max_volume' in line: try: max_vol = float(line.split(':')[1].strip().replace(' dB', '')) except (IndexError, ValueError): pass if mean_vol is not None and max_vol is not None: # Large difference between mean and max indicates speech dynamics diff = abs(max_vol - mean_vol) # Speech typically has 15-25dB dynamic range if diff > 20: return 0.7 elif diff > 12: return 0.5 else: return 0.2 return 0.3 except Exception: return 0.3 async def _get_audio_duration(audio_path: str) -> Optional[float]: """Get audio duration in seconds.""" cmd = [ "ffprobe", "-v", "error", "-show_entries", "format=duration", "-of", "default=noprint_wrappers=1:nokey=1", audio_path ] try: result = subprocess.run(cmd, capture_output=True, text=True, timeout=30) if result.returncode == 0: return float(result.stdout.strip()) except Exception: pass return None async def check_demucs_available() -> bool: """Check if Demucs is installed in the dedicated environment.""" if not os.path.exists(DEMUCS_PYTHON): return False try: result = subprocess.run( [DEMUCS_PYTHON, "-m", "demucs", "--help"], capture_output=True, timeout=10 ) return result.returncode == 0 except Exception: return False