Files
bini-shorts-maker/backend/app/services/audio_separator.py
kihong.kim c3795138da Initial commit: YouTube Shorts maker application
Features:
- Video download from TikTok/Douyin using yt-dlp
- Audio transcription with OpenAI Whisper
- GPT-4 translation (direct/summarize/rewrite modes)
- Subtitle generation with ASS format
- Video trimming with frame-accurate preview
- BGM integration with volume control
- Intro text overlay support
- Thumbnail generation with text overlay

Tech stack:
- Backend: FastAPI, Python 3.11+
- Frontend: React, Vite, TailwindCSS
- Video processing: FFmpeg
- AI: OpenAI Whisper, GPT-4

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-03 21:38:34 +09:00

318 lines
9.8 KiB
Python

"""
Audio separation service using Demucs for vocal/music separation.
Also includes speech vs singing detection.
"""
import asyncio
import math
import os
import shutil
import subprocess
from pathlib import Path
from typing import Optional, Tuple
# Demucs runs in a separate Python 3.11 environment due to compatibility issues.
# The environment directory ("venv_demucs") is looked up in the third ancestor
# directory of this file.
DEMUCS_VENV_PATH = os.path.join(
    os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))),
    "venv_demucs",
)
# A venv keeps its interpreter in "Scripts\python.exe" on Windows and in
# "bin/python" everywhere else, so pick the layout of the running platform.
DEMUCS_PYTHON = (
    os.path.join(DEMUCS_VENV_PATH, "Scripts", "python.exe")
    if os.name == "nt"
    else os.path.join(DEMUCS_VENV_PATH, "bin", "python")
)
async def separate_vocals(
    input_path: str,
    output_dir: str,
    model: str = "htdemucs"
) -> Tuple[bool, str, Optional[str], Optional[str]]:
    """
    Separate vocals from background music using Demucs.

    Demucs is invoked as ``DEMUCS_PYTHON -m demucs --two-stems=vocals`` and the
    resulting ``vocals.wav`` / ``no_vocals.wav`` are moved directly into
    ``output_dir``. The subprocess is executed in a worker thread so the event
    loop is not blocked for the duration of the separation (up to 10 minutes).

    Args:
        input_path: Path to input audio/video file
        output_dir: Directory to save separated tracks (created if missing)
        model: Demucs model to use (htdemucs, htdemucs_ft, mdx_extra)

    Returns:
        Tuple of (success, message, vocals_path, no_vocals_path).
        Both paths are None on failure. Errors raised while running Demucs or
        moving its output are reported through the tuple instead of being
        raised. On success ``no_vocals_path`` is the destination path of the
        accompaniment stem; the file is only moved there if Demucs produced it.
    """
    if not os.path.exists(input_path):
        return False, f"Input file not found: {input_path}", None, None

    os.makedirs(output_dir, exist_ok=True)

    # Check if Demucs venv exists
    if not os.path.exists(DEMUCS_PYTHON):
        return False, f"Demucs environment not found at {DEMUCS_VENV_PATH}. Run setup script.", None, None

    # Run Demucs for two-stem separation (vocals vs accompaniment)
    cmd = [
        DEMUCS_PYTHON, "-m", "demucs",
        "--two-stems=vocals",
        "-n", model,
        "-o", output_dir,
        input_path
    ]

    try:
        print(f"Running Demucs separation: {' '.join(cmd)}")
        # subprocess.run blocks until Demucs exits; running it via to_thread
        # keeps other coroutines (e.g. concurrent requests) responsive.
        # errors="replace" prevents undecodable bytes in Demucs' output from
        # raising and turning a successful separation into a failure.
        result = await asyncio.to_thread(
            subprocess.run,
            cmd,
            capture_output=True,
            text=True,
            errors="replace",
            timeout=600,  # 10 minute timeout
        )

        if result.returncode != 0:
            # Only the tail of stderr is returned to keep the message short.
            error_msg = result.stderr[-500:] if result.stderr else "Unknown error"
            return False, f"Demucs error: {error_msg}", None, None

        # Find output files
        # Demucs outputs to: output_dir/model_name/track_name/vocals.wav, no_vocals.wav
        input_name = Path(input_path).stem
        demucs_output = os.path.join(output_dir, model, input_name)
        vocals_path = os.path.join(demucs_output, "vocals.wav")
        no_vocals_path = os.path.join(demucs_output, "no_vocals.wav")

        if not os.path.exists(vocals_path):
            return False, "Vocals file not created", None, None

        # Move files to simpler location
        final_vocals = os.path.join(output_dir, "vocals.wav")
        final_no_vocals = os.path.join(output_dir, "no_vocals.wav")
        shutil.move(vocals_path, final_vocals)
        if os.path.exists(no_vocals_path):
            shutil.move(no_vocals_path, final_no_vocals)

        # Clean up Demucs output directory
        shutil.rmtree(os.path.join(output_dir, model), ignore_errors=True)

        return True, "Vocals separated successfully", final_vocals, final_no_vocals
    except subprocess.TimeoutExpired:
        return False, "Separation timed out", None, None
    except FileNotFoundError:
        return False, "Demucs not installed. Run: pip install demucs", None, None
    except Exception as e:
        return False, f"Separation error: {str(e)}", None, None
async def analyze_vocal_type(
    vocals_path: str,
    speech_threshold: float = 0.7,
    singing_threshold: float = 0.4
) -> Tuple[str, float]:
    """
    Analyze if vocal track contains speech or singing.

    Three heuristic measurements of the file are combined into a single
    "speech score" between 0.0 and 1.0:
    1. Silence ratio (weight up to 0.4): more silence suggests speech
    2. Spectral/dynamics score (weight up to 0.3)
    3. ZCR-variance score (weight up to 0.3)

    Args:
        vocals_path: Path to vocals audio file
        speech_threshold: Scores at or above this are classified as speech
        singing_threshold: Scores below this are classified as singing

    Returns:
        Tuple of (vocal_type, confidence)
        vocal_type: "speech", "singing", "mixed", or "unknown" when
            vocals_path does not exist (confidence 0.0 in that case).
        confidence: the speech score for "speech" and "mixed",
            and 1.0 - speech score for "singing".
    """
    if not os.path.exists(vocals_path):
        return "unknown", 0.0

    # The three measurements are independent of each other, so they are
    # awaited together rather than one after another.
    silence_ratio, zcr_variance, spectral_score = await asyncio.gather(
        _get_silence_ratio(vocals_path),
        _get_zcr_variance(vocals_path),
        _get_spectral_analysis(vocals_path),
    )

    # Each row: (measured value, high cutoff, points, low cutoff, points).
    # A value above the high cutoff earns the full points, above the low
    # cutoff the partial points, otherwise nothing.
    tiers = (
        (silence_ratio, 0.25, 0.4, 0.15, 0.2),
        (spectral_score, 0.5, 0.3, 0.3, 0.15),
        (zcr_variance, 0.5, 0.3, 0.3, 0.15),
    )
    speech_score = 0.0
    for value, high_cutoff, high_points, low_cutoff, low_points in tiers:
        if value > high_cutoff:
            speech_score += high_points
        elif value > low_cutoff:
            speech_score += low_points

    # Determine type
    # >= speech_threshold: high confidence speech
    # <  singing_threshold: likely singing (music)
    # in between: mixed or uncertain; the raw score is returned unchanged so
    # callers can decide how to treat the ambiguous range.
    if speech_score >= speech_threshold:
        return "speech", speech_score
    if speech_score < singing_threshold:
        return "singing", 1.0 - speech_score
    return "mixed", speech_score
async def _get_silence_ratio(audio_path: str, threshold_db: float = -35) -> float:
    """
    Get ratio of silence in audio file.

    Runs FFmpeg's ``silencedetect`` filter (minimum silence length 0.3 s) and
    divides the sum of the reported ``silence_duration`` values by the total
    duration of the file.

    Args:
        audio_path: Path to the audio file to analyze
        threshold_db: Level in dB below which audio counts as silence

    Returns:
        Silence fraction clamped to at most 1.0. Returns 0.0 when the duration
        is unknown or not positive, or when anything goes wrong (best-effort).
    """
    cmd = [
        "ffmpeg", "-i", audio_path,
        "-af", f"silencedetect=noise={threshold_db}dB:d=0.3",
        "-f", "null", "-"
    ]
    try:
        # Without a usable duration the ratio cannot be computed, so check it
        # first and skip the full decode pass in that case.
        duration = await _get_audio_duration(audio_path)
        if not duration or duration <= 0:
            return 0.0

        # Run the blocking FFmpeg call in a worker thread so the event loop
        # is not stalled while the file is decoded.
        result = await asyncio.to_thread(
            subprocess.run,
            cmd,
            capture_output=True,
            text=True,
            errors="replace",
            timeout=60,
        )

        # Parse total silence duration from the silencedetect log lines
        total_silence = 0.0
        for line in result.stderr.split('\n'):
            _, marker, tail = line.partition('silence_duration:')
            if not marker:
                continue
            try:
                total_silence += float(tail.strip().split()[0])
            except (IndexError, ValueError):
                # Malformed line: ignore it and keep the rest.
                continue

        return min(total_silence / duration, 1.0)
    except Exception:
        # Deliberately best-effort: a failed measurement must not abort the
        # overall analysis, so it simply contributes "no silence".
        return 0.0
async def _get_zcr_variance(audio_path: str) -> float:
    """
    Get zero-crossing rate variance (simplified estimation).

    No real zero-crossing rate is computed. As a proxy, the variance of the
    per-frame overall RMS level (in dB) reported by FFmpeg's ``astats`` filter
    is used and scaled into the 0-1 range.

    Args:
        audio_path: Path to the audio file to analyze

    Returns:
        ``min(variance / 100, 1.0)`` of the finite RMS levels, or the neutral
        default 0.3 when fewer than two values were found or anything goes
        wrong (best-effort).
    """
    # astats with metadata=1 only attaches its values to the frames; the
    # ametadata filter in print mode is what actually writes them to the log
    # as "lavfi.astats.Overall.RMS_level=<dB>" lines. reset=1 restarts the
    # statistics on every frame so each line is a per-frame measurement.
    # NOTE(review): without the print stage no "RMS_level" line is emitted, so
    # the default was always returned; thresholds applied to this score by
    # callers should be re-validated against real values.
    cmd = [
        "ffmpeg", "-i", audio_path,
        "-af",
        "astats=metadata=1:reset=1,"
        "ametadata=mode=print:key=lavfi.astats.Overall.RMS_level",
        "-f", "null", "-"
    ]
    try:
        # Run the blocking FFmpeg call in a worker thread so the event loop
        # is not stalled while the file is decoded.
        result = await asyncio.to_thread(
            subprocess.run,
            cmd,
            capture_output=True,
            text=True,
            errors="replace",
            timeout=60,
        )

        # Look for RMS level variations as proxy for ZCR variance
        rms_values = []
        for line in result.stderr.split('\n'):
            if 'RMS_level' not in line:
                continue
            _, separator, raw_value = line.rpartition('=')
            if not separator:
                continue
            try:
                level = float(raw_value.strip())
            except ValueError:
                continue
            # Digital silence is reported as -inf; skip non-finite values so
            # they cannot poison the mean.
            if math.isfinite(level):
                rms_values.append(level)

        if len(rms_values) > 1:
            mean_rms = sum(rms_values) / len(rms_values)
            variance = sum((x - mean_rms) ** 2 for x in rms_values) / len(rms_values)
            # Normalize to 0-1 range
            return min(variance / 100, 1.0)

        return 0.3  # Default moderate value
    except Exception:
        # Deliberately best-effort: fall back to the neutral default.
        return 0.3
async def _get_spectral_analysis(audio_path: str) -> float:
    """
    Analyze spectral characteristics (speech has more flat spectrum).

    No spectrum is computed here. As a proxy, FFmpeg's ``volumedetect`` filter
    is run and the gap between ``max_volume`` and ``mean_volume`` (in dB) is
    mapped to a coarse score.

    Args:
        audio_path: Path to the audio file to analyze

    Returns:
        0.7 for a gap above 20 dB, 0.5 above 12 dB, otherwise 0.2. Returns the
        neutral default 0.3 when either value is missing or anything goes
        wrong (best-effort).
    """
    # Use volume detect as proxy for spectral analysis
    cmd = [
        "ffmpeg", "-i", audio_path,
        "-af", "volumedetect",
        "-f", "null", "-"
    ]
    try:
        # Run the blocking FFmpeg call in a worker thread so the event loop
        # is not stalled while the file is decoded.
        result = await asyncio.to_thread(
            subprocess.run,
            cmd,
            capture_output=True,
            text=True,
            errors="replace",
            timeout=60,
        )

        volumes = {}
        for line in result.stderr.split('\n'):
            for key in ('mean_volume', 'max_volume'):
                # Take whatever follows "<key>:" so a colon elsewhere in the
                # log prefix cannot shift the parsed field.
                _, marker, tail = line.partition(f'{key}:')
                if not marker:
                    continue
                try:
                    volumes[key] = float(tail.replace('dB', '').strip())
                except ValueError:
                    pass
                break

        mean_vol = volumes.get('mean_volume')
        max_vol = volumes.get('max_volume')
        if mean_vol is not None and max_vol is not None:
            # Large difference between mean and max indicates speech dynamics
            diff = abs(max_vol - mean_vol)
            # Speech typically has 15-25dB dynamic range
            if diff > 20:
                return 0.7
            if diff > 12:
                return 0.5
            return 0.2

        return 0.3
    except Exception:
        # Deliberately best-effort: fall back to the neutral default.
        return 0.3
async def _get_audio_duration(audio_path: str) -> Optional[float]:
    """
    Get audio duration in seconds.

    Asks ``ffprobe`` for the container-level ``duration`` field and parses its
    bare numeric output.

    Args:
        audio_path: Path to the media file to probe

    Returns:
        Duration in seconds, or None when ffprobe exits with a non-zero code,
        cannot be run, times out (30 s), or prints something that is not a
        number.
    """
    cmd = [
        "ffprobe",
        "-v", "error",
        "-show_entries", "format=duration",
        "-of", "default=noprint_wrappers=1:nokey=1",
        audio_path
    ]
    try:
        # Run the blocking ffprobe call in a worker thread so the event loop
        # is not stalled while waiting for it.
        result = await asyncio.to_thread(
            subprocess.run,
            cmd,
            capture_output=True,
            text=True,
            errors="replace",
            timeout=30,
        )
        if result.returncode == 0:
            return float(result.stdout.strip())
    except Exception:
        # Deliberately best-effort: callers treat None as "duration unknown".
        return None
    return None
async def check_demucs_available() -> bool:
    """
    Check if Demucs is installed in the dedicated environment.

    Returns:
        True only if the interpreter at DEMUCS_PYTHON exists and
        ``python -m demucs --help`` exits with code 0 within 10 seconds.
        Any error while launching the check counts as "not available".
    """
    if not os.path.exists(DEMUCS_PYTHON):
        return False
    try:
        # The probe can take several seconds; run it in a worker thread so the
        # event loop is not blocked while waiting for it.
        result = await asyncio.to_thread(
            subprocess.run,
            [DEMUCS_PYTHON, "-m", "demucs", "--help"],
            capture_output=True,
            timeout=10,
        )
        return result.returncode == 0
    except Exception:
        # Timeout, launch failure, etc. all mean Demucs cannot be used.
        return False