Initial commit: YouTube Shorts maker application
Features: - Video download from TikTok/Douyin using yt-dlp - Audio transcription with OpenAI Whisper - GPT-4 translation (direct/summarize/rewrite modes) - Subtitle generation with ASS format - Video trimming with frame-accurate preview - BGM integration with volume control - Intro text overlay support - Thumbnail generation with text overlay Tech stack: - Backend: FastAPI, Python 3.11+ - Frontend: React, Vite, TailwindCSS - Video processing: FFmpeg - AI: OpenAI Whisper, GPT-4 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
317
backend/app/services/audio_separator.py
Normal file
317
backend/app/services/audio_separator.py
Normal file
@@ -0,0 +1,317 @@
|
||||
"""
|
||||
Audio separation service using Demucs for vocal/music separation.
|
||||
Also includes speech vs singing detection.
|
||||
"""
|
||||
import asyncio
import os
import shutil
import subprocess
from pathlib import Path
from typing import Optional, Tuple
|
||||
|
||||
# Demucs lives in its own Python 3.11 virtualenv because of dependency
# conflicts with the main application environment.
_BACKEND_ROOT = os.path.dirname(
    os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
)
DEMUCS_VENV_PATH = os.path.join(_BACKEND_ROOT, "venv_demucs")
DEMUCS_PYTHON = os.path.join(DEMUCS_VENV_PATH, "bin", "python")
|
||||
|
||||
|
||||
async def separate_vocals(
    input_path: str,
    output_dir: str,
    model: str = "htdemucs"
) -> Tuple[bool, str, Optional[str], Optional[str]]:
    """
    Separate vocals from background music using Demucs.

    The Demucs CLI is executed in its dedicated Python 3.11 virtualenv
    (see DEMUCS_VENV_PATH). The blocking subprocess call is pushed onto a
    worker thread with asyncio.to_thread so the event loop stays responsive
    during the (potentially multi-minute) separation.

    Args:
        input_path: Path to input audio/video file.
        output_dir: Directory to save separated tracks.
        model: Demucs model to use (htdemucs, htdemucs_ft, mdx_extra).

    Returns:
        Tuple of (success, message, vocals_path, no_vocals_path).
        On failure the two paths are None.
    """
    if not os.path.exists(input_path):
        return False, f"Input file not found: {input_path}", None, None

    os.makedirs(output_dir, exist_ok=True)

    # Check if the dedicated Demucs venv exists before attempting to run it.
    if not os.path.exists(DEMUCS_PYTHON):
        return False, f"Demucs environment not found at {DEMUCS_VENV_PATH}. Run setup script.", None, None

    # Two-stem separation: vocals vs everything else.
    cmd = [
        DEMUCS_PYTHON, "-m", "demucs",
        "--two-stems=vocals",
        "-n", model,
        "-o", output_dir,
        input_path
    ]

    try:
        print(f"Running Demucs separation: {' '.join(cmd)}")
        # Run in a worker thread: subprocess.run would otherwise block the
        # asyncio event loop for the whole separation.
        result = await asyncio.to_thread(
            subprocess.run,
            cmd,
            capture_output=True,
            text=True,
            timeout=600,  # 10 minute timeout
        )

        if result.returncode != 0:
            # Keep only the tail of stderr so the message stays readable.
            error_msg = result.stderr[-500:] if result.stderr else "Unknown error"
            return False, f"Demucs error: {error_msg}", None, None

        # Demucs writes to: output_dir/<model>/<track_name>/{vocals,no_vocals}.wav
        input_name = Path(input_path).stem
        demucs_output = os.path.join(output_dir, model, input_name)

        vocals_path = os.path.join(demucs_output, "vocals.wav")
        no_vocals_path = os.path.join(demucs_output, "no_vocals.wav")

        if not os.path.exists(vocals_path):
            return False, "Vocals file not created", None, None

        # Flatten Demucs' nested output layout into output_dir directly.
        final_vocals = os.path.join(output_dir, "vocals.wav")
        final_no_vocals = os.path.join(output_dir, "no_vocals.wav")

        shutil.move(vocals_path, final_vocals)
        if os.path.exists(no_vocals_path):
            shutil.move(no_vocals_path, final_no_vocals)

        # Clean up the now-redundant Demucs output directory.
        shutil.rmtree(os.path.join(output_dir, model), ignore_errors=True)

        return True, "Vocals separated successfully", final_vocals, final_no_vocals

    except subprocess.TimeoutExpired:
        return False, "Separation timed out", None, None
    except FileNotFoundError:
        return False, "Demucs not installed. Run: pip install demucs", None, None
    except Exception as e:
        return False, f"Separation error: {str(e)}", None, None
|
||||
|
||||
|
||||
async def analyze_vocal_type(
    vocals_path: str,
    speech_threshold: float = 0.7
) -> Tuple[str, float]:
    """
    Classify a vocal track as speech, singing, or a mix of both.

    Heuristics (no ML model involved):
      * speech contains more silence gaps (pauses between words/sentences),
      * speech shows larger loudness variation,
      * singing sustains notes, flattening the dynamics.

    Args:
        vocals_path: Path to the vocals-only audio file.
        speech_threshold: Score (0-1) at or above which the track counts
            as speech.

    Returns:
        (vocal_type, confidence) where vocal_type is "speech", "singing",
        "mixed", or "unknown" when the file is missing.
    """
    if not os.path.exists(vocals_path):
        return "unknown", 0.0

    # Gather the three independent measurements (each roughly 0-1).
    # Speech typically has 30-50% silence; singing has less.
    silence_ratio = await _get_silence_ratio(vocals_path)
    zcr_variance = await _get_zcr_variance(vocals_path)
    spectral_score = await _get_spectral_analysis(vocals_path)

    def _tiered(value: float, high: float, low: float,
                big: float, small: float) -> float:
        """Award `big` points above `high`, `small` above `low`, else 0."""
        if value > high:
            return big
        if value > low:
            return small
        return 0.0

    # Weighted sum of the three heuristics. Silence is the strongest speech
    # indicator (pauses between sentences, up to 0.4); spectral dynamics and
    # ZCR variance each contribute up to 0.3.
    speech_score = (
        _tiered(silence_ratio, 0.25, 0.15, 0.4, 0.2)
        + _tiered(spectral_score, 0.5, 0.3, 0.3, 0.15)
        + _tiered(zcr_variance, 0.5, 0.3, 0.3, 0.15)
    )

    # speech_threshold (default 0.7): confident speech.
    # Below 0.4: likely singing — confidence is the complement of the score.
    # In between: mixed/uncertain; the raw score is reported so callers can
    # avoid transcribing song lyrics as speech.
    if speech_score >= speech_threshold:
        return "speech", speech_score
    if speech_score < 0.4:
        return "singing", 1.0 - speech_score
    return "mixed", speech_score
|
||||
|
||||
|
||||
async def _get_silence_ratio(audio_path: str, threshold_db: float = -35) -> float:
    """
    Return the fraction (0-1) of *audio_path* that is below *threshold_db*.

    Runs FFmpeg's silencedetect filter (minimum silence length 0.3s) and
    sums the reported silence_duration values against the total duration.
    Returns 0.0 on any failure (missing ffmpeg, unreadable file, zero or
    unknown duration).
    """
    cmd = [
        "ffmpeg", "-i", audio_path,
        "-af", f"silencedetect=noise={threshold_db}dB:d=0.3",
        "-f", "null", "-"
    ]

    try:
        # subprocess.run is blocking; run it in a worker thread so the
        # asyncio event loop is not stalled while FFmpeg scans the file.
        result = await asyncio.to_thread(
            subprocess.run, cmd, capture_output=True, text=True, timeout=60
        )
        # silencedetect reports on stderr.
        stderr = result.stderr

        # Total duration comes from ffprobe; bail out if unavailable
        # (`not duration` also covers a zero-length file).
        duration = await _get_audio_duration(audio_path)
        if not duration:
            return 0.0

        # Sum every "silence_duration: X" occurrence in the filter output.
        total_silence = 0.0
        for line in stderr.split('\n'):
            if 'silence_duration' in line:
                try:
                    total_silence += float(
                        line.split('silence_duration:')[1].strip().split()[0]
                    )
                except (IndexError, ValueError):
                    pass  # malformed line — skip rather than abort

        return min(total_silence / duration, 1.0)

    except Exception:
        # Best-effort heuristic: any failure degrades to "no silence".
        return 0.0
|
||||
|
||||
|
||||
async def _get_zcr_variance(audio_path: str) -> float:
    """
    Estimate zero-crossing-rate variance, normalized to 0-1.

    FFmpeg astats per-frame RMS levels are used as a cheap proxy: large
    frame-to-frame RMS variance correlates with the choppier dynamics of
    speech. Returns 0.3 (a neutral value) when analysis is impossible.
    """
    cmd = [
        "ffmpeg", "-i", audio_path,
        "-af", "astats=metadata=1:reset=1",
        "-f", "null", "-"
    ]

    try:
        # Keep the event loop free while FFmpeg runs (blocking call
        # off-loaded to a worker thread).
        result = await asyncio.to_thread(
            subprocess.run, cmd, capture_output=True, text=True, timeout=60
        )

        # Collect every finite "RMS_level: X" reading from stderr.
        rms_values = []
        for line in result.stderr.split('\n'):
            if 'RMS_level' in line:
                try:
                    val = float(line.split(':')[1].strip().split()[0])
                    if val != float('-inf'):
                        rms_values.append(val)
                except (IndexError, ValueError):
                    pass  # non-numeric reading — ignore

        if len(rms_values) > 1:
            mean_rms = sum(rms_values) / len(rms_values)
            variance = sum((x - mean_rms) ** 2 for x in rms_values) / len(rms_values)
            # Empirical normalization: dB^2 variance of ~100 maps to 1.0.
            return min(variance / 100, 1.0)

        return 0.3  # Not enough data — neutral default

    except Exception:
        return 0.3  # Best-effort heuristic — never raise
|
||||
|
||||
|
||||
async def _get_spectral_analysis(audio_path: str) -> float:
    """
    Score (0-1) how speech-like the audio dynamics are.

    Uses FFmpeg volumedetect: a large gap between max and mean volume
    indicates the bursty dynamics of speech (typically 15-25 dB), while
    sustained singing compresses the range. Returns 0.3 (neutral) when the
    measurement fails.
    """
    cmd = [
        "ffmpeg", "-i", audio_path,
        "-af", "volumedetect",
        "-f", "null", "-"
    ]

    try:
        # Blocking subprocess call moved to a worker thread so the asyncio
        # event loop keeps running during analysis.
        result = await asyncio.to_thread(
            subprocess.run, cmd, capture_output=True, text=True, timeout=60
        )

        mean_vol = None
        max_vol = None

        # volumedetect prints "mean_volume: -X.Y dB" / "max_volume: ..."
        # on stderr.
        for line in result.stderr.split('\n'):
            if 'mean_volume' in line:
                try:
                    mean_vol = float(line.split(':')[1].strip().replace(' dB', ''))
                except (IndexError, ValueError):
                    pass
            elif 'max_volume' in line:
                try:
                    max_vol = float(line.split(':')[1].strip().replace(' dB', ''))
                except (IndexError, ValueError):
                    pass

        if mean_vol is not None and max_vol is not None:
            # Large difference between mean and max indicates speech dynamics.
            diff = abs(max_vol - mean_vol)
            if diff > 20:
                return 0.7   # clearly speech-like dynamic range
            elif diff > 12:
                return 0.5   # borderline
            else:
                return 0.2   # compressed dynamics — singing-like

        return 0.3  # Volumes not reported — neutral default

    except Exception:
        return 0.3  # Best-effort heuristic — never raise
|
||||
|
||||
|
||||
async def _get_audio_duration(audio_path: str) -> Optional[float]:
    """
    Return the duration of *audio_path* in seconds via ffprobe,
    or None if it cannot be determined.
    """
    cmd = [
        "ffprobe",
        "-v", "error",
        "-show_entries", "format=duration",
        "-of", "default=noprint_wrappers=1:nokey=1",
        audio_path
    ]

    try:
        # Off-load the blocking call so the asyncio event loop keeps running.
        result = await asyncio.to_thread(
            subprocess.run, cmd, capture_output=True, text=True, timeout=30
        )
        if result.returncode == 0:
            return float(result.stdout.strip())
    except Exception:
        pass  # fall through to None on any probe/parse failure

    return None
|
||||
|
||||
|
||||
async def check_demucs_available() -> bool:
    """
    Check whether Demucs is runnable in its dedicated virtualenv.

    Verifies that the venv's Python interpreter exists and that
    `python -m demucs --help` exits successfully.
    """
    if not os.path.exists(DEMUCS_PYTHON):
        return False

    try:
        # Blocking subprocess call moved to a worker thread so the event
        # loop is not stalled by the probe.
        result = await asyncio.to_thread(
            subprocess.run,
            [DEMUCS_PYTHON, "-m", "demucs", "--help"],
            capture_output=True,
            timeout=10,
        )
        return result.returncode == 0
    except Exception:
        return False  # timeout, broken interpreter, etc.
|
||||
Reference in New Issue
Block a user