Initial commit: YouTube Shorts maker application
Features: - Video download from TikTok/Douyin using yt-dlp - Audio transcription with OpenAI Whisper - GPT-4 translation (direct/summarize/rewrite modes) - Subtitle generation with ASS format - Video trimming with frame-accurate preview - BGM integration with volume control - Intro text overlay support - Thumbnail generation with text overlay Tech stack: - Backend: FastAPI, Python 3.11+ - Frontend: React, Vite, TailwindCSS - Video processing: FFmpeg - AI: OpenAI Whisper, GPT-4 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
317
backend/app/services/audio_separator.py
Normal file
317
backend/app/services/audio_separator.py
Normal file
@@ -0,0 +1,317 @@
|
||||
"""
|
||||
Audio separation service using Demucs for vocal/music separation.
|
||||
Also includes speech vs singing detection.
|
||||
"""
|
||||
import asyncio
import os
import shutil
import subprocess
from pathlib import Path
from typing import Optional, Tuple
|
||||
|
||||
# Demucs lives in its own Python 3.11 virtualenv because of dependency
# conflicts with the main application environment.
_BACKEND_ROOT = os.path.dirname(
    os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
)
DEMUCS_VENV_PATH = os.path.join(_BACKEND_ROOT, "venv_demucs")
DEMUCS_PYTHON = os.path.join(DEMUCS_VENV_PATH, "bin", "python")
|
||||
|
||||
|
||||
async def separate_vocals(
    input_path: str,
    output_dir: str,
    model: str = "htdemucs"
) -> Tuple[bool, str, Optional[str], Optional[str]]:
    """
    Separate vocals from background music using Demucs.

    The Demucs CLI is executed in its dedicated Python 3.11 virtualenv
    (see DEMUCS_VENV_PATH). The blocking subprocess call is pushed onto a
    worker thread with asyncio.to_thread so the event loop stays responsive
    during the (potentially multi-minute) separation.

    Args:
        input_path: Path to input audio/video file.
        output_dir: Directory to save separated tracks.
        model: Demucs model to use (htdemucs, htdemucs_ft, mdx_extra).

    Returns:
        Tuple of (success, message, vocals_path, no_vocals_path).
        On failure the two paths are None.
    """
    if not os.path.exists(input_path):
        return False, f"Input file not found: {input_path}", None, None

    os.makedirs(output_dir, exist_ok=True)

    # Check if the dedicated Demucs venv exists before attempting to run it.
    if not os.path.exists(DEMUCS_PYTHON):
        return False, f"Demucs environment not found at {DEMUCS_VENV_PATH}. Run setup script.", None, None

    # Two-stem separation: vocals vs everything else.
    cmd = [
        DEMUCS_PYTHON, "-m", "demucs",
        "--two-stems=vocals",
        "-n", model,
        "-o", output_dir,
        input_path
    ]

    try:
        print(f"Running Demucs separation: {' '.join(cmd)}")
        # Run in a worker thread: subprocess.run would otherwise block the
        # asyncio event loop for the whole separation.
        result = await asyncio.to_thread(
            subprocess.run,
            cmd,
            capture_output=True,
            text=True,
            timeout=600,  # 10 minute timeout
        )

        if result.returncode != 0:
            # Keep only the tail of stderr so the message stays readable.
            error_msg = result.stderr[-500:] if result.stderr else "Unknown error"
            return False, f"Demucs error: {error_msg}", None, None

        # Demucs writes to: output_dir/<model>/<track_name>/{vocals,no_vocals}.wav
        input_name = Path(input_path).stem
        demucs_output = os.path.join(output_dir, model, input_name)

        vocals_path = os.path.join(demucs_output, "vocals.wav")
        no_vocals_path = os.path.join(demucs_output, "no_vocals.wav")

        if not os.path.exists(vocals_path):
            return False, "Vocals file not created", None, None

        # Flatten Demucs' nested output layout into output_dir directly.
        final_vocals = os.path.join(output_dir, "vocals.wav")
        final_no_vocals = os.path.join(output_dir, "no_vocals.wav")

        shutil.move(vocals_path, final_vocals)
        if os.path.exists(no_vocals_path):
            shutil.move(no_vocals_path, final_no_vocals)

        # Clean up the now-redundant Demucs output directory.
        shutil.rmtree(os.path.join(output_dir, model), ignore_errors=True)

        return True, "Vocals separated successfully", final_vocals, final_no_vocals

    except subprocess.TimeoutExpired:
        return False, "Separation timed out", None, None
    except FileNotFoundError:
        return False, "Demucs not installed. Run: pip install demucs", None, None
    except Exception as e:
        return False, f"Separation error: {str(e)}", None, None
|
||||
|
||||
|
||||
async def analyze_vocal_type(
    vocals_path: str,
    speech_threshold: float = 0.7
) -> Tuple[str, float]:
    """
    Classify a vocal track as speech, singing, or a mix of both.

    Heuristics (no ML model involved):
      * speech contains more silence gaps (pauses between words/sentences),
      * speech shows larger loudness variation,
      * singing sustains notes, flattening the dynamics.

    Args:
        vocals_path: Path to the vocals-only audio file.
        speech_threshold: Score (0-1) at or above which the track counts
            as speech.

    Returns:
        (vocal_type, confidence) where vocal_type is "speech", "singing",
        "mixed", or "unknown" when the file is missing.
    """
    if not os.path.exists(vocals_path):
        return "unknown", 0.0

    # Gather the three independent measurements (each roughly 0-1).
    # Speech typically has 30-50% silence; singing has less.
    silence_ratio = await _get_silence_ratio(vocals_path)
    zcr_variance = await _get_zcr_variance(vocals_path)
    spectral_score = await _get_spectral_analysis(vocals_path)

    def _tiered(value: float, high: float, low: float,
                big: float, small: float) -> float:
        """Award `big` points above `high`, `small` above `low`, else 0."""
        if value > high:
            return big
        if value > low:
            return small
        return 0.0

    # Weighted sum of the three heuristics. Silence is the strongest speech
    # indicator (pauses between sentences, up to 0.4); spectral dynamics and
    # ZCR variance each contribute up to 0.3.
    speech_score = (
        _tiered(silence_ratio, 0.25, 0.15, 0.4, 0.2)
        + _tiered(spectral_score, 0.5, 0.3, 0.3, 0.15)
        + _tiered(zcr_variance, 0.5, 0.3, 0.3, 0.15)
    )

    # speech_threshold (default 0.7): confident speech.
    # Below 0.4: likely singing — confidence is the complement of the score.
    # In between: mixed/uncertain; the raw score is reported so callers can
    # avoid transcribing song lyrics as speech.
    if speech_score >= speech_threshold:
        return "speech", speech_score
    if speech_score < 0.4:
        return "singing", 1.0 - speech_score
    return "mixed", speech_score
|
||||
|
||||
|
||||
async def _get_silence_ratio(audio_path: str, threshold_db: float = -35) -> float:
    """
    Return the fraction (0-1) of *audio_path* that is below *threshold_db*.

    Runs FFmpeg's silencedetect filter (minimum silence length 0.3s) and
    sums the reported silence_duration values against the total duration.
    Returns 0.0 on any failure (missing ffmpeg, unreadable file, zero or
    unknown duration).
    """
    cmd = [
        "ffmpeg", "-i", audio_path,
        "-af", f"silencedetect=noise={threshold_db}dB:d=0.3",
        "-f", "null", "-"
    ]

    try:
        # subprocess.run is blocking; run it in a worker thread so the
        # asyncio event loop is not stalled while FFmpeg scans the file.
        result = await asyncio.to_thread(
            subprocess.run, cmd, capture_output=True, text=True, timeout=60
        )
        # silencedetect reports on stderr.
        stderr = result.stderr

        # Total duration comes from ffprobe; bail out if unavailable
        # (`not duration` also covers a zero-length file).
        duration = await _get_audio_duration(audio_path)
        if not duration:
            return 0.0

        # Sum every "silence_duration: X" occurrence in the filter output.
        total_silence = 0.0
        for line in stderr.split('\n'):
            if 'silence_duration' in line:
                try:
                    total_silence += float(
                        line.split('silence_duration:')[1].strip().split()[0]
                    )
                except (IndexError, ValueError):
                    pass  # malformed line — skip rather than abort

        return min(total_silence / duration, 1.0)

    except Exception:
        # Best-effort heuristic: any failure degrades to "no silence".
        return 0.0
|
||||
|
||||
|
||||
async def _get_zcr_variance(audio_path: str) -> float:
    """
    Estimate zero-crossing-rate variance, normalized to 0-1.

    FFmpeg astats per-frame RMS levels are used as a cheap proxy: large
    frame-to-frame RMS variance correlates with the choppier dynamics of
    speech. Returns 0.3 (a neutral value) when analysis is impossible.
    """
    cmd = [
        "ffmpeg", "-i", audio_path,
        "-af", "astats=metadata=1:reset=1",
        "-f", "null", "-"
    ]

    try:
        # Keep the event loop free while FFmpeg runs (blocking call
        # off-loaded to a worker thread).
        result = await asyncio.to_thread(
            subprocess.run, cmd, capture_output=True, text=True, timeout=60
        )

        # Collect every finite "RMS_level: X" reading from stderr.
        rms_values = []
        for line in result.stderr.split('\n'):
            if 'RMS_level' in line:
                try:
                    val = float(line.split(':')[1].strip().split()[0])
                    if val != float('-inf'):
                        rms_values.append(val)
                except (IndexError, ValueError):
                    pass  # non-numeric reading — ignore

        if len(rms_values) > 1:
            mean_rms = sum(rms_values) / len(rms_values)
            variance = sum((x - mean_rms) ** 2 for x in rms_values) / len(rms_values)
            # Empirical normalization: dB^2 variance of ~100 maps to 1.0.
            return min(variance / 100, 1.0)

        return 0.3  # Not enough data — neutral default

    except Exception:
        return 0.3  # Best-effort heuristic — never raise
|
||||
|
||||
|
||||
async def _get_spectral_analysis(audio_path: str) -> float:
    """
    Score (0-1) how speech-like the audio dynamics are.

    Uses FFmpeg volumedetect: a large gap between max and mean volume
    indicates the bursty dynamics of speech (typically 15-25 dB), while
    sustained singing compresses the range. Returns 0.3 (neutral) when the
    measurement fails.
    """
    cmd = [
        "ffmpeg", "-i", audio_path,
        "-af", "volumedetect",
        "-f", "null", "-"
    ]

    try:
        # Blocking subprocess call moved to a worker thread so the asyncio
        # event loop keeps running during analysis.
        result = await asyncio.to_thread(
            subprocess.run, cmd, capture_output=True, text=True, timeout=60
        )

        mean_vol = None
        max_vol = None

        # volumedetect prints "mean_volume: -X.Y dB" / "max_volume: ..."
        # on stderr.
        for line in result.stderr.split('\n'):
            if 'mean_volume' in line:
                try:
                    mean_vol = float(line.split(':')[1].strip().replace(' dB', ''))
                except (IndexError, ValueError):
                    pass
            elif 'max_volume' in line:
                try:
                    max_vol = float(line.split(':')[1].strip().replace(' dB', ''))
                except (IndexError, ValueError):
                    pass

        if mean_vol is not None and max_vol is not None:
            # Large difference between mean and max indicates speech dynamics.
            diff = abs(max_vol - mean_vol)
            if diff > 20:
                return 0.7   # clearly speech-like dynamic range
            elif diff > 12:
                return 0.5   # borderline
            else:
                return 0.2   # compressed dynamics — singing-like

        return 0.3  # Volumes not reported — neutral default

    except Exception:
        return 0.3  # Best-effort heuristic — never raise
|
||||
|
||||
|
||||
async def _get_audio_duration(audio_path: str) -> Optional[float]:
    """
    Return the duration of *audio_path* in seconds via ffprobe,
    or None if it cannot be determined.
    """
    cmd = [
        "ffprobe",
        "-v", "error",
        "-show_entries", "format=duration",
        "-of", "default=noprint_wrappers=1:nokey=1",
        audio_path
    ]

    try:
        # Off-load the blocking call so the asyncio event loop keeps running.
        result = await asyncio.to_thread(
            subprocess.run, cmd, capture_output=True, text=True, timeout=30
        )
        if result.returncode == 0:
            return float(result.stdout.strip())
    except Exception:
        pass  # fall through to None on any probe/parse failure

    return None
|
||||
|
||||
|
||||
async def check_demucs_available() -> bool:
    """
    Check whether Demucs is runnable in its dedicated virtualenv.

    Verifies that the venv's Python interpreter exists and that
    `python -m demucs --help` exits successfully.
    """
    if not os.path.exists(DEMUCS_PYTHON):
        return False

    try:
        # Blocking subprocess call moved to a worker thread so the event
        # loop is not stalled by the probe.
        result = await asyncio.to_thread(
            subprocess.run,
            [DEMUCS_PYTHON, "-m", "demucs", "--help"],
            capture_output=True,
            timeout=10,
        )
        return result.returncode == 0
    except Exception:
        return False  # timeout, broken interpreter, etc.
|
||||
Reference in New Issue
Block a user