# Changelog (from commit message):
# - Added new TimelineEditor and VideoStudio components
# - Improved backend transcriber and video_processor services
# - Refactored frontend HomePage and updated styles
import asyncio
import os
import subprocess
from typing import Optional, Tuple

from app.config import settings
|
|
|
|
|
|
async def process_video(
|
|
input_path: str,
|
|
output_path: str,
|
|
subtitle_path: Optional[str] = None,
|
|
bgm_path: Optional[str] = None,
|
|
bgm_volume: float = 0.3,
|
|
keep_original_audio: bool = False,
|
|
intro_text: Optional[str] = None,
|
|
intro_duration: float = 0.7,
|
|
intro_font_size: int = 100,
|
|
intro_position: str = "center", # top, center, bottom
|
|
) -> Tuple[bool, str]:
|
|
"""
|
|
Process video: remove audio, add subtitles, add BGM, add intro text.
|
|
|
|
Args:
|
|
input_path: Path to input video
|
|
output_path: Path for output video
|
|
subtitle_path: Path to ASS/SRT subtitle file
|
|
bgm_path: Path to BGM audio file
|
|
bgm_volume: Volume level for BGM (0.0 - 1.0)
|
|
keep_original_audio: Whether to keep original audio
|
|
intro_text: Text to display at the beginning of video (YouTube Shorts thumbnail)
|
|
intro_duration: How long to display intro text (seconds)
|
|
intro_font_size: Font size for intro text (100-120 recommended)
|
|
|
|
Returns:
|
|
Tuple of (success, message)
|
|
"""
|
|
if not os.path.exists(input_path):
|
|
return False, f"Input video not found: {input_path}"
|
|
|
|
os.makedirs(os.path.dirname(output_path), exist_ok=True)
|
|
|
|
# Build FFmpeg command
|
|
cmd = ["ffmpeg", "-y"] # -y to overwrite
|
|
|
|
# Input video
|
|
cmd.extend(["-i", input_path])
|
|
|
|
# Input BGM if provided (stream_loop must come BEFORE -i)
|
|
if bgm_path and os.path.exists(bgm_path):
|
|
cmd.extend(["-stream_loop", "-1"]) # Loop BGM infinitely
|
|
cmd.extend(["-i", bgm_path])
|
|
|
|
# Build filter complex
|
|
filter_parts = []
|
|
audio_parts = []
|
|
|
|
# Audio handling
|
|
if keep_original_audio and bgm_path and os.path.exists(bgm_path):
|
|
# Mix original audio with BGM
|
|
filter_parts.append(f"[0:a]volume=1.0[original]")
|
|
filter_parts.append(f"[1:a]volume={bgm_volume}[bgm]")
|
|
filter_parts.append(f"[original][bgm]amix=inputs=2:duration=shortest[audio]")
|
|
audio_output = "[audio]"
|
|
elif bgm_path and os.path.exists(bgm_path):
|
|
# BGM only (no original audio)
|
|
filter_parts.append(f"[1:a]volume={bgm_volume}[audio]")
|
|
audio_output = "[audio]"
|
|
elif keep_original_audio:
|
|
# Original audio only
|
|
audio_output = "0:a"
|
|
else:
|
|
# No audio
|
|
audio_output = None
|
|
|
|
# Build video filter chain
|
|
video_filters = []
|
|
|
|
# 1. Add freeze frame at the beginning if intro text is provided
|
|
# tpad adds frozen frames at start using clone mode (copies first frame)
|
|
if intro_text and intro_duration > 0:
|
|
# Clone the first frame for intro_duration seconds
|
|
video_filters.append(f"tpad=start_duration={intro_duration}:start_mode=clone")
|
|
|
|
# 2. Add subtitle overlay if provided
|
|
if subtitle_path and os.path.exists(subtitle_path):
|
|
escaped_path = subtitle_path.replace("\\", "/").replace(":", "\\:").replace("'", "\\'")
|
|
video_filters.append(f"ass='{escaped_path}'")
|
|
|
|
# 3. Add intro text overlay if provided (shown during frozen frame portion)
|
|
if intro_text:
|
|
# Find a suitable font - try common Korean fonts
|
|
font_options = [
|
|
"/System/Library/Fonts/Supplemental/AppleGothic.ttf", # macOS Korean
|
|
"/System/Library/Fonts/AppleSDGothicNeo.ttc", # macOS Korean
|
|
"/usr/share/fonts/truetype/nanum/NanumGothicBold.ttf", # Linux Korean
|
|
"/usr/share/fonts/truetype/korean/Pretendard-Bold.otf", # Docker Korean
|
|
"/usr/share/fonts/opentype/noto/NotoSansCJK-Bold.ttc", # Linux CJK
|
|
]
|
|
|
|
font_file = None
|
|
for font in font_options:
|
|
if os.path.exists(font):
|
|
font_file = font.replace(":", "\\:")
|
|
break
|
|
|
|
# Calculate font size based on text length to prevent overflow
|
|
# Shorts video is typically 720px width
|
|
# Korean characters are nearly square (width ≈ height), so char_width_ratio ≈ 1.0
|
|
video_width = 720 # Default Shorts width
|
|
box_padding = 40 # boxborderw=20 on each side
|
|
max_width_ratio = 0.75 # Leave 25% margin for safety
|
|
char_width_ratio = 1.0 # Korean characters are nearly square
|
|
available_width = (video_width * max_width_ratio) - box_padding
|
|
|
|
# Split text into 2 lines if too long (more than 10 chars or font would be too small)
|
|
text_len = len(intro_text)
|
|
single_line_font = int(available_width / (text_len * char_width_ratio))
|
|
|
|
# Use 2 lines if single line font would be less than 50px
|
|
if single_line_font < 50 and text_len > 6:
|
|
# Find best split point (prefer space near middle)
|
|
mid = len(intro_text) // 2
|
|
split_pos = None
|
|
|
|
# Search for space within 5 chars of middle
|
|
for offset in range(6):
|
|
if mid + offset < len(intro_text) and intro_text[mid + offset] == ' ':
|
|
split_pos = mid + offset
|
|
break
|
|
if mid - offset >= 0 and intro_text[mid - offset] == ' ':
|
|
split_pos = mid - offset
|
|
break
|
|
|
|
# If no space found, split at middle
|
|
if split_pos is None:
|
|
split_pos = mid
|
|
|
|
line1 = intro_text[:split_pos].strip()
|
|
line2 = intro_text[split_pos:].strip()
|
|
display_text = f"{line1}\\n{line2}"
|
|
|
|
# Calculate font size based on longer line
|
|
max_line_len = max(len(line1), len(line2))
|
|
calculated_max_font = int(available_width / (max_line_len * char_width_ratio))
|
|
print(f"[Intro] Split into 2 lines: '{line1}' / '{line2}' (max {max_line_len} chars)")
|
|
else:
|
|
display_text = intro_text
|
|
calculated_max_font = single_line_font
|
|
print(f"[Intro] Single line: '{intro_text}' ({text_len} chars)")
|
|
|
|
adjusted_font_size = min(intro_font_size, calculated_max_font)
|
|
adjusted_font_size = max(36, adjusted_font_size) # Minimum 36px font size
|
|
print(f"[Intro] Requested font: {intro_font_size}px, Adjusted: {adjusted_font_size}px")
|
|
|
|
# Fade out effect timing (fade starts 0.2s before end)
|
|
fade_out_start = max(0.1, intro_duration - 0.2)
|
|
alpha_expr = f"if(gt(t,{fade_out_start}),(({intro_duration}-t)/0.2),1)"
|
|
|
|
escaped_text = display_text.replace("'", "\\'").replace(":", "\\:")
|
|
|
|
# Calculate vertical position based on intro_position
|
|
if intro_position == "top":
|
|
y_expr = "h*0.15" # 15% from top
|
|
elif intro_position == "bottom":
|
|
y_expr = "h*0.80-text_h" # 80% from top (above subtitle area)
|
|
else: # center
|
|
y_expr = "(h-text_h)/2" # Center vertically
|
|
|
|
# Draw text on screen during freeze frame
|
|
drawtext_parts = [
|
|
f"text='{escaped_text}'",
|
|
f"fontsize={adjusted_font_size}",
|
|
"fontcolor=white",
|
|
"x=(w-text_w)/2", # Center horizontally
|
|
f"y={y_expr}", # Vertical position based on intro_position
|
|
f"enable='lt(t,{intro_duration})'",
|
|
"borderw=4",
|
|
"bordercolor=black",
|
|
"box=1",
|
|
"boxcolor=black@0.7",
|
|
"boxborderw=20",
|
|
f"alpha='{alpha_expr}'",
|
|
"line_spacing=10", # Add spacing between lines
|
|
]
|
|
|
|
if font_file:
|
|
drawtext_parts.insert(1, f"fontfile='{font_file}'")
|
|
|
|
video_filters.append(f"drawtext={':'.join(drawtext_parts)}")
|
|
|
|
# Combine video filters
|
|
video_filter_str = ",".join(video_filters) if video_filters else None
|
|
|
|
# Construct FFmpeg command
|
|
if filter_parts or video_filter_str:
|
|
if filter_parts and video_filter_str:
|
|
full_filter = ";".join(filter_parts) + f";[0:v]{video_filter_str}[vout]"
|
|
cmd.extend(["-filter_complex", full_filter])
|
|
cmd.extend(["-map", "[vout]"])
|
|
if audio_output and audio_output.startswith("["):
|
|
cmd.extend(["-map", audio_output])
|
|
elif audio_output:
|
|
cmd.extend(["-map", audio_output])
|
|
elif video_filter_str:
|
|
cmd.extend(["-vf", video_filter_str])
|
|
if bgm_path and os.path.exists(bgm_path):
|
|
cmd.extend(["-filter_complex", f"[1:a]volume={bgm_volume}[audio]"])
|
|
cmd.extend(["-map", "0:v", "-map", "[audio]"])
|
|
elif not keep_original_audio:
|
|
cmd.extend(["-an"]) # No audio
|
|
elif filter_parts:
|
|
cmd.extend(["-filter_complex", ";".join(filter_parts)])
|
|
cmd.extend(["-map", "0:v"])
|
|
if audio_output and audio_output.startswith("["):
|
|
cmd.extend(["-map", audio_output])
|
|
else:
|
|
if not keep_original_audio:
|
|
cmd.extend(["-an"])
|
|
|
|
# Output settings
|
|
cmd.extend([
|
|
"-c:v", "libx264",
|
|
"-preset", "medium",
|
|
"-crf", "23",
|
|
"-c:a", "aac",
|
|
"-b:a", "128k",
|
|
"-shortest",
|
|
output_path
|
|
])
|
|
|
|
try:
|
|
# Run FFmpeg in thread pool to avoid blocking the event loop
|
|
result = await asyncio.to_thread(
|
|
subprocess.run,
|
|
cmd,
|
|
capture_output=True,
|
|
text=True,
|
|
timeout=600, # 10 minute timeout
|
|
)
|
|
|
|
if result.returncode != 0:
|
|
error_msg = result.stderr[-500:] if result.stderr else "Unknown error"
|
|
return False, f"FFmpeg error: {error_msg}"
|
|
|
|
if os.path.exists(output_path):
|
|
return True, "Video processing complete"
|
|
else:
|
|
return False, "Output file not created"
|
|
|
|
except subprocess.TimeoutExpired:
|
|
return False, "Processing timed out"
|
|
except Exception as e:
|
|
return False, f"Processing error: {str(e)}"
|
|
|
|
|
|
async def get_video_duration(video_path: str) -> Optional[float]:
    """Get video duration in seconds, or None if it cannot be determined."""
    cmd = [
        "ffprobe",
        "-v", "error",
        "-show_entries", "format=duration",
        "-of", "default=noprint_wrappers=1:nokey=1",
        video_path
    ]

    try:
        # Off-load the blocking subprocess call to a worker thread so the
        # event loop stays responsive (was a direct subprocess.run before,
        # unlike the other async helpers in this module).
        result = await asyncio.to_thread(
            subprocess.run,
            cmd,
            capture_output=True,
            text=True,
            timeout=30,
        )
        if result.returncode == 0:
            return float(result.stdout.strip())
    except Exception:
        # Probe failures (missing binary, timeout, unparsable output) all
        # fall through to None.
        pass

    return None
|
|
|
|
|
|
async def get_video_info(video_path: str) -> Optional[dict]:
    """Probe a video and return its duration and resolution.

    Returns a dict that may contain "duration" (float, seconds), "width"
    and "height" (ints), or None when probing fails or yields nothing.
    """
    import json

    probe_cmd = [
        "ffprobe",
        "-v", "error",
        "-select_streams", "v:0",
        "-show_entries", "stream=width,height,duration:format=duration",
        "-of", "json",
        video_path
    ]

    try:
        proc = await asyncio.to_thread(
            subprocess.run,
            probe_cmd,
            capture_output=True,
            text=True,
            timeout=30,
        )
        if proc.returncode != 0:
            return None

        parsed = json.loads(proc.stdout)
        details = {}

        # Container-level duration is preferred (more reliable than the
        # per-stream value).
        fmt = parsed.get("format", {})
        if "duration" in fmt:
            details["duration"] = float(fmt["duration"])

        # Resolution comes from the first (selected) video stream.
        streams = parsed.get("streams") or []
        if streams:
            details["width"] = streams[0].get("width")
            details["height"] = streams[0].get("height")

        return details if details else None
    except Exception:
        return None
|
|
|
|
|
|
async def trim_video(
    input_path: str,
    output_path: str,
    start_time: float,
    end_time: float,
    exclude_regions: Optional[list] = None,
) -> Tuple[bool, str]:
    """
    Trim video to specified time range, optionally excluding middle sections.

    Args:
        input_path: Path to input video
        output_path: Path for output video
        start_time: Start time in seconds (clamped to >= 0)
        end_time: End time in seconds (clamped to the clip duration)
        exclude_regions: List of dicts with 'start' and 'end' keys for
            sections to remove from within the trimmed range

    Returns:
        Tuple of (success, message)
    """
    if not os.path.exists(input_path):
        return False, f"Input video not found: {input_path}"

    # Validate time range against the actual clip length.
    duration = await get_video_duration(input_path)
    if duration is None:
        return False, "Could not get video duration"

    if start_time < 0:
        start_time = 0
    if end_time > duration:
        end_time = duration
    if start_time >= end_time:
        return False, f"Invalid time range: start ({start_time}) >= end ({end_time})"

    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    # With exclude regions, delegate to the segment-concat implementation.
    if exclude_regions and len(exclude_regions) > 0:
        return await _trim_with_exclude_regions(
            input_path, output_path, start_time, end_time, exclude_regions
        )

    # Simple trim without exclude regions.
    trim_duration = end_time - start_time

    print(f"[Trim] Input: {input_path}")
    print(f"[Trim] Original duration: {duration:.3f}s")
    print(f"[Trim] Requested: start={start_time:.3f}s, end={end_time:.3f}s")
    print(f"[Trim] Output duration should be: {trim_duration:.3f}s")

    # -ss BEFORE -i does input-side seeking (faster and more reliable for end
    # trimming); -t bounds the output duration; re-encoding gives a
    # frame-accurate cut.
    cmd = [
        "ffmpeg", "-y",
        "-accurate_seek",  # enable accurate seeking
        "-ss", str(start_time),  # input seeking (before -i)
        "-i", input_path,
        "-t", str(trim_duration),  # duration of output
        "-c:v", "libx264",  # re-encode video for accurate cut
        "-preset", "fast",
        "-crf", "18",  # high quality (lower = better)
        "-c:a", "aac",
        "-b:a", "128k",
        "-avoid_negative_ts", "make_zero",  # fix timestamp issues
        output_path
    ]

    print(f"[Trim] Command: {' '.join(cmd)}")

    try:
        result = await asyncio.to_thread(
            subprocess.run,
            cmd,
            capture_output=True,
            text=True,
            timeout=120,
        )

        if result.returncode != 0:
            error_msg = result.stderr[-300:] if result.stderr else "Unknown error"
            print(f"[Trim] FFmpeg error: {error_msg}")
            return False, f"Trim failed: {error_msg}"

        if os.path.exists(output_path):
            new_duration = await get_video_duration(output_path)
            if new_duration is None:
                # Probing the result failed, but the trim itself succeeded.
                # (Previously this crashed on formatting None and was
                # mis-reported as a trim failure.)
                return True, "Video trimmed successfully"
            print(f"[Trim] Success! New duration: {new_duration:.3f}s (expected: {trim_duration:.3f}s)")
            print(f"[Trim] Difference from expected: {abs(new_duration - trim_duration):.3f}s")
            return True, f"Video trimmed successfully ({new_duration:.1f}s)"
        else:
            print("[Trim] Error: Output file not created")
            return False, "Output file not created"

    except subprocess.TimeoutExpired:
        print("[Trim] Error: Timeout")
        return False, "Trim operation timed out"
    except Exception as e:
        print(f"[Trim] Error: {str(e)}")
        return False, f"Trim error: {str(e)}"
|
|
|
|
|
|
async def _trim_with_exclude_regions(
|
|
input_path: str,
|
|
output_path: str,
|
|
start_time: float,
|
|
end_time: float,
|
|
exclude_regions: list,
|
|
) -> Tuple[bool, str]:
|
|
"""
|
|
Trim video with exclude regions - cuts out specified sections and concatenates remaining parts.
|
|
|
|
Uses FFmpeg's filter_complex with trim and concat filters.
|
|
"""
|
|
import tempfile
|
|
|
|
print(f"[Trim] Trimming with {len(exclude_regions)} exclude regions")
|
|
print(f"[Trim] Main range: {start_time:.3f}s - {end_time:.3f}s")
|
|
for i, region in enumerate(exclude_regions):
|
|
print(f"[Trim] Exclude region {i}: {region['start']:.3f}s - {region['end']:.3f}s")
|
|
|
|
# Sort and merge overlapping exclude regions
|
|
sorted_regions = sorted(exclude_regions, key=lambda r: r['start'])
|
|
merged_regions = []
|
|
for region in sorted_regions:
|
|
# Clip region to main trim range
|
|
region_start = max(region['start'], start_time)
|
|
region_end = min(region['end'], end_time)
|
|
if region_start >= region_end:
|
|
continue # Skip invalid regions
|
|
|
|
if merged_regions and region_start <= merged_regions[-1]['end']:
|
|
merged_regions[-1]['end'] = max(merged_regions[-1]['end'], region_end)
|
|
else:
|
|
merged_regions.append({'start': region_start, 'end': region_end})
|
|
|
|
if not merged_regions:
|
|
# No valid exclude regions, use simple trim
|
|
print("[Trim] No valid exclude regions after merging, using simple trim")
|
|
return await trim_video(input_path, output_path, start_time, end_time, None)
|
|
|
|
# Calculate keep segments (inverse of exclude regions)
|
|
keep_segments = []
|
|
current_pos = start_time
|
|
|
|
for region in merged_regions:
|
|
if current_pos < region['start']:
|
|
keep_segments.append({'start': current_pos, 'end': region['start']})
|
|
current_pos = region['end']
|
|
|
|
# Add final segment if there's remaining time
|
|
if current_pos < end_time:
|
|
keep_segments.append({'start': current_pos, 'end': end_time})
|
|
|
|
if not keep_segments:
|
|
return False, "No video segments remaining after excluding regions"
|
|
|
|
print(f"[Trim] Keep segments: {keep_segments}")
|
|
|
|
# Calculate expected output duration
|
|
expected_duration = sum(seg['end'] - seg['start'] for seg in keep_segments)
|
|
print(f"[Trim] Expected output duration: {expected_duration:.3f}s")
|
|
|
|
# Build FFmpeg filter_complex for concatenation
|
|
# Each segment needs: trim, setpts for video; atrim, asetpts for audio
|
|
video_filters = []
|
|
audio_filters = []
|
|
segment_labels = []
|
|
|
|
for i, seg in enumerate(keep_segments):
|
|
seg_duration = seg['end'] - seg['start']
|
|
# Video filter: trim and reset timestamps
|
|
video_filters.append(
|
|
f"[0:v]trim=start={seg['start']:.6f}:end={seg['end']:.6f},setpts=PTS-STARTPTS[v{i}]"
|
|
)
|
|
# Audio filter: atrim and reset timestamps
|
|
audio_filters.append(
|
|
f"[0:a]atrim=start={seg['start']:.6f}:end={seg['end']:.6f},asetpts=PTS-STARTPTS[a{i}]"
|
|
)
|
|
segment_labels.append(f"[v{i}][a{i}]")
|
|
|
|
# Concat filter
|
|
concat_input = "".join(segment_labels)
|
|
filter_complex = ";".join(video_filters + audio_filters)
|
|
filter_complex += f";{concat_input}concat=n={len(keep_segments)}:v=1:a=1[outv][outa]"
|
|
|
|
cmd = [
|
|
"ffmpeg", "-y",
|
|
"-i", input_path,
|
|
"-filter_complex", filter_complex,
|
|
"-map", "[outv]",
|
|
"-map", "[outa]",
|
|
"-c:v", "libx264",
|
|
"-preset", "fast",
|
|
"-crf", "18",
|
|
"-c:a", "aac",
|
|
"-b:a", "128k",
|
|
"-avoid_negative_ts", "make_zero",
|
|
output_path
|
|
]
|
|
|
|
print(f"[Trim] Command: ffmpeg -y -i {input_path} -filter_complex [complex] -map [outv] -map [outa] ...")
|
|
print(f"[Trim] Filter complex: {filter_complex[:200]}..." if len(filter_complex) > 200 else f"[Trim] Filter complex: {filter_complex}")
|
|
|
|
try:
|
|
result = await asyncio.to_thread(
|
|
subprocess.run,
|
|
cmd,
|
|
capture_output=True,
|
|
text=True,
|
|
timeout=300, # Longer timeout for complex operations
|
|
)
|
|
|
|
if result.returncode != 0:
|
|
error_msg = result.stderr[-500:] if result.stderr else "Unknown error"
|
|
print(f"[Trim] FFmpeg error: {error_msg}")
|
|
return False, f"Trim with exclude regions failed: {error_msg}"
|
|
|
|
if os.path.exists(output_path):
|
|
new_duration = await get_video_duration(output_path)
|
|
print(f"[Trim] Success! New duration: {new_duration:.3f}s (expected: {expected_duration:.3f}s)")
|
|
return True, f"Video trimmed successfully ({new_duration:.1f}s, excluded {len(merged_regions)} regions)"
|
|
else:
|
|
print("[Trim] Error: Output file not created")
|
|
return False, "Output file not created"
|
|
|
|
except subprocess.TimeoutExpired:
|
|
print("[Trim] Error: Timeout")
|
|
return False, "Trim operation timed out"
|
|
except Exception as e:
|
|
print(f"[Trim] Error: {str(e)}")
|
|
return False, f"Trim error: {str(e)}"
|
|
|
|
|
|
async def extract_frame(
    video_path: str,
    output_path: str,
    timestamp: float,
) -> Tuple[bool, str]:
    """
    Grab a single still image from a video.

    Args:
        video_path: Path to input video
        output_path: Path for output image (jpg/png)
        timestamp: Time in seconds

    Returns:
        Tuple of (success, message)
    """
    if not os.path.exists(video_path):
        return False, f"Video not found: {video_path}"

    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    # -ss before -i seeks quickly to the timestamp; -frames:v 1 emits exactly
    # one frame at high quality (-q:v 2).
    grab_cmd = [
        "ffmpeg", "-y",
        "-ss", str(timestamp),
        "-i", video_path,
        "-frames:v", "1",
        "-q:v", "2",
        output_path
    ]

    try:
        proc = await asyncio.to_thread(
            subprocess.run,
            grab_cmd,
            capture_output=True,
            text=True,
            timeout=30,
        )
        if proc.returncode == 0 and os.path.exists(output_path):
            return True, "Frame extracted"
        return False, proc.stderr[-200:] if proc.stderr else "Unknown error"
    except Exception as e:
        return False, str(e)
|
|
|
|
|
|
async def get_audio_duration(audio_path: str) -> Optional[float]:
    """Return the duration of an audio file in seconds, or None on failure.

    Delegates to get_video_duration: the ffprobe format=duration query it
    issues works identically for audio-only files.
    """
    return await get_video_duration(audio_path)
|
|
|
|
|
|
async def extract_audio(video_path: str, output_path: str) -> Tuple[bool, str]:
    """Extract audio from video as mono 16 kHz PCM (Whisper-friendly).

    Args:
        video_path: Path to the source video
        output_path: Destination audio path (WAV recommended)

    Returns:
        Tuple of (success, message)
    """
    cmd = [
        "ffmpeg", "-y",
        "-i", video_path,
        "-vn",  # drop video
        "-acodec", "pcm_s16le",  # raw PCM for STT
        "-ar", "16000",  # 16 kHz sample rate
        "-ac", "1",  # mono
        output_path
    ]

    try:
        # Run in a worker thread to avoid blocking the event loop
        # (was a direct subprocess.run, unlike the other async helpers).
        result = await asyncio.to_thread(
            subprocess.run,
            cmd,
            capture_output=True,
            text=True,
            timeout=120,
        )
        if result.returncode == 0:
            return True, "Audio extracted"
        # Truncate stderr like the other helpers so messages stay readable.
        return False, result.stderr[-300:] if result.stderr else "Unknown error"
    except Exception as e:
        return False, str(e)
|
|
|
|
|
|
async def extract_audio_with_noise_reduction(
    video_path: str,
    output_path: str,
    noise_reduction_level: str = "medium"
) -> Tuple[bool, str]:
    """
    Extract audio from video with noise reduction for better STT accuracy.

    Args:
        video_path: Path to input video
        output_path: Path for output audio (WAV format recommended)
        noise_reduction_level: "light", "medium", or "heavy"

    Returns:
        Tuple of (success, message)
    """
    if not os.path.exists(video_path):
        return False, f"Video file not found: {video_path}"

    # Band-pass for speech: drop rumble below 80 Hz and hiss above 8 kHz.
    audio_chain = ["highpass=f=80", "lowpass=f=8000"]

    # Level-specific denoising on top of the band-pass; "light" adds nothing
    # beyond the band-pass itself.
    if noise_reduction_level == "medium":
        # FFT denoiser: nr = reduction amount, nf = noise floor, nt=w = white noise
        audio_chain.append("afftdn=nf=-25:nr=10:nt=w")
    elif noise_reduction_level == "heavy":
        audio_chain.append("afftdn=nf=-20:nr=20:nt=w")
        # Dynamic range compression to even out the volume.
        audio_chain.append("acompressor=threshold=-20dB:ratio=4:attack=5:release=50")

    # Loudness normalization as the final stage.
    audio_chain.append("loudnorm=I=-16:TP=-1.5:LRA=11")

    extraction_cmd = [
        "ffmpeg", "-y",
        "-i", video_path,
        "-vn",  # no video
        "-af", ",".join(audio_chain),
        "-acodec", "pcm_s16le",  # PCM format for Whisper
        "-ar", "16000",  # 16kHz sample rate (Whisper optimal)
        "-ac", "1",  # mono
        output_path
    ]

    try:
        # Off-load the blocking FFmpeg call to a worker thread.
        proc = await asyncio.to_thread(
            subprocess.run,
            extraction_cmd,
            capture_output=True,
            text=True,
            timeout=120,
        )
    except subprocess.TimeoutExpired:
        return False, "Audio extraction timed out"
    except Exception as e:
        return False, f"Audio extraction error: {str(e)}"

    if proc.returncode != 0:
        failure_detail = proc.stderr[-300:] if proc.stderr else "Unknown error"
        return False, f"Audio extraction failed: {failure_detail}"

    if not os.path.exists(output_path):
        return False, "Output file not created"

    return True, f"Audio extracted with {noise_reduction_level} noise reduction"
|
|
|
|
|
|
async def analyze_audio_noise_level(audio_path: str) -> Optional[dict]:
    """
    Analyze audio to detect noise level.

    Returns dict with mean_volume / max_volume (dB) parsed from FFmpeg's
    volumedetect filter, or None on failure or if nothing was parsed.
    """
    cmd = [
        "ffmpeg",
        "-i", audio_path,
        "-af", "volumedetect",
        "-f", "null",  # analyze only, discard output
        "-"
    ]

    try:
        # Run in a worker thread to avoid blocking the event loop
        # (was a direct subprocess.run, unlike the other async helpers).
        result = await asyncio.to_thread(
            subprocess.run,
            cmd,
            capture_output=True,
            text=True,
            timeout=60,
        )

        # volumedetect reports on stderr; the return code is intentionally
        # not checked, matching the original behavior.
        info = {}
        for line in result.stderr.split('\n'):
            if 'mean_volume' in line:
                info['mean_volume'] = float(line.split(':')[1].strip().replace(' dB', ''))
            elif 'max_volume' in line:
                info['max_volume'] = float(line.split(':')[1].strip().replace(' dB', ''))

        return info if info else None

    except Exception:
        return None
|
|
|
|
|
|
async def has_audio_stream(video_path: str) -> bool:
    """
    Check if video file has an audio stream.

    Returns:
        True if video has audio, False otherwise (including probe failure)
    """
    cmd = [
        "ffprobe",
        "-v", "error",
        "-select_streams", "a",  # select only audio streams
        "-show_entries", "stream=codec_type",
        "-of", "csv=p=0",
        video_path
    ]

    try:
        # Run in a worker thread to avoid blocking the event loop
        # (was a direct subprocess.run, unlike the other async helpers).
        result = await asyncio.to_thread(
            subprocess.run,
            cmd,
            capture_output=True,
            text=True,
            timeout=30,
        )
        # If there's audio, ffprobe will output "audio".
        return "audio" in result.stdout.lower()
    except Exception:
        return False
|
|
|
|
|
|
async def get_audio_volume_info(video_path: str) -> Optional[dict]:
    """
    Get audio volume information to detect silent audio.

    Returns:
        dict with mean_volume / max_volume (dB), or None if no audio or error
    """
    # First check if audio stream exists; volumedetect is pointless otherwise.
    if not await has_audio_stream(video_path):
        return None

    cmd = [
        "ffmpeg",
        "-i", video_path,
        "-af", "volumedetect",
        "-f", "null",  # analyze only, discard output
        "-"
    ]

    try:
        # Run in a worker thread to avoid blocking the event loop
        # (was a direct subprocess.run, unlike the other async helpers).
        result = await asyncio.to_thread(
            subprocess.run,
            cmd,
            capture_output=True,
            text=True,
            timeout=60,
        )

        # volumedetect reports its measurements on stderr.
        info = {}
        for line in result.stderr.split('\n'):
            if 'mean_volume' in line:
                info['mean_volume'] = float(line.split(':')[1].strip().replace(' dB', ''))
            elif 'max_volume' in line:
                info['max_volume'] = float(line.split(':')[1].strip().replace(' dB', ''))

        return info if info else None

    except Exception:
        return None
|
|
|
|
|
|
def is_audio_silent(volume_info: Optional[dict], threshold_db: float = -50.0) -> bool:
    """Report whether measured audio is effectively silent.

    Args:
        volume_info: dict from get_audio_volume_info (may be None).
        threshold_db: Mean volume below this counts as silent (default -50dB).

    Returns:
        True when there is no volume info at all, or the mean volume falls
        below the threshold; False otherwise.
    """
    # No measurement (or empty dict) means no usable audio — treat as silent.
    if not volume_info:
        return True
    # Missing mean_volume defaults to -100 dB, i.e. silent.
    return volume_info.get('mean_volume', -100) < threshold_db
|