Initial commit: YouTube Shorts maker application
Features: - Video download from TikTok/Douyin using yt-dlp - Audio transcription with OpenAI Whisper - GPT-4 translation (direct/summarize/rewrite modes) - Subtitle generation with ASS format - Video trimming with frame-accurate preview - BGM integration with volume control - Intro text overlay support - Thumbnail generation with text overlay Tech stack: - Backend: FastAPI, Python 3.11+ - Frontend: React, Vite, TailwindCSS - Video processing: FFmpeg - AI: OpenAI Whisper, GPT-4 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
399
backend/app/services/thumbnail.py
Normal file
399
backend/app/services/thumbnail.py
Normal file
@@ -0,0 +1,399 @@
|
||||
"""
|
||||
Thumbnail Generator Service
|
||||
|
||||
Generates YouTube Shorts thumbnails with:
|
||||
1. Frame extraction from video
|
||||
2. GPT-generated catchphrase
|
||||
3. Text overlay with styling
|
||||
"""
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import asyncio
|
||||
from typing import Optional, Tuple, List
|
||||
from openai import OpenAI
|
||||
from PIL import Image, ImageDraw, ImageFont
|
||||
from app.config import settings
|
||||
from app.models.schemas import TranscriptSegment
|
||||
|
||||
|
||||
def get_openai_client() -> OpenAI:
    """Build an OpenAI API client from the key configured in app settings."""
    api_key = settings.OPENAI_API_KEY
    return OpenAI(api_key=api_key)
|
||||
|
||||
|
||||
async def extract_frame(
    video_path: str,
    output_path: str,
    timestamp: float = 2.0,
) -> Tuple[bool, str]:
    """
    Extract a single frame from a video with FFmpeg.

    Args:
        video_path: Path to the source video file.
        output_path: Where to write the extracted frame image.
        timestamp: Position in seconds at which to grab the frame.

    Returns:
        Tuple of (success, message).
    """
    # -ss before -i performs input seeking, which is fast for a single frame.
    ffmpeg_args = [
        "ffmpeg", "-y",
        "-ss", str(timestamp),
        "-i", video_path,
        "-vframes", "1",
        "-q:v", "2",  # High quality JPEG
        output_path,
    ]

    try:
        proc = await asyncio.create_subprocess_exec(
            *ffmpeg_args,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
        _, err_output = await proc.communicate()

        if proc.returncode != 0:
            return False, f"FFmpeg error: {err_output.decode()[:200]}"

        if not os.path.exists(output_path):
            return False, "Frame extraction failed - no output file"

        return True, "Frame extracted successfully"
    except Exception as exc:
        return False, f"Frame extraction error: {str(exc)}"
|
||||
|
||||
|
||||
async def generate_catchphrase(
    transcript: List[TranscriptSegment],
    style: str = "homeshopping",
) -> Tuple[bool, str, str]:
    """
    Generate a catchy thumbnail text using GPT.

    Args:
        transcript: List of transcript segments (with translations)
        style: Style of catchphrase (homeshopping, viral, informative)

    Returns:
        Tuple of (success, message, catchphrase)
    """
    if not settings.OPENAI_API_KEY:
        return False, "OpenAI API key not configured", ""

    try:
        client = get_openai_client()

        # Prefer translated text when available; fall back to source text.
        if transcript and transcript[0].translated:
            full_text = " ".join(seg.translated for seg in transcript if seg.translated)
        else:
            full_text = " ".join(seg.text for seg in transcript)

        style_guides = {
            "homeshopping": """홈쇼핑 스타일의 임팩트 있는 문구를 만드세요.
- "이거 하나면 끝!" 같은 강렬한 어필
- 혜택/효과 강조
- 숫자 활용 (예: "10초만에", "50% 절약")
- 질문형도 OK (예: "아직도 힘들게?")""",
            "viral": """바이럴 쇼츠 스타일의 호기심 유발 문구를 만드세요.
- 궁금증 유발
- 반전/놀라움 암시
- 이모지 1-2개 사용 가능""",
            "informative": """정보성 콘텐츠 스타일의 명확한 문구를 만드세요.
- 핵심 정보 전달
- 간결하고 명확하게""",
        }

        style_guide = style_guides.get(style, style_guides["homeshopping"])

        system_prompt = f"""당신은 YouTube Shorts 썸네일 문구 전문가입니다.

{style_guide}

규칙:
- 반드시 15자 이내!
- 한 줄로 작성
- 한글만 사용 (영어/한자 금지)
- 출력은 문구만! (설명 없이)

예시 출력:
이거 하나면 끝!
10초면 완성!
아직도 힘들게?
진짜 이게 돼요?"""

        # The OpenAI SDK call is blocking; run it in a worker thread so the
        # event loop keeps serving other coroutines while we wait on the API.
        response = await asyncio.to_thread(
            client.chat.completions.create,
            model=settings.OPENAI_MODEL,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": f"다음 영상 내용으로 썸네일 문구를 만들어주세요:\n\n{full_text[:500]}"},
            ],
            temperature=0.8,
            max_tokens=50,
        )

        # message.content can be None (e.g. refusals); guard before strip().
        content = response.choices[0].message.content or ""
        catchphrase = content.strip()
        # Remove surrounding quote characters the model sometimes adds.
        catchphrase = catchphrase.strip('"\'""''')

        # Hard cap the length so the overlay always fits the thumbnail.
        if len(catchphrase) > 20:
            catchphrase = catchphrase[:20]

        return True, "Catchphrase generated", catchphrase

    except Exception as e:
        return False, f"GPT error: {str(e)}", ""
|
||||
|
||||
|
||||
def add_text_overlay(
    image_path: str,
    output_path: str,
    text: str,
    font_size: int = 80,
    font_color: str = "#FFFFFF",
    stroke_color: str = "#000000",
    stroke_width: int = 4,
    position: str = "center",
    font_name: str = "NanumGothicBold",
) -> Tuple[bool, str]:
    """
    Add text overlay to image using PIL.

    Args:
        image_path: Input image path
        output_path: Output image path
        text: Text to overlay
        font_size: Font size in pixels
        font_color: Text color (hex)
        stroke_color: Outline color (hex)
        stroke_width: Outline thickness
        position: Text position (top, center, bottom)
        font_name: Font family name

    Returns:
        Tuple of (success, message)
    """
    try:
        img = Image.open(image_path)
        draw = ImageDraw.Draw(img)
        img_width, img_height = img.size

        # Text may occupy at most 90% of the image width.
        max_text_width = int(img_width * 0.9)

        def load_font(size):
            """Try known Linux/macOS font locations; return None if none load."""
            font_paths = [
                f"/usr/share/fonts/truetype/nanum/{font_name}.ttf",
                f"/usr/share/fonts/opentype/nanum/{font_name}.otf",
                f"/System/Library/Fonts/{font_name}.ttf",
                f"/Library/Fonts/{font_name}.ttf",
                f"~/Library/Fonts/{font_name}.ttf",
                f"/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf",
            ]
            for path in font_paths:
                expanded_path = os.path.expanduser(path)
                if os.path.exists(expanded_path):
                    try:
                        return ImageFont.truetype(expanded_path, size)
                    except OSError:  # unreadable/corrupt font file - try next
                        continue
            return None

        font = load_font(font_size)
        if font is None:
            # PIL's built-in bitmap font is small; shrink metrics to match.
            font = ImageFont.load_default()
            font_size = 40

        def line_width(line):
            """Rendered pixel width of one line with the current font."""
            bbox = draw.textbbox((0, 0), line, font=font)
            return bbox[2] - bbox[0]

        lines = [text]

        if line_width(text) > max_text_width:
            # Too wide for one line: split into 2 lines near the middle,
            # preferring a space/comma within 5 chars of the midpoint.
            mid = len(text) // 2
            split_pos = mid
            for i in range(mid, max(0, mid - 5), -1):
                if text[i] in ' ,、,':
                    split_pos = i + 1
                    break
            for i in range(mid, min(len(text), mid + 5)):
                if text[i] in ' ,、,':
                    split_pos = i + 1
                    break

            line1 = text[:split_pos].strip()
            line2 = text[split_pos:].strip()
            lines = [line1, line2] if line2 else [line1]

            # If the 2-line version is still too wide, step the font down
            # until it fits or we hit the minimum size.
            max_line_width = max(line_width(line) for line in lines)
            while max_line_width > max_text_width and font_size > 40:
                font_size -= 5
                font = load_font(font_size)
                if font is None:
                    font = ImageFont.load_default()
                    break
                max_line_width = max(line_width(line) for line in lines)

        # Vertical layout for the (possibly multi-line) text block.
        line_height = font_size + 10
        total_height = line_height * len(lines)

        if position == "top":
            start_y = img_height // 6
        elif position == "bottom":
            start_y = img_height - img_height // 4 - total_height
        else:  # center
            start_y = (img_height - total_height) // 2

        def hex_to_rgb(hex_color):
            """Convert '#RRGGBB' to an (r, g, b) tuple."""
            hex_color = hex_color.lstrip('#')
            return tuple(int(hex_color[i:i + 2], 16) for i in (0, 2, 4))

        text_rgb = hex_to_rgb(font_color)
        stroke_rgb = hex_to_rgb(stroke_color)

        for i, line in enumerate(lines):
            bbox = draw.textbbox((0, 0), line, font=font)
            # Subtract the left bearing (bbox[0]) so glyphs with a non-zero
            # left offset are not clipped at the first character.
            x = (img_width - (bbox[2] - bbox[0])) // 2 - bbox[0]
            y = start_y + i * line_height

            # Pillow renders the outline natively via stroke_width/stroke_fill;
            # this replaces the old O(stroke_width^2) loop of offset draws.
            draw.text(
                (x, y), line, font=font, fill=text_rgb,
                stroke_width=stroke_width, stroke_fill=stroke_rgb,
            )

        # JPEG has no alpha channel; convert defensively in case the source
        # frame is not already RGB (e.g. an RGBA or palette PNG input).
        img.convert("RGB").save(output_path, "JPEG", quality=95)

        return True, "Text overlay added"

    except Exception as e:
        return False, f"Text overlay error: {str(e)}"
|
||||
|
||||
|
||||
async def generate_thumbnail(
    job_id: str,
    video_path: str,
    transcript: List[TranscriptSegment],
    timestamp: float = 2.0,
    style: str = "homeshopping",
    custom_text: Optional[str] = None,
    font_size: int = 80,
    position: str = "center",
) -> Tuple[bool, str, Optional[str]]:
    """
    Generate a complete thumbnail with text overlay.

    Args:
        job_id: Job ID for naming
        video_path: Path to video file
        transcript: Transcript segments
        timestamp: Time to extract frame
        style: Catchphrase style
        custom_text: Custom text (skip GPT generation)
        font_size: Font size
        position: Text position

    Returns:
        Tuple of (success, message, thumbnail_path)
    """
    # Intermediate frame and final thumbnail locations.
    frame_path = os.path.join(settings.PROCESSED_DIR, f"{job_id}_frame.jpg")
    thumbnail_path = os.path.join(settings.PROCESSED_DIR, f"{job_id}_thumbnail.jpg")

    # Step 1: Extract frame
    success, msg = await extract_frame(video_path, frame_path, timestamp)
    if not success:
        return False, msg, None

    try:
        # Step 2: Generate or use custom text
        if custom_text:
            catchphrase = custom_text
        else:
            success, msg, catchphrase = await generate_catchphrase(transcript, style)
            if not success:
                # Fallback: use first translation
                catchphrase = transcript[0].translated if transcript and transcript[0].translated else "확인해보세요!"

        # Step 3: Add text overlay
        success, msg = add_text_overlay(
            frame_path,
            thumbnail_path,
            catchphrase,
            font_size=font_size,
            position=position,
        )

        if not success:
            return False, msg, None

        return True, f"Thumbnail generated: {catchphrase}", thumbnail_path
    finally:
        # Always remove the intermediate frame — the previous version leaked
        # it when the overlay step failed.
        if os.path.exists(frame_path):
            os.remove(frame_path)
|
||||
|
||||
|
||||
async def get_video_timestamps(video_path: str, count: int = 5) -> List[float]:
    """
    Get evenly distributed timestamps from video for thumbnail selection.

    Args:
        video_path: Path to video
        count: Number of timestamps to return

    Returns:
        List of timestamps in seconds (fixed fallback values if probing fails)
    """
    try:
        cmd = [
            "ffprobe", "-v", "error",
            "-show_entries", "format=duration",
            "-of", "default=noprint_wrappers=1:nokey=1",
            video_path,
        ]

        # Probe asynchronously — the previous subprocess.run call blocked the
        # event loop for the whole ffprobe run.
        process = await asyncio.create_subprocess_exec(
            *cmd,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
        stdout, _ = await process.communicate()
        duration = float(stdout.decode().strip())

        # Evenly spaced picks, skipping the first and last 10% of the video.
        start = duration * 0.1
        end = duration * 0.9
        step = (end - start) / (count - 1) if count > 1 else 0

        return [start + i * step for i in range(count)]

    except Exception:
        return [1.0, 3.0, 5.0, 7.0, 10.0]  # Fallback
|
||||
Reference in New Issue
Block a user