Files
bini-shorts-maker/backend/app/services/thumbnail.py
kihong.kim c3795138da Initial commit: YouTube Shorts maker application
Features:
- Video download from TikTok/Douyin using yt-dlp
- Audio transcription with OpenAI Whisper
- GPT-4 translation (direct/summarize/rewrite modes)
- Subtitle generation with ASS format
- Video trimming with frame-accurate preview
- BGM integration with volume control
- Intro text overlay support
- Thumbnail generation with text overlay

Tech stack:
- Backend: FastAPI, Python 3.11+
- Frontend: React, Vite, TailwindCSS
- Video processing: FFmpeg
- AI: OpenAI Whisper, GPT-4

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-03 21:38:34 +09:00

400 lines
12 KiB
Python

"""
Thumbnail Generator Service
Generates YouTube Shorts thumbnails with:
1. Frame extraction from video
2. GPT-generated catchphrase
3. Text overlay with styling
"""
import os
import subprocess
import asyncio
from typing import Optional, Tuple, List
from openai import OpenAI
from PIL import Image, ImageDraw, ImageFont
from app.config import settings
from app.models.schemas import TranscriptSegment
def get_openai_client() -> OpenAI:
    """Build an OpenAI client configured with the project's API key."""
    client = OpenAI(api_key=settings.OPENAI_API_KEY)
    return client
async def extract_frame(
    video_path: str,
    output_path: str,
    timestamp: float = 2.0,
) -> Tuple[bool, str]:
    """
    Grab a single frame from a video using ffmpeg.

    Args:
        video_path: Source video file
        output_path: Where the extracted frame image is written
        timestamp: Position in seconds at which to capture the frame

    Returns:
        Tuple of (success, message)
    """
    ffmpeg_args = [
        "ffmpeg", "-y",
        "-ss", str(timestamp),  # seek placed before -i for fast input seeking
        "-i", video_path,
        "-vframes", "1",        # capture exactly one frame
        "-q:v", "2",            # high-quality JPEG output
        output_path,
    ]
    try:
        proc = await asyncio.create_subprocess_exec(
            *ffmpeg_args,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
        _out, err_bytes = await proc.communicate()

        if proc.returncode != 0:
            return False, f"FFmpeg error: {err_bytes.decode()[:200]}"
        if not os.path.exists(output_path):
            return False, "Frame extraction failed - no output file"
        return True, "Frame extracted successfully"
    except Exception as exc:
        return False, f"Frame extraction error: {str(exc)}"
async def generate_catchphrase(
    transcript: List[TranscriptSegment],
    style: str = "homeshopping",
) -> Tuple[bool, str, str]:
    """
    Generate a catchy thumbnail text using GPT.

    Args:
        transcript: List of transcript segments (with translations)
        style: Style of catchphrase (homeshopping, viral, informative)

    Returns:
        Tuple of (success, message, catchphrase)
    """
    if not settings.OPENAI_API_KEY:
        return False, "OpenAI API key not configured", ""
    try:
        client = get_openai_client()

        # Prefer translated text when the first segment carries a translation;
        # otherwise fall back to the raw transcript text.
        if transcript and transcript[0].translated:
            full_text = " ".join([seg.translated for seg in transcript if seg.translated])
        else:
            full_text = " ".join([seg.text for seg in transcript])

        style_guides = {
            "homeshopping": """홈쇼핑 스타일의 임팩트 있는 문구를 만드세요.
- "이거 하나면 끝!" 같은 강렬한 어필
- 혜택/효과 강조
- 숫자 활용 (예: "10초만에", "50% 절약")
- 질문형도 OK (예: "아직도 힘들게?")""",
            "viral": """바이럴 쇼츠 스타일의 호기심 유발 문구를 만드세요.
- 궁금증 유발
- 반전/놀라움 암시
- 이모지 1-2개 사용 가능""",
            "informative": """정보성 콘텐츠 스타일의 명확한 문구를 만드세요.
- 핵심 정보 전달
- 간결하고 명확하게""",
        }
        # Unknown styles fall back to the homeshopping guide.
        style_guide = style_guides.get(style, style_guides["homeshopping"])

        system_prompt = f"""당신은 YouTube Shorts 썸네일 문구 전문가입니다.
{style_guide}
규칙:
- 반드시 15자 이내!
- 한 줄로 작성
- 한글만 사용 (영어/한자 금지)
- 출력은 문구만! (설명 없이)
예시 출력:
이거 하나면 끝!
10초면 완성!
아직도 힘들게?
진짜 이게 돼요?"""

        # The OpenAI SDK call is synchronous; run it in a worker thread so
        # the event loop is not blocked while waiting on the network.
        response = await asyncio.to_thread(
            client.chat.completions.create,
            model=settings.OPENAI_MODEL,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": f"다음 영상 내용으로 썸네일 문구를 만들어주세요:\n\n{full_text[:500]}"}
            ],
            temperature=0.8,
            max_tokens=50,
        )
        catchphrase = response.choices[0].message.content.strip()

        # Strip surrounding straight/curly quotes the model sometimes adds.
        catchphrase = catchphrase.strip('"\'""''')

        # Hard cap: the prompt asks for <=15 chars but the model may overshoot.
        if len(catchphrase) > 20:
            catchphrase = catchphrase[:20]

        return True, "Catchphrase generated", catchphrase
    except Exception as e:
        return False, f"GPT error: {str(e)}", ""
def add_text_overlay(
    image_path: str,
    output_path: str,
    text: str,
    font_size: int = 80,
    font_color: str = "#FFFFFF",
    stroke_color: str = "#000000",
    stroke_width: int = 4,
    position: str = "center",
    font_name: str = "NanumGothicBold",
) -> Tuple[bool, str]:
    """
    Add text overlay to image using PIL.

    Args:
        image_path: Input image path
        output_path: Output image path
        text: Text to overlay
        font_size: Font size in pixels
        font_color: Text color (hex)
        stroke_color: Outline color (hex)
        stroke_width: Outline thickness
        position: Text position (top, center, bottom)
        font_name: Font family name

    Returns:
        Tuple of (success, message)
    """
    try:
        # Force RGB so inputs with an alpha channel (e.g. PNG) can still be
        # saved as JPEG at the end — JPEG has no alpha and save() would raise.
        img = Image.open(image_path).convert("RGB")
        draw = ImageDraw.Draw(img)
        img_width, img_height = img.size

        # Maximum text width (90% of image width)
        max_text_width = int(img_width * 0.9)

        def load_font(size):
            # Probe common Linux (Nanum/DejaVu) and macOS font locations;
            # returns None when no candidate loads.
            font_paths = [
                f"/usr/share/fonts/truetype/nanum/{font_name}.ttf",
                f"/usr/share/fonts/opentype/nanum/{font_name}.otf",
                f"/System/Library/Fonts/{font_name}.ttf",
                f"/Library/Fonts/{font_name}.ttf",
                f"~/Library/Fonts/{font_name}.ttf",
                f"/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf",
            ]
            for path in font_paths:
                expanded_path = os.path.expanduser(path)
                if os.path.exists(expanded_path):
                    try:
                        return ImageFont.truetype(expanded_path, size)
                    except Exception:
                        # Narrowed from a bare `except:` so KeyboardInterrupt/
                        # SystemExit are no longer swallowed here.
                        continue
            return None

        font = load_font(font_size)
        if font is None:
            # PIL's built-in bitmap font is small; shrink the layout metric
            # so spacing stays roughly proportional.
            font = ImageFont.load_default()
            font_size = 40

        # Check text width and adjust if necessary
        bbox = draw.textbbox((0, 0), text, font=font)
        text_width = bbox[2] - bbox[0]

        lines = [text]
        if text_width > max_text_width:
            # Try splitting into 2 lines, preferring a break at a space or
            # comma within 5 characters of the middle.
            mid = len(text) // 2
            split_pos = mid
            for i in range(mid, max(0, mid - 5), -1):
                if text[i] in ' ,、,':
                    split_pos = i + 1
                    break
            for i in range(mid, min(len(text), mid + 5)):
                if text[i] in ' ,、,':
                    split_pos = i + 1
                    break
            # Split text into 2 lines
            line1 = text[:split_pos].strip()
            line2 = text[split_pos:].strip()
            lines = [line1, line2] if line2 else [line1]

            # Width of the widest wrapped line
            max_line_width = max(
                draw.textbbox((0, 0), line, font=font)[2] - draw.textbbox((0, 0), line, font=font)[0]
                for line in lines
            )
            # If still too wide, step the font size down to a 40px floor
            while max_line_width > max_text_width and font_size > 40:
                font_size -= 5
                font = load_font(font_size)
                if font is None:
                    font = ImageFont.load_default()
                    break
                max_line_width = max(
                    draw.textbbox((0, 0), line, font=font)[2] - draw.textbbox((0, 0), line, font=font)[0]
                    for line in lines
                )

        # Calculate total text height for multi-line
        line_height = font_size + 10
        total_height = line_height * len(lines)

        # Calculate starting y position
        if position == "top":
            start_y = img_height // 6
        elif position == "bottom":
            start_y = img_height - img_height // 4 - total_height
        else:  # center
            start_y = (img_height - total_height) // 2

        # Convert hex colors to RGB
        def hex_to_rgb(hex_color):
            hex_color = hex_color.lstrip('#')
            return tuple(int(hex_color[i:i+2], 16) for i in (0, 2, 4))

        text_rgb = hex_to_rgb(font_color)
        stroke_rgb = hex_to_rgb(stroke_color)

        # Draw each line
        for i, line in enumerate(lines):
            bbox = draw.textbbox((0, 0), line, font=font)
            line_width = bbox[2] - bbox[0]
            # Account for left bearing (bbox[0]) to prevent first character
            # cut-off — some fonts/characters have a non-zero left offset.
            x = (img_width - line_width) // 2 - bbox[0]
            y = start_y + i * line_height

            # Draw outline by stamping the text at every offset in a
            # (2*stroke_width+1)^2 grid around the target position.
            for dx in range(-stroke_width, stroke_width + 1):
                for dy in range(-stroke_width, stroke_width + 1):
                    if dx != 0 or dy != 0:
                        draw.text((x + dx, y + dy), line, font=font, fill=stroke_rgb)

            # Draw main text on top of the outline
            draw.text((x, y), line, font=font, fill=text_rgb)

        # Save
        img.save(output_path, "JPEG", quality=95)
        return True, "Text overlay added"
    except Exception as e:
        return False, f"Text overlay error: {str(e)}"
async def generate_thumbnail(
    job_id: str,
    video_path: str,
    transcript: List[TranscriptSegment],
    timestamp: float = 2.0,
    style: str = "homeshopping",
    custom_text: Optional[str] = None,
    font_size: int = 80,
    position: str = "center",
) -> Tuple[bool, str, Optional[str]]:
    """
    Build a finished thumbnail: extract a frame, pick the text, overlay it.

    Args:
        job_id: Job ID for naming
        video_path: Path to video file
        transcript: Transcript segments
        timestamp: Time to extract frame
        style: Catchphrase style
        custom_text: Custom text (skip GPT generation)
        font_size: Font size
        position: Text position

    Returns:
        Tuple of (success, message, thumbnail_path)
    """
    frame_path = os.path.join(settings.PROCESSED_DIR, f"{job_id}_frame.jpg")
    thumbnail_path = os.path.join(settings.PROCESSED_DIR, f"{job_id}_thumbnail.jpg")

    # Step 1: pull the source frame out of the video.
    ok, detail = await extract_frame(video_path, frame_path, timestamp)
    if not ok:
        return False, detail, None

    # Step 2: choose the overlay text — caller-supplied wins over GPT.
    if custom_text:
        overlay_text = custom_text
    else:
        ok, detail, overlay_text = await generate_catchphrase(transcript, style)
        if not ok:
            # Fallback: first translated segment, else a generic Korean phrase.
            has_translation = bool(transcript) and bool(transcript[0].translated)
            overlay_text = transcript[0].translated if has_translation else "확인해보세요!"

    # Step 3: render the text onto the frame.
    ok, detail = add_text_overlay(
        frame_path,
        thumbnail_path,
        overlay_text,
        font_size=font_size,
        position=position,
    )
    if not ok:
        return False, detail, None

    # Drop the intermediate frame — only the finished thumbnail is kept.
    if os.path.exists(frame_path):
        os.remove(frame_path)

    return True, f"Thumbnail generated: {overlay_text}", thumbnail_path
async def get_video_timestamps(video_path: str, count: int = 5) -> List[float]:
    """
    Get evenly distributed timestamps from video for thumbnail selection.

    Args:
        video_path: Path to video
        count: Number of timestamps to return

    Returns:
        List of timestamps in seconds (fixed fallback list if probing fails)
    """
    try:
        cmd = [
            "ffprobe", "-v", "error",
            "-show_entries", "format=duration",
            "-of", "default=noprint_wrappers=1:nokey=1",
            video_path
        ]
        # Run ffprobe asynchronously — the previous subprocess.run() call
        # blocked the event loop for the whole probe duration.
        process = await asyncio.create_subprocess_exec(
            *cmd,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
        stdout, _ = await process.communicate()
        duration = float(stdout.decode().strip())

        # Generate evenly distributed timestamps (skip first and last 10%)
        start = duration * 0.1
        end = duration * 0.9
        step = (end - start) / (count - 1) if count > 1 else 0
        return [start + i * step for i in range(count)]
    except Exception:
        return [1.0, 3.0, 5.0, 7.0, 10.0]  # Fallback when probing fails