Initial commit: YouTube Shorts maker application

Features:
- Video download from TikTok/Douyin using yt-dlp
- Audio transcription with OpenAI Whisper
- GPT-4 translation (direct/summarize/rewrite modes)
- Subtitle generation with ASS format
- Video trimming with frame-accurate preview
- BGM integration with volume control
- Intro text overlay support
- Thumbnail generation with text overlay

Tech stack:
- Backend: FastAPI, Python 3.11+
- Frontend: React, Vite, TailwindCSS
- Video processing: FFmpeg
- AI: OpenAI Whisper, GPT-4

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
kihong.kim
2026-01-03 21:38:34 +09:00
commit c3795138da
64 changed files with 13059 additions and 0 deletions

View File

@@ -0,0 +1,468 @@
import re
from typing import List, Tuple, Optional
from openai import OpenAI
from app.models.schemas import TranscriptSegment
from app.config import settings
def get_openai_client() -> OpenAI:
    """Build an OpenAI API client using the configured API key."""
    api_key = settings.OPENAI_API_KEY
    return OpenAI(api_key=api_key)
class TranslationMode:
    """Translation mode options (string constants used as the `mode` argument)."""
    DIRECT = "direct"  # Direct translation (keeps the original structure)
    SUMMARIZE = "summarize"  # Summarize, then translate
    REWRITE = "rewrite"  # Summarize + rewrite as a fresh Korean script
async def shorten_text(client: OpenAI, text: str, max_chars: int) -> str:
    """
    Shorten a Korean subtitle text to fit within a character limit.

    Asks the model for a condensed rewrite and cleans up its output; if
    the API call fails for any reason, falls back to plain truncation
    with a trailing ellipsis.

    Args:
        client: OpenAI client
        text: Text to shorten
        max_chars: Maximum character count

    Returns:
        Shortened text (hard-capped at max_chars characters on success)
    """
    try:
        response = client.chat.completions.create(
            model=settings.OPENAI_MODEL,
            messages=[
                {
                    "role": "system",
                    "content": f"""한국어 자막을 {max_chars}자 이내로 줄이세요.
규칙:
- 반드시 {max_chars}자 이하!
- 핵심 의미만 유지
- 자연스러운 한국어
- 존댓말 유지
- 출력은 줄인 문장만!
예시:
입력: "요리할 때마다 한 시간이 걸리셨죠?" (18자)
제한: 10자
출력: "시간 오래 걸리죠" (8자)
입력: "채소 다듬는 데만 30분 걸리셨죠" (16자)
제한: 10자
출력: "채소만 30분" (6자)"""
                },
                {
                    "role": "user",
                    "content": f"입력: \"{text}\" ({len(text)}자)\n제한: {max_chars}\n출력:"
                }
            ],
            temperature=0.3,
            max_tokens=50,
        )
        shortened = response.choices[0].message.content.strip()
        # Strip straight and curly quotes anywhere in the output (the model
        # often echoes the quoting style used in the prompt examples).
        shortened = re.sub(r'["\'\u201c\u201d\u2018\u2019]', '', shortened)
        # Drop a trailing length note such as "(10자)" that the model
        # sometimes appends, mimicking the few-shot examples.
        shortened = re.sub(r'\s*\([^)]*자\)\s*$', '', shortened)
        shortened = shortened.strip()
        # Safety net: the model may still exceed the limit — hard-truncate.
        if len(shortened) > max_chars:
            shortened = shortened[:max_chars]
        return shortened
    except Exception:
        # Fallback when the API is unavailable or errors out: simple
        # truncation with an ellipsis so the cut is visible to viewers.
        if len(text) > max_chars:
            return text[:max_chars - 1] + "…"
        return text
async def translate_segments(
    segments: List[TranscriptSegment],
    target_language: str = "Korean",
    mode: str = TranslationMode.DIRECT,
    max_tokens: Optional[int] = None,
) -> Tuple[bool, str, List[TranscriptSegment]]:
    """
    Translate transcript segments to target language using OpenAI.

    Sets each segment's ``translated`` attribute in place and returns the
    same list. In SUMMARIZE/DIRECT modes, segments whose translation is
    more than ~30% over the per-segment character budget are shortened
    with a second API call per segment.

    Args:
        segments: List of transcript segments
        target_language: Target language for translation
        mode: Translation mode (direct, summarize, rewrite)
        max_tokens: Maximum output tokens (for cost control)

    Returns:
        Tuple of (success, message, translated_segments)
    """
    if not settings.OPENAI_API_KEY:
        return False, "OpenAI API key not configured", segments
    try:
        client = get_openai_client()
        # Batch translate for efficiency: one API call for all segments.
        # NOTE(review): target_language is accepted but the prompts below
        # hard-code Korean output — confirm whether it should be wired in.
        texts = [seg.text for seg in segments]
        combined_text = "\n---\n".join(texts)
        # Calculate video duration for context
        # NOTE(review): total_duration is not referenced after this point.
        total_duration = segments[-1].end if segments else 0
        # Calculate segment info for length guidance (per-segment char budget
        # shown to the model so subtitles stay in sync with speech).
        segment_info = []
        for i, seg in enumerate(segments):
            duration = seg.end - seg.start
            max_chars = int(duration * 5)  # ~5 Korean chars per second (stricter for better sync)
            segment_info.append(f"[{i+1}] {duration:.1f}초 = 최대 {max_chars}자 (엄수!)")
        # Get custom prompt settings from config (fallbacks are sensible
        # Korean defaults: friendly Shorts subtitle writer, polite tone).
        gpt_role = settings.GPT_ROLE or "친근한 유튜브 쇼츠 자막 작가"
        gpt_tone = settings.GPT_TONE or "존댓말"
        gpt_style = settings.GPT_STYLE or ""
        # Tone examples: sentence-ending samples per register.
        tone_examples = {
            "존댓말": '~해요, ~이에요, ~하죠',
            "반말": '~해, ~야, ~지',
            "격식체": '~합니다, ~입니다',
        }
        tone_example = tone_examples.get(gpt_tone, tone_examples["존댓말"])
        # Additional style instruction appended to the prompt rules.
        style_instruction = f"\n6. Style: {gpt_style}" if gpt_style else ""
        # Select prompt based on mode
        if mode == TranslationMode.REWRITE:
            # Build indexed timeline input with Chinese text.
            # Use segment numbers to handle duplicate timestamps.
            timeline_input = []
            for i, seg in enumerate(segments):
                mins = int(seg.start // 60)
                secs = int(seg.start % 60)
                timeline_input.append(f"[{i+1}] {mins}:{secs:02d} {seg.text}")
            system_prompt = f"""당신은 생활용품 유튜브 쇼츠 자막 작가입니다.
중국어 원문의 "의미"만 참고하여, 한국인이 직접 말하는 것처럼 자연스러운 자막을 작성하세요.
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
🎯 핵심 원칙: 번역이 아니라 "재창작"
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
✅ 필수 규칙:
1. 한 문장 = 한 가지 정보 (두 개 이상 금지)
2. 중복 표현 절대 금지 ("편해요"가 이미 나왔으면 다시 안 씀)
3. {gpt_tone} 사용 ({tone_example})
4. 세그먼트 수 유지: 입력 {len(segments)}개 → 출력 {len(segments)}개
5. 중국어 한자 금지, 순수 한글만
❌ 금지 표현 (번역투):
- "~할 수 있어요""~돼요", "~됩니다"
- "매우/아주/정말" 남용 → 꼭 필요할 때만
- "그것은/이것은""이거", "이건"
- "~하는 것이" → 직접 표현으로
- "편리해요/편해요" 반복 → 한 번만, 이후 다른 표현
- "좋아요/좋고요" 반복 → 구체적 장점으로 대체
🎵 쇼츠 리듬감:
- 짧게 끊어서
- 한 호흡에 하나씩
- 시청자가 따라 읽을 수 있게
📝 좋은 예시:
원문: "이 작은 박스 디자인이 참 좋네요. 평소에 씨앗 먹을 때 간편하게 먹을 수 있어요."
❌ 나쁜 번역: "이 작은 박스 디자인이 참 좋네요. 평소에 씨앗 먹을 때 간편하게 먹을 수 있어요."
✅ 좋은 재창작: "이 작은 박스, 생각보다 정말 잘 만들었어요."
원문: "테이블에 두거나 손에 들고 사용하기에도 좋고요. 침대에 누워서나 사무실에서도 간식이나 과일 먹기 정말 편해요."
❌ 나쁜 번역: "테이블에 두거나 손에 들고 사용하기에도 좋고요. 침대에 누워서나 사무실에서도 간식이나 과일 먹기 정말 편해요."
✅ 좋은 재창작 (2개로 분리):
- "테이블 위에서도, 침대에서도, 사무실에서도 사용하기 좋고"
- "과일 씻고 물기 빼는 데도 활용 가능합니다."
원문: "가정에서 필수 아이템이에요. 정말 유용하죠. 꼭 하나씩 가져야 할 제품이에요."
❌ 나쁜 번역: 그대로 3문장
✅ 좋은 재창작: "집에 하나 있으면 은근히 자주 쓰게 됩니다."{style_instruction}
출력 형식:
[번호] 시간 자막 내용
⚠️ 입력과 동일한 세그먼트 수({len(segments)}개)를 출력하세요!
⚠️ 각 [번호]는 입력과 1:1 대응해야 합니다!"""
            # Use indexed timeline format for user content (replaces the
            # '---'-joined text built above).
            combined_text = "[중국어 원문]\n\n" + "\n".join(timeline_input)
        elif mode == TranslationMode.SUMMARIZE:
            system_prompt = f"""You are: {gpt_role}
Task: Translate Chinese to SHORT Korean subtitles.
Length limits (자막 싱크!):
{chr(10).join(segment_info)}
Rules:
1. Use {gpt_tone} ({tone_example})
2. Summarize to core meaning - be BRIEF
3. Max one short sentence per segment
4. {len(segments)} segments separated by '---'{style_instruction}"""
        else:  # DIRECT mode
            system_prompt = f"""You are: {gpt_role}
Task: Translate Chinese to Korean subtitles.
Length limits (자막 싱크!):
{chr(10).join(segment_info)}
Rules:
1. Use {gpt_tone} ({tone_example})
2. Keep translations SHORT and readable
3. {len(segments)} segments separated by '---'{style_instruction}"""
        # Build API request
        request_params = {
            "model": settings.OPENAI_MODEL,
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": combined_text}
            ],
            # REWRITE is creative re-authoring, so it gets a higher temperature.
            "temperature": 0.65 if mode == TranslationMode.REWRITE else 0.3,
        }
        # Add max_tokens if specified (for cost control)
        effective_max_tokens = max_tokens or settings.TRANSLATION_MAX_TOKENS
        if effective_max_tokens:
            # Use higher token limit for REWRITE mode
            if mode == TranslationMode.REWRITE:
                request_params["max_tokens"] = max(effective_max_tokens, 700)
            else:
                request_params["max_tokens"] = effective_max_tokens
        response = client.chat.completions.create(**request_params)
        translated_text = response.choices[0].message.content
        # Parse based on mode
        if mode == TranslationMode.REWRITE:
            # Parse indexed timeline format: "[1] 0:00 자막\n[2] 0:02 자막\n..."
            indexed_pattern = re.compile(r'^\[(\d+)\]\s*\d+:\d{2}\s+(.+)$', re.MULTILINE)
            matches = indexed_pattern.findall(translated_text)
            # Create mapping from segment index to translation
            translations_by_index = {}
            for idx, text in matches:
                translations_by_index[int(idx)] = text.strip()
            # Map translations back to segments by index (1-based)
            for i, seg in enumerate(segments):
                seg_num = i + 1  # 1-based index
                if seg_num in translations_by_index:
                    seg.translated = translations_by_index[seg_num]
                else:
                    # No matching translation found - try fallback to old timestamp-based parsing
                    seg.translated = ""
            # Fallback: if no indexed matches, try old timestamp format
            if not matches:
                print("[Warning] No indexed format found, falling back to timestamp parsing")
                timeline_pattern = re.compile(r'^(\d+):(\d{2})\s+(.+)$', re.MULTILINE)
                timestamp_matches = timeline_pattern.findall(translated_text)
                # Create mapping from timestamp to translation
                translations_by_time = {}
                for mins, secs, text in timestamp_matches:
                    time_sec = int(mins) * 60 + int(secs)
                    translations_by_time[time_sec] = text.strip()
                # Track used translations to prevent duplicates
                # (two segments may share the same whole-second start time).
                used_translations = set()
                # Map translations back to segments by matching start times
                for seg in segments:
                    start_sec = int(seg.start)
                    matched_time = None
                    # Try exact match first
                    if start_sec in translations_by_time and start_sec not in used_translations:
                        matched_time = start_sec
                    else:
                        # Try to find closest UNUSED match within 1 second
                        for t in range(start_sec - 1, start_sec + 2):
                            if t in translations_by_time and t not in used_translations:
                                matched_time = t
                                break
                    if matched_time is not None:
                        seg.translated = translations_by_time[matched_time]
                        used_translations.add(matched_time)
                    else:
                        # Nothing usable for this segment; leave it empty.
                        seg.translated = ""
        else:
            # Original parsing for other modes: segments come back joined
            # by the '---' separator requested in the prompt.
            translated_parts = translated_text.split("---")
            for i, seg in enumerate(segments):
                if i < len(translated_parts):
                    seg.translated = translated_parts[i].strip()
                else:
                    seg.translated = seg.text  # Fallback to original
        # Calculate token usage for logging
        usage = response.usage
        token_info = f"(tokens: {usage.prompt_tokens}+{usage.completion_tokens}={usage.total_tokens})"
        # Post-processing: Shorten segments that exceed character limit.
        # Skip for REWRITE mode - the prompt handles length naturally.
        shortened_count = 0
        if mode != TranslationMode.REWRITE:
            chars_per_sec = 5
            for i, seg in enumerate(segments):
                if seg.translated:
                    duration = seg.end - seg.start
                    max_chars = int(duration * chars_per_sec)
                    current_len = len(seg.translated)
                    # Only shorten when 30%+ over budget and the budget is
                    # non-trivial (tiny segments aren't worth an extra call).
                    if current_len > max_chars * 1.3 and max_chars >= 5:
                        seg.translated = await shorten_text(client, seg.translated, max_chars)
                        shortened_count += 1
                        print(f"[Shorten] Seg {i+1}: {current_len}{len(seg.translated)}자 (제한:{max_chars}자)")
        shorten_info = f" [축약:{shortened_count}개]" if shortened_count > 0 else ""
        return True, f"Translation complete [{mode}] {token_info}{shorten_info}", segments
    except Exception as e:
        return False, f"Translation error: {str(e)}", segments
async def generate_shorts_script(
    segments: List[TranscriptSegment],
    style: str = "engaging",
    max_tokens: int = 500,
) -> Tuple[bool, str, Optional[str]]:
    """
    Generate a completely new Korean Shorts script from Chinese transcript.

    Unlike translate_segments, this discards segment timing and asks the
    model to write an original timestamped script from the combined text.

    Args:
        segments: Original transcript segments
        style: Script style (engaging, informative, funny, dramatic)
        max_tokens: Maximum output tokens

    Returns:
        Tuple of (success, message, script); script is None on failure
    """
    if not settings.OPENAI_API_KEY:
        return False, "OpenAI API key not configured", None
    try:
        client = get_openai_client()
        # Combine all text into a single passage for the prompt.
        full_text = " ".join([seg.text for seg in segments])
        # End time of the last segment approximates the video length.
        total_duration = segments[-1].end if segments else 0
        # Per-style writing guidance injected into the system prompt.
        style_guides = {
            "engaging": "Use hooks, questions, and emotional expressions. Start with attention-grabbing line.",
            "informative": "Focus on facts and clear explanations. Use simple, direct language.",
            "funny": "Add humor, wordplay, and light-hearted tone. Include relatable jokes.",
            "dramatic": "Build tension and suspense. Use impactful short sentences.",
        }
        # Unknown style names fall back to the "engaging" guide.
        style_guide = style_guides.get(style, style_guides["engaging"])
        system_prompt = f"""You are a viral Korean YouTube Shorts script writer.
Create a COMPLETELY ORIGINAL Korean script inspired by the Chinese video content.
=== CRITICAL: ANTI-PLAGIARISM RULES ===
- This is NOT translation - it's ORIGINAL CONTENT CREATION
- NEVER copy sentence structures, word order, or phrasing from original
- Extract only the CORE IDEA, then write YOUR OWN script from scratch
- Imagine you're a Korean creator who just learned this interesting fact
- Add your own personality, reactions, and Korean cultural context
=======================================
Video duration: ~{int(total_duration)} seconds
Style: {style}
Guide: {style_guide}
Output format:
[0:00] 첫 번째 대사
[0:03] 두 번째 대사
...
Requirements:
- Write in POLITE FORMAL KOREAN (존댓말/경어) - friendly but respectful
- Each line: 2-3 seconds when spoken aloud
- Start with a HOOK that grabs attention
- Use polite Korean expressions: "이거 아세요?", "정말 신기하죠", "근데 여기서 중요한 건요"
- End with engagement: question, call-to-action, or surprise
- Make it feel like ORIGINAL Korean content, not a translation"""
        response = client.chat.completions.create(
            model=settings.OPENAI_MODEL,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": f"Chinese transcript:\n{full_text}"}
            ],
            # Higher temperature than translation: this is creative writing.
            temperature=0.7,
            max_tokens=max_tokens,
        )
        script = response.choices[0].message.content
        usage = response.usage
        token_info = f"(tokens: {usage.total_tokens})"
        return True, f"Script generated [{style}] {token_info}", script
    except Exception as e:
        return False, f"Script generation error: {str(e)}", None
async def translate_single(
    text: str,
    target_language: str = "Korean",
    max_tokens: Optional[int] = None,
) -> Tuple[bool, str]:
    """
    Translate a single text via OpenAI.

    Best-effort: on any failure (missing API key, network/API error) the
    original text is returned so callers always have usable output.

    Args:
        text: Source text to translate.
        target_language: Target language name (default "Korean").
        max_tokens: Optional output-token cap for cost control.

    Returns:
        Tuple of (success, text) — translated text on success, the
        untouched input text on failure.
    """
    if not settings.OPENAI_API_KEY:
        return False, text
    try:
        client = get_openai_client()
        request_params = {
            "model": settings.OPENAI_MODEL,
            "messages": [
                {
                    "role": "system",
                    "content": f"Translate to {target_language}. Only output the translation, nothing else."
                },
                {
                    "role": "user",
                    "content": text
                }
            ],
            # Low temperature: we want a faithful translation, not creativity.
            "temperature": 0.3,
        }
        # Only cap output tokens when explicitly requested.
        if max_tokens:
            request_params["max_tokens"] = max_tokens
        response = client.chat.completions.create(**request_params)
        translated = response.choices[0].message.content
        return True, translated.strip()
    except Exception:
        # Deliberate best-effort fallback: swallow the error and hand back
        # the original text rather than propagating an exception.
        return False, text