Features: - Video download from TikTok/Douyin using yt-dlp - Audio transcription with OpenAI Whisper - GPT-4 translation (direct/summarize/rewrite modes) - Subtitle generation with ASS format - Video trimming with frame-accurate preview - BGM integration with volume control - Intro text overlay support - Thumbnail generation with text overlay Tech stack: - Backend: FastAPI, Python 3.11+ - Frontend: React, Vite, TailwindCSS - Video processing: FFmpeg - AI: OpenAI Whisper, GPT-4 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
469 lines
18 KiB
Python
469 lines
18 KiB
Python
import re
|
|
from typing import List, Tuple, Optional
|
|
from openai import OpenAI
|
|
from app.models.schemas import TranscriptSegment
|
|
from app.config import settings
|
|
|
|
|
|
def get_openai_client() -> OpenAI:
    """Return an OpenAI client authenticated with the configured API key."""
    api_key = settings.OPENAI_API_KEY
    return OpenAI(api_key=api_key)
|
|
|
|
|
|
class TranslationMode:
    """String constants naming the supported translation strategies."""

    # Direct translation that preserves the original sentence structure.
    DIRECT = "direct"
    # Condense to the core meaning, then translate.
    SUMMARIZE = "summarize"
    # Summarize and rewrite as a brand-new Korean script.
    REWRITE = "rewrite"
|
|
|
|
|
|
async def shorten_text(client: OpenAI, text: str, max_chars: int) -> str:
    """
    Shorten a Korean subtitle text to fit within a character limit.

    Asks the configured chat model to compress the text while keeping the
    core meaning; if the API call fails for any reason, falls back to a
    plain truncation. Never raises.

    Args:
        client: OpenAI client
        text: Text to shorten
        max_chars: Maximum character count

    Returns:
        Shortened text (best effort; at most ``max_chars`` characters)
    """
    try:
        response = client.chat.completions.create(
            model=settings.OPENAI_MODEL,
            messages=[
                {
                    "role": "system",
                    "content": f"""한국어 자막을 {max_chars}자 이내로 줄이세요.

규칙:
- 반드시 {max_chars}자 이하!
- 핵심 의미만 유지
- 자연스러운 한국어
- 존댓말 유지
- 출력은 줄인 문장만!

예시:
입력: "요리할 때마다 한 시간이 걸리셨죠?" (18자)
제한: 10자
출력: "시간 오래 걸리죠" (8자)

입력: "채소 다듬는 데만 30분 걸리셨죠" (16자)
제한: 10자
출력: "채소만 30분" (6자)"""
                },
                {
                    "role": "user",
                    "content": f"입력: \"{text}\" ({len(text)}자)\n제한: {max_chars}자\n출력:"
                }
            ],
            temperature=0.3,
            max_tokens=50,
        )

        shortened = response.choices[0].message.content.strip()

        # The model often wraps its answer in ASCII or curly quotes; strip
        # them everywhere in a single C-level pass instead of chained
        # strip()/replace() calls (the original repeated the same regex and
        # replace targets).
        shortened = shortened.translate(
            str.maketrans('', '', '"\'\u201c\u201d\u2018\u2019')
        )
        # Drop a trailing parenthetical char-count note like "(10자)" that
        # the model sometimes appends to mirror the prompt examples.
        shortened = re.sub(r'\s*\([^)]*자\)\s*$', '', shortened)
        shortened = shortened.strip()

        # Hard cap: truncate cleanly if the model still exceeded the limit.
        if len(shortened) > max_chars:
            shortened = shortened[:max_chars]

        return shortened

    except Exception:
        # Fallback: simple truncation with an ellipsis marker.
        if len(text) > max_chars:
            if max_chars < 1:
                # No room for any character (guards the old text[:-1] bug
                # when max_chars == 0).
                return text[:max_chars]
            return text[:max_chars - 1] + "…"
        return text
|
|
|
|
|
|
async def translate_segments(
    segments: List[TranscriptSegment],
    target_language: str = "Korean",
    mode: str = TranslationMode.DIRECT,
    max_tokens: Optional[int] = None,
) -> Tuple[bool, str, List[TranscriptSegment]]:
    """
    Translate transcript segments to target language using OpenAI.

    Mutates the given segments in place: each segment's ``translated``
    attribute is filled from the model reply. In REWRITE mode a segment
    whose translation could not be parsed is set to "" (empty string);
    in the other modes missing parts fall back to the original text.

    Args:
        segments: List of transcript segments (mutated in place)
        target_language: Target language for translation.
            NOTE(review): currently unused — all prompts hard-code
            Chinese→Korean; confirm before relying on this parameter.
        mode: Translation mode (direct, summarize, rewrite)
        max_tokens: Maximum output tokens (for cost control)

    Returns:
        Tuple of (success, message, translated_segments)
    """
    if not settings.OPENAI_API_KEY:
        return False, "OpenAI API key not configured", segments

    try:
        client = get_openai_client()

        # Batch translate for efficiency: send all segments in one request,
        # joined with '---' so the reply can be split back per segment.
        texts = [seg.text for seg in segments]
        combined_text = "\n---\n".join(texts)

        # Calculate video duration for context
        # NOTE(review): not referenced below; presumably kept for future
        # prompt context — confirm before removing.
        total_duration = segments[-1].end if segments else 0

        # Per-segment character budget shown to the model so subtitles can
        # be read within each segment's duration.
        segment_info = []
        for i, seg in enumerate(segments):
            duration = seg.end - seg.start
            max_chars = int(duration * 5)  # ~5 Korean chars per second (stricter for better sync)
            segment_info.append(f"[{i+1}] {duration:.1f}초 = 최대 {max_chars}자 (엄수!)")

        # Get custom prompt settings from config; defaults are a friendly
        # "YouTube Shorts subtitle writer" persona with polite speech.
        gpt_role = settings.GPT_ROLE or "친근한 유튜브 쇼츠 자막 작가"
        gpt_tone = settings.GPT_TONE or "존댓말"
        gpt_style = settings.GPT_STYLE or ""

        # Example sentence endings for each Korean politeness register,
        # used to demonstrate the requested tone to the model.
        tone_examples = {
            "존댓말": '~해요, ~이에요, ~하죠',
            "반말": '~해, ~야, ~지',
            "격식체": '~합니다, ~입니다',
        }
        tone_example = tone_examples.get(gpt_tone, tone_examples["존댓말"])

        # Additional style instruction, appended as an extra numbered rule.
        style_instruction = f"\n6. Style: {gpt_style}" if gpt_style else ""

        # Select prompt based on mode
        if mode == TranslationMode.REWRITE:
            # Build indexed timeline input with Chinese text.
            # Use segment numbers to handle duplicate timestamps.
            timeline_input = []
            for i, seg in enumerate(segments):
                mins = int(seg.start // 60)
                secs = int(seg.start % 60)
                timeline_input.append(f"[{i+1}] {mins}:{secs:02d} {seg.text}")

            # REWRITE prompt (Korean): instructs the model to re-create the
            # subtitles rather than translate literally, keeping the same
            # number of segments with a 1:1 [index] mapping to the input.
            system_prompt = f"""당신은 생활용품 유튜브 쇼츠 자막 작가입니다.

중국어 원문의 "의미"만 참고하여, 한국인이 직접 말하는 것처럼 자연스러운 자막을 작성하세요.

━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
🎯 핵심 원칙: 번역이 아니라 "재창작"
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

✅ 필수 규칙:
1. 한 문장 = 한 가지 정보 (두 개 이상 금지)
2. 중복 표현 절대 금지 ("편해요"가 이미 나왔으면 다시 안 씀)
3. {gpt_tone} 사용 ({tone_example})
4. 세그먼트 수 유지: 입력 {len(segments)}개 → 출력 {len(segments)}개
5. 중국어 한자 금지, 순수 한글만

❌ 금지 표현 (번역투):
- "~할 수 있어요" → "~돼요", "~됩니다"
- "매우/아주/정말" 남용 → 꼭 필요할 때만
- "그것은/이것은" → "이거", "이건"
- "~하는 것이" → 직접 표현으로
- "편리해요/편해요" 반복 → 한 번만, 이후 다른 표현
- "좋아요/좋고요" 반복 → 구체적 장점으로 대체

🎵 쇼츠 리듬감:
- 짧게 끊어서
- 한 호흡에 하나씩
- 시청자가 따라 읽을 수 있게

📝 좋은 예시:

원문: "이 작은 박스 디자인이 참 좋네요. 평소에 씨앗 먹을 때 간편하게 먹을 수 있어요."
❌ 나쁜 번역: "이 작은 박스 디자인이 참 좋네요. 평소에 씨앗 먹을 때 간편하게 먹을 수 있어요."
✅ 좋은 재창작: "이 작은 박스, 생각보다 정말 잘 만들었어요."

원문: "테이블에 두거나 손에 들고 사용하기에도 좋고요. 침대에 누워서나 사무실에서도 간식이나 과일 먹기 정말 편해요."
❌ 나쁜 번역: "테이블에 두거나 손에 들고 사용하기에도 좋고요. 침대에 누워서나 사무실에서도 간식이나 과일 먹기 정말 편해요."
✅ 좋은 재창작 (2개로 분리):
- "테이블 위에서도, 침대에서도, 사무실에서도 사용하기 좋고"
- "과일 씻고 물기 빼는 데도 활용 가능합니다."

원문: "가정에서 필수 아이템이에요. 정말 유용하죠. 꼭 하나씩 가져야 할 제품이에요."
❌ 나쁜 번역: 그대로 3문장
✅ 좋은 재창작: "집에 하나 있으면 은근히 자주 쓰게 됩니다."{style_instruction}

출력 형식:
[번호] 시간 자막 내용

⚠️ 입력과 동일한 세그먼트 수({len(segments)}개)를 출력하세요!
⚠️ 각 [번호]는 입력과 1:1 대응해야 합니다!"""

            # Use indexed timeline format for user content
            combined_text = "[중국어 원문]\n\n" + "\n".join(timeline_input)

        elif mode == TranslationMode.SUMMARIZE:
            # SUMMARIZE prompt: brief one-sentence subtitles within the
            # per-segment character budget, '---'-separated output.
            system_prompt = f"""You are: {gpt_role}

Task: Translate Chinese to SHORT Korean subtitles.

Length limits (자막 싱크!):
{chr(10).join(segment_info)}

Rules:
1. Use {gpt_tone} ({tone_example})
2. Summarize to core meaning - be BRIEF
3. Max one short sentence per segment
4. {len(segments)} segments separated by '---'{style_instruction}"""

        else:  # DIRECT mode
            # DIRECT prompt: straight translation, still constrained by the
            # per-segment character budget, '---'-separated output.
            system_prompt = f"""You are: {gpt_role}

Task: Translate Chinese to Korean subtitles.

Length limits (자막 싱크!):
{chr(10).join(segment_info)}

Rules:
1. Use {gpt_tone} ({tone_example})
2. Keep translations SHORT and readable
3. {len(segments)} segments separated by '---'{style_instruction}"""

        # Build API request. REWRITE uses a higher temperature because it is
        # creative re-writing rather than faithful translation.
        request_params = {
            "model": settings.OPENAI_MODEL,
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": combined_text}
            ],
            "temperature": 0.65 if mode == TranslationMode.REWRITE else 0.3,
        }

        # Add max_tokens if specified (for cost control)
        effective_max_tokens = max_tokens or settings.TRANSLATION_MAX_TOKENS
        if effective_max_tokens:
            # Use higher token limit for REWRITE mode (its indexed-timeline
            # output format is more verbose than '---'-separated text).
            if mode == TranslationMode.REWRITE:
                request_params["max_tokens"] = max(effective_max_tokens, 700)
            else:
                request_params["max_tokens"] = effective_max_tokens

        response = client.chat.completions.create(**request_params)

        translated_text = response.choices[0].message.content

        # Parse based on mode
        if mode == TranslationMode.REWRITE:
            # Parse indexed timeline format: "[1] 0:00 자막\n[2] 0:02 자막\n..."
            indexed_pattern = re.compile(r'^\[(\d+)\]\s*\d+:\d{2}\s+(.+)$', re.MULTILINE)
            matches = indexed_pattern.findall(translated_text)

            # Create mapping from segment index to translation
            translations_by_index = {}
            for idx, text in matches:
                translations_by_index[int(idx)] = text.strip()

            # Map translations back to segments by index (1-based)
            for i, seg in enumerate(segments):
                seg_num = i + 1  # 1-based index
                if seg_num in translations_by_index:
                    seg.translated = translations_by_index[seg_num]
                else:
                    # No matching translation found; left empty — the
                    # timestamp-based fallback below only runs when NO
                    # indexed matches were found at all.
                    seg.translated = ""

            # Fallback: if no indexed matches, try old timestamp format
            if not matches:
                print("[Warning] No indexed format found, falling back to timestamp parsing")
                timeline_pattern = re.compile(r'^(\d+):(\d{2})\s+(.+)$', re.MULTILINE)
                timestamp_matches = timeline_pattern.findall(translated_text)

                # Create mapping from timestamp (in seconds) to translation.
                # Note: duplicate timestamps overwrite earlier entries.
                translations_by_time = {}
                for mins, secs, text in timestamp_matches:
                    time_sec = int(mins) * 60 + int(secs)
                    translations_by_time[time_sec] = text.strip()

                # Track used translations to prevent two segments from
                # consuming the same timestamped line.
                used_translations = set()

                # Map translations back to segments by matching start times
                for seg in segments:
                    start_sec = int(seg.start)
                    matched_time = None

                    # Try exact match first
                    if start_sec in translations_by_time and start_sec not in used_translations:
                        matched_time = start_sec
                    else:
                        # Try to find closest UNUSED match within 1 second
                        for t in range(start_sec - 1, start_sec + 2):
                            if t in translations_by_time and t not in used_translations:
                                matched_time = t
                                break

                    if matched_time is not None:
                        seg.translated = translations_by_time[matched_time]
                        used_translations.add(matched_time)
                    else:
                        seg.translated = ""
        else:
            # Original parsing for other modes: the prompt asked for
            # '---'-separated segments in input order.
            translated_parts = translated_text.split("---")
            for i, seg in enumerate(segments):
                if i < len(translated_parts):
                    seg.translated = translated_parts[i].strip()
                else:
                    seg.translated = seg.text  # Fallback to original

        # Calculate token usage for logging
        usage = response.usage
        token_info = f"(tokens: {usage.prompt_tokens}+{usage.completion_tokens}={usage.total_tokens})"

        # Post-processing: Shorten segments that exceed character limit.
        # Skip for REWRITE mode - the prompt handles length naturally.
        shortened_count = 0
        if mode != TranslationMode.REWRITE:
            chars_per_sec = 5
            for i, seg in enumerate(segments):
                if seg.translated:
                    duration = seg.end - seg.start
                    max_chars = int(duration * chars_per_sec)
                    current_len = len(seg.translated)

                    # Only shorten when clearly over budget (30% slack) and
                    # the budget is big enough for a meaningful sentence.
                    if current_len > max_chars * 1.3 and max_chars >= 5:
                        seg.translated = await shorten_text(client, seg.translated, max_chars)
                        shortened_count += 1
                        print(f"[Shorten] Seg {i+1}: {current_len}→{len(seg.translated)}자 (제한:{max_chars}자)")

        shorten_info = f" [축약:{shortened_count}개]" if shortened_count > 0 else ""

        return True, f"Translation complete [{mode}] {token_info}{shorten_info}", segments

    except Exception as e:
        return False, f"Translation error: {str(e)}", segments
|
|
|
|
|
|
async def generate_shorts_script(
    segments: List[TranscriptSegment],
    style: str = "engaging",
    max_tokens: int = 500,
) -> Tuple[bool, str, Optional[str]]:
    """
    Generate a completely new Korean Shorts script from a Chinese transcript.

    The model is instructed to treat the transcript as inspiration only and
    write original Korean content in the requested style.

    Args:
        segments: Original transcript segments
        style: Script style (engaging, informative, funny, dramatic)
        max_tokens: Maximum output tokens

    Returns:
        Tuple of (success, message, script)
    """
    if not settings.OPENAI_API_KEY:
        return False, "OpenAI API key not configured", None

    try:
        client = get_openai_client()

        # Flatten the transcript into one string; duration comes from the
        # last segment's end time.
        joined_transcript = " ".join(seg.text for seg in segments)
        clip_seconds = segments[-1].end if segments else 0

        # Per-style writing guidance injected into the system prompt.
        guides = {
            "engaging": "Use hooks, questions, and emotional expressions. Start with attention-grabbing line.",
            "informative": "Focus on facts and clear explanations. Use simple, direct language.",
            "funny": "Add humor, wordplay, and light-hearted tone. Include relatable jokes.",
            "dramatic": "Build tension and suspense. Use impactful short sentences.",
        }
        guide = guides.get(style, guides["engaging"])

        prompt = f"""You are a viral Korean YouTube Shorts script writer.

Create a COMPLETELY ORIGINAL Korean script inspired by the Chinese video content.

=== CRITICAL: ANTI-PLAGIARISM RULES ===
- This is NOT translation - it's ORIGINAL CONTENT CREATION
- NEVER copy sentence structures, word order, or phrasing from original
- Extract only the CORE IDEA, then write YOUR OWN script from scratch
- Imagine you're a Korean creator who just learned this interesting fact
- Add your own personality, reactions, and Korean cultural context
=======================================

Video duration: ~{int(clip_seconds)} seconds
Style: {style}
Guide: {guide}

Output format:
[0:00] 첫 번째 대사
[0:03] 두 번째 대사
...

Requirements:
- Write in POLITE FORMAL KOREAN (존댓말/경어) - friendly but respectful
- Each line: 2-3 seconds when spoken aloud
- Start with a HOOK that grabs attention
- Use polite Korean expressions: "이거 아세요?", "정말 신기하죠", "근데 여기서 중요한 건요"
- End with engagement: question, call-to-action, or surprise
- Make it feel like ORIGINAL Korean content, not a translation"""

        chat_messages = [
            {"role": "system", "content": prompt},
            {"role": "user", "content": f"Chinese transcript:\n{joined_transcript}"}
        ]
        response = client.chat.completions.create(
            model=settings.OPENAI_MODEL,
            messages=chat_messages,
            temperature=0.7,
            max_tokens=max_tokens,
        )

        script = response.choices[0].message.content
        usage = response.usage
        token_info = f"(tokens: {usage.total_tokens})"

        return True, f"Script generated [{style}] {token_info}", script

    except Exception as e:
        return False, f"Script generation error: {str(e)}", None
|
|
|
|
|
|
async def translate_single(
    text: str,
    target_language: str = "Korean",
    max_tokens: Optional[int] = None,
) -> Tuple[bool, str]:
    """Translate a single text; on any failure return the original text."""
    if not settings.OPENAI_API_KEY:
        return False, text

    try:
        client = get_openai_client()

        # Single-shot translation request; the system message constrains the
        # reply to the bare translation.
        payload = {
            "model": settings.OPENAI_MODEL,
            "messages": [
                {
                    "role": "system",
                    "content": f"Translate to {target_language}. Only output the translation, nothing else."
                },
                {
                    "role": "user",
                    "content": text
                }
            ],
            "temperature": 0.3,
        }
        if max_tokens:
            payload["max_tokens"] = max_tokens

        response = client.chat.completions.create(**payload)
        return True, response.choices[0].message.content.strip()

    except Exception:
        return False, text
|