Initial commit: YouTube Shorts maker application

Features:
- Video download from TikTok/Douyin using yt-dlp
- Audio transcription with OpenAI Whisper
- GPT-4 translation (direct/summarize/rewrite modes)
- Subtitle generation with ASS format
- Video trimming with frame-accurate preview
- BGM integration with volume control
- Intro text overlay support
- Thumbnail generation with text overlay

Tech stack:
- Backend: FastAPI, Python 3.11+
- Frontend: React, Vite, TailwindCSS
- Video processing: FFmpeg
- AI: OpenAI Whisper, GPT-4

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
kihong.kim
2026-01-03 21:38:34 +09:00
commit c3795138da
64 changed files with 13059 additions and 0 deletions

View File

@@ -0,0 +1,468 @@
import re
from typing import List, Tuple, Optional
from openai import OpenAI
from app.models.schemas import TranscriptSegment
from app.config import settings
def get_openai_client() -> OpenAI:
    """Build an OpenAI API client using the configured API key."""
    api_key = settings.OPENAI_API_KEY
    return OpenAI(api_key=api_key)
class TranslationMode:
    """Translation mode options (string constants used as the `mode` argument)."""
    DIRECT = "direct"  # Direct translation (keeps the original structure)
    SUMMARIZE = "summarize"  # Summarize, then translate
    REWRITE = "rewrite"  # Summarize + rewrite as a fresh Korean script
async def shorten_text(client: OpenAI, text: str, max_chars: int) -> str:
    """
    Shorten a Korean subtitle text to fit within a character limit.

    Asks the model for a condensed rewrite and cleans up its output; if
    the API call fails for any reason, falls back to plain truncation
    with a trailing ellipsis.

    Args:
        client: OpenAI client
        text: Text to shorten
        max_chars: Maximum character count

    Returns:
        Shortened text (hard-capped at max_chars characters on success)
    """
    try:
        response = client.chat.completions.create(
            model=settings.OPENAI_MODEL,
            messages=[
                {
                    "role": "system",
                    "content": f"""한국어 자막을 {max_chars}자 이내로 줄이세요.
규칙:
- 반드시 {max_chars}자 이하!
- 핵심 의미만 유지
- 자연스러운 한국어
- 존댓말 유지
- 출력은 줄인 문장만!
예시:
입력: "요리할 때마다 한 시간이 걸리셨죠?" (18자)
제한: 10자
출력: "시간 오래 걸리죠" (8자)
입력: "채소 다듬는 데만 30분 걸리셨죠" (16자)
제한: 10자
출력: "채소만 30분" (6자)"""
                },
                {
                    "role": "user",
                    "content": f"입력: \"{text}\" ({len(text)}자)\n제한: {max_chars}\n출력:"
                }
            ],
            temperature=0.3,
            max_tokens=50,
        )
        shortened = response.choices[0].message.content.strip()
        # Strip straight and curly quotes anywhere in the output (the model
        # often echoes the quoting style used in the prompt examples).
        shortened = re.sub(r'["\'\u201c\u201d\u2018\u2019]', '', shortened)
        # Drop a trailing length note such as "(10자)" that the model
        # sometimes appends, mimicking the few-shot examples.
        shortened = re.sub(r'\s*\([^)]*자\)\s*$', '', shortened)
        shortened = shortened.strip()
        # Safety net: the model may still exceed the limit — hard-truncate.
        if len(shortened) > max_chars:
            shortened = shortened[:max_chars]
        return shortened
    except Exception:
        # Fallback when the API is unavailable or errors out: simple
        # truncation with an ellipsis so the cut is visible to viewers.
        if len(text) > max_chars:
            return text[:max_chars - 1] + "…"
        return text
async def translate_segments(
    segments: List[TranscriptSegment],
    target_language: str = "Korean",
    mode: str = TranslationMode.DIRECT,
    max_tokens: Optional[int] = None,
) -> Tuple[bool, str, List[TranscriptSegment]]:
    """
    Translate transcript segments to target language using OpenAI.

    Sets each segment's ``translated`` attribute in place and returns the
    same list. In SUMMARIZE/DIRECT modes, segments whose translation is
    more than ~30% over the per-segment character budget are shortened
    with a second API call per segment.

    Args:
        segments: List of transcript segments
        target_language: Target language for translation
        mode: Translation mode (direct, summarize, rewrite)
        max_tokens: Maximum output tokens (for cost control)

    Returns:
        Tuple of (success, message, translated_segments)
    """
    if not settings.OPENAI_API_KEY:
        return False, "OpenAI API key not configured", segments
    try:
        client = get_openai_client()
        # Batch translate for efficiency: one API call for all segments.
        # NOTE(review): target_language is accepted but the prompts below
        # hard-code Korean output — confirm whether it should be wired in.
        texts = [seg.text for seg in segments]
        combined_text = "\n---\n".join(texts)
        # Calculate video duration for context
        # NOTE(review): total_duration is not referenced after this point.
        total_duration = segments[-1].end if segments else 0
        # Calculate segment info for length guidance (per-segment char budget
        # shown to the model so subtitles stay in sync with speech).
        segment_info = []
        for i, seg in enumerate(segments):
            duration = seg.end - seg.start
            max_chars = int(duration * 5)  # ~5 Korean chars per second (stricter for better sync)
            segment_info.append(f"[{i+1}] {duration:.1f}초 = 최대 {max_chars}자 (엄수!)")
        # Get custom prompt settings from config (fallbacks are sensible
        # Korean defaults: friendly Shorts subtitle writer, polite tone).
        gpt_role = settings.GPT_ROLE or "친근한 유튜브 쇼츠 자막 작가"
        gpt_tone = settings.GPT_TONE or "존댓말"
        gpt_style = settings.GPT_STYLE or ""
        # Tone examples: sentence-ending samples per register.
        tone_examples = {
            "존댓말": '~해요, ~이에요, ~하죠',
            "반말": '~해, ~야, ~지',
            "격식체": '~합니다, ~입니다',
        }
        tone_example = tone_examples.get(gpt_tone, tone_examples["존댓말"])
        # Additional style instruction appended to the prompt rules.
        style_instruction = f"\n6. Style: {gpt_style}" if gpt_style else ""
        # Select prompt based on mode
        if mode == TranslationMode.REWRITE:
            # Build indexed timeline input with Chinese text.
            # Use segment numbers to handle duplicate timestamps.
            timeline_input = []
            for i, seg in enumerate(segments):
                mins = int(seg.start // 60)
                secs = int(seg.start % 60)
                timeline_input.append(f"[{i+1}] {mins}:{secs:02d} {seg.text}")
            system_prompt = f"""당신은 생활용품 유튜브 쇼츠 자막 작가입니다.
중국어 원문의 "의미"만 참고하여, 한국인이 직접 말하는 것처럼 자연스러운 자막을 작성하세요.
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
🎯 핵심 원칙: 번역이 아니라 "재창작"
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
✅ 필수 규칙:
1. 한 문장 = 한 가지 정보 (두 개 이상 금지)
2. 중복 표현 절대 금지 ("편해요"가 이미 나왔으면 다시 안 씀)
3. {gpt_tone} 사용 ({tone_example})
4. 세그먼트 수 유지: 입력 {len(segments)}개 → 출력 {len(segments)}개
5. 중국어 한자 금지, 순수 한글만
❌ 금지 표현 (번역투):
- "~할 수 있어요""~돼요", "~됩니다"
- "매우/아주/정말" 남용 → 꼭 필요할 때만
- "그것은/이것은""이거", "이건"
- "~하는 것이" → 직접 표현으로
- "편리해요/편해요" 반복 → 한 번만, 이후 다른 표현
- "좋아요/좋고요" 반복 → 구체적 장점으로 대체
🎵 쇼츠 리듬감:
- 짧게 끊어서
- 한 호흡에 하나씩
- 시청자가 따라 읽을 수 있게
📝 좋은 예시:
원문: "이 작은 박스 디자인이 참 좋네요. 평소에 씨앗 먹을 때 간편하게 먹을 수 있어요."
❌ 나쁜 번역: "이 작은 박스 디자인이 참 좋네요. 평소에 씨앗 먹을 때 간편하게 먹을 수 있어요."
✅ 좋은 재창작: "이 작은 박스, 생각보다 정말 잘 만들었어요."
원문: "테이블에 두거나 손에 들고 사용하기에도 좋고요. 침대에 누워서나 사무실에서도 간식이나 과일 먹기 정말 편해요."
❌ 나쁜 번역: "테이블에 두거나 손에 들고 사용하기에도 좋고요. 침대에 누워서나 사무실에서도 간식이나 과일 먹기 정말 편해요."
✅ 좋은 재창작 (2개로 분리):
- "테이블 위에서도, 침대에서도, 사무실에서도 사용하기 좋고"
- "과일 씻고 물기 빼는 데도 활용 가능합니다."
원문: "가정에서 필수 아이템이에요. 정말 유용하죠. 꼭 하나씩 가져야 할 제품이에요."
❌ 나쁜 번역: 그대로 3문장
✅ 좋은 재창작: "집에 하나 있으면 은근히 자주 쓰게 됩니다."{style_instruction}
출력 형식:
[번호] 시간 자막 내용
⚠️ 입력과 동일한 세그먼트 수({len(segments)}개)를 출력하세요!
⚠️ 각 [번호]는 입력과 1:1 대응해야 합니다!"""
            # Use indexed timeline format for user content (replaces the
            # '---'-joined text built above).
            combined_text = "[중국어 원문]\n\n" + "\n".join(timeline_input)
        elif mode == TranslationMode.SUMMARIZE:
            system_prompt = f"""You are: {gpt_role}
Task: Translate Chinese to SHORT Korean subtitles.
Length limits (자막 싱크!):
{chr(10).join(segment_info)}
Rules:
1. Use {gpt_tone} ({tone_example})
2. Summarize to core meaning - be BRIEF
3. Max one short sentence per segment
4. {len(segments)} segments separated by '---'{style_instruction}"""
        else:  # DIRECT mode
            system_prompt = f"""You are: {gpt_role}
Task: Translate Chinese to Korean subtitles.
Length limits (자막 싱크!):
{chr(10).join(segment_info)}
Rules:
1. Use {gpt_tone} ({tone_example})
2. Keep translations SHORT and readable
3. {len(segments)} segments separated by '---'{style_instruction}"""
        # Build API request
        request_params = {
            "model": settings.OPENAI_MODEL,
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": combined_text}
            ],
            # REWRITE is creative re-authoring, so it gets a higher temperature.
            "temperature": 0.65 if mode == TranslationMode.REWRITE else 0.3,
        }
        # Add max_tokens if specified (for cost control)
        effective_max_tokens = max_tokens or settings.TRANSLATION_MAX_TOKENS
        if effective_max_tokens:
            # Use higher token limit for REWRITE mode
            if mode == TranslationMode.REWRITE:
                request_params["max_tokens"] = max(effective_max_tokens, 700)
            else:
                request_params["max_tokens"] = effective_max_tokens
        response = client.chat.completions.create(**request_params)
        translated_text = response.choices[0].message.content
        # Parse based on mode
        if mode == TranslationMode.REWRITE:
            # Parse indexed timeline format: "[1] 0:00 자막\n[2] 0:02 자막\n..."
            indexed_pattern = re.compile(r'^\[(\d+)\]\s*\d+:\d{2}\s+(.+)$', re.MULTILINE)
            matches = indexed_pattern.findall(translated_text)
            # Create mapping from segment index to translation
            translations_by_index = {}
            for idx, text in matches:
                translations_by_index[int(idx)] = text.strip()
            # Map translations back to segments by index (1-based)
            for i, seg in enumerate(segments):
                seg_num = i + 1  # 1-based index
                if seg_num in translations_by_index:
                    seg.translated = translations_by_index[seg_num]
                else:
                    # No matching translation found - try fallback to old timestamp-based parsing
                    seg.translated = ""
            # Fallback: if no indexed matches, try old timestamp format
            if not matches:
                print("[Warning] No indexed format found, falling back to timestamp parsing")
                timeline_pattern = re.compile(r'^(\d+):(\d{2})\s+(.+)$', re.MULTILINE)
                timestamp_matches = timeline_pattern.findall(translated_text)
                # Create mapping from timestamp to translation
                translations_by_time = {}
                for mins, secs, text in timestamp_matches:
                    time_sec = int(mins) * 60 + int(secs)
                    translations_by_time[time_sec] = text.strip()
                # Track used translations to prevent duplicates
                # (two segments may share the same whole-second start time).
                used_translations = set()
                # Map translations back to segments by matching start times
                for seg in segments:
                    start_sec = int(seg.start)
                    matched_time = None
                    # Try exact match first
                    if start_sec in translations_by_time and start_sec not in used_translations:
                        matched_time = start_sec
                    else:
                        # Try to find closest UNUSED match within 1 second
                        for t in range(start_sec - 1, start_sec + 2):
                            if t in translations_by_time and t not in used_translations:
                                matched_time = t
                                break
                    if matched_time is not None:
                        seg.translated = translations_by_time[matched_time]
                        used_translations.add(matched_time)
                    else:
                        # Nothing usable for this segment; leave it empty.
                        seg.translated = ""
        else:
            # Original parsing for other modes: segments come back joined
            # by the '---' separator requested in the prompt.
            translated_parts = translated_text.split("---")
            for i, seg in enumerate(segments):
                if i < len(translated_parts):
                    seg.translated = translated_parts[i].strip()
                else:
                    seg.translated = seg.text  # Fallback to original
        # Calculate token usage for logging
        usage = response.usage
        token_info = f"(tokens: {usage.prompt_tokens}+{usage.completion_tokens}={usage.total_tokens})"
        # Post-processing: Shorten segments that exceed character limit.
        # Skip for REWRITE mode - the prompt handles length naturally.
        shortened_count = 0
        if mode != TranslationMode.REWRITE:
            chars_per_sec = 5
            for i, seg in enumerate(segments):
                if seg.translated:
                    duration = seg.end - seg.start
                    max_chars = int(duration * chars_per_sec)
                    current_len = len(seg.translated)
                    # Only shorten when 30%+ over budget and the budget is
                    # non-trivial (tiny segments aren't worth an extra call).
                    if current_len > max_chars * 1.3 and max_chars >= 5:
                        seg.translated = await shorten_text(client, seg.translated, max_chars)
                        shortened_count += 1
                        print(f"[Shorten] Seg {i+1}: {current_len}{len(seg.translated)}자 (제한:{max_chars}자)")
        shorten_info = f" [축약:{shortened_count}개]" if shortened_count > 0 else ""
        return True, f"Translation complete [{mode}] {token_info}{shorten_info}", segments
    except Exception as e:
        return False, f"Translation error: {str(e)}", segments
async def generate_shorts_script(
    segments: List[TranscriptSegment],
    style: str = "engaging",
    max_tokens: int = 500,
) -> Tuple[bool, str, Optional[str]]:
    """
    Generate a completely new Korean Shorts script from Chinese transcript.

    Unlike translate_segments, this discards segment timing and asks the
    model to write an original timestamped script from the combined text.

    Args:
        segments: Original transcript segments
        style: Script style (engaging, informative, funny, dramatic)
        max_tokens: Maximum output tokens

    Returns:
        Tuple of (success, message, script); script is None on failure
    """
    if not settings.OPENAI_API_KEY:
        return False, "OpenAI API key not configured", None
    try:
        client = get_openai_client()
        # Combine all text into a single passage for the prompt.
        full_text = " ".join([seg.text for seg in segments])
        # End time of the last segment approximates the video length.
        total_duration = segments[-1].end if segments else 0
        # Per-style writing guidance injected into the system prompt.
        style_guides = {
            "engaging": "Use hooks, questions, and emotional expressions. Start with attention-grabbing line.",
            "informative": "Focus on facts and clear explanations. Use simple, direct language.",
            "funny": "Add humor, wordplay, and light-hearted tone. Include relatable jokes.",
            "dramatic": "Build tension and suspense. Use impactful short sentences.",
        }
        # Unknown style names fall back to the "engaging" guide.
        style_guide = style_guides.get(style, style_guides["engaging"])
        system_prompt = f"""You are a viral Korean YouTube Shorts script writer.
Create a COMPLETELY ORIGINAL Korean script inspired by the Chinese video content.
=== CRITICAL: ANTI-PLAGIARISM RULES ===
- This is NOT translation - it's ORIGINAL CONTENT CREATION
- NEVER copy sentence structures, word order, or phrasing from original
- Extract only the CORE IDEA, then write YOUR OWN script from scratch
- Imagine you're a Korean creator who just learned this interesting fact
- Add your own personality, reactions, and Korean cultural context
=======================================
Video duration: ~{int(total_duration)} seconds
Style: {style}
Guide: {style_guide}
Output format:
[0:00] 첫 번째 대사
[0:03] 두 번째 대사
...
Requirements:
- Write in POLITE FORMAL KOREAN (존댓말/경어) - friendly but respectful
- Each line: 2-3 seconds when spoken aloud
- Start with a HOOK that grabs attention
- Use polite Korean expressions: "이거 아세요?", "정말 신기하죠", "근데 여기서 중요한 건요"
- End with engagement: question, call-to-action, or surprise
- Make it feel like ORIGINAL Korean content, not a translation"""
        response = client.chat.completions.create(
            model=settings.OPENAI_MODEL,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": f"Chinese transcript:\n{full_text}"}
            ],
            # Higher temperature than translation: this is creative writing.
            temperature=0.7,
            max_tokens=max_tokens,
        )
        script = response.choices[0].message.content
        usage = response.usage
        token_info = f"(tokens: {usage.total_tokens})"
        return True, f"Script generated [{style}] {token_info}", script
    except Exception as e:
        return False, f"Script generation error: {str(e)}", None
async def translate_single(
    text: str,
    target_language: str = "Korean",
    max_tokens: Optional[int] = None,
) -> Tuple[bool, str]:
    """
    Translate a single text via OpenAI.

    Best-effort: on any failure (missing API key, network/API error) the
    original text is returned so callers always have usable output.

    Args:
        text: Source text to translate.
        target_language: Target language name (default "Korean").
        max_tokens: Optional output-token cap for cost control.

    Returns:
        Tuple of (success, text) — translated text on success, the
        untouched input text on failure.
    """
    if not settings.OPENAI_API_KEY:
        return False, text
    try:
        client = get_openai_client()
        request_params = {
            "model": settings.OPENAI_MODEL,
            "messages": [
                {
                    "role": "system",
                    "content": f"Translate to {target_language}. Only output the translation, nothing else."
                },
                {
                    "role": "user",
                    "content": text
                }
            ],
            # Low temperature: we want a faithful translation, not creativity.
            "temperature": 0.3,
        }
        # Only cap output tokens when explicitly requested.
        if max_tokens:
            request_params["max_tokens"] = max_tokens
        response = client.chat.completions.create(**request_params)
        translated = response.choices[0].message.content
        return True, translated.strip()
    except Exception:
        # Deliberate best-effort fallback: swallow the error and hand back
        # the original text rather than propagating an exception.
        return False, text