Features: - Video download from TikTok/Douyin using yt-dlp - Audio transcription with OpenAI Whisper - GPT-4 translation (direct/summarize/rewrite modes) - Subtitle generation with ASS format - Video trimming with frame-accurate preview - BGM integration with volume control - Intro text overlay support - Thumbnail generation with text overlay Tech stack: - Backend: FastAPI, Python 3.11+ - Frontend: React, Vite, TailwindCSS - Video processing: FFmpeg - AI: OpenAI Whisper, GPT-4 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
469 lines
18 KiB
Python
469 lines
18 KiB
Python
import re
|
|
from typing import List, Tuple, Optional
|
|
from openai import OpenAI
|
|
from app.models.schemas import TranscriptSegment
|
|
from app.config import settings
|
|
|
|
|
|
def get_openai_client() -> OpenAI:
    """Return an OpenAI client authenticated with the configured API key."""
    api_key = settings.OPENAI_API_KEY
    return OpenAI(api_key=api_key)
|
|
|
|
|
|
class TranslationMode:
    """String constants naming the supported translation strategies."""

    # Direct translation that preserves the original sentence structure.
    DIRECT = "direct"
    # Condense to the core meaning, then translate.
    SUMMARIZE = "summarize"
    # Summarize and rewrite as a brand-new Korean script.
    REWRITE = "rewrite"
|
|
|
|
|
|
async def shorten_text(client: OpenAI, text: str, max_chars: int) -> str:
    """
    Shorten a Korean subtitle text to fit within a character limit.

    Asks the configured chat model to compress the text while keeping the
    core meaning; if the API call fails for any reason, falls back to a
    plain truncation. Never raises.

    Args:
        client: OpenAI client
        text: Text to shorten
        max_chars: Maximum character count

    Returns:
        Shortened text (best effort; at most ``max_chars`` characters)
    """
    try:
        response = client.chat.completions.create(
            model=settings.OPENAI_MODEL,
            messages=[
                {
                    "role": "system",
                    "content": f"""한국어 자막을 {max_chars}자 이내로 줄이세요.

규칙:
- 반드시 {max_chars}자 이하!
- 핵심 의미만 유지
- 자연스러운 한국어
- 존댓말 유지
- 출력은 줄인 문장만!

예시:
입력: "요리할 때마다 한 시간이 걸리셨죠?" (18자)
제한: 10자
출력: "시간 오래 걸리죠" (8자)

입력: "채소 다듬는 데만 30분 걸리셨죠" (16자)
제한: 10자
출력: "채소만 30분" (6자)"""
                },
                {
                    "role": "user",
                    "content": f"입력: \"{text}\" ({len(text)}자)\n제한: {max_chars}자\n출력:"
                }
            ],
            temperature=0.3,
            max_tokens=50,
        )

        shortened = response.choices[0].message.content.strip()

        # The model often wraps its answer in ASCII or curly quotes; strip
        # them everywhere in a single C-level pass instead of chained
        # strip()/replace() calls (the original repeated the same regex and
        # replace targets).
        shortened = shortened.translate(
            str.maketrans('', '', '"\'\u201c\u201d\u2018\u2019')
        )
        # Drop a trailing parenthetical char-count note like "(10자)" that
        # the model sometimes appends to mirror the prompt examples.
        shortened = re.sub(r'\s*\([^)]*자\)\s*$', '', shortened)
        shortened = shortened.strip()

        # Hard cap: truncate cleanly if the model still exceeded the limit.
        if len(shortened) > max_chars:
            shortened = shortened[:max_chars]

        return shortened

    except Exception:
        # Fallback: simple truncation with an ellipsis marker.
        if len(text) > max_chars:
            if max_chars < 1:
                # No room for any character (guards the old text[:-1] bug
                # when max_chars == 0).
                return text[:max_chars]
            return text[:max_chars - 1] + "…"
        return text
|
|
|
|
|
|
async def translate_segments(
    segments: List[TranscriptSegment],
    target_language: str = "Korean",
    mode: str = TranslationMode.DIRECT,
    max_tokens: Optional[int] = None,
) -> Tuple[bool, str, List[TranscriptSegment]]:
    """
    Translate transcript segments to target language using OpenAI.

    Mutates the given segments in place: each segment's ``translated``
    attribute is filled from the model reply. In REWRITE mode a segment
    whose translation could not be parsed is set to "" (empty string);
    in the other modes missing parts fall back to the original text.

    Args:
        segments: List of transcript segments (mutated in place)
        target_language: Target language for translation.
            NOTE(review): currently unused — all prompts hard-code
            Chinese→Korean; confirm before relying on this parameter.
        mode: Translation mode (direct, summarize, rewrite)
        max_tokens: Maximum output tokens (for cost control)

    Returns:
        Tuple of (success, message, translated_segments)
    """
    if not settings.OPENAI_API_KEY:
        return False, "OpenAI API key not configured", segments

    try:
        client = get_openai_client()

        # Batch translate for efficiency: send all segments in one request,
        # joined with '---' so the reply can be split back per segment.
        texts = [seg.text for seg in segments]
        combined_text = "\n---\n".join(texts)

        # Calculate video duration for context
        # NOTE(review): not referenced below; presumably kept for future
        # prompt context — confirm before removing.
        total_duration = segments[-1].end if segments else 0

        # Per-segment character budget shown to the model so subtitles can
        # be read within each segment's duration.
        segment_info = []
        for i, seg in enumerate(segments):
            duration = seg.end - seg.start
            max_chars = int(duration * 5)  # ~5 Korean chars per second (stricter for better sync)
            segment_info.append(f"[{i+1}] {duration:.1f}초 = 최대 {max_chars}자 (엄수!)")

        # Get custom prompt settings from config; defaults are a friendly
        # "YouTube Shorts subtitle writer" persona with polite speech.
        gpt_role = settings.GPT_ROLE or "친근한 유튜브 쇼츠 자막 작가"
        gpt_tone = settings.GPT_TONE or "존댓말"
        gpt_style = settings.GPT_STYLE or ""

        # Example sentence endings for each Korean politeness register,
        # used to demonstrate the requested tone to the model.
        tone_examples = {
            "존댓말": '~해요, ~이에요, ~하죠',
            "반말": '~해, ~야, ~지',
            "격식체": '~합니다, ~입니다',
        }
        tone_example = tone_examples.get(gpt_tone, tone_examples["존댓말"])

        # Additional style instruction, appended as an extra numbered rule.
        style_instruction = f"\n6. Style: {gpt_style}" if gpt_style else ""

        # Select prompt based on mode
        if mode == TranslationMode.REWRITE:
            # Build indexed timeline input with Chinese text.
            # Use segment numbers to handle duplicate timestamps.
            timeline_input = []
            for i, seg in enumerate(segments):
                mins = int(seg.start // 60)
                secs = int(seg.start % 60)
                timeline_input.append(f"[{i+1}] {mins}:{secs:02d} {seg.text}")

            # REWRITE prompt (Korean): instructs the model to re-create the
            # subtitles rather than translate literally, keeping the same
            # number of segments with a 1:1 [index] mapping to the input.
            system_prompt = f"""당신은 생활용품 유튜브 쇼츠 자막 작가입니다.

중국어 원문의 "의미"만 참고하여, 한국인이 직접 말하는 것처럼 자연스러운 자막을 작성하세요.

━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
🎯 핵심 원칙: 번역이 아니라 "재창작"
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

✅ 필수 규칙:
1. 한 문장 = 한 가지 정보 (두 개 이상 금지)
2. 중복 표현 절대 금지 ("편해요"가 이미 나왔으면 다시 안 씀)
3. {gpt_tone} 사용 ({tone_example})
4. 세그먼트 수 유지: 입력 {len(segments)}개 → 출력 {len(segments)}개
5. 중국어 한자 금지, 순수 한글만

❌ 금지 표현 (번역투):
- "~할 수 있어요" → "~돼요", "~됩니다"
- "매우/아주/정말" 남용 → 꼭 필요할 때만
- "그것은/이것은" → "이거", "이건"
- "~하는 것이" → 직접 표현으로
- "편리해요/편해요" 반복 → 한 번만, 이후 다른 표현
- "좋아요/좋고요" 반복 → 구체적 장점으로 대체

🎵 쇼츠 리듬감:
- 짧게 끊어서
- 한 호흡에 하나씩
- 시청자가 따라 읽을 수 있게

📝 좋은 예시:

원문: "이 작은 박스 디자인이 참 좋네요. 평소에 씨앗 먹을 때 간편하게 먹을 수 있어요."
❌ 나쁜 번역: "이 작은 박스 디자인이 참 좋네요. 평소에 씨앗 먹을 때 간편하게 먹을 수 있어요."
✅ 좋은 재창작: "이 작은 박스, 생각보다 정말 잘 만들었어요."

원문: "테이블에 두거나 손에 들고 사용하기에도 좋고요. 침대에 누워서나 사무실에서도 간식이나 과일 먹기 정말 편해요."
❌ 나쁜 번역: "테이블에 두거나 손에 들고 사용하기에도 좋고요. 침대에 누워서나 사무실에서도 간식이나 과일 먹기 정말 편해요."
✅ 좋은 재창작 (2개로 분리):
- "테이블 위에서도, 침대에서도, 사무실에서도 사용하기 좋고"
- "과일 씻고 물기 빼는 데도 활용 가능합니다."

원문: "가정에서 필수 아이템이에요. 정말 유용하죠. 꼭 하나씩 가져야 할 제품이에요."
❌ 나쁜 번역: 그대로 3문장
✅ 좋은 재창작: "집에 하나 있으면 은근히 자주 쓰게 됩니다."{style_instruction}

출력 형식:
[번호] 시간 자막 내용

⚠️ 입력과 동일한 세그먼트 수({len(segments)}개)를 출력하세요!
⚠️ 각 [번호]는 입력과 1:1 대응해야 합니다!"""

            # Use indexed timeline format for user content
            combined_text = "[중국어 원문]\n\n" + "\n".join(timeline_input)

        elif mode == TranslationMode.SUMMARIZE:
            # SUMMARIZE prompt: brief one-sentence subtitles within the
            # per-segment character budget, '---'-separated output.
            system_prompt = f"""You are: {gpt_role}

Task: Translate Chinese to SHORT Korean subtitles.

Length limits (자막 싱크!):
{chr(10).join(segment_info)}

Rules:
1. Use {gpt_tone} ({tone_example})
2. Summarize to core meaning - be BRIEF
3. Max one short sentence per segment
4. {len(segments)} segments separated by '---'{style_instruction}"""

        else:  # DIRECT mode
            # DIRECT prompt: straight translation, still constrained by the
            # per-segment character budget, '---'-separated output.
            system_prompt = f"""You are: {gpt_role}

Task: Translate Chinese to Korean subtitles.

Length limits (자막 싱크!):
{chr(10).join(segment_info)}

Rules:
1. Use {gpt_tone} ({tone_example})
2. Keep translations SHORT and readable
3. {len(segments)} segments separated by '---'{style_instruction}"""

        # Build API request. REWRITE uses a higher temperature because it is
        # creative re-writing rather than faithful translation.
        request_params = {
            "model": settings.OPENAI_MODEL,
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": combined_text}
            ],
            "temperature": 0.65 if mode == TranslationMode.REWRITE else 0.3,
        }

        # Add max_tokens if specified (for cost control)
        effective_max_tokens = max_tokens or settings.TRANSLATION_MAX_TOKENS
        if effective_max_tokens:
            # Use higher token limit for REWRITE mode (its indexed-timeline
            # output format is more verbose than '---'-separated text).
            if mode == TranslationMode.REWRITE:
                request_params["max_tokens"] = max(effective_max_tokens, 700)
            else:
                request_params["max_tokens"] = effective_max_tokens

        response = client.chat.completions.create(**request_params)

        translated_text = response.choices[0].message.content

        # Parse based on mode
        if mode == TranslationMode.REWRITE:
            # Parse indexed timeline format: "[1] 0:00 자막\n[2] 0:02 자막\n..."
            indexed_pattern = re.compile(r'^\[(\d+)\]\s*\d+:\d{2}\s+(.+)$', re.MULTILINE)
            matches = indexed_pattern.findall(translated_text)

            # Create mapping from segment index to translation
            translations_by_index = {}
            for idx, text in matches:
                translations_by_index[int(idx)] = text.strip()

            # Map translations back to segments by index (1-based)
            for i, seg in enumerate(segments):
                seg_num = i + 1  # 1-based index
                if seg_num in translations_by_index:
                    seg.translated = translations_by_index[seg_num]
                else:
                    # No matching translation found; left empty — the
                    # timestamp-based fallback below only runs when NO
                    # indexed matches were found at all.
                    seg.translated = ""

            # Fallback: if no indexed matches, try old timestamp format
            if not matches:
                print("[Warning] No indexed format found, falling back to timestamp parsing")
                timeline_pattern = re.compile(r'^(\d+):(\d{2})\s+(.+)$', re.MULTILINE)
                timestamp_matches = timeline_pattern.findall(translated_text)

                # Create mapping from timestamp (in seconds) to translation.
                # Note: duplicate timestamps overwrite earlier entries.
                translations_by_time = {}
                for mins, secs, text in timestamp_matches:
                    time_sec = int(mins) * 60 + int(secs)
                    translations_by_time[time_sec] = text.strip()

                # Track used translations to prevent two segments from
                # consuming the same timestamped line.
                used_translations = set()

                # Map translations back to segments by matching start times
                for seg in segments:
                    start_sec = int(seg.start)
                    matched_time = None

                    # Try exact match first
                    if start_sec in translations_by_time and start_sec not in used_translations:
                        matched_time = start_sec
                    else:
                        # Try to find closest UNUSED match within 1 second
                        for t in range(start_sec - 1, start_sec + 2):
                            if t in translations_by_time and t not in used_translations:
                                matched_time = t
                                break

                    if matched_time is not None:
                        seg.translated = translations_by_time[matched_time]
                        used_translations.add(matched_time)
                    else:
                        seg.translated = ""
        else:
            # Original parsing for other modes: the prompt asked for
            # '---'-separated segments in input order.
            translated_parts = translated_text.split("---")
            for i, seg in enumerate(segments):
                if i < len(translated_parts):
                    seg.translated = translated_parts[i].strip()
                else:
                    seg.translated = seg.text  # Fallback to original

        # Calculate token usage for logging
        usage = response.usage
        token_info = f"(tokens: {usage.prompt_tokens}+{usage.completion_tokens}={usage.total_tokens})"

        # Post-processing: Shorten segments that exceed character limit.
        # Skip for REWRITE mode - the prompt handles length naturally.
        shortened_count = 0
        if mode != TranslationMode.REWRITE:
            chars_per_sec = 5
            for i, seg in enumerate(segments):
                if seg.translated:
                    duration = seg.end - seg.start
                    max_chars = int(duration * chars_per_sec)
                    current_len = len(seg.translated)

                    # Only shorten when clearly over budget (30% slack) and
                    # the budget is big enough for a meaningful sentence.
                    if current_len > max_chars * 1.3 and max_chars >= 5:
                        seg.translated = await shorten_text(client, seg.translated, max_chars)
                        shortened_count += 1
                        print(f"[Shorten] Seg {i+1}: {current_len}→{len(seg.translated)}자 (제한:{max_chars}자)")

        shorten_info = f" [축약:{shortened_count}개]" if shortened_count > 0 else ""

        return True, f"Translation complete [{mode}] {token_info}{shorten_info}", segments

    except Exception as e:
        return False, f"Translation error: {str(e)}", segments
|
|
|
|
|
|
async def generate_shorts_script(
    segments: List[TranscriptSegment],
    style: str = "engaging",
    max_tokens: int = 500,
) -> Tuple[bool, str, Optional[str]]:
    """
    Generate a completely new Korean Shorts script from a Chinese transcript.

    The model is instructed to treat the transcript as inspiration only and
    write original Korean content in the requested style.

    Args:
        segments: Original transcript segments
        style: Script style (engaging, informative, funny, dramatic)
        max_tokens: Maximum output tokens

    Returns:
        Tuple of (success, message, script)
    """
    if not settings.OPENAI_API_KEY:
        return False, "OpenAI API key not configured", None

    try:
        client = get_openai_client()

        # Flatten the transcript into one string; duration comes from the
        # last segment's end time.
        joined_transcript = " ".join(seg.text for seg in segments)
        clip_seconds = segments[-1].end if segments else 0

        # Per-style writing guidance injected into the system prompt.
        guides = {
            "engaging": "Use hooks, questions, and emotional expressions. Start with attention-grabbing line.",
            "informative": "Focus on facts and clear explanations. Use simple, direct language.",
            "funny": "Add humor, wordplay, and light-hearted tone. Include relatable jokes.",
            "dramatic": "Build tension and suspense. Use impactful short sentences.",
        }
        guide = guides.get(style, guides["engaging"])

        prompt = f"""You are a viral Korean YouTube Shorts script writer.

Create a COMPLETELY ORIGINAL Korean script inspired by the Chinese video content.

=== CRITICAL: ANTI-PLAGIARISM RULES ===
- This is NOT translation - it's ORIGINAL CONTENT CREATION
- NEVER copy sentence structures, word order, or phrasing from original
- Extract only the CORE IDEA, then write YOUR OWN script from scratch
- Imagine you're a Korean creator who just learned this interesting fact
- Add your own personality, reactions, and Korean cultural context
=======================================

Video duration: ~{int(clip_seconds)} seconds
Style: {style}
Guide: {guide}

Output format:
[0:00] 첫 번째 대사
[0:03] 두 번째 대사
...

Requirements:
- Write in POLITE FORMAL KOREAN (존댓말/경어) - friendly but respectful
- Each line: 2-3 seconds when spoken aloud
- Start with a HOOK that grabs attention
- Use polite Korean expressions: "이거 아세요?", "정말 신기하죠", "근데 여기서 중요한 건요"
- End with engagement: question, call-to-action, or surprise
- Make it feel like ORIGINAL Korean content, not a translation"""

        chat_messages = [
            {"role": "system", "content": prompt},
            {"role": "user", "content": f"Chinese transcript:\n{joined_transcript}"}
        ]
        response = client.chat.completions.create(
            model=settings.OPENAI_MODEL,
            messages=chat_messages,
            temperature=0.7,
            max_tokens=max_tokens,
        )

        script = response.choices[0].message.content
        usage = response.usage
        token_info = f"(tokens: {usage.total_tokens})"

        return True, f"Script generated [{style}] {token_info}", script

    except Exception as e:
        return False, f"Script generation error: {str(e)}", None
|
|
|
|
|
|
async def translate_single(
    text: str,
    target_language: str = "Korean",
    max_tokens: Optional[int] = None,
) -> Tuple[bool, str]:
    """Translate a single text; on any failure return the original text."""
    if not settings.OPENAI_API_KEY:
        return False, text

    try:
        client = get_openai_client()

        # Single-shot translation request; the system message constrains the
        # reply to the bare translation.
        payload = {
            "model": settings.OPENAI_MODEL,
            "messages": [
                {
                    "role": "system",
                    "content": f"Translate to {target_language}. Only output the translation, nothing else."
                },
                {
                    "role": "user",
                    "content": text
                }
            ],
            "temperature": 0.3,
        }
        if max_tokens:
            payload["max_tokens"] = max_tokens

        response = client.chat.completions.create(**payload)
        return True, response.choices[0].message.content.strip()

    except Exception:
        return False, text
|