Skip to main content
Open In Colab

Translation Integration

Translate transcription segments from source to target language using argostranslate (offline, OpenNMT-based). Key limitation: argostranslate has no duration budget — the translation length is unconstrained. Romance languages (Spanish, French, Italian) typically produce longer text than the English source, which creates timing challenges for the downstream TTS stage.

Setup

import sys
from pathlib import Path

# Make the project package importable when this notebook runs from
# <repo>/notebooks/<name>/ — two levels below the repo root.
PROJECT_ROOT = Path.cwd().parent.parent
_root = str(PROJECT_ROOT)
if _root not in sys.path:
    sys.path.insert(0, _root)

# Per-notebook output directory for saved figures.
IMAGES_DIR = Path.cwd() / "images"
IMAGES_DIR.mkdir(exist_ok=True)

# Load .env (LOGFIRE_TOKEN, HF_TOKEN, etc.)
from dotenv import load_dotenv

load_dotenv(PROJECT_ROOT / ".env")

from foreign_whispers import FWClient

# SDK client pointed at the locally running API server.
API_URL = "http://localhost:8080"
fw = FWClient(API_URL)

print(f"Project root: {PROJECT_ROOT}")
print(f"Images dir:   {IMAGES_DIR}")
print(f"SDK client ready: FWClient('{API_URL}')")

# Optional: Logfire tracing (no-op shim if unavailable)
try:
    import logfire
    logfire.configure(service_name="foreign-whispers-translation")
    print("Logfire tracing enabled.")
except Exception:
    # Fall back to a do-nothing stand-in that exposes the only two pieces
    # of the logfire API this notebook uses: .span(...) and .info(...).
    class _NoopSpan:
        # Context manager that does nothing on enter/exit.
        def __enter__(self):
            return self

        def __exit__(self, *exc_info):
            pass

    class _noop:
        @staticmethod
        def span(name, **kw):
            return _NoopSpan()

        @staticmethod
        def info(*a, **kw):
            pass

    logfire = _noop()
    print("Logfire not configured — using no-op shim.")

Run Translation

# Get video list and select the first video
videos = fw.videos()
first_video = videos[0]
video_id = first_video["id"]
print(f"Video: {first_video['title']} ({video_id})")

# Run translation (source EN -> target ES), traced as a single span
with logfire.span("translate", video_id=video_id):
    result = fw.translate(video_id)

print(f"Target language: {result['target_language']}")
print(f"Segment count:   {len(result['segments'])}")

# Show first 3 segments: EN original alongside ES translation
import json

# Load the source (English) transcription for comparison
transcription_dir = PROJECT_ROOT / "pipeline_data" / "api" / "transcriptions" / "whisper"
title = first_video["title"]
en_path = transcription_dir / f"{title}.json"
en_data = json.loads(en_path.read_text())
en_segs = en_data["segments"]
es_segs = result["segments"]

print("\nFirst 3 segments (EN -> ES):")
print("-" * 80)
side_by_side = list(zip(en_segs, es_segs))[:3]
for idx, (en_seg, es_seg) in enumerate(side_by_side):
    print(f"Segment {idx}:")
    print(f"  EN: {en_seg['text'].strip()}")
    print(f"  ES: {es_seg['text'].strip()}")
    print()

Analyze Translation Length

Compare character counts between source and target segments. Romance languages tend to produce longer text than English, which matters for TTS duration budgets downstream.
import matplotlib.pyplot as plt
import numpy as np

# Load source and target transcripts
transcription_dir = PROJECT_ROOT / "pipeline_data" / "api" / "transcriptions" / "whisper"
translation_dir = PROJECT_ROOT / "pipeline_data" / "api" / "translations" / "argos"

title = videos[0]["title"]
en_data = json.loads((transcription_dir / f"{title}.json").read_text())
es_data = json.loads((translation_dir / f"{title}.json").read_text())

en_segs = en_data["segments"]
es_segs = es_data["segments"]

# Per-segment expansion factor; max(..., 1) guards against an empty source text.
ratios = [
    len(tgt["text"]) / max(len(src["text"]), 1)
    for src, tgt in zip(en_segs, es_segs)
]
mean_ratio = np.mean(ratios)

fig, ax = plt.subplots(figsize=(10, 4))
ax.hist(ratios, bins=20, color="coral", edgecolor="white")
ax.axvline(mean_ratio, color="red", linestyle="--", label=f"Mean: {mean_ratio:.2f}")
ax.set_xlabel("Character ratio (target / source)")
ax.set_ylabel("Count")
ax.set_title("Translation Length Expansion")
ax.legend()
fig.tight_layout()
fig.savefig(str(IMAGES_DIR / "translation_length_ratio.png"), dpi=150)
plt.show()

print(f"Mean ratio:   {mean_ratio:.2f}")
print(f"Median ratio: {np.median(ratios):.2f}")
print(f"Max ratio:    {np.max(ratios):.2f}")

Duration-Aware Re-ranking (Student Assignment)

This corresponds to P8 in the full pipeline notebook. The function get_shorter_translations() in foreign_whispers/reranking.py is a stub that currently returns an empty list. Students implement it to produce shorter target-language translations that fit within a TTS duration budget. See the docstring in reranking.py for the full specification, including:
  • Input/output contract
  • Duration heuristic (~15 chars/second for Romance languages)
  • Suggested implementation approaches (rule-based, multi-backend, LLM, hybrid)
from foreign_whispers import get_shorter_translations, TranslationCandidate

# Demo the stub: pick a segment and request shorter translations
source_text = en_segs[0]["text"].strip()
baseline_es = es_segs[0]["text"].strip()
target_duration_s = 3.0
char_budget = int(target_duration_s * 15)  # ~15 chars/s heuristic for Romance languages

print(f"Source (EN):    {source_text}")
print(f"Baseline (ES):  {baseline_es}")
print(f"Duration budget: {target_duration_s}s (~{char_budget} chars at 15 chars/s)")
print()

candidates = get_shorter_translations(
    source_text=source_text,
    baseline_es=baseline_es,
    target_duration_s=target_duration_s,
)

print(f"Candidates returned: {len(candidates)}")
if not candidates:
    print("  (empty list -- stub not yet implemented)")
else:
    for cand in candidates:
        print(f"  [{cand.char_count} chars] {cand.text}  -- {cand.brevity_rationale}")

Summary

  • Translation outputs are stored in pipeline_data/api/translations/argos/
  • The segment format is preserved from the transcription stage, with the text field replaced by the translated text
  • argostranslate produces unconstrained translations — no duration budget is enforced
  • Romance languages (e.g., Spanish) typically expand text length by ~10-30% vs. English
  • The get_shorter_translations() stub in foreign_whispers/reranking.py is the extension point for duration-aware re-ranking