feat: add transcription model selection and improve caption chunking for CJK text

moonrailgun · moonrailgun · commit 895f249d3b49 · 2026-03-01T02:20:32.000+08:00
diff --git a/apps/web/package.json b/apps/web/package.json
@@ -16,7 +16,8 @@
     "db:push:prod": "cross-env NODE_ENV=production drizzle-kit push",
     "translation:extract": "i18next-toolkit extract",
     "translation:scan": "i18next-toolkit scan",
-    "translation:translate": "i18next-toolkit translate"
+    "translation:translate": "i18next-toolkit translate",
+    "test": "bun test"
   },
   "dependencies": {
     "@cutia/env": "workspace:*",
diff --git a/apps/web/src/components/editor/panels/assets/views/captions.tsx b/apps/web/src/components/editor/panels/assets/views/captions.tsx
@@ -11,14 +11,19 @@ import {
 import { useState, useRef } from "react";
 import { extractTimelineAudio } from "@/lib/media/mediabunny";
 import { useEditor } from "@/hooks/use-editor";
-import { TRANSCRIPTION_LANGUAGES } from "@/constants/transcription-constants";
+import {
+	TRANSCRIPTION_LANGUAGES,
+	TRANSCRIPTION_MODELS,
+	DEFAULT_TRANSCRIPTION_MODEL,
+} from "@/constants/transcription-constants";
 import {
 	SUBTITLE_TEMPLATES,
 	createSubtitleFromTemplate,
 	type SubtitleTemplate,
 } from "@/constants/subtitle-constants";
 import type {
 	TranscriptionLanguage,
+	TranscriptionModelId,
 	TranscriptionProgress,
 } from "@/types/transcription";
 import { transcriptionService } from "@/services/transcription/service";
@@ -32,6 +37,8 @@ export function Captions() {
 	const { t } = useTranslation();
 	const [selectedLanguage, setSelectedLanguage] =
 		useState<TranscriptionLanguage>("auto");
+	const [selectedModelId, setSelectedModelId] =
+		useState<TranscriptionModelId>(DEFAULT_TRANSCRIPTION_MODEL);
 	const [selectedTemplate, setSelectedTemplate] = useState<SubtitleTemplate>(
 		SUBTITLE_TEMPLATES[0],
 	);
@@ -82,6 +89,7 @@ export function Captions() {
 			const result = await transcriptionService.transcribe({
 				audioData: samples,
 				language: selectedLanguage,
+				modelId: selectedModelId,
 				onProgress: handleProgress,
 			});
 
@@ -155,6 +163,32 @@ export function Captions() {
 			className="flex h-full flex-col justify-between"
 		>
 			<div className="flex flex-col gap-5">
+				<div className="flex flex-col gap-3">
+					<Label>{t("Model")}</Label>
+					<Select
+						value={selectedModelId}
+						onValueChange={(value) =>
+							setSelectedModelId(value as TranscriptionModelId)
+						}
+						disabled={isProcessing}
+					>
+						<SelectTrigger>
+							<SelectValue placeholder={t("Select a model")} />
+						</SelectTrigger>
+						<SelectContent>
+							{TRANSCRIPTION_MODELS.map((model) => (
+								<SelectItem key={model.id} value={model.id}>
+									{model.name}
+								</SelectItem>
+							))}
+						</SelectContent>
+					</Select>
+					<p className="text-muted-foreground text-xs">
+						{TRANSCRIPTION_MODELS.find((m) => m.id === selectedModelId)
+							?.description ?? ""}
+					</p>
+				</div>
+
 				<div className="flex flex-col gap-3">
 					<Label>{t("Language")}</Label>
 					<Select
diff --git a/apps/web/src/lib/transcription/__tests__/caption.test.ts b/apps/web/src/lib/transcription/__tests__/caption.test.ts
@@ -0,0 +1,269 @@
+import { describe, it, expect } from "bun:test";
+import { buildCaptionChunks } from "../caption";
+import type { TranscriptionSegment } from "@/types/transcription";
+
+function makeSegment({ // Test helper: builds a TranscriptionSegment fixture from its three fields.
+	text,
+	start,
+	end,
+}: {
+	text: string;
+	start: number; // presumably a timestamp in seconds — TODO confirm against TranscriptionSegment
+	end: number;
+}): TranscriptionSegment {
+	return { text, start, end }; // fields are passed through unchanged
+}
+
+describe("buildCaptionChunks", () => {
+	describe("empty / edge cases", () => { // Inputs expected to yield zero chunks or exactly one chunk.
+		it("returns empty array for no segments", () => {
+			const result = buildCaptionChunks({ segments: [] });
+			expect(result).toEqual([]);
+		});
+
+		it("skips segments with empty text", () => {
+			const result = buildCaptionChunks({
+				segments: [makeSegment({ text: "   ", start: 0, end: 5 })], // whitespace-only text
+			});
+			expect(result).toEqual([]); // a whitespace-only segment must produce no chunk at all
+		});
+
+		it("handles a single word segment", () => {
+			const result = buildCaptionChunks({
+				segments: [makeSegment({ text: "Hello", start: 0, end: 2 })],
+			});
+			expect(result).toHaveLength(1); // fewer words than one chunk's worth still yields a chunk
+			expect(result[0].text).toBe("Hello");
+		});
+	});
+
+	describe("English text (space-delimited)", () => { // Word-based chunking of space-separated text.
+		it("splits into chunks of wordsPerChunk (default 3)", () => {
+			const result = buildCaptionChunks({
+				segments: [
+					makeSegment({
+						text: "one two three four five six", // 6 words, no wordsPerChunk passed
+						start: 0,
+						end: 6,
+					}),
+				],
+			});
+
+			expect(result).toHaveLength(2); // 6 words / 3 per chunk
+			expect(result[0].text).toBe("one two three");
+			expect(result[1].text).toBe("four five six");
+		});
+
+		it("respects custom wordsPerChunk", () => {
+			const result = buildCaptionChunks({
+				segments: [
+					makeSegment({
+						text: "a b c d e f g h", // 8 single-letter words
+						start: 0,
+						end: 8,
+					}),
+				],
+				wordsPerChunk: 4,
+			});
+
+			expect(result).toHaveLength(2); // 8 words / 4 per chunk
+			expect(result[0].text).toBe("a b c d");
+			expect(result[1].text).toBe("e f g h");
+		});
+
+		it("creates a shorter last chunk for non-divisible word counts", () => {
+			const result = buildCaptionChunks({
+				segments: [
+					makeSegment({
+						text: "one two three four five", // 5 words -> 3 + 2
+						start: 0,
+						end: 5,
+					}),
+				],
+			});
+
+			expect(result).toHaveLength(2);
+			expect(result[0].text).toBe("one two three");
+			expect(result[1].text).toBe("four five"); // the remainder is kept as its own chunk
+		});
+	});
+
+	describe("Chinese text (CJK)", () => { // Character-based chunking of text that has no spaces.
+		it("splits Chinese text by character count (default 8)", () => {
+			const result = buildCaptionChunks({
+				segments: [
+					makeSegment({
+						text: "今天天气很好我们一起去公园玩吧", // 15 characters -> 8 + 7
+						start: 0,
+						end: 14,
+					}),
+				],
+			});
+
+			expect(result).toHaveLength(2);
+			expect(result[0].text).toBe("今天天气很好我们");
+			expect(result[1].text).toBe("一起去公园玩吧");
+		});
+
+		it("handles short Chinese text as a single chunk", () => {
+			const result = buildCaptionChunks({
+				segments: [
+					makeSegment({ text: "你好世界", start: 0, end: 4 }), // 4 characters, below the 8-character chunk size
+				],
+			});
+
+			expect(result).toHaveLength(1);
+			expect(result[0].text).toBe("你好世界");
+		});
+
+		it("splits long Chinese text into multiple chunks", () => {
+			const text = "这是一段用来测试字幕分段功能的较长中文文本内容"; // 23 characters
+			const result = buildCaptionChunks({
+				segments: [makeSegment({ text, start: 0, end: 22 })],
+			});
+
+			expect(result.length).toBeGreaterThanOrEqual(2); // only a lower bound; the exact split is not pinned here
+			for (const chunk of result) {
+				expect(chunk.text.length).toBeLessThanOrEqual(8); // .length counts UTF-16 units; every character here is a single unit
+			}
+		});
+	});
+
+	describe("mixed CJK and English", () => { // Text that interleaves CJK ideographs with a Latin word.
+		it("treats mixed text with >30% CJK as CJK mode", () => {
+			const result = buildCaptionChunks({
+				segments: [
+					makeSegment({
+						text: "今天我用React开发了一个组件", // 16 characters: 11 CJK ideographs + 5 Latin letters
+						start: 0,
+						end: 10,
+					}),
+				],
+			});
+
+			expect(result.length).toBeGreaterThanOrEqual(1);
+			for (const chunk of result) {
+				expect(chunk.text.length).toBeLessThanOrEqual(15); // NOTE(review): loose bound — it only rules out the whole 16-character string as one chunk; confirm the intended limit against caption.ts
+			}
+		});
+	});
+
+	describe("Japanese text", () => { // Same character-count chunking, applied to mixed kanji and hiragana.
+		it("splits Japanese text by character count", () => {
+			const result = buildCaptionChunks({
+				segments: [
+					makeSegment({
+						text: "今日はとても良い天気ですね", // 13 characters -> 8 + 5
+						start: 0,
+						end: 12,
+					}),
+				],
+			});
+
+			expect(result).toHaveLength(2);
+			expect(result[0].text).toBe("今日はとても良い");
+			expect(result[1].text).toBe("天気ですね");
+		});
+	});
+
+	describe("timing", () => { // startTime/duration assignment for the produced chunks.
+		it("assigns proportional durations based on token count", () => {
+			const result = buildCaptionChunks({
+				segments: [
+					makeSegment({
+						text: "one two three four five six", // 6 words over a 0..6 span
+						start: 0,
+						end: 6,
+					}),
+				],
+			});
+
+			expect(result[0].startTime).toBe(0);
+			expect(result[0].duration).toBe(3); // 3 of 6 words -> half of the span
+			expect(result[1].startTime).toBe(3); // starts where the first chunk ends
+			expect(result[1].duration).toBe(3);
+		});
+
+		it("enforces minDuration for very short chunks", () => {
+			const result = buildCaptionChunks({
+				segments: [
+					makeSegment({
+						text: "hi there everyone welcome", // 4 words -> two 2-word chunks
+						start: 0,
+						end: 0.5, // whole segment span is shorter than minDuration
+					}),
+				],
+				wordsPerChunk: 2,
+				minDuration: 0.8,
+			});
+
+			for (const chunk of result) {
+				expect(chunk.duration).toBeGreaterThanOrEqual(0.8); // every chunk must be at least minDuration long
+			}
+		});
+
+		it("prevents overlapping captions across segments", () => {
+			const result = buildCaptionChunks({
+				segments: [
+					makeSegment({ text: "first segment text", start: 0, end: 2 }),
+					makeSegment({
+						text: "second segment text",
+						start: 1.5, // begins before the first segment's end (2)
+						end: 3,
+					}),
+				],
+			});
+
+			for (let i = 1; i < result.length; i++) { // compare each chunk with its predecessor
+				const prevEnd = result[i - 1].startTime + result[i - 1].duration;
+				expect(result[i].startTime).toBeGreaterThanOrEqual(prevEnd);
+			}
+		});
+
+		it("keeps Chinese caption timing proportional", () => {
+			const text = "今天天气很好我们一起去公园"; // 13 characters
+			const totalTokens = 13; // must match the character count of `text`
+			const segmentDuration = 12;
+			const tokensPerSecond = totalTokens / segmentDuration;
+
+			const result = buildCaptionChunks({
+				segments: [
+					makeSegment({ text, start: 0, end: segmentDuration }),
+				],
+			});
+
+			expect(result).toHaveLength(2); // expected split: 8 characters + 5 characters
+			expect(result[0].startTime).toBe(0);
+			expect(result[0].duration).toBeCloseTo(8 / tokensPerSecond); // toBeCloseTo tolerates floating-point rounding
+			expect(result[1].startTime).toBeCloseTo(8 / tokensPerSecond);
+			expect(result[1].duration).toBeCloseTo(5 / tokensPerSecond);
+		});
+	});
+
+	describe("multiple segments", () => { // A space-delimited segment followed by a CJK segment in one call.
+		it("processes segments sequentially", () => {
+			const result = buildCaptionChunks({
+				segments: [
+					makeSegment({
+						text: "hello world there", // 3 words -> expected as a single chunk
+						start: 0,
+						end: 3,
+					}),
+					makeSegment({
+						text: "你好世界测试文本更多", // 10 characters
+						start: 5,
+						end: 15,
+					}),
+				],
+			});
+
+			expect(result.length).toBeGreaterThanOrEqual(2);
+			expect(result[0].text).toBe("hello world there"); // first segment's chunk comes first in the output
+
+			const chineseChunks = result.filter((c) =>
+				/[\u4e00-\u9fff]/.test(c.text), // matches any character in U+4E00..U+9FFF
+			);
+			expect(chineseChunks.length).toBeGreaterThanOrEqual(1);
+		});
+	});
+});
diff --git a/apps/web/src/lib/transcription/caption.ts b/apps/web/src/lib/transcription/caption.ts