Skip to content

Commit 895f249

Browse files
committed
feat: add transcription model selection and improve caption chunking for CJK text
1 parent 553fe41 commit 895f249

4 files changed

Lines changed: 370 additions & 9 deletions

File tree

apps/web/package.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,8 @@
1616
"db:push:prod": "cross-env NODE_ENV=production drizzle-kit push",
1717
"translation:extract": "i18next-toolkit extract",
1818
"translation:scan": "i18next-toolkit scan",
19-
"translation:translate": "i18next-toolkit translate"
19+
"translation:translate": "i18next-toolkit translate",
20+
"test": "bun test"
2021
},
2122
"dependencies": {
2223
"@cutia/env": "workspace:*",

apps/web/src/components/editor/panels/assets/views/captions.tsx

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,14 +11,19 @@ import {
1111
import { useState, useRef } from "react";
1212
import { extractTimelineAudio } from "@/lib/media/mediabunny";
1313
import { useEditor } from "@/hooks/use-editor";
14-
import { TRANSCRIPTION_LANGUAGES } from "@/constants/transcription-constants";
14+
import {
15+
TRANSCRIPTION_LANGUAGES,
16+
TRANSCRIPTION_MODELS,
17+
DEFAULT_TRANSCRIPTION_MODEL,
18+
} from "@/constants/transcription-constants";
1519
import {
1620
SUBTITLE_TEMPLATES,
1721
createSubtitleFromTemplate,
1822
type SubtitleTemplate,
1923
} from "@/constants/subtitle-constants";
2024
import type {
2125
TranscriptionLanguage,
26+
TranscriptionModelId,
2227
TranscriptionProgress,
2328
} from "@/types/transcription";
2429
import { transcriptionService } from "@/services/transcription/service";
@@ -32,6 +37,8 @@ export function Captions() {
3237
const { t } = useTranslation();
3338
const [selectedLanguage, setSelectedLanguage] =
3439
useState<TranscriptionLanguage>("auto");
40+
const [selectedModelId, setSelectedModelId] =
41+
useState<TranscriptionModelId>(DEFAULT_TRANSCRIPTION_MODEL);
3542
const [selectedTemplate, setSelectedTemplate] = useState<SubtitleTemplate>(
3643
SUBTITLE_TEMPLATES[0],
3744
);
@@ -82,6 +89,7 @@ export function Captions() {
8289
const result = await transcriptionService.transcribe({
8390
audioData: samples,
8491
language: selectedLanguage,
92+
modelId: selectedModelId,
8593
onProgress: handleProgress,
8694
});
8795

@@ -155,6 +163,32 @@ export function Captions() {
155163
className="flex h-full flex-col justify-between"
156164
>
157165
<div className="flex flex-col gap-5">
166+
<div className="flex flex-col gap-3">
167+
<Label>{t("Model")}</Label>
168+
<Select
169+
value={selectedModelId}
170+
onValueChange={(value) =>
171+
setSelectedModelId(value as TranscriptionModelId)
172+
}
173+
disabled={isProcessing}
174+
>
175+
<SelectTrigger>
176+
<SelectValue placeholder={t("Select a model")} />
177+
</SelectTrigger>
178+
<SelectContent>
179+
{TRANSCRIPTION_MODELS.map((model) => (
180+
<SelectItem key={model.id} value={model.id}>
181+
{model.name}
182+
</SelectItem>
183+
))}
184+
</SelectContent>
185+
</Select>
186+
<p className="text-muted-foreground text-xs">
187+
{TRANSCRIPTION_MODELS.find((m) => m.id === selectedModelId)
188+
?.description ?? ""}
189+
</p>
190+
</div>
191+
158192
<div className="flex flex-col gap-3">
159193
<Label>{t("Language")}</Label>
160194
<Select
Lines changed: 269 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,269 @@
1+
import { describe, it, expect } from "bun:test";
2+
import { buildCaptionChunks } from "../caption";
3+
import type { TranscriptionSegment } from "@/types/transcription";
4+
5+
/**
 * Test helper that builds a TranscriptionSegment fixture.
 *
 * Takes a single options object and returns an object holding exactly the
 * three fields passed in; no validation or normalisation is applied.
 *
 * @param text - Segment text handed to the chunker under test.
 * @param start - Segment start (presumably seconds — confirm against the
 *   TranscriptionSegment type).
 * @param end - Segment end, in the same unit as `start`.
 * @returns The `{ text, start, end }` object typed as TranscriptionSegment.
 */
function makeSegment({
6+
text,
7+
start,
8+
end,
9+
}: {
10+
text: string;
11+
start: number;
12+
end: number;
13+
}): TranscriptionSegment {
14+
return { text, start, end };
15+
}
16+
17+
describe("buildCaptionChunks", () => {
18+
// Degenerate inputs: no segments, a segment with no visible text, and a
// segment that holds a single word.
describe("empty / edge cases", () => {
19+
it("returns empty array for no segments", () => {
20+
const result = buildCaptionChunks({ segments: [] });
21+
expect(result).toEqual([]);
22+
});
23+
24+
// The fixture text is whitespace-only rather than the empty string, so this
// expects blank text to yield no chunks at all.
it("skips segments with empty text", () => {
25+
const result = buildCaptionChunks({
26+
segments: [makeSegment({ text: " ", start: 0, end: 5 })],
27+
});
28+
expect(result).toEqual([]);
29+
});
30+
31+
// One word is fewer than a full chunk; it is still expected to come back as
// exactly one caption with the text unchanged.
it("handles a single word segment", () => {
32+
const result = buildCaptionChunks({
33+
segments: [makeSegment({ text: "Hello", start: 0, end: 2 })],
34+
});
35+
expect(result).toHaveLength(1);
36+
expect(result[0].text).toBe("Hello");
37+
});
38+
});
39+
40+
// Space-delimited text: chunks are expected to be groups of whole words,
// re-joined with single spaces.
describe("English text (space-delimited)", () => {
41+
// No wordsPerChunk is passed here; six words are expected to give two
// three-word chunks.
it("splits into chunks of wordsPerChunk (default 3)", () => {
42+
const result = buildCaptionChunks({
43+
segments: [
44+
makeSegment({
45+
text: "one two three four five six",
46+
start: 0,
47+
end: 6,
48+
}),
49+
],
50+
});
51+
52+
expect(result).toHaveLength(2);
53+
expect(result[0].text).toBe("one two three");
54+
expect(result[1].text).toBe("four five six");
55+
});
56+
57+
// Eight words with wordsPerChunk: 4 are expected to give two four-word chunks.
it("respects custom wordsPerChunk", () => {
58+
const result = buildCaptionChunks({
59+
segments: [
60+
makeSegment({
61+
text: "a b c d e f g h",
62+
start: 0,
63+
end: 8,
64+
}),
65+
],
66+
wordsPerChunk: 4,
67+
});
68+
69+
expect(result).toHaveLength(2);
70+
expect(result[0].text).toBe("a b c d");
71+
expect(result[1].text).toBe("e f g h");
72+
});
73+
74+
// Five words do not divide evenly by three: the remainder is expected to
// form a shorter trailing chunk rather than being dropped or merged.
it("creates a shorter last chunk for non-divisible word counts", () => {
75+
const result = buildCaptionChunks({
76+
segments: [
77+
makeSegment({
78+
text: "one two three four five",
79+
start: 0,
80+
end: 5,
81+
}),
82+
],
83+
});
84+
85+
expect(result).toHaveLength(2);
86+
expect(result[0].text).toBe("one two three");
87+
expect(result[1].text).toBe("four five");
88+
});
89+
});
90+
91+
// Chinese text has no spaces between words; these cases expect chunking by
// character count, with at most 8 characters per chunk when no option is set.
describe("Chinese text (CJK)", () => {
92+
// The fixture is 15 characters long: expected split is 8 + 7.
it("splits Chinese text by character count (default 8)", () => {
93+
const result = buildCaptionChunks({
94+
segments: [
95+
makeSegment({
96+
text: "今天天气很好我们一起去公园玩吧",
97+
start: 0,
98+
end: 14,
99+
}),
100+
],
101+
});
102+
103+
expect(result).toHaveLength(2);
104+
expect(result[0].text).toBe("今天天气很好我们");
105+
expect(result[1].text).toBe("一起去公园玩吧");
106+
});
107+
108+
// Four characters fit within one chunk, so the text must come back whole.
it("handles short Chinese text as a single chunk", () => {
109+
const result = buildCaptionChunks({
110+
segments: [
111+
makeSegment({ text: "你好世界", start: 0, end: 4 }),
112+
],
113+
});
114+
115+
expect(result).toHaveLength(1);
116+
expect(result[0].text).toBe("你好世界");
117+
});
118+
119+
// The fixture is 23 characters long. Only the invariants are pinned (at
// least two chunks, none longer than 8 characters), not the exact split.
it("splits long Chinese text into multiple chunks", () => {
120+
const text = "这是一段用来测试字幕分段功能的较长中文文本内容";
121+
const result = buildCaptionChunks({
122+
segments: [makeSegment({ text, start: 0, end: 22 })],
123+
});
124+
125+
expect(result.length).toBeGreaterThanOrEqual(2);
126+
for (const chunk of result) {
127+
expect(chunk.text.length).toBeLessThanOrEqual(8);
128+
}
129+
});
130+
});
131+
132+
// Text that mixes CJK characters with an embedded Latin word.
describe("mixed CJK and English", () => {
133+
// The fixture is 16 characters long (11 CJK + "React") and contains no
// whitespace, so the <= 15 bound below can only hold if the text is split
// into more than one chunk.
// NOTE(review): the 30% threshold in the test name and the way "React" is
// tokenised live in the implementation, which is not visible here; the
// loose bound of 15 presumably leaves room for the Latin word — confirm.
it("treats mixed text with >30% CJK as CJK mode", () => {
134+
const result = buildCaptionChunks({
135+
segments: [
136+
makeSegment({
137+
text: "今天我用React开发了一个组件",
138+
start: 0,
139+
end: 10,
140+
}),
141+
],
142+
});
143+
144+
expect(result.length).toBeGreaterThanOrEqual(1);
145+
for (const chunk of result) {
146+
expect(chunk.text.length).toBeLessThanOrEqual(15);
147+
}
148+
});
149+
});
150+
151+
// Japanese text (kanji mixed with hiragana) is expected to be chunked by
// character count in the same way as the Chinese cases.
describe("Japanese text", () => {
152+
// The fixture is 13 characters long: expected split is 8 + 5.
it("splits Japanese text by character count", () => {
153+
const result = buildCaptionChunks({
154+
segments: [
155+
makeSegment({
156+
text: "今日はとても良い天気ですね",
157+
start: 0,
158+
end: 12,
159+
}),
160+
],
161+
});
162+
163+
expect(result).toHaveLength(2);
164+
expect(result[0].text).toBe("今日はとても良い");
165+
expect(result[1].text).toBe("天気ですね");
166+
});
167+
});
168+
169+
// Timing expectations: how a segment's time span is divided among its chunks
// (startTime / duration), the minimum-duration floor, and non-overlap.
describe("timing", () => {
170+
// Six words over a 0–6 span with three words per chunk: each chunk is
// expected to receive half of the span (start 0 and 3, duration 3 each).
it("assigns proportional durations based on token count", () => {
171+
const result = buildCaptionChunks({
172+
segments: [
173+
makeSegment({
174+
text: "one two three four five six",
175+
start: 0,
176+
end: 6,
177+
}),
178+
],
179+
});
180+
181+
expect(result[0].startTime).toBe(0);
182+
expect(result[0].duration).toBe(3);
183+
expect(result[1].startTime).toBe(3);
184+
expect(result[1].duration).toBe(3);
185+
});
186+
187+
// Four words squeezed into a 0.5 span would give each two-word chunk far
// less than 0.8, so every chunk is expected to be raised to minDuration.
// NOTE(review): there is no length assertion, so the loop passes vacuously
// if the result is empty.
it("enforces minDuration for very short chunks", () => {
188+
const result = buildCaptionChunks({
189+
segments: [
190+
makeSegment({
191+
text: "hi there everyone welcome",
192+
start: 0,
193+
end: 0.5,
194+
}),
195+
],
196+
wordsPerChunk: 2,
197+
minDuration: 0.8,
198+
});
199+
200+
for (const chunk of result) {
201+
expect(chunk.duration).toBeGreaterThanOrEqual(0.8);
202+
}
203+
});
204+
205+
// The second segment starts at 1.5, before the first one ends at 2. Each
// chunk must still start at or after the end (startTime + duration) of the
// chunk before it.
it("prevents overlapping captions across segments", () => {
206+
const result = buildCaptionChunks({
207+
segments: [
208+
makeSegment({ text: "first segment text", start: 0, end: 2 }),
209+
makeSegment({
210+
text: "second segment text",
211+
start: 1.5,
212+
end: 3,
213+
}),
214+
],
215+
});
216+
217+
for (let i = 1; i < result.length; i++) {
218+
const prevEnd = result[i - 1].startTime + result[i - 1].duration;
219+
expect(result[i].startTime).toBeGreaterThanOrEqual(prevEnd);
220+
}
221+
});
222+
223+
// 13 characters over a span of 12: the 8-character chunk is expected to get
// 8/13 of the span and the 5-character chunk the remaining 5/13.
it("keeps Chinese caption timing proportional", () => {
224+
const text = "今天天气很好我们一起去公园";
225+
// Hard-coded character count of `text`; must be updated if the fixture changes.
const totalTokens = 13;
226+
const segmentDuration = 12;
227+
const tokensPerSecond = totalTokens / segmentDuration;
228+
229+
const result = buildCaptionChunks({
230+
segments: [
231+
makeSegment({ text, start: 0, end: segmentDuration }),
232+
],
233+
});
234+
235+
expect(result).toHaveLength(2);
236+
expect(result[0].startTime).toBe(0);
237+
// toBeCloseTo is used because these quotients are not exact in floating point.
expect(result[0].duration).toBeCloseTo(8 / tokensPerSecond);
238+
expect(result[1].startTime).toBeCloseTo(8 / tokensPerSecond);
239+
expect(result[1].duration).toBeCloseTo(5 / tokensPerSecond);
240+
});
241+
});
242+
243+
// Several segments in one call, mixing a space-delimited segment with a
// Chinese one.
describe("multiple segments", () => {
244+
// The three-word English segment is expected to be the first chunk, whole.
// For the 10-character Chinese segment only a loose check is made: at least
// one returned chunk contains a character in U+4E00–U+9FFF.
it("processes segments sequentially", () => {
245+
const result = buildCaptionChunks({
246+
segments: [
247+
makeSegment({
248+
text: "hello world there",
249+
start: 0,
250+
end: 3,
251+
}),
252+
makeSegment({
253+
text: "你好世界测试文本更多",
254+
start: 5,
255+
end: 15,
256+
}),
257+
],
258+
});
259+
260+
expect(result.length).toBeGreaterThanOrEqual(2);
261+
expect(result[0].text).toBe("hello world there");
262+
263+
const chineseChunks = result.filter((c) =>
264+
/[\u4e00-\u9fff]/.test(c.text),
265+
);
266+
expect(chineseChunks.length).toBeGreaterThanOrEqual(1);
267+
});
268+
});
269+
});

0 commit comments

Comments
 (0)