Skip to content

Commit 0d6c491

Browse files
committed
feat: support chinese
1 parent d7654ba commit 0d6c491

File tree

6 files changed

+150
-75
lines changed

6 files changed

+150
-75
lines changed

kokoro.js/package-lock.json

Lines changed: 33 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

kokoro.js/package.json

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -35,6 +35,7 @@
3535
"description": "High-quality text-to-speech for the web",
3636
"dependencies": {
3737
"@huggingface/transformers": "^3.5.1",
38+
"phonemize": "^1.0.0",
3839
"phonemizer": "^1.2.1"
3940
},
4041
"devDependencies": {

kokoro.js/src/kokoro.js

Lines changed: 9 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -23,10 +23,12 @@ export class KokoroTTS {
2323
* Create a new KokoroTTS instance.
2424
* @param {import('@huggingface/transformers').StyleTextToSpeech2Model} model The model
2525
* @param {import('@huggingface/transformers').PreTrainedTokenizer} tokenizer The tokenizer
26+
* @param {"1.0"|"1.1"} version The version of the Chinese phonemization style
2627
*/
27-
constructor(model, tokenizer) {
28+
constructor(model, tokenizer, version = "1.0") {
2829
this.model = model;
2930
this.tokenizer = tokenizer;
31+
this.version = version;
3032
}
3133

3234
/**
@@ -42,8 +44,9 @@ export class KokoroTTS {
4244
const model = StyleTextToSpeech2Model.from_pretrained(model_id, { progress_callback, dtype, device });
4345
const tokenizer = AutoTokenizer.from_pretrained(model_id, { progress_callback });
4446

45-
const info = await Promise.all([model, tokenizer]);
46-
return new KokoroTTS(...info);
47+
const version = model_id.includes("v1.1-zh") ? "1.1" : "1.0";
48+
49+
return new KokoroTTS(await model, await tokenizer, version);
4750
}
4851

4952
get voices() {
@@ -60,7 +63,7 @@ export class KokoroTTS {
6063
console.table(VOICES);
6164
throw new Error(`Voice "${voice}" not found. Should be one of: ${Object.keys(VOICES).join(", ")}.`);
6265
}
63-
const language = /** @type {"a"|"b"} */ (voice.at(0)); // "a" or "b"
66+
const language = /** @type {"a"|"b"|"z"} */ (voice.at(0)); // "a" or "b" or "z"
6467
return language;
6568
}
6669

@@ -74,7 +77,7 @@ export class KokoroTTS {
7477
async generate(text, { voice = "af_heart", speed = 1 } = {}) {
7578
const language = this._validate_voice(voice);
7679

77-
const phonemes = await phonemize(text, language);
80+
const phonemes = await phonemize(text, language, this.version);
7881
const { input_ids } = this.tokenizer(phonemes, {
7982
truncation: true,
8083
});
@@ -135,7 +138,7 @@ export class KokoroTTS {
135138
throw new Error("Invalid input type. Expected string or TextSplitterStream.");
136139
}
137140
for await (const sentence of splitter) {
138-
const phonemes = await phonemize(sentence, language);
141+
const phonemes = await phonemize(sentence, language, this.version);
139142
const { input_ids } = this.tokenizer(phonemes, {
140143
truncation: true,
141144
});

kokoro.js/src/phonemize.js

Lines changed: 21 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,5 @@
11
import { phonemize as espeakng } from "phonemizer";
2+
import { toZhuyin, toIPA } from "phonemize";
23

34
/**
45
* Helper function to split a string on a regex, but keep the delimiters.
@@ -164,25 +165,40 @@ function escapeRegExp(string) {
164165
const PUNCTUATION = ';:,.!?¡¿—…"«»“”(){}[]';
165166
const PUNCTUATION_PATTERN = new RegExp(`(\\s*[${escapeRegExp(PUNCTUATION)}]+\\s*)+`, "g");
166167

168+
const ZH_PATTERN = /[\u4e00-\u9fff]+/;
169+
const MIXED_PATTERN = new RegExp(`(\\s*[${escapeRegExp(PUNCTUATION)}]+\\s*|${ZH_PATTERN.source})`, "g");
170+
167171
/**
168172
* Phonemize text using the eSpeak-NG phonemizer
169173
* @param {string} text The text to phonemize
170-
* @param {"a"|"b"} language The language to use
174+
* @param {"a"|"b"|"z"} language The language to use
175+
* @param {"1.0"|"1.1"} version The version of the Chinese phonemization style
171176
* @param {boolean} norm Whether to normalize the text
172177
* @returns {Promise<string>} The phonemized text
173178
*/
174-
export async function phonemize(text, language = "a", norm = true) {
179+
export async function phonemize(text, language = "a", version = "1.0", norm = true) {
175180
// 1. Normalize text
176181
if (norm) {
177182
text = normalize_text(text);
178183
}
179184

180-
// 2. Split into chunks, to ensure we preserve punctuation
181-
const sections = split(text, PUNCTUATION_PATTERN);
185+
// 2. Split into chunks, to ensure we preserve punctuation and separate Chinese text if needed
186+
const sections = split(text, language === "z" ? MIXED_PATTERN : PUNCTUATION_PATTERN);
182187

183188
// 3. Convert each section to phonemes
184189
const lang = language === "a" ? "en-us" : "en";
185-
const ps = (await Promise.all(sections.map(async ({ match, text }) => (match ? text : (await espeakng(text, lang)).join(" "))))).join("");
190+
const processedSections = await Promise.all(sections.map(async ({ match, text }) => {
191+
if (!match) {
192+
return (await espeakng(text, lang)).join(" ");
193+
}
194+
if (ZH_PATTERN.test(text)) {
195+
return version === "1.0" ? toIPA(text, { toneFormat: "arrow" }) : toZhuyin(text);
196+
} else {
197+
return text;
198+
}
199+
}));
200+
201+
const ps = processedSections.join("");
186202

187203
// 4. Post-process phonemes
188204
let processed = ps

kokoro.js/src/voices.js

Lines changed: 64 additions & 64 deletions
Original file line number | Diff line number | Diff line change
@@ -248,70 +248,70 @@ export const VOICES = Object.freeze({
248248
// targetQuality: "B",
249249
// overallGrade: "C-",
250250
// },
251-
// zf_xiaobei: {
252-
// name: "xiaobei",
253-
// language: "zh",
254-
// gender: "Female",
255-
// traits: "🚺",
256-
// targetQuality: "C",
257-
// overallGrade: "D",
258-
// },
259-
// zf_xiaoni: {
260-
// name: "xiaoni",
261-
// language: "zh",
262-
// gender: "Female",
263-
// traits: "🚺",
264-
// targetQuality: "C",
265-
// overallGrade: "D",
266-
// },
267-
// zf_xiaoxiao: {
268-
// name: "xiaoxiao",
269-
// language: "zh",
270-
// gender: "Female",
271-
// traits: "🚺",
272-
// targetQuality: "C",
273-
// overallGrade: "D",
274-
// },
275-
// zf_xiaoyi: {
276-
// name: "xiaoyi",
277-
// language: "zh",
278-
// gender: "Female",
279-
// traits: "🚺",
280-
// targetQuality: "C",
281-
// overallGrade: "D",
282-
// },
283-
// zm_yunjian: {
284-
// name: "yunjian",
285-
// language: "zh",
286-
// gender: "Male",
287-
// traits: "🚹",
288-
// targetQuality: "C",
289-
// overallGrade: "D",
290-
// },
291-
// zm_yunxi: {
292-
// name: "yunxi",
293-
// language: "zh",
294-
// gender: "Male",
295-
// traits: "🚹",
296-
// targetQuality: "C",
297-
// overallGrade: "D",
298-
// },
299-
// zm_yunxia: {
300-
// name: "yunxia",
301-
// language: "zh",
302-
// gender: "Male",
303-
// traits: "🚹",
304-
// targetQuality: "C",
305-
// overallGrade: "D",
306-
// },
307-
// zm_yunyang: {
308-
// name: "yunyang",
309-
// language: "zh",
310-
// gender: "Male",
311-
// traits: "🚹",
312-
// targetQuality: "C",
313-
// overallGrade: "D",
314-
// },
251+
zf_xiaobei: {
252+
name: "xiaobei",
253+
language: "zh",
254+
gender: "Female",
255+
traits: "🚺",
256+
targetQuality: "C",
257+
overallGrade: "D",
258+
},
259+
zf_xiaoni: {
260+
name: "xiaoni",
261+
language: "zh",
262+
gender: "Female",
263+
traits: "🚺",
264+
targetQuality: "C",
265+
overallGrade: "D",
266+
},
267+
zf_xiaoxiao: {
268+
name: "xiaoxiao",
269+
language: "zh",
270+
gender: "Female",
271+
traits: "🚺",
272+
targetQuality: "C",
273+
overallGrade: "D",
274+
},
275+
zf_xiaoyi: {
276+
name: "xiaoyi",
277+
language: "zh",
278+
gender: "Female",
279+
traits: "🚺",
280+
targetQuality: "C",
281+
overallGrade: "D",
282+
},
283+
zm_yunjian: {
284+
name: "yunjian",
285+
language: "zh",
286+
gender: "Male",
287+
traits: "🚹",
288+
targetQuality: "C",
289+
overallGrade: "D",
290+
},
291+
zm_yunxi: {
292+
name: "yunxi",
293+
language: "zh",
294+
gender: "Male",
295+
traits: "🚹",
296+
targetQuality: "C",
297+
overallGrade: "D",
298+
},
299+
zm_yunxia: {
300+
name: "yunxia",
301+
language: "zh",
302+
gender: "Male",
303+
traits: "🚹",
304+
targetQuality: "C",
305+
overallGrade: "D",
306+
},
307+
zm_yunyang: {
308+
name: "yunyang",
309+
language: "zh",
310+
gender: "Male",
311+
traits: "🚹",
312+
targetQuality: "C",
313+
overallGrade: "D",
314+
},
315315
// ef_dora: {
316316
// name: "dora",
317317
// language: "es",

kokoro.js/tests/phonemize.test.js

Lines changed: 22 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -77,6 +77,14 @@ const B_TEST_CASES = new Map([
7777
["X's mark", "ˈɛksɪz mˈɑːk"],
7878
]);
7979

80+
const Z_TEST_CASES = new Map([
81+
["中文, test", "ʈʂʊŋ→ wən↗, tˈɛst"],
82+
]);
83+
84+
const Z_V2_TEST_CASES = new Map([
85+
["中文, test", "ㄓㄨㄥ1 ㄨㄣ2, tˈɛst"],
86+
]);
87+
8088
describe("phonemize", () => {
8189
describe("en-us", () => {
8290
for (const [input, expected] of A_TEST_CASES) {
@@ -92,4 +100,18 @@ describe("phonemize", () => {
92100
});
93101
}
94102
});
103+
describe("zh v1", () => {
104+
for (const [input, expected] of Z_TEST_CASES) {
105+
test(`phonemize("${input}")`, async () => {
106+
expect(await phonemize(input, "z", "1.0")).toEqual(expected);
107+
});
108+
}
109+
});
110+
describe("zh v2", () => {
111+
for (const [input, expected] of Z_V2_TEST_CASES) {
112+
test(`phonemize("${input}")`, async () => {
113+
expect(await phonemize(input, "z", "1.1")).toEqual(expected);
114+
});
115+
}
116+
});
95117
});

0 commit comments

Comments
 (0)