add speed parameter

This commit is contained in:
ANLGBOY
2025-11-19 19:42:24 +09:00
parent c31b6745e4
commit 8518b839c1
30 changed files with 246 additions and 61 deletions
+2
View File
@@ -4,6 +4,8 @@ High-performance text-to-speech inference using ONNX Runtime.
## 📰 Update News
**2025.11.19** - Added `--speed` parameter to control speech synthesis speed (default: 1.05, recommended range: 0.9-1.5).
**2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
## Requirements
+5 -2
View File
@@ -10,6 +10,7 @@ namespace fs = std::filesystem;
struct Args {
std::string onnx_dir = "../assets/onnx";
int total_step = 5;
float speed = 1.05f;
int n_test = 4;
std::vector<std::string> voice_style = {"../assets/voice_styles/M1.json"};
std::vector<std::string> text = {
@@ -36,6 +37,7 @@ Args parseArgs(int argc, char* argv[]) {
std::string arg = argv[i];
if (arg == "--onnx-dir" && i + 1 < argc) args.onnx_dir = argv[++i];
else if (arg == "--total-step" && i + 1 < argc) args.total_step = std::stoi(argv[++i]);
else if (arg == "--speed" && i + 1 < argc) args.speed = std::stof(argv[++i]);
else if (arg == "--n-test" && i + 1 < argc) args.n_test = std::stoi(argv[++i]);
else if (arg == "--voice-style" && i + 1 < argc) args.voice_style = splitString(argv[++i], ',');
else if (arg == "--text" && i + 1 < argc) args.text = splitString(argv[++i], '|');
@@ -51,6 +53,7 @@ int main(int argc, char* argv[]) {
// --- 1. Parse arguments --- //
Args args = parseArgs(argc, argv);
int total_step = args.total_step;
float speed = args.speed;
int n_test = args.n_test;
std::string save_dir = args.save_dir;
std::vector<std::string> voice_style_paths = args.voice_style;
@@ -84,9 +87,9 @@ int main(int argc, char* argv[]) {
auto result = timer("Generating speech from text", [&]() {
if (batch) {
return text_to_speech->batch(memory_info, text_list, style, total_step);
return text_to_speech->batch(memory_info, text_list, style, total_step, speed);
} else {
return text_to_speech->call(memory_info, text_list[0], style, total_step);
return text_to_speech->call(memory_info, text_list[0], style, total_step, speed);
}
});
+12 -4
View File
@@ -160,7 +160,8 @@ TextToSpeech::SynthesisResult TextToSpeech::_infer(
Ort::MemoryInfo& memory_info,
const std::vector<std::string>& text_list,
const Style& style,
int total_step
int total_step,
float speed
) {
int bsz = text_list.size();
@@ -213,6 +214,11 @@ TextToSpeech::SynthesisResult TextToSpeech::_infer(
auto* dur_data = dp_outputs[0].GetTensorMutableData<float>();
std::vector<float> duration(dur_data, dur_data + bsz);
// Apply speed factor to duration
for (auto& dur : duration) {
dur /= speed;
}
// Create new tensors for text encoder (previous ones were moved)
text_ids_tensor = intArrayToTensor(memory_info, text_ids, text_ids_shape);
text_mask_tensor = arrayToTensor(memory_info, text_mask, text_mask_shape);
@@ -370,6 +376,7 @@ TextToSpeech::SynthesisResult TextToSpeech::call(
const std::string& text,
const Style& style,
int total_step,
float speed,
float silence_duration
) {
if (style.getTtlShape()[0] != 1) {
@@ -381,7 +388,7 @@ TextToSpeech::SynthesisResult TextToSpeech::call(
float dur_cat = 0.0f;
for (const auto& chunk : text_list) {
auto result = _infer(memory_info, {chunk}, style, total_step);
auto result = _infer(memory_info, {chunk}, style, total_step, speed);
if (wav_cat.empty()) {
wav_cat = result.wav;
@@ -406,9 +413,10 @@ TextToSpeech::SynthesisResult TextToSpeech::batch(
Ort::MemoryInfo& memory_info,
const std::vector<std::string>& text_list,
const Style& style,
int total_step
int total_step,
float speed
) {
return _infer(memory_info, text_list, style, total_step);
return _infer(memory_info, text_list, style, total_step, speed);
}
// ============================================================================
+5 -2
View File
@@ -91,6 +91,7 @@ public:
const std::string& text,
const Style& style,
int total_step,
float speed = 1.05f,
float silence_duration = 0.3f
);
@@ -98,7 +99,8 @@ public:
Ort::MemoryInfo& memory_info,
const std::vector<std::string>& text_list,
const Style& style,
int total_step
int total_step,
float speed = 1.05f
);
int getSampleRate() const { return sample_rate_; }
@@ -108,7 +110,8 @@ private:
Ort::MemoryInfo& memory_info,
const std::vector<std::string>& text_list,
const Style& style,
int total_step
int total_step,
float speed = 1.05f
);
Config cfgs_;
UnicodeProcessor* text_processor_;
+7 -2
View File
@@ -12,6 +12,7 @@ namespace Supertonic
public bool UseGpu { get; set; } = false;
public string OnnxDir { get; set; } = "assets/onnx";
public int TotalStep { get; set; } = 5;
public float Speed { get; set; } = 1.05f;
public int NTest { get; set; } = 4;
public List<string> VoiceStyle { get; set; } = new List<string> { "assets/voice_styles/M1.json" };
public List<string> Text { get; set; } = new List<string>
@@ -42,6 +43,9 @@ namespace Supertonic
case "--total-step" when i + 1 < args.Length:
result.TotalStep = int.Parse(args[++i]);
break;
case "--speed" when i + 1 < args.Length:
result.Speed = float.Parse(args[++i]);
break;
case "--n-test" when i + 1 < args.Length:
result.NTest = int.Parse(args[++i]);
break;
@@ -67,6 +71,7 @@ namespace Supertonic
// --- 1. Parse arguments --- //
var parsedArgs = ParseArgs(args);
int totalStep = parsedArgs.TotalStep;
float speed = parsedArgs.Speed;
int nTest = parsedArgs.NTest;
string saveDir = parsedArgs.SaveDir;
var voiceStylePaths = parsedArgs.VoiceStyle;
@@ -96,11 +101,11 @@ namespace Supertonic
{
if (batch)
{
return textToSpeech.Batch(textList, style, totalStep);
return textToSpeech.Batch(textList, style, totalStep, speed);
}
else
{
return textToSpeech.Call(textList[0], style, totalStep);
return textToSpeech.Call(textList[0], style, totalStep, speed);
}
});
+11 -5
View File
@@ -194,7 +194,7 @@ namespace Supertonic
return (noisyLatent, latentMask);
}
private (float[] wav, float[] duration) _Infer(List<string> textList, Style style, int totalStep)
private (float[] wav, float[] duration) _Infer(List<string> textList, Style style, int totalStep, float speed = 1.05f)
{
int bsz = textList.Count;
if (bsz != style.TtlShape[0])
@@ -222,6 +222,12 @@ namespace Supertonic
};
using var dpOutputs = _dpOrt.Run(dpInputs);
var durOnnx = dpOutputs.First(o => o.Name == "duration").AsTensor<float>().ToArray();
// Apply speed factor to duration
for (int i = 0; i < durOnnx.Length; i++)
{
durOnnx[i] /= speed;
}
// Run text encoder
var textEncInputs = new List<NamedOnnxValue>
@@ -284,7 +290,7 @@ namespace Supertonic
return (wavTensor.ToArray(), durOnnx);
}
public (float[] wav, float[] duration) Call(string text, Style style, int totalStep, float silenceDuration = 0.3f)
public (float[] wav, float[] duration) Call(string text, Style style, int totalStep, float speed = 1.05f, float silenceDuration = 0.3f)
{
if (style.TtlShape[0] != 1)
{
@@ -297,7 +303,7 @@ namespace Supertonic
foreach (var chunk in textList)
{
var (wav, duration) = _Infer(new List<string> { chunk }, style, totalStep);
var (wav, duration) = _Infer(new List<string> { chunk }, style, totalStep, speed);
if (wavCat.Count == 0)
{
@@ -317,9 +323,9 @@ namespace Supertonic
return (wavCat.ToArray(), new float[] { durCat });
}
public (float[] wav, float[] duration) Batch(List<string> textList, Style style, int totalStep)
public (float[] wav, float[] duration) Batch(List<string> textList, Style style, int totalStep, float speed = 1.05f)
{
return _Infer(textList, style, totalStep);
return _Infer(textList, style, totalStep, speed);
}
}
+2
View File
@@ -4,6 +4,8 @@ This guide provides examples for running TTS inference using `ExampleONNX.cs`.
## 📰 Update News
**2025.11.19** - Added `--speed` parameter to control speech synthesis speed (default: 1.05, recommended range: 0.9-1.5).
**2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
## Installation
+2
View File
@@ -4,6 +4,8 @@ This guide provides examples for running TTS inference using `example_onnx.go`.
## 📰 Update News
**2025.11.19** - Added `--speed` parameter to control speech synthesis speed (default: 1.05, recommended range: 0.9-1.5).
**2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
## Installation
+5 -2
View File
@@ -15,6 +15,7 @@ type Args struct {
useGPU bool
onnxDir string
totalStep int
speed float64
nTest int
voiceStyle []string
text []string
@@ -28,6 +29,7 @@ func parseArgs() *Args {
flag.BoolVar(&args.useGPU, "use-gpu", false, "Use GPU for inference (default: CPU)")
flag.StringVar(&args.onnxDir, "onnx-dir", "assets/onnx", "Path to ONNX model directory")
flag.IntVar(&args.totalStep, "total-step", 5, "Number of denoising steps")
flag.Float64Var(&args.speed, "speed", 1.05, "Speech speed factor (higher = faster)")
flag.IntVar(&args.nTest, "n-test", 4, "Number of times to generate")
flag.StringVar(&args.saveDir, "save-dir", "results", "Output directory")
flag.BoolVar(&args.batch, "batch", false, "Enable batch mode (multiple text-style pairs)")
@@ -63,6 +65,7 @@ func main() {
// --- 1. Parse arguments --- //
args := parseArgs()
totalStep := args.totalStep
speed := float32(args.speed)
nTest := args.nTest
saveDir := args.saveDir
voiceStylePaths := args.voiceStyle
@@ -123,7 +126,7 @@ func main() {
if batch {
Timer("Generating speech from text", func() interface{} {
w, d, err := textToSpeech.Batch(textList, style, totalStep)
w, d, err := textToSpeech.Batch(textList, style, totalStep, speed)
if err != nil {
fmt.Printf("Error generating speech: %v\n", err)
os.Exit(1)
@@ -134,7 +137,7 @@ func main() {
})
} else {
Timer("Generating speech from text", func() interface{} {
w, d, err := textToSpeech.Call(textList[0], style, totalStep, 0.3)
w, d, err := textToSpeech.Call(textList[0], style, totalStep, speed, 0.3)
if err != nil {
fmt.Printf("Error generating speech: %v\n", err)
os.Exit(1)
+10 -5
View File
@@ -572,7 +572,7 @@ func (tts *TextToSpeech) sampleNoisyLatent(durOnnx []float32) ([][][]float64, []
return noisyLatent, latentMask
}
func (tts *TextToSpeech) _infer(textList []string, style *Style, totalStep int) ([]float32, []float32, error) {
func (tts *TextToSpeech) _infer(textList []string, style *Style, totalStep int, speed float32) ([]float32, []float32, error) {
bsz := len(textList)
// Process text
@@ -597,6 +597,11 @@ func (tts *TextToSpeech) _infer(textList []string, style *Style, totalStep int)
durTensor := dpOutputs[0].(*ort.Tensor[float32])
defer durTensor.Destroy()
durOnnx := durTensor.GetData()
// Apply speed factor to duration
for i := range durOnnx {
durOnnx[i] /= speed
}
// Encode text
textIDsTensor2 := IntArrayToTensor(textIDs, textIDsShape)
@@ -691,14 +696,14 @@ func (tts *TextToSpeech) _infer(textList []string, style *Style, totalStep int)
}
// Call synthesizes speech from a single text with automatic chunking
func (tts *TextToSpeech) Call(text string, style *Style, totalStep int, silenceDuration float32) ([]float32, float32, error) {
func (tts *TextToSpeech) Call(text string, style *Style, totalStep int, speed float32, silenceDuration float32) ([]float32, float32, error) {
chunks := chunkText(text, 0)
var wavCat []float32
var durCat float32
for i, chunk := range chunks {
wav, duration, err := tts._infer([]string{chunk}, style, totalStep)
wav, duration, err := tts._infer([]string{chunk}, style, totalStep, speed)
if err != nil {
return nil, 0, err
}
@@ -724,8 +729,8 @@ func (tts *TextToSpeech) Call(text string, style *Style, totalStep int, silenceD
}
// Batch synthesizes speech from multiple texts
func (tts *TextToSpeech) Batch(textList []string, style *Style, totalStep int) ([]float32, []float32, error) {
return tts._infer(textList, style, totalStep)
func (tts *TextToSpeech) Batch(textList []string, style *Style, totalStep int, speed float32) ([]float32, []float32, error) {
return tts._infer(textList, style, totalStep, speed)
}
func (tts *TextToSpeech) Destroy() {
+7 -2
View File
@@ -15,6 +15,7 @@ public class ExampleONNX {
boolean useGpu = false;
String onnxDir = "assets/onnx";
int totalStep = 5;
float speed = 1.05f;
int nTest = 4;
List<String> voiceStyle = Arrays.asList("assets/voice_styles/M1.json");
List<String> text = Arrays.asList(
@@ -41,6 +42,9 @@ public class ExampleONNX {
case "--total-step":
if (i + 1 < args.length) result.totalStep = Integer.parseInt(args[++i]);
break;
case "--speed":
if (i + 1 < args.length) result.speed = Float.parseFloat(args[++i]);
break;
case "--n-test":
if (i + 1 < args.length) result.nTest = Integer.parseInt(args[++i]);
break;
@@ -76,6 +80,7 @@ public class ExampleONNX {
// --- 1. Parse arguments --- //
Args parsedArgs = parseArgs(args);
int totalStep = parsedArgs.totalStep;
float speed = parsedArgs.speed;
int nTest = parsedArgs.nTest;
String saveDir = parsedArgs.saveDir;
List<String> voiceStylePaths = parsedArgs.voiceStyle;
@@ -111,7 +116,7 @@ public class ExampleONNX {
if (batch) {
ttsResult = Helper.timer("Generating speech from text", () -> {
try {
return textToSpeech.batch(textList, style, totalStep, env);
return textToSpeech.batch(textList, style, totalStep, speed, env);
} catch (Exception e) {
throw new RuntimeException(e);
}
@@ -119,7 +124,7 @@ public class ExampleONNX {
} else {
ttsResult = Helper.timer("Generating speech from text", () -> {
try {
return textToSpeech.call(textList.get(0), style, totalStep, 0.3f, env);
return textToSpeech.call(textList.get(0), style, totalStep, speed, 0.3f, env);
} catch (Exception e) {
throw new RuntimeException(e);
}
+10 -5
View File
@@ -154,7 +154,7 @@ class TextToSpeech {
this.ldim = config.ttl.latentDim;
}
private TTSResult _infer(List<String> textList, Style style, int totalStep, OrtEnvironment env)
private TTSResult _infer(List<String> textList, Style style, int totalStep, float speed, OrtEnvironment env)
throws OrtException {
int bsz = textList.size();
@@ -182,6 +182,11 @@ class TextToSpeech {
duration = (float[]) dpValue;
}
// Apply speed factor to duration
for (int i = 0; i < duration.length; i++) {
duration[i] /= speed;
}
// Encode text
Map<String, OnnxTensor> textEncInputs = new HashMap<>();
textEncInputs.put("text_ids", textIdsTensor);
@@ -301,7 +306,7 @@ class TextToSpeech {
/**
* Synthesize speech from a single text with automatic chunking
*/
public TTSResult call(String text, Style style, int totalStep, float silenceDuration, OrtEnvironment env)
public TTSResult call(String text, Style style, int totalStep, float speed, float silenceDuration, OrtEnvironment env)
throws OrtException {
List<String> chunks = Helper.chunkText(text, 0);
@@ -309,7 +314,7 @@ class TextToSpeech {
float durCat = 0.0f;
for (int i = 0; i < chunks.size(); i++) {
TTSResult result = _infer(Arrays.asList(chunks.get(i)), style, totalStep, env);
TTSResult result = _infer(Arrays.asList(chunks.get(i)), style, totalStep, speed, env);
float dur = result.duration[0];
int wavLen = (int) (sampleRate * dur);
@@ -344,9 +349,9 @@ class TextToSpeech {
/**
* Batch synthesize speech from multiple texts
*/
public TTSResult batch(List<String> textList, Style style, int totalStep, OrtEnvironment env)
public TTSResult batch(List<String> textList, Style style, int totalStep, float speed, OrtEnvironment env)
throws OrtException {
return _infer(textList, style, totalStep, env);
return _infer(textList, style, totalStep, speed, env);
}
public void close() throws OrtException {
+2
View File
@@ -4,6 +4,8 @@ This guide provides examples for running TTS inference using `ExampleONNX.java`.
## 📰 Update News
**2025.11.19** - Added `--speed` parameter to control speech synthesis speed (default: 1.05, recommended range: 0.9-1.5).
**2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
## Installation
+2
View File
@@ -4,6 +4,8 @@ Node.js implementation for TTS inference. Uses ONNX Runtime to generate speech f
## 📰 Update News
**2025.11.19** - Added `--speed` parameter to control speech synthesis speed (default: 1.05, recommended range: 0.9-1.5).
**2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
## Requirements
+6 -2
View File
@@ -15,6 +15,7 @@ function parseArgs() {
useGpu: false,
onnxDir: 'assets/onnx',
totalStep: 5,
speed: 1.05,
nTest: 4,
voiceStyle: ['assets/voice_styles/M1.json'],
text: ['This morning, I took a walk in the park, and the sound of the birds and the breeze was so pleasant that I stopped for a long time just to listen.'],
@@ -32,6 +33,8 @@ function parseArgs() {
args.onnxDir = process.argv[++i];
} else if (arg === '--total-step' && i + 1 < process.argv.length) {
args.totalStep = parseInt(process.argv[++i]);
} else if (arg === '--speed' && i + 1 < process.argv.length) {
args.speed = parseFloat(process.argv[++i]);
} else if (arg === '--n-test' && i + 1 < process.argv.length) {
args.nTest = parseInt(process.argv[++i]);
} else if (arg === '--voice-style' && i + 1 < process.argv.length) {
@@ -55,6 +58,7 @@ async function main() {
// --- 1. Parse arguments --- //
const args = parseArgs();
const totalStep = args.totalStep;
const speed = args.speed;
const nTest = args.nTest;
const saveDir = args.saveDir;
const voiceStylePaths = args.voiceStyle.map(p => path.resolve(__dirname, p));
@@ -79,9 +83,9 @@ async function main() {
const { wav, duration } = await timer('Generating speech from text', async () => {
if (batch) {
return await textToSpeech.batch(textList, style, totalStep);
return await textToSpeech.batch(textList, style, totalStep, speed);
} else {
return await textToSpeech.call(textList[0], style, totalStep);
return await textToSpeech.call(textList[0], style, totalStep, speed);
}
});
+10 -5
View File
@@ -114,7 +114,7 @@ class TextToSpeech {
return { noisyLatent, latentMask };
}
async _infer(textList, style, totalStep) {
async _infer(textList, style, totalStep, speed = 1.05) {
if (textList.length !== style.ttl.dims[0]) {
throw new Error('Number of texts must match number of style vectors');
}
@@ -133,6 +133,11 @@ class TextToSpeech {
const durOnnx = Array.from(dpResult.duration.data);
// Apply speed factor to duration
for (let i = 0; i < durOnnx.length; i++) {
durOnnx[i] /= speed;
}
const textEncResult = await this.textEncOrt.run({
text_ids: intArrayToTensor(textIds, textIdsShape),
style_ttl: style.ttl,
@@ -185,7 +190,7 @@ class TextToSpeech {
return { wav, duration: durOnnx };
}
async call(text, style, totalStep, silenceDuration = 0.3) {
async call(text, style, totalStep, speed = 1.05, silenceDuration = 0.3) {
if (style.ttl.dims[0] !== 1) {
throw new Error('Single speaker text to speech only supports single style');
}
@@ -194,7 +199,7 @@ class TextToSpeech {
let durCat = 0;
for (const chunk of textList) {
const { wav, duration } = await this._infer([chunk], style, totalStep);
const { wav, duration } = await this._infer([chunk], style, totalStep, speed);
if (wavCat === null) {
wavCat = wav;
@@ -210,8 +215,8 @@ class TextToSpeech {
return { wav: wavCat, duration: [durCat] };
}
async batch(textList, style, totalStep) {
return await this._infer(textList, style, totalStep);
async batch(textList, style, totalStep, speed = 1.05) {
return await this._infer(textList, style, totalStep, speed);
}
}
+25
View File
@@ -4,6 +4,8 @@ This guide provides examples for running TTS inference using `example_onnx.py`.
## 📰 Update News
**2025.11.19** - Added `--speed` parameter to control speech synthesis speed. Adjust the speed factor to make speech faster or slower while maintaining natural quality.
**2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
## Installation
@@ -85,6 +87,28 @@ This will:
**Note**: When using batch mode (`--batch`), automatic text chunking is disabled. Use non-batch mode for long-form text synthesis.
### Example 5: Adjusting Speech Speed
Control the speed of speech synthesis:
```bash
# Faster speech (speed > 1.0)
uv run example_onnx.py \
--voice-style assets/voice_styles/F2.json \
--text "This text will be synthesized at a faster pace." \
--speed 1.2
# Slower speech (speed < 1.0)
uv run example_onnx.py \
--voice-style assets/voice_styles/M2.json \
--text "This text will be synthesized at a slower, more deliberate pace." \
--speed 0.9
```
This will:
- Use `--speed 1.2` to generate faster speech
- Use `--speed 0.9` to generate slower speech

Notes:
- The default speed is 1.05 when `--speed` is not specified
- The recommended speed range is 0.9 to 1.5 for natural-sounding results
## Available Arguments
| Argument | Type | Default | Description |
@@ -92,6 +116,7 @@ This will:
| `--use-gpu` | flag | False | Use GPU for inference (with CPU fallback) |
| `--onnx-dir` | str | `assets/onnx` | Path to ONNX model directory |
| `--total-step` | int | 5 | Number of denoising steps (higher = better quality, slower) |
| `--speed` | float | 1.05 | Speech speed factor (higher = faster, lower = slower) |
| `--n-test` | int | 4 | Number of times to generate each sample |
| `--voice-style` | str+ | `assets/voice_styles/M1.json` | Voice style file path(s) |
| `--text` | str+ | (long default text) | Text(s) to synthesize |
+9 -2
View File
@@ -26,6 +26,12 @@ def parse_args():
parser.add_argument(
"--total-step", type=int, default=5, help="Number of denoising steps"
)
parser.add_argument(
"--speed",
type=float,
default=1.05,
help="Speech speed (default: 1.05, higher = faster)",
)
parser.add_argument(
"--n-test", type=int, default=4, help="Number of times to generate"
)
@@ -62,6 +68,7 @@ print("=== TTS Inference with ONNX Runtime (Python) ===\n")
# --- 1. Parse arguments --- #
args = parse_args()
total_step = args.total_step
speed = args.speed
n_test = args.n_test
save_dir = args.save_dir
voice_style_paths = args.voice_style
@@ -84,9 +91,9 @@ for n in range(n_test):
print(f"\n[{n+1}/{n_test}] Starting synthesis...")
with timer("Generating speech from text"):
if batch:
wav, duration = text_to_speech.batch(text_list, style, total_step)
wav, duration = text_to_speech.batch(text_list, style, total_step, speed)
else:
wav, duration = text_to_speech(text_list[0], style, total_step)
wav, duration = text_to_speech(text_list[0], style, total_step, speed)
if not os.path.exists(save_dir):
os.makedirs(save_dir)
for b in range(bsz):
+11 -5
View File
@@ -86,7 +86,7 @@ class TextToSpeech:
return noisy_latent, latent_mask
def _infer(
self, text_list: list[str], style: Style, total_step: int
self, text_list: list[str], style: Style, total_step: int, speed: float = 1.05
) -> tuple[np.ndarray, np.ndarray]:
assert (
len(text_list) == style.ttl.shape[0]
@@ -96,6 +96,7 @@ class TextToSpeech:
dur_onnx, *_ = self.dp_ort.run(
None, {"text_ids": text_ids, "style_dp": style.dp, "text_mask": text_mask}
)
dur_onnx = dur_onnx / speed
text_emb_onnx, *_ = self.text_enc_ort.run(
None,
{"text_ids": text_ids, "style_ttl": style.ttl, "text_mask": text_mask},
@@ -120,7 +121,12 @@ class TextToSpeech:
return wav, dur_onnx
def __call__(
self, text: str, style: Style, total_step: int, silence_duration: float = 0.3
self,
text: str,
style: Style,
total_step: int,
speed: float = 1.05,
silence_duration: float = 0.3,
) -> tuple[np.ndarray, np.ndarray]:
assert (
style.ttl.shape[0] == 1
@@ -129,7 +135,7 @@ class TextToSpeech:
wav_cat = None
dur_cat = None
for text in text_list:
wav, dur_onnx = self._infer([text], style, total_step)
wav, dur_onnx = self._infer([text], style, total_step, speed)
if wav_cat is None:
wav_cat = wav
dur_cat = dur_onnx
@@ -142,9 +148,9 @@ class TextToSpeech:
return wav_cat, dur_cat
def batch(
self, text_list: list[str], style: Style, total_step: int
self, text_list: list[str], style: Style, total_step: int, speed: float = 1.05
) -> tuple[np.ndarray, np.ndarray]:
return self._infer(text_list, style, total_step)
return self._infer(text_list, style, total_step, speed)
def length_to_mask(lengths: np.ndarray, max_len: Optional[int] = None) -> np.ndarray:
+2
View File
@@ -4,6 +4,8 @@ This guide provides examples for running TTS inference using Rust.
## 📰 Update News
**2025.11.19** - Added `--speed` parameter to control speech synthesis speed (default: 1.05, recommended range: 0.9-1.5).
**2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
## Installation
+7 -2
View File
@@ -26,6 +26,10 @@ struct Args {
#[arg(long, default_value = "5")]
total_step: usize,
/// Speech speed factor (higher = faster)
#[arg(long, default_value = "1.05")]
speed: f32,
/// Number of times to generate
#[arg(long, default_value = "4")]
n_test: usize,
@@ -53,6 +57,7 @@ fn main() -> Result<()> {
// --- 1. Parse arguments --- //
let args = Args::parse();
let total_step = args.total_step;
let speed = args.speed;
let n_test = args.n_test;
let voice_style_paths = &args.voice_style;
let text_list = &args.text;
@@ -85,11 +90,11 @@ fn main() -> Result<()> {
let (wav, duration) = if batch {
timer("Generating speech from text", || {
text_to_speech.batch(text_list, &style, total_step)
text_to_speech.batch(text_list, &style, total_step, speed)
})?
} else {
let (w, d) = timer("Generating speech from text", || {
text_to_speech.call(&text_list[0], &style, total_step, 0.3)
text_to_speech.call(&text_list[0], &style, total_step, speed, 0.3)
})?;
(w, vec![d])
};
+11 -3
View File
@@ -484,6 +484,7 @@ impl TextToSpeech {
text_list: &[String],
style: &Style,
total_step: usize,
speed: f32,
) -> Result<(Vec<f32>, Vec<f32>)> {
let bsz = text_list.len();
@@ -511,7 +512,12 @@ impl TextToSpeech {
})?;
let (_, duration_data) = dp_outputs["duration"].try_extract_tensor::<f32>()?;
let duration: Vec<f32> = duration_data.to_vec();
let mut duration: Vec<f32> = duration_data.to_vec();
// Apply speed factor to duration
for dur in duration.iter_mut() {
*dur /= speed;
}
// Encode text
let style_ttl_value = Value::from_array(style.ttl.clone())?;
@@ -584,6 +590,7 @@ impl TextToSpeech {
text: &str,
style: &Style,
total_step: usize,
speed: f32,
silence_duration: f32,
) -> Result<(Vec<f32>, f32)> {
let chunks = chunk_text(text, None);
@@ -592,7 +599,7 @@ impl TextToSpeech {
let mut dur_cat: f32 = 0.0;
for (i, chunk) in chunks.iter().enumerate() {
let (wav, duration) = self._infer(&[chunk.clone()], style, total_step)?;
let (wav, duration) = self._infer(&[chunk.clone()], style, total_step, speed)?;
let dur = duration[0];
let wav_len = (self.sample_rate as f32 * dur) as usize;
@@ -619,8 +626,9 @@ impl TextToSpeech {
text_list: &[String],
style: &Style,
total_step: usize,
speed: f32,
) -> Result<(Vec<f32>, Vec<f32>)> {
self._infer(text_list, style, total_step)
self._infer(text_list, style, total_step, speed)
}
}
+2
View File
@@ -4,6 +4,8 @@ This guide provides examples for running TTS inference using `example_onnx`.
## 📰 Update News
**2025.11.19** - Added `--speed` parameter to control speech synthesis speed (default: 1.05, recommended range: 0.9-1.5).
**2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
## Installation
+8 -2
View File
@@ -5,6 +5,7 @@ struct Args {
var useGpu: Bool = false
var onnxDir: String = "assets/onnx"
var totalStep: Int = 5
var speed: Float = 1.05
var nTest: Int = 4
var voiceStyle: [String] = ["assets/voice_styles/M1.json"]
var text: [String] = ["This morning, I took a walk in the park, and the sound of the birds and the breeze was so pleasant that I stopped for a long time just to listen."]
@@ -33,6 +34,11 @@ func parseArgs() -> Args {
args.totalStep = Int(arguments[i + 1]) ?? 5
i += 1
}
case "--speed":
if i + 1 < arguments.count {
args.speed = Float(arguments[i + 1]) ?? 1.05
i += 1
}
case "--n-test":
if i + 1 < arguments.count {
args.nTest = Int(arguments[i + 1]) ?? 4
@@ -102,13 +108,13 @@ struct ExampleONNX {
if args.batch {
let result = try timer("Generating speech from text") {
try textToSpeech.batch(args.text, style, args.totalStep)
try textToSpeech.batch(args.text, style, args.totalStep, speed: args.speed)
}
wav = result.wav
duration = result.duration
} else {
let result = try timer("Generating speech from text") {
try textToSpeech.call(args.text[0], style, args.totalStep, silenceDuration: 0.3)
try textToSpeech.call(args.text[0], style, args.totalStep, speed: args.speed, silenceDuration: 0.3)
}
wav = result.wav
duration = [result.duration]
+11 -6
View File
@@ -453,7 +453,7 @@ class TextToSpeech {
self.sampleRate = cfgs.ae.sample_rate
}
private func _infer(_ textList: [String], _ style: Style, _ totalStep: Int) throws -> (wav: [Float], duration: [Float]) {
private func _infer(_ textList: [String], _ style: Style, _ totalStep: Int, speed: Float = 1.05) throws -> (wav: [Float], duration: [Float]) {
let bsz = textList.count
// Process text
@@ -479,10 +479,15 @@ class TextToSpeech {
runOptions: nil)
let durationData = try dpOutputs["duration"]!.tensorData() as Data
let duration = durationData.withUnsafeBytes { ptr in
var duration = durationData.withUnsafeBytes { ptr in
Array(ptr.bindMemory(to: Float.self))
}
// Apply speed factor to duration
for i in 0..<duration.count {
duration[i] /= speed
}
// Encode text
let textEncOutputs = try textEncOrt.run(withInputs: ["text_ids": textIdsValue, "style_ttl": style.ttl, "text_mask": textMaskValue],
outputNames: ["text_emb"],
@@ -576,14 +581,14 @@ class TextToSpeech {
return (wav, duration)
}
func call(_ text: String, _ style: Style, _ totalStep: Int, silenceDuration: Float) throws -> (wav: [Float], duration: Float) {
func call(_ text: String, _ style: Style, _ totalStep: Int, speed: Float = 1.05, silenceDuration: Float = 0.3) throws -> (wav: [Float], duration: Float) {
let chunks = chunkText(text)
var wavCat = [Float]()
var durCat: Float = 0.0
for (i, chunk) in chunks.enumerated() {
let result = try _infer([chunk], style, totalStep)
let result = try _infer([chunk], style, totalStep, speed: speed)
let dur = result.duration[0]
let wavLen = Int(Float(sampleRate) * dur)
@@ -605,8 +610,8 @@ class TextToSpeech {
return (wavCat, durCat)
}
func batch(_ textList: [String], _ style: Style, _ totalStep: Int) throws -> (wav: [Float], duration: [Float]) {
return try _infer(textList, style, totalStep)
func batch(_ textList: [String], _ style: Style, _ totalStep: Int, speed: Float = 1.05) throws -> (wav: [Float], duration: [Float]) {
return try _infer(textList, style, totalStep, speed: speed)
}
}
+41
View File
@@ -108,6 +108,37 @@ NC='\033[0m' # No Color
declare -a PASSED=()
declare -a FAILED=()
# Print summary statistics for the .wav files produced by one test run.
#
# Arguments:
#   $1 - label used as the "[name]" prefix on every output line
#   $2 - directory searched (recursively) for generated .wav files
#
# Prints the number of files, their total size in MB and the average size
# in KB. Prints nothing when the directory is missing or contains no .wav
# files. Always returns 0 so a statistics hiccup never fails a test run.
show_stats() {
    local name=$1
    local results_dir=$2

    [ -d "$results_dir" ] || return 0

    local file_count=0
    local total_size=0
    local file size

    # Single pass over the tree: count and sum together. NUL-delimited so
    # paths containing newlines or unusual whitespace are handled correctly.
    while IFS= read -r -d '' file; do
        # BSD/macOS stat understands -f%z, GNU stat understands -c%s; try both.
        size=$(stat -f%z "$file" 2>/dev/null || stat -c%s "$file" 2>/dev/null) || size=0
        # An empty or non-numeric value would be a syntax error in $(( )).
        [[ "$size" =~ ^[0-9]+$ ]] || size=0
        total_size=$((total_size + size))
        file_count=$((file_count + 1))
    done < <(find "$results_dir" -name "*.wav" -type f -print0 2>/dev/null)

    [ "$file_count" -gt 0 ] || return 0

    # awk instead of bc: it rounds once at the end (bc with scale=2 truncates
    # after every division) and always prints a leading zero.
    local total_size_mb avg_size_kb
    total_size_mb=$(awk -v bytes="$total_size" 'BEGIN { printf "%.2f", bytes / 1024 / 1024 }')
    avg_size_kb=$(awk -v bytes="$total_size" -v count="$file_count" 'BEGIN { printf "%.2f", bytes / count / 1024 }')

    echo -e "${BLUE}[$name]${NC} 📊 Statistics:"
    echo -e "${BLUE}[$name]${NC} - Files generated: $file_count"
    echo -e "${BLUE}[$name]${NC} - Total size: ${total_size_mb} MB"
    echo -e "${BLUE}[$name]${NC} - Average file size: ${avg_size_kb} KB"
    return 0
}
# Helper function to run tests
run_test() {
local name=$1
@@ -118,9 +149,19 @@ run_test() {
echo -e "${BLUE}[$name]${NC} Running inference..."
cd "$SCRIPT_DIR/$dir"
# Determine results directory based on the directory
local results_dir="$SCRIPT_DIR/$dir/results"
if [[ "$dir" == "cpp/build" ]]; then
results_dir="$SCRIPT_DIR/cpp/build/results"
fi
# Run command and prefix each output line with the language name
if eval "$cmd" 2>&1 | sed "s/^/[$name] /"; then
echo -e "${GREEN}[$name]${NC} ✓ Success"
# Show statistics
show_stats "$name" "$results_dir"
PASSED+=("$name")
else
echo -e "${RED}[$name]${NC} ✗ Failed"
+2
View File
@@ -4,6 +4,8 @@ This example demonstrates how to use Supertonic in a web browser using ONNX Runt
## 📰 Update News
**2025.11.19** - Added speed control slider to adjust speech synthesis speed (default: 1.05, recommended range: 0.9-1.5).
**2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
## Features
+10 -5
View File
@@ -72,7 +72,7 @@ export class TextToSpeech {
this.sampleRate = cfgs.ae.sample_rate;
}
async _infer(textList, style, totalStep, progressCallback = null) {
async _infer(textList, style, totalStep, speed = 1.05, progressCallback = null) {
const bsz = textList.length;
// Process text
@@ -94,6 +94,11 @@ export class TextToSpeech {
});
const duration = Array.from(dpOutputs.duration.data);
// Apply speed factor to duration
for (let i = 0; i < duration.length; i++) {
duration[i] /= speed;
}
// Encode text
const textEncOutputs = await this.textEncOrt.run({
text_ids: textIdsTensor,
@@ -176,7 +181,7 @@ export class TextToSpeech {
return { wav, duration };
}
async call(text, style, totalStep, silenceDuration = 0.3, progressCallback = null) {
async call(text, style, totalStep, speed = 1.05, silenceDuration = 0.3, progressCallback = null) {
if (style.ttl.dims[0] !== 1) {
throw new Error('Single speaker text to speech only supports single style');
}
@@ -185,7 +190,7 @@ export class TextToSpeech {
let durCat = 0;
for (const chunk of textList) {
const { wav, duration } = await this._infer([chunk], style, totalStep, progressCallback);
const { wav, duration } = await this._infer([chunk], style, totalStep, speed, progressCallback);
if (wavCat.length === 0) {
wavCat = wav;
@@ -201,8 +206,8 @@ export class TextToSpeech {
return { wav: wavCat, duration: [durCat] };
}
async batch(textList, style, totalStep, progressCallback = null) {
return await this._infer(textList, style, totalStep, progressCallback);
async batch(textList, style, totalStep, speed = 1.05, progressCallback = null) {
return await this._infer(textList, style, totalStep, speed, progressCallback);
}
sampleNoisyLatent(duration, sampleRate, baseChunkSize, chunkCompress, latentDim) {
+6
View File
@@ -48,6 +48,12 @@
<input type="number" id="totalStep" value="5"
min="1" max="50">
</div>
<div class="section">
<label for="speed">Speed (0.9-1.5 recommended):</label>
<input type="number" id="speed" value="1.05"
min="0.5" max="2.0" step="0.05">
</div>
</div>
+3
View File
@@ -25,6 +25,7 @@ const textInput = document.getElementById('text');
const voiceStyleSelect = document.getElementById('voiceStyleSelect');
const voiceStyleInfo = document.getElementById('voiceStyleInfo');
const totalStepInput = document.getElementById('totalStep');
const speedInput = document.getElementById('speed');
const generateBtn = document.getElementById('generateBtn');
const statusBox = document.getElementById('statusBox');
const statusText = document.getElementById('statusText');
@@ -186,6 +187,7 @@ async function generateSpeech() {
`;
const totalStep = parseInt(totalStepInput.value);
const speed = parseFloat(speedInput.value);
showStatus('️ <strong>Generating speech from text...</strong>');
const tic = Date.now();
@@ -194,6 +196,7 @@ async function generateSpeech() {
text,
currentStyle,
totalStep,
speed,
0.3,
(step, total) => {
showStatus(`️ <strong>Denoising (${step}/${total})...</strong>`);