mirror of
https://github.com/supertone-inc/supertonic.git
synced 2026-06-02 01:38:48 +02:00
add speed parameter
This commit is contained in:
@@ -4,6 +4,8 @@ High-performance text-to-speech inference using ONNX Runtime.
|
||||
|
||||
## 📰 Update News
|
||||
|
||||
**2025.11.19** - Added `--speed` parameter to control speech synthesis speed (default: 1.05, recommended range: 0.9-1.5).
|
||||
|
||||
**2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
|
||||
|
||||
## Requirements
|
||||
|
||||
@@ -10,6 +10,7 @@ namespace fs = std::filesystem;
|
||||
struct Args {
|
||||
std::string onnx_dir = "../assets/onnx";
|
||||
int total_step = 5;
|
||||
float speed = 1.05f;
|
||||
int n_test = 4;
|
||||
std::vector<std::string> voice_style = {"../assets/voice_styles/M1.json"};
|
||||
std::vector<std::string> text = {
|
||||
@@ -36,6 +37,7 @@ Args parseArgs(int argc, char* argv[]) {
|
||||
std::string arg = argv[i];
|
||||
if (arg == "--onnx-dir" && i + 1 < argc) args.onnx_dir = argv[++i];
|
||||
else if (arg == "--total-step" && i + 1 < argc) args.total_step = std::stoi(argv[++i]);
|
||||
else if (arg == "--speed" && i + 1 < argc) args.speed = std::stof(argv[++i]);
|
||||
else if (arg == "--n-test" && i + 1 < argc) args.n_test = std::stoi(argv[++i]);
|
||||
else if (arg == "--voice-style" && i + 1 < argc) args.voice_style = splitString(argv[++i], ',');
|
||||
else if (arg == "--text" && i + 1 < argc) args.text = splitString(argv[++i], '|');
|
||||
@@ -51,6 +53,7 @@ int main(int argc, char* argv[]) {
|
||||
// --- 1. Parse arguments --- //
|
||||
Args args = parseArgs(argc, argv);
|
||||
int total_step = args.total_step;
|
||||
float speed = args.speed;
|
||||
int n_test = args.n_test;
|
||||
std::string save_dir = args.save_dir;
|
||||
std::vector<std::string> voice_style_paths = args.voice_style;
|
||||
@@ -84,9 +87,9 @@ int main(int argc, char* argv[]) {
|
||||
|
||||
auto result = timer("Generating speech from text", [&]() {
|
||||
if (batch) {
|
||||
return text_to_speech->batch(memory_info, text_list, style, total_step);
|
||||
return text_to_speech->batch(memory_info, text_list, style, total_step, speed);
|
||||
} else {
|
||||
return text_to_speech->call(memory_info, text_list[0], style, total_step);
|
||||
return text_to_speech->call(memory_info, text_list[0], style, total_step, speed);
|
||||
}
|
||||
});
|
||||
|
||||
|
||||
+12
-4
@@ -160,7 +160,8 @@ TextToSpeech::SynthesisResult TextToSpeech::_infer(
|
||||
Ort::MemoryInfo& memory_info,
|
||||
const std::vector<std::string>& text_list,
|
||||
const Style& style,
|
||||
int total_step
|
||||
int total_step,
|
||||
float speed
|
||||
) {
|
||||
int bsz = text_list.size();
|
||||
|
||||
@@ -213,6 +214,11 @@ TextToSpeech::SynthesisResult TextToSpeech::_infer(
|
||||
auto* dur_data = dp_outputs[0].GetTensorMutableData<float>();
|
||||
std::vector<float> duration(dur_data, dur_data + bsz);
|
||||
|
||||
// Apply speed factor to duration
|
||||
for (auto& dur : duration) {
|
||||
dur /= speed;
|
||||
}
|
||||
|
||||
// Create new tensors for text encoder (previous ones were moved)
|
||||
text_ids_tensor = intArrayToTensor(memory_info, text_ids, text_ids_shape);
|
||||
text_mask_tensor = arrayToTensor(memory_info, text_mask, text_mask_shape);
|
||||
@@ -370,6 +376,7 @@ TextToSpeech::SynthesisResult TextToSpeech::call(
|
||||
const std::string& text,
|
||||
const Style& style,
|
||||
int total_step,
|
||||
float speed,
|
||||
float silence_duration
|
||||
) {
|
||||
if (style.getTtlShape()[0] != 1) {
|
||||
@@ -381,7 +388,7 @@ TextToSpeech::SynthesisResult TextToSpeech::call(
|
||||
float dur_cat = 0.0f;
|
||||
|
||||
for (const auto& chunk : text_list) {
|
||||
auto result = _infer(memory_info, {chunk}, style, total_step);
|
||||
auto result = _infer(memory_info, {chunk}, style, total_step, speed);
|
||||
|
||||
if (wav_cat.empty()) {
|
||||
wav_cat = result.wav;
|
||||
@@ -406,9 +413,10 @@ TextToSpeech::SynthesisResult TextToSpeech::batch(
|
||||
Ort::MemoryInfo& memory_info,
|
||||
const std::vector<std::string>& text_list,
|
||||
const Style& style,
|
||||
int total_step
|
||||
int total_step,
|
||||
float speed
|
||||
) {
|
||||
return _infer(memory_info, text_list, style, total_step);
|
||||
return _infer(memory_info, text_list, style, total_step, speed);
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
|
||||
+5
-2
@@ -91,6 +91,7 @@ public:
|
||||
const std::string& text,
|
||||
const Style& style,
|
||||
int total_step,
|
||||
float speed = 1.05f,
|
||||
float silence_duration = 0.3f
|
||||
);
|
||||
|
||||
@@ -98,7 +99,8 @@ public:
|
||||
Ort::MemoryInfo& memory_info,
|
||||
const std::vector<std::string>& text_list,
|
||||
const Style& style,
|
||||
int total_step
|
||||
int total_step,
|
||||
float speed = 1.05f
|
||||
);
|
||||
|
||||
int getSampleRate() const { return sample_rate_; }
|
||||
@@ -108,7 +110,8 @@ private:
|
||||
Ort::MemoryInfo& memory_info,
|
||||
const std::vector<std::string>& text_list,
|
||||
const Style& style,
|
||||
int total_step
|
||||
int total_step,
|
||||
float speed = 1.05f
|
||||
);
|
||||
Config cfgs_;
|
||||
UnicodeProcessor* text_processor_;
|
||||
|
||||
@@ -12,6 +12,7 @@ namespace Supertonic
|
||||
public bool UseGpu { get; set; } = false;
|
||||
public string OnnxDir { get; set; } = "assets/onnx";
|
||||
public int TotalStep { get; set; } = 5;
|
||||
public float Speed { get; set; } = 1.05f;
|
||||
public int NTest { get; set; } = 4;
|
||||
public List<string> VoiceStyle { get; set; } = new List<string> { "assets/voice_styles/M1.json" };
|
||||
public List<string> Text { get; set; } = new List<string>
|
||||
@@ -42,6 +43,9 @@ namespace Supertonic
|
||||
case "--total-step" when i + 1 < args.Length:
|
||||
result.TotalStep = int.Parse(args[++i]);
|
||||
break;
|
||||
case "--speed" when i + 1 < args.Length:
|
||||
result.Speed = float.Parse(args[++i]);
|
||||
break;
|
||||
case "--n-test" when i + 1 < args.Length:
|
||||
result.NTest = int.Parse(args[++i]);
|
||||
break;
|
||||
@@ -67,6 +71,7 @@ namespace Supertonic
|
||||
// --- 1. Parse arguments --- //
|
||||
var parsedArgs = ParseArgs(args);
|
||||
int totalStep = parsedArgs.TotalStep;
|
||||
float speed = parsedArgs.Speed;
|
||||
int nTest = parsedArgs.NTest;
|
||||
string saveDir = parsedArgs.SaveDir;
|
||||
var voiceStylePaths = parsedArgs.VoiceStyle;
|
||||
@@ -96,11 +101,11 @@ namespace Supertonic
|
||||
{
|
||||
if (batch)
|
||||
{
|
||||
return textToSpeech.Batch(textList, style, totalStep);
|
||||
return textToSpeech.Batch(textList, style, totalStep, speed);
|
||||
}
|
||||
else
|
||||
{
|
||||
return textToSpeech.Call(textList[0], style, totalStep);
|
||||
return textToSpeech.Call(textList[0], style, totalStep, speed);
|
||||
}
|
||||
});
|
||||
|
||||
|
||||
+11
-5
@@ -194,7 +194,7 @@ namespace Supertonic
|
||||
return (noisyLatent, latentMask);
|
||||
}
|
||||
|
||||
private (float[] wav, float[] duration) _Infer(List<string> textList, Style style, int totalStep)
|
||||
private (float[] wav, float[] duration) _Infer(List<string> textList, Style style, int totalStep, float speed = 1.05f)
|
||||
{
|
||||
int bsz = textList.Count;
|
||||
if (bsz != style.TtlShape[0])
|
||||
@@ -222,6 +222,12 @@ namespace Supertonic
|
||||
};
|
||||
using var dpOutputs = _dpOrt.Run(dpInputs);
|
||||
var durOnnx = dpOutputs.First(o => o.Name == "duration").AsTensor<float>().ToArray();
|
||||
|
||||
// Apply speed factor to duration
|
||||
for (int i = 0; i < durOnnx.Length; i++)
|
||||
{
|
||||
durOnnx[i] /= speed;
|
||||
}
|
||||
|
||||
// Run text encoder
|
||||
var textEncInputs = new List<NamedOnnxValue>
|
||||
@@ -284,7 +290,7 @@ namespace Supertonic
|
||||
return (wavTensor.ToArray(), durOnnx);
|
||||
}
|
||||
|
||||
public (float[] wav, float[] duration) Call(string text, Style style, int totalStep, float silenceDuration = 0.3f)
|
||||
public (float[] wav, float[] duration) Call(string text, Style style, int totalStep, float speed = 1.05f, float silenceDuration = 0.3f)
|
||||
{
|
||||
if (style.TtlShape[0] != 1)
|
||||
{
|
||||
@@ -297,7 +303,7 @@ namespace Supertonic
|
||||
|
||||
foreach (var chunk in textList)
|
||||
{
|
||||
var (wav, duration) = _Infer(new List<string> { chunk }, style, totalStep);
|
||||
var (wav, duration) = _Infer(new List<string> { chunk }, style, totalStep, speed);
|
||||
|
||||
if (wavCat.Count == 0)
|
||||
{
|
||||
@@ -317,9 +323,9 @@ namespace Supertonic
|
||||
return (wavCat.ToArray(), new float[] { durCat });
|
||||
}
|
||||
|
||||
public (float[] wav, float[] duration) Batch(List<string> textList, Style style, int totalStep)
|
||||
public (float[] wav, float[] duration) Batch(List<string> textList, Style style, int totalStep, float speed = 1.05f)
|
||||
{
|
||||
return _Infer(textList, style, totalStep);
|
||||
return _Infer(textList, style, totalStep, speed);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -4,6 +4,8 @@ This guide provides examples for running TTS inference using `ExampleONNX.cs`.
|
||||
|
||||
## 📰 Update News
|
||||
|
||||
**2025.11.19** - Added `--speed` parameter to control speech synthesis speed (default: 1.05, recommended range: 0.9-1.5).
|
||||
|
||||
**2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
|
||||
|
||||
## Installation
|
||||
|
||||
@@ -4,6 +4,8 @@ This guide provides examples for running TTS inference using `example_onnx.go`.
|
||||
|
||||
## 📰 Update News
|
||||
|
||||
**2025.11.19** - Added `--speed` parameter to control speech synthesis speed (default: 1.05, recommended range: 0.9-1.5).
|
||||
|
||||
**2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
|
||||
|
||||
## Installation
|
||||
|
||||
+5
-2
@@ -15,6 +15,7 @@ type Args struct {
|
||||
useGPU bool
|
||||
onnxDir string
|
||||
totalStep int
|
||||
speed float64
|
||||
nTest int
|
||||
voiceStyle []string
|
||||
text []string
|
||||
@@ -28,6 +29,7 @@ func parseArgs() *Args {
|
||||
flag.BoolVar(&args.useGPU, "use-gpu", false, "Use GPU for inference (default: CPU)")
|
||||
flag.StringVar(&args.onnxDir, "onnx-dir", "assets/onnx", "Path to ONNX model directory")
|
||||
flag.IntVar(&args.totalStep, "total-step", 5, "Number of denoising steps")
|
||||
flag.Float64Var(&args.speed, "speed", 1.05, "Speech speed factor (higher = faster)")
|
||||
flag.IntVar(&args.nTest, "n-test", 4, "Number of times to generate")
|
||||
flag.StringVar(&args.saveDir, "save-dir", "results", "Output directory")
|
||||
flag.BoolVar(&args.batch, "batch", false, "Enable batch mode (multiple text-style pairs)")
|
||||
@@ -63,6 +65,7 @@ func main() {
|
||||
// --- 1. Parse arguments --- //
|
||||
args := parseArgs()
|
||||
totalStep := args.totalStep
|
||||
speed := float32(args.speed)
|
||||
nTest := args.nTest
|
||||
saveDir := args.saveDir
|
||||
voiceStylePaths := args.voiceStyle
|
||||
@@ -123,7 +126,7 @@ func main() {
|
||||
|
||||
if batch {
|
||||
Timer("Generating speech from text", func() interface{} {
|
||||
w, d, err := textToSpeech.Batch(textList, style, totalStep)
|
||||
w, d, err := textToSpeech.Batch(textList, style, totalStep, speed)
|
||||
if err != nil {
|
||||
fmt.Printf("Error generating speech: %v\n", err)
|
||||
os.Exit(1)
|
||||
@@ -134,7 +137,7 @@ func main() {
|
||||
})
|
||||
} else {
|
||||
Timer("Generating speech from text", func() interface{} {
|
||||
w, d, err := textToSpeech.Call(textList[0], style, totalStep, 0.3)
|
||||
w, d, err := textToSpeech.Call(textList[0], style, totalStep, speed, 0.3)
|
||||
if err != nil {
|
||||
fmt.Printf("Error generating speech: %v\n", err)
|
||||
os.Exit(1)
|
||||
|
||||
+10
-5
@@ -572,7 +572,7 @@ func (tts *TextToSpeech) sampleNoisyLatent(durOnnx []float32) ([][][]float64, []
|
||||
return noisyLatent, latentMask
|
||||
}
|
||||
|
||||
func (tts *TextToSpeech) _infer(textList []string, style *Style, totalStep int) ([]float32, []float32, error) {
|
||||
func (tts *TextToSpeech) _infer(textList []string, style *Style, totalStep int, speed float32) ([]float32, []float32, error) {
|
||||
bsz := len(textList)
|
||||
|
||||
// Process text
|
||||
@@ -597,6 +597,11 @@ func (tts *TextToSpeech) _infer(textList []string, style *Style, totalStep int)
|
||||
durTensor := dpOutputs[0].(*ort.Tensor[float32])
|
||||
defer durTensor.Destroy()
|
||||
durOnnx := durTensor.GetData()
|
||||
|
||||
// Apply speed factor to duration
|
||||
for i := range durOnnx {
|
||||
durOnnx[i] /= speed
|
||||
}
|
||||
|
||||
// Encode text
|
||||
textIDsTensor2 := IntArrayToTensor(textIDs, textIDsShape)
|
||||
@@ -691,14 +696,14 @@ func (tts *TextToSpeech) _infer(textList []string, style *Style, totalStep int)
|
||||
}
|
||||
|
||||
// Call synthesizes speech from a single text with automatic chunking
|
||||
func (tts *TextToSpeech) Call(text string, style *Style, totalStep int, silenceDuration float32) ([]float32, float32, error) {
|
||||
func (tts *TextToSpeech) Call(text string, style *Style, totalStep int, speed float32, silenceDuration float32) ([]float32, float32, error) {
|
||||
chunks := chunkText(text, 0)
|
||||
|
||||
var wavCat []float32
|
||||
var durCat float32
|
||||
|
||||
for i, chunk := range chunks {
|
||||
wav, duration, err := tts._infer([]string{chunk}, style, totalStep)
|
||||
wav, duration, err := tts._infer([]string{chunk}, style, totalStep, speed)
|
||||
if err != nil {
|
||||
return nil, 0, err
|
||||
}
|
||||
@@ -724,8 +729,8 @@ func (tts *TextToSpeech) Call(text string, style *Style, totalStep int, silenceD
|
||||
}
|
||||
|
||||
// Batch synthesizes speech from multiple texts
|
||||
func (tts *TextToSpeech) Batch(textList []string, style *Style, totalStep int) ([]float32, []float32, error) {
|
||||
return tts._infer(textList, style, totalStep)
|
||||
func (tts *TextToSpeech) Batch(textList []string, style *Style, totalStep int, speed float32) ([]float32, []float32, error) {
|
||||
return tts._infer(textList, style, totalStep, speed)
|
||||
}
|
||||
|
||||
func (tts *TextToSpeech) Destroy() {
|
||||
|
||||
@@ -15,6 +15,7 @@ public class ExampleONNX {
|
||||
boolean useGpu = false;
|
||||
String onnxDir = "assets/onnx";
|
||||
int totalStep = 5;
|
||||
float speed = 1.05f;
|
||||
int nTest = 4;
|
||||
List<String> voiceStyle = Arrays.asList("assets/voice_styles/M1.json");
|
||||
List<String> text = Arrays.asList(
|
||||
@@ -41,6 +42,9 @@ public class ExampleONNX {
|
||||
case "--total-step":
|
||||
if (i + 1 < args.length) result.totalStep = Integer.parseInt(args[++i]);
|
||||
break;
|
||||
case "--speed":
|
||||
if (i + 1 < args.length) result.speed = Float.parseFloat(args[++i]);
|
||||
break;
|
||||
case "--n-test":
|
||||
if (i + 1 < args.length) result.nTest = Integer.parseInt(args[++i]);
|
||||
break;
|
||||
@@ -76,6 +80,7 @@ public class ExampleONNX {
|
||||
// --- 1. Parse arguments --- //
|
||||
Args parsedArgs = parseArgs(args);
|
||||
int totalStep = parsedArgs.totalStep;
|
||||
float speed = parsedArgs.speed;
|
||||
int nTest = parsedArgs.nTest;
|
||||
String saveDir = parsedArgs.saveDir;
|
||||
List<String> voiceStylePaths = parsedArgs.voiceStyle;
|
||||
@@ -111,7 +116,7 @@ public class ExampleONNX {
|
||||
if (batch) {
|
||||
ttsResult = Helper.timer("Generating speech from text", () -> {
|
||||
try {
|
||||
return textToSpeech.batch(textList, style, totalStep, env);
|
||||
return textToSpeech.batch(textList, style, totalStep, speed, env);
|
||||
} catch (Exception e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
@@ -119,7 +124,7 @@ public class ExampleONNX {
|
||||
} else {
|
||||
ttsResult = Helper.timer("Generating speech from text", () -> {
|
||||
try {
|
||||
return textToSpeech.call(textList.get(0), style, totalStep, 0.3f, env);
|
||||
return textToSpeech.call(textList.get(0), style, totalStep, speed, 0.3f, env);
|
||||
} catch (Exception e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
|
||||
+10
-5
@@ -154,7 +154,7 @@ class TextToSpeech {
|
||||
this.ldim = config.ttl.latentDim;
|
||||
}
|
||||
|
||||
private TTSResult _infer(List<String> textList, Style style, int totalStep, OrtEnvironment env)
|
||||
private TTSResult _infer(List<String> textList, Style style, int totalStep, float speed, OrtEnvironment env)
|
||||
throws OrtException {
|
||||
int bsz = textList.size();
|
||||
|
||||
@@ -182,6 +182,11 @@ class TextToSpeech {
|
||||
duration = (float[]) dpValue;
|
||||
}
|
||||
|
||||
// Apply speed factor to duration
|
||||
for (int i = 0; i < duration.length; i++) {
|
||||
duration[i] /= speed;
|
||||
}
|
||||
|
||||
// Encode text
|
||||
Map<String, OnnxTensor> textEncInputs = new HashMap<>();
|
||||
textEncInputs.put("text_ids", textIdsTensor);
|
||||
@@ -301,7 +306,7 @@ class TextToSpeech {
|
||||
/**
|
||||
* Synthesize speech from a single text with automatic chunking
|
||||
*/
|
||||
public TTSResult call(String text, Style style, int totalStep, float silenceDuration, OrtEnvironment env)
|
||||
public TTSResult call(String text, Style style, int totalStep, float speed, float silenceDuration, OrtEnvironment env)
|
||||
throws OrtException {
|
||||
List<String> chunks = Helper.chunkText(text, 0);
|
||||
|
||||
@@ -309,7 +314,7 @@ class TextToSpeech {
|
||||
float durCat = 0.0f;
|
||||
|
||||
for (int i = 0; i < chunks.size(); i++) {
|
||||
TTSResult result = _infer(Arrays.asList(chunks.get(i)), style, totalStep, env);
|
||||
TTSResult result = _infer(Arrays.asList(chunks.get(i)), style, totalStep, speed, env);
|
||||
|
||||
float dur = result.duration[0];
|
||||
int wavLen = (int) (sampleRate * dur);
|
||||
@@ -344,9 +349,9 @@ class TextToSpeech {
|
||||
/**
|
||||
* Batch synthesize speech from multiple texts
|
||||
*/
|
||||
public TTSResult batch(List<String> textList, Style style, int totalStep, OrtEnvironment env)
|
||||
public TTSResult batch(List<String> textList, Style style, int totalStep, float speed, OrtEnvironment env)
|
||||
throws OrtException {
|
||||
return _infer(textList, style, totalStep, env);
|
||||
return _infer(textList, style, totalStep, speed, env);
|
||||
}
|
||||
|
||||
public void close() throws OrtException {
|
||||
|
||||
@@ -4,6 +4,8 @@ This guide provides examples for running TTS inference using `ExampleONNX.java`.
|
||||
|
||||
## 📰 Update News
|
||||
|
||||
**2025.11.19** - Added `--speed` parameter to control speech synthesis speed (default: 1.05, recommended range: 0.9-1.5).
|
||||
|
||||
**2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
|
||||
|
||||
## Installation
|
||||
|
||||
@@ -4,6 +4,8 @@ Node.js implementation for TTS inference. Uses ONNX Runtime to generate speech f
|
||||
|
||||
## 📰 Update News
|
||||
|
||||
**2025.11.19** - Added `--speed` parameter to control speech synthesis speed (default: 1.05, recommended range: 0.9-1.5).
|
||||
|
||||
**2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
|
||||
|
||||
## Requirements
|
||||
|
||||
@@ -15,6 +15,7 @@ function parseArgs() {
|
||||
useGpu: false,
|
||||
onnxDir: 'assets/onnx',
|
||||
totalStep: 5,
|
||||
speed: 1.05,
|
||||
nTest: 4,
|
||||
voiceStyle: ['assets/voice_styles/M1.json'],
|
||||
text: ['This morning, I took a walk in the park, and the sound of the birds and the breeze was so pleasant that I stopped for a long time just to listen.'],
|
||||
@@ -32,6 +33,8 @@ function parseArgs() {
|
||||
args.onnxDir = process.argv[++i];
|
||||
} else if (arg === '--total-step' && i + 1 < process.argv.length) {
|
||||
args.totalStep = parseInt(process.argv[++i]);
|
||||
} else if (arg === '--speed' && i + 1 < process.argv.length) {
|
||||
args.speed = parseFloat(process.argv[++i]);
|
||||
} else if (arg === '--n-test' && i + 1 < process.argv.length) {
|
||||
args.nTest = parseInt(process.argv[++i]);
|
||||
} else if (arg === '--voice-style' && i + 1 < process.argv.length) {
|
||||
@@ -55,6 +58,7 @@ async function main() {
|
||||
// --- 1. Parse arguments --- //
|
||||
const args = parseArgs();
|
||||
const totalStep = args.totalStep;
|
||||
const speed = args.speed;
|
||||
const nTest = args.nTest;
|
||||
const saveDir = args.saveDir;
|
||||
const voiceStylePaths = args.voiceStyle.map(p => path.resolve(__dirname, p));
|
||||
@@ -79,9 +83,9 @@ async function main() {
|
||||
|
||||
const { wav, duration } = await timer('Generating speech from text', async () => {
|
||||
if (batch) {
|
||||
return await textToSpeech.batch(textList, style, totalStep);
|
||||
return await textToSpeech.batch(textList, style, totalStep, speed);
|
||||
} else {
|
||||
return await textToSpeech.call(textList[0], style, totalStep);
|
||||
return await textToSpeech.call(textList[0], style, totalStep, speed);
|
||||
}
|
||||
});
|
||||
|
||||
|
||||
+10
-5
@@ -114,7 +114,7 @@ class TextToSpeech {
|
||||
return { noisyLatent, latentMask };
|
||||
}
|
||||
|
||||
async _infer(textList, style, totalStep) {
|
||||
async _infer(textList, style, totalStep, speed = 1.05) {
|
||||
if (textList.length !== style.ttl.dims[0]) {
|
||||
throw new Error('Number of texts must match number of style vectors');
|
||||
}
|
||||
@@ -133,6 +133,11 @@ class TextToSpeech {
|
||||
|
||||
const durOnnx = Array.from(dpResult.duration.data);
|
||||
|
||||
// Apply speed factor to duration
|
||||
for (let i = 0; i < durOnnx.length; i++) {
|
||||
durOnnx[i] /= speed;
|
||||
}
|
||||
|
||||
const textEncResult = await this.textEncOrt.run({
|
||||
text_ids: intArrayToTensor(textIds, textIdsShape),
|
||||
style_ttl: style.ttl,
|
||||
@@ -185,7 +190,7 @@ class TextToSpeech {
|
||||
return { wav, duration: durOnnx };
|
||||
}
|
||||
|
||||
async call(text, style, totalStep, silenceDuration = 0.3) {
|
||||
async call(text, style, totalStep, speed = 1.05, silenceDuration = 0.3) {
|
||||
if (style.ttl.dims[0] !== 1) {
|
||||
throw new Error('Single speaker text to speech only supports single style');
|
||||
}
|
||||
@@ -194,7 +199,7 @@ class TextToSpeech {
|
||||
let durCat = 0;
|
||||
|
||||
for (const chunk of textList) {
|
||||
const { wav, duration } = await this._infer([chunk], style, totalStep);
|
||||
const { wav, duration } = await this._infer([chunk], style, totalStep, speed);
|
||||
|
||||
if (wavCat === null) {
|
||||
wavCat = wav;
|
||||
@@ -210,8 +215,8 @@ class TextToSpeech {
|
||||
return { wav: wavCat, duration: [durCat] };
|
||||
}
|
||||
|
||||
async batch(textList, style, totalStep) {
|
||||
return await this._infer(textList, style, totalStep);
|
||||
async batch(textList, style, totalStep, speed = 1.05) {
|
||||
return await this._infer(textList, style, totalStep, speed);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -4,6 +4,8 @@ This guide provides examples for running TTS inference using `example_onnx.py`.
|
||||
|
||||
## 📰 Update News
|
||||
|
||||
**2025.11.19** - Added `--speed` parameter to control speech synthesis speed. Adjust the speed factor to make speech faster or slower while maintaining natural quality.
|
||||
|
||||
**2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
|
||||
|
||||
## Installation
|
||||
@@ -85,6 +87,28 @@ This will:
|
||||
|
||||
**Note**: When using batch mode (`--batch`), automatic text chunking is disabled. Use non-batch mode for long-form text synthesis.
|
||||
|
||||
### Example 5: Adjusting Speech Speed
|
||||
Control the speed of speech synthesis:
|
||||
```bash
|
||||
# Faster speech (speed > 1.0)
|
||||
uv run example_onnx.py \
|
||||
--voice-style assets/voice_styles/F2.json \
|
||||
--text "This text will be synthesized at a faster pace." \
|
||||
--speed 1.2
|
||||
|
||||
# Slower speech (speed < 1.0)
|
||||
uv run example_onnx.py \
|
||||
--voice-style assets/voice_styles/M2.json \
|
||||
--text "This text will be synthesized at a slower, more deliberate pace." \
|
||||
--speed 0.9
|
||||
```
|
||||
|
||||
This will:
|
||||
- Use `--speed 1.2` to generate faster speech
|
||||
- Use `--speed 0.9` to generate slower speech
|
||||
- Default speed is 1.05 if not specified
|
||||
- Recommended speed range is between 0.9 and 1.5 for natural-sounding results
|
||||
|
||||
## Available Arguments
|
||||
|
||||
| Argument | Type | Default | Description |
|
||||
@@ -92,6 +116,7 @@ This will:
|
||||
| `--use-gpu` | flag | False | Use GPU for inference (with CPU fallback) |
|
||||
| `--onnx-dir` | str | `assets/onnx` | Path to ONNX model directory |
|
||||
| `--total-step` | int | 5 | Number of denoising steps (higher = better quality, slower) |
|
||||
| `--speed` | float | 1.05 | Speech speed factor (higher = faster, lower = slower) |
|
||||
| `--n-test` | int | 4 | Number of times to generate each sample |
|
||||
| `--voice-style` | str+ | `assets/voice_styles/M1.json` | Voice style file path(s) |
|
||||
| `--text` | str+ | (long default text) | Text(s) to synthesize |
|
||||
|
||||
+9
-2
@@ -26,6 +26,12 @@ def parse_args():
|
||||
parser.add_argument(
|
||||
"--total-step", type=int, default=5, help="Number of denoising steps"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--speed",
|
||||
type=float,
|
||||
default=1.05,
|
||||
help="Speech speed (default: 1.05, higher = faster)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--n-test", type=int, default=4, help="Number of times to generate"
|
||||
)
|
||||
@@ -62,6 +68,7 @@ print("=== TTS Inference with ONNX Runtime (Python) ===\n")
|
||||
# --- 1. Parse arguments --- #
|
||||
args = parse_args()
|
||||
total_step = args.total_step
|
||||
speed = args.speed
|
||||
n_test = args.n_test
|
||||
save_dir = args.save_dir
|
||||
voice_style_paths = args.voice_style
|
||||
@@ -84,9 +91,9 @@ for n in range(n_test):
|
||||
print(f"\n[{n+1}/{n_test}] Starting synthesis...")
|
||||
with timer("Generating speech from text"):
|
||||
if batch:
|
||||
wav, duration = text_to_speech.batch(text_list, style, total_step)
|
||||
wav, duration = text_to_speech.batch(text_list, style, total_step, speed)
|
||||
else:
|
||||
wav, duration = text_to_speech(text_list[0], style, total_step)
|
||||
wav, duration = text_to_speech(text_list[0], style, total_step, speed)
|
||||
if not os.path.exists(save_dir):
|
||||
os.makedirs(save_dir)
|
||||
for b in range(bsz):
|
||||
|
||||
+11
-5
@@ -86,7 +86,7 @@ class TextToSpeech:
|
||||
return noisy_latent, latent_mask
|
||||
|
||||
def _infer(
|
||||
self, text_list: list[str], style: Style, total_step: int
|
||||
self, text_list: list[str], style: Style, total_step: int, speed: float = 1.05
|
||||
) -> tuple[np.ndarray, np.ndarray]:
|
||||
assert (
|
||||
len(text_list) == style.ttl.shape[0]
|
||||
@@ -96,6 +96,7 @@ class TextToSpeech:
|
||||
dur_onnx, *_ = self.dp_ort.run(
|
||||
None, {"text_ids": text_ids, "style_dp": style.dp, "text_mask": text_mask}
|
||||
)
|
||||
dur_onnx = dur_onnx / speed
|
||||
text_emb_onnx, *_ = self.text_enc_ort.run(
|
||||
None,
|
||||
{"text_ids": text_ids, "style_ttl": style.ttl, "text_mask": text_mask},
|
||||
@@ -120,7 +121,12 @@ class TextToSpeech:
|
||||
return wav, dur_onnx
|
||||
|
||||
def __call__(
|
||||
self, text: str, style: Style, total_step: int, silence_duration: float = 0.3
|
||||
self,
|
||||
text: str,
|
||||
style: Style,
|
||||
total_step: int,
|
||||
speed: float = 1.05,
|
||||
silence_duration: float = 0.3,
|
||||
) -> tuple[np.ndarray, np.ndarray]:
|
||||
assert (
|
||||
style.ttl.shape[0] == 1
|
||||
@@ -129,7 +135,7 @@ class TextToSpeech:
|
||||
wav_cat = None
|
||||
dur_cat = None
|
||||
for text in text_list:
|
||||
wav, dur_onnx = self._infer([text], style, total_step)
|
||||
wav, dur_onnx = self._infer([text], style, total_step, speed)
|
||||
if wav_cat is None:
|
||||
wav_cat = wav
|
||||
dur_cat = dur_onnx
|
||||
@@ -142,9 +148,9 @@ class TextToSpeech:
|
||||
return wav_cat, dur_cat
|
||||
|
||||
def batch(
|
||||
self, text_list: list[str], style: Style, total_step: int
|
||||
self, text_list: list[str], style: Style, total_step: int, speed: float = 1.05
|
||||
) -> tuple[np.ndarray, np.ndarray]:
|
||||
return self._infer(text_list, style, total_step)
|
||||
return self._infer(text_list, style, total_step, speed)
|
||||
|
||||
|
||||
def length_to_mask(lengths: np.ndarray, max_len: Optional[int] = None) -> np.ndarray:
|
||||
|
||||
@@ -4,6 +4,8 @@ This guide provides examples for running TTS inference using Rust.
|
||||
|
||||
## 📰 Update News
|
||||
|
||||
**2025.11.19** - Added `--speed` parameter to control speech synthesis speed (default: 1.05, recommended range: 0.9-1.5).
|
||||
|
||||
**2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
|
||||
|
||||
## Installation
|
||||
|
||||
@@ -26,6 +26,10 @@ struct Args {
|
||||
#[arg(long, default_value = "5")]
|
||||
total_step: usize,
|
||||
|
||||
/// Speech speed factor (higher = faster)
|
||||
#[arg(long, default_value = "1.05")]
|
||||
speed: f32,
|
||||
|
||||
/// Number of times to generate
|
||||
#[arg(long, default_value = "4")]
|
||||
n_test: usize,
|
||||
@@ -53,6 +57,7 @@ fn main() -> Result<()> {
|
||||
// --- 1. Parse arguments --- //
|
||||
let args = Args::parse();
|
||||
let total_step = args.total_step;
|
||||
let speed = args.speed;
|
||||
let n_test = args.n_test;
|
||||
let voice_style_paths = &args.voice_style;
|
||||
let text_list = &args.text;
|
||||
@@ -85,11 +90,11 @@ fn main() -> Result<()> {
|
||||
|
||||
let (wav, duration) = if batch {
|
||||
timer("Generating speech from text", || {
|
||||
text_to_speech.batch(text_list, &style, total_step)
|
||||
text_to_speech.batch(text_list, &style, total_step, speed)
|
||||
})?
|
||||
} else {
|
||||
let (w, d) = timer("Generating speech from text", || {
|
||||
text_to_speech.call(&text_list[0], &style, total_step, 0.3)
|
||||
text_to_speech.call(&text_list[0], &style, total_step, speed, 0.3)
|
||||
})?;
|
||||
(w, vec![d])
|
||||
};
|
||||
|
||||
+11
-3
@@ -484,6 +484,7 @@ impl TextToSpeech {
|
||||
text_list: &[String],
|
||||
style: &Style,
|
||||
total_step: usize,
|
||||
speed: f32,
|
||||
) -> Result<(Vec<f32>, Vec<f32>)> {
|
||||
let bsz = text_list.len();
|
||||
|
||||
@@ -511,7 +512,12 @@ impl TextToSpeech {
|
||||
})?;
|
||||
|
||||
let (_, duration_data) = dp_outputs["duration"].try_extract_tensor::<f32>()?;
|
||||
let duration: Vec<f32> = duration_data.to_vec();
|
||||
let mut duration: Vec<f32> = duration_data.to_vec();
|
||||
|
||||
// Apply speed factor to duration
|
||||
for dur in duration.iter_mut() {
|
||||
*dur /= speed;
|
||||
}
|
||||
|
||||
// Encode text
|
||||
let style_ttl_value = Value::from_array(style.ttl.clone())?;
|
||||
@@ -584,6 +590,7 @@ impl TextToSpeech {
|
||||
text: &str,
|
||||
style: &Style,
|
||||
total_step: usize,
|
||||
speed: f32,
|
||||
silence_duration: f32,
|
||||
) -> Result<(Vec<f32>, f32)> {
|
||||
let chunks = chunk_text(text, None);
|
||||
@@ -592,7 +599,7 @@ impl TextToSpeech {
|
||||
let mut dur_cat: f32 = 0.0;
|
||||
|
||||
for (i, chunk) in chunks.iter().enumerate() {
|
||||
let (wav, duration) = self._infer(&[chunk.clone()], style, total_step)?;
|
||||
let (wav, duration) = self._infer(&[chunk.clone()], style, total_step, speed)?;
|
||||
|
||||
let dur = duration[0];
|
||||
let wav_len = (self.sample_rate as f32 * dur) as usize;
|
||||
@@ -619,8 +626,9 @@ impl TextToSpeech {
|
||||
text_list: &[String],
|
||||
style: &Style,
|
||||
total_step: usize,
|
||||
speed: f32,
|
||||
) -> Result<(Vec<f32>, Vec<f32>)> {
|
||||
self._infer(text_list, style, total_step)
|
||||
self._infer(text_list, style, total_step, speed)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -4,6 +4,8 @@ This guide provides examples for running TTS inference using `example_onnx`.
|
||||
|
||||
## 📰 Update News
|
||||
|
||||
**2025.11.19** - Added `--speed` parameter to control speech synthesis speed (default: 1.05, recommended range: 0.9-1.5).
|
||||
|
||||
**2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
|
||||
|
||||
## Installation
|
||||
|
||||
@@ -5,6 +5,7 @@ struct Args {
|
||||
var useGpu: Bool = false
|
||||
var onnxDir: String = "assets/onnx"
|
||||
var totalStep: Int = 5
|
||||
var speed: Float = 1.05
|
||||
var nTest: Int = 4
|
||||
var voiceStyle: [String] = ["assets/voice_styles/M1.json"]
|
||||
var text: [String] = ["This morning, I took a walk in the park, and the sound of the birds and the breeze was so pleasant that I stopped for a long time just to listen."]
|
||||
@@ -33,6 +34,11 @@ func parseArgs() -> Args {
|
||||
args.totalStep = Int(arguments[i + 1]) ?? 5
|
||||
i += 1
|
||||
}
|
||||
case "--speed":
|
||||
if i + 1 < arguments.count {
|
||||
args.speed = Float(arguments[i + 1]) ?? 1.05
|
||||
i += 1
|
||||
}
|
||||
case "--n-test":
|
||||
if i + 1 < arguments.count {
|
||||
args.nTest = Int(arguments[i + 1]) ?? 4
|
||||
@@ -102,13 +108,13 @@ struct ExampleONNX {
|
||||
|
||||
if args.batch {
|
||||
let result = try timer("Generating speech from text") {
|
||||
try textToSpeech.batch(args.text, style, args.totalStep)
|
||||
try textToSpeech.batch(args.text, style, args.totalStep, speed: args.speed)
|
||||
}
|
||||
wav = result.wav
|
||||
duration = result.duration
|
||||
} else {
|
||||
let result = try timer("Generating speech from text") {
|
||||
try textToSpeech.call(args.text[0], style, args.totalStep, silenceDuration: 0.3)
|
||||
try textToSpeech.call(args.text[0], style, args.totalStep, speed: args.speed, silenceDuration: 0.3)
|
||||
}
|
||||
wav = result.wav
|
||||
duration = [result.duration]
|
||||
|
||||
@@ -453,7 +453,7 @@ class TextToSpeech {
|
||||
self.sampleRate = cfgs.ae.sample_rate
|
||||
}
|
||||
|
||||
private func _infer(_ textList: [String], _ style: Style, _ totalStep: Int) throws -> (wav: [Float], duration: [Float]) {
|
||||
private func _infer(_ textList: [String], _ style: Style, _ totalStep: Int, speed: Float = 1.05) throws -> (wav: [Float], duration: [Float]) {
|
||||
let bsz = textList.count
|
||||
|
||||
// Process text
|
||||
@@ -479,10 +479,15 @@ class TextToSpeech {
|
||||
runOptions: nil)
|
||||
|
||||
let durationData = try dpOutputs["duration"]!.tensorData() as Data
|
||||
let duration = durationData.withUnsafeBytes { ptr in
|
||||
var duration = durationData.withUnsafeBytes { ptr in
|
||||
Array(ptr.bindMemory(to: Float.self))
|
||||
}
|
||||
|
||||
// Apply speed factor to duration
|
||||
for i in 0..<duration.count {
|
||||
duration[i] /= speed
|
||||
}
|
||||
|
||||
// Encode text
|
||||
let textEncOutputs = try textEncOrt.run(withInputs: ["text_ids": textIdsValue, "style_ttl": style.ttl, "text_mask": textMaskValue],
|
||||
outputNames: ["text_emb"],
|
||||
@@ -576,14 +581,14 @@ class TextToSpeech {
|
||||
return (wav, duration)
|
||||
}
|
||||
|
||||
func call(_ text: String, _ style: Style, _ totalStep: Int, silenceDuration: Float) throws -> (wav: [Float], duration: Float) {
|
||||
func call(_ text: String, _ style: Style, _ totalStep: Int, speed: Float = 1.05, silenceDuration: Float = 0.3) throws -> (wav: [Float], duration: Float) {
|
||||
let chunks = chunkText(text)
|
||||
|
||||
var wavCat = [Float]()
|
||||
var durCat: Float = 0.0
|
||||
|
||||
for (i, chunk) in chunks.enumerated() {
|
||||
let result = try _infer([chunk], style, totalStep)
|
||||
let result = try _infer([chunk], style, totalStep, speed: speed)
|
||||
|
||||
let dur = result.duration[0]
|
||||
let wavLen = Int(Float(sampleRate) * dur)
|
||||
@@ -605,8 +610,8 @@ class TextToSpeech {
|
||||
return (wavCat, durCat)
|
||||
}
|
||||
|
||||
func batch(_ textList: [String], _ style: Style, _ totalStep: Int) throws -> (wav: [Float], duration: [Float]) {
|
||||
return try _infer(textList, style, totalStep)
|
||||
func batch(_ textList: [String], _ style: Style, _ totalStep: Int, speed: Float = 1.05) throws -> (wav: [Float], duration: [Float]) {
|
||||
return try _infer(textList, style, totalStep, speed: speed)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
+41
@@ -108,6 +108,37 @@ NC='\033[0m' # No Color
|
||||
declare -a PASSED=()
|
||||
declare -a FAILED=()
|
||||
|
||||
# Helper function to show statistics
|
||||
show_stats() {
|
||||
local name=$1
|
||||
local results_dir=$2
|
||||
|
||||
if [ -d "$results_dir" ]; then
|
||||
# Count .wav files
|
||||
local file_count=$(find "$results_dir" -name "*.wav" -type f 2>/dev/null | wc -l | tr -d ' ')
|
||||
|
||||
if [ "$file_count" -gt 0 ]; then
|
||||
# Calculate total size
|
||||
local total_size=0
|
||||
while IFS= read -r file; do
|
||||
if [ -f "$file" ]; then
|
||||
local size=$(stat -f%z "$file" 2>/dev/null || stat -c%s "$file" 2>/dev/null)
|
||||
total_size=$((total_size + size))
|
||||
fi
|
||||
done < <(find "$results_dir" -name "*.wav" -type f 2>/dev/null)
|
||||
|
||||
# Calculate statistics
|
||||
local total_size_mb=$(echo "scale=2; $total_size / 1024 / 1024" | bc)
|
||||
local avg_size_kb=$(echo "scale=2; $total_size / $file_count / 1024" | bc)
|
||||
|
||||
echo -e "${BLUE}[$name]${NC} 📊 Statistics:"
|
||||
echo -e "${BLUE}[$name]${NC} - Files generated: $file_count"
|
||||
echo -e "${BLUE}[$name]${NC} - Total size: ${total_size_mb} MB"
|
||||
echo -e "${BLUE}[$name]${NC} - Average file size: ${avg_size_kb} KB"
|
||||
fi
|
||||
fi
|
||||
}
|
||||
|
||||
# Helper function to run tests
|
||||
run_test() {
|
||||
local name=$1
|
||||
@@ -118,9 +149,19 @@ run_test() {
|
||||
echo -e "${BLUE}[$name]${NC} Running inference..."
|
||||
cd "$SCRIPT_DIR/$dir"
|
||||
|
||||
# Determine results directory based on the directory
|
||||
local results_dir="$SCRIPT_DIR/$dir/results"
|
||||
if [[ "$dir" == "cpp/build" ]]; then
|
||||
results_dir="$SCRIPT_DIR/cpp/build/results"
|
||||
fi
|
||||
|
||||
# Run command and prefix each output line with the language name
|
||||
if eval "$cmd" 2>&1 | sed "s/^/[$name] /"; then
|
||||
echo -e "${GREEN}[$name]${NC} ✓ Success"
|
||||
|
||||
# Show statistics
|
||||
show_stats "$name" "$results_dir"
|
||||
|
||||
PASSED+=("$name")
|
||||
else
|
||||
echo -e "${RED}[$name]${NC} ✗ Failed"
|
||||
|
||||
@@ -4,6 +4,8 @@ This example demonstrates how to use Supertonic in a web browser using ONNX Runt
|
||||
|
||||
## 📰 Update News
|
||||
|
||||
**2025.11.19** - Added speed control slider to adjust speech synthesis speed (default: 1.05, recommended range: 0.9-1.5).
|
||||
|
||||
**2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
|
||||
|
||||
## Features
|
||||
|
||||
+10
-5
@@ -72,7 +72,7 @@ export class TextToSpeech {
|
||||
this.sampleRate = cfgs.ae.sample_rate;
|
||||
}
|
||||
|
||||
async _infer(textList, style, totalStep, progressCallback = null) {
|
||||
async _infer(textList, style, totalStep, speed = 1.05, progressCallback = null) {
|
||||
const bsz = textList.length;
|
||||
|
||||
// Process text
|
||||
@@ -94,6 +94,11 @@ export class TextToSpeech {
|
||||
});
|
||||
const duration = Array.from(dpOutputs.duration.data);
|
||||
|
||||
// Apply speed factor to duration
|
||||
for (let i = 0; i < duration.length; i++) {
|
||||
duration[i] /= speed;
|
||||
}
|
||||
|
||||
// Encode text
|
||||
const textEncOutputs = await this.textEncOrt.run({
|
||||
text_ids: textIdsTensor,
|
||||
@@ -176,7 +181,7 @@ export class TextToSpeech {
|
||||
return { wav, duration };
|
||||
}
|
||||
|
||||
async call(text, style, totalStep, silenceDuration = 0.3, progressCallback = null) {
|
||||
async call(text, style, totalStep, speed = 1.05, silenceDuration = 0.3, progressCallback = null) {
|
||||
if (style.ttl.dims[0] !== 1) {
|
||||
throw new Error('Single speaker text to speech only supports single style');
|
||||
}
|
||||
@@ -185,7 +190,7 @@ export class TextToSpeech {
|
||||
let durCat = 0;
|
||||
|
||||
for (const chunk of textList) {
|
||||
const { wav, duration } = await this._infer([chunk], style, totalStep, progressCallback);
|
||||
const { wav, duration } = await this._infer([chunk], style, totalStep, speed, progressCallback);
|
||||
|
||||
if (wavCat.length === 0) {
|
||||
wavCat = wav;
|
||||
@@ -201,8 +206,8 @@ export class TextToSpeech {
|
||||
return { wav: wavCat, duration: [durCat] };
|
||||
}
|
||||
|
||||
async batch(textList, style, totalStep, progressCallback = null) {
|
||||
return await this._infer(textList, style, totalStep, progressCallback);
|
||||
async batch(textList, style, totalStep, speed = 1.05, progressCallback = null) {
|
||||
return await this._infer(textList, style, totalStep, speed, progressCallback);
|
||||
}
|
||||
|
||||
sampleNoisyLatent(duration, sampleRate, baseChunkSize, chunkCompress, latentDim) {
|
||||
|
||||
@@ -48,6 +48,12 @@
|
||||
<input type="number" id="totalStep" value="5"
|
||||
min="1" max="50">
|
||||
</div>
|
||||
|
||||
<div class="section">
|
||||
<label for="speed">Speed (0.9-1.5 recommended):</label>
|
||||
<input type="number" id="speed" value="1.05"
|
||||
min="0.5" max="2.0" step="0.05">
|
||||
</div>
|
||||
|
||||
</div>
|
||||
|
||||
|
||||
@@ -25,6 +25,7 @@ const textInput = document.getElementById('text');
|
||||
const voiceStyleSelect = document.getElementById('voiceStyleSelect');
|
||||
const voiceStyleInfo = document.getElementById('voiceStyleInfo');
|
||||
const totalStepInput = document.getElementById('totalStep');
|
||||
const speedInput = document.getElementById('speed');
|
||||
const generateBtn = document.getElementById('generateBtn');
|
||||
const statusBox = document.getElementById('statusBox');
|
||||
const statusText = document.getElementById('statusText');
|
||||
@@ -186,6 +187,7 @@ async function generateSpeech() {
|
||||
`;
|
||||
|
||||
const totalStep = parseInt(totalStepInput.value);
|
||||
const speed = parseFloat(speedInput.value);
|
||||
|
||||
showStatus('ℹ️ <strong>Generating speech from text...</strong>');
|
||||
const tic = Date.now();
|
||||
@@ -194,6 +196,7 @@ async function generateSpeech() {
|
||||
text,
|
||||
currentStyle,
|
||||
totalStep,
|
||||
speed,
|
||||
0.3,
|
||||
(step, total) => {
|
||||
showStatus(`ℹ️ <strong>Denoising (${step}/${total})...</strong>`);
|
||||
|
||||
Reference in New Issue
Block a user