add speed parameter

2026-06-02 01:38:48 +02:00 · 2025-11-19 19:42:24 +09:00
parent c31b6745e4
commit 8518b839c1
30 changed files with 246 additions and 61 deletions
@@ -4,6 +4,8 @@ High-performance text-to-speech inference using ONNX Runtime.

 ## 📰 Update News

+**2025.11.19** - Added `--speed` parameter to control speech synthesis speed (default: 1.05, recommended range: 0.9-1.5).
+
 **2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.

 ## Requirements
@@ -10,6 +10,7 @@ namespace fs = std::filesystem;
 struct Args {
    std::string onnx_dir = "../assets/onnx";
    int total_step = 5;
+    float speed = 1.05f;
    int n_test = 4;
    std::vector<std::string> voice_style = {"../assets/voice_styles/M1.json"};
    std::vector<std::string> text = {
@@ -36,6 +37,7 @@ Args parseArgs(int argc, char* argv[]) {
        std::string arg = argv[i];
        if (arg == "--onnx-dir" && i + 1 < argc) args.onnx_dir = argv[++i];
        else if (arg == "--total-step" && i + 1 < argc) args.total_step = std::stoi(argv[++i]);
+        else if (arg == "--speed" && i + 1 < argc) args.speed = std::stof(argv[++i]);
        else if (arg == "--n-test" && i + 1 < argc) args.n_test = std::stoi(argv[++i]);
        else if (arg == "--voice-style" && i + 1 < argc) args.voice_style = splitString(argv[++i], ',');
        else if (arg == "--text" && i + 1 < argc) args.text = splitString(argv[++i], '|');
@@ -51,6 +53,7 @@ int main(int argc, char* argv[]) {
    // --- 1. Parse arguments --- //
    Args args = parseArgs(argc, argv);
    int total_step = args.total_step;
+    float speed = args.speed;
    int n_test = args.n_test;
    std::string save_dir = args.save_dir;
    std::vector<std::string> voice_style_paths = args.voice_style;
@@ -84,9 +87,9 @@ int main(int argc, char* argv[]) {
        
        auto result = timer("Generating speech from text", [&]() {
            if (batch) {
-                return text_to_speech->batch(memory_info, text_list, style, total_step);
+                return text_to_speech->batch(memory_info, text_list, style, total_step, speed);
            } else {
-                return text_to_speech->call(memory_info, text_list[0], style, total_step);
+                return text_to_speech->call(memory_info, text_list[0], style, total_step, speed);
            }
        });
        
@@ -160,7 +160,8 @@ TextToSpeech::SynthesisResult TextToSpeech::_infer(
    Ort::MemoryInfo& memory_info,
    const std::vector<std::string>& text_list,
    const Style& style,
-    int total_step
+    int total_step,
+    float speed
 ) {
    int bsz = text_list.size();
    
@@ -213,6 +214,11 @@ TextToSpeech::SynthesisResult TextToSpeech::_infer(
    auto* dur_data = dp_outputs[0].GetTensorMutableData<float>();
    std::vector<float> duration(dur_data, dur_data + bsz);
    
+    // Apply speed factor to duration
+    for (auto& dur : duration) {
+        dur /= speed;
+    }
+    
    // Create new tensors for text encoder (previous ones were moved)
    text_ids_tensor = intArrayToTensor(memory_info, text_ids, text_ids_shape);
    text_mask_tensor = arrayToTensor(memory_info, text_mask, text_mask_shape);
@@ -370,6 +376,7 @@ TextToSpeech::SynthesisResult TextToSpeech::call(
    const std::string& text,
    const Style& style,
    int total_step,
+    float speed,
    float silence_duration
 ) {
    if (style.getTtlShape()[0] != 1) {
@@ -381,7 +388,7 @@ TextToSpeech::SynthesisResult TextToSpeech::call(
    float dur_cat = 0.0f;
    
    for (const auto& chunk : text_list) {
-        auto result = _infer(memory_info, {chunk}, style, total_step);
+        auto result = _infer(memory_info, {chunk}, style, total_step, speed);
        
        if (wav_cat.empty()) {
            wav_cat = result.wav;
@@ -406,9 +413,10 @@ TextToSpeech::SynthesisResult TextToSpeech::batch(
    Ort::MemoryInfo& memory_info,
    const std::vector<std::string>& text_list,
    const Style& style,
-    int total_step
+    int total_step,
+    float speed
 ) {
-    return _infer(memory_info, text_list, style, total_step);
+    return _infer(memory_info, text_list, style, total_step, speed);
 }

 // ============================================================================
@@ -91,6 +91,7 @@ public:
        const std::string& text,
        const Style& style,
        int total_step,
+        float speed = 1.05f,
        float silence_duration = 0.3f
    );
    
@@ -98,7 +99,8 @@ public:
        Ort::MemoryInfo& memory_info,
        const std::vector<std::string>& text_list,
        const Style& style,
-        int total_step
+        int total_step,
+        float speed = 1.05f
    );
    
    int getSampleRate() const { return sample_rate_; }
@@ -108,7 +110,8 @@ private:
        Ort::MemoryInfo& memory_info,
        const std::vector<std::string>& text_list,
        const Style& style,
-        int total_step
+        int total_step,
+        float speed = 1.05f
    );
    Config cfgs_;
    UnicodeProcessor* text_processor_;
@@ -12,6 +12,7 @@ namespace Supertonic
            public bool UseGpu { get; set; } = false;
            public string OnnxDir { get; set; } = "assets/onnx";
            public int TotalStep { get; set; } = 5;
+            public float Speed { get; set; } = 1.05f;
            public int NTest { get; set; } = 4;
            public List<string> VoiceStyle { get; set; } = new List<string> { "assets/voice_styles/M1.json" };
            public List<string> Text { get; set; } = new List<string> 
@@ -42,6 +43,9 @@ namespace Supertonic
                    case "--total-step" when i + 1 < args.Length:
                        result.TotalStep = int.Parse(args[++i]);
                        break;
+                    case "--speed" when i + 1 < args.Length:
+                        result.Speed = float.Parse(args[++i]);
+                        break;
                    case "--n-test" when i + 1 < args.Length:
                        result.NTest = int.Parse(args[++i]);
                        break;
@@ -67,6 +71,7 @@ namespace Supertonic
            // --- 1. Parse arguments --- //
            var parsedArgs = ParseArgs(args);
            int totalStep = parsedArgs.TotalStep;
+            float speed = parsedArgs.Speed;
            int nTest = parsedArgs.NTest;
            string saveDir = parsedArgs.SaveDir;
            var voiceStylePaths = parsedArgs.VoiceStyle;
@@ -96,11 +101,11 @@ namespace Supertonic
                {
                    if (batch)
                    {
-                        return textToSpeech.Batch(textList, style, totalStep);
+                        return textToSpeech.Batch(textList, style, totalStep, speed);
                    }
                    else
                    {
-                        return textToSpeech.Call(textList[0], style, totalStep);
+                        return textToSpeech.Call(textList[0], style, totalStep, speed);
                    }
                });

@@ -194,7 +194,7 @@ namespace Supertonic
            return (noisyLatent, latentMask);
        }

-        private (float[] wav, float[] duration) _Infer(List<string> textList, Style style, int totalStep)
+        private (float[] wav, float[] duration) _Infer(List<string> textList, Style style, int totalStep, float speed = 1.05f)
        {
            int bsz = textList.Count;
            if (bsz != style.TtlShape[0])
@@ -222,6 +222,12 @@ namespace Supertonic
            };
            using var dpOutputs = _dpOrt.Run(dpInputs);
            var durOnnx = dpOutputs.First(o => o.Name == "duration").AsTensor<float>().ToArray();
+            
+            // Apply speed factor to duration
+            for (int i = 0; i < durOnnx.Length; i++)
+            {
+                durOnnx[i] /= speed;
+            }

            // Run text encoder
            var textEncInputs = new List<NamedOnnxValue>
@@ -284,7 +290,7 @@ namespace Supertonic
            return (wavTensor.ToArray(), durOnnx);
        }

-        public (float[] wav, float[] duration) Call(string text, Style style, int totalStep, float silenceDuration = 0.3f)
+        public (float[] wav, float[] duration) Call(string text, Style style, int totalStep, float speed = 1.05f, float silenceDuration = 0.3f)
        {
            if (style.TtlShape[0] != 1)
            {
@@ -297,7 +303,7 @@ namespace Supertonic

            foreach (var chunk in textList)
            {
-                var (wav, duration) = _Infer(new List<string> { chunk }, style, totalStep);
+                var (wav, duration) = _Infer(new List<string> { chunk }, style, totalStep, speed);

                if (wavCat.Count == 0)
                {
@@ -317,9 +323,9 @@ namespace Supertonic
            return (wavCat.ToArray(), new float[] { durCat });
        }

-        public (float[] wav, float[] duration) Batch(List<string> textList, Style style, int totalStep)
+        public (float[] wav, float[] duration) Batch(List<string> textList, Style style, int totalStep, float speed = 1.05f)
        {
-            return _Infer(textList, style, totalStep);
+            return _Infer(textList, style, totalStep, speed);
        }
    }

@@ -4,6 +4,8 @@ This guide provides examples for running TTS inference using `ExampleONNX.cs`.

 ## 📰 Update News

+**2025.11.19** - Added `--speed` parameter to control speech synthesis speed (default: 1.05, recommended range: 0.9-1.5).
+
 **2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.

 ## Installation
@@ -4,6 +4,8 @@ This guide provides examples for running TTS inference using `example_onnx.go`.

 ## 📰 Update News

+**2025.11.19** - Added `--speed` parameter to control speech synthesis speed (default: 1.05, recommended range: 0.9-1.5).
+
 **2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.

 ## Installation
@@ -15,6 +15,7 @@ type Args struct {
 	useGPU      bool
 	onnxDir     string
 	totalStep   int
+	speed       float64
 	nTest       int
 	voiceStyle  []string
 	text        []string
@@ -28,6 +29,7 @@ func parseArgs() *Args {
 	flag.BoolVar(&args.useGPU, "use-gpu", false, "Use GPU for inference (default: CPU)")
 	flag.StringVar(&args.onnxDir, "onnx-dir", "assets/onnx", "Path to ONNX model directory")
 	flag.IntVar(&args.totalStep, "total-step", 5, "Number of denoising steps")
+	flag.Float64Var(&args.speed, "speed", 1.05, "Speech speed factor (higher = faster)")
 	flag.IntVar(&args.nTest, "n-test", 4, "Number of times to generate")
 	flag.StringVar(&args.saveDir, "save-dir", "results", "Output directory")
 	flag.BoolVar(&args.batch, "batch", false, "Enable batch mode (multiple text-style pairs)")
@@ -63,6 +65,7 @@ func main() {
 	// --- 1. Parse arguments --- //
 	args := parseArgs()
 	totalStep := args.totalStep
+	speed := float32(args.speed)
 	nTest := args.nTest
 	saveDir := args.saveDir
 	voiceStylePaths := args.voiceStyle
@@ -123,7 +126,7 @@ func main() {

 		if batch {
 			Timer("Generating speech from text", func() interface{} {
-				w, d, err := textToSpeech.Batch(textList, style, totalStep)
+				w, d, err := textToSpeech.Batch(textList, style, totalStep, speed)
 				if err != nil {
 					fmt.Printf("Error generating speech: %v\n", err)
 					os.Exit(1)
@@ -134,7 +137,7 @@ func main() {
 			})
 		} else {
 			Timer("Generating speech from text", func() interface{} {
-				w, d, err := textToSpeech.Call(textList[0], style, totalStep, 0.3)
+				w, d, err := textToSpeech.Call(textList[0], style, totalStep, speed, 0.3)
 				if err != nil {
 					fmt.Printf("Error generating speech: %v\n", err)
 					os.Exit(1)
@@ -572,7 +572,7 @@ func (tts *TextToSpeech) sampleNoisyLatent(durOnnx []float32) ([][][]float64, []
 	return noisyLatent, latentMask
 }

-func (tts *TextToSpeech) _infer(textList []string, style *Style, totalStep int) ([]float32, []float32, error) {
+func (tts *TextToSpeech) _infer(textList []string, style *Style, totalStep int, speed float32) ([]float32, []float32, error) {
 	bsz := len(textList)

 	// Process text
@@ -597,6 +597,11 @@ func (tts *TextToSpeech) _infer(textList []string, style *Style, totalStep int)
 	durTensor := dpOutputs[0].(*ort.Tensor[float32])
 	defer durTensor.Destroy()
 	durOnnx := durTensor.GetData()
+	
+	// Apply speed factor to duration
+	for i := range durOnnx {
+		durOnnx[i] /= speed
+	}

 	// Encode text
 	textIDsTensor2 := IntArrayToTensor(textIDs, textIDsShape)
@@ -691,14 +696,14 @@ func (tts *TextToSpeech) _infer(textList []string, style *Style, totalStep int)
 }

 // Call synthesizes speech from a single text with automatic chunking
-func (tts *TextToSpeech) Call(text string, style *Style, totalStep int, silenceDuration float32) ([]float32, float32, error) {
+func (tts *TextToSpeech) Call(text string, style *Style, totalStep int, speed float32, silenceDuration float32) ([]float32, float32, error) {
 	chunks := chunkText(text, 0)
 	
 	var wavCat []float32
 	var durCat float32

 	for i, chunk := range chunks {
-		wav, duration, err := tts._infer([]string{chunk}, style, totalStep)
+		wav, duration, err := tts._infer([]string{chunk}, style, totalStep, speed)
 		if err != nil {
 			return nil, 0, err
 		}
@@ -724,8 +729,8 @@ func (tts *TextToSpeech) Call(text string, style *Style, totalStep int, silenceD
 }

 // Batch synthesizes speech from multiple texts
-func (tts *TextToSpeech) Batch(textList []string, style *Style, totalStep int) ([]float32, []float32, error) {
-	return tts._infer(textList, style, totalStep)
+func (tts *TextToSpeech) Batch(textList []string, style *Style, totalStep int, speed float32) ([]float32, []float32, error) {
+	return tts._infer(textList, style, totalStep, speed)
 }

 func (tts *TextToSpeech) Destroy() {
@@ -15,6 +15,7 @@ public class ExampleONNX {
        boolean useGpu = false;
        String onnxDir = "assets/onnx";
        int totalStep = 5;
+        float speed = 1.05f;
        int nTest = 4;
        List<String> voiceStyle = Arrays.asList("assets/voice_styles/M1.json");
        List<String> text = Arrays.asList(
@@ -41,6 +42,9 @@ public class ExampleONNX {
                case "--total-step":
                    if (i + 1 < args.length) result.totalStep = Integer.parseInt(args[++i]);
                    break;
+                case "--speed":
+                    if (i + 1 < args.length) result.speed = Float.parseFloat(args[++i]);
+                    break;
                case "--n-test":
                    if (i + 1 < args.length) result.nTest = Integer.parseInt(args[++i]);
                    break;
@@ -76,6 +80,7 @@ public class ExampleONNX {
            // --- 1. Parse arguments --- //
            Args parsedArgs = parseArgs(args);
            int totalStep = parsedArgs.totalStep;
+            float speed = parsedArgs.speed;
            int nTest = parsedArgs.nTest;
            String saveDir = parsedArgs.saveDir;
            List<String> voiceStylePaths = parsedArgs.voiceStyle;
@@ -111,7 +116,7 @@ public class ExampleONNX {
                if (batch) {
                    ttsResult = Helper.timer("Generating speech from text", () -> {
                        try {
-                            return textToSpeech.batch(textList, style, totalStep, env);
+                            return textToSpeech.batch(textList, style, totalStep, speed, env);
                        } catch (Exception e) {
                            throw new RuntimeException(e);
                        }
@@ -119,7 +124,7 @@ public class ExampleONNX {
                } else {
                    ttsResult = Helper.timer("Generating speech from text", () -> {
                        try {
-                            return textToSpeech.call(textList.get(0), style, totalStep, 0.3f, env);
+                            return textToSpeech.call(textList.get(0), style, totalStep, speed, 0.3f, env);
                        } catch (Exception e) {
                            throw new RuntimeException(e);
                        }
@@ -154,7 +154,7 @@ class TextToSpeech {
        this.ldim = config.ttl.latentDim;
    }
    
-    private TTSResult _infer(List<String> textList, Style style, int totalStep, OrtEnvironment env) 
+    private TTSResult _infer(List<String> textList, Style style, int totalStep, float speed, OrtEnvironment env) 
            throws OrtException {
        int bsz = textList.size();
        
@@ -182,6 +182,11 @@ class TextToSpeech {
            duration = (float[]) dpValue;
        }
        
+        // Apply speed factor to duration
+        for (int i = 0; i < duration.length; i++) {
+            duration[i] /= speed;
+        }
+        
        // Encode text
        Map<String, OnnxTensor> textEncInputs = new HashMap<>();
        textEncInputs.put("text_ids", textIdsTensor);
@@ -301,7 +306,7 @@ class TextToSpeech {
    /**
     * Synthesize speech from a single text with automatic chunking
     */
-    public TTSResult call(String text, Style style, int totalStep, float silenceDuration, OrtEnvironment env) 
+    public TTSResult call(String text, Style style, int totalStep, float speed, float silenceDuration, OrtEnvironment env) 
            throws OrtException {
        List<String> chunks = Helper.chunkText(text, 0);
        
@@ -309,7 +314,7 @@ class TextToSpeech {
        float durCat = 0.0f;
        
        for (int i = 0; i < chunks.size(); i++) {
-            TTSResult result = _infer(Arrays.asList(chunks.get(i)), style, totalStep, env);
+            TTSResult result = _infer(Arrays.asList(chunks.get(i)), style, totalStep, speed, env);
            
            float dur = result.duration[0];
            int wavLen = (int) (sampleRate * dur);
@@ -344,9 +349,9 @@ class TextToSpeech {
    /**
     * Batch synthesize speech from multiple texts
     */
-    public TTSResult batch(List<String> textList, Style style, int totalStep, OrtEnvironment env) 
+    public TTSResult batch(List<String> textList, Style style, int totalStep, float speed, OrtEnvironment env) 
            throws OrtException {
-        return _infer(textList, style, totalStep, env);
+        return _infer(textList, style, totalStep, speed, env);
    }
    
    public void close() throws OrtException {
@@ -4,6 +4,8 @@ This guide provides examples for running TTS inference using `ExampleONNX.java`.

 ## 📰 Update News

+**2025.11.19** - Added `--speed` parameter to control speech synthesis speed (default: 1.05, recommended range: 0.9-1.5).
+
 **2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.

 ## Installation
@@ -4,6 +4,8 @@ Node.js implementation for TTS inference. Uses ONNX Runtime to generate speech f

 ## 📰 Update News

+**2025.11.19** - Added `--speed` parameter to control speech synthesis speed (default: 1.05, recommended range: 0.9-1.5).
+
 **2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.

 ## Requirements
@@ -15,6 +15,7 @@ function parseArgs() {
        useGpu: false,
        onnxDir: 'assets/onnx',
        totalStep: 5,
+        speed: 1.05,
        nTest: 4,
        voiceStyle: ['assets/voice_styles/M1.json'],
        text: ['This morning, I took a walk in the park, and the sound of the birds and the breeze was so pleasant that I stopped for a long time just to listen.'],
@@ -32,6 +33,8 @@ function parseArgs() {
            args.onnxDir = process.argv[++i];
        } else if (arg === '--total-step' && i + 1 < process.argv.length) {
            args.totalStep = parseInt(process.argv[++i]);
+        } else if (arg === '--speed' && i + 1 < process.argv.length) {
+            args.speed = parseFloat(process.argv[++i]);
        } else if (arg === '--n-test' && i + 1 < process.argv.length) {
            args.nTest = parseInt(process.argv[++i]);
        } else if (arg === '--voice-style' && i + 1 < process.argv.length) {
@@ -55,6 +58,7 @@ async function main() {
    // --- 1. Parse arguments --- //
    const args = parseArgs();
    const totalStep = args.totalStep;
+    const speed = args.speed;
    const nTest = args.nTest;
    const saveDir = args.saveDir;
    const voiceStylePaths = args.voiceStyle.map(p => path.resolve(__dirname, p));
@@ -79,9 +83,9 @@ async function main() {
        
        const { wav, duration } = await timer('Generating speech from text', async () => {
            if (batch) {
-                return await textToSpeech.batch(textList, style, totalStep);
+                return await textToSpeech.batch(textList, style, totalStep, speed);
            } else {
-                return await textToSpeech.call(textList[0], style, totalStep);
+                return await textToSpeech.call(textList[0], style, totalStep, speed);
            }
        });
        
@@ -114,7 +114,7 @@ class TextToSpeech {
        return { noisyLatent, latentMask };
    }

-    async _infer(textList, style, totalStep) {
+    async _infer(textList, style, totalStep, speed = 1.05) {
        if (textList.length !== style.ttl.dims[0]) {
            throw new Error('Number of texts must match number of style vectors');
        }
@@ -133,6 +133,11 @@ class TextToSpeech {
        
        const durOnnx = Array.from(dpResult.duration.data);
        
+        // Apply speed factor to duration
+        for (let i = 0; i < durOnnx.length; i++) {
+            durOnnx[i] /= speed;
+        }
+        
        const textEncResult = await this.textEncOrt.run({
            text_ids: intArrayToTensor(textIds, textIdsShape),
            style_ttl: style.ttl,
@@ -185,7 +190,7 @@ class TextToSpeech {
        return { wav, duration: durOnnx };
    }

-    async call(text, style, totalStep, silenceDuration = 0.3) {
+    async call(text, style, totalStep, speed = 1.05, silenceDuration = 0.3) {
        if (style.ttl.dims[0] !== 1) {
            throw new Error('Single speaker text to speech only supports single style');
        }
@@ -194,7 +199,7 @@ class TextToSpeech {
        let durCat = 0;
        
        for (const chunk of textList) {
-            const { wav, duration } = await this._infer([chunk], style, totalStep);
+            const { wav, duration } = await this._infer([chunk], style, totalStep, speed);
            
            if (wavCat === null) {
                wavCat = wav;
@@ -210,8 +215,8 @@ class TextToSpeech {
        return { wav: wavCat, duration: [durCat] };
    }

-    async batch(textList, style, totalStep) {
-        return await this._infer(textList, style, totalStep);
+    async batch(textList, style, totalStep, speed = 1.05) {
+        return await this._infer(textList, style, totalStep, speed);
    }
 }

@@ -4,6 +4,8 @@ This guide provides examples for running TTS inference using `example_onnx.py`.

 ## 📰 Update News

+**2025.11.19** - Added `--speed` parameter to control speech synthesis speed (default: 1.05, recommended range: 0.9-1.5). Adjust the speed factor to make speech faster or slower while maintaining natural quality.
+
 **2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.

 ## Installation
@@ -85,6 +87,28 @@ This will:

 **Note**: When using batch mode (`--batch`), automatic text chunking is disabled. Use non-batch mode for long-form text synthesis.

+### Example 5: Adjusting Speech Speed
+Control the speed of speech synthesis:
+```bash
+# Faster speech (speed > 1.0)
+uv run example_onnx.py \
+  --voice-style assets/voice_styles/F2.json \
+  --text "This text will be synthesized at a faster pace." \
+  --speed 1.2
+
+# Slower speech (speed < 1.0)
+uv run example_onnx.py \
+  --voice-style assets/voice_styles/M2.json \
+  --text "This text will be synthesized at a slower, more deliberate pace." \
+  --speed 0.9
+```
+
+This will:
+- Use `--speed 1.2` to generate faster speech
+- Use `--speed 0.9` to generate slower speech
+- Fall back to the default speed of 1.05 when `--speed` is not specified
+- Give the most natural-sounding results when the speed stays within the recommended range of 0.9 to 1.5
+
 ## Available Arguments

 | Argument | Type | Default | Description |
@@ -92,6 +116,7 @@ This will:
 | `--use-gpu` | flag | False | Use GPU for inference (with CPU fallback) |
 | `--onnx-dir` | str | `assets/onnx` | Path to ONNX model directory |
 | `--total-step` | int | 5 | Number of denoising steps (higher = better quality, slower) |
+| `--speed` | float | 1.05 | Speech speed factor (higher = faster, lower = slower) |
 | `--n-test` | int | 4 | Number of times to generate each sample |
 | `--voice-style` | str+ | `assets/voice_styles/M1.json` | Voice style file path(s) |
 | `--text` | str+ | (long default text) | Text(s) to synthesize |
@@ -26,6 +26,12 @@ def parse_args():
    parser.add_argument(
        "--total-step", type=int, default=5, help="Number of denoising steps"
    )
+    parser.add_argument(
+        "--speed",
+        type=float,
+        default=1.05,
+        help="Speech speed (default: 1.05, higher = faster)",
+    )
    parser.add_argument(
        "--n-test", type=int, default=4, help="Number of times to generate"
    )
@@ -62,6 +68,7 @@ print("=== TTS Inference with ONNX Runtime (Python) ===\n")
 # --- 1. Parse arguments --- #
 args = parse_args()
 total_step = args.total_step
+speed = args.speed
 n_test = args.n_test
 save_dir = args.save_dir
 voice_style_paths = args.voice_style
@@ -84,9 +91,9 @@ for n in range(n_test):
    print(f"\n[{n+1}/{n_test}] Starting synthesis...")
    with timer("Generating speech from text"):
        if batch:
-            wav, duration = text_to_speech.batch(text_list, style, total_step)
+            wav, duration = text_to_speech.batch(text_list, style, total_step, speed)
        else:
-            wav, duration = text_to_speech(text_list[0], style, total_step)
+            wav, duration = text_to_speech(text_list[0], style, total_step, speed)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    for b in range(bsz):
@@ -86,7 +86,7 @@ class TextToSpeech:
        return noisy_latent, latent_mask

    def _infer(
-        self, text_list: list[str], style: Style, total_step: int
+        self, text_list: list[str], style: Style, total_step: int, speed: float = 1.05
    ) -> tuple[np.ndarray, np.ndarray]:
        assert (
            len(text_list) == style.ttl.shape[0]
@@ -96,6 +96,7 @@ class TextToSpeech:
        dur_onnx, *_ = self.dp_ort.run(
            None, {"text_ids": text_ids, "style_dp": style.dp, "text_mask": text_mask}
        )
+        dur_onnx = dur_onnx / speed
        text_emb_onnx, *_ = self.text_enc_ort.run(
            None,
            {"text_ids": text_ids, "style_ttl": style.ttl, "text_mask": text_mask},
@@ -120,7 +121,12 @@ class TextToSpeech:
        return wav, dur_onnx

    def __call__(
-        self, text: str, style: Style, total_step: int, silence_duration: float = 0.3
+        self,
+        text: str,
+        style: Style,
+        total_step: int,
+        speed: float = 1.05,
+        silence_duration: float = 0.3,
    ) -> tuple[np.ndarray, np.ndarray]:
        assert (
            style.ttl.shape[0] == 1
@@ -129,7 +135,7 @@ class TextToSpeech:
        wav_cat = None
        dur_cat = None
        for text in text_list:
-            wav, dur_onnx = self._infer([text], style, total_step)
+            wav, dur_onnx = self._infer([text], style, total_step, speed)
            if wav_cat is None:
                wav_cat = wav
                dur_cat = dur_onnx
@@ -142,9 +148,9 @@ class TextToSpeech:
        return wav_cat, dur_cat

    def batch(
-        self, text_list: list[str], style: Style, total_step: int
+        self, text_list: list[str], style: Style, total_step: int, speed: float = 1.05
    ) -> tuple[np.ndarray, np.ndarray]:
-        return self._infer(text_list, style, total_step)
+        return self._infer(text_list, style, total_step, speed)


 def length_to_mask(lengths: np.ndarray, max_len: Optional[int] = None) -> np.ndarray:
@@ -4,6 +4,8 @@ This guide provides examples for running TTS inference using Rust.

 ## 📰 Update News

+**2025.11.19** - Added `--speed` parameter to control speech synthesis speed (default: 1.05, recommended range: 0.9-1.5).
+
 **2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.

 ## Installation
@@ -26,6 +26,10 @@ struct Args {
    #[arg(long, default_value = "5")]
    total_step: usize,

+    /// Speech speed factor (higher = faster)
+    #[arg(long, default_value = "1.05")]
+    speed: f32,
+
    /// Number of times to generate
    #[arg(long, default_value = "4")]
    n_test: usize,
@@ -53,6 +57,7 @@ fn main() -> Result<()> {
    // --- 1. Parse arguments --- //
    let args = Args::parse();
    let total_step = args.total_step;
+    let speed = args.speed;
    let n_test = args.n_test;
    let voice_style_paths = &args.voice_style;
    let text_list = &args.text;
@@ -85,11 +90,11 @@ fn main() -> Result<()> {

        let (wav, duration) = if batch {
            timer("Generating speech from text", || {
-                text_to_speech.batch(text_list, &style, total_step)
+                text_to_speech.batch(text_list, &style, total_step, speed)
            })?
        } else {
            let (w, d) = timer("Generating speech from text", || {
-                text_to_speech.call(&text_list[0], &style, total_step, 0.3)
+                text_to_speech.call(&text_list[0], &style, total_step, speed, 0.3)
            })?;
            (w, vec![d])
        };
@@ -484,6 +484,7 @@ impl TextToSpeech {
        text_list: &[String],
        style: &Style,
        total_step: usize,
+        speed: f32,
    ) -> Result<(Vec<f32>, Vec<f32>)> {
        let bsz = text_list.len();

@@ -511,7 +512,12 @@ impl TextToSpeech {
        })?;

        let (_, duration_data) = dp_outputs["duration"].try_extract_tensor::<f32>()?;
-        let duration: Vec<f32> = duration_data.to_vec();
+        let mut duration: Vec<f32> = duration_data.to_vec();
+        
+        // Apply speed factor to duration
+        for dur in duration.iter_mut() {
+            *dur /= speed;
+        }

        // Encode text
        let style_ttl_value = Value::from_array(style.ttl.clone())?;
@@ -584,6 +590,7 @@ impl TextToSpeech {
        text: &str,
        style: &Style,
        total_step: usize,
+        speed: f32,
        silence_duration: f32,
    ) -> Result<(Vec<f32>, f32)> {
        let chunks = chunk_text(text, None);
@@ -592,7 +599,7 @@ impl TextToSpeech {
        let mut dur_cat: f32 = 0.0;

        for (i, chunk) in chunks.iter().enumerate() {
-            let (wav, duration) = self._infer(&[chunk.clone()], style, total_step)?;
+            let (wav, duration) = self._infer(&[chunk.clone()], style, total_step, speed)?;
            
            let dur = duration[0];
            let wav_len = (self.sample_rate as f32 * dur) as usize;
@@ -619,8 +626,9 @@ impl TextToSpeech {
        text_list: &[String],
        style: &Style,
        total_step: usize,
+        speed: f32,
    ) -> Result<(Vec<f32>, Vec<f32>)> {
-        self._infer(text_list, style, total_step)
+        self._infer(text_list, style, total_step, speed)
    }
 }

@@ -4,6 +4,8 @@ This guide provides examples for running TTS inference using `example_onnx`.

 ## 📰 Update News

+**2025.11.19** - Added `--speed` parameter to control speech synthesis speed (default: 1.05, recommended range: 0.9-1.5).
+
 **2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.

 ## Installation
@@ -5,6 +5,7 @@ struct Args {
    var useGpu: Bool = false
    var onnxDir: String = "assets/onnx"
    var totalStep: Int = 5
+    var speed: Float = 1.05
    var nTest: Int = 4
    var voiceStyle: [String] = ["assets/voice_styles/M1.json"]
    var text: [String] = ["This morning, I took a walk in the park, and the sound of the birds and the breeze was so pleasant that I stopped for a long time just to listen."]
@@ -33,6 +34,11 @@ func parseArgs() -> Args {
                args.totalStep = Int(arguments[i + 1]) ?? 5
                i += 1
            }
+        case "--speed":
+            if i + 1 < arguments.count {
+                args.speed = Float(arguments[i + 1]) ?? 1.05
+                i += 1
+            }
        case "--n-test":
            if i + 1 < arguments.count {
                args.nTest = Int(arguments[i + 1]) ?? 4
@@ -102,13 +108,13 @@ struct ExampleONNX {
                
                if args.batch {
                    let result = try timer("Generating speech from text") {
-                        try textToSpeech.batch(args.text, style, args.totalStep)
+                        try textToSpeech.batch(args.text, style, args.totalStep, speed: args.speed)
                    }
                    wav = result.wav
                    duration = result.duration
                } else {
                    let result = try timer("Generating speech from text") {
-                        try textToSpeech.call(args.text[0], style, args.totalStep, silenceDuration: 0.3)
+                        try textToSpeech.call(args.text[0], style, args.totalStep, speed: args.speed, silenceDuration: 0.3)
                    }
                    wav = result.wav
                    duration = [result.duration]
@@ -453,7 +453,7 @@ class TextToSpeech {
        self.sampleRate = cfgs.ae.sample_rate
    }
    
-    private func _infer(_ textList: [String], _ style: Style, _ totalStep: Int) throws -> (wav: [Float], duration: [Float]) {
+    private func _infer(_ textList: [String], _ style: Style, _ totalStep: Int, speed: Float = 1.05) throws -> (wav: [Float], duration: [Float]) {
        let bsz = textList.count
        
        // Process text
@@ -479,10 +479,15 @@ class TextToSpeech {
                                      runOptions: nil)
        
        let durationData = try dpOutputs["duration"]!.tensorData() as Data
-        let duration = durationData.withUnsafeBytes { ptr in
+        var duration = durationData.withUnsafeBytes { ptr in
            Array(ptr.bindMemory(to: Float.self))
        }
        
+        // Apply speed factor to duration
+        for i in 0..<duration.count {
+            duration[i] /= speed
+        }
+        
        // Encode text
        let textEncOutputs = try textEncOrt.run(withInputs: ["text_ids": textIdsValue, "style_ttl": style.ttl, "text_mask": textMaskValue],
                                                outputNames: ["text_emb"],
@@ -576,14 +581,14 @@ class TextToSpeech {
        return (wav, duration)
    }
    
-    func call(_ text: String, _ style: Style, _ totalStep: Int, silenceDuration: Float) throws -> (wav: [Float], duration: Float) {
+    func call(_ text: String, _ style: Style, _ totalStep: Int, speed: Float = 1.05, silenceDuration: Float = 0.3) throws -> (wav: [Float], duration: Float) {
        let chunks = chunkText(text)
        
        var wavCat = [Float]()
        var durCat: Float = 0.0
        
        for (i, chunk) in chunks.enumerated() {
-            let result = try _infer([chunk], style, totalStep)
+            let result = try _infer([chunk], style, totalStep, speed: speed)
            
            let dur = result.duration[0]
            let wavLen = Int(Float(sampleRate) * dur)
@@ -605,8 +610,8 @@ class TextToSpeech {
        return (wavCat, durCat)
    }
    
-    func batch(_ textList: [String], _ style: Style, _ totalStep: Int) throws -> (wav: [Float], duration: [Float]) {
-        return try _infer(textList, style, totalStep)
+    func batch(_ textList: [String], _ style: Style, _ totalStep: Int, speed: Float = 1.05) throws -> (wav: [Float], duration: [Float]) {
+        return try _infer(textList, style, totalStep, speed: speed)
    }
 }

@@ -108,6 +108,37 @@ NC='\033[0m' # No Color
 declare -a PASSED=()
 declare -a FAILED=()

+# Helper: print count, total size (MB) and average size (KB) of the .wav files under a results dir ($1 = label used as log prefix, $2 = directory)
+show_stats() {
+    local name=$1
+    local results_dir=$2
+    
+    if [ -d "$results_dir" ]; then
+        # Count regular *.wav files found anywhere under the directory (tr removes any spaces from wc's output)
+        local file_count=$(find "$results_dir" -name "*.wav" -type f 2>/dev/null | wc -l | tr -d ' ')
+        
+        if [ "$file_count" -gt 0 ]; then
+            # Sum file sizes in bytes; "stat -f%z" is tried first, then "stat -c%s" as a fallback (presumably BSD/macOS vs GNU stat)
+            local total_size=0
+            while IFS= read -r file; do
+                if [ -f "$file" ]; then
+                    local size=$(stat -f%z "$file" 2>/dev/null || stat -c%s "$file" 2>/dev/null)
+                    total_size=$((total_size + size))
+                fi
+            done < <(find "$results_dir" -name "*.wav" -type f 2>/dev/null)
+            
+            # Convert with bc at two decimal places: total bytes -> MB, mean bytes per file -> KB
+            local total_size_mb=$(echo "scale=2; $total_size / 1024 / 1024" | bc)
+            local avg_size_kb=$(echo "scale=2; $total_size / $file_count / 1024" | bc)
+            
+            echo -e "${BLUE}[$name]${NC} 📊 Statistics:"
+            echo -e "${BLUE}[$name]${NC}   - Files generated: $file_count"
+            echo -e "${BLUE}[$name]${NC}   - Total size: ${total_size_mb} MB"
+            echo -e "${BLUE}[$name]${NC}   - Average file size: ${avg_size_kb} KB"
+        fi
+    fi
+}
+
 # Helper function to run tests
 run_test() {
    local name=$1
@@ -118,9 +149,19 @@ run_test() {
    echo -e "${BLUE}[$name]${NC} Running inference..."
    cd "$SCRIPT_DIR/$dir"
    
+    # Determine results directory based on the directory
+    local results_dir="$SCRIPT_DIR/$dir/results"
+    if [[ "$dir" == "cpp/build" ]]; then
+        results_dir="$SCRIPT_DIR/cpp/build/results"
+    fi
+    
    # Run command and prefix each output line with the language name
    if eval "$cmd" 2>&1 | sed "s/^/[$name] /"; then
        echo -e "${GREEN}[$name]${NC} ✓ Success"
+        
+        # Show statistics
+        show_stats "$name" "$results_dir"
+        
        PASSED+=("$name")
    else
        echo -e "${RED}[$name]${NC} ✗ Failed"
@@ -4,6 +4,8 @@ This example demonstrates how to use Supertonic in a web browser using ONNX Runt

 ## 📰 Update News

+**2025.11.19** - Added a speed control input to adjust speech synthesis speed (default: 1.05, recommended range: 0.9-1.5).
+
 **2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.

 ## Features
@@ -72,7 +72,7 @@ export class TextToSpeech {
        this.sampleRate = cfgs.ae.sample_rate;
    }

-    async _infer(textList, style, totalStep, progressCallback = null) {
+    async _infer(textList, style, totalStep, speed = 1.05, progressCallback = null) {
        const bsz = textList.length;
        
        // Process text
@@ -94,6 +94,11 @@ export class TextToSpeech {
        });
        const duration = Array.from(dpOutputs.duration.data);
        
+        // Apply speed factor to duration
+        for (let i = 0; i < duration.length; i++) {
+            duration[i] /= speed;
+        }
+        
        // Encode text
        const textEncOutputs = await this.textEncOrt.run({
            text_ids: textIdsTensor,
@@ -176,7 +181,7 @@ export class TextToSpeech {
        return { wav, duration };
    }

-    async call(text, style, totalStep, silenceDuration = 0.3, progressCallback = null) {
+    async call(text, style, totalStep, speed = 1.05, silenceDuration = 0.3, progressCallback = null) {
        if (style.ttl.dims[0] !== 1) {
            throw new Error('Single speaker text to speech only supports single style');
        }
@@ -185,7 +190,7 @@ export class TextToSpeech {
        let durCat = 0;
        
        for (const chunk of textList) {
-            const { wav, duration } = await this._infer([chunk], style, totalStep, progressCallback);
+            const { wav, duration } = await this._infer([chunk], style, totalStep, speed, progressCallback);
            
            if (wavCat.length === 0) {
                wavCat = wav;
@@ -201,8 +206,8 @@ export class TextToSpeech {
        return { wav: wavCat, duration: [durCat] };
    }

-    async batch(textList, style, totalStep, progressCallback = null) {
-        return await this._infer(textList, style, totalStep, progressCallback);
+    async batch(textList, style, totalStep, speed = 1.05, progressCallback = null) {
+        return await this._infer(textList, style, totalStep, speed, progressCallback);
    }

    sampleNoisyLatent(duration, sampleRate, baseChunkSize, chunkCompress, latentDim) {
@@ -48,6 +48,12 @@
                            <input type="number" id="totalStep" value="5"
                                min="1" max="50">
                        </div>
+                        
+                        <div class="section">
+                            <label for="speed">Speed (0.9-1.5 recommended):</label>
+                            <input type="number" id="speed" value="1.05"
+                                min="0.5" max="2.0" step="0.05">
+                        </div>

                    </div>

@@ -25,6 +25,7 @@ const textInput = document.getElementById('text');
 const voiceStyleSelect = document.getElementById('voiceStyleSelect');
 const voiceStyleInfo = document.getElementById('voiceStyleInfo');
 const totalStepInput = document.getElementById('totalStep');
+const speedInput = document.getElementById('speed');
 const generateBtn = document.getElementById('generateBtn');
 const statusBox = document.getElementById('statusBox');
 const statusText = document.getElementById('statusText');
@@ -186,6 +187,7 @@ async function generateSpeech() {
        `;
        
        const totalStep = parseInt(totalStepInput.value);
+        const speed = parseFloat(speedInput.value);
        
        showStatus('ℹ️ <strong>Generating speech from text...</strong>');
        const tic = Date.now();
@@ -194,6 +196,7 @@ async function generateSpeech() {
            text, 
            currentStyle, 
            totalStep,
+            speed,
            0.3,
            (step, total) => {
                showStatus(`ℹ️ <strong>Denoising (${step}/${total})...</strong>`);