add text chunking for long-form generation (Fixes #4)

This commit is contained in:
ANLGBOY
2025-11-19 18:08:30 +09:00
parent d31536d9fc
commit c31b6745e4
30 changed files with 1813 additions and 102 deletions
+26 -2
View File
@@ -2,6 +2,10 @@
High-performance text-to-speech inference using ONNX Runtime.
## 📰 Update News
**2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
## Requirements
- C++17 compiler, CMake 3.15+
@@ -62,14 +66,16 @@ Process multiple voice styles and texts at once:
```bash
./example_onnx \
--voice-style ../assets/voice_styles/M1.json,../assets/voice_styles/F1.json \
--text "The sun sets behind the mountains, painting the sky in shades of pink and orange.|The weather is beautiful and sunny outside. A gentle breeze makes the air feel fresh and pleasant."
--text "The sun sets behind the mountains, painting the sky in shades of pink and orange.|The weather is beautiful and sunny outside. A gentle breeze makes the air feel fresh and pleasant." \
--batch
```
This will:
- Use `--batch` flag to enable batch processing mode
- Generate speech for 2 different voice-text pairs
- Use male voice style (M1.json) for the first text
- Use female voice style (F1.json) for the second text
- Process both samples in a single batch
- Process both samples in a single batch (automatic text chunking disabled)
### Example 3: High Quality Inference
Increase denoising steps for better quality:
@@ -84,6 +90,22 @@ This will:
- Use 10 denoising steps instead of the default 5
- Produce higher quality output at the cost of slower inference
### Example 4: Long-Form Inference
For long texts, the system automatically chunks the text into manageable segments and generates a single audio file:
```bash
./example_onnx \
--voice-style ../assets/voice_styles/M1.json \
--text "Once upon a time, in a small village nestled between rolling hills, there lived a young artist named Clara. Every morning, she would wake up before dawn to capture the first light of day. The golden rays streaming through her window inspired countless paintings. Her work was known throughout the region for its vibrant colors and emotional depth. People from far and wide came to see her gallery, and many said her paintings could tell stories that words never could."
```
This will:
- Automatically split the long text into smaller chunks (max 300 characters by default)
- Process each chunk separately while maintaining natural speech flow
- Insert brief silences (0.3 seconds) between chunks for natural pacing
- Combine all chunks into a single output audio file
**Note**: When using batch mode (`--batch`), automatic text chunking is disabled. Use non-batch mode for long-form text synthesis.
## Available Arguments
| Argument | Type | Default | Description |
@@ -94,8 +116,10 @@ This will:
| `--voice-style` | str | `../assets/voice_styles/M1.json` | Voice style file path(s) (comma-separated for batch) |
| `--text` | str | (long default text) | Text(s) to synthesize (pipe-separated for batch) |
| `--save-dir` | str | `results` | Output directory |
| `--batch` | flag | False | Enable batch mode (disables automatic text chunking) |
## Notes
- **Batch Processing**: The number of `--voice-style` files must match the number of `--text` entries
- **Long-Form Inference**: Without `--batch` flag, long texts are automatically chunked and combined into a single audio file with natural pauses
- **Quality vs Speed**: Higher `--total-step` values produce better quality but take longer
+8 -2
View File
@@ -16,6 +16,7 @@ struct Args {
"This morning, I took a walk in the park, and the sound of the birds and the breeze was so pleasant that I stopped for a long time just to listen."
};
std::string save_dir = "results";
bool batch = false;
};
auto splitString = [](const std::string& str, char delim) {
@@ -39,6 +40,7 @@ Args parseArgs(int argc, char* argv[]) {
else if (arg == "--voice-style" && i + 1 < argc) args.voice_style = splitString(argv[++i], ',');
else if (arg == "--text" && i + 1 < argc) args.text = splitString(argv[++i], '|');
else if (arg == "--save-dir" && i + 1 < argc) args.save_dir = argv[++i];
else if (arg == "--batch") args.batch = true;
}
return args;
}
@@ -53,13 +55,13 @@ int main(int argc, char* argv[]) {
std::string save_dir = args.save_dir;
std::vector<std::string> voice_style_paths = args.voice_style;
std::vector<std::string> text_list = args.text;
bool batch = args.batch;
if (voice_style_paths.size() != text_list.size()) {
std::cerr << "Error: Number of voice styles (" << voice_style_paths.size()
<< ") must match number of texts (" << text_list.size() << ")\n";
return 1;
}
int bsz = voice_style_paths.size();
// --- 2. Load Text to Speech --- //
@@ -81,7 +83,11 @@ int main(int argc, char* argv[]) {
std::cout << "\n[" << (n + 1) << "/" << n_test << "] Starting synthesis...\n";
auto result = timer("Generating speech from text", [&]() {
return text_to_speech->call(memory_info, text_list, style, total_step);
if (batch) {
return text_to_speech->batch(memory_info, text_list, style, total_step);
} else {
return text_to_speech->call(memory_info, text_list[0], style, total_step);
}
});
int sample_rate = text_to_speech->getSampleRate();
+137 -1
View File
@@ -5,6 +5,7 @@
#include <algorithm>
#include <random>
#include <sstream>
#include <regex>
#include <nlohmann/json.hpp>
using json = nlohmann::json;
@@ -155,7 +156,7 @@ void TextToSpeech::sampleNoisyLatent(
}
}
TextToSpeech::SynthesisResult TextToSpeech::call(
TextToSpeech::SynthesisResult TextToSpeech::_infer(
Ort::MemoryInfo& memory_info,
const std::vector<std::string>& text_list,
const Style& style,
@@ -364,6 +365,52 @@ TextToSpeech::SynthesisResult TextToSpeech::call(
return result;
}
/// Synthesizes a single utterance from `text`, chunking long input automatically.
///
/// The text is split with chunkText() (default chunk length), each chunk is run
/// through _infer() as a batch of one, and the chunk waveforms are concatenated
/// with `silence_duration` seconds of zero samples between consecutive chunks.
///
/// @param memory_info       ONNX Runtime memory info forwarded to _infer().
/// @param text              Text to synthesize.
/// @param style             Voice style; the first entry of its TTL shape must be 1.
/// @param total_step        Number of denoising steps, forwarded to _infer().
/// @param silence_duration  Seconds of silence inserted between chunks; negative
///                          values are treated as zero.
/// @return Result whose `wav` holds the concatenated samples and whose `duration`
///         holds one entry: the total length in seconds (chunks plus silences).
/// @throws std::runtime_error if `style` does not describe exactly one style.
TextToSpeech::SynthesisResult TextToSpeech::call(
    Ort::MemoryInfo& memory_info,
    const std::string& text,
    const Style& style,
    int total_step,
    float silence_duration
) {
    if (style.getTtlShape()[0] != 1) {
        throw std::runtime_error("Single speaker text to speech only supports single style");
    }

    const auto chunks = chunkText(text);

    // Clamp so a negative duration cannot turn into a huge unsigned sample count.
    const float gap_seconds = std::max(0.0f, silence_duration);
    const size_t gap_samples =
        static_cast<size_t>(gap_seconds * static_cast<float>(sample_rate_));

    std::vector<float> wav_cat;
    float dur_cat = 0.0f;
    bool is_first_chunk = true;

    for (const auto& chunk : chunks) {
        auto result = _infer(memory_info, {chunk}, style, total_step);
        const float chunk_seconds = result.duration[0];

        // Keep only the samples covered by the reported duration so the waveform
        // length stays in step with dur_cat.
        // NOTE(review): assumes _infer() may return samples past duration[0]
        // (e.g. padding) - confirm against _infer().
        const float reported_samples =
            std::max(0.0f, chunk_seconds * static_cast<float>(sample_rate_));
        const size_t chunk_samples =
            std::min(result.wav.size(), static_cast<size_t>(reported_samples));

        if (!is_first_chunk) {
            wav_cat.insert(wav_cat.end(), gap_samples, 0.0f);
            dur_cat += gap_seconds;
        }
        wav_cat.insert(
            wav_cat.end(),
            result.wav.begin(),
            result.wav.begin() + static_cast<std::ptrdiff_t>(chunk_samples));
        dur_cat += chunk_seconds;
        is_first_chunk = false;
    }

    SynthesisResult final_result;
    final_result.wav = std::move(wav_cat);
    final_result.duration = {dur_cat};
    return final_result;
}
/// Batch entry point: forwards `text_list` and `style` unchanged to _infer().
/// Unlike call(), no text chunking or silence insertion happens here; the
/// result is whatever _infer() returns for the given texts.
TextToSpeech::SynthesisResult TextToSpeech::batch(
Ort::MemoryInfo& memory_info,
const std::vector<std::string>& text_list,
const Style& style,
int total_step
) {
return _infer(memory_info, text_list, style, total_step);
}
// ============================================================================
// Utility functions
// ============================================================================
@@ -712,3 +759,92 @@ std::string sanitizeFilename(const std::string& text, int max_len) {
}
return result;
}
// ============================================================================
// Chunk text
// ============================================================================
/// Returns a copy of `str` without leading or trailing whitespace.
static std::string trim(const std::string& str) {
    size_t start = 0;
    // The unsigned char cast keeps std::isspace defined for bytes >= 0x80.
    while (start < str.size() && std::isspace(static_cast<unsigned char>(str[start]))) {
        start++;
    }
    size_t end = str.size();
    while (end > start && std::isspace(static_cast<unsigned char>(str[end - 1]))) {
        end--;
    }
    return str.substr(start, end - start);
}

/// Splits `text` into chunks of at most `max_len` characters.
///
/// Paragraphs (separated by blank lines) are never merged. Inside a paragraph,
/// sentences (ending in '.', '!' or '?' followed by whitespace) are packed
/// greedily, joined by a single space. A sentence that is itself longer than
/// `max_len` is broken at word boundaries; a single word longer than `max_len`
/// is kept whole. Abbreviations such as "Dr." are not special-cased.
///
/// @param text     Input text.
/// @param max_len  Maximum chunk length in bytes. When non-positive, every
///                 sentence becomes its own chunk and nothing is word-split.
/// @return The chunks in reading order; when the input yields no chunk, a
///         single element containing the trimmed input.
std::vector<std::string> chunkText(const std::string& text, int max_len) {
    // Compiled once: constructing std::regex is expensive.
    static const std::regex paragraph_regex(R"(\n\s*\n+)");
    static const std::regex sentence_regex(R"([.!?]\s+)");

    const bool has_limit = max_len > 0;
    const size_t limit = has_limit ? static_cast<size_t>(max_len) : 0;

    std::vector<std::string> chunks;

    const std::sregex_token_iterator paragraphs_end;
    for (std::sregex_token_iterator para_it(text.begin(), text.end(), paragraph_regex, -1);
         para_it != paragraphs_end; ++para_it) {
        const std::string paragraph = trim(para_it->str());
        if (paragraph.empty()) {
            continue;
        }

        std::string current_chunk;

        // Appends `piece` to the open chunk, closing the chunk first when the
        // piece (plus the joining space) would not fit.
        const auto pack = [&](const std::string& piece) {
            if (!current_chunk.empty() && current_chunk.size() + 1 + piece.size() > limit) {
                chunks.push_back(current_chunk);
                current_chunk.clear();
            }
            if (!current_chunk.empty()) {
                current_chunk += ' ';
            }
            current_chunk += piece;
        };

        // Packs a sentence whole when it fits, otherwise word by word.
        const auto add_sentence = [&](const std::string& sentence) {
            if (!has_limit || sentence.size() <= limit) {
                pack(sentence);
                return;
            }
            std::istringstream words(sentence);
            std::string word;
            while (words >> word) {
                pack(word);
            }
        };

        // Each boundary match starts at the punctuation mark: keep that mark
        // with its sentence and drop the whitespace that follows it.
        size_t sentence_start = 0;
        const std::sregex_iterator boundaries_end;
        for (std::sregex_iterator boundary(paragraph.begin(), paragraph.end(), sentence_regex);
             boundary != boundaries_end; ++boundary) {
            const size_t punct_pos = static_cast<size_t>(boundary->position());
            add_sentence(paragraph.substr(sentence_start, punct_pos + 1 - sentence_start));
            sentence_start = punct_pos + static_cast<size_t>(boundary->length());
        }
        if (sentence_start < paragraph.size()) {
            add_sentence(paragraph.substr(sentence_start));
        }

        if (!current_chunk.empty()) {
            chunks.push_back(current_chunk);
        }
    }

    // If no chunks were created, return the (trimmed) original text.
    if (chunks.empty()) {
        chunks.push_back(trim(text));
    }

    return chunks;
}
+17
View File
@@ -87,6 +87,14 @@ public:
};
SynthesisResult call(
Ort::MemoryInfo& memory_info,
const std::string& text,
const Style& style,
int total_step,
float silence_duration = 0.3f
);
SynthesisResult batch(
Ort::MemoryInfo& memory_info,
const std::vector<std::string>& text_list,
const Style& style,
@@ -96,6 +104,12 @@ public:
int getSampleRate() const { return sample_rate_; }
private:
SynthesisResult _infer(
Ort::MemoryInfo& memory_info,
const std::vector<std::string>& text_list,
const Style& style,
int total_step
);
Config cfgs_;
UnicodeProcessor* text_processor_;
Ort::Session* dp_ort_;
@@ -200,3 +214,6 @@ auto timer(const std::string& name, Func&& func) -> decltype(func()) {
// Sanitize filename
std::string sanitizeFilename(const std::string& text, int max_len);
// Chunk text into manageable segments
std::vector<std::string> chunkText(const std::string& text, int max_len = 300);
+16 -4
View File
@@ -19,6 +19,7 @@ namespace Supertonic
"This morning, I took a walk in the park, and the sound of the birds and the breeze was so pleasant that I stopped for a long time just to listen."
};
public string SaveDir { get; set; } = "results";
public bool Batch { get; set; } = false;
}
static Args ParseArgs(string[] args)
@@ -32,6 +33,9 @@ namespace Supertonic
case "--use-gpu":
result.UseGpu = true;
break;
case "--batch":
result.Batch = true;
break;
case "--onnx-dir" when i + 1 < args.Length:
result.OnnxDir = args[++i];
break;
@@ -67,13 +71,13 @@ namespace Supertonic
string saveDir = parsedArgs.SaveDir;
var voiceStylePaths = parsedArgs.VoiceStyle;
var textList = parsedArgs.Text;
bool batch = parsedArgs.Batch;
if (voiceStylePaths.Count != textList.Count)
{
throw new ArgumentException(
$"Number of voice styles ({voiceStylePaths.Count}) must match number of texts ({textList.Count})");
}
int bsz = voiceStylePaths.Count;
// --- 2. Load Text to Speech --- //
@@ -88,9 +92,17 @@ namespace Supertonic
{
Console.WriteLine($"\n[{n + 1}/{nTest}] Starting synthesis...");
var (wav, duration) = Helper.Timer("Generating speech from text", () =>
textToSpeech.Call(textList, style, totalStep)
);
var (wav, duration) = Helper.Timer("Generating speech from text", () =>
{
if (batch)
{
return textToSpeech.Batch(textList, style, totalStep);
}
else
{
return textToSpeech.Call(textList[0], style, totalStep);
}
});
if (!Directory.Exists(saveDir))
{
+100 -1
View File
@@ -4,6 +4,7 @@ using System.IO;
using System.Linq;
using System.Text;
using System.Text.Json;
using System.Text.RegularExpressions;
using Microsoft.ML.OnnxRuntime;
using Microsoft.ML.OnnxRuntime.Tensors;
@@ -193,7 +194,7 @@ namespace Supertonic
return (noisyLatent, latentMask);
}
public (float[] wav, float[] duration) Call(List<string> textList, Style style, int totalStep)
private (float[] wav, float[] duration) _Infer(List<string> textList, Style style, int totalStep)
{
int bsz = textList.Count;
if (bsz != style.TtlShape[0])
@@ -282,6 +283,44 @@ namespace Supertonic
return (wavTensor.ToArray(), durOnnx);
}
/// <summary>
/// Synthesizes a single utterance from <paramref name="text"/>, chunking long input automatically.
/// The text is split with <c>Helper.ChunkText</c>, each chunk is run through <c>_Infer</c> as a
/// batch of one, and the chunk waveforms are concatenated with silence in between.
/// </summary>
/// <param name="text">Text to synthesize.</param>
/// <param name="style">Voice style; the first entry of its TTL shape must be 1.</param>
/// <param name="totalStep">Number of denoising steps, forwarded to <c>_Infer</c>.</param>
/// <param name="silenceDuration">Seconds of silence between chunks; negative values are treated as zero.</param>
/// <returns>The concatenated samples and a one-element array holding the total length in seconds.</returns>
/// <exception cref="ArgumentException">Thrown when <paramref name="style"/> does not describe exactly one style.</exception>
public (float[] wav, float[] duration) Call(string text, Style style, int totalStep, float silenceDuration = 0.3f)
{
    if (style.TtlShape[0] != 1)
    {
        throw new ArgumentException("Single speaker text to speech only supports single style");
    }

    var chunks = Helper.ChunkText(text);

    // Clamp so a negative duration cannot produce a negative array length.
    float gapSeconds = Math.Max(0.0f, silenceDuration);
    int gapSamples = (int)(gapSeconds * SampleRate);

    var wavCat = new List<float>();
    float durCat = 0.0f;
    bool isFirstChunk = true;

    foreach (var chunk in chunks)
    {
        var (wav, duration) = _Infer(new List<string> { chunk }, style, totalStep);
        float chunkSeconds = duration[0];

        // Keep only the samples covered by the reported duration so the waveform
        // length stays in step with durCat.
        // NOTE(review): assumes _Infer may return samples past duration[0] (e.g. padding) - confirm.
        int chunkSamples = Math.Min(wav.Length, Math.Max(0, (int)(SampleRate * chunkSeconds)));

        if (!isFirstChunk)
        {
            wavCat.AddRange(new float[gapSamples]);
            durCat += gapSeconds;
        }
        wavCat.AddRange(new ArraySegment<float>(wav, 0, chunkSamples));
        durCat += chunkSeconds;
        isFirstChunk = false;
    }

    return (wavCat.ToArray(), new float[] { durCat });
}
/// <summary>
/// Batch entry point: forwards <paramref name="textList"/> and <paramref name="style"/>
/// unchanged to <c>_Infer</c>. Unlike <c>Call</c>, no text chunking or silence insertion
/// happens here.
/// </summary>
public (float[] wav, float[] duration) Batch(List<string> textList, Style style, int totalStep)
{
return _Infer(textList, style, totalStep);
}
}
// ============================================================================
@@ -608,5 +647,65 @@ namespace Supertonic
}
return result.ToString();
}
// ============================================================================
// Chunk text
// ============================================================================
/// <summary>
/// Splits <paramref name="text"/> into chunks of at most <paramref name="maxLen"/> characters.
/// Paragraphs (separated by blank lines) are never merged. Inside a paragraph, sentences are
/// packed greedily, joined by a single space. A sentence longer than <paramref name="maxLen"/>
/// is broken at word boundaries; a single word longer than the limit is kept whole.
/// </summary>
/// <param name="text">Input text.</param>
/// <param name="maxLen">Maximum chunk length. When non-positive, every sentence becomes its own chunk.</param>
/// <returns>The chunks in reading order; if none were produced, a single element with the trimmed input.</returns>
public static List<string> ChunkText(string text, int maxLen = 300)
{
    var chunks = new List<string>();

    // Split by paragraph (two or more newlines)
    var paragraphRegex = new Regex(@"\n\s*\n+");

    // Split on whitespace that follows sentence punctuation, excluding common abbreviations
    var sentenceRegex = new Regex(@"(?<!Mr\.|Mrs\.|Ms\.|Dr\.|Prof\.|Sr\.|Jr\.|Ph\.D\.|etc\.|e\.g\.|i\.e\.|vs\.|Inc\.|Ltd\.|Co\.|Corp\.|St\.|Ave\.|Blvd\.)(?<!\b[A-Z]\.)(?<=[.!?])\s+");

    foreach (var rawParagraph in paragraphRegex.Split(text.Trim()))
    {
        string paragraph = rawParagraph.Trim();
        if (paragraph.Length == 0) continue;

        string currentChunk = "";

        foreach (var sentence in sentenceRegex.Split(paragraph))
        {
            if (string.IsNullOrEmpty(sentence)) continue;

            // A sentence that cannot fit in any chunk is packed word by word instead,
            // so no chunk exceeds maxLen unless a single word does.
            string[] pieces = (maxLen > 0 && sentence.Length > maxLen)
                ? sentence.Split((char[])null, StringSplitOptions.RemoveEmptyEntries)
                : new[] { sentence };

            foreach (var piece in pieces)
            {
                if (currentChunk.Length + piece.Length + 1 <= maxLen)
                {
                    if (currentChunk.Length > 0)
                    {
                        currentChunk += " ";
                    }
                    currentChunk += piece;
                }
                else
                {
                    if (currentChunk.Length > 0)
                    {
                        chunks.Add(currentChunk.Trim());
                    }
                    currentChunk = piece;
                }
            }
        }

        if (currentChunk.Length > 0)
        {
            chunks.Add(currentChunk.Trim());
        }
    }

    // If no chunks were created, return the original text
    if (chunks.Count == 0)
    {
        chunks.Add(text.Trim());
    }

    return chunks;
}
}
}
+26 -2
View File
@@ -2,6 +2,10 @@
This guide provides examples for running TTS inference using `ExampleONNX.cs`.
## 📰 Update News
**2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
## Installation
### Prerequisites
@@ -33,14 +37,16 @@ Process multiple voice styles and texts at once:
```bash
dotnet run -- \
--voice-style assets/voice_styles/M1.json,assets/voice_styles/F1.json \
--text "The sun sets behind the mountains, painting the sky in shades of pink and orange.|The weather is beautiful and sunny outside. A gentle breeze makes the air feel fresh and pleasant."
--text "The sun sets behind the mountains, painting the sky in shades of pink and orange.|The weather is beautiful and sunny outside. A gentle breeze makes the air feel fresh and pleasant." \
--batch
```
This will:
- Use `--batch` flag to enable batch processing mode
- Generate speech for 2 different voice-text pairs
- Use male voice style (M1.json) for the first text
- Use female voice style (F1.json) for the second text
- Process both samples in a single batch
- Process both samples in a single batch (automatic text chunking disabled)
### Example 3: High Quality Inference
Increase denoising steps for better quality:
@@ -55,6 +61,22 @@ This will:
- Use 10 denoising steps instead of the default 5
- Produce higher quality output at the cost of slower inference
### Example 4: Long-Form Inference
For long texts, the system automatically chunks the text into manageable segments and generates a single audio file:
```bash
dotnet run -- \
--voice-style assets/voice_styles/M1.json \
--text "Once upon a time, in a small village nestled between rolling hills, there lived a young artist named Clara. Every morning, she would wake up before dawn to capture the first light of day. The golden rays streaming through her window inspired countless paintings. Her work was known throughout the region for its vibrant colors and emotional depth. People from far and wide came to see her gallery, and many said her paintings could tell stories that words never could."
```
This will:
- Automatically split the long text into smaller chunks (max 300 characters by default)
- Process each chunk separately while maintaining natural speech flow
- Insert brief silences (0.3 seconds) between chunks for natural pacing
- Combine all chunks into a single output audio file
**Note**: When using batch mode (`--batch`), automatic text chunking is disabled. Use non-batch mode for long-form text synthesis.
## Available Arguments
| Argument | Type | Default | Description |
@@ -66,10 +88,12 @@ This will:
| `--voice-style` | str+ | `assets/voice_styles/M1.json` | Voice style file path(s) (comma-separated) |
| `--text` | str+ | (long default text) | Text(s) to synthesize (pipe-separated: `|`) |
| `--save-dir` | str | `results` | Output directory |
| `--batch` | flag | False | Enable batch mode (disables automatic text chunking) |
## Notes
- **Batch Processing**: The number of `--voice-style` files must match the number of `--text` entries
- **Long-Form Inference**: Without `--batch` flag, long texts are automatically chunked and combined into a single audio file with natural pauses
- **Quality vs Speed**: Higher `--total-step` values produce better quality but take longer
- **GPU Support**: GPU mode is not supported yet
+25 -1
View File
@@ -2,6 +2,10 @@
This guide provides examples for running TTS inference using `example_onnx.go`.
## 📰 Update News
**2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
## Installation
This project uses Go modules for dependency management.
@@ -73,6 +77,7 @@ This will use:
Process multiple voice styles and texts at once:
```bash
go run example_onnx.go helper.go \
--batch \
-voice-style "assets/voice_styles/M1.json,assets/voice_styles/F1.json" \
-text "The sun sets behind the mountains, painting the sky in shades of pink and orange.|The weather is beautiful and sunny outside. A gentle breeze makes the air feel fresh and pleasant."
```
@@ -96,6 +101,23 @@ This will:
- Use 10 denoising steps instead of the default 5
- Produce higher quality output at the cost of slower inference
### Example 4: Long-Form Inference
The system automatically chunks long texts into manageable segments, synthesizes each segment separately, and concatenates them with natural pauses (0.3 seconds by default) into a single audio file. This happens by default when you don't use the `--batch` flag:
```bash
go run example_onnx.go helper.go \
-voice-style "assets/voice_styles/M1.json" \
-text "This is a very long text that will be automatically split into multiple chunks. The system will process each chunk separately and then concatenate them together with natural pauses between segments. This ensures that even very long texts can be processed efficiently while maintaining natural speech flow and avoiding memory issues."
```
This will:
- Automatically split the text into chunks based on paragraph and sentence boundaries
- Synthesize each chunk separately
- Add 0.3 seconds of silence between chunks for natural pauses
- Concatenate all chunks into a single audio file
**Note**: Automatic text chunking is disabled when using `--batch` mode. In batch mode, each text is processed as-is without chunking.
## Available Arguments
| Argument | Type | Default | Description |
@@ -107,10 +129,12 @@ This will:
| `-voice-style` | str | `assets/voice_styles/M1.json` | Voice style file path(s), comma-separated |
| `-text` | str | (long default text) | Text(s) to synthesize, pipe-separated |
| `-save-dir` | str | `results` | Output directory |
| `--batch` | flag | false | Enable batch mode (multiple text-style pairs, disables automatic chunking) |
## Notes
- **Batch Processing**: The number of `-voice-style` files must match the number of `-text` entries
- **Batch Processing**: When using `--batch`, the number of `-voice-style` files must match the number of `-text` entries
- **Automatic Chunking**: Without `--batch`, long texts are automatically split and concatenated with 0.3s pauses
- **Quality vs Speed**: Higher `-total-step` values produce better quality but take longer
- **GPU Support**: GPU mode is not supported yet
+45 -15
View File
@@ -19,6 +19,7 @@ type Args struct {
voiceStyle []string
text []string
saveDir string
batch bool
}
func parseArgs() *Args {
@@ -29,6 +30,7 @@ func parseArgs() *Args {
flag.IntVar(&args.totalStep, "total-step", 5, "Number of denoising steps")
flag.IntVar(&args.nTest, "n-test", 4, "Number of times to generate")
flag.StringVar(&args.saveDir, "save-dir", "results", "Output directory")
flag.BoolVar(&args.batch, "batch", false, "Enable batch mode (multiple text-style pairs)")
var voiceStyleStr, textStr string
flag.StringVar(&voiceStyleStr, "voice-style", "assets/voice_styles/M1.json", "Voice style file path(s), comma-separated")
@@ -65,11 +67,14 @@ func main() {
saveDir := args.saveDir
voiceStylePaths := args.voiceStyle
textList := args.text
batch := args.batch
if len(voiceStylePaths) != len(textList) {
fmt.Printf("Error: Number of voice styles (%d) must match number of texts (%d)\n",
len(voiceStylePaths), len(textList))
os.Exit(1)
if batch {
if len(voiceStylePaths) != len(textList) {
fmt.Printf("Error: Number of voice styles (%d) must match number of texts (%d)\n",
len(voiceStylePaths), len(textList))
os.Exit(1)
}
}
bsz := len(voiceStylePaths)
@@ -115,21 +120,46 @@ func main() {
var wav []float32
var duration []float32
Timer("Generating speech from text", func() interface{} {
w, d, err := textToSpeech.Call(textList, style, totalStep)
if err != nil {
fmt.Printf("Error generating speech: %v\n", err)
os.Exit(1)
}
wav = w
duration = d
return nil
})
if batch {
Timer("Generating speech from text", func() interface{} {
w, d, err := textToSpeech.Batch(textList, style, totalStep)
if err != nil {
fmt.Printf("Error generating speech: %v\n", err)
os.Exit(1)
}
wav = w
duration = d
return nil
})
} else {
Timer("Generating speech from text", func() interface{} {
w, d, err := textToSpeech.Call(textList[0], style, totalStep, 0.3)
if err != nil {
fmt.Printf("Error generating speech: %v\n", err)
os.Exit(1)
}
wav = w
duration = []float32{d}
return nil
})
}
// Save outputs
for i := 0; i < bsz; i++ {
fname := fmt.Sprintf("%s_%d.wav", sanitizeFilename(textList[i], 20), n+1)
wavOut := extractWavSegment(wav, duration[i], textToSpeech.SampleRate, i, bsz)
var wavOut []float64
if batch {
wavOut = extractWavSegment(wav, duration[i], textToSpeech.SampleRate, i, bsz)
} else {
// For non-batch mode, wav is a single concatenated audio
wavLen := int(float32(textToSpeech.SampleRate) * duration[0])
wavOut = make([]float64, wavLen)
for j := 0; j < wavLen && j < len(wav); j++ {
wavOut[j] = float64(wav[j])
}
}
outputPath := filepath.Join(saveDir, fname)
if err := writeWavFile(outputPath, wavOut, textToSpeech.SampleRate); err != nil {
+219 -1
View File
@@ -7,6 +7,8 @@ import (
"math/rand"
"os"
"path/filepath"
"regexp"
"strings"
"time"
"github.com/go-audio/audio"
@@ -145,6 +147,184 @@ func (up *UnicodeProcessor) Call(textList []string) ([][]int64, [][][]float64) {
return textIDs, textMask
}
// Text chunking utilities

// maxChunkLength is the chunk size chunkText falls back to when it is called
// with maxLen == 0.
const maxChunkLength = 300
// abbreviations lists period-terminated tokens that splitSentences refuses to
// treat as the end of a sentence. Matching is a case-sensitive suffix check.
var abbreviations = []string{
"Dr.", "Mr.", "Mrs.", "Ms.", "Prof.", "Sr.", "Jr.",
"St.", "Ave.", "Rd.", "Blvd.", "Dept.", "Inc.", "Ltd.",
"Co.", "Corp.", "etc.", "vs.", "i.e.", "e.g.", "Ph.D.",
}
// paragraphBreakRe matches the blank line(s) separating two paragraphs.
// Compiled once instead of on every chunkText call.
var paragraphBreakRe = regexp.MustCompile(`\n\s*\n`)

// chunkText splits text into chunks of at most maxLen bytes, in reading order.
//
// Paragraphs (separated by blank lines) are never merged; a paragraph that
// already fits is emitted as-is. Longer paragraphs are split into sentences
// with splitSentences and packed greedily, joined by a single space. A sentence
// that is itself longer than maxLen is split at commas (re-joined with ", "
// while they fit), and a comma part that is still too long is packed word by
// word. A single word longer than maxLen is kept whole.
//
// A non-positive maxLen selects maxChunkLength. Empty or whitespace-only input
// yields a single empty string.
func chunkText(text string, maxLen int) []string {
	if maxLen <= 0 {
		maxLen = maxChunkLength
	}

	text = strings.TrimSpace(text)
	if text == "" {
		return []string{""}
	}

	var chunks []string
	var current strings.Builder

	// flush closes the open chunk, if any.
	flush := func() {
		if current.Len() > 0 {
			chunks = append(chunks, strings.TrimSpace(current.String()))
			current.Reset()
		}
	}
	// add appends piece to the open chunk using sep as the joiner, closing the
	// chunk first when piece plus the joiner would not fit.
	add := func(piece, sep string) {
		if current.Len() > 0 && current.Len()+len(sep)+len(piece) > maxLen {
			flush()
		}
		if current.Len() > 0 {
			current.WriteString(sep)
		}
		current.WriteString(piece)
	}

	for _, para := range paragraphBreakRe.Split(text, -1) {
		para = strings.TrimSpace(para)
		if para == "" {
			continue
		}

		if len(para) <= maxLen {
			chunks = append(chunks, para)
			continue
		}

		for _, sentence := range splitSentences(para) {
			sentence = strings.TrimSpace(sentence)
			if sentence == "" {
				continue
			}

			if len(sentence) <= maxLen {
				add(sentence, " ")
				continue
			}

			// Over-long sentence: start a fresh chunk, then split by comma and,
			// as a last resort, by whitespace. Everything goes through the same
			// open chunk so the pieces can never be emitted out of order.
			flush()
			for _, part := range strings.Split(sentence, ",") {
				part = strings.TrimSpace(part)
				if part == "" {
					continue
				}

				if len(part) <= maxLen {
					add(part, ", ")
					continue
				}

				// The first word follows a comma; the rest follow a space.
				sep := ", "
				for _, word := range strings.Fields(part) {
					add(word, sep)
					sep = " "
				}
			}
		}

		// Paragraph boundary: never merge across paragraphs.
		flush()
	}

	if len(chunks) == 0 {
		return []string{""}
	}

	return chunks
}
// splitSentences cuts text after every '.', '!' or '?' that is followed by
// whitespace, unless the text up to and including that mark ends with one of
// the entries in abbreviations. Each returned sentence keeps its trailing
// whitespace; text without any boundary is returned as a single element.
func splitSentences(text string) []string {
	// Go's regexp has no lookbehind, so candidate boundaries are located first
	// and abbreviation endings are filtered out afterwards.
	boundary := regexp.MustCompile(`([.!?])\s+`)

	locs := boundary.FindAllStringIndex(text, -1)
	if len(locs) == 0 {
		return []string{text}
	}

	// endsWithAbbreviation reports whether candidate (trimmed) ends in a known abbreviation.
	endsWithAbbreviation := func(candidate string) bool {
		candidate = strings.TrimSpace(candidate)
		for _, abbrev := range abbreviations {
			if strings.HasSuffix(candidate, abbrev) {
				return true
			}
		}
		return false
	}

	var out []string
	start := 0
	for _, loc := range locs {
		punctIdx, nextStart := loc[0], loc[1]

		// Pending text plus the punctuation mark itself.
		if endsWithAbbreviation(text[start : punctIdx+1]) {
			// Not a real boundary: keep accumulating into the same sentence.
			continue
		}

		out = append(out, text[start:nextStart])
		start = nextStart
	}

	// Whatever follows the last accepted boundary forms the final sentence.
	if start < len(text) {
		out = append(out, text[start:])
	}

	if len(out) == 0 {
		return []string{text}
	}
	return out
}
// Utility functions
func preprocessText(text string) string {
// Simple normalization (Go doesn't have built-in NFKD normalization)
@@ -392,7 +572,7 @@ func (tts *TextToSpeech) sampleNoisyLatent(durOnnx []float32) ([][][]float64, []
return noisyLatent, latentMask
}
func (tts *TextToSpeech) Call(textList []string, style *Style, totalStep int) ([]float32, []float32, error) {
func (tts *TextToSpeech) _infer(textList []string, style *Style, totalStep int) ([]float32, []float32, error) {
bsz := len(textList)
// Process text
@@ -510,6 +690,44 @@ func (tts *TextToSpeech) Call(textList []string, style *Style, totalStep int) ([
return wav, durOnnx, nil
}
// Call synthesizes speech from a single text with automatic chunking.
//
// The text is split with chunkText (default chunk length), each chunk is run
// through _infer as a batch of one, and the chunk waveforms - clipped to the
// duration _infer reports - are concatenated with silenceDuration seconds of
// zero samples between consecutive chunks. A negative silenceDuration is
// treated as zero.
//
// It returns the concatenated samples and the total length in seconds (chunks
// plus inserted silences), or the first error returned by _infer.
func (tts *TextToSpeech) Call(text string, style *Style, totalStep int, silenceDuration float32) ([]float32, float32, error) {
	chunks := chunkText(text, 0)

	// Clamp so a negative duration cannot reach make() as a negative length.
	if silenceDuration < 0 {
		silenceDuration = 0
	}
	silenceLen := int(silenceDuration * float32(tts.SampleRate))

	var wavCat []float32
	var durCat float32

	for i, chunk := range chunks {
		wav, duration, err := tts._infer([]string{chunk}, style, totalStep)
		if err != nil {
			return nil, 0, err
		}

		dur := duration[0]

		// Clamp the sample count to the slice bounds: the float-derived length
		// may exceed len(wav) or be negative, and slicing would then panic.
		wavLen := int(float32(tts.SampleRate) * dur)
		if wavLen < 0 {
			wavLen = 0
		}
		if wavLen > len(wav) {
			wavLen = len(wav)
		}

		if i > 0 {
			wavCat = append(wavCat, make([]float32, silenceLen)...)
			durCat += silenceDuration
		}
		wavCat = append(wavCat, wav[:wavLen]...)
		durCat += dur
	}

	return wavCat, durCat, nil
}
// Batch synthesizes speech from multiple texts by forwarding textList and
// style unchanged to _infer. Unlike Call, it performs no text chunking and
// inserts no silence; the return values are exactly those of _infer.
func (tts *TextToSpeech) Batch(textList []string, style *Style, totalStep int) ([]float32, []float32, error) {
return tts._infer(textList, style, totalStep)
}
func (tts *TextToSpeech) Destroy() {
if tts.dpOrt != nil {
tts.dpOrt.Destroy()
+40 -14
View File
@@ -21,6 +21,7 @@ public class ExampleONNX {
"This morning, I took a walk in the park, and the sound of the birds and the breeze was so pleasant that I stopped for a long time just to listen."
);
String saveDir = "results";
boolean batch = false;
}
/**
@@ -56,6 +57,9 @@ public class ExampleONNX {
case "--save-dir":
if (i + 1 < args.length) result.saveDir = args[++i];
break;
case "--batch":
result.batch = true;
break;
}
}
@@ -76,10 +80,13 @@ public class ExampleONNX {
String saveDir = parsedArgs.saveDir;
List<String> voiceStylePaths = parsedArgs.voiceStyle;
List<String> textList = parsedArgs.text;
boolean batch = parsedArgs.batch;
if (voiceStylePaths.size() != textList.size()) {
throw new RuntimeException("Number of voice styles (" + voiceStylePaths.size() +
") must match number of texts (" + textList.size() + ")");
if (batch) {
if (voiceStylePaths.size() != textList.size()) {
throw new RuntimeException("Number of voice styles (" + voiceStylePaths.size() +
") must match number of texts (" + textList.size() + ")");
}
}
int bsz = voiceStylePaths.size();
@@ -100,25 +107,44 @@ public class ExampleONNX {
for (int n = 0; n < nTest; n++) {
System.out.println("\n[" + (n + 1) + "/" + nTest + "] Starting synthesis...");
TTSResult ttsResult = Helper.timer("Generating speech from text", () -> {
try {
return textToSpeech.call(textList, style, totalStep, env);
} catch (Exception e) {
throw new RuntimeException(e);
}
});
TTSResult ttsResult;
if (batch) {
ttsResult = Helper.timer("Generating speech from text", () -> {
try {
return textToSpeech.batch(textList, style, totalStep, env);
} catch (Exception e) {
throw new RuntimeException(e);
}
});
} else {
ttsResult = Helper.timer("Generating speech from text", () -> {
try {
return textToSpeech.call(textList.get(0), style, totalStep, 0.3f, env);
} catch (Exception e) {
throw new RuntimeException(e);
}
});
}
float[] wav = ttsResult.wav;
float[] duration = ttsResult.duration;
// Save outputs
int wavLen = wav.length / bsz;
for (int i = 0; i < bsz; i++) {
String fname = Helper.sanitizeFilename(textList.get(i), 20) + "_" + (n + 1) + ".wav";
int actualLen = (int) (textToSpeech.sampleRate * duration[i]);
float[] wavOut;
float[] wavOut = new float[actualLen];
System.arraycopy(wav, i * wavLen, wavOut, 0, Math.min(actualLen, wavLen));
if (batch) {
int wavLen = wav.length / bsz;
int actualLen = (int) (textToSpeech.sampleRate * duration[i]);
wavOut = new float[actualLen];
System.arraycopy(wav, i * wavLen, wavOut, 0, Math.min(actualLen, wavLen));
} else {
// For non-batch mode, wav is a single concatenated audio
int actualLen = (int) (textToSpeech.sampleRate * duration[0]);
wavOut = new float[Math.min(actualLen, wav.length)];
System.arraycopy(wav, 0, wavOut, 0, wavOut.length);
}
String outputPath = saveDir + "/" + fname;
Helper.writeWavFile(outputPath, wavOut, textToSpeech.sampleRate);
+204 -1
View File
@@ -15,6 +15,8 @@ import java.nio.file.Files;
import java.nio.file.Paths;
import java.text.Normalizer;
import java.util.*;
import java.util.regex.Pattern;
import java.util.regex.Matcher;
/**
* Configuration classes
@@ -152,7 +154,7 @@ class TextToSpeech {
this.ldim = config.ttl.latentDim;
}
public TTSResult call(List<String> textList, Style style, int totalStep, OrtEnvironment env)
private TTSResult _infer(List<String> textList, Style style, int totalStep, OrtEnvironment env)
throws OrtException {
int bsz = textList.size();
@@ -296,6 +298,57 @@ class TextToSpeech {
return new NoisyLatentResult(noisyLatent, latentMask);
}
/**
 * Synthesize speech from a single text with automatic chunking.
 *
 * <p>The text is split with {@code Helper.chunkText(text, 0)} and every chunk is passed to
 * {@code _infer} on its own. Each chunk's audio is cut (or zero-padded) to
 * {@code (int) (sampleRate * duration)} samples, and consecutive chunks are joined with
 * {@code silenceDuration} seconds of zero-valued samples.
 *
 * @param text            text to synthesize
 * @param style           voice style forwarded unchanged to every {@code _infer} call
 * @param totalStep       step count forwarded unchanged to every {@code _infer} call
 * @param silenceDuration seconds of silence inserted between two consecutive chunks
 * @param env             ONNX Runtime environment forwarded to {@code _infer}
 * @return a result whose {@code wav} is the concatenated audio and whose {@code duration}
 *         holds a single element: chunk durations plus the inserted silences, in seconds
 * @throws OrtException if inference of any chunk fails
 */
public TTSResult call(String text, Style style, int totalStep, float silenceDuration, OrtEnvironment env)
        throws OrtException {
    List<String> chunks = Helper.chunkText(text, 0);

    // Loop-invariant. Clamped so a negative duration adds no samples.
    int silenceLen = Math.max(0, (int) (silenceDuration * sampleRate));

    // Keep primitive per-chunk buffers and copy once at the end; boxing every sample
    // into a List<Float> costs an object per sample on long-form audio.
    List<float[]> segments = new ArrayList<>(chunks.size());
    int totalSamples = 0;
    float durCat = 0.0f;

    for (int i = 0; i < chunks.size(); i++) {
        TTSResult result = _infer(Arrays.asList(chunks.get(i)), style, totalStep, env);

        float dur = result.duration[0];
        int wavLen = (int) (sampleRate * dur);
        float[] wavChunk = new float[wavLen];
        // If the reported duration exceeds the produced samples, the tail stays zero.
        System.arraycopy(result.wav, 0, wavChunk, 0, Math.min(wavLen, result.wav.length));

        if (i == 0) {
            durCat = dur;
        } else {
            totalSamples += silenceLen;
            durCat += silenceDuration + dur;
        }
        segments.add(wavChunk);
        totalSamples += wavChunk.length;
    }

    float[] wavArray = new float[totalSamples];
    int offset = 0;
    for (int i = 0; i < segments.size(); i++) {
        if (i > 0) {
            // A fresh float[] is already zero-filled, so the gap only needs skipping.
            offset += silenceLen;
        }
        float[] segment = segments.get(i);
        System.arraycopy(segment, 0, wavArray, offset, segment.length);
        offset += segment.length;
    }

    return new TTSResult(wavArray, new float[]{durCat});
}
/**
 * Synthesize several texts in one inference pass.
 *
 * <p>All arguments are handed to {@code _infer} exactly as received; no chunking is applied.
 *
 * @param textList  texts to synthesize
 * @param style     voice style forwarded to {@code _infer}
 * @param totalStep step count forwarded to {@code _infer}
 * @param env       ONNX Runtime environment forwarded to {@code _infer}
 * @return whatever {@code _infer} produces for the given texts
 * @throws OrtException if inference fails
 */
public TTSResult batch(List<String> textList, Style style, int totalStep, OrtEnvironment env)
        throws OrtException {
    TTSResult batchResult = _infer(textList, style, totalStep, env);
    return batchResult;
}
public void close() throws OrtException {
if (dpSession != null) dpSession.close();
if (textEncSession != null) textEncSession.close();
@@ -353,6 +406,156 @@ class NoisyLatentResult {
*/
public class Helper {
// Chunk length limit that chunkText falls back to when it is called with maxLen == 0.
private static final int MAX_CHUNK_LENGTH = 300;
// Abbreviations ending in a period. splitSentences quotes each entry and puts it in a
// negative lookbehind, so the period of an abbreviation is not treated as a sentence end.
private static final String[] ABBREVIATIONS = {
"Dr.", "Mr.", "Mrs.", "Ms.", "Prof.", "Sr.", "Jr.",
"St.", "Ave.", "Rd.", "Blvd.", "Dept.", "Inc.", "Ltd.",
"Co.", "Corp.", "etc.", "vs.", "i.e.", "e.g.", "Ph.D."
};
/**
 * Chunk text into smaller segments based on paragraphs and sentences.
 *
 * <p>Paragraphs (separated by a blank line) are never merged. A paragraph that fits in
 * {@code maxLen} becomes one chunk. Otherwise its sentences are packed greedily, joined by a
 * single space. A sentence longer than {@code maxLen} is split at commas (re-joined with
 * ", "), and a comma clause that is still too long is split at whitespace. A single word
 * longer than {@code maxLen} is emitted as its own oversized chunk.
 *
 * <p>Pieces are always emitted in their original text order: pending text is flushed before
 * the word-level fallback writes its chunks.
 *
 * @param text   input text
 * @param maxLen maximum chunk length in {@code String.length()} units; zero or a negative
 *               value selects {@code MAX_CHUNK_LENGTH}
 * @return the chunks in reading order; a list holding one empty string when the input is
 *         empty or only whitespace
 */
public static List<String> chunkText(String text, int maxLen) {
    if (maxLen <= 0) {
        maxLen = MAX_CHUNK_LENGTH;
    }

    text = text.trim();
    if (text.isEmpty()) {
        return Arrays.asList("");
    }

    List<String> chunks = new ArrayList<>();

    // Split by paragraphs
    for (String para : text.split("\\n\\s*\\n")) {
        para = para.trim();
        if (para.isEmpty()) {
            continue;
        }
        if (para.length() <= maxLen) {
            chunks.add(para);
            continue;
        }

        // Split by sentences
        StringBuilder current = new StringBuilder();
        for (String sentence : splitSentences(para)) {
            sentence = sentence.trim();
            if (sentence.isEmpty()) {
                continue;
            }

            if (sentence.length() <= maxLen) {
                appendChunkPiece(current, sentence, " ", maxLen, chunks);
                continue;
            }

            // Oversized sentence: start from a clean buffer, then try comma clauses.
            flushChunk(current, chunks);
            for (String part : sentence.split(",")) {
                part = part.trim();
                if (part.isEmpty()) {
                    continue;
                }
                if (part.length() <= maxLen) {
                    appendChunkPiece(current, part, ", ", maxLen, chunks);
                    continue;
                }

                // Emit buffered clauses first so the word chunks below do not
                // overtake text that precedes them in the input.
                flushChunk(current, chunks);

                // Split by space as last resort
                StringBuilder wordChunk = new StringBuilder();
                for (String word : part.split("\\s+")) {
                    appendChunkPiece(wordChunk, word, " ", maxLen, chunks);
                }
                flushChunk(wordChunk, chunks);
            }
        }
        flushChunk(current, chunks);
    }

    if (chunks.isEmpty()) {
        return Arrays.asList("");
    }
    return chunks;
}

/** Moves the buffered text, if any, into {@code chunks} and empties the buffer. */
private static void flushChunk(StringBuilder buffer, List<String> chunks) {
    if (buffer.length() > 0) {
        chunks.add(buffer.toString().trim());
        buffer.setLength(0);
    }
}

/**
 * Appends {@code piece} to the buffer behind {@code separator}, flushing the buffer first
 * when buffer + separator + piece would exceed {@code maxLen}.
 */
private static void appendChunkPiece(StringBuilder buffer, String piece, String separator,
                                     int maxLen, List<String> chunks) {
    if (buffer.length() > 0 && buffer.length() + separator.length() + piece.length() > maxLen) {
        flushChunk(buffer, chunks);
    }
    if (buffer.length() > 0) {
        buffer.append(separator);
    }
    buffer.append(piece);
}
/**
 * Break a paragraph into sentences.
 *
 * <p>A boundary is a run of whitespace that directly follows '.', '!' or '?', unless the
 * text just before the whitespace is one of the entries in {@code ABBREVIATIONS}.
 *
 * @param text paragraph to split
 * @return the sentences in order, as a fixed-size list backed by the split result
 */
private static List<String> splitSentences(String text) {
    // Alternation of the literal (quoted) abbreviations for the negative lookbehind.
    StringJoiner alternation = new StringJoiner("|");
    for (String abbreviation : ABBREVIATIONS) {
        alternation.add(Pattern.quote(abbreviation));
    }

    String boundaryRegex = "(?<!(?:" + alternation.toString() + "))(?<=[.!?])\\s+";
    String[] pieces = Pattern.compile(boundaryRegex).split(text);
    return Arrays.asList(pieces);
}
/**
* Load voice style from JSON files
*/
+24 -3
View File
@@ -2,6 +2,10 @@
This guide provides examples for running TTS inference using `ExampleONNX.java`.
## 📰 Update News
**2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
## Installation
This project uses [Maven](https://maven.apache.org/) for dependency management.
@@ -35,7 +39,7 @@ This will use:
### Example 2: Batch Inference
Process multiple voice styles and texts at once:
```bash
mvn exec:java -Dexec.args="--voice-style assets/voice_styles/M1.json,assets/voice_styles/F1.json --text 'The sun sets behind the mountains, painting the sky in shades of pink and orange.|The weather is beautiful and sunny outside. A gentle breeze makes the air feel fresh and pleasant.'"
mvn exec:java -Dexec.args="--batch --voice-style assets/voice_styles/M1.json,assets/voice_styles/F1.json --text 'The sun sets behind the mountains, painting the sky in shades of pink and orange.|The weather is beautiful and sunny outside. A gentle breeze makes the air feel fresh and pleasant.'"
```
This will:
@@ -54,7 +58,22 @@ This will:
- Use 10 denoising steps instead of the default 5
- Produce higher quality output at the cost of slower inference
**Note**: If your text contains apostrophes, use escaping or run the JAR directly:
### Example 4: Long-Form Inference
The system automatically chunks long texts into manageable segments, synthesizes each segment separately, and concatenates them with natural pauses (0.3 seconds by default) into a single audio file. This happens by default when you don't use the `--batch` flag:
```bash
mvn exec:java -Dexec.args="--voice-style assets/voice_styles/M1.json --text 'This is a very long text that will be automatically split into multiple chunks. The system will process each chunk separately and then concatenate them together with natural pauses between segments. This ensures that even very long texts can be processed efficiently while maintaining natural speech flow and avoiding memory issues.'"
```
This will:
- Automatically split the text into chunks based on paragraph and sentence boundaries
- Synthesize each chunk separately
- Add 0.3 seconds of silence between chunks for natural pauses
- Concatenate all chunks into a single audio file
**Note**: Automatic text chunking is disabled when using `--batch` mode. In batch mode, each text is processed as-is without chunking.
**Tip**: If your text contains apostrophes, use escaping or run the JAR directly:
```bash
java -jar target/tts-example.jar --total-step 10 --text "Text with apostrophe's here"
```
@@ -87,10 +106,12 @@ java -jar target/tts-example.jar --total-step 10 --text "Your custom text here"
| `--voice-style` | str+ | `assets/voice_styles/M1.json` | Voice style file path(s) |
| `--text` | str+ | (long default text) | Text(s) to synthesize |
| `--save-dir` | str | `results` | Output directory |
| `--batch` | flag | False | Enable batch mode (multiple text-style pairs, disables automatic chunking) |
## Notes
- **Batch Processing**: The number of `--voice-style` files must match the number of `--text` entries
- **Batch Processing**: When using `--batch`, the number of `--voice-style` files must match the number of `--text` entries
- **Automatic Chunking**: Without `--batch`, long texts are automatically split and concatenated with 0.3s pauses
- **Quality vs Speed**: Higher `--total-step` values produce better quality but take longer
- **GPU Support**: GPU mode is not supported yet
- **Voice Styles**: Uses pre-extracted voice style JSON files for fast inference
+26 -2
View File
@@ -2,6 +2,10 @@
Node.js implementation for TTS inference. Uses ONNX Runtime to generate speech from text.
## 📰 Update News
**2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
## Requirements
- Node.js v16 or higher
@@ -39,14 +43,16 @@ Process multiple voice styles and texts at once:
```bash
node example_onnx.js \
--voice-style "assets/voice_styles/M1.json,assets/voice_styles/F1.json" \
--text "The sun sets behind the mountains, painting the sky in shades of pink and orange.|The weather is beautiful and sunny outside. A gentle breeze makes the air feel fresh and pleasant."
--text "The sun sets behind the mountains, painting the sky in shades of pink and orange.|The weather is beautiful and sunny outside. A gentle breeze makes the air feel fresh and pleasant." \
--batch
```
This will:
- Use `--batch` flag to enable batch processing mode
- Generate speech for 2 different voice-text pairs
- Use male voice style (M1.json) for the first text
- Use female voice style (F1.json) for the second text
- Process both samples in a single batch
- Process both samples in a single batch (automatic text chunking disabled)
### Example 3: High Quality Inference
Increase denoising steps for better quality:
@@ -61,6 +67,22 @@ This will:
- Use 10 denoising steps instead of the default 5
- Produce higher quality output at the cost of slower inference
### Example 4: Long-Form Inference
For long texts, the system automatically chunks the text into manageable segments and generates a single audio file:
```bash
node example_onnx.js \
--voice-style "assets/voice_styles/M1.json" \
--text "Once upon a time, in a small village nestled between rolling hills, there lived a young artist named Clara. Every morning, she would wake up before dawn to capture the first light of day. The golden rays streaming through her window inspired countless paintings. Her work was known throughout the region for its vibrant colors and emotional depth. People from far and wide came to see her gallery, and many said her paintings could tell stories that words never could."
```
This will:
- Automatically split the long text into smaller chunks (max 300 characters by default)
- Process each chunk separately while maintaining natural speech flow
- Insert brief silences (0.3 seconds) between chunks for natural pacing
- Combine all chunks into a single output audio file
**Note**: When using batch mode (`--batch`), automatic text chunking is disabled. Use non-batch mode for long-form text synthesis.
## Available Arguments
| Argument | Type | Default | Description |
@@ -72,10 +94,12 @@ This will:
| `--voice-style` | str+ | `assets/voice_styles/M1.json` | Voice style file path(s). Separate multiple files with commas |
| `--text` | str+ | (long default text) | Text(s) to synthesize. Separate multiple texts with pipes |
| `--save-dir` | str | `results` | Output directory |
| `--batch` | flag | False | Enable batch mode (disables automatic text chunking) |
## Notes
- **Batch Processing**: The number of voice style files must match the number of texts. Use commas to separate files and pipes to separate texts
- **Long-Form Inference**: Without `--batch` flag, long texts are automatically chunked and combined into a single audio file with natural pauses
- **Quality vs Speed**: Higher `--total-step` values produce better quality but take longer
- **GPU Support**: GPU mode is not supported yet
+10 -3
View File
@@ -18,13 +18,16 @@ function parseArgs() {
nTest: 4,
voiceStyle: ['assets/voice_styles/M1.json'],
text: ['This morning, I took a walk in the park, and the sound of the birds and the breeze was so pleasant that I stopped for a long time just to listen.'],
saveDir: 'results'
saveDir: 'results',
batch: false
};
for (let i = 2; i < process.argv.length; i++) {
const arg = process.argv[i];
if (arg === '--use-gpu') {
args.useGpu = true;
} else if (arg === '--batch') {
args.batch = true;
} else if (arg === '--onnx-dir' && i + 1 < process.argv.length) {
args.onnxDir = process.argv[++i];
} else if (arg === '--total-step' && i + 1 < process.argv.length) {
@@ -56,11 +59,11 @@ async function main() {
const saveDir = args.saveDir;
const voiceStylePaths = args.voiceStyle.map(p => path.resolve(__dirname, p));
const textList = args.text;
const batch = args.batch;
if (voiceStylePaths.length !== textList.length) {
throw new Error(`Number of voice styles (${voiceStylePaths.length}) must match number of texts (${textList.length})`);
}
const bsz = voiceStylePaths.length;
// --- 2. Load Text to Speech --- //
@@ -75,7 +78,11 @@ async function main() {
console.log(`\n[${n + 1}/${nTest}] Starting synthesis...`);
const { wav, duration } = await timer('Generating speech from text', async () => {
return await textToSpeech.call(textList, style, totalStep);
if (batch) {
return await textToSpeech.batch(textList, style, totalStep);
} else {
return await textToSpeech.call(textList[0], style, totalStep);
}
});
if (!fs.existsSync(saveDir)) {
+72 -1
View File
@@ -114,7 +114,7 @@ class TextToSpeech {
return { noisyLatent, latentMask };
}
async call(textList, style, totalStep) {
async _infer(textList, style, totalStep) {
if (textList.length !== style.ttl.dims[0]) {
throw new Error('Number of texts must match number of style vectors');
}
@@ -184,6 +184,35 @@ class TextToSpeech {
const wav = Array.from(vocoderResult.wav_tts.data);
return { wav, duration: durOnnx };
}
async call(text, style, totalStep, silenceDuration = 0.3) {
if (style.ttl.dims[0] !== 1) {
throw new Error('Single speaker text to speech only supports single style');
}
const textList = chunkText(text);
let wavCat = null;
let durCat = 0;
for (const chunk of textList) {
const { wav, duration } = await this._infer([chunk], style, totalStep);
if (wavCat === null) {
wavCat = wav;
durCat = duration[0];
} else {
const silenceLen = Math.floor(silenceDuration * this.sampleRate);
const silence = new Array(silenceLen).fill(0);
wavCat = [...wavCat, ...silence, ...wav];
durCat += duration[0] + silenceDuration;
}
}
return { wav: wavCat, duration: [durCat] };
}
async batch(textList, style, totalStep) {
return await this._infer(textList, style, totalStep);
}
}
/**
@@ -390,3 +419,45 @@ export async function timer(name, fn) {
console.log(` -> ${name} completed in ${elapsed} sec`);
return result;
}
/**
 * Chunk text into manageable segments.
 *
 * Paragraphs (separated by a blank line) are never merged. Inside a paragraph, sentences
 * are packed greedily into chunks of at most `maxLen` characters, joined by one space.
 * A sentence that is itself longer than `maxLen` is split after commas, and a clause
 * that is still too long is split into words, so only a single word longer than
 * `maxLen` can produce an oversized chunk.
 *
 * @param {string} text - Input text.
 * @param {number} [maxLen=300] - Maximum chunk length in characters.
 * @returns {string[]} Chunks in reading order; empty when the text has no content.
 * @throws {Error} If `text` is not a string.
 */
function chunkText(text, maxLen = 300) {
    if (typeof text !== 'string') {
        throw new Error(`chunkText expects a string, got ${typeof text}`);
    }

    // Break a sentence that cannot fit in one chunk into smaller pieces:
    // clauses first (comma stays attached), individual words as a last resort.
    const splitOversized = (sentence) => {
        if (sentence.length <= maxLen) {
            return [sentence];
        }
        return sentence
            .split(/(?<=,)\s+/)
            .flatMap(clause => (clause.length <= maxLen ? [clause] : clause.split(/\s+/)))
            .filter(Boolean);
    };

    // Split by paragraph (two or more newlines)
    const paragraphs = text.trim().split(/\n\s*\n+/).filter(p => p.trim());

    const chunks = [];

    for (let paragraph of paragraphs) {
        paragraph = paragraph.trim();
        if (!paragraph) continue;

        // Split by sentence boundaries (period, question mark, exclamation mark followed by space)
        // But exclude common abbreviations like Mr., Mrs., Dr., etc. and single capital letters like F.
        const sentences = paragraph.split(/(?<!Mr\.|Mrs\.|Ms\.|Dr\.|Prof\.|Sr\.|Jr\.|Ph\.D\.|etc\.|e\.g\.|i\.e\.|vs\.|Inc\.|Ltd\.|Co\.|Corp\.|St\.|Ave\.|Blvd\.)(?<!\b[A-Z]\.)(?<=[.!?])\s+/);

        let currentChunk = "";

        for (const sentence of sentences) {
            for (const piece of splitOversized(sentence)) {
                if (currentChunk.length + piece.length + 1 <= maxLen) {
                    currentChunk += (currentChunk ? " " : "") + piece;
                } else {
                    if (currentChunk) {
                        chunks.push(currentChunk.trim());
                    }
                    currentChunk = piece;
                }
            }
        }

        if (currentChunk) {
            chunks.push(currentChunk.trim());
        }
    }

    return chunks;
}
+26 -2
View File
@@ -2,6 +2,10 @@
This guide provides examples for running TTS inference using `example_onnx.py`.
## 📰 Update News
**2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
## Installation
This project uses [uv](https://docs.astral.sh/uv/) for fast package management.
@@ -41,14 +45,16 @@ Process multiple voice styles and texts at once:
```bash
uv run example_onnx.py \
--voice-style assets/voice_styles/M1.json assets/voice_styles/F1.json \
--text "The sun sets behind the mountains, painting the sky in shades of pink and orange." "The weather is beautiful and sunny outside. A gentle breeze makes the air feel fresh and pleasant."
--text "The sun sets behind the mountains, painting the sky in shades of pink and orange." "The weather is beautiful and sunny outside. A gentle breeze makes the air feel fresh and pleasant." \
--batch
```
This will:
- Use `--batch` flag to enable batch processing mode
- Generate speech for 2 different voice-text pairs
- Use male voice style (M1.json) for the first text
- Use female voice style (F1.json) for the second text
- Process both samples in a single batch
- Process both samples in a single batch (automatic text chunking disabled)
### Example 3: High Quality Inference
Increase denoising steps for better quality:
@@ -63,6 +69,22 @@ This will:
- Use 10 denoising steps instead of the default 5
- Produce higher quality output at the cost of slower inference
### Example 4: Long-Form Inference
For long texts, the system automatically chunks the text into manageable segments and generates a single audio file:
```bash
uv run example_onnx.py \
--voice-style assets/voice_styles/M1.json \
--text "Once upon a time, in a small village nestled between rolling hills, there lived a young artist named Clara. Every morning, she would wake up before dawn to capture the first light of day. The golden rays streaming through her window inspired countless paintings. Her work was known throughout the region for its vibrant colors and emotional depth. People from far and wide came to see her gallery, and many said her paintings could tell stories that words never could."
```
This will:
- Automatically split the long text into smaller chunks (max 300 characters by default)
- Process each chunk separately while maintaining natural speech flow
- Insert brief silences (0.3 seconds) between chunks for natural pacing
- Combine all chunks into a single output audio file
**Note**: When using batch mode (`--batch`), automatic text chunking is disabled. Use non-batch mode for long-form text synthesis.
## Available Arguments
| Argument | Type | Default | Description |
@@ -74,10 +96,12 @@ This will:
| `--voice-style` | str+ | `assets/voice_styles/M1.json` | Voice style file path(s) |
| `--text` | str+ | (long default text) | Text(s) to synthesize |
| `--save-dir` | str | `results` | Output directory |
| `--batch` | flag | False | Enable batch mode (disables automatic text chunking) |
## Notes
- **Batch Processing**: The number of `--voice-style` files must match the number of `--text` entries
- **Long-Form Inference**: Without `--batch` flag, long texts are automatically chunked and combined into a single audio file with natural pauses
- **Quality vs Speed**: Higher `--total-step` values produce better quality but take longer
- **GPU Support**: GPU mode is not supported yet
+8 -2
View File
@@ -30,6 +30,9 @@ def parse_args():
"--n-test", type=int, default=4, help="Number of times to generate"
)
# Batch processing
parser.add_argument("--batch", action="store_true", help="Batch processing")
# Input/Output
parser.add_argument(
"--voice-style",
@@ -63,11 +66,11 @@ n_test = args.n_test
save_dir = args.save_dir
voice_style_paths = args.voice_style
text_list = args.text
batch = args.batch
assert len(voice_style_paths) == len(
text_list
), f"Number of voice styles ({len(voice_style_paths)}) must match number of texts ({len(text_list)})"
bsz = len(voice_style_paths)
# --- 2. Load Text to Speech --- #
@@ -80,7 +83,10 @@ style = load_voice_style(voice_style_paths, verbose=True)
for n in range(n_test):
print(f"\n[{n+1}/{n_test}] Starting synthesis...")
with timer("Generating speech from text"):
wav, duration = text_to_speech(text_list, style, total_step)
if batch:
wav, duration = text_to_speech.batch(text_list, style, total_step)
else:
wav, duration = text_to_speech(text_list[0], style, total_step)
if not os.path.exists(save_dir):
os.makedirs(save_dir)
for b in range(bsz):
+72 -1
View File
@@ -85,7 +85,7 @@ class TextToSpeech:
noisy_latent = noisy_latent * latent_mask
return noisy_latent, latent_mask
def __call__(
def _infer(
self, text_list: list[str], style: Style, total_step: int
) -> tuple[np.ndarray, np.ndarray]:
assert (
@@ -119,6 +119,33 @@ class TextToSpeech:
wav, *_ = self.vocoder_ort.run(None, {"latent": xt})
return wav, dur_onnx
def __call__(
    self, text: str, style: Style, total_step: int, silence_duration: float = 0.3
) -> tuple[np.ndarray, np.ndarray]:
    """
    Synthesize one text with automatic chunking.

    The text is split with ``chunk_text``; every chunk is run through ``_infer``
    on its own and the audio is joined along axis 1 with ``silence_duration``
    seconds of zeros between consecutive chunks.

    Args:
        text: Text to synthesize
        style: Voice style; ``style.ttl.shape[0]`` must be 1
        total_step: Forwarded unchanged to ``_infer``
        silence_duration: Seconds of silence inserted between chunks (default: 0.3)
    Returns:
        ``(wav, duration)``: the concatenated audio and the summed duration
        (chunk durations plus the inserted silences). When the text yields no
        chunks, an empty ``(1, 0)`` audio array and a zero duration are returned.
    """
    assert (
        style.ttl.shape[0] == 1
    ), "Single speaker text to speech only supports single style"

    chunks = chunk_text(text)
    if not chunks:
        # Nothing to synthesize: return empty arrays instead of (None, None) so the
        # declared return type holds.
        # NOTE(review): assumes _infer durations are 1-D float32 per batch item - confirm.
        return np.zeros((1, 0), dtype=np.float32), np.zeros((1,), dtype=np.float32)

    # Gather all pieces and concatenate once; concatenating pairwise inside the
    # loop re-copies the accumulated audio for every chunk.
    pieces = []
    dur_cat = None
    silence = None
    for chunk in chunks:
        wav, dur_onnx = self._infer([chunk], style, total_step)
        if dur_cat is None:
            dur_cat = dur_onnx
        else:
            if silence is None:
                silence = np.zeros(
                    (1, int(silence_duration * self.sample_rate)), dtype=np.float32
                )
            pieces.append(silence)
            # Out-of-place add: leaves the first chunk's duration array untouched.
            dur_cat = dur_cat + (dur_onnx + silence_duration)
        pieces.append(wav)

    wav_cat = pieces[0] if len(pieces) == 1 else np.concatenate(pieces, axis=1)
    return wav_cat, dur_cat
def batch(
    self, text_list: list[str], style: Style, total_step: int
) -> tuple[np.ndarray, np.ndarray]:
    """Synthesize several texts in one pass by delegating to ``_infer`` (no chunking)."""
    batch_result = self._infer(text_list, style, total_step)
    return batch_result
def length_to_mask(lengths: np.ndarray, max_len: Optional[int] = None) -> np.ndarray:
"""
@@ -247,3 +274,47 @@ def sanitize_filename(text: str, max_len: int) -> str:
prefix = text[:max_len]
return re.sub(r"[^a-zA-Z0-9]", "_", prefix)
def chunk_text(text: str, max_len: int = 300) -> list[str]:
    """
    Split text into chunks by paragraphs and sentences.

    Paragraphs (separated by a blank line) are never merged. Inside a paragraph,
    sentences are packed greedily into chunks of at most ``max_len`` characters,
    joined by one space. A sentence that is itself longer than ``max_len`` is
    split after commas, and a clause that is still too long is split into words,
    so only a single word longer than ``max_len`` can yield an oversized chunk.

    Args:
        text: Input text to chunk
        max_len: Maximum length of each chunk (default: 300)
    Returns:
        List of text chunks in reading order (empty if the text has no content)
    """
    import re

    # Split by sentence boundaries (period, question mark, exclamation mark followed by space)
    # But exclude common abbreviations like Mr., Mrs., Dr., etc. and single capital letters like F.
    pattern = r"(?<!Mr\.)(?<!Mrs\.)(?<!Ms\.)(?<!Dr\.)(?<!Prof\.)(?<!Sr\.)(?<!Jr\.)(?<!Ph\.D\.)(?<!etc\.)(?<!e\.g\.)(?<!i\.e\.)(?<!vs\.)(?<!Inc\.)(?<!Ltd\.)(?<!Co\.)(?<!Corp\.)(?<!St\.)(?<!Ave\.)(?<!Blvd\.)(?<!\b[A-Z]\.)(?<=[.!?])\s+"

    def split_oversized(sentence: str) -> list[str]:
        # Break a sentence that cannot fit in one chunk: clauses first (the comma
        # stays attached), individual words as a last resort.
        if len(sentence) <= max_len:
            return [sentence] if sentence else []
        pieces: list[str] = []
        for clause in re.split(r"(?<=,)\s+", sentence):
            if len(clause) <= max_len:
                pieces.append(clause)
            else:
                pieces.extend(clause.split())
        return [piece for piece in pieces if piece]

    # Split by paragraph (two or more newlines)
    paragraphs = [p.strip() for p in re.split(r"\n\s*\n+", text.strip()) if p.strip()]

    chunks: list[str] = []
    for paragraph in paragraphs:
        current_chunk = ""
        for sentence in re.split(pattern, paragraph):
            for piece in split_oversized(sentence):
                if len(current_chunk) + len(piece) + 1 <= max_len:
                    current_chunk += (" " if current_chunk else "") + piece
                else:
                    if current_chunk:
                        chunks.append(current_chunk.strip())
                    current_chunk = piece

        if current_chunk:
            chunks.append(current_chunk.strip())

    return chunks
+3
View File
@@ -32,6 +32,9 @@ anyhow = "1.0"
# Unicode normalization
unicode-normalization = "0.1"
# Regular expressions
regex = "1.10"
# System calls
libc = "0.2"
+32 -1
View File
@@ -2,6 +2,10 @@
This guide provides examples for running TTS inference using Rust.
## 📰 Update News
**2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
## Installation
This project uses [Cargo](https://doc.rust-lang.org/cargo/) for package management.
@@ -44,11 +48,13 @@ Process multiple voice styles and texts at once:
```bash
# Using cargo run
cargo run --release --bin example_onnx -- \
--batch \
--voice-style assets/voice_styles/M1.json,assets/voice_styles/F1.json \
--text "The sun sets behind the mountains, painting the sky in shades of pink and orange.|The weather is beautiful and sunny outside. A gentle breeze makes the air feel fresh and pleasant."
# Or using the binary directly
./target/release/example_onnx \
--batch \
--voice-style assets/voice_styles/M1.json,assets/voice_styles/F1.json \
--text "The sun sets behind the mountains, painting the sky in shades of pink and orange.|The weather is beautiful and sunny outside. A gentle breeze makes the air feel fresh and pleasant."
```
@@ -79,6 +85,29 @@ This will:
- Use 10 denoising steps instead of the default 5
- Produce higher quality output at the cost of slower inference
### Example 4: Long-Form Inference
The system automatically chunks long texts into manageable segments, synthesizes each segment separately, and concatenates them with natural pauses (0.3 seconds by default) into a single audio file. This happens by default when you don't use the `--batch` flag:
```bash
# Using cargo run
cargo run --release --bin example_onnx -- \
--voice-style assets/voice_styles/M1.json \
--text "This is a very long text that will be automatically split into multiple chunks. The system will process each chunk separately and then concatenate them together with natural pauses between segments. This ensures that even very long texts can be processed efficiently while maintaining natural speech flow and avoiding memory issues."
# Or using the binary directly
./target/release/example_onnx \
--voice-style assets/voice_styles/M1.json \
--text "This is a very long text that will be automatically split into multiple chunks. The system will process each chunk separately and then concatenate them together with natural pauses between segments. This ensures that even very long texts can be processed efficiently while maintaining natural speech flow and avoiding memory issues."
```
This will:
- Automatically split the text into chunks based on paragraph and sentence boundaries
- Synthesize each chunk separately
- Add 0.3 seconds of silence between chunks for natural pauses
- Concatenate all chunks into a single audio file
**Note**: Automatic text chunking is disabled when using `--batch` mode. In batch mode, each text is processed as-is without chunking.
## Available Arguments
| Argument | Type | Default | Description |
@@ -90,10 +119,12 @@ This will:
| `--voice-style` | str+ | `assets/voice_styles/M1.json` | Voice style file path(s) |
| `--text` | str+ | (long default text) | Text(s) to synthesize |
| `--save-dir` | str | `results` | Output directory |
| `--batch` | flag | False | Enable batch mode (multiple text-style pairs, disables automatic chunking) |
## Notes
- **Batch Processing**: The number of `--voice-style` files must match the number of `--text` entries
- **Batch Processing**: When using `--batch`, the number of `--voice-style` files must match the number of `--text` entries
- **Automatic Chunking**: Without `--batch`, long texts are automatically split and concatenated with 0.3s pauses
- **Quality vs Speed**: Higher `--total-step` values produce better quality but take longer
- **GPU Support**: GPU mode is not supported yet
- **Known Issues**: On some platforms (especially macOS), there might be a mutex cleanup warning during exit. This is a known ONNX Runtime issue and doesn't affect functionality. The implementation uses `libc::_exit()` and `mem::forget()` to bypass this issue.
+34 -15
View File
@@ -41,6 +41,10 @@ struct Args {
/// Output directory
#[arg(long, default_value = "results")]
save_dir: String,
/// Enable batch mode (multiple text-style pairs)
#[arg(long, default_value = "false")]
batch: bool,
}
fn main() -> Result<()> {
@@ -53,13 +57,16 @@ fn main() -> Result<()> {
let voice_style_paths = &args.voice_style;
let text_list = &args.text;
let save_dir = &args.save_dir;
let batch = args.batch;
if voice_style_paths.len() != text_list.len() {
anyhow::bail!(
"Number of voice styles ({}) must match number of texts ({})",
voice_style_paths.len(),
text_list.len()
);
if batch {
if voice_style_paths.len() != text_list.len() {
anyhow::bail!(
"Number of voice styles ({}) must match number of texts ({})",
voice_style_paths.len(),
text_list.len()
);
}
}
let bsz = voice_style_paths.len();
@@ -76,19 +83,31 @@ fn main() -> Result<()> {
for n in 0..n_test {
println!("\n[{}/{}] Starting synthesis...", n + 1, n_test);
let (wav, duration) = timer("Generating speech from text", || {
text_to_speech.call(text_list, &style, total_step)
})?;
let (wav, duration) = if batch {
timer("Generating speech from text", || {
text_to_speech.batch(text_list, &style, total_step)
})?
} else {
let (w, d) = timer("Generating speech from text", || {
text_to_speech.call(&text_list[0], &style, total_step, 0.3)
})?;
(w, vec![d])
};
// Save outputs
let wav_len = wav.len() / bsz;
for i in 0..bsz {
let fname = format!("{}_{}.wav", sanitize_filename(&text_list[i], 20), n + 1);
let actual_len = (text_to_speech.sample_rate as f32 * duration[i]) as usize;
let wav_start = i * wav_len;
let wav_end = wav_start + actual_len.min(wav_len);
let wav_slice = &wav[wav_start..wav_end];
let wav_slice = if batch {
let wav_len = wav.len() / bsz;
let actual_len = (text_to_speech.sample_rate as f32 * duration[i]) as usize;
let wav_start = i * wav_len;
let wav_end = wav_start + actual_len.min(wav_len);
&wav[wav_start..wav_end]
} else {
// For non-batch mode, wav is a single concatenated audio
let actual_len = (text_to_speech.sample_rate as f32 * duration[0]) as usize;
&wav[..actual_len.min(wav.len())]
};
let output_path = PathBuf::from(save_dir).join(&fname);
write_wav_file(&output_path, wav_slice, text_to_speech.sample_rate)?;
+227 -1
View File
@@ -12,6 +12,7 @@ use anyhow::{Result, Context};
use unicode_normalization::UnicodeNormalization;
use hound::{WavWriter, WavSpec, SampleFormat};
use rand_distr::{Distribution, Normal};
use regex::Regex;
// ============================================================================
// Configuration Structures
@@ -218,6 +219,187 @@ pub fn write_wav_file<P: AsRef<Path>>(
Ok(())
}
// ============================================================================
// Text Chunking
// ============================================================================
/// Chunk length limit that `chunk_text` uses when its `max_len` argument is `None`.
const MAX_CHUNK_LENGTH: usize = 300;
/// Abbreviations that end in a period.
// NOTE(review): presumably consulted by `split_sentences` so these periods are not
// treated as sentence ends - confirm against its body.
const ABBREVIATIONS: &[&str] = &[
"Dr.", "Mr.", "Mrs.", "Ms.", "Prof.", "Sr.", "Jr.",
"St.", "Ave.", "Rd.", "Blvd.", "Dept.", "Inc.", "Ltd.",
"Co.", "Corp.", "etc.", "vs.", "i.e.", "e.g.", "Ph.D.",
];
/// Splits `text` into chunks based on paragraphs and sentences.
///
/// Paragraphs (separated by a blank line) are never merged. A paragraph that fits in
/// `max_len` becomes one chunk. Otherwise its sentences are packed greedily, joined by a
/// single space. A sentence longer than `max_len` is split at commas (re-joined with
/// ", "), and a comma clause that is still too long is split at whitespace. A single word
/// longer than `max_len` is emitted as its own oversized chunk.
///
/// Pieces are always emitted in their original text order: pending text is flushed before
/// the word-level fallback writes its chunks.
///
/// Lengths are measured with `str::len`, i.e. in bytes.
///
/// Returns the chunks in reading order, or a vector holding one empty string when the
/// input is empty or only whitespace. `max_len` of `None` selects `MAX_CHUNK_LENGTH`.
pub fn chunk_text(text: &str, max_len: Option<usize>) -> Vec<String> {
    /// Moves the buffered text, if any, into `chunks` and empties the buffer.
    fn flush(buffer: &mut String, chunks: &mut Vec<String>) {
        if !buffer.is_empty() {
            chunks.push(buffer.trim().to_string());
            buffer.clear();
        }
    }

    /// Appends `piece` behind `separator`, flushing first when
    /// buffer + separator + piece would exceed `max_len`.
    fn append_piece(
        buffer: &mut String,
        piece: &str,
        separator: &str,
        max_len: usize,
        chunks: &mut Vec<String>,
    ) {
        if !buffer.is_empty() && buffer.len() + separator.len() + piece.len() > max_len {
            flush(buffer, chunks);
        }
        if !buffer.is_empty() {
            buffer.push_str(separator);
        }
        buffer.push_str(piece);
    }

    let max_len = max_len.unwrap_or(MAX_CHUNK_LENGTH);

    let text = text.trim();
    if text.is_empty() {
        return vec![String::new()];
    }

    // Split by paragraphs
    let para_re = Regex::new(r"\n\s*\n").unwrap();
    let mut chunks: Vec<String> = Vec::new();

    for para in para_re.split(text) {
        let para = para.trim();
        if para.is_empty() {
            continue;
        }
        if para.len() <= max_len {
            chunks.push(para.to_string());
            continue;
        }

        // Split by sentences
        let mut current = String::new();
        for sentence in split_sentences(para) {
            let sentence = sentence.trim();
            if sentence.is_empty() {
                continue;
            }

            if sentence.len() <= max_len {
                append_piece(&mut current, sentence, " ", max_len, &mut chunks);
                continue;
            }

            // Oversized sentence: start from a clean buffer, then try comma clauses.
            flush(&mut current, &mut chunks);
            for part in sentence.split(',') {
                let part = part.trim();
                if part.is_empty() {
                    continue;
                }
                if part.len() <= max_len {
                    append_piece(&mut current, part, ", ", max_len, &mut chunks);
                    continue;
                }

                // Emit buffered clauses first so the word chunks below do not
                // overtake text that precedes them in the input.
                flush(&mut current, &mut chunks);

                // Split by space as last resort
                let mut word_chunk = String::new();
                for word in part.split_whitespace() {
                    append_piece(&mut word_chunk, word, " ", max_len, &mut chunks);
                }
                flush(&mut word_chunk, &mut chunks);
            }
        }
        flush(&mut current, &mut chunks);
    }

    if chunks.is_empty() {
        vec![String::new()]
    } else {
        chunks
    }
}
/// Splits `text` into sentences at `.`, `!` or `?` followed by whitespace.
///
/// A candidate boundary is skipped when the text accumulated since the last
/// accepted boundary (trimmed, plus the punctuation character) ends with an
/// entry of `ABBREVIATIONS`. Each returned sentence keeps its terminating
/// punctuation and the whitespace that followed it; callers trim as needed.
///
/// Always returns at least one element; text without any boundary is returned
/// whole.
fn split_sentences(text: &str) -> Vec<String> {
    // The `regex` crate has no lookbehind, so abbreviations are filtered
    // after matching instead of inside the pattern.
    let boundary_re = Regex::new(r"([.!?])\s+").unwrap();

    let mut pieces: Vec<String> = Vec::new();
    let mut cursor: usize = 0;

    for hit in boundary_re.find_iter(text) {
        // Text since the last accepted boundary, plus the punctuation mark
        // (the first byte of the match).
        let head = text[cursor..hit.start()].trim();
        let punct = &text[hit.start()..hit.start() + 1];
        let candidate = format!("{}{}", head, punct);

        let ends_with_abbreviation = ABBREVIATIONS
            .iter()
            .any(|abbr| candidate.ends_with(*abbr));
        if ends_with_abbreviation {
            continue;
        }

        // Real sentence boundary: cut after the trailing whitespace.
        pieces.push(text[cursor..hit.end()].to_string());
        cursor = hit.end();
    }

    // Whatever follows the last accepted boundary (the whole text when no
    // boundary was accepted).
    if cursor < text.len() {
        pieces.push(text[cursor..].to_string());
    }

    if pieces.is_empty() {
        return vec![text.to_string()];
    }
    pieces
}
// ============================================================================
// Utility Functions
// ============================================================================
@@ -297,7 +479,7 @@ impl TextToSpeech {
}
}
pub fn call(
fn _infer(
&mut self,
text_list: &[String],
style: &Style,
@@ -396,6 +578,50 @@ impl TextToSpeech {
Ok((wav, duration))
}
/// Synthesizes `text` as a single waveform, chunking long input automatically.
///
/// The text is split with `chunk_text` (default chunk length), each chunk is
/// run through `_infer` on its own, and the results are concatenated with
/// `silence_duration` seconds of zero samples between consecutive chunks.
/// Each chunk's audio is cut to `sample_rate * duration` samples before it is
/// appended.
///
/// Returns the concatenated samples and the total duration in seconds
/// (chunk durations plus the inserted silences). Errors from `_infer` are
/// propagated.
pub fn call(
    &mut self,
    text: &str,
    style: &Style,
    total_step: usize,
    silence_duration: f32,
) -> Result<(Vec<f32>, f32)> {
    let mut joined_wav: Vec<f32> = Vec::new();
    let mut total_duration: f32 = 0.0;

    for (idx, piece) in chunk_text(text, None).into_iter().enumerate() {
        let (piece_wav, piece_durations) = self._infer(&[piece], style, total_step)?;
        let piece_duration = piece_durations[0];

        // Keep only the samples covered by the reported duration.
        let keep = ((self.sample_rate as f32 * piece_duration) as usize).min(piece_wav.len());

        if idx == 0 {
            total_duration = piece_duration;
        } else {
            // Pad with silence before every chunk except the first.
            let gap = (silence_duration * self.sample_rate as f32) as usize;
            joined_wav.resize(joined_wav.len() + gap, 0.0f32);
            total_duration += silence_duration + piece_duration;
        }
        joined_wav.extend_from_slice(&piece_wav[..keep]);
    }

    Ok((joined_wav, total_duration))
}
/// Runs inference on all of `text_list` in one `_infer` call.
///
/// Unlike `call`, the texts are passed through as-is: no chunking is applied
/// and no silence is inserted. The result of `_infer` is returned unchanged.
pub fn batch(
&mut self,
text_list: &[String],
style: &Style,
total_step: usize,
) -> Result<(Vec<f32>, Vec<f32>)> {
self._infer(text_list, style, total_step)
}
}
// ============================================================================
+25 -1
View File
@@ -2,6 +2,10 @@
This guide provides examples for running TTS inference using `example_onnx`.
## 📰 Update News
**2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
## Installation
This project uses Swift Package Manager (SPM) for dependency management.
@@ -34,6 +38,7 @@ This will use:
Process multiple voice styles and texts at once:
```bash
.build/release/example_onnx \
--batch \
--voice-style assets/voice_styles/M1.json,assets/voice_styles/F1.json \
--text "The sun sets behind the mountains, painting the sky in shades of pink and orange.|The weather is beautiful and sunny outside. A gentle breeze makes the air feel fresh and pleasant."
```
@@ -57,6 +62,23 @@ This will:
- Use 10 denoising steps instead of the default 5
- Produce higher quality output at the cost of slower inference
### Example 4: Long-Form Inference
The system automatically chunks long texts into manageable segments, synthesizes each segment separately, and concatenates them with natural pauses (0.3 seconds by default) into a single audio file. This happens by default when you don't use the `--batch` flag:
```bash
.build/release/example_onnx \
--voice-style assets/voice_styles/M1.json \
--text "This is a very long text that will be automatically split into multiple chunks. The system will process each chunk separately and then concatenate them together with natural pauses between segments. This ensures that even very long texts can be processed efficiently while maintaining natural speech flow and avoiding memory issues."
```
This will:
- Automatically split the text into chunks based on paragraph and sentence boundaries
- Synthesize each chunk separately
- Add 0.3 seconds of silence between chunks for natural pauses
- Concatenate all chunks into a single audio file
**Note**: Automatic text chunking is disabled when using `--batch` mode. In batch mode, each text is processed as-is without chunking.
## Available Arguments
| Argument | Type | Default | Description |
@@ -68,9 +90,11 @@ This will:
| `--voice-style` | str+ | `assets/voice_styles/M1.json` | Voice style file path(s) |
| `--text` | str+ | (long default text) | Text(s) to synthesize |
| `--save-dir` | str | `results` | Output directory |
| `--batch` | flag | False | Enable batch mode (multiple text-style pairs, disables automatic chunking) |
## Notes
- **Batch Processing**: The number of `--voice-style` files must match the number of `--text` entries
- **Batch Processing**: When using `--batch`, the number of `--voice-style` files must match the number of `--text` entries
- **Automatic Chunking**: Without `--batch`, long texts are automatically split and concatenated with 0.3s pauses
- **Quality vs Speed**: Higher `--total-step` values produce better quality but take longer
- **GPU Support**: GPU mode is not supported yet
+35 -10
View File
@@ -9,6 +9,7 @@ struct Args {
var voiceStyle: [String] = ["assets/voice_styles/M1.json"]
var text: [String] = ["This morning, I took a walk in the park, and the sound of the birds and the breeze was so pleasant that I stopped for a long time just to listen."]
var saveDir: String = "results"
var batch: Bool = false
}
func parseArgs() -> Args {
@@ -52,6 +53,8 @@ func parseArgs() -> Args {
args.saveDir = arguments[i + 1]
i += 1
}
case "--batch":
args.batch = true
default:
break
}
@@ -70,9 +73,11 @@ struct ExampleONNX {
// --- 1. Parse arguments --- //
let args = parseArgs()
guard args.voiceStyle.count == args.text.count else {
print("Error: Number of voice styles (\(args.voiceStyle.count)) must match number of texts (\(args.text.count))")
return
if args.batch {
guard args.voiceStyle.count == args.text.count else {
print("Error: Number of voice styles (\(args.voiceStyle.count)) must match number of texts (\(args.text.count))")
return
}
}
let bsz = args.voiceStyle.count
@@ -92,19 +97,39 @@ struct ExampleONNX {
for n in 0..<args.nTest {
print("\n[\(n + 1)/\(args.nTest)] Starting synthesis...")
let (wav, duration) = try timer("Generating speech from text") {
try textToSpeech.call(args.text, style, args.totalStep)
let wav: [Float]
let duration: [Float]
if args.batch {
let result = try timer("Generating speech from text") {
try textToSpeech.batch(args.text, style, args.totalStep)
}
wav = result.wav
duration = result.duration
} else {
let result = try timer("Generating speech from text") {
try textToSpeech.call(args.text[0], style, args.totalStep, silenceDuration: 0.3)
}
wav = result.wav
duration = [result.duration]
}
// Save outputs
let wavLen = wav.count / bsz
for i in 0..<bsz {
let fname = "\(sanitizeFilename(args.text[i], maxLen: 20))_\(n + 1).wav"
let actualLen = Int(Float(textToSpeech.sampleRate) * duration[i])
let wavOut: [Float]
let wavStart = i * wavLen
let wavEnd = min(wavStart + actualLen, wavStart + wavLen)
let wavOut = Array(wav[wavStart..<wavEnd])
if args.batch {
let wavLen = wav.count / bsz
let actualLen = Int(Float(textToSpeech.sampleRate) * duration[i])
let wavStart = i * wavLen
let wavEnd = min(wavStart + actualLen, wavStart + wavLen)
wavOut = Array(wav[wavStart..<wavEnd])
} else {
// For non-batch mode, wav is a single concatenated audio
let actualLen = Int(Float(textToSpeech.sampleRate) * duration[0])
wavOut = Array(wav.prefix(actualLen))
}
let outputPath = "\(args.saveDir)/\(fname)"
try writeWavFile(outputPath, wavOut, textToSpeech.sampleRate)
+227 -1
View File
@@ -203,6 +203,199 @@ func writeWavFile(_ filename: String, _ audioData: [Float], _ sampleRate: Int) t
try data.write(to: url)
}
// MARK: - Text Chunking
/// Default maximum chunk length; `chunkText` uses it when `maxLen` is not positive.
let MAX_CHUNK_LENGTH = 300
/// Period-terminated abbreviations. `splitSentences` does not treat a period as a
/// sentence boundary when the text up to and including that period ends with one
/// of these entries (the comparison is case-sensitive).
let ABBREVIATIONS = [
"Dr.", "Mr.", "Mrs.", "Ms.", "Prof.", "Sr.", "Jr.",
"St.", "Ave.", "Rd.", "Blvd.", "Dept.", "Inc.", "Ltd.",
"Co.", "Corp.", "etc.", "vs.", "i.e.", "e.g.", "Ph.D."
]
/// Splits `text` into chunks of at most `maxLen` characters, in reading order.
///
/// The text is broken down from coarsest to finest unit:
/// 1. Paragraphs (separated by a blank line). A paragraph that fits is emitted
///    unchanged as a single chunk.
/// 2. Sentences (see `splitSentences`), greedily packed and joined by one space.
/// 3. A sentence that is too long on its own is split on commas; the parts are
///    greedily packed and re-joined with ", ". A comma that falls on a chunk
///    boundary is dropped.
/// 4. A comma part that is still too long is split on whitespace and packed word
///    by word. A single word longer than the limit is emitted as its own
///    over-long chunk.
///
/// - Parameters:
///   - text: Input text; surrounding whitespace and newlines are ignored.
///   - maxLen: Maximum chunk length in `Character`s. Values `<= 0` select
///     `MAX_CHUNK_LENGTH`.
/// - Returns: A non-empty array. Empty or whitespace-only input yields `[""]`.
func chunkText(_ text: String, maxLen: Int = 0) -> [String] {
    let actualMaxLen = maxLen > 0 ? maxLen : MAX_CHUNK_LENGTH
    let trimmedText = text.trimmingCharacters(in: CharacterSet.whitespacesAndNewlines)
    if trimmedText.isEmpty {
        return [""]
    }

    // Split by paragraphs using regex
    let paraPattern = try! NSRegularExpression(pattern: "\\n\\s*\\n")
    let paraRange = NSRange(trimmedText.startIndex..., in: trimmedText)
    var paragraphs = [String]()
    var lastEnd = trimmedText.startIndex

    paraPattern.enumerateMatches(in: trimmedText, range: paraRange) { match, _, _ in
        if let match = match, let range = Range(match.range, in: trimmedText) {
            paragraphs.append(String(trimmedText[lastEnd..<range.lowerBound]))
            lastEnd = range.upperBound
        }
    }
    if lastEnd < trimmedText.endIndex {
        paragraphs.append(String(trimmedText[lastEnd...]))
    }
    if paragraphs.isEmpty {
        paragraphs = [trimmedText]
    }

    var chunks = [String]()

    for para in paragraphs {
        let trimmedPara = para.trimmingCharacters(in: CharacterSet.whitespacesAndNewlines)
        if trimmedPara.isEmpty {
            continue
        }
        if trimmedPara.count <= actualMaxLen {
            chunks.append(trimmedPara)
            continue
        }

        // Split by sentences and pack them greedily.
        var current = ""
        var currentLen = 0

        // Emits the pending chunk (if any) and resets the accumulator.
        func flushCurrent() {
            if !current.isEmpty {
                chunks.append(current.trimmingCharacters(in: CharacterSet.whitespacesAndNewlines))
                current = ""
                currentLen = 0
            }
        }

        for sentence in splitSentences(trimmedPara) {
            let trimmedSentence = sentence.trimmingCharacters(in: CharacterSet.whitespacesAndNewlines)
            if trimmedSentence.isEmpty {
                continue
            }
            let sentenceLen = trimmedSentence.count

            if sentenceLen > actualMaxLen {
                // The sentence cannot fit in any chunk: emit what we have and
                // fall back to comma parts, then to words.
                flushCurrent()

                for part in trimmedSentence.components(separatedBy: ",") {
                    let trimmedPart = part.trimmingCharacters(in: CharacterSet.whitespacesAndNewlines)
                    if trimmedPart.isEmpty {
                        continue
                    }
                    let partLen = trimmedPart.count

                    if partLen > actualMaxLen {
                        // Emit earlier comma parts first; otherwise the word
                        // chunks below would be appended ahead of text that
                        // precedes them in the input.
                        flushCurrent()

                        // Split by space as last resort
                        let words = trimmedPart.components(separatedBy: CharacterSet.whitespaces).filter { !$0.isEmpty }
                        for word in words {
                            let wordLen = word.count
                            if !current.isEmpty && currentLen + 1 + wordLen > actualMaxLen {
                                flushCurrent()
                            }
                            if !current.isEmpty {
                                current += " "
                                currentLen += 1
                            }
                            current += word
                            currentLen += wordLen
                        }
                        // Word chunks never merge with the following comma part.
                        flushCurrent()
                    } else {
                        // The separator is ", " (two characters), so reserve two.
                        if !current.isEmpty && currentLen + 2 + partLen > actualMaxLen {
                            flushCurrent()
                        }
                        if !current.isEmpty {
                            current += ", "
                            currentLen += 2
                        }
                        current += trimmedPart
                        currentLen += partLen
                    }
                }
                continue
            }

            if !current.isEmpty && currentLen + 1 + sentenceLen > actualMaxLen {
                flushCurrent()
            }
            if !current.isEmpty {
                current += " "
                currentLen += 1
            }
            current += trimmedSentence
            currentLen += sentenceLen
        }

        flushCurrent()
    }

    return chunks.isEmpty ? [""] : chunks
}
/// Splits `text` into sentences at `.`, `!` or `?` followed by whitespace.
///
/// A candidate boundary is skipped when the text accumulated since the last
/// accepted boundary (whitespace-trimmed, plus the punctuation character) ends
/// with an entry of `ABBREVIATIONS`. Each returned sentence keeps its
/// terminating punctuation and the whitespace that followed it; callers trim
/// as needed.
///
/// - Returns: At least one element; text without any boundary is returned whole.
func splitSentences(_ text: String) -> [String] {
    // Abbreviations are filtered after matching rather than inside the pattern.
    let boundaryPattern = try! NSRegularExpression(pattern: "([.!?])\\s+")
    let fullRange = NSRange(text.startIndex..., in: text)
    let boundaries = boundaryPattern.matches(in: text, range: fullRange)

    guard !boundaries.isEmpty else {
        return [text]
    }

    var pieces = [String]()
    var cursor = text.startIndex

    for boundary in boundaries {
        guard let span = Range(boundary.range, in: text) else { continue }

        // The punctuation mark is the first UTF-16 unit of the match.
        let punctuationRange = Range(NSRange(location: boundary.range.location, length: 1), in: text)!
        let head = String(text[cursor..<span.lowerBound]).trimmingCharacters(in: CharacterSet.whitespaces)
        let candidate = head + String(text[punctuationRange])

        let endsWithAbbreviation = ABBREVIATIONS.contains { candidate.hasSuffix($0) }
        if endsWithAbbreviation {
            continue
        }

        // Real sentence boundary: cut after the trailing whitespace.
        pieces.append(String(text[cursor..<span.upperBound]))
        cursor = span.upperBound
    }

    // Whatever follows the last accepted boundary (the whole text when no
    // boundary was accepted).
    if cursor < text.endIndex {
        pieces.append(String(text[cursor...]))
    }

    if pieces.isEmpty {
        return [text]
    }
    return pieces
}
// MARK: - Utility Functions
func timer<T>(_ name: String, _ f: () throws -> T) rethrows -> T {
@@ -260,7 +453,7 @@ class TextToSpeech {
self.sampleRate = cfgs.ae.sample_rate
}
func call(_ textList: [String], _ style: Style, _ totalStep: Int) throws -> (wav: [Float], duration: [Float]) {
private func _infer(_ textList: [String], _ style: Style, _ totalStep: Int) throws -> (wav: [Float], duration: [Float]) {
let bsz = textList.count
// Process text
@@ -382,6 +575,39 @@ class TextToSpeech {
return (wav, duration)
}
/// Synthesizes `text` as a single waveform, chunking long input automatically.
///
/// The text is split with `chunkText` (default chunk length), each chunk is run
/// through `_infer` on its own, and the results are concatenated with
/// `silenceDuration` seconds of zero samples between consecutive chunks. Each
/// chunk's audio is cut to `sampleRate * duration` samples before it is appended.
///
/// - Returns: The concatenated samples and the total duration in seconds
///   (chunk durations plus the inserted silences).
/// - Throws: Rethrows any error raised by `_infer`.
func call(_ text: String, _ style: Style, _ totalStep: Int, silenceDuration: Float) throws -> (wav: [Float], duration: Float) {
    var joinedWav = [Float]()
    var totalDuration: Float = 0.0

    for (index, piece) in chunkText(text).enumerated() {
        let output = try _infer([piece], style, totalStep)
        let pieceDuration = output.duration[0]

        // Keep only the samples covered by the reported duration.
        let keepCount = Int(Float(sampleRate) * pieceDuration)
        let pieceWav = Array(output.wav.prefix(keepCount))

        guard index > 0 else {
            joinedWav = pieceWav
            totalDuration = pieceDuration
            continue
        }

        // Pad with silence before every chunk except the first.
        let gapCount = Int(silenceDuration * Float(sampleRate))
        joinedWav += [Float](repeating: 0.0, count: gapCount)
        joinedWav += pieceWav
        totalDuration += silenceDuration + pieceDuration
    }

    return (joinedWav, totalDuration)
}
/// Runs inference on all of `textList` in one `_infer` call.
///
/// Unlike `call`, the texts are passed through as-is: no chunking is applied and
/// no silence is inserted. The result of `_infer` is returned unchanged.
func batch(_ textList: [String], _ style: Style, _ totalStep: Int) throws -> (wav: [Float], duration: [Float]) {
return try _infer(textList, style, totalStep)
}
}
// MARK: - Component Loading Functions
+50 -11
View File
@@ -17,8 +17,9 @@ echo ""
echo "Select test mode:"
echo " 1) Default inference only"
echo " 2) Batch inference only"
echo " 3) Both default and batch inference"
echo -e "Enter your choice (1/2/3) [default: 1]: \c"
echo " 3) Long-form inference only"
echo " 4) All tests (default + batch + long-form)"
echo -e "Enter your choice (1/2/3/4) [default: 1]: \c"
read -r test_mode
test_mode=${test_mode:-1}
@@ -26,22 +27,32 @@ case $test_mode in
1)
TEST_DEFAULT=true
TEST_BATCH=false
TEST_LONGFORM=false
echo "Running default inference tests only"
;;
2)
TEST_DEFAULT=false
TEST_BATCH=true
TEST_LONGFORM=false
echo "Running batch inference tests only"
;;
3)
TEST_DEFAULT=false
TEST_BATCH=false
TEST_LONGFORM=true
echo "Running long-form inference tests only"
;;
4)
TEST_DEFAULT=true
TEST_BATCH=true
echo "Running both default and batch inference tests"
TEST_LONGFORM=true
echo "Running all tests (default + batch + long-form)"
;;
*)
echo "Invalid choice. Using default inference only."
TEST_DEFAULT=true
TEST_BATCH=false
TEST_LONGFORM=false
;;
esac
echo ""
@@ -52,6 +63,10 @@ BATCH_VOICE_STYLE_2="assets/voice_styles/F1.json"
BATCH_TEXT_1="The sun sets behind the mountains, painting the sky in shades of pink and orange."
BATCH_TEXT_2="The weather is beautiful and sunny outside. A gentle breeze makes the air feel fresh and pleasant."
# Long-form inference test data
LONGFORM_VOICE_STYLE="assets/voice_styles/M1.json"
LONGFORM_TEXT="This is a very long text that will be automatically split into multiple chunks. The system will process each chunk separately and then concatenate them together with natural pauses between segments. This ensures that even very long texts can be processed efficiently while maintaining natural speech flow and avoiding memory issues. The text chunking algorithm intelligently splits on paragraph and sentence boundaries, preserving the natural flow of the content. When a sentence is too long, it further splits on commas and spaces as needed. This multi-level approach ensures optimal chunk sizes for inference while maintaining linguistic coherence."
# Ask if user wants to clean results folders
echo -e "Do you want to clean all results folders before running tests? (y/N): \c"
read -r response
@@ -123,7 +138,10 @@ if [ "$TEST_DEFAULT" = true ]; then
run_test "Python (default)" "py" "uv run example_onnx.py"
fi
if [ "$TEST_BATCH" = true ]; then
run_test "Python (batch)" "py" "uv run example_onnx.py --voice-style $BATCH_VOICE_STYLE_1 $BATCH_VOICE_STYLE_2 --text '$BATCH_TEXT_1' '$BATCH_TEXT_2'"
run_test "Python (batch)" "py" "uv run example_onnx.py --batch --voice-style $BATCH_VOICE_STYLE_1 $BATCH_VOICE_STYLE_2 --text '$BATCH_TEXT_1' '$BATCH_TEXT_2'"
fi
if [ "$TEST_LONGFORM" = true ]; then
run_test "Python (long-form)" "py" "uv run example_onnx.py --voice-style $LONGFORM_VOICE_STYLE --text '$LONGFORM_TEXT'"
fi
# ====================================
@@ -136,7 +154,10 @@ if [ "$TEST_DEFAULT" = true ]; then
run_test "JavaScript (default)" "nodejs" "node example_onnx.js"
fi
if [ "$TEST_BATCH" = true ]; then
run_test "JavaScript (batch)" "nodejs" "node example_onnx.js --voice-style $BATCH_VOICE_STYLE_1,$BATCH_VOICE_STYLE_2 --text '$BATCH_TEXT_1|$BATCH_TEXT_2'"
run_test "JavaScript (batch)" "nodejs" "node example_onnx.js --batch --voice-style $BATCH_VOICE_STYLE_1,$BATCH_VOICE_STYLE_2 --text '$BATCH_TEXT_1|$BATCH_TEXT_2'"
fi
if [ "$TEST_LONGFORM" = true ]; then
run_test "JavaScript (long-form)" "nodejs" "node example_onnx.js --voice-style $LONGFORM_VOICE_STYLE --text '$LONGFORM_TEXT'"
fi
# ====================================
@@ -150,7 +171,10 @@ if [ "$TEST_DEFAULT" = true ]; then
run_test "Go (default)" "go" "go run example_onnx.go helper.go"
fi
if [ "$TEST_BATCH" = true ]; then
run_test "Go (batch)" "go" "go run example_onnx.go helper.go --voice-style $BATCH_VOICE_STYLE_1,$BATCH_VOICE_STYLE_2 --text '$BATCH_TEXT_1|$BATCH_TEXT_2'"
run_test "Go (batch)" "go" "go run example_onnx.go helper.go --batch -voice-style $BATCH_VOICE_STYLE_1,$BATCH_VOICE_STYLE_2 -text '$BATCH_TEXT_1|$BATCH_TEXT_2'"
fi
if [ "$TEST_LONGFORM" = true ]; then
run_test "Go (long-form)" "go" "go run example_onnx.go helper.go -voice-style $LONGFORM_VOICE_STYLE -text '$LONGFORM_TEXT'"
fi
# ====================================
@@ -163,7 +187,10 @@ if [ "$TEST_DEFAULT" = true ]; then
run_test "Rust (default)" "rust" "cargo run --release"
fi
if [ "$TEST_BATCH" = true ]; then
run_test "Rust (batch)" "rust" "cargo run --release -- --voice-style $BATCH_VOICE_STYLE_1,$BATCH_VOICE_STYLE_2 --text '$BATCH_TEXT_1|$BATCH_TEXT_2'"
run_test "Rust (batch)" "rust" "cargo run --release -- --batch --voice-style $BATCH_VOICE_STYLE_1,$BATCH_VOICE_STYLE_2 --text '$BATCH_TEXT_1|$BATCH_TEXT_2'"
fi
if [ "$TEST_LONGFORM" = true ]; then
run_test "Rust (long-form)" "rust" "cargo run --release -- --voice-style $LONGFORM_VOICE_STYLE --text '$LONGFORM_TEXT'"
fi
# ====================================
@@ -176,7 +203,10 @@ if [ "$TEST_DEFAULT" = true ]; then
run_test "C# (default)" "csharp" "dotnet run --configuration Release"
fi
if [ "$TEST_BATCH" = true ]; then
run_test "C# (batch)" "csharp" "dotnet run --configuration Release -- --voice-style ../$BATCH_VOICE_STYLE_1,../$BATCH_VOICE_STYLE_2 --text '$BATCH_TEXT_1|$BATCH_TEXT_2'"
run_test "C# (batch)" "csharp" "dotnet run --configuration Release -- --batch --voice-style ../$BATCH_VOICE_STYLE_1,../$BATCH_VOICE_STYLE_2 --text '$BATCH_TEXT_1|$BATCH_TEXT_2'"
fi
if [ "$TEST_LONGFORM" = true ]; then
run_test "C# (long-form)" "csharp" "dotnet run --configuration Release -- --voice-style ../$LONGFORM_VOICE_STYLE --text '$LONGFORM_TEXT'"
fi
# ====================================
@@ -189,7 +219,10 @@ if [ "$TEST_DEFAULT" = true ]; then
run_test "Java (default)" "java" "mvn exec:java -q"
fi
if [ "$TEST_BATCH" = true ]; then
run_test "Java (batch)" "java" "mvn exec:java -q -Dexec.args='--voice-style $BATCH_VOICE_STYLE_1,$BATCH_VOICE_STYLE_2 --text \"$BATCH_TEXT_1|$BATCH_TEXT_2\"'"
run_test "Java (batch)" "java" "mvn exec:java -q -Dexec.args='--batch --voice-style $BATCH_VOICE_STYLE_1,$BATCH_VOICE_STYLE_2 --text \"$BATCH_TEXT_1|$BATCH_TEXT_2\"'"
fi
if [ "$TEST_LONGFORM" = true ]; then
run_test "Java (long-form)" "java" "mvn exec:java -q -Dexec.args='--voice-style $LONGFORM_VOICE_STYLE --text \"$LONGFORM_TEXT\"'"
fi
# ====================================
@@ -202,7 +235,10 @@ if [ "$TEST_DEFAULT" = true ]; then
run_test "Swift (default)" "swift" ".build/release/example_onnx"
fi
if [ "$TEST_BATCH" = true ]; then
run_test "Swift (batch)" "swift" ".build/release/example_onnx --voice-style $BATCH_VOICE_STYLE_1,$BATCH_VOICE_STYLE_2 --text '$BATCH_TEXT_1|$BATCH_TEXT_2'"
run_test "Swift (batch)" "swift" ".build/release/example_onnx --batch --voice-style $BATCH_VOICE_STYLE_1,$BATCH_VOICE_STYLE_2 --text '$BATCH_TEXT_1|$BATCH_TEXT_2'"
fi
if [ "$TEST_LONGFORM" = true ]; then
run_test "Swift (long-form)" "swift" ".build/release/example_onnx --voice-style $LONGFORM_VOICE_STYLE --text '$LONGFORM_TEXT'"
fi
# ====================================
@@ -215,7 +251,10 @@ if [ "$TEST_DEFAULT" = true ]; then
run_test "C++ (default)" "cpp/build" "./example_onnx"
fi
if [ "$TEST_BATCH" = true ]; then
run_test "C++ (batch)" "cpp/build" "./example_onnx --voice-style ../$BATCH_VOICE_STYLE_1,../$BATCH_VOICE_STYLE_2 --text '$BATCH_TEXT_1|$BATCH_TEXT_2'"
run_test "C++ (batch)" "cpp/build" "./example_onnx --batch --voice-style ../$BATCH_VOICE_STYLE_1,../$BATCH_VOICE_STYLE_2 --text '$BATCH_TEXT_1|$BATCH_TEXT_2'"
fi
if [ "$TEST_LONGFORM" = true ]; then
run_test "C++ (long-form)" "cpp/build" "./example_onnx --voice-style ../$LONGFORM_VOICE_STYLE --text '$LONGFORM_TEXT'"
fi
# ====================================
+4
View File
@@ -2,6 +2,10 @@
This example demonstrates how to use Supertonic in a web browser using ONNX Runtime Web.
## 📰 Update News
**2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
## Features
- 🌐 Runs entirely in the browser (no server required for inference)
+72 -1
View File
@@ -72,7 +72,7 @@ export class TextToSpeech {
this.sampleRate = cfgs.ae.sample_rate;
}
async call(textList, style, totalStep, progressCallback = null) {
async _infer(textList, style, totalStep, progressCallback = null) {
const bsz = textList.length;
// Process text
@@ -176,6 +176,35 @@ export class TextToSpeech {
return { wav, duration };
}
async call(text, style, totalStep, silenceDuration = 0.3, progressCallback = null) {
if (style.ttl.dims[0] !== 1) {
throw new Error('Single speaker text to speech only supports single style');
}
const textList = chunkText(text);
let wavCat = [];
let durCat = 0;
for (const chunk of textList) {
const { wav, duration } = await this._infer([chunk], style, totalStep, progressCallback);
if (wavCat.length === 0) {
wavCat = wav;
durCat = duration[0];
} else {
const silenceLen = Math.floor(silenceDuration * this.sampleRate);
const silence = new Array(silenceLen).fill(0);
wavCat = [...wavCat, ...silence, ...wav];
durCat += duration[0] + silenceDuration;
}
}
return { wav: wavCat, duration: [durCat] };
}
/**
 * Run inference on all of `textList` in one `_infer` call.
 *
 * Unlike `call`, the texts are passed through as-is: no chunking is applied
 * and no silence is inserted. The result of `_infer` is returned unchanged.
 */
async batch(textList, style, totalStep, progressCallback = null) {
return await this._infer(textList, style, totalStep, progressCallback);
}
sampleNoisyLatent(duration, sampleRate, baseChunkSize, chunkCompress, latentDim) {
const bsz = duration.length;
const maxDur = Math.max(...duration);
@@ -347,6 +376,48 @@ export async function loadTextToSpeech(onnxDir, sessionOptions = {}, progressCal
return { textToSpeech, cfgs };
}
/**
 * Chunk text into segments of at most `maxLen` characters, in reading order.
 *
 * The text is split into paragraphs (separated by a blank line) and each
 * paragraph into sentences; consecutive sentences are greedily packed into a
 * chunk, joined by one space. A sentence that is longer than `maxLen` on its
 * own is split on commas (parts re-joined with ", "), and a comma part that is
 * still too long is split on whitespace. A comma that falls on a chunk boundary
 * is dropped, and a single word longer than `maxLen` is emitted as its own
 * over-long chunk.
 *
 * @param {string} text - Text to split.
 * @param {number} [maxLen=300] - Maximum chunk length (`String.length` units).
 * @returns {string[]} The chunks; an empty array for empty or whitespace-only text.
 * @throws {Error} If `text` is not a string.
 */
function chunkText(text, maxLen = 300) {
    if (typeof text !== 'string') {
        throw new Error(`chunkText expects a string, got ${typeof text}`);
    }

    // Breaks a sentence longer than maxLen into comma parts, then into words.
    const splitOversized = (sentence) => {
        const pieces = [];
        let pending = "";
        const flush = () => {
            if (pending) {
                pieces.push(pending);
                pending = "";
            }
        };

        for (const rawPart of sentence.split(',')) {
            const part = rawPart.trim();
            if (!part) continue;

            if (part.length > maxLen) {
                // Emit earlier comma parts first so the output stays in order.
                flush();
                for (const word of part.split(/\s+/)) {
                    if (!word) continue;
                    if (pending && pending.length + 1 + word.length > maxLen) {
                        flush();
                    }
                    pending += (pending ? " " : "") + word;
                }
                flush();
            } else {
                if (pending && pending.length + 2 + part.length > maxLen) {
                    flush();
                }
                pending += (pending ? ", " : "") + part;
            }
        }
        flush();
        return pieces;
    };

    // Split by paragraph (two or more newlines)
    const paragraphs = text.trim().split(/\n\s*\n+/).filter(p => p.trim());
    const chunks = [];

    for (let paragraph of paragraphs) {
        paragraph = paragraph.trim();
        if (!paragraph) continue;

        // Split by sentence boundaries (period, question mark, exclamation mark followed by space)
        // But exclude common abbreviations like Mr., Mrs., Dr., etc. and single capital letters like F.
        const sentences = paragraph.split(/(?<!Mr\.|Mrs\.|Ms\.|Dr\.|Prof\.|Sr\.|Jr\.|Ph\.D\.|etc\.|e\.g\.|i\.e\.|vs\.|Inc\.|Ltd\.|Co\.|Corp\.|St\.|Ave\.|Rd\.|Blvd\.|Dept\.)(?<!\b[A-Z]\.)(?<=[.!?])\s+/);

        let currentChunk = "";

        for (const sentence of sentences) {
            if (sentence.length > maxLen) {
                // Cannot fit in any chunk: emit what we have, then the pieces.
                if (currentChunk) {
                    chunks.push(currentChunk.trim());
                    currentChunk = "";
                }
                chunks.push(...splitOversized(sentence));
                continue;
            }

            if (currentChunk.length + sentence.length + 1 <= maxLen) {
                currentChunk += (currentChunk ? " " : "") + sentence;
            } else {
                if (currentChunk) {
                    chunks.push(currentChunk.trim());
                }
                currentChunk = sentence;
            }
        }

        if (currentChunk) {
            chunks.push(currentChunk.trim());
        }
    }

    return chunks;
}
/**
* Write WAV file to ArrayBuffer
*/
+3 -3
View File
@@ -186,15 +186,15 @@ async function generateSpeech() {
`;
const totalStep = parseInt(totalStepInput.value);
const textList = [text];
showStatus('️ <strong>Generating speech from text...</strong>');
const tic = Date.now();
const { wav, duration } = await textToSpeech.call(
textList,
text,
currentStyle,
totalStep,
totalStep,
0.3,
(step, total) => {
showStatus(`️ <strong>Denoising (${step}/${total})...</strong>`);
}