mirror of
https://github.com/supertone-inc/supertonic.git
synced 2026-06-02 01:38:48 +02:00
add text chunking for long-form generation (Fixes #4)
This commit is contained in:
+26
-2
@@ -2,6 +2,10 @@
|
||||
|
||||
High-performance text-to-speech inference using ONNX Runtime.
|
||||
|
||||
## 📰 Update News
|
||||
|
||||
**2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
|
||||
|
||||
## Requirements
|
||||
|
||||
- C++17 compiler, CMake 3.15+
|
||||
@@ -62,14 +66,16 @@ Process multiple voice styles and texts at once:
|
||||
```bash
|
||||
./example_onnx \
|
||||
--voice-style ../assets/voice_styles/M1.json,../assets/voice_styles/F1.json \
|
||||
--text "The sun sets behind the mountains, painting the sky in shades of pink and orange.|The weather is beautiful and sunny outside. A gentle breeze makes the air feel fresh and pleasant."
|
||||
--text "The sun sets behind the mountains, painting the sky in shades of pink and orange.|The weather is beautiful and sunny outside. A gentle breeze makes the air feel fresh and pleasant." \
|
||||
--batch
|
||||
```
|
||||
|
||||
This will:
|
||||
- Use `--batch` flag to enable batch processing mode
|
||||
- Generate speech for 2 different voice-text pairs
|
||||
- Use male voice style (M1.json) for the first text
|
||||
- Use female voice style (F1.json) for the second text
|
||||
- Process both samples in a single batch
|
||||
- Process both samples in a single batch (automatic text chunking disabled)
|
||||
|
||||
### Example 3: High Quality Inference
|
||||
Increase denoising steps for better quality:
|
||||
@@ -84,6 +90,22 @@ This will:
|
||||
- Use 10 denoising steps instead of the default 5
|
||||
- Produce higher quality output at the cost of slower inference
|
||||
|
||||
### Example 4: Long-Form Inference
|
||||
For long texts, the system automatically chunks the text into manageable segments and generates a single audio file:
|
||||
```bash
|
||||
./example_onnx \
|
||||
--voice-style ../assets/voice_styles/M1.json \
|
||||
--text "Once upon a time, in a small village nestled between rolling hills, there lived a young artist named Clara. Every morning, she would wake up before dawn to capture the first light of day. The golden rays streaming through her window inspired countless paintings. Her work was known throughout the region for its vibrant colors and emotional depth. People from far and wide came to see her gallery, and many said her paintings could tell stories that words never could."
|
||||
```
|
||||
|
||||
This will:
|
||||
- Automatically split the long text into smaller chunks (max 300 characters by default)
|
||||
- Process each chunk separately while maintaining natural speech flow
|
||||
- Insert brief silences (0.3 seconds) between chunks for natural pacing
|
||||
- Combine all chunks into a single output audio file
|
||||
|
||||
**Note**: When using batch mode (`--batch`), automatic text chunking is disabled. Use non-batch mode for long-form text synthesis.
|
||||
|
||||
## Available Arguments
|
||||
|
||||
| Argument | Type | Default | Description |
|
||||
@@ -94,8 +116,10 @@ This will:
|
||||
| `--voice-style` | str | `../assets/voice_styles/M1.json` | Voice style file path(s) (comma-separated for batch) |
|
||||
| `--text` | str | (long default text) | Text(s) to synthesize (pipe-separated for batch) |
|
||||
| `--save-dir` | str | `results` | Output directory |
|
||||
| `--batch` | flag | False | Enable batch mode (disables automatic text chunking) |
|
||||
|
||||
## Notes
|
||||
|
||||
- **Batch Processing**: The number of `--voice-style` files must match the number of `--text` entries
|
||||
- **Long-Form Inference**: Without `--batch` flag, long texts are automatically chunked and combined into a single audio file with natural pauses
|
||||
- **Quality vs Speed**: Higher `--total-step` values produce better quality but take longer
|
||||
|
||||
@@ -16,6 +16,7 @@ struct Args {
|
||||
"This morning, I took a walk in the park, and the sound of the birds and the breeze was so pleasant that I stopped for a long time just to listen."
|
||||
};
|
||||
std::string save_dir = "results";
|
||||
bool batch = false;
|
||||
};
|
||||
|
||||
auto splitString = [](const std::string& str, char delim) {
|
||||
@@ -39,6 +40,7 @@ Args parseArgs(int argc, char* argv[]) {
|
||||
else if (arg == "--voice-style" && i + 1 < argc) args.voice_style = splitString(argv[++i], ',');
|
||||
else if (arg == "--text" && i + 1 < argc) args.text = splitString(argv[++i], '|');
|
||||
else if (arg == "--save-dir" && i + 1 < argc) args.save_dir = argv[++i];
|
||||
else if (arg == "--batch") args.batch = true;
|
||||
}
|
||||
return args;
|
||||
}
|
||||
@@ -53,13 +55,13 @@ int main(int argc, char* argv[]) {
|
||||
std::string save_dir = args.save_dir;
|
||||
std::vector<std::string> voice_style_paths = args.voice_style;
|
||||
std::vector<std::string> text_list = args.text;
|
||||
bool batch = args.batch;
|
||||
|
||||
if (voice_style_paths.size() != text_list.size()) {
|
||||
std::cerr << "Error: Number of voice styles (" << voice_style_paths.size()
|
||||
<< ") must match number of texts (" << text_list.size() << ")\n";
|
||||
return 1;
|
||||
}
|
||||
|
||||
int bsz = voice_style_paths.size();
|
||||
|
||||
// --- 2. Load Text to Speech --- //
|
||||
@@ -81,7 +83,11 @@ int main(int argc, char* argv[]) {
|
||||
std::cout << "\n[" << (n + 1) << "/" << n_test << "] Starting synthesis...\n";
|
||||
|
||||
auto result = timer("Generating speech from text", [&]() {
|
||||
return text_to_speech->call(memory_info, text_list, style, total_step);
|
||||
if (batch) {
|
||||
return text_to_speech->batch(memory_info, text_list, style, total_step);
|
||||
} else {
|
||||
return text_to_speech->call(memory_info, text_list[0], style, total_step);
|
||||
}
|
||||
});
|
||||
|
||||
int sample_rate = text_to_speech->getSampleRate();
|
||||
|
||||
+137
-1
@@ -5,6 +5,7 @@
|
||||
#include <algorithm>
|
||||
#include <random>
|
||||
#include <sstream>
|
||||
#include <regex>
|
||||
#include <nlohmann/json.hpp>
|
||||
|
||||
using json = nlohmann::json;
|
||||
@@ -155,7 +156,7 @@ void TextToSpeech::sampleNoisyLatent(
|
||||
}
|
||||
}
|
||||
|
||||
TextToSpeech::SynthesisResult TextToSpeech::call(
|
||||
TextToSpeech::SynthesisResult TextToSpeech::_infer(
|
||||
Ort::MemoryInfo& memory_info,
|
||||
const std::vector<std::string>& text_list,
|
||||
const Style& style,
|
||||
@@ -364,6 +365,52 @@ TextToSpeech::SynthesisResult TextToSpeech::call(
|
||||
return result;
|
||||
}
|
||||
|
||||
// Synthesizes a single, possibly long, text with one voice style.
//
// The text is split with chunkText(); every chunk is synthesized on its own
// through _infer() and the waveforms are concatenated in order, with
// `silence_duration` seconds of zero samples inserted between consecutive chunks.
//
// Parameters:
//   memory_info       forwarded unchanged to _infer()
//   text              text to synthesize
//   style             voice style; the first entry of its TTL shape must be 1
//   total_step        forwarded unchanged to _infer()
//   silence_duration  seconds of silence between chunks; values <= 0 insert none
//
// Returns a result whose `wav` holds the concatenated samples and whose
// `duration` holds exactly one entry: the summed chunk durations plus the
// inserted silences.
//
// Throws std::runtime_error when the style does not describe exactly one voice.
TextToSpeech::SynthesisResult TextToSpeech::call(
    Ort::MemoryInfo& memory_info,
    const std::string& text,
    const Style& style,
    int total_step,
    float silence_duration
) {
    if (style.getTtlShape()[0] != 1) {
        throw std::runtime_error("Single speaker text to speech only supports single style");
    }

    // Clamp before converting: a negative duration would otherwise become a
    // negative sample count and wrap to an enormous unsigned length.
    const float gap_seconds = silence_duration > 0.0f ? silence_duration : 0.0f;
    const size_t gap_samples = static_cast<size_t>(gap_seconds * sample_rate_);

    SynthesisResult final_result;
    float total_duration = 0.0f;

    // Tracked explicitly rather than via "is the output still empty?", so a
    // chunk that yields no samples cannot reset the accumulated duration.
    bool is_first_chunk = true;

    for (const auto& chunk : chunkText(text)) {
        auto result = _infer(memory_info, {chunk}, style, total_step);

        if (is_first_chunk) {
            total_duration = result.duration[0];
            is_first_chunk = false;
        } else {
            final_result.wav.insert(final_result.wav.end(), gap_samples, 0.0f);
            total_duration += result.duration[0] + gap_seconds;
        }

        final_result.wav.insert(final_result.wav.end(), result.wav.begin(), result.wav.end());
    }

    final_result.duration = {total_duration};
    return final_result;
}
|
||||
|
||||
// Batch entry point: hands the whole text list to _infer() in one call.
// No chunking is applied here; every argument is forwarded untouched and the
// result of _infer() is returned as-is.
TextToSpeech::SynthesisResult TextToSpeech::batch(
    Ort::MemoryInfo& memory_info,
    const std::vector<std::string>& text_list,
    const Style& style,
    int total_step
) {
    SynthesisResult batched = _infer(memory_info, text_list, style, total_step);
    return batched;
}
|
||||
|
||||
// ============================================================================
|
||||
// Utility functions
|
||||
// ============================================================================
|
||||
@@ -712,3 +759,92 @@ std::string sanitizeFilename(const std::string& text, int max_len) {
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Chunk text
|
||||
// ============================================================================
|
||||
|
||||
// Returns a copy of `str` with leading and trailing whitespace (as classified
// by std::isspace) removed.
static std::string trim(const std::string& str) {
    size_t start = 0;
    while (start < str.size() && std::isspace(static_cast<unsigned char>(str[start]))) {
        ++start;
    }

    size_t end = str.size();
    while (end > start && std::isspace(static_cast<unsigned char>(str[end - 1]))) {
        --end;
    }

    return str.substr(start, end - start);
}

// Returns true when the period at `period_pos` belongs to a known abbreviation
// ("Dr.", "e.g.", ...) or to a single capital initial ("J."), i.e. when it
// should NOT be treated as the end of a sentence.
static bool isAbbreviationPeriod(const std::string& text, size_t period_pos) {
    static const std::string kAbbreviations[] = {
        "Mr.", "Mrs.", "Ms.", "Dr.", "Prof.", "Sr.", "Jr.", "Ph.D.", "etc.", "e.g.",
        "i.e.", "vs.", "Inc.", "Ltd.", "Co.", "Corp.", "St.", "Ave.", "Blvd."
    };

    // Length of the text up to and including the period.
    const size_t prefix_len = period_pos + 1;
    for (const std::string& abbreviation : kAbbreviations) {
        if (prefix_len >= abbreviation.size() &&
            text.compare(prefix_len - abbreviation.size(), abbreviation.size(), abbreviation) == 0) {
            return true;
        }
    }

    // Single capital letter that starts a word, e.g. the initials in "J. K. Rowling".
    if (period_pos >= 1) {
        const unsigned char letter = static_cast<unsigned char>(text[period_pos - 1]);
        if (letter >= 'A' && letter <= 'Z') {
            if (period_pos == 1) {
                return true;
            }
            const unsigned char before = static_cast<unsigned char>(text[period_pos - 2]);
            // Bytes >= 0x80 are parts of multi-byte characters; treat them as word characters.
            const bool is_word_char = std::isalnum(before) != 0 || before == '_' || before >= 0x80;
            return !is_word_char;
        }
    }

    return false;
}

// Splits a trimmed paragraph into sentences. A sentence ends at '.', '!' or '?'
// followed by whitespace, unless the period closes an abbreviation or initial.
// The terminating punctuation stays with its sentence; the whitespace after it
// is dropped.
static std::vector<std::string> splitIntoSentences(const std::string& paragraph) {
    std::vector<std::string> sentences;
    const size_t length = paragraph.size();
    size_t sentence_start = 0;
    size_t pos = 0;

    while (pos < length) {
        const char c = paragraph[pos];
        const bool is_terminator = (c == '.' || c == '!' || c == '?');
        const bool followed_by_space =
            pos + 1 < length && std::isspace(static_cast<unsigned char>(paragraph[pos + 1]));
        if (!is_terminator || !followed_by_space) {
            ++pos;
            continue;
        }

        // Skip the whole whitespace run that follows the punctuation.
        size_t next = pos + 1;
        while (next < length && std::isspace(static_cast<unsigned char>(paragraph[next]))) {
            ++next;
        }

        if (!(c == '.' && isAbbreviationPeriod(paragraph, pos))) {
            sentences.push_back(paragraph.substr(sentence_start, pos + 1 - sentence_start));
            sentence_start = next;
        }
        pos = next;
    }

    if (sentence_start < length) {
        sentences.push_back(paragraph.substr(sentence_start));
    }
    return sentences;
}

// Splits `text` into chunks for long-form synthesis.
//
// The text is first separated into paragraphs on blank lines; chunks never span
// paragraphs. Inside a paragraph, whole sentences are packed together (joined by
// a single space) while the chunk stays within `max_len` characters. A single
// sentence longer than `max_len` is emitted unchanged as its own chunk.
//
// Returns at least one element; for empty or whitespace-only input the single
// element is the trimmed (empty) text.
std::vector<std::string> chunkText(const std::string& text, int max_len) {
    std::vector<std::string> chunks;

    // Paragraph separator: a newline, optional whitespace, then one or more newlines.
    static const std::regex paragraph_regex(R"(\n\s*\n+)");
    const std::sregex_token_iterator no_more_paragraphs;

    for (std::sregex_token_iterator it(text.begin(), text.end(), paragraph_regex, -1);
         it != no_more_paragraphs; ++it) {
        const std::string paragraph = trim(it->str());
        if (paragraph.empty()) {
            continue;
        }

        std::string current_chunk;
        for (const std::string& sentence : splitIntoSentences(paragraph)) {
            // The +1 accounts for the joining space.
            const bool fits = max_len > 0 &&
                current_chunk.size() + sentence.size() + 1 <= static_cast<size_t>(max_len);

            if (fits) {
                if (!current_chunk.empty()) {
                    current_chunk += ' ';
                }
                current_chunk += sentence;
            } else {
                if (!current_chunk.empty()) {
                    chunks.push_back(current_chunk);
                }
                current_chunk = sentence;
            }
        }

        if (!current_chunk.empty()) {
            chunks.push_back(current_chunk);
        }
    }

    // If no chunks were created, return the original text
    if (chunks.empty()) {
        chunks.push_back(trim(text));
    }

    return chunks;
}
|
||||
|
||||
@@ -87,6 +87,14 @@ public:
|
||||
};
|
||||
|
||||
// Synthesize one text with a single voice style.
// The text is split with chunkText(), each chunk is run through _infer(), and
// the audio is joined with `silence_duration` seconds of silence between chunks.
// Throws std::runtime_error when the first entry of the style's TTL shape is not 1.
SynthesisResult call(
    Ort::MemoryInfo& memory_info,
    const std::string& text,
    const Style& style,
    int total_step,
    float silence_duration = 0.3f
);
|
||||
|
||||
SynthesisResult batch(
|
||||
Ort::MemoryInfo& memory_info,
|
||||
const std::vector<std::string>& text_list,
|
||||
const Style& style,
|
||||
@@ -96,6 +104,12 @@ public:
|
||||
int getSampleRate() const { return sample_rate_; }
|
||||
|
||||
private:
|
||||
// Shared inference routine behind the public entry points: call() feeds it one
// chunk at a time, batch() forwards its whole text list.
SynthesisResult _infer(
    Ort::MemoryInfo& memory_info,
    const std::vector<std::string>& text_list,
    const Style& style,
    int total_step
);
|
||||
Config cfgs_;
|
||||
UnicodeProcessor* text_processor_;
|
||||
Ort::Session* dp_ort_;
|
||||
@@ -200,3 +214,6 @@ auto timer(const std::string& name, Func&& func) -> decltype(func()) {
|
||||
|
||||
// Sanitize filename
|
||||
std::string sanitizeFilename(const std::string& text, int max_len);
|
||||
|
||||
// Chunk text into manageable segments: paragraphs are separated on blank lines,
// then sentences within a paragraph are packed together up to max_len characters.
// Always returns at least one element.
std::vector<std::string> chunkText(
    const std::string& text,
    int max_len = 300
);
|
||||
|
||||
+16
-4
@@ -19,6 +19,7 @@ namespace Supertonic
|
||||
"This morning, I took a walk in the park, and the sound of the birds and the breeze was so pleasant that I stopped for a long time just to listen."
|
||||
};
|
||||
public string SaveDir { get; set; } = "results";
|
||||
public bool Batch { get; set; } = false;
|
||||
}
|
||||
|
||||
static Args ParseArgs(string[] args)
|
||||
@@ -32,6 +33,9 @@ namespace Supertonic
|
||||
case "--use-gpu":
|
||||
result.UseGpu = true;
|
||||
break;
|
||||
case "--batch":
|
||||
result.Batch = true;
|
||||
break;
|
||||
case "--onnx-dir" when i + 1 < args.Length:
|
||||
result.OnnxDir = args[++i];
|
||||
break;
|
||||
@@ -67,13 +71,13 @@ namespace Supertonic
|
||||
string saveDir = parsedArgs.SaveDir;
|
||||
var voiceStylePaths = parsedArgs.VoiceStyle;
|
||||
var textList = parsedArgs.Text;
|
||||
bool batch = parsedArgs.Batch;
|
||||
|
||||
if (voiceStylePaths.Count != textList.Count)
|
||||
{
|
||||
throw new ArgumentException(
|
||||
$"Number of voice styles ({voiceStylePaths.Count}) must match number of texts ({textList.Count})");
|
||||
}
|
||||
|
||||
int bsz = voiceStylePaths.Count;
|
||||
|
||||
// --- 2. Load Text to Speech --- //
|
||||
@@ -88,9 +92,17 @@ namespace Supertonic
|
||||
{
|
||||
Console.WriteLine($"\n[{n + 1}/{nTest}] Starting synthesis...");
|
||||
|
||||
var (wav, duration) = Helper.Timer("Generating speech from text", () =>
|
||||
textToSpeech.Call(textList, style, totalStep)
|
||||
);
|
||||
var (wav, duration) = Helper.Timer("Generating speech from text", () =>
|
||||
{
|
||||
if (batch)
|
||||
{
|
||||
return textToSpeech.Batch(textList, style, totalStep);
|
||||
}
|
||||
else
|
||||
{
|
||||
return textToSpeech.Call(textList[0], style, totalStep);
|
||||
}
|
||||
});
|
||||
|
||||
if (!Directory.Exists(saveDir))
|
||||
{
|
||||
|
||||
+100
-1
@@ -4,6 +4,7 @@ using System.IO;
|
||||
using System.Linq;
|
||||
using System.Text;
|
||||
using System.Text.Json;
|
||||
using System.Text.RegularExpressions;
|
||||
using Microsoft.ML.OnnxRuntime;
|
||||
using Microsoft.ML.OnnxRuntime.Tensors;
|
||||
|
||||
@@ -193,7 +194,7 @@ namespace Supertonic
|
||||
return (noisyLatent, latentMask);
|
||||
}
|
||||
|
||||
public (float[] wav, float[] duration) Call(List<string> textList, Style style, int totalStep)
|
||||
private (float[] wav, float[] duration) _Infer(List<string> textList, Style style, int totalStep)
|
||||
{
|
||||
int bsz = textList.Count;
|
||||
if (bsz != style.TtlShape[0])
|
||||
@@ -282,6 +283,44 @@ namespace Supertonic
|
||||
|
||||
return (wavTensor.ToArray(), durOnnx);
|
||||
}
|
||||
|
||||
/// <summary>
/// Synthesizes a single, possibly long, text with one voice style.
/// The text is split with <c>Helper.ChunkText</c>; each chunk is synthesized on its own
/// through <c>_Infer</c> and the waveforms are concatenated in order, with
/// <paramref name="silenceDuration"/> seconds of zero samples between consecutive chunks.
/// </summary>
/// <param name="text">Text to synthesize.</param>
/// <param name="style">Voice style; the first entry of its TTL shape must be 1.</param>
/// <param name="totalStep">Forwarded unchanged to <c>_Infer</c>.</param>
/// <param name="silenceDuration">Seconds of silence between chunks; values &lt;= 0 insert none.</param>
/// <returns>
/// The concatenated samples and a one-element array holding the summed chunk
/// durations plus the inserted silences.
/// </returns>
/// <exception cref="ArgumentException">The style does not describe exactly one voice.</exception>
public (float[] wav, float[] duration) Call(string text, Style style, int totalStep, float silenceDuration = 0.3f)
{
    if (style.TtlShape[0] != 1)
    {
        throw new ArgumentException("Single speaker text to speech only supports single style");
    }

    // Clamp first: a negative duration would otherwise request a negative array length.
    float gapSeconds = Math.Max(0.0f, silenceDuration);
    int gapSamples = (int)(gapSeconds * SampleRate);

    var wavCat = new List<float>();
    float durCat = 0.0f;

    // Tracked explicitly rather than via "is the output still empty?", so a chunk
    // that yields no samples cannot reset the accumulated duration.
    bool isFirstChunk = true;

    foreach (var chunk in Helper.ChunkText(text))
    {
        var (wav, duration) = _Infer(new List<string> { chunk }, style, totalStep);

        if (isFirstChunk)
        {
            durCat = duration[0];
            isFirstChunk = false;
        }
        else
        {
            wavCat.AddRange(new float[gapSamples]);
            durCat += duration[0] + gapSeconds;
        }

        wavCat.AddRange(wav);
    }

    return (wavCat.ToArray(), new float[] { durCat });
}
|
||||
|
||||
/// <summary>
/// Batch entry point: forwards the whole text list to <c>_Infer</c> in one call.
/// No chunking is applied; the result of <c>_Infer</c> is returned unchanged.
/// </summary>
public (float[] wav, float[] duration) Batch(List<string> textList, Style style, int totalStep)
    => _Infer(textList, style, totalStep);
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
@@ -608,5 +647,65 @@ namespace Supertonic
|
||||
}
|
||||
return result.ToString();
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Chunk text
|
||||
// ============================================================================
|
||||
|
||||
/// <summary>
/// Splits <paramref name="text"/> into chunks for long-form synthesis.
/// Paragraphs are separated on blank lines and never share a chunk. Inside a
/// paragraph, sentences (split after '.', '!' or '?' unless the period closes a
/// listed abbreviation or a single capital initial) are joined with a space
/// while the chunk stays within <paramref name="maxLen"/> characters.
/// A sentence longer than <paramref name="maxLen"/> becomes its own chunk.
/// </summary>
/// <returns>At least one element; the trimmed input when nothing else was produced.</returns>
public static List<string> ChunkText(string text, int maxLen = 300)
{
    // Blank-line paragraph separator.
    var paragraphSplitter = new Regex(@"\n\s*\n+");
    // Whitespace that follows sentence punctuation, excluding abbreviations and initials.
    var sentenceSplitter = new Regex(@"(?<!Mr\.|Mrs\.|Ms\.|Dr\.|Prof\.|Sr\.|Jr\.|Ph\.D\.|etc\.|e\.g\.|i\.e\.|vs\.|Inc\.|Ltd\.|Co\.|Corp\.|St\.|Ave\.|Blvd\.)(?<!\b[A-Z]\.)(?<=[.!?])\s+");

    var result = new List<string>();

    foreach (string rawParagraph in paragraphSplitter.Split(text.Trim()))
    {
        string paragraph = rawParagraph.Trim();
        if (paragraph.Length == 0)
        {
            continue;
        }

        var pending = new StringBuilder();

        foreach (string sentence in sentenceSplitter.Split(paragraph))
        {
            if (sentence.Length == 0)
            {
                continue;
            }

            // The +1 accounts for the joining space.
            bool fits = pending.Length + sentence.Length + 1 <= maxLen;
            if (!fits)
            {
                if (pending.Length > 0)
                {
                    result.Add(pending.ToString().Trim());
                }
                pending.Clear();
                pending.Append(sentence);
                continue;
            }

            if (pending.Length > 0)
            {
                pending.Append(' ');
            }
            pending.Append(sentence);
        }

        if (pending.Length > 0)
        {
            result.Add(pending.ToString().Trim());
        }
    }

    // Nothing produced (e.g. empty input): fall back to the trimmed text itself.
    if (result.Count == 0)
    {
        result.Add(text.Trim());
    }

    return result;
}
|
||||
}
|
||||
}
|
||||
|
||||
+26
-2
@@ -2,6 +2,10 @@
|
||||
|
||||
This guide provides examples for running TTS inference using `ExampleONNX.cs`.
|
||||
|
||||
## 📰 Update News
|
||||
|
||||
**2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
|
||||
|
||||
## Installation
|
||||
|
||||
### Prerequisites
|
||||
@@ -33,14 +37,16 @@ Process multiple voice styles and texts at once:
|
||||
```bash
|
||||
dotnet run -- \
|
||||
--voice-style assets/voice_styles/M1.json,assets/voice_styles/F1.json \
|
||||
--text "The sun sets behind the mountains, painting the sky in shades of pink and orange.|The weather is beautiful and sunny outside. A gentle breeze makes the air feel fresh and pleasant."
|
||||
--text "The sun sets behind the mountains, painting the sky in shades of pink and orange.|The weather is beautiful and sunny outside. A gentle breeze makes the air feel fresh and pleasant." \
|
||||
--batch
|
||||
```
|
||||
|
||||
This will:
|
||||
- Use `--batch` flag to enable batch processing mode
|
||||
- Generate speech for 2 different voice-text pairs
|
||||
- Use male voice style (M1.json) for the first text
|
||||
- Use female voice style (F1.json) for the second text
|
||||
- Process both samples in a single batch
|
||||
- Process both samples in a single batch (automatic text chunking disabled)
|
||||
|
||||
### Example 3: High Quality Inference
|
||||
Increase denoising steps for better quality:
|
||||
@@ -55,6 +61,22 @@ This will:
|
||||
- Use 10 denoising steps instead of the default 5
|
||||
- Produce higher quality output at the cost of slower inference
|
||||
|
||||
### Example 4: Long-Form Inference
|
||||
For long texts, the system automatically chunks the text into manageable segments and generates a single audio file:
|
||||
```bash
|
||||
dotnet run -- \
|
||||
--voice-style assets/voice_styles/M1.json \
|
||||
--text "Once upon a time, in a small village nestled between rolling hills, there lived a young artist named Clara. Every morning, she would wake up before dawn to capture the first light of day. The golden rays streaming through her window inspired countless paintings. Her work was known throughout the region for its vibrant colors and emotional depth. People from far and wide came to see her gallery, and many said her paintings could tell stories that words never could."
|
||||
```
|
||||
|
||||
This will:
|
||||
- Automatically split the long text into smaller chunks (max 300 characters by default)
|
||||
- Process each chunk separately while maintaining natural speech flow
|
||||
- Insert brief silences (0.3 seconds) between chunks for natural pacing
|
||||
- Combine all chunks into a single output audio file
|
||||
|
||||
**Note**: When using batch mode (`--batch`), automatic text chunking is disabled. Use non-batch mode for long-form text synthesis.
|
||||
|
||||
## Available Arguments
|
||||
|
||||
| Argument | Type | Default | Description |
|
||||
@@ -66,10 +88,12 @@ This will:
|
||||
| `--voice-style` | str+ | `assets/voice_styles/M1.json` | Voice style file path(s) (comma-separated) |
|
||||
| `--text` | str+ | (long default text) | Text(s) to synthesize (pipe-separated: `\|`) |
|
||||
| `--save-dir` | str | `results` | Output directory |
|
||||
| `--batch` | flag | False | Enable batch mode (disables automatic text chunking) |
|
||||
|
||||
## Notes
|
||||
|
||||
- **Batch Processing**: The number of `--voice-style` files must match the number of `--text` entries
|
||||
- **Long-Form Inference**: Without `--batch` flag, long texts are automatically chunked and combined into a single audio file with natural pauses
|
||||
- **Quality vs Speed**: Higher `--total-step` values produce better quality but take longer
|
||||
- **GPU Support**: GPU mode is not supported yet
|
||||
|
||||
|
||||
+25
-1
@@ -2,6 +2,10 @@
|
||||
|
||||
This guide provides examples for running TTS inference using `example_onnx.go`.
|
||||
|
||||
## 📰 Update News
|
||||
|
||||
**2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
|
||||
|
||||
## Installation
|
||||
|
||||
This project uses Go modules for dependency management.
|
||||
@@ -73,6 +77,7 @@ This will use:
|
||||
Process multiple voice styles and texts at once:
|
||||
```bash
|
||||
go run example_onnx.go helper.go \
|
||||
--batch \
|
||||
-voice-style "assets/voice_styles/M1.json,assets/voice_styles/F1.json" \
|
||||
-text "The sun sets behind the mountains, painting the sky in shades of pink and orange.|The weather is beautiful and sunny outside. A gentle breeze makes the air feel fresh and pleasant."
|
||||
```
|
||||
@@ -96,6 +101,23 @@ This will:
|
||||
- Use 10 denoising steps instead of the default 5
|
||||
- Produce higher quality output at the cost of slower inference
|
||||
|
||||
### Example 4: Long-Form Inference
|
||||
The system automatically chunks long texts into manageable segments, synthesizes each segment separately, and concatenates them with natural pauses (0.3 seconds by default) into a single audio file. This happens by default when you don't use the `--batch` flag:
|
||||
|
||||
```bash
|
||||
go run example_onnx.go helper.go \
|
||||
-voice-style "assets/voice_styles/M1.json" \
|
||||
-text "This is a very long text that will be automatically split into multiple chunks. The system will process each chunk separately and then concatenate them together with natural pauses between segments. This ensures that even very long texts can be processed efficiently while maintaining natural speech flow and avoiding memory issues."
|
||||
```
|
||||
|
||||
This will:
|
||||
- Automatically split the text into chunks based on paragraph and sentence boundaries
|
||||
- Synthesize each chunk separately
|
||||
- Add 0.3 seconds of silence between chunks for natural pauses
|
||||
- Concatenate all chunks into a single audio file
|
||||
|
||||
**Note**: Automatic text chunking is disabled when using `--batch` mode. In batch mode, each text is processed as-is without chunking.
|
||||
|
||||
## Available Arguments
|
||||
|
||||
| Argument | Type | Default | Description |
|
||||
@@ -107,10 +129,12 @@ This will:
|
||||
| `-voice-style` | str | `assets/voice_styles/M1.json` | Voice style file path(s), comma-separated |
|
||||
| `-text` | str | (long default text) | Text(s) to synthesize, pipe-separated |
|
||||
| `-save-dir` | str | `results` | Output directory |
|
||||
| `--batch` | flag | false | Enable batch mode (multiple text-style pairs, disables automatic chunking) |
|
||||
|
||||
## Notes
|
||||
|
||||
- **Batch Processing**: The number of `-voice-style` files must match the number of `-text` entries
|
||||
- **Batch Processing**: When using `--batch`, the number of `-voice-style` files must match the number of `-text` entries
|
||||
- **Automatic Chunking**: Without `--batch`, long texts are automatically split and concatenated with 0.3s pauses
|
||||
- **Quality vs Speed**: Higher `-total-step` values produce better quality but take longer
|
||||
- **GPU Support**: GPU mode is not supported yet
|
||||
|
||||
|
||||
+45
-15
@@ -19,6 +19,7 @@ type Args struct {
|
||||
voiceStyle []string
|
||||
text []string
|
||||
saveDir string
|
||||
batch bool
|
||||
}
|
||||
|
||||
func parseArgs() *Args {
|
||||
@@ -29,6 +30,7 @@ func parseArgs() *Args {
|
||||
flag.IntVar(&args.totalStep, "total-step", 5, "Number of denoising steps")
|
||||
flag.IntVar(&args.nTest, "n-test", 4, "Number of times to generate")
|
||||
flag.StringVar(&args.saveDir, "save-dir", "results", "Output directory")
|
||||
flag.BoolVar(&args.batch, "batch", false, "Enable batch mode (multiple text-style pairs)")
|
||||
|
||||
var voiceStyleStr, textStr string
|
||||
flag.StringVar(&voiceStyleStr, "voice-style", "assets/voice_styles/M1.json", "Voice style file path(s), comma-separated")
|
||||
@@ -65,11 +67,14 @@ func main() {
|
||||
saveDir := args.saveDir
|
||||
voiceStylePaths := args.voiceStyle
|
||||
textList := args.text
|
||||
batch := args.batch
|
||||
|
||||
if len(voiceStylePaths) != len(textList) {
|
||||
fmt.Printf("Error: Number of voice styles (%d) must match number of texts (%d)\n",
|
||||
len(voiceStylePaths), len(textList))
|
||||
os.Exit(1)
|
||||
if batch {
|
||||
if len(voiceStylePaths) != len(textList) {
|
||||
fmt.Printf("Error: Number of voice styles (%d) must match number of texts (%d)\n",
|
||||
len(voiceStylePaths), len(textList))
|
||||
os.Exit(1)
|
||||
}
|
||||
}
|
||||
|
||||
bsz := len(voiceStylePaths)
|
||||
@@ -115,21 +120,46 @@ func main() {
|
||||
|
||||
var wav []float32
|
||||
var duration []float32
|
||||
Timer("Generating speech from text", func() interface{} {
|
||||
w, d, err := textToSpeech.Call(textList, style, totalStep)
|
||||
if err != nil {
|
||||
fmt.Printf("Error generating speech: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
wav = w
|
||||
duration = d
|
||||
return nil
|
||||
})
|
||||
|
||||
if batch {
|
||||
Timer("Generating speech from text", func() interface{} {
|
||||
w, d, err := textToSpeech.Batch(textList, style, totalStep)
|
||||
if err != nil {
|
||||
fmt.Printf("Error generating speech: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
wav = w
|
||||
duration = d
|
||||
return nil
|
||||
})
|
||||
} else {
|
||||
Timer("Generating speech from text", func() interface{} {
|
||||
w, d, err := textToSpeech.Call(textList[0], style, totalStep, 0.3)
|
||||
if err != nil {
|
||||
fmt.Printf("Error generating speech: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
wav = w
|
||||
duration = []float32{d}
|
||||
return nil
|
||||
})
|
||||
}
|
||||
|
||||
// Save outputs
|
||||
for i := 0; i < bsz; i++ {
|
||||
fname := fmt.Sprintf("%s_%d.wav", sanitizeFilename(textList[i], 20), n+1)
|
||||
wavOut := extractWavSegment(wav, duration[i], textToSpeech.SampleRate, i, bsz)
|
||||
var wavOut []float64
|
||||
|
||||
if batch {
|
||||
wavOut = extractWavSegment(wav, duration[i], textToSpeech.SampleRate, i, bsz)
|
||||
} else {
|
||||
// For non-batch mode, wav is a single concatenated audio
|
||||
wavLen := int(float32(textToSpeech.SampleRate) * duration[0])
|
||||
wavOut = make([]float64, wavLen)
|
||||
for j := 0; j < wavLen && j < len(wav); j++ {
|
||||
wavOut[j] = float64(wav[j])
|
||||
}
|
||||
}
|
||||
|
||||
outputPath := filepath.Join(saveDir, fname)
|
||||
if err := writeWavFile(outputPath, wavOut, textToSpeech.SampleRate); err != nil {
|
||||
|
||||
+219
-1
@@ -7,6 +7,8 @@ import (
|
||||
"math/rand"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/go-audio/audio"
|
||||
@@ -145,6 +147,184 @@ func (up *UnicodeProcessor) Call(textList []string) ([][]int64, [][][]float64) {
|
||||
return textIDs, textMask
|
||||
}
|
||||
|
||||
// Text chunking utilities
|
||||
// maxChunkLength is the default chunk size limit used by chunkText.
const maxChunkLength = 300

// abbreviations lists period-terminated tokens that splitSentences does not
// treat as the end of a sentence.
var abbreviations = []string{
	"Dr.",
	"Mr.",
	"Mrs.",
	"Ms.",
	"Prof.",
	"Sr.",
	"Jr.",
	"St.",
	"Ave.",
	"Rd.",
	"Blvd.",
	"Dept.",
	"Inc.",
	"Ltd.",
	"Co.",
	"Corp.",
	"etc.",
	"vs.",
	"i.e.",
	"e.g.",
	"Ph.D.",
}
|
||||
|
||||
func chunkText(text string, maxLen int) []string {
|
||||
if maxLen == 0 {
|
||||
maxLen = maxChunkLength
|
||||
}
|
||||
|
||||
text = strings.TrimSpace(text)
|
||||
if text == "" {
|
||||
return []string{""}
|
||||
}
|
||||
|
||||
// Split by paragraphs
|
||||
paragraphs := regexp.MustCompile(`\n\s*\n`).Split(text, -1)
|
||||
var chunks []string
|
||||
|
||||
for _, para := range paragraphs {
|
||||
para = strings.TrimSpace(para)
|
||||
if para == "" {
|
||||
continue
|
||||
}
|
||||
|
||||
if len(para) <= maxLen {
|
||||
chunks = append(chunks, para)
|
||||
continue
|
||||
}
|
||||
|
||||
// Split by sentences
|
||||
sentences := splitSentences(para)
|
||||
var current strings.Builder
|
||||
currentLen := 0
|
||||
|
||||
for _, sentence := range sentences {
|
||||
sentence = strings.TrimSpace(sentence)
|
||||
if sentence == "" {
|
||||
continue
|
||||
}
|
||||
|
||||
sentenceLen := len(sentence)
|
||||
if sentenceLen > maxLen {
|
||||
// If sentence is longer than maxLen, split by comma or space
|
||||
if current.Len() > 0 {
|
||||
chunks = append(chunks, strings.TrimSpace(current.String()))
|
||||
current.Reset()
|
||||
currentLen = 0
|
||||
}
|
||||
|
||||
// Try splitting by comma
|
||||
parts := strings.Split(sentence, ",")
|
||||
for _, part := range parts {
|
||||
part = strings.TrimSpace(part)
|
||||
if part == "" {
|
||||
continue
|
||||
}
|
||||
|
||||
partLen := len(part)
|
||||
if partLen > maxLen {
|
||||
// Split by space as last resort
|
||||
words := strings.Fields(part)
|
||||
var wordChunk strings.Builder
|
||||
wordChunkLen := 0
|
||||
|
||||
for _, word := range words {
|
||||
wordLen := len(word)
|
||||
if wordChunkLen+wordLen+1 > maxLen && wordChunk.Len() > 0 {
|
||||
chunks = append(chunks, strings.TrimSpace(wordChunk.String()))
|
||||
wordChunk.Reset()
|
||||
wordChunkLen = 0
|
||||
}
|
||||
|
||||
if wordChunk.Len() > 0 {
|
||||
wordChunk.WriteString(" ")
|
||||
wordChunkLen++
|
||||
}
|
||||
wordChunk.WriteString(word)
|
||||
wordChunkLen += wordLen
|
||||
}
|
||||
|
||||
if wordChunk.Len() > 0 {
|
||||
chunks = append(chunks, strings.TrimSpace(wordChunk.String()))
|
||||
}
|
||||
} else {
|
||||
if currentLen+partLen+1 > maxLen && current.Len() > 0 {
|
||||
chunks = append(chunks, strings.TrimSpace(current.String()))
|
||||
current.Reset()
|
||||
currentLen = 0
|
||||
}
|
||||
|
||||
if current.Len() > 0 {
|
||||
current.WriteString(", ")
|
||||
currentLen += 2
|
||||
}
|
||||
current.WriteString(part)
|
||||
currentLen += partLen
|
||||
}
|
||||
}
|
||||
continue
|
||||
}
|
||||
|
||||
if currentLen+sentenceLen+1 > maxLen && current.Len() > 0 {
|
||||
chunks = append(chunks, strings.TrimSpace(current.String()))
|
||||
current.Reset()
|
||||
currentLen = 0
|
||||
}
|
||||
|
||||
if current.Len() > 0 {
|
||||
current.WriteString(" ")
|
||||
currentLen++
|
||||
}
|
||||
current.WriteString(sentence)
|
||||
currentLen += sentenceLen
|
||||
}
|
||||
|
||||
if current.Len() > 0 {
|
||||
chunks = append(chunks, strings.TrimSpace(current.String()))
|
||||
}
|
||||
}
|
||||
|
||||
if len(chunks) == 0 {
|
||||
return []string{""}
|
||||
}
|
||||
|
||||
return chunks
|
||||
}
|
||||
|
||||
func splitSentences(text string) []string {
|
||||
// Go's regexp doesn't support lookbehind, so we use a simpler approach
|
||||
// Split on sentence boundaries and then check if they're abbreviations
|
||||
re := regexp.MustCompile(`([.!?])\s+`)
|
||||
|
||||
// Find all matches
|
||||
matches := re.FindAllStringIndex(text, -1)
|
||||
if len(matches) == 0 {
|
||||
return []string{text}
|
||||
}
|
||||
|
||||
var sentences []string
|
||||
lastEnd := 0
|
||||
|
||||
for _, match := range matches {
|
||||
// Get the text before the punctuation
|
||||
beforePunc := text[lastEnd:match[0]]
|
||||
|
||||
// Check if this ends with an abbreviation
|
||||
isAbbrev := false
|
||||
for _, abbrev := range abbreviations {
|
||||
if strings.HasSuffix(strings.TrimSpace(beforePunc+text[match[0]:match[0]+1]), abbrev) {
|
||||
isAbbrev = true
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if !isAbbrev {
|
||||
// This is a real sentence boundary
|
||||
sentences = append(sentences, text[lastEnd:match[1]])
|
||||
lastEnd = match[1]
|
||||
}
|
||||
}
|
||||
|
||||
// Add the remaining text
|
||||
if lastEnd < len(text) {
|
||||
sentences = append(sentences, text[lastEnd:])
|
||||
}
|
||||
|
||||
if len(sentences) == 0 {
|
||||
return []string{text}
|
||||
}
|
||||
|
||||
return sentences
|
||||
}
|
||||
|
||||
// Utility functions
|
||||
func preprocessText(text string) string {
|
||||
// Simple normalization (Go doesn't have built-in NFKD normalization)
|
||||
@@ -392,7 +572,7 @@ func (tts *TextToSpeech) sampleNoisyLatent(durOnnx []float32) ([][][]float64, []
|
||||
return noisyLatent, latentMask
|
||||
}
|
||||
|
||||
func (tts *TextToSpeech) Call(textList []string, style *Style, totalStep int) ([]float32, []float32, error) {
|
||||
func (tts *TextToSpeech) _infer(textList []string, style *Style, totalStep int) ([]float32, []float32, error) {
|
||||
bsz := len(textList)
|
||||
|
||||
// Process text
|
||||
@@ -510,6 +690,44 @@ func (tts *TextToSpeech) Call(textList []string, style *Style, totalStep int) ([
|
||||
return wav, durOnnx, nil
|
||||
}
|
||||
|
||||
// Call synthesizes speech from a single text with automatic chunking
|
||||
func (tts *TextToSpeech) Call(text string, style *Style, totalStep int, silenceDuration float32) ([]float32, float32, error) {
|
||||
chunks := chunkText(text, 0)
|
||||
|
||||
var wavCat []float32
|
||||
var durCat float32
|
||||
|
||||
for i, chunk := range chunks {
|
||||
wav, duration, err := tts._infer([]string{chunk}, style, totalStep)
|
||||
if err != nil {
|
||||
return nil, 0, err
|
||||
}
|
||||
|
||||
dur := duration[0]
|
||||
wavLen := int(float32(tts.SampleRate) * dur)
|
||||
wavChunk := wav[:wavLen]
|
||||
|
||||
if i == 0 {
|
||||
wavCat = wavChunk
|
||||
durCat = dur
|
||||
} else {
|
||||
silenceLen := int(silenceDuration * float32(tts.SampleRate))
|
||||
silence := make([]float32, silenceLen)
|
||||
|
||||
wavCat = append(wavCat, silence...)
|
||||
wavCat = append(wavCat, wavChunk...)
|
||||
durCat += silenceDuration + dur
|
||||
}
|
||||
}
|
||||
|
||||
return wavCat, durCat, nil
|
||||
}
|
||||
|
||||
// Batch synthesizes speech from multiple texts
|
||||
func (tts *TextToSpeech) Batch(textList []string, style *Style, totalStep int) ([]float32, []float32, error) {
|
||||
return tts._infer(textList, style, totalStep)
|
||||
}
|
||||
|
||||
func (tts *TextToSpeech) Destroy() {
|
||||
if tts.dpOrt != nil {
|
||||
tts.dpOrt.Destroy()
|
||||
|
||||
+40
-14
@@ -21,6 +21,7 @@ public class ExampleONNX {
|
||||
"This morning, I took a walk in the park, and the sound of the birds and the breeze was so pleasant that I stopped for a long time just to listen."
|
||||
);
|
||||
String saveDir = "results";
|
||||
boolean batch = false;
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -56,6 +57,9 @@ public class ExampleONNX {
|
||||
case "--save-dir":
|
||||
if (i + 1 < args.length) result.saveDir = args[++i];
|
||||
break;
|
||||
case "--batch":
|
||||
result.batch = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -76,10 +80,13 @@ public class ExampleONNX {
|
||||
String saveDir = parsedArgs.saveDir;
|
||||
List<String> voiceStylePaths = parsedArgs.voiceStyle;
|
||||
List<String> textList = parsedArgs.text;
|
||||
boolean batch = parsedArgs.batch;
|
||||
|
||||
if (voiceStylePaths.size() != textList.size()) {
|
||||
throw new RuntimeException("Number of voice styles (" + voiceStylePaths.size() +
|
||||
") must match number of texts (" + textList.size() + ")");
|
||||
if (batch) {
|
||||
if (voiceStylePaths.size() != textList.size()) {
|
||||
throw new RuntimeException("Number of voice styles (" + voiceStylePaths.size() +
|
||||
") must match number of texts (" + textList.size() + ")");
|
||||
}
|
||||
}
|
||||
|
||||
int bsz = voiceStylePaths.size();
|
||||
@@ -100,25 +107,44 @@ public class ExampleONNX {
|
||||
for (int n = 0; n < nTest; n++) {
|
||||
System.out.println("\n[" + (n + 1) + "/" + nTest + "] Starting synthesis...");
|
||||
|
||||
TTSResult ttsResult = Helper.timer("Generating speech from text", () -> {
|
||||
try {
|
||||
return textToSpeech.call(textList, style, totalStep, env);
|
||||
} catch (Exception e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
});
|
||||
TTSResult ttsResult;
|
||||
if (batch) {
|
||||
ttsResult = Helper.timer("Generating speech from text", () -> {
|
||||
try {
|
||||
return textToSpeech.batch(textList, style, totalStep, env);
|
||||
} catch (Exception e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
});
|
||||
} else {
|
||||
ttsResult = Helper.timer("Generating speech from text", () -> {
|
||||
try {
|
||||
return textToSpeech.call(textList.get(0), style, totalStep, 0.3f, env);
|
||||
} catch (Exception e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
float[] wav = ttsResult.wav;
|
||||
float[] duration = ttsResult.duration;
|
||||
|
||||
// Save outputs
|
||||
int wavLen = wav.length / bsz;
|
||||
for (int i = 0; i < bsz; i++) {
|
||||
String fname = Helper.sanitizeFilename(textList.get(i), 20) + "_" + (n + 1) + ".wav";
|
||||
int actualLen = (int) (textToSpeech.sampleRate * duration[i]);
|
||||
float[] wavOut;
|
||||
|
||||
float[] wavOut = new float[actualLen];
|
||||
System.arraycopy(wav, i * wavLen, wavOut, 0, Math.min(actualLen, wavLen));
|
||||
if (batch) {
|
||||
int wavLen = wav.length / bsz;
|
||||
int actualLen = (int) (textToSpeech.sampleRate * duration[i]);
|
||||
wavOut = new float[actualLen];
|
||||
System.arraycopy(wav, i * wavLen, wavOut, 0, Math.min(actualLen, wavLen));
|
||||
} else {
|
||||
// For non-batch mode, wav is a single concatenated audio
|
||||
int actualLen = (int) (textToSpeech.sampleRate * duration[0]);
|
||||
wavOut = new float[Math.min(actualLen, wav.length)];
|
||||
System.arraycopy(wav, 0, wavOut, 0, wavOut.length);
|
||||
}
|
||||
|
||||
String outputPath = saveDir + "/" + fname;
|
||||
Helper.writeWavFile(outputPath, wavOut, textToSpeech.sampleRate);
|
||||
|
||||
+204
-1
@@ -15,6 +15,8 @@ import java.nio.file.Files;
|
||||
import java.nio.file.Paths;
|
||||
import java.text.Normalizer;
|
||||
import java.util.*;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.regex.Matcher;
|
||||
|
||||
/**
|
||||
* Configuration classes
|
||||
@@ -152,7 +154,7 @@ class TextToSpeech {
|
||||
this.ldim = config.ttl.latentDim;
|
||||
}
|
||||
|
||||
public TTSResult call(List<String> textList, Style style, int totalStep, OrtEnvironment env)
|
||||
private TTSResult _infer(List<String> textList, Style style, int totalStep, OrtEnvironment env)
|
||||
throws OrtException {
|
||||
int bsz = textList.size();
|
||||
|
||||
@@ -296,6 +298,57 @@ class TextToSpeech {
|
||||
return new NoisyLatentResult(noisyLatent, latentMask);
|
||||
}
|
||||
|
||||
/**
|
||||
* Synthesize speech from a single text with automatic chunking
|
||||
*/
|
||||
public TTSResult call(String text, Style style, int totalStep, float silenceDuration, OrtEnvironment env)
|
||||
throws OrtException {
|
||||
List<String> chunks = Helper.chunkText(text, 0);
|
||||
|
||||
List<Float> wavCat = new ArrayList<>();
|
||||
float durCat = 0.0f;
|
||||
|
||||
for (int i = 0; i < chunks.size(); i++) {
|
||||
TTSResult result = _infer(Arrays.asList(chunks.get(i)), style, totalStep, env);
|
||||
|
||||
float dur = result.duration[0];
|
||||
int wavLen = (int) (sampleRate * dur);
|
||||
float[] wavChunk = new float[wavLen];
|
||||
System.arraycopy(result.wav, 0, wavChunk, 0, Math.min(wavLen, result.wav.length));
|
||||
|
||||
if (i == 0) {
|
||||
for (float val : wavChunk) {
|
||||
wavCat.add(val);
|
||||
}
|
||||
durCat = dur;
|
||||
} else {
|
||||
int silenceLen = (int) (silenceDuration * sampleRate);
|
||||
for (int j = 0; j < silenceLen; j++) {
|
||||
wavCat.add(0.0f);
|
||||
}
|
||||
for (float val : wavChunk) {
|
||||
wavCat.add(val);
|
||||
}
|
||||
durCat += silenceDuration + dur;
|
||||
}
|
||||
}
|
||||
|
||||
float[] wavArray = new float[wavCat.size()];
|
||||
for (int i = 0; i < wavCat.size(); i++) {
|
||||
wavArray[i] = wavCat.get(i);
|
||||
}
|
||||
|
||||
return new TTSResult(wavArray, new float[]{durCat});
|
||||
}
|
||||
|
||||
/**
|
||||
* Batch synthesize speech from multiple texts
|
||||
*/
|
||||
public TTSResult batch(List<String> textList, Style style, int totalStep, OrtEnvironment env)
|
||||
throws OrtException {
|
||||
return _infer(textList, style, totalStep, env);
|
||||
}
|
||||
|
||||
public void close() throws OrtException {
|
||||
if (dpSession != null) dpSession.close();
|
||||
if (textEncSession != null) textEncSession.close();
|
||||
@@ -353,6 +406,156 @@ class NoisyLatentResult {
|
||||
*/
|
||||
public class Helper {
|
||||
|
||||
private static final int MAX_CHUNK_LENGTH = 300;
|
||||
private static final String[] ABBREVIATIONS = {
|
||||
"Dr.", "Mr.", "Mrs.", "Ms.", "Prof.", "Sr.", "Jr.",
|
||||
"St.", "Ave.", "Rd.", "Blvd.", "Dept.", "Inc.", "Ltd.",
|
||||
"Co.", "Corp.", "etc.", "vs.", "i.e.", "e.g.", "Ph.D."
|
||||
};
|
||||
|
||||
/**
|
||||
* Chunk text into smaller segments based on paragraphs and sentences
|
||||
*/
|
||||
public static List<String> chunkText(String text, int maxLen) {
|
||||
if (maxLen == 0) {
|
||||
maxLen = MAX_CHUNK_LENGTH;
|
||||
}
|
||||
|
||||
text = text.trim();
|
||||
if (text.isEmpty()) {
|
||||
return Arrays.asList("");
|
||||
}
|
||||
|
||||
// Split by paragraphs
|
||||
String[] paragraphs = text.split("\\n\\s*\\n");
|
||||
List<String> chunks = new ArrayList<>();
|
||||
|
||||
for (String para : paragraphs) {
|
||||
para = para.trim();
|
||||
if (para.isEmpty()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (para.length() <= maxLen) {
|
||||
chunks.add(para);
|
||||
continue;
|
||||
}
|
||||
|
||||
// Split by sentences
|
||||
List<String> sentences = splitSentences(para);
|
||||
StringBuilder current = new StringBuilder();
|
||||
int currentLen = 0;
|
||||
|
||||
for (String sentence : sentences) {
|
||||
sentence = sentence.trim();
|
||||
if (sentence.isEmpty()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
int sentenceLen = sentence.length();
|
||||
if (sentenceLen > maxLen) {
|
||||
// If sentence is longer than maxLen, split by comma or space
|
||||
if (current.length() > 0) {
|
||||
chunks.add(current.toString().trim());
|
||||
current.setLength(0);
|
||||
currentLen = 0;
|
||||
}
|
||||
|
||||
// Try splitting by comma
|
||||
String[] parts = sentence.split(",");
|
||||
for (String part : parts) {
|
||||
part = part.trim();
|
||||
if (part.isEmpty()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
int partLen = part.length();
|
||||
if (partLen > maxLen) {
|
||||
// Split by space as last resort
|
||||
String[] words = part.split("\\s+");
|
||||
StringBuilder wordChunk = new StringBuilder();
|
||||
int wordChunkLen = 0;
|
||||
|
||||
for (String word : words) {
|
||||
int wordLen = word.length();
|
||||
if (wordChunkLen + wordLen + 1 > maxLen && wordChunk.length() > 0) {
|
||||
chunks.add(wordChunk.toString().trim());
|
||||
wordChunk.setLength(0);
|
||||
wordChunkLen = 0;
|
||||
}
|
||||
|
||||
if (wordChunk.length() > 0) {
|
||||
wordChunk.append(" ");
|
||||
wordChunkLen++;
|
||||
}
|
||||
wordChunk.append(word);
|
||||
wordChunkLen += wordLen;
|
||||
}
|
||||
|
||||
if (wordChunk.length() > 0) {
|
||||
chunks.add(wordChunk.toString().trim());
|
||||
}
|
||||
} else {
|
||||
if (currentLen + partLen + 1 > maxLen && current.length() > 0) {
|
||||
chunks.add(current.toString().trim());
|
||||
current.setLength(0);
|
||||
currentLen = 0;
|
||||
}
|
||||
|
||||
if (current.length() > 0) {
|
||||
current.append(", ");
|
||||
currentLen += 2;
|
||||
}
|
||||
current.append(part);
|
||||
currentLen += partLen;
|
||||
}
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
if (currentLen + sentenceLen + 1 > maxLen && current.length() > 0) {
|
||||
chunks.add(current.toString().trim());
|
||||
current.setLength(0);
|
||||
currentLen = 0;
|
||||
}
|
||||
|
||||
if (current.length() > 0) {
|
||||
current.append(" ");
|
||||
currentLen++;
|
||||
}
|
||||
current.append(sentence);
|
||||
currentLen += sentenceLen;
|
||||
}
|
||||
|
||||
if (current.length() > 0) {
|
||||
chunks.add(current.toString().trim());
|
||||
}
|
||||
}
|
||||
|
||||
if (chunks.isEmpty()) {
|
||||
return Arrays.asList("");
|
||||
}
|
||||
|
||||
return chunks;
|
||||
}
|
||||
|
||||
/**
|
||||
* Split text into sentences, avoiding common abbreviations
|
||||
*/
|
||||
private static List<String> splitSentences(String text) {
|
||||
// Build pattern that avoids abbreviations
|
||||
StringBuilder abbrevPattern = new StringBuilder();
|
||||
for (int i = 0; i < ABBREVIATIONS.length; i++) {
|
||||
if (i > 0) abbrevPattern.append("|");
|
||||
abbrevPattern.append(Pattern.quote(ABBREVIATIONS[i]));
|
||||
}
|
||||
|
||||
// Match sentence endings, but not abbreviations
|
||||
String patternStr = "(?<!(?:" + abbrevPattern.toString() + "))(?<=[.!?])\\s+";
|
||||
Pattern pattern = Pattern.compile(patternStr);
|
||||
return Arrays.asList(pattern.split(text));
|
||||
}
|
||||
|
||||
/**
|
||||
* Load voice style from JSON files
|
||||
*/
|
||||
|
||||
+24
-3
@@ -2,6 +2,10 @@
|
||||
|
||||
This guide provides examples for running TTS inference using `ExampleONNX.java`.
|
||||
|
||||
## 📰 Update News
|
||||
|
||||
**2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
|
||||
|
||||
## Installation
|
||||
|
||||
This project uses [Maven](https://maven.apache.org/) for dependency management.
|
||||
@@ -35,7 +39,7 @@ This will use:
|
||||
### Example 2: Batch Inference
|
||||
Process multiple voice styles and texts at once:
|
||||
```bash
|
||||
mvn exec:java -Dexec.args="--voice-style assets/voice_styles/M1.json,assets/voice_styles/F1.json --text 'The sun sets behind the mountains, painting the sky in shades of pink and orange.|The weather is beautiful and sunny outside. A gentle breeze makes the air feel fresh and pleasant.'"
|
||||
mvn exec:java -Dexec.args="--batch --voice-style assets/voice_styles/M1.json,assets/voice_styles/F1.json --text 'The sun sets behind the mountains, painting the sky in shades of pink and orange.|The weather is beautiful and sunny outside. A gentle breeze makes the air feel fresh and pleasant.'"
|
||||
```
|
||||
|
||||
This will:
|
||||
@@ -54,7 +58,22 @@ This will:
|
||||
- Use 10 denoising steps instead of the default 5
|
||||
- Produce higher quality output at the cost of slower inference
|
||||
|
||||
**Note**: If your text contains apostrophes, use escaping or run the JAR directly:
|
||||
### Example 4: Long-Form Inference
|
||||
The system automatically chunks long texts into manageable segments, synthesizes each segment separately, and concatenates them with natural pauses (0.3 seconds by default) into a single audio file. This happens by default when you don't use the `--batch` flag:
|
||||
|
||||
```bash
|
||||
mvn exec:java -Dexec.args="--voice-style assets/voice_styles/M1.json --text 'This is a very long text that will be automatically split into multiple chunks. The system will process each chunk separately and then concatenate them together with natural pauses between segments. This ensures that even very long texts can be processed efficiently while maintaining natural speech flow and avoiding memory issues.'"
|
||||
```
|
||||
|
||||
This will:
|
||||
- Automatically split the text into chunks based on paragraph and sentence boundaries
|
||||
- Synthesize each chunk separately
|
||||
- Add 0.3 seconds of silence between chunks for natural pauses
|
||||
- Concatenate all chunks into a single audio file
|
||||
|
||||
**Note**: Automatic text chunking is disabled when using `--batch` mode. In batch mode, each text is processed as-is without chunking.
|
||||
|
||||
**Tip**: If your text contains apostrophes, use escaping or run the JAR directly:
|
||||
```bash
|
||||
java -jar target/tts-example.jar --total-step 10 --text "Text with apostrophe's here"
|
||||
```
|
||||
@@ -87,10 +106,12 @@ java -jar target/tts-example.jar --total-step 10 --text "Your custom text here"
|
||||
| `--voice-style` | str+ | `assets/voice_styles/M1.json` | Voice style file path(s) |
|
||||
| `--text` | str+ | (long default text) | Text(s) to synthesize |
|
||||
| `--save-dir` | str | `results` | Output directory |
|
||||
| `--batch` | flag | False | Enable batch mode (multiple text-style pairs, disables automatic chunking) |
|
||||
|
||||
## Notes
|
||||
|
||||
- **Batch Processing**: The number of `--voice-style` files must match the number of `--text` entries
|
||||
- **Batch Processing**: When using `--batch`, the number of `--voice-style` files must match the number of `--text` entries
|
||||
- **Automatic Chunking**: Without `--batch`, long texts are automatically split and concatenated with 0.3s pauses
|
||||
- **Quality vs Speed**: Higher `--total-step` values produce better quality but take longer
|
||||
- **GPU Support**: GPU mode is not supported yet
|
||||
- **Voice Styles**: Uses pre-extracted voice style JSON files for fast inference
|
||||
|
||||
+26
-2
@@ -2,6 +2,10 @@
|
||||
|
||||
Node.js implementation for TTS inference. Uses ONNX Runtime to generate speech from text.
|
||||
|
||||
## 📰 Update News
|
||||
|
||||
**2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
|
||||
|
||||
## Requirements
|
||||
|
||||
- Node.js v16 or higher
|
||||
@@ -39,14 +43,16 @@ Process multiple voice styles and texts at once:
|
||||
```bash
|
||||
node example_onnx.js \
|
||||
--voice-style "assets/voice_styles/M1.json,assets/voice_styles/F1.json" \
|
||||
--text "The sun sets behind the mountains, painting the sky in shades of pink and orange.|The weather is beautiful and sunny outside. A gentle breeze makes the air feel fresh and pleasant."
|
||||
--text "The sun sets behind the mountains, painting the sky in shades of pink and orange.|The weather is beautiful and sunny outside. A gentle breeze makes the air feel fresh and pleasant." \
|
||||
--batch
|
||||
```
|
||||
|
||||
This will:
|
||||
- Use `--batch` flag to enable batch processing mode
|
||||
- Generate speech for 2 different voice-text pairs
|
||||
- Use male voice style (M1.json) for the first text
|
||||
- Use female voice style (F1.json) for the second text
|
||||
- Process both samples in a single batch
|
||||
- Process both samples in a single batch (automatic text chunking disabled)
|
||||
|
||||
### Example 3: High Quality Inference
|
||||
Increase denoising steps for better quality:
|
||||
@@ -61,6 +67,22 @@ This will:
|
||||
- Use 10 denoising steps instead of the default 5
|
||||
- Produce higher quality output at the cost of slower inference
|
||||
|
||||
### Example 4: Long-Form Inference
|
||||
For long texts, the system automatically chunks the text into manageable segments and generates a single audio file:
|
||||
```bash
|
||||
node example_onnx.js \
|
||||
--voice-style "assets/voice_styles/M1.json" \
|
||||
--text "Once upon a time, in a small village nestled between rolling hills, there lived a young artist named Clara. Every morning, she would wake up before dawn to capture the first light of day. The golden rays streaming through her window inspired countless paintings. Her work was known throughout the region for its vibrant colors and emotional depth. People from far and wide came to see her gallery, and many said her paintings could tell stories that words never could."
|
||||
```
|
||||
|
||||
This will:
|
||||
- Automatically split the long text into smaller chunks (max 300 characters by default)
|
||||
- Process each chunk separately while maintaining natural speech flow
|
||||
- Insert brief silences (0.3 seconds) between chunks for natural pacing
|
||||
- Combine all chunks into a single output audio file
|
||||
|
||||
**Note**: When using batch mode (`--batch`), automatic text chunking is disabled. Use non-batch mode for long-form text synthesis.
|
||||
|
||||
## Available Arguments
|
||||
|
||||
| Argument | Type | Default | Description |
|
||||
@@ -72,10 +94,12 @@ This will:
|
||||
| `--voice-style` | str+ | `assets/voice_styles/M1.json` | Voice style file path(s). Separate multiple files with commas |
|
||||
| `--text` | str+ | (long default text) | Text(s) to synthesize. Separate multiple texts with pipes |
|
||||
| `--save-dir` | str | `results` | Output directory |
|
||||
| `--batch` | flag | False | Enable batch mode (disables automatic text chunking) |
|
||||
|
||||
## Notes
|
||||
|
||||
- **Batch Processing**: The number of voice style files must match the number of texts. Use commas to separate files and pipes to separate texts
|
||||
- **Long-Form Inference**: Without `--batch` flag, long texts are automatically chunked and combined into a single audio file with natural pauses
|
||||
- **Quality vs Speed**: Higher `--total-step` values produce better quality but take longer
|
||||
- **GPU Support**: GPU mode is not supported yet
|
||||
|
||||
|
||||
+10
-3
@@ -18,13 +18,16 @@ function parseArgs() {
|
||||
nTest: 4,
|
||||
voiceStyle: ['assets/voice_styles/M1.json'],
|
||||
text: ['This morning, I took a walk in the park, and the sound of the birds and the breeze was so pleasant that I stopped for a long time just to listen.'],
|
||||
saveDir: 'results'
|
||||
saveDir: 'results',
|
||||
batch: false
|
||||
};
|
||||
|
||||
for (let i = 2; i < process.argv.length; i++) {
|
||||
const arg = process.argv[i];
|
||||
if (arg === '--use-gpu') {
|
||||
args.useGpu = true;
|
||||
} else if (arg === '--batch') {
|
||||
args.batch = true;
|
||||
} else if (arg === '--onnx-dir' && i + 1 < process.argv.length) {
|
||||
args.onnxDir = process.argv[++i];
|
||||
} else if (arg === '--total-step' && i + 1 < process.argv.length) {
|
||||
@@ -56,11 +59,11 @@ async function main() {
|
||||
const saveDir = args.saveDir;
|
||||
const voiceStylePaths = args.voiceStyle.map(p => path.resolve(__dirname, p));
|
||||
const textList = args.text;
|
||||
const batch = args.batch;
|
||||
|
||||
if (voiceStylePaths.length !== textList.length) {
|
||||
throw new Error(`Number of voice styles (${voiceStylePaths.length}) must match number of texts (${textList.length})`);
|
||||
}
|
||||
|
||||
const bsz = voiceStylePaths.length;
|
||||
|
||||
// --- 2. Load Text to Speech --- //
|
||||
@@ -75,7 +78,11 @@ async function main() {
|
||||
console.log(`\n[${n + 1}/${nTest}] Starting synthesis...`);
|
||||
|
||||
const { wav, duration } = await timer('Generating speech from text', async () => {
|
||||
return await textToSpeech.call(textList, style, totalStep);
|
||||
if (batch) {
|
||||
return await textToSpeech.batch(textList, style, totalStep);
|
||||
} else {
|
||||
return await textToSpeech.call(textList[0], style, totalStep);
|
||||
}
|
||||
});
|
||||
|
||||
if (!fs.existsSync(saveDir)) {
|
||||
|
||||
+72
-1
@@ -114,7 +114,7 @@ class TextToSpeech {
|
||||
return { noisyLatent, latentMask };
|
||||
}
|
||||
|
||||
async call(textList, style, totalStep) {
|
||||
async _infer(textList, style, totalStep) {
|
||||
if (textList.length !== style.ttl.dims[0]) {
|
||||
throw new Error('Number of texts must match number of style vectors');
|
||||
}
|
||||
@@ -184,6 +184,35 @@ class TextToSpeech {
|
||||
const wav = Array.from(vocoderResult.wav_tts.data);
|
||||
return { wav, duration: durOnnx };
|
||||
}
|
||||
|
||||
async call(text, style, totalStep, silenceDuration = 0.3) {
|
||||
if (style.ttl.dims[0] !== 1) {
|
||||
throw new Error('Single speaker text to speech only supports single style');
|
||||
}
|
||||
const textList = chunkText(text);
|
||||
let wavCat = null;
|
||||
let durCat = 0;
|
||||
|
||||
for (const chunk of textList) {
|
||||
const { wav, duration } = await this._infer([chunk], style, totalStep);
|
||||
|
||||
if (wavCat === null) {
|
||||
wavCat = wav;
|
||||
durCat = duration[0];
|
||||
} else {
|
||||
const silenceLen = Math.floor(silenceDuration * this.sampleRate);
|
||||
const silence = new Array(silenceLen).fill(0);
|
||||
wavCat = [...wavCat, ...silence, ...wav];
|
||||
durCat += duration[0] + silenceDuration;
|
||||
}
|
||||
}
|
||||
|
||||
return { wav: wavCat, duration: [durCat] };
|
||||
}
|
||||
|
||||
async batch(textList, style, totalStep) {
|
||||
return await this._infer(textList, style, totalStep);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -390,3 +419,45 @@ export async function timer(name, fn) {
|
||||
console.log(` -> ${name} completed in ${elapsed} sec`);
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
 * Chunk text into manageable segments.
 *
 * Paragraphs (separated by blank lines) are split into sentences, which are
 * packed greedily into chunks of at most `maxLen` characters. A sentence that
 * alone exceeds `maxLen` is split at commas, and a comma-separated part that
 * is still too long is split at whitespace. A single word longer than
 * `maxLen` is kept intact.
 *
 * @param {string} text - Input text.
 * @param {number} [maxLen=300] - Maximum chunk length in characters.
 * @returns {string[]} Chunks in reading order; empty when the text is blank.
 * @throws {Error} If `text` is not a string.
 */
function chunkText(text, maxLen = 300) {
    if (typeof text !== 'string') {
        throw new Error(`chunkText expects a string, got ${typeof text}`);
    }

    // Last-resort splitter for a sentence that alone exceeds maxLen.
    const splitOversized = (segment) => {
        const pieces = [];
        let current = "";
        const flush = () => {
            if (current) {
                pieces.push(current);
                current = "";
            }
        };

        const parts = segment.split(",").map(p => p.trim()).filter(Boolean);
        for (const part of parts) {
            if (part.length > maxLen) {
                // Emit earlier parts first so the output stays in order.
                flush();
                for (const word of part.split(/\s+/)) {
                    if (current && current.length + 1 + word.length > maxLen) {
                        flush();
                    }
                    current += (current ? " " : "") + word;
                }
                flush();
            } else {
                if (current && current.length + 2 + part.length > maxLen) {
                    flush();
                }
                current += (current ? ", " : "") + part;
            }
        }
        flush();

        // Never drop input: a segment made only of commas stays as-is.
        return pieces.length > 0 ? pieces : [segment];
    };

    // Sentence boundary: whitespace after . ! or ?, excluding common
    // abbreviations (Mr., Dr., etc.) and single capital initials like "F.".
    const sentenceBoundary = /(?<!Mr\.|Mrs\.|Ms\.|Dr\.|Prof\.|Sr\.|Jr\.|Ph\.D\.|etc\.|e\.g\.|i\.e\.|vs\.|Inc\.|Ltd\.|Co\.|Corp\.|St\.|Ave\.|Blvd\.)(?<!\b[A-Z]\.)(?<=[.!?])\s+/;

    // Split by paragraph (two or more newlines)
    const paragraphs = text.trim().split(/\n\s*\n+/).filter(p => p.trim());

    const chunks = [];

    for (let paragraph of paragraphs) {
        paragraph = paragraph.trim();
        if (!paragraph) continue;

        const sentences = paragraph.split(sentenceBoundary);

        let currentChunk = "";

        for (const sentence of sentences) {
            if (currentChunk.length + sentence.length + 1 <= maxLen) {
                currentChunk += (currentChunk ? " " : "") + sentence;
                continue;
            }

            if (currentChunk) {
                chunks.push(currentChunk.trim());
            }

            if (sentence.length > maxLen) {
                // Too long for any chunk: break it up further.
                chunks.push(...splitOversized(sentence));
                currentChunk = "";
            } else {
                currentChunk = sentence;
            }
        }

        if (currentChunk) {
            chunks.push(currentChunk.trim());
        }
    }

    return chunks;
}
|
||||
|
||||
+26
-2
@@ -2,6 +2,10 @@
|
||||
|
||||
This guide provides examples for running TTS inference using `example_onnx.py`.
|
||||
|
||||
## 📰 Update News
|
||||
|
||||
**2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
|
||||
|
||||
## Installation
|
||||
|
||||
This project uses [uv](https://docs.astral.sh/uv/) for fast package management.
|
||||
@@ -41,14 +45,16 @@ Process multiple voice styles and texts at once:
|
||||
```bash
|
||||
uv run example_onnx.py \
|
||||
--voice-style assets/voice_styles/M1.json assets/voice_styles/F1.json \
|
||||
--text "The sun sets behind the mountains, painting the sky in shades of pink and orange." "The weather is beautiful and sunny outside. A gentle breeze makes the air feel fresh and pleasant."
|
||||
--text "The sun sets behind the mountains, painting the sky in shades of pink and orange." "The weather is beautiful and sunny outside. A gentle breeze makes the air feel fresh and pleasant." \
|
||||
--batch
|
||||
```
|
||||
|
||||
This will:
|
||||
- Use `--batch` flag to enable batch processing mode
|
||||
- Generate speech for 2 different voice-text pairs
|
||||
- Use male voice style (M1.json) for the first text
|
||||
- Use female voice style (F1.json) for the second text
|
||||
- Process both samples in a single batch
|
||||
- Process both samples in a single batch (automatic text chunking disabled)
|
||||
|
||||
### Example 3: High Quality Inference
|
||||
Increase denoising steps for better quality:
|
||||
@@ -63,6 +69,22 @@ This will:
|
||||
- Use 10 denoising steps instead of the default 5
|
||||
- Produce higher quality output at the cost of slower inference
|
||||
|
||||
### Example 4: Long-Form Inference
|
||||
For long texts, the system automatically chunks the text into manageable segments and generates a single audio file:
|
||||
```bash
|
||||
uv run example_onnx.py \
|
||||
--voice-style assets/voice_styles/M1.json \
|
||||
--text "Once upon a time, in a small village nestled between rolling hills, there lived a young artist named Clara. Every morning, she would wake up before dawn to capture the first light of day. The golden rays streaming through her window inspired countless paintings. Her work was known throughout the region for its vibrant colors and emotional depth. People from far and wide came to see her gallery, and many said her paintings could tell stories that words never could."
|
||||
```
|
||||
|
||||
This will:
|
||||
- Automatically split the long text into smaller chunks (max 300 characters by default)
|
||||
- Process each chunk separately while maintaining natural speech flow
|
||||
- Insert brief silences (0.3 seconds) between chunks for natural pacing
|
||||
- Combine all chunks into a single output audio file
|
||||
|
||||
**Note**: When using batch mode (`--batch`), automatic text chunking is disabled. Use non-batch mode for long-form text synthesis.
|
||||
|
||||
## Available Arguments
|
||||
|
||||
| Argument | Type | Default | Description |
|
||||
@@ -74,10 +96,12 @@ This will:
|
||||
| `--voice-style` | str+ | `assets/voice_styles/M1.json` | Voice style file path(s) |
|
||||
| `--text` | str+ | (long default text) | Text(s) to synthesize |
|
||||
| `--save-dir` | str | `results` | Output directory |
|
||||
| `--batch` | flag | False | Enable batch mode (disables automatic text chunking) |
|
||||
|
||||
## Notes
|
||||
|
||||
- **Batch Processing**: The number of `--voice-style` files must match the number of `--text` entries
|
||||
- **Long-Form Inference**: Without `--batch` flag, long texts are automatically chunked and combined into a single audio file with natural pauses
|
||||
- **Quality vs Speed**: Higher `--total-step` values produce better quality but take longer
|
||||
- **GPU Support**: GPU mode is not supported yet
|
||||
|
||||
|
||||
+8
-2
@@ -30,6 +30,9 @@ def parse_args():
|
||||
"--n-test", type=int, default=4, help="Number of times to generate"
|
||||
)
|
||||
|
||||
# Batch processing
|
||||
parser.add_argument("--batch", action="store_true", help="Batch processing")
|
||||
|
||||
# Input/Output
|
||||
parser.add_argument(
|
||||
"--voice-style",
|
||||
@@ -63,11 +66,11 @@ n_test = args.n_test
|
||||
save_dir = args.save_dir
|
||||
voice_style_paths = args.voice_style
|
||||
text_list = args.text
|
||||
batch = args.batch
|
||||
|
||||
assert len(voice_style_paths) == len(
|
||||
text_list
|
||||
), f"Number of voice styles ({len(voice_style_paths)}) must match number of texts ({len(text_list)})"
|
||||
|
||||
bsz = len(voice_style_paths)
|
||||
|
||||
# --- 2. Load Text to Speech --- #
|
||||
@@ -80,7 +83,10 @@ style = load_voice_style(voice_style_paths, verbose=True)
|
||||
for n in range(n_test):
|
||||
print(f"\n[{n+1}/{n_test}] Starting synthesis...")
|
||||
with timer("Generating speech from text"):
|
||||
wav, duration = text_to_speech(text_list, style, total_step)
|
||||
if batch:
|
||||
wav, duration = text_to_speech.batch(text_list, style, total_step)
|
||||
else:
|
||||
wav, duration = text_to_speech(text_list[0], style, total_step)
|
||||
if not os.path.exists(save_dir):
|
||||
os.makedirs(save_dir)
|
||||
for b in range(bsz):
|
||||
|
||||
+72
-1
@@ -85,7 +85,7 @@ class TextToSpeech:
|
||||
noisy_latent = noisy_latent * latent_mask
|
||||
return noisy_latent, latent_mask
|
||||
|
||||
def __call__(
|
||||
def _infer(
|
||||
self, text_list: list[str], style: Style, total_step: int
|
||||
) -> tuple[np.ndarray, np.ndarray]:
|
||||
assert (
|
||||
@@ -119,6 +119,33 @@ class TextToSpeech:
|
||||
wav, *_ = self.vocoder_ort.run(None, {"latent": xt})
|
||||
return wav, dur_onnx
|
||||
|
||||
def __call__(
    self, text: str, style: Style, total_step: int, silence_duration: float = 0.3
) -> tuple[np.ndarray, np.ndarray]:
    """
    Synthesize a single (possibly long) text with one voice style.

    The text is split with `chunk_text`, each chunk is synthesized on its own
    through `_infer`, and the pieces are joined with `silence_duration` seconds
    of zeros between consecutive chunks.

    Args:
        text: Input text to synthesize
        style: Voice style; must hold exactly one style (`style.ttl.shape[0] == 1`)
        total_step: Forwarded unchanged to `_infer`
        silence_duration: Seconds of silence inserted between chunks (default: 0.3)

    Returns:
        Tuple of (concatenated waveform, accumulated duration including the
        inserted silences)
    """
    assert (
        style.ttl.shape[0] == 1
    ), "Single speaker text to speech only supports single style"

    # chunk_text returns [] for blank input; fall back to one empty chunk (as the
    # Rust/Swift ports do) so this method never returns (None, None).
    chunks = chunk_text(text) or [text.strip()]

    # The gap is identical for every boundary, so build it once.
    # NOTE(review): assumes `_infer` returns wav shaped (1, n_samples) - confirm.
    silence = np.zeros((1, int(silence_duration * self.sample_rate)), dtype=np.float32)

    wav_parts = []
    dur_cat = None
    for chunk in chunks:
        wav, dur_onnx = self._infer([chunk], style, total_step)
        if dur_cat is None:
            dur_cat = dur_onnx
        else:
            wav_parts.append(silence)
            # Build a new array instead of `+=` so the first chunk's duration
            # array returned by `_infer` is not mutated in place.
            dur_cat = dur_cat + (dur_onnx + silence_duration)
        wav_parts.append(wav)

    # One concatenation at the end avoids re-copying the growing waveform per chunk.
    return np.concatenate(wav_parts, axis=1), dur_cat
|
||||
|
||||
def batch(
    self, text_list: list[str], style: Style, total_step: int
) -> tuple[np.ndarray, np.ndarray]:
    """
    Batched synthesis entry point.

    Forwards the texts to `_infer` as-is; no chunking or silence insertion
    is applied on this path.

    Returns:
        The (wav, duration) pair produced by `_infer`
    """
    wav, dur_onnx = self._infer(text_list, style, total_step)
    return wav, dur_onnx
|
||||
|
||||
|
||||
def length_to_mask(lengths: np.ndarray, max_len: Optional[int] = None) -> np.ndarray:
|
||||
"""
|
||||
@@ -247,3 +274,47 @@ def sanitize_filename(text: str, max_len: int) -> str:
|
||||
|
||||
prefix = text[:max_len]
|
||||
return re.sub(r"[^a-zA-Z0-9]", "_", prefix)
|
||||
|
||||
|
||||
def _split_long_sentence(sentence: str, max_len: int) -> list[str]:
    """Break one over-long sentence into pieces of at most `max_len` characters.

    Comma-separated parts are packed together first; a comma-free part that is
    still too long is packed word by word. A single word longer than `max_len`
    is kept whole, since there is no safe place left to cut it.
    """
    pieces = []
    current = ""

    for part in sentence.split(","):
        part = part.strip()
        if not part:
            continue

        if len(part) > max_len:
            # Emit what was collected so far first, so pieces stay in text order.
            if current:
                pieces.append(current)
                current = ""

            word_chunk = ""
            for word in part.split():
                if word_chunk and len(word_chunk) + 1 + len(word) > max_len:
                    pieces.append(word_chunk)
                    word_chunk = word
                else:
                    word_chunk = f"{word_chunk} {word}" if word_chunk else word
            if word_chunk:
                pieces.append(word_chunk)
        elif current and len(current) + 2 + len(part) > max_len:
            # +2 accounts for the ", " separator that would be re-inserted.
            pieces.append(current)
            current = part
        else:
            current = f"{current}, {part}" if current else part

    if current:
        pieces.append(current)

    return pieces


def chunk_text(text: str, max_len: int = 300) -> list[str]:
    """
    Split text into chunks by paragraphs and sentences.

    Paragraphs (separated by blank lines) are never merged. Inside a paragraph,
    consecutive sentences are packed into a chunk while it stays within
    `max_len` characters. A sentence that is longer than `max_len` on its own
    is further split on commas and then on whitespace.

    Args:
        text: Input text to chunk
        max_len: Maximum length of each chunk (default: 300)

    Returns:
        List of text chunks (empty list when `text` is blank)
    """
    import re

    # Sentence boundary: whitespace preceded by . ! or ?, unless that punctuation
    # closes a common abbreviation (Mr., Dr., etc.) or a single capital letter.
    # Loop-invariant, so it is defined once rather than per paragraph.
    pattern = r"(?<!Mr\.)(?<!Mrs\.)(?<!Ms\.)(?<!Dr\.)(?<!Prof\.)(?<!Sr\.)(?<!Jr\.)(?<!Ph\.D\.)(?<!etc\.)(?<!e\.g\.)(?<!i\.e\.)(?<!vs\.)(?<!Inc\.)(?<!Ltd\.)(?<!Co\.)(?<!Corp\.)(?<!St\.)(?<!Ave\.)(?<!Blvd\.)(?<!\b[A-Z]\.)(?<=[.!?])\s+"

    # Split by paragraph (two or more newlines)
    paragraphs = [p.strip() for p in re.split(r"\n\s*\n+", text.strip()) if p.strip()]

    chunks = []

    for paragraph in paragraphs:
        current_chunk = ""

        for sentence in re.split(pattern, paragraph):
            sentence = sentence.strip()
            if not sentence:
                continue

            if len(sentence) > max_len:
                # Flush first so the pieces of the long sentence stay in order.
                if current_chunk:
                    chunks.append(current_chunk)
                    current_chunk = ""
                chunks.extend(_split_long_sentence(sentence, max_len))
                continue

            if current_chunk and len(current_chunk) + 1 + len(sentence) > max_len:
                chunks.append(current_chunk)
                current_chunk = sentence
            else:
                current_chunk = (
                    f"{current_chunk} {sentence}" if current_chunk else sentence
                )

        if current_chunk:
            chunks.append(current_chunk)

    return chunks
|
||||
|
||||
@@ -32,6 +32,9 @@ anyhow = "1.0"
|
||||
# Unicode normalization
|
||||
unicode-normalization = "0.1"
|
||||
|
||||
# Regular expressions
|
||||
regex = "1.10"
|
||||
|
||||
# System calls
|
||||
libc = "0.2"
|
||||
|
||||
|
||||
+32
-1
@@ -2,6 +2,10 @@
|
||||
|
||||
This guide provides examples for running TTS inference using Rust.
|
||||
|
||||
## 📰 Update News
|
||||
|
||||
**2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
|
||||
|
||||
## Installation
|
||||
|
||||
This project uses [Cargo](https://doc.rust-lang.org/cargo/) for package management.
|
||||
@@ -44,11 +48,13 @@ Process multiple voice styles and texts at once:
|
||||
```bash
|
||||
# Using cargo run
|
||||
cargo run --release --bin example_onnx -- \
|
||||
--batch \
|
||||
--voice-style assets/voice_styles/M1.json,assets/voice_styles/F1.json \
|
||||
--text "The sun sets behind the mountains, painting the sky in shades of pink and orange.|The weather is beautiful and sunny outside. A gentle breeze makes the air feel fresh and pleasant."
|
||||
|
||||
# Or using the binary directly
|
||||
./target/release/example_onnx \
|
||||
--batch \
|
||||
--voice-style assets/voice_styles/M1.json,assets/voice_styles/F1.json \
|
||||
--text "The sun sets behind the mountains, painting the sky in shades of pink and orange.|The weather is beautiful and sunny outside. A gentle breeze makes the air feel fresh and pleasant."
|
||||
```
|
||||
@@ -79,6 +85,29 @@ This will:
|
||||
- Use 10 denoising steps instead of the default 5
|
||||
- Produce higher quality output at the cost of slower inference
|
||||
|
||||
### Example 4: Long-Form Inference
|
||||
The system automatically chunks long texts into manageable segments, synthesizes each segment separately, and concatenates them with natural pauses (0.3 seconds by default) into a single audio file. This happens by default when you don't use the `--batch` flag:
|
||||
|
||||
```bash
|
||||
# Using cargo run
|
||||
cargo run --release --bin example_onnx -- \
|
||||
--voice-style assets/voice_styles/M1.json \
|
||||
--text "This is a very long text that will be automatically split into multiple chunks. The system will process each chunk separately and then concatenate them together with natural pauses between segments. This ensures that even very long texts can be processed efficiently while maintaining natural speech flow and avoiding memory issues."
|
||||
|
||||
# Or using the binary directly
|
||||
./target/release/example_onnx \
|
||||
--voice-style assets/voice_styles/M1.json \
|
||||
--text "This is a very long text that will be automatically split into multiple chunks. The system will process each chunk separately and then concatenate them together with natural pauses between segments. This ensures that even very long texts can be processed efficiently while maintaining natural speech flow and avoiding memory issues."
|
||||
```
|
||||
|
||||
This will:
|
||||
- Automatically split the text into chunks based on paragraph and sentence boundaries
|
||||
- Synthesize each chunk separately
|
||||
- Add 0.3 seconds of silence between chunks for natural pauses
|
||||
- Concatenate all chunks into a single audio file
|
||||
|
||||
**Note**: Automatic text chunking is disabled when using `--batch` mode. In batch mode, each text is processed as-is without chunking.
|
||||
|
||||
## Available Arguments
|
||||
|
||||
| Argument | Type | Default | Description |
|
||||
@@ -90,10 +119,12 @@ This will:
|
||||
| `--voice-style` | str+ | `assets/voice_styles/M1.json` | Voice style file path(s) |
|
||||
| `--text` | str+ | (long default text) | Text(s) to synthesize |
|
||||
| `--save-dir` | str | `results` | Output directory |
|
||||
| `--batch` | flag | False | Enable batch mode (multiple text-style pairs, disables automatic chunking) |
|
||||
|
||||
## Notes
|
||||
|
||||
- **Batch Processing**: The number of `--voice-style` files must match the number of `--text` entries
|
||||
- **Batch Processing**: When using `--batch`, the number of `--voice-style` files must match the number of `--text` entries
|
||||
- **Automatic Chunking**: Without `--batch`, long texts are automatically split and concatenated with 0.3s pauses
|
||||
- **Quality vs Speed**: Higher `--total-step` values produce better quality but take longer
|
||||
- **GPU Support**: GPU mode is not supported yet
|
||||
- **Known Issues**: On some platforms (especially macOS), there might be a mutex cleanup warning during exit. This is a known ONNX Runtime issue and doesn't affect functionality. The implementation uses `libc::_exit()` and `mem::forget()` to bypass this issue.
|
||||
|
||||
+34
-15
@@ -41,6 +41,10 @@ struct Args {
|
||||
/// Output directory
|
||||
#[arg(long, default_value = "results")]
|
||||
save_dir: String,
|
||||
|
||||
/// Enable batch mode (multiple text-style pairs)
|
||||
#[arg(long, default_value = "false")]
|
||||
batch: bool,
|
||||
}
|
||||
|
||||
fn main() -> Result<()> {
|
||||
@@ -53,13 +57,16 @@ fn main() -> Result<()> {
|
||||
let voice_style_paths = &args.voice_style;
|
||||
let text_list = &args.text;
|
||||
let save_dir = &args.save_dir;
|
||||
let batch = args.batch;
|
||||
|
||||
if voice_style_paths.len() != text_list.len() {
|
||||
anyhow::bail!(
|
||||
"Number of voice styles ({}) must match number of texts ({})",
|
||||
voice_style_paths.len(),
|
||||
text_list.len()
|
||||
);
|
||||
if batch {
|
||||
if voice_style_paths.len() != text_list.len() {
|
||||
anyhow::bail!(
|
||||
"Number of voice styles ({}) must match number of texts ({})",
|
||||
voice_style_paths.len(),
|
||||
text_list.len()
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
let bsz = voice_style_paths.len();
|
||||
@@ -76,19 +83,31 @@ fn main() -> Result<()> {
|
||||
for n in 0..n_test {
|
||||
println!("\n[{}/{}] Starting synthesis...", n + 1, n_test);
|
||||
|
||||
let (wav, duration) = timer("Generating speech from text", || {
|
||||
text_to_speech.call(text_list, &style, total_step)
|
||||
})?;
|
||||
let (wav, duration) = if batch {
|
||||
timer("Generating speech from text", || {
|
||||
text_to_speech.batch(text_list, &style, total_step)
|
||||
})?
|
||||
} else {
|
||||
let (w, d) = timer("Generating speech from text", || {
|
||||
text_to_speech.call(&text_list[0], &style, total_step, 0.3)
|
||||
})?;
|
||||
(w, vec![d])
|
||||
};
|
||||
|
||||
// Save outputs
|
||||
let wav_len = wav.len() / bsz;
|
||||
for i in 0..bsz {
|
||||
let fname = format!("{}_{}.wav", sanitize_filename(&text_list[i], 20), n + 1);
|
||||
let actual_len = (text_to_speech.sample_rate as f32 * duration[i]) as usize;
|
||||
|
||||
let wav_start = i * wav_len;
|
||||
let wav_end = wav_start + actual_len.min(wav_len);
|
||||
let wav_slice = &wav[wav_start..wav_end];
|
||||
let wav_slice = if batch {
|
||||
let wav_len = wav.len() / bsz;
|
||||
let actual_len = (text_to_speech.sample_rate as f32 * duration[i]) as usize;
|
||||
let wav_start = i * wav_len;
|
||||
let wav_end = wav_start + actual_len.min(wav_len);
|
||||
&wav[wav_start..wav_end]
|
||||
} else {
|
||||
// For non-batch mode, wav is a single concatenated audio
|
||||
let actual_len = (text_to_speech.sample_rate as f32 * duration[0]) as usize;
|
||||
&wav[..actual_len.min(wav.len())]
|
||||
};
|
||||
|
||||
let output_path = PathBuf::from(save_dir).join(&fname);
|
||||
write_wav_file(&output_path, wav_slice, text_to_speech.sample_rate)?;
|
||||
|
||||
+227
-1
@@ -12,6 +12,7 @@ use anyhow::{Result, Context};
|
||||
use unicode_normalization::UnicodeNormalization;
|
||||
use hound::{WavWriter, WavSpec, SampleFormat};
|
||||
use rand_distr::{Distribution, Normal};
|
||||
use regex::Regex;
|
||||
|
||||
// ============================================================================
|
||||
// Configuration Structures
|
||||
@@ -218,6 +219,187 @@ pub fn write_wav_file<P: AsRef<Path>>(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Text Chunking
|
||||
// ============================================================================
|
||||
|
||||
const MAX_CHUNK_LENGTH: usize = 300;
|
||||
|
||||
const ABBREVIATIONS: &[&str] = &[
|
||||
"Dr.", "Mr.", "Mrs.", "Ms.", "Prof.", "Sr.", "Jr.",
|
||||
"St.", "Ave.", "Rd.", "Blvd.", "Dept.", "Inc.", "Ltd.",
|
||||
"Co.", "Corp.", "etc.", "vs.", "i.e.", "e.g.", "Ph.D.",
|
||||
];
|
||||
|
||||
pub fn chunk_text(text: &str, max_len: Option<usize>) -> Vec<String> {
|
||||
let max_len = max_len.unwrap_or(MAX_CHUNK_LENGTH);
|
||||
let text = text.trim();
|
||||
|
||||
if text.is_empty() {
|
||||
return vec![String::new()];
|
||||
}
|
||||
|
||||
// Split by paragraphs
|
||||
let para_re = Regex::new(r"\n\s*\n").unwrap();
|
||||
let paragraphs: Vec<&str> = para_re.split(text).collect();
|
||||
let mut chunks = Vec::new();
|
||||
|
||||
for para in paragraphs {
|
||||
let para = para.trim();
|
||||
if para.is_empty() {
|
||||
continue;
|
||||
}
|
||||
|
||||
if para.len() <= max_len {
|
||||
chunks.push(para.to_string());
|
||||
continue;
|
||||
}
|
||||
|
||||
// Split by sentences
|
||||
let sentences = split_sentences(para);
|
||||
let mut current = String::new();
|
||||
let mut current_len = 0;
|
||||
|
||||
for sentence in sentences {
|
||||
let sentence = sentence.trim();
|
||||
if sentence.is_empty() {
|
||||
continue;
|
||||
}
|
||||
|
||||
let sentence_len = sentence.len();
|
||||
if sentence_len > max_len {
|
||||
// If sentence is longer than max_len, split by comma or space
|
||||
if !current.is_empty() {
|
||||
chunks.push(current.trim().to_string());
|
||||
current.clear();
|
||||
current_len = 0;
|
||||
}
|
||||
|
||||
// Try splitting by comma
|
||||
let parts: Vec<&str> = sentence.split(',').collect();
|
||||
for part in parts {
|
||||
let part = part.trim();
|
||||
if part.is_empty() {
|
||||
continue;
|
||||
}
|
||||
|
||||
let part_len = part.len();
|
||||
if part_len > max_len {
|
||||
// Split by space as last resort
|
||||
let words: Vec<&str> = part.split_whitespace().collect();
|
||||
let mut word_chunk = String::new();
|
||||
let mut word_chunk_len = 0;
|
||||
|
||||
for word in words {
|
||||
let word_len = word.len();
|
||||
if word_chunk_len + word_len + 1 > max_len && !word_chunk.is_empty() {
|
||||
chunks.push(word_chunk.trim().to_string());
|
||||
word_chunk.clear();
|
||||
word_chunk_len = 0;
|
||||
}
|
||||
|
||||
if !word_chunk.is_empty() {
|
||||
word_chunk.push(' ');
|
||||
word_chunk_len += 1;
|
||||
}
|
||||
word_chunk.push_str(word);
|
||||
word_chunk_len += word_len;
|
||||
}
|
||||
|
||||
if !word_chunk.is_empty() {
|
||||
chunks.push(word_chunk.trim().to_string());
|
||||
}
|
||||
} else {
|
||||
if current_len + part_len + 1 > max_len && !current.is_empty() {
|
||||
chunks.push(current.trim().to_string());
|
||||
current.clear();
|
||||
current_len = 0;
|
||||
}
|
||||
|
||||
if !current.is_empty() {
|
||||
current.push_str(", ");
|
||||
current_len += 2;
|
||||
}
|
||||
current.push_str(part);
|
||||
current_len += part_len;
|
||||
}
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
if current_len + sentence_len + 1 > max_len && !current.is_empty() {
|
||||
chunks.push(current.trim().to_string());
|
||||
current.clear();
|
||||
current_len = 0;
|
||||
}
|
||||
|
||||
if !current.is_empty() {
|
||||
current.push(' ');
|
||||
current_len += 1;
|
||||
}
|
||||
current.push_str(sentence);
|
||||
current_len += sentence_len;
|
||||
}
|
||||
|
||||
if !current.is_empty() {
|
||||
chunks.push(current.trim().to_string());
|
||||
}
|
||||
}
|
||||
|
||||
if chunks.is_empty() {
|
||||
vec![String::new()]
|
||||
} else {
|
||||
chunks
|
||||
}
|
||||
}
|
||||
|
||||
fn split_sentences(text: &str) -> Vec<String> {
|
||||
// Rust's regex doesn't support lookbehind, so we use a simpler approach
|
||||
// Split on sentence boundaries and then check if they're abbreviations
|
||||
let re = Regex::new(r"([.!?])\s+").unwrap();
|
||||
|
||||
// Find all matches
|
||||
let matches: Vec<_> = re.find_iter(text).collect();
|
||||
if matches.is_empty() {
|
||||
return vec![text.to_string()];
|
||||
}
|
||||
|
||||
let mut sentences = Vec::new();
|
||||
let mut last_end = 0;
|
||||
|
||||
for m in matches {
|
||||
// Get the text before the punctuation
|
||||
let before_punc = &text[last_end..m.start()];
|
||||
|
||||
// Check if this ends with an abbreviation
|
||||
let mut is_abbrev = false;
|
||||
for abbrev in ABBREVIATIONS {
|
||||
let combined = format!("{}{}", before_punc.trim(), &text[m.start()..m.start()+1]);
|
||||
if combined.ends_with(abbrev) {
|
||||
is_abbrev = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if !is_abbrev {
|
||||
// This is a real sentence boundary
|
||||
sentences.push(text[last_end..m.end()].to_string());
|
||||
last_end = m.end();
|
||||
}
|
||||
}
|
||||
|
||||
// Add the remaining text
|
||||
if last_end < text.len() {
|
||||
sentences.push(text[last_end..].to_string());
|
||||
}
|
||||
|
||||
if sentences.is_empty() {
|
||||
vec![text.to_string()]
|
||||
} else {
|
||||
sentences
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Utility Functions
|
||||
// ============================================================================
|
||||
@@ -297,7 +479,7 @@ impl TextToSpeech {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn call(
|
||||
fn _infer(
|
||||
&mut self,
|
||||
text_list: &[String],
|
||||
style: &Style,
|
||||
@@ -396,6 +578,50 @@ impl TextToSpeech {
|
||||
|
||||
Ok((wav, duration))
|
||||
}
|
||||
|
||||
pub fn call(
|
||||
&mut self,
|
||||
text: &str,
|
||||
style: &Style,
|
||||
total_step: usize,
|
||||
silence_duration: f32,
|
||||
) -> Result<(Vec<f32>, f32)> {
|
||||
let chunks = chunk_text(text, None);
|
||||
|
||||
let mut wav_cat: Vec<f32> = Vec::new();
|
||||
let mut dur_cat: f32 = 0.0;
|
||||
|
||||
for (i, chunk) in chunks.iter().enumerate() {
|
||||
let (wav, duration) = self._infer(&[chunk.clone()], style, total_step)?;
|
||||
|
||||
let dur = duration[0];
|
||||
let wav_len = (self.sample_rate as f32 * dur) as usize;
|
||||
let wav_chunk = &wav[..wav_len.min(wav.len())];
|
||||
|
||||
if i == 0 {
|
||||
wav_cat.extend_from_slice(wav_chunk);
|
||||
dur_cat = dur;
|
||||
} else {
|
||||
let silence_len = (silence_duration * self.sample_rate as f32) as usize;
|
||||
let silence = vec![0.0f32; silence_len];
|
||||
|
||||
wav_cat.extend_from_slice(&silence);
|
||||
wav_cat.extend_from_slice(wav_chunk);
|
||||
dur_cat += silence_duration + dur;
|
||||
}
|
||||
}
|
||||
|
||||
Ok((wav_cat, dur_cat))
|
||||
}
|
||||
|
||||
pub fn batch(
|
||||
&mut self,
|
||||
text_list: &[String],
|
||||
style: &Style,
|
||||
total_step: usize,
|
||||
) -> Result<(Vec<f32>, Vec<f32>)> {
|
||||
self._infer(text_list, style, total_step)
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
|
||||
+25
-1
@@ -2,6 +2,10 @@
|
||||
|
||||
This guide provides examples for running TTS inference using `example_onnx`.
|
||||
|
||||
## 📰 Update News
|
||||
|
||||
**2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
|
||||
|
||||
## Installation
|
||||
|
||||
This project uses Swift Package Manager (SPM) for dependency management.
|
||||
@@ -34,6 +38,7 @@ This will use:
|
||||
Process multiple voice styles and texts at once:
|
||||
```bash
|
||||
.build/release/example_onnx \
|
||||
--batch \
|
||||
--voice-style assets/voice_styles/M1.json,assets/voice_styles/F1.json \
|
||||
--text "The sun sets behind the mountains, painting the sky in shades of pink and orange.|The weather is beautiful and sunny outside. A gentle breeze makes the air feel fresh and pleasant."
|
||||
```
|
||||
@@ -57,6 +62,23 @@ This will:
|
||||
- Use 10 denoising steps instead of the default 5
|
||||
- Produce higher quality output at the cost of slower inference
|
||||
|
||||
### Example 4: Long-Form Inference
|
||||
The system automatically chunks long texts into manageable segments, synthesizes each segment separately, and concatenates them with natural pauses (0.3 seconds by default) into a single audio file. This happens by default when you don't use the `--batch` flag:
|
||||
|
||||
```bash
|
||||
.build/release/example_onnx \
|
||||
--voice-style assets/voice_styles/M1.json \
|
||||
--text "This is a very long text that will be automatically split into multiple chunks. The system will process each chunk separately and then concatenate them together with natural pauses between segments. This ensures that even very long texts can be processed efficiently while maintaining natural speech flow and avoiding memory issues."
|
||||
```
|
||||
|
||||
This will:
|
||||
- Automatically split the text into chunks based on paragraph and sentence boundaries
|
||||
- Synthesize each chunk separately
|
||||
- Add 0.3 seconds of silence between chunks for natural pauses
|
||||
- Concatenate all chunks into a single audio file
|
||||
|
||||
**Note**: Automatic text chunking is disabled when using `--batch` mode. In batch mode, each text is processed as-is without chunking.
|
||||
|
||||
## Available Arguments
|
||||
|
||||
| Argument | Type | Default | Description |
|
||||
@@ -68,9 +90,11 @@ This will:
|
||||
| `--voice-style` | str+ | `assets/voice_styles/M1.json` | Voice style file path(s) |
|
||||
| `--text` | str+ | (long default text) | Text(s) to synthesize |
|
||||
| `--save-dir` | str | `results` | Output directory |
|
||||
| `--batch` | flag | False | Enable batch mode (multiple text-style pairs, disables automatic chunking) |
|
||||
|
||||
## Notes
|
||||
|
||||
- **Batch Processing**: The number of `--voice-style` files must match the number of `--text` entries
|
||||
- **Batch Processing**: When using `--batch`, the number of `--voice-style` files must match the number of `--text` entries
|
||||
- **Automatic Chunking**: Without `--batch`, long texts are automatically split and concatenated with 0.3s pauses
|
||||
- **Quality vs Speed**: Higher `--total-step` values produce better quality but take longer
|
||||
- **GPU Support**: GPU mode is not supported yet
|
||||
@@ -9,6 +9,7 @@ struct Args {
|
||||
var voiceStyle: [String] = ["assets/voice_styles/M1.json"]
|
||||
var text: [String] = ["This morning, I took a walk in the park, and the sound of the birds and the breeze was so pleasant that I stopped for a long time just to listen."]
|
||||
var saveDir: String = "results"
|
||||
var batch: Bool = false
|
||||
}
|
||||
|
||||
func parseArgs() -> Args {
|
||||
@@ -52,6 +53,8 @@ func parseArgs() -> Args {
|
||||
args.saveDir = arguments[i + 1]
|
||||
i += 1
|
||||
}
|
||||
case "--batch":
|
||||
args.batch = true
|
||||
default:
|
||||
break
|
||||
}
|
||||
@@ -70,9 +73,11 @@ struct ExampleONNX {
|
||||
// --- 1. Parse arguments --- //
|
||||
let args = parseArgs()
|
||||
|
||||
guard args.voiceStyle.count == args.text.count else {
|
||||
print("Error: Number of voice styles (\(args.voiceStyle.count)) must match number of texts (\(args.text.count))")
|
||||
return
|
||||
if args.batch {
|
||||
guard args.voiceStyle.count == args.text.count else {
|
||||
print("Error: Number of voice styles (\(args.voiceStyle.count)) must match number of texts (\(args.text.count))")
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
let bsz = args.voiceStyle.count
|
||||
@@ -92,19 +97,39 @@ struct ExampleONNX {
|
||||
for n in 0..<args.nTest {
|
||||
print("\n[\(n + 1)/\(args.nTest)] Starting synthesis...")
|
||||
|
||||
let (wav, duration) = try timer("Generating speech from text") {
|
||||
try textToSpeech.call(args.text, style, args.totalStep)
|
||||
let wav: [Float]
|
||||
let duration: [Float]
|
||||
|
||||
if args.batch {
|
||||
let result = try timer("Generating speech from text") {
|
||||
try textToSpeech.batch(args.text, style, args.totalStep)
|
||||
}
|
||||
wav = result.wav
|
||||
duration = result.duration
|
||||
} else {
|
||||
let result = try timer("Generating speech from text") {
|
||||
try textToSpeech.call(args.text[0], style, args.totalStep, silenceDuration: 0.3)
|
||||
}
|
||||
wav = result.wav
|
||||
duration = [result.duration]
|
||||
}
|
||||
|
||||
// Save outputs
|
||||
let wavLen = wav.count / bsz
|
||||
for i in 0..<bsz {
|
||||
let fname = "\(sanitizeFilename(args.text[i], maxLen: 20))_\(n + 1).wav"
|
||||
let actualLen = Int(Float(textToSpeech.sampleRate) * duration[i])
|
||||
let wavOut: [Float]
|
||||
|
||||
let wavStart = i * wavLen
|
||||
let wavEnd = min(wavStart + actualLen, wavStart + wavLen)
|
||||
let wavOut = Array(wav[wavStart..<wavEnd])
|
||||
if args.batch {
|
||||
let wavLen = wav.count / bsz
|
||||
let actualLen = Int(Float(textToSpeech.sampleRate) * duration[i])
|
||||
let wavStart = i * wavLen
|
||||
let wavEnd = min(wavStart + actualLen, wavStart + wavLen)
|
||||
wavOut = Array(wav[wavStart..<wavEnd])
|
||||
} else {
|
||||
// For non-batch mode, wav is a single concatenated audio
|
||||
let actualLen = Int(Float(textToSpeech.sampleRate) * duration[0])
|
||||
wavOut = Array(wav.prefix(actualLen))
|
||||
}
|
||||
|
||||
let outputPath = "\(args.saveDir)/\(fname)"
|
||||
try writeWavFile(outputPath, wavOut, textToSpeech.sampleRate)
|
||||
|
||||
+227
-1
@@ -203,6 +203,199 @@ func writeWavFile(_ filename: String, _ audioData: [Float], _ sampleRate: Int) t
|
||||
try data.write(to: url)
|
||||
}
|
||||
|
||||
// MARK: - Text Chunking
|
||||
|
||||
let MAX_CHUNK_LENGTH = 300
|
||||
let ABBREVIATIONS = [
|
||||
"Dr.", "Mr.", "Mrs.", "Ms.", "Prof.", "Sr.", "Jr.",
|
||||
"St.", "Ave.", "Rd.", "Blvd.", "Dept.", "Inc.", "Ltd.",
|
||||
"Co.", "Corp.", "etc.", "vs.", "i.e.", "e.g.", "Ph.D."
|
||||
]
|
||||
|
||||
/// Splits `text` into chunks of at most `maxLen` characters.
///
/// The text is first split into paragraphs on blank lines; paragraphs are never
/// merged. A paragraph that fits within the limit becomes one chunk. Otherwise
/// it is split with `splitSentences`, and consecutive sentences are packed
/// together while they fit. A sentence that is too long on its own is split on
/// commas, and a comma-free part that is still too long is packed word by word
/// (a single over-long word is kept whole).
///
/// - Parameters:
///   - text: Input text to chunk.
///   - maxLen: Maximum chunk length; any value <= 0 selects `MAX_CHUNK_LENGTH`.
/// - Returns: At least one element; blank input yields `[""]`.
func chunkText(_ text: String, maxLen: Int = 0) -> [String] {
    let actualMaxLen = maxLen > 0 ? maxLen : MAX_CHUNK_LENGTH
    let blanks = CharacterSet.whitespacesAndNewlines
    let trimmedText = text.trimmingCharacters(in: blanks)

    if trimmedText.isEmpty {
        return [""]
    }

    // Split by paragraphs using regex
    let paraPattern = try! NSRegularExpression(pattern: "\\n\\s*\\n")
    let paraRange = NSRange(trimmedText.startIndex..., in: trimmedText)
    var paragraphs = [String]()
    var lastEnd = trimmedText.startIndex

    paraPattern.enumerateMatches(in: trimmedText, range: paraRange) { match, _, _ in
        if let match = match, let range = Range(match.range, in: trimmedText) {
            paragraphs.append(String(trimmedText[lastEnd..<range.lowerBound]))
            lastEnd = range.upperBound
        }
    }
    if lastEnd < trimmedText.endIndex {
        paragraphs.append(String(trimmedText[lastEnd...]))
    }
    if paragraphs.isEmpty {
        paragraphs = [trimmedText]
    }

    var chunks = [String]()

    // Appends `pending` to `chunks` when it is non-empty, then resets it.
    func flush(_ pending: inout String, _ pendingLen: inout Int) {
        if !pending.isEmpty {
            chunks.append(pending.trimmingCharacters(in: blanks))
            pending = ""
        }
        pendingLen = 0
    }

    for para in paragraphs {
        let trimmedPara = para.trimmingCharacters(in: blanks)
        if trimmedPara.isEmpty {
            continue
        }

        if trimmedPara.count <= actualMaxLen {
            chunks.append(trimmedPara)
            continue
        }

        // Split by sentences
        var current = ""
        var currentLen = 0

        for sentence in splitSentences(trimmedPara) {
            let trimmedSentence = sentence.trimmingCharacters(in: blanks)
            if trimmedSentence.isEmpty {
                continue
            }

            let sentenceLen = trimmedSentence.count

            if sentenceLen <= actualMaxLen {
                // +1 for the joining space
                if !current.isEmpty && currentLen + 1 + sentenceLen > actualMaxLen {
                    flush(&current, &currentLen)
                }
                if !current.isEmpty {
                    current += " "
                    currentLen += 1
                }
                current += trimmedSentence
                currentLen += sentenceLen
                continue
            }

            // The sentence alone exceeds the limit: split by comma, then by space.
            flush(&current, &currentLen)

            for part in trimmedSentence.components(separatedBy: ",") {
                let trimmedPart = part.trimmingCharacters(in: blanks)
                if trimmedPart.isEmpty {
                    continue
                }

                let partLen = trimmedPart.count
                if partLen > actualMaxLen {
                    // Earlier short parts are still pending in `current`; emit
                    // them first so the chunks stay in text order.
                    flush(&current, &currentLen)

                    // Split by space as last resort
                    let words = trimmedPart.components(separatedBy: CharacterSet.whitespaces).filter { !$0.isEmpty }
                    var wordChunk = ""
                    var wordChunkLen = 0

                    for word in words {
                        let wordLen = word.count
                        if !wordChunk.isEmpty && wordChunkLen + 1 + wordLen > actualMaxLen {
                            flush(&wordChunk, &wordChunkLen)
                        }
                        if !wordChunk.isEmpty {
                            wordChunk += " "
                            wordChunkLen += 1
                        }
                        wordChunk += word
                        wordChunkLen += wordLen
                    }

                    flush(&wordChunk, &wordChunkLen)
                } else {
                    // +2 for the ", " separator that is re-inserted below
                    if !current.isEmpty && currentLen + 2 + partLen > actualMaxLen {
                        flush(&current, &currentLen)
                    }
                    if !current.isEmpty {
                        current += ", "
                        currentLen += 2
                    }
                    current += trimmedPart
                    currentLen += partLen
                }
            }
        }

        flush(&current, &currentLen)
    }

    return chunks.isEmpty ? [""] : chunks
}
|
||||
|
||||
/// Splits `text` into sentences at `.`, `!` or `?` followed by whitespace.
///
/// A candidate boundary is skipped when the text before it (plus the
/// punctuation) ends with one of `ABBREVIATIONS`. Each returned sentence keeps
/// its trailing whitespace; the result is never empty.
func splitSentences(_ text: String) -> [String] {
    // Candidate boundaries are matched first; abbreviation endings are then
    // filtered out by suffix comparison instead of a lookbehind pattern.
    let boundary = try! NSRegularExpression(pattern: "([.!?])\\s+")
    let hits = boundary.matches(in: text, range: NSRange(text.startIndex..., in: text))

    guard !hits.isEmpty else {
        return [text]
    }

    var pieces = [String]()
    var cursor = text.startIndex

    for hit in hits {
        guard let span = Range(hit.range, in: text) else { continue }

        // Text since the last accepted boundary
        let head = String(text[cursor..<span.lowerBound])

        // The punctuation mark is the first UTF-16 unit of the match
        let punctSpan = Range(NSRange(location: hit.range.location, length: 1), in: text)!
        let candidate = head.trimmingCharacters(in: CharacterSet.whitespaces) + String(text[punctSpan])

        let endsWithAbbreviation = ABBREVIATIONS.contains { candidate.hasSuffix($0) }
        if endsWithAbbreviation {
            continue
        }

        // This is a real sentence boundary
        pieces.append(String(text[cursor..<span.upperBound]))
        cursor = span.upperBound
    }

    // Whatever follows the last accepted boundary
    if cursor < text.endIndex {
        pieces.append(String(text[cursor...]))
    }

    if pieces.isEmpty {
        return [text]
    }
    return pieces
}
|
||||
|
||||
// MARK: - Utility Functions
|
||||
|
||||
func timer<T>(_ name: String, _ f: () throws -> T) rethrows -> T {
|
||||
@@ -260,7 +453,7 @@ class TextToSpeech {
|
||||
self.sampleRate = cfgs.ae.sample_rate
|
||||
}
|
||||
|
||||
func call(_ textList: [String], _ style: Style, _ totalStep: Int) throws -> (wav: [Float], duration: [Float]) {
|
||||
private func _infer(_ textList: [String], _ style: Style, _ totalStep: Int) throws -> (wav: [Float], duration: [Float]) {
|
||||
let bsz = textList.count
|
||||
|
||||
// Process text
|
||||
@@ -382,6 +575,39 @@ class TextToSpeech {
|
||||
|
||||
return (wav, duration)
|
||||
}
|
||||
|
||||
/// Synthesizes one (possibly long) text with a single style.
///
/// The text is split with `chunkText`; every chunk is run through `_infer`
/// separately, trimmed to its reported duration, and appended to the output
/// with `silenceDuration` seconds of zeros between consecutive chunks.
///
/// - Returns: The concatenated samples and the accumulated duration in seconds
///   (including the inserted silences).
func call(_ text: String, _ style: Style, _ totalStep: Int, silenceDuration: Float) throws -> (wav: [Float], duration: Float) {
    let rate = Float(sampleRate)

    var audio = [Float]()
    var totalSeconds: Float = 0.0

    for (index, piece) in chunkText(text).enumerated() {
        let output = try _infer([piece], style, totalStep)

        let seconds = output.duration[0]
        // Keep only the samples covered by the reported duration
        let kept = output.wav.prefix(Int(rate * seconds))

        if index == 0 {
            audio = Array(kept)
            totalSeconds = seconds
        } else {
            // Zero-filled gap before every chunk except the first
            let gapSamples = Int(silenceDuration * rate)
            audio.append(contentsOf: repeatElement(Float(0.0), count: gapSamples))
            audio.append(contentsOf: kept)
            totalSeconds += silenceDuration + seconds
        }
    }

    return (audio, totalSeconds)
}
|
||||
|
||||
/// Runs inference on a list of texts in one batch, without text chunking.
///
/// All arguments are forwarded unchanged to `_infer`, and its result is
/// returned as-is. Rethrows any error raised by `_infer`.
func batch(_ textList: [String], _ style: Style, _ totalStep: Int) throws -> (wav: [Float], duration: [Float]) {
    let output = try _infer(textList, style, totalStep)
    return output
}
|
||||
}
|
||||
|
||||
// MARK: - Component Loading Functions
|
||||
|
||||
+50
-11
@@ -17,8 +17,9 @@ echo ""
|
||||
echo "Select test mode:"
|
||||
echo " 1) Default inference only"
|
||||
echo " 2) Batch inference only"
|
||||
echo " 3) Both default and batch inference"
|
||||
echo -e "Enter your choice (1/2/3) [default: 1]: \c"
|
||||
echo " 3) Long-form inference only"
|
||||
echo " 4) All tests (default + batch + long-form)"
|
||||
echo -e "Enter your choice (1/2/3/4) [default: 1]: \c"
|
||||
read -r test_mode
|
||||
test_mode=${test_mode:-1}
|
||||
|
||||
@@ -26,22 +27,32 @@ case $test_mode in
|
||||
1)
|
||||
TEST_DEFAULT=true
|
||||
TEST_BATCH=false
|
||||
TEST_LONGFORM=false
|
||||
echo "Running default inference tests only"
|
||||
;;
|
||||
2)
|
||||
TEST_DEFAULT=false
|
||||
TEST_BATCH=true
|
||||
TEST_LONGFORM=false
|
||||
echo "Running batch inference tests only"
|
||||
;;
|
||||
3)
|
||||
TEST_DEFAULT=false
|
||||
TEST_BATCH=false
|
||||
TEST_LONGFORM=true
|
||||
echo "Running long-form inference tests only"
|
||||
;;
|
||||
4)
|
||||
TEST_DEFAULT=true
|
||||
TEST_BATCH=true
|
||||
echo "Running both default and batch inference tests"
|
||||
TEST_LONGFORM=true
|
||||
echo "Running all tests (default + batch + long-form)"
|
||||
;;
|
||||
*)
|
||||
echo "Invalid choice. Using default inference only."
|
||||
TEST_DEFAULT=true
|
||||
TEST_BATCH=false
|
||||
TEST_LONGFORM=false
|
||||
;;
|
||||
esac
|
||||
echo ""
|
||||
@@ -52,6 +63,10 @@ BATCH_VOICE_STYLE_2="assets/voice_styles/F1.json"
|
||||
BATCH_TEXT_1="The sun sets behind the mountains, painting the sky in shades of pink and orange."
|
||||
BATCH_TEXT_2="The weather is beautiful and sunny outside. A gentle breeze makes the air feel fresh and pleasant."
|
||||
|
||||
# Long-form inference test data
|
||||
LONGFORM_VOICE_STYLE="assets/voice_styles/M1.json"
|
||||
LONGFORM_TEXT="This is a very long text that will be automatically split into multiple chunks. The system will process each chunk separately and then concatenate them together with natural pauses between segments. This ensures that even very long texts can be processed efficiently while maintaining natural speech flow and avoiding memory issues. The text chunking algorithm intelligently splits on paragraph and sentence boundaries, preserving the natural flow of the content. When a sentence is too long, it further splits on commas and spaces as needed. This multi-level approach ensures optimal chunk sizes for inference while maintaining linguistic coherence."
|
||||
|
||||
# Ask if user wants to clean results folders
|
||||
echo -e "Do you want to clean all results folders before running tests? (y/N): \c"
|
||||
read -r response
|
||||
@@ -123,7 +138,10 @@ if [ "$TEST_DEFAULT" = true ]; then
|
||||
run_test "Python (default)" "py" "uv run example_onnx.py"
|
||||
fi
|
||||
if [ "$TEST_BATCH" = true ]; then
|
||||
run_test "Python (batch)" "py" "uv run example_onnx.py --voice-style $BATCH_VOICE_STYLE_1 $BATCH_VOICE_STYLE_2 --text '$BATCH_TEXT_1' '$BATCH_TEXT_2'"
|
||||
run_test "Python (batch)" "py" "uv run example_onnx.py --batch --voice-style $BATCH_VOICE_STYLE_1 $BATCH_VOICE_STYLE_2 --text '$BATCH_TEXT_1' '$BATCH_TEXT_2'"
|
||||
fi
|
||||
if [ "$TEST_LONGFORM" = true ]; then
|
||||
run_test "Python (long-form)" "py" "uv run example_onnx.py --voice-style $LONGFORM_VOICE_STYLE --text '$LONGFORM_TEXT'"
|
||||
fi
|
||||
|
||||
# ====================================
|
||||
@@ -136,7 +154,10 @@ if [ "$TEST_DEFAULT" = true ]; then
|
||||
run_test "JavaScript (default)" "nodejs" "node example_onnx.js"
|
||||
fi
|
||||
if [ "$TEST_BATCH" = true ]; then
|
||||
run_test "JavaScript (batch)" "nodejs" "node example_onnx.js --voice-style $BATCH_VOICE_STYLE_1,$BATCH_VOICE_STYLE_2 --text '$BATCH_TEXT_1|$BATCH_TEXT_2'"
|
||||
run_test "JavaScript (batch)" "nodejs" "node example_onnx.js --batch --voice-style $BATCH_VOICE_STYLE_1,$BATCH_VOICE_STYLE_2 --text '$BATCH_TEXT_1|$BATCH_TEXT_2'"
|
||||
fi
|
||||
if [ "$TEST_LONGFORM" = true ]; then
|
||||
run_test "JavaScript (long-form)" "nodejs" "node example_onnx.js --voice-style $LONGFORM_VOICE_STYLE --text '$LONGFORM_TEXT'"
|
||||
fi
|
||||
|
||||
# ====================================
|
||||
@@ -150,7 +171,10 @@ if [ "$TEST_DEFAULT" = true ]; then
|
||||
run_test "Go (default)" "go" "go run example_onnx.go helper.go"
|
||||
fi
|
||||
if [ "$TEST_BATCH" = true ]; then
|
||||
run_test "Go (batch)" "go" "go run example_onnx.go helper.go --voice-style $BATCH_VOICE_STYLE_1,$BATCH_VOICE_STYLE_2 --text '$BATCH_TEXT_1|$BATCH_TEXT_2'"
|
||||
run_test "Go (batch)" "go" "go run example_onnx.go helper.go --batch -voice-style $BATCH_VOICE_STYLE_1,$BATCH_VOICE_STYLE_2 -text '$BATCH_TEXT_1|$BATCH_TEXT_2'"
|
||||
fi
|
||||
if [ "$TEST_LONGFORM" = true ]; then
|
||||
run_test "Go (long-form)" "go" "go run example_onnx.go helper.go -voice-style $LONGFORM_VOICE_STYLE -text '$LONGFORM_TEXT'"
|
||||
fi
|
||||
|
||||
# ====================================
|
||||
@@ -163,7 +187,10 @@ if [ "$TEST_DEFAULT" = true ]; then
|
||||
run_test "Rust (default)" "rust" "cargo run --release"
|
||||
fi
|
||||
if [ "$TEST_BATCH" = true ]; then
|
||||
run_test "Rust (batch)" "rust" "cargo run --release -- --voice-style $BATCH_VOICE_STYLE_1,$BATCH_VOICE_STYLE_2 --text '$BATCH_TEXT_1|$BATCH_TEXT_2'"
|
||||
run_test "Rust (batch)" "rust" "cargo run --release -- --batch --voice-style $BATCH_VOICE_STYLE_1,$BATCH_VOICE_STYLE_2 --text '$BATCH_TEXT_1|$BATCH_TEXT_2'"
|
||||
fi
|
||||
if [ "$TEST_LONGFORM" = true ]; then
|
||||
run_test "Rust (long-form)" "rust" "cargo run --release -- --voice-style $LONGFORM_VOICE_STYLE --text '$LONGFORM_TEXT'"
|
||||
fi
|
||||
|
||||
# ====================================
|
||||
@@ -176,7 +203,10 @@ if [ "$TEST_DEFAULT" = true ]; then
|
||||
run_test "C# (default)" "csharp" "dotnet run --configuration Release"
|
||||
fi
|
||||
if [ "$TEST_BATCH" = true ]; then
|
||||
run_test "C# (batch)" "csharp" "dotnet run --configuration Release -- --voice-style ../$BATCH_VOICE_STYLE_1,../$BATCH_VOICE_STYLE_2 --text '$BATCH_TEXT_1|$BATCH_TEXT_2'"
|
||||
run_test "C# (batch)" "csharp" "dotnet run --configuration Release -- --batch --voice-style ../$BATCH_VOICE_STYLE_1,../$BATCH_VOICE_STYLE_2 --text '$BATCH_TEXT_1|$BATCH_TEXT_2'"
|
||||
fi
|
||||
if [ "$TEST_LONGFORM" = true ]; then
|
||||
run_test "C# (long-form)" "csharp" "dotnet run --configuration Release -- --voice-style ../$LONGFORM_VOICE_STYLE --text '$LONGFORM_TEXT'"
|
||||
fi
|
||||
|
||||
# ====================================
|
||||
@@ -189,7 +219,10 @@ if [ "$TEST_DEFAULT" = true ]; then
|
||||
run_test "Java (default)" "java" "mvn exec:java -q"
|
||||
fi
|
||||
if [ "$TEST_BATCH" = true ]; then
|
||||
run_test "Java (batch)" "java" "mvn exec:java -q -Dexec.args='--voice-style $BATCH_VOICE_STYLE_1,$BATCH_VOICE_STYLE_2 --text \"$BATCH_TEXT_1|$BATCH_TEXT_2\"'"
|
||||
run_test "Java (batch)" "java" "mvn exec:java -q -Dexec.args='--batch --voice-style $BATCH_VOICE_STYLE_1,$BATCH_VOICE_STYLE_2 --text \"$BATCH_TEXT_1|$BATCH_TEXT_2\"'"
|
||||
fi
|
||||
if [ "$TEST_LONGFORM" = true ]; then
|
||||
run_test "Java (long-form)" "java" "mvn exec:java -q -Dexec.args='--voice-style $LONGFORM_VOICE_STYLE --text \"$LONGFORM_TEXT\"'"
|
||||
fi
|
||||
|
||||
# ====================================
|
||||
@@ -202,7 +235,10 @@ if [ "$TEST_DEFAULT" = true ]; then
|
||||
run_test "Swift (default)" "swift" ".build/release/example_onnx"
|
||||
fi
|
||||
if [ "$TEST_BATCH" = true ]; then
|
||||
run_test "Swift (batch)" "swift" ".build/release/example_onnx --voice-style $BATCH_VOICE_STYLE_1,$BATCH_VOICE_STYLE_2 --text '$BATCH_TEXT_1|$BATCH_TEXT_2'"
|
||||
run_test "Swift (batch)" "swift" ".build/release/example_onnx --batch --voice-style $BATCH_VOICE_STYLE_1,$BATCH_VOICE_STYLE_2 --text '$BATCH_TEXT_1|$BATCH_TEXT_2'"
|
||||
fi
|
||||
if [ "$TEST_LONGFORM" = true ]; then
|
||||
run_test "Swift (long-form)" "swift" ".build/release/example_onnx --voice-style $LONGFORM_VOICE_STYLE --text '$LONGFORM_TEXT'"
|
||||
fi
|
||||
|
||||
# ====================================
|
||||
@@ -215,7 +251,10 @@ if [ "$TEST_DEFAULT" = true ]; then
|
||||
run_test "C++ (default)" "cpp/build" "./example_onnx"
|
||||
fi
|
||||
if [ "$TEST_BATCH" = true ]; then
|
||||
run_test "C++ (batch)" "cpp/build" "./example_onnx --voice-style ../$BATCH_VOICE_STYLE_1,../$BATCH_VOICE_STYLE_2 --text '$BATCH_TEXT_1|$BATCH_TEXT_2'"
|
||||
run_test "C++ (batch)" "cpp/build" "./example_onnx --batch --voice-style ../$BATCH_VOICE_STYLE_1,../$BATCH_VOICE_STYLE_2 --text '$BATCH_TEXT_1|$BATCH_TEXT_2'"
|
||||
fi
|
||||
if [ "$TEST_LONGFORM" = true ]; then
|
||||
run_test "C++ (long-form)" "cpp/build" "./example_onnx --voice-style ../$LONGFORM_VOICE_STYLE --text '$LONGFORM_TEXT'"
|
||||
fi
|
||||
|
||||
# ====================================
|
||||
|
||||
@@ -2,6 +2,10 @@
|
||||
|
||||
This example demonstrates how to use Supertonic in a web browser using ONNX Runtime Web.
|
||||
|
||||
## 📰 Update News
|
||||
|
||||
**2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
|
||||
|
||||
## Features
|
||||
|
||||
- 🌐 Runs entirely in the browser (no server required for inference)
|
||||
|
||||
+72
-1
@@ -72,7 +72,7 @@ export class TextToSpeech {
|
||||
this.sampleRate = cfgs.ae.sample_rate;
|
||||
}
|
||||
|
||||
async call(textList, style, totalStep, progressCallback = null) {
|
||||
async _infer(textList, style, totalStep, progressCallback = null) {
|
||||
const bsz = textList.length;
|
||||
|
||||
// Process text
|
||||
@@ -176,6 +176,35 @@ export class TextToSpeech {
|
||||
return { wav, duration };
|
||||
}
|
||||
|
||||
async call(text, style, totalStep, silenceDuration = 0.3, progressCallback = null) {
|
||||
if (style.ttl.dims[0] !== 1) {
|
||||
throw new Error('Single speaker text to speech only supports single style');
|
||||
}
|
||||
const textList = chunkText(text);
|
||||
let wavCat = [];
|
||||
let durCat = 0;
|
||||
|
||||
for (const chunk of textList) {
|
||||
const { wav, duration } = await this._infer([chunk], style, totalStep, progressCallback);
|
||||
|
||||
if (wavCat.length === 0) {
|
||||
wavCat = wav;
|
||||
durCat = duration[0];
|
||||
} else {
|
||||
const silenceLen = Math.floor(silenceDuration * this.sampleRate);
|
||||
const silence = new Array(silenceLen).fill(0);
|
||||
wavCat = [...wavCat, ...silence, ...wav];
|
||||
durCat += duration[0] + silenceDuration;
|
||||
}
|
||||
}
|
||||
|
||||
return { wav: wavCat, duration: [durCat] };
|
||||
}
|
||||
|
||||
async batch(textList, style, totalStep, progressCallback = null) {
|
||||
return await this._infer(textList, style, totalStep, progressCallback);
|
||||
}
|
||||
|
||||
sampleNoisyLatent(duration, sampleRate, baseChunkSize, chunkCompress, latentDim) {
|
||||
const bsz = duration.length;
|
||||
const maxDur = Math.max(...duration);
|
||||
@@ -347,6 +376,48 @@ export async function loadTextToSpeech(onnxDir, sessionOptions = {}, progressCal
|
||||
return { textToSpeech, cfgs };
|
||||
}
|
||||
|
||||
/**
 * Split text into chunks by packing whole sentences together.
 *
 * Paragraphs (separated by blank lines) are never merged. Inside a paragraph,
 * sentences are appended to the current chunk while the result stays within
 * `maxLen` characters. A single sentence longer than `maxLen` is kept whole
 * as its own chunk.
 *
 * @param {string} text - Input text.
 * @param {number} [maxLen=300] - Target maximum chunk length in characters.
 * @returns {string[]} Trimmed chunks in reading order; empty for blank input.
 * @throws {Error} If `text` is not a string.
 */
function chunkText(text, maxLen = 300) {
    if (typeof text !== 'string') {
        throw new Error(`chunkText expects a string, got ${typeof text}`);
    }

    // Whitespace after ., ! or ? ends a sentence, unless the period belongs to
    // a listed abbreviation (Mr., Dr., etc.) or to a single capital letter.
    const sentenceBoundary = /(?<!Mr\.|Mrs\.|Ms\.|Dr\.|Prof\.|Sr\.|Jr\.|Ph\.D\.|etc\.|e\.g\.|i\.e\.|vs\.|Inc\.|Ltd\.|Co\.|Corp\.|St\.|Ave\.|Blvd\.)(?<!\b[A-Z]\.)(?<=[.!?])\s+/;

    // Paragraph break: a newline, optional whitespace, then one or more newlines.
    const paragraphs = text
        .trim()
        .split(/\n\s*\n+/)
        .map(part => part.trim())
        .filter(part => part.length > 0);

    const result = [];

    for (const paragraph of paragraphs) {
        let pending = "";

        for (const sentence of paragraph.split(sentenceBoundary)) {
            // The +1 accounts for the joining space.
            const fits = pending.length + sentence.length + 1 <= maxLen;
            if (fits) {
                pending = pending ? pending + " " + sentence : sentence;
                continue;
            }

            // Flush what has been collected and start a new chunk with this sentence.
            if (pending) {
                result.push(pending.trim());
            }
            pending = sentence;
        }

        if (pending) {
            result.push(pending.trim());
        }
    }

    return result;
}
|
||||
|
||||
/**
|
||||
* Write WAV file to ArrayBuffer
|
||||
*/
|
||||
|
||||
+3
-3
@@ -186,15 +186,15 @@ async function generateSpeech() {
|
||||
`;
|
||||
|
||||
const totalStep = parseInt(totalStepInput.value);
|
||||
const textList = [text];
|
||||
|
||||
showStatus('ℹ️ <strong>Generating speech from text...</strong>');
|
||||
const tic = Date.now();
|
||||
|
||||
const { wav, duration } = await textToSpeech.call(
|
||||
textList,
|
||||
text,
|
||||
currentStyle,
|
||||
totalStep,
|
||||
totalStep,
|
||||
0.3,
|
||||
(step, total) => {
|
||||
showStatus(`ℹ️ <strong>Denoising (${step}/${total})...</strong>`);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user