Fix text normalization bug (#16)

This commit is contained in:
ANLGBOY
2025-11-23 13:18:15 +09:00
parent 9015bd095f
commit 8d42b55965
18 changed files with 966 additions and 28 deletions
+2
View File
@@ -4,6 +4,8 @@ High-performance text-to-speech inference using ONNX Runtime.
## 📰 Update News ## 📰 Update News
**2025.11.23** - Enhanced text preprocessing with comprehensive normalization, emoji removal, symbol replacement, and punctuation handling for improved synthesis quality.
**2025.11.19** - Added `--speed` parameter to control speech synthesis speed (default: 1.05, recommended range: 0.9-1.5). **2025.11.19** - Added `--speed` parameter to control speech synthesis speed (default: 1.05, recommended range: 0.9-1.5).
**2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses. **2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
+160 -18
View File
@@ -19,6 +19,24 @@ void clearTensorBuffers() {
g_tensor_buffers_int64.clear(); g_tensor_buffers_int64.clear();
} }
// ============================================================================
// Helper function - trim
// ============================================================================
/// Returns a copy of `str` without its leading and trailing whitespace.
/// Whitespace is whatever std::isspace reports for the current C locale.
static std::string trim(const std::string& str) {
    const auto is_blank = [](char ch) {
        // Cast first: passing a negative char to std::isspace is undefined.
        return std::isspace(static_cast<unsigned char>(ch)) != 0;
    };

    size_t first = 0;
    for (; first < str.size() && is_blank(str[first]); ++first) {
    }

    size_t last = str.size();
    for (; last > first && is_blank(str[last - 1]); --last) {
    }

    return str.substr(first, last - first);
}
// ============================================================================ // ============================================================================
// UnicodeProcessor implementation // UnicodeProcessor implementation
// ============================================================================ // ============================================================================
@@ -28,10 +46,148 @@ UnicodeProcessor::UnicodeProcessor(const std::string& unicode_indexer_json_path)
} }
std::string UnicodeProcessor::preprocessText(const std::string& text) { std::string UnicodeProcessor::preprocessText(const std::string& text) {
// Simple NFKD normalization (C++ doesn't have built-in Unicode normalization) // TODO: Need advanced normalizer for better performance
// For now, just return the text as-is // NOTE: C++ doesn't have built-in Unicode normalization like Python's NFKD
// TODO: add proper Unicode normalization // For full Unicode normalization, consider using ICU library
return text; // This implementation handles basic text preprocessing
std::string result = text;
// FIXME: this should be fixed for non-English languages
// Remove emojis and various Unicode symbols
// Using regex to remove common emoji ranges and special symbols
// Note: This is a simplified version - full emoji support needs UTF-8 handling
std::regex emoji_pattern(
"[\xF0\x9F][\x80-\xBF]{2}|" // Common emoji pattern in UTF-8
"[\xE2][\x80-\xBF]{2}|" // Various symbols
"[\xE2][\x98-\x9E][\x80-\xBF]" // More symbols
);
result = std::regex_replace(result, emoji_pattern, "");
// Replace various dashes and symbols
struct Replacement {
const char* from;
const char* to;
};
const Replacement replacements[] = {
{"", "-"}, // en dash
{"", "-"}, // non-breaking hyphen
{"", "-"}, // em dash
{"¯", " "}, // macron
{"_", " "}, // underscore
{""", "\""}, // left double quote (U+201C)
{""", "\""}, // right double quote (U+201D)
{"'", "'"}, // left single quote (U+2018)
{"'", "'"}, // right single quote (U+2019)
{"´", "'"}, // acute accent
{"`", "'"}, // grave accent
{"[", " "}, // left bracket
{"]", " "}, // right bracket
{"|", " "}, // vertical bar
{"/", " "}, // slash
{"#", " "}, // hash
{"", " "}, // right arrow
{"", " "}, // left arrow
};
for (const auto& repl : replacements) {
size_t pos = 0;
while ((pos = result.find(repl.from, pos)) != std::string::npos) {
result.replace(pos, strlen(repl.from), repl.to);
pos += strlen(repl.to);
}
}
// Remove combining diacritics (common combining marks in UTF-8)
// FIXME: this should be fixed for non-English languages
std::regex diacritics_pattern(
"[\xCC\xCD][\x80-\xBF]" // Combining diacritical marks range
);
result = std::regex_replace(result, diacritics_pattern, "");
// Remove special symbols
const char* special_symbols[] = {"", "", "", "©", "\\"};
for (const char* symbol : special_symbols) {
size_t pos = 0;
while ((pos = result.find(symbol, pos)) != std::string::npos) {
result.erase(pos, strlen(symbol));
}
}
// Replace known expressions
const Replacement expr_replacements[] = {
{"@", " at "},
{"e.g.,", "for example, "},
{"i.e.,", "that is, "},
};
for (const auto& repl : expr_replacements) {
size_t pos = 0;
while ((pos = result.find(repl.from, pos)) != std::string::npos) {
result.replace(pos, strlen(repl.from), repl.to);
pos += strlen(repl.to);
}
}
// Fix spacing around punctuation
result = std::regex_replace(result, std::regex(" ,"), ",");
result = std::regex_replace(result, std::regex(" \\."), ".");
result = std::regex_replace(result, std::regex(" !"), "!");
result = std::regex_replace(result, std::regex(" \\?"), "?");
result = std::regex_replace(result, std::regex(" ;"), ";");
result = std::regex_replace(result, std::regex(" :"), ":");
result = std::regex_replace(result, std::regex(" '"), "'");
// Remove duplicate quotes
while (result.find("\"\"") != std::string::npos) {
size_t pos = result.find("\"\"");
result.replace(pos, 2, "\"");
}
while (result.find("''") != std::string::npos) {
size_t pos = result.find("''");
result.replace(pos, 2, "'");
}
while (result.find("``") != std::string::npos) {
size_t pos = result.find("``");
result.replace(pos, 2, "`");
}
// Remove extra spaces
result = std::regex_replace(result, std::regex("\\s+"), " ");
result = trim(result);
// If text doesn't end with punctuation, quotes, or closing brackets, add a period
if (!result.empty()) {
char last_char = result.back();
bool ends_with_punct = (
last_char == '.' || last_char == '!' || last_char == '?' ||
last_char == ';' || last_char == ':' || last_char == ',' ||
last_char == '\'' || last_char == '"' || last_char == ')' ||
last_char == ']' || last_char == '}' || last_char == '>'
);
// Check for UTF-8 multibyte ending punctuation (e.g., …, 。, curly quotes, etc.)
if (!ends_with_punct && result.size() >= 3) {
std::string last_three = result.substr(result.size() - 3);
if (last_three == "" || last_three == "" ||
last_three == "" || last_three == "" ||
last_three == "" || last_three == "" ||
last_three == "" || last_three == "" ||
last_three == "»" || last_three == """ ||
last_three == """ || last_three == "'" ||
last_three == "'") {
ends_with_punct = true;
}
}
if (!ends_with_punct) {
result += ".";
}
}
return result;
} }
std::vector<uint16_t> UnicodeProcessor::textToUnicodeValues(const std::string& text) { std::vector<uint16_t> UnicodeProcessor::textToUnicodeValues(const std::string& text) {
@@ -772,20 +928,6 @@ std::string sanitizeFilename(const std::string& text, int max_len) {
// Chunk text // Chunk text
// ============================================================================ // ============================================================================
// Strips whitespace (per std::isspace) from both ends of `str` and returns
// the remaining middle part as a new string.
static std::string trim(const std::string& str) {
    auto head = str.begin();
    auto tail = str.end();

    // Advance past leading whitespace.
    while (head != tail && std::isspace(static_cast<unsigned char>(*head))) {
        ++head;
    }
    // Step back over trailing whitespace, never crossing `head`.
    while (tail != head && std::isspace(static_cast<unsigned char>(*(tail - 1)))) {
        --tail;
    }
    return std::string(head, tail);
}
std::vector<std::string> chunkText(const std::string& text, int max_len) { std::vector<std::string> chunkText(const std::string& text, int max_len) {
std::vector<std::string> chunks; std::vector<std::string> chunks;
+136 -2
View File
@@ -71,10 +71,144 @@ namespace Supertonic
} }
} }
/// <summary>
/// Returns <paramref name="text"/> with every code point that falls in one of
/// the emoji/pictograph ranges below removed. Surrogate pairs are examined as
/// a single code point; unpaired surrogates are kept unchanged.
/// </summary>
private static string RemoveEmojis(string text)
{
    var kept = new StringBuilder(text.Length);
    int index = 0;
    while (index < text.Length)
    {
        // A valid surrogate pair occupies two UTF-16 units, anything else one.
        bool isPair = char.IsSurrogatePair(text, index);
        int width = isPair ? 2 : 1;
        int cp = isPair ? char.ConvertToUtf32(text, index) : text[index];

        bool isEmoji =
            (cp >= 0x2600 && cp <= 0x26FF) ||
            (cp >= 0x2700 && cp <= 0x27BF) ||
            (cp >= 0x1F1E6 && cp <= 0x1F1FF) ||
            (cp >= 0x1F300 && cp <= 0x1F5FF) ||
            (cp >= 0x1F600 && cp <= 0x1F64F) ||
            (cp >= 0x1F680 && cp <= 0x1F6FF) ||
            (cp >= 0x1F700 && cp <= 0x1F77F) ||
            (cp >= 0x1F780 && cp <= 0x1F7FF) ||
            (cp >= 0x1F800 && cp <= 0x1F8FF) ||
            (cp >= 0x1F900 && cp <= 0x1F9FF) ||
            (cp >= 0x1FA00 && cp <= 0x1FA6F) ||
            (cp >= 0x1FA70 && cp <= 0x1FAFF);

        if (!isEmoji)
        {
            // Copy the original UTF-16 units straight through.
            kept.Append(text, index, width);
        }
        index += width;
    }
    return kept.ToString();
}
/// <summary>
/// Normalizes raw input text before it is converted to model input: applies
/// NFKD, removes emoji, maps typographic dashes/quotes/arrows and a few ASCII
/// symbols to plain ASCII, strips selected combining marks and special
/// symbols, expands "@", "e.g.," and "i.e.,", tightens spacing before
/// punctuation, collapses whitespace, and appends '.' when the text does not
/// already end with punctuation, a quote or a closing bracket.
/// </summary>
/// <param name="text">Raw text to normalize.</param>
/// <returns>The normalized text.</returns>
private string PreprocessText(string text)
{
    // TODO: Need advanced normalizer for better performance
    text = text.Normalize(NormalizationForm.FormKD);

    // FIXME: this should be fixed for non-English languages
    // Remove emojis (wide Unicode range)
    // C# doesn't support \u{...} syntax in regex, so we use character filtering instead
    text = RemoveEmojis(text);

    // Replace various dashes and symbols. Non-ASCII keys are written as \u
    // escapes so the file encoding cannot turn them into empty strings
    // (an empty key would make string.Replace throw).
    var replacements = new Dictionary<string, string>
    {
        {"\u2010", "-"},   // hyphen (NFKD rewrites U+2011 to this)
        {"\u2011", "-"},   // non-breaking hyphen
        {"\u2013", "-"},   // en dash
        {"\u2014", "-"},   // em dash
        {"\u00AF", " "},   // macron
        {"_", " "},        // underscore
        {"\u201C", "\""},  // left double quote
        {"\u201D", "\""},  // right double quote
        {"\u2018", "'"},   // left single quote
        {"\u2019", "'"},   // right single quote
        {"\u00B4", "'"},   // acute accent
        {"`", "'"},        // grave accent
        {"[", " "},        // left bracket
        {"]", " "},        // right bracket
        {"|", " "},        // vertical bar
        {"/", " "},        // slash
        {"#", " "},        // hash
        {"\u2192", " "},   // right arrow
        {"\u2190", " "},   // left arrow
    };
    foreach (var kvp in replacements)
    {
        text = text.Replace(kvp.Key, kvp.Value);
    }

    // Remove combining diacritics // FIXME: this should be fixed for non-English languages
    text = Regex.Replace(text, @"[\u0302\u0303\u0304\u0305\u0306\u0307\u0308\u030A\u030B\u030C\u0327\u0328\u0329\u032A\u032B\u032C\u032D\u032E\u032F]", "");

    // Remove special symbols: black heart, white star, white heart, (c), backslash
    text = Regex.Replace(text, @"[\u2665\u2606\u2661\u00A9\\]", "");

    // Replace known expressions
    var exprReplacements = new Dictionary<string, string>
    {
        {"@", " at "},
        {"e.g.,", "for example, "},
        {"i.e.,", "that is, "},
    };
    foreach (var kvp in exprReplacements)
    {
        text = text.Replace(kvp.Key, kvp.Value);
    }

    // Fix spacing around punctuation
    foreach (var mark in new[] { ",", ".", "!", "?", ";", ":", "'" })
    {
        text = text.Replace(" " + mark, mark);
    }

    // Remove duplicate quotes
    while (text.Contains("\"\""))
    {
        text = text.Replace("\"\"", "\"");
    }
    while (text.Contains("''"))
    {
        text = text.Replace("''", "'");
    }
    while (text.Contains("``"))
    {
        text = text.Replace("``", "`");
    }

    // Remove extra spaces
    text = Regex.Replace(text, @"\s+", " ").Trim();

    // If text doesn't end with punctuation, quotes, or closing brackets, add a period.
    // Escaped members: " and curly quotes, then ellipsis, ideographic full stop,
    // CJK closing brackets, and single/double right angle quotation marks.
    if (!Regex.IsMatch(text, @"[.!?;:,'\u0022\u201C\u201D\u2018\u2019)\]}\u2026\u3002\u300D\u300F\u3011\u3009\u300B\u203A\u00BB]$"))
    {
        text += ".";
    }

    return text;
}
private int[] TextToUnicodeValues(string text) private int[] TextToUnicodeValues(string text)
+2
View File
@@ -4,6 +4,8 @@ This guide provides examples for running TTS inference using `ExampleONNX.cs`.
## 📰 Update News ## 📰 Update News
**2025.11.23** - Enhanced text preprocessing with comprehensive normalization, emoji removal, symbol replacement, and punctuation handling for improved synthesis quality.
**2025.11.19** - Added `--speed` parameter to control speech synthesis speed (default: 1.05, recommended range: 0.9-1.5). **2025.11.19** - Added `--speed` parameter to control speech synthesis speed (default: 1.05, recommended range: 0.9-1.5).
**2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses. **2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
+2
View File
@@ -4,6 +4,8 @@ This guide provides examples for running TTS inference using `example_onnx.go`.
## 📰 Update News ## 📰 Update News
**2025.11.23** - Enhanced text preprocessing with comprehensive normalization, emoji removal, symbol replacement, and punctuation handling for improved synthesis quality.
**2025.11.19** - Added `--speed` parameter to control speech synthesis speed (default: 1.05, recommended range: 0.9-1.5). **2025.11.19** - Added `--speed` parameter to control speech synthesis speed (default: 1.05, recommended range: 0.9-1.5).
**2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses. **2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
+90 -1
View File
@@ -327,8 +327,97 @@ func splitSentences(text string) []string {
// Utility functions // Utility functions
// Patterns and replacers used by preprocessText, built once at package init
// instead of on every call.
var (
	// Emoji and pictograph code point ranges.
	preprocEmojiRe = regexp.MustCompile(`[\x{1F600}-\x{1F64F}\x{1F300}-\x{1F5FF}\x{1F680}-\x{1F6FF}\x{1F700}-\x{1F77F}\x{1F780}-\x{1F7FF}\x{1F800}-\x{1F8FF}\x{1F900}-\x{1F9FF}\x{1FA00}-\x{1FA6F}\x{1FA70}-\x{1FAFF}\x{2600}-\x{26FF}\x{2700}-\x{27BF}\x{1F1E6}-\x{1F1FF}]+`)
	// Selected combining diacritical marks.
	preprocDiacriticsRe = regexp.MustCompile(`[\x{0302}\x{0303}\x{0304}\x{0305}\x{0306}\x{0307}\x{0308}\x{030A}\x{030B}\x{030C}\x{0327}\x{0328}\x{0329}\x{032A}\x{032B}\x{032C}\x{032D}\x{032E}\x{032F}]`)
	preprocSpaceRe      = regexp.MustCompile(`\s+`)
	// Terminal punctuation, quotes and closing brackets (ASCII and escaped Unicode).
	preprocTerminalRe = regexp.MustCompile(`[.!?;:,'"\x{201C}\x{201D}\x{2018}\x{2019})\]}\x{2026}\x{3002}\x{300D}\x{300F}\x{3011}\x{3009}\x{300B}\x{203A}\x{00BB}]$`)

	// Single-pass symbol mapping. Keys are written as escapes so the file
	// encoding cannot collapse them into (duplicate) empty strings.
	preprocSymbolReplacer = strings.NewReplacer(
		"\u2010", "-", // hyphen
		"\u2011", "-", // non-breaking hyphen
		"\u2013", "-", // en dash
		"\u2014", "-", // em dash
		"\u00AF", " ", // macron
		"_", " ", // underscore
		"\u201C", "\"", // left double quote
		"\u201D", "\"", // right double quote
		"\u2018", "'", // left single quote
		"\u2019", "'", // right single quote
		"\u00B4", "'", // acute accent
		"`", "'", // grave accent
		"[", " ", // left bracket
		"]", " ", // right bracket
		"|", " ", // vertical bar
		"/", " ", // slash
		"#", " ", // hash
		"\u2192", " ", // right arrow
		"\u2190", " ", // left arrow
		"\u2665", "", // black heart
		"\u2606", "", // white star
		"\u2661", "", // white heart
		"\u00A9", "", // copyright sign
		"\\", "", // backslash
	)
	preprocExprReplacer = strings.NewReplacer(
		"@", " at ",
		"e.g.,", "for example, ",
		"i.e.,", "that is, ",
	)
)

// preprocessText normalizes raw input text before it is converted to model
// input. It removes emoji, maps typographic dashes/quotes/arrows and a few
// ASCII symbols to plain ASCII, strips selected combining marks and special
// symbols, expands "@", "e.g.," and "i.e.,", removes the space before
// punctuation, collapses whitespace, and appends "." to non-empty text that
// does not already end with punctuation, a quote or a closing bracket.
func preprocessText(text string) string {
	// TODO: Need advanced normalizer for better performance
	// NOTE: Go doesn't have built-in NFKD normalization like Python
	// For full Unicode normalization, use golang.org/x/text/unicode/norm
	// This implementation handles basic text preprocessing

	// FIXME: this should be fixed for non-English languages
	// Remove emojis and various Unicode symbols
	text = preprocEmojiRe.ReplaceAllString(text, "")

	// Replace various dashes and symbols, and drop special symbols
	text = preprocSymbolReplacer.Replace(text)

	// Remove combining diacritics (common combining marks)
	// FIXME: this should be fixed for non-English languages
	text = preprocDiacriticsRe.ReplaceAllString(text, "")

	// Replace known expressions
	text = preprocExprReplacer.Replace(text)

	// Fix spacing around punctuation
	for _, mark := range []string{",", ".", "!", "?", ";", ":", "'"} {
		text = strings.ReplaceAll(text, " "+mark, mark)
	}

	// Remove duplicate quotes
	for strings.Contains(text, `""`) {
		text = strings.ReplaceAll(text, `""`, `"`)
	}
	for strings.Contains(text, "''") {
		text = strings.ReplaceAll(text, "''", "'")
	}
	for strings.Contains(text, "``") {
		text = strings.ReplaceAll(text, "``", "`")
	}

	// Remove extra spaces
	text = preprocSpaceRe.ReplaceAllString(text, " ")
	text = strings.TrimSpace(text)

	// If text doesn't end with punctuation, quotes, or closing brackets, add a period
	if text != "" && !preprocTerminalRe.MatchString(text) {
		text += "."
	}

	return text
}
+116 -1
View File
@@ -60,6 +60,42 @@ class UnicodeProcessor {
this.indexer = Helper.loadJsonLongArray(unicodeIndexerJsonPath); this.indexer = Helper.loadJsonLongArray(unicodeIndexerJsonPath);
} }
/**
 * Returns {@code text} with every code point that lies in one of the
 * emoji/pictograph ranges below removed. Surrogate pairs are examined as one
 * code point; unpaired surrogates are kept unchanged.
 */
private static String removeEmojis(String text) {
    StringBuilder kept = new StringBuilder();
    int index = 0;
    while (index < text.length()) {
        // codePointAt joins a valid surrogate pair and otherwise yields the single char.
        int cp = Character.codePointAt(text, index);
        index += Character.charCount(cp);

        boolean isEmoji =
            (cp >= 0x2600 && cp <= 0x26FF) ||
            (cp >= 0x2700 && cp <= 0x27BF) ||
            (cp >= 0x1F1E6 && cp <= 0x1F1FF) ||
            (cp >= 0x1F300 && cp <= 0x1F5FF) ||
            (cp >= 0x1F600 && cp <= 0x1F64F) ||
            (cp >= 0x1F680 && cp <= 0x1F6FF) ||
            (cp >= 0x1F700 && cp <= 0x1F77F) ||
            (cp >= 0x1F780 && cp <= 0x1F7FF) ||
            (cp >= 0x1F800 && cp <= 0x1F8FF) ||
            (cp >= 0x1F900 && cp <= 0x1F9FF) ||
            (cp >= 0x1FA00 && cp <= 0x1FA6F) ||
            (cp >= 0x1FA70 && cp <= 0x1FAFF);
        if (isEmoji) {
            continue;
        }
        kept.appendCodePoint(cp);
    }
    return kept.toString();
}
public TextProcessResult call(List<String> textList) { public TextProcessResult call(List<String> textList) {
List<String> processedTexts = new ArrayList<>(); List<String> processedTexts = new ArrayList<>();
for (String text : textList) { for (String text : textList) {
@@ -86,7 +122,86 @@ class UnicodeProcessor {
} }
private String preprocessText(String text) { private String preprocessText(String text) {
return Normalizer.normalize(text, Normalizer.Form.NFKD); // TODO: Need advanced normalizer for better performance
text = Normalizer.normalize(text, Normalizer.Form.NFKD);
// FIXME: this should be fixed for non-English languages
// Remove emojis (wide Unicode range)
// Java Pattern doesn't support \x{...} syntax for Unicode above \uFFFF
// Use character filtering instead
text = removeEmojis(text);
// Replace various dashes and symbols
Map<String, String> replacements = new HashMap<>();
replacements.put("", "-"); // en dash
replacements.put("", "-"); // non-breaking hyphen
replacements.put("", "-"); // em dash
replacements.put("¯", " "); // macron
replacements.put("_", " "); // underscore
replacements.put("\u201C", "\""); // left double quote
replacements.put("\u201D", "\""); // right double quote
replacements.put("\u2018", "'"); // left single quote
replacements.put("\u2019", "'"); // right single quote
replacements.put("´", "'"); // acute accent
replacements.put("`", "'"); // grave accent
replacements.put("[", " "); // left bracket
replacements.put("]", " "); // right bracket
replacements.put("|", " "); // vertical bar
replacements.put("/", " "); // slash
replacements.put("#", " "); // hash
replacements.put("", " "); // right arrow
replacements.put("", " "); // left arrow
for (Map.Entry<String, String> entry : replacements.entrySet()) {
text = text.replace(entry.getKey(), entry.getValue());
}
// Remove combining diacritics // FIXME: this should be fixed for non-English languages
text = text.replaceAll("[\\u0302\\u0303\\u0304\\u0305\\u0306\\u0307\\u0308\\u030A\\u030B\\u030C\\u0327\\u0328\\u0329\\u032A\\u032B\\u032C\\u032D\\u032E\\u032F]", "");
// Remove special symbols
text = text.replaceAll("[♥☆♡©\\\\]", "");
// Replace known expressions
Map<String, String> exprReplacements = new HashMap<>();
exprReplacements.put("@", " at ");
exprReplacements.put("e.g.,", "for example, ");
exprReplacements.put("i.e.,", "that is, ");
for (Map.Entry<String, String> entry : exprReplacements.entrySet()) {
text = text.replace(entry.getKey(), entry.getValue());
}
// Fix spacing around punctuation
text = text.replaceAll(" ,", ",");
text = text.replaceAll(" \\.", ".");
text = text.replaceAll(" !", "!");
text = text.replaceAll(" \\?", "?");
text = text.replaceAll(" ;", ";");
text = text.replaceAll(" :", ":");
text = text.replaceAll(" '", "'");
// Remove duplicate quotes
while (text.contains("\"\"")) {
text = text.replace("\"\"", "\"");
}
while (text.contains("''")) {
text = text.replace("''", "'");
}
while (text.contains("``")) {
text = text.replace("``", "`");
}
// Remove extra spaces
text = text.replaceAll("\\s+", " ").trim();
// If text doesn't end with punctuation, quotes, or closing brackets, add a period
if (!text.matches(".*[.!?;:,'\"\\u201C\\u201D\\u2018\\u2019)\\]}…。」』】〉》›»]$")) {
text += ".";
}
return text;
} }
private int[] textToUnicodeValues(String text) { private int[] textToUnicodeValues(String text) {
+2
View File
@@ -4,6 +4,8 @@ This guide provides examples for running TTS inference using `ExampleONNX.java`.
## 📰 Update News ## 📰 Update News
**2025.11.23** - Enhanced text preprocessing with comprehensive normalization, emoji removal, symbol replacement, and punctuation handling for improved synthesis quality.
**2025.11.19** - Added `--speed` parameter to control speech synthesis speed (default: 1.05, recommended range: 0.9-1.5). **2025.11.19** - Added `--speed` parameter to control speech synthesis speed (default: 1.05, recommended range: 0.9-1.5).
**2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses. **2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
+2
View File
@@ -4,6 +4,8 @@ Node.js implementation for TTS inference. Uses ONNX Runtime to generate speech f
## 📰 Update News ## 📰 Update News
**2025.11.23** - Enhanced text preprocessing with comprehensive normalization, emoji removal, symbol replacement, and punctuation handling for improved synthesis quality.
**2025.11.19** - Added `--speed` parameter to control speech synthesis speed (default: 1.05, recommended range: 0.9-1.5). **2025.11.19** - Added `--speed` parameter to control speech synthesis speed (default: 1.05, recommended range: 0.9-1.5).
**2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses. **2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
+79 -2
View File
@@ -14,8 +14,85 @@ class UnicodeProcessor {
} }
_preprocessText(text) { _preprocessText(text) {
// Simple NFKD normalization (JavaScript has normalize built-in) // TODO: Need advanced normalizer for better performance
return text.normalize('NFKD'); text = text.normalize('NFKD');
// FIXME: this should be fixed for non-English languages
// Remove emojis (wide Unicode range)
const emojiPattern = /[\u{1F600}-\u{1F64F}\u{1F300}-\u{1F5FF}\u{1F680}-\u{1F6FF}\u{1F700}-\u{1F77F}\u{1F780}-\u{1F7FF}\u{1F800}-\u{1F8FF}\u{1F900}-\u{1F9FF}\u{1FA00}-\u{1FA6F}\u{1FA70}-\u{1FAFF}\u{2600}-\u{26FF}\u{2700}-\u{27BF}\u{1F1E6}-\u{1F1FF}]+/gu;
text = text.replace(emojiPattern, '');
// Replace various dashes and symbols
const replacements = {
'': '-',
'': '-',
'—': '-',
'¯': ' ',
'_': ' ',
'"': '"',
'"': '"',
'\u2018': "'", // left single quote
'\u2019': "'", // right single quote
'´': "'",
'`': "'",
'[': ' ',
']': ' ',
'|': ' ',
'/': ' ',
'#': ' ',
'→': ' ',
'←': ' ',
};
for (const [k, v] of Object.entries(replacements)) {
text = text.replaceAll(k, v);
}
// Remove combining diacritics // FIXME: this should be fixed for non-English languages
text = text.replace(/[\u0302\u0303\u0304\u0305\u0306\u0307\u0308\u030A\u030B\u030C\u0327\u0328\u0329\u032A\u032B\u032C\u032D\u032E\u032F]/g, '');
// Remove special symbols
text = text.replace(/[♥☆♡©\\]/g, '');
// Replace known expressions
const exprReplacements = {
'@': ' at ',
'e.g.,': 'for example, ',
'i.e.,': 'that is, ',
};
for (const [k, v] of Object.entries(exprReplacements)) {
text = text.replaceAll(k, v);
}
// Fix spacing around punctuation
text = text.replace(/ ,/g, ',');
text = text.replace(/ \./g, '.');
text = text.replace(/ !/g, '!');
text = text.replace(/ \?/g, '?');
text = text.replace(/ ;/g, ';');
text = text.replace(/ :/g, ':');
text = text.replace(/ '/g, "'");
// Remove duplicate quotes
while (text.includes('""')) {
text = text.replace('""', '"');
}
while (text.includes("''")) {
text = text.replace("''", "'");
}
while (text.includes('``')) {
text = text.replace('``', '`');
}
// Remove extra spaces
text = text.replace(/\s+/g, ' ').trim();
// If text doesn't end with punctuation, quotes, or closing brackets, add a period
if (!/[.!?;:,'\"')\]}…。」』】〉》›»]$/.test(text)) {
text += '.';
}
return text;
} }
_textToUnicodeValues(text) { _textToUnicodeValues(text) {
+2
View File
@@ -4,6 +4,8 @@ This guide provides examples for running TTS inference using `example_onnx.py`.
## 📰 Update News ## 📰 Update News
**2025.11.23** - Enhanced text preprocessing with comprehensive normalization, emoji removal, symbol replacement, and punctuation handling for improved synthesis quality.
**2025.11.19** - Added `--speed` parameter to control speech synthesis speed. Adjust the speed factor to make speech faster or slower while maintaining natural quality. **2025.11.19** - Added `--speed` parameter to control speech synthesis speed. Adjust the speed factor to make speech faster or slower while maintaining natural quality.
**2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses. **2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
+91 -1
View File
@@ -8,6 +8,8 @@ from unicodedata import normalize
import numpy as np import numpy as np
import onnxruntime as ort import onnxruntime as ort
import re
class UnicodeProcessor: class UnicodeProcessor:
def __init__(self, unicode_indexer_path: str): def __init__(self, unicode_indexer_path: str):
@@ -15,8 +17,96 @@ class UnicodeProcessor:
self.indexer = json.load(f) self.indexer = json.load(f)
def _preprocess_text(self, text: str) -> str: def _preprocess_text(self, text: str) -> str:
# TODO: add more preprocessing # TODO: Need advanced normalizer for better performance
text = normalize("NFKD", text) text = normalize("NFKD", text)
# FIXME: this should be fixed for non-English languages
# Remove emojis (wide Unicode range)
emoji_pattern = re.compile(
"[\U0001f600-\U0001f64f" # emoticons
"\U0001f300-\U0001f5ff" # symbols & pictographs
"\U0001f680-\U0001f6ff" # transport & map symbols
"\U0001f700-\U0001f77f"
"\U0001f780-\U0001f7ff"
"\U0001f800-\U0001f8ff"
"\U0001f900-\U0001f9ff"
"\U0001fa00-\U0001fa6f"
"\U0001fa70-\U0001faff"
"\u2600-\u26ff"
"\u2700-\u27bf"
"\U0001f1e6-\U0001f1ff]+",
flags=re.UNICODE,
)
text = emoji_pattern.sub("", text)
# Replace various dashes and symbols
replacements = {
"": "-",
"": "-",
"": "-",
"¯": " ",
"_": " ",
"": '"',
"": '"',
"": "'",
"": "'",
"´": "'",
"`": "'",
"[": " ",
"]": " ",
"|": " ",
"/": " ",
"#": " ",
"": " ",
"": " ",
}
for k, v in replacements.items():
text = text.replace(k, v)
# Remove combining diacritics # FIXME: this should be fixed for non-English languages
text = re.sub(
r"[\u0302\u0303\u0304\u0305\u0306\u0307\u0308\u030A\u030B\u030C\u0327\u0328\u0329\u032A\u032B\u032C\u032D\u032E\u032F]",
"",
text,
)
# Remove special symbols
text = re.sub(r"[♥☆♡©\\]", "", text)
# Replace known expressions
expr_replacements = {
"@": " at ",
"e.g.,": "for example, ",
"i.e.,": "that is, ",
}
for k, v in expr_replacements.items():
text = text.replace(k, v)
# Fix spacing around punctuation
text = re.sub(r" ,", ",", text)
text = re.sub(r" \.", ".", text)
text = re.sub(r" !", "!", text)
text = re.sub(r" \?", "?", text)
text = re.sub(r" ;", ";", text)
text = re.sub(r" :", ":", text)
text = re.sub(r" '", "'", text)
# Remove duplicate quotes
while '""' in text:
text = text.replace('""', '"')
while "''" in text:
text = text.replace("''", "'")
while "``" in text:
text = text.replace("``", "`")
# Remove extra spaces
text = re.sub(r"\s+", " ", text).strip()
# If text doesn't end with punctuation, quotes, or closing brackets, add a period
if not re.search(r"[.!?;:,'\"')\]}…。」』】〉》›»]$", text):
text += "."
return text return text
def _get_text_mask(self, text_ids_lengths: np.ndarray) -> np.ndarray: def _get_text_mask(self, text_ids_lengths: np.ndarray) -> np.ndarray:
+2
View File
@@ -4,6 +4,8 @@ This guide provides examples for running TTS inference using Rust.
## 📰 Update News ## 📰 Update News
**2025.11.23** - Enhanced text preprocessing with comprehensive normalization, emoji removal, symbol replacement, and punctuation handling for improved synthesis quality.
**2025.11.19** - Added `--speed` parameter to control speech synthesis speed (default: 1.05, recommended range: 0.9-1.5). **2025.11.19** - Added `--speed` parameter to control speech synthesis speed (default: 1.05, recommended range: 0.9-1.5).
**2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses. **2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
+89 -1
View File
@@ -113,7 +113,95 @@ impl UnicodeProcessor {
} }
pub fn preprocess_text(text: &str) -> String { pub fn preprocess_text(text: &str) -> String {
text.nfkd().collect() // TODO: Need advanced normalizer for better performance
let mut text: String = text.nfkd().collect();
// FIXME: this should be fixed for non-English languages
// Remove emojis (wide Unicode range)
let emoji_pattern = Regex::new(r"[\x{1F600}-\x{1F64F}\x{1F300}-\x{1F5FF}\x{1F680}-\x{1F6FF}\x{1F700}-\x{1F77F}\x{1F780}-\x{1F7FF}\x{1F800}-\x{1F8FF}\x{1F900}-\x{1F9FF}\x{1FA00}-\x{1FA6F}\x{1FA70}-\x{1FAFF}\x{2600}-\x{26FF}\x{2700}-\x{27BF}\x{1F1E6}-\x{1F1FF}]+").unwrap();
text = emoji_pattern.replace_all(&text, "").to_string();
// Replace various dashes and symbols
let replacements = [
("", "-"), // en dash
("", "-"), // non-breaking hyphen
("", "-"), // em dash
("¯", " "), // macron
("_", " "), // underscore
("\u{201C}", "\""), // left double quote
("\u{201D}", "\""), // right double quote
("\u{2018}", "'"), // left single quote
("\u{2019}", "'"), // right single quote
("´", "'"), // acute accent
("`", "'"), // grave accent
("[", " "), // left bracket
("]", " "), // right bracket
("|", " "), // vertical bar
("/", " "), // slash
("#", " "), // hash
("", " "), // right arrow
("", " "), // left arrow
];
for (from, to) in &replacements {
text = text.replace(from, to);
}
// Remove combining diacritics // FIXME: this should be fixed for non-English languages
let diacritics_pattern = Regex::new(r"[\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{030A}\u{030B}\u{030C}\u{0327}\u{0328}\u{0329}\u{032A}\u{032B}\u{032C}\u{032D}\u{032E}\u{032F}]").unwrap();
text = diacritics_pattern.replace_all(&text, "").to_string();
// Remove special symbols
let special_symbols = ["", "", "", "©", "\\"];
for symbol in &special_symbols {
text = text.replace(symbol, "");
}
// Replace known expressions
let expr_replacements = [
("@", " at "),
("e.g.,", "for example, "),
("i.e.,", "that is, "),
];
for (from, to) in &expr_replacements {
text = text.replace(from, to);
}
// Fix spacing around punctuation
text = Regex::new(r" ,").unwrap().replace_all(&text, ",").to_string();
text = Regex::new(r" \.").unwrap().replace_all(&text, ".").to_string();
text = Regex::new(r" !").unwrap().replace_all(&text, "!").to_string();
text = Regex::new(r" \?").unwrap().replace_all(&text, "?").to_string();
text = Regex::new(r" ;").unwrap().replace_all(&text, ";").to_string();
text = Regex::new(r" :").unwrap().replace_all(&text, ":").to_string();
text = Regex::new(r" '").unwrap().replace_all(&text, "'").to_string();
// Remove duplicate quotes
while text.contains("\"\"") {
text = text.replace("\"\"", "\"");
}
while text.contains("''") {
text = text.replace("''", "'");
}
while text.contains("``") {
text = text.replace("``", "`");
}
// Remove extra spaces
text = Regex::new(r"\s+").unwrap().replace_all(&text, " ").to_string();
text = text.trim().to_string();
// If text doesn't end with punctuation, quotes, or closing brackets, add a period
if !text.is_empty() {
let ends_with_punct = Regex::new(r#"[.!?;:,'"\u{201C}\u{201D}\u{2018}\u{2019})\]}…。」』】〉》›»]$"#).unwrap();
if !ends_with_punct.is_match(&text) {
text.push('.');
}
}
text
} }
pub fn text_to_unicode_values(text: &str) -> Vec<usize> { pub fn text_to_unicode_values(text: &str) -> Vec<usize> {
+2
View File
@@ -4,6 +4,8 @@ This guide provides examples for running TTS inference using `example_onnx`.
## 📰 Update News ## 📰 Update News
**2025.11.23** - Enhanced text preprocessing with comprehensive normalization, emoji removal, symbol replacement, and punctuation handling for improved synthesis quality.
**2025.11.19** - Added `--speed` parameter to control speech synthesis speed (default: 1.05, recommended range: 0.9-1.5). **2025.11.19** - Added `--speed` parameter to control speech synthesis speed (default: 1.05, recommended range: 0.9-1.5).
**2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses. **2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
+108 -1
View File
@@ -72,7 +72,114 @@ class UnicodeProcessor {
} }
/// Normalizes raw input text before it is converted to model token ids.
///
/// Applies, in order: NFKD normalization, emoji removal, symbol-to-ASCII
/// replacement, removal of a fixed set of combining diacritics, removal of a
/// few decorative symbols, expansion of `@` / `e.g.,` / `i.e.,`, removal of a
/// space directly before punctuation, collapsing of repeated quotes and
/// whitespace, and finally appends `.` when the non-empty result does not end
/// with punctuation, a quote or a closing bracket.
///
/// - Parameter text: The raw user text.
/// - Returns: The normalized text; an empty string when nothing remains.
func preprocessText(_ text: String) -> String {
    // TODO: Need advanced normalizer for better performance
    // Use the decomposed (NFKD) form: the diacritic-removal step below only
    // sees combining marks once precomposed letters have been split apart.
    var text = text.decomposedStringWithCompatibilityMapping

    // FIXME: this should be fixed for non-English languages
    // Remove emojis (wide Unicode range)
    // Swift NSRegularExpression doesn't support Unicode escapes above \uFFFF
    // Use character filtering instead
    let emojiRanges: [ClosedRange<UInt32>] = [
        0x1F600...0x1F64F,
        0x1F300...0x1F5FF,
        0x1F680...0x1F6FF,
        0x1F700...0x1F77F,
        0x1F780...0x1F7FF,
        0x1F800...0x1F8FF,
        0x1F900...0x1F9FF,
        0x1FA00...0x1FA6F,
        0x1FA70...0x1FAFF,
        0x2600...0x26FF,
        0x2700...0x27BF,
        0x1F1E6...0x1F1FF,
    ]
    var keptScalars = String.UnicodeScalarView()
    for scalar in text.unicodeScalars where !emojiRanges.contains(where: { $0.contains(scalar.value) }) {
        keptScalars.append(scalar)
    }
    text = String(keptScalars)

    // Replace various dashes and symbols.
    // An ordered list of pairs (not a dictionary) keeps the application order
    // deterministic; non-ASCII keys are written as escapes so they cannot be
    // silently lost, and duplicate keys cannot trap at runtime.
    let replacements: [(String, String)] = [
        ("\u{2013}", "-"),   // en dash
        ("\u{2011}", "-"),   // non-breaking hyphen
        ("\u{2014}", "-"),   // em dash
        ("\u{00AF}", " "),   // macron
        ("_", " "),          // underscore
        ("\u{201C}", "\""),  // left double quote
        ("\u{201D}", "\""),  // right double quote
        ("\u{2018}", "'"),   // left single quote
        ("\u{2019}", "'"),   // right single quote
        ("\u{00B4}", "'"),   // acute accent
        ("`", "'"),          // grave accent
        ("[", " "),          // left bracket
        ("]", " "),          // right bracket
        ("|", " "),          // vertical bar
        ("/", " "),          // slash
        ("#", " "),          // hash
        ("\u{2192}", " "),   // right arrow
        ("\u{2190}", " "),   // left arrow
    ]
    for (old, new) in replacements {
        text = text.replacingOccurrences(of: old, with: new)
    }

    // Remove combining diacritics // FIXME: this should be fixed for non-English languages
    let diacriticsPattern = try! NSRegularExpression(pattern: "[\\u0302\\u0303\\u0304\\u0305\\u0306\\u0307\\u0308\\u030A\\u030B\\u030C\\u0327\\u0328\\u0329\\u032A\\u032B\\u032C\\u032D\\u032E\\u032F]")
    let diacriticsRange = NSRange(text.startIndex..., in: text)
    text = diacriticsPattern.stringByReplacingMatches(in: text, range: diacriticsRange, withTemplate: "")

    // Remove special symbols: black heart, white star, white heart,
    // copyright sign and backslash.
    let specialSymbols = ["\u{2665}", "\u{2606}", "\u{2661}", "\u{00A9}", "\\"]
    for symbol in specialSymbols {
        text = text.replacingOccurrences(of: symbol, with: "")
    }

    // Replace known expressions
    let exprReplacements: [(String, String)] = [
        ("@", " at "),
        ("e.g.,", "for example, "),
        ("i.e.,", "that is, "),
    ]
    for (old, new) in exprReplacements {
        text = text.replacingOccurrences(of: old, with: new)
    }

    // Fix spacing around punctuation: drop one space directly before a mark.
    for mark in [",", ".", "!", "?", ";", ":", "'"] {
        text = text.replacingOccurrences(of: " " + mark, with: mark)
    }

    // Remove duplicate quotes; repeat until each run is a single quote.
    for quote in ["\"", "'", "`"] {
        let doubled = quote + quote
        while text.contains(doubled) {
            text = text.replacingOccurrences(of: doubled, with: quote)
        }
    }

    // Remove extra spaces
    let whitespacePattern = try! NSRegularExpression(pattern: "\\s+")
    let whitespaceRange = NSRange(text.startIndex..., in: text)
    text = whitespacePattern.stringByReplacingMatches(in: text, range: whitespaceRange, withTemplate: " ")
    text = text.trimmingCharacters(in: .whitespacesAndNewlines)

    // If text doesn't end with punctuation, quotes, or closing brackets, add a period
    if !text.isEmpty {
        let punctPattern = try! NSRegularExpression(pattern: "[.!?;:,'\"\\u201C\\u201D\\u2018\\u2019)\\]}…。」』】〉》›»]$")
        let punctRange = NSRange(text.startIndex..., in: text)
        if punctPattern.firstMatch(in: text, range: punctRange) == nil {
            text += "."
        }
    }

    return text
}
func lengthToMask(_ lengths: [Int], maxLen: Int? = nil) -> [[[Float]]] { func lengthToMask(_ lengths: [Int], maxLen: Int? = nil) -> [[[Float]]] {
+2
View File
@@ -4,6 +4,8 @@ This example demonstrates how to use Supertonic in a web browser using ONNX Runt
## 📰 Update News ## 📰 Update News
**2025.11.23** - Enhanced text preprocessing with comprehensive normalization, emoji removal, symbol replacement, and punctuation handling for improved synthesis quality.
**2025.11.19** - Added speed control slider to adjust speech synthesis speed (default: 1.05, recommended range: 0.9-1.5). **2025.11.19** - Added speed control slider to adjust speech synthesis speed (default: 1.05, recommended range: 0.9-1.5).
**2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses. **2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
+79 -1
View File
@@ -28,7 +28,85 @@ export class UnicodeProcessor {
} }
preprocessText(text) { preprocessText(text) {
return text.normalize('NFKC'); // TODO: Need advanced normalizer for better performance
text = text.normalize('NFKD');
// FIXME: this should be fixed for non-English languages
// Remove emojis (wide Unicode range)
const emojiPattern = /[\u{1F600}-\u{1F64F}\u{1F300}-\u{1F5FF}\u{1F680}-\u{1F6FF}\u{1F700}-\u{1F77F}\u{1F780}-\u{1F7FF}\u{1F800}-\u{1F8FF}\u{1F900}-\u{1F9FF}\u{1FA00}-\u{1FA6F}\u{1FA70}-\u{1FAFF}\u{2600}-\u{26FF}\u{2700}-\u{27BF}\u{1F1E6}-\u{1F1FF}]+/gu;
text = text.replace(emojiPattern, '');
// Replace various dashes and symbols
const replacements = {
'': '-',
'': '-',
'—': '-',
'¯': ' ',
'_': ' ',
'"': '"',
'"': '"',
'\u2018': "'", // left single quote
'\u2019': "'", // right single quote
'´': "'",
'`': "'",
'[': ' ',
']': ' ',
'|': ' ',
'/': ' ',
'#': ' ',
'→': ' ',
'←': ' ',
};
for (const [k, v] of Object.entries(replacements)) {
text = text.replaceAll(k, v);
}
// Remove combining diacritics // FIXME: this should be fixed for non-English languages
text = text.replace(/[\u0302\u0303\u0304\u0305\u0306\u0307\u0308\u030A\u030B\u030C\u0327\u0328\u0329\u032A\u032B\u032C\u032D\u032E\u032F]/g, '');
// Remove special symbols
text = text.replace(/[♥☆♡©\\]/g, '');
// Replace known expressions
const exprReplacements = {
'@': ' at ',
'e.g.,': 'for example, ',
'i.e.,': 'that is, ',
};
for (const [k, v] of Object.entries(exprReplacements)) {
text = text.replaceAll(k, v);
}
// Fix spacing around punctuation
text = text.replace(/ ,/g, ',');
text = text.replace(/ \./g, '.');
text = text.replace(/ !/g, '!');
text = text.replace(/ \?/g, '?');
text = text.replace(/ ;/g, ';');
text = text.replace(/ :/g, ':');
text = text.replace(/ '/g, "'");
// Remove duplicate quotes
while (text.includes('""')) {
text = text.replace('""', '"');
}
while (text.includes("''")) {
text = text.replace("''", "'");
}
while (text.includes('``')) {
text = text.replace('``', '`');
}
// Remove extra spaces
text = text.replace(/\s+/g, ' ').trim();
// If text doesn't end with punctuation, quotes, or closing brackets, add a period
if (!/[.!?;:,'\"')\]}…。」』】〉》›»]$/.test(text)) {
text += '.';
}
return text;
} }
getTextMask(textIdsLengths) { getTextMask(textIdsLengths) {