mirror of
https://github.com/supertone-inc/supertonic.git
synced 2026-06-02 01:38:48 +02:00
Fix text normalization bug (#16)
This commit is contained in:
@@ -4,6 +4,8 @@ High-performance text-to-speech inference using ONNX Runtime.
|
|||||||
|
|
||||||
## 📰 Update News
|
## 📰 Update News
|
||||||
|
|
||||||
|
**2025.11.23** - Enhanced text preprocessing with comprehensive normalization, emoji removal, symbol replacement, and punctuation handling for improved synthesis quality.
|
||||||
|
|
||||||
**2025.11.19** - Added `--speed` parameter to control speech synthesis speed (default: 1.05, recommended range: 0.9-1.5).
|
**2025.11.19** - Added `--speed` parameter to control speech synthesis speed (default: 1.05, recommended range: 0.9-1.5).
|
||||||
|
|
||||||
**2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
|
**2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
|
||||||
|
|||||||
+160
-18
@@ -19,6 +19,24 @@ void clearTensorBuffers() {
|
|||||||
g_tensor_buffers_int64.clear();
|
g_tensor_buffers_int64.clear();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ============================================================================
|
||||||
|
// Helper function - trim
|
||||||
|
// ============================================================================
|
||||||
|
|
||||||
|
/// @brief Returns a copy of @p str without leading or trailing whitespace.
///
/// Whitespace is whatever std::isspace reports for each byte; every byte is
/// converted to unsigned char first because passing a negative char value to
/// std::isspace is undefined behaviour.
///
/// @param str Text to trim; it is not modified.
/// @return The trimmed copy (empty when @p str is empty or all whitespace).
static std::string trim(const std::string& str) {
    const auto is_space = [](char c) {
        return std::isspace(static_cast<unsigned char>(c)) != 0;
    };

    // Advance past the leading whitespace run.
    size_t first = 0;
    while (first < str.size() && is_space(str[first])) {
        ++first;
    }

    // Walk back over the trailing whitespace run, never crossing `first`.
    size_t last = str.size();
    while (last > first && is_space(str[last - 1])) {
        --last;
    }

    return str.substr(first, last - first);
}
|
||||||
|
|
||||||
// ============================================================================
|
// ============================================================================
|
||||||
// UnicodeProcessor implementation
|
// UnicodeProcessor implementation
|
||||||
// ============================================================================
|
// ============================================================================
|
||||||
@@ -28,10 +46,148 @@ UnicodeProcessor::UnicodeProcessor(const std::string& unicode_indexer_json_path)
|
|||||||
}
|
}
|
||||||
|
|
||||||
std::string UnicodeProcessor::preprocessText(const std::string& text) {
|
std::string UnicodeProcessor::preprocessText(const std::string& text) {
|
||||||
// Simple NFKD normalization (C++ doesn't have built-in Unicode normalization)
|
// TODO: Need advanced normalizer for better performance
|
||||||
// For now, just return the text as-is
|
// NOTE: C++ doesn't have built-in Unicode normalization like Python's NFKD
|
||||||
// TODO: add proper Unicode normalization
|
// For full Unicode normalization, consider using ICU library
|
||||||
return text;
|
// This implementation handles basic text preprocessing
|
||||||
|
|
||||||
|
std::string result = text;
|
||||||
|
|
||||||
|
// FIXME: this should be fixed for non-English languages
|
||||||
|
|
||||||
|
// Remove emojis and various Unicode symbols
|
||||||
|
// Using regex to remove common emoji ranges and special symbols
|
||||||
|
// Note: This is a simplified version - full emoji support needs UTF-8 handling
|
||||||
|
std::regex emoji_pattern(
|
||||||
|
"[\xF0\x9F][\x80-\xBF]{2}|" // Common emoji pattern in UTF-8
|
||||||
|
"[\xE2][\x80-\xBF]{2}|" // Various symbols
|
||||||
|
"[\xE2][\x98-\x9E][\x80-\xBF]" // More symbols
|
||||||
|
);
|
||||||
|
result = std::regex_replace(result, emoji_pattern, "");
|
||||||
|
|
||||||
|
// Replace various dashes and symbols
|
||||||
|
struct Replacement {
|
||||||
|
const char* from;
|
||||||
|
const char* to;
|
||||||
|
};
|
||||||
|
|
||||||
|
const Replacement replacements[] = {
|
||||||
|
{"–", "-"}, // en dash
|
||||||
|
{"‑", "-"}, // non-breaking hyphen
|
||||||
|
{"—", "-"}, // em dash
|
||||||
|
{"¯", " "}, // macron
|
||||||
|
{"_", " "}, // underscore
|
||||||
|
{""", "\""}, // left double quote (U+201C)
|
||||||
|
{""", "\""}, // right double quote (U+201D)
|
||||||
|
{"'", "'"}, // left single quote (U+2018)
|
||||||
|
{"'", "'"}, // right single quote (U+2019)
|
||||||
|
{"´", "'"}, // acute accent
|
||||||
|
{"`", "'"}, // grave accent
|
||||||
|
{"[", " "}, // left bracket
|
||||||
|
{"]", " "}, // right bracket
|
||||||
|
{"|", " "}, // vertical bar
|
||||||
|
{"/", " "}, // slash
|
||||||
|
{"#", " "}, // hash
|
||||||
|
{"→", " "}, // right arrow
|
||||||
|
{"←", " "}, // left arrow
|
||||||
|
};
|
||||||
|
|
||||||
|
for (const auto& repl : replacements) {
|
||||||
|
size_t pos = 0;
|
||||||
|
while ((pos = result.find(repl.from, pos)) != std::string::npos) {
|
||||||
|
result.replace(pos, strlen(repl.from), repl.to);
|
||||||
|
pos += strlen(repl.to);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Remove combining diacritics (common combining marks in UTF-8)
|
||||||
|
// FIXME: this should be fixed for non-English languages
|
||||||
|
std::regex diacritics_pattern(
|
||||||
|
"[\xCC\xCD][\x80-\xBF]" // Combining diacritical marks range
|
||||||
|
);
|
||||||
|
result = std::regex_replace(result, diacritics_pattern, "");
|
||||||
|
|
||||||
|
// Remove special symbols
|
||||||
|
const char* special_symbols[] = {"♥", "☆", "♡", "©", "\\"};
|
||||||
|
for (const char* symbol : special_symbols) {
|
||||||
|
size_t pos = 0;
|
||||||
|
while ((pos = result.find(symbol, pos)) != std::string::npos) {
|
||||||
|
result.erase(pos, strlen(symbol));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Replace known expressions
|
||||||
|
const Replacement expr_replacements[] = {
|
||||||
|
{"@", " at "},
|
||||||
|
{"e.g.,", "for example, "},
|
||||||
|
{"i.e.,", "that is, "},
|
||||||
|
};
|
||||||
|
|
||||||
|
for (const auto& repl : expr_replacements) {
|
||||||
|
size_t pos = 0;
|
||||||
|
while ((pos = result.find(repl.from, pos)) != std::string::npos) {
|
||||||
|
result.replace(pos, strlen(repl.from), repl.to);
|
||||||
|
pos += strlen(repl.to);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fix spacing around punctuation
|
||||||
|
result = std::regex_replace(result, std::regex(" ,"), ",");
|
||||||
|
result = std::regex_replace(result, std::regex(" \\."), ".");
|
||||||
|
result = std::regex_replace(result, std::regex(" !"), "!");
|
||||||
|
result = std::regex_replace(result, std::regex(" \\?"), "?");
|
||||||
|
result = std::regex_replace(result, std::regex(" ;"), ";");
|
||||||
|
result = std::regex_replace(result, std::regex(" :"), ":");
|
||||||
|
result = std::regex_replace(result, std::regex(" '"), "'");
|
||||||
|
|
||||||
|
// Remove duplicate quotes
|
||||||
|
while (result.find("\"\"") != std::string::npos) {
|
||||||
|
size_t pos = result.find("\"\"");
|
||||||
|
result.replace(pos, 2, "\"");
|
||||||
|
}
|
||||||
|
while (result.find("''") != std::string::npos) {
|
||||||
|
size_t pos = result.find("''");
|
||||||
|
result.replace(pos, 2, "'");
|
||||||
|
}
|
||||||
|
while (result.find("``") != std::string::npos) {
|
||||||
|
size_t pos = result.find("``");
|
||||||
|
result.replace(pos, 2, "`");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Remove extra spaces
|
||||||
|
result = std::regex_replace(result, std::regex("\\s+"), " ");
|
||||||
|
result = trim(result);
|
||||||
|
|
||||||
|
// If text doesn't end with punctuation, quotes, or closing brackets, add a period
|
||||||
|
if (!result.empty()) {
|
||||||
|
char last_char = result.back();
|
||||||
|
bool ends_with_punct = (
|
||||||
|
last_char == '.' || last_char == '!' || last_char == '?' ||
|
||||||
|
last_char == ';' || last_char == ':' || last_char == ',' ||
|
||||||
|
last_char == '\'' || last_char == '"' || last_char == ')' ||
|
||||||
|
last_char == ']' || last_char == '}' || last_char == '>'
|
||||||
|
);
|
||||||
|
|
||||||
|
// Check for UTF-8 multibyte ending punctuation (e.g., …, 。, curly quotes, etc.)
|
||||||
|
if (!ends_with_punct && result.size() >= 3) {
|
||||||
|
std::string last_three = result.substr(result.size() - 3);
|
||||||
|
if (last_three == "…" || last_three == "。" ||
|
||||||
|
last_three == "」" || last_three == "』" ||
|
||||||
|
last_three == "】" || last_three == "〉" ||
|
||||||
|
last_three == "》" || last_three == "›" ||
|
||||||
|
last_three == "»" || last_three == """ ||
|
||||||
|
last_three == """ || last_three == "'" ||
|
||||||
|
last_three == "'") {
|
||||||
|
ends_with_punct = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!ends_with_punct) {
|
||||||
|
result += ".";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<uint16_t> UnicodeProcessor::textToUnicodeValues(const std::string& text) {
|
std::vector<uint16_t> UnicodeProcessor::textToUnicodeValues(const std::string& text) {
|
||||||
@@ -772,20 +928,6 @@ std::string sanitizeFilename(const std::string& text, int max_len) {
|
|||||||
// Chunk text
|
// Chunk text
|
||||||
// ============================================================================
|
// ============================================================================
|
||||||
|
|
||||||
// NOTE(review): this copy of trim() appears to be the removed side of the hunk
// (an identical definition is shown as added near the top of the C++ diff) — confirm
// against the real file before relying on it.
// Returns a copy of `str` with leading and trailing whitespace (per std::isspace) removed.
static std::string trim(const std::string& str) {
|
|
||||||
size_t start = 0;
|
|
||||||
while (start < str.size() && std::isspace(static_cast<unsigned char>(str[start]))) {
|
|
||||||
start++;
|
|
||||||
}
|
|
||||||
|
|
||||||
size_t end = str.size();
|
|
||||||
while (end > start && std::isspace(static_cast<unsigned char>(str[end - 1]))) {
|
|
||||||
end--;
|
|
||||||
}
|
|
||||||
|
|
||||||
return str.substr(start, end - start);
|
|
||||||
}
|
|
||||||
|
|
||||||
std::vector<std::string> chunkText(const std::string& text, int max_len) {
|
std::vector<std::string> chunkText(const std::string& text, int max_len) {
|
||||||
std::vector<std::string> chunks;
|
std::vector<std::string> chunks;
|
||||||
|
|
||||||
|
|||||||
+136
-2
@@ -71,10 +71,144 @@ namespace Supertonic
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// Returns <paramref name="text"/> with every code point that falls in one of the
/// emoji / pictograph ranges removed. All other UTF-16 units, including unpaired
/// surrogates, are copied through unchanged.
/// </summary>
/// <param name="text">Input text.</param>
/// <returns>The filtered text.</returns>
private static string RemoveEmojis(string text)
{
    var kept = new StringBuilder(text.Length);
    int i = 0;
    while (i < text.Length)
    {
        // A well-formed surrogate pair is one code point spread over two UTF-16 units.
        bool isPair = char.IsSurrogatePair(text, i);
        int width = isPair ? 2 : 1;
        int codePoint = isPair ? char.ConvertToUtf32(text, i) : text[i];

        if (!IsEmojiCodePoint(codePoint))
        {
            // Copy the original units instead of re-encoding the code point.
            kept.Append(text, i, width);
        }
        i += width;
    }
    return kept.ToString();
}

/// <summary>True when <paramref name="codePoint"/> lies in a removed emoji range.</summary>
private static bool IsEmojiCodePoint(int codePoint)
{
    return (codePoint >= 0x1F600 && codePoint <= 0x1F64F) ||
           (codePoint >= 0x1F300 && codePoint <= 0x1F5FF) ||
           (codePoint >= 0x1F680 && codePoint <= 0x1F6FF) ||
           (codePoint >= 0x1F700 && codePoint <= 0x1F77F) ||
           (codePoint >= 0x1F780 && codePoint <= 0x1F7FF) ||
           (codePoint >= 0x1F800 && codePoint <= 0x1F8FF) ||
           (codePoint >= 0x1F900 && codePoint <= 0x1F9FF) ||
           (codePoint >= 0x1FA00 && codePoint <= 0x1FA6F) ||
           (codePoint >= 0x1FA70 && codePoint <= 0x1FAFF) ||
           (codePoint >= 0x2600 && codePoint <= 0x26FF) ||
           (codePoint >= 0x2700 && codePoint <= 0x27BF) ||
           (codePoint >= 0x1F1E6 && codePoint <= 0x1F1FF);
}
|
||||||
|
|
||||||
/// <summary>
/// Normalises raw input text: NFKD normalisation, emoji removal, mapping of
/// typographic symbols to ASCII, removal of selected combining marks and special
/// symbols, expansion of a few known expressions, punctuation spacing fixes,
/// whitespace collapsing, and a trailing period when the text does not already
/// end with punctuation, a quote or a closing bracket.
/// </summary>
/// <param name="text">Input text.</param>
/// <returns>The normalised text.</returns>
private string PreprocessText(string text)
{
    // TODO: Need advanced normalizer for better performance
    text = text.Normalize(NormalizationForm.FormKD);

    // FIXME: this should be fixed for non-English languages

    // Remove emojis (wide Unicode range)
    // C# doesn't support \u{...} syntax in regex, so we use character filtering instead
    text = RemoveEmojis(text);

    // Replace various dashes and symbols. An array keeps the order deterministic.
    var replacements = new[]
    {
        new[] {"\u2013", "-"},  // en dash
        new[] {"\u2011", "-"},  // non-breaking hyphen
        new[] {"\u2014", "-"},  // em dash
        new[] {"\u00AF", " "},  // macron
        new[] {"_", " "},       // underscore
        new[] {"\u201C", "\""}, // left double quote
        new[] {"\u201D", "\""}, // right double quote
        new[] {"\u2018", "'"},  // left single quote
        new[] {"\u2019", "'"},  // right single quote
        new[] {"\u00B4", "'"},  // acute accent
        new[] {"`", "'"},       // grave accent
        new[] {"[", " "},       // left bracket
        new[] {"]", " "},       // right bracket
        new[] {"|", " "},       // vertical bar
        new[] {"/", " "},       // slash
        new[] {"#", " "},       // hash
        new[] {"\u2192", " "},  // right arrow
        new[] {"\u2190", " "},  // left arrow
    };
    foreach (var pair in replacements)
    {
        text = text.Replace(pair[0], pair[1]);
    }

    // Remove combining diacritics // FIXME: this should be fixed for non-English languages
    text = Regex.Replace(text, @"[\u0302\u0303\u0304\u0305\u0306\u0307\u0308\u030A\u030B\u030C\u0327\u0328\u0329\u032A\u032B\u032C\u032D\u032E\u032F]", "");

    // Remove special symbols
    text = Regex.Replace(text, @"[♥☆♡©\\]", "");

    // Replace known expressions
    var exprReplacements = new[]
    {
        new[] {"@", " at "},
        new[] {"e.g.,", "for example, "},
        new[] {"i.e.,", "that is, "},
    };
    foreach (var pair in exprReplacements)
    {
        text = text.Replace(pair[0], pair[1]);
    }

    // Fix spacing around punctuation: drop one space directly before , . ! ? ; : '
    text = Regex.Replace(text, @" ([,.!?;:'])", "$1");

    // Remove duplicate quotes. Backticks were already mapped to ' above, so only
    // the two ASCII quote characters can still repeat here.
    text = Regex.Replace(text, "\"{2,}", "\"");
    text = Regex.Replace(text, "'{2,}", "'");

    // Remove extra spaces
    text = Regex.Replace(text, @"\s+", " ").Trim();

    // If text doesn't end with punctuation, quotes, or closing brackets, add a period
    if (!Regex.IsMatch(text, @"[.!?;:,'\u0022\u201C\u201D\u2018\u2019)\]}…。」』】〉》›»]$"))
    {
        text += ".";
    }

    return text;
}
|
||||||
|
|
||||||
private int[] TextToUnicodeValues(string text)
|
private int[] TextToUnicodeValues(string text)
|
||||||
|
|||||||
@@ -4,6 +4,8 @@ This guide provides examples for running TTS inference using `ExampleONNX.cs`.
|
|||||||
|
|
||||||
## 📰 Update News
|
## 📰 Update News
|
||||||
|
|
||||||
|
**2025.11.23** - Enhanced text preprocessing with comprehensive normalization, emoji removal, symbol replacement, and punctuation handling for improved synthesis quality.
|
||||||
|
|
||||||
**2025.11.19** - Added `--speed` parameter to control speech synthesis speed (default: 1.05, recommended range: 0.9-1.5).
|
**2025.11.19** - Added `--speed` parameter to control speech synthesis speed (default: 1.05, recommended range: 0.9-1.5).
|
||||||
|
|
||||||
**2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
|
**2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
|
||||||
|
|||||||
@@ -4,6 +4,8 @@ This guide provides examples for running TTS inference using `example_onnx.go`.
|
|||||||
|
|
||||||
## 📰 Update News
|
## 📰 Update News
|
||||||
|
|
||||||
|
**2025.11.23** - Enhanced text preprocessing with comprehensive normalization, emoji removal, symbol replacement, and punctuation handling for improved synthesis quality.
|
||||||
|
|
||||||
**2025.11.19** - Added `--speed` parameter to control speech synthesis speed (default: 1.05, recommended range: 0.9-1.5).
|
**2025.11.19** - Added `--speed` parameter to control speech synthesis speed (default: 1.05, recommended range: 0.9-1.5).
|
||||||
|
|
||||||
**2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
|
**2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
|
||||||
|
|||||||
+90
-1
@@ -327,8 +327,97 @@ func splitSentences(text string) []string {
|
|||||||
|
|
||||||
// Utility functions
|
// Utility functions
|
||||||
// Patterns and replacers used by preprocessText. They are built once at package
// initialisation instead of on every call.
var (
	preprocessEmojiPattern = regexp.MustCompile(`[\x{1F600}-\x{1F64F}\x{1F300}-\x{1F5FF}\x{1F680}-\x{1F6FF}\x{1F700}-\x{1F77F}\x{1F780}-\x{1F7FF}\x{1F800}-\x{1F8FF}\x{1F900}-\x{1F9FF}\x{1FA00}-\x{1FA6F}\x{1FA70}-\x{1FAFF}\x{2600}-\x{26FF}\x{2700}-\x{27BF}\x{1F1E6}-\x{1F1FF}]+`)

	// FIXME: this should be fixed for non-English languages
	preprocessDiacriticsPattern = regexp.MustCompile(`[\x{0302}\x{0303}\x{0304}\x{0305}\x{0306}\x{0307}\x{0308}\x{030A}\x{030B}\x{030C}\x{0327}\x{0328}\x{0329}\x{032A}\x{032B}\x{032C}\x{032D}\x{032E}\x{032F}]`)

	// One space directly in front of , . ! ? ; : '
	preprocessSpaceBeforePunct = regexp.MustCompile(` ([,.!?;:'])`)
	preprocessWhitespace       = regexp.MustCompile(`\s+`)
	preprocessEndsWithPunct    = regexp.MustCompile(`[.!?;:,'"\x{201C}\x{201D}\x{2018}\x{2019})\]}…。」』】〉》›»]$`)

	// Dashes, quotes and assorted symbols mapped to plain ASCII.
	preprocessSymbolReplacer = strings.NewReplacer(
		"\u2013", "-", // en dash
		"\u2011", "-", // non-breaking hyphen
		"\u2014", "-", // em dash
		"\u00AF", " ", // macron
		"_", " ", // underscore
		"\u201C", "\"", // left double quote
		"\u201D", "\"", // right double quote
		"\u2018", "'", // left single quote
		"\u2019", "'", // right single quote
		"\u00B4", "'", // acute accent
		"`", "'", // grave accent
		"[", " ", // left bracket
		"]", " ", // right bracket
		"|", " ", // vertical bar
		"/", " ", // slash
		"#", " ", // hash
		"\u2192", " ", // right arrow
		"\u2190", " ", // left arrow
	)

	// Symbols that are removed outright.
	preprocessSpecialSymbolReplacer = strings.NewReplacer("♥", "", "☆", "", "♡", "", "©", "", "\\", "")

	// Known expressions that are spelled out.
	preprocessExprReplacer = strings.NewReplacer(
		"@", " at ",
		"e.g.,", "for example, ",
		"i.e.,", "that is, ",
	)
)

// preprocessText normalises raw input text: it removes emoji, maps typographic
// symbols to ASCII, strips selected combining marks and special symbols, expands
// a few known expressions, fixes spacing before punctuation, collapses repeated
// quotes and whitespace, and appends a period when non-empty text does not end
// with punctuation, a quote or a closing bracket.
func preprocessText(text string) string {
	// TODO: Need advanced normalizer for better performance
	// NOTE: Go doesn't have built-in NFKD normalization like Python
	// For full Unicode normalization, use golang.org/x/text/unicode/norm
	// This implementation handles basic text preprocessing

	// FIXME: this should be fixed for non-English languages

	// Remove emojis and various Unicode symbols
	text = preprocessEmojiPattern.ReplaceAllString(text, "")

	// Replace various dashes and symbols
	text = preprocessSymbolReplacer.Replace(text)

	// Remove combining diacritics (common combining marks)
	text = preprocessDiacriticsPattern.ReplaceAllString(text, "")

	// Remove special symbols
	text = preprocessSpecialSymbolReplacer.Replace(text)

	// Replace known expressions
	text = preprocessExprReplacer.Replace(text)

	// Fix spacing around punctuation
	text = preprocessSpaceBeforePunct.ReplaceAllString(text, "$1")

	// Remove duplicate quotes. Backticks were already mapped to ' above, so only
	// the two ASCII quote characters can still repeat here.
	for strings.Contains(text, `""`) {
		text = strings.ReplaceAll(text, `""`, `"`)
	}
	for strings.Contains(text, "''") {
		text = strings.ReplaceAll(text, "''", "'")
	}

	// Remove extra spaces
	text = strings.TrimSpace(preprocessWhitespace.ReplaceAllString(text, " "))

	// If text doesn't end with punctuation, quotes, or closing brackets, add a period
	if text != "" && !preprocessEndsWithPunct.MatchString(text) {
		text += "."
	}

	return text
}
|
||||||
|
|
||||||
|
|||||||
+116
-1
@@ -60,6 +60,42 @@ class UnicodeProcessor {
|
|||||||
this.indexer = Helper.loadJsonLongArray(unicodeIndexerJsonPath);
|
this.indexer = Helper.loadJsonLongArray(unicodeIndexerJsonPath);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static String removeEmojis(String text) {
|
||||||
|
StringBuilder result = new StringBuilder();
|
||||||
|
for (int i = 0; i < text.length(); i++) {
|
||||||
|
int codePoint;
|
||||||
|
if (Character.isHighSurrogate(text.charAt(i)) && i + 1 < text.length() && Character.isLowSurrogate(text.charAt(i + 1))) {
|
||||||
|
codePoint = Character.codePointAt(text, i);
|
||||||
|
i++; // Skip the low surrogate
|
||||||
|
} else {
|
||||||
|
codePoint = text.charAt(i);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check if code point is in emoji ranges
|
||||||
|
boolean isEmoji = (codePoint >= 0x1F600 && codePoint <= 0x1F64F) ||
|
||||||
|
(codePoint >= 0x1F300 && codePoint <= 0x1F5FF) ||
|
||||||
|
(codePoint >= 0x1F680 && codePoint <= 0x1F6FF) ||
|
||||||
|
(codePoint >= 0x1F700 && codePoint <= 0x1F77F) ||
|
||||||
|
(codePoint >= 0x1F780 && codePoint <= 0x1F7FF) ||
|
||||||
|
(codePoint >= 0x1F800 && codePoint <= 0x1F8FF) ||
|
||||||
|
(codePoint >= 0x1F900 && codePoint <= 0x1F9FF) ||
|
||||||
|
(codePoint >= 0x1FA00 && codePoint <= 0x1FA6F) ||
|
||||||
|
(codePoint >= 0x1FA70 && codePoint <= 0x1FAFF) ||
|
||||||
|
(codePoint >= 0x2600 && codePoint <= 0x26FF) ||
|
||||||
|
(codePoint >= 0x2700 && codePoint <= 0x27BF) ||
|
||||||
|
(codePoint >= 0x1F1E6 && codePoint <= 0x1F1FF);
|
||||||
|
|
||||||
|
if (!isEmoji) {
|
||||||
|
if (codePoint > 0xFFFF) {
|
||||||
|
result.append(Character.toChars(codePoint));
|
||||||
|
} else {
|
||||||
|
result.append((char) codePoint);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return result.toString();
|
||||||
|
}
|
||||||
|
|
||||||
public TextProcessResult call(List<String> textList) {
|
public TextProcessResult call(List<String> textList) {
|
||||||
List<String> processedTexts = new ArrayList<>();
|
List<String> processedTexts = new ArrayList<>();
|
||||||
for (String text : textList) {
|
for (String text : textList) {
|
||||||
@@ -86,7 +122,86 @@ class UnicodeProcessor {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private String preprocessText(String text) {
|
private String preprocessText(String text) {
|
||||||
return Normalizer.normalize(text, Normalizer.Form.NFKD);
|
// TODO: Need advanced normalizer for better performance
|
||||||
|
text = Normalizer.normalize(text, Normalizer.Form.NFKD);
|
||||||
|
|
||||||
|
// FIXME: this should be fixed for non-English languages
|
||||||
|
|
||||||
|
// Remove emojis (wide Unicode range)
|
||||||
|
// Java Pattern doesn't support \x{...} syntax for Unicode above \uFFFF
|
||||||
|
// Use character filtering instead
|
||||||
|
text = removeEmojis(text);
|
||||||
|
|
||||||
|
// Replace various dashes and symbols
|
||||||
|
Map<String, String> replacements = new HashMap<>();
|
||||||
|
replacements.put("–", "-"); // en dash
|
||||||
|
replacements.put("‑", "-"); // non-breaking hyphen
|
||||||
|
replacements.put("—", "-"); // em dash
|
||||||
|
replacements.put("¯", " "); // macron
|
||||||
|
replacements.put("_", " "); // underscore
|
||||||
|
replacements.put("\u201C", "\""); // left double quote
|
||||||
|
replacements.put("\u201D", "\""); // right double quote
|
||||||
|
replacements.put("\u2018", "'"); // left single quote
|
||||||
|
replacements.put("\u2019", "'"); // right single quote
|
||||||
|
replacements.put("´", "'"); // acute accent
|
||||||
|
replacements.put("`", "'"); // grave accent
|
||||||
|
replacements.put("[", " "); // left bracket
|
||||||
|
replacements.put("]", " "); // right bracket
|
||||||
|
replacements.put("|", " "); // vertical bar
|
||||||
|
replacements.put("/", " "); // slash
|
||||||
|
replacements.put("#", " "); // hash
|
||||||
|
replacements.put("→", " "); // right arrow
|
||||||
|
replacements.put("←", " "); // left arrow
|
||||||
|
|
||||||
|
for (Map.Entry<String, String> entry : replacements.entrySet()) {
|
||||||
|
text = text.replace(entry.getKey(), entry.getValue());
|
||||||
|
}
|
||||||
|
|
||||||
|
// Remove combining diacritics // FIXME: this should be fixed for non-English languages
|
||||||
|
text = text.replaceAll("[\\u0302\\u0303\\u0304\\u0305\\u0306\\u0307\\u0308\\u030A\\u030B\\u030C\\u0327\\u0328\\u0329\\u032A\\u032B\\u032C\\u032D\\u032E\\u032F]", "");
|
||||||
|
|
||||||
|
// Remove special symbols
|
||||||
|
text = text.replaceAll("[♥☆♡©\\\\]", "");
|
||||||
|
|
||||||
|
// Replace known expressions
|
||||||
|
Map<String, String> exprReplacements = new HashMap<>();
|
||||||
|
exprReplacements.put("@", " at ");
|
||||||
|
exprReplacements.put("e.g.,", "for example, ");
|
||||||
|
exprReplacements.put("i.e.,", "that is, ");
|
||||||
|
|
||||||
|
for (Map.Entry<String, String> entry : exprReplacements.entrySet()) {
|
||||||
|
text = text.replace(entry.getKey(), entry.getValue());
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fix spacing around punctuation
|
||||||
|
text = text.replaceAll(" ,", ",");
|
||||||
|
text = text.replaceAll(" \\.", ".");
|
||||||
|
text = text.replaceAll(" !", "!");
|
||||||
|
text = text.replaceAll(" \\?", "?");
|
||||||
|
text = text.replaceAll(" ;", ";");
|
||||||
|
text = text.replaceAll(" :", ":");
|
||||||
|
text = text.replaceAll(" '", "'");
|
||||||
|
|
||||||
|
// Remove duplicate quotes
|
||||||
|
while (text.contains("\"\"")) {
|
||||||
|
text = text.replace("\"\"", "\"");
|
||||||
|
}
|
||||||
|
while (text.contains("''")) {
|
||||||
|
text = text.replace("''", "'");
|
||||||
|
}
|
||||||
|
while (text.contains("``")) {
|
||||||
|
text = text.replace("``", "`");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Remove extra spaces
|
||||||
|
text = text.replaceAll("\\s+", " ").trim();
|
||||||
|
|
||||||
|
// If text doesn't end with punctuation, quotes, or closing brackets, add a period
|
||||||
|
if (!text.matches(".*[.!?;:,'\"\\u201C\\u201D\\u2018\\u2019)\\]}…。」』】〉》›»]$")) {
|
||||||
|
text += ".";
|
||||||
|
}
|
||||||
|
|
||||||
|
return text;
|
||||||
}
|
}
|
||||||
|
|
||||||
private int[] textToUnicodeValues(String text) {
|
private int[] textToUnicodeValues(String text) {
|
||||||
|
|||||||
@@ -4,6 +4,8 @@ This guide provides examples for running TTS inference using `ExampleONNX.java`.
|
|||||||
|
|
||||||
## 📰 Update News
|
## 📰 Update News
|
||||||
|
|
||||||
|
**2025.11.23** - Enhanced text preprocessing with comprehensive normalization, emoji removal, symbol replacement, and punctuation handling for improved synthesis quality.
|
||||||
|
|
||||||
**2025.11.19** - Added `--speed` parameter to control speech synthesis speed (default: 1.05, recommended range: 0.9-1.5).
|
**2025.11.19** - Added `--speed` parameter to control speech synthesis speed (default: 1.05, recommended range: 0.9-1.5).
|
||||||
|
|
||||||
**2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
|
**2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
|
||||||
|
|||||||
@@ -4,6 +4,8 @@ Node.js implementation for TTS inference. Uses ONNX Runtime to generate speech f
|
|||||||
|
|
||||||
## 📰 Update News
|
## 📰 Update News
|
||||||
|
|
||||||
|
**2025.11.23** - Enhanced text preprocessing with comprehensive normalization, emoji removal, symbol replacement, and punctuation handling for improved synthesis quality.
|
||||||
|
|
||||||
**2025.11.19** - Added `--speed` parameter to control speech synthesis speed (default: 1.05, recommended range: 0.9-1.5).
|
**2025.11.19** - Added `--speed` parameter to control speech synthesis speed (default: 1.05, recommended range: 0.9-1.5).
|
||||||
|
|
||||||
**2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
|
**2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
|
||||||
|
|||||||
+79
-2
@@ -14,8 +14,85 @@ class UnicodeProcessor {
|
|||||||
}
|
}
|
||||||
|
|
||||||
_preprocessText(text) {
|
_preprocessText(text) {
|
||||||
// Simple NFKD normalization (JavaScript has normalize built-in)
|
// TODO: Need advanced normalizer for better performance
|
||||||
return text.normalize('NFKD');
|
text = text.normalize('NFKD');
|
||||||
|
|
||||||
|
// FIXME: this should be fixed for non-English languages
|
||||||
|
|
||||||
|
// Remove emojis (wide Unicode range)
|
||||||
|
const emojiPattern = /[\u{1F600}-\u{1F64F}\u{1F300}-\u{1F5FF}\u{1F680}-\u{1F6FF}\u{1F700}-\u{1F77F}\u{1F780}-\u{1F7FF}\u{1F800}-\u{1F8FF}\u{1F900}-\u{1F9FF}\u{1FA00}-\u{1FA6F}\u{1FA70}-\u{1FAFF}\u{2600}-\u{26FF}\u{2700}-\u{27BF}\u{1F1E6}-\u{1F1FF}]+/gu;
|
||||||
|
text = text.replace(emojiPattern, '');
|
||||||
|
|
||||||
|
// Replace various dashes and symbols
|
||||||
|
const replacements = {
|
||||||
|
'–': '-',
|
||||||
|
'‑': '-',
|
||||||
|
'—': '-',
|
||||||
|
'¯': ' ',
|
||||||
|
'_': ' ',
|
||||||
|
'"': '"',
|
||||||
|
'"': '"',
|
||||||
|
'\u2018': "'", // left single quote
|
||||||
|
'\u2019': "'", // right single quote
|
||||||
|
'´': "'",
|
||||||
|
'`': "'",
|
||||||
|
'[': ' ',
|
||||||
|
']': ' ',
|
||||||
|
'|': ' ',
|
||||||
|
'/': ' ',
|
||||||
|
'#': ' ',
|
||||||
|
'→': ' ',
|
||||||
|
'←': ' ',
|
||||||
|
};
|
||||||
|
for (const [k, v] of Object.entries(replacements)) {
|
||||||
|
text = text.replaceAll(k, v);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Remove combining diacritics // FIXME: this should be fixed for non-English languages
|
||||||
|
text = text.replace(/[\u0302\u0303\u0304\u0305\u0306\u0307\u0308\u030A\u030B\u030C\u0327\u0328\u0329\u032A\u032B\u032C\u032D\u032E\u032F]/g, '');
|
||||||
|
|
||||||
|
// Remove special symbols
|
||||||
|
text = text.replace(/[♥☆♡©\\]/g, '');
|
||||||
|
|
||||||
|
// Replace known expressions
|
||||||
|
const exprReplacements = {
|
||||||
|
'@': ' at ',
|
||||||
|
'e.g.,': 'for example, ',
|
||||||
|
'i.e.,': 'that is, ',
|
||||||
|
};
|
||||||
|
for (const [k, v] of Object.entries(exprReplacements)) {
|
||||||
|
text = text.replaceAll(k, v);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fix spacing around punctuation
|
||||||
|
text = text.replace(/ ,/g, ',');
|
||||||
|
text = text.replace(/ \./g, '.');
|
||||||
|
text = text.replace(/ !/g, '!');
|
||||||
|
text = text.replace(/ \?/g, '?');
|
||||||
|
text = text.replace(/ ;/g, ';');
|
||||||
|
text = text.replace(/ :/g, ':');
|
||||||
|
text = text.replace(/ '/g, "'");
|
||||||
|
|
||||||
|
// Remove duplicate quotes
|
||||||
|
while (text.includes('""')) {
|
||||||
|
text = text.replace('""', '"');
|
||||||
|
}
|
||||||
|
while (text.includes("''")) {
|
||||||
|
text = text.replace("''", "'");
|
||||||
|
}
|
||||||
|
while (text.includes('``')) {
|
||||||
|
text = text.replace('``', '`');
|
||||||
|
}
|
||||||
|
|
||||||
|
// Remove extra spaces
|
||||||
|
text = text.replace(/\s+/g, ' ').trim();
|
||||||
|
|
||||||
|
// If text doesn't end with punctuation, quotes, or closing brackets, add a period
|
||||||
|
if (!/[.!?;:,'\"')\]}…。」』】〉》›»]$/.test(text)) {
|
||||||
|
text += '.';
|
||||||
|
}
|
||||||
|
|
||||||
|
return text;
|
||||||
}
|
}
|
||||||
|
|
||||||
_textToUnicodeValues(text) {
|
_textToUnicodeValues(text) {
|
||||||
|
|||||||
@@ -4,6 +4,8 @@ This guide provides examples for running TTS inference using `example_onnx.py`.
|
|||||||
|
|
||||||
## 📰 Update News
|
## 📰 Update News
|
||||||
|
|
||||||
|
**2025.11.23** - Enhanced text preprocessing with comprehensive normalization, emoji removal, symbol replacement, and punctuation handling for improved synthesis quality.
|
||||||
|
|
||||||
**2025.11.19** - Added `--speed` parameter to control speech synthesis speed. Adjust the speed factor to make speech faster or slower while maintaining natural quality.
|
**2025.11.19** - Added `--speed` parameter to control speech synthesis speed. Adjust the speed factor to make speech faster or slower while maintaining natural quality.
|
||||||
|
|
||||||
**2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
|
**2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
|
||||||
|
|||||||
+91
-1
@@ -8,6 +8,8 @@ from unicodedata import normalize
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
import onnxruntime as ort
|
import onnxruntime as ort
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
|
||||||
class UnicodeProcessor:
|
class UnicodeProcessor:
|
||||||
def __init__(self, unicode_indexer_path: str):
|
def __init__(self, unicode_indexer_path: str):
|
||||||
@@ -15,8 +17,96 @@ class UnicodeProcessor:
|
|||||||
self.indexer = json.load(f)
|
self.indexer = json.load(f)
|
||||||
|
|
||||||
def _preprocess_text(self, text: str) -> str:
|
def _preprocess_text(self, text: str) -> str:
|
||||||
# TODO: add more preprocessing
|
# TODO: Need advanced normalizer for better performance
|
||||||
text = normalize("NFKD", text)
|
text = normalize("NFKD", text)
|
||||||
|
|
||||||
|
# FIXME: this should be fixed for non-English languages
|
||||||
|
|
||||||
|
# Remove emojis (wide Unicode range)
|
||||||
|
emoji_pattern = re.compile(
|
||||||
|
"[\U0001f600-\U0001f64f" # emoticons
|
||||||
|
"\U0001f300-\U0001f5ff" # symbols & pictographs
|
||||||
|
"\U0001f680-\U0001f6ff" # transport & map symbols
|
||||||
|
"\U0001f700-\U0001f77f"
|
||||||
|
"\U0001f780-\U0001f7ff"
|
||||||
|
"\U0001f800-\U0001f8ff"
|
||||||
|
"\U0001f900-\U0001f9ff"
|
||||||
|
"\U0001fa00-\U0001fa6f"
|
||||||
|
"\U0001fa70-\U0001faff"
|
||||||
|
"\u2600-\u26ff"
|
||||||
|
"\u2700-\u27bf"
|
||||||
|
"\U0001f1e6-\U0001f1ff]+",
|
||||||
|
flags=re.UNICODE,
|
||||||
|
)
|
||||||
|
text = emoji_pattern.sub("", text)
|
||||||
|
|
||||||
|
# Replace various dashes and symbols
|
||||||
|
replacements = {
|
||||||
|
"–": "-",
|
||||||
|
"‑": "-",
|
||||||
|
"—": "-",
|
||||||
|
"¯": " ",
|
||||||
|
"_": " ",
|
||||||
|
"“": '"',
|
||||||
|
"”": '"',
|
||||||
|
"‘": "'",
|
||||||
|
"’": "'",
|
||||||
|
"´": "'",
|
||||||
|
"`": "'",
|
||||||
|
"[": " ",
|
||||||
|
"]": " ",
|
||||||
|
"|": " ",
|
||||||
|
"/": " ",
|
||||||
|
"#": " ",
|
||||||
|
"→": " ",
|
||||||
|
"←": " ",
|
||||||
|
}
|
||||||
|
for k, v in replacements.items():
|
||||||
|
text = text.replace(k, v)
|
||||||
|
|
||||||
|
# Remove combining diacritics # FIXME: this should be fixed for non-English languages
|
||||||
|
text = re.sub(
|
||||||
|
r"[\u0302\u0303\u0304\u0305\u0306\u0307\u0308\u030A\u030B\u030C\u0327\u0328\u0329\u032A\u032B\u032C\u032D\u032E\u032F]",
|
||||||
|
"",
|
||||||
|
text,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Remove special symbols
|
||||||
|
text = re.sub(r"[♥☆♡©\\]", "", text)
|
||||||
|
|
||||||
|
# Replace known expressions
|
||||||
|
expr_replacements = {
|
||||||
|
"@": " at ",
|
||||||
|
"e.g.,": "for example, ",
|
||||||
|
"i.e.,": "that is, ",
|
||||||
|
}
|
||||||
|
for k, v in expr_replacements.items():
|
||||||
|
text = text.replace(k, v)
|
||||||
|
|
||||||
|
# Fix spacing around punctuation
|
||||||
|
text = re.sub(r" ,", ",", text)
|
||||||
|
text = re.sub(r" \.", ".", text)
|
||||||
|
text = re.sub(r" !", "!", text)
|
||||||
|
text = re.sub(r" \?", "?", text)
|
||||||
|
text = re.sub(r" ;", ";", text)
|
||||||
|
text = re.sub(r" :", ":", text)
|
||||||
|
text = re.sub(r" '", "'", text)
|
||||||
|
|
||||||
|
# Remove duplicate quotes
|
||||||
|
while '""' in text:
|
||||||
|
text = text.replace('""', '"')
|
||||||
|
while "''" in text:
|
||||||
|
text = text.replace("''", "'")
|
||||||
|
while "``" in text:
|
||||||
|
text = text.replace("``", "`")
|
||||||
|
|
||||||
|
# Remove extra spaces
|
||||||
|
text = re.sub(r"\s+", " ", text).strip()
|
||||||
|
|
||||||
|
# If text doesn't end with punctuation, quotes, or closing brackets, add a period
|
||||||
|
if not re.search(r"[.!?;:,'\"')\]}…。」』】〉》›»]$", text):
|
||||||
|
text += "."
|
||||||
|
|
||||||
return text
|
return text
|
||||||
|
|
||||||
def _get_text_mask(self, text_ids_lengths: np.ndarray) -> np.ndarray:
|
def _get_text_mask(self, text_ids_lengths: np.ndarray) -> np.ndarray:
|
||||||
|
|||||||
@@ -4,6 +4,8 @@ This guide provides examples for running TTS inference using Rust.
|
|||||||
|
|
||||||
## 📰 Update News
|
## 📰 Update News
|
||||||
|
|
||||||
|
**2025.11.23** - Enhanced text preprocessing with comprehensive normalization, emoji removal, symbol replacement, and punctuation handling for improved synthesis quality.
|
||||||
|
|
||||||
**2025.11.19** - Added `--speed` parameter to control speech synthesis speed (default: 1.05, recommended range: 0.9-1.5).
|
**2025.11.19** - Added `--speed` parameter to control speech synthesis speed (default: 1.05, recommended range: 0.9-1.5).
|
||||||
|
|
||||||
**2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
|
**2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
|
||||||
|
|||||||
+89
-1
@@ -113,7 +113,95 @@ impl UnicodeProcessor {
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub fn preprocess_text(text: &str) -> String {
|
pub fn preprocess_text(text: &str) -> String {
|
||||||
text.nfkd().collect()
|
// TODO: Need advanced normalizer for better performance
|
||||||
|
let mut text: String = text.nfkd().collect();
|
||||||
|
|
||||||
|
// FIXME: this should be fixed for non-English languages
|
||||||
|
|
||||||
|
// Remove emojis (wide Unicode range)
|
||||||
|
let emoji_pattern = Regex::new(r"[\x{1F600}-\x{1F64F}\x{1F300}-\x{1F5FF}\x{1F680}-\x{1F6FF}\x{1F700}-\x{1F77F}\x{1F780}-\x{1F7FF}\x{1F800}-\x{1F8FF}\x{1F900}-\x{1F9FF}\x{1FA00}-\x{1FA6F}\x{1FA70}-\x{1FAFF}\x{2600}-\x{26FF}\x{2700}-\x{27BF}\x{1F1E6}-\x{1F1FF}]+").unwrap();
|
||||||
|
text = emoji_pattern.replace_all(&text, "").to_string();
|
||||||
|
|
||||||
|
// Replace various dashes and symbols
|
||||||
|
let replacements = [
|
||||||
|
("–", "-"), // en dash
|
||||||
|
("‑", "-"), // non-breaking hyphen
|
||||||
|
("—", "-"), // em dash
|
||||||
|
("¯", " "), // macron
|
||||||
|
("_", " "), // underscore
|
||||||
|
("\u{201C}", "\""), // left double quote
|
||||||
|
("\u{201D}", "\""), // right double quote
|
||||||
|
("\u{2018}", "'"), // left single quote
|
||||||
|
("\u{2019}", "'"), // right single quote
|
||||||
|
("´", "'"), // acute accent
|
||||||
|
("`", "'"), // grave accent
|
||||||
|
("[", " "), // left bracket
|
||||||
|
("]", " "), // right bracket
|
||||||
|
("|", " "), // vertical bar
|
||||||
|
("/", " "), // slash
|
||||||
|
("#", " "), // hash
|
||||||
|
("→", " "), // right arrow
|
||||||
|
("←", " "), // left arrow
|
||||||
|
];
|
||||||
|
|
||||||
|
for (from, to) in &replacements {
|
||||||
|
text = text.replace(from, to);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Remove combining diacritics // FIXME: this should be fixed for non-English languages
|
||||||
|
let diacritics_pattern = Regex::new(r"[\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{030A}\u{030B}\u{030C}\u{0327}\u{0328}\u{0329}\u{032A}\u{032B}\u{032C}\u{032D}\u{032E}\u{032F}]").unwrap();
|
||||||
|
text = diacritics_pattern.replace_all(&text, "").to_string();
|
||||||
|
|
||||||
|
// Remove special symbols
|
||||||
|
let special_symbols = ["♥", "☆", "♡", "©", "\\"];
|
||||||
|
for symbol in &special_symbols {
|
||||||
|
text = text.replace(symbol, "");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Replace known expressions
|
||||||
|
let expr_replacements = [
|
||||||
|
("@", " at "),
|
||||||
|
("e.g.,", "for example, "),
|
||||||
|
("i.e.,", "that is, "),
|
||||||
|
];
|
||||||
|
|
||||||
|
for (from, to) in &expr_replacements {
|
||||||
|
text = text.replace(from, to);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fix spacing around punctuation
|
||||||
|
text = Regex::new(r" ,").unwrap().replace_all(&text, ",").to_string();
|
||||||
|
text = Regex::new(r" \.").unwrap().replace_all(&text, ".").to_string();
|
||||||
|
text = Regex::new(r" !").unwrap().replace_all(&text, "!").to_string();
|
||||||
|
text = Regex::new(r" \?").unwrap().replace_all(&text, "?").to_string();
|
||||||
|
text = Regex::new(r" ;").unwrap().replace_all(&text, ";").to_string();
|
||||||
|
text = Regex::new(r" :").unwrap().replace_all(&text, ":").to_string();
|
||||||
|
text = Regex::new(r" '").unwrap().replace_all(&text, "'").to_string();
|
||||||
|
|
||||||
|
// Remove duplicate quotes
|
||||||
|
while text.contains("\"\"") {
|
||||||
|
text = text.replace("\"\"", "\"");
|
||||||
|
}
|
||||||
|
while text.contains("''") {
|
||||||
|
text = text.replace("''", "'");
|
||||||
|
}
|
||||||
|
while text.contains("``") {
|
||||||
|
text = text.replace("``", "`");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Remove extra spaces
|
||||||
|
text = Regex::new(r"\s+").unwrap().replace_all(&text, " ").to_string();
|
||||||
|
text = text.trim().to_string();
|
||||||
|
|
||||||
|
// If text doesn't end with punctuation, quotes, or closing brackets, add a period
|
||||||
|
if !text.is_empty() {
|
||||||
|
let ends_with_punct = Regex::new(r#"[.!?;:,'"\u{201C}\u{201D}\u{2018}\u{2019})\]}…。」』】〉》›»]$"#).unwrap();
|
||||||
|
if !ends_with_punct.is_match(&text) {
|
||||||
|
text.push('.');
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
text
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn text_to_unicode_values(text: &str) -> Vec<usize> {
|
pub fn text_to_unicode_values(text: &str) -> Vec<usize> {
|
||||||
|
|||||||
@@ -4,6 +4,8 @@ This guide provides examples for running TTS inference using `example_onnx`.
|
|||||||
|
|
||||||
## 📰 Update News
|
## 📰 Update News
|
||||||
|
|
||||||
|
**2025.11.23** - Enhanced text preprocessing with comprehensive normalization, emoji removal, symbol replacement, and punctuation handling for improved synthesis quality.
|
||||||
|
|
||||||
**2025.11.19** - Added `--speed` parameter to control speech synthesis speed (default: 1.05, recommended range: 0.9-1.5).
|
**2025.11.19** - Added `--speed` parameter to control speech synthesis speed (default: 1.05, recommended range: 0.9-1.5).
|
||||||
|
|
||||||
**2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
|
**2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
|
||||||
|
|||||||
+108
-1
@@ -72,7 +72,114 @@ class UnicodeProcessor {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Normalize raw input text before it is converted to Unicode values.
///
/// Steps, in order: NFKD normalization, emoji removal, dash/quote/symbol
/// replacement, removal of selected combining diacritics, removal of a few
/// decorative symbols, expansion of "@", "e.g.," and "i.e.,", removal of
/// whitespace in front of punctuation, collapsing of repeated quotes and
/// whitespace, and finally appending a period when the (non-empty) text does
/// not already end with punctuation, a quote, or a closing bracket.
///
/// - Parameter text: Raw input text.
/// - Returns: The normalized text.
func preprocessText(_ text: String) -> String {
    // TODO: Need advanced normalizer for better performance
    // NFKD (decomposed) rather than NFKC (precomposed): the diacritic
    // removal below only matches standalone combining marks, so accented
    // letters must be decomposed first for it to have any effect.
    var text = text.decomposedStringWithCompatibilityMapping

    // FIXME: this should be fixed for non-English languages

    // Remove emojis (wide Unicode range)
    // Swift NSRegularExpression doesn't support Unicode escapes above \uFFFF
    // Use character filtering instead
    let emojiRanges: [ClosedRange<UInt32>] = [
        0x1F600...0x1F64F,
        0x1F300...0x1F5FF,
        0x1F680...0x1F6FF,
        0x1F700...0x1F77F,
        0x1F780...0x1F7FF,
        0x1F800...0x1F8FF,
        0x1F900...0x1F9FF,
        0x1FA00...0x1FA6F,
        0x1FA70...0x1FAFF,
        0x2600...0x26FF,
        0x2700...0x27BF,
        0x1F1E6...0x1F1FF,
    ]
    text = text.unicodeScalars.filter { scalar in
        !emojiRanges.contains { $0.contains(scalar.value) }
    }.map { String($0) }.joined()

    // Replace various dashes and symbols (ordered list, so the replacement
    // order is the same on every run)
    let replacements: [(String, String)] = [
        ("–", "-"),         // en dash
        ("‑", "-"),         // non-breaking hyphen
        ("—", "-"),         // em dash
        ("¯", " "),         // macron
        ("_", " "),         // underscore
        ("\u{201C}", "\""), // left double quote
        ("\u{201D}", "\""), // right double quote
        ("\u{2018}", "'"),  // left single quote
        ("\u{2019}", "'"),  // right single quote
        ("´", "'"),         // acute accent
        ("`", "'"),         // grave accent
        ("[", " "),         // left bracket
        ("]", " "),         // right bracket
        ("|", " "),         // vertical bar
        ("/", " "),         // slash
        ("#", " "),         // hash
        ("→", " "),         // right arrow
        ("←", " "),         // left arrow
    ]
    for (old, new) in replacements {
        text = text.replacingOccurrences(of: old, with: new)
    }

    // Remove combining diacritics // FIXME: this should be fixed for non-English languages
    let diacriticsPattern = try! NSRegularExpression(pattern: "[\\u0302\\u0303\\u0304\\u0305\\u0306\\u0307\\u0308\\u030A\\u030B\\u030C\\u0327\\u0328\\u0329\\u032A\\u032B\\u032C\\u032D\\u032E\\u032F]")
    let diacriticsRange = NSRange(text.startIndex..., in: text)
    text = diacriticsPattern.stringByReplacingMatches(in: text, range: diacriticsRange, withTemplate: "")

    // Remove special symbols
    for symbol in ["♥", "☆", "♡", "©", "\\"] {
        text = text.replacingOccurrences(of: symbol, with: "")
    }

    // Replace known expressions
    let exprReplacements: [(String, String)] = [
        ("@", " at "),
        ("e.g.,", "for example, "),
        ("i.e.,", "that is, "),
    ]
    for (old, new) in exprReplacements {
        text = text.replacingOccurrences(of: old, with: new)
    }

    // Fix spacing around punctuation. The whole whitespace run is removed,
    // not just one space: the symbol replacements above can leave several
    // spaces in a row (e.g. "foo #." -> "foo  ."), and stripping a single
    // space would leave "foo ." after whitespace is collapsed below.
    text = text.replacingOccurrences(of: "\\s+([,.!?;:'])", with: "$1", options: .regularExpression)

    // Remove duplicate quotes. No backtick case is needed: every backtick
    // was already mapped to "'" by the replacement table above.
    while text.contains("\"\"") {
        text = text.replacingOccurrences(of: "\"\"", with: "\"")
    }
    while text.contains("''") {
        text = text.replacingOccurrences(of: "''", with: "'")
    }

    // Remove extra spaces
    let whitespacePattern = try! NSRegularExpression(pattern: "\\s+")
    let whitespaceRange = NSRange(text.startIndex..., in: text)
    text = whitespacePattern.stringByReplacingMatches(in: text, range: whitespaceRange, withTemplate: " ")
    text = text.trimmingCharacters(in: .whitespacesAndNewlines)

    // If text doesn't end with punctuation, quotes, or closing brackets, add a period
    if !text.isEmpty {
        let punctPattern = try! NSRegularExpression(pattern: "[.!?;:,'\"\\u201C\\u201D\\u2018\\u2019)\\]}…。」』】〉》›»]$")
        let punctRange = NSRange(text.startIndex..., in: text)
        if punctPattern.firstMatch(in: text, range: punctRange) == nil {
            text += "."
        }
    }

    return text
}
|
||||||
|
|
||||||
func lengthToMask(_ lengths: [Int], maxLen: Int? = nil) -> [[[Float]]] {
|
func lengthToMask(_ lengths: [Int], maxLen: Int? = nil) -> [[[Float]]] {
|
||||||
|
|||||||
@@ -4,6 +4,8 @@ This example demonstrates how to use Supertonic in a web browser using ONNX Runt
|
|||||||
|
|
||||||
## 📰 Update News
|
## 📰 Update News
|
||||||
|
|
||||||
|
**2025.11.23** - Enhanced text preprocessing with comprehensive normalization, emoji removal, symbol replacement, and punctuation handling for improved synthesis quality.
|
||||||
|
|
||||||
**2025.11.19** - Added speed control slider to adjust speech synthesis speed (default: 1.05, recommended range: 0.9-1.5).
|
**2025.11.19** - Added speed control slider to adjust speech synthesis speed (default: 1.05, recommended range: 0.9-1.5).
|
||||||
|
|
||||||
**2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
|
**2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
|
||||||
|
|||||||
+79
-1
@@ -28,7 +28,85 @@ export class UnicodeProcessor {
|
|||||||
}
|
}
|
||||||
|
|
||||||
preprocessText(text) {
|
preprocessText(text) {
|
||||||
return text.normalize('NFKC');
|
// TODO: Need advanced normalizer for better performance
|
||||||
|
text = text.normalize('NFKD');
|
||||||
|
|
||||||
|
// FIXME: this should be fixed for non-English languages
|
||||||
|
|
||||||
|
// Remove emojis (wide Unicode range)
|
||||||
|
const emojiPattern = /[\u{1F600}-\u{1F64F}\u{1F300}-\u{1F5FF}\u{1F680}-\u{1F6FF}\u{1F700}-\u{1F77F}\u{1F780}-\u{1F7FF}\u{1F800}-\u{1F8FF}\u{1F900}-\u{1F9FF}\u{1FA00}-\u{1FA6F}\u{1FA70}-\u{1FAFF}\u{2600}-\u{26FF}\u{2700}-\u{27BF}\u{1F1E6}-\u{1F1FF}]+/gu;
|
||||||
|
text = text.replace(emojiPattern, '');
|
||||||
|
|
||||||
|
// Replace various dashes and symbols
|
||||||
|
const replacements = {
|
||||||
|
'–': '-',
|
||||||
|
'‑': '-',
|
||||||
|
'—': '-',
|
||||||
|
'¯': ' ',
|
||||||
|
'_': ' ',
|
||||||
|
'"': '"',
|
||||||
|
'"': '"',
|
||||||
|
'\u2018': "'", // left single quote
|
||||||
|
'\u2019': "'", // right single quote
|
||||||
|
'´': "'",
|
||||||
|
'`': "'",
|
||||||
|
'[': ' ',
|
||||||
|
']': ' ',
|
||||||
|
'|': ' ',
|
||||||
|
'/': ' ',
|
||||||
|
'#': ' ',
|
||||||
|
'→': ' ',
|
||||||
|
'←': ' ',
|
||||||
|
};
|
||||||
|
for (const [k, v] of Object.entries(replacements)) {
|
||||||
|
text = text.replaceAll(k, v);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Remove combining diacritics // FIXME: this should be fixed for non-English languages
|
||||||
|
text = text.replace(/[\u0302\u0303\u0304\u0305\u0306\u0307\u0308\u030A\u030B\u030C\u0327\u0328\u0329\u032A\u032B\u032C\u032D\u032E\u032F]/g, '');
|
||||||
|
|
||||||
|
// Remove special symbols
|
||||||
|
text = text.replace(/[♥☆♡©\\]/g, '');
|
||||||
|
|
||||||
|
// Replace known expressions
|
||||||
|
const exprReplacements = {
|
||||||
|
'@': ' at ',
|
||||||
|
'e.g.,': 'for example, ',
|
||||||
|
'i.e.,': 'that is, ',
|
||||||
|
};
|
||||||
|
for (const [k, v] of Object.entries(exprReplacements)) {
|
||||||
|
text = text.replaceAll(k, v);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fix spacing around punctuation
|
||||||
|
text = text.replace(/ ,/g, ',');
|
||||||
|
text = text.replace(/ \./g, '.');
|
||||||
|
text = text.replace(/ !/g, '!');
|
||||||
|
text = text.replace(/ \?/g, '?');
|
||||||
|
text = text.replace(/ ;/g, ';');
|
||||||
|
text = text.replace(/ :/g, ':');
|
||||||
|
text = text.replace(/ '/g, "'");
|
||||||
|
|
||||||
|
// Remove duplicate quotes
|
||||||
|
while (text.includes('""')) {
|
||||||
|
text = text.replace('""', '"');
|
||||||
|
}
|
||||||
|
while (text.includes("''")) {
|
||||||
|
text = text.replace("''", "'");
|
||||||
|
}
|
||||||
|
while (text.includes('``')) {
|
||||||
|
text = text.replace('``', '`');
|
||||||
|
}
|
||||||
|
|
||||||
|
// Remove extra spaces
|
||||||
|
text = text.replace(/\s+/g, ' ').trim();
|
||||||
|
|
||||||
|
// If text doesn't end with punctuation, quotes, or closing brackets, add a period
|
||||||
|
if (!/[.!?;:,'\"')\]}…。」』】〉》›»]$/.test(text)) {
|
||||||
|
text += '.';
|
||||||
|
}
|
||||||
|
|
||||||
|
return text;
|
||||||
}
|
}
|
||||||
|
|
||||||
getTextMask(textIdsLengths) {
|
getTextMask(textIdsLengths) {
|
||||||
|
|||||||
Reference in New Issue
Block a user