diff --git a/cpp/README.md b/cpp/README.md index 951a997..47231f4 100644 --- a/cpp/README.md +++ b/cpp/README.md @@ -4,6 +4,8 @@ High-performance text-to-speech inference using ONNX Runtime. ## ๐Ÿ“ฐ Update News +**2025.11.23** - Enhanced text preprocessing with comprehensive normalization, emoji removal, symbol replacement, and punctuation handling for improved synthesis quality. + **2025.11.19** - Added `--speed` parameter to control speech synthesis speed (default: 1.05, recommended range: 0.9-1.5). **2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses. diff --git a/cpp/helper.cpp b/cpp/helper.cpp index 208f11c..39c58d1 100644 --- a/cpp/helper.cpp +++ b/cpp/helper.cpp @@ -19,6 +19,24 @@ void clearTensorBuffers() { g_tensor_buffers_int64.clear(); } +// ============================================================================ +// Helper function - trim +// ============================================================================ + +static std::string trim(const std::string& str) { + size_t start = 0; + while (start < str.size() && std::isspace(static_cast(str[start]))) { + start++; + } + + size_t end = str.size(); + while (end > start && std::isspace(static_cast(str[end - 1]))) { + end--; + } + + return str.substr(start, end - start); +} + // ============================================================================ // UnicodeProcessor implementation // ============================================================================ @@ -28,10 +46,148 @@ UnicodeProcessor::UnicodeProcessor(const std::string& unicode_indexer_json_path) } std::string UnicodeProcessor::preprocessText(const std::string& text) { - // Simple NFKD normalization (C++ doesn't have built-in Unicode normalization) - // For now, just return the text as-is - // TODO: add proper Unicode normalization - return text; + // TODO: Need advanced normalizer for better performance + // NOTE: C++ doesn't have 
built-in Unicode normalization like Python's NFKD + // For full Unicode normalization, consider using ICU library + // This implementation handles basic text preprocessing + + std::string result = text; + + // FIXME: this should be fixed for non-English languages + + // Remove emojis and various Unicode symbols + // Using regex to remove common emoji ranges and special symbols + // Note: This is a simplified version - full emoji support needs UTF-8 handling + std::regex emoji_pattern( + "[\xF0\x9F][\x80-\xBF]{2}|" // Common emoji pattern in UTF-8 + "[\xE2][\x80-\xBF]{2}|" // Various symbols + "[\xE2][\x98-\x9E][\x80-\xBF]" // More symbols + ); + result = std::regex_replace(result, emoji_pattern, ""); + + // Replace various dashes and symbols + struct Replacement { + const char* from; + const char* to; + }; + + const Replacement replacements[] = { + {"โ€“", "-"}, // en dash + {"โ€‘", "-"}, // non-breaking hyphen + {"โ€”", "-"}, // em dash + {"ยฏ", " "}, // macron + {"_", " "}, // underscore + {""", "\""}, // left double quote (U+201C) + {""", "\""}, // right double quote (U+201D) + {"'", "'"}, // left single quote (U+2018) + {"'", "'"}, // right single quote (U+2019) + {"ยด", "'"}, // acute accent + {"`", "'"}, // grave accent + {"[", " "}, // left bracket + {"]", " "}, // right bracket + {"|", " "}, // vertical bar + {"/", " "}, // slash + {"#", " "}, // hash + {"โ†’", " "}, // right arrow + {"โ†", " "}, // left arrow + }; + + for (const auto& repl : replacements) { + size_t pos = 0; + while ((pos = result.find(repl.from, pos)) != std::string::npos) { + result.replace(pos, strlen(repl.from), repl.to); + pos += strlen(repl.to); + } + } + + // Remove combining diacritics (common combining marks in UTF-8) + // FIXME: this should be fixed for non-English languages + std::regex diacritics_pattern( + "[\xCC\xCD][\x80-\xBF]" // Combining diacritical marks range + ); + result = std::regex_replace(result, diacritics_pattern, ""); + + // Remove special symbols + const char* 
special_symbols[] = {"โ™ฅ", "โ˜†", "โ™ก", "ยฉ", "\\"}; + for (const char* symbol : special_symbols) { + size_t pos = 0; + while ((pos = result.find(symbol, pos)) != std::string::npos) { + result.erase(pos, strlen(symbol)); + } + } + + // Replace known expressions + const Replacement expr_replacements[] = { + {"@", " at "}, + {"e.g.,", "for example, "}, + {"i.e.,", "that is, "}, + }; + + for (const auto& repl : expr_replacements) { + size_t pos = 0; + while ((pos = result.find(repl.from, pos)) != std::string::npos) { + result.replace(pos, strlen(repl.from), repl.to); + pos += strlen(repl.to); + } + } + + // Fix spacing around punctuation + result = std::regex_replace(result, std::regex(" ,"), ","); + result = std::regex_replace(result, std::regex(" \\."), "."); + result = std::regex_replace(result, std::regex(" !"), "!"); + result = std::regex_replace(result, std::regex(" \\?"), "?"); + result = std::regex_replace(result, std::regex(" ;"), ";"); + result = std::regex_replace(result, std::regex(" :"), ":"); + result = std::regex_replace(result, std::regex(" '"), "'"); + + // Remove duplicate quotes + while (result.find("\"\"") != std::string::npos) { + size_t pos = result.find("\"\""); + result.replace(pos, 2, "\""); + } + while (result.find("''") != std::string::npos) { + size_t pos = result.find("''"); + result.replace(pos, 2, "'"); + } + while (result.find("``") != std::string::npos) { + size_t pos = result.find("``"); + result.replace(pos, 2, "`"); + } + + // Remove extra spaces + result = std::regex_replace(result, std::regex("\\s+"), " "); + result = trim(result); + + // If text doesn't end with punctuation, quotes, or closing brackets, add a period + if (!result.empty()) { + char last_char = result.back(); + bool ends_with_punct = ( + last_char == '.' || last_char == '!' || last_char == '?' 
|| + last_char == ';' || last_char == ':' || last_char == ',' || + last_char == '\'' || last_char == '"' || last_char == ')' || + last_char == ']' || last_char == '}' || last_char == '>' + ); + + // Check for UTF-8 multibyte ending punctuation (e.g., โ€ฆ, ใ€‚, curly quotes, etc.) + if (!ends_with_punct && result.size() >= 3) { + std::string last_three = result.substr(result.size() - 3); + if (last_three == "โ€ฆ" || last_three == "ใ€‚" || + last_three == "ใ€" || last_three == "ใ€" || + last_three == "ใ€‘" || last_three == "ใ€‰" || + last_three == "ใ€‹" || last_three == "โ€บ" || + last_three == "ยป" || last_three == """ || + last_three == """ || last_three == "'" || + last_three == "'") { + ends_with_punct = true; + } + } + + if (!ends_with_punct) { + result += "."; + } + } + + return result; } std::vector UnicodeProcessor::textToUnicodeValues(const std::string& text) { @@ -772,20 +928,6 @@ std::string sanitizeFilename(const std::string& text, int max_len) { // Chunk text // ============================================================================ -static std::string trim(const std::string& str) { - size_t start = 0; - while (start < str.size() && std::isspace(static_cast(str[start]))) { - start++; - } - - size_t end = str.size(); - while (end > start && std::isspace(static_cast(str[end - 1]))) { - end--; - } - - return str.substr(start, end - start); -} - std::vector chunkText(const std::string& text, int max_len) { std::vector chunks; diff --git a/csharp/Helper.cs b/csharp/Helper.cs index d90afa8..e555630 100644 --- a/csharp/Helper.cs +++ b/csharp/Helper.cs @@ -71,10 +71,144 @@ namespace Supertonic } } + private static string RemoveEmojis(string text) + { + var result = new StringBuilder(); + for (int i = 0; i < text.Length; i++) + { + int codePoint; + if (char.IsHighSurrogate(text[i]) && i + 1 < text.Length && char.IsLowSurrogate(text[i + 1])) + { + // Get the full code point from surrogate pair + codePoint = char.ConvertToUtf32(text[i], text[i + 1]); + i++; 
// Skip the low surrogate + } + else + { + codePoint = text[i]; + } + + // Check if code point is in emoji ranges + bool isEmoji = (codePoint >= 0x1F600 && codePoint <= 0x1F64F) || + (codePoint >= 0x1F300 && codePoint <= 0x1F5FF) || + (codePoint >= 0x1F680 && codePoint <= 0x1F6FF) || + (codePoint >= 0x1F700 && codePoint <= 0x1F77F) || + (codePoint >= 0x1F780 && codePoint <= 0x1F7FF) || + (codePoint >= 0x1F800 && codePoint <= 0x1F8FF) || + (codePoint >= 0x1F900 && codePoint <= 0x1F9FF) || + (codePoint >= 0x1FA00 && codePoint <= 0x1FA6F) || + (codePoint >= 0x1FA70 && codePoint <= 0x1FAFF) || + (codePoint >= 0x2600 && codePoint <= 0x26FF) || + (codePoint >= 0x2700 && codePoint <= 0x27BF) || + (codePoint >= 0x1F1E6 && codePoint <= 0x1F1FF); + + if (!isEmoji) + { + if (codePoint > 0xFFFF) + { + // Add back as surrogate pair + result.Append(char.ConvertFromUtf32(codePoint)); + } + else + { + result.Append((char)codePoint); + } + } + } + return result.ToString(); + } + private string PreprocessText(string text) { - // Simple normalization (C# has Normalize built-in) - return text.Normalize(NormalizationForm.FormKD); + // TODO: Need advanced normalizer for better performance + text = text.Normalize(NormalizationForm.FormKD); + + // FIXME: this should be fixed for non-English languages + + // Remove emojis (wide Unicode range) + // C# doesn't support \u{...} syntax in regex, so we use character filtering instead + text = RemoveEmojis(text); + + // Replace various dashes and symbols + var replacements = new Dictionary + { + {"โ€“", "-"}, // en dash + {"โ€‘", "-"}, // non-breaking hyphen + {"โ€”", "-"}, // em dash + {"ยฏ", " "}, // macron + {"_", " "}, // underscore + {"\u201C", "\""}, // left double quote + {"\u201D", "\""}, // right double quote + {"\u2018", "'"}, // left single quote + {"\u2019", "'"}, // right single quote + {"ยด", "'"}, // acute accent + {"`", "'"}, // grave accent + {"[", " "}, // left bracket + {"]", " "}, // right bracket + {"|", " "}, // vertical bar 
+ {"/", " "}, // slash + {"#", " "}, // hash + {"โ†’", " "}, // right arrow + {"โ†", " "}, // left arrow + }; + + foreach (var kvp in replacements) + { + text = text.Replace(kvp.Key, kvp.Value); + } + + // Remove combining diacritics // FIXME: this should be fixed for non-English languages + text = Regex.Replace(text, @"[\u0302\u0303\u0304\u0305\u0306\u0307\u0308\u030A\u030B\u030C\u0327\u0328\u0329\u032A\u032B\u032C\u032D\u032E\u032F]", ""); + + // Remove special symbols + text = Regex.Replace(text, @"[โ™ฅโ˜†โ™กยฉ\\]", ""); + + // Replace known expressions + var exprReplacements = new Dictionary + { + {"@", " at "}, + {"e.g.,", "for example, "}, + {"i.e.,", "that is, "}, + }; + + foreach (var kvp in exprReplacements) + { + text = text.Replace(kvp.Key, kvp.Value); + } + + // Fix spacing around punctuation + text = Regex.Replace(text, @" ,", ","); + text = Regex.Replace(text, @" \.", "."); + text = Regex.Replace(text, @" !", "!"); + text = Regex.Replace(text, @" \?", "?"); + text = Regex.Replace(text, @" ;", ";"); + text = Regex.Replace(text, @" :", ":"); + text = Regex.Replace(text, @" '", "'"); + + // Remove duplicate quotes + while (text.Contains("\"\"")) + { + text = text.Replace("\"\"", "\""); + } + while (text.Contains("''")) + { + text = text.Replace("''", "'"); + } + while (text.Contains("``")) + { + text = text.Replace("``", "`"); + } + + // Remove extra spaces + text = Regex.Replace(text, @"\s+", " ").Trim(); + + // If text doesn't end with punctuation, quotes, or closing brackets, add a period + if (!Regex.IsMatch(text, @"[.!?;:,'\u0022\u201C\u201D\u2018\u2019)\]}โ€ฆใ€‚ใ€ใ€ใ€‘ใ€‰ใ€‹โ€บยป]$")) + { + text += "."; + } + + return text; } private int[] TextToUnicodeValues(string text) diff --git a/csharp/README.md b/csharp/README.md index 4645fc8..21aeded 100644 --- a/csharp/README.md +++ b/csharp/README.md @@ -4,6 +4,8 @@ This guide provides examples for running TTS inference using `ExampleONNX.cs`. 
## 📰 Update News +**2025.11.23** - Enhanced text preprocessing with comprehensive normalization, emoji removal, symbol replacement, and punctuation handling for improved synthesis quality. + **2025.11.19** - Added `--speed` parameter to control speech synthesis speed (default: 1.05, recommended range: 0.9-1.5). **2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses. diff --git a/go/README.md b/go/README.md index 062a198..48d6d4d 100644 --- a/go/README.md +++ b/go/README.md @@ -4,6 +4,8 @@ This guide provides examples for running TTS inference using `example_onnx.go`. ## 📰 Update News +**2025.11.23** - Enhanced text preprocessing with comprehensive normalization, emoji removal, symbol replacement, and punctuation handling for improved synthesis quality. + **2025.11.19** - Added `--speed` parameter to control speech synthesis speed (default: 1.05, recommended range: 0.9-1.5). **2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses. 
diff --git a/go/helper.go b/go/helper.go index 5441768..a116060 100644 --- a/go/helper.go +++ b/go/helper.go @@ -327,8 +327,97 @@ func splitSentences(text string) []string { // Utility functions func preprocessText(text string) string { - // Simple normalization (Go doesn't have built-in NFKD normalization) + // TODO: Need advanced normalizer for better performance + // NOTE: Go doesn't have built-in NFKD normalization like Python // For full Unicode normalization, use golang.org/x/text/unicode/norm + // This implementation handles basic text preprocessing + + // FIXME: this should be fixed for non-English languages + + // Remove emojis and various Unicode symbols + emojiPattern := regexp.MustCompile(`[\x{1F600}-\x{1F64F}\x{1F300}-\x{1F5FF}\x{1F680}-\x{1F6FF}\x{1F700}-\x{1F77F}\x{1F780}-\x{1F7FF}\x{1F800}-\x{1F8FF}\x{1F900}-\x{1F9FF}\x{1FA00}-\x{1FA6F}\x{1FA70}-\x{1FAFF}\x{2600}-\x{26FF}\x{2700}-\x{27BF}\x{1F1E6}-\x{1F1FF}]+`) + text = emojiPattern.ReplaceAllString(text, "") + + // Replace various dashes and symbols + replacements := map[string]string{ + "โ€“": "-", // en dash + "โ€‘": "-", // non-breaking hyphen + "โ€”": "-", // em dash + "ยฏ": " ", // macron + "_": " ", // underscore + "\u201C": "\"", // left double quote + "\u201D": "\"", // right double quote + "\u2018": "'", // left single quote + "\u2019": "'", // right single quote + "ยด": "'", // acute accent + "`": "'", // grave accent + "[": " ", // left bracket + "]": " ", // right bracket + "|": " ", // vertical bar + "/": " ", // slash + "#": " ", // hash + "โ†’": " ", // right arrow + "โ†": " ", // left arrow + } + + for old, new := range replacements { + text = strings.ReplaceAll(text, old, new) + } + + // Remove combining diacritics (common combining marks) + // FIXME: this should be fixed for non-English languages + diacriticsPattern := 
regexp.MustCompile(`[\x{0302}\x{0303}\x{0304}\x{0305}\x{0306}\x{0307}\x{0308}\x{030A}\x{030B}\x{030C}\x{0327}\x{0328}\x{0329}\x{032A}\x{032B}\x{032C}\x{032D}\x{032E}\x{032F}]`) + text = diacriticsPattern.ReplaceAllString(text, "") + + // Remove special symbols + specialSymbols := []string{"โ™ฅ", "โ˜†", "โ™ก", "ยฉ", "\\"} + for _, symbol := range specialSymbols { + text = strings.ReplaceAll(text, symbol, "") + } + + // Replace known expressions + exprReplacements := map[string]string{ + "@": " at ", + "e.g.,": "for example, ", + "i.e.,": "that is, ", + } + + for old, new := range exprReplacements { + text = strings.ReplaceAll(text, old, new) + } + + // Fix spacing around punctuation + text = regexp.MustCompile(` ,`).ReplaceAllString(text, ",") + text = regexp.MustCompile(` \.`).ReplaceAllString(text, ".") + text = regexp.MustCompile(` !`).ReplaceAllString(text, "!") + text = regexp.MustCompile(` \?`).ReplaceAllString(text, "?") + text = regexp.MustCompile(` ;`).ReplaceAllString(text, ";") + text = regexp.MustCompile(` :`).ReplaceAllString(text, ":") + text = regexp.MustCompile(` '`).ReplaceAllString(text, "'") + + // Remove duplicate quotes + for strings.Contains(text, `""`) { + text = strings.ReplaceAll(text, `""`, `"`) + } + for strings.Contains(text, "''") { + text = strings.ReplaceAll(text, "''", "'") + } + for strings.Contains(text, "``") { + text = strings.ReplaceAll(text, "``", "`") + } + + // Remove extra spaces + text = regexp.MustCompile(`\s+`).ReplaceAllString(text, " ") + text = strings.TrimSpace(text) + + // If text doesn't end with punctuation, quotes, or closing brackets, add a period + if text != "" { + endsWithPunct := regexp.MustCompile(`[.!?;:,'"\x{201C}\x{201D}\x{2018}\x{2019})\]}โ€ฆใ€‚ใ€ใ€ใ€‘ใ€‰ใ€‹โ€บยป]$`) + if !endsWithPunct.MatchString(text) { + text += "." 
+ } + } + return text } diff --git a/java/Helper.java b/java/Helper.java index f53cf71..a75ade4 100644 --- a/java/Helper.java +++ b/java/Helper.java @@ -60,6 +60,42 @@ class UnicodeProcessor { this.indexer = Helper.loadJsonLongArray(unicodeIndexerJsonPath); } + private static String removeEmojis(String text) { + StringBuilder result = new StringBuilder(); + for (int i = 0; i < text.length(); i++) { + int codePoint; + if (Character.isHighSurrogate(text.charAt(i)) && i + 1 < text.length() && Character.isLowSurrogate(text.charAt(i + 1))) { + codePoint = Character.codePointAt(text, i); + i++; // Skip the low surrogate + } else { + codePoint = text.charAt(i); + } + + // Check if code point is in emoji ranges + boolean isEmoji = (codePoint >= 0x1F600 && codePoint <= 0x1F64F) || + (codePoint >= 0x1F300 && codePoint <= 0x1F5FF) || + (codePoint >= 0x1F680 && codePoint <= 0x1F6FF) || + (codePoint >= 0x1F700 && codePoint <= 0x1F77F) || + (codePoint >= 0x1F780 && codePoint <= 0x1F7FF) || + (codePoint >= 0x1F800 && codePoint <= 0x1F8FF) || + (codePoint >= 0x1F900 && codePoint <= 0x1F9FF) || + (codePoint >= 0x1FA00 && codePoint <= 0x1FA6F) || + (codePoint >= 0x1FA70 && codePoint <= 0x1FAFF) || + (codePoint >= 0x2600 && codePoint <= 0x26FF) || + (codePoint >= 0x2700 && codePoint <= 0x27BF) || + (codePoint >= 0x1F1E6 && codePoint <= 0x1F1FF); + + if (!isEmoji) { + if (codePoint > 0xFFFF) { + result.append(Character.toChars(codePoint)); + } else { + result.append((char) codePoint); + } + } + } + return result.toString(); + } + public TextProcessResult call(List textList) { List processedTexts = new ArrayList<>(); for (String text : textList) { @@ -86,7 +122,86 @@ class UnicodeProcessor { } private String preprocessText(String text) { - return Normalizer.normalize(text, Normalizer.Form.NFKD); + // TODO: Need advanced normalizer for better performance + text = Normalizer.normalize(text, Normalizer.Form.NFKD); + + // FIXME: this should be fixed for non-English languages + + // Remove 
emojis (wide Unicode range) + // Java Pattern doesn't support \x{...} syntax for Unicode above \uFFFF + // Use character filtering instead + text = removeEmojis(text); + + // Replace various dashes and symbols + Map replacements = new HashMap<>(); + replacements.put("โ€“", "-"); // en dash + replacements.put("โ€‘", "-"); // non-breaking hyphen + replacements.put("โ€”", "-"); // em dash + replacements.put("ยฏ", " "); // macron + replacements.put("_", " "); // underscore + replacements.put("\u201C", "\""); // left double quote + replacements.put("\u201D", "\""); // right double quote + replacements.put("\u2018", "'"); // left single quote + replacements.put("\u2019", "'"); // right single quote + replacements.put("ยด", "'"); // acute accent + replacements.put("`", "'"); // grave accent + replacements.put("[", " "); // left bracket + replacements.put("]", " "); // right bracket + replacements.put("|", " "); // vertical bar + replacements.put("/", " "); // slash + replacements.put("#", " "); // hash + replacements.put("โ†’", " "); // right arrow + replacements.put("โ†", " "); // left arrow + + for (Map.Entry entry : replacements.entrySet()) { + text = text.replace(entry.getKey(), entry.getValue()); + } + + // Remove combining diacritics // FIXME: this should be fixed for non-English languages + text = text.replaceAll("[\\u0302\\u0303\\u0304\\u0305\\u0306\\u0307\\u0308\\u030A\\u030B\\u030C\\u0327\\u0328\\u0329\\u032A\\u032B\\u032C\\u032D\\u032E\\u032F]", ""); + + // Remove special symbols + text = text.replaceAll("[โ™ฅโ˜†โ™กยฉ\\\\]", ""); + + // Replace known expressions + Map exprReplacements = new HashMap<>(); + exprReplacements.put("@", " at "); + exprReplacements.put("e.g.,", "for example, "); + exprReplacements.put("i.e.,", "that is, "); + + for (Map.Entry entry : exprReplacements.entrySet()) { + text = text.replace(entry.getKey(), entry.getValue()); + } + + // Fix spacing around punctuation + text = text.replaceAll(" ,", ","); + text = text.replaceAll(" \\.", 
"."); + text = text.replaceAll(" !", "!"); + text = text.replaceAll(" \\?", "?"); + text = text.replaceAll(" ;", ";"); + text = text.replaceAll(" :", ":"); + text = text.replaceAll(" '", "'"); + + // Remove duplicate quotes + while (text.contains("\"\"")) { + text = text.replace("\"\"", "\""); + } + while (text.contains("''")) { + text = text.replace("''", "'"); + } + while (text.contains("``")) { + text = text.replace("``", "`"); + } + + // Remove extra spaces + text = text.replaceAll("\\s+", " ").trim(); + + // If text doesn't end with punctuation, quotes, or closing brackets, add a period + if (!text.matches(".*[.!?;:,'\"\\u201C\\u201D\\u2018\\u2019)\\]}โ€ฆใ€‚ใ€ใ€ใ€‘ใ€‰ใ€‹โ€บยป]$")) { + text += "."; + } + + return text; } private int[] textToUnicodeValues(String text) { diff --git a/java/README.md b/java/README.md index 5d60ef3..53b8de4 100644 --- a/java/README.md +++ b/java/README.md @@ -4,6 +4,8 @@ This guide provides examples for running TTS inference using `ExampleONNX.java`. ## ๐Ÿ“ฐ Update News +**2025.11.23** - Enhanced text preprocessing with comprehensive normalization, emoji removal, symbol replacement, and punctuation handling for improved synthesis quality. + **2025.11.19** - Added `--speed` parameter to control speech synthesis speed (default: 1.05, recommended range: 0.9-1.5). **2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses. diff --git a/nodejs/README.md b/nodejs/README.md index 016c6d8..7778bb6 100644 --- a/nodejs/README.md +++ b/nodejs/README.md @@ -4,6 +4,8 @@ Node.js implementation for TTS inference. Uses ONNX Runtime to generate speech f ## ๐Ÿ“ฐ Update News +**2025.11.23** - Enhanced text preprocessing with comprehensive normalization, emoji removal, symbol replacement, and punctuation handling for improved synthesis quality. + **2025.11.19** - Added `--speed` parameter to control speech synthesis speed (default: 1.05, recommended range: 0.9-1.5). 
**2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses. diff --git a/nodejs/helper.js b/nodejs/helper.js index bfa897a..84b2aea 100644 --- a/nodejs/helper.js +++ b/nodejs/helper.js @@ -14,8 +14,85 @@ class UnicodeProcessor { } _preprocessText(text) { - // Simple NFKD normalization (JavaScript has normalize built-in) - return text.normalize('NFKD'); + // TODO: Need advanced normalizer for better performance + text = text.normalize('NFKD'); + + // FIXME: this should be fixed for non-English languages + + // Remove emojis (wide Unicode range) + const emojiPattern = /[\u{1F600}-\u{1F64F}\u{1F300}-\u{1F5FF}\u{1F680}-\u{1F6FF}\u{1F700}-\u{1F77F}\u{1F780}-\u{1F7FF}\u{1F800}-\u{1F8FF}\u{1F900}-\u{1F9FF}\u{1FA00}-\u{1FA6F}\u{1FA70}-\u{1FAFF}\u{2600}-\u{26FF}\u{2700}-\u{27BF}\u{1F1E6}-\u{1F1FF}]+/gu; + text = text.replace(emojiPattern, ''); + + // Replace various dashes and symbols + const replacements = { + 'โ€“': '-', + 'โ€‘': '-', + 'โ€”': '-', + 'ยฏ': ' ', + '_': ' ', + '"': '"', + '"': '"', + '\u2018': "'", // left single quote + '\u2019': "'", // right single quote + 'ยด': "'", + '`': "'", + '[': ' ', + ']': ' ', + '|': ' ', + '/': ' ', + '#': ' ', + 'โ†’': ' ', + 'โ†': ' ', + }; + for (const [k, v] of Object.entries(replacements)) { + text = text.replaceAll(k, v); + } + + // Remove combining diacritics // FIXME: this should be fixed for non-English languages + text = text.replace(/[\u0302\u0303\u0304\u0305\u0306\u0307\u0308\u030A\u030B\u030C\u0327\u0328\u0329\u032A\u032B\u032C\u032D\u032E\u032F]/g, ''); + + // Remove special symbols + text = text.replace(/[โ™ฅโ˜†โ™กยฉ\\]/g, ''); + + // Replace known expressions + const exprReplacements = { + '@': ' at ', + 'e.g.,': 'for example, ', + 'i.e.,': 'that is, ', + }; + for (const [k, v] of Object.entries(exprReplacements)) { + text = text.replaceAll(k, v); + } + + // Fix spacing around punctuation + text = text.replace(/ ,/g, ','); + text 
= text.replace(/ \./g, '.'); + text = text.replace(/ !/g, '!'); + text = text.replace(/ \?/g, '?'); + text = text.replace(/ ;/g, ';'); + text = text.replace(/ :/g, ':'); + text = text.replace(/ '/g, "'"); + + // Remove duplicate quotes + while (text.includes('""')) { + text = text.replace('""', '"'); + } + while (text.includes("''")) { + text = text.replace("''", "'"); + } + while (text.includes('``')) { + text = text.replace('``', '`'); + } + + // Remove extra spaces + text = text.replace(/\s+/g, ' ').trim(); + + // If text doesn't end with punctuation, quotes, or closing brackets, add a period + if (!/[.!?;:,'\"')\]}โ€ฆใ€‚ใ€ใ€ใ€‘ใ€‰ใ€‹โ€บยป]$/.test(text)) { + text += '.'; + } + + return text; } _textToUnicodeValues(text) { diff --git a/py/README.md b/py/README.md index 0a7caa9..b0467cd 100644 --- a/py/README.md +++ b/py/README.md @@ -4,6 +4,8 @@ This guide provides examples for running TTS inference using `example_onnx.py`. ## ๐Ÿ“ฐ Update News +**2025.11.23** - Enhanced text preprocessing with comprehensive normalization, emoji removal, symbol replacement, and punctuation handling for improved synthesis quality. + **2025.11.19** - Added `--speed` parameter to control speech synthesis speed. Adjust the speed factor to make speech faster or slower while maintaining natural quality. **2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses. 
diff --git a/py/helper.py b/py/helper.py index 828abf4..9a210d6 100644 --- a/py/helper.py +++ b/py/helper.py @@ -8,6 +8,8 @@ from unicodedata import normalize import numpy as np import onnxruntime as ort +import re + class UnicodeProcessor: def __init__(self, unicode_indexer_path: str): @@ -15,8 +17,96 @@ class UnicodeProcessor: self.indexer = json.load(f) def _preprocess_text(self, text: str) -> str: - # TODO: add more preprocessing + # TODO: Need advanced normalizer for better performance text = normalize("NFKD", text) + + # FIXME: this should be fixed for non-English languages + + # Remove emojis (wide Unicode range) + emoji_pattern = re.compile( + "[\U0001f600-\U0001f64f" # emoticons + "\U0001f300-\U0001f5ff" # symbols & pictographs + "\U0001f680-\U0001f6ff" # transport & map symbols + "\U0001f700-\U0001f77f" + "\U0001f780-\U0001f7ff" + "\U0001f800-\U0001f8ff" + "\U0001f900-\U0001f9ff" + "\U0001fa00-\U0001fa6f" + "\U0001fa70-\U0001faff" + "\u2600-\u26ff" + "\u2700-\u27bf" + "\U0001f1e6-\U0001f1ff]+", + flags=re.UNICODE, + ) + text = emoji_pattern.sub("", text) + + # Replace various dashes and symbols + replacements = { + "โ€“": "-", + "โ€‘": "-", + "โ€”": "-", + "ยฏ": " ", + "_": " ", + "โ€œ": '"', + "โ€": '"', + "โ€˜": "'", + "โ€™": "'", + "ยด": "'", + "`": "'", + "[": " ", + "]": " ", + "|": " ", + "/": " ", + "#": " ", + "โ†’": " ", + "โ†": " ", + } + for k, v in replacements.items(): + text = text.replace(k, v) + + # Remove combining diacritics # FIXME: this should be fixed for non-English languages + text = re.sub( + r"[\u0302\u0303\u0304\u0305\u0306\u0307\u0308\u030A\u030B\u030C\u0327\u0328\u0329\u032A\u032B\u032C\u032D\u032E\u032F]", + "", + text, + ) + + # Remove special symbols + text = re.sub(r"[โ™ฅโ˜†โ™กยฉ\\]", "", text) + + # Replace known expressions + expr_replacements = { + "@": " at ", + "e.g.,": "for example, ", + "i.e.,": "that is, ", + } + for k, v in expr_replacements.items(): + text = text.replace(k, v) + + # Fix spacing around 
punctuation + text = re.sub(r" ,", ",", text) + text = re.sub(r" \.", ".", text) + text = re.sub(r" !", "!", text) + text = re.sub(r" \?", "?", text) + text = re.sub(r" ;", ";", text) + text = re.sub(r" :", ":", text) + text = re.sub(r" '", "'", text) + + # Remove duplicate quotes + while '""' in text: + text = text.replace('""', '"') + while "''" in text: + text = text.replace("''", "'") + while "``" in text: + text = text.replace("``", "`") + + # Remove extra spaces + text = re.sub(r"\s+", " ", text).strip() + + # If text doesn't end with punctuation, quotes, or closing brackets, add a period + if not re.search(r"[.!?;:,'\"')\]}โ€ฆใ€‚ใ€ใ€ใ€‘ใ€‰ใ€‹โ€บยป]$", text): + text += "." + return text def _get_text_mask(self, text_ids_lengths: np.ndarray) -> np.ndarray: diff --git a/rust/README.md b/rust/README.md index 0098473..c1285c9 100644 --- a/rust/README.md +++ b/rust/README.md @@ -4,6 +4,8 @@ This guide provides examples for running TTS inference using Rust. ## ๐Ÿ“ฐ Update News +**2025.11.23** - Enhanced text preprocessing with comprehensive normalization, emoji removal, symbol replacement, and punctuation handling for improved synthesis quality. + **2025.11.19** - Added `--speed` parameter to control speech synthesis speed (default: 1.05, recommended range: 0.9-1.5). **2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses. 
diff --git a/rust/src/helper.rs b/rust/src/helper.rs index 7ed427a..e0206db 100644 --- a/rust/src/helper.rs +++ b/rust/src/helper.rs @@ -113,7 +113,95 @@ impl UnicodeProcessor { } pub fn preprocess_text(text: &str) -> String { - text.nfkd().collect() + // TODO: Need advanced normalizer for better performance + let mut text: String = text.nfkd().collect(); + + // FIXME: this should be fixed for non-English languages + + // Remove emojis (wide Unicode range) + let emoji_pattern = Regex::new(r"[\x{1F600}-\x{1F64F}\x{1F300}-\x{1F5FF}\x{1F680}-\x{1F6FF}\x{1F700}-\x{1F77F}\x{1F780}-\x{1F7FF}\x{1F800}-\x{1F8FF}\x{1F900}-\x{1F9FF}\x{1FA00}-\x{1FA6F}\x{1FA70}-\x{1FAFF}\x{2600}-\x{26FF}\x{2700}-\x{27BF}\x{1F1E6}-\x{1F1FF}]+").unwrap(); + text = emoji_pattern.replace_all(&text, "").to_string(); + + // Replace various dashes and symbols + let replacements = [ + ("โ€“", "-"), // en dash + ("โ€‘", "-"), // non-breaking hyphen + ("โ€”", "-"), // em dash + ("ยฏ", " "), // macron + ("_", " "), // underscore + ("\u{201C}", "\""), // left double quote + ("\u{201D}", "\""), // right double quote + ("\u{2018}", "'"), // left single quote + ("\u{2019}", "'"), // right single quote + ("ยด", "'"), // acute accent + ("`", "'"), // grave accent + ("[", " "), // left bracket + ("]", " "), // right bracket + ("|", " "), // vertical bar + ("/", " "), // slash + ("#", " "), // hash + ("โ†’", " "), // right arrow + ("โ†", " "), // left arrow + ]; + + for (from, to) in &replacements { + text = text.replace(from, to); + } + + // Remove combining diacritics // FIXME: this should be fixed for non-English languages + let diacritics_pattern = Regex::new(r"[\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{030A}\u{030B}\u{030C}\u{0327}\u{0328}\u{0329}\u{032A}\u{032B}\u{032C}\u{032D}\u{032E}\u{032F}]").unwrap(); + text = diacritics_pattern.replace_all(&text, "").to_string(); + + // Remove special symbols + let special_symbols = ["โ™ฅ", "โ˜†", "โ™ก", "ยฉ", "\\"]; + for symbol in 
&special_symbols { + text = text.replace(symbol, ""); + } + + // Replace known expressions + let expr_replacements = [ + ("@", " at "), + ("e.g.,", "for example, "), + ("i.e.,", "that is, "), + ]; + + for (from, to) in &expr_replacements { + text = text.replace(from, to); + } + + // Fix spacing around punctuation + text = Regex::new(r" ,").unwrap().replace_all(&text, ",").to_string(); + text = Regex::new(r" \.").unwrap().replace_all(&text, ".").to_string(); + text = Regex::new(r" !").unwrap().replace_all(&text, "!").to_string(); + text = Regex::new(r" \?").unwrap().replace_all(&text, "?").to_string(); + text = Regex::new(r" ;").unwrap().replace_all(&text, ";").to_string(); + text = Regex::new(r" :").unwrap().replace_all(&text, ":").to_string(); + text = Regex::new(r" '").unwrap().replace_all(&text, "'").to_string(); + + // Remove duplicate quotes + while text.contains("\"\"") { + text = text.replace("\"\"", "\""); + } + while text.contains("''") { + text = text.replace("''", "'"); + } + while text.contains("``") { + text = text.replace("``", "`"); + } + + // Remove extra spaces + text = Regex::new(r"\s+").unwrap().replace_all(&text, " ").to_string(); + text = text.trim().to_string(); + + // If text doesn't end with punctuation, quotes, or closing brackets, add a period + if !text.is_empty() { + let ends_with_punct = Regex::new(r#"[.!?;:,'"\u{201C}\u{201D}\u{2018}\u{2019})\]}โ€ฆใ€‚ใ€ใ€ใ€‘ใ€‰ใ€‹โ€บยป]$"#).unwrap(); + if !ends_with_punct.is_match(&text) { + text.push('.'); + } + } + + text } pub fn text_to_unicode_values(text: &str) -> Vec { diff --git a/swift/README.md b/swift/README.md index a7065af..32e17c8 100644 --- a/swift/README.md +++ b/swift/README.md @@ -4,6 +4,8 @@ This guide provides examples for running TTS inference using `example_onnx`. ## ๐Ÿ“ฐ Update News +**2025.11.23** - Enhanced text preprocessing with comprehensive normalization, emoji removal, symbol replacement, and punctuation handling for improved synthesis quality. 
+
 **2025.11.19** - Added `--speed` parameter to control speech synthesis speed (default: 1.05, recommended range: 0.9-1.5).
 
 **2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
diff --git a/swift/Sources/Helper.swift b/swift/Sources/Helper.swift
index 6459384..de0f3bd 100644
--- a/swift/Sources/Helper.swift
+++ b/swift/Sources/Helper.swift
@@ -72,7 +72,125 @@ class UnicodeProcessor {
     }
 
+    /// Normalizes and cleans up `text`, returning the processed string.
+    ///
+    /// Steps, in order: NFKD normalization, emoji removal, dash/quote/symbol
+    /// replacement, removal of selected combining diacritics, expansion of
+    /// "@", "e.g.," and "i.e.,", punctuation-spacing and duplicate-quote
+    /// cleanup, whitespace collapsing, and appending a period when non-empty
+    /// text does not end with punctuation, a quote, or a closing bracket.
     func preprocessText(_ text: String) -> String {
-        return text.precomposedStringWithCompatibilityMapping
+        // TODO: Need advanced normalizer for better performance
+        // Decompose (NFKD) rather than compose (NFKC): the combining-diacritics
+        // removal below can only match marks that have been split off their
+        // base characters.
+        var text = text.decomposedStringWithCompatibilityMapping
+
+        // FIXME: this should be fixed for non-English languages
+
+        // Remove emojis (wide Unicode range) by filtering on scalar value
+        let emojiRanges: [ClosedRange<UInt32>] = [
+            0x1F600...0x1F64F,
+            0x1F300...0x1F5FF,
+            0x1F680...0x1F6FF,
+            0x1F700...0x1F77F,
+            0x1F780...0x1F7FF,
+            0x1F800...0x1F8FF,
+            0x1F900...0x1F9FF,
+            0x1FA00...0x1FA6F,
+            0x1FA70...0x1FAFF,
+            0x2600...0x26FF,
+            0x2700...0x27BF,
+            0x1F1E6...0x1F1FF,
+        ]
+        var keptScalars = String.UnicodeScalarView()
+        for scalar in text.unicodeScalars
+        where !emojiRanges.contains(where: { $0.contains(scalar.value) }) {
+            keptScalars.append(scalar)
+        }
+        text = String(keptScalars)
+
+        // Replace various dashes and symbols. An ordered array (not a
+        // dictionary) keeps the application order deterministic; non-ASCII keys
+        // are escapes so the table does not depend on this file's encoding.
+        let replacements: [(String, String)] = [
+            ("\u{2013}", "-"),  // en dash
+            ("\u{2011}", "-"),  // non-breaking hyphen
+            ("\u{2014}", "-"),  // em dash
+            ("\u{00AF}", " "),  // macron
+            ("_", " "),         // underscore
+            ("\u{201C}", "\""), // left double quote
+            ("\u{201D}", "\""), // right double quote
+            ("\u{2018}", "'"),  // left single quote
+            ("\u{2019}", "'"),  // right single quote
+            ("\u{00B4}", "'"),  // acute accent
+            ("`", "'"),         // grave accent
+            ("[", " "),         // left bracket
+            ("]", " "),         // right bracket
+            ("|", " "),         // vertical bar
+            ("/", " "),         // slash
+            ("#", " "),         // hash
+            ("\u{2192}", " "),  // right arrow
+            ("\u{2190}", " "),  // left arrow
+        ]
+
+        for (from, to) in replacements {
+            text = text.replacingOccurrences(of: from, with: to)
+        }
+
+        // Remove combining diacritics
+        // FIXME: this should be fixed for non-English languages
+        let diacriticsPattern = try! NSRegularExpression(pattern: "[\\u0302\\u0303\\u0304\\u0305\\u0306\\u0307\\u0308\\u030A\\u030B\\u030C\\u0327\\u0328\\u0329\\u032A\\u032B\\u032C\\u032D\\u032E\\u032F]")
+        let diacriticsRange = NSRange(text.startIndex..., in: text)
+        text = diacriticsPattern.stringByReplacingMatches(in: text, range: diacriticsRange, withTemplate: "")
+
+        // Remove special symbols: black heart, white star, white heart,
+        // copyright sign, backslash
+        let specialSymbols = ["\u{2665}", "\u{2606}", "\u{2661}", "\u{00A9}", "\\"]
+        for symbol in specialSymbols {
+            text = text.replacingOccurrences(of: symbol, with: "")
+        }
+
+        // Replace known expressions
+        let exprReplacements: [(String, String)] = [
+            ("@", " at "),
+            ("e.g.,", "for example, "),
+            ("i.e.,", "that is, "),
+        ]
+
+        for (from, to) in exprReplacements {
+            text = text.replacingOccurrences(of: from, with: to)
+        }
+
+        // Fix spacing around punctuation
+        for mark in [",", ".", "!", "?", ";", ":", "'"] {
+            text = text.replacingOccurrences(of: " " + mark, with: mark)
+        }
+
+        // Remove duplicate quotes. Backticks were all mapped to "'" above,
+        // so only these two quote characters can still repeat.
+        while text.contains("\"\"") {
+            text = text.replacingOccurrences(of: "\"\"", with: "\"")
+        }
+        while text.contains("''") {
+            text = text.replacingOccurrences(of: "''", with: "'")
+        }
+
+        // Remove extra spaces
+        let whitespacePattern = try! NSRegularExpression(pattern: "\\s+")
+        let whitespaceRange = NSRange(text.startIndex..., in: text)
+        text = whitespacePattern.stringByReplacingMatches(in: text, range: whitespaceRange, withTemplate: " ")
+        text = text.trimmingCharacters(in: .whitespacesAndNewlines)
+
+        // If text doesn't end with punctuation, quotes, or closing brackets, add a period
+        if !text.isEmpty {
+            let punctPattern = try! NSRegularExpression(pattern: "[.!?;:,'\"\\u201C\\u201D\\u2018\\u2019)\\]}\\u2026\\u3002\\u300D\\u300F\\u3011\\u3009\\u300B\\u203A\\u00BB]$")
+            let punctRange = NSRange(text.startIndex..., in: text)
+            if punctPattern.firstMatch(in: text, range: punctRange) == nil {
+                text += "."
+            }
+        }
+
+        return text
     }
 
     func lengthToMask(_ lengths: [Int], maxLen: Int? = nil) -> [[[Float]]] {
diff --git a/web/README.md b/web/README.md
index 4366251..1daf225 100644
--- a/web/README.md
+++ b/web/README.md
@@ -4,6 +4,8 @@ This example demonstrates how to use Supertonic in a web browser using ONNX Runt
 
 ## 📰 Update News
 
+**2025.11.23** - Enhanced text preprocessing with comprehensive normalization, emoji removal, symbol replacement, and punctuation handling for improved synthesis quality.
+
 **2025.11.19** - Added speed control slider to adjust speech synthesis speed (default: 1.05, recommended range: 0.9-1.5).
 
 **2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
diff --git a/web/helper.js b/web/helper.js
index 9553176..2022866 100644
--- a/web/helper.js
+++ b/web/helper.js
@@ -28,7 +28,92 @@ export class UnicodeProcessor {
     }
 
+    /**
+     * Normalizes and cleans up text.
+     *
+     * Steps, in order: NFKD normalization, emoji removal, dash/quote/symbol
+     * replacement, removal of selected combining diacritics, expansion of
+     * "@", "e.g.," and "i.e.,", punctuation-spacing and duplicate-quote
+     * cleanup, whitespace collapsing, and appending a period when non-empty
+     * text does not end with punctuation, a quote, or a closing bracket.
+     *
+     * @param {string} text - Raw input text.
+     * @returns {string} The processed text.
+     */
     preprocessText(text) {
-        return text.normalize('NFKC');
+        // TODO: Need advanced normalizer for better performance
+        text = text.normalize('NFKD');
+
+        // FIXME: this should be fixed for non-English languages
+
+        // Remove emojis (wide Unicode range)
+        const emojiPattern = /[\u{1F600}-\u{1F64F}\u{1F300}-\u{1F5FF}\u{1F680}-\u{1F6FF}\u{1F700}-\u{1F77F}\u{1F780}-\u{1F7FF}\u{1F800}-\u{1F8FF}\u{1F900}-\u{1F9FF}\u{1FA00}-\u{1FA6F}\u{1FA70}-\u{1FAFF}\u{2600}-\u{26FF}\u{2700}-\u{27BF}\u{1F1E6}-\u{1F1FF}]+/gu;
+        text = text.replace(emojiPattern, '');
+
+        // Replace various dashes and symbols. Pairs live in an array and
+        // non-ASCII characters are written as escapes, so entries cannot collide
+        // as duplicate object keys or be altered by re-encoding of this file.
+        const replacements = [
+            ['\u2013', '-'],  // en dash
+            ['\u2011', '-'],  // non-breaking hyphen
+            ['\u2014', '-'],  // em dash
+            ['\u00AF', ' '],  // macron
+            ['_', ' '],       // underscore
+            ['\u201C', '"'],  // left double quote
+            ['\u201D', '"'],  // right double quote
+            ['\u2018', "'"],  // left single quote
+            ['\u2019', "'"],  // right single quote
+            ['\u00B4', "'"],  // acute accent
+            ['`', "'"],       // grave accent
+            ['[', ' '],       // left bracket
+            [']', ' '],       // right bracket
+            ['|', ' '],       // vertical bar
+            ['/', ' '],       // slash
+            ['#', ' '],       // hash
+            ['\u2192', ' '],  // right arrow
+            ['\u2190', ' '],  // left arrow
+        ];
+        for (const [from, to] of replacements) {
+            text = text.replaceAll(from, to);
+        }
+
+        // Remove combining diacritics
+        // FIXME: this should be fixed for non-English languages
+        text = text.replace(/[\u0302\u0303\u0304\u0305\u0306\u0307\u0308\u030A\u030B\u030C\u0327\u0328\u0329\u032A\u032B\u032C\u032D\u032E\u032F]/g, '');
+
+        // Remove special symbols: black heart, white star, white heart,
+        // copyright sign, backslash
+        text = text.replace(/[\u2665\u2606\u2661\u00A9\\]/g, '');
+
+        // Replace known expressions
+        const exprReplacements = [
+            ['@', ' at '],
+            ['e.g.,', 'for example, '],
+            ['i.e.,', 'that is, '],
+        ];
+        for (const [from, to] of exprReplacements) {
+            text = text.replaceAll(from, to);
+        }
+
+        // Fix spacing around punctuation
+        for (const mark of [',', '.', '!', '?', ';', ':', "'"]) {
+            text = text.replaceAll(' ' + mark, mark);
+        }
+
+        // Remove duplicate quotes by collapsing each run to one character.
+        // Backticks were all mapped to "'" above, so they need no handling.
+        text = text.replace(/"{2,}/g, '"');
+        text = text.replace(/'{2,}/g, "'");
+
+        // Remove extra spaces
+        text = text.replace(/\s+/g, ' ').trim();
+
+        // If text doesn't end with punctuation, quotes, or closing brackets, add a period.
+        // Empty text is left empty instead of being turned into a lone ".".
+        if (text.length > 0 && !/[.!?;:,'"\u201C\u201D\u2018\u2019)\]}\u2026\u3002\u300D\u300F\u3011\u3009\u300B\u203A\u00BB]$/.test(text)) {
+            text += '.';
+        }
+
+        return text;
     }
 
     getTextMask(textIdsLengths) {