Fix text normalization bug (#16)

2026-06-02 01:38:48 +02:00 · 2025-11-23 13:18:15 +09:00
parent 9015bd095f
commit 8d42b55965
18 changed files with 966 additions and 28 deletions
@@ -4,6 +4,8 @@ High-performance text-to-speech inference using ONNX Runtime.
 ## 📰 Update News
 **2025.11.23** - Enhanced text preprocessing with comprehensive normalization, emoji removal, symbol replacement, and punctuation handling for improved synthesis quality.
 **2025.11.19** - Added `--speed` parameter to control speech synthesis speed (default: 1.05, recommended range: 0.9-1.5).
 **2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
@@ -19,6 +19,24 @@ void clearTensorBuffers() {
    g_tensor_buffers_int64.clear();
 }
 // ============================================================================
 // Helper function - trim
 // ============================================================================
 // Returns a copy of `str` with leading and trailing whitespace removed.
 // Whitespace is whatever std::isspace reports for the current C locale.
 static std::string trim(const std::string& str) {
    const auto is_blank = [](char ch) {
        // Cast first: passing a negative char to std::isspace is undefined.
        return std::isspace(static_cast<unsigned char>(ch)) != 0;
    };
    size_t first = 0;
    size_t last = str.size();
    for (; first < last && is_blank(str[first]); ++first) {
    }
    for (; last > first && is_blank(str[last - 1]); --last) {
    }
    return str.substr(first, last - first);
 }
 // ============================================================================
 // UnicodeProcessor implementation
 // ============================================================================
@@ -28,10 +46,148 @@ UnicodeProcessor::UnicodeProcessor(const std::string& unicode_indexer_json_path)
 }
 // Decodes the UTF-8 sequence that begins at text[pos].
 // On success stores the code point in `cp` and returns the sequence length
 // (1-4 bytes). On a malformed or truncated sequence stores U+FFFD and returns
 // 1 so the caller can copy the raw byte through unchanged.
 static size_t decodeUtf8CodePoint(const std::string& text, size_t pos, char32_t& cp) {
    const unsigned char lead = static_cast<unsigned char>(text[pos]);
    size_t trailing = 0;
    if (lead < 0x80) {
        cp = lead;
        return 1;
    } else if ((lead & 0xE0) == 0xC0) {
        cp = lead & 0x1F;
        trailing = 1;
    } else if ((lead & 0xF0) == 0xE0) {
        cp = lead & 0x0F;
        trailing = 2;
    } else if ((lead & 0xF8) == 0xF0) {
        cp = lead & 0x07;
        trailing = 3;
    } else {
        cp = 0xFFFD;
        return 1;
    }
    if (text.size() - pos <= trailing) {
        cp = 0xFFFD;  // sequence runs past the end of the string
        return 1;
    }
    for (size_t k = 1; k <= trailing; ++k) {
        const unsigned char cont = static_cast<unsigned char>(text[pos + k]);
        if ((cont & 0xC0) != 0x80) {
            cp = 0xFFFD;
            return 1;
        }
        cp = (cp << 6) | (cont & 0x3F);
    }
    return trailing + 1;
 }
 // Returns true for code points that preprocessText drops outright: the
 // Combining Diacritical Marks block and the emoji / pictograph ranges.
 static bool isStrippedCodePoint(char32_t cp) {
    struct Range {
        char32_t first;
        char32_t last;
    };
    static const Range kStripped[] = {
        {0x0300, 0x036F},    // combining diacritical marks
        {0x2600, 0x26FF},    // miscellaneous symbols
        {0x2700, 0x27BF},    // dingbats
        {0x1F1E6, 0x1F1FF},  // regional indicators (flags)
        {0x1F300, 0x1F5FF},  // symbols & pictographs
        {0x1F600, 0x1F64F},  // emoticons
        {0x1F680, 0x1F6FF},  // transport & map symbols
        {0x1F700, 0x1F77F},
        {0x1F780, 0x1F7FF},
        {0x1F800, 0x1F8FF},
        {0x1F900, 0x1F9FF},
        {0x1FA00, 0x1FA6F},
        {0x1FA70, 0x1FAFF},
    };
    for (const Range& range : kStripped) {
        if (cp >= range.first && cp <= range.last) {
            return true;
        }
    }
    return false;
 }
 // Normalizes raw input text before it is converted to model indices.
 //
 // Steps, in order: drop emoji and combining marks, map typographic dashes,
 // quotes and stray symbols to ASCII or spaces, expand "@", "e.g.," and
 // "i.e.,", remove the space before punctuation, collapse repeated quotes and
 // whitespace, and append a period when the text has no closing punctuation.
 // An input that is empty after trimming is returned as an empty string.
 //
 // TODO: Need advanced normalizer for better performance
 // NOTE: C++ has no built-in Unicode normalization (NFKD); consider ICU.
 // FIXME: emoji/diacritic stripping should be revisited for non-English text.
 std::string UnicodeProcessor::preprocessText(const std::string& text) {
    // Filter by decoded code point. Byte-level regexes cannot tell a lead
    // byte from a continuation byte, so they delete unrelated characters or
    // cut a sequence in half.
    std::string result;
    result.reserve(text.size());
    for (size_t pos = 0; pos < text.size();) {
        char32_t cp = 0;
        const size_t len = decodeUtf8CodePoint(text, pos, cp);
        if (!isStrippedCodePoint(cp)) {
            result.append(text, pos, len);
        }
        pos += len;
    }
    struct Replacement {
        const char* from;
        const char* to;
    };
    // Replaces every occurrence of `from` with `to`, resuming after the
    // inserted text so a replacement is never rescanned.
    const auto replaceAll = [&result](const char* from, const char* to) {
        const size_t from_len = strlen(from);
        const size_t to_len = strlen(to);
        size_t pos = 0;
        while ((pos = result.find(from, pos)) != std::string::npos) {
            result.replace(pos, from_len, to);
            pos += to_len;
        }
    };
    // Non-ASCII keys are spelled as UTF-8 byte escapes so they do not depend
    // on the source file's encoding.
    static const Replacement kSymbolReplacements[] = {
        {"\xE2\x80\x93", "-"},   // en dash (U+2013)
        {"\xE2\x80\x91", "-"},   // non-breaking hyphen (U+2011)
        {"\xE2\x80\x94", "-"},   // em dash (U+2014)
        {"\xC2\xAF", " "},       // macron (U+00AF)
        {"_", " "},              // underscore
        {"\xE2\x80\x9C", "\""},  // left double quote (U+201C)
        {"\xE2\x80\x9D", "\""},  // right double quote (U+201D)
        {"\xE2\x80\x98", "'"},   // left single quote (U+2018)
        {"\xE2\x80\x99", "'"},   // right single quote (U+2019)
        {"\xC2\xB4", "'"},       // acute accent (U+00B4)
        {"`", "'"},              // grave accent
        {"[", " "},              // left bracket
        {"]", " "},              // right bracket
        {"|", " "},              // vertical bar
        {"/", " "},              // slash
        {"#", " "},              // hash
        {"\xE2\x86\x92", " "},   // right arrow (U+2192)
        {"\xE2\x86\x90", " "},   // left arrow (U+2190)
    };
    for (const Replacement& repl : kSymbolReplacements) {
        replaceAll(repl.from, repl.to);
    }
    // Remove special symbols. The first three also fall inside the stripped
    // U+2600-U+26FF range; they stay listed to mirror the other ports.
    static const char* const kSpecialSymbols[] = {
        "\xE2\x99\xA5",  // black heart (U+2665)
        "\xE2\x98\x86",  // white star (U+2606)
        "\xE2\x99\xA1",  // white heart (U+2661)
        "\xC2\xA9",      // copyright sign (U+00A9)
        "\\",
    };
    for (const char* symbol : kSpecialSymbols) {
        replaceAll(symbol, "");
    }
    // Replace known expressions
    static const Replacement kExprReplacements[] = {
        {"@", " at "},
        {"e.g.,", "for example, "},
        {"i.e.,", "that is, "},
    };
    for (const Replacement& repl : kExprReplacements) {
        replaceAll(repl.from, repl.to);
    }
    // Drop the single space that precedes , . ! ? ; : or '
    static const std::regex kSpaceBeforePunct(" ([,.!?;:'])");
    result = std::regex_replace(result, kSpaceBeforePunct, "$1");
    // Collapse runs of identical quotes to one. Backticks were already
    // rewritten to "'" above, so only these two pairs can occur.
    static const char* const kDoubledQuotes[] = {"\"\"", "''"};
    for (const char* doubled : kDoubledQuotes) {
        size_t pos = 0;
        while ((pos = result.find(doubled)) != std::string::npos) {
            result.erase(pos, 1);
        }
    }
    // Remove extra spaces
    static const std::regex kWhitespaceRun("\\s+");
    result = trim(std::regex_replace(result, kWhitespaceRun, " "));
    if (result.empty()) {
        return result;
    }
    // Suffixes that already close a sentence. Suffix lengths differ (1-3
    // bytes), so each is compared at its own length. Curly quotes are absent
    // because they were converted to ASCII quotes above.
    static const char* const kTerminators[] = {
        ".", "!", "?", ";", ":", ",", "'", "\"", ")", "]", "}", ">",
        "\xE2\x80\xA6",  // horizontal ellipsis (U+2026)
        "\xE3\x80\x82",  // ideographic full stop (U+3002)
        "\xE3\x80\x8D",  // right corner bracket (U+300D)
        "\xE3\x80\x8F",  // right white corner bracket (U+300F)
        "\xE3\x80\x91",  // right black lenticular bracket (U+3011)
        "\xE3\x80\x89",  // right angle bracket (U+3009)
        "\xE3\x80\x8B",  // right double angle bracket (U+300B)
        "\xE2\x80\xBA",  // single right-pointing angle quote (U+203A)
        "\xC2\xBB",      // right-pointing double angle quote (U+00BB)
    };
    bool ends_with_punct = false;
    for (const char* suffix : kTerminators) {
        const size_t n = strlen(suffix);
        if (result.size() >= n && result.compare(result.size() - n, n, suffix) == 0) {
            ends_with_punct = true;
            break;
        }
    }
    if (!ends_with_punct) {
        result += ".";
    }
    return result;
 }
 std::vector<uint16_t> UnicodeProcessor::textToUnicodeValues(const std::string& text) {
@@ -772,20 +928,6 @@ std::string sanitizeFilename(const std::string& text, int max_len) {
 // Chunk text
 // ============================================================================
 // Strips leading and trailing whitespace (per std::isspace) from `str` and
 // returns the remaining middle section as a new string.
 static std::string trim(const std::string& str) {
    size_t offset = 0;
    size_t length = str.size();
    // Consume whitespace from the front, shrinking the window as we go.
    while (length > 0 && std::isspace(static_cast<unsigned char>(str[offset]))) {
        ++offset;
        --length;
    }
    // Then shrink the window from the back.
    while (length > 0 && std::isspace(static_cast<unsigned char>(str[offset + length - 1]))) {
        --length;
    }
    return str.substr(offset, length);
 }
 std::vector<std::string> chunkText(const std::string& text, int max_len) {
    std::vector<std::string> chunks;
@@ -71,10 +71,144 @@ namespace Supertonic
            }
        }
        // Inclusive [first, last] code point ranges that RemoveEmojis drops.
        private static readonly int[,] EmojiRanges =
        {
            {0x1F600, 0x1F64F},
            {0x1F300, 0x1F5FF},
            {0x1F680, 0x1F6FF},
            {0x1F700, 0x1F77F},
            {0x1F780, 0x1F7FF},
            {0x1F800, 0x1F8FF},
            {0x1F900, 0x1F9FF},
            {0x1FA00, 0x1FA6F},
            {0x1FA70, 0x1FAFF},
            {0x2600, 0x26FF},
            {0x2700, 0x27BF},
            {0x1F1E6, 0x1F1FF},
        };

        /// <summary>
        /// Returns <paramref name="text"/> with every code point that falls in
        /// one of the emoji ranges removed. Valid surrogate pairs are treated
        /// as a single code point; unpaired surrogates are kept as-is.
        /// </summary>
        private static string RemoveEmojis(string text)
        {
            var kept = new StringBuilder();
            int index = 0;
            while (index < text.Length)
            {
                char lead = text[index];
                bool isPair = char.IsHighSurrogate(lead)
                    && index + 1 < text.Length
                    && char.IsLowSurrogate(text[index + 1]);
                int units = isPair ? 2 : 1;
                int scalar = isPair ? char.ConvertToUtf32(lead, text[index + 1]) : lead;

                bool drop = false;
                for (int r = 0; r < EmojiRanges.GetLength(0) && !drop; r++)
                {
                    drop = scalar >= EmojiRanges[r, 0] && scalar <= EmojiRanges[r, 1];
                }

                if (!drop)
                {
                    // Copy the original UTF-16 units through unchanged.
                    kept.Append(text, index, units);
                }
                index += units;
            }
            return kept.ToString();
        }
        /// <summary>
        /// Normalizes input text: applies NFKD, removes emojis, maps dashes,
        /// quotes and stray symbols to ASCII or spaces, expands "@", "e.g.,"
        /// and "i.e.,", tightens spacing before punctuation, collapses repeated
        /// quotes and whitespace, and appends a period when the text does not
        /// already end in punctuation, a quote or a closing bracket.
        /// </summary>
        /// <param name="text">Raw text to normalize.</param>
        /// <returns>The normalized text.</returns>
        private string PreprocessText(string text)
        {
-            // Simple normalization (C# has Normalize built-in)
+            // TODO: Need advanced normalizer for better performance
-            return text.Normalize(NormalizationForm.FormKD);
+            text = text.Normalize(NormalizationForm.FormKD);
            // FIXME: this should be fixed for non-English languages
            // Remove emojis (wide Unicode range)
            // C# doesn't support \u{...} syntax in regex, so we use character filtering instead
            text = RemoveEmojis(text);
            // Replace various dashes and symbols
            // Each key is replaced independently; no value contains another key,
            // so the enumeration order does not affect the result.
            var replacements = new Dictionary<string, string>
            {
                {"–", "-"},      // en dash
                {"‑", "-"},      // non-breaking hyphen
                {"—", "-"},      // em dash
                {"¯", " "},      // macron
                {"_", " "},      // underscore
                {"\u201C", "\""},     // left double quote
                {"\u201D", "\""},     // right double quote
                {"\u2018", "'"},      // left single quote
                {"\u2019", "'"},      // right single quote
                {"´", "'"},      // acute accent
                {"`", "'"},      // grave accent
                {"[", " "},      // left bracket
                {"]", " "},      // right bracket
                {"|", " "},      // vertical bar
                {"/", " "},      // slash
                {"#", " "},      // hash
                {"→", " "},      // right arrow
                {"←", " "},      // left arrow
            };
            foreach (var kvp in replacements)
            {
                text = text.Replace(kvp.Key, kvp.Value);
            }
            // Remove combining diacritics // FIXME: this should be fixed for non-English languages
            // Only the listed marks are removed; U+0300 and U+0301 are not in the set.
            text = Regex.Replace(text, @"[\u0302\u0303\u0304\u0305\u0306\u0307\u0308\u030A\u030B\u030C\u0327\u0328\u0329\u032A\u032B\u032C\u032D\u032E\u032F]", "");
            // Remove special symbols
            text = Regex.Replace(text, @"[♥☆♡©\\]", "");
            // Replace known expressions
            var exprReplacements = new Dictionary<string, string>
            {
                {"@", " at "},
                {"e.g.,", "for example, "},
                {"i.e.,", "that is, "},
            };
            foreach (var kvp in exprReplacements)
            {
                text = text.Replace(kvp.Key, kvp.Value);
            }
            // Fix spacing around punctuation
            // Each call removes one space directly before the punctuation mark.
            text = Regex.Replace(text, @" ,", ",");
            text = Regex.Replace(text, @" \.", ".");
            text = Regex.Replace(text, @" !", "!");
            text = Regex.Replace(text, @" \?", "?");
            text = Regex.Replace(text, @" ;", ";");
            text = Regex.Replace(text, @" :", ":");
            text = Regex.Replace(text, @" '", "'");
            // Remove duplicate quotes
            // Loop until stable so runs of three or more collapse to a single quote.
            while (text.Contains("\"\""))
            {
                text = text.Replace("\"\"", "\"");
            }
            while (text.Contains("''"))
            {
                text = text.Replace("''", "'");
            }
            // Backticks were already replaced with "'" above, so this loop cannot match.
            while (text.Contains("``"))
            {
                text = text.Replace("``", "`");
            }
            // Remove extra spaces
            text = Regex.Replace(text, @"\s+", " ").Trim();
            // If text doesn't end with punctuation, quotes, or closing brackets, add a period
            // An empty string does not match either, so it becomes ".".
            if (!Regex.IsMatch(text, @"[.!?;:,'\u0022\u201C\u201D\u2018\u2019)\]}…。」』】〉》›»]$"))
            {
                text += ".";
            }
            return text;
        }
        private int[] TextToUnicodeValues(string text)
@@ -4,6 +4,8 @@ This guide provides examples for running TTS inference using `ExampleONNX.cs`.
 ## 📰 Update News
 **2025.11.23** - Enhanced text preprocessing with comprehensive normalization, emoji removal, symbol replacement, and punctuation handling for improved synthesis quality.
 **2025.11.19** - Added `--speed` parameter to control speech synthesis speed (default: 1.05, recommended range: 0.9-1.5).
 **2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
@@ -4,6 +4,8 @@ This guide provides examples for running TTS inference using `example_onnx.go`.
 ## 📰 Update News
 **2025.11.23** - Enhanced text preprocessing with comprehensive normalization, emoji removal, symbol replacement, and punctuation handling for improved synthesis quality.
 **2025.11.19** - Added `--speed` parameter to control speech synthesis speed (default: 1.05, recommended range: 0.9-1.5).
 **2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
@@ -327,8 +327,97 @@ func splitSentences(text string) []string {
 // Utility functions
 // Patterns and replacers used by preprocessText. They are built once at
 // package initialisation instead of being recompiled on every call.
 var (
 	// Emoji and pictograph ranges removed from the text.
 	preprocessEmojiRe = regexp.MustCompile(`[\x{1F600}-\x{1F64F}\x{1F300}-\x{1F5FF}\x{1F680}-\x{1F6FF}\x{1F700}-\x{1F77F}\x{1F780}-\x{1F7FF}\x{1F800}-\x{1F8FF}\x{1F900}-\x{1F9FF}\x{1FA00}-\x{1FA6F}\x{1FA70}-\x{1FAFF}\x{2600}-\x{26FF}\x{2700}-\x{27BF}\x{1F1E6}-\x{1F1FF}]+`)
 	// Common combining marks removed from the text.
 	preprocessDiacriticsRe = regexp.MustCompile(`[\x{0302}\x{0303}\x{0304}\x{0305}\x{0306}\x{0307}\x{0308}\x{030A}\x{030B}\x{030C}\x{0327}\x{0328}\x{0329}\x{032A}\x{032B}\x{032C}\x{032D}\x{032E}\x{032F}]`)
 	// A single space directly before , . ! ? ; : or '
 	preprocessSpaceBeforePunctRe = regexp.MustCompile(` ([,.!?;:'])`)
 	preprocessWhitespaceRe       = regexp.MustCompile(`\s+`)
 	// Text already ends with punctuation, a quote, or a closing bracket.
 	preprocessEndPunctRe = regexp.MustCompile(`[.!?;:,'"\x{201C}\x{201D}\x{2018}\x{2019})\]}…。」』】〉》›»]$`)

 	// Dashes, quotes and stray symbols mapped to ASCII or spaces. No
 	// replacement value contains another key, so one pass is equivalent to
 	// replacing the keys one after another.
 	preprocessSymbolReplacer = strings.NewReplacer(
 		"–", "-", // en dash
 		"‑", "-", // non-breaking hyphen
 		"—", "-", // em dash
 		"¯", " ", // macron
 		"_", " ", // underscore
 		"\u201C", "\"", // left double quote
 		"\u201D", "\"", // right double quote
 		"\u2018", "'", // left single quote
 		"\u2019", "'", // right single quote
 		"´", "'", // acute accent
 		"`", "'", // grave accent
 		"[", " ", // left bracket
 		"]", " ", // right bracket
 		"|", " ", // vertical bar
 		"/", " ", // slash
 		"#", " ", // hash
 		"→", " ", // right arrow
 		"←", " ", // left arrow
 	)
 	// Symbols that are removed outright.
 	preprocessSpecialSymbolReplacer = strings.NewReplacer(
 		"♥", "",
 		"☆", "",
 		"♡", "",
 		"©", "",
 		"\\", "",
 	)
 	// Abbreviations and symbols expanded to spoken words.
 	preprocessExprReplacer = strings.NewReplacer(
 		"@", " at ",
 		"e.g.,", "for example, ",
 		"i.e.,", "that is, ",
 	)
 )

 // preprocessText normalizes raw input text before synthesis.
 //
 // It removes emoji and common combining marks, maps typographic dashes,
 // quotes and stray symbols to ASCII or spaces, expands "@", "e.g.," and
 // "i.e.,", removes the space before punctuation, collapses repeated quotes
 // and whitespace, and appends a period when the text does not already end in
 // punctuation, a quote or a closing bracket. Empty input stays empty.
 //
 // TODO: Need advanced normalizer for better performance
 // NOTE: no NFKD normalization is applied here; for full Unicode
 // normalization use golang.org/x/text/unicode/norm.
 // FIXME: emoji/diacritic stripping should be revisited for non-English text.
 func preprocessText(text string) string {
 	text = preprocessEmojiRe.ReplaceAllString(text, "")
 	text = preprocessSymbolReplacer.Replace(text)
 	text = preprocessDiacriticsRe.ReplaceAllString(text, "")
 	text = preprocessSpecialSymbolReplacer.Replace(text)
 	text = preprocessExprReplacer.Replace(text)

 	// Fix spacing around punctuation
 	text = preprocessSpaceBeforePunctRe.ReplaceAllString(text, "${1}")

 	// Remove duplicate quotes. Backticks were already rewritten to "'" above,
 	// so only these two pairs can occur.
 	for strings.Contains(text, `""`) {
 		text = strings.ReplaceAll(text, `""`, `"`)
 	}
 	for strings.Contains(text, "''") {
 		text = strings.ReplaceAll(text, "''", "'")
 	}

 	// Remove extra spaces
 	text = strings.TrimSpace(preprocessWhitespaceRe.ReplaceAllString(text, " "))

 	// If text doesn't end with punctuation, quotes, or closing brackets, add a period
 	if text != "" && !preprocessEndPunctRe.MatchString(text) {
 		text += "."
 	}
 	return text
 }
@@ -60,6 +60,42 @@ class UnicodeProcessor {
        this.indexer = Helper.loadJsonLongArray(unicodeIndexerJsonPath);
    }
    /** Inclusive {first, last} code point ranges that removeEmojis drops. */
    private static final int[][] EMOJI_RANGES = {
        {0x1F600, 0x1F64F},
        {0x1F300, 0x1F5FF},
        {0x1F680, 0x1F6FF},
        {0x1F700, 0x1F77F},
        {0x1F780, 0x1F7FF},
        {0x1F800, 0x1F8FF},
        {0x1F900, 0x1F9FF},
        {0x1FA00, 0x1FA6F},
        {0x1FA70, 0x1FAFF},
        {0x2600, 0x26FF},
        {0x2700, 0x27BF},
        {0x1F1E6, 0x1F1FF},
    };

    /**
     * Returns {@code text} with every code point inside one of the emoji
     * ranges removed. Valid surrogate pairs are treated as one code point;
     * unpaired surrogates are kept unchanged.
     */
    private static String removeEmojis(String text) {
        StringBuilder kept = new StringBuilder();
        int index = 0;
        while (index < text.length()) {
            // codePointAt yields the lone char when no valid pair starts here.
            int codePoint = Character.codePointAt(text, index);
            int width = codePoint > 0xFFFF ? 2 : 1;

            boolean drop = false;
            for (int[] range : EMOJI_RANGES) {
                if (codePoint >= range[0] && codePoint <= range[1]) {
                    drop = true;
                    break;
                }
            }
            if (!drop) {
                kept.append(text, index, index + width);
            }
            index += width;
        }
        return kept.toString();
    }
    public TextProcessResult call(List<String> textList) {
        List<String> processedTexts = new ArrayList<>();
        for (String text : textList) {
@@ -86,7 +122,86 @@ class UnicodeProcessor {
    }
    private String preprocessText(String text) {
-        return Normalizer.normalize(text, Normalizer.Form.NFKD);
+        // TODO: Need advanced normalizer for better performance
        text = Normalizer.normalize(text, Normalizer.Form.NFKD);
        // FIXME: this should be fixed for non-English languages
        // Remove emojis (wide Unicode range)
        // Java Pattern doesn't support \x{...} syntax for Unicode above \uFFFF
        // Use character filtering instead
        text = removeEmojis(text);
        // Replace various dashes and symbols
        Map<String, String> replacements = new HashMap<>();
        replacements.put("–", "-");      // en dash
        replacements.put("‑", "-");      // non-breaking hyphen
        replacements.put("—", "-");      // em dash
        replacements.put("¯", " ");      // macron
        replacements.put("_", " ");      // underscore
        replacements.put("\u201C", "\"");     // left double quote
        replacements.put("\u201D", "\"");     // right double quote
        replacements.put("\u2018", "'");      // left single quote
        replacements.put("\u2019", "'");      // right single quote
        replacements.put("´", "'");      // acute accent
        replacements.put("`", "'");      // grave accent
        replacements.put("[", " ");      // left bracket
        replacements.put("]", " ");      // right bracket
        replacements.put("|", " ");      // vertical bar
        replacements.put("/", " ");      // slash
        replacements.put("#", " ");      // hash
        replacements.put("→", " ");      // right arrow
        replacements.put("←", " ");      // left arrow
        for (Map.Entry<String, String> entry : replacements.entrySet()) {
            text = text.replace(entry.getKey(), entry.getValue());
        }
        // Remove combining diacritics // FIXME: this should be fixed for non-English languages
        text = text.replaceAll("[\\u0302\\u0303\\u0304\\u0305\\u0306\\u0307\\u0308\\u030A\\u030B\\u030C\\u0327\\u0328\\u0329\\u032A\\u032B\\u032C\\u032D\\u032E\\u032F]", "");
        // Remove special symbols
        text = text.replaceAll("[♥☆♡©\\\\]", "");
        // Replace known expressions
        Map<String, String> exprReplacements = new HashMap<>();
        exprReplacements.put("@", " at ");
        exprReplacements.put("e.g.,", "for example, ");
        exprReplacements.put("i.e.,", "that is, ");
        for (Map.Entry<String, String> entry : exprReplacements.entrySet()) {
            text = text.replace(entry.getKey(), entry.getValue());
        }
        // Fix spacing around punctuation
        text = text.replaceAll(" ,", ",");
        text = text.replaceAll(" \\.", ".");
        text = text.replaceAll(" !", "!");
        text = text.replaceAll(" \\?", "?");
        text = text.replaceAll(" ;", ";");
        text = text.replaceAll(" :", ":");
        text = text.replaceAll(" '", "'");
        // Remove duplicate quotes
        while (text.contains("\"\"")) {
            text = text.replace("\"\"", "\"");
        }
        while (text.contains("''")) {
            text = text.replace("''", "'");
        }
        while (text.contains("``")) {
            text = text.replace("``", "`");
        }
        // Remove extra spaces
        text = text.replaceAll("\\s+", " ").trim();
        // If text doesn't end with punctuation, quotes, or closing brackets, add a period
        if (!text.matches(".*[.!?;:,'\"\\u201C\\u201D\\u2018\\u2019)\\]}…。」』】〉》›»]$")) {
            text += ".";
        }
        return text;
    }
    private int[] textToUnicodeValues(String text) {
@@ -4,6 +4,8 @@ This guide provides examples for running TTS inference using `ExampleONNX.java`.
 ## 📰 Update News
 **2025.11.23** - Enhanced text preprocessing with comprehensive normalization, emoji removal, symbol replacement, and punctuation handling for improved synthesis quality.
 **2025.11.19** - Added `--speed` parameter to control speech synthesis speed (default: 1.05, recommended range: 0.9-1.5).
 **2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
@@ -4,6 +4,8 @@ Node.js implementation for TTS inference. Uses ONNX Runtime to generate speech f
 ## 📰 Update News
 **2025.11.23** - Enhanced text preprocessing with comprehensive normalization, emoji removal, symbol replacement, and punctuation handling for improved synthesis quality.
 **2025.11.19** - Added `--speed` parameter to control speech synthesis speed (default: 1.05, recommended range: 0.9-1.5).
 **2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
@@ -14,8 +14,85 @@ class UnicodeProcessor {
    }
    _preprocessText(text) {
-        // Simple NFKD normalization (JavaScript has normalize built-in)
+        // TODO: Need advanced normalizer for better performance
-        return text.normalize('NFKD');
+        text = text.normalize('NFKD');
        // FIXME: this should be fixed for non-English languages
        // Remove emojis (wide Unicode range)
        const emojiPattern = /[\u{1F600}-\u{1F64F}\u{1F300}-\u{1F5FF}\u{1F680}-\u{1F6FF}\u{1F700}-\u{1F77F}\u{1F780}-\u{1F7FF}\u{1F800}-\u{1F8FF}\u{1F900}-\u{1F9FF}\u{1FA00}-\u{1FA6F}\u{1FA70}-\u{1FAFF}\u{2600}-\u{26FF}\u{2700}-\u{27BF}\u{1F1E6}-\u{1F1FF}]+/gu;
        text = text.replace(emojiPattern, '');
        // Replace various dashes and symbols
        const replacements = {
            '–': '-',
            '‑': '-',
            '—': '-',
            '¯': ' ',
            '_': ' ',
            '"': '"',
            '"': '"',
            '\u2018': "'",  // left single quote
            '\u2019': "'",  // right single quote
            '´': "'",
            '`': "'",
            '[': ' ',
            ']': ' ',
            '|': ' ',
            '/': ' ',
            '#': ' ',
            '→': ' ',
            '←': ' ',
        };
        for (const [k, v] of Object.entries(replacements)) {
            text = text.replaceAll(k, v);
        }
        // Remove combining diacritics // FIXME: this should be fixed for non-English languages
        text = text.replace(/[\u0302\u0303\u0304\u0305\u0306\u0307\u0308\u030A\u030B\u030C\u0327\u0328\u0329\u032A\u032B\u032C\u032D\u032E\u032F]/g, '');
        // Remove special symbols
        text = text.replace(/[♥☆♡©\\]/g, '');
        // Replace known expressions
        const exprReplacements = {
            '@': ' at ',
            'e.g.,': 'for example, ',
            'i.e.,': 'that is, ',
        };
        for (const [k, v] of Object.entries(exprReplacements)) {
            text = text.replaceAll(k, v);
        }
        // Fix spacing around punctuation
        text = text.replace(/ ,/g, ',');
        text = text.replace(/ \./g, '.');
        text = text.replace(/ !/g, '!');
        text = text.replace(/ \?/g, '?');
        text = text.replace(/ ;/g, ';');
        text = text.replace(/ :/g, ':');
        text = text.replace(/ '/g, "'");
        // Remove duplicate quotes
        while (text.includes('""')) {
            text = text.replace('""', '"');
        }
        while (text.includes("''")) {
            text = text.replace("''", "'");
        }
        while (text.includes('``')) {
            text = text.replace('``', '`');
        }
        // Remove extra spaces
        text = text.replace(/\s+/g, ' ').trim();
        // If text doesn't end with punctuation, quotes, or closing brackets, add a period
        if (!/[.!?;:,'\"')\]}…。」』】〉》›»]$/.test(text)) {
            text += '.';
        }
        return text;
    }
    _textToUnicodeValues(text) {
@@ -4,6 +4,8 @@ This guide provides examples for running TTS inference using `example_onnx.py`.
 ## 📰 Update News
 **2025.11.23** - Enhanced text preprocessing with comprehensive normalization, emoji removal, symbol replacement, and punctuation handling for improved synthesis quality.
 **2025.11.19** - Added `--speed` parameter to control speech synthesis speed. Adjust the speed factor to make speech faster or slower while maintaining natural quality.
 **2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
@@ -8,6 +8,8 @@ from unicodedata import normalize
 import numpy as np
 import onnxruntime as ort
 import re
 class UnicodeProcessor:
    def __init__(self, unicode_indexer_path: str):
@@ -15,8 +17,96 @@ class UnicodeProcessor:
            self.indexer = json.load(f)
    def _preprocess_text(self, text: str) -> str:
        """Normalize raw input text.

        Applies NFKD, removes emoji, maps dashes, quotes and stray symbols to
        ASCII or spaces, expands "@", "e.g.," and "i.e.,", removes the space
        before punctuation, collapses repeated quotes and whitespace, and
        appends a period when the text does not already end in punctuation,
        a quote or a closing bracket.

        Args:
            text: Raw text to normalize.

        Returns:
            The normalized text. Empty input yields ".".
        """
-        # TODO: add more preprocessing
+        # TODO: Need advanced normalizer for better performance
        text = normalize("NFKD", text)
        # FIXME: this should be fixed for non-English languages
        # Remove emojis (wide Unicode range)
        emoji_pattern = re.compile(
            "[\U0001f600-\U0001f64f"  # emoticons
            "\U0001f300-\U0001f5ff"  # symbols & pictographs
            "\U0001f680-\U0001f6ff"  # transport & map symbols
            "\U0001f700-\U0001f77f"
            "\U0001f780-\U0001f7ff"
            "\U0001f800-\U0001f8ff"
            "\U0001f900-\U0001f9ff"
            "\U0001fa00-\U0001fa6f"
            "\U0001fa70-\U0001faff"
            "\u2600-\u26ff"
            "\u2700-\u27bf"
            "\U0001f1e6-\U0001f1ff]+",
            flags=re.UNICODE,
        )
        text = emoji_pattern.sub("", text)
        # Replace various dashes and symbols
        # Keys are replaced one by one in insertion order; no value contains
        # another key, so the replacements do not interact.
        replacements = {
            "–": "-",
            "‑": "-",
            "—": "-",
            "¯": " ",
            "_": " ",
            "“": '"',
            "”": '"',
            "‘": "'",
            "’": "'",
            "´": "'",
            "`": "'",
            "[": " ",
            "]": " ",
            "|": " ",
            "/": " ",
            "#": " ",
            "→": " ",
            "←": " ",
        }
        for k, v in replacements.items():
            text = text.replace(k, v)
        # Remove combining diacritics # FIXME: this should be fixed for non-English languages
        # Only the listed marks are removed; U+0300 and U+0301 are not in the set.
        text = re.sub(
            r"[\u0302\u0303\u0304\u0305\u0306\u0307\u0308\u030A\u030B\u030C\u0327\u0328\u0329\u032A\u032B\u032C\u032D\u032E\u032F]",
            "",
            text,
        )
        # Remove special symbols
        text = re.sub(r"[♥☆♡©\\]", "", text)
        # Replace known expressions
        expr_replacements = {
            "@": " at ",
            "e.g.,": "for example, ",
            "i.e.,": "that is, ",
        }
        for k, v in expr_replacements.items():
            text = text.replace(k, v)
        # Fix spacing around punctuation
        # Each substitution removes one space directly before the mark.
        text = re.sub(r" ,", ",", text)
        text = re.sub(r" \.", ".", text)
        text = re.sub(r" !", "!", text)
        text = re.sub(r" \?", "?", text)
        text = re.sub(r" ;", ";", text)
        text = re.sub(r" :", ":", text)
        text = re.sub(r" '", "'", text)
        # Remove duplicate quotes
        # Loop until stable so runs of three or more collapse to one quote.
        while '""' in text:
            text = text.replace('""', '"')
        while "''" in text:
            text = text.replace("''", "'")
        # Backticks were already replaced with "'" above, so this loop cannot match.
        while "``" in text:
            text = text.replace("``", "`")
        # Remove extra spaces
        text = re.sub(r"\s+", " ", text).strip()
        # If text doesn't end with punctuation, quotes, or closing brackets, add a period
        if not re.search(r"[.!?;:,'\"')\]}…。」』】〉》›»]$", text):
            text += "."
        return text
    def _get_text_mask(self, text_ids_lengths: np.ndarray) -> np.ndarray:
@@ -4,6 +4,8 @@ This guide provides examples for running TTS inference using Rust.
 ## 📰 Update News
 **2025.11.23** - Enhanced text preprocessing with comprehensive normalization, emoji removal, symbol replacement, and punctuation handling for improved synthesis quality.
 **2025.11.19** - Added `--speed` parameter to control speech synthesis speed (default: 1.05, recommended range: 0.9-1.5).
 **2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
@@ -113,7 +113,95 @@ impl UnicodeProcessor {
 }
 pub fn preprocess_text(text: &str) -> String {
-    text.nfkd().collect()
+    // TODO: Need advanced normalizer for better performance
    let mut text: String = text.nfkd().collect();
    // FIXME: this should be fixed for non-English languages
    // Remove emojis (wide Unicode range)
    let emoji_pattern = Regex::new(r"[\x{1F600}-\x{1F64F}\x{1F300}-\x{1F5FF}\x{1F680}-\x{1F6FF}\x{1F700}-\x{1F77F}\x{1F780}-\x{1F7FF}\x{1F800}-\x{1F8FF}\x{1F900}-\x{1F9FF}\x{1FA00}-\x{1FA6F}\x{1FA70}-\x{1FAFF}\x{2600}-\x{26FF}\x{2700}-\x{27BF}\x{1F1E6}-\x{1F1FF}]+").unwrap();
    text = emoji_pattern.replace_all(&text, "").to_string();
    // Replace various dashes and symbols
    let replacements = [
        ("–", "-"),      // en dash
        ("‑", "-"),      // non-breaking hyphen
        ("—", "-"),      // em dash
        ("¯", " "),      // macron
        ("_", " "),      // underscore
        ("\u{201C}", "\""),     // left double quote
        ("\u{201D}", "\""),     // right double quote
        ("\u{2018}", "'"),      // left single quote
        ("\u{2019}", "'"),      // right single quote
        ("´", "'"),      // acute accent
        ("`", "'"),      // grave accent
        ("[", " "),      // left bracket
        ("]", " "),      // right bracket
        ("|", " "),      // vertical bar
        ("/", " "),      // slash
        ("#", " "),      // hash
        ("→", " "),      // right arrow
        ("←", " "),      // left arrow
    ];
    for (from, to) in &replacements {
        text = text.replace(from, to);
    }
    // Remove combining diacritics // FIXME: this should be fixed for non-English languages
    let diacritics_pattern = Regex::new(r"[\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{030A}\u{030B}\u{030C}\u{0327}\u{0328}\u{0329}\u{032A}\u{032B}\u{032C}\u{032D}\u{032E}\u{032F}]").unwrap();
    text = diacritics_pattern.replace_all(&text, "").to_string();
    // Remove special symbols
    let special_symbols = ["♥", "☆", "♡", "©", "\\"];
    for symbol in &special_symbols {
        text = text.replace(symbol, "");
    }
    // Replace known expressions
    let expr_replacements = [
        ("@", " at "),
        ("e.g.,", "for example, "),
        ("i.e.,", "that is, "),
    ];
    for (from, to) in &expr_replacements {
        text = text.replace(from, to);
    }
    // Fix spacing around punctuation
    text = Regex::new(r" ,").unwrap().replace_all(&text, ",").to_string();
    text = Regex::new(r" \.").unwrap().replace_all(&text, ".").to_string();
    text = Regex::new(r" !").unwrap().replace_all(&text, "!").to_string();
    text = Regex::new(r" \?").unwrap().replace_all(&text, "?").to_string();
    text = Regex::new(r" ;").unwrap().replace_all(&text, ";").to_string();
    text = Regex::new(r" :").unwrap().replace_all(&text, ":").to_string();
    text = Regex::new(r" '").unwrap().replace_all(&text, "'").to_string();
    // Remove duplicate quotes
    while text.contains("\"\"") {
        text = text.replace("\"\"", "\"");
    }
    while text.contains("''") {
        text = text.replace("''", "'");
    }
    while text.contains("``") {
        text = text.replace("``", "`");
    }
    // Remove extra spaces
    text = Regex::new(r"\s+").unwrap().replace_all(&text, " ").to_string();
    text = text.trim().to_string();
    // If text doesn't end with punctuation, quotes, or closing brackets, add a period
    if !text.is_empty() {
        let ends_with_punct = Regex::new(r#"[.!?;:,'"\u{201C}\u{201D}\u{2018}\u{2019})\]}…。」』】〉》›»]$"#).unwrap();
        if !ends_with_punct.is_match(&text) {
            text.push('.');
        }
    }
    text
 }
 pub fn text_to_unicode_values(text: &str) -> Vec<usize> {
@@ -4,6 +4,8 @@ This guide provides examples for running TTS inference using `example_onnx`.
 ## 📰 Update News
 **2025.11.23** - Enhanced text preprocessing with comprehensive normalization, emoji removal, symbol replacement, and punctuation handling for improved synthesis quality.
 **2025.11.19** - Added `--speed` parameter to control speech synthesis speed (default: 1.05, recommended range: 0.9-1.5).
 **2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
@@ -72,7 +72,114 @@ class UnicodeProcessor {
 }
 /// Normalizes raw input text before it is converted to Unicode values.
 ///
 /// Steps, in order: NFKD decomposition, emoji removal, dash/quote/symbol
 /// replacement, removal of selected combining diacritics, removal of a few
 /// special symbols, expansion of "@", "e.g.," and "i.e.,", removal of the
 /// space before punctuation, collapsing of duplicate quotes and whitespace,
 /// and finally appending a period when non-empty text does not already end
 /// with punctuation, a quote, or a closing bracket.
 ///
 /// - Parameter text: The raw input string.
 /// - Returns: The normalized string; an empty result is returned unchanged.
 func preprocessText(_ text: String) -> String {
    // TODO: Need advanced normalizer for better performance
    // Use NFKD (compatibility *decomposition*), not NFKC: the diacritic
    // stripping below only works on decomposed combining marks, and the other
    // language ports normalize with NFKD as well.
    var text = text.decomposedStringWithCompatibilityMapping

    // FIXME: this should be fixed for non-English languages
    // Remove emojis (wide Unicode range)
    // Swift NSRegularExpression doesn't support Unicode escapes above \uFFFF
    // Use character filtering instead
    text = text.unicodeScalars.filter { scalar in
        let value = scalar.value
        return !((value >= 0x1F600 && value <= 0x1F64F) ||
                 (value >= 0x1F300 && value <= 0x1F5FF) ||
                 (value >= 0x1F680 && value <= 0x1F6FF) ||
                 (value >= 0x1F700 && value <= 0x1F77F) ||
                 (value >= 0x1F780 && value <= 0x1F7FF) ||
                 (value >= 0x1F800 && value <= 0x1F8FF) ||
                 (value >= 0x1F900 && value <= 0x1F9FF) ||
                 (value >= 0x1FA00 && value <= 0x1FA6F) ||
                 (value >= 0x1FA70 && value <= 0x1FAFF) ||
                 (value >= 0x2600 && value <= 0x26FF) ||
                 (value >= 0x2700 && value <= 0x27BF) ||
                 (value >= 0x1F1E6 && value <= 0x1F1FF))
    }.map { String($0) }.joined()

    // Replace various dashes and symbols.
    // An ordered array (rather than a dictionary) keeps the application order
    // deterministic.
    let replacements: [(String, String)] = [
        ("\u{2013}", "-"),      // en dash
        ("\u{2011}", "-"),      // non-breaking hyphen
        ("\u{2014}", "-"),      // em dash
        ("\u{00AF}", " "),      // macron
        ("_", " "),             // underscore
        ("\u{201C}", "\""),     // left double quote
        ("\u{201D}", "\""),     // right double quote
        ("\u{2018}", "'"),      // left single quote
        ("\u{2019}", "'"),      // right single quote
        ("\u{00B4}", "'"),      // acute accent
        ("`", "'"),             // grave accent
        ("[", " "),             // left bracket
        ("]", " "),             // right bracket
        ("|", " "),             // vertical bar
        ("/", " "),             // slash
        ("#", " "),             // hash
        ("\u{2192}", " "),      // right arrow
        ("\u{2190}", " "),      // left arrow
    ]
    for (old, new) in replacements {
        text = text.replacingOccurrences(of: old, with: new)
    }

    // Remove combining diacritics // FIXME: this should be fixed for non-English languages
    let diacriticsPattern = try! NSRegularExpression(pattern: "[\\u0302\\u0303\\u0304\\u0305\\u0306\\u0307\\u0308\\u030A\\u030B\\u030C\\u0327\\u0328\\u0329\\u032A\\u032B\\u032C\\u032D\\u032E\\u032F]")
    let diacriticsRange = NSRange(text.startIndex..., in: text)
    text = diacriticsPattern.stringByReplacingMatches(in: text, range: diacriticsRange, withTemplate: "")

    // Remove special symbols
    let specialSymbols = ["♥", "☆", "♡", "©", "\\"]
    for symbol in specialSymbols {
        text = text.replacingOccurrences(of: symbol, with: "")
    }

    // Replace known expressions
    let exprReplacements: [(String, String)] = [
        ("@", " at "),
        ("e.g.,", "for example, "),
        ("i.e.,", "that is, "),
    ]
    for (old, new) in exprReplacements {
        text = text.replacingOccurrences(of: old, with: new)
    }

    // Fix spacing around punctuation: drop a single space directly before each mark
    for mark in [",", ".", "!", "?", ";", ":", "'"] {
        text = text.replacingOccurrences(of: " " + mark, with: mark)
    }

    // Remove duplicate quotes (loop until no adjacent pair is left)
    while text.contains("\"\"") {
        text = text.replacingOccurrences(of: "\"\"", with: "\"")
    }
    while text.contains("''") {
        text = text.replacingOccurrences(of: "''", with: "'")
    }
    while text.contains("``") {
        text = text.replacingOccurrences(of: "``", with: "`")
    }

    // Remove extra spaces
    let whitespacePattern = try! NSRegularExpression(pattern: "\\s+")
    let whitespaceRange = NSRange(text.startIndex..., in: text)
    text = whitespacePattern.stringByReplacingMatches(in: text, range: whitespaceRange, withTemplate: " ")
    text = text.trimmingCharacters(in: .whitespacesAndNewlines)

    // If text doesn't end with punctuation, quotes, or closing brackets, add a period
    if !text.isEmpty {
        let punctPattern = try! NSRegularExpression(pattern: "[.!?;:,'\"\\u201C\\u201D\\u2018\\u2019)\\]}…。」』】〉》›»]$")
        let punctRange = NSRange(text.startIndex..., in: text)
        if punctPattern.firstMatch(in: text, range: punctRange) == nil {
            text += "."
        }
    }

    return text
 }
 func lengthToMask(_ lengths: [Int], maxLen: Int? = nil) -> [[[Float]]] {
@@ -4,6 +4,8 @@ This example demonstrates how to use Supertonic in a web browser using ONNX Runt
 ## 📰 Update News
 **2025.11.23** - Enhanced text preprocessing with comprehensive normalization, emoji removal, symbol replacement, and punctuation handling for improved synthesis quality.
 **2025.11.19** - Added speed control slider to adjust speech synthesis speed (default: 1.05, recommended range: 0.9-1.5).
 **2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
@@ -28,7 +28,85 @@ export class UnicodeProcessor {
    }
    preprocessText(text) {
-        return text.normalize('NFKC');
+        // TODO: Need advanced normalizer for better performance
        text = text.normalize('NFKD');
        // FIXME: this should be fixed for non-English languages
        // Remove emojis (wide Unicode range)
        const emojiPattern = /[\u{1F600}-\u{1F64F}\u{1F300}-\u{1F5FF}\u{1F680}-\u{1F6FF}\u{1F700}-\u{1F77F}\u{1F780}-\u{1F7FF}\u{1F800}-\u{1F8FF}\u{1F900}-\u{1F9FF}\u{1FA00}-\u{1FA6F}\u{1FA70}-\u{1FAFF}\u{2600}-\u{26FF}\u{2700}-\u{27BF}\u{1F1E6}-\u{1F1FF}]+/gu;
        text = text.replace(emojiPattern, '');
        // Replace various dashes and symbols
        const replacements = {
            '–': '-',
            '‑': '-',
            '—': '-',
            '¯': ' ',
            '_': ' ',
            '"': '"',
            '"': '"',
            '\u2018': "'",  // left single quote
            '\u2019': "'",  // right single quote
            '´': "'",
            '`': "'",
            '[': ' ',
            ']': ' ',
            '|': ' ',
            '/': ' ',
            '#': ' ',
            '→': ' ',
            '←': ' ',
        };
        for (const [k, v] of Object.entries(replacements)) {
            text = text.replaceAll(k, v);
        }
        // Remove combining diacritics // FIXME: this should be fixed for non-English languages
        text = text.replace(/[\u0302\u0303\u0304\u0305\u0306\u0307\u0308\u030A\u030B\u030C\u0327\u0328\u0329\u032A\u032B\u032C\u032D\u032E\u032F]/g, '');
        // Remove special symbols
        text = text.replace(/[♥☆♡©\\]/g, '');
        // Replace known expressions
        const exprReplacements = {
            '@': ' at ',
            'e.g.,': 'for example, ',
            'i.e.,': 'that is, ',
        };
        for (const [k, v] of Object.entries(exprReplacements)) {
            text = text.replaceAll(k, v);
        }
        // Fix spacing around punctuation
        text = text.replace(/ ,/g, ',');
        text = text.replace(/ \./g, '.');
        text = text.replace(/ !/g, '!');
        text = text.replace(/ \?/g, '?');
        text = text.replace(/ ;/g, ';');
        text = text.replace(/ :/g, ':');
        text = text.replace(/ '/g, "'");
        // Remove duplicate quotes
        while (text.includes('""')) {
            text = text.replace('""', '"');
        }
        while (text.includes("''")) {
            text = text.replace("''", "'");
        }
        while (text.includes('``')) {
            text = text.replace('``', '`');
        }
        // Remove extra spaces
        text = text.replace(/\s+/g, ' ').trim();
        // If text doesn't end with punctuation, quotes, or closing brackets, add a period
        if (!/[.!?;:,'\"')\]}…。」』】〉》›»]$/.test(text)) {
            text += '.';
        }
        return text;
    }
    getTextMask(textIdsLengths) {