Fix text normalization bug (#16)

2026-06-02 01:38:48 +02:00 · 2025-11-23 13:18:15 +09:00
parent 9015bd095f
commit 8d42b55965
18 changed files with 966 additions and 28 deletions
@@ -4,6 +4,8 @@ High-performance text-to-speech inference using ONNX Runtime.

 ## 📰 Update News

+**2025.11.23** - Enhanced text preprocessing with comprehensive normalization, emoji removal, symbol replacement, and punctuation handling for improved synthesis quality.
+
 **2025.11.19** - Added `--speed` parameter to control speech synthesis speed (default: 1.05, recommended range: 0.9-1.5).

 **2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
@@ -19,6 +19,24 @@ void clearTensorBuffers() {
    g_tensor_buffers_int64.clear();
 }

+// ============================================================================
+// Helper function - trim
+// ============================================================================
+
+// Returns a copy of `str` without its leading and trailing whitespace, as
+// classified by std::isspace. Empty or all-whitespace input yields "".
+static std::string trim(const std::string& str) {
+    // std::isspace requires a value representable as unsigned char.
+    const auto is_ws = [](char c) {
+        return std::isspace(static_cast<unsigned char>(c)) != 0;
+    };
+
+    // Index of the first character to keep.
+    size_t first = 0;
+    for (; first < str.size(); ++first) {
+        if (!is_ws(str[first])) {
+            break;
+        }
+    }
+
+    // One past the last character to keep; never moves left of `first`.
+    size_t last = str.size();
+    for (; last > first; --last) {
+        if (!is_ws(str[last - 1])) {
+            break;
+        }
+    }
+
+    return str.substr(first, last - first);
+}
+
 // ============================================================================
 // UnicodeProcessor implementation
 // ============================================================================
@@ -28,10 +46,148 @@ UnicodeProcessor::UnicodeProcessor(const std::string& unicode_indexer_json_path)
 }

 std::string UnicodeProcessor::preprocessText(const std::string& text) {
-    // Simple NFKD normalization (C++ doesn't have built-in Unicode normalization)
-    // For now, just return the text as-is
-    // TODO: add proper Unicode normalization
-    return text;
+    // TODO: Need advanced normalizer for better performance
+    // NOTE: C++ doesn't have built-in Unicode normalization like Python's NFKD
+    // For full Unicode normalization, consider using ICU library
+    // This implementation handles basic text preprocessing
+    
+    std::string result = text;
+    
+    // FIXME: this should be fixed for non-English languages
+    
+    // Remove emojis and various Unicode symbols
+    // Using regex to remove common emoji ranges and special symbols
+    // Note: This is a simplified version - full emoji support needs UTF-8 handling
+    std::regex emoji_pattern(
+        "[\xF0\x9F][\x80-\xBF]{2}|"  // Common emoji pattern in UTF-8
+        "[\xE2][\x80-\xBF]{2}|"       // Various symbols
+        "[\xE2][\x98-\x9E][\x80-\xBF]" // More symbols
+    );
+    result = std::regex_replace(result, emoji_pattern, "");
+    
+    // Replace various dashes and symbols
+    struct Replacement {
+        const char* from;
+        const char* to;
+    };
+    
+    const Replacement replacements[] = {
+        {"–", "-"},      // en dash
+        {"‑", "-"},      // non-breaking hyphen
+        {"—", "-"},      // em dash
+        {"¯", " "},      // macron
+        {"_", " "},      // underscore
+        {""", "\""},     // left double quote (U+201C)
+        {""", "\""},     // right double quote (U+201D)
+        {"'", "'"},      // left single quote (U+2018)
+        {"'", "'"},      // right single quote (U+2019)
+        {"´", "'"},      // acute accent
+        {"`", "'"},      // grave accent
+        {"[", " "},      // left bracket
+        {"]", " "},      // right bracket
+        {"|", " "},      // vertical bar
+        {"/", " "},      // slash
+        {"#", " "},      // hash
+        {"→", " "},      // right arrow
+        {"←", " "},      // left arrow
+    };
+    
+    for (const auto& repl : replacements) {
+        size_t pos = 0;
+        while ((pos = result.find(repl.from, pos)) != std::string::npos) {
+            result.replace(pos, strlen(repl.from), repl.to);
+            pos += strlen(repl.to);
+        }
+    }
+    
+    // Remove combining diacritics (common combining marks in UTF-8)
+    // FIXME: this should be fixed for non-English languages
+    std::regex diacritics_pattern(
+        "[\xCC\xCD][\x80-\xBF]"  // Combining diacritical marks range
+    );
+    result = std::regex_replace(result, diacritics_pattern, "");
+    
+    // Remove special symbols
+    const char* special_symbols[] = {"♥", "☆", "♡", "©", "\\"};
+    for (const char* symbol : special_symbols) {
+        size_t pos = 0;
+        while ((pos = result.find(symbol, pos)) != std::string::npos) {
+            result.erase(pos, strlen(symbol));
+        }
+    }
+    
+    // Replace known expressions
+    const Replacement expr_replacements[] = {
+        {"@", " at "},
+        {"e.g.,", "for example, "},
+        {"i.e.,", "that is, "},
+    };
+    
+    for (const auto& repl : expr_replacements) {
+        size_t pos = 0;
+        while ((pos = result.find(repl.from, pos)) != std::string::npos) {
+            result.replace(pos, strlen(repl.from), repl.to);
+            pos += strlen(repl.to);
+        }
+    }
+    
+    // Fix spacing around punctuation
+    result = std::regex_replace(result, std::regex(" ,"), ",");
+    result = std::regex_replace(result, std::regex(" \\."), ".");
+    result = std::regex_replace(result, std::regex(" !"), "!");
+    result = std::regex_replace(result, std::regex(" \\?"), "?");
+    result = std::regex_replace(result, std::regex(" ;"), ";");
+    result = std::regex_replace(result, std::regex(" :"), ":");
+    result = std::regex_replace(result, std::regex(" '"), "'");
+    
+    // Remove duplicate quotes
+    while (result.find("\"\"") != std::string::npos) {
+        size_t pos = result.find("\"\"");
+        result.replace(pos, 2, "\"");
+    }
+    while (result.find("''") != std::string::npos) {
+        size_t pos = result.find("''");
+        result.replace(pos, 2, "'");
+    }
+    while (result.find("``") != std::string::npos) {
+        size_t pos = result.find("``");
+        result.replace(pos, 2, "`");
+    }
+    
+    // Remove extra spaces
+    result = std::regex_replace(result, std::regex("\\s+"), " ");
+    result = trim(result);
+    
+    // If text doesn't end with punctuation, quotes, or closing brackets, add a period
+    if (!result.empty()) {
+        char last_char = result.back();
+        bool ends_with_punct = (
+            last_char == '.' || last_char == '!' || last_char == '?' ||
+            last_char == ';' || last_char == ':' || last_char == ',' ||
+            last_char == '\'' || last_char == '"' || last_char == ')' ||
+            last_char == ']' || last_char == '}' || last_char == '>'
+        );
+        
+        // Check for UTF-8 multibyte ending punctuation (e.g., …, 。, curly quotes, etc.)
+        if (!ends_with_punct && result.size() >= 3) {
+            std::string last_three = result.substr(result.size() - 3);
+            if (last_three == "…" || last_three == "。" || 
+                last_three == "」" || last_three == "』" ||
+                last_three == "】" || last_three == "〉" ||
+                last_three == "》" || last_three == "›" ||
+                last_three == "»" || last_three == """ ||
+                last_three == """ || last_three == "'" ||
+                last_three == "'") {
+                ends_with_punct = true;
+            }
+        }
+        
+        if (!ends_with_punct) {
+            result += ".";
+        }
+    }
+    
+    return result;
 }

 std::vector<uint16_t> UnicodeProcessor::textToUnicodeValues(const std::string& text) {
@@ -772,20 +928,6 @@ std::string sanitizeFilename(const std::string& text, int max_len) {
 // Chunk text
 // ============================================================================

-static std::string trim(const std::string& str) {
-    size_t start = 0;
-    while (start < str.size() && std::isspace(static_cast<unsigned char>(str[start]))) {
-        start++;
-    }
-    
-    size_t end = str.size();
-    while (end > start && std::isspace(static_cast<unsigned char>(str[end - 1]))) {
-        end--;
-    }
-    
-    return str.substr(start, end - start);
-}
-
 std::vector<std::string> chunkText(const std::string& text, int max_len) {
    std::vector<std::string> chunks;
    
@@ -71,10 +71,144 @@ namespace Supertonic
            }
        }

+        // Returns a copy of text with every code point from the emoji and
+        // pictograph ranges removed. All other characters, including unpaired
+        // surrogates, are copied through unchanged.
+        private static string RemoveEmojis(string text)
+        {
+            var kept = new StringBuilder(text.Length);
+            int index = 0;
+            while (index < text.Length)
+            {
+                // A high surrogate directly followed by a low surrogate forms one
+                // code point spanning two chars; anything else is a single char.
+                bool isPair = char.IsHighSurrogate(text[index])
+                              && index + 1 < text.Length
+                              && char.IsLowSurrogate(text[index + 1]);
+                int width = isPair ? 2 : 1;
+                int scalar = isPair
+                    ? char.ConvertToUtf32(text[index], text[index + 1])
+                    : text[index];
+
+                // Adjacent ranges are merged: U+2600-U+27BF, U+1F1E6-U+1F1FF,
+                // U+1F300-U+1F64F and U+1F680-U+1FAFF.
+                bool dropped = (scalar >= 0x2600 && scalar <= 0x27BF) ||
+                               (scalar >= 0x1F1E6 && scalar <= 0x1F1FF) ||
+                               (scalar >= 0x1F300 && scalar <= 0x1F64F) ||
+                               (scalar >= 0x1F680 && scalar <= 0x1FAFF);
+
+                if (!dropped)
+                {
+                    // Copy the original char(s) instead of re-encoding the code point.
+                    kept.Append(text, index, width);
+                }
+                index += width;
+            }
+            return kept.ToString();
+        }
+
+        // Normalizes raw input text before synthesis. The passes run in this
+        // order: NFKD normalization, emoji removal, symbol replacement,
+        // combining-mark and special-symbol removal, expression expansion,
+        // punctuation spacing, quote/whitespace collapsing, and finally a
+        // trailing "." when the text does not already end with punctuation,
+        // a quote or a closing bracket. Returns the processed text.
         private string PreprocessText(string text)
         {
-            // Simple normalization (C# has Normalize built-in)
-            return text.Normalize(NormalizationForm.FormKD);
+            // TODO: Need advanced normalizer for better performance
+            text = text.Normalize(NormalizationForm.FormKD);
+
+            // FIXME: this should be fixed for non-English languages
+
+            // Remove emojis (wide Unicode range)
+            // C# doesn't support \u{...} syntax in regex, so we use character filtering instead
+            text = RemoveEmojis(text);
+
+            // Replace various dashes and symbols
+            // NOTE(review): NFKD has already run at this point, so keys that have a
+            // compatibility decomposition (e.g. U+2011, U+00AF, U+00B4) may never
+            // match here - confirm against the Unicode decomposition data.
+            var replacements = new Dictionary<string, string>
+            {
+                {"–", "-"},      // en dash
+                {"‑", "-"},      // non-breaking hyphen
+                {"—", "-"},      // em dash
+                {"¯", " "},      // macron
+                {"_", " "},      // underscore
+                {"\u201C", "\""},     // left double quote
+                {"\u201D", "\""},     // right double quote
+                {"\u2018", "'"},      // left single quote
+                {"\u2019", "'"},      // right single quote
+                {"´", "'"},      // acute accent
+                {"`", "'"},      // grave accent
+                {"[", " "},      // left bracket
+                {"]", " "},      // right bracket
+                {"|", " "},      // vertical bar
+                {"/", " "},      // slash
+                {"#", " "},      // hash
+                {"→", " "},      // right arrow
+                {"←", " "},      // left arrow
+            };
+
+            // No key appears inside another key or inside any replacement value,
+            // so the enumeration order of the dictionary does not affect the result.
+            foreach (var kvp in replacements)
+            {
+                text = text.Replace(kvp.Key, kvp.Value);
+            }
+
+            // Remove combining diacritics // FIXME: this should be fixed for non-English languages
+            text = Regex.Replace(text, @"[\u0302\u0303\u0304\u0305\u0306\u0307\u0308\u030A\u030B\u030C\u0327\u0328\u0329\u032A\u032B\u032C\u032D\u032E\u032F]", "");
+
+            // Remove special symbols
+            text = Regex.Replace(text, @"[♥☆♡©\\]", "");
+
+            // Replace known expressions
+            var exprReplacements = new Dictionary<string, string>
+            {
+                {"@", " at "},
+                {"e.g.,", "for example, "},
+                {"i.e.,", "that is, "},
+            };
+
+            foreach (var kvp in exprReplacements)
+            {
+                text = text.Replace(kvp.Key, kvp.Value);
+            }
+
+            // Fix spacing around punctuation
+            // Each pattern removes the single space directly in front of the mark.
+            text = Regex.Replace(text, @" ,", ",");
+            text = Regex.Replace(text, @" \.", ".");
+            text = Regex.Replace(text, @" !", "!");
+            text = Regex.Replace(text, @" \?", "?");
+            text = Regex.Replace(text, @" ;", ";");
+            text = Regex.Replace(text, @" :", ":");
+            text = Regex.Replace(text, @" '", "'");
+
+            // Remove duplicate quotes
+            // The loops repeat until no doubled quote is left, so a run of any
+            // length collapses to a single character.
+            while (text.Contains("\"\""))
+            {
+                text = text.Replace("\"\"", "\"");
+            }
+            while (text.Contains("''"))
+            {
+                text = text.Replace("''", "'");
+            }
+            while (text.Contains("``"))
+            {
+                text = text.Replace("``", "`");
+            }
+
+            // Remove extra spaces
+            text = Regex.Replace(text, @"\s+", " ").Trim();
+
+            // If text doesn't end with punctuation, quotes, or closing brackets, add a period
+            // NOTE(review): there is no emptiness check, so an empty result also
+            // receives "." - confirm this is intended.
+            if (!Regex.IsMatch(text, @"[.!?;:,'\u0022\u201C\u201D\u2018\u2019)\]}…。」』】〉》›»]$"))
+            {
+                text += ".";
+            }
+
+            return text;
         }

        private int[] TextToUnicodeValues(string text)
@@ -4,6 +4,8 @@ This guide provides examples for running TTS inference using `ExampleONNX.cs`.

 ## 📰 Update News

+**2025.11.23** - Enhanced text preprocessing with comprehensive normalization, emoji removal, symbol replacement, and punctuation handling for improved synthesis quality.
+
 **2025.11.19** - Added `--speed` parameter to control speech synthesis speed (default: 1.05, recommended range: 0.9-1.5).

 **2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
@@ -4,6 +4,8 @@ This guide provides examples for running TTS inference using `example_onnx.go`.

 ## 📰 Update News

+**2025.11.23** - Enhanced text preprocessing with comprehensive normalization, emoji removal, symbol replacement, and punctuation handling for improved synthesis quality.
+
 **2025.11.19** - Added `--speed` parameter to control speech synthesis speed (default: 1.05, recommended range: 0.9-1.5).

 **2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
@@ -327,8 +327,97 @@ func splitSentences(text string) []string {

 // Utility functions
+// preprocessText cleans raw input text before synthesis. In order, it removes
+// emoji, replaces typographic symbols with ASCII, strips selected combining
+// marks and special symbols, expands "@", "e.g.," and "i.e.,", removes the
+// space in front of punctuation, collapses repeated quotes and whitespace, and
+// appends "." to non-empty text that does not already end with punctuation,
+// a quote or a closing bracket.
 func preprocessText(text string) string {
-	// Simple normalization (Go doesn't have built-in NFKD normalization)
+	// TODO: Need advanced normalizer for better performance
+	// NOTE: Go doesn't have built-in NFKD normalization like Python
 	// For full Unicode normalization, use golang.org/x/text/unicode/norm
+	// This implementation handles basic text preprocessing
+
+	// FIXME: this should be fixed for non-English languages
+
+	// Remove emojis and various Unicode symbols
+	// NOTE(review): this and the other patterns below are compiled on every
+	// call; consider hoisting them to package-level variables.
+	emojiPattern := regexp.MustCompile(`[\x{1F600}-\x{1F64F}\x{1F300}-\x{1F5FF}\x{1F680}-\x{1F6FF}\x{1F700}-\x{1F77F}\x{1F780}-\x{1F7FF}\x{1F800}-\x{1F8FF}\x{1F900}-\x{1F9FF}\x{1FA00}-\x{1FA6F}\x{1FA70}-\x{1FAFF}\x{2600}-\x{26FF}\x{2700}-\x{27BF}\x{1F1E6}-\x{1F1FF}]+`)
+	text = emojiPattern.ReplaceAllString(text, "")
+
+	// Replace various dashes and symbols
+	replacements := map[string]string{
+		"–": "-",    // en dash
+		"‑": "-",    // non-breaking hyphen
+		"—": "-",    // em dash
+		"¯": " ",    // macron
+		"_": " ",    // underscore
+		"\u201C": "\"",   // left double quote
+		"\u201D": "\"",   // right double quote
+		"\u2018": "'",    // left single quote
+		"\u2019": "'",    // right single quote
+		"´": "'",    // acute accent
+		"`": "'",    // grave accent
+		"[": " ",    // left bracket
+		"]": " ",    // right bracket
+		"|": " ",    // vertical bar
+		"/": " ",    // slash
+		"#": " ",    // hash
+		"→": " ",    // right arrow
+		"←": " ",    // left arrow
+	}
+
+	// Map iteration order is unspecified in Go; that is harmless here because
+	// no key appears inside another key or inside any replacement value.
+	// Note that the loop variable `new` shadows the builtin of the same name.
+	for old, new := range replacements {
+		text = strings.ReplaceAll(text, old, new)
+	}
+
+	// Remove combining diacritics (common combining marks)
+	// FIXME: this should be fixed for non-English languages
+	diacriticsPattern := regexp.MustCompile(`[\x{0302}\x{0303}\x{0304}\x{0305}\x{0306}\x{0307}\x{0308}\x{030A}\x{030B}\x{030C}\x{0327}\x{0328}\x{0329}\x{032A}\x{032B}\x{032C}\x{032D}\x{032E}\x{032F}]`)
+	text = diacriticsPattern.ReplaceAllString(text, "")
+
+	// Remove special symbols
+	specialSymbols := []string{"♥", "☆", "♡", "©", "\\"}
+	for _, symbol := range specialSymbols {
+		text = strings.ReplaceAll(text, symbol, "")
+	}
+
+	// Replace known expressions
+	exprReplacements := map[string]string{
+		"@":     " at ",
+		"e.g.,": "for example, ",
+		"i.e.,": "that is, ",
+	}
+
+	for old, new := range exprReplacements {
+		text = strings.ReplaceAll(text, old, new)
+	}
+
+	// Fix spacing around punctuation
+	// Each pattern removes the single space directly in front of the mark.
+	text = regexp.MustCompile(` ,`).ReplaceAllString(text, ",")
+	text = regexp.MustCompile(` \.`).ReplaceAllString(text, ".")
+	text = regexp.MustCompile(` !`).ReplaceAllString(text, "!")
+	text = regexp.MustCompile(` \?`).ReplaceAllString(text, "?")
+	text = regexp.MustCompile(` ;`).ReplaceAllString(text, ";")
+	text = regexp.MustCompile(` :`).ReplaceAllString(text, ":")
+	text = regexp.MustCompile(` '`).ReplaceAllString(text, "'")
+
+	// Remove duplicate quotes
+	// The loops repeat until no doubled quote is left, so a run of any length
+	// collapses to a single character.
+	for strings.Contains(text, `""`) {
+		text = strings.ReplaceAll(text, `""`, `"`)
+	}
+	for strings.Contains(text, "''") {
+		text = strings.ReplaceAll(text, "''", "'")
+	}
+	for strings.Contains(text, "``") {
+		text = strings.ReplaceAll(text, "``", "`")
+	}
+
+	// Remove extra spaces
+	text = regexp.MustCompile(`\s+`).ReplaceAllString(text, " ")
+	text = strings.TrimSpace(text)
+
+	// If text doesn't end with punctuation, quotes, or closing brackets, add a period
+	// Empty text is returned unchanged (no period is added).
+	if text != "" {
+		endsWithPunct := regexp.MustCompile(`[.!?;:,'"\x{201C}\x{201D}\x{2018}\x{2019})\]}…。」』】〉》›»]$`)
+		if !endsWithPunct.MatchString(text) {
+			text += "."
+		}
+	}
+
 	return text
 }

@@ -60,6 +60,42 @@ class UnicodeProcessor {
        this.indexer = Helper.loadJsonLongArray(unicodeIndexerJsonPath);
    }
    
+    /**
+     * Returns a copy of {@code text} with every code point from the emoji and
+     * pictograph ranges removed. All other characters, including unpaired
+     * surrogates, are copied through unchanged.
+     */
+    private static String removeEmojis(String text) {
+        StringBuilder kept = new StringBuilder(text.length());
+        int index = 0;
+        while (index < text.length()) {
+            // codePointAt yields the supplementary code point for a valid
+            // surrogate pair and the plain char value in every other case.
+            int codePoint = text.codePointAt(index);
+            int width = Character.charCount(codePoint);
+
+            // Adjacent ranges are merged: U+2600-U+27BF, U+1F1E6-U+1F1FF,
+            // U+1F300-U+1F64F and U+1F680-U+1FAFF.
+            boolean dropped = (codePoint >= 0x2600 && codePoint <= 0x27BF) ||
+                              (codePoint >= 0x1F1E6 && codePoint <= 0x1F1FF) ||
+                              (codePoint >= 0x1F300 && codePoint <= 0x1F64F) ||
+                              (codePoint >= 0x1F680 && codePoint <= 0x1FAFF);
+
+            if (!dropped) {
+                // Copy the original char(s) instead of re-encoding the code point.
+                kept.append(text, index, index + width);
+            }
+            index += width;
+        }
+        return kept.toString();
+    }
+    
+    
    public TextProcessResult call(List<String> textList) {
        List<String> processedTexts = new ArrayList<>();
        for (String text : textList) {
@@ -86,7 +122,86 @@ class UnicodeProcessor {
    }
    
+    /**
+     * Normalizes raw input text before synthesis. The passes run in this order:
+     * NFKD normalization, emoji removal, symbol replacement, combining-mark and
+     * special-symbol removal, expression expansion, punctuation spacing,
+     * quote/whitespace collapsing, and finally a trailing "." when the text does
+     * not already end with punctuation, a quote or a closing bracket.
+     *
+     * @param text raw input text
+     * @return the processed text
+     */
     private String preprocessText(String text) {
-        return Normalizer.normalize(text, Normalizer.Form.NFKD);
+        // TODO: Need advanced normalizer for better performance
+        text = Normalizer.normalize(text, Normalizer.Form.NFKD);
+
+        // FIXME: this should be fixed for non-English languages
+
+        // Remove emojis (wide Unicode range)
+        // Java Pattern doesn't support \x{...} syntax for Unicode above \uFFFF
+        // Use character filtering instead
+        text = removeEmojis(text);
+
+        // Replace various dashes and symbols
+        // NOTE(review): NFKD has already run at this point, so keys that have a
+        // compatibility decomposition (e.g. U+2011, U+00AF, U+00B4) may never
+        // match here - confirm against the Unicode decomposition data.
+        Map<String, String> replacements = new HashMap<>();
+        replacements.put("–", "-");      // en dash
+        replacements.put("‑", "-");      // non-breaking hyphen
+        replacements.put("—", "-");      // em dash
+        replacements.put("¯", " ");      // macron
+        replacements.put("_", " ");      // underscore
+        replacements.put("\u201C", "\"");     // left double quote
+        replacements.put("\u201D", "\"");     // right double quote
+        replacements.put("\u2018", "'");      // left single quote
+        replacements.put("\u2019", "'");      // right single quote
+        replacements.put("´", "'");      // acute accent
+        replacements.put("`", "'");      // grave accent
+        replacements.put("[", " ");      // left bracket
+        replacements.put("]", " ");      // right bracket
+        replacements.put("|", " ");      // vertical bar
+        replacements.put("/", " ");      // slash
+        replacements.put("#", " ");      // hash
+        replacements.put("→", " ");      // right arrow
+        replacements.put("←", " ");      // left arrow
+
+        // HashMap iteration order is unspecified; that is harmless here because
+        // no key appears inside another key or inside any replacement value.
+        for (Map.Entry<String, String> entry : replacements.entrySet()) {
+            text = text.replace(entry.getKey(), entry.getValue());
+        }
+
+        // Remove combining diacritics // FIXME: this should be fixed for non-English languages
+        text = text.replaceAll("[\\u0302\\u0303\\u0304\\u0305\\u0306\\u0307\\u0308\\u030A\\u030B\\u030C\\u0327\\u0328\\u0329\\u032A\\u032B\\u032C\\u032D\\u032E\\u032F]", "");
+
+        // Remove special symbols
+        text = text.replaceAll("[♥☆♡©\\\\]", "");
+
+        // Replace known expressions
+        Map<String, String> exprReplacements = new HashMap<>();
+        exprReplacements.put("@", " at ");
+        exprReplacements.put("e.g.,", "for example, ");
+        exprReplacements.put("i.e.,", "that is, ");
+
+        for (Map.Entry<String, String> entry : exprReplacements.entrySet()) {
+            text = text.replace(entry.getKey(), entry.getValue());
+        }
+
+        // Fix spacing around punctuation
+        // Each pattern removes the single space directly in front of the mark.
+        text = text.replaceAll(" ,", ",");
+        text = text.replaceAll(" \\.", ".");
+        text = text.replaceAll(" !", "!");
+        text = text.replaceAll(" \\?", "?");
+        text = text.replaceAll(" ;", ";");
+        text = text.replaceAll(" :", ":");
+        text = text.replaceAll(" '", "'");
+
+        // Remove duplicate quotes
+        // The loops repeat until no doubled quote is left, so a run of any
+        // length collapses to a single character.
+        while (text.contains("\"\"")) {
+            text = text.replace("\"\"", "\"");
+        }
+        while (text.contains("''")) {
+            text = text.replace("''", "'");
+        }
+        while (text.contains("``")) {
+            text = text.replace("``", "`");
+        }
+
+        // Remove extra spaces
+        text = text.replaceAll("\\s+", " ").trim();
+
+        // If text doesn't end with punctuation, quotes, or closing brackets, add a period
+        // NOTE(review): there is no emptiness check, so an empty result also
+        // receives "." - confirm this is intended.
+        if (!text.matches(".*[.!?;:,'\"\\u201C\\u201D\\u2018\\u2019)\\]}…。」』】〉》›»]$")) {
+            text += ".";
+        }
+
+        return text;
     }
    
    private int[] textToUnicodeValues(String text) {
@@ -4,6 +4,8 @@ This guide provides examples for running TTS inference using `ExampleONNX.java`.

 ## 📰 Update News

+**2025.11.23** - Enhanced text preprocessing with comprehensive normalization, emoji removal, symbol replacement, and punctuation handling for improved synthesis quality.
+
 **2025.11.19** - Added `--speed` parameter to control speech synthesis speed (default: 1.05, recommended range: 0.9-1.5).

 **2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
@@ -4,6 +4,8 @@ Node.js implementation for TTS inference. Uses ONNX Runtime to generate speech f

 ## 📰 Update News

+**2025.11.23** - Enhanced text preprocessing with comprehensive normalization, emoji removal, symbol replacement, and punctuation handling for improved synthesis quality.
+
 **2025.11.19** - Added `--speed` parameter to control speech synthesis speed (default: 1.05, recommended range: 0.9-1.5).

 **2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
@@ -14,8 +14,85 @@ class UnicodeProcessor {
    }

    _preprocessText(text) {
-        // Simple NFKD normalization (JavaScript has normalize built-in)
-        return text.normalize('NFKD');
+        // TODO: Need advanced normalizer for better performance
+        text = text.normalize('NFKD');
+
+        // FIXME: this should be fixed for non-English languages
+
+        // Remove emojis (wide Unicode range)
+        const emojiPattern = /[\u{1F600}-\u{1F64F}\u{1F300}-\u{1F5FF}\u{1F680}-\u{1F6FF}\u{1F700}-\u{1F77F}\u{1F780}-\u{1F7FF}\u{1F800}-\u{1F8FF}\u{1F900}-\u{1F9FF}\u{1FA00}-\u{1FA6F}\u{1FA70}-\u{1FAFF}\u{2600}-\u{26FF}\u{2700}-\u{27BF}\u{1F1E6}-\u{1F1FF}]+/gu;
+        text = text.replace(emojiPattern, '');
+
+        // Replace various dashes and symbols
+        const replacements = {
+            '–': '-',
+            '‑': '-',
+            '—': '-',
+            '¯': ' ',
+            '_': ' ',
+            '"': '"',
+            '"': '"',
+            '\u2018': "'",  // left single quote
+            '\u2019': "'",  // right single quote
+            '´': "'",
+            '`': "'",
+            '[': ' ',
+            ']': ' ',
+            '|': ' ',
+            '/': ' ',
+            '#': ' ',
+            '→': ' ',
+            '←': ' ',
+        };
+        for (const [k, v] of Object.entries(replacements)) {
+            text = text.replaceAll(k, v);
+        }
+
+        // Remove combining diacritics // FIXME: this should be fixed for non-English languages
+        text = text.replace(/[\u0302\u0303\u0304\u0305\u0306\u0307\u0308\u030A\u030B\u030C\u0327\u0328\u0329\u032A\u032B\u032C\u032D\u032E\u032F]/g, '');
+
+        // Remove special symbols
+        text = text.replace(/[♥☆♡©\\]/g, '');
+
+        // Replace known expressions
+        const exprReplacements = {
+            '@': ' at ',
+            'e.g.,': 'for example, ',
+            'i.e.,': 'that is, ',
+        };
+        for (const [k, v] of Object.entries(exprReplacements)) {
+            text = text.replaceAll(k, v);
+        }
+
+        // Fix spacing around punctuation
+        text = text.replace(/ ,/g, ',');
+        text = text.replace(/ \./g, '.');
+        text = text.replace(/ !/g, '!');
+        text = text.replace(/ \?/g, '?');
+        text = text.replace(/ ;/g, ';');
+        text = text.replace(/ :/g, ':');
+        text = text.replace(/ '/g, "'");
+
+        // Remove duplicate quotes
+        while (text.includes('""')) {
+            text = text.replace('""', '"');
+        }
+        while (text.includes("''")) {
+            text = text.replace("''", "'");
+        }
+        while (text.includes('``')) {
+            text = text.replace('``', '`');
+        }
+
+        // Remove extra spaces
+        text = text.replace(/\s+/g, ' ').trim();
+
+        // If text doesn't end with punctuation, quotes, or closing brackets, add a period
+        if (!/[.!?;:,'\"')\]}…。」』】〉》›»]$/.test(text)) {
+            text += '.';
+        }
+
+        return text;
    }

    _textToUnicodeValues(text) {
@@ -4,6 +4,8 @@ This guide provides examples for running TTS inference using `example_onnx.py`.

 ## 📰 Update News

+**2025.11.23** - Enhanced text preprocessing with comprehensive normalization, emoji removal, symbol replacement, and punctuation handling for improved synthesis quality.
+
 **2025.11.19** - Added `--speed` parameter to control speech synthesis speed. Adjust the speed factor to make speech faster or slower while maintaining natural quality.

 **2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
@@ -8,6 +8,8 @@ from unicodedata import normalize
 import numpy as np
 import onnxruntime as ort

+import re
+

 class UnicodeProcessor:
    def __init__(self, unicode_indexer_path: str):
@@ -15,8 +17,96 @@ class UnicodeProcessor:
            self.indexer = json.load(f)

     def _preprocess_text(self, text: str) -> str:
-        # TODO: add more preprocessing
+        """Normalize raw input text before synthesis.
+
+        The passes run in this order: NFKD normalization, emoji removal,
+        symbol replacement, combining-mark and special-symbol removal,
+        expression expansion, punctuation spacing, quote/whitespace
+        collapsing, and finally a trailing "." when the text does not already
+        end with punctuation, a quote or a closing bracket.
+
+        Args:
+            text: Raw input text.
+
+        Returns:
+            The processed text.
+        """
+        # TODO: Need advanced normalizer for better performance
         text = normalize("NFKD", text)
+
+        # FIXME: this should be fixed for non-English languages
+
+        # Remove emojis (wide Unicode range)
+        emoji_pattern = re.compile(
+            "[\U0001f600-\U0001f64f"  # emoticons
+            "\U0001f300-\U0001f5ff"  # symbols & pictographs
+            "\U0001f680-\U0001f6ff"  # transport & map symbols
+            "\U0001f700-\U0001f77f"
+            "\U0001f780-\U0001f7ff"
+            "\U0001f800-\U0001f8ff"
+            "\U0001f900-\U0001f9ff"
+            "\U0001fa00-\U0001fa6f"
+            "\U0001fa70-\U0001faff"
+            "\u2600-\u26ff"
+            "\u2700-\u27bf"
+            "\U0001f1e6-\U0001f1ff]+",
+            flags=re.UNICODE,
+        )
+        text = emoji_pattern.sub("", text)
+
+        # Replace various dashes and symbols
+        # NOTE(review): NFKD has already run at this point, so keys that have a
+        # compatibility decomposition (e.g. U+2011, U+00AF, U+00B4) may never
+        # match here - confirm against the Unicode decomposition data.
+        replacements = {
+            "–": "-",
+            "‑": "-",
+            "—": "-",
+            "¯": " ",
+            "_": " ",
+            "“": '"',
+            "”": '"',
+            "‘": "'",
+            "’": "'",
+            "´": "'",
+            "`": "'",
+            "[": " ",
+            "]": " ",
+            "|": " ",
+            "/": " ",
+            "#": " ",
+            "→": " ",
+            "←": " ",
+        }
+        for k, v in replacements.items():
+            text = text.replace(k, v)
+
+        # Remove combining diacritics # FIXME: this should be fixed for non-English languages
+        text = re.sub(
+            r"[\u0302\u0303\u0304\u0305\u0306\u0307\u0308\u030A\u030B\u030C\u0327\u0328\u0329\u032A\u032B\u032C\u032D\u032E\u032F]",
+            "",
+            text,
+        )
+
+        # Remove special symbols
+        text = re.sub(r"[♥☆♡©\\]", "", text)
+
+        # Replace known expressions
+        expr_replacements = {
+            "@": " at ",
+            "e.g.,": "for example, ",
+            "i.e.,": "that is, ",
+        }
+        for k, v in expr_replacements.items():
+            text = text.replace(k, v)
+
+        # Fix spacing around punctuation
+        # Each pattern removes the single space directly in front of the mark.
+        text = re.sub(r" ,", ",", text)
+        text = re.sub(r" \.", ".", text)
+        text = re.sub(r" !", "!", text)
+        text = re.sub(r" \?", "?", text)
+        text = re.sub(r" ;", ";", text)
+        text = re.sub(r" :", ":", text)
+        text = re.sub(r" '", "'", text)
+
+        # Remove duplicate quotes
+        # The loops repeat until no doubled quote is left, so a run of any
+        # length collapses to a single character.
+        while '""' in text:
+            text = text.replace('""', '"')
+        while "''" in text:
+            text = text.replace("''", "'")
+        while "``" in text:
+            text = text.replace("``", "`")
+
+        # Remove extra spaces
+        text = re.sub(r"\s+", " ", text).strip()
+
+        # If text doesn't end with punctuation, quotes, or closing brackets, add a period
+        # NOTE(review): there is no emptiness check, so an empty result also
+        # receives "." - confirm this is intended.
+        if not re.search(r"[.!?;:,'\"')\]}…。」』】〉》›»]$", text):
+            text += "."
+
         return text

    def _get_text_mask(self, text_ids_lengths: np.ndarray) -> np.ndarray:
@@ -4,6 +4,8 @@ This guide provides examples for running TTS inference using Rust.

 ## 📰 Update News

+**2025.11.23** - Enhanced text preprocessing with comprehensive normalization, emoji removal, symbol replacement, and punctuation handling for improved synthesis quality.
+
 **2025.11.19** - Added `--speed` parameter to control speech synthesis speed (default: 1.05, recommended range: 0.9-1.5).

 **2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
@@ -113,7 +113,95 @@ impl UnicodeProcessor {
 }

 pub fn preprocess_text(text: &str) -> String {
-    text.nfkd().collect()
+    // Cleans raw input text: NFKD-normalizes it, strips emojis plus selected
+    // symbols and combining marks, tidies spacing around punctuation, collapses
+    // whitespace, and appends a period when non-empty text has no closing mark.
+    // TODO: Need advanced normalizer for better performance
+    // FIXME: this should be fixed for non-English languages
+
+    // Compatibility decomposition first, so later steps see decomposed scalars.
+    let decomposed: String = text.nfkd().collect();
+
+    // Remove emojis (wide Unicode range)
+    let emoji_re = Regex::new(r"[\x{1F600}-\x{1F64F}\x{1F300}-\x{1F5FF}\x{1F680}-\x{1F6FF}\x{1F700}-\x{1F77F}\x{1F780}-\x{1F7FF}\x{1F800}-\x{1F8FF}\x{1F900}-\x{1F9FF}\x{1FA00}-\x{1FA6F}\x{1FA70}-\x{1FAFF}\x{2600}-\x{26FF}\x{2700}-\x{27BF}\x{1F1E6}-\x{1F1FF}]+").unwrap();
+    let mut cleaned = emoji_re.replace_all(&decomposed, "").into_owned();
+
+    // Map dashes, typographic quotes and assorted symbols to plain ASCII.
+    let symbol_map: [(&str, &str); 18] = [
+        ("–", "-"),      // en dash
+        ("‑", "-"),      // non-breaking hyphen
+        ("—", "-"),      // em dash
+        ("¯", " "),      // macron
+        ("_", " "),      // underscore
+        ("\u{201C}", "\""),     // left double quote
+        ("\u{201D}", "\""),     // right double quote
+        ("\u{2018}", "'"),      // left single quote
+        ("\u{2019}", "'"),      // right single quote
+        ("´", "'"),      // acute accent
+        ("`", "'"),      // grave accent
+        ("[", " "),      // left bracket
+        ("]", " "),      // right bracket
+        ("|", " "),      // vertical bar
+        ("/", " "),      // slash
+        ("#", " "),      // hash
+        ("→", " "),      // right arrow
+        ("←", " "),      // left arrow
+    ];
+    for &(needle, substitute) in symbol_map.iter() {
+        cleaned = cleaned.replace(needle, substitute);
+    }
+
+    // Drop the listed combining diacritics.
+    // FIXME: this should be fixed for non-English languages
+    let combining_re = Regex::new(r"[\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{030A}\u{030B}\u{030C}\u{0327}\u{0328}\u{0329}\u{032A}\u{032B}\u{032C}\u{032D}\u{032E}\u{032F}]").unwrap();
+    cleaned = combining_re.replace_all(&cleaned, "").into_owned();
+
+    // Delete decorative symbols and backslashes outright.
+    for &glyph in ["♥", "☆", "♡", "©", "\\"].iter() {
+        cleaned = cleaned.replace(glyph, "");
+    }
+
+    // Spell out known expressions.
+    let spoken_forms: [(&str, &str); 3] = [
+        ("@", " at "),
+        ("e.g.,", "for example, "),
+        ("i.e.,", "that is, "),
+    ];
+    for &(needle, substitute) in spoken_forms.iter() {
+        cleaned = cleaned.replace(needle, substitute);
+    }
+
+    // Remove the single space that precedes a punctuation mark.
+    let spacing_rules: [(&str, &str); 7] = [
+        (r" ,", ","),
+        (r" \.", "."),
+        (r" !", "!"),
+        (r" \?", "?"),
+        (r" ;", ";"),
+        (r" :", ":"),
+        (r" '", "'"),
+    ];
+    for &(pattern, replacement) in spacing_rules.iter() {
+        cleaned = Regex::new(pattern)
+            .unwrap()
+            .replace_all(&cleaned, replacement)
+            .into_owned();
+    }
+
+    // Collapse runs of repeated quote characters down to a single one.
+    let quote_runs: [(&str, &str); 3] = [("\"\"", "\""), ("''", "'"), ("``", "`")];
+    for &(doubled, single) in quote_runs.iter() {
+        while cleaned.contains(doubled) {
+            cleaned = cleaned.replace(doubled, single);
+        }
+    }
+
+    // Squeeze whitespace runs to one space and trim both ends.
+    let whitespace_re = Regex::new(r"\s+").unwrap();
+    cleaned = whitespace_re.replace_all(&cleaned, " ").trim().to_string();
+
+    // Empty input stays empty; nothing to terminate.
+    if cleaned.is_empty() {
+        return cleaned;
+    }
+
+    // If text doesn't end with punctuation, quotes, or closing brackets, add a period
+    let closing_re = Regex::new(r#"[.!?;:,'"\u{201C}\u{201D}\u{2018}\u{2019})\]}…。」』】〉》›»]$"#).unwrap();
+    if !closing_re.is_match(&cleaned) {
+        cleaned.push('.');
+    }
+
+    cleaned
 }

 pub fn text_to_unicode_values(text: &str) -> Vec<usize> {
@@ -4,6 +4,8 @@ This guide provides examples for running TTS inference using `example_onnx`.

 ## 📰 Update News

+**2025.11.23** - Enhanced text preprocessing with comprehensive normalization, emoji removal, symbol replacement, and punctuation handling for improved synthesis quality.
+
 **2025.11.19** - Added `--speed` parameter to control speech synthesis speed (default: 1.05, recommended range: 0.9-1.5).

 **2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
@@ -72,7 +72,114 @@ class UnicodeProcessor {
 }

 func preprocessText(_ text: String) -> String {
-    return text.precomposedStringWithCompatibilityMapping
+    // Normalizes raw input text: applies compatibility decomposition, removes
+    // emojis and selected symbols/combining marks, tidies spacing around
+    // punctuation, collapses whitespace, and appends a period when non-empty
+    // text does not already end with punctuation, a quote, or a closing bracket.
+    //
+    // - Parameter text: The raw input string.
+    // - Returns: The cleaned string (empty input yields an empty string).
+    //
+    // TODO: Need advanced normalizer for better performance
+    // Use the decomposed (NFKD) form: the precomposed (NFKC) form folds base
+    // letters and combining marks back into single scalars, so the combining
+    // diacritics pattern below would never match letters such as "ñ".
+    var text = text.decomposedStringWithCompatibilityMapping
+
+    // FIXME: this should be fixed for non-English languages
+
+    // Remove emojis (wide Unicode range)
+    // Swift NSRegularExpression doesn't support Unicode escapes above \uFFFF
+    // Use character filtering instead
+    text = text.unicodeScalars.filter { scalar in
+        let value = scalar.value
+        return !((value >= 0x1F600 && value <= 0x1F64F) ||
+                 (value >= 0x1F300 && value <= 0x1F5FF) ||
+                 (value >= 0x1F680 && value <= 0x1F6FF) ||
+                 (value >= 0x1F700 && value <= 0x1F77F) ||
+                 (value >= 0x1F780 && value <= 0x1F7FF) ||
+                 (value >= 0x1F800 && value <= 0x1F8FF) ||
+                 (value >= 0x1F900 && value <= 0x1F9FF) ||
+                 (value >= 0x1FA00 && value <= 0x1FA6F) ||
+                 (value >= 0x1FA70 && value <= 0x1FAFF) ||
+                 (value >= 0x2600 && value <= 0x26FF) ||
+                 (value >= 0x2700 && value <= 0x27BF) ||
+                 (value >= 0x1F1E6 && value <= 0x1F1FF))
+    }.map { String($0) }.joined()
+
+    // Replace various dashes and symbols.
+    // An ordered array (not a Dictionary) keeps the replacement order deterministic.
+    let replacements: [(String, String)] = [
+        ("–", "-"),      // en dash
+        ("‑", "-"),      // non-breaking hyphen
+        ("—", "-"),      // em dash
+        ("¯", " "),      // macron
+        ("_", " "),      // underscore
+        ("\u{201C}", "\""),     // left double quote
+        ("\u{201D}", "\""),     // right double quote
+        ("\u{2018}", "'"),      // left single quote
+        ("\u{2019}", "'"),      // right single quote
+        ("´", "'"),      // acute accent
+        ("`", "'"),      // grave accent
+        ("[", " "),      // left bracket
+        ("]", " "),      // right bracket
+        ("|", " "),      // vertical bar
+        ("/", " "),      // slash
+        ("#", " "),      // hash
+        ("→", " "),      // right arrow
+        ("←", " "),      // left arrow
+    ]
+
+    for (old, new) in replacements {
+        text = text.replacingOccurrences(of: old, with: new)
+    }
+
+    // Remove combining diacritics // FIXME: this should be fixed for non-English languages
+    let diacriticsPattern = try! NSRegularExpression(pattern: "[\\u0302\\u0303\\u0304\\u0305\\u0306\\u0307\\u0308\\u030A\\u030B\\u030C\\u0327\\u0328\\u0329\\u032A\\u032B\\u032C\\u032D\\u032E\\u032F]")
+    let diacriticsRange = NSRange(text.startIndex..., in: text)
+    text = diacriticsPattern.stringByReplacingMatches(in: text, range: diacriticsRange, withTemplate: "")
+
+    // Remove special symbols
+    let specialSymbols = ["♥", "☆", "♡", "©", "\\"]
+    for symbol in specialSymbols {
+        text = text.replacingOccurrences(of: symbol, with: "")
+    }
+
+    // Replace known expressions (ordered for deterministic application)
+    let exprReplacements: [(String, String)] = [
+        ("@", " at "),
+        ("e.g.,", "for example, "),
+        ("i.e.,", "that is, "),
+    ]
+
+    for (old, new) in exprReplacements {
+        text = text.replacingOccurrences(of: old, with: new)
+    }
+
+    // Fix spacing around punctuation
+    text = text.replacingOccurrences(of: " ,", with: ",")
+    text = text.replacingOccurrences(of: " .", with: ".")
+    text = text.replacingOccurrences(of: " !", with: "!")
+    text = text.replacingOccurrences(of: " ?", with: "?")
+    text = text.replacingOccurrences(of: " ;", with: ";")
+    text = text.replacingOccurrences(of: " :", with: ":")
+    text = text.replacingOccurrences(of: " '", with: "'")
+
+    // Remove duplicate quotes
+    while text.contains("\"\"") {
+        text = text.replacingOccurrences(of: "\"\"", with: "\"")
+    }
+    while text.contains("''") {
+        text = text.replacingOccurrences(of: "''", with: "'")
+    }
+    while text.contains("``") {
+        text = text.replacingOccurrences(of: "``", with: "`")
+    }
+
+    // Remove extra spaces
+    let whitespacePattern = try! NSRegularExpression(pattern: "\\s+")
+    let whitespaceRange = NSRange(text.startIndex..., in: text)
+    text = whitespacePattern.stringByReplacingMatches(in: text, range: whitespaceRange, withTemplate: " ")
+    text = text.trimmingCharacters(in: .whitespacesAndNewlines)
+
+    // If text doesn't end with punctuation, quotes, or closing brackets, add a period
+    if !text.isEmpty {
+        let punctPattern = try! NSRegularExpression(pattern: "[.!?;:,'\"\\u201C\\u201D\\u2018\\u2019)\\]}…。」』】〉》›»]$")
+        let punctRange = NSRange(text.startIndex..., in: text)
+        if punctPattern.firstMatch(in: text, range: punctRange) == nil {
+            text += "."
+        }
+    }
+
+    return text
 }

 func lengthToMask(_ lengths: [Int], maxLen: Int? = nil) -> [[[Float]]] {
@@ -4,6 +4,8 @@ This example demonstrates how to use Supertonic in a web browser using ONNX Runt

 ## 📰 Update News

+**2025.11.23** - Enhanced text preprocessing with comprehensive normalization, emoji removal, symbol replacement, and punctuation handling for improved synthesis quality.
+
 **2025.11.19** - Added speed control slider to adjust speech synthesis speed (default: 1.05, recommended range: 0.9-1.5).

 **2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
@@ -28,7 +28,85 @@ export class UnicodeProcessor {
    }

    preprocessText(text) {
-        return text.normalize('NFKC');
+        // TODO: Need advanced normalizer for better performance
+        text = text.normalize('NFKD');
+
+        // FIXME: this should be fixed for non-English languages
+
+        // Remove emojis (wide Unicode range)
+        const emojiPattern = /[\u{1F600}-\u{1F64F}\u{1F300}-\u{1F5FF}\u{1F680}-\u{1F6FF}\u{1F700}-\u{1F77F}\u{1F780}-\u{1F7FF}\u{1F800}-\u{1F8FF}\u{1F900}-\u{1F9FF}\u{1FA00}-\u{1FA6F}\u{1FA70}-\u{1FAFF}\u{2600}-\u{26FF}\u{2700}-\u{27BF}\u{1F1E6}-\u{1F1FF}]+/gu;
+        text = text.replace(emojiPattern, '');
+
+        // Replace various dashes and symbols
+        const replacements = {
+            '–': '-',
+            '‑': '-',
+            '—': '-',
+            '¯': ' ',
+            '_': ' ',
+            '"': '"',
+            '"': '"',
+            '\u2018': "'",  // left single quote
+            '\u2019': "'",  // right single quote
+            '´': "'",
+            '`': "'",
+            '[': ' ',
+            ']': ' ',
+            '|': ' ',
+            '/': ' ',
+            '#': ' ',
+            '→': ' ',
+            '←': ' ',
+        };
+        for (const [k, v] of Object.entries(replacements)) {
+            text = text.replaceAll(k, v);
+        }
+
+        // Remove combining diacritics // FIXME: this should be fixed for non-English languages
+        text = text.replace(/[\u0302\u0303\u0304\u0305\u0306\u0307\u0308\u030A\u030B\u030C\u0327\u0328\u0329\u032A\u032B\u032C\u032D\u032E\u032F]/g, '');
+
+        // Remove special symbols
+        text = text.replace(/[♥☆♡©\\]/g, '');
+
+        // Replace known expressions
+        const exprReplacements = {
+            '@': ' at ',
+            'e.g.,': 'for example, ',
+            'i.e.,': 'that is, ',
+        };
+        for (const [k, v] of Object.entries(exprReplacements)) {
+            text = text.replaceAll(k, v);
+        }
+
+        // Fix spacing around punctuation
+        text = text.replace(/ ,/g, ',');
+        text = text.replace(/ \./g, '.');
+        text = text.replace(/ !/g, '!');
+        text = text.replace(/ \?/g, '?');
+        text = text.replace(/ ;/g, ';');
+        text = text.replace(/ :/g, ':');
+        text = text.replace(/ '/g, "'");
+
+        // Remove duplicate quotes
+        while (text.includes('""')) {
+            text = text.replace('""', '"');
+        }
+        while (text.includes("''")) {
+            text = text.replace("''", "'");
+        }
+        while (text.includes('``')) {
+            text = text.replace('``', '`');
+        }
+
+        // Remove extra spaces
+        text = text.replace(/\s+/g, ' ').trim();
+
+        // If text doesn't end with punctuation, quotes, or closing brackets, add a period
+        if (!/[.!?;:,'\"')\]}…。」』】〉》›»]$/.test(text)) {
+            text += '.';
+        }
+
+        return text;
    }

    getTextMask(textIdsLengths) {