mirror of
https://github.com/supertone-inc/supertonic.git
synced 2026-06-02 01:38:48 +02:00
Fix text normalization bug (#16)
This commit is contained in:
@@ -4,6 +4,8 @@ High-performance text-to-speech inference using ONNX Runtime.
|
||||
|
||||
## 📰 Update News
|
||||
|
||||
**2025.11.23** - Enhanced text preprocessing with comprehensive normalization, emoji removal, symbol replacement, and punctuation handling for improved synthesis quality.
|
||||
|
||||
**2025.11.19** - Added `--speed` parameter to control speech synthesis speed (default: 1.05, recommended range: 0.9-1.5).
|
||||
|
||||
**2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
|
||||
|
||||
+160
-18
@@ -19,6 +19,24 @@ void clearTensorBuffers() {
|
||||
g_tensor_buffers_int64.clear();
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Helper function - trim
|
||||
// ============================================================================
|
||||
|
||||
// Returns a copy of `str` with leading and trailing whitespace removed.
// Whitespace is whatever std::isspace reports for the current C locale.
static std::string trim(const std::string& str) {
    const auto is_blank = [](char ch) {
        // Cast first: passing a negative char to std::isspace is undefined.
        return std::isspace(static_cast<unsigned char>(ch)) != 0;
    };

    auto first = str.begin();
    auto last = str.end();

    for (; first != last && is_blank(*first); ++first) {
    }
    for (; last != first && is_blank(*(last - 1)); --last) {
    }

    return std::string(first, last);
}
|
||||
|
||||
// ============================================================================
|
||||
// UnicodeProcessor implementation
|
||||
// ============================================================================
|
||||
@@ -28,10 +46,148 @@ UnicodeProcessor::UnicodeProcessor(const std::string& unicode_indexer_json_path)
|
||||
}
|
||||
|
||||
// Decodes the UTF-8 sequence that starts at text[pos].
// On success stores the decoded value in `code_point` and returns the number
// of bytes consumed (1-4). Returns 0 when the lead byte is invalid, the
// sequence is truncated, or a continuation byte is malformed.
// Overlong encodings and surrogate values are not rejected.
static size_t decodeUtf8At(const std::string& text, size_t pos, char32_t& code_point) {
    const unsigned char lead = static_cast<unsigned char>(text[pos]);
    size_t length = 0;

    if (lead < 0x80) {
        code_point = lead;
        return 1;
    } else if ((lead & 0xE0) == 0xC0) {
        length = 2;
        code_point = lead & 0x1F;
    } else if ((lead & 0xF0) == 0xE0) {
        length = 3;
        code_point = lead & 0x0F;
    } else if ((lead & 0xF8) == 0xF0) {
        length = 4;
        code_point = lead & 0x07;
    } else {
        return 0;
    }

    if (pos + length > text.size()) {
        return 0;
    }
    for (size_t i = 1; i < length; ++i) {
        const unsigned char next = static_cast<unsigned char>(text[pos + i]);
        if ((next & 0xC0) != 0x80) {
            return 0;
        }
        code_point = (code_point << 6) | (next & 0x3F);
    }
    return length;
}

// True for the emoji/pictograph ranges removed by the other language ports:
// U+2600-27BF, U+1F1E6-1F1FF, U+1F300-1F64F and U+1F680-1FAFF.
static bool isEmojiCodePoint(char32_t cp) {
    return (cp >= 0x2600 && cp <= 0x27BF) ||
           (cp >= 0x1F1E6 && cp <= 0x1F1FF) ||
           (cp >= 0x1F300 && cp <= 0x1F64F) ||
           (cp >= 0x1F680 && cp <= 0x1FAFF);
}

// True for the combining marks removed by the other language ports:
// U+0302-0308, U+030A-030C and U+0327-032F.
static bool isStrippedDiacritic(char32_t cp) {
    return (cp >= 0x0302 && cp <= 0x0308) ||
           (cp >= 0x030A && cp <= 0x030C) ||
           (cp >= 0x0327 && cp <= 0x032F);
}

// Copies `text`, dropping whole code points that are emojis or stripped
// diacritics. Bytes that do not form a decodable sequence are copied through
// unchanged so malformed input is never made worse.
static std::string stripEmojisAndDiacritics(const std::string& text) {
    std::string kept;
    kept.reserve(text.size());

    size_t pos = 0;
    while (pos < text.size()) {
        char32_t cp = 0;
        const size_t length = decodeUtf8At(text, pos, cp);
        if (length == 0) {
            kept.push_back(text[pos]);
            ++pos;
            continue;
        }
        if (!isEmojiCodePoint(cp) && !isStrippedDiacritic(cp)) {
            kept.append(text, pos, length);
        }
        pos += length;
    }
    return kept;
}

// Replaces every non-overlapping occurrence of `from` with `to`, scanning left
// to right and never rescanning replaced text.
static void replaceAllInPlace(std::string& text, const std::string& from, const std::string& to) {
    if (from.empty()) {
        return;
    }
    size_t pos = 0;
    while ((pos = text.find(from, pos)) != std::string::npos) {
        text.replace(pos, from.size(), to);
        pos += to.size();
    }
}

// Collapses runs of the same quote character (", ' or `) into a single one.
static std::string collapseRepeatedQuotes(const std::string& text) {
    std::string out;
    out.reserve(text.size());
    for (const char ch : text) {
        const bool is_quote = (ch == '"' || ch == '\'' || ch == '`');
        if (is_quote && !out.empty() && out.back() == ch) {
            continue;
        }
        out.push_back(ch);
    }
    return out;
}

// Replaces each run of whitespace with one space and drops leading and
// trailing whitespace.
static std::string collapseWhitespace(const std::string& text) {
    std::string out;
    out.reserve(text.size());
    bool pending_space = false;
    for (const char ch : text) {
        if (std::isspace(static_cast<unsigned char>(ch))) {
            // Only remember a separator once real content has been emitted.
            pending_space = !out.empty();
        } else {
            if (pending_space) {
                out.push_back(' ');
            }
            pending_space = false;
            out.push_back(ch);
        }
    }
    return out;  // a trailing pending space is intentionally never flushed
}

// True when `text` ends with the byte sequence `suffix`.
static bool hasSuffix(const std::string& text, const std::string& suffix) {
    return text.size() >= suffix.size() &&
           text.compare(text.size() - suffix.size(), suffix.size(), suffix) == 0;
}

// Normalizes `text` before it is converted to model input.
//
// Steps, in order: remove emojis and selected combining marks, map typographic
// dashes/quotes/symbols to ASCII, delete a few special symbols, expand "@",
// "e.g.," and "i.e.,", remove spaces before punctuation, collapse repeated
// quotes and whitespace, and append "." when the text does not already end
// with punctuation, a quote or a closing bracket. Empty results are returned
// unchanged.
//
// NOTE: no Unicode (NFKD) normalization is performed here; C++ has no built-in
// normalizer. Consider ICU if full normalization is required.
// TODO: Need advanced normalizer for better performance
// FIXME: this should be fixed for non-English languages
//
// All non-ASCII literals are written as UTF-8 byte escapes so the behaviour
// does not depend on the encoding of this source file.
std::string UnicodeProcessor::preprocessText(const std::string& text) {
    struct Replacement {
        const char* from;
        const char* to;
    };

    // Work on whole code points; byte-level regexes would split multi-byte
    // characters and delete symbols that must be replaced below.
    std::string result = stripEmojisAndDiacritics(text);

    static const Replacement kSymbolReplacements[] = {
        {"\xE2\x80\x93", "-"},   // en dash (U+2013)
        {"\xE2\x80\x91", "-"},   // non-breaking hyphen (U+2011)
        {"\xE2\x80\x94", "-"},   // em dash (U+2014)
        {"\xC2\xAF", " "},       // macron (U+00AF)
        {"_", " "},              // underscore
        {"\xE2\x80\x9C", "\""},  // left double quote (U+201C)
        {"\xE2\x80\x9D", "\""},  // right double quote (U+201D)
        {"\xE2\x80\x98", "'"},   // left single quote (U+2018)
        {"\xE2\x80\x99", "'"},   // right single quote (U+2019)
        {"\xC2\xB4", "'"},       // acute accent (U+00B4)
        {"`", "'"},              // grave accent
        {"[", " "},              // left bracket
        {"]", " "},              // right bracket
        {"|", " "},              // vertical bar
        {"/", " "},              // slash
        {"#", " "},              // hash
        {"\xE2\x86\x92", " "},   // right arrow (U+2192)
        {"\xE2\x86\x90", " "},   // left arrow (U+2190)
    };
    for (const auto& repl : kSymbolReplacements) {
        replaceAllInPlace(result, repl.from, repl.to);
    }

    // Special symbols that are deleted outright. The first three also fall in
    // the emoji range above; they are listed for parity with the other ports.
    static const char* const kRemovedSymbols[] = {
        "\xE2\x99\xA5",  // black heart (U+2665)
        "\xE2\x98\x86",  // white star (U+2606)
        "\xE2\x99\xA1",  // white heart (U+2661)
        "\xC2\xA9",      // copyright sign (U+00A9)
        "\\",            // backslash
    };
    for (const char* symbol : kRemovedSymbols) {
        replaceAllInPlace(result, symbol, "");
    }

    static const Replacement kExpressionReplacements[] = {
        {"@", " at "},
        {"e.g.,", "for example, "},
        {"i.e.,", "that is, "},
    };
    for (const auto& repl : kExpressionReplacements) {
        replaceAllInPlace(result, repl.from, repl.to);
    }

    // Remove the single space that precedes punctuation.
    static const Replacement kSpacingFixes[] = {
        {" ,", ","}, {" .", "."}, {" !", "!"}, {" ?", "?"},
        {" ;", ";"}, {" :", ":"}, {" '", "'"},
    };
    for (const auto& repl : kSpacingFixes) {
        replaceAllInPlace(result, repl.from, repl.to);
    }

    result = collapseRepeatedQuotes(result);
    result = collapseWhitespace(result);

    if (result.empty()) {
        return result;
    }

    // ASCII punctuation, quotes and closing brackets that end a sentence.
    const std::string ascii_terminators = ".!?;:,'\")]}>";
    bool ends_with_punct = ascii_terminators.find(result.back()) != std::string::npos;

    if (!ends_with_punct) {
        // Multi-byte terminators have different encoded lengths, so each one
        // is compared as a suffix rather than against a fixed-size tail.
        static const char* const kUtf8Terminators[] = {
            "\xE2\x80\xA6",  // horizontal ellipsis (U+2026)
            "\xE3\x80\x82",  // ideographic full stop (U+3002)
            "\xE3\x80\x8D",  // right corner bracket (U+300D)
            "\xE3\x80\x8F",  // right white corner bracket (U+300F)
            "\xE3\x80\x91",  // right black lenticular bracket (U+3011)
            "\xE3\x80\x89",  // right angle bracket (U+3009)
            "\xE3\x80\x8B",  // right double angle bracket (U+300B)
            "\xE2\x80\xBA",  // single right-pointing angle quote (U+203A)
            "\xC2\xBB",      // right-pointing double angle quote (U+00BB)
            "\xE2\x80\x9C",  // left double quote (U+201C)
            "\xE2\x80\x9D",  // right double quote (U+201D)
            "\xE2\x80\x98",  // left single quote (U+2018)
            "\xE2\x80\x99",  // right single quote (U+2019)
        };
        for (const char* terminator : kUtf8Terminators) {
            if (hasSuffix(result, terminator)) {
                ends_with_punct = true;
                break;
            }
        }
    }

    if (!ends_with_punct) {
        result += ".";
    }

    return result;
}
|
||||
|
||||
std::vector<uint16_t> UnicodeProcessor::textToUnicodeValues(const std::string& text) {
|
||||
@@ -772,20 +928,6 @@ std::string sanitizeFilename(const std::string& text, int max_len) {
|
||||
// Chunk text
|
||||
// ============================================================================
|
||||
|
||||
// Strips leading and trailing whitespace (per std::isspace) from `str` and
// returns the remaining middle part as a new string.
static std::string trim(const std::string& str) {
    const size_t length = str.size();

    size_t head = 0;
    for (; head < length; ++head) {
        if (!std::isspace(static_cast<unsigned char>(str[head]))) {
            break;
        }
    }

    size_t tail = length;
    for (; tail > head; --tail) {
        if (!std::isspace(static_cast<unsigned char>(str[tail - 1]))) {
            break;
        }
    }

    return str.substr(head, tail - head);
}
|
||||
|
||||
std::vector<std::string> chunkText(const std::string& text, int max_len) {
|
||||
std::vector<std::string> chunks;
|
||||
|
||||
|
||||
+136
-2
@@ -71,10 +71,144 @@ namespace Supertonic
|
||||
}
|
||||
}
|
||||
|
||||
private static string RemoveEmojis(string text)
|
||||
{
|
||||
var result = new StringBuilder();
|
||||
for (int i = 0; i < text.Length; i++)
|
||||
{
|
||||
int codePoint;
|
||||
if (char.IsHighSurrogate(text[i]) && i + 1 < text.Length && char.IsLowSurrogate(text[i + 1]))
|
||||
{
|
||||
// Get the full code point from surrogate pair
|
||||
codePoint = char.ConvertToUtf32(text[i], text[i + 1]);
|
||||
i++; // Skip the low surrogate
|
||||
}
|
||||
else
|
||||
{
|
||||
codePoint = text[i];
|
||||
}
|
||||
|
||||
// Check if code point is in emoji ranges
|
||||
bool isEmoji = (codePoint >= 0x1F600 && codePoint <= 0x1F64F) ||
|
||||
(codePoint >= 0x1F300 && codePoint <= 0x1F5FF) ||
|
||||
(codePoint >= 0x1F680 && codePoint <= 0x1F6FF) ||
|
||||
(codePoint >= 0x1F700 && codePoint <= 0x1F77F) ||
|
||||
(codePoint >= 0x1F780 && codePoint <= 0x1F7FF) ||
|
||||
(codePoint >= 0x1F800 && codePoint <= 0x1F8FF) ||
|
||||
(codePoint >= 0x1F900 && codePoint <= 0x1F9FF) ||
|
||||
(codePoint >= 0x1FA00 && codePoint <= 0x1FA6F) ||
|
||||
(codePoint >= 0x1FA70 && codePoint <= 0x1FAFF) ||
|
||||
(codePoint >= 0x2600 && codePoint <= 0x26FF) ||
|
||||
(codePoint >= 0x2700 && codePoint <= 0x27BF) ||
|
||||
(codePoint >= 0x1F1E6 && codePoint <= 0x1F1FF);
|
||||
|
||||
if (!isEmoji)
|
||||
{
|
||||
if (codePoint > 0xFFFF)
|
||||
{
|
||||
// Add back as surrogate pair
|
||||
result.Append(char.ConvertFromUtf32(codePoint));
|
||||
}
|
||||
else
|
||||
{
|
||||
result.Append((char)codePoint);
|
||||
}
|
||||
}
|
||||
}
|
||||
return result.ToString();
|
||||
}
|
||||
|
||||
private string PreprocessText(string text)
|
||||
{
|
||||
// Simple normalization (C# has Normalize built-in)
|
||||
return text.Normalize(NormalizationForm.FormKD);
|
||||
// TODO: Need advanced normalizer for better performance
|
||||
text = text.Normalize(NormalizationForm.FormKD);
|
||||
|
||||
// FIXME: this should be fixed for non-English languages
|
||||
|
||||
// Remove emojis (wide Unicode range)
|
||||
// C# doesn't support \u{...} syntax in regex, so we use character filtering instead
|
||||
text = RemoveEmojis(text);
|
||||
|
||||
// Replace various dashes and symbols
|
||||
var replacements = new Dictionary<string, string>
|
||||
{
|
||||
{"–", "-"}, // en dash
|
||||
{"‑", "-"}, // non-breaking hyphen
|
||||
{"—", "-"}, // em dash
|
||||
{"¯", " "}, // macron
|
||||
{"_", " "}, // underscore
|
||||
{"\u201C", "\""}, // left double quote
|
||||
{"\u201D", "\""}, // right double quote
|
||||
{"\u2018", "'"}, // left single quote
|
||||
{"\u2019", "'"}, // right single quote
|
||||
{"´", "'"}, // acute accent
|
||||
{"`", "'"}, // grave accent
|
||||
{"[", " "}, // left bracket
|
||||
{"]", " "}, // right bracket
|
||||
{"|", " "}, // vertical bar
|
||||
{"/", " "}, // slash
|
||||
{"#", " "}, // hash
|
||||
{"→", " "}, // right arrow
|
||||
{"←", " "}, // left arrow
|
||||
};
|
||||
|
||||
foreach (var kvp in replacements)
|
||||
{
|
||||
text = text.Replace(kvp.Key, kvp.Value);
|
||||
}
|
||||
|
||||
// Remove combining diacritics // FIXME: this should be fixed for non-English languages
|
||||
text = Regex.Replace(text, @"[\u0302\u0303\u0304\u0305\u0306\u0307\u0308\u030A\u030B\u030C\u0327\u0328\u0329\u032A\u032B\u032C\u032D\u032E\u032F]", "");
|
||||
|
||||
// Remove special symbols
|
||||
text = Regex.Replace(text, @"[♥☆♡©\\]", "");
|
||||
|
||||
// Replace known expressions
|
||||
var exprReplacements = new Dictionary<string, string>
|
||||
{
|
||||
{"@", " at "},
|
||||
{"e.g.,", "for example, "},
|
||||
{"i.e.,", "that is, "},
|
||||
};
|
||||
|
||||
foreach (var kvp in exprReplacements)
|
||||
{
|
||||
text = text.Replace(kvp.Key, kvp.Value);
|
||||
}
|
||||
|
||||
// Fix spacing around punctuation
|
||||
text = Regex.Replace(text, @" ,", ",");
|
||||
text = Regex.Replace(text, @" \.", ".");
|
||||
text = Regex.Replace(text, @" !", "!");
|
||||
text = Regex.Replace(text, @" \?", "?");
|
||||
text = Regex.Replace(text, @" ;", ";");
|
||||
text = Regex.Replace(text, @" :", ":");
|
||||
text = Regex.Replace(text, @" '", "'");
|
||||
|
||||
// Remove duplicate quotes
|
||||
while (text.Contains("\"\""))
|
||||
{
|
||||
text = text.Replace("\"\"", "\"");
|
||||
}
|
||||
while (text.Contains("''"))
|
||||
{
|
||||
text = text.Replace("''", "'");
|
||||
}
|
||||
while (text.Contains("``"))
|
||||
{
|
||||
text = text.Replace("``", "`");
|
||||
}
|
||||
|
||||
// Remove extra spaces
|
||||
text = Regex.Replace(text, @"\s+", " ").Trim();
|
||||
|
||||
// If text doesn't end with punctuation, quotes, or closing brackets, add a period
|
||||
if (!Regex.IsMatch(text, @"[.!?;:,'\u0022\u201C\u201D\u2018\u2019)\]}…。」』】〉》›»]$"))
|
||||
{
|
||||
text += ".";
|
||||
}
|
||||
|
||||
return text;
|
||||
}
|
||||
|
||||
private int[] TextToUnicodeValues(string text)
|
||||
|
||||
@@ -4,6 +4,8 @@ This guide provides examples for running TTS inference using `ExampleONNX.cs`.
|
||||
|
||||
## 📰 Update News
|
||||
|
||||
**2025.11.23** - Enhanced text preprocessing with comprehensive normalization, emoji removal, symbol replacement, and punctuation handling for improved synthesis quality.
|
||||
|
||||
**2025.11.19** - Added `--speed` parameter to control speech synthesis speed (default: 1.05, recommended range: 0.9-1.5).
|
||||
|
||||
**2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
|
||||
|
||||
@@ -4,6 +4,8 @@ This guide provides examples for running TTS inference using `example_onnx.go`.
|
||||
|
||||
## 📰 Update News
|
||||
|
||||
**2025.11.23** - Enhanced text preprocessing with comprehensive normalization, emoji removal, symbol replacement, and punctuation handling for improved synthesis quality.
|
||||
|
||||
**2025.11.19** - Added `--speed` parameter to control speech synthesis speed (default: 1.05, recommended range: 0.9-1.5).
|
||||
|
||||
**2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
|
||||
|
||||
+90
-1
@@ -327,8 +327,97 @@ func splitSentences(text string) []string {
|
||||
|
||||
// Utility functions
|
||||
// Patterns and replacers used by preprocessText. They are built once at
// package initialization instead of on every call.
var (
	// Emoji and pictograph ranges that are removed from the text.
	preprocessEmojiPattern = regexp.MustCompile(`[\x{1F600}-\x{1F64F}\x{1F300}-\x{1F5FF}\x{1F680}-\x{1F6FF}\x{1F700}-\x{1F77F}\x{1F780}-\x{1F7FF}\x{1F800}-\x{1F8FF}\x{1F900}-\x{1F9FF}\x{1FA00}-\x{1FA6F}\x{1FA70}-\x{1FAFF}\x{2600}-\x{26FF}\x{2700}-\x{27BF}\x{1F1E6}-\x{1F1FF}]+`)

	// Combining diacritics that are removed from the text.
	// FIXME: this should be fixed for non-English languages
	preprocessDiacriticsPattern = regexp.MustCompile(`[\x{0302}\x{0303}\x{0304}\x{0305}\x{0306}\x{0307}\x{0308}\x{030A}\x{030B}\x{030C}\x{0327}\x{0328}\x{0329}\x{032A}\x{032B}\x{032C}\x{032D}\x{032E}\x{032F}]`)

	preprocessWhitespacePattern = regexp.MustCompile(`\s+`)

	// Punctuation, quotes and closing brackets that may end the text.
	preprocessEndPunctPattern = regexp.MustCompile(`[.!?;:,'"\x{201C}\x{201D}\x{2018}\x{2019})\]}…。」』】〉》›»]$`)

	// Dashes, quotes and symbols mapped to ASCII equivalents or a space.
	preprocessSymbolReplacer = strings.NewReplacer(
		"–", "-", // en dash
		"‑", "-", // non-breaking hyphen
		"—", "-", // em dash
		"¯", " ", // macron
		"_", " ", // underscore
		"\u201C", "\"", // left double quote
		"\u201D", "\"", // right double quote
		"\u2018", "'", // left single quote
		"\u2019", "'", // right single quote
		"´", "'", // acute accent
		"`", "'", // grave accent
		"[", " ", // left bracket
		"]", " ", // right bracket
		"|", " ", // vertical bar
		"/", " ", // slash
		"#", " ", // hash
		"→", " ", // right arrow
		"←", " ", // left arrow
	)

	// Special symbols that are deleted outright.
	preprocessSpecialSymbolRemover = strings.NewReplacer(
		"♥", "", "☆", "", "♡", "", "©", "", "\\", "",
	)

	// Known expressions expanded to words.
	preprocessExprReplacer = strings.NewReplacer(
		"@", " at ",
		"e.g.,", "for example, ",
		"i.e.,", "that is, ",
	)

	// Removes the single space that precedes punctuation.
	preprocessSpacingReplacer = strings.NewReplacer(
		" ,", ",", " .", ".", " !", "!", " ?", "?",
		" ;", ";", " :", ":", " '", "'",
	)
)

// preprocessText normalizes text before it is converted to model input.
//
// It removes emojis and selected combining diacritics, maps typographic
// symbols to ASCII, expands "@", "e.g.," and "i.e.,", removes spaces before
// punctuation, collapses repeated quotes and whitespace, and appends "." when
// the non-empty result does not already end with punctuation, a quote or a
// closing bracket. An empty result is returned unchanged.
//
// NOTE: no NFKD normalization is performed; for full Unicode normalization use
// golang.org/x/text/unicode/norm.
// TODO: Need advanced normalizer for better performance
// FIXME: this should be fixed for non-English languages
func preprocessText(text string) string {
	text = preprocessEmojiPattern.ReplaceAllString(text, "")
	text = preprocessSymbolReplacer.Replace(text)
	text = preprocessDiacriticsPattern.ReplaceAllString(text, "")
	text = preprocessSpecialSymbolRemover.Replace(text)
	text = preprocessExprReplacer.Replace(text)
	text = preprocessSpacingReplacer.Replace(text)

	// Remove duplicate quotes; loop because one pass only halves a run.
	for _, quote := range []string{`"`, "'", "`"} {
		doubled := quote + quote
		for strings.Contains(text, doubled) {
			text = strings.ReplaceAll(text, doubled, quote)
		}
	}

	// Remove extra spaces
	text = preprocessWhitespacePattern.ReplaceAllString(text, " ")
	text = strings.TrimSpace(text)

	// If text doesn't end with punctuation, quotes, or closing brackets, add a period
	if text != "" && !preprocessEndPunctPattern.MatchString(text) {
		text += "."
	}

	return text
}
|
||||
|
||||
|
||||
+116
-1
@@ -60,6 +60,42 @@ class UnicodeProcessor {
|
||||
this.indexer = Helper.loadJsonLongArray(unicodeIndexerJsonPath);
|
||||
}
|
||||
|
||||
private static String removeEmojis(String text) {
|
||||
StringBuilder result = new StringBuilder();
|
||||
for (int i = 0; i < text.length(); i++) {
|
||||
int codePoint;
|
||||
if (Character.isHighSurrogate(text.charAt(i)) && i + 1 < text.length() && Character.isLowSurrogate(text.charAt(i + 1))) {
|
||||
codePoint = Character.codePointAt(text, i);
|
||||
i++; // Skip the low surrogate
|
||||
} else {
|
||||
codePoint = text.charAt(i);
|
||||
}
|
||||
|
||||
// Check if code point is in emoji ranges
|
||||
boolean isEmoji = (codePoint >= 0x1F600 && codePoint <= 0x1F64F) ||
|
||||
(codePoint >= 0x1F300 && codePoint <= 0x1F5FF) ||
|
||||
(codePoint >= 0x1F680 && codePoint <= 0x1F6FF) ||
|
||||
(codePoint >= 0x1F700 && codePoint <= 0x1F77F) ||
|
||||
(codePoint >= 0x1F780 && codePoint <= 0x1F7FF) ||
|
||||
(codePoint >= 0x1F800 && codePoint <= 0x1F8FF) ||
|
||||
(codePoint >= 0x1F900 && codePoint <= 0x1F9FF) ||
|
||||
(codePoint >= 0x1FA00 && codePoint <= 0x1FA6F) ||
|
||||
(codePoint >= 0x1FA70 && codePoint <= 0x1FAFF) ||
|
||||
(codePoint >= 0x2600 && codePoint <= 0x26FF) ||
|
||||
(codePoint >= 0x2700 && codePoint <= 0x27BF) ||
|
||||
(codePoint >= 0x1F1E6 && codePoint <= 0x1F1FF);
|
||||
|
||||
if (!isEmoji) {
|
||||
if (codePoint > 0xFFFF) {
|
||||
result.append(Character.toChars(codePoint));
|
||||
} else {
|
||||
result.append((char) codePoint);
|
||||
}
|
||||
}
|
||||
}
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
public TextProcessResult call(List<String> textList) {
|
||||
List<String> processedTexts = new ArrayList<>();
|
||||
for (String text : textList) {
|
||||
@@ -86,7 +122,86 @@ class UnicodeProcessor {
|
||||
}
|
||||
|
||||
private String preprocessText(String text) {
|
||||
return Normalizer.normalize(text, Normalizer.Form.NFKD);
|
||||
// TODO: Need advanced normalizer for better performance
|
||||
text = Normalizer.normalize(text, Normalizer.Form.NFKD);
|
||||
|
||||
// FIXME: this should be fixed for non-English languages
|
||||
|
||||
// Remove emojis (wide Unicode range)
|
||||
// Java Pattern doesn't support \x{...} syntax for Unicode above \uFFFF
|
||||
// Use character filtering instead
|
||||
text = removeEmojis(text);
|
||||
|
||||
// Replace various dashes and symbols
|
||||
Map<String, String> replacements = new HashMap<>();
|
||||
replacements.put("–", "-"); // en dash
|
||||
replacements.put("‑", "-"); // non-breaking hyphen
|
||||
replacements.put("—", "-"); // em dash
|
||||
replacements.put("¯", " "); // macron
|
||||
replacements.put("_", " "); // underscore
|
||||
replacements.put("\u201C", "\""); // left double quote
|
||||
replacements.put("\u201D", "\""); // right double quote
|
||||
replacements.put("\u2018", "'"); // left single quote
|
||||
replacements.put("\u2019", "'"); // right single quote
|
||||
replacements.put("´", "'"); // acute accent
|
||||
replacements.put("`", "'"); // grave accent
|
||||
replacements.put("[", " "); // left bracket
|
||||
replacements.put("]", " "); // right bracket
|
||||
replacements.put("|", " "); // vertical bar
|
||||
replacements.put("/", " "); // slash
|
||||
replacements.put("#", " "); // hash
|
||||
replacements.put("→", " "); // right arrow
|
||||
replacements.put("←", " "); // left arrow
|
||||
|
||||
for (Map.Entry<String, String> entry : replacements.entrySet()) {
|
||||
text = text.replace(entry.getKey(), entry.getValue());
|
||||
}
|
||||
|
||||
// Remove combining diacritics // FIXME: this should be fixed for non-English languages
|
||||
text = text.replaceAll("[\\u0302\\u0303\\u0304\\u0305\\u0306\\u0307\\u0308\\u030A\\u030B\\u030C\\u0327\\u0328\\u0329\\u032A\\u032B\\u032C\\u032D\\u032E\\u032F]", "");
|
||||
|
||||
// Remove special symbols
|
||||
text = text.replaceAll("[♥☆♡©\\\\]", "");
|
||||
|
||||
// Replace known expressions
|
||||
Map<String, String> exprReplacements = new HashMap<>();
|
||||
exprReplacements.put("@", " at ");
|
||||
exprReplacements.put("e.g.,", "for example, ");
|
||||
exprReplacements.put("i.e.,", "that is, ");
|
||||
|
||||
for (Map.Entry<String, String> entry : exprReplacements.entrySet()) {
|
||||
text = text.replace(entry.getKey(), entry.getValue());
|
||||
}
|
||||
|
||||
// Fix spacing around punctuation
|
||||
text = text.replaceAll(" ,", ",");
|
||||
text = text.replaceAll(" \\.", ".");
|
||||
text = text.replaceAll(" !", "!");
|
||||
text = text.replaceAll(" \\?", "?");
|
||||
text = text.replaceAll(" ;", ";");
|
||||
text = text.replaceAll(" :", ":");
|
||||
text = text.replaceAll(" '", "'");
|
||||
|
||||
// Remove duplicate quotes
|
||||
while (text.contains("\"\"")) {
|
||||
text = text.replace("\"\"", "\"");
|
||||
}
|
||||
while (text.contains("''")) {
|
||||
text = text.replace("''", "'");
|
||||
}
|
||||
while (text.contains("``")) {
|
||||
text = text.replace("``", "`");
|
||||
}
|
||||
|
||||
// Remove extra spaces
|
||||
text = text.replaceAll("\\s+", " ").trim();
|
||||
|
||||
// If text doesn't end with punctuation, quotes, or closing brackets, add a period
|
||||
if (!text.matches(".*[.!?;:,'\"\\u201C\\u201D\\u2018\\u2019)\\]}…。」』】〉》›»]$")) {
|
||||
text += ".";
|
||||
}
|
||||
|
||||
return text;
|
||||
}
|
||||
|
||||
private int[] textToUnicodeValues(String text) {
|
||||
|
||||
@@ -4,6 +4,8 @@ This guide provides examples for running TTS inference using `ExampleONNX.java`.
|
||||
|
||||
## 📰 Update News
|
||||
|
||||
**2025.11.23** - Enhanced text preprocessing with comprehensive normalization, emoji removal, symbol replacement, and punctuation handling for improved synthesis quality.
|
||||
|
||||
**2025.11.19** - Added `--speed` parameter to control speech synthesis speed (default: 1.05, recommended range: 0.9-1.5).
|
||||
|
||||
**2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
|
||||
|
||||
@@ -4,6 +4,8 @@ Node.js implementation for TTS inference. Uses ONNX Runtime to generate speech f
|
||||
|
||||
## 📰 Update News
|
||||
|
||||
**2025.11.23** - Enhanced text preprocessing with comprehensive normalization, emoji removal, symbol replacement, and punctuation handling for improved synthesis quality.
|
||||
|
||||
**2025.11.19** - Added `--speed` parameter to control speech synthesis speed (default: 1.05, recommended range: 0.9-1.5).
|
||||
|
||||
**2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
|
||||
|
||||
+79
-2
@@ -14,8 +14,85 @@ class UnicodeProcessor {
|
||||
}
|
||||
|
||||
_preprocessText(text) {
|
||||
// Simple NFKD normalization (JavaScript has normalize built-in)
|
||||
return text.normalize('NFKD');
|
||||
// TODO: Need advanced normalizer for better performance
|
||||
text = text.normalize('NFKD');
|
||||
|
||||
// FIXME: this should be fixed for non-English languages
|
||||
|
||||
// Remove emojis (wide Unicode range)
|
||||
const emojiPattern = /[\u{1F600}-\u{1F64F}\u{1F300}-\u{1F5FF}\u{1F680}-\u{1F6FF}\u{1F700}-\u{1F77F}\u{1F780}-\u{1F7FF}\u{1F800}-\u{1F8FF}\u{1F900}-\u{1F9FF}\u{1FA00}-\u{1FA6F}\u{1FA70}-\u{1FAFF}\u{2600}-\u{26FF}\u{2700}-\u{27BF}\u{1F1E6}-\u{1F1FF}]+/gu;
|
||||
text = text.replace(emojiPattern, '');
|
||||
|
||||
// Replace various dashes and symbols
|
||||
const replacements = {
|
||||
'–': '-',
|
||||
'‑': '-',
|
||||
'—': '-',
|
||||
'¯': ' ',
|
||||
'_': ' ',
|
||||
'"': '"',
|
||||
'"': '"',
|
||||
'\u2018': "'", // left single quote
|
||||
'\u2019': "'", // right single quote
|
||||
'´': "'",
|
||||
'`': "'",
|
||||
'[': ' ',
|
||||
']': ' ',
|
||||
'|': ' ',
|
||||
'/': ' ',
|
||||
'#': ' ',
|
||||
'→': ' ',
|
||||
'←': ' ',
|
||||
};
|
||||
for (const [k, v] of Object.entries(replacements)) {
|
||||
text = text.replaceAll(k, v);
|
||||
}
|
||||
|
||||
// Remove combining diacritics // FIXME: this should be fixed for non-English languages
|
||||
text = text.replace(/[\u0302\u0303\u0304\u0305\u0306\u0307\u0308\u030A\u030B\u030C\u0327\u0328\u0329\u032A\u032B\u032C\u032D\u032E\u032F]/g, '');
|
||||
|
||||
// Remove special symbols
|
||||
text = text.replace(/[♥☆♡©\\]/g, '');
|
||||
|
||||
// Replace known expressions
|
||||
const exprReplacements = {
|
||||
'@': ' at ',
|
||||
'e.g.,': 'for example, ',
|
||||
'i.e.,': 'that is, ',
|
||||
};
|
||||
for (const [k, v] of Object.entries(exprReplacements)) {
|
||||
text = text.replaceAll(k, v);
|
||||
}
|
||||
|
||||
// Fix spacing around punctuation
|
||||
text = text.replace(/ ,/g, ',');
|
||||
text = text.replace(/ \./g, '.');
|
||||
text = text.replace(/ !/g, '!');
|
||||
text = text.replace(/ \?/g, '?');
|
||||
text = text.replace(/ ;/g, ';');
|
||||
text = text.replace(/ :/g, ':');
|
||||
text = text.replace(/ '/g, "'");
|
||||
|
||||
// Remove duplicate quotes
|
||||
while (text.includes('""')) {
|
||||
text = text.replace('""', '"');
|
||||
}
|
||||
while (text.includes("''")) {
|
||||
text = text.replace("''", "'");
|
||||
}
|
||||
while (text.includes('``')) {
|
||||
text = text.replace('``', '`');
|
||||
}
|
||||
|
||||
// Remove extra spaces
|
||||
text = text.replace(/\s+/g, ' ').trim();
|
||||
|
||||
// If text doesn't end with punctuation, quotes, or closing brackets, add a period
|
||||
if (!/[.!?;:,'\"')\]}…。」』】〉》›»]$/.test(text)) {
|
||||
text += '.';
|
||||
}
|
||||
|
||||
return text;
|
||||
}
|
||||
|
||||
_textToUnicodeValues(text) {
|
||||
|
||||
@@ -4,6 +4,8 @@ This guide provides examples for running TTS inference using `example_onnx.py`.
|
||||
|
||||
## 📰 Update News
|
||||
|
||||
**2025.11.23** - Enhanced text preprocessing with comprehensive normalization, emoji removal, symbol replacement, and punctuation handling for improved synthesis quality.
|
||||
|
||||
**2025.11.19** - Added `--speed` parameter to control speech synthesis speed. Adjust the speed factor to make speech faster or slower while maintaining natural quality.
|
||||
|
||||
**2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
|
||||
|
||||
+91
-1
@@ -8,6 +8,8 @@ from unicodedata import normalize
|
||||
import numpy as np
|
||||
import onnxruntime as ort
|
||||
|
||||
import re
|
||||
|
||||
|
||||
class UnicodeProcessor:
|
||||
def __init__(self, unicode_indexer_path: str):
|
||||
@@ -15,8 +17,96 @@ class UnicodeProcessor:
|
||||
self.indexer = json.load(f)
|
||||
|
||||
def _preprocess_text(self, text: str) -> str:
|
||||
# TODO: add more preprocessing
|
||||
# TODO: Need advanced normalizer for better performance
|
||||
text = normalize("NFKD", text)
|
||||
|
||||
# FIXME: this should be fixed for non-English languages
|
||||
|
||||
# Remove emojis (wide Unicode range)
|
||||
emoji_pattern = re.compile(
|
||||
"[\U0001f600-\U0001f64f" # emoticons
|
||||
"\U0001f300-\U0001f5ff" # symbols & pictographs
|
||||
"\U0001f680-\U0001f6ff" # transport & map symbols
|
||||
"\U0001f700-\U0001f77f"
|
||||
"\U0001f780-\U0001f7ff"
|
||||
"\U0001f800-\U0001f8ff"
|
||||
"\U0001f900-\U0001f9ff"
|
||||
"\U0001fa00-\U0001fa6f"
|
||||
"\U0001fa70-\U0001faff"
|
||||
"\u2600-\u26ff"
|
||||
"\u2700-\u27bf"
|
||||
"\U0001f1e6-\U0001f1ff]+",
|
||||
flags=re.UNICODE,
|
||||
)
|
||||
text = emoji_pattern.sub("", text)
|
||||
|
||||
# Replace various dashes and symbols
|
||||
replacements = {
|
||||
"–": "-",
|
||||
"‑": "-",
|
||||
"—": "-",
|
||||
"¯": " ",
|
||||
"_": " ",
|
||||
"“": '"',
|
||||
"”": '"',
|
||||
"‘": "'",
|
||||
"’": "'",
|
||||
"´": "'",
|
||||
"`": "'",
|
||||
"[": " ",
|
||||
"]": " ",
|
||||
"|": " ",
|
||||
"/": " ",
|
||||
"#": " ",
|
||||
"→": " ",
|
||||
"←": " ",
|
||||
}
|
||||
for k, v in replacements.items():
|
||||
text = text.replace(k, v)
|
||||
|
||||
# Remove combining diacritics # FIXME: this should be fixed for non-English languages
|
||||
text = re.sub(
|
||||
r"[\u0302\u0303\u0304\u0305\u0306\u0307\u0308\u030A\u030B\u030C\u0327\u0328\u0329\u032A\u032B\u032C\u032D\u032E\u032F]",
|
||||
"",
|
||||
text,
|
||||
)
|
||||
|
||||
# Remove special symbols
|
||||
text = re.sub(r"[♥☆♡©\\]", "", text)
|
||||
|
||||
# Replace known expressions
|
||||
expr_replacements = {
|
||||
"@": " at ",
|
||||
"e.g.,": "for example, ",
|
||||
"i.e.,": "that is, ",
|
||||
}
|
||||
for k, v in expr_replacements.items():
|
||||
text = text.replace(k, v)
|
||||
|
||||
# Fix spacing around punctuation
|
||||
text = re.sub(r" ,", ",", text)
|
||||
text = re.sub(r" \.", ".", text)
|
||||
text = re.sub(r" !", "!", text)
|
||||
text = re.sub(r" \?", "?", text)
|
||||
text = re.sub(r" ;", ";", text)
|
||||
text = re.sub(r" :", ":", text)
|
||||
text = re.sub(r" '", "'", text)
|
||||
|
||||
# Remove duplicate quotes
|
||||
while '""' in text:
|
||||
text = text.replace('""', '"')
|
||||
while "''" in text:
|
||||
text = text.replace("''", "'")
|
||||
while "``" in text:
|
||||
text = text.replace("``", "`")
|
||||
|
||||
# Remove extra spaces
|
||||
text = re.sub(r"\s+", " ", text).strip()
|
||||
|
||||
# If text doesn't end with punctuation, quotes, or closing brackets, add a period
|
||||
if not re.search(r"[.!?;:,'\"')\]}…。」』】〉》›»]$", text):
|
||||
text += "."
|
||||
|
||||
return text
|
||||
|
||||
def _get_text_mask(self, text_ids_lengths: np.ndarray) -> np.ndarray:
|
||||
|
||||
@@ -4,6 +4,8 @@ This guide provides examples for running TTS inference using Rust.
|
||||
|
||||
## 📰 Update News
|
||||
|
||||
**2025.11.23** - Enhanced text preprocessing with comprehensive normalization, emoji removal, symbol replacement, and punctuation handling for improved synthesis quality.
|
||||
|
||||
**2025.11.19** - Added `--speed` parameter to control speech synthesis speed (default: 1.05, recommended range: 0.9-1.5).
|
||||
|
||||
**2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
|
||||
|
||||
+89
-1
@@ -113,7 +113,95 @@ impl UnicodeProcessor {
|
||||
}
|
||||
|
||||
pub fn preprocess_text(text: &str) -> String {
|
||||
text.nfkd().collect()
|
||||
// TODO: Need advanced normalizer for better performance
|
||||
let mut text: String = text.nfkd().collect();
|
||||
|
||||
// FIXME: this should be fixed for non-English languages
|
||||
|
||||
// Remove emojis (wide Unicode range)
|
||||
let emoji_pattern = Regex::new(r"[\x{1F600}-\x{1F64F}\x{1F300}-\x{1F5FF}\x{1F680}-\x{1F6FF}\x{1F700}-\x{1F77F}\x{1F780}-\x{1F7FF}\x{1F800}-\x{1F8FF}\x{1F900}-\x{1F9FF}\x{1FA00}-\x{1FA6F}\x{1FA70}-\x{1FAFF}\x{2600}-\x{26FF}\x{2700}-\x{27BF}\x{1F1E6}-\x{1F1FF}]+").unwrap();
|
||||
text = emoji_pattern.replace_all(&text, "").to_string();
|
||||
|
||||
// Replace various dashes and symbols
|
||||
let replacements = [
|
||||
("–", "-"), // en dash
|
||||
("‑", "-"), // non-breaking hyphen
|
||||
("—", "-"), // em dash
|
||||
("¯", " "), // macron
|
||||
("_", " "), // underscore
|
||||
("\u{201C}", "\""), // left double quote
|
||||
("\u{201D}", "\""), // right double quote
|
||||
("\u{2018}", "'"), // left single quote
|
||||
("\u{2019}", "'"), // right single quote
|
||||
("´", "'"), // acute accent
|
||||
("`", "'"), // grave accent
|
||||
("[", " "), // left bracket
|
||||
("]", " "), // right bracket
|
||||
("|", " "), // vertical bar
|
||||
("/", " "), // slash
|
||||
("#", " "), // hash
|
||||
("→", " "), // right arrow
|
||||
("←", " "), // left arrow
|
||||
];
|
||||
|
||||
for (from, to) in &replacements {
|
||||
text = text.replace(from, to);
|
||||
}
|
||||
|
||||
// Remove combining diacritics // FIXME: this should be fixed for non-English languages
|
||||
let diacritics_pattern = Regex::new(r"[\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{030A}\u{030B}\u{030C}\u{0327}\u{0328}\u{0329}\u{032A}\u{032B}\u{032C}\u{032D}\u{032E}\u{032F}]").unwrap();
|
||||
text = diacritics_pattern.replace_all(&text, "").to_string();
|
||||
|
||||
// Remove special symbols
|
||||
let special_symbols = ["♥", "☆", "♡", "©", "\\"];
|
||||
for symbol in &special_symbols {
|
||||
text = text.replace(symbol, "");
|
||||
}
|
||||
|
||||
// Replace known expressions
|
||||
let expr_replacements = [
|
||||
("@", " at "),
|
||||
("e.g.,", "for example, "),
|
||||
("i.e.,", "that is, "),
|
||||
];
|
||||
|
||||
for (from, to) in &expr_replacements {
|
||||
text = text.replace(from, to);
|
||||
}
|
||||
|
||||
// Fix spacing around punctuation
|
||||
text = Regex::new(r" ,").unwrap().replace_all(&text, ",").to_string();
|
||||
text = Regex::new(r" \.").unwrap().replace_all(&text, ".").to_string();
|
||||
text = Regex::new(r" !").unwrap().replace_all(&text, "!").to_string();
|
||||
text = Regex::new(r" \?").unwrap().replace_all(&text, "?").to_string();
|
||||
text = Regex::new(r" ;").unwrap().replace_all(&text, ";").to_string();
|
||||
text = Regex::new(r" :").unwrap().replace_all(&text, ":").to_string();
|
||||
text = Regex::new(r" '").unwrap().replace_all(&text, "'").to_string();
|
||||
|
||||
// Remove duplicate quotes
|
||||
while text.contains("\"\"") {
|
||||
text = text.replace("\"\"", "\"");
|
||||
}
|
||||
while text.contains("''") {
|
||||
text = text.replace("''", "'");
|
||||
}
|
||||
while text.contains("``") {
|
||||
text = text.replace("``", "`");
|
||||
}
|
||||
|
||||
// Remove extra spaces
|
||||
text = Regex::new(r"\s+").unwrap().replace_all(&text, " ").to_string();
|
||||
text = text.trim().to_string();
|
||||
|
||||
// If text doesn't end with punctuation, quotes, or closing brackets, add a period
|
||||
if !text.is_empty() {
|
||||
let ends_with_punct = Regex::new(r#"[.!?;:,'"\u{201C}\u{201D}\u{2018}\u{2019})\]}…。」』】〉》›»]$"#).unwrap();
|
||||
if !ends_with_punct.is_match(&text) {
|
||||
text.push('.');
|
||||
}
|
||||
}
|
||||
|
||||
text
|
||||
}
|
||||
|
||||
pub fn text_to_unicode_values(text: &str) -> Vec<usize> {
|
||||
|
||||
@@ -4,6 +4,8 @@ This guide provides examples for running TTS inference using `example_onnx`.
|
||||
|
||||
## 📰 Update News
|
||||
|
||||
**2025.11.23** - Enhanced text preprocessing with comprehensive normalization, emoji removal, symbol replacement, and punctuation handling for improved synthesis quality.
|
||||
|
||||
**2025.11.19** - Added `--speed` parameter to control speech synthesis speed (default: 1.05, recommended range: 0.9-1.5).
|
||||
|
||||
**2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
|
||||
|
||||
+108
-1
@@ -72,7 +72,114 @@ class UnicodeProcessor {
|
||||
}
|
||||
|
||||
/// Normalizes raw input text before it is converted to unicode values.
///
/// Steps, in order: NFKD decomposition, removal of emoji/pictograph scalars,
/// replacement of dashes/quotes/brackets/symbols, removal of a fixed set of
/// combining diacritics and decorative symbols, expansion of `@`, `e.g.,`
/// and `i.e.,`, removal of a space before punctuation, collapsing of repeated
/// quotes and whitespace, and finally appending `.` when non-empty text does
/// not already end with punctuation, a quote or a closing bracket.
///
/// - Parameter text: The raw input string.
/// - Returns: The normalized string; empty when nothing remains after trimming.
func preprocessText(_ text: String) -> String {
    // TODO: Need advanced normalizer for better performance
    // NFKD (decomposed) is required here: the diacritic-stripping step below
    // only matches standalone combining marks, which exist only after
    // decomposition.
    var text = text.decomposedStringWithCompatibilityMapping

    // FIXME: this should be fixed for non-English languages

    // Remove emojis (wide Unicode range)
    // Swift NSRegularExpression doesn't support Unicode escapes above \uFFFF
    // Use character filtering instead
    let emojiRanges: [ClosedRange<UInt32>] = [
        0x1F600...0x1F64F,
        0x1F300...0x1F5FF,
        0x1F680...0x1F6FF,
        0x1F700...0x1F77F,
        0x1F780...0x1F7FF,
        0x1F800...0x1F8FF,
        0x1F900...0x1F9FF,
        0x1FA00...0x1FA6F,
        0x1FA70...0x1FAFF,
        0x2600...0x26FF,
        0x2700...0x27BF,
        0x1F1E6...0x1F1FF,
    ]
    text = text.unicodeScalars.filter { scalar in
        !emojiRanges.contains { $0.contains(scalar.value) }
    }.map { String($0) }.joined()

    // Replace various dashes and symbols.
    // An ordered array keeps the replacement order deterministic.
    let replacements: [(String, String)] = [
        ("–", "-"),         // en dash
        ("‑", "-"),         // non-breaking hyphen
        ("—", "-"),         // em dash
        ("¯", " "),         // macron
        ("_", " "),         // underscore
        ("\u{201C}", "\""), // left double quote
        ("\u{201D}", "\""), // right double quote
        ("\u{2018}", "'"),  // left single quote
        ("\u{2019}", "'"),  // right single quote
        ("´", "'"),         // acute accent
        ("`", "'"),         // grave accent
        ("[", " "),         // left bracket
        ("]", " "),         // right bracket
        ("|", " "),         // vertical bar
        ("/", " "),         // slash
        ("#", " "),         // hash
        ("→", " "),         // right arrow
        ("←", " "),         // left arrow
    ]
    for (old, new) in replacements {
        text = text.replacingOccurrences(of: old, with: new)
    }

    // Remove combining diacritics // FIXME: this should be fixed for non-English languages
    let diacriticsPattern = try! NSRegularExpression(pattern: "[\\u0302\\u0303\\u0304\\u0305\\u0306\\u0307\\u0308\\u030A\\u030B\\u030C\\u0327\\u0328\\u0329\\u032A\\u032B\\u032C\\u032D\\u032E\\u032F]")
    let diacriticsRange = NSRange(text.startIndex..., in: text)
    text = diacriticsPattern.stringByReplacingMatches(in: text, range: diacriticsRange, withTemplate: "")

    // Remove special symbols
    let specialSymbols = ["♥", "☆", "♡", "©", "\\"]
    for symbol in specialSymbols {
        text = text.replacingOccurrences(of: symbol, with: "")
    }

    // Replace known expressions
    let exprReplacements: [(String, String)] = [
        ("@", " at "),
        ("e.g.,", "for example, "),
        ("i.e.,", "that is, "),
    ]
    for (old, new) in exprReplacements {
        text = text.replacingOccurrences(of: old, with: new)
    }

    // Fix spacing around punctuation: drop a space directly before each mark
    for mark in [",", ".", "!", "?", ";", ":", "'"] {
        text = text.replacingOccurrences(of: " " + mark, with: mark)
    }

    // Remove duplicate quotes; loop because one pass only halves a run
    while text.contains("\"\"") {
        text = text.replacingOccurrences(of: "\"\"", with: "\"")
    }
    while text.contains("''") {
        text = text.replacingOccurrences(of: "''", with: "'")
    }
    while text.contains("``") {
        text = text.replacingOccurrences(of: "``", with: "`")
    }

    // Remove extra spaces
    let whitespacePattern = try! NSRegularExpression(pattern: "\\s+")
    let whitespaceRange = NSRange(text.startIndex..., in: text)
    text = whitespacePattern.stringByReplacingMatches(in: text, range: whitespaceRange, withTemplate: " ")
    text = text.trimmingCharacters(in: .whitespacesAndNewlines)

    // If text doesn't end with punctuation, quotes, or closing brackets, add a period
    if !text.isEmpty {
        let punctPattern = try! NSRegularExpression(pattern: "[.!?;:,'\"\\u201C\\u201D\\u2018\\u2019)\\]}…。」』】〉》›»]$")
        let punctRange = NSRange(text.startIndex..., in: text)
        if punctPattern.firstMatch(in: text, range: punctRange) == nil {
            text += "."
        }
    }

    return text
}
|
||||
|
||||
func lengthToMask(_ lengths: [Int], maxLen: Int? = nil) -> [[[Float]]] {
|
||||
|
||||
@@ -4,6 +4,8 @@ This example demonstrates how to use Supertonic in a web browser using ONNX Runt
|
||||
|
||||
## 📰 Update News
|
||||
|
||||
**2025.11.23** - Enhanced text preprocessing with comprehensive normalization, emoji removal, symbol replacement, and punctuation handling for improved synthesis quality.
|
||||
|
||||
**2025.11.19** - Added speed control slider to adjust speech synthesis speed (default: 1.05, recommended range: 0.9-1.5).
|
||||
|
||||
**2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
|
||||
|
||||
+79
-1
@@ -28,7 +28,85 @@ export class UnicodeProcessor {
|
||||
}
|
||||
|
||||
preprocessText(text) {
|
||||
return text.normalize('NFKC');
|
||||
// TODO: Need advanced normalizer for better performance
|
||||
text = text.normalize('NFKD');
|
||||
|
||||
// FIXME: this should be fixed for non-English languages
|
||||
|
||||
// Remove emojis (wide Unicode range)
|
||||
const emojiPattern = /[\u{1F600}-\u{1F64F}\u{1F300}-\u{1F5FF}\u{1F680}-\u{1F6FF}\u{1F700}-\u{1F77F}\u{1F780}-\u{1F7FF}\u{1F800}-\u{1F8FF}\u{1F900}-\u{1F9FF}\u{1FA00}-\u{1FA6F}\u{1FA70}-\u{1FAFF}\u{2600}-\u{26FF}\u{2700}-\u{27BF}\u{1F1E6}-\u{1F1FF}]+/gu;
|
||||
text = text.replace(emojiPattern, '');
|
||||
|
||||
// Replace various dashes and symbols
|
||||
const replacements = {
|
||||
'–': '-',
|
||||
'‑': '-',
|
||||
'—': '-',
|
||||
'¯': ' ',
|
||||
'_': ' ',
|
||||
'"': '"',
|
||||
'"': '"',
|
||||
'\u2018': "'", // left single quote
|
||||
'\u2019': "'", // right single quote
|
||||
'´': "'",
|
||||
'`': "'",
|
||||
'[': ' ',
|
||||
']': ' ',
|
||||
'|': ' ',
|
||||
'/': ' ',
|
||||
'#': ' ',
|
||||
'→': ' ',
|
||||
'←': ' ',
|
||||
};
|
||||
for (const [k, v] of Object.entries(replacements)) {
|
||||
text = text.replaceAll(k, v);
|
||||
}
|
||||
|
||||
// Remove combining diacritics // FIXME: this should be fixed for non-English languages
|
||||
text = text.replace(/[\u0302\u0303\u0304\u0305\u0306\u0307\u0308\u030A\u030B\u030C\u0327\u0328\u0329\u032A\u032B\u032C\u032D\u032E\u032F]/g, '');
|
||||
|
||||
// Remove special symbols
|
||||
text = text.replace(/[♥☆♡©\\]/g, '');
|
||||
|
||||
// Replace known expressions
|
||||
const exprReplacements = {
|
||||
'@': ' at ',
|
||||
'e.g.,': 'for example, ',
|
||||
'i.e.,': 'that is, ',
|
||||
};
|
||||
for (const [k, v] of Object.entries(exprReplacements)) {
|
||||
text = text.replaceAll(k, v);
|
||||
}
|
||||
|
||||
// Fix spacing around punctuation
|
||||
text = text.replace(/ ,/g, ',');
|
||||
text = text.replace(/ \./g, '.');
|
||||
text = text.replace(/ !/g, '!');
|
||||
text = text.replace(/ \?/g, '?');
|
||||
text = text.replace(/ ;/g, ';');
|
||||
text = text.replace(/ :/g, ':');
|
||||
text = text.replace(/ '/g, "'");
|
||||
|
||||
// Remove duplicate quotes
|
||||
while (text.includes('""')) {
|
||||
text = text.replace('""', '"');
|
||||
}
|
||||
while (text.includes("''")) {
|
||||
text = text.replace("''", "'");
|
||||
}
|
||||
while (text.includes('``')) {
|
||||
text = text.replace('``', '`');
|
||||
}
|
||||
|
||||
// Remove extra spaces
|
||||
text = text.replace(/\s+/g, ' ').trim();
|
||||
|
||||
// If text doesn't end with punctuation, quotes, or closing brackets, add a period
|
||||
if (!/[.!?;:,'\"')\]}…。」』】〉》›»]$/.test(text)) {
|
||||
text += '.';
|
||||
}
|
||||
|
||||
return text;
|
||||
}
|
||||
|
||||
getTextMask(textIdsLengths) {
|
||||
|
||||
Reference in New Issue
Block a user