Moved dictionaries manifest into an MTP-fetched pinned post.

This commit is contained in:
23rd
2026-04-20 18:34:56 +03:00
parent 56851519da
commit 5d768d81b0
5 changed files with 921 additions and 81 deletions
@@ -344,24 +344,44 @@ void Inner::setupContent(
const auto queryStream = content->lifetime()
.make_state<rpl::event_stream<QStringView>>();
for (const auto &dict : Spellchecker::Dictionaries()) {
const auto id = dict.id;
const auto row = AddButtonWithLoader(
content,
session,
dict,
ranges::contains(enabledDictionaries, id),
queryStream->events());
row->toggledValue(
) | rpl::on_next([=](auto enabled) {
if (enabled) {
_enabledRows.push_back(id);
} else {
auto &rows = _enabledRows;
rows.erase(ranges::remove(rows, id), end(rows));
}
}, row->lifetime());
}
// Rows are created once, when Spellchecker::Dictionaries() becomes
// non-empty. Manifest is fetched lazily and may arrive after the box
// opens, so we subscribe to DictionariesChanged and populate rows
// then if we haven't already.
const auto built = content->lifetime().make_state<bool>(false);
const auto buildRows = [=] {
if (*built) {
return;
}
const auto dicts = Spellchecker::Dictionaries();
if (dicts.empty()) {
return;
}
*built = true;
for (const auto &dict : dicts) {
const auto id = dict.id;
const auto row = AddButtonWithLoader(
content,
session,
dict,
ranges::contains(enabledDictionaries, id),
queryStream->events());
row->toggledValue(
) | rpl::on_next([=](auto enabled) {
if (enabled) {
_enabledRows.push_back(id);
} else {
auto &rows = _enabledRows;
rows.erase(ranges::remove(rows, id), end(rows));
}
}, row->lifetime());
}
};
buildRows();
Spellchecker::DictionariesChanged(
) | rpl::on_next(buildRows, content->lifetime());
Spellchecker::RefreshDictionariesManifest(session);
_queryCallback = [=](const QString &query) {
if (query.size() >= kMaxQueryLength) {
@@ -10,6 +10,7 @@ https://github.com/telegramdesktop/tdesktop/blob/master/LEGAL
#ifndef TDESKTOP_DISABLE_SPELLCHECK
#include "base/platform/base_platform_info.h"
#include "base/weak_ptr.h"
#include "base/zlib_help.h"
#include "data/data_session.h"
#include "lang/lang_instance.h"
@@ -18,12 +19,16 @@ https://github.com/telegramdesktop/tdesktop/blob/master/LEGAL
#include "main/main_domain.h"
#include "main/main_session.h"
#include "mainwidget.h"
#include "mtproto/dedicated_file_loader.h"
#include "spellcheck/platform/platform_spellcheck.h"
#include "spellcheck/spellcheck_utils.h"
#include "spellcheck/spellcheck_value.h"
#include "core/application.h"
#include "core/core_settings.h"
#include <QtCore/QJsonArray>
#include <QtCore/QJsonDocument>
#include <QtCore/QJsonObject>
#include <QtGui/QGuiApplication>
#include <QtGui/QInputMethod>
@@ -59,53 +64,187 @@ inline auto LanguageFromLocale(QLocale loc) {
: int(locLang);
}
const auto kDictionaries = {
Dict{{ QLocale::English, 649, 174'516, "English" }}, // en_US
Dict{{ QLocale::Bulgarian, 594, 229'658, "\xd0\x91\xd1\x8a\xd0\xbb\xd0\xb3\xd0\xb0\xd1\x80\xd1\x81\xd0\xba\xd0\xb8" }}, // bg_BG
Dict{{ QLocale::Catalan, 595, 417'611, "\x43\x61\x74\x61\x6c\xc3\xa0" }}, // ca_ES
Dict{{ QLocale::Czech, 596, 860'286, "\xc4\x8c\x65\xc5\xa1\x74\x69\x6e\x61" }}, // cs_CZ
Dict{{ QLocale::Welsh, 597, 177'305, "\x43\x79\x6d\x72\x61\x65\x67" }}, // cy_GB
Dict{{ QLocale::Danish, 598, 345'874, "\x44\x61\x6e\x73\x6b" }}, // da_DK
Dict{{ QLocale::German, 599, 2'412'780, "\x44\x65\x75\x74\x73\x63\x68" }}, // de_DE
Dict{{ QLocale::Greek, 600, 1'389'160, "\xce\x95\xce\xbb\xce\xbb\xce\xb7\xce\xbd\xce\xb9\xce\xba\xce\xac" }}, // el_GR
Dict{{ LWC(QLocale::English, QLocale::Australia), 601, 175'266, "English (Australia)" }}, // en_AU
Dict{{ LWC(QLocale::English, QLocale::Canada), 602, 174'295, "English (Canada)" }}, // en_CA
Dict{{ LWC(QLocale::English, QLocale::UnitedKingdom), 603, 174'433, "English (United Kingdom)" }}, // en_GB
Dict{{ QLocale::Spanish, 604, 264'717, "\x45\x73\x70\x61\xc3\xb1\x6f\x6c" }}, // es_ES
Dict{{ QLocale::Estonian, 605, 757'394, "\x45\x65\x73\x74\x69" }}, // et_EE
Dict{{ QLocale::Persian, 606, 333'911, "\xd9\x81\xd8\xa7\xd8\xb1\xd8\xb3\xdb\x8c" }}, // fa_IR
Dict{{ QLocale::French, 607, 321'391, "\x46\x72\x61\x6e\xc3\xa7\x61\x69\x73" }}, // fr_FR
Dict{{ QLocale::Hebrew, 608, 622'550, "\xd7\xa2\xd7\x91\xd7\xa8\xd7\x99\xd7\xaa" }}, // he_IL
Dict{{ QLocale::Hindi, 609, 56'105, "\xe0\xa4\xb9\xe0\xa4\xbf\xe0\xa4\xa8\xe0\xa5\x8d\xe0\xa4\xa6\xe0\xa5\x80" }}, // hi_IN
Dict{{ QLocale::Croatian, 610, 668'876, "\x48\x72\x76\x61\x74\x73\x6b\x69" }}, // hr_HR
Dict{{ QLocale::Hungarian, 611, 660'402, "\x4d\x61\x67\x79\x61\x72" }}, // hu_HU
Dict{{ QLocale::Armenian, 612, 928'746, "\xd5\x80\xd5\xa1\xd5\xb5\xd5\xa5\xd6\x80\xd5\xa5\xd5\xb6" }}, // hy_AM
Dict{{ QLocale::Indonesian, 613, 100'134, "\x49\x6e\x64\x6f\x6e\x65\x73\x69\x61" }}, // id_ID
Dict{{ QLocale::Italian, 614, 324'613, "\x49\x74\x61\x6c\x69\x61\x6e\x6f" }}, // it_IT
Dict{{ QLocale::Korean, 615, 1'256'987, "\xed\x95\x9c\xea\xb5\xad\xec\x96\xb4" }}, // ko_KR
Dict{{ QLocale::Lithuanian, 616, 267'427, "\x4c\x69\x65\x74\x75\x76\x69\xc5\xb3" }}, // lt_LT
Dict{{ QLocale::Latvian, 617, 641'602, "\x4c\x61\x74\x76\x69\x65\xc5\xa1\x75" }}, // lv_LV
Dict{{ QLocale::NorwegianBokmal, 618, 588'650, "\x4e\x6f\x72\x73\x6b" }}, // nb_NO
Dict{{ QLocale::Dutch, 619, 743'406, "\x4e\x65\x64\x65\x72\x6c\x61\x6e\x64\x73" }}, // nl_NL
Dict{{ QLocale::Polish, 620, 1'015'747, "\x50\x6f\x6c\x73\x6b\x69" }}, // pl_PL
Dict{{ QLocale::Portuguese, 621, 1'231'999, "\x50\x6f\x72\x74\x75\x67\x75\xc3\xaa\x73 (Brazil)" }}, // pt_BR
Dict{{ LWC(QLocale::Portuguese, QLocale::Portugal), 622, 138'571, "\x50\x6f\x72\x74\x75\x67\x75\xc3\xaa\x73" }}, // pt_PT
Dict{{ QLocale::Romanian, 623, 455'643, "\x52\x6f\x6d\xc3\xa2\x6e\xc4\x83" }}, // ro_RO
Dict{{ QLocale::Russian, 624, 463'194, "\xd0\xa0\xd1\x83\xd1\x81\xd1\x81\xd0\xba\xd0\xb8\xd0\xb9" }}, // ru_RU
Dict{{ QLocale::Slovak, 625, 525'328, "\x53\x6c\x6f\x76\x65\x6e\xc4\x8d\x69\x6e\x61" }}, // sk_SK
Dict{{ QLocale::Slovenian, 626, 1'143'710, "\x53\x6c\x6f\x76\x65\x6e\xc5\xa1\xc4\x8d\x69\x6e\x61" }}, // sl_SI
Dict{{ QLocale::Albanian, 627, 583'412, "\x53\x68\x71\x69\x70" }}, // sq_AL
Dict{{ QLocale::Swedish, 628, 593'877, "\x53\x76\x65\x6e\x73\x6b\x61" }}, // sv_SE
Dict{{ QLocale::Tamil, 629, 323'193, "\xe0\xae\xa4\xe0\xae\xae\xe0\xae\xbf\xe0\xae\xb4\xe0\xaf\x8d" }}, // ta_IN
Dict{{ QLocale::Tajik, 630, 369'931, "\xd0\xa2\xd0\xbe\xd2\xb7\xd0\xb8\xd0\xba\xd3\xa3" }}, // tg_TG
Dict{{ QLocale::Turkish, 631, 4'301'099, "\x54\xc3\xbc\x72\x6b\xc3\xa7\x65" }}, // tr_TR
Dict{{ QLocale::Ukrainian, 632, 445'711, "\xd0\xa3\xd0\xba\xd1\x80\xd0\xb0\xd1\x97\xd0\xbd\xd1\x81\xd1\x8c\xd0\xba\xd0\xb0" }}, // uk_UA
Dict{{ QLocale::Vietnamese, 633, 12'949, "\x54\x69\xe1\xba\xbf\x6e\x67\x20\x56\x69\xe1\xbb\x87\x74" }}, // vi_VN
// The Tajik code is 'tg_TG' in Chromium, but QT has only 'tg_TJ'.
// Channel username and message id of the post whose text is the JSON
// dictionaries manifest fetched by DictManifestLoader.
// The update_dictionaries.py maintenance script rewrites both literals in
// place with a regex, so keep the `constexpr auto <name> = <literal>;` shape.
// NOTE(review): a post id of 0 looks like a placeholder waiting for that
// script to patch in the real id - confirm before shipping.
constexpr auto kDictionariesManifestChannel = "tdhbcfeed"_cs;
constexpr auto kDictionariesManifestPostId = 0;
// Runtime-loaded dictionaries manifest. Kept in memory only: fetched from
// the pinned JSON post the first time something actually needs it
// (Manage Dictionaries, auto-download). Nothing persists to disk, so an
// install whose enabled dictionaries are all on disk never hits the net.
std::vector<Dict> DictionariesList;
rpl::event_stream<> DictionariesListChanged;
void EnsurePath();
// Splits a manifest "location" string of the form "<channel>#<postId>".
//
// Returns true and fills `channel` / `postId` only when a '#' is present,
// both sides of it are non-empty and the right side parses as a positive
// integer. On failure the output arguments are left untouched.
bool ParseLocation(const QString &text, QString &channel, int &postId) {
	const auto hashAt = text.indexOf('#');
	const auto hasChannelPart = (hashAt > 0);
	const auto hasPostPart = (hashAt != text.size() - 1);
	if (!hasChannelPart || !hasPostPart) {
		return false;
	}
	auto converted = false;
	const auto id = text.mid(hashAt + 1).toInt(&converted);
	if (converted && id > 0) {
		channel = text.left(hashAt);
		postId = id;
		return true;
	}
	return false;
}
std::vector<Dict> ParseManifest(const QByteArray &bytes) {
auto result = std::vector<Dict>();
auto err = QJsonParseError();
const auto doc = QJsonDocument::fromJson(bytes, &err);
if (err.error != QJsonParseError::NoError || !doc.isObject()) {
LOG(("Spellcheck Error: manifest JSON parse failed: %1"
).arg(err.errorString()));
return result;
}
const auto list = doc.object().value(u"dictionaries"_q).toArray();
result.reserve(list.size());
for (const auto &v : list) {
const auto obj = v.toObject();
auto d = Dict();
d.id = obj.value(u"id"_q).toInt();
d.size = int64(obj.value(u"size"_q).toDouble());
d.name = obj.value(u"name"_q).toString();
const auto location = obj.value(u"location"_q).toString();
if (!d.id
|| d.name.isEmpty()
|| !ParseLocation(location, d.channel, d.postId)) {
continue;
}
result.push_back(std::move(d));
}
return result;
}
// Fetches the dictionaries manifest post over MTP in three steps:
// start() resolves the manifest channel, resolved() requests the manifest
// message, received() hands its text to apply(). Every path ends in
// finish(), which releases the global ActiveManifestLoader.
// Callbacks hold a weak pointer to the loader, hence base::has_weak_ptr.
class DictManifestLoader final : public base::has_weak_ptr {
public:
explicit DictManifestLoader(base::weak_ptr<Main::Session> session);
// Begins the fetch; calls finish() right away if the MTP instance is
// not valid.
void start();
private:
// Channel resolved: request the manifest message by its post id.
void resolved(const MTPInputChannel &channel);
// Response arrived: extract the message text and apply it.
void received(const MTPmessages_Messages &result);
// Parses the manifest and, if non-empty, replaces DictionariesList.
void apply(const QByteArray &bytes);
// Schedules the release of ActiveManifestLoader on the main thread.
void finish();
MTP::WeakInstance _mtp;
};
// The manifest fetch currently in progress, or null when none is running.
// Set by StartManifestRefresh() and reset from DictManifestLoader::finish().
std::shared_ptr<DictManifestLoader> ActiveManifestLoader;
// Builds the weak MTP instance wrapper from the (weak) session; no network
// activity happens until start() is called.
DictManifestLoader::DictManifestLoader(base::weak_ptr<Main::Session> session)
: _mtp(session) {
}
void DictManifestLoader::start() {
if (!_mtp.valid()) {
finish();
return;
}
const auto weak = base::make_weak(this);
MTP::ResolveChannel(&_mtp, kDictionariesManifestChannel.utf16(), [=](
const MTPInputChannel &channel) {
if (const auto strong = weak.get()) {
strong->resolved(channel);
}
}, [=] {
if (const auto strong = weak.get()) {
strong->finish();
}
});
}
void DictManifestLoader::resolved(const MTPInputChannel &channel) {
const auto weak = base::make_weak(this);
_mtp.send(
MTPchannels_GetMessages(
channel,
MTP_vector<MTPInputMessage>(1,
MTP_inputMessageID(
MTP_int(kDictionariesManifestPostId)))),
[=](const MTPmessages_Messages &result) {
if (const auto strong = weak.get()) {
strong->received(result);
}
},
[=](const MTP::Error &) {
if (const auto strong = weak.get()) {
strong->finish();
}
});
}
// Third step: pull the manifest message out of the response and apply its
// text. A missing or non-regular message is only logged. finish() is called
// on every path.
void DictManifestLoader::received(const MTPmessages_Messages &result) {
	const auto message = MTP::GetMessagesElement(result);
	const auto usable = message && (message->type() == mtpc_message);
	if (usable) {
		apply(message->c_message().vmessage().v);
	} else {
		LOG(("Spellcheck Error: manifest message not found."));
	}
	finish();
}
// Parses the manifest text and publishes it: a non-empty result replaces
// DictionariesList and fires DictionariesListChanged. An empty result is
// logged and the current list is kept as it is.
void DictManifestLoader::apply(const QByteArray &bytes) {
	auto manifest = ParseManifest(bytes);
	if (!manifest.empty()) {
		DictionariesList = std::move(manifest);
		DictionariesListChanged.fire({});
		return;
	}
	LOG(("Spellcheck Error: manifest empty or unparseable."));
}
// Ends the fetch by dropping the global ActiveManifestLoader reference.
// The reset is posted through crl::on_main instead of running inline;
// NOTE(review): presumably so the loader is not destroyed while one of its
// own methods is still on the stack - confirm.
void DictManifestLoader::finish() {
crl::on_main([] {
ActiveManifestLoader.reset();
});
}
// Starts a manifest fetch unless one is already in progress
// (ActiveManifestLoader is non-null), in which case this is a no-op.
void StartManifestRefresh(not_null<Main::Session*> session) {
	if (!ActiveManifestLoader) {
		ActiveManifestLoader = std::make_shared<DictManifestLoader>(
			base::make_weak(session));
		ActiveManifestLoader->start();
	}
}
// Callers waiting for the first manifest arrival in this session.
// We don't cache manifest to disk, so each cold start begins empty;
// queued callbacks fire once when the MTP fetch lands.
std::vector<Fn<void()>> ManifestPending;
rpl::lifetime ManifestPendingSubscription;
// Runs `callback` once a dictionaries manifest is available.
//
// If DictionariesList is already populated the callback runs immediately.
// Otherwise it is queued in ManifestPending; the first queued caller also
// subscribes (once) to DictionariesListChanged, and that subscription
// drains the whole queue on the next manifest arrival. A fetch is started
// in either queued case (StartManifestRefresh ignores duplicates).
void EnsureManifestThen(
		not_null<Main::Session*> session,
		Fn<void()> callback) {
	const auto manifestReady = !DictionariesList.empty();
	if (manifestReady) {
		callback();
		return;
	}
	const auto needsSubscription = ManifestPending.empty();
	ManifestPending.push_back(std::move(callback));
	if (needsSubscription) {
		const auto flush = [] {
			// Detach the queue first so callbacks may safely re-enter
			// EnsureManifestThen.
			auto waiting = std::vector<Fn<void()>>();
			waiting.swap(ManifestPending);
			for (auto &notify : waiting) {
				notify();
			}
		};
		DictionariesListChanged.events(
		) | rpl::take(1) | rpl::on_next(flush, ManifestPendingSubscription);
	}
	StartManifestRefresh(session);
}
inline auto IsSupportedLang(int lang) {
return ranges::contains(kDictionaries, lang, &Dict::id);
return ranges::contains(DictionariesList, lang, &Dict::id);
}
void EnsurePath() {
@@ -144,9 +283,12 @@ void DownloadDictionaryInBackground(
if (DictionaryExists(id)) {
auto dicts = Core::App().settings().dictionariesEnabled();
if (!ranges::contains(dicts, id)) {
if (ranges::contains(dicts, id)) {
Platform::Spellchecker::UpdateLanguages(dicts);
} else {
dicts.push_back(id);
Core::App().settings().setDictionariesEnabled(std::move(dicts));
Core::App().settings().setDictionariesEnabled(
std::move(dicts));
Core::App().saveSettingsDelayed();
}
}
@@ -230,17 +372,28 @@ void DictLoader::fail() {
}
std::vector<Dict> Dictionaries() {
return kDictionaries | ranges::to_vector;
return DictionariesList;
}
// Fires each time a freshly fetched manifest has replaced DictionariesList
// (see DictManifestLoader::apply).
rpl::producer<> DictionariesChanged() {
return DictionariesListChanged.events();
}
// Public entry point for requesting a manifest fetch; forwards to
// StartManifestRefresh, which does nothing while a fetch is already running.
void RefreshDictionariesManifest(not_null<Main::Session*> session) {
StartManifestRefresh(session);
}
int64 GetDownloadSize(int id) {
return ranges::find(kDictionaries, id, &Spellchecker::Dict::id)->size;
const auto i = ranges::find(DictionariesList, id, &Dict::id);
return (i == end(DictionariesList)) ? 0 : i->size;
}
MTP::DedicatedLoader::Location GetDownloadLocation(int id) {
const auto username = kCloudLocationUsername.utf16();
const auto i = ranges::find(kDictionaries, id, &Spellchecker::Dict::id);
return MTP::DedicatedLoader::Location{ username, i->postId };
const auto i = ranges::find(DictionariesList, id, &Dict::id);
if (i == end(DictionariesList)) {
return MTP::DedicatedLoader::Location{};
}
return MTP::DedicatedLoader::Location{ i->channel, i->postId };
}
QString DictPathByLangId(int langId) {
@@ -316,26 +469,26 @@ rpl::producer<QString> ButtonManageDictsState(
if (!Core::App().settings().spellcheckerEnabled()) {
return QString();
}
if (!Core::App().settings().dictionariesEnabled().size()) {
const auto dicts = Core::App().settings().dictionariesEnabled();
if (dicts.empty()) {
return QString();
}
const auto dicts = Core::App().settings().dictionariesEnabled();
const auto filtered = ranges::views::all(
dicts
) | ranges::views::filter(
DictionaryExists
) | ranges::to_vector;
const auto active = Platform::Spellchecker::ActiveLanguages();
return (active.size() == filtered.size())
? QString::number(filtered.size())
: tr::lng_contacts_loading(tr::now);
return (filtered.size() < dicts.size())
? tr::lng_contacts_loading(tr::now)
: QString::number(filtered.size());
};
return rpl::single(
computeString()
) | rpl::then(
rpl::merge(
Spellchecker::SupportedScriptsChanged(),
Spellchecker::DictionariesChanged(),
Core::App().settings().dictionariesEnabledChanges(
) | rpl::to_empty,
Core::App().settings().spellcheckerEnabledChanges(
@@ -418,12 +571,18 @@ void Start(not_null<Main::Session*> session) {
return;
}
const auto l = LanguageFromLocale(method->locale());
if (!IsSupportedLang(l) || DictionaryExists(l)) {
// Avoid pulling the manifest just because an input-method
// locale flipped; only fetch if we'd actually need to
// download something for the new locale.
if (DictionaryExists(l)) {
return;
}
crl::on_main(session, [=] {
EnsureManifestThen(session, crl::guard(session, [=] {
if (!IsSupportedLang(l) || DictionaryExists(l)) {
return;
}
DownloadDictionaryInBackground(session, 0, { l });
});
}));
};
QObject::connect(
method,
@@ -437,8 +596,17 @@ void Start(not_null<Main::Session*> session) {
if (!loaded) {
return;
}
DownloadDictionaryInBackground(session, 0, DefaultLanguages());
const auto enabled = settings->dictionariesEnabled();
if (!enabled.empty()
&& ranges::all_of(enabled, &DictionaryExists)) {
// Every previously-enabled dictionary is already on
// disk; no manifest fetch, no network traffic.
return;
}
EnsureManifestThen(session, crl::guard(session, [=] {
DownloadDictionaryInBackground(
session, 0, DefaultLanguages());
}));
}, lifetime);
connectInput();
@@ -19,6 +19,7 @@ class Session;
namespace Spellchecker {
// A downloadable dictionary: the cloud blob description plus the username
// of the channel that hosts the blob's post.
struct Dict : public Storage::CloudBlob::Blob {
// Channel part of the manifest "location" ("<channel>#<postId>").
QString channel;
};
int64 GetDownloadSize(int id);
@@ -33,6 +34,8 @@ bool RemoveDictionary(int langId);
bool WriteDefaultDictionary();
std::vector<Dict> Dictionaries();
rpl::producer<> DictionariesChanged();
void RefreshDictionariesManifest(not_null<Main::Session*> session);
void Start(not_null<Main::Session*> session);
[[nodiscard]] rpl::producer<QString> ButtonManageDictsState(
+649
View File
@@ -0,0 +1,649 @@
#!/usr/bin/env python3
# This file is part of Telegram Desktop,
# the official desktop application for the Telegram messaging service.
#
# For license and copyright information please follow this link:
# https://github.com/telegramdesktop/tdesktop/blob/master/LEGAL
"""
Update the Hunspell dictionaries manifest consumed by Telegram Desktop.
One end-to-end run:
1. Shallow-clone Chromium's hunspell_dictionaries repo into --cache-dir.
2. If --manifest-post-id is not provided, send an empty placeholder
message and use its id (printed so you can hardcode it in the client).
3. For each language: read raw .dic and .aff, recode to UTF-8 if needed
(rewriting the SET line), zip them under the Qt-side locale name,
upload via Bot API `sendDocument` to --channel.
4. Build a JSON manifest (each entry carries `location` =
"<blobs-channel>#<message_id>") and write it into the manifest post
via `editMessageText`.
5. Delete --cache-dir unless --keep-cache is passed.
State file tracks sha256 of the UTF-8 dic/aff pair per language so
subsequent runs re-upload only changed dictionaries.
Usage:
# Full one-shot run on a fresh channel: creates the manifest post,
# uploads every dictionary to the same channel, edits the post, and
# deletes the chromium clone.
python3 update_dictionaries.py \\
--bot-token $TG_BOT_TOKEN \\
--channel @my_test_channel \\
--state-file ./dict_state.json
# Manifest and blobs in separate channels. Blobs go to @my_blobs_channel
# (where clients will fetch them), the JSON manifest lives in the
# private coordination channel -100...1438.
python3 update_dictionaries.py \\
--bot-token $TG_BOT_TOKEN \\
--channel @my_test_channel \\
--blobs-channel @my_blobs_channel \\
--state-file ./dict_state.json
# Re-run against an existing manifest post (incremental if state file
# matches; unchanged languages are carried over without re-upload).
python3 update_dictionaries.py \\
--bot-token $TG_BOT_TOKEN \\
--channel @my_test_channel \\
--manifest-post-id 1234 \\
--state-file ./dict_state.json
# Subset / troubleshooting; keep the chromium clone between runs.
python3 update_dictionaries.py \\
--bot-token $TG_BOT_TOKEN \\
--channel @my_test_channel \\
--manifest-post-id 1234 \\
--languages en_US,ru_RU \\
--keep-cache
# Preview: fetch, recode and zip locally; no network upload, no
# manifest edit, no cache cleanup.
python3 update_dictionaries.py \\
--channel @anything \\
--dry-run \\
--languages en_US,sr
Flags:
--channel <id|@name> where the manifest post lives
(sendMessage / editMessageText target).
--blobs-channel <@name> where blob zips go (sendDocument target);
defaults to --channel. Must be @username
since the client resolves blob locations
by public username.
--manifest-post-id <N> reuse an existing manifest post; omit to
create a new placeholder automatically.
--cache-dir <path> where the shallow chromium clone lives
(default: .chromium_hunspell_cache).
--keep-cache keep --cache-dir after completion
(default: delete it).
--state-file <path> sha256/post_id/size per language for
incremental uploads.
The bot must be admin (with post/edit rights) in both --channel and
--blobs-channel. When they coincide, one admin suffices.
"""
from __future__ import annotations
import argparse
import hashlib
import io
import json
import os
import re
import shutil
import subprocess
import sys
import time
import zipfile
from pathlib import Path
from typing import Optional
import requests
CHROMIUM_REPO = (
"https://chromium.googlesource.com/chromium/deps/hunspell_dictionaries"
)
BOT_API = "https://api.telegram.org/bot{token}/{method}"
# QLocale::Language values (stable across Qt 5/6, confirmed against
# qtbase/src/corelib/text/qlocale.h for Qt 6.2 and 6.11).
LANG = {
"Afrikaans": 4,
"Albanian": 9,
"Armenian": 17,
"Bulgarian": 45,
"Catalan": 48,
"Croatian": 66,
"Czech": 67,
"Danish": 68,
"Dutch": 72,
"English": 75,
"Estonian": 78,
"Faroese": 81,
"French": 85,
"Galician": 90,
"German": 94,
"Greek": 96,
"Hebrew": 103,
"Hindi": 105,
"Hungarian": 107,
"Indonesian": 112,
"Italian": 119,
"Korean": 142,
"Latvian": 155,
"Lithuanian": 160,
"NorwegianBokmal":209,
"Persian": 228,
"Polish": 230,
"Portuguese": 231,
"Romanian": 235,
"Russian": 239,
"Serbian": 252,
"Slovak": 262,
"Slovenian": 263,
"Spanish": 270,
"Swedish": 275,
"Tajik": 282,
"Tamil": 283,
"Turkish": 298,
"Ukrainian": 303,
"Vietnamese": 310,
"Welsh": 316,
}
# QLocale::Country values.
COUNTRY = {
"Australia": 15,
"Brazil": 32,
"Canada": 41,
"Portugal": 188,
"UnitedKingdom": 246,
"UnitedStates": 248,
}
# Matches LWC() in spellchecker_common.cpp: default country collapses to
# the bare language id; otherwise language*1000 + country.
_DEFAULT_COUNTRIES = {COUNTRY["UnitedStates"], COUNTRY["Brazil"]}
def lwc(language: int, country: int) -> int:
    """Return the dictionary id for a language/country pair.

    Countries listed in ``_DEFAULT_COUNTRIES`` collapse to the bare language
    id; any other country yields ``language * 1000 + country``.
    """
    if country in _DEFAULT_COUNTRIES:
        return language
    return language * 1000 + country
# Each entry: (chromium_stem, id, qt_name, display_name)
# - chromium_stem: the filename in chromium/deps/hunspell_dictionaries (without ext)
# - id: primary key used by settings / UI (QLocale::Language or LWC())
# - qt_name: must equal QLocale(id).name() at runtime — this is what the
# client uses for both the unpack folder and the <qt_name>.dic/.aff
# lookups. QLocale::name() has the form language_TERRITORY and never
# includes a script part. Double-check when adding new entries.
# NOTE(review): "sr_Cyrl_RS" for Serbian below carries a script part, so it
# may not match QLocale(id).name() — confirm against the client.
# - display_name: shown in "Manage dictionaries" UI.
LANGUAGES = [
("en_US", LANG["English"], "en_US", "English"),
("bg_BG", LANG["Bulgarian"], "bg_BG", "\u0411\u044a\u043b\u0433\u0430\u0440\u0441\u043a\u0438"),
("ca_ES", LANG["Catalan"], "ca_ES", "Catal\u00e0"),
("cs_CZ", LANG["Czech"], "cs_CZ", "\u010ce\u0161tina"),
("cy_GB", LANG["Welsh"], "cy_GB", "Cymraeg"),
("da_DK", LANG["Danish"], "da_DK", "Dansk"),
("de_DE", LANG["German"], "de_DE", "Deutsch"),
("el_GR", LANG["Greek"], "el_GR", "\u0395\u03bb\u03bb\u03b7\u03bd\u03b9\u03ba\u03ac"),
("en_AU", lwc(LANG["English"], COUNTRY["Australia"]), "en_AU", "English (Australia)"),
("en_CA", lwc(LANG["English"], COUNTRY["Canada"]), "en_CA", "English (Canada)"),
("en_GB", lwc(LANG["English"], COUNTRY["UnitedKingdom"]), "en_GB", "English (United Kingdom)"),
("es_ES", LANG["Spanish"], "es_ES", "Espa\u00f1ol"),
("et_EE", LANG["Estonian"], "et_EE", "Eesti"),
("fa_IR", LANG["Persian"], "fa_IR", "\u0641\u0627\u0631\u0633\u06cc"),
("fr_FR", LANG["French"], "fr_FR", "Fran\u00e7ais"),
("he_IL", LANG["Hebrew"], "he_IL", "\u05e2\u05d1\u05e8\u05d9\u05ea"),
("hi_IN", LANG["Hindi"], "hi_IN", "\u0939\u093f\u0928\u094d\u0926\u0940"),
("hr_HR", LANG["Croatian"], "hr_HR", "Hrvatski"),
("hu-HU", LANG["Hungarian"], "hu_HU", "Magyar"),
("hy", LANG["Armenian"], "hy_AM", "\u0540\u0561\u0575\u0565\u0580\u0565\u0576"),
("id_ID", LANG["Indonesian"], "id_ID", "Indonesia"),
("it_IT", LANG["Italian"], "it_IT", "Italiano"),
("ko", LANG["Korean"], "ko_KR", "\ud55c\uad6d\uc5b4"),
("lt_LT", LANG["Lithuanian"], "lt_LT", "Lietuvi\u0173"),
("lv_LV", LANG["Latvian"], "lv_LV", "Latvie\u0161u"),
("nb_NO", LANG["NorwegianBokmal"], "nb_NO", "Norsk"),
("nl_NL", LANG["Dutch"], "nl_NL", "Nederlands"),
("pl_PL", LANG["Polish"], "pl_PL", "Polski"),
("pt_BR", LANG["Portuguese"], "pt_BR", "Portugu\u00eas (Brazil)"),
("pt_PT", lwc(LANG["Portuguese"], COUNTRY["Portugal"]), "pt_PT", "Portugu\u00eas"),
("ro_RO", LANG["Romanian"], "ro_RO", "Rom\u00e2n\u0103"),
("ru_RU", LANG["Russian"], "ru_RU", "\u0420\u0443\u0441\u0441\u043a\u0438\u0439"),
("sk_SK", LANG["Slovak"], "sk_SK", "Sloven\u010dina"),
("sl_SI", LANG["Slovenian"], "sl_SI", "Sloven\u0161\u010dina"),
("sq", LANG["Albanian"], "sq_AL", "Shqip"),
("sv_SE", LANG["Swedish"], "sv_SE", "Svenska"),
("ta_IN", LANG["Tamil"], "ta_IN", "\u0ba4\u0bae\u0bbf\u0bb4\u0bcd"),
("tg_TG", LANG["Tajik"], "tg_TJ", "\u0422\u043e\u04b7\u0438\u043a\u04e3"),
("tr", LANG["Turkish"], "tr_TR", "T\u00fcrk\u00e7e"),
("uk_UA", LANG["Ukrainian"], "uk_UA", "\u0423\u043a\u0440\u0430\u0457\u043d\u0441\u044c\u043a\u0430"),
("vi_VN", LANG["Vietnamese"], "vi_VN", "Ti\u1ebfng Vi\u1ec7t"),
("gl", LANG["Galician"], "gl_ES", "Galego"),
("sr", LANG["Serbian"], "sr_Cyrl_RS", "\u0421\u0440\u043f\u0441\u043a\u0438"),
# Afrikaans (af-ZA) and Faroese (fo-FO) are shipped by Chromium only
# as compiled .bdic — raw .dic/.aff are not checked in. Add them when
# an upstream Hunspell source is picked (LibreOffice, etc.).
]
def ensure_chromium_clone(cache_dir: Path) -> Path:
    """Return the path to an up-to-date shallow clone of Chromium's hunspell repo.

    An existing checkout under ``cache_dir`` is refreshed with a shallow
    fetch of ``origin main`` followed by a hard reset to ``FETCH_HEAD``. If
    that refresh fails the checkout is deleted and cloned again. Raises
    ``subprocess.CalledProcessError`` when the final ``git clone`` fails.
    """
    clone = cache_dir / "hunspell_dictionaries"
    have_checkout = clone.exists() and (clone / ".git").exists()
    if have_checkout:
        print(f" using existing clone at {clone}", flush=True)
        refresh_steps = (
            ["fetch", "--depth=1", "origin", "main"],
            ["reset", "--hard", "FETCH_HEAD"],
        )
        try:
            for step in refresh_steps:
                subprocess.run(
                    ["git", "-C", str(clone), *step],
                    check=True, capture_output=True, text=True,
                )
        except subprocess.CalledProcessError as e:
            print(f" refresh failed ({e.stderr.strip()}), recloning",
                  flush=True)
            shutil.rmtree(clone)
        else:
            return clone
    cache_dir.mkdir(parents=True, exist_ok=True)
    print(f" cloning {CHROMIUM_REPO} (shallow) → {clone}", flush=True)
    clone_command = ["git", "clone", "--depth=1", CHROMIUM_REPO, str(clone)]
    subprocess.run(clone_command, check=True)
    return clone
def read_chromium_file(clone: Path, stem: str, ext: str) -> bytes:
    """Return the raw bytes of ``<stem>.<ext>`` from the chromium checkout.

    Raises ``FileNotFoundError`` when the checkout does not contain the file.
    """
    filename = f"{stem}.{ext}"
    source = clone / filename
    if source.exists():
        return source.read_bytes()
    raise FileNotFoundError(f"{filename} not found at chromium")
# Chromium SET names → Python codec names when they differ.
_PY_CODEC_ALIAS = {
"windows-1251": "cp1251",
"windows-1252": "cp1252",
}
def _parse_aff_charset(aff: bytes) -> str:
"""Return the SET charset declared in an .aff file. Default per Hunspell
docs is ISO-8859-1 when SET is absent."""
for raw in aff.splitlines():
line = raw.strip()
if line.startswith(b"\xef\xbb\xbf"): # BOM
line = line[3:].strip()
if line.startswith(b"SET "):
return line[4:].strip().decode("ascii", errors="replace").strip()
return "ISO-8859-1"
def _normalize_charset_name(name: str) -> str:
return name.upper().replace("_", "-").replace(" ", "")
def normalize_to_utf8(dic: bytes, aff: bytes) -> tuple[bytes, bytes]:
    """Re-encode a dic/aff pair as UTF-8 according to the .aff ``SET`` charset.

    When the declared charset is already UTF-8 the very same ``dic`` and
    ``aff`` objects are returned, so callers can detect "no recode" by
    identity. Otherwise both files are decoded with the declared charset and
    re-encoded, and the first ``SET`` line is rewritten to ``SET UTF-8``
    (or one is prepended when the file has none).

    Raises ``RuntimeError`` when the charset is unknown to Python or the
    data does not decode with it.
    """
    declared = _parse_aff_charset(aff)
    already_utf8 = _normalize_charset_name(declared) in ("UTF-8", "UTF8")
    if already_utf8:
        return dic, aff

    codec = _PY_CODEC_ALIAS.get(declared, declared)
    try:
        dic_text, aff_text = dic.decode(codec), aff.decode(codec)
    except (LookupError, UnicodeDecodeError) as e:
        raise RuntimeError(
            f"cannot decode dictionary as {declared!r}: {e}") from None

    set_line = re.compile(r"^SET\s+\S+\s*$", re.MULTILINE)
    if set_line.search(aff_text) is None:
        aff_text = "SET UTF-8\n" + aff_text
    else:
        aff_text = set_line.sub("SET UTF-8", aff_text, count=1)
    return dic_text.encode("utf-8"), aff_text.encode("utf-8")
def make_zip(qt_name: str, dic: bytes, aff: bytes) -> bytes:
    """Build an in-memory deflated zip holding ``<qt_name>.dic`` and
    ``<qt_name>.aff`` (in that order) and return its bytes."""
    members = (
        (f"{qt_name}.dic", dic),
        (f"{qt_name}.aff", aff),
    )
    archive = io.BytesIO()
    with zipfile.ZipFile(archive, "w", zipfile.ZIP_DEFLATED) as bundle:
        for member_name, payload in members:
            bundle.writestr(member_name, payload)
    return archive.getvalue()
def sha256_pair(dic: bytes, aff: bytes) -> str:
    """Return the hex SHA-256 of the pair, hashed as
    ``b"dic:" + dic + b"aff:" + aff``."""
    hasher = hashlib.sha256()
    for tag, payload in ((b"dic:", dic), (b"aff:", aff)):
        hasher.update(tag)
        hasher.update(payload)
    return hasher.hexdigest()
def bot_call(token: str, method: str, *, data=None, files=None, json_body=None):
    """POST one Bot API request and return the ``result`` field of the reply.

    With ``json_body`` the request is sent as JSON (120 s timeout); otherwise
    ``data`` / ``files`` are sent as a form upload (300 s timeout). An HTTP
    429 reply is retried, up to five attempts in total, after sleeping for
    the server-provided ``retry_after`` (default 5) plus one second.

    Raises ``RuntimeError`` when the API reports a failure or every attempt
    was rate-limited. A non-JSON reply raises the HTTP error from
    ``raise_for_status()`` or, failing that, the JSON decoding error.
    """
    endpoint = BOT_API.format(token=token, method=method)
    if json_body is not None:
        post_kwargs = {"json": json_body, "timeout": 120}
    else:
        post_kwargs = {"data": data, "files": files, "timeout": 300}

    attempts_left = 5
    while attempts_left > 0:
        attempts_left -= 1
        response = requests.post(endpoint, **post_kwargs)
        if response.status_code == 429:
            wait = response.json().get("parameters", {}).get("retry_after", 5)
            print(f" rate-limited, sleeping {wait}s", flush=True)
            time.sleep(wait + 1)
            continue
        try:
            payload = response.json()
        except ValueError:
            # Prefer the HTTP error if there is one; otherwise re-raise the
            # decoding failure itself.
            response.raise_for_status()
            raise
        if not (response.ok and payload.get("ok")):
            raise RuntimeError(
                f"Bot API {method} failed ({response.status_code}): {payload}"
            )
        return payload["result"]
    raise RuntimeError(f"Bot API {method}: too many retries")
def bot_send_document(token, chat_id, filename, blob):
    """Upload ``blob`` to ``chat_id`` as a silent zip document named
    ``filename``; return ``(message_id, file_size)`` from the Bot API reply."""
    form = {"chat_id": chat_id, "disable_notification": "true"}
    attachment = {"document": (filename, blob, "application/zip")}
    message = bot_call(token, "sendDocument", data=form, files=attachment)
    return message["message_id"], message["document"]["file_size"]
def bot_edit_message_text(token, chat_id, message_id, text):
    """Replace the text of message ``message_id`` in ``chat_id``.

    A "message is not modified" failure is treated as success (a note is
    printed); any other ``RuntimeError`` from ``bot_call`` propagates.
    """
    payload = {
        "chat_id": chat_id,
        "message_id": message_id,
        "text": text,
    }
    try:
        bot_call(token, "editMessageText", json_body=payload)
    except RuntimeError as e:
        if "message is not modified" not in str(e):
            raise
        print("manifest post unchanged, skip edit", flush=True)
def bot_send_placeholder(token, chat_id):
    """Send a silent ``{}`` message to ``chat_id`` and return its message id,
    to be used as the manifest post."""
    payload = {
        "chat_id": chat_id,
        "text": "{}",
        "disable_notification": True,
    }
    message = bot_call(token, "sendMessage", json_body=payload)
    return message["message_id"]
def load_state(path: Optional[Path]) -> dict:
    """Load the per-language upload state from ``path``.

    Returns an empty dict when no path is given or the file does not exist.
    The file is always read as UTF-8 rather than the locale default: the
    state is written with ``ensure_ascii=False``, so it may hold non-ASCII
    text that a different default encoding would misread.
    """
    if path and path.exists():
        return json.loads(path.read_text(encoding="utf-8"))
    return {}
def save_state(path: Optional[Path], state: dict) -> None:
    """Write ``state`` to ``path`` as indented JSON with a trailing newline.

    Does nothing when no path is given. Parent directories are created as
    needed. The file is always written as UTF-8: with ``ensure_ascii=False``
    non-ASCII text is emitted verbatim, and the locale default encoding
    could fail on it or produce a file other machines cannot read.
    """
    if not path:
        return
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(
        json.dumps(state, ensure_ascii=False, indent=2) + "\n",
        encoding="utf-8",
    )
_DEFAULT_CLIENT_SOURCE = (
Path(__file__).resolve().parent.parent
/ "SourceFiles" / "chat_helpers" / "spellchecker_common.cpp"
)
def _client_channel_username(channel: str) -> Optional[str]:
stripped = str(channel).lstrip("@").lstrip("+")
if not stripped or stripped[0] == "-" or stripped.isdigit():
return None
return stripped
def patch_client_source(
    path: Path,
    channel_username: Optional[str],
    post_id: int,
) -> None:
    """Rewrite the manifest constants inside the client source at ``path``.

    ``kDictionariesManifestPostId`` is always set to ``post_id``;
    ``kDictionariesManifestChannel`` is set only when ``channel_username``
    is not ``None``. The file is written back only if its text changed.

    Raises ``RuntimeError`` when a constant that should be patched cannot be
    found in the file.
    """
    def substitute(source: str, constant: str, pattern: str, value: str) -> str:
        # Keep the text around the literal (groups 1 and 2); swap only the value.
        updated, hits = re.subn(
            pattern,
            lambda m: f"{m.group(1)}{value}{m.group(2)}",
            source,
            count=1,
        )
        if hits == 0:
            raise RuntimeError(f"patch: {constant} not found in {path}")
        return updated

    before = path.read_text(encoding="utf-8")
    after = before
    if channel_username is not None:
        after = substitute(
            after,
            "kDictionariesManifestChannel",
            r'(constexpr auto kDictionariesManifestChannel\s*=\s*)'
            r'"[^"]*"(_cs\s*;)',
            f'"{channel_username}"',
        )
    after = substitute(
        after,
        "kDictionariesManifestPostId",
        r'(constexpr auto kDictionariesManifestPostId\s*=\s*)\d+(\s*;)',
        f"{post_id}",
    )
    if after == before:
        print(f" {path}: constants already up to date", flush=True)
        return
    path.write_text(after, encoding="utf-8")
    summary = [f"postId={post_id}"]
    if channel_username is not None:
        summary.append(f"channel={channel_username}")
    print(f" patched {path}: {', '.join(summary)}", flush=True)
def format_manifest(entries: list[dict]) -> str:
    """Serialise manifest entries into the JSON text of the manifest post.

    The result is ``{"version":1,"dictionaries":[...]}`` with one entry per
    line for readable diffs. Entries use compact separators (no spaces after
    ``,`` or ``:``) and sorted keys, because the whole text must fit into
    the 4096-character limit of ``editMessageText``.
    """
    lines = ['{"version":1,"dictionaries":[']
    last = len(entries) - 1
    for i, e in enumerate(entries):
        encoded = json.dumps(
            e,
            ensure_ascii=False,
            sort_keys=True,
            separators=(",", ":"),
        )
        # Every entry but the last is followed by a comma.
        lines.append(encoded if i == last else encoded + ",")
    lines.append("]}")
    return "\n".join(lines)
def main():
ap = argparse.ArgumentParser()
ap.add_argument("--bot-token", default=os.environ.get("TG_BOT_TOKEN"),
help="Bot API token (or via TG_BOT_TOKEN env)")
ap.add_argument("--channel", required=True,
help="@username or numeric chat_id of the channel"
" that holds the manifest post (sendMessage /"
" editMessageText target)")
ap.add_argument("--blobs-channel", default=None,
help="@username of the channel blobs are uploaded"
" into via sendDocument; defaults to --channel"
" with any leading @ stripped. Must be a public"
" username — clients resolve locations by it")
ap.add_argument("--manifest-post-id", type=int, default=None,
help="reuse this message_id; if omitted, sends a new"
" placeholder first and uses its id (prints so"
" you can hardcode it in the client)")
ap.add_argument("--state-file", type=Path, default=None,
help="path to persist sha/post_id/size per language"
" for incremental re-uploads")
ap.add_argument("--languages", default="",
help="comma-separated chromium stems to restrict to")
ap.add_argument("--dry-run", action="store_true",
help="fetch and zip but do not upload, edit, or clean")
ap.add_argument("--cache-dir", type=Path,
default=Path(".chromium_hunspell_cache"),
help="directory for the shallow chromium clone")
ap.add_argument("--keep-cache", action="store_true",
help="keep --cache-dir after completion (default:"
" delete the chromium clone when done)")
ap.add_argument("--client-source", type=Path,
default=_DEFAULT_CLIENT_SOURCE,
help="path to spellchecker_common.cpp; after a"
" successful manifest edit the script rewrites"
" kDictionariesManifestChannel and"
" kDictionariesManifestPostId in place")
ap.add_argument("--skip-client-patch", action="store_true",
help="do not rewrite manifest constants in"
" --client-source")
args = ap.parse_args()
if not args.dry_run and not args.bot_token:
sys.exit("error: --bot-token or TG_BOT_TOKEN env required")
blobs_target = args.blobs_channel or args.channel
blobs_username = str(blobs_target).lstrip("@").lstrip("+")
if not blobs_username or blobs_username.startswith("-"):
sys.exit("error: blobs channel must be @username (clients resolve"
" locations by public username, not chat_id). Pass"
" --blobs-channel @name when --channel is numeric.")
manifest_post_id = args.manifest_post_id
if manifest_post_id is None and not args.dry_run:
manifest_post_id = bot_send_placeholder(
args.bot_token, args.channel)
print(f"created manifest placeholder, message_id="
f"{manifest_post_id}", flush=True)
print(f"hardcode in client: kDictionariesManifestPostId = "
f"{manifest_post_id}", flush=True)
def location(post_id: int) -> str:
    """Build the manifest ``location`` value for a blob post.

    The result has the form ``<blobs_username>#<post_id>``, where
    ``blobs_username`` is taken from the enclosing scope (the blobs
    channel name with its leading ``@``/``+`` characters stripped).
    """
    return "#".join((blobs_username, str(post_id)))
# Bot uploads go to the blobs channel, which may or may not equal
# --channel. Use the raw --blobs-channel value if supplied, otherwise
# fall back to --channel as-is (not the stripped blobs_username).
blobs_chat = args.blobs_channel or args.channel
filter_set = {s for s in args.languages.split(",") if s}
state = load_state(args.state_file)
manifest_entries = []
clone = ensure_chromium_clone(args.cache_dir)
for stem, lang_id, qt_name, display in LANGUAGES:
if filter_set and stem not in filter_set:
prev = state.get(stem)
if prev:
manifest_entries.append({
"id": lang_id,
"name": display,
"location": location(prev["post_id"]),
"size": prev["size"],
})
continue
print(f"[{stem}{qt_name}]", flush=True)
try:
dic_raw = read_chromium_file(clone, stem, "dic")
aff_raw = read_chromium_file(clone, stem, "aff")
except FileNotFoundError as e:
print(f" skip: {e}", flush=True)
continue
try:
dic, aff = normalize_to_utf8(dic_raw, aff_raw)
except RuntimeError as e:
print(f" skip: {e}", flush=True)
continue
if dic is not dic_raw:
print(f" recoded to UTF-8 from "
f"{_parse_aff_charset(aff_raw)}", flush=True)
digest = sha256_pair(dic, aff)
prev = state.get(stem)
if (prev
and prev.get("sha256") == digest
and prev.get("qt_name") == qt_name
and not args.dry_run):
print(f" unchanged (sha {digest[:8]}), carrying postId="
f"{prev['post_id']}", flush=True)
manifest_entries.append({
"id": lang_id,
"name": display,
"location": location(prev["post_id"]),
"size": prev["size"],
})
continue
blob = make_zip(qt_name, dic, aff)
print(f" zipped: dic={len(dic):,} aff={len(aff):,} "
f"zip={len(blob):,}", flush=True)
if args.dry_run:
manifest_entries.append({
"id": lang_id,
"name": display,
"location": location(prev["post_id"] if prev else 0),
"size": len(blob),
})
continue
post_id, size = bot_send_document(
args.bot_token, blobs_chat, qt_name, blob)
print(f" uploaded: postId={post_id} size={size}", flush=True)
state[stem] = {
"sha256": digest,
"post_id": post_id,
"size": size,
"qt_name": qt_name,
}
manifest_entries.append({
"id": lang_id,
"name": display,
"location": location(post_id),
"size": size,
})
manifest_text = format_manifest(manifest_entries)
print(f"\nmanifest: {len(manifest_entries)} entries, "
f"{len(manifest_text):,} chars", flush=True)
if args.dry_run:
print("--- manifest (dry-run) ---")
print(manifest_text)
return
bot_edit_message_text(
args.bot_token, args.channel,
manifest_post_id, manifest_text)
print(f"manifest post {manifest_post_id} updated", flush=True)
if not args.skip_client_patch:
channel_username = _client_channel_username(args.channel)
if channel_username is None:
print(f" --channel {args.channel!r} is not a @username;"
f" updating only kDictionariesManifestPostId in"
f" {args.client_source}", flush=True)
patch_client_source(
args.client_source, channel_username, manifest_post_id)
save_state(args.state_file, state)
if not args.keep_cache and args.cache_dir.exists():
print(f"removing {args.cache_dir}", flush=True)
shutil.rmtree(args.cache_dir)
# Script entry point: run the uploader only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()