libnm: make caching of encodings in nm_utils_ssid_to_utf8() thread safe

libnm's data structures (like NMConnection) are commonly not thread-safe.
However, it must be possible for operations to work on *different* data
in a thread-safe manner. That means we need to take care of our global
variables.

nm_utils_ssid_to_utf8() uses a list of encodings, which gets cached.

- replace the GHashTables with a static list. Since it doesn't cost
  anything, make the list sorted and look it up via binary search.
This commit is contained in:
Thomas Haller
2022-01-02 20:43:09 +01:00
parent a5f7f89b04
commit 91653ea784
3 changed files with 249 additions and 139 deletions

View File

@@ -66,4 +66,8 @@ void _nm_team_settings_property_from_dbus_link_watchers(
GVariant *_nm_utils_ip4_dns_to_variant(const char *const *dns, gssize len);
GVariant *_nm_utils_ip6_dns_to_variant(const char *const *dns, gssize len);
/* Accessors for the encoding lookup; test_system_encodings() calls all
 * three. Each returns a "const char *const *" that the caller only reads. */
const char *const *nmtst_system_encodings_for_lang(const char *lang);
const char *const *nmtst_system_encodings_get_default(void);
const char *const *nmtst_system_encodings_get(void);
#endif

View File

@@ -328,169 +328,181 @@ good:
/*****************************************************************************/
struct IsoLangToEncodings {
const char *lang;
const char *const *encodings;
};
typedef const char *const StrvArray4Type[4];
#define LANG_ENCODINGS(l, ...) \
{ \
.lang = l, .encodings = NM_MAKE_STRV(__VA_ARGS__), \
#define LL(l, ...) \
{ \
.name = l, .value = {__VA_ARGS__, NULL}, \
}
/* 5-letter language codes */
static const struct IsoLangToEncodings isoLangEntries5[] = {
/* Simplified Chinese */
LANG_ENCODINGS("zh_cn", "euc-cn", "gb2312", "gb18030"), /* PRC */
LANG_ENCODINGS("zh_sg", "euc-cn", "gb2312", "gb18030"), /* Singapore */
/* Traditional Chinese */
LANG_ENCODINGS("zh_tw", "big5", "euc-tw"), /* Taiwan */
LANG_ENCODINGS("zh_hk", "big5", "euc-tw", "big5-hkcs"), /* Hong Kong */
LANG_ENCODINGS("zh_mo", "big5", "euc-tw"), /* Macau */
LANG_ENCODINGS(NULL, NULL)};
static _NM_UTILS_STRING_TABLE_LOOKUP_DEFINE(
_iso_lang_entries_5_lookup,
StrvArray4Type,
const char *const *,
{ nm_assert(name); },
{ return NULL; },
,
LL("zh_cn", "euc-cn", "gb2312", "gb18030"), /* Simplified Chinese, PRC */
LL("zh_hk", "big5", "euc-tw", "big5-hkcs"), /* Traditional Chinese, Hong Kong */
LL("zh_mo", "big5", "euc-tw"), /* Traditional Chinese, Macau */
LL("zh_sg", "euc-cn", "gb2312", "gb18030"), /* Simplified Chinese, Singapore */
LL("zh_tw", "big5", "euc-tw"), /* Traditional Chinese, Taiwan */
);
/* 2-letter language codes; we don't care about the other 3 in this table */
static const struct IsoLangToEncodings isoLangEntries2[] = {
/* Japanese */
LANG_ENCODINGS("ja", "euc-jp", "shift_jis", "iso-2022-jp"),
static _NM_UTILS_STRING_TABLE_LOOKUP_DEFINE(
_iso_lang_entries_2_lookup,
StrvArray4Type,
const char *const *,
{ nm_assert(name); },
{ return NULL; },
,
LL("ar", "iso-8859-6", "windows-1256"), /* Arabic */
LL("be", "koi8-r", "windows-1251", "iso-8859-5"), /* Cyrillic, Belorussian */
LL("bg", "windows-1251", "koi8-r", "iso-8859-5"), /* Cyrillic, Bulgarian */
LL("cs", "iso-8859-2", "windows-1250"), /* Central European, Czech */
LL("el", "iso-8859-7", "windows-1253"), /* Greek */
LL("et", "iso-8859-4", "windows-1257"), /* Baltic, Estonian */
LL("he", "iso-8859-8", "windows-1255"), /* Hebrew */
LL("hr", "iso-8859-2", "windows-1250"), /* Central European, Croatian */
LL("hu", "iso-8859-2", "windows-1250"), /* Central European, Hungarian */
LL("iw", "iso-8859-8", "windows-1255"), /* Hebrew */
LL("ja", "euc-jp", "shift_jis", "iso-2022-jp"), /* Japanese */
LL("ko", "euc-kr", "iso-2022-kr", "johab"), /* Korean */
LL("lt", "iso-8859-4", "windows-1257"), /* Baltic, Lithuanian */
LL("lv", "iso-8859-4", "windows-1257"), /* Baltic, Latvian */
LL("mk", "koi8-r", "windows-1251", "iso-8859-5"), /* Cyrillic, Macedonian */
LL("pl", "iso-8859-2", "windows-1250"), /* Central European, Polish */
LL("ro", "iso-8859-2", "windows-1250"), /* Central European, Romanian */
LL("ru", "koi8-r", "windows-1251", "iso-8859-5"), /* Cyrillic, Russian */
LL("sh", "iso-8859-2", "windows-1250"), /* Central European, Serbo-Croatian */
LL("sk", "iso-8859-2", "windows-1250"), /* Central European, Slovakian */
LL("sl", "iso-8859-2", "windows-1250"), /* Central European, Slovenian */
LL("sr", "koi8-r", "windows-1251", "iso-8859-5"), /* Cyrillic, Serbian */
LL("th", "iso-8859-11", "windows-874"), /* Thai */
LL("tr", "iso-8859-9", "windows-1254"), /* Turkish */
LL("uk", "koi8-u", "koi8-r", "windows-1251"), /* Cyrillic, Ukrainian */
);
/* Korean */
LANG_ENCODINGS("ko", "euc-kr", "iso-2022-kr", "johab"),
/* Thai */
LANG_ENCODINGS("th", "iso-8859-11", "windows-874"),
/* Central European */
LANG_ENCODINGS("hu", "iso-8859-2", "windows-1250"), /* Hungarian */
LANG_ENCODINGS("cs", "iso-8859-2", "windows-1250"), /* Czech */
LANG_ENCODINGS("hr", "iso-8859-2", "windows-1250"), /* Croatian */
LANG_ENCODINGS("pl", "iso-8859-2", "windows-1250"), /* Polish */
LANG_ENCODINGS("ro", "iso-8859-2", "windows-1250"), /* Romanian */
LANG_ENCODINGS("sk", "iso-8859-2", "windows-1250"), /* Slovakian */
LANG_ENCODINGS("sl", "iso-8859-2", "windows-1250"), /* Slovenian */
LANG_ENCODINGS("sh", "iso-8859-2", "windows-1250"), /* Serbo-Croatian */
/* Cyrillic */
LANG_ENCODINGS("ru", "koi8-r", "windows-1251", "iso-8859-5"), /* Russian */
LANG_ENCODINGS("be", "koi8-r", "windows-1251", "iso-8859-5"), /* Belorussian */
LANG_ENCODINGS("bg", "windows-1251", "koi8-r", "iso-8859-5"), /* Bulgarian */
LANG_ENCODINGS("mk", "koi8-r", "windows-1251", "iso-8859-5"), /* Macedonian */
LANG_ENCODINGS("sr", "koi8-r", "windows-1251", "iso-8859-5"), /* Serbian */
LANG_ENCODINGS("uk", "koi8-u", "koi8-r", "windows-1251"), /* Ukrainian */
/* Arabic */
LANG_ENCODINGS("ar", "iso-8859-6", "windows-1256"),
/* Baltic */
LANG_ENCODINGS("et", "iso-8859-4", "windows-1257"), /* Estonian */
LANG_ENCODINGS("lt", "iso-8859-4", "windows-1257"), /* Lithuanian */
LANG_ENCODINGS("lv", "iso-8859-4", "windows-1257"), /* Latvian */
/* Greek */
LANG_ENCODINGS("el", "iso-8859-7", "windows-1253"),
/* Hebrew */
LANG_ENCODINGS("he", "iso-8859-8", "windows-1255"),
LANG_ENCODINGS("iw", "iso-8859-8", "windows-1255"),
/* Turkish */
LANG_ENCODINGS("tr", "iso-8859-9", "windows-1254"),
/* Table end */
LANG_ENCODINGS(NULL, NULL)};
static GHashTable *langToEncodings5 = NULL;
static GHashTable *langToEncodings2 = NULL;
static void
init_lang_to_encodings_hash(void)
static const char *const *
_system_encodings_for_lang(const char *lang)
{
struct IsoLangToEncodings *enc;
char tmp_lang[3];
const char *const *e;
if (G_UNLIKELY(langToEncodings5 == NULL)) {
/* Five-letter codes */
enc = (struct IsoLangToEncodings *) &isoLangEntries5[0];
langToEncodings5 = g_hash_table_new(nm_str_hash, g_str_equal);
while (enc->lang) {
g_hash_table_insert(langToEncodings5, (gpointer) enc->lang, (gpointer) enc->encodings);
enc++;
}
nm_assert(lang);
if (lang[0] == '\0' || lang[1] == '\0') {
/* need at least two characters. */
nm_assert(!_iso_lang_entries_5_lookup(lang));
nm_assert(!_iso_lang_entries_2_lookup(lang));
return NULL;
}
if (G_UNLIKELY(langToEncodings2 == NULL)) {
/* Two-letter codes */
enc = (struct IsoLangToEncodings *) &isoLangEntries2[0];
langToEncodings2 = g_hash_table_new(nm_str_hash, g_str_equal);
while (enc->lang) {
g_hash_table_insert(langToEncodings2, (gpointer) enc->lang, (gpointer) enc->encodings);
enc++;
}
if (lang[2] != '\0') {
nm_assert(!_iso_lang_entries_2_lookup(lang));
if (lang[3] != '\0' && lang[4] != '\0' && lang[5] == '\0') {
/* lang is 5 characters long. Try it. */
if ((e = _iso_lang_entries_5_lookup(lang)))
return e;
} else
nm_assert(!_iso_lang_entries_5_lookup(lang));
/* extract the first 2 characters and ignore the rest. */
tmp_lang[0] = lang[0];
tmp_lang[1] = lang[1];
tmp_lang[2] = '\0';
lang = tmp_lang;
}
if ((e = _iso_lang_entries_2_lookup(lang)))
return e;
return NULL;
}
static gboolean
get_encodings_for_lang(const char *lang, const char *const **encodings)
const char *const *
nmtst_system_encodings_for_lang(const char *lang)
{
gs_free char *tmp_lang = NULL;
g_return_val_if_fail(lang, FALSE);
g_return_val_if_fail(encodings, FALSE);
init_lang_to_encodings_hash();
if ((*encodings = g_hash_table_lookup(langToEncodings5, lang)))
return TRUE;
/* Truncate tmp_lang to length of 2 */
if (strlen(lang) > 2) {
tmp_lang = g_strdup(lang);
tmp_lang[2] = '\0';
if ((*encodings = g_hash_table_lookup(langToEncodings2, tmp_lang)))
return TRUE;
}
return FALSE;
return _system_encodings_for_lang(lang);
}
static const char *const *
get_system_encodings(void)
_system_encodings_get_default(void)
{
static const char *const *cached_encodings;
static char *default_encodings[4];
const char *const *encodings = NULL;
char *lang;
static gsize init_once = 0;
static const char *default_encodings[4];
if (cached_encodings)
return cached_encodings;
if (g_once_init_enter(&init_once)) {
const char *e_default = NULL;
int i;
/* Use environment variables as encoding hint */
lang = getenv("LC_ALL");
if (!lang)
lang = getenv("LC_CTYPE");
if (!lang)
lang = getenv("LANG");
if (lang) {
char *dot;
g_get_charset(&e_default);
lang = g_ascii_strdown(lang, -1);
if ((dot = strchr(lang, '.')))
*dot = '\0';
i = 0;
if (e_default)
default_encodings[i++] = e_default;
if (!nm_streq0(e_default, "iso-8859-1"))
default_encodings[i++] = "iso-8859-1";
if (!nm_streq0(e_default, "windows-1251"))
default_encodings[i++] = "windows-1251";
default_encodings[i++] = NULL;
nm_assert(i <= G_N_ELEMENTS(default_encodings));
get_encodings_for_lang(lang, &encodings);
g_free(lang);
}
if (!encodings) {
g_get_charset((const char **) &default_encodings[0]);
default_encodings[1] = "iso-8859-1";
default_encodings[2] = "windows-1251";
default_encodings[3] = NULL;
encodings = (const char *const *) default_encodings;
g_once_init_leave(&init_once, 1);
}
cached_encodings = encodings;
return cached_encodings;
return default_encodings;
}
/* Non-static wrapper around the file-local _system_encodings_get_default(),
 * so that it can be called from other compilation units. Takes no arguments
 * and returns the callee's result unchanged. */
const char *const *
nmtst_system_encodings_get_default(void)
{
return _system_encodings_get_default();
}
static const char *const *
_system_encodings_get(void)
{
static const char *const *cached = NULL;
const char *const *e;
again:
if (!(e = g_atomic_pointer_get(&cached))) {
const char *lang;
/* Use environment variables as encoding hint */
lang = getenv("LC_ALL") ?: getenv("LC_CTYPE") ?: getenv("LANG");
if (lang) {
gs_free char *lang_down = NULL;
char *dot;
lang_down = g_ascii_strdown(lang, -1);
if ((dot = strchr(lang_down, '.')))
*dot = '\0';
e = _system_encodings_for_lang(lang_down);
}
if (!e)
e = _system_encodings_get_default();
/* in any case, @e is now a static buffer, that we may cache. */
nm_assert(e);
if (!g_atomic_pointer_compare_and_exchange(&cached, NULL, e))
goto again;
}
return e;
}
/* Non-static wrapper around the file-local _system_encodings_get(), so that
 * it can be called from other compilation units. Takes no arguments and
 * returns the callee's result unchanged. */
const char *const *
nmtst_system_encodings_get(void)
{
return _system_encodings_get();
}
/*****************************************************************************/
@@ -567,7 +579,7 @@ nm_utils_ssid_to_utf8(const guint8 *ssid, gsize len)
if (g_utf8_validate((const char *) ssid, len, NULL))
return g_strndup((const char *) ssid, len);
encodings = get_system_encodings();
encodings = _system_encodings_get();
for (e = encodings; *e; e++) {
converted = g_convert((const char *) ssid, len, "UTF-8", *e, NULL, NULL, NULL);

View File

@@ -10666,6 +10666,98 @@ test_vpn_connection_state_reason(void)
/*****************************************************************************/
/* Exercises the encoding lookup in three steps:
 *  1. nmtst_system_encodings_for_lang() is checked against a fixed list of
 *     language codes and their expected encodings;
 *  2. the default and the system encoding lists are asserted to be non-NULL;
 *  3. the lookup is fed N_RUN random strings. */
static void
test_system_encodings(void)
{
const int N_RUN = 10000;
int i_run;
/* The empty string, a 2-letter code without an expectation below ("zh") and
 * an unknown 5-letter code must all yield NULL. */
g_assert(nmtst_system_encodings_for_lang("") == NULL);
g_assert(nmtst_system_encodings_for_lang("zh") == NULL);
g_assert(nmtst_system_encodings_for_lang("zh_cx") == NULL);
/* LL() asserts that the lookup for @lang returns exactly the listed
 * encodings. For 2-character codes it repeats the check with an "x"
 * appended to the code and expects the same list. */
#define LL(lang, ...) \
G_STMT_START \
{ \
const char *const _lang = "" lang ""; \
\
nmtst_assert_strv(nmtst_system_encodings_for_lang(_lang), __VA_ARGS__); \
\
if (strlen(_lang) == 2) { \
gs_free char *_lang2 = g_strdup_printf("%s%s", _lang, "x"); \
\
nmtst_assert_strv(nmtst_system_encodings_for_lang(_lang2), __VA_ARGS__); \
} \
} \
G_STMT_END
/* 5-character codes */
LL("zh_cn", "euc-cn", "gb2312", "gb18030");
LL("zh_hk", "big5", "euc-tw", "big5-hkcs");
LL("zh_mo", "big5", "euc-tw");
LL("zh_sg", "euc-cn", "gb2312", "gb18030");
LL("zh_tw", "big5", "euc-tw");
/* 2-character codes */
LL("ar", "iso-8859-6", "windows-1256");
LL("be", "koi8-r", "windows-1251", "iso-8859-5");
LL("bg", "windows-1251", "koi8-r", "iso-8859-5");
LL("cs", "iso-8859-2", "windows-1250");
LL("el", "iso-8859-7", "windows-1253");
LL("et", "iso-8859-4", "windows-1257");
LL("he", "iso-8859-8", "windows-1255");
LL("hr", "iso-8859-2", "windows-1250");
LL("hu", "iso-8859-2", "windows-1250");
LL("iw", "iso-8859-8", "windows-1255");
LL("ja", "euc-jp", "shift_jis", "iso-2022-jp");
LL("ko", "euc-kr", "iso-2022-kr", "johab");
LL("lt", "iso-8859-4", "windows-1257");
LL("lv", "iso-8859-4", "windows-1257");
LL("mk", "koi8-r", "windows-1251", "iso-8859-5");
LL("pl", "iso-8859-2", "windows-1250");
LL("ro", "iso-8859-2", "windows-1250");
LL("ru", "koi8-r", "windows-1251", "iso-8859-5");
LL("sh", "iso-8859-2", "windows-1250");
LL("sk", "iso-8859-2", "windows-1250");
LL("sl", "iso-8859-2", "windows-1250");
LL("sr", "koi8-r", "windows-1251", "iso-8859-5");
LL("th", "iso-8859-11", "windows-874");
LL("tr", "iso-8859-9", "windows-1254");
LL("uk", "koi8-u", "koi8-r", "windows-1251");
g_assert(nmtst_system_encodings_get_default());
g_assert(nmtst_system_encodings_get());
/* Random inputs: the first third of the runs use 2-character strings, the
 * second third 5-character strings, and the remainder a random length in
 * the range 0 .. G_N_ELEMENTS(buf) - 1. */
for (i_run = 0; i_run < N_RUN; i_run++) {
char buf[7];
int n_buf;
int i_buf;
const char *const *e;
if (i_run < N_RUN / 3)
n_buf = 2;
else if (i_run < 2 * N_RUN / 3)
n_buf = 5;
else
n_buf = nmtst_get_rand_uint32() % G_N_ELEMENTS(buf);
for (i_buf = 0; i_buf < n_buf; i_buf++) {
/* re-draw until the character is not NUL, so that strlen(buf) == n_buf */
do {
buf[i_buf] = (char) nmtst_get_rand_uint32();
} while (buf[i_buf] == '\0');
}
g_assert(i_buf < G_N_ELEMENTS(buf));
buf[i_buf] = '\0';
g_assert_cmpint(n_buf, <, G_N_ELEMENTS(buf));
g_assert_cmpint(strlen(buf), ==, n_buf);
/* The only property checked for random input: a non-NULL result implies
 * that the string had at least 2 characters. */
e = nmtst_system_encodings_for_lang(buf);
if (e)
g_assert_cmpint(n_buf, >=, 2);
}
}
/*****************************************************************************/
NMTST_DEFINE();
int
@@ -11008,5 +11100,7 @@ main(int argc, char **argv)
g_test_add_func("/core/general/test_vpn_connection_state_reason",
test_vpn_connection_state_reason);
g_test_add_func("/core/general/test_system_encodings", test_system_encodings);
return g_test_run();
}