shared: add nm_utils_str_utf8safe_*() API to sanitize UTF-8 strings

Use C-style backslash escaping to sanitize non-UTF-8 strings. The functions are compatible with glib's g_strcompress() and g_strescape(). The only difference is that g_strescape() also escapes all non-printable and non-ASCII characters, while nm_utils_str_utf8safe_escape() -- depending on the flags -- preserves valid UTF-8 sequences, except for the backslash. The flags optionally allow escaping ASCII control characters and all non-ASCII (valid UTF-8) characters. But the option to preserve valid UTF-8 (non-ASCII) characters verbatim is what distinguishes it from g_strescape().
2017-05-16 18:50:21 +02:00
parent e216d5eac0
commit df6d27b33a
3 changed files with 249 additions and 0 deletions
--- a/libnm-core/tests/test-general.c
+++ b/libnm-core/tests/test-general.c
@@ -5323,6 +5323,100 @@ static void test_nm_utils_enum (void)

 /*****************************************************************************/

+/* Escapes @str with @flags through every escape variant, compares the
+ * result against @expected and checks that every unescape variant restores
+ * the original string. An @expected of NULL means that @str must be
+ * returned as-is, without any escaping. */
+static void
+do_test_utils_str_utf8safe (const char *str, const char *expected, NMUtilsStrUtf8SafeFlags flags)
+{
+	gs_free char *escaped_owned = NULL;
+	gs_free char *tmp = NULL;
+	const char *escaped;
+	const char *unescaped;

+	escaped = nm_utils_str_utf8safe_escape (str, flags, &escaped_owned);

+	/* the copying variant must yield the same text, but never the input pointer. */
+	tmp = nm_utils_str_utf8safe_escape_cp (str, flags);
+	g_assert_cmpstr (tmp, ==, escaped);
+	g_assert ((!str && !tmp) || (str != tmp));
+	g_clear_pointer (&tmp, g_free);

+	if (expected != NULL) {
+		/* escaping was expected: the result must be a fresh allocation. */
+		g_assert (str);
+		g_assert (escaped != str);
+		g_assert (escaped == escaped_owned);

+		/* there must be an actual reason why @str needed escaping. */
+		g_assert (   strchr (str, '\\')
+		          || !g_utf8_validate (str, -1, NULL)
+		          || (   NM_FLAGS_HAS (flags, NM_UTILS_STR_UTF8_SAFE_FLAG_ESCAPE_NON_ASCII)
+		              && NM_STRCHAR_ANY (str, ch, (guchar) ch >= 127))
+		          || (   NM_FLAGS_HAS (flags, NM_UTILS_STR_UTF8_SAFE_FLAG_ESCAPE_CTRL)
+		              && NM_STRCHAR_ANY (str, ch, (guchar) ch < ' ')));
+		g_assert (g_utf8_validate (escaped, -1, NULL));

+		/* plain g_strcompress() must be able to revert the escaping. */
+		tmp = g_strcompress (escaped);
+		g_assert_cmpstr (str, ==, tmp);
+		g_clear_pointer (&tmp, g_free);

+		tmp = nm_utils_str_utf8safe_unescape_cp (escaped);
+		g_assert (tmp != str);
+		g_assert_cmpstr (tmp, ==, str);
+		g_clear_pointer (&tmp, g_free);

+		unescaped = nm_utils_str_utf8safe_unescape (escaped, &tmp);
+		g_assert (tmp != str);
+		g_assert (unescaped == tmp);
+		g_assert_cmpstr (tmp, ==, str);
+		g_clear_pointer (&tmp, g_free);

+		g_assert_cmpstr (escaped, ==, expected);
+		return;
+	}

+	/* no escaping expected: the input pointer itself must come back. */
+	g_assert (escaped == str);
+	g_assert (!escaped_owned);
+	if (str) {
+		g_assert (!strchr (str, '\\'));
+		g_assert (g_utf8_validate (str, -1, NULL));
+	}

+	g_assert (str == nm_utils_str_utf8safe_unescape (escaped, &tmp));
+	g_assert (!tmp);

+	tmp = nm_utils_str_utf8safe_unescape_cp (escaped);
+	if (!str)
+		g_assert (!tmp);
+	else {
+		g_assert (tmp != str);
+		g_assert_cmpstr (tmp, ==, str);
+	}
+	g_clear_pointer (&tmp, g_free);
+}
+
+/* Runs do_test_utils_str_utf8safe() over a table of inputs. An expected
+ * value of NULL means that the input must pass through unescaped. */
+static void
+test_utils_str_utf8safe (void)
+{
+	static const struct {
+		const char *str;
+		const char *expected;
+		NMUtilsStrUtf8SafeFlags flags;
+	} test_data[] = {
+		{ NULL, NULL, NM_UTILS_STR_UTF8_SAFE_FLAG_NONE },
+		{ "", NULL, NM_UTILS_STR_UTF8_SAFE_FLAG_NONE },
+		{ "\314", "\\314", NM_UTILS_STR_UTF8_SAFE_FLAG_NONE },
+		{ "\314\315x\315\315x", "\\314\\315x\\315\\315x", NM_UTILS_STR_UTF8_SAFE_FLAG_NONE },
+		{ "\314\315xx", "\\314\\315xx", NM_UTILS_STR_UTF8_SAFE_FLAG_NONE },
+		{ "\314xx", "\\314xx", NM_UTILS_STR_UTF8_SAFE_FLAG_NONE },
+		{ "\xa0", "\\240", NM_UTILS_STR_UTF8_SAFE_FLAG_NONE },
+		{ "\xe2\x91\xa0", NULL, NM_UTILS_STR_UTF8_SAFE_FLAG_NONE },
+		{ "\xe2\xe2\x91\xa0", "\\342\xe2\x91\xa0", NM_UTILS_STR_UTF8_SAFE_FLAG_NONE },
+		{ "\xe2\xe2\x91\xa0\xa0", "\\342\xe2\x91\xa0\\240", NM_UTILS_STR_UTF8_SAFE_FLAG_NONE },
+		{ "a", NULL, NM_UTILS_STR_UTF8_SAFE_FLAG_NONE },
+		{ "ab", NULL, NM_UTILS_STR_UTF8_SAFE_FLAG_NONE },
+		{ "ab\314", "ab\\314", NM_UTILS_STR_UTF8_SAFE_FLAG_NONE },
+		{ "ab\314adsf", "ab\\314adsf", NM_UTILS_STR_UTF8_SAFE_FLAG_NONE },
+		{ "abadsf", NULL, NM_UTILS_STR_UTF8_SAFE_FLAG_NONE },
+		{ "abäb", NULL, NM_UTILS_STR_UTF8_SAFE_FLAG_NONE },
+		{ "x\xa0", "x\\240", NM_UTILS_STR_UTF8_SAFE_FLAG_NONE },
+		{ "Ä\304ab\\äb", "Ä\\304ab\\\\äb", NM_UTILS_STR_UTF8_SAFE_FLAG_NONE },
+		{ "Äab\\äb", "Äab\\\\äb", NM_UTILS_STR_UTF8_SAFE_FLAG_NONE },
+		{ "ÄÄab\\äb", "ÄÄab\\\\äb", NM_UTILS_STR_UTF8_SAFE_FLAG_NONE },
+		{ "㈞abä㈞b", NULL, NM_UTILS_STR_UTF8_SAFE_FLAG_NONE },
+		{ "abäb", "ab\\303\\244b", NM_UTILS_STR_UTF8_SAFE_FLAG_ESCAPE_NON_ASCII },
+		{ "ab\ab", "ab\\007b", NM_UTILS_STR_UTF8_SAFE_FLAG_ESCAPE_CTRL },
+	};
+	guint i;

+	for (i = 0; i < G_N_ELEMENTS (test_data); i++) {
+		do_test_utils_str_utf8safe (test_data[i].str,
+		                            test_data[i].expected,
+		                            test_data[i].flags);
+	}
+}
+
+/*****************************************************************************/
+
 static int
 _test_nm_in_set_get (int *call_counter, gboolean allow_called, int value)
 {
@@ -5680,6 +5774,7 @@ int main (int argc, char **argv)
 	nmtst_init (&argc, &argv, TRUE);

 	/* The tests */
+	g_test_add_func ("/core/general/test_utils_str_utf8safe", test_utils_str_utf8safe);
 	g_test_add_func ("/core/general/test_nm_in_set", test_nm_in_set);
 	g_test_add_func ("/core/general/test_nm_in_strset", test_nm_in_strset);
 	g_test_add_func ("/core/general/test_setting_vpn_items", test_setting_vpn_items);
--- a/shared/nm-utils/nm-shared-utils.c
+++ b/shared/nm-utils/nm-shared-utils.c
@@ -397,3 +397,141 @@ nm_g_object_set_property (GObject *object,
 }

 /*****************************************************************************/
+
+/* Appends the byte @ch to @s as a backslash followed by exactly three
+ * octal digits (most significant digit first). */
+static void
+_str_append_escape (GString *s, char ch)
+{
+	const guchar byte = (guchar) ch;
+	int shift;

+	g_string_append_c (s, '\\');
+	for (shift = 6; shift >= 0; shift -= 3)
+		g_string_append_c (s, '0' + ((byte >> shift) & 07));
+}
+
+/**
+ * nm_utils_str_utf8safe_escape:
+ * @str: NUL terminated input string, possibly in UTF-8 encoding.
+ *   May be %NULL.
+ * @flags: #NMUtilsStrUtf8SafeFlags flags
+ * @to_free: (out): set to the newly allocated result if escaping was
+ *   necessary, otherwise set to %NULL. Must not be %NULL itself.
+ *
+ * Sanitizes the possibly non-UTF-8, NUL terminated string @str using
+ * backslash escaping (C escaping, like g_strescape()). The result is
+ * valid UTF-8.
+ *
+ * The operation can be reverted with g_strcompress() or
+ * nm_utils_str_utf8safe_unescape().
+ *
+ * The backslash and every byte that is not part of a valid UTF-8
+ * sequence are always escaped. Other characters are passed on as-is
+ * unless @flags requests more: with
+ * %NM_UTILS_STR_UTF8_SAFE_FLAG_ESCAPE_CTRL, bytes below ' ' are escaped
+ * too, and with %NM_UTILS_STR_UTF8_SAFE_FLAG_ESCAPE_NON_ASCII, all bytes
+ * >= 127 (which includes DEL) are escaped too. This is the difference to
+ * g_strescape(), which always escapes all non-ASCII characters: here
+ * valid UTF-8 can be preserved and directly shown to the user.
+ *
+ * Returns: the escaped input string, as valid UTF-8. If no escaping
+ *   is necessary (which includes %NULL and the empty string), it returns
+ *   the input @str. Otherwise, an allocated string @to_free is returned
+ *   which must be freed by the caller with g_free().
+ **/
+const char *
+nm_utils_str_utf8safe_escape (const char *str, NMUtilsStrUtf8SafeFlags flags, char **to_free)
+{
+	const char *p = NULL;
+	GString *s;

+	g_return_val_if_fail (to_free, NULL);

+	*to_free = NULL;
+	if (!str || !str[0])
+		return str;

+	/* Fast path: valid UTF-8 that contains nothing to escape is returned
+	 * as-is. The bytes are compared as guchar, because with a signed
+	 * plain char every byte >= 0x80 is negative and would wrongly count
+	 * as a control character below ' '. */
+	if (   g_utf8_validate (str, -1, &p)
+	    && !NM_STRCHAR_ANY (str, ch,
+	                        (   ch == '\\'
+	                         || (   NM_FLAGS_HAS (flags, NM_UTILS_STR_UTF8_SAFE_FLAG_ESCAPE_CTRL)
+	                             && ((guchar) ch) < ' ')
+	                         || (   NM_FLAGS_HAS (flags, NM_UTILS_STR_UTF8_SAFE_FLAG_ESCAPE_NON_ASCII)
+	                             && ((guchar) ch) >= 127))))
+		return str;

+	/* @p points to the end of the valid prefix, so this is the length
+	 * of the entire input plus some room for escape sequences. */
+	s = g_string_sized_new ((p - str) + strlen (p) + 5);

+	do {
+		/* [str, p) is valid UTF-8; only escape there what must be escaped. */
+		for (; str < p; str++) {
+			char ch = str[0];

+			if (ch == '\\')
+				g_string_append (s, "\\\\");
+			else if (   (   NM_FLAGS_HAS (flags, NM_UTILS_STR_UTF8_SAFE_FLAG_ESCAPE_CTRL)
+			             && ((guchar) ch) < ' ')
+			         || (   NM_FLAGS_HAS (flags, NM_UTILS_STR_UTF8_SAFE_FLAG_ESCAPE_NON_ASCII)
+			             && ((guchar) ch) >= 127))
+				_str_append_escape (s, ch);
+			else
+				g_string_append_c (s, ch);
+		}

+		if (p[0] == '\0')
+			break;

+		/* @p is at a byte that is not valid UTF-8: escape that single
+		 * byte and continue validating right after it. */
+		_str_append_escape (s, p[0]);

+		str = &p[1];
+		g_utf8_validate (str, -1, &p);
+	} while (TRUE);

+	*to_free = g_string_free (s, FALSE);
+	return *to_free;
+}
+
+/**
+ * nm_utils_str_utf8safe_unescape:
+ * @str: NUL terminated, backslash escaped input string. May be %NULL.
+ * @to_free: (out): set to the newly allocated result if unescaping was
+ *   necessary, otherwise set to %NULL. Must not be %NULL itself.
+ *
+ * Reverts backslash escaping with g_strcompress(), but only allocates
+ * a new string if @str contains a backslash at all.
+ *
+ * Returns: @str itself if it is %NULL or contains no backslash.
+ *   Otherwise the unescaped copy, which is also returned in @to_free
+ *   and must be freed by the caller with g_free().
+ **/
+const char *
+nm_utils_str_utf8safe_unescape (const char *str, char **to_free)
+{
+	g_return_val_if_fail (to_free, NULL);

+	if (str && strchr (str, '\\')) {
+		*to_free = g_strcompress (str);
+		return *to_free;
+	}

+	/* nothing to unescape, hand back the input without copying. */
+	*to_free = NULL;
+	return str;
+}
+
+/**
+ * nm_utils_str_utf8safe_escape_cp:
+ * @str: NUL terminated input string, possibly in UTF-8 encoding.
+ *   May be %NULL.
+ * @flags: #NMUtilsStrUtf8SafeFlags flags
+ *
+ * Like nm_utils_str_utf8safe_escape(), except that the result is never
+ * the input pointer itself: if no escaping was necessary, a duplicate
+ * of @str is returned.
+ *
+ * Returns: the escaped input string in UTF-8 encoding, or %NULL if
+ *   @str is %NULL. The returned value should be freed with g_free().
+ *   The escaping can be reverted by g_strcompress().
+ **/
+char *
+nm_utils_str_utf8safe_escape_cp (const char *str, NMUtilsStrUtf8SafeFlags flags)
+{
+	char *escaped;

+	nm_utils_str_utf8safe_escape (str, flags, &escaped);
+	if (escaped)
+		return escaped;

+	/* nothing was escaped; g_strdup() passes a NULL input through. */
+	return g_strdup (str);
+}
+
+/**
+ * nm_utils_str_utf8safe_unescape_cp:
+ * @str: NUL terminated, backslash escaped input string. May be %NULL.
+ *
+ * Returns: a copy of @str with the backslash escaping reverted by
+ *   g_strcompress(), or %NULL if @str is %NULL. The returned value
+ *   should be freed with g_free().
+ **/
+char *
+nm_utils_str_utf8safe_unescape_cp (const char *str)
+{
+	if (!str)
+		return NULL;
+	return g_strcompress (str);
+}
+
+/**
+ * nm_utils_str_utf8safe_escape_take:
+ * @str: allocated, NUL terminated input string, possibly in UTF-8
+ *   encoding. May be %NULL.
+ * @flags: #NMUtilsStrUtf8SafeFlags flags
+ *
+ * Like nm_utils_str_utf8safe_escape(), but takes ownership of @str:
+ * if escaping was necessary, @str is freed with g_free() and the
+ * escaped string is returned in its place.
+ *
+ * Returns: @str itself if no escaping was necessary, otherwise the
+ *   newly allocated escaped string.
+ **/
+char *
+nm_utils_str_utf8safe_escape_take (char *str, NMUtilsStrUtf8SafeFlags flags)
+{
+	char *escaped;

+	nm_utils_str_utf8safe_escape (str, flags, &escaped);
+	if (!escaped)
+		return str;

+	/* the escaped copy replaces the input, which is no longer needed. */
+	g_free (str);
+	return escaped;
+}
--- a/shared/nm-utils/nm-shared-utils.h
+++ b/shared/nm-utils/nm-shared-utils.h
@@ -97,4 +97,20 @@ gboolean nm_g_object_set_property (GObject *object,

 /*****************************************************************************/

+typedef enum {
+	NM_UTILS_STR_UTF8_SAFE_FLAG_NONE                = 0,      /* escape only backslashes and bytes that are not valid UTF-8 */
+	NM_UTILS_STR_UTF8_SAFE_FLAG_ESCAPE_CTRL         = 0x0001, /* additionally escape ASCII control characters (below ' ') */
+	NM_UTILS_STR_UTF8_SAFE_FLAG_ESCAPE_NON_ASCII    = 0x0002, /* additionally escape all bytes >= 127, even in valid UTF-8 */
+} NMUtilsStrUtf8SafeFlags;
+
+const char *nm_utils_str_utf8safe_escape   (const char *str, NMUtilsStrUtf8SafeFlags flags, char **to_free);
+const char *nm_utils_str_utf8safe_unescape (const char *str, char **to_free);
+
+char *nm_utils_str_utf8safe_escape_cp   (const char *str, NMUtilsStrUtf8SafeFlags flags);
+char *nm_utils_str_utf8safe_unescape_cp (const char *str);
+
+char *nm_utils_str_utf8safe_escape_take (char *str, NMUtilsStrUtf8SafeFlags flags);
+
+/*****************************************************************************/
+
 #endif /* __NM_SHARED_UTILS_H__ */