charsets: add UTF-16BE as a possible modem charset

This is just an implementation detail, to be treated as an extension of UCS2BE; it is never really meant to be used as a real modem charset.
2020-08-20 10:58:24 +02:00
parent 93686510d7
commit eb5443b197
3 changed files with 26 additions and 11 deletions
--- a/src/mm-charsets.c
+++ b/src/mm-charsets.c
@@ -43,6 +43,7 @@ static CharsetEntry charset_map[] = {
    { "PCCP437", "CP437",  "CP437",     "CP437//TRANSLIT",     MM_MODEM_CHARSET_PCCP437 },
    { "PCDN",    "CP850",  "CP850",     "CP850//TRANSLIT",     MM_MODEM_CHARSET_PCDN },
    { "HEX",     NULL,     NULL,        NULL,                  MM_MODEM_CHARSET_HEX },
+    { "UTF-16",  "UTF16",  "UTF-16BE",  "UTF-16BE//TRANSLIT",  MM_MODEM_CHARSET_UTF16 },
    { NULL,      NULL,     NULL,        NULL,                  MM_MODEM_CHARSET_UNKNOWN }
 };

@@ -535,6 +536,14 @@ ucs2_is_subset (gunichar c, const char *utf8, gsize ulen)
    return (c <= 0xFFFF);
 }

+static gboolean
+utf16_is_subset (gunichar     c,
+                 const gchar *utf8,
+                 gsize        ulen)
+{
+    return TRUE; /* Subset check for UTF-16: accepts every character unconditionally; c, utf8 and ulen are not inspected (unlike ucs2_is_subset(), which requires c <= 0xFFFF). */
+}
+
 static gboolean
 iso88591_is_subset (gunichar c, const char *utf8, gsize ulen)
 {
@@ -613,6 +622,7 @@ SubsetEntry subset_table[] = {
    { MM_MODEM_CHARSET_GSM,     gsm_is_subset },
    { MM_MODEM_CHARSET_IRA,     ira_is_subset },
    { MM_MODEM_CHARSET_UCS2,    ucs2_is_subset },
+    { MM_MODEM_CHARSET_UTF16,   utf16_is_subset },
    { MM_MODEM_CHARSET_8859_1,  iso88591_is_subset },
    { MM_MODEM_CHARSET_PCCP437, pccp437_is_subset },
    { MM_MODEM_CHARSET_PCDN,    pcdn_is_subset },
@@ -786,7 +796,8 @@ mm_charset_take_and_convert_to_utf8 (gchar *str, MMModemCharset charset)
        break;
    }

-    case MM_MODEM_CHARSET_UCS2: {
+    case MM_MODEM_CHARSET_UCS2:
+    case MM_MODEM_CHARSET_UTF16: {
        gsize len;
        gboolean possibly_hex = TRUE;
        gsize bread = 0, bwritten = 0;
@@ -914,7 +925,8 @@ mm_utf8_take_and_convert_to_charset (gchar *str,
        break;
    }

-    case MM_MODEM_CHARSET_UCS2: {
+    case MM_MODEM_CHARSET_UCS2:
+    case MM_MODEM_CHARSET_UTF16: {
        const gchar *iconv_to;
        gsize encoded_len = 0;
        GError *error = NULL;
--- a/src/mm-charsets.h
+++ b/src/mm-charsets.h
@@ -27,7 +27,8 @@ typedef enum {
    MM_MODEM_CHARSET_UCS2    = 0x00000010,
    MM_MODEM_CHARSET_PCCP437 = 0x00000020,
    MM_MODEM_CHARSET_PCDN    = 0x00000040,
-    MM_MODEM_CHARSET_HEX     = 0x00000080
+    MM_MODEM_CHARSET_HEX     = 0x00000080,
+    MM_MODEM_CHARSET_UTF16   = 0x00000100,
 } MMModemCharset;

 const char *mm_modem_charset_to_string (MMModemCharset charset);
--- a/src/tests/test-charsets.c
+++ b/src/tests/test-charsets.c
@@ -369,6 +369,7 @@ struct charset_can_convert_to_test_s {
    gboolean    to_ira;
    gboolean    to_8859_1;
    gboolean    to_ucs2;
+    gboolean    to_utf16;
    gboolean    to_pccp437;
    gboolean    to_pcdn;
 };
@@ -379,35 +380,35 @@ test_charset_can_covert_to (void)
    static const struct charset_can_convert_to_test_s charset_can_convert_to_test[] = {
        {
            .utf8 = "",
-            .to_gsm = TRUE, .to_ira = TRUE, .to_8859_1 = TRUE, .to_ucs2 = TRUE, .to_pccp437 = TRUE, .to_pcdn = TRUE,
+            .to_gsm = TRUE, .to_ira = TRUE, .to_8859_1 = TRUE, .to_ucs2 = TRUE, .to_utf16 = TRUE, .to_pccp437 = TRUE, .to_pcdn = TRUE,
        },
        {
            .utf8 = " ",
-            .to_gsm = TRUE, .to_ira = TRUE, .to_8859_1 = TRUE, .to_ucs2 = TRUE, .to_pccp437 = TRUE, .to_pcdn = TRUE,
+            .to_gsm = TRUE, .to_ira = TRUE, .to_8859_1 = TRUE, .to_ucs2 = TRUE, .to_utf16 = TRUE, .to_pccp437 = TRUE, .to_pcdn = TRUE,
        },
        {
            .utf8 = "some basic ascii",
-            .to_gsm = TRUE, .to_ira = TRUE, .to_8859_1 = TRUE, .to_ucs2 = TRUE, .to_pccp437 = TRUE, .to_pcdn = TRUE,
+            .to_gsm = TRUE, .to_ira = TRUE, .to_8859_1 = TRUE, .to_ucs2 = TRUE, .to_utf16 = TRUE, .to_pccp437 = TRUE, .to_pcdn = TRUE,
        },
        {
            .utf8 = "ホモ・サピエンス 喂人类 katakana, chinese, english: UCS2 takes it all",
-            .to_gsm = FALSE, .to_ira = FALSE, .to_8859_1 = FALSE, .to_ucs2 = TRUE, .to_pccp437 = FALSE, .to_pcdn = FALSE,
+            .to_gsm = FALSE, .to_ira = FALSE, .to_8859_1 = FALSE, .to_ucs2 = TRUE, .to_utf16 = TRUE, .to_pccp437 = FALSE, .to_pcdn = FALSE,
        },
        {
            .utf8 = "Some from the GSM7 basic set: a % Ψ Ω ñ ö è æ",
-            .to_gsm = TRUE, .to_ira = FALSE, .to_8859_1 = FALSE, .to_ucs2 = TRUE, .to_pccp437 = FALSE, .to_pcdn = FALSE,
+            .to_gsm = TRUE, .to_ira = FALSE, .to_8859_1 = FALSE, .to_ucs2 = TRUE, .to_utf16 = TRUE, .to_pccp437 = FALSE, .to_pcdn = FALSE,
        },
        {
            .utf8 = "More from the GSM7 extended set: {} [] ~ € |",
-            .to_gsm = TRUE, .to_ira = FALSE, .to_8859_1 = FALSE, .to_ucs2 = TRUE, .to_pccp437 = FALSE, .to_pcdn = FALSE,
+            .to_gsm = TRUE, .to_ira = FALSE, .to_8859_1 = FALSE, .to_ucs2 = TRUE, .to_utf16 = TRUE, .to_pccp437 = FALSE, .to_pcdn = FALSE,
        },
        {
            .utf8 = "patín cannot be encoded in GSM7 or IRA, but is valid UCS2, ISO-8859-1, CP437 and CP850",
-            .to_gsm = FALSE, .to_ira = FALSE, .to_8859_1 = TRUE, .to_ucs2 = TRUE, .to_pccp437 = TRUE, .to_pcdn = TRUE,
+            .to_gsm = FALSE, .to_ira = FALSE, .to_8859_1 = TRUE, .to_ucs2 = TRUE, .to_utf16 = TRUE, .to_pccp437 = TRUE, .to_pcdn = TRUE,
        },
        {
            .utf8 = "ècole can be encoded in multiple ways, but not in IRA",
-            .to_gsm = TRUE, .to_ira = FALSE, .to_8859_1 = TRUE, .to_ucs2 = TRUE, .to_pccp437 = TRUE, .to_pcdn = TRUE,
+            .to_gsm = TRUE, .to_ira = FALSE, .to_8859_1 = TRUE, .to_ucs2 = TRUE, .to_utf16 = TRUE, .to_pccp437 = TRUE, .to_pcdn = TRUE,
        },
    };
    guint i;
@@ -418,6 +419,7 @@ test_charset_can_covert_to (void)
        g_assert (mm_charset_can_convert_to (charset_can_convert_to_test[i].utf8, MM_MODEM_CHARSET_IRA)     == charset_can_convert_to_test[i].to_ira);
        g_assert (mm_charset_can_convert_to (charset_can_convert_to_test[i].utf8, MM_MODEM_CHARSET_8859_1)  == charset_can_convert_to_test[i].to_8859_1);
        g_assert (mm_charset_can_convert_to (charset_can_convert_to_test[i].utf8, MM_MODEM_CHARSET_UCS2)    == charset_can_convert_to_test[i].to_ucs2);
+        g_assert (mm_charset_can_convert_to (charset_can_convert_to_test[i].utf8, MM_MODEM_CHARSET_UTF16)   == charset_can_convert_to_test[i].to_utf16);
        g_assert (mm_charset_can_convert_to (charset_can_convert_to_test[i].utf8, MM_MODEM_CHARSET_PCCP437) == charset_can_convert_to_test[i].to_pccp437);
        g_assert (mm_charset_can_convert_to (charset_can_convert_to_test[i].utf8, MM_MODEM_CHARSET_PCDN)    == charset_can_convert_to_test[i].to_pcdn);
    }