charsets: new method to do our best to convert from current charset to UTF-8

This method will try to convert the input string to UTF-8. The input string is supposed to be in the given charset; or otherwise is supposed to be the hex representation of the string in the given charset.
2012-02-07 17:50:49 +01:00
parent 9f6f80a63a
commit 00ce1d6874
2 changed files with 95 additions and 1 deletions
--- a/src/mm-charsets.c
+++ b/src/mm-charsets.c
@@ -18,6 +18,7 @@
 #include <stdlib.h>
 #include <unistd.h>
 #include <string.h>
 #include <ctype.h>
 #include "mm-charsets.h"
 #include "mm-utils.h"
@@ -703,3 +704,94 @@ gsm_pack (const guint8 *src,
    return packed;
 }
 /* We do all our best to get the given string, which is possibly given in the
 * specified charset, to UTF8. It may happen that the given string is really
 * the hex representation of the charset-encoded string, so we need to cope with
 * that case. */
 gchar *
 mm_charset_take_and_convert_to_utf8 (gchar *str,
                                     MMModemCharset charset)
 {
    gchar *utf8 = NULL;
    switch (charset) {
    case MM_MODEM_CHARSET_UNKNOWN:
        g_warn_if_reached ();
        utf8 = str;
        break;
    case MM_MODEM_CHARSET_HEX:
        /* We'll assume that the HEX string is really valid ASCII at the end */
        utf8 = str;
        break;
    case MM_MODEM_CHARSET_GSM:
    case MM_MODEM_CHARSET_8859_1:
    case MM_MODEM_CHARSET_PCCP437:
    case MM_MODEM_CHARSET_PCDN: {
        const gchar *iconv_from;
        GError *error = NULL;
        iconv_from = charset_iconv_from (charset);
        utf8 = g_convert (str, strlen (str),
                          "UTF-8//TRANSLIT", iconv_from,
                          NULL, NULL, &error);
        if (!utf8 || error) {
            g_clear_error (&error);
            utf8 = NULL;
        }
        g_free (str);
        break;
    }
    case MM_MODEM_CHARSET_UCS2: {
        gsize len;
        gboolean possibly_hex = TRUE;
        /* If the string comes in hex-UCS-2, len needs to be a multiple of 4 */
        len = strlen (str);
        if ((len < 4) || ((len % 4) != 0))
            possibly_hex = FALSE;
        else {
            const gchar *p = str;
            /* All chars in the string must be hex */
            while (*p && possibly_hex)
                possibly_hex = isxdigit (*p++);
        }
        /* If we get UCS-2, we expect the HEX representation of the string */
        if (possibly_hex) {
            utf8 = mm_modem_charset_hex_to_utf8 (str, charset);
            if (!utf8) {
                /* If we couldn't convert the string as HEX-UCS-2, try to see if
                 * the string is valid UTF-8 itself. */
                utf8 = str;
            } else
                g_free (str);
        } else
            /* If we already know it's not hex, try to use the string as it is */
            utf8 = str;
        break;
    }
    /* If the given charset is ASCII or UTF8, we really expect the final string
     * already here */
    case MM_MODEM_CHARSET_IRA:
    case MM_MODEM_CHARSET_UTF8:
        utf8 = str;
        break;
    }
    /* Validate UTF-8 always before returning. This result will be exposed in DBus
     * very likely... */
    if (!g_utf8_validate (utf8, -1, NULL)) {
        /* Better return NULL than an invalid UTF-8 string */
        g_free (utf8);
        utf8 = NULL;
    }
    return utf8;
 }
--- a/src/mm-charsets.h
+++ b/src/mm-charsets.h
@@ -72,5 +72,7 @@ guint8 *gsm_pack (const guint8 *src,
                  guint8 start_offset,  /* in bits */
                  guint32 *out_packed_len);
-#endif /* MM_CHARSETS_H */
+gchar *mm_charset_take_and_convert_to_utf8 (gchar *str,
                                            MMModemCharset charset);
 #endif /* MM_CHARSETS_H */