charsets: make translit optional in gsm_unpacked_to_utf8()

Until now, this method would automatically apply transliteration;
i.e. replacing characters with '?' when no direct translation was
available.

We can attempt to do that transliteration on strings that are not
critical, e.g. the operator name reported by the network. But we
should not do that on other types of strings, e.g. on SMS contents,
which may serve purposes other than just being human-readable.

This commit makes transliteration an option that must be explicitly
requested by the caller.
This commit is contained in:
Aleksander Morgado
2020-11-26 23:07:11 +01:00
parent 5480cb67b2
commit 5ce97abd73
7 changed files with 95 additions and 64 deletions

View File

@@ -2343,7 +2343,7 @@ decode (MMIfaceModem3gppUssd *self,
/* if the last character in a 7-byte block is padding, then drop it */ /* if the last character in a 7-byte block is padding, then drop it */
if ((bin_len % 7 == 0) && (unpacked[unpacked_len - 1] == 0x0d)) if ((bin_len % 7 == 0) && (unpacked[unpacked_len - 1] == 0x0d))
unpacked_len--; unpacked_len--;
return (gchar*) mm_charset_gsm_unpacked_to_utf8 (unpacked, unpacked_len); return (gchar *) mm_charset_gsm_unpacked_to_utf8 (unpacked, unpacked_len, FALSE, error);
} }
/*****************************************************************************/ /*****************************************************************************/

View File

@@ -1427,7 +1427,7 @@ parse_spn (const gchar *response,
buflen--; buflen--;
/* First byte is metadata; remainder is GSM-7 unpacked into octets; convert to UTF8 */ /* First byte is metadata; remainder is GSM-7 unpacked into octets; convert to UTF8 */
return (gchar *)mm_charset_gsm_unpacked_to_utf8 (bin + 1, buflen - 1); return (gchar *)mm_charset_gsm_unpacked_to_utf8 (bin + 1, buflen - 1, FALSE, error);
} }
g_set_error (error, MM_CORE_ERROR, MM_CORE_ERROR_FAILED, g_set_error (error, MM_CORE_ERROR, MM_CORE_ERROR_FAILED,

View File

@@ -4820,11 +4820,9 @@ ussd_decode (guint32 scheme,
guint32 unpacked_len; guint32 unpacked_len;
unpacked = mm_charset_gsm_unpack ((const guint8 *)data->data, (data->len * 8) / 7, 0, &unpacked_len); unpacked = mm_charset_gsm_unpack ((const guint8 *)data->data, (data->len * 8) / 7, 0, &unpacked_len);
decoded = (gchar *) mm_charset_gsm_unpacked_to_utf8 (unpacked, unpacked_len); decoded = (gchar *) mm_charset_gsm_unpacked_to_utf8 (unpacked, unpacked_len, FALSE, error);
if (!decoded) if (!decoded)
g_set_error (error, MM_CORE_ERROR, MM_CORE_ERROR_UNSUPPORTED, g_prefix_error (error, "Error decoding USSD command in 0x%04x scheme (GSM7 charset): ", scheme);
"Error decoding USSD command in 0x%04x scheme (GSM7 charset)",
scheme);
} else if (scheme == MM_MODEM_GSM_USSD_SCHEME_UCS2) { } else if (scheme == MM_MODEM_GSM_USSD_SCHEME_UCS2) {
decoded = mm_modem_charset_byte_array_to_utf8 (data, MM_MODEM_CHARSET_UCS2); decoded = mm_modem_charset_byte_array_to_utf8 (data, MM_MODEM_CHARSET_UCS2);
if (!decoded) if (!decoded)

View File

@@ -360,11 +360,13 @@ utf8_to_gsm_ext_char (const gchar *utf8,
} }
guint8 * guint8 *
mm_charset_gsm_unpacked_to_utf8 (const guint8 *gsm, mm_charset_gsm_unpacked_to_utf8 (const guint8 *gsm,
guint32 len) guint32 len,
gboolean translit,
GError **error)
{ {
guint i; g_autoptr(GByteArray) utf8 = NULL;
GByteArray *utf8; guint i;
g_return_val_if_fail (gsm != NULL, NULL); g_return_val_if_fail (gsm != NULL, NULL);
g_return_val_if_fail (len < 4096, NULL); g_return_val_if_fail (len < 4096, NULL);
@@ -410,13 +412,18 @@ mm_charset_gsm_unpacked_to_utf8 (const guint8 *gsm,
if (ulen) if (ulen)
g_byte_array_append (utf8, &uchars[0], ulen); g_byte_array_append (utf8, &uchars[0], ulen);
else else if (translit)
g_byte_array_append (utf8, (guint8 *) "?", 1); g_byte_array_append (utf8, (guint8 *) "?", 1);
else {
g_set_error (error, MM_CORE_ERROR, MM_CORE_ERROR_INVALID_ARGS,
"Invalid conversion from GSM7");
return NULL;
}
} }
/* Always make sure returned string is NUL terminated */ /* Always make sure returned string is NUL terminated */
g_byte_array_append (utf8, (guint8 *) "\0", 1); g_byte_array_append (utf8, (guint8 *) "\0", 1);
return g_byte_array_free (utf8, FALSE); return g_byte_array_free (g_steal_pointer (&utf8), FALSE);
} }
guint8 * guint8 *
@@ -740,7 +747,7 @@ mm_charset_take_and_convert_to_utf8 (gchar *str,
break; break;
case MM_MODEM_CHARSET_GSM: case MM_MODEM_CHARSET_GSM:
utf8 = (gchar *) mm_charset_gsm_unpacked_to_utf8 ((const guint8 *) str, strlen (str)); utf8 = (gchar *) mm_charset_gsm_unpacked_to_utf8 ((const guint8 *) str, strlen (str), FALSE, NULL);
g_free (str); g_free (str);
break; break;

View File

@@ -53,10 +53,12 @@ gchar *mm_modem_charset_byte_array_to_utf8 (GByteArray *array,
gchar *mm_modem_charset_hex_to_utf8 (const gchar *src, gchar *mm_modem_charset_hex_to_utf8 (const gchar *src,
MMModemCharset charset); MMModemCharset charset);
guint8 *mm_charset_utf8_to_unpacked_gsm (const gchar *utf8, guint8 *mm_charset_utf8_to_unpacked_gsm (const gchar *utf8,
guint32 *out_len); guint32 *out_len);
guint8 *mm_charset_gsm_unpacked_to_utf8 (const guint8 *gsm, guint8 *mm_charset_gsm_unpacked_to_utf8 (const guint8 *gsm,
guint32 len); guint32 len,
gboolean translit,
GError **error);
/* Checks whether conversion to the given charset may be done without errors */ /* Checks whether conversion to the given charset may be done without errors */
gboolean mm_charset_can_convert_to (const gchar *utf8, gboolean mm_charset_can_convert_to (const gchar *utf8,

View File

@@ -120,23 +120,24 @@ sms_string_to_bcd_semi_octets (guint8 *buf, gsize buflen, const char *string)
} }
/* len is in semi-octets */ /* len is in semi-octets */
static char * static gchar *
sms_decode_address (const guint8 *address, int len) sms_decode_address (const guint8 *address,
gint len,
GError **error)
{ {
guint8 addrtype, addrplan; guint8 addrtype, addrplan;
char *utf8; gchar *utf8;
addrtype = address[0] & SMS_NUMBER_TYPE_MASK; addrtype = address[0] & SMS_NUMBER_TYPE_MASK;
addrplan = address[0] & SMS_NUMBER_PLAN_MASK; addrplan = address[0] & SMS_NUMBER_PLAN_MASK;
address++; address++;
if (addrtype == SMS_NUMBER_TYPE_ALPHA) { if (addrtype == SMS_NUMBER_TYPE_ALPHA) {
guint8 *unpacked; g_autofree guint8 *unpacked = NULL;
guint32 unpacked_len; guint32 unpacked_len;
unpacked = mm_charset_gsm_unpack (address, (len * 4) / 7, 0, &unpacked_len); unpacked = mm_charset_gsm_unpack (address, (len * 4) / 7, 0, &unpacked_len);
utf8 = (char *)mm_charset_gsm_unpacked_to_utf8 (unpacked, utf8 = (gchar *) mm_charset_gsm_unpacked_to_utf8 (unpacked, unpacked_len, FALSE, error);
unpacked_len);
g_free (unpacked);
} else if (addrtype == SMS_NUMBER_TYPE_INTL && } else if (addrtype == SMS_NUMBER_TYPE_INTL &&
addrplan == SMS_NUMBER_PLAN_TELEPHONE) { addrplan == SMS_NUMBER_PLAN_TELEPHONE) {
/* International telephone number, format as "+1234567890" */ /* International telephone number, format as "+1234567890" */
@@ -239,41 +240,45 @@ sms_encoding_type (int dcs)
return scheme; return scheme;
} }
static char * static gchar *
sms_decode_text (const guint8 *text, sms_decode_text (const guint8 *text,
int len, int len,
MMSmsEncoding encoding, MMSmsEncoding encoding,
int bit_offset, int bit_offset,
gpointer log_object) gpointer log_object,
GError **error)
{ {
gchar *utf8;
if (encoding == MM_SMS_ENCODING_GSM7) { if (encoding == MM_SMS_ENCODING_GSM7) {
g_autofree guint8 *unpacked = NULL; g_autofree guint8 *unpacked = NULL;
guint32 unpacked_len; guint32 unpacked_len;
gchar *utf8;
mm_obj_dbg (log_object, "converting SMS part text from GSM-7 to UTF-8...");
unpacked = mm_charset_gsm_unpack ((const guint8 *) text, len, bit_offset, &unpacked_len); unpacked = mm_charset_gsm_unpack ((const guint8 *) text, len, bit_offset, &unpacked_len);
utf8 = (char *) mm_charset_gsm_unpacked_to_utf8 (unpacked, unpacked_len); utf8 = (gchar *) mm_charset_gsm_unpacked_to_utf8 (unpacked, unpacked_len, FALSE, error);
mm_obj_dbg (log_object, " got UTF-8 text: '%s'", utf8); if (utf8)
} else if (encoding == MM_SMS_ENCODING_UCS2) { mm_obj_dbg (log_object, "converted SMS part text from GSM-7 to UTF-8: %s", utf8);
g_autoptr(GByteArray) bytearray = NULL; return utf8;
}
if (encoding == MM_SMS_ENCODING_UCS2) {
g_autoptr(GByteArray) bytearray = NULL;
gchar *utf8;
mm_obj_dbg (log_object, "converting SMS part text from UTF-16BE to UTF-8...");
bytearray = g_byte_array_append (g_byte_array_sized_new (len), (const guint8 *)text, len); bytearray = g_byte_array_append (g_byte_array_sized_new (len), (const guint8 *)text, len);
/* Always assume UTF-16 instead of UCS-2! */ /* Always assume UTF-16 instead of UCS-2! */
utf8 = mm_modem_charset_byte_array_to_utf8 (bytearray, MM_MODEM_CHARSET_UTF16); utf8 = mm_modem_charset_byte_array_to_utf8 (bytearray, MM_MODEM_CHARSET_UTF16);
if (!utf8) { if (!utf8)
mm_obj_warn (log_object, "couldn't convert SMS part contents from UTF-16BE to UTF-8: not decoding any text"); g_set_error (error, MM_CORE_ERROR, MM_CORE_ERROR_FAILED,
utf8 = g_strdup (""); "Couldn't convert SMS part contents from UTF-16BE to UTF-8: not decoding any text");
} else else
mm_obj_dbg (log_object, " got UTF-8 text: '%s'", utf8); mm_obj_dbg (log_object, "converted SMS part text from UTF-16BE to UTF-8: %s", utf8);
} else { return utf8;
mm_obj_warn (log_object, "unexpected encoding: %s; not decoding any text", mm_sms_encoding_get_string (encoding));
utf8 = g_strdup ("");
} }
return utf8; g_set_error (error, MM_CORE_ERROR, MM_CORE_ERROR_FAILED,
"Couldn't convert SMS part contents from %s to UTF-8",
mm_sms_encoding_get_string (encoding));
return NULL;
} }
static guint static guint
@@ -373,6 +378,7 @@ mm_sms_part_3gpp_new_from_binary_pdu (guint index,
guint tp_dcs_offset = 0; guint tp_dcs_offset = 0;
guint tp_user_data_len_offset = 0; guint tp_user_data_len_offset = 0;
MMSmsEncoding user_data_encoding = MM_SMS_ENCODING_UNKNOWN; MMSmsEncoding user_data_encoding = MM_SMS_ENCODING_UNKNOWN;
gchar *address;
/* Create the new MMSmsPart */ /* Create the new MMSmsPart */
sms_part = mm_sms_part_new (index, MM_SMS_PDU_TYPE_UNKNOWN); sms_part = mm_sms_part_new (index, MM_SMS_PDU_TYPE_UNKNOWN);
@@ -405,8 +411,13 @@ mm_sms_part_3gpp_new_from_binary_pdu (guint index,
if (smsc_addr_size_bytes > 0) { if (smsc_addr_size_bytes > 0) {
PDU_SIZE_CHECK (offset + smsc_addr_size_bytes, "cannot read SMSC address"); PDU_SIZE_CHECK (offset + smsc_addr_size_bytes, "cannot read SMSC address");
/* SMSC may not be given in DELIVER PDUs */ /* SMSC may not be given in DELIVER PDUs */
mm_sms_part_take_smsc (sms_part, address = sms_decode_address (&pdu[1], 2 * (smsc_addr_size_bytes - 1), error);
sms_decode_address (&pdu[1], 2 * (smsc_addr_size_bytes - 1))); if (!address) {
g_prefix_error (error, "Couldn't read SMSC address: ");
mm_sms_part_free (sms_part);
return NULL;
}
mm_sms_part_take_smsc (sms_part, g_steal_pointer (&address));
mm_obj_dbg (log_object, " SMSC address parsed: '%s'", mm_sms_part_get_smsc (sms_part)); mm_obj_dbg (log_object, " SMSC address parsed: '%s'", mm_sms_part_get_smsc (sms_part));
offset += smsc_addr_size_bytes; offset += smsc_addr_size_bytes;
} else } else
@@ -478,9 +489,13 @@ mm_sms_part_3gpp_new_from_binary_pdu (guint index,
tp_addr_size_bytes = (tp_addr_size_digits + 1) >> 1; tp_addr_size_bytes = (tp_addr_size_digits + 1) >> 1;
PDU_SIZE_CHECK (offset + tp_addr_size_bytes, "cannot read number"); PDU_SIZE_CHECK (offset + tp_addr_size_bytes, "cannot read number");
mm_sms_part_take_number (sms_part, address = sms_decode_address (&pdu[offset], tp_addr_size_digits, error);
sms_decode_address (&pdu[offset], if (!address) {
tp_addr_size_digits)); g_prefix_error (error, "Couldn't read address: ");
mm_sms_part_free (sms_part);
return NULL;
}
mm_sms_part_take_number (sms_part, g_steal_pointer (&address));
mm_obj_dbg (log_object, " number parsed: %s", mm_sms_part_get_number (sms_part)); mm_obj_dbg (log_object, " number parsed: %s", mm_sms_part_get_number (sms_part));
offset += (1 + tp_addr_size_bytes); /* +1 due to the Type of Address byte */ offset += (1 + tp_addr_size_bytes); /* +1 due to the Type of Address byte */
@@ -709,17 +724,24 @@ mm_sms_part_3gpp_new_from_binary_pdu (guint index,
switch (user_data_encoding) { switch (user_data_encoding) {
case MM_SMS_ENCODING_GSM7: case MM_SMS_ENCODING_GSM7:
case MM_SMS_ENCODING_UCS2: case MM_SMS_ENCODING_UCS2:
/* Otherwise if it's 7-bit or UCS2 we can decode it */ {
mm_obj_dbg (log_object, "decoding SMS text with %u elements", tp_user_data_size_elements); gchar *text;
mm_sms_part_take_text (sms_part,
sms_decode_text (&pdu[tp_user_data_offset],
tp_user_data_size_elements,
user_data_encoding,
bit_offset,
log_object));
g_warn_if_fail (mm_sms_part_get_text (sms_part) != NULL);
break;
/* Otherwise if it's 7-bit or UCS2 we can decode it */
mm_obj_dbg (log_object, "decoding SMS text with %u elements", tp_user_data_size_elements);
text = sms_decode_text (&pdu[tp_user_data_offset],
tp_user_data_size_elements,
user_data_encoding,
bit_offset,
log_object,
error);
if (!text) {
mm_sms_part_free (sms_part);
return NULL;
}
mm_sms_part_take_text (sms_part, text);
break;
}
case MM_SMS_ENCODING_8BIT: case MM_SMS_ENCODING_8BIT:
case MM_SMS_ENCODING_UNKNOWN: case MM_SMS_ENCODING_UNKNOWN:
default: default:

View File

@@ -30,6 +30,7 @@ common_test_gsm7 (const gchar *in_utf8)
g_autofree guint8 *packed_gsm = NULL; g_autofree guint8 *packed_gsm = NULL;
g_autofree guint8 *unpacked_gsm_2 = NULL; g_autofree guint8 *unpacked_gsm_2 = NULL;
g_autofree gchar *built_utf8 = NULL; g_autofree gchar *built_utf8 = NULL;
g_autoptr(GError) error = NULL;
/* Convert to GSM */ /* Convert to GSM */
unpacked_gsm = mm_charset_utf8_to_unpacked_gsm (in_utf8, &unpacked_gsm_len); unpacked_gsm = mm_charset_utf8_to_unpacked_gsm (in_utf8, &unpacked_gsm_len);
@@ -58,8 +59,9 @@ common_test_gsm7 (const gchar *in_utf8)
g_assert_nonnull (unpacked_gsm_2); g_assert_nonnull (unpacked_gsm_2);
/* And back to UTF-8 */ /* And back to UTF-8 */
built_utf8 = (gchar *) mm_charset_gsm_unpacked_to_utf8 (unpacked_gsm_2, unpacked_gsm_len_2); built_utf8 = (gchar *) mm_charset_gsm_unpacked_to_utf8 (unpacked_gsm_2, unpacked_gsm_len_2, FALSE, &error);
g_assert_nonnull (built_utf8); g_assert_nonnull (built_utf8);
g_assert_no_error (error);
g_assert_cmpstr (built_utf8, ==, in_utf8); g_assert_cmpstr (built_utf8, ==, in_utf8);
} }