charsets: move mm_sms_part_3gpp_util_split_text to mm_charset_util_split_text

This commit is contained in:
Andrey Skvortsov
2022-08-30 01:35:18 +03:00
committed by Aleksander Morgado
parent 9f5a84f777
commit 2ece78c80f
7 changed files with 350 additions and 335 deletions

View File

@@ -112,6 +112,7 @@ generate_3gpp_submit_pdus (MMBaseSms *self,
gsize data_len = 0;
MMSmsEncoding encoding;
MMModemCharset charset;
gchar **split_text = NULL;
GByteArray **split_data = NULL;
@@ -129,7 +130,7 @@ generate_3gpp_submit_pdus (MMBaseSms *self,
g_assert (!(text != NULL && data != NULL));
if (text) {
split_text = mm_sms_part_3gpp_util_split_text (text, &encoding, self);
split_text = mm_charset_util_split_text (text, &charset, self);
if (!split_text) {
g_set_error (error,
MM_CORE_ERROR,
@@ -137,6 +138,7 @@ generate_3gpp_submit_pdus (MMBaseSms *self,
"Cannot generate PDUs: Error processing input text");
return FALSE;
}
encoding = (charset == MM_MODEM_CHARSET_GSM) ? MM_SMS_ENCODING_GSM7 : MM_SMS_ENCODING_UCS2;
n_parts = g_strv_length (split_text);
} else if (data) {
encoding = MM_SMS_ENCODING_8BIT;

View File

@@ -974,3 +974,142 @@ mm_modem_charsets_init (void)
mm_obj_dbg (NULL, "[charsets] %s: iconv conversion to/from charset is supported", charset_settings[i].iconv_name);
}
}
static gchar **
util_split_text_gsm7 (const gchar *text,
gsize text_len,
gpointer log_object)
{
gchar **out;
guint n_chunks;
guint i;
guint j;
/* No splitting needed? */
if (text_len <= 160) {
out = g_new0 (gchar *, 2);
out[0] = g_strdup (text);
return out;
}
/* Compute number of chunks needed */
n_chunks = text_len / 153;
if (text_len % 153 != 0)
n_chunks++;
/* Fill in all chunks */
out = g_new0 (gchar *, n_chunks + 1);
for (i = 0, j = 0; i < n_chunks; i++, j += 153)
out[i] = g_strndup (&text[j], 153);
return out;
}
/* Split a UTF-8 text into parts whose UTF-16 encoding fits in one SMS part.
 *
 * The text is walked one UTF-8 character at a time; each character is
 * converted to UTF-16 to learn how many bytes it needs (2, or 4 when a
 * surrogate pair is required). A part is closed as soon as adding the next
 * character would make it larger than 134 bytes.
 *
 * If the whole text needs at most 140 bytes in UTF-16, no split is done and
 * a single-element array holding a copy of 'text' is returned instead.
 *
 * 'text_len' and 'log_object' are not used by this function.
 *
 * Returns a NULL-terminated array of newly allocated UTF-8 strings. */
static gchar **
util_split_text_utf16_or_ucs2 (const gchar *text,
                               gsize        text_len,
                               gpointer     log_object)
{
    g_autoptr(GPtrArray)  parts = NULL;
    const gchar          *cursor = text;
    const gchar          *part_begin = text;
    glong                 part_bytes = 0;
    glong                 total_bytes = 0;

    parts = g_ptr_array_new_with_free_func ((GDestroyNotify)g_free);

    for (; cursor && *cursor; cursor = g_utf8_next_char (cursor)) {
        g_autofree gunichar2 *utf16 = NULL;
        glong                 n_units = 0;
        glong                 n_bytes = 0;
        gunichar              codepoint;

        codepoint = g_utf8_get_char (cursor);
        utf16 = g_ucs4_to_utf16 (&codepoint, 1, NULL, &n_units, NULL);
        g_assert (n_units > 0);

        /* One unit for codepoints in the BMP (all that UCS-2 can hold), two
         * units when UTF-16 needs a surrogate pair. */
        n_bytes = (n_units * sizeof (gunichar2));
        total_bytes += n_bytes;

        /* Still fits in the part being built? */
        if ((part_bytes + n_bytes) <= 134) {
            part_bytes += n_bytes;
            continue;
        }

        /* It doesn't: close the part and start a new one with this
         * character as its first element. */
        g_ptr_array_add (parts, g_strndup (part_begin, cursor - part_begin));
        part_begin = cursor;
        part_bytes = n_bytes;
    }

    /* More than 140 bytes in total: the parts are really needed. Append the
     * last pending one plus the trailing NULL and hand the array over. */
    if (total_bytes > 140) {
        g_ptr_array_add (parts, g_strndup (part_begin, cursor - part_begin));
        g_ptr_array_add (parts, NULL);
        return (gchar **) g_ptr_array_free (g_steal_pointer (&parts), FALSE);
    }

    /* Special case: everything fits in one single part without UDH, so
     * whatever was collected above is dropped and the full text returned. */
    {
        gchar **single;

        single = g_new0 (gchar *, 2);
        single[0] = g_strdup (text);
        return single;
    }
}
/* Pick the charset to use for 'text' and split it into SMS-part sized
 * strings.
 *
 * Splitting rules (the User Data of a SMS part holds up to 140 bytes):
 *  - A message that fits in one part may use the full 140 bytes. When more
 *    than one part is needed, 6 bytes go to the UDH header, leaving
 *    140 - 6 = 134 bytes per part.
 *  - GSM7: 7-bit characters are packed, so 140 bytes hold up to 160
 *    characters (160 * 7 = 140 * 8 = 1120), and 134 bytes hold up to 153
 *    characters (134 * 8 = 1072; 1072 / 7 = 153.14).
 *  - UCS2: 2 bytes per character, so up to 70 characters in 140 bytes or
 *    67 characters in 134 bytes.
 *  - UTF-16: the number of characters is variable, as characters outside
 *    the BMP need a surrogate pair (4 bytes instead of 2).
 *
 * 'charset' is set to MM_MODEM_CHARSET_GSM when the text can be converted
 * to GSM, and to MM_MODEM_CHARSET_UTF16 otherwise.
 *
 * Returns NULL when 'text' is NULL ('charset' is left untouched then);
 * otherwise the array built by the charset-specific split helper. */
gchar **
mm_charset_util_split_text (const gchar    *text,
                            MMModemCharset *charset,
                            gpointer        log_object)
{
    gsize text_len;

    if (text == NULL)
        return NULL;

    text_len = strlen (text);

    /* Not convertible to GSM: report UTF-16 and split counting UTF-16 bytes */
    if (!mm_charset_can_convert_to (text, MM_MODEM_CHARSET_GSM)) {
        *charset = MM_MODEM_CHARSET_UTF16;
        return util_split_text_utf16_or_ucs2 (text, text_len, log_object);
    }

    /* GSM encoding is possible */
    *charset = MM_MODEM_CHARSET_GSM;
    return util_split_text_gsm7 (text, text_len, log_object);
}

View File

@@ -112,4 +112,15 @@ gchar *mm_modem_charset_str_to_utf8 (const gchar *str,
void mm_modem_charsets_init (void);
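/*
 * Select the appropriate charset (GSM when the whole text can be converted
 * to it, UTF-16 otherwise) and split a UTF-8 encoded input string
 * into N UTF-8 strings, so that each of the strings
 * can be encoded into 'charset' and placed in a SMS part.
 * Returns a NULL-terminated array of newly allocated strings, or NULL
 * if 'text' is NULL.
 */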
gchar **mm_charset_util_split_text (const gchar *text,
MMModemCharset *charset,
gpointer log_object);
#endif /* MM_CHARSETS_H */

View File

@@ -1101,144 +1101,6 @@ error:
return NULL;
}
static gchar **
util_split_text_gsm7 (const gchar *text,
gsize text_len,
gpointer log_object)
{
gchar **out;
guint n_chunks;
guint i;
guint j;
/* No splitting needed? */
if (text_len <= 160) {
out = g_new0 (gchar *, 2);
out[0] = g_strdup (text);
return out;
}
/* Compute number of chunks needed */
n_chunks = text_len / 153;
if (text_len % 153 != 0)
n_chunks++;
/* Fill in all chunks */
out = g_new0 (gchar *, n_chunks + 1);
for (i = 0, j = 0; i < n_chunks; i++, j += 153)
out[i] = g_strndup (&text[j], 153);
return out;
}
/* Split a UTF-8 text into parts whose UTF-16 encoding fits in one SMS part.
 *
 * The text is walked one UTF-8 character at a time; each character is
 * converted to UTF-16 to learn how many bytes it needs (2, or 4 when a
 * surrogate pair is required). A part is closed as soon as adding the next
 * character would make it larger than 134 bytes.
 *
 * If the whole text needs at most 140 bytes in UTF-16, no split is done and
 * a single-element array holding a copy of 'text' is returned instead.
 *
 * 'text_len' and 'log_object' are not used by this function.
 *
 * Returns a NULL-terminated array of newly allocated UTF-8 strings. */
static gchar **
util_split_text_utf16_or_ucs2 (const gchar *text,
                               gsize        text_len,
                               gpointer     log_object)
{
    g_autoptr(GPtrArray)  parts = NULL;
    const gchar          *cursor = text;
    const gchar          *part_begin = text;
    glong                 part_bytes = 0;
    glong                 total_bytes = 0;

    parts = g_ptr_array_new_with_free_func ((GDestroyNotify)g_free);

    for (; cursor && *cursor; cursor = g_utf8_next_char (cursor)) {
        g_autofree gunichar2 *utf16 = NULL;
        glong                 n_units = 0;
        glong                 n_bytes = 0;
        gunichar              codepoint;

        codepoint = g_utf8_get_char (cursor);
        utf16 = g_ucs4_to_utf16 (&codepoint, 1, NULL, &n_units, NULL);
        g_assert (n_units > 0);

        /* One unit for codepoints in the BMP (all that UCS-2 can hold), two
         * units when UTF-16 needs a surrogate pair. */
        n_bytes = (n_units * sizeof (gunichar2));
        total_bytes += n_bytes;

        /* Still fits in the part being built? */
        if ((part_bytes + n_bytes) <= 134) {
            part_bytes += n_bytes;
            continue;
        }

        /* It doesn't: close the part and start a new one with this
         * character as its first element. */
        g_ptr_array_add (parts, g_strndup (part_begin, cursor - part_begin));
        part_begin = cursor;
        part_bytes = n_bytes;
    }

    /* More than 140 bytes in total: the parts are really needed. Append the
     * last pending one plus the trailing NULL and hand the array over. */
    if (total_bytes > 140) {
        g_ptr_array_add (parts, g_strndup (part_begin, cursor - part_begin));
        g_ptr_array_add (parts, NULL);
        return (gchar **) g_ptr_array_free (g_steal_pointer (&parts), FALSE);
    }

    /* Special case: everything fits in one single part without UDH, so
     * whatever was collected above is dropped and the full text returned. */
    {
        gchar **single;

        single = g_new0 (gchar *, 2);
        single[0] = g_strdup (text);
        return single;
    }
}
/* Pick the SMS encoding to use for 'text' and split it into SMS-part sized
 * strings.
 *
 * Splitting rules (the User Data of a SMS part holds up to 140 bytes):
 *  - A message that fits in one part may use the full 140 bytes. When more
 *    than one part is needed, 6 bytes go to the UDH header, leaving
 *    140 - 6 = 134 bytes per part.
 *  - GSM7: 7-bit characters are packed, so 140 bytes hold up to 160
 *    characters (160 * 7 = 140 * 8 = 1120), and 134 bytes hold up to 153
 *    characters (134 * 8 = 1072; 1072 / 7 = 153.14).
 *  - UCS2: 2 bytes per character, so up to 70 characters in 140 bytes or
 *    67 characters in 134 bytes.
 *  - UTF-16: the number of characters is variable, as characters outside
 *    the BMP need a surrogate pair (4 bytes instead of 2).
 *
 * 'encoding' is set to MM_SMS_ENCODING_GSM7 when the text can be converted
 * to GSM, and to MM_SMS_ENCODING_UCS2 otherwise (the split itself still
 * accounts for UTF-16 surrogate pairs).
 *
 * Returns NULL when 'text' is NULL ('encoding' is left untouched then);
 * otherwise the array built by the encoding-specific split helper. */
gchar **
mm_sms_part_3gpp_util_split_text (const gchar   *text,
                                  MMSmsEncoding *encoding,
                                  gpointer       log_object)
{
    gsize text_len;

    if (text == NULL)
        return NULL;

    text_len = strlen (text);

    /* Not convertible to GSM: report UCS-2 and split supporting UTF-16 */
    if (!mm_charset_can_convert_to (text, MM_MODEM_CHARSET_GSM)) {
        *encoding = MM_SMS_ENCODING_UCS2;
        return util_split_text_utf16_or_ucs2 (text, text_len, log_object);
    }

    /* GSM encoding is possible */
    *encoding = MM_SMS_ENCODING_GSM7;
    return util_split_text_gsm7 (text, text_len, log_object);
}
GByteArray **
mm_sms_part_3gpp_util_split_data (const guint8 *data,
gsize data_len)

View File

@@ -44,9 +44,6 @@ guint mm_sms_part_3gpp_encode_address (const gchar *address,
guint8 *buf,
gsize buflen,
gboolean is_smsc);
gchar **mm_sms_part_3gpp_util_split_text (const gchar *text,
MMSmsEncoding *encoding,
gpointer log_object);
GByteArray **mm_sms_part_3gpp_util_split_data (const guint8 *data,
gsize data_len);

View File

@@ -446,6 +446,188 @@ test_charset_can_covert_to (void)
}
}
/********************* TEXT SPLIT TESTS *********************/
/* Shared checker for the text split tests.
 *
 * Splits 'text' with mm_charset_util_split_text() and asserts that:
 *  - a result is returned and a charset was selected,
 *  - the selected charset is 'expected_charset',
 *  - the chunks are exactly the ones in the NULL-terminated 'expected'
 *    array, in the same order. */
static void
common_test_text_split (const gchar     *text,
                        const gchar    **expected,
                        MMModemCharset   expected_charset)
{
    gchar          **out;
    MMModemCharset   out_charset = MM_MODEM_CHARSET_UNKNOWN;
    guint            i;

    out = mm_charset_util_split_text (text, &out_charset, NULL);
    g_assert (out != NULL);
    g_assert (out_charset != MM_MODEM_CHARSET_UNKNOWN);
    /* The charset reported by the splitter must be the one the test expects */
    g_assert_cmpuint (out_charset, ==, expected_charset);

    /* Same number of chunks first, so that the loop below covers them all */
    g_assert_cmpuint (g_strv_length (out), ==, g_strv_length ((gchar **)expected));
    for (i = 0; out[i]; i++) {
        g_assert_cmpstr (out[i], ==, expected[i]);
    }

    g_strfreev (out);
}
/* Short ASCII text: expected as one single chunk, GSM charset. */
static void
test_text_split_short_gsm7 (void)
{
    const gchar *expected_chunks[] = { "Hello", NULL };
    const gchar *input = "Hello";

    common_test_text_split (input, expected_chunks, MM_MODEM_CHARSET_GSM);
}
/* Short text with BMP-only characters: expected as one single chunk,
 * UTF-16 charset. */
static void
test_text_split_short_ucs2 (void)
{
    const gchar *expected_chunks[] = { "你好", NULL };
    const gchar *input = "你好"; /* (UTF-8) e4 bd a0 e5 a5 bd */

    common_test_text_split (input, expected_chunks, MM_MODEM_CHARSET_UTF16);
}
/* Short text with one character outside the BMP: expected as one single
 * chunk, UTF-16 charset. */
static void
test_text_split_short_utf16 (void)
{
    const gchar *expected_chunks[] = { "😉", NULL };
    const gchar *input = "😉"; /* U+1F609, winking face */

    common_test_text_split (input, expected_chunks, MM_MODEM_CHARSET_UTF16);
}
/* 160 ASCII digits, the most that fits in a single GSM7 part: expected
 * back unsplit. */
static void
test_text_split_max_single_pdu_gsm7 (void)
{
    const gchar *expected_chunks[] = {
        "0123456789012345678901234567890123456789"
        "0123456789012345678901234567890123456789"
        "0123456789012345678901234567890123456789"
        "0123456789012345678901234567890123456789",
        NULL
    };
    const gchar *input =
        "0123456789012345678901234567890123456789"
        "0123456789012345678901234567890123456789"
        "0123456789012345678901234567890123456789"
        "0123456789012345678901234567890123456789";

    common_test_text_split (input, expected_chunks, MM_MODEM_CHARSET_GSM);
}
/* 70 Chinese characters: 210 bytes in UTF-8, but exactly 140 bytes in
 * UCS-2, so the text is expected back unsplit. */
static void
test_text_split_max_single_pdu_ucs2 (void)
{
    const gchar *expected_chunks[] = {
        "你好你好你好你好你好你好你好你好你好你好你好你好你好你好你好你好"
        "你好你好你好你好你好你好你好你好你好你好你好你好你好你好你好你好"
        "你好你好你好",
        NULL
    };
    const gchar *input =
        "你好你好你好你好你好你好你好你好你好你好你好你好你好你好你好你好"
        "你好你好你好你好你好你好你好你好你好你好你好你好你好你好你好你好"
        "你好你好你好";

    common_test_text_split (input, expected_chunks, MM_MODEM_CHARSET_UTF16);
}
/* 35 Bhaiksuki characters, each needing 4 bytes both in UTF-8 and in
 * UTF-16: 140 bytes in total, so the text is expected back unsplit. */
static void
test_text_split_max_single_pdu_utf16 (void)
{
    const gchar *expected_chunks[] = {
        "𑰀𑰁𑰂𑰃𑰄𑰅𑰆𑰇𑰈𑰊𑰋𑰌𑰍𑰎𑰏𑰐𑰑𑰒𑰓𑰔𑰕𑰖𑰗𑰘𑰙𑰚𑰛𑰜𑰝𑰞𑰟𑰠𑰡𑰢𑰣",
        NULL
    };
    const gchar *input =
        "𑰀𑰁𑰂𑰃𑰄𑰅𑰆𑰇𑰈𑰊𑰋𑰌𑰍𑰎𑰏𑰐𑰑𑰒𑰓𑰔𑰕𑰖𑰗𑰘𑰙𑰚𑰛𑰜𑰝𑰞𑰟𑰠𑰡𑰢𑰣";

    common_test_text_split (input, expected_chunks, MM_MODEM_CHARSET_UTF16);
}
/* 161 ASCII digits, one more than what fits in a single GSM7 part:
 * expected as a first chunk of 153 characters plus a second chunk with
 * the remaining 8. */
static void
test_text_split_two_pdu_gsm7 (void)
{
    const gchar *expected_chunks[] = {
        /* First chunk */
        "0123456789012345678901234567890123456789"
        "0123456789012345678901234567890123456789"
        "0123456789012345678901234567890123456789"
        "012345678901234567890123456789012",
        /* Second chunk */
        "34567890",
        NULL
    };
    const gchar *input =
        "0123456789012345678901234567890123456789"
        "0123456789012345678901234567890123456789"
        "0123456789012345678901234567890123456789"
        "01234567890123456789012345678901234567890";

    common_test_text_split (input, expected_chunks, MM_MODEM_CHARSET_GSM);
}
/* 71 Chinese characters (142 bytes in UCS-2), one more than what fits in
 * a single part: expected as a first chunk of 67 characters (134 bytes)
 * plus a second chunk with the remaining 4. */
static void
test_text_split_two_pdu_ucs2 (void)
{
    const gchar *expected_chunks[] = {
        /* First chunk */
        "你好你好你好你好你好你好你好你好你好你好你好你好你好你好你好你好"
        "你好你好你好你好你好你好你好你好你好你好你好你好你好你好你好你好"
        "你好你",
        /* Second chunk */
        "好你好好",
        NULL
    };
    const gchar *input =
        "你好你好你好你好你好你好你好你好你好你好你好你好你好你好你好你好"
        "你好你好你好你好你好你好你好你好你好你好你好你好你好你好你好你好"
        "你好你好你好好";

    common_test_text_split (input, expected_chunks, MM_MODEM_CHARSET_UTF16);
}
/* 35 Bhaiksuki characters (4 bytes each both in UTF-8 and in UTF-16, 140
 * bytes in total) followed by one ASCII character (1 byte in UTF-8, 2 bytes
 * in UTF-16): 142 bytes in UTF-16, so it does not fit in one single PDU.
 *
 * Expected split: the first chunk holds the leading 33 Bhaiksuki characters
 * (132 bytes in UTF-16, as a 34th one would go over 134), and the last
 * chunk holds the remaining 2 Bhaiksuki characters plus the ASCII one
 * (10 bytes in UTF-16). */
static void
test_text_split_two_pdu_utf16 (void)
{
    const gchar *expected_chunks[] = {
        "𑰀𑰁𑰂𑰃𑰄𑰅𑰆𑰇𑰈𑰊𑰋𑰌𑰍𑰎𑰏𑰐𑰑𑰒𑰓𑰔𑰕𑰖𑰗𑰘𑰙𑰚𑰛𑰜𑰝𑰞𑰟𑰠𑰡",
        "𑰢𑰣a",
        NULL
    };
    const gchar *input =
        "𑰀𑰁𑰂𑰃𑰄𑰅𑰆𑰇𑰈𑰊𑰋𑰌𑰍𑰎𑰏𑰐𑰑𑰒𑰓𑰔𑰕𑰖𑰗𑰘𑰙𑰚𑰛𑰜𑰝𑰞𑰟𑰠𑰡𑰢𑰣a";

    common_test_text_split (input, expected_chunks, MM_MODEM_CHARSET_UTF16);
}
int main (int argc, char **argv)
{
setlocale (LC_ALL, "");
@@ -471,5 +653,15 @@ int main (int argc, char **argv)
g_test_add_func ("/MM/charsets/can-convert-to", test_charset_can_covert_to);
g_test_add_func ("/MM/charsets/text-split/gsm7/short", test_text_split_short_gsm7);
g_test_add_func ("/MM/charsets/text-split/ucs2/short", test_text_split_short_ucs2);
g_test_add_func ("/MM/charsets/text-split/utf16/short", test_text_split_short_utf16);
g_test_add_func ("/MM/charsets/text-split/gsm7/max-single-pdu", test_text_split_max_single_pdu_gsm7);
g_test_add_func ("/MM/charsets/text-split/ucs2/max-single-pdu", test_text_split_max_single_pdu_ucs2);
g_test_add_func ("/MM/charsets/text-split/utf16/max-single-pdu", test_text_split_max_single_pdu_utf16);
g_test_add_func ("/MM/charsets/text-split/gsm7/two-pdu", test_text_split_two_pdu_gsm7);
g_test_add_func ("/MM/charsets/text-split/ucs2/two-pdu", test_text_split_two_pdu_ucs2);
g_test_add_func ("/MM/charsets/text-split/utf16/two-pdu", test_text_split_two_pdu_utf16);
return g_test_run ();
}

View File

@@ -24,6 +24,7 @@
#include <libmm-glib.h>
#include "mm-sms-part-3gpp.h"
#include "mm-charsets.h"
#include "mm-log-test.h"
/********************* PDU PARSER TESTS *********************/
@@ -529,9 +530,12 @@ common_test_create_pdu (const gchar *smsc,
if (text) {
gchar **out;
MMSmsEncoding encoding = MM_SMS_ENCODING_UNKNOWN;
MMModemCharset charset = MM_MODEM_CHARSET_UNKNOWN;
/* Detect best encoding */
out = mm_sms_part_3gpp_util_split_text (text, &encoding, NULL);
out = mm_charset_util_split_text (text, &charset, NULL);
if (out)
encoding = (charset == MM_MODEM_CHARSET_GSM) ? MM_SMS_ENCODING_GSM7 : MM_SMS_ENCODING_UCS2;
g_strfreev (out);
mm_sms_part_set_text (part, text);
mm_sms_part_set_encoding (part, encoding);
@@ -708,188 +712,6 @@ test_create_pdu_gsm_no_validity (void)
1); /* expected_msgstart */
}
/********************* TEXT SPLIT TESTS *********************/
/* Shared checker for the text split tests.
 *
 * Splits 'text' with mm_sms_part_3gpp_util_split_text() and asserts that:
 *  - a result is returned and an encoding was selected,
 *  - the selected encoding is 'expected_encoding',
 *  - the chunks are exactly the ones in the NULL-terminated 'expected'
 *    array, in the same order. */
static void
common_test_text_split (const gchar    *text,
                        const gchar   **expected,
                        MMSmsEncoding   expected_encoding)
{
    gchar         **out;
    MMSmsEncoding   out_encoding = MM_SMS_ENCODING_UNKNOWN;
    guint           i;

    out = mm_sms_part_3gpp_util_split_text (text, &out_encoding, NULL);
    g_assert (out != NULL);
    g_assert (out_encoding != MM_SMS_ENCODING_UNKNOWN);
    /* The encoding reported by the splitter must be the one the test expects */
    g_assert_cmpuint (out_encoding, ==, expected_encoding);

    /* Same number of chunks first, so that the loop below covers them all */
    g_assert_cmpuint (g_strv_length (out), ==, g_strv_length ((gchar **)expected));
    for (i = 0; out[i]; i++) {
        g_assert_cmpstr (out[i], ==, expected[i]);
    }

    g_strfreev (out);
}
/* Short ASCII text: expected as one single chunk, GSM7 encoding. */
static void
test_text_split_short_gsm7 (void)
{
    const gchar *expected_chunks[] = { "Hello", NULL };
    const gchar *input = "Hello";

    common_test_text_split (input, expected_chunks, MM_SMS_ENCODING_GSM7);
}
/* Short text with BMP-only characters: expected as one single chunk,
 * UCS2 encoding. */
static void
test_text_split_short_ucs2 (void)
{
    const gchar *expected_chunks[] = { "你好", NULL };
    const gchar *input = "你好"; /* (UTF-8) e4 bd a0 e5 a5 bd */

    common_test_text_split (input, expected_chunks, MM_SMS_ENCODING_UCS2);
}
/* Short text with one character outside the BMP: expected as one single
 * chunk, reported as UCS2 encoding. */
static void
test_text_split_short_utf16 (void)
{
    const gchar *expected_chunks[] = { "😉", NULL };
    const gchar *input = "😉"; /* U+1F609, winking face */

    common_test_text_split (input, expected_chunks, MM_SMS_ENCODING_UCS2);
}
/* 160 ASCII digits, the most that fits in a single GSM7 part: expected
 * back unsplit. */
static void
test_text_split_max_single_pdu_gsm7 (void)
{
    const gchar *expected_chunks[] = {
        "0123456789012345678901234567890123456789"
        "0123456789012345678901234567890123456789"
        "0123456789012345678901234567890123456789"
        "0123456789012345678901234567890123456789",
        NULL
    };
    const gchar *input =
        "0123456789012345678901234567890123456789"
        "0123456789012345678901234567890123456789"
        "0123456789012345678901234567890123456789"
        "0123456789012345678901234567890123456789";

    common_test_text_split (input, expected_chunks, MM_SMS_ENCODING_GSM7);
}
/* 70 Chinese characters: 210 bytes in UTF-8, but exactly 140 bytes in
 * UCS-2, so the text is expected back unsplit. */
static void
test_text_split_max_single_pdu_ucs2 (void)
{
    const gchar *expected_chunks[] = {
        "你好你好你好你好你好你好你好你好你好你好你好你好你好你好你好你好"
        "你好你好你好你好你好你好你好你好你好你好你好你好你好你好你好你好"
        "你好你好你好",
        NULL
    };
    const gchar *input =
        "你好你好你好你好你好你好你好你好你好你好你好你好你好你好你好你好"
        "你好你好你好你好你好你好你好你好你好你好你好你好你好你好你好你好"
        "你好你好你好";

    common_test_text_split (input, expected_chunks, MM_SMS_ENCODING_UCS2);
}
/* 35 Bhaiksuki characters, each needing 4 bytes both in UTF-8 and in
 * UTF-16: 140 bytes in total, so the text is expected back unsplit. */
static void
test_text_split_max_single_pdu_utf16 (void)
{
    const gchar *expected_chunks[] = {
        "𑰀𑰁𑰂𑰃𑰄𑰅𑰆𑰇𑰈𑰊𑰋𑰌𑰍𑰎𑰏𑰐𑰑𑰒𑰓𑰔𑰕𑰖𑰗𑰘𑰙𑰚𑰛𑰜𑰝𑰞𑰟𑰠𑰡𑰢𑰣",
        NULL
    };
    const gchar *input =
        "𑰀𑰁𑰂𑰃𑰄𑰅𑰆𑰇𑰈𑰊𑰋𑰌𑰍𑰎𑰏𑰐𑰑𑰒𑰓𑰔𑰕𑰖𑰗𑰘𑰙𑰚𑰛𑰜𑰝𑰞𑰟𑰠𑰡𑰢𑰣";

    common_test_text_split (input, expected_chunks, MM_SMS_ENCODING_UCS2);
}
/* 161 ASCII digits, one more than what fits in a single GSM7 part:
 * expected as a first chunk of 153 characters plus a second chunk with
 * the remaining 8. */
static void
test_text_split_two_pdu_gsm7 (void)
{
    const gchar *expected_chunks[] = {
        /* First chunk */
        "0123456789012345678901234567890123456789"
        "0123456789012345678901234567890123456789"
        "0123456789012345678901234567890123456789"
        "012345678901234567890123456789012",
        /* Second chunk */
        "34567890",
        NULL
    };
    const gchar *input =
        "0123456789012345678901234567890123456789"
        "0123456789012345678901234567890123456789"
        "0123456789012345678901234567890123456789"
        "01234567890123456789012345678901234567890";

    common_test_text_split (input, expected_chunks, MM_SMS_ENCODING_GSM7);
}
/* 71 Chinese characters (142 bytes in UCS-2), one more than what fits in
 * a single part: expected as a first chunk of 67 characters (134 bytes)
 * plus a second chunk with the remaining 4. */
static void
test_text_split_two_pdu_ucs2 (void)
{
    const gchar *expected_chunks[] = {
        /* First chunk */
        "你好你好你好你好你好你好你好你好你好你好你好你好你好你好你好你好"
        "你好你好你好你好你好你好你好你好你好你好你好你好你好你好你好你好"
        "你好你",
        /* Second chunk */
        "好你好好",
        NULL
    };
    const gchar *input =
        "你好你好你好你好你好你好你好你好你好你好你好你好你好你好你好你好"
        "你好你好你好你好你好你好你好你好你好你好你好你好你好你好你好你好"
        "你好你好你好好";

    common_test_text_split (input, expected_chunks, MM_SMS_ENCODING_UCS2);
}
/* 35 Bhaiksuki characters (4 bytes each both in UTF-8 and in UTF-16, 140
 * bytes in total) followed by one ASCII character (1 byte in UTF-8, 2 bytes
 * in UTF-16): 142 bytes in UTF-16, so it does not fit in one single PDU.
 *
 * Expected split: the first chunk holds the leading 33 Bhaiksuki characters
 * (132 bytes in UTF-16, as a 34th one would go over 134), and the last
 * chunk holds the remaining 2 Bhaiksuki characters plus the ASCII one
 * (10 bytes in UTF-16). */
static void
test_text_split_two_pdu_utf16 (void)
{
    const gchar *expected_chunks[] = {
        "𑰀𑰁𑰂𑰃𑰄𑰅𑰆𑰇𑰈𑰊𑰋𑰌𑰍𑰎𑰏𑰐𑰑𑰒𑰓𑰔𑰕𑰖𑰗𑰘𑰙𑰚𑰛𑰜𑰝𑰞𑰟𑰠𑰡",
        "𑰢𑰣a",
        NULL
    };
    const gchar *input =
        "𑰀𑰁𑰂𑰃𑰄𑰅𑰆𑰇𑰈𑰊𑰋𑰌𑰍𑰎𑰏𑰐𑰑𑰒𑰓𑰔𑰕𑰖𑰗𑰘𑰙𑰚𑰛𑰜𑰝𑰞𑰟𑰠𑰡𑰢𑰣a";

    common_test_text_split (input, expected_chunks, MM_SMS_ENCODING_UCS2);
}
/************************************************************/
int main (int argc, char **argv)
@@ -925,15 +747,5 @@ int main (int argc, char **argv)
g_test_add_func ("/MM/SMS/3GPP/PDU-Creator/GSM-3", test_create_pdu_gsm_3);
g_test_add_func ("/MM/SMS/3GPP/PDU-Creator/GSM-no-validity", test_create_pdu_gsm_no_validity);
g_test_add_func ("/MM/SMS/3GPP/Text-Split/gsm7/short", test_text_split_short_gsm7);
g_test_add_func ("/MM/SMS/3GPP/Text-Split/ucs2/short", test_text_split_short_ucs2);
g_test_add_func ("/MM/SMS/3GPP/Text-Split/utf16/short", test_text_split_short_utf16);
g_test_add_func ("/MM/SMS/3GPP/Text-Split/gsm7/max-single-pdu", test_text_split_max_single_pdu_gsm7);
g_test_add_func ("/MM/SMS/3GPP/Text-Split/ucs2/max-single-pdu", test_text_split_max_single_pdu_ucs2);
g_test_add_func ("/MM/SMS/3GPP/Text-Split/utf16/max-single-pdu", test_text_split_max_single_pdu_utf16);
g_test_add_func ("/MM/SMS/3GPP/Text-Split/gsm7/two-pdu", test_text_split_two_pdu_gsm7);
g_test_add_func ("/MM/SMS/3GPP/Text-Split/ucs2/two-pdu", test_text_split_two_pdu_ucs2);
g_test_add_func ("/MM/SMS/3GPP/Text-Split/utf16/two-pdu", test_text_split_two_pdu_utf16);
return g_test_run ();
}