sms-part: new util to split input text string into chunks to fit in PDUs

2012-09-07 17:00:03 +02:00
parent 6b575cece0
commit e7b094ea3c
3 changed files with 247 additions and 0 deletions
--- a/src/mm-sms-part.c
+++ b/src/mm-sms-part.c
@@ -855,3 +855,110 @@ error:
    g_free (pdu);
    return NULL;
 }
+
+/* Number of septets @c takes in the GSM 7-bit alphabet. Characters of the basic
+ * extension table (3GPP TS 23.038) are sent as an escape septet plus a code
+ * septet; every other character is counted as a single septet. */
+static guint
+split_text_gsm7_septets (gunichar c)
+{
+    switch (c) {
+    case 0x000C: case 0x20AC: /* form feed, euro sign */
+    case '^': case '{': case '}': case '\\': case '[': case '~': case ']': case '|':
+        return 2;
+    default:
+        return 1;
+    }
+}
+
+/* Splits the UTF-8 string @text into a newly allocated NULL-terminated array of
+ * UTF-8 chunks, each of which fits in the User Data of one SMS part, and stores
+ * in @encoding the encoding the chunks were sized for. Returns NULL if @text
+ * is NULL or if its conversion to UCS-2 fails.
+ *
+ * The User Data can be up to 140 bytes in the SMS part:
+ *  0) A single chunk may use all 140 bytes. With more than one chunk, each is
+ *     limited to 140 - 6 = 134 bytes, as we need place for the UDH header.
+ *  1) With GSM7 encoding that is 160 septets (160 * 7 = 140 * 8 = 1120), or
+ *     153 septets per chunk (134 * 8 = 1072; 1072 / 7 = 153.14).
+ *  2) With UCS2 encoding (2 bytes per character) that is 70 or 67 characters.
+ */
+gchar **
+mm_sms_part_util_split_text (const gchar *text,
+                             MMSmsEncoding *encoding)
+{
+    guint gsm_unsupported = 0;
+    guint total_septets = 0;
+    guint chunk_septets = 0;
+    guint limit;
+    GPtrArray *chunks;
+    const gchar *chunk_start;
+    const gchar *p;
+
+    if (!text)
+        return NULL;
+
+    /* Check if we can do GSM encoding; if we cannot, do it in UCS-2 */
+    mm_charset_get_encoded_len (text, MM_MODEM_CHARSET_GSM, &gsm_unsupported);
+    if (gsm_unsupported > 0) {
+        GByteArray *array;
+        gchar **out;
+        guint n_chunks;
+        guint i;
+        guint j;
+
+        *encoding = MM_SMS_ENCODING_UCS2;
+
+        /* Guess the size of the output array to avoid multiple allocations */
+        array = g_byte_array_sized_new (strlen (text) * 2);
+        if (!mm_modem_charset_byte_array_append (array, text, FALSE,
+                                                 MM_MODEM_CHARSET_UCS2)) {
+            g_byte_array_unref (array);
+            return NULL;
+        }
+
+        /* UCS-2 is a fixed-size encoding with 2 bytes per unicode point, so the
+         * array can be cut every 134 bytes (67 characters) no matter how many
+         * bytes each character took in the UTF-8 input. */
+        if (array->len <= 140) {
+            out = g_new (gchar *, 2);
+            out[0] = g_strdup (text);
+            out[1] = NULL;
+        } else {
+            n_chunks = array->len / 134;
+            if (array->len % 134 != 0)
+                n_chunks++;
+
+            out = g_new0 (gchar *, n_chunks + 1);
+            for (i = 0, j = 0; i < n_chunks; i++, j += 134) {
+                out[i] = sms_decode_text (&array->data[j], MIN (array->len - j, 134),
+                                          MM_SMS_ENCODING_UCS2, 0);
+            }
+        }
+        g_byte_array_unref (array);
+        return out;
+    }
+
+    /* Do it with GSM encoding. The limits are in septets, so they are checked
+     * per character rather than per UTF-8 byte: a GSM7 character may take
+     * several bytes in UTF-8, and a chunk must never end inside one. */
+    *encoding = MM_SMS_ENCODING_GSM7;
+    for (p = text; *p; p = g_utf8_find_next_char (p, NULL))
+        total_septets += split_text_gsm7_septets (g_utf8_get_char (p));
+    limit = (total_septets <= 160) ? 160 : 153;
+
+    chunks = g_ptr_array_new ();
+    for (p = chunk_start = text; *p; p = g_utf8_find_next_char (p, NULL)) {
+        guint cost = split_text_gsm7_septets (g_utf8_get_char (p));
+
+        if (chunk_septets + cost > limit) {
+            g_ptr_array_add (chunks, g_strndup (chunk_start, p - chunk_start));
+            chunk_start = p;
+            chunk_septets = 0;
+        }
+        chunk_septets += cost;
+    }
+    g_ptr_array_add (chunks, g_strndup (chunk_start, p - chunk_start));
+    g_ptr_array_add (chunks, NULL);
+    return (gchar **) g_ptr_array_free (chunks, FALSE);
+}
--- a/src/mm-sms-part.h
+++ b/src/mm-sms-part.h
@@ -115,4 +115,7 @@ guint mm_sms_part_encode_address (const gchar *address,
                                  gsize buflen,
                                  gboolean is_smsc);

+gchar **mm_sms_part_util_split_text (const gchar *text,
+                                     MMSmsEncoding *encoding);
+
 #endif /* MM_SMS_PART_H */
--- a/src/tests/test-sms-part.c
+++ b/src/tests/test-sms-part.c
@@ -669,6 +669,136 @@ test_create_pdu_gsm_no_validity (void)
                            1); /* expected_msgstart */
 }

+/********************* TEXT SPLIT TESTS *********************/
+
+/* Splits @text and checks the chunks and the encoding against the expected ones */
+static void
+common_test_text_split (const gchar *text,
+                        const gchar **expected,
+                        MMSmsEncoding expected_encoding)
+{
+    gchar **out;
+    MMSmsEncoding out_encoding = MM_SMS_ENCODING_UNKNOWN;
+    guint i;
+
+    out = mm_sms_part_util_split_text (text, &out_encoding);
+    g_assert (out != NULL);
+    /* Compare with the expected value: a wrong choice must not go unnoticed */
+    g_assert_cmpuint (out_encoding, ==, expected_encoding);
+    g_assert_cmpuint (g_strv_length (out), ==, g_strv_length ((gchar **)expected));
+
+    for (i = 0; out[i]; i++)
+        g_assert_cmpstr (out[i], ==, expected[i]);
+    g_strfreev (out);
+}
+
+/* A short plain text is expected back as one single chunk, unchanged */
+static void
+test_text_split_short (void)
+{
+    const gchar *input = "Hello";
+    const gchar *single_chunk [2];
+
+    single_chunk[0] = input;
+    single_chunk[1] = NULL;
+    common_test_text_split (input, single_chunk, MM_SMS_ENCODING_GSM7);
+}
+
+/* Two Chinese characters are expected back as one single chunk, unchanged */
+static void
+test_text_split_short_ucs2 (void)
+{
+    const gchar *input = "你好";
+    const gchar *single_chunk [2];
+
+    single_chunk[0] = input;
+    single_chunk[1] = NULL;
+    common_test_text_split (input, single_chunk, MM_SMS_ENCODING_UCS2);
+}
+
+/* A text of 160 plain characters is expected back as one single chunk,
+ * unchanged: MM_SMS_ENCODING_GSM7 is handed to the common helper as the
+ * encoding the split should settle on. */
+static void
+test_text_split_max_single_pdu (void)
+{
+    const gchar *input =
+        "0123456789012345678901234567890123456789"
+        "0123456789012345678901234567890123456789"
+        "0123456789012345678901234567890123456789"
+        "0123456789012345678901234567890123456789";
+    const gchar *single_chunk [2];
+
+    single_chunk[0] = input;
+    single_chunk[1] = NULL;
+
+    common_test_text_split (input, single_chunk, MM_SMS_ENCODING_GSM7);
+}
+
+/* NOTE: This Chinese string takes 210 bytes when encoded in UTF-8, but its
+ * 70 characters still fit into 140 bytes when in UCS-2, so it is expected
+ * back as one single chunk, unchanged. MM_SMS_ENCODING_UCS2 is handed to
+ * the common helper as the encoding the split should settle on.
+ */
+static void
+test_text_split_max_single_pdu_ucs2 (void)
+{
+    const gchar *input =
+        "你好你好你好你好你好你好你好你好你好你好你好你好你好你好你好你好"
+        "你好你好你好你好你好你好你好你好你好你好你好你好你好你好你好你好"
+        "你好你好你好";
+    const gchar *single_chunk [2];
+
+    single_chunk[0] = input;
+    single_chunk[1] = NULL;
+
+    common_test_text_split (input, single_chunk, MM_SMS_ENCODING_UCS2);
+}
+
+/* 161 plain characters do not fit in one part: the split is expected to give
+ * a first chunk of 153 characters and a second one with the remaining 8 */
+static void
+test_text_split_two_pdu (void)
+{
+    const gchar *input =
+        "0123456789012345678901234567890123456789"
+        "0123456789012345678901234567890123456789"
+        "0123456789012345678901234567890123456789"
+        "01234567890123456789012345678901234567890";
+    const gchar *chunks [3];
+
+    chunks[0] =
+        "0123456789012345678901234567890123456789"
+        "0123456789012345678901234567890123456789"
+        "0123456789012345678901234567890123456789"
+        "012345678901234567890123456789012";
+    chunks[1] = "34567890";
+    chunks[2] = NULL;
+    common_test_text_split (input, chunks, MM_SMS_ENCODING_GSM7);
+}
+
+/* 71 Chinese characters need more than the 140 bytes of one part in UCS-2:
+ * the split is expected to give chunks of 67 and 4 characters */
+static void
+test_text_split_two_pdu_ucs2 (void)
+{
+    const gchar *input =
+        "你好你好你好你好你好你好你好你好你好你好你好你好你好你好你好你好"
+        "你好你好你好你好你好你好你好你好你好你好你好你好你好你好你好你好"
+        "你好你好你好好";
+    const gchar *chunks [3];
+
+    chunks[0] =
+        "你好你好你好你好你好你好你好你好你好你好你好你好你好你好你好你好"
+        "你好你好你好你好你好你好你好你好你好你好你好你好你好你好你好你好"
+        "你好你";
+    chunks[1] = "好你好好";
+    chunks[2] = NULL;
+    common_test_text_split (input, chunks, MM_SMS_ENCODING_UCS2);
+}
+
+/************************************************************/
+
 void
 _mm_log (const char *loc,
         const char *func,
@@ -709,5 +839,12 @@ int main (int argc, char **argv)
    g_test_add_func ("/MM/SMS/PDU-Creator/GSM-3", test_create_pdu_gsm_3);
    g_test_add_func ("/MM/SMS/PDU-Creator/GSM-no-validity", test_create_pdu_gsm_no_validity);

+    g_test_add_func ("/MM/SMS/Text-Split/short", test_text_split_short);
+    g_test_add_func ("/MM/SMS/Text-Split/short-UCS2", test_text_split_short_ucs2);
+    g_test_add_func ("/MM/SMS/Text-Split/max-single-pdu", test_text_split_max_single_pdu);
+    g_test_add_func ("/MM/SMS/Text-Split/max-single-pdu-UCS2", test_text_split_max_single_pdu_ucs2);
+    g_test_add_func ("/MM/SMS/Text-Split/two-pdu", test_text_split_two_pdu);
+    g_test_add_func ("/MM/SMS/Text-Split/two-pdu-UCS2", test_text_split_two_pdu_ucs2);
+
    return g_test_run ();
 }