[Rt-commit] rt branch, 4.4/concatenate-strings-before-encoding-conversion, created. rt-4.4.4-7-g55b70e496

Thu Mar 14 14:51:00 EDT 2019

The branch, 4.4/concatenate-strings-before-encoding-conversion has been created
        at  55b70e4963c4f22c0ca512930e5fe3a81b74114b (commit)

- Log -----------------------------------------------------------------
commit d5113497de3c43a301d724375da451cdcc497c65
Author: sunnavy <sunnavy at bestpractical.com>
Date:   Thu Mar 14 23:03:30 2019 +0800

    Concatenate strings before encoding conversion to avoid character breakage
    
    When a field value is composed of a list of encoded strings like
    "=?UTF-8?B?...?=", converting each part one by one might break the
    string. Here is an example:
    
        Subject: =?UTF-8?B?5L2g5Q==?==?UTF-8?B?pb0=?=
    
    The subject consists of 2 Base64+UTF-8 encoded parts, whose
    base64-decoded content is "e4bda0e5" and "a5bd" in bytes, respectively.
    If we convert them separately, "e4bda0" is the Chinese character "你",
    so that sub-part is fine, while the trailing "e5" is incomplete: it
    needs "a5bd" from the following part to form another Chinese
    character, "好". As a result, the string is broken, i.e.
    
    If we convert each part separately, we get bytes (rendered as "你���"):
    
        e4bd a0ef bfbd efbf bdef bfbd
    
    If we convert them together, we get correct bytes (rendered as "你好"):
    
        e4bd a0e5 a5bd
    
    This commit fixes this issue by concatenating successive parts before
    conversion.

diff --git a/lib/RT/I18N.pm b/lib/RT/I18N.pm
index 87645a7fc..dbc49b950 100644
--- a/lib/RT/I18N.pm
+++ b/lib/RT/I18N.pm
@@ -437,7 +437,7 @@ sub _DecodeMIMEWordsToEncoding {
     # string in array this happen when we have 'key="=?encoded?="; key="plain"'
     $list[-1] .= substr($str, pos $str);
 
-    $str = '';
+    my @parts;
     while (@list) {
         my ($prefix, $charset, $encoding, $enc_str, $trailing) =
                 splice @list, 0, 5;
@@ -456,18 +456,46 @@ sub _DecodeMIMEWordsToEncoding {
                 ."only Q(uoted-printable) and B(ase64) are supported");
         }
 
-        # now we have got a decoded subject, try to convert into the encoding
-        if ( $charset ne $to_charset || $charset =~ /^utf-?8(?:-strict)?$/i ) {
-            if ( Encode::find_encoding($charset) ) {
-                Encode::from_to( $enc_str, $charset, $to_charset );
-            } else {
-                $RT::Logger->warning("Charset '$charset' is not supported");
-                $enc_str =~ s/[^[:print:]]/\357\277\275/g;
-                Encode::from_to( $enc_str, 'UTF-8', $to_charset )
-                    unless $to_charset eq 'utf-8';
+        push @parts, grep { defined && length } $prefix, { charset => $charset, value => $enc_str }, $trailing;
+    }
+
+    # Concatenate strings first in case a wide character is split into
+    # succssive parts like "=?UTF-8?B?5L2g5Q==?=" and "=?UTF-8?B?pb0=?="
+    my @merged;
+    for my $part (@parts) {
+        if (   $merged[-1]
+            && ref $merged[-1] eq 'HASH'
+            && ref $part eq 'HASH'
+            && $merged[-1]->{charset} eq $part->{charset} )
+        {
+            $merged[-1]{value} .= $part->{value};
+        }
+        else {
+            push @merged, $part;
+        }
+    }
+
+    $str = '';
+    for my $part (@merged) {
+        if ( ref $part eq 'HASH' ) {
+            my $charset = $part->{charset};
+            my $enc_str = $part->{value};
+            if ( $charset ne $to_charset || $charset =~ /^utf-?8(?:-strict)?$/i ) {
+                if ( Encode::find_encoding($charset) ) {
+                    Encode::from_to( $enc_str, $charset, $to_charset );
+                }
+                else {
+                    $RT::Logger->warning("Charset '$charset' is not supported");
+                    $enc_str =~ s/[^[:print:]]/\357\277\275/g;
+                    Encode::from_to( $enc_str, 'UTF-8', $to_charset )
+                        unless $to_charset eq 'utf-8';
+                }
             }
+            $str .= $enc_str;
+        }
+        else {
+            $str .= $part;
         }
-        $str .= $prefix . $enc_str . $trailing;
     }
 
     return ($str)

commit 55b70e4963c4f22c0ca512930e5fe3a81b74114b
Author: sunnavy <sunnavy at bestpractical.com>
Date:   Fri Mar 15 01:54:46 2019 +0800

    Add tests for wide characters split into successive encoded parts

diff --git a/t/mail/mime_decoding.t b/t/mail/mime_decoding.t
index 97590712b..8b7177eb0 100644
--- a/t/mail/mime_decoding.t
+++ b/t/mail/mime_decoding.t
@@ -246,4 +246,14 @@ diag "Alternating encoded-words and not, space is preserved";
     );
 }
 
+diag "A wide character split into 2 successive encoded words";
+{
+    my $str = q{=?UTF-8?B?5L2g5Q==?==?UTF-8?B?pb0=?=};
+    is_string(
+        RT::I18N::DecodeMIMEWordsToUTF8($str, "Subject"),
+        q{你好},
+        "A wide character split into 2 encoded words is parsed correctly"
+    );
+}
+
 done_testing;

-----------------------------------------------------------------------