[Rt-commit] rt branch, 4.0/encoded-word-whitespace, created. rt-4.0.5-136-g4843514

Mon Apr 30 21:03:47 EDT 2012

The branch, 4.0/encoded-word-whitespace has been created
        at  484351407de29b9417c4646f362ee10ab1138487 (commit)

- Log -----------------------------------------------------------------
commit 5a5f0af5299f5b17f74889751801eb1e88a259a5
Author: Alex Vandiver <alexmv at bestpractical.com>
Date:   Mon Apr 30 20:52:36 2012 -0400

    Remove all whitespace separating adjacent encoded-word chunks
    
    RFC 2047 says:
    
       When displaying a particular header field that contains multiple
       'encoded-word's, any 'linear-white-space' that separates a pair of
       adjacent 'encoded-word's is ignored.
    
    Implement this by explicitly removing all such whitespace.

diff --git a/lib/RT/I18N.pm b/lib/RT/I18N.pm
index cadf7cc..a277e56 100644
--- a/lib/RT/I18N.pm
+++ b/lib/RT/I18N.pm
@@ -293,18 +293,24 @@ sub DecodeMIMEWordsToEncoding {
         $str = MIME::Field::ParamVal->parse($str)->stringify;
     }
 
+    # Pre-parse by removing all whitespace between encoded words
+    my $encoded_word = qr/
+                 =\?            # =?
+                 ([^?]+?)       # charset
+                 (?:\*[^?]+)?   # optional '*language'
+                 \?             # ?
+                 ([QqBb])       # encoding
+                 \?             # ?
+                 ([^?]+)        # encoded string
+                 \?=            # ?=
+                 /x;
+    1 while $str =~ s/($encoded_word)\s+($encoded_word)/$1$5/;
+
     # XXX TODO: use decode('MIME-Header', ...) and Encode::Alias to replace our
     # custom MIME word decoding and charset canonicalization.  We can't do this
     # until we parse before decode, instead of the other way around.
     my @list = $str =~ m/(.*?)          # prefix
-                         =\?            # =?
-                         ([^?]+?)       # charset
-                         (?:\*[^?]+)?   # optional '*language'
-                         \?             # ?
-                         ([QqBb])       # encoding
-                         \?             # ?
-                         ([^?]+)        # encoded string
-                         \?=            # ?=
+                         $encoded_word
                          ([^=]*)        # trailing
                         /xgcs;
 
diff --git a/t/mail/mime_decoding.t b/t/mail/mime_decoding.t
index 7515e2c..7fc1c93 100644
--- a/t/mail/mime_decoding.t
+++ b/t/mail/mime_decoding.t
@@ -1,7 +1,7 @@
 #!/usr/bin/perl
 use strict;
 use warnings;
-use RT::Test nodb => 1, tests => 9;
+use RT::Test nodb => 1, tests => 11;
 
 use_ok('RT::I18N');
 
@@ -97,3 +97,20 @@ diag q{canonicalize mime word encodings like gb2312};
     );
 }
 
+
+diag q{Whitespace between encoded words should be removed};
+{
+    my $str = "=?utf-8?Q?=E3=82=AD?=    =?utf-8?Q?=E3=83=A3?=";
+    is(
+        RT::I18N::DecodeMIMEWordsToUTF8($str),
+        "ã‚ãƒ£",
+        "whitespace between encoded words is removed",
+    );
+
+    $str = "=?utf-8?Q?=E3=82=AD?=  \n   =?utf-8?Q?=E3=83=A3?=";
+    is(
+        RT::I18N::DecodeMIMEWordsToUTF8($str),
+        "ã‚ãƒ£",
+        "newlines between encoded words also removed",
+    );
+}

commit 484351407de29b9417c4646f362ee10ab1138487
Author: Alex Vandiver <alexmv at bestpractical.com>
Date:   Mon Apr 30 20:54:02 2012 -0400

    Join all adjacent QP encoded-word chunks, in case octets were split
    
    Though not allowed by RFC 2047, some mail clients split multiple octets
    of a multibyte encoding across different encoded-word chunks.
    Reassemble adjacent encoded-word chunks into one, as long as they match
    in encoding, language, and transfer encoding.  Only quoted-printable
    chunks are joined, as joining base64 hunks is much more complex.

diff --git a/lib/RT/I18N.pm b/lib/RT/I18N.pm
index a277e56..1b0bf6c 100644
--- a/lib/RT/I18N.pm
+++ b/lib/RT/I18N.pm
@@ -306,6 +306,12 @@ sub DecodeMIMEWordsToEncoding {
                  /x;
     1 while $str =~ s/($encoded_word)\s+($encoded_word)/$1$5/;
 
+    # Also merge quoted-printable sections together, in case multiple
+    # octets of a single encoded character were split between chunks.
+    # Though not valid according to RFC 2047, this has been seen in the
+    # wild.
+    1 while $str =~ s/(=\?[^?]+\?[Qq]\?)([^?]+)\?=\1([^?]+)\?=/$1$2$3?=/i;
+
     # XXX TODO: use decode('MIME-Header', ...) and Encode::Alias to replace our
     # custom MIME word decoding and charset canonicalization.  We can't do this
     # until we parse before decode, instead of the other way around.
diff --git a/t/mail/mime_decoding.t b/t/mail/mime_decoding.t
index 7fc1c93..7bd2c86 100644
--- a/t/mail/mime_decoding.t
+++ b/t/mail/mime_decoding.t
@@ -1,7 +1,7 @@
 #!/usr/bin/perl
 use strict;
 use warnings;
-use RT::Test nodb => 1, tests => 11;
+use RT::Test nodb => 1, tests => 13;
 
 use_ok('RT::I18N');
 
@@ -114,3 +114,28 @@ diag q{Whitespace between encoded words should be removed};
         "newlines between encoded words also removed",
     );
 }
+
+diag q{Multiple octets split across QP hunks are correctly reassembled};
+{
+    # This passes even without explicit code to handle it because utf8
+    # is perl's internal string encoding.
+    my $str = "=?utf-8?Q?=E3?=    =?utf-8?Q?=82?=    =?utf-8?Q?=AD?=";
+    is(
+        RT::I18N::DecodeMIMEWordsToUTF8($str),
+        "ã‚",
+        "UTF8 character split in three is successfully reassembled",
+    );
+
+    # Non-utf8 encodings thus also must be checked
+    $str = <<EOT; chomp $str;
+=?gb2312?q?Chinese(gb2312)=20=20=C3=C0=B9=FA=C7=B0=CB=BE=B7=A8=B2=BF=B3?=
+ =?gb2312?q?=A4=C3=E6=BC=FB=C8=F8=B4=EF=C4=B7=BA=F3=B3=C6=C6=E4=D7=B4=CC=AC?=
+ =?gb2312?q?=BA=DC=BA=C3=20=20Chinese=20(gb2312)?=
+EOT
+    is(
+        RT::I18N::DecodeMIMEWordsToUTF8($str),
+        "Chinese(gb2312)  ç¾Žå›½å‰å¸æ³•éƒ¨é•¿é¢è§è¨è¾¾å§†åŽç§°å…¶çŠ¶æ€å¾ˆå¥½  Chinese (gb2312)",
+        "gb2312 character is successfully reassembled",
+    );
+
+}

-----------------------------------------------------------------------