[Rt-commit] rt branch, 4.0/canonicalize-mime-word-charsets, created. rt-4.0.2-202-g3b5a8a1

Wed Nov 2 18:40:53 EDT 2011

The branch, 4.0/canonicalize-mime-word-charsets has been created
        at  3b5a8a1504601d814519d50ec9368adb5662fa07 (commit)

- Log -----------------------------------------------------------------
commit 4bb6dbb8e688c6fe94f375b517a44f37e2845127
Author: Thomas Sibley <trs at bestpractical.com>
Date:   Wed Nov 2 16:19:05 2011 -0400

    Properly indent the body of DecodeMIMEWordsToEncoding
    
    This commit changes only whitespace.

diff --git a/lib/RT/I18N.pm b/lib/RT/I18N.pm
index db10584..9fd95d1 100644
--- a/lib/RT/I18N.pm
+++ b/lib/RT/I18N.pm
@@ -295,58 +295,58 @@ sub DecodeMIMEWordsToEncoding {
     my @list = $str =~ m/(.*?)=\?([^?]+)\?([QqBb])\?([^?]+)\?=([^=]*)/gcs;
 
     if ( @list ) {
-    # add everything that hasn't matched to the end of the latest
-    # string in array this happen when we have 'key="=?encoded?="; key="plain"'
-    $list[-1] .= substr($str, pos $str);
+        # add everything that hasn't matched to the end of the latest
+        # string in array this happen when we have 'key="=?encoded?="; key="plain"'
+        $list[-1] .= substr($str, pos $str);
 
-    $str = "";
-    while (@list) {
-	my ($prefix, $charset, $encoding, $enc_str, $trailing) =
-            splice @list, 0, 5;
-        $encoding = lc $encoding;
+        $str = "";
+        while (@list) {
+            my ($prefix, $charset, $encoding, $enc_str, $trailing) =
+                    splice @list, 0, 5;
+            $encoding = lc $encoding;
 
-        $trailing =~ s/\s?\t?$//;               # Observed from Outlook Express
+            $trailing =~ s/\s?\t?$//;               # Observed from Outlook Express
 
-	if ( $encoding eq 'q' ) {
-	    use MIME::QuotedPrint;
-	    $enc_str =~ tr/_/ /;		# Observed from Outlook Express
-	    $enc_str = decode_qp($enc_str);
-	} elsif ( $encoding eq 'b' ) {
-	    use MIME::Base64;
-	    $enc_str = decode_base64($enc_str);
-	} else {
-	    $RT::Logger->warning("Incorrect encoding '$encoding' in '$str', "
-            ."only Q(uoted-printable) and B(ase64) are supported");
-	}
+            if ( $encoding eq 'q' ) {
+                use MIME::QuotedPrint;
+                $enc_str =~ tr/_/ /;		# Observed from Outlook Express
+                $enc_str = decode_qp($enc_str);
+            } elsif ( $encoding eq 'b' ) {
+                use MIME::Base64;
+                $enc_str = decode_base64($enc_str);
+            } else {
+                $RT::Logger->warning("Incorrect encoding '$encoding' in '$str', "
+                    ."only Q(uoted-printable) and B(ase64) are supported");
+            }
 
-        # now we have got a decoded subject, try to convert into the encoding
-        unless ( $charset eq $to_charset ) {
-            Encode::from_to( $enc_str, $charset, $to_charset );
-        }
+            # now we have got a decoded subject, try to convert into the encoding
+            unless ( $charset eq $to_charset ) {
+                Encode::from_to( $enc_str, $charset, $to_charset );
+            }
 
-        # XXX TODO: RT doesn't currently do the right thing with mime-encoded headers
-        # We _should_ be preserving them encoded until after parsing is completed and
-        # THEN undo the mime-encoding.
-        #
-        # This routine should be translating the existing mimeencoding to utf8 but leaving
-        # things encoded.
-        #
-        # It's legal for headers to contain mime-encoded commas and semicolons which
-        # should not be treated as address separators. (Encoding == quoting here)
-        #
-        # until this is fixed, we must escape any string containing a comma or semicolon
-        # this is only a bandaid
+            # XXX TODO: RT doesn't currently do the right thing with mime-encoded headers
+            # We _should_ be preserving them encoded until after parsing is completed and
+            # THEN undo the mime-encoding.
+            #
+            # This routine should be translating the existing mimeencoding to utf8 but leaving
+            # things encoded.
+            #
+            # It's legal for headers to contain mime-encoded commas and semicolons which
+            # should not be treated as address separators. (Encoding == quoting here)
+            #
+            # until this is fixed, we must escape any string containing a comma or semicolon
+            # this is only a bandaid
 
-        # Some _other_ MUAs encode quotes _already_, and double quotes
-        # confuse us a lot, so only quote it if it isn't quoted
-        # already.
-        $enc_str = qq{"$enc_str"}
-            if $enc_str =~ /[,;]/
-            and $enc_str !~ /^".*"$/
-            and (!$field || $field =~ /^(?:To$|From$|B?Cc$|Content-)/i);
+            # Some _other_ MUAs encode quotes _already_, and double quotes
+            # confuse us a lot, so only quote it if it isn't quoted
+            # already.
+            $enc_str = qq{"$enc_str"}
+                if $enc_str =~ /[,;]/
+                and $enc_str !~ /^".*"$/
+                and (!$field || $field =~ /^(?:To$|From$|B?Cc$|Content-)/i);
 
-	$str .= $prefix . $enc_str . $trailing;
-    }
+            $str .= $prefix . $enc_str . $trailing;
+        }
     }
 
 # handle filename*=ISO-8859-1''%74%E9%73%74%2E%74%78%74, see also rfc 2231

commit 795c20b3182b6c6c5d698953e18e4a1b7cc6c73c
Author: Thomas Sibley <trs at bestpractical.com>
Date:   Wed Nov 2 18:34:15 2011 -0400

    Test charset canonicalization for MIME words and charset aliases

diff --git a/t/api/canonical_charset.t b/t/api/canonical_charset.t
index 05dfa58..a426d89 100644
--- a/t/api/canonical_charset.t
+++ b/t/api/canonical_charset.t
@@ -1,11 +1,13 @@
 use warnings;
 use strict;
 
-use RT::Test nodata => 1, tests => 7;
+use RT::Test nodata => 1, tests => 11;
 use RT::I18N;
 use Encode;
 
 my %map = (
+    'euc-cn'  => 'gbk',
+    'gb-2312' => 'gbk',
     gb2312  => 'gbk',
     utf8    => 'utf-8',
     'utf-8' => 'utf-8',
diff --git a/t/mail/mime_decoding.t b/t/mail/mime_decoding.t
index e5449ef..b02f979 100644
--- a/t/mail/mime_decoding.t
+++ b/t/mail/mime_decoding.t
@@ -1,7 +1,7 @@
 #!/usr/bin/perl
 use strict;
 use warnings;
-use RT::Test nodb => 1, tests => 7;
+use RT::Test nodb => 1, tests => 8;
 
 use_ok('RT::I18N');
 
@@ -67,3 +67,15 @@ diag q{rfc2231};
     );
 }
 
+diag q{canonicalize mime word encodings like gb2312};
+{
+    my $str = qq{Subject: =?gb2312?B?1NrKwL3nuPe12Lmy09CzrN9eX1NpbXBsaWZpZWRfQ05fR0IyMzEyYQ==?=
+	=?gb2312?B?dHRhY2hlbWVudCB0ZXN0IGluIENOIHNpbXBsaWZpZWQ=?=};
+
+    is(
+        RT::I18N::DecodeMIMEWordsToUTF8($str),
+        qq{Subject: 在世界各地共有超過_Simplified_CN_GB2312attachement test in CN simplified},
+        "right decoding"
+    );
+}
+

commit 118eeb6f3575cc543852cdc49ae8c41db82ba6fa
Author: Thomas Sibley <trs at bestpractical.com>
Date:   Wed Nov 2 16:46:23 2011 -0400

    Canonicalize charsets when decoding MIME words
    
    We special case a few encodings, most notably GB2312, in order to handle
    them correctly more often.  Extending this support to our MIME words
    decoding means we don't fail to decode improperly marked Subjects,
    attachment filenames, names in From or recipient headers, etc.
    
    This bug was discovered because Outlook has a habit of sending email
    with the body and MIME words marked as gb2312 even though the text
    contains extended characters and is actually encoded in gbk or gb18030.
    Modern RT properly decoded the body, but the subject and attachment
    filenames were partially mangled.

diff --git a/lib/RT/I18N.pm b/lib/RT/I18N.pm
index 9fd95d1..f55126f 100644
--- a/lib/RT/I18N.pm
+++ b/lib/RT/I18N.pm
@@ -289,7 +289,7 @@ sub DecodeMIMEWordsToUTF8 {
 
 sub DecodeMIMEWordsToEncoding {
     my $str = shift;
-    my $to_charset = shift;
+    my $to_charset = _CanonicalizeCharset(shift);
     my $field = shift || '';
 
     my @list = $str =~ m/(.*?)=\?([^?]+)\?([QqBb])\?([^?]+)\?=([^=]*)/gcs;
@@ -303,6 +303,7 @@ sub DecodeMIMEWordsToEncoding {
         while (@list) {
             my ($prefix, $charset, $encoding, $enc_str, $trailing) =
                     splice @list, 0, 5;
+            $charset  = _CanonicalizeCharset($charset);
             $encoding = lc $encoding;
 
             $trailing =~ s/\s?\t?$//;               # Observed from Outlook Express
@@ -357,6 +358,7 @@ sub DecodeMIMEWordsToEncoding {
             my ( $prefix, $charset, $language, $enc_str, $trailing ) =
               splice @list, 0, 5;
             $prefix =~ s/\*=$/=/; # remove the *
+            $charset = _CanonicalizeCharset($charset);
             $enc_str =~ s/%(\w{2})/chr hex $1/eg;
             unless ( $charset eq $to_charset ) {
                 Encode::from_to( $enc_str, $charset, $to_charset );

commit 3b5a8a1504601d814519d50ec9368adb5662fa07
Author: Thomas Sibley <trs at bestpractical.com>
Date:   Wed Nov 2 17:06:31 2011 -0400

    Resolve aliases when canonicalizing encoding names
    
    Resolving the aliases means we'll correctly decode more problematic
    character sets.  For example, now we substitute gbk for gb2312 as well
    as for euc-cn, gb2312a, gb-2312, etc.

diff --git a/lib/RT/I18N.pm b/lib/RT/I18N.pm
index f55126f..3d2eb73 100644
--- a/lib/RT/I18N.pm
+++ b/lib/RT/I18N.pm
@@ -502,12 +502,19 @@ sub _CanonicalizeCharset {
     my $charset = lc shift;
     return $charset unless $charset;
 
+    # Canonicalize aliases if they're known
+    if (my $canonical = Encode::resolve_alias($charset)) {
+        $charset = $canonical;
+    }
+
     if ( $charset eq 'utf8' || $charset eq 'utf-8-strict' ) {
         return 'utf-8';
     }
-    elsif ( $charset eq 'gb2312' ) {
-        # gbk is superset of gb2312 so it's safe
+    elsif ( $charset eq 'euc-cn' ) {
+        # gbk is superset of gb2312/euc-cn so it's safe
         return 'gbk';
+        # XXX TODO: gb18030 is an even larger, more permissive superset of gbk,
+        # but needs Encode::HanExtra installed
     }
     else {
         return $charset;

-----------------------------------------------------------------------