[Rt-commit] rt branch, 4.0/canonicalize-mime-word-charsets, created. rt-4.0.2-202-g3b5a8a1
Thomas Sibley
trs at bestpractical.com
Wed Nov 2 18:40:53 EDT 2011
The branch, 4.0/canonicalize-mime-word-charsets has been created
at 3b5a8a1504601d814519d50ec9368adb5662fa07 (commit)
- Log -----------------------------------------------------------------
commit 4bb6dbb8e688c6fe94f375b517a44f37e2845127
Author: Thomas Sibley <trs at bestpractical.com>
Date: Wed Nov 2 16:19:05 2011 -0400
Properly indent the body of DecodeMIMEWordsToEncoding
This commit changes only whitespace.
diff --git a/lib/RT/I18N.pm b/lib/RT/I18N.pm
index db10584..9fd95d1 100644
--- a/lib/RT/I18N.pm
+++ b/lib/RT/I18N.pm
@@ -295,58 +295,58 @@ sub DecodeMIMEWordsToEncoding {
my @list = $str =~ m/(.*?)=\?([^?]+)\?([QqBb])\?([^?]+)\?=([^=]*)/gcs;
if ( @list ) {
- # add everything that hasn't matched to the end of the latest
- # string in array this happen when we have 'key="=?encoded?="; key="plain"'
- $list[-1] .= substr($str, pos $str);
+ # add everything that hasn't matched to the end of the latest
+ # string in array this happen when we have 'key="=?encoded?="; key="plain"'
+ $list[-1] .= substr($str, pos $str);
- $str = "";
- while (@list) {
- my ($prefix, $charset, $encoding, $enc_str, $trailing) =
- splice @list, 0, 5;
- $encoding = lc $encoding;
+ $str = "";
+ while (@list) {
+ my ($prefix, $charset, $encoding, $enc_str, $trailing) =
+ splice @list, 0, 5;
+ $encoding = lc $encoding;
- $trailing =~ s/\s?\t?$//; # Observed from Outlook Express
+ $trailing =~ s/\s?\t?$//; # Observed from Outlook Express
- if ( $encoding eq 'q' ) {
- use MIME::QuotedPrint;
- $enc_str =~ tr/_/ /; # Observed from Outlook Express
- $enc_str = decode_qp($enc_str);
- } elsif ( $encoding eq 'b' ) {
- use MIME::Base64;
- $enc_str = decode_base64($enc_str);
- } else {
- $RT::Logger->warning("Incorrect encoding '$encoding' in '$str', "
- ."only Q(uoted-printable) and B(ase64) are supported");
- }
+ if ( $encoding eq 'q' ) {
+ use MIME::QuotedPrint;
+ $enc_str =~ tr/_/ /; # Observed from Outlook Express
+ $enc_str = decode_qp($enc_str);
+ } elsif ( $encoding eq 'b' ) {
+ use MIME::Base64;
+ $enc_str = decode_base64($enc_str);
+ } else {
+ $RT::Logger->warning("Incorrect encoding '$encoding' in '$str', "
+ ."only Q(uoted-printable) and B(ase64) are supported");
+ }
- # now we have got a decoded subject, try to convert into the encoding
- unless ( $charset eq $to_charset ) {
- Encode::from_to( $enc_str, $charset, $to_charset );
- }
+ # now we have got a decoded subject, try to convert into the encoding
+ unless ( $charset eq $to_charset ) {
+ Encode::from_to( $enc_str, $charset, $to_charset );
+ }
- # XXX TODO: RT doesn't currently do the right thing with mime-encoded headers
- # We _should_ be preserving them encoded until after parsing is completed and
- # THEN undo the mime-encoding.
- #
- # This routine should be translating the existing mimeencoding to utf8 but leaving
- # things encoded.
- #
- # It's legal for headers to contain mime-encoded commas and semicolons which
- # should not be treated as address separators. (Encoding == quoting here)
- #
- # until this is fixed, we must escape any string containing a comma or semicolon
- # this is only a bandaid
+ # XXX TODO: RT doesn't currently do the right thing with mime-encoded headers
+ # We _should_ be preserving them encoded until after parsing is completed and
+ # THEN undo the mime-encoding.
+ #
+ # This routine should be translating the existing mimeencoding to utf8 but leaving
+ # things encoded.
+ #
+ # It's legal for headers to contain mime-encoded commas and semicolons which
+ # should not be treated as address separators. (Encoding == quoting here)
+ #
+ # until this is fixed, we must escape any string containing a comma or semicolon
+ # this is only a bandaid
- # Some _other_ MUAs encode quotes _already_, and double quotes
- # confuse us a lot, so only quote it if it isn't quoted
- # already.
- $enc_str = qq{"$enc_str"}
- if $enc_str =~ /[,;]/
- and $enc_str !~ /^".*"$/
- and (!$field || $field =~ /^(?:To$|From$|B?Cc$|Content-)/i);
+ # Some _other_ MUAs encode quotes _already_, and double quotes
+ # confuse us a lot, so only quote it if it isn't quoted
+ # already.
+ $enc_str = qq{"$enc_str"}
+ if $enc_str =~ /[,;]/
+ and $enc_str !~ /^".*"$/
+ and (!$field || $field =~ /^(?:To$|From$|B?Cc$|Content-)/i);
- $str .= $prefix . $enc_str . $trailing;
- }
+ $str .= $prefix . $enc_str . $trailing;
+ }
}
# handle filename*=ISO-8859-1''%74%E9%73%74%2E%74%78%74, see also rfc 2231
commit 795c20b3182b6c6c5d698953e18e4a1b7cc6c73c
Author: Thomas Sibley <trs at bestpractical.com>
Date: Wed Nov 2 18:34:15 2011 -0400
Test charset canonicalization for MIME words and charset aliases
diff --git a/t/api/canonical_charset.t b/t/api/canonical_charset.t
index 05dfa58..a426d89 100644
--- a/t/api/canonical_charset.t
+++ b/t/api/canonical_charset.t
@@ -1,11 +1,13 @@
use warnings;
use strict;
-use RT::Test nodata => 1, tests => 7;
+use RT::Test nodata => 1, tests => 11;
use RT::I18N;
use Encode;
my %map = (
+ 'euc-cn' => 'gbk',
+ 'gb-2312' => 'gbk',
gb2312 => 'gbk',
utf8 => 'utf-8',
'utf-8' => 'utf-8',
diff --git a/t/mail/mime_decoding.t b/t/mail/mime_decoding.t
index e5449ef..b02f979 100644
--- a/t/mail/mime_decoding.t
+++ b/t/mail/mime_decoding.t
@@ -1,7 +1,7 @@
#!/usr/bin/perl
use strict;
use warnings;
-use RT::Test nodb => 1, tests => 7;
+use RT::Test nodb => 1, tests => 8;
use_ok('RT::I18N');
@@ -67,3 +67,15 @@ diag q{rfc2231};
);
}
+diag q{canonicalize mime word encodings like gb2312};
+{
+ my $str = qq{Subject: =?gb2312?B?1NrKwL3nuPe12Lmy09CzrN9eX1NpbXBsaWZpZWRfQ05fR0IyMzEyYQ==?=
+ =?gb2312?B?dHRhY2hlbWVudCB0ZXN0IGluIENOIHNpbXBsaWZpZWQ=?=};
+
+ is(
+ RT::I18N::DecodeMIMEWordsToUTF8($str),
+ qq{Subject: 在世界各地共有超過_Simplified_CN_GB2312attachement test in CN simplified},
+ "right decoding"
+ );
+}
+
commit 118eeb6f3575cc543852cdc49ae8c41db82ba6fa
Author: Thomas Sibley <trs at bestpractical.com>
Date: Wed Nov 2 16:46:23 2011 -0400
Canonicalize charsets when decoding MIME words
We special case a few encodings, most notably GB2312, in order to handle
them correctly more often. Extending this support to our MIME words
decoding means we don't fail to decode improperly marked Subjects,
attachment filenames, names in From or recipient headers, etc.
This bug was discovered because Outlook has a habit of sending email
with the body and MIME words marked as gb2312 even though the text
contains extended characters and is actually encoded in gbk or gb18030.
Modern RT properly decoded the body, but the subject and attachment
filenames were partially mangled.
diff --git a/lib/RT/I18N.pm b/lib/RT/I18N.pm
index 9fd95d1..f55126f 100644
--- a/lib/RT/I18N.pm
+++ b/lib/RT/I18N.pm
@@ -289,7 +289,7 @@ sub DecodeMIMEWordsToUTF8 {
sub DecodeMIMEWordsToEncoding {
my $str = shift;
- my $to_charset = shift;
+ my $to_charset = _CanonicalizeCharset(shift);
my $field = shift || '';
my @list = $str =~ m/(.*?)=\?([^?]+)\?([QqBb])\?([^?]+)\?=([^=]*)/gcs;
@@ -303,6 +303,7 @@ sub DecodeMIMEWordsToEncoding {
while (@list) {
my ($prefix, $charset, $encoding, $enc_str, $trailing) =
splice @list, 0, 5;
+ $charset = _CanonicalizeCharset($charset);
$encoding = lc $encoding;
$trailing =~ s/\s?\t?$//; # Observed from Outlook Express
@@ -357,6 +358,7 @@ sub DecodeMIMEWordsToEncoding {
my ( $prefix, $charset, $language, $enc_str, $trailing ) =
splice @list, 0, 5;
$prefix =~ s/\*=$/=/; # remove the *
+ $charset = _CanonicalizeCharset($charset);
$enc_str =~ s/%(\w{2})/chr hex $1/eg;
unless ( $charset eq $to_charset ) {
Encode::from_to( $enc_str, $charset, $to_charset );
commit 3b5a8a1504601d814519d50ec9368adb5662fa07
Author: Thomas Sibley <trs at bestpractical.com>
Date: Wed Nov 2 17:06:31 2011 -0400
Resolve aliases when canonicalizing encoding names
Resolving the aliases means we'll correctly decode more problematic
character sets. For example, now we substitute gbk for gb2312 as well
as for euc-cn, gb2312a, gb-2312, etc.
diff --git a/lib/RT/I18N.pm b/lib/RT/I18N.pm
index f55126f..3d2eb73 100644
--- a/lib/RT/I18N.pm
+++ b/lib/RT/I18N.pm
@@ -502,12 +502,19 @@ sub _CanonicalizeCharset {
my $charset = lc shift;
return $charset unless $charset;
+ # Canonicalize aliases if they're known
+ if (my $canonical = Encode::resolve_alias($charset)) {
+ $charset = $canonical;
+ }
+
if ( $charset eq 'utf8' || $charset eq 'utf-8-strict' ) {
return 'utf-8';
}
- elsif ( $charset eq 'gb2312' ) {
- # gbk is superset of gb2312 so it's safe
+ elsif ( $charset eq 'euc-cn' ) {
+ # gbk is superset of gb2312/euc-cn so it's safe
return 'gbk';
+ # XXX TODO: gb18030 is an even larger, more permissive superset of gbk,
+ # but needs Encode::HanExtra installed
}
else {
return $charset;
-----------------------------------------------------------------------
More information about the Rt-commit
mailing list