[Rt-commit] rt branch, 4.0/encoded-word-whitespace, created. rt-4.0.5-136-g4843514
Alex Vandiver
alexmv at bestpractical.com
Mon Apr 30 21:03:47 EDT 2012
The branch, 4.0/encoded-word-whitespace has been created
at 484351407de29b9417c4646f362ee10ab1138487 (commit)
- Log -----------------------------------------------------------------
commit 5a5f0af5299f5b17f74889751801eb1e88a259a5
Author: Alex Vandiver <alexmv at bestpractical.com>
Date: Mon Apr 30 20:52:36 2012 -0400
Remove all whitespace separating adjacent encoded-word chunks
RFC 2047 says:
When displaying a particular header field that contains multiple
'encoded-word's, any 'linear-white-space' that separates a pair of
adjacent 'encoded-word's is ignored.
Implement this by explicitly removing all such whitespace.
diff --git a/lib/RT/I18N.pm b/lib/RT/I18N.pm
index cadf7cc..a277e56 100644
--- a/lib/RT/I18N.pm
+++ b/lib/RT/I18N.pm
@@ -293,18 +293,24 @@ sub DecodeMIMEWordsToEncoding {
$str = MIME::Field::ParamVal->parse($str)->stringify;
}
+ # Pre-parse by removing all whitespace between encoded words
+ my $encoded_word = qr/
+ =\? # =?
+ ([^?]+?) # charset
+ (?:\*[^?]+)? # optional '*language'
+ \? # ?
+ ([QqBb]) # encoding
+ \? # ?
+ ([^?]+) # encoded string
+ \?= # ?=
+ /x;
+ 1 while $str =~ s/($encoded_word)\s+($encoded_word)/$1$5/;
+
# XXX TODO: use decode('MIME-Header', ...) and Encode::Alias to replace our
# custom MIME word decoding and charset canonicalization. We can't do this
# until we parse before decode, instead of the other way around.
my @list = $str =~ m/(.*?) # prefix
- =\? # =?
- ([^?]+?) # charset
- (?:\*[^?]+)? # optional '*language'
- \? # ?
- ([QqBb]) # encoding
- \? # ?
- ([^?]+) # encoded string
- \?= # ?=
+ $encoded_word
([^=]*) # trailing
/xgcs;
diff --git a/t/mail/mime_decoding.t b/t/mail/mime_decoding.t
index 7515e2c..7fc1c93 100644
--- a/t/mail/mime_decoding.t
+++ b/t/mail/mime_decoding.t
@@ -1,7 +1,7 @@
#!/usr/bin/perl
use strict;
use warnings;
-use RT::Test nodb => 1, tests => 9;
+use RT::Test nodb => 1, tests => 11;
use_ok('RT::I18N');
@@ -97,3 +97,20 @@ diag q{canonicalize mime word encodings like gb2312};
);
}
+
+diag q{Whitespace between encoded words should be removed};
+{
+ my $str = "=?utf-8?Q?=E3=82=AD?= =?utf-8?Q?=E3=83=A3?=";
+ is(
+ RT::I18N::DecodeMIMEWordsToUTF8($str),
+ "ãã£",
+ "whitespace between encoded words is removed",
+ );
+
+ $str = "=?utf-8?Q?=E3=82=AD?= \n =?utf-8?Q?=E3=83=A3?=";
+ is(
+ RT::I18N::DecodeMIMEWordsToUTF8($str),
+ "ãã£",
+ "newlines between encoded words also removed",
+ );
+}
commit 484351407de29b9417c4646f362ee10ab1138487
Author: Alex Vandiver <alexmv at bestpractical.com>
Date: Mon Apr 30 20:54:02 2012 -0400
Join all adjacent QP encoded-word chunks, in case octets were split
Though not allowed by RFC 2047, some mail clients split multiple octets
of a multibyte encoding across different encoded-word chunks.
Reassemble adjacent encoded-word chunks into one, as long as they match
in encoding, language, and transfer encoding. Only quoted-printable
chunks are joined, as joining base64 hunks is much more complex.
diff --git a/lib/RT/I18N.pm b/lib/RT/I18N.pm
index a277e56..1b0bf6c 100644
--- a/lib/RT/I18N.pm
+++ b/lib/RT/I18N.pm
@@ -306,6 +306,12 @@ sub DecodeMIMEWordsToEncoding {
/x;
1 while $str =~ s/($encoded_word)\s+($encoded_word)/$1$5/;
+ # Also merge quoted-printable sections together, in case multiple
+ # octets of a single encoded character were split between chunks.
+ # Though not valid according to RFC 2047, this has been seen in the
+ # wild.
+ 1 while $str =~ s/(=\?[^?]+\?[Qq]\?)([^?]+)\?=\1([^?]+)\?=/$1$2$3?=/i;
+
# XXX TODO: use decode('MIME-Header', ...) and Encode::Alias to replace our
# custom MIME word decoding and charset canonicalization. We can't do this
# until we parse before decode, instead of the other way around.
diff --git a/t/mail/mime_decoding.t b/t/mail/mime_decoding.t
index 7fc1c93..7bd2c86 100644
--- a/t/mail/mime_decoding.t
+++ b/t/mail/mime_decoding.t
@@ -1,7 +1,7 @@
#!/usr/bin/perl
use strict;
use warnings;
-use RT::Test nodb => 1, tests => 11;
+use RT::Test nodb => 1, tests => 13;
use_ok('RT::I18N');
@@ -114,3 +114,28 @@ diag q{Whitespace between encoded words should be removed};
"newlines between encoded words also removed",
);
}
+
+diag q{Multiple octets split across QP hunks are correctly reassembled};
+{
+ # This passes even without explicit code to handle it because utf8
+ # is perl's internal string encoding.
+ my $str = "=?utf-8?Q?=E3?= =?utf-8?Q?=82?= =?utf-8?Q?=AD?=";
+ is(
+ RT::I18N::DecodeMIMEWordsToUTF8($str),
+ "ã",
+ "UTF8 character split in three is successfully reassembled",
+ );
+
+ # Non-utf8 encodings thus also must be checked
+ $str = <<EOT; chomp $str;
+=?gb2312?q?Chinese(gb2312)=20=20=C3=C0=B9=FA=C7=B0=CB=BE=B7=A8=B2=BF=B3?=
+ =?gb2312?q?=A4=C3=E6=BC=FB=C8=F8=B4=EF=C4=B7=BA=F3=B3=C6=C6=E4=D7=B4=CC=AC?=
+ =?gb2312?q?=BA=DC=BA=C3=20=20Chinese=20(gb2312)?=
+EOT
+ is(
+ RT::I18N::DecodeMIMEWordsToUTF8($str),
+ "Chinese(gb2312) ç¾å½åå¸æ³é¨é¿é¢è§è¨è¾¾å§å称å
¶ç¶æå¾å¥½ Chinese (gb2312)",
+ "gb2312 character is successfully reassembled",
+ );
+
+}
-----------------------------------------------------------------------
More information about the Rt-commit
mailing list