[Rt-commit] rt branch, 4.0/protect-more-chars-while-decoding-headers, created. rt-4.0.6-225-g416badf

Sat Nov 17 16:02:29 EST 2012

The branch, 4.0/protect-more-chars-while-decoding-headers has been created
        at  416badfba4b533cc0a623538792991425de8817d (commit)

- Log -----------------------------------------------------------------
commit 70b8c951f1d908798e63eea711690b8fa75873c9
Author: Ruslan Zakirov <ruz at bestpractical.com>
Date:   Thu Jun 28 23:08:45 2012 +0300

    expand list of characters we quote when decoding Q/B
    
    When we get rid of Q/B encodings in formatted fields (From/To/Cc...),
    we sometimes have to take care of putting quotes around the display
    name.
    
    The list comes from Email::Address (which gets it from RFC).

diff --git a/lib/RT/I18N.pm b/lib/RT/I18N.pm
index cadf7cc..d57e715 100644
--- a/lib/RT/I18N.pm
+++ b/lib/RT/I18N.pm
@@ -356,7 +356,7 @@ sub DecodeMIMEWordsToEncoding {
             # confuse us a lot, so only quote it if it isn't quoted
             # already.
             $enc_str = qq{"$enc_str"}
-                if $enc_str =~ /[,;]/
+                if $enc_str =~ /[()<>\[\]:;@\\,.]/
                 and $enc_str !~ /^".*"$/
                 and (!$field || $field =~ /^(?:To$|From$|B?Cc$|Content-)/i);
 

commit 629fdf1b6a4d1e527e00360026c765642bca2c88
Author: Ruslan Zakirov <ruz at bestpractical.com>
Date:   Fri Oct 19 02:32:38 2012 +0400

    fix our magic quoting after decoding Q/B
    
    We decode and, if it's a special field, e.g. a formatted field
    like to/cc/content-*, then we try to be smart and put
    quotes around the decoded part. This is required to keep
    the value a valid formatted field.
    
    Variants:
    
    1) 'xxx=<q/b string>', has no quotes at all
    2) 'xxx=<q/b string>', quotes are inside of encoded string
    3) 'xxx="<q/b string>"', quotes are outside
    
    This fixes third case.

diff --git a/lib/RT/I18N.pm b/lib/RT/I18N.pm
index d57e715..af6f4df 100644
--- a/lib/RT/I18N.pm
+++ b/lib/RT/I18N.pm
@@ -357,7 +357,7 @@ sub DecodeMIMEWordsToEncoding {
             # already.
             $enc_str = qq{"$enc_str"}
                 if $enc_str =~ /[()<>\[\]:;@\\,.]/
-                and $enc_str !~ /^".*"$/
+                and not (($enc_str =~ /^"/ or $prefix =~ /"$/) and ($enc_str =~ /"$/ or $trailing =~ /^"/))
                 and (!$field || $field =~ /^(?:To$|From$|B?Cc$|Content-)/i);
 
             $str .= $prefix . $enc_str . $trailing;
diff --git a/t/mail/mime_decoding.t b/t/mail/mime_decoding.t
index 7515e2c..1243dff 100644
--- a/t/mail/mime_decoding.t
+++ b/t/mail/mime_decoding.t
@@ -1,7 +1,7 @@
 #!/usr/bin/perl
 use strict;
 use warnings;
-use RT::Test nodb => 1, tests => 9;
+use RT::Test nodb => 1, tests => 13;
 
 use_ok('RT::I18N');
 
@@ -13,6 +13,11 @@ diag q{'=' char in a leading part before an encoded part};
         'key="plain"; key="Ð¼Ð¾Ð¹_Ñ„Ð°Ð¹Ð».bin"',
         "right decoding"
     );
+    is(
+        RT::I18N::DecodeMIMEWordsToUTF8($str, 'content-disposition'),
+        'key="plain"; key="Ð¼Ð¾Ð¹_Ñ„Ð°Ð¹Ð».bin"',
+        "right decoding"
+    );
 }
 
 diag q{not compliant with standards, but MUAs send such field when attachment has non-ascii in name};
@@ -23,6 +28,11 @@ diag q{not compliant with standards, but MUAs send such field when attachment ha
         'attachment; filename="Ð¼Ð¾Ð¹_Ñ„Ð°Ð¹Ð».bin"',
         "right decoding"
     );
+    is(
+        RT::I18N::DecodeMIMEWordsToUTF8($str, 'content-disposition'),
+        'attachment; filename="Ð¼Ð¾Ð¹_Ñ„Ð°Ð¹Ð».bin"',
+        "right decoding"
+    );
 }
 
 diag q{'=' char in a trailing part after an encoded part};
@@ -33,6 +43,11 @@ diag q{'=' char in a trailing part after an encoded part};
         'attachment; filename="Ð¼Ð¾Ð¹_Ñ„Ð°Ð¹Ð».bin"; some_prop="value"',
         "right decoding"
     );
+    is(
+        RT::I18N::DecodeMIMEWordsToUTF8($str, 'content-disposition'),
+        'attachment; filename="Ð¼Ð¾Ð¹_Ñ„Ð°Ð¹Ð».bin"; some_prop="value"',
+        "right decoding"
+    );
 }
 
 diag q{regression test for #5248 from rt3.fsck.com};
@@ -40,7 +55,7 @@ diag q{regression test for #5248 from rt3.fsck.com};
     my $str = qq{Subject: =?ISO-8859-1?Q?Re=3A_=5BXXXXXX=23269=5D_=5BComment=5D_Frag?=}
         . qq{\n =?ISO-8859-1?Q?e_zu_XXXXXX--xxxxxx_/_Xxxxx=FCxxxxxxxxxx?=};
     is(
-        RT::I18N::DecodeMIMEWordsToUTF8($str),
+        RT::I18N::DecodeMIMEWordsToUTF8($str, 'Subject'),
         qq{Subject: Re: [XXXXXX#269] [Comment] Frage zu XXXXXX--xxxxxx / XxxxxÃ¼xxxxxxxxxx},
         "right decoding"
     );
@@ -54,6 +69,11 @@ diag q{newline and encoded file name};
         qq{application/vnd.ms-powerpoint;\tname="Main presentation.ppt"},
         "right decoding"
     );
+    is(
+        RT::I18N::DecodeMIMEWordsToUTF8($str,'content-type'),
+        qq{application/vnd.ms-powerpoint; name="Main presentation.ppt"},
+        "right decoding"
+    );
 }
 
 diag q{rfc2231};

commit 298eec23c1858227c7537bed125c5b9178725790
Author: Thomas Sibley <trs at bestpractical.com>
Date:   Fri Oct 26 15:50:53 2012 -0700

    Failing tests for more MIME word decoding problems

diff --git a/t/mail/mime_decoding.t b/t/mail/mime_decoding.t
index 1243dff..89928ba 100644
--- a/t/mail/mime_decoding.t
+++ b/t/mail/mime_decoding.t
@@ -1,7 +1,8 @@
 #!/usr/bin/perl
 use strict;
 use warnings;
-use RT::Test nodb => 1, tests => 13;
+use RT::Test nodb => 1, tests => undef;
+use Test::LongString;
 
 use_ok('RT::I18N');
 
@@ -117,3 +118,62 @@ diag q{canonicalize mime word encodings like gb2312};
     );
 }
 
+diag "multiple mime words containing special chars already in quotes";
+{
+    my $str = q{attachment; filename="=?ISO-2022-JP?B?Mi4bJEIlSyVlITwlOSVqJWohPCU5GyhC?= =?ISO-2022-JP?B?LnBkZg==?="};
+    is_string(
+        RT::I18N::DecodeMIMEWordsToUTF8($str, 'Content-Disposition'),
+        q{attachment; filename="2.ãƒ‹ãƒ¥ãƒ¼ã‚¹ãƒªãƒªãƒ¼ã‚¹.pdf"},
+        "base64"
+    );
+
+    $str = q{attachment; filename="=?UTF-8?Q?2=2E=E3=83=8B=E3=83=A5=E3=83=BC=E3=82=B9=E3=83=AA=E3=83=AA?= =?UTF-8?Q?=E3=83=BC=E3=82=B9=2Epdf?="};
+    is_string(
+        RT::I18N::DecodeMIMEWordsToUTF8($str, 'Content-Disposition'),
+        q{attachment; filename="2.ãƒ‹ãƒ¥ãƒ¼ã‚¹ãƒªãƒªãƒ¼ã‚¹.pdf"},
+        "QP"
+    );
+}
+
+diag "mime word combined with text in quoted filename property";
+{
+    my $str = q{attachment; filename="=?UTF-8?B?Q2VjaSBuJ2VzdCBwYXMgdW5l?= pipe.pdf"};
+    is_string(
+        RT::I18N::DecodeMIMEWordsToUTF8($str, 'Content-Disposition'),
+        q{attachment; filename="Ceci n'est pas une pipe.pdf"},
+        "base64"
+    );
+
+    $str = q{attachment; filename="=?UTF-8?B?Q2VjaSBuJ2VzdCBwYXMgdW5lLi4u?= pipe.pdf"};
+    is_string(
+        RT::I18N::DecodeMIMEWordsToUTF8($str, 'Content-Disposition'),
+        q{attachment; filename="Ceci n'est pas une... pipe.pdf"},
+        "base64"
+    );
+
+    $str = q{attachment; filename="=?UTF-8?Q?Ceci n'est pas une?= pipe.pdf"};
+    is_string(
+        RT::I18N::DecodeMIMEWordsToUTF8($str, 'Content-Disposition'),
+        q{attachment; filename="Ceci n'est pas une pipe.pdf"},
+        "QP"
+    );
+
+    $str = q{attachment; filename="=?UTF-8?Q?Ceci n'est pas une...?= pipe.pdf"};
+    is_string(
+        RT::I18N::DecodeMIMEWordsToUTF8($str, 'Content-Disposition'),
+        q{attachment; filename="Ceci n'est pas une... pipe.pdf"},
+        "QP"
+    );
+}
+
+diag "quotes in filename";
+{
+    my $str = q{attachment; filename="=?UTF-8?B?YSAicXVvdGVkIiBmaWxl?="};
+    is_string(
+        RT::I18N::DecodeMIMEWordsToUTF8($str, 'Content-Disposition'),
+        q{attachment; filename="a \"quoted\" file"},
+        "quoted filename correctly decoded"
+    );
+}
+
+done_testing;

commit 4db57bc5a27b93e93a38762c86b9f3ba888d1003
Author: Ruslan Zakirov <ruz at bestpractical.com>
Date:   Mon Nov 5 18:20:07 2012 +0400

    change how we deal with decoding structured fields
    
    We use MIME::Field::ParamVal already to deal with
    continued params (RFC2231). This module allows us
    to deal with separate parameters, decode them and
    deal with quotes inside or absence of those without
    guessing.
    
    There is a bug in the module [1] - wrong parsing
    of quoted strings and wrong quoting on stringify.
    It was fixed, patch is on rt.cpan.org waiting for
    merge and release of the module.
    
    [1] https://rt.cpan.org/Ticket/Display.html?id=80433
    
    The situation is similar with From, Cc, Bcc and other fields
    with mailboxes. We were only dealing with a subset of
    mailbox fields; now we handle all such headers that
    are mentioned in RFC5322.
    
    Switch to Email::Address::List to parse headers with
    mailboxes. It allows us to report errors, handles obsolete
    emails, and doesn't bail out on non-ASCII characters.
    
    Email::Address is still used, and a bug [2] was discovered
    in the module that may result in an unparsable string
    after re-composing.
    
    [2] https://rt.cpan.org/Ticket/Display.html?id=81170

diff --git a/lib/RT/EmailParser.pm b/lib/RT/EmailParser.pm
index 4cf4184..ab53cd0 100644
--- a/lib/RT/EmailParser.pm
+++ b/lib/RT/EmailParser.pm
@@ -532,24 +532,36 @@ sub ParseEmailAddress {
     my $self = shift;
     my $address_string = shift;
 
-    $address_string =~ s/^\s+|\s+$//g;
+    my @list = Email::Address::Line->parse(
+        $address_string,
+        skip_comments => 1,
+        skip_groups => 1,
+    );
+    my $logger = sub { RT->Logger->error(
+        "Unable to parse an email address from $address_string: ". shift
+    ) };
 
     my @addresses;
-    # if it looks like a username / local only email
-    if ($address_string !~ /@/ && $address_string =~ /^\w+$/) {
-        my $user = RT::User->new( RT->SystemUser );
-        my ($id, $msg) = $user->Load($address_string);
-        if ($id) {
-            push @addresses, Email::Address->new($user->Name,$user->EmailAddress);
+    foreach my $e ( @list ) {
+        if ($e->{'type'} eq 'mailbox') {
+            if ($e->{'not_ascii'}) {
+                $logger->($e->{'value'} ." contains not ASCII values");
+                next;
+            }
+            push @addresses, $e->{'value'}
+        } elsif ( $e->{'value'} =~ /^\s*(\w+)\s*$/ ) {
+            my $user = RT::User->new( RT->SystemUser );
+            $user->Load( $1 );
+            if ($user->id) {
+                push @addresses, Email::Address->new($user->Name, $user->EmailAddress);
+            } else {
+                $logger->($e->{'value'} ." is not a valid email address and is not user name");
+            }
         } else {
-            $RT::Logger->error("Unable to parse an email address from $address_string: $msg");
+            $logger->($e->{'value'} ." is not a valid email address");
         }
-    } else {
-        @addresses = Email::Address->parse($address_string);
     }
-
     return @addresses;
-
 }
 
 =head2 RescueOutlook 
diff --git a/lib/RT/I18N.pm b/lib/RT/I18N.pm
index af6f4df..b7316ed 100644
--- a/lib/RT/I18N.pm
+++ b/lib/RT/I18N.pm
@@ -284,14 +284,65 @@ sub DecodeMIMEWordsToEncoding {
     my $str = shift;
     my $to_charset = _CanonicalizeCharset(shift);
     my $field = shift || '';
+    $RT::Logger->warning(
+        "DecodeMIMEWordsToEncoding was called without field name."
+        ."It's known to cause troubles with decoding fields properly."
+    ) unless $field;
+
+    # XXX TODO: RT doesn't currently do the right thing with mime-encoded headers
+    # We _should_ be preserving them encoded until after parsing is completed and
+    # THEN undo the mime-encoding.
+    #
+    # This routine should be translating the existing mimeencoding to utf8 but leaving
+    # things encoded.
+    #
+    # It's legal for headers to contain mime-encoded commas and semicolons which
+    # should not be treated as address separators. (Encoding == quoting here)
+    #
+    # until this is fixed, we must escape any string containing a comma or semicolon
+    # this is only a bandaid
+
+    # Some _other_ MUAs encode quotes _already_, and double quotes
+    # confuse us a lot, so only quote it if it isn't quoted
+    # already.
 
     # handle filename*=ISO-8859-1''%74%E9%73%74%2E%74%78%74, parameter value
     # continuations, and similar syntax from RFC 2231
-    if ($field =~ /^Content-(Type|Disposition)/i) {
+    if ($field =~ /^Content-/i) {
         # This concatenates continued parameters and normalizes encoded params
         # to QB encoded-words which we handle below
-        $str = MIME::Field::ParamVal->parse($str)->stringify;
+        my $params = MIME::Field::ParamVal->parse_params($str);
+        foreach my $v ( values %$params ) {
+            $v = _DecodeMIMEWordsToEncoding( $v, $to_charset );
+        }
+        $str = bless({}, 'MIME::Field::ParamVal')->set($params)->stringify;
+    }
+    elsif ( $field =~ /^(?:Resent-)?(?:To|From|B?Cc|Sender|Reply-To)$/i ) {
+        my @addresses = RT::EmailParser->ParseEmailAddress( $str );
+        foreach my $address ( @addresses ) {
+            foreach my $field (qw(phrase comment)) {
+                my $v = $address->$field() or next;
+                $v = _DecodeMIMEWordsToEncoding( $v, $to_charset );
+                $address->$field($v);
+            }
+        }
+        $str = join ', ', map $_->format, @addresses;
     }
+    else {
+        $str = _DecodeMIMEWordsToEncoding( $str, $to_charset );
+    }
+
+
+    # We might have \n without trailing whitespace, which will result in
+    # invalid headers.
+    $str =~ s/\n//g;
+
+    return ($str)
+}
+
+sub _DecodeMIMEWordsToEncoding {
+    my $str = shift;
+    my $to_charset = shift;
 
     # XXX TODO: use decode('MIME-Header', ...) and Encode::Alias to replace our
     # custom MIME word decoding and charset canonicalization.  We can't do this
@@ -307,72 +358,44 @@ sub DecodeMIMEWordsToEncoding {
                          \?=            # ?=
                          ([^=]*)        # trailing
                         /xgcs;
+    return $str unless @list;
+
+    # add everything that hasn't matched to the end of the latest
+    # string in array this happen when we have 'key="=?encoded?="; key="plain"'
+    $list[-1] .= substr($str, pos $str);
+
+    $str = '';
+    while (@list) {
+        my ($prefix, $charset, $encoding, $enc_str, $trailing) =
+                splice @list, 0, 5;
+        $charset  = _CanonicalizeCharset($charset);
+        $encoding = lc $encoding;
+
+        $trailing =~ s/\s?\t?$//;               # Observed from Outlook Express
+
+        if ( $encoding eq 'q' ) {
+            use MIME::QuotedPrint;
+            $enc_str =~ tr/_/ /;		# Observed from Outlook Express
+            $enc_str = decode_qp($enc_str);
+        } elsif ( $encoding eq 'b' ) {
+            use MIME::Base64;
+            $enc_str = decode_base64($enc_str);
+        } else {
+            $RT::Logger->warning("Incorrect encoding '$encoding' in '$str', "
+                ."only Q(uoted-printable) and B(ase64) are supported");
+        }
 
-    if ( @list ) {
-        # add everything that hasn't matched to the end of the latest
-        # string in array this happen when we have 'key="=?encoded?="; key="plain"'
-        $list[-1] .= substr($str, pos $str);
-
-        $str = "";
-        while (@list) {
-            my ($prefix, $charset, $encoding, $enc_str, $trailing) =
-                    splice @list, 0, 5;
-            $charset  = _CanonicalizeCharset($charset);
-            $encoding = lc $encoding;
-
-            $trailing =~ s/\s?\t?$//;               # Observed from Outlook Express
-
-            if ( $encoding eq 'q' ) {
-                use MIME::QuotedPrint;
-                $enc_str =~ tr/_/ /;		# Observed from Outlook Express
-                $enc_str = decode_qp($enc_str);
-            } elsif ( $encoding eq 'b' ) {
-                use MIME::Base64;
-                $enc_str = decode_base64($enc_str);
-            } else {
-                $RT::Logger->warning("Incorrect encoding '$encoding' in '$str', "
-                    ."only Q(uoted-printable) and B(ase64) are supported");
-            }
-
-            # now we have got a decoded subject, try to convert into the encoding
-            unless ( $charset eq $to_charset ) {
-                Encode::from_to( $enc_str, $charset, $to_charset );
-            }
-
-            # XXX TODO: RT doesn't currently do the right thing with mime-encoded headers
-            # We _should_ be preserving them encoded until after parsing is completed and
-            # THEN undo the mime-encoding.
-            #
-            # This routine should be translating the existing mimeencoding to utf8 but leaving
-            # things encoded.
-            #
-            # It's legal for headers to contain mime-encoded commas and semicolons which
-            # should not be treated as address separators. (Encoding == quoting here)
-            #
-            # until this is fixed, we must escape any string containing a comma or semicolon
-            # this is only a bandaid
-
-            # Some _other_ MUAs encode quotes _already_, and double quotes
-            # confuse us a lot, so only quote it if it isn't quoted
-            # already.
-            $enc_str = qq{"$enc_str"}
-                if $enc_str =~ /[()<>\[\]:;@\\,.]/
-                and not (($enc_str =~ /^"/ or $prefix =~ /"$/) and ($enc_str =~ /"$/ or $trailing =~ /^"/))
-                and (!$field || $field =~ /^(?:To$|From$|B?Cc$|Content-)/i);
-
-            $str .= $prefix . $enc_str . $trailing;
+        # now we have got a decoded subject, try to convert into the encoding
+        unless ( $charset eq $to_charset ) {
+            Encode::from_to( $enc_str, $charset, $to_charset );
         }
+        $str .= $prefix . $enc_str . $trailing;
     }
 
-    # We might have \n without trailing whitespace, which will result in
-    # invalid headers.
-    $str =~ s/\n//g;
-
     return ($str)
 }
 
 
-
 =head2 _FindOrGuessCharset MIME::Entity, $head_only
 
 When handed a MIME::Entity will first attempt to read what charset the message is encoded in. Failing that, will use Encode::Guess to try to figure it out
diff --git a/t/mail/dashboards.t b/t/mail/dashboards.t
index 00cfc6a..1535595 100644
--- a/t/mail/dashboards.t
+++ b/t/mail/dashboards.t
@@ -102,7 +102,7 @@ sub produces_dashboard_mail_ok { # {{{
 
     my $mail = parse_mail( $mails[0] );
     is($mail->head->get('Subject'), $subject);
-    is($mail->head->get('From'), "root\n");
+    is($mail->head->get('From'), qq{"root" <root\@localhost>\n});
     is($mail->head->get('X-RT-Dashboard-Id'), "$dashboard_id\n");
     is($mail->head->get('X-RT-Dashboard-Subscription-Id'), "$subscription_id\n");
 

commit 416badfba4b533cc0a623538792991425de8817d
Author: Ruslan Zakirov <ruz at bestpractical.com>
Date:   Sat Nov 17 22:20:47 2012 +0400

    deal with hidden quotes within encoded-word
    
    After decoding, a part (a param value or display name)
    may contain a quoted string inside; we de-quote and
    de-escape it just to avoid useless quotes.

diff --git a/lib/RT/I18N.pm b/lib/RT/I18N.pm
index b7316ed..3cf68ec 100644
--- a/lib/RT/I18N.pm
+++ b/lib/RT/I18N.pm
@@ -314,6 +314,8 @@ sub DecodeMIMEWordsToEncoding {
         my $params = MIME::Field::ParamVal->parse_params($str);
         foreach my $v ( values %$params ) {
             $v = _DecodeMIMEWordsToEncoding( $v, $to_charset );
+            # de-quote in case those were hidden inside encoded part
+            $v =~ s/\\(.)/$1/g if $v =~ s/^"(.*)"$/$1/;
         }
         $str = bless({}, 'MIME::Field::ParamVal')->set($params)->stringify;
     }
@@ -323,6 +325,10 @@ sub DecodeMIMEWordsToEncoding {
             foreach my $field (qw(phrase comment)) {
                 my $v = $address->$field() or next;
                 $v = _DecodeMIMEWordsToEncoding( $v, $to_charset );
+                if ( $field eq 'phrase' ) {
+                    # de-quote in case quoted value were hidden inside encoded part
+                    $v =~ s/\\(.)/$1/g if $v =~ s/^"(.*)"$/$1/;
+                }
                 $address->$field($v);
             }
         }

-----------------------------------------------------------------------