[Rt-commit] rt branch, 4.2/set-mime-encoding-refactor, created. rt-4.1.13-24-gd31ba34

Wed Jul 3 14:03:39 EDT 2013

The branch, 4.2/set-mime-encoding-refactor has been created
        at  d31ba34246bac48232ea0bf92fc67132dadc8e3f (commit)

- Log -----------------------------------------------------------------
commit bd42e5a269aa34667d4d586f7b99ce4853424eab
Author: sunnavy <sunnavy at bestpractical.com>
Date:   Tue May 21 00:09:42 2013 +0800

    refactor SetMIMEEncoding functionality( see also #17680 )
    
    incoming mail:
    
    1) find encoding
    2) if found then try to convert to utf-8 in croak mode, return if success
    3) guess encoding
    4) if guessed differently then try to convert to utf-8 in croak mode, return if success
    5) mark part as application/octet-stream instead of falling back to any encoding
    
    outgoing mail:
    
    1) find encoding
    2) if didn't find then do nothing, send as is, let MUA deal with it
    3) if found then try to convert it to outgoing encoding in croak mode, return if success
    4) do nothing otherwise, keep original encoding

diff --git a/lib/RT/Action/SendEmail.pm b/lib/RT/Action/SendEmail.pm
index 65f99fd..0128956 100644
--- a/lib/RT/Action/SendEmail.pm
+++ b/lib/RT/Action/SendEmail.pm
@@ -195,9 +195,12 @@ sub Prepare {
         $part->head->mime_attr( "Content-Type.charset" => 'utf-8' );
     }
 
-    RT::I18N::SetMIMEEntityToEncoding( $MIMEObj,
-        RT->Config->Get('EmailOutputEncoding'),
-        'mime_words_ok', );
+    RT::I18N::SetMIMEEntityToEncoding(
+        Entity        => $MIMEObj,
+        Encoding      => RT->Config->Get('EmailOutputEncoding'),
+        PreserveWords => 1,
+        IsOut         => 1,
+    );
 
     # Build up a MIME::Entity that looks like the original message.
     $self->AddAttachments if ( $MIMEObj->head->get('RT-Attach-Message')
diff --git a/lib/RT/I18N.pm b/lib/RT/I18N.pm
index f4592af..31e8741 100644
--- a/lib/RT/I18N.pm
+++ b/lib/RT/I18N.pm
@@ -192,22 +192,71 @@ sub IsTextualContentType {
 }
 
 
-=head2 SetMIMEEntityToEncoding $entity, $encoding
+=head2 SetMIMEEntityToEncoding Entity => ENTITY, Encoding => ENCODING, PreserveWords => BOOL, IsOut => BOOL
 
 An utility function which will try to convert entity body into specified
 charset encoding (encoded as octets, *not* unicode-strings).  It will
 iterate all the entities in $entity, and try to convert each one into
 specified charset if whose Content-Type is 'text/plain'.
 
+If PreserveWords is true, values in mime head will be decoded.(default is false)
+
+Incoming and outgoing mails are handled differently, if IsOut is true(default
+is false), it'll be treated as outgoing mail, otherwise incomding mail:
+
+incoming mail:
+1) find encoding
+2) if found then try to convert to utf-8 in croak mode, return if success
+3) guess encoding
+4) if guessed differently then try to convert to utf-8 in croak mode, return
+   if success
+5) mark part as application/octet-stream instead of falling back to any
+   encoding
+
+outgoing mail:
+1) find encoding
+2) if didn't find then do nothing, send as is, let MUA deal with it
+3) if found then try to convert it to outgoing encoding in croak mode, return
+   if success
+4) do nothing otherwise, keep original encoding
+
 This function doesn't return anything meaningful.
 
 =cut
 
 sub SetMIMEEntityToEncoding {
-    my ( $entity, $enc, $preserve_words ) = ( shift, shift, shift );
+    my ( $entity, $enc, $preserve_words, $is_out );
+
+    if ( @_ <= 3 ) {
+        ( $entity, $enc, $preserve_words ) = @_;
+    }
+    else {
+        my %args = (
+            Entity        => undef,
+            Encoding      => undef,
+            PreserveWords => undef,
+            IsOut         => undef,
+            @_,
+        );
+
+        $entity         = $args{Entity};
+        $enc            = $args{Encoding};
+        $preserve_words = $args{PreserveWords};
+        $is_out         = $args{IsOut};
+    }
+
+    unless ( $entity && $enc ) {
+        RT->Logger->error("Missing Entity or Encoding arguments");
+        return;
+    }
 
     # do the same for parts first of all
-    SetMIMEEntityToEncoding( $_, $enc, $preserve_words ) foreach $entity->parts;
+    SetMIMEEntityToEncoding(
+        Entity        => $_,
+        Encoding      => $enc,
+        PreserveWords => $preserve_words,
+        IsOut         => $is_out,
+    ) foreach $entity->parts;
 
     my $head = $entity->head;
 
@@ -225,9 +274,11 @@ sub SetMIMEEntityToEncoding {
     }
 
     SetMIMEHeadToEncoding(
-        $head,
-        _FindOrGuessCharset($entity, 1) => $enc,
-        $preserve_words
+        Head          => $head,
+        From          => _FindOrGuessCharset( $entity, 1 ),
+        To            => $enc,
+        PreserveWords => $preserve_words,
+        IsOut         => $is_out,
     );
 
     # If this is a textual entity, we'd need to preserve its original encoding
@@ -247,7 +298,25 @@ sub SetMIMEEntityToEncoding {
 
         # NOTE:: see the comments at the end of the sub.
         Encode::_utf8_off($string);
-        Encode::from_to( $string, $charset => $enc );
+        my $orig_string = $string;
+        ( my $success, $string ) = EncodeFromToWithCroak( $orig_string, $charset => $enc );
+        if ( !$success ) {
+            return if $is_out;
+            my $error = $string;
+
+            my $guess = _GuessCharset($orig_string);
+            if ( $guess && $guess ne $charset ) {
+                $RT::Logger->error( "Encoding error: " . $error . " falling back to Guess($guess) => $enc" );
+                ( $success, $string ) = EncodeFromToWithCroak( $orig_string, $guess, $enc );
+                $error = $string unless $success;
+            }
+
+            if ( !$success ) {
+                $RT::Logger->error( "Encoding error: " . $error . " falling back to application/octet-stream" );
+                $head->mime_attr( "content-type" => 'application/octet-stream' );
+                return;
+            }
+        }
 
         my $new_body = MIME::Body::InCore->new($string);
 
@@ -574,7 +643,7 @@ sub _CanonicalizeCharset {
 }
 
 
-=head2 SetMIMEHeadToEncoding HEAD OLD_CHARSET NEW_CHARSET
+=head2 SetMIMEHeadToEncoding MIMEHead => HEAD, From => OLD_ENCODING, To => NEW_Encoding, PreserveWords => BOOL, IsOut => BOOL
 
 Converts a MIME Head from one encoding to another. This totally violates the RFC.
 We should never need this. But, Surprise!, MUAs are badly broken and do this kind of stuff
@@ -584,7 +653,33 @@ all the time
 =cut
 
 sub SetMIMEHeadToEncoding {
-    my ( $head, $charset, $enc, $preserve_words ) = ( shift, shift, shift, shift );
+    my ( $head, $charset, $enc, $preserve_words, $is_out );
+
+    if ( @_ <= 4 ) {
+        ( $head, $charset, $enc, $preserve_words ) = @_;
+    }
+    else {
+        my %args = (
+            Head      => undef,
+            From          => undef,
+            To            => undef,
+            PreserveWords => undef,
+            IsOut         => undef,
+            @_,
+        );
+
+        $head           = $args{Head};
+        $charset        = $args{From};
+        $enc            = $args{To};
+        $preserve_words = $args{PreserveWords};
+        $is_out         = $args{IsOut};
+    }
+
+    unless ( $head && $charset && $enc ) {
+        RT->Logger->error(
+            "Missing Head or From or To arguments");
+        return;
+    }
 
     $charset = _CanonicalizeCharset($charset);
     $enc     = _CanonicalizeCharset($enc);
@@ -598,8 +693,31 @@ sub SetMIMEHeadToEncoding {
         foreach my $value (@values) {
             if ( $charset ne $enc || $enc =~ /^utf-?8(?:-strict)?$/i ) {
                 Encode::_utf8_off($value);
-                Encode::from_to( $value, $charset => $enc );
+                my $orig_value = $value;
+                ( my $success, $value ) = EncodeFromToWithCroak( $orig_value, $charset => $enc );
+                if ( !$success ) {
+                    my $error = $value;
+                    if ($is_out) {
+                        $value = $orig_value;
+                        $head->add( $tag, $value );
+                        next;
+                    }
+
+                    my $guess = _GuessCharset($orig_value);
+                    if ( $guess && $guess ne $charset ) {
+                        $RT::Logger->error( "Encoding error: " . $error . " falling back to Guess($guess) => $enc" );
+                        ( $success, $value ) = EncodeFromToWithCroak( $orig_value, $guess, $enc );
+                        $error = $value unless $success;
+                    }
+
+                    if ( !$success ) {
+                        $RT::Logger->error( "Encoding error: " . $error . " forcing conversion to $charset => $enc" );
+                        $value = $orig_value;
+                        Encode::from_to( $value, $charset => $enc );
+                    }
+                }
             }
+
             $value = DecodeMIMEWordsToEncoding( $value, $enc, $tag )
                 unless $preserve_words;
 
@@ -614,6 +732,23 @@ sub SetMIMEHeadToEncoding {
 
 }
 
+=head2 EncodeFromToWithCroak $string, $from, $to
+
+Try to encode string from encoding $from to encoding $to in croak mode
+
+return (1, $encoded_string) if success, otherwise (0, $error)
+
+=cut
+
+sub EncodeFromToWithCroak {
+    my $string = shift;
+    my $from   = shift;
+    my $to     = shift;
+
+    eval { Encode::from_to( $string, $from => $to, Encode::FB_CROAK ); };
+    return $@ ? ( 0, $@ ) : ( 1, $string );
+}
+
 RT::Base->_ImportOverlays();
 
 1;  # End of module.

commit d31ba34246bac48232ea0bf92fc67132dadc8e3f
Author: sunnavy <sunnavy at bestpractical.com>
Date:   Thu Jun 16 15:35:57 2011 +0800

    more tests for RT::I18N::SetMIMEEntityToEncoding

diff --git a/t/api/i18n_mime_encoding.t b/t/api/i18n_mime_encoding.t
new file mode 100644
index 0000000..7918d23
--- /dev/null
+++ b/t/api/i18n_mime_encoding.t
@@ -0,0 +1,33 @@
+use warnings;
+use strict;
+
+use RT::Test nodata => 1, tests => undef;
+use RT::I18N;
+use Encode;
+use Test::Warn;
+
+diag "normal mime encoding conversion: utf8 => iso-8859-1";
+{
+    my $mime = MIME::Entity->build(
+        Type => 'text/plain; charset=utf-8',
+        Data => ['Ã€ä¸æ–‡'],
+    );
+
+    warning_like {
+        RT::I18N::SetMIMEEntityToEncoding( $mime, 'iso-8859-1', );
+    }
+    [ qr/does not map to iso-8859-1/ ], 'got one "not map" error';
+    is( $mime->stringify_body, 'Ã€ä¸æ–‡', 'body is not changed' );
+    is( $mime->head->mime_attr('Content-Type'), 'application/octet-stream' );
+}
+
+diag "mime encoding conversion: utf8 => iso-8859-1";
+{
+    my $mime = MIME::Entity->build(
+        Type => 'text/plain; charset=utf-8',
+        Data => ['Ã€ä¸æ–‡'],
+    );
+    is( $mime->stringify_body, 'Ã€ä¸æ–‡', 'body is not changed' );
+}
+
+done_testing;
diff --git a/t/mail/sendmail.t b/t/mail/sendmail.t
index 327d2ad..e4aaa9c 100644
--- a/t/mail/sendmail.t
+++ b/t/mail/sendmail.t
@@ -2,7 +2,7 @@ use strict;
 use warnings;
 use File::Spec ();
 
-use RT::Test tests => 174;
+use RT::Test tests => undef;
 
 use RT::EmailParser;
 use RT::Tickets;
@@ -323,31 +323,8 @@ $parser->ParseMIMEEntityFromScalar($content);
 
  %args =        (message => $content, queue => 1, action => 'correspond');
 
-{
-
-my @warnings;
-local $SIG{__WARN__} = sub {
-    push @warnings, "@_";
-};
-
 RT::Interface::Email::Gateway(\%args);
 
-TODO: {
-        local $TODO =
-'need a better approach of encoding converter, should be fixed in 4.2';
-ok( @warnings == 1 || @warnings == 2, "1 or 2 warnings are ok" );
-ok( @warnings == 1 || ( @warnings == 2 && $warnings[1] eq $warnings[0] ),
-    'if there are 2 warnings, they should be same' );
-
-like(
-    $warnings[0],
-    qr/\QEncoding error: "\x{041f}" does not map to iso-8859-1/,
-"The badly formed Russian spam we have isn't actually well-formed UTF8, which makes Encode (correctly) warn",
-);
-
-}
-}
-
  $tickets = RT::Tickets->new(RT->SystemUser);
 $tickets->OrderBy(FIELD => 'id', ORDER => 'DESC');
 $tickets->Limit(FIELD => 'id' ,OPERATOR => '>', VALUE => '0');
@@ -556,3 +533,5 @@ diag q{regression test for #5248 from rt3.fsck.com};
 
 # Don't taint the environment
 $everyone->PrincipalObj->RevokeRight(Right =>'SuperUser');
+
+done_testing();
diff --git a/t/mail/wrong_mime_charset.t b/t/mail/wrong_mime_charset.t
index 530b5f3..5ca6dce 100644
--- a/t/mail/wrong_mime_charset.t
+++ b/t/mail/wrong_mime_charset.t
@@ -1,6 +1,6 @@
 use strict;
 use warnings;
-use RT::Test nodb => 1, tests => 6;
+use RT::Test nodb => 1, tests => undef;
 
 use_ok('RT::I18N');
 use utf8;
@@ -22,10 +22,6 @@ local $SIG{__WARN__} = sub {
 
 RT::I18N::SetMIMEEntityToEncoding( $mime, 'iso-8859-1' );
 
-TODO: {
-        local $TODO =
-'need a better approach of encoding converter, should be fixed in 4.2';
-
 # this is a weird behavior for different perl versions, 5.12 warns twice,
 # which is correct since we do the encoding thing twice, for Subject
 # and Data respectively.
@@ -46,4 +42,5 @@ is( $subject, $test_string, 'subject is set to iso-8859-1' );
 my $body = decode( 'iso-8859-1', $mime->stringify_body );
 chomp $body;
 is( $body, $test_string, 'body is set to iso-8859-1' );
-}
+
+done_testing;

-----------------------------------------------------------------------