[Rt-commit] rt branch, 4.0/set-mime-encoding-fix, created. rt-4.0.1rc1-9-gd1730ac

Wed Jun 8 17:19:56 EDT 2011

The branch, 4.0/set-mime-encoding-fix has been created
        at  d1730ac6d33f3ba39c09e2f3a6134eda4ec0e37b (commit)

- Log -----------------------------------------------------------------
commit d18e7f38733163d0eea68025dcc6c314a8e45f05
Author: sunnavy <sunnavy at bestpractical.com>
Date:   Thu Jun 9 04:54:23 2011 +0800

    refactor SetMIMEHeaderToEncoding and SetMIMEEntityToEncoding
    
    2 important changes:
    
    1. add $force argument to convert strings without check.
    this is useful in SendEmail action as the original MIME object
    is always utf-8, the fallback stuff is useless there.
    
    2. fallback to the guessed charset instead of hardcoded iso-8859-1

diff --git a/lib/RT/Action/SendEmail.pm b/lib/RT/Action/SendEmail.pm
index cc63106..066e84d 100644
--- a/lib/RT/Action/SendEmail.pm
+++ b/lib/RT/Action/SendEmail.pm
@@ -211,9 +211,11 @@ sub Prepare {
         $part->head->mime_attr( "Content-Type.charset" => 'utf-8' );
     }
 
+    # $MIMEObj is utf-8 encoded, so it's ok to force( 1 in the arguments )
+    # the conversion
     RT::I18N::SetMIMEEntityToEncoding( $MIMEObj,
         RT->Config->Get('EmailOutputEncoding'),
-        'mime_words_ok', );
+        'mime_words_ok', 1 );
 
     # Build up a MIME::Entity that looks like the original message.
     $self->AddAttachments if ( $MIMEObj->head->get('RT-Attach-Message')
diff --git a/lib/RT/I18N.pm b/lib/RT/I18N.pm
index a1f2af5..9887123 100644
--- a/lib/RT/I18N.pm
+++ b/lib/RT/I18N.pm
@@ -192,7 +192,7 @@ sub IsTextualContentType {
 }
 
 
-=head2 SetMIMEEntityToEncoding $entity, $encoding
+=head2 SetMIMEEntityToEncoding $entity, $encoding, $preserve_words, $force
 
 An utility function which will try to convert entity body into specified
 charset encoding (encoded as octets, *not* unicode-strings).  It will
@@ -200,26 +200,31 @@ iterate all the entities in $entity, and try to convert each one into
 specified charset if whose Content-Type is 'text/plain'.
 
 the methods are tries in order:
-1) to convert the entity to $encoding, 
-2) to interpret the entity as iso-8859-1 and then convert it to $encoding,
+1) convert the entity to $encoding, 
+2) guess the entity's encoding then convert it to $encoding,
 3) forcibly convert it to $encoding.
 
+if $force is true, 1 and 2 will be skipped.
+
 This function doesn't return anything meaningful.
 
 =cut
 
 sub SetMIMEEntityToEncoding {
-    my ( $entity, $enc, $preserve_words ) = ( shift, shift, shift );
+    my ( $entity, $enc, $preserve_words, $force ) =
+      ( shift, shift, shift, shift );
 
     # do the same for parts first of all
-    SetMIMEEntityToEncoding( $_, $enc, $preserve_words ) foreach $entity->parts;
+    SetMIMEEntityToEncoding( $_, $enc, $preserve_words, $force )
+      foreach $entity->parts;
 
     my $charset = _FindOrGuessCharset($entity) or return;
 
     SetMIMEHeadToEncoding(
 	$entity->head,
 	_FindOrGuessCharset($entity, 1) => $enc,
-	$preserve_words
+	$preserve_words,
+    $force,
     );
 
     my $head = $entity->head;
@@ -246,31 +251,48 @@ sub SetMIMEEntityToEncoding {
         my $orig_string = $string;
 
         # Convert the body
-        eval {
-            $RT::Logger->debug( "Converting '$charset' to '$enc' for "
-                  . $head->mime_type . " - "
-                  . ( $head->get('subject') || 'Subjectless message' ) );
-            Encode::from_to( $string, $charset => $enc, Encode::FB_CROAK );
-        };
-
-        if ($@) {
-            $RT::Logger->error( "Encoding error: " 
-                  . $@
-                  . " falling back to iso-8859-1 => $enc" );
-            $string = $orig_string;
+        $RT::Logger->debug( "Converting '$charset' to '$enc' for "
+              . $head->mime_type . " - "
+              . ( $head->get('subject') || 'Subjectless message' ) );
+
+        if ( $force ) {
+            Encode::from_to( $string, $charset => $enc );
+        }
+        else {
             eval {
-                Encode::from_to(
-                    $string,
-                    'iso-8859-1' => $enc,
-                    Encode::FB_CROAK
-                );
+                Encode::from_to( $string, $charset => $enc, Encode::FB_CROAK );
             };
+
             if ($@) {
-                $RT::Logger->error( "Encoding error: " 
+                $RT::Logger->error( "Encoding error: "
                       . $@
-                      . " forcing conversion to $charset => $enc" );
+                      . " falling back to iso-8859-1 => $enc" );
                 $string = $orig_string;
-                Encode::from_to( $string, $charset => $enc );
+
+                my $guess = _GuessCharset( $string );
+                my $success;
+
+                if ( $guess eq $enc ) {
+                    $success = 1;
+                }
+                else {
+                    eval {
+                        Encode::from_to(
+                            $string,
+                            $guess => $enc,
+                            Encode::FB_CROAK
+                        );
+                    };
+                    $success = !$@;
+                }
+
+                if ( !$success ) {
+                    $RT::Logger->error( "Encoding error: "
+                          . $@
+                          . " forcing conversion to $charset => $enc" );
+                    $string = $orig_string;
+                    Encode::from_to( $string, $charset => $enc );
+                }
             }
         }
 
@@ -560,7 +582,7 @@ sub _CanonicalizeCharset {
 }
 
 
-=head2 SetMIMEHeadToEncoding HEAD OLD_CHARSET NEW_CHARSET
+=head2 SetMIMEHeadToEncoding HEAD OLD_CHARSET NEW_CHARSET PRESERVE_WORDS FORCE
 
 Converts a MIME Head from one encoding to another. This totally violates the RFC.
 We should never need this. But, Surprise!, MUAs are badly broken and do this kind of stuff
@@ -570,7 +592,8 @@ all the time
 =cut
 
 sub SetMIMEHeadToEncoding {
-    my ( $head, $charset, $enc, $preserve_words ) = ( shift, shift, shift, shift );
+    my ( $head, $charset, $enc, $preserve_words, $force ) =
+      ( shift, shift, shift, shift, shift );
 
     $charset = _CanonicalizeCharset($charset);
     $enc     = _CanonicalizeCharset($enc);
@@ -585,27 +608,47 @@ sub SetMIMEHeadToEncoding {
             Encode::_utf8_off($value);
             my $orig_value = $value;
             if ( $charset ne $enc ) {
-                eval {
-                    Encode::from_to( $value, $charset => $enc, Encode::FB_CROAK );
-                };
-                if ($@) {
-                    $RT::Logger->error( "Encoding error: " 
-                          . $@
-                          . " falling back to iso-8859-1 => $enc" );
-                    $value = $orig_value;
+                if ( $force ) {
+                    Encode::from_to( $value, $charset => $enc );
+                }
+                else {
                     eval {
                         Encode::from_to(
                             $value,
-                            'iso-8859-1' => $enc,
+                            $charset => $enc,
                             Encode::FB_CROAK
                         );
                     };
                     if ($@) {
-                        $RT::Logger->error( "Encoding error: " 
+
+                        my $guess = _GuessCharset( $orig_value );
+                        $RT::Logger->error( "Encoding error: "
                               . $@
-                              . " forcing conversion to $charset => $enc" );
+                              . " falling back to $guess => $enc" );
                         $value = $orig_value;
-                        Encode::from_to( $value, $charset => $enc );
+
+                        my $success;
+                        if ( $guess eq $enc ) {
+                            $success = 1;
+                        }
+                        else {
+                            eval {
+                                Encode::from_to(
+                                    $value,
+                                    $guess => $enc,
+                                    Encode::FB_CROAK
+                                );
+                            };
+                            $success = !$@;
+                        }
+
+                        if ( !$success ) {
+                            $RT::Logger->error( "Encoding error: "
+                                  . $@
+                                  . " forcing conversion to $charset => $enc" );
+                            $value = $orig_value;
+                            Encode::from_to( $value, $charset => $enc );
+                        }
                     }
                 }
             }
diff --git a/t/mail/sendmail.t b/t/mail/sendmail.t
index 05a59bd..05d900d 100644
--- a/t/mail/sendmail.t
+++ b/t/mail/sendmail.t
@@ -3,7 +3,7 @@
 use strict;
 use File::Spec ();
 
-use RT::Test tests => 139;
+use RT::Test tests => 138;
 use Test::Warn;
 
 use RT::EmailParser;
@@ -316,15 +316,7 @@ $parser->ParseMIMEEntityFromScalar($content);
 
  %args =        (message => $content, queue => 1, action => 'correspond');
 
-warnings_like {
  RT::Interface::Email::Gateway(\%args);
-}
-[
-    qr/Encoding error: "\\x\{041f\}" does not map to iso-8859-1 .*/,
-    qr/Encoding error: "\\x\{041f\}" does not map to iso-8859-1 .*/
-    ],
-"The badly formed Russian spam we have isn't actually well-formed UTF8, which makes Encode (correctly) warn";
-
 
  $tickets = RT::Tickets->new(RT->SystemUser);
 $tickets->OrderBy(FIELD => 'id', ORDER => 'DESC');
diff --git a/t/mail/wrong_mime_charset.t b/t/mail/wrong_mime_charset.t
index 7636de4..4739240 100644
--- a/t/mail/wrong_mime_charset.t
+++ b/t/mail/wrong_mime_charset.t
@@ -1,7 +1,7 @@
 #!/usr/bin/perl
 use strict;
 use warnings;
-use RT::Test nodb => 1, tests => 6;
+use RT::Test nodb => 1, tests => 4;
 
 use_ok('RT::I18N');
 use utf8;
@@ -23,18 +23,8 @@ local $SIG{__WARN__} = sub {
 
 RT::I18N::SetMIMEEntityToEncoding( $mime, 'iso-8859-1' );
 
-# this is a weird behavior for different perl versions, 5.12 warns twice,
-# which is correct since we do the encoding thing twice, for Subject
-# and Data respectively.
-# but 5.8 and 5.10 warns only once.
-ok( @warnings == 1 || @warnings == 2, "1 or 2 warnings are ok" );
-ok(
-    @warnings == 1 || $warnings[1] eq $warnings[0],
-    'if there are 2 warnings, they should be same'
-);
-
 like(
-    $warnings[0],
+    join( '', @warnings ),
     qr/\QEncoding error: "\x{fffd}" does not map to iso-8859-1/,
 "We can't encode something into the wrong encoding without Encode complaining"
 );

commit d1730ac6d33f3ba39c09e2f3a6134eda4ec0e37b
Author: sunnavy <sunnavy at bestpractical.com>
Date:   Thu Jun 9 05:16:59 2011 +0800

    more tests for RT::I18N::SetMIMEEntityToEncoding

diff --git a/t/api/i18n_mime_encoding.t b/t/api/i18n_mime_encoding.t
new file mode 100644
index 0000000..5109633
--- /dev/null
+++ b/t/api/i18n_mime_encoding.t
@@ -0,0 +1,44 @@
+use warnings;
+use strict;
+
+use RT::Test nodata => 1, tests => 4;
+use RT::I18N;
+use Encode;
+
+my @warnings;
+local $SIG{__WARN__} = sub {
+    push @warnings, "@_";
+};
+
+my $result = encode( 'iso-8859-1', decode_utf8('Ã€??') );
+
+diag "normal mime encoding conversion: utf8 => iso-8859-1";
+{
+    my $mime = MIME::Entity->build(
+        Type => 'text/plain; charset=utf-8',
+        Data => ['Ã€ä¸æ–‡'],
+    );
+
+    RT::I18N::SetMIMEEntityToEncoding( $mime, 'iso-8859-1', );
+    like(
+        join( '', @warnings ),
+        qr/does not map to iso-8859-1/,
+        'get no-map warning'
+    );
+    is( $mime->stringify_body, $result,
+        'invalid chars in mail are replaced by ?' );
+    @warnings = ();
+}
+
+diag "force mime encoding conversion: utf8 => iso-8859-1";
+{
+    my $mime     = MIME::Entity->build(
+        Type => 'text/plain; charset=utf-8',
+        Data => ['Ã€ä¸æ–‡'],
+    );
+    RT::I18N::SetMIMEEntityToEncoding( $mime, 'iso-8859-1', '', 1 );
+    is( scalar @warnings, 0, 'no warnings with force' );
+    is( $mime->stringify_body, $result,
+        'invalid chars in mail are replaced by ?' );
+}
+

-----------------------------------------------------------------------