[Rt-commit] rt branch, 4.2/old-html-encoding, created. rt-4.2.4rc1-11-ge162a19

Tue May 13 12:34:39 EDT 2014

The branch, 4.2/old-html-encoding has been created
        at  e162a191d7d3d2952410788ee6288ba3215f3273 (commit)

- Log -----------------------------------------------------------------
commit 1d8ef1cdc591d0d4f0723a318f6283012310c40d
Author: Alex Vandiver <alexmv at bestpractical.com>
Date:   Mon May 12 16:48:32 2014 -0400

    Re-indent _EncodeLOB and _DecodeLOB

diff --git a/lib/RT/Record.pm b/lib/RT/Record.pm
index b000209..957d30c 100644
--- a/lib/RT/Record.pm
+++ b/lib/RT/Record.pm
@@ -763,72 +763,71 @@ evaluate and encode it.  It will return an octet string.
 =cut
 
 sub _EncodeLOB {
-        my $self = shift;
-        my $Body = shift;
-        my $MIMEType = shift || '';
-        my $Filename = shift;
+    my $self = shift;
+    my $Body = shift;
+    my $MIMEType = shift || '';
+    my $Filename = shift;
 
-        my $ContentEncoding = 'none';
+    my $ContentEncoding = 'none';
 
-        #get the max attachment length from RT
-        my $MaxSize = RT->Config->Get('MaxAttachmentSize');
+    #get the max attachment length from RT
+    my $MaxSize = RT->Config->Get('MaxAttachmentSize');
 
-        #if the current attachment contains nulls and the
-        #database doesn't support embedded nulls
+    #if the current attachment contains nulls and the
+    #database doesn't support embedded nulls
 
-        if ( ( !$RT::Handle->BinarySafeBLOBs ) && ( $Body =~ /\x00/ ) ) {
+    if ( ( !$RT::Handle->BinarySafeBLOBs ) && ( $Body =~ /\x00/ ) ) {
 
-            # set a flag telling us to mimencode the attachment
-            $ContentEncoding = 'base64';
+        # set a flag telling us to mimencode the attachment
+        $ContentEncoding = 'base64';
 
-            #cut the max attchment size by 25% (for mime-encoding overhead.
-            $RT::Logger->debug("Max size is $MaxSize");
-            $MaxSize = $MaxSize * 3 / 4;
-        # Some databases (postgres) can't handle non-utf8 data
-        } elsif (    !$RT::Handle->BinarySafeBLOBs
-                  && $Body =~ /\P{ASCII}/
-                  && !Encode::is_utf8( $Body, 1 ) ) {
-              $ContentEncoding = 'quoted-printable';
-        }
+        #cut the max attchment size by 25% (for mime-encoding overhead.
+        $RT::Logger->debug("Max size is $MaxSize");
+        $MaxSize = $MaxSize * 3 / 4;
+    # Some databases (postgres) can't handle non-utf8 data
+    } elsif (    !$RT::Handle->BinarySafeBLOBs
+              && $Body =~ /\P{ASCII}/
+              && !Encode::is_utf8( $Body, 1 ) ) {
+          $ContentEncoding = 'quoted-printable';
+    }
 
-        #if the attachment is larger than the maximum size
-        if ( ($MaxSize) and ( $MaxSize < length($Body) ) ) {
+    #if the attachment is larger than the maximum size
+    if ( ($MaxSize) and ( $MaxSize < length($Body) ) ) {
 
-            # if we're supposed to truncate large attachments
-            if (RT->Config->Get('TruncateLongAttachments')) {
+        # if we're supposed to truncate large attachments
+        if (RT->Config->Get('TruncateLongAttachments')) {
 
-                # truncate the attachment to that length.
-                $Body = substr( $Body, 0, $MaxSize );
+            # truncate the attachment to that length.
+            $Body = substr( $Body, 0, $MaxSize );
 
-            }
+        }
 
-            # elsif we're supposed to drop large attachments on the floor,
-            elsif (RT->Config->Get('DropLongAttachments')) {
+        # elsif we're supposed to drop large attachments on the floor,
+        elsif (RT->Config->Get('DropLongAttachments')) {
 
-                # drop the attachment on the floor
-                $RT::Logger->info( "$self: Dropped an attachment of size "
-                                   . length($Body));
-                $RT::Logger->info( "It started: " . substr( $Body, 0, 60 ) );
-                $Filename .= ".txt" if $Filename;
-                return ("none", "Large attachment dropped", "text/plain", $Filename );
-            }
+            # drop the attachment on the floor
+            $RT::Logger->info( "$self: Dropped an attachment of size "
+                               . length($Body));
+            $RT::Logger->info( "It started: " . substr( $Body, 0, 60 ) );
+            $Filename .= ".txt" if $Filename;
+            return ("none", "Large attachment dropped", "text/plain", $Filename );
         }
+    }
 
-        # if we need to mimencode the attachment
-        if ( $ContentEncoding eq 'base64' ) {
+    # if we need to mimencode the attachment
+    if ( $ContentEncoding eq 'base64' ) {
 
-            # base64 encode the attachment
-            Encode::_utf8_off($Body);
-            $Body = MIME::Base64::encode_base64($Body);
-
-        } elsif ($ContentEncoding eq 'quoted-printable') {
-            Encode::_utf8_off($Body);
-            $Body = MIME::QuotedPrint::encode($Body);
-        }
+        # base64 encode the attachment
+        Encode::_utf8_off($Body);
+        $Body = MIME::Base64::encode_base64($Body);
 
+    } elsif ($ContentEncoding eq 'quoted-printable') {
+        Encode::_utf8_off($Body);
+        $Body = MIME::QuotedPrint::encode($Body);
+    }
 
-        return ($ContentEncoding, $Body, $MIMEType, $Filename );
 
+    return ($ContentEncoding, $Body, $MIMEType, $Filename );
 }
 
 =head2 _DecodeLOB
@@ -868,9 +867,9 @@ sub _DecodeLOB {
         return ( $self->loc( "Unknown ContentEncoding [_1]", $ContentEncoding ) );
     }
     if ( RT::I18N::IsTextualContentType($ContentType) ) {
-       $Content = Encode::decode('UTF-8',$Content,Encode::FB_PERLQQ) unless Encode::is_utf8($Content);
+        $Content = Encode::decode('UTF-8',$Content,Encode::FB_PERLQQ) unless Encode::is_utf8($Content);
     }
-        return ($Content);
+    return ($Content);
 }
 
 =head2 Update  ARGSHASH

commit 18c810d0822b5f433a9c7a070d90635092304171
Author: Alex Vandiver <alexmv at bestpractical.com>
Date:   Mon May 12 18:04:19 2014 -0400

    Respect the database Content-Type header in decoding textual parts
    
    The definition of "texual" data has changed over time.  Specifically,
    7365c08 caused text/html messages to begin being stored as utf-8 in the
    database; prior to that, the claimed "charset" and bytes in the body
    were left unmolested during insertion.
    
    text/html attachments inserted into the database prior to 7365c08,
    however, are now expected to be utf-8 when being extracted from the
    database.  This causes PERLQQ'd garbage to be displayed for the non-UTF8
    content stored in the database.  This type of error is likely to also
    re-occur in the future whenever the definition of "textual" data
    (i.e. data we transcode on insertion) changes.
    
    Respect the Content-Type header when decoding data from the database, or
    guess its value from the body; this mirrors the logic in
    RT::I18N::SetMIMEEntityToEncoding, which is what is done for
    currently-detected-as-textual parts on insert.  In cases like text/html
    prior to 7365c08, the Content-Type header was not altered during
    database insertion -- and at worst, the claimed character set is
    incorrect and decoding will result in PERLQQ'd garbage.  This is no
    worse than said message were detected, received, converted, and stored
    in the database as text.

diff --git a/lib/RT/Attachment.pm b/lib/RT/Attachment.pm
index f4d1d73..378e744 100644
--- a/lib/RT/Attachment.pm
+++ b/lib/RT/Attachment.pm
@@ -341,7 +341,7 @@ before returning it.
 sub Content {
     my $self = shift;
     return $self->_DecodeLOB(
-        $self->ContentType,
+        $self->GetHeader('Content-Type'),  # Includes charset, unlike ->ContentType
         $self->ContentEncoding,
         $self->_Value('Content', decode_utf8 => 0),
     );
diff --git a/lib/RT/Record.pm b/lib/RT/Record.pm
index 957d30c..fa2c7e4 100644
--- a/lib/RT/Record.pm
+++ b/lib/RT/Record.pm
@@ -830,7 +830,7 @@ sub _EncodeLOB {
     return ($ContentEncoding, $Body, $MIMEType, $Filename );
 }
 
-=head2 _DecodeLOB
+=head2 _DecodeLOB C<ContentType>, C<ContentEncoding>, C<Content>
 
 Unpacks data stored in the database, which may be base64 or QP encoded
 because of our need to store binary and badly encoded data in columns
@@ -846,6 +846,12 @@ This is similar to how we filter all data coming in via the web UI in
 RT::Interface::Web::DecodeARGS. This filter should only end up being
 applied to old data from less UTF-8-safe versions of RT.
 
+If the passed C<ContentType> includes a character set, that will be used
+to decode textual data; the default character set is UTF-8.  This is
+necessary because while we attempt to store textual data as UTF-8, the
+definition of "textual" has migrated over time, and thus we may now need
+to attempt to decode data that was previously not trancoded on insertion.
+
 Important Note - This function expects an octet string and returns a
 character string for non-binary data.
 
@@ -867,7 +873,13 @@ sub _DecodeLOB {
         return ( $self->loc( "Unknown ContentEncoding [_1]", $ContentEncoding ) );
     }
     if ( RT::I18N::IsTextualContentType($ContentType) ) {
-        $Content = Encode::decode('UTF-8',$Content,Encode::FB_PERLQQ) unless Encode::is_utf8($Content);
+        my $entity = MIME::Entity->new();
+        $entity->head->add("Content-Type", $ContentType);
+        $entity->bodyhandle( MIME::Body::Scalar->new( $Content ) );
+        my $charset = RT::I18N::_FindOrGuessCharset($entity);
+        $charset = 'utf-8' if not $charset or not Encode::find_encoding($charset);
+
+        $Content = Encode::decode($charset,$Content,Encode::FB_PERLQQ) unless Encode::is_utf8($Content);
     }
     return ($Content);
 }

commit d5f4b30723c90ddf553b56ea2d1a87e69b110db4
Author: Alex Vandiver <alexmv at bestpractical.com>
Date:   Tue May 13 12:15:20 2014 -0400

    Stop needlessly frobbing utf8 internals
    
    bf87c784 swapped in this explicit _utf8_off for what had been a
    _utf8_on.  As the values are fetched with decode_utf8 => 0, $content
    contains bytes, not characters.  Explicitly turning of fthe utf8 bit is
    unnecessary, as they are already bytes.

diff --git a/lib/RT/Attachment.pm b/lib/RT/Attachment.pm
index 378e744..0095515 100644
--- a/lib/RT/Attachment.pm
+++ b/lib/RT/Attachment.pm
@@ -385,10 +385,7 @@ sub OriginalContent {
         return( $self->loc("Unknown ContentEncoding [_1]", $self->ContentEncoding));
     }
 
-    # Turn *off* the SvUTF8 bits here so decode_utf8 and from_to below can work.
     local $@;
-    Encode::_utf8_off($content);
-
     if (!$enc || $enc eq '' ||  $enc eq 'utf8' || $enc eq 'utf-8') {
         # If we somehow fail to do the decode, at least push out the raw bits
         eval { return( Encode::decode_utf8($content)) } || return ($content);

commit 5a45f3b5e86a5567d17102313b1cdc89f379e4e2
Author: Alex Vandiver <alexmv at bestpractical.com>
Date:   Tue May 13 12:20:38 2014 -0400

    Decoding content, and returning characters, is incorrect
    
    The whole purpose of this function is to return bytes, not characters.
    Decoding the bytes to characters as utf8 defeats the purpose, and only
    works because RT's internal representation of characters happens to be
    utf8, which gets used by MIME::Entity to later encode the characters.
    
    Switch to simply returning the bytes, as-is.

diff --git a/lib/RT/Attachment.pm b/lib/RT/Attachment.pm
index 0095515..5e28d70 100644
--- a/lib/RT/Attachment.pm
+++ b/lib/RT/Attachment.pm
@@ -385,13 +385,12 @@ sub OriginalContent {
         return( $self->loc("Unknown ContentEncoding [_1]", $self->ContentEncoding));
     }
 
-    local $@;
     if (!$enc || $enc eq '' ||  $enc eq 'utf8' || $enc eq 'utf-8') {
-        # If we somehow fail to do the decode, at least push out the raw bits
-        eval { return( Encode::decode_utf8($content)) } || return ($content);
+        return ($content);
     }
 
-    eval { Encode::from_to($content, 'utf8' => $enc) } if $enc;
+    local $@;
+    eval { Encode::from_to($content, 'utf8' => $enc) };
     if ($@) {
         $RT::Logger->error("Could not convert attachment from assumed utf8 to '$enc' :".$@);
     }

commit e162a191d7d3d2952410788ee6288ba3215f3273
Author: Alex Vandiver <alexmv at bestpractical.com>
Date:   Tue May 13 12:23:34 2014 -0400

    Stop assuming the data in the database is utf8
    
    As noted in 18c810d0, not all content we currently call "texual" was
    always treated as such.  When re-encoding, do not assume that the
    encoding in the database is UTF-8 -- rather, read the Content-Type
    header, and examine the charset stated there.  Convert from that to the
    original encoding.

diff --git a/lib/RT/Attachment.pm b/lib/RT/Attachment.pm
index 5e28d70..c7aecb5 100644
--- a/lib/RT/Attachment.pm
+++ b/lib/RT/Attachment.pm
@@ -372,7 +372,6 @@ sub OriginalContent {
     }
 
     return $self->Content unless RT::I18N::IsTextualContentType($self->ContentType);
-    my $enc = $self->OriginalEncoding;
 
     my $content;
     if ( !$self->ContentEncoding || $self->ContentEncoding eq 'none' ) {
@@ -385,14 +384,20 @@ sub OriginalContent {
         return( $self->loc("Unknown ContentEncoding [_1]", $self->ContentEncoding));
     }
 
-    if (!$enc || $enc eq '' ||  $enc eq 'utf8' || $enc eq 'utf-8') {
-        return ($content);
-    }
+    my $entity = MIME::Entity->new();
+    $entity->head->add("Content-Type", $self->GetHeader("Content-Type"));
+    $entity->bodyhandle( MIME::Body::Scalar->new( $content ) );
+    my $from = RT::I18N::_FindOrGuessCharset($entity);
+    $from = 'utf-8' if not $from or not Encode::find_encoding($from);
+
+    my $to = RT::I18N::_CanonicalizeCharset(
+        $self->OriginalEncoding || 'utf-8'
+    );
 
     local $@;
-    eval { Encode::from_to($content, 'utf8' => $enc) };
+    eval { Encode::from_to($content, $from => $to) };
     if ($@) {
-        $RT::Logger->error("Could not convert attachment from assumed utf8 to '$enc' :".$@);
+        $RT::Logger->error("Could not convert attachment from $from to $to: ".$@);
     }
     return $content;
 }

-----------------------------------------------------------------------