[Rt-commit] rt branch, 3.9-trunk, updated. rt-3.9.7-1039-g3798272

Fri Dec 24 04:58:10 EST 2010

The branch, 3.9-trunk has been updated
       via  379827282fa8ff8a485704cedeb9c66222a11a89 (commit)
       via  a8130dd1a8a62926962a97d4a71826baff6bd61b (commit)
       via  0ec2fc6bfc62e62a96e59926102e15980b81d31c (commit)
      from  e9e17611cb7d024eaf6a362783899de48583cd4a (commit)

Summary of changes:
 etc/RT_Config.pm.in |    5 +-
 lib/RT/I18N.pm      |  103 ++++++++++++++++++++++++++++++++------------------
 t/api/i18n_guess.t  |   47 +++++++++++++++++++++++
 3 files changed, 116 insertions(+), 39 deletions(-)
 create mode 100644 t/api/i18n_guess.t

- Log -----------------------------------------------------------------
commit 0ec2fc6bfc62e62a96e59926102e15980b81d31c
Author: Ruslan Zakirov <ruz at bestpractical.com>
Date:   Fri Dec 24 12:25:53 2010 +0300

    add tests for encoding detection

diff --git a/t/api/i18n_guess.t b/t/api/i18n_guess.t
new file mode 100644
index 0000000..b07941c
--- /dev/null
+++ b/t/api/i18n_guess.t
@@ -0,0 +1,47 @@
+#!/usr/bin/perl
+
+use strict;
+use warnings;
+
+use RT::Test tests => 9;
+
+use Encode qw(encode);
+
+use constant HAS_ENCODE_GUESS => do { local $@; eval { require Encode::Guess; 1 } };
+use constant HAS_ENCODE_DETECT => do { local $@; eval { require Encode::Detect::Detector; 1 } };
+
+my $string = "\x{442}\x{435}\x{441}\x{442} \x{43f}\x{43e}\x{434}\x{434}\x{435}\x{440}\x{436}\x{43a}\x{430}";
+
+sub guess {
+    is( RT::I18N::_GuessCharset( Encode::encode($_[0], $_[1]) ), $_[0], 'correct guess' );
+}
+
+RT->Config->Set(EmailInputEncodings => qw(*));
+SKIP: {
+    skip "No Encode::Detect", 3 unless HAS_ENCODE_DETECT;
+    guess('utf-8', $string);
+    guess('cp1251', $string);
+    guess('koi8-r', $string);
+}
+
+RT->Config->Set(EmailInputEncodings => qw(UTF-8 cp1251 koi8-r));
+SKIP: {
+    skip "No Encode::Guess", 3 unless HAS_ENCODE_GUESS;
+    guess('utf-8', $string);
+    {
+        local $TODO = 'Encode::Guess can not distinguish cp1251 from koi8-r';
+        # we can not todo one test here as it's depends on hash order and
+        # varies from system to system
+        guess('cp1251', $string);
+        guess('koi8-r', $string);
+    }
+}
+
+RT->Config->Set(EmailInputEncodings => qw(* UTF-8 cp1251 koi8-r));
+SKIP: {
+    skip "No Encode::Detect", 3 unless HAS_ENCODE_DETECT;
+    guess('utf-8', $string);
+    guess('cp1251', $string);
+    guess('koi8-r', $string);
+}
+

commit a8130dd1a8a62926962a97d4a71826baff6bd61b
Author: Ruslan Zakirov <ruz at bestpractical.com>
Date:   Fri Dec 24 12:27:30 2010 +0300

    optionally use Encode::Detect for encoding guess
    
    * it's Mozilla's code that works in Gecko browsers
    * it can distinguish cp1251 from koi8-r
    * it can guess without being given a list of encodings

diff --git a/lib/RT/I18N.pm b/lib/RT/I18N.pm
index b4e05fa..10b2482 100755
--- a/lib/RT/I18N.pm
+++ b/lib/RT/I18N.pm
@@ -460,50 +460,79 @@ use Encode::Guess to try to figure it out the string's encoding.
 
 =cut
 
+use constant HAS_ENCODE_GUESS => do { local $@; eval { require Encode::Guess; 1 } };
+use constant HAS_ENCODE_DETECT => do { local $@; eval { require Encode::Detect::Detector; 1 } };
+
 sub _GuessCharset {
-    my $fallback = 'iso-8859-1';
+    my $fallback = _CanonicalizeCharset('iso-8859-1');
 
     # if $_[0] is null/empty, we don't guess its encoding
-    return $fallback unless defined $_[0] && length $_[0];
+    return $fallback
+        unless defined $_[0] && length $_[0];
 
-    my $charset;
     my @encodings = RT->Config->Get('EmailInputEncodings');
-    if ( @encodings and eval { require Encode::Guess; 1 } ) {
-	Encode::Guess->set_suspects( @encodings );
-	my $decoder = Encode::Guess->guess( $_[0] );
-
-      if ( defined($decoder) ) {
-	if ( ref $decoder ) {
-	    $charset = $decoder->name;
-	    $RT::Logger->debug("Guessed encoding: $charset");
-	    return $charset;
-	}
-	elsif ($decoder =~ /(\S+ or .+)/) {
-	    my %matched = map { $_ => 1 } split(/ or /, $1);
-	    return 'utf-8' if $matched{'utf8'}; # one and only normalization
-
-	    foreach my $suspect (RT->Config->Get('EmailInputEncodings')) {
-		next unless $matched{$suspect};
-		$RT::Logger->debug("Encode::Guess ambiguous ($decoder); using $suspect");
-		$charset = $suspect;
-		last;
-	    }
-	}
-	else {
-	    $RT::Logger->warning("Encode::Guess failed: $decoder; fallback to $fallback");
-	}
-      }
-      else {
-	  $RT::Logger->warning("Encode::Guess failed: decoder is undefined; fallback to $fallback");
-      }
-    }
-    elsif ( @encodings && $@ ) {
-        $RT::Logger->error("You have set EmailInputEncodings, but we couldn't load Encode::Guess: $@");
-    } else {
+    unless ( @encodings ) {
         $RT::Logger->warning("No EmailInputEncodings set, fallback to $fallback");
+        return $fallback;
+    }
+
+    if ( $encodings[0] eq '*' ) {
+        shift @encodings;
+        if ( HAS_ENCODE_DETECT ) {
+            my $charset = Encode::Detect::Detector::detect( $_[0] );
+            if ( $charset ) {
+                $RT::Logger->debug("Encode::Detect::Detector guessed encoding: $charset");
+                return _CanonicalizeCharset( Encode::resolve_alias( $charset ) );
+            }
+            else {
+                $RT::Logger->debug("Encode::Detect::Detector failed to guess encoding");
+            }
+        }
+        else {
+	    $RT::Logger->error(
+                "You requested to guess encoding, but we couldn't"
+                ." load Encode::Detect::Detector module"
+            );
+        }
+    }
+
+    unless ( @encodings ) {
+        $RT::Logger->warning("No EmailInputEncodings set except '*', fallback to $fallback");
+        return $fallback;
+    }
+
+    unless ( HAS_ENCODE_GUESS ) {
+        $RT::Logger->error("We couldn't load Encode::Guess module, fallback to $fallback");
+        return $fallback;
+    }
+
+    Encode::Guess->set_suspects( @encodings );
+    my $decoder = Encode::Guess->guess( $_[0] );
+    unless ( defined $decoder ) {
+        $RT::Logger->warning("Encode::Guess failed: decoder is undefined; fallback to $fallback");
+        return $fallback;
+    }
+
+    if ( ref $decoder ) {
+        my $charset = $decoder->name;
+        $RT::Logger->debug("Encode::Guess guessed encoding: $charset");
+        return _CanonicalizeCharset( $charset );
+    }
+    elsif ($decoder =~ /(\S+ or .+)/) {
+        my %matched = map { $_ => 1 } split(/ or /, $1);
+        return 'utf-8' if $matched{'utf8'}; # one and only normalization
+
+        foreach my $suspect (RT->Config->Get('EmailInputEncodings')) {
+            next unless $matched{$suspect};
+            $RT::Logger->debug("Encode::Guess ambiguous ($decoder); using $suspect");
+            return _CanonicalizeCharset( $suspect );
+        }
+    }
+    else {
+        $RT::Logger->warning("Encode::Guess failed: $decoder; fallback to $fallback");
     }
 
-    return _CanonicalizeCharset($charset || $fallback);
+    return $fallback;
 }
 
 =head2 _CanonicalizeCharset NAME
@@ -517,7 +546,7 @@ sub _CanonicalizeCharset {
     my $charset = lc shift;
     return $charset unless $charset;
 
-    if ( $charset eq 'utf8' ) {
+    if ( $charset eq 'utf8' || $charset eq 'utf-8-strict' ) {
         return 'utf-8';
     }
     elsif ( $charset eq 'gb2312' ) {

commit 379827282fa8ff8a485704cedeb9c66222a11a89
Author: Ruslan Zakirov <ruz at bestpractical.com>
Date:   Fri Dec 24 12:30:08 2010 +0300

    describe new feature in EmailInputEncodings option

diff --git a/etc/RT_Config.pm.in b/etc/RT_Config.pm.in
index df27c87..5d5cbe0 100755
--- a/etc/RT_Config.pm.in
+++ b/etc/RT_Config.pm.in
@@ -1756,8 +1756,9 @@ Set(@LexiconLanguages, qw(*));
 =item C<@EmailInputEncodings>
 
 An array that contains default encodings used to guess which charset
-an attachment uses if not specified.  Must be recognized by
-L<Encode::Guess>.
+an attachment uses if not specified. Must be recognized by
+L<Encode::Guess>. First element can be '*' that enables encoding
+detection using L<Encode::Detect::Detector> if it's installed.
 
 =cut
 

-----------------------------------------------------------------------