[Rt-commit] rt branch, 3.9-trunk, updated. rt-3.9.7-1039-g3798272
Ruslan Zakirov
ruz at bestpractical.com
Fri Dec 24 04:58:10 EST 2010
The branch, 3.9-trunk has been updated
via 379827282fa8ff8a485704cedeb9c66222a11a89 (commit)
via a8130dd1a8a62926962a97d4a71826baff6bd61b (commit)
via 0ec2fc6bfc62e62a96e59926102e15980b81d31c (commit)
from e9e17611cb7d024eaf6a362783899de48583cd4a (commit)
Summary of changes:
etc/RT_Config.pm.in | 5 +-
lib/RT/I18N.pm | 103 ++++++++++++++++++++++++++++++++------------------
t/api/i18n_guess.t | 47 +++++++++++++++++++++++
3 files changed, 116 insertions(+), 39 deletions(-)
create mode 100644 t/api/i18n_guess.t
- Log -----------------------------------------------------------------
commit 0ec2fc6bfc62e62a96e59926102e15980b81d31c
Author: Ruslan Zakirov <ruz at bestpractical.com>
Date: Fri Dec 24 12:25:53 2010 +0300
add tests for encoding detection
diff --git a/t/api/i18n_guess.t b/t/api/i18n_guess.t
new file mode 100644
index 0000000..b07941c
--- /dev/null
+++ b/t/api/i18n_guess.t
@@ -0,0 +1,47 @@
+#!/usr/bin/perl
+
+use strict;
+use warnings;
+
+use RT::Test tests => 9;
+
+use Encode qw(encode);
+
+use constant HAS_ENCODE_GUESS => do { local $@; eval { require Encode::Guess; 1 } };
+use constant HAS_ENCODE_DETECT => do { local $@; eval { require Encode::Detect::Detector; 1 } };
+
+my $string = "\x{442}\x{435}\x{441}\x{442} \x{43f}\x{43e}\x{434}\x{434}\x{435}\x{440}\x{436}\x{43a}\x{430}";
+
+sub guess {
+ is( RT::I18N::_GuessCharset( Encode::encode($_[0], $_[1]) ), $_[0], 'correct guess' );
+}
+
+RT->Config->Set(EmailInputEncodings => qw(*));
+SKIP: {
+ skip "No Encode::Detect", 3 unless HAS_ENCODE_DETECT;
+ guess('utf-8', $string);
+ guess('cp1251', $string);
+ guess('koi8-r', $string);
+}
+
+RT->Config->Set(EmailInputEncodings => qw(UTF-8 cp1251 koi8-r));
+SKIP: {
+ skip "No Encode::Guess", 3 unless HAS_ENCODE_GUESS;
+ guess('utf-8', $string);
+ {
+ local $TODO = 'Encode::Guess can not distinguish cp1251 from koi8-r';
+ # we cannot mark just one of these tests as TODO, because which one fails
+ # depends on hash order and varies from system to system
+ guess('cp1251', $string);
+ guess('koi8-r', $string);
+ }
+}
+
+RT->Config->Set(EmailInputEncodings => qw(* UTF-8 cp1251 koi8-r));
+SKIP: {
+ skip "No Encode::Detect", 3 unless HAS_ENCODE_DETECT;
+ guess('utf-8', $string);
+ guess('cp1251', $string);
+ guess('koi8-r', $string);
+}
+
commit a8130dd1a8a62926962a97d4a71826baff6bd61b
Author: Ruslan Zakirov <ruz at bestpractical.com>
Date: Fri Dec 24 12:27:30 2010 +0300
optionally use Encode::Detect for encoding guess
* it's mozilla's code that works in gecko browsers
* it can distinguish cp1251 from koi8-r
* it can guess without providing list of encodings
diff --git a/lib/RT/I18N.pm b/lib/RT/I18N.pm
index b4e05fa..10b2482 100755
--- a/lib/RT/I18N.pm
+++ b/lib/RT/I18N.pm
@@ -460,50 +460,79 @@ use Encode::Guess to try to figure it out the string's encoding.
=cut
+use constant HAS_ENCODE_GUESS => do { local $@; eval { require Encode::Guess; 1 } };
+use constant HAS_ENCODE_DETECT => do { local $@; eval { require Encode::Detect::Detector; 1 } };
+
sub _GuessCharset {
- my $fallback = 'iso-8859-1';
+ my $fallback = _CanonicalizeCharset('iso-8859-1');
# if $_[0] is null/empty, we don't guess its encoding
- return $fallback unless defined $_[0] && length $_[0];
+ return $fallback
+ unless defined $_[0] && length $_[0];
- my $charset;
my @encodings = RT->Config->Get('EmailInputEncodings');
- if ( @encodings and eval { require Encode::Guess; 1 } ) {
- Encode::Guess->set_suspects( @encodings );
- my $decoder = Encode::Guess->guess( $_[0] );
-
- if ( defined($decoder) ) {
- if ( ref $decoder ) {
- $charset = $decoder->name;
- $RT::Logger->debug("Guessed encoding: $charset");
- return $charset;
- }
- elsif ($decoder =~ /(\S+ or .+)/) {
- my %matched = map { $_ => 1 } split(/ or /, $1);
- return 'utf-8' if $matched{'utf8'}; # one and only normalization
-
- foreach my $suspect (RT->Config->Get('EmailInputEncodings')) {
- next unless $matched{$suspect};
- $RT::Logger->debug("Encode::Guess ambiguous ($decoder); using $suspect");
- $charset = $suspect;
- last;
- }
- }
- else {
- $RT::Logger->warning("Encode::Guess failed: $decoder; fallback to $fallback");
- }
- }
- else {
- $RT::Logger->warning("Encode::Guess failed: decoder is undefined; fallback to $fallback");
- }
- }
- elsif ( @encodings && $@ ) {
- $RT::Logger->error("You have set EmailInputEncodings, but we couldn't load Encode::Guess: $@");
- } else {
+ unless ( @encodings ) {
$RT::Logger->warning("No EmailInputEncodings set, fallback to $fallback");
+ return $fallback;
+ }
+
+ if ( $encodings[0] eq '*' ) {
+ shift @encodings;
+ if ( HAS_ENCODE_DETECT ) {
+ my $charset = Encode::Detect::Detector::detect( $_[0] );
+ if ( $charset ) {
+ $RT::Logger->debug("Encode::Detect::Detector guessed encoding: $charset");
+ return _CanonicalizeCharset( Encode::resolve_alias( $charset ) );
+ }
+ else {
+ $RT::Logger->debug("Encode::Detect::Detector failed to guess encoding");
+ }
+ }
+ else {
+ $RT::Logger->error(
+ "You requested to guess encoding, but we couldn't"
+ ." load Encode::Detect::Detector module"
+ );
+ }
+ }
+
+ unless ( @encodings ) {
+ $RT::Logger->warning("No EmailInputEncodings set except '*', fallback to $fallback");
+ return $fallback;
+ }
+
+ unless ( HAS_ENCODE_GUESS ) {
+ $RT::Logger->error("We couldn't load Encode::Guess module, fallback to $fallback");
+ return $fallback;
+ }
+
+ Encode::Guess->set_suspects( @encodings );
+ my $decoder = Encode::Guess->guess( $_[0] );
+ unless ( defined $decoder ) {
+ $RT::Logger->warning("Encode::Guess failed: decoder is undefined; fallback to $fallback");
+ return $fallback;
+ }
+
+ if ( ref $decoder ) {
+ my $charset = $decoder->name;
+ $RT::Logger->debug("Encode::Guess guessed encoding: $charset");
+ return _CanonicalizeCharset( $charset );
+ }
+ elsif ($decoder =~ /(\S+ or .+)/) {
+ my %matched = map { $_ => 1 } split(/ or /, $1);
+ return 'utf-8' if $matched{'utf8'}; # one and only normalization
+
+ foreach my $suspect (RT->Config->Get('EmailInputEncodings')) {
+ next unless $matched{$suspect};
+ $RT::Logger->debug("Encode::Guess ambiguous ($decoder); using $suspect");
+ return _CanonicalizeCharset( $suspect );
+ }
+ }
+ else {
+ $RT::Logger->warning("Encode::Guess failed: $decoder; fallback to $fallback");
}
- return _CanonicalizeCharset($charset || $fallback);
+ return $fallback;
}
=head2 _CanonicalizeCharset NAME
@@ -517,7 +546,7 @@ sub _CanonicalizeCharset {
my $charset = lc shift;
return $charset unless $charset;
- if ( $charset eq 'utf8' ) {
+ if ( $charset eq 'utf8' || $charset eq 'utf-8-strict' ) {
return 'utf-8';
}
elsif ( $charset eq 'gb2312' ) {
commit 379827282fa8ff8a485704cedeb9c66222a11a89
Author: Ruslan Zakirov <ruz at bestpractical.com>
Date: Fri Dec 24 12:30:08 2010 +0300
describe new feature in EmailInputEncodings option
diff --git a/etc/RT_Config.pm.in b/etc/RT_Config.pm.in
index df27c87..5d5cbe0 100755
--- a/etc/RT_Config.pm.in
+++ b/etc/RT_Config.pm.in
@@ -1756,8 +1756,9 @@ Set(@LexiconLanguages, qw(*));
=item C<@EmailInputEncodings>
An array that contains default encodings used to guess which charset
-an attachment uses if not specified. Must be recognized by
-L<Encode::Guess>.
+an attachment uses if not specified. Must be recognized by
+L<Encode::Guess>. First element can be '*' that enables encoding
+detection using L<Encode::Detect::Detector> if it's installed.
=cut
-----------------------------------------------------------------------
More information about the Rt-commit
mailing list