[Rt-commit] rt branch, 4.2/html-external-formatter, created. rt-4.2.5-50-g9a3e5c8
Alex Vandiver
alexmv at bestpractical.com
Wed Jun 18 18:50:31 EDT 2014
The branch, 4.2/html-external-formatter has been created
at 9a3e5c825611c6ac2cbbaf208cbed709dee97f8a (commit)
- Log -----------------------------------------------------------------
commit 9b26a726f022948bde4f7c7f155c37b874b5b1ab
Author: Alex Vandiver <alexmv at bestpractical.com>
Date: Thu Jan 2 21:37:16 2014 -0500
Use HTML::FormatExternal to allow external HTML → text formatters
The HTML::FormatText::WithLinks::AndTables module is unmaintained, and
contains multiple bugs which trigger runtime errors. Add
HTML::FormatExternal as an optional dependency for slower but more
reliable HTML to text conversion using external text-based web browsers
and text conversion tools.
diff --git a/etc/RT_Config.pm.in b/etc/RT_Config.pm.in
index 9740837..7cc19e3 100755
--- a/etc/RT_Config.pm.in
+++ b/etc/RT_Config.pm.in
@@ -713,6 +713,23 @@ will use the address of the current user and remove RT's subject tag.
Set($ForwardFromUser, 0);
+=item C<$HTMLFormatter>
+
+The external HTML formatter to use. The default is to try, in order,
+C<w3m>, C<elinks>, C<html2text>, C<links>, C<lynx>, and then fall back
+to the C<core> pure-perl formatter. The pure-perl formatter is
+significantly faster than the others, but may fail to successfully
+convert even on some relatively simple HTML. Falls back to C<core> if
+the configured program is not installed.
+
+Using formatters other than C<core> requires that the
+L<HTML::FormatExternal> module be installed; it is an optional
+dependency which RT will not install by default.
+
+=cut
+
+Set($HTMLFormatter, undef);
+
=back
=head2 Email dashboards
diff --git a/lib/RT/Config.pm b/lib/RT/Config.pm
index 7842e57..b00c4f1 100644
--- a/lib/RT/Config.pm
+++ b/lib/RT/Config.pm
@@ -635,6 +635,10 @@ our %META;
$self->Set( MailCommand => 'sendmailpipe' );
},
},
+ HTMLFormatter => {
+ Type => 'SCALAR',
+ PostLoadCheck => sub { RT::Interface::Email->_HTMLFormatter },
+ },
MailPlugins => {
Type => 'ARRAY',
PostLoadCheck => sub {
diff --git a/lib/RT/Interface/Email.pm b/lib/RT/Interface/Email.pm
index facdb38..286831e 100644
--- a/lib/RT/Interface/Email.pm
+++ b/lib/RT/Interface/Email.pm
@@ -50,6 +50,7 @@ package RT::Interface::Email;
use strict;
use warnings;
+use 5.010;
use Email::Address;
use MIME::Entity;
@@ -58,6 +59,7 @@ use File::Temp;
use UNIVERSAL::require;
use Mail::Mailer ();
use Text::ParseWords qw/shellwords/;
+use RT::Util 'safe_run_child';
BEGIN {
use base 'Exporter';
@@ -1793,9 +1795,64 @@ conversion fails.
=cut
sub ConvertHTMLToText {
+ return _HTMLFormatter()->(@_);
+}
+
+sub _HTMLFormatter {
+ state $formatter;
+ return $formatter if defined $formatter;
+
+ my $wanted = RT->Config->Get("HTMLFormatter");
+
+ my @order;
+ if ($wanted) {
+ @order = ($wanted, "core");
+ } else {
+ @order = ("w3m", "elinks", "html2text", "links", "lynx", "core");
+ }
+ # Always fall back to core, even if it is not listed
+ for my $prog (@order) {
+ if ($prog eq "core") {
+ RT->Logger->info("Using internal Perl HTML -> text conversion");
+ require HTML::FormatText::WithLinks::AndTables;
+ $formatter = \&_HTMLFormatText;
+ } else {
+ unless (HTML::FormatExternal->require) {
+ RT->Logger->warn("HTML::FormatExternal is not installed; falling back to internal perl formatter")
+ if $wanted;
+ next;
+ }
+
+ my $package = "HTML::FormatText::" . ucfirst($prog);
+ unless ($package->require) {
+ RT->Logger->warn("$prog is not a valid formatter provided by HTML::FormatExternal")
+ if $wanted;
+ next;
+ }
+
+ unless (defined $package->program_version) {
+ RT->Logger->warn("Could not find external '$prog' HTML formatter; is it installed?")
+ if $wanted;
+ next;
+ }
+
+ RT->Logger->info("Using $prog for HTML -> text conversion");
+ $formatter = sub {
+ my $html = shift;
+ RT::Util::safe_run_child {
+ $package->format_string($html, leftmargin => 0, rightmargin => 78);
+ };
+ };
+ }
+ RT->Config->Set( HTMLFormatter => $prog );
+ last;
+ }
+ return $formatter;
+}
+
+sub _HTMLFormatText {
my $html = shift;
- require HTML::FormatText::WithLinks::AndTables;
my $text;
eval {
$text = HTML::FormatText::WithLinks::AndTables->convert(
commit 056dfba3576720c4ed6f83f5307ac384c6566720
Author: Alex Vandiver <alexmv at bestpractical.com>
Date: Wed Jun 18 18:44:52 2014 -0400
Allow $HTMLFormatter to contain the full path
Some webservers unset or limit $ENV{PATH} when running; search in
/usr/bin by default, and further allow the setting to contain an
explicit full path to the executable.
diff --git a/etc/RT_Config.pm.in b/etc/RT_Config.pm.in
index 7cc19e3..70dd93f 100755
--- a/etc/RT_Config.pm.in
+++ b/etc/RT_Config.pm.in
@@ -726,6 +726,10 @@ Using formatters other than C<core> requires that the
L<HTML::FormatExternal> module be installed; it is an optional
dependency which RT will not install by default.
+If the chosen formatter is not in the webserver's $PATH nor in the
+standard F</usr/bin/>, you may set this option the full path to one of
+the aforementioned executables.
+
=cut
Set($HTMLFormatter, undef);
diff --git a/lib/RT/Interface/Email.pm b/lib/RT/Interface/Email.pm
index 286831e..3086097 100644
--- a/lib/RT/Interface/Email.pm
+++ b/lib/RT/Interface/Email.pm
@@ -1808,7 +1808,10 @@ sub _HTMLFormatter {
if ($wanted) {
@order = ($wanted, "core");
} else {
- @order = ("w3m", "elinks", "html2text", "links", "lynx", "core");
+ # Try a sane default full path, in case the webserver has a very
+ # limited $ENV{PATH}
+ @order = map {($_, "/usr/bin/$_")} qw/w3m elinks html2text links lynx/;
+ push @order, "core";
}
# Always fall back to core, even if it is not listed
for my $prog (@order) {
@@ -1823,6 +1826,7 @@ sub _HTMLFormatter {
next;
}
+ my $path = $prog =~ s{(.*/)}{} ? $1 : undef;
my $package = "HTML::FormatText::" . ucfirst($prog);
unless ($package->require) {
RT->Logger->warn("$prog is not a valid formatter provided by HTML::FormatExternal")
@@ -1830,8 +1834,15 @@ sub _HTMLFormatter {
next;
}
- unless (defined $package->program_version) {
- RT->Logger->warn("Could not find external '$prog' HTML formatter; is it installed?")
+ if ($path) {
+ local $ENV{PATH} = $path;
+ if (not defined $package->program_version) {
+ RT->Logger->warn("Could not find or run external '$prog' HTML formatter in $path$prog")
+ if $wanted;
+ next;
+ }
+ } elsif (not defined $package->program_version) {
+ RT->Logger->warn("Could not find or run external '$prog' HTML formatter in \$PATH ($ENV{PATH}) -- you may need to install it or provide the full path")
if $wanted;
next;
}
@@ -1840,6 +1851,7 @@ sub _HTMLFormatter {
$formatter = sub {
my $html = shift;
RT::Util::safe_run_child {
+ local $ENV{PATH} = $path || $ENV{PATH};
$package->format_string($html, leftmargin => 0, rightmargin => 78);
};
};
commit 4201d8f9511e35322fcb71a5e9b5ac2a92a3097e
Author: Alex Vandiver <alexmv at bestpractical.com>
Date: Thu Jan 2 21:57:14 2014 -0500
Add HTML::HTML5::ToText as a better pure-perl HTML → text converter
diff --git a/etc/RT_Config.pm.in b/etc/RT_Config.pm.in
index 70dd93f..8721302 100755
--- a/etc/RT_Config.pm.in
+++ b/etc/RT_Config.pm.in
@@ -715,16 +715,18 @@ Set($ForwardFromUser, 0);
=item C<$HTMLFormatter>
-The external HTML formatter to use. The default is to try, in order,
-C<w3m>, C<elinks>, C<html2text>, C<links>, C<lynx>, and then fall back
-to the C<core> pure-perl formatter. The pure-perl formatter is
+The external HTML formatter to use. The default is to try, in order, an
+improved perl formatter (C<perl>), the external programs C<w3m>,
+C<elinks>, C<html2text>, C<links>, or C<lynx>, and then fall back to the
+C<core> pure-perl formatter. The C<perl> and C<core> formatters are
significantly faster than the others, but may fail to successfully
convert even on some relatively simple HTML. Falls back to C<core> if
the configured program is not installed.
-Using formatters other than C<core> requires that the
-L<HTML::FormatExternal> module be installed; it is an optional
-dependency which RT will not install by default.
+Using the C<perl> formatter requires the L<HTML::HTML5::ToText> optional
+module be installed. Using external formatters (i.e. C<w3m>, etc)
+requires that the L<HTML::FormatExternal> module be installed. Neither
+of these modules are installed by RT by default.
If the chosen formatter is not in the webserver's $PATH nor in the
standard F</usr/bin/>, you may set this option the full path to one of
diff --git a/lib/RT/Interface/Email.pm b/lib/RT/Interface/Email.pm
index 3086097..dd1748e 100644
--- a/lib/RT/Interface/Email.pm
+++ b/lib/RT/Interface/Email.pm
@@ -1819,6 +1819,24 @@ sub _HTMLFormatter {
RT->Logger->info("Using internal Perl HTML -> text conversion");
require HTML::FormatText::WithLinks::AndTables;
$formatter = \&_HTMLFormatText;
+ } elsif ($prog eq "perl") {
+ unless (HTML::HTML5::ToText->require) {
+ RT->Logger->warn("Could not load HTML::HTML5::ToText; install it from CPAN")
+ if $wanted;
+ next;
+ }
+ my $obj = HTML::HTML5::ToText->new_with_traits(
+ traits => [qw/ShowLinks ShowImages RenderTables TextFormatting/],
+ );
+ RT->Logger->info("Using HTML::HTML5::ToText for HTML -> text conversion");
+ $formatter = sub {
+ my $text;
+ eval {
+ $text = $obj->process_string( @_ ) // '';
+ };
+ $RT::Logger->error("Failed to downgrade HTML to plain text: $@") if $@;
+ return $text;
+ };
} else {
unless (HTML::FormatExternal->require) {
RT->Logger->warn("HTML::FormatExternal is not installed; falling back to internal perl formatter")
commit 7bc6f4fba90cb73ad314e7690942f079470f9e56
Author: Alex Vandiver <alexmv at bestpractical.com>
Date: Fri Jan 3 17:15:40 2014 -0500
HTML::HTML5::ToText mistakenly installs a global __WARN__ handler; local it
See https://github.com/tobyink/p5-html-html5-parser/pull/2
diff --git a/lib/RT/Interface/Email.pm b/lib/RT/Interface/Email.pm
index dd1748e..9dd1aa7 100644
--- a/lib/RT/Interface/Email.pm
+++ b/lib/RT/Interface/Email.pm
@@ -1830,6 +1830,7 @@ sub _HTMLFormatter {
);
RT->Logger->info("Using HTML::HTML5::ToText for HTML -> text conversion");
$formatter = sub {
+ local $SIG{__WARN__};
my $text;
eval {
$text = $obj->process_string( @_ ) // '';
commit 9a3e5c825611c6ac2cbbaf208cbed709dee97f8a
Author: Alex Vandiver <alexmv at bestpractical.com>
Date: Fri Jan 3 17:16:38 2014 -0500
Update tests to work with any of the HTML formatters
diff --git a/t/api/txn_content.t b/t/api/txn_content.t
index d44862d..672d6c2 100644
--- a/t/api/txn_content.t
+++ b/t/api/txn_content.t
@@ -19,5 +19,5 @@ ok( $txn, 'got Create txn' );
# ->Content converts from text/html to plain text if we don't explicitly ask
# for html. Our html -> text converter seems to add an extra trailing newline
-is( $txn->Content, "this is body\n\n", "txn's html content converted to plain text" );
+like( $txn->Content, qr/^\s*this is body\s*$/, "txn's html content converted to plain text" );
is( $txn->Content(Type => 'text/html'), "this is body\n", "txn's html content" );
diff --git a/t/mail/html-outgoing.t b/t/mail/html-outgoing.t
index 1b460c2..63d84c6 100644
--- a/t/mail/html-outgoing.t
+++ b/t/mail/html-outgoing.t
@@ -4,9 +4,9 @@ use RT::Test tests => undef;
BEGIN {
plan skip_all => 'Email::Abstract and Test::Email required.'
unless eval { require Email::Abstract; require Test::Email; 1 };
- plan tests => 22;
}
+
use RT::Test::Email;
use Test::Warn;
@@ -44,7 +44,7 @@ mail_ok {
to => 'enduser at example.com',
subject => qr/\Q[example.com #1] AutoReply: The internet is broken\E/,
body => parts_regex(
- 'trouble ticket regarding The internet is broken',
+ 'trouble ticket regarding \*?The internet is broken\*?',
'trouble ticket regarding <b>The internet is broken</b>'
),
'Content-Type' => qr{multipart},
@@ -52,7 +52,7 @@ mail_ok {
bcc => 'root at localhost',
subject => qr/\Q[example.com #1] The internet is broken\E/,
body => parts_regex(
- 'Request 1 \(http://localhost:\d+/Ticket/Display\.html\?id=1\)\s+?was acted upon by RT_System',
+ 'Request (\[\d+\])?1(\s*[(<]http://localhost:\d+/Ticket/Display\.html\?id=1[)>])?\s*was acted upon by RT_System',
'Request <a href="http://localhost:\d+/Ticket/Display\.html\?id=1">1</a> was acted upon by RT_System\.</b>'
),
'Content-Type' => qr{multipart},
@@ -70,8 +70,8 @@ mail_ok {
bcc => 'root at localhost',
subject => qr/\Q[example.com #1] The internet is broken\E/,
body => parts_regex(
- 'Ticket URL: http://localhost:\d+/Ticket/Display\.html\?id=1.+?'.
- 'This is a test of HTML correspondence\.',
+ 'Ticket URL: (?:\[\d+\])?http://localhost:\d+/Ticket/Display\.html\?id=1.+?'.
+ 'This is a test of \*?HTML\*? correspondence\.',
'Ticket URL: <a href="(http://localhost:\d+/Ticket/Display\.html\?id=1)">\1</a>.+?'.
'<p>This is a test of <b>HTML</b> correspondence\.</p>'
),
@@ -80,35 +80,39 @@ mail_ok {
to => 'enduser at example.com',
subject => qr/\Q[example.com #1] The internet is broken\E/,
body => parts_regex(
- 'This is a test of HTML correspondence\.',
+ 'This is a test of \*?HTML\*? correspondence\.',
'<p>This is a test of <b>HTML</b> correspondence\.</p>'
),
'Content-Type' => qr{multipart},
};
+SKIP: {
+ skip "Only fails on core HTMLFormatter", 9
+ unless RT->Config->Get("HTMLFormatter") eq "core";
+ diag "Failing HTML -> Text conversion";
+ warnings_like {
+ my $body = '<table><tr><td><table><tr><td>Foo</td></tr></table></td></tr></table>';
+ mail_ok {
+ ($ok, $tmsg) = $t->Correspond(
+ MIMEObj => HTML::Mason::Commands::MakeMIMEEntity(
+ Body => $body,
+ Type => 'text/html',
+ ),
+ );
+ } { from => qr/RT System/,
+ bcc => 'root at localhost',
+ subject => qr/\Q[example.com #1] The internet is broken\E/,
+ body => qr{Ticket URL: <a href="(http://localhost:\d+/Ticket/Display\.html\?id=1)">\1</a>.+?$body}s,
+ 'Content-Type' => qr{text/html}, # TODO
+ },{ from => qr/RT System/,
+ to => 'enduser at example.com',
+ subject => qr/\Q[example.com #1] The internet is broken\E/,
+ body => qr{$body},
+ 'Content-Type' => qr{text/html}, # TODO
+ };
+ } [(qr/uninitialized value/, qr/Failed to downgrade HTML/)x3];
+}
-diag "Failing HTML -> Text conversion";
-warnings_like {
- my $body = '<table><tr><td><table><tr><td>Foo</td></tr></table></td></tr></table>';
- mail_ok {
- ($ok, $tmsg) = $t->Correspond(
- MIMEObj => HTML::Mason::Commands::MakeMIMEEntity(
- Body => $body,
- Type => 'text/html',
- ),
- );
- } { from => qr/RT System/,
- bcc => 'root at localhost',
- subject => qr/\Q[example.com #1] The internet is broken\E/,
- body => qr{Ticket URL: <a href="(http://localhost:\d+/Ticket/Display\.html\?id=1)">\1</a>.+?$body}s,
- 'Content-Type' => qr{text/html}, # TODO
- },{ from => qr/RT System/,
- to => 'enduser at example.com',
- subject => qr/\Q[example.com #1] The internet is broken\E/,
- body => qr{<table><tr><td><table><tr><td>Foo</td></tr></table></td></tr></table>},
- 'Content-Type' => qr{text/html}, # TODO
- };
-} [(qr/uninitialized value/, qr/Failed to downgrade HTML/)x3];
diag "Admin Comment in HTML";
mail_ok {
@@ -122,9 +126,9 @@ mail_ok {
bcc => 'root at localhost',
subject => qr/\Q[example.com #1] [Comment] The internet is broken\E/,
body => parts_regex(
- 'This is a comment about ticket 1 \(http://localhost:\d+/Ticket/Display\.html\?id=1\)\..+?'.
+ 'This is a comment about (\[\d+\])?ticket.1(\s*[(<]http://localhost:\d+/Ticket/Display\.html\?id=1[)>])?\..+?'.
'It is not sent to the Requestor\(s\):.+?'.
- 'Comment test, please!',
+ 'Comment test, _?please!_?',
'<p>This is a comment about <a href="http://localhost:\d+/Ticket/Display\.html\?id=1">ticket 1</a>\. '.
'It is not sent to the Requestor\(s\):</p>.+?'.
@@ -175,6 +179,7 @@ mail_ok {
'Content-Type' => qr{multipart},
};
+done_testing;
sub parts_regex {
my ($text, $html) = @_;
-----------------------------------------------------------------------
More information about the rt-commit
mailing list