[Rt-commit] rt branch, 4.2/html-external-formatter, created. rt-4.2.5-70-g6449b2e

Alex Vandiver alexmv at bestpractical.com
Wed Jun 25 21:43:06 EDT 2014


The branch, 4.2/html-external-formatter has been created
        at  6449b2eecc652c63642f8cb231a17a739c0b9d00 (commit)

- Log -----------------------------------------------------------------
commit 2499d7e5d6ca62b4697d20469d61e2df24b7d620
Author: Alex Vandiver <alexmv at bestpractical.com>
Date:   Thu Jan 2 21:37:16 2014 -0500

    Use HTML::FormatExternal to allow external HTML → text formatters
    
    The HTML::FormatText::WithLinks::AndTables module is unmaintained, and
    contains multiple bugs which trigger runtime errors.  Add
    HTML::FormatExternal as an optional dependency for slower but more
    reliable HTML to text conversion using external text-based web browsers
    and text conversion tools.

diff --git a/etc/RT_Config.pm.in b/etc/RT_Config.pm.in
index 9740837..7cc19e3 100755
--- a/etc/RT_Config.pm.in
+++ b/etc/RT_Config.pm.in
@@ -713,6 +713,23 @@ will use the address of the current user and remove RT's subject tag.
 
 Set($ForwardFromUser, 0);
 
+=item C<$HTMLFormatter>
+
+The external HTML formatter to use.  The default is to try, in order,
+C<w3m>, C<elinks>, C<html2text>, C<links>, C<lynx>, and then fall back
+to the C<core> pure-perl formatter.  The pure-perl formatter is
+significantly faster than the others, but may fail to successfully
+convert even on some relatively simple HTML.  Falls back to C<core> if
+the configured program is not installed.
+
+Using formatters other than C<core> requires that the
+L<HTML::FormatExternal> module be installed; it is an optional
+dependency which RT will not install by default.
+
+=cut
+
+Set($HTMLFormatter, undef);
+
 =back
 
 =head2 Email dashboards
diff --git a/lib/RT/Config.pm b/lib/RT/Config.pm
index 7842e57..b00c4f1 100644
--- a/lib/RT/Config.pm
+++ b/lib/RT/Config.pm
@@ -635,6 +635,10 @@ our %META;
             $self->Set( MailCommand => 'sendmailpipe' );
         },
     },
+    HTMLFormatter => {
+        Type => 'SCALAR',
+        PostLoadCheck => sub { RT::Interface::Email->_HTMLFormatter },
+    },
     MailPlugins  => {
         Type => 'ARRAY',
         PostLoadCheck => sub {
diff --git a/lib/RT/Interface/Email.pm b/lib/RT/Interface/Email.pm
index facdb38..286831e 100644
--- a/lib/RT/Interface/Email.pm
+++ b/lib/RT/Interface/Email.pm
@@ -50,6 +50,7 @@ package RT::Interface::Email;
 
 use strict;
 use warnings;
+use 5.010;
 
 use Email::Address;
 use MIME::Entity;
@@ -58,6 +59,7 @@ use File::Temp;
 use UNIVERSAL::require;
 use Mail::Mailer ();
 use Text::ParseWords qw/shellwords/;
+use RT::Util 'safe_run_child';
 
 BEGIN {
     use base 'Exporter';
@@ -1793,9 +1795,64 @@ conversion fails.
 =cut
 
 sub ConvertHTMLToText {
+    return _HTMLFormatter()->(@_);
+}
+
+sub _HTMLFormatter {
+    state $formatter;
+    return $formatter if defined $formatter;
+
+    my $wanted = RT->Config->Get("HTMLFormatter");
+
+    my @order;
+    if ($wanted) {
+        @order = ($wanted, "core");
+    } else {
+        @order = ("w3m", "elinks", "html2text", "links", "lynx", "core");
+    }
+    # Always fall back to core, even if it is not listed
+    for my $prog (@order) {
+        if ($prog eq "core") {
+            RT->Logger->info("Using internal Perl HTML -> text conversion");
+            require HTML::FormatText::WithLinks::AndTables;
+            $formatter = \&_HTMLFormatText;
+        } else {
+            unless (HTML::FormatExternal->require) {
+                RT->Logger->warn("HTML::FormatExternal is not installed; falling back to internal perl formatter")
+                    if $wanted;
+                next;
+            }
+
+            my $package = "HTML::FormatText::" . ucfirst($prog);
+            unless ($package->require) {
+                RT->Logger->warn("$prog is not a valid formatter provided by HTML::FormatExternal")
+                    if $wanted;
+                next;
+            }
+
+            unless (defined $package->program_version) {
+                RT->Logger->warn("Could not find external '$prog' HTML formatter; is it installed?")
+                    if $wanted;
+                next;
+            }
+
+            RT->Logger->info("Using $prog for HTML -> text conversion");
+            $formatter = sub {
+                my $html = shift;
+                RT::Util::safe_run_child {
+                    $package->format_string($html, leftmargin => 0, rightmargin => 78);
+                };
+            };
+        }
+        RT->Config->Set( HTMLFormatter => $prog );
+        last;
+    }
+    return $formatter;
+}
+
+sub _HTMLFormatText {
     my $html = shift;
 
-    require HTML::FormatText::WithLinks::AndTables;
     my $text;
     eval {
         $text = HTML::FormatText::WithLinks::AndTables->convert(

commit 4fefae358360661328574b59946ae385f38af7e5
Author: Alex Vandiver <alexmv at bestpractical.com>
Date:   Wed Jun 18 18:44:52 2014 -0400

    Allow $HTMLFormatter to contain the full path

diff --git a/etc/RT_Config.pm.in b/etc/RT_Config.pm.in
index 7cc19e3..c9c209c 100755
--- a/etc/RT_Config.pm.in
+++ b/etc/RT_Config.pm.in
@@ -726,6 +726,9 @@ Using formatters other than C<core> requires that the
 L<HTML::FormatExternal> module be installed; it is an optional
 dependency which RT will not install by default.
 
+If the chosen formatter is not in the webserver's $PATH, you may set
+this option the full path to one of the aforementioned executables.
+
 =cut
 
 Set($HTMLFormatter, undef);
diff --git a/lib/RT/Interface/Email.pm b/lib/RT/Interface/Email.pm
index 286831e..cac5f9e 100644
--- a/lib/RT/Interface/Email.pm
+++ b/lib/RT/Interface/Email.pm
@@ -1823,6 +1823,7 @@ sub _HTMLFormatter {
                 next;
             }
 
+            my $path = $prog =~ s{(.*/)}{} ? $1 : undef;
             my $package = "HTML::FormatText::" . ucfirst($prog);
             unless ($package->require) {
                 RT->Logger->warn("$prog is not a valid formatter provided by HTML::FormatExternal")
@@ -1830,8 +1831,15 @@ sub _HTMLFormatter {
                 next;
             }
 
-            unless (defined $package->program_version) {
-                RT->Logger->warn("Could not find external '$prog' HTML formatter; is it installed?")
+            if ($path) {
+                local $ENV{PATH} = $path;
+                if (not defined $package->program_version) {
+                    RT->Logger->warn("Could not find or run external '$prog' HTML formatter in $path$prog")
+                        if $wanted;
+                    next;
+                }
+            } elsif (not defined $package->program_version) {
+                RT->Logger->warn("Could not find or run external '$prog' HTML formatter in \$PATH ($ENV{PATH}) -- you may need to install it or provide the full path")
                     if $wanted;
                 next;
             }
@@ -1840,6 +1848,7 @@ sub _HTMLFormatter {
             $formatter = sub {
                 my $html = shift;
                 RT::Util::safe_run_child {
+                    local $ENV{PATH} = $path || $ENV{PATH};
                     $package->format_string($html, leftmargin => 0, rightmargin => 78);
                 };
             };

commit 77f6139ce1f539509ef011d55c0d645909a360e4
Author: Alex Vandiver <alexmv at bestpractical.com>
Date:   Wed Jun 25 21:38:19 2014 -0400

    Provide a default PATH in cases where it is unset (mod_fastcgi)
    
    See 4ef0d833 for the full explanation, which applies in this context as
    well.

diff --git a/lib/RT/Interface/Email.pm b/lib/RT/Interface/Email.pm
index cac5f9e..1e03f19 100644
--- a/lib/RT/Interface/Email.pm
+++ b/lib/RT/Interface/Email.pm
@@ -1838,17 +1838,22 @@ sub _HTMLFormatter {
                         if $wanted;
                     next;
                 }
-            } elsif (not defined $package->program_version) {
-                RT->Logger->warn("Could not find or run external '$prog' HTML formatter in \$PATH ($ENV{PATH}) -- you may need to install it or provide the full path")
-                    if $wanted;
-                next;
+            } else {
+                local $ENV{PATH} = '/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin'
+                    unless defined $ENV{PATH};
+                if (not defined $package->program_version) {
+                    RT->Logger->warn("Could not find or run external '$prog' HTML formatter in \$PATH ($ENV{PATH}) -- you may need to install it or provide the full path")
+                        if $wanted;
+                    next;
+                }
             }
 
             RT->Logger->info("Using $prog for HTML -> text conversion");
             $formatter = sub {
                 my $html = shift;
                 RT::Util::safe_run_child {
-                    local $ENV{PATH} = $path || $ENV{PATH};
+                    local $ENV{PATH} = $path || $ENV{PATH}
+                        || '/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin';
                     $package->format_string($html, leftmargin => 0, rightmargin => 78);
                 };
             };

commit 58b3db3ee763fc3703396804c8ccade7869e3eff
Author: Alex Vandiver <alexmv at bestpractical.com>
Date:   Thu Jan 2 21:57:14 2014 -0500

    Add HTML::HTML5::ToText as a better pure-perl HTML → text converter

diff --git a/etc/RT_Config.pm.in b/etc/RT_Config.pm.in
index c9c209c..6b48a55 100755
--- a/etc/RT_Config.pm.in
+++ b/etc/RT_Config.pm.in
@@ -715,16 +715,18 @@ Set($ForwardFromUser, 0);
 
 =item C<$HTMLFormatter>
 
-The external HTML formatter to use.  The default is to try, in order,
-C<w3m>, C<elinks>, C<html2text>, C<links>, C<lynx>, and then fall back
-to the C<core> pure-perl formatter.  The pure-perl formatter is
+The external HTML formatter to use.  The default is to try, in order, an
+improved perl formatter (C<perl>), the external programs C<w3m>,
+C<elinks>, C<html2text>, C<links>, or C<lynx>, and then fall back to the
+C<core> pure-perl formatter.  The C<perl> and C<core> formatters are
 significantly faster than the others, but may fail to successfully
 convert even on some relatively simple HTML.  Falls back to C<core> if
 the configured program is not installed.
 
-Using formatters other than C<core> requires that the
-L<HTML::FormatExternal> module be installed; it is an optional
-dependency which RT will not install by default.
+Using the C<perl> formatter requires the L<HTML::HTML5::ToText> optional
+module be installed.  Using external formatters (i.e. C<w3m>, etc)
+requires that the L<HTML::FormatExternal> module be installed.  Neither
+of these modules are installed by RT by default.
 
 If the chosen formatter is not in the webserver's $PATH, you may set
 this option the full path to one of the aforementioned executables.
diff --git a/lib/RT/Interface/Email.pm b/lib/RT/Interface/Email.pm
index 1e03f19..ce04bba 100644
--- a/lib/RT/Interface/Email.pm
+++ b/lib/RT/Interface/Email.pm
@@ -1816,6 +1816,24 @@ sub _HTMLFormatter {
             RT->Logger->info("Using internal Perl HTML -> text conversion");
             require HTML::FormatText::WithLinks::AndTables;
             $formatter = \&_HTMLFormatText;
+        } elsif ($prog eq "perl") {
+            unless (HTML::HTML5::ToText->require) {
+                RT->Logger->warn("Could not load HTML::HTML5::ToText; install it from CPAN")
+                    if $wanted;
+                next;
+            }
+            my $obj = HTML::HTML5::ToText->new_with_traits(
+                traits => [qw/ShowLinks ShowImages RenderTables TextFormatting/],
+            );
+            RT->Logger->info("Using HTML::HTML5::ToText for HTML -> text conversion");
+            $formatter = sub {
+                my $text;
+                eval {
+                    $text = $obj->process_string( @_ ) // '';
+                };
+                $RT::Logger->error("Failed to downgrade HTML to plain text: $@") if $@;
+                return $text;
+            };
         } else {
             unless (HTML::FormatExternal->require) {
                 RT->Logger->warn("HTML::FormatExternal is not installed; falling back to internal perl formatter")

commit 5a54a023d649a6a10d06f4411c785c6f74561c06
Author: Alex Vandiver <alexmv at bestpractical.com>
Date:   Fri Jan 3 17:15:40 2014 -0500

    HTML::HTML5::ToText mistakenly installs a global __WARN__ handler; local it
    
    See https://github.com/tobyink/p5-html-html5-parser/pull/2

diff --git a/lib/RT/Interface/Email.pm b/lib/RT/Interface/Email.pm
index ce04bba..a417193 100644
--- a/lib/RT/Interface/Email.pm
+++ b/lib/RT/Interface/Email.pm
@@ -1827,6 +1827,7 @@ sub _HTMLFormatter {
             );
             RT->Logger->info("Using HTML::HTML5::ToText for HTML -> text conversion");
             $formatter = sub {
+                local $SIG{__WARN__};
                 my $text;
                 eval {
                     $text = $obj->process_string( @_ ) // '';

commit 6449b2eecc652c63642f8cb231a17a739c0b9d00
Author: Alex Vandiver <alexmv at bestpractical.com>
Date:   Fri Jan 3 17:16:38 2014 -0500

    Update tests to work with any of the HTML formatters

diff --git a/t/api/txn_content.t b/t/api/txn_content.t
index d44862d..672d6c2 100644
--- a/t/api/txn_content.t
+++ b/t/api/txn_content.t
@@ -19,5 +19,5 @@ ok( $txn, 'got Create txn' );
 
 # ->Content converts from text/html to plain text if we don't explicitly ask
 # for html. Our html -> text converter seems to add an extra trailing newline
-is( $txn->Content, "this is body\n\n", "txn's html content converted to plain text" );
+like( $txn->Content, qr/^\s*this is body\s*$/, "txn's html content converted to plain text" );
 is( $txn->Content(Type => 'text/html'), "this is body\n", "txn's html content" );
diff --git a/t/mail/html-outgoing.t b/t/mail/html-outgoing.t
index 1b460c2..63d84c6 100644
--- a/t/mail/html-outgoing.t
+++ b/t/mail/html-outgoing.t
@@ -4,9 +4,9 @@ use RT::Test tests => undef;
 BEGIN {
     plan skip_all => 'Email::Abstract and Test::Email required.'
         unless eval { require Email::Abstract; require Test::Email; 1 };
-    plan tests => 22;
 }
 
+
 use RT::Test::Email;
 use Test::Warn;
 
@@ -44,7 +44,7 @@ mail_ok {
     to      => 'enduser at example.com',
     subject => qr/\Q[example.com #1] AutoReply: The internet is broken\E/,
     body    => parts_regex(
-        'trouble ticket regarding The internet is broken',
+        'trouble ticket regarding \*?The internet is broken\*?',
         'trouble ticket regarding <b>The internet is broken</b>'
     ),
     'Content-Type' => qr{multipart},
@@ -52,7 +52,7 @@ mail_ok {
     bcc     => 'root at localhost',
     subject => qr/\Q[example.com #1] The internet is broken\E/,
     body    => parts_regex(
-        'Request 1 \(http://localhost:\d+/Ticket/Display\.html\?id=1\)\s+?was acted upon by RT_System',
+        'Request (\[\d+\])?1(\s*[(<]http://localhost:\d+/Ticket/Display\.html\?id=1[)>])?\s*was acted upon by RT_System',
         'Request <a href="http://localhost:\d+/Ticket/Display\.html\?id=1">1</a> was acted upon by RT_System\.</b>'
     ),
     'Content-Type' => qr{multipart},
@@ -70,8 +70,8 @@ mail_ok {
     bcc     => 'root at localhost',
     subject => qr/\Q[example.com #1] The internet is broken\E/,
     body    => parts_regex(
-        'Ticket URL: http://localhost:\d+/Ticket/Display\.html\?id=1.+?'.
-        'This is a test of HTML correspondence\.',
+        'Ticket URL: (?:\[\d+\])?http://localhost:\d+/Ticket/Display\.html\?id=1.+?'.
+        'This is a test of \*?HTML\*? correspondence\.',
         'Ticket URL: <a href="(http://localhost:\d+/Ticket/Display\.html\?id=1)">\1</a>.+?'.
         '<p>This is a test of <b>HTML</b> correspondence\.</p>'
     ),
@@ -80,35 +80,39 @@ mail_ok {
     to      => 'enduser at example.com',
     subject => qr/\Q[example.com #1] The internet is broken\E/,
     body    => parts_regex(
-        'This is a test of HTML correspondence\.',
+        'This is a test of \*?HTML\*? correspondence\.',
         '<p>This is a test of <b>HTML</b> correspondence\.</p>'
     ),
     'Content-Type' => qr{multipart},
 };
 
+SKIP: {
+    skip "Only fails on core HTMLFormatter", 9
+        unless RT->Config->Get("HTMLFormatter") eq "core";
+    diag "Failing HTML -> Text conversion";
+    warnings_like {
+        my $body = '<table><tr><td><table><tr><td>Foo</td></tr></table></td></tr></table>';
+        mail_ok {
+            ($ok, $tmsg) = $t->Correspond(
+                MIMEObj => HTML::Mason::Commands::MakeMIMEEntity(
+                    Body => $body,
+                    Type => 'text/html',
+                ),
+            );
+        } { from    => qr/RT System/,
+            bcc     => 'root at localhost',
+            subject => qr/\Q[example.com #1] The internet is broken\E/,
+            body    => qr{Ticket URL: <a href="(http://localhost:\d+/Ticket/Display\.html\?id=1)">\1</a>.+?$body}s,
+            'Content-Type' => qr{text/html},  # TODO
+        },{ from    => qr/RT System/,
+            to      => 'enduser at example.com',
+            subject => qr/\Q[example.com #1] The internet is broken\E/,
+            body    => qr{$body},
+            'Content-Type' => qr{text/html},  # TODO
+        };
+    } [(qr/uninitialized value/, qr/Failed to downgrade HTML/)x3];
+}
 
-diag "Failing HTML -> Text conversion";
-warnings_like {
-    my $body = '<table><tr><td><table><tr><td>Foo</td></tr></table></td></tr></table>';
-    mail_ok {
-        ($ok, $tmsg) = $t->Correspond(
-            MIMEObj => HTML::Mason::Commands::MakeMIMEEntity(
-                Body => $body,
-                Type => 'text/html',
-            ),
-        );
-    } { from    => qr/RT System/,
-        bcc     => 'root at localhost',
-        subject => qr/\Q[example.com #1] The internet is broken\E/,
-        body    => qr{Ticket URL: <a href="(http://localhost:\d+/Ticket/Display\.html\?id=1)">\1</a>.+?$body}s,
-        'Content-Type' => qr{text/html},  # TODO
-    },{ from    => qr/RT System/,
-        to      => 'enduser at example.com',
-        subject => qr/\Q[example.com #1] The internet is broken\E/,
-        body    => qr{<table><tr><td><table><tr><td>Foo</td></tr></table></td></tr></table>},
-        'Content-Type' => qr{text/html},  # TODO
-    };
-} [(qr/uninitialized value/, qr/Failed to downgrade HTML/)x3];
 
 diag "Admin Comment in HTML";
 mail_ok {
@@ -122,9 +126,9 @@ mail_ok {
     bcc     => 'root at localhost',
     subject => qr/\Q[example.com #1] [Comment] The internet is broken\E/,
     body    => parts_regex(
-        'This is a comment about ticket 1 \(http://localhost:\d+/Ticket/Display\.html\?id=1\)\..+?'.
+        'This is a comment about (\[\d+\])?ticket.1(\s*[(<]http://localhost:\d+/Ticket/Display\.html\?id=1[)>])?\..+?'.
         'It is not sent to the Requestor\(s\):.+?'.
-        'Comment test, please!',
+        'Comment test, _?please!_?',
 
         '<p>This is a comment about <a href="http://localhost:\d+/Ticket/Display\.html\?id=1">ticket 1</a>\. '.
         'It is not sent to the Requestor\(s\):</p>.+?'.
@@ -175,6 +179,7 @@ mail_ok {
     'Content-Type' => qr{multipart},
 };
 
+done_testing;
 
 sub parts_regex {
     my ($text, $html) = @_;

-----------------------------------------------------------------------


More information about the rt-commit mailing list