[Rt-commit] rt branch, 5.0/fix-html-to-plaintext-conversion-to-preserve-quoting, created. rt-5.0.0-4-ga27a98981c

Dianne Skoll dianne at bestpractical.com
Wed Aug 12 08:40:01 EDT 2020


The branch, 5.0/fix-html-to-plaintext-conversion-to-preserve-quoting has been created
        at  a27a98981c9a5218a642deb463da7f9d6f4e63d6 (commit)

- Log -----------------------------------------------------------------
commit a27a98981c9a5218a642deb463da7f9d6f4e63d6
Author: Dianne Skoll <dianne at bestpractical.com>
Date:   Mon Aug 3 09:29:48 2020 -0400

    Preserve quoting levels when converting HTML to plain text
    
    When you reply to a message in RT using the rich text editor, RT creates
    a text/plain version corresponding to the text/html version.  However,
    quoted text is enclosed in a <blockquote></blockquote> pair and this
    makes quoted text in the text/plain part difficult to discern.
    
    This commit uses heuristics to modify the text/plain output for each
    of the supported converters to detect blockquoted material and replace
    it with the standard text/plain quote prefix ">".
    
    Note that lynx does not give us any way to detect blockquoted material,
    so lynx should be marked as a deprecated converter.  All of the other
    supported converters do give us a way to heuristically detect blockquoted
    material.

diff --git a/lib/RT/Interface/Email.pm b/lib/RT/Interface/Email.pm
index f1b97cfe0c..cedb405e39 100644
--- a/lib/RT/Interface/Email.pm
+++ b/lib/RT/Interface/Email.pm
@@ -1452,6 +1452,36 @@ sub _RecordSendEmailFailure {
     }
 }
 
+# Hash describing how various formatters format <blockquote>...</blockquote>
+# regions.
+our $BlockquoteDescriptor = {
+    w3m       => { indent => 4},
+    elinks    => { indent => 2},
+    links     => { indent => 2},
+    html2text => { indent => 5},
+    lynx      => { indent => 2},
+    core      => { indent => 2},
+};
+
+=head3 ConvertBlockquoteIndentsToQuotemarks TEXT, CONVERTER
+
+Given plain TEXT that CONVERTER produced from HTML, replace each level of
+blockquote indentation at the start of a line with a ">" quote mark.
+
+=cut
+sub ConvertBlockquoteIndentsToQuotemarks {
+    my ($text, $converter) = @_;
+
+    return $text unless exists($BlockquoteDescriptor->{$converter});
+    my $n = $BlockquoteDescriptor->{$converter}{indent};
+    my $spaces = ' ' x $n;
+
+    # Convert each level of indentation to a ">"; add a space afterwards
+    # for readability
+    $text =~ s|^(($spaces)+)|">" x (length($1)/$n) . " "|gem;
+    return $text;
+}
+
 =head3 ConvertHTMLToText HTML
 
 Takes HTML characters and converts it to plain text characters.
@@ -1466,7 +1496,10 @@ sub ConvertHTMLToText {
 
 sub _HTMLFormatter {
     state $formatter;
-    return $formatter if defined $formatter;
+
+    # If we are running under the test harness, we want to create a new
+    # $formatter on every call rather than creating it once and caching it.
+    return $formatter if defined $formatter && !$ENV{HARNESS_ACTIVE};
 
     my $wanted = RT->Config->Get("HTMLFormatter");
     my @options = ("w3m", "elinks", "links", "html2text", "lynx", "core");
@@ -1529,7 +1562,7 @@ sub _HTMLFormatter {
                     );
                 };
                 $text = Encode::decode( "UTF-8", $text );
-                return $text;
+                return ConvertBlockquoteIndentsToQuotemarks($text, $prog);
             };
         }
         RT->Config->Set( HTMLFormatter => $prog );
@@ -1558,7 +1591,7 @@ sub _HTMLFormatText {
         $text //= '';
     };
     $RT::Logger->error("Failed to downgrade HTML to plain text: $@") if $@;
-    return $text;
+    return ConvertBlockquoteIndentsToQuotemarks($text, 'core');
 }
 
 
diff --git a/t/mail/html-to-text.t b/t/mail/html-to-text.t
new file mode 100644
index 0000000000..0268efed6d
--- /dev/null
+++ b/t/mail/html-to-text.t
@@ -0,0 +1,104 @@
+use strict;
+use warnings;
+
+use RT::Test nodb => 1, tests => undef;
+
+my $html = <<'EOF';
+<html>
+  <head>
+    <title>Test HTML</title>
+  </head>
+  <body>
+  <p>This is a top-level paragraph.</p>
+  <blockquote>
+    <p>This is a first-level quoted paragraph</p>
+    <blockquote>
+      <p>This is a second-level quoted paragraph</p>
+      <p>So is this</p>
+    </blockquote>
+    <p>Back to first-level</p>
+  </blockquote>
+  <p>Back to top-level</p>
+  </body>
+</html>
+EOF
+
+my $expected = <<'EOF';
+This is a top-level paragraph.
+
+> This is a first-level quoted paragraph
+
+>> This is a second-level quoted paragraph
+
+>> So is this
+
+> Back to first-level
+
+Back to top-level
+EOF
+
+my $expected_links = <<'EOF';
+This is a top-level paragraph.
+
+> This is a first-level quoted paragraph
+
+> This is a second-level quoted paragraph
+
+> So is this
+
+> Back to first-level
+
+Back to top-level
+
+EOF
+
+my $expected_html2text = <<'EOF';
+This is a top-level paragraph.
+> This is a first-level quoted paragraph
+>> This is a second-level quoted paragraph
+>> So is this
+> Back to first-level
+Back to top-level
+EOF
+
+# Lynx messes up; no way to preserve quoting. :(
+my $expected_lynx = <<'EOF';
+This is a top-level paragraph.
+
+This is a first-level quoted paragraph
+
+This is a second-level quoted paragraph
+
+So is this
+
+Back to first-level
+
+Back to top-level
+EOF
+
+sub test_conversion
+{
+    my ($converter, $expected) = @_;
+  SKIP: {
+      if ($converter ne 'core' && !RT::Test->find_executable($converter)) {
+          skip "Skipping $converter: Not installed", 1;
+          return;
+      }
+      RT->Config->Set(HTMLFormatter => $converter);
+      my $text = RT::Interface::Email::ConvertHTMLToText($html);
+      is($text, $expected, "Got expected HTML->text conversion using $converter");
+    }
+}
+
+# Set environment variable to force creation of a new
+# formatter each time.
+$ENV{HARNESS_ACTIVE} = 1;
+
+test_conversion('w3m', "$expected\n");  # w3m adds a blank line at the end
+test_conversion('elinks', $expected);
+test_conversion('links', $expected);
+test_conversion('html2text', $expected_html2text);
+test_conversion('lynx', $expected_lynx);
+test_conversion('core', "Test HTML\n\n$expected\n");  # core adds title and blank line
+done_testing();
+1;

-----------------------------------------------------------------------


More information about the rt-commit mailing list