[Rt-commit] rt branch, 4.2/fts-refactor-performance, created. rt-4.2.10-125-g7c48294

Alex Vandiver alexmv at bestpractical.com
Thu Mar 5 00:29:36 EST 2015


The branch, 4.2/fts-refactor-performance has been created
        at  7c48294aa34a7985675f5a56fa5a932aabd64f64 (commit)

- Log -----------------------------------------------------------------
commit 44600fcbcd3ef4761a393c08a47b9046a98ee9eb
Author: Alex Vandiver <alexmv at bestpractical.com>
Date:   Fri Jul 25 00:17:32 2014 -0400

    Inline extract_text and extract_html
    
    They consist of mostly-identical code, differing only in the
    decode_entities.  Inline this difference, rather than burying it under
    two levels of method call and goto.

diff --git a/sbin/rt-fulltext-indexer.in b/sbin/rt-fulltext-indexer.in
index b723694..dc12191 100644
--- a/sbin/rt-fulltext-indexer.in
+++ b/sbin/rt-fulltext-indexer.in
@@ -73,6 +73,7 @@ BEGIN {
     RT::Init();
 };
 use RT::Interface::CLI ();
+use HTML::Entities;
 
 my %OPT = (
     help        => 0,
@@ -183,9 +184,13 @@ foreach my $type ( @types ) {
     while ( my $a = $attachments->Next ) {
         next if filter( $type, $a );
         debug("Found attachment #". $a->id );
-        my $txt = extract($type, $a) or next;
+        my $text = $a->Content;
+        next unless defined $text && length($text);
+        # The rich text editor generates html entities for characters
+        # but Pg doesn't index them; decode to something it can index.
+        HTML::Entities::decode_entities($text) if $a->ContentType eq "text/html";
         $found++;
-        process( $type, $a, $txt );
+        process( $type, $a, \$text );
         debug("Processed attachment #". $a->id );
     }
     goto REDO if $OPT{'all'} and $attachments->Count == ($OPT{'limit'} || 100)
@@ -242,15 +247,6 @@ sub filter {
     );
 }
 
-sub extract {
-    my $type = shift;
-    return goto_specific(
-        suffix    => $type,
-        error     => "No way to convert $type attachment into text",
-        arguments => \@_,
-    );
-}
-
 sub process {
     return goto_specific(
         suffix    => $db_type,
@@ -342,13 +338,6 @@ sub attachments_text {
     return $res;
 }
 
-sub extract_text {
-    my $attachment = shift;
-    my $text = $attachment->Content;
-    return undef unless defined $text && length($text);
-    return \$text;
-}
-
 sub attachments_html {
     my $res = shift;
     $res->Limit( FIELD => 'ContentType', VALUE => 'text/html' );
@@ -365,17 +354,6 @@ sub filter_html {
     return 0;
 }
 
-sub extract_html {
-    my $attachment = shift;
-    my $text = $attachment->Content;
-    return undef unless defined $text && length($text);
-# the rich text editor generates html entities for characters
-# but Pg doesn't index them, so decode to something it can index.
-    require HTML::Entities;
-    HTML::Entities::decode_entities($text);
-    return \$text;
-}
-
 sub goto_specific {
     my %args = (@_);
 

commit 391b3072b32134f12d2834766d5706b861af1604
Author: Alex Vandiver <alexmv at bestpractical.com>
Date:   Fri Jul 25 00:19:10 2014 -0400

    Inline the differences between text/plain and text/html attachment lists

diff --git a/sbin/rt-fulltext-indexer.in b/sbin/rt-fulltext-indexer.in
index dc12191..caec183 100644
--- a/sbin/rt-fulltext-indexer.in
+++ b/sbin/rt-fulltext-indexer.in
@@ -222,12 +222,11 @@ sub attachments {
         OPERATOR => '!=',
         VALUE => 'deleted'
     );
-
-    return goto_specific(
-        suffix => $type,
-        error => "Don't know how to find $type attachments",
-        arguments => [$res],
+    $res->Limit(
+        FIELD => 'ContentType',
+        VALUE => ($type eq "html" ? "text/html" : "text/plain"),
     );
+    return $res;
 }
 
 sub last_indexed {
@@ -332,18 +331,6 @@ sub process_pg {
     }
 }
 
-sub attachments_text {
-    my $res = shift;
-    $res->Limit( FIELD => 'ContentType', VALUE => 'text/plain' );
-    return $res;
-}
-
-sub attachments_html {
-    my $res = shift;
-    $res->Limit( FIELD => 'ContentType', VALUE => 'text/html' );
-    return $res;
-}
-
 sub filter_html {
     my $attachment = shift;
     if ( my $parent = $attachment->ParentObj ) {

commit b891a82e6752b2c37e2063ffadfe758f8f4edd70
Author: Alex Vandiver <alexmv at bestpractical.com>
Date:   Fri Jul 25 00:19:55 2014 -0400

    Stop skipping indexing of text/html within multipart/alternative
    
    Not all text/html within a multipart/alternative is strictly identical
    to the text part; some mailers send out text/plain parts of alternatives
    that simply say "Your client must be able to render HTML."
    
    The additional space required to index these attachments is worth the
    consistency of having all potential content indexed.  This also is a
    speed increase, as it reduces the number of ad-hoc queries during
    indexing.

diff --git a/sbin/rt-fulltext-indexer.in b/sbin/rt-fulltext-indexer.in
index caec183..98ca894 100644
--- a/sbin/rt-fulltext-indexer.in
+++ b/sbin/rt-fulltext-indexer.in
@@ -182,7 +182,6 @@ foreach my $type ( @types ) {
 
     my $found = 0;
     while ( my $a = $attachments->Next ) {
-        next if filter( $type, $a );
         debug("Found attachment #". $a->id );
         my $text = $a->Content;
         next unless defined $text && length($text);
@@ -238,14 +237,6 @@ sub last_indexed {
     );
 }
 
-sub filter {
-    my $type = shift;
-    return goto_specific(
-        suffix    => $type,
-        arguments => \@_,
-    );
-}
-
 sub process {
     return goto_specific(
         suffix    => $db_type,
@@ -331,16 +322,6 @@ sub process_pg {
     }
 }
 
-sub filter_html {
-    my $attachment = shift;
-    if ( my $parent = $attachment->ParentObj ) {
-# skip html parts that are alternatives
-        return 1 if $parent->id
-            && $parent->ContentType eq 'mulitpart/alternative';
-    }
-    return 0;
-}
-
 sub goto_specific {
     my %args = (@_);
 

commit 3e9924f4d0fc85867bb13b3ef3c5a18d910ceeb7
Author: Alex Vandiver <alexmv at bestpractical.com>
Date:   Fri Jul 25 00:21:52 2014 -0400

    Use the new, shorter, initialization form

diff --git a/sbin/rt-fulltext-indexer.in b/sbin/rt-fulltext-indexer.in
index 98ca894..fd52822 100644
--- a/sbin/rt-fulltext-indexer.in
+++ b/sbin/rt-fulltext-indexer.in
@@ -67,11 +67,7 @@ BEGIN { # BEGIN RT CMD BOILERPLATE
 
 }
 
-BEGIN {
-    use RT;
-    RT::LoadConfig();
-    RT::Init();
-};
+use RT -init;
 use RT::Interface::CLI ();
 use HTML::Entities;
 

commit 1b1ed1bfdafa9493b483a92c10a108529235bd97
Author: Alex Vandiver <alexmv at bestpractical.com>
Date:   Fri Jul 25 00:23:40 2014 -0400

    Simplify and condense option parsing
    
    This causes the indexer to be able to accept options that are
    potentially not applicable (for instance, --memory with a non-Oracle
    backend); however, the comprehensibility benefits are worth this tiny
    cost.

diff --git a/sbin/rt-fulltext-indexer.in b/sbin/rt-fulltext-indexer.in
index fd52822..a6a700b 100644
--- a/sbin/rt-fulltext-indexer.in
+++ b/sbin/rt-fulltext-indexer.in
@@ -71,40 +71,18 @@ use RT -init;
 use RT::Interface::CLI ();
 use HTML::Entities;
 
-my %OPT = (
-    help        => 0,
-    debug       => 0,
-    quiet       => 0,
-);
-my @OPT_LIST = qw(help|h! debug! quiet);
+use Getopt::Long qw(GetOptions);
+my %OPT = ( memory => '2M', limit => 0 );
+GetOptions( \%OPT,
+    "help|h!",
+    "debug!",
+    "quiet!",
 
-my $db_type = RT->Config->Get('DatabaseType');
-if ( $db_type eq 'Pg' ) {
-    %OPT = (
-        %OPT,
-        limit  => 0,
-        all    => 0,
-    );
-    push @OPT_LIST, 'limit=i', 'all!';
-}
-elsif ( $db_type eq 'mysql' ) {
-    %OPT = (
-        %OPT,
-        limit    => 0,
-        all      => 0,
-    );
-    push @OPT_LIST, 'limit=i', 'all!';
-}
-elsif ( $db_type eq 'Oracle' ) {
-    %OPT = (
-        %OPT,
-        memory => '2M',
-    );
-    push @OPT_LIST, qw(memory=s);
-}
+    "all!",
+    "limit=i",
 
-use Getopt::Long qw(GetOptions);
-GetOptions( \%OPT, @OPT_LIST );
+    "memory=s",
+);
 
 if ( $OPT{'help'} ) {
     RT::Interface::CLI->ShowHelp(
@@ -124,6 +102,7 @@ if ( !flock main::DATA, LOCK_EX | LOCK_NB ) {
     }
 }
 
+my $db_type = RT->Config->Get('DatabaseType');
 my $fts_config = RT->Config->Get('FullTextSearch') || {};
 unless ( $fts_config->{'Enable'} ) {
     print STDERR <<EOT;

commit f8c465b8b70fdab33b4f04257b35615099388fd1
Author: Alex Vandiver <alexmv at bestpractical.com>
Date:   Fri Jul 25 00:24:01 2014 -0400

    Documentation has moved out; update --help accordingly
    
    The documentation (which included =head1 sections for each database
    type) was moved to the more centralized docs/full_text_indexing.pod in
    fa5dffcb; however, --help still looked for the relevant sections.
    Remove the special-case code which is no longer relevant.

diff --git a/sbin/rt-fulltext-indexer.in b/sbin/rt-fulltext-indexer.in
index a6a700b..c450e58 100644
--- a/sbin/rt-fulltext-indexer.in
+++ b/sbin/rt-fulltext-indexer.in
@@ -84,11 +84,7 @@ GetOptions( \%OPT,
     "memory=s",
 );
 
-if ( $OPT{'help'} ) {
-    RT::Interface::CLI->ShowHelp(
-        Sections => 'NAME|DESCRIPTION|'. uc($db_type),
-    );
-}
+RT::Interface::CLI->ShowHelp if $OPT{help};
 
 use Fcntl ':flock';
 if ( !flock main::DATA, LOCK_EX | LOCK_NB ) {

commit a0d966fb531a85ea45e5d040a484767f7416ff08
Author: Alex Vandiver <alexmv at bestpractical.com>
Date:   Fri Jul 25 01:44:24 2014 -0400

    Remove AUTHOR section; it is unnecessary in core sbin files

diff --git a/sbin/rt-fulltext-indexer.in b/sbin/rt-fulltext-indexer.in
index c450e58..29e5ec2 100644
--- a/sbin/rt-fulltext-indexer.in
+++ b/sbin/rt-fulltext-indexer.in
@@ -323,11 +323,6 @@ This is a helper script to keep full text indexes in sync with data.
 Read F<docs/full_text_indexing.pod> for complete details on how and when
 to run it.
 
-=head1 AUTHOR
-
-Ruslan Zakirov E<lt>ruz at bestpractical.comE<gt>,
-Alex Vandiver E<lt>alexmv at bestpractical.comE<gt>
-
 =cut
 
 __DATA__

commit 5cb2b876afee4525636924613fa341fb3b89bd27
Author: Alex Vandiver <alexmv at bestpractical.com>
Date:   Fri Jul 25 00:28:27 2014 -0400

    Skipping ACL checks yields a sizable performance increase
    
    The indexer spends quite a bit of time checking that the attachment
    content is visible to the current user before indexing it -- all of
    which is unnecessary, as it is run as the system user.  Explicitly
    disable the ACL checking for the indexer, which removes a large number
    of queries and yields a correspondingly sizable increase in indexing
    speed.

diff --git a/sbin/rt-fulltext-indexer.in b/sbin/rt-fulltext-indexer.in
index 29e5ec2..c6ade7a 100644
--- a/sbin/rt-fulltext-indexer.in
+++ b/sbin/rt-fulltext-indexer.in
@@ -139,6 +139,16 @@ EOT
     exit 1;
 }
 
+# Skip ACL checks.  This saves a large number of unnecessary queries
+# (for tickets, ACLs, and users) which are unnecessary, as we are
+# running as the system user.
+{
+    no warnings 'redefine';
+    no warnings 'once';
+    *RT::Attachment::_Value = \&DBIx::SearchBuilder::Record::_Value;
+    *RT::Attachments::Next  = \&DBIx::SearchBuilder::Next;
+}
+
 my @types = qw(text html);
 foreach my $type ( @types ) {
   REDO:

commit 3233a2f306b2c6f7a36467a9613a4a88da422ec7
Author: Alex Vandiver <alexmv at bestpractical.com>
Date:   Fri Jul 25 00:36:49 2014 -0400

    Index attachments in one pass through the database, not two
    
    There is no reason to perform two passes through the database instead of
    one; doing one allows for better progress estimates, as well as
    potentially increasing locality for the database.

diff --git a/sbin/rt-fulltext-indexer.in b/sbin/rt-fulltext-indexer.in
index c6ade7a..d961b02 100644
--- a/sbin/rt-fulltext-indexer.in
+++ b/sbin/rt-fulltext-indexer.in
@@ -149,14 +149,12 @@ EOT
     *RT::Attachments::Next  = \&DBIx::SearchBuilder::Next;
 }
 
-my @types = qw(text html);
-foreach my $type ( @types ) {
-  REDO:
-    my $attachments = attachments($type);
+{
+    my $attachments = attachments();
     $attachments->Limit(
         FIELD => 'id',
         OPERATOR => '>',
-        VALUE => last_indexed($type)
+        VALUE => last_indexed()
     );
     $attachments->OrderBy( FIELD => 'id', ORDER => 'asc' );
     $attachments->RowsPerPage( $OPT{'limit'} || 100 );
@@ -170,14 +168,13 @@ foreach my $type ( @types ) {
         # but Pg doesn't index them; decode to something it can index.
         HTML::Entities::decode_entities($text) if $a->ContentType eq "text/html";
         $found++;
-        process( $type, $a, \$text );
+        process( $a, \$text );
         debug("Processed attachment #". $a->id );
     }
-    goto REDO if $OPT{'all'} and $attachments->Count == ($OPT{'limit'} || 100)
+    redo if $OPT{'all'} and $attachments->Count == ($OPT{'limit'} || 100);
 }
 
 sub attachments {
-    my $type = shift;
     my $res = RT::Attachments->new( RT->SystemUser );
     my $txn_alias = $res->Join(
         ALIAS1 => 'main',
@@ -203,17 +200,17 @@ sub attachments {
         VALUE => 'deleted'
     );
     $res->Limit(
-        FIELD => 'ContentType',
-        VALUE => ($type eq "html" ? "text/html" : "text/plain"),
+        FIELD    => 'ContentType',
+        OPERATOR => 'IN',
+        VALUE    => ['text/plain', 'text/html'],
     );
     return $res;
 }
 
 sub last_indexed {
-    my ($type) = (@_);
     return goto_specific(
         suffix => $db_type,
-        error => "Don't know how to find last indexed $type attachment for $db_type DB",
+        error => "Don't know how to find last indexed attachment for $db_type DB",
         arguments => \@_,
     );
 }
@@ -228,7 +225,7 @@ sub process {
 
 sub last_indexed_mysql { last_indexed_pg(@_); }
 sub process_mysql {
-    my ($type, $attachment, $text) = (@_);
+    my ($attachment, $text) = (@_);
 
     my $dbh = $RT::Handle->dbh;
     my $table = $fts_config->{'Table'};
@@ -244,8 +241,7 @@ sub process_mysql {
 }
 
 sub last_indexed_pg {
-    my $type = shift;
-    my $attachments = attachments( $type );
+    my $attachments = attachments();
     my $alias = 'main';
     if ( $fts_config->{'Table'} && $fts_config->{'Table'} ne 'Attachments' ) {
         $alias = $attachments->Join(
@@ -269,7 +265,7 @@ sub last_indexed_pg {
 }
 
 sub process_pg {
-    my ($type, $attachment, $text) = (@_);
+    my ($attachment, $text) = (@_);
 
     my $dbh = $RT::Handle->dbh;
     my $table = $fts_config->{'Table'};

commit fd11115192659e899b2775c6bda7765a99206032
Author: Alex Vandiver <alexmv at bestpractical.com>
Date:   Fri Jul 25 00:45:35 2014 -0400

    Index attachments even on deleted tickets
    
    Skipping indexing on tickets which are currently deleted may lead to
    content which is later not indexed and not findable.  Even if a ticket
    becomes un-deleted at a later point, it will never again be indexed, as
    last_indexed() will limit to new attachments since the last run.  The
    overhead of joining through Attachments and into Tickets is also not to
    be overlooked -- on MySQL, it causes pessimal performance for large
    systems, involving more than one filesort and temporary table.
    
    Index all attachments, no matter the source, on the premise that
    limiting to non-deleted tickets is better done at query time, not at
    index time.  It is both more timely, and better limited.

diff --git a/sbin/rt-fulltext-indexer.in b/sbin/rt-fulltext-indexer.in
index d961b02..dcd81ae 100644
--- a/sbin/rt-fulltext-indexer.in
+++ b/sbin/rt-fulltext-indexer.in
@@ -150,7 +150,12 @@ EOT
 }
 
 {
-    my $attachments = attachments();
+    my $attachments = RT::Attachments->new( RT->SystemUser );
+    $attachments->Limit(
+        FIELD    => 'ContentType',
+        OPERATOR => 'IN',
+        VALUE    => ['text/plain', 'text/html'],
+    );
     $attachments->Limit(
         FIELD => 'id',
         OPERATOR => '>',
@@ -174,39 +179,6 @@ EOT
     redo if $OPT{'all'} and $attachments->Count == ($OPT{'limit'} || 100);
 }
 
-sub attachments {
-    my $res = RT::Attachments->new( RT->SystemUser );
-    my $txn_alias = $res->Join(
-        ALIAS1 => 'main',
-        FIELD1 => 'TransactionId',
-        TABLE2 => 'Transactions',
-        FIELD2 => 'id',
-    );
-    $res->Limit(
-        ALIAS => $txn_alias,
-        FIELD => 'ObjectType',
-        VALUE => 'RT::Ticket',
-    );
-    my $ticket_alias = $res->Join(
-        ALIAS1 => $txn_alias,
-        FIELD1 => 'ObjectId',
-        TABLE2 => 'Tickets',
-        FIELD2 => 'id',
-    );
-    $res->Limit(
-        ALIAS => $ticket_alias,
-        FIELD => 'Status',
-        OPERATOR => '!=',
-        VALUE => 'deleted'
-    );
-    $res->Limit(
-        FIELD    => 'ContentType',
-        OPERATOR => 'IN',
-        VALUE    => ['text/plain', 'text/html'],
-    );
-    return $res;
-}
-
 sub last_indexed {
     return goto_specific(
         suffix => $db_type,
@@ -241,7 +213,12 @@ sub process_mysql {
 }
 
 sub last_indexed_pg {
-    my $attachments = attachments();
+    my $attachments = RT::Attachments->new( RT->SystemUser );
+    $attachments->Limit(
+        FIELD    => 'ContentType',
+        OPERATOR => 'IN',
+        VALUE    => ['text/plain', 'text/html'],
+    );
     my $alias = 'main';
     if ( $fts_config->{'Table'} && $fts_config->{'Table'} ne 'Attachments' ) {
         $alias = $attachments->Join(

commit bbd81766f00b1f01d975e5be1289e0f9ea60a6af
Author: Alex Vandiver <alexmv at bestpractical.com>
Date:   Fri Jul 25 00:58:24 2014 -0400

    mysql and pg share the same last_indexed; unify the method

diff --git a/sbin/rt-fulltext-indexer.in b/sbin/rt-fulltext-indexer.in
index dcd81ae..64abd61 100644
--- a/sbin/rt-fulltext-indexer.in
+++ b/sbin/rt-fulltext-indexer.in
@@ -180,39 +180,6 @@ EOT
 }
 
 sub last_indexed {
-    return goto_specific(
-        suffix => $db_type,
-        error => "Don't know how to find last indexed attachment for $db_type DB",
-        arguments => \@_,
-    );
-}
-
-sub process {
-    return goto_specific(
-        suffix    => $db_type,
-        error     => "No processer for $db_type DB",
-        arguments => \@_,
-    );
-}
-
-sub last_indexed_mysql { last_indexed_pg(@_); }
-sub process_mysql {
-    my ($attachment, $text) = (@_);
-
-    my $dbh = $RT::Handle->dbh;
-    my $table = $fts_config->{'Table'};
-
-    my $query;
-    if ( my ($id) = $dbh->selectrow_array("SELECT id FROM $table WHERE id = ?", undef, $attachment->id) ) {
-        $query = "UPDATE $table SET Content = ? WHERE id = ?";
-    } else {
-        $query = "INSERT INTO $table(Content, id) VALUES(?, ?)";
-    }
-
-    $dbh->do( $query, undef, $$text, $attachment->id );
-}
-
-sub last_indexed_pg {
     my $attachments = RT::Attachments->new( RT->SystemUser );
     $attachments->Limit(
         FIELD    => 'ContentType',
@@ -241,6 +208,31 @@ sub last_indexed_pg {
     return $res->id;
 }
 
+sub process {
+    return goto_specific(
+        suffix    => $db_type,
+        error     => "No processer for $db_type DB",
+        arguments => \@_,
+    );
+}
+
+
+sub process_mysql {
+    my ($attachment, $text) = (@_);
+
+    my $dbh = $RT::Handle->dbh;
+    my $table = $fts_config->{'Table'};
+
+    my $query;
+    if ( my ($id) = $dbh->selectrow_array("SELECT id FROM $table WHERE id = ?", undef, $attachment->id) ) {
+        $query = "UPDATE $table SET Content = ? WHERE id = ?";
+    } else {
+        $query = "INSERT INTO $table(Content, id) VALUES(?, ?)";
+    }
+
+    $dbh->do( $query, undef, $$text, $attachment->id );
+}
+
 sub process_pg {
     my ($attachment, $text) = (@_);
 

commit 8b51e6cc9eb39c6e78f23e320f153f2a258cfe71
Author: Alex Vandiver <alexmv at bestpractical.com>
Date:   Fri Jul 25 01:05:22 2014 -0400

    Replace the last use of goto_specific with explicit function calls

diff --git a/sbin/rt-fulltext-indexer.in b/sbin/rt-fulltext-indexer.in
index 64abd61..28ce21d 100644
--- a/sbin/rt-fulltext-indexer.in
+++ b/sbin/rt-fulltext-indexer.in
@@ -173,7 +173,11 @@ EOT
         # but Pg doesn't index them; decode to something it can index.
         HTML::Entities::decode_entities($text) if $a->ContentType eq "text/html";
         $found++;
-        process( $a, \$text );
+        if ($db_type eq 'mysql') {
+            process_mysql( $a, \$text );
+        } elsif ($db_type eq 'Pg') {
+            process_pg( $a, \$text );
+        }
         debug("Processed attachment #". $a->id );
     }
     redo if $OPT{'all'} and $attachments->Count == ($OPT{'limit'} || 100);
@@ -208,14 +212,6 @@ sub last_indexed {
     return $res->id;
 }
 
-sub process {
-    return goto_specific(
-        suffix    => $db_type,
-        error     => "No processer for $db_type DB",
-        arguments => \@_,
-    );
-}
-
 
 sub process_mysql {
     my ($attachment, $text) = (@_);
@@ -268,20 +264,6 @@ sub process_pg {
     }
 }
 
-sub goto_specific {
-    my %args = (@_);
-
-    my $func = (caller(1))[3];
-    $func =~ s/.*:://;
-    my $call = $func ."_". lc $args{'suffix'};
-    unless ( defined &$call ) {
-        return undef unless $args{'error'};
-        require Carp; Carp::croak( $args{'error'} );
-    }
-    @_ = @{ $args{'arguments'} };
-    goto &$call;
-}
-
 
 # helper functions
 sub debug    { print @_, "\n" if $OPT{debug}; 1 }

commit b8a79b4b00cacf75ed104777b3c8cb875ab1d85c
Author: Alex Vandiver <alexmv at bestpractical.com>
Date:   Fri Jul 25 15:31:36 2014 -0400

    Simplify last_indexed

diff --git a/sbin/rt-fulltext-indexer.in b/sbin/rt-fulltext-indexer.in
index 28ce21d..ad871bb 100644
--- a/sbin/rt-fulltext-indexer.in
+++ b/sbin/rt-fulltext-indexer.in
@@ -184,35 +184,17 @@ EOT
 }
 
 sub last_indexed {
-    my $attachments = RT::Attachments->new( RT->SystemUser );
-    $attachments->Limit(
-        FIELD    => 'ContentType',
-        OPERATOR => 'IN',
-        VALUE    => ['text/plain', 'text/html'],
-    );
-    my $alias = 'main';
-    if ( $fts_config->{'Table'} && $fts_config->{'Table'} ne 'Attachments' ) {
-        $alias = $attachments->Join(
-            TYPE    => 'left',
-            FIELD1 => 'id',
-            TABLE2  => $fts_config->{'Table'},
-            FIELD2 => 'id',
-        );
+    if ( $db_type eq "mysql" ) {
+        return $dbh->selectrow_arrayref("SELECT MAX(id) FROM $table")->[0];
+    } elsif ( $db_type eq "pg" ) {
+        if ( $fts_config->{'Table'} && $fts_config->{'Table'} ne 'Attachments' ) {
+            return $dbh->selectrow_array("SELECT MAX(id) FROM $table")->[0];
+        } else {
+            return $dbh->selectrow_array("SELECT MAX(id) FROM Attachments WHERE $column IS NOT NULL")->[0];
+        }
     }
-    $attachments->Limit(
-        ALIAS => $alias,
-        FIELD => $fts_config->{'Column'},
-        OPERATOR => 'IS NOT',
-        VALUE => 'NULL',
-    );
-    $attachments->OrderBy( FIELD => 'id', ORDER => 'desc' );
-    $attachments->RowsPerPage( 1 );
-    my $res = $attachments->First;
-    return 0 unless $res;
-    return $res->id;
 }
 
-
 sub process_mysql {
     my ($attachment, $text) = (@_);
 

commit eef5a388fcc72a2932e6b3d9a5886313f35caecb
Author: Alex Vandiver <alexmv at bestpractical.com>
Date:   Fri Jul 25 15:33:58 2014 -0400

    Only call last_indexed once, as it may be heavy

diff --git a/sbin/rt-fulltext-indexer.in b/sbin/rt-fulltext-indexer.in
index ad871bb..d61a12d 100644
--- a/sbin/rt-fulltext-indexer.in
+++ b/sbin/rt-fulltext-indexer.in
@@ -149,6 +149,7 @@ EOT
     *RT::Attachments::Next  = \&DBIx::SearchBuilder::Next;
 }
 
+my $LAST = last_indexed();
 {
     my $attachments = RT::Attachments->new( RT->SystemUser );
     $attachments->Limit(
@@ -159,7 +160,7 @@ EOT
     $attachments->Limit(
         FIELD => 'id',
         OPERATOR => '>',
-        VALUE => last_indexed()
+        VALUE => $LAST,
     );
     $attachments->OrderBy( FIELD => 'id', ORDER => 'asc' );
     $attachments->RowsPerPage( $OPT{'limit'} || 100 );
@@ -180,6 +181,7 @@ EOT
         }
         debug("Processed attachment #". $a->id );
     }
+    $LAST = $attachments->Last->id if $attachments->Count;
     redo if $OPT{'all'} and $attachments->Count == ($OPT{'limit'} || 100);
 }
 

commit 1b79337c4b8d30adec66eee0c0468d9719d98a0e
Author: Alex Vandiver <alexmv at bestpractical.com>
Date:   Fri Jul 25 15:36:21 2014 -0400

    Index even empty attachments

diff --git a/sbin/rt-fulltext-indexer.in b/sbin/rt-fulltext-indexer.in
index d61a12d..920a7a1 100644
--- a/sbin/rt-fulltext-indexer.in
+++ b/sbin/rt-fulltext-indexer.in
@@ -49,6 +49,7 @@
 use strict;
 use warnings;
 no warnings 'once';
+use 5.010;
 
 # fix lib paths, some may be relative
 BEGIN { # BEGIN RT CMD BOILERPLATE
@@ -168,7 +169,7 @@ my $LAST = last_indexed();
     my $found = 0;
     while ( my $a = $attachments->Next ) {
         debug("Found attachment #". $a->id );
-        my $text = $a->Content;
+        my $text = $a->Content // "";
         next unless defined $text && length($text);
         # The rich text editor generates html entities for characters
         # but Pg doesn't index them; decode to something it can index.

commit d9b996e38d7b085dee36800db8e27e2f3ffc9224
Author: Alex Vandiver <alexmv at bestpractical.com>
Date:   Fri Jul 25 15:42:19 2014 -0400

    As last_indexed is based on the highest insert, there will never be an UPDATE needed

diff --git a/sbin/rt-fulltext-indexer.in b/sbin/rt-fulltext-indexer.in
index 920a7a1..e8e48e8 100644
--- a/sbin/rt-fulltext-indexer.in
+++ b/sbin/rt-fulltext-indexer.in
@@ -204,12 +204,7 @@ sub process_mysql {
     my $dbh = $RT::Handle->dbh;
     my $table = $fts_config->{'Table'};
 
-    my $query;
-    if ( my ($id) = $dbh->selectrow_array("SELECT id FROM $table WHERE id = ?", undef, $attachment->id) ) {
-        $query = "UPDATE $table SET Content = ? WHERE id = ?";
-    } else {
-        $query = "INSERT INTO $table(Content, id) VALUES(?, ?)";
-    }
+    my $query = "INSERT INTO $table(Content, id) VALUES(?, ?)";
 
     $dbh->do( $query, undef, $$text, $attachment->id );
 }
@@ -222,12 +217,8 @@ sub process_pg {
     my $column = $fts_config->{'Column'};
 
     my $query;
-    if ( $table ) {
-        if ( my ($id) = $dbh->selectrow_array("SELECT id FROM $table WHERE id = ?", undef, $attachment->id) ) {
-            $query = "UPDATE $table SET $column = to_tsvector(?) WHERE id = ?";
-        } else {
-            $query = "INSERT INTO $table($column, id) VALUES(to_tsvector(?), ?)";
-        }
+    if ( $table ne 'Attachments' ) {
+        $query = "INSERT INTO $table($column, id) VALUES(to_tsvector(?), ?)";
     } else {
         $query = "UPDATE Attachments SET $column = to_tsvector(?) WHERE id = ?";
     }

commit a2a995ba6b844d94e65da4735716acfb48a13fbb
Author: Alex Vandiver <alexmv at bestpractical.com>
Date:   Fri Jul 25 15:45:15 2014 -0400

    Inversion of control of main indexing loops
    
    Rather than having database-dependent if-statements, followed by a
    standard loop which contains further database-dependent if-statements,
    instead turn the loop into a function which can be called from one of
    two database-specific functions.  This is important because MySQL's
    iteration will further diverge from PostgreSQL's in the following
    commits.
    
    This commit is best viewed with:
      git diff --patience --ignore-all-space

diff --git a/sbin/rt-fulltext-indexer.in b/sbin/rt-fulltext-indexer.in
index e8e48e8..83261b5 100644
--- a/sbin/rt-fulltext-indexer.in
+++ b/sbin/rt-fulltext-indexer.in
@@ -48,7 +48,6 @@
 # END BPS TAGGED BLOCK }}}
 use strict;
 use warnings;
-no warnings 'once';
 use 5.010;
 
 # fix lib paths, some may be relative
@@ -150,72 +149,70 @@ EOT
     *RT::Attachments::Next  = \&DBIx::SearchBuilder::Next;
 }
 
-my $LAST = last_indexed();
-{
-    my $attachments = RT::Attachments->new( RT->SystemUser );
-    $attachments->Limit(
-        FIELD    => 'ContentType',
-        OPERATOR => 'IN',
-        VALUE    => ['text/plain', 'text/html'],
-    );
-    $attachments->Limit(
-        FIELD => 'id',
-        OPERATOR => '>',
-        VALUE => $LAST,
-    );
-    $attachments->OrderBy( FIELD => 'id', ORDER => 'asc' );
-    $attachments->RowsPerPage( $OPT{'limit'} || 100 );
-
-    my $found = 0;
-    while ( my $a = $attachments->Next ) {
-        debug("Found attachment #". $a->id );
-        my $text = $a->Content // "";
-        next unless defined $text && length($text);
-        # The rich text editor generates html entities for characters
-        # but Pg doesn't index them; decode to something it can index.
-        HTML::Entities::decode_entities($text) if $a->ContentType eq "text/html";
-        $found++;
-        if ($db_type eq 'mysql') {
-            process_mysql( $a, \$text );
-        } elsif ($db_type eq 'Pg') {
-            process_pg( $a, \$text );
-        }
-        debug("Processed attachment #". $a->id );
-    }
-    $LAST = $attachments->Last->id if $attachments->Count;
-    redo if $OPT{'all'} and $attachments->Count == ($OPT{'limit'} || 100);
+my $LAST;
+if ($db_type eq 'mysql') {
+    process_mysql();
+} elsif ($db_type eq 'Pg') {
+    process_pg();
 }
 
-sub last_indexed {
-    if ( $db_type eq "mysql" ) {
-        return $dbh->selectrow_arrayref("SELECT MAX(id) FROM $table")->[0];
-    } elsif ( $db_type eq "pg" ) {
-        if ( $fts_config->{'Table'} && $fts_config->{'Table'} ne 'Attachments' ) {
-            return $dbh->selectrow_array("SELECT MAX(id) FROM $table")->[0];
-        } else {
-            return $dbh->selectrow_array("SELECT MAX(id) FROM Attachments WHERE $column IS NOT NULL")->[0];
-        }
+sub attachment_loop {
+    my $subref = shift;
+    my $table = $fts_config->{'Table'};
+    $LAST //= 0;
+
+    # Fetch in batches of size --limit
+    {
+        # Indexes all text/plain and text/html attachments
+        my $attachments = RT::Attachments->new( RT->SystemUser );
+        $attachments->Limit(
+            FIELD    => 'ContentType',
+            OPERATOR => 'IN',
+            VALUE    => ['text/plain', 'text/html'],
+        );
+        $attachments->Limit( FIELD => 'id', OPERATOR => '>', VALUE => $LAST );
+        $attachments->OrderBy( FIELD => 'id', ORDER => 'asc' );
+        $attachments->RowsPerPage( $OPT{'limit'} || 100 );
+
+        # Call back to the DB-specific part
+        $subref->($attachments);
+
+        $LAST = $attachments->Last->id if $attachments->Count;
+
+        redo if $OPT{'all'} and $attachments->Count == ($OPT{'limit'} || 100);
     }
 }
 
 sub process_mysql {
-    my ($attachment, $text) = (@_);
-
     my $dbh = $RT::Handle->dbh;
     my $table = $fts_config->{'Table'};
 
+    ($LAST) = $dbh->selectrow_array("SELECT MAX(id) FROM $table");
+
     my $query = "INSERT INTO $table(Content, id) VALUES(?, ?)";
 
-    $dbh->do( $query, undef, $$text, $attachment->id );
+    attachment_loop( sub {
+        my ($attachments) = @_;
+        while ( my $a = $attachments->Next ) {
+            debug("Found attachment #". $a->id );
+            my $text = $a->Content // "";
+            HTML::Entities::decode_entities($text) if $a->ContentType eq "text/html";
+            $dbh->do( $query, undef, $text, $a->id );
+        }
+    });
 }
 
 sub process_pg {
-    my ($attachment, $text) = (@_);
-
     my $dbh = $RT::Handle->dbh;
     my $table = $fts_config->{'Table'};
     my $column = $fts_config->{'Column'};
 
+    if ( $table ne 'Attachments' ) {
+        ($LAST) = $dbh->selectrow_array("SELECT MAX(id) FROM $table");
+    } else {
+        ($LAST) = $dbh->selectrow_array("SELECT MAX(id) FROM Attachments WHERE $column IS NOT NULL");
+    }
+
     my $query;
     if ( $table ne 'Attachments' ) {
         $query = "INSERT INTO $table($column, id) VALUES(to_tsvector(?), ?)";
@@ -223,21 +220,31 @@ sub process_pg {
         $query = "UPDATE Attachments SET $column = to_tsvector(?) WHERE id = ?";
     }
 
-    my $status = eval { $dbh->do( $query, undef, $$text, $attachment->id ) };
-    unless ( $status ) {
-        if ( $dbh->err == 7  && $dbh->state eq '54000' ) {
-            warn "Attachment @{[$attachment->id]} cannot be indexed. Most probably it contains too many unique words. Error: ". $dbh->errstr;
-        } elsif ( $dbh->err == 7 && $dbh->state eq '22021' ) {
-            warn "Attachment @{[$attachment->id]} cannot be indexed. Most probably it contains invalid UTF8 bytes. Error: ". $dbh->errstr;
-        } else {
-            die "error: ". $dbh->errstr;
+    attachment_loop( sub {
+        my ($attachments) = @_;
+        while ( my $a = $attachments->Next ) {
+            debug("Found attachment #". $a->id );
+
+            my $text = $a->Content // "";
+            HTML::Entities::decode_entities($text) if $a->ContentType eq "text/html";
+
+            my $status = eval { $dbh->do( $query, undef, $text, $a->id ) };
+            unless ( $status ) {
+                if ( $dbh->err == 7  && $dbh->state eq '54000' ) {
+                    warn "Attachment @{[$a->id]} cannot be indexed. Most probably it contains too many unique words. Error: ". $dbh->errstr;
+                } elsif ( $dbh->err == 7 && $dbh->state eq '22021' ) {
+                    warn "Attachment @{[$a->id]} cannot be indexed. Most probably it contains invalid UTF8 bytes. Error: ". $dbh->errstr;
+                } else {
+                    die "error: ". $dbh->errstr;
+                }
+
+                # Insert an empty tsvector, so we count this row as "indexed"
+                # for purposes of knowing where to pick up
+                eval { $dbh->do( $query, undef, "", $a->id ) }
+                    or die "Failed to insert empty row: " . $dbh->errstr;
+            }
         }
-
-        # Insert an empty tsvector, so we count this row as "indexed"
-        # for purposes of knowing where to pick up
-        eval { $dbh->do( $query, undef, "", $attachment->id ) }
-            or die "Failed to insert empty tsvector: " . $dbh->errstr;
-    }
+    });
 }
 
 

commit c0cce1089103b2d5e9be502f3a71441912dd2c6b
Author: Alex Vandiver <alexmv at bestpractical.com>
Date:   Fri Jul 25 01:37:10 2014 -0400

    Switch to preparing statements, rather than just setting strings
    
    Prepared statements provide a small speed benefit, removing the need for
    the database to re-parse the query string.

diff --git a/sbin/rt-fulltext-indexer.in b/sbin/rt-fulltext-indexer.in
index 83261b5..7feb176 100644
--- a/sbin/rt-fulltext-indexer.in
+++ b/sbin/rt-fulltext-indexer.in
@@ -189,7 +189,7 @@ sub process_mysql {
 
     ($LAST) = $dbh->selectrow_array("SELECT MAX(id) FROM $table");
 
-    my $query = "INSERT INTO $table(Content, id) VALUES(?, ?)";
+    my $sth = $dbh->prepare("INSERT INTO $table(Content, id) VALUES(?, ?)");
 
     attachment_loop( sub {
         my ($attachments) = @_;
@@ -197,7 +197,7 @@ sub process_mysql {
             debug("Found attachment #". $a->id );
             my $text = $a->Content // "";
             HTML::Entities::decode_entities($text) if $a->ContentType eq "text/html";
-            $dbh->do( $query, undef, $text, $a->id );
+            $sth->execute( $text, $a->id );
         }
     });
 }
@@ -213,11 +213,11 @@ sub process_pg {
         ($LAST) = $dbh->selectrow_array("SELECT MAX(id) FROM Attachments WHERE $column IS NOT NULL");
     }
 
-    my $query;
+    my $sth;
     if ( $table ne 'Attachments' ) {
-        $query = "INSERT INTO $table($column, id) VALUES(to_tsvector(?), ?)";
+        $sth = $dbh->prepare("INSERT INTO $table($column, id) VALUES(to_tsvector(?), ?)");
     } else {
-        $query = "UPDATE Attachments SET $column = to_tsvector(?) WHERE id = ?";
+        $sth = $dbh->prepare("UPDATE Attachments SET $column = to_tsvector(?) WHERE id = ?");
     }
 
     attachment_loop( sub {
@@ -228,7 +228,7 @@ sub process_pg {
             my $text = $a->Content // "";
             HTML::Entities::decode_entities($text) if $a->ContentType eq "text/html";
 
-            my $status = eval { $dbh->do( $query, undef, $text, $a->id ) };
+            my $status = eval { $sth->execute( $text, $a->id ) };
             unless ( $status ) {
                 if ( $dbh->err == 7  && $dbh->state eq '54000' ) {
                     warn "Attachment @{[$a->id]} cannot be indexed. Most probably it contains too many unique words. Error: ". $dbh->errstr;
@@ -240,7 +240,7 @@ sub process_pg {
 
                 # Insert an empty tsvector, so we count this row as "indexed"
                 # for purposes of knowing where to pick up
-                eval { $dbh->do( $query, undef, "", $a->id ) }
+                eval { $sth->execute( "", $a->id ) }
                     or die "Failed to insert empty row: " . $dbh->errstr;
             }
         }

commit 8f3c2eda6b853bb1889d578d5bb52a9952272acf
Author: Alex Vandiver <alexmv at bestpractical.com>
Date:   Fri Jul 25 01:39:22 2014 -0400

    INSERT DELAYED provides notable speed benefits on MyISAM
    
    MySQL supports INSERT DELAYED[1] for MyISAM (but not InnoDB) tables.  This
    allows the server to defer inserts until it has a good opportunity to
    write them, and to write them in bulk.  While there is a small risk of
    data loss (if the server is terminated before the data is written) this
    poses no problem for an AttachmentsIndex table, for which all inserted
    data is trivial to re-generate.
    
    [1] http://dev.mysql.com/doc/refman/5.1/en/insert-delayed.html

diff --git a/lib/RT/Config.pm b/lib/RT/Config.pm
index b976b3a..c634291 100644
--- a/lib/RT/Config.pm
+++ b/lib/RT/Config.pm
@@ -608,6 +608,7 @@ our %META;
                     } else {
                         # Internal, one-column table
                         $v->{Column} = 'Content';
+                        $v->{Engine} = $engine;
                     }
                 }
             } else {
diff --git a/sbin/rt-fulltext-indexer.in b/sbin/rt-fulltext-indexer.in
index 7feb176..b2ac922 100644
--- a/sbin/rt-fulltext-indexer.in
+++ b/sbin/rt-fulltext-indexer.in
@@ -189,7 +189,8 @@ sub process_mysql {
 
     ($LAST) = $dbh->selectrow_array("SELECT MAX(id) FROM $table");
 
-    my $sth = $dbh->prepare("INSERT INTO $table(Content, id) VALUES(?, ?)");
+    my $insert = $fts_config->{Engine} eq "MyISAM" ? "INSERT DELAYED" : "INSERT";
+    my $sth = $dbh->prepare("$insert INTO $table(Content, id) VALUES(?, ?)");
 
     attachment_loop( sub {
         my ($attachments) = @_;

commit b283fa019e231d5d5c30a557be66107be7ad120b
Author: Alex Vandiver <alexmv at bestpractical.com>
Date:   Fri Jul 25 01:40:16 2014 -0400

    Improve MySQL insert speed by batching inserts into one statement
    
    MySQL must flush buffers after every insert statement; as such,
    providing large numbers of INSERT statements is quite inefficient, as
    most time is spent in disk I/O.  Instead, store rows to be inserted, and
    insert them batch-by-batch.
    
    This technique is not applicable to PostgreSQL because failure of the
    to_tsvector call to convert a string would abort the entire insert.
    Additionally, most installs use an additional column on the existing
    table, which requires an UPDATE, and not an INSERT, which is not easily
    batched.

diff --git a/sbin/rt-fulltext-indexer.in b/sbin/rt-fulltext-indexer.in
index b2ac922..b3041b3 100644
--- a/sbin/rt-fulltext-indexer.in
+++ b/sbin/rt-fulltext-indexer.in
@@ -189,17 +189,30 @@ sub process_mysql {
 
     ($LAST) = $dbh->selectrow_array("SELECT MAX(id) FROM $table");
 
+    # Doing large inserts is faster than individual statements, but
+    # comes at a parsing cost; cache the statement handles (99% of which
+    # will be the same size) for a notable (2x) speed gain.
+    my %sthandles;
     my $insert = $fts_config->{Engine} eq "MyISAM" ? "INSERT DELAYED" : "INSERT";
-    my $sth = $dbh->prepare("$insert INTO $table(Content, id) VALUES(?, ?)");
 
     attachment_loop( sub {
         my ($attachments) = @_;
+        my @insert;
+        my $found = 0;
         while ( my $a = $attachments->Next ) {
             debug("Found attachment #". $a->id );
             my $text = $a->Content // "";
             HTML::Entities::decode_entities($text) if $a->ContentType eq "text/html";
-            $sth->execute( $text, $a->id );
+            push @insert, $text, $a->id;
+            $found++;
         }
+        return unless $found;
+
+        # $found should be the limit size on all but the last go-around.
+        $sthandles{$found} ||=
+            $dbh->prepare("$insert INTO $table(Content, id) VALUES "
+                              . join(", ", ("(?,?)") x $found));
+        $sthandles{$found}->execute(@insert);
     });
 }
 

commit 8c82225e7598e50d40d30dce6b719ab931ba10f3
Author: Alex Vandiver <alexmv at bestpractical.com>
Date:   Fri Jul 25 15:49:17 2014 -0400

    Testing finds 200 is a good default batch size
    
    Smaller batch sizes are slightly less efficient; the main trade-off is
    in maximum memory consumption.

diff --git a/docs/full_text_indexing.pod b/docs/full_text_indexing.pod
index 339625b..180122b 100644
--- a/docs/full_text_indexing.pod
+++ b/docs/full_text_indexing.pod
@@ -45,10 +45,9 @@ To keep the index up-to-date, you will need to run:
 
     /opt/rt4/sbin/rt-fulltext-indexer
 
-...at regular intervals.  By default, this will only tokenize up to 100
-tickets at a time; you can adjust this upwards by passing
-C<--limit 500>.  Larger batch sizes will take longer and
-consume more memory.
+...at regular intervals.  By default, this will only tokenize up to 200
+tickets at a time; you can adjust this upwards by passing C<--limit
+500>.  Larger batch sizes will take longer and consume more memory.
 
 If there is already an instances of C<rt-fulltext-indexer> running, new
 ones will exit abnormally (with exit code 1) and the error message
@@ -95,10 +94,9 @@ To keep the index up-to-date, you will need to run:
 
     /opt/rt4/sbin/rt-fulltext-indexer
 
-...at regular intervals.  By default, this will only tokenize up to 100
-tickets at a time; you can adjust this upwards by passing
-C<--limit 500>.  Larger batch sizes will take longer and
-consume more memory.
+...at regular intervals.  By default, this will only tokenize up to 200
+tickets at a time; you can adjust this upwards by passing C<--limit
+500>.  Larger batch sizes will take longer and consume more memory.
 
 If there is already an instances of C<rt-fulltext-indexer> running, new
 ones will exit abnormally (with exit code 1) and the error message
diff --git a/sbin/rt-fulltext-indexer.in b/sbin/rt-fulltext-indexer.in
index b3041b3..50a9980 100644
--- a/sbin/rt-fulltext-indexer.in
+++ b/sbin/rt-fulltext-indexer.in
@@ -83,6 +83,7 @@ GetOptions( \%OPT,
 
     "memory=s",
 );
+$OPT{limit} ||= 200;
 
 RT::Interface::CLI->ShowHelp if $OPT{help};
 
@@ -172,14 +173,14 @@ sub attachment_loop {
         );
         $attachments->Limit( FIELD => 'id', OPERATOR => '>', VALUE => $LAST );
         $attachments->OrderBy( FIELD => 'id', ORDER => 'asc' );
-        $attachments->RowsPerPage( $OPT{'limit'} || 100 );
+        $attachments->RowsPerPage( $OPT{'limit'} );
 
         # Call back to the DB-specific part
         $subref->($attachments);
 
         $LAST = $attachments->Last->id if $attachments->Count;
 
-        redo if $OPT{'all'} and $attachments->Count == ($OPT{'limit'} || 100);
+        redo if $OPT{'all'} and $attachments->Count == $OPT{'limit'};
     }
 }
 

commit a22e1a1ab91b7c1344ec75226070ad93acf8d2a3
Author: Alex Vandiver <alexmv at bestpractical.com>
Date:   Fri Aug 1 14:13:00 2014 -0400

    Indexing of "text" may fail for content with invalid UTF-8 byte sequences
    
    Prior to f04f561f, content that claimed to be UTF-8 was inserted into
    the database verbatim.  This errored immediately upon insertion for
    PostgreSQL, as the Content column is of type "TEXT"; MySQL, however,
    did not attempt to validate the bytes, as its Content column is of type
    "LONGBLOB".
    
    This thus only causes errors when attempting to insert the characters
    into a "TEXT" column.  As multiple rows are inserted in batches, use the
    row number from the error message to determine which row to blank, and
    re-attempt insertion.

diff --git a/sbin/rt-fulltext-indexer.in b/sbin/rt-fulltext-indexer.in
index 50a9980..57407b9 100644
--- a/sbin/rt-fulltext-indexer.in
+++ b/sbin/rt-fulltext-indexer.in
@@ -213,7 +213,22 @@ sub process_mysql {
         $sthandles{$found} ||=
             $dbh->prepare("$insert INTO $table(Content, id) VALUES "
                               . join(", ", ("(?,?)") x $found));
-        $sthandles{$found}->execute(@insert);
+      TRY: {
+            my $status = eval { $sthandles{$found}->execute(@insert); };
+            unless ( $status ) {
+                my ($row) = $dbh->errstr =~ /\brow (\d+)\b/;
+                die $dbh->errstr unless $row;
+
+                my ($content, $id) = ($insert[($row - 1)*2], $insert[($row - 1)*2 +1]);
+                if ($dbh->err == 1366 and $dbh->state eq "HY000") {
+                    warn "Attachment $id cannot be indexed. Most probably it contains invalid UTF8 bytes. Error: ". $dbh->errstr;
+                    $insert[($row - 1)*2] = "";
+                    redo TRY;
+                } else {
+                    die "Attachment $id: ".$dbh->errstr;
+                }
+            }
+        }
     });
 }
 

commit 54d4681040254f181d4d900feb290a9bdd365fd0
Author: Alex Vandiver <alexmv at bestpractical.com>
Date:   Thu Nov 6 16:29:56 2014 -0800

    Refactor PostgreSQL's insert to also do bulk insertion
    
    PostgreSQL's full-text search can use a separate column, or a separate
    table.  In the case of a separate table, the same bulk-insertion trick
    as MySQL uses can also be used to noticeably speed up indexing time.
    
    Refactor the MySQL loop to be used in both cases.  Note that instead of
    attempting to parse the error message (which only works in English
    locales for MySQL, and does not work at all in PostgreSQL) the rows are
    retried one-at-a-time.  This also better catches the case
    where multiple errors occurred in one bulk insert.
    
    This provides notable speed benefits: 5x for GIN, and 30x for GiST. Note
    that this does not speed up the default configuration for PostgreSQL,
    wherein the index lies in the Attachments table, and thus UPDATE
    statements must be used.

diff --git a/sbin/rt-fulltext-indexer.in b/sbin/rt-fulltext-indexer.in
index 57407b9..9acc7a4 100644
--- a/sbin/rt-fulltext-indexer.in
+++ b/sbin/rt-fulltext-indexer.in
@@ -184,17 +184,17 @@ sub attachment_loop {
     }
 }
 
-sub process_mysql {
+sub process_bulk_insert {
     my $dbh = $RT::Handle->dbh;
-    my $table = $fts_config->{'Table'};
-
-    ($LAST) = $dbh->selectrow_array("SELECT MAX(id) FROM $table");
+    my ($statement, $error) = @_;
 
     # Doing large inserts is faster than individual statements, but
     # comes at a parsing cost; cache the statement handles (99% of which
     # will be the same size) for a notable (2x) speed gain.
     my %sthandles;
-    my $insert = $fts_config->{Engine} eq "MyISAM" ? "INSERT DELAYED" : "INSERT";
+
+    $sthandles{1} =
+        $dbh->prepare($statement->(1));
 
     attachment_loop( sub {
         my ($attachments) = @_;
@@ -210,45 +210,95 @@ sub process_mysql {
         return unless $found;
 
         # $found should be the limit size on all but the last go-around.
-        $sthandles{$found} ||=
-            $dbh->prepare("$insert INTO $table(Content, id) VALUES "
-                              . join(", ", ("(?,?)") x $found));
-      TRY: {
-            my $status = eval { $sthandles{$found}->execute(@insert); };
-            unless ( $status ) {
-                my ($row) = $dbh->errstr =~ /\brow (\d+)\b/;
-                die $dbh->errstr unless $row;
-
-                my ($content, $id) = ($insert[($row - 1)*2], $insert[($row - 1)*2 +1]);
-                if ($dbh->err == 1366 and $dbh->state eq "HY000") {
-                    warn "Attachment $id cannot be indexed. Most probably it contains invalid UTF8 bytes. Error: ". $dbh->errstr;
-                    $insert[($row - 1)*2] = "";
-                    redo TRY;
-                } else {
-                    die "Attachment $id: ".$dbh->errstr;
-                }
-            }
+        $sthandles{$found} ||= $dbh->prepare($statement->($found));
+
+        return if eval { $sthandles{$found}->execute(@insert); };
+
+        # We can catch and recover from some errors; re-do row-by-row to
+        # know which row had which errors
+        while (@insert) {
+            my ($content, $id) = splice(@insert,0,2);
+            next if eval { $sthandles{1}->execute($content, $id); };
+            $error->($id, $content);
+
+            # If this was a semi-expected error, insert an empty
+            # tsvector, so we count this row as "indexed" for
+            # purposes of knowing where to pick up
+            eval { $sthandles{1}->execute( "", $id ) }
+                or die "Failed to insert empty row for attachment $id: " . $dbh->errstr;
         }
     });
 }
 
-sub process_pg {
+sub process_mysql {
     my $dbh = $RT::Handle->dbh;
     my $table = $fts_config->{'Table'};
-    my $column = $fts_config->{'Column'};
 
-    if ( $table ne 'Attachments' ) {
-        ($LAST) = $dbh->selectrow_array("SELECT MAX(id) FROM $table");
-    } else {
-        ($LAST) = $dbh->selectrow_array("SELECT MAX(id) FROM Attachments WHERE $column IS NOT NULL");
-    }
+    ($LAST) = $dbh->selectrow_array("SELECT MAX(id) FROM $table");
 
-    my $sth;
-    if ( $table ne 'Attachments' ) {
-        $sth = $dbh->prepare("INSERT INTO $table($column, id) VALUES(to_tsvector(?), ?)");
+    my $insert = $fts_config->{Engine} eq "MyISAM" ? "INSERT DELAYED" : "INSERT";
+
+    process_bulk_insert(
+        sub {
+            my ($n) = @_;
+            return "$insert INTO $table(Content, id) VALUES "
+                . join(", ", ("(?,?)") x $n);
+        },
+        sub {
+            my ($id) = @_;
+            if ($dbh->err == 1366 and $dbh->state eq "HY000") {
+                warn "Attachment $id cannot be indexed. Most probably it contains invalid UTF8 bytes. ".
+                    "Error: ". $dbh->errstr;
+            } else {
+                die "Attachment $id cannot be indexed: " . $dbh->errstr;
+            }
+        }
+    );
+}
+
+
+sub process_pg {
+    if ( $fts_config->{'Table'} ne 'Attachments' ) {
+        process_pg_insert();
     } else {
-        $sth = $dbh->prepare("UPDATE Attachments SET $column = to_tsvector(?) WHERE id = ?");
+        process_pg_update();
     }
+}
+
+sub process_pg_insert {
+    my $dbh = $RT::Handle->dbh;
+    my $table = $fts_config->{'Table'};
+    my $column = $fts_config->{'Column'};
+    ($LAST) = $dbh->selectrow_array("SELECT MAX(id) FROM $table");
+
+    process_bulk_insert(
+        sub {
+            my ($n) = @_;
+            return "INSERT INTO $table($column, id) VALUES "
+                . join(", ", ("(TO_TSVECTOR(?),?)") x $n);
+        },
+        sub {
+            my ($id) = @_;
+            if ( $dbh->err == 7 && $dbh->state eq '54000' ) {
+                warn "Attachment $id cannot be indexed. Most probably it contains too many unique words. ".
+                  "Error: ". $dbh->errstr;
+            } elsif ( $dbh->err == 7 && $dbh->state eq '22021' ) {
+                warn "Attachment $id cannot be indexed. Most probably it contains invalid UTF8 bytes. ".
+                  "Error: ". $dbh->errstr;
+            } else {
+                die "Attachment $id cannot be indexed: " . $dbh->errstr;
+            }
+        }
+    );
+}
+
+sub process_pg_update {
+    my $dbh = $RT::Handle->dbh;
+    my $column = $fts_config->{'Column'};
+
+    ($LAST) = $dbh->selectrow_array("SELECT MAX(id) FROM Attachments WHERE $column IS NOT NULL");
+
+    $sth = $dbh->prepare("UPDATE Attachments SET $column = to_tsvector(?) WHERE id = ?");
 
     attachment_loop( sub {
         my ($attachments) = @_;

commit da757e7d80e7466c63877ee51fcd62e0b3e69cb7
Author: Alex Vandiver <alexmv at bestpractical.com>
Date:   Thu Nov 6 17:07:58 2014 -0800

    Perform PostgreSQL UPDATE statements inside of a database transaction
    
    This attempts to reduce the number of write operations necessary, by
    allowing PostgreSQL to batch writes, doing them only at COMMIT time.
    This requires optimistically assuming that all UPDATEs will succeed --
    and if any one does not, redoing all of them one-by-one.  In cases where
    indexing errors are frequent (at least one per two batches), this will
    lead to a decrease in performance.  However, in most cases it results in
    a notable performance increase: 3x for GIN, 10x for GiST.

diff --git a/sbin/rt-fulltext-indexer.in b/sbin/rt-fulltext-indexer.in
index 9acc7a4..c37cf38 100644
--- a/sbin/rt-fulltext-indexer.in
+++ b/sbin/rt-fulltext-indexer.in
@@ -298,31 +298,54 @@ sub process_pg_update {
 
     ($LAST) = $dbh->selectrow_array("SELECT MAX(id) FROM Attachments WHERE $column IS NOT NULL");
 
-    $sth = $dbh->prepare("UPDATE Attachments SET $column = to_tsvector(?) WHERE id = ?");
+    my $sth = $dbh->prepare("UPDATE Attachments SET $column = TO_TSVECTOR(?) WHERE id = ?");
 
     attachment_loop( sub {
         my ($attachments) = @_;
+        my @insert;
         while ( my $a = $attachments->Next ) {
             debug("Found attachment #". $a->id );
 
             my $text = $a->Content // "";
             HTML::Entities::decode_entities($text) if $a->ContentType eq "text/html";
 
-            my $status = eval { $sth->execute( $text, $a->id ) };
-            unless ( $status ) {
-                if ( $dbh->err == 7  && $dbh->state eq '54000' ) {
-                    warn "Attachment @{[$a->id]} cannot be indexed. Most probably it contains too many unique words. Error: ". $dbh->errstr;
-                } elsif ( $dbh->err == 7 && $dbh->state eq '22021' ) {
-                    warn "Attachment @{[$a->id]} cannot be indexed. Most probably it contains invalid UTF8 bytes. Error: ". $dbh->errstr;
-                } else {
-                    die "error: ". $dbh->errstr;
-                }
-
-                # Insert an empty tsvector, so we count this row as "indexed"
-                # for purposes of knowing where to pick up
-                eval { $sth->execute( "", $a->id ) }
-                    or die "Failed to insert empty row: " . $dbh->errstr;
+            push @insert, [$text, $a->id];
+        }
+
+        # Try in one database transaction; if it fails, we roll it back
+        # and try one statement at a time.
+        $dbh->begin_work;
+        my $ok = 1;
+        for (@insert) {
+            $ok = eval { $sth->execute( $_->[0], $_->[1] ) };
+            last unless $ok;
+        }
+        if ($ok) {
+            $dbh->commit;
+            return;
+        }
+        $dbh->rollback;
+
+        # Things didn't go well.  Retry the UPDATE statements one row at
+        # a time, outside of the transaction.
+        for (@insert) {
+            my ($content, $id) = ($_->[0], $_->[1]);
+            next if eval { $sth->execute( $content, $id ) };
+            if ( $dbh->err == 7  && $dbh->state eq '54000' ) {
+                warn "Attachment $id cannot be indexed. Most probably it contains too many unique words. ".
+                  "Error: ". $dbh->errstr;
+            } elsif ( $dbh->err == 7 && $dbh->state eq '22021' ) {
+                warn "Attachment $id cannot be indexed. Most probably it contains invalid UTF8 bytes. ".
+                  "Error: ". $dbh->errstr;
+            } else {
+                die "Attachment $id cannot be indexed: " . $dbh->errstr;
             }
+
+            # If this was a semi-expected error, insert an empty
+            # tsvector, so we count this row as "indexed" for
+            # purposes of knowing where to pick up
+            eval { $sth->execute( "", $id ) }
+                or die "Failed to insert empty row for attachment $id: " . $dbh->errstr;
         }
     });
 }

commit 2003f537e1182a161eef459d14de17701c4c5dcc
Author: Alex Vandiver <alexmv at bestpractical.com>
Date:   Thu Nov 6 17:41:06 2014 -0800

    If a new table is used for indexing, grant rights on it
    
    Without this, the new table (created by the DBA user) cannot be read or
    inserted into by the RT user, resulting in errors when indexing.

diff --git a/sbin/rt-setup-fulltext-index.in b/sbin/rt-setup-fulltext-index.in
index b5659c7..2c741d7 100644
--- a/sbin/rt-setup-fulltext-index.in
+++ b/sbin/rt-setup-fulltext-index.in
@@ -307,16 +307,20 @@ elsif ( $DB{'type'} eq 'Pg' ) {
         silent  => !$OPT{'ask'},
     );
 
-    my $schema;
+    my @schema;
     my $drop;
     if ( lc($table) eq 'attachments' ) {
         $drop = "ALTER TABLE $table DROP COLUMN $column";
-        $schema = "ALTER TABLE $table ADD COLUMN $column tsvector";
+        push @schema, "ALTER TABLE $table ADD COLUMN $column tsvector";
     } else {
         $drop = "DROP TABLE $table";
-        $schema = "CREATE TABLE $table ( "
-            ."id INTEGER NOT NULL,"
-            ."$column tsvector )";
+        push @schema, split /;\n+/, <<SCHEMA;
+CREATE TABLE $table (
+    id SERIAL,
+    $column tsvector
+);
+GRANT SELECT, INSERT, UPDATE, DELETE ON $table TO "$DB{user}"
+SCHEMA
     }
 
     my $index_type = lc($OPT{'index-type'} || '');
@@ -328,11 +332,11 @@ elsif ( $DB{'type'} eq 'Pg' ) {
             silent  => !$OPT{'ask'},
         );
     }
+    push @schema, "CREATE INDEX ${column}_idx ON $table USING $index_type($column)";
 
     do_error_is_ok( dba_handle() => $drop )
         unless $OPT{'dryrun'};
-    insert_schema( $schema );
-    insert_schema("CREATE INDEX ${column}_idx ON $table USING $index_type($column)");
+    insert_schema( $_ ) for @schema;
 
     print_rt_config( Table => $table, Column => $column );
 }

commit 77e1da6f2ca179ea1e8b4dec939f23203eaf1d59
Author: Alex Vandiver <alexmv at bestpractical.com>
Date:   Tue Nov 11 00:35:53 2014 -0500

    Insert data to index before creating the index
    
    Having to update the index after every insertion is a notable
    performance hit; it is much more performant to create the index after
    all data has been inserted.
    
    Move the initial run of rt-fulltext-indexer to after the requisite
    tables have been set up, but prior to creating the index and displaying
    the required RT configuration.  This requires serializing the intended
    %FullTextSearch configuration between rt-setup-fulltext-index and
    rt-fulltext-indexer, as the RT_SiteConfig file will not have been
    updated until after both have run; thus, serialize the configuration via
    an environment variable.
    
    The documentation is updated to note that running
    rt-setup-fulltext-index is now a time-intensive operation.

diff --git a/docs/full_text_indexing.pod b/docs/full_text_indexing.pod
index 180122b..24169cb 100644
--- a/docs/full_text_indexing.pod
+++ b/docs/full_text_indexing.pod
@@ -28,16 +28,14 @@ you may need to pass the C<--dba> or C<--dba-password> options:
 
     /opt/rt4/sbin/rt-setup-fulltext-index --dba postgres --dba-password secret
 
-This will also output an appropriate C<%FullTextSearch> configuration to
-add to your F<RT_SiteConfig.pm>; you will need to restart your webserver
-after making these changes.  However, the index will also need to be
-filled before it can be used.  To update the index initially, run:
+This will then tokenize and index all existing attachments in your
+database; it may take quite a while if your database already has a large
+number of tickets in it.
 
-    /opt/rt4/sbin/rt-fulltext-indexer --all
+Finally, it will output an appropriate C<%FullTextSearch> configuration
+to add to your F<RT_SiteConfig.pm>; you will need to restart your
+webserver after making these changes.
 
-This will tokenize and index all existing attachments in your database;
-it may take quite a while if your database already has a large number of
-tickets in it.
 
 =head2 Updating the index
 
@@ -77,16 +75,14 @@ you may need to pass the C<--dba> or C<--dba-password> options:
 
     /opt/rt4/sbin/rt-setup-fulltext-index --dba root --dba-password secret
 
-This will also output an appropriate C<%FullTextSearch> configuration to
-add to your F<RT_SiteConfig.pm>; you will need to restart your webserver
-after making these changes.  However, the index will also need to be
-filled before it can be used.  To update the index initially, run:
+This will then tokenize and index all existing attachments in your
+database; it may take quite a while if your database already has a large
+number of tickets in it.
 
-    /opt/rt4/sbin/rt-fulltext-indexer --all
+Finally, it will output an appropriate C<%FullTextSearch> configuration
+to add to your F<RT_SiteConfig.pm>; you will need to restart your
+webserver after making these changes.
 
-This will tokenize and index all existing attachments in your database;
-it may take quite a while if your database already has a large number of
-tickets in it.
 
 =head3 Updating the index
 
diff --git a/sbin/rt-fulltext-indexer.in b/sbin/rt-fulltext-indexer.in
index c37cf38..bd55adb 100644
--- a/sbin/rt-fulltext-indexer.in
+++ b/sbin/rt-fulltext-indexer.in
@@ -100,7 +100,8 @@ if ( !flock main::DATA, LOCK_EX | LOCK_NB ) {
 }
 
 my $db_type = RT->Config->Get('DatabaseType');
-my $fts_config = RT->Config->Get('FullTextSearch') || {};
+my $fts_config = $ENV{RT_FTS_CONFIG} ? JSON::from_json($ENV{RT_FTS_CONFIG})
+    : RT->Config->Get('FullTextSearch') || {};
 unless ( $fts_config->{'Enable'} ) {
     print STDERR <<EOT;
 
diff --git a/sbin/rt-setup-fulltext-index.in b/sbin/rt-setup-fulltext-index.in
index 2c741d7..2c024d7 100644
--- a/sbin/rt-setup-fulltext-index.in
+++ b/sbin/rt-setup-fulltext-index.in
@@ -121,6 +121,7 @@ GetOptions(
 
     'dba=s'          => \$DB{'admin'},
     'dba-password=s' => \$DB{'admin_password'},
+    'limit=i'        => \$DB{'batch-size'},
 ) or show_help();
 
 if ( $OPT{'help'} || (!$DB{'admin'} && $DB{'type'} eq 'Oracle' ) ) {
@@ -175,9 +176,13 @@ if ( $DB{'type'} eq 'mysql' ) {
     my $engine = $RT::Handle->dbh->{mysql_serverversion} < 50600 ? "MyISAM" : "InnoDB";
     my $schema = "CREATE TABLE $table ( "
         ."id INT UNSIGNED AUTO_INCREMENT NOT NULL PRIMARY KEY,"
-        ."Content LONGTEXT, FULLTEXT(Content) ) ENGINE=$engine CHARACTER SET utf8";
+        ."Content LONGTEXT ) ENGINE=$engine CHARACTER SET utf8";
     insert_schema( $schema );
 
+    insert_data( Table => $table, Engine => $engine );
+
+    insert_schema( "CREATE FULLTEXT INDEX $table ON $table(Content)" );
+
     print_rt_config( Table => $table );
 } elsif ($DB{'type'} eq 'sphinx') {
     check_sphinx();
@@ -332,12 +337,15 @@ SCHEMA
             silent  => !$OPT{'ask'},
         );
     }
-    push @schema, "CREATE INDEX ${column}_idx ON $table USING $index_type($column)";
 
     do_error_is_ok( dba_handle() => $drop )
         unless $OPT{'dryrun'};
     insert_schema( $_ ) for @schema;
 
+    insert_data( Table => $table, Column => $column );
+
+    insert_schema( "CREATE INDEX ${column}_idx ON $table USING $index_type($column)" );
+
     print_rt_config( Table => $table, Column => $column );
 }
 elsif ( $DB{'type'} eq 'Oracle' ) {
@@ -729,6 +737,16 @@ sub insert_schema {
     }
 }
 
+sub insert_data {
+    return if $OPT{dryrun};
+
+    print "Indexing existing data...\n";
+
+    $ENV{RT_FTS_CONFIG} = JSON::to_json( {Enable => 1, Indexed => 1, @_});
+    system( "$RT::SbinPath/rt-fulltext-indexer", "--all",
+            ($DB{'batch-size'} ? ("--limit", $DB{'batch-size'}) : ()));
+}
+
 =head1 NAME
 
 rt-setup-fulltext-index - Create indexes for full text search

commit e103f6dab758784a09a571366197763487e268f0
Author: Alex Vandiver <alexmv at bestpractical.com>
Date:   Wed Nov 26 15:18:41 2014 -0500

    Switch the default Postgres index to GIN
    
    GIN indexes, while slower to index, provide a 10x speedup in query time;
    they are suggested for static data, which the Attachments table
    certainly is.  The other improvements in indexing are sufficient to
    still net a 6.5x improvement in indexing speed over the speed of the
    prior GiST default.  In addition, indexing time, which mostly impacts
    only initial deployment, should take second priority behind query time,
    which GIN improves notably.

diff --git a/sbin/rt-setup-fulltext-index.in b/sbin/rt-setup-fulltext-index.in
index 2c024d7..7ea7f80 100644
--- a/sbin/rt-setup-fulltext-index.in
+++ b/sbin/rt-setup-fulltext-index.in
@@ -331,9 +331,9 @@ SCHEMA
     my $index_type = lc($OPT{'index-type'} || '');
     while ( $index_type ne 'gist' and $index_type ne 'gin' ) {
         $index_type = lc prompt(
-            message => "You may choose between GiST or GIN indexes; the former is several times\n"
-                     . "slower to search, but takes less space on disk and is faster to update.",
-            default => 'GiST',
+            message => "You may choose between GiST or GIN indexes; the GiST takes less space on\n"
+                     . "disk and is faster to update, but is an order of magnitude slower to query.",
+            default => 'GIN',
             silent  => !$OPT{'ask'},
         );
     }

commit bda511a1bf0c9858480f8ab12d5f386345c7dbbc
Author: Alex Vandiver <alexmv at bestpractical.com>
Date:   Wed Nov 26 15:29:06 2014 -0500

    Default to storing the tsvector in a new table, to speed indexing
    
    The ability to perform bulk INSERT, rather than individual UPDATE
    statements, increases indexing speeds by 1.5x.  While this requires an
    additional JOIN at query time, testing shows that this comes at no
    notable cost of execution time -- in fact, queries with the tsvector in
    a separate table perform 5-10% _faster_ than those with it in the
    Attachments table, perhaps because it requires scans of a less
    heavyweight table.

diff --git a/sbin/rt-setup-fulltext-index.in b/sbin/rt-setup-fulltext-index.in
index 7ea7f80..c21a1d0 100644
--- a/sbin/rt-setup-fulltext-index.in
+++ b/sbin/rt-setup-fulltext-index.in
@@ -91,7 +91,7 @@ my %OPT = (
 my %DEFAULT;
 if ( $DB{'type'} eq 'Pg' ) {
     %DEFAULT = (
-        table  => 'Attachments',
+        table  => 'AttachmentsIndex',
         column => 'ContentIndex',
     );
 }
@@ -302,7 +302,7 @@ elsif ( $DB{'type'} eq 'Pg' ) {
     my $table = $OPT{'table'} || prompt(
         message => "Enter the name of a DB table that will be used to store the Pg tsvector.\n"
                  . "You may either use the existing Attachments table, or create a new\n"
-                 . "table.",
+                 . "table.  Creating a new table makes initial indexing faster.",
         default => $DEFAULT{'table'},
         silent  => !$OPT{'ask'},
     );
diff --git a/t/fts/indexed_pg.t b/t/fts/indexed_pg.t
index 88e35ab..1494fde 100644
--- a/t/fts/indexed_pg.t
+++ b/t/fts/indexed_pg.t
@@ -9,7 +9,7 @@ my ($major, $minor) = $RT::Handle->dbh->get_info(18) =~ /^0*(\d+)\.0*(\d+)/;
 plan skip_all => "Need Pg 8.2 or higher; we have $major.$minor"
     if "$major.$minor" < 8.2;
 
-RT->Config->Set( FullTextSearch => Enable => 1, Indexed => 1, Column => 'ContentIndex', Table => 'Attachments' );
+RT->Config->Set( FullTextSearch => Enable => 1, Indexed => 1, Column => 'ContentIndex', Table => 'AttachmentsIndex' );
 
 setup_indexing();
 

commit 7c48294aa34a7985675f5a56fa5a932aabd64f64
Author: Alex Vandiver <alexmv at bestpractical.com>
Date:   Thu Mar 5 00:29:23 2015 -0500

    Note modified defaults for PostgreSQL indexing

diff --git a/docs/UPGRADING-4.2 b/docs/UPGRADING-4.2
index 07a6004..1a9ab5d 100644
--- a/docs/UPGRADING-4.2
+++ b/docs/UPGRADING-4.2
@@ -361,4 +361,11 @@ functionality can now be implemented via C<%DatabaseExtraDSN>.
 C<$DatabaseRequireSSL> has been removed, and setting it will trigger an
 informational message that setting it is ineffective.
 
+The full-text indexing defaults for PostgreSQL have changed; GIN is now
+the suggested index, as well as storing data in a separate
+AttachmentsIndex table.  Both changes improve lookup speed.  For
+improved search performance, you may wish to drop existing C<tsvector>
+and C<GiST> indexes on C<Attachments>, and re-generate the index using
+C<rt-setup-fulltext-index>.
+
 =cut

-----------------------------------------------------------------------


More information about the rt-commit mailing list