[Rt-commit] rt branch, 4.2/mysql-native-fts, created. rt-4.2.6-52-gb490888

Alex Vandiver alexmv at bestpractical.com
Tue Jul 29 13:09:26 EDT 2014


The branch, 4.2/mysql-native-fts has been created
        at  b490888fd9527edb9c5340ae492251e3ba08a410 (commit)

- Log -----------------------------------------------------------------
commit ff018f3e8f9ca7bc3b8c6a8a2540d94d5f6d64e1
Author: Alex Vandiver <alexmv at bestpractical.com>
Date:   Thu Apr 3 19:10:31 2014 -0400

    Add additional clarification points about Sphinx on MySQL

diff --git a/docs/full_text_indexing.pod b/docs/full_text_indexing.pod
index 6b0025d..b2d0914 100644
--- a/docs/full_text_indexing.pod
+++ b/docs/full_text_indexing.pod
@@ -71,10 +71,11 @@ for RT's needs.
 
 =head2 Compiling MySQL and SphinxSE
 
-SphinxSE requires MySQL 5.0 or 5.1; later versions of MySQL have not
-been tested at this time.  Sphinx version 2.0.1 has been tested to work,
-but version 0.9.9 may work as well.  Compilation and installation
-instructions for MySQL with SphinxSE can be found at
+SphinxSE requires MySQL 5.0, 5.1, or 5.5; on the latter two, Sphinx can
+be compiled as a loadable module, for easer of deployment in production.
+Sphinx version 2.0.1 has been tested to work, but version 0.9.9 may work
+as well.  Compilation and installation instructions for MySQL with
+SphinxSE can be found at
 L<http://sphinxsearch.com/docs/current.html#sphinxse-installing>.
 
 =head2 Creating and configuring the index
@@ -115,8 +116,14 @@ from RT's database.  Failure to do so will result in stale data.
 
 =head2 Caveats
 
-Sphinx only returns a finite number of matches to any query; this number
-is controlled by C<max_matches> in F</etc/sphinx.conf> and
+RT's integration with Sphinx relies on the use of a special index; there
+exist queries where the MySQL optimizer elects to I<not> use that index,
+instead electing to scan the table, which causes no results to be
+returned.  However, this is rare, and generally only occurs on complex
+queries.
+
+Sphinx also only returns a finite number of matches to any query; this
+number is controlled by C<max_matches> in F</etc/sphinx.conf> and
 C<%FullTextSearch>'s C<MaxMatches> in C<RT_SiteConfig.pm>, which must be
 kept in sync.  The default, set during C<rt-setup-fulltext-index>, is
 10000.  This limit may lead to false negatives in search results if the

commit 7d52ea317f95586e9c9de1247652bbb2425c152b
Author: Alex Vandiver <alexmv at bestpractical.com>
Date:   Fri Jul 25 13:36:19 2014 -0400

    Drop sphinx xmlpipe2 output, which was unusable and undocumented
    
    The --xmlpipe2 option to rt-fulltext-indexer was intended to be fed into
    the "xmlpipe2" datasource of sphinx.  However, it was never documented.
    More importantly, it would not work as intended because the
    LastIndexedAttachments attribute that it checks is not set anywhere; as
    such, every run would infinitely repeat the same $OPT{limit} elements.
    
    Remove the code.

diff --git a/sbin/rt-fulltext-indexer.in b/sbin/rt-fulltext-indexer.in
index ec8ff47..a1ca68c 100644
--- a/sbin/rt-fulltext-indexer.in
+++ b/sbin/rt-fulltext-indexer.in
@@ -90,15 +90,6 @@ if ( $db_type eq 'Pg' ) {
     );
     push @OPT_LIST, 'limit=i', 'all!';
 }
-elsif ( $db_type eq 'mysql' ) {
-    %OPT = (
-        %OPT,
-        limit    => 0,
-        all      => 0,
-        xmlpipe2 => 0,
-    );
-    push @OPT_LIST, 'limit=i', 'all!', 'xmlpipe2!';
-}
 elsif ( $db_type eq 'Oracle' ) {
     %OPT = (
         %OPT,
@@ -157,8 +148,7 @@ if ( $db_type eq 'Oracle' ) {
     );
     exit;
 } elsif ( $db_type eq 'mysql' ) {
-    unless ($OPT{'xmlpipe2'}) {
-        print STDERR <<EOT;
+    print STDERR <<EOT;
 
 Updates to the external Sphinx index are done via running the sphinx
 `indexer` tool:
@@ -166,8 +156,7 @@ Updates to the external Sphinx index are done via running the sphinx
     indexer rt
 
 EOT
-        exit 1;
-    }
+    exit 1;
 }
 
 my @types = qw(text html);
@@ -278,58 +267,6 @@ sub clean {
     );
 }
 
-{
-sub last_indexed_mysql {
-    my $type = shift;
-    my $attr = $RT::System->FirstAttribute('LastIndexedAttachments');
-    return 0 unless $attr;
-    return 0 unless exists $attr->{ $type };
-    return $attr->{ $type } || 0;
-}
-
-sub process_mysql {
-    my ($type, $attachment, $text) = (@_);
-
-    my $doc = sphinx_template();
-
-    my $element = $doc->createElement('sphinx:document');
-    $element->setAttribute( id => $attachment->id );
-    $element->appendTextChild( content => $$text );
-
-    $doc->documentElement->appendChild( $element );
-}
-
-my $doc = undef;
-sub sphinx_template {
-    return $doc if $doc;
-
-    require XML::LibXML;
-    $doc = XML::LibXML::Document->new('1.0', 'UTF-8');
-    my $root = $doc->createElement('sphinx:docset');
-    $doc->setDocumentElement( $root );
-
-    my $schema = $doc->createElement('sphinx:schema');
-    $root->appendChild( $schema );
-    foreach ( qw(content) ) {
-        my $field = $doc->createElement('sphinx:field');
-        $field->setAttribute( name => $_ );
-        $schema->appendChild( $field );
-    }
-
-    return $doc;
-}
-
-sub finalize_mysql {
-    my ($type, $attachments) = @_;
-    sphinx_template()->toFH(*STDOUT, 1);
-}
-
-sub clean_mysql {
-    $doc = undef;
-}
-
-}
-
 sub last_indexed_pg {
     my $type = shift;
     my $attachments = attachments( $type );

commit 3152f64f401d420ecd434635a2a5f92e731a389d
Author: Alex Vandiver <alexmv at bestpractical.com>
Date:   Fri Jul 25 13:36:45 2014 -0400

    Drop finalize and clean functions, which are now unused

diff --git a/sbin/rt-fulltext-indexer.in b/sbin/rt-fulltext-indexer.in
index a1ca68c..8a9ff7f 100644
--- a/sbin/rt-fulltext-indexer.in
+++ b/sbin/rt-fulltext-indexer.in
@@ -180,8 +180,6 @@ foreach my $type ( @types ) {
         process( $type, $a, $txt );
         debug("Processed attachment #". $a->id );
     }
-    finalize( $type, $attachments ) if $found;
-    clean( $type );
     goto REDO if $OPT{'all'} and $attachments->Count == ($OPT{'limit'} || 100)
 }
 
@@ -253,20 +251,6 @@ sub process {
     );
 }
 
-sub finalize {
-    return goto_specific(
-        suffix    => $db_type,
-        arguments => \@_,
-    );
-}
-
-sub clean {
-    return goto_specific(
-        suffix    => $db_type,
-        arguments => \@_,
-    );
-}
-
 sub last_indexed_pg {
     my $type = shift;
     my $attachments = attachments( $type );

commit fd038d946d6a1867dcab0ef83660d93126e9e3f5
Author: Alex Vandiver <alexmv at bestpractical.com>
Date:   Thu Apr 3 19:13:52 2014 -0400

    Rename Sphinx FTS search

diff --git a/t/fts/indexed_mysql.t b/t/fts/indexed_sphinx.t
similarity index 100%
rename from t/fts/indexed_mysql.t
rename to t/fts/indexed_sphinx.t

commit 23744962471086f5ef2da2d65c424c5e1eaa0f38
Author: Alex Vandiver <alexmv at bestpractical.com>
Date:   Thu Apr 3 19:15:10 2014 -0400

    Support native FTS on MySQL 5.6 and above

diff --git a/docs/full_text_indexing.pod b/docs/full_text_indexing.pod
index b2d0914..b4ab4e4 100644
--- a/docs/full_text_indexing.pod
+++ b/docs/full_text_indexing.pod
@@ -61,15 +61,57 @@ C<cron>:
 
 =head1 MYSQL
 
-MySQL does not support full-text indexing natively.  However, it does
-integrate with the external Sphinx engine, available from
+MySQL does not support full-text indexing natively until version 5.6 and
+above.  For prior versions, RT can integrate with the external Sphinx
+full-text search engine.
+
+=head2 MySQL 5.6 and above
+
+MySQL 5.6 includes full-text search of InnoDB tables.  However, as RT
+marks attachment data as C<BINARY>, it cannot index this content without
+creating additional tables.  To create the required table, run:
+
+    sbin/rt-setup-fulltext-index
+
+If you have a non-standard database administrator username or password,
+you may need to pass the C<--dba> or C<--dba-password> options:
+
+    sbin/rt-setup-fulltext-index --dba root --dba-password secret
+
+This will also output an appropriate C<%FullTextSearch> configuration to
+add to your F<RT_SiteConfig.pm>; you will need to restart your webserver
+after making these changes.  However, the index will also need to be
+filled before it can be used.  To update the index initially, run:
+
+    sbin/rt-fulltext-indexer --all
+
+This will tokenize and index all existing attachments in your database;
+it may take quite a while if your database already has a large number of
+tickets in it.
+
+=head3 Updating the index
+
+To keep the index up-to-date, you will need to run:
+
+    sbin/rt-fulltext-indexer
+
+...at regular intervals.  By default, this will only tokenize up to 100
+tickets at a time; you can adjust this upwards by passing
+C<--limit 500>.  Larger batch sizes will take longer and
+consume more memory.  Care should be taken to ensure that multiple
+instances of C<rt-fulltext-indexer> are not run at the same time.
+
+
+=head2 MySQL with Sphinx
+
+RT can also integrate with the external Sphinx engine, available from
 L<http://sphinxsearch.com>.  Unfortunately, Sphinx integration (using
 SphinxSE) does require that you recompile MySQL from source.  Most
 distribution-provided packages for MySQL do not include SphinxSE
 integration, merely the external Sphinx tools; these are not sufficient
 for RT's needs.
 
-=head2 Compiling MySQL and SphinxSE
+=head3 Compiling MySQL and SphinxSE
 
 SphinxSE requires MySQL 5.0, 5.1, or 5.5; on the latter two, Sphinx can
 be compiled as a loadable module, for easer of deployment in production.
@@ -78,7 +120,7 @@ as well.  Compilation and installation instructions for MySQL with
 SphinxSE can be found at
 L<http://sphinxsearch.com/docs/current.html#sphinxse-installing>.
 
-=head2 Creating and configuring the index
+=head3 Creating and configuring the index
 
 Once MySQL has been recompiled with SphinxSE, and Sphinx itself is
 installed, you may create the required SphinxSE communication table via:
@@ -105,7 +147,7 @@ Finally, start the Sphinx search daemon:
 
     searchd
 
-=head2 Updating the index
+=head3 Updating the index
 
 To keep the index up-to-date, you will need to run:
 
@@ -114,7 +156,7 @@ To keep the index up-to-date, you will need to run:
 ...at regular intervals in order to pick up new and updated attachments
 from RT's database.  Failure to do so will result in stale data.
 
-=head2 Caveats
+=head3 Caveats
 
 RT's integration with Sphinx relies on the use of a special index; there
 exist queries where the MySQL optimizer elects to I<not> use that index,
diff --git a/lib/RT/Config.pm b/lib/RT/Config.pm
index a4ede15..58366ea 100644
--- a/lib/RT/Config.pm
+++ b/lib/RT/Config.pm
@@ -586,11 +586,25 @@ our %META;
                     $RT::Logger->error("No Table set for full-text index; disabling");
                     $v->{Enable} = $v->{Indexed} = 0;
                 } elsif ($v->{'Table'} eq "Attachments") {
-                    $RT::Logger->error("Table for full-text index is set to Attachments, not SphinxSE table; disabling");
+                    $RT::Logger->error("Table for full-text index is set to Attachments, not FTS table; disabling");
                     $v->{Enable} = $v->{Indexed} = 0;
-                } elsif (not $v->{'MaxMatches'}) {
-                    $RT::Logger->warn("No MaxMatches set for full-text index; defaulting to 10000");
-                    $v->{MaxMatches} = 10_000;
+                } else {
+                    my (undef, $create) = eval { $RT::Handle->dbh->selectrow_array("SHOW CREATE TABLE " . $v->{Table}); };
+                    my ($engine) = ($create||'') =~ /engine=(\S+)/i;
+                    if (not $create) {
+                        $RT::Logger->error("External table ".$v->{Table}." does not exist");
+                        $v->{Enable} = $v->{Indexed} = 0;
+                    } elsif (lc $engine eq "sphinx") {
+                        # External Sphinx indexer
+                        $v->{Sphinx} = 1;
+                        unless ($v->{'MaxMatches'}) {
+                            $RT::Logger->warn("No MaxMatches set for full-text index; defaulting to 10000");
+                            $v->{MaxMatches} = 10_000;
+                        }
+                    } else {
+                        # Internal, one-column table
+                        $v->{Column} = 'Content';
+                    }
                 }
             } else {
                 $RT::Logger->error("Indexed full-text-search not supported for $dbtype");
diff --git a/lib/RT/SearchBuilder.pm b/lib/RT/SearchBuilder.pm
index f60265c..9af436a 100644
--- a/lib/RT/SearchBuilder.pm
+++ b/lib/RT/SearchBuilder.pm
@@ -897,7 +897,8 @@ sub Limit {
                                   |(NOT\s*)?MATCHES
                                   |IS(\s*NOT)?
                                   |(NOT\s*)?IN
-                                  |\@\@)$/ix) {
+                                  |\@\@
+                                  |AGAINST)$/ix) {
         $RT::Logger->crit("Possible SQL injection attack: $ARGS{FIELD} $ARGS{OPERATOR}");
         %ARGS = (
             %ARGS,
diff --git a/lib/RT/Tickets.pm b/lib/RT/Tickets.pm
index 8aea5e7..800e520 100644
--- a/lib/RT/Tickets.pm
+++ b/lib/RT/Tickets.pm
@@ -932,6 +932,17 @@ sub _TransContentLimit {
                 QUOTEVALUE  => 0,
             );
         }
+        elsif ( $db_type eq 'mysql' and not $config->{Sphinx}) {
+            my $dbh = $RT::Handle->dbh;
+            $value =~ s/["\\]+/ /g;
+            $self->Limit(
+                %rest,
+                FUNCTION    => "MATCH($alias.Content)",
+                OPERATOR    => 'AGAINST',
+                VALUE       => '("'. $dbh->quote($value) .'" IN BOOLEAN MODE)',
+                QUOTEVALUE  => 0,
+            );
+        }
         elsif ( $db_type eq 'mysql' ) {
             # XXX: We could theoretically skip the join to Attachments,
             # and have Sphinx simply index and group by the TicketId,
diff --git a/sbin/rt-fulltext-indexer.in b/sbin/rt-fulltext-indexer.in
index 8a9ff7f..6329c00 100644
--- a/sbin/rt-fulltext-indexer.in
+++ b/sbin/rt-fulltext-indexer.in
@@ -90,6 +90,14 @@ if ( $db_type eq 'Pg' ) {
     );
     push @OPT_LIST, 'limit=i', 'all!';
 }
+elsif ( $db_type eq 'mysql' ) {
+    %OPT = (
+        %OPT,
+        limit    => 0,
+        all      => 0,
+    );
+    push @OPT_LIST, 'limit=i', 'all!';
+}
 elsif ( $db_type eq 'Oracle' ) {
     %OPT = (
         %OPT,
@@ -147,7 +155,7 @@ if ( $db_type eq 'Oracle' ) {
         $index, $OPT{'memory'}
     );
     exit;
-} elsif ( $db_type eq 'mysql' ) {
+} elsif ( $fts_config->{Sphinx} ) {
     print STDERR <<EOT;
 
 Updates to the external Sphinx index are done via running the sphinx
@@ -251,6 +259,23 @@ sub process {
     );
 }
 
+sub last_indexed_mysql { last_indexed_pg(@_); }
+sub process_mysql {
+    my ($type, $attachment, $text) = (@_);
+
+    my $dbh = $RT::Handle->dbh;
+    my $table = $fts_config->{'Table'};
+
+    my $query;
+    if ( my ($id) = $dbh->selectrow_array("SELECT id FROM $table WHERE id = ?", undef, $attachment->id) ) {
+        $query = "UPDATE $table SET Content = ? WHERE id = ?";
+    } else {
+        $query = "INSERT INTO $table(Content, id) VALUES(?, ?)";
+    }
+
+    $dbh->do( $query, undef, $$text, $attachment->id );
+}
+
 sub last_indexed_pg {
     my $type = shift;
     my $attachments = attachments( $type );
diff --git a/sbin/rt-setup-fulltext-index.in b/sbin/rt-setup-fulltext-index.in
index 99c7fa2..b90bdf6 100644
--- a/sbin/rt-setup-fulltext-index.in
+++ b/sbin/rt-setup-fulltext-index.in
@@ -131,7 +131,48 @@ my $dbh = $RT::Handle->dbh;
 $dbh->{'RaiseError'} = 1;
 $dbh->{'PrintError'} = 1;
 
+# MySQL could either be native of sphinx; find out which
+if ($DB{'type'} eq "mysql") {
+    my $index_type = lc($OPT{'index-type'} || '');
+
+    # Default to sphinx on < 5.6, and error if they provided mysql
+    if ($RT::Handle->dbh->{mysql_serverversion} < 50600) {
+        $index_type ||= 'sphinx';
+        die "Native MySQL indexing is only supported in MySQL 5.6 and above"
+            if $index_type ne 'sphinx';
+    }
+
+    while ( $index_type ne 'sphinx' and $index_type ne 'mysql' ) {
+        $index_type = lc prompt(
+            message => "MySQL 5.6 and above support native full-text indexing; for compatibility\n"
+                      ."with earlier versions of RT, the external Sphinx indexer is still supported.\n"
+                      ."Which indexing solution would you prefer?",
+            default => 'mysql',
+            silent  => !$OPT{'ask'},
+        );
+    };
+    $DB{'type'} = $index_type;
+}
+
 if ( $DB{'type'} eq 'mysql' ) {
+    # MySQL 5.6 has FTS on InnoDB "text" columns -- which the
+    # Attachments table doesn't have, but we can make it have.
+    my $table = $OPT{'table'} || prompt(
+        message => "Enter the name of a new MySQL table that will be used to store the\n"
+                 . "full-text content and indexes:",
+        default => $DEFAULT{'table'},
+        silent  => !$OPT{'ask'},
+    );
+    do_error_is_ok( dba_handle() => "DROP TABLE $table" )
+        unless $OPT{'dryrun'};
+
+    my $schema = "CREATE TABLE $table ( "
+        ."id INT UNSIGNED AUTO_INCREMENT NOT NULL PRIMARY KEY,"
+        ."Content LONGTEXT, FULLTEXT(Content) ) ENGINE=InnoDB CHARACTER SET utf8";
+    insert_schema( $schema );
+
+    print_rt_config( Table => $table );
+} elsif ($DB{'type'} eq 'sphinx') {
     check_sphinx();
     my $table = $OPT{'table'} || prompt(
         message => "Enter name of a new MySQL table that will be used to connect to the\n"
diff --git a/t/fts/indexed_mysql.t b/t/fts/indexed_mysql.t
new file mode 100644
index 0000000..a0145a9
--- /dev/null
+++ b/t/fts/indexed_mysql.t
@@ -0,0 +1,83 @@
+
+use strict;
+use warnings;
+
+use RT::Test tests => undef;
+plan skip_all => 'Not mysql' unless RT->Config->Get('DatabaseType') eq 'mysql';
+plan skip_all => "Need mysql 5.6 or higher"
+    unless $RT::Handle->dbh->{mysql_serverversion} > 50600;
+
+RT->Config->Set( FullTextSearch => Enable => 1, Indexed => 1, Table => 'AttachmentsIndex' );
+
+setup_indexing();
+
+my $q = RT::Test->load_or_create_queue( Name => 'General' );
+ok $q && $q->id, 'loaded or created queue';
+my $queue = $q->Name;
+
+sub setup_indexing {
+    my %args = (
+        'no-ask'       => 1,
+        command        => $RT::SbinPath .'/rt-setup-fulltext-index',
+        dba            => $ENV{'RT_DBA_USER'},
+        'dba-password' => $ENV{'RT_DBA_PASSWORD'},
+    );
+    my ($exit_code, $output) = RT::Test->run_and_capture( %args );
+    ok(!$exit_code, "setted up index") or diag "output: $output";
+}
+
+sub sync_index {
+    my %args = (
+        command => $RT::SbinPath .'/rt-fulltext-indexer',
+    );
+    my ($exit_code, $output) = RT::Test->run_and_capture( %args );
+    ok(!$exit_code, "setted up index") or diag "output: $output";
+}
+
+sub run_tests {
+    my @test = @_;
+    while ( my ($query, $checks) = splice @test, 0, 2 ) {
+        run_test( $query, %$checks );
+    }
+}
+
+my @tickets;
+sub run_test {
+    my ($query, %checks) = @_;
+    my $query_prefix = join ' OR ', map 'id = '. $_->id, @tickets;
+
+    my $tix = RT::Tickets->new(RT->SystemUser);
+    $tix->FromSQL( "( $query_prefix ) AND ( $query )" );
+
+    my $error = 0;
+
+    my $count = 0;
+    $count++ foreach grep $_, values %checks;
+    is($tix->Count, $count, "found correct number of ticket(s) by '$query'") or $error = 1;
+
+    my $good_tickets = ($tix->Count == $count);
+    while ( my $ticket = $tix->Next ) {
+        next if $checks{ $ticket->Subject };
+        diag $ticket->Subject ." ticket has been found when it's not expected";
+        $good_tickets = 0;
+    }
+    ok( $good_tickets, "all tickets are good with '$query'" ) or $error = 1;
+
+    diag "Wrong SQL query for '$query':". $tix->BuildSelectQuery if $error;
+}
+
+@tickets = RT::Test->create_tickets(
+    { Queue => $q->id },
+    { Subject => 'book', Content => 'book' },
+    { Subject => 'bar', Content => 'bar' },
+);
+sync_index();
+
+run_tests(
+    "Content LIKE 'book'" => { book => 1, bar => 0 },
+    "Content LIKE 'bar'" => { book => 0, bar => 1 },
+);
+
+@tickets = ();
+
+done_testing;

commit bfd8dce135b63d747af1c3b5b969aabcb69107e7
Author: Alex Vandiver <alexmv at bestpractical.com>
Date:   Mon Mar 17 21:32:05 2014 -0400

    Using a separate MyISAM table, we can also support FTS on MySQL < 5.6

diff --git a/docs/full_text_indexing.pod b/docs/full_text_indexing.pod
index b4ab4e4..c091cca 100644
--- a/docs/full_text_indexing.pod
+++ b/docs/full_text_indexing.pod
@@ -61,15 +61,15 @@ C<cron>:
 
 =head1 MYSQL
 
-MySQL does not support full-text indexing natively until version 5.6 and
-above.  For prior versions, RT can integrate with the external Sphinx
-full-text search engine.
+On MySQL, full-text search can either be done using native support
+(which may use MyISAM tables on pre-5.6 versions of MySQL), or RT can
+integrate with the external Sphinx full-text search engine.
 
-=head2 MySQL 5.6 and above
+=head2 Native MySQL
 
-MySQL 5.6 includes full-text search of InnoDB tables.  However, as RT
-marks attachment data as C<BINARY>, it cannot index this content without
-creating additional tables.  To create the required table, run:
+As RT marks attachment data as C<BINARY>, MySQL cannot index this
+content without creating an additional table.  To create the required
+table (which is InnoDB on versions of MySQL which support it), run:
 
     sbin/rt-setup-fulltext-index
 
@@ -101,6 +101,14 @@ C<--limit 500>.  Larger batch sizes will take longer and
 consume more memory.  Care should be taken to ensure that multiple
 instances of C<rt-fulltext-indexer> are not run at the same time.
 
+=head3 Caveats
+
+On versions of MySQL prior to 5.6, a MyISAM table is used.  This may
+cause poor performance, as the database server is likely tuned for
+InnoDB performance, not MyISAM performance.  Once the MySQL server is
+upgraded to version 5.6 or above, the extra table should be re-created
+as InnoDB by re-running the steps above.
+
 
 =head2 MySQL with Sphinx
 
diff --git a/sbin/rt-setup-fulltext-index.in b/sbin/rt-setup-fulltext-index.in
index b90bdf6..017024a 100644
--- a/sbin/rt-setup-fulltext-index.in
+++ b/sbin/rt-setup-fulltext-index.in
@@ -136,17 +136,23 @@ if ($DB{'type'} eq "mysql") {
     my $index_type = lc($OPT{'index-type'} || '');
 
     # Default to sphinx on < 5.6, and error if they provided mysql
+    my $msg;
     if ($RT::Handle->dbh->{mysql_serverversion} < 50600) {
-        $index_type ||= 'sphinx';
-        die "Native MySQL indexing is only supported in MySQL 5.6 and above"
-            if $index_type ne 'sphinx';
+        $msg = "Complete support for full-text search requires MySQL 5.6 or higher.  For prior\n"
+              ."versions such as yours, full-text indexing can either be provided using MyISAM\n"
+              ."tables, or the external  Sphinx indexer.  Using MyISAM tables requires that your\n"
+              ."database be tuned to support them, as RT uses InnoDB tables for all other content.\n"
+              ."Using Sphinx will require recompiling MySQL.  Which indexing solution would you\n"
+              ."prefer?"
+    } else {
+        $msg = "MySQL 5.6 and above support native full-text indexing; for compatibility\n"
+              ."with earlier versions of RT, the external Sphinx indexer is still supported.\n"
+              ."Which indexing solution would you prefer?"
     }
 
     while ( $index_type ne 'sphinx' and $index_type ne 'mysql' ) {
         $index_type = lc prompt(
-            message => "MySQL 5.6 and above support native full-text indexing; for compatibility\n"
-                      ."with earlier versions of RT, the external Sphinx indexer is still supported.\n"
-                      ."Which indexing solution would you prefer?",
+            message => $msg,
             default => 'mysql',
             silent  => !$OPT{'ask'},
         );
@@ -166,9 +172,10 @@ if ( $DB{'type'} eq 'mysql' ) {
     do_error_is_ok( dba_handle() => "DROP TABLE $table" )
         unless $OPT{'dryrun'};
 
+    my $engine = $RT::Handle->dbh->{mysql_serverversion} < 50600 ? "MyISAM" : "InnoDB";
     my $schema = "CREATE TABLE $table ( "
         ."id INT UNSIGNED AUTO_INCREMENT NOT NULL PRIMARY KEY,"
-        ."Content LONGTEXT, FULLTEXT(Content) ) ENGINE=InnoDB CHARACTER SET utf8";
+        ."Content LONGTEXT, FULLTEXT(Content) ) ENGINE=$engine CHARACTER SET utf8";
     insert_schema( $schema );
 
     print_rt_config( Table => $table );
diff --git a/t/fts/indexed_mysql.t b/t/fts/indexed_mysql.t
index a0145a9..672b220 100644
--- a/t/fts/indexed_mysql.t
+++ b/t/fts/indexed_mysql.t
@@ -4,8 +4,6 @@ use warnings;
 
 use RT::Test tests => undef;
 plan skip_all => 'Not mysql' unless RT->Config->Get('DatabaseType') eq 'mysql';
-plan skip_all => "Need mysql 5.6 or higher"
-    unless $RT::Handle->dbh->{mysql_serverversion} > 50600;
 
 RT->Config->Set( FullTextSearch => Enable => 1, Indexed => 1, Table => 'AttachmentsIndex' );
 
@@ -68,14 +66,16 @@ sub run_test {
 
 @tickets = RT::Test->create_tickets(
     { Queue => $q->id },
-    { Subject => 'book', Content => 'book' },
-    { Subject => 'bar', Content => 'bar' },
+    { Subject => 'first', Content => 'english' },
+    { Subject => 'second',  Content => 'french' },
+    { Subject => 'third',  Content => 'spanish' },
+    { Subject => 'fourth',  Content => 'german' },
 );
 sync_index();
 
 run_tests(
-    "Content LIKE 'book'" => { book => 1, bar => 0 },
-    "Content LIKE 'bar'" => { book => 0, bar => 1 },
+    "Content LIKE 'english'" => { first => 1, second => 0, third => 0, fourth => 0 },
+    "Content LIKE 'french'" => { first => 0, second => 1, third => 0, fourth => 0 },
 );
 
 @tickets = ();
diff --git a/t/fts/indexed_sphinx.t b/t/fts/indexed_sphinx.t
index 0a4f026..a09b0d2 100644
--- a/t/fts/indexed_sphinx.t
+++ b/t/fts/indexed_sphinx.t
@@ -15,8 +15,6 @@ plan skip_all => "No searchd and indexer under PATH"
 
 plan tests => 15;
 
-RT->Config->Set( FullTextSearch => Enable => 1, Indexed => 1, Table => 'AttachmentsIndex', MaxMatches => 1000 );
-
 setup_indexing();
 
 my $q = RT::Test->load_or_create_queue( Name => 'General' );
@@ -33,6 +31,7 @@ sub setup_indexing {
         dba            => $ENV{'RT_DBA_USER'},
         'dba-password' => $ENV{'RT_DBA_PASSWORD'},
         url            => "sphinx://127.0.0.1:$port/rt",
+        'index-type'   => 'sphinx',
     );
     ok(!$exit_code, "setted up index");
     diag "output: $output" if $ENV{'TEST_VERBOSE'};
@@ -118,6 +117,8 @@ sub run_test {
 );
 sync_index();
 
+RT->Config->Set( FullTextSearch => Enable => 1, Indexed => 1, Table => 'AttachmentsIndex', MaxMatches => 1000, Sphinx => 1 );
+
 run_tests(
     "Content LIKE 'book'" => { book => 1, bar => 0 },
     "Content LIKE 'bar'" => { book => 0, bar => 1 },

commit a4e5362b59a740e4d9d8ecc54ea1447ae23a6a18
Author: Alex Vandiver <alexmv at bestpractical.com>
Date:   Fri Jul 25 00:17:32 2014 -0400

    extract_text and extract_html are identical; inline them
    
    The original premise may have been that HTML->text conversion would be
    done on the HTML before indexing.  While this is still an option for the
    future, there is currently no reason to provide two identical methods.

diff --git a/sbin/rt-fulltext-indexer.in b/sbin/rt-fulltext-indexer.in
index 6329c00..9a7b0af 100644
--- a/sbin/rt-fulltext-indexer.in
+++ b/sbin/rt-fulltext-indexer.in
@@ -183,9 +183,10 @@ foreach my $type ( @types ) {
     while ( my $a = $attachments->Next ) {
         next if filter( $type, $a );
         debug("Found attachment #". $a->id );
-        my $txt = extract($type, $a) or next;
+        my $text = $a->Content;
+        next unless defined $text && length($text);
         $found++;
-        process( $type, $a, $txt );
+        process( $type, $a, \$text );
         debug("Processed attachment #". $a->id );
     }
     goto REDO if $OPT{'all'} and $attachments->Count == ($OPT{'limit'} || 100)
@@ -242,15 +243,6 @@ sub filter {
     );
 }
 
-sub extract {
-    my $type = shift;
-    return goto_specific(
-        suffix    => $type,
-        error     => "No way to convert $type attachment into text",
-        arguments => \@_,
-    );
-}
-
 sub process {
     return goto_specific(
         suffix    => $db_type,
@@ -342,13 +334,6 @@ sub attachments_text {
     return $res;
 }
 
-sub extract_text {
-    my $attachment = shift;
-    my $text = $attachment->Content;
-    return undef unless defined $text && length($text);
-    return \$text;
-}
-
 sub attachments_html {
     my $res = shift;
     $res->Limit( FIELD => 'ContentType', VALUE => 'text/html' );
@@ -365,14 +350,6 @@ sub filter_html {
     return 0;
 }
 
-sub extract_html {
-    my $attachment = shift;
-    my $text = $attachment->Content;
-    return undef unless defined $text && length($text);
-# TODO: html -> text
-    return \$text;
-}
-
 sub goto_specific {
     my %args = (@_);
 

commit 2474f532d6dceb6aaaaf5317342535e1c4253249
Author: Alex Vandiver <alexmv at bestpractical.com>
Date:   Fri Jul 25 00:19:10 2014 -0400

    Inline the differences between text/plain and text/html attachment lists

diff --git a/sbin/rt-fulltext-indexer.in b/sbin/rt-fulltext-indexer.in
index 9a7b0af..7bfa2e3 100644
--- a/sbin/rt-fulltext-indexer.in
+++ b/sbin/rt-fulltext-indexer.in
@@ -218,12 +218,11 @@ sub attachments {
         OPERATOR => '!=',
         VALUE => 'deleted'
     );
-
-    return goto_specific(
-        suffix => $type,
-        error => "Don't know how to find $type attachments",
-        arguments => [$res],
+    $res->Limit(
+        FIELD => 'ContentType',
+        VALUE => ($type eq "html" ? "text/html" : "text/plain"),
     );
+    return $res;
 }
 
 sub last_indexed {
@@ -328,18 +327,6 @@ sub process_pg {
     }
 }
 
-sub attachments_text {
-    my $res = shift;
-    $res->Limit( FIELD => 'ContentType', VALUE => 'text/plain' );
-    return $res;
-}
-
-sub attachments_html {
-    my $res = shift;
-    $res->Limit( FIELD => 'ContentType', VALUE => 'text/html' );
-    return $res;
-}
-
 sub filter_html {
     my $attachment = shift;
     if ( my $parent = $attachment->ParentObj ) {

commit b276cbe133f3845fa7036ebf886f476c79ed28c9
Author: Alex Vandiver <alexmv at bestpractical.com>
Date:   Fri Jul 25 00:19:55 2014 -0400

    Stop skipping indexing of text/html within multipart/alternative
    
    Not all text/html within a multipart/alternative is strictly identical
    to the text part; some mailers send out text/plain parts of alternatives
    that simply say "Your client must be able to render HTML."
    
    The additional space required to index these attachments is worth the
    consistency of having all potential content indexed.  This also is a
    speed increase, as it reduces the number of ad-hoc queries during
    indexing.

diff --git a/sbin/rt-fulltext-indexer.in b/sbin/rt-fulltext-indexer.in
index 7bfa2e3..b7adaec 100644
--- a/sbin/rt-fulltext-indexer.in
+++ b/sbin/rt-fulltext-indexer.in
@@ -181,7 +181,6 @@ foreach my $type ( @types ) {
 
     my $found = 0;
     while ( my $a = $attachments->Next ) {
-        next if filter( $type, $a );
         debug("Found attachment #". $a->id );
         my $text = $a->Content;
         next unless defined $text && length($text);
@@ -234,14 +233,6 @@ sub last_indexed {
     );
 }
 
-sub filter {
-    my $type = shift;
-    return goto_specific(
-        suffix    => $type,
-        arguments => \@_,
-    );
-}
-
 sub process {
     return goto_specific(
         suffix    => $db_type,
@@ -327,16 +318,6 @@ sub process_pg {
     }
 }
 
-sub filter_html {
-    my $attachment = shift;
-    if ( my $parent = $attachment->ParentObj ) {
-# skip html parts that are alternatives
-        return 1 if $parent->id
-            && $parent->ContentType eq 'mulitpart/alternative';
-    }
-    return 0;
-}
-
 sub goto_specific {
     my %args = (@_);
 

commit b3bf43a5664efe3d816b2fdd65843644f153f907
Author: Alex Vandiver <alexmv at bestpractical.com>
Date:   Fri Jul 25 00:21:52 2014 -0400

    Use the new, shorter initialization form

diff --git a/sbin/rt-fulltext-indexer.in b/sbin/rt-fulltext-indexer.in
index b7adaec..1df6514 100644
--- a/sbin/rt-fulltext-indexer.in
+++ b/sbin/rt-fulltext-indexer.in
@@ -67,11 +67,7 @@ BEGIN { # BEGIN RT CMD BOILERPLATE
 
 }
 
-BEGIN {
-    use RT;
-    RT::LoadConfig();
-    RT::Init();
-};
+use RT -init;
 use RT::Interface::CLI ();
 
 my %OPT = (

commit 06e22fe16545ca5dc54f5eb2d32b74fc25784129
Author: Alex Vandiver <alexmv at bestpractical.com>
Date:   Fri Jul 25 00:23:40 2014 -0400

    Simplify and condense option parsing
    
    This causes the indexer to be able to accept options that are
    potentially not applicable (for instance, --memory with a non-Oracle
    backend); however, the comprehensibility benefits are worth this tiny
    cost.

diff --git a/sbin/rt-fulltext-indexer.in b/sbin/rt-fulltext-indexer.in
index 1df6514..b226ab6 100644
--- a/sbin/rt-fulltext-indexer.in
+++ b/sbin/rt-fulltext-indexer.in
@@ -70,40 +70,17 @@ BEGIN { # BEGIN RT CMD BOILERPLATE
 use RT -init;
 use RT::Interface::CLI ();
 
-my %OPT = (
-    help        => 0,
-    debug       => 0,
-    quiet       => 0,
-);
-my @OPT_LIST = qw(help|h! debug! quiet);
+use Getopt::Long qw(GetOptions);
+my %OPT = ( memory => '2M', limit => 0 );
+GetOptions( \%OPT,
+    "help|h!",
+    "debug!",
 
-my $db_type = RT->Config->Get('DatabaseType');
-if ( $db_type eq 'Pg' ) {
-    %OPT = (
-        %OPT,
-        limit  => 0,
-        all    => 0,
-    );
-    push @OPT_LIST, 'limit=i', 'all!';
-}
-elsif ( $db_type eq 'mysql' ) {
-    %OPT = (
-        %OPT,
-        limit    => 0,
-        all      => 0,
-    );
-    push @OPT_LIST, 'limit=i', 'all!';
-}
-elsif ( $db_type eq 'Oracle' ) {
-    %OPT = (
-        %OPT,
-        memory => '2M',
-    );
-    push @OPT_LIST, qw(memory=s);
-}
+    "all!",
+    "limit=i",
 
-use Getopt::Long qw(GetOptions);
-GetOptions( \%OPT, @OPT_LIST );
+    "memory=s",
+);
 
 if ( $OPT{'help'} ) {
     RT::Interface::CLI->ShowHelp(
@@ -123,6 +100,7 @@ if ( !flock main::DATA, LOCK_EX | LOCK_NB ) {
     }
 }
 
+my $db_type = RT->Config->Get('DatabaseType');
 my $fts_config = RT->Config->Get('FullTextSearch') || {};
 unless ( $fts_config->{'Enable'} ) {
     print STDERR <<EOT;

commit 5b8ed2aa5a3e4f8ce2385db49ee8e1291bd566ce
Author: Alex Vandiver <alexmv at bestpractical.com>
Date:   Fri Jul 25 00:24:01 2014 -0400

    Documentation has moved out; update --help accordingly
    
    The documentation (which included =head1 sections for each database
    type) was moved to the more centralized docs/full_text_indexing.pod in
    fa5dffcb; however, --help still looked for the relevant sections.
    Remove the special-case code which is no longer relevant.

diff --git a/sbin/rt-fulltext-indexer.in b/sbin/rt-fulltext-indexer.in
index b226ab6..76e2306 100644
--- a/sbin/rt-fulltext-indexer.in
+++ b/sbin/rt-fulltext-indexer.in
@@ -82,11 +82,7 @@ GetOptions( \%OPT,
     "memory=s",
 );
 
-if ( $OPT{'help'} ) {
-    RT::Interface::CLI->ShowHelp(
-        Sections => 'NAME|DESCRIPTION|'. uc($db_type),
-    );
-}
+RT::Interface::CLI->ShowHelp if $OPT{help};
 
 use Fcntl ':flock';
 if ( !flock main::DATA, LOCK_EX | LOCK_NB ) {

commit b06fef2de47532efb53f0fc8887965048b462cae
Author: Alex Vandiver <alexmv at bestpractical.com>
Date:   Fri Jul 25 01:44:24 2014 -0400

    Remove AUTHOR section; it is unnecessary in core sbin files

diff --git a/sbin/rt-fulltext-indexer.in b/sbin/rt-fulltext-indexer.in
index 76e2306..6b10203 100644
--- a/sbin/rt-fulltext-indexer.in
+++ b/sbin/rt-fulltext-indexer.in
@@ -318,11 +318,6 @@ This is a helper script to keep full text indexes in sync with data.
 Read F<docs/full_text_indexing.pod> for complete details on how and when
 to run it.
 
-=head1 AUTHOR
-
-Ruslan Zakirov E<lt>ruz at bestpractical.comE<gt>,
-Alex Vandiver E<lt>alexmv at bestpractical.comE<gt>
-
 =cut
 
 __DATA__

commit 929539a05e9e7e26cc9706b9c7d3fe2dc2df627b
Author: Alex Vandiver <alexmv at bestpractical.com>
Date:   Fri Jul 25 00:28:27 2014 -0400

    Skipping ACL checks yields a sizable performance increase
    
    The indexer spends quite a bit of time checking that the attachment
    content is visible to the current user before indexing it -- all of
    which is unnecessary, as it is run as the system user.  Explicitly
    disable the ACL checking for the indexer, which removes a large number
    of queries and yields a correspondingly sizable increase in indexing
    speed.

diff --git a/sbin/rt-fulltext-indexer.in b/sbin/rt-fulltext-indexer.in
index 6b10203..80a585c 100644
--- a/sbin/rt-fulltext-indexer.in
+++ b/sbin/rt-fulltext-indexer.in
@@ -137,6 +137,15 @@ EOT
     exit 1;
 }
 
+# Skip ACL checks.  This saves a large number of unnecessary queries
+# (for tickets, ACLs, and users) which are unnecessary, as we are
+# running as the system user.
+{
+    no warnings 'redefine';
+    *RT::Attachment::_Value = \&DBIx::SearchBuilder::Record::_Value;
+    *RT::Attachments::Next  = \&DBIx::SearchBuilder::Next;
+}
+
 my @types = qw(text html);
 foreach my $type ( @types ) {
   REDO:

commit 434556be892aa73f435f10cf29b06e4258027c11
Author: Alex Vandiver <alexmv at bestpractical.com>
Date:   Fri Jul 25 00:36:49 2014 -0400

    Index attachments in one pass through the database, not two
    
    There is no reason to perform two passes through the database instead of
    one; doing one allows for better progress estimates, as well as
    potentially increasing locality for the database.

diff --git a/sbin/rt-fulltext-indexer.in b/sbin/rt-fulltext-indexer.in
index 80a585c..393e758 100644
--- a/sbin/rt-fulltext-indexer.in
+++ b/sbin/rt-fulltext-indexer.in
@@ -146,14 +146,12 @@ EOT
     *RT::Attachments::Next  = \&DBIx::SearchBuilder::Next;
 }
 
-my @types = qw(text html);
-foreach my $type ( @types ) {
-  REDO:
-    my $attachments = attachments($type);
+{
+    my $attachments = attachments();
     $attachments->Limit(
         FIELD => 'id',
         OPERATOR => '>',
-        VALUE => last_indexed($type)
+        VALUE => last_indexed()
     );
     $attachments->OrderBy( FIELD => 'id', ORDER => 'asc' );
     $attachments->RowsPerPage( $OPT{'limit'} || 100 );
@@ -164,14 +162,13 @@ foreach my $type ( @types ) {
         my $text = $a->Content;
         next unless defined $text && length($text);
         $found++;
-        process( $type, $a, \$text );
+        process( $a, \$text );
         debug("Processed attachment #". $a->id );
     }
-    goto REDO if $OPT{'all'} and $attachments->Count == ($OPT{'limit'} || 100)
+    redo if $OPT{'all'} and $attachments->Count == ($OPT{'limit'} || 100);
 }
 
 sub attachments {
-    my $type = shift;
     my $res = RT::Attachments->new( RT->SystemUser );
     my $txn_alias = $res->Join(
         ALIAS1 => 'main',
@@ -197,17 +194,17 @@ sub attachments {
         VALUE => 'deleted'
     );
     $res->Limit(
-        FIELD => 'ContentType',
-        VALUE => ($type eq "html" ? "text/html" : "text/plain"),
+        FIELD    => 'ContentType',
+        OPERATOR => 'IN',
+        VALUE    => ['text/plain', 'text/html'],
     );
     return $res;
 }
 
 sub last_indexed {
-    my ($type) = (@_);
     return goto_specific(
         suffix => $db_type,
-        error => "Don't know how to find last indexed $type attachment for $db_type DB",
+        error => "Don't know how to find last indexed attachment for $db_type DB",
         arguments => \@_,
     );
 }
@@ -222,7 +219,7 @@ sub process {
 
 sub last_indexed_mysql { last_indexed_pg(@_); }
 sub process_mysql {
-    my ($type, $attachment, $text) = (@_);
+    my ($attachment, $text) = (@_);
 
     my $dbh = $RT::Handle->dbh;
     my $table = $fts_config->{'Table'};
@@ -238,8 +235,7 @@ sub process_mysql {
 }
 
 sub last_indexed_pg {
-    my $type = shift;
-    my $attachments = attachments( $type );
+    my $attachments = attachments();
     my $alias = 'main';
     if ( $fts_config->{'Table'} && $fts_config->{'Table'} ne 'Attachments' ) {
         $alias = $attachments->Join(
@@ -263,7 +259,7 @@ sub last_indexed_pg {
 }
 
 sub process_pg {
-    my ($type, $attachment, $text) = (@_);
+    my ($attachment, $text) = (@_);
 
     my $dbh = $RT::Handle->dbh;
     my $table = $fts_config->{'Table'};

commit 1f705126d35fc6aa6c8485388ccf4f184c9d99d2
Author: Alex Vandiver <alexmv at bestpractical.com>
Date:   Fri Jul 25 00:45:35 2014 -0400

    Index attachments even on deleted tickets
    
    Skipping indexing on tickets which are currently deleted may lead to
    content which is later not indexed and not findable.  Even if a ticket
    becomes un-deleted at a later point, it will never again be indexed, as
    last_indexed() will limit to new attachments since the last run.  The
    overhead of joining through Attachments and into Tickets is also not to
    be overlooked -- on MySQL, it causes pessimal performance for large
    systems, involving more than one filesort and temporary table.
    
    Index all attachments, no matter the source, on the premise that
    limiting to non-deleted tickets is better done at query time, not at
    index time.  It is both more timely, and better limited.

diff --git a/sbin/rt-fulltext-indexer.in b/sbin/rt-fulltext-indexer.in
index 393e758..751e534 100644
--- a/sbin/rt-fulltext-indexer.in
+++ b/sbin/rt-fulltext-indexer.in
@@ -147,7 +147,12 @@ EOT
 }
 
 {
-    my $attachments = attachments();
+    my $attachments = RT::Attachments->new( RT->SystemUser );
+    $attachments->Limit(
+        FIELD    => 'ContentType',
+        OPERATOR => 'IN',
+        VALUE    => ['text/plain', 'text/html'],
+    );
     $attachments->Limit(
         FIELD => 'id',
         OPERATOR => '>',
@@ -168,39 +173,6 @@ EOT
     redo if $OPT{'all'} and $attachments->Count == ($OPT{'limit'} || 100);
 }
 
-sub attachments {
-    my $res = RT::Attachments->new( RT->SystemUser );
-    my $txn_alias = $res->Join(
-        ALIAS1 => 'main',
-        FIELD1 => 'TransactionId',
-        TABLE2 => 'Transactions',
-        FIELD2 => 'id',
-    );
-    $res->Limit(
-        ALIAS => $txn_alias,
-        FIELD => 'ObjectType',
-        VALUE => 'RT::Ticket',
-    );
-    my $ticket_alias = $res->Join(
-        ALIAS1 => $txn_alias,
-        FIELD1 => 'ObjectId',
-        TABLE2 => 'Tickets',
-        FIELD2 => 'id',
-    );
-    $res->Limit(
-        ALIAS => $ticket_alias,
-        FIELD => 'Status',
-        OPERATOR => '!=',
-        VALUE => 'deleted'
-    );
-    $res->Limit(
-        FIELD    => 'ContentType',
-        OPERATOR => 'IN',
-        VALUE    => ['text/plain', 'text/html'],
-    );
-    return $res;
-}
-
 sub last_indexed {
     return goto_specific(
         suffix => $db_type,
@@ -235,7 +207,12 @@ sub process_mysql {
 }
 
 sub last_indexed_pg {
-    my $attachments = attachments();
+    my $attachments = RT::Attachments->new( RT->SystemUser );
+    $attachments->Limit(
+        FIELD    => 'ContentType',
+        OPERATOR => 'IN',
+        VALUE    => ['text/plain', 'text/html'],
+    );
     my $alias = 'main';
     if ( $fts_config->{'Table'} && $fts_config->{'Table'} ne 'Attachments' ) {
         $alias = $attachments->Join(

commit f9c8eaea8871ec21e3ee4b8fdb47a32651ffe814
Author: Alex Vandiver <alexmv at bestpractical.com>
Date:   Fri Jul 25 00:58:24 2014 -0400

    mysql and pg share the same last_indexed; unify the method

diff --git a/sbin/rt-fulltext-indexer.in b/sbin/rt-fulltext-indexer.in
index 751e534..b5d6375 100644
--- a/sbin/rt-fulltext-indexer.in
+++ b/sbin/rt-fulltext-indexer.in
@@ -174,39 +174,6 @@ EOT
 }
 
 sub last_indexed {
-    return goto_specific(
-        suffix => $db_type,
-        error => "Don't know how to find last indexed attachment for $db_type DB",
-        arguments => \@_,
-    );
-}
-
-sub process {
-    return goto_specific(
-        suffix    => $db_type,
-        error     => "No processer for $db_type DB",
-        arguments => \@_,
-    );
-}
-
-sub last_indexed_mysql { last_indexed_pg(@_); }
-sub process_mysql {
-    my ($attachment, $text) = (@_);
-
-    my $dbh = $RT::Handle->dbh;
-    my $table = $fts_config->{'Table'};
-
-    my $query;
-    if ( my ($id) = $dbh->selectrow_array("SELECT id FROM $table WHERE id = ?", undef, $attachment->id) ) {
-        $query = "UPDATE $table SET Content = ? WHERE id = ?";
-    } else {
-        $query = "INSERT INTO $table(Content, id) VALUES(?, ?)";
-    }
-
-    $dbh->do( $query, undef, $$text, $attachment->id );
-}
-
-sub last_indexed_pg {
     my $attachments = RT::Attachments->new( RT->SystemUser );
     $attachments->Limit(
         FIELD    => 'ContentType',
@@ -235,6 +202,31 @@ sub last_indexed_pg {
     return $res->id;
 }
 
+sub process {
+    return goto_specific(
+        suffix    => $db_type,
+        error     => "No processer for $db_type DB",
+        arguments => \@_,
+    );
+}
+
+
+sub process_mysql {
+    my ($attachment, $text) = (@_);
+
+    my $dbh = $RT::Handle->dbh;
+    my $table = $fts_config->{'Table'};
+
+    my $query;
+    if ( my ($id) = $dbh->selectrow_array("SELECT id FROM $table WHERE id = ?", undef, $attachment->id) ) {
+        $query = "UPDATE $table SET Content = ? WHERE id = ?";
+    } else {
+        $query = "INSERT INTO $table(Content, id) VALUES(?, ?)";
+    }
+
+    $dbh->do( $query, undef, $$text, $attachment->id );
+}
+
 sub process_pg {
     my ($attachment, $text) = (@_);
 

commit b01e7375825448c254ad926755db2999fb62f99f
Author: Alex Vandiver <alexmv at bestpractical.com>
Date:   Fri Jul 25 01:05:22 2014 -0400

    Replace the last use of goto_specific with explicit function calls

diff --git a/sbin/rt-fulltext-indexer.in b/sbin/rt-fulltext-indexer.in
index b5d6375..7ee85ee 100644
--- a/sbin/rt-fulltext-indexer.in
+++ b/sbin/rt-fulltext-indexer.in
@@ -167,7 +167,11 @@ EOT
         my $text = $a->Content;
         next unless defined $text && length($text);
         $found++;
-        process( $a, \$text );
+        if ($db_type eq 'mysql') {
+            process_mysql( $a, \$text );
+        } elsif ($db_type eq 'pg') {
+            process_pg( $a, \$text );
+        }
         debug("Processed attachment #". $a->id );
     }
     redo if $OPT{'all'} and $attachments->Count == ($OPT{'limit'} || 100);
@@ -202,14 +206,6 @@ sub last_indexed {
     return $res->id;
 }
 
-sub process {
-    return goto_specific(
-        suffix    => $db_type,
-        error     => "No processer for $db_type DB",
-        arguments => \@_,
-    );
-}
-
 
 sub process_mysql {
     my ($attachment, $text) = (@_);
@@ -262,20 +258,6 @@ sub process_pg {
     }
 }
 
-sub goto_specific {
-    my %args = (@_);
-
-    my $func = (caller(1))[3];
-    $func =~ s/.*:://;
-    my $call = $func ."_". lc $args{'suffix'};
-    unless ( defined &$call ) {
-        return undef unless $args{'error'};
-        require Carp; Carp::croak( $args{'error'} );
-    }
-    @_ = @{ $args{'arguments'} };
-    goto &$call;
-}
-
 
 # helper functions
 sub debug    { print @_, "\n" if $OPT{debug}; 1 }

commit c876f7c7b956af0735a2f1ce54e0c8dc0650a8d1
Author: Alex Vandiver <alexmv at bestpractical.com>
Date:   Fri Jul 25 15:31:36 2014 -0400

    Simplify last_indexed

diff --git a/sbin/rt-fulltext-indexer.in b/sbin/rt-fulltext-indexer.in
index 7ee85ee..75a78db 100644
--- a/sbin/rt-fulltext-indexer.in
+++ b/sbin/rt-fulltext-indexer.in
@@ -178,35 +178,17 @@ EOT
 }
 
 sub last_indexed {
-    my $attachments = RT::Attachments->new( RT->SystemUser );
-    $attachments->Limit(
-        FIELD    => 'ContentType',
-        OPERATOR => 'IN',
-        VALUE    => ['text/plain', 'text/html'],
-    );
-    my $alias = 'main';
-    if ( $fts_config->{'Table'} && $fts_config->{'Table'} ne 'Attachments' ) {
-        $alias = $attachments->Join(
-            TYPE    => 'left',
-            FIELD1 => 'id',
-            TABLE2  => $fts_config->{'Table'},
-            FIELD2 => 'id',
-        );
+    if ( $db_type eq "mysql" ) {
+        return $dbh->selectrow_arrayref("SELECT MAX(id) FROM $table")->[0];
+    } elsif ( $db_type eq "pg" ) {
+        if ( $fts_config->{'Table'} && $fts_config->{'Table'} ne 'Attachments' ) {
+            return $dbh->selectrow_array("SELECT MAX(id) FROM $table")->[0];
+        } else {
+            return $dbh->selectrow_array("SELECT MAX(id) FROM Attachments WHERE $column IS NOT NULL")->[0];
+        }
     }
-    $attachments->Limit(
-        ALIAS => $alias,
-        FIELD => $fts_config->{'Column'},
-        OPERATOR => 'IS NOT',
-        VALUE => 'NULL',
-    );
-    $attachments->OrderBy( FIELD => 'id', ORDER => 'desc' );
-    $attachments->RowsPerPage( 1 );
-    my $res = $attachments->First;
-    return 0 unless $res;
-    return $res->id;
 }
 
-
 sub process_mysql {
     my ($attachment, $text) = (@_);
 

commit 5c56ed6871e9ca010e9cb6723ef312f120d91115
Author: Alex Vandiver <alexmv at bestpractical.com>
Date:   Fri Jul 25 15:33:58 2014 -0400

    Only call last_indexed once, as it may be heavy

diff --git a/sbin/rt-fulltext-indexer.in b/sbin/rt-fulltext-indexer.in
index 75a78db..1bf4ca5 100644
--- a/sbin/rt-fulltext-indexer.in
+++ b/sbin/rt-fulltext-indexer.in
@@ -146,6 +146,7 @@ EOT
     *RT::Attachments::Next  = \&DBIx::SearchBuilder::Next;
 }
 
+my $LAST = last_indexed();
 {
     my $attachments = RT::Attachments->new( RT->SystemUser );
     $attachments->Limit(
@@ -156,7 +157,7 @@ EOT
     $attachments->Limit(
         FIELD => 'id',
         OPERATOR => '>',
-        VALUE => last_indexed()
+        VALUE => $LAST,
     );
     $attachments->OrderBy( FIELD => 'id', ORDER => 'asc' );
     $attachments->RowsPerPage( $OPT{'limit'} || 100 );
@@ -174,6 +175,7 @@ EOT
         }
         debug("Processed attachment #". $a->id );
     }
+    $LAST = $attachments->Last->id if $attachments->Count;
     redo if $OPT{'all'} and $attachments->Count == ($OPT{'limit'} || 100);
 }
 

commit 6768ac5700698b2a15801867acd42beab3b6d3cf
Author: Alex Vandiver <alexmv at bestpractical.com>
Date:   Fri Jul 25 15:36:21 2014 -0400

    Index even empty attachments

diff --git a/sbin/rt-fulltext-indexer.in b/sbin/rt-fulltext-indexer.in
index 1bf4ca5..6efd5b7 100644
--- a/sbin/rt-fulltext-indexer.in
+++ b/sbin/rt-fulltext-indexer.in
@@ -49,6 +49,7 @@
 use strict;
 use warnings;
 no warnings 'once';
+use 5.010;
 
 # fix lib paths, some may be relative
 BEGIN { # BEGIN RT CMD BOILERPLATE
@@ -165,8 +166,7 @@ my $LAST = last_indexed();
     my $found = 0;
     while ( my $a = $attachments->Next ) {
         debug("Found attachment #". $a->id );
-        my $text = $a->Content;
-        next unless defined $text && length($text);
+        my $text = $a->Content // "";
         $found++;
         if ($db_type eq 'mysql') {
             process_mysql( $a, \$text );

commit acb17824a3d7cd7a763a7287c117c0d5df55e24c
Author: Alex Vandiver <alexmv at bestpractical.com>
Date:   Fri Jul 25 15:42:19 2014 -0400

    As last_indexed is based on the highest insert, there will never be an UPDATE needed

diff --git a/sbin/rt-fulltext-indexer.in b/sbin/rt-fulltext-indexer.in
index 6efd5b7..b29d679 100644
--- a/sbin/rt-fulltext-indexer.in
+++ b/sbin/rt-fulltext-indexer.in
@@ -197,12 +197,7 @@ sub process_mysql {
     my $dbh = $RT::Handle->dbh;
     my $table = $fts_config->{'Table'};
 
-    my $query;
-    if ( my ($id) = $dbh->selectrow_array("SELECT id FROM $table WHERE id = ?", undef, $attachment->id) ) {
-        $query = "UPDATE $table SET Content = ? WHERE id = ?";
-    } else {
-        $query = "INSERT INTO $table(Content, id) VALUES(?, ?)";
-    }
+    my $query = "INSERT INTO $table(Content, id) VALUES(?, ?)";
 
     $dbh->do( $query, undef, $$text, $attachment->id );
 }
@@ -216,11 +211,7 @@ sub process_pg {
 
     my $query;
     if ( $table ) {
-        if ( my ($id) = $dbh->selectrow_array("SELECT id FROM $table WHERE id = ?", undef, $attachment->id) ) {
-            $query = "UPDATE $table SET $column = to_tsvector(?) WHERE id = ?";
-        } else {
-            $query = "INSERT INTO $table($column, id) VALUES(to_tsvector(?), ?)";
-        }
+        $query = "INSERT INTO $table($column, id) VALUES(to_tsvector(?), ?)";
     } else {
         $query = "UPDATE Attachments SET $column = to_tsvector(?) WHERE id = ?";
     }

commit d2ddcbeeeae9bf459c601c87c451fd510d97beb8
Author: Alex Vandiver <alexmv at bestpractical.com>
Date:   Fri Jul 25 15:45:15 2014 -0400

    Inversion of control

diff --git a/sbin/rt-fulltext-indexer.in b/sbin/rt-fulltext-indexer.in
index b29d679..22079d8 100644
--- a/sbin/rt-fulltext-indexer.in
+++ b/sbin/rt-fulltext-indexer.in
@@ -48,7 +48,6 @@
 # END BPS TAGGED BLOCK }}}
 use strict;
 use warnings;
-no warnings 'once';
 use 5.010;
 
 # fix lib paths, some may be relative
@@ -147,68 +146,68 @@ EOT
     *RT::Attachments::Next  = \&DBIx::SearchBuilder::Next;
 }
 
-my $LAST = last_indexed();
-{
-    my $attachments = RT::Attachments->new( RT->SystemUser );
-    $attachments->Limit(
-        FIELD    => 'ContentType',
-        OPERATOR => 'IN',
-        VALUE    => ['text/plain', 'text/html'],
-    );
-    $attachments->Limit(
-        FIELD => 'id',
-        OPERATOR => '>',
-        VALUE => $LAST,
-    );
-    $attachments->OrderBy( FIELD => 'id', ORDER => 'asc' );
-    $attachments->RowsPerPage( $OPT{'limit'} || 100 );
-
-    my $found = 0;
-    while ( my $a = $attachments->Next ) {
-        debug("Found attachment #". $a->id );
-        my $text = $a->Content // "";
-        $found++;
-        if ($db_type eq 'mysql') {
-            process_mysql( $a, \$text );
-        } elsif ($db_type eq 'pg') {
-            process_pg( $a, \$text );
-        }
-        debug("Processed attachment #". $a->id );
-    }
-    $LAST = $attachments->Last->id if $attachments->Count;
-    redo if $OPT{'all'} and $attachments->Count == ($OPT{'limit'} || 100);
+my $LAST;
+if ($db_type eq 'mysql') {
+    process_mysql();
+} elsif ($db_type eq 'pg') {
+    process_pg();
 }
 
-sub last_indexed {
-    if ( $db_type eq "mysql" ) {
-        return $dbh->selectrow_arrayref("SELECT MAX(id) FROM $table")->[0];
-    } elsif ( $db_type eq "pg" ) {
-        if ( $fts_config->{'Table'} && $fts_config->{'Table'} ne 'Attachments' ) {
-            return $dbh->selectrow_array("SELECT MAX(id) FROM $table")->[0];
-        } else {
-            return $dbh->selectrow_array("SELECT MAX(id) FROM Attachments WHERE $column IS NOT NULL")->[0];
-        }
+sub attachment_loop {
+    my $subref = shift;
+    my $table = $fts_config->{'Table'};
+    $LAST //= 0;
+
+    # Fetch in batches of size --limit
+    {
+        # Indexes all text/plain and text/html attachments
+        my $attachments = RT::Attachments->new( RT->SystemUser );
+        $attachments->Limit(
+            FIELD    => 'ContentType',
+            OPERATOR => 'IN',
+            VALUE    => ['text/plain', 'text/html'],
+        );
+        $attachments->Limit( FIELD => 'id', OPERATOR => '>', VALUE => $LAST );
+        $attachments->OrderBy( FIELD => 'id', ORDER => 'asc' );
+        $attachments->RowsPerPage( $OPT{'limit'} || 100 );
+
+        # Call back to the DB-specific part
+        $subref->($attachments);
+
+        $LAST = $attachments->Last->id if $attachments->Count;
+
+        redo if $OPT{'all'} and $attachments->Count == ($OPT{'limit'} || 100);
     }
 }
 
 sub process_mysql {
-    my ($attachment, $text) = (@_);
-
     my $dbh = $RT::Handle->dbh;
     my $table = $fts_config->{'Table'};
 
+    ($LAST) = $dbh->selectrow_array("SELECT MAX(id) FROM $table");
+
     my $query = "INSERT INTO $table(Content, id) VALUES(?, ?)";
 
-    $dbh->do( $query, undef, $$text, $attachment->id );
+    attachment_loop( sub {
+        my ($attachments) = @_;
+        while ( my $a = $attachments->Next ) {
+            debug("Found attachment #". $a->id );
+            $dbh->do( $query, undef, ($a->Content // ""), $a->id );
+        }
+    });
 }
 
 sub process_pg {
-    my ($attachment, $text) = (@_);
-
     my $dbh = $RT::Handle->dbh;
     my $table = $fts_config->{'Table'};
     my $column = $fts_config->{'Column'};
 
+    if ($table) {
+        ($LAST) = $dbh->selectrow_array("SELECT MAX(id) FROM $table");
+    } else {
+        ($LAST) = $dbh->selectrow_array("SELECT MAX(id) FROM Attachments WHERE $column IS NOT NULL");
+    }
+
     my $query;
     if ( $table ) {
         $query = "INSERT INTO $table($column, id) VALUES(to_tsvector(?), ?)";
@@ -216,21 +215,27 @@ sub process_pg {
         $query = "UPDATE Attachments SET $column = to_tsvector(?) WHERE id = ?";
     }
 
-    my $status = eval { $dbh->do( $query, undef, $$text, $attachment->id ) };
-    unless ( $status ) {
-        if ( $dbh->err == 7  && $dbh->state eq '54000' ) {
-            warn "Attachment @{[$attachment->id]} cannot be indexed. Most probably it contains too many unique words. Error: ". $dbh->errstr;
-        } elsif ( $dbh->err == 7 && $dbh->state eq '22021' ) {
-            warn "Attachment @{[$attachment->id]} cannot be indexed. Most probably it contains invalid UTF8 bytes. Error: ". $dbh->errstr;
-        } else {
-            die "error: ". $dbh->errstr;
+    attachment_loop( sub {
+        my ($attachments) = @_;
+        while ( my $a = $attachments->Next ) {
+            debug("Found attachment #". $a->id );
+            my $status = eval { $dbh->do( $query, undef, ($a->Content // ""), $a->id ) };
+            unless ( $status ) {
+                if ( $dbh->err == 7  && $dbh->state eq '54000' ) {
+                    warn "Attachment @{[$a->id]} cannot be indexed. Most probably it contains too many unique words. Error: ". $dbh->errstr;
+                } elsif ( $dbh->err == 7 && $dbh->state eq '22021' ) {
+                    warn "Attachment @{[$a->id]} cannot be indexed. Most probably it contains invalid UTF8 bytes. Error: ". $dbh->errstr;
+                } else {
+                    die "error: ". $dbh->errstr;
+                }
+
+                # Insert an empty tsvector, so we count this row as "indexed"
+                # for purposes of knowing where to pick up
+                eval { $dbh->do( $query, undef, "", $a->id ) }
+                    or die "Failed to insert empty row: " . $dbh->errstr;
+            }
         }
-
-        # Insert an empty tsvector, so we count this row as "indexed"
-        # for purposes of knowing where to pick up
-        eval { $dbh->do( $query, undef, "", $attachment->id ) }
-            or die "Failed to insert empty tsvector: " . $dbh->errstr;
-    }
+    });
 }
 
 

commit 7a297995e28030c6636bb953bd3418711195c053
Author: Alex Vandiver <alexmv at bestpractical.com>
Date:   Fri Jul 25 01:37:10 2014 -0400

    Switch to preparing statements, rather than just setting strings
    
    Prepared statements provide a small speed benefit, removing the need for
    the database to re-parse the query string.

diff --git a/sbin/rt-fulltext-indexer.in b/sbin/rt-fulltext-indexer.in
index 22079d8..4176969 100644
--- a/sbin/rt-fulltext-indexer.in
+++ b/sbin/rt-fulltext-indexer.in
@@ -186,13 +186,13 @@ sub process_mysql {
 
     ($LAST) = $dbh->selectrow_array("SELECT MAX(id) FROM $table");
 
-    my $query = "INSERT INTO $table(Content, id) VALUES(?, ?)";
+    my $sth = $dbh->prepare("INSERT INTO $table(Content, id) VALUES(?, ?)");
 
     attachment_loop( sub {
         my ($attachments) = @_;
         while ( my $a = $attachments->Next ) {
             debug("Found attachment #". $a->id );
-            $dbh->do( $query, undef, ($a->Content // ""), $a->id );
+            $sth->execute( ($a->Content // ""), $a->id );
         }
     });
 }
@@ -208,18 +208,18 @@ sub process_pg {
         ($LAST) = $dbh->selectrow_array("SELECT MAX(id) FROM Attachments WHERE $column IS NOT NULL");
     }
 
-    my $query;
+    my $sth;
     if ( $table ) {
-        $query = "INSERT INTO $table($column, id) VALUES(to_tsvector(?), ?)";
+        $sth = $dbh->prepare("INSERT INTO $table($column, id) VALUES(to_tsvector(?), ?)");
     } else {
-        $query = "UPDATE Attachments SET $column = to_tsvector(?) WHERE id = ?";
+        $sth = $dbh->prepare("UPDATE Attachments SET $column = to_tsvector(?) WHERE id = ?");
     }
 
     attachment_loop( sub {
         my ($attachments) = @_;
         while ( my $a = $attachments->Next ) {
             debug("Found attachment #". $a->id );
-            my $status = eval { $dbh->do( $query, undef, ($a->Content // ""), $a->id ) };
+            my $status = eval { $sth->execute( ($a->Content // ""), $a->id ) };
             unless ( $status ) {
                 if ( $dbh->err == 7  && $dbh->state eq '54000' ) {
                     warn "Attachment @{[$a->id]} cannot be indexed. Most probably it contains too many unique words. Error: ". $dbh->errstr;
@@ -231,7 +231,7 @@ sub process_pg {
 
                 # Insert an empty tsvector, so we count this row as "indexed"
                 # for purposes of knowing where to pick up
-                eval { $dbh->do( $query, undef, "", $a->id ) }
+                eval { $sth->execute( "", $a->id ) }
                     or die "Failed to insert empty row: " . $dbh->errstr;
             }
         }

commit 7d7841318b41f5564fd8608bfa8a69fa8e0ce87c
Author: Alex Vandiver <alexmv at bestpractical.com>
Date:   Fri Jul 25 01:39:22 2014 -0400

    INSERT DELAYED provides notable speed benefits on MyISAM
    
    MySQL supports INSERT DELAYED[1] for MyISAM (but not InnoDB) tables.  This
    allows the server to defer inserts until it has a good opportunity to
    write them, and to write them in bulk.  While there is a small risk of
    data loss (if the server is terminated before the data is written) this
    poses no problem for an AttachmentsIndex table, for which all inserted
    data is trivial to re-generate.
    
    [1] http://dev.mysql.com/doc/refman/5.1/en/insert-delayed.html

diff --git a/lib/RT/Config.pm b/lib/RT/Config.pm
index 58366ea..a310255 100644
--- a/lib/RT/Config.pm
+++ b/lib/RT/Config.pm
@@ -604,6 +604,7 @@ our %META;
                     } else {
                         # Internal, one-column table
                         $v->{Column} = 'Content';
+                        $v->{Engine} = $engine;
                     }
                 }
             } else {
diff --git a/sbin/rt-fulltext-indexer.in b/sbin/rt-fulltext-indexer.in
index 4176969..7e2e17c 100644
--- a/sbin/rt-fulltext-indexer.in
+++ b/sbin/rt-fulltext-indexer.in
@@ -186,7 +186,8 @@ sub process_mysql {
 
     ($LAST) = $dbh->selectrow_array("SELECT MAX(id) FROM $table");
 
-    my $sth = $dbh->prepare("INSERT INTO $table(Content, id) VALUES(?, ?)");
+    my $insert = $fts_config->{Engine} eq "MyISAM" ? "INSERT DELAYED" : "INSERT";
+    my $sth = $dbh->prepare("$insert INTO $table(Content, id) VALUES(?, ?)");
 
     attachment_loop( sub {
         my ($attachments) = @_;

commit b490888fd9527edb9c5340ae492251e3ba08a410
Author: Alex Vandiver <alexmv at bestpractical.com>
Date:   Fri Jul 25 01:40:16 2014 -0400

    Improve MySQL insert speed by batching inserts into one statement
    
    MySQL must flush buffers after every insert statement; as such,
    providing large numbers of INSERT statements is quite inefficient, as
    most time is spent in disk I/O.  Instead, store rows to be inserted, and
    insert them batch-by-batch.
    
    This technique is not applicable to PostgreSQL because failure of the
    to_tsvector call to convert a string would abort the entire insert.
    Additionally, most installs use an additional column on the existing
    table, which requires an UPDATE, and not an INSERT, which is not easily
    batched.

diff --git a/sbin/rt-fulltext-indexer.in b/sbin/rt-fulltext-indexer.in
index 7e2e17c..80bac6f 100644
--- a/sbin/rt-fulltext-indexer.in
+++ b/sbin/rt-fulltext-indexer.in
@@ -186,15 +186,28 @@ sub process_mysql {
 
     ($LAST) = $dbh->selectrow_array("SELECT MAX(id) FROM $table");
 
+    # Doing large inserts is faster than individual statements, but
+    # comes at a parsing cost; cache the statement handles (99% of which
+    # will be the same size) for a notable (2x) speed gain.
+    my %sthandles;
     my $insert = $fts_config->{Engine} eq "MyISAM" ? "INSERT DELAYED" : "INSERT";
-    my $sth = $dbh->prepare("$insert INTO $table(Content, id) VALUES(?, ?)");
 
     attachment_loop( sub {
         my ($attachments) = @_;
+        my @insert;
+        my $found = 0;
         while ( my $a = $attachments->Next ) {
             debug("Found attachment #". $a->id );
-            $sth->execute( ($a->Content // ""), $a->id );
+            push @insert, ($a->Content // ""), $a->id;
+            $found++;
         }
+        return unless $found;
+
+        # $found should be the limit size on all but the last go-around.
+        $sthandles{$found} ||=
+            $dbh->prepare("$insert INTO $table(Content, id) VALUES "
+                              . join(", ", ("(?,?)") x $found));
+        $sthandles{$found}->execute(@insert);
     });
 }
 

-----------------------------------------------------------------------


More information about the rt-commit mailing list