[Rt-commit] rt branch, 4.4/fts-refactor-performance, created. rt-4.2.9-97-ged1131b
Alex Vandiver
alexmv at bestpractical.com
Thu Jan 8 16:32:57 EST 2015
The branch, 4.4/fts-refactor-performance has been created
at ed1131b1ecc5fdbefa8dd17483efd89019de150b (commit)
- Log -----------------------------------------------------------------
commit aa6f0a6b973dc679e157850f103a126d9c84842f
Author: Alex Vandiver <alexmv at bestpractical.com>
Date: Fri Jul 25 00:17:32 2014 -0400
extract_text and extract_html are identical; inline them
The original premise may have been that HTML->text conversion would be
done on the HTML before indexing. While this is still an option for the
future, there is currently no reason to provide two identical methods.
diff --git a/sbin/rt-fulltext-indexer.in b/sbin/rt-fulltext-indexer.in
index 189c9ae..d4a41eb 100644
--- a/sbin/rt-fulltext-indexer.in
+++ b/sbin/rt-fulltext-indexer.in
@@ -183,9 +183,10 @@ foreach my $type ( @types ) {
while ( my $a = $attachments->Next ) {
next if filter( $type, $a );
debug("Found attachment #". $a->id );
- my $txt = extract($type, $a) or next;
+ my $text = $a->Content;
+ next unless defined $text && length($text);
$found++;
- process( $type, $a, $txt );
+ process( $type, $a, \$text );
debug("Processed attachment #". $a->id );
}
goto REDO if $OPT{'all'} and $attachments->Count == ($OPT{'limit'} || 100)
@@ -242,15 +243,6 @@ sub filter {
);
}
-sub extract {
- my $type = shift;
- return goto_specific(
- suffix => $type,
- error => "No way to convert $type attachment into text",
- arguments => \@_,
- );
-}
-
sub process {
return goto_specific(
suffix => $db_type,
@@ -342,13 +334,6 @@ sub attachments_text {
return $res;
}
-sub extract_text {
- my $attachment = shift;
- my $text = $attachment->Content;
- return undef unless defined $text && length($text);
- return \$text;
-}
-
sub attachments_html {
my $res = shift;
$res->Limit( FIELD => 'ContentType', VALUE => 'text/html' );
@@ -365,14 +350,6 @@ sub filter_html {
return 0;
}
-sub extract_html {
- my $attachment = shift;
- my $text = $attachment->Content;
- return undef unless defined $text && length($text);
-# TODO: html -> text
- return \$text;
-}
-
sub goto_specific {
my %args = (@_);
commit 8e13857d76d42ebe16a61f5ac02daa28536ac977
Author: Alex Vandiver <alexmv at bestpractical.com>
Date: Fri Jul 25 00:19:10 2014 -0400
Inline the differences between text/plain and text/html attachment lists
diff --git a/sbin/rt-fulltext-indexer.in b/sbin/rt-fulltext-indexer.in
index d4a41eb..7eaa71d 100644
--- a/sbin/rt-fulltext-indexer.in
+++ b/sbin/rt-fulltext-indexer.in
@@ -218,12 +218,11 @@ sub attachments {
OPERATOR => '!=',
VALUE => 'deleted'
);
-
- return goto_specific(
- suffix => $type,
- error => "Don't know how to find $type attachments",
- arguments => [$res],
+ $res->Limit(
+ FIELD => 'ContentType',
+ VALUE => ($type eq "html" ? "text/html" : "text/plain"),
);
+ return $res;
}
sub last_indexed {
@@ -328,18 +327,6 @@ sub process_pg {
}
}
-sub attachments_text {
- my $res = shift;
- $res->Limit( FIELD => 'ContentType', VALUE => 'text/plain' );
- return $res;
-}
-
-sub attachments_html {
- my $res = shift;
- $res->Limit( FIELD => 'ContentType', VALUE => 'text/html' );
- return $res;
-}
-
sub filter_html {
my $attachment = shift;
if ( my $parent = $attachment->ParentObj ) {
commit 481fcd57c3c7e1bc09b25b8646d8d82144ac012f
Author: Alex Vandiver <alexmv at bestpractical.com>
Date: Fri Jul 25 00:19:55 2014 -0400
Stop skipping indexing of text/html within multipart/alternative
Not all text/html within a multipart/alternative is strictly identical
to the text part; some mailers send out text/plain parts of alternatives
that simply say "Your client must be able to render HTML."
The additional space required to index these attachments is worth the
consistency of having all potential content indexed. This also is a
speed increase, as it reduces the number of ad-hoc queries during
indexing.
diff --git a/sbin/rt-fulltext-indexer.in b/sbin/rt-fulltext-indexer.in
index 7eaa71d..62852fb 100644
--- a/sbin/rt-fulltext-indexer.in
+++ b/sbin/rt-fulltext-indexer.in
@@ -181,7 +181,6 @@ foreach my $type ( @types ) {
my $found = 0;
while ( my $a = $attachments->Next ) {
- next if filter( $type, $a );
debug("Found attachment #". $a->id );
my $text = $a->Content;
next unless defined $text && length($text);
@@ -234,14 +233,6 @@ sub last_indexed {
);
}
-sub filter {
- my $type = shift;
- return goto_specific(
- suffix => $type,
- arguments => \@_,
- );
-}
-
sub process {
return goto_specific(
suffix => $db_type,
@@ -327,16 +318,6 @@ sub process_pg {
}
}
-sub filter_html {
- my $attachment = shift;
- if ( my $parent = $attachment->ParentObj ) {
-# skip html parts that are alternatives
- return 1 if $parent->id
- && $parent->ContentType eq 'mulitpart/alternative';
- }
- return 0;
-}
-
sub goto_specific {
my %args = (@_);
commit 712300ceb854b76897b64b3861a1274167172387
Author: Alex Vandiver <alexmv at bestpractical.com>
Date: Fri Jul 25 00:21:52 2014 -0400
Use the new, shorter, initialization form
diff --git a/sbin/rt-fulltext-indexer.in b/sbin/rt-fulltext-indexer.in
index 62852fb..c2b203b 100644
--- a/sbin/rt-fulltext-indexer.in
+++ b/sbin/rt-fulltext-indexer.in
@@ -67,11 +67,7 @@ BEGIN { # BEGIN RT CMD BOILERPLATE
}
-BEGIN {
- use RT;
- RT::LoadConfig();
- RT::Init();
-};
+use RT -init;
use RT::Interface::CLI ();
my %OPT = (
commit 2b09f876a4dd89d070861d9b410b89fd7ea72974
Author: Alex Vandiver <alexmv at bestpractical.com>
Date: Fri Jul 25 00:23:40 2014 -0400
Simplify and condense option parsing
This causes the indexer to be able to accept options that are
potentially not applicable (for instance, --memory with a non-Oracle
backend); however, the comprehensibility benefits are worth this tiny
cost.
diff --git a/sbin/rt-fulltext-indexer.in b/sbin/rt-fulltext-indexer.in
index c2b203b..960d4a4 100644
--- a/sbin/rt-fulltext-indexer.in
+++ b/sbin/rt-fulltext-indexer.in
@@ -70,40 +70,18 @@ BEGIN { # BEGIN RT CMD BOILERPLATE
use RT -init;
use RT::Interface::CLI ();
-my %OPT = (
- help => 0,
- debug => 0,
- quiet => 0,
-);
-my @OPT_LIST = qw(help|h! debug! quiet);
+use Getopt::Long qw(GetOptions);
+my %OPT = ( memory => '2M', limit => 0 );
+GetOptions( \%OPT,
+ "help|h!",
+ "debug!",
+ "quiet!",
-my $db_type = RT->Config->Get('DatabaseType');
-if ( $db_type eq 'Pg' ) {
- %OPT = (
- %OPT,
- limit => 0,
- all => 0,
- );
- push @OPT_LIST, 'limit=i', 'all!';
-}
-elsif ( $db_type eq 'mysql' ) {
- %OPT = (
- %OPT,
- limit => 0,
- all => 0,
- );
- push @OPT_LIST, 'limit=i', 'all!';
-}
-elsif ( $db_type eq 'Oracle' ) {
- %OPT = (
- %OPT,
- memory => '2M',
- );
- push @OPT_LIST, qw(memory=s);
-}
+ "all!",
+ "limit=i",
-use Getopt::Long qw(GetOptions);
-GetOptions( \%OPT, @OPT_LIST );
+ "memory=s",
+);
if ( $OPT{'help'} ) {
RT::Interface::CLI->ShowHelp(
@@ -123,6 +101,7 @@ if ( !flock main::DATA, LOCK_EX | LOCK_NB ) {
}
}
+my $db_type = RT->Config->Get('DatabaseType');
my $fts_config = RT->Config->Get('FullTextSearch') || {};
unless ( $fts_config->{'Enable'} ) {
print STDERR <<EOT;
commit 9b9fbf4140a3172bf46a7f68deea79b19a3741a8
Author: Alex Vandiver <alexmv at bestpractical.com>
Date: Fri Jul 25 00:24:01 2014 -0400
Documentation has moved out; update --help accordingly
The documentation (which included =head1 sections for each database
type) was moved to the more centralized docs/full_text_indexing.pod in
fa5dffcb; however, --help still looked for the relevant sections.
Remove the special-case code which is no longer relevant.
diff --git a/sbin/rt-fulltext-indexer.in b/sbin/rt-fulltext-indexer.in
index 960d4a4..615357f 100644
--- a/sbin/rt-fulltext-indexer.in
+++ b/sbin/rt-fulltext-indexer.in
@@ -83,11 +83,7 @@ GetOptions( \%OPT,
"memory=s",
);
-if ( $OPT{'help'} ) {
- RT::Interface::CLI->ShowHelp(
- Sections => 'NAME|DESCRIPTION|'. uc($db_type),
- );
-}
+RT::Interface::CLI->ShowHelp if $OPT{help};
use Fcntl ':flock';
if ( !flock main::DATA, LOCK_EX | LOCK_NB ) {
commit b97c2fa0efb2335cc61a853b805f7e60d198a00b
Author: Alex Vandiver <alexmv at bestpractical.com>
Date: Fri Jul 25 01:44:24 2014 -0400
Remove AUTHOR section; it is unnecessary in core sbin files
diff --git a/sbin/rt-fulltext-indexer.in b/sbin/rt-fulltext-indexer.in
index 615357f..a6e231a 100644
--- a/sbin/rt-fulltext-indexer.in
+++ b/sbin/rt-fulltext-indexer.in
@@ -319,11 +319,6 @@ This is a helper script to keep full text indexes in sync with data.
Read F<docs/full_text_indexing.pod> for complete details on how and when
to run it.
-=head1 AUTHOR
-
-Ruslan Zakirov E<lt>ruz at bestpractical.comE<gt>,
-Alex Vandiver E<lt>alexmv at bestpractical.comE<gt>
-
=cut
__DATA__
commit 2d7ea7f81d15e9dc1c8b157561a97a9d262fc01a
Author: Alex Vandiver <alexmv at bestpractical.com>
Date: Fri Jul 25 00:28:27 2014 -0400
Skipping ACL checks yields a sizable performance increase
The indexer spends quite a bit of time checking that the attachment
content is visible to the current user before indexing it -- all of
which is unnecessary, as it is run as the system user. Explicitly
disable the ACL checking for the indexer, which removes a large number
of queries and yields a correspondingly sizable increase in indexing
speed.
diff --git a/sbin/rt-fulltext-indexer.in b/sbin/rt-fulltext-indexer.in
index a6e231a..9b9b809 100644
--- a/sbin/rt-fulltext-indexer.in
+++ b/sbin/rt-fulltext-indexer.in
@@ -138,6 +138,15 @@ EOT
exit 1;
}
+# Skip ACL checks. This saves a large number of unnecessary queries
+# (for tickets, ACLs, and users) which are unnecessary, as we are
+# running as the system user.
+{
+ no warnings 'redefine';
+ *RT::Attachment::_Value = \&DBIx::SearchBuilder::Record::_Value;
+ *RT::Attachments::Next = \&DBIx::SearchBuilder::Next;
+}
+
my @types = qw(text html);
foreach my $type ( @types ) {
REDO:
commit bb06932088058825707af7f4d135584a9994677a
Author: Alex Vandiver <alexmv at bestpractical.com>
Date: Fri Jul 25 00:36:49 2014 -0400
Index attachments in one pass through the database, not two
There is no reason to perform two passes through the database instead of
one; doing one allows for better progress estimates, as well as
potentially increasing locality for the database.
diff --git a/sbin/rt-fulltext-indexer.in b/sbin/rt-fulltext-indexer.in
index 9b9b809..2b9e368 100644
--- a/sbin/rt-fulltext-indexer.in
+++ b/sbin/rt-fulltext-indexer.in
@@ -147,14 +147,12 @@ EOT
*RT::Attachments::Next = \&DBIx::SearchBuilder::Next;
}
-my @types = qw(text html);
-foreach my $type ( @types ) {
- REDO:
- my $attachments = attachments($type);
+{
+ my $attachments = attachments();
$attachments->Limit(
FIELD => 'id',
OPERATOR => '>',
- VALUE => last_indexed($type)
+ VALUE => last_indexed()
);
$attachments->OrderBy( FIELD => 'id', ORDER => 'asc' );
$attachments->RowsPerPage( $OPT{'limit'} || 100 );
@@ -165,14 +163,13 @@ foreach my $type ( @types ) {
my $text = $a->Content;
next unless defined $text && length($text);
$found++;
- process( $type, $a, \$text );
+ process( $a, \$text );
debug("Processed attachment #". $a->id );
}
- goto REDO if $OPT{'all'} and $attachments->Count == ($OPT{'limit'} || 100)
+ redo if $OPT{'all'} and $attachments->Count == ($OPT{'limit'} || 100);
}
sub attachments {
- my $type = shift;
my $res = RT::Attachments->new( RT->SystemUser );
my $txn_alias = $res->Join(
ALIAS1 => 'main',
@@ -198,17 +195,17 @@ sub attachments {
VALUE => 'deleted'
);
$res->Limit(
- FIELD => 'ContentType',
- VALUE => ($type eq "html" ? "text/html" : "text/plain"),
+ FIELD => 'ContentType',
+ OPERATOR => 'IN',
+ VALUE => ['text/plain', 'text/html'],
);
return $res;
}
sub last_indexed {
- my ($type) = (@_);
return goto_specific(
suffix => $db_type,
- error => "Don't know how to find last indexed $type attachment for $db_type DB",
+ error => "Don't know how to find last indexed attachment for $db_type DB",
arguments => \@_,
);
}
@@ -223,7 +220,7 @@ sub process {
sub last_indexed_mysql { last_indexed_pg(@_); }
sub process_mysql {
- my ($type, $attachment, $text) = (@_);
+ my ($attachment, $text) = (@_);
my $dbh = $RT::Handle->dbh;
my $table = $fts_config->{'Table'};
@@ -239,8 +236,7 @@ sub process_mysql {
}
sub last_indexed_pg {
- my $type = shift;
- my $attachments = attachments( $type );
+ my $attachments = attachments();
my $alias = 'main';
if ( $fts_config->{'Table'} && $fts_config->{'Table'} ne 'Attachments' ) {
$alias = $attachments->Join(
@@ -264,7 +260,7 @@ sub last_indexed_pg {
}
sub process_pg {
- my ($type, $attachment, $text) = (@_);
+ my ($attachment, $text) = (@_);
my $dbh = $RT::Handle->dbh;
my $table = $fts_config->{'Table'};
commit 834cc3cf3628dd5e243f5a6cecc3e2a3689ba593
Author: Alex Vandiver <alexmv at bestpractical.com>
Date: Fri Jul 25 00:45:35 2014 -0400
Index attachments even on deleted tickets
Skipping indexing on tickets which are currently deleted may lead to
content which is later not indexed and not findable. Even if a ticket
becomes un-deleted at a later point, it will never again be indexed, as
last_indexed() will limit to new attachments since the last run. The
overhead of joining through Attachments and into Tickets is also not to
be overlooked -- on MySQL, it causes pessimal performance for large
systems, involving more than one filesort and temporary table.
Index all attachments, no matter the source, on the premise that
limiting to non-deleted tickets is better done at query time, not at
index time. It is both more timely, and better limited.
diff --git a/sbin/rt-fulltext-indexer.in b/sbin/rt-fulltext-indexer.in
index 2b9e368..2f3d6ad 100644
--- a/sbin/rt-fulltext-indexer.in
+++ b/sbin/rt-fulltext-indexer.in
@@ -148,7 +148,12 @@ EOT
}
{
- my $attachments = attachments();
+ my $attachments = RT::Attachments->new( RT->SystemUser );
+ $attachments->Limit(
+ FIELD => 'ContentType',
+ OPERATOR => 'IN',
+ VALUE => ['text/plain', 'text/html'],
+ );
$attachments->Limit(
FIELD => 'id',
OPERATOR => '>',
@@ -169,39 +174,6 @@ EOT
redo if $OPT{'all'} and $attachments->Count == ($OPT{'limit'} || 100);
}
-sub attachments {
- my $res = RT::Attachments->new( RT->SystemUser );
- my $txn_alias = $res->Join(
- ALIAS1 => 'main',
- FIELD1 => 'TransactionId',
- TABLE2 => 'Transactions',
- FIELD2 => 'id',
- );
- $res->Limit(
- ALIAS => $txn_alias,
- FIELD => 'ObjectType',
- VALUE => 'RT::Ticket',
- );
- my $ticket_alias = $res->Join(
- ALIAS1 => $txn_alias,
- FIELD1 => 'ObjectId',
- TABLE2 => 'Tickets',
- FIELD2 => 'id',
- );
- $res->Limit(
- ALIAS => $ticket_alias,
- FIELD => 'Status',
- OPERATOR => '!=',
- VALUE => 'deleted'
- );
- $res->Limit(
- FIELD => 'ContentType',
- OPERATOR => 'IN',
- VALUE => ['text/plain', 'text/html'],
- );
- return $res;
-}
-
sub last_indexed {
return goto_specific(
suffix => $db_type,
@@ -236,7 +208,12 @@ sub process_mysql {
}
sub last_indexed_pg {
- my $attachments = attachments();
+ my $attachments = RT::Attachments->new( RT->SystemUser );
+ $attachments->Limit(
+ FIELD => 'ContentType',
+ OPERATOR => 'IN',
+ VALUE => ['text/plain', 'text/html'],
+ );
my $alias = 'main';
if ( $fts_config->{'Table'} && $fts_config->{'Table'} ne 'Attachments' ) {
$alias = $attachments->Join(
commit 33ae6baf32ab0b73d4cdcdf5e75dfb94c33f27e5
Author: Alex Vandiver <alexmv at bestpractical.com>
Date: Fri Jul 25 00:58:24 2014 -0400
mysql and pg share the same last_indexed; unify the method
diff --git a/sbin/rt-fulltext-indexer.in b/sbin/rt-fulltext-indexer.in
index 2f3d6ad..e88aec1 100644
--- a/sbin/rt-fulltext-indexer.in
+++ b/sbin/rt-fulltext-indexer.in
@@ -175,39 +175,6 @@ EOT
}
sub last_indexed {
- return goto_specific(
- suffix => $db_type,
- error => "Don't know how to find last indexed attachment for $db_type DB",
- arguments => \@_,
- );
-}
-
-sub process {
- return goto_specific(
- suffix => $db_type,
- error => "No processer for $db_type DB",
- arguments => \@_,
- );
-}
-
-sub last_indexed_mysql { last_indexed_pg(@_); }
-sub process_mysql {
- my ($attachment, $text) = (@_);
-
- my $dbh = $RT::Handle->dbh;
- my $table = $fts_config->{'Table'};
-
- my $query;
- if ( my ($id) = $dbh->selectrow_array("SELECT id FROM $table WHERE id = ?", undef, $attachment->id) ) {
- $query = "UPDATE $table SET Content = ? WHERE id = ?";
- } else {
- $query = "INSERT INTO $table(Content, id) VALUES(?, ?)";
- }
-
- $dbh->do( $query, undef, $$text, $attachment->id );
-}
-
-sub last_indexed_pg {
my $attachments = RT::Attachments->new( RT->SystemUser );
$attachments->Limit(
FIELD => 'ContentType',
@@ -236,6 +203,31 @@ sub last_indexed_pg {
return $res->id;
}
+sub process {
+ return goto_specific(
+ suffix => $db_type,
+ error => "No processer for $db_type DB",
+ arguments => \@_,
+ );
+}
+
+
+sub process_mysql {
+ my ($attachment, $text) = (@_);
+
+ my $dbh = $RT::Handle->dbh;
+ my $table = $fts_config->{'Table'};
+
+ my $query;
+ if ( my ($id) = $dbh->selectrow_array("SELECT id FROM $table WHERE id = ?", undef, $attachment->id) ) {
+ $query = "UPDATE $table SET Content = ? WHERE id = ?";
+ } else {
+ $query = "INSERT INTO $table(Content, id) VALUES(?, ?)";
+ }
+
+ $dbh->do( $query, undef, $$text, $attachment->id );
+}
+
sub process_pg {
my ($attachment, $text) = (@_);
commit d284a76b4b66a3b57f261c3cd7a69dae96fd9bd4
Author: Alex Vandiver <alexmv at bestpractical.com>
Date: Fri Jul 25 01:05:22 2014 -0400
Replace the last use of goto_specific with explicit function calls
diff --git a/sbin/rt-fulltext-indexer.in b/sbin/rt-fulltext-indexer.in
index e88aec1..8039239 100644
--- a/sbin/rt-fulltext-indexer.in
+++ b/sbin/rt-fulltext-indexer.in
@@ -168,7 +168,11 @@ EOT
my $text = $a->Content;
next unless defined $text && length($text);
$found++;
- process( $a, \$text );
+ if ($db_type eq 'mysql') {
+ process_mysql( $a, \$text );
+ } elsif ($db_type eq 'Pg') {
+ process_pg( $a, \$text );
+ }
debug("Processed attachment #". $a->id );
}
redo if $OPT{'all'} and $attachments->Count == ($OPT{'limit'} || 100);
@@ -203,14 +207,6 @@ sub last_indexed {
return $res->id;
}
-sub process {
- return goto_specific(
- suffix => $db_type,
- error => "No processer for $db_type DB",
- arguments => \@_,
- );
-}
-
sub process_mysql {
my ($attachment, $text) = (@_);
@@ -263,20 +259,6 @@ sub process_pg {
}
}
-sub goto_specific {
- my %args = (@_);
-
- my $func = (caller(1))[3];
- $func =~ s/.*:://;
- my $call = $func ."_". lc $args{'suffix'};
- unless ( defined &$call ) {
- return undef unless $args{'error'};
- require Carp; Carp::croak( $args{'error'} );
- }
- @_ = @{ $args{'arguments'} };
- goto &$call;
-}
-
# helper functions
sub debug { print @_, "\n" if $OPT{debug}; 1 }
commit ec0931f649ce24f27d472fc47ae2c5e2e80378a8
Author: Alex Vandiver <alexmv at bestpractical.com>
Date: Fri Jul 25 15:31:36 2014 -0400
Simplify last_indexed
diff --git a/sbin/rt-fulltext-indexer.in b/sbin/rt-fulltext-indexer.in
index 8039239..0a27860 100644
--- a/sbin/rt-fulltext-indexer.in
+++ b/sbin/rt-fulltext-indexer.in
@@ -179,35 +179,17 @@ EOT
}
sub last_indexed {
- my $attachments = RT::Attachments->new( RT->SystemUser );
- $attachments->Limit(
- FIELD => 'ContentType',
- OPERATOR => 'IN',
- VALUE => ['text/plain', 'text/html'],
- );
- my $alias = 'main';
- if ( $fts_config->{'Table'} && $fts_config->{'Table'} ne 'Attachments' ) {
- $alias = $attachments->Join(
- TYPE => 'left',
- FIELD1 => 'id',
- TABLE2 => $fts_config->{'Table'},
- FIELD2 => 'id',
- );
+ if ( $db_type eq "mysql" ) {
+ return $dbh->selectrow_arrayref("SELECT MAX(id) FROM $table")->[0];
+ } elsif ( $db_type eq "pg" ) {
+ if ( $fts_config->{'Table'} && $fts_config->{'Table'} ne 'Attachments' ) {
+ return $dbh->selectrow_array("SELECT MAX(id) FROM $table")->[0];
+ } else {
+ return $dbh->selectrow_array("SELECT MAX(id) FROM Attachments WHERE $column IS NOT NULL")->[0];
+ }
}
- $attachments->Limit(
- ALIAS => $alias,
- FIELD => $fts_config->{'Column'},
- OPERATOR => 'IS NOT',
- VALUE => 'NULL',
- );
- $attachments->OrderBy( FIELD => 'id', ORDER => 'desc' );
- $attachments->RowsPerPage( 1 );
- my $res = $attachments->First;
- return 0 unless $res;
- return $res->id;
}
-
sub process_mysql {
my ($attachment, $text) = (@_);
commit edb1d56e2b8dacd3ba5560d0bcd3301bca8334d9
Author: Alex Vandiver <alexmv at bestpractical.com>
Date: Fri Jul 25 15:33:58 2014 -0400
Only call last_indexed once, as it may be heavy
diff --git a/sbin/rt-fulltext-indexer.in b/sbin/rt-fulltext-indexer.in
index 0a27860..7415b39 100644
--- a/sbin/rt-fulltext-indexer.in
+++ b/sbin/rt-fulltext-indexer.in
@@ -147,6 +147,7 @@ EOT
*RT::Attachments::Next = \&DBIx::SearchBuilder::Next;
}
+my $LAST = last_indexed();
{
my $attachments = RT::Attachments->new( RT->SystemUser );
$attachments->Limit(
@@ -157,7 +158,7 @@ EOT
$attachments->Limit(
FIELD => 'id',
OPERATOR => '>',
- VALUE => last_indexed()
+ VALUE => $LAST,
);
$attachments->OrderBy( FIELD => 'id', ORDER => 'asc' );
$attachments->RowsPerPage( $OPT{'limit'} || 100 );
@@ -175,6 +176,7 @@ EOT
}
debug("Processed attachment #". $a->id );
}
+ $LAST = $attachments->Last->id if $attachments->Count;
redo if $OPT{'all'} and $attachments->Count == ($OPT{'limit'} || 100);
}
commit 59abcc042a641fa38a85f62329240da7549c8826
Author: Alex Vandiver <alexmv at bestpractical.com>
Date: Fri Jul 25 15:36:21 2014 -0400
Index even empty attachments
diff --git a/sbin/rt-fulltext-indexer.in b/sbin/rt-fulltext-indexer.in
index 7415b39..b86ac06 100644
--- a/sbin/rt-fulltext-indexer.in
+++ b/sbin/rt-fulltext-indexer.in
@@ -49,6 +49,7 @@
use strict;
use warnings;
no warnings 'once';
+use 5.010;
# fix lib paths, some may be relative
BEGIN { # BEGIN RT CMD BOILERPLATE
@@ -166,8 +167,7 @@ my $LAST = last_indexed();
my $found = 0;
while ( my $a = $attachments->Next ) {
debug("Found attachment #". $a->id );
- my $text = $a->Content;
- next unless defined $text && length($text);
+ my $text = $a->Content // "";
$found++;
if ($db_type eq 'mysql') {
process_mysql( $a, \$text );
commit ad59d98bed970e30e2ac0af43c60b49d85ad870b
Author: Alex Vandiver <alexmv at bestpractical.com>
Date: Fri Jul 25 15:42:19 2014 -0400
As last_indexed is based on the highest insert, there will never be an UPDATE needed
diff --git a/sbin/rt-fulltext-indexer.in b/sbin/rt-fulltext-indexer.in
index b86ac06..fe2b245 100644
--- a/sbin/rt-fulltext-indexer.in
+++ b/sbin/rt-fulltext-indexer.in
@@ -198,12 +198,7 @@ sub process_mysql {
my $dbh = $RT::Handle->dbh;
my $table = $fts_config->{'Table'};
- my $query;
- if ( my ($id) = $dbh->selectrow_array("SELECT id FROM $table WHERE id = ?", undef, $attachment->id) ) {
- $query = "UPDATE $table SET Content = ? WHERE id = ?";
- } else {
- $query = "INSERT INTO $table(Content, id) VALUES(?, ?)";
- }
+ my $query = "INSERT INTO $table(Content, id) VALUES(?, ?)";
$dbh->do( $query, undef, $$text, $attachment->id );
}
@@ -216,12 +211,8 @@ sub process_pg {
my $column = $fts_config->{'Column'};
my $query;
- if ( $table ) {
- if ( my ($id) = $dbh->selectrow_array("SELECT id FROM $table WHERE id = ?", undef, $attachment->id) ) {
- $query = "UPDATE $table SET $column = to_tsvector(?) WHERE id = ?";
- } else {
- $query = "INSERT INTO $table($column, id) VALUES(to_tsvector(?), ?)";
- }
+ if ( $table ne 'Attachments' ) {
+ $query = "INSERT INTO $table($column, id) VALUES(to_tsvector(?), ?)";
} else {
$query = "UPDATE Attachments SET $column = to_tsvector(?) WHERE id = ?";
}
commit 8a96a4009babf2ae4d9aa3b8b9587df5bfd05259
Author: Alex Vandiver <alexmv at bestpractical.com>
Date: Fri Jul 25 15:45:15 2014 -0400
Inversion of control of main indexing loops
Rather than having database-dependent if-statements, followed by a
standard loop which contains further database-dependent if-statements,
instead turn the loop into a function which can be called from one of
two database-specific functions. This is important because MySQL's
iteration will further diverge from PostgreSQL's in the following
commits.
This commit is best viewed with:
git diff --patience --ignore-all-space
diff --git a/sbin/rt-fulltext-indexer.in b/sbin/rt-fulltext-indexer.in
index fe2b245..ebcf3c2 100644
--- a/sbin/rt-fulltext-indexer.in
+++ b/sbin/rt-fulltext-indexer.in
@@ -48,7 +48,6 @@
# END BPS TAGGED BLOCK }}}
use strict;
use warnings;
-no warnings 'once';
use 5.010;
# fix lib paths, some may be relative
@@ -148,68 +147,68 @@ EOT
*RT::Attachments::Next = \&DBIx::SearchBuilder::Next;
}
-my $LAST = last_indexed();
-{
- my $attachments = RT::Attachments->new( RT->SystemUser );
- $attachments->Limit(
- FIELD => 'ContentType',
- OPERATOR => 'IN',
- VALUE => ['text/plain', 'text/html'],
- );
- $attachments->Limit(
- FIELD => 'id',
- OPERATOR => '>',
- VALUE => $LAST,
- );
- $attachments->OrderBy( FIELD => 'id', ORDER => 'asc' );
- $attachments->RowsPerPage( $OPT{'limit'} || 100 );
-
- my $found = 0;
- while ( my $a = $attachments->Next ) {
- debug("Found attachment #". $a->id );
- my $text = $a->Content // "";
- $found++;
- if ($db_type eq 'mysql') {
- process_mysql( $a, \$text );
- } elsif ($db_type eq 'Pg') {
- process_pg( $a, \$text );
- }
- debug("Processed attachment #". $a->id );
- }
- $LAST = $attachments->Last->id if $attachments->Count;
- redo if $OPT{'all'} and $attachments->Count == ($OPT{'limit'} || 100);
+my $LAST;
+if ($db_type eq 'mysql') {
+ process_mysql();
+} elsif ($db_type eq 'Pg') {
+ process_pg();
}
-sub last_indexed {
- if ( $db_type eq "mysql" ) {
- return $dbh->selectrow_arrayref("SELECT MAX(id) FROM $table")->[0];
- } elsif ( $db_type eq "pg" ) {
- if ( $fts_config->{'Table'} && $fts_config->{'Table'} ne 'Attachments' ) {
- return $dbh->selectrow_array("SELECT MAX(id) FROM $table")->[0];
- } else {
- return $dbh->selectrow_array("SELECT MAX(id) FROM Attachments WHERE $column IS NOT NULL")->[0];
- }
+sub attachment_loop {
+ my $subref = shift;
+ my $table = $fts_config->{'Table'};
+ $LAST //= 0;
+
+ # Fetch in batches of size --limit
+ {
+ # Indexes all text/plain and text/html attachments
+ my $attachments = RT::Attachments->new( RT->SystemUser );
+ $attachments->Limit(
+ FIELD => 'ContentType',
+ OPERATOR => 'IN',
+ VALUE => ['text/plain', 'text/html'],
+ );
+ $attachments->Limit( FIELD => 'id', OPERATOR => '>', VALUE => $LAST );
+ $attachments->OrderBy( FIELD => 'id', ORDER => 'asc' );
+ $attachments->RowsPerPage( $OPT{'limit'} || 100 );
+
+ # Call back to the DB-specific part
+ $subref->($attachments);
+
+ $LAST = $attachments->Last->id if $attachments->Count;
+
+ redo if $OPT{'all'} and $attachments->Count == ($OPT{'limit'} || 100);
}
}
sub process_mysql {
- my ($attachment, $text) = (@_);
-
my $dbh = $RT::Handle->dbh;
my $table = $fts_config->{'Table'};
+ ($LAST) = $dbh->selectrow_array("SELECT MAX(id) FROM $table");
+
my $query = "INSERT INTO $table(Content, id) VALUES(?, ?)";
- $dbh->do( $query, undef, $$text, $attachment->id );
+ attachment_loop( sub {
+ my ($attachments) = @_;
+ while ( my $a = $attachments->Next ) {
+ debug("Found attachment #". $a->id );
+ $dbh->do( $query, undef, ($a->Content // ""), $a->id );
+ }
+ });
}
sub process_pg {
- my ($attachment, $text) = (@_);
-
my $dbh = $RT::Handle->dbh;
my $table = $fts_config->{'Table'};
my $column = $fts_config->{'Column'};
+ if ( $table ne 'Attachments' ) {
+ ($LAST) = $dbh->selectrow_array("SELECT MAX(id) FROM $table");
+ } else {
+ ($LAST) = $dbh->selectrow_array("SELECT MAX(id) FROM Attachments WHERE $column IS NOT NULL");
+ }
+
my $query;
if ( $table ne 'Attachments' ) {
$query = "INSERT INTO $table($column, id) VALUES(to_tsvector(?), ?)";
@@ -217,21 +216,27 @@ sub process_pg {
$query = "UPDATE Attachments SET $column = to_tsvector(?) WHERE id = ?";
}
- my $status = eval { $dbh->do( $query, undef, $$text, $attachment->id ) };
- unless ( $status ) {
- if ( $dbh->err == 7 && $dbh->state eq '54000' ) {
- warn "Attachment @{[$attachment->id]} cannot be indexed. Most probably it contains too many unique words. Error: ". $dbh->errstr;
- } elsif ( $dbh->err == 7 && $dbh->state eq '22021' ) {
- warn "Attachment @{[$attachment->id]} cannot be indexed. Most probably it contains invalid UTF8 bytes. Error: ". $dbh->errstr;
- } else {
- die "error: ". $dbh->errstr;
+ attachment_loop( sub {
+ my ($attachments) = @_;
+ while ( my $a = $attachments->Next ) {
+ debug("Found attachment #". $a->id );
+ my $status = eval { $dbh->do( $query, undef, ($a->Content // ""), $a->id ) };
+ unless ( $status ) {
+ if ( $dbh->err == 7 && $dbh->state eq '54000' ) {
+ warn "Attachment @{[$a->id]} cannot be indexed. Most probably it contains too many unique words. Error: ". $dbh->errstr;
+ } elsif ( $dbh->err == 7 && $dbh->state eq '22021' ) {
+ warn "Attachment @{[$a->id]} cannot be indexed. Most probably it contains invalid UTF8 bytes. Error: ". $dbh->errstr;
+ } else {
+ die "error: ". $dbh->errstr;
+ }
+
+ # Insert an empty tsvector, so we count this row as "indexed"
+ # for purposes of knowing where to pick up
+ eval { $dbh->do( $query, undef, "", $a->id ) }
+ or die "Failed to insert empty row: " . $dbh->errstr;
+ }
}
-
- # Insert an empty tsvector, so we count this row as "indexed"
- # for purposes of knowing where to pick up
- eval { $dbh->do( $query, undef, "", $attachment->id ) }
- or die "Failed to insert empty tsvector: " . $dbh->errstr;
- }
+ });
}
commit d52840797534029a18a2954a88c399f131afe9dc
Author: Alex Vandiver <alexmv at bestpractical.com>
Date: Fri Jul 25 01:37:10 2014 -0400
Switch to preparing statements, rather than just setting strings
Prepared statements provide a small speed benefit, removing the need for
the database to re-parse the query string.
diff --git a/sbin/rt-fulltext-indexer.in b/sbin/rt-fulltext-indexer.in
index ebcf3c2..008ad48 100644
--- a/sbin/rt-fulltext-indexer.in
+++ b/sbin/rt-fulltext-indexer.in
@@ -187,13 +187,13 @@ sub process_mysql {
($LAST) = $dbh->selectrow_array("SELECT MAX(id) FROM $table");
- my $query = "INSERT INTO $table(Content, id) VALUES(?, ?)";
+ my $sth = $dbh->prepare("INSERT INTO $table(Content, id) VALUES(?, ?)");
attachment_loop( sub {
my ($attachments) = @_;
while ( my $a = $attachments->Next ) {
debug("Found attachment #". $a->id );
- $dbh->do( $query, undef, ($a->Content // ""), $a->id );
+ $sth->execute( ($a->Content // ""), $a->id );
}
});
}
@@ -209,18 +209,18 @@ sub process_pg {
($LAST) = $dbh->selectrow_array("SELECT MAX(id) FROM Attachments WHERE $column IS NOT NULL");
}
- my $query;
+ my $sth;
if ( $table ne 'Attachments' ) {
- $query = "INSERT INTO $table($column, id) VALUES(to_tsvector(?), ?)";
+ $sth = $dbh->prepare("INSERT INTO $table($column, id) VALUES(to_tsvector(?), ?)");
} else {
- $query = "UPDATE Attachments SET $column = to_tsvector(?) WHERE id = ?";
+ $sth = $dbh->prepare("UPDATE Attachments SET $column = to_tsvector(?) WHERE id = ?");
}
attachment_loop( sub {
my ($attachments) = @_;
while ( my $a = $attachments->Next ) {
debug("Found attachment #". $a->id );
- my $status = eval { $dbh->do( $query, undef, ($a->Content // ""), $a->id ) };
+ my $status = eval { $sth->execute( ($a->Content // ""), $a->id ) };
unless ( $status ) {
if ( $dbh->err == 7 && $dbh->state eq '54000' ) {
warn "Attachment @{[$a->id]} cannot be indexed. Most probably it contains too many unique words. Error: ". $dbh->errstr;
@@ -232,7 +232,7 @@ sub process_pg {
# Insert an empty tsvector, so we count this row as "indexed"
# for purposes of knowing where to pick up
- eval { $dbh->do( $query, undef, "", $a->id ) }
+ eval { $sth->execute( "", $a->id ) }
or die "Failed to insert empty row: " . $dbh->errstr;
}
}
commit 03cb0cd8656b8b084cadbab21fa8078e0afe9506
Author: Alex Vandiver <alexmv at bestpractical.com>
Date: Fri Jul 25 01:39:22 2014 -0400
INSERT DELAYED provides notable speed benefits on MyISAM
MySQL supports INSERT DELAYED[1] for MyISAM (but not InnoDB) tables. This
allows the server to defer inserts until it has a good opportunity to
write them, and to write them in bulk. While there is a small risk of
data loss (if the server is terminated before the data is written) this
poses no problem for an AttachmentsIndex table, for which all inserted
data is trivial to re-generate.
[1] http://dev.mysql.com/doc/refman/5.1/en/insert-delayed.html
diff --git a/lib/RT/Config.pm b/lib/RT/Config.pm
index 555f81e..b2988c6 100644
--- a/lib/RT/Config.pm
+++ b/lib/RT/Config.pm
@@ -604,6 +604,7 @@ our %META;
} else {
# Internal, one-column table
$v->{Column} = 'Content';
+ $v->{Engine} = $engine;
}
}
} else {
diff --git a/sbin/rt-fulltext-indexer.in b/sbin/rt-fulltext-indexer.in
index 008ad48..5a689b4 100644
--- a/sbin/rt-fulltext-indexer.in
+++ b/sbin/rt-fulltext-indexer.in
@@ -187,7 +187,8 @@ sub process_mysql {
($LAST) = $dbh->selectrow_array("SELECT MAX(id) FROM $table");
- my $sth = $dbh->prepare("INSERT INTO $table(Content, id) VALUES(?, ?)");
+ my $insert = $fts_config->{Engine} eq "MyISAM" ? "INSERT DELAYED" : "INSERT";
+ my $sth = $dbh->prepare("$insert INTO $table(Content, id) VALUES(?, ?)");
attachment_loop( sub {
my ($attachments) = @_;
commit 4b5ef04df268598e48f6959247323d3d0a861cfc
Author: Alex Vandiver <alexmv at bestpractical.com>
Date: Fri Jul 25 01:40:16 2014 -0400
Improve MySQL insert speed by batching inserts into one statement
MySQL must flush buffers after every insert statement; as such,
providing large numbers of INSERT statements is quite inefficient, as
most time is spent in disk I/O. Instead, store rows to be inserted, and
insert them batch-by-batch.
This technique is not applicable to PostgreSQL because failure of the
to_tsvector call to convert a string would abort the entire insert.
Additionally, most installs use an additional column on the existing
table, which requires an UPDATE, and not an INSERT, which is not easily
batched.
diff --git a/sbin/rt-fulltext-indexer.in b/sbin/rt-fulltext-indexer.in
index 5a689b4..2861149 100644
--- a/sbin/rt-fulltext-indexer.in
+++ b/sbin/rt-fulltext-indexer.in
@@ -187,15 +187,28 @@ sub process_mysql {
($LAST) = $dbh->selectrow_array("SELECT MAX(id) FROM $table");
+ # Doing large inserts is faster than individual statements, but
+ # comes at a parsing cost; cache the statement handles (99% of which
+ # will be the same size) for a notable (2x) speed gain.
+ my %sthandles;
my $insert = $fts_config->{Engine} eq "MyISAM" ? "INSERT DELAYED" : "INSERT";
- my $sth = $dbh->prepare("$insert INTO $table(Content, id) VALUES(?, ?)");
attachment_loop( sub {
my ($attachments) = @_;
+ my @insert;
+ my $found = 0;
while ( my $a = $attachments->Next ) {
debug("Found attachment #". $a->id );
- $sth->execute( ($a->Content // ""), $a->id );
+ push @insert, ($a->Content // ""), $a->id;
+ $found++;
}
+ return unless $found;
+
+ # $found should be the limit size on all but the last go-around.
+ $sthandles{$found} ||=
+ $dbh->prepare("$insert INTO $table(Content, id) VALUES "
+ . join(", ", ("(?,?)") x $found));
+ $sthandles{$found}->execute(@insert);
});
}
commit e873ac0386a15b4045250d1fd00a404feafcff27
Author: Alex Vandiver <alexmv at bestpractical.com>
Date: Fri Jul 25 15:49:17 2014 -0400
Testing finds 200 is a good default batch size
Smaller batch sizes are slightly less efficient; the main trade-off is
in maximum memory consumption.
diff --git a/docs/full_text_indexing.pod b/docs/full_text_indexing.pod
index 339625b..180122b 100644
--- a/docs/full_text_indexing.pod
+++ b/docs/full_text_indexing.pod
@@ -45,10 +45,9 @@ To keep the index up-to-date, you will need to run:
/opt/rt4/sbin/rt-fulltext-indexer
-...at regular intervals. By default, this will only tokenize up to 100
-tickets at a time; you can adjust this upwards by passing
-C<--limit 500>. Larger batch sizes will take longer and
-consume more memory.
+...at regular intervals. By default, this will only tokenize up to 200
+tickets at a time; you can adjust this upwards by passing C<--limit
+500>. Larger batch sizes will take longer and consume more memory.
If there is already an instances of C<rt-fulltext-indexer> running, new
ones will exit abnormally (with exit code 1) and the error message
@@ -95,10 +94,9 @@ To keep the index up-to-date, you will need to run:
/opt/rt4/sbin/rt-fulltext-indexer
-...at regular intervals. By default, this will only tokenize up to 100
-tickets at a time; you can adjust this upwards by passing
-C<--limit 500>. Larger batch sizes will take longer and
-consume more memory.
+...at regular intervals. By default, this will only tokenize up to 200
+tickets at a time; you can adjust this upwards by passing C<--limit
+500>. Larger batch sizes will take longer and consume more memory.
If there is already an instances of C<rt-fulltext-indexer> running, new
ones will exit abnormally (with exit code 1) and the error message
diff --git a/sbin/rt-fulltext-indexer.in b/sbin/rt-fulltext-indexer.in
index 2861149..c39029d 100644
--- a/sbin/rt-fulltext-indexer.in
+++ b/sbin/rt-fulltext-indexer.in
@@ -82,6 +82,7 @@ GetOptions( \%OPT,
"memory=s",
);
+$OPT{limit} ||= 200;
RT::Interface::CLI->ShowHelp if $OPT{help};
@@ -170,14 +171,14 @@ sub attachment_loop {
);
$attachments->Limit( FIELD => 'id', OPERATOR => '>', VALUE => $LAST );
$attachments->OrderBy( FIELD => 'id', ORDER => 'asc' );
- $attachments->RowsPerPage( $OPT{'limit'} || 100 );
+ $attachments->RowsPerPage( $OPT{'limit'} );
# Call back to the DB-specific part
$subref->($attachments);
$LAST = $attachments->Last->id if $attachments->Count;
- redo if $OPT{'all'} and $attachments->Count == ($OPT{'limit'} || 100);
+ redo if $OPT{'all'} and $attachments->Count == $OPT{'limit'};
}
}
commit 1a4798c72b404c5db91e563db52e0575b22bdcb5
Author: Alex Vandiver <alexmv at bestpractical.com>
Date: Fri Aug 1 14:13:00 2014 -0400
Indexing of "text" may fail for content with invalid UTF-8 byte sequences
Prior to f04f561f, content that claimed to be UTF-8 was inserted into
the database verbatim. This errored immediately upon insertion for
PostgreSQL, as the Content column is of type "TEXT"; MySQL, however,
did not attempt to validate the bytes, as its Content column is of type
"LONGBLOB".
Such invalid bytes thus only cause errors when they are inserted into a
"TEXT" column. As multiple rows are inserted in batches, use the row
number from the error message to determine which row to blank, and
re-attempt insertion.
diff --git a/sbin/rt-fulltext-indexer.in b/sbin/rt-fulltext-indexer.in
index c39029d..4646151 100644
--- a/sbin/rt-fulltext-indexer.in
+++ b/sbin/rt-fulltext-indexer.in
@@ -209,7 +209,22 @@ sub process_mysql {
$sthandles{$found} ||=
$dbh->prepare("$insert INTO $table(Content, id) VALUES "
. join(", ", ("(?,?)") x $found));
- $sthandles{$found}->execute(@insert);
+ TRY: {
+ my $status = eval { $sthandles{$found}->execute(@insert); };
+ unless ( $status ) {
+ my ($row) = $dbh->errstr =~ /\brow (\d+)\b/;
+ die $dbh->errstr unless $row;
+
+ my ($content, $id) = ($insert[($row - 1)*2], $insert[($row - 1)*2 +1]);
+ if ($dbh->err == 1366 and $dbh->state eq "HY000") {
+ warn "Attachment $id cannot be indexed. Most probably it contains invalid UTF8 bytes. Error: ". $dbh->errstr;
+ $insert[($row - 1)*2] = "";
+ redo TRY;
+ } else {
+ die "Attachment $id: ".$dbh->errstr;
+ }
+ }
+ }
});
}
commit 9c73198ed040b0481a6bb83ee4aca5848fee602c
Author: Alex Vandiver <alexmv at bestpractical.com>
Date: Thu Nov 6 16:29:56 2014 -0800
Refactor PostgreSQL's insert to also do bulk insertion
PostgreSQL's full-text search can use a separate column, or a separate
table. In the case of a separate table, the same bulk-insertion trick
as MySQL uses can also be used to noticeably speed up indexing time.
Refactor the MySQL loop to be used in both cases. Note that rather than
attempting to parse the error message (which only works in English
locales for MySQL, and does not work at all in PostgreSQL), the rows of
a failed bulk insert are re-attempted one at a time. This also better
catches the case where multiple errors occurred in one bulk insert.
This provides notable speed benefits: 5x for GIN, and 30x for GiST. Note
that this does not speed up the default configuration for PostgreSQL,
wherein the index lies in the Attachments table, and thus UPDATE
statements must be used.
diff --git a/sbin/rt-fulltext-indexer.in b/sbin/rt-fulltext-indexer.in
index 4646151..0e91002 100644
--- a/sbin/rt-fulltext-indexer.in
+++ b/sbin/rt-fulltext-indexer.in
@@ -182,17 +182,17 @@ sub attachment_loop {
}
}
-sub process_mysql {
+sub process_bulk_insert {
my $dbh = $RT::Handle->dbh;
- my $table = $fts_config->{'Table'};
-
- ($LAST) = $dbh->selectrow_array("SELECT MAX(id) FROM $table");
+ my ($statement, $error) = @_;
# Doing large inserts is faster than individual statements, but
# comes at a parsing cost; cache the statement handles (99% of which
# will be the same size) for a notable (2x) speed gain.
my %sthandles;
- my $insert = $fts_config->{Engine} eq "MyISAM" ? "INSERT DELAYED" : "INSERT";
+
+ $sthandles{1} =
+ $dbh->prepare($statement->(1));
attachment_loop( sub {
my ($attachments) = @_;
@@ -206,45 +206,95 @@ sub process_mysql {
return unless $found;
# $found should be the limit size on all but the last go-around.
- $sthandles{$found} ||=
- $dbh->prepare("$insert INTO $table(Content, id) VALUES "
- . join(", ", ("(?,?)") x $found));
- TRY: {
- my $status = eval { $sthandles{$found}->execute(@insert); };
- unless ( $status ) {
- my ($row) = $dbh->errstr =~ /\brow (\d+)\b/;
- die $dbh->errstr unless $row;
-
- my ($content, $id) = ($insert[($row - 1)*2], $insert[($row - 1)*2 +1]);
- if ($dbh->err == 1366 and $dbh->state eq "HY000") {
- warn "Attachment $id cannot be indexed. Most probably it contains invalid UTF8 bytes. Error: ". $dbh->errstr;
- $insert[($row - 1)*2] = "";
- redo TRY;
- } else {
- die "Attachment $id: ".$dbh->errstr;
- }
- }
+ $sthandles{$found} ||= $dbh->prepare($statement->($found));
+
+ return if eval { $sthandles{$found}->execute(@insert); };
+
+ # We can catch and recover from some errors; re-do row-by-row to
+ # know which row had which errors
+ while (@insert) {
+ my ($content, $id) = splice(@insert,0,2);
+ next if eval { $sthandles{1}->execute($content, $id); };
+ $error->($id, $content);
+
+ # If this was a semi-expected error, insert an empty
+ # tsvector, so we count this row as "indexed" for
+ # purposes of knowing where to pick up
+ eval { $sthandles{1}->execute( "", $id ) }
+ or die "Failed to insert empty row for attachment $id: " . $dbh->errstr;
}
});
}
-sub process_pg {
+sub process_mysql {
my $dbh = $RT::Handle->dbh;
my $table = $fts_config->{'Table'};
- my $column = $fts_config->{'Column'};
- if ( $table ne 'Attachments' ) {
- ($LAST) = $dbh->selectrow_array("SELECT MAX(id) FROM $table");
- } else {
- ($LAST) = $dbh->selectrow_array("SELECT MAX(id) FROM Attachments WHERE $column IS NOT NULL");
- }
+ ($LAST) = $dbh->selectrow_array("SELECT MAX(id) FROM $table");
- my $sth;
- if ( $table ne 'Attachments' ) {
- $sth = $dbh->prepare("INSERT INTO $table($column, id) VALUES(to_tsvector(?), ?)");
+ my $insert = $fts_config->{Engine} eq "MyISAM" ? "INSERT DELAYED" : "INSERT";
+
+ process_bulk_insert(
+ sub {
+ my ($n) = @_;
+ return "$insert INTO $table(Content, id) VALUES "
+ . join(", ", ("(?,?)") x $n);
+ },
+ sub {
+ my ($id) = @_;
+ if ($dbh->err == 1366 and $dbh->state eq "HY000") {
+ warn "Attachment $id cannot be indexed. Most probably it contains invalid UTF8 bytes. ".
+ "Error: ". $dbh->errstr;
+ } else {
+ die "Attachment $id cannot be indexed: " . $dbh->errstr;
+ }
+ }
+ );
+}
+
+
+sub process_pg {
+ if ( $fts_config->{'Table'} ne 'Attachments' ) {
+ process_pg_insert();
} else {
- $sth = $dbh->prepare("UPDATE Attachments SET $column = to_tsvector(?) WHERE id = ?");
+ process_pg_update();
}
+}
+
+sub process_pg_insert {
+ my $dbh = $RT::Handle->dbh;
+ my $table = $fts_config->{'Table'};
+ my $column = $fts_config->{'Column'};
+ ($LAST) = $dbh->selectrow_array("SELECT MAX(id) FROM $table");
+
+ process_bulk_insert(
+ sub {
+ my ($n) = @_;
+ return "INSERT INTO $table($column, id) VALUES "
+ . join(", ", ("(TO_TSVECTOR(?),?)") x $n);
+ },
+ sub {
+ my ($id) = @_;
+ if ( $dbh->err == 7 && $dbh->state eq '54000' ) {
+ warn "Attachment $id cannot be indexed. Most probably it contains too many unique words. ".
+ "Error: ". $dbh->errstr;
+ } elsif ( $dbh->err == 7 && $dbh->state eq '22021' ) {
+ warn "Attachment $id cannot be indexed. Most probably it contains invalid UTF8 bytes. ".
+ "Error: ". $dbh->errstr;
+ } else {
+ die "Attachment $id cannot be indexed: " . $dbh->errstr;
+ }
+ }
+ );
+}
+
+sub process_pg_update {
+ my $dbh = $RT::Handle->dbh;
+ my $column = $fts_config->{'Column'};
+
+ ($LAST) = $dbh->selectrow_array("SELECT MAX(id) FROM Attachments WHERE $column IS NOT NULL");
+
+ $sth = $dbh->prepare("UPDATE Attachments SET $column = to_tsvector(?) WHERE id = ?");
attachment_loop( sub {
my ($attachments) = @_;
commit c5575047991ad8b801e3ca4caebb839e58737663
Author: Alex Vandiver <alexmv at bestpractical.com>
Date: Thu Nov 6 17:07:58 2014 -0800
Perform PostgreSQL UPDATE statements inside of a database transaction
This attempts to reduce the number of write operations necessary, by
allowing PostgreSQL to batch writes, doing them only at COMMIT time.
This requires optimistically assuming that all UPDATEs will succeed --
and if any one does not, redoing all of them one-by-one. In cases where
indexing errors are frequent (at least one per two batches), this will
lead to a decrease in performance. However, in most cases it results in
a notable performance increase: 3x for GIN, 10x for GiST.
diff --git a/sbin/rt-fulltext-indexer.in b/sbin/rt-fulltext-indexer.in
index 0e91002..e79ebd8 100644
--- a/sbin/rt-fulltext-indexer.in
+++ b/sbin/rt-fulltext-indexer.in
@@ -294,27 +294,50 @@ sub process_pg_update {
($LAST) = $dbh->selectrow_array("SELECT MAX(id) FROM Attachments WHERE $column IS NOT NULL");
- $sth = $dbh->prepare("UPDATE Attachments SET $column = to_tsvector(?) WHERE id = ?");
+ my $sth = $dbh->prepare("UPDATE Attachments SET $column = TO_TSVECTOR(?) WHERE id = ?");
attachment_loop( sub {
my ($attachments) = @_;
+ my @insert;
while ( my $a = $attachments->Next ) {
debug("Found attachment #". $a->id );
- my $status = eval { $sth->execute( ($a->Content // ""), $a->id ) };
- unless ( $status ) {
- if ( $dbh->err == 7 && $dbh->state eq '54000' ) {
- warn "Attachment @{[$a->id]} cannot be indexed. Most probably it contains too many unique words. Error: ". $dbh->errstr;
- } elsif ( $dbh->err == 7 && $dbh->state eq '22021' ) {
- warn "Attachment @{[$a->id]} cannot be indexed. Most probably it contains invalid UTF8 bytes. Error: ". $dbh->errstr;
- } else {
- die "error: ". $dbh->errstr;
- }
-
- # Insert an empty tsvector, so we count this row as "indexed"
- # for purposes of knowing where to pick up
- eval { $sth->execute( "", $a->id ) }
- or die "Failed to insert empty row: " . $dbh->errstr;
+ push @insert, [($a->Content // ""), $a->id];
+ }
+
+ # Try in one database transaction; if it fails, we roll it back
+ # and try one statement at a time.
+ $dbh->begin_work;
+ my $ok = 1;
+ for (@insert) {
+ $ok = eval { $sth->execute( $_->[0], $_->[1] ) };
+ last unless $ok;
+ }
+ if ($ok) {
+ $dbh->commit;
+ return;
+ }
+ $dbh->rollback;
+
+ # Things didn't go well. Retry the UPDATE statements one row at
+ # a time, outside of the transaction.
+ for (@insert) {
+ my ($content, $id) = ($_->[0], $_->[1]);
+ next if eval { $sth->execute( $content, $id ) };
+ if ( $dbh->err == 7 && $dbh->state eq '54000' ) {
+ warn "Attachment $id cannot be indexed. Most probably it contains too many unique words. ".
+ "Error: ". $dbh->errstr;
+ } elsif ( $dbh->err == 7 && $dbh->state eq '22021' ) {
+ warn "Attachment $id cannot be indexed. Most probably it contains invalid UTF8 bytes. ".
+ "Error: ". $dbh->errstr;
+ } else {
+ die "Attachment $id cannot be indexed: " . $dbh->errstr;
}
+
+ # If this was a semi-expected error, insert an empty
+ # tsvector, so we count this row as "indexed" for
+ # purposes of knowing where to pick up
+ eval { $sth->execute( "", $id ) }
+ or die "Failed to insert empty row for attachment $id: " . $dbh->errstr;
}
});
}
commit 75400c90e74c47c1fb35df335097503d0a278de2
Author: Alex Vandiver <alexmv at bestpractical.com>
Date: Thu Nov 6 17:41:06 2014 -0800
If a new table is used for indexing, grant rights on it
Without this, the new table (created by the DBA user) cannot be read or
inserted into by the RT user, resulting in errors when indexing.
diff --git a/sbin/rt-setup-fulltext-index.in b/sbin/rt-setup-fulltext-index.in
index b5659c7..2c741d7 100644
--- a/sbin/rt-setup-fulltext-index.in
+++ b/sbin/rt-setup-fulltext-index.in
@@ -307,16 +307,20 @@ elsif ( $DB{'type'} eq 'Pg' ) {
silent => !$OPT{'ask'},
);
- my $schema;
+ my @schema;
my $drop;
if ( lc($table) eq 'attachments' ) {
$drop = "ALTER TABLE $table DROP COLUMN $column";
- $schema = "ALTER TABLE $table ADD COLUMN $column tsvector";
+ push @schema, "ALTER TABLE $table ADD COLUMN $column tsvector";
} else {
$drop = "DROP TABLE $table";
- $schema = "CREATE TABLE $table ( "
- ."id INTEGER NOT NULL,"
- ."$column tsvector )";
+ push @schema, split /;\n+/, <<SCHEMA;
+CREATE TABLE $table (
+ id SERIAL,
+ $column tsvector
+);
+GRANT SELECT, INSERT, UPDATE, DELETE ON $table TO "$DB{user}"
+SCHEMA
}
my $index_type = lc($OPT{'index-type'} || '');
@@ -328,11 +332,11 @@ elsif ( $DB{'type'} eq 'Pg' ) {
silent => !$OPT{'ask'},
);
}
+ push @schema, "CREATE INDEX ${column}_idx ON $table USING $index_type($column)";
do_error_is_ok( dba_handle() => $drop )
unless $OPT{'dryrun'};
- insert_schema( $schema );
- insert_schema("CREATE INDEX ${column}_idx ON $table USING $index_type($column)");
+ insert_schema( $_ ) for @schema;
print_rt_config( Table => $table, Column => $column );
}
commit 92d27929b5a2fad5374017896b7a8e6de0905595
Author: Alex Vandiver <alexmv at bestpractical.com>
Date: Tue Nov 11 00:35:53 2014 -0500
Insert data to index before creating the index
Having to update the index after every insertion is a notable
performance hit; it is much more performant to create the index after
all data has been inserted.
Move the initial run of rt-fulltext-indexer to after the requisite
tables have been set up, but prior to creating the index and displaying
the required RT configuration. This requires serializing the intended
%FullTextSearch configuration between rt-setup-fulltext-index and
rt-fulltext-indexer, as the RT_SiteConfig file will not have been
updated until after both have run; thus, serialize the configuration via
an environment variable.
The documentation is updated to note that running
rt-setup-fulltext-index is now a time-intensive operation.
diff --git a/docs/full_text_indexing.pod b/docs/full_text_indexing.pod
index 180122b..24169cb 100644
--- a/docs/full_text_indexing.pod
+++ b/docs/full_text_indexing.pod
@@ -28,16 +28,14 @@ you may need to pass the C<--dba> or C<--dba-password> options:
/opt/rt4/sbin/rt-setup-fulltext-index --dba postgres --dba-password secret
-This will also output an appropriate C<%FullTextSearch> configuration to
-add to your F<RT_SiteConfig.pm>; you will need to restart your webserver
-after making these changes. However, the index will also need to be
-filled before it can be used. To update the index initially, run:
+This will then tokenize and index all existing attachments in your
+database; it may take quite a while if your database already has a large
+number of tickets in it.
- /opt/rt4/sbin/rt-fulltext-indexer --all
+Finally, it will output an appropriate C<%FullTextSearch> configuration
+to add to your F<RT_SiteConfig.pm>; you will need to restart your
+webserver after making these changes.
-This will tokenize and index all existing attachments in your database;
-it may take quite a while if your database already has a large number of
-tickets in it.
=head2 Updating the index
@@ -77,16 +75,14 @@ you may need to pass the C<--dba> or C<--dba-password> options:
/opt/rt4/sbin/rt-setup-fulltext-index --dba root --dba-password secret
-This will also output an appropriate C<%FullTextSearch> configuration to
-add to your F<RT_SiteConfig.pm>; you will need to restart your webserver
-after making these changes. However, the index will also need to be
-filled before it can be used. To update the index initially, run:
+This will then tokenize and index all existing attachments in your
+database; it may take quite a while if your database already has a large
+number of tickets in it.
- /opt/rt4/sbin/rt-fulltext-indexer --all
+Finally, it will output an appropriate C<%FullTextSearch> configuration
+to add to your F<RT_SiteConfig.pm>; you will need to restart your
+webserver after making these changes.
-This will tokenize and index all existing attachments in your database;
-it may take quite a while if your database already has a large number of
-tickets in it.
=head3 Updating the index
diff --git a/sbin/rt-fulltext-indexer.in b/sbin/rt-fulltext-indexer.in
index e79ebd8..2c256bd 100644
--- a/sbin/rt-fulltext-indexer.in
+++ b/sbin/rt-fulltext-indexer.in
@@ -99,7 +99,8 @@ if ( !flock main::DATA, LOCK_EX | LOCK_NB ) {
}
my $db_type = RT->Config->Get('DatabaseType');
-my $fts_config = RT->Config->Get('FullTextSearch') || {};
+my $fts_config = $ENV{RT_FTS_CONFIG} ? JSON::from_json($ENV{RT_FTS_CONFIG})
+ : RT->Config->Get('FullTextSearch') || {};
unless ( $fts_config->{'Enable'} ) {
print STDERR <<EOT;
diff --git a/sbin/rt-setup-fulltext-index.in b/sbin/rt-setup-fulltext-index.in
index 2c741d7..2c024d7 100644
--- a/sbin/rt-setup-fulltext-index.in
+++ b/sbin/rt-setup-fulltext-index.in
@@ -121,6 +121,7 @@ GetOptions(
'dba=s' => \$DB{'admin'},
'dba-password=s' => \$DB{'admin_password'},
+ 'limit=i' => \$DB{'batch-size'},
) or show_help();
if ( $OPT{'help'} || (!$DB{'admin'} && $DB{'type'} eq 'Oracle' ) ) {
@@ -175,9 +176,13 @@ if ( $DB{'type'} eq 'mysql' ) {
my $engine = $RT::Handle->dbh->{mysql_serverversion} < 50600 ? "MyISAM" : "InnoDB";
my $schema = "CREATE TABLE $table ( "
."id INT UNSIGNED AUTO_INCREMENT NOT NULL PRIMARY KEY,"
- ."Content LONGTEXT, FULLTEXT(Content) ) ENGINE=$engine CHARACTER SET utf8";
+ ."Content LONGTEXT ) ENGINE=$engine CHARACTER SET utf8";
insert_schema( $schema );
+ insert_data( Table => $table, Engine => $engine );
+
+ insert_schema( "CREATE FULLTEXT INDEX $table ON $table(Content)" );
+
print_rt_config( Table => $table );
} elsif ($DB{'type'} eq 'sphinx') {
check_sphinx();
@@ -332,12 +337,15 @@ SCHEMA
silent => !$OPT{'ask'},
);
}
- push @schema, "CREATE INDEX ${column}_idx ON $table USING $index_type($column)";
do_error_is_ok( dba_handle() => $drop )
unless $OPT{'dryrun'};
insert_schema( $_ ) for @schema;
+ insert_data( Table => $table, Column => $column );
+
+ insert_schema( "CREATE INDEX ${column}_idx ON $table USING $index_type($column)" );
+
print_rt_config( Table => $table, Column => $column );
}
elsif ( $DB{'type'} eq 'Oracle' ) {
@@ -729,6 +737,16 @@ sub insert_schema {
}
}
+sub insert_data {
+ return if $OPT{dryrun};
+
+ print "Indexing existing data...\n";
+
+ $ENV{RT_FTS_CONFIG} = JSON::to_json( {Enable => 1, Indexed => 1, @_});
+ system( "$RT::SbinPath/rt-fulltext-indexer", "--all",
+ ($DB{'batch-size'} ? ("--limit", $DB{'batch-size'}) : ()));
+}
+
=head1 NAME
rt-setup-fulltext-index - Create indexes for full text search
commit 04199af31a8bd2f62e4c7265ec653c3c0f7bca03
Author: Alex Vandiver <alexmv at bestpractical.com>
Date: Wed Nov 26 15:18:41 2014 -0500
Switch the default Postgres index to GIN
GIN indexes, while slower to index, provide a 10x speedup in query time;
they are suggested for static data, which the Attachments table
certainly is. The other improvements in indexing are sufficient to
still net a 6.5x improvement in indexing speed over the speed of the
prior GiST default. In addition, indexing time, which mostly impacts
only initial deployment, should take second priority behind query time,
which GIN improves notably.
diff --git a/sbin/rt-setup-fulltext-index.in b/sbin/rt-setup-fulltext-index.in
index 2c024d7..7ea7f80 100644
--- a/sbin/rt-setup-fulltext-index.in
+++ b/sbin/rt-setup-fulltext-index.in
@@ -331,9 +331,9 @@ SCHEMA
my $index_type = lc($OPT{'index-type'} || '');
while ( $index_type ne 'gist' and $index_type ne 'gin' ) {
$index_type = lc prompt(
- message => "You may choose between GiST or GIN indexes; the former is several times\n"
- . "slower to search, but takes less space on disk and is faster to update.",
- default => 'GiST',
+ message => "You may choose between GiST or GIN indexes; the GiST takes less space on\n"
+ . "disk and is faster to update, but is an order of magnitude slower to query.",
+ default => 'GIN',
silent => !$OPT{'ask'},
);
}
commit ed1131b1ecc5fdbefa8dd17483efd89019de150b
Author: Alex Vandiver <alexmv at bestpractical.com>
Date: Wed Nov 26 15:29:06 2014 -0500
Default to storing the tsvector in a new table, to speed indexing
The ability to perform bulk INSERT, rather than individual UPDATE
statements, increases indexing speeds by 1.5x. While this requires an
additional JOIN at query time, testing shows that this comes at no
notable cost of execution time -- in fact, queries with the tsvector in
a separate table perform 5-10% _faster_ than those with it in the
Attachments table, perhaps because it requires scans of a less
heavyweight table.
diff --git a/sbin/rt-setup-fulltext-index.in b/sbin/rt-setup-fulltext-index.in
index 7ea7f80..c21a1d0 100644
--- a/sbin/rt-setup-fulltext-index.in
+++ b/sbin/rt-setup-fulltext-index.in
@@ -91,7 +91,7 @@ my %OPT = (
my %DEFAULT;
if ( $DB{'type'} eq 'Pg' ) {
%DEFAULT = (
- table => 'Attachments',
+ table => 'AttachmentsIndex',
column => 'ContentIndex',
);
}
@@ -302,7 +302,7 @@ elsif ( $DB{'type'} eq 'Pg' ) {
my $table = $OPT{'table'} || prompt(
message => "Enter the name of a DB table that will be used to store the Pg tsvector.\n"
. "You may either use the existing Attachments table, or create a new\n"
- . "table.",
+ . "table. Creating a new table makes initial indexing faster.",
default => $DEFAULT{'table'},
silent => !$OPT{'ask'},
);
diff --git a/t/fts/indexed_pg.t b/t/fts/indexed_pg.t
index b8d4b1b..b80cd12 100644
--- a/t/fts/indexed_pg.t
+++ b/t/fts/indexed_pg.t
@@ -11,7 +11,7 @@ plan skip_all => "Need Pg 8.2 or higher; we have $major.$minor"
plan tests => 36;
-RT->Config->Set( FullTextSearch => Enable => 1, Indexed => 1, Column => 'ContentIndex', Table => 'Attachments' );
+RT->Config->Set( FullTextSearch => Enable => 1, Indexed => 1, Column => 'ContentIndex', Table => 'AttachmentsIndex' );
setup_indexing();
-----------------------------------------------------------------------
More information about the rt-commit
mailing list