[Rt-commit] rt branch, 4.2/fts-refactor-performance, repushed
Alex Vandiver
alexmv at bestpractical.com
Wed Mar 4 18:04:17 EST 2015
The branch 4.2/fts-refactor-performance was deleted and repushed:
was b36617027c1d88a612043f904db9e24ac3faae25
now 8ae077a128355971deff91e9eb1cc3367df28ec6
1: a7968ec < --: ------- Add full path to one rt-fulltext-indexer that lacks it
2: 84747c7 < --: ------- Add additional clarification points about Sphinx on MySQL
3: ac688e2 < --: ------- Drop sphinx xmlpipe2 output, which was unusable and undocumented
4: d5f256f < --: ------- Drop finalize and clean functions, which are now unused
5: 02a8588 < --: ------- Rename Sphinx FTS search tests to "sphinx", not "mysql"
6: 84066c4 < --: ------- Support native FTS on MySQL 5.6 and above
7: 77641fc < --: ------- Using a separate MyISAM table, we can also support FTS on MySQL < 5.6
8: 355b60c ! 1: 44600fc Inline extract_text and extract_html
@@ -1,14 +1,22 @@
Author: Alex Vandiver <alexmv at bestpractical.com>
- extract_text and extract_html are identical; inline them
+ Inline extract_text and extract_html
- The original premise may have been that HTML->text conversion would be
- fone on the HTML before indexing. While this is still an option for the
- future, there is currently no reason to provide to identical methods.
+ They consist of mostly-identical code, differing only in the
+ decode_entities. Inline this difference, rather than budying it under
+ two levels of method call and goto.
diff --git a/sbin/rt-fulltext-indexer.in b/sbin/rt-fulltext-indexer.in
--- a/sbin/rt-fulltext-indexer.in
+++ b/sbin/rt-fulltext-indexer.in
+@@
+ RT::Init();
+ };
+ use RT::Interface::CLI ();
++use HTML::Entities;
+
+ my %OPT = (
+ help => 0,
@@
while ( my $a = $attachments->Next ) {
next if filter( $type, $a );
@@ -16,6 +24,9 @@
- my $txt = extract($type, $a) or next;
+ my $text = $a->Content;
+ next unless defined $text && length($text);
++ # The rich text editor generates html entities for characters
++ # but Pg doesn't index them; decode to something it can index.
++ HTML::Entities::decode_entities($text) if $a->ContentType eq "text/html";
$found++;
- process( $type, $a, $txt );
+ process( $type, $a, \$text );
@@ -60,7 +71,10 @@
- my $attachment = shift;
- my $text = $attachment->Content;
- return undef unless defined $text && length($text);
--# TODO: html -> text
+-# the rich text editor generates html entities for characters
+-# but Pg doesn't index them, so decode to something it can index.
+- require HTML::Entities;
+- HTML::Entities::decode_entities($text);
- return \$text;
-}
-
9: aebf2c1 = 2: 391b307 Inline the differences between text/plain and text/html attachment lists
10: 2b3d5df = 3: b891a82 Stop skipping indexing of text/html within multipart/alternative
11: 262ab27 ! 4: 3e9924f Use the new, shorter, initialization form
@@ -16,6 +16,6 @@
-};
+use RT -init;
use RT::Interface::CLI ();
+ use HTML::Entities;
- my %OPT = (
12: 864f449 ! 5: 1b1ed1b Simplify and condense option parsing
@@ -11,8 +11,8 @@
--- a/sbin/rt-fulltext-indexer.in
+++ b/sbin/rt-fulltext-indexer.in
@@
- use RT -init;
use RT::Interface::CLI ();
+ use HTML::Entities;
-my %OPT = (
- help => 0,
13: 05b3246 = 6: f8c465b Documentation has moved out; update --help accordingly
14: 81fa26d = 7: a0d966f Remove AUTHOR section; it is unnecessary in core sbin files
15: 06089a2 = 8: 425c689 Skipping ACL checks yields a sizable performance increase
16: e722688 ! 9: 65f2fda Index attachments in one pass through the database, not two
@@ -28,8 +28,8 @@
$attachments->OrderBy( FIELD => 'id', ORDER => 'asc' );
$attachments->RowsPerPage( $OPT{'limit'} || 100 );
@@
- my $text = $a->Content;
- next unless defined $text && length($text);
+ # but Pg doesn't index them; decode to something it can index.
+ HTML::Entities::decode_entities($text) if $a->ContentType eq "text/html";
$found++;
- process( $type, $a, \$text );
+ process( $a, \$text );
17: eba441b = 10: 568144a Index attachments even on deleted tickets
18: bcd4e6b = 11: 236526b mysql and pg share the same last_indexed; unify the method
19: 41420bf ! 12: beea1fe Replace the last use of goto_specific with explicit function calls
@@ -6,8 +6,8 @@
--- a/sbin/rt-fulltext-indexer.in
+++ b/sbin/rt-fulltext-indexer.in
@@
- my $text = $a->Content;
- next unless defined $text && length($text);
+ # but Pg doesn't index them; decode to something it can index.
+ HTML::Entities::decode_entities($text) if $a->ContentType eq "text/html";
$found++;
- process( $a, \$text );
+ if ($db_type eq 'mysql') {
20: d23df4e = 13: 636887b Simplify last_indexed
21: aa882bc = 14: 21bd8e7 Only call last_indexed once, as it may be heavy
22: 660f54f ! 15: 3c6a200 Index even empty attachments
@@ -18,9 +18,8 @@
while ( my $a = $attachments->Next ) {
debug("Found attachment #". $a->id );
- my $text = $a->Content;
-- next unless defined $text && length($text);
+ my $text = $a->Content // "";
- $found++;
- if ($db_type eq 'mysql') {
- process_mysql( $a, \$text );
+ next unless defined $text && length($text);
+ # The rich text editor generates html entities for characters
+ # but Pg doesn't index them; decode to something it can index.
23: c5be1d3 = 16: 881b6f8 As last_indexed is based on the highest insert, there will never be an UPDATE needed
24: 08610ec ! 17: 4a3bccf Inversion of control of main indexing loops
@@ -4,7 +4,7 @@
Rather than having database-dependent if-statements, followed by a
standard loop which contains further database-dependent if-statements,
- instead turn the loop into a function which cane be called from one of
+ instead turn the loop into a function which can be called from one of
two database-specific functions. This is important because MySQL's
iteration will further diverge from PostgreSQL's in the following
commits.
@@ -47,6 +47,10 @@
- while ( my $a = $attachments->Next ) {
- debug("Found attachment #". $a->id );
- my $text = $a->Content // "";
+- next unless defined $text && length($text);
+- # The rich text editor generates html entities for characters
+- # but Pg doesn't index them; decode to something it can index.
+- HTML::Entities::decode_entities($text) if $a->ContentType eq "text/html";
- $found++;
- if ($db_type eq 'mysql') {
- process_mysql( $a, \$text );
@@ -115,7 +119,9 @@
+ my ($attachments) = @_;
+ while ( my $a = $attachments->Next ) {
+ debug("Found attachment #". $a->id );
-+ $dbh->do( $query, undef, ($a->Content // ""), $a->id );
++ my $text = $a->Content // "";
++ HTML::Entities::decode_entities($text) if $a->ContentType eq "text/html";
++ $dbh->do( $query, undef, $text, $a->id );
+ }
+ });
}
@@ -152,7 +158,11 @@
+ my ($attachments) = @_;
+ while ( my $a = $attachments->Next ) {
+ debug("Found attachment #". $a->id );
-+ my $status = eval { $dbh->do( $query, undef, ($a->Content // ""), $a->id ) };
++
++ my $text = $a->Content // "";
++ HTML::Entities::decode_entities($text) if $a->ContentType eq "text/html";
++
++ my $status = eval { $dbh->do( $query, undef, $text, $a->id ) };
+ unless ( $status ) {
+ if ( $dbh->err == 7 && $dbh->state eq '54000' ) {
+ warn "Attachment @{[$a->id]} cannot be indexed. Most probably it contains too many unique words. Error: ". $dbh->errstr;
25: 8382220 ! 18: fb0c0f9 Switch to preparing statements, rather than just setting strings
@@ -17,10 +17,12 @@
attachment_loop( sub {
my ($attachments) = @_;
- while ( my $a = $attachments->Next ) {
+@@
debug("Found attachment #". $a->id );
-- $dbh->do( $query, undef, ($a->Content // ""), $a->id );
-+ $sth->execute( ($a->Content // ""), $a->id );
+ my $text = $a->Content // "";
+ HTML::Entities::decode_entities($text) if $a->ContentType eq "text/html";
+- $dbh->do( $query, undef, $text, $a->id );
++ $sth->execute( $text, $a->id );
}
});
}
@@ -39,11 +41,12 @@
}
attachment_loop( sub {
- my ($attachments) = @_;
- while ( my $a = $attachments->Next ) {
- debug("Found attachment #". $a->id );
-- my $status = eval { $dbh->do( $query, undef, ($a->Content // ""), $a->id ) };
-+ my $status = eval { $sth->execute( ($a->Content // ""), $a->id ) };
+@@
+ my $text = $a->Content // "";
+ HTML::Entities::decode_entities($text) if $a->ContentType eq "text/html";
+
+- my $status = eval { $dbh->do( $query, undef, $text, $a->id ) };
++ my $status = eval { $sth->execute( $text, $a->id ) };
unless ( $status ) {
if ( $dbh->err == 7 && $dbh->state eq '54000' ) {
warn "Attachment @{[$a->id]} cannot be indexed. Most probably it contains too many unique words. Error: ". $dbh->errstr;
26: 1835d95 = 19: 55cbef6 INSERT DELAYED provides notable speed benefits on MyISAM
27: 8947435 ! 20: 4964a03 Improve MySQL insert speed by batching inserts into one statement
@@ -33,8 +33,10 @@
+ my $found = 0;
while ( my $a = $attachments->Next ) {
debug("Found attachment #". $a->id );
-- $sth->execute( ($a->Content // ""), $a->id );
-+ push @insert, ($a->Content // ""), $a->id;
+ my $text = $a->Content // "";
+ HTML::Entities::decode_entities($text) if $a->ContentType eq "text/html";
+- $sth->execute( $text, $a->id );
++ push @insert, $text, $a->id;
+ $found++;
}
+ return unless $found;
28: a977c6d = 21: 5f63f98 Testing finds 200 is a good default batch size
29: d26a640 = 22: a4375cc Indexing of "text" may fail for content with invalid UTF-8 byte sequences
30: 040708f = 23: 517978c Refactor PostgreSQL's insert to also do bulk insertion
31: d9a4fb1 ! 24: 22a0990 Perform PostgreSQL UPDATE statements inside of a database transaction
@@ -25,7 +25,11 @@
+ my @insert;
while ( my $a = $attachments->Next ) {
debug("Found attachment #". $a->id );
-- my $status = eval { $sth->execute( ($a->Content // ""), $a->id ) };
+
+ my $text = $a->Content // "";
+ HTML::Entities::decode_entities($text) if $a->ContentType eq "text/html";
+
+- my $status = eval { $sth->execute( $text, $a->id ) };
- unless ( $status ) {
- if ( $dbh->err == 7 && $dbh->state eq '54000' ) {
- warn "Attachment @{[$a->id]} cannot be indexed. Most probably it contains too many unique words. Error: ". $dbh->errstr;
@@ -39,7 +43,7 @@
- # for purposes of knowing where to pick up
- eval { $sth->execute( "", $a->id ) }
- or die "Failed to insert empty row: " . $dbh->errstr;
-+ push @insert, [($a->Content // ""), $a->id];
++ push @insert, [$text, $a->id];
+ }
+
+ # Try in one database transaction; if it fails, we roll it back
32: 8b2b442 = 25: 3de172d If a new table is used for indexing, grant rights on it
33: 52c0a3f = 26: 1cf38b3 Insert data to index before creating the index
34: de64b1c = 27: a3f9f23 Switch the default Postgres index to GIN
35: b366170 ! 28: 8ae077a Default to storing the tsvector in a new table, to speed indexing
@@ -36,8 +36,8 @@
--- a/t/fts/indexed_pg.t
+++ b/t/fts/indexed_pg.t
@@
-
- plan tests => 36;
+ plan skip_all => "Need Pg 8.2 or higher; we have $major.$minor"
+ if "$major.$minor" < 8.2;
-RT->Config->Set( FullTextSearch => Enable => 1, Indexed => 1, Column => 'ContentIndex', Table => 'Attachments' );
+RT->Config->Set( FullTextSearch => Enable => 1, Indexed => 1, Column => 'ContentIndex', Table => 'AttachmentsIndex' );
More information about the rt-commit
mailing list