[Rt-commit] rt branch, 4.2/fts-refactor-performance, repushed

Wed Mar 4 18:04:17 EST 2015

The branch 4.2/fts-refactor-performance was deleted and repushed:
       was b36617027c1d88a612043f904db9e24ac3faae25
       now 8ae077a128355971deff91e9eb1cc3367df28ec6

 1:  a7968ec < --:  ------- Add full path to one rt-fulltext-indexer that lacks it
 2:  84747c7 < --:  ------- Add additional clarification points about Sphinx on MySQL
 3:  ac688e2 < --:  ------- Drop sphinx xmlpipe2 output, which was unusable and undocumented
 4:  d5f256f < --:  ------- Drop finalize and clean functions, which are now unused
 5:  02a8588 < --:  ------- Rename Sphinx FTS search tests to "sphinx", not "mysql"
 6:  84066c4 < --:  ------- Support native FTS on MySQL 5.6 and above
 7:  77641fc < --:  ------- Using a separate MyISAM table, we can also support FTS on MySQL < 5.6
 8:  355b60c !  1:  44600fc Inline extract_text and extract_html
    @@ -1,14 +1,22 @@
     Author: Alex Vandiver <alexmv at bestpractical.com>
     
    -    extract_text and extract_html are identical; inline them
    +    Inline extract_text and extract_html
         
    -    The original premise may have been that HTML->text conversion would be
    -    fone on the HTML before indexing.  While this is still an option for the
    -    future, there is currently no reason to provide to identical methods.
    +    They consist of mostly-identical code, differing only in the
    +    decode_entities.  Inline this difference, rather than budying it under
    +    two levels of method call and goto.
     
     diff --git a/sbin/rt-fulltext-indexer.in b/sbin/rt-fulltext-indexer.in
     --- a/sbin/rt-fulltext-indexer.in
     +++ b/sbin/rt-fulltext-indexer.in
    +@@
    +     RT::Init();
    + };
    + use RT::Interface::CLI ();
    ++use HTML::Entities;
    + 
    + my %OPT = (
    +     help        => 0,
     @@
          while ( my $a = $attachments->Next ) {
              next if filter( $type, $a );
    @@ -16,6 +24,9 @@
     -        my $txt = extract($type, $a) or next;
     +        my $text = $a->Content;
     +        next unless defined $text && length($text);
    ++        # The rich text editor generates html entities for characters
    ++        # but Pg doesn't index them; decode to something it can index.
    ++        HTML::Entities::decode_entities($text) if $a->ContentType eq "text/html";
              $found++;
     -        process( $type, $a, $txt );
     +        process( $type, $a, \$text );
    @@ -60,7 +71,10 @@
     -    my $attachment = shift;
     -    my $text = $attachment->Content;
     -    return undef unless defined $text && length($text);
    --# TODO: html -> text
    +-# the rich text editor generates html entities for characters
    +-# but Pg doesn't index them, so decode to something it can index.
    +-    require HTML::Entities;
    +-    HTML::Entities::decode_entities($text);
     -    return \$text;
     -}
     -
 9:  aebf2c1 =  2:  391b307 Inline the differences between text/plain and text/html attachment lists
10:  2b3d5df =  3:  b891a82 Stop skipping indexing of text/html within multipart/alternative
11:  262ab27 !  4:  3e9924f Use the new, shorter, initialization form
    @@ -16,6 +16,6 @@
     -};
     +use RT -init;
      use RT::Interface::CLI ();
    + use HTML::Entities;
      
    - my %OPT = (
     
12:  864f449 !  5:  1b1ed1b Simplify and condense option parsing
    @@ -11,8 +11,8 @@
     --- a/sbin/rt-fulltext-indexer.in
     +++ b/sbin/rt-fulltext-indexer.in
     @@
    - use RT -init;
      use RT::Interface::CLI ();
    + use HTML::Entities;
      
     -my %OPT = (
     -    help        => 0,
13:  05b3246 =  6:  f8c465b Documentation has moved out; update --help accordingly
14:  81fa26d =  7:  a0d966f Remove AUTHOR section; it is unnecessary in core sbin files
15:  06089a2 =  8:  425c689 Skipping ACL checks yields a sizable performance increase
16:  e722688 !  9:  65f2fda Index attachments in one pass through the database, not two
    @@ -28,8 +28,8 @@
          $attachments->OrderBy( FIELD => 'id', ORDER => 'asc' );
          $attachments->RowsPerPage( $OPT{'limit'} || 100 );
     @@
    -         my $text = $a->Content;
    -         next unless defined $text && length($text);
    +         # but Pg doesn't index them; decode to something it can index.
    +         HTML::Entities::decode_entities($text) if $a->ContentType eq "text/html";
              $found++;
     -        process( $type, $a, \$text );
     +        process( $a, \$text );
17:  eba441b = 10:  568144a Index attachments even on deleted tickets
18:  bcd4e6b = 11:  236526b mysql and pg share the same last_indexed; unify the method
19:  41420bf ! 12:  beea1fe Replace the last use of goto_specific with explicit function calls
    @@ -6,8 +6,8 @@
     --- a/sbin/rt-fulltext-indexer.in
     +++ b/sbin/rt-fulltext-indexer.in
     @@
    -         my $text = $a->Content;
    -         next unless defined $text && length($text);
    +         # but Pg doesn't index them; decode to something it can index.
    +         HTML::Entities::decode_entities($text) if $a->ContentType eq "text/html";
              $found++;
     -        process( $a, \$text );
     +        if ($db_type eq 'mysql') {
20:  d23df4e = 13:  636887b Simplify last_indexed
21:  aa882bc = 14:  21bd8e7 Only call last_indexed once, as it may be heavy
22:  660f54f ! 15:  3c6a200 Index even empty attachments
    @@ -18,9 +18,8 @@
          while ( my $a = $attachments->Next ) {
              debug("Found attachment #". $a->id );
     -        my $text = $a->Content;
    --        next unless defined $text && length($text);
     +        my $text = $a->Content // "";
    -         $found++;
    -         if ($db_type eq 'mysql') {
    -             process_mysql( $a, \$text );
    +         next unless defined $text && length($text);
    +         # The rich text editor generates html entities for characters
    +         # but Pg doesn't index them; decode to something it can index.
     
23:  c5be1d3 = 16:  881b6f8 As last_indexed is based on the highest insert, there will never be an UPDATE needed
24:  08610ec ! 17:  4a3bccf Inversion of control of main indexing loops
    @@ -4,7 +4,7 @@
         
         Rather than having database-dependent if-statements, followed by a
         standard loop which contains further database-dependent if-statements,
    -    instead turn the loop into a function which cane be called from one of
    +    instead turn the loop into a function which can be called from one of
         two database-specific functions.  This is important because MySQL's
         iteration will further diverge from PostgreSQL's in the following
         commits.
    @@ -47,6 +47,10 @@
     -    while ( my $a = $attachments->Next ) {
     -        debug("Found attachment #". $a->id );
     -        my $text = $a->Content // "";
    +-        next unless defined $text && length($text);
    +-        # The rich text editor generates html entities for characters
    +-        # but Pg doesn't index them; decode to something it can index.
    +-        HTML::Entities::decode_entities($text) if $a->ContentType eq "text/html";
     -        $found++;
     -        if ($db_type eq 'mysql') {
     -            process_mysql( $a, \$text );
    @@ -115,7 +119,9 @@
     +        my ($attachments) = @_;
     +        while ( my $a = $attachments->Next ) {
     +            debug("Found attachment #". $a->id );
    -+            $dbh->do( $query, undef, ($a->Content // ""), $a->id );
    ++            my $text = $a->Content // "";
    ++            HTML::Entities::decode_entities($text) if $a->ContentType eq "text/html";
    ++            $dbh->do( $query, undef, $text, $a->id );
     +        }
     +    });
      }
    @@ -152,7 +158,11 @@
     +        my ($attachments) = @_;
     +        while ( my $a = $attachments->Next ) {
     +            debug("Found attachment #". $a->id );
    -+            my $status = eval { $dbh->do( $query, undef, ($a->Content // ""), $a->id ) };
    ++
    ++            my $text = $a->Content // "";
    ++            HTML::Entities::decode_entities($text) if $a->ContentType eq "text/html";
    ++
    ++            my $status = eval { $dbh->do( $query, undef, $text, $a->id ) };
     +            unless ( $status ) {
     +                if ( $dbh->err == 7  && $dbh->state eq '54000' ) {
     +                    warn "Attachment @{[$a->id]} cannot be indexed. Most probably it contains too many unique words. Error: ". $dbh->errstr;
25:  8382220 ! 18:  fb0c0f9 Switch to preparing statements, rather than just setting strings
    @@ -17,10 +17,12 @@
      
          attachment_loop( sub {
              my ($attachments) = @_;
    -         while ( my $a = $attachments->Next ) {
    +@@
                  debug("Found attachment #". $a->id );
    --            $dbh->do( $query, undef, ($a->Content // ""), $a->id );
    -+            $sth->execute( ($a->Content // ""), $a->id );
    +             my $text = $a->Content // "";
    +             HTML::Entities::decode_entities($text) if $a->ContentType eq "text/html";
    +-            $dbh->do( $query, undef, $text, $a->id );
    ++            $sth->execute( $text, $a->id );
              }
          });
      }
    @@ -39,11 +41,12 @@
          }
      
          attachment_loop( sub {
    -         my ($attachments) = @_;
    -         while ( my $a = $attachments->Next ) {
    -             debug("Found attachment #". $a->id );
    --            my $status = eval { $dbh->do( $query, undef, ($a->Content // ""), $a->id ) };
    -+            my $status = eval { $sth->execute( ($a->Content // ""), $a->id ) };
    +@@
    +             my $text = $a->Content // "";
    +             HTML::Entities::decode_entities($text) if $a->ContentType eq "text/html";
    + 
    +-            my $status = eval { $dbh->do( $query, undef, $text, $a->id ) };
    ++            my $status = eval { $sth->execute( $text, $a->id ) };
                  unless ( $status ) {
                      if ( $dbh->err == 7  && $dbh->state eq '54000' ) {
                          warn "Attachment @{[$a->id]} cannot be indexed. Most probably it contains too many unique words. Error: ". $dbh->errstr;
26:  1835d95 = 19:  55cbef6 INSERT DELAYED provides notable speed benefits on MyISAM
27:  8947435 ! 20:  4964a03 Improve MySQL insert speed by batching inserts into one statement
    @@ -33,8 +33,10 @@
     +        my $found = 0;
              while ( my $a = $attachments->Next ) {
                  debug("Found attachment #". $a->id );
    --            $sth->execute( ($a->Content // ""), $a->id );
    -+            push @insert, ($a->Content // ""), $a->id;
    +             my $text = $a->Content // "";
    +             HTML::Entities::decode_entities($text) if $a->ContentType eq "text/html";
    +-            $sth->execute( $text, $a->id );
    ++            push @insert, $text, $a->id;
     +            $found++;
              }
     +        return unless $found;
28:  a977c6d = 21:  5f63f98 Testing finds 200 is a good default batch size
29:  d26a640 = 22:  a4375cc Indexing of "text" may fail for content with invalid UTF-8 byte sequences
30:  040708f = 23:  517978c Refactor PostgreSQL's insert to also do bulk insertion
31:  d9a4fb1 ! 24:  22a0990 Perform PostgreSQL UPDATE statements inside of a database transaction
    @@ -25,7 +25,11 @@
     +        my @insert;
              while ( my $a = $attachments->Next ) {
                  debug("Found attachment #". $a->id );
    --            my $status = eval { $sth->execute( ($a->Content // ""), $a->id ) };
    + 
    +             my $text = $a->Content // "";
    +             HTML::Entities::decode_entities($text) if $a->ContentType eq "text/html";
    + 
    +-            my $status = eval { $sth->execute( $text, $a->id ) };
     -            unless ( $status ) {
     -                if ( $dbh->err == 7  && $dbh->state eq '54000' ) {
     -                    warn "Attachment @{[$a->id]} cannot be indexed. Most probably it contains too many unique words. Error: ". $dbh->errstr;
    @@ -39,7 +43,7 @@
     -                # for purposes of knowing where to pick up
     -                eval { $sth->execute( "", $a->id ) }
     -                    or die "Failed to insert empty row: " . $dbh->errstr;
    -+            push @insert, [($a->Content // ""), $a->id];
    ++            push @insert, [$text, $a->id];
     +        }
     +
     +        # Try in one database transaction; if it fails, we roll it back
32:  8b2b442 = 25:  3de172d If a new table is used for indexing, grant rights on it
33:  52c0a3f = 26:  1cf38b3 Insert data to index before creating the index
34:  de64b1c = 27:  a3f9f23 Switch the default Postgres index to GIN
35:  b366170 ! 28:  8ae077a Default to storing the tsvector in a new table, to speed indexing
    @@ -36,8 +36,8 @@
     --- a/t/fts/indexed_pg.t
     +++ b/t/fts/indexed_pg.t
     @@
    - 
    - plan tests => 36;
    + plan skip_all => "Need Pg 8.2 or higher; we have $major.$minor"
    +     if "$major.$minor" < 8.2;
      
     -RT->Config->Set( FullTextSearch => Enable => 1, Indexed => 1, Column => 'ContentIndex', Table => 'Attachments' );
     +RT->Config->Set( FullTextSearch => Enable => 1, Indexed => 1, Column => 'ContentIndex', Table => 'AttachmentsIndex' );