[Rt-commit] rt branch 5.0/speed-up-importer created. rt-5.0.3-127-gc835c69bca

BPS Git Server git at git.bestpractical.com
Thu Oct 13 21:55:24 UTC 2022


This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "rt".

The branch, 5.0/speed-up-importer has been created
        at  c835c69bca5a41ea7479e7f74c91bbf87ac176ad (commit)

- Log -----------------------------------------------------------------
commit c835c69bca5a41ea7479e7f74c91bbf87ac176ad
Author: sunnavy <sunnavy at bestpractical.com>
Date:   Fri Oct 14 05:02:16 2022 +0800

    Reduce unnecessary Load calls after creation for performance
    
    Now we only load objects back when needed, e.g. to run PostInflate.
    This can make the importer 30% faster.
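
Since Create() may now return without reloading the row, callers that need
real column values afterwards have to reload the object themselves, as the
ReadStream hunk below does. A minimal caller-side sketch, assuming we are
inside the importer (so $self is the RT::Migrate::Importer object) and that
Lookup() hands back the [ class, id ] pair recorded by Resolve():

    # Reload a previously created object on demand instead of eagerly:
    # Create() already recorded $uid => ( $class, $id ) via Resolve().
    my ( $class, $id ) = @{ $self->Lookup($uid) };
    my $obj = $class->new( RT->SystemUser );
    $obj->Load($id);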

diff --git a/lib/RT/Migrate/Importer.pm b/lib/RT/Migrate/Importer.pm
index 56c09732f8..e844f91db1 100644
--- a/lib/RT/Migrate/Importer.pm
+++ b/lib/RT/Migrate/Importer.pm
@@ -413,10 +413,14 @@ sub Create {
     $self->{ObjectCount}{$class}++;
     $self->Resolve( $uid => $class, $id );
 
-    # Load it back to get real values into the columns
-    $obj = $class->new( RT->SystemUser );
-    $obj->Load( $id );
-    $obj->PostInflate( $self, $uid );
+    # Attribute, Article and SystemUser have actions in PostInflate. CustomField is for NewCFs.
+    if ( $class =~ /^RT::(Attribute|Article|CustomField)$/ || ( $class eq 'RT::User' && $data->{Name} eq 'RT_System' ) )
+    {
+        # Load it back to get real values into the columns
+        $obj = $class->new( RT->SystemUser );
+        $obj->Load( $id );
+        $obj->PostInflate( $self, $uid );
+    }
 
     return $obj;
 }
@@ -487,6 +491,7 @@ sub ReadStream {
                   ? $origid
                   : $self->Organization . ":$origid";
 
+        $obj->Load( $self->Lookup($uid)->[1] );
         my ($id, $msg) = $obj->AddCustomFieldValue(
             Field             => $self->{OriginalId},
             Value             => $value,

commit 76c9e42c722e2beda330e4819c86f99a51dbdcef
Author: sunnavy <sunnavy at bestpractical.com>
Date:   Thu Oct 13 21:34:01 2022 +0800

    Add batch mode to importer for performance
    
    This only works with cloned serialized data, which doesn't need to
    resolve new ids.
    
    This can make the importer 10x faster (for cloned data).
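
A hedged sketch of how the new option plugs in, mirroring the
sbin/rt-importer.in change further down; the batch size is illustrative and
the omitted constructor arguments are the ones already passed there:

    # Roughly the effect of: rt-importer --batch-size 10000 <serialized data>
    my $importer = RT::Migrate::Importer::File->new(
        AutoCommit  => 1,             # default
        BatchSize   => 10_000,        # 0 (the default) keeps the row-by-row path
        HandleError => sub { 0 },     # default handler
        # other arguments (DumpObjects, Resume, ...) as in sbin/rt-importer.in
    );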

diff --git a/lib/RT/Migrate/Importer.pm b/lib/RT/Migrate/Importer.pm
index 3ee478e0aa..56c09732f8 100644
--- a/lib/RT/Migrate/Importer.pm
+++ b/lib/RT/Migrate/Importer.pm
@@ -72,6 +72,7 @@ sub Init {
         HandleError         => undef,
         ExcludeOrganization => undef,
         AutoCommit          => 1,
+        BatchSize           => 0,
         @_,
     );
 
@@ -83,6 +84,7 @@ sub Init {
     $self->{Progress} = $args{Progress};
 
     $self->{AutoCommit} = $args{AutoCommit};
+    $self->{BatchSize}  = $args{BatchSize};
 
     $self->{HandleError} = sub { 0 };
     $self->{HandleError} = $args{HandleError}
@@ -321,6 +323,10 @@ sub Qualify {
     return $self->{Organization}.": $string";
 }
 
+my @batch;
+my @pids;
+my $is_parent = 1;
+
 sub Create {
     my $self = shift;
     my ($class, $uid, $data) = @_;
@@ -359,6 +365,34 @@ sub Create {
         }
     }
 
+    if ( $self->{Clone} && $self->{BatchSize} ) {
+        push @batch, [ $class, \%$data ];
+        if ( @batch >= $self->{BatchSize} ) {
+            $RT::Handle->Disconnect;
+            my $pid = fork;
+            if ( defined $pid ) {
+                if ( $pid ) {
+                    push @pids, $pid;
+                    $RT::Handle->Connect;
+                    $RT::Handle->BeginTransaction unless $self->{AutoCommit};
+                    $self->{ObjectCount}{$_->[0]}++ for @batch;
+                    @batch = ();
+                    return;
+                }
+                else {
+                    $RT::Handle->Connect;
+                    $is_parent = 0;
+                    $self->BatchCreate(@batch);
+                    exit;
+                }
+            }
+            else {
+                warn "Couldn't fork";
+            }
+        }
+        return;
+    }
+
     my ($id, $msg) = eval {
         # catch and rethrow on the outside so we can provide more info
         local $SIG{__DIE__};
@@ -477,6 +511,13 @@ sub CloseStream {
 
     $self->{Progress}->(undef, 'force') if $self->{Progress};
 
+    $self->{ObjectCount}{$_->[0]}++ for @batch;
+    $self->BatchCreate(@batch);
+
+    if ( $is_parent ) {
+        waitpid $_, 0 for @pids;
+    }
+
     # Fill CGM
     my $dbh = $RT::Handle->dbh;
 
@@ -580,4 +621,38 @@ sub Progress {
     return $self->{Progress} = $_[0];
 }
 
+sub BatchCreate {
+    my $self  = shift;
+    my @items = @_;
+    my %query;
+
+    # Do not actually insert, just get the SQL
+    local *RT::Handle::Insert = sub {
+        my $self = shift;
+        return $self->InsertQueryString(@_);
+    };
+
+    for (@items) {
+        my ( $class, $data ) = @$_;
+        my $obj = $class->new( RT->SystemUser );
+
+        my ( $sql, @bind ) = $obj->DBIx::SearchBuilder::Record::Create(%$data);
+        push @{ $query{$sql} }, \@bind;
+    }
+
+    for my $sql ( keys %query ) {
+        my $dbh   = $RT::Handle->dbh;
+        my $count = @{ $query{$sql} };
+        my $values_paren;
+        if ( $sql =~ /(\(\?.+?\))/i ) {
+            $values_paren = $1;
+        }
+
+        my $batch_sql = $sql . ( ", $values_paren" x ( $count - 1 ) );
+        # DBs have placeholder limitations (64k for Pg), so here we replace
+        # placeholders to support bigger batch sizes. The performance is similar.
+        $dbh->do( $RT::Handle->FillIn( $batch_sql, [ map @$_, @{ $query{$sql} } ] ) );
+    }
+}
+
 1;
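
As a side note, a minimal self-contained illustration of how BatchCreate
assembles its multi-row statement: the VALUES group of the first INSERT is
captured and repeated once per additional row, and FillIn later substitutes
literal values so large batches stay under per-statement placeholder limits
(64k for Pg, for example). The table and columns here are hypothetical:

    my $sql   = 'INSERT INTO MyTable (ColA, ColB) VALUES (?, ?)';
    my $count = 3;    # rows in this batch
    my ($values_paren) = $sql =~ /(\(\?.+?\))/;
    my $batch_sql = $sql . ( ", $values_paren" x ( $count - 1 ) );
    # $batch_sql is now:
    # INSERT INTO MyTable (ColA, ColB) VALUES (?, ?), (?, ?), (?, ?)
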
diff --git a/sbin/rt-importer.in b/sbin/rt-importer.in
index 219e34534e..e3870be200 100644
--- a/sbin/rt-importer.in
+++ b/sbin/rt-importer.in
@@ -102,6 +102,7 @@ GetOptions(
 
     "auto-commit!",
 
+    "batch-size=i",
     "dump=s@",
 ) or Pod::Usage::pod2usage();
 
@@ -151,6 +152,7 @@ my $import = RT::Migrate::Importer::File->new(
     DumpObjects         => $OPT{dump},
     Resume              => $OPT{resume},
     AutoCommit          => $OPT{'auto-commit'},
+    BatchSize           => $OPT{'batch-size'},
     HandleError         => $error_handler,
 );
 
@@ -293,6 +295,17 @@ Works only in conjunction with C<--list>.
 Don't auto commit to database. When this flag is used, it will commit only
 once for each data file.  This could boost performance in some cases.
 
+=item B<--batch-size> I<BATCH_SIZE>
+
+Create objects in batches in forked processes when possible. It's suggested
+to set it to a big number (5000 or 10000) to avoid creating too many child
+processes unnecessarily.
+
+It only works with cloned serialized data and doesn't have any effect on
+other data.
+
+By default, batch processing is not enabled.
+
 =back
 
 

commit 4caf00f2417b89dc9f4c8954ce9311b92380392b
Author: sunnavy <sunnavy at bestpractical.com>
Date:   Thu Oct 13 20:45:31 2022 +0800

    Fill up CachedGroupMembers at the end of importer for performance
    
    Previously we created the corresponding CachedGroupMember rows on every
    Group/GroupMember create, which was quite slow. Doing it via plain SQL
    at the end is astonishingly faster: e.g. for a CachedGroupMembers table
    with 100k rows, it can now be done in seconds (previously it took
    minutes!).
    
    This can make the importer 2x faster.
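
The cascading INSERT added to CloseStream extends membership chains by one
GroupMember hop per pass, so nested groups need repeated passes: for, say,
group A containing group B containing user U, the first pass adds the
A -> U cached row via the A -> B row, and the next pass finds nothing new.
That is why the statement runs in a loop until a pass inserts zero rows, as
in this sketch of the same loop ($dbh and $cascaded_cgm as in the diff
below):

    # DBI's do() returns the number of rows inserted, or '0E0' ("zero but
    # true") when nothing matched, hence the explicit numeric check.
    while ( my $rv = $dbh->do($cascaded_cgm) ) {
        last unless $rv > 0;    # stop once a pass adds no new rows
    }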

diff --git a/lib/RT/Group.pm b/lib/RT/Group.pm
index 7f9854a204..39f6de0a43 100644
--- a/lib/RT/Group.pm
+++ b/lib/RT/Group.pm
@@ -1752,17 +1752,6 @@ sub PreInflate {
     return 1;
 }
 
-sub PostInflate {
-    my $self = shift;
-
-    my $cgm = RT::CachedGroupMember->new($self->CurrentUser);
-    $cgm->Create(
-        Group  => $self->PrincipalObj,
-        Member => $self->PrincipalObj,
-        ImmediateParent => $self->PrincipalObj
-    );
-}
-
 # If this group represents the members of a custom role, then return
 # the RT::CustomRole object. Otherwise, return undef
 sub _CustomRoleObj {
diff --git a/lib/RT/GroupMember.pm b/lib/RT/GroupMember.pm
index 4a092c332f..2733d3451f 100644
--- a/lib/RT/GroupMember.pm
+++ b/lib/RT/GroupMember.pm
@@ -603,12 +603,6 @@ sub PreInflate {
     return 1;
 }
 
-sub PostInflate {
-    my $self = shift;
-
-    $self->_InsertCGM;
-}
-
 RT::Base->_ImportOverlays();
 
 1;
diff --git a/lib/RT/Migrate/Importer.pm b/lib/RT/Migrate/Importer.pm
index 98c6742b53..3ee478e0aa 100644
--- a/lib/RT/Migrate/Importer.pm
+++ b/lib/RT/Migrate/Importer.pm
@@ -477,6 +477,67 @@ sub CloseStream {
 
     $self->{Progress}->(undef, 'force') if $self->{Progress};
 
+    # Fill CGM
+    my $dbh = $RT::Handle->dbh;
+
+    # Groups
+    $dbh->do(<<'EOF');
+INSERT INTO CachedGroupMembers (GroupId, MemberId, Via, ImmediateParentId, Disabled)
+    SELECT Groups.id, Groups.id, 0, Groups.id, Principals.Disabled FROM Groups
+    LEFT JOIN Principals ON ( Groups.id = Principals.id )
+    LEFT JOIN CachedGroupMembers ON (
+        Groups.id = CachedGroupMembers.GroupId
+        AND CachedGroupMembers.GroupId = CachedGroupMembers.MemberId
+        AND CachedGroupMembers.GroupId = CachedGroupMembers.ImmediateParentId
+        )
+    WHERE CachedGroupMembers.id IS NULL
+EOF
+
+    # GroupMembers
+    $dbh->do(<<'EOF');
+INSERT INTO CachedGroupMembers (GroupId, MemberId, Via, ImmediateParentId, Disabled)
+    SELECT GroupMembers.GroupId, GroupMembers.MemberId, 0, GroupMembers.GroupId, Principals.Disabled FROM GroupMembers
+    LEFT JOIN Principals ON ( GroupMembers.GroupId = Principals.id )
+    LEFT JOIN CachedGroupMembers ON (
+        GroupMembers.GroupId = CachedGroupMembers.GroupId
+        AND GroupMembers.MemberId = CachedGroupMembers.MemberId
+        AND CachedGroupMembers.GroupId = CachedGroupMembers.ImmediateParentId
+    )
+    WHERE CachedGroupMembers.id IS NULL
+EOF
+
+    # Fixup Via
+    $dbh->do(<<'EOF');
+UPDATE CachedGroupMembers SET Via=id WHERE Via=0
+EOF
+
+    # Cascaded GroupMembers, use the same SQL in rt-validator
+    my $cascaded_cgm = <<'EOF';
+INSERT INTO CachedGroupMembers (GroupId, MemberId, Via, ImmediateParentId, Disabled)
+SELECT cgm1.GroupId, gm2.MemberId, cgm1.id AS Via,
+    cgm1.MemberId AS ImmediateParentId, cgm1.Disabled
+FROM
+    CachedGroupMembers cgm1
+    CROSS JOIN GroupMembers gm2
+    LEFT JOIN CachedGroupMembers cgm3 ON (
+            cgm3.GroupId           = cgm1.GroupId
+        AND cgm3.MemberId          = gm2.MemberId
+        AND cgm3.Via               = cgm1.id
+        AND cgm3.ImmediateParentId = cgm1.MemberId )
+    LEFT JOIN Groups g ON (
+        cgm1.GroupId = g.id
+    )
+WHERE cgm1.GroupId != cgm1.MemberId
+AND gm2.GroupId = cgm1.MemberId
+AND cgm3.id IS NULL
+AND g.Domain != 'RT::Ticket-Role'
+EOF
+    # Do this multiple times if needed to fill up cascaded group members
+    while ( my $rv = $dbh->do($cascaded_cgm) ) {
+        # $rv could be 0E0, which is true in boolean context but 0 in numeric comparison.
+        last unless $rv > 0;
+    }
+
     return if $self->{Clone};
 
     # Take global CFs which we made and make them un-global

-----------------------------------------------------------------------


hooks/post-receive
-- 
rt

