[Rt-commit] r19113 - rt/3.999/branches/FTS/sbin

ruz at bestpractical.com ruz at bestpractical.com
Thu Apr 9 10:19:46 EDT 2009


Author: ruz
Date: Thu Apr  9 10:19:46 2009
New Revision: 19113

Modified:
   rt/3.999/branches/FTS/sbin/rt-fulltext-indexer

Log:
* complete indexer API, now works on mysql and Pg, needs docs and update_last_indexed

Modified: rt/3.999/branches/FTS/sbin/rt-fulltext-indexer
==============================================================================
--- rt/3.999/branches/FTS/sbin/rt-fulltext-indexer	(original)
+++ rt/3.999/branches/FTS/sbin/rt-fulltext-indexer	Thu Apr  9 10:19:46 2009
@@ -63,7 +63,7 @@
 
 # Read in the options
 my %opts;
-GetOptions( \%opts, 'help', 'limit', 'skip' );
+GetOptions( \%opts, 'help', 'debug', 'limit=i', 'skip' );    # --limit takes an integer: it is used as the page size
 if ( $opts{'help'} ) {
     require Pod::Usage;
     import Pod::Usage;
@@ -87,16 +87,97 @@
 
 my $db_type = RT->config->get('DatabaseType');
 
+my @types = qw(text html);    # attachment kinds to index; each gets its own pass
+foreach my $type ( @types ) {
+    my $attaches = attachments($type);
+    $attaches->limit( column => 'id', operator => '>', value => last_indexed($type) );
+    $attaches->order_by( column => 'id', order => 'asc' );
+    $attaches->rows_per_page( $opts{'limit'} || 100 );    # batch size, 100 unless --limit is set
+
+    my $found = 0;    # number of attachments handed to process() in this pass
+    while ( my $attachment = $attaches->next ) {
+        # do not print to STDOUT in this loop: finalize_mysql() writes the XML document to STDOUT
+        debug("Found attachment #". $attachment->id );
+        next if filter( $type, $attachment );
+        debug("Attachment #". $attachment->id ." hasn't been filtered" );
+        my $txt = extract($type, $attachment) or next;
+        debug("Extracted text from attachment #". $attachment->id );
+        $found++;
+        process( $type, $attachment, $txt );
+        debug("Processed attachment #". $attachment->id );
+    }
+    finalize( $type, $attaches ) if $found;    # only flush when something was processed
+    clean( $type );    # backend cleanup runs even when nothing was found
+}
+
+sub attachments {    # collection of attachments on transactions of non-deleted tickets, narrowed by attachments_<type>
+    my $type = shift;
+    my $res = RT::Model::AttachmentCollection->new( current_user => RT->system_user );
+
+    my $txn_alias = $res->join_transactions;
+    $res->limit( alias => $txn_alias, column => 'object_type', value => 'RT::Model::Ticket' );
+    my $ticket_alias = $res->join(
+        alias1 => $txn_alias, column1 => 'object_id',
+        table2 => RT::Model::TicketCollection->new, column2 => 'id'
+    );
+    $res->limit( alias => $ticket_alias, column => 'status', operator => '!=', value => 'deleted' );
+
+    return goto_specific(    # hands $res to attachments_<type>; croaks when that sub does not exist
+        suffix => $type,
+        error => "Don't know how to find $type attachments",
+        arguments => [$res],
+    );
+}
 
 sub last_indexed {
     my ($type) = (@_);
     return goto_specific(
-        $db_type,
-        "Don't know how to find last indexed $type attachment for $db_type DB",
-        @_
+        suffix => $db_type,    # dispatches to last_indexed_<lowercased $db_type>
+        error => "Don't know how to find last indexed $type attachment for $db_type DB",
+        arguments => \@_,    # @_ was copied, not shifted, so the handler still receives $type
     );
 }
 
+sub filter {    # dispatches to filter_<type>; a true result makes the main loop skip the attachment
+    my $type = shift;    # consumed here, so the handler receives only the remaining arguments
+    return goto_specific(
+        suffix    => $type,
+        arguments => \@_,    # no 'error' key: a type without a filter sub yields undef (nothing filtered)
+    );
+}
+
+sub extract {    # dispatches to extract_<type>; the main loop skips the attachment when the result is false
+    my $type = shift;    # consumed here, so the handler receives only the remaining arguments
+    return goto_specific(
+        suffix    => $type,
+        error     => "No way to convert $type attachment into text",
+        arguments => \@_,
+    );
+}
+
+sub process {    # dispatches to process_<db type> with ($type, $attachment, $text); croaks for an unsupported DB
+    return goto_specific(
+        suffix    => $db_type,
+        error     => "No processor for $db_type DB",
+        arguments => \@_,
+    );
+}
+
+sub finalize {    # dispatches to finalize_<db type>; called with ($type, $attachments) after a pass that processed something
+    return goto_specific(
+        suffix    => $db_type,
+        arguments => \@_,    # no 'error' key: a backend without a finalize sub is silently skipped
+    );
+}
+
+sub clean {    # dispatches to clean_<db type>; called with ($type) at the end of every pass
+    return goto_specific(
+        suffix    => $db_type,    # goto_specific() only reads 'suffix'; under any other key the handler is never found
+        arguments => \@_,    # no 'error' key: a backend without a clean sub is silently skipped
+    );
+}
+
+{
 sub last_indexed_mysql {
     my $type = shift;
     my $attr = RT->system->first_attribute('LastIndexedAttachments');
@@ -105,6 +186,49 @@
     return $attr->{ $type } || 0;
 }
 
+sub process_mysql {    # append one <sphinx:document> element for this attachment to the shared docset
+    my (undef, $attachment, $text_ref) = @_;    # the first argument ($type) is not used here
+
+    my $docset = sphinx_template();
+    my $root   = $docset->documentElement;
+
+    my $entry = $docset->createElement('sphinx:document');
+    $entry->setAttribute( 'id', $attachment->id );
+    $entry->appendTextChild( 'content', ${ $text_ref } );    # $text_ref is a reference to the extracted text
+
+    $root->appendChild( $entry );
+}
+
+my $doc = undef;    # cached docset document; reset by clean_mysql()
+sub sphinx_template {    # return the cached docset, building an empty one (root + schema) on first use
+    unless ( $doc ) {
+        require XML::LibXML;
+        $doc = XML::LibXML::Document->new('1.0', 'UTF-8');
+
+        my $docset = $doc->createElement('sphinx:docset');
+        $doc->setDocumentElement( $docset );
+        my $schema = $doc->createElement('sphinx:schema');
+        $docset->appendChild( $schema );
+
+        for my $name ( qw(content) ) {    # one <sphinx:field> declaration per field name
+            my $field = $doc->createElement('sphinx:field');
+            $field->setAttribute( 'name', $name );
+            $schema->appendChild( $field );
+        }
+    }
+    return $doc;
+}
+
+sub finalize_mysql {    # write the accumulated docset to STDOUT
+    my $docset = sphinx_template();    # the ($type, $attachments) arguments are not needed
+    $docset->toFH( *STDOUT, 1 );
+}
+
+sub clean_mysql {    # drop the cached docset so sphinx_template() builds a fresh one next time
+    undef $doc;
+}
+
+}
+
 sub last_indexed_pg {
     my $type = shift;
     my $attachments = attachments( $type );
@@ -124,34 +248,78 @@
     return $res->id;
 }
 
-sub attachments {
-    my $type = shift;
-    my $res = RT::Model::AttachmentCollection->new( current_user => RT->system_user );
-    return goto_specific(
-        $type,
-        "Don't know how to find $type attachments",
-        $res,
-    );
+sub process_pg {    # store to_tsvector(text) for one attachment in the configured table/column; dies on DB error
+    my (undef, $attachment, $text_ref) = @_;    # the first argument ($type) is not used here
+
+    my $dbh    = Jifty->handle->dbh;
+    my $table  = $fts_config->{'Table'};
+    my $column = $fts_config->{'Column'};
+    my $id     = $attachment->id;
+
+    my $query;
+    if ( !$table ) {
+        # no separate index table configured: the column is updated in Attachments itself
+        $query = "UPDATE Attachments SET $column = to_tsvector(?) WHERE id = ?";
+    } else {
+        # separate index table: update the row when it exists, insert it otherwise
+        my @row = $dbh->selectrow_array("SELECT id FROM $table WHERE id = ?", undef, $id);
+        $query = @row
+            ? "UPDATE $table SET $column = to_tsvector(?) WHERE id = ?"
+            : "INSERT INTO $table($column, id) VALUES(to_tsvector(?), ?)";
+    }
+
+    $dbh->do( $query, undef, ${ $text_ref }, $id )
+        or die "error: ". $dbh->errstr;
 }
 
 sub attachments_text {
     my $res = shift;
+    $res->limit( column => 'content_type', value => 'text/plain' );    # keep only text/plain attachments
+    return $res;    # the same collection object that was passed in
+}
 
+sub extract_text {    # return a reference to the attachment's content, or undef when it is missing or empty
+    my ($attachment) = @_;
+    my $content = $attachment->content;
+    my $is_empty = !defined $content || !length $content;
+    return $is_empty ? undef : \$content;
+}
 
-
+sub attachments_html {    # narrow the given attachment collection to HTML parts
+    my $res = shift;
+    $res->limit( column => 'content_type', value => 'text/html' );    # keep only text/html attachments
     return $res;
 }
 
-sub attachments_html {
+sub filter_html {    # return 1 when this HTML attachment must not be indexed, 0 otherwise
+    my $attachment = shift;
+    if ( my $parent = $attachment->parent ) {
+        # skip html parts that are alternatives, i.e. children of a multipart/alternative container
+        return 1 if $parent->id
+            && $parent->content_type eq 'multipart/alternative';
+    }
+    return 0;
+}
+
+sub extract_html {    # return a reference to the attachment's content, or undef when it is missing or empty
+    my ($attachment) = @_;
+    my $html = $attachment->content;
+    return undef if !defined $html || !length $html;
+    # TODO: html -> text (the content is returned unconverted for now)
+    return \$html;
 }
 
 sub goto_specific {
-    my $suffix = shift;
-    my $msg = shift;
+    my %args = (@_);    # named arguments read below: suffix, error (optional), arguments (array ref)
+
     my $func = (caller(1))[3];
     $func =~ s/.*:://;
-    my $call = $func ."_". lc $suffix;
-    die $msg unless defined &$call;
+    my $call = $func ."_". lc $args{'suffix'};    # handler name: <calling sub>_<lowercased suffix>
+    unless ( defined &$call ) {
+        return undef unless $args{'error'};    # a missing handler is tolerated when no error text was given
+        require Carp; Carp::croak( $args{'error'} );
+    }
+    @_ = @{ $args{'arguments'} };    # the handler is entered with exactly these arguments
     goto &$call;
 }
 


More information about the Rt-commit mailing list