[Rt-commit] r19113 - rt/3.999/branches/FTS/sbin
ruz at bestpractical.com
ruz at bestpractical.com
Thu Apr 9 10:19:46 EDT 2009
Author: ruz
Date: Thu Apr 9 10:19:46 2009
New Revision: 19113
Modified:
rt/3.999/branches/FTS/sbin/rt-fulltext-indexer
Log:
* complete indexer API, now works on mysql and Pg, needs docs and update_last_indexed
Modified: rt/3.999/branches/FTS/sbin/rt-fulltext-indexer
==============================================================================
--- rt/3.999/branches/FTS/sbin/rt-fulltext-indexer (original)
+++ rt/3.999/branches/FTS/sbin/rt-fulltext-indexer Thu Apr 9 10:19:46 2009
@@ -63,7 +63,7 @@
# Read in the options
my %opts;
-GetOptions( \%opts, 'help', 'limit', 'skip' );
+# 'limit' takes an integer value: it is used below as the page size of the
+# attachment search.  A bare 'limit' spec would declare a boolean flag whose
+# value can only ever be 1.
+GetOptions( \%opts, 'help', 'debug', 'limit=i', 'skip' );
if ( $opts{'help'} ) {
require Pod::Usage;
import Pod::Usage;
@@ -87,16 +87,97 @@
my $db_type = RT->config->get('DatabaseType');
+# Index every supported attachment type in turn.  For each type: fetch one
+# page of attachments with an id above last_indexed($type), ordered by id,
+# skip the ones filter() rejects or extract() yields nothing for, and hand
+# the extracted text to the DB-specific process().  finalize() runs only when
+# at least one attachment was processed; clean() runs after every type.
+my @types = qw(text html);
+foreach my $type ( @types ) {
+    my $attaches = attachments($type);
+    $attaches->limit( column => 'id', operator => '>', value => last_indexed($type) );
+    $attaches->order_by( column => 'id', order => 'asc' );
+    $attaches->rows_per_page( $opts{'limit'} || 100 );
+
+    my $found = 0;
+    # The loop variable is not called $a: a lexical $a hides the package
+    # variable that sort blocks rely on.
+    # Nothing is printed directly here: finalize_mysql writes the XML
+    # document to STDOUT, so stray output would end up in that stream.
+    while ( my $attachment = $attaches->next ) {
+        debug("Found attachment #". $attachment->id );
+        next if filter( $type, $attachment );
+        debug("Attachment #". $attachment->id ." hasn't been filtered" );
+        my $txt = extract($type, $attachment) or next;
+        debug("Extracted text from attachment #". $attachment->id );
+        $found++;
+        process( $type, $attachment, $txt );
+        debug("Processed attachment #". $attachment->id );
+    }
+    finalize( $type, $attaches ) if $found;
+    clean( $type );
+}
+
+# Build the collection of attachments of the given $type.
+# The base collection is joined to transactions whose object_type is
+# 'RT::Model::Ticket' and to tickets whose status is not 'deleted'; the
+# type-specific attachments_<type> sub then receives that collection as its
+# only argument.
+# Croaks (through goto_specific) when no attachments_<type> sub exists.
+sub attachments {
+    my $type = shift;
+    my $res = RT::Model::AttachmentCollection->new( current_user => RT->system_user );
+
+    my $txn_alias = $res->join_transactions;
+    $res->limit( alias => $txn_alias, column => 'object_type', value => 'RT::Model::Ticket' );
+    my $ticket_alias = $res->join(
+        alias1 => $txn_alias, column1 => 'object_id',
+        # direct method call rather than the indirect-object "new CLASS" form
+        table2 => RT::Model::TicketCollection->new, column2 => 'id'
+    );
+    $res->limit( alias => $ticket_alias, column => 'status', operator => '!=', value => 'deleted' );
+
+    return goto_specific(
+        suffix    => $type,
+        error     => "Don't know how to find $type attachments",
+        arguments => [$res],
+    );
+}
+# Return whatever the last_indexed_<db type> sub for the configured database
+# reports for $type; the main loop uses the result as the lower bound
+# (id > value) of the attachment search.
+# Croaks when no implementation exists for the current $db_type.
sub last_indexed {
my ($type) = (@_);
return goto_specific(
-$db_type,
-"Don't know how to find last indexed $type attachment for $db_type DB",
-@_
+ suffix => $db_type,
+ error => "Don't know how to find last indexed $type attachment for $db_type DB",
+ arguments => \@_,
);
}
+# Ask the type-specific filter_<type> sub whether an attachment should be
+# skipped; the remaining arguments are passed through to it.  When no such
+# sub exists the result is undef, so nothing is skipped.
+sub filter {
+    my ($kind, @rest) = @_;
+    return goto_specific( suffix => $kind, arguments => \@rest );
+}
+
+# Turn an attachment into text through the type-specific extract_<type> sub,
+# which receives the arguments that follow the type.
+# Croaks when no extractor exists for the type.
+sub extract {
+    my ($kind, @rest) = @_;
+    return goto_specific(
+        arguments => \@rest,
+        error     => "No way to convert $kind attachment into text",
+        suffix    => $kind,
+    );
+}
+
+# Hand all arguments to the process_<db type> sub for the configured
+# database.  Croaks when the database type has no such sub.
+sub process {
+    my @passed = @_;
+    return goto_specific(
+        arguments => \@passed,
+        error     => "No processer for $db_type DB",
+        suffix    => $db_type,
+    );
+}
+
+# Hand all arguments to the finalize_<db type> sub, if the configured
+# database has one; otherwise return undef without complaint.
+sub finalize {
+    my @passed = @_;
+    return goto_specific( arguments => \@passed, suffix => $db_type );
+}
+
+# Give the clean_<db type> sub for the configured database a chance to reset
+# its state after a type has been handled.  A database type without such a
+# sub is not an error: the call then returns undef.
+sub clean {
+    return goto_specific(
+        # goto_specific builds the target name from the 'suffix' key only;
+        # under any other key the clean_<db type> sub is never reached.
+        suffix    => $db_type,
+        arguments => \@_,
+    );
+}
+
+{
sub last_indexed_mysql {
my $type = shift;
my $attr = RT->system->first_attribute('LastIndexedAttachments');
@@ -105,6 +186,49 @@
return $attr->{ $type } || 0;
}
+# Append one sphinx:document element for the attachment to the document
+# returned by sphinx_template().  The element's id attribute is the
+# attachment id; $text is a reference to the text stored in its "content"
+# child.
+sub process_mysql {
+    my ($type, $attachment, $text) = @_;
+
+    my $docset = sphinx_template();
+
+    my $entry = $docset->createElement('sphinx:document');
+    $entry->setAttribute( id => $attachment->id );
+    $entry->appendTextChild( content => ${ $text } );
+
+    $docset->documentElement->appendChild( $entry );
+}
+
+# Cached XML document shared by the *_mysql subs; clean_mysql resets it.
+my $doc = undef;
+
+# Return the cached document, building it on first use: a sphinx:docset root
+# holding a sphinx:schema with one sphinx:field per indexed field name.
+sub sphinx_template {
+    unless ( $doc ) {
+        require XML::LibXML;
+        $doc = XML::LibXML::Document->new('1.0', 'UTF-8');
+
+        my $docset = $doc->createElement('sphinx:docset');
+        $doc->setDocumentElement( $docset );
+
+        my $schema = $doc->createElement('sphinx:schema');
+        $docset->appendChild( $schema );
+
+        for my $name ( qw(content) ) {
+            my $field = $doc->createElement('sphinx:field');
+            $field->setAttribute( name => $name );
+            $schema->appendChild( $field );
+        }
+    }
+
+    return $doc;
+}
+
+# Write the document returned by sphinx_template() to STDOUT.
+# The arguments are accepted for interface symmetry and are not used.
+sub finalize_mysql {
+    my ($type, $attachments) = @_;
+    my $docset = sphinx_template();
+    return $docset->toFH(*STDOUT, 1);
+}
+
+# Drop the cached document so that the next sphinx_template() call builds a
+# fresh one.
+sub clean_mysql {
+    undef $doc;
+}
+
+}
+
sub last_indexed_pg {
my $type = shift;
my $attachments = attachments( $type );
@@ -124,34 +248,78 @@
return $res->id;
}
-sub attachments {
- my $type = shift;
- my $res = RT::Model::AttachmentCollection->new( current_user => RT->system_user );
- return goto_specific(
- $type,
- "Don't know how to find $type attachments",
- $res,
- );
+# Store the to_tsvector() of the extracted text for one attachment.
+# With a configured 'Table' the row keyed by the attachment id is updated
+# when it exists and inserted otherwise; without one the configured column
+# of the Attachments table is updated.  $text is a reference to the text.
+# Dies with the DBI error string when the statement fails.
+sub process_pg {
+    my ($type, $attachment, $text) = @_;
+
+    my $dbh    = Jifty->handle->dbh;
+    my $table  = $fts_config->{'Table'};
+    my $column = $fts_config->{'Column'};
+
+    my $query;
+    if ( !$table ) {
+        $query = "UPDATE Attachments SET $column = to_tsvector(?) WHERE id = ?";
+    }
+    else {
+        # a non-empty row means the id is already present in $table
+        my @row = $dbh->selectrow_array("SELECT id FROM $table WHERE id = ?", undef, $attachment->id);
+        $query = @row
+            ? "UPDATE $table SET $column = to_tsvector(?) WHERE id = ?"
+            : "INSERT INTO $table($column, id) VALUES(to_tsvector(?), ?)";
+    }
+
+    my $status = $dbh->do( $query, undef, ${ $text }, $attachment->id );
+    die "error: ". $dbh->errstr unless $status;
}
+# Restrict the given attachment collection to rows whose content_type is
+# 'text/plain' and return the same collection.
sub attachments_text {
my $res = shift;
+ $res->limit( column => 'content_type', value => 'text/plain' );
+ return $res;
+}
+# Return a reference to the attachment's content, or undef when the content
+# is undefined or empty.
+sub extract_text {
+    my $attachment = shift;
+    my $content = $attachment->content;
+    if ( !defined $content || !length($content) ) {
+        return undef;
+    }
+    return \$content;
+}
-
+# Restrict the given attachment collection to rows whose content_type is
+# 'text/html' and return the same collection.
+sub attachments_html {
+ my $res = shift;
+ $res->limit( column => 'content_type', value => 'text/html' );
return $res;
}
-sub attachments_html {
+# Decide whether an HTML attachment should be skipped.
+# Returns 1 when the attachment has a parent with an id whose content type
+# is 'multipart/alternative', 0 otherwise.
+sub filter_html {
+    my $attachment = shift;
+    if ( my $parent = $attachment->parent ) {
+        # skip html parts that are alternatives
+        return 1 if $parent->id
+            && $parent->content_type eq 'multipart/alternative';
+    }
+    return 0;
+}
+
+# Return a reference to the attachment's content, or undef when the content
+# is undefined or empty.  The markup is returned as is; no conversion to
+# plain text is done yet.
+sub extract_html {
+    my $attachment = shift;
+    my $content = $attachment->content;
+    if ( !defined $content || !length($content) ) {
+        return undef;
+    }
+# TODO: html -> text
+    return \$content;
}
+# Dispatch to the sub named "<calling sub>_<lowercased suffix>".
+# Named arguments:
+#   suffix    - lowercased and appended to the calling sub's name
+#   error     - optional message; when the target sub does not exist, croak
+#               with it if given, otherwise return undef
+#   arguments - array reference with the arguments for the target sub
sub goto_specific {
-my $suffix = shift;
-my $msg = shift;
+ my %args = (@_);
+
+# name of the sub that called goto_specific, package prefix stripped below
my $func = (caller(1))[3];
$func =~ s/.*:://;
-my $call = $func ."_". lc $suffix;
-die $msg unless defined &$call;
+ my $call = $func ."_". lc $args{'suffix'};
+ unless ( defined &$call ) {
+ return undef unless $args{'error'};
+ require Carp; Carp::croak( $args{'error'} );
+ }
+# replace our own argument list, then transfer control to the target sub
+ @_ = @{ $args{'arguments'} };
goto &$call;
}
More information about the Rt-commit
mailing list