[Bps-public-commit] rt-extension-tika branch, master, updated. c6bd5fedde171427ab8fdfcf5fa5dd4042281cd8

Dave Goehrig dave at bestpractical.com
Tue Feb 21 10:09:53 EST 2017


The branch, master has been updated
       via  c6bd5fedde171427ab8fdfcf5fa5dd4042281cd8 (commit)
       via  c958fab0546dc12d6229a7c4a81605910f9ed0d8 (commit)
      from  40da07ad1c48b8cad9ee7a5a08581522eeeee717 (commit)

Summary of changes:
 META.yml                                         |  1 -
 Makefile.PL                                      |  3 +-
 README                                           | 26 ++++++-
 lib/RT/Extension/Tika.pm                         | 94 +++++++++++++++++++-----
 sbin/rt-tika-fulltext-indexer                    | 27 ++++---
 sbin/rt-tika-fulltext-indexer.in                 | 24 ++++--
 sbin/{start-tika-server => start-tika-server.in} |  0
 7 files changed, 133 insertions(+), 42 deletions(-)
 copy sbin/{start-tika-server => start-tika-server.in} (100%)

- Log -----------------------------------------------------------------
commit c958fab0546dc12d6229a7c4a81605910f9ed0d8
Author: Dave Goehrig <dave at bestpractical.com>
Date:   Mon Dec 19 14:54:37 2016 -0500

    fix TikaMimeTypes expansion

diff --git a/sbin/rt-tika-fulltext-indexer.in b/sbin/rt-tika-fulltext-indexer.in
index 79a0c5e..cc5b04b 100755
--- a/sbin/rt-tika-fulltext-indexer.in
+++ b/sbin/rt-tika-fulltext-indexer.in
@@ -51,6 +51,7 @@
 use strict;
 use warnings;
 use 5.010;
+use Data::Dumper;
 
 ### after:     use lib qw(@RT_LIB_PATH@);
     use lib qw(/opt/rt4/local/lib /opt/rt4/lib);
@@ -174,11 +175,11 @@ sub attachment_loop {
     {
         # Indexes all text/plain and text/html attachments
         my $attachments = RT::Attachments->new( RT->SystemUser );
-	my @MIMETYPES = (RT->Config->Get('TikaMimeTypes')) || 
-		( 'text/plain', 'text/html','application/pdf', 
+	my @MIMETYPES = @{ RT->Config->Get('TikaMimeTypes') ||
+		[ 'text/plain', 'text/html','application/pdf', 
 		'application/msword', 
-		'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
-	);
+		'application/vnd.openxmlformats-officedocument.wordprocessingml.document' ] };
+
         my $txn_alias = $attachments->Join(
             ALIAS1 => 'main',
             FIELD1 => 'TransactionId',

commit c6bd5fedde171427ab8fdfcf5fa5dd4042281cd8
Author: Dave Goehrig <dave at bestpractical.com>
Date:   Tue Feb 21 10:07:33 2017 -0500

    Removed Apache::Tika dependency
    
    The code base for Apache::Tika provides no
    error handling or ability to handle errors
    as it silently squashes them.
    
    To work around this, this change replaces it
    with equivalent LWP::UserAgent code that calls
    just the interface we need, and handles the
    case where the server goes away or a document
    doesn't process.
    
    Also updated the README to correct some issues
    with the installation and running instructions.

diff --git a/META.yml b/META.yml
index eda7c0c..58a268f 100644
--- a/META.yml
+++ b/META.yml
@@ -19,7 +19,6 @@ no_index:
     - inc
     - jar
 requires:
-  Apache::Tika: 0
   File::MimeInfo::Magic: 0
   IO::Scalar: 0
   perl: 5.10.1
diff --git a/Makefile.PL b/Makefile.PL
index 2f8498f..e80018b 100644
--- a/Makefile.PL
+++ b/Makefile.PL
@@ -38,7 +38,7 @@ print STDERR Dumper  \@tools;
 
 substitute(
 	{ 
-		PERL => $ENV{PERL} || $secure_perl_path,
+		PERL => $ENV{'PERL'} || $secure_perl_path,
 		RT_LIB_PATH => join(' ',$local_lib_path,$lib_path ) 
 	}, {
 		sufix => '.in'
@@ -46,7 +46,6 @@ substitute(
 	@tools
 ) if @tools; 
 
-requires 'Apache::Tika';
 requires 'File::MimeInfo::Magic';
 requires 'IO::Scalar';
 
diff --git a/README b/README
index 2da7863..4404ce7 100644
--- a/README
+++ b/README
@@ -42,7 +42,24 @@ INSTALLATION
 
             Set(@TikaMimeTypes,'application/rtf','application/x-rtf',
                  'application/vnd.oasis.opendocument.text',
-                 'application/vnd.oasis.opendocument.text-master');
+                 'application/vnd.oasis.opendocument.text-master',
+                 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
+                 'application/msword',
+                'application/vnd.ms-excel',
+                'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet' 
+        
+            );
+
+        You will also need to configure the full text search as with typical
+        RT:
+
+            Set( %FullTextSearch,
+                Enable     => 1,
+                Indexed    => 1,
+                Table      => 'Attachments',
+                Column     => 'ContentIndex',
+            );
+
 
     Clear your mason cache
             rm -rf /opt/rt4/var/mason_data/obj
@@ -95,3 +112,10 @@ LICENSE AND COPYRIGHT
 
       The GNU General Public License, Version 2, June 1991
 
+POD ERRORS
+    Hey! The above document had some coding errors, which are explained
+    below:
+
+    Around line 75:
+        Expected text after =item, not a bullet
+
diff --git a/lib/RT/Extension/Tika.pm b/lib/RT/Extension/Tika.pm
index baf3f3c..03e7d5c 100644
--- a/lib/RT/Extension/Tika.pm
+++ b/lib/RT/Extension/Tika.pm
@@ -2,9 +2,9 @@ use strict;
 use warnings;
 package RT::Extension::Tika;
 
-use Apache::Tika;
 use File::MimeInfo::Magic qw/ mimetype /;
 use IO::Scalar;
+use LWP::UserAgent;
 
 our $VERSION = '0.01';
 
@@ -26,6 +26,9 @@ for use in search engines.  This plugin requires running a tika-server process
 either on the same machine as RT or on another machine, to provide the text 
 extraction for the different supported document types.
 
+Currently this module only supports MySQL and PostgreSQL databases for indexing.
+
+
 =head1 RT VERSION
 
 Works with RT 4.4.1.
@@ -42,6 +45,24 @@ Works with RT 4.4.1.
 
 May need root permissions
 
+=item Configure Full text indexing
+
+In order to use this extension, you will first need to configure
+your RT to use fulltext indexing by running the script:
+    
+    /opt/rt4/sbin/rt-setup-fulltext-index
+
+This will create a new table in your database and prompt you to
+configure your F</opt/rt4/etc/RT_SiteConfig.pm> for your particular
+database configuration, such as: 
+
+    Set( %FullTextSearch,
+        Enable     => 1,
+        Indexed    => 1,
+        Table      => 'AttachmentsIndex'
+    );
+
+
 =item Edit your F</opt/rt4/etc/RT_SiteConfig.pm>
 
 If you are using RT 4.2 or greater, add this line:
@@ -54,18 +75,31 @@ For RT 4.0, add this line:
 
 or add C<RT::Extension::Tika> to your existing C<@Plugins> line.
 
-By default this extension will index text, html, pdf, doc, and docx files.
-You can add additional mime types by adding them to a list:
-
-    Set(@TikaMimeTypes,'application/rtf','application/x-rtf',
-         'application/vnd.oasis.opendocument.text',
-         'application/vnd.oasis.opendocument.text-master');
-
-=item Clear your mason cache
+To select the attachment types to index, set the C<@TikaMimeTypes>
+value to a list of mime types for indexing:
+
+    Set(@TikaMimeTypes,
+            'text/plain',
+            'text/html',
+            'application/zip',
+            'application/pdf',
+            'application/vnd.oasis.opendocument.text',
+            'application/vnd.oasis.opendocument.text-master',
+            'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
+            'application/msword',
+            'application/vnd.ms-excel',
+            'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
+            'application/rtf',
+            'application/x-rtf'
+    );
+
+The above list covers plain text, HTML, ZIP archives, PDF, OpenDocument text, Microsoft Word and Excel, and RTF files.
+
+If you want to run the Tika server on a different host from your RT instance you can 
+configure the C<$TikaURL> value to point it at that host:
 
-    rm -rf /opt/rt4/var/mason_data/obj
+    Set($TikaURL, 'http://someotherhost:9998/');
 
-=item Restart your webserver
 
 =item Start the tika server
 
@@ -83,10 +117,6 @@ You can get a list of options (host, port, CORS) by running:
 
 By default the server will listen on localhost:9998
 
-If you change the default path you will need to set the TikaURL in your RT_SiteConfig.pm
-
-    Set($TikaURL, 'http://someotherhost:9998/');
-
 =item Add the indexer to a cron job
 
 In the directory /opt/rt4 you can run the indexer as:
@@ -122,13 +152,37 @@ This is free software, licensed under:
 
 =cut
 
+
+sub config_url {    # Base URL of the Tika server; falls back to the local default when nothing is configured.
+    return RT->Config->Get('TikaURL') || RT->Config->Get('TikaUrl') || 'http://localhost:9998/';    # docs say $TikaURL; 'TikaUrl' kept for existing configs
+}
+
+sub mime_file {    # Guess the MIME type of the in-memory content $file using mimetype() from File::MimeInfo::Magic.
+    my ($file) = @_;
+    my $io = IO::Scalar->new(\$file);    # mimetype() is handed a handle-like object over the content, not a path
+    return mimetype($io);
+}
+
+sub request {    # PUT $file to the "tika" resource under $url with Content-Type $mimetype; returns the response object.
+    my ($url, $file, $mimetype) = @_;
+    (my $endpoint = $url) =~ s{/*\z}{/tika};    # the default URL ends in '/', so avoid building '...//tika'
+    return LWP::UserAgent->new->put( $endpoint,
+        'Accept'       => 'text/plain',
+        'Content-Type' => $mimetype,
+        'Content'      => $file,
+    );
+}
+
 sub extract {
 	my ($file) = @_;
-	my $url = RT->Config->Get('TikaUrl') || 'http://localhost:9998/';
-	my $tika = Apache::Tika->new( url => $url );
-	my $io = new IO::Scalar \$file;
-        my $mime_type = mimetype($io);
-	return $tika->tika($file,$mime_type);
+    my $url = config_url();
+    # If the MIME type cannot be determined, send a generic one so the Content-Type header is always defined.
+    my $mime_type = mime_file($file) // 'application/octet-stream';
+    my $response = request($url, $file, $mime_type);
+    if ($response->is_error) {
+        return ('', $response->message || 'error');    # failure: empty text plus an error message
+    }
+    return ($response->content);    # success: extracted text only, so a caller's second list element stays undef
 }
 
 1;
diff --git a/sbin/rt-tika-fulltext-indexer b/sbin/rt-tika-fulltext-indexer
index 3f35d36..ff3260c 100755
--- a/sbin/rt-tika-fulltext-indexer
+++ b/sbin/rt-tika-fulltext-indexer
@@ -1,4 +1,4 @@
-#!/usr/bin/perl
+#!/home/desktop/perl5/perlbrew/perls/perl-5.24.0/bin/perl
 ### before: #!@PERL@
 #
 # BEGIN BPS TAGGED BLOCK {{{
@@ -51,6 +51,7 @@
 use strict;
 use warnings;
 use 5.010;
+use Data::Dumper;
 
 ### after:     use lib qw(@RT_LIB_PATH@);
     use lib qw(/opt/rt4/local/lib /opt/rt4/lib);
@@ -174,12 +175,11 @@ sub attachment_loop {
     {
         # Indexes all text/plain and text/html attachments
         my $attachments = RT::Attachments->new( RT->SystemUser );
-	my @extra = (RT->Config->Get('TikaMimeTypes'));
-	my @MIMETYPES = ( 'text/plain', 'text/html','application/pdf', 
+	my @MIMETYPES = @{ RT->Config->Get('TikaMimeTypes') ||
+		[ 'text/plain', 'text/html','application/pdf', 
 		'application/msword', 
-		'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
-		@extra
-	);
+		'application/vnd.openxmlformats-officedocument.wordprocessingml.document' ] };
+
         my $txn_alias = $attachments->Join(
             ALIAS1 => 'main',
             FIELD1 => 'TransactionId',
@@ -227,7 +227,7 @@ sub process_bulk_insert {
         my @insert;
         my $found = 0;
 
-        while ( my $a = $attachments->Next ) {
+        ATTACHMENT: while ( my $a = $attachments->Next ) {
             debug("Found attachment #". $a->id );
             if ( $max_size and $a->ContentLength > $max_size ){
                 debug("Attachment #" . $a->id . " is " . $a->ContentLength .
@@ -237,9 +237,16 @@ sub process_bulk_insert {
             }
 
             my $text = $a->Content // "";
-	    if ($a->ContentType ne "text/html" and $a->ContentType ne "text/plain" ) {
-			$text = RT::Extension::Tika::extract($a->Content);	
-	    }
+            my $err;
+            if ($a->ContentType ne "text/html" and $a->ContentType ne "text/plain" ) {
+                ($text,$err) = RT::Extension::Tika::extract($a->Content);
+            }
+            # If an error occurred while processing the attachment, log it
+            # and move on to the next attachment.
+             if ($err) {
+                debug("Failed to process attachment " . $a->id . " : " .$err);
+                next ATTACHMENT;
+            }
             HTML::Entities::decode_entities($text) if $a->ContentType eq "text/html";
             push @insert, join("\n", $a->Subject // "", $text), $a->id;
             $found++;
diff --git a/sbin/rt-tika-fulltext-indexer.in b/sbin/rt-tika-fulltext-indexer.in
index cc5b04b..0dc3943 100755
--- a/sbin/rt-tika-fulltext-indexer.in
+++ b/sbin/rt-tika-fulltext-indexer.in
@@ -227,7 +227,7 @@ sub process_bulk_insert {
         my @insert;
         my $found = 0;
 
-        while ( my $a = $attachments->Next ) {
+        ATTACHMENT: while ( my $a = $attachments->Next ) {
             debug("Found attachment #". $a->id );
             if ( $max_size and $a->ContentLength > $max_size ){
                 debug("Attachment #" . $a->id . " is " . $a->ContentLength .
@@ -237,9 +237,16 @@ sub process_bulk_insert {
             }
 
             my $text = $a->Content // "";
-	    if ($a->ContentType ne "text/html" and $a->ContentType ne "text/plain" ) {
-			$text = RT::Extension::Tika::extract($a->Content);	
-	    }
+            my $err;
+            if ($a->ContentType ne "text/html" and $a->ContentType ne "text/plain" ) {
+                ($text,$err) = RT::Extension::Tika::extract($a->Content);
+            }
+            # If an error occurred while processing the attachment, log it
+            # and move on to the next attachment.
+             if ($err) {
+                debug("Failed to process attachment " . $a->id . " : " .$err);
+                next ATTACHMENT;
+            }
             HTML::Entities::decode_entities($text) if $a->ContentType eq "text/html";
             push @insert, join("\n", $a->Subject // "", $text), $a->id;
             $found++;
diff --git a/sbin/start-tika-server.in b/sbin/start-tika-server.in
new file mode 100755
index 0000000..c9b5a74
--- /dev/null
+++ b/sbin/start-tika-server.in
@@ -0,0 +1,3 @@
+#!/bin/bash
+# Launch the bundled tika-server.jar with java.
+java -jar $(dirname 1)/local/plugins/RT-Extension-Tika/lib/auto/share/dist/RT-Extension-Tika/tika-server.jar    # NOTE(review): `dirname 1` evaluates to ".", so this path is relative to the current directory -- confirm whether "$0" was intended

-----------------------------------------------------------------------


More information about the Bps-public-commit mailing list