[Bps-public-commit] rt-extension-tika branch, master, updated. c6bd5fedde171427ab8fdfcf5fa5dd4042281cd8
Dave Goehrig
dave at bestpractical.com
Tue Feb 21 10:09:53 EST 2017
The branch, master has been updated
via c6bd5fedde171427ab8fdfcf5fa5dd4042281cd8 (commit)
via c958fab0546dc12d6229a7c4a81605910f9ed0d8 (commit)
from 40da07ad1c48b8cad9ee7a5a08581522eeeee717 (commit)
Summary of changes:
META.yml | 1 -
Makefile.PL | 3 +-
README | 26 ++++++-
lib/RT/Extension/Tika.pm | 94 +++++++++++++++++++-----
sbin/rt-tika-fulltext-indexer | 27 ++++---
sbin/rt-tika-fulltext-indexer.in | 24 ++++--
sbin/{start-tika-server => start-tika-server.in} | 0
7 files changed, 133 insertions(+), 42 deletions(-)
copy sbin/{start-tika-server => start-tika-server.in} (100%)
- Log -----------------------------------------------------------------
commit c958fab0546dc12d6229a7c4a81605910f9ed0d8
Author: Dave Goehrig <dave at bestpractical.com>
Date: Mon Dec 19 14:54:37 2016 -0500
fix TikaMimeTypes expansion
diff --git a/sbin/rt-tika-fulltext-indexer.in b/sbin/rt-tika-fulltext-indexer.in
index 79a0c5e..cc5b04b 100755
--- a/sbin/rt-tika-fulltext-indexer.in
+++ b/sbin/rt-tika-fulltext-indexer.in
@@ -51,6 +51,7 @@
use strict;
use warnings;
use 5.010;
+use Data::Dumper;
### after: use lib qw(@RT_LIB_PATH@);
use lib qw(/opt/rt4/local/lib /opt/rt4/lib);
@@ -174,11 +175,11 @@ sub attachment_loop {
{
# Indexes all text/plain and text/html attachments
my $attachments = RT::Attachments->new( RT->SystemUser );
- my @MIMETYPES = (RT->Config->Get('TikaMimeTypes')) ||
- ( 'text/plain', 'text/html','application/pdf',
+ my @MIMETYPES = @{ RT->Config->Get('TikaMimeTypes') ||
+ [ 'text/plain', 'text/html','application/pdf',
'application/msword',
- 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
- );
+ 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' ] };
+
my $txn_alias = $attachments->Join(
ALIAS1 => 'main',
FIELD1 => 'TransactionId',
commit c6bd5fedde171427ab8fdfcf5fa5dd4042281cd8
Author: Dave Goehrig <dave at bestpractical.com>
Date: Tue Feb 21 10:07:33 2017 -0500
Removed Apache::Tika dependency
The code base for Apache::Tika provides no
error handling or ability to handle errors
as it silently squashes them.
To work around this, this change replaces it
with equivalent LWP::UserAgent code that calls
just the interface we need, and handles the
case where the server goes away or a document
doesn't process.
Also updated the README to correct some issues
with the installation and running instructions.
diff --git a/META.yml b/META.yml
index eda7c0c..58a268f 100644
--- a/META.yml
+++ b/META.yml
@@ -19,7 +19,6 @@ no_index:
- inc
- jar
requires:
- Apache::Tika: 0
File::MimeInfo::Magic: 0
IO::Scalar: 0
perl: 5.10.1
diff --git a/Makefile.PL b/Makefile.PL
index 2f8498f..e80018b 100644
--- a/Makefile.PL
+++ b/Makefile.PL
@@ -38,7 +38,7 @@ print STDERR Dumper \@tools;
substitute(
{
- PERL => $ENV{PERL} || $secure_perl_path,
+ PERL => $ENV{'PERL'} || $secure_perl_path,
RT_LIB_PATH => join(' ',$local_lib_path,$lib_path )
}, {
sufix => '.in'
@@ -46,7 +46,6 @@ substitute(
@tools
) if @tools;
-requires 'Apache::Tika';
requires 'File::MimeInfo::Magic';
requires 'IO::Scalar';
diff --git a/README b/README
index 2da7863..4404ce7 100644
--- a/README
+++ b/README
@@ -42,7 +42,24 @@ INSTALLATION
Set(@TikaMimeTypes,'application/rtf','application/x-rtf',
'application/vnd.oasis.opendocument.text',
- 'application/vnd.oasis.opendocument.text-master');
+ 'application/vnd.oasis.opendocument.text-master',
+ 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
+ 'application/msword',
+ 'application/vnd.ms-excel',
+ 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
+
+ );
+
+ You will also need to configure full text search, as you would for
+ any RT installation:
+
+ Set( %FullTextSearch,
+ Enable => 1,
+ Indexed => 1,
+ Table => 'Attachments',
+ Column => 'ContentIndex',
+ );
+
Clear your mason cache
rm -rf /opt/rt4/var/mason_data/obj
@@ -95,3 +112,10 @@ LICENSE AND COPYRIGHT
The GNU General Public License, Version 2, June 1991
+POD ERRORS
+ Hey! The above document had some coding errors, which are explained
+ below:
+
+ Around line 75:
+ Expected text after =item, not a bullet
+
diff --git a/lib/RT/Extension/Tika.pm b/lib/RT/Extension/Tika.pm
index baf3f3c..03e7d5c 100644
--- a/lib/RT/Extension/Tika.pm
+++ b/lib/RT/Extension/Tika.pm
@@ -2,9 +2,9 @@ use strict;
use warnings;
package RT::Extension::Tika;
-use Apache::Tika;
use File::MimeInfo::Magic qw/ mimetype /;
use IO::Scalar;
+use LWP::UserAgent;
our $VERSION = '0.01';
@@ -26,6 +26,9 @@ for use in search engines. This plugin requires running a tika-server process
either on the same machine as RT or on another machine, to provide the text
extraction for the different supported document types.
+Currently this module only supports MySQL and PostgreSQL databases for indexing.
+
+
=head1 RT VERSION
Works with RT 4.4.1.
@@ -42,6 +45,24 @@ Works with RT 4.4.1.
May need root permissions
+=item Configure Full text indexing
+
+In order to use this extension, you will first need to configure
+your RT to use fulltext indexing by running the script:
+
+ /opt/rt4/sbin/rt-setup-fulltext-index
+
+This will create a new table in your database and prompt you to
+configure your F</opt/rt4/etc/RT_SiteConfig.pm> for your particular
+database configuration, such as:
+
+ Set( %FullTextSearch,
+ Enable => 1,
+ Indexed => 1,
+ Table => 'AttachmentsIndex'
+ );
+
+
=item Edit your F</opt/rt4/etc/RT_SiteConfig.pm>
If you are using RT 4.2 or greater, add this line:
@@ -54,18 +75,31 @@ For RT 4.0, add this line:
or add C<RT::Extension::Tika> to your existing C<@Plugins> line.
-By default this extension will index text, html, pdf, doc, and docx files.
-You can add additional mime types by adding them to a list:
-
- Set(@TikaMimeTypes,'application/rtf','application/x-rtf',
- 'application/vnd.oasis.opendocument.text',
- 'application/vnd.oasis.opendocument.text-master');
-
-=item Clear your mason cache
+To select the attachment types to index, set the C<@TikaMimeTypes>
+value to a list of mime types for indexing:
+
+ Set(@TikaMimeTypes,
+ 'text/plain',
+ 'text/html',
+ 'application/zip',
+ 'application/pdf',
+ 'application/vnd.oasis.opendocument.text',
+ 'application/vnd.oasis.opendocument.text-master',
+ 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
+ 'application/msword',
+ 'application/vnd.ms-excel',
+ 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
+ 'application/rtf',
+ 'application/x-rtf'
+ );
+
+The above list contains plain text, HTML, ZIP, PDF, OpenOffice, Microsoft Word, Excel, and RTF files.
+
+If you want to run the Tika server on a different host from your RT instance you can
+configure the C<$TikaURL> value to point it at that host:
- rm -rf /opt/rt4/var/mason_data/obj
+ Set($TikaURL, 'http://someotherhost:9998/');
-=item Restart your webserver
=item Start the tika server
@@ -83,10 +117,6 @@ You can get a list of options (host, port, CORS) by running:
By default the server will listen on localhost:9998
-If you change the default path you will need to set the TikaURL in your RT_SiteConfig.pm
-
- Set($TikaURL, 'http://someotherhost:9998/');
-
=item Add the indexer to a cron job
In the directory /opt/rt4 you can run the indexer as:
@@ -122,13 +152,37 @@ This is free software, licensed under:
=cut
+
+sub config_url {
+ RT->Config->Get('TikaUrl') || 'http://localhost:9998/';
+}
+
+sub mime_file {
+ my ($file) = @_;
+ my $io = new IO::Scalar \$file;
+ mimetype($io);
+}
+
+sub request {
+ my($url,$file,$mimetype) = @_;
+ my $ua = LWP::UserAgent->new();
+ $ua->put($url . "/tika",
+ 'Accept' => 'text/plain',
+ 'Content-Type' => $mimetype,
+ 'Content' => $file
+ );
+}
+
sub extract {
my ($file) = @_;
- my $url = RT->Config->Get('TikaUrl') || 'http://localhost:9998/';
- my $tika = Apache::Tika->new( url => $url );
- my $io = new IO::Scalar \$file;
- my $mime_type = mimetype($io);
- return $tika->tika($file,$mime_type);
+ my $url = config_url;
+ my $mime_type = mime_file($file);
+ my $response = request($url,$file,$mime_type);
+ print STDERR "$mime_type\n";
+ if ($response->is_error) {
+ return ('', $response->message || 'error');
+ }
+ return ($response->content);
}
1;
diff --git a/sbin/rt-tika-fulltext-indexer b/sbin/rt-tika-fulltext-indexer
index 3f35d36..ff3260c 100755
--- a/sbin/rt-tika-fulltext-indexer
+++ b/sbin/rt-tika-fulltext-indexer
@@ -1,4 +1,4 @@
-#!/usr/bin/perl
+#!/home/desktop/perl5/perlbrew/perls/perl-5.24.0/bin/perl
### before: #!@PERL@
#
# BEGIN BPS TAGGED BLOCK {{{
@@ -51,6 +51,7 @@
use strict;
use warnings;
use 5.010;
+use Data::Dumper;
### after: use lib qw(@RT_LIB_PATH@);
use lib qw(/opt/rt4/local/lib /opt/rt4/lib);
@@ -174,12 +175,11 @@ sub attachment_loop {
{
# Indexes all text/plain and text/html attachments
my $attachments = RT::Attachments->new( RT->SystemUser );
- my @extra = (RT->Config->Get('TikaMimeTypes'));
- my @MIMETYPES = ( 'text/plain', 'text/html','application/pdf',
+ my @MIMETYPES = @{ RT->Config->Get('TikaMimeTypes') ||
+ [ 'text/plain', 'text/html','application/pdf',
'application/msword',
- 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
- @extra
- );
+ 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' ] };
+
my $txn_alias = $attachments->Join(
ALIAS1 => 'main',
FIELD1 => 'TransactionId',
@@ -227,7 +227,7 @@ sub process_bulk_insert {
my @insert;
my $found = 0;
- while ( my $a = $attachments->Next ) {
+ ATTACHMENT: while ( my $a = $attachments->Next ) {
debug("Found attachment #". $a->id );
if ( $max_size and $a->ContentLength > $max_size ){
debug("Attachment #" . $a->id . " is " . $a->ContentLength .
@@ -237,9 +237,16 @@ sub process_bulk_insert {
}
my $text = $a->Content // "";
- if ($a->ContentType ne "text/html" and $a->ContentType ne "text/plain" ) {
- $text = RT::Extension::Tika::extract($a->Content);
- }
+ my $err;
+ if ($a->ContentType ne "text/html" and $a->ContentType ne "text/plain" ) {
+ ($text,$err) = RT::Extension::Tika::extract($a->Content);
+ }
+ # if an error occurred processing the attachment log it,
+ # and move on to the next attachment
+ if ($err) {
+ debug("Failed to process attachment " . $a->id . " : " .$err);
+ next ATTACHMENT;
+ }
HTML::Entities::decode_entities($text) if $a->ContentType eq "text/html";
push @insert, join("\n", $a->Subject // "", $text), $a->id;
$found++;
diff --git a/sbin/rt-tika-fulltext-indexer.in b/sbin/rt-tika-fulltext-indexer.in
index cc5b04b..0dc3943 100755
--- a/sbin/rt-tika-fulltext-indexer.in
+++ b/sbin/rt-tika-fulltext-indexer.in
@@ -227,7 +227,7 @@ sub process_bulk_insert {
my @insert;
my $found = 0;
- while ( my $a = $attachments->Next ) {
+ ATTACHMENT: while ( my $a = $attachments->Next ) {
debug("Found attachment #". $a->id );
if ( $max_size and $a->ContentLength > $max_size ){
debug("Attachment #" . $a->id . " is " . $a->ContentLength .
@@ -237,9 +237,16 @@ sub process_bulk_insert {
}
my $text = $a->Content // "";
- if ($a->ContentType ne "text/html" and $a->ContentType ne "text/plain" ) {
- $text = RT::Extension::Tika::extract($a->Content);
- }
+ my $err;
+ if ($a->ContentType ne "text/html" and $a->ContentType ne "text/plain" ) {
+ ($text,$err) = RT::Extension::Tika::extract($a->Content);
+ }
+ # if an error occurred processing the attachment log it,
+ # and move on to the next attachment
+ if ($err) {
+ debug("Failed to process attachment " . $a->id . " : " .$err);
+ next ATTACHMENT;
+ }
HTML::Entities::decode_entities($text) if $a->ContentType eq "text/html";
push @insert, join("\n", $a->Subject // "", $text), $a->id;
$found++;
diff --git a/sbin/start-tika-server.in b/sbin/start-tika-server.in
new file mode 100755
index 0000000..c9b5a74
--- /dev/null
+++ b/sbin/start-tika-server.in
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+java -jar $(dirname 1)/local/plugins/RT-Extension-Tika/lib/auto/share/dist/RT-Extension-Tika/tika-server.jar
-----------------------------------------------------------------------
More information about the Bps-public-commit
mailing list