[rt-users] mysql & sphinx

Arkadiusz Miskiewicz arekm at maven.pl
Mon Aug 29 02:32:11 EDT 2011


On Monday 29 of August 2011, Alex Vandiver wrote:

> Does that help to clarify the limitation?

Yes, it does. Thanks.

More questions follows. I see that sphinx is learning
only attachments with ContentType = 'text/plain' entries
which looks unfortunate since I have tons of html email.

Did indexing text/html and having html_strip=1 [1]
in sphinx produce any problems that caused only text/plain
to be choosen for indexation in rt-setup-fulltext-index?

1. http://sphinxsearch.com/docs/current.html#conf-html-strip

ps. here is my setup with delta indexes, could be useful for
other prople

source rt {
    type            = mysql

    sql_host        = localhost
    sql_db          = rt3
    sql_user        = 
    sql_pass        = 

    sql_query_pre   = SET NAMES utf8
    sql_query_pre   = REPLACE INTO SphinxCounters SELECT 1, MAX(id) FROM Attachments
    sql_query       = \
        SELECT a.id, a.content FROM Attachments a \
        JOIN Transactions txn ON a.TransactionId = txn.id AND txn.ObjectType = 'RT::Ticket' \
        JOIN Tickets t ON txn.ObjectId = t.id \
        WHERE a.ContentType = 'text/plain' AND t.Status != 'deleted' \
        AND a.id<=( SELECT max_doc_id FROM SphinxCounters WHERE counter_id=1 )

    sql_query_info  = SELECT * FROM Attachments WHERE id=$id
}

source rt_delta : rt {
    sql_query_pre   = SET NAMES utf8
    sql_query       = \
        SELECT a.id, a.content FROM Attachments a \
        JOIN Transactions txn ON a.TransactionId = txn.id AND txn.ObjectType = 'RT::Ticket' \
        JOIN Tickets t ON txn.ObjectId = t.id \
        WHERE a.ContentType = 'text/plain' AND t.Status != 'deleted' \
        AND a.id>( SELECT max_doc_id FROM SphinxCounters WHERE counter_id=1 )

    sql_query_info  = SELECT * FROM Attachments WHERE id=$id
    sql_query_post_index    =
}


index rt {
    source                  = rt
    path                    = /var/lib/sphinx/rt.index
    docinfo                 = extern
    charset_type            = utf-8
    charset_table   = 0..9, A..Z->a..z, a..z, U+0143->U+0144, U+0104->U+0105, U+0106->U+0107, U+0118->U+0119, U+0141->U+0142, U+00D3->U+00F3, U+015A->U+015B,
 U+0179->U+017A, U+017B->U+017C, U+0105, U+0107, U+0119, U+0142, U+00F3, U+015B, U+017A, U+017C, U+0144
}

(various charset_table are required for many non-english languages)

index rt_delta : rt
{
    source          = rt_delta
    path            = /var/lib/sphinx/rt.delta.index
}

CREATE TABLE `AttachmentsIndex` (
  `id` int(10) unsigned NOT NULL,
  `weight` int(11) NOT NULL,
  `query` varchar(3072) NOT NULL,
  KEY `query` (`query`(255))
) ENGINE=SPHINX DEFAULT CHARSET=utf8 CONNECTION='sphinx://127.0.0.1:3312/rt,rt_delta'

(note, two indexes in CONNECTION)

CREATE TABLE `SphinxCounters` (
  `counter_id` int(11) NOT NULL,
  `max_doc_id` int(11) NOT NULL,
  PRIMARY KEY (`counter_id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8


all indexes updated daily, rt_delta updated every 15 minutes

# cat /etc/cron.d/sphinx
@daily      root /usr/bin/indexer --quiet --rotate --all
*/15 * * * * root /usr/bin/indexer --quiet --rotate rt_delta


-- 
Arkadiusz Miśkiewicz        PLD/Linux Team
arekm / maven.pl            http://ftp.pld-linux.org/



More information about the rt-users mailing list