[rt-users] mysql & sphinx
Arkadiusz Miskiewicz
arekm at maven.pl
Mon Aug 29 02:32:11 EDT 2011
On Monday 29 of August 2011, Alex Vandiver wrote:
> Does that help to clarify the limitation?
Yes, it does. Thanks.
More questions follows. I see that sphinx is learning
only attachments with ContentType = 'text/plain' entries
which looks unfortunate since I have tons of html email.
Did indexing text/html and having html_strip=1 [1]
in sphinx produce any problems that caused only text/plain
to be choosen for indexation in rt-setup-fulltext-index?
1. http://sphinxsearch.com/docs/current.html#conf-html-strip
ps. here is my setup with delta indexes, could be useful for
other prople
source rt {
type = mysql
sql_host = localhost
sql_db = rt3
sql_user =
sql_pass =
sql_query_pre = SET NAMES utf8
sql_query_pre = REPLACE INTO SphinxCounters SELECT 1, MAX(id) FROM Attachments
sql_query = \
SELECT a.id, a.content FROM Attachments a \
JOIN Transactions txn ON a.TransactionId = txn.id AND txn.ObjectType = 'RT::Ticket' \
JOIN Tickets t ON txn.ObjectId = t.id \
WHERE a.ContentType = 'text/plain' AND t.Status != 'deleted' \
AND a.id<=( SELECT max_doc_id FROM SphinxCounters WHERE counter_id=1 )
sql_query_info = SELECT * FROM Attachments WHERE id=$id
}
source rt_delta : rt {
sql_query_pre = SET NAMES utf8
sql_query = \
SELECT a.id, a.content FROM Attachments a \
JOIN Transactions txn ON a.TransactionId = txn.id AND txn.ObjectType = 'RT::Ticket' \
JOIN Tickets t ON txn.ObjectId = t.id \
WHERE a.ContentType = 'text/plain' AND t.Status != 'deleted' \
AND a.id>( SELECT max_doc_id FROM SphinxCounters WHERE counter_id=1 )
sql_query_info = SELECT * FROM Attachments WHERE id=$id
sql_query_post_index =
}
index rt {
source = rt
path = /var/lib/sphinx/rt.index
docinfo = extern
charset_type = utf-8
charset_table = 0..9, A..Z->a..z, a..z, U+0143->U+0144, U+0104->U+0105, U+0106->U+0107, U+0118->U+0119, U+0141->U+0142, U+00D3->U+00F3, U+015A->U+015B,
U+0179->U+017A, U+017B->U+017C, U+0105, U+0107, U+0119, U+0142, U+00F3, U+015B, U+017A, U+017C, U+0144
}
(various charset_table are required for many non-english languages)
index rt_delta : rt
{
source = rt_delta
path = /var/lib/sphinx/rt.delta.index
}
CREATE TABLE `AttachmentsIndex` (
`id` int(10) unsigned NOT NULL,
`weight` int(11) NOT NULL,
`query` varchar(3072) NOT NULL,
KEY `query` (`query`(255))
) ENGINE=SPHINX DEFAULT CHARSET=utf8 CONNECTION='sphinx://127.0.0.1:3312/rt,rt_delta'
(note, two indexes in CONNECTION)
CREATE TABLE `SphinxCounters` (
`counter_id` int(11) NOT NULL,
`max_doc_id` int(11) NOT NULL,
PRIMARY KEY (`counter_id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8
all indexes updated daily, rt_delta updated every 15 minutes
# cat /etc/cron.d/sphinx
@daily root /usr/bin/indexer --quiet --rotate --all
*/15 * * * * root /usr/bin/indexer --quiet --rotate rt_delta
--
Arkadiusz Miśkiewicz PLD/Linux Team
arekm / maven.pl http://ftp.pld-linux.org/
More information about the rt-users
mailing list