webgui/lib/WebGUI/Search/Index.pm
2006-01-19 05:15:20 +00:00

308 lines
6.6 KiB
Perl

package WebGUI::Search::Index;
=head1 LEGAL
-------------------------------------------------------------------
WebGUI is Copyright 2001-2006 Plain Black Corporation.
-------------------------------------------------------------------
Please read the legal notices (docs/legal.txt) and the license
(docs/license.txt) that came with this distribution before using
this software.
-------------------------------------------------------------------
http://www.plainblack.com info@plainblack.com
-------------------------------------------------------------------
=cut
use strict;
use warnings;
use Plucene::Analysis::SimpleAnalyzer;
use Plucene::Document;
use Plucene::Document::Field;
use Plucene::Index::Reader;
use Plucene::Index::Writer;
use Plucene::Index::Term;
use File::Spec::Functions qw(catfile);
use WebGUI::Utility;
=head1 NAME
Package WebGUI::Search::Index
=head1 DESCRIPTION
A package for working with the WebGUI Search Engine.
=head1 SYNOPSIS
use WebGUI::Search::Index;
=head1 METHODS
These methods are available from this package:
=cut
#-------------------------------------------------------------------
=head2 addDate ( key, epoch )
Adds a date field to the index which may later be used to search on date ranges.
=head3 key
A unique label to store this data.
=head3 epoch
A date represented as the number of seconds since January 1, 1970.
=cut
sub addDate {
    my ($self, $key, $epoch) = @_;

    # The epoch arrives in seconds; scale it to milliseconds, then let
    # toBase36() encode it so the date is stored as a keyword string.
    my $milliseconds = $epoch * 1000;

    return $self->addKeyword($key, toBase36($milliseconds));
}
#-------------------------------------------------------------------
=head2 addKeyword ( key, text )
Adds some text that is stored and indexed, but not tokenized. This is best for single word items like keys.
=head3 key
A unique label to store this data.
=head3 text
A string of text.
=cut
sub addKeyword {
    my ($self, $key, $text) = @_;

    # Build a Plucene "Keyword" field from the pair and attach it to the
    # pending document held in _doc.
    return $self->{_doc}->add(
        Plucene::Document::Field->Keyword($key => $text)
    );
}
#-------------------------------------------------------------------
=head2 addRawText ( text )
This should be used when you're just dumping a big block of raw text into the search indexer. It doesn't store the raw text, just indexes it for key words.
=head3 text
A string of text.
=cut
sub addRawText {
    my ($self, $text) = @_;

    # Accumulate into the _raw buffer, space-separated; commit() later
    # turns the whole buffer into a single field.
    return $self->{_raw} .= ' ' . $text;
}
#-------------------------------------------------------------------
=head2 addText ( key, text )
Adds some text that is stored, indexed, and tokenized. This is best for simple phrases like titles and subjects.
=head3 key
A unique label to store this data.
=head3 text
A string of text.
=cut
sub addText {
    my ($self, $key, $text) = @_;

    # Store the text under its own key as a Plucene "Text" field ...
    $self->{_doc}->add(
        Plucene::Document::Field->Text($key => $text)
    );

    # ... and also feed it into the raw-text buffer.
    return $self->addRawText($text);
}
#-------------------------------------------------------------------
=head2 addUnindexed ( key, text )
Adds some text that is stored but not indexed or tokenized. This should be used sparingly, if ever, and is just a way to store extra metadata with search content that will not actually be used in search matches.
=head3 key
A unique label to store this data.
=head3 text
A string of text.
=cut
sub addUnindexed {
    my ($self, $key, $text) = @_;

    # Attach the pair to the pending document as a Plucene "UnIndexed"
    # field; unlike addText/addUnstored it is not added to the raw buffer.
    return $self->{_doc}->add(
        Plucene::Document::Field->UnIndexed($key => $text)
    );
}
#-------------------------------------------------------------------
=head2 addUnstored ( key, text )
Adds some text that is indexed and tokenized, but is not stored verbatim. This is best for big text blocks like descriptions.
=head3 key
A unique label to store this data.
=head3 text
A string of text.
=cut
sub addUnstored {
    my ($self, $key, $text) = @_;

    # Attach the text to the pending document as a Plucene "UnStored"
    # field under its own key ...
    $self->{_doc}->add(
        Plucene::Document::Field->UnStored($key => $text)
    );

    # ... and also feed it into the raw-text buffer.
    return $self->addRawText($text);
}
#-------------------------------------------------------------------
=head2 commit ( )
Writes the data added using the various add methods to the index. This is the last thing you should do, and it must be done or the index will not be created.
=cut
sub commit {
    my ($self) = @_;
    my $path = $self->{_path};

    # The third constructor argument is 1 when no "segments" file exists
    # under the index path yet and 0 otherwise (presumably Plucene's
    # "create a new index" flag -- confirm against Plucene::Index::Writer).
    my $create = (-e catfile($path, "segments")) ? 0 : 1;
    my $writer = Plucene::Index::Writer->new(
        $path,
        Plucene::Analysis::SimpleAnalyzer->new(),
        $create,
    );

    # _raw is only populated by addRawText(); if nothing was ever added it
    # is undefined, so fall back to an empty string instead of handing
    # undef to the field.
    my $raw = defined $self->{_raw} ? $self->{_raw} : '';
    $self->{_doc}->add(Plucene::Document::Field->UnStored(_raw_ => $raw));

    $writer->add_document($self->{_doc});

    # Release the writer before tearing down this object (presumably this
    # is what flushes the index to disk -- confirm against Plucene).
    undef $writer;

    # Drops the pending document; the object must not be reused afterwards.
    return $self->DESTROY;
}
#-------------------------------------------------------------------
=head2 delete ( )
Deletes this indexed item.
=cut
sub delete {
    my ($self) = @_;

    # Nothing is removed from the index here: the constructor already
    # deletes any existing entry for this id. All that remains is to
    # discard the pending document.
    return $self->DESTROY;
}
#-------------------------------------------------------------------
=head2 DESTROY ( )
Destructor.
=cut
sub DESTROY {
    my ($self) = @_;

    # Drop the pending Plucene document held by this object.
    delete $self->{_doc};

    # Only clears the local lexical, not the caller's reference; kept so
    # that the sub still evaluates to undef for commit() and delete().
    undef $self;
}
#-------------------------------------------------------------------
=head2 getId ( )
Returns the ID used to create this object.
=cut
sub getId {
    my ($self) = @_;

    # The id is whatever was handed to the constructor and stored in _id.
    return $self->{_id};
}
#-------------------------------------------------------------------
=head2 new ( session , id )
Constructor.
=head3 session
A reference to the current session.
=head3 id
The unique ID for this record in the index. Should be the assetId for the content you're indexing.
=cut
sub new {
    my ($class, $session, $id) = @_;
    my $doc = Plucene::Document->new;

    # NOTE(review): the index location is hard-coded to /tmp/plucy1 here and
    # in optimize(); _p holds a path derived from the uploadsPath config
    # setting but nothing in this file reads it. Looks like a development
    # placeholder -- confirm before changing either.
    my $self = bless {
        _path    => "/tmp/plucy1",
        _p       => $session->config->get("uploadsPath")."/assetindex",
        _session => $session,
        _doc     => $doc,
        _id      => $id,
    }, ref($class) || $class;   # two-arg bless so subclasses get their own class

    # Only look for an existing entry once the index has been initialised,
    # i.e. a "segments" file exists under the index path.
    if (-f catfile($self->{_path}, "segments")) {
        my $reader = Plucene::Index::Reader->open($self->{_path});
        my $term   = Plucene::Index::Term->new({ field => 'id', text => $id });

        # Remove any entry previously indexed under this id so the new
        # document replaces it instead of duplicating it.
        $reader->delete_term($term) if $reader->doc_freq($term);

        # Always close the reader, not just when something was deleted.
        $reader->close;
    }

    # Start the new document with its id so it can be found (and replaced)
    # later.
    $doc->add(Plucene::Document::Field->Keyword(id => $id));
    return $self;
}
#-------------------------------------------------------------------
=head2 optimize ( session )
Class method that opens a writer on the search index and calls its optimize method.
=cut
sub optimize {
    my ($class, $session) = @_;    # $session is accepted but not used here

    # Same hard-coded index location that the constructor uses.
    my $path = "/tmp/plucy1";

    # Third constructor argument: 1 when no "segments" file exists yet
    # under the index path, 0 otherwise.
    my $create = (-e catfile($path, "segments")) ? 0 : 1;

    my $writer = Plucene::Index::Writer->new(
        $path,
        Plucene::Analysis::SimpleAnalyzer->new(),
        $create,
    );
    return $writer->optimize;
}
#-------------------------------------------------------------------
=head2 session ( )
Returns a reference to the current session.
=cut
sub session {
    my ($self) = @_;

    # The session is whatever was handed to the constructor and stored
    # in _session.
    return $self->{_session};
}
1;