new search engine stuff

This commit is contained in:
JT Smith 2006-01-20 18:35:05 +00:00
parent 9a4972f3da
commit 18060d8e4e
7 changed files with 91 additions and 323 deletions

View file

@ -16,14 +16,6 @@ package WebGUI::Search::Index;
use strict;
use warnings;
use Plucene::Analysis::SimpleAnalyzer;
use Plucene::Document;
use Plucene::Document::Field;
use Plucene::Index::Reader;
use Plucene::Index::Writer;
use Plucene::Index::Term;
use File::Spec::Functions qw(catfile);
use WebGUI::Utility;
=head1 NAME
@ -46,168 +38,63 @@ These methods are available from this package:
#-------------------------------------------------------------------
=head2 addDate ( key, epoch )
=head2 addKeywords ( text )
Adds a date field to the index which may later be used to search on date ranges.
=head3 key
A unique label to store this data.
=head3 epoch
A date represented as the number of seconds since January 1, 1970.
=cut
sub addDate {
    my ($self, $key, $epoch) = @_;
    # The epoch (seconds) is scaled by 1000 and run through toBase36 so the
    # date can be stored through the ordinary keyword path under $key.
    return $self->addKeyword($key, toBase36($epoch * 1000));
}
#-------------------------------------------------------------------
=head2 addKeyword ( key, text )
Adds some text that is stored and indexed, but not tokenized. This is best for single word items like keys.
=head3 key
A unique label to store this data.
Add more text to the keywords index for this asset.
=head3 text
A string of text.
A string of text. You may optionally also put HTML here, and it will be automatically filtered.
=cut
# addKeyword ( key, text )
# Adds $text under $key to the pending Plucene document as a Keyword field.
# NOTE(review): this region previously had the headers of addKeyword and
# addKeywords stacked on top of each other, with the addKeywords body stranded
# inside addRawText. It appears to come from a diff view with the +/- markers
# stripped; the three subs are separated here with no statements dropped.
sub addKeyword {
    my $self = shift;
    my $key  = shift;
    my $text = shift;
    $self->{_doc}->add(Plucene::Document::Field->Keyword($key => $text));
}

#-------------------------------------------------------------------

=head2 addRawText ( text )

This should be used when you're just dumping a big block of raw text into the search indexer. It doesn't store the raw text, just indexes it for key words.

=head3 text

A string of text.

=cut

sub addRawText {
    my $self = shift;
    # Appends to the accumulated raw text, separated by a single space.
    $self->{_raw} .= ' '.shift;
}

#-------------------------------------------------------------------

# addKeywords ( text )
# Appends $text to the keywords column of this asset's row in assetIndex.
# The text is passed through WebGUI::HTML::filter(..., "all") first.
# NOTE(review): assumes WebGUI::HTML is loaded elsewhere; no `use WebGUI::HTML`
# is visible in this file -- confirm.
sub addKeywords {
    my $self = shift;
    my $text = shift;
    $text = WebGUI::HTML::filter($text, "all");
    my $add = $self->session->db->prepare("update assetIndex set keywords=concat(keywords,' ',?) where assetId = ?");
    $add->execute([$text, $self->getId]);
}
#-------------------------------------------------------------------
=head2 addText ( key, text )
=head2 create ( asset )
Adds some text that is stored, indexed, and tokenized. This is best for simple phrases like titles and subjects.
=head3 key
A unique label to store this data.
=head3 text
A string of text.
Constructor that also creates the initial index of an asset.
=cut
# addText ( key, text )
# Adds $text under $key to the pending Plucene document as a Text field and
# also feeds the same text to addRawText.
# NOTE(review): in the original rendering this sub had no closing brace of its
# own and `sub create` started inside it; the two are separated here.
sub addText {
    my $self = shift;
    my $key  = shift;
    my $text = shift;
    $self->{_doc}->add(Plucene::Document::Field->Text($key => $text));
    $self->addRawText($text);
}

#-------------------------------------------------------------------

# create ( asset )
# Constructor that also writes the initial assetIndex row for $asset.
# Any existing row for the asset is removed first via delete(), then a new row
# is inserted with the asset's properties and a combined keywords string.
# Returns the index object.
sub create {
    my $class = shift;
    my $asset = shift;
    my $self  = $class->new($asset);
    $self->delete;

    # Split the URL into words by turning "/", "-" and "_" into spaces.
    my $url = $asset->get("url");
    $url =~ s/\/|\-|\_/ /g;

    # NOTE(review): the original filtered an undeclared $description in its own
    # declaration. Reading it from the asset's "description" property is an
    # assumption -- confirm.
    my $description = WebGUI::HTML::filter($asset->get("description"), "all");
    my $keywords = join(" ", $asset->get("title"), $asset->get("menuTitle"), $asset->get("synopsis"), $url, $description);

    # Thirteen columns, thirteen placeholders, thirteen bound values.
    my $add = $self->session->db->prepare(
          "insert into assetIndex (assetId, title, startDate, endDate, creationDate, revisionDate, "
        . "ownerUserId, groupIdView, groupIdEdit, lineage, className, synopsis, keywords) "
        . "values ( ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? )"
    );
    $add->execute([
        $asset->getId,
        $asset->get("title"),
        $asset->get("startDate"),
        $asset->get("endDate"),
        $asset->get("creationDate"),
        $asset->get("revisionDate"),
        $asset->get("ownerUserId"),
        $asset->get("groupIdView"),
        $asset->get("groupIdEdit"),
        $asset->get("lineage"),
        $asset->get("className"),
        $asset->get("synopsis"),
        $keywords,
    ]);
    return $self;
}
#-------------------------------------------------------------------
=head2 addUnindexed ( key, text )
Adds some text that is stored but not indexed or tokenized. This should be used sparingly, if ever, and is just a way to store extra metadata with search content that will not actually be used in search matches.
=head3 key
A unique label to store this data.
=head3 text
A string of text.
=cut
sub addUnindexed {
    my ($self, $key, $text) = @_;
    # Build an UnIndexed field from the key/text pair, then attach it to the
    # pending document held in _doc.
    my $field = Plucene::Document::Field->UnIndexed($key => $text);
    return $self->{_doc}->add($field);
}
#-------------------------------------------------------------------
=head2 addUnstored ( key, text )
Adds some text that is indexed and tokenized, but is not stored verbatim. This is best for big text blocks like descriptions.
=head3 key
A unique label to store this data.
=head3 text
A string of text.
=cut
sub addUnstored {
    my ($self, $key, $text) = @_;
    # Attach the text to the pending document as an UnStored field ...
    my $field = Plucene::Document::Field->UnStored($key => $text);
    $self->{_doc}->add($field);
    # ... and also pass it to addRawText, whose result is returned.
    return $self->addRawText($text);
}
#-------------------------------------------------------------------
=head2 commit ( )
Writes the data added using the various add methods to the index. This is the last thing you should do, and it must be done or the index will not be created.
=cut
sub commit {
    my ($self) = @_;
    my $index_dir = $self->{_path};
    my $analyzer  = Plucene::Analysis::SimpleAnalyzer->new();
    # Third writer argument is 1 only when no "segments" file exists yet
    # under the index directory, 0 otherwise.
    my $create = (-e catfile($index_dir, "segments")) ? 0 : 1;
    my $writer = Plucene::Index::Writer->new($index_dir, $analyzer, $create);

    # The accumulated raw text goes in as one extra UnStored field named _raw_.
    my $raw_field = Plucene::Document::Field->UnStored(_raw_ => $self->{_raw});
    $self->{_doc}->add($raw_field);
    $writer->add_document($self->{_doc});

    # Drop the writer before tearing down this object's document.
    undef $writer;
    return $self->DESTROY;
}
#-------------------------------------------------------------------
=head2 delete ( )
Deletes this indexed item.
Deletes this indexed asset.
=cut
sub delete {
    my ($self) = @_;
    # Removes this asset's row from assetIndex. The object itself is left
    # intact: the destructor is no longer called by hand here, because callers
    # such as create() keep using the object afterwards, and Perl runs DESTROY
    # on its own when the last reference goes away.
    my $delete = $self->session->db->prepare("delete from assetIndex where assetId=?");
    $delete->execute([$self->getId]);
}
#-------------------------------------------------------------------
@ -220,7 +107,6 @@ Deconstructor.
sub DESTROY {
    my ($self) = @_;
    # Release the pending document stored under _doc; other keys are untouched.
    delete $self->{_doc};
    # Clears only this sub's own copy of the reference.
    $self = undef;
}
@ -239,53 +125,24 @@ sub getId {
#-------------------------------------------------------------------
=head2 new ( session , id )
=head2 new ( asset )
Constructor.
=head3 session
=head3 asset
A reference to the current session.
=head3 id
The unique ID for this record in the index. Should be the assetId for the content you're indexing.
A reference to an asset object.
=cut
# NOTE(review): two constructor conventions appear interleaved in this sub --
# new(session, id), which builds a Plucene document, and new(asset), which
# builds a session/id-only hash. This looks like a diff view with the +/-
# markers stripped. As written, every statement below runs in order, so a
# third argument (the asset) is required or the $asset->session call dies.
sub new {
my $class = shift;
my $session = shift;
my $id = shift;
my $doc = Plucene::Document->new;
# _path (hard-coded) is the directory used below; _p is stored but not read in this sub.
my $self = {_path => "/tmp/plucy1", _p=>$session->config->get("uploadsPath")."/assetindex", _session=>$session, _doc=>$doc, _id=>$id};
# One-argument bless: blesses into the package this sub is compiled in, not into $class.
bless $self;
if (-f $self->{_path}."/segments") { # don't make the following checks unless the index has been initialized
my $reader = Plucene::Index::Reader->open($self->{_path});
my $term = Plucene::Index::Term->new({ field => 'id', text => $self->getId });
if ($reader->doc_freq($term)) { # delete the existing index if it already exists
$reader->delete_term(Plucene::Index::Term->new({ field => "id", text => $self->getId }));
# The reader is closed only on this branch; when the term is absent it is never closed explicitly.
$reader->close;
}
}
$doc->add(Plucene::Document::Field->Keyword(id => $id)); # create a new index for this id
my $asset = shift;
# NOTE(review): this second `my $self` shadows the blessed hash built above.
# The hash created here is never blessed, yet it is the value returned, so
# method calls on the result (e.g. $self->delete in create) would fail.
my $self = {_session=>$asset->session, _id=>$asset->getId};
return $self;
}
#-------------------------------------------------------------------
=head2 optimize ( session )
=cut
sub optimize {
    # Class method; $session is accepted but not used by this implementation.
    my ($class, $session) = @_;
    my $index_dir = "/tmp/plucy1";
    my $analyzer  = Plucene::Analysis::SimpleAnalyzer->new();
    # Third writer argument is 1 only when the index has no "segments" file yet.
    my $create = (-e catfile($index_dir, "segments")) ? 0 : 1;
    return Plucene::Index::Writer->new($index_dir, $analyzer, $create)->optimize;
}
#-------------------------------------------------------------------
=head2 session ( )
@ -299,10 +156,28 @@ sub session {
return $self->{_session};
}
#-------------------------------------------------------------------
=head2 updateSynopsis ( text )
Overrides the asset's default synopsis with a new chunk of text.
NOTE: This doesn't change the asset itself, only the synopsis in the search index.
=head3 text
The text to put in place of the current synopsis.
=cut
sub updateSynopsis {
    my ($self, $text) = @_;
    # Overwrites the synopsis column of this asset's assetIndex row; only the
    # index table is written here.
    my $sql       = "update assetIndex set synopsis=? where assetId=?";
    my $statement = $self->session->db->prepare($sql);
    return $statement->execute([$text, $self->getId]);
}
1;