new search engine stuff
This commit is contained in:
parent
9a4972f3da
commit
18060d8e4e
7 changed files with 91 additions and 323 deletions
|
|
@ -16,14 +16,6 @@ package WebGUI::Search::Index;
|
|||
|
||||
use strict;
|
||||
use warnings;
|
||||
use Plucene::Analysis::SimpleAnalyzer;
|
||||
use Plucene::Document;
|
||||
use Plucene::Document::Field;
|
||||
use Plucene::Index::Reader;
|
||||
use Plucene::Index::Writer;
|
||||
use Plucene::Index::Term;
|
||||
use File::Spec::Functions qw(catfile);
|
||||
use WebGUI::Utility;
|
||||
|
||||
=head1 NAME
|
||||
|
||||
|
|
@ -46,168 +38,63 @@ These methods are available from this package:
|
|||
|
||||
#-------------------------------------------------------------------
|
||||
|
||||
=head2 addDate ( key, epoch )
|
||||
=head2 addKeywords ( text )
|
||||
|
||||
Adds a date field to the index which may later be used to search on date ranges.
|
||||
|
||||
=head3 key
|
||||
|
||||
A unique label to store this data.
|
||||
|
||||
=head3 epoch
|
||||
|
||||
A date represented as the number of seconds since January 1, 1970.
|
||||
|
||||
=cut
|
||||
|
||||
sub addDate {
    # Stores a date under $key as a keyword field.
    #   key   - label the value is stored under.
    #   epoch - seconds since January 1, 1970; multiplied by 1000 and passed
    #           through toBase36() before being handed to addKeyword().
    my ($self, $key, $epoch) = @_;
    return $self->addKeyword( $key, toBase36( $epoch * 1000 ) );
}
|
||||
|
||||
#-------------------------------------------------------------------
|
||||
|
||||
=head2 addKeyword ( key, text )
|
||||
|
||||
Adds some text that is stored and indexed, but not tokenized. This is best for single word items like keys.
|
||||
|
||||
=head3 key
|
||||
|
||||
A unique label to store this data.
|
||||
Add more text to the keywords index for this asset.
|
||||
|
||||
=head3 text
|
||||
|
||||
A string of text.
|
||||
A string of text. You may optionally also put HTML here, and it will be automatically filtered.
|
||||
|
||||
=cut
|
||||
|
||||
sub addKeyword {
    # Legacy document-based variant: adds $text under $key to the in-memory
    # document as a Keyword field.
    my ($self, $key, $text) = @_;
    return $self->{_doc}->add( Plucene::Document::Field->Keyword( $key => $text ) );
}

sub addKeywords {
    # Appends more text to the keywords column of this asset's assetIndex row.
    #   text - a string of text; it is run through WebGUI::HTML::filter with
    #          the "all" setting before being stored.
    # Returns the result of the statement's execute(), or nothing when there
    # is no text to add.
    # NOTE(review): WebGUI::HTML is called fully qualified, but no
    # "use WebGUI::HTML" is visible in this part of the file - confirm it is
    # loaded before this method runs.
    my ($self, $text) = @_;

    # An undef bind value would turn concat(keywords,' ',?) into NULL and
    # erase the keywords already stored, so bail out early.
    return unless defined $text;

    $text = WebGUI::HTML::filter( $text, "all" );
    return unless defined $text && length $text;

    my $add = $self->session->db->prepare("update assetIndex set keywords=concat(keywords,' ',?) where assetId = ?");
    return $add->execute( [ $text, $self->getId ] );
}

sub addRawText {
    # Legacy document-based variant: appends a space followed by $text to the
    # raw text buffer kept in $self->{_raw}.
    my ($self, $text) = @_;
    $self->{_raw} .= ' ' . $text;
}
|
||||
|
||||
|
||||
#-------------------------------------------------------------------
|
||||
|
||||
=head2 addText ( key, text )
|
||||
=head2 create ( asset )
|
||||
|
||||
Adds some text that is stored, indexed, and tokenized. This is best for simple phrases like titles and subjects.
|
||||
|
||||
=head3 key
|
||||
|
||||
A unique label to store this data.
|
||||
|
||||
=head3 text
|
||||
|
||||
A string of text.
|
||||
Constructor that also creates the initial index of an asset.
|
||||
|
||||
=cut
|
||||
|
||||
sub addText {
    # Legacy document-based variant: adds $text under $key to the in-memory
    # document as a Text field, then feeds the same text to addRawText().
    my ($self, $key, $text) = @_;
    $self->{_doc}->add( Plucene::Document::Field->Text( $key => $text ) );
    return $self->addRawText($text);
}

sub create {
    # Constructor that also writes the initial assetIndex row for an asset.
    #   asset - the asset object to index; it must provide get(), getId()
    #           and whatever new() needs.
    # Returns the new index object.
    my $class = shift;
    my $asset = shift;
    my $self  = $class->new($asset);

    # Remove any existing row for this asset so the insert below cannot
    # duplicate it.
    $self->delete;

    # Turn URL separators into spaces so each path segment is a keyword.
    my $url = $asset->get("url");
    $url = '' unless defined $url;
    $url =~ tr{/_-}{ };

    # NOTE(review): the original read an undeclared $description here; the
    # asset's "description" property is the presumed source - confirm.
    # NOTE(review): WebGUI::HTML is called fully qualified, but no
    # "use WebGUI::HTML" is visible in this part of the file.
    my $description = WebGUI::HTML::filter( $asset->get("description"), "all" );

    my $keywords = join( " ",
        grep { defined }
            scalar $asset->get("title"),
            scalar $asset->get("menuTitle"),
            scalar $asset->get("synopsis"),
            $url,
            $description );

    # Every column except assetId and keywords is read from the asset
    # property of the same name.
    my @properties = qw(
        title startDate endDate creationDate revisionDate
        ownerUserId groupIdView groupIdEdit lineage className synopsis
    );
    my @columns = ( "assetId", @properties, "keywords" );

    # scalar() keeps one value per column even if get() returns an empty list.
    my @values = (
        $asset->getId,
        ( map { scalar $asset->get($_) } @properties ),
        $keywords,
    );

    # Build the placeholder list from the column list so the two counts can
    # never drift apart.
    my $sql = "insert into assetIndex (" . join( ", ", @columns ) . ") values ("
        . join( ", ", ("?") x @columns ) . ")";
    my $add = $self->session->db->prepare($sql);
    $add->execute( \@values );

    return $self;
}
|
||||
|
||||
#-------------------------------------------------------------------
|
||||
|
||||
=head2 addUnindexed ( key, text )
|
||||
|
||||
Adds some text that is stored but not indexed or tokenized. This should be used sparingly, if ever, and is just a way to store extra metadata with search content that will not actually be used in search matches.
|
||||
|
||||
=head3 key
|
||||
|
||||
A unique label to store this data.
|
||||
|
||||
=head3 text
|
||||
|
||||
A string of text.
|
||||
|
||||
=cut
|
||||
|
||||
sub addUnindexed {
    # Adds $text under $key to the in-memory document as an UnIndexed field.
    #   key  - label the value is stored under.
    #   text - a string of text.
    my ($self, $key, $text) = @_;
    my $document = $self->{_doc};
    return $document->add( Plucene::Document::Field->UnIndexed( $key => $text ) );
}
|
||||
|
||||
#-------------------------------------------------------------------
|
||||
|
||||
=head2 addUnstored ( key, text )
|
||||
|
||||
Adds some text that is indexed and tokenized, but is not stored verbatim. This is best for big text blocks like descriptions.
|
||||
|
||||
=head3 key
|
||||
|
||||
A unique label to store this data.
|
||||
|
||||
=head3 text
|
||||
|
||||
A string of text.
|
||||
|
||||
=cut
|
||||
|
||||
sub addUnstored {
    # Adds $text under $key to the in-memory document as an UnStored field,
    # then passes the same text to addRawText().
    #   key  - label the value is indexed under.
    #   text - a string of text.
    my ($self, $key, $text) = @_;
    my $document = $self->{_doc};
    $document->add( Plucene::Document::Field->UnStored( $key => $text ) );
    return $self->addRawText($text);
}
|
||||
|
||||
#-------------------------------------------------------------------
|
||||
|
||||
=head2 commit ( )
|
||||
|
||||
Writes the data added using the various add methods to the index. This is the last thing you should do, and it must be done or the index will not be created.
|
||||
|
||||
=cut
|
||||
|
||||
sub commit {
    # Writes the accumulated document to the index stored at $self->{_path}.
    my ($self) = @_;
    my $indexPath = $self->{_path};

    # The third constructor argument is 1 only when no "segments" file exists
    # yet under the index path, 0 otherwise.
    my $writer = Plucene::Index::Writer->new(
        $indexPath,
        Plucene::Analysis::SimpleAnalyzer->new(),
        -e catfile( $indexPath, "segments" ) ? 0 : 1
    );

    # The raw text buffer goes in as one extra UnStored field named _raw_.
    my $document = $self->{_doc};
    $document->add( Plucene::Document::Field->UnStored( _raw_ => $self->{_raw} ) );
    $writer->add_document($document);

    # Release the writer before tearing down this object.
    undef $writer;
    $self->DESTROY;
}
|
||||
|
||||
#-------------------------------------------------------------------
|
||||
|
||||
=head2 delete ( )
|
||||
|
||||
Deletes this indexed item.
|
||||
Deletes this indexed asset.
|
||||
|
||||
=cut
|
||||
|
||||
sub delete {
    # Deletes this asset's row from the assetIndex table.
    # Returns the result of the statement's execute().
    my ($self) = @_;

    # Drop any pending in-memory document. This was the one observable effect
    # of the explicit DESTROY call that used to sit here; destructors should
    # be left for Perl to invoke.
    delete $self->{_doc};

    my $statement = $self->session->db->prepare("delete from assetIndex where assetId=?");
    return $statement->execute( [ $self->getId ] );
}
|
||||
|
||||
#-------------------------------------------------------------------
|
||||
|
|
@ -220,7 +107,6 @@ Deconstructor.
|
|||
|
||||
sub DESTROY {
    # Deconstructor: removes the _doc entry from the object's hash and clears
    # the local reference to the object.
    my ($self) = @_;
    delete $self->{_doc} if exists $self->{_doc};
    undef $self;
}
|
||||
|
||||
|
|
@ -239,53 +125,24 @@ sub getId {
|
|||
|
||||
#-------------------------------------------------------------------
|
||||
|
||||
=head2 new ( session , id )
|
||||
=head2 new ( asset )
|
||||
|
||||
Constructor.
|
||||
|
||||
=head3 session
|
||||
=head3 asset
|
||||
|
||||
A reference to the current session.
|
||||
|
||||
=head3 id
|
||||
|
||||
The unique ID for this record in the index. Should be the assetId for the content you're indexing.
|
||||
A reference to an asset object.
|
||||
|
||||
=cut
|
||||
|
||||
sub new {
    # Constructor.
    #   asset - an asset object; its session() and getId() values are cached
    #           on the new index object as _session and _id.
    # Returns the new object, blessed into the class the constructor was
    # called on so that subclasses work.
    my ($class, $asset) = @_;
    my $self = {
        _session => $asset->session,
        _id      => $asset->getId,
    };
    return bless $self, $class;
}
|
||||
|
||||
|
||||
#-------------------------------------------------------------------
|
||||
|
||||
=head2 optimize ( session )
|
||||
|
||||
=cut
|
||||
|
||||
sub optimize {
    # Class method: opens a writer on the index at /tmp/plucy1 and calls its
    # optimize(). The session argument is accepted but not used.
    my ($class, $session) = @_;
    my $indexPath = "/tmp/plucy1";

    # The third constructor argument is 1 only when no "segments" file exists
    # yet under the index path, 0 otherwise.
    my $writer = Plucene::Index::Writer->new(
        $indexPath,
        Plucene::Analysis::SimpleAnalyzer->new(),
        -e catfile( $indexPath, "segments" ) ? 0 : 1
    );
    $writer->optimize;
}
|
||||
|
||||
|
||||
#-------------------------------------------------------------------
|
||||
|
||||
=head2 session ( )
|
||||
|
|
@ -299,10 +156,28 @@ sub session {
|
|||
return $self->{_session};
|
||||
}
|
||||
|
||||
#-------------------------------------------------------------------
|
||||
|
||||
=head2 updateSynopsis ( text )
|
||||
|
||||
Overrides the asset's default synopsis with a new chunk of text.
|
||||
|
||||
NOTE: This doesn't change the asset itself, only the synopsis in the search index.
|
||||
|
||||
=head3 text
|
||||
|
||||
The text to put in place of the current synopsis.
|
||||
|
||||
=cut
|
||||
|
||||
sub updateSynopsis {
    # Replaces the synopsis stored in this asset's assetIndex row.
    #   text - the text to store in place of the current synopsis.
    # Only the assetIndex table is written to here.
    my ($self, $text) = @_;
    my $db        = $self->session->db;
    my $statement = $db->prepare("update assetIndex set synopsis=? where assetId=?");
    return $statement->execute( [ $text, $self->getId ] );
}
|
||||
|
||||
|
||||
|
||||
1;
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue