new search engine stuff

This commit is contained in:
JT Smith 2006-01-20 18:35:05 +00:00
parent 9a4972f3da
commit 18060d8e4e
7 changed files with 91 additions and 323 deletions

View file

@ -17,12 +17,6 @@ save you many hours of grief.
* You need to upgrade to the latest 6.8 release before you can
upgrade to 6.9.
* Due to new features in 6.9 there are several new perl modules you
need to install before the upgrade:
Bit::Vector::Minimal
Plucene
* If you wish to use one of the new optional template engines
you'll need to install the perl modules for them:

View file

@ -26,6 +26,43 @@ removeFiles();
finish($session); # this line required
#-------------------------------------------------
sub addSearchEngine {
    # Upgrade step: creates the assetIndex table used by the new search
    # engine, removes every IndexedSearch asset and its per-asset rows, and
    # drops the tables that belonged to the old IndexedSearch wobject.
    # Relies on the file-level $session and $quiet variables.
    print "\tUpgrading search engine.\n" unless ($quiet);
    my $db = $session->db;
    $db->write("create table assetIndex (
        assetId varchar(22) binary not null primary key,
        title varchar(255),
        synopsis text,
        startDate bigint,
        endDate bigint,
        creationDate bigint,
        modifiedDate bigint,
        ownerUserId varchar(22) binary,
        groupToView varchar(22) binary,
        groupToEdit varchar(22) binary,
        lineage varchar(255),
        className varchar(255),
        keywords mediumtext,
        fulltext (keywords)
        )");

    # Collect the ids before the asset rows disappear; they are needed to
    # clean up the dependent wobject and assetData rows afterwards.
    my @searchIds = $db->buildArray("select assetId from asset where className='WebGUI::Asset::Wobject::IndexedSearch'");
    $db->write("delete from asset where className='WebGUI::Asset::Wobject::IndexedSearch'");
    my $deleteWobject = $db->prepare("delete from wobject where assetId=?");
    my $deleteAssetData = $db->prepare("delete from assetData where assetId=?");
    foreach my $id (@searchIds) {
        $deleteWobject->execute($id);
        $deleteAssetData->execute($id);
    }
    $deleteWobject->finish;
    $deleteAssetData->finish;

    # Tables owned by the retired IndexedSearch wobject.
    my @obsoleteTables = qw(
        IndexedSearch
        IndexedSearch_default
        IndexedSearch_default_data
        IndexedSearch_default_words
        IndexedSearch_docInfo
    );
    foreach my $table (@obsoleteTables) {
        $db->write("drop table if exists $table");
    }
}
#-------------------------------------------------
sub templateParsers {
print "\tAdding support for multiple template parsers.\n" unless ($quiet);
@ -46,6 +83,8 @@ sub removeFiles {
unlink '../../lib/WebGUI/Style.pm';
unlink '../../lib/WebGUI/Setting.pm';
unlink '../../lib/WebGUI/Grouping.pm';
unlink '../../lib/WebGUI/Asset/Wobject/IndexedSearch.pm';
rmtree('../../lib/WebGUI/Asset/Wobject/IndexedSearch');
}

View file

@ -115,9 +115,8 @@ An array reference containing a list of values to be used in the placeholders de
sub execute {
    # Executes the prepared statement exactly once.
    #
    # Placeholder values may be given as a single array reference (the
    # documented form) or as a plain list of values. No argument, or a single
    # undefined argument, means "no placeholders".
    #
    # Returns whatever the underlying statement handle's execute returns; on
    # failure the session error handler's fatal() is invoked.
    my $self = shift;
    my @placeholders;
    if (@_ == 1 && ref($_[0]) eq 'ARRAY') {
        @placeholders = @{ $_[0] };
    }
    elsif (@_ == 1 && !defined $_[0]) {
        @placeholders = ();
    }
    else {
        @placeholders = @_;
    }
    my $sql = $self->{_sql};
    $self->sth->execute(@placeholders) or $self->db->session->errorHandler->fatal("Couldn't execute prepared statement: $sql Root cause: ". $self->errorMessage);
}

View file

@ -1,91 +0,0 @@
package WebGUI::Search::DateTimeFilter;
=head1 LEGAL
-------------------------------------------------------------------
WebGUI is Copyright 2001-2006 Plain Black Corporation.
-------------------------------------------------------------------
Please read the legal notices (docs/legal.txt) and the license
(docs/license.txt) that came with this distribution before using
this software.
-------------------------------------------------------------------
http://www.plainblack.com info@plainblack.com
-------------------------------------------------------------------
=cut
use strict;
use base 'Plucene::Search::Filter';
use Bit::Vector::Minimal;
use Plucene::Index::Term;
use WebGUI::Utility;
=head1 DESCRIPTION
This package is a replacement for Plucene::Search::DateFilter that uses epochs rather than Time::Piece objects.
=cut
#-------------------------------------------------------------------
=head2 new ( field [, from, to] )
Constructor.
=head3 field
The field name to build the filter for.
=head3 from
An epoch date to start the search from. Defaults to now minus one year.
=head3 to
An epoch date to end searching on. Defaults to now.
=cut
sub new {
    # Takes the class name and a single hash reference with the keys
    # field, from and to. Both dates are epochs; they are converted to
    # milliseconds and stored base36 encoded.
    my ($class, $args) = @_;
    my $oneYear = 60*60*24*365;
    my $from = $args->{from} || time() - $oneYear;   # default: one year ago
    my $to = $args->{to} || time();                  # default: now
    my %filter = (
        field => $args->{field},
        from => toBase36($from * 1000),
        to => toBase36($to * 1000),
    );
    return bless \%filter, $class;
}
#-------------------------------------------------------------------
=head2 bits ( )
The actual filter method required by Plucene::Search::IndexSearcher.
=cut
sub bits {
    # Builds a bit vector sized to the reader's max_doc and sets the bit of
    # every document found under the terms enumerated from the "from" term up
    # to and including the "to" term of the configured field.
    my $self = shift;
    my $reader = shift;
    my $vector = Bit::Vector::Minimal->new(size => $reader->max_doc);
    my $lowerBound = Plucene::Index::Term->new({
        field => $self->{field},
        text => $self->{from},
    });
    my $terms = $reader->terms($lowerBound);
    unless ($terms->term) {
        # nothing to enumerate, so hand back the untouched vector
        return $vector;
    }
    my $docs = $reader->term_docs;
    my $upperBound = Plucene::Index::Term->new({
        field => $self->{field},
        text => $self->{to},
    });
    while ($terms->term->le($upperBound)) {
        $docs->seek($terms->term);
        while ($docs->next) {
            $vector->set($docs->doc);
        }
        $terms->next or last;
    }
    return $vector;
}
1;

View file

@ -16,14 +16,6 @@ package WebGUI::Search::Index;
use strict;
use warnings;
use Plucene::Analysis::SimpleAnalyzer;
use Plucene::Document;
use Plucene::Document::Field;
use Plucene::Index::Reader;
use Plucene::Index::Writer;
use Plucene::Index::Term;
use File::Spec::Functions qw(catfile);
use WebGUI::Utility;
=head1 NAME
@ -46,168 +38,63 @@ These methods are available from this package:
#-------------------------------------------------------------------
=head2 addDate ( key, epoch )
=head2 addKeywords ( text )
Adds a date field to the index which may later be used to search on date ranges.
=head3 key
A unique label to store this data.
=head3 epoch
A date represented as the number of seconds since January 1, 1970.
=cut
sub addDate {
    # Stores the epoch, converted to milliseconds and base36 encoded, as a
    # keyword field under the given key.
    my ($self, $key, $epoch) = @_;
    my $milliseconds = $epoch * 1000;
    return $self->addKeyword($key, toBase36($milliseconds));
}
#-------------------------------------------------------------------
=head2 addKeyword ( key, text )
Adds some text that is stored and indexed, but not tokenized. This is best for single word items like keys.
=head3 key
A unique label to store this data.
Add more text to the keywords index for this asset.
=head3 text
A string of text.
A string of text. You may optionally also put HTML here, and it will be automatically filtered.
=cut
sub addKeywords {
    # Appends more text to the keywords column of this asset's assetIndex row.
    # The text is passed through WebGUI::HTML::filter with the "all" setting
    # before it is stored.
    my $self = shift;
    my $text = shift;
    $text = WebGUI::HTML::filter($text, "all");
    # ifnull() keeps the append working when keywords is still NULL, because
    # concat() yields NULL as soon as one of its arguments is NULL.
    my $add = $self->session->db->prepare("update assetIndex set keywords=concat(ifnull(keywords,''),' ',?) where assetId = ?");
    $add->execute([$text, $self->getId]);
}
#-------------------------------------------------------------------
=head2 addText ( key, text )
=head2 create ( asset )
Adds some text that is stored, indexed, and tokenized. This is best for simple phrases like titles and subjects.
=head3 key
A unique label to store this data.
=head3 text
A string of text.
Constructor that also creates the initial index of an asset.
=cut
sub addText {
my $self = shift;
my $key = shift;
my $text = shift;
$self->{_doc}->add(Plucene::Document::Field->Text($key => $text));
$self->addRawText($text);
sub create {
    # Class-method constructor: builds the index object for the given asset,
    # deletes any assetIndex row already stored for it, and inserts a fresh
    # row built from the asset's properties. Returns the index object.
    my $class = shift;
    my $asset = shift;
    my $self = $class->new($asset);
    $self->delete;

    # Turn the url into separate words so its parts can be matched.
    my $url = $asset->get("url");
    $url = "" unless defined $url;
    $url =~ s/\/|\-|\_/ /g;

    # NOTE(review): presumably the text to filter is the asset's
    # "description" property -- confirm against the asset definition.
    my $description = WebGUI::HTML::filter($asset->get("description"), "all");
    my $keywords = join(" ", grep { defined $_ }
        $asset->get("title"), $asset->get("menuTitle"), $asset->get("synopsis"), $url, $description);

    # Column names follow the assetIndex table definition from the upgrade
    # script; there is one placeholder per column (13 in total).
    my $add = $self->session->db->prepare("insert into assetIndex (assetId, title, startDate, endDate, creationDate, modifiedDate,
        ownerUserId, groupToView, groupToEdit, lineage, className, synopsis, keywords) values ( ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? )");
    $add->execute([
        $asset->getId, $asset->get("title"), $asset->get("startDate"), $asset->get("endDate"), $asset->get("creationDate"),
        $asset->get("revisionDate"), $asset->get("ownerUserId"), $asset->get("groupIdView"), $asset->get("groupIdEdit"),
        $asset->get("lineage"), $asset->get("className"), $asset->get("synopsis"), $keywords,
    ]);
    return $self;
}
#-------------------------------------------------------------------
=head2 addUnindexed ( key, text )
Adds some text that is stored but not indexed or tokenized. This should be used sparingly, if ever, and is just a way to store extra metadata with search content that will not actually be used in search matches.
=head3 key
A unique label to store this data.
=head3 text
A string of text.
=cut
sub addUnindexed {
    # Adds an UnIndexed field holding $text under $key to the document being
    # built.
    my ($self, $key, $text) = @_;
    my $document = $self->{_doc};
    $document->add(Plucene::Document::Field->UnIndexed($key, $text));
}
#-------------------------------------------------------------------
=head2 addUnstored ( key, text )
Adds some text that is indexed and tokenized, but is not stored verbatim. This is best for big text blocks like descriptions.
=head3 key
A unique label to store this data.
=head3 text
A string of text.
=cut
sub addUnstored {
    # Adds an UnStored field holding $text under $key to the document being
    # built, then feeds the same text to addRawText.
    my ($self, $key, $text) = @_;
    my $document = $self->{_doc};
    $document->add(Plucene::Document::Field->UnStored($key, $text));
    $self->addRawText($text);
}
#-------------------------------------------------------------------
=head2 commit ( )
Writes the data added using the various add methods to the index. This is the last thing you should do, and it must be done or the index will not be created.
=cut
sub commit {
    # Opens an index writer on the configured path, appends the accumulated
    # raw text as an UnStored "_raw_" field, writes the document, releases
    # the writer and finally calls DESTROY on this object.
    my ($self) = @_;
    my $indexPath = $self->{_path};
    my $analyzer = Plucene::Analysis::SimpleAnalyzer->new();
    # ask the writer to create the index only when no segments file exists yet
    my $createIndex = (-e catfile($indexPath, "segments")) ? 0 : 1;
    my $writer = Plucene::Index::Writer->new($indexPath, $analyzer, $createIndex);
    my $document = $self->{_doc};
    $document->add(Plucene::Document::Field->UnStored("_raw_", $self->{_raw}));
    $writer->add_document($document);
    undef $writer;
    $self->DESTROY;
}
#-------------------------------------------------------------------
=head2 delete ( )
Deletes this indexed item.
Deletes this indexed asset.
=cut
sub delete {
    # Removes this asset's row from the assetIndex table.
    my $self = shift;
    my $delete = $self->session->db->prepare("delete from assetIndex where assetId=?");
    $delete->execute([$self->getId]);
}
#-------------------------------------------------------------------
@ -220,7 +107,6 @@ Deconstructor.
sub DESTROY {
    # Drops the cached document entry, if there is one, and clears the local
    # reference to the object.
    my ($self) = @_;
    delete $self->{_doc};
    undef $self;
}
@ -239,53 +125,24 @@ sub getId {
#-------------------------------------------------------------------
=head2 new ( session , id )
=head2 new ( asset )
Constructor.
=head3 session
=head3 asset
A reference to the current session.
=head3 id
The unique ID for this record in the index. Should be the assetId for the content you're indexing.
A reference to an asset object.
=cut
sub new {
    # Constructor. Takes an asset object and remembers the asset's session
    # and id; it does not touch the database.
    my $class = shift;
    my $asset = shift;
    my $self = { _session => $asset->session, _id => $asset->getId };
    # Bless into the calling class so the object's methods can be invoked on
    # the returned reference.
    return bless $self, $class;
}
#-------------------------------------------------------------------
=head2 optimize ( session )
=cut
sub optimize {
    # Class method: opens a writer on the fixed index path and asks it to
    # optimize the index. The session argument is accepted but not used.
    my ($class, $session) = @_;
    my $indexPath = "/tmp/plucy1";
    my $analyzer = Plucene::Analysis::SimpleAnalyzer->new();
    # create the index only when no segments file exists yet
    my $createIndex = (-e catfile($indexPath, "segments")) ? 0 : 1;
    Plucene::Index::Writer->new($indexPath, $analyzer, $createIndex)->optimize;
}
#-------------------------------------------------------------------
=head2 session ( )
@ -299,10 +156,28 @@ sub session {
return $self->{_session};
}
#-------------------------------------------------------------------
=head2 updateSynopsis ( text )
Overrides the asset's default synopsis with a new chunk of text.
NOTE: This doesn't change the asset itself, only the synopsis in the search index.
=head3 text
The text to put in place of the current synopsis.
=cut
sub updateSynopsis {
    # Replaces the synopsis stored in this asset's assetIndex row with $text.
    my ($self, $text) = @_;
    my $sql = "update assetIndex set synopsis=? where assetId=?";
    my $update = $self->session->db->prepare($sql);
    $update->execute([$text, $self->getId]);
}
1;

View file

@ -23,7 +23,7 @@ use Tie::IxHash;
our @ISA = qw(Exporter);
our @EXPORT = qw(&isBetween &makeTabSafe &makeArrayTabSafe &randomizeHash &commify &randomizeArray
&formatBytes &sortHashDescending &sortHash &isIn &makeCommaSafe &makeArrayCommaSafe &randint &round
&fromBase36 &toBase36);
);
=head1 NAME
@ -39,7 +39,6 @@ This package provides miscellaneous but useful utilities to the WebGUI programme
use WebGUI::Utility;
$string = commify($integer);
$size = formatBytes($integer);
$number = fromBase36($string);
$boolean = isIn($value, @array);
makeArrayCommaSafe(\@array);
makeArrayTabSafe(\@array);
@ -50,7 +49,6 @@ This package provides miscellaneous but useful utilities to the WebGUI programme
$hashRef = randomizeHash(\%hash);
%hash = sortHash(%hash);
%hash = sortHashDescending(%hash);
$string = toBase36($number);
=head1 METHODS
@ -101,27 +99,6 @@ sub formatBytes {
}
}
#-------------------------------------------------------------------
=head2 fromBase36 ( string )
Returns a number that has been decoded from base36.
=head3 string
A base 36 encoded string.
=cut
sub fromBase36 {
    # Decodes a base36 string (digits 0-9 and letters a-z, either case) into
    # a number. Leading zeros are ignored; an empty or undefined string
    # decodes to 0.
    my $string = shift;
    $string = "" unless defined $string;
    my $number = 0;
    # Accumulate left to right with integer arithmetic instead of summing
    # floating point powers of 36.
    foreach my $char (split //, lc $string) {
        my $digit = ($char =~ /[0-9]/) ? $char : ord($char) - 87;   # ord('a') - 87 == 10
        $number = $number * 36 + $digit;
    }
    return $number;
}
#-------------------------------------------------------------------
@ -385,29 +362,6 @@ sub sortHashDescending {
return %newHash;
}
#-------------------------------------------------------------------
=head2 toBase36 ( number )
Returns a string that is base36 encoded.
=head3 number
A number that you wish to encode.
=cut
sub toBase36 {
    # Encodes a number as a lower-case base36 string, left padded with zeros
    # to a minimum width of nine characters.
    my $remaining = shift;
    my @digits = (0 .. 9, 'a' .. 'z');
    my $encoded = "";
    while ($remaining) {
        $encoded = $digits[$remaining % 36] . $encoded;
        $remaining = int($remaining / 36);
    }
    my $missing = 9 - length($encoded);
    $encoded = ("0" x $missing) . $encoded if $missing > 0;
    return $encoded;
}
1;

View file

@ -77,8 +77,6 @@ checkModule("Parse::PlainConfig",1.1);
checkModule("XML::RSSLite",0.11);
checkModule("JSON",0.991);
checkModule("Finance::Quote",1.08);
checkModule("Bit::Vector::Minimal",1.3);
checkModule("Plucene",1.24);
#checkModule("POE",0.3202);
#checkModule("POE::Component::IKC::Server",0.18);
#checkModule("POE::Component::JobQueue",0.5402);