webgui/lib/WebGUI/Search/Index.pm

339 lines
8.7 KiB
Perl

package WebGUI::Search::Index;
=head1 LEGAL
-------------------------------------------------------------------
WebGUI is Copyright 2001-2009 Plain Black Corporation.
-------------------------------------------------------------------
Please read the legal notices (docs/legal.txt) and the license
(docs/license.txt) that came with this distribution before using
this software.
-------------------------------------------------------------------
http://www.plainblack.com info@plainblack.com
-------------------------------------------------------------------
=cut
use strict;
use HTML::Entities;
=head1 NAME
Package WebGUI::Search::Index
=head1 DESCRIPTION
A package for working with the WebGUI Search Engine.
=head1 SYNOPSIS
use WebGUI::Search::Index;
=head1 METHODS
These methods are available from this package:
=cut
#-------------------------------------------------------------------
=head2 addFile ( path )
Use an external filter defined in the config file as searchIndexerPlugins to pull keywords from a file and
add them to the index.
=head3 path
The path to the filename to index, including the filename.
=cut
sub addFile {
    # Pull keywords out of the file at $path (via getKeywordsForFile) and
    # append them to this asset's index entry (via addKeywords).
    #
    # path    - path to the file, including the filename.
    # returns - the result of addKeywords, or nothing at all when the
    #           extracted text contains no non-whitespace character.
    my ($self, $path) = @_;
    my $extracted = $self->getKeywordsForFile($path);
    if ($extracted =~ /\S/) {
        return $self->addKeywords($extracted);
    }
    return;
}
#-------------------------------------------------------------------
=head2 addKeywords ( text )
Add more text to the keywords index for this asset.
=head3 text
A string (or array of strings) of text. You may optionally also put HTML here, and it will be automatically filtered.
=cut
sub addKeywords {
    # Append text to the keywords stored in assetIndex for this asset.
    #
    # All arguments are joined with single spaces, run through
    # _filterKeywords, and then appended (after a single space) to the
    # keywords currently stored for the row matching this asset's id and url.
    # The row is read first and written back with the combined value.
    my ($self, @chunks) = @_;
    my $addition = $self->_filterKeywords(join(" ", @chunks));
    my $db       = $self->session->db;
    # The same (assetId, url) pair identifies the row for both statements.
    my @rowKey   = ($self->getId, $self->asset->get('url'));
    my ($existing) = $db->quickArray("select keywords from assetIndex where assetId=? and url=?", [@rowKey]);
    return $db->write("update assetIndex set keywords =? where assetId=? and url=?", [$existing.' '.$addition, @rowKey]);
}
#-------------------------------------------------------------------
=head2 addRecord ( %fields )
Adds a duplicate record for the current asset, along with fields that are overridden.
=head3 %fields
A hash of fields to override in the record. Entries for url and keywords are mandatory, and
no record will be added unless they exist in the hash.
The lineage entry cannot be overridden.
=cut
sub addRecord {
    # Write an additional assetIndex row for the current asset, based on the
    # asset's existing row with some columns overridden.
    #
    # %fields - column => value overrides. Both url and keywords must be
    #           present and true, otherwise nothing is written and the sub
    #           returns nothing. keywords is run through _filterKeywords.
    #           A lineage override is ignored: the existing row's lineage is
    #           always used.
    # returns - the result of executing the "replace into" statement.
    my ($self, %override) = @_;
    return if !$override{url} || !$override{keywords};

    my $asset = $self->asset;
    # Start from the row already stored for this asset (matched on id + url).
    my %existing = $self->session->db->quickHash(
        'select * from assetIndex where assetId=? and url=?',
        [$asset->get('assetId'), $asset->get('url')],
    );
    $override{keywords} = $self->_filterKeywords($override{keywords});

    # Overrides win over stored values, except for lineage.
    my %record  = (%existing, %override, lineage => $existing{lineage});
    my @columns = qw/assetId url title creationDate revisionDate ownerUserId groupIdView groupIdEdit lineage className synopsis keywords subId/;

    my $sth = $self->session->db->prepare(
        "replace into assetIndex (assetId, url, title, creationDate, revisionDate,\n"
        . "ownerUserId, groupIdView, groupIdEdit, lineage, className, synopsis, keywords, subId) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? )"
    );
    return $sth->execute([@record{@columns}]);
}
#-------------------------------------------------------------------
=head2 asset ( )
Returns a reference to the asset object we're indexing.
=cut
sub asset {
    # Accessor: hands back the asset object stored under the _asset key.
    my ($self) = @_;
    return $self->{_asset};
}
#-------------------------------------------------------------------
=head2 create ( asset )
Constructor that also creates the initial index of an asset.
=cut
sub create {
    # Constructor that also (re)builds the assetIndex row for an asset.
    #
    # asset   - the asset object to index.
    # returns - the new index object.
    #
    # Any existing rows for the asset are deleted first, then a single row is
    # inserted whose keywords are built from the asset's title, menu title,
    # synopsis, url (with "/", "-" and "_" turned into spaces), filtered
    # description and the keywords returned by WebGUI::Keyword.
    my ($class, $asset) = @_;
    my $self = $class->new($asset);
    $self->delete;

    # Separate the url into individual words.
    my $urlWords = $asset->get("url");
    $urlWords =~ s/\/|\-|\_/ /g;

    my $description = WebGUI::HTML::filter($asset->get('description'), "all");
    my @parts = (
        (map { $asset->get($_) } qw(title menuTitle synopsis)),
        $urlWords,
        $description,
        WebGUI::Keyword->new($self->session)->getKeywordsForAsset({asset=>$asset}),
    );
    my $keywords = $self->_filterKeywords(join(" ", @parts));

    # Fall back to the first 255 characters of the description when the
    # asset has no (true) synopsis of its own.
    my $synopsis = $asset->get("synopsis") || substr($description,0,255);

    my $sth = $self->session->db->prepare(
        "insert into assetIndex (assetId, title, url, creationDate, revisionDate,\n"
        . "ownerUserId, groupIdView, groupIdEdit, lineage, className, synopsis, keywords) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? )"
    );
    $sth->execute([
        $asset->getId,
        (map { $asset->get($_) } qw(title url creationDate revisionDate ownerUserId groupIdView groupIdEdit lineage className)),
        $synopsis,
        $keywords,
    ]);
    return $self;
}
#-------------------------------------------------------------------
=head2 delete ( )
Deletes this indexed asset.
=cut
sub delete {
    # Remove every assetIndex row whose assetId matches this object's id.
    # Returns the result of executing the delete statement.
    my ($self) = @_;
    my $sth = $self->session->db->prepare("delete from assetIndex where assetId=?");
    return $sth->execute([ $self->getId ]);
}
#-------------------------------------------------------------------
=head2 DESTROY ( )
Destructor.
=cut
sub DESTROY {
    # Destructor hook. It only clears its own lexical copy of the object
    # reference; nothing stored inside the object is touched.
    my ($self) = @_;
    undef $self;
}
#-------------------------------------------------------------------
=head2 _filterKeywords ( $keywords )
Perform filtering and cleaning up of the keywords before submitting them. Ideographic characters are padded
so that they are still searchable. HTML entities are decoded.
=head3 $keywords
A string containing keywords.
=cut
sub _filterKeywords {
    # Clean up a keyword string before it is stored in the index.
    #
    # keywords - a string of keywords, possibly containing HTML.
    # returns  - the cleaned words joined with single spaces.
    #
    # The text is passed through WebGUI::HTML::filter (mode "all"), HTML
    # entities are decoded, and the result is split into words.
    my ($self, $text) = @_;
    $text = WebGUI::HTML::filter($text, "all");
    $text = HTML::Entities::decode_entities($text);
    utf8::upgrade($text);

    # Every ideographic character (such as Chinese) is a word of its own;
    # everything else is whitespace delimited. The capture group makes split
    # return the ideographs themselves; empty/undefined pieces are dropped.
    my @tokens = grep { $_ ne '' } split /\s+|(\p{Ideographic})/, $text;

    my @cleaned = map {
        my $token = $_;
        # Trim punctuation from both ends of the word.
        $token =~ s/\A\p{isPunct}+//;
        $token =~ s/\p{isPunct}+\z//;
        # Pad ideographs so they are not dropped by minimum word length
        # limits on indexing.
        $token =~ /\p{Ideographic}/ ? q{''}.$token.q{''} : $token;
    } @tokens;

    return join q{ }, @cleaned;
}
#-------------------------------------------------------------------
=head2 getId ( )
Returns the ID used to create this object.
=cut
sub getId {
    # Accessor: hands back the value stored under the _id key.
    my ($self) = @_;
    return $self->{_id};
}
#-------------------------------------------------------------------
=head2 getKeywordsForFile ( path )
Use an external filter defined in the config file as searchIndexerPlugins to get keywords
from a file.
=head3 path
The path to the filename to index, including the filename.
=cut
sub getKeywordsForFile {
    # Run the external filter configured for a file's extension and return
    # everything the filter prints on its standard output.
    #
    # path    - path to the file, including the filename. The trailing
    #           ".ext" (lower-cased) selects the command from the
    #           searchIndexerPlugins config hash.
    # returns - the filter output as one string; undef when the path has no
    #           such extension, no filter is configured for it, or the pipe
    #           cannot be opened.
    my $self    = shift;
    my $path    = shift;
    my $filters = $self->session->config->get("searchIndexerPlugins");
    my $content;
    if ($path =~ m/\.(\w+)$/) {
        my $type = lc($1);
        if ($filters->{$type}) {
            # The filter command comes from the site config and is still
            # handed to the shell, so it may carry its own arguments. The
            # path, however, is passed as ONE single-quoted shell word:
            # spaces in it no longer split it into several arguments and
            # shell metacharacters in a filename are not interpreted.
            # (Assumes a POSIX-style shell for the quoting rules.)
            (my $quotedPath = $path) =~ s/'/'\\''/g;
            open my $fh, '-|', "$filters->{$type} '$quotedPath'" or return undef; # open pipe to filter
            $content = do { local $/; <$fh> }; # slurp the whole output
            close $fh;
        }
    }
    return $content;
}
#-------------------------------------------------------------------
=head2 setIsPublic ( boolean )
Sets the status of whether this asset will appear in public searches.
=cut
sub setIsPublic {
    # Store the given flag in the isPublic column of every assetIndex row
    # belonging to this asset.
    #
    # boolean - the value written to isPublic.
    # returns - the result of executing the update statement.
    my ($self, $boolean) = @_;
    my $sth = $self->session->db->prepare("update assetIndex set isPublic=? where assetId=?");
    return $sth->execute([ $boolean, $self->getId ]);
}
#-------------------------------------------------------------------
=head2 new ( asset )
Constructor.
=head3 asset
A reference to an asset object.
=cut
sub new {
    # Constructor.
    #
    # asset   - the asset object to work on; its session and id are captured
    #           at construction time.
    # returns - the new object, blessed into the invoking class.
    my ($class, $asset) = @_;
    my %state = (
        _asset   => $asset,
        _session => $asset->session,
        _id      => $asset->getId,
    );
    return bless \%state, $class;
}
#-------------------------------------------------------------------
=head2 session ( )
Returns a reference to the current session.
=cut
sub session {
    # Accessor: hands back the session stored under the _session key.
    my ($self) = @_;
    return $self->{_session};
}
#-------------------------------------------------------------------
=head2 updateSynopsis ( text )
Overrides the asset's default synopsis with a new chunk of text.
NOTE: This doesn't change the asset itself, only the synopsis in the search index.
=head3 text
The text to put in place of the current synopsis.
=cut
sub updateSynopsis {
    # Replace the synopsis column of every assetIndex row belonging to this
    # asset. Only the assetIndex table is written here.
    #
    # text    - the new synopsis text.
    # returns - the result of executing the update statement.
    my ($self, $text) = @_;
    my $sth = $self->session->db->prepare("update assetIndex set synopsis=? where assetId=?");
    return $sth->execute([ $text, $self->getId ]);
}
1;