webgui/sbin/search.pl
2011-01-03 21:42:31 -06:00

243 lines
7 KiB
Perl
Executable file

#!/usr/bin/env perl
#-------------------------------------------------------------------
# WebGUI is Copyright 2001-2009 Plain Black Corporation.
#-------------------------------------------------------------------
# Please read the legal notices (docs/legal.txt) and the license
# (docs/license.txt) that came with this distribution before using
# this software.
#-------------------------------------------------------------------
# http://www.plainblack.com info@plainblack.com
#-------------------------------------------------------------------
use strict;
use File::Basename ();
use File::Spec;
# Absolute path of the directory one level above the directory that holds
# this script. It is used both to locate the 'lib' directory below and as
# the first argument to the WebGUI::Session / WebGUI::Config calls later on.
my $webguiRoot;
BEGIN {
# Computed inside BEGIN so that @INC is already extended when the
# "use WebGUI::..." statements further down are compiled.
# dirname(__FILE__)/.. is made absolute so the result does not depend on
# the directory the script is started from.
$webguiRoot = File::Spec->rel2abs(File::Spec->catdir(File::Basename::dirname(__FILE__), File::Spec->updir));
# Search <root>/lib before every other module directory.
unshift @INC, File::Spec->catdir($webguiRoot, 'lib');
}
# Prepend every directory listed in preload.custom to @INC.
#
# preload.custom is looked up next to this script rather than in the current
# working directory, so the script behaves the same no matter where it is
# started from (matching how $webguiRoot is derived from __FILE__).
# A missing or unreadable file simply yields no extra directories.
#
# NOTE: this loop executes at run time, i.e. after all "use" statements in
# this file have been compiled, so the extra directories only affect modules
# that are loaded later, while the script is running.
foreach my $libDir (
    readLines(
        File::Spec->catfile( File::Basename::dirname(__FILE__), 'preload.custom' )
    )
) {
    if ( !-d $libDir ) {
        warn "WARNING: Not adding lib directory '$libDir' from preload.custom: Directory does not exist.\n";
        next;
    }
    unshift @INC, $libDir;
}
use Getopt::Long;
use WebGUI::Asset;
use WebGUI::Config;
use WebGUI::Session;
use WebGUI::Search;
use WebGUI::Search::Index;
use Time::HiRes;
use Pod::Usage;
# Unbuffer STDOUT so progress lines show up as soon as they are printed.
$| = 1;

# Command line options; the POD at the end of this file describes each one.
my $search     = "";
my $help       = 0;
my $indexsite  = 0;
my $configFile = "";
my $indexAll   = "";
my $updatesite = 0;

# GetOptions returns false when the command line could not be parsed
# (it has already printed a diagnostic by then); treat that as a usage error
# instead of carrying on with half-parsed options.
GetOptions(
    'indexall'     => \$indexAll,
    'configFile=s' => \$configFile,
    'search=s'     => \$search,
    'help'         => \$help,
    'indexsite'    => \$indexsite,
    'updatesite'   => \$updatesite,
) or pod2usage();

pod2usage( verbose => 2 ) if $help;

if ($configFile) {
    # length() rather than truth, so that searching for "0" is possible.
    my $wantsSearch = length $search;

    # pod2usage() exits the program, so make sure an action was requested
    # BEFORE a session is opened; otherwise the session would never be
    # ended and closed.
    pod2usage() unless $indexsite || $updatesite || $wantsSearch;

    my $session = WebGUI::Session->open( $webguiRoot, $configFile );

    # Same precedence as always: --indexsite, then --updatesite, then --search.
    if ($indexsite) {
        reindexSite($session);
    }
    elsif ($updatesite) {
        updateSite($session);
    }
    else {
        searchSite( $session, $search );
    }

    $session->var->end;
    $session->close;
}
elsif ($indexAll) {
    # No config file given: --indexall walks every site on its own.
    reindexAllSites();
}
else {
    pod2usage();
}
#-------------------------------------------------------------------
# reindexAllSites ( )
#
# Runs a full reindex for every entry returned by
# WebGUI::Config->readAllConfigs. Each key of that structure is used as the
# config file name for a fresh session, which is ended and closed again once
# reindexSite has finished with it. Progress is printed to STDOUT.
sub reindexAllSites {
    my $allConfigs = WebGUI::Config->readAllConfigs($webguiRoot);
    for my $configName ( keys %$allConfigs ) {
        print "Indexing ", $configName, "...\n";

        my $siteSession = WebGUI::Session->open( $webguiRoot, $configName );
        reindexSite($siteSession);
        $siteSession->var->end;
        $siteSession->close;

        print "Finished indexing ", $configName, ".\n";
    }
}
#-------------------------------------------------------------------
# reindexSite ( session )
#
# Rebuilds the search index of the site the given session belongs to.
#
# Every row of the asset table whose state is 'published' is instantiated;
# assets that are published and whose status is 'approved' or 'archived' get
# indexContent called on them. One line per indexed asset (id, title and the
# time indexing took) is printed to STDOUT. Assets that cannot be
# instantiated are reported with warn and skipped.
#
# Afterwards all assetIndex rows whose assetId was not indexed in this run
# are deleted. That purge is skipped when nothing was indexed at all, so in
# that case existing index rows are left untouched.
sub reindexSite {
    my $session  = shift;
    my $siteTime = [ Time::HiRes::gettimeofday() ];
    my $rs = $session->db->read("select assetId, className from asset where state='published'");
    my @searchableAssetIds;

    while ( my ( $id, $class ) = $rs->array ) {
        my $asset = WebGUI::Asset->new( $session, $id, $class );
        if ( !$asset ) {
            warn sprintf "- Asset %s (%s) could not be instantiated\n", $id, $class;
            next;
        }

        # Re-check on the instantiated asset; only approved or archived
        # published assets are searchable.
        next unless $asset->get("state") eq "published";
        my $status = $asset->get("status");
        next unless $status eq "approved" || $status eq "archived";

        print $asset->getId . "\t" . $asset->getTitle . "\t";
        my $t = [ Time::HiRes::gettimeofday() ];
        $asset->indexContent;
        print "(" . Time::HiRes::tv_interval($t) . ")\n";
        push @searchableAssetIds, $id;
    }

    # Delete indexes of assets that are no longer searchable.
    # An explicit block is used instead of "my $list = ... if ...": declaring
    # a lexical under a statement modifier is undefined behaviour in Perl.
    if (@searchableAssetIds) {
        my $list = $session->db->quoteAndJoin( \@searchableAssetIds );
        $session->db->write( "delete from assetIndex where assetId not in (" . $list . ")" );
    }

    print "\nSite indexing took " . Time::HiRes::tv_interval($siteTime) . " seconds.\n";
}
#-------------------------------------------------------------------
# searchSite ( session, keywords )
#
# Runs a keyword search through WebGUI::Search and prints one line per hit
# to STDOUT (asset id, a tab, the title), followed by the elapsed time.
# The search object is created with 0 as its second constructor argument;
# see WebGUI::Search for what that argument controls.
sub searchSite {
    my ( $session, $keywords ) = @_;
    my $started = [ Time::HiRes::gettimeofday() ];

    my $engine = WebGUI::Search->new( $session, 0 );
    $engine->search( { keywords => $keywords } );

    my $results = $engine->getResultSet;
    while ( my $row = $results->hashRef ) {
        print join( "\t", $row->{assetId}, $row->{title} ), "\n";
    }

    my $elapsed = Time::HiRes::tv_interval($started);
    print "\nSearch took ", $elapsed, " seconds.\n";
}
#-------------------------------------------------------------------
# updateSite ( session )
#
# Indexes only the content that has no entry in assetIndex yet.
#
# Every row of the asset table whose state is 'published' is considered.
# Assets that already have at least one assetIndex row are left alone; the
# remaining ones are instantiated and, when their status is 'approved' or
# 'archived', indexed via indexContent. One line per indexed asset is
# printed to STDOUT. Assets that cannot be instantiated are reported with
# warn (same message as reindexSite) and skipped.
#
# Afterwards all assetIndex rows whose assetId is not among the published
# asset ids read above are deleted; this is skipped when there are none.
sub updateSite {
    my $session  = shift;
    my $siteTime = [ Time::HiRes::gettimeofday() ];
    my $rs = $session->db->read("select assetId, className from asset where state='published'");
    my @searchableAssetIds;

    while ( my ( $id, $class ) = $rs->array ) {
        # Remember every published id, indexed or not, so that the purge
        # below keeps the index rows of assets skipped as "already done".
        push @searchableAssetIds, $id;

        my ($done) = $session->db->quickArray( "select count(*) from assetIndex where assetId=?", [$id] );
        next if $done;

        my $asset = WebGUI::Asset->new( $session, $id, $class );
        if ( !defined $asset ) {
            warn sprintf "- Asset %s (%s) could not be instantiated\n", $id, $class;
            next;
        }

        my $status = $asset->get("status");
        next unless $status eq "approved" || $status eq "archived";

        print $asset->getId . "\t" . $asset->getTitle . "\t";
        my $t = [ Time::HiRes::gettimeofday() ];
        $asset->indexContent;
        print "(" . Time::HiRes::tv_interval($t) . ")\n";
    }

    # Delete indexes of assets that are no longer searchable.
    # An explicit block is used instead of "my $list = ... if ...": declaring
    # a lexical under a statement modifier is undefined behaviour in Perl.
    if (@searchableAssetIds) {
        my $list = $session->db->quoteAndJoin( \@searchableAssetIds );
        $session->db->write( "delete from assetIndex where assetId not in (" . $list . ")" );
    }

    print "\nSite indexing took " . Time::HiRes::tv_interval($siteTime) . " seconds.\n";
}
#-------------------------------------------------
# readLines ( file )
#
# Returns the significant lines of a text file as a list. Everything from a
# '#' to the end of the line is dropped, surrounding whitespace is trimmed,
# and lines that end up empty (or false, such as "0") are left out.
# A file that cannot be opened yields an empty list; no error is raised.
sub readLines {
    my ($path) = @_;
    my @kept;

    open( my $in, '<', $path ) or return @kept;

    while ( defined( my $text = <$in> ) ) {
        $text =~ s/#.*//;      # strip comment
        $text =~ s/^\s+//;     # strip leading whitespace
        $text =~ s/\s+$//;     # strip trailing whitespace, newline included
        push @kept, $text if $text;
    }
    close $in;

    return @kept;
}
__END__
=head1 NAME
search - Reindex and search a WebGUI site.
=head1 SYNOPSIS
search --configFile config.conf --indexsite
search --configFile config.conf --updatesite
search --configFile config.conf --search text
search --indexall
search --help
=head1 DESCRIPTION
This WebGUI utility script helps maintain search indexes on
any site. It can be used to build the index for an entire site,
build the index only for new content, and perform searches.
=over
=item B<--configFile config.conf>
The WebGUI config file to use. Only the file name needs to be specified,
since it will be looked up inside WebGUI's configuration directory.
This parameter is required by every option except B<--indexall> and B<--help>.
=item B<--indexsite>
Reindexes the entire site specified in the config file. This process
may take a while (even hours) depending on the amount of content the
site has. Server performance will suffer somewhat during the
indexing process. This option requires a B<--configFile> to be
specified.
=item B<--updatesite>
Indexes content that has not been indexed for the site specified in
the config file, keeping the indexes for already indexed content
intact. This is useful if the B<--indexsite> had to be stopped
partway through. This option requires a B<--configFile> to be
specified.
=item B<--search text>
Searches the site specified in the config file for a given keyword or
phrase, returning the results. This option requires a B<--configFile>
to be specified.
=item B<--indexall>
Reindexes B<all> the sites. Note that this can take many hours and
will affect performance of the server during the indexing process.
=item B<--help>
Shows this documentation, then exits.
=back
=head1 AUTHOR
Copyright 2001-2009 Plain Black Corporation.
=cut