#!/usr/bin/env perl

#-------------------------------------------------------------------
# WebGUI is Copyright 2001-2009 Plain Black Corporation.
#-------------------------------------------------------------------
# Please read the legal notices (docs/legal.txt) and the license
# (docs/license.txt) that came with this distribution before using
# this software.
#-------------------------------------------------------------------
# http://www.plainblack.com                     info@plainblack.com
#-------------------------------------------------------------------

use strict;
use warnings;
use File::Basename ();
use File::Spec;

# Resolve the WebGUI root (the parent of this script's directory) at compile
# time and put its lib directory on @INC before the WebGUI modules are loaded.
my $webguiRoot;
BEGIN {
    $webguiRoot = File::Spec->rel2abs(File::Spec->catdir(File::Basename::dirname(__FILE__), File::Spec->updir));
    unshift @INC, File::Spec->catdir($webguiRoot, 'lib');
}

use Getopt::Long;
use WebGUI::Asset;
use WebGUI::Config;
use WebGUI::Session;
use WebGUI::Search;
use WebGUI::Search::Index;
use Time::HiRes;
use Pod::Usage;

# Autoflush output so per-asset progress shows up while indexing is running.
$|=1;

my $search      = "";
my $help        = 0;
my $indexsite   = 0;
my $configFile  = "";
my $indexAll    = "";
my $updatesite  = 0;

# An unknown or malformed option makes GetOptions return false; show the
# usage instead of silently running with whatever did parse.
GetOptions(
    'indexall'      => \$indexAll,
    'configFile=s'  => \$configFile,
    'search=s'      => \$search,
    'help'          => \$help,
    'indexsite'     => \$indexsite,
    'updatesite'    => \$updatesite,
) or pod2usage(2);

pod2usage( verbose => 2 ) if $help;

if ($configFile) {
    # Choose the action first: pod2usage() exits the process, so it must run
    # before a session is opened or that session would never be ended/closed.
    my $action = $indexsite  ? \&reindexSite
               : $updatesite ? \&updateSite
               : $search     ? sub { searchSite($_[0], $search) }
               :               undef;
    pod2usage() unless $action;

    my $session = WebGUI::Session->open($webguiRoot, $configFile);
    $action->($session);
    $session->var->end;
    $session->close;
}
elsif ($indexAll) {
    reindexAllSites();
}
else {
    pod2usage();
}

#-------------------------------------------------------------------
# Reindexes every site returned by WebGUI::Config->readAllConfigs for this
# WebGUI root. Each site gets its own session, which is ended and closed
# once reindexSite() returns. Takes no arguments.
sub reindexAllSites {
    my $configs = WebGUI::Config->readAllConfigs($webguiRoot);
    # Sorted so the sites are processed in the same order on every run.
    foreach my $site (sort keys %{$configs}) {
        print "Indexing ".$site."...\n";
        my $session = WebGUI::Session->open($webguiRoot,$site);
        reindexSite($session);
        $session->var->end;
        $session->close;
        print "Finished indexing ".$site.".\n";
    }
}

#-------------------------------------------------------------------
# Reindexes the site behind $session: every asset whose row is published is
# instantiated and, when its state is "published" and its status is
# "approved" or "archived", has its content indexed. Index rows for assets
# that were not indexed in this run are then deleted. Prints one line per
# indexed asset plus the total elapsed time.
#   $session - an open WebGUI::Session for the site.
sub reindexSite {
    my $session = shift;
    my $siteTime = [Time::HiRes::gettimeofday()];
    my $rs = $session->db->read("select assetId, className from asset where state='published'");
    my @searchableAssetIds;
    while (my ($id, $class) = $rs->array) {
        my $asset = WebGUI::Asset->new($session,$id,$class);
        next unless defined $asset
            && $asset->get("state") eq "published"
            && _hasSearchableStatus($asset);
        _indexAsset($asset);
        push @searchableAssetIds, $id;
    }
    _pruneIndex($session, \@searchableAssetIds);
    print "\nSite indexing took ".Time::HiRes::tv_interval($siteTime)." seconds.\n";
}

#-------------------------------------------------------------------
# Runs a keyword search against the site behind $session and prints the
# asset id and title of each row in the result set, followed by the
# elapsed time.
#   $session  - an open WebGUI::Session for the site.
#   $keywords - the keyword or phrase to search for.
sub searchSite {
    my $session = shift;
    my $keywords = shift;
    my $t = [Time::HiRes::gettimeofday()];
    my $search = WebGUI::Search->new($session, 0);
    $search->search({keywords=>$keywords});
    my $rs = $search->getResultSet;
    while (my $data = $rs->hashRef) {
        print $data->{assetId}."\t".$data->{title}."\n";
    }
    print "\nSearch took ".Time::HiRes::tv_interval($t)." seconds.\n";
}

#-------------------------------------------------------------------
# Indexes only the published assets that have no row in assetIndex yet,
# leaving existing index rows alone, then deletes index rows whose asset is
# not in the published list. Prints one line per newly indexed asset plus
# the total elapsed time.
#   $session - an open WebGUI::Session for the site.
sub updateSite {
    my $session = shift;
    my $siteTime = [Time::HiRes::gettimeofday()];
    my $rs = $session->db->read("select assetId, className from asset where state='published'");
    my @searchableAssetIds;
    while (my ($id, $class) = $rs->array) {
        # Every published id is kept, whether or not it gets (re)indexed
        # below, so the pruning step never removes an existing index row
        # for a published asset.
        push @searchableAssetIds, $id;
        my ($done) = $session->db->quickArray("select count(*) from assetIndex where assetId=?",[$id]);
        next if $done;
        my $asset = WebGUI::Asset->new($session,$id,$class);
        next unless defined $asset && _hasSearchableStatus($asset);
        _indexAsset($asset);
    }
    _pruneIndex($session, \@searchableAssetIds);
    print "\nSite indexing took ".Time::HiRes::tv_interval($siteTime)." seconds.\n";
}

#-------------------------------------------------------------------
# True when the asset's status is "approved" or "archived".
sub _hasSearchableStatus {
    my $asset = shift;
    my $status = $asset->get("status");
    return defined $status && ($status eq "approved" || $status eq "archived");
}

#-------------------------------------------------------------------
# Indexes one asset's content, printing its id, its title and the time the
# indexing call took.
sub _indexAsset {
    my $asset = shift;
    print $asset->getId."\t".$asset->getTitle."\t";
    my $t = [Time::HiRes::gettimeofday()];
    $asset->indexContent;
    print "(".Time::HiRes::tv_interval($t).")\n";
}

#-------------------------------------------------------------------
# Deletes the assetIndex rows whose assetId is not in @{$keepIds}. With an
# empty list no delete is issued at all, so the index is left untouched.
sub _pruneIndex {
    my ($session, $keepIds) = @_;
    return unless @{$keepIds};
    # Declared unconditionally: "my ... if COND" is undefined behaviour.
    my $list = $session->db->quoteAndJoin($keepIds);
    $session->db->write("delete from assetIndex where assetId not in (".$list.")") if $list;
}

__END__

=head1 NAME

search - Reindex and search a WebGUI site.

=head1 SYNOPSIS

 search --configFile config.conf --indexsite

 search --configFile config.conf --updatesite

 search --configFile config.conf --search text

 search --indexall

 search --help

=head1 DESCRIPTION

This WebGUI utility script helps maintain search indexes on
any site. It can be used to build the index for an entire site,
build the index only for new content, and perform searches.

=over

=item B<--configFile config.conf>

The WebGUI config file to use. Only the file name needs to be
specified, since it will be looked up inside WebGUI's configuration
directory. This parameter is required by every option except
B<--indexall> and B<--help>.

=item B<--indexsite>

Reindexes the entire site specified in the config file. This process
may take a while (even hours) depending on the amount of content
the site has. Server performance will suffer somewhat during the
indexing process. This option requires a B<--configFile> to be
specified.

=item B<--updatesite>

Indexes content that has not been indexed for the site specified in
the config file, keeping the indexes for already indexed content
intact. This is useful if the B<--indexsite> had to be stopped
partway through. This option requires a B<--configFile> to be
specified.

=item B<--search text>

Searches the site specified in the config file for a given keyword
or phrase, returning the results. This option requires a
B<--configFile> to be specified.

=item B<--indexall>

Reindexes B<all> the sites. Note that this can take many hours and
will affect performance of the server during the indexing process.

=item B<--help>

Shows this documentation, then exits.

=back

=head1 AUTHOR

Copyright 2001-2009 Plain Black Corporation.

=cut