From 0c7bc12dcf35bd0b41baf669c596add2a008aea1 Mon Sep 17 00:00:00 2001
From: JT Smith
Date: Wed, 18 Jan 2006 21:21:50 +0000
Subject: [PATCH] adding the start of the new search system

---
 lib/WebGUI/Search.pm                | 174 ++++++++++++++++++++++++++++
 lib/WebGUI/Search/DateTimeFilter.pm |  89 ++++++++++++++
 2 files changed, 263 insertions(+)
 create mode 100644 lib/WebGUI/Search.pm
 create mode 100644 lib/WebGUI/Search/DateTimeFilter.pm

diff --git a/lib/WebGUI/Search.pm b/lib/WebGUI/Search.pm
new file mode 100644
index 000000000..6e2657aaa
--- /dev/null
+++ b/lib/WebGUI/Search.pm
@@ -0,0 +1,174 @@
+package WebGUI::Search;
+
+use strict;
+use warnings;
+
+use Carp;
+use DateTime;
+use Plucene::Analysis::SimpleAnalyzer;
+use Plucene::Analysis::WhitespaceAnalyzer;
+use Plucene::Document;
+use Plucene::Document::Field;
+use Plucene::Index::Reader;
+use Plucene::Index::Writer;
+use Plucene::QueryParser;
+use Plucene::Search::HitCollector;
+use Plucene::Search::IndexSearcher;
+use Plucene::Index::Term;
+use File::Spec::Functions qw(catfile);
+use WebGUI::Search::DateTimeFilter;
+use WebGUI::Utility;
+
+# Constructor. Takes the directory that holds the index and croaks when none
+# is given. Only the path is recorded here; nothing is opened yet.
+sub open {
+    my ($class, $dir) = @_;
+    $dir or croak "No directory given";
+    return bless { _dir => $dir }, $class;
+}
+
+# Returns the index directory that was passed to open().
+sub _dir { shift->{_dir} }
+
+# Parses $query with a Plucene::QueryParser built on a SimpleAnalyzer, using
+# $default as the parser's default field, and returns the parse result.
+sub _parsed_query {
+    my ($self, $query, $default) = @_;
+    my $parser = Plucene::QueryParser->new({
+        analyzer => Plucene::Analysis::SimpleAnalyzer->new(),
+        default  => $default,
+    });
+    return $parser->parse($query);
+}
+
+# Returns a new Plucene::Search::IndexSearcher for the index directory.
+sub _searcher { Plucene::Search::IndexSearcher->new(shift->_dir) }
+
+# Returns a new Plucene::Index::Reader opened on the index directory.
+sub _reader { Plucene::Index::Reader->open(shift->_dir) }
+
+# Searches the "text" field for $sstring and returns the "id" field of every
+# collected document, highest score first. Returns an empty list when
+# $sstring is false.
+sub search {
+    my ($self, $sstring) = @_;
+    return () unless $sstring;
+    my @docs;
+    my $searcher = $self->_searcher;
+    my $hc = Plucene::Search::HitCollector->new(
+        collect => sub {
+            my ($self, $doc, $score) = @_;
+            # A hit whose document cannot be fetched is silently skipped.
+            my $res = eval { $searcher->doc($doc) };
+            push @docs, [ $res, $score ] if $res;
+        });
+    $searcher->search_hc($self->_parsed_query($sstring, 'text'), $hc);
+    return map $_->[0]->get("id")->string,
+        sort { $b->[1] <=> $a->[1] } @docs;
+}
+
+# Like search(), but filtered through a WebGUI::Search::DateTimeFilter on the
+# "_date_" field, with $date1 and $date2 passed as its from/to bounds.
+# Returns the "id" field of each hit, in the order the hits object lists them.
+# NOTE(review): the query is parsed with a WhitespaceAnalyzer here, whereas
+# search() and _writer() use a SimpleAnalyzer -- confirm this is intended.
+sub search_during {
+    my ($self, $sstring, $date1, $date2) = @_;
+    return () unless $sstring;
+    my $filter = WebGUI::Search::DateTimeFilter->new({
+        field => '_date_',
+        from  => $date1,
+        to    => $date2,
+    });
+    my $qp = Plucene::QueryParser->new({
+        analyzer => Plucene::Analysis::WhitespaceAnalyzer->new(),
+        default  => "text",
+    });
+    my $query = $qp->parse($sstring);
+    my $hits  = $self->_searcher->search($query, $filter);
+    return () unless $hits->length;
+    my @docs = map $hits->doc($_), 0 .. ($hits->length - 1);
+    return map $_->get("id")->string, @docs;
+}
+
+# Returns a Plucene::Index::Writer for the index directory, built with a
+# SimpleAnalyzer. The last argument is 1 when the directory has no "segments"
+# file yet and 0 when it does.
+sub _writer {
+    my $self = shift;
+    return Plucene::Index::Writer->new(
+        $self->_dir,
+        Plucene::Analysis::SimpleAnalyzer->new(),
+        -e catfile($self->_dir, "segments") ? 0 : 1
+    );
+}
+
+# Adds documents to the index. @data is a flat list of ( $id, \%terms ) pairs.
+# For every pair:
+#   - $id is stored in the Keyword field "id";
+#   - a "date" term (an epoch) is stored as two Keyword fields: "_date_"
+#     holds toBase36(epoch * 1000) and "date" holds the DateTime ymd string;
+#   - every other term except "text" becomes an UnStored field of the same
+#     name and, unless its name starts with "_", is appended to the text;
+#   - the accumulated text is stored in the UnStored field "text".
+# The text is accumulated in a local variable so the caller's %terms is left
+# untouched and can be indexed again without its "text" entry growing.
+sub add {
+    my ($self, @data) = @_;
+    my $writer = $self->_writer;
+    while (my ($id, $terms) = splice @data, 0, 2) {
+        my $doc  = Plucene::Document->new;
+        my $text = $terms->{text};
+        $doc->add(Plucene::Document::Field->Keyword(id => $id));
+        foreach my $key (keys %$terms) {
+            if ($key eq 'text') {
+                next;    # gets added at the end anyway
+            }
+            elsif ($key eq "date") {
+                $doc->add(Plucene::Document::Field->Keyword("_date_", toBase36($terms->{date}*1000)));
+                $doc->add(Plucene::Document::Field->Keyword("date", DateTime->from_epoch(epoch=>$terms->{date})->ymd));
+            }
+            else {
+                $doc->add(Plucene::Document::Field->UnStored($key => $terms->{$key}));
+                $text .= " " . $terms->{$key} unless $key =~ /^_/;
+            }
+        }
+        $doc->add(Plucene::Document::Field->UnStored(text => $text));
+        $writer->add_document($doc);
+    }
+    undef $writer;
+}
+
+# Indexes a single document: $id goes into the Keyword field "id" and $data
+# into the UnStored field "text".
+sub index_document {
+    my ($self, $id, $data) = @_;
+    my $writer = $self->_writer;
+    my $doc = Plucene::Document->new;
+    $doc->add(Plucene::Document::Field->Keyword(id => $id));
+    $doc->add(Plucene::Document::Field->UnStored('text' => $data));
+    $writer->add_document($doc);
+    undef $writer;
+}
+
+# Deletes through a reader the term whose "id" field equals $id, then
+# closes the reader.
+sub delete_document {
+    my ($self, $id) = @_;
+    my $reader = $self->_reader;
+    $reader->delete_term(
+        Plucene::Index::Term->new({ field => "id", text => $id }));
+    $reader->close;
+}
+
+# Calls optimize() on a fresh writer for the index.
+sub optimize { shift->_writer->optimize() }
+
+# Returns the reader's doc_freq() for the term whose "id" field equals $id.
+sub indexed {
+    my ($self, $id) = @_;
+    my $term = Plucene::Index::Term->new({ field => 'id', text => $id });
+    return $self->_reader->doc_freq($term);
+}
+
+1;
diff --git a/lib/WebGUI/Search/DateTimeFilter.pm b/lib/WebGUI/Search/DateTimeFilter.pm
new file mode 100644
index 000000000..719a89401
--- /dev/null
+++ b/lib/WebGUI/Search/DateTimeFilter.pm
@@ -0,0 +1,89 @@
+package WebGUI::Search::DateTimeFilter;
+
+use strict;
+use warnings;
+use base 'Plucene::Search::Filter';
+use Bit::Vector::Minimal;
+use Plucene::Index::Term;
+use WebGUI::Utility;
+
+=head1 DESCRIPTION
+
+This package is a replacement for Plucene::Search::DateFilter that uses epochs rather than Time::Piece objects.
+
+=cut
+
+
+#-------------------------------------------------------------------
+
+=head2 new ( args )
+
+Constructor.
+
+=head3 args
+
+A hash reference that may contain the following keys.
+
+=head4 field
+
+The field name to build the filter for.
+
+=head4 from
+
+An epoch date to start the search from. Defaults to now minus one year.
+
+=head4 to
+
+An epoch date to end searching on. Defaults to now.
+
+=cut
+
+sub new {
+    my $class = shift;
+    my $args  = shift;
+    # Both bounds are stored as toBase36(epoch * 1000); a false value for
+    # either one selects its default.
+    return bless {
+        field => $args->{field},
+        from  => toBase36(($args->{from} || time() - 60*60*24*365) * 1000),
+        to    => toBase36(($args->{to} || time()) * 1000),
+    }, $class;
+}
+
+#-------------------------------------------------------------------
+
+=head2 bits ( reader )
+
+The actual filter method required by Plucene::Search::IndexSearcher. Returns a Bit::Vector::Minimal sized to the reader's max_doc. Terms are enumerated starting at the from bound of the filtered field, and a bit is set for each document of every enumerated term until a term greater than the to bound is reached.
+
+=head3 reader
+
+The index reader to enumerate terms and documents from.
+
+=cut
+
+sub bits {
+    my ($self, $reader) = @_;
+    my $bits = Bit::Vector::Minimal->new(size => $reader->max_doc);
+    my $enum = $reader->terms(
+        Plucene::Index::Term->new({
+            field => $self->{field},
+            text  => $self->{from},
+        }));
+    # The enumeration has no current term: return the vector with no bits set.
+    return $bits unless $enum->term;
+    my $termdocs = $reader->term_docs;
+
+    my $stop = Plucene::Index::Term->new({
+        field => $self->{field},
+        text  => $self->{to},
+    });
+    while ($enum->term->le($stop)) {
+        $termdocs->seek($enum->term);
+        $bits->set($termdocs->doc) while $termdocs->next;
+        last unless $enum->next;
+    }
+    return $bits;
+}
+
+1;