diff --git a/lib/DBIx/FullTextSearch.pm b/lib/DBIx/FullTextSearch.pm new file mode 100644 index 000000000..328ae822e --- /dev/null +++ b/lib/DBIx/FullTextSearch.pm @@ -0,0 +1,1243 @@ +# -*- Mode: Perl; indent-tabs-mode: t; tab-width: 2 -*- + +=head1 NAME + +DBIx::FullTextSearch - Indexing documents with MySQL as storage + +=cut + +package DBIx::FullTextSearch; +use strict; +use Parse::RecDescent; + +use vars qw($errstr $VERSION $parse); +$errstr = undef; +$VERSION = '0.73'; + +use locale; + +my %DEFAULT_PARAMS = ( + 'num_of_docs' => 0, # statistical value, should be maintained + 'word_length' => 30, # max length of words we index + + 'protocol' => 40, # we only support protocol with the same numbers + + 'blob_direct_fetch' => 20, # with the blob store, when we stop searching + # and fetch everything at once + 'data_table' => undef, # table where the actual index is stored + 'name_length' => 255, # for filenames or URLs, what's the max length + + 'word_id_bits' => 16, # num of bits for word_id (column store) + 'doc_id_bits' => 16, # num of bits for doc_id + 'count_bits' => 8, # num of bits for count value + 'position_bits' => 32, # num of bits for word positions + + 'backend' => 'blob', # what database backend (way the data is + # stored) we use + 'frontend' => 'none', # what application frontend we use (how + # the index behaves externaly) + 'filter' => 'map { lc $_ }', + 'search_splitter' => '/(\w{2,$word_length}\*?)/g', + 'index_splitter' => '/(\w{2,$word_length})/g', + # can use the $word_length + # variable + 'init_env' => '' + ); +my %backend_types = ( + 'blob' => 'DBIx::FullTextSearch::Blob', + 'blobfast' => 'DBIx::FullTextSearch::BlobFast', + 'column' => 'DBIx::FullTextSearch::Column', + 'phrase' => 'DBIx::FullTextSearch::Phrase', + ); +my %frontend_types = ( + 'none' => 'DBIx::FullTextSearch', + 'default' => 'DBIx::FullTextSearch', + 'file' => 'DBIx::FullTextSearch::File', + 'string' => 'DBIx::FullTextSearch::String', + 'url' => 'DBIx::FullTextSearch::URL', 
+ 'table' => 'DBIx::FullTextSearch::Table', + ); + +use vars qw! %BITS_TO_PACK %BITS_TO_INT %INT_TO_BITS !; +%BITS_TO_PACK = qw! 0 A0 8 C 16 S 32 L !; +%BITS_TO_INT = qw! 8 tinyint 16 smallint 24 mediumint 32 int 64 bigint !; +%INT_TO_BITS = map { ($BITS_TO_INT{$_} => $_ ) }keys %BITS_TO_INT; + +# Open reads in the information about existing index, creates an object +# in memory +sub open { + my ($class, $dbh, $TABLE) = @_; + $errstr = undef; + + # the $dbh is either a real dbh of a DBI->connect parameters arrayref + my $mydbh = 0; + if (ref $dbh eq 'ARRAY') { + if (not $dbh = DBI->connect(@$dbh)) { + $errstr = $DBI::errstr; return; + } + + $mydbh = 1; + } + + # load the parameters to the object + my %PARAMS = %DEFAULT_PARAMS; + my $sth = $dbh->prepare("select * from $TABLE"); + $sth->{'PrintError'} = 0; + $sth->{'RaiseError'} = 0; + $sth->execute or do { + if (not grep { $TABLE eq $_ } + DBIx::FullTextSearch->list_fts_indexes($dbh)) { + $errstr = "FullTextSearch index $TABLE doesn't exist."; + } else { + $errstr = $sth->errstr; + } + return; + }; + while (my ($param, $value) = $sth->fetchrow_array) { + $PARAMS{$param} = $value; + } + + my $self = bless { + 'dbh' => $dbh, + 'table' => $TABLE, + %PARAMS, + }, $class; + my $data_table = $self->{'data_table'}; + + # we should disconnect if we've opened the dbh here + if ($mydbh) { $self->{'disconnect_on_destroy'} = 1; } + + # some basic sanity check + if (not defined $dbh->selectrow_array("select count(*) from $data_table")) { + $errstr = "Table $data_table not found in the database\n"; + return; + } + + + # load and set the application frontend + my $front_module = $frontend_types{$PARAMS{'frontend'}}; + if (defined $front_module) { + if ($front_module ne $class) { + eval "use $front_module"; + die $@ if $@; + } + bless $self, $front_module; + $self->_open_tables; + } + else { + $errstr = "Specified frontend type `$PARAMS{'frontend'}' is unknown\n"; return; + } + + # load and set the backend (actual database access) 
module + my $back_module = $backend_types{$PARAMS{'backend'}}; + if (defined $back_module) { + eval "use $back_module"; + die $@ if $@; + $self->{'db_backend'} = $back_module->open($self); + } + else { + $errstr = "Specified backend type `$PARAMS{'backend'}' is unknown\n"; return; + } + + # load DBIx::FullTextSearch::StopList object (if specified) + if ($PARAMS{'stoplist'}) { + eval "use DBIx::FullTextSearch::StopList"; + die $@ if $@; + $self->{'stoplist'} = DBIx::FullTextSearch::StopList->open($dbh, $PARAMS{'stoplist'}); + } + + # load Lingua::Stem object (if specified) + if($PARAMS{'stemmer'}){ + eval "use Lingua::Stem"; + die $@ if $@; + $self->{'stemmer'} = Lingua::Stem->new(-locale => $PARAMS{'stemmer'}); + } + + # finally, return the object + $self; +} + +# Create creates tables in the database according to the options, then +# calls open to load the object to memory +sub create { + my ($class, $dbh, $TABLE, %OPTIONS) = @_; + $errstr = undef; + my $mydbh = 0; + if (ref $dbh eq 'ARRAY') { + $dbh = DBI->connect(@$dbh) + or do { $errstr = $DBI::errstr; return; }; + $mydbh = 1; + } + + my $self = bless { + 'dbh' => $dbh, + 'table' => $TABLE, + %DEFAULT_PARAMS, + %OPTIONS + }, $class; + + $self->{'data_table'} = $TABLE.'_data' + unless defined $self->{'data_table'}; + + # convert array reference to CSV string + $self->{'column_name'} = join(",",@{$self->{'column_name'}}) if ref($self->{'column_name'}) eq 'ARRAY'; + + my $CREATE_PARAM = <do($CREATE_PARAM) or do { $errstr = $dbh->errstr; return; }; + push @{$self->{'created_tables'}}, $TABLE; + + # load and set the frontend database structures + my $front_module = $frontend_types{$self->{'frontend'}}; + if (defined $front_module) { + eval "use $front_module"; + die $@ if $@; + bless $self, $front_module; + $errstr = $self->_create_tables; + if (defined $errstr) { + $self->clean_failed_create; warn $errstr; return; + } + } + else { + $errstr = "Specified frontend type `$self->{'frontend'}' is unknown\n"; 
$self->clean_failed_create; return; + } + + # create the backend database structures + my $back_module = $backend_types{$self->{'backend'}}; + if (defined $back_module) { + eval "use $back_module"; + die $@ if $@; + $errstr = $back_module->_create_tables($self); + if (defined $errstr) { + $self->clean_failed_create; warn $errstr; return; + } + } + else { + $errstr = "Specified backend type `$self->{'backend'}' is unknown\n"; $self->clean_failed_create; return; + } + + for (grep { not ref $self->{$_} } keys %$self) { + $dbh->do("insert into $TABLE values (?, ?)", {}, $_, $self->{$_}); + } + + return $class->open($dbh, $TABLE); +} + +sub _create_tables {} +sub _open_tables {} + +sub clean_failed_create { + my $self = shift; + my $dbh = $self->{'dbh'}; + for my $table (@{$self->{'created_tables'}}) { + $dbh->do("drop table $table"); + } +} + +sub drop { + my $self = shift; + my $dbh = $self->{'dbh'}; + for my $tag (keys %$self) { + next unless $tag =~ /(^|_)table$/; + $dbh->do("drop table $self->{$tag}"); + } + 1; +} + +sub empty { + my $self = shift; + my $dbh = $self->{'dbh'}; + + for my $tag (keys %$self) { + next unless $tag =~ /_table$/; + $dbh->do("delete from $self->{$tag}"); + } + $dbh->do("replace into $self->{'table'} values ('max_doc_id', 0)"); + return 1; +} + +sub errstr { + my $self = shift; + ref $self ? 
$self->{'errstr'} : $errstr; +} + +sub list_fts_indexes { + my ($class, $dbh) = @_; + my %tables = map { ( $_->[0] => 1 ) } + @{$dbh->selectall_arrayref('show tables')}; + my %indexes = (); + for my $table (keys %tables) { + local $dbh->{'PrintError'} = 0; + local $dbh->{'RaiseError'} = 0; + if ($dbh->selectrow_array("select param, value from $table + where param = 'data_table'")) { + $indexes{$table} = 1; + } + } + return sort keys %indexes; +} + +sub index_document { + my ($self, $id, $data) = @_; + return unless defined $id; + + my $dbh = $self->{'dbh'}; + + my $param_table = $self->{'table'}; + + my $adding_doc = 0; + + my $adding = 0; + if (not defined $self->{'max_doc_id'} or $id > $self->{'max_doc_id'}) { + $self->{'max_doc_id'} = $id; + my $update_max_doc_id_sth = + ( defined $self->{'update_max_doc_id_sth'} + ? $self->{'update_max_doc_id_sth'} + : $self->{'update_max_doc_id_sth'} = $dbh->prepare("replace into $param_table values (?, ?)")); + $update_max_doc_id_sth->execute('max_doc_id', $id); + $adding_doc = 1; + } + + my $init_env = $self->{'init_env'}; # use packages, etc. + eval $init_env if defined $init_env; + print STDERR "Init_env failed with $@\n" if $@; + + $data = '' unless defined $data; + return $self->{'db_backend'}->parse_and_index_data($adding_doc, + $id, $data); +} + +# used for backends that need a count for each of the words +sub parse_and_index_data_count { + my ($backend, $adding_doc, $id, $data) = @_; + ## note that this is run with backend object + my $self = $backend->{'fts'}; + + my $word_length = $self->{'word_length'}; + # this needs to get parametrized (lc, il2_to_ascii, parsing of + # HTML tags, ...) + + my %words; + my @data_sets = ref $data ? @$data : ($data); + + # We can just join the data sets together, since we don't care about position + my $data_string = join(" ", @data_sets); + + my $filter = $self->{'filter'} . ' $data_string =~ ' . 
$self->{'index_splitter'}; + my $stoplist = $self->{'stoplist'}; + my $stemmer = $self->{'stemmer'}; + my @words = eval $filter; + @words = grep !$stoplist->is_stop_word($_), @words if defined($stoplist); + @words = @{$stemmer->stem(@words)} if defined($stemmer); + for my $word ( @words ) { + $words{$word} = 0 if not defined $words{$word}; + $words{$word}++; + } + + my @result; + if ($adding_doc) { + @result = $backend->add_document($id, \%words); + } else { + @result = $backend->update_document($id, \%words); + } + + if (wantarray) { + return @result; + } + return $result[0]; +} + +# used for backends where list of occurencies is needed +sub parse_and_index_data_list { + my ($backend, $adding_doc, $id, $data) = @_; + ## note that this is run with backend object + my $self = $backend->{'fts'}; + + my $word_length = $self->{'word_length'}; + # this needs to get parametrized (lc, il2_to_ascii, parsing of + # HTML tags, ...) + + my %words; + my @data_sets = ref $data ? @$data : ($data); + + foreach my $data_set (@data_sets) { + my $filter = $self->{'filter'}.' $data_set =~ '.$self->{'index_splitter'}; + + my $i = 0; # $i stores the position(s) of each word in the document. + my $stoplist = $self->{'stoplist'}; + my $stemmer = $self->{'stemmer'}; + my @words = eval $filter; + @words = grep !$stoplist->is_stop_word($_), @words if defined($stoplist); + @words = @{$stemmer->stem(@words)} if defined($stemmer); + for my $word ( @words ) { + push @{$words{$word}}, ++$i; + } + # Make sure the data sets are considered far apart in position, to + # avoid phrase searches overlapping between table columns. 
+ $i += 100; + } + + my @result; + if ($adding_doc) { + @result = $backend->add_document($id, \%words); + } else { + @result = $backend->update_document($id, \%words); + } + + if (wantarray) { + return @result; + } + return $result[0]; +} + +sub delete_document { + my $self = shift; + $self->{'db_backend'}->delete_document(@_); +} + +sub contains_hashref { + my $self = shift; + my $word_length = $self->{'word_length'}; + my $stemmer = $self->{'stemmer'}; + my $filter = $self->{'filter'}; + my $stoplist = $self->{'stoplist'}; + my @phrases; + for (@_){ + my $phrase; + my $splitter = ' map { ' . $self->{'search_splitter'} . ' } $_'; + my @words = eval $splitter; + @words = eval $filter.' @words'; + @words = grep !$stoplist->is_stop_word($_), @words if defined($stoplist); + if (defined($stemmer)){ + my @stemmed_words = (); + for (@words){ + if (m/\*$/){ + # wildcard search, make work with stemming + my $stem_word = $stemmer->stem($_); + for (@$stem_word){ + $_ .= "*"; + push @stemmed_words, $_; + } + } else { + push @stemmed_words, @{$stemmer->stem($_)}; + } + } + $phrase = join(' ',@stemmed_words); + } else { + $phrase = join(' ',@words); + } + # change wildcard to SQL version (* -> %) + $phrase =~ s/\*/%/g; + push @phrases, $phrase; + } + $self->{'db_backend'}->contains_hashref(@phrases); +} + +sub contains { + my $self = shift; + my $res = $self->contains_hashref(@_); + if (not $self->{'count_bits'}) { return keys %$res; } + return sort { $res->{$b} <=> $res->{$a} } keys %$res; +} + +sub econtains_hashref { + my $self = shift; + my $docs = {}; + my $word_num = 0; + + my $stoplist = $self->{'stoplist'}; + + my @plus_words = map { /^\+(.+)$/s } @_; + @plus_words = grep !$stoplist->is_stop_word($_), @plus_words if defined($stoplist); + + # required words + for my $word (@plus_words) { + $word_num++; + my $oneword = $self->contains_hashref($word); + if ($word_num == 1) { $docs = $oneword; next; } + for my $doc (keys %$oneword) { + $docs->{$doc} += $oneword->{$doc} if 
defined $docs->{$doc}; + } + for my $doc (keys %$docs) { + delete $docs->{$doc} unless defined $oneword->{$doc}; + } + } + + # optional words + for my $word ( map { /^([^+-].*)$/s } @_) { + my $oneword = $self->contains_hashref($word); + for my $doc (keys %$oneword) { + if (@plus_words) { + $docs->{$doc} += $oneword->{$doc} if defined $docs->{$doc}; + } + else { + $docs->{$doc} = 0 unless defined $docs->{$doc}; + $docs->{$doc} += $oneword->{$doc}; + } + } + } + + # prohibited words + for my $word ( map { /^-(.+)$/s } @_) { + my $oneword = $self->contains_hashref($word); + for my $doc (keys %$oneword) { + delete $docs->{$doc}; + } + } + $docs; +} + +sub econtains { + my $self = shift; + my $res = $self->econtains_hashref(@_); + if (not $self->{'count_bits'}) { return keys %$res; } + return sort { $res->{$b} <=> $res->{$a} } keys %$res; +} + +sub _search_terms { + my ($self, $query) = @_; + + if ($self->{'backend'} eq 'phrase') { + # phrase backend, must deal with quotes + + # handle + and - operations on phrases + $query =~ s/([\+\-])"/"$1/g; + + my $inQuote = 0; + my @phrases = (); + + my @blocks = split(/\"/, $query); + + # deal with quotes + for (@blocks){ + if($inQuote == 0){ + # we are outside quotes, search for individual words + push @phrases, split(' '); + } else { + # we are inside quote, search for whole phrase + push @phrases, $_; + } + $inQuote = ++$inQuote % 2; + } + return @phrases; + } else + { + # not phrase backend, don't deal with quotes + return split(' ', $query); + } +} + +sub _search_boolean { + my ($self, $query) = @_; + + unless ($parse) { + $::RD_AUTOACTION = q{ [@item] }; + my $grammar = q{ + expr : disj + disj : conj 'or' disj | conj + conj : unary 'and' conj | unary + unary : '(' expr ')' + | atom + atom : /([^\(\)\s]|\s(?!and)(?!or))+/ + }; + $parse = new Parse::RecDescent ($grammar); + } + my $tree = $parse->expr($query); + return $self->_search_in_tree($tree); +} + +sub _search_in_tree { + my ($self, $tree) = @_; + + if 
(ref($tree->[1]) && ref($tree->[3])) { + if (defined($tree->[2]) && $tree->[2] eq 'and') { + my $hash_ref1 = $self->_search_in_tree($tree->[1]); + my $hash_ref2 = $self->_search_in_tree($tree->[3]); + for my $k (keys %$hash_ref1) { + unless ($hash_ref2->{$k}) { + delete $hash_ref1->{$k}; + } else { + $hash_ref1->{$k} += $hash_ref2->{$k}; + } + } + return $hash_ref1; + } elsif (defined($tree->[2]) && $tree->[2] eq 'or') { + my $hash_ref1 = $self->_search_in_tree($tree->[1]); + my $hash_ref2 = $self->_search_in_tree($tree->[3]); + for my $k (keys %$hash_ref2) { + $hash_ref1->{$k} += $hash_ref2->{$k}; + } + return $hash_ref1; + } + return {}; + } elsif ($tree->[1] eq '(' && ref($tree->[2]) && $tree->[3] eq ')') { + return $self->_search_in_tree($tree->[2]); + } elsif (ref($tree->[1])) { + return $self->_search_in_tree($tree->[1]); + } elsif (defined($tree->[0]) && $tree->[0] eq 'atom') { + return $self->econtains_hashref($self->_search_terms($tree->[1])); + } else { + warn "Unknown tree nodes " . 
join("\t", @$tree); + return {}; + } +} + +sub search { + my ($self, $query) = @_; + if ($query =~ s/\b(and|or|not)\b/lc($1)/eig) { + return keys %{$self->_search_boolean($query)}; + } + return $self->econtains($self->_search_terms($query)); +} + +sub search_hashref { + my ($self, $query) = @_; + if ($query =~ s/\b(and|or|not)\b/lc($1)/eig) { + return $self->_search_boolean($query); + } + return $self->econtains_hashref($self->_search_terms($query)); +} + +sub document_count { + my $self = shift; + my $dbh = $self->{'dbh'}; + + my $SQL = qq{ + select distinct doc_id from $self->{'data_table'} + }; + my $ary_ref = $dbh->selectall_arrayref($SQL); + return scalar @$ary_ref; +} + +# find all words that are contained in at least $k % of all documents +sub common_word { + my $self = shift; + my $k = shift || 80; + $self->{'db_backend'}->common_word($k); +} + +sub DESTROY { + my $self = shift; + $self->{'db_backend'}->DESTROY() + if (exists $self->{'db_backend'} && $self->{'db_backend'} &&$self->{'db_backend'}->can('DESTROY')); +} + +1; + +=head1 SYNOPSIS + +DBIx::FullTextSearch uses a MySQL database backend to index files, web +documents and database fields. Supports must include, can include, and cannot +include words and phrases. Support for boolean (AND/OR) queries, stop words and stemming. 
+ + use DBIx::FullTextSearch; + use DBI; + # connect to database (regular DBI) + my $dbh = DBI->connect('dbi:mysql:database', 'user', 'passwd'); + + # create a new stoplist + my $sl = DBIx::FullTextSearch::StopList->create_default($dbh, 'sl_en', 'English'); + + # create a new index with default english stoplist and english stemmer + my $fts = DBIx::FullTextSearch->create($dbh, 'fts_web_1', + frontend => 'string', backend => 'blob', + stoplist => 'sl_en', stemmer => 'en-us'); + # or open existing one + # my $fts = DBIx::FullTextSearch->open($dbh, 'fts_web_1'); + + # index documents + $fts->index_document('krtek', 'krtek leze pod zemi'); + $fts->index_document('jezek', 'Jezek ma ostre bodliny.'); + + # search for matches + my @docs = $fts->contains('foo'); + my @docs = $fts->econtains('+foo', '-Bar'); + my @docs = $fts->search('+foo -Bar'); + my @docs = $fts->search('foo AND (bar OR baz)'); + +=head1 DESCRIPTION + +DBIx::FullTextSearch is a flexible solution for indexing contents of documents. +It uses the MySQL database to store the information about words and +documents and provides Perl interface for indexing new documents, +making changes and searching for matches. For DBIx::FullTextSearch, a document +is nearly anything -- Perl scalar, file, Web document, database field. + +The basic style of interface is shown above. What you need is a MySQL +database and a L with L. Then you create a DBIx::FullTextSearch index +-- a set of tables that maintain all necessary information. Once created +it can be accessed many times, either for updating the index (adding +documents) or searching. + +DBIx::FullTextSearch uses one basic table to store parameters of the index. Second +table is used to store the actual information about documents and words, +and depending on the type of the index (specified during index creation) +there may be more tables to store additional information (like +conversion from external string names (eg. URL's) to internal numeric +form). 
For a user, these internal thingies and internal behaviour of the +index are not important. The important part is the API, the methods to +index document and ask questions about words in documents. However, +certain understanding of how it all works may be usefull when you are +deciding if this module is for you and what type of index will best +suit your needs. + +=head2 Frontends + +From the user, application point of view, the DBIx::FullTextSearch index stores +documents that are named in a certain way, allows adding new documents, +and provides methods to ask: "give me list of names of documents that +contain this list of words". The DBIx::FullTextSearch index doesn't store the +documents itself. Instead, it stores information about words in the +documents in such a structured way that it makes easy and fast to look +up what documents contain certain words and return names of the +documents. + +DBIx::FullTextSearch provides a couple of predefined frontend classes that specify +various types of documents (and the way they relate to their names). + +=over 4 + +=item default + +By default, user specifies the integer number of the document and the +content (body) of the document. The code would for example read + + $fts->index_document(53, 'zastavujeme vyplaty vkladu'); + +and DBIx::FullTextSearch will remember that the document 53 contains three words. +When looking for all documents containing word (string) vklad, a call + + my @docs = $fts->contains('vklad*'); + +would return numbers of all documents containing words starting with +'vklad', 53 among them. + +So here it's user's responsibility to maintain a relation between the +document numbers and their content, to know that a document 53 is about +vklady. Perhaps the documents are already stored somewhere and have +unique numeric id. + +Note that the numeric id must be no larger than 2^C. + +=item string + +Frontend B allows the user to specify the names of the documents as +strings, instead of numbers. 
Still the user has to specify both the +name of the document and the content: + + $fts->index_document('foobar', + 'the quick brown fox jumped over lazy dog!'); + +After that, + + $fts->contains('dog') + +will return 'foobar' as one of the names of documents with word +'dog' in it. + +=item file + +To index files, use the frontend B. Here the content of the document +is clearly the content of the file specified by the filename, so in +a call to index_document, only the name is needed -- the content of the +file is read by the DBIx::FullTextSearch transparently: + + $fts->index_document('/usr/doc/FAQ/Linux-FAQ'); + my @files = $fts->contains('penguin'); + +=item url + +Web document can be indexed by the frontend B. DBIx::FullTextSearch uses L to +get the document and then parses it normally: + + $fts->index_document('http://www.perl.com/'); + +Note that the HTML tags themselves are indexed along with the text. + +=item table + +You can have a DBIx::FullTextSearch index that indexes char or blob fields in MySQL +table. Since MySQL doesn't support triggers, you have to call the +C method of DBIx::FullTextSearch any time something changes +in the table. So the sequence probably will be + + $dbh->do('insert into the_table (id, data, other_fields) + values (?, ?, ?)', {}, $name, $data, $date_or_something); + $fts->index_document($name); + +When calling C, the id (name) of the record will be returned. If +the id in the_table is numeric, it's directly used as the internal +numeric id, otherwise a string's way of converting the id to numeric +form is used. + +When creating this index, you'll have to pass it three additionial options, +C, C, and C. You may use the optional +column_process option to pre-process data in the specified columns. + +=back + +The structure of DBIx::FullTextSearch is very flexible and adding new frontend +(what will be indexed) is very easy. 
+ +=head2 Backends + +While frontend specifies what is indexed and how the user sees the +collection of documents, backend is about low level database way of +actually storing the information in the tables. Three types are +available: + +=over 4 + +=item blob + +For each word, a blob holding list of all documents containing that word +is stored in the table, with the count (number of occurencies) +associated with each document number. That makes it for very compact +storage. Since the document names (for example URL) are internally +converted to numbers, storing and fetching the data is fast. However, +updating the information is very slow, since information concerning one +document is spread across all table, without any direct database access. +Updating a document (or merely reindexing it) requires update of all +blobs, which is slow. + +The list of documents is stored sorted by document name so that +fetching an information about a document for one word is relatively +easy, still a need to update (or at least scan) all records in the table +makes this storage unsuitable for collections of documents that often +change. + +=item column + +The B backend stores a word/document pair in database fields, +indexing both, thus allowing both fast retrieval and updates -- it's +easy to delete all records describing one document and insert new ones. +However, the database indexes that have to be maintained are large. + +Both B and B backends only store a count -- number of +occurencies of the word in the document (and even this can be switched +off, yielding just a yes/no information about the word's presence). +This allows questions like + + all documents containing words 'voda' or 'Mattoni' + but not a word 'kyselka' + +but you cannot ask whether a document contains a phrase 'kyselka +Mattoni' because such information is not maintained by these types of +backends. + +=item phrase + +To allow phrase matching, a B backend is available. 
For each word +and document number it stores a blob of lists of positions of the word +in the document. A query + + $fts->contains('kyselk* Mattoni'); + +then only returns those documents (document names/numbers) where word +kyselka (or kyselky, or so) is just before word Mattoni. + +=back + +=head2 Mixing frontends and backends + +Any frontend can be used with any backend in one DBIx::FullTextSearch index. You +can index Web documents with C frontend and C backend +to be able to find phrases in the documents. And you can use the +default, number based document scheme with C backend to use the disk +space as efficiently as possible -- this is usefull for example for +mailing-list archives, where we need to index huge number of documents +that do not change at all. + +Finding optimal combination is very important and may require some +analysis of the document collection and manipulation, as well as the +speed and storage requirements. Benchmarking on actual target platform +is very useful during the design phase. + +=head1 METHODS + +The following methods are available on the user side as DBIx::FullTextSearch API. + +=over 4 + +=item create + + my $fts = DBIx::FullTextSearch->create($dbh, $index_name, %opts); + +The class method C creates index of given name (the name of the +index is the name of its basic parameter table) and all necessary +tables, returns an object -- newly created index. The options that may +be specified after the index name define the frontend and backend types, +storage parameters (how many bits for what values), etc. See below for +list of create options and discussion of their use. + +=item open + + my $fts = DBIx::FullTextSearch->open($dbh, $index_name); + +Opens and returns object, accessing specifies DBIx::FullTextSearch index. 
Since all +the index parameters and information are stored in the C<$index_name> table +(including names of all other needed tables), the database handler and +the name of the parameter table are the only needed arguments. + +=item index_document + + $fts->index_document(45, 'Sleva pri nakupu stribra.'); + $fts->index_document('http://www.mozilla.org/'); + $fts->index_document('http://www.mozilla.org/','This is the mozilla web site'); + +For the C and C frontends, two arguments are expected -- the +name (number or string) of the document and its content. For C, +C, and C frontends the content is optional. Any content that you pass +will be appended to the content from the file, URL, or database table. + +=item delete_document + + $fts->delete_document('http://www.mozilla.org/'); + +Removes information about document from the index. Note that for C +backend this is very time consuming process. + +=item contains + + my @docs = $fts->contains('sleva', 'strib*'); + +Returns list of names (numbers or strings, depending on the frontend) +of documents that contain some of specified words. + +=item econtains + + my @docs = $fts->contains('foo', '+bar*', '-koo'); + +Econtains stands for extended contains and allows words to be prefixed +by plus or minus signs to specify that the word must or mustn't be +present in the document for it to match. + +=item search + + my @docs = $fts->search(qq{+"this is a phrase" -koo +bar foo}); + my @docs = $fts->search("(foo OR baz) AND (bar OR caz)"); + +This is a wrapper to econtains which takes a user input string and parses +it into can-include, must-include, and must-not-include words and phrases. +It also can handle boolean (AND/OR) queries. + +=item contains_hashref, econtains_hashref, search_hashref + +Similar to C, C and C, +only instead of list of document +names, these methods return a hash reference to a hash where keys are +the document names and values are the number of occurencies of the +words. 
+ +=item drop + +Removes all tables associated with the index, including the base +parameter table. Effectivelly destroying the index form the database. + + $fts->drop; + +=item empty + +Emptys the index so you can reindex the data. + + $fts->empty; + +=back + +=head1 INDEX OPTIONS + +Here we list the options that may be passed to C method. +These allow to specify the style and storage parameters in great detail. + +=over 4 + +=item backend + +The backend type, default C, possible values C, C and C +(see above for explanation). + +=item frontend + +The frontend type. The C frontend requires the user to specify +numeric id of the document together with the content of the document, +other possible values are C, C and C (see above for +more info). + +=item word_length + +Maximum length of words that may be indexed, default 30. + +=item data_table + +Name of the table where the actual data about word/document relation is +stored. By default, the name of the index (of the base table) with _data +suffix is used. + +=item name_length + +Any frontend that uses strings as names of documents needs to maintain +a conversion table from these names to internal integer ids. This value +specifies maximum length of these string names (URLs, file names, ...). + +=item blob_direct_fetch + +Only for C backend. When looking for information about specific +document in the list stored in the blob, the blob backend uses division +of interval to find the correct place in the blob. When the interval +gets equal or shorter that this value, all values are fetched from the +database and the final search is done in Perl code sequentially. + +=item word_id_bits + +With C or C backends, DBIx::FullTextSearch maintains a numeric id for each +word to optimize the space requirements. The word_id_bits parameter +specifies the number of bits to reserve for this conversion and thus +effectively limits number of distinct words that may be indexed. 
The +default is 16 bits and possible values are 8, 16, 24 or 32 bits. + +=item word_id_table + +Name of the table that holds conversion from words to their numeric id +(for C and C backends). By default is the name of the index +with _words suffix. + +=item doc_id_bits + +A number of bits to hold a numeric id of the document (that is either +provided by the user (with C frontend) or generated by the module +to accomplish the conversion from the string name of the document). This +value limits the maximum number of documents to hold. The default is 16 +bits and possible values are 8, 16 and 32 bits for C backend and 8, +16, 24 and 32 bits for C and C backends. + +=item doc_id_table + +Name of the table that holds conversion from string names of documents +to their numeric id, by default the name of the index with _docid +suffix. + +=item count_bits + +Number of bits reserved for storing number of occurencies of each word +in the document. The default is 8 and possible values are the same as +with doc_id_bits. + +=item position_bits + +With C, DBIx::FullTextSearch stores positions of each word of the +documents. This value specifies how much space should be reserved for +this purpose. The default is 32 bits and possible values are 8, 16 or 32 +bits. This value limits the maximum number of words of each document +that can be stored. + +=item index_splitter + +DBIx::FullTextSearch allows the user to provide any Perl code that will be used to +split the content of the document to words when indexing documents. +The code will be evalled inside of the DBIx::FullTextSearch code. The default is + + /(\w{2,$word_length})/g + +and shows that the input is stored in the variable C<$data> and the code +may access any other variable available in the perl_and_index_data_* +methods (see source), especially C<$word_length> to get the maximum length +of words and C<$backend> to get the backend object. 
+
+The default value also shows that by default, the minimum length of
+words indexed is 2.
+
+=item search_splitter
+
+This is similar to the C<index_splitter> method,
+except that it is used in the C<contains> method
+when searching for documents instead of when indexing documents. The default is
+
+ /(\w{2,$word_length}\*?)/g
+
+Which, unlike the default C<index_splitter>, allows for the wild card character (*).
+
+=item filter
+
+The output words of splitter (and also any parameter of (e)contains*
+methods) are sent to a filter that may do further processing. Filter is
+again a Perl code, the default is
+
+ map { lc $_ }
+
+showing that the filter operates on input list and by default does
+conversion to lowercase (yielding case insensitive index).
+
+=item init_env
+
+Because user defined splitter or filter may depend on other things that
+it is reasonable to set before the actual processing of words, you can
+use yet another Perl hook to set things up. The default is no initialization
+hook.
+
+=item stoplist
+
+This is the name of a L<DBIx::FullTextSearch::StopList> object that is used
+for stop words.
+
+=item stemmer
+
+If this option is set, then word stemming will be enabled in the indexing and searching.
+
+The value is the name of a L<Lingua::Stem> recognized locale.
+Currently, 'en', 'en-us' and 'en-uk' are the only recognized locales.
+All locale identifiers are converted to lowercase.
+
+=item table_name
+
+For C<table>
frontend; this is the name of the table that will be indexed.
+
+=item column_name
+
+For C<table>
frontend; this is a reference to an array of columns in the
+C<table_name> that contains the documents -- data to be indexed. It can
+also have a form table.column that will be used if the C<table_name>
+option is not specified.
+
+=item column_id_name
+
+For C<table>
frontend; this is the name of the field in C<table_name> that
+holds names (ids) of the records. If not specified, a field that has
+primary key on it is used. If this field is numeric, its values are
+directly used as identifiers, otherwise a conversion to numeric values
+is made.
+
+=back
+
+=head1 NOTES
+
+To handle internationalization, it may help to use the following in your code
+(for example Spanish in Chile):
+
+ use POSIX;
+ my $loc = POSIX::setlocale( &POSIX::LC_ALL, "es_CL" );
+
+I haven't tested this, so I would be interested in hearing whether this
+works.
+
+=head1 ERROR HANDLING
+
+The create and open methods return the DBIx::FullTextSearch object on success, upon
+failure they return undef and set error message in C<$DBIx::FullTextSearch::errstr>
+variable.
+
+All other methods return reasonable (documented above) value on success,
+failure is signalized by unreasonable (typically undef or null) return
+value; the error message may then be retrieved by C<$fts-E<gt>errstr> method
+call.
+
+=head1 VERSION
+
+This documentation describes DBIx::FullTextSearch module version 0.73.
+
+=head1 BUGS
+
+Error handling needs more polishing.
+
+We do not check if the stored values are larger than specified by the
+*_bits parameters.
+
+No CGI administration tool at the moment.
+
+No scoring algorithm implemented.
+
+=head1 DEVELOPMENT
+
+These modules are under active development.
+If you would like to contribute, please e-mail tjmather@maxmind.com
+
+There are two mailing lists for this module, one for users, and another for developers. To subscribe,
+visit http://sourceforge.net/mail/?group_id=8645
+
+=head1 AUTHOR
+
+(Original) Jan Pazdziora, adelton@fi.muni.cz,
+http://www.fi.muni.cz/~adelton/ at Faculty of Informatics, Masaryk University in Brno, Czech
+Republic
+
+(Current Maintainer) T.J. Mather, tjmather@maxmind.com,
+http://www.maxmind.com/app/opensourceservices Princeton, NJ USA
+
+Paid support is available directly from the maintainers of this package.
+Please see L for more details. + +=head1 CREDITS + +Fixes, Bug Reports, Docs have been generously provided by: + + Vladimir Bogdanov + Ade Olonoh + Kate Pugh + Sven Paulus + Andrew Turner + Tom Bille + Joern Reder + Tarik Alkasab + Dan Collis Puro + Tony Bowden + Mario Minati + Miroslav Suchý + Stephen Patterson + Joern Reder + Hans Poo + +Of course, big thanks to Jan Pazdziora, the original author of this +module. Especially for providing a clean, modular code base! + +=head1 COPYRIGHT + +All rights reserved. This package is free software; you can +redistribute it and/or modify it under the same terms as Perl itself. + +=head1 SEE ALSO + +L, +L + +=head1 OTHER PRODUCTS and why I've written this module + +I'm aware of L and L +modules and about UdmSearch utility, and +about htdig and glimpse on the non-database side of the world. + +To me, using a database gives reasonable maintenance benefits. With +products that use their own files to store the information (even if the +storage algorithms are efficient and well thought of), you always +struggle with permissions on files and directories for various users, +with files that somebody accidently deleted or mungled, and making the +index available remotely is not trivial. + +That's why I've wanted a module that will use a database as a storage +backend. With MySQL, you get remote access and access control for free, +and on many web servers MySQL is part of the standard equipment. So +using it for text indexes seemed natural. + +However, existing L and UdmSearch are too narrow-aimed to +me. The first only supports indexing of data that is stored in the +database, but you may not always want or need to store the documents in +the database as well. The UdmSearch on the other hand is only for web +documents, making it unsuitable for indexing mailing-list archives or +local data. + +I believe that DBIx::FullTextSearch is reasonably flexible and still very +efficient. 
It doesn't enforce its own idea of what is good for you -- +the number of options is big and you can always extend the module with +your own backend of frontend if you feel that those provided are not +sufficient. Or you can extend existing by adding one or two parameters +that will add new features. Of course, patches are always welcome. +DBIx::FullTextSearch is a tool that can be deployed in many projects. It's not +a complete environment since different people have different needs. On +the other hand, the methods that it provides make it easy to build +a complete solution on top of this in very short course of time. + +=cut + diff --git a/lib/DBIx/FullTextSearch/Blob.pm b/lib/DBIx/FullTextSearch/Blob.pm new file mode 100644 index 000000000..445f60897 --- /dev/null +++ b/lib/DBIx/FullTextSearch/Blob.pm @@ -0,0 +1,263 @@ + +package DBIx::FullTextSearch::Blob; +use strict; + +# Open in the backend just sets the object +sub open { + my ($class, $fts) = @_; + return bless { 'fts' => $fts }, $class; +} +# Create creates the table(s) according to the parameters +sub _create_tables { + my ($class, $fts) = @_; + my $CREATE_DATA = <{'data_table'} ( + word varchar($fts->{'word_length'}) binary + default '' not null, + idx longblob default '' not null, + primary key (word) + ) +EOF + my $dbh = $fts->{'dbh'}; + $dbh->do($CREATE_DATA) or return $dbh->errstr; + push @{$fts->{'created_tables'}}, $fts->{'data_table'}; + return; +} + +sub add_document { + my ($self, $id, $words) = @_; + my $fts = $self->{'fts'}; + my $dbh = $fts->{'dbh'}; + my $data_table = $fts->{'data_table'}; + + my $update_sth = ( defined $self->{'adding_update_sth'} + ? $self->{'adding_update_sth'} + : $self->{'adding_update_sth'} = $dbh->prepare( + "update $data_table set idx = concat(idx, ?) + where word = ?") ); + + my @insert_values; + + my $packstring = $DBIx::FullTextSearch::BITS_TO_PACK{$fts->{'doc_id_bits'}} + . 
$DBIx::FullTextSearch::BITS_TO_PACK{$fts->{'count_bits'}}; + my $num_words = 0; + for my $word ( keys %$words ) { +### print STDERR "$word($id) adding\n"; + # here we will want to parametrize the bit size of the + # data + my $value = pack $packstring, $id, $words->{$word}; + my $rows = $update_sth->execute($value, $word); + push @insert_values, $word, $value if $rows == 0; + $num_words += $words->{$word}; + } + + if(@insert_values){ + my $sql_str = "insert into $data_table values ". join(',', ('(?, ?)') x (@insert_values/2)); + $dbh->do($sql_str,{},@insert_values); + } + + return $num_words; +} + +sub delete_document { + my $self = shift; + for my $id (@_) { $self->update_document($id, {}); } +} + +sub update_document { + my ($self, $id, $words) = @_; + my $fts = $self->{'fts'}; + my $dbh = $fts->{'dbh'}; + my $data_table = $fts->{'data_table'}; + + my $insert_sth = ( defined $self->{'insert_sth'} + ? $self->{'insert_sth'} + : $self->{'insert_sth'} = $dbh->prepare(" + insert into $data_table values (?, ?)") ); + + my $update_sth = ( defined $self->{'update_update_sth'} + ? $self->{'update_update_sth'} + : $self->{'update_update_sth'} = + $dbh->prepare("update $data_table set idx = + concat(substring(idx, 1, ?), ?, substring(idx, ?)) + where word = ?") ); + + + my @insert_values; + + $dbh->do("lock tables $data_table write"); + + my $select_sth = $dbh->prepare("select word from $data_table"); + $select_sth->execute; + + my $packstring = $DBIx::FullTextSearch::BITS_TO_PACK{$fts->{'doc_id_bits'}} + . $DBIx::FullTextSearch::BITS_TO_PACK{$fts->{'count_bits'}}; + my ($packnulls) = pack $packstring, 0, 0; + my $packlength = length $packnulls; + my $num_words = 0; + while (my ($word) = $select_sth->fetchrow_array) { + my $value = (defined $words->{$word} ? 
+ pack($packstring, $id, $words->{$word}) : ''); + + # the method find_position finds the position of the + # "record" for document $id with word $word; returned is + # the position in bytes and yes/no values specifying if + # the record is already present in the blob; if it is, + # we need to replace it, otherwise just insert. + + my ($pos, $shift) = $self->find_position($word, $id); + if (not defined $pos) { + push @insert_values, $word, $value; + } + else { + my $spos = $pos + 1; # I'm not sure why this + $spos += $packlength if $shift; + $update_sth->execute($pos, $value, $spos, $word); + } + delete $words->{$word}; + $num_words++ if defined $value; + } + + for my $word ( keys %$words ) { + my $value = pack $packstring, $id, $words->{$word}; + push @insert_values, $word, $value; +# $insert_sth->execute($word, $value); + $num_words++; + } + + if(@insert_values){ + my $sql_str = "insert into $data_table values ". join(',', ('(?, ?)') x (@insert_values/2)); + $dbh->do($sql_str,{},@insert_values); + } + + $dbh->do("unlock tables"); + + return $num_words; +} + +sub find_position { + my ($self, $word, $id) = @_; + # here, with the calculation of where in the blob we have the + # docid and where the count of words and how long they are, we + # should really look at the parameters (num of bits of various + # structures and values) given to create + + my $fts = $self->{'fts'}; + my $dbh = $fts->{'dbh'}; + my $data_table = $fts->{'data_table'}; + + # Sth to read the length of the blob holding the document/count info + my $get_length_sth = ( defined $self->{'get_length_sth'} + ? $self->{'get_length_sth'} + : $self->{'get_length_sth'} = $dbh->prepare("select + length(idx) from $data_table where word = ?")); + my $length = $dbh->selectrow_array($get_length_sth, {}, $word); + + my $packstring = $DBIx::FullTextSearch::BITS_TO_PACK{$fts->{'doc_id_bits'}} + . 
$DBIx::FullTextSearch::BITS_TO_PACK{$fts->{'count_bits'}}; + my ($packnulls) = pack $packstring, 0, 0; + my $packlength = length $packnulls; + + if (not defined $length) { return; } + $length = int($length/$packlength); + + my ($bot, $top, $med, $val) = (0, $length); + + if (not defined $fts->{'max_doc_id'}) + { $med = int(($top - $bot) / 2); } + else + { $med = int($top * $id / $fts->{'max_doc_id'}); } + + my $blob_direct_fetch = $fts->{'blob_direct_fetch'}; + # we divide the interval + while ($bot != $top) { + $med = $top - 1 if $med >= $top; + $med = $bot if $med < $bot; + + if ($top - $bot <= $blob_direct_fetch) { + my $get_interval_sth = ( + defined $self->{'get_interval_sth'} + ? $self->{'get_interval_sth'} + : $self->{'get_interval_sth'} = $dbh->prepare("select substring(idx,?,?) from $data_table where word = ?")); + my $alldata = $dbh->selectrow_array($get_interval_sth, + {}, + $bot * $packlength + 1, + ($top - $bot) * $packlength, + $word); + return unless defined $alldata; + + my @docs; + my $i = 0; + while ($i < length $alldata) { + push @docs, unpack $packstring, + substr $alldata, $i, $packlength; + $i += $packlength; + } + for (my $i = 0; $i < @docs; $i += 2) { + if ($docs[$i] == $id) { return (($bot+($i/2))*$packlength, 1); } + if ($docs[$i] > $id) { return (($bot+($i/2))*$packlength, 0); } + } + return ($top * $packlength, 0); + } + ($val) = $dbh->selectrow_array( + "select substring(idx, ?, 2) from $data_table + where word = ?", {}, ($med * $packlength) + 1, $word); + ($val) = unpack $packstring, $val; + + if (not defined $val) { return; } + if ($val == $id) { return ($med * $packlength, 1); } + + elsif ($val < $id) { $bot = $med + 1; } + else { $top = $med; } + + $med = int($med * $id / $val); + } + return ($bot * $packlength, 0); +} + +sub contains_hashref { + my $self = shift; + my $fts = $self->{'fts'}; + my $dbh = $fts->{'dbh'}; + my $data_table = $fts->{'data_table'}; + + my $packstring = 
$DBIx::FullTextSearch::BITS_TO_PACK{$fts->{'doc_id_bits'}} + . $DBIx::FullTextSearch::BITS_TO_PACK{$fts->{'count_bits'}}; + my ($packnulls) = pack $packstring, 0, 0; + my $packlength = length $packnulls; + + my $sth = ( defined $self->{'get_idx_sth'} ? + $self->{'get_idx_sth'} : + $self->{'get_idx_sth'} = + $dbh->prepare( + "select idx from $data_table where word like ?" + )); + + my $out = {}; + for my $word (@_) { + $sth->execute($word); + while (my ($blob) = $sth->fetchrow_array) { + next unless defined $blob; + my @data; + my $i = 0; + while ($i < length $blob) { + push @data, unpack $packstring, + substr $blob, $i, $packlength; + $i += $packlength; + } + while (@data) { + my $doc = shift @data; + my $count = shift @data; + unless (defined $out->{$doc}) { $out->{$doc} = 0; } + $out->{$doc} += $count; + } + } + $sth->finish; + } + $out; +} + +*parse_and_index_data = \&DBIx::FullTextSearch::parse_and_index_data_count; + +1; + diff --git a/lib/DBIx/FullTextSearch/BlobFast.pm b/lib/DBIx/FullTextSearch/BlobFast.pm new file mode 100644 index 000000000..3494bd21a --- /dev/null +++ b/lib/DBIx/FullTextSearch/BlobFast.pm @@ -0,0 +1,83 @@ + +package DBIx::FullTextSearch::BlobFast; +use DBIx::FullTextSearch::Blob; +use vars qw! @ISA !; +@ISA = qw! DBIx::FullTextSearch::Blob !; +use strict; + +sub delete_document { + my $self = shift; + my $fts = $self->{'fts'}; + my $dbh = $fts->{'dbh'}; + my $data_table = $fts->{'data_table'}; + + my $packstring = $DBIx::FullTextSearch::BITS_TO_PACK{$fts->{'doc_id_bits'}} + . $DBIx::FullTextSearch::BITS_TO_PACK{$fts->{'count_bits'}}; + my $value = ''; + for my $id (@_) { + $value .= pack $packstring, $id, 0; + } + $dbh->do(" + update $data_table set idx = concat(idx, ?) 
+ ", {}, $value); + 1; +} + +sub update_document { + my $self = shift; + $self->delete_document($_[0]); + $self->add_document(@_); +} + +sub contains_hashref { + my $self = shift; + my $fts = $self->{'fts'}; + my $dbh = $fts->{'dbh'}; + my $data_table = $fts->{'data_table'}; + + my $packstring = $DBIx::FullTextSearch::BITS_TO_PACK{$fts->{'doc_id_bits'}} + . $DBIx::FullTextSearch::BITS_TO_PACK{$fts->{'count_bits'}}; + my ($packnulls) = pack $packstring, 0, 0; + my $packlength = length $packnulls; + + my $sth = ( defined $self->{'get_idx_sth'} ? + $self->{'get_idx_sth'} : + $self->{'get_idx_sth'} = + $dbh->prepare( + "select idx from $data_table where word like ?" + )); + + my $out = {}; + for my $word (@_) { + $sth->execute($word); + while (my ($blob) = $sth->fetchrow_array) { + next unless defined $blob; + my %docs = (); + my @data; + my $i = length($blob) - $packlength; + while ($i >= 0) { + my ($doc_id, $count) = + unpack "\@$i$packstring", $blob; +### print STDERR "$doc_id $count\n"; + $i -= $packlength; + next if exists $docs{$doc_id}; + $docs{$doc_id} = 1; + next unless $count; + push @data, $doc_id, $count; + } + while (@data) { + my $doc = shift @data; + my $count = shift @data; + unless (defined $out->{$doc}) { $out->{$doc} = 0; } + $out->{$doc} += $count; + } + } + $sth->finish; + } + $out; +} + +*parse_and_index_data = \&DBIx::FullTextSearch::parse_and_index_data_count; + +1; + diff --git a/lib/DBIx/FullTextSearch/Column.pm b/lib/DBIx/FullTextSearch/Column.pm new file mode 100644 index 000000000..bc8b41e95 --- /dev/null +++ b/lib/DBIx/FullTextSearch/Column.pm @@ -0,0 +1,188 @@ + +package DBIx::FullTextSearch::Column; +use strict; + +# Open in the backend just sets the object +sub open { + my ($class, $fts) = @_; + return bless { 'fts' => $fts }, $class; +} + +sub DESTROY { + my ($self) = @_; + if (defined $self->{'select_wordid_sth'}) { + $self->{'select_wordid_sth'}->finish(); + } +} + +# Create creates the table(s) according to the parameters +sub 
_create_tables { + my ($class, $fts) = @_; + my $COUNT_FIELD = ''; + if ($fts->{'count_bits'}) { + $COUNT_FIELD = "count $DBIx::FullTextSearch::BITS_TO_INT{$fts->{'count_bits'}} unsigned," + } + my $CREATE_DATA = <{'data_table'} ( + word_id $DBIx::FullTextSearch::BITS_TO_INT{$fts->{'word_id_bits'}} unsigned not null, + doc_id $DBIx::FullTextSearch::BITS_TO_INT{$fts->{'doc_id_bits'}} unsigned not null, + $COUNT_FIELD + index (word_id), + index (doc_id) + ) +EOF + + $fts->{'word_id_table'} = $fts->{'table'}.'_words' + unless defined $fts->{'word_id_table'}; + + + my $CREATE_WORD_ID = <{'word_id_table'} ( + word varchar($fts->{'word_length'}) binary + default '' not null, + id $DBIx::FullTextSearch::BITS_TO_INT{$fts->{'word_id_bits'}} unsigned not null auto_increment, + primary key (id), + unique (word) + ) +EOF + + my $dbh = $fts->{'dbh'}; + $dbh->do($CREATE_DATA) or return $dbh->errstr; + push @{$fts->{'created_tables'}}, $fts->{'data_table'}; + $dbh->do($CREATE_WORD_ID) or return $dbh->errstr; + push @{$fts->{'created_tables'}}, $fts->{'word_id_table'}; + return; +} + +sub add_document { + my ($self, $id, $words) = @_; + my $fts = $self->{'fts'}; + my $dbh = $fts->{'dbh'}; + my $word_id_table = $fts->{'word_id_table'}; + if (not defined $self->{'select_wordid_sth'}) { + $self->{'select_wordid_sth'} = $dbh->prepare(" + select id from $word_id_table where word = ? 
+ "); + } + my $data_table = $fts->{'data_table'}; + my $count_bits = $fts->{'count_bits'}; + my $num_words = 0; + my (@wids,@data,@widshandler,@datahandler); + my $wordid; + $dbh->do("lock tables $word_id_table write"); + my ($maxid) = $dbh->selectrow_array("select max(id) + from $word_id_table"); + foreach my $word (keys %$words) { + if(!defined $self->{'wordids'}->{$word}) { + $self->{'select_wordid_sth'}->execute($word); + ($wordid) = $self->{'select_wordid_sth'}->fetchrow_array(); + unless ($wordid) { + $maxid++; + push @widshandler, "(?,$maxid)"; + push @wids, $word; + $wordid = $maxid; + } + $self->{'wordids'}->{$word} = $wordid; + } else { + $wordid=$self->{'wordids'}->{$word}; + } + if ($count_bits) { + push @datahandler, "($wordid,$id,?)"; + push @data, $words->{$word}; + } else { + push @datahandler, "($wordid,$id)"; + } + $num_words++; + }; + $dbh->do("insert into $word_id_table values " . + join (',',@widshandler),undef,@wids) if @wids; + $dbh->do("unlock tables"); + if ($count_bits) { + $dbh->do("insert into $data_table values " . join (',',@datahandler),undef,@data) if @data; + } else { + $dbh->do("insert into $data_table values " . join (',',@datahandler)) if @datahandler; + } + return $num_words; +} + +sub delete_document { + my $self = shift; + my $fts = $self->{'fts'}; + my $dbh = $fts->{'dbh'}; + my $data_table = $fts->{'data_table'}; + my $sth = $dbh->prepare("delete from $data_table where doc_id = ?"); + for my $id (@_) { $sth->execute($id); } +} + +sub update_document { + my ($self, $id, $words) = @_; + $self->delete_document($id); + $self->add_document($id, $words); +} + +sub contains_hashref { + my $self = shift; + my $fts = $self->{'fts'}; + my $dbh = $fts->{'dbh'}; + my $data_table = $fts->{'data_table'}; + my $word_id_table = $fts->{'word_id_table'}; + + my $count_bits = $fts->{'count_bits'}; + my $sth = ( defined $self->{'get_data_sth'} + ? $self->{'get_data_sth'} + : ( $count_bits + ? 
( $self->{'get_data_sth'} = $dbh->prepare( + "select doc_id, count + from $data_table, $word_id_table + where word like ? + and id = word_id" ) ) + : ( $self->{'get_data_sth'} = $dbh->prepare( + "select doc_id, 1 + from $data_table, $word_id_table + where word like ? + and id = word_id" ) ) + ) ); + + my $out = {}; + for my $word (@_) { + $sth->execute($word); + while (my ($doc, $count) = $sth->fetchrow_array) { + $out->{$doc} += $count; + } + $sth->finish; + } + $out; +} + +sub common_word { + my ($self, $k) = @_; + my $fts = $self->{'fts'}; + my $dbh = $fts->{'dbh'}; + + my $num = $fts->document_count; + + $k /= 100; + + my $SQL = <{'data_table'} + group by word_id + having k >= ? +EOF + my $ary_ref = $dbh->selectcol_arrayref($SQL, {}, $num, $k); + return unless @$ary_ref; + + my $QUESTION_MARKS = join ',', ('?') x scalar(@$ary_ref); + + $SQL = <{'word_id_table'} + where id IN ($QUESTION_MARKS) +EOF + return $dbh->selectcol_arrayref($SQL, {}, @$ary_ref); +} + +*parse_and_index_data = \&DBIx::FullTextSearch::parse_and_index_data_count; + +1; + diff --git a/lib/DBIx/FullTextSearch/File.pm b/lib/DBIx/FullTextSearch/File.pm new file mode 100644 index 000000000..80e844714 --- /dev/null +++ b/lib/DBIx/FullTextSearch/File.pm @@ -0,0 +1,27 @@ + +package DBIx::FullTextSearch::File; +use DBIx::FullTextSearch::String; +use strict; +use vars qw! @ISA !; +@ISA = qw! 
DBIx::FullTextSearch::String !; + +sub index_document { + my ($self, $file, $extra_data) = @_; + my $dbh = $self->{'dbh'}; + + open FILE, $file or do { + $self->{'errstr'} = "Reading the file `$file' failed: $!"; + return; + }; + my $data; + { + local $/ = undef; + $data = ; + } + $data .= " $extra_data" if $extra_data; + close FILE; + $self->SUPER::index_document($file, $data); +} + +1; + diff --git a/lib/DBIx/FullTextSearch/Phrase.pm b/lib/DBIx/FullTextSearch/Phrase.pm new file mode 100644 index 000000000..8d1c27249 --- /dev/null +++ b/lib/DBIx/FullTextSearch/Phrase.pm @@ -0,0 +1,182 @@ +package DBIx::FullTextSearch::Phrase; +use strict; +use DBIx::FullTextSearch::Column; +use vars qw! @ISA !; +@ISA = qw! DBIx::FullTextSearch::Column !; + +# Open in the backend just sets the object +sub open { + my ($class, $fts) = @_; + return bless { 'fts' => $fts }, $class; +} + +sub DESTROY { + my ($self) = @_; + if (defined $self->{'select_wordid_sth'}) { + $self->{'select_wordid_sth'}->finish(); + } +} + + +# Create creates the table(s) according to the parameters +sub _create_tables { + my ($class, $fts) = @_; + my $COUNT_FIELD = ''; + + my $CREATE_DATA = <{'data_table'} ( + word_id $DBIx::FullTextSearch::BITS_TO_INT{$fts->{'word_id_bits'}} unsigned not null, + doc_id $DBIx::FullTextSearch::BITS_TO_INT{$fts->{'doc_id_bits'}} unsigned not null, + idx longblob default '' not null, + index (word_id), + index (doc_id) + ) +EOF + + $fts->{'word_id_table'} = $fts->{'table'}.'_words' + unless defined $fts->{'word_id_table'}; + + + my $CREATE_WORD_ID = <{'word_id_table'} ( + word varchar($fts->{'word_length'}) binary + default '' not null, + id $DBIx::FullTextSearch::BITS_TO_INT{$fts->{'word_id_bits'}} unsigned not null auto_increment, + primary key (id), + unique (word) + ) +EOF + + my $dbh = $fts->{'dbh'}; + $dbh->do($CREATE_DATA) or return $dbh->errstr; + push @{$fts->{'created_tables'}}, $fts->{'data_table'}; + $dbh->do($CREATE_WORD_ID) or return $dbh->errstr; + push 
@{$fts->{'created_tables'}}, $fts->{'word_id_table'}; + return; +} + +sub add_document { + my ($self, $id, $words) = @_; + # here the value in the %$words hash is an array of word + # positions + my $fts = $self->{'fts'}; + my $dbh = $fts->{'dbh'}; + my $word_id_table = $fts->{'word_id_table'}; + if (not defined $self->{'select_wordid_sth'}) { + $self->{'select_wordid_sth'} = $dbh->prepare(" + select id from $word_id_table where word = ? + "); + } + my $data_table = $fts->{'data_table'}; + my $packstring = $DBIx::FullTextSearch::BITS_TO_PACK{$fts->{'position_bits'}}; + my $num_words = 0; + my (@wids,@data,@widshandler,@datahandler); + my $wordid; + $dbh->do("lock tables $word_id_table write"); + my ($maxid) = $dbh->selectrow_array("select max(id) + from $word_id_table"); + foreach my $word (keys %$words) { + if(!defined $self->{'wordids'}->{$word}) { + $self->{'select_wordid_sth'}->execute($word); + ($wordid) = $self->{'select_wordid_sth'}->fetchrow_array(); + unless ($wordid) { + $maxid++; + push @widshandler, "(?,$maxid)"; + push @wids, $word; + $wordid = $maxid; + } + $self->{'wordids'}->{$word} = $wordid; + } else { + $wordid=$self->{'wordids'}->{$word}; + } + push @datahandler, "($wordid,$id,?)"; + push @data, pack $packstring.'*', @{$words->{$word}}; + $num_words++; + }; + $dbh->do("insert into $word_id_table values " . + join (',',@widshandler),undef,@wids) if @wids; + $dbh->do("unlock tables"); + $dbh->do("insert into $data_table values " . 
+ join (',',@datahandler),undef,@data) if @data; + return $num_words; +} + +sub update_document { + my ($self, $id, $words) = @_; + my $fts = $self->{'fts'}; + my $dbh = $fts->{'dbh'}; + my $data_table = $fts->{'data_table'}; + $dbh->do("delete from $data_table where doc_id = ?", {}, $id); + + $self->add_document($id, $words); +} + +sub contains_hashref { + my $self = shift; + my $fts = $self->{'fts'}; + my $dbh = $fts->{'dbh'}; + my $data_table = $fts->{'data_table'}; + my $word_id_table = $fts->{'word_id_table'}; + + my $packstring = $DBIx::FullTextSearch::BITS_TO_PACK{$fts->{'position_bits'}}; + + my $SQL = qq{ + select doc_id, idx + from $data_table, $word_id_table + where word like ? + and id = word_id +}; + my $out = {}; + + for my $phrase (@_){ + + my @words = split(' ', $phrase); + + my @sths; + for (my $i = 0; $i < @words; $i++) { + $sths[$i] = $dbh->prepare($SQL); + $sths[$i]->execute($words[$i]); + } + + my %prev_pos = (); + my %cur_pos = (); + + # iterate through words in phrase + for (my $i = 0; $i < @words; $i++){ + if($i > 0){ + %prev_pos = %cur_pos; + %cur_pos = (); + } + # get docs that have this word + while (my ($doc, $data) = $sths[$i]->fetchrow_array){ + # get positions of words in doc + my @positions = unpack $packstring.'*', $data; + map { $cur_pos{$doc}->{$_} = 1 } @positions; + } + if($i > 0){ + # check to see if word $i comes after word $i-1 + for my $doc (keys %cur_pos){ + my $isPhrase = 0; + for my $position (keys %{$cur_pos{$doc}}){ + if ($position > 0 && exists $prev_pos{$doc}{$position - 1}){ + $isPhrase = 1; + } else { + delete $cur_pos{$doc}{$position}; + } + } + delete $cur_pos{$doc} unless $isPhrase; + } + } + } + + for my $doc (keys %cur_pos){ + my @positions = keys %{$cur_pos{$doc}}; + $out->{$doc} += scalar (@positions); + } + } + return $out; +} + +*parse_and_index_data = \&DBIx::FullTextSearch::parse_and_index_data_list; + +1; diff --git a/lib/DBIx/FullTextSearch/StopList.pm b/lib/DBIx/FullTextSearch/StopList.pm new file mode 
100644 index 000000000..b27ab98a5 --- /dev/null +++ b/lib/DBIx/FullTextSearch/StopList.pm @@ -0,0 +1,271 @@ +package DBIx::FullTextSearch::StopList; +use strict; + +use Carp; + +sub create_default { + my ($class, $dbh, $TABLE, $language) = @_; + + croak("Error: no language specified") unless $language; + + $language = lc $language; + + my @stopList; + + if($language eq 'english'){ + @stopList = qw/ a about after all also an and any are as at be because been but by can co corp could for from had has have he her his if in inc into is it its last more most mr mrs ms mz no not of on one only or other out over s says she so some such than that the their there they this to up was we were when which who will with would /; + } elsif ($language eq 'czech'){ + @stopList = qw/ a aby ale ani až bude by byl byla bylo být co či další do i jak jako je jeho jejich jen ještě již jsem jsme jsou k kde když korun která které který kteří let mezi má může na nebo není než o od pak po podle pouze pro proti první před při roce roku řekl s se si své tak také tedy to tom tím u už v ve však z za ze že/; + } elsif ($language eq 'danish'){ + @stopList = qw/ af aldrig alle altid bagved De de der du efter eller en endnu et fĺ fjernt for foran fra gennem god han her hos hovfor hun hurtig hvad hvem hvonĺr hvor hvordan hvorhen I i imod ja jeg langsom lidt mange mĺske med meget mellem mere mindre nĺr nede nej nok nu og oppe pĺ rask sammen temmelig til uden udenfor under ved vi /; + } elsif ($language eq 'dutch'){ + @stopList = qw/ aan aangaande aangezien achter achterna afgelopen al aldaar aldus alhoewel alias alle allebei alleen alsnog altijd altoos ander andere anders anderszins behalve behoudens beide beiden ben beneden bent bepaald betreffende bij binnen binnenin boven bovenal bovendien bovengenoemd bovenstaand bovenvermeld buiten daar daarheen daarin daarna daarnet daarom daarop daarvanlangs dan dat de die dikwijls dit door doorgaand dus echter eer eerdat eerder eerlang eerst elk elke en enig 
enigszins enkel er erdoor even eveneens evenwel gauw gedurende geen gehad gekund geleden gelijk gemoeten gemogen geweest gewoon gewoonweg haar had hadden hare heb hebben hebt heeft hem hen het hierbeneden hierboven hij hoe hoewel hun hunne ik ikzelf in inmiddels inzake is jezelf jij jijzelf jou jouw jouwe juist jullie kan klaar kon konden krachtens kunnen kunt later liever maar mag meer met mezelf mij mijn mijnent mijner mijzelf misschien mocht mochten moest moesten moet moeten mogen na naar nadat net niet noch nog nogal nu of ofschoon om omdat omhoog omlaag omstreeks omtrent omver onder ondertussen ongeveer ons onszelf onze ook op opnieuw opzij over overeind overigens pas precies reeds rond rondom sedert sinds sindsdien slechts sommige spoedig steeds tamelijk tenzij terwijl thans tijdens toch toen toenmaals toenmalig tot totdat tussen uit uitgezonderd vaak van vandaan vanuit vanwege veeleer verder vervolgens vol volgens voor vooraf vooral vooralsnog voorbij voordat voordezen voordien voorheen voorop vooruit vrij vroeg waar waarom wanneer want waren was wat weer weg wegens wel weldra welk welke wie wiens wier wij wijzelf zal ze zelfs zichzelf zij zijn zijne zo zodra zonder zou zouden zowat zulke zullen zult /; + } elsif ($language eq 'finnish'){ + @stopList = qw/ aina alla ansiosta ehkä ei enemmän ennen etessa haikki hän he hitaasti hoikein hyvin ilman ja jälkeen jos kanssa kaukana kenties keskellä kesken koskaan kuinkan kukka kyllä kylliksi lähellä läpi liian lla lla luona me mikä miksi milloin milloinkan minä missä miten nopeasti nyt oikea oikealla paljon siellä sinä ssa sta suoraan tai takana takia tarpeeksi tässä te ulkopuolella vähän vahemmän vasen vasenmalla vastan vielä vieressä yhdessä ylös /; + } elsif ($language eq 'french'){ + @stopList = qw/ a ŕ afin ailleurs ainsi alors aprčs attendant au aucun aucune au-dessous au-dessus auprčs auquel aussi aussitôt autant autour aux auxquelles auxquels avec beaucoup ça ce ceci cela celle celles celui cependant 
certain certaine certaines certains ces cet cette ceux chacun chacune chaque chez combien comme comment concernant dans de dedans dehors déjŕ delŕ depuis des dčs desquelles desquels dessus donc donné dont du duquel durant elle elles en encore entre et étaient était étant etc eux furent grâce hormis hors ici il ils jadis je jusqu jusque la lŕ laquelle le lequel les lesquelles lesquels leur leurs lors lorsque lui ma mais malgré me męme męmes mes mien mienne miennes miens moins moment mon moyennant ne ni non nos notamment notre nôtre notres nôtres nous nulle nulles on ou oů par parce parmi plus plusieurs pour pourquoi prčs puis puisque quand quant que quel quelle quelque quelques-unes quelques-uns quelqu''un quelqu''une quels qui quiconque quoi quoique sa sans sauf se selon ses sien sienne siennes siens soi soi-męme soit sont suis sur ta tandis tant te telle telles tes tienne tiennes tiens toi ton toujours tous toute toutes trčs trop tu un une vos votre vôtre vôtres vous vu y /; + } elsif ($language eq 'german'){ + @stopList = qw/ ab aber allein als also am an auch auf aus außer bald bei beim bin bis bißchen bist da dabei dadurch dafür dagegen dahinter damit danach daneben dann daran darauf daraus darin darüber darum darunter das daß dasselbe davon davor dazu dazwischen dein deine deinem deinen deiner deines dem demselben den denn der derselben des desselben dessen dich die dies diese dieselbe dieselben diesem diesen dieser dieses dir doch dort du ebenso ehe ein eine einem einen einer eines entlang er es etwa etwas euch euer eure eurem euren eurer eures für fürs ganz gar gegen genau gewesen her herein herum hin hinter hintern ich ihm ihn Ihnen ihnen ihr Ihre ihre Ihrem ihrem Ihren ihren Ihrer ihrer Ihres ihres im in ist ja je jedesmal jedoch jene jenem jenen jener jenes kaum kein keine keinem keinen keiner keines man mehr mein meine meinem meinen meiner meines mich mir mit nach nachdem nämlich neben nein nicht nichts noch nun nur ob ober obgleich oder ohne paar sehr 
sei sein seine seinem seinen seiner seines seit seitdem selbst sich Sie sie sind so sogar solch solche solchem solchen solcher solches sondern sonst soviel soweit über um und uns unser unsre unsrem unsren unsrer unsres vom von vor während war wäre wären warum was wegen weil weit welche welchem welchen welcher welches wem wen wenn wer weshalb wessen wie wir wo womit zu zum zur zwar zwischen zwischens /; + } elsif ($language eq 'italian'){ + @stopList = qw/ a affinchč agl'' agli ai al all'' alla alle allo anzichč avere bensě che chi cioč come comunque con contro cosa da dachč dagl'' dagli dai dal dall'' dalla dalle dallo degl'' degli dei del dell'' delle dello di dopo dove dunque durante e egli eppure essere essi finché fino fra giacchč gl'' gli grazie I il in inoltre io l'' la le lo loro ma mentre mio ne neanche negl'' negli nei nel nell'' nella nelle nello nemmeno neppure noi nonchč nondimeno nostro o onde oppure ossia ovvero per perchč perciň perň poichč prima purchč quand''anche quando quantunque quasi quindi se sebbene sennonchč senza seppure si siccome sopra sotto su subito sugl'' sugli sui sul sull'' sulla sulle sullo suo talchč tu tuo tuttavia tutti un una uno voi vostr/; + } elsif ($language eq 'portuguese'){ + @stopList = qw/ a abaixo adiante agora ali antes aqui até atras bastante bem com como contra debaixo demais depois depressa devagar direito e ela elas ęle eles em entre eu fora junto longe mais menos muito năo ninguem nós nunca onde ou para por porque pouco próximo qual quando quanto que quem se sem sempre sim sob sobre talvez todas todos vagarosamente vocę vocęs /; + } elsif ($language eq 'spanish'){ + @stopList = qw/ a acá ahí ajena ajenas ajeno ajenos al algo algún alguna algunas alguno algunos allá allí aquel aquella aquellas aquello aquellos aquí cada cierta ciertas cierto ciertos como cómo con conmigo consigo contigo cualquier cualquiera cualquieras cuan cuán cuanta cuánta cuantas cuántas cuanto cuánto cuantos cuántos de dejar del demás 
demasiada demasiadas demasiado demasiados el él ella ellas ellos esa esas ese esos esta estar estas este estos hacer hasta jamás junto juntos la las lo los mas más me menos mía mientras mío misma mismas mismo mismos mucha muchas muchísima muchísimas muchísimo muchísimos mucho muchos muy nada ni ninguna ningunas ninguno ningunos no nos nosotras nosotros nuestra nuestras nuestro nuestros nunca os otra otras otro otros para parecer poca pocas poco pocos por porque que qué querer quien quién quienes quienesquiera quienquiera ser si sí siempre sín Sr Sra Sres Sta suya suyas suyo suyos tal tales tan tanta tantas tanto tantos te tener ti toda todas todo todos tomar tú tuya tuyo un una unas unos usted ustedes varias varios vosotras vosotros vuestra vuestras vuestro vuestros y yo /; + } elsif ($language eq 'swedish'){ + @stopList = qw/ ab aldrig all alla alltid än ännu ĺnyo är att av avser avses bakom bra bredvid dä där de dem den denna deras dess det detta du efter efterät eftersom ej eller emot en ett fastän för fort framför frĺn genom gott hamske han här hellre hon hos hur i in ingen innan inte ja jag lĺngsamt lĺngt lite man med medan mellan mer mera mindre mot myckett när nära nej nere ni nu och oksa om över pĺ sĺ sĺdan sin skall som till tillräckligt tillsammans trotsatt under uppe ut utan utom vad väl var varför vart varthän vem vems vi vid vilken /; + } + + croak("Error: language $language is not a supported") unless @stopList; + + my $sl = $class->create_empty($dbh, $TABLE); + + $sl->add_stop_word(\@stopList); + return $sl; +} + +sub create_empty { + my ($class, $dbh, $name) = @_; + + my $table = $name . '_stoplist'; + + my $SQL = qq{ +CREATE TABLE $table +(word VARCHAR(255) PRIMARY KEY) +}; + + $dbh->do($SQL) or croak "Can't create table $table: " . 
$dbh->errstr;

	my $self = {};
	$self->{'dbh'}      = $dbh;
	$self->{'name'}     = $name;
	$self->{'table'}    = $table;
	$self->{'stoplist'} = {};
	bless $self, $class;
	return $self;
}

# Open an existing stoplist whose backing table is "<name>_stoplist" and
# preload every word into an in-memory hash, so is_stop_word() is a cheap
# hash lookup instead of a database query.
sub open {
	my ($class, $dbh, $name) = @_;

	my $table = $name . '_stoplist';

	my $self = {
		'dbh'      => $dbh,
		'name'     => $name,
		'table'    => $table,
		'stoplist' => {},
	};
	bless $self, $class;

	my $SQL = qq{
SELECT word FROM $table
};
	my $ary_ref = $dbh->selectcol_arrayref($SQL)
		or croak "Can't load stoplist from $table: " . $dbh->errstr;
	$self->{'stoplist'}->{$_} = 1 for @$ary_ref;

	return $self;
}

# Drop the backing table and forget the cached words.
sub drop {
	my $self  = shift;
	my $dbh   = $self->{'dbh'};
	my $table = $self->{'table'};
	my $SQL = qq{
DROP TABLE $table
};
	$dbh->do($SQL) or croak "Can't drop table $table: " . $dbh->errstr;
	$self->{'stoplist'} = {};
}

# Delete every word from the backing table and clear the in-memory cache;
# the table itself is kept.
sub empty {
	my $self  = shift;
	my $dbh   = $self->{'dbh'};
	my $table = $self->{'table'};
	my $SQL = qq{
DELETE FROM $table
};
	$dbh->do($SQL) or croak "Can't empty table $table: " . $dbh->errstr;
	$self->{'stoplist'} = {};
}

# Add one word (scalar) or several words (array ref) to the stoplist.
# Words are stored lowercased in the in-memory cache; words already in the
# list are skipped.
sub add_stop_word {
	my ($self, $words) = @_;
	my $dbh = $self->{'dbh'};

	$words = [ $words ] unless ref($words) eq 'ARRAY';

	my @new_stop_words;
	for my $word (@$words) {
		next if $self->is_stop_word($word);
		push @new_stop_words, $word;
		$self->{'stoplist'}->{lc($word)} = 1;
	}

	# BUGFIX: without this guard the statement below degenerates to
	# "INSERT INTO <table> (word) VALUES " -- an SQL syntax error --
	# whenever every requested word is already in the stoplist.
	return unless @new_stop_words;

	my $SQL = "INSERT INTO $self->{'table'} (word) VALUES "
		. join(',', ('(?)') x @new_stop_words);
	$dbh->do($SQL, {}, @new_stop_words);
}

# Remove one word (scalar) or several words (array ref) from the stoplist;
# words not currently in the list are skipped.
sub remove_stop_word {
	my ($self, $words) = @_;
	my $dbh = $self->{'dbh'};

	$words = [ $words ] unless ref($words) eq 'ARRAY';

	my $SQL = qq{
DELETE FROM $self->{'table'} WHERE word=?
};

	my $sth = $dbh->prepare($SQL);

	my $stoplist = $self->{'stoplist'};

	for my $word (@$words) {
		next unless $self->is_stop_word($word);
		$sth->execute($word);
		delete $stoplist->{lc($word)};
	}
}

# True iff the (case-insensitively matched) word is in the stoplist.
sub is_stop_word {
	exists shift->{'stoplist'}->{lc($_[0])};
}

1;

__END__

=head1 NAME

DBIx::FullTextSearch::StopList - Stopwords for DBIx::FullTextSearch

=head1 SYNOPSIS

  use DBIx::FullTextSearch::StopList;
  # connect to database (regular DBI)
  my $dbh = DBI->connect('dbi:mysql:database', 'user', 'passwd');

  # create a new empty stop word list
  my $sl1 = DBIx::FullTextSearch::StopList->create_empty($dbh, 'sl_web_1');

  # or create a new one with default stop words
  my $sl2 = DBIx::FullTextSearch::StopList->create_default($dbh, 'sl_web_2', 'english');

  # or open an existing one
  my $sl3 = DBIx::FullTextSearch::StopList->open($dbh, 'sl_web_3');

  # add stop words
  $sl1->add_stop_word(['a','in','on','the']);

  # remove stop words
  $sl2->remove_stop_word(['be','because','been','but','by']);

  # check if word is in stoplist
  $bool = $sl1->is_stop_word('in');

  # empty stop words
  $sl3->empty;

  # drop stop word table
  $sl2->drop;

=head1 DESCRIPTION

DBIx::FullTextSearch::StopList provides stop lists that can be used with
L<DBIx::FullTextSearch>. StopList objects can be reused across several
FullTextSearch objects.

=head1 METHODS

=over 4

=head2 CONSTRUCTORS

=item create_empty

  my $sl = DBIx::FullTextSearch::StopList->create_empty($dbh, $sl_name);

This class method creates a new StopList object.

=item create_default

  my $sl = DBIx::FullTextSearch::StopList->create_default($dbh, $sl_name, $language);

This class method creates a new StopList object, with default words loaded in for the
given language. Supported languages include Czech, Danish, Dutch, English, Finnish, French,
German, Italian, Portuguese, Spanish, and Swedish.

=item open

  my $sl = DBIx::FullTextSearch::StopList->open($dbh, $sl_name);

Opens and returns a StopList object.

=head2 OBJECT METHODS

=item add_stop_word

  $sl->add_stop_word(\@stop_words);

Adds stop words to the StopList object. Expects an array reference as argument.

=item remove_stop_word

  $sl->remove_stop_word(\@stop_words);

Removes stop words from the StopList object.

=item is_stop_word

  $bool = $sl->is_stop_word($stop_word);

Returns true if and only if the given word is in the StopList object.

=item empty

  $sl->empty;

Removes all stop words in the StopList object.

=item drop

  $sl->drop;

Removes the table associated with the StopList object.

=back

=head1 AUTHOR

T.J. Mather, tjmather@tjmather.com,
http://www.tjmather.com/

=head1 COPYRIGHT

All rights reserved. This package is free software; you can
redistribute it and/or modify it under the same terms as Perl itself.

=head1 SEE ALSO

L<DBIx::FullTextSearch>
diff --git a/lib/DBIx/FullTextSearch/String.pm b/lib/DBIx/FullTextSearch/String.pm
new file mode 100644
index 000000000..1793df960
--- /dev/null
+++ b/lib/DBIx/FullTextSearch/String.pm
@@ -0,0 +1,78 @@

package DBIx::FullTextSearch::String;
use DBIx::FullTextSearch;
use strict;
use vars qw! @ISA !;
@ISA = qw! DBIx::FullTextSearch !;

# Create creates the conversion table that converts string names of
# documents to numbers
sub _create_tables {
	my $fts = shift;
	$fts->{'doc_id_table'} = $fts->{'table'} .
'_docid' + unless defined $fts->{'doc_id_table'}; + + unless($fts->{'name_length'}){ + return "The parameter name_length has to be specified."; + } + + my $CREATE_DOCID = <{'doc_id_table'} ( + name varchar($fts->{'name_length'}) binary not null, + id $DBIx::FullTextSearch::BITS_TO_INT{$fts->{'doc_id_bits'}} unsigned not null auto_increment, + primary key (id), + unique (name) + ) +EOF + my $dbh = $fts->{'dbh'}; + $dbh->do($CREATE_DOCID) or return $dbh->errstr; + push @{$fts->{'created_tables'}}, $fts->{'doc_id_table'}; + return; +} + +sub get_id_for_name { + my ($self, $string) = @_; + my $dbh = $self->{'dbh'}; + my $doc_id_table = $self->{'doc_id_table'}; + + my $name_to_id_sth = ( defined $self->{'name_to_id_sth'} + ? $self->{'name_to_id_sth'} + : $self->{'name_to_id_sth'} = $dbh->prepare("select id from $doc_id_table where name = ?") or die $dbh->errstr); + my $id = $dbh->selectrow_array($name_to_id_sth, {}, $string); + if (not defined $id) { + my $new_name_sth = (defined $self->{'new_name_sth'} + ? $self->{'new_name_sth'} + : $self->{'new_name_sth'} = + $dbh->prepare("insert into $doc_id_table values (?, null)") or die $dbh->errstr ); + $new_name_sth->execute($string) or die $new_name_sth->errstr; + $id = $new_name_sth->{'mysql_insertid'}; + } + $id; +} + +sub index_document { + my ($self, $string, $data) = @_; + my $id = $self->get_id_for_name($string); + $self->SUPER::index_document($id, $data); +} + +sub delete_document { + my ($self, $doc_id) = @_; + + $self->SUPER::delete_document($self->get_id_for_name($doc_id)); +} + +sub contains_hashref { + my $self = shift; + my $res = $self->SUPER::contains_hashref(@_); + return unless keys %$res; + + my $doc_id_table = $self->{'doc_id_table'}; + + my $data = $self->{'dbh'}->selectall_arrayref("select name, id from $doc_id_table where id in (" . 
join(',', ('?') x keys %$res).")", {}, keys %$res); + return { map { ( $_->[0], $res->{$_->[1]} ) } @$data }; +} + + +1; + diff --git a/lib/DBIx/FullTextSearch/Table.pm b/lib/DBIx/FullTextSearch/Table.pm new file mode 100644 index 000000000..a6b47c6ef --- /dev/null +++ b/lib/DBIx/FullTextSearch/Table.pm @@ -0,0 +1,142 @@ + +package DBIx::FullTextSearch::TableString; +use vars qw! @ISA !; +@ISA = qw! DBIx::FullTextSearch::String DBIx::FullTextSearch::Table !; + +sub index_document { + my ($self, $id, $data) = @_; + my @data_sets = $self->get_the_data_from_table($id); + push @data_sets, $data if $data; + $self->SUPER::index_document($id, \@data_sets); +} + +package DBIx::FullTextSearch::TableNum; +use vars qw! @ISA !; +@ISA = qw! DBIx::FullTextSearch::Table !; + +sub index_document { + my ($self, $id, $extra_data) = @_; + my @data_sets = $self->get_the_data_from_table($id); + push @data_sets, $extra_data if $extra_data; + $self->SUPER::index_document($id, \@data_sets); +} + + +package DBIx::FullTextSearch::Table; +use DBIx::FullTextSearch; +use strict; +use vars qw! @ISA !; +@ISA = qw! DBIx::FullTextSearch !; + +sub _open_tables { + my $self = shift; + if (defined $self->{'doc_id_table'}) { + eval 'use DBIx::FullTextSearch::String'; + bless $self, 'DBIx::FullTextSearch::TableString'; + } + else { + bless $self, 'DBIx::FullTextSearch::TableNum'; + } +} + +# we do not create any new tables, we just check that the parameters are +# OK (the table and columns exist, etc.) +sub _create_tables { + my $fts = shift; + my ($table, $column, $id) = @{$fts}{ qw! table_name column_name + column_id_name ! 
}; + if (not defined $table and $column =~ /\./) { + ($table, $column) = ($column =~ /^(.*)\.(.*)$/s); + } + my $id_type; + + if (not defined $table) { + return "The parameter table_name has to be specified with the table frontend."; + } + if (not defined $column) { + return "The parameter column_name has to be specified with the table frontend."; + } + my $dbh = $fts->{'dbh'}; + my $sth = $dbh->prepare("show columns from $table"); + $sth->{'PrintError'} = 0; + $sth->{'RaiseError'} = 0; + $sth->execute or return "The table `$table' doesn't exist."; + + my $info = $dbh->selectall_arrayref($sth, + { 'PrintError' => 0, 'RaiseError' => 0 }); + if (not defined $info) { + return "The table `$table' doesn't exist."; + } + +# use Data::Dumper; print Dumper $info; + + if (not defined $id) { + # search for column with primary key + my $pri_num = 0; + for my $i (0 .. $#$info) { + if ($info->[$i][3] eq 'PRI') { + $pri_num++; + $id = $info->[$i][0]; + $id_type = $info->[$i][1]; + } + } + if ($pri_num > 1) { + return 'The primary key has to be one-column.'; + } + if ($pri_num == 0) { + return "No primary key found in the table `$table'."; + } + } + else { + # find '$id' column + for my $i (0 .. 
$#$info) { + if ($info->[$i][0] eq $id){ + $id_type = $info->[$i][1]; + last; + } + } + } + + unless(defined $id_type){ + return "No key named '$id' found in the table '$table'"; + } + + my $testcol = $dbh->prepare("select $column from $table where 1 = 0"); + $testcol->execute or + return "Column `$column' doesn't exist in table `$table'."; + $testcol->finish; + + $fts->{'column_id_name'} = $id; + + my $errstr; + + if ($id_type =~ /([a-z]*int)/) { + $fts->{'doc_id_bits'} = $DBIx::FullTextSearch::INT_TO_BITS{$1}; + bless $fts, 'DBIx::FullTextSearch::TableNum'; + } + else { + my ($length) = ($id_type =~ /^\w+\((\d+)\)$/); + $fts->{'name_length'} = $1; + eval 'use DBIx::FullTextSearch::String'; + bless $fts, 'DBIx::FullTextSearch::TableString'; + $errstr = $fts->DBIx::FullTextSearch::String::_create_tables($fts); + } +### use Data::Dumper; print Dumper $fts; + return $errstr; +} + +sub get_the_data_from_table { + my ($self, $id) = @_; + my $dbh = $self->{'dbh'}; + my $get_data = ( defined $self->{'get_data_sth'} + ? $self->{'get_data_sth'} + : $self->{'get_data_sth'} = $dbh->prepare(" + select $self->{'column_name'} from $self->{'table_name'} + where $self->{'column_id_name'} = ? + ") ); + + my @data_ary = $dbh->selectrow_array($get_data, {}, $id); + return wantarray ? @data_ary : join(" ", @data_ary); +} + +1; diff --git a/lib/DBIx/FullTextSearch/TestConfig.pm b/lib/DBIx/FullTextSearch/TestConfig.pm new file mode 100644 index 000000000..a6a8ff70e --- /dev/null +++ b/lib/DBIx/FullTextSearch/TestConfig.pm @@ -0,0 +1,6 @@ +%DBIx::FullTextSearch::TestConfig::Config = ( + 'password' => undef, + 'dsn' => 'dbi:mysql:test', + 'user' => 'test' + ); +1; diff --git a/lib/DBIx/FullTextSearch/URL.pm b/lib/DBIx/FullTextSearch/URL.pm new file mode 100644 index 000000000..b6b29ac42 --- /dev/null +++ b/lib/DBIx/FullTextSearch/URL.pm @@ -0,0 +1,30 @@ + +package DBIx::FullTextSearch::URL; +use DBIx::FullTextSearch::String; +use strict; +use vars qw! @ISA !; +@ISA = qw! 
DBIx::FullTextSearch::String !; + +use LWP::UserAgent; + +sub index_document { + my ($self, $uri, $extra_data) = @_; + my $ua = ( defined $self->{'user_agent'} + ? $self->{'user_agent'} + : $self->{'user_agent'} = new LWP::UserAgent ); + + my $request = new HTTP::Request('GET', $uri); + my $response = $ua->simple_request($request); + if ($response->is_success) { + my $data = $response->content; + $data .= " $extra_data" if $extra_data; + return $self->SUPER::index_document($uri, $data); + } + else { + $self->{'errstr'} = $response->message; + } + return; +} + +1; + diff --git a/lib/HTML/Highlight.pm b/lib/HTML/Highlight.pm new file mode 100644 index 000000000..3e230447f --- /dev/null +++ b/lib/HTML/Highlight.pm @@ -0,0 +1,395 @@ + +package HTML::Highlight; + +use locale; + +use strict; +use Carp; + +BEGIN { + use vars qw ($VERSION @ISA); + $VERSION = 0.20; + @ISA = (); +} + +END { } + +my $MIN_SECTION_LENGTH = 60; +my $DEFAULT_SECTION_LENGTH = 80; + +sub new { + $_ = shift; + my $class = ref($_) || $_; + + croak ('HTML::Highlight - even number of parameters expected.') + if (@_ % 2); + + # set the defaults + my $self = { + words => [], + wildcards => [], + colors => [ + '#ffff66', + '#A0FFFF', + '#99ff99', + '#ff9999', + '#ff66ff' + ], + czech_language => 0, + debug => 0 + }; + + bless ($self, $class); + + # get parameters, overiding the defaults + for (my $i = 0; $i <= $#_; $i += 2) { + exists ( $self->{lc($_[$i])} ) or + croak ('HTML::Highlight - invalid parameter ' . $_[$i] . 
'.'); + $self->{lc($_[$i])} = $_[($i + 1)]; + } + + croak ('HTML::Highlight - "words" and "wildcards" parameters must be references to arrays') + if (ref($self->{words}) ne 'ARRAY' or ref($self->{wildcards}) ne 'ARRAY'); + + require CzFast if ($self->{czech_language}); + + return $self; +} + + +sub highlight { + my $self = shift; + my $document = shift; + + croak ('HTML::Highlight - no document defined') + if (not defined($document)); + return '' if (length($document) == 0); + + my $doc = $document; + + for (my $i = 0, my $cindex = 0; $i < @{$self->{words}}; $i++, $cindex++) { + my $color; + my $out; + if ($self->{colors}->[$cindex]) { + $color = $self->{colors}->[$cindex]; + } + else { + $cindex = 0; + $color = $self->{colors}->[$cindex]; + } + while($doc) { + if ($doc !~ /(.*?)(<.*?>)(.*)/s) { + $out .= $self->_highlight($doc, $i, $color); + last; + } + else { + my $str = $1; + my $html = $2; + my $rest = $3; + $out .= $self->_highlight($str, $i, $color); + $out .= $html; + $doc = $rest; + } + } + $doc = $out; + } + +return $doc; +} + +sub preview_context { + my $self = shift; + my $document = shift; + my $sectlen = shift; + + $self->{context} = {}; + $self->{sectlen} = $sectlen >= $MIN_SECTION_LENGTH ? + $sectlen : $DEFAULT_SECTION_LENGTH; + $self->{sections} = []; + + $document =~ s/<.*?>//g; + + for (my $i = 0; $i < @{$self->{words}}; $i++) { + my $pattern = $self->{czech_language} ? 
+ &CzFast::czregexp($self->{words}->[$i]) : + $self->{words}->[$i]; + + my $wildcard = $self->{wildcards}->[$i]; + my $regexp; + + if ($wildcard eq '%') { + $regexp = "${pattern}\\w*"; + } + elsif ($wildcard eq '*') { + $regexp = "${pattern}s?"; + } + else { + $regexp = $pattern; + } + + if (not $self->{context}->{$pattern} + and not grep (/$regexp/i, values %{$self->{context}})) { + my $chars = int(($self->{sectlen} - length($pattern)) / 2); + print "Chars: $chars\n" if ($self->{debug}); + if ($document =~ /(?:^|\W)(.{0,$chars})(\W+|^)($regexp)(\W+|$)(.{0,$chars})(?:\W|$)/si) { + my $section = $1.$2.$3.$4.$5; + $self->{context}->{$pattern} = $section; + push(@{$self->{sections}}, $section); + } + } + } + + return $self->{sections}; +} + +######################### +#### private methods #### +######################### + +sub _highlight { + my $self = shift; + my $str = shift; + my $word = shift; + my $color = shift; + + my $pattern = $self->{words}->[$word]; + $pattern = &CzFast::czregexp($pattern) if ($self->{czech_language}); + + my $wildcard = $self->{wildcards}->[$word]; + my $regexp; + + if ($wildcard eq '%') { + my $pat = $self->{czech_language} ? 
&_cz_pattern : '\w*'; + $regexp = "${pattern}$pat"; + } + elsif ($wildcard eq '*') { + $regexp = "${pattern}s?"; + } + else { + $regexp = $pattern; + } + + print "$str: $pattern | $wildcard | $regexp | $color\n" if ($self->{debug}); + $str =~ s!(\W+|^)($regexp)!$1$2!sig; + return $str; +} + +sub _cz_pattern { + my @chars; + my $pat = '('; + foreach my $char ('a'..'z') { + push(@chars, &CzFast::czregexp($char)); + } + $pat .= join('|',@chars); + $pat .= ')*'; + return $pat; +} + + +1; + +__END__ + +=head1 NAME + +B + +=head1 SYNOPSIS + + use HTML::Highlight; + + # create the highlighter object + + my $hl = new HTML::Highlight ( + words => [ + 'word', + 'any', + 'car', + 'some phrase' + ], + wildcards => [ + undef, + '%', + '*', + undef + ], + colors => [ + '#FF0000', + 'red', + 'green', + 'rgb(255, 0, 0)' + ], + czech_language => 0, + debug => 0 + ); + + # Remember that you don't need to specify your own colors. + # The default colors should be optimal. + + # Now you can use the object to highlight patterns in a document + # by passing content of the document to its highlight() method. + # The highlighter object "remembers" its configuration. + + my $highlighted_document = $hl->highlight($document); + + +=head1 MOTIVATION + +This module was originaly created to work together with fulltext +indexing module DBIx::TextIndex to highlight search results. + +A need for a highlighter that takes wildcard matches and HTML tags into +account and supports czech language (or other Slavic languages) was +the motivation to create this module. + +=head1 DESCRIPTION + +This module provides Google-like highlighting of words or patterns in HTML +documents. This feature is typically used to highlight search results. + + +=item The construcutor: + + my $hl = new HTML::Highlight ( + words => [], + wildcards => [], + colors => [], + czech_language => 0, + debug => 0 + ); + +This is a constructor of the highlighter object. It takes an array of +even number of parameters. 
+ + +The B parameter is a reference to an array of words to highlight. + +The B parameter is a reference to an array of wildcards, that +are applied to corresponding words in the B array. + +A wildcard can be either undef or one of '%' or '*'. + +B means "match any characters": + + "%" applied to 'car' ==> matches "car", "cars", "careful", ... + + +B means "match also plural form of the word": + + "*" applied to 'car' ==> matches only "car" or "cars" + + +B means "match exactly the corresponding word": + + undefined wildcard applied to 'car' ==> matches only "car" + + + +The B parameter is a reference to an array of CSS color +identificators, that are used to highlight the corresponding words in +the B array. + +Default Google-like colors are used if you don't specify your own +colors. Number of colors can be lower than number of words - in this case +the colors are rotated and some of the words are therefore +highlighted using the same color. + +The highlighter takes HTML tags into account and therefore does not +"highlight" a word or a pattern inside a tag. + +A support for diacritics insenstive matching for ISO-8859-2 languages (for +for example the czech language) can be activated using the B +option. This feature requires a module B that is available on CPAN in +a directory of author TRIPIE or at http://geocities.com/tripiecz/. + +B + + +=item highlight + + my $hl_document = $hl->highlight($document); + +The only parameter is a document in that you want +to highlight the words that were passed to the constructor of the +highlighter object. The method returns a version of the document in which +the words are highlighted. + + +=item preview_context + + my $sections = $hl->preview_context($document, $num); + + +This method takes two parameters. The first one is the document you +want to scan for the words that were passed to the constructor of the +highlighter object. 
The second parameter is an optional integer +that specifies maximum number of characters in each of the context +sections (see below). This parameter defaults to 80 +characters if it's not specified. Minimum allowed value of this +parameter is 60. + +The method returns a reference to an array of sections of the document +in which the words that were passed to the constructor appear. +HTML tags are removed before the document is proccessed and are +not present in the ouput. +This feature is typically used in search engines to preview a context +in which words from a search query appear in the resulting documents. +The words are always in the middle of each of the sections. The +number of sections this method returns is equal to the number of words +passed to the constructor of the highlighter object. +That means only the first occurence of each of the words is taken into +account. + +=head1 SUPPORT + +No official support is provided, but I welcome any comments, patches +and suggestions on my email. + +=head1 BUGS + +I am aware of no bugs. + +=head1 AVAILABILITY + + http://geocities.com/tripiecz/ + +=head1 AUTHOR + +B, tripie@cpan.org, CPAN-ID TRIPIE + +Prague, the Czech republic + +=head1 LICENSE + +HTML::Highlight - A module to highlight words or patterns in HTML documents + +Copyright (C) 2000 Tomas Styblo (tripie@cpan.org) + +This module is free software; you can redistribute it and/or modify it +under the terms of either: + +a) the GNU General Public License as published by the Free Software +Foundation; either version 1, or (at your option) any later version, +or + +b) the "Artistic License" which comes with this module. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See either +the GNU General Public License or the Artistic License for more details. 
+ +You should have received a copy of the Artistic License with this +module, in the file Artistic. If not, I'll be glad to provide one. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 +USA + +=head1 SEE ALSO + +perl(1). + +=cut diff --git a/lib/Parse/RecDescent.pm b/lib/Parse/RecDescent.pm new file mode 100644 index 000000000..35b9e9d2c --- /dev/null +++ b/lib/Parse/RecDescent.pm @@ -0,0 +1,3045 @@ +# GENERATE RECURSIVE DESCENT PARSER OBJECTS FROM A GRAMMARC +# SEE RecDescent.pod FOR FULL DETAILS + +use 5.005; +use strict; + +package Parse::RecDescent; + +use Text::Balanced qw ( extract_codeblock extract_bracketed extract_quotelike extract_delimited ); + +use vars qw ( $skip ); + + *defskip = \ '\s*'; # DEFAULT SEPARATOR IS OPTIONAL WHITESPACE + $skip = '\s*'; # UNIVERSAL SEPARATOR IS OPTIONAL WHITESPACE +my $MAXREP = 100_000_000; # REPETITIONS MATCH AT MOST 100,000,000 TIMES + + +sub import # IMPLEMENT PRECOMPILER BEHAVIOUR UNDER: + # perl -MParse::RecDescent - +{ + local *_die = sub { print @_, "\n"; exit }; + + my ($package, $file, $line) = caller; + if (substr($file,0,1) eq '-' && $line == 0) + { + _die("Usage: perl -MLocalTest - ") + unless @ARGV == 2; + + my ($sourcefile, $class) = @ARGV; + + local *IN; + open IN, $sourcefile + or _die("Can't open grammar file '$sourcefile'"); + + my $grammar = join '', ; + + Parse::RecDescent->Precompile($grammar, $class, $sourcefile); + exit; + } +} + +sub Save +{ + my ($self, $class) = @_; + $self->{saving} = 1; + $self->Precompile(undef,$class); + $self->{saving} = 0; +} + +sub Precompile +{ + my ($self, $grammar, $class, $sourcefile) = @_; + + $class =~ /^(\w+::)*\w+$/ or croak("Bad class name: $class"); + + my $modulefile = $class; + $modulefile =~ s/.*:://; + $modulefile .= ".pm"; + + open OUT, ">$modulefile" + or croak("Can't write to new module file '$modulefile'"); + 
+ print STDERR "precompiling grammar from file '$sourcefile'\n", + "to class $class in module file '$modulefile'\n" + if $grammar && $sourcefile; + + # local $::RD_HINT = 1; + $self = Parse::RecDescent->new($grammar,1,$class) + || croak("Can't compile bad grammar") + if $grammar; + + foreach ( keys %{$self->{rules}} ) + { $self->{rules}{$_}{changed} = 1 } + + print OUT "package $class;\nuse Parse::RecDescent;\n\n"; + + print OUT "{ my \$ERRORS;\n\n"; + + print OUT $self->_code(); + + print OUT "}\npackage $class; sub new { "; + print OUT "my "; + + require Data::Dumper; + print OUT Data::Dumper->Dump([$self], [qw(self)]); + + print OUT "}"; + + close OUT + or croak("Can't write to new module file '$modulefile'"); +} + + +package Parse::RecDescent::LineCounter; + + +sub TIESCALAR # ($classname, \$text, $thisparser, $prevflag) +{ + bless { + text => $_[1], + parser => $_[2], + prev => $_[3]?1:0, + }, $_[0]; +} + +my %counter_cache; + +sub FETCH +{ + my $parser = $_[0]->{parser}; + my $from = $parser->{fulltextlen}-length(${$_[0]->{text}})-$_[0]->{prev} +; + + unless (exists $counter_cache{$from}) { + $parser->{lastlinenum} = $parser->{offsetlinenum} + - Parse::RecDescent::_linecount(substr($parser->{fulltext},$from)) + + 1; + $counter_cache{$from} = $parser->{lastlinenum}; + } + return $counter_cache{$from}; +} + +sub STORE +{ + my $parser = $_[0]->{parser}; + $parser->{offsetlinenum} -= $parser->{lastlinenum} - $_[1]; + return undef; +} + +sub resync # ($linecounter) +{ + my $self = tied($_[0]); + die "Tried to alter something other than a LineCounter\n" + unless $self =~ /Parse::RecDescent::LineCounter/; + + my $parser = $self->{parser}; + my $apparently = $parser->{offsetlinenum} + - Parse::RecDescent::_linecount(${$self->{text}}) + + 1; + + $parser->{offsetlinenum} += $parser->{lastlinenum} - $apparently; + return 1; +} + +package Parse::RecDescent::ColCounter; + +sub TIESCALAR # ($classname, \$text, $thisparser, $prevflag) +{ + bless { + text => $_[1], + parser 
=> $_[2], + prev => $_[3]?1:0, + }, $_[0]; +} + +sub FETCH +{ + my $parser = $_[0]->{parser}; + my $missing = $parser->{fulltextlen}-length(${$_[0]->{text}})-$_[0]->{prev}+1; + substr($parser->{fulltext},0,$missing) =~ m/^(.*)\Z/m; + return length($1); +} + +sub STORE +{ + die "Can't set column number via \$thiscolumn\n"; +} + + +package Parse::RecDescent::OffsetCounter; + +sub TIESCALAR # ($classname, \$text, $thisparser, $prev) +{ + bless { + text => $_[1], + parser => $_[2], + prev => $_[3]?-1:0, + }, $_[0]; +} + +sub FETCH +{ + my $parser = $_[0]->{parser}; + return $parser->{fulltextlen}-length(${$_[0]->{text}})+$_[0]->{prev}; +} + +sub STORE +{ + die "Can't set current offset via \$thisoffset or \$prevoffset\n"; +} + + + +package Parse::RecDescent::Rule; + +sub new ($$$$$) +{ + my $class = ref($_[0]) || $_[0]; + my $name = $_[1]; + my $owner = $_[2]; + my $line = $_[3]; + my $replace = $_[4]; + + if (defined $owner->{"rules"}{$name}) + { + my $self = $owner->{"rules"}{$name}; + if ($replace && !$self->{"changed"}) + { + $self->reset; + } + return $self; + } + else + { + return $owner->{"rules"}{$name} = + bless + { + "name" => $name, + "prods" => [], + "calls" => [], + "changed" => 0, + "line" => $line, + "impcount" => 0, + "opcount" => 0, + "vars" => "", + }, $class; + } +} + +sub reset($) +{ + @{$_[0]->{"prods"}} = (); + @{$_[0]->{"calls"}} = (); + $_[0]->{"changed"} = 0; + $_[0]->{"impcount"} = 0; + $_[0]->{"opcount"} = 0; + $_[0]->{"vars"} = ""; +} + +sub DESTROY {} + +sub hasleftmost($$) +{ + my ($self, $ref) = @_; + + my $prod; + foreach $prod ( @{$self->{"prods"}} ) + { + return 1 if $prod->hasleftmost($ref); + } + + return 0; +} + +sub leftmostsubrules($) +{ + my $self = shift; + my @subrules = (); + + my $prod; + foreach $prod ( @{$self->{"prods"}} ) + { + push @subrules, $prod->leftmostsubrule(); + } + + return @subrules; +} + +sub expected($) +{ + my $self = shift; + my @expected = (); + + my $prod; + foreach $prod ( @{$self->{"prods"}} ) + { + my 
$next = $prod->expected(); + unless (! $next or _contains($next,@expected) ) + { + push @expected, $next; + } + } + + return join ', or ', @expected; +} + +sub _contains($@) +{ + my $target = shift; + my $item; + foreach $item ( @_ ) { return 1 if $target eq $item; } + return 0; +} + +sub addcall($$) +{ + my ( $self, $subrule ) = @_; + unless ( _contains($subrule, @{$self->{"calls"}}) ) + { + push @{$self->{"calls"}}, $subrule; + } +} + +sub addprod($$) +{ + my ( $self, $prod ) = @_; + push @{$self->{"prods"}}, $prod; + $self->{"changed"} = 1; + $self->{"impcount"} = 0; + $self->{"opcount"} = 0; + $prod->{"number"} = $#{$self->{"prods"}}; + return $prod; +} + +sub addvar +{ + my ( $self, $var, $parser ) = @_; + if ($var =~ /\A\s*local\s+([%@\$]\w+)/) + { + $parser->{localvars} .= " $1"; + $self->{"vars"} .= "$var;\n" } + else + { $self->{"vars"} .= "my $var;\n" } + $self->{"changed"} = 1; + return 1; +} + +sub addautoscore +{ + my ( $self, $code ) = @_; + $self->{"autoscore"} = $code; + $self->{"changed"} = 1; + return 1; +} + +sub nextoperator($) +{ + my $self = shift; + my $prodcount = scalar @{$self->{"prods"}}; + my $opcount = ++$self->{"opcount"}; + return "_operator_${opcount}_of_production_${prodcount}_of_rule_$self->{name}"; +} + +sub nextimplicit($) +{ + my $self = shift; + my $prodcount = scalar @{$self->{"prods"}}; + my $impcount = ++$self->{"impcount"}; + return "_alternation_${impcount}_of_production_${prodcount}_of_rule_$self->{name}"; +} + + +sub code +{ + my ($self, $namespace, $parser) = @_; + +eval 'undef &' . $namespace . '::' . $self->{"name"} unless $parser->{saving}; + + my $code = +' +# ARGS ARE: ($parser, $text; $repeating, $_noactions, \@args) +sub ' . $namespace . '::' . $self->{"name"} . ' +{ + my $thisparser = $_[0]; + use vars q{$tracelevel}; + local $tracelevel = ($tracelevel||0)+1; + $ERRORS = 0; + my $thisrule = $thisparser->{"rules"}{"' . $self->{"name"} . '"}; + + Parse::RecDescent::_trace(q{Trying rule: [' . $self->{"name"} . 
']}, + Parse::RecDescent::_tracefirst($_[1]), + q{' . $self->{"name"} . '}, + $tracelevel) + if defined $::RD_TRACE; + + ' . ($parser->{deferrable} + ? 'my $def_at = @{$thisparser->{deferred}};' + : '') . + ' + my $err_at = @{$thisparser->{errors}}; + + my $score; + my $score_return; + my $_tok; + my $return = undef; + my $_matched=0; + my $commit=0; + my @item = (); + my %item = (); + my $repeating = defined($_[2]) && $_[2]; + my $_noactions = defined($_[3]) && $_[3]; + my @arg = defined $_[4] ? @{ &{$_[4]} } : (); + my %arg = ($#arg & 01) ? @arg : (@arg, undef); + my $text; + my $lastsep=""; + my $expectation = new Parse::RecDescent::Expectation($thisrule->expected()); + $expectation->at($_[1]); + '. ($parser->{_check}{thisoffset}?' + my $thisoffset; + tie $thisoffset, q{Parse::RecDescent::OffsetCounter}, \$text, $thisparser; + ':'') . ($parser->{_check}{prevoffset}?' + my $prevoffset; + tie $prevoffset, q{Parse::RecDescent::OffsetCounter}, \$text, $thisparser, 1; + ':'') . ($parser->{_check}{thiscolumn}?' + my $thiscolumn; + tie $thiscolumn, q{Parse::RecDescent::ColCounter}, \$text, $thisparser; + ':'') . ($parser->{_check}{prevcolumn}?' + my $prevcolumn; + tie $prevcolumn, q{Parse::RecDescent::ColCounter}, \$text, $thisparser, 1; + ':'') . ($parser->{_check}{prevline}?' + my $prevline; + tie $prevline, q{Parse::RecDescent::LineCounter}, \$text, $thisparser, 1; + ':'') . ' + my $thisline; + tie $thisline, q{Parse::RecDescent::LineCounter}, \$text, $thisparser; + + '. $self->{vars} .' +'; + + my $prod; + foreach $prod ( @{$self->{"prods"}} ) + { + $prod->addscore($self->{autoscore},0,0) if $self->{autoscore}; + next unless $prod->checkleftmost(); + $code .= $prod->code($namespace,$self,$parser); + + $code .= $parser->{deferrable} + ? ' splice + @{$thisparser->{deferred}}, $def_at unless $_matched; + ' + : ''; + } + + $code .= +' + unless ( $_matched || defined($return) || defined($score) ) + { + ' .($parser->{deferrable} + ? 
' splice @{$thisparser->{deferred}}, $def_at; + ' + : '') . ' + + $_[1] = $text; # NOT SURE THIS IS NEEDED + Parse::RecDescent::_trace(q{<>}, + Parse::RecDescent::_tracefirst($_[1]), + q{' . $self->{"name"} .'}, + $tracelevel) + if defined $::RD_TRACE; + return undef; + } + if (!defined($return) && defined($score)) + { + Parse::RecDescent::_trace(q{>>Accepted scored production<<}, "", + q{' . $self->{"name"} .'}, + $tracelevel) + if defined $::RD_TRACE; + $return = $score_return; + } + splice @{$thisparser->{errors}}, $err_at; + $return = $item[$#item] unless defined $return; + if (defined $::RD_TRACE) + { + Parse::RecDescent::_trace(q{>>Matched rule<< (return value: [} . + $return . q{])}, "", + q{' . $self->{"name"} .'}, + $tracelevel); + Parse::RecDescent::_trace(q{(consumed: [} . + Parse::RecDescent::_tracemax(substr($_[1],0,-length($text))) . q{])}, + Parse::RecDescent::_tracefirst($text), + , q{' . $self->{"name"} .'}, + $tracelevel) + } + $_[1] = $text; + return $return; +} +'; + + return $code; +} + +my @left; +sub isleftrec($$) +{ + my ($self, $rules) = @_; + my $root = $self->{"name"}; + @left = $self->leftmostsubrules(); + my $next; + foreach $next ( @left ) + { + next unless defined $rules->{$next}; # SKIP NON-EXISTENT RULES + return 1 if $next eq $root; + my $child; + foreach $child ( $rules->{$next}->leftmostsubrules() ) + { + push(@left, $child) + if ! _contains($child, @left) ; + } + } + return 0; +} + +package Parse::RecDescent::Production; + +sub describe ($;$) +{ + return join ' ', map { $_->describe($_[1]) or () } @{$_[0]->{items}}; +} + +sub new ($$;$$) +{ + my ($self, $line, $uncommit, $error) = @_; + my $class = ref($self) || $self; + + bless + { + "items" => [], + "uncommit" => $uncommit, + "error" => $error, + "line" => $line, + strcount => 0, + patcount => 0, + dircount => 0, + actcount => 0, + }, $class; +} + +sub expected ($) +{ + my $itemcount = scalar @{$_[0]->{"items"}}; + return ($itemcount) ? 
$_[0]->{"items"}[0]->describe(1) : ''; +} + +sub hasleftmost ($$) +{ + my ($self, $ref) = @_; + return ${$self->{"items"}}[0] eq $ref if scalar @{$self->{"items"}}; + return 0; +} + +sub leftmostsubrule($) +{ + my $self = shift; + + if ( $#{$self->{"items"}} >= 0 ) + { + my $subrule = $self->{"items"}[0]->issubrule(); + return $subrule if defined $subrule; + } + + return (); +} + +sub checkleftmost($) +{ + my @items = @{$_[0]->{"items"}}; + if (@items==1 && ref($items[0]) =~ /\AParse::RecDescent::Error/ + && $items[0]->{commitonly} ) + { + Parse::RecDescent::_warn(2,"Lone in production treated + as "); + Parse::RecDescent::_hint("A production consisting of a single + conditional directive would + normally succeed (with the value zero) if the + rule is not 'commited' when it is + tried. Since you almost certainly wanted + ' ' Parse::RecDescent + supplied it for you."); + push @{$_[0]->{items}}, + Parse::RecDescent::UncondReject->new(0,0,''); + } + elsif (@items==1 && ($items[0]->describe||"") =~ /describe||"") =~ /describe ."]"); + my $what = $items[0]->describe =~ / (which acts like an unconditional during parsing)" + : $items[0]->describe =~ / (which acts like an unconditional during parsing)" + : "an unconditional "; + my $caveat = $items[0]->describe =~ / 1 + ? "However, there were also other (useless) items after the leading " + . $items[0]->describe + . ", so you may have been expecting some other behaviour." + : "You can safely ignore this message."; + Parse::RecDescent::_hint("The production starts with $what. That means that the + production can never successfully match, so it was + optimized out of the final parser$caveat. 
$advice"); + return 0; + } + return 1; +} + +sub changesskip($) +{ + my $item; + foreach $item (@{$_[0]->{"items"}}) + { + if (ref($item) =~ /Parse::RecDescent::(Action|Directive)/) + { + return 1 if $item->{code} =~ /\$skip/; + } + } + return 0; +} + +sub adddirective +{ + my ( $self, $whichop, $line, $name ) = @_; + push @{$self->{op}}, + { type=>$whichop, line=>$line, name=>$name, + offset=> scalar(@{$self->{items}}) }; +} + +sub addscore +{ + my ( $self, $code, $lookahead, $line ) = @_; + $self->additem(Parse::RecDescent::Directive->new( + "local \$^W; + my \$thisscore = do { $code } + 0; + if (!defined(\$score) || \$thisscore>\$score) + { \$score=\$thisscore; \$score_return=\$item[-1]; } + undef;", $lookahead, $line,"") ) + unless $self->{items}[-1]->describe =~ /{op}) + { + while (my $next = pop @{$self->{op}}) + { + Parse::RecDescent::_error("Incomplete <$next->{type}op:...>.", $line); + Parse::RecDescent::_hint( + "The current production ended without completing the + <$next->{type}op:...> directive that started near line + $next->{line}. Did you forget the closing '>'?"); + } + } + return 1; +} + +sub enddirective +{ + my ( $self, $line, $minrep, $maxrep ) = @_; + unless ($self->{op}) + { + Parse::RecDescent::_error("Unmatched > found.", $line); + Parse::RecDescent::_hint( + "A '>' angle bracket was encountered, which typically + indicates the end of a directive. However no suitable + preceding directive was encountered. Typically this + indicates either a extra '>' in the grammar, or a + problem inside the previous directive."); + return; + } + my $op = pop @{$self->{op}}; + my $span = @{$self->{items}} - $op->{offset}; + if ($op->{type} =~ /left|right/) + { + if ($span != 3) + { + Parse::RecDescent::_error( + "Incorrect <$op->{type}op:...> specification: + expected 3 args, but found $span instead", $line); + Parse::RecDescent::_hint( + "The <$op->{type}op:...> directive requires a + sequence of exactly three elements. 
For example: + <$op->{type}op:leftarg /op/ rightarg>"); + } + else + { + push @{$self->{items}}, + Parse::RecDescent::Operator->new( + $op->{type}, $minrep, $maxrep, splice(@{$self->{"items"}}, -3)); + $self->{items}[-1]->sethashname($self); + $self->{items}[-1]{name} = $op->{name}; + } + } +} + +sub prevwasreturn +{ + my ( $self, $line ) = @_; + unless (@{$self->{items}}) + { + Parse::RecDescent::_error( + "Incorrect specification: + expected item missing", $line); + Parse::RecDescent::_hint( + "The directive requires a + sequence of at least one item. For example: + "); + return; + } + push @{$self->{items}}, + Parse::RecDescent::Result->new(); +} + +sub additem +{ + my ( $self, $item ) = @_; + $item->sethashname($self); + push @{$self->{"items"}}, $item; + return $item; +} + + +sub preitempos +{ + return q + { + push @itempos, {'offset' => {'from'=>$thisoffset, 'to'=>undef}, + 'line' => {'from'=>$thisline, 'to'=>undef}, + 'column' => {'from'=>$thiscolumn, 'to'=>undef} }; + } +} + +sub incitempos +{ + return q + { + $itempos[$#itempos]{'offset'}{'from'} += length($1); + $itempos[$#itempos]{'line'}{'from'} = $thisline; + $itempos[$#itempos]{'column'}{'from'} = $thiscolumn; + } +} + +sub postitempos +{ + return q + { + $itempos[$#itempos]{'offset'}{'to'} = $prevoffset; + $itempos[$#itempos]{'line'}{'to'} = $prevline; + $itempos[$#itempos]{'column'}{'to'} = $prevcolumn; + } +} + +sub code($$$$) +{ + my ($self,$namespace,$rule,$parser) = @_; + my $code = +' + while (!$_matched' + . (defined $self->{"uncommit"} ? '' : ' && !$commit') + . ') + { + ' . + ($self->changesskip() + ? 'local $skip = defined($skip) ? $skip : $Parse::RecDescent::skip;' + : '') .' + Parse::RecDescent::_trace(q{Trying production: [' + . $self->describe . ']}, + Parse::RecDescent::_tracefirst($_[1]), + q{' . $rule ->{name}. '}, + $tracelevel) + if defined $::RD_TRACE; + my $thisprod = $thisrule->{"prods"}[' . $self->{"number"} . ']; + ' . (defined $self->{"error"} ? '' : '$text = $_[1];' ) . 
' + my $_savetext; + @item = (q{' . $rule->{"name"} . '}); + %item = (__RULE__ => q{' . $rule->{"name"} . '}); + my $repcount = 0; + +'; + $code .= +' my @itempos = ({}); +' if $parser->{_check}{itempos}; + + my $item; + my $i; + + for ($i = 0; $i < @{$self->{"items"}}; $i++) + { + $item = ${$self->{items}}[$i]; + + $code .= preitempos() if $parser->{_check}{itempos}; + + $code .= $item->code($namespace,$rule,$parser->{_check}); + + $code .= postitempos() if $parser->{_check}{itempos}; + + } + + if ($parser->{_AUTOACTION} && defined($item) && !$item->isa("Parse::RecDescent::Action")) + { + $code .= $parser->{_AUTOACTION}->code($namespace,$rule); + Parse::RecDescent::_warn(1,"Autogenerating action in rule + \"$rule->{name}\": + $parser->{_AUTOACTION}{code}") + and + Parse::RecDescent::_hint("The \$::RD_AUTOACTION was defined, + so any production not ending in an + explicit action has the specified + \"auto-action\" automatically + appended."); + } + elsif ($parser->{_AUTOTREE} && defined($item) && !$item->isa("Parse::RecDescent::Action")) + { + if ($i==1 && $item->isterminal) + { + $code .= $parser->{_AUTOTREE}{TERMINAL}->code($namespace,$rule); + } + else + { + $code .= $parser->{_AUTOTREE}{NODE}->code($namespace,$rule); + } + Parse::RecDescent::_warn(1,"Autogenerating tree-building action in rule + \"$rule->{name}\"") + and + Parse::RecDescent::_hint("The directive was specified, + so any production not ending + in an explicit action has + some parse-tree building code + automatically appended."); + } + + $code .= +' + + Parse::RecDescent::_trace(q{>>Matched production: [' + . $self->describe . ']<<}, + Parse::RecDescent::_tracefirst($text), + q{' . $rule->{name} . '}, + $tracelevel) + if defined $::RD_TRACE; + $_matched = 1; + last; + } + +'; + return $code; +} + +1; + +package Parse::RecDescent::Action; + +sub describe { undef } + +sub sethashname { $_[0]->{hashname} = '__ACTION' . 
++$_[1]->{actcount} .'__'; } + +sub new +{ + my $class = ref($_[0]) || $_[0]; + bless + { + "code" => $_[1], + "lookahead" => $_[2], + "line" => $_[3], + }, $class; +} + +sub issubrule { undef } +sub isterminal { 0 } + +sub code($$$$) +{ + my ($self, $namespace, $rule) = @_; + +' + Parse::RecDescent::_trace(q{Trying action}, + Parse::RecDescent::_tracefirst($text), + q{' . $rule->{name} . '}, + $tracelevel) + if defined $::RD_TRACE; + ' . ($self->{"lookahead"} ? '$_savetext = $text;' : '' ) .' + + $_tok = ($_noactions) ? 0 : do ' . $self->{"code"} . '; + ' . ($self->{"lookahead"}<0?'if':'unless') . ' (defined $_tok) + { + Parse::RecDescent::_trace(q{<> (return value: [undef])}) + if defined $::RD_TRACE; + last; + } + Parse::RecDescent::_trace(q{>>Matched action<< (return value: [} + . $_tok . q{])}, + Parse::RecDescent::_tracefirst($text)) + if defined $::RD_TRACE; + push @item, $_tok; + ' . ($self->{line}>=0 ? '$item{'. $self->{hashname} .'}=$_tok;' : '' ) .' + ' . ($self->{"lookahead"} ? '$text = $_savetext;' : '' ) .' +' +} + + +1; + +package Parse::RecDescent::Directive; + +sub sethashname { $_[0]->{hashname} = '__DIRECTIVE' . ++$_[1]->{dircount} . '__'; } + +sub issubrule { undef } +sub isterminal { 0 } +sub describe { $_[1] ? '' : $_[0]->{name} } + +sub new ($$$$$) +{ + my $class = ref($_[0]) || $_[0]; + bless + { + "code" => $_[1], + "lookahead" => $_[2], + "line" => $_[3], + "name" => $_[4], + }, $class; +} + +sub code($$$$) +{ + my ($self, $namespace, $rule) = @_; + +' + ' . ($self->{"lookahead"} ? '$_savetext = $text;' : '' ) .' + + Parse::RecDescent::_trace(q{Trying directive: [' + . $self->describe . ']}, + Parse::RecDescent::_tracefirst($text), + q{' . $rule->{name} . '}, + $tracelevel) + if defined $::RD_TRACE; ' .' + $_tok = do { ' . $self->{"code"} . ' }; + if (defined($_tok)) + { + Parse::RecDescent::_trace(q{>>Matched directive<< (return value: [} + . $_tok . 
q{])},
						Parse::RecDescent::_tracefirst($text))
							if defined $::RD_TRACE;
	}
	else
	{
		Parse::RecDescent::_trace(q{<>},
						Parse::RecDescent::_tracefirst($text))
							if defined $::RD_TRACE;
	}
	' . ($self->{"lookahead"} ? '$text = $_savetext and ' : '' ) .'
	last '
	. ($self->{"lookahead"}<0?'if':'unless') . ' defined $_tok;
	push @item, $item{'.$self->{hashname}.'}=$_tok;
	' . ($self->{"lookahead"} ? '$text = $_savetext;' : '' ) .'
'
}

1;

# Item class for an unconditional <reject>-style directive: it always
# fails to match at parse time, so productions starting with it are
# optimized away by Production::checkleftmost.
package Parse::RecDescent::UncondReject;

sub issubrule { undef }
sub isterminal { 0 }
# NOTE(review): describe's true-branch string looks truncated here
# (angle-bracketed directive text appears to have been stripped from the
# source) — compare against upstream Parse::RecDescent 1.94.
sub describe { $_[1] ? '' : $_[0]->{name} }
sub sethashname { $_[0]->{hashname} = '__DIRECTIVE' . ++$_[1]->{dircount} . '__'; }

# Constructor: ($class, $lookahead, $line [, $name]).
sub new ($$$;$)
{
	my $class = ref($_[0]) || $_[0];
	bless
	{
		"lookahead" => $_[1],
		"line"      => $_[2],
		"name"      => $_[3],
	}, $class;
}

# MARK, YOU MAY WANT TO OPTIMIZE THIS.


# Emit parser code that unconditionally rejects: $_tok stays undef, so
# the generated 'last' always fires (sense inverted under negative
# lookahead, as with the other item classes).
sub code($$$$)
{
	my ($self, $namespace, $rule) = @_;

'
	Parse::RecDescent::_trace(q{>>Rejecting production<< (found '
				 . $self->describe . ')},
				 Parse::RecDescent::_tracefirst($text),
				  q{' . $rule->{name} . '},
				  $tracelevel)
					if defined $::RD_TRACE;
	undef $return;
	' . ($self->{"lookahead"} ? '$_savetext = $text;' : '' ) .'

	$_tok = undef;
	' . ($self->{"lookahead"} ? '$text = $_savetext and ' : '' ) .'
	last '
	. ($self->{"lookahead"}<0?'if':'unless') . ' defined $_tok;
'
}

1;

# Item class for <error> / <error?> directives: records a (possibly
# auto-generated) error message into $thisparser->{errors} at parse time.
package Parse::RecDescent::Error;

sub issubrule { undef }
sub isterminal { 0 }
# NOTE(review): both branches of describe return '' here, which makes the
# commitonly distinction pointless — the original returned '<error?>' vs
# '<error>'; the angle-bracketed text appears to have been stripped.
sub describe { $_[1] ? '' : $_[0]->{commitonly} ? '' : '' }
sub sethashname { $_[0]->{hashname} = '__DIRECTIVE' . ++$_[1]->{dircount} . '__'; }

# Constructor: ($class, $msg, $lookahead, $commitonly, $line).
# $commitonly marks <error?>, which only reports when the production has
# committed.
sub new ($$$$$)
{
	my $class = ref($_[0]) || $_[0];
	bless
	{
		"msg"        => $_[1],
		"lookahead"  => $_[2],
		"commitonly" => $_[3],
		"line"       => $_[4],
	}, $class;
}

# Build the error-reporting action and delegate code generation to a
# synthetic Directive wrapping it.
sub code($$$$)
{
	my ($self, $namespace, $rule) = @_;

	my $action = '';

	if ($self->{"msg"})  # ERROR MESSAGE SUPPLIED
	{
	#WAS: $action .= "Parse::RecDescent::_error(qq{$self->{msg}}" .
',$thisline);'; + $action .= 'push @{$thisparser->{errors}}, [qq{'.$self->{msg}.'},$thisline];'; + + } + else # GENERATE ERROR MESSAGE DURING PARSE + { + $action .= ' + my $rule = $item[0]; + $rule =~ s/_/ /g; + #WAS: Parse::RecDescent::_error("Invalid $rule: " . $expectation->message() ,$thisline); + push @{$thisparser->{errors}}, ["Invalid $rule: " . $expectation->message() ,$thisline]; + '; + } + + my $dir = + new Parse::RecDescent::Directive('if (' . + ($self->{"commitonly"} ? '$commit' : '1') . + ") { do {$action} unless ".' $_noactions; undef } else {0}', + $self->{"lookahead"},0,$self->describe); + $dir->{hashname} = $self->{hashname}; + return $dir->code($namespace, $rule, 0); +} + +1; + +package Parse::RecDescent::Token; + +sub sethashname { $_[0]->{hashname} = '__PATTERN' . ++$_[1]->{patcount} . '__'; } + +sub issubrule { undef } +sub isterminal { 1 } +sub describe ($) { shift->{'description'}} + + +# ARGS ARE: $self, $pattern, $left_delim, $modifiers, $lookahead, $linenum +sub new ($$$$$$) +{ + my $class = ref($_[0]) || $_[0]; + my $pattern = $_[1]; + my $pat = $_[1]; + my $ldel = $_[2]; + my $rdel = $ldel; + $rdel =~ tr/{[(/; + + my $mod = $_[3]; + + my $desc; + + if ($ldel eq '/') { $desc = "$ldel$pattern$rdel$mod" } + else { $desc = "m$ldel$pattern$rdel$mod" } + $desc =~ s/\\/\\\\/g; + $desc =~ s/\$$/\\\$/g; + $desc =~ s/}/\\}/g; + $desc =~ s/{/\\{/g; + + if (!eval "no strict; + local \$SIG{__WARN__} = sub {0}; + '' =~ m$ldel$pattern$rdel" and $@) + { + Parse::RecDescent::_warn(3, "Token pattern \"m$ldel$pattern$rdel\" + may not be a valid regular expression", + $_[5]); + $@ =~ s/ at \(eval.*/./; + Parse::RecDescent::_hint($@); + } + + # QUIETLY PREVENT (WELL-INTENTIONED) CALAMITY + $mod =~ s/[gc]//g; + $pattern =~ s/(\A|[^\\])\\G/$1/g; + + bless + { + "pattern" => $pattern, + "ldelim" => $ldel, + "rdelim" => $rdel, + "mod" => $mod, + "lookahead" => $_[4], + "line" => $_[5], + "description" => $desc, + }, $class; +} + + +sub code($$$$) +{ + my ($self, 
$namespace, $rule, $check) = @_; + my $ldel = $self->{"ldelim"}; + my $rdel = $self->{"rdelim"}; + my $sdel = $ldel; + my $mod = $self->{"mod"}; + + $sdel =~ s/[[{(<]/{}/; + +my $code = ' + Parse::RecDescent::_trace(q{Trying terminal: [' . $self->describe + . ']}, Parse::RecDescent::_tracefirst($text), + q{' . $rule->{name} . '}, + $tracelevel) + if defined $::RD_TRACE; + $lastsep = ""; + $expectation->is(q{' . ($rule->hasleftmost($self) ? '' + : $self->describe ) . '})->at($text); + ' . ($self->{"lookahead"} ? '$_savetext = $text;' : '' ) . ' + + ' . ($self->{"lookahead"}<0?'if':'unless') + . ' ($text =~ s/\A($skip)/$lastsep=$1 and ""/e and ' + . ($check->{itempos}? 'do {'.Parse::RecDescent::Production::incitempos().' 1} and ' : '') + . ' $text =~ s' . $ldel . '\A(?:' . $self->{"pattern"} . ')' + . $rdel . $sdel . $mod . ') + { + '.($self->{"lookahead"} ? '$text = $_savetext;' : '').' + $expectation->failed(); + Parse::RecDescent::_trace(q{<>}, + Parse::RecDescent::_tracefirst($text)) + if defined $::RD_TRACE; + + last; + } + Parse::RecDescent::_trace(q{>>Matched terminal<< (return value: [} + . $& . q{])}, + Parse::RecDescent::_tracefirst($text)) + if defined $::RD_TRACE; + push @item, $item{'.$self->{hashname}.'}=$&; + ' . ($self->{"lookahead"} ? '$text = $_savetext;' : '' ) .' +'; + + return $code; +} + +1; + +package Parse::RecDescent::Literal; + +sub sethashname { $_[0]->{hashname} = '__STRING' . ++$_[1]->{strcount} . '__'; } + +sub issubrule { undef } +sub isterminal { 1 } +sub describe ($) { shift->{'description'} } + +sub new ($$$$) +{ + my $class = ref($_[0]) || $_[0]; + + my $pattern = $_[1]; + + my $desc = $pattern; + $desc=~s/\\/\\\\/g; + $desc=~s/}/\\}/g; + $desc=~s/{/\\{/g; + + bless + { + "pattern" => $pattern, + "lookahead" => $_[2], + "line" => $_[3], + "description" => "'$desc'", + }, $class; +} + + +sub code($$$$) +{ + my ($self, $namespace, $rule, $check) = @_; + +my $code = ' + Parse::RecDescent::_trace(q{Trying terminal: [' . 
$self->describe + . ']}, + Parse::RecDescent::_tracefirst($text), + q{' . $rule->{name} . '}, + $tracelevel) + if defined $::RD_TRACE; + $lastsep = ""; + $expectation->is(q{' . ($rule->hasleftmost($self) ? '' + : $self->describe ) . '})->at($text); + ' . ($self->{"lookahead"} ? '$_savetext = $text;' : '' ) . ' + + ' . ($self->{"lookahead"}<0?'if':'unless') + . ' ($text =~ s/\A($skip)/$lastsep=$1 and ""/e and ' + . ($check->{itempos}? 'do {'.Parse::RecDescent::Production::incitempos().' 1} and ' : '') + . ' $text =~ s/\A' . quotemeta($self->{"pattern"}) . '//) + { + '.($self->{"lookahead"} ? '$text = $_savetext;' : '').' + $expectation->failed(); + Parse::RecDescent::_trace(qq{<>}, + Parse::RecDescent::_tracefirst($text)) + if defined $::RD_TRACE; + last; + } + Parse::RecDescent::_trace(q{>>Matched terminal<< (return value: [} + . $& . q{])}, + Parse::RecDescent::_tracefirst($text)) + if defined $::RD_TRACE; + push @item, $item{'.$self->{hashname}.'}=$&; + ' . ($self->{"lookahead"} ? '$text = $_savetext;' : '' ) .' +'; + + return $code; +} + +1; + +package Parse::RecDescent::InterpLit; + +sub sethashname { $_[0]->{hashname} = '__STRING' . ++$_[1]->{strcount} . '__'; } + +sub issubrule { undef } +sub isterminal { 1 } +sub describe ($) { shift->{'description'} } + +sub new ($$$$) +{ + my $class = ref($_[0]) || $_[0]; + + my $pattern = $_[1]; + $pattern =~ s#/#\\/#g; + + my $desc = $pattern; + $desc=~s/\\/\\\\/g; + $desc=~s/}/\\}/g; + $desc=~s/{/\\{/g; + + bless + { + "pattern" => $pattern, + "lookahead" => $_[2], + "line" => $_[3], + "description" => "'$desc'", + }, $class; +} + +sub code($$$$) +{ + my ($self, $namespace, $rule, $check) = @_; + +my $code = ' + Parse::RecDescent::_trace(q{Trying terminal: [' . $self->describe + . ']}, + Parse::RecDescent::_tracefirst($text), + q{' . $rule->{name} . '}, + $tracelevel) + if defined $::RD_TRACE; + $lastsep = ""; + $expectation->is(q{' . ($rule->hasleftmost($self) ? '' + : $self->describe ) . '})->at($text); + ' . 
($self->{"lookahead"} ? '$_savetext = $text;' : '' ) . ' + + ' . ($self->{"lookahead"}<0?'if':'unless') + . ' ($text =~ s/\A($skip)/$lastsep=$1 and ""/e and ' + . ($check->{itempos}? 'do {'.Parse::RecDescent::Production::incitempos().' 1} and ' : '') + . ' do { $_tok = "' . $self->{"pattern"} . '"; 1 } and + substr($text,0,length($_tok)) eq $_tok and + do { substr($text,0,length($_tok)) = ""; 1; } + ) + { + '.($self->{"lookahead"} ? '$text = $_savetext;' : '').' + $expectation->failed(); + Parse::RecDescent::_trace(q{<>}, + Parse::RecDescent::_tracefirst($text)) + if defined $::RD_TRACE; + last; + } + Parse::RecDescent::_trace(q{>>Matched terminal<< (return value: [} + . $_tok . q{])}, + Parse::RecDescent::_tracefirst($text)) + if defined $::RD_TRACE; + push @item, $item{'.$self->{hashname}.'}=$_tok; + ' . ($self->{"lookahead"} ? '$text = $_savetext;' : '' ) .' +'; + + return $code; +} + +1; + +package Parse::RecDescent::Subrule; + +sub issubrule ($) { return $_[0]->{"subrule"} } +sub isterminal { 0 } +sub sethashname {} + +sub describe ($) +{ + my $desc = $_[0]->{"implicit"} || $_[0]->{"subrule"}; + $desc = "" if $_[0]->{"matchrule"}; + return $desc; +} + +sub callsyntax($$) +{ + if ($_[0]->{"matchrule"}) + { + return "&{'$_[1]'.qq{$_[0]->{subrule}}}"; + } + else + { + return $_[1].$_[0]->{"subrule"}; + } +} + +sub new ($$$$;$$$) +{ + my $class = ref($_[0]) || $_[0]; + bless + { + "subrule" => $_[1], + "lookahead" => $_[2], + "line" => $_[3], + "implicit" => $_[4] || undef, + "matchrule" => $_[5], + "argcode" => $_[6] || undef, + }, $class; +} + + +sub code($$$$) +{ + my ($self, $namespace, $rule) = @_; + +' + Parse::RecDescent::_trace(q{Trying subrule: [' . $self->{"subrule"} . ']}, + Parse::RecDescent::_tracefirst($text), + q{' . $rule->{"name"} . '}, + $tracelevel) + if defined $::RD_TRACE; + if (1) { no strict qw{refs}; + $expectation->is(' . ($rule->hasleftmost($self) ? 'q{}' + # WAS : 'qq{'.$self->describe.'}' ) . 
')->at($text); + : 'q{'.$self->describe.'}' ) . ')->at($text); + ' . ($self->{"lookahead"} ? '$_savetext = $text;' : '' ) + . ($self->{"lookahead"}<0?'if':'unless') + . ' (defined ($_tok = ' + . $self->callsyntax($namespace.'::') + . '($thisparser,$text,$repeating,' + . ($self->{"lookahead"}?'1':'$_noactions') + . ($self->{argcode} ? ",sub { return $self->{argcode} }" + : ',sub { \\@arg }') + . '))) + { + '.($self->{"lookahead"} ? '$text = $_savetext;' : '').' + Parse::RecDescent::_trace(q{<{subrule} . ']>>}, + Parse::RecDescent::_tracefirst($text), + q{' . $rule->{"name"} .'}, + $tracelevel) + if defined $::RD_TRACE; + $expectation->failed(); + last; + } + Parse::RecDescent::_trace(q{>>Matched subrule: [' + . $self->{subrule} . ']<< (return value: [} + . $_tok . q{]}, + + Parse::RecDescent::_tracefirst($text), + q{' . $rule->{"name"} .'}, + $tracelevel) + if defined $::RD_TRACE; + $item{q{' . $self->{subrule} . '}} = $_tok; + push @item, $_tok; + ' . ($self->{"lookahead"} ? '$text = $_savetext;' : '' ) .' 
+ } +' +} + +package Parse::RecDescent::Repetition; + +sub issubrule ($) { return $_[0]->{"subrule"} } +sub isterminal { 0 } +sub sethashname { } + +sub describe ($) +{ + my $desc = $_[0]->{"expected"} || $_[0]->{"subrule"}; + $desc = "" if $_[0]->{"matchrule"}; + return $desc; +} + +sub callsyntax($$) +{ + if ($_[0]->{matchrule}) + { return "sub { goto &{''.qq{$_[1]$_[0]->{subrule}}} }"; } + else + { return "\\&$_[1]$_[0]->{subrule}"; } +} + +sub new ($$$$$$$$$$) +{ + my ($self, $subrule, $repspec, $min, $max, $lookahead, $line, $parser, $matchrule, $argcode) = @_; + my $class = ref($self) || $self; + ($max, $min) = ( $min, $max) if ($max<$min); + + my $desc; + if ($subrule=~/\A_alternation_\d+_of_production_\d+_of_rule/) + { $desc = $parser->{"rules"}{$subrule}->expected } + + if ($lookahead) + { + if ($min>0) + { + return new Parse::RecDescent::Subrule($subrule,$lookahead,$line,$desc,$matchrule,$argcode); + } + else + { + Parse::RecDescent::_error("Not symbol (\"!\") before + \"$subrule\" doesn't make + sense.",$line); + Parse::RecDescent::_hint("Lookahead for negated optional + repetitions (such as + \"!$subrule($repspec)\" can never + succeed, since optional items always + match (zero times at worst). + Did you mean a single \"!$subrule\", + instead?"); + } + } + bless + { + "subrule" => $subrule, + "repspec" => $repspec, + "min" => $min, + "max" => $max, + "lookahead" => $lookahead, + "line" => $line, + "expected" => $desc, + "argcode" => $argcode || undef, + "matchrule" => $matchrule, + }, $class; +} + +sub code($$$$) +{ + my ($self, $namespace, $rule) = @_; + + my ($subrule, $repspec, $min, $max, $lookahead) = + @{$self}{ qw{subrule repspec min max lookahead} }; + +' + Parse::RecDescent::_trace(q{Trying repeated subrule: [' . $self->describe . ']}, + Parse::RecDescent::_tracefirst($text), + q{' . $rule->{"name"} . '}, + $tracelevel) + if defined $::RD_TRACE; + $expectation->is(' . ($rule->hasleftmost($self) ? 'q{}' + # WAS : 'qq{'.$self->describe.'}' ) . 
')->at($text); + : 'q{'.$self->describe.'}' ) . ')->at($text); + ' . ($self->{"lookahead"} ? '$_savetext = $text;' : '' ) .' + unless (defined ($_tok = $thisparser->_parserepeat($text, ' + . $self->callsyntax($namespace.'::') + . ', ' . $min . ', ' . $max . ', ' + . ($self->{"lookahead"}?'1':'$_noactions') + . ',$expectation,' + . ($self->{argcode} ? "sub { return $self->{argcode} }" + : 'undef') + . '))) + { + Parse::RecDescent::_trace(q{<describe . ']>>}, + Parse::RecDescent::_tracefirst($text), + q{' . $rule->{"name"} .'}, + $tracelevel) + if defined $::RD_TRACE; + last; + } + Parse::RecDescent::_trace(q{>>Matched repeated subrule: [' + . $self->{subrule} . ']<< (} + . @$_tok . q{ times)}, + + Parse::RecDescent::_tracefirst($text), + q{' . $rule->{"name"} .'}, + $tracelevel) + if defined $::RD_TRACE; + $item{q{' . "$self->{subrule}($self->{repspec})" . '}} = $_tok; + push @item, $_tok; + ' . ($self->{"lookahead"} ? '$text = $_savetext;' : '' ) .' + +' +} + +package Parse::RecDescent::Result; + +sub issubrule { 0 } +sub isterminal { 0 } +sub describe { '' } + +sub new +{ + my ($class, $pos) = @_; + + bless {}, $class; +} + +sub code($$$$) +{ + my ($self, $namespace, $rule) = @_; + + ' + $return = $item[-1]; + '; +} + +package Parse::RecDescent::Operator; + +my @opertype = ( " non-optional", "n optional" ); + +sub issubrule { 0 } +sub isterminal { 0 } + +sub describe { $_[0]->{"expected"} } +sub sethashname { $_[0]->{hashname} = '__DIRECTIVE' . ++$_[1]->{dircount} . '__'; } + + +sub new +{ + my ($class, $type, $minrep, $maxrep, $leftarg, $op, $rightarg) = @_; + + bless + { + "type" => "${type}op", + "leftarg" => $leftarg, + "op" => $op, + "min" => $minrep, + "max" => $maxrep, + "rightarg" => $rightarg, + "expected" => "<${type}op: ".$leftarg->describe." ".$op->describe." 
".$rightarg->describe.">", + }, $class; +} + +sub code($$$$) +{ + my ($self, $namespace, $rule) = @_; + + my ($leftarg, $op, $rightarg) = + @{$self}{ qw{leftarg op rightarg} }; + + my $code = ' + Parse::RecDescent::_trace(q{Trying operator: [' . $self->describe . ']}, + Parse::RecDescent::_tracefirst($text), + q{' . $rule->{"name"} . '}, + $tracelevel) + if defined $::RD_TRACE; + $expectation->is(' . ($rule->hasleftmost($self) ? 'q{}' + # WAS : 'qq{'.$self->describe.'}' ) . ')->at($text); + : 'q{'.$self->describe.'}' ) . ')->at($text); + + $_tok = undef; + OPLOOP: while (1) + { + $repcount = 0; + my @item; + '; + + if ($self->{type} eq "leftop" ) + { + $code .= ' + # MATCH LEFTARG + ' . $leftarg->code(@_[1..2]) . ' + + $repcount++; + + my $savetext = $text; + my $backtrack; + + # MATCH (OP RIGHTARG)(s) + while ($repcount < ' . $self->{max} . ') + { + $backtrack = 0; + ' . $op->code(@_[1..2]) . ' + ' . ($op->isterminal() ? 'pop @item;' : '$backtrack=1;' ) . ' + ' . (ref($op) eq 'Parse::RecDescent::Token' + ? 'if (defined $1) {push @item, $item{'.($self->{name}||$self->{hashname}).'}=$1; $backtrack=1;}' + : "" ) . ' + ' . $rightarg->code(@_[1..2]) . ' + $savetext = $text; + $repcount++; + } + $text = $savetext; + pop @item if $backtrack; + + '; + } + else + { + $code .= ' + my $savetext = $text; + my $backtrack; + # MATCH (LEFTARG OP)(s) + while ($repcount < ' . $self->{max} . ') + { + $backtrack = 0; + ' . $leftarg->code(@_[1..2]) . ' + $repcount++; + $backtrack = 1; + ' . $op->code(@_[1..2]) . ' + $savetext = $text; + ' . ($op->isterminal() ? 'pop @item;' : "" ) . ' + ' . (ref($op) eq 'Parse::RecDescent::Token' ? 'do { push @item, $item{'.($self->{name}||$self->{hashname}).'}=$1; } if defined $1;' : "" ) . ' + } + $text = $savetext; + pop @item if $backtrack; + + # MATCH RIGHTARG + ' . $rightarg->code(@_[1..2]) . 
'
		$repcount++;
	';
	}

	# A leftop/rightop with a minimum repetition must have gathered at
	# least one item.
	$code .= 'unless (@item) { undef $_tok; last }' unless $self->{min}==0;

	$code .= '
		$_tok = [ @item ];
		last;
	}

	unless ($repcount>='.$self->{min}.')
	{
		# NOTE(review): the trace string below appears mangled
		# (angle-bracketed text stripped from this copy); upstream logs
		# "<<Didn't match operator: [...]>>".
		Parse::RecDescent::_trace(q{<describe
					 . ']>>},
					 Parse::RecDescent::_tracefirst($text),
					  q{' . $rule->{"name"} .'},
					  $tracelevel)
						if defined $::RD_TRACE;
		$expectation->failed();
		last;
	}
	Parse::RecDescent::_trace(q{>>Matched operator: ['
				  . $self->describe
				  . ']<< (return value: [}
				  . qq{@{$_tok||[]}} . q{]},
				  Parse::RecDescent::_tracefirst($text),
				  q{' . $rule->{"name"} .'},
				  $tracelevel)
					if defined $::RD_TRACE;

	push @item, $item{'.($self->{name}||$self->{hashname}).'}=$_tok||[];

';
	return $code;
}


# Tracks what the parser was expecting (and what it found instead) at the
# deepest point of a failed parse, so rule subs can build a useful
# "Was expecting X but found Y" diagnostic.
package Parse::RecDescent::Expectation;

# Constructor: ($class, $default_expected_description).
sub new ($)
{
	bless {
		"failed"	  => 0,
		"expected"	  => "",
		"unexpected"	  => "",
		"lastexpected"	  => "",
		"lastunexpected" => "",
		"defexpected"	  => $_[1],
	};
}

# Record the item about to be tried; returns $self for chaining with at().
sub is ($$)
{
	$_[0]->{lastexpected} = $_[1]; return $_[0];
}

# Record the text at which the item is being tried; chains after is().
sub at ($$)
{
	$_[0]->{lastunexpected} = $_[1]; return $_[0];
}

# Latch the first failure only: once {failed} is set, later failures do
# not overwrite the stored expected/unexpected pair.
sub failed ($)
{
	return unless $_[0]->{lastexpected};
	$_[0]->{expected}   = $_[0]->{lastexpected}   unless $_[0]->{failed};
	$_[0]->{unexpected} = $_[0]->{lastunexpected} unless $_[0]->{failed};
	$_[0]->{failed} = 1;
}

# Format the diagnostic. Underscores in rule names become spaces; only
# the first whitespace-trimmed token of the unexpected text is shown.
sub message ($)
{
	my ($self) = @_;
	$self->{expected} = $self->{defexpected} unless $self->{expected};
	$self->{expected} =~ s/_/ /g;
	if (!$self->{unexpected} || $self->{unexpected} =~ /\A\s*\Z/s)
	{
		return "Was expecting $self->{expected}";
	}
	else
	{
		$self->{unexpected} =~ /\s*(.*)/;
		return "Was expecting $self->{expected} but found \"$1\" instead";
	}
}

1;

# The parser class proper: holds the rule table, generates per-rule subs
# into a private namespace, and drives grammar compilation.
package Parse::RecDescent;

use Carp;
use vars qw ( $AUTOLOAD $VERSION );

my $ERRORS = 0;

$VERSION = '1.94';

# BUILDING A PARSER

# Each parser instance gets its own package so generated rule subs from
# different parsers never collide.
my $nextnamespace = "namespace000001";

sub _nextnamespace()
{
	return "Parse::RecDescent::" . $nextnamespace++;
}

# Constructor: ($class, $grammar [, $compiling, $namespace_suffix]).
# Honors $::RD_AUTOACTION by pre-building an Action item appended to
# productions lacking an explicit one; delegates grammar processing to
# Replace().
sub new ($$$)
{
	my $class = ref($_[0]) || $_[0];
	local $Parse::RecDescent::compiling = $_[2];
	my $name_space_name = defined $_[3]
		? "Parse::RecDescent::".$_[3]
		: _nextnamespace();
	my $self =
	{
		"rules"     => {},
		"namespace" => $name_space_name,
		"startcode" => '',
		"localvars" => '',
		"_AUTOACTION" => undef,
		"_AUTOTREE"   => undef,
	};
	if ($::RD_AUTOACTION)
	{
		my $sourcecode = $::RD_AUTOACTION;
		# Ensure the auto-action is a block so it can be wrapped in do{}.
		$sourcecode = "{ $sourcecode }"
			unless $sourcecode =~ /\A\s*\{.*\}\s*\Z/;
		$self->{_check}{itempos} =
			$sourcecode =~ /\@itempos\b|\$itempos\s*\[/;
		$self->{_AUTOACTION}
			= new Parse::RecDescent::Action($sourcecode,0,-1)
	}

	bless $self, $class;
	shift;
	return $self->Replace(@_)
}

sub Compile($$$$) {

	die "Compilation of Parse::RecDescent grammars not yet implemented\n";
}

sub DESTROY {}	# SO AUTOLOADER IGNORES IT

# BUILDING A GRAMMAR....

# Replace vs Extend differ only in the $replace flag spliced into the
# _generate() argument list: Replace resets same-named rules, Extend
# adds productions to them.
sub Replace ($$)
{
	splice(@_, 2, 0, 1);
	return _generate(@_);
}

sub Extend ($$)
{
	splice(@_, 2, 0, 0);
	return _generate(@_);
}

# Report a grammar item (action, directive, etc.) that appears before any
# rule has been started.
sub _no_rule ($$;$)
{
	_error("Ruleless $_[0] at start of grammar.",$_[1]);
	my $desc = $_[2] ?
"\"$_[2]\"" : ""; + _hint("You need to define a rule for the $_[0] $desc + to be part of."); +} + +my $NEGLOOKAHEAD = '\G(\s*\.\.\.\!)'; +my $POSLOOKAHEAD = '\G(\s*\.\.\.)'; +my $RULE = '\G\s*(\w+)[ \t]*:'; +my $PROD = '\G\s*([|])'; +my $TOKEN = q{\G\s*/((\\\\/|[^/])*)/([cgimsox]*)}; +my $MTOKEN = q{\G\s*(m\s*[^\w\s])}; +my $LITERAL = q{\G\s*'((\\\\['\\\\]|[^'])*)'}; +my $INTERPLIT = q{\G\s*"((\\\\["\\\\]|[^"])*)"}; +my $SUBRULE = '\G\s*(\w+)'; +my $MATCHRULE = '\G(\s*{_check}{itempos} = ($grammar =~ /\@itempos\b|\$itempos\s*\[/) + unless $self->{_check}{itempos}; + for (qw(thisoffset thiscolumn prevline prevoffset prevcolumn)) + { + $self->{_check}{$_} = + ($grammar =~ /\$$_/) || $self->{_check}{itempos} + unless $self->{_check}{$_}; + } + my $line; + + my $rule = undef; + my $prod = undef; + my $item = undef; + my $lastgreedy = ''; + pos $grammar = 0; + study $grammar; + + while (pos $grammar < length $grammar) + { + $line = $lines - _linecount($grammar) + 1; + my $commitonly; + my $code = ""; + my @components = (); + if ($grammar =~ m/$COMMENT/gco) + { + _parse("a comment",0,$line); + next; + } + elsif ($grammar =~ m/$NEGLOOKAHEAD/gco) + { + _parse("a negative lookahead",$aftererror,$line); + $lookahead = $lookahead ? -$lookahead : -1; + $lookaheadspec .= $1; + next; # SKIP LOOKAHEAD RESET AT END OF while LOOP + } + elsif ($grammar =~ m/$POSLOOKAHEAD/gco) + { + _parse("a positive lookahead",$aftererror,$line); + $lookahead = $lookahead ? 
$lookahead : 1; + $lookaheadspec .= $1; + next; # SKIP LOOKAHEAD RESET AT END OF while LOOP + } + elsif ($grammar =~ m/(?=$ACTION)/gco + and do { ($code) = extract_codeblock($grammar); $code }) + { + _parse("an action", $aftererror, $line, $code); + $item = new Parse::RecDescent::Action($code,$lookahead,$line); + $prod and $prod->additem($item) + or $self->_addstartcode($code); + } + elsif ($grammar =~ m/(?=$IMPLICITSUBRULE)/gco + and do { ($code) = extract_codeblock($grammar,'{([',undef,'(',1); + $code }) + { + $code =~ s/\A\s*\(|\)\Z//g; + _parse("an implicit subrule", $aftererror, $line, + "( $code )"); + my $implicit = $rule->nextimplicit; + $self->_generate("$implicit : $code",$replace,1); + my $pos = pos $grammar; + substr($grammar,$pos,0,$implicit); + pos $grammar = $pos;; + } + elsif ($grammar =~ m/$ENDDIRECTIVEMK/gco) + { + + # EXTRACT TRAILING REPETITION SPECIFIER (IF ANY) + + my ($minrep,$maxrep) = (1,$MAXREP); + if ($grammar =~ m/\G[(]/gc) + { + pos($grammar)--; + + if ($grammar =~ m/$OPTIONAL/gco) + { ($minrep, $maxrep) = (0,1) } + elsif ($grammar =~ m/$ANY/gco) + { $minrep = 0 } + elsif ($grammar =~ m/$EXACTLY/gco) + { ($minrep, $maxrep) = ($1,$1) } + elsif ($grammar =~ m/$BETWEEN/gco) + { ($minrep, $maxrep) = ($1,$2) } + elsif ($grammar =~ m/$ATLEAST/gco) + { $minrep = $1 } + elsif ($grammar =~ m/$ATMOST/gco) + { $maxrep = $1 } + elsif ($grammar =~ m/$MANY/gco) + { } + elsif ($grammar =~ m/$BADREP/gco) + { + _parse("an invalid repetition specifier", 0,$line); + _error("Incorrect specification of a repeated directive", + $line); + _hint("Repeated directives cannot have + a maximum repetition of zero, nor can they have + negative components in their ranges."); + } + } + + $prod && $prod->enddirective($line,$minrep,$maxrep); + } + elsif ($grammar =~ m/\G\s*<[^m]/gc) + { + pos($grammar)-=2; + + if ($grammar =~ m/$OPMK/gco) + { + # $DB::single=1; + _parse("a $1-associative operator directive", $aftererror, $line, "<$1op:...>"); + $prod->adddirective($1, 
$line,$2||''); + } + elsif ($grammar =~ m/$UNCOMMITMK/gco) + { + _parse("an uncommit marker", $aftererror,$line); + $item = new Parse::RecDescent::Directive('$commit=0;1', + $lookahead,$line,""); + $prod and $prod->additem($item) + or _no_rule("",$line); + } + elsif ($grammar =~ m/$QUOTELIKEMK/gco) + { + _parse("an perl quotelike marker", $aftererror,$line); + $item = new Parse::RecDescent::Directive( + 'my ($match,@res); + ($match,$text,undef,@res) = + Text::Balanced::extract_quotelike($text,$skip); + $match ? \@res : undef; + ', $lookahead,$line,""); + $prod and $prod->additem($item) + or _no_rule("",$line); + } + elsif ($grammar =~ m/$CODEBLOCKMK/gco) + { + my $outer = $1||"{}"; + _parse("an perl codeblock marker", $aftererror,$line); + $item = new Parse::RecDescent::Directive( + 'Text::Balanced::extract_codeblock($text,undef,$skip,\''.$outer.'\'); + ', $lookahead,$line,""); + $prod and $prod->additem($item) + or _no_rule("",$line); + } + elsif ($grammar =~ m/$VARIABLEMK/gco) + { + _parse("an perl variable marker", $aftererror,$line); + $item = new Parse::RecDescent::Directive( + 'Text::Balanced::extract_variable($text,$skip); + ', $lookahead,$line,""); + $prod and $prod->additem($item) + or _no_rule("",$line); + } + elsif ($grammar =~ m/$NOCHECKMK/gco) + { + _parse("a disable checking marker", $aftererror,$line); + if ($rule) + { + _error(" directive not at start of grammar", $line); + _hint("The directive can only + be specified at the start of a + grammar (before the first rule + is defined."); + } + else + { + local $::RD_CHECK = 1; + } + } + elsif ($grammar =~ m/$AUTOSTUBMK/gco) + { + _parse("an autostub marker", $aftererror,$line); + $::RD_AUTOSTUB = ""; + } + elsif ($grammar =~ m/$AUTORULEMK/gco) + { + _parse("an autorule marker", $aftererror,$line); + $::RD_AUTOSTUB = $1; + } + elsif ($grammar =~ m/$AUTOTREEMK/gco) + { + _parse("an autotree marker", $aftererror,$line); + if ($rule) + { + _error(" directive not at start of grammar", $line); + _hint("The 
directive can only + be specified at the start of a + grammar (before the first rule + is defined."); + } + else + { + undef $self->{_AUTOACTION}; + $self->{_AUTOTREE}{NODE} + = new Parse::RecDescent::Action(q{{bless \%item, $item[0]}},0,-1); + $self->{_AUTOTREE}{TERMINAL} + = new Parse::RecDescent::Action(q{{bless {__VALUE__=>$item[1]}, $item[0]}},0,-1); + } + } + + elsif ($grammar =~ m/$REJECTMK/gco) + { + _parse("an reject marker", $aftererror,$line); + $item = new Parse::RecDescent::UncondReject($lookahead,$line,""); + $prod and $prod->additem($item) + or _no_rule("",$line); + } + elsif ($grammar =~ m/(?=$CONDREJECTMK)/gco + and do { ($code) = extract_codeblock($grammar,'{',undef,'<'); + $code }) + { + _parse("a (conditional) reject marker", $aftererror,$line); + $code =~ /\A\s*\Z/s; + $item = new Parse::RecDescent::Directive( + "($1) ? undef : 1", $lookahead,$line,""); + $prod and $prod->additem($item) + or _no_rule("",$line); + } + elsif ($grammar =~ m/(?=$SCOREMK)/gco + and do { ($code) = extract_codeblock($grammar,'{',undef,'<'); + $code }) + { + _parse("a score marker", $aftererror,$line); + $code =~ /\A\s*\Z/s; + $prod and $prod->addscore($1, $lookahead, $line) + or _no_rule($code,$line); + } + elsif ($grammar =~ m/(?=$AUTOSCOREMK)/gco + and do { ($code) = extract_codeblock($grammar,'{',undef,'<'); + $code; + } ) + { + _parse("an autoscore specifier", $aftererror,$line,$code); + $code =~ /\A\s*\Z/s; + + $rule and $rule->addautoscore($1,$self) + or _no_rule($code,$line); + + $item = new Parse::RecDescent::UncondReject($lookahead,$line,$code); + $prod and $prod->additem($item) + or _no_rule($code,$line); + } + elsif ($grammar =~ m/$RESYNCMK/gco) + { + _parse("a resync to newline marker", $aftererror,$line); + $item = new Parse::RecDescent::Directive( + 'if ($text =~ s/\A[^\n]*\n//) { $return = 0; $& } else { undef }', + $lookahead,$line,""); + $prod and $prod->additem($item) + or _no_rule("",$line); + } + elsif ($grammar =~ m/(?=$RESYNCPATMK)/gco + and do { 
($code) = extract_bracketed($grammar,'<'); + $code }) + { + _parse("a resync with pattern marker", $aftererror,$line); + $code =~ /\A\s*\Z/s; + $item = new Parse::RecDescent::Directive( + 'if ($text =~ s/\A'.$1.'//) { $return = 0; $& } else { undef }', + $lookahead,$line,$code); + $prod and $prod->additem($item) + or _no_rule($code,$line); + } + elsif ($grammar =~ m/(?=$SKIPMK)/gco + and do { ($code) = extract_codeblock($grammar,'<'); + $code }) + { + _parse("a skip marker", $aftererror,$line); + $code =~ /\A\s*\Z/s; + $item = new Parse::RecDescent::Directive( + 'my $oldskip = $skip; $skip='.$1.'; $oldskip', + $lookahead,$line,$code); + $prod and $prod->additem($item) + or _no_rule($code,$line); + } + elsif ($grammar =~ m/(?=$RULEVARPATMK)/gco + and do { ($code) = extract_codeblock($grammar,'{',undef,'<'); + $code; + } ) + { + _parse("a rule variable specifier", $aftererror,$line,$code); + $code =~ /\A\s*\Z/s; + + $rule and $rule->addvar($1,$self) + or _no_rule($code,$line); + + $item = new Parse::RecDescent::UncondReject($lookahead,$line,$code); + $prod and $prod->additem($item) + or _no_rule($code,$line); + } + elsif ($grammar =~ m/(?=$DEFERPATMK)/gco + and do { ($code) = extract_codeblock($grammar,'{',undef,'<'); + $code; + } ) + { + _parse("a deferred action specifier", $aftererror,$line,$code); + $code =~ s/\A\s*\Z/$1/s; + if ($code =~ /\A\s*[^{]|[^}]\s*\Z/) + { + $code = "{ $code }" + } + + $item = new Parse::RecDescent::Directive( + "push \@{\$thisparser->{deferred}}, sub $code;", + $lookahead,$line,""); + $prod and $prod->additem($item) + or _no_rule("",$line); + + $self->{deferrable} = 1; + } + elsif ($grammar =~ m/(?=$TOKENPATMK)/gco + and do { ($code) = extract_codeblock($grammar,'{',undef,'<'); + $code; + } ) + { + _parse("a token constructor", $aftererror,$line,$code); + $code =~ s/\A\s*\Z/$1/s; + + my $types = eval 'no strict; local $SIG{__WARN__} = sub {0}; my @arr=('.$code.'); @arr' || (); + if (!$types) + { + _error("Incorrect token specification: 
\"$@\"", $line); + _hint("The directive requires a list + of one or more strings representing possible + types of the specified token. For example: + "); + } + else + { + $item = new Parse::RecDescent::Directive( + 'no strict; + $return = { text => $item[-1] }; + @{$return->{type}}{'.$code.'} = (1..'.$types.');', + $lookahead,$line,""); + $prod and $prod->additem($item) + or _no_rule("",$line); + } + } + elsif ($grammar =~ m/$COMMITMK/gco) + { + _parse("an commit marker", $aftererror,$line); + $item = new Parse::RecDescent::Directive('$commit = 1', + $lookahead,$line,""); + $prod and $prod->additem($item) + or _no_rule("",$line); + } + elsif ($grammar =~ m/$AUTOERRORMK/gco) + { + $commitonly = $1; + _parse("an error marker", $aftererror,$line); + $item = new Parse::RecDescent::Error('',$lookahead,$1,$line); + $prod and $prod->additem($item) + or _no_rule("",$line); + $aftererror = !$commitonly; + } + elsif ($grammar =~ m/(?=$MSGERRORMK)/gco + and do { $commitonly = $1; + ($code) = extract_bracketed($grammar,'<'); + $code }) + { + _parse("an error marker", $aftererror,$line,$code); + $code =~ /\A\s*\Z/s; + $item = new Parse::RecDescent::Error($1,$lookahead,$commitonly,$line); + $prod and $prod->additem($item) + or _no_rule("$code",$line); + $aftererror = !$commitonly; + } + elsif (do { $commitonly = $1; + ($code) = extract_bracketed($grammar,'<'); + $code }) + { + if ($code =~ /^<[A-Z_]+>$/) + { + _error("Token items are not yet + supported: \"$code\"", + $line); + _hint("Items like $code that consist of angle + brackets enclosing a sequence of + uppercase characters will eventually + be used to specify pre-lexed tokens + in a grammar. That functionality is not + yet implemented. 
Or did you misspell + \"$code\"?"); + } + else + { + _error("Untranslatable item encountered: \"$code\"", + $line); + _hint("Did you misspell \"$code\" + or forget to comment it out?"); + } + } + } + elsif ($grammar =~ m/$RULE/gco) + { + _parseunneg("a rule declaration", 0, + $lookahead,$line) or next; + my $rulename = $1; + if ($rulename =~ /Replace|Extend|Precompile|Save/ ) + { + _warn(2,"Rule \"$rulename\" hidden by method + Parse::RecDescent::$rulename",$line) + and + _hint("The rule named \"$rulename\" cannot be directly + called through the Parse::RecDescent object + for this grammar (although it may still + be used as a subrule of other rules). + It can't be directly called because + Parse::RecDescent::$rulename is already defined (it + is the standard method of all + parsers)."); + } + $rule = new Parse::RecDescent::Rule($rulename,$self,$line,$replace); + $prod->check_pending($line) if $prod; + $prod = $rule->addprod( new Parse::RecDescent::Production ); + $aftererror = 0; + } + elsif ($grammar =~ m/$UNCOMMITPROD/gco) + { + pos($grammar)-=9; + _parseunneg("a new (uncommitted) production", + 0, $lookahead, $line) or next; + + $prod->check_pending($line) if $prod; + $prod = new Parse::RecDescent::Production($line,1); + $rule and $rule->addprod($prod) + or _no_rule("",$line); + $aftererror = 0; + } + elsif ($grammar =~ m/$ERRORPROD/gco) + { + pos($grammar)-=6; + _parseunneg("a new (error) production", $aftererror, + $lookahead,$line) or next; + $prod->check_pending($line) if $prod; + $prod = new Parse::RecDescent::Production($line,0,1); + $rule and $rule->addprod($prod) + or _no_rule("",$line); + $aftererror = 0; + } + elsif ($grammar =~ m/$PROD/gco) + { + _parseunneg("a new production", 0, + $lookahead,$line) or next; + $rule + and (!$prod || $prod->check_pending($line)) + and $prod = $rule->addprod(new Parse::RecDescent::Production($line)) + or _no_rule("production",$line); + $aftererror = 0; + } + elsif ($grammar =~ m/$LITERAL/gco) + { + ($code = $1) =~ 
s/\\\\/\\/g; + _parse("a literal terminal", $aftererror,$line,$1); + $item = new Parse::RecDescent::Literal($code,$lookahead,$line); + $prod and $prod->additem($item) + or _no_rule("literal terminal",$line,"'$1'"); + } + elsif ($grammar =~ m/$INTERPLIT/gco) + { + _parse("an interpolated literal terminal", $aftererror,$line); + $item = new Parse::RecDescent::InterpLit($1,$lookahead,$line); + $prod and $prod->additem($item) + or _no_rule("interpolated literal terminal",$line,"'$1'"); + } + elsif ($grammar =~ m/$TOKEN/gco) + { + _parse("a /../ pattern terminal", $aftererror,$line); + $item = new Parse::RecDescent::Token($1,'/',$3?$3:'',$lookahead,$line); + $prod and $prod->additem($item) + or _no_rule("pattern terminal",$line,"/$1/"); + } + elsif ($grammar =~ m/(?=$MTOKEN)/gco + and do { ($code, undef, @components) + = extract_quotelike($grammar); + $code } + ) + + { + _parse("an m/../ pattern terminal", $aftererror,$line,$code); + $item = new Parse::RecDescent::Token(@components[3,2,8], + $lookahead,$line); + $prod and $prod->additem($item) + or _no_rule("pattern terminal",$line,$code); + } + elsif ($grammar =~ m/(?=$MATCHRULE)/gco + and do { ($code) = extract_bracketed($grammar,'<'); + $code + } + or $grammar =~ m/$SUBRULE/gco + and $code = $1) + { + my $name = $code; + my $matchrule = 0; + if (substr($name,0,1) eq '<') + { + $name =~ s/$MATCHRULE\s*//; + $name =~ s/\s*>\Z//; + $matchrule = 1; + } + + # EXTRACT TRAILING ARG LIST (IF ANY) + + my ($argcode) = extract_codeblock($grammar, "[]",'') || ''; + + # EXTRACT TRAILING REPETITION SPECIFIER (IF ANY) + + if ($grammar =~ m/\G[(]/gc) + { + pos($grammar)--; + + if ($grammar =~ m/$OPTIONAL/gco) + { + _parse("an zero-or-one subrule match", $aftererror,$line,"$code$argcode($1)"); + $item = new Parse::RecDescent::Repetition($name,$1,0,1, + $lookahead,$line, + $self, + $matchrule, + $argcode); + $prod and $prod->additem($item) + or _no_rule("repetition",$line,"$code$argcode($1)"); + + !$matchrule and $rule and 
$rule->addcall($name); + } + elsif ($grammar =~ m/$ANY/gco) + { + _parse("a zero-or-more subrule match", $aftererror,$line,"$code$argcode($1)"); + if ($2) + { + my $pos = pos $grammar; + substr($grammar,$pos,0, + "(s?) "); + + pos $grammar = $pos; + } + else + { + $item = new Parse::RecDescent::Repetition($name,$1,0,$MAXREP, + $lookahead,$line, + $self, + $matchrule, + $argcode); + $prod and $prod->additem($item) + or _no_rule("repetition",$line,"$code$argcode($1)"); + + !$matchrule and $rule and $rule->addcall($name); + + _check_insatiable($name,$1,$grammar,$line) if $::RD_CHECK; + } + } + elsif ($grammar =~ m/$MANY/gco) + { + _parse("a one-or-more subrule match", $aftererror,$line,"$code$argcode($1)"); + if ($2) + { + # $DB::single=1; + my $pos = pos $grammar; + substr($grammar,$pos,0, + " "); + + pos $grammar = $pos; + } + else + { + $item = new Parse::RecDescent::Repetition($name,$1,1,$MAXREP, + $lookahead,$line, + $self, + $matchrule, + $argcode); + + $prod and $prod->additem($item) + or _no_rule("repetition",$line,"$code$argcode($1)"); + + !$matchrule and $rule and $rule->addcall($name); + + _check_insatiable($name,$1,$grammar,$line) if $::RD_CHECK; + } + } + elsif ($grammar =~ m/$EXACTLY/gco) + { + _parse("an exactly-$1-times subrule match", $aftererror,$line,"$code$argcode($1)"); + if ($2) + { + my $pos = pos $grammar; + substr($grammar,$pos,0, + "($1) "); + + pos $grammar = $pos; + } + else + { + $item = new Parse::RecDescent::Repetition($name,$1,$1,$1, + $lookahead,$line, + $self, + $matchrule, + $argcode); + $prod and $prod->additem($item) + or _no_rule("repetition",$line,"$code$argcode($1)"); + + !$matchrule and $rule and $rule->addcall($name); + } + } + elsif ($grammar =~ m/$BETWEEN/gco) + { + _parse("a $1-to-$2 subrule match", $aftererror,$line,"$code$argcode($1..$2)"); + if ($3) + { + my $pos = pos $grammar; + substr($grammar,$pos,0, + "($1..$2) "); + + pos $grammar = $pos; + } + else + { + $item = new 
Parse::RecDescent::Repetition($name,"$1..$2",$1,$2, + $lookahead,$line, + $self, + $matchrule, + $argcode); + $prod and $prod->additem($item) + or _no_rule("repetition",$line,"$code$argcode($1..$2)"); + + !$matchrule and $rule and $rule->addcall($name); + } + } + elsif ($grammar =~ m/$ATLEAST/gco) + { + _parse("a $1-or-more subrule match", $aftererror,$line,"$code$argcode($1..)"); + if ($2) + { + my $pos = pos $grammar; + substr($grammar,$pos,0, + "($1..) "); + + pos $grammar = $pos; + } + else + { + $item = new Parse::RecDescent::Repetition($name,"$1..",$1,$MAXREP, + $lookahead,$line, + $self, + $matchrule, + $argcode); + $prod and $prod->additem($item) + or _no_rule("repetition",$line,"$code$argcode($1..)"); + + !$matchrule and $rule and $rule->addcall($name); + _check_insatiable($name,"$1..",$grammar,$line) if $::RD_CHECK; + } + } + elsif ($grammar =~ m/$ATMOST/gco) + { + _parse("a one-to-$1 subrule match", $aftererror,$line,"$code$argcode(..$1)"); + if ($2) + { + my $pos = pos $grammar; + substr($grammar,$pos,0, + "(..$1) "); + + pos $grammar = $pos; + } + else + { + $item = new Parse::RecDescent::Repetition($name,"..$1",1,$1, + $lookahead,$line, + $self, + $matchrule, + $argcode); + $prod and $prod->additem($item) + or _no_rule("repetition",$line,"$code$argcode(..$1)"); + + !$matchrule and $rule and $rule->addcall($name); + } + } + elsif ($grammar =~ m/$BADREP/gco) + { + _parse("an subrule match with invalid repetition specifier", 0,$line); + _error("Incorrect specification of a repeated subrule", + $line); + _hint("Repeated subrules like \"$code$argcode$&\" cannot have + a maximum repetition of zero, nor can they have + negative components in their ranges."); + } + } + else + { + _parse("a subrule match", $aftererror,$line,$code); + my $desc; + if ($name=~/\A_alternation_\d+_of_production_\d+_of_rule/) + { $desc = $self->{"rules"}{$name}->expected } + $item = new Parse::RecDescent::Subrule($name, + $lookahead, + $line, + $desc, + $matchrule, + $argcode); + + 
$prod and $prod->additem($item) + or _no_rule("(sub)rule",$line,$name); + + !$matchrule and $rule and $rule->addcall($name); + } + } + elsif ($grammar =~ m/$LONECOLON/gco ) + { + _error("Unexpected colon encountered", $line); + _hint("Did you mean \"|\" (to start a new production)? + Or perhaps you forgot that the colon + in a rule definition must be + on the same line as the rule name?"); + } + elsif ($grammar =~ m/$ACTION/gco ) # BAD ACTION, ALREADY FAILED + { + _error("Malformed action encountered", + $line); + _hint("Did you forget the closing curly bracket + or is there a syntax error in the action?"); + } + elsif ($grammar =~ m/$OTHER/gco ) + { + _error("Untranslatable item encountered: \"$1\"", + $line); + _hint("Did you misspell \"$1\" + or forget to comment it out?"); + } + + if ($lookaheadspec =~ tr /././ > 3) + { + $lookaheadspec =~ s/\A\s+//; + $lookahead = $lookahead<0 + ? 'a negative lookahead ("...!")' + : 'a positive lookahead ("...")' ; + _warn(1,"Found two or more lookahead specifiers in a + row.",$line) + and + _hint("Multiple positive and/or negative lookaheads + are simply multiplied together to produce a + single positive or negative lookahead + specification. In this case the sequence + \"$lookaheadspec\" was reduced to $lookahead. 
+ Was this your intention?"); + } + $lookahead = 0; + $lookaheadspec = ""; + + $grammar =~ m/\G\s+/gc; + } + + unless ($ERRORS or $isimplicit or !$::RD_CHECK) + { + $self->_check_grammar(); + } + + unless ($ERRORS or $isimplicit or $Parse::RecDescent::compiling) + { + my $code = $self->_code(); + if (defined $::RD_TRACE) + { + print STDERR "printing code (", length($code),") to RD_TRACE\n"; + local *TRACE_FILE; + open TRACE_FILE, ">RD_TRACE" + and print TRACE_FILE "my \$ERRORS;\n$code" + and close TRACE_FILE; + } + + unless ( eval "$code 1" ) + { + _error("Internal error in generated parser code!"); + $@ =~ s/at grammar/in grammar at/; + _hint($@); + } + } + + if ($ERRORS and !_verbosity("HINT")) + { + local $::RD_HINT = 1; + _hint('Set $::RD_HINT (or -RD_HINT if you\'re using "perl -s") + for hints on fixing these problems.'); + } + if ($ERRORS) { $ERRORS=0; return } + return $self; +} + + +sub _addstartcode($$) +{ + my ($self, $code) = @_; + $code =~ s/\A\s*\{(.*)\}\Z/$1/s; + + $self->{"startcode"} .= "$code;\n"; +} + +# CHECK FOR GRAMMAR PROBLEMS.... 
+ +sub _check_insatiable($$$$) +{ + my ($subrule,$repspec,$grammar,$line) = @_; + pos($grammar)=pos($_[2]); + return if $grammar =~ m/$OPTIONAL/gco || $grammar =~ m/$ANY/gco; + my $min = 1; + if ( $grammar =~ m/$MANY/gco + || $grammar =~ m/$EXACTLY/gco + || $grammar =~ m/$ATMOST/gco + || $grammar =~ m/$BETWEEN/gco && do { $min=$2; 1 } + || $grammar =~ m/$ATLEAST/gco && do { $min=$2; 1 } + || $grammar =~ m/$SUBRULE(?!\s*:)/gco + ) + { + return unless $1 eq $subrule && $min > 0; + _warn(3,"Subrule sequence \"$subrule($repspec) $&\" will + (almost certainly) fail.",$line) + and + _hint("Unless subrule \"$subrule\" performs some cunning + lookahead, the repetition \"$subrule($repspec)\" will + insatiably consume as many matches of \"$subrule\" as it + can, leaving none to match the \"$&\" that follows."); + } +} + +sub _check_grammar ($) +{ + my $self = shift; + my $rules = $self->{"rules"}; + my $rule; + foreach $rule ( values %$rules ) + { + next if ! $rule->{"changed"}; + + # CHECK FOR UNDEFINED RULES + + my $call; + foreach $call ( @{$rule->{"calls"}} ) + { + if (!defined ${$rules}{$call} + &&!defined &{"Parse::RecDescent::$call"}) + { + if (!defined $::RD_AUTOSTUB) + { + _warn(3,"Undefined (sub)rule \"$call\" + used in a production.") + and + _hint("Will you be providing this rule + later, or did you perhaps + misspell \"$call\"? Otherwise + it will be treated as an + immediate ."); + eval "sub $self->{namespace}::$call {undef}"; + } + else # EXPERIMENTAL + { + my $rule = $::RD_AUTOSTUB || qq{'$call'}; + _warn(1,"Autogenerating rule: $call") + and + _hint("A call was made to a subrule + named \"$call\", but no such + rule was specified. 
However, + since \$::RD_AUTOSTUB + was defined, a rule stub + ($call : $rule) was + automatically created."); + + $self->_generate("$call : $rule",0,1); + } + } + } + + # CHECK FOR LEFT RECURSION + + if ($rule->isleftrec($rules)) + { + _error("Rule \"$rule->{name}\" is left-recursive."); + _hint("Redesign the grammar so it's not left-recursive. + That will probably mean you need to re-implement + repetitions using the '(s)' notation. + For example: \"$rule->{name}(s)\"."); + next; + } + } +} + +# GENERATE ACTUAL PARSER CODE + +sub _code($) +{ + my $self = shift; + my $code = qq{ +package $self->{namespace}; +use strict; +use vars qw(\$skip \$AUTOLOAD $self->{localvars} ); +\$skip = '$skip'; +$self->{startcode} + +{ +local \$SIG{__WARN__} = sub {0}; +# PRETEND TO BE IN Parse::RecDescent NAMESPACE +*$self->{namespace}::AUTOLOAD = sub +{ + no strict 'refs'; + \$AUTOLOAD =~ s/^$self->{namespace}/Parse::RecDescent/; + goto &{\$AUTOLOAD}; +} +} + +}; + $code .= "push \@$self->{namespace}\::ISA, 'Parse::RecDescent';"; + $self->{"startcode"} = ''; + + my $rule; + foreach $rule ( values %{$self->{"rules"}} ) + { + if ($rule->{"changed"}) + { + $code .= $rule->code($self->{"namespace"},$self); + $rule->{"changed"} = 0; + } + } + + return $code; +} + + +# EXECUTING A PARSE.... + +sub AUTOLOAD # ($parser, $text; $linenum, @args) +{ + croak "Could not find method: $AUTOLOAD\n" unless ref $_[0]; + my $class = ref($_[0]) || $_[0]; + my $text = ref($_[1]) ? 
${$_[1]} : $_[1]; + $_[0]->{lastlinenum} = $_[2]||_linecount($_[1]); + $_[0]->{lastlinenum} = _linecount($_[1]); + $_[0]->{lastlinenum} += $_[2] if @_ > 2; + $_[0]->{offsetlinenum} = $_[0]->{lastlinenum}; + $_[0]->{fulltext} = $text; + $_[0]->{fulltextlen} = length $text; + $_[0]->{deferred} = []; + $_[0]->{errors} = []; + my @args = @_[3..$#_]; + my $args = sub { [ @args ] }; + + $AUTOLOAD =~ s/$class/$_[0]->{namespace}/; + no strict "refs"; + + croak "Unknown starting rule ($AUTOLOAD) called\n" + unless defined &$AUTOLOAD; + my $retval = &{$AUTOLOAD}($_[0],$text,undef,undef,$args); + + if (defined $retval) + { + foreach ( @{$_[0]->{deferred}} ) { &$_; } + } + else + { + foreach ( @{$_[0]->{errors}} ) { _error(@$_); } + } + + if (ref $_[1]) { ${$_[1]} = $text } + + $ERRORS = 0; + return $retval; +} + +sub _parserepeat($$$$$$$$$$) # RETURNS A REF TO AN ARRAY OF MATCHES +{ + my ($parser, $text, $prod, $min, $max, $_noactions, $expectation, $argcode) = @_; + my @tokens = (); + + my $reps; + for ($reps=0; $reps<$max;) + { + $_[6]->at($text); # $_[6] IS $expectation FROM CALLER + my $_savetext = $text; + my $prevtextlen = length $text; + my $_tok; + if (! defined ($_tok = &$prod($parser,$text,1,$_noactions,$argcode))) + { + $text = $_savetext; + last; + } + push @tokens, $_tok if defined $_tok; + last if ++$reps >= $min and $prevtextlen == length $text; + } + + do { $_[6]->failed(); return undef} if $reps<$min; + + $_[1] = $text; + return [@tokens]; +} + + +# ERROR REPORTING.... + +my $errortext; +my $errorprefix; + +open (ERROR, ">&STDERR"); +format ERROR = +@>>>>>>>>>>>>>>>>>>>>: ^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< +$errorprefix, $errortext +~~ ^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + $errortext +. 
+ +select ERROR; +$| = 1; + +# TRACING + +my $tracemsg; +my $tracecontext; +my $tracerulename; +use vars '$tracelevel'; + +open (TRACE, ">&STDERR"); +format TRACE = +@>|@|||||||||@^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<| +$tracelevel, $tracerulename, '|', $tracemsg + | ~~ |^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<| + $tracemsg +. + +select TRACE; +$| = 1; + +open (TRACECONTEXT, ">&STDERR"); +format TRACECONTEXT = +@>|@|||||||||@ |^<<<<<<<<<<<<<<<<<<<<<<<<<<< +$tracelevel, $tracerulename, '|', $tracecontext + | ~~ | |^<<<<<<<<<<<<<<<<<<<<<<<<<<< + $tracecontext +. + + +select TRACECONTEXT; +$| = 1; + +select STDOUT; + +sub _verbosity($) +{ + defined $::RD_TRACE + or defined $::RD_HINT and $_[0] =~ /ERRORS|WARN|HINT/ + or defined $::RD_WARN and $_[0] =~ /ERRORS|WARN/ + or defined $::RD_ERRORS and $_[0] =~ /ERRORS/ +} + +sub _error($;$) +{ + $ERRORS++; + return 0 if ! _verbosity("ERRORS"); + $errortext = $_[0]; + $errorprefix = "ERROR" . ($_[1] ? " (line $_[1])" : ""); + $errortext =~ s/\s+/ /g; + print ERROR "\n" if _verbosity("WARN"); + write ERROR; + return 1; +} + +sub _warn($$;$) +{ + return 0 unless _verbosity("WARN") && ($::RD_HINT || $_[0] >= ($::RD_WARN||1)); + $errortext = $_[1]; + $errorprefix = "Warning" . ($_[2] ? " (line $_[2])" : ""); + print ERROR "\n"; + $errortext =~ s/\s+/ /g; + write ERROR; + return 1; +} + +sub _hint($) +{ + return 0 unless defined $::RD_HINT; + $errortext = "$_[0])"; + $errorprefix = "(Hint"; + $errortext =~ s/\s+/ /g; + write ERROR; + return 1; +} + +sub _tracemax($) +{ + if (defined $::RD_TRACE + && $::RD_TRACE =~ /\d+/ + && $::RD_TRACE>1 + && $::RD_TRACE+10..." + . 
substr($_[0],-$::RD_TRACE/2); + } + else + { + return $_[0]; + } +} + +sub _tracefirst($) +{ + if (defined $::RD_TRACE + && $::RD_TRACE =~ /\d+/ + && $::RD_TRACE>1 + && $::RD_TRACE+10"; + } + else + { + return $_[0]; + } +} + +my $lastcontext = ''; +my $lastrulename = ''; +my $lastlevel = ''; + +sub _trace($;$$$) +{ + $tracemsg = $_[0]; + $tracecontext = $_[1]||$lastcontext; + $tracerulename = $_[2]||$lastrulename; + $tracelevel = $_[3]||$lastlevel; + if ($tracerulename) { $lastrulename = $tracerulename } + if ($tracelevel) { $lastlevel = $tracelevel } + + $tracecontext =~ s/\n/\\n/g; + $tracecontext =~ s/\s+/ /g; + $tracerulename = qq{$tracerulename}; + write TRACE; + if ($tracecontext ne $lastcontext) + { + if ($tracecontext) + { + $lastcontext = _tracefirst($tracecontext); + $tracecontext = qq{"$tracecontext"}; + } + else + { + $tracecontext = qq{}; + } + write TRACECONTEXT; + } +} + +sub _parseunneg($$$$) +{ + _parse($_[0],$_[1],$_[3]); + if ($_[2]<0) + { + _error("Can't negate \"$&\".",$_[3]); + _hint("You can't negate $_[0]. Remove the \"...!\" before + \"$&\"."); + return 0; + } + return 1; +} + +sub _parse($$$;$) +{ + my $what = $_[3] || $&; + $what =~ s/^\s+//; + if ($_[1]) + { + _warn(3,"Found $_[0] ($what) after an unconditional ",$_[2]) + and + _hint("An unconditional always causes the + production containing it to immediately fail. + \u$_[0] that follows an + will never be reached. Did you mean to use + instead?"); + } + + return if ! 
_verbosity("TRACE"); + $errortext = "Treating \"$what\" as $_[0]"; + $errorprefix = "Parse::RecDescent"; + $errortext =~ s/\s+/ /g; + write ERROR; +} + +sub _linecount($) { + scalar substr($_[0], pos $_[0]||0) =~ tr/\n// +} + + +package main; + +use vars qw ( $RD_ERRORS $RD_WARN $RD_HINT $RD_TRACE $RD_CHECK ); +$::RD_CHECK = 1; +$::RD_ERRORS = 1; +$::RD_WARN = 3; + +1; + diff --git a/lib/Parse/RecDescent.pod b/lib/Parse/RecDescent.pod new file mode 100644 index 000000000..99783e7b3 --- /dev/null +++ b/lib/Parse/RecDescent.pod @@ -0,0 +1,2823 @@ +=head1 NAME + +Parse::RecDescent - Generate Recursive-Descent Parsers + +=head1 VERSION + +This document describes version 1.94 of Parse::RecDescent, +released April 9, 2003. + +=head1 SYNOPSIS + + use Parse::RecDescent; + + # Generate a parser from the specification in $grammar: + + $parser = new Parse::RecDescent ($grammar); + + # Generate a parser from the specification in $othergrammar + + $anotherparser = new Parse::RecDescent ($othergrammar); + + + # Parse $text using rule 'startrule' (which must be + # defined in $grammar): + + $parser->startrule($text); + + + # Parse $text using rule 'otherrule' (which must also + # be defined in $grammar): + + $parser->otherrule($text); + + + # Change the universal token prefix pattern + # (the default is: '\s*'): + + $Parse::RecDescent::skip = '[ \t]+'; + + + # Replace productions of existing rules (or create new ones) + # with the productions defined in $newgrammar: + + $parser->Replace($newgrammar); + + + # Extend existing rules (or create new ones) + # by adding extra productions defined in $moregrammar: + + $parser->Extend($moregrammar); + + + # Global flags (useful as command line arguments under -s): + + $::RD_ERRORS # unless undefined, report fatal errors + $::RD_WARN # unless undefined, also report non-fatal problems + $::RD_HINT # if defined, also suggestion remedies + $::RD_TRACE # if defined, also trace parsers' behaviour + $::RD_AUTOSTUB # if defined, generates "stubs" 
for undefined rules + $::RD_AUTOACTION # if defined, appends specified action to productions + + +=head1 DESCRIPTION + +=head2 Overview + +Parse::RecDescent incrementally generates top-down recursive-descent text +parsers from simple I-like grammar specifications. It provides: + +=over 4 + +=item * + +Regular expressions or literal strings as terminals (tokens), + +=item * + +Multiple (non-contiguous) productions for any rule, + +=item * + +Repeated and optional subrules within productions, + +=item * + +Full access to Perl within actions specified as part of the grammar, + +=item * + +Simple automated error reporting during parser generation and parsing, + +=item * + +The ability to commit to, uncommit to, or reject particular +productions during a parse, + +=item * + +The ability to pass data up and down the parse tree ("down" via subrule +argument lists, "up" via subrule return values) + +=item * + +Incremental extension of the parsing grammar (even during a parse), + +=item * + +Precompilation of parser objects, + +=item * + +User-definable reduce-reduce conflict resolution via +"scoring" of matching productions. + +=back + +=head2 Using C + +Parser objects are created by calling C, passing in a +grammar specification (see the following subsections). If the grammar is +correct, C returns a blessed reference which can then be used to initiate +parsing through any rule specified in the original grammar. A typical sequence +looks like this: + + $grammar = q { + # GRAMMAR SPECIFICATION HERE + }; + + $parser = new Parse::RecDescent ($grammar) or die "Bad grammar!\n"; + + # acquire $text + + defined $parser->startrule($text) or print "Bad text!\n"; + +The rule through which parsing is initiated must be explicitly defined +in the grammar (i.e. for the above example, the grammar must include a +rule of the form: "startrule: ". + +If the starting rule succeeds, its value (see below) +is returned. 
Failure to generate the original parser or failure to match a text +is indicated by returning C. Note that it's easy to set up grammars +that can succeed, but which return a value of 0, "0", or "". So don't be +tempted to write: + + $parser->startrule($text) or print "Bad text!\n"; + +Normally, the parser has no effect on the original text. So in the +previous example the value of $text would be unchanged after having +been parsed. + +If, however, the text to be matched is passed by reference: + + $parser->startrule(\$text) + +then any text which was consumed during the match will be removed from the +start of $text. + + +=head2 Rules + +In the grammar from which the parser is built, rules are specified by +giving an identifier (which must satisfy /[A-Za-z]\w*/), followed by a +colon I, followed by one or more productions, +separated by single vertical bars. The layout of the productions +is entirely free-format: + + rule1: production1 + | production2 | + production3 | production4 + +At any point in the grammar previously defined rules may be extended with +additional productions. This is achieved by redeclaring the rule with the new +productions. Thus: + + rule1: a | b | c + rule2: d | e | f + rule1: g | h + +is exactly equivalent to: + + rule1: a | b | c | g | h + rule2: d | e | f + +Each production in a rule consists of zero or more items, each of which +may be either: the name of another rule to be matched (a "subrule"), +a pattern or string literal to be matched directly (a "token"), a +block of Perl code to be executed (an "action"), a special instruction +to the parser (a "directive"), or a standard Perl comment (which is +ignored). + +A rule matches a text if one of its productions matches. A production +matches if each of its items match consecutive substrings of the +text. 
The productions of a rule being matched are tried in the same +order that they appear in the original grammar, and the first matching +production terminates the match attempt (successfully). If all +productions are tried and none matches, the match attempt fails. + +Note that this behaviour is quite different from the "prefer the longer match" +behaviour of I. For example, if I were parsing the rule: + + seq : 'A' 'B' + | 'A' 'B' 'C' + +upon matching "AB" it would look ahead to see if a 'C' is next and, if +so, will match the second production in preference to the first. In +other words, I effectively tries all the productions of a rule +breadth-first in parallel, and selects the "best" match, where "best" +means longest (note that this is a gross simplification of the true +behaviour of I but it will do for our purposes). + +In contrast, C tries each production depth-first in +sequence, and selects the "best" match, where "best" means first. This is +the fundamental difference between "bottom-up" and "recursive descent" +parsing. + +Each successfully matched item in a production is assigned a value, +which can be accessed in subsequent actions within the same +production (or, in some cases, as the return value of a successful +subrule call). Unsuccessful items don't have an associated value, +since the failure of an item causes the entire surrounding production +to immediately fail. The following sections describe the various types +of items and their success values. + + +=head2 Subrules + +A subrule which appears in a production is an instruction to the parser to +attempt to match the named rule at that point in the text being +parsed. If the named subrule is not defined when requested the +production containing it immediately fails (unless it was "autostubbed" - see +L). + +A rule may (recursively) call itself as a subrule, but I as the +left-most item in any of its productions (since such recursions are usually +non-terminating). 
+ +The value associated with a subrule is the value associated with its +C<$return> variable (see L<"Actions"> below), or with the last successfully +matched item in the subrule match. + +Subrules may also be specified with a trailing repetition specifier, +indicating that they are to be (greedily) matched the specified number +of times. The available specifiers are: + + subrule(?) # Match one-or-zero times + subrule(s) # Match one-or-more times + subrule(s?) # Match zero-or-more times + subrule(N) # Match exactly N times for integer N > 0 + subrule(N..M) # Match between N and M times + subrule(..M) # Match between 1 and M times + subrule(N..) # Match at least N times + +Repeated subrules keep matching until either the subrule fails to +match, or it has matched the minimal number of times but fails to +consume any of the parsed text (this second condition prevents the +subrule matching forever in some cases). + +Since a repeated subrule may match many instances of the subrule itself, the +value associated with it is not a simple scalar, but rather a reference to a +list of scalars, each of which is the value associated with one of the +individual subrule matches. In other words in the rule: + + program: statement(s) + +the value associated with the repeated subrule "statement(s)" is a reference +to an array containing the values matched by each call to the individual +subrule "statement". + +Repetition modifiers may include a separator pattern: + + program: statement(s /;/) + +specifying some sequence of characters to be skipped between each repetition. +This is really just a shorthand for the E<lt>leftop:...E<gt> directive +(see below). + +=head2 Tokens + +If a quote-delimited string or a Perl regex appears in a production, +the parser attempts to match that string or pattern at that point in +the text. 
For example: + + typedef: "typedef" typename identifier ';' + + identifier: /[A-Za-z_][A-Za-z0-9_]*/ + +As in regular Perl, a single quoted string is uninterpolated, whilst +a double-quoted string or a pattern is interpolated (at the time +of matching, I when the parser is constructed). Hence, it is +possible to define rules in which tokens can be set at run-time: + + typedef: "$::typedefkeyword" typename identifier ';' + + identifier: /$::identpat/ + +Note that, since each rule is implemented inside a special namespace +belonging to its parser, it is necessary to explicitly quantify +variables from the main package. + +Regex tokens can be specified using just slashes as delimiters +or with the explicit CdelimiterE......EdelimiterE> syntax: + + typedef: "typedef" typename identifier ';' + + typename: /[A-Za-z_][A-Za-z0-9_]*/ + + identifier: m{[A-Za-z_][A-Za-z0-9_]*} + +A regex of either type can also have any valid trailing parameter(s) +(that is, any of [cgimsox]): + + typedef: "typedef" typename identifier ';' + + identifier: / [a-z_] # LEADING ALPHA OR UNDERSCORE + [a-z0-9_]* # THEN DIGITS ALSO ALLOWED + /ix # CASE/SPACE/COMMENT INSENSITIVE + +The value associated with any successfully matched token is a string +containing the actual text which was matched by the token. + +It is important to remember that, since each grammar is specified in a +Perl string, all instances of the universal escape character '\' within +a grammar must be "doubled", so that they interpolate to single '\'s when +the string is compiled. 
For example, to use the grammar: + + word: /\S+/ | backslash + line: prefix word(s) "\n" + backslash: '\\' + +the following code is required: + + $parser = new Parse::RecDescent (q{ + + word: /\\S+/ | backslash + line: prefix word(s) "\\n" + backslash: '\\\\' + + }); + + +=head2 Terminal Separators + +For the purpose of matching, each terminal in a production is considered +to be preceded by a "prefix" - a pattern which must be +matched before a token match is attempted. By default, the +prefix is optional whitespace (which always matches, at +least trivially), but this default may be reset in any production. + +The variable C<$Parse::RecDescent::skip> stores the universal +prefix, which is the default for all terminal matches in all parsers +built with C. + +The prefix for an individual production can be altered +by using the Cskip:...E> directive (see below). + + +=head2 Actions + +An action is a block of Perl code which is to be executed (as the +block of a C statement) when the parser reaches that point in a +production. The action executes within a special namespace belonging to +the active parser, so care must be taken in correctly qualifying variable +names (see also L below). + +The action is considered to succeed if the final value of the block +is defined (that is, if the implied C statement evaluates to a +defined value - I). Note +that the value associated with a successful action is also the final +value in the block. + +An action will I if its last evaluated value is C. This is +surprisingly easy to accomplish by accident. For instance, here's an +infuriating case of an action that makes its production fail, but only +when debugging I activated: + + description: name rank serial_number + { print "Got $item[2] $item[1] ($item[3])\n" + if $::debugging + } + +If C<$debugging> is false, no statement in the block is executed, so +the final value is C, and the entire production fails. 
The solution is: + + description: name rank serial_number + { print "Got $item[2] $item[1] ($item[3])\n" + if $::debugging; + 1; + } + +Within an action, a number of useful parse-time variables are +available in the special parser namespace (there are other variables +also accessible, but meddling with them will probably just break your +parser. As a general rule, if you avoid referring to unqualified +variables - especially those starting with an underscore - inside an action, +things should be okay): + +=over 4 + +=item C<@item> and C<%item> + +The array slice C<@item[1..$#item]> stores the value associated with each item +(that is, each subrule, token, or action) in the current production. The +analogy is to C<$1>, C<$2>, etc. in a I grammar. +Note that, for obvious reasons, C<@item> only contains the +values of items I the current point in the production. + +The first element (C<$item[0]>) stores the name of the current rule +being matched. + +C<@item> is a standard Perl array, so it can also be indexed with negative +numbers, representing the number of items I from the current position in +the parse: + + stuff: /various/ bits 'and' pieces "then" data 'end' + { print $item[-2] } # PRINTS data + # (EASIER THAN: $item[6]) + +The C<%item> hash complements the <@item> array, providing named +access to the same item values: + + stuff: /various/ bits 'and' pieces "then" data 'end' + { print $item{data} # PRINTS data + # (EVEN EASIER THAN USING @item) + + +The results of named subrules are stored in the hash under each +subrule's name (including the repetition specifier, if any), +whilst all other items are stored under a "named +positional" key that indictates their ordinal position within their item +type: __STRINGI__, __PATTERNI__, __DIRECTIVEI__, __ACTIONI__: + + stuff: /various/ bits 'and' pieces "then" data 'end' { save } + { print $item{__PATTERN1__}, # PRINTS 'various' + $item{__STRING2__}, # PRINTS 'then' + $item{__ACTION1__}, # PRINTS RETURN + # VALUE OF 
save + } + + +If you want proper I<named> access to patterns or literals, you need to turn +them into separate rules: + + stuff: various bits 'and' pieces "then" data 'end' + { print $item{various} # PRINTS various + } + + various: /various/ + + +The special entry C<$item{__RULE__}> stores the name of the current +rule (i.e. the same value as C<$item[0]>). + +The advantage of using C<%item>, instead of C<@item> is that it +removes the need to track item positions that may change as a grammar +evolves. For example, adding an interim C<E<lt>skipE<gt>> directive +or action can silently ruin a trailing action, by moving an C<@item> +element "down" the array one place. In contrast, the named entry +of C<%item> is unaffected by such an insertion. + +A limitation of the C<%item> hash is that it only records the I<last> +value of a particular subrule. For example: + + range: '(' number '..' number ')' + { $return = $item{number} } + +will return only the value corresponding to the I<second> match of the +C<number> subrule. In other words, successive calls to a subrule +overwrite the corresponding entry in C<%item>. Once again, the +solution is to rename each subrule in its own rule: + + range: '(' from_num '..' to_num ')' + { $return = $item{from_num} } + + from_num: number + to_num: number + + + +=item C<@arg> and C<%arg> + +The array C<@arg> and the hash C<%arg> store any arguments passed to +the rule from some other rule (see L<"Subrule argument lists">). Changes +to the elements of either variable do not propagate back to the calling +rule (data can be passed back from a subrule via the C<$return> +variable - see next item). + + +=item C<$return> + +If a value is assigned to C<$return> within an action, that value is +returned if the production containing the action eventually matches +successfully. Note that setting C<$return> I<doesn't> cause the current +production to succeed. It merely tells it what to return if it I<does> succeed. +Hence C<$return> is analogous to C<$$> in a I<yacc> grammar. 
+ +If C<$return> is not assigned within a production, the value of the +last component of the production (namely: C<$item[$#item]>) is +returned if the production succeeds. + + +=item C<$commit> + +The current state of commitment to the current production (see L<"Directives"> +below). + +=item C<$skip> + +The current terminal prefix (see L<"Directives"> below). + +=item C<$text> + +The remaining (unparsed) text. Changes to C<$text> I out of unsuccessful productions, but I survive +successful productions. Hence it is possible to dynamically alter the +text being parsed - for example, to provide a C<#include>-like facility: + + hash_include: '#include' filename + { $text = ::loadfile($item[2]) . $text } + + filename: '<' /[a-z0-9._-]+/i '>' { $return = $item[2] } + | '"' /[a-z0-9._-]+/i '"' { $return = $item[2] } + + +=item C<$thisline> and C<$prevline> + +C<$thisline> stores the current line number within the current parse +(starting from 1). C<$prevline> stores the line number for the last +character which was already successfully parsed (this will be different from +C<$thisline> at the end of each line). + +For efficiency, C<$thisline> and C<$prevline> are actually tied +hashes, and only recompute the required line number when the variable's +value is used. + +Assignment to C<$thisline> adjusts the line number calculator, so that +it believes that the current line number is the value being assigned. Note +that this adjustment will be reflected in all subsequent line numbers +calculations. + +Modifying the value of the variable C<$text> (as in the previous +C example, for instance) will confuse the line +counting mechanism. To prevent this, you should call +C I +after any assignment to the variable C<$text> (or, at least, before the +next attempt to use C<$thisline>). + +Note that if a production fails after assigning to or +resync'ing C<$thisline>, the parser's line counter mechanism will +usually be corrupted. + +Also see the entry for C<@itempos>. 
+ +The line number can be set to values other than 1, by calling the start +rule with a second argument. For example: + + $parser = new Parse::RecDescent ($grammar); + + $parser->input($text, 10); # START LINE NUMBERS AT 10 + + +=item C<$thiscolumn> and C<$prevcolumn> + +C<$thiscolumn> stores the current column number within the current line +being parsed (starting from 1). C<$prevcolumn> stores the column number +of the last character which was actually successfully parsed. Usually +C<$prevcolumn == $thiscolumn-1>, but not at the end of lines. + +For efficiency, C<$thiscolumn> and C<$prevcolumn> are +actually tied hashes, and only recompute the required column number +when the variable's value is used. + +Assignment to C<$thiscolumn> or C<$prevcolumn> is a fatal error. + +Modifying the value of the variable C<$text> (as in the previous +C example, for instance) may confuse the column +counting mechanism. + +Note that C<$thiscolumn> reports the column number I any +whitespace that might be skipped before reading a token. Hence +if you wish to know where a token started (and ended) use something like this: + + rule: token1 token2 startcol token3 endcol token4 + { print "token3: columns $item[3] to $item[5]"; } + + startcol: '' { $thiscolumn } # NEED THE '' TO STEP PAST TOKEN SEP + endcol: { $prevcolumn } + +Also see the entry for C<@itempos>. + +=item C<$thisoffset> and C<$prevoffset> + +C<$thisoffset> stores the offset of the current parsing position +within the complete text +being parsed (starting from 0). C<$prevoffset> stores the offset +of the last character which was actually successfully parsed. In all +cases C<$prevoffset == $thisoffset-1>. + +For efficiency, C<$thisoffset> and C<$prevoffset> are +actually tied hashes, and only recompute the required offset +when the variable's value is used. + +Assignment to C<$thisoffset> or <$prevoffset> is a fatal error. + +Modifying the value of the variable C<$text> will I affect the +offset counting mechanism. 
+ +Also see the entry for C<@itempos>. + +=item C<@itempos> + +The array C<@itempos> stores a hash reference corresponding to +each element of C<@item>. The elements of the hash provide the +following: + + $itempos[$n]{offset}{from} # VALUE OF $thisoffset BEFORE $item[$n] + $itempos[$n]{offset}{to} # VALUE OF $prevoffset AFTER $item[$n] + $itempos[$n]{line}{from} # VALUE OF $thisline BEFORE $item[$n] + $itempos[$n]{line}{to} # VALUE OF $prevline AFTER $item[$n] + $itempos[$n]{column}{from} # VALUE OF $thiscolumn BEFORE $item[$n] + $itempos[$n]{column}{to} # VALUE OF $prevcolumn AFTER $item[$n] + +Note that the various C<$itempos[$n]...{from}> values record the +appropriate value I any token prefix has been skipped. + +Hence, instead of the somewhat tedious and error-prone: + + rule: startcol token1 endcol + startcol token2 endcol + startcol token3 endcol + { print "token1: columns $item[1] + to $item[3] + token2: columns $item[4] + to $item[6] + token3: columns $item[7] + to $item[9]" } + + startcol: '' { $thiscolumn } # NEED THE '' TO STEP PAST TOKEN SEP + endcol: { $prevcolumn } + +it is possible to write: + + rule: token1 token2 token3 + { print "token1: columns $itempos[1]{column}{from} + to $itempos[1]{column}{to} + token2: columns $itempos[2]{column}{from} + to $itempos[2]{column}{to} + token3: columns $itempos[3]{column}{from} + to $itempos[3]{column}{to}" } + +Note however that (in the current implementation) the use of C<@itempos> +anywhere in a grammar implies that item positioning information is +collected I during the parse. Depending on the grammar +and the size of the text to be parsed, this may be prohibitively +expensive and the explicit use of C<$thisline>, C<$thiscolumn>, etc. may +be a better choice. + + +=item C<$thisparser> + +A reference to the S> object through which +parsing was initiated. + +The value of C<$thisparser> propagates down the subrules of a parse +but not back up. 
Hence, you can invoke subrules from another parser +for the scope of the current rule as follows: + + rule: subrule1 subrule2 + | { $thisparser = $::otherparser } + | subrule3 subrule4 + | subrule5 + +The result is that the production calls "subrule1" and "subrule2" of +the current parser, and the remaining productions call the named subrules +from C<$::otherparser>. Note, however that "Bad Things" will happen if +C<::otherparser> isn't a blessed reference and/or doesn't have methods +with the same names as the required subrules! + +=item C<$thisrule> + +A reference to the S> object corresponding to the +rule currently being matched. + +=item C<$thisprod> + +A reference to the S> object +corresponding to the production currently being matched. + +=item C<$score> and C<$score_return> + +$score stores the best production score to date, as specified by +an earlier Cscore:...E> directive. $score_return stores +the corresponding return value for the successful production. + +See L. + +=back + +B the parser relies on the information in the various C +objects in some non-obvious ways. Tinkering with the other members of +these objects will probably cause Bad Things to happen, unless you +I know what you're doing. The only exception to this advice is +that the use of C<$this...-E{local}> is always safe. + + +=head2 Start-up Actions + +Any actions which appear I the first rule definition in a +grammar are treated as "start-up" actions. Each such action is +stripped of its outermost brackets and then evaluated (in the parser's +special namespace) just before the rules of the grammar are first +compiled. 
+ +The main use of start-up actions is to declare local variables within the +parser's special namespace: + + { my $lastitem = '???'; } + + list: item(s) { $return = $lastitem } + + item: book { $lastitem = 'book'; } + bell { $lastitem = 'bell'; } + candle { $lastitem = 'candle'; } + +but start-up actions can be used to execute I valid Perl code +within a parser's special namespace. + +Start-up actions can appear within a grammar extension or replacement +(that is, a partial grammar installed via C or +C - see L), and will be +executed before the new grammar is installed. Note, however, that a +particular start-up action is only ever executed once. + + +=head2 Autoactions + +It is sometimes desirable to be able to specify a default action to be +taken at the end of every production (for example, in order to easily +build a parse tree). If the variable C<$::RD_AUTOACTION> is defined +when C is called, the contents of that +variable are treated as a specification of an action which is to appended +to each production in the corresponding grammar. So, for example, to construct +a simple parse tree: + + $::RD_AUTOACTION = q { [@item] }; + + parser = new Parse::RecDescent (q{ + expression: and_expr '||' expression | and_expr + and_expr: not_expr '&&' and_expr | not_expr + not_expr: '!' brack_expr | brack_expr + brack_expr: '(' expression ')' | identifier + identifier: /[a-z]+/i + }); + +which is equivalent to: + + parser = new Parse::RecDescent (q{ + expression: and_expr '||' expression + { [@item] } + | and_expr + { [@item] } + + and_expr: not_expr '&&' and_expr + { [@item] } + | not_expr + { [@item] } + + not_expr: '!' 
brack_expr + { [@item] } + | brack_expr + { [@item] } + + brack_expr: '(' expression ')' + { [@item] } + | identifier + { [@item] } + + identifier: /[a-z]+/i + { [@item] } + }); + +Alternatively, we could take an object-oriented approach, use different +classes for each node (and also eliminating redundant intermediate nodes): + + $::RD_AUTOACTION = q + { $#item==1 ? $item[1] : new ${"$item[0]_node"} (@item[1..$#item]) }; + + parser = new Parse::RecDescent (q{ + expression: and_expr '||' expression | and_expr + and_expr: not_expr '&&' and_expr | not_expr + not_expr: '!' brack_expr | brack_expr + brack_expr: '(' expression ')' | identifier + identifier: /[a-z]+/i + }); + +which is equivalent to: + + parser = new Parse::RecDescent (q{ + expression: and_expr '||' expression + { new expression_node (@item[1..3]) } + | and_expr + + and_expr: not_expr '&&' and_expr + { new and_expr_node (@item[1..3]) } + | not_expr + + not_expr: '!' brack_expr + { new not_expr_node (@item[1..2]) } + | brack_expr + + brack_expr: '(' expression ')' + { new brack_expr_node (@item[1..3]) } + | identifier + + identifier: /[a-z]+/i + { new identifer_node (@item[1]) } + }); + +Note that, if a production already ends in an action, no autoaction is appended +to it. For example, in this version: + + $::RD_AUTOACTION = q + { $#item==1 ? $item[1] : new ${"$item[0]_node"} (@item[1..$#item]) }; + + parser = new Parse::RecDescent (q{ + expression: and_expr '&&' expression | and_expr + and_expr: not_expr '&&' and_expr | not_expr + not_expr: '!' brack_expr | brack_expr + brack_expr: '(' expression ')' | identifier + identifier: /[a-z]+/i + { new terminal_node($item[1]) } + }); + +each C match produces a C object, I an +C object. + +A level 1 warning is issued each time an "autoaction" is added to +some production. + + +=head2 Autotrees + +A commonly needed autoaction is one that builds a parse-tree. 
It is moderately +tricky to set up such an action (which must treat terminals differently from +non-terminals), so Parse::RecDescent simplifies the process by providing the +CautotreeE> directive. + +If this directive appears at the start of grammar, it causes +Parse::RecDescent to insert autoactions at the end of any rule except +those which already end in an action. The action inserted depends on whether +the production is an intermediate rule (two or more items), or a terminal +of the grammar (i.e. a single pattern or string item). + +So, for example, the following grammar: + + + + file : command(s) + command : get | set | vet + get : 'get' ident ';' + set : 'set' ident 'to' value ';' + vet : 'check' ident 'is' value ';' + ident : /\w+/ + value : /\d+/ + +is equivalent to: + + file : command(s) { bless \%item, $item[0] } + command : get { bless \%item, $item[0] } + | set { bless \%item, $item[0] } + | vet { bless \%item, $item[0] } + get : 'get' ident ';' { bless \%item, $item[0] } + set : 'set' ident 'to' value ';' { bless \%item, $item[0] } + vet : 'check' ident 'is' value ';' { bless \%item, $item[0] } + + ident : /\w+/ { bless {__VALUE__=>$item[1]}, $item[0] } + value : /\d+/ { bless {__VALUE__=>$item[1]}, $item[0] } + +Note that each node in the tree is blessed into a class of the same name +as the rule itself. This makes it easy to build object-oriented +processors for the parse-trees that the grammar produces. Note too that +the last two rules produce special objects with the single attribute +'__VALUE__'. This is because they consist solely of a single terminal. 
+ +This autoaction-ed grammar would then produce a parse tree in a data +structure like this: + + { + file => { + command => { + [ get => { + identifier => { __VALUE__ => 'a' }, + }, + set => { + identifier => { __VALUE__ => 'b' }, + value => { __VALUE__ => '7' }, + }, + vet => { + identifier => { __VALUE__ => 'b' }, + value => { __VALUE__ => '7' }, + }, + ], + }, + } + } + +(except, of course, that each nested hash would also be blessed into +the appropriate class). + + +=head2 Autostubbing + +Normally, if a subrule appears in some production, but no rule of that +name is ever defined in the grammar, the production which refers to the +non-existent subrule fails immediately. This typically occurs as a +result of misspellings, and is a sufficiently common occurance that a +warning is generated for such situations. + +However, when prototyping a grammar it is sometimes useful to be +able to use subrules before a proper specification of them is +really possible. For example, a grammar might include a section like: + + function_call: identifier '(' arg(s?) ')' + + identifier: /[a-z]\w*/i + +where the possible format of an argument is sufficiently complex that +it is not worth specifying in full until the general function call +syntax has been debugged. In this situation it is convenient to leave +the real rule C undefined and just slip in a placeholder (or +"stub"): + + arg: 'arg' + +so that the function call syntax can be tested with dummy input such as: + + f0() + f1(arg) + f2(arg arg) + f3(arg arg arg) + +et cetera. + +Early in prototyping, many such "stubs" may be required, so +C provides a means of automating their definition. +If the variable C<$::RD_AUTOSTUB> is defined when a parser is built, +a subrule reference to any non-existent rule (say, C), +causes a "stub" rule of the form: + + sr: 'sr' + +to be automatically defined in the generated parser. +A level 1 warning is issued for each such "autostubbed" rule. 
+ +Hence, with C<$::RD_AUTOSTUB> defined, it is possible to only partially +specify a grammar, and then "fake" matches of the unspecified +(sub)rules by just typing in their name. + + + +=head2 Look-ahead + +If a subrule, token, or action is prefixed by "...", then it is +treated as a "look-ahead" request. That means that the current production can +(as usual) only succeed if the specified item is matched, but that the matching +I<does not consume any of the text being parsed>. This is very similar to the +C<(?=...)> look-ahead construct in Perl patterns. Thus, the rule: + + inner_word: word ...word + +will match whatever the subrule "word" matches, provided that match is followed +by some more text which subrule "word" would also match (although this +second substring is not actually consumed by "inner_word") + +Likewise, a "...!" prefix, causes the following item to succeed (without +consuming any text) if and only if it would normally fail. Hence, a +rule such as: + + identifier: ...!keyword ...!'_' /[A-Za-z_]\w*/ + +matches a string of characters which satisfies the pattern +C</[A-Za-z_]\w*/>, but only if the same sequence of characters would +not match either subrule "keyword" or the literal token '_'. + +Sequences of look-ahead prefixes accumulate, multiplying their positive and/or +negative senses. Hence: + + inner_word: word ...!......!word + +is exactly equivalent to the original example above (a warning is issued in +cases like these, since they often indicate something left out, or +misunderstood). + +Note that actions can also be treated as look-aheads. In such cases, +the state of the parser text (in the local variable C<$text>) +I<after> the look-ahead action is guaranteed to be identical to its +state I<before> the action, regardless of how it's changed I<within> +the action (unless you actually undefine C<$text>, in which case you +get the disaster you deserve :-). + + +=head2 Directives + +Directives are special pre-defined actions which may be used to alter +the behaviour of the parser. 
There are currently eighteen directives: +CcommitE>, +CuncommitE>, +CrejectE>, +CscoreE>, +CautoscoreE>, +CskipE>, +CresyncE>, +CerrorE>, +CrulevarE>, +CmatchruleE>, +CleftopE>, +CrightopE>, +CdeferE>, +CnocheckE>, +Cperl_quotelikeE>, +Cperl_codeblockE>, +Cperl_variableE>, +and CtokenE>. + +=over 4 + +=item Committing and uncommitting + +The CcommitE> and CuncommitE> directives permit the recursive +descent of the parse tree to be pruned (or "cut") for efficiency. +Within a rule, a CcommitE> directive instructs the rule to ignore subsequent +productions if the current production fails. For example: + + command: 'find' filename + | 'open' filename + | 'move' filename filename + +Clearly, if the leading token 'find' is matched in the first production but that +production fails for some other reason, then the remaining +productions cannot possibly match. The presence of the +CcommitE> causes the "command" rule to fail immediately if +an invalid "find" command is found, and likewise if an invalid "open" +command is encountered. + +It is also possible to revoke a previous commitment. For example: + + if_statement: 'if' condition + 'then' block + 'else' block + | 'if' condition + 'then' block + +In this case, a failure to find an "else" block in the first +production shouldn't preclude trying the second production, but a +failure to find a "condition" certainly should. + +As a special case, any production in which the I item is an +CuncommitE> immediately revokes a preceding CcommitE> +(even though the production would not otherwise have been tried). For +example, in the rule: + + request: 'explain' expression + | 'explain' keyword + | 'save' + | 'quit' + | term '?' + +if the text being matched was "explain?", and the first two +productions failed, then the CcommitE> in production two would cause +productions three and four to be skipped, but the leading +CuncommitE> in the production five would allow that production to +attempt a match. 
+ +Note in the preceding example, that the CcommitE> was only placed +in production two. If production one had been: + + request: 'explain' expression + +then production two would be (inappropriately) skipped if a leading +"explain..." was encountered. + +Both CcommitE> and CuncommitE> directives always succeed, and their value +is always 1. + + +=item Rejecting a production + +The CrejectE> directive immediately causes the current production +to fail (it is exactly equivalent to, but more obvious than, the +action C<{undef}>). A CrejectE> is useful when it is desirable to get +the side effects of the actions in one production, without prejudicing a match +by some other production later in the rule. For example, to insert +tracing code into the parse: + + complex_rule: { print "In complex rule...\n"; } + + complex_rule: simple_rule '+' 'i' '*' simple_rule + | 'i' '*' simple_rule + | simple_rule + + +It is also possible to specify a conditional rejection, using the +form Creject:IE>, which only rejects if the +specified condition is true. This form of rejection is exactly +equivalent to the action C<{(I)?undef:1}E>. +For example: + + command: save_command + | restore_command + | { exit } + | + +A CrejectE> directive never succeeds (and hence has no +associated value). A conditional rejection may succeed (if its +condition is not satisfied), in which case its value is 1. + +As an extra optimization, C ignores any production +which I with an unconditional CrejectE> directive, +since any such production can never successfully match or have any +useful side-effects. A level 1 warning is issued in all such cases. + +Note that productions beginning with conditional +Creject:...E> directives are I "optimized away" in +this manner, even if they are always guaranteed to fail (for example: +Creject:1E>) + +Due to the way grammars are parsed, there is a minor restriction on the +condition of a conditional Creject:...E>: it cannot +contain any raw '<' or '>' characters. 
For example: + + line: cmd max> data + +results in an error when a parser is built from this grammar (since the +grammar parser has no way of knowing whether the first > is a "less than" +or the end of the Creject:...E>. + +To overcome this problem, put the condition inside a do{} block: + + line: cmd max}> data + +Note that the same problem may occur in other directives that take +arguments. The same solution will work in all cases. + +=item Skipping between terminals + +The CskipE> directive enables the terminal prefix used in +a production to be changed. For example: + + OneLiner: Command Arg(s) /;/ + +causes only blanks and tabs to be skipped before terminals in the C +subrule (and any of I subrules>, and also before the final C terminal. +Once the production is complete, the previous terminal prefix is +reinstated. Note that this implies that distinct productions of a rule +must reset their terminal prefixes individually. + +The CskipE> directive evaluates to the I terminal prefix, +so it's easy to reinstate a prefix later in a production: + + Command: CSV(s) Modifier + +The value specified after the colon is interpolated into a pattern, so all of +the following are equivalent (though their efficiency increases down the list): + + # ASSUMING THE VARS HOLD THE OBVIOUS VALUES + + + + + + + +There is no way of directly setting the prefix for +an entire rule, except as follows: + + Rule: Prod1 + | Prod2a Prod2b + | Prod3 + +or, better: + + Rule: + ( + Prod1 + | Prod2a Prod2b + | Prod3 + ) + + +B + + +=item Resynchronization + +The CresyncE> directive provides a visually distinctive +means of consuming some of the text being parsed, usually to skip an +erroneous input. In its simplest form CresyncE> simply +consumes text up to and including the next newline (C<"\n">) +character, succeeding only if the newline is found, in which case it +causes its surrounding rule to return zero on success. 
+ +In other words, a CresyncE> is exactly equivalent to the token +C followed by the action S> (except that +productions beginning with a CresyncE> are ignored when generating +error messages). A typical use might be: + + script : command(s) + + command: save_command + | restore_command + | # TRY NEXT LINE, IF POSSIBLE + +It is also possible to explicitly specify a resynchronization +pattern, using the Cresync:IE> variant. This version +succeeds only if the specified pattern matches (and consumes) the +parsed text. In other words, Cresync:IE> is exactly +equivalent to the token C/> (followed by a S> +action). For example, if commands were terminated by newlines or semi-colons: + + command: save_command + | restore_command + | + +The value of a successfully matched CresyncE> directive (of either +type) is the text that it consumed. Note, however, that since the +directive also sets C<$return>, a production consisting of a lone +CresyncE> succeeds but returns the value zero (which a calling rule +may find useful to distinguish between "true" matches and "tolerant" matches). +Remember that returning a zero value indicates that the rule I (since +only an C denotes failure within C parsers. + + +=item Error handling + +The CerrorE> directive provides automatic or user-defined +generation of error messages during a parse. In its simplest form +CerrorE> prepares an error message based on +the mismatch between the last item expected and the text which cause +it to fail. For example, given the rule: + + McCoy: curse ',' name ', I'm a doctor, not a' a_profession '!' + | pronoun 'dead,' name '!' + | + +the following strings would produce the following messages: + +=over 4 + +=item "Amen, Jim!" + + ERROR (line 1): Invalid McCoy: Expected curse or pronoun + not found + +=item "Dammit, Jim, I'm a doctor!" + + ERROR (line 1): Invalid McCoy: Expected ", I'm a doctor, not a" + but found ", I'm a doctor!" 
instead + +=item "He's dead,\n" + + ERROR (line 2): Invalid McCoy: Expected name not found + +=item "He's alive!" + + ERROR (line 1): Invalid McCoy: Expected 'dead,' but found + "alive!" instead + +=item "Dammit, Jim, I'm a doctor, not a pointy-eared Vulcan!" + + ERROR (line 1): Invalid McCoy: Expected a profession but found + "pointy-eared Vulcan!" instead + + +=back + +Note that, when autogenerating error messages, all underscores in any +rule name used in a message are replaced by single spaces (for example +"a_production" becomes "a production"). Judicious choice of rule +names can therefore considerably improve the readability of automatic +error messages (as well as the maintainability of the original +grammar). + +If the automatically generated error is not sufficient, it is possible to +provide an explicit message as part of the error directive. For example: + + Spock: "Fascinating ',' (name | 'Captain') '.' + | "Highly illogical, doctor." + | + +which would result in I failures to parse a "Spock" subrule printing the +following message: + + ERROR (line ): Invalid Spock: He never said that! + +The error message is treated as a "qq{...}" string and interpolated +when the error is generated (I when the directive is specified!). +Hence: + + + +would correctly insert the ambient text string which caused the error. + +There are two other forms of error directive: Cerror?E> and +Serror?: msgE>>. These behave just like CerrorE> +and Serror: msgE>> respectively, except that they are +only triggered if the rule is "committed" at the time they are +encountered. For example: + + Scotty: "Ya kenna change the Laws of Phusics," name + | name ',' 'she's goanta blaw!' + | + +will only generate an error for a string beginning with "Ya kenna +change the Laws o' Phusics," or a valid name, but which still fails to match the +corresponding production. 
That is, C<$parser-EScotty("Aye, Cap'ain")> will +fail silently (since neither production will "commit" the rule on that +input), whereas SScotty("Mr Spock, ah jest kenna do'ut!")>> +will fail with the error message: + + ERROR (line 1): Invalid Scotty: expected 'she's goanta blaw!' + but found 'I jest kenna do'ut!' instead. + +since in that case the second production would commit after matching +the leading name. + +Note that to allow this behaviour, all CerrorE> directives which are +the first item in a production automatically uncommit the rule just +long enough to allow their production to be attempted (that is, when +their production fails, the commitment is reinstated so that +subsequent productions are skipped). + +In order to I uncommit the rule before an error message, +it is necessary to put an explicit CuncommitE> before the +CerrorE>. For example: + + line: 'Kirk:' Kirk + | 'Spock:' Spock + | 'McCoy:' McCoy + | + | + + +Error messages generated by the various Cerror...E> directives +are not displayed immediately. Instead, they are "queued" in a buffer and +are only displayed once parsing ultimately fails. Moreover, +Cerror...E> directives that cause one production of a rule +to fail are automatically removed from the message queue +if another production subsequently causes the entire rule to succeed. +This means that you can put +Cerror...E> directives wherever useful diagnosis can be done, +and only those associated with actual parser failure will ever be +displayed. Also see L<"Gotchas">. + +As a general rule, the most useful diagnostics are usually generated +either at the very lowest level within the grammar, or at the very +highest. A good rule of thumb is to identify those subrules which +consist mainly (or entirely) of terminals, and then put an +Cerror...E> directive at the end of any other rule which calls +one or more of those subrules. 
+ +There is one other situation in which the output of the various types of +error directive is suppressed; namely, when the rule containing them +is being parsed as part of a "look-ahead" (see L<"Look-ahead">). In this +case, the error directive will still cause the rule to fail, but will do +so silently. + +An unconditional CerrorE> directive always fails (and hence has no +associated value). This means that encountering such a directive +always causes the production containing it to fail. Hence an +CerrorE> directive will inevitably be the last (useful) item of a +rule (a level 3 warning is issued if a production contains items after an unconditional +CerrorE> directive). + +An Cerror?E> directive will I (that is: fail to fail :-), if +the current rule is uncommitted when the directive is encountered. In +that case the directive's associated value is zero. Hence, this type +of error directive I be used before the end of a +production. For example: + + command: 'do' something + | 'report' something + | + + +B The Cerror?E> directive does I mean "always fail (but +do so silently unless committed)". It actually means "only fail (and report) if +committed, otherwise I". To achieve the "fail silently if uncommitted" +semantics, it is necessary to use: + + rule: item item(s) + | # FAIL SILENTLY UNLESS COMMITTED + +However, because people seem to expect a lone Cerror?E> directive +to work like this: + + rule: item item(s) + | + | + +Parse::RecDescent automatically appends a +CrejectE> directive if the Cerror?E> directive +is the only item in a production. A level 2 warning (see below) +is issued when this happens. + +The level of error reporting during both parser construction and +parsing is controlled by the presence or absence of four global +variables: C<$::RD_ERRORS>, C<$::RD_WARN>, C<$::RD_HINT>, and +<$::RD_TRACE>. If C<$::RD_ERRORS> is defined (and, by default, it is) +then fatal errors are reported. 
+ +Whenever C<$::RD_WARN> is defined, certain non-fatal problems are also reported. +Warnings have an associated "level": 1, 2, or 3. The higher the level, +the more serious the warning. The value of the corresponding global +variable (C<$::RD_WARN>) determines the I<minimal> level of warning to +be displayed. Hence, to see I<all> warnings, set C<$::RD_WARN> to 1. +To see only the most serious warnings set C<$::RD_WARN> to 3. +By default C<$::RD_WARN> is initialized to 3, ensuring that serious but +non-fatal errors are automatically reported. + +See F<"DIAGNOSTICS"> for a list of the various error and warning messages +that Parse::RecDescent generates when these two variables are defined. + +Defining any of the remaining variables (which are not defined by +default) further increases the amount of information reported. +Defining C<$::RD_HINT> causes the parser generator to offer +more detailed analyses and hints on both errors and warnings. +Note that setting C<$::RD_HINT> at any point automagically +sets C<$::RD_WARN> to 1. + +Defining C<$::RD_TRACE> causes the parser generator and the parser to +report their progress to STDERR in excruciating detail (although, without hints +unless $::RD_HINT is separately defined). This detail +can be moderated in only one respect: if C<$::RD_TRACE> has an +integer value (I<N>) greater than 1, only the I<N> characters of +the "current parsing context" (that is, where in the input string we +are at any point in the parse) are reported at any time. + +C<$::RD_TRACE> is mainly useful for debugging a grammar that isn't +behaving as you expected it to. To this end, if C<$::RD_TRACE> is +defined when a parser is built, any actual parser code which is +generated is also written to a file named "RD_TRACE" in the local +directory. 
+ +Note that the four variables belong to the "main" package, which +makes them easier to refer to in the code controlling the parser, and +also makes it easy to turn them into command line flags ("-RD_ERRORS", +"-RD_WARN", "-RD_HINT", "-RD_TRACE") under B. + +=item Specifying local variables + +It is occasionally convenient to specify variables which are local +to a single rule. This may be achieved by including a +Crulevar:...E> directive anywhere in the rule. For example: + + markup: + + markup: tag {($tag=$item[1]) =~ s/^<|>$//g} body[$tag] + +The example Crulevar: $tagE> directive causes a "my" variable named +C<$tag> to be declared at the start of the subroutine implementing the +C rule (that is, I the first production, regardless of +where in the rule it is specified). + +Specifically, any directive of the form: +Crulevar:IE> causes a line of the form C;> +to be added at the beginning of the rule subroutine, immediately after +the definitions of the following local variables: + + $thisparser $commit + $thisrule @item + $thisline @arg + $text %arg + +This means that the following CrulevarE> directives work +as expected: + + + + + + + + + + + +If a variable that is also visible to subrules is required, it needs +to be C'd, not C'd. C defaults to C, but if C +is explicitly specified: + + + +then a C-ized variable is declared instead, and will be available +within subrules. + +Note however that, because all such variables are "my" variables, their +values I between match attempts on a given rule. To +preserve values between match attempts, values can be stored within the +"local" member of the C<$thisrule> object: + + countedrule: { $thisrule->{"local"}{"count"}++ } + + | subrule1 + | subrule2 + | {"local"}{"count"} == 1> + subrule3 + + +When matching a rule, each CrulevarE> directive is matched as +if it were an unconditional CrejectE> directive (that is, it +causes any production in which it appears to immediately fail to match). 
+For this reason (and to improve readability) it is usual to specify any +CrulevarE> directive in a separate production at the start of +the rule (this has the added advantage that it enables +C to optimize away such productions, just as it does +for the CrejectE> directive). + + +=item Dynamically matched rules + +Because regexes and double-quoted strings are interpolated, it is relatively +easy to specify productions with "context sensitive" tokens. For example: + + command: keyword body "end $item[1]" + +which ensures that a command block is bounded by a +"IkeywordE>...end Isame keywordE>" pair. + +Building productions in which subrules are context sensitive is also possible, +via the Cmatchrule:...E> directive. This directive behaves +identically to a subrule item, except that the rule which is invoked to match +it is determined by the string specified after the colon. For example, we could +rewrite the C rule like this: + + command: keyword "end $item[1]" + +Whatever appears after the colon in the directive is treated as an interpolated +string (that is, as if it appeared in C operator) and the value of +that interpolated string is the name of the subrule to be matched. + +Of course, just putting a constant string like C in a +Cmatchrule:...E> directive is of little interest or benefit. +The power of directive is seen when we use a string that interpolates +to something interesting. For example: + + command: keyword "end $item[1]" + + keyword: 'while' | 'if' | 'function' + + while_body: condition block + + if_body: condition block ('else' block)(?) + + function_body: arglist block + +Now the C rule selects how to proceed on the basis of the keyword +that is found. It is as if C were declared: + + command: 'while' while_body "end while" + | 'if' if_body "end if" + | 'function' function_body "end function" + + +When a Cmatchrule:...E> directive is used as a repeated +subrule, the rule name expression is "late-bound". 
That is, the name of +the rule to be called is re-evaluated I a match attempt is +made. Hence, the following grammar: + + { $::species = 'dogs' } + + pair: 'two' (s) + + dogs: /dogs/ { $::species = 'cats' } + + cats: /cats/ + +will match the string "two dogs cats cats" completely, whereas it will +only match the string "two dogs dogs dogs" up to the eighth letter. If +the rule name were "early bound" (that is, evaluated only the first +time the directive is encountered in a production), the reverse +behaviour would be expected. + +Note that the C directive takes a string that is to be treated +as a rule name, I as a rule invocation. That is, +it's like a Perl symbolic reference, not an C. Just as you can say: + + $subname = 'foo'; + + # and later... + + &{$foo}(@args); + +but not: + + $subname = 'foo(@args)'; + + # and later... + + &{$foo}; + +likewise you can say: + + $rulename = 'foo'; + + # and in the grammar... + + [@args] + +but not: + + $rulename = 'foo[@args]'; + + # and in the grammar... + + + + +=item Deferred actions + +The Cdefer:...E> directive is used to specify an action to be +performed when (and only if!) the current production ultimately succeeds. + +Whenever a Cdefer:...E> directive appears, the code it specifies +is converted to a closure (an anonymous subroutine reference) which is +queued within the active parser object. Note that, +because the deferred code is converted to a closure, the values of any +"local" variable (such as C<$text>, <@item>, etc.) are preserved +until the deferred code is actually executed. + +If the parse ultimately succeeds +I the production in which the Cdefer:...E> directive was +evaluated formed part of the successful parse, then the deferred code is +executed immediately before the parse returns. If however the production +which queued a deferred action fails, or one of the higher-level +rules which called that production fails, then the deferred action is +removed from the queue, and hence is never executed. 
+ +For example, given the grammar: + + sentence: noun trans noun + | noun intrans + + noun: 'the dog' + { print "$item[1]\t(noun)\n" } + | 'the meat' + { print "$item[1]\t(noun)\n" } + + trans: 'ate' + { print "$item[1]\t(transitive)\n" } + + intrans: 'ate' + { print "$item[1]\t(intransitive)\n" } + | 'barked' + { print "$item[1]\t(intransitive)\n" } + +then parsing the sentence C<"the dog ate"> would produce the output: + + the dog (noun) + ate (transitive) + the dog (noun) + ate (intransitive) + +This is because, even though the first production of C<sentence> +ultimately fails, its initial subrules C<noun> and C<trans> do match, +and hence they execute their associated actions. +Then the second production of C<sentence> succeeds, causing the +actions of the subrules C<noun> and C<intrans> to be executed as well. + +On the other hand, if the actions were replaced by C<E<lt>defer:...E<gt>> +directives: + + sentence: noun trans noun + | noun intrans + + noun: 'the dog' + <defer: print "$item[1]\t(noun)\n" > + | 'the meat' + <defer: print "$item[1]\t(noun)\n" > + + trans: 'ate' + <defer: print "$item[1]\t(transitive)\n" > + + intrans: 'ate' + <defer: print "$item[1]\t(intransitive)\n" > + | 'barked' + <defer: print "$item[1]\t(intransitive)\n" > + +the output would be: + + the dog (noun) + ate (intransitive) + +since deferred actions are only executed if they were evaluated in +a production which ultimately contributes to the successful parse. + +In this case, even though the first production of C<sentence> caused +the subrules C<noun> and C<trans> to match, that production ultimately +failed and so the deferred actions queued by those subrules were subsequently +discarded. The second production then succeeded, causing the entire +parse to succeed, and so the deferred actions queued by the (second) match of +the C<noun> subrule and the subsequent match of C<intrans> I<were> preserved and +eventually executed. + +Deferred actions provide a means of improving the performance of a parser, +by only executing those actions which are part of the final parse-tree +for the input data. 
+ +Alternatively, deferred actions can be viewed as a mechanism for building +(and executing) a +customized subroutine corresponding to the given input data, much in the +same way that autoactions (see L<"Autoactions">) can be used to build a +customized data structure for specific input. + +Whether or not the action it specifies is ever executed, +a Cdefer:...E> directive always succeeds, returning the +number of deferred actions currently queued at that point. + + +=item Parsing Perl + +Parse::RecDescent provides limited support for parsing subsets of Perl, +namely: quote-like operators, Perl variables, and complete code blocks. + +The Cperl_quotelikeE> directive can be used to parse any Perl +quote-like operator: C<'a string'>, C, C, +etc. It does this by calling Text::Balanced::quotelike(). + +If a quote-like operator is found, a reference to an array of eight elements +is returned. Those elements are identical to the last eight elements returned +by Text::Balanced::extract_quotelike() in an array context, namely: + +=over 4 + +=item [0] + +the name of the quotelike operator -- 'q', 'qq', 'm', 's', 'tr' -- if the +operator was named; otherwise C, + +=item [1] + +the left delimiter of the first block of the operation, + +=item [2] + +the text of the first block of the operation +(that is, the contents of +a quote, the regex of a match, or substitution or the target list of a +translation), + +=item [3] + +the right delimiter of the first block of the operation, + +=item [4] + +the left delimiter of the second block of the operation if there is one +(that is, if it is a C, C, or C); otherwise C, + +=item [5] + +the text of the second block of the operation if there is one +(that is, the replacement of a substitution or the translation list +of a translation); otherwise C, + +=item [6] + +the right delimiter of the second block of the operation (if any); +otherwise C, + +=item [7] + +the trailing modifiers on the operation (if any); otherwise C. 
+ +=back + +If a quote-like expression is not found, the directive fails with the usual +C value. + +The Cperl_variableE> directive can be used to parse any Perl +variable: $scalar, @array, %hash, $ref->{field}[$index], etc. +It does this by calling Text::Balanced::extract_variable(). + +If the directive matches text representing a valid Perl variable +specification, it returns that text. Otherwise it fails with the usual +C value. + +The Cperl_codeblockE> directive can be used to parse curly-brace-delimited block of Perl code, such as: { $a = 1; f() =~ m/pat/; }. +It does this by calling Text::Balanced::extract_codeblock(). + +If the directive matches text representing a valid Perl code block, +it returns that text. Otherwise it fails with the usual C value. + +You can also tell it what kind of brackets to use as the outermost +delimiters. For example: + + arglist: + +causes an arglist to match a perl code block whose outermost delimiters +are C<(...)> (rather than the default C<{...}>). + + +=item Constructing tokens + +Eventually, Parse::RecDescent will be able to parse tokenized input, as +well as ordinary strings. In preparation for this joyous day, the +Ctoken:...E> directive has been provided. +This directive creates a token which will be suitable for +input to a Parse::RecDescent parser (when it eventually supports +tokenized input). + +The text of the token is the value of the +immediately preceding item in the production. A +Ctoken:...E> directive always succeeds with a return +value which is the hash reference that is the new token. It also +sets the return value for the production to that hash ref. 
+ +The Ctoken:...E> directive makes it easy to build +a Parse::RecDescent-compatible lexer in Parse::RecDescent: + + my $lexer = new Parse::RecDescent q + { + lex: token(s) + + token: /a\b/ + | /the\b/ + | /fly\b/ + | /[a-z]+/i { lc $item[1] } + | + + }; + +which will eventually be able to be used with a regular Parse::RecDescent +grammar: + + my $parser = new Parse::RecDescent q + { + startrule: subrule1 subrule 2 + + # ETC... + }; + +either with a pre-lexing phase: + + $parser->startrule( $lexer->lex($data) ); + +or with a lex-on-demand approach: + + $parser->startrule( sub{$lexer->token(\$data)} ); + +But at present, only the Ctoken:...E> directive is +actually implemented. The rest is vapourware. + +=item Specifying operations + +One of the commonest requirements when building a parser is to specify +binary operators. Unfortunately, in a normal grammar, the rules for +such things are awkward: + + disjunction: conjunction ('or' conjunction)(s?) + { $return = [ $item[1], @{$item[2]} ] } + + conjunction: atom ('and' atom)(s?) + { $return = [ $item[1], @{$item[2]} ] } + +or inefficient: + + disjunction: conjunction 'or' disjunction + { $return = [ $item[1], @{$item[2]} ] } + | conjunction + { $return = [ $item[1] ] } + + conjunction: atom 'and' conjunction + { $return = [ $item[1], @{$item[2]} ] } + | atom + { $return = [ $item[1] ] } + +and either way is ugly and hard to get right. + +The Cleftop:...E> and Crightop:...E> directives provide an +easier way of specifying such operations. Using Cleftop:...E> the +above examples become: + + disjunction: + conjunction: + +The Cleftop:...E> directive specifies a left-associative binary operator. +It is specified around three other grammar elements +(typically subrules or terminals), which match the left operand, +the operator itself, and the right operand respectively. + +A Cleftop:...E> directive such as: + + disjunction: + +is converted to the following: + + disjunction: ( conjunction ('or' conjunction)(s?) 
+ { $return = [ $item[1], @{$item[2]} ] } ) + +In other words, a Cleftop:...E> directive matches the left operand followed by zero +or more repetitions of both the operator and the right operand. It then +flattens the matched items into an anonymous array which becomes the +(single) value of the entire Cleftop:...E> directive. + +For example, an Cleftop:...E> directive such as: + + output: + +when given a string such as: + + cout << var << "str" << 3 + +would match, and C<$item[1]> would be set to: + + [ 'cout', 'var', '"str"', '3' ] + +In other words: + + output: + +is equivalent to a left-associative operator: + + output: ident { $return = [$item[1]] } + | ident '<<' expr { $return = [@item[1,3]] } + | ident '<<' expr '<<' expr { $return = [@item[1,3,5]] } + | ident '<<' expr '<<' expr '<<' expr { $return = [@item[1,3,5,7]] } + # ...etc... + + +Similarly, the Crightop:...E> directive takes a left operand, an operator, and a right operand: + + assign: leftop:...E> and Crightop:...E> directives, the directive does not normally +return the operator itself, just a list of the operands involved. This is +particularly handy for specifying lists: + + list: '(' ')' + { $return = $item[2] } + +There is, however, a problem: sometimes the operator is itself significant. +For example, in a Perl list a comma and a C<=E> are both +valid separators, but the C<=E> has additional stringification semantics. +Hence it's important to know which was used in each case. + +To solve this problem the +Cleftop:...E> and Crightop:...E> directives +I return the operator(s) as well, under two circumstances. +The first case is where the operator is specified as a subrule. In that instance, +whatever the operator matches is returned (on the assumption that if the operator +is important enough to have its own subrule, then it's important enough to return). + +The second case is where the operator is specified as a regular +expression. 
In that case, if the first bracketed subpattern of the +regular expression matches, that matching value is returned (this is analogous to +the behaviour of the Perl C function, except that only the first subpattern +is returned). + +In other words, given the input: + + ( a=>1, b=>2 ) + +the specifications: + + list: '(' ')' + + separator: ',' | '=>' + +or: + + list: '(' )/ list_item> ')' + +cause the list separators to be interleaved with the operands in the +anonymous array in C<$item[2]>: + + [ 'a', '=>', '1', ',', 'b', '=>', '2' ] + + +But the following version: + + list: '(' / list_item> ')' + +returns only the operators: + + [ 'a', '1', 'b', '2' ] + +Of course, none of the above specifications handle the case of an empty +list, since the Cleftop:...E> and Crightop:...E> directives +require at least a single right or left operand to match. To specify +that the operator can match "trivially", +it's necessary to add a C<(?)> qualifier to the directive: + + list: '(' )/ list_item>(?) ')' + +Note that in almost all the above examples, the first and third arguments +of the C<> directive were the same subrule. That is because +C<>'s are frequently used to specify "separated" lists of the +same type of item. To make such lists easier to specify, the following +syntax: + + list: element(s /,/) + +is exactly equivalent to: + + list: + +Note that the separator must be specified as a raw pattern (i.e. +not a string or subrule). + + +=item Scored productions + +By default, Parse::RecDescent grammar rules always accept the first +production that matches the input. But if two or more productions may +potentially match the same input, choosing the first that does so may +not be optimal. 
+ +For example, if you were parsing the sentence "time flies like an arrow", +you might use a rule like this: + + sentence: verb noun preposition article noun { [@item] } + | adjective noun verb article noun { [@item] } + | noun verb preposition article noun { [@item] } + +Each of these productions matches the sentence, but the third one +is the most likely interpretation. However, if the sentence had been +"fruit flies like a banana", then the second production is probably +the right match. + +To cater for such situations, the C<E<lt>score:...E<gt>> directive can be used. +The directive is equivalent to an unconditional C<E<lt>rejectE<gt>>, +except that it allows you to specify a "score" for the current +production. If that score is numerically greater than the best +score of any preceding production, the current production is cached for later +consideration. If no later production matches, then the cached +production is treated as having matched, and the value of the +item immediately before its C<E<lt>score:...E<gt>> directive is returned as the +result. + +In other words, by putting a C<E<lt>score:...E<gt>> directive at the end of +each production, you can select which production matches using +criteria other than specification order. For example: + + sentence: verb noun preposition article noun { [@item] } + <score: sentence_val(@item)> + | adjective noun verb article noun { [@item] } + <score: sentence_val(@item)> + | noun verb preposition article noun { [@item] } + <score: sentence_val(@item)> + +Now, when each production reaches its respective C<E<lt>score:...E<gt>> +directive, the subroutine C<sentence_val> will be called to evaluate the +matched items (somehow). Once all productions have been tried, the +one which C<sentence_val> scored most highly will be the one that is +accepted as a match for the rule. + +The variable $score always holds the current best score of any production, +and the variable $score_return holds the corresponding return value. + +As another example, the following grammar matches lines that may be +separated by commas, colons, or semi-colons. 
This can be tricky if +a colon-separated line also contains commas, or vice versa. The grammar +resolves the ambiguity by selecting the rule that results in the +fewest fields: + + line: seplist[sep=>','] + | seplist[sep=>':'] + | seplist[sep=>" "] + + seplist: + +Note the use of negation within the Cscore:...E> directive +to ensure that the seplist with the most items gets the lowest score. + +As the above examples indicate, it is often the case that all productions +in a rule use exactly the same Cscore:...E> directive. It is +tedious to have to repeat this identical directive in every production, so +Parse::RecDescent also provides the Cautoscore:...E> directive. + +If an Cautoscore:...E> directive appears in any +production of a rule, the code it specifies is used as the scoring +code for every production of that rule, except productions that already +end with an explicit Cscore:...E> directive. Thus the rules above could +be rewritten: + + line: + line: seplist[sep=>','] + | seplist[sep=>':'] + | seplist[sep=>" "] + + + sentence: + | verb noun preposition article noun { [@item] } + | adjective noun verb article noun { [@item] } + | noun verb preposition article noun { [@item] } + +Note that the Cautoscore:...E> directive itself acts as an +unconditional CrejectE>, and (like the Crulevar:...E> +directive) is pruned at compile-time wherever possible. + + +=item Dispensing with grammar checks + +During the compilation phase of parser construction, Parse::RecDescent performs +a small number of checks on the grammar it's given. Specifically it checks that +the grammar is not left-recursive, that there are no "insatiable" constructs of +the form: + + rule: subrule(s) subrule + +and that there are no rules missing (i.e. referred to, but never defined). + +These checks are important during development, but can slow down parser +construction in stable code. So Parse::RecDescent provides the +EnocheckE directive to turn them off. 
The directive can only appear +before the first rule definition, and switches off checking throughout the rest +of the current grammar. + +Typically, this directive would be added when a parser has been thoroughly +tested and is ready for release. + +=back + + +=head2 Subrule argument lists + +It is occasionally useful to pass data to a subrule which is being invoked. For +example, consider the following grammar fragment: + + classdecl: keyword decl + + keyword: 'struct' | 'class'; + + decl: # WHATEVER + +The C rule might wish to know which of the two keywords was used +(since it may affect some aspect of the way the subsequent declaration +is interpreted). C allows the grammar designer to +pass data into a rule, by placing that data in an I +(that is, in square brackets) immediately after any subrule item in a +production. Hence, we could pass the keyword to C as follows: + + classdecl: keyword decl[ $item[1] ] + + keyword: 'struct' | 'class'; + + decl: # WHATEVER + +The argument list can consist of any number (including zero!) of comma-separated +Perl expressions. In other words, it looks exactly like a Perl anonymous +array reference. For example, we could pass the keyword, the name of the +surrounding rule, and the literal 'keyword' to C like so: + + classdecl: keyword decl[$item[1],$item[0],'keyword'] + + keyword: 'struct' | 'class'; + + decl: # WHATEVER + +Within the rule to which the data is passed (C in the above examples) +that data is available as the elements of a local variable C<@arg>. Hence +C might report its intentions as follows: + + classdecl: keyword decl[$item[1],$item[0],'keyword'] + + keyword: 'struct' | 'class'; + + decl: { print "Declaring $arg[0] (a $arg[2])\n"; + print "(this rule called by $arg[1])" } + +Subrule argument lists can also be interpreted as hashes, simply by using +the local variable C<%arg> instead of C<@arg>. 
Hence we could rewrite the +previous example: + + classdecl: keyword decl[keyword => $item[1], + caller => $item[0], + type => 'keyword'] + + keyword: 'struct' | 'class'; + + decl: { print "Declaring $arg{keyword} (a $arg{type})\n"; + print "(this rule called by $arg{caller})" } + +Both C<@arg> and C<%arg> are always available, so the grammar designer may +choose whichever convention (or combination of conventions) suits best. + +Subrule argument lists are also useful for creating "rule templates" +(especially when used in conjunction with the Cmatchrule:...E> +directive). For example, the subrule: + + list: /$arg{sep}/ list[%arg] + { $return = [ $item[1], @{$item[3]} ] } + | + { $return = [ $item[1]] } + +is a handy template for the common problem of matching a separated list. +For example: + + function: 'func' name '(' list[rule=>'param',sep=>';'] ')' + + param: list[rule=>'name',sep=>','] ':' typename + + name: /\w+/ + + typename: name + + +When a subrule argument list is used with a repeated subrule, the argument list +goes I the repetition specifier: + + list: /some|many/ thing[ $item[1] ](s) + +The argument list is "late bound". That is, it is re-evaluated for every +repetition of the repeated subrule. +This means that each repeated attempt to match the subrule may be +passed a completely different set of arguments if the value of the +expression in the argument list changes between attempts. So, for +example, the grammar: + + { $::species = 'dogs' } + + pair: 'two' animal[$::species](s) + + animal: /$arg[0]/ { $::species = 'cats' } + +will match the string "two dogs cats cats" completely, whereas +it will only match the string "two dogs dogs dogs" up to the +eighth letter. If the value of the argument list were "early bound" +(that is, evaluated only the first time a repeated subrule match is +attempted), one would expect the matching behaviours to be reversed. 
+ +Of course, it is possible to effectively "early bind" such argument lists +by passing them a value which does not change on each repetition. For example: + + { $::species = 'dogs' } + + pair: 'two' { $::species } animal[$item[2]](s) + + animal: /$arg[0]/ { $::species = 'cats' } + + +Arguments can also be passed to the start rule, simply by appending them +to the argument list with which the start rule is called (I the +"line number" parameter). For example, given: + + $parser = new Parse::RecDescent ( $grammar ); + + $parser->data($text, 1, "str", 2, \@arr); + + # ^^^^^ ^ ^^^^^^^^^^^^^^^ + # | | | + # TEXT TO BE PARSED | | + # STARTING LINE NUMBER | + # ELEMENTS OF @arg WHICH IS PASSED TO RULE data + +then within the productions of the rule C, the array C<@arg> will contain +C<("str", 2, \@arr)>. + + +=head2 Alternations + +Alternations are implicit (unnamed) rules defined as part of a production. An +alternation is defined as a series of '|'-separated productions inside a +pair of round brackets. For example: + + character: 'the' ( good | bad | ugly ) /dude/ + +Every alternation implicitly defines a new subrule, whose +automatically-generated name indicates its origin: +"_alternation__of_production_

_of_rule" for the appropriate +values of ,

, and . A call to this implicit subrule is then +inserted in place of the brackets. Hence the above example is merely a +convenient short-hand for: + + character: 'the' + _alternation_1_of_production_1_of_rule_character + /dude/ + + _alternation_1_of_production_1_of_rule_character: + good | bad | ugly + +Since alternations are parsed by recursively calling the parser generator, +any type(s) of item can appear in an alternation. For example: + + character: 'the' ( 'high' "plains" # Silent, with poncho + | /no[- ]name/ # Silent, no poncho + | vengeance_seeking # Poncho-optional + | + ) drifter + +In this case, if an error occurred, the automatically generated +message would be: + + ERROR (line ): Invalid implicit subrule: Expected + 'high' or /no[- ]name/ or generic, + but found "pacifist" instead + +Since every alternation actually has a name, it's even possible +to extend or replace them: + + parser->Replace( + "_alternation_1_of_production_1_of_rule_character: + 'generic Eastwood'" + ); + +More importantly, since alternations are a form of subrule, they can be given +repetition specifiers: + + character: 'the' ( good | bad | ugly )(?) /dude/ + + +=head2 Incremental Parsing + +C provides two methods - C and C - which +can be used to alter the grammar matched by a parser. Both methods +take the same argument as C, namely a +grammar specification string + +C interprets the grammar specification and adds any +productions it finds to the end of the rules for which they are specified. For +example: + + $add = "name: 'Jimmy-Bob' | 'Bobby-Jim'\ndesc: colour /necks?/"; + parser->Extend($add); + +adds two productions to the rule "name" (creating it if necessary) and one +production to the rule "desc". + +C is identical, except that it first resets are +rule specified in the additional grammar, removing any existing productions. 
+
+Hence after:
+
+    $add = "name: 'Jimmy-Bob' | 'Bobby-Jim'\ndesc: colour /necks?/";
+    parser->Replace($add);
+
+there are now only two valid "name"s and the one possible description.
+
+A more interesting use of the C and C methods is to call them
+inside the action of an executing parser. For example:
+
+    typedef: 'typedef' type_name identifier ';'
+            { $thisparser->Extend("type_name: '$item[3]'") }
+          |
+
+    identifier: ...!type_name /[A-Za-z_]w*/
+
+which automatically prevents type names from being typedef'd, or:
+
+    command: 'map' key_name 'to' abort_key
+            { $thisparser->Replace("abort_key: '$item[2]'") }
+          | 'map' key_name 'to' key_name
+            { map_key($item[2],$item[4]) }
+          | abort_key
+            { exit if confirm("abort?") }
+
+    abort_key: 'q'
+
+    key_name: ...!abort_key /[A-Za-z]/
+
+which allows the user to change the abort key binding, but not to unbind it.
+
+The careful use of such constructs makes it possible to reconfigure
+a running parser, eliminating the need for semantic feedback by
+providing syntactic feedback instead. However, as currently implemented,
+C and C have to regenerate and re-C the
+entire parser whenever they are called. This makes them quite slow for
+large grammars.
+
+In such cases, the judicious use of an interpolated regex is likely to
+be far more efficient:
+
+    typedef: 'typedef' type_name/ identifier ';'
+            { $thisparser->{local}{type_name} .= "|$item[3]" }
+          |
+
+    identifier: ...!type_name /[A-Za-z_]w*/
+
+    type_name: /$thisparser->{local}{type_name}/
+
+
+=head2 Precompiling parsers
+
+Normally Parse::RecDescent builds a parser from a grammar at run-time.
+That approach simplifies the design and implementation of parsing code,
+but has the disadvantage that it slows the parsing process down - you
+have to wait for Parse::RecDescent to build the parser every time the
+program runs. Long or complex grammars can be particularly slow to
+build, leading to unacceptable delays at start-up.
+ +To overcome this, the module provides a way of "pre-building" a parser +object and saving it in a separate module. That module can then be used +to create clones of the original parser. + +A grammar may be precompiled using the C class method. +For example, to precompile a grammar stored in the scalar $grammar, +and produce a class named PreGrammar in a module file named PreGrammar.pm, +you could use: + + use Parse::RecDescent; + + Parse::RecDescent->Precompile($grammar, "PreGrammar"); + +The first argument is the grammar string, the second is the name of the class +to be built. The name of the module file is generated automatically by +appending ".pm" to the last element of the class name. Thus + + Parse::RecDescent->Precompile($grammar, "My::New::Parser"); + +would produce a module file named Parser.pm. + +It is somewhat tedious to have to write a small Perl program just to +generate a precompiled grammar class, so Parse::RecDescent has some special +magic that allows you to do the job directly from the command-line. + +If your grammar is specified in a file named F, you can generate +a class named Yet::Another::Grammar like so: + + > perl -MParse::RecDescent - grammar Yet::Another::Grammar + +This would produce a file named F containing the full +definition of a class called Yet::Another::Grammar. Of course, to use +that class, you would need to put the F file in a +directory named F, somewhere in your Perl include path. + +Having created the new class, it's very easy to use it to build +a parser. You simply C the new module, and then call its +C method to create a parser object. For example: + + use Yet::Another::Grammar; + my $parser = Yet::Another::Grammar->new(); + +The effect of these two lines is exactly the same as: + + use Parse::RecDescent; + + open GRAMMAR_FILE, "grammar" or die; + local $/; + my $grammar = ; + + my $parser = Parse::RecDescent->new($grammar); + +only considerably faster. 
+ +Note however that the parsers produced by either approach are exactly +the same, so whilst precompilation has an effect on I speed, +it has no effect on I speed. RecDescent 2.0 will address that +problem. + + +=head2 A Metagrammar for C + +The following is a specification of grammar format accepted by +C (specified in the C grammar format!): + + grammar : components(s) + + component : rule | comment + + rule : "\n" identifier ":" production(s?) + + production : items(s) + + item : lookahead(?) simpleitem + | directive + | comment + + lookahead : '...' | '...!' # +'ve or -'ve lookahead + + simpleitem : subrule args(?) # match another rule + | repetition # match repeated subrules + | terminal # match the next input + | bracket args(?) # match alternative items + | action # do something + + subrule : identifier # the name of the rule + + args : {extract_codeblock($text,'[]')} # just like a [...] array ref + + repetition : subrule args(?) howoften + + howoften : '(?)' # 0 or 1 times + | '(s?)' # 0 or more times + | '(s)' # 1 or more times + | /(\d+)[.][.](/\d+)/ # $1 to $2 times + | /[.][.](/\d*)/ # at most $1 times + | /(\d*)[.][.])/ # at least $1 times + + terminal : /[/]([\][/]|[^/])*[/]/ # interpolated pattern + | /"([\]"|[^"])*"/ # interpolated literal + | /'([\]'|[^'])*'/ # uninterpolated literal + + action : { extract_codeblock($text) } # embedded Perl code + + bracket : '(' Item(s) production(s?) 
')' # alternative subrules + + directive : '' # commit to production + | '' # cancel commitment + | '' # skip to newline + | '' # skip + | '' # fail this production + | '' # fail if + | '' # report an error + | '' # report error as "" + | '' # error only if committed + | '' # " " " " + | ']+/ '>' # define rule-local variable + | '' # invoke rule named in string + + identifier : /[a-z]\w*/i # must start with alpha + + comment : /#[^\n]*/ # same as Perl + + pattern : {extract_bracketed($text,'<')} # allow embedded "<..>" + + condition : {extract_codeblock($text,'{<')} # full Perl expression + + string : {extract_variable($text)} # any Perl variable + | {extract_quotelike($text)} # or quotelike string + | {extract_bracketed($text,'<')} # or balanced brackets + + +=head1 GOTCHAS + +This section describes common mistakes that grammar writers seem to +make on a regular basis. + +=head2 1. Expecting an error to always invalidate a parse + +A common mistake when using error messages is to write the grammar like this: + + file: line(s) + + line: line_type_1 + | line_type_2 + | line_type_3 + | + +The expectation seems to be that any line that is not of type 1, 2 or 3 will +invoke the CerrorE> directive and thereby cause the parse to fail. + +Unfortunately, that only happens if the error occurs in the very first line. +The first rule states that a C is matched by one or more lines, so if +even a single line succeeds, the first rule is completely satisfied and the +parse as a whole succeeds. That means that any error messages generated by +subsequent failures in the C rule are quietly ignored. + +Typically what's really needed is this: + + file: line(s) eofile { $return = $item[1] } + + line: line_type_1 + | line_type_2 + | line_type_3 + | + + eofile: /^\Z/ + +The addition of the C subrule to the first production means that +a file only matches a series of successful C matches I. 
If any input text remains after the lines are matched, +there must have been an error in the last C. In that case the C +rule will fail, causing the entire C rule to fail too. + +Note too that C must match C (end-of-text), I +C or C (end-of-file). + +And don't forget the action at the end of the production. If you just +write: + + file: line(s) eofile + +then the value returned by the C rule will be the value of its +last item: C. Since C always returns an empty string +on success, that will cause the C rule to return that empty +string. Apart from returning the wrong value, returning an empty string +will trip up code such as: + + $parser->file($filetext) || die; + +(since "" is false). + +Remember that Parse::RecDescent returns undef on failure, +so the only safe test for failure is: + + defined($parser->file($filetext)) || die; + + +=head1 DIAGNOSTICS + +Diagnostics are intended to be self-explanatory (particularly if you +use B<-RD_HINT> (under B) or define C<$::RD_HINT> inside the program). + +C currently diagnoses the following: + +=over 4 + +=item * + +Invalid regular expressions used as pattern terminals (fatal error). + +=item * + +Invalid Perl code in code blocks (fatal error). + +=item * + +Lookahead used in the wrong place or in a nonsensical way (fatal error). + +=item * + +"Obvious" cases of left-recursion (fatal error). + +=item * + +Missing or extra components in a CleftopE> or CrightopE> +directive. + +=item * + +Unrecognisable components in the grammar specification (fatal error). + +=item * + +"Orphaned" rule components specified before the first rule (fatal error) +or after an CerrorE> directive (level 3 warning). + +=item * + +Missing rule definitions (this only generates a level 3 warning, since you +may be providing them later via C). + +=item * + +Instances where greedy repetition behaviour will almost certainly +cause the failure of a production (a level 3 warning - see +L<"ON-GOING ISSUES AND FUTURE DIRECTIONS"> below). 
+ +=item * + +Attempts to define rules named 'Replace' or 'Extend', which cannot be +called directly through the parser object because of the predefined +meaning of C and +C. (Only a level 2 warning is generated, since +such rules I still be used as subrules). + +=item * + +Productions which consist of a single Cerror?E> +directive, and which therefore may succeed unexpectedly +(a level 2 warning, since this might conceivably be the desired effect). + +=item * + +Multiple consecutive lookahead specifiers (a level 1 warning only, since their +effects simply accumulate). + +=item * + +Productions which start with a CrejectE> or Crulevar:...E> +directive. Such productions are optimized away (a level 1 warning). + +=item * + +Rules which are autogenerated under C<$::AUTOSTUB> (a level 1 warning). + +=back + +=head1 AUTHOR + +Damian Conway (damian@conway.org) + +=head1 BUGS AND IRRITATIONS + +There are undoubtedly serious bugs lurking somewhere in this much code :-) +Bug reports and other feedback are most welcome. + +Ongoing annoyances include: + +=over 4 + +=item * + +There's no support for parsing directly from an input stream. +If and when the Perl Gods give us regular expressions on streams, +this should be trivial (ahem!) to implement. + +=item * + +The parser generator can get confused if actions aren't properly +closed or if they contain particularly nasty Perl syntax errors +(especially unmatched curly brackets). + +=item * + +The generator only detects the most obvious form of left recursion +(potential recursion on the first subrule in a rule). More subtle +forms of left recursion (for example, through the second item in a +rule after a "zero" match of a preceding "zero-or-more" repetition, +or after a match of a subrule with an empty production) are not found. + +=item * + +Instead of complaining about left-recursion, the generator should +silently transform the grammar to remove it. 
Don't expect this +feature any time soon as it would require a more sophisticated +approach to parser generation than is currently used. + +=item * + +The generated parsers don't always run as fast as might be wished. + +=item * + +The meta-parser should be bootstrapped using C :-) + +=back + +=head1 ON-GOING ISSUES AND FUTURE DIRECTIONS + +=over 4 + +=item 1. + +Repetitions are "incorrigibly greedy" in that they will eat everything they can +and won't backtrack if that behaviour causes a production to fail needlessly. +So, for example: + + rule: subrule(s) subrule + +will I succeed, because the repetition will eat all the +subrules it finds, leaving none to match the second item. Such +constructions are relatively rare (and C generates a +warning whenever they occur) so this may not be a problem, especially +since the insatiable behaviour can be overcome "manually" by writing: + + rule: penultimate_subrule(s) subrule + + penultimate_subrule: subrule ...subrule + +The issue is that this construction is exactly twice as expensive as the +original, whereas backtracking would add only 1/I to the cost (for +matching I repetitions of C). I would welcome feedback on +the need for backtracking; particularly on cases where the lack of it +makes parsing performance problematical. + +=item 2. + +Having opened that can of worms, it's also necessary to consider whether there +is a need for non-greedy repetition specifiers. Again, it's possible (at some +cost) to manually provide the required functionality: + + rule: nongreedy_subrule(s) othersubrule + + nongreedy_subrule: subrule ...!othersubrule + +Overall, the issue is whether the benefit of this extra functionality +outweighs the drawbacks of further complicating the (currently +minimalist) grammar specification syntax, and (worse) introducing more overhead +into the generated parsers. + +=item 3. + +An CautocommitE> directive would be nice. 
That is, it would be useful to be +able to say: + + command: + command: 'find' name + | 'find' address + | 'do' command 'at' time 'if' condition + | 'do' command 'at' time + | 'do' command + | unusual_command + +and have the generator work out that this should be "pruned" thus: + + command: 'find' name + | 'find' address + | 'do' command + 'at' time + 'if' condition + | 'do' command + 'at' time + | 'do' command + | unusual_command + +There are several issues here. Firstly, should the +CautocommitE> automatically install an CuncommitE> +at the start of the last production (on the grounds that the "command" +rule doesn't know whether an "unusual_command" might start with "find" +or "do") or should the "unusual_command" subgraph be analysed (to see +if it I be viable after a "find" or "do")? + +The second issue is how regular expressions should be treated. The simplest +approach would be simply to uncommit before them (on the grounds that they +I match). Better efficiency would be obtained by analyzing all preceding +literal tokens to determine whether the pattern would match them. + +Overall, the issues are: can such automated "pruning" approach a hand-tuned +version sufficiently closely to warrant the extra set-up expense, and (more +importantly) is the problem important enough to even warrant the non-trivial +effort of building an automated solution? + +=back + +=head1 COPYRIGHT + +Copyright (c) 1997-2000, Damian Conway. All Rights Reserved. +This module is free software. It may be used, redistributed +and/or modified under the terms of the Perl Artistic License + (see http://www.perl.com/perl/misc/Artistic.html) diff --git a/lib/WebGUI/Wobject/IndexedSearch.pm b/lib/WebGUI/Wobject/IndexedSearch.pm new file mode 100644 index 000000000..8734a8569 --- /dev/null +++ b/lib/WebGUI/Wobject/IndexedSearch.pm @@ -0,0 +1,539 @@ +package WebGUI::Wobject::IndexedSearch; +$VERSION = "1.4"; + +#Test to see if Time::HiRes will load. 
+my $hasTimeHiRes=1; +eval "use Time::HiRes"; $hasTimeHiRes=0 if $@; + +use strict; +use WebGUI::Wobject::IndexedSearch::Search; +use WebGUI::HTMLForm; +use WebGUI::HTML; +use WebGUI::Macro; +use WebGUI::International; +use WebGUI::Session; +use WebGUI::SQL; +use WebGUI::Wobject; +use Tie::IxHash; +use WebGUI::Utility; +use WebGUI::Paginator; + +our @ISA = qw(WebGUI::Wobject); + +#------------------------------------------------------------------- +sub name { + return WebGUI::International::get(17,$_[0]->get("namespace")); +} + +#------------------------------------------------------------------- +sub new { + my $class = shift; + my $property = shift; + my $self = WebGUI::Wobject->new( + -useTemplate=>1, + -properties=>$property, + -extendedProperties=>{ + indexName=>{ + defaultValue=>'default' + }, + searchRoot=>{ + fieldType=>'selectList', + defaultValue=>'any' + }, + users=>{ + fieldType=>'selectList', + defaultValue=>'any' + }, + namespaces=>{ + fieldType=>'selectList', + defaultValue=>'any' + }, + languages=>{ + fieldType=>'selectList', + defaultValue=>'any' + }, + contentTypes=>{ + fieldType=>'selectList', + defaultValue=>'any' + }, + paginateAfter=>{ + defaultValue=>10 + }, + highlight=>{ + defaultValue=>1 + }, + previewLength=>{ + defaultValue=>130 + }, + highlight_1=>{ + defaultValue=>'#ffff66' + }, + highlight_2=>{ + defaultValue=>'#A0FFFF' + }, + highlight_3=>{ + defaultValue=>'#99ff99' + }, + highlight_4=>{ + defaultValue=>'#ff9999' + }, + highlight_5=>{ + defaultValue=>'#ff66ff' + }, + } + ); + bless $self, $class; +} + +#------------------------------------------------------------------- +sub uiLevel { + return 5; +} + +#------------------------------------------------------------------- +sub www_edit { + my $self = shift; + my (@data, %indexName); + + tie my %searchRoot, 'Tie::IxHash'; + + my $layout = WebGUI::HTMLForm->new; + my $properties = WebGUI::HTMLForm->new; + my $privileges = WebGUI::HTMLForm->new; + + # Unconditional read to catch 
installation errors.
+	my $sth = WebGUI::SQL->unconditionalRead("select distinct(indexName), indexName from IndexedSearch_docInfo");
+	unless ($sth->errorCode < 1) {
+		return "

" . WebGUI::International::get(1,$self->get("namespace")) . $sth->errorMessage."

"; + } + while (@data = $sth->array) { + $indexName{$data[0]} = $data[1]; + } + $sth->finish; + unless(%indexName) { + return "

" . WebGUI::International::get(2,$self->get("namespace")) . + "

" . WebGUI::International::get(3,$self->get("namespace")) . "

"; + } + + # Index to use + $properties->radioList( -name=>'indexName', + -options=>\%indexName, + -label=>WebGUI::International::get(5,$self->get("namespace")), + -value=>$self->getValue("indexName"), + -vertical=>1 + ); + + # Page roots + %searchRoot = ( 'any'=>WebGUI::International::get(15,$self->get("namespace")), + $session{page}{pageId}=>WebGUI::International::get(4,$self->get("namespace")), + WebGUI::SQL->buildHash("select pageId,title from page where parentId=0 and (pageId=1 or pageId>999) order by title") + ); + $properties->checkList ( -name=>'searchRoot', + -options=>\%searchRoot, + -label=>WebGUI::International::get(6,$self->get("namespace")), + -value=>[ split("\n", $self->getValue("searchRoot")) ], + -multiple=>1, + -vertical=>1, + ); + + # Content of specific user + $properties->selectList ( -name=>'users', + -options=>$self->_getUsers(), + -label=>WebGUI::International::get(7,$self->get("namespace")), + -value=>[ split("\n", $self->getValue("users")) ], + -multiple=>1, + -size=>5 + ); + + # Content in specific namespaces + $properties->selectList ( -name=>'namespaces', + -options=>$self->_getNamespaces, + -label=>WebGUI::International::get(8,$self->get("namespace")), + -value=>[ split("\n", $self->getValue("namespaces")) ], + -multiple=>1, + -size=>5 + ); + + # Content in specific language + $properties->checkList ( -name=>'languages', + -options=>$self->_getLanguages(), + -label=>WebGUI::International::get(9,$self->get("namespace")), + -value=>[ split("\n", $self->getValue("languages")) ], + -multiple=>1, + ); + + # Only specific content types + my $contentTypes = $self->_getContentTypes(); + delete $contentTypes->{content}; + $properties->checkList ( -name=>'contentTypes', + -options=>$contentTypes, + -label=>WebGUI::International::get(10,$self->get("namespace")), + -value=>[ split("\n", $self->getValue("contentTypes")) ], + -multiple=>1, + -vertical=>1, + ); + $layout->integer ( -name=>'paginateAfter', + 
-label=>WebGUI::International::get(11,$self->get("namespace")), + -value=>$self->getValue("paginateAfter"), + ); + $layout->integer ( -name=>'previewLength', + -label=>WebGUI::International::get(12,$self->get("namespace")), + -value=>$self->getValue("previewLength"), + ); + $layout->yesNo ( -name=>'highlight', + -label=>WebGUI::International::get(13,$self->get("namespace")), + -value=>$self->getValue("highlight"), + ); + + # Color picker for highlight colors + $layout->raw ( -value=>' + + ' + ); + for (1..5) { + my $highlight = "highlight_$_"; + $layout->text ( -name=>$highlight, + -label=>WebGUI::International::get(14,$self->get("namespace")) ." $_:", + -size=>7, + -value=>$self->getValue($highlight), + -subtext=>qq{ + Pick} + ); + } + + return $self->SUPER::www_edit( + -properties=>$properties->printRowsOnly, + -layout=>$layout->printRowsOnly, + -privileges=>$privileges->printRowsOnly, + -heading=>"Edit Search", + -helpId=>1 + ); + +} + +#------------------------------------------------------------------- +sub www_editSave { + # default editSave overruled to build & save the pageList for faster retrieval. + return WebGUI::Privilege::insufficient() unless (WebGUI::Privilege::canEditPage()); + my $self = shift; + $self->SUPER::www_editSave(); + my (%pages, $pageList); + my $searchRoot = $self->get("searchRoot"); + if ($searchRoot =~ /any/i) { + $pageList = 'any'; + } else { + foreach my $pageId (split(/\n+/,$searchRoot)) { + %pages = (%pages, _getSearchablePages($pageId), $pageId => defined); + } + $pageList = join(" , ", keys %pages); + } + WebGUI::SQL->write("update IndexedSearch set pageList = ".quote($pageList)." 
where wobjectId = ".$self->get("wobjectId")); + return ''; +} + +#------------------------------------------------------------------- +sub www_view { + my $self = shift; + my (%var, @resultsLoop); + + # Do some query handling + $var{exactPhrase} = $session{form}{exactPhrase}; + $var{allWords} = $session{form}{allWords}; + $var{atLeastOne} = $session{form}{atLeastOne}; + $var{without} = $session{form}{without}; + $var{query} = $session{form}{query}; + $var{query} .= qq{ +"$var{exactPhrase}"} if ($var{exactPhrase}); + $var{query} .= " ".join(" ",map("+".$_,split(/\s+/,$var{allWords}))) if ($var{allWords}); + $var{query} .= qq{ $var{atLeastOne}} if ($var{atLeastOne}); + $var{query} .= " ".join(" ",map("-".$_,split(/\s+/,$var{without}))) if ($var{without}); + + # Set some standard vars + $var{submit} = WebGUI::Form::submit({value=>WebGUI::International::get(16, $self->get("namespace"))}); + $var{"int.search"} = WebGUI::International::get(16,$self->get("namespace")); + $var{wid} = $self->get("wobjectId"); + $var{numberOfResults} = '0'; + $var{"select_".$self->getValue("paginateAfter")} = "selected"; + + # Do the search + my $startTime = ($hasTimeHiRes) ? Time::HiRes::time() : time(); + my $filter = $self->_buildFilter; + my $search = WebGUI::Wobject::IndexedSearch::Search->new($self->getValue('indexName')); + $search->open; + my $results = $search->search($var{query},$filter); + $var{duration} = (($hasTimeHiRes) ? 
Time::HiRes::time() : time()) - $startTime; + $var{duration} = sprintf("%.3f", $var{duration}) if $hasTimeHiRes; # Duration rounded to 3 decimal places + + # Let's see if the search returned any results + if (defined ($results)) { + $var{numberOfResults} = scalar(@$results); + + # Deal with pagination + my $url = "wid=".$self->get("wobjectId")."&func=view&query=".WebGUI::URL::escape($var{query}); + map {$url .= "&users=".WebGUI::URL::escape($_)} $session{cgi}->param('users'); + map {$url .= "&namespaces=".WebGUI::URL::escape($_)} $session{cgi}->param('namespaces'); + map {$url .= "&languages=".WebGUI::URL::escape($_)} $session{cgi}->param('languages'); + map {$url .= "&contentTypes=".WebGUI::URL::escape($_)} $session{cgi}->param('contentTypes'); + $url .= "&paginateAfter=".$self->getValue("paginateAfter"); + my $p = WebGUI::Paginator->new(WebGUI::URL::page($url), $results, $self->getValue("paginateAfter")); + $var{startNr} = 1; + if($session{form}{pn}) { + $var{startNr} = (($session{form}{pn} - 1) * $self->getValue("paginateAfter")) + 1; + } + + my @highlightColors = map { $self->getValue("highlight_$_") } (1..5); + $var{queryHighlighted} = $search->highlight($var{query}, undef, \@highlightColors); + + # Get result details for this page + if($p->getPageNumber > $p->getNumberOfPages) { + $var{numberOfResults} = 0; + $var{resultsLoop} = []; + } else { + $var{resultsLoop} = $search->getDetails($p->getPageData, + highlightColors => \@highlightColors, + previewLength => $self->getValue('previewLength'), + highlight => $self->getValue('highlight') + ); + # Pagination variables + $var{endNr} = $var{startNr}+(scalar(@{$var{resultsLoop}}))-1; + $p->appendTemplateVars(\%var); + } + } + + # Create a loop with namespaces + $var{namespaces} = []; + my $namespaces = $self->_getNamespaces('restricted'); + foreach(keys %$namespaces) { + my $selected = 0; + if (scalar $session{cgi}->param('namespaces')) { + $selected = isIn($_, $session{cgi}->param('namespaces')); + } else { + 
$selected = ($session{form}{namespaces} =~ /$_/); + } + push(@{$var{namespaces}}, { value => $_, name => $namespaces->{$_}, selected => $selected }); + } + + # Create a loop with contentTypes + # + # And while we are busy we also create a loop with simplified contentTypes + # This means: wobject, page, wobjectDetail are masked in one option: content + + $var{contentTypes} = []; + $var{contentTypesSimple} = []; + my $contentTypes = $self->_getContentTypes('restricted'); + foreach(keys %$contentTypes) { + my $selected = 0; + if (scalar $session{cgi}->param('contentTypes')) { + $selected = isIn($_, $session{cgi}->param('contentTypes')); + } else { + $selected = ($session{form}{contentTypes} =~ /$_/); + } + unless(/^content$/) { # No shortcut in the detailed contentType list + push(@{$var{contentTypes}}, { value => $_, + name => $contentTypes->{$_}, + selected => $selected, + 'type_'.$_ => 1 }); + } + unless(/^page|wobject|wobjectDetail$/) { # No details in the simple contentType list + push(@{$var{contentTypesSimple}}, { value => $_, + name => $contentTypes->{$_}, + selected => $selected, + 'type_'.$_ => 1 }); + } + } + + # Create a loop with users + $var{users} = []; + my $users = $self->_getUsers('restricted'); + foreach(keys %$users) { + my $selected = 0; + if (scalar $session{cgi}->param('users')) { + $selected = isIn($_, $session{cgi}->param('users')); + } else { + $selected = ($session{form}{users} =~ /$_/); + } + push(@{$var{users}}, { value => $_, name => $users->{$_}, selected => $selected }); + } + + # Create a loop with languages + $var{languages} = []; + my $languages = $self->_getLanguages('restricted'); + foreach(keys %$languages) { + my $selected = 0; + if (scalar $session{cgi}->param('languages')) { + $selected = isIn($_, $session{cgi}->param('languages')); + } else { + $selected = ($session{form}{languages} =~ /$_/); + } + push(@{$var{languages}}, { value => $_, name => $languages->{$_}, selected => $selected }); + } + + # close the search + 
$search->close; + + return $self->processTemplate($self->get("templateId"),\%var); +} + +#------------------------------------------------------------------- +sub _buildFilter { + my $self = shift; + my %filter = (); + + # pages + if($self->getValue('pageList') ne 'any') { + $filter{pageId} = [ split(/\n+/, $self->getValue('pageList')) ]; + } + + # languages + if($session{form}{languages} && ! isIn('any', $session{cgi}->param('languages'))) { + $filter{languageId} = [ map { quote($_) } $session{cgi}->param('languages') ]; + } elsif ($self->getValue('languages') !~ /any/i) { + $filter{languageId} = [ map { quote($_) } split(/\n/, $self->getValue('languages')) ]; + } + push(@{$filter{languageId}}, '0') if (exists $filter{languageId}); # Some content (i.e. profiles) + # don't have a language. They + # must be found as well. + + # content-types + if($session{form}{contentTypes} && ! isIn('any', $session{cgi}->param('contentTypes'))) { + $filter{contentType} = [ map { quote($_) } $session{cgi}->param('contentTypes') ]; + + # contentType "content" is a shortcut for "page", "wobject" and "wobjectDetail" + if (isIn('content', $session{cgi}->param('contentTypes'))) { + push(@{$filter{contentType}}, map { quote($_) } qw/page wobject wobjectDetail/); + } + } elsif ($self->getValue('contentTypes') !~ /any/i) { + $filter{contentType} = [ map { quote($_) } split(/\n/, $self->getValue('contentTypes')) ]; + } + + # users + if($session{form}{users} && ! isIn('any', $session{cgi}->param('users'))) { + $filter{ownerId} = []; + foreach my $user ($session{cgi}->param('users')) { + if ($user =~ /\D/) { + $user =~ s/\*/%/g; + ($user) = WebGUI::SQL->buildArray("select userId from users where username like ".quote($user)); + } + push(@{$filter{ownerId}}, quote($user)) if ($user =~ /^\d+$/); + } + } elsif ($self->getValue('users') !~ /any/i) { + $filter{ownerId} = [ map { quote($_) } split(/\n/, $self->getValue('users')) ]; + } + + # namespaces + if($session{form}{namespaces} && ! 
isIn('any', $session{cgi}->param('namespaces'))) { + $filter{namespace} = [ map { quote($_) } $session{cgi}->param('namespaces') ]; + } elsif ($self->getValue('namespaces') !~ /any/i) { + $filter{namespace} = [ map { quote($_) } split(/\n/, $self->getValue('namespaces')) ]; + } + + # delete $filter{ownerId} if it is an empty array reference + if(exists($filter{ownerId})) { + delete $filter{ownerId} unless (scalar(@{$filter{ownerId}})); + } + return \%filter; +} + +#------------------------------------------------------------------- +sub _getLanguages { + my ($self, $restricted) = @_; + my $international = WebGUI::SQL->buildHashRef("select distinct(IndexedSearch_docInfo.languageId), language.language from IndexedSearch_docInfo, language + where language.languageId = IndexedSearch_docInfo.languageId"); + tie my %languages, 'Tie::IxHash'; + if ($restricted and $self->get('languages') !~ /any/i) { + $languages{any} = WebGUI::International::get(24,$self->get("namespace")); + foreach (split/\n/, $self->get('languages')) { + $languages{$_} = $international->{$_}; + } + } else { + %languages = ('any' => WebGUI::International::get(24,$self->get("namespace")) , %$international); + } + return \%languages; +} + +#------------------------------------------------------------------- +sub _getNamespaces { + my ($self, $restricted) = @_; + my %international; + foreach my $wobject (@{$session{config}{wobjects}}){ + my $cmd = "WebGUI::Wobject::".$wobject; + my $w = $cmd->new({namespace=>$wobject, wobjectId=>'new'}); + $international{$wobject} = $w->name; + } + tie my %namespaces, 'Tie::IxHash'; + if ($restricted and $self->get('namespaces') !~ /any/i) { + $namespaces{any} = WebGUI::International::get(18,$self->get("namespace")); + foreach (split/\n/, $self->get('namespaces')) { + $namespaces{$_} = $international{$_} || ucfirst($_); + } + } else { + $namespaces{any} = WebGUI::International::get(18,$self->get("namespace")); + foreach (WebGUI::SQL->buildArray("select distinct(namespace) 
from IndexedSearch_docInfo order by namespace")) {
			$namespaces{$_} = $international{$_} || ucfirst($_);
		}
	}
	return \%namespaces;
}

#-------------------------------------------------------------------
# Build an ordered (Tie::IxHash) map of contentType => human readable label.
# When $restricted is true and this wobject is not configured for "any",
# only the configured content types (plus the "any" and "content" shortcut
# entries) are returned; otherwise every contentType present in the index.
sub _getContentTypes {
	my ($self, $restricted) = @_;
	my %international = ( 'page' => WebGUI::International::get(2),
		'wobject' => WebGUI::International::get(19,$self->get("namespace")),
		'wobjectDetail' => WebGUI::International::get(20,$self->get("namespace")),
		'content' => WebGUI::International::get(21,$self->get("namespace")),
		'discussion' => WebGUI::International::get(892),
		'profile' => WebGUI::International::get(22,$self->get("namespace")),
		'help' => WebGUI::International::get(93),
		'any' => WebGUI::International::get(23,$self->get("namespace")),
		);
	tie my %contentTypes, 'Tie::IxHash';
	if ($restricted and $self->get('contentTypes') !~ /any/i) {
		$contentTypes{any} = $international{any};
		$contentTypes{content} = $international{content};	# shortcut for page, wobject and wobjectDetail
		foreach (split/\n/, $self->get('contentTypes')) {
			$contentTypes{$_} = $international{$_};
		}
	} else {
		%contentTypes = ( 'any' => $international{any},
			'content' => $international{content},	# shortcut for page, wobject and wobjectDetail
			);
		foreach (WebGUI::SQL->buildArray("select distinct(contentType) from IndexedSearch_docInfo order by contentType")) {
			$contentTypes{$_} = $international{$_} || ucfirst($_);
		}
	}
	return \%contentTypes;
}

#-------------------------------------------------------------------
# Recursively collect the ids of every page below $searchRoot.
# Returns a hash whose keys are pageIds (values are simply true).
sub _getSearchablePages {
	my $searchRoot = shift;
	my %pages;
	# BUGFIX: quote the id instead of interpolating it raw into the SQL
	# (every sibling helper in this file uses quote() for values), and mark
	# found pages with a plain true value -- the original stored
	# "$pages{...} = defined;" which evaluates defined($_), not a marker.
	my $sth = WebGUI::SQL->read("select pageId from page where parentId = ".quote($searchRoot));
	while (my %data = $sth->hash) {
		$pages{$data{pageId}} = 1;
		%pages = (%pages, _getSearchablePages($data{pageId}) );
	}
	$sth->finish;	# release the handle, as getDetails() does
	return %pages;
}

#-------------------------------------------------------------------
sub _getUsers {
	my ($self, $restricted) = @_;
	tie
my %users, 'Tie::IxHash'; + if ($restricted and $self->get('users') !~ /any/i) { + $users{any} = WebGUI::International::get(25,$self->get("namespace")); + foreach (split/\n/, $self->get('users')) { + $users{$_} = $_; + } + } else { + %users = ( 'any' => WebGUI::International::get(25,$self->get("namespace")), + WebGUI::SQL->buildHash("select userId, username from users order by username") + ); + } + return \%users; +} + +1; diff --git a/lib/WebGUI/Wobject/IndexedSearch/Search.pm b/lib/WebGUI/Wobject/IndexedSearch/Search.pm new file mode 100644 index 000000000..8cfaff2ac --- /dev/null +++ b/lib/WebGUI/Wobject/IndexedSearch/Search.pm @@ -0,0 +1,682 @@ +package WebGUI::Wobject::IndexedSearch::Search; + +=head1 LEGAL + + ------------------------------------------------------------------- + WebGUI is Copyright 2001-2003 Plain Black LLC. + ------------------------------------------------------------------- + Please read the legal notices (docs/legal.txt) and the license + (docs/license.txt) that came with this distribution before using + this software. + ------------------------------------------------------------------- + http://www.plainblack.com info@plainblack.com + ------------------------------------------------------------------- + +=cut + +use strict; +use DBIx::FullTextSearch; +use WebGUI::SQL; +use WebGUI::URL; +use WebGUI::HTML; +use WebGUI::ErrorHandler; +use DBIx::FullTextSearch::StopList; +use WebGUI::Utility; +use WebGUI::Session; +use WebGUI::Privilege; +use HTML::Highlight; +use WebGUI::Macro; + +=head1 NAME + +Package WebGUI::Wobject::IndexedSearch::Search + +=head1 DESCRIPTION + +Search implementation for WebGUI. 
+ +=head1 SYNOPSIS + + use WebGUI::Wobject::IndexedSearch::Search; + my $search = WebGUI::Wobject::IndexedSearch::Search->new(); + $search->indexDocument( { text => 'Index this text', + location => 'http://www.mysite.com/index.pl/faq#45', + languageId => 3, + namespace => 'FAQ' + }); + my $hits = search->search("+foo -bar koo",{ namespace = ['Article', 'FAQ']} ); + + $search->close; + + +=head1 SEE ALSO + +This package is an extension to DBIx::FullTextSearch and HTML::Highlight. +See that packages for documentation of their methods. + +=head1 METHODS + +These methods are available from this package: + +=cut + +#------------------------------------------------------------------- + +=head2 close ( ) + +Closes the DBIx::FullTextSearch session. + +=cut + +sub close { + my $self=shift; + $self->DESTROY(); +} + +#------------------------------------------------------------------- + +=head2 create ( [ %options ] ) + +Creates a new DBIx::FullTextSearch index. + +=over + +=item %options + +Options to pass to DBIx::FullTextSearch. +The default options that are used are: + +( backend => column, word_length => 20, stoplist => undef ) + +Please refer to the DBIx::FullTextSearch documentation for a complete list of options. 
+ +=back + +=cut + +sub create { + my ($self, %options) = @_; + %options = (%{$self->{_createOptions}}, %options); + if($options{stemmer}) { + eval "use Lingua::Stem"; + if ($@) { + WebGUI::ErrorHandler::warn("IndexedSearch: Can't use stemmer: $@"); + delete $options{stemmer}; + } + } + if($options{stoplist}) { + if(not $self->existsTable($self->getIndexName."_".$options{stoplist}."_stoplist")) { + DBIx::FullTextSearch::StopList->create_default($self->getDbh, $self->getIndexName."_".$options{stoplist}, $options{stoplist}); + } + $options{stoplist} = $self->getIndexName."_".$options{stoplist}; + } + $self->{_fts} = DBIx::FullTextSearch->create($self->getDbh, $self->getIndexName, %options); + if (not defined $self->{_fts}) { + WebGUI::ErrorHandler::fatalError("IndexedSearch: Unable to create index.\n$DBIx::FullTextSearch::errstr"); + return undef; + } + $self->{_docId} = 1; + return $self->{_fts}; +} + +#------------------------------------------------------------------- + +=head2 existsTable ( tableName ) + +Returns true if tableName exists in database. + +=over + +=item tableName + +The name of table. + +=back + +=cut + +sub existsTable { + my ($self, $table) = @_; + return isIn($table, WebGUI::SQL->buildArray("show tables")); +} + +#------------------------------------------------------------------- + +=head2 getDetails ( docIdList , [ %options ] ) + +Returns an array reference containing details for each docId. + +=over + +=item docIdList + +An array reference containing docIds. + +=item previewLength + +The maximum number of characters in each of the context sections. Defaults to "80". + +=item highlight + +A boolean indicating whether or not to enable highlight. Defaults to "1". + +=item highlightColors + +A reference to an array of CSS color identificators. 
+ +=item + +=back + +=cut + +sub getDetails { + my ($self, $docIdList, %options) = @_; + my $docIds = join(',',@$docIdList); + my (@searchDetails, %namespace); + foreach my $wobject (@{$session{config}{wobjects}}){ + my $cmd = "WebGUI::Wobject::".$wobject; + my $w = $cmd->new({namespace=>$wobject, wobjectId=>'new'}); + $namespace{$wobject} = $w->name; + } + my $sql = "select * from IndexedSearch_docInfo where docId in ($docIds) and indexName = ".quote($self->getIndexName) ; + $sql .= " ORDER BY FIELD(docId, $docIds)"; # Maintain $docIdList order + my $sth = WebGUI::SQL->read($sql); + while (my %data = $sth->hash) { + $data{namespace} = $namespace{$data{namespace}} || ucfirst($data{namespace}); + if ($data{ownerId}) { + ($data{username}) = WebGUI::SQL->quickArray("select username from users where userId = ".quote($data{ownerId})); + $data{userProfile} = WebGUI::URL::page("op=viewProfile&uid=$data{ownerId}"); + } + if ($data{bodyShortcut} =~ /^\s*select /i) { + $data{body} = (WebGUI::SQL->quickArray($data{bodyShortcut}))[0]; + } else { + $data{body} = $data{bodyShortcut}; + } + if ($data{headerShortcut} =~ /^\s*select /i) { + $data{header} = (WebGUI::SQL->quickArray($data{headerShortcut}))[0]; + } else { + $data{header} = $data{headerShortcut}; + } + delete($data{bodyShortcut}); + delete($data{headerShortcut}); + if($data{body}) { + $data{body} = WebGUI::Macro::filter($data{body}); + $data{body} = WebGUI::HTML::filter($data{body},'all'); + $data{body} = $self->preview($data{body}, $options{previewLength}); + $data{body} = $self->highlight($data{body},undef, $options{highlightColors}) if ($options{highlight}); + } + if($data{header}) { + $data{header} = WebGUI::Macro::filter($data{header}); + $data{header} = WebGUI::HTML::filter($data{header},'all'); + $data{header} = $self->highlight($data{header},undef, $options{highlightColors}) if ($options{highlight}); + $data{location} = WebGUI::URL::gateway($data{location}); + } + # $data{crumbTrail} = 
WebGUI::Macro::C_crumbTrail::_recurseCrumbTrail($data{pageId}, ' > ');
		# $data{crumbTrail} =~ s/\s*>\s*$//;
		push(@searchDetails, \%data);
	}
	$sth->finish;
	return \@searchDetails;
}

#-------------------------------------------------------------------

=head2 getDbh ( )

Returns the object's database handler.

=cut

sub getDbh {
	my $self = shift;
	return $self->{_dbh};
}

#-------------------------------------------------------------------

=head2 getDocId ( )

Returns the next docId for this object.

=cut

sub getDocId {
	my $self = shift;
	return $self->{_docId};
}

#-------------------------------------------------------------------

=head2 getIndexName ( )

Returns the full index name of this object.

=cut

sub getIndexName {
	my $self = shift;
	return $self->{_indexName};
}

#-------------------------------------------------------------------

=head2 _queryToWords ( [ query ] )

Converts a DBIx::FullTextSearch query to (\@Words, \@Wildcards) suitable to pass to HTML::Highlight

=cut

sub _queryToWords {
	my ($self, $query) = @_;
	# BUGFIX: the original "my $query ||= $self->{_query};" re-declared
	# $query and masked the parameter, so an explicitly passed query was
	# always ignored in favor of the cached one (and triggered a
	# "masks earlier declaration" warning under strict/warnings).
	$query ||= $self->{_query};

	# Return the processed words / wildcards from memory if it's cached.
	if ($self->{$query."words"} && $self->{$query."wildcards"}) {
		return ($self->{$query."words"}, $self->{$query."wildcards"});
	}

	# deal with quotes: split on '"' so that every odd chunk is inside a
	# quoted phrase and every even chunk is a run of single words
	my $inQuote = 0;
	my (@words, @wildcards);
	foreach (split(/\"/, $query)) {
		if ($inQuote == 0) {
			foreach (split(/\s+/, $_)) {
				next if (/^AND$/i);	# boolean AND
				next if (/^OR$/i);	# boolean OR
				next if (/^NOT$/i);	# boolean NOT
				next if (/^\-/);	# exclude word
				next if (/^.{0,1}$/);	# at least 2 characters
				if (/\*/) {
					push(@wildcards, '%');	# match any character
				} else {
					push(@wildcards, '*');	# Also match plural of word
				}
				s/['"()+*]+//g;	# remove query operators and quotes
				push(@words, $_);
			}
		} else {
			my $phrase = $_;
			push(@words, qq/$phrase/);
			push(@wildcards, undef);	# Exact match
		}
		# Toggle the in-phrase flag. The original wrote
		# "$inQuote = ++$inQuote % 2;", which modifies $inQuote twice
		# in one expression.
		$inQuote = ($inQuote + 1) % 2;
	}
	# Store words / wildcards in memory
	$self->{$query."words"} = \@words;
	$self->{$query."wildcards"} = \@wildcards;

	return (\@words, \@wildcards);
}

#-------------------------------------------------------------------

=head2 highlight ( text [ , query , colors ] )

Highlight words or patterns in HTML documents.

=over

=item text

The text to highlight

=item query

A query containing the words to highlight. Defaults to the last used $search->search query.
Special case: When query contains only an asterisk '*', no highlighting is applied.

=item colors

A reference to an array of CSS color identifiers.

=back

=cut

sub highlight {
	my ($self, $text, $query, $colors) = @_;
	# BUGFIX: same parameter-masking "my $query ||=" defect as in
	# _queryToWords; the passed query is now honored.
	$query ||= $self->{_query};
	return $text if ($query =~ /^\s*\*\s*$/);	# query = '*', no highlight
	my ($words, $wildcards) = $self->_queryToWords($query);
	# direct method call instead of indirect-object "new HTML::Highlight"
	my $hl = HTML::Highlight->new( words => $words,
				wildcards => $wildcards,
				colors => $colors
			);
	return $hl->highlight($text);
}

#-------------------------------------------------------------------

=head2 indexDocument ( hashRef )

Adds a document to the index.
+ +This method doesn't store the document itself. Instead, it stores information about words +in the document in such a structured way that it makes easy and fast to look up what +documents contain certain words and return id's of the documents. + +=over + +=item text + +The text to index. + +=item location + +The location of the document. Most likely an URL. + +=item contentType + +The content type of this document. + +=item docId + +The unique Id of this document. Defaults to the next empty docId. + +=item pageId + +The pageId of the page on which this document resides. Defaults to 0. + +=item wobjectId + +The wobjectID of the wobject that holds this document. Defaults to 0. + +=item ownerId + +The ownerId of the document. Defaults to 3. + +=item languageId + +The languageId of this document. Defaults to undef. + +=item namespace + +The namespace of this document. Defaults to 'WebGUI'. + +=item page_groupIdView + +Id of group authorized to view this page. Defaults to '7' (everyone) + +=item wobject_groupIdView + +Id of group authorized to view this wobject. Defaults to '7' (everyone) + +=item wobject_special_groupIdView + +Id of group authorized to view the details of this wobject. + +=item headerShortcut + +An sql statement that returns the header (title, question, subject, name, whatever) +of this document. + +=item bodyShortcut + +An sql statement that returns the body (description, answer, message, whatever) +of this document. + +=back + +=cut + +sub indexDocument { + my ($self, $document) = @_; + $self->{_fts}->index_document($document->{docId} || $self->{_docId}, $document->{text}); + WebGUI::SQL->write("insert into IndexedSearch_docInfo ( docId, + indexName, + pageId, + wobjectId, + languageId, + namespace, + location, + page_groupIdView, + wobject_groupIdView, + wobject_special_groupIdView, + headerShortcut, + bodyShortcut, + contentType, + ownerId ) + values ( ". + ($document->{docId} || $self->{_docId}).", ". + quote($self->getIndexName).", ". 
+ ($document->{pageId} || 0).", ". + ($document->{wobjectId} || 0).", ". + ($document->{languageId} || quote('')).", ". + quote($document->{namespace} || 'WebGUI')." , ". + quote($document->{location}).", ". + ($document->{page_groupIdView} || 7).", ". + ($document->{wobject_groupIdView} || 7).", ". + ($document->{wobject_special_groupIdView} || 7).", ". + quote($document->{headerShortcut})." ,". + quote($document->{bodyShortcut})." ,". + quote($document->{contentType})." ,". + ($document->{ownerId} || 3)." )" + ); + $self->{_docId}++; +} + +#------------------------------------------------------------------- + +=head2 new ( [ indexName , dbh ] ) + +Constructor. + +=over + +=item indexName + +The name of the index to open. Defaults to 'default'. + +=item $dbh + +Database handler to use. Defaults to $WebGUI::Session::session{dbh}. + +=back + +=cut + +sub new { + my ($class, $indexName, $dbh) = @_; + $indexName = $indexName || 'default'; + my $self = { _indexName => $indexName, + _dbh => $dbh || $WebGUI::Session::session{dbh}, + _createOptions => {( backend => 'column', + word_length => 20, + filter => 'map { lc $_ if ($_ !~ /\^.*;/) }' + )}, + }; + bless $self, $class; +} + +#------------------------------------------------------------------- + +=head2 open ( ) + +Opens an existing DBIx::FullTextSearch index. 

=cut

sub open {
	my ($self) = @_;
	$self->{_fts} = DBIx::FullTextSearch->open($self->getDbh, $self->getIndexName);
	if (not defined $self->{_fts}) {
		WebGUI::ErrorHandler::fatalError("IndexedSearch: Unable to open index.\n$DBIx::FullTextSearch::errstr");
		return undef;
	}
	# Resume numbering after the highest docId already stored for this index.
	($self->{_docId}) = WebGUI::SQL->quickArray("select max(docId) from IndexedSearch_docInfo where indexName = ".quote($self->getIndexName));
	$self->{_docId}++;
	return $self->{_fts};
}

#-------------------------------------------------------------------

=head2 preview ( text , [ previewLength , query ] )

Returns a context preview in which words from a search query appear in the resulting documents.
The words are always in the middle of each of the sections.

=over

=item text

The text to preview

=item previewLength

The maximum number of characters in each of the context sections. Defaults to 80.
A preview length of "0" means no preview,
while a negative preview length returns the complete text.

=item query

A query containing the words to highlight. Defaults to the last used $search->search query.

=back

=cut

sub preview {
	my ($self, $text, $previewLength, $query) = @_;
	$previewLength = 80 if (not defined $previewLength);
	return '' unless ($previewLength);
	return $text if ($previewLength < 0);
	# BUGFIX: "my $query ||= ..." masked the parameter, so an explicitly
	# passed query was always ignored in favor of the cached search query.
	$query ||= $self->{_query};
	if (($query =~ /^\s*\*\s*$/) or not $query) {	# Query is '*' or empty.
		$text = WebGUI::HTML::filter($text,'all');
		$text =~ s/^(.{1,$previewLength})\s+.*$/$1/s;
	} else {
		my ($words, $wildcards) = $self->_queryToWords($query);
		# direct method call instead of indirect-object "new HTML::Highlight"
		my $hl = HTML::Highlight->new( words => $words,
					wildcards => $wildcards
				);
		my $preview = join('... ',@{$hl->preview_context($text, $previewLength)});
		if ($preview) {
			$text = $preview;
		} else {
			$text = WebGUI::HTML::filter($text,'all');
			$text =~ s/^(.{1,$previewLength})\s+.*$/$1/s;
		}
	}
	# NOTE(review): "(\s| )" is redundant as written (\s already matches a
	# space); it looks like a mangled "&nbsp;" alternative -- kept verbatim
	# to preserve behavior, confirm against the original source.
	$text =~ s/^(\s| )+//;
	$text =~ s/(\s| )+$//;
	if ($text ne '') {
		$text = '... '.$text if ($text !~ /^[A-Z]+/);	# ... broken up at the beginning
		$text .= ' ...' if ($text !~ /\.$/);		# broken up at the end ...
	}
	return $text;
}

#-------------------------------------------------------------------

=head2 recreate ( [ %options ] )

Like create, but first drops the existing index. Useful when rebuilding the index.

=over

=item %options

Options to pass to WebGUI::IndexedSearch->create()

=back

=cut

sub recreate {
	my ($self, %options) = @_;
	$self->{_fts} = DBIx::FullTextSearch->open($self->getDbh, $self->getIndexName);
	if (defined $self->{_fts}) {
		$self->{_fts}->drop;
	}
	# BUGFIX: create() takes only ($self, %options); the index name and dbh
	# that were previously passed positionally were swallowed into %options
	# as a bogus key/value pair and forwarded to DBIx::FullTextSearch.
	$self->{_fts} = $self->create(%options);
	WebGUI::SQL->write("delete from IndexedSearch_docInfo where indexName = ".quote($self->getIndexName));
	return $self->{_fts};
}

#-------------------------------------------------------------------

=head2 search ( query, \%filter )

Returns an array reference of docId's of documents that match the query.
If the search has no results, undef is returned.

=over

=item query

user input string. Will be parsed into can-include, must-include and must-not-include words and phrases.
Special case: when query is an asterisk (*), then no full text search is done, and results are returned
using \%filter.

Examples are:

"this is a phrase" -koo +bar foo
 (foo OR baz) AND (bar OR caz)

=item filter

A hash reference containing filter elements.
+ +Example: + { + language => [ 1, 3 ], + namespace => [ 'Article', 'USS' ] + } + +=back + +=cut + +sub search { + my ($self, $query, $filter) = @_; + $self->{_query} = $query; + my $noFtsSearch = ($query =~ /^\s*\*\s*$/); # query = '*', no full text search + my @fts_docIds = $self->{_fts}->search($query) unless $noFtsSearch ; + if(@fts_docIds || $noFtsSearch) { + my $groups = join(',',@{$self->_getGroups}); + my $docIds = join(',',@fts_docIds); + my $sql = "select docId from IndexedSearch_docInfo where indexName = ".quote($self->getIndexName); + $sql .= " and docId in ($docIds)" unless $noFtsSearch; + $sql .= " and page_groupIdView in ($groups)"; + $sql .= " and wobject_special_groupIdView in ($groups)"; + if ($session{setting}{wobjectPrivileges}) { + $sql .= " and wobject_groupIdView in ($groups)"; + } + foreach my $filterElement (keys %{$filter}) { + $sql .= " AND $filterElement in (".join(',', @{$filter->{$filterElement}}).")"; + } + # No trash or other garbage + $sql .= " AND (pageId > 999 or pageId < 0 or pageId = 1) "; + # Keep @fts_docIds list order + $sql .= " ORDER BY FIELD(docID,$docIds)" unless $noFtsSearch; + my $filteredDocIds = WebGUI::SQL->buildArrayRef($sql); + return $filteredDocIds if (ref $filteredDocIds eq 'ARRAY' and @{$filteredDocIds}); + } + return undef; +} + +#------------------------------------------------------------------- + +=head2 _getGroups ( ) + +Returns an array reference containing all groupIds of groups the user is in. 
+ +=cut + +sub _getGroups { + my @groups; + foreach my $groupId (WebGUI::SQL->buildArray("select groupId from groups")) { + push(@groups, $groupId) if (WebGUI::Privilege::isInGroup($groupId)); + } + return \@groups; +} + +#------------------------------------------------------------------- +sub DESTROY { + my $self=shift; + if (ref($self->{_fts})) { + $self->{_fts}->DESTROY(); + } +} + +1; diff --git a/sbin/Hourly/IndexedSearch_buildIndex.pm b/sbin/Hourly/IndexedSearch_buildIndex.pm new file mode 100644 index 000000000..f503f8196 --- /dev/null +++ b/sbin/Hourly/IndexedSearch_buildIndex.pm @@ -0,0 +1,228 @@ +package Hourly::IndexedSearch_buildIndex; + +#------------------------------------------------------------------- +# WebGUI is Copyright 2001-2003 Plain Black LLC. +#------------------------------------------------------------------- +# Please read the legal notices (docs/legal.txt) and the license +# (docs/license.txt) that came with this distribution before using +# this software. +#------------------------------------------------------------------- +# http://www.plainblack.com info@plainblack.com +#------------------------------------------------------------------- + +use DBI; +use strict; +use WebGUI::DateTime; +use WebGUI::Session; +use WebGUI::Utility; +use WebGUI::SQL; +use WebGUI::URL; +use WebGUI::Wobject::IndexedSearch::Search; + + +#------------------------------------------------------------------- +sub process { + my $verbose = shift; + print "\n"; + my $indexName = 'IndexedSearch_default'; + my $htmlFilter = 'all'; + my $stopList = 'none'; + undef $stopList if ($stopList eq 'none'); + my $stemmer = 'none'; + undef $stemmer if ($stemmer eq 'none'); + my $backend = 'phrase'; + my $indexInfo = getIndexerParams(); + my $search = WebGUI::Wobject::IndexedSearch::Search->new($indexName); + $search->recreate('','',stemmer => $stemmer, stoplist => $stopList, backend => $backend); + my $startTime = WebGUI::DateTime::time(); + foreach my $namespace (keys 
%{$indexInfo}) { + my $sth = WebGUI::SQL->read($indexInfo->{$namespace}{sql}); + my $total = $sth->rows; + my $actual = 1; + while (my %data = $sth->hash) { + if ($verbose) { + print "\r\t\tIndexing $namespace data ($total items) ...". + (" " x (30 - (length($namespace)) - length("$total"))). + int(($actual/$total)*100)." % "; + } + my $textToIndex = ""; + foreach my $field (@{$indexInfo->{$namespace}{fieldsToIndex}}) { + if($field =~ /^\s*select/i) { + my $sql = eval 'sprintf("%s","'.$field.'")'; + $textToIndex .= join("\n", WebGUI::SQL->buildArray($sql)); + } else { + $textToIndex .= $data{$field}."\n"; + } + } + $textToIndex = WebGUI::HTML::filter($textToIndex,$htmlFilter); + my $url = eval $indexInfo->{$namespace}{url}; + my $headerShortcut = eval 'sprintf("%s","'.$indexInfo->{$namespace}{headerShortcut}.'")'; + my $bodyShortcut = eval 'sprintf("%s","'.$indexInfo->{$namespace}{bodyShortcut}.'")'; + $search->indexDocument({ + text => $textToIndex, + location => $url, + pageId => $data{pageId}, + wobjectId => $data{wid}, + languageId => $data{languageId}, + namespace => $data{namespace}, + page_groupIdView => $data{page_groupIdView}, + wobject_groupIdView => $data{wobject_groupIdView}, + wobject_special_groupIdView => $data{wobject_special_groupIdView}, + headerShortcut => $headerShortcut, + bodyShortcut => $bodyShortcut, + contentType => $indexInfo->{$namespace}{contentType}, + ownerId => $data{ownerId} + }); + $actual++; + } + print "\n" if ($verbose && $total); + } + print "\t\t".(($search->getDocId -1)." WebGUI items indexed in ".(time() - $startTime)." 
seconds.\n\t") if ($verbose); + $search->close; +} + +#------------------------------------------------------------------- +sub getIndexerParams { + my $now = WebGUI::DateTime::time(); + my %params = ( + page => { + sql => "select pageId, + title, + urlizedTitle, + synopsis, + languageId, + ownerId, + 'Page' as namespace, + groupIdView as page_groupIdView, + 7 as wobject_groupIdView, + 7 as wobject_special_groupIdView + from page + where startDate < $now and endDate > $now", + fieldsToIndex => ["synopsis" , "title"], + contentType => 'page', + url => '$data{urlizedTitle}', + headerShortcut => 'select title from page where pageId = $data{pageId}', + bodyShortcut => 'select synopsis from page where pageId = $data{pageId}', + }, + wobject => { + sql => "select wobject.namespace as namespace, + wobject.title as title, + wobject.description as description, + wobject.wobjectId as wid, + wobject.addedBy as ownerId, + page.urlizedTitle as urlizedTitle, + page.languageId as languageId, + page.pageId as pageId, + page.groupIdView as page_groupIdView, + wobject.groupIdView as wobject_groupIdView, + 7 as wobject_special_groupIdView + from wobject , page + where wobject.pageId = page.pageId + and wobject.startDate < $now + and wobject.endDate > $now + and page.startDate < $now + and page.endDate > $now", + fieldsToIndex => ["title", "description"], + contentType => 'wobject', + url => '$data{urlizedTitle}."#".$data{wid}', + headerShortcut => 'select title from wobject where wobjectId = $data{wid}', + bodyShortcut => 'select description from wobject where wobjectId = $data{wid}', + }, + wobjectDiscussion => { + sql => "select forumPost.forumPostId, + forumPost.username, + forumPost.subject, + forumPost.message, + forumPost.userId as ownerId, + wobject.namespace as namespace, + wobject.wobjectId as wid, + forumThread.forumId as forumId, + page.urlizedTitle as urlizedTitle, + page.languageId as languageId, + page.pageId as pageId, + page.groupIdView as page_groupIdView, + 
wobject.groupIdView as wobject_groupIdView, + 7 as wobject_special_groupIdView + from forumPost, forumThread, wobject, page + where forumPost.forumThreadId = forumThread.forumThreadId + and forumThread.forumId = wobject.forumId + and wobject.pageId = page.pageId + and wobject.startDate < $now + and wobject.endDate > $now + and page.startDate < $now + and page.endDate > $now", + fieldsToIndex => ["username", "subject", "message"], + contentType => 'discussion', + url => 'WebGUI::URL::append($data{urlizedTitle},"func=view&wid=$data{wid}&forumId=$data{forumId}&forumOp=viewThread&forumPostId=$data{forumPostId}")', + headerShortcut => 'select subject from forumPost where forumPostId = $data{forumPostId}', + bodyShortcut => 'select message from forumPost where forumPostId = $data{forumPostId}', + }, + help => { + sql => "select distinct(page.languageId) as languageId, + title.message as title, + body.message as body, + help.helpId as hid, + help.titleId as titleId, + help.bodyId as bodyId, + help.namespace as namespace, + 1 as pageId, + 7 as page_groupIdView, + 7 as wobject_groupIdView, + 7 as wobject_special_groupIdView + from help, page + left join international body on bodyId = body.internationalId + and help.namespace = body.namespace + and page.languageId = body.languageId + left join international title on titleId = title.internationalId + and help.namespace = title.namespace + and page.languageId = title.languageId + where body.languageId = title.languageId", + fieldsToIndex => ["title", "body"], + contentType => 'help', + url => '"?op=viewHelp&hid=$data{hid}&namespace=$data{namespace}"', + headerShortcut => q/select message from international where languageId=$data{languageId} + and namespace='$data{namespace}' and internationalId=$data{titleId}/, + bodyShortcut => q/select message from international where languageId=$data{languageId} + and namespace='$data{namespace}' and internationalId=$data{bodyId}/, + + }, + userProfileData => { + sql => "select 
distinct(userProfileData.userId), + userProfileData.userId as ownerId, + '' as languageId, + b.fieldData as publicProfile, + 'profile' as namespace, + 1 as pageId, + 7 as page_groupIdView, + 7 as wobject_groupIdView, + 7 as wobject_special_groupIdView + from userProfileData + LEFT join userProfileData b + on userProfileData.userId=b.userId + and b.fieldName='publicProfile' + where b.fieldData=1;", + fieldsToIndex => [ q/select concat(userProfileField.fieldName,' ',userProfileData.fieldData) + from userProfileField, userProfileCategory, userProfileData + where userProfileField.profileCategoryId=userProfileCategory.profileCategoryId + and userProfileCategory.visible=1 + and userProfileField.visible=1 + and userProfileData.fieldName = userProfileField.fieldName + and fieldData <> '' + and userProfileData.userId = $data{userId} + / ], + url => '"?op=viewProfile&uid=$data{userId}"', + contentType => 'profile', + headerShortcut => 'select username from users where userId = $data{userId}', + #bodyShortcut => q/select concat(fieldName,': ',fieldData) from userProfileData where userId = $data{userId}/ + bodyShortcut => '$textToIndex', + } + ); + foreach my $wobject (@{$session{config}{wobjects}}) { + my $cmd = "WebGUI::Wobject::".$wobject; + my $w = $cmd->new({wobjectId=>"new",namespace=>$wobject}); + %params = (%params, %{$w->getIndexerParams($now)}); + } + return \%params; +} + +1; diff --git a/www/extras/wobject/IndexedSearch/ColorPicker2.js b/www/extras/wobject/IndexedSearch/ColorPicker2.js new file mode 100644 index 000000000..c8b91822a --- /dev/null +++ b/www/extras/wobject/IndexedSearch/ColorPicker2.js @@ -0,0 +1,73 @@ +// =================================================================== +// Author: Matt Kruse +// WWW: http://www.mattkruse.com/ +// +// NOTICE: You may use this code for any purpose, commercial or +// private, without any further permission from the author. 
You may +// remove this notice from your final code if you wish, however it is +// appreciated by the author if at least my web site address is kept. +// +// You may *NOT* re-distribute this code in any way except through its +// use. That means, you can include it in your product, or your web +// site, or any other form where the code is actually being used. You +// may not put the plain javascript up on your site for download or +// include it in your javascript libraries for download. +// If you wish to share this code with others, please just point them +// to the URL instead. +// Please DO NOT link directly to my .js files from your site. Copy +// the files to your server and use them there. Thank you. +// =================================================================== + +/* SOURCE FILE: AnchorPosition.js */ +function getAnchorPosition(anchorname){var useWindow=false;var coordinates=new Object();var x=0,y=0;var use_gebi=false, use_css=false, use_layers=false;if(document.getElementById){use_gebi=true;}else if(document.all){use_css=true;}else if(document.layers){use_layers=true;}if(use_gebi && document.all){x=AnchorPosition_getPageOffsetLeft(document.all[anchorname]);y=AnchorPosition_getPageOffsetTop(document.all[anchorname]);}else if(use_gebi){var o=document.getElementById(anchorname);x=AnchorPosition_getPageOffsetLeft(o);y=AnchorPosition_getPageOffsetTop(o);}else if(use_css){x=AnchorPosition_getPageOffsetLeft(document.all[anchorname]);y=AnchorPosition_getPageOffsetTop(document.all[anchorname]);}else if(use_layers){var found=0;for(var i=0;i screen.availHeight){this.y = screen.availHeight - this.height;}}if(screen && screen.availWidth){if((this.x + this.width) > screen.availWidth){this.x = screen.availWidth - this.width;}}this.popupWindow = 
window.open("about:blank","window_"+anchorname,"toolbar=no,location=no,status=no,menubar=no,scrollbars=auto,resizable,alwaysRaised,dependent,titlebar=no,width="+this.width+",height="+this.height+",screenX="+this.x+",left="+this.x+",screenY="+this.y+",top="+this.y+"");}this.refresh();}} +function PopupWindow_hidePopup(){if(this.divName != null){if(this.use_gebi){document.getElementById(this.divName).style.visibility = "hidden";}else if(this.use_css){document.all[this.divName].style.visibility = "hidden";}else if(this.use_layers){document.layers[this.divName].visibility = "hidden";}}else{if(this.popupWindow && !this.popupWindow.closed){this.popupWindow.close();this.popupWindow = null;}}} +function PopupWindow_isClicked(e){if(this.divName != null){if(this.use_layers){var clickX = e.pageX;var clickY = e.pageY;var t = document.layers[this.divName];if((clickX > t.left) &&(clickX < t.left+t.clip.width) &&(clickY > t.top) &&(clickY < t.top+t.clip.height)){return true;}else{return false;}}else if(document.all){var t = window.event.srcElement;while(t.parentElement != null){if(t.id==this.divName){return true;}t = t.parentElement;}return false;}else if(this.use_gebi){var t = e.originalTarget;while(t.parentNode != null){if(t.id==this.divName){return true;}t = t.parentNode;}return false;}return false;}return false;} +function PopupWindow_hideIfNotClicked(e){if(this.autoHideEnabled && !this.isClicked(e)){this.hidePopup();}} +function PopupWindow_autoHide(){this.autoHideEnabled = true;} +function PopupWindow_hidePopupWindows(e){for(var i=0;i0){this.type="DIV";this.divName = arguments[0];}else{this.type="WINDOW";}this.use_gebi = false;this.use_css = false;this.use_layers = false;if(document.getElementById){this.use_gebi = true;}else if(document.all){this.use_css = true;}else if(document.layers){this.use_layers = true;}else{this.type = "WINDOW";}this.offsetX = 0;this.offsetY = 0;this.getXYPosition = PopupWindow_getXYPosition;this.populate = PopupWindow_populate;this.refresh = 
PopupWindow_refresh;this.showPopup = PopupWindow_showPopup;this.hidePopup = PopupWindow_hidePopup;this.setSize = PopupWindow_setSize;this.isClicked = PopupWindow_isClicked;this.autoHide = PopupWindow_autoHide;this.hideIfNotClicked = PopupWindow_hideIfNotClicked;} + + +/* SOURCE FILE: ColorPicker2.js */ + +ColorPicker_targetInput = null; +function ColorPicker_writeDiv(){document.writeln("");} +function ColorPicker_show(anchorname){this.showPopup(anchorname);} +function ColorPicker_pickColor(color,obj){obj.hidePopup();pickColor(color);} +function pickColor(color){if(ColorPicker_targetInput==null){alert("Target Input is null, which means you either didn't use the 'select' function or you have no defined your own 'pickColor' function to handle the picked color!");return;}ColorPicker_targetInput.value = color;} +function ColorPicker_select(inputobj,linkname){if(inputobj.type!="text" && inputobj.type!="hidden" && inputobj.type!="textarea"){alert("colorpicker.select: Input object passed is not a valid form input object");window.ColorPicker_targetInput=null;return;}window.ColorPicker_targetInput = inputobj;this.show(linkname);} +function ColorPicker_highlightColor(c){var thedoc =(arguments.length>1)?arguments[1]:window.document;var d = thedoc.getElementById("colorPickerSelectedColor");d.style.backgroundColor = c;d = thedoc.getElementById("colorPickerSelectedColorValue");d.innerHTML = c;} +function ColorPicker(){var windowMode = false;if(arguments.length==0){var divname = "colorPickerDiv";}else if(arguments[0] == "window"){var divname = '';windowMode = true;}else{var divname = arguments[0];}if(divname != ""){var cp = new PopupWindow(divname);}else{var cp = new PopupWindow();cp.setSize(250,225);}cp.currentValue = "#FFFFFF";cp.writeDiv = ColorPicker_writeDiv;cp.highlightColor = ColorPicker_highlightColor;cp.show = ColorPicker_show;cp.select = ColorPicker_select;var colors = new 
Array("#000000","#000033","#000066","#000099","#0000CC","#0000FF","#330000","#330033","#330066","#330099","#3300CC", +"#3300FF","#660000","#660033","#660066","#660099","#6600CC","#6600FF","#990000","#990033","#990066","#990099", +"#9900CC","#9900FF","#CC0000","#CC0033","#CC0066","#CC0099","#CC00CC","#CC00FF","#FF0000","#FF0033","#FF0066", +"#FF0099","#FF00CC","#FF00FF","#003300","#003333","#003366","#003399","#0033CC","#0033FF","#333300","#333333", +"#333366","#333399","#3333CC","#3333FF","#663300","#663333","#663366","#663399","#6633CC","#6633FF","#993300", +"#993333","#993366","#993399","#9933CC","#9933FF","#CC3300","#CC3333","#CC3366","#CC3399","#CC33CC","#CC33FF", +"#FF3300","#FF3333","#FF3366","#FF3399","#FF33CC","#FF33FF","#006600","#006633","#006666","#006699","#0066CC", +"#0066FF","#336600","#336633","#336666","#336699","#3366CC","#3366FF","#666600","#666633","#666666","#666699", +"#6666CC","#6666FF","#996600","#996633","#996666","#996699","#9966CC","#9966FF","#CC6600","#CC6633","#CC6666", +"#CC6699","#CC66CC","#CC66FF","#FF6600","#FF6633","#FF6666","#FF6699","#FF66CC","#FF66FF","#009900","#009933", +"#009966","#009999","#0099CC","#0099FF","#339900","#339933","#339966","#339999","#3399CC","#3399FF","#669900", +"#669933","#669966","#669999","#6699CC","#6699FF","#999900","#999933","#999966","#999999","#9999CC","#9999FF", +"#CC9900","#CC9933","#CC9966","#CC9999","#CC99CC","#CC99FF","#FF9900","#FF9933","#FF9966","#FF9999","#FF99CC", +"#FF99FF","#00CC00","#00CC33","#00CC66","#00CC99","#00CCCC","#00CCFF","#33CC00","#33CC33","#33CC66","#33CC99", +"#33CCCC","#33CCFF","#66CC00","#66CC33","#66CC66","#66CC99","#66CCCC","#66CCFF","#99CC00","#99CC33","#99CC66", +"#99CC99","#99CCCC","#99CCFF","#CCCC00","#CCCC33","#CCCC66","#CCCC99","#CCCCCC","#CCCCFF","#FFCC00","#FFCC33", +"#FFCC66","#FFCC99","#FFCCCC","#FFCCFF","#00FF00","#00FF33","#00FF66","#00FF99","#00FFCC","#00FFFF","#33FF00", 
+"#33FF33","#33FF66","#33FF99","#33FFCC","#33FFFF","#66FF00","#66FF33","#66FF66","#66FF99","#66FFCC","#66FFFF", +"#99FF00","#99FF33","#99FF66","#99FF99","#99FFCC","#99FFFF","#CCFF00","#CCFF33","#CCFF66","#CCFF99","#CCFFCC", +"#CCFFFF","#FFFF00","#FFFF33","#FFFF66","#FFFF99","#FFFFCC","#FFFFFF");var total = colors.length;var width = 18;var cp_contents = "";var windowRef =(windowMode)?"window.opener.":"";if(windowMode){cp_contents += "Select Color";cp_contents += "
";}cp_contents += "
";var use_highlight =(document.getElementById || document.all)?true:false;for(var i=0;i   ';if( ((i+1)>=total) ||(((i+1) % width) == 0)){cp_contents += "";}}if(document.getElementById){var width1 = Math.floor(width/2);var width2 = width = width1;cp_contents += "";}cp_contents += "
 #FFFFFF
";if(windowMode){cp_contents += "";}cp.populate(cp_contents+"\n");cp.offsetY = 25;cp.autoHide();return cp;} +