From 598ccd1766ed41500ca2c344ef623b32a6c64585 Mon Sep 17 00:00:00 2001 From: JT Smith Date: Sun, 24 Apr 2005 01:05:29 +0000 Subject: [PATCH] readded DBIx::FullTextSearch because it's almost impossible to add locally --- lib/DBIx/FullTextSearch.pm | 1243 ++++++++++ lib/DBIx/FullTextSearch/Blob.pm | 263 +++ lib/DBIx/FullTextSearch/BlobFast.pm | 83 + lib/DBIx/FullTextSearch/Column.pm | 188 ++ lib/DBIx/FullTextSearch/File.pm | 27 + lib/DBIx/FullTextSearch/Phrase.pm | 182 ++ lib/DBIx/FullTextSearch/StopList.pm | 271 +++ lib/DBIx/FullTextSearch/String.pm | 78 + lib/DBIx/FullTextSearch/Table.pm | 142 ++ lib/DBIx/FullTextSearch/TestConfig.pm | 6 + lib/DBIx/FullTextSearch/URL.pm | 30 + lib/Parse/RecDescent.pm | 3045 +++++++++++++++++++++++++ 12 files changed, 5558 insertions(+) create mode 100644 lib/DBIx/FullTextSearch.pm create mode 100644 lib/DBIx/FullTextSearch/Blob.pm create mode 100644 lib/DBIx/FullTextSearch/BlobFast.pm create mode 100644 lib/DBIx/FullTextSearch/Column.pm create mode 100644 lib/DBIx/FullTextSearch/File.pm create mode 100644 lib/DBIx/FullTextSearch/Phrase.pm create mode 100644 lib/DBIx/FullTextSearch/StopList.pm create mode 100644 lib/DBIx/FullTextSearch/String.pm create mode 100644 lib/DBIx/FullTextSearch/Table.pm create mode 100644 lib/DBIx/FullTextSearch/TestConfig.pm create mode 100644 lib/DBIx/FullTextSearch/URL.pm create mode 100644 lib/Parse/RecDescent.pm diff --git a/lib/DBIx/FullTextSearch.pm b/lib/DBIx/FullTextSearch.pm new file mode 100644 index 000000000..328ae822e --- /dev/null +++ b/lib/DBIx/FullTextSearch.pm @@ -0,0 +1,1243 @@ +# -*- Mode: Perl; indent-tabs-mode: t; tab-width: 2 -*- + +=head1 NAME + +DBIx::FullTextSearch - Indexing documents with MySQL as storage + +=cut + +package DBIx::FullTextSearch; +use strict; +use Parse::RecDescent; + +use vars qw($errstr $VERSION $parse); +$errstr = undef; +$VERSION = '0.73'; + +use locale; + +my %DEFAULT_PARAMS = ( + 'num_of_docs' => 0, # statistical value, should be maintained + 'word_length' => 30, # max length of words we index + + 'protocol' => 40, # we only support protocol with the same numbers + + 'blob_direct_fetch' => 20, # with the blob store, when we stop searching + # and fetch everything at once + 'data_table' => undef, # table where the actual index is stored + 'name_length' => 255, # for filenames or URLs, what's the max length + + 'word_id_bits' => 16, # num of bits for word_id (column store) + 'doc_id_bits' => 16, # num of bits for doc_id + 'count_bits' => 8, # num of bits for count value + 'position_bits' => 32, # num of bits for word positions + + 'backend' => 'blob', # what database backend (way the data is + # stored) we use + 'frontend' => 'none', # what application frontend we use (how + # the index behaves externaly) + 'filter' => 'map { lc $_ }', + 'search_splitter' => '/(\w{2,$word_length}\*?)/g', + 'index_splitter' => '/(\w{2,$word_length})/g', + # can use the $word_length + # variable + 'init_env' => '' + ); +my %backend_types = ( + 'blob' => 'DBIx::FullTextSearch::Blob', + 'blobfast' => 'DBIx::FullTextSearch::BlobFast', + 'column' => 'DBIx::FullTextSearch::Column', + 'phrase' => 'DBIx::FullTextSearch::Phrase', + ); +my %frontend_types = ( + 'none' => 'DBIx::FullTextSearch', + 'default' => 'DBIx::FullTextSearch', + 'file' => 'DBIx::FullTextSearch::File', + 'string' => 'DBIx::FullTextSearch::String', + 'url' => 'DBIx::FullTextSearch::URL', + 'table' => 'DBIx::FullTextSearch::Table', + ); + +use vars qw! %BITS_TO_PACK %BITS_TO_INT %INT_TO_BITS !; +%BITS_TO_PACK = qw! 0 A0 8 C 16 S 32 L !; +%BITS_TO_INT = qw! 8 tinyint 16 smallint 24 mediumint 32 int 64 bigint !; +%INT_TO_BITS = map { ($BITS_TO_INT{$_} => $_ ) }keys %BITS_TO_INT; + +# Open reads in the information about existing index, creates an object +# in memory +sub open { + my ($class, $dbh, $TABLE) = @_; + $errstr = undef; + + # the $dbh is either a real dbh of a DBI->connect parameters arrayref + my $mydbh = 0; + if (ref $dbh eq 'ARRAY') { + if (not $dbh = DBI->connect(@$dbh)) { + $errstr = $DBI::errstr; return; + } + + $mydbh = 1; + } + + # load the parameters to the object + my %PARAMS = %DEFAULT_PARAMS; + my $sth = $dbh->prepare("select * from $TABLE"); + $sth->{'PrintError'} = 0; + $sth->{'RaiseError'} = 0; + $sth->execute or do { + if (not grep { $TABLE eq $_ } + DBIx::FullTextSearch->list_fts_indexes($dbh)) { + $errstr = "FullTextSearch index $TABLE doesn't exist."; + } else { + $errstr = $sth->errstr; + } + return; + }; + while (my ($param, $value) = $sth->fetchrow_array) { + $PARAMS{$param} = $value; + } + + my $self = bless { + 'dbh' => $dbh, + 'table' => $TABLE, + %PARAMS, + }, $class; + my $data_table = $self->{'data_table'}; + + # we should disconnect if we've opened the dbh here + if ($mydbh) { $self->{'disconnect_on_destroy'} = 1; } + + # some basic sanity check + if (not defined $dbh->selectrow_array("select count(*) from $data_table")) { + $errstr = "Table $data_table not found in the database\n"; + return; + } + + + # load and set the application frontend + my $front_module = $frontend_types{$PARAMS{'frontend'}}; + if (defined $front_module) { + if ($front_module ne $class) { + eval "use $front_module"; + die $@ if $@; + } + bless $self, $front_module; + $self->_open_tables; + } + else { + $errstr = "Specified frontend type `$PARAMS{'frontend'}' is unknown\n"; return; + } + + # load and set the backend (actual database access) module + my $back_module = $backend_types{$PARAMS{'backend'}}; + if (defined $back_module) { + eval "use $back_module"; + die $@ if $@; + $self->{'db_backend'} = $back_module->open($self); + } + else { + $errstr = "Specified backend type `$PARAMS{'backend'}' is unknown\n"; return; + } + + # load DBIx::FullTextSearch::StopList object (if specified) + if ($PARAMS{'stoplist'}) { + eval "use DBIx::FullTextSearch::StopList"; + die $@ if $@; + $self->{'stoplist'} = DBIx::FullTextSearch::StopList->open($dbh, $PARAMS{'stoplist'}); + } + + # load Lingua::Stem object (if specified) + if($PARAMS{'stemmer'}){ + eval "use Lingua::Stem"; + die $@ if $@; + $self->{'stemmer'} = Lingua::Stem->new(-locale => $PARAMS{'stemmer'}); + } + + # finally, return the object + $self; +} + +# Create creates tables in the database according to the options, then +# calls open to load the object to memory +sub create { + my ($class, $dbh, $TABLE, %OPTIONS) = @_; + $errstr = undef; + my $mydbh = 0; + if (ref $dbh eq 'ARRAY') { + $dbh = DBI->connect(@$dbh) + or do { $errstr = $DBI::errstr; return; }; + $mydbh = 1; + } + + my $self = bless { + 'dbh' => $dbh, + 'table' => $TABLE, + %DEFAULT_PARAMS, + %OPTIONS + }, $class; + + $self->{'data_table'} = $TABLE.'_data' + unless defined $self->{'data_table'}; + + # convert array reference to CSV string + $self->{'column_name'} = join(",",@{$self->{'column_name'}}) if ref($self->{'column_name'}) eq 'ARRAY'; + + my $CREATE_PARAM = <do($CREATE_PARAM) or do { $errstr = $dbh->errstr; return; }; + push @{$self->{'created_tables'}}, $TABLE; + + # load and set the frontend database structures + my $front_module = $frontend_types{$self->{'frontend'}}; + if (defined $front_module) { + eval "use $front_module"; + die $@ if $@; + bless $self, $front_module; + $errstr = $self->_create_tables; + if (defined $errstr) { + $self->clean_failed_create; warn $errstr; return; + } + } + else { + $errstr = "Specified frontend type `$self->{'frontend'}' is unknown\n"; $self->clean_failed_create; return; + } + + # create the backend database structures + my $back_module = $backend_types{$self->{'backend'}}; + if (defined $back_module) { + eval "use $back_module"; + die $@ if $@; + $errstr = $back_module->_create_tables($self); + if (defined $errstr) { + $self->clean_failed_create; warn $errstr; return; + } + } + else { + $errstr = "Specified backend type `$self->{'backend'}' is unknown\n"; $self->clean_failed_create; return; + } + + for (grep { not ref $self->{$_} } keys %$self) { + $dbh->do("insert into $TABLE values (?, ?)", {}, $_, $self->{$_}); + } + + return $class->open($dbh, $TABLE); +} + +sub _create_tables {} +sub _open_tables {} + +sub clean_failed_create { + my $self = shift; + my $dbh = $self->{'dbh'}; + for my $table (@{$self->{'created_tables'}}) { + $dbh->do("drop table $table"); + } +} + +sub drop { + my $self = shift; + my $dbh = $self->{'dbh'}; + for my $tag (keys %$self) { + next unless $tag =~ /(^|_)table$/; + $dbh->do("drop table $self->{$tag}"); + } + 1; +} + +sub empty { + my $self = shift; + my $dbh = $self->{'dbh'}; + + for my $tag (keys %$self) { + next unless $tag =~ /_table$/; + $dbh->do("delete from $self->{$tag}"); + } + $dbh->do("replace into $self->{'table'} values ('max_doc_id', 0)"); + return 1; +} + +sub errstr { + my $self = shift; + ref $self ? $self->{'errstr'} : $errstr; +} + +sub list_fts_indexes { + my ($class, $dbh) = @_; + my %tables = map { ( $_->[0] => 1 ) } + @{$dbh->selectall_arrayref('show tables')}; + my %indexes = (); + for my $table (keys %tables) { + local $dbh->{'PrintError'} = 0; + local $dbh->{'RaiseError'} = 0; + if ($dbh->selectrow_array("select param, value from $table + where param = 'data_table'")) { + $indexes{$table} = 1; + } + } + return sort keys %indexes; +} + +sub index_document { + my ($self, $id, $data) = @_; + return unless defined $id; + + my $dbh = $self->{'dbh'}; + + my $param_table = $self->{'table'}; + + my $adding_doc = 0; + + my $adding = 0; + if (not defined $self->{'max_doc_id'} or $id > $self->{'max_doc_id'}) { + $self->{'max_doc_id'} = $id; + my $update_max_doc_id_sth = + ( defined $self->{'update_max_doc_id_sth'} + ? $self->{'update_max_doc_id_sth'} + : $self->{'update_max_doc_id_sth'} = $dbh->prepare("replace into $param_table values (?, ?)")); + $update_max_doc_id_sth->execute('max_doc_id', $id); + $adding_doc = 1; + } + + my $init_env = $self->{'init_env'}; # use packages, etc. + eval $init_env if defined $init_env; + print STDERR "Init_env failed with $@\n" if $@; + + $data = '' unless defined $data; + return $self->{'db_backend'}->parse_and_index_data($adding_doc, + $id, $data); +} + +# used for backends that need a count for each of the words +sub parse_and_index_data_count { + my ($backend, $adding_doc, $id, $data) = @_; + ## note that this is run with backend object + my $self = $backend->{'fts'}; + + my $word_length = $self->{'word_length'}; + # this needs to get parametrized (lc, il2_to_ascii, parsing of + # HTML tags, ...) + + my %words; + my @data_sets = ref $data ? @$data : ($data); + + # We can just join the data sets together, since we don't care about position + my $data_string = join(" ", @data_sets); + + my $filter = $self->{'filter'} . ' $data_string =~ ' . $self->{'index_splitter'}; + my $stoplist = $self->{'stoplist'}; + my $stemmer = $self->{'stemmer'}; + my @words = eval $filter; + @words = grep !$stoplist->is_stop_word($_), @words if defined($stoplist); + @words = @{$stemmer->stem(@words)} if defined($stemmer); + for my $word ( @words ) { + $words{$word} = 0 if not defined $words{$word}; + $words{$word}++; + } + + my @result; + if ($adding_doc) { + @result = $backend->add_document($id, \%words); + } else { + @result = $backend->update_document($id, \%words); + } + + if (wantarray) { + return @result; + } + return $result[0]; +} + +# used for backends where list of occurencies is needed +sub parse_and_index_data_list { + my ($backend, $adding_doc, $id, $data) = @_; + ## note that this is run with backend object + my $self = $backend->{'fts'}; + + my $word_length = $self->{'word_length'}; + # this needs to get parametrized (lc, il2_to_ascii, parsing of + # HTML tags, ...) + + my %words; + my @data_sets = ref $data ? @$data : ($data); + + foreach my $data_set (@data_sets) { + my $filter = $self->{'filter'}.' $data_set =~ '.$self->{'index_splitter'}; + + my $i = 0; # $i stores the position(s) of each word in the document. + my $stoplist = $self->{'stoplist'}; + my $stemmer = $self->{'stemmer'}; + my @words = eval $filter; + @words = grep !$stoplist->is_stop_word($_), @words if defined($stoplist); + @words = @{$stemmer->stem(@words)} if defined($stemmer); + for my $word ( @words ) { + push @{$words{$word}}, ++$i; + } + # Make sure the data sets are considered far apart in position, to + # avoid phrase searches overlapping between table columns. + $i += 100; + } + + my @result; + if ($adding_doc) { + @result = $backend->add_document($id, \%words); + } else { + @result = $backend->update_document($id, \%words); + } + + if (wantarray) { + return @result; + } + return $result[0]; +} + +sub delete_document { + my $self = shift; + $self->{'db_backend'}->delete_document(@_); +} + +sub contains_hashref { + my $self = shift; + my $word_length = $self->{'word_length'}; + my $stemmer = $self->{'stemmer'}; + my $filter = $self->{'filter'}; + my $stoplist = $self->{'stoplist'}; + my @phrases; + for (@_){ + my $phrase; + my $splitter = ' map { ' . $self->{'search_splitter'} . ' } $_'; + my @words = eval $splitter; + @words = eval $filter.' @words'; + @words = grep !$stoplist->is_stop_word($_), @words if defined($stoplist); + if (defined($stemmer)){ + my @stemmed_words = (); + for (@words){ + if (m/\*$/){ + # wildcard search, make work with stemming + my $stem_word = $stemmer->stem($_); + for (@$stem_word){ + $_ .= "*"; + push @stemmed_words, $_; + } + } else { + push @stemmed_words, @{$stemmer->stem($_)}; + } + } + $phrase = join(' ',@stemmed_words); + } else { + $phrase = join(' ',@words); + } + # change wildcard to SQL version (* -> %) + $phrase =~ s/\*/%/g; + push @phrases, $phrase; + } + $self->{'db_backend'}->contains_hashref(@phrases); +} + +sub contains { + my $self = shift; + my $res = $self->contains_hashref(@_); + if (not $self->{'count_bits'}) { return keys %$res; } + return sort { $res->{$b} <=> $res->{$a} } keys %$res; +} + +sub econtains_hashref { + my $self = shift; + my $docs = {}; + my $word_num = 0; + + my $stoplist = $self->{'stoplist'}; + + my @plus_words = map { /^\+(.+)$/s } @_; + @plus_words = grep !$stoplist->is_stop_word($_), @plus_words if defined($stoplist); + + # required words + for my $word (@plus_words) { + $word_num++; + my $oneword = $self->contains_hashref($word); + if ($word_num == 1) { $docs = $oneword; next; } + for my $doc (keys %$oneword) { + $docs->{$doc} += $oneword->{$doc} if defined $docs->{$doc}; + } + for my $doc (keys %$docs) { + delete $docs->{$doc} unless defined $oneword->{$doc}; + } + } + + # optional words + for my $word ( map { /^([^+-].*)$/s } @_) { + my $oneword = $self->contains_hashref($word); + for my $doc (keys %$oneword) { + if (@plus_words) { + $docs->{$doc} += $oneword->{$doc} if defined $docs->{$doc}; + } + else { + $docs->{$doc} = 0 unless defined $docs->{$doc}; + $docs->{$doc} += $oneword->{$doc}; + } + } + } + + # prohibited words + for my $word ( map { /^-(.+)$/s } @_) { + my $oneword = $self->contains_hashref($word); + for my $doc (keys %$oneword) { + delete $docs->{$doc}; + } + } + $docs; +} + +sub econtains { + my $self = shift; + my $res = $self->econtains_hashref(@_); + if (not $self->{'count_bits'}) { return keys %$res; } + return sort { $res->{$b} <=> $res->{$a} } keys %$res; +} + +sub _search_terms { + my ($self, $query) = @_; + + if ($self->{'backend'} eq 'phrase') { + # phrase backend, must deal with quotes + + # handle + and - operations on phrases + $query =~ s/([\+\-])"/"$1/g; + + my $inQuote = 0; + my @phrases = (); + + my @blocks = split(/\"/, $query); + + # deal with quotes + for (@blocks){ + if($inQuote == 0){ + # we are outside quotes, search for individual words + push @phrases, split(' '); + } else { + # we are inside quote, search for whole phrase + push @phrases, $_; + } + $inQuote = ++$inQuote % 2; + } + return @phrases; + } else + { + # not phrase backend, don't deal with quotes + return split(' ', $query); + } +} + +sub _search_boolean { + my ($self, $query) = @_; + + unless ($parse) { + $::RD_AUTOACTION = q{ [@item] }; + my $grammar = q{ + expr : disj + disj : conj 'or' disj | conj + conj : unary 'and' conj | unary + unary : '(' expr ')' + | atom + atom : /([^\(\)\s]|\s(?!and)(?!or))+/ + }; + $parse = new Parse::RecDescent ($grammar); + } + my $tree = $parse->expr($query); + return $self->_search_in_tree($tree); +} + +sub _search_in_tree { + my ($self, $tree) = @_; + + if (ref($tree->[1]) && ref($tree->[3])) { + if (defined($tree->[2]) && $tree->[2] eq 'and') { + my $hash_ref1 = $self->_search_in_tree($tree->[1]); + my $hash_ref2 = $self->_search_in_tree($tree->[3]); + for my $k (keys %$hash_ref1) { + unless ($hash_ref2->{$k}) { + delete $hash_ref1->{$k}; + } else { + $hash_ref1->{$k} += $hash_ref2->{$k}; + } + } + return $hash_ref1; + } elsif (defined($tree->[2]) && $tree->[2] eq 'or') { + my $hash_ref1 = $self->_search_in_tree($tree->[1]); + my $hash_ref2 = $self->_search_in_tree($tree->[3]); + for my $k (keys %$hash_ref2) { + $hash_ref1->{$k} += $hash_ref2->{$k}; + } + return $hash_ref1; + } + return {}; + } elsif ($tree->[1] eq '(' && ref($tree->[2]) && $tree->[3] eq ')') { + return $self->_search_in_tree($tree->[2]); + } elsif (ref($tree->[1])) { + return $self->_search_in_tree($tree->[1]); + } elsif (defined($tree->[0]) && $tree->[0] eq 'atom') { + return $self->econtains_hashref($self->_search_terms($tree->[1])); + } else { + warn "Unknown tree nodes " . join("\t", @$tree); + return {}; + } +} + +sub search { + my ($self, $query) = @_; + if ($query =~ s/\b(and|or|not)\b/lc($1)/eig) { + return keys %{$self->_search_boolean($query)}; + } + return $self->econtains($self->_search_terms($query)); +} + +sub search_hashref { + my ($self, $query) = @_; + if ($query =~ s/\b(and|or|not)\b/lc($1)/eig) { + return $self->_search_boolean($query); + } + return $self->econtains_hashref($self->_search_terms($query)); +} + +sub document_count { + my $self = shift; + my $dbh = $self->{'dbh'}; + + my $SQL = qq{ + select distinct doc_id from $self->{'data_table'} + }; + my $ary_ref = $dbh->selectall_arrayref($SQL); + return scalar @$ary_ref; +} + +# find all words that are contained in at least $k % of all documents +sub common_word { + my $self = shift; + my $k = shift || 80; + $self->{'db_backend'}->common_word($k); +} + +sub DESTROY { + my $self = shift; + $self->{'db_backend'}->DESTROY() + if (exists $self->{'db_backend'} && $self->{'db_backend'} &&$self->{'db_backend'}->can('DESTROY')); +} + +1; + +=head1 SYNOPSIS + +DBIx::FullTextSearch uses a MySQL database backend to index files, web +documents and database fields. Supports must include, can include, and cannot +include words and phrases. Support for boolean (AND/OR) queries, stop words and stemming. + + use DBIx::FullTextSearch; + use DBI; + # connect to database (regular DBI) + my $dbh = DBI->connect('dbi:mysql:database', 'user', 'passwd'); + + # create a new stoplist + my $sl = DBIx::FullTextSearch::StopList->create_default($dbh, 'sl_en', 'English'); + + # create a new index with default english stoplist and english stemmer + my $fts = DBIx::FullTextSearch->create($dbh, 'fts_web_1', + frontend => 'string', backend => 'blob', + stoplist => 'sl_en', stemmer => 'en-us'); + # or open existing one + # my $fts = DBIx::FullTextSearch->open($dbh, 'fts_web_1'); + + # index documents + $fts->index_document('krtek', 'krtek leze pod zemi'); + $fts->index_document('jezek', 'Jezek ma ostre bodliny.'); + + # search for matches + my @docs = $fts->contains('foo'); + my @docs = $fts->econtains('+foo', '-Bar'); + my @docs = $fts->search('+foo -Bar'); + my @docs = $fts->search('foo AND (bar OR baz)'); + +=head1 DESCRIPTION + +DBIx::FullTextSearch is a flexible solution for indexing contents of documents. +It uses the MySQL database to store the information about words and +documents and provides Perl interface for indexing new documents, +making changes and searching for matches. For DBIx::FullTextSearch, a document +is nearly anything -- Perl scalar, file, Web document, database field. + +The basic style of interface is shown above. What you need is a MySQL +database and a L with L. Then you create a DBIx::FullTextSearch index +-- a set of tables that maintain all necessary information. Once created +it can be accessed many times, either for updating the index (adding +documents) or searching. + +DBIx::FullTextSearch uses one basic table to store parameters of the index. Second +table is used to store the actual information about documents and words, +and depending on the type of the index (specified during index creation) +there may be more tables to store additional information (like +conversion from external string names (eg. URL's) to internal numeric +form). For a user, these internal thingies and internal behaviour of the +index are not important. The important part is the API, the methods to +index document and ask questions about words in documents. However, +certain understanding of how it all works may be usefull when you are +deciding if this module is for you and what type of index will best +suit your needs. + +=head2 Frontends + +From the user, application point of view, the DBIx::FullTextSearch index stores +documents that are named in a certain way, allows adding new documents, +and provides methods to ask: "give me list of names of documents that +contain this list of words". The DBIx::FullTextSearch index doesn't store the +documents itself. Instead, it stores information about words in the +documents in such a structured way that it makes easy and fast to look +up what documents contain certain words and return names of the +documents. + +DBIx::FullTextSearch provides a couple of predefined frontend classes that specify +various types of documents (and the way they relate to their names). + +=over 4 + +=item default + +By default, user specifies the integer number of the document and the +content (body) of the document. The code would for example read + + $fts->index_document(53, 'zastavujeme vyplaty vkladu'); + +and DBIx::FullTextSearch will remember that the document 53 contains three words. +When looking for all documents containing word (string) vklad, a call + + my @docs = $fts->contains('vklad*'); + +would return numbers of all documents containing words starting with +'vklad', 53 among them. + +So here it's user's responsibility to maintain a relation between the +document numbers and their content, to know that a document 53 is about +vklady. Perhaps the documents are already stored somewhere and have +unique numeric id. + +Note that the numeric id must be no larger than 2^C. + +=item string + +Frontend B allows the user to specify the names of the documents as +strings, instead of numbers. Still the user has to specify both the +name of the document and the content: + + $fts->index_document('foobar', + 'the quick brown fox jumped over lazy dog!'); + +After that, + + $fts->contains('dog') + +will return 'foobar' as one of the names of documents with word +'dog' in it. + +=item file + +To index files, use the frontend B. Here the content of the document +is clearly the content of the file specified by the filename, so in +a call to index_document, only the name is needed -- the content of the +file is read by the DBIx::FullTextSearch transparently: + + $fts->index_document('/usr/doc/FAQ/Linux-FAQ'); + my @files = $fts->contains('penguin'); + +=item url + +Web document can be indexed by the frontend B. DBIx::FullTextSearch uses L to +get the document and then parses it normally: + + $fts->index_document('http://www.perl.com/'); + +Note that the HTML tags themselves are indexed along with the text. + +=item table + +You can have a DBIx::FullTextSearch index that indexes char or blob fields in MySQL +table. Since MySQL doesn't support triggers, you have to call the +C method of DBIx::FullTextSearch any time something changes +in the table. So the sequence probably will be + + $dbh->do('insert into the_table (id, data, other_fields) + values (?, ?, ?)', {}, $name, $data, $date_or_something); + $fts->index_document($name); + +When calling C, the id (name) of the record will be returned. If +the id in the_table is numeric, it's directly used as the internal +numeric id, otherwise a string's way of converting the id to numeric +form is used. + +When creating this index, you'll have to pass it three additionial options, +C, C, and C. You may use the optional +column_process option to pre-process data in the specified columns. + +=back + +The structure of DBIx::FullTextSearch is very flexible and adding new frontend +(what will be indexed) is very easy. + +=head2 Backends + +While frontend specifies what is indexed and how the user sees the +collection of documents, backend is about low level database way of +actually storing the information in the tables. Three types are +available: + +=over 4 + +=item blob + +For each word, a blob holding list of all documents containing that word +is stored in the table, with the count (number of occurencies) +associated with each document number. That makes it for very compact +storage. Since the document names (for example URL) are internally +converted to numbers, storing and fetching the data is fast. However, +updating the information is very slow, since information concerning one +document is spread across all table, without any direct database access. +Updating a document (or merely reindexing it) requires update of all +blobs, which is slow. + +The list of documents is stored sorted by document name so that +fetching an information about a document for one word is relatively +easy, still a need to update (or at least scan) all records in the table +makes this storage unsuitable for collections of documents that often +change. + +=item column + +The B backend stores a word/document pair in database fields, +indexing both, thus allowing both fast retrieval and updates -- it's +easy to delete all records describing one document and insert new ones. +However, the database indexes that have to be maintained are large. + +Both B and B backends only store a count -- number of +occurencies of the word in the document (and even this can be switched +off, yielding just a yes/no information about the word's presence). +This allows questions like + + all documents containing words 'voda' or 'Mattoni' + but not a word 'kyselka' + +but you cannot ask whether a document contains a phrase 'kyselka +Mattoni' because such information is not maintained by these types of +backends. + +=item phrase + +To allow phrase matching, a B backend is available. For each word +and document number it stores a blob of lists of positions of the word +in the document. A query + + $fts->contains('kyselk* Mattoni'); + +then only returns those documents (document names/numbers) where word +kyselka (or kyselky, or so) is just before word Mattoni. + +=back + +=head2 Mixing frontends and backends + +Any frontend can be used with any backend in one DBIx::FullTextSearch index. You +can index Web documents with C frontend and C backend +to be able to find phrases in the documents. And you can use the +default, number based document scheme with C backend to use the disk +space as efficiently as possible -- this is usefull for example for +mailing-list archives, where we need to index huge number of documents +that do not change at all. + +Finding optimal combination is very important and may require some +analysis of the document collection and manipulation, as well as the +speed and storage requirements. Benchmarking on actual target platform +is very useful during the design phase. + +=head1 METHODS + +The following methods are available on the user side as DBIx::FullTextSearch API. + +=over 4 + +=item create + + my $fts = DBIx::FullTextSearch->create($dbh, $index_name, %opts); + +The class method C creates index of given name (the name of the +index is the name of its basic parameter table) and all necessary +tables, returns an object -- newly created index. The options that may +be specified after the index name define the frontend and backend types, +storage parameters (how many bits for what values), etc. See below for +list of create options and discussion of their use. + +=item open + + my $fts = DBIx::FullTextSearch->open($dbh, $index_name); + +Opens and returns object, accessing specifies DBIx::FullTextSearch index. Since all +the index parameters and information are stored in the C<$index_name> table +(including names of all other needed tables), the database handler and +the name of the parameter table are the only needed arguments. + +=item index_document + + $fts->index_document(45, 'Sleva pri nakupu stribra.'); + $fts->index_document('http://www.mozilla.org/'); + $fts->index_document('http://www.mozilla.org/','This is the mozilla web site'); + +For the C and C frontends, two arguments are expected -- the +name (number or string) of the document and its content. For C, +C, and C frontends the content is optional. Any content that you pass +will be appended to the content from the file, URL, or database table. + +=item delete_document + + $fts->delete_document('http://www.mozilla.org/'); + +Removes information about document from the index. Note that for C +backend this is very time consuming process. + +=item contains + + my @docs = $fts->contains('sleva', 'strib*'); + +Returns list of names (numbers or strings, depending on the frontend) +of documents that contain some of specified words. + +=item econtains + + my @docs = $fts->contains('foo', '+bar*', '-koo'); + +Econtains stands for extended contains and allows words to be prefixed +by plus or minus signs to specify that the word must or mustn't be +present in the document for it to match. + +=item search + + my @docs = $fts->search(qq{+"this is a phrase" -koo +bar foo}); + my @docs = $fts->search("(foo OR baz) AND (bar OR caz)"); + +This is a wrapper to econtains which takes a user input string and parses +it into can-include, must-include, and must-not-include words and phrases. +It also can handle boolean (AND/OR) queries. + +=item contains_hashref, econtains_hashref, search_hashref + +Similar to C, C and C, +only instead of list of document +names, these methods return a hash reference to a hash where keys are +the document names and values are the number of occurencies of the +words. + +=item drop + +Removes all tables associated with the index, including the base +parameter table. Effectivelly destroying the index form the database. + + $fts->drop; + +=item empty + +Emptys the index so you can reindex the data. + + $fts->empty; + +=back + +=head1 INDEX OPTIONS + +Here we list the options that may be passed to C method. +These allow to specify the style and storage parameters in great detail. + +=over 4 + +=item backend + +The backend type, default C, possible values C, C and C +(see above for explanation). + +=item frontend + +The frontend type. The C frontend requires the user to specify +numeric id of the document together with the content of the document, +other possible values are C, C and C (see above for +more info). + +=item word_length + +Maximum length of words that may be indexed, default 30. + +=item data_table + +Name of the table where the actual data about word/document relation is +stored. By default, the name of the index (of the base table) with _data +suffix is used. + +=item name_length + +Any frontend that uses strings as names of documents needs to maintain +a conversion table from these names to internal integer ids. This value +specifies maximum length of these string names (URLs, file names, ...). + +=item blob_direct_fetch + +Only for C backend. When looking for information about specific +document in the list stored in the blob, the blob backend uses division +of interval to find the correct place in the blob. When the interval +gets equal or shorter that this value, all values are fetched from the +database and the final search is done in Perl code sequentially. + +=item word_id_bits + +With C or C backends, DBIx::FullTextSearch maintains a numeric id for each +word to optimize the space requirements. The word_id_bits parameter +specifies the number of bits to reserve for this conversion and thus +effectively limits number of distinct words that may be indexed. The +default is 16 bits and possible values are 8, 16, 24 or 32 bits. + +=item word_id_table + +Name of the table that holds conversion from words to their numeric id +(for C and C backends). By default is the name of the index +with _words suffix. + +=item doc_id_bits + +A number of bits to hold a numeric id of the document (that is either +provided by the user (with C frontend) or generated by the module +to accomplish the conversion from the string name of the document). This +value limits the maximum number of documents to hold. The default is 16 +bits and possible values are 8, 16 and 32 bits for C backend and 8, +16, 24 and 32 bits for C and C backends. + +=item doc_id_table + +Name of the table that holds conversion from string names of documents +to their numeric id, by default the name of the index with _docid +suffix. + +=item count_bits + +Number of bits reserved for storing number of occurencies of each word +in the document. The default is 8 and possible values are the same as +with doc_id_bits. + +=item position_bits + +With C, DBIx::FullTextSearch stores positions of each word of the +documents. This value specifies how much space should be reserved for +this purpose. The default is 32 bits and possible values are 8, 16 or 32 +bits. This value limits the maximum number of words of each document +that can be stored. + +=item index_splitter + +DBIx::FullTextSearch allows the user to provide any Perl code that will be used to +split the content of the document to words when indexing documents. +The code will be evalled inside of the DBIx::FullTextSearch code. The default is + + /(\w{2,$word_length})/g + +and shows that the input is stored in the variable C<$data> and the code +may access any other variable available in the perl_and_index_data_* +methods (see source), especially C<$word_length> to get the maximum length +of words and C<$backend> to get the backend object. + +The default value also shows that by default, the minimum length of +words indexed is 2. + +=item search_splitter + +This is similar to the C method, +except that it is used in the C method +when searching for documents instead of when indexing documents. The default is + + /(\w{2,$word_length}\*?)/g + +Which, unlike the default C, allows for the wild card character (*). + +=item filter + +The output words of splitter (and also any parameter of (e)contains* +methods) are send to filter that may do further processing. Filter is +again a Perl code, the default is + + map { lc $_ } + +showing that the filter operates on input list and by default does +conversion to lowercase (yielding case insensitive index). + +=item init_env + +Because user defined splitter or filter may depend on other things that +it is reasonable to set before the actual procession of words, you can +use yet another Perl hook to set things up. The default is no initialization +hook. + +=item stoplist + +This is the name of a L object that is used +for stop words. + +=item stemmer + +If this option is set, then word stemming will be enabled in the indexing and searching. + +The value is the name of a L recognized locale. +Currently, 'en', 'en-us' and 'en-uk' are the only recognized locales. +All locale identifiers are converted to lowercase. + +=item table_name + +For C
frontend; this is the name of the table that will be indexed. + +=item column_name + +For C
frontend; this is a reference to an array of columns in the +C that contains the documents -- data to be indexed. It can +also have a form table.column that will be used if the C +option is not specified. + +=item column_id_name + +For C
frontend; this is the name of the field in C that +holds names (ids) of the records. If not specified, a field that has +primary key on it is used. If this field is numeric, it's values are +directly used as identifiers, otherwise a conversion to numeric values +is made. + +=back + +=head1 NOTES + +To handle internationalization, it may help to use the following in your code +(for example Spanish in Chile): + + use POSIX; + my $loc = POSIX::setlocale( &POSIX::LC_ALL, "es_CL" ); + +I haven't tested this, so I would be interested in hearing whether this +works. + +=head1 ERROR HANDLING + +The create and open methods return the DBIx::FullTextSearch object on success, upon +failure they return undef and set error message in C<$DBIx::FullTextSearch::errstr> +variable. + +All other methods return reasonable (documented above) value on success, +failure is signalized by unreasonable (typically undef or null) return +value; the error message may then be retrieved by C<$fts-Eerrstr> method +call. + +=head1 VERSION + +This documentation describes DBIx::FullTextSearch module version 0.73. + +=head1 BUGS + +Error handling needs more polishing. + +We do not check if the stored values are larger that specified by the +*_bits parameters. + +No CGI administration tool at the moment. + +No scoring algorithm implemented. + +=head1 DEVELOPMENT + +These modules are under active development. +If you would like to contribute, please e-mail tjmather@maxmind.com + +There are two mailing lists for this module, one for users, and another for developers. To subscribe, +visit http://sourceforge.net/mail/?group_id=8645 + +=head1 AUTHOR + +(Original) Jan Pazdziora, adelton@fi.muni.cz, +http://www.fi.muni.cz/~adelton/ at Faculty of Informatics, Masaryk University in Brno, Czech +Republic + +(Current Maintainer) T.J. Mather, tjmather@maxmind.com, +http://www.maxmind.com/app/opensourceservices Princeton, NJ USA + +Paid support is available from directly from the maintainers of this package. +Please see L for more details. + +=head1 CREDITS + +Fixes, Bug Reports, Docs have been generously provided by: + + Vladimir Bogdanov + Ade Olonoh + Kate Pugh + Sven Paulus + Andrew Turner + Tom Bille + Joern Reder + Tarik Alkasab + Dan Collis Puro + Tony Bowden + Mario Minati + Miroslav Suchý + Stephen Patterson + Joern Reder + Hans Poo + +Of course, big thanks to Jan Pazdziora, the original author of this +module. Especially for providing a clean, modular code base! + +=head1 COPYRIGHT + +All rights reserved. This package is free software; you can +redistribute it and/or modify it under the same terms as Perl itself. + +=head1 SEE ALSO + +L, +L + +=head1 OTHER PRODUCTS and why I've written this module + +I'm aware of L and L +modules and about UdmSearch utility, and +about htdig and glimpse on the non-database side of the world. + +To me, using a database gives reasonable maintenance benefits. With +products that use their own files to store the information (even if the +storage algorithms are efficient and well thought of), you always +struggle with permissions on files and directories for various users, +with files that somebody accidently deleted or mungled, and making the +index available remotely is not trivial. + +That's why I've wanted a module that will use a database as a storage +backend. With MySQL, you get remote access and access control for free, +and on many web servers MySQL is part of the standard equipment. So +using it for text indexes seemed natural. + +However, existing L and UdmSearch are too narrow-aimed to +me. The first only supports indexing of data that is stored in the +database, but you may not always want or need to store the documents in +the database as well. The UdmSearch on the other hand is only for web +documents, making it unsuitable for indexing mailing-list archives or +local data. + +I believe that DBIx::FullTextSearch is reasonably flexible and still very +efficient. It doesn't enforce its own idea of what is good for you -- +the number of options is big and you can always extend the module with +your own backend of frontend if you feel that those provided are not +sufficient. Or you can extend existing by adding one or two parameters +that will add new features. Of course, patches are always welcome. +DBIx::FullTextSearch is a tool that can be deployed in many projects. It's not +a complete environment since different people have different needs. On +the other hand, the methods that it provides make it easy to build +a complete solution on top of this in very short course of time. + +=cut + diff --git a/lib/DBIx/FullTextSearch/Blob.pm b/lib/DBIx/FullTextSearch/Blob.pm new file mode 100644 index 000000000..445f60897 --- /dev/null +++ b/lib/DBIx/FullTextSearch/Blob.pm @@ -0,0 +1,263 @@ + +package DBIx::FullTextSearch::Blob; +use strict; + +# Open in the backend just sets the object +sub open { + my ($class, $fts) = @_; + return bless { 'fts' => $fts }, $class; +} +# Create creates the table(s) according to the parameters +sub _create_tables { + my ($class, $fts) = @_; + my $CREATE_DATA = <{'data_table'} ( + word varchar($fts->{'word_length'}) binary + default '' not null, + idx longblob default '' not null, + primary key (word) + ) +EOF + my $dbh = $fts->{'dbh'}; + $dbh->do($CREATE_DATA) or return $dbh->errstr; + push @{$fts->{'created_tables'}}, $fts->{'data_table'}; + return; +} + +sub add_document { + my ($self, $id, $words) = @_; + my $fts = $self->{'fts'}; + my $dbh = $fts->{'dbh'}; + my $data_table = $fts->{'data_table'}; + + my $update_sth = ( defined $self->{'adding_update_sth'} + ? $self->{'adding_update_sth'} + : $self->{'adding_update_sth'} = $dbh->prepare( + "update $data_table set idx = concat(idx, ?) + where word = ?") ); + + my @insert_values; + + my $packstring = $DBIx::FullTextSearch::BITS_TO_PACK{$fts->{'doc_id_bits'}} + . $DBIx::FullTextSearch::BITS_TO_PACK{$fts->{'count_bits'}}; + my $num_words = 0; + for my $word ( keys %$words ) { +### print STDERR "$word($id) adding\n"; + # here we will want to parametrize the bit size of the + # data + my $value = pack $packstring, $id, $words->{$word}; + my $rows = $update_sth->execute($value, $word); + push @insert_values, $word, $value if $rows == 0; + $num_words += $words->{$word}; + } + + if(@insert_values){ + my $sql_str = "insert into $data_table values ". join(',', ('(?, ?)') x (@insert_values/2)); + $dbh->do($sql_str,{},@insert_values); + } + + return $num_words; +} + +sub delete_document { + my $self = shift; + for my $id (@_) { $self->update_document($id, {}); } +} + +sub update_document { + my ($self, $id, $words) = @_; + my $fts = $self->{'fts'}; + my $dbh = $fts->{'dbh'}; + my $data_table = $fts->{'data_table'}; + + my $insert_sth = ( defined $self->{'insert_sth'} + ? $self->{'insert_sth'} + : $self->{'insert_sth'} = $dbh->prepare(" + insert into $data_table values (?, ?)") ); + + my $update_sth = ( defined $self->{'update_update_sth'} + ? $self->{'update_update_sth'} + : $self->{'update_update_sth'} = + $dbh->prepare("update $data_table set idx = + concat(substring(idx, 1, ?), ?, substring(idx, ?)) + where word = ?") ); + + + my @insert_values; + + $dbh->do("lock tables $data_table write"); + + my $select_sth = $dbh->prepare("select word from $data_table"); + $select_sth->execute; + + my $packstring = $DBIx::FullTextSearch::BITS_TO_PACK{$fts->{'doc_id_bits'}} + . $DBIx::FullTextSearch::BITS_TO_PACK{$fts->{'count_bits'}}; + my ($packnulls) = pack $packstring, 0, 0; + my $packlength = length $packnulls; + my $num_words = 0; + while (my ($word) = $select_sth->fetchrow_array) { + my $value = (defined $words->{$word} ? + pack($packstring, $id, $words->{$word}) : ''); + + # the method find_position finds the position of the + # "record" for document $id with word $word; returned is + # the position in bytes and yes/no values specifying if + # the record is already present in the blob; if it is, + # we need to replace it, otherwise just insert. + + my ($pos, $shift) = $self->find_position($word, $id); + if (not defined $pos) { + push @insert_values, $word, $value; + } + else { + my $spos = $pos + 1; # I'm not sure why this + $spos += $packlength if $shift; + $update_sth->execute($pos, $value, $spos, $word); + } + delete $words->{$word}; + $num_words++ if defined $value; + } + + for my $word ( keys %$words ) { + my $value = pack $packstring, $id, $words->{$word}; + push @insert_values, $word, $value; +# $insert_sth->execute($word, $value); + $num_words++; + } + + if(@insert_values){ + my $sql_str = "insert into $data_table values ". join(',', ('(?, ?)') x (@insert_values/2)); + $dbh->do($sql_str,{},@insert_values); + } + + $dbh->do("unlock tables"); + + return $num_words; +} + +sub find_position { + my ($self, $word, $id) = @_; + # here, with the calculation of where in the blob we have the + # docid and where the count of words and how long they are, we + # should really look at the parameters (num of bits of various + # structures and values) given to create + + my $fts = $self->{'fts'}; + my $dbh = $fts->{'dbh'}; + my $data_table = $fts->{'data_table'}; + + # Sth to read the length of the blob holding the document/count info + my $get_length_sth = ( defined $self->{'get_length_sth'} + ? $self->{'get_length_sth'} + : $self->{'get_length_sth'} = $dbh->prepare("select + length(idx) from $data_table where word = ?")); + my $length = $dbh->selectrow_array($get_length_sth, {}, $word); + + my $packstring = $DBIx::FullTextSearch::BITS_TO_PACK{$fts->{'doc_id_bits'}} + . $DBIx::FullTextSearch::BITS_TO_PACK{$fts->{'count_bits'}}; + my ($packnulls) = pack $packstring, 0, 0; + my $packlength = length $packnulls; + + if (not defined $length) { return; } + $length = int($length/$packlength); + + my ($bot, $top, $med, $val) = (0, $length); + + if (not defined $fts->{'max_doc_id'}) + { $med = int(($top - $bot) / 2); } + else + { $med = int($top * $id / $fts->{'max_doc_id'}); } + + my $blob_direct_fetch = $fts->{'blob_direct_fetch'}; + # we divide the interval + while ($bot != $top) { + $med = $top - 1 if $med >= $top; + $med = $bot if $med < $bot; + + if ($top - $bot <= $blob_direct_fetch) { + my $get_interval_sth = ( + defined $self->{'get_interval_sth'} + ? $self->{'get_interval_sth'} + : $self->{'get_interval_sth'} = $dbh->prepare("select substring(idx,?,?) from $data_table where word = ?")); + my $alldata = $dbh->selectrow_array($get_interval_sth, + {}, + $bot * $packlength + 1, + ($top - $bot) * $packlength, + $word); + return unless defined $alldata; + + my @docs; + my $i = 0; + while ($i < length $alldata) { + push @docs, unpack $packstring, + substr $alldata, $i, $packlength; + $i += $packlength; + } + for (my $i = 0; $i < @docs; $i += 2) { + if ($docs[$i] == $id) { return (($bot+($i/2))*$packlength, 1); } + if ($docs[$i] > $id) { return (($bot+($i/2))*$packlength, 0); } + } + return ($top * $packlength, 0); + } + ($val) = $dbh->selectrow_array( + "select substring(idx, ?, 2) from $data_table + where word = ?", {}, ($med * $packlength) + 1, $word); + ($val) = unpack $packstring, $val; + + if (not defined $val) { return; } + if ($val == $id) { return ($med * $packlength, 1); } + + elsif ($val < $id) { $bot = $med + 1; } + else { $top = $med; } + + $med = int($med * $id / $val); + } + return ($bot * $packlength, 0); +} + +sub contains_hashref { + my $self = shift; + my $fts = $self->{'fts'}; + my $dbh = $fts->{'dbh'}; + my $data_table = $fts->{'data_table'}; + + my $packstring = $DBIx::FullTextSearch::BITS_TO_PACK{$fts->{'doc_id_bits'}} + . $DBIx::FullTextSearch::BITS_TO_PACK{$fts->{'count_bits'}}; + my ($packnulls) = pack $packstring, 0, 0; + my $packlength = length $packnulls; + + my $sth = ( defined $self->{'get_idx_sth'} ? + $self->{'get_idx_sth'} : + $self->{'get_idx_sth'} = + $dbh->prepare( + "select idx from $data_table where word like ?" + )); + + my $out = {}; + for my $word (@_) { + $sth->execute($word); + while (my ($blob) = $sth->fetchrow_array) { + next unless defined $blob; + my @data; + my $i = 0; + while ($i < length $blob) { + push @data, unpack $packstring, + substr $blob, $i, $packlength; + $i += $packlength; + } + while (@data) { + my $doc = shift @data; + my $count = shift @data; + unless (defined $out->{$doc}) { $out->{$doc} = 0; } + $out->{$doc} += $count; + } + } + $sth->finish; + } + $out; +} + +*parse_and_index_data = \&DBIx::FullTextSearch::parse_and_index_data_count; + +1; + diff --git a/lib/DBIx/FullTextSearch/BlobFast.pm b/lib/DBIx/FullTextSearch/BlobFast.pm new file mode 100644 index 000000000..3494bd21a --- /dev/null +++ b/lib/DBIx/FullTextSearch/BlobFast.pm @@ -0,0 +1,83 @@ + +package DBIx::FullTextSearch::BlobFast; +use DBIx::FullTextSearch::Blob; +use vars qw! @ISA !; +@ISA = qw! DBIx::FullTextSearch::Blob !; +use strict; + +sub delete_document { + my $self = shift; + my $fts = $self->{'fts'}; + my $dbh = $fts->{'dbh'}; + my $data_table = $fts->{'data_table'}; + + my $packstring = $DBIx::FullTextSearch::BITS_TO_PACK{$fts->{'doc_id_bits'}} + . $DBIx::FullTextSearch::BITS_TO_PACK{$fts->{'count_bits'}}; + my $value = ''; + for my $id (@_) { + $value .= pack $packstring, $id, 0; + } + $dbh->do(" + update $data_table set idx = concat(idx, ?) + ", {}, $value); + 1; +} + +sub update_document { + my $self = shift; + $self->delete_document($_[0]); + $self->add_document(@_); +} + +sub contains_hashref { + my $self = shift; + my $fts = $self->{'fts'}; + my $dbh = $fts->{'dbh'}; + my $data_table = $fts->{'data_table'}; + + my $packstring = $DBIx::FullTextSearch::BITS_TO_PACK{$fts->{'doc_id_bits'}} + . $DBIx::FullTextSearch::BITS_TO_PACK{$fts->{'count_bits'}}; + my ($packnulls) = pack $packstring, 0, 0; + my $packlength = length $packnulls; + + my $sth = ( defined $self->{'get_idx_sth'} ? + $self->{'get_idx_sth'} : + $self->{'get_idx_sth'} = + $dbh->prepare( + "select idx from $data_table where word like ?" + )); + + my $out = {}; + for my $word (@_) { + $sth->execute($word); + while (my ($blob) = $sth->fetchrow_array) { + next unless defined $blob; + my %docs = (); + my @data; + my $i = length($blob) - $packlength; + while ($i >= 0) { + my ($doc_id, $count) = + unpack "\@$i$packstring", $blob; +### print STDERR "$doc_id $count\n"; + $i -= $packlength; + next if exists $docs{$doc_id}; + $docs{$doc_id} = 1; + next unless $count; + push @data, $doc_id, $count; + } + while (@data) { + my $doc = shift @data; + my $count = shift @data; + unless (defined $out->{$doc}) { $out->{$doc} = 0; } + $out->{$doc} += $count; + } + } + $sth->finish; + } + $out; +} + +*parse_and_index_data = \&DBIx::FullTextSearch::parse_and_index_data_count; + +1; + diff --git a/lib/DBIx/FullTextSearch/Column.pm b/lib/DBIx/FullTextSearch/Column.pm new file mode 100644 index 000000000..bc8b41e95 --- /dev/null +++ b/lib/DBIx/FullTextSearch/Column.pm @@ -0,0 +1,188 @@ + +package DBIx::FullTextSearch::Column; +use strict; + +# Open in the backend just sets the object +sub open { + my ($class, $fts) = @_; + return bless { 'fts' => $fts }, $class; +} + +sub DESTROY { + my ($self) = @_; + if (defined $self->{'select_wordid_sth'}) { + $self->{'select_wordid_sth'}->finish(); + } +} + +# Create creates the table(s) according to the parameters +sub _create_tables { + my ($class, $fts) = @_; + my $COUNT_FIELD = ''; + if ($fts->{'count_bits'}) { + $COUNT_FIELD = "count $DBIx::FullTextSearch::BITS_TO_INT{$fts->{'count_bits'}} unsigned," + } + my $CREATE_DATA = <{'data_table'} ( + word_id $DBIx::FullTextSearch::BITS_TO_INT{$fts->{'word_id_bits'}} unsigned not null, + doc_id $DBIx::FullTextSearch::BITS_TO_INT{$fts->{'doc_id_bits'}} unsigned not null, + $COUNT_FIELD + index (word_id), + index (doc_id) + ) +EOF + + $fts->{'word_id_table'} = $fts->{'table'}.'_words' + unless defined $fts->{'word_id_table'}; + + + my $CREATE_WORD_ID = <{'word_id_table'} ( + word varchar($fts->{'word_length'}) binary + default '' not null, + id $DBIx::FullTextSearch::BITS_TO_INT{$fts->{'word_id_bits'}} unsigned not null auto_increment, + primary key (id), + unique (word) + ) +EOF + + my $dbh = $fts->{'dbh'}; + $dbh->do($CREATE_DATA) or return $dbh->errstr; + push @{$fts->{'created_tables'}}, $fts->{'data_table'}; + $dbh->do($CREATE_WORD_ID) or return $dbh->errstr; + push @{$fts->{'created_tables'}}, $fts->{'word_id_table'}; + return; +} + +sub add_document { + my ($self, $id, $words) = @_; + my $fts = $self->{'fts'}; + my $dbh = $fts->{'dbh'}; + my $word_id_table = $fts->{'word_id_table'}; + if (not defined $self->{'select_wordid_sth'}) { + $self->{'select_wordid_sth'} = $dbh->prepare(" + select id from $word_id_table where word = ? + "); + } + my $data_table = $fts->{'data_table'}; + my $count_bits = $fts->{'count_bits'}; + my $num_words = 0; + my (@wids,@data,@widshandler,@datahandler); + my $wordid; + $dbh->do("lock tables $word_id_table write"); + my ($maxid) = $dbh->selectrow_array("select max(id) + from $word_id_table"); + foreach my $word (keys %$words) { + if(!defined $self->{'wordids'}->{$word}) { + $self->{'select_wordid_sth'}->execute($word); + ($wordid) = $self->{'select_wordid_sth'}->fetchrow_array(); + unless ($wordid) { + $maxid++; + push @widshandler, "(?,$maxid)"; + push @wids, $word; + $wordid = $maxid; + } + $self->{'wordids'}->{$word} = $wordid; + } else { + $wordid=$self->{'wordids'}->{$word}; + } + if ($count_bits) { + push @datahandler, "($wordid,$id,?)"; + push @data, $words->{$word}; + } else { + push @datahandler, "($wordid,$id)"; + } + $num_words++; + }; + $dbh->do("insert into $word_id_table values " . + join (',',@widshandler),undef,@wids) if @wids; + $dbh->do("unlock tables"); + if ($count_bits) { + $dbh->do("insert into $data_table values " . join (',',@datahandler),undef,@data) if @data; + } else { + $dbh->do("insert into $data_table values " . join (',',@datahandler)) if @datahandler; + } + return $num_words; +} + +sub delete_document { + my $self = shift; + my $fts = $self->{'fts'}; + my $dbh = $fts->{'dbh'}; + my $data_table = $fts->{'data_table'}; + my $sth = $dbh->prepare("delete from $data_table where doc_id = ?"); + for my $id (@_) { $sth->execute($id); } +} + +sub update_document { + my ($self, $id, $words) = @_; + $self->delete_document($id); + $self->add_document($id, $words); +} + +sub contains_hashref { + my $self = shift; + my $fts = $self->{'fts'}; + my $dbh = $fts->{'dbh'}; + my $data_table = $fts->{'data_table'}; + my $word_id_table = $fts->{'word_id_table'}; + + my $count_bits = $fts->{'count_bits'}; + my $sth = ( defined $self->{'get_data_sth'} + ? $self->{'get_data_sth'} + : ( $count_bits + ? ( $self->{'get_data_sth'} = $dbh->prepare( + "select doc_id, count + from $data_table, $word_id_table + where word like ? + and id = word_id" ) ) + : ( $self->{'get_data_sth'} = $dbh->prepare( + "select doc_id, 1 + from $data_table, $word_id_table + where word like ? + and id = word_id" ) ) + ) ); + + my $out = {}; + for my $word (@_) { + $sth->execute($word); + while (my ($doc, $count) = $sth->fetchrow_array) { + $out->{$doc} += $count; + } + $sth->finish; + } + $out; +} + +sub common_word { + my ($self, $k) = @_; + my $fts = $self->{'fts'}; + my $dbh = $fts->{'dbh'}; + + my $num = $fts->document_count; + + $k /= 100; + + my $SQL = <{'data_table'} + group by word_id + having k >= ? +EOF + my $ary_ref = $dbh->selectcol_arrayref($SQL, {}, $num, $k); + return unless @$ary_ref; + + my $QUESTION_MARKS = join ',', ('?') x scalar(@$ary_ref); + + $SQL = <{'word_id_table'} + where id IN ($QUESTION_MARKS) +EOF + return $dbh->selectcol_arrayref($SQL, {}, @$ary_ref); +} + +*parse_and_index_data = \&DBIx::FullTextSearch::parse_and_index_data_count; + +1; + diff --git a/lib/DBIx/FullTextSearch/File.pm b/lib/DBIx/FullTextSearch/File.pm new file mode 100644 index 000000000..80e844714 --- /dev/null +++ b/lib/DBIx/FullTextSearch/File.pm @@ -0,0 +1,27 @@ + +package DBIx::FullTextSearch::File; +use DBIx::FullTextSearch::String; +use strict; +use vars qw! @ISA !; +@ISA = qw! DBIx::FullTextSearch::String !; + +sub index_document { + my ($self, $file, $extra_data) = @_; + my $dbh = $self->{'dbh'}; + + open FILE, $file or do { + $self->{'errstr'} = "Reading the file `$file' failed: $!"; + return; + }; + my $data; + { + local $/ = undef; + $data = ; + } + $data .= " $extra_data" if $extra_data; + close FILE; + $self->SUPER::index_document($file, $data); +} + +1; + diff --git a/lib/DBIx/FullTextSearch/Phrase.pm b/lib/DBIx/FullTextSearch/Phrase.pm new file mode 100644 index 000000000..8d1c27249 --- /dev/null +++ b/lib/DBIx/FullTextSearch/Phrase.pm @@ -0,0 +1,182 @@ +package DBIx::FullTextSearch::Phrase; +use strict; +use DBIx::FullTextSearch::Column; +use vars qw! @ISA !; +@ISA = qw! DBIx::FullTextSearch::Column !; + +# Open in the backend just sets the object +sub open { + my ($class, $fts) = @_; + return bless { 'fts' => $fts }, $class; +} + +sub DESTROY { + my ($self) = @_; + if (defined $self->{'select_wordid_sth'}) { + $self->{'select_wordid_sth'}->finish(); + } +} + + +# Create creates the table(s) according to the parameters +sub _create_tables { + my ($class, $fts) = @_; + my $COUNT_FIELD = ''; + + my $CREATE_DATA = <{'data_table'} ( + word_id $DBIx::FullTextSearch::BITS_TO_INT{$fts->{'word_id_bits'}} unsigned not null, + doc_id $DBIx::FullTextSearch::BITS_TO_INT{$fts->{'doc_id_bits'}} unsigned not null, + idx longblob default '' not null, + index (word_id), + index (doc_id) + ) +EOF + + $fts->{'word_id_table'} = $fts->{'table'}.'_words' + unless defined $fts->{'word_id_table'}; + + + my $CREATE_WORD_ID = <{'word_id_table'} ( + word varchar($fts->{'word_length'}) binary + default '' not null, + id $DBIx::FullTextSearch::BITS_TO_INT{$fts->{'word_id_bits'}} unsigned not null auto_increment, + primary key (id), + unique (word) + ) +EOF + + my $dbh = $fts->{'dbh'}; + $dbh->do($CREATE_DATA) or return $dbh->errstr; + push @{$fts->{'created_tables'}}, $fts->{'data_table'}; + $dbh->do($CREATE_WORD_ID) or return $dbh->errstr; + push @{$fts->{'created_tables'}}, $fts->{'word_id_table'}; + return; +} + +sub add_document { + my ($self, $id, $words) = @_; + # here the value in the %$words hash is an array of word + # positions + my $fts = $self->{'fts'}; + my $dbh = $fts->{'dbh'}; + my $word_id_table = $fts->{'word_id_table'}; + if (not defined $self->{'select_wordid_sth'}) { + $self->{'select_wordid_sth'} = $dbh->prepare(" + select id from $word_id_table where word = ? + "); + } + my $data_table = $fts->{'data_table'}; + my $packstring = $DBIx::FullTextSearch::BITS_TO_PACK{$fts->{'position_bits'}}; + my $num_words = 0; + my (@wids,@data,@widshandler,@datahandler); + my $wordid; + $dbh->do("lock tables $word_id_table write"); + my ($maxid) = $dbh->selectrow_array("select max(id) + from $word_id_table"); + foreach my $word (keys %$words) { + if(!defined $self->{'wordids'}->{$word}) { + $self->{'select_wordid_sth'}->execute($word); + ($wordid) = $self->{'select_wordid_sth'}->fetchrow_array(); + unless ($wordid) { + $maxid++; + push @widshandler, "(?,$maxid)"; + push @wids, $word; + $wordid = $maxid; + } + $self->{'wordids'}->{$word} = $wordid; + } else { + $wordid=$self->{'wordids'}->{$word}; + } + push @datahandler, "($wordid,$id,?)"; + push @data, pack $packstring.'*', @{$words->{$word}}; + $num_words++; + }; + $dbh->do("insert into $word_id_table values " . + join (',',@widshandler),undef,@wids) if @wids; + $dbh->do("unlock tables"); + $dbh->do("insert into $data_table values " . + join (',',@datahandler),undef,@data) if @data; + return $num_words; +} + +sub update_document { + my ($self, $id, $words) = @_; + my $fts = $self->{'fts'}; + my $dbh = $fts->{'dbh'}; + my $data_table = $fts->{'data_table'}; + $dbh->do("delete from $data_table where doc_id = ?", {}, $id); + + $self->add_document($id, $words); +} + +sub contains_hashref { + my $self = shift; + my $fts = $self->{'fts'}; + my $dbh = $fts->{'dbh'}; + my $data_table = $fts->{'data_table'}; + my $word_id_table = $fts->{'word_id_table'}; + + my $packstring = $DBIx::FullTextSearch::BITS_TO_PACK{$fts->{'position_bits'}}; + + my $SQL = qq{ + select doc_id, idx + from $data_table, $word_id_table + where word like ? + and id = word_id +}; + my $out = {}; + + for my $phrase (@_){ + + my @words = split(' ', $phrase); + + my @sths; + for (my $i = 0; $i < @words; $i++) { + $sths[$i] = $dbh->prepare($SQL); + $sths[$i]->execute($words[$i]); + } + + my %prev_pos = (); + my %cur_pos = (); + + # iterate through words in phrase + for (my $i = 0; $i < @words; $i++){ + if($i > 0){ + %prev_pos = %cur_pos; + %cur_pos = (); + } + # get docs that have this word + while (my ($doc, $data) = $sths[$i]->fetchrow_array){ + # get positions of words in doc + my @positions = unpack $packstring.'*', $data; + map { $cur_pos{$doc}->{$_} = 1 } @positions; + } + if($i > 0){ + # check to see if word $i comes after word $i-1 + for my $doc (keys %cur_pos){ + my $isPhrase = 0; + for my $position (keys %{$cur_pos{$doc}}){ + if ($position > 0 && exists $prev_pos{$doc}{$position - 1}){ + $isPhrase = 1; + } else { + delete $cur_pos{$doc}{$position}; + } + } + delete $cur_pos{$doc} unless $isPhrase; + } + } + } + + for my $doc (keys %cur_pos){ + my @positions = keys %{$cur_pos{$doc}}; + $out->{$doc} += scalar (@positions); + } + } + return $out; +} + +*parse_and_index_data = \&DBIx::FullTextSearch::parse_and_index_data_list; + +1; diff --git a/lib/DBIx/FullTextSearch/StopList.pm b/lib/DBIx/FullTextSearch/StopList.pm new file mode 100644 index 000000000..b27ab98a5 --- /dev/null +++ b/lib/DBIx/FullTextSearch/StopList.pm @@ -0,0 +1,271 @@ +package DBIx::FullTextSearch::StopList; +use strict; + +use Carp; + +sub create_default { + my ($class, $dbh, $TABLE, $language) = @_; + + croak("Error: no language specified") unless $language; + + $language = lc $language; + + my @stopList; + + if($language eq 'english'){ + @stopList = qw/ a about after all also an and any are as at be because been but by can co corp could for from had has have he her his if in inc into is it its last more most mr mrs ms mz no not of on one only or other out over s says she so some such than that the their there they this to up was we were when which who will with would /; + } elsif ($language eq 'czech'){ + @stopList = qw/ a aby ale ani až bude by byl byla bylo být co či další do i jak jako je jeho jejich jen ještě již jsem jsme jsou k kde když korun která které který kteří let mezi má může na nebo není než o od pak po podle pouze pro proti první před při roce roku řekl s se si své tak také tedy to tom tím u už v ve však z za ze že/; + } elsif ($language eq 'danish'){ + @stopList = qw/ af aldrig alle altid bagved De de der du efter eller en endnu et fĺ fjernt for foran fra gennem god han her hos hovfor hun hurtig hvad hvem hvonĺr hvor hvordan hvorhen I i imod ja jeg langsom lidt mange mĺske med meget mellem mere mindre nĺr nede nej nok nu og oppe pĺ rask sammen temmelig til uden udenfor under ved vi /; + } elsif ($language eq 'dutch'){ + @stopList = qw/ aan aangaande aangezien achter achterna afgelopen al aldaar aldus alhoewel alias alle allebei alleen alsnog altijd altoos ander andere anders anderszins behalve behoudens beide beiden ben beneden bent bepaald betreffende bij binnen binnenin boven bovenal bovendien bovengenoemd bovenstaand bovenvermeld buiten daar daarheen daarin daarna daarnet daarom daarop daarvanlangs dan dat de die dikwijls dit door doorgaand dus echter eer eerdat eerder eerlang eerst elk elke en enig enigszins enkel er erdoor even eveneens evenwel gauw gedurende geen gehad gekund geleden gelijk gemoeten gemogen geweest gewoon gewoonweg haar had hadden hare heb hebben hebt heeft hem hen het hierbeneden hierboven hij hoe hoewel hun hunne ik ikzelf in inmiddels inzake is jezelf jij jijzelf jou jouw jouwe juist jullie kan klaar kon konden krachtens kunnen kunt later liever maar mag meer met mezelf mij mijn mijnent mijner mijzelf misschien mocht mochten moest moesten moet moeten mogen na naar nadat net niet noch nog nogal nu of ofschoon om omdat omhoog omlaag omstreeks omtrent omver onder ondertussen ongeveer ons onszelf onze ook op opnieuw opzij over overeind overigens pas precies reeds rond rondom sedert sinds sindsdien slechts sommige spoedig steeds tamelijk tenzij terwijl thans tijdens toch toen toenmaals toenmalig tot totdat tussen uit uitgezonderd vaak van vandaan vanuit vanwege veeleer verder vervolgens vol volgens voor vooraf vooral vooralsnog voorbij voordat voordezen voordien voorheen voorop vooruit vrij vroeg waar waarom wanneer want waren was wat weer weg wegens wel weldra welk welke wie wiens wier wij wijzelf zal ze zelfs zichzelf zij zijn zijne zo zodra zonder zou zouden zowat zulke zullen zult /; + } elsif ($language eq 'finnish'){ + @stopList = qw/ aina alla ansiosta ehkä ei enemmän ennen etessa haikki hän he hitaasti hoikein hyvin ilman ja jälkeen jos kanssa kaukana kenties keskellä kesken koskaan kuinkan kukka kyllä kylliksi lähellä läpi liian lla lla luona me mikä miksi milloin milloinkan minä missä miten nopeasti nyt oikea oikealla paljon siellä sinä ssa sta suoraan tai takana takia tarpeeksi tässä te ulkopuolella vähän vahemmän vasen vasenmalla vastan vielä vieressä yhdessä ylös /; + } elsif ($language eq 'french'){ + @stopList = qw/ a ŕ afin ailleurs ainsi alors aprčs attendant au aucun aucune au-dessous au-dessus auprčs auquel aussi aussitôt autant autour aux auxquelles auxquels avec beaucoup ça ce ceci cela celle celles celui cependant certain certaine certaines certains ces cet cette ceux chacun chacune chaque chez combien comme comment concernant dans de dedans dehors déjŕ delŕ depuis des dčs desquelles desquels dessus donc donné dont du duquel durant elle elles en encore entre et étaient était étant etc eux furent grâce hormis hors ici il ils jadis je jusqu jusque la lŕ laquelle le lequel les lesquelles lesquels leur leurs lors lorsque lui ma mais malgré me męme męmes mes mien mienne miennes miens moins moment mon moyennant ne ni non nos notamment notre nôtre notres nôtres nous nulle nulles on ou oů par parce parmi plus plusieurs pour pourquoi prčs puis puisque quand quant que quel quelle quelque quelques-unes quelques-uns quelqu''un quelqu''une quels qui quiconque quoi quoique sa sans sauf se selon ses sien sienne siennes siens soi soi-męme soit sont suis sur ta tandis tant te telle telles tes tienne tiennes tiens toi ton toujours tous toute toutes trčs trop tu un une vos votre vôtre vôtres vous vu y /; + } elsif ($language eq 'german'){ + @stopList = qw/ ab aber allein als also am an auch auf aus außer bald bei beim bin bis bißchen bist da dabei dadurch dafür dagegen dahinter damit danach daneben dann daran darauf daraus darin darüber darum darunter das daß dasselbe davon davor dazu dazwischen dein deine deinem deinen deiner deines dem demselben den denn der derselben des desselben dessen dich die dies diese dieselbe dieselben diesem diesen dieser dieses dir doch dort du ebenso ehe ein eine einem einen einer eines entlang er es etwa etwas euch euer eure eurem euren eurer eures für fürs ganz gar gegen genau gewesen her herein herum hin hinter hintern ich ihm ihn Ihnen ihnen ihr Ihre ihre Ihrem ihrem Ihren ihren Ihrer ihrer Ihres ihres im in ist ja je jedesmal jedoch jene jenem jenen jener jenes kaum kein keine keinem keinen keiner keines man mehr mein meine meinem meinen meiner meines mich mir mit nach nachdem nämlich neben nein nicht nichts noch nun nur ob ober obgleich oder ohne paar sehr sei sein seine seinem seinen seiner seines seit seitdem selbst sich Sie sie sind so sogar solch solche solchem solchen solcher solches sondern sonst soviel soweit über um und uns unser unsre unsrem unsren unsrer unsres vom von vor während war wäre wären warum was wegen weil weit welche welchem welchen welcher welches wem wen wenn wer weshalb wessen wie wir wo womit zu zum zur zwar zwischen zwischens /; + } elsif ($language eq 'italian'){ + @stopList = qw/ a affinchč agl'' agli ai al all'' alla alle allo anzichč avere bensě che chi cioč come comunque con contro cosa da dachč dagl'' dagli dai dal dall'' dalla dalle dallo degl'' degli dei del dell'' delle dello di dopo dove dunque durante e egli eppure essere essi finché fino fra giacchč gl'' gli grazie I il in inoltre io l'' la le lo loro ma mentre mio ne neanche negl'' negli nei nel nell'' nella nelle nello nemmeno neppure noi nonchč nondimeno nostro o onde oppure ossia ovvero per perchč perciň perň poichč prima purchč quand''anche quando quantunque quasi quindi se sebbene sennonchč senza seppure si siccome sopra sotto su subito sugl'' sugli sui sul sull'' sulla sulle sullo suo talchč tu tuo tuttavia tutti un una uno voi vostr/; + } elsif ($language eq 'portuguese'){ + @stopList = qw/ a abaixo adiante agora ali antes aqui até atras bastante bem com como contra debaixo demais depois depressa devagar direito e ela elas ęle eles em entre eu fora junto longe mais menos muito năo ninguem nós nunca onde ou para por porque pouco próximo qual quando quanto que quem se sem sempre sim sob sobre talvez todas todos vagarosamente vocę vocęs /; + } elsif ($language eq 'spanish'){ + @stopList = qw/ a acá ahí ajena ajenas ajeno ajenos al algo algún alguna algunas alguno algunos allá allí aquel aquella aquellas aquello aquellos aquí cada cierta ciertas cierto ciertos como cómo con conmigo consigo contigo cualquier cualquiera cualquieras cuan cuán cuanta cuánta cuantas cuántas cuanto cuánto cuantos cuántos de dejar del demás demasiada demasiadas demasiado demasiados el él ella ellas ellos esa esas ese esos esta estar estas este estos hacer hasta jamás junto juntos la las lo los mas más me menos mía mientras mío misma mismas mismo mismos mucha muchas muchísima muchísimas muchísimo muchísimos mucho muchos muy nada ni ninguna ningunas ninguno ningunos no nos nosotras nosotros nuestra nuestras nuestro nuestros nunca os otra otras otro otros para parecer poca pocas poco pocos por porque que qué querer quien quién quienes quienesquiera quienquiera ser si sí siempre sín Sr Sra Sres Sta suya suyas suyo suyos tal tales tan tanta tantas tanto tantos te tener ti toda todas todo todos tomar tú tuya tuyo un una unas unos usted ustedes varias varios vosotras vosotros vuestra vuestras vuestro vuestros y yo /; + } elsif ($language eq 'swedish'){ + @stopList = qw/ ab aldrig all alla alltid än ännu ĺnyo är att av avser avses bakom bra bredvid dä där de dem den denna deras dess det detta du efter efterät eftersom ej eller emot en ett fastän för fort framför frĺn genom gott hamske han här hellre hon hos hur i in ingen innan inte ja jag lĺngsamt lĺngt lite man med medan mellan mer mera mindre mot myckett när nära nej nere ni nu och oksa om över pĺ sĺ sĺdan sin skall som till tillräckligt tillsammans trotsatt under uppe ut utan utom vad väl var varför vart varthän vem vems vi vid vilken /; + } + + croak("Error: language $language is not a supported") unless @stopList; + + my $sl = $class->create_empty($dbh, $TABLE); + + $sl->add_stop_word(\@stopList); + return $sl; +} + +sub create_empty { + my ($class, $dbh, $name) = @_; + + my $table = $name . '_stoplist'; + + my $SQL = qq{ +CREATE TABLE $table +(word VARCHAR(255) PRIMARY KEY) +}; + + $dbh->do($SQL) or croak "Can't create table $table: " . $dbh->errstr; + + my $self = {}; + $self->{'dbh'} = $dbh; + $self->{'name'} = $name; + $self->{'table'} = $table; + $self->{'stoplist'} = {}; + bless $self, $class; + return $self; +} + +sub open { + my ($class, $dbh, $name) = @_; + + my $table = $name . '_stoplist'; + + my $self = {}; + $self->{'dbh'} = $dbh; + $self->{'name'} = $name; + $self->{'table'} = $table; + $self->{'stoplist'} = {}; + bless $self, $class; + + # load stoplist into a hash + my $SQL = qq{ +SELECT word FROM $table +}; + my $ary_ref = $dbh->selectcol_arrayref($SQL) or croak "Can't load stoplist from $table: " . $dbh->errstr; + for (@$ary_ref){ + $self->{'stoplist'}->{$_} = 1; + } + + return $self; +} + +sub drop { + my $self = shift; + my $dbh = $self->{'dbh'}; + my $table = $self->{'table'}; + my $SQL = qq{ +DROP table $table +}; + $dbh->do($SQL) or croak "Can't drop table $table: " . $dbh->errstr; + $self->{'stoplist'} = {}; +} + +sub empty { + my $self = shift; + my $dbh = $self->{'dbh'}; + my $table = $self->{'table'}; + my $SQL = qq{ +DELETE FROM $table +}; + $dbh->do($SQL) or croak "Can't empty table $table: " . $dbh->errstr; + $self->{'stoplist'} = {}; +} + +sub add_stop_word { + my ($self, $words) = @_; + my $dbh = $self->{'dbh'}; + + $words = [ $words ] unless ref($words) eq 'ARRAY'; + + my @new_stop_words; + + for my $word (@$words){ + next if $self->is_stop_word($word); + push @new_stop_words, $word; + $self->{'stoplist'}->{lc($word)} = 1; + } + my $SQL = "INSERT INTO $self->{'table'} (word) VALUES " . join(',', ('(?)') x @new_stop_words); + $dbh->do($SQL,{},@new_stop_words); +} + +sub remove_stop_word { + my ($self, $words) = @_; + my $dbh = $self->{'dbh'}; + + $words = [ $words ] unless ref($words) eq 'ARRAY'; + + my $SQL = qq{ +DELETE FROM $self->{'table'} WHERE word=? +}; + + my $sth = $dbh->prepare($SQL); + + my $stoplist = $self->{'stoplist'}; + + for my $word (@$words){ + next unless $self->is_stop_word($word); + $sth->execute($word); + delete $stoplist->{lc($word)}; + } +} + +sub is_stop_word { + exists shift->{'stoplist'}->{lc($_[0])}; +} + +1; + +__END__ + +=head1 NAME + +DBIx::FullTextSearch::StopList - Stopwords for DBIx::FullTextSearch + +=head1 SYNOPSIS + + use DBIx::FullTextSearch::StopList; + # connect to database (regular DBI) + my $dbh = DBI->connect('dbi:mysql:database', 'user', 'passwd'); + + # create a new empty stop word list + my $sl1 = DBIx::FullTextSearch::StopList->create_empty($dbh, 'sl_web_1'); + + # or create a new one with default stop words + my $sl2 = DBIx::FullTextSearch::StopList->create_default($dbh, 'sl_web_2', 'english'); + + # or open an existing one + my $sl3 = DBIx::FullTextSearch::StopList->open($dbh, 'sl_web_3'); + + # add stop words + $sl1->add_stop_word(['a','in','on','the']); + + # remove stop words + $sl2->remove_stop_word(['be','because','been','but','by']); + + # check if word is in stoplist + $bool = $sl1->is_stop_word('in'); + + # empty stop words + $sl3->empty; + + # drop stop word table + $sl2->drop; + +=head1 DESCRIPTION + +DBIx::FullTextSearch::StopList provides stop lists that can be used -L. +StopList objects can be reused accross several FullTextSearch objects. + +=head1 METHODS + +=over 4 + +=head2 CONSTRUCTERS + +=item create_empty + + my $sl = DBIx::FullTextSearch::StopList->create_empty($dbh, $sl_name); + +This class method creates a new StopList object. + +=item create_default + + my $sl = DBIx::FullTextSearch::StopList->create_default($dbh, $sl_name, $language); + +This class method creates a new StopList object, with default words loaded in for the +given language. Supported languages include Czech, Danish, Dutch, English, Finnish, French, +German, Italian, Portuguese, Spanish, and Swedish. + +=item open + + my $sl = DBIx::FullTextSearch::StopList->open($dbh, $sl_name); + +Opens and returns StopList object + +=head2 OBJECT METHODS + +=item add_stop_word + + $sl->add_stop_word(\@stop_words); + +Adds stop words to StopList object. Expects array reference as argument. + +=item remove_stop_word + + $sl->remove_stop_word(\@stop_words); + +Remove stop words from StopList object. + +=item is_stop_word + + $bool = $sl->is_stop_word($stop_word); + +Returns true iff stop_word is StopList object + +=item empty + + $sl->empty; + +Removes all stop words in StopList object. + +=item drop + + $sl->drop; + +Removes table associated with the StopList object. + +=back + +=head1 AUTHOR + +T.J. Mather, tjmather@tjmather.com, +http://www.tjmather.com/ + +=head1 COPYRIGHT + +All rights reserved. This package is free software; you can +redistribute it and/or modify it under the same terms as Perl itself. + +=head1 SEE ALSO + +L diff --git a/lib/DBIx/FullTextSearch/String.pm b/lib/DBIx/FullTextSearch/String.pm new file mode 100644 index 000000000..1793df960 --- /dev/null +++ b/lib/DBIx/FullTextSearch/String.pm @@ -0,0 +1,78 @@ + +package DBIx::FullTextSearch::String; +use DBIx::FullTextSearch; +use strict; +use vars qw! @ISA !; +@ISA = qw! DBIx::FullTextSearch !; + +# Create creates the conversion table that converts string names of +# documents to numbers +sub _create_tables { + my $fts = shift; + $fts->{'doc_id_table'} = $fts->{'table'} . '_docid' + unless defined $fts->{'doc_id_table'}; + + unless($fts->{'name_length'}){ + return "The parameter name_length has to be specified."; + } + + my $CREATE_DOCID = <{'doc_id_table'} ( + name varchar($fts->{'name_length'}) binary not null, + id $DBIx::FullTextSearch::BITS_TO_INT{$fts->{'doc_id_bits'}} unsigned not null auto_increment, + primary key (id), + unique (name) + ) +EOF + my $dbh = $fts->{'dbh'}; + $dbh->do($CREATE_DOCID) or return $dbh->errstr; + push @{$fts->{'created_tables'}}, $fts->{'doc_id_table'}; + return; +} + +sub get_id_for_name { + my ($self, $string) = @_; + my $dbh = $self->{'dbh'}; + my $doc_id_table = $self->{'doc_id_table'}; + + my $name_to_id_sth = ( defined $self->{'name_to_id_sth'} + ? $self->{'name_to_id_sth'} + : $self->{'name_to_id_sth'} = $dbh->prepare("select id from $doc_id_table where name = ?") or die $dbh->errstr); + my $id = $dbh->selectrow_array($name_to_id_sth, {}, $string); + if (not defined $id) { + my $new_name_sth = (defined $self->{'new_name_sth'} + ? $self->{'new_name_sth'} + : $self->{'new_name_sth'} = + $dbh->prepare("insert into $doc_id_table values (?, null)") or die $dbh->errstr ); + $new_name_sth->execute($string) or die $new_name_sth->errstr; + $id = $new_name_sth->{'mysql_insertid'}; + } + $id; +} + +sub index_document { + my ($self, $string, $data) = @_; + my $id = $self->get_id_for_name($string); + $self->SUPER::index_document($id, $data); +} + +sub delete_document { + my ($self, $doc_id) = @_; + + $self->SUPER::delete_document($self->get_id_for_name($doc_id)); +} + +sub contains_hashref { + my $self = shift; + my $res = $self->SUPER::contains_hashref(@_); + return unless keys %$res; + + my $doc_id_table = $self->{'doc_id_table'}; + + my $data = $self->{'dbh'}->selectall_arrayref("select name, id from $doc_id_table where id in (" . join(',', ('?') x keys %$res).")", {}, keys %$res); + return { map { ( $_->[0], $res->{$_->[1]} ) } @$data }; +} + + +1; + diff --git a/lib/DBIx/FullTextSearch/Table.pm b/lib/DBIx/FullTextSearch/Table.pm new file mode 100644 index 000000000..a6b47c6ef --- /dev/null +++ b/lib/DBIx/FullTextSearch/Table.pm @@ -0,0 +1,142 @@ + +package DBIx::FullTextSearch::TableString; +use vars qw! @ISA !; +@ISA = qw! DBIx::FullTextSearch::String DBIx::FullTextSearch::Table !; + +sub index_document { + my ($self, $id, $data) = @_; + my @data_sets = $self->get_the_data_from_table($id); + push @data_sets, $data if $data; + $self->SUPER::index_document($id, \@data_sets); +} + +package DBIx::FullTextSearch::TableNum; +use vars qw! @ISA !; +@ISA = qw! DBIx::FullTextSearch::Table !; + +sub index_document { + my ($self, $id, $extra_data) = @_; + my @data_sets = $self->get_the_data_from_table($id); + push @data_sets, $extra_data if $extra_data; + $self->SUPER::index_document($id, \@data_sets); +} + + +package DBIx::FullTextSearch::Table; +use DBIx::FullTextSearch; +use strict; +use vars qw! @ISA !; +@ISA = qw! DBIx::FullTextSearch !; + +sub _open_tables { + my $self = shift; + if (defined $self->{'doc_id_table'}) { + eval 'use DBIx::FullTextSearch::String'; + bless $self, 'DBIx::FullTextSearch::TableString'; + } + else { + bless $self, 'DBIx::FullTextSearch::TableNum'; + } +} + +# we do not create any new tables, we just check that the parameters are +# OK (the table and columns exist, etc.) +sub _create_tables { + my $fts = shift; + my ($table, $column, $id) = @{$fts}{ qw! table_name column_name + column_id_name ! }; + if (not defined $table and $column =~ /\./) { + ($table, $column) = ($column =~ /^(.*)\.(.*)$/s); + } + my $id_type; + + if (not defined $table) { + return "The parameter table_name has to be specified with the table frontend."; + } + if (not defined $column) { + return "The parameter column_name has to be specified with the table frontend."; + } + my $dbh = $fts->{'dbh'}; + my $sth = $dbh->prepare("show columns from $table"); + $sth->{'PrintError'} = 0; + $sth->{'RaiseError'} = 0; + $sth->execute or return "The table `$table' doesn't exist."; + + my $info = $dbh->selectall_arrayref($sth, + { 'PrintError' => 0, 'RaiseError' => 0 }); + if (not defined $info) { + return "The table `$table' doesn't exist."; + } + +# use Data::Dumper; print Dumper $info; + + if (not defined $id) { + # search for column with primary key + my $pri_num = 0; + for my $i (0 .. $#$info) { + if ($info->[$i][3] eq 'PRI') { + $pri_num++; + $id = $info->[$i][0]; + $id_type = $info->[$i][1]; + } + } + if ($pri_num > 1) { + return 'The primary key has to be one-column.'; + } + if ($pri_num == 0) { + return "No primary key found in the table `$table'."; + } + } + else { + # find '$id' column + for my $i (0 .. $#$info) { + if ($info->[$i][0] eq $id){ + $id_type = $info->[$i][1]; + last; + } + } + } + + unless(defined $id_type){ + return "No key named '$id' found in the table '$table'"; + } + + my $testcol = $dbh->prepare("select $column from $table where 1 = 0"); + $testcol->execute or + return "Column `$column' doesn't exist in table `$table'."; + $testcol->finish; + + $fts->{'column_id_name'} = $id; + + my $errstr; + + if ($id_type =~ /([a-z]*int)/) { + $fts->{'doc_id_bits'} = $DBIx::FullTextSearch::INT_TO_BITS{$1}; + bless $fts, 'DBIx::FullTextSearch::TableNum'; + } + else { + my ($length) = ($id_type =~ /^\w+\((\d+)\)$/); + $fts->{'name_length'} = $1; + eval 'use DBIx::FullTextSearch::String'; + bless $fts, 'DBIx::FullTextSearch::TableString'; + $errstr = $fts->DBIx::FullTextSearch::String::_create_tables($fts); + } +### use Data::Dumper; print Dumper $fts; + return $errstr; +} + +sub get_the_data_from_table { + my ($self, $id) = @_; + my $dbh = $self->{'dbh'}; + my $get_data = ( defined $self->{'get_data_sth'} + ? $self->{'get_data_sth'} + : $self->{'get_data_sth'} = $dbh->prepare(" + select $self->{'column_name'} from $self->{'table_name'} + where $self->{'column_id_name'} = ? + ") ); + + my @data_ary = $dbh->selectrow_array($get_data, {}, $id); + return wantarray ? @data_ary : join(" ", @data_ary); +} + +1; diff --git a/lib/DBIx/FullTextSearch/TestConfig.pm b/lib/DBIx/FullTextSearch/TestConfig.pm new file mode 100644 index 000000000..a6a8ff70e --- /dev/null +++ b/lib/DBIx/FullTextSearch/TestConfig.pm @@ -0,0 +1,6 @@ +%DBIx::FullTextSearch::TestConfig::Config = ( + 'password' => undef, + 'dsn' => 'dbi:mysql:test', + 'user' => 'test' + ); +1; diff --git a/lib/DBIx/FullTextSearch/URL.pm b/lib/DBIx/FullTextSearch/URL.pm new file mode 100644 index 000000000..b6b29ac42 --- /dev/null +++ b/lib/DBIx/FullTextSearch/URL.pm @@ -0,0 +1,30 @@ + +package DBIx::FullTextSearch::URL; +use DBIx::FullTextSearch::String; +use strict; +use vars qw! @ISA !; +@ISA = qw! DBIx::FullTextSearch::String !; + +use LWP::UserAgent; + +sub index_document { + my ($self, $uri, $extra_data) = @_; + my $ua = ( defined $self->{'user_agent'} + ? $self->{'user_agent'} + : $self->{'user_agent'} = new LWP::UserAgent ); + + my $request = new HTTP::Request('GET', $uri); + my $response = $ua->simple_request($request); + if ($response->is_success) { + my $data = $response->content; + $data .= " $extra_data" if $extra_data; + return $self->SUPER::index_document($uri, $data); + } + else { + $self->{'errstr'} = $response->message; + } + return; +} + +1; + diff --git a/lib/Parse/RecDescent.pm b/lib/Parse/RecDescent.pm new file mode 100644 index 000000000..35b9e9d2c --- /dev/null +++ b/lib/Parse/RecDescent.pm @@ -0,0 +1,3045 @@ +# GENERATE RECURSIVE DESCENT PARSER OBJECTS FROM A GRAMMARC +# SEE RecDescent.pod FOR FULL DETAILS + +use 5.005; +use strict; + +package Parse::RecDescent; + +use Text::Balanced qw ( extract_codeblock extract_bracketed extract_quotelike extract_delimited ); + +use vars qw ( $skip ); + + *defskip = \ '\s*'; # DEFAULT SEPARATOR IS OPTIONAL WHITESPACE + $skip = '\s*'; # UNIVERSAL SEPARATOR IS OPTIONAL WHITESPACE +my $MAXREP = 100_000_000; # REPETITIONS MATCH AT MOST 100,000,000 TIMES + + +sub import # IMPLEMENT PRECOMPILER BEHAVIOUR UNDER: + # perl -MParse::RecDescent - +{ + local *_die = sub { print @_, "\n"; exit }; + + my ($package, $file, $line) = caller; + if (substr($file,0,1) eq '-' && $line == 0) + { + _die("Usage: perl -MLocalTest - ") + unless @ARGV == 2; + + my ($sourcefile, $class) = @ARGV; + + local *IN; + open IN, $sourcefile + or _die("Can't open grammar file '$sourcefile'"); + + my $grammar = join '', ; + + Parse::RecDescent->Precompile($grammar, $class, $sourcefile); + exit; + } +} + +sub Save +{ + my ($self, $class) = @_; + $self->{saving} = 1; + $self->Precompile(undef,$class); + $self->{saving} = 0; +} + +sub Precompile +{ + my ($self, $grammar, $class, $sourcefile) = @_; + + $class =~ /^(\w+::)*\w+$/ or croak("Bad class name: $class"); + + my $modulefile = $class; + $modulefile =~ s/.*:://; + $modulefile .= ".pm"; + + open OUT, ">$modulefile" + or croak("Can't write to new module file '$modulefile'"); + + print STDERR "precompiling grammar from file '$sourcefile'\n", + "to class $class in module file '$modulefile'\n" + if $grammar && $sourcefile; + + # local $::RD_HINT = 1; + $self = Parse::RecDescent->new($grammar,1,$class) + || croak("Can't compile bad grammar") + if $grammar; + + foreach ( keys %{$self->{rules}} ) + { $self->{rules}{$_}{changed} = 1 } + + print OUT "package $class;\nuse Parse::RecDescent;\n\n"; + + print OUT "{ my \$ERRORS;\n\n"; + + print OUT $self->_code(); + + print OUT "}\npackage $class; sub new { "; + print OUT "my "; + + require Data::Dumper; + print OUT Data::Dumper->Dump([$self], [qw(self)]); + + print OUT "}"; + + close OUT + or croak("Can't write to new module file '$modulefile'"); +} + + +package Parse::RecDescent::LineCounter; + + +sub TIESCALAR # ($classname, \$text, $thisparser, $prevflag) +{ + bless { + text => $_[1], + parser => $_[2], + prev => $_[3]?1:0, + }, $_[0]; +} + +my %counter_cache; + +sub FETCH +{ + my $parser = $_[0]->{parser}; + my $from = $parser->{fulltextlen}-length(${$_[0]->{text}})-$_[0]->{prev} +; + + unless (exists $counter_cache{$from}) { + $parser->{lastlinenum} = $parser->{offsetlinenum} + - Parse::RecDescent::_linecount(substr($parser->{fulltext},$from)) + + 1; + $counter_cache{$from} = $parser->{lastlinenum}; + } + return $counter_cache{$from}; +} + +sub STORE +{ + my $parser = $_[0]->{parser}; + $parser->{offsetlinenum} -= $parser->{lastlinenum} - $_[1]; + return undef; +} + +sub resync # ($linecounter) +{ + my $self = tied($_[0]); + die "Tried to alter something other than a LineCounter\n" + unless $self =~ /Parse::RecDescent::LineCounter/; + + my $parser = $self->{parser}; + my $apparently = $parser->{offsetlinenum} + - Parse::RecDescent::_linecount(${$self->{text}}) + + 1; + + $parser->{offsetlinenum} += $parser->{lastlinenum} - $apparently; + return 1; +} + +package Parse::RecDescent::ColCounter; + +sub TIESCALAR # ($classname, \$text, $thisparser, $prevflag) +{ + bless { + text => $_[1], + parser => $_[2], + prev => $_[3]?1:0, + }, $_[0]; +} + +sub FETCH +{ + my $parser = $_[0]->{parser}; + my $missing = $parser->{fulltextlen}-length(${$_[0]->{text}})-$_[0]->{prev}+1; + substr($parser->{fulltext},0,$missing) =~ m/^(.*)\Z/m; + return length($1); +} + +sub STORE +{ + die "Can't set column number via \$thiscolumn\n"; +} + + +package Parse::RecDescent::OffsetCounter; + +sub TIESCALAR # ($classname, \$text, $thisparser, $prev) +{ + bless { + text => $_[1], + parser => $_[2], + prev => $_[3]?-1:0, + }, $_[0]; +} + +sub FETCH +{ + my $parser = $_[0]->{parser}; + return $parser->{fulltextlen}-length(${$_[0]->{text}})+$_[0]->{prev}; +} + +sub STORE +{ + die "Can't set current offset via \$thisoffset or \$prevoffset\n"; +} + + + +package Parse::RecDescent::Rule; + +sub new ($$$$$) +{ + my $class = ref($_[0]) || $_[0]; + my $name = $_[1]; + my $owner = $_[2]; + my $line = $_[3]; + my $replace = $_[4]; + + if (defined $owner->{"rules"}{$name}) + { + my $self = $owner->{"rules"}{$name}; + if ($replace && !$self->{"changed"}) + { + $self->reset; + } + return $self; + } + else + { + return $owner->{"rules"}{$name} = + bless + { + "name" => $name, + "prods" => [], + "calls" => [], + "changed" => 0, + "line" => $line, + "impcount" => 0, + "opcount" => 0, + "vars" => "", + }, $class; + } +} + +sub reset($) +{ + @{$_[0]->{"prods"}} = (); + @{$_[0]->{"calls"}} = (); + $_[0]->{"changed"} = 0; + $_[0]->{"impcount"} = 0; + $_[0]->{"opcount"} = 0; + $_[0]->{"vars"} = ""; +} + +sub DESTROY {} + +sub hasleftmost($$) +{ + my ($self, $ref) = @_; + + my $prod; + foreach $prod ( @{$self->{"prods"}} ) + { + return 1 if $prod->hasleftmost($ref); + } + + return 0; +} + +sub leftmostsubrules($) +{ + my $self = shift; + my @subrules = (); + + my $prod; + foreach $prod ( @{$self->{"prods"}} ) + { + push @subrules, $prod->leftmostsubrule(); + } + + return @subrules; +} + +sub expected($) +{ + my $self = shift; + my @expected = (); + + my $prod; + foreach $prod ( @{$self->{"prods"}} ) + { + my $next = $prod->expected(); + unless (! $next or _contains($next,@expected) ) + { + push @expected, $next; + } + } + + return join ', or ', @expected; +} + +sub _contains($@) +{ + my $target = shift; + my $item; + foreach $item ( @_ ) { return 1 if $target eq $item; } + return 0; +} + +sub addcall($$) +{ + my ( $self, $subrule ) = @_; + unless ( _contains($subrule, @{$self->{"calls"}}) ) + { + push @{$self->{"calls"}}, $subrule; + } +} + +sub addprod($$) +{ + my ( $self, $prod ) = @_; + push @{$self->{"prods"}}, $prod; + $self->{"changed"} = 1; + $self->{"impcount"} = 0; + $self->{"opcount"} = 0; + $prod->{"number"} = $#{$self->{"prods"}}; + return $prod; +} + +sub addvar +{ + my ( $self, $var, $parser ) = @_; + if ($var =~ /\A\s*local\s+([%@\$]\w+)/) + { + $parser->{localvars} .= " $1"; + $self->{"vars"} .= "$var;\n" } + else + { $self->{"vars"} .= "my $var;\n" } + $self->{"changed"} = 1; + return 1; +} + +sub addautoscore +{ + my ( $self, $code ) = @_; + $self->{"autoscore"} = $code; + $self->{"changed"} = 1; + return 1; +} + +sub nextoperator($) +{ + my $self = shift; + my $prodcount = scalar @{$self->{"prods"}}; + my $opcount = ++$self->{"opcount"}; + return "_operator_${opcount}_of_production_${prodcount}_of_rule_$self->{name}"; +} + +sub nextimplicit($) +{ + my $self = shift; + my $prodcount = scalar @{$self->{"prods"}}; + my $impcount = ++$self->{"impcount"}; + return "_alternation_${impcount}_of_production_${prodcount}_of_rule_$self->{name}"; +} + + +sub code +{ + my ($self, $namespace, $parser) = @_; + +eval 'undef &' . $namespace . '::' . $self->{"name"} unless $parser->{saving}; + + my $code = +' +# ARGS ARE: ($parser, $text; $repeating, $_noactions, \@args) +sub ' . $namespace . '::' . $self->{"name"} . ' +{ + my $thisparser = $_[0]; + use vars q{$tracelevel}; + local $tracelevel = ($tracelevel||0)+1; + $ERRORS = 0; + my $thisrule = $thisparser->{"rules"}{"' . $self->{"name"} . '"}; + + Parse::RecDescent::_trace(q{Trying rule: [' . $self->{"name"} . ']}, + Parse::RecDescent::_tracefirst($_[1]), + q{' . $self->{"name"} . '}, + $tracelevel) + if defined $::RD_TRACE; + + ' . ($parser->{deferrable} + ? 'my $def_at = @{$thisparser->{deferred}};' + : '') . + ' + my $err_at = @{$thisparser->{errors}}; + + my $score; + my $score_return; + my $_tok; + my $return = undef; + my $_matched=0; + my $commit=0; + my @item = (); + my %item = (); + my $repeating = defined($_[2]) && $_[2]; + my $_noactions = defined($_[3]) && $_[3]; + my @arg = defined $_[4] ? @{ &{$_[4]} } : (); + my %arg = ($#arg & 01) ? @arg : (@arg, undef); + my $text; + my $lastsep=""; + my $expectation = new Parse::RecDescent::Expectation($thisrule->expected()); + $expectation->at($_[1]); + '. ($parser->{_check}{thisoffset}?' + my $thisoffset; + tie $thisoffset, q{Parse::RecDescent::OffsetCounter}, \$text, $thisparser; + ':'') . ($parser->{_check}{prevoffset}?' + my $prevoffset; + tie $prevoffset, q{Parse::RecDescent::OffsetCounter}, \$text, $thisparser, 1; + ':'') . ($parser->{_check}{thiscolumn}?' + my $thiscolumn; + tie $thiscolumn, q{Parse::RecDescent::ColCounter}, \$text, $thisparser; + ':'') . ($parser->{_check}{prevcolumn}?' + my $prevcolumn; + tie $prevcolumn, q{Parse::RecDescent::ColCounter}, \$text, $thisparser, 1; + ':'') . ($parser->{_check}{prevline}?' + my $prevline; + tie $prevline, q{Parse::RecDescent::LineCounter}, \$text, $thisparser, 1; + ':'') . ' + my $thisline; + tie $thisline, q{Parse::RecDescent::LineCounter}, \$text, $thisparser; + + '. $self->{vars} .' +'; + + my $prod; + foreach $prod ( @{$self->{"prods"}} ) + { + $prod->addscore($self->{autoscore},0,0) if $self->{autoscore}; + next unless $prod->checkleftmost(); + $code .= $prod->code($namespace,$self,$parser); + + $code .= $parser->{deferrable} + ? ' splice + @{$thisparser->{deferred}}, $def_at unless $_matched; + ' + : ''; + } + + $code .= +' + unless ( $_matched || defined($return) || defined($score) ) + { + ' .($parser->{deferrable} + ? ' splice @{$thisparser->{deferred}}, $def_at; + ' + : '') . ' + + $_[1] = $text; # NOT SURE THIS IS NEEDED + Parse::RecDescent::_trace(q{<>}, + Parse::RecDescent::_tracefirst($_[1]), + q{' . $self->{"name"} .'}, + $tracelevel) + if defined $::RD_TRACE; + return undef; + } + if (!defined($return) && defined($score)) + { + Parse::RecDescent::_trace(q{>>Accepted scored production<<}, "", + q{' . $self->{"name"} .'}, + $tracelevel) + if defined $::RD_TRACE; + $return = $score_return; + } + splice @{$thisparser->{errors}}, $err_at; + $return = $item[$#item] unless defined $return; + if (defined $::RD_TRACE) + { + Parse::RecDescent::_trace(q{>>Matched rule<< (return value: [} . + $return . q{])}, "", + q{' . $self->{"name"} .'}, + $tracelevel); + Parse::RecDescent::_trace(q{(consumed: [} . + Parse::RecDescent::_tracemax(substr($_[1],0,-length($text))) . q{])}, + Parse::RecDescent::_tracefirst($text), + , q{' . $self->{"name"} .'}, + $tracelevel) + } + $_[1] = $text; + return $return; +} +'; + + return $code; +} + +my @left; +sub isleftrec($$) +{ + my ($self, $rules) = @_; + my $root = $self->{"name"}; + @left = $self->leftmostsubrules(); + my $next; + foreach $next ( @left ) + { + next unless defined $rules->{$next}; # SKIP NON-EXISTENT RULES + return 1 if $next eq $root; + my $child; + foreach $child ( $rules->{$next}->leftmostsubrules() ) + { + push(@left, $child) + if ! _contains($child, @left) ; + } + } + return 0; +} + +package Parse::RecDescent::Production; + +sub describe ($;$) +{ + return join ' ', map { $_->describe($_[1]) or () } @{$_[0]->{items}}; +} + +sub new ($$;$$) +{ + my ($self, $line, $uncommit, $error) = @_; + my $class = ref($self) || $self; + + bless + { + "items" => [], + "uncommit" => $uncommit, + "error" => $error, + "line" => $line, + strcount => 0, + patcount => 0, + dircount => 0, + actcount => 0, + }, $class; +} + +sub expected ($) +{ + my $itemcount = scalar @{$_[0]->{"items"}}; + return ($itemcount) ? $_[0]->{"items"}[0]->describe(1) : ''; +} + +sub hasleftmost ($$) +{ + my ($self, $ref) = @_; + return ${$self->{"items"}}[0] eq $ref if scalar @{$self->{"items"}}; + return 0; +} + +sub leftmostsubrule($) +{ + my $self = shift; + + if ( $#{$self->{"items"}} >= 0 ) + { + my $subrule = $self->{"items"}[0]->issubrule(); + return $subrule if defined $subrule; + } + + return (); +} + +sub checkleftmost($) +{ + my @items = @{$_[0]->{"items"}}; + if (@items==1 && ref($items[0]) =~ /\AParse::RecDescent::Error/ + && $items[0]->{commitonly} ) + { + Parse::RecDescent::_warn(2,"Lone in production treated + as "); + Parse::RecDescent::_hint("A production consisting of a single + conditional directive would + normally succeed (with the value zero) if the + rule is not 'commited' when it is + tried. Since you almost certainly wanted + ' ' Parse::RecDescent + supplied it for you."); + push @{$_[0]->{items}}, + Parse::RecDescent::UncondReject->new(0,0,''); + } + elsif (@items==1 && ($items[0]->describe||"") =~ /describe||"") =~ /describe ."]"); + my $what = $items[0]->describe =~ / (which acts like an unconditional during parsing)" + : $items[0]->describe =~ / (which acts like an unconditional during parsing)" + : "an unconditional "; + my $caveat = $items[0]->describe =~ / 1 + ? "However, there were also other (useless) items after the leading " + . $items[0]->describe + . ", so you may have been expecting some other behaviour." + : "You can safely ignore this message."; + Parse::RecDescent::_hint("The production starts with $what. That means that the + production can never successfully match, so it was + optimized out of the final parser$caveat. $advice"); + return 0; + } + return 1; +} + +sub changesskip($) +{ + my $item; + foreach $item (@{$_[0]->{"items"}}) + { + if (ref($item) =~ /Parse::RecDescent::(Action|Directive)/) + { + return 1 if $item->{code} =~ /\$skip/; + } + } + return 0; +} + +sub adddirective +{ + my ( $self, $whichop, $line, $name ) = @_; + push @{$self->{op}}, + { type=>$whichop, line=>$line, name=>$name, + offset=> scalar(@{$self->{items}}) }; +} + +sub addscore +{ + my ( $self, $code, $lookahead, $line ) = @_; + $self->additem(Parse::RecDescent::Directive->new( + "local \$^W; + my \$thisscore = do { $code } + 0; + if (!defined(\$score) || \$thisscore>\$score) + { \$score=\$thisscore; \$score_return=\$item[-1]; } + undef;", $lookahead, $line,"") ) + unless $self->{items}[-1]->describe =~ /{op}) + { + while (my $next = pop @{$self->{op}}) + { + Parse::RecDescent::_error("Incomplete <$next->{type}op:...>.", $line); + Parse::RecDescent::_hint( + "The current production ended without completing the + <$next->{type}op:...> directive that started near line + $next->{line}. Did you forget the closing '>'?"); + } + } + return 1; +} + +sub enddirective +{ + my ( $self, $line, $minrep, $maxrep ) = @_; + unless ($self->{op}) + { + Parse::RecDescent::_error("Unmatched > found.", $line); + Parse::RecDescent::_hint( + "A '>' angle bracket was encountered, which typically + indicates the end of a directive. However no suitable + preceding directive was encountered. Typically this + indicates either a extra '>' in the grammar, or a + problem inside the previous directive."); + return; + } + my $op = pop @{$self->{op}}; + my $span = @{$self->{items}} - $op->{offset}; + if ($op->{type} =~ /left|right/) + { + if ($span != 3) + { + Parse::RecDescent::_error( + "Incorrect <$op->{type}op:...> specification: + expected 3 args, but found $span instead", $line); + Parse::RecDescent::_hint( + "The <$op->{type}op:...> directive requires a + sequence of exactly three elements. For example: + <$op->{type}op:leftarg /op/ rightarg>"); + } + else + { + push @{$self->{items}}, + Parse::RecDescent::Operator->new( + $op->{type}, $minrep, $maxrep, splice(@{$self->{"items"}}, -3)); + $self->{items}[-1]->sethashname($self); + $self->{items}[-1]{name} = $op->{name}; + } + } +} + +sub prevwasreturn +{ + my ( $self, $line ) = @_; + unless (@{$self->{items}}) + { + Parse::RecDescent::_error( + "Incorrect specification: + expected item missing", $line); + Parse::RecDescent::_hint( + "The directive requires a + sequence of at least one item. For example: + "); + return; + } + push @{$self->{items}}, + Parse::RecDescent::Result->new(); +} + +sub additem +{ + my ( $self, $item ) = @_; + $item->sethashname($self); + push @{$self->{"items"}}, $item; + return $item; +} + + +sub preitempos +{ + return q + { + push @itempos, {'offset' => {'from'=>$thisoffset, 'to'=>undef}, + 'line' => {'from'=>$thisline, 'to'=>undef}, + 'column' => {'from'=>$thiscolumn, 'to'=>undef} }; + } +} + +sub incitempos +{ + return q + { + $itempos[$#itempos]{'offset'}{'from'} += length($1); + $itempos[$#itempos]{'line'}{'from'} = $thisline; + $itempos[$#itempos]{'column'}{'from'} = $thiscolumn; + } +} + +sub postitempos +{ + return q + { + $itempos[$#itempos]{'offset'}{'to'} = $prevoffset; + $itempos[$#itempos]{'line'}{'to'} = $prevline; + $itempos[$#itempos]{'column'}{'to'} = $prevcolumn; + } +} + +sub code($$$$) +{ + my ($self,$namespace,$rule,$parser) = @_; + my $code = +' + while (!$_matched' + . (defined $self->{"uncommit"} ? '' : ' && !$commit') + . ') + { + ' . + ($self->changesskip() + ? 'local $skip = defined($skip) ? $skip : $Parse::RecDescent::skip;' + : '') .' + Parse::RecDescent::_trace(q{Trying production: [' + . $self->describe . ']}, + Parse::RecDescent::_tracefirst($_[1]), + q{' . $rule ->{name}. '}, + $tracelevel) + if defined $::RD_TRACE; + my $thisprod = $thisrule->{"prods"}[' . $self->{"number"} . ']; + ' . (defined $self->{"error"} ? '' : '$text = $_[1];' ) . ' + my $_savetext; + @item = (q{' . $rule->{"name"} . '}); + %item = (__RULE__ => q{' . $rule->{"name"} . '}); + my $repcount = 0; + +'; + $code .= +' my @itempos = ({}); +' if $parser->{_check}{itempos}; + + my $item; + my $i; + + for ($i = 0; $i < @{$self->{"items"}}; $i++) + { + $item = ${$self->{items}}[$i]; + + $code .= preitempos() if $parser->{_check}{itempos}; + + $code .= $item->code($namespace,$rule,$parser->{_check}); + + $code .= postitempos() if $parser->{_check}{itempos}; + + } + + if ($parser->{_AUTOACTION} && defined($item) && !$item->isa("Parse::RecDescent::Action")) + { + $code .= $parser->{_AUTOACTION}->code($namespace,$rule); + Parse::RecDescent::_warn(1,"Autogenerating action in rule + \"$rule->{name}\": + $parser->{_AUTOACTION}{code}") + and + Parse::RecDescent::_hint("The \$::RD_AUTOACTION was defined, + so any production not ending in an + explicit action has the specified + \"auto-action\" automatically + appended."); + } + elsif ($parser->{_AUTOTREE} && defined($item) && !$item->isa("Parse::RecDescent::Action")) + { + if ($i==1 && $item->isterminal) + { + $code .= $parser->{_AUTOTREE}{TERMINAL}->code($namespace,$rule); + } + else + { + $code .= $parser->{_AUTOTREE}{NODE}->code($namespace,$rule); + } + Parse::RecDescent::_warn(1,"Autogenerating tree-building action in rule + \"$rule->{name}\"") + and + Parse::RecDescent::_hint("The directive was specified, + so any production not ending + in an explicit action has + some parse-tree building code + automatically appended."); + } + + $code .= +' + + Parse::RecDescent::_trace(q{>>Matched production: [' + . $self->describe . ']<<}, + Parse::RecDescent::_tracefirst($text), + q{' . $rule->{name} . '}, + $tracelevel) + if defined $::RD_TRACE; + $_matched = 1; + last; + } + +'; + return $code; +} + +1; + +package Parse::RecDescent::Action; + +sub describe { undef } + +sub sethashname { $_[0]->{hashname} = '__ACTION' . ++$_[1]->{actcount} .'__'; } + +sub new +{ + my $class = ref($_[0]) || $_[0]; + bless + { + "code" => $_[1], + "lookahead" => $_[2], + "line" => $_[3], + }, $class; +} + +sub issubrule { undef } +sub isterminal { 0 } + +sub code($$$$) +{ + my ($self, $namespace, $rule) = @_; + +' + Parse::RecDescent::_trace(q{Trying action}, + Parse::RecDescent::_tracefirst($text), + q{' . $rule->{name} . '}, + $tracelevel) + if defined $::RD_TRACE; + ' . ($self->{"lookahead"} ? '$_savetext = $text;' : '' ) .' + + $_tok = ($_noactions) ? 0 : do ' . $self->{"code"} . '; + ' . ($self->{"lookahead"}<0?'if':'unless') . ' (defined $_tok) + { + Parse::RecDescent::_trace(q{<> (return value: [undef])}) + if defined $::RD_TRACE; + last; + } + Parse::RecDescent::_trace(q{>>Matched action<< (return value: [} + . $_tok . q{])}, + Parse::RecDescent::_tracefirst($text)) + if defined $::RD_TRACE; + push @item, $_tok; + ' . ($self->{line}>=0 ? '$item{'. $self->{hashname} .'}=$_tok;' : '' ) .' + ' . ($self->{"lookahead"} ? '$text = $_savetext;' : '' ) .' +' +} + + +1; + +package Parse::RecDescent::Directive; + +sub sethashname { $_[0]->{hashname} = '__DIRECTIVE' . ++$_[1]->{dircount} . '__'; } + +sub issubrule { undef } +sub isterminal { 0 } +sub describe { $_[1] ? '' : $_[0]->{name} } + +sub new ($$$$$) +{ + my $class = ref($_[0]) || $_[0]; + bless + { + "code" => $_[1], + "lookahead" => $_[2], + "line" => $_[3], + "name" => $_[4], + }, $class; +} + +sub code($$$$) +{ + my ($self, $namespace, $rule) = @_; + +' + ' . ($self->{"lookahead"} ? '$_savetext = $text;' : '' ) .' + + Parse::RecDescent::_trace(q{Trying directive: [' + . $self->describe . ']}, + Parse::RecDescent::_tracefirst($text), + q{' . $rule->{name} . '}, + $tracelevel) + if defined $::RD_TRACE; ' .' + $_tok = do { ' . $self->{"code"} . ' }; + if (defined($_tok)) + { + Parse::RecDescent::_trace(q{>>Matched directive<< (return value: [} + . $_tok . q{])}, + Parse::RecDescent::_tracefirst($text)) + if defined $::RD_TRACE; + } + else + { + Parse::RecDescent::_trace(q{<>}, + Parse::RecDescent::_tracefirst($text)) + if defined $::RD_TRACE; + } + ' . ($self->{"lookahead"} ? '$text = $_savetext and ' : '' ) .' + last ' + . ($self->{"lookahead"}<0?'if':'unless') . ' defined $_tok; + push @item, $item{'.$self->{hashname}.'}=$_tok; + ' . ($self->{"lookahead"} ? '$text = $_savetext;' : '' ) .' +' +} + +1; + +package Parse::RecDescent::UncondReject; + +sub issubrule { undef } +sub isterminal { 0 } +sub describe { $_[1] ? '' : $_[0]->{name} } +sub sethashname { $_[0]->{hashname} = '__DIRECTIVE' . ++$_[1]->{dircount} . '__'; } + +sub new ($$$;$) +{ + my $class = ref($_[0]) || $_[0]; + bless + { + "lookahead" => $_[1], + "line" => $_[2], + "name" => $_[3], + }, $class; +} + +# MARK, YOU MAY WANT TO OPTIMIZE THIS. + + +sub code($$$$) +{ + my ($self, $namespace, $rule) = @_; + +' + Parse::RecDescent::_trace(q{>>Rejecting production<< (found ' + . $self->describe . ')}, + Parse::RecDescent::_tracefirst($text), + q{' . $rule->{name} . '}, + $tracelevel) + if defined $::RD_TRACE; + undef $return; + ' . ($self->{"lookahead"} ? '$_savetext = $text;' : '' ) .' + + $_tok = undef; + ' . ($self->{"lookahead"} ? '$text = $_savetext and ' : '' ) .' + last ' + . ($self->{"lookahead"}<0?'if':'unless') . ' defined $_tok; +' +} + +1; + +package Parse::RecDescent::Error; + +sub issubrule { undef } +sub isterminal { 0 } +sub describe { $_[1] ? '' : $_[0]->{commitonly} ? '' : '' } +sub sethashname { $_[0]->{hashname} = '__DIRECTIVE' . ++$_[1]->{dircount} . '__'; } + +sub new ($$$$$) +{ + my $class = ref($_[0]) || $_[0]; + bless + { + "msg" => $_[1], + "lookahead" => $_[2], + "commitonly" => $_[3], + "line" => $_[4], + }, $class; +} + +sub code($$$$) +{ + my ($self, $namespace, $rule) = @_; + + my $action = ''; + + if ($self->{"msg"}) # ERROR MESSAGE SUPPLIED + { + #WAS: $action .= "Parse::RecDescent::_error(qq{$self->{msg}}" . ',$thisline);'; + $action .= 'push @{$thisparser->{errors}}, [qq{'.$self->{msg}.'},$thisline];'; + + } + else # GENERATE ERROR MESSAGE DURING PARSE + { + $action .= ' + my $rule = $item[0]; + $rule =~ s/_/ /g; + #WAS: Parse::RecDescent::_error("Invalid $rule: " . $expectation->message() ,$thisline); + push @{$thisparser->{errors}}, ["Invalid $rule: " . $expectation->message() ,$thisline]; + '; + } + + my $dir = + new Parse::RecDescent::Directive('if (' . + ($self->{"commitonly"} ? '$commit' : '1') . + ") { do {$action} unless ".' $_noactions; undef } else {0}', + $self->{"lookahead"},0,$self->describe); + $dir->{hashname} = $self->{hashname}; + return $dir->code($namespace, $rule, 0); +} + +1; + +package Parse::RecDescent::Token; + +sub sethashname { $_[0]->{hashname} = '__PATTERN' . ++$_[1]->{patcount} . '__'; } + +sub issubrule { undef } +sub isterminal { 1 } +sub describe ($) { shift->{'description'}} + + +# ARGS ARE: $self, $pattern, $left_delim, $modifiers, $lookahead, $linenum +sub new ($$$$$$) +{ + my $class = ref($_[0]) || $_[0]; + my $pattern = $_[1]; + my $pat = $_[1]; + my $ldel = $_[2]; + my $rdel = $ldel; + $rdel =~ tr/{[(/; + + my $mod = $_[3]; + + my $desc; + + if ($ldel eq '/') { $desc = "$ldel$pattern$rdel$mod" } + else { $desc = "m$ldel$pattern$rdel$mod" } + $desc =~ s/\\/\\\\/g; + $desc =~ s/\$$/\\\$/g; + $desc =~ s/}/\\}/g; + $desc =~ s/{/\\{/g; + + if (!eval "no strict; + local \$SIG{__WARN__} = sub {0}; + '' =~ m$ldel$pattern$rdel" and $@) + { + Parse::RecDescent::_warn(3, "Token pattern \"m$ldel$pattern$rdel\" + may not be a valid regular expression", + $_[5]); + $@ =~ s/ at \(eval.*/./; + Parse::RecDescent::_hint($@); + } + + # QUIETLY PREVENT (WELL-INTENTIONED) CALAMITY + $mod =~ s/[gc]//g; + $pattern =~ s/(\A|[^\\])\\G/$1/g; + + bless + { + "pattern" => $pattern, + "ldelim" => $ldel, + "rdelim" => $rdel, + "mod" => $mod, + "lookahead" => $_[4], + "line" => $_[5], + "description" => $desc, + }, $class; +} + + +sub code($$$$) +{ + my ($self, $namespace, $rule, $check) = @_; + my $ldel = $self->{"ldelim"}; + my $rdel = $self->{"rdelim"}; + my $sdel = $ldel; + my $mod = $self->{"mod"}; + + $sdel =~ s/[[{(<]/{}/; + +my $code = ' + Parse::RecDescent::_trace(q{Trying terminal: [' . $self->describe + . ']}, Parse::RecDescent::_tracefirst($text), + q{' . $rule->{name} . '}, + $tracelevel) + if defined $::RD_TRACE; + $lastsep = ""; + $expectation->is(q{' . ($rule->hasleftmost($self) ? '' + : $self->describe ) . '})->at($text); + ' . ($self->{"lookahead"} ? '$_savetext = $text;' : '' ) . ' + + ' . ($self->{"lookahead"}<0?'if':'unless') + . ' ($text =~ s/\A($skip)/$lastsep=$1 and ""/e and ' + . ($check->{itempos}? 'do {'.Parse::RecDescent::Production::incitempos().' 1} and ' : '') + . ' $text =~ s' . $ldel . '\A(?:' . $self->{"pattern"} . ')' + . $rdel . $sdel . $mod . ') + { + '.($self->{"lookahead"} ? '$text = $_savetext;' : '').' + $expectation->failed(); + Parse::RecDescent::_trace(q{<>}, + Parse::RecDescent::_tracefirst($text)) + if defined $::RD_TRACE; + + last; + } + Parse::RecDescent::_trace(q{>>Matched terminal<< (return value: [} + . $& . q{])}, + Parse::RecDescent::_tracefirst($text)) + if defined $::RD_TRACE; + push @item, $item{'.$self->{hashname}.'}=$&; + ' . ($self->{"lookahead"} ? '$text = $_savetext;' : '' ) .' +'; + + return $code; +} + +1; + +package Parse::RecDescent::Literal; + +sub sethashname { $_[0]->{hashname} = '__STRING' . ++$_[1]->{strcount} . '__'; } + +sub issubrule { undef } +sub isterminal { 1 } +sub describe ($) { shift->{'description'} } + +sub new ($$$$) +{ + my $class = ref($_[0]) || $_[0]; + + my $pattern = $_[1]; + + my $desc = $pattern; + $desc=~s/\\/\\\\/g; + $desc=~s/}/\\}/g; + $desc=~s/{/\\{/g; + + bless + { + "pattern" => $pattern, + "lookahead" => $_[2], + "line" => $_[3], + "description" => "'$desc'", + }, $class; +} + + +sub code($$$$) +{ + my ($self, $namespace, $rule, $check) = @_; + +my $code = ' + Parse::RecDescent::_trace(q{Trying terminal: [' . $self->describe + . ']}, + Parse::RecDescent::_tracefirst($text), + q{' . $rule->{name} . '}, + $tracelevel) + if defined $::RD_TRACE; + $lastsep = ""; + $expectation->is(q{' . ($rule->hasleftmost($self) ? '' + : $self->describe ) . '})->at($text); + ' . ($self->{"lookahead"} ? '$_savetext = $text;' : '' ) . ' + + ' . ($self->{"lookahead"}<0?'if':'unless') + . ' ($text =~ s/\A($skip)/$lastsep=$1 and ""/e and ' + . ($check->{itempos}? 'do {'.Parse::RecDescent::Production::incitempos().' 1} and ' : '') + . ' $text =~ s/\A' . quotemeta($self->{"pattern"}) . '//) + { + '.($self->{"lookahead"} ? '$text = $_savetext;' : '').' + $expectation->failed(); + Parse::RecDescent::_trace(qq{<>}, + Parse::RecDescent::_tracefirst($text)) + if defined $::RD_TRACE; + last; + } + Parse::RecDescent::_trace(q{>>Matched terminal<< (return value: [} + . $& . q{])}, + Parse::RecDescent::_tracefirst($text)) + if defined $::RD_TRACE; + push @item, $item{'.$self->{hashname}.'}=$&; + ' . ($self->{"lookahead"} ? '$text = $_savetext;' : '' ) .' +'; + + return $code; +} + +1; + +package Parse::RecDescent::InterpLit; + +sub sethashname { $_[0]->{hashname} = '__STRING' . ++$_[1]->{strcount} . '__'; } + +sub issubrule { undef } +sub isterminal { 1 } +sub describe ($) { shift->{'description'} } + +sub new ($$$$) +{ + my $class = ref($_[0]) || $_[0]; + + my $pattern = $_[1]; + $pattern =~ s#/#\\/#g; + + my $desc = $pattern; + $desc=~s/\\/\\\\/g; + $desc=~s/}/\\}/g; + $desc=~s/{/\\{/g; + + bless + { + "pattern" => $pattern, + "lookahead" => $_[2], + "line" => $_[3], + "description" => "'$desc'", + }, $class; +} + +sub code($$$$) +{ + my ($self, $namespace, $rule, $check) = @_; + +my $code = ' + Parse::RecDescent::_trace(q{Trying terminal: [' . $self->describe + . ']}, + Parse::RecDescent::_tracefirst($text), + q{' . $rule->{name} . '}, + $tracelevel) + if defined $::RD_TRACE; + $lastsep = ""; + $expectation->is(q{' . ($rule->hasleftmost($self) ? '' + : $self->describe ) . '})->at($text); + ' . ($self->{"lookahead"} ? '$_savetext = $text;' : '' ) . ' + + ' . ($self->{"lookahead"}<0?'if':'unless') + . ' ($text =~ s/\A($skip)/$lastsep=$1 and ""/e and ' + . ($check->{itempos}? 'do {'.Parse::RecDescent::Production::incitempos().' 1} and ' : '') + . ' do { $_tok = "' . $self->{"pattern"} . '"; 1 } and + substr($text,0,length($_tok)) eq $_tok and + do { substr($text,0,length($_tok)) = ""; 1; } + ) + { + '.($self->{"lookahead"} ? '$text = $_savetext;' : '').' + $expectation->failed(); + Parse::RecDescent::_trace(q{<>}, + Parse::RecDescent::_tracefirst($text)) + if defined $::RD_TRACE; + last; + } + Parse::RecDescent::_trace(q{>>Matched terminal<< (return value: [} + . $_tok . q{])}, + Parse::RecDescent::_tracefirst($text)) + if defined $::RD_TRACE; + push @item, $item{'.$self->{hashname}.'}=$_tok; + ' . ($self->{"lookahead"} ? '$text = $_savetext;' : '' ) .' +'; + + return $code; +} + +1; + +package Parse::RecDescent::Subrule; + +sub issubrule ($) { return $_[0]->{"subrule"} } +sub isterminal { 0 } +sub sethashname {} + +sub describe ($) +{ + my $desc = $_[0]->{"implicit"} || $_[0]->{"subrule"}; + $desc = "" if $_[0]->{"matchrule"}; + return $desc; +} + +sub callsyntax($$) +{ + if ($_[0]->{"matchrule"}) + { + return "&{'$_[1]'.qq{$_[0]->{subrule}}}"; + } + else + { + return $_[1].$_[0]->{"subrule"}; + } +} + +sub new ($$$$;$$$) +{ + my $class = ref($_[0]) || $_[0]; + bless + { + "subrule" => $_[1], + "lookahead" => $_[2], + "line" => $_[3], + "implicit" => $_[4] || undef, + "matchrule" => $_[5], + "argcode" => $_[6] || undef, + }, $class; +} + + +sub code($$$$) +{ + my ($self, $namespace, $rule) = @_; + +' + Parse::RecDescent::_trace(q{Trying subrule: [' . $self->{"subrule"} . ']}, + Parse::RecDescent::_tracefirst($text), + q{' . $rule->{"name"} . '}, + $tracelevel) + if defined $::RD_TRACE; + if (1) { no strict qw{refs}; + $expectation->is(' . ($rule->hasleftmost($self) ? 'q{}' + # WAS : 'qq{'.$self->describe.'}' ) . ')->at($text); + : 'q{'.$self->describe.'}' ) . ')->at($text); + ' . ($self->{"lookahead"} ? '$_savetext = $text;' : '' ) + . ($self->{"lookahead"}<0?'if':'unless') + . ' (defined ($_tok = ' + . $self->callsyntax($namespace.'::') + . '($thisparser,$text,$repeating,' + . ($self->{"lookahead"}?'1':'$_noactions') + . ($self->{argcode} ? ",sub { return $self->{argcode} }" + : ',sub { \\@arg }') + . '))) + { + '.($self->{"lookahead"} ? '$text = $_savetext;' : '').' + Parse::RecDescent::_trace(q{<{subrule} . ']>>}, + Parse::RecDescent::_tracefirst($text), + q{' . $rule->{"name"} .'}, + $tracelevel) + if defined $::RD_TRACE; + $expectation->failed(); + last; + } + Parse::RecDescent::_trace(q{>>Matched subrule: [' + . $self->{subrule} . ']<< (return value: [} + . $_tok . q{]}, + + Parse::RecDescent::_tracefirst($text), + q{' . $rule->{"name"} .'}, + $tracelevel) + if defined $::RD_TRACE; + $item{q{' . $self->{subrule} . '}} = $_tok; + push @item, $_tok; + ' . ($self->{"lookahead"} ? '$text = $_savetext;' : '' ) .' + } +' +} + +package Parse::RecDescent::Repetition; + +sub issubrule ($) { return $_[0]->{"subrule"} } +sub isterminal { 0 } +sub sethashname { } + +sub describe ($) +{ + my $desc = $_[0]->{"expected"} || $_[0]->{"subrule"}; + $desc = "" if $_[0]->{"matchrule"}; + return $desc; +} + +sub callsyntax($$) +{ + if ($_[0]->{matchrule}) + { return "sub { goto &{''.qq{$_[1]$_[0]->{subrule}}} }"; } + else + { return "\\&$_[1]$_[0]->{subrule}"; } +} + +sub new ($$$$$$$$$$) +{ + my ($self, $subrule, $repspec, $min, $max, $lookahead, $line, $parser, $matchrule, $argcode) = @_; + my $class = ref($self) || $self; + ($max, $min) = ( $min, $max) if ($max<$min); + + my $desc; + if ($subrule=~/\A_alternation_\d+_of_production_\d+_of_rule/) + { $desc = $parser->{"rules"}{$subrule}->expected } + + if ($lookahead) + { + if ($min>0) + { + return new Parse::RecDescent::Subrule($subrule,$lookahead,$line,$desc,$matchrule,$argcode); + } + else + { + Parse::RecDescent::_error("Not symbol (\"!\") before + \"$subrule\" doesn't make + sense.",$line); + Parse::RecDescent::_hint("Lookahead for negated optional + repetitions (such as + \"!$subrule($repspec)\" can never + succeed, since optional items always + match (zero times at worst). + Did you mean a single \"!$subrule\", + instead?"); + } + } + bless + { + "subrule" => $subrule, + "repspec" => $repspec, + "min" => $min, + "max" => $max, + "lookahead" => $lookahead, + "line" => $line, + "expected" => $desc, + "argcode" => $argcode || undef, + "matchrule" => $matchrule, + }, $class; +} + +sub code($$$$) +{ + my ($self, $namespace, $rule) = @_; + + my ($subrule, $repspec, $min, $max, $lookahead) = + @{$self}{ qw{subrule repspec min max lookahead} }; + +' + Parse::RecDescent::_trace(q{Trying repeated subrule: [' . $self->describe . ']}, + Parse::RecDescent::_tracefirst($text), + q{' . $rule->{"name"} . '}, + $tracelevel) + if defined $::RD_TRACE; + $expectation->is(' . ($rule->hasleftmost($self) ? 'q{}' + # WAS : 'qq{'.$self->describe.'}' ) . ')->at($text); + : 'q{'.$self->describe.'}' ) . ')->at($text); + ' . ($self->{"lookahead"} ? '$_savetext = $text;' : '' ) .' + unless (defined ($_tok = $thisparser->_parserepeat($text, ' + . $self->callsyntax($namespace.'::') + . ', ' . $min . ', ' . $max . ', ' + . ($self->{"lookahead"}?'1':'$_noactions') + . ',$expectation,' + . ($self->{argcode} ? "sub { return $self->{argcode} }" + : 'undef') + . '))) + { + Parse::RecDescent::_trace(q{<describe . ']>>}, + Parse::RecDescent::_tracefirst($text), + q{' . $rule->{"name"} .'}, + $tracelevel) + if defined $::RD_TRACE; + last; + } + Parse::RecDescent::_trace(q{>>Matched repeated subrule: [' + . $self->{subrule} . ']<< (} + . @$_tok . q{ times)}, + + Parse::RecDescent::_tracefirst($text), + q{' . $rule->{"name"} .'}, + $tracelevel) + if defined $::RD_TRACE; + $item{q{' . "$self->{subrule}($self->{repspec})" . '}} = $_tok; + push @item, $_tok; + ' . ($self->{"lookahead"} ? '$text = $_savetext;' : '' ) .' + +' +} + +package Parse::RecDescent::Result; + +sub issubrule { 0 } +sub isterminal { 0 } +sub describe { '' } + +sub new +{ + my ($class, $pos) = @_; + + bless {}, $class; +} + +sub code($$$$) +{ + my ($self, $namespace, $rule) = @_; + + ' + $return = $item[-1]; + '; +} + +package Parse::RecDescent::Operator; + +my @opertype = ( " non-optional", "n optional" ); + +sub issubrule { 0 } +sub isterminal { 0 } + +sub describe { $_[0]->{"expected"} } +sub sethashname { $_[0]->{hashname} = '__DIRECTIVE' . ++$_[1]->{dircount} . '__'; } + + +sub new +{ + my ($class, $type, $minrep, $maxrep, $leftarg, $op, $rightarg) = @_; + + bless + { + "type" => "${type}op", + "leftarg" => $leftarg, + "op" => $op, + "min" => $minrep, + "max" => $maxrep, + "rightarg" => $rightarg, + "expected" => "<${type}op: ".$leftarg->describe." ".$op->describe." ".$rightarg->describe.">", + }, $class; +} + +sub code($$$$) +{ + my ($self, $namespace, $rule) = @_; + + my ($leftarg, $op, $rightarg) = + @{$self}{ qw{leftarg op rightarg} }; + + my $code = ' + Parse::RecDescent::_trace(q{Trying operator: [' . $self->describe . ']}, + Parse::RecDescent::_tracefirst($text), + q{' . $rule->{"name"} . '}, + $tracelevel) + if defined $::RD_TRACE; + $expectation->is(' . ($rule->hasleftmost($self) ? 'q{}' + # WAS : 'qq{'.$self->describe.'}' ) . ')->at($text); + : 'q{'.$self->describe.'}' ) . ')->at($text); + + $_tok = undef; + OPLOOP: while (1) + { + $repcount = 0; + my @item; + '; + + if ($self->{type} eq "leftop" ) + { + $code .= ' + # MATCH LEFTARG + ' . $leftarg->code(@_[1..2]) . ' + + $repcount++; + + my $savetext = $text; + my $backtrack; + + # MATCH (OP RIGHTARG)(s) + while ($repcount < ' . $self->{max} . ') + { + $backtrack = 0; + ' . $op->code(@_[1..2]) . ' + ' . ($op->isterminal() ? 'pop @item;' : '$backtrack=1;' ) . ' + ' . (ref($op) eq 'Parse::RecDescent::Token' + ? 'if (defined $1) {push @item, $item{'.($self->{name}||$self->{hashname}).'}=$1; $backtrack=1;}' + : "" ) . ' + ' . $rightarg->code(@_[1..2]) . ' + $savetext = $text; + $repcount++; + } + $text = $savetext; + pop @item if $backtrack; + + '; + } + else + { + $code .= ' + my $savetext = $text; + my $backtrack; + # MATCH (LEFTARG OP)(s) + while ($repcount < ' . $self->{max} . ') + { + $backtrack = 0; + ' . $leftarg->code(@_[1..2]) . ' + $repcount++; + $backtrack = 1; + ' . $op->code(@_[1..2]) . ' + $savetext = $text; + ' . ($op->isterminal() ? 'pop @item;' : "" ) . ' + ' . (ref($op) eq 'Parse::RecDescent::Token' ? 'do { push @item, $item{'.($self->{name}||$self->{hashname}).'}=$1; } if defined $1;' : "" ) . ' + } + $text = $savetext; + pop @item if $backtrack; + + # MATCH RIGHTARG + ' . $rightarg->code(@_[1..2]) . ' + $repcount++; + '; + } + + $code .= 'unless (@item) { undef $_tok; last }' unless $self->{min}==0; + + $code .= ' + $_tok = [ @item ]; + last; + } + + unless ($repcount>='.$self->{min}.') + { + Parse::RecDescent::_trace(q{<describe + . ']>>}, + Parse::RecDescent::_tracefirst($text), + q{' . $rule->{"name"} .'}, + $tracelevel) + if defined $::RD_TRACE; + $expectation->failed(); + last; + } + Parse::RecDescent::_trace(q{>>Matched operator: [' + . $self->describe + . ']<< (return value: [} + . qq{@{$_tok||[]}} . q{]}, + Parse::RecDescent::_tracefirst($text), + q{' . $rule->{"name"} .'}, + $tracelevel) + if defined $::RD_TRACE; + + push @item, $item{'.($self->{name}||$self->{hashname}).'}=$_tok||[]; + +'; + return $code; +} + + +package Parse::RecDescent::Expectation; + +sub new ($) +{ + bless { + "failed" => 0, + "expected" => "", + "unexpected" => "", + "lastexpected" => "", + "lastunexpected" => "", + "defexpected" => $_[1], + }; +} + +sub is ($$) +{ + $_[0]->{lastexpected} = $_[1]; return $_[0]; +} + +sub at ($$) +{ + $_[0]->{lastunexpected} = $_[1]; return $_[0]; +} + +sub failed ($) +{ + return unless $_[0]->{lastexpected}; + $_[0]->{expected} = $_[0]->{lastexpected} unless $_[0]->{failed}; + $_[0]->{unexpected} = $_[0]->{lastunexpected} unless $_[0]->{failed}; + $_[0]->{failed} = 1; +} + +sub message ($) +{ + my ($self) = @_; + $self->{expected} = $self->{defexpected} unless $self->{expected}; + $self->{expected} =~ s/_/ /g; + if (!$self->{unexpected} || $self->{unexpected} =~ /\A\s*\Z/s) + { + return "Was expecting $self->{expected}"; + } + else + { + $self->{unexpected} =~ /\s*(.*)/; + return "Was expecting $self->{expected} but found \"$1\" instead"; + } +} + +1; + +package Parse::RecDescent; + +use Carp; +use vars qw ( $AUTOLOAD $VERSION ); + +my $ERRORS = 0; + +$VERSION = '1.94'; + +# BUILDING A PARSER + +my $nextnamespace = "namespace000001"; + +sub _nextnamespace() +{ + return "Parse::RecDescent::" . $nextnamespace++; +} + +sub new ($$$) +{ + my $class = ref($_[0]) || $_[0]; + local $Parse::RecDescent::compiling = $_[2]; + my $name_space_name = defined $_[3] + ? "Parse::RecDescent::".$_[3] + : _nextnamespace(); + my $self = + { + "rules" => {}, + "namespace" => $name_space_name, + "startcode" => '', + "localvars" => '', + "_AUTOACTION" => undef, + "_AUTOTREE" => undef, + }; + if ($::RD_AUTOACTION) + { + my $sourcecode = $::RD_AUTOACTION; + $sourcecode = "{ $sourcecode }" + unless $sourcecode =~ /\A\s*\{.*\}\s*\Z/; + $self->{_check}{itempos} = + $sourcecode =~ /\@itempos\b|\$itempos\s*\[/; + $self->{_AUTOACTION} + = new Parse::RecDescent::Action($sourcecode,0,-1) + } + + bless $self, $class; + shift; + return $self->Replace(@_) +} + +sub Compile($$$$) { + + die "Compilation of Parse::RecDescent grammars not yet implemented\n"; +} + +sub DESTROY {} # SO AUTOLOADER IGNORES IT + +# BUILDING A GRAMMAR.... + +sub Replace ($$) +{ + splice(@_, 2, 0, 1); + return _generate(@_); +} + +sub Extend ($$) +{ + splice(@_, 2, 0, 0); + return _generate(@_); +} + +sub _no_rule ($$;$) +{ + _error("Ruleless $_[0] at start of grammar.",$_[1]); + my $desc = $_[2] ? "\"$_[2]\"" : ""; + _hint("You need to define a rule for the $_[0] $desc + to be part of."); +} + +my $NEGLOOKAHEAD = '\G(\s*\.\.\.\!)'; +my $POSLOOKAHEAD = '\G(\s*\.\.\.)'; +my $RULE = '\G\s*(\w+)[ \t]*:'; +my $PROD = '\G\s*([|])'; +my $TOKEN = q{\G\s*/((\\\\/|[^/])*)/([cgimsox]*)}; +my $MTOKEN = q{\G\s*(m\s*[^\w\s])}; +my $LITERAL = q{\G\s*'((\\\\['\\\\]|[^'])*)'}; +my $INTERPLIT = q{\G\s*"((\\\\["\\\\]|[^"])*)"}; +my $SUBRULE = '\G\s*(\w+)'; +my $MATCHRULE = '\G(\s*{_check}{itempos} = ($grammar =~ /\@itempos\b|\$itempos\s*\[/) + unless $self->{_check}{itempos}; + for (qw(thisoffset thiscolumn prevline prevoffset prevcolumn)) + { + $self->{_check}{$_} = + ($grammar =~ /\$$_/) || $self->{_check}{itempos} + unless $self->{_check}{$_}; + } + my $line; + + my $rule = undef; + my $prod = undef; + my $item = undef; + my $lastgreedy = ''; + pos $grammar = 0; + study $grammar; + + while (pos $grammar < length $grammar) + { + $line = $lines - _linecount($grammar) + 1; + my $commitonly; + my $code = ""; + my @components = (); + if ($grammar =~ m/$COMMENT/gco) + { + _parse("a comment",0,$line); + next; + } + elsif ($grammar =~ m/$NEGLOOKAHEAD/gco) + { + _parse("a negative lookahead",$aftererror,$line); + $lookahead = $lookahead ? -$lookahead : -1; + $lookaheadspec .= $1; + next; # SKIP LOOKAHEAD RESET AT END OF while LOOP + } + elsif ($grammar =~ m/$POSLOOKAHEAD/gco) + { + _parse("a positive lookahead",$aftererror,$line); + $lookahead = $lookahead ? $lookahead : 1; + $lookaheadspec .= $1; + next; # SKIP LOOKAHEAD RESET AT END OF while LOOP + } + elsif ($grammar =~ m/(?=$ACTION)/gco + and do { ($code) = extract_codeblock($grammar); $code }) + { + _parse("an action", $aftererror, $line, $code); + $item = new Parse::RecDescent::Action($code,$lookahead,$line); + $prod and $prod->additem($item) + or $self->_addstartcode($code); + } + elsif ($grammar =~ m/(?=$IMPLICITSUBRULE)/gco + and do { ($code) = extract_codeblock($grammar,'{([',undef,'(',1); + $code }) + { + $code =~ s/\A\s*\(|\)\Z//g; + _parse("an implicit subrule", $aftererror, $line, + "( $code )"); + my $implicit = $rule->nextimplicit; + $self->_generate("$implicit : $code",$replace,1); + my $pos = pos $grammar; + substr($grammar,$pos,0,$implicit); + pos $grammar = $pos;; + } + elsif ($grammar =~ m/$ENDDIRECTIVEMK/gco) + { + + # EXTRACT TRAILING REPETITION SPECIFIER (IF ANY) + + my ($minrep,$maxrep) = (1,$MAXREP); + if ($grammar =~ m/\G[(]/gc) + { + pos($grammar)--; + + if ($grammar =~ m/$OPTIONAL/gco) + { ($minrep, $maxrep) = (0,1) } + elsif ($grammar =~ m/$ANY/gco) + { $minrep = 0 } + elsif ($grammar =~ m/$EXACTLY/gco) + { ($minrep, $maxrep) = ($1,$1) } + elsif ($grammar =~ m/$BETWEEN/gco) + { ($minrep, $maxrep) = ($1,$2) } + elsif ($grammar =~ m/$ATLEAST/gco) + { $minrep = $1 } + elsif ($grammar =~ m/$ATMOST/gco) + { $maxrep = $1 } + elsif ($grammar =~ m/$MANY/gco) + { } + elsif ($grammar =~ m/$BADREP/gco) + { + _parse("an invalid repetition specifier", 0,$line); + _error("Incorrect specification of a repeated directive", + $line); + _hint("Repeated directives cannot have + a maximum repetition of zero, nor can they have + negative components in their ranges."); + } + } + + $prod && $prod->enddirective($line,$minrep,$maxrep); + } + elsif ($grammar =~ m/\G\s*<[^m]/gc) + { + pos($grammar)-=2; + + if ($grammar =~ m/$OPMK/gco) + { + # $DB::single=1; + _parse("a $1-associative operator directive", $aftererror, $line, "<$1op:...>"); + $prod->adddirective($1, $line,$2||''); + } + elsif ($grammar =~ m/$UNCOMMITMK/gco) + { + _parse("an uncommit marker", $aftererror,$line); + $item = new Parse::RecDescent::Directive('$commit=0;1', + $lookahead,$line,""); + $prod and $prod->additem($item) + or _no_rule("",$line); + } + elsif ($grammar =~ m/$QUOTELIKEMK/gco) + { + _parse("an perl quotelike marker", $aftererror,$line); + $item = new Parse::RecDescent::Directive( + 'my ($match,@res); + ($match,$text,undef,@res) = + Text::Balanced::extract_quotelike($text,$skip); + $match ? \@res : undef; + ', $lookahead,$line,""); + $prod and $prod->additem($item) + or _no_rule("",$line); + } + elsif ($grammar =~ m/$CODEBLOCKMK/gco) + { + my $outer = $1||"{}"; + _parse("an perl codeblock marker", $aftererror,$line); + $item = new Parse::RecDescent::Directive( + 'Text::Balanced::extract_codeblock($text,undef,$skip,\''.$outer.'\'); + ', $lookahead,$line,""); + $prod and $prod->additem($item) + or _no_rule("",$line); + } + elsif ($grammar =~ m/$VARIABLEMK/gco) + { + _parse("an perl variable marker", $aftererror,$line); + $item = new Parse::RecDescent::Directive( + 'Text::Balanced::extract_variable($text,$skip); + ', $lookahead,$line,""); + $prod and $prod->additem($item) + or _no_rule("",$line); + } + elsif ($grammar =~ m/$NOCHECKMK/gco) + { + _parse("a disable checking marker", $aftererror,$line); + if ($rule) + { + _error(" directive not at start of grammar", $line); + _hint("The directive can only + be specified at the start of a + grammar (before the first rule + is defined."); + } + else + { + local $::RD_CHECK = 1; + } + } + elsif ($grammar =~ m/$AUTOSTUBMK/gco) + { + _parse("an autostub marker", $aftererror,$line); + $::RD_AUTOSTUB = ""; + } + elsif ($grammar =~ m/$AUTORULEMK/gco) + { + _parse("an autorule marker", $aftererror,$line); + $::RD_AUTOSTUB = $1; + } + elsif ($grammar =~ m/$AUTOTREEMK/gco) + { + _parse("an autotree marker", $aftererror,$line); + if ($rule) + { + _error(" directive not at start of grammar", $line); + _hint("The directive can only + be specified at the start of a + grammar (before the first rule + is defined."); + } + else + { + undef $self->{_AUTOACTION}; + $self->{_AUTOTREE}{NODE} + = new Parse::RecDescent::Action(q{{bless \%item, $item[0]}},0,-1); + $self->{_AUTOTREE}{TERMINAL} + = new Parse::RecDescent::Action(q{{bless {__VALUE__=>$item[1]}, $item[0]}},0,-1); + } + } + + elsif ($grammar =~ m/$REJECTMK/gco) + { + _parse("an reject marker", $aftererror,$line); + $item = new Parse::RecDescent::UncondReject($lookahead,$line,""); + $prod and $prod->additem($item) + or _no_rule("",$line); + } + elsif ($grammar =~ m/(?=$CONDREJECTMK)/gco + and do { ($code) = extract_codeblock($grammar,'{',undef,'<'); + $code }) + { + _parse("a (conditional) reject marker", $aftererror,$line); + $code =~ /\A\s*\Z/s; + $item = new Parse::RecDescent::Directive( + "($1) ? undef : 1", $lookahead,$line,""); + $prod and $prod->additem($item) + or _no_rule("",$line); + } + elsif ($grammar =~ m/(?=$SCOREMK)/gco + and do { ($code) = extract_codeblock($grammar,'{',undef,'<'); + $code }) + { + _parse("a score marker", $aftererror,$line); + $code =~ /\A\s*\Z/s; + $prod and $prod->addscore($1, $lookahead, $line) + or _no_rule($code,$line); + } + elsif ($grammar =~ m/(?=$AUTOSCOREMK)/gco + and do { ($code) = extract_codeblock($grammar,'{',undef,'<'); + $code; + } ) + { + _parse("an autoscore specifier", $aftererror,$line,$code); + $code =~ /\A\s*\Z/s; + + $rule and $rule->addautoscore($1,$self) + or _no_rule($code,$line); + + $item = new Parse::RecDescent::UncondReject($lookahead,$line,$code); + $prod and $prod->additem($item) + or _no_rule($code,$line); + } + elsif ($grammar =~ m/$RESYNCMK/gco) + { + _parse("a resync to newline marker", $aftererror,$line); + $item = new Parse::RecDescent::Directive( + 'if ($text =~ s/\A[^\n]*\n//) { $return = 0; $& } else { undef }', + $lookahead,$line,""); + $prod and $prod->additem($item) + or _no_rule("",$line); + } + elsif ($grammar =~ m/(?=$RESYNCPATMK)/gco + and do { ($code) = extract_bracketed($grammar,'<'); + $code }) + { + _parse("a resync with pattern marker", $aftererror,$line); + $code =~ /\A\s*\Z/s; + $item = new Parse::RecDescent::Directive( + 'if ($text =~ s/\A'.$1.'//) { $return = 0; $& } else { undef }', + $lookahead,$line,$code); + $prod and $prod->additem($item) + or _no_rule($code,$line); + } + elsif ($grammar =~ m/(?=$SKIPMK)/gco + and do { ($code) = extract_codeblock($grammar,'<'); + $code }) + { + _parse("a skip marker", $aftererror,$line); + $code =~ /\A\s*\Z/s; + $item = new Parse::RecDescent::Directive( + 'my $oldskip = $skip; $skip='.$1.'; $oldskip', + $lookahead,$line,$code); + $prod and $prod->additem($item) + or _no_rule($code,$line); + } + elsif ($grammar =~ m/(?=$RULEVARPATMK)/gco + and do { ($code) = extract_codeblock($grammar,'{',undef,'<'); + $code; + } ) + { + _parse("a rule variable specifier", $aftererror,$line,$code); + $code =~ /\A\s*\Z/s; + + $rule and $rule->addvar($1,$self) + or _no_rule($code,$line); + + $item = new Parse::RecDescent::UncondReject($lookahead,$line,$code); + $prod and $prod->additem($item) + or _no_rule($code,$line); + } + elsif ($grammar =~ m/(?=$DEFERPATMK)/gco + and do { ($code) = extract_codeblock($grammar,'{',undef,'<'); + $code; + } ) + { + _parse("a deferred action specifier", $aftererror,$line,$code); + $code =~ s/\A\s*\Z/$1/s; + if ($code =~ /\A\s*[^{]|[^}]\s*\Z/) + { + $code = "{ $code }" + } + + $item = new Parse::RecDescent::Directive( + "push \@{\$thisparser->{deferred}}, sub $code;", + $lookahead,$line,""); + $prod and $prod->additem($item) + or _no_rule("",$line); + + $self->{deferrable} = 1; + } + elsif ($grammar =~ m/(?=$TOKENPATMK)/gco + and do { ($code) = extract_codeblock($grammar,'{',undef,'<'); + $code; + } ) + { + _parse("a token constructor", $aftererror,$line,$code); + $code =~ s/\A\s*\Z/$1/s; + + my $types = eval 'no strict; local $SIG{__WARN__} = sub {0}; my @arr=('.$code.'); @arr' || (); + if (!$types) + { + _error("Incorrect token specification: \"$@\"", $line); + _hint("The directive requires a list + of one or more strings representing possible + types of the specified token. For example: + "); + } + else + { + $item = new Parse::RecDescent::Directive( + 'no strict; + $return = { text => $item[-1] }; + @{$return->{type}}{'.$code.'} = (1..'.$types.');', + $lookahead,$line,""); + $prod and $prod->additem($item) + or _no_rule("",$line); + } + } + elsif ($grammar =~ m/$COMMITMK/gco) + { + _parse("an commit marker", $aftererror,$line); + $item = new Parse::RecDescent::Directive('$commit = 1', + $lookahead,$line,""); + $prod and $prod->additem($item) + or _no_rule("",$line); + } + elsif ($grammar =~ m/$AUTOERRORMK/gco) + { + $commitonly = $1; + _parse("an error marker", $aftererror,$line); + $item = new Parse::RecDescent::Error('',$lookahead,$1,$line); + $prod and $prod->additem($item) + or _no_rule("",$line); + $aftererror = !$commitonly; + } + elsif ($grammar =~ m/(?=$MSGERRORMK)/gco + and do { $commitonly = $1; + ($code) = extract_bracketed($grammar,'<'); + $code }) + { + _parse("an error marker", $aftererror,$line,$code); + $code =~ /\A\s*\Z/s; + $item = new Parse::RecDescent::Error($1,$lookahead,$commitonly,$line); + $prod and $prod->additem($item) + or _no_rule("$code",$line); + $aftererror = !$commitonly; + } + elsif (do { $commitonly = $1; + ($code) = extract_bracketed($grammar,'<'); + $code }) + { + if ($code =~ /^<[A-Z_]+>$/) + { + _error("Token items are not yet + supported: \"$code\"", + $line); + _hint("Items like $code that consist of angle + brackets enclosing a sequence of + uppercase characters will eventually + be used to specify pre-lexed tokens + in a grammar. That functionality is not + yet implemented. Or did you misspell + \"$code\"?"); + } + else + { + _error("Untranslatable item encountered: \"$code\"", + $line); + _hint("Did you misspell \"$code\" + or forget to comment it out?"); + } + } + } + elsif ($grammar =~ m/$RULE/gco) + { + _parseunneg("a rule declaration", 0, + $lookahead,$line) or next; + my $rulename = $1; + if ($rulename =~ /Replace|Extend|Precompile|Save/ ) + { + _warn(2,"Rule \"$rulename\" hidden by method + Parse::RecDescent::$rulename",$line) + and + _hint("The rule named \"$rulename\" cannot be directly + called through the Parse::RecDescent object + for this grammar (although it may still + be used as a subrule of other rules). + It can't be directly called because + Parse::RecDescent::$rulename is already defined (it + is the standard method of all + parsers)."); + } + $rule = new Parse::RecDescent::Rule($rulename,$self,$line,$replace); + $prod->check_pending($line) if $prod; + $prod = $rule->addprod( new Parse::RecDescent::Production ); + $aftererror = 0; + } + elsif ($grammar =~ m/$UNCOMMITPROD/gco) + { + pos($grammar)-=9; + _parseunneg("a new (uncommitted) production", + 0, $lookahead, $line) or next; + + $prod->check_pending($line) if $prod; + $prod = new Parse::RecDescent::Production($line,1); + $rule and $rule->addprod($prod) + or _no_rule("",$line); + $aftererror = 0; + } + elsif ($grammar =~ m/$ERRORPROD/gco) + { + pos($grammar)-=6; + _parseunneg("a new (error) production", $aftererror, + $lookahead,$line) or next; + $prod->check_pending($line) if $prod; + $prod = new Parse::RecDescent::Production($line,0,1); + $rule and $rule->addprod($prod) + or _no_rule("",$line); + $aftererror = 0; + } + elsif ($grammar =~ m/$PROD/gco) + { + _parseunneg("a new production", 0, + $lookahead,$line) or next; + $rule + and (!$prod || $prod->check_pending($line)) + and $prod = $rule->addprod(new Parse::RecDescent::Production($line)) + or _no_rule("production",$line); + $aftererror = 0; + } + elsif ($grammar =~ m/$LITERAL/gco) + { + ($code = $1) =~ s/\\\\/\\/g; + _parse("a literal terminal", $aftererror,$line,$1); + $item = new Parse::RecDescent::Literal($code,$lookahead,$line); + $prod and $prod->additem($item) + or _no_rule("literal terminal",$line,"'$1'"); + } + elsif ($grammar =~ m/$INTERPLIT/gco) + { + _parse("an interpolated literal terminal", $aftererror,$line); + $item = new Parse::RecDescent::InterpLit($1,$lookahead,$line); + $prod and $prod->additem($item) + or _no_rule("interpolated literal terminal",$line,"'$1'"); + } + elsif ($grammar =~ m/$TOKEN/gco) + { + _parse("a /../ pattern terminal", $aftererror,$line); + $item = new Parse::RecDescent::Token($1,'/',$3?$3:'',$lookahead,$line); + $prod and $prod->additem($item) + or _no_rule("pattern terminal",$line,"/$1/"); + } + elsif ($grammar =~ m/(?=$MTOKEN)/gco + and do { ($code, undef, @components) + = extract_quotelike($grammar); + $code } + ) + + { + _parse("an m/../ pattern terminal", $aftererror,$line,$code); + $item = new Parse::RecDescent::Token(@components[3,2,8], + $lookahead,$line); + $prod and $prod->additem($item) + or _no_rule("pattern terminal",$line,$code); + } + elsif ($grammar =~ m/(?=$MATCHRULE)/gco + and do { ($code) = extract_bracketed($grammar,'<'); + $code + } + or $grammar =~ m/$SUBRULE/gco + and $code = $1) + { + my $name = $code; + my $matchrule = 0; + if (substr($name,0,1) eq '<') + { + $name =~ s/$MATCHRULE\s*//; + $name =~ s/\s*>\Z//; + $matchrule = 1; + } + + # EXTRACT TRAILING ARG LIST (IF ANY) + + my ($argcode) = extract_codeblock($grammar, "[]",'') || ''; + + # EXTRACT TRAILING REPETITION SPECIFIER (IF ANY) + + if ($grammar =~ m/\G[(]/gc) + { + pos($grammar)--; + + if ($grammar =~ m/$OPTIONAL/gco) + { + _parse("an zero-or-one subrule match", $aftererror,$line,"$code$argcode($1)"); + $item = new Parse::RecDescent::Repetition($name,$1,0,1, + $lookahead,$line, + $self, + $matchrule, + $argcode); + $prod and $prod->additem($item) + or _no_rule("repetition",$line,"$code$argcode($1)"); + + !$matchrule and $rule and $rule->addcall($name); + } + elsif ($grammar =~ m/$ANY/gco) + { + _parse("a zero-or-more subrule match", $aftererror,$line,"$code$argcode($1)"); + if ($2) + { + my $pos = pos $grammar; + substr($grammar,$pos,0, + "(s?) "); + + pos $grammar = $pos; + } + else + { + $item = new Parse::RecDescent::Repetition($name,$1,0,$MAXREP, + $lookahead,$line, + $self, + $matchrule, + $argcode); + $prod and $prod->additem($item) + or _no_rule("repetition",$line,"$code$argcode($1)"); + + !$matchrule and $rule and $rule->addcall($name); + + _check_insatiable($name,$1,$grammar,$line) if $::RD_CHECK; + } + } + elsif ($grammar =~ m/$MANY/gco) + { + _parse("a one-or-more subrule match", $aftererror,$line,"$code$argcode($1)"); + if ($2) + { + # $DB::single=1; + my $pos = pos $grammar; + substr($grammar,$pos,0, + " "); + + pos $grammar = $pos; + } + else + { + $item = new Parse::RecDescent::Repetition($name,$1,1,$MAXREP, + $lookahead,$line, + $self, + $matchrule, + $argcode); + + $prod and $prod->additem($item) + or _no_rule("repetition",$line,"$code$argcode($1)"); + + !$matchrule and $rule and $rule->addcall($name); + + _check_insatiable($name,$1,$grammar,$line) if $::RD_CHECK; + } + } + elsif ($grammar =~ m/$EXACTLY/gco) + { + _parse("an exactly-$1-times subrule match", $aftererror,$line,"$code$argcode($1)"); + if ($2) + { + my $pos = pos $grammar; + substr($grammar,$pos,0, + "($1) "); + + pos $grammar = $pos; + } + else + { + $item = new Parse::RecDescent::Repetition($name,$1,$1,$1, + $lookahead,$line, + $self, + $matchrule, + $argcode); + $prod and $prod->additem($item) + or _no_rule("repetition",$line,"$code$argcode($1)"); + + !$matchrule and $rule and $rule->addcall($name); + } + } + elsif ($grammar =~ m/$BETWEEN/gco) + { + _parse("a $1-to-$2 subrule match", $aftererror,$line,"$code$argcode($1..$2)"); + if ($3) + { + my $pos = pos $grammar; + substr($grammar,$pos,0, + "($1..$2) "); + + pos $grammar = $pos; + } + else + { + $item = new Parse::RecDescent::Repetition($name,"$1..$2",$1,$2, + $lookahead,$line, + $self, + $matchrule, + $argcode); + $prod and $prod->additem($item) + or _no_rule("repetition",$line,"$code$argcode($1..$2)"); + + !$matchrule and $rule and $rule->addcall($name); + } + } + elsif ($grammar =~ m/$ATLEAST/gco) + { + _parse("a $1-or-more subrule match", $aftererror,$line,"$code$argcode($1..)"); + if ($2) + { + my $pos = pos $grammar; + substr($grammar,$pos,0, + "($1..) "); + + pos $grammar = $pos; + } + else + { + $item = new Parse::RecDescent::Repetition($name,"$1..",$1,$MAXREP, + $lookahead,$line, + $self, + $matchrule, + $argcode); + $prod and $prod->additem($item) + or _no_rule("repetition",$line,"$code$argcode($1..)"); + + !$matchrule and $rule and $rule->addcall($name); + _check_insatiable($name,"$1..",$grammar,$line) if $::RD_CHECK; + } + } + elsif ($grammar =~ m/$ATMOST/gco) + { + _parse("a one-to-$1 subrule match", $aftererror,$line,"$code$argcode(..$1)"); + if ($2) + { + my $pos = pos $grammar; + substr($grammar,$pos,0, + "(..$1) "); + + pos $grammar = $pos; + } + else + { + $item = new Parse::RecDescent::Repetition($name,"..$1",1,$1, + $lookahead,$line, + $self, + $matchrule, + $argcode); + $prod and $prod->additem($item) + or _no_rule("repetition",$line,"$code$argcode(..$1)"); + + !$matchrule and $rule and $rule->addcall($name); + } + } + elsif ($grammar =~ m/$BADREP/gco) + { + _parse("an subrule match with invalid repetition specifier", 0,$line); + _error("Incorrect specification of a repeated subrule", + $line); + _hint("Repeated subrules like \"$code$argcode$&\" cannot have + a maximum repetition of zero, nor can they have + negative components in their ranges."); + } + } + else + { + _parse("a subrule match", $aftererror,$line,$code); + my $desc; + if ($name=~/\A_alternation_\d+_of_production_\d+_of_rule/) + { $desc = $self->{"rules"}{$name}->expected } + $item = new Parse::RecDescent::Subrule($name, + $lookahead, + $line, + $desc, + $matchrule, + $argcode); + + $prod and $prod->additem($item) + or _no_rule("(sub)rule",$line,$name); + + !$matchrule and $rule and $rule->addcall($name); + } + } + elsif ($grammar =~ m/$LONECOLON/gco ) + { + _error("Unexpected colon encountered", $line); + _hint("Did you mean \"|\" (to start a new production)? + Or perhaps you forgot that the colon + in a rule definition must be + on the same line as the rule name?"); + } + elsif ($grammar =~ m/$ACTION/gco ) # BAD ACTION, ALREADY FAILED + { + _error("Malformed action encountered", + $line); + _hint("Did you forget the closing curly bracket + or is there a syntax error in the action?"); + } + elsif ($grammar =~ m/$OTHER/gco ) + { + _error("Untranslatable item encountered: \"$1\"", + $line); + _hint("Did you misspell \"$1\" + or forget to comment it out?"); + } + + if ($lookaheadspec =~ tr /././ > 3) + { + $lookaheadspec =~ s/\A\s+//; + $lookahead = $lookahead<0 + ? 'a negative lookahead ("...!")' + : 'a positive lookahead ("...")' ; + _warn(1,"Found two or more lookahead specifiers in a + row.",$line) + and + _hint("Multiple positive and/or negative lookaheads + are simply multiplied together to produce a + single positive or negative lookahead + specification. In this case the sequence + \"$lookaheadspec\" was reduced to $lookahead. + Was this your intention?"); + } + $lookahead = 0; + $lookaheadspec = ""; + + $grammar =~ m/\G\s+/gc; + } + + unless ($ERRORS or $isimplicit or !$::RD_CHECK) + { + $self->_check_grammar(); + } + + unless ($ERRORS or $isimplicit or $Parse::RecDescent::compiling) + { + my $code = $self->_code(); + if (defined $::RD_TRACE) + { + print STDERR "printing code (", length($code),") to RD_TRACE\n"; + local *TRACE_FILE; + open TRACE_FILE, ">RD_TRACE" + and print TRACE_FILE "my \$ERRORS;\n$code" + and close TRACE_FILE; + } + + unless ( eval "$code 1" ) + { + _error("Internal error in generated parser code!"); + $@ =~ s/at grammar/in grammar at/; + _hint($@); + } + } + + if ($ERRORS and !_verbosity("HINT")) + { + local $::RD_HINT = 1; + _hint('Set $::RD_HINT (or -RD_HINT if you\'re using "perl -s") + for hints on fixing these problems.'); + } + if ($ERRORS) { $ERRORS=0; return } + return $self; +} + + +sub _addstartcode($$) +{ + my ($self, $code) = @_; + $code =~ s/\A\s*\{(.*)\}\Z/$1/s; + + $self->{"startcode"} .= "$code;\n"; +} + +# CHECK FOR GRAMMAR PROBLEMS.... + +sub _check_insatiable($$$$) +{ + my ($subrule,$repspec,$grammar,$line) = @_; + pos($grammar)=pos($_[2]); + return if $grammar =~ m/$OPTIONAL/gco || $grammar =~ m/$ANY/gco; + my $min = 1; + if ( $grammar =~ m/$MANY/gco + || $grammar =~ m/$EXACTLY/gco + || $grammar =~ m/$ATMOST/gco + || $grammar =~ m/$BETWEEN/gco && do { $min=$2; 1 } + || $grammar =~ m/$ATLEAST/gco && do { $min=$2; 1 } + || $grammar =~ m/$SUBRULE(?!\s*:)/gco + ) + { + return unless $1 eq $subrule && $min > 0; + _warn(3,"Subrule sequence \"$subrule($repspec) $&\" will + (almost certainly) fail.",$line) + and + _hint("Unless subrule \"$subrule\" performs some cunning + lookahead, the repetition \"$subrule($repspec)\" will + insatiably consume as many matches of \"$subrule\" as it + can, leaving none to match the \"$&\" that follows."); + } +} + +sub _check_grammar ($) +{ + my $self = shift; + my $rules = $self->{"rules"}; + my $rule; + foreach $rule ( values %$rules ) + { + next if ! $rule->{"changed"}; + + # CHECK FOR UNDEFINED RULES + + my $call; + foreach $call ( @{$rule->{"calls"}} ) + { + if (!defined ${$rules}{$call} + &&!defined &{"Parse::RecDescent::$call"}) + { + if (!defined $::RD_AUTOSTUB) + { + _warn(3,"Undefined (sub)rule \"$call\" + used in a production.") + and + _hint("Will you be providing this rule + later, or did you perhaps + misspell \"$call\"? Otherwise + it will be treated as an + immediate ."); + eval "sub $self->{namespace}::$call {undef}"; + } + else # EXPERIMENTAL + { + my $rule = $::RD_AUTOSTUB || qq{'$call'}; + _warn(1,"Autogenerating rule: $call") + and + _hint("A call was made to a subrule + named \"$call\", but no such + rule was specified. However, + since \$::RD_AUTOSTUB + was defined, a rule stub + ($call : $rule) was + automatically created."); + + $self->_generate("$call : $rule",0,1); + } + } + } + + # CHECK FOR LEFT RECURSION + + if ($rule->isleftrec($rules)) + { + _error("Rule \"$rule->{name}\" is left-recursive."); + _hint("Redesign the grammar so it's not left-recursive. + That will probably mean you need to re-implement + repetitions using the '(s)' notation. + For example: \"$rule->{name}(s)\"."); + next; + } + } +} + +# GENERATE ACTUAL PARSER CODE + +sub _code($) +{ + my $self = shift; + my $code = qq{ +package $self->{namespace}; +use strict; +use vars qw(\$skip \$AUTOLOAD $self->{localvars} ); +\$skip = '$skip'; +$self->{startcode} + +{ +local \$SIG{__WARN__} = sub {0}; +# PRETEND TO BE IN Parse::RecDescent NAMESPACE +*$self->{namespace}::AUTOLOAD = sub +{ + no strict 'refs'; + \$AUTOLOAD =~ s/^$self->{namespace}/Parse::RecDescent/; + goto &{\$AUTOLOAD}; +} +} + +}; + $code .= "push \@$self->{namespace}\::ISA, 'Parse::RecDescent';"; + $self->{"startcode"} = ''; + + my $rule; + foreach $rule ( values %{$self->{"rules"}} ) + { + if ($rule->{"changed"}) + { + $code .= $rule->code($self->{"namespace"},$self); + $rule->{"changed"} = 0; + } + } + + return $code; +} + + +# EXECUTING A PARSE.... + +sub AUTOLOAD # ($parser, $text; $linenum, @args) +{ + croak "Could not find method: $AUTOLOAD\n" unless ref $_[0]; + my $class = ref($_[0]) || $_[0]; + my $text = ref($_[1]) ? ${$_[1]} : $_[1]; + $_[0]->{lastlinenum} = $_[2]||_linecount($_[1]); + $_[0]->{lastlinenum} = _linecount($_[1]); + $_[0]->{lastlinenum} += $_[2] if @_ > 2; + $_[0]->{offsetlinenum} = $_[0]->{lastlinenum}; + $_[0]->{fulltext} = $text; + $_[0]->{fulltextlen} = length $text; + $_[0]->{deferred} = []; + $_[0]->{errors} = []; + my @args = @_[3..$#_]; + my $args = sub { [ @args ] }; + + $AUTOLOAD =~ s/$class/$_[0]->{namespace}/; + no strict "refs"; + + croak "Unknown starting rule ($AUTOLOAD) called\n" + unless defined &$AUTOLOAD; + my $retval = &{$AUTOLOAD}($_[0],$text,undef,undef,$args); + + if (defined $retval) + { + foreach ( @{$_[0]->{deferred}} ) { &$_; } + } + else + { + foreach ( @{$_[0]->{errors}} ) { _error(@$_); } + } + + if (ref $_[1]) { ${$_[1]} = $text } + + $ERRORS = 0; + return $retval; +} + +sub _parserepeat($$$$$$$$$$) # RETURNS A REF TO AN ARRAY OF MATCHES +{ + my ($parser, $text, $prod, $min, $max, $_noactions, $expectation, $argcode) = @_; + my @tokens = (); + + my $reps; + for ($reps=0; $reps<$max;) + { + $_[6]->at($text); # $_[6] IS $expectation FROM CALLER + my $_savetext = $text; + my $prevtextlen = length $text; + my $_tok; + if (! defined ($_tok = &$prod($parser,$text,1,$_noactions,$argcode))) + { + $text = $_savetext; + last; + } + push @tokens, $_tok if defined $_tok; + last if ++$reps >= $min and $prevtextlen == length $text; + } + + do { $_[6]->failed(); return undef} if $reps<$min; + + $_[1] = $text; + return [@tokens]; +} + + +# ERROR REPORTING.... + +my $errortext; +my $errorprefix; + +open (ERROR, ">&STDERR"); +format ERROR = +@>>>>>>>>>>>>>>>>>>>>: ^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< +$errorprefix, $errortext +~~ ^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + $errortext +. + +select ERROR; +$| = 1; + +# TRACING + +my $tracemsg; +my $tracecontext; +my $tracerulename; +use vars '$tracelevel'; + +open (TRACE, ">&STDERR"); +format TRACE = +@>|@|||||||||@^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<| +$tracelevel, $tracerulename, '|', $tracemsg + | ~~ |^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<| + $tracemsg +. + +select TRACE; +$| = 1; + +open (TRACECONTEXT, ">&STDERR"); +format TRACECONTEXT = +@>|@|||||||||@ |^<<<<<<<<<<<<<<<<<<<<<<<<<<< +$tracelevel, $tracerulename, '|', $tracecontext + | ~~ | |^<<<<<<<<<<<<<<<<<<<<<<<<<<< + $tracecontext +. + + +select TRACECONTEXT; +$| = 1; + +select STDOUT; + +sub _verbosity($) +{ + defined $::RD_TRACE + or defined $::RD_HINT and $_[0] =~ /ERRORS|WARN|HINT/ + or defined $::RD_WARN and $_[0] =~ /ERRORS|WARN/ + or defined $::RD_ERRORS and $_[0] =~ /ERRORS/ +} + +sub _error($;$) +{ + $ERRORS++; + return 0 if ! _verbosity("ERRORS"); + $errortext = $_[0]; + $errorprefix = "ERROR" . ($_[1] ? " (line $_[1])" : ""); + $errortext =~ s/\s+/ /g; + print ERROR "\n" if _verbosity("WARN"); + write ERROR; + return 1; +} + +sub _warn($$;$) +{ + return 0 unless _verbosity("WARN") && ($::RD_HINT || $_[0] >= ($::RD_WARN||1)); + $errortext = $_[1]; + $errorprefix = "Warning" . ($_[2] ? " (line $_[2])" : ""); + print ERROR "\n"; + $errortext =~ s/\s+/ /g; + write ERROR; + return 1; +} + +sub _hint($) +{ + return 0 unless defined $::RD_HINT; + $errortext = "$_[0])"; + $errorprefix = "(Hint"; + $errortext =~ s/\s+/ /g; + write ERROR; + return 1; +} + +sub _tracemax($) +{ + if (defined $::RD_TRACE + && $::RD_TRACE =~ /\d+/ + && $::RD_TRACE>1 + && $::RD_TRACE+10..." + . substr($_[0],-$::RD_TRACE/2); + } + else + { + return $_[0]; + } +} + +sub _tracefirst($) +{ + if (defined $::RD_TRACE + && $::RD_TRACE =~ /\d+/ + && $::RD_TRACE>1 + && $::RD_TRACE+10"; + } + else + { + return $_[0]; + } +} + +my $lastcontext = ''; +my $lastrulename = ''; +my $lastlevel = ''; + +sub _trace($;$$$) +{ + $tracemsg = $_[0]; + $tracecontext = $_[1]||$lastcontext; + $tracerulename = $_[2]||$lastrulename; + $tracelevel = $_[3]||$lastlevel; + if ($tracerulename) { $lastrulename = $tracerulename } + if ($tracelevel) { $lastlevel = $tracelevel } + + $tracecontext =~ s/\n/\\n/g; + $tracecontext =~ s/\s+/ /g; + $tracerulename = qq{$tracerulename}; + write TRACE; + if ($tracecontext ne $lastcontext) + { + if ($tracecontext) + { + $lastcontext = _tracefirst($tracecontext); + $tracecontext = qq{"$tracecontext"}; + } + else + { + $tracecontext = qq{}; + } + write TRACECONTEXT; + } +} + +sub _parseunneg($$$$) +{ + _parse($_[0],$_[1],$_[3]); + if ($_[2]<0) + { + _error("Can't negate \"$&\".",$_[3]); + _hint("You can't negate $_[0]. Remove the \"...!\" before + \"$&\"."); + return 0; + } + return 1; +} + +sub _parse($$$;$) +{ + my $what = $_[3] || $&; + $what =~ s/^\s+//; + if ($_[1]) + { + _warn(3,"Found $_[0] ($what) after an unconditional ",$_[2]) + and + _hint("An unconditional always causes the + production containing it to immediately fail. + \u$_[0] that follows an + will never be reached. Did you mean to use + instead?"); + } + + return if ! _verbosity("TRACE"); + $errortext = "Treating \"$what\" as $_[0]"; + $errorprefix = "Parse::RecDescent"; + $errortext =~ s/\s+/ /g; + write ERROR; +} + +sub _linecount($) { + scalar substr($_[0], pos $_[0]||0) =~ tr/\n// +} + + +package main; + +use vars qw ( $RD_ERRORS $RD_WARN $RD_HINT $RD_TRACE $RD_CHECK ); +$::RD_CHECK = 1; +$::RD_ERRORS = 1; +$::RD_WARN = 3; + +1; +