diff --git a/docs/changelog/7.x.x.txt b/docs/changelog/7.x.x.txt index 0220a22ec..1352860f7 100644 --- a/docs/changelog/7.x.x.txt +++ b/docs/changelog/7.x.x.txt @@ -1,4 +1,5 @@ 7.7.18 + - fixed #10797: Unable to search for non-ASCII text - fixed #10800: Ogone payment driver typo - fixed #10799: Shop: Ogone payment driver- typo - fixed #10798: Couple of typos in Shop EU Tax Driver screen diff --git a/docs/gotcha.txt b/docs/gotcha.txt index aedd0b5ba..aaa828273 100644 --- a/docs/gotcha.txt +++ b/docs/gotcha.txt @@ -7,6 +7,16 @@ upgrading from one version to the next, or even between multiple versions. Be sure to heed the warnings contained herein as they will save you many hours of grief. +7.7.18 +-------------------------------------------------------------------- + * The search indexer was not properly indexing non-ASCII content. The + behaviour has been corrected. If your site has content in languages + other than English, you should re-index the site. This can be done + with the search.pl utility script by running + perl search.pl --configFile=www.mysite.com.conf --indexsite + or, to index all of the sites on the server: + perl search.pl --indexall + 7.7.17 -------------------------------------------------------------------- * It was found that the combination of diff --git a/lib/WebGUI/Search.pm b/lib/WebGUI/Search.pm index 173a98652..1fbdd70e2 100644 --- a/lib/WebGUI/Search.pm +++ b/lib/WebGUI/Search.pm @@ -366,32 +366,27 @@ sub search { my $query = ""; my @clauses; my @orClauses; - if ($rules->{keywords}) { - my $keywords = $rules->{keywords}; - unless ($keywords =~ m/"|\*/) { # do wildcards for people, like they'd expect - my @terms = split(' ',$keywords); - for (my $i = 0; $i < scalar(@terms); $i++) { - #-------------- Edited by zxp for Chinese Word Segment - utf8::decode($terms[$i]); - my @segs = split /([A-z,+-|\d]+|\S)/, $terms[$i]; - $terms[$i] = join " ",@segs; - $terms[$i] =~ s/\s{2,}/ /g; - $terms[$i] =~ s/(^\s|\s$)//g; - $terms[$i] =~ s/\s/\'\'/g; - 
if($terms[$i] =~ m/\'/) { # has non-latin latter in terms - $terms[$i] = '"' . $terms[$i] . '"'; - } - #-------------- Edited by zxp end - $terms[$i] .= "*"; - - # By default results need to match ALL keywords / Len Kranendonk 20060811 - # Do not force matching of possible stopwords - if (!$self->_isStopword( $terms[$i] )) { - $terms[$i] = "+" . $terms[$i] if ($terms[$i] !~ m/^[+-]/); + if ($rules->{keywords}) { + my $keywords = $rules->{keywords}; + # do wildcards for people like they'd expect unless they are doing it themselves + unless ($keywords =~ m/"|\*/) { + # split into 'words'. Ideographic characters (such as Chinese) are + # treated as distinct words. Everything else is space delimited. + my @terms = grep { $_ ne q{} } split /\s+|(\p{Ideographic})/, $keywords; + for my $term (@terms) { + # we add padding to ideographic characters to avoid minimum word length limits on indexing + if ($term =~ /\p{Ideographic}/) { + $term = qq{''$term''}; } + $term .= q{*}; + next + if $self->_isStopword($term); + next + if $term =~ /^[+-]/; + $term = q{+} . $term; } - $keywords = join(" ", @terms); - } + $keywords = join q{ }, @terms; + } push(@params, $keywords, $keywords); $self->{_score} = "match (keywords) against (?) as score"; push(@clauses, "match (keywords) against (? in boolean mode)"); diff --git a/lib/WebGUI/Search/Index.pm b/lib/WebGUI/Search/Index.pm index d986022df..eb72b21e6 100644 --- a/lib/WebGUI/Search/Index.pm +++ b/lib/WebGUI/Search/Index.pm @@ -175,18 +175,27 @@ A string containing keywords. 
=pod

Returns the cleaned keywords as a single space-separated string. Leading and
trailing punctuation is removed from every word, words that consist of nothing
but punctuation are dropped, and each ideographic character becomes its own
word padded with two apostrophes on either side.

=cut

sub _filterKeywords {
    my $self     = shift;
    my $keywords = shift;

    $keywords = WebGUI::HTML::filter($keywords, "all");

    # split() and the substitutions below warn on undef, and an empty string
    # has nothing to index, so both cases short-circuit to an empty result.
    return q{}
        if !defined $keywords || $keywords eq q{};

    # Split into 'words'. Ideographic characters (such as Chinese) are
    # treated as distinct words. Everything else is space delimited.
    # The capture group makes split() hand back the ideograph itself, but it
    # also yields undef every time the \s+ branch is the one that matched, so
    # undefined fields are discarded together with the empty ones.
    my @tokens = grep { defined $_ && $_ ne q{} }
        split /\s+|(\p{Ideographic})/, $keywords;

    my @words;
    for my $word (@tokens) {
        # remove punctuation characters at the start and end of each word.
        $word =~ s/\A\p{isPunct}+//;
        $word =~ s/\p{isPunct}+\z//;

        # a token made only of punctuation is empty now; keeping it would
        # only add stray separators to the result.
        next if $word eq q{};

        # we add padding to ideographic characters to avoid minimum word
        # length limits on indexing
        if ($word =~ /\p{Ideographic}/) {
            $word = qq{''$word''};
        }

        push @words, $word;
    }

    return join q{ }, @words;
}

#-------------------------------------------------------------------