fixed #10797: Unable to search for non-ASCII text
This commit is contained in:
parent
a5204a7386
commit
4984e19d68
4 changed files with 49 additions and 34 deletions
|
|
@ -1,4 +1,5 @@
|
|||
7.7.18
|
||||
- fixed #10797: Unable to search for non-ASCII text
|
||||
- fixed #10800: Ogone payment driver typo
|
||||
- fixed #10799: Shop: Ogone payment driver- typo
|
||||
- fixed #10798: Couple of typos in Shop EU Tax Driver screen
|
||||
|
|
|
|||
|
|
@ -7,6 +7,16 @@ upgrading from one version to the next, or even between multiple
|
|||
versions. Be sure to heed the warnings contained herein as they will
|
||||
save you many hours of grief.
|
||||
|
||||
7.7.18
|
||||
--------------------------------------------------------------------
|
||||
* The search indexer was not properly indexing non-ASCII content. The
|
||||
behaviour has been corrected. If your site has content in languages
|
||||
other than English, you should re-index the site. This can be done
|
||||
with the search.pl utility script by running:
|
||||
perl search.pl --configFile=www.mysite.com.conf --indexsite
|
||||
or, to index all of the sites on the server:
|
||||
perl search.pl --indexall
|
||||
|
||||
7.7.17
|
||||
--------------------------------------------------------------------
|
||||
* It was found that the combination of
|
||||
|
|
|
|||
|
|
@ -366,32 +366,27 @@ sub search {
|
|||
my $query = "";
|
||||
my @clauses;
|
||||
my @orClauses;
|
||||
if ($rules->{keywords}) {
|
||||
my $keywords = $rules->{keywords};
|
||||
unless ($keywords =~ m/"|\*/) { # do wildcards for people, like they'd expect
|
||||
my @terms = split(' ',$keywords);
|
||||
for (my $i = 0; $i < scalar(@terms); $i++) {
|
||||
#-------------- Edited by zxp for Chinese Word Segment
|
||||
utf8::decode($terms[$i]);
|
||||
my @segs = split /([A-z,+-|\d]+|\S)/, $terms[$i];
|
||||
$terms[$i] = join " ",@segs;
|
||||
$terms[$i] =~ s/\s{2,}/ /g;
|
||||
$terms[$i] =~ s/(^\s|\s$)//g;
|
||||
$terms[$i] =~ s/\s/\'\'/g;
|
||||
if($terms[$i] =~ m/\'/) { # has non-Latin letter in terms
|
||||
$terms[$i] = '"' . $terms[$i] . '"';
|
||||
}
|
||||
#-------------- Edited by zxp end
|
||||
$terms[$i] .= "*";
|
||||
|
||||
# By default results need to match ALL keywords / Len Kranendonk 20060811
|
||||
# Do not force matching of possible stopwords
|
||||
if (!$self->_isStopword( $terms[$i] )) {
|
||||
$terms[$i] = "+" . $terms[$i] if ($terms[$i] !~ m/^[+-]/);
|
||||
if ($rules->{keywords}) {
|
||||
my $keywords = $rules->{keywords};
|
||||
# do wildcards for people like they'd expect unless they are doing it themselves
|
||||
unless ($keywords =~ m/"|\*/) {
|
||||
# split into 'words'. Ideographic characters (such as Chinese) are
|
||||
# treated as distinct words. Everything else is space delimited.
|
||||
my @terms = grep { $_ ne q{} } split /\s+|(\p{Ideographic})/, $keywords;
|
||||
for my $term (@terms) {
|
||||
# we add padding to ideographic characters to avoid minimum word length limits on indexing
|
||||
if ($term =~ /\p{Ideographic}/) {
|
||||
$term = qq{''$term''};
|
||||
}
|
||||
$term .= q{*};
|
||||
next
|
||||
if $self->_isStopword($term);
|
||||
next
|
||||
if $term =~ /^[+-]/;
|
||||
$term = q{+} . $term;
|
||||
}
|
||||
$keywords = join(" ", @terms);
|
||||
}
|
||||
$keywords = join q{ }, @terms;
|
||||
}
|
||||
push(@params, $keywords, $keywords);
|
||||
$self->{_score} = "match (keywords) against (?) as score";
|
||||
push(@clauses, "match (keywords) against (? in boolean mode)");
|
||||
|
|
|
|||
|
|
@ -175,18 +175,27 @@ A string containing keywords.
|
|||
=cut
|
||||
|
||||
sub _filterKeywords {
|
||||
my $self = shift;
|
||||
my $self = shift;
|
||||
my $keywords = shift;
|
||||
|
||||
$keywords = WebGUI::HTML::filter($keywords, "all");
|
||||
#-------------------- added by zxp for chinese word segment
|
||||
utf8::decode($keywords);
|
||||
my @segs = split /([A-z|\d]+|\S)/, $keywords;
|
||||
my $newKeywords = join " ",@segs;
|
||||
$newKeywords =~ s/(^\s+|\s+$)//g;
|
||||
$newKeywords =~ s/\s+/\'\'/g;
|
||||
#-------------------- added by zxp end
|
||||
return $newKeywords;
|
||||
$keywords = WebGUI::HTML::filter($keywords, "all");
|
||||
|
||||
# split into 'words'. Ideographic characters (such as Chinese) are
|
||||
# treated as distinct words. Everything else is space delimited.
|
||||
my @words = grep { $_ ne '' } split /\s+|(\p{Ideographic})/, $keywords;
|
||||
|
||||
# remove punctuation characters at the start and end of each word.
|
||||
for my $word ( @words ) {
|
||||
$word =~ s/\A\p{isPunct}+//;
|
||||
$word =~ s/\p{isPunct}+\z//;
|
||||
# we add padding to ideographic characters to avoid minimum word length limits on indexing
|
||||
if ($word =~ /\p{Ideographic}/) {
|
||||
$word = qq{''$word''};
|
||||
}
|
||||
}
|
||||
|
||||
$keywords = join q{ }, @words;
|
||||
return $keywords;
|
||||
}
|
||||
|
||||
#-------------------------------------------------------------------
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue