fixed #10797: Unable to search for non-ASCII text

2009-08-20 19:24:31 +00:00 · 2009-08-20 19:24:31 +00:00 · 4984e19d68
commit 4984e19d68
parent a5204a7386
4 changed files with 49 additions and 34 deletions
--- a/docs/changelog/7.x.x.txt
+++ b/docs/changelog/7.x.x.txt
@ -1,4 +1,5 @@
 7.7.18
+ - fixed #10797: Unable to search for non-ASCII text
 - fixed #10800: Ogone payment driver typo
 - fixed #10799: Shop: Ogone payment driver- typo
 - fixed #10798: Couple of typos in Shop EU Tax Driver screen
--- a/docs/gotcha.txt
+++ b/docs/gotcha.txt
@ -7,6 +7,16 @@ upgrading from one version to the next, or even between multiple
 versions. Be sure to heed the warnings contained herein as they will
 save you many hours of grief.

+7.7.18
+--------------------------------------------------------------------
+    * The search indexer was not properly indexing non-ASCII content.  The
+      behaviour has been corrected.  If your site has any non-ASCII content
+      (not only text in languages other than English), you should re-index
+      the site.  To do so for a single site, run the search.pl utility script:
+        perl search.pl --configFile=www.mysite.com.conf --indexsite
+      or, to index all of the sites on the server:
+        perl search.pl --indexall
+
 7.7.17
 --------------------------------------------------------------------
    * It was found that the combination of
--- a/lib/WebGUI/Search.pm
+++ b/lib/WebGUI/Search.pm
@ -366,32 +366,27 @@ sub search {
 	my $query = "";
 	my @clauses;
        my @orClauses;
-	if ($rules->{keywords}) {
-		my $keywords = $rules->{keywords};
-		unless ($keywords =~ m/"|\*/) { # do wildcards for people, like they'd expect
-        		my @terms = split(' ',$keywords);
-        		for (my $i = 0; $i < scalar(@terms); $i++) {
-			#-------------- Edited by zxp for Chinese Word Segment
-				utf8::decode($terms[$i]);
-				my @segs = split /([A-z,+-|\d]+|\S)/, $terms[$i];
-				$terms[$i] = join " ",@segs;
-				$terms[$i] =~ s/\s{2,}/ /g;
-				$terms[$i] =~ s/(^\s|\s$)//g;
-				$terms[$i] =~ s/\s/\'\'/g;
-				if($terms[$i] =~ m/\'/) { # has non-latin latter in terms
-					$terms[$i] = '"' . $terms[$i] . '"';
-				}
-			#-------------- Edited by zxp end
-                $terms[$i] .= "*";
-				
-                # By default results need to match ALL keywords / Len Kranendonk 20060811
-                # Do not force matching of possible stopwords
-                if (!$self->_isStopword( $terms[$i] )) {
-                    $terms[$i] = "+" . $terms[$i] if ($terms[$i] !~ m/^[+-]/);
+    if ($rules->{keywords}) {
+        my $keywords = $rules->{keywords};
+        # do wildcards for people like they'd expect unless they are doing it themselves
+        unless ($keywords =~ m/"|\*/) {
+            # split into 'words'.  Ideographic characters (such as Chinese) are
+            # treated as distinct words.  Everything else is space delimited.
+            my @terms = grep { $_ ne q{} } split /\s+|(\p{Ideographic})/, $keywords;
+            for my $term (@terms) {
+                # we add padding to ideographic characters to avoid minimum word length limits on indexing
+                if ($term =~ /\p{Ideographic}/) {
+                    $term = qq{''$term''};
                }
+                $term .= q{*};
+                next
+                    if $self->_isStopword($term);
+                next
+                    if $term =~ /^[+-]/;
+                $term = q{+} . $term;
            }
-            $keywords = join(" ", @terms);
-		}	
+            $keywords = join q{ }, @terms;
+        }
 		push(@params, $keywords, $keywords);
 		$self->{_score} = "match (keywords) against (?) as score";
 		push(@clauses, "match (keywords) against (? in boolean mode)");
--- a/lib/WebGUI/Search/Index.pm
+++ b/lib/WebGUI/Search/Index.pm
@ -175,18 +175,27 @@ A string containing keywords.
 =cut

 sub _filterKeywords {
-	my $self     = shift;
+    my $self     = shift;
    my $keywords = shift;

-	$keywords = WebGUI::HTML::filter($keywords, "all");
-#-------------------- added by zxp for chinese word segment
-	utf8::decode($keywords);
-	my @segs = split /([A-z|\d]+|\S)/, $keywords;
-	my $newKeywords = join " ",@segs;
-	$newKeywords =~ s/(^\s+|\s+$)//g;
-	$newKeywords =~ s/\s+/\'\'/g;
-#-------------------- added by zxp end
-    return $newKeywords;
+    $keywords = WebGUI::HTML::filter($keywords, "all");
+
+    # split into 'words'.  Ideographic characters (such as Chinese) are
+    # treated as distinct words.  Everything else is space delimited.
+    my @words = grep { $_ ne '' } split /\s+|(\p{Ideographic})/, $keywords;
+
+    # remove punctuation characters at the start and end of each word.
+    for my $word ( @words ) {
+        $word =~ s/\A\p{isPunct}+//;
+        $word =~ s/\p{isPunct}+\z//;
+        # we add padding to ideographic characters to avoid minimum word length limits on indexing
+        if ($word =~ /\p{Ideographic}/) {
+            $word = qq{''$word''};
+        }
+    }
+
+    $keywords = join q{ }, @words;
+    return $keywords;
 }

 #-------------------------------------------------------------------