From ea51ba559eb1f1831864daf7b14d566708e6c96d Mon Sep 17 00:00:00 2001 From: Colin Kuskie Date: Fri, 21 Aug 2009 21:29:43 +0000 Subject: [PATCH] Add tests for ideogram searching. ft_min_word_len must be 2 or less. --- docs/gotcha.txt | 5 +++++ lib/WebGUI/Search.pm | 2 +- lib/WebGUI/Search/Index.pm | 2 +- t/Search.t | 30 +++++++++++++++++++++++++++++- 4 files changed, 36 insertions(+), 3 deletions(-) diff --git a/docs/gotcha.txt b/docs/gotcha.txt index a12b567a2..e2a969aa9 100644 --- a/docs/gotcha.txt +++ b/docs/gotcha.txt @@ -18,6 +18,11 @@ save you many hours of grief. or, to index all of the sites on the server: perl search.pl --indexall + * For searching content with ideograms, ft_min_word_len must be + set to 2 in the MySQL config file. WebGUI will safely pad the + characters so that it does not need to be set to 1. The WRE + has ft_min_word_len set to 2 by default. + 7.7.18 -------------------------------------------------------------------- * The search indexer was not properly indexing non-ASCII content. 
The diff --git a/lib/WebGUI/Search.pm b/lib/WebGUI/Search.pm index b830b521f..7d401946c 100644 --- a/lib/WebGUI/Search.pm +++ b/lib/WebGUI/Search.pm @@ -383,7 +383,7 @@ sub search { for my $term (@terms) { # we add padding to ideographic characters to avoid minimum word length limits on indexing if ($term =~ /\p{Ideographic}/) { - $term = qq{''$term''}; + $term = q{''}.$term.q{''}; } $term .= q{*}; next diff --git a/lib/WebGUI/Search/Index.pm b/lib/WebGUI/Search/Index.pm index 08605afea..d12612a0e 100644 --- a/lib/WebGUI/Search/Index.pm +++ b/lib/WebGUI/Search/Index.pm @@ -193,7 +193,7 @@ sub _filterKeywords { $word =~ s/\p{isPunct}+\z//; # we add padding to ideographic characters to avoid minimum word length limits on indexing if ($word =~ /\p{Ideographic}/) { - $word = qq{''$word''}; + $word = q{''}.$word.q{''}; } } diff --git a/t/Search.t b/t/Search.t index 31ce4a8ca..cb22ef303 100644 --- a/t/Search.t +++ b/t/Search.t @@ -17,6 +17,7 @@ use FindBin; use strict; use lib "$FindBin::Bin/lib"; use Test::More; +use Test::Deep; use WebGUI::Test; # Must use this before any other WebGUI modules use WebGUI::Session; @@ -28,12 +29,13 @@ my $session = WebGUI::Test->session; #---------------------------------------------------------------------------- # Tests -plan tests => 7; # Increment this number for each test you create +plan tests => 10; # Increment this number for each test you create #---------------------------------------------------------------------------- # put your tests here use_ok('WebGUI::Search'); +use_ok('WebGUI::Search::Index'); my $search = WebGUI::Search->new($session); @@ -50,6 +52,32 @@ ok( $search->_isStopword('anybody+'), '_isStopword: regex metacharacter + ok( $search->_isStopword('maybe?'), '_isStopword: regex metacharacter ? does not crash the search'); ok(! 
$search->_isStopword('private.+'), '_isStopword: regex metacharacters .+ do not crash the search'); +################################################ +# +# Chinese ideograph handling +# +################################################ +{ + use utf8; + + # Create and commit an article containing ideograms, then index it + my $article = WebGUI::Asset->getImportNode( $session )->addChild( { + className => 'WebGUI::Asset::Wobject::Article', + title => 'Chinese ideograph experiment', + description => "甲骨文", + } ); + my $tag = WebGUI::VersionTag->getWorking( $session ); + $tag->commit; + WebGUI::Test->tagsToRollback($tag); + WebGUI::Search::Index->create( $article ); + my $searcher = WebGUI::Search->new($session); + my $assetIds = $searcher->search({ keywords => "甲", })->getAssetIds; + cmp_deeply( $assetIds, [ $article->getId ], 'ideograph search works'); + $searcher = WebGUI::Search->new($session); + $assetIds = $searcher->search({ keywords => "Chinese", })->getAssetIds; + cmp_deeply( $assetIds, [ $article->getId ], 'basic test for search works'); +} + #---------------------------------------------------------------------------- # Cleanup END {