From f68db111aaa1f55242fbdb1aa4b26c07456ff6ac Mon Sep 17 00:00:00 2001 From: Colin Kuskie Date: Fri, 21 Aug 2009 16:55:57 +0000 Subject: [PATCH] Decode HTML entities sent by rich editors. Add tests for the decoding inside the search indexer. Gotcha note. Fixes #10797. --- docs/changelog/7.x.x.txt | 1 + docs/gotcha.txt | 11 +++++++++++ lib/WebGUI/Search/Index.pm | 7 +++++-- t/Search/Index.t | 37 ++++++++++++++++++++++++++----------- 4 files changed, 43 insertions(+), 13 deletions(-) diff --git a/docs/changelog/7.x.x.txt b/docs/changelog/7.x.x.txt index 0098b99cd..fb631af79 100644 --- a/docs/changelog/7.x.x.txt +++ b/docs/changelog/7.x.x.txt @@ -1,4 +1,5 @@ 7.7.19 + - fixed #10797: searching non-ascii-characters (e.g. wiki), part 2 7.7.18 - fixed #10801: Payment Methods: can select a blank method diff --git a/docs/gotcha.txt b/docs/gotcha.txt index aaa828273..a12b567a2 100644 --- a/docs/gotcha.txt +++ b/docs/gotcha.txt @@ -7,6 +7,17 @@ upgrading from one version to the next, or even between multiple versions. Be sure to heed the warnings contained herein as they will save you many hours of grief. +7.7.19 +-------------------------------------------------------------------- + * The search indexer was not properly indexing non-ASCII content + entered via TinyMCE. The behaviour has been corrected. If your + site has content in languages other than English, you should + re-index the site. This can be done with the search.pl utility + script by running + perl search.pl --configFile=www.mysite.com.conf --indexsite + or, to index all of the sites on the server: + perl search.pl --indexall + 7.7.18 -------------------------------------------------------------------- * The search indexer was not properly indexing non-ASCII content. 
The diff --git a/lib/WebGUI/Search/Index.pm b/lib/WebGUI/Search/Index.pm index eb72b21e6..08605afea 100644 --- a/lib/WebGUI/Search/Index.pm +++ b/lib/WebGUI/Search/Index.pm @@ -15,6 +15,7 @@ package WebGUI::Search::Index; =cut use strict; +use HTML::Entities; =head1 NAME @@ -82,7 +83,6 @@ sub addKeywords { my $self = shift; my $text = join(" ", @_); - $text = WebGUI::HTML::filter($text, "all"); $text = $self->_filterKeywords($text); my ($keywords) = $self->session->db->quickArray("select keywords from assetIndex where assetId=?",[$self->getId]); $self->session->db->write("update assetIndex set keywords =? where assetId=?", [$keywords.' '.$text, $self->getId]); @@ -166,7 +166,8 @@ sub DESTROY { =head2 _filterKeywords ( $keywords ) -Perform filtering and cleaning up of the keywords before submitting them. +Perform filtering and cleaning up of the keywords before submitting them. Ideographic characters are padded +so that they are still searchable. HTML entities are decoded. =head3 $keywords @@ -179,6 +180,8 @@ sub _filterKeywords { my $keywords = shift; $keywords = WebGUI::HTML::filter($keywords, "all"); + $keywords = HTML::Entities::decode_entities($keywords); + utf8::upgrade($keywords); # split into 'words'. Ideographic characters (such as Chinese) are # treated as distinct words. Everything else is space delimited. 
diff --git a/t/Search/Index.t b/t/Search/Index.t index d45eafcb4..f8059dcbb 100644 --- a/t/Search/Index.t +++ b/t/Search/Index.t @@ -40,7 +40,7 @@ WebGUI::Test->tagsToRollback( #---------------------------------------------------------------------------- # Tests -plan tests => 16; # Increment this number for each test you create +plan tests => 15; # Increment this number for each test you create use_ok( 'WebGUI::Search::Index' ); @@ -123,7 +123,7 @@ $article->update({ } ); $indexer = WebGUI::Search::Index->create( $article ); -ok ( my $row = $db->quickHashRef( "SELECT * FROM assetIndex WHERE assetId=?", [ $article->getId ] ), +ok ( $row = $db->quickHashRef( "SELECT * FROM assetIndex WHERE assetId=?", [ $article->getId ] ), "assetId exists in assetIndex" ); cmp_deeply ( @@ -149,7 +149,7 @@ cmp_deeply ( ), lineage => $article->get('lineage'), }, - "Index has correct information" + "Index has synopsis information in keywords" ); @@ -161,9 +161,7 @@ $article->update({ }); $indexer = WebGUI::Search::Index->create( $article ); -ok ( my $row = $db->quickHashRef( "SELECT * FROM assetIndex WHERE assetId=?", [ $article->getId ] ), - "assetId exists in assetIndex" -); +$row = $db->quickHashRef( "SELECT * FROM assetIndex WHERE assetId=?", [ $article->getId ]); cmp_deeply ( $row, { @@ -187,7 +185,7 @@ cmp_deeply ( ), lineage => $article->get('lineage'), }, - "Index has correct information" + "Index has description in keywords" ); @@ -199,9 +197,7 @@ $article->update({ }); $indexer = WebGUI::Search::Index->create( $article ); -ok ( my $row = $db->quickHashRef( "SELECT * FROM assetIndex WHERE assetId=?", [ $article->getId ] ), - "assetId exists in assetIndex" -); +$row = $db->quickHashRef( "SELECT * FROM assetIndex WHERE assetId=?", [ $article->getId ] ); cmp_deeply ( $row, { @@ -224,7 +220,26 @@ cmp_deeply ( ), lineage => $article->get('lineage'), }, - "Index has correct information" + "Index has synopsis and description in keywords" +); + 
+#---------------------------------------------------------------------------- +# Test that HTML entities are decoded. +$article->update({ + description => "sch&ouml;n ca&ntilde;&oacute;n", +}); +$indexer = WebGUI::Search::Index->create( $article ); + +$row = $db->quickHashRef( "SELECT * FROM assetIndex WHERE assetId=?", [ $article->getId ] ); +cmp_deeply ( + $row, + superhashof({ + keywords => all( # keywords contains title, menuTitle, every part of the URL and every keyword + re("sch\xF6n"), + re("ca\xF1\xF3n"), + ), + }), + "Index has decoded entities" ); #vim:ft=perl