Decode HTML entities sent by rich editors. Add tests for the decoding

inside the search indexer.  Gotcha note.  Fixes #10797.
This commit is contained in:
Colin Kuskie 2009-08-21 16:55:57 +00:00
parent 91b37aae36
commit f68db111aa
4 changed files with 43 additions and 13 deletions

View file

@ -1,4 +1,5 @@
7.7.19
- fixed #10797: searching non-ascii-characters (e.g. wiki), part 2
7.7.18
- fixed #10801: Payment Methods: can select a blank method

View file

@ -7,6 +7,17 @@ upgrading from one version to the next, or even between multiple
versions. Be sure to heed the warnings contained herein as they will
save you many hours of grief.
7.7.19
--------------------------------------------------------------------
* The search indexer was not properly indexing non-ASCII content
entered via TinyMCE. The behaviour has been corrected. If your
site has content in languages other than English, you should
re-index the site. This can be done with the search.pl utility
script by running:
perl search.pl --configFile=www.mysite.com.conf --indexsite
or, to index all of the sites on the server:
perl search.pl --indexall
7.7.18
--------------------------------------------------------------------
* The search indexer was not properly indexing non-ASCII content. The

View file

@ -15,6 +15,7 @@ package WebGUI::Search::Index;
=cut
use strict;
use HTML::Entities;
=head1 NAME
@ -82,7 +83,6 @@ sub addKeywords {
my $self = shift;
my $text = join(" ", @_);
$text = WebGUI::HTML::filter($text, "all");
$text = $self->_filterKeywords($text);
my ($keywords) = $self->session->db->quickArray("select keywords from assetIndex where assetId=?",[$self->getId]);
$self->session->db->write("update assetIndex set keywords =? where assetId=?", [$keywords.' '.$text, $self->getId]);
@ -166,7 +166,8 @@ sub DESTROY {
=head2 _filterKeywords ( $keywords )
Perform filtering and cleaning up of the keywords before submitting them.
Perform filtering and cleaning up of the keywords before submitting them. Ideographic characters are padded
so that they are still searchable. HTML entities are decoded.
=head3 $keywords
@ -179,6 +180,8 @@ sub _filterKeywords {
my $keywords = shift;
$keywords = WebGUI::HTML::filter($keywords, "all");
$keywords = HTML::Entities::decode_entities($keywords);
utf8::upgrade($keywords);
# split into 'words'. Ideographic characters (such as Chinese) are
# treated as distinct words. Everything else is space delimited.

View file

@ -40,7 +40,7 @@ WebGUI::Test->tagsToRollback(
#----------------------------------------------------------------------------
# Tests
plan tests => 16; # Increment this number for each test you create
plan tests => 15; # Increment this number for each test you create
use_ok( 'WebGUI::Search::Index' );
@ -123,7 +123,7 @@ $article->update({
} );
$indexer = WebGUI::Search::Index->create( $article );
ok ( my $row = $db->quickHashRef( "SELECT * FROM assetIndex WHERE assetId=?", [ $article->getId ] ),
ok ( $row = $db->quickHashRef( "SELECT * FROM assetIndex WHERE assetId=?", [ $article->getId ] ),
"assetId exists in assetIndex"
);
cmp_deeply (
@ -149,7 +149,7 @@ cmp_deeply (
),
lineage => $article->get('lineage'),
},
"Index has correct information"
"Index has synopsis information in keywords"
);
@ -161,9 +161,7 @@ $article->update({
});
$indexer = WebGUI::Search::Index->create( $article );
ok ( my $row = $db->quickHashRef( "SELECT * FROM assetIndex WHERE assetId=?", [ $article->getId ] ),
"assetId exists in assetIndex"
);
$row = $db->quickHashRef( "SELECT * FROM assetIndex WHERE assetId=?", [ $article->getId ]);
cmp_deeply (
$row,
{
@ -187,7 +185,7 @@ cmp_deeply (
),
lineage => $article->get('lineage'),
},
"Index has correct information"
"Index has description in keywords"
);
@ -199,9 +197,7 @@ $article->update({
});
$indexer = WebGUI::Search::Index->create( $article );
ok ( my $row = $db->quickHashRef( "SELECT * FROM assetIndex WHERE assetId=?", [ $article->getId ] ),
"assetId exists in assetIndex"
);
$row = $db->quickHashRef( "SELECT * FROM assetIndex WHERE assetId=?", [ $article->getId ] );
cmp_deeply (
$row,
{
@ -224,7 +220,26 @@ cmp_deeply (
),
lineage => $article->get('lineage'),
},
"Index has correct information"
"Index has synopsis and description in keywords"
);
#----------------------------------------------------------------------------
# Test that HTML entities are decoded.
# Rich editors (e.g. TinyMCE) send non-ASCII characters as named HTML
# entities, so the fixture must contain entities rather than literal
# characters; with literal characters the decoding step is never exercised.
$article->update({
    description => "sch&ouml;n ca&ntilde;&oacute;n",
});
# Re-create the index entry so the updated description is indexed.
$indexer = WebGUI::Search::Index->create( $article );
$row = $db->quickHashRef( "SELECT * FROM assetIndex WHERE assetId=?", [ $article->getId ] );
cmp_deeply (
    $row,
    superhashof({
        keywords => all( # keywords must contain the decoded forms of the words in the description
            re("sch\xF6n"),         # &ouml;             -> U+00F6
            re("ca\xF1\xF3n"),      # &ntilde; &oacute;  -> U+00F1 U+00F3
        ),
    }),
    "Index has decoded entities"
);
#vim:ft=perl