From 1845a3e1d52a7e6bb9657602b08094911000618d Mon Sep 17 00:00:00 2001 From: JT Smith Date: Thu, 13 Jul 2006 15:41:30 +0000 Subject: [PATCH] - Added Chinese character support to search engine and indexer thanks to Zhou Xiaopeng. --- docs/changelog/7.x.x.txt | 2 ++ docs/credits.txt | 2 ++ lib/WebGUI/Search.pm | 11 +++++++++++ lib/WebGUI/Search/Index.pm | 10 ++++++++++ 4 files changed, 25 insertions(+) diff --git a/docs/changelog/7.x.x.txt b/docs/changelog/7.x.x.txt index 893c96ebc..312c686e3 100644 --- a/docs/changelog/7.x.x.txt +++ b/docs/changelog/7.x.x.txt @@ -6,6 +6,8 @@ / Procolix) - fix: Insert WebGUI Image inserts image, but does not retain border, spacing or alignment.(Martin Kamerbeek / Procolix) + - Added Chinese character support to search engine and indexer thanks to Zhou + Xiaopeng. 7.0.1 - fix: User profile field "Department" needs i18n diff --git a/docs/credits.txt b/docs/credits.txt index 6cc333a4c..f40acd269 100644 --- a/docs/credits.txt +++ b/docs/credits.txt @@ -45,8 +45,10 @@ Contributing Developers..............Lucas Bartholemy Steve Swanson / Plain Black Jeff Szpak / Plain Black Sean Tu / WDI + Vladimir Vitkovsky / WebGUI Worldwide Madsen Wikholm Matt Wilson / Plain Black + Zhou Xiaopeng / WebGUI Worldwide Gerald Young diff --git a/lib/WebGUI/Search.pm b/lib/WebGUI/Search.pm index 1b8578711..88336a188 100644 --- a/lib/WebGUI/Search.pm +++ b/lib/WebGUI/Search.pm @@ -270,6 +270,17 @@ sub search { unless ($keywords =~ m/"|\*/) { # do wildcards for people, like they'd expect my @terms = split(' ',$keywords); for (my $i = 0; $i < scalar(@terms); $i++) { + #-------------- Edited by zxp for Chinese Word Segment + utf8::decode($terms[$i]); + my @segs = split /([A-z|\d]+|\S)/, $terms[$i]; + $terms[$i] = join " ",@segs; + $terms[$i] =~ s/\s{2,}/ /g; + $terms[$i] =~ s/(^\s|\s$)//g; + $terms[$i] =~ s/\s/\'\'/g; + if($terms[$i] =~ m/\'/) { # has non-latin letter in terms + $terms[$i] = '"' . $terms[$i] . 
'"'; + } + #-------------- Edited by zxp end $terms[$i] .= "*"; } $keywords = join(" ", @terms); diff --git a/lib/WebGUI/Search/Index.pm b/lib/WebGUI/Search/Index.pm index cc03715dc..4d2ca0fbb 100644 --- a/lib/WebGUI/Search/Index.pm +++ b/lib/WebGUI/Search/Index.pm @@ -112,6 +112,16 @@ sub create { my $description = WebGUI::HTML::filter($asset->get('description'), "all"); my $keywords = WebGUI::HTML::filter(join(" ",$asset->get("title"), $asset->get("menuTitle"), $asset->get("synopsis"), $url, $description), "all"); my $synopsis = $asset->get("synopsis") || substr($description,0,255) || substr($keywords,0,255); + +#-------------------- added by zxp for chinese word segment + utf8::decode($keywords); + my @segs = split /([A-z|\d]+|\S)/, $keywords; + $keywords = join " ",@segs; + $keywords =~ s/\s{2,}/ /g; + $keywords =~ s/(^\s|\s$)//g; + $keywords =~ s/\s/\'\'/g; +#-------------------- added by zxp end + my $add = $self->session->db->prepare("insert into assetIndex (assetId, title, url, creationDate, revisionDate, ownerUserId, groupIdView, groupIdEdit, lineage, className, synopsis, keywords) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? )"); $add->execute([$asset->getId, $asset->get("title"), $asset->get("url"), $asset->get("creationDate"),