added code and test suite for a series of functions that

identify non-human web clients so that advertising can
eliminate ghost impressions and ghost clicks
This commit is contained in:
David Delikat 2009-02-23 00:20:46 +00:00
parent fdeece9a64
commit af5ad84fc8
5 changed files with 305 additions and 2 deletions

View file

@ -1,4 +1,5 @@
7.6.13
- fixed AdSpace bug: impressions and clicks for most non-human web clients will not be counted.
- fixed #9760: DataForm not working in demo.plainblack.com
- fixed #9759: Delete Entry Button missing in Data Form
- fixed #9767: FileAsset breaks 'null' rule for FileAsset table

View file

@ -52,6 +52,7 @@ sub countClick {
my $session = shift;
my $id = shift;
my ($url) = $session->db->quickArray("select url from advertisement where adId=?",[$id]);
return $url if $self->session->env->requestNotViewed();
$session->db->write("update advertisement set clicks=clicks+1 where adId=?",[$id]);
return $url;
}
@ -131,6 +132,7 @@ A boolean that tells the ad system not to count this impression if true.
sub displayImpression {
my $self = shift;
my $dontCount = shift;
return '' if $self->session->env->requestNotViewed();
my ($id, $ad, $priority, $clicks, $clicksBought, $impressions, $impressionsBought) = $self->session->db->quickArray("select adId, renderedAd, priority, clicks, clicksBought, impressions, impressionsBought from advertisement where adSpaceId=? and isActive=1 order by nextInPriority asc limit 1",[$self->getId]);
unless ($dontCount) {
my $isActive = 1;

View file

@ -262,10 +262,25 @@ sub set {
# prerender the ad for faster display
my $adSpace = WebGUI::AdSpace->new($self->session, $self->get("adSpaceId"));
if ($self->get("type") eq "text") {
$self->{_properties}{renderedAd} = '<div style="position:relative; width:'.($adSpace->get("width")-2).'px; height:'.($adSpace->get("height")-2).'px; margin:0px; overflow:hidden; border:solid '.$self->get("borderColor").' 1px;"><a href="'.$self->session->url->gateway(undef, "op=clickAd;id=".$self->getId).'" style="position:absolute; padding: 3px; top:0px; left:0px; width:100%; height:100%; z-index:10; display:block; text-decoration:none; vertical-align:top; background-color:'.$self->get("backgroundColor").'; font-size: 13px; font-weight: normal;"><b><span style="color:'.$self->get("textColor").';">'.$self->get("title").'</span></b><br /><span style="color:'.$self->get("textColor").';">'.$self->get("adText").'</span></a></div>';
$self->{_properties}{renderedAd} = '<div style="position:relative; width:' . ($adSpace->get("width")-2) . 'px; height:' .
($adSpace->get("height")-2) . 'px; margin:0px; overflow:hidden; border:solid ' . $self->get("borderColor") .
q{ 1px;"><a href='#' OnClick="window.location.assign('} .
$self->session->url->gateway(undef, "op=clickAd;id=".$self->getId) .
q{')" style="position:absolute; padding: 3px; top:0px; left:0px; width:100%; height:100%; z-index:10;} .
' display:block; text-decoration:none; vertical-align:top; background-color:' . $self->get("backgroundColor") .
'; font-size: 13px; font-weight: normal;"><b><span style="color:' . $self->get("textColor") . ';">' .
$self->get("title") . '</span></b><br /><span style="color:' . $self->get("textColor") . ';">' .
$self->get("adText") . '</span></a></div>';
} elsif ($self->get("type") eq "image") {
my $storage = WebGUI::Storage->get($self->session, $self->get("storageId"));
$self->{_properties}{renderedAd} = '<div style="position:relative; width:'.$adSpace->get("width").'px; height:'.$adSpace->get("height").'px; margin:0px; overflow:hidden; border:0px;"><a href="'.$self->session->url->gateway(undef, "op=clickAd;id=".$self->getId).'" style="position:absolute; padding: 3px; top:0px; left:0px; width:100%; height:100%; z-index:10; display:block; text-decoration:none; vertical-align:top;"><img src="'.$storage->getUrl($storage->getFiles->[0]).'" alt="'.$self->get("title").'" style="z-index:0;position:relative;border-style:none;border: 0px;" /></a></div>';
$self->{_properties}{renderedAd} = '<div style="position:relative; width:' . $adSpace->get("width") . 'px; height:' .
$adSpace->get("height") . 'px; margin:0px; overflow:hidden; border:0px;"><a href="#" ' .
q{onClick="window.location.assign('} .
$self->session->url->gateway(undef, "op=clickAd;id=".$self->getId) . q{')" style="position:absolute; padding: } .
'3px; top:0px; left:0px; width:100%; height:100%; z-index:10; display:block; text-decoration:none; ' .
'vertical-align:top;"><img ' .
'src="' . $storage->getUrl($storage->getFiles->[0]) . '" alt="' . $self->get("title") .
'" style="z-index:0;position:relative;border-style:none;border: 0px;" /></a></div>';
} elsif ($self->get("type") eq "rich") {
my $ad = $self->get("richMedia");
WebGUI::Macro::process($self->session, \$ad);

View file

@ -30,6 +30,8 @@ $env = WebGUI::Session::Env->new;
$value = $env->get('REMOTE_ADDR');
return 'not gonna see it' if $env->requestNotViewed() ;
=head1 METHODS
These methods are available from this package:
@ -37,6 +39,66 @@ These methods are available from this package:
=cut
#-------------------------------------------------------------------
=head2 callerIsSearchSite ( )
returns true if the remote address matches a site which is a known indexer or spider.
=cut
sub callerIsSearchSite {
my $self = shift;
my $remoteAddress = $self->get('REMOTE_ADDR');
return 1 if $remoteAddress =~ /203\.87\.123\.1../ # Blaiz Enterprise Rawgrunt search
|| $remoteAddress =~ /123\.113\.184\.2../ # Unknown Yahoo Robot
|| $remoteAddress == '';
return 0;
}
#-------------------------------------------------------------------
=head2 clientIsSpider ( )
returns true is the client/agent is a spider/indexer or some other non-human interface
=cut
sub clientIsSpider {
my $self = shift;
my $userAgent = $self->get('HTTP_USER_AGENT');
return 1 if $userAgent eq ''
|| $userAgent =~ m<(^wre\/| # the WRE wget's http://localhost/ every 2-3 minutes 24 hours a day...
^morpheus|
libwww|
s[pb]ider|
bot|
robo|
sco[ou]t|
crawl|
miner|
reaper|
finder|
search|
engine|
download|
fetch|
scan|
slurp)>ix;
return 0;
}
#-------------------------------------------------------------------
=head2 DESTROY ( )
@ -100,5 +162,22 @@ sub new {
bless {_env=>\%ENV}, $class;
}
#-------------------------------------------------------------------
=head2 requestNotViewed ( )
returns true is the client/agent is a spider/indexer or some other non-human interface
=cut
sub requestNotViewed {
my $self = shift;
return $self->clientIsSpider()
|| $self->callerIsSearchSite();
}
1;

206
t/Session/CheckClient.t Normal file
View file

@ -0,0 +1,206 @@
#-------------------------------------------------------------------
# WebGUI is Copyright 2001-2009 Plain Black Corporation.
#-------------------------------------------------------------------
# Please read the legal notices (docs/legal.txt) and the license
# (docs/license.txt) that came with this distribution before using
# this software.
#-------------------------------------------------------------------
# http://www.plainblack.com info@plainblack.com
#-------------------------------------------------------------------
# this test can take two parameters
# first is an xml file, second indicates
# the percentage of items to test.
# the xml file can be downloaded from
# http://www.user-agents.org/
# the percent will default to 25 and
# should be passed as a whole number
# so 100 will test all items, 75 will
# test 75% or 3 out of four items
use FindBin;
use strict;
use lib "$FindBin::Bin/lib";
use lib '/data/WebGUI/t/lib';
use WebGUI::Test;
use WebGUI::Session;
use Test::More;
my $session = WebGUI::Test->session;
# this test is for code in the WebGUI::Session::Env Module
my @testArray = (
{
agent => "",
output => 1,
comment => "blank user agent"
},
{
agent => "&lt;a href='http://www.unchaos.com/'> UnChaos &lt;/a> From Chaos To Order Hybrid Web Search Engine.(vadim_goncharunchaos.com)",
output => 1,
comment => "UnChaos hybrid search engine"
},
{
agent => "(DreamPassport/3.0; isao/MyDiGiRabi)",
output => 0,
comment => "DreamCast DreamPassport browser"
},
{
agent => "Privoxy web proxy", # I think proxy's whould be considered browsers?
output => 0,
comment => "s.also Privoxy/3.0 (Anonymous)"
},
{
agent => "*/Nutch-0.9-dev",
address => "123.113.184.232",
output => 1,
comment => "Unknown Yahoo robot"
},
{
agent => "123spider-Bot (Version: 1.02&#44; powered by www.123spider.de",
output => 1,
comment => "123spider.de (Germany) web directory link checking"
},
{
agent => "1st ZipCommander (Net) - http://www.zipcommander.com/",
output => 0,
comment => "1st ZipCommander Net - IE based browser"
},
{
agent => "2Bone_LinkChecker/1.0 libwww-perl/5.64",
output => 1,
comment => "2Bone online link checker"
},
{
agent => "A-Online Search",
output => 1,
comment => "A-Online.at robot - now Jet2Web Search"
},
{
agent => "Advanced Browser (http://www.avantbrowser.com)",
output => 0,
comment => "Avant Browser - IE based browser"
},
{
agent => "AESOP_com_SpiderMan",
output => 1,
comment => "Aesop robot"
},
{
agent => "Mozilla/5.0 (compatible; SpurlBot/0.2)",
output => 1,
comment => "Spurl.net bookmark service & search engine (84.40.30.xxx)"
},
{
agent => "Mozilla/5.0 (compatible;MAINSEEK_BOT)",
output => 1,
comment => "Mozilla/5.0 (compatible;MAINSEEK_BOT)"
},
{
agent => "Mozilla/5.0 (Macintosh; U; PPC Mac OS X Mach-O; en-US; rv:1.0.1) Gecko/20021219 Chimera/0.6",
output => 0,
comment => "Chimera browser (Mozilla/Gecko engine) - now Camino Mac PowerPC"
},
{
agent => "Mozilla/5.0 (Macintosh; U; PPC Mac OS X; en-US) AppleWebKit/xx (KHTML like Gecko) OmniWeb/v5xx.xx",
output => 0,
comment => "OmniWeb 5.x.x Mac OS X browser"
},
{
agent => "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:x.x.x) Gecko/20041107 Firefox/x.x",
output => 0,
comment => "Firefox browser (Mozilla/Gecko engine) - ex Firebird WinXP"
},
{
agent => "Mozilla/5.0 (Windows; U; Windows NT 5.1; fr; rv:1.8.1) VoilaBot BETA 1.2 (support.voilabotorange-ftgroup.com)",
output => 1,
comment => "Voila.fr robot"
},
{
agent => "Mozilla/5.0 (Windows;) NimbleCrawler 1.12 obeys UserAgent NimbleCrawler For problems contact: crawlerhealth",
output => 1,
comment => "Healthline health related search robot (72.5.115.xx)"
},
{
agent => "Mozilla/5.0 (X11; U; Linux i686; de-AT; rv:1.8.0.2) Gecko/20060309 SeaMonkey/1.0",
output => 0,
comment => "SeaMonkey browser suite (ex Mozilla) on Linux"
},
{
agent => "Mozilla/5.0 [en] (compatible; Gulper Web Bot 0.2.4 www.ecsl.cs.sunysb.edu/~maxim/cgi-bin/Link/GulperBot)",
output => 1,
comment => "Yuntis : Collaborative Web Resource Categorization and Ranking Project robot"
},
);
sub transType {
return 0 if $_[0] =~ /(B|P)/; # browser or proxy
return 1;
}
sub getAddress { # There are precious few that have an IP that can be gotten out of the XML so I decided to skip this.
my $x = '69.42.78.32';
#if( $_[0]{Comment} =~ /\d\.\d\.\d/ ) {
# print $_[0]{Comment},"\t|\t",$_[0]{Description},"\n";
# $x = $_[0]{Comment};
# $x =~ s/x/2/;
#}
return $x;
}
sub testCount {
if( @ARGV ) {
if( $ARGV[0] =~ /\.xml$/ && -r $ARGV[0] ) {
my $infile = shift @ARGV ;
my $percent = shift @ARGV || 25;
use XML::Simple;
my $xml = new XML::Simple;
my $data = $xml->XMLin($infile);
# use Data::Dumper;
# print Dumper $data;
@testArray = ();
my $c = 1;
my $div = 20;
my $n = $div * $percent / 100;
foreach my $set (@{$data->{'user-agent'}}) {
$c = 1 if $c > $div;
if( $c <= $n ) {
push @testArray, {
agent => $set->{String},
output => transType($set->{Type}),
type => $set->{Type},
comment => $set->{Description},
# comment => $set->{String}, # this is handy for fine tuning the code: it shows the string that failed...
address => getAddress($set),
};
}
$c ++;
}
# use Data::Dumper;
# print Dumper \@testArray;
}
}
return scalar(@testArray);
}
plan tests => testCount() ;
my $output;
foreach my $testSet (@testArray) {
$output = new FAKE_ENV( $testSet->{agent},
$testSet->{address} || '69.42.78.32')
->requestNotViewed();
is($output, $testSet->{output}, $testSet->{comment});
}
{ # this is a local fake of the session, used for testing only
package FAKE_ENV;
use base 'WebGUI::Session::Env';
sub new { shift; return bless { _env => { HTTP_USER_AGENT => $_[0], REMOTE_ADDR => $_[1] } }, __PACKAGE__; }
}