added code and test suite for a series of functions that
identify non-human web clients so that advertising can eliminate ghost impressions and ghost clicks
This commit is contained in:
parent
fdeece9a64
commit
af5ad84fc8
5 changed files with 305 additions and 2 deletions
206
t/Session/CheckClient.t
Normal file
206
t/Session/CheckClient.t
Normal file
|
|
@ -0,0 +1,206 @@
|
|||
#-------------------------------------------------------------------
|
||||
# WebGUI is Copyright 2001-2009 Plain Black Corporation.
|
||||
#-------------------------------------------------------------------
|
||||
# Please read the legal notices (docs/legal.txt) and the license
|
||||
# (docs/license.txt) that came with this distribution before using
|
||||
# this software.
|
||||
#-------------------------------------------------------------------
|
||||
# http://www.plainblack.com info@plainblack.com
|
||||
#-------------------------------------------------------------------
|
||||
|
||||
# this test can take two parameters
|
||||
# first is an xml file, second indicates
|
||||
# the percentage of items to test.
|
||||
# the xml file can be downloaded from
|
||||
# http://www.user-agents.org/
|
||||
# the percent will default to 25 and
|
||||
# should be passed as a whole number
|
||||
# so 100 will test all items, 75 will
|
||||
# test 75% or 3 out of four items
|
||||
|
||||
use FindBin;
|
||||
use strict;
|
||||
use lib "$FindBin::Bin/lib";
|
||||
use lib '/data/WebGUI/t/lib';
|
||||
|
||||
use WebGUI::Test;
|
||||
use WebGUI::Session;
|
||||
|
||||
use Test::More;
|
||||
|
||||
my $session = WebGUI::Test->session;
|
||||
|
||||
# this test is for code in the WebGUI::Session::Env Module
|
||||
|
||||
my @testArray = (
|
||||
{
|
||||
agent => "",
|
||||
output => 1,
|
||||
comment => "blank user agent"
|
||||
},
|
||||
{
|
||||
agent => "<a href='http://www.unchaos.com/'> UnChaos </a> From Chaos To Order Hybrid Web Search Engine.(vadim_goncharunchaos.com)",
|
||||
output => 1,
|
||||
comment => "UnChaos hybrid search engine"
|
||||
},
|
||||
{
|
||||
agent => "(DreamPassport/3.0; isao/MyDiGiRabi)",
|
||||
output => 0,
|
||||
comment => "DreamCast DreamPassport browser"
|
||||
},
|
||||
{
|
||||
agent => "Privoxy web proxy", # I think proxy's whould be considered browsers?
|
||||
output => 0,
|
||||
comment => "s.also Privoxy/3.0 (Anonymous)"
|
||||
},
|
||||
{
|
||||
agent => "*/Nutch-0.9-dev",
|
||||
address => "123.113.184.232",
|
||||
output => 1,
|
||||
comment => "Unknown Yahoo robot"
|
||||
},
|
||||
{
|
||||
agent => "123spider-Bot (Version: 1.02, powered by www.123spider.de",
|
||||
output => 1,
|
||||
comment => "123spider.de (Germany) web directory link checking"
|
||||
},
|
||||
{
|
||||
agent => "1st ZipCommander (Net) - http://www.zipcommander.com/",
|
||||
output => 0,
|
||||
comment => "1st ZipCommander Net - IE based browser"
|
||||
},
|
||||
{
|
||||
agent => "2Bone_LinkChecker/1.0 libwww-perl/5.64",
|
||||
output => 1,
|
||||
comment => "2Bone online link checker"
|
||||
},
|
||||
{
|
||||
agent => "A-Online Search",
|
||||
output => 1,
|
||||
comment => "A-Online.at robot - now Jet2Web Search"
|
||||
},
|
||||
{
|
||||
agent => "Advanced Browser (http://www.avantbrowser.com)",
|
||||
output => 0,
|
||||
comment => "Avant Browser - IE based browser"
|
||||
},
|
||||
{
|
||||
agent => "AESOP_com_SpiderMan",
|
||||
output => 1,
|
||||
comment => "Aesop robot"
|
||||
},
|
||||
{
|
||||
agent => "Mozilla/5.0 (compatible; SpurlBot/0.2)",
|
||||
output => 1,
|
||||
comment => "Spurl.net bookmark service & search engine (84.40.30.xxx)"
|
||||
},
|
||||
{
|
||||
agent => "Mozilla/5.0 (compatible;MAINSEEK_BOT)",
|
||||
output => 1,
|
||||
comment => "Mozilla/5.0 (compatible;MAINSEEK_BOT)"
|
||||
},
|
||||
{
|
||||
agent => "Mozilla/5.0 (Macintosh; U; PPC Mac OS X Mach-O; en-US; rv:1.0.1) Gecko/20021219 Chimera/0.6",
|
||||
output => 0,
|
||||
comment => "Chimera browser (Mozilla/Gecko engine) - now Camino Mac PowerPC"
|
||||
},
|
||||
{
|
||||
agent => "Mozilla/5.0 (Macintosh; U; PPC Mac OS X; en-US) AppleWebKit/xx (KHTML like Gecko) OmniWeb/v5xx.xx",
|
||||
output => 0,
|
||||
comment => "OmniWeb 5.x.x Mac OS X browser"
|
||||
},
|
||||
{
|
||||
agent => "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:x.x.x) Gecko/20041107 Firefox/x.x",
|
||||
output => 0,
|
||||
comment => "Firefox browser (Mozilla/Gecko engine) - ex Firebird WinXP"
|
||||
},
|
||||
{
|
||||
agent => "Mozilla/5.0 (Windows; U; Windows NT 5.1; fr; rv:1.8.1) VoilaBot BETA 1.2 (support.voilabotorange-ftgroup.com)",
|
||||
output => 1,
|
||||
comment => "Voila.fr robot"
|
||||
},
|
||||
{
|
||||
agent => "Mozilla/5.0 (Windows;) NimbleCrawler 1.12 obeys UserAgent NimbleCrawler For problems contact: crawlerhealth",
|
||||
output => 1,
|
||||
comment => "Healthline health related search robot (72.5.115.xx)"
|
||||
},
|
||||
{
|
||||
agent => "Mozilla/5.0 (X11; U; Linux i686; de-AT; rv:1.8.0.2) Gecko/20060309 SeaMonkey/1.0",
|
||||
output => 0,
|
||||
comment => "SeaMonkey browser suite (ex Mozilla) on Linux"
|
||||
},
|
||||
{
|
||||
agent => "Mozilla/5.0 [en] (compatible; Gulper Web Bot 0.2.4 www.ecsl.cs.sunysb.edu/~maxim/cgi-bin/Link/GulperBot)",
|
||||
output => 1,
|
||||
comment => "Yuntis : Collaborative Web Resource Categorization and Ranking Project robot"
|
||||
},
|
||||
);
|
||||
|
||||
sub transType {
|
||||
return 0 if $_[0] =~ /(B|P)/; # browser or proxy
|
||||
return 1;
|
||||
}
|
||||
|
||||
sub getAddress { # There are precious few that have an IP that can be gotten out of the XML so I decided to skip this.
|
||||
my $x = '69.42.78.32';
|
||||
#if( $_[0]{Comment} =~ /\d\.\d\.\d/ ) {
|
||||
# print $_[0]{Comment},"\t|\t",$_[0]{Description},"\n";
|
||||
# $x = $_[0]{Comment};
|
||||
# $x =~ s/x/2/;
|
||||
#}
|
||||
return $x;
|
||||
}
|
||||
|
||||
sub testCount {
|
||||
|
||||
if( @ARGV ) {
|
||||
if( $ARGV[0] =~ /\.xml$/ && -r $ARGV[0] ) {
|
||||
my $infile = shift @ARGV ;
|
||||
my $percent = shift @ARGV || 25;
|
||||
use XML::Simple;
|
||||
my $xml = new XML::Simple;
|
||||
my $data = $xml->XMLin($infile);
|
||||
# use Data::Dumper;
|
||||
# print Dumper $data;
|
||||
@testArray = ();
|
||||
my $c = 1;
|
||||
my $div = 20;
|
||||
my $n = $div * $percent / 100;
|
||||
foreach my $set (@{$data->{'user-agent'}}) {
|
||||
$c = 1 if $c > $div;
|
||||
if( $c <= $n ) {
|
||||
push @testArray, {
|
||||
agent => $set->{String},
|
||||
output => transType($set->{Type}),
|
||||
type => $set->{Type},
|
||||
comment => $set->{Description},
|
||||
# comment => $set->{String}, # this is handy for fine tuning the code: it shows the string that failed...
|
||||
address => getAddress($set),
|
||||
};
|
||||
}
|
||||
$c ++;
|
||||
}
|
||||
# use Data::Dumper;
|
||||
# print Dumper \@testArray;
|
||||
}
|
||||
}
|
||||
return scalar(@testArray);
|
||||
}
|
||||
|
||||
|
||||
plan tests => testCount() ;
|
||||
|
||||
my $output;
|
||||
foreach my $testSet (@testArray) {
|
||||
$output = new FAKE_ENV( $testSet->{agent},
|
||||
$testSet->{address} || '69.42.78.32')
|
||||
->requestNotViewed();
|
||||
is($output, $testSet->{output}, $testSet->{comment});
|
||||
}
|
||||
|
||||
{ # this is a local fake of the session, used for testing only
|
||||
package FAKE_ENV;
|
||||
use base 'WebGUI::Session::Env';
|
||||
sub new { shift; return bless { _env => { HTTP_USER_AGENT => $_[0], REMOTE_ADDR => $_[1] } }, __PACKAGE__; }
|
||||
}
|
||||
|
||||
Loading…
Add table
Add a link
Reference in a new issue