From 06c9b824108145f36b22ce2addb3e272b83fc79d Mon Sep 17 00:00:00 2001 From: Robert Kaiser Date: Sat, 19 May 2007 14:51:43 +0200 Subject: [PATCH] improve automatic detection of UAs a bit, remove a few now unneeded hardcoded detections, add some bot UAs in check list --- include/classes/useragent.php-class | 36 ++++++++--------------------- testbed/ua_list_raw.txt | 4 ++++ 2 files changed, 13 insertions(+), 27 deletions(-) diff --git a/include/classes/useragent.php-class b/include/classes/useragent.php-class index 8b94648..2aa2ddc 100755 --- a/include/classes/useragent.php-class +++ b/include/classes/useragent.php-class @@ -135,21 +135,23 @@ class userAgent { // get UA brand and version $this->brand = 'Unknown'; $this->version = null; // find reasonable defaults - if (preg_match('|([0-9a-zA-Z\.:()_ -]+)/([0-9a-zA-Z\._+-]+)|', $this->uastring, $regs)) { + if (preg_match('|([0-9a-zA-Z\.:()_ -]+)/(\d[0-9a-zA-Z\._+-]*)|', $this->uastring, $regs)) { $this->brand = trim($regs[1]); $this->version = $regs[2]; } - elseif (preg_match('|^([a-zA-Z\._ -]+)[_ -][vV]?([0-9][0-9a-zA-Z\.+]+)|', $this->uastring, $regs)) { + elseif (preg_match('|^([a-zA-Z\._ -]+)[_ -][vV]?(\d[0-9a-zA-Z\.+]*)|', $this->uastring, $regs)) { $this->brand = trim($regs[1]); $this->version = $regs[2]; } - elseif (preg_match('|^([a-zA-Z\._ -]+)|', $this->uastring, $regs)) { + elseif (preg_match('|^([0-9a-zA-Z\._ -]+)|', $this->uastring, $regs)) { $this->brand = trim($regs[1]); $this->version = null; } $this->bot = (strpos(strtolower($this->brand), 'bot') !== false) || (strpos(strtolower($this->brand), 'crawler') !== false) - || (strpos(strtolower($this->brand), 'spider') !== false); + || (strpos(strtolower($this->brand), 'spider') !== false) + || (strpos(strtolower($this->brand), 'search') !== false) + || (strpos(strtolower($this->brand), 'seek') !== false); // search for any real and/or special UAs if (preg_match('|Netscape6/([0-9a-zA-Z\.+]+)|', $this->uastring, $regs)) { @@ -389,11 +391,6 @@ class userAgent { $this->version = $regs[1]; $this->bot = false; } - elseif (preg_match('|wget[/ ]([0-9a-zA-Z\.+]+)|i', $this->uastring, $regs)) { - $this->brand = 'wget'; - $this->version = $regs[1]; - $this->bot = false; - } elseif (preg_match('|WinHttp.WinHttpRequest.([0-9\.]+)|i', $this->uastring, $regs)) { $this->brand = 'WinHttpRequest'; $this->version = $regs[1]; @@ -434,11 +431,6 @@ class userAgent { $this->version = $regs[1]; $this->bot = true; } - elseif (preg_match('|Googlebot/?([0-9a-zA-Z\.+]+)?|', $this->uastring, $regs)) { - $this->brand = 'Googlebot'; - $this->version = $regs[1]; - $this->bot = true; - } elseif (preg_match('|Ask Jeeves/([0-9a-zA-Z\.+]+)|', $this->uastring, $regs)) { $this->brand = 'Ask Jeeves'; $this->version = $regs[1]; @@ -540,11 +532,6 @@ class userAgent { $this->version = null; $this->bot = true; } - elseif (preg_match('|sitecheck.internetseer.com|', $this->uastring)) { - $this->brand = 'internetseer'; - $this->version = null; - $this->bot = true; - } elseif (preg_match('|Really Gmane.org\'s favicon grabber|', $this->uastring)) { $this->brand = 'Really Gmane.org\'s favicon grabber'; $this->version = null; @@ -575,11 +562,6 @@ class userAgent { $this->version = null; $this->bot = true; } - elseif (preg_match('|42_HAL|', $this->uastring)) { - $this->brand = '42_HAL'; - $this->version = null; - $this->bot = true; - } elseif (preg_match('|Baiduspider|i', $this->uastring)) { $this->brand = 'BaiDuSpider'; $this->version = null; @@ -707,11 +689,11 @@ class userAgent { $this->bot = false; } - $botArray = array('Scooter','Spinne','Vagabondo','Firefly','Scrubby','NG','Pompos','Szukacz','ASPseek', + $botArray = array('Scooter','Spinne','Vagabondo','Firefly','Scrubby','NG','Pompos','Szukacz','Schmozilla','42_HAL', 'NetResearchServer','LinkWalker','Zeus','W3C_Validator','ZyBorg','Ask Jeeves','ia_archiver', 'PingALink Monitoring Services','IlTrovatore-Setaccio','Nutch','Mercator','search.ch', - 'appie','larbin','NutchCVS','ObjectsSearch','Webchat','Mediapartners-Google','Schmozilla', - 'FavOrg','findlinks','DataCha0s','ichiro','Francis','','','','','','',''); + 'appie','larbin','NutchCVS','Webchat','Mediapartners-Google','sitecheck.internetseer.com', + 'FavOrg','findlinks','DataCha0s','ichiro','Francis','','','','',''); if (in_array($this->brand, $botArray)) { $this->bot = true; diff --git a/testbed/ua_list_raw.txt b/testbed/ua_list_raw.txt index 084cb89..753a767 100755 --- a/testbed/ua_list_raw.txt +++ b/testbed/ua_list_raw.txt @@ -266,11 +266,15 @@ Mercator-2.0 appie 1.1 (www.walhello.com) larbin_2.6.2 (larbin2.6.2@unspecified.mail) OWR_Crawler 0.1 +ISC Systems iRc Search 2.1 +NASA Search 1.0 search.ch V1.4.2 (spiderman@search.ch; WebFilter Robot 1.0 WWWeasel Robot v1.00 (http://wwweasel.de) 2.0_AC-Plug - http://www.iOpus.com Openfind data gatherer, Openbot/3.0+(robot-response@openfind.com.tw;+ +MSRBOT (http://research.microsoft.com/research/sv/msrbot) +ICCrawler - ICjobs (http://www.icjobs.de/bot.htm) Baiduspider+(+http://www.baidu.com/search/spider.htm) BaiDuSpider LinkWalker -- 2.35.3