improve automatic detection of UAs a bit, remove a few now unneeded hardcoded detections...
author Robert Kaiser <kairo@kairo.at>
Sat, 19 May 2007 12:51:43 +0000 (14:51 +0200)
committer Robert Kaiser <kairo@kairo.at>
Sat, 19 May 2007 12:51:43 +0000 (14:51 +0200)
include/classes/useragent.php-class
testbed/ua_list_raw.txt

index 8b94648395896018d42d0df3b464f1f74619568c..2aa2ddc08fbec5dc6af9ad34f59bb4cd5cc604f8 100755 (executable)
@@ -135,21 +135,23 @@ class userAgent {
     // get UA brand and version
     $this->brand = 'Unknown'; $this->version = null;
     // find reasonable defaults
-    if (preg_match('|([0-9a-zA-Z\.:()_ -]+)/([0-9a-zA-Z\._+-]+)|', $this->uastring, $regs)) {
+    if (preg_match('|([0-9a-zA-Z\.:()_ -]+)/(\d[0-9a-zA-Z\._+-]*)|', $this->uastring, $regs)) {
       $this->brand = trim($regs[1]);
       $this->version = $regs[2];
     }
-    elseif (preg_match('|^([a-zA-Z\._ -]+)[_ -][vV]?([0-9][0-9a-zA-Z\.+]+)|', $this->uastring, $regs)) {
+    elseif (preg_match('|^([a-zA-Z\._ -]+)[_ -][vV]?(\d[0-9a-zA-Z\.+]*)|', $this->uastring, $regs)) {
       $this->brand = trim($regs[1]);
       $this->version = $regs[2];
     }
-    elseif (preg_match('|^([a-zA-Z\._ -]+)|', $this->uastring, $regs)) {
+    elseif (preg_match('|^([0-9a-zA-Z\._ -]+)|', $this->uastring, $regs)) {
       $this->brand = trim($regs[1]);
       $this->version = null;
     }
     $this->bot = (strpos(strtolower($this->brand), 'bot') !== false)
                  || (strpos(strtolower($this->brand), 'crawler') !== false)
-                 || (strpos(strtolower($this->brand), 'spider') !== false);
+                 || (strpos(strtolower($this->brand), 'spider') !== false)
+                 || (strpos(strtolower($this->brand), 'search') !== false)
+                 || (strpos(strtolower($this->brand), 'seek') !== false);
 
     // search for any real and/or special UAs
     if (preg_match('|Netscape6/([0-9a-zA-Z\.+]+)|', $this->uastring, $regs)) {
@@ -389,11 +391,6 @@ class userAgent {
       $this->version = $regs[1];
       $this->bot = false;
     }
-    elseif (preg_match('|wget[/ ]([0-9a-zA-Z\.+]+)|i', $this->uastring, $regs)) {
-      $this->brand = 'wget';
-      $this->version = $regs[1];
-      $this->bot = false;
-    }
     elseif (preg_match('|WinHttp.WinHttpRequest.([0-9\.]+)|i', $this->uastring, $regs)) {
       $this->brand = 'WinHttpRequest';
       $this->version = $regs[1];
@@ -434,11 +431,6 @@ class userAgent {
       $this->version = $regs[1];
       $this->bot = true;
     }
-    elseif (preg_match('|Googlebot/?([0-9a-zA-Z\.+]+)?|', $this->uastring, $regs)) {
-      $this->brand = 'Googlebot';
-      $this->version = $regs[1];
-      $this->bot = true;
-    }
     elseif (preg_match('|Ask Jeeves/([0-9a-zA-Z\.+]+)|', $this->uastring, $regs)) {
       $this->brand = 'Ask Jeeves';
       $this->version = $regs[1];
@@ -540,11 +532,6 @@ class userAgent {
       $this->version = null;
       $this->bot = true;
     }
-    elseif (preg_match('|sitecheck.internetseer.com|', $this->uastring)) {
-      $this->brand = 'internetseer';
-      $this->version = null;
-      $this->bot = true;
-    }
     elseif (preg_match('|Really Gmane.org\'s favicon grabber|', $this->uastring)) {
       $this->brand = 'Really Gmane.org\'s favicon grabber';
       $this->version = null;
@@ -575,11 +562,6 @@ class userAgent {
       $this->version = null;
       $this->bot = true;
     }
-    elseif (preg_match('|42_HAL|', $this->uastring)) {
-      $this->brand = '42_HAL';
-      $this->version = null;
-      $this->bot = true;
-    }
     elseif (preg_match('|Baiduspider|i', $this->uastring)) {
       $this->brand = 'BaiDuSpider';
       $this->version = null;
@@ -707,11 +689,11 @@ class userAgent {
       $this->bot = false;
     }
 
-    $botArray = array('Scooter','Spinne','Vagabondo','Firefly','Scrubby','NG','Pompos','Szukacz','ASPseek',
+    $botArray = array('Scooter','Spinne','Vagabondo','Firefly','Scrubby','NG','Pompos','Szukacz','Schmozilla','42_HAL',
                       'NetResearchServer','LinkWalker','Zeus','W3C_Validator','ZyBorg','Ask Jeeves','ia_archiver',
                       'PingALink Monitoring Services','IlTrovatore-Setaccio','Nutch','Mercator','search.ch',
-                      'appie','larbin','NutchCVS','ObjectsSearch','Webchat','Mediapartners-Google','Schmozilla',
-                      'FavOrg','findlinks','DataCha0s','ichiro','Francis','','','','','','','');
+                      'appie','larbin','NutchCVS','Webchat','Mediapartners-Google','sitecheck.internetseer.com',
+                      'FavOrg','findlinks','DataCha0s','ichiro','Francis','','','','','');
 
     if (in_array($this->brand, $botArray)) {
       $this->bot = true;
index 084cb89c0cb5bad763097e39b566eb9e3266870c..753a76742bffe716370872e6398be4742d309b89 100755 (executable)
@@ -266,11 +266,15 @@ Mercator-2.0
 appie 1.1 (www.walhello.com)
 larbin_2.6.2 (larbin2.6.2@unspecified.mail)
 OWR_Crawler 0.1
+ISC Systems iRc Search 2.1
+NASA Search 1.0
 search.ch V1.4.2 (spiderman@search.ch;
 WebFilter Robot 1.0
 WWWeasel Robot v1.00 (http://wwweasel.de)
 2.0_AC-Plug - http://www.iOpus.com
 Openfind data gatherer, Openbot/3.0+(robot-response@openfind.com.tw;+
+MSRBOT (http://research.microsoft.com/research/sv/msrbot)
+ICCrawler - ICjobs (http://www.icjobs.de/bot.htm)
 Baiduspider+(+http://www.baidu.com/search/spider.htm)
 BaiDuSpider
 LinkWalker