1 package Yaala::Parser::WebserverTools;
5 use vars qw(%fields %MONTH_NUMBERS);
8 use Yaala::Config qw#get_config read_config#;
10 @Yaala::Parser::WebserverTools::EXPORT_OK = qw(%MONTH_NUMBERS detect_referer
11 detect_browser detect_os extract_data);
12 @Yaala::Parser::WebserverTools::ISA = ('Exporter');
14 read_config ('webserver.config');
16 our $referer_format = get_config ('referer_format');
17 our $localhost_name = '';
18 our @local_aliases = get_config ('localhost');
20 our %recognized_browsers;
23 # Used to translate the month's name into it's number
24 %MONTH_NUMBERS = ( 'Jan' => 1,
38 (# the CGI fields that different search engines use to store the search strings in
39 'MT' => 'lycos', # hotbot.lycos.com
40 'ask' => 'ask.com', # ask.com/main/metaAnswer.asp
41 'origq' => 'msn', # search.msn.com/results.asp
42 'p' => 'yahoo', # google.yahoo.com/bin/query
43 'q' => 'google|freshmeat', # google.com/search, freshmeat.net/search, google.de/search
44 'qs' => 'virgilio', # search.virgilio.it/search/cgi/search.cgi
45 'query' => 'lycos', # search-arianna.iol.it/abin/internationalsearch, search.lycos.com/main/default.asp, suche.lycos.de/cgi-bin/pursuit
46 'search' => 'altavista|excite' # altavista.com/iepane, search.excite.ca/search.gw
50 my $include_local = get_config ('referer_include_localhost');
52 if ($include_local =~ m/true/i)
54 $localhost_name = 'localhost';
58 my $VERSION = '$Id: WebserverTools.pm,v 1.5 2003/12/07 16:47:14 octo Exp $';
59 print STDERR $/, __FILE__, ": $VERSION" if ($::DEBUG);
64 # Used to extract the referer if parsing webserver
72 ($host, $uri, $params) =
73 $referer =~ m#^\w+://([^:/]+)(?::\d+)?(/[^\?]*)(\??.*)#;
74 #$referer =~ m#^\w+://([^:/]+)(?::\d+)?#;
76 return ('') unless (defined ($host));
78 if (grep { $host =~ m/$_/i } @local_aliases)
80 $host = $localhost_name;
83 return ('*NONE*') unless ($host);
85 if ($referer_format eq 'full')
87 return ($host . $uri . $params);
89 elsif ($referer_format eq 'url')
91 return ($host . $uri);
100 # This is used to (try) to translate the browser
101 # string into something more human-readable and
102 # to have a smaller number of browsers so
103 # information is easier to cathegorize.. If you
104 # don't understand this routine without comments
105 # you should invest in some perl book, I think..
109 if (defined $recognized_browsers{$browser})
111 return ($recognized_browsers{$browser});
114 my $name = 'unknown';
115 if ($browser =~ /Lynx/i) { $name = 'Lynx'; }
116 elsif ($browser =~ /Links/i) { $name = 'Links'; }
117 elsif ($browser =~ /Opera/i) { $name = 'Opera'; }
118 elsif ($browser =~ /WebTV/i) { $name = 'WebTV'; }
119 elsif ($browser =~ /curl/i) { $name = 'curl'; }
120 elsif ($browser =~ /wget/i) { $name = 'wget'; }
121 elsif ($browser =~ /GetRight|GoZilla/i) { $name = 'Download Manager'; }
122 elsif ($browser =~ /bot|Google|Slurp|Scooter|Spider|Infoseek|Crawl|Mercator|FireBall|av\.com|Teoma|Ask Jeeves/i) { $name = 'Search Engines'; }
123 elsif ($browser =~ /Mozilla/i)
125 if ($browser =~ /Galeon/i) { $name = 'Galeon'; }
126 elsif ($browser =~ /Phoenix/i) { $name = 'Phoenix'; }
127 elsif ($browser =~ /Chimera|Camino/i) { $name = 'Camino'; }
128 elsif ($browser =~ /Konqueror/i) { $name = 'Konqueror'; }
129 elsif ($browser =~ /Safari/i) { $name = 'Safari'; }
130 elsif ($browser =~ /MultiZilla/i) { $name = 'MultiZilla'; }
131 elsif ($browser =~ /MSIE/i) { $name = 'MSIE'; }
132 elsif ($browser =~ /compatible/i) { $name = 'Netscape compatible'; }
133 elsif ($browser =~ m!Mozilla/[0-4]!i or $browser =~ m/Netscape/i)
134 { $name = 'Netscape Navigator'; }
135 else { $name = 'Mozilla'; }
137 elsif ($::DEBUG & 0x2000)
139 print $/, __FILE__, ": Unknown browser: '$browser'";
142 $recognized_browsers{$browser} = $name;
147 # uses the same string "detect_browser" does,
148 # except for that it extracts the operating system
149 # as good as possible.
153 if (defined $recognized_oses{$os})
155 return ($recognized_oses{$os});
158 my $name = 'unknown';
159 if ($os =~ /IRIX/i) { $name = 'IRIX'; }
160 elsif ($os =~ /AIX/i) { $name = 'AIX'; }
161 elsif ($os =~ /Sun/i) { $name = 'SunOS'; }
162 elsif ($os =~ /BeOS/i) { $name = 'BeOS'; }
163 elsif ($os =~ /OS.?2/i) { $name = 'OS/2'; }
164 elsif ($os =~ /Amiga/i) { $name = 'AmigaOS'; }
165 elsif ($os =~ /Mac|PPC/i) { $name = 'MacOS'; }
166 elsif ($os =~ /BSD/i)
168 if ($os =~ /open/i) { $name = 'OpenBSD'; }
169 elsif ($os =~ /free/i) { $name = 'FreeBSD'; }
170 elsif ($os =~ /net/i) { $name = 'NetBSD'; }
171 else { $name = 'some BSD'; }
173 elsif ($os =~ /Linux|X11|KDE|Genome|Gnome/i) { $name = 'Linux'; }
174 elsif ($os =~ /Win/i)
176 if ($os =~ /95/) { $name = 'Windows 95'; }
177 elsif ($os =~ /98/) { $name = 'Windows 98'; }
178 elsif ($os =~ /Me/i) { $name = 'Windows ME'; }
181 if ($os =~ /NT.5.1/i) { $name = 'Windows XP'; }
182 elsif ($os =~ /NT.5.0/i) { $name = 'Windows 2000'; }
183 else { $name = 'Windows NT'; }
185 elsif ($os =~ /2000|2k/i) { $name = 'Windows 2000'; }
186 elsif ($os =~ /xp/i) { $name = 'Windows XP'; }
187 else { $name = 'some Windows'; }
189 elsif ($os =~ /ix/i) { $name = 'some UNIX'; }
190 elsif ($::DEBUG & 0x2000)
192 print $/, __FILE__, ": Unknown OS: '$os'";
195 $recognized_oses{$os} = $name;
200 # This routine looks for data in the referer and
201 # extracts terms that visitors of this site were
202 # searching for at ome of the major searchengines.
203 # I know that my list is far from being complete.
204 # If your favorite search engine isn't included
205 # please feel free to contact me.
207 # If there is a field that may contain such
208 # information, then it's this one..
211 # We will save every field (if any) here with it's
212 # data being the value..
214 my ($key, $val) = ('', '');
216 # $server is the server the visitor is coming
217 # from, $string the entire data which will need
219 my ($server, $string) = split (/\?/, $referer, 2);
221 # Don't do anything unless there is any data..
222 # We have to return an empty list since zero would
223 # get interpreted as a one-element array with the
224 # only value being "0", making zero the top word..
225 return () unless $string;
230 # Split data into key=value pairs
231 foreach (split (/\&/, $string))
233 ($key, $val) = split (/=/, $_, 2);
234 next unless defined $val;
236 # A "+" in the request-string means a whitespace
239 # Ignore all special characters.. I know that's
240 # lazy and will screw up words like "foo-bar", but
241 # IMO it does more good than bad. If you don't
242 # think so either uncomment the appended line or
243 # write better code and drop me a copy..
244 # $val =~ s/\%(.{2})/pack("C", hex($1))/eg;
245 $val =~ s/\%(.{2})//g;
249 # Print the hash's content to STDOUT if you set
250 # $::DEBUG to anything higher than 2 (3, eg.)
251 # This is extremely usefull for finding search-
252 # engines and which fields they are using..
253 # use './yaala | grep DATA | sort | less' for the
254 # best/easiest to read results..
255 if ($::DEBUG & 0x1000)
257 print $/, __FILE__, "Extracted data: $_ = ", $form{$_} for (keys %form);
261 # Cycles through every PREdefined field that may
262 # contain the information we want. If this field
263 # exists, we check wether the previous visited
264 # server matches the regexp (the corresponding
265 # value in %fields) and if that's the case, we
266 # split the line into words saving it to %words to
267 # prevent duplicates. (otherwise a search for
268 # "foo foo foo foo foo foo foo" would result into
269 # increasing "foo" dramatically..
270 foreach $field (keys %fields)
272 # check for this field's existance..
273 next unless defined $form{$field};
275 $regexp = $fields{$field};
277 # check wether the server matches out regexp..
278 next unless $server =~ /$regexp/i;
280 $string = lc ($form{$field});
282 # this is a google-only thing that appears when
283 # the visitor used google's cache option..
284 next if $string =~ /^cache:/;
286 # And, after all these tests, save the data..
287 map { if (length ($_) > 2) { $words{$_} = 1; } } (split (/\s+/, $string));
289 # return %words's keys as a list, which may be