1 package Yaala::Parser::WebserverTools;
5 use vars qw(%fields %MONTH_NUMBERS);
8 use Yaala::Config qw#get_config read_config#;
10 @Yaala::Parser::WebserverTools::EXPORT_OK = qw(%MONTH_NUMBERS detect_referer
11 detect_browser detect_os extract_data);
12 @Yaala::Parser::WebserverTools::ISA = ('Exporter');
14 read_config ('webserver.config');
16 our $referer_format = get_config ('referer_format');
17 our $localhost_name = '';
18 our @local_aliases = get_config ('localhost');
20 our %recognized_browsers;
23 # Used to translate the month's name into it's number
24 %MONTH_NUMBERS = ( 'Jan' => 1,
38 (# the CGI fields that different search engines use to store the search strings in
39 'MT' => 'lycos', # hotbot.lycos.com
40 'ask' => 'ask.com', # ask.com/main/metaAnswer.asp
41 'origq' => 'msn', # search.msn.com/results.asp
42 'p' => 'yahoo', # google.yahoo.com/bin/query
43 'q' => 'google|freshmeat', # google.com/search, freshmeat.net/search, google.de/search
44 'qs' => 'virgilio', # search.virgilio.it/search/cgi/search.cgi
45 'query' => 'lycos', # search-arianna.iol.it/abin/internationalsearch, search.lycos.com/main/default.asp, suche.lycos.de/cgi-bin/pursuit
46 'search' => 'altavista|excite' # altavista.com/iepane, search.excite.ca/search.gw
50 my $include_local = get_config ('referer_include_localhost');
52 if ($include_local =~ m/true/i)
54 $localhost_name = 'localhost';
58 my $VERSION = '$Id: WebserverTools.pm,v 1.5 2003/12/07 16:47:14 octo Exp $';
59 print STDERR $/, __FILE__, ": $VERSION" if ($::DEBUG);
64 # Used to extract the referer if parsing webserver
72 ($host, $uri, $params) =
73 $referer =~ m#^\w+://([^:/]+)(?::\d+)?(/[^\?]*)(\??.*)#;
74 #$referer =~ m#^\w+://([^:/]+)(?::\d+)?#;
76 return ('') unless (defined ($host));
78 if (grep { $host =~ m/$_/i } @local_aliases)
80 $host = $localhost_name;
83 return ('*NONE*') unless ($host);
85 if ($referer_format eq 'full')
87 return ($host . $uri . $params);
89 elsif ($referer_format eq 'url')
91 return ($host . $uri);
100 # This is used to (try) to translate the browser
101 # string into something more human-readable and
102 # to have a smaller number of browsers so
103 # information is easier to cathegorize.. If you
104 # don't understand this routine without comments
105 # you should invest in some perl book, I think..
109 if (defined $recognized_browsers{$browser})
111 return ($recognized_browsers{$browser});
114 my $name = 'unknown';
115 if ($browser =~ /Mozilla/i)
117 if ($browser =~ m/Firefox|Iceweasel|Minefield|BonEcho/i) { $name = 'Firefox'; }
118 elsif ($browser =~ m/SeaMonkey/i) { $name = 'SeaMonkey'; }
119 elsif ($browser =~ m/Konqueror/i) { $name = 'Konqueror'; }
120 elsif ($browser =~ m/Safari/i) { $name = 'Safari'; }
121 elsif ($browser =~ m/MSIE/i) { $name = 'MSIE'; }
122 elsif ($browser =~ m/Epiphany/i) { $name = 'Epiphany'; }
123 elsif ($browser =~ m/compatible/i) { $name = 'Netscape compatible'; }
124 elsif ($browser =~ m!Mozilla/[0-4]!i or $browser =~ m/Netscape/i) { $name = 'Netscape Navigator'; }
130 elsif ($browser =~ /Lynx/i) { $name = 'Lynx'; }
131 elsif ($browser =~ /Links/i) { $name = 'Links'; }
132 elsif ($browser =~ /Opera/i) { $name = 'Opera'; }
133 elsif ($browser =~ /WebTV/i) { $name = 'WebTV'; }
134 elsif ($browser =~ /curl/i) { $name = 'curl'; }
135 elsif ($browser =~ /wget/i) { $name = 'wget'; }
136 elsif ($browser =~ /GetRight|GoZilla/i) { $name = 'Download Manager'; }
137 elsif ($browser =~ /bot|Google|Slurp|Scooter|Spider|Infoseek|Crawl|Mercator|FireBall|av\.com|Teoma|Ask Jeeves/i) { $name = 'Search Engines'; }
138 elsif ($::DEBUG & 0x2000)
140 print $/, __FILE__, ": Unknown browser: '$browser'";
143 $recognized_browsers{$browser} = $name;
148 # uses the same string "detect_browser" does,
149 # except for that it extracts the operating system
150 # as good as possible.
154 if (defined $recognized_oses{$os})
156 return ($recognized_oses{$os});
159 my $name = 'unknown';
160 if ($os =~ /IRIX/i) { $name = 'IRIX'; }
161 elsif ($os =~ /AIX/i) { $name = 'AIX'; }
162 elsif ($os =~ /Sun/i) { $name = 'SunOS'; }
163 elsif ($os =~ /BeOS/i) { $name = 'BeOS'; }
164 elsif ($os =~ /OS.?2/i) { $name = 'OS/2'; }
165 elsif ($os =~ /Amiga/i) { $name = 'AmigaOS'; }
166 elsif ($os =~ /Mac|PPC/i) { $name = 'MacOS'; }
167 elsif ($os =~ /BSD/i)
169 if ($os =~ /open/i) { $name = 'OpenBSD'; }
170 elsif ($os =~ /free/i) { $name = 'FreeBSD'; }
171 elsif ($os =~ /net/i) { $name = 'NetBSD'; }
172 else { $name = 'some BSD'; }
174 elsif ($os =~ /Linux|X11|KDE|Genome|Gnome/i) { $name = 'Linux'; }
175 elsif ($os =~ /Win/i)
177 if ($os =~ /95/) { $name = 'Windows 95'; }
178 elsif ($os =~ /98/) { $name = 'Windows 98'; }
179 elsif ($os =~ /Me/i) { $name = 'Windows ME'; }
182 if ($os =~ /NT.5.1/i) { $name = 'Windows XP'; }
183 elsif ($os =~ /NT.5.0/i) { $name = 'Windows 2000'; }
184 else { $name = 'Windows NT'; }
186 elsif ($os =~ /2000|2k/i) { $name = 'Windows 2000'; }
187 elsif ($os =~ /xp/i) { $name = 'Windows XP'; }
188 else { $name = 'some Windows'; }
190 elsif ($os =~ /ix/i) { $name = 'some UNIX'; }
191 elsif ($::DEBUG & 0x2000)
193 print $/, __FILE__, ": Unknown OS: '$os'";
196 $recognized_oses{$os} = $name;
201 # This routine looks for data in the referer and
202 # extracts terms that visitors of this site were
203 # searching for at ome of the major searchengines.
204 # I know that my list is far from being complete.
205 # If your favorite search engine isn't included
206 # please feel free to contact me.
208 # If there is a field that may contain such
209 # information, then it's this one..
212 # We will save every field (if any) here with it's
213 # data being the value..
215 my ($key, $val) = ('', '');
217 # $server is the server the visitor is coming
218 # from, $string the entire data which will need
220 my ($server, $string) = split (/\?/, $referer, 2);
222 # Don't do anything unless there is any data..
223 # We have to return an empty list since zero would
224 # get interpreted as a one-element array with the
225 # only value being "0", making zero the top word..
226 return () unless $string;
231 # Split data into key=value pairs
232 foreach (split (/\&/, $string))
234 ($key, $val) = split (/=/, $_, 2);
235 next unless defined $val;
237 # A "+" in the request-string means a whitespace
240 # Ignore all special characters.. I know that's
241 # lazy and will screw up words like "foo-bar", but
242 # IMO it does more good than bad. If you don't
243 # think so either uncomment the appended line or
244 # write better code and drop me a copy..
245 # $val =~ s/\%(.{2})/pack("C", hex($1))/eg;
246 $val =~ s/\%(.{2})//g;
250 # Print the hash's content to STDOUT if you set
251 # $::DEBUG to anything higher than 2 (3, eg.)
252 # This is extremely usefull for finding search-
253 # engines and which fields they are using..
254 # use './yaala | grep DATA | sort | less' for the
255 # best/easiest to read results..
256 if ($::DEBUG & 0x1000)
258 print $/, __FILE__, "Extracted data: $_ = ", $form{$_} for (keys %form);
262 # Cycles through every PREdefined field that may
263 # contain the information we want. If this field
264 # exists, we check wether the previous visited
265 # server matches the regexp (the corresponding
266 # value in %fields) and if that's the case, we
267 # split the line into words saving it to %words to
268 # prevent duplicates. (otherwise a search for
269 # "foo foo foo foo foo foo foo" would result into
270 # increasing "foo" dramatically..
271 foreach $field (keys %fields)
273 # check for this field's existance..
274 next unless defined $form{$field};
276 $regexp = $fields{$field};
278 # check wether the server matches out regexp..
279 next unless $server =~ /$regexp/i;
281 $string = lc ($form{$field});
283 # this is a google-only thing that appears when
284 # the visitor used google's cache option..
285 next if $string =~ /^cache:/;
287 # And, after all these tests, save the data..
288 map { if (length ($_) > 2) { $words{$_} = 1; } } (split (/\s+/, $string));
290 # return %words's keys as a list, which may be