79d762585bd4d08f45d725cc42aa9cb7857f4342
[yaala.git] / lib / Yaala / Parser / Ncsa.pm
1 package Yaala::Parser;
2
3 use strict;
4 use warnings;
5 use vars qw(%DATAFIELDS);
6
7 use Exporter;
8 use Yaala::Parser::WebserverTools qw#%MONTH_NUMBERS detect_referer detect_browser
9         detect_os extract_data#;
10 use Yaala::Data::Persistent qw#init#;
11
# Exporter configuration: nothing is exported by default, the symbols
# below are available on request.
@Yaala::Parser::ISA = qw(Exporter);
@Yaala::Parser::EXPORT_OK = qw(parse extra %DATAFIELDS);

# State handed out by Yaala::Data::Persistent::init (): $LASTDATE is used
# as a scalar reference, $EXTRA as a hash reference.
# NOTE(review): presumably both survive between runs - confirm against
# Yaala::Data::Persistent.
our $LASTDATE = init ('$LASTDATE', 'scalar');
our $EXTRA = init ('$EXTRA', 'hash');

# Give every piece of state a sane starting value if it has none yet.
$$LASTDATE = 0 unless ($$LASTDATE);
$EXTRA->{'total'} = 0 unless (defined ($EXTRA->{'total'}));
for my $container ('days', 'search_terms')
{
        $EXTRA->{$container} = {} unless (defined ($EXTRA->{$container}));
}

# The fields this parser provides: ``key'' fields (optionally with a
# type after the colon) and ``agg'' fields.
%DATAFIELDS = (
        'host'     => 'key:host',
        'user'     => 'key',
        'date'     => 'key:date',
        'hour'     => 'key:hour',
        'tld'      => 'key',
        'file'     => 'key',
        'status'   => 'key:numeric',
        'browser'  => 'key',
        'os'       => 'key',
        'referer'  => 'key:url',

        'bytes'    => 'agg:bytes',
        'requests' => 'agg'
);

# Loading Yaala::Data::Core has to wait until runtime: Data uses Setup,
# which in turn relies on %DATAFIELDS being filled in already.  -octo
require Yaala::Data::Core;
Yaala::Data::Core->import ('store');

my $VERSION = '$Id: Ncsa.pm,v 1.10 2003/12/07 15:40:35 octo Exp $';
if ($::DEBUG)
{
        print STDERR $/, __FILE__, ": $VERSION";
}

# True value for ``require''; the subs below are compiled regardless.
return 1;
48
# Parse one line of an NCSA combined-format access log and pass the
# extracted fields on to store ().
#
# Arguments:
#   $line - one raw log line.
#
# Returns:
#   undef if no (or a false) line was passed,
#   0     if the timestamp or the request could not be parsed, or if the
#         entry is older than the newest entry seen so far ($$LASTDATE),
#   otherwise whatever store () returns.
# Lines that do not match the log format at all are only reported, and
# only if $::DEBUG is set.
#
# Side effects: advances $$LASTDATE and updates the counters in $EXTRA
# (``total'', ``days'' and ``search_terms'').
sub parse
{
        my $line = shift or return undef;

        # Alternative pattern using \s instead of literal blanks:
        #if ($line =~ m#^(\S+)\s(\S+)\s(\S+)\s\[([^\]]+)\]\s"([^"]+)"\s(\d+)\s(\S+)\s"([^"]+)"\s"([^"]+)"(?:\s"([^"]+)")?$#)
        if ($line =~ m#^(\S+) (\S+) (\S+) \[([^\]]+)\] "([^"]+)" (\d+) (\S+) "([^"]+)" "([^"]+)"(?: "([^"]+)")?$#)
        {
                # Initialize the variables that we can get out of
                # each line first.. ($ident and $cookie are captured
                # only to document the format; they are not used.)
                my ($host, $ident, $user, $date, $request, $status,
                        $bytes, $referer, $browser, $cookie) =
                ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10);

                # And now initialize all the variables we will use
                # to get more information out of each field..
                my ($day, $month, $year, $hour, $minute, $second) =
                        $date =~ m#(\d\d)/(\w{3})/(\d{4}):(\d\d):(\d\d):(\d\d)#;

                # Reject timestamps we cannot make sense of instead of
                # carrying undefined values through the rest of the sub.
                my $month_number = (defined ($month) ? $MONTH_NUMBERS{$month} : undef);
                if (!defined ($month_number))
                {
                        print STDERR $/, __FILE__, ": Malformed date: ``$date''." if ($::DEBUG);
                        return (0);
                }

                $month = $month_number;
                $date = sprintf ("%04u-%02u-%02u", $year, $month, $day);

                # YYYYMMDDhhmmss as one number, so that timestamps can
                # be compared numerically.
                my $timestamp = int (sprintf ("%04u%02u%02u%02u%02u%02u",
                                $year, $month, $day, $hour, $minute, $second));

                if ($timestamp < $$LASTDATE)
                {
                        # Leave the sub here. A ``next'' inside a bare
                        # block only leaves that block, which would let
                        # the outdated entry be counted after all.
                        print STDERR $/, __FILE__, ": Skipping.. ($timestamp < $$LASTDATE)" if ($::DEBUG & 0x0200);
                        return (0);
                }
                $$LASTDATE = $timestamp;

                # The request looks like ``METHOD /path?params ...'';
                # only the path is of interest.
                my $file;
                if ($request =~ m#(\S+) ([^ \?]+)\??(\S*)#)
                {
                        $file = $2;
                }
                else
                {
                        print STDERR $/, __FILE__, ": Malformed request: ``$request''." if ($::DEBUG);
                        return (0);
                }

                # A user name sent along with a 4xx response is
                # recorded as invalid rather than under that name.
                if (($user ne '-') and ($status >= 400) and ($status < 500))
                {
                        $user = '*INVALID*';
                }

                if ($user eq '-') { $user = '*UNKNOWN*'; }
                if ($bytes eq '-') { $bytes = 0; }

                # Anything that does not end in ``.letters'' is
                # treated as an unresolved host.
                my $tld;
                if ($host =~ m/\.([a-z]{2,})$/i)
                {
                        $tld = lc ($1);
                }
                else
                {
                        $tld = '*UNRESOLVED*';
                }

                my $os = detect_os ($browser);
                my $browser_name = detect_browser ($browser);
                # extract_data () gets the raw referer, i.e. before
                # ``-'' is turned into the empty string.
                my @search_terms = extract_data ($referer);
                if ($referer eq '-') { $referer = ''; }

                $EXTRA->{'total'}++;
                $EXTRA->{'days'}{$date}++;

                if (scalar @search_terms)
                {
                        # Diagnostics go to STDERR like everywhere else
                        # in this file.
                        print STDERR $/, __FILE__, ": Search Terms: ",
                                join (' ', @search_terms)
                                if ($::DEBUG & 0x1000);

                        $EXTRA->{'search_terms'}{$_}++ for (@search_terms);
                }

                my %combined = (
                                        'host'          =>      $host,
                                        'user'          =>      $user,
                                        'date'          =>      $date,
                                        'hour'          =>      $hour,
                                        'browser'       =>      $browser_name,
                                        'os'            =>      $os,
                                        'tld'           =>      $tld,
                                        'file'          =>      $file,
                                        'referer'       =>      $referer,
                                        'status'        =>      $status,
                                        'bytes'         =>      $bytes,
                                        'requests'      =>      1
                                );
                store (\%combined);
        }
        elsif ($::DEBUG)
        {
                chomp ($line);
                print STDERR $/, __FILE__, ": Unable to parse: '$line'";
        }
}
152
# Publish summary information gathered while parsing in the global
# $::EXTRA hash: the total number of requests, the average number of
# requests per day, the length of the reporting period and the most
# frequently used search terms.
#
# Takes no arguments. Returns 0 right away (leaving $::EXTRA untouched)
# if no day has been recorded in $EXTRA->{'days'} yet.
sub extra
{
        my $days = scalar (keys (%{$EXTRA->{'days'}}));
        return (0) unless ($days);

        my $average = sprintf ("%.1f", ($EXTRA->{'total'} / $days));

        $::EXTRA->{'Total requests'} = $EXTRA->{'total'};
        $::EXTRA->{'Average requests per day'} = $average;
        $::EXTRA->{'Reporting period'} = "$days days";

        # Search terms, most frequently used first.
        my $term_count = $EXTRA->{'search_terms'};
        my @sorted_terms = sort
                { $term_count->{$b} <=> $term_count->{$a} }
                (keys %$term_count);

        if (@sorted_terms)
        {
                my $max = $term_count->{$sorted_terms[0]};
                my @scalar_terms = ();

                # List only the terms that were used more than a tenth
                # as often as the most popular one.
                while (@sorted_terms and
                        ($term_count->{$sorted_terms[0]} / $max) > 0.1)
                {
                        # A lexical is used on purpose: assigning to $_
                        # would clobber the caller's $_.
                        my $term = shift (@sorted_terms);

                        push (@scalar_terms,
                                sprintf ("%s (%u)",
                                        $term, $term_count->{$term})
                        );
                }

                # NOTE(review): the terms are joined with literal markup;
                # whether they have been HTML-escaped upstream is not
                # visible from here - TODO confirm.
                $::EXTRA->{'Search terms used'} = join ("<br />\n      ", @scalar_terms);

                # Whatever is left did not make the cut; only say how many.
                if (@sorted_terms)
                {
                        my $skipped = scalar (@sorted_terms);
                        $::EXTRA->{'Search terms used'} .= "<br />\n      $skipped more skipped";
                }
        }
}