#!/usr/bin/perl
#
# parsehints.pl
# Edmund Edgar, 2006-01-02
#
# This script parses log files in combined format to make a list of the pages users are most likely to click after each page in your access logs, based on the referer data.
# It then uses that to generate a <Location> directive for each page, containing a set of mod_headers directives to provide prefetching hints.
# By default it will issue a prefetching hint whenever more than 2% of users go from the page in question to a subsequent one, up to a maximum of 5 prefetching hints per page. You can control these settings with optional parameters.
#
# Usage: ./parsehints.pl <logfile> <domain>[,<domain>] [<min percent clicks to justify prefetch>] [<max prefetches per page>]
# Example: ./parsehints.pl /var/log/apache2/access.log edochan.com,www.edochan.com 20 10 > /etc/apache2/prefetch.conf
#
# a "combined" format log file looks like this:
# 220.217.50.156 - - [07/May/2005:15:31:02 +0100] "GET /learn/lessons.cgi HTTP/1.1" 403 299 "-" "Mozilla/4.0 (compatible; MSIE 6.0; X11; Linux i686; en) Opera 8.0"
#
# Copyright (C) 2006 Edmund Edgar
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
#
# Edmund Edgar, webmaster@edochan.com
#

use strict;

# Built-in settings, used whenever the matching optional argument is missing or not a positive number.
my $defaultMinPercentClicksToJustifyPrefetch = 2;
my $defaultPrefetchesPerPage = 5;

# Positional arguments: log file, comma-separated list of our own domains, then the two optional tuning values.
my ($logFile, $domainList, $minPercentClicksToJustifyPrefetchArg, $prefetchesPerPageArg) = @ARGV;
die "Usage: ./parsehints.pl <logfile> <domain1>[,<domain2>...] [<min percent clicks to justify prefetch>] [<max prefetches per page>] "
   unless $logFile && $domainList;

# Lookup table of the host names that count as "our" site when examining referers.
my @domains = split /,/, $domainList;
my %myDomains = map { ($_ => 1) } @domains;

# Minimum percentage of visitors that must follow a link before it earns a prefetch hint.
my $minPercentClicksToJustifyPrefetch = $minPercentClicksToJustifyPrefetchArg > 0
   ? $minPercentClicksToJustifyPrefetchArg
   : $defaultMinPercentClicksToJustifyPrefetch;

# Upper limit on the number of prefetch hints issued for any single page.
my $prefetchesPerPage = $prefetchesPerPageArg > 0
   ? $prefetchesPerPageArg
   : $defaultPrefetchesPerPage;

# Number of lines the log regex below did not match (non-GET requests and malformed lines).
# Nothing reports it; print it to STDERR when debugging an unfamiliar log format.
my $unreadableLines = 0;

# Counters filled in while reading the log:
#    $referedRefererCounts{$referer}->{$refered}   clicks that went from page $referer to page $refered
#    $pageHitCounts{$page}                          hits on $page, whatever the referer was
my %referedRefererCounts;
my %pageHitCounts;

# Three-argument open with a lexical handle, so a file name from the command line
# can never be interpreted as an open mode or a pipe.
open(my $logHandle, '<', $logFile) or die "Cannot open log file $logFile: $!";

LINE: while (my $line = <$logHandle>) {

   # Hairy regex to pull a URL, its referer and the user agent string out of log files in "combined" or "combined_with_prefetching_hack" format.
   # I'm ignoring POST requests: only GET lines match, everything else is counted as unreadable.
   unless ($line =~ /.*?\s.*?\s.*?\s\[.*?\]\s\"GET\s(.*?)\s.*?\s\"(.*?)\"\s\"(.*?)\"/) {
      $unreadableLines++;
      next LINE;
   }
   my ($refered, $refererWithDomain, $userAgent) = ($1, $2, $3);

   # We want to exclude hits that are really prefetches, both as clicks and as page hits.
   # This will only work if, like me, you've squidged the X-Moz header into your user agent field, to make the format I call "combined_with_prefetching_hack".
   next LINE if $userAgent =~ m/prefetch$/i;

   # Don't prefetch with anything except html pages linked to by other html pages.
   # This is very restrictive - you may well want to change it...
   # The match is anchored so that e.g. "/page.html.zip" does not pass as an html page.
   next LINE unless $refered =~ m/\.html?\z/;

   # I'm also skipping anything with a query string.
   # Your needs may be different...
   next LINE if $refered =~ /\?/;

   # Keep a note of how many hits there are for each page.
   # This will allow us to figure out the proportion of visitors to this page who hit each subsequent page.
   # It has to happen before the referer is examined: visitors who arrived from
   # another site, or with no referer at all, are visitors to this page too.
   $pageHitCounts{$refered}++;

   # Split URL as in the URI.pm docs. The scheme and the fragment are not needed.
   my (undef, $authority, $path, $query) =
      $refererWithDomain =~ m|(?:([^:/?#]+):)?(?://([^/?#]*))?([^?#]*)(?:\?([^#]*))?(?:#(.*))?|;

   # Only clicks that came from one of our own pages are interesting.
   # If there's no referer, the referer field will say "-", which has no authority part and is dropped here as well.
   next LINE unless defined $authority && $myDomains{$authority};

   # Referers with a query string are skipped, just like requested URLs with one.
   # $query is only defined when the referer contained a "?".
   next LINE if defined $query;

   my $referer = $path;
   next LINE unless $referer =~ m/\.html?\z/;

   # A page should never need to prefetch itself.
   next LINE if $refered eq $referer;

   # print "\n$refered,$referer,$userAgent";
   $referedRefererCounts{$referer}->{$refered}++;

}

close $logHandle;

# Write the configuration to STDOUT: one <Location> section per referring page,
# holding one "Header append Link" prefetch hint per sufficiently popular next page.
print "<IfModule mod_headers.c>\n";
foreach my $k (sort keys %referedRefererCounts) {
   my %nextPages = %{ $referedRefererCounts{$k} };

   # Without a hit count for this page no click proportion can be worked out, so it gets no hints.
   my $numHitsToPage = $pageHitCounts{$k};
   next unless $numHitsToPage;

   # An empty path would produce an invalid "<Location >" line.
   my $location = sanitizeForApacheConfig($k);
   next if $location eq '';

   # Most clicked next page first; ties are broken by URL so that the output is reproducible between runs.
   my @candidates = sort { $nextPages{$b} <=> $nextPages{$a} or $a cmp $b } keys %nextPages;

   my $linkCount = 0;
   foreach my $p (@candidates) {

      last if $linkCount >= $prefetchesPerPage;

      # The candidates are in descending order, so once one falls below the threshold all the rest do too.
      my $proportionFromHereToThere = $nextPages{$p} / $numHitsToPage;
      last if ($minPercentClicksToJustifyPrefetch / 100) > $proportionFromHereToThere;

      # The section is only opened once there is at least one hint to put in it.
      print "   <Location $location>\n" if $linkCount == 0;

      # mod_headers treats "%" in a header value as the start of a format specifier (%t, %{VAR}e, ...).
      # Doubling it keeps percent-encoded URLs literal and stops logged URLs from smuggling specifiers in.
      my $target = sanitizeForApacheConfig($p);
      $target =~ s/%/%%/g;

      print "            Header append Link \"<$target>; rel=prefetch\"\n";
      $linkCount++;

   }
   print "   </Location>\n" if $linkCount > 0;
}
print "</IfModule>\n";

sub sanitizeForApacheConfig {

# Both the referer and the refered URL in our logs can be entered freely by anyone with access to our website.
# We don't want people to be able to stick just anything in our prefetch.conf file...
# That means we need to either
#    a) Sanitize the data we find here to make sure it can't be used to do anything beyond affecting the prefetch header.
#    b) Check the generated configuration file manually before using it.
#    c) Check the data some other way.
# This function is a fairly limited attempt at doing (a).
# It is almost certainly insufficient, so for now I recommend (b) - read the file before you give it to apache.
#
# Takes one URL string and returns a copy with the following characters removed:
#    "  \  <  >                  which could close the quoted header value or the <Location ...> tag
#    whitespace, control chars   which could split a directive argument or start a new configuration line
# An undefined argument is returned unchanged.

   my ($url) = @_;
   return $url unless defined $url;

   # Quote, backslash and angle brackets.
   $url =~ tr/"\\<>//d;

   # ASCII control characters (including CR, LF and tab), space and DEL.
   $url =~ tr/\x00-\x20\x7F//d;

   return $url;

}
