#!/usr/bin/perl
#
# parsehints.pl
# Edmund Edgar, 2006-01-02
#
# This script parses log files in "combined" format to build a list of the pages users are most likely to click after each page in your access logs, based on the referer data.
# It then uses that list to generate a Location section for each page, containing a set of mod_headers directives that provide prefetching hints.
# By default it will issue a prefetching hint whenever at least 2% of users go from the page in question to a subsequent one, up to a maximum of 5 prefetching hints per page. You can control these settings with optional parameters.
#
# Usage: ./parsehints.pl <logfile> <domain>[,<domain>...] [<min_percent_clicks>] [<max_prefetches_per_page>]
# Example: ./parsehints.pl /var/log/apache2/access.log edochan.com,www.edochan.com 20 10 > /etc/apache2/prefetch.conf
#
# a "combined" format log file looks like this:
# - - [07/May/2005:15:31:02 +0100] "GET /learn/lessons.cgi HTTP/1.1" 403 299 "-" "Mozilla/4.0 (compatible; MSIE 6.0; X11; Linux i686; en) Opera 8.0"
#
# Copyright (C) 2006 Edmund Edgar
#
# Edmund Edgar, webmaster@edochan.com
use strict;

# Defaults used when the optional command-line arguments are omitted or are not positive numbers.
my $defaultPrefetchesPerPage = 5;
my $defaultMinPercentClicksToJustifyPrefetch = 2;

# Positional arguments: log file, comma-separated list of our own domains,
# then the two optional tuning values (minimum percentage first, maximum hints per page second).
my ($logFile, $domainList, $minPercentClicksToJustifyPrefetchArg, $prefetchesPerPageArg) = @ARGV;

# Test for defined-and-non-empty rather than truthiness, so that a file literally named "0" is accepted.
unless (defined $logFile && length $logFile && defined $domainList && length $domainList) {
	die "Usage: ./parsehints.pl <logfile> <domain>[,<domain>...] [<min_percent_clicks>] [<max_prefetches_per_page>]\n";
}

# Lookup table of the host names that count as "our" site when examining referers.
my @domains = split /,/, $domainList;
my %myDomains = map { $_ => 1 } @domains;

# Minimum percentage of visitors that must follow a link before it earns a prefetch hint.
# The "+ 0" forces a plain number so that later arithmetic never sees trailing junk from the command line.
my $minPercentClicksToJustifyPrefetch = $defaultMinPercentClicksToJustifyPrefetch;
if (defined $minPercentClicksToJustifyPrefetchArg && $minPercentClicksToJustifyPrefetchArg > 0) {
	$minPercentClicksToJustifyPrefetch = $minPercentClicksToJustifyPrefetchArg + 0;
}

# Maximum number of prefetch hints emitted for any one page.
my $prefetchesPerPage = $defaultPrefetchesPerPage;
if (defined $prefetchesPerPageArg && $prefetchesPerPageArg > 0) {
	$prefetchesPerPage = $prefetchesPerPageArg + 0;
}

# Count of log lines that did not match the expected format (this includes non-GET requests).
my $unreadableLines = 0;

# Three-argument open: the file name is always treated as a plain path opened for reading,
# so a name such as "cmd |" or ">file" cannot turn into a pipe or a write.
# The bareword handle LOG is kept because the later "close LOG" refers to it by that name.
open(LOG, '<', $logFile) or die "Cannot open log file $logFile: $!";

# $referedRefererCounts{$referer}{$refered}: how often $refered was requested with $referer as its referer.
my %referedRefererCounts;
# $pageHitCounts{$page}: number of GET hits to each plain HTML page; used as the denominator for click-through proportions.
my %pageHitCounts;

LINE: while (my $line = <LOG>) {
	# Hairy regex to pull a URL, its referer and the user agent string out of log lines in "combined" or "combined_with_prefetching_hack" format.
	# I'm ignoring POST requests: only GET lines match, everything else is counted as unreadable.
	my ($refered, $refererWithDomain, $userAgent) =
		$line =~ /.*?\s.*?\s.*?\s\[.*?\]\s\"GET\s(.*?)\s.*?\s\"(.*?)\"\s\"(.*?)\"/;
	unless (defined $userAgent) {
		$unreadableLines++;
		next LINE;
	}

	# Don't prefetch with anything except html pages linked to by other html pages.
	# This is very restrictive - you may well want to change it...
	next LINE unless $refered =~ m/\.html?/;

	# I'm also skipping anything with a query string.
	# Your needs may be different...
	next LINE if $refered =~ /\?/;

	# Keep a note of how many hits there are for each page.
	# This is done before looking at the referer, so that visitors who arrived from outside the site
	# (or with no referer at all) are still counted as visitors to the page; otherwise entry pages
	# would get an undercounted (or zero) denominator.
	$pageHitCounts{$refered}++;

	# Split the referer URL as in the URI.pm docs: scheme, authority, path, query, fragment.
	# Only the authority, path and query are needed here.
	my (undef, $authority, $path, $query) =
		$refererWithDomain =~ m|(?:([^:/?#]+):)?(?://([^/?#]*))?([^?#]*)(?:\?([^#]*))?(?:#(.*))?|;

	# Only referers on our own domains are of interest.
	# A missing referer is logged as "-", which has no authority part and is therefore rejected here too.
	next LINE unless defined $authority && $myDomains{$authority};

	# Skip referers that carry a query string. The path capture can never contain "?",
	# so the presence of a query has to be detected from the query capture itself.
	next LINE if defined $query;

	my $referer = $path;
	next LINE unless $referer =~ m/\.html?/;

	# A page should never need to prefetch itself.
	next LINE if $refered eq $referer;

	# We want to exclude hits that are really prefetches.
	# This will only work if, like me, you've squidged the X-Moz header into your user agent field, to make the format I call "combined_with_prefetching_hack".
	next LINE if $userAgent =~ m/prefetch$/i;

	$referedRefererCounts{$referer}->{$refered}++;
}

close LOG;

print "\n";

# Emit one Location section per referring page, listing the pages most often visited next.
foreach my $page (sort keys %referedRefererCounts) {
	my $nextPages = $referedRefererCounts{$page};
	my $numHitsToPage = $pageHitCounts{$page};

	# Without a hit count for the page no proportion can be computed, so no hint can be justified.
	next unless $numHitsToPage;
	next unless %$nextPages;

	# Most-clicked targets first; ties are broken by URL so the generated file is stable between runs.
	my @rankedTargets = sort {
		$nextPages->{$b} <=> $nextPages->{$a} or $a cmp $b
	} keys %$nextPages;

	my $location = sanitizeForApacheConfig($page);
	my $linkCount = 0;

	foreach my $target (@rankedTargets) {
		last if $linkCount >= $prefetchesPerPage;

		# Targets are in descending order, so once one falls below the threshold all the rest do too.
		my $proportionFromHereToThere = $nextPages->{$target} / $numHitsToPage;
		last if $proportionFromHereToThere < ($minPercentClicksToJustifyPrefetch / 100);

		# Open the Location section lazily, so pages with no qualifying target produce no output.
		# The path is quoted so that a path containing spaces stays a single argument;
		# sanitizeForApacheConfig removes any double quotes from it.
		if ($linkCount == 0) {
			print "  <Location \"$location\">\n";
		}

		my $link = sanitizeForApacheConfig($target);
		print "    Header append Link \"<$link>; rel=prefetch\"\n";
		$linkCount++;
	}

	if ($linkCount > 0) {
		print "  </Location>\n";
	}
}

print "\n";

sub sanitizeForApacheConfig {
	# Strip characters from a URL that could change the structure of the generated Apache configuration.
	#
	# Argument: $url - a URL path taken from the access log.
	# Returns:  the URL with double quotes, backslashes, angle brackets and control characters removed
	#           (undef is returned unchanged).
	#
	# Both the referer and the refered URL in our logs can be entered freely by anyone with access to our website.
	# We don't want people to be able to stick just anything in our prefetch.conf file...
	# That means we need to either
	#   a) Sanitize the data we find here to make sure it can't be used to do anything beyond affecting the prefetch header.
	#   b) Check the generated configuration file manually before using it.
	#   c) Check the data some other way.
	# This function is a fairly limited attempt at doing (a).
	# It is almost certainly insufficient, so for now I recommend (b) - read the file before you give it to apache.
	my ($url) = @_;
	return $url unless defined $url;

	# One explicit character class instead of several substitutions. In particular this avoids an
	# empty pattern (s///g), which Perl interprets as "reuse the last successful regex" and which
	# therefore never removed "<" or ">".
	#   "  and \  : could end or escape the quoted Header value
	#   <  and >  : could end the <url> part of the Link header or close a config section tag
	#   controls  : could start a new line, and so a new directive
	$url =~ s/["\\<>\x00-\x1F\x7F]//g;

	return $url;
}