#!/usr/bin/perl
#
# parsehints.pl
# Edmund Edgar, 2006-01-02
#
# This script parses log files in combined format to make a list of the pages users are most likely to click after each page in your access logs, based on the referer data.
# It then uses that to generate a <Location> directive for each page, containing a set of mod_headers directives to provide prefetching hints.
# By default it will issue a prefetching hint whenever more than 2% of users go from the page in question to a subsequent one, up to a maximum of 5 prefetching hints per page. You can control these settings with optional parameters.
#
# Usage: ./parsehints.pl <logfile> <domain>[,<domain>] [<min percent clicks>] [<max prefetches per page>]
# Example: ./parsehints.pl /var/log/apache2/access.log edochan.com,www.edochan.com 20 10 > /etc/apache2/prefetch.conf
#
# a "combined" format log file looks like this:
# 220.217.50.156 - - [07/May/2005:15:31:02 +0100] "GET /learn/lessons.cgi HTTP/1.1" 403 299 "-" "Mozilla/4.0 (compatible; MSIE 6.0; X11; Linux i686; en) Opera 8.0"
#
# Copyright (C) 2006 Edmund Edgar
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
#
# Edmund Edgar, webmaster@edochan.com
#

use strict;
use warnings;

use Scalar::Util qw(looks_like_number);

my $defaultPrefetchesPerPage = 5;
my $defaultMinPercentClicksToJustifyPrefetch = 2;

my $usage = "Usage: ./parsehints.pl <logfile> <domain>[,<domain>...] [<min percent clicks>] [<max prefetches per page>]\n";

my ($logFile, $domainList, $minPercentClicksToJustifyPrefetchArg, $prefetchesPerPageArg) = @ARGV;
unless (defined $logFile && length $logFile && defined $domainList && length $domainList) {
    die $usage;
}

# Set of our own host names. Only referers on these hosts are treated as
# internal page-to-page clicks. Host names are compared case-insensitively,
# and empty list entries (e.g. from "a,,b") are ignored.
my %myDomains;
for my $domain (split /,/, $domainList) {
    $myDomains{ lc $domain } = 1 if length $domain;
}
die $usage unless %myDomains;

# Optional tuning parameters; anything that is not a positive number falls
# back to the default.
my $minPercentClicksToJustifyPrefetch
    = positiveNumberOr($minPercentClicksToJustifyPrefetchArg, $defaultMinPercentClicksToJustifyPrefetch);
my $prefetchesPerPage
    = positiveNumberOr($prefetchesPerPageArg, $defaultPrefetchesPerPage);

my $unreadableLines = 0;

open my $logHandle, '<', $logFile or die "Cannot open log file $logFile: $!";

# $referedRefererCounts{$from}{$to} = number of clicks from page $from to page $to.
my %referedRefererCounts;
# $pageHitCounts{$page} = number of (non-prefetch) hits on $page.
my %pageHitCounts;

LINE: while (my $line = <$logHandle>) {

    # Hairy regex to pull a URL, its referer and the user agent string out of log files
    # in "combined" or "combined_with_prefetching_hack" format.
    # I'm ignoring POST requests...
    unless ($line =~ /.*?\s.*?\s.*?\s\[.*?\]\s\"GET\s(.*?)\s.*?\s\"(.*?)\"\s\"(.*?)\"/) {
        $unreadableLines++;
        next LINE;
    }
    my ($refered, $refererWithDomain, $userAgent) = ($1, $2, $3);

    # We want to exclude hits that are really prefetches, both as clicks and as page hits.
    # This will only work if, like me, you've squidged the X-Moz header into your user agent
    # field, to make the format I call "combined_with_prefetching_hack".
    next LINE if $userAgent =~ m/prefetch$/i;

    # I'm skipping anything with a query string.
    # Your needs may be different...
    next LINE if $refered =~ /\?/;

    # Don't prefetch with anything except html pages linked to by other html pages.
    # This is very restrictive - you may well want to change it...
    next LINE unless $refered =~ m/\.html?\z/;

    # Keep a note of how many hits there are for each page, wherever the visitor came from.
    # This is the denominator for the proportion of visitors to this page who go on to
    # hit each subsequent page, so it has to be counted before the referer is filtered.
    $pageHitCounts{$refered}++;

    # Split the referer URL as in the URI.pm docs (scheme and fragment are not needed).
    my (undef, $authority, $path, $query)
        = $refererWithDomain =~ m|(?:([^:/?#]+):)?(?://([^/?#]*))?([^?#]*)(?:\?([^#]*))?(?:#(.*))?|;

    # Only clicks that came from one of our own pages are interesting. A missing
    # referer is logged as "-", which has no authority part and is rejected here too.
    next LINE unless defined $authority && $myDomains{ lc $authority };

    # Same restrictions for the referring page: no query string, html only.
    next LINE if defined $query;
    next LINE unless $path =~ m/\.html?\z/;

    # A page should never need to prefetch itself.
    next LINE if $refered eq $path;

    $referedRefererCounts{$path}{$refered}++;
}

close $logHandle or warn "Could not close log file $logFile: $!";

if ($unreadableLines) {
    warn "parsehints.pl: skipped $unreadableLines log line(s) that were not GET requests in combined format\n";
}

print "\n";

for my $page (sort keys %referedRefererCounts) {

    my $nextPages = $referedRefererCounts{$page};

    # Without any recorded hits on the page itself there is no proportion to compute.
    my $numHitsToPage = $pageHitCounts{$page} or next;

    # Most-clicked targets first; ties are broken by URL so the output is reproducible.
    my @candidates = sort { $nextPages->{$b} <=> $nextPages->{$a} or $a cmp $b } keys %$nextPages;

    my @hints;
    for my $target (@candidates) {
        last if @hints >= $prefetchesPerPage;
        my $proportionFromHereToThere = $nextPages->{$target} / $numHitsToPage;
        # Candidates are sorted, so once one falls below the threshold all the rest do too.
        last if ($minPercentClicksToJustifyPrefetch / 100) > $proportionFromHereToThere;
        push @hints, $target;
    }
    next unless @hints;

    my $location = sanitizeForApacheConfig($page);
    next unless length $location;

    print "<Location \"$location\">\n";
    for my $target (@hints) {
        my $url = sanitizeForApacheConfig($target);
        # mod_headers treats "%" as the start of a format specifier; "%%" is a literal percent sign.
        $url =~ s/%/%%/g;
        print "    Header append Link \"<$url>; rel=prefetch\"\n";
    }
    print "</Location>\n";
}

print "\n";

# Returns $value if it is a number greater than zero, otherwise $default.
sub positiveNumberOr {
    my ($value, $default) = @_;
    return $default unless defined $value && looks_like_number($value) && $value > 0;
    return $value;
}

# Returns a copy of $url with the characters removed that could break out of the
# quoted strings and <...> sections this script writes: double quotes, backslashes,
# angle brackets and control characters (including newlines).
sub sanitizeForApacheConfig {

    # Both the referer and the refered URL in our logs can be entered freely by anyone with access to our website.
    # We don't want people to be able to stick just anything in our prefetch.conf file...
    # That means we need to either
    # a) Sanitize the data we find here to make sure it can't be used to do anything beyond affecting the prefetch header.
    # b) Check the generated configuration file manually before using it.
    # c) Check the data some other way.
    # This function is a fairly limited attempt at doing (a).
    # It is almost certainly insufficient, so for now I recommend (b) - read the file before you give it to apache.

    my ($url) = @_;
    return '' unless defined $url;

    $url =~ tr/"\\<>\x00-\x1f\x7f//d;

    return $url;
}