#!/usr/bin/env python	

"""filestats.py

Copyright (c) 2003 by Dave Seidel.

This work is licensed under the Creative Commons Attribution License.
To view a copy of this license, visit http://creativecommons.org/licenses/by/1.0
or send a letter to Creative Commons, 559 Nathan Abbott Way,
Stanford, California 94305, USA.

CONTACT
Email: dave at superluminal dot com
Weblog: http://radio.weblogs.com/0100130
Home: http://www.superluminal.com/dave

SUMMARY
This program reads in a web server log file and analyzes it to show who
(i.e., what client machines) downloaded one or more specific files from the
server.  The clients are grouped either by referer (the default) or by UserAgent.
I wrote it because I wanted to be able to see how certain files, e.g., RSS files,
were being accessed, and the usual log analyzers (Analog and Webalizer) don't
show the data the way I want to see it.  Because I think of the result of the
analysis as hierarchical data, and because I prefer to use XML, it made sense to
me to use Dave Winer's OPML format (see http://www.opml.org/spec), which is
an XML dialect for outlines.  I use Dave's Radio Userland program, which among
other things is an OPML editor, and I also use a Radio add-in called ActiveRenderer,
which does a beautiful job of rendering OPML in DHTML, so these were additional
motivations.

URL
http://www.superluminal.com/dave/code/filestats.py

CAVEATS
I have only tested this so far on Windows 2000, but it should be
portable.

REQUIREMENTS
- Python 2.2 (an earlier version may work, but I haven't tried it)
- Mark Nottingham's Weblog package for Python (http://www.mnot.net/python/WebLog/)
- To view/render the output, an OPML-aware tool such as Radio Userland
  (http://radio.userland.com).  If you use Radio, you should also be using
  ActiveRenderer (http://www.activerenderer.com).
- Input files must be in Apache's "combined" format, which includes both the Referer
  and UserAgent fields.  The Apache format specification is
  "%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\""

USAGE
- Run "python filestats.py -h" for the help message.
- Example: python filestats.py -a -v -s http://www.example.com -i access_log -o -f /weblog/rss.xml -f /weblog/categories/spam/rss.xml

NOTES
- I used the referers.py sample program from the Weblog package as a starting point.
  The resemblance is probably pretty obscure at this point, but there are still
  traces of the original code remaining.  Thanks, Mark!
- The program creates a file called dns.cache in the current directory, which is
  used to save the results of gethostbyaddr() lookups.  You can delete the file,
  but the program will run a lot faster if you don't.

HISTORY
- 1.0, Feb 20, 2003	Initial release
- 1.1, Feb 21, 2003	Bug fixes (error handling)
- 1.2, Feb 24, 2003 Quit with an error message if log file is in the wrong format (Simon Fell)

TODO
- collapse similar client hostnames (e.g., dialups)
- sort referers/UAs by number of clients (?)
- for each client, show latest access date/time
- handle variant log file (e.g., IIS) (?)

"""

import sys, getopt, cPickle, re
from socket import gethostbyaddr
from weblog import combined, url, clean, referer

#
# constants
#

# Program version, shown in the startup banner.
VERSION = "1.2"

# Help text printed by usage().
USAGE_MSG = """
Usage: filestats [options] -i logfile -s siteurl [-o outfile] -f file [-f file ...]

Where:
    logfile         is a weblog file in standard "combined" format
    siteurl         is the URL of site being analyzed
    outfile         is the name of the output OPML file (defaults to "report.opml")
    file     	    is the pathname of a file for usage analysis (repeat for each distinct file)

Options:
	-q				quiet (no console output; takes precedence over -v)
	-v				verbose
	-a				analyze by UserAgent (defaults to analysis by referer)
"""

# Opening part of the OPML document.  Takes two %s substitutions:
# the document title and the creation date.
OPML_START = """<?xml version=\"1.0\" encoding=\"UTF-8\"?>
<opml version=\"1.0\">
<head>
	<title>%s</title>
	<dateCreated>%s</dateCreated>
	<expansionState>1</expansionState>
	<vertScrollState>1</vertScrollState>
</head>
<body>
"""

# Closing part of the OPML document (matches the <body>/<opml> tags
# opened by OPML_START).
OPML_END = """	</body>
</opml>
"""

#
# backend analysis and rendering code
#

class Analyzer:
	"""Gather download statistics for selected files from a web server log.

	Call analyze() to scan the log file, then renderOPML() to write the
	report.  Both methods return None on success or an error message
	string on failure.
	"""

	# Class-level defaults.  __init__ rebinds all per-instance state, so the
	# mutable containers below are never shared between instances.
	logname = ""		# path of the log file to read
	outname = ""		# path of the OPML file to write
	siteurl = ""		# URL of the site being analyzed
	targets = []		# URLs (paths) of the files whose usage is reported
	useAgent = 0		# true: group clients by UserAgent; false: by referer

	log = None			# weblog parser chain, built by analyze()
	hosts = {}			# client address -> host name (the DNS cache)
	resources = {}		# url -> {referer or agent -> {client -> hit count}}
	cacheName = "dns.cache"	# file used to persist `hosts` between runs

	def __init__(self, siteurl, input, output, files, useAgent=0):
		"""Remember the run parameters and start with empty result tables.

		siteurl  -- URL of the site being analyzed
		input    -- path of the log file to read
		output   -- path of the OPML file to write
		files    -- list of URLs to report on
		useAgent -- true to group by UserAgent instead of referer
		"""
		self.siteurl = siteurl
		self.logname = input
		self.outname = output
		self.targets = files
		self.useAgent = useAgent
		# Fresh containers per instance; relying on the class-level dicts
		# would make every Analyzer accumulate into the same tables.
		self.hosts = {}
		self.resources = {}

	def _loadDNSCache(self):
		"""Load previously cached host lookups; silently keep the current table on any failure."""
		# NOTE: unpickling can execute arbitrary code -- only use a cache
		# file that this program wrote itself.
		# Text mode is kept so cache files written by earlier runs stay readable.
		try:
			fd = open(self.cacheName, "r")
			try:
				cached = cPickle.load(fd)
			finally:
				fd.close()
		except Exception:
			# best effort: a missing or unreadable cache only costs speed
			return
		if isinstance(cached, dict):
			self.hosts = cached

	def _storeDNSCache(self):
		"""Save the host lookup table; failures are ignored (the cache is optional)."""
		try:
			fd = open(self.cacheName, "w")
			try:
				cPickle.dump(self.hosts, fd)
			finally:
				fd.close()
		except Exception:
			return

	def _lookupClient(self, address):
		"""Return the host name for a client address, consulting the cache first.

		When the reverse lookup fails the address itself is used as the name;
		either way the answer is cached so each address is resolved only once.
		"""
		if address in self.hosts:
			return self.hosts[address]
		try:
			name = gethostbyaddr(address)[0]
		except Exception:
			name = address
		self.hosts[address] = name
		return name

	def _countAccess(self, resource, group, client):
		"""Add one hit for `client` under resources[resource][group]."""
		clients = self.resources.setdefault(resource, {}).setdefault(group, {})
		clients[client] = clients.get(client, 0) + 1

	def _xmlAttr(self, text):
		"""Return `text` escaped for use inside a double-quoted XML attribute."""
		from xml.sax.saxutils import escape
		return escape(text, {'"': "&quot;"})

	def analyze(self):
		"""Scan the log file and tally accesses to the target files.

		Only entries whose referer_type equals 'OFFSITE' and whose url is
		listed in self.targets are counted.  Results accumulate in
		self.resources.

		Returns None on success, or an error message string if the log
		file cannot be opened or not a single entry could be read from it.
		"""
		try:
			log_fd = open(self.logname, 'r')
		except IOError:
			# sys.exc_info() is used instead of version-specific
			# "except ..., e" / "except ... as e" syntax
			return str(sys.exc_info()[1])

		counter = 0
		try:
			# Parser chain from the weblog package: parse "combined" lines,
			# split URLs, clean them, then classify referers.
			o_log = combined.Parser(log_fd)
			p_log = url.Parser(o_log)
			c_log = clean.Cleaner(p_log)
			# presumably the file names the cleaner treats as directory
			# indexes -- confirm against the weblog documentation
			c_log.directory_index = ['index.html', 'index.htm', 'index.php', 'index.asp', 'index.jsp']

			self.log = referer.Typer(c_log)
			self.log.siteurl.append(self.siteurl)

			self._loadDNSCache()
			try:
				while self.log.getlogent():
					counter += 1
					# compare by value; identity ("is") only works by accident
					if self.log.referer_type != 'OFFSITE':
						continue
					if self.log.url not in self.targets:
						continue
					client = self._lookupClient(self.log.client)
					if self.useAgent:
						group = self.log.agent
					else:
						group = self.log.referer
					self._countAccess(self.log.url, group, client)
			finally:
				# keep the lookups done so far even if the scan is interrupted
				self._storeDNSCache()
		finally:
			log_fd.close()

		if counter < 1:
			return "Invalid logfile format"
		return None

	def renderOPML(self):
		"""Write the collected statistics to self.outname as an OPML outline.

		Resources and their referers/UserAgents are written in sorted order;
		the clients below each are ordered by descending hit count, then by
		descending host name.  All text taken from the log or the command
		line is XML-escaped.

		Returns None on success, or an error message string if the output
		file cannot be opened or written.
		"""
		import time		# local import: only needed for the creation timestamp
		try:
			out_fd = open(self.outname, 'w')
		except IOError:
			return str(sys.exc_info()[1])
		if self.useAgent:
			aType = "UserAgents"
		else:
			aType = "referers"
		# OPML date-times follow RFC 822
		created = time.strftime("%a, %d %b %Y %H:%M:%S GMT", time.gmtime())
		esc = self._xmlAttr
		try:
			try:
				out_fd.write(OPML_START % (esc(self.outname), created))
				out_fd.write("\t\t<outline text=\"File usage by %s for site %s\">\n" % (aType, esc(self.siteurl)))
				resources = list(self.resources.keys())
				resources.sort()
				for res in resources:
					refs = self.resources[res]
					groups = list(refs.keys())
					groups.sort()
					out_fd.write("\t\t\t<outline text=\"File: %s (%s: %d)\">\n" % (esc(res), aType, len(groups)))
					for group in groups:
						# sort on count, then host name, both descending
						ranked = [(count, name) for name, count in refs[group].items()]
						ranked.sort()
						ranked.reverse()
						out_fd.write("\t\t\t\t<outline text=\"%s (clients: %d)\">\n" % (esc(group), len(ranked)))
						for count, name in ranked:
							out_fd.write("\t\t\t\t\t<outline text=\"[%d] %s\"/>\n" % (count, esc(name)))
						out_fd.write("\t\t\t\t</outline>\n")
					out_fd.write("\t\t\t</outline>\n")
				out_fd.write("\t\t</outline>\n")
				out_fd.write(OPML_END)
			finally:
				out_fd.close()
		except IOError:
			return str(sys.exc_info()[1])
		return None

#
# frontend driver code
#

def usage():
	"""Write the command-line help text (USAGE_MSG) to standard output."""
	sys.stdout.write(USAGE_MSG + "\n")

def fatal(msg, rc=1):
	"""Report an error on standard output and terminate the program.

	msg -- text (or any object) describing the failure
	rc  -- exit status handed to sys.exit(); defaults to 1

	Never returns: sys.exit() raises SystemExit.
	"""
	sys.stdout.write("\nError: %s\n" % (msg,))
	# honor the caller's exit status instead of always exiting with 1
	sys.exit(rc)

def main():
	if len(sys.argv) < 2:
		usage()
		sys.exit()

	# default values
	quiet = 0
	verbose = 0
	useAgent = 0
	siteurl = ""
	input = ""
	output = "report.opml"
	files = []

	# get command line args	
	try:
		opts, args = getopt.getopt(sys.argv[1:],
								   "hqvas:i:o:f:",
								   ["help", "quiet", "verbose", "agent", "siteurl=", "input=", "output=", "file="])
	except getopt.GetoptError:
		usage()
		sys.exit(2)
	for o, a in opts:
		if o in ("-h", "--help"):
			usage()
			sys.exit()
		elif o in ("-q", "--quiet"):
			quiet = 1
		elif o in ("-v", "--verbose"):
			verbose = 1
		elif o in ("-a", "--agent"):
			useAgent = 1
		elif o in ("-s", "--siteurl"):
			siteurl = a
		elif o in ("-i", "--input"):
			input = a
		elif o in ("-o", "--output"):
			output = a
		elif o in ("-f", "--file"):
			files.append(a)

	# validate args
	if (quiet):
		verbose = 0
	if input == "":
		fatal("no input file specified")
	if len(files) < 1:
		fatal("no file(s) specified for analysis")

	# do the work
	if 0 == quiet:
		print "\nfilestats ", VERSION, "\nCopyright (c) 2003 Dave Seidel\n"
	analyzer = Analyzer(siteurl, input, output, files, useAgent);
	if 0 == quiet:
		if useAgent:
			aType = "UserAgent"
		else:
			aType = "referer"
		print "Site: %s\nLog file: %s\nOutput file: %s\nAnalysis by %s\nFiles:" % (siteurl, input, output, aType)
		for f in files:
			print "    %s" % (f)
	if verbose:
		print "\nAnalyzing...",
	err = analyzer.analyze()
	if (err):
		fatal(err, 3)
	if verbose:
		print "\nRendering...",
	err = analyzer.renderOPML()
	if (err):
		fatal(err, 4)
	if verbose:
		print "\nDone"


if __name__ == "__main__":
	main()
