#!/usr/bin/env python
# Copyright (c) 2007, Corey Goldberg (corey@goldb.org)
#
# This file is part of PerfLog.
#
# PerfLog is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.

# expects logs in W3C Extended Log File Format:
# http://www.w3.org/TR/WD-logfile.html

import re
import time


class WebLog(object):
    """Parsed view of a web server log in W3C Extended Log File Format.

    Constructing an instance reads the whole file, locates the
    ``#Fields:`` header to learn the column layout, splits every
    non-comment line into fields and pre-computes summary counts.

    Attributes set by ``__init__``:
        file_name: path that was passed in.
        log_lines: raw lines of the file, newlines included.
        field_map: dict mapping field name -> column index.
        line_parts: one list of string fields per request line, with the
            request's epoch time (int) appended as the last element.
        methods: dict of ``cs-method`` value -> number of requests.
        content_types: dict of content type -> number of requests.
        status_codes: dict of ``sc-status`` value -> number of requests.
        num_requests: number of parsed request lines.
    """

    # Compiled once; raw strings so the backslash escapes reach the regex
    # engine untouched.
    _FIELDS_HEADER = re.compile(r'^#Fields: (.*)')
    _DOC_RE = re.compile(r'.*/(.*)$')
    _EXT_RE = re.compile(r'.*\.(.*)$')

    def __init__(self, file_name):
        """Read and parse ``file_name``.

        Raises:
            IOError/OSError: if the file cannot be opened.
            ValueError: if the log has no ``#Fields:`` header line.
            KeyError: if the header lacks ``cs-method``, ``cs-uri-stem``
                or ``sc-status``.
        """
        self.file_name = file_name
        self.log_lines = self.__open()
        self.field_map = self.__map_fields()
        self.line_parts = self.__parse()
        self.methods = self.count_methods()
        self.content_types = self.count_content_types()
        self.status_codes = self.count_status_codes()
        self.num_requests = len(self.line_parts)

    def count_methods(self):
        """Return a dict of HTTP methods (GET, POST, etc) and their counts."""
        position = self.field_map['cs-method']
        return self.__count_fields(position)

    def count_status_codes(self):
        """Return a dict of HTTP status codes (200, 304, etc) and their counts."""
        position = self.field_map['sc-status']
        return self.__count_fields(position)

    def count_content_types(self):
        """Return a dict of content types and their counts.

        The content type of a request is derived from the last path
        segment of its ``cs-uri-stem``: if that segment contains a dot,
        the text after the last dot (the file extension) is used,
        otherwise the segment itself. Requests whose URI stem contains
        no ``/`` are reported with a warning on stdout and not counted.
        """
        position = self.field_map['cs-uri-stem']
        content_types = {}
        for line in self.line_parts:
            doc_match = self._DOC_RE.search(line[position])
            if not doc_match:
                print('WARNING: No Doc Match\n')
                continue
            doc = doc_match.group(1)
            # if doc contains a dot, we assume a file extension and group by that
            ext_match = self._EXT_RE.search(doc)
            content_type = ext_match.group(1) if ext_match else doc
            # Tally the extracted type itself. Matching the type name as a
            # substring of the whole URI would over-count (e.g. "js" is
            # contained in "/json/page.html").
            content_types[content_type] = content_types.get(content_type, 0) + 1
        return content_types

    def __count_fields(self, position):
        """Return a dict of each distinct value in column ``position`` -> count."""
        fields = {}
        for parts in self.line_parts:
            key = parts[position]
            fields[key] = fields.get(key, 0) + 1
        return fields

    def __map_fields(self):
        """Return a dict of field name -> column index from the Fields header.

        Only the first ``#Fields:`` line in the log is used.

        Raises:
            ValueError: if no ``#Fields:`` line is present.
        """
        for line in self.log_lines:
            m = self._FIELDS_HEADER.match(line)
            if m:
                field_map = {}
                for index, name in enumerate(m.group(1).strip().split(' ')):
                    field_map[name] = index
                return field_map
        # Fail here with a clear message instead of returning None and
        # letting a later subscript raise an opaque TypeError.
        raise ValueError(
            "no '#Fields:' header line found in %r" % (self.file_name,))

    def __open(self):
        """Return all lines of the log file as a list of strings."""
        # Text mode so lines are str (not bytes) on Python 3 as well; the
        # context manager closes the handle even if reading fails.
        with open(self.file_name, 'r') as fh:
            return fh.readlines()

    def __parse(self):
        """Split each request line into fields and append its epoch time.

        Comment lines (starting with ``#``) and lines too short to hold
        both the date and time columns are skipped.
        """
        # Honour the column layout from the header; fall back to the
        # conventional "date time ..." layout when a name is absent.
        date_pos = self.field_map.get('date', 0)
        time_pos = self.field_map.get('time', 1)
        min_fields = max(date_pos, time_pos) + 1

        line_parts = []
        for line in self.log_lines:
            if line.startswith('#'):
                continue
            splat = line.strip().split(' ')
            if len(splat) < min_fields:
                continue
            # add the epoch time for each line as the last element in the list
            splat.append(
                self.__convert_to_epoch(splat[date_pos], splat[time_pos]))
            line_parts.append(splat)
        return line_parts

    def __remove_dups(self, seq):
        """Return the distinct items of ``seq`` as a sorted list."""
        return sorted(set(seq))

    def __convert_to_epoch(self, ymd_date, hms_time):
        """Convert ``YYYY-MM-DD`` and ``HH:MM:SS`` strings to epoch seconds.

        Raises:
            ValueError: if the strings do not match the expected format.
        """
        date_time = '%s %s' % (ymd_date, hms_time)
        pattern = '%Y-%m-%d %H:%M:%S'
        # NOTE(review): time.mktime interprets the timestamp as local time,
        # while W3C extended logs are normally written in GMT -- confirm
        # whether calendar.timegm is what callers actually want.
        return int(time.mktime(time.strptime(date_time, pattern)))