A quick log analyzer

Here comes a quick log analyzer for web logs. It is an AWK script that parses a log file and generates insert commands. You can pipe this insert commands into a mysql database (the table is provided in the comments) and, using the power of SQL, generate queries about queries per day, queries per month, single visits, etc.

A little of AWK is required, and knowledge of regular expressions.

10.255.255.255 - - [17/Apr/2012:18:49:44 +0200] GET /this/is/the/web/page.html HTTP/1.1 200 1767 Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.162 Safari/535.19 -
10.256.256.256 - - [17/Apr/2012:18:49:44 +0200] GET /this/is/another/web/page/default.css HTTP/1.1 304 - Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.162 Safari/535.19 -
#
# logs2mysql.awk - inserta los logs en una base de datos para hacer anĂ¡lisis
#
# To execute: 
# awk -f logs2mysql.awk logfile.log| mysql -h hostname -u username -p database 
#
# 
# create table analisis 
# ( ip  varchar( 1000 ),
#   value1 varchar( 1000 ), 
#   value2 varchar( 1000 ), 
#   time_stamp date,
#   command varchar( 1000 ), 
#   url varchar( 1000 ), 
#   protocol varchar( 1000 ), 
#   response varchar( 1000 ), 
#   bytes varchar( 1000 ), 
#   user_agent varchar( 2000) 
# );
# 
 
BEGIN	{
		print "-- awk -f logs2mysql.sql logfile.log| mysql -h hostname -u username -p database ";
		print "-- awk -f logs2mysql.sql logfile.log| mysql -h hostname -u username -p database ";
		print "-- awk -f logs2mysql.sql logfile.log| mysql -h hostname -u username -p database "; 
 
 
 
		command = "insert into analisis " \
					"(ip, " \
					"value1, " \
					"value2, " \
					"time_stamp, " \
					"command, " \
					"url, " \
					"protocol, " \
					"response, " \
					"bytes, " \
					"user_agent)" \
					"values " \
					"('##ip', " \
					"'##value1', " \
					"'##value2', " \
					"str_to_date( '##time_stamp', '%d/%b/%Y:%H:%i:%s' ), " \
					"'##command', " \
					"'##url', " \
					"'##protocol', " \
					"'##response', " \
					"'##value3', " \
					"'##user_agent'); " 
		}
 
/.*/	{
		#10.256.256.256 - - [26/Apr/2012:08:16:48 +0200] GET /example/web/page.html HTTP/1.1 200 4149 Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; PRM_IE; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET CLR 3.0.04506.648; .NET CLR 3.5.21022) -
		match( $0, "^([^ ]*) ([^ ]*) ([^ ]*) \\[(.*)\\] ([[:alpha:]]*) ([^ ]*) ([^ ]*) ([[:digit:]]*) ([[:digit:]-]*) (.*)", values )
# 		print "ip: " values[1];
# 		print "value1: " values[2];
# 		print "value2: " values[3];
# 		print "time_stamp: " values[4];
# 		print "command: " values[5];
# 		print "url: " values[6];
# 		print "protocol: " values[7];
# 		print "response: " values[8];
# 		print "bytes: " values[9];
# 		print "user agent: " values[10];
# 		print "-----------------------------"; 
 
		url = values[6];
		ignore = 0;
		ignore = ignore || match( url, "\.png$" );
		ignore = ignore || match( url, "\.css$" );
		ignore = ignore || match( url, "\.js$" );
 
		if( !ignore ) 
		{
			out = command;
			out = gensub( "##ip", escape( values[1] ), "g", out ); 
			out = gensub( "##value1", escape( values[2] ), "g", out ); 
			out = gensub( "##value2", escape( values[3] ), "g", out ); 
			the_date = gensub( "\+0200", "", "g", values[4] );
			out = gensub( "##time_stamp", escape( the_date ), "g", out ); 
			out = gensub( "##command", escape( values[5] ), "g", out ); 
			out = gensub( "##url", escape( values[6] ), "g", out ); 
			out = gensub( "##protocol", escape( values[7] ), "g", out ); 
			out = gensub( "##response", escape( values[8] ), "g", out ); 
			out = gensub( "##value3", escape( values[9] ), "g", out ); 
			out = gensub( "##user_agent", escape( values[10] ), "g", out ); 
			print out;
		} # if ignore
 
		}
 
END		{
		}
 
function escape( input ) 
{
	out = input;
	out = gensub( ";", "\;", "g", out );
	out = gensub( "\\*", "\\\*", "g", out );
	out = gensub( "%", "\%", "g", out );
	out = gensub( "'", "\'", "g", out );
	out = gensub( "\"", "\\\"", "g", out );
 
	return out; 
 
} # escape