from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext

conf = SparkConf().setAppName("Spark-Logs-Handling").setMaster("local[*]")
sc = SparkContext.getOrCreate(conf)
sqlcontext = SQLContext(sc)

rdd = sc.textFile("/FileStore/tables/NASA_access_log_*.gz")
rdd.count()
Out[1]: 3461613
i = 0
for line in rdd.sample(withReplacement=False, fraction=0.00001, seed=100).collect():
    i = i + 1
    print(line)
    if i > 5:
        break
ix-stp-fl2-19.ix.netcom.com - - [03/Aug/1995:23:03:09 -0400] "GET /images/faq.gif HTTP/1.0" 200 263
slip183-1.kw.jp.ibm.net - - [04/Aug/1995:18:42:17 -0400] "GET /shuttle/missions/sts-70/images/DSC-95EC-0001.gif HTTP/1.0" 200 107133
piweba4y.prodigy.com - - [05/Aug/1995:19:17:41 -0400] "GET /icons/menu.xbm HTTP/1.0" 200 527
ruperts.bt-sys.bt.co.uk - - [07/Aug/1995:04:44:10 -0400] "GET /shuttle/countdown/video/livevideo2.gif HTTP/1.0" 200 69067
dal06-04.ppp.iadfw.net - - [07/Aug/1995:21:10:19 -0400] "GET /images/NASA-logosmall.gif HTTP/1.0" 200 786
p15.ppp-1.directnet.com - - [10/Aug/1995:01:22:54 -0400] "GET /images/KSC-logosmall.gif HTTP/1.0" 200 1204
import re

rslt = (rdd.map(lambda line: re.search(r'^(\S+)((\s)(-))+\s(\[\S+ -\d{4}\])\s("\w+\s+([^\s]+)\s+HTTP.*")\s(\d{3}\s(\d*)$)', line)
                   .groups())
           .take(3))
rslt
Out[8]:
[('in24.inetnebr.com',
' -',
' ',
'-',
'[01/Aug/1995:00:00:01 -0400]',
'"GET /shuttle/missions/sts-68/news/sts-68-mcc-05.txt HTTP/1.0"',
'/shuttle/missions/sts-68/news/sts-68-mcc-05.txt',
'200 1839',
'1839'),
('uplherc.upl.com',
' -',
' ',
'-',
'[01/Aug/1995:00:00:07 -0400]',
'"GET / HTTP/1.0"',
'/',
'304 0',
'0'),
('uplherc.upl.com',
' -',
' ',
'-',
'[01/Aug/1995:00:00:08 -0400]',
'"GET /images/ksclogo-medium.gif HTTP/1.0"',
'/images/ksclogo-medium.gif',
'304 0',
'0')]
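The cell below calls parse_log1(), which is not shown in this excerpt. A minimal sketch, assuming it applies only the strict rule (the same first regex as parse_log2 further down), returning 0 for a parse failure and 1 for success:

import re

def parse_log1(line):
    # Hypothetical sketch: apply only the strict rule that expects a numeric byte count
    match = re.search(r'^(\S+)((\s)(-))+\s(\[\S+ -\d{4}\])\s("\w+\s+([^\s]+)\s+HTTP.*")\s(\d{3}\s(\d*)$)', line)
    if match is None:
        return (line, 0)   # 0 = failed to parse
    else:
        return (line, 1)   # 1 = parsed successfully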
n_logs = rdd.count()
failed = rdd.map(lambda line: parse_log1(line)).filter(lambda line: line[1] == 0).count()
print('Out of a total of {} logs, {} failed to parse'.format(n_logs, failed))

# Get the failed records (line[1] == 0)
failed1 = rdd.map(lambda line: parse_log1(line)).filter(lambda line: line[1] == 0)
failed1.take(3)
Out of a total of 3461613 logs, 38768 failed to parse
Out[10]:
[('gw1.att.com - - [01/Aug/1995:00:03:53 -0400] "GET /shuttle/missions/sts-73/news HTTP/1.0" 302 -',
0),
('js002.cc.utsunomiya-u.ac.jp - - [01/Aug/1995:00:07:33 -0400] "GET /shuttle/resources/orbiters/discovery.gif HTTP/1.0" 404 -',
0),
('pipe1.nyc.pipeline.com - - [01/Aug/1995:00:12:37 -0400] "GET /history/apollo/apollo-13/apollo-13-patch-small.gif" 200 12859',
0)]
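parse_failed() is also not defined in this excerpt. Judging from the output below, it flags log lines whose reply has no byte count (a trailing '-'); a minimal sketch under that assumption:

import re

def parse_failed(line):
    # Hypothetical sketch: match lines whose byte count is '-' instead of a number
    match = re.search(r'^(\S+)((\s)(-))+\s(\[\S+ -\d{4}\])\s("\w+\s+([^\s]+)\s+HTTP.*")\s(\d{3}\s-$)', line)
    if match is None:
        return (line, 0)   # byte count is numeric (or the line is malformed in some other way)
    else:
        return (line, 1)   # 1 = a log line with '-' as the byte count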
failed2 = rdd.map(lambda line: parse_failed(line)).filter(lambda line: line[1] == 1)
failed2.take(5)
Out[13]:
[('gw1.att.com - - [01/Aug/1995:00:03:53 -0400] "GET /shuttle/missions/sts-73/news HTTP/1.0" 302 -',
1),
('js002.cc.utsunomiya-u.ac.jp - - [01/Aug/1995:00:07:33 -0400] "GET /shuttle/resources/orbiters/discovery.gif HTTP/1.0" 404 -',
1),
('tia1.eskimo.com - - [01/Aug/1995:00:28:41 -0400] "GET /pub/winvn/release.txt HTTP/1.0" 404 -',
1),
('itws.info.eng.niigata-u.ac.jp - - [01/Aug/1995:00:38:01 -0400] "GET /ksc.html/facts/about_ksc.html HTTP/1.0" 403 -',
1),
('grimnet23.idirect.com - - [01/Aug/1995:00:50:12 -0400] "GET /www/software/winvn/winvn.html HTTP/1.0" 404 -',
1)]
import re

def parse_log2(line):
    # Parse logs with the rule below (numeric byte count)
    match = re.search(r'^(\S+)((\s)(-))+\s(\[\S+ -\d{4}\])\s("\w+\s+([^\s]+)\s+HTTP.*")\s(\d{3})\s(\d*)$', line)
    # If the match failed, use the rule below (byte count given as '-')
    if match is None:
        match = re.search(r'^(\S+)((\s)(-))+\s(\[\S+ -\d{4}\])\s("\w+\s+([^\s]+)\s+HTTP.*")\s(\d{3}\s-$)', line)
    if match is None:
        return (line, 0)  # Return 0 for failure
    else:
        return (line, 1)  # Return 1 for success
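With both rules combined in parse_log2(), the successfully parsed lines can be kept for further analysis; a short usage sketch (not from the original notebook):

# Keep only the lines that parse_log2 marks as successfully parsed (flag == 1)
parsed = rdd.map(lambda line: parse_log2(line)).filter(lambda x: x[1] == 1)
print(parsed.count())   # the count of failures should now be much smaller than before
parsed.take(2)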
An essential and unavoidable aspect of Big Data processing is the need to process unstructured text. Web server logs are one such area that requires Big Data techniques to process massive volumes of logs. The Common Log Format, also known as the NCSA Common log format, is a standardized text file format used by web servers when generating server log files. Because the format is standardized, the files can be readily analyzed.
A publicly available set of web server logs is the NASA-HTTP Web server logs. This is a good dataset to play around with in order to get familiar with handling web server logs. The logs can be accessed at NASA-HTTP.
Description: These two traces contain two months' worth of all HTTP requests to the NASA Kennedy Space Center WWW server in Florida.
Format: The logs are an ASCII file with one line per request, with the following columns (a small parsing sketch follows the list):
- host making the request. A hostname when possible, otherwise the Internet address if the name could not be looked up.
- timestamp in the format "DAY MON DD HH:MM:SS YYYY", where DAY is the day of the week, MON is the name of the month, DD is the day of the month, HH:MM:SS is the time of day using a 24-hour clock, and YYYY is the year. The timezone is -0400.
- request given in quotes.
- HTTP reply code.
- bytes in the reply.
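As a small illustration of these columns, the sketch below (using a sample line taken from the output earlier in this section, and a simplified regular expression rather than the exact one used above) pulls out each field:

import re

# Sample line in the NASA/Common Log Format, taken from the sampled output above
sample = 'uplherc.upl.com - - [01/Aug/1995:00:00:07 -0400] "GET / HTTP/1.0" 304 0'

m = re.search(r'^(\S+) - - \[(.+?)\] "(.+?)" (\d{3}) (\S+)$', sample)
host, timestamp, request, status, nbytes = m.groups()
print(host, timestamp, request, status, nbytes)
# uplherc.upl.com 01/Aug/1995:00:00:07 -0400 GET / HTTP/1.0 304 0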