Welcome to OGeek Q&A Community for programmer and developer-Open, Learning and Share
Welcome To Ask or Share your Answers For Others

Categories

0 votes
848 views
in Technique[技术] by (71.8m points)

apache - java parse log file

I am trying to parse an Apache log file that is written in this format:

LogFormat "%t %u [%D %h %{True-Client-IP}i %{UNIQUE_ID}e %r] %{Cookie}i %s "%{User-Agent}i" "%{host}i" %l %b %{Referer}i"

Below is the code I am testing with:

import java.util.regex.*;


/**
 * Constants shared by the log-parsing demo: the expected field count and a sample entry.
 *
 * The sample was written with the Apache LogFormat
 * %t %u [%D %h %{True-Client-IP}i %{UNIQUE_ID}e %r] %{Cookie}i %s "%{User-Agent}i" "%{host}i" %l %b %{Referer}i
 * which has 14 directives, hence {@link #NUM_FIELDS}.
 */
interface LogExample {
    /** The number of fields that must be found (one per directive in the LogFormat). */
    public static final int NUM_FIELDS = 14;

    /** The sample log entry to be parsed (a single physical log line). */
    public static final String logEntryLine =
        "[02/Dec/2013:14:10:30 -0000] - [52075 10.102.4.254 177.43.52.210 UpyU1gpmBAwAACfd5W0AAAAW GET /SS14-VTam-ny_019.jpg.rendition.zoomable.jpg HTTP/1.1] "
        + "hsfirstvisit=http%3A%2F%2Fwww.domain.com%2Fen-us||1372268254000; _opt_vi_3FNG8DZU=F870DCFD-CBA4-4B6E-BB58-4605A78EE71A; "
        + "__ptca=145721067.0aDxsZlIuM48.1372279055.1379945057.1379950362.9; __ptv_62vY4e=0aDxsZlIuM48; __pti_62vY4e=0aDxsZlIuM48; "
        + "__ptcz=145721067.1372279055.1.0.ptmcsr=(direct)|ptmcmd=(none)|ptmccn=(direct); "
        + "__hstc=145721067.b86362bb7a1d257bfa2d1fb77e128a85.1372268254968.1379934256743.1379939561848.9; "
        + "hubspotutk=b86362bb7a1d257bfa2d1fb77e128a85; USER_GROUP=julinho%3Afalse; has_js=1; "
        + "WT_FPC=id=177.43.52.210-1491335248.30301337:lv=1385997780893:ss=1385997780893; "
        + "dtCookie=1F2E0E1037589799D8D503EB8CFA12A1|_default|1; RM=julinho%3A5248423ad3fe062f06c54915e6cde5cb45147977; "
        + "wcid=UpyKsQpmBAwAABURyNoAAAAS%3A35d8227ba1e8a9a9cebaaf8d019a74777c32b4c8; "
        + "Carte::KerberosLexicon_getWGSN=82ae3dcd1b956288c3c86bdbed6ebcc0fd040e1e; "
        + "UserData=Username%3AJULINHO%3AHomepage%3A1%3AReReg%3A0%3ATrialist%3A0%3ALanguage%3Aen%3ACcode%3Abr%3AForceReReg%3A0; "
        + "UserID=1356673%3A12345%3A1234567890%3A123%3Accode%3Abr; "
        + "USER_DATA=1356673%3Ajulinho%3AJulio+Jose%3Ada+Silva%3Ajulinho%40tecnoblu.com.br%3A0%3A1%3Aen%3Abr%3A%3AWGSN%3A1385990833.81925%3A82ae3dcd1b956288c3c86bdbed6ebcc0fd040e1e; "
        + "MODE=FONTIS; SECTION=%2Fcontent%2Fsection%2Fhome.html; "
        + "edge_auth=ip%3D177.43.52.210~expires%3D1385994522~access%3D%2Fapps%2F%2A%21%2Fbin%2F%2A%21%2Fcontent%2F%2A%21%2Fetc%2F%2A%21%2Fhome%2F%2A%21%2Flibs%2F%2A%21%2Freport%2F%2A%21%2Fsection%2F%2A%21%2Fwgsn%2F%2A~md5%3D90e73ee10161c1afacab12c6ea30b4ef; "
        + "__utma=94539802.1793276213.1372268248.1385572390.1385990581.16; __utmb=94539802.52.9.1385991739764; __utmc=94539802; "
        + "__utmz=94539802.1372268248.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); "
        + "WT_FPC=id=177.43.52.210-1491335248.30301337:lv=1386000374581:ss=1386000374581; dtPC=-; "
        + "NSC_wtfswfs_xfcgbsn40-41=ffffffff096e1a1d45525d5f4f58455e445a4a423660; akamai-edge=5ac6e5b3d0bbe2ea771bb2916d8bab34ea222a6a"
        + " 200 \"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.57 Safari/537.36\""
        + " \"www.domain.com\" - 463952 http://www.domain.com/content/report/shows/New_York/KSHK/trip/s_s_14_ny_ww/sheers.html";

}

/**
 * Demo that splits one Apache access-log line into its 14 fields and prints them.
 */
public class readLog implements LogExample {

    /**
     * One capturing group per LogFormat directive, in log order. Fields that Apache may
     * log as "-" (user, client IP, logname, byte count) are matched with \S+ rather than
     * digit-only classes, and the cookie is matched lazily up to the " <status> "<agent>""
     * sequence so it may contain spaces and need not end in digits.
     */
    private static final Pattern LOG_ENTRY_PATTERN = Pattern.compile(
        "\\[([\\w:/]+\\s[+\\-]\\d{4})\\]"   // %t  timestamp
        + " (\\S+)"                          // %u  remote user
        + " \\[(\\d+)"                       // %D  time taken
        + " (\\S+)"                          // %h  source host
        + " (\\S+)"                          // True-Client-IP header
        + " (\\S+)"                          // UNIQUE_ID
        + " (.*?)\\]"                        // %r  request line
        + " (.*?)"                           // Cookie header
        + " (\\d{3})"                        // %s  status code
        + " \"([^\"]*)\""                    // User-Agent header
        + " \"([^\"]*)\""                    // Host header
        + " (\\S+)"                          // %l  remote logname
        + " (\\S+)"                          // %b  response size
        + " (.*)");                          // Referer header

    /** Output labels, index-aligned with the array returned by {@link #parse(String)}. */
    private static final String[] FIELD_LABELS = {
        "Date&Time: ",
        "User: ",
        "Time taken to serve request: ",
        "Source IP: ",
        "Client IP: ",
        "Unique ID: ",
        "Request: ",
        "Cookie: ",
        "Status code: ",
        "User agent: ",
        "Host: ",
        "Remote logname: ",
        "Size of response: ",
        "Referrer: ",
    };

    /**
     * Splits a log line into its fields.
     *
     * @param line one complete log line; may be {@code null}
     * @return the captured fields in log order, or an empty array when the line is
     *         {@code null} or does not match the expected format
     */
    static String[] parse(String line) {
        if (line == null) {
            return new String[0];
        }
        Matcher matcher = LOG_ENTRY_PATTERN.matcher(line);
        // group() may only be called after a successful match attempt.
        if (!matcher.matches()) {
            return new String[0];
        }
        String[] fields = new String[matcher.groupCount()];
        for (int i = 0; i < fields.length; i++) {
            fields[i] = matcher.group(i + 1); // group 0 is the whole match, not a field
        }
        return fields;
    }

    /**
     * Prints the pattern, the sample line, and each parsed field of the sample line.
     *
     * @param argv ignored
     */
    public static void main(String[] argv) {

        System.out.println("Using RE Pattern:");
        System.out.println(LOG_ENTRY_PATTERN.pattern());

        System.out.println("Input line is:");
        System.out.println(logEntryLine);

        String[] fields = parse(logEntryLine);
        if (fields.length != NUM_FIELDS) {
            System.err.println("Bad log entry (or problem with RE?):");
            System.err.println(logEntryLine);
            return;
        }
        for (int i = 0; i < fields.length; i++) {
            System.out.println(FIELD_LABELS[i] + fields[i]);
        }
    }
}

Some example log lines are:

[03/Dec/2013:10:53:59 +0000] - [32002 10.102.4.254 195.229.241.182 Up24RwpmBAwAAA1LWJsAAAAR GET /content/dam/Central_Library/Street_Shots/Youth/2012/09sep/LFW/Gallery_03/LFW_SS13_SEPT_12_777.jpg.image.W0N539E3452S3991w313.original.jpg HTTP/1.1] __utmc=94539802; dtCookie=EFD9D09B6A2E1789F1329FC1381A356A|_default|1; dtPC=471217988_141#_load_; Carte::KerberosLexicon_getdomain=6701c1320dd96688b2e40b92ce748eee7ae99722; UserData=Username%3ALSHARMA%3AHomepage%3A1%3AReReg%3A0%3ATrialist%3A0%3ALanguage%3Aen%3ACcode%3Aae%3AForceReReg%3A0; UserID=1375493%3A12345%3A1234567890%3A123%3Accode%3Aae; USER_DATA=1375493%3ALSharma%3ALokesh%3ASharma%3Alokesh.sharma%40landmarkgroup.com%3A0%3A1%3Aen%3Aae%3A%3Adomain%3A1386060868.51392%3A6701c1320dd96688b2e40b92ce748eee7ae99722; MODE=FONTIS; __utma=94539802.911097326.1339390457.1386060848.1386065609.190; __utmz=94539802.1384758205.177.38.utmcsr=google|utmccn=(organic)|utmcmd=organic|utmctr=(not%20provided); __kti=1339390460526,http%3A%2F%2Fwww.domain.com%2F,; __ktv=28e8-6c4-be3-ce54137d9e48271; WT_FPC=id=2.50.27.157-3067016480.30226245:lv=1386047044279:ss=1386046439530; _opt_vi_3FNG8DZU=42880957-D2F1-4DC5-AF16-FEF88891D24E; __hstc=145721067.750d315a49c64268192826b3911a4e5a.1351772962050.1381151113005.1381297633204.66; hsfirstvisit=http%3A%2F%2Fwww.domain.com%2F|http%3A%2F%2Fwww.google.co.in%2Furl%3Fsa%3Dt%26rct%3Dj%26q%3Ddomain.com%26source%3Dweb%26cd%3D1%26ved%3D0CB0QFjAA%26url%3Dhttp%3A%2F%2Fwww.domain.com%2F%26ei%3DDmuSULW3AcTLhAfJ24CoDA%26usg%3DAFQjCNGvPmmyn8Bk67OUv-HwjVU4Ff3q1w|1351772962000; hubspotutk=750d315a49c64268192826b3911a4e5a; __ptca=145721067.jQ7lN5U3C4eN.1351758562.1381136713.1381283233.66; __ptv_62vY4e=jQ7lN5U3C4eN; __pti_62vY4e=jQ7lN5U3C4eN; __ptcz=145721067.1351758562.1.0.ptmcsr=google|ptmcmd=organic|ptmccn=(organic)|ptmctr=domain.com; RM=Lsharma%3Ac163b6097f90d2869e537f95900e1c464daa8fb9; wcid=Up2cRApmBAwAAFOiVhcAAAAH%3Af32e5e5f5b593175bfc71af082ab26e4055efeb6; __utmb=94539802.71.9.1386067462709; 
edge_auth=ip%3D195.229.241.182~expires%3D1386069280~access%3D%2Fapps%2F%2A%21%2Fbin%2F%2A%21%2Fcontent%2F%2A%21%2Fetc%2F%2A%21%2Fhome%2F%2A%21%2Flibs%2F%2A%21%2Freport%2F%2A%21%2Fsection%2F%2A%21%2Fdomain%2F%2A~md5%3D5b47f34172392487dcd44c1d837e2e54; has_js=1; SECTION=%2Fcontent%2Fsection%2Finspiration-design%2Fstreet-shots.html; JSESSIONID=b9377099-7708-45ae-b6e7-c575ffe82187; WT_FPC=id=2.50.27.157-3067016480.30226245:lv=1386053618209:ss=1386053618209; USER_GROUP=LSharma%3Afalse; NSC_wtfswfs_xfcgbsn40-41=ffffffff096e1a1d45525d5f4f58455e445a4a423660 200 "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)" "www.domain.com" - 24516 http://www.domain.com/content/report/Street_Shots/Youth/Global_round_up/2013/01_Jan/mens_youth_stylingglobalround-up1.html
[03/Dec/2013:10:53:59 +0000] - [5998 10.102.4.254 165.193.178.71 Up24RwpmBAwAAH6zEtsAAAAB GET /content/dam/Central_Library/Catwalk_Shows/Paris/Spring_Summer_2013/Mens_foot/Comme_Des_Garcons/CDGarcons_FOOT_01201206291020.jpg.rendition.thumbnail.jpg HTTP/1.1] SECTION=%2Fcontent%2Fsection%2Fhome.html; USER_GROUP=anonymous%3Afalse; search_cookie=; searchtime_cookie=all; search_type=domain_photos; WT_FPC=id=46.65.238.194-75238752.30316538:lv=1386068091347:ss=1386066333452; NSC_wtfswfs_xfcgbsn40-41=ffffffff096e1a1d45525d5f4f58455e445a4a423660; Carte::KerberosLexicon_getdomain=bb0da583303a49a8294403860ea3b4d326e6934d; UserData=Username%3AUARTSLONDON%3AHomepage%3A1%3AReReg%3A0%3ATrialist%3A0%3ALanguage%3Aen%3ACcode%3Agb%3AForceReReg%3A0; UserID=1264925%3A12345%3A1234567890%3A123%3Accode%3Agb; USER_DATA=1264925%3Auartslondon%3AUniversity%3Aof+the+Arts+London%3A%3A0%3A1%3Aen%3Agb%3AEDU%3Adomain%3A1386066044.0231%3Abb0da583303a49a8294403860ea3b4d326e6934d; MODE=FONTIS; edge_auth=ip%3D165.193.178.71~expires%3D1386067844~access%3D%2Fapps%2F%2A%21%2Fbin%2F%2A%21%2Fcontent%2F%2A%21%2Fetc%2F%2A%21%2Fhome%2F%2A%21%2Flibs%2F%2A%21%2Freport%2F%2A%21%2Fsection%2F%2A%21%2Fdomain%2F%2A~md5%3Dd1210aafdb5701c303c348f2fec1c3ff; dtCookie=6CF28DC8E50C0E5179365B4683DCB3D8|_default|1 200 "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.57 Safari/537.36" "www.domain.com" - 6068 http://www.domain.com/content/domain/unifiedsearch.html?q=comme%20des%20garcons&tags=&type=domain_photos&t=all&_charset_=utf-8
[03/Dec/2013:10:53:59 +0000] - [324 10.102.4.254 - Up24RwpmBAwAAHv627sAAAAQ GET /akamai-sureroute-test-object.htm HTTP/1.1] dtCookie=85A2D2D0984B2071B94E304F711146D1|_default|1 200 "FirstFlowAgent" "day-cms.domain.com.akadns.net" - 214 -
[03/Dec/2013:10:53:59 +0000] - [258 10.102.4.254 127.0.0.1 Up24RwpmBAwAAAl@3VUAAAAJ GET /akamai-sureroute-test-object.htm HTTP/1.1] dtCookie=1AD942993C377EC2C95223611ED38204|_default|1 200 "-" "secure.domain.com" - 215 -

Any idea which regex I need for this log format?

See Question&Answers more detail:os

与恶龙缠斗过久,自身亦成为恶龙;凝视深渊过久,深渊将回以凝视…
Welcome To Ask or Share your Answers For Others

1 Reply

0 votes
by (71.8m points)

I have now managed to split this:

import java.util.regex.*;
public class stringRS {
    public static void main(String[] args){
        String line = "[03/Dec/2013:10:53:59 +0000] - [32002 10.102.4.254 195.229.241.182 Up24RwpmBAwAAA1LWJsAAAAR GET /content/dam/original.jpg HTTP/1.1] __utmc=94539802; dtCookie=EFD9D09B6A2E1; dtPC=load_; Carte::Kertdomain=6701c1320dd99722; UserData=Username%3ALSHARMA%3AHomepageg%3A0; UserID=1375493%de%3Aae; USER_DATA=rma%40landmain%3A13860608699722; MODE=FONTIS; __utma=945326.5609.190; __utmz=94ic|utmcomain.com%2F,; __ktv=28e8-6c4-be3-ce54137d9e48271; WT_FPC=id=2.50480.30226245:lv=1386047044279:ss=1386046439530; _opt_vi_3FNG8DZU=42880957-D2F1-4DC5-AF16-FEF88891D24E; __hstc=145721067.750d315a49c64268192826b3911a4e5a.1351772962050.1381151113005.1381297633204.66; hsfirstvisit=http%3A%2F%2Fwww.domain.com%2F|http%3A%2F%2Fwww.google.co.in%2Furl%3Fsa%3Dt%26rct%3Dj%26q%3Ddomain.com%26source%3Dweb%26cd%3D1%26ved%3D0CB0QFjAA%26url%3Dhttp%3A%2F%2Fwww.domain.com%2F%26ei%3DDmuSULW3AcTLhAfJ24CoDA%26usg%3DAFQjCNGvPmmyn8Bk67OUv-HwjVU4Ff3q1w|1351772962000; hubspotutk=750d315a49c64268192826b3911a4e5a; __ptca=145721067.jQ7lN5U3C4eN.1351758562.1381136713.1381283233.66; __ptv_62vY4e=jQ7lN5U3C4eN; __pti_62vY4e=jQ7lN5U3C4eN; __ptcz=145721067.1351758562.1.0.ptmcsr=google|ptmcmd=organic|ptmccn=(organic)|ptmctr=domain.com; RM=Lsharma%3Ac163b6097f90d2869e537f95900e1c464daa8fb9; wcid=Up2cRApmBAwAAFOiVhcAAAAH%3Af32e5e5f5b593175bfc71af082ab26e4055efeb6; __utmb=94539802.71.9.1386067462709; edge_auth=ip%3D195.229.241.182~expires%3D1386069280~access%3D%2Fapps%2F%2A%21%2Fbin%2F%2A%21%2Fcontent%2F%2A%21%2Fetc%2F%2A%21%2Fhome%2F%2A%21%2Flibs%2F%2A%21%2Freport%2F%2A%21%2Fsection%2F%2A%21%2Fdomain%2F%2A~md5%3D5b47f34172392487dcd44c1d837e2e54; has_js=1; SECTION=%2Fcontent%2Fsection%2Finspiration-design%2Fstreet-shots.html; JSESSIONID=b9377099-7708-45ae-b6e7-c575ffe82187; WT_FPC=id=2.50.27.157-3067016480.30226245:lv=1386053618209:ss=1386053618209; USER_GROUP=LSharma%3Afalse; NSC_wtfswfs_xfcgbsn40-41=ffffffff096e1a1d45525d5f4f58455e445a4a423660 200 "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; 
WOW64; Trident/5.0)" "www.domain.com" - 24516 http://www.domain.com/content/report/Street_Shots/Youth/Global_round_up/2013/01_Jan/mens_youth_stylingglobalround-up1.html";

        String pattern = "\[([\w:/]+\s[+\-]\d{4})\] (\S) \[(\w+) ([\d.]+) ([\d.]+) (\w+) (\w+.*)\] (\w.*?\d{3}) ([\d.]+) "([^"]*)" "([^"]*)" (\S) (\w+) (.*)";
        Pattern r = Pattern.compile(pattern);

        // Now create matcher object.
        Matcher m = r.matcher(line);
        if (m.find( )) {
            System.out.println("String: " + m.group(0) );
            System.out.println("Date&Time: " + m.group(1) );
            System.out.println("User: " + m.group(2) );
            System.out.println("Time taken: " + m.group(3) );
            System.out.println("Source IP: " + m.group(4) );
            System.out.println("Client IP: " + m.group(5) );
            System.out.println("Unique ID: " + m.group(6) );
            System.out.println("Reguest: " + m.group(7) );
            System.out.println("Cookie: " + m.group(8) );
            System.out.println("Request Type: " + m.group(9) );
            System.out.println("Browser: " + m.group(10) );
            System.out.println("Domain: " + m.group(11) );
            System.out.println("Remote logname: " + m.group(12) );
            System.out.println("bytes sent: " + m.group(13) );
            System.out.println("Referrer: " + m.group(14) );
        } else {
            System.out.println("NO MATCH");
        }
    }
}

与恶龙缠斗过久,自身亦成为恶龙;凝视深渊过久,深渊将回以凝视…
OGeek|极客中国-欢迎来到极客的世界,一个免费开放的程序员编程交流平台!开放,进步,分享!让技术改变生活,让极客改变未来! Welcome to OGeek Q&A Community for programmer and developer-Open, Learning and Share
Click Here to Ask a Question

...