From f442af956cb8e5039c6856ce92a8de9e8f76f066 Mon Sep 17 00:00:00 2001
From: hydrox
Date: Fri, 17 Nov 2006 11:49:21 +0000
Subject: [PATCH] *) first version of built-in logalizer

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2965 6c8d7289-2bf4-0310-a012-ef5d649a1542
---
 .../server/logging/LogalizerHandler.java      | 115 ++++++++
 .../server/logging/logParsers/LogParser.java  |  36 +++
 .../logging/logParsers/LogParserPLASMA.java   | 267 ++++++++++++++++++
 3 files changed, 418 insertions(+)
 create mode 100644 source/de/anomic/server/logging/LogalizerHandler.java
 create mode 100644 source/de/anomic/server/logging/logParsers/LogParser.java
 create mode 100644 source/de/anomic/server/logging/logParsers/LogParserPLASMA.java

diff --git a/source/de/anomic/server/logging/LogalizerHandler.java b/source/de/anomic/server/logging/LogalizerHandler.java
new file mode 100644
index 000000000..122876a06
--- /dev/null
+++ b/source/de/anomic/server/logging/LogalizerHandler.java
@@ -0,0 +1,115 @@
+package de.anomic.server.logging;
+
+import java.io.File;
+import java.io.FilenameFilter;
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.util.HashMap;
+import java.util.logging.Handler;
+import java.util.logging.LogManager;
+import java.util.logging.LogRecord;
+
+import de.anomic.plasma.plasmaParser;
+import de.anomic.server.logging.logParsers.LogParser;
+
+public class LogalizerHandler extends Handler {
+
+    public static boolean enabled;
+    public static boolean debug;
+    private String logParserPackage;
+    private HashMap parsers;
+
+    public LogalizerHandler() {
+        super();
+        configure();
+    }
+
+    private HashMap loadParsers() {
+        HashMap parsers = new HashMap();
+        try {
+            if (enabled) System.out.println("Searching for additional content parsers in package " + logParserPackage);
+            // getting a URI to the parser subpackage
+            String packageURI = plasmaParser.class.getResource("/"+logParserPackage.replace('.','/')).toString();
+            if (enabled) System.out.println("LogParser directory is " + packageURI);
+
+            File parserDir = new File(new URI(packageURI));
+            String [] parserDirFiles = parserDir.list(parserNameFilter);
+            if (parserDirFiles == null && enabled) {
+                System.out.println("Can't find any parsers in "+parserDir.getAbsolutePath());
+            }
+            // guard against a missing parser directory before iterating
+            if (parserDirFiles != null) for (int i=0; i<parserDirFiles.length; i++) {
+                // strip the ".class" suffix and load the candidate class
+                String className = parserDirFiles[i].substring(0, parserDirFiles[i].indexOf(".class"));
+                Class parserClass = Class.forName(logParserPackage + "." + className);
+                if (parserClass.isInterface()) continue;
+                Object theParser = parserClass.newInstance();
+                if (theParser instanceof LogParser) {
+                    LogParser theLogParser = (LogParser) theParser;
+                    // register the parser under its log channel name, e.g. "PLASMA"
+                    parsers.put(theLogParser.getParserType(), theParser);
+                    if (enabled) System.out.println("Added " + parserClass.getName() + " as " + theLogParser.getParserType() + " Parser.");
+                } else {
+                    if (enabled) System.out.println("Rejected " + parserClass.getName() + ". Class does not implement the LogParser interface.");
+                }
+            }
+        } catch (URISyntaxException e) {
+            e.printStackTrace();
+        } catch (ClassNotFoundException e) {
+            e.printStackTrace();
+        } catch (InstantiationException e) {
+            e.printStackTrace();
+        } catch (IllegalAccessException e) {
+            e.printStackTrace();
+        }
+        return parsers;
+    }
+
+    /**
+     * Reads this handler's configuration properties from the LogManager.
+     */
+    private void configure() {
+        LogManager manager = LogManager.getLogManager();
+        String className = getClass().getName();
+
+        enabled = "true".equalsIgnoreCase(manager.getProperty(className + ".enabled"));
+        debug = "true".equalsIgnoreCase(manager.getProperty(className + ".debug"));
+        logParserPackage = manager.getProperty(className + ".parserPackage");
+
+        parsers = loadParsers();
+    }
+
+    public void publish(LogRecord record) {
+        if (enabled) {
+            // dispatch the record to the parser registered for its log channel
+            LogParser parser = (LogParser) parsers.get(record.getLoggerName());
+            if (parser != null) parser.parse(record.getLevel().toString(), record.getMessage());
+        }
+    }
+
+    public void close() throws SecurityException {
+    }
+
+    public void flush() {
+    }
+
+    private FilenameFilter parserNameFilter = new FilenameFilter() {
+        public boolean accept(File dir, String name) {
+            return name.matches(".*\\.class");
+        }
+    };
+}
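Note: the handler pulls its settings from the JDK LogManager, so switching it on is a
matter of a few lines in the logging configuration. A minimal sketch, assuming the
property names read by configure() above; the exact configuration file (for YaCy,
yacy.logging) and the handler registration line are assumptions, not part of this patch:

    # register the logalizer as an additional log handler (assumed wiring)
    handlers = de.anomic.server.logging.LogalizerHandler
    de.anomic.server.logging.LogalizerHandler.enabled = true
    de.anomic.server.logging.LogalizerHandler.debug = false
    # package that is scanned for LogParser implementations
    de.anomic.server.logging.LogalizerHandler.parserPackage = de.anomic.server.logging.logParsers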
diff --git a/source/de/anomic/server/logging/logParsers/LogParser.java b/source/de/anomic/server/logging/logParsers/LogParser.java
new file mode 100644
--- /dev/null
+++ b/source/de/anomic/server/logging/logParsers/LogParser.java
@@ -0,0 +1,36 @@
+package de.anomic.server.logging.logParsers;
+
+public interface LogParser {
+
+    /**
+     * Analyzes a single line of a log. The return value is the number of
+     * additional lines to be loaded and passed over to the
+     * advancedParse-method. The method should return -1 if the given
+     * line was not processed.
+     *
+     * TODO: description of logLevels
+     *
+     * @param logLevel The LogLevel of the line to analyze.
+     * @param logLine The line to be analyzed by the parser.
+     * @return number of additional lines to be loaded and passed over to the
+     * advancedParse-method, or "-1" if the line was not processed by the
+     * parser.
+     */
+    public int parse(String logLevel, String logLine);
+
+    /**
+     * Prints the parser's results to standard output.
+     */
+    public void printResults();
+
+    /**
+     * The return value defines which logLines the parser will handle.
+     * @return a String that defines the logLines to analyze, for example
+     * PLASMA or YACY.
+     */
+    public String getParserType();
+}
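Note: to illustrate the contract, here is a minimal hypothetical implementation (the
class and its "EXAMPLE" channel name are invented for illustration and are not part of
this patch). Dropped into the configured parser package, loadParsers() would register
it, and the handler would feed it every record of the logger named "EXAMPLE":

    package de.anomic.server.logging.logParsers;

    public class LogParserEXAMPLE implements LogParser {

        private int infoLines = 0;

        public int parse(String logLevel, String logLine) {
            if (logLevel.equals("INFO")) {
                infoLines++;
                return 0;  // line handled, no additional lines requested
            }
            return -1;     // this line was not processed
        }

        public void printResults() {
            System.out.println("EXAMPLE: counted " + infoLines + " INFO lines.");
        }

        public String getParserType() {
            return "EXAMPLE"; // the log channel this parser subscribes to
        }
    }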
diff --git a/source/de/anomic/server/logging/logParsers/LogParserPLASMA.java b/source/de/anomic/server/logging/logParsers/LogParserPLASMA.java
new file mode 100644
index 000000000..a62b25163
--- /dev/null
+++ b/source/de/anomic/server/logging/logParsers/LogParserPLASMA.java
@@ -0,0 +1,267 @@
+package de.anomic.server.logging.logParsers;
+
+import java.util.HashSet;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+public class LogParserPLASMA implements LogParser {
+
+    //RegExp for LogLevel I
+    private static Pattern i1 = Pattern.compile("Received (\\d*) URLs from peer [\\w-_]{12}:[\\w-_]*/[\\w.-]* in (\\d*) ms, Blocked (\\d*) URLs");
+    private static Pattern i2 = Pattern.compile("Received (\\d*) Entries (\\d*) Words \\[[\\w-_]{12} .. [\\w-_]{12}\\]/[\\w.-]* from [\\w-_]{12}:[\\w-_]*/[\\w.-]*, processed in (\\d*) milliseconds, requesting (\\d*)/(\\d*) URLs, blocked (\\d*) RWIs");
+    private static Pattern i2_2 = Pattern.compile("Received (\\d*) Entries (\\d*) Words \\[[\\w-_]{12} .. [\\w-_]{12}\\]/[\\w.-]* from [\\w-_]{12}:[\\w-_]*, processed in (\\d*) milliseconds, requesting (\\d*)/(\\d*) URLs, blocked (\\d*) RWIs");
+    private static Pattern i3 = Pattern.compile("Index transfer of (\\d*) words \\[[\\w-_]{12} .. [\\w-_]{12}\\] to peer ([\\w-_]*):([\\w-_]{12}) in (\\d*) seconds successful \\((\\d*) words/s, (\\d*) Bytes\\)");
+    private static Pattern i4 = Pattern.compile("Index transfer of (\\d*) entries (\\d*) words \\[[\\w-_]{12} .. [\\w-_]{12}\\] and (\\d*) URLs to peer ([\\w-_]*):([\\w-_]{12}) in (\\d*) seconds successful \\((\\d*) words/s, (\\d*) Bytes\\)");
+    private static Pattern i5 = Pattern.compile("Selected \\w* DHT target peer ([\\w-_]*):([\\w-_]{12}), distance = ([\\w.-]*)");
+    private static Pattern i6 = Pattern.compile("Rejecting RWIs from peer ([\\w-_]{12}):([\\w-_]*)/([\\w.]*) ([\\w. ]*)");
+    private static Pattern i7 = Pattern.compile("DHT distribution: transfer to peer [\\w-]* finished.");
+    private static Pattern i8 = Pattern.compile("Index selection of (\\d*) words \\[[\\w-_]{12} .. [\\w-_]{12}\\] in (\\d*) seconds");
+    private static Pattern i9 = Pattern.compile("RankingDistribution - transmitted file [\\w-:.\\\\]* to [\\w.]*:\\d* successfully in (\\d*) seconds");
+    private static Pattern i10 = Pattern.compile("RankingDistribution - error transmitting file");
+    private static Pattern i11 = Pattern.compile("Peer [\\w-_]*:[\\w-_]{12} is busy\\. Waiting \\d* ms\\.");
+    //private static Pattern i12 = Pattern.compile("\\*Indexed \\d* words in URL [\\w:.&?/%-~$§@=]* \\[[\\w-_]{12}\\]");
+    private static Pattern i13 = Pattern.compile("WROTE HEADER for |LOCALCRAWL\\[\\d*, \\d*, \\d*, \\d*\\]|REJECTED WRONG STATUS TYPE");
+    //RegExp for LogLevel W
+    private static Pattern w1 = Pattern.compile("found not enough \\(\\d*\\) peers for distribution");
+    private static Pattern w2 = Pattern.compile("Transfer to peer ([\\w-_]*):([\\w-_]{12}) failed:'(\\w*)'");
+    //RegExp for LogLevel E
+    private static Pattern e1 = Pattern.compile("INTERNAL ERROR AT plasmaCrawlLURL:store:de.anomic.kelondro.kelondroException: tried to create (\\w*) node twice in db");
+    private static Pattern e2 = Pattern.compile("INTERNAL ERROR [\\w./: ]* java.net.MalformedURLException");
+
+    private Matcher m;
+    //RegExp for advancedParser
+    //private Pattern adv1 = Pattern.compile("\\*Indexed (\\d*) words in URL [\\w:.&?/%-=]* \\[[\\w-_]{12}\\]\\n\\tDescription: ([\\w- ]*)\\n\\tMimeType: ([\\w-_/]*) \\| Size: (\\d*) bytes \\| Anchors: (\\d*)\\n\\tStackingTime: (\\d*) ms \\| ParsingTime: (\\d*) ms \\| IndexingTime: (\\d*) ms \\| StorageTime: (\\d*) ms");
+    private Pattern adv1 = Pattern.compile("\\*Indexed (\\d*) words in URL [\\w:.&?/%-~$§@=]* \\[[\\w-_]{12}\\][\\r\\n]*\\tDescription: ([\\w-\\.,:!?='\"„|/äöüߟ¶œ¼©³–+@() ]*)[\\r\\n]*\\tMimeType: ([\\w-_~/]*) \\| Size: (\\d*) bytes \\| Anchors: (\\d*)[\\r\\n]*\\tStackingTime:[ ]*(\\d*) ms \\| ParsingTime:[ ]*(\\d*) ms \\| IndexingTime: (\\d*) ms \\| StorageTime: (\\d*) ms");
+
+    private int urlSum=0;
+    private int urlReqSum=0;
+    private int blockedURLSum=0;
+    private int wordsSum=0;
+    private int rwiSum=0;
+    private int blockedRWISum=0;
+    private long urlTimeSum=0;
+    private long rwiTimeSum=0;
+    private long DHTSendTraffic=0;
+    private int DHTSendURLs=0;
+    private int RWIRejectCount=0;
+    private HashSet RWIRejectPeerNames = new HashSet();
+    private HashSet RWIRejectPeerHashs = new HashSet();
+    private HashSet DHTPeerNames = new HashSet();
+    private HashSet DHTPeerHashs = new HashSet();
+    private int DHTSelectionTargetCount = 0;
+    private int DHTSelectionWordsCount = 0;
+    private int DHTSelectionWordsTimeCount = 0;
+    private double minDHTDist = 1;
+    private double maxDHTDist = 0;
+    private double avgDHTDist = 0;
+    private int busyPeerCount = 0;
+    private int notEnoughDHTPeers = 0;
+    private int failedIndexDistributionCount = 0;
+    private int leftChildTwiceCount = 0;
+    private int rightChildTwiceCount = 0;
+    private int rankingDistributionCount = 0;
+    private int rankingDistributionTime = 0;
+    private int rankingDistributionFailCount = 0;
+    private int malformedURLCount = 0;
+    private int indexedSites = 0;
+    private int indexedWordSum = 0;
+    private int indexedSiteSizeSum = 0;
+    private int indexedAnchorsCount = 0;
+    private int indexedStackingTime = 0;
+    private int indexedParsingTime = 0;
+    private int indexedIndexingTime = 0;
+    private int indexedStorageTime = 0;
+    private final String parserType = "PLASMA";
+
+    public int parse(String logLevel, String logLine) {
+        if (logLevel.equals("INFO")) {
+            m = i1.matcher(logLine);
+            if (m.find()) {
+                urlSum += Integer.parseInt(m.group(1));
+                urlTimeSum += Integer.parseInt(m.group(2));
+                blockedURLSum += Integer.parseInt(m.group(3));
+                return 0;
+            }
+            m = i2.matcher(logLine);
+            if (m.find()) {
+                rwiSum += Integer.parseInt(m.group(1));
+                wordsSum += Integer.parseInt(m.group(2));
+                rwiTimeSum += Integer.parseInt(m.group(3));
+                urlReqSum += Integer.parseInt(m.group(4));
+                blockedRWISum += Integer.parseInt(m.group(6));
+                return 0;
+            }
+            m = i2_2.matcher(logLine);
+            if (m.find()) {
+                rwiSum += Integer.parseInt(m.group(1));
+                wordsSum += Integer.parseInt(m.group(2));
+                rwiTimeSum += Integer.parseInt(m.group(3));
+                urlReqSum += Integer.parseInt(m.group(4));
+                blockedRWISum += Integer.parseInt(m.group(6));
+                return 0;
+            }
+            m = i3.matcher(logLine);
+            if (m.find()) {
+                DHTSendTraffic += Integer.parseInt(m.group(6));
+                DHTPeerNames.add(m.group(2));
+                DHTPeerHashs.add(m.group(3));
+                return 0;
+            }
+            m = i4.matcher(logLine);
+            if (m.find()) {
+                DHTSendTraffic += Integer.parseInt(m.group(8));
+                DHTSendURLs += Integer.parseInt(m.group(3));
+                DHTPeerNames.add(m.group(4));
+                DHTPeerHashs.add(m.group(5));
+                return 0;
+            }
+            m = i5.matcher(logLine);
+            if (m.find()) {
+                minDHTDist = Math.min(minDHTDist, Double.parseDouble(m.group(3)));
+                maxDHTDist = Math.max(maxDHTDist, Double.parseDouble(m.group(3)));
+                avgDHTDist += Double.parseDouble(m.group(3));
+                DHTSelectionTargetCount++;
+                return 0;
+            }
+            m = i6.matcher(logLine);
+            if (m.find()) {
+                RWIRejectPeerNames.add(m.group(2));
+                RWIRejectPeerHashs.add(m.group(1));
+                RWIRejectCount++;
+                return 0;
+            }
+            m = i7.matcher(logLine);
+            if (m.find()) {
+                return 0;
+            }
+            m = i8.matcher(logLine);
+            if (m.find()) {
+                DHTSelectionWordsCount += Double.parseDouble(m.group(1));
+                DHTSelectionWordsTimeCount += Double.parseDouble(m.group(2));
+                return 0;
+            }
+            m = i9.matcher(logLine);
+            if (m.find()) {
+                rankingDistributionCount++;
+                rankingDistributionTime += Integer.parseInt(m.group(1));
+                return 0;
+            }
+            m = i10.matcher(logLine);
+            if (m.find()) {
+                rankingDistributionFailCount++;
+                return 0;
+            }
+            m = i11.matcher(logLine);
+            if (m.find()) {
+                busyPeerCount++;
+                return 0;
+            }
+//            m = i12.matcher(logLine);
+//            if (m.find()) {
+//                return 3;
+//            }
+            m = i13.matcher(logLine);
+            if (m.find()) {
+                return 0;
+            }
+            m = adv1.matcher(logLine);
+            if (m.find()) {
+                indexedSites++;
+                indexedWordSum += Integer.parseInt(m.group(1));
+                indexedSiteSizeSum += Integer.parseInt(m.group(4));
+                indexedAnchorsCount += Integer.parseInt(m.group(5));
+                indexedStackingTime += Integer.parseInt(m.group(6));
+                indexedParsingTime += Integer.parseInt(m.group(7));
+                indexedIndexingTime += Integer.parseInt(m.group(8));
+                indexedStorageTime += Integer.parseInt(m.group(9));
+                return 0;
+            }
+        } else if (logLevel.equals("WARNING")) {
+            m = w1.matcher(logLine);
+            if (m.find()) {
+                notEnoughDHTPeers++;
+                return 0;
+            }
+            m = w2.matcher(logLine);
+            if (m.find()) {
+                failedIndexDistributionCount++;
+                return 0;
+            }
+        } else if (logLevel.equals("SEVERE")) {
+            m = e1.matcher(logLine);
+            if (m.find()) {
+                if (m.group(1).equals("leftchild")) leftChildTwiceCount++;
+                else if (m.group(1).equals("rightchild")) rightChildTwiceCount++;
+                return 0;
+            }
+            m = e2.matcher(logLine);
+            if (m.find()) {
+                malformedURLCount++;
+                return 0;
+            }
+        }
+        return -1;
+    }
+
+    public void printResults() {
+        // guard the average calculations below against division by zero
+        if (rankingDistributionCount == 0) rankingDistributionCount = 1;
+        if (DHTSelectionWordsTimeCount == 0) DHTSelectionWordsTimeCount = 1;
+        if (indexedSites == 0) indexedSites = 1;
+        System.out.println("INDEXER: Indexed " + indexedSites + " sites in " + (indexedStackingTime + indexedParsingTime + indexedIndexingTime + indexedStorageTime) + " milliseconds.");
+        System.out.println("INDEXER: Indexed " + indexedWordSum + " words on " + indexedSites + " sites. (avg. words per site: " + (indexedWordSum / indexedSites) + ").");
+        System.out.println("INDEXER: Total Size of indexed sites: " + indexedSiteSizeSum + " bytes (avg. size per site: " + (indexedSiteSizeSum / indexedSites) + " bytes).");
+        System.out.println("INDEXER: Total Number of Anchors found: " + indexedAnchorsCount + " (avg. Anchors per site: " + (indexedAnchorsCount / indexedSites) + ").");
+        System.out.println("INDEXER: Total StackingTime: " + indexedStackingTime + " milliseconds (avg. StackingTime: " + (indexedStackingTime / indexedSites) + " milliseconds).");
+        System.out.println("INDEXER: Total ParsingTime: " + indexedParsingTime + " milliseconds (avg. ParsingTime: " + (indexedParsingTime / indexedSites) + " milliseconds).");
+        System.out.println("INDEXER: Total IndexingTime: " + indexedIndexingTime + " milliseconds (avg. IndexingTime: " + (indexedIndexingTime / indexedSites) + " milliseconds).");
+        System.out.println("INDEXER: Total StorageTime: " + indexedStorageTime + " milliseconds (avg. StorageTime: " + (indexedStorageTime / indexedSites) + " milliseconds).");
+        if (urlSum == 0) urlSum = 1;
+        System.out.println("DHT: Received " + urlSum + " URLs in " + urlTimeSum + " ms. Blocked " + blockedURLSum + " URLs.");
+        System.out.println("DHT: " + urlTimeSum / urlSum + " milliseconds per URL.");
+        if (rwiSum == 0) rwiSum = 1;
+        System.out.println("DHT: Received " + rwiSum + " RWIs from " + wordsSum + " Words in " + rwiTimeSum + " ms. " + urlReqSum + " requested URLs.");
+        System.out.println("DHT: Blocked " + blockedRWISum + " RWIs before requesting URLs, because the URL hash was blacklisted.");
+        System.out.println("DHT: " + rwiTimeSum / rwiSum + " milliseconds per RWI.");
+        System.out.println("DHT: Rejected " + RWIRejectCount + " index transfers from " + RWIRejectPeerNames.size() + " peer names with " + RWIRejectPeerHashs.size() + " peer hashes.");
+        System.out.println("DHT: " + ((double)Math.round(DHTSendTraffic*100/(1024*1024)))/100 + " MegaBytes (" + DHTSendTraffic + " Bytes) of DHT transfer traffic.");
+        System.out.println("DHT: Sent " + DHTSendURLs + " URLs via DHT.");
+        System.out.println("DHT: DHT transfers sent to " + DHTPeerNames.size() + " peer names with " + DHTPeerHashs.size() + " peer hashes.");
+        System.out.println("DHT: Selected a total of " + DHTSelectionWordsCount + " words in " + DHTSelectionWordsTimeCount + " seconds (" + (float)DHTSelectionWordsCount/DHTSelectionWordsTimeCount + " words/s).");
+        System.out.println("DHT: Selected " + DHTSelectionTargetCount + " possible DHT targets (min. Distance: " + minDHTDist + " max. Distance: " + maxDHTDist + " avg. Distance: " + (avgDHTDist / DHTSelectionTargetCount) + ").");
Distance: " + ((double)avgDHTDist/DHTSelectionTargetCount)); + System.out.println("DHT: " + busyPeerCount + " times a targetpeer was too busy to accept a transfer."); + System.out.println("DHT: " + notEnoughDHTPeers + " times there were not enought targetpeers for the selected DHTChunk"); + System.out.println("DHT: IndexDistribution failed " + failedIndexDistributionCount + " times."); + System.out.println("RANKING: Transmitted " + rankingDistributionCount + " Rankingfiles in " + rankingDistributionTime + " seconds (" + rankingDistributionTime/rankingDistributionCount + " seconds/file)"); + System.out.println("RANKING: RankingDistribution failed " + rankingDistributionFailCount + " times."); + if (leftChildTwiceCount != 0) + System.out.println("ERRORS: tried " + leftChildTwiceCount + " times to create leftchild node twice in db"); + if (rightChildTwiceCount != 0) + System.out.println("ERRORS: tried " + rightChildTwiceCount + " times to create rightchild node twice in db"); + if (malformedURLCount != 0) + System.out.println("ERRORS: " + malformedURLCount + " MalformedURLExceptions accord."); + } + + public String getParserType() { + return parserType; + } + +}