You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
364 lines
20 KiB
364 lines
20 KiB
//LogParserPLASMA.java
|
|
//-------------------------------------
|
|
//part of YACY
|
|
//(C) by Michael Peter Christen; mc@anomic.de
|
|
//first published on http://www.anomic.de
|
|
//Frankfurt, Germany, 2004
|
|
//
|
|
//This file ist contributed by Matthias Soehnholz
|
|
//last major change: $LastChangedDate$ by $LastChangedBy$
|
|
//Revision: $LastChangedRevision$
|
|
//
|
|
//This program is free software; you can redistribute it and/or modify
|
|
//it under the terms of the GNU General Public License as published by
|
|
//the Free Software Foundation; either version 2 of the License, or
|
|
//(at your option) any later version.
|
|
//
|
|
//This program is distributed in the hope that it will be useful,
|
|
//but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
//GNU General Public License for more details.
|
|
//
|
|
//You should have received a copy of the GNU General Public License
|
|
//along with this program; if not, write to the Free Software
|
|
//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
//
|
|
//Using this software in any meaning (reading, learning, copying, compiling,
|
|
//running) means that you agree that the Author(s) is (are) not responsible
|
|
//for cost, loss of data or any harm that may be caused directly or indirectly
|
|
//by usage of this softare or this documentation. The usage of this software
|
|
//is on your own risk. The installation and usage (starting/running) of this
|
|
//software may allow other people or application to access your computer and
|
|
//any attached devices and is highly dependent on the configuration of the
|
|
//software which must be done by the user of the software; the author(s) is
|
|
//(are) also not responsible for proper configuration and usage of the
|
|
//software, even if provoked by documentation provided together with
|
|
//the software.
|
|
//
|
|
//Any changes to this file according to the GPL as documented in the file
|
|
//gpl.txt aside this file in the shipment you received can be done to the
|
|
//lines that follows this copyright notice here, but changes must not be
|
|
//done inside the copyright notive above. A re-distribution must contain
|
|
//the intact and unchanged copyright notice.
|
|
//Contributions and changes to the program code must be marked as such.
|
|
|
|
package de.anomic.server.logging.logParsers;
|
|
|
|
import java.util.HashSet;
|
|
import java.util.Hashtable;
|
|
import java.util.regex.Matcher;
|
|
import java.util.regex.Pattern;
|
|
|
|
public class LogParserPLASMA implements LogParser{
|
|
|
|
private final double parserVersion = 0.1;
|
|
private final String parserType = "PLASMA";
|
|
|
|
//RegExp for LogLevel I
|
|
private static Pattern i1 = Pattern.compile("Received (\\d*) URLs from peer [\\w-_]{12}:[\\w-_]*/[\\w.-]* in (\\d*) ms, Blocked (\\d*) URLs");
|
|
private static Pattern i2 = Pattern.compile("Received (\\d*) Entries (\\d*) Words \\[[\\w-_]{12} .. [\\w-_]{12}\\]/[\\w.-]* from [\\w-_]{12}:[\\w-_]*/[\\w.-]*, processed in (\\d*) milliseconds, requesting (\\d*)/(\\d*) URLs, blocked (\\d*) RWIs");
|
|
private static Pattern i2_2 = Pattern.compile("Received (\\d*) Entries (\\d*) Words \\[[\\w-_]{12} .. [\\w-_]{12}\\]/[\\w.-]* from [\\w-_]{12}:[\\w-_]*, processed in (\\d*) milliseconds, requesting (\\d*)/(\\d*) URLs, blocked (\\d*) RWIs");
|
|
private static Pattern i3 = Pattern.compile("Index transfer of (\\d*) words \\[[\\w-_]{12} .. [\\w-_]{12}\\] to peer ([\\w-_]*):([\\w-_]{12}) in (\\d*) seconds successful \\((\\d*) words/s, (\\d*) Bytes\\)");
|
|
private static Pattern i4 = Pattern.compile("Index transfer of (\\d*) entries (\\d*) words \\[[\\w-_]{12} .. [\\w-_]{12}\\] and (\\d*) URLs to peer ([\\w-_]*):([\\w-_]{12}) in (\\d*) seconds successful \\((\\d*) words/s, (\\d*) Bytes\\)");
|
|
private static Pattern i5 = Pattern.compile("Selected \\w* DHT target peer ([\\w-_]*):([\\w-_]{12}), distance = ([\\w.-]*)");
|
|
private static Pattern i6 = Pattern.compile("Rejecting RWIs from peer ([\\w-_]{12}):([\\w-_]*)/([\\w.]*) ([\\w. ]*)");
|
|
private static Pattern i7 = Pattern.compile("DHT distribution: transfer to peer [\\w-]* finished.");
|
|
private static Pattern i8 = Pattern.compile("Index selection of (\\d*) words \\[[\\w-_]{12} .. [\\w-_]{12}\\] in (\\d*) seconds");
|
|
private static Pattern i9 = Pattern.compile("RankingDistribution - transmitted file [\\w-:.\\\\]* to [\\w.]*:\\d* successfully in (\\d)* seconds");
|
|
private static Pattern i10 = Pattern.compile("RankingDistribution - error transmitting file");
|
|
private static Pattern i11 = Pattern.compile("Peer [\\w-_]*:[\\w-_]{12} is busy\\. Waiting \\d* ms\\.");
|
|
//private static Pattern i12 = Pattern.compile("\\*Indexed \\d* words in URL [\\w:.&/%-~$\u00A7@=]* \\[[\\w-_]{12}\\]");
|
|
private static Pattern i13 = Pattern.compile("WROTE HEADER for |LOCALCRAWL\\[\\d*, \\d*, \\d*, \\d*\\]|REJECTED WRONG STATUS TYPE");
|
|
//RegExp for LogLevel W
|
|
private static Pattern w1 = Pattern.compile("found not enough \\(\\d*\\) peers for distribution");
|
|
private static Pattern w2 = Pattern.compile("Transfer to peer ([\\w-_]*):([\\w-_]{12}) failed:'(\\w*)'");
|
|
//RegExp for LogLevel E
|
|
private static Pattern e1 = Pattern.compile("INTERNAL ERROR AT plasmaCrawlLURL:store:de.anomic.kelondro.kelondroException: tried to create (\\w*) node twice in db");
|
|
private static Pattern e2 = Pattern.compile("INTERNAL ERROR [\\w./: ]* java.net.MalformedURLException");
|
|
|
|
private Matcher m;
|
|
//RegExp for advancedParser
|
|
//private Pattern adv1 = Pattern.compile("\\*Indexed (\\d*) words in URL [\\w:.&?/%-=]* \\[[\\w-_]{12}\\]\\n\\tDescription: ([\\w- ]*)\\n\\tMimeType: ([\\w-_/]*) \\| Size: (\\d*) bytes \\| Anchors: (\\d*)\\n\\tStackingTime: (\\d*) ms \\| ParsingTime: (\\d*) ms \\| IndexingTime: (\\d*) ms \\| StorageTime: (\\d*) ms");
|
|
private Pattern adv1 = Pattern.compile("\\*Indexed (\\d*) words in URL [\\w:.&/%-~$\u00A7@=]* \\[[\\w-_]{12}\\][\\r\\n]*\\tDescription: ([\\w-\\.,:!='\"|/+@() ]*)[\\r\\n]*\\tMimeType: ([\\w-_~/]*) \\| Size: (\\d*) bytes \\| Anchors: (\\d*)[\\r\\n]*\\tStackingTime:[ ]*(\\d*) ms \\| ParsingTime:[ ]*(\\d*) ms \\| IndexingTime: (\\d*) ms \\| StorageTime: (\\d*) ms");
|
|
|
|
private int urlSum=0;
|
|
private int urlReqSum=0;
|
|
private int blockedURLSum=0;
|
|
private int wordsSum=0;
|
|
private int rwiSum=0;
|
|
private int blockedRWISum=0;
|
|
private long urlTimeSum=0;
|
|
private long rwiTimeSum=0;
|
|
private long DHTSendTraffic=0;
|
|
private int DHTSendURLs=0;
|
|
private int RWIRejectCount=0;
|
|
private HashSet RWIRejectPeerNames = new HashSet();
|
|
private HashSet RWIRejectPeerHashs = new HashSet();
|
|
private HashSet DHTPeerNames = new HashSet();
|
|
private HashSet DHTPeerHashs = new HashSet();
|
|
private int DHTSelectionTargetCount = 0;
|
|
private int DHTSelectionWordsCount = 0;
|
|
private int DHTSelectionWordsTimeCount = 0;
|
|
private double minDHTDist = 1;
|
|
private double maxDHTDist = 0;
|
|
private double avgDHTDist = 0;
|
|
private int busyPeerCount = 0;
|
|
private int notEnoughDHTPeers = 0;
|
|
private int failedIndexDistributionCount = 0;
|
|
private int leftChildTwiceCount = 0;
|
|
private int rightChildTwiceCount = 0;
|
|
private int rankingDistributionCount = 0;
|
|
private int rankingDistributionTime = 0;
|
|
private int rankingDistributionFailCount = 0;
|
|
private int malformedURLCount = 0;
|
|
private int indexedSites = 0;
|
|
private int indexedWordSum = 0;
|
|
private int indexedSiteSizeSum = 0;
|
|
private int indexedAnchorsCount = 0;
|
|
private int indexedStackingTime = 0;
|
|
private int indexedParsingTime = 0;
|
|
private int indexedIndexingTime = 0;
|
|
private int indexedStorageTime = 0;
|
|
|
|
public int parse(String logLevel, String logLine) {
|
|
if (logLevel.equals("INFO")){
|
|
m = i1.matcher (logLine);
|
|
|
|
if (m.find ()) {
|
|
//System.out.println(m.group(1) + " " + m.group(2) + " " + m.group(3));
|
|
urlSum += Integer.parseInt(m.group(1));
|
|
urlTimeSum += Integer.parseInt(m.group(2));
|
|
blockedURLSum += Integer.parseInt(m.group(3));
|
|
return 0;
|
|
}
|
|
m = i2.matcher (logLine);
|
|
|
|
if (m.find ()) {
|
|
rwiSum += Integer.parseInt(m.group(1));
|
|
wordsSum += Integer.parseInt(m.group(2));
|
|
rwiTimeSum += Integer.parseInt(m.group(3));
|
|
urlReqSum += Integer.parseInt(m.group(4));
|
|
blockedRWISum += Integer.parseInt(m.group(6));
|
|
return 0;
|
|
}
|
|
m = i2_2.matcher (logLine);
|
|
|
|
if (m.find ()) {
|
|
rwiSum += Integer.parseInt(m.group(1));
|
|
wordsSum += Integer.parseInt(m.group(2));
|
|
rwiTimeSum += Integer.parseInt(m.group(3));
|
|
urlReqSum += Integer.parseInt(m.group(4));
|
|
blockedRWISum += Integer.parseInt(m.group(6));
|
|
return 0;
|
|
}
|
|
m = i3.matcher (logLine);
|
|
|
|
if (m.find ()) {
|
|
DHTSendTraffic += Integer.parseInt(m.group(6));
|
|
DHTPeerNames.add(m.group(2));
|
|
DHTPeerHashs.add(m.group(3));
|
|
return 0;
|
|
}
|
|
m = i4.matcher (logLine);
|
|
|
|
if (m.find ()) {
|
|
DHTSendTraffic += Integer.parseInt(m.group(8));
|
|
DHTSendURLs += Integer.parseInt(m.group(3));
|
|
DHTPeerNames.add(m.group(4));
|
|
DHTPeerHashs.add(m.group(5));
|
|
return 0;
|
|
}
|
|
m = i5.matcher (logLine);
|
|
|
|
if (m.find ()) {
|
|
minDHTDist = Math.min(minDHTDist, Double.parseDouble(m.group(3)));
|
|
maxDHTDist = Math.max(maxDHTDist, Double.parseDouble(m.group(3)));
|
|
avgDHTDist += Double.parseDouble(m.group(3));
|
|
DHTSelectionTargetCount++;
|
|
return 0;
|
|
}
|
|
m = i6.matcher (logLine);
|
|
|
|
if (m.find ()) {
|
|
RWIRejectPeerNames.add(m.group(2));
|
|
RWIRejectPeerHashs.add(m.group(1));
|
|
RWIRejectCount++;
|
|
return 0;
|
|
}
|
|
m = i7.matcher (logLine);
|
|
|
|
if (m.find ()) {
|
|
return 0;
|
|
}
|
|
m = i8.matcher (logLine);
|
|
|
|
if (m.find ()) {
|
|
DHTSelectionWordsCount += Double.parseDouble(m.group(1));
|
|
DHTSelectionWordsTimeCount += Double.parseDouble(m.group(2));
|
|
return 0;
|
|
}
|
|
m = i9.matcher (logLine);
|
|
|
|
if (m.find ()) {
|
|
rankingDistributionCount++;
|
|
rankingDistributionTime += Integer.parseInt(m.group(1));
|
|
return 0;
|
|
}
|
|
m = i10.matcher (logLine);
|
|
|
|
if (m.find ()) {
|
|
rankingDistributionFailCount++;
|
|
return 0;
|
|
}
|
|
m = i11.matcher (logLine);
|
|
|
|
if (m.find ()) {
|
|
busyPeerCount++;
|
|
return 0;
|
|
}
|
|
// m = i12.matcher (logLine);
|
|
//
|
|
// if (m.find ()) {
|
|
// return 3;
|
|
// }
|
|
m = i13.matcher (logLine);
|
|
|
|
if (m.find ()) {
|
|
return 0;
|
|
}
|
|
m = adv1.matcher (logLine);
|
|
|
|
if (m.find ()) {
|
|
indexedSites++;
|
|
indexedWordSum += Integer.parseInt(m.group(1));
|
|
indexedSiteSizeSum += Integer.parseInt(m.group(4));
|
|
indexedAnchorsCount += Integer.parseInt(m.group(5));
|
|
indexedStackingTime += Integer.parseInt(m.group(6));
|
|
indexedParsingTime += Integer.parseInt(m.group(7));
|
|
indexedIndexingTime += Integer.parseInt(m.group(8));
|
|
indexedStorageTime += Integer.parseInt(m.group(9));
|
|
}
|
|
|
|
} else if (logLevel.equals("WARNING")){
|
|
m = w1.matcher (logLine);
|
|
|
|
if (m.find ()) {
|
|
notEnoughDHTPeers++;
|
|
return 0;
|
|
}
|
|
m = w2.matcher (logLine);
|
|
|
|
if (m.find ()) {
|
|
failedIndexDistributionCount++;
|
|
return 0;
|
|
}
|
|
} else if (logLevel.equals("SEVERE")){
|
|
m = e1.matcher (logLine);
|
|
|
|
if (m.find ()) {
|
|
if (m.group(1).equals("leftchild")) leftChildTwiceCount++;
|
|
else if (m.group(1).equals("rightchild")) rightChildTwiceCount++;
|
|
return 0;
|
|
}
|
|
m = e2.matcher (logLine);
|
|
|
|
if (m.find ()) {
|
|
malformedURLCount++;
|
|
return 0;
|
|
}
|
|
}
|
|
return -1;
|
|
}
|
|
|
|
public Hashtable getResults() {
|
|
Hashtable results = new Hashtable();
|
|
results.put("version", new Double(parserVersion));
|
|
results.put("urlSum", new Integer(urlSum));
|
|
results.put("urlReqSum", new Integer(urlReqSum));
|
|
results.put("blockedURLSum", new Integer(blockedURLSum));
|
|
results.put("wordsSum", new Integer(wordsSum));
|
|
results.put("rwiSum", new Integer(rwiSum));
|
|
results.put("blockedRWISum", new Integer(blockedRWISum));
|
|
results.put("urlTimeSum", new Long(urlTimeSum));
|
|
results.put("rwiTimeSum", new Long(rwiTimeSum));
|
|
results.put("DHTSendTraffic", new Long(DHTSendTraffic));
|
|
results.put("DHTSendURLs", new Integer(DHTSendURLs));
|
|
results.put("RWIRejectCount", new Integer(RWIRejectCount));
|
|
results.put("RWIRejectPeerNames", RWIRejectPeerNames);
|
|
results.put("RWIRejectPeerHashs", RWIRejectPeerHashs);
|
|
results.put("DHTPeerNames", DHTPeerNames);
|
|
results.put("DHTPeerHashs", DHTPeerHashs);
|
|
results.put("DHTSelectionTargetCount", new Integer(DHTSelectionTargetCount));
|
|
results.put("DHTSelectionWordsCount", new Integer(DHTSelectionWordsCount));
|
|
results.put("DHTSelectionWordsTimeCount", new Integer(DHTSelectionWordsTimeCount));
|
|
results.put("minDHTDist", new Double(minDHTDist));
|
|
results.put("maxDHTDist", new Double(maxDHTDist));
|
|
results.put("avgDHTDist", new Double(avgDHTDist));
|
|
results.put("busyPeerCount", new Integer(busyPeerCount));
|
|
results.put("notEnoughDHTPeers", new Integer(notEnoughDHTPeers));
|
|
results.put("failedIndexDistributionCount", new Integer(failedIndexDistributionCount));
|
|
results.put("leftChildTwiceCount", new Integer(leftChildTwiceCount));
|
|
results.put("rightChildTwiceCount", new Integer(rightChildTwiceCount));
|
|
results.put("rankingDistributionCount", new Integer(rankingDistributionCount));
|
|
results.put("rankingDistributionTime", new Integer(rankingDistributionTime));
|
|
results.put("rankingDistributionFailCount", new Integer(rankingDistributionFailCount));
|
|
results.put("malformedURLCount", new Integer(malformedURLCount));
|
|
results.put("indexedSites", new Integer(indexedSites));
|
|
results.put("indexedWordSum", new Integer(indexedWordSum));
|
|
results.put("indexedSiteSizeSum", new Integer(indexedSiteSizeSum));
|
|
results.put("indexedAnchorsCount", new Integer(indexedAnchorsCount));
|
|
results.put("indexedStackingTime", new Integer(indexedStackingTime));
|
|
results.put("indexedParsingTime", new Integer(indexedParsingTime));
|
|
results.put("indexedIndexingTime", new Integer(indexedIndexingTime));
|
|
results.put("indexedStorageTime", new Integer(indexedStorageTime));
|
|
return results;
|
|
}
|
|
|
|
public String getParserType() {
|
|
return parserType;
|
|
}
|
|
|
|
public double getParserVersion() {
|
|
return parserVersion;
|
|
}
|
|
|
|
public void printResults() {
|
|
if(rankingDistributionCount == 0) rankingDistributionCount = 1;
|
|
if(DHTSelectionWordsTimeCount == 0) DHTSelectionWordsTimeCount = 1;
|
|
if(indexedSites != 0) indexedSites++;
|
|
System.out.println("INDEXER: Indexed " + indexedSites + " sites in " + (indexedStackingTime + indexedParsingTime + indexedIndexingTime + indexedStorageTime) + " milliseconds.");
|
|
System.out.println("INDEXER: Indexed " + indexedWordSum + " words on " + indexedSites + " sites. (avg. words per site: " + (indexedWordSum / indexedSites) + ").");
|
|
System.out.println("INDEXER: Total Size of indexed sites: " + indexedSiteSizeSum + " bytes (avg. size per site: " + (indexedSiteSizeSum / indexedSites) + " bytes).");
|
|
System.out.println("INDEXER: Total Number of Anchors found: " + indexedAnchorsCount + "(avg. Anchors per site: " + (indexedAnchorsCount / indexedSites) + ").");
|
|
System.out.println("INDEXER: Total StackingTime: " + indexedStackingTime + " milliseconds (avg. StackingTime: " + (indexedStackingTime / indexedSites) + " milliseconds).");
|
|
System.out.println("INDEXER: Total ParsingTime: " + indexedParsingTime + " milliseconds (avg. ParsingTime: " + (indexedParsingTime / indexedSites) + " milliseconds).");
|
|
System.out.println("INDEXER: Total IndexingTime: " + indexedIndexingTime + " milliseconds (avg. IndexingTime: " + (indexedIndexingTime / indexedSites) + " milliseconds).");
|
|
System.out.println("INDEXER: Total StorageTime: " + indexedStorageTime + " milliseconds (avg. StorageTime: " + (indexedStorageTime / indexedSites) + " milliseconds).");
|
|
if(urlSum != 0) urlSum++;
|
|
System.out.println("DHT: Recieved " + urlSum + " Urls in " + urlTimeSum + " ms. Blocked " + blockedURLSum + " URLs.");
|
|
System.out.println("DHT: " + urlTimeSum / urlSum + " milliseconds per URL.");
|
|
if(rwiSum != 0) rwiSum++;
|
|
System.out.println("DHT: Recieved " + rwiSum + " RWIs from " + wordsSum + " Words in " + rwiTimeSum + " ms. " + urlReqSum + " requested URLs.");
|
|
System.out.println("DHT: Blocked " + blockedRWISum + " RWIs before requesting URLs, because URL-Hash was blacklisted.");
|
|
System.out.println("DHT: " + rwiTimeSum / rwiSum + " milliseconds per RWI.");
|
|
System.out.println("DHT: Rejected " + RWIRejectCount + " Indextransfers from " + RWIRejectPeerNames.size() + " PeerNames with " + RWIRejectPeerHashs.size() + " PeerHashs.");
|
|
System.out.println("DHT: " + ((double)Math.round(DHTSendTraffic*100/(1024*1024)))/100 + " MegaBytes (" + DHTSendTraffic + " Bytes) of DHT-Transfertraffic.");
|
|
System.out.println("DHT: Sended " + DHTSendURLs + " URLs via DHT.");
|
|
System.out.println("DHT: DHT Transfers send to " + DHTPeerNames.size() + " Peernames with " + DHTPeerHashs.size() + " Peerhashs.");
|
|
System.out.println("DHT: Totally selected " + DHTSelectionWordsCount + " words in " + DHTSelectionWordsTimeCount + " seconds (" + (float)DHTSelectionWordsCount/DHTSelectionWordsTimeCount + " words/s)");
|
|
System.out.println("DHT: Selected " + DHTSelectionTargetCount + " possible DHT Targets (min. Distance: " + minDHTDist + " max. Distance: " + maxDHTDist + " avg. Distance: " + ((double)avgDHTDist/DHTSelectionTargetCount));
|
|
System.out.println("DHT: " + busyPeerCount + " times a targetpeer was too busy to accept a transfer.");
|
|
System.out.println("DHT: " + notEnoughDHTPeers + " times there were not enought targetpeers for the selected DHTChunk");
|
|
System.out.println("DHT: IndexDistribution failed " + failedIndexDistributionCount + " times.");
|
|
System.out.println("RANKING: Transmitted " + rankingDistributionCount + " Rankingfiles in " + rankingDistributionTime + " seconds (" + rankingDistributionTime/rankingDistributionCount + " seconds/file)");
|
|
System.out.println("RANKING: RankingDistribution failed " + rankingDistributionFailCount + " times.");
|
|
if (leftChildTwiceCount != 0)
|
|
System.out.println("ERRORS: tried " + leftChildTwiceCount + " times to create leftchild node twice in db");
|
|
if (rightChildTwiceCount != 0)
|
|
System.out.println("ERRORS: tried " + rightChildTwiceCount + " times to create rightchild node twice in db");
|
|
if (malformedURLCount != 0)
|
|
System.out.println("ERRORS: " + malformedURLCount + " MalformedURLExceptions accord.");
|
|
}
|
|
|
|
}
|