*) Bugfix for "-UNRESOLVED_PATTERN-" Bug on IndexCreateWWWLocalQueue_p.html and "urlEntry.url() == null" Bug

- The log message for "urlEntry.url() == null" is now logged at info level instead of severe
   - IndexCreateWWWLocalQueue_p.html now detects null entries while looping through the list and removes them automatically
   See: 
   - http://www.yacy-forum.de/viewtopic.php?t=532#8781
   - http://www.yacy-forum.de/viewtopic.php?t=639
   - http://www.yacy-forum.de/viewtopic.php?t=1071
   - http://www.yacy-forum.de/viewtopic.php?t=338
   - http://www.yacy-forum.de/viewtopic.php?t=980

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@640 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
theli 20 years ago
parent 33aaffbfc6
commit 732a107160

@ -113,13 +113,13 @@ public class IndexCreateIndexingQueue_p {
pcentry = (plasmaSwitchboardQueue.Entry) entryList.get(i); pcentry = (plasmaSwitchboardQueue.Entry) entryList.get(i);
if ((pcentry != null)&&(pcentry.url() != null)) { if ((pcentry != null)&&(pcentry.url() != null)) {
initiator = yacyCore.seedDB.getConnected(pcentry.initiator()); initiator = yacyCore.seedDB.getConnected(pcentry.initiator());
prop.put("indexing-queue_list_"+i+"_dark", ((dark) ? 1 : 0)); prop.put("indexing-queue_list_"+entryCount+"_dark", ((dark) ? 1 : 0));
prop.put("indexing-queue_list_"+i+"_initiator", ((initiator == null) ? "proxy" : initiator.getName())); prop.put("indexing-queue_list_"+entryCount+"_initiator", ((initiator == null) ? "proxy" : initiator.getName()));
prop.put("indexing-queue_list_"+i+"_depth", pcentry.depth()); prop.put("indexing-queue_list_"+entryCount+"_depth", pcentry.depth());
prop.put("indexing-queue_list_"+i+"_modified", (pcentry.responseHeader() == null) ? "" : daydate(pcentry.responseHeader().lastModified())); prop.put("indexing-queue_list_"+entryCount+"_modified", (pcentry.responseHeader() == null) ? "" : daydate(pcentry.responseHeader().lastModified()));
prop.put("indexing-queue_list_"+i+"_anchor", (pcentry.anchorName()==null)?"":pcentry.anchorName()); prop.put("indexing-queue_list_"+entryCount+"_anchor", (pcentry.anchorName()==null)?"":pcentry.anchorName());
prop.put("indexing-queue_list_"+i+"_url", pcentry.normalizedURLString()); prop.put("indexing-queue_list_"+entryCount+"_url", pcentry.normalizedURLString());
prop.put("indexing-queue_list_"+i+"_size", Status.bytesToString(pcentry.size())); prop.put("indexing-queue_list_"+entryCount+"_size", Status.bytesToString(pcentry.size()));
dark = !dark; dark = !dark;
entryCount++; entryCount++;
} }

@ -84,33 +84,38 @@ public class IndexCreateWWWLocalQueue_p {
} }
} }
int stackSize = switchboard.urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE); int showNum = 0, stackSize = switchboard.urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE);
if (stackSize == 0) { if (stackSize == 0) {
prop.put("crawler-queue", 0); prop.put("crawler-queue", 0);
} else { } else {
prop.put("crawler-queue", 1); prop.put("crawler-queue", 1);
plasmaCrawlNURL.Entry[] crawlerList = switchboard.urlPool.noticeURL.top(plasmaCrawlNURL.STACK_TYPE_CORE, 100); plasmaCrawlNURL.Entry[] crawlerList = switchboard.urlPool.noticeURL.top(plasmaCrawlNURL.STACK_TYPE_CORE, 120);
prop.put("crawler-queue_num", stackSize);//num Entries
prop.put("crawler-queue_show-num", crawlerList.length); //showin sjow-num most recent
plasmaCrawlNURL.Entry urle; plasmaCrawlNURL.Entry urle;
boolean dark = true; boolean dark = true;
yacySeed initiator; yacySeed initiator;
int i; int i;
for (i = 0; i < crawlerList.length; i++) { for (i = 0; (i < crawlerList.length) && (showNum < 100); i++) {
urle = crawlerList[i]; urle = crawlerList[i];
if (urle != null) { if ((urle != null)&&(urle.url()!=null)) {
initiator = yacyCore.seedDB.getConnected(urle.initiator()); initiator = yacyCore.seedDB.getConnected(urle.initiator());
prop.put("crawler-queue_list_"+i+"_dark", ((dark) ? 1 : 0) ); prop.put("crawler-queue_list_"+showNum+"_dark", ((dark) ? 1 : 0) );
prop.put("crawler-queue_list_"+i+"_initiator", ((initiator == null) ? "proxy" : initiator.getName()) ); prop.put("crawler-queue_list_"+showNum+"_initiator", ((initiator == null) ? "proxy" : initiator.getName()) );
prop.put("crawler-queue_list_"+i+"_depth", urle.depth()); prop.put("crawler-queue_list_"+showNum+"_depth", urle.depth());
prop.put("crawler-queue_list_"+i+"_modified", daydate(urle.loaddate()) ); prop.put("crawler-queue_list_"+showNum+"_modified", daydate(urle.loaddate()) );
prop.put("crawler-queue_list_"+i+"_anchor", urle.name()); prop.put("crawler-queue_list_"+showNum+"_anchor", urle.name());
prop.put("crawler-queue_list_"+i+"_url", urle.url()); prop.put("crawler-queue_list_"+showNum+"_url", urle.url());
prop.put("crawler-queue_list_"+i+"_hash", urle.hash()); prop.put("crawler-queue_list_"+showNum+"_hash", urle.hash());
dark = !dark; dark = !dark;
showNum++;
} else {
stackSize--;
} }
} }
prop.put("crawler-queue_list", i); prop.put("crawler-queue_list", showNum);
prop.put("crawler-queue_num", stackSize);//num Entries
prop.put("crawler-queue_show-num", showNum); //showin sjow-num most recent
} }
// return rewrite properties // return rewrite properties

@ -46,6 +46,7 @@ package de.anomic.plasma;
import java.io.File; import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.net.URL; import java.net.URL;
import java.util.ArrayList;
import java.util.Date; import java.util.Date;
import java.util.HashSet; import java.util.HashSet;
import java.util.Iterator; import java.util.Iterator;
@ -281,17 +282,19 @@ public class plasmaCrawlNURL extends plasmaURL {
} }
private Entry[] top(kelondroStack stack, int count) { private Entry[] top(kelondroStack stack, int count) {
// this is a filo - top // this is a filo - top
if (count > stack.size()) count = stack.size(); if (count > stack.size()) count = stack.size();
Entry[] list = new Entry[count]; ArrayList list = new ArrayList(count);
try { try {
for (int i = 0; i < count; i++) { for (int i = 0; i < count; i++) {
list[i] = new Entry(new String(stack.top(i)[0])); byte[] hash = stack.top(i)[0];
} if (hash == null) continue;
return list; list.add(new Entry(new String(hash)));
}
return (Entry[])list.toArray(new Entry[list.size()]);
} catch (IOException e) { } catch (IOException e) {
return null; return null;
} }
} }
public synchronized Entry getEntry(String hash) { public synchronized Entry getEntry(String hash) {
@ -349,7 +352,7 @@ public class plasmaCrawlNURL extends plasmaURL {
public String toString() { public String toString() {
StringBuffer str = new StringBuffer(); StringBuffer str = new StringBuffer();
str.append("hash: ").append(url==null ? "null" : urlHash(url)).append(" | ") str.append("hash: ").append(hash==null ? "null" : hash).append(" | ")
.append("initiator: ").append(initiator==null?"null":initiator).append(" | ") .append("initiator: ").append(initiator==null?"null":initiator).append(" | ")
.append("url: ").append(url==null?"null":url.toString()).append(" | ") .append("url: ").append(url==null?"null":url.toString()).append(" | ")
.append("referrer: ").append((referrer == null) ? dummyHash : referrer).append(" | ") .append("referrer: ").append((referrer == null) ? dummyHash : referrer).append(" | ")

@ -733,8 +733,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
plasmaCrawlNURL.Entry urlEntry = urlPool.noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_CORE); plasmaCrawlNURL.Entry urlEntry = urlPool.noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_CORE);
String stats = "LOCALCRAWL[" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_OVERHANG) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]"; String stats = "LOCALCRAWL[" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_OVERHANG) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]";
if ((urlEntry.url() == null) || (urlEntry.url().toString().length() < 10)) { if ((urlEntry.url() == null) || (urlEntry.url().toString().length() < 10)) {
log.logSevere(stats + ": urlEntry.url() == null. URL-Hash: " + ((urlEntry.hash()==null)?"Unknown":urlEntry.hash())); log.logInfo(stats + ": URL with hash " + ((urlEntry.hash()==null)?"Unknown":urlEntry.hash()) + " already removed from queue.");
return true; return true;
} }
String profileHandle = urlEntry.profileHandle(); String profileHandle = urlEntry.profileHandle();
//System.out.println("DEBUG plasmaSwitchboard.processCrawling: profileHandle = " + profileHandle + ", urlEntry.url = " + urlEntry.url()); //System.out.println("DEBUG plasmaSwitchboard.processCrawling: profileHandle = " + profileHandle + ", urlEntry.url = " + urlEntry.url());
@ -747,7 +747,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
log.logSevere(stats + ": LOST PROFILE HANDLE '" + urlEntry.profileHandle() + "' (must be internal error) for URL " + urlEntry.url()); log.logSevere(stats + ": LOST PROFILE HANDLE '" + urlEntry.profileHandle() + "' (must be internal error) for URL " + urlEntry.url());
return true; return true;
} }
log.logFine("LOCALCRAWL: url=" + urlEntry.url() + ", initiator=" + urlEntry.initiator() + log.logFine("LOCALCRAWL: URL=" + urlEntry.url() + ", initiator=" + urlEntry.initiator() +
", crawlOrder=" + ((profile.remoteIndexing()) ? "true" : "false") + ", depth=" + urlEntry.depth() + ", crawlDepth=" + profile.generalDepth() + ", filter=" + profile.generalFilter() + ", crawlOrder=" + ((profile.remoteIndexing()) ? "true" : "false") + ", depth=" + urlEntry.depth() + ", crawlDepth=" + profile.generalDepth() + ", filter=" + profile.generalFilter() +
", permission=" + ((yacyCore.seedDB == null) ? "undefined" : (((yacyCore.seedDB.mySeed.isSenior()) || (yacyCore.seedDB.mySeed.isPrincipal())) ? "true" : "false"))); ", permission=" + ((yacyCore.seedDB == null) ? "undefined" : (((yacyCore.seedDB.mySeed.isSenior()) || (yacyCore.seedDB.mySeed.isPrincipal())) ? "true" : "false")));

Loading…
Cancel
Save