diff --git a/source/de/anomic/crawler/Balancer.java b/source/de/anomic/crawler/Balancer.java
index 9b0ba7808..cbf9fb748 100644
--- a/source/de/anomic/crawler/Balancer.java
+++ b/source/de/anomic/crawler/Balancer.java
@@ -52,7 +52,6 @@
 import java.util.LinkedList;
 import java.util.Map;
 import java.util.TreeMap;
-import de.anomic.kelondro.kelondroAbstractRecords;
 import de.anomic.kelondro.kelondroBase64Order;
 import de.anomic.kelondro.kelondroEcoTable;
 import de.anomic.kelondro.kelondroIndex;
@@ -255,22 +254,15 @@ public class Balancer {
         return urlRAMStack.size() > 0 || urlFileStack.size() > 0 || domainStacksNotEmpty();
     }
 
-    public synchronized int size() {
-        int componentsize = urlFileStack.size() + urlRAMStack.size() + sizeDomainStacks();
-        if (componentsize != urlFileIndex.size()) {
-            // here is urlIndexFile.size() always smaller. why?
-            if (kelondroAbstractRecords.debugmode) {
-                serverLog.logWarning("BALANCER", "size wrong in " + stackname +
-                    " - urlFileIndex = " + urlFileIndex.size() +
-                    ", componentsize = " + componentsize +
-                    " = (urlFileStack = " + urlFileStack.size() +
-                    ", urlRAMStack = " + urlRAMStack.size() +
-                    ", sizeDomainStacks = " + sizeDomainStacks() + ")");
-            }
-            if ((componentsize == 0) && (urlFileIndex.size() > 0)) {
-                resetFileIndex();
-            }
-        }
+    public int size() {
+        int componentsize = urlFileIndex.size();
+        assert componentsize == urlFileStack.size() + urlRAMStack.size() + sizeDomainStacks() :
+            "size wrong in " + stackname +
+            " - urlFileIndex = " + urlFileIndex.size() +
+            ", componentsize = " + componentsize +
+            " = (urlFileStack = " + urlFileStack.size() +
+            ", urlRAMStack = " + urlRAMStack.size() +
+            ", sizeDomainStacks = " + sizeDomainStacks() + ")";
         return componentsize;
     }
 
@@ -504,7 +496,7 @@ public class Balancer {
                     15000, Math.max(
                         (crawlEntry.url().isLocal()) ? minimumLocalDelta : minimumGlobalDelta,
-                        plasmaSwitchboard.getSwitchboard().robots.crawlDelay(crawlEntry.url().getHost()) * 1000)
+                        plasmaSwitchboard.getSwitchboard().robots.crawlDelay(crawlEntry.url()) * 1000)
             );
             // prevent that that robots file can stop our indexer completely
             if (delta < genericDelta) {
                 // force a busy waiting here
diff --git a/source/de/anomic/crawler/RobotsTxt.java b/source/de/anomic/crawler/RobotsTxt.java
index a7ac207aa..0c0ea714a 100644
--- a/source/de/anomic/crawler/RobotsTxt.java
+++ b/source/de/anomic/crawler/RobotsTxt.java
@@ -78,6 +78,7 @@ public class RobotsTxt {
 
     kelondroMap robotsTable;
    private final File robotsTableFile;
+    //private static final HashSet loadedRobots = new HashSet(); // only for debugging
 
    public RobotsTxt(File robotsTableFile) {
        this.robotsTableFile = robotsTableFile;
@@ -115,24 +116,113 @@
         return this.robotsTable.size();
     }
 
-    private Entry getEntry(String hostName) {
+    @SuppressWarnings("unchecked")
+    private Entry getEntry(String urlHostPort, boolean fetchOnlineIfNotAvailableOrNotFresh) {
+        // this method will always return a non-null value
+        Entry robotsTxt4Host = null;
         try {
-            HashMap record = this.robotsTable.get(hostName);
-            if (record == null) return null;
-            return new Entry(hostName, record);
+            HashMap record = this.robotsTable.get(urlHostPort);
+            if (record != null) robotsTxt4Host = new Entry(urlHostPort, record);
         } catch (kelondroException e) {
             resetDatabase();
-            return null;
         } catch (IOException e) {
             resetDatabase();
-            return null;
         }
+
+        if (fetchOnlineIfNotAvailableOrNotFresh && (
+                robotsTxt4Host == null ||
+                robotsTxt4Host.getLoadedDate() == null ||
+                System.currentTimeMillis() - robotsTxt4Host.getLoadedDate().getTime() > 7*24*60*60*1000
+           )) synchronized(this) {
+            // if we have not found any data or the data is older than 7 days, we need to load it from the remote server
+
+            // check the robots table again for all threads that come here because they waited for another one
+            // to complete a download
+            try {
+                HashMap record = this.robotsTable.get(urlHostPort);
+                if (record != null) robotsTxt4Host = new Entry(urlHostPort, record);
+            } catch (kelondroException e) {
+                resetDatabase();
+            } catch (IOException e) {
+                resetDatabase();
+            }
+            if (robotsTxt4Host != null &&
+                robotsTxt4Host.getLoadedDate() != null &&
+                System.currentTimeMillis() - robotsTxt4Host.getLoadedDate().getTime() <= 7*24*60*60*1000) {
+                return robotsTxt4Host;
+            }
+
+            // generating the proper url to download the robots txt
+            yacyURL robotsURL = null;
+            try {
+                robotsURL = new yacyURL("http://" + urlHostPort + "/robots.txt", null);
+            } catch (MalformedURLException e) {
+                serverLog.logSevere("ROBOTS","Unable to generate robots.txt URL for host:port '" + urlHostPort + "'.");
+                robotsURL = null;
+            }
+
+            Object[] result = null;
+            if (robotsURL != null) {
+                serverLog.logFine("ROBOTS","Trying to download the robots.txt file from URL '" + robotsURL + "'.");
+                try {
+                    result = downloadRobotsTxt(robotsURL, 5, robotsTxt4Host);
+                } catch (Exception e) {
+                    result = null;
+                }
+            }
+            /*
+            assert !loadedRobots.contains(robotsURL.toNormalform(false, false)) :
+                "robots-url=" + robotsURL.toString() +
+                ", robots=" + ((result == null || result[DOWNLOAD_ROBOTS_TXT] == null) ? "NULL" : new String((byte[]) result[DOWNLOAD_ROBOTS_TXT])) +
+                ", robotsTxt4Host=" + ((robotsTxt4Host == null) ? "NULL" : robotsTxt4Host.getLoadedDate().toString());
"NULL" : robotsTxt4Host.getLoadedDate().toString()); + loadedRobots.add(robotsURL.toNormalform(false, false)); + */ + + if (result == null) { + // no robots.txt available, make an entry to prevent that the robots loading is done twice + if (robotsTxt4Host == null) { + // generate artificial entry + robotsTxt4Host = new Entry( + urlHostPort, + new ArrayList(), + new Date(), + new Date(), + null, + null, + new Integer(0)); + } else { + robotsTxt4Host.setLoadedDate(new Date()); + } + + // store the data into the robots DB + addEntry(robotsTxt4Host); + } else { + Object[] parserResult = robotsParser.parse((byte[]) result[DOWNLOAD_ROBOTS_TXT]); + ArrayList denyPath = (ArrayList) parserResult[0]; + if (((Boolean) result[DOWNLOAD_ACCESS_RESTRICTED]).booleanValue()) { + denyPath = new ArrayList(); + denyPath.add("/"); + } + + // store the data into the robots DB + robotsTxt4Host = addEntry( + urlHostPort, + denyPath, + new Date(), + (Date) result[DOWNLOAD_MODDATE], + (String) result[DOWNLOAD_ETAG], + (String) parserResult[1], + (Integer) parserResult[2]); + } + } + + return robotsTxt4Host; } - public int crawlDelay(String hostname) { - RobotsTxt.Entry robotsEntry = getEntry(hostname); - Integer hostDelay = (robotsEntry == null) ? null : robotsEntry.getCrawlDelay(); - if (hostDelay == null) return 0; else return hostDelay.intValue(); + public int crawlDelay(yacyURL theURL) { + String urlHostPort = getHostPort(theURL); + RobotsTxt.Entry robotsEntry = getEntry(urlHostPort, true); + return robotsEntry.getCrawlDelay(); } private Entry addEntry( @@ -268,11 +358,13 @@ public class RobotsTxt { return null; } - public Integer getCrawlDelay() { - if (this.mem.containsKey(CRAWL_DELAY)) { - return Integer.valueOf(this.mem.get(CRAWL_DELAY)); + public int getCrawlDelay() { + if (this.mem.containsKey(CRAWL_DELAY)) try { + return Integer.parseInt(this.mem.get(CRAWL_DELAY)); + } catch (NumberFormatException e) { + return 0; } - return null; + return 0; } public boolean isDisallowed(String path) { @@ -336,10 +428,8 @@ public class RobotsTxt { yacyURL sitemapURL = null; // generating the hostname:poart string needed to do a DB lookup - String urlHostPort = getHostPort(theURL); - - RobotsTxt.Entry robotsTxt4Host = this.getEntry(urlHostPort); - if (robotsTxt4Host == null) return null; + String urlHostPort = getHostPort(theURL); + RobotsTxt.Entry robotsTxt4Host = this.getEntry(urlHostPort, true); try { String sitemapUrlStr = robotsTxt4Host.getSitemap(); @@ -354,10 +444,8 @@ public class RobotsTxt { Integer crawlDelay = null; // generating the hostname:poart string needed to do a DB lookup - String urlHostPort = getHostPort(theURL); - - RobotsTxt.Entry robotsTxt4Host = getEntry(urlHostPort); - if (robotsTxt4Host == null) return null; + String urlHostPort = getHostPort(theURL); + RobotsTxt.Entry robotsTxt4Host = getEntry(urlHostPort, true); try { crawlDelay = robotsTxt4Host.getCrawlDelay(); @@ -366,90 +454,13 @@ public class RobotsTxt { return crawlDelay; } - //private static final HashSet loadedRobots = new HashSet(); // only for debugging - - @SuppressWarnings("unchecked") public boolean isDisallowed(yacyURL nexturl) { if (nexturl == null) throw new IllegalArgumentException(); - // generating the hostname:poart string needed to do a DB lookup + // generating the hostname:port string needed to do a DB lookup String urlHostPort = getHostPort(nexturl); RobotsTxt.Entry robotsTxt4Host = null; - synchronized(this) { - - // do a DB lookup to determine if the robots data is already available - robotsTxt4Host = 
-
-            // if we have not found any data or the data is older than 7 days, we need to load it from the remote server
-            if (
-                (robotsTxt4Host == null) ||
-                (robotsTxt4Host.getLoadedDate() == null) ||
-                (System.currentTimeMillis() - robotsTxt4Host.getLoadedDate().getTime() > 7*24*60*60*1000)
-            ) {
-
-                // generating the proper url to download the robots txt
-                yacyURL robotsURL = null;
-                try {
-                    robotsURL = new yacyURL(nexturl.getProtocol(), nexturl.getHost(), getPort(nexturl), "/robots.txt");
-                } catch (MalformedURLException e) {
-                    serverLog.logSevere("ROBOTS","Unable to generate robots.txt URL for URL '" + nexturl.toString() + "'.");
-                    return false;
-                }
-
-                Object[] result = null;
-                serverLog.logFine("ROBOTS","Trying to download the robots.txt file from URL '" + robotsURL + "'.");
-                try {
-                    result = downloadRobotsTxt(robotsURL, 5, robotsTxt4Host);
-                } catch (Exception e) {
-                    result = null;
-                }
-                /*
-                assert !loadedRobots.contains(robotsURL.toNormalform(false, false)) :
-                    "robots-url=" + robotsURL.toString() +
-                    ", robots=" + ((result == null || result[DOWNLOAD_ROBOTS_TXT] == null) ? "NULL" : new String((byte[]) result[DOWNLOAD_ROBOTS_TXT])) +
-                    ", robotsTxt4Host=" + ((robotsTxt4Host == null) ? "NULL" : robotsTxt4Host.getLoadedDate().toString());
-                loadedRobots.add(robotsURL.toNormalform(false, false));
-                */
-
-                if (result == null) {
-                    // no robots.txt available, make an entry to prevent that the robots loading is done twice
-                    if (robotsTxt4Host == null) {
-                        // generate artificial entry
-                        robotsTxt4Host = new Entry(
-                                urlHostPort,
-                                new ArrayList(),
-                                new Date(),
-                                new Date(),
-                                null,
-                                null,
-                                new Integer(0));
-                    } else {
-                        robotsTxt4Host.setLoadedDate(new Date());
-                    }
-
-                    // store the data into the robots DB
-                    addEntry(robotsTxt4Host);
-                } else {
-                    Object[] parserResult = robotsParser.parse((byte[]) result[DOWNLOAD_ROBOTS_TXT]);
-                    ArrayList denyPath = (ArrayList) parserResult[0];
-                    if (((Boolean) result[DOWNLOAD_ACCESS_RESTRICTED]).booleanValue()) {
-                        denyPath = new ArrayList();
-                        denyPath.add("/");
-                    }
-
-                    // store the data into the robots DB
-                    robotsTxt4Host = addEntry(
-                            urlHostPort,
-                            denyPath,
-                            new Date(),
-                            (Date) result[DOWNLOAD_MODDATE],
-                            (String) result[DOWNLOAD_ETAG],
-                            (String) parserResult[1],
-                            (Integer) parserResult[2]);
-                }
-            }
-        }
-
+        robotsTxt4Host = getEntry(urlHostPort, true);
         return robotsTxt4Host.isDisallowed(nexturl.getFile());
     }
 
diff --git a/source/de/anomic/kelondro/kelondroBLOBHeap.java b/source/de/anomic/kelondro/kelondroBLOBHeap.java
index 9c58f73be..4370b4a02 100755
--- a/source/de/anomic/kelondro/kelondroBLOBHeap.java
+++ b/source/de/anomic/kelondro/kelondroBLOBHeap.java
@@ -92,7 +92,9 @@
 
             // read length of the following record without the length of the record size bytes
             reclen = file.readInt();
-
+            assert reclen > 0;
+            if (reclen == 0) break loop; // very bad file inconsistency
+
             // read key
             file.readFully(key);
 
@@ -104,7 +106,7 @@
             // check if this record is empty
             if (key == null || key[0] == 0) {
                 // it is an empty record, store to free list
-                free.add(new Long[]{new Long(seek), new Long(reclen)});
+                if (reclen > 0) free.add(new Long[]{new Long(seek), new Long(reclen)});
             } else {
                 // store key and access address of entry in index
                 try {
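
A note on the Balancer hunk: crawlDelay() now receives the full yacyURL instead of only the host name, and its result feeds a clamped delay computation. The Crawl-delay from robots.txt (given in seconds) is converted to milliseconds, raised to at least the configured minimum delta, and capped at 15 seconds so that a hostile or misconfigured robots.txt cannot stall the indexer. A minimal sketch of that arithmetic; the class name and sample values are illustrative, not taken from the patch:

    public class CrawlDelayClamp {
        public static void main(String[] args) {
            long minimumDelta = 500;      // stand-in for minimumLocalDelta or minimumGlobalDelta
            long robotsDelaySeconds = 10; // Crawl-delay reported by robots.txt; 0 when absent
            // same shape as the patched expression: min(15000, max(minimum, robots * 1000))
            long genericDelta = Math.min(15000, Math.max(minimumDelta, robotsDelaySeconds * 1000));
            System.out.println(genericDelta); // 10000: robots.txt wins over the minimum, stays under the 15 s cap
        }
    }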
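
A note on the RobotsTxt hunks: getEntry() now guarantees a non-null Entry whenever fetching is requested, which is why crawlDelay(), getSitemap() and isDisallowed() could drop their null checks. Internally it follows a check/lock/re-check sequence: an unsynchronized table lookup first, then a synchronized block that looks the table up again before downloading, so threads that blocked on the lock reuse the download that completed while they waited. The sketch below reduces that control flow to a skeleton; RobotsCacheSketch, RobotsEntrySketch and fetchFromRemote() are hypothetical stand-ins for RobotsTxt, Entry and downloadRobotsTxt(), and only the seven-day freshness window is taken from the patch:

    import java.util.Date;
    import java.util.HashMap;
    import java.util.Map;

    class RobotsEntrySketch {
        final Date loadedDate;
        RobotsEntrySketch(Date loadedDate) { this.loadedDate = loadedDate; }
    }

    class RobotsCacheSketch {
        private static final long MAX_AGE = 7L * 24 * 60 * 60 * 1000; // seven days, as in the patch
        private final Map<String, RobotsEntrySketch> table = new HashMap<String, RobotsEntrySketch>();

        RobotsEntrySketch getEntry(String hostPort, boolean fetchIfMissingOrStale) {
            RobotsEntrySketch entry;
            synchronized (table) { entry = table.get(hostPort); } // cheap lookup, no download lock
            if (!fetchIfMissingOrStale || isFresh(entry)) return entry;
            synchronized (this) {
                // re-check: another thread may have completed the download while we waited here
                synchronized (table) { entry = table.get(hostPort); }
                if (isFresh(entry)) return entry;
                entry = fetchFromRemote(hostPort); // even a failed download yields a dated entry
                synchronized (table) { table.put(hostPort, entry); }
            }
            return entry;
        }

        private static boolean isFresh(RobotsEntrySketch e) {
            return e != null && e.loadedDate != null
                    && System.currentTimeMillis() - e.loadedDate.getTime() <= MAX_AGE;
        }

        private RobotsEntrySketch fetchFromRemote(String hostPort) {
            // stand-in for downloadRobotsTxt() + robotsParser.parse(); recording a
            // load date even for failures prevents a retry on every single call
            return new RobotsEntrySketch(new Date());
        }

        public static void main(String[] args) {
            RobotsCacheSketch cache = new RobotsCacheSketch();
            // with fetching enabled the result is never null, so callers need no null checks
            System.out.println(cache.getEntry("example.org:80", true).loadedDate != null);
        }
    }

Like the patch, the sketch stores a dated entry even when the download fails, so an unreachable host is not re-fetched for every URL that is checked against it.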
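
A note on the kelondroBLOBHeap hunks: the initialization scan walks the heap file record by record, and the length read from the file is the only thing that advances the position, so a zero length means a corrupt file in which the loop could never make progress (and a zero-length hole is also useless in the free list, hence the reclen > 0 guard in the second hunk). The sketch below shows the failure mode on a simplified [int length][payload] layout; the real format also carries a key, which is omitted here. Run without -ea the break guard stops the scan, with -ea the assert fires first, mirroring the patch:

    import java.io.File;
    import java.io.IOException;
    import java.io.RandomAccessFile;

    class HeapScanSketch {
        static int countRecords(RandomAccessFile file) throws IOException {
            int records = 0;
            loop: while (file.getFilePointer() + 4 <= file.length()) {
                final long seek = file.getFilePointer();
                final int reclen = file.readInt();   // record length, excluding these 4 bytes
                assert reclen > 0 : "zero-length record at offset " + seek;
                if (reclen == 0) break loop;         // corrupt file: stop instead of spinning forever
                file.skipBytes(reclen);              // jump over the payload to the next record
                records++;
            }
            return records;
        }

        public static void main(String[] args) throws IOException {
            File f = File.createTempFile("heap", ".blob");
            f.deleteOnExit();
            RandomAccessFile raf = new RandomAccessFile(f, "rw");
            raf.writeInt(3); raf.write(new byte[]{1, 2, 3}); // one valid record
            raf.writeInt(0);                                 // corrupt: zero record length
            raf.seek(0);
            System.out.println(countRecords(raf));           // 1: the scan stops at the bad record
            raf.close();
        }
    }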