From f3baaca920b124a69d1af22af930586b4034c100 Mon Sep 17 00:00:00 2001 From: orbiter Date: Tue, 22 Mar 2011 09:34:10 +0000 Subject: [PATCH] - enhancements to DNS IP caching and crawler speed - bugfixes (NPEs) git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7619 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/IndexControlRWIs_p.html | 4 +- htroot/IndexImportWikimedia_p.java | 40 +++------ htroot/PerformanceMemory_p.java | 2 +- source/de/anomic/crawler/Balancer.java | 4 +- source/net/yacy/cora/protocol/Domains.java | 88 +++++++++---------- source/net/yacy/document/Condenser.java | 5 +- source/net/yacy/document/Document.java | 5 +- .../document/importer/MediawikiImporter.java | 4 +- 8 files changed, 66 insertions(+), 86 deletions(-) diff --git a/htroot/IndexControlRWIs_p.html b/htroot/IndexControlRWIs_p.html index 77d756878..ce73b400e 100644 --- a/htroot/IndexControlRWIs_p.html +++ b/htroot/IndexControlRWIs_p.html @@ -41,13 +41,13 @@
Index Deletion







- +
diff --git a/htroot/IndexImportWikimedia_p.java b/htroot/IndexImportWikimedia_p.java index 17581de59..68e5d9cce 100644 --- a/htroot/IndexImportWikimedia_p.java +++ b/htroot/IndexImportWikimedia_p.java @@ -23,11 +23,9 @@ // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA import java.io.File; -import java.net.MalformedURLException; import net.yacy.cora.protocol.RequestHeader; import net.yacy.document.importer.MediawikiImporter; -import net.yacy.kelondro.logging.Log; import de.anomic.search.Switchboard; import de.anomic.server.serverObjects; @@ -57,33 +55,17 @@ public class IndexImportWikimedia_p { } else { if (post.containsKey("file")) { final File sourcefile = new File(post.get("file")); - //final String name = sourcefile.getName(); // i.e. dewiki-20090311-pages-articles.xml.bz2 - /* - if (!name.endsWith("pages-articles.xml.bz2")) { - prop.put("import", 0); - prop.put("import_status", 1); - prop.put("import_status_message", "file name must end with 'pages-articles.xml.bz2'"); - return prop; - } - */ - try { - MediawikiImporter.job = new MediawikiImporter(sourcefile, sb.surrogatesInPath); - MediawikiImporter.job.start(); - prop.put("import", 1); - prop.put("import_thread", "started"); - prop.put("import_dump", MediawikiImporter.job.source()); - prop.put("import_count", 0); - prop.put("import_speed", 0); - prop.put("import_runningHours", 0); - prop.put("import_runningMinutes", 0); - prop.put("import_remainingHours", 0); - prop.put("import_remainingMinutes", 0); - } catch (MalformedURLException e) { - Log.logException(e); - prop.put("import", 0); - prop.put("import_status", 1); - prop.put("import_status_message", e.getMessage()); - } + MediawikiImporter.job = new MediawikiImporter(sourcefile, sb.surrogatesInPath); + MediawikiImporter.job.start(); + prop.put("import", 1); + prop.put("import_thread", "started"); + prop.put("import_dump", MediawikiImporter.job.source()); + prop.put("import_count", 0); + prop.put("import_speed", 0); + prop.put("import_runningHours", 0); + prop.put("import_runningMinutes", 0); + prop.put("import_remainingHours", 0); + prop.put("import_remainingMinutes", 0); } return prop; } diff --git a/htroot/PerformanceMemory_p.java b/htroot/PerformanceMemory_p.java index ef50be35c..31578f8fe 100644 --- a/htroot/PerformanceMemory_p.java +++ b/htroot/PerformanceMemory_p.java @@ -196,7 +196,7 @@ public class PerformanceMemory_p { // other caching structures prop.putNum("namecacheHit.size", Domains.nameCacheHitSize()); prop.putNum("namecacheMiss.size", Domains.nameCacheMissSize()); - prop.putNum("namecache.noCache", Domains.nameCacheNoCachingListSize()); + prop.putNum("namecache.noCache", 0); prop.putNum("blacklistcache.size", Switchboard.urlBlacklist.blacklistCacheSize()); prop.putNum("searchevent.size", SearchEventCache.size()); prop.putNum("searchevent.hit", SearchEventCache.cacheHit); diff --git a/source/de/anomic/crawler/Balancer.java b/source/de/anomic/crawler/Balancer.java index 52ae85e4d..0ab788020 100644 --- a/source/de/anomic/crawler/Balancer.java +++ b/source/de/anomic/crawler/Balancer.java @@ -453,13 +453,13 @@ public class Balancer { Log.logInfo("BALANCER", "forcing crawl-delay of " + sleeptime + " milliseconds for " + crawlEntry.url().getHost() + ": " + Latency.waitingRemainingExplain(crawlEntry.url(), minimumLocalDelta, minimumGlobalDelta) + ", top.size() = " + top.size() + ", delayed.size() = " + delayed.size() + ", domainStacks.size() = " + domainStacks.size() + ", domainStacksInitSize = " + this.domStackInitSize); long loops = sleeptime / 1000; long rest = sleeptime % 1000; - if (loops < 2) { + if (loops < 3) { rest = rest + 1000 * loops; loops = 0; } if (rest > 0) {try {this.wait(rest); } catch (final InterruptedException e) {}} for (int i = 0; i < loops; i++) { - Log.logInfo("BALANCER", "waiting for " + crawlEntry.url().getHost() + ": " + ((loops - i) * 3) + " seconds remaining..."); + Log.logInfo("BALANCER", "waiting for " + crawlEntry.url().getHost() + ": " + (loops - i) + " seconds remaining..."); try {this.wait(1000); } catch (final InterruptedException e) {} } } diff --git a/source/net/yacy/cora/protocol/Domains.java b/source/net/yacy/cora/protocol/Domains.java index 0b752bf8d..135eec8a0 100644 --- a/source/net/yacy/cora/protocol/Domains.java +++ b/source/net/yacy/cora/protocol/Domains.java @@ -51,15 +51,14 @@ public class Domains { private static final String LOCAL_PATTERNS = "10\\..*,127\\..*,172\\.(1[6-9]|2[0-9]|3[0-1])\\..*,169\\.254\\..*,192\\.168\\..*,localhost"; private static final int MAX_NAME_CACHE_HIT_SIZE = 20000; private static final int MAX_NAME_CACHE_MISS_SIZE = 20000; - private static final int MAX_NAME_NO_CACHING_LIST_SIZE = 20000; private static final int CONCURRENCY_LEVEL = Runtime.getRuntime().availableProcessors() + 1; // a dns cache private static final ARC NAME_CACHE_HIT = new ConcurrentARC(MAX_NAME_CACHE_HIT_SIZE, CONCURRENCY_LEVEL); private static final ARC NAME_CACHE_MISS = new ConcurrentARC(MAX_NAME_CACHE_MISS_SIZE, CONCURRENCY_LEVEL); - private static final ARC NAME_CACHE_NO_CACHING_LIST = new ConcurrentARC(MAX_NAME_NO_CACHING_LIST_SIZE, CONCURRENCY_LEVEL); - public static List nameCacheNoCachingPatterns = Collections.synchronizedList(new LinkedList()); - public static final List LOCALHOST_PATTERNS = makePatterns(LOCAL_PATTERNS); + private static final ConcurrentHashMap LOOKUP_SYNC = new ConcurrentHashMap(); + private static List nameCacheNoCachingPatterns = Collections.synchronizedList(new LinkedList()); + private static final List LOCALHOST_PATTERNS = makePatterns(LOCAL_PATTERNS); /** * ! ! ! A T T E N T I O N A T T E N T I O N A T T E N T I O N ! ! ! @@ -496,46 +495,59 @@ public class Domains { // try to resolve host by doing a name cache lookup ip = NAME_CACHE_HIT.get(host); - if (ip != null) return ip; - - if (NAME_CACHE_MISS.containsKey(host)) return null; + if (ip != null) { + //System.out.println("DNSLOOKUP-CACHE-HIT(CONC) " + host); + return ip; + } + if (NAME_CACHE_MISS.containsKey(host)) { + //System.out.println("DNSLOOKUP-CACHE-MISS(CONC) " + host); + return null; + } // call dnsResolveNetBased(host) using concurrency to interrupt execution in case of a time-out - try { - boolean doCaching = true; - ip = InetAddress.getByName(host); //TimeoutRequest.getByName(host, 1000); // this makes the DNS request to backbone - if ((ip == null) || - (ip.isLoopbackAddress()) || - (NAME_CACHE_NO_CACHING_LIST.containsKey(host)) - ) { - doCaching = false; - } else { - if (matchesList(host, nameCacheNoCachingPatterns)) { - NAME_CACHE_NO_CACHING_LIST.put(host, PRESENT); - doCaching = false; - } + final Object sync_obj_new = new Object(); + Object sync_obj = LOOKUP_SYNC.putIfAbsent(host, sync_obj_new); + if (sync_obj == null) sync_obj = sync_obj_new; + synchronized (sync_obj) { + // now look again if the host is in the cache where it may be meanwhile because of the synchronization + ip = NAME_CACHE_HIT.get(host); + if (ip != null) { + //System.out.println("DNSLOOKUP-CACHE-HIT(SYNC) " + host); + return ip; + } + if (NAME_CACHE_MISS.containsKey(host)) { + //System.out.println("DNSLOOKUP-CACHE-MISS(SYNC) " + host); + return null; } - if (doCaching && ip != null) { - + // do the dns lookup on the dns server + //if (!matchesList(host, nameCacheNoCachingPatterns)) System.out.println("DNSLOOKUP " + host); + try { + ip = InetAddress.getByName(host); //TimeoutRequest.getByName(host, 1000); // this makes the DNS request to backbone + } catch (final UnknownHostException e) { + // add new entries + NAME_CACHE_MISS.put(host, PRESENT); + LOOKUP_SYNC.remove(host); + return null; + } + + if ((ip != null) && + (!ip.isLoopbackAddress()) && + (!matchesList(host, nameCacheNoCachingPatterns)) + ) { // add new entries NAME_CACHE_HIT.put(host, ip); } + LOOKUP_SYNC.remove(host); return ip; - } catch (final UnknownHostException e) { - // remove old entries - flushMissNameCache(); - - // add new entries - NAME_CACHE_MISS.put(host, PRESENT); } - return null; } private final static Pattern dotPattern = Pattern.compile("\\."); - private static final InetAddress parseInetAddress(final String ip) { + private static final InetAddress parseInetAddress(String ip) { if (ip == null || ip.length() < 8) return null; + if (ip.equals("0:0:0:0:0:0:0:1%0")) ip = "127.0.0.1"; final String[] ips = dotPattern.split(ip); if (ips.length != 4) return null; final byte[] ipb = new byte[4]; @@ -567,22 +579,6 @@ public class Domains { return NAME_CACHE_MISS.size(); } - /** - * Returns the number of entries in the nameCacheNoCachingList list - * - * @return int The number of entries in the nameCacheNoCachingList list - */ - public static int nameCacheNoCachingListSize() { - return NAME_CACHE_NO_CACHING_LIST.size(); - } - - /** - * Removes old entries from the dns miss cache - */ - public static void flushMissNameCache() { - if (NAME_CACHE_MISS.size() > MAX_NAME_CACHE_MISS_SIZE) NAME_CACHE_MISS.clear(); - } - private static String localHostName = "127.0.0.1"; private static Set localHostAddresses = new HashSet(); private static Set localHostNames = new HashSet(); diff --git a/source/net/yacy/document/Condenser.java b/source/net/yacy/document/Condenser.java index d8c65958d..620bf762d 100644 --- a/source/net/yacy/document/Condenser.java +++ b/source/net/yacy/document/Condenser.java @@ -201,9 +201,12 @@ public final class Condenser { // images final Iterator j = document.getImages().values().iterator(); ImageEntry ientry; + MultiProtocolURI url; while (j.hasNext()) { ientry = j.next(); - insertTextToWords(ientry.url().toNormalform(false, false), 99, flag_cat_hasimage, RESULT_FLAGS, false, meaningLib); + url = ientry.url(); + if (url == null) continue; + insertTextToWords(url.toNormalform(false, false), 99, flag_cat_hasimage, RESULT_FLAGS, false, meaningLib); insertTextToWords(ientry.alt(), 99, flag_cat_hasimage, RESULT_FLAGS, true, meaningLib); } diff --git a/source/net/yacy/document/Document.java b/source/net/yacy/document/Document.java index f9c2fa904..9300e1ba5 100644 --- a/source/net/yacy/document/Document.java +++ b/source/net/yacy/document/Document.java @@ -481,7 +481,7 @@ dc_rights final Map v = new HashMap(); final Iterator i = links.iterator(); Object o; - MultiProtocolURI url; + MultiProtocolURI url = null; String u; int pos; loop: while (i.hasNext()) @@ -495,8 +495,9 @@ dc_rights url = ((ImageEntry) o).url(); else { assert false; - continue; + continue loop; } + if (url == null) continue loop; u = url.toNormalform(true, true); if ((pos = u.toLowerCase().indexOf("http://", 7)) > 0) { i.remove(); diff --git a/source/net/yacy/document/importer/MediawikiImporter.java b/source/net/yacy/document/importer/MediawikiImporter.java index 570c0268f..8fc1719e3 100644 --- a/source/net/yacy/document/importer/MediawikiImporter.java +++ b/source/net/yacy/document/importer/MediawikiImporter.java @@ -90,7 +90,7 @@ public class MediawikiImporter extends Thread implements Importer { private String hostport, urlStub; - public MediawikiImporter(File sourcefile, File targetdir) throws MalformedURLException { + public MediawikiImporter(File sourcefile, File targetdir) { this.sourcefile = sourcefile; this.docsize = sourcefile.length(); this.approxdocs = (int) (this.docsize * (long) docspermbinxmlbz2 / 1024L / 1024L); @@ -762,8 +762,6 @@ public class MediawikiImporter extends Thread implements Importer { mi.join(); } catch (InterruptedException e) { Log.logException(e); - } catch (IOException e) { - Log.logException(e); } }