From 7a650d002314bd06c87820b3569dbe2dee45a191 Mon Sep 17 00:00:00 2001
From: orbiter
Date: Mon, 27 Mar 2006 16:45:29 +0000
Subject: [PATCH] several bugfixes

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1971 6c8d7289-2bf4-0310-a012-ef5d649a1542
---
Review notes (placed after the "---" separator, so they are not part of the
commit message):
 * The line structure of this patch has been restored. Indentation inside the
   hunks was lost in the copy this was rebuilt from and is reconstructed, so
   apply with "git apply --ignore-whitespace" (or "patch -l").
 * plasmaCrawlProfile.grantedDomAppearance(): the branch for an already known
   domain now tests dp.depth against domFilterDepth(). It previously tested
   dp.count, which is the value grantedDomCount() compares with domMaxPages().

 htroot/IndexCreate_p.java                     |  2 +-
 source/de/anomic/plasma/plasmaCrawlLURL.java  |  3 ++-
 .../de/anomic/plasma/plasmaCrawlProfile.java  | 24 +++++++++++++++----
 .../de/anomic/plasma/plasmaCrawlStacker.java  |  4 ++--
 .../plasmaWordIndexAssortmentCluster.java     | 12 ++++++----
 5 files changed, 32 insertions(+), 13 deletions(-)

diff --git a/htroot/IndexCreate_p.java b/htroot/IndexCreate_p.java
index 9423d9cae..bd7123495 100644
--- a/htroot/IndexCreate_p.java
+++ b/htroot/IndexCreate_p.java
@@ -392,7 +392,7 @@ public class IndexCreate_p {
             prop.put("crawlProfiles_"+count+"_filter", profile.generalFilter());
             prop.put("crawlProfiles_"+count+"_crawlingIfOlder", (profile.recrawlIfOlder() == Long.MAX_VALUE) ? "no re-crawl" : ""+profile.recrawlIfOlder());
             prop.put("crawlProfiles_"+count+"_crawlingDomFilterDepth", (profile.domFilterDepth() == Integer.MAX_VALUE) ? "inactive" : ""+profile.domFilterDepth());
-            prop.put("crawlProfiles_"+count+"_crawlingDomFilterContent", profile.domNames(true));
+            prop.put("crawlProfiles_"+count+"_crawlingDomFilterContent", profile.domNames(true, 160));
             prop.put("crawlProfiles_"+count+"_crawlingDomMaxPages", (profile.domMaxPages() == Integer.MAX_VALUE) ? "unlimited" : ""+profile.domMaxPages());
             prop.put("crawlProfiles_"+count+"_withQuery", ((profile.crawlingQ()) ? 1 : 0));
             prop.put("crawlProfiles_"+count+"_storeCache", ((profile.storeHTCache()) ? 1 : 0));
diff --git a/source/de/anomic/plasma/plasmaCrawlLURL.java b/source/de/anomic/plasma/plasmaCrawlLURL.java
index f868c3040..1d482122c 100644
--- a/source/de/anomic/plasma/plasmaCrawlLURL.java
+++ b/source/de/anomic/plasma/plasmaCrawlLURL.java
@@ -416,10 +416,11 @@ public final class plasmaCrawlLURL extends plasmaURL {
         // - author / copyright owner
         // - keywords
         // - phrasecount, total number of phrases
-        // - boolean: URL attributes
+        // - boolean: URL attributes (see Word-Entity definition)
         // - boolean: appearance of bold and/or italics
         // - int: # of outlinks to same domain
         // - int: # of outlinks to outside domain
+        // - ETag: for re-crawl decision upon HEAD request
 
         public Entry(URL url, String descr, Date moddate, Date loaddate, String referrerHash, int copyCount, boolean localNeed, int quality, String language, char doctype, int size, int wordCount) {
             // create new entry and store it into database
diff --git a/source/de/anomic/plasma/plasmaCrawlProfile.java b/source/de/anomic/plasma/plasmaCrawlProfile.java
index 6e5e5ee4c..470052bd9 100644
--- a/source/de/anomic/plasma/plasmaCrawlProfile.java
+++ b/source/de/anomic/plasma/plasmaCrawlProfile.java
@@ -414,12 +414,24 @@ public class plasmaCrawlProfile {
             }
             domsCache.put(this.mem.get("handle"), doms);
         }
-        public int domCount(String domain) {
+        public boolean grantedDomAppearance(String domain) {
+            int max = domFilterDepth();
+            if (max == Integer.MAX_VALUE) return true;
             DomProfile dp = (DomProfile) doms.get(domain);
             if (dp == null) {
-                return 0;
+                return 0 < max;
+            } else {
+                return dp.depth <= max;
+            }
+        }
+        public boolean grantedDomCount(String domain) {
+            int max = domMaxPages();
+            if (max == Integer.MAX_VALUE) return true;
+            DomProfile dp = (DomProfile) doms.get(domain);
+            if (dp == null) {
+                return 0 < max;
             } else {
-                return dp.count;
+                return dp.count < max;
             }
         }
         public int domSize() {
@@ -429,7 +441,7 @@ public class plasmaCrawlProfile {
             if (domFilterDepth() == Integer.MAX_VALUE) return true;
             return doms.containsKey(domain);
         }
-        public String domNames(boolean attr) {
+        public String domNames(boolean attr, int maxlength) {
             Iterator domnamesi = doms.entrySet().iterator();
             String domnames="";
             Map.Entry ey;
@@ -438,6 +450,10 @@ public class plasmaCrawlProfile {
                 ey = (Map.Entry) domnamesi.next();
                 dp = (DomProfile) ey.getValue();
                 domnames += ((String) ey.getKey()) + ((attr) ? ("/d=" + dp.depth + ",c=" + dp.count + " ") : " ");
+                if ((maxlength > 0) && (domnames.length() >= maxlength)) {
+                    domnames = domnames.substring(0, maxlength-3) + "...";
+                    break;
+                }
             }
             return domnames;
         }
diff --git a/source/de/anomic/plasma/plasmaCrawlStacker.java b/source/de/anomic/plasma/plasmaCrawlStacker.java
index d33a4e8dd..acb2e77bf 100644
--- a/source/de/anomic/plasma/plasmaCrawlStacker.java
+++ b/source/de/anomic/plasma/plasmaCrawlStacker.java
@@ -323,7 +323,7 @@ public final class plasmaCrawlStacker {
         }
 
         // deny urls that do not match with the profile domain list
-        if (profile.domCount(nexturl.getHost()) == 0) {
+        if (!(profile.grantedDomAppearance(nexturl.getHost()))) {
             reason = "denied_(no_match_with_domain_filter)";
             this.log.logFine("URL '" + nexturlString + "' is not listed in granted domains. " +
                     "Stack processing time: " + (System.currentTimeMillis()-startTime));
@@ -331,7 +331,7 @@ public final class plasmaCrawlStacker {
         }
 
         // deny urls that exceed allowed number of occurrences
-        if (profile.domCount(nexturl.getHost()) > profile.domMaxPages()) {
+        if (!(profile.grantedDomCount(nexturl.getHost()))) {
             reason = "denied_(domain_count_exceeded)";
             this.log.logFine("URL '" + nexturlString + "' appeared too often, a maximum of " + profile.domMaxPages() + " is allowed. "+
                     "Stack processing time: " + (System.currentTimeMillis()-startTime));
diff --git a/source/de/anomic/plasma/plasmaWordIndexAssortmentCluster.java b/source/de/anomic/plasma/plasmaWordIndexAssortmentCluster.java
index c37ca1e35..84f32ddc4 100644
--- a/source/de/anomic/plasma/plasmaWordIndexAssortmentCluster.java
+++ b/source/de/anomic/plasma/plasmaWordIndexAssortmentCluster.java
@@ -172,11 +172,13 @@ public final class plasmaWordIndexAssortmentCluster {
         int need = newContainer.size();
         int selectedAssortment = testsize - 1;
         while (selectedAssortment >= 0) {
-            spaces[selectedAssortment] = (assortments[selectedAssortment].get(wordHash) == null) ? (selectedAssortment + 1) : 0;
-            need -= spaces[selectedAssortment];
-            assert (need >= 0);
-            if (need == 0) break;
-            selectedAssortment = (need < selectedAssortment) ? need : selectedAssortment - 1;
+            if (selectedAssortment + 1 <= need) {
+                spaces[selectedAssortment] = (assortments[selectedAssortment].get(wordHash) == null) ? (selectedAssortment + 1) : 0;
+                need -= spaces[selectedAssortment];
+                assert (need >= 0);
+                if (need == 0) break;
+            }
+            selectedAssortment--;
         }
         if (need == 0) {
             // we found spaces so that we can put in the newContainer into these spaces