several bugfixes

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1971 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 19 years ago
parent 57fc0cf063
commit 7a650d0023

@ -392,7 +392,7 @@ public class IndexCreate_p {
prop.put("crawlProfiles_"+count+"_filter", profile.generalFilter());
prop.put("crawlProfiles_"+count+"_crawlingIfOlder", (profile.recrawlIfOlder() == Long.MAX_VALUE) ? "no re-crawl" : ""+profile.recrawlIfOlder());
prop.put("crawlProfiles_"+count+"_crawlingDomFilterDepth", (profile.domFilterDepth() == Integer.MAX_VALUE) ? "inactive" : ""+profile.domFilterDepth());
prop.put("crawlProfiles_"+count+"_crawlingDomFilterContent", profile.domNames(true));
prop.put("crawlProfiles_"+count+"_crawlingDomFilterContent", profile.domNames(true, 160));
prop.put("crawlProfiles_"+count+"_crawlingDomMaxPages", (profile.domMaxPages() == Integer.MAX_VALUE) ? "unlimited" : ""+profile.domMaxPages());
prop.put("crawlProfiles_"+count+"_withQuery", ((profile.crawlingQ()) ? 1 : 0));
prop.put("crawlProfiles_"+count+"_storeCache", ((profile.storeHTCache()) ? 1 : 0));

@ -416,10 +416,11 @@ public final class plasmaCrawlLURL extends plasmaURL {
// - author / copyright owner
// - keywords
// - phrasecount, total number of phrases
// - boolean: URL attributes
// - boolean: URL attributes (see Word-Entity definition)
// - boolean: appearance of bold and/or italics
// - int: # of outlinks to same domain
// - int: # of outlinks to outside domain
// - ETag: for re-crawl decision upon HEAD request
public Entry(URL url, String descr, Date moddate, Date loaddate, String referrerHash, int copyCount, boolean localNeed, int quality, String language, char doctype, int size, int wordCount) {
// create new entry and store it into database

@ -414,12 +414,24 @@ public class plasmaCrawlProfile {
}
domsCache.put(this.mem.get("handle"), doms);
}
public int domCount(String domain) {
public boolean grantedDomAppearance(String domain) {
int max = domFilterDepth();
if (max == Integer.MAX_VALUE) return true;
DomProfile dp = (DomProfile) doms.get(domain);
if (dp == null) {
return 0;
return 0 < max;
} else {
return dp.count < max;
}
}
public boolean grantedDomCount(String domain) {
int max = domMaxPages();
if (max == Integer.MAX_VALUE) return true;
DomProfile dp = (DomProfile) doms.get(domain);
if (dp == null) {
return 0 < max;
} else {
return dp.count;
return dp.count < max;
}
}
public int domSize() {
@ -429,7 +441,7 @@ public class plasmaCrawlProfile {
if (domFilterDepth() == Integer.MAX_VALUE) return true;
return doms.containsKey(domain);
}
public String domNames(boolean attr) {
public String domNames(boolean attr, int maxlength) {
Iterator domnamesi = doms.entrySet().iterator();
String domnames="";
Map.Entry ey;
@ -438,6 +450,10 @@ public class plasmaCrawlProfile {
ey = (Map.Entry) domnamesi.next();
dp = (DomProfile) ey.getValue();
domnames += ((String) ey.getKey()) + ((attr) ? ("/d=" + dp.depth + ",c=" + dp.count + " ") : " ");
if ((maxlength > 0) && (domnames.length() >= maxlength)) {
domnames = domnames.substring(0, maxlength-3) + "...";
break;
}
}
return domnames;
}

@ -323,7 +323,7 @@ public final class plasmaCrawlStacker {
}
// deny urls that do not match with the profile domain list
if (profile.domCount(nexturl.getHost()) == 0) {
if (!(profile.grantedDomAppearance(nexturl.getHost()))) {
reason = "denied_(no_match_with_domain_filter)";
this.log.logFine("URL '" + nexturlString + "' is not listed in granted domains. " +
"Stack processing time: " + (System.currentTimeMillis()-startTime));
@ -331,7 +331,7 @@ public final class plasmaCrawlStacker {
}
// deny urls that exceed allowed number of occurrences
if (profile.domCount(nexturl.getHost()) > profile.domMaxPages()) {
if (!(profile.grantedDomCount(nexturl.getHost()))) {
reason = "denied_(domain_count_exceeded)";
this.log.logFine("URL '" + nexturlString + "' appeared too often, a maximum of " + profile.domMaxPages() + " is allowed. "+
"Stack processing time: " + (System.currentTimeMillis()-startTime));

@ -172,11 +172,13 @@ public final class plasmaWordIndexAssortmentCluster {
int need = newContainer.size();
int selectedAssortment = testsize - 1;
while (selectedAssortment >= 0) {
spaces[selectedAssortment] = (assortments[selectedAssortment].get(wordHash) == null) ? (selectedAssortment + 1) : 0;
need -= spaces[selectedAssortment];
assert (need >= 0);
if (need == 0) break;
selectedAssortment = (need < selectedAssortment) ? need : selectedAssortment - 1;
if (selectedAssortment + 1 <= need) {
spaces[selectedAssortment] = (assortments[selectedAssortment].get(wordHash) == null) ? (selectedAssortment + 1) : 0;
need -= spaces[selectedAssortment];
assert (need >= 0);
if (need == 0) break;
}
selectedAssortment--;
}
if (need == 0) {
// we found spaces so that we can put in the newContainer into these spaces

Loading…
Cancel
Save