Fixed several problems with crawl profiles: profileIterator.remove() now tracks the last iterated key instead of an unset entry; per-domain crawl statistics record the referring host and crawl depth (new DomProfile record); domNames() renders the domain list as a string with optional depth/count attributes; and the crawl stacker parses the referrer URL and passes its host to domInc().

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1967 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 19 years ago
parent 4a5b5515b5
commit 59d52fb4a9

@ -381,8 +381,6 @@ public class IndexCreate_p {
Iterator it = switchboard.profiles.profiles(true);
plasmaCrawlProfile.entry profile;
dark = true;
Iterator domnamesi;
String domnames;
while (it.hasNext()) {
profile = (plasmaCrawlProfile.entry) it.next();
//table += profile.map().toString() + "<br>";
@ -394,9 +392,7 @@ public class IndexCreate_p {
prop.put("crawlProfiles_"+count+"_filter", profile.generalFilter());
prop.put("crawlProfiles_"+count+"_crawlingIfOlder", (profile.recrawlIfOlder() == Long.MAX_VALUE) ? "no re-crawl" : ""+profile.recrawlIfOlder());
prop.put("crawlProfiles_"+count+"_crawlingDomFilterDepth", (profile.domFilterDepth() == Integer.MAX_VALUE) ? "inactive" : ""+profile.domFilterDepth());
domnamesi = profile.domNames();
domnames=""; while (domnamesi.hasNext()) domnames += ((String) domnamesi.next()) + ", ";
prop.put("crawlProfiles_"+count+"_crawlingDomFilterContent", domnames);
prop.put("crawlProfiles_"+count+"_crawlingDomFilterContent", profile.domNames(true));
prop.put("crawlProfiles_"+count+"_crawlingDomMaxPages", (profile.domMaxPages() == Integer.MAX_VALUE) ? "unlimited" : ""+profile.domMaxPages());
prop.put("crawlProfiles_"+count+"_withQuery", ((profile.crawlingQ()) ? 1 : 0));
prop.put("crawlProfiles_"+count+"_storeCache", ((profile.storeHTCache()) ? 1 : 0));

@ -115,10 +115,10 @@ public class plasmaCrawlProfile {
public class profileIterator implements Iterator {
// the iterator iterates all keys, which are byte[] objects
kelondroDyn.dynKeyIterator handleIterator;
entry next;
String lastkey;
public profileIterator(boolean up) throws IOException {
    // Walk the profile table's keys in the requested direction (up = ascending),
    // without rotation. Iteration state starts out empty.
    next = null;
    lastkey = null;
    handleIterator = profileTable.keys(up, false);
}
public boolean hasNext() {
try {
@ -130,16 +130,16 @@ public class plasmaCrawlProfile {
}
public Object next() {
    // Advance to the next profile entry. The key is remembered in lastkey so
    // that remove() can delete the element most recently returned.
    // (Stale pre-diff line "return getEntry((String) handleIterator.next());"
    // removed — it made the new statements unreachable.)
    try {
        lastkey = (String) handleIterator.next();
        return getEntry(lastkey);
    } catch (kelondroException e) {
        // underlying table is corrupt; reset it and signal exhaustion with null
        resetDatabase();
        return null;
    }
}
public void remove() {
    // Delete the entry most recently returned by next(); no-op before the
    // first call. (Stale pre-diff lines referencing the old "next" field
    // removed — they duplicated the new lastkey-based logic.)
    if (lastkey != null) try {
        removeEntry(lastkey);
    } catch (kelondroException e) {
        // table corrupt; recreate it rather than propagate
        resetDatabase();
    }
@ -221,7 +221,22 @@ public class plasmaCrawlProfile {
}
}
public class DomProfile {
    // Per-domain crawl statistics: the host that first referred us to this
    // domain, the crawl depth at which it was first entered, and how many
    // URLs from it have been seen so far.
    public String referrer;
    public int depth, count;

    public DomProfile(String referrerHost, int crawlDepth) {
        referrer = referrerHost;
        depth = crawlDepth;
        count = 1; // the first sighting counts as one
    }

    // Another URL from this domain was encountered.
    public void inc() {
        count += 1;
    }
}
public class entry {
// this is a simple record structure that hold all properties of a single crawl start
@ -387,33 +402,44 @@ public class plasmaCrawlProfile {
mem.put(propName, newValue);
profileTable.set(handle(), mem);
}
public void domInc(String domain) {
Integer c = (Integer) doms.get(domain);
if (c == null) {
public void domInc(String domain, String referrer, int depth) {
DomProfile dp = (DomProfile) doms.get(domain);
if (dp == null) {
// new domain
doms.put(domain, new Integer(1));
doms.put(domain, new DomProfile(referrer, depth));
} else {
// increase counter
doms.put(domain, new Integer(c.intValue() + 1));
dp.inc();
doms.put(domain, dp);
}
domsCache.put(this.mem.get("handle"), doms);
}
public int domCount(String domain) {
Integer c = (Integer) doms.get(domain);
if (c == null) {
DomProfile dp = (DomProfile) doms.get(domain);
if (dp == null) {
return 0;
} else {
return c.intValue();
return dp.count;
}
}
// Number of distinct domains recorded for this crawl profile so far.
public int domSize() {
return doms.size();
}
public boolean domExists(String domain) {
    // With the dom filter disabled (depth == Integer.MAX_VALUE) every domain
    // is treated as present; otherwise check the recorded domain map.
    return (domFilterDepth() == Integer.MAX_VALUE) || doms.containsKey(domain);
}
public Iterator domNames() {
return doms.keySet().iterator();
public String domNames(boolean attr) {
Iterator domnamesi = doms.entrySet().iterator();
String domnames="";
Map.Entry ey;
DomProfile dp;
while (domnamesi.hasNext()) {
ey = (Map.Entry) domnamesi.next();
dp = (DomProfile) ey.getValue();
domnames += ((String) ey.getKey()) + ((attr) ? ("/d=" + dp.depth + ",c=" + dp.count + " ") : " ");
}
return domnames;
}
}
}

@ -239,8 +239,14 @@ public final class plasmaCrawlStacker {
return reason;
}
*/
URL nexturl = null;
URL nexturl = null, referrerURL = null;
if ((initiatorHash == null) || (initiatorHash.length() == 0)) initiatorHash = plasmaURL.dummyHash;
try {
referrerURL = new URL(referrerString);
} catch (MalformedURLException e) {
referrerURL = null;
referrerString = null;
}
String referrerHash = (referrerString==null)?null:plasmaURL.urlHash(referrerString);
try {
nexturl = new URL(nexturlString);
@ -313,7 +319,7 @@ public final class plasmaCrawlStacker {
// add domain to profile domain list
if (currentdepth <= profile.domFilterDepth()) {
profile.domInc(nexturl.getHost());
profile.domInc(nexturl.getHost(), (referrerURL == null) ? null : referrerURL.getHost().toLowerCase(), currentdepth);
}
// deny urls that do not match with the profile domain list

Loading…
Cancel
Save