fixed some problems with crawl profiles

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1967 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 19 years ago
parent 4a5b5515b5
commit 59d52fb4a9

IndexCreate_p.java

@@ -381,8 +381,6 @@ public class IndexCreate_p {
         Iterator it = switchboard.profiles.profiles(true);
         plasmaCrawlProfile.entry profile;
         dark = true;
-        Iterator domnamesi;
-        String domnames;
         while (it.hasNext()) {
             profile = (plasmaCrawlProfile.entry) it.next();
             //table += profile.map().toString() + "<br>";
@@ -394,9 +392,7 @@ public class IndexCreate_p {
                 prop.put("crawlProfiles_"+count+"_filter", profile.generalFilter());
                 prop.put("crawlProfiles_"+count+"_crawlingIfOlder", (profile.recrawlIfOlder() == Long.MAX_VALUE) ? "no re-crawl" : ""+profile.recrawlIfOlder());
                 prop.put("crawlProfiles_"+count+"_crawlingDomFilterDepth", (profile.domFilterDepth() == Integer.MAX_VALUE) ? "inactive" : ""+profile.domFilterDepth());
-                domnamesi = profile.domNames();
-                domnames=""; while (domnamesi.hasNext()) domnames += ((String) domnamesi.next()) + ", ";
-                prop.put("crawlProfiles_"+count+"_crawlingDomFilterContent", domnames);
+                prop.put("crawlProfiles_"+count+"_crawlingDomFilterContent", profile.domNames(true));
                 prop.put("crawlProfiles_"+count+"_crawlingDomMaxPages", (profile.domMaxPages() == Integer.MAX_VALUE) ? "unlimited" : ""+profile.domMaxPages());
                 prop.put("crawlProfiles_"+count+"_withQuery", ((profile.crawlingQ()) ? 1 : 0));
                 prop.put("crawlProfiles_"+count+"_storeCache", ((profile.storeHTCache()) ? 1 : 0));
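Note on the servlet change above: the page no longer builds the comma-separated domain list itself, it delegates to profile.domNames(true), which (see the plasmaCrawlProfile.java diff below) annotates every domain with the depth at which it was first seen and with its hit counter. A minimal, self-contained sketch of that output format; the class name, domain names and numbers are invented example data, not YaCy API calls:

    import java.util.LinkedHashMap;
    import java.util.Map;

    // Sketch only: reproduces the "host/d=<depth>,c=<count>" format that domNames(true)
    // writes into the crawlingDomFilterContent table cell. Example data is made up.
    public class DomNamesFormatSketch {
        public static void main(String[] args) {
            Map<String, int[]> doms = new LinkedHashMap<String, int[]>(); // host -> {depth, count}
            doms.put("example.org", new int[]{1, 7});
            doms.put("wiki.example.net", new int[]{2, 3});

            StringBuilder domnames = new StringBuilder();
            for (Map.Entry<String, int[]> ey : doms.entrySet()) {
                int[] dp = ey.getValue();
                // attr == true appends depth and counter; attr == false would append the host only
                domnames.append(ey.getKey()).append("/d=").append(dp[0]).append(",c=").append(dp[1]).append(' ');
            }
            System.out.println(domnames); // example.org/d=1,c=7 wiki.example.net/d=2,c=3 (trailing blank)
        }
    }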

plasmaCrawlProfile.java

@@ -115,10 +115,10 @@ public class plasmaCrawlProfile {
     public class profileIterator implements Iterator {
         // the iterator iterates all keys, which are byte[] objects
         kelondroDyn.dynKeyIterator handleIterator;
-        entry next;
+        String lastkey;
         public profileIterator(boolean up) throws IOException {
             handleIterator = profileTable.keys(up, false);
-            next = null;
+            lastkey = null;
         }
         public boolean hasNext() {
             try {
@@ -130,16 +130,16 @@ public class plasmaCrawlProfile {
         }
         public Object next() {
             try {
-                return getEntry((String) handleIterator.next());
+                lastkey = (String) handleIterator.next();
+                return getEntry(lastkey);
             } catch (kelondroException e) {
                 resetDatabase();
                 return null;
             }
         }
         public void remove() {
-            if (next != null) try {
-                Object handle = next.handle();
-                if (handle != null) removeEntry((String) handle);
+            if (lastkey != null) try {
+                removeEntry(lastkey);
             } catch (kelondroException e) {
                 resetDatabase();
             }
@@ -221,7 +221,22 @@ public class plasmaCrawlProfile {
         }
     }
+    public class DomProfile {
+        public String referrer;
+        public int depth, count;
+        public DomProfile(String ref, int d) {
+            this.referrer = ref;
+            this.depth = d;
+            this.count = 1;
+        }
+        public void inc() {
+            this.count++;
+        }
+    }
     public class entry {
         // this is a simple record structure that hold all properties of a single crawl start
@@ -387,33 +402,44 @@ public class plasmaCrawlProfile {
             mem.put(propName, newValue);
             profileTable.set(handle(), mem);
         }
-        public void domInc(String domain) {
-            Integer c = (Integer) doms.get(domain);
-            if (c == null) {
+        public void domInc(String domain, String referrer, int depth) {
+            DomProfile dp = (DomProfile) doms.get(domain);
+            if (dp == null) {
                 // new domain
-                doms.put(domain, new Integer(1));
+                doms.put(domain, new DomProfile(referrer, depth));
             } else {
                 // increase counter
-                doms.put(domain, new Integer(c.intValue() + 1));
+                dp.inc();
+                doms.put(domain, dp);
             }
             domsCache.put(this.mem.get("handle"), doms);
         }
         public int domCount(String domain) {
-            Integer c = (Integer) doms.get(domain);
-            if (c == null) {
+            DomProfile dp = (DomProfile) doms.get(domain);
+            if (dp == null) {
                 return 0;
             } else {
-                return c.intValue();
+                return dp.count;
            }
         }
         public int domSize() {
             return doms.size();
         }
         public boolean domExists(String domain) {
+            if (domFilterDepth() == Integer.MAX_VALUE) return true;
             return doms.containsKey(domain);
         }
-        public Iterator domNames() {
-            return doms.keySet().iterator();
+        public String domNames(boolean attr) {
+            Iterator domnamesi = doms.entrySet().iterator();
+            String domnames = "";
+            Map.Entry ey;
+            DomProfile dp;
+            while (domnamesi.hasNext()) {
+                ey = (Map.Entry) domnamesi.next();
+                dp = (DomProfile) ey.getValue();
+                domnames += ((String) ey.getKey()) + ((attr) ? ("/d=" + dp.depth + ",c=" + dp.count + " ") : " ");
+            }
+            return domnames;
         }
     }
 }
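The core of the plasmaCrawlProfile.java change is that the per-domain Integer counter is replaced by a small DomProfile record that also remembers the referrer host and the crawl depth at which the domain was first seen; later hits only bump the counter. A standalone sketch of that bookkeeping, with illustrative names and test data rather than the real profile and its backing store:

    import java.util.HashMap;
    import java.util.Map;

    // Sketch of the DomProfile bookkeeping: the first URL seen for a host records
    // referrer and depth, every further URL for that host only increments the counter.
    public class DomProfileSketch {
        static class DomProfile {
            final String referrer; // host that first linked to this domain (may be null)
            final int depth;       // crawl depth at which the domain was first seen
            int count = 1;         // number of URLs stacked for this domain so far
            DomProfile(String referrer, int depth) { this.referrer = referrer; this.depth = depth; }
        }

        private final Map<String, DomProfile> doms = new HashMap<String, DomProfile>();

        void domInc(String domain, String referrer, int depth) {
            DomProfile dp = doms.get(domain);
            if (dp == null) doms.put(domain, new DomProfile(referrer, depth)); // new domain
            else dp.count++;                                                   // known domain: count only
        }

        int domCount(String domain) {
            DomProfile dp = doms.get(domain);
            return (dp == null) ? 0 : dp.count;
        }

        public static void main(String[] args) {
            DomProfileSketch p = new DomProfileSketch();
            p.domInc("example.org", "start.example.com", 1);
            p.domInc("example.org", "other.example.com", 2); // referrer/depth of the first sighting are kept
            System.out.println(p.domCount("example.org"));   // prints 2
        }
    }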

plasmaCrawlStacker.java

@@ -239,8 +239,14 @@ public final class plasmaCrawlStacker {
             return reason;
         }
         */
-        URL nexturl = null;
+        URL nexturl = null, referrerURL = null;
         if ((initiatorHash == null) || (initiatorHash.length() == 0)) initiatorHash = plasmaURL.dummyHash;
+        try {
+            referrerURL = new URL(referrerString);
+        } catch (MalformedURLException e) {
+            referrerURL = null;
+            referrerString = null;
+        }
         String referrerHash = (referrerString==null)?null:plasmaURL.urlHash(referrerString);
         try {
             nexturl = new URL(nexturlString);
@@ -313,7 +319,7 @@ public final class plasmaCrawlStacker {
         // add domain to profile domain list
         if (currentdepth <= profile.domFilterDepth()) {
-            profile.domInc(nexturl.getHost());
+            profile.domInc(nexturl.getHost(), (referrerURL == null) ? null : referrerURL.getHost().toLowerCase(), currentdepth);
         }
         // deny urls that do not match with the profile domain list
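Usage note on the stacker change: the referrer string is now parsed before it is used, so a malformed referrer degrades to "no referrer" (null hash, null host passed to domInc) instead of breaking the stacking of the URL. A small sketch of that guard; hashOf() is a hypothetical stand-in for plasmaURL.urlHash(), not the real hashing code:

    import java.net.MalformedURLException;
    import java.net.URL;

    // Sketch of the referrer guard: an unparsable referrer must not abort stacking,
    // it simply means "no referrer". hashOf() is a placeholder, not the YaCy hash.
    public class ReferrerGuardSketch {
        static String hashOf(String url) { return Integer.toHexString(url.hashCode()); }

        public static void main(String[] args) {
            String referrerString = "not a url";
            URL referrerURL;
            try {
                referrerURL = new URL(referrerString);
            } catch (MalformedURLException e) {
                referrerURL = null;
                referrerString = null; // also drop the string so no hash is computed from garbage
            }
            String referrerHash = (referrerString == null) ? null : hashOf(referrerString);
            String referrerHost = (referrerURL == null) ? null : referrerURL.getHost().toLowerCase();
            System.out.println(referrerHash + " / " + referrerHost); // prints: null / null
        }
    }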
