diff --git a/htroot/IndexCreate_p.java b/htroot/IndexCreate_p.java index 23d9ad514..9423d9cae 100644 --- a/htroot/IndexCreate_p.java +++ b/htroot/IndexCreate_p.java @@ -381,8 +381,6 @@ public class IndexCreate_p { Iterator it = switchboard.profiles.profiles(true); plasmaCrawlProfile.entry profile; dark = true; - Iterator domnamesi; - String domnames; while (it.hasNext()) { profile = (plasmaCrawlProfile.entry) it.next(); //table += profile.map().toString() + "
"; @@ -394,9 +392,7 @@ public class IndexCreate_p { prop.put("crawlProfiles_"+count+"_filter", profile.generalFilter()); prop.put("crawlProfiles_"+count+"_crawlingIfOlder", (profile.recrawlIfOlder() == Long.MAX_VALUE) ? "no re-crawl" : ""+profile.recrawlIfOlder()); prop.put("crawlProfiles_"+count+"_crawlingDomFilterDepth", (profile.domFilterDepth() == Integer.MAX_VALUE) ? "inactive" : ""+profile.domFilterDepth()); - domnamesi = profile.domNames(); - domnames=""; while (domnamesi.hasNext()) domnames += ((String) domnamesi.next()) + ", "; - prop.put("crawlProfiles_"+count+"_crawlingDomFilterContent", domnames); + prop.put("crawlProfiles_"+count+"_crawlingDomFilterContent", profile.domNames(true)); prop.put("crawlProfiles_"+count+"_crawlingDomMaxPages", (profile.domMaxPages() == Integer.MAX_VALUE) ? "unlimited" : ""+profile.domMaxPages()); prop.put("crawlProfiles_"+count+"_withQuery", ((profile.crawlingQ()) ? 1 : 0)); prop.put("crawlProfiles_"+count+"_storeCache", ((profile.storeHTCache()) ? 1 : 0)); diff --git a/source/de/anomic/plasma/plasmaCrawlProfile.java b/source/de/anomic/plasma/plasmaCrawlProfile.java index 31b6fd7b6..6e5e5ee4c 100644 --- a/source/de/anomic/plasma/plasmaCrawlProfile.java +++ b/source/de/anomic/plasma/plasmaCrawlProfile.java @@ -115,10 +115,10 @@ public class plasmaCrawlProfile { public class profileIterator implements Iterator { // the iterator iterates all keys, which are byte[] objects kelondroDyn.dynKeyIterator handleIterator; - entry next; + String lastkey; public profileIterator(boolean up) throws IOException { handleIterator = profileTable.keys(up, false); - next = null; + lastkey = null; } public boolean hasNext() { try { @@ -130,16 +130,16 @@ public class plasmaCrawlProfile { } public Object next() { try { - return getEntry((String) handleIterator.next()); + lastkey = (String) handleIterator.next(); + return getEntry(lastkey); } catch (kelondroException e) { resetDatabase(); return null; } } public void remove() { - if (next != null) try { - Object handle = next.handle(); - if (handle != null) removeEntry((String) handle); + if (lastkey != null) try { + removeEntry(lastkey); } catch (kelondroException e) { resetDatabase(); } @@ -221,7 +221,22 @@ public class plasmaCrawlProfile { } } - + public class DomProfile { + + public String referrer; + public int depth, count; + + public DomProfile(String ref, int d) { + this.referrer = ref; + this.depth = d; + this.count = 1; + } + + public void inc() { + this.count++; + } + + } public class entry { // this is a simple record structure that hold all properties of a single crawl start @@ -387,33 +402,44 @@ public class plasmaCrawlProfile { mem.put(propName, newValue); profileTable.set(handle(), mem); } - public void domInc(String domain) { - Integer c = (Integer) doms.get(domain); - if (c == null) { + public void domInc(String domain, String referrer, int depth) { + DomProfile dp = (DomProfile) doms.get(domain); + if (dp == null) { // new domain - doms.put(domain, new Integer(1)); + doms.put(domain, new DomProfile(referrer, depth)); } else { // increase counter - doms.put(domain, new Integer(c.intValue() + 1)); + dp.inc(); + doms.put(domain, dp); } domsCache.put(this.mem.get("handle"), doms); } public int domCount(String domain) { - Integer c = (Integer) doms.get(domain); - if (c == null) { + DomProfile dp = (DomProfile) doms.get(domain); + if (dp == null) { return 0; } else { - return c.intValue(); + return dp.count; } } public int domSize() { return doms.size(); } public boolean domExists(String domain) { + if (domFilterDepth() == Integer.MAX_VALUE) return true; return doms.containsKey(domain); } - public Iterator domNames() { - return doms.keySet().iterator(); + public String domNames(boolean attr) { + Iterator domnamesi = doms.entrySet().iterator(); + String domnames=""; + Map.Entry ey; + DomProfile dp; + while (domnamesi.hasNext()) { + ey = (Map.Entry) domnamesi.next(); + dp = (DomProfile) ey.getValue(); + domnames += ((String) ey.getKey()) + ((attr) ? ("/d=" + dp.depth + ",c=" + dp.count + " ") : " "); + } + return domnames; } } } diff --git a/source/de/anomic/plasma/plasmaCrawlStacker.java b/source/de/anomic/plasma/plasmaCrawlStacker.java index 8dd191141..d33a4e8dd 100644 --- a/source/de/anomic/plasma/plasmaCrawlStacker.java +++ b/source/de/anomic/plasma/plasmaCrawlStacker.java @@ -239,8 +239,14 @@ public final class plasmaCrawlStacker { return reason; } */ - URL nexturl = null; + URL nexturl = null, referrerURL = null; if ((initiatorHash == null) || (initiatorHash.length() == 0)) initiatorHash = plasmaURL.dummyHash; + try { + referrerURL = new URL(referrerString); + } catch (MalformedURLException e) { + referrerURL = null; + referrerString = null; + } String referrerHash = (referrerString==null)?null:plasmaURL.urlHash(referrerString); try { nexturl = new URL(nexturlString); @@ -313,7 +319,7 @@ public final class plasmaCrawlStacker { // add domain to profile domain list if (currentdepth <= profile.domFilterDepth()) { - profile.domInc(nexturl.getHost()); + profile.domInc(nexturl.getHost(), (referrerURL == null) ? null : referrerURL.getHost().toLowerCase(), currentdepth); } // deny urls that do not match with the profile domain list