From 00a5d435e2b11f8e03784c39737d14a4894991ae Mon Sep 17 00:00:00 2001 From: orbiter Date: Thu, 13 Apr 2006 23:19:36 +0000 Subject: [PATCH] - fixed some bugs with domain filter - added new ranking filter "prefermask": urls that match the filter are ranked better git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2022 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- build.properties | 2 +- htroot/DetailedSearch.java | 2 +- htroot/IndexCreate_p.java | 5 ++--- htroot/index.html | 15 +++++++++++++++ htroot/index.java | 6 ++++-- htroot/yacy/search.java | 3 ++- htroot/yacysearch.html | 1 + htroot/yacysearch.java | 7 +++++-- source/de/anomic/plasma/plasmaCrawlProfile.java | 2 +- source/de/anomic/plasma/plasmaCrawlStacker.java | 2 +- source/de/anomic/plasma/plasmaSearchQuery.java | 7 +++++-- .../anomic/plasma/plasmaSearchRankingProfile.java | 7 +++++++ 12 files changed, 45 insertions(+), 14 deletions(-) diff --git a/build.properties b/build.properties index f7aae6a4b..04675e6c1 100644 --- a/build.properties +++ b/build.properties @@ -3,7 +3,7 @@ javacSource=1.4 javacTarget=1.4 # Release Configuration -releaseVersion=0.443 +releaseVersion=0.444 releaseFile=yacy_dev_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz #releaseFile=yacy_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz releaseDir=yacy_dev_v${releaseVersion}_${DSTAMP}_${releaseNr} diff --git a/htroot/DetailedSearch.java b/htroot/DetailedSearch.java index cd90d365d..6bea34c98 100644 --- a/htroot/DetailedSearch.java +++ b/htroot/DetailedSearch.java @@ -132,7 +132,7 @@ public class DetailedSearch { } // do the search - plasmaSearchQuery thisSearch = new plasmaSearchQuery(query, wdist, count, searchtime, urlmask, + plasmaSearchQuery thisSearch = new plasmaSearchQuery(query, wdist, "", count, searchtime, urlmask, ((global) && (yacyonline) && (!(env.getConfig("last-search","").equals(querystring)))) ? plasmaSearchQuery.SEARCHDOM_GLOBALDHT : plasmaSearchQuery.SEARCHDOM_LOCAL, "", 20); plasmaSearchRankingProfile localRanking = new plasmaSearchRankingProfile("local", post.toString()); diff --git a/htroot/IndexCreate_p.java b/htroot/IndexCreate_p.java index 89addb84b..5a4928ded 100644 --- a/htroot/IndexCreate_p.java +++ b/htroot/IndexCreate_p.java @@ -81,8 +81,6 @@ public class IndexCreate_p { prop.put("error", 0); prop.put("info", 0); prop.put("refreshbutton", 0); - - switchboard.cleanProfiles(); if (post != null) { if (post.containsKey("crawlingstart")) { @@ -395,6 +393,7 @@ public class IndexCreate_p { // sed crawl profiles int count = 0; + int domlistlength = (post == null) ? 160 : post.getInt("domlistlength", 160); //try{ Iterator it = switchboard.profiles.profiles(true); plasmaCrawlProfile.entry profile; @@ -410,7 +409,7 @@ public class IndexCreate_p { prop.put("crawlProfiles_"+count+"_filter", profile.generalFilter()); prop.put("crawlProfiles_"+count+"_crawlingIfOlder", (profile.recrawlIfOlder() == Long.MAX_VALUE) ? "no re-crawl" : ""+profile.recrawlIfOlder()); prop.put("crawlProfiles_"+count+"_crawlingDomFilterDepth", (profile.domFilterDepth() == Integer.MAX_VALUE) ? "inactive" : ""+profile.domFilterDepth()); - prop.put("crawlProfiles_"+count+"_crawlingDomFilterContent", profile.domNames(true, 160)); + prop.put("crawlProfiles_"+count+"_crawlingDomFilterContent", profile.domNames(true, domlistlength)); prop.put("crawlProfiles_"+count+"_crawlingDomMaxPages", (profile.domMaxPages() == Integer.MAX_VALUE) ? "unlimited" : ""+profile.domMaxPages()); prop.put("crawlProfiles_"+count+"_withQuery", ((profile.crawlingQ()) ? 1 : 0)); prop.put("crawlProfiles_"+count+"_storeCache", ((profile.storeHTCache()) ? 1 : 0)); diff --git a/htroot/index.html b/htroot/index.html index 4dac23c85..f388059c2 100644 --- a/htroot/index.html +++ b/htroot/index.html @@ -34,6 +34,8 @@ + + more options... @@ -106,6 +108,19 @@ #(/urlmaskoptions)# + + + Prefer mask: + + + #(prefermaskoptions)# + + :: + restrict on + show all + #(/prefermaskoptions)# + + #(/searchoptions)# diff --git a/htroot/index.java b/htroot/index.java index 24e54be80..4a41f9afd 100644 --- a/htroot/index.java +++ b/htroot/index.java @@ -102,8 +102,8 @@ public class index { prop.put("combine", 0); prop.put("resultbottomline", 0); prop.put("searchoptions", searchoptions); - prop.put("searchoptions_count-10", 1); - prop.put("searchoptions_count-50", 0); + prop.put("searchoptions_count-10", 0); + prop.put("searchoptions_count-50", 1); prop.put("searchoptions_count-100", 0); prop.put("searchoptions_count-1000", 0); prop.put("searchoptions_order-ybr-date-quality", plasmaSearchPreOrder.canUseYBR() ? 1 : 0); @@ -122,6 +122,8 @@ public class index { prop.put("searchoptions_time-60", 0); prop.put("searchoptions_urlmaskoptions", 0); prop.put("searchoptions_urlmaskoptions_urlmaskfilter", ".*"); + prop.put("searchoptions_prefermaskoptions", 0); + prop.put("searchoptions_prefermaskoptions_prefermaskfilter", ""); prop.put("results", ""); prop.put("cat", "href"); prop.put("type", "0"); diff --git a/htroot/yacy/search.java b/htroot/yacy/search.java index dd44fe9de..2e8752d96 100644 --- a/htroot/yacy/search.java +++ b/htroot/yacy/search.java @@ -83,6 +83,7 @@ public final class search { final long duetime= post.getLong("duetime", 3000); final int count = post.getInt("count", 10); // maximum number of wanted results final int maxdist= post.getInt("maxdist", Integer.MAX_VALUE); + final String prefer = post.get("prefer", ""); // final boolean global = ((String) post.get("resource", "global")).equals("global"); // if true, then result may consist of answers from other peers // Date remoteTime = yacyCore.parseUniversalDate((String) post.get(yacySeed.MYTIME)); // read remote time @@ -104,7 +105,7 @@ public final class search { } final long timestamp = System.currentTimeMillis(); - plasmaSearchQuery squery = new plasmaSearchQuery(keyhashes, maxdist, count, duetime, ".*"); + plasmaSearchQuery squery = new plasmaSearchQuery(keyhashes, maxdist, prefer, count, duetime, ".*"); squery.domType = plasmaSearchQuery.SEARCHDOM_LOCAL; serverObjects prop = new serverObjects(); diff --git a/htroot/yacysearch.html b/htroot/yacysearch.html index 7319dd62a..1a1181d8c 100644 --- a/htroot/yacysearch.html +++ b/htroot/yacysearch.html @@ -55,6 +55,7 @@ picPlus.src = "/env/grafics/plus.gif"; + diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java index a876bebc0..bd769a35d 100644 --- a/htroot/yacysearch.java +++ b/htroot/yacysearch.java @@ -109,6 +109,7 @@ public class yacysearch { prop.put("resource", "global"); prop.put("time", 6); prop.put("urlmaskfilter", ".*"); + prop.put("prefermaskfilter", ""); prop.put("cat", "href"); prop.put("depth", "0"); prop.put("type", 0); @@ -144,7 +145,8 @@ public class yacysearch { } else { urlmask = (post.containsKey("urlmaskfilter")) ? (String) post.get("urlmaskfilter") : ".*"; } - String prefer = post.get("prefer", ".*"); + String prefermask = post.get("prefermaskfilter", ""); + if ((prefermask.length() > 0) && (prefermask.indexOf(".*") < 0)) prefermask = ".*" + prefermask + ".*"; serverObjects prop = new serverObjects(); @@ -189,6 +191,7 @@ public class yacysearch { plasmaSearchQuery thisSearch = new plasmaSearchQuery( query, maxDistance, + prefermask, count, searchtime, urlmask, @@ -351,7 +354,7 @@ public class yacysearch { prop.put("resource", (global) ? "global" : "local"); prop.put("time", searchtime / 1000); prop.put("urlmaskfilter", urlmask); - prop.put("prefer", prefer); + prop.put("prefermaskfilter", prefermask); prop.put("display", display); // return rewrite properties diff --git a/source/de/anomic/plasma/plasmaCrawlProfile.java b/source/de/anomic/plasma/plasmaCrawlProfile.java index 21e711ea4..fa14483c1 100644 --- a/source/de/anomic/plasma/plasmaCrawlProfile.java +++ b/source/de/anomic/plasma/plasmaCrawlProfile.java @@ -456,7 +456,7 @@ public class plasmaCrawlProfile { while (domnamesi.hasNext()) { ey = (Map.Entry) domnamesi.next(); dp = (DomProfile) ey.getValue(); - domnames += ((String) ey.getKey()) + ((attr) ? ("/d=" + dp.depth + ",c=" + dp.count + " ") : " "); + domnames += ((String) ey.getKey()) + ((attr) ? ("/r=" + dp.referrer + ", d=" + dp.depth + ", c=" + dp.count + " ") : " ") + "
"; if ((maxlength > 0) && (domnames.length() >= maxlength)) { domnames = domnames.substring(0, maxlength-3) + "..."; break; diff --git a/source/de/anomic/plasma/plasmaCrawlStacker.java b/source/de/anomic/plasma/plasmaCrawlStacker.java index acb2e77bf..735ae2331 100644 --- a/source/de/anomic/plasma/plasmaCrawlStacker.java +++ b/source/de/anomic/plasma/plasmaCrawlStacker.java @@ -318,7 +318,7 @@ public final class plasmaCrawlStacker { } // add domain to profile domain list - if (currentdepth <= profile.domFilterDepth()) { + if ((profile.domFilterDepth() != Integer.MAX_VALUE) || (profile.domMaxPages() != Integer.MAX_VALUE)) { profile.domInc(nexturl.getHost(), (referrerURL == null) ? null : referrerURL.getHost().toLowerCase(), currentdepth); } diff --git a/source/de/anomic/plasma/plasmaSearchQuery.java b/source/de/anomic/plasma/plasmaSearchQuery.java index 55a2e31c0..0f68e887b 100644 --- a/source/de/anomic/plasma/plasmaSearchQuery.java +++ b/source/de/anomic/plasma/plasmaSearchQuery.java @@ -61,6 +61,7 @@ public final class plasmaSearchQuery { public Set queryWords; public Set queryHashes; public int wantedResults; + public String prefer; public long maximumTime; public String urlMask; public int domType; @@ -68,11 +69,12 @@ public final class plasmaSearchQuery { public int domMaxTargets; public int maxDistance; - public plasmaSearchQuery(Set queryWords, int maxDistance, + public plasmaSearchQuery(Set queryWords, int maxDistance, String prefer, int wantedResults, long maximumTime, String urlMask, int domType, String domGroupName, int domMaxTargets) { this.queryWords = queryWords; this.maxDistance = maxDistance; + this.prefer = prefer; this.queryHashes = words2hashes(queryWords); this.wantedResults = wantedResults; this.maximumTime = maximumTime; @@ -82,10 +84,11 @@ public final class plasmaSearchQuery { this.domMaxTargets = domMaxTargets; } - public plasmaSearchQuery(Set queryHashes, int maxDistance, + public plasmaSearchQuery(Set queryHashes, int maxDistance, String prefer, int wantedResults, long maximumTime, String urlMask) { this.queryWords = null; this.maxDistance = maxDistance; + this.prefer = prefer; this.queryHashes = queryHashes; this.wantedResults = wantedResults; this.maximumTime = maximumTime; diff --git a/source/de/anomic/plasma/plasmaSearchRankingProfile.java b/source/de/anomic/plasma/plasmaSearchRankingProfile.java index e172bdb0f..b898bd0bb 100644 --- a/source/de/anomic/plasma/plasmaSearchRankingProfile.java +++ b/source/de/anomic/plasma/plasmaSearchRankingProfile.java @@ -73,6 +73,7 @@ public class plasmaSearchRankingProfile { public static final String QUERYINDESCR = "queryindescr"; public static final String URLCOMPINTOPLIST = "urlcompintoplist"; public static final String DESCRCOMPINTOPLIST = "descrcompintoplist"; + public static final String PREFER = "prefer"; public String[] order; private HashMap coeff; @@ -96,6 +97,7 @@ public class plasmaSearchRankingProfile { coeff.put(QUERYINDESCR, new Integer(8)); coeff.put(URLCOMPINTOPLIST, new Integer(3)); coeff.put(DESCRCOMPINTOPLIST, new Integer(2)); + coeff.put(PREFER, new Integer(15)); } public plasmaSearchRankingProfile(String prefix, String profile) { @@ -183,6 +185,10 @@ public class plasmaSearchRankingProfile { // apply pre-calculated order attributes long ranking = this.preRanking(normalizedEntry); + // prefer hit with 'prefer' pattern + if (page.url().toString().matches(query.prefer)) ranking += 256 << ((Integer) coeff.get(PREFER)).intValue(); + if (page.descr().toString().matches(query.prefer)) ranking += 256 << ((Integer) coeff.get(PREFER)).intValue(); + // apply 'common-sense' heuristic using references for (int j = 0; j < urlcomps.length; j++) { if (topwords.contains(urlcomps[j])) ranking += 256 << ((Integer) coeff.get(URLCOMPINTOPLIST)).intValue(); @@ -210,6 +216,7 @@ public class plasmaSearchRankingProfile { ranking += (255 * page.descr().length() / 80) << ((Integer) coeff.get(DESCRLENGTH)).intValue(); ranking += (255 * (12 - Math.abs(12 - Math.min(12, descrcomps.length))) / 12) << ((Integer) coeff.get(DESCRCOMPS)).intValue(); + return ranking; }