- fixed some bugs in the domain filter

- added a new ranking filter "prefermask": URLs that match the filter are ranked higher


git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2022 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 19 years ago
parent b00756378f
commit 00a5d435e2

@ -3,7 +3,7 @@ javacSource=1.4
javacTarget=1.4
# Release Configuration
releaseVersion=0.443
releaseVersion=0.444
releaseFile=yacy_dev_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
#releaseFile=yacy_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
releaseDir=yacy_dev_v${releaseVersion}_${DSTAMP}_${releaseNr}

@ -132,7 +132,7 @@ public class DetailedSearch {
}
// do the search
plasmaSearchQuery thisSearch = new plasmaSearchQuery(query, wdist, count, searchtime, urlmask,
plasmaSearchQuery thisSearch = new plasmaSearchQuery(query, wdist, "", count, searchtime, urlmask,
((global) && (yacyonline) && (!(env.getConfig("last-search","").equals(querystring)))) ? plasmaSearchQuery.SEARCHDOM_GLOBALDHT : plasmaSearchQuery.SEARCHDOM_LOCAL,
"", 20);
plasmaSearchRankingProfile localRanking = new plasmaSearchRankingProfile("local", post.toString());

@ -81,8 +81,6 @@ public class IndexCreate_p {
prop.put("error", 0);
prop.put("info", 0);
prop.put("refreshbutton", 0);
switchboard.cleanProfiles();
if (post != null) {
if (post.containsKey("crawlingstart")) {
@ -395,6 +393,7 @@ public class IndexCreate_p {
// sed crawl profiles
int count = 0;
int domlistlength = (post == null) ? 160 : post.getInt("domlistlength", 160);
//try{
Iterator it = switchboard.profiles.profiles(true);
plasmaCrawlProfile.entry profile;
@ -410,7 +409,7 @@ public class IndexCreate_p {
prop.put("crawlProfiles_"+count+"_filter", profile.generalFilter());
prop.put("crawlProfiles_"+count+"_crawlingIfOlder", (profile.recrawlIfOlder() == Long.MAX_VALUE) ? "no re-crawl" : ""+profile.recrawlIfOlder());
prop.put("crawlProfiles_"+count+"_crawlingDomFilterDepth", (profile.domFilterDepth() == Integer.MAX_VALUE) ? "inactive" : ""+profile.domFilterDepth());
prop.put("crawlProfiles_"+count+"_crawlingDomFilterContent", profile.domNames(true, 160));
prop.put("crawlProfiles_"+count+"_crawlingDomFilterContent", profile.domNames(true, domlistlength));
prop.put("crawlProfiles_"+count+"_crawlingDomMaxPages", (profile.domMaxPages() == Integer.MAX_VALUE) ? "unlimited" : ""+profile.domMaxPages());
prop.put("crawlProfiles_"+count+"_withQuery", ((profile.crawlingQ()) ? 1 : 0));
prop.put("crawlProfiles_"+count+"_storeCache", ((profile.storeHTCache()) ? 1 : 0));

@ -34,6 +34,8 @@
<input type="hidden" name="order" value="Date-YBR-Quality">
<input type="hidden" name="resource" value="global">
<input type="hidden" name="time" value="6">
<input type="hidden" name="urlmaskfilter" value=".*">
<input type="hidden" name="prefermaskfilter" value="">
<tr align="left">
<td></td>
<td><a href="/index.html?searchoptions=1&display=#[display]#">more options...</a></td>
@ -106,6 +108,19 @@
#(/urlmaskoptions)#
</td>
</tr>
<tr align="left">
<td>
Prefer mask:
</td>
<td>
#(prefermaskoptions)#
<input name="prefermaskfilter" type="text" size="12" maxlength="80" value="#[prefermaskfilter]#">
::
<input type="radio" name="prefermask" value="yes" checked> restrict on <input name="prefermaskfilter" type="text" size="12" maxlength="80" value="#[prefermaskfilter]#">
<input type="radio" name="prefermask" value="no"> show all
#(/prefermaskoptions)#
</td>
</tr>
#(/searchoptions)#
</table>
</center>

@ -102,8 +102,8 @@ public class index {
prop.put("combine", 0);
prop.put("resultbottomline", 0);
prop.put("searchoptions", searchoptions);
prop.put("searchoptions_count-10", 1);
prop.put("searchoptions_count-50", 0);
prop.put("searchoptions_count-10", 0);
prop.put("searchoptions_count-50", 1);
prop.put("searchoptions_count-100", 0);
prop.put("searchoptions_count-1000", 0);
prop.put("searchoptions_order-ybr-date-quality", plasmaSearchPreOrder.canUseYBR() ? 1 : 0);
@ -122,6 +122,8 @@ public class index {
prop.put("searchoptions_time-60", 0);
prop.put("searchoptions_urlmaskoptions", 0);
prop.put("searchoptions_urlmaskoptions_urlmaskfilter", ".*");
prop.put("searchoptions_prefermaskoptions", 0);
prop.put("searchoptions_prefermaskoptions_prefermaskfilter", "");
prop.put("results", "");
prop.put("cat", "href");
prop.put("type", "0");

@ -83,6 +83,7 @@ public final class search {
final long duetime= post.getLong("duetime", 3000);
final int count = post.getInt("count", 10); // maximum number of wanted results
final int maxdist= post.getInt("maxdist", Integer.MAX_VALUE);
final String prefer = post.get("prefer", "");
// final boolean global = ((String) post.get("resource", "global")).equals("global"); // if true, then result may consist of answers from other peers
// Date remoteTime = yacyCore.parseUniversalDate((String) post.get(yacySeed.MYTIME)); // read remote time
@ -104,7 +105,7 @@ public final class search {
}
final long timestamp = System.currentTimeMillis();
plasmaSearchQuery squery = new plasmaSearchQuery(keyhashes, maxdist, count, duetime, ".*");
plasmaSearchQuery squery = new plasmaSearchQuery(keyhashes, maxdist, prefer, count, duetime, ".*");
squery.domType = plasmaSearchQuery.SEARCHDOM_LOCAL;
serverObjects prop = new serverObjects();

@ -55,6 +55,7 @@ picPlus.src = "/env/grafics/plus.gif";
<input type="hidden" name="resource" value="#[resource]#">
<input type="hidden" name="time" value="#[time]#">
<input type="hidden" name="urlmaskfilter" value="#[urlmaskfilter]#">
<input type="hidden" name="prefermaskfilter" value="#[prefermaskfilter]#">
<input type="hidden" name="depth" value="#[depth]#">
<input type="hidden" name="cat" value="#[cat]#">
<input type="hidden" name="type" value="#[type]#">

@ -109,6 +109,7 @@ public class yacysearch {
prop.put("resource", "global");
prop.put("time", 6);
prop.put("urlmaskfilter", ".*");
prop.put("prefermaskfilter", "");
prop.put("cat", "href");
prop.put("depth", "0");
prop.put("type", 0);
@ -144,7 +145,8 @@ public class yacysearch {
} else {
urlmask = (post.containsKey("urlmaskfilter")) ? (String) post.get("urlmaskfilter") : ".*";
}
String prefer = post.get("prefer", ".*");
String prefermask = post.get("prefermaskfilter", "");
if ((prefermask.length() > 0) && (prefermask.indexOf(".*") < 0)) prefermask = ".*" + prefermask + ".*";
serverObjects prop = new serverObjects();
@ -189,6 +191,7 @@ public class yacysearch {
plasmaSearchQuery thisSearch = new plasmaSearchQuery(
query,
maxDistance,
prefermask,
count,
searchtime,
urlmask,
@ -351,7 +354,7 @@ public class yacysearch {
prop.put("resource", (global) ? "global" : "local");
prop.put("time", searchtime / 1000);
prop.put("urlmaskfilter", urlmask);
prop.put("prefer", prefer);
prop.put("prefermaskfilter", prefermask);
prop.put("display", display);
// return rewrite properties

@ -456,7 +456,7 @@ public class plasmaCrawlProfile {
while (domnamesi.hasNext()) {
ey = (Map.Entry) domnamesi.next();
dp = (DomProfile) ey.getValue();
domnames += ((String) ey.getKey()) + ((attr) ? ("/d=" + dp.depth + ",c=" + dp.count + " ") : " ");
domnames += ((String) ey.getKey()) + ((attr) ? ("/r=" + dp.referrer + ", d=" + dp.depth + ", c=" + dp.count + " ") : " ") + "<br>";
if ((maxlength > 0) && (domnames.length() >= maxlength)) {
domnames = domnames.substring(0, maxlength-3) + "...";
break;

@ -318,7 +318,7 @@ public final class plasmaCrawlStacker {
}
// add domain to profile domain list
if (currentdepth <= profile.domFilterDepth()) {
if ((profile.domFilterDepth() != Integer.MAX_VALUE) || (profile.domMaxPages() != Integer.MAX_VALUE)) {
profile.domInc(nexturl.getHost(), (referrerURL == null) ? null : referrerURL.getHost().toLowerCase(), currentdepth);
}

@ -61,6 +61,7 @@ public final class plasmaSearchQuery {
public Set queryWords;
public Set queryHashes;
public int wantedResults;
public String prefer;
public long maximumTime;
public String urlMask;
public int domType;
@ -68,11 +69,12 @@ public final class plasmaSearchQuery {
public int domMaxTargets;
public int maxDistance;
public plasmaSearchQuery(Set queryWords, int maxDistance,
public plasmaSearchQuery(Set queryWords, int maxDistance, String prefer,
int wantedResults, long maximumTime, String urlMask,
int domType, String domGroupName, int domMaxTargets) {
this.queryWords = queryWords;
this.maxDistance = maxDistance;
this.prefer = prefer;
this.queryHashes = words2hashes(queryWords);
this.wantedResults = wantedResults;
this.maximumTime = maximumTime;
@ -82,10 +84,11 @@ public final class plasmaSearchQuery {
this.domMaxTargets = domMaxTargets;
}
public plasmaSearchQuery(Set queryHashes, int maxDistance,
public plasmaSearchQuery(Set queryHashes, int maxDistance, String prefer,
int wantedResults, long maximumTime, String urlMask) {
this.queryWords = null;
this.maxDistance = maxDistance;
this.prefer = prefer;
this.queryHashes = queryHashes;
this.wantedResults = wantedResults;
this.maximumTime = maximumTime;

@ -73,6 +73,7 @@ public class plasmaSearchRankingProfile {
public static final String QUERYINDESCR = "queryindescr";
public static final String URLCOMPINTOPLIST = "urlcompintoplist";
public static final String DESCRCOMPINTOPLIST = "descrcompintoplist";
public static final String PREFER = "prefer";
public String[] order;
private HashMap coeff;
@ -96,6 +97,7 @@ public class plasmaSearchRankingProfile {
coeff.put(QUERYINDESCR, new Integer(8));
coeff.put(URLCOMPINTOPLIST, new Integer(3));
coeff.put(DESCRCOMPINTOPLIST, new Integer(2));
coeff.put(PREFER, new Integer(15));
}
public plasmaSearchRankingProfile(String prefix, String profile) {
@ -183,6 +185,10 @@ public class plasmaSearchRankingProfile {
// apply pre-calculated order attributes
long ranking = this.preRanking(normalizedEntry);
// prefer hit with 'prefer' pattern
if (page.url().toString().matches(query.prefer)) ranking += 256 << ((Integer) coeff.get(PREFER)).intValue();
if (page.descr().toString().matches(query.prefer)) ranking += 256 << ((Integer) coeff.get(PREFER)).intValue();
// apply 'common-sense' heuristic using references
for (int j = 0; j < urlcomps.length; j++) {
if (topwords.contains(urlcomps[j])) ranking += 256 << ((Integer) coeff.get(URLCOMPINTOPLIST)).intValue();
@ -210,6 +216,7 @@ public class plasmaSearchRankingProfile {
ranking += (255 * page.descr().length() / 80) << ((Integer) coeff.get(DESCRLENGTH)).intValue();
ranking += (255 * (12 - Math.abs(12 - Math.min(12, descrcomps.length))) / 12) << ((Integer) coeff.get(DESCRCOMPS)).intValue();
return ranking;
}

Loading…
Cancel
Save