Use Pattern objects holding precompiled regular expressions to apply must-match constraints to search results. This should speed up the pre-sorting of search results and should produce richer search result sets.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6762 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 15 years ago
parent 5233177a7f
commit bb63c5d075

@ -71,8 +71,8 @@ public class yacysearchtrailer {
for (i = 0; i < Math.min(10, namespaceNavigator.size()); i++) {
entry = namespaceNavigator.get(i);
prop.put("nav-namespace_element_" + i + "_name", entry.name);
prop.put("nav-namespace_element_" + i + "_url", "<a href=\"" + QueryParams.navurl("html", 0, display, theQuery, theQuery.urlMask, "inurl:" + entry.name, theQuery.navigators) + "\">" + entry.name + " (" + entry.count + ")</a>");
prop.putJSON("nav-namespace_element_" + i + "_url-json", QueryParams.navurl("json", 0, display, theQuery, theQuery.urlMask, "inurl:" + entry.name, theQuery.navigators));
prop.put("nav-namespace_element_" + i + "_url", "<a href=\"" + QueryParams.navurl("html", 0, display, theQuery, theQuery.urlMask.toString(), "inurl:" + entry.name, theQuery.navigators) + "\">" + entry.name + " (" + entry.count + ")</a>");
prop.putJSON("nav-namespace_element_" + i + "_url-json", QueryParams.navurl("json", 0, display, theQuery, theQuery.urlMask.toString(), "inurl:" + entry.name, theQuery.navigators));
prop.put("nav-namespace_element_" + i + "_count", entry.count);
prop.put("nav-namespace_element_" + i + "_modifier", "inurl:" + entry.name);
prop.put("nav-namespace_element_" + i + "_nl", 1);
@ -93,8 +93,8 @@ public class yacysearchtrailer {
for (i = 0; i < Math.min(10, hostNavigator.size()); i++) {
entry = hostNavigator.get(i);
prop.put("nav-domains_element_" + i + "_name", entry.name);
prop.put("nav-domains_element_" + i + "_url", "<a href=\"" + QueryParams.navurl("html", 0, display, theQuery, theQuery.urlMask, "site:" + entry.name, theQuery.navigators) + "\">" + entry.name + " (" + entry.count + ")</a>");
prop.putJSON("nav-domains_element_" + i + "_url-json", QueryParams.navurl("json", 0, display, theQuery, theQuery.urlMask, "site:" + entry.name, theQuery.navigators));
prop.put("nav-domains_element_" + i + "_url", "<a href=\"" + QueryParams.navurl("html", 0, display, theQuery, theQuery.urlMask.toString(), "site:" + entry.name, theQuery.navigators) + "\">" + entry.name + " (" + entry.count + ")</a>");
prop.putJSON("nav-domains_element_" + i + "_url-json", QueryParams.navurl("json", 0, display, theQuery, theQuery.urlMask.toString(), "site:" + entry.name, theQuery.navigators));
prop.put("nav-domains_element_" + i + "_count", entry.count);
prop.put("nav-domains_element_" + i + "_modifier", "site:" + entry.name);
prop.put("nav-domains_element_" + i + "_nl", 1);
@ -117,8 +117,8 @@ public class yacysearchtrailer {
entry = authorNavigator.get(i);
anav = (entry.name.indexOf(' ') < 0) ? "author:" + entry.name : "author:'" + entry.name + "'";
prop.put("nav-authors_element_" + i + "_name", entry.name);
prop.put("nav-authors_element_" + i + "_url", "<a href=\"" + QueryParams.navurl("html", 0, display, theQuery, theQuery.urlMask, anav, theQuery.navigators) + "\">" + entry.name + " (" + entry.count + ")</a>");
prop.putJSON("nav-authors_element_" + i + "_url-json", QueryParams.navurl("json", 0, display, theQuery, theQuery.urlMask, anav, theQuery.navigators));
prop.put("nav-authors_element_" + i + "_url", "<a href=\"" + QueryParams.navurl("html", 0, display, theQuery, theQuery.urlMask.toString(), anav, theQuery.navigators) + "\">" + entry.name + " (" + entry.count + ")</a>");
prop.putJSON("nav-authors_element_" + i + "_url-json", QueryParams.navurl("json", 0, display, theQuery, theQuery.urlMask.toString(), anav, theQuery.navigators));
prop.put("nav-authors_element_" + i + "_count", entry.count);
prop.put("nav-authors_element_" + i + "_modifier", "author:'" + entry.name + "'");
prop.put("nav-authors_element_" + i + "_nl", 1);
@ -143,8 +143,8 @@ public class yacysearchtrailer {
if (/*(theQuery == null) ||*/ (theQuery.queryString == null)) break;
if (e != null && e.name != null) {
prop.putHTML("nav-topics_element_" + i + "_name", e.name);
prop.put("nav-topics_element_" + i + "_url", "<a href=\"" + QueryParams.navurl("html", 0, display, theQuery, theQuery.urlMask, e.name, theQuery.navigators) + "\">" + e.name + " (" + e.count + ")</a>");
prop.putJSON("nav-topics_element_" + i + "_url-json", QueryParams.navurl("json", 0, display, theQuery, theQuery.urlMask, e.name, theQuery.navigators));
prop.put("nav-topics_element_" + i + "_url", "<a href=\"" + QueryParams.navurl("html", 0, display, theQuery, theQuery.urlMask.toString(), e.name, theQuery.navigators) + "\">" + e.name + " (" + e.count + ")</a>");
prop.putJSON("nav-topics_element_" + i + "_url-json", QueryParams.navurl("json", 0, display, theQuery, theQuery.urlMask.toString(), e.name, theQuery.navigators));
prop.put("nav-topics_element_" + i + "_count", e.count);
prop.put("nav-topics_element_" + i + "_modifier", e.name);
prop.put("nav-topics_element_" + i + "_nl", (iter.hasNext() && i < MAX_TOPWORDS) ? 1 : 0);

@ -29,6 +29,7 @@ package de.anomic.search;
import java.util.HashSet;
import java.util.Iterator;
import java.util.TreeSet;
import java.util.regex.Pattern;
import net.yacy.document.Condenser;
import net.yacy.document.parser.html.AbstractScraper;
@ -62,9 +63,8 @@ public final class QueryParams {
public String queryString;
public TreeSet<byte[]> fullqueryHashes, queryHashes, excludeHashes;
public int itemsPerPage, offset;
public String prefer;
public Pattern urlMask, prefer;
public ContentDomain contentdom;
public String urlMask;
public String targetlang;
public String navigators;
public int domType;
@ -107,11 +107,11 @@ public final class QueryParams {
this.ranking = ranking;
this.tenant = null;
this.maxDistance = Integer.MAX_VALUE;
this.prefer = "";
this.prefer = Pattern.compile("");
this.contentdom = ContentDomain.ALL;
this.itemsPerPage = itemsPerPage;
this.offset = 0;
this.urlMask = ".*";
this.urlMask = Pattern.compile(".*");
this.targetlang = "en";
this.domType = SEARCHDOM_LOCAL;
this.zonecode = DigestURI.TLD_any_zone_filter;
@ -155,11 +155,11 @@ public final class QueryParams {
this.tenant = (tenant != null && tenant.length() == 0) ? null : tenant;
this.ranking = ranking;
this.maxDistance = maxDistance;
this.prefer = prefer;
this.prefer = Pattern.compile(prefer);
this.contentdom = contentdom;
this.itemsPerPage = Math.min((specialRights) ? 1000 : 50, itemsPerPage);
this.offset = Math.min((specialRights) ? 10000 : 100, offset);
this.urlMask = urlMask;
this.urlMask = Pattern.compile(urlMask);
assert language != null;
this.targetlang = language;
this.navigators = navigators;

@ -390,15 +390,25 @@ public final class RankingProcess extends Thread {
// prepare values for constraint check
final URIMetadataRow.Components metadata = page.metadata();
// check url constraints
if (metadata == null || metadata.url() == null) {
// check errors
if (metadata == null) {
continue; // rare case where the url is corrupted
}
// check url mask
if (!metadata.matches(query.urlMask)) {
continue;
}
// check for more errors
if (metadata.url() == null) {
continue; // rare case where the url is corrupted
}
final String pageurl = metadata.url().toNormalform(true, true);
final String pageauthor = metadata.dc_creator();
final String pagetitle = metadata.dc_title().toLowerCase();
// check exclusion
if ((QueryParams.matches(pagetitle, query.excludeHashes)) ||
(QueryParams.matches(pageurl.toLowerCase(), query.excludeHashes)) ||
@ -406,11 +416,6 @@ public final class RankingProcess extends Thread {
continue;
}
// check url mask
if (!(pageurl.matches(query.urlMask))) {
continue;
}
// check index-of constraint
if ((query.constraint != null) &&
(query.constraint.get(Condenser.flag_cat_indexof)) &&

@ -365,8 +365,8 @@ public class ResultFetcher {
if (query.contentdom == ContentDomain.APP ) r += rentry.lapp() << query.ranking.coeff_cathasapp;
// prefer hit with 'prefer' pattern
if (rentry.url().toNormalform(true, true).matches(query.prefer)) r += 256 << query.ranking.coeff_prefer;
if (rentry.title().matches(query.prefer)) r += 256 << query.ranking.coeff_prefer;
if (query.prefer.matcher(rentry.url().toNormalform(true, true)).matches()) r += 256 << query.ranking.coeff_prefer;
if (query.prefer.matcher(rentry.title()).matches()) r += 256 << query.ranking.coeff_prefer;
// apply 'common-sense' heuristic using references
final String urlstring = rentry.url().toNormalform(true, true);

@ -54,6 +54,7 @@ import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.regex.Pattern;
import net.yacy.document.parser.xml.RSSFeed;
import net.yacy.document.parser.xml.RSSReader;
@ -427,8 +428,8 @@ public final class yacyClient {
final String wordhashes,
final String excludehashes,
final String urlhashes,
final String prefer,
final String filter,
final Pattern prefer,
final Pattern filter,
final String language,
final String sitehash,
final String authorhash,
@ -472,8 +473,8 @@ public final class yacyClient {
post.add(new DefaultCharsetStringPart("exclude", excludehashes));
post.add(new DefaultCharsetStringPart("duetime", "1000"));
post.add(new DefaultCharsetStringPart("urls", urlhashes));
post.add(new DefaultCharsetStringPart("prefer", prefer));
post.add(new DefaultCharsetStringPart("filter", filter));
post.add(new DefaultCharsetStringPart("prefer", prefer.toString()));
post.add(new DefaultCharsetStringPart("filter", filter.toString()));
post.add(new DefaultCharsetStringPart("language", language));
post.add(new DefaultCharsetStringPart("sitehash", sitehash));
post.add(new DefaultCharsetStringPart("authorhash", authorhash));

@ -30,6 +30,7 @@ import java.util.Iterator;
import java.util.Map;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.regex.Pattern;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.Bitfield;
@ -58,7 +59,8 @@ public class yacySearch extends Thread {
private String[] urls;
private final int count, maxDistance;
final private RankingProfile rankingProfile;
final private String prefer, filter, language;
final private Pattern prefer, filter;
final private String language;
final private Bitfield constraint;
final private yacySeedDB peers;
@ -67,9 +69,9 @@ public class yacySearch extends Thread {
public yacySearch(
final String wordhashes, final String excludehashes,
final String urlhashes,
final String prefer, final String filter, final String language,
final String sitehash,
final String authorhash,
final Pattern prefer, final Pattern filter,
final String language,
final String sitehash, final String authorhash,
final int count, final int maxDistance,
final boolean global, final int partitions,
final yacySeed targetPeer,
@ -251,7 +253,7 @@ public class yacySearch extends Thread {
public static yacySearch[] primaryRemoteSearches(
final String wordhashes, final String excludehashes, final String urlhashes,
final String prefer, final String filter, String language,
final Pattern prefer, final Pattern filter, String language,
final String sitehash,
final String authorhash,
final int count, final int maxDist,
@ -314,7 +316,7 @@ public class yacySearch extends Thread {
if (targetPeer == null || targetPeer.hash == null) return null;
if (clusterselection != null) targetPeer.setAlternativeAddress(clusterselection.get(targetPeer.hash.getBytes()));
final yacySearch searchThread = new yacySearch(
wordhashes, excludehashes, urlhashes, "", "", "", "", "", 0, 9999, true, 0, targetPeer,
wordhashes, excludehashes, urlhashes, Pattern.compile(""), Pattern.compile(".*"), "", "", "", 0, 9999, true, 0, targetPeer,
indexSegment, peers, crawlResults, containerCache, new TreeMap<String, TreeMap<String, String>>(), blacklist, rankingProfile, constraint);
searchThread.start();
return searchThread;

@ -32,6 +32,7 @@ import java.text.ParseException;
import java.util.Date;
import java.util.Iterator;
import java.util.Properties;
import java.util.regex.Pattern;
import net.yacy.kelondro.data.word.WordReferenceRow;
import net.yacy.kelondro.data.word.WordReferenceVars;
@ -369,19 +370,14 @@ public class URIMetadataRow implements URIMetadata {
if (this.comp != null) return this.comp;
// parse elements from comp string;
final Iterator<String> cl = FileUtils.strings(this.entry.getCol("comp", null));
try {
this.comp = new Components(
this.comp = new Components(
(cl.hasNext()) ? cl.next() : "",
hash(),
(cl.hasNext()) ? cl.next() : "",
(cl.hasNext()) ? cl.next() : "",
(cl.hasNext()) ? cl.next() : "",
(cl.hasNext()) ? cl.next() : "");
return this.comp;
} catch (MalformedURLException e) {
Log.logWarning("URLMetadataRow", "corrupted component / url: " + e.getMessage(), e);
return null;
}
return this.comp;
}
public Date moddate() {
@ -521,11 +517,14 @@ public class URIMetadataRow implements URIMetadata {
}
public class Components {
private final DigestURI url;
private DigestURI url;
private String urlRaw, urlHash;
private final String dc_title, dc_creator, dc_subject, ETag;
public Components(final String url, final String urlhash, final String title, final String author, final String tags, final String ETag) throws MalformedURLException {
this.url = new DigestURI(url, urlhash);
public Components(final String urlRaw, final String urlhash, final String title, final String author, final String tags, final String ETag) {
this.url = null;
this.urlRaw = urlRaw;
this.urlHash = urlhash;
this.dc_title = title;
this.dc_creator = author;
this.dc_subject = tags;
@ -533,12 +532,30 @@ public class URIMetadataRow implements URIMetadata {
}
/**
 * Creates the components from an already parsed URL.
 * The raw URL string and hash are set to null, so nothing is left to parse later.
 *
 * @param url    the parsed URL of the entry
 * @param descr  stored as dc_title
 * @param author stored as dc_creator
 * @param tags   stored as dc_subject
 * @param ETag   stored as ETag
 */
public Components(final DigestURI url, final String descr, final String author, final String tags, final String ETag) {
    // descriptive metadata
    this.dc_title = descr;
    this.dc_creator = author;
    this.dc_subject = tags;
    this.ETag = ETag;

    // the url is given in parsed form: no raw string or hash is kept
    this.url = url;
    this.urlRaw = null;
    this.urlHash = null;
}
public DigestURI url() { return this.url; }
/**
 * Tests whether the URL of this entry matches the given pattern as a whole.
 * The raw URL string is used as long as it is present; otherwise the normal form
 * of the parsed URL is used.
 *
 * @param matcher the compiled pattern the complete URL string must match
 * @return true if the URL string matches; false if it does not match or if neither
 *         a raw URL string nor a parsed URL is available
 */
public boolean matches(Pattern matcher) {
    final String candidate;
    if (this.urlRaw != null) {
        // raw string is present: match on it directly
        candidate = this.urlRaw;
    } else if (this.url != null) {
        candidate = this.url.toNormalform(true, true);
    } else {
        // nothing to match against
        return false;
    }
    return matcher.matcher(candidate).matches();
}
/**
 * Returns the parsed URL of this entry, parsing the raw URL string on first use.
 * The raw string and the hash are released after the first parse attempt, whether
 * or not it succeeded, so a malformed URL is parsed at most once.
 *
 * @return the parsed URL, or null if no raw URL string is available or the raw
 *         string could not be parsed
 */
public DigestURI url() {
    // only parse when there is a raw string left; without this guard a null raw
    // string would be handed to the DigestURI constructor on every call
    if (this.url == null && this.urlRaw != null) {
        try {
            this.url = new DigestURI(this.urlRaw, this.urlHash);
        } catch (MalformedURLException e) {
            // a malformed url is reported to the caller as null
            this.url = null;
        }
        // the raw values are not needed after the parse attempt
        this.urlRaw = null;
        this.urlHash = null;
    }
    return this.url;
}
public String dc_title() { return this.dc_title; }
public String dc_creator() { return this.dc_creator; }
public String dc_subject() { return this.dc_subject; }

Loading…
Cancel
Save