From bb63c5d075a802963991d5d2c9c4b4ae7661425c Mon Sep 17 00:00:00 2001 From: orbiter Date: Tue, 23 Mar 2010 10:17:28 +0000 Subject: [PATCH] using a Pattern object with precompiled regular expressions to apply must-match constraints to search results: should speed up pre-sorting of search results and should cause richer search result sets git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6762 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/yacysearchtrailer.java | 16 ++++---- source/de/anomic/search/QueryParams.java | 12 +++--- source/de/anomic/search/RankingProcess.java | 21 ++++++---- source/de/anomic/search/ResultFetcher.java | 4 +- source/de/anomic/yacy/yacyClient.java | 9 +++-- source/de/anomic/yacy/yacySearch.java | 14 ++++--- .../kelondro/data/meta/URIMetadataRow.java | 39 +++++++++++++------ 7 files changed, 70 insertions(+), 45 deletions(-) diff --git a/htroot/yacysearchtrailer.java b/htroot/yacysearchtrailer.java index 24ca54b9e..1717b763c 100644 --- a/htroot/yacysearchtrailer.java +++ b/htroot/yacysearchtrailer.java @@ -71,8 +71,8 @@ public class yacysearchtrailer { for (i = 0; i < Math.min(10, namespaceNavigator.size()); i++) { entry = namespaceNavigator.get(i); prop.put("nav-namespace_element_" + i + "_name", entry.name); - prop.put("nav-namespace_element_" + i + "_url", "" + entry.name + " (" + entry.count + ")"); - prop.putJSON("nav-namespace_element_" + i + "_url-json", QueryParams.navurl("json", 0, display, theQuery, theQuery.urlMask, "inurl:" + entry.name, theQuery.navigators)); + prop.put("nav-namespace_element_" + i + "_url", "" + entry.name + " (" + entry.count + ")"); + prop.putJSON("nav-namespace_element_" + i + "_url-json", QueryParams.navurl("json", 0, display, theQuery, theQuery.urlMask.toString(), "inurl:" + entry.name, theQuery.navigators)); prop.put("nav-namespace_element_" + i + "_count", entry.count); prop.put("nav-namespace_element_" + i + "_modifier", "inurl:" + entry.name); prop.put("nav-namespace_element_" + i + "_nl", 1); @@ -93,8 +93,8 @@ public class yacysearchtrailer { for (i = 0; i < Math.min(10, hostNavigator.size()); i++) { entry = hostNavigator.get(i); prop.put("nav-domains_element_" + i + "_name", entry.name); - prop.put("nav-domains_element_" + i + "_url", "" + entry.name + " (" + entry.count + ")"); - prop.putJSON("nav-domains_element_" + i + "_url-json", QueryParams.navurl("json", 0, display, theQuery, theQuery.urlMask, "site:" + entry.name, theQuery.navigators)); + prop.put("nav-domains_element_" + i + "_url", "" + entry.name + " (" + entry.count + ")"); + prop.putJSON("nav-domains_element_" + i + "_url-json", QueryParams.navurl("json", 0, display, theQuery, theQuery.urlMask.toString(), "site:" + entry.name, theQuery.navigators)); prop.put("nav-domains_element_" + i + "_count", entry.count); prop.put("nav-domains_element_" + i + "_modifier", "site:" + entry.name); prop.put("nav-domains_element_" + i + "_nl", 1); @@ -117,8 +117,8 @@ public class yacysearchtrailer { entry = authorNavigator.get(i); anav = (entry.name.indexOf(' ') < 0) ? "author:" + entry.name : "author:'" + entry.name + "'"; prop.put("nav-authors_element_" + i + "_name", entry.name); - prop.put("nav-authors_element_" + i + "_url", "" + entry.name + " (" + entry.count + ")"); - prop.putJSON("nav-authors_element_" + i + "_url-json", QueryParams.navurl("json", 0, display, theQuery, theQuery.urlMask, anav, theQuery.navigators)); + prop.put("nav-authors_element_" + i + "_url", "" + entry.name + " (" + entry.count + ")"); + prop.putJSON("nav-authors_element_" + i + "_url-json", QueryParams.navurl("json", 0, display, theQuery, theQuery.urlMask.toString(), anav, theQuery.navigators)); prop.put("nav-authors_element_" + i + "_count", entry.count); prop.put("nav-authors_element_" + i + "_modifier", "author:'" + entry.name + "'"); prop.put("nav-authors_element_" + i + "_nl", 1); @@ -143,8 +143,8 @@ public class yacysearchtrailer { if (/*(theQuery == null) ||*/ (theQuery.queryString == null)) break; if (e != null && e.name != null) { prop.putHTML("nav-topics_element_" + i + "_name", e.name); - prop.put("nav-topics_element_" + i + "_url", "" + e.name + " (" + e.count + ")"); - prop.putJSON("nav-topics_element_" + i + "_url-json", QueryParams.navurl("json", 0, display, theQuery, theQuery.urlMask, e.name, theQuery.navigators)); + prop.put("nav-topics_element_" + i + "_url", "" + e.name + " (" + e.count + ")"); + prop.putJSON("nav-topics_element_" + i + "_url-json", QueryParams.navurl("json", 0, display, theQuery, theQuery.urlMask.toString(), e.name, theQuery.navigators)); prop.put("nav-topics_element_" + i + "_count", e.count); prop.put("nav-topics_element_" + i + "_modifier", e.name); prop.put("nav-topics_element_" + i + "_nl", (iter.hasNext() && i < MAX_TOPWORDS) ? 1 : 0); diff --git a/source/de/anomic/search/QueryParams.java b/source/de/anomic/search/QueryParams.java index 2fec27d65..3f9abf680 100644 --- a/source/de/anomic/search/QueryParams.java +++ b/source/de/anomic/search/QueryParams.java @@ -29,6 +29,7 @@ package de.anomic.search; import java.util.HashSet; import java.util.Iterator; import java.util.TreeSet; +import java.util.regex.Pattern; import net.yacy.document.Condenser; import net.yacy.document.parser.html.AbstractScraper; @@ -62,9 +63,8 @@ public final class QueryParams { public String queryString; public TreeSet fullqueryHashes, queryHashes, excludeHashes; public int itemsPerPage, offset; - public String prefer; + public Pattern urlMask, prefer; public ContentDomain contentdom; - public String urlMask; public String targetlang; public String navigators; public int domType; @@ -107,11 +107,11 @@ public final class QueryParams { this.ranking = ranking; this.tenant = null; this.maxDistance = Integer.MAX_VALUE; - this.prefer = ""; + this.prefer = Pattern.compile(""); this.contentdom = ContentDomain.ALL; this.itemsPerPage = itemsPerPage; this.offset = 0; - this.urlMask = ".*"; + this.urlMask = Pattern.compile(".*"); this.targetlang = "en"; this.domType = SEARCHDOM_LOCAL; this.zonecode = DigestURI.TLD_any_zone_filter; @@ -155,11 +155,11 @@ public final class QueryParams { this.tenant = (tenant != null && tenant.length() == 0) ? null : tenant; this.ranking = ranking; this.maxDistance = maxDistance; - this.prefer = prefer; + this.prefer = Pattern.compile(prefer); this.contentdom = contentdom; this.itemsPerPage = Math.min((specialRights) ? 1000 : 50, itemsPerPage); this.offset = Math.min((specialRights) ? 10000 : 100, offset); - this.urlMask = urlMask; + this.urlMask = Pattern.compile(urlMask); assert language != null; this.targetlang = language; this.navigators = navigators; diff --git a/source/de/anomic/search/RankingProcess.java b/source/de/anomic/search/RankingProcess.java index 1a67b451c..5da2243e3 100644 --- a/source/de/anomic/search/RankingProcess.java +++ b/source/de/anomic/search/RankingProcess.java @@ -390,15 +390,25 @@ public final class RankingProcess extends Thread { // prepare values for constraint check final URIMetadataRow.Components metadata = page.metadata(); - // check url constraints - if (metadata == null || metadata.url() == null) { + // check errors + if (metadata == null) { continue; // rare case where the url is corrupted } + // check url mask + if (!metadata.matches(query.urlMask)) { + continue; + } + + // check for more errors + if (metadata.url() == null) { + continue; // rare case where the url is corrupted + } + final String pageurl = metadata.url().toNormalform(true, true); final String pageauthor = metadata.dc_creator(); final String pagetitle = metadata.dc_title().toLowerCase(); - + // check exclusion if ((QueryParams.matches(pagetitle, query.excludeHashes)) || (QueryParams.matches(pageurl.toLowerCase(), query.excludeHashes)) || @@ -406,11 +416,6 @@ public final class RankingProcess extends Thread { continue; } - // check url mask - if (!(pageurl.matches(query.urlMask))) { - continue; - } - // check index-of constraint if ((query.constraint != null) && (query.constraint.get(Condenser.flag_cat_indexof)) && diff --git a/source/de/anomic/search/ResultFetcher.java b/source/de/anomic/search/ResultFetcher.java index 9453385bc..f00b01870 100644 --- a/source/de/anomic/search/ResultFetcher.java +++ b/source/de/anomic/search/ResultFetcher.java @@ -365,8 +365,8 @@ public class ResultFetcher { if (query.contentdom == ContentDomain.APP ) r += rentry.lapp() << query.ranking.coeff_cathasapp; // prefer hit with 'prefer' pattern - if (rentry.url().toNormalform(true, true).matches(query.prefer)) r += 256 << query.ranking.coeff_prefer; - if (rentry.title().matches(query.prefer)) r += 256 << query.ranking.coeff_prefer; + if (query.prefer.matcher(rentry.url().toNormalform(true, true)).matches()) r += 256 << query.ranking.coeff_prefer; + if (query.prefer.matcher(rentry.title()).matches()) r += 256 << query.ranking.coeff_prefer; // apply 'common-sense' heuristic using references final String urlstring = rentry.url().toNormalform(true, true); diff --git a/source/de/anomic/yacy/yacyClient.java b/source/de/anomic/yacy/yacyClient.java index d5393861a..d73d8ed4d 100644 --- a/source/de/anomic/yacy/yacyClient.java +++ b/source/de/anomic/yacy/yacyClient.java @@ -54,6 +54,7 @@ import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.TreeMap; +import java.util.regex.Pattern; import net.yacy.document.parser.xml.RSSFeed; import net.yacy.document.parser.xml.RSSReader; @@ -427,8 +428,8 @@ public final class yacyClient { final String wordhashes, final String excludehashes, final String urlhashes, - final String prefer, - final String filter, + final Pattern prefer, + final Pattern filter, final String language, final String sitehash, final String authorhash, @@ -472,8 +473,8 @@ public final class yacyClient { post.add(new DefaultCharsetStringPart("exclude", excludehashes)); post.add(new DefaultCharsetStringPart("duetime", "1000")); post.add(new DefaultCharsetStringPart("urls", urlhashes)); - post.add(new DefaultCharsetStringPart("prefer", prefer)); - post.add(new DefaultCharsetStringPart("filter", filter)); + post.add(new DefaultCharsetStringPart("prefer", prefer.toString())); + post.add(new DefaultCharsetStringPart("filter", filter.toString())); post.add(new DefaultCharsetStringPart("language", language)); post.add(new DefaultCharsetStringPart("sitehash", sitehash)); post.add(new DefaultCharsetStringPart("authorhash", authorhash)); diff --git a/source/de/anomic/yacy/yacySearch.java b/source/de/anomic/yacy/yacySearch.java index bfe1c4b9a..3a85b8a13 100644 --- a/source/de/anomic/yacy/yacySearch.java +++ b/source/de/anomic/yacy/yacySearch.java @@ -30,6 +30,7 @@ import java.util.Iterator; import java.util.Map; import java.util.TreeMap; import java.util.TreeSet; +import java.util.regex.Pattern; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.order.Bitfield; @@ -58,7 +59,8 @@ public class yacySearch extends Thread { private String[] urls; private final int count, maxDistance; final private RankingProfile rankingProfile; - final private String prefer, filter, language; + final private Pattern prefer, filter; + final private String language; final private Bitfield constraint; final private yacySeedDB peers; @@ -67,9 +69,9 @@ public class yacySearch extends Thread { public yacySearch( final String wordhashes, final String excludehashes, final String urlhashes, - final String prefer, final String filter, final String language, - final String sitehash, - final String authorhash, + final Pattern prefer, final Pattern filter, + final String language, + final String sitehash, final String authorhash, final int count, final int maxDistance, final boolean global, final int partitions, final yacySeed targetPeer, @@ -251,7 +253,7 @@ public class yacySearch extends Thread { public static yacySearch[] primaryRemoteSearches( final String wordhashes, final String excludehashes, final String urlhashes, - final String prefer, final String filter, String language, + final Pattern prefer, final Pattern filter, String language, final String sitehash, final String authorhash, final int count, final int maxDist, @@ -314,7 +316,7 @@ public class yacySearch extends Thread { if (targetPeer == null || targetPeer.hash == null) return null; if (clusterselection != null) targetPeer.setAlternativeAddress(clusterselection.get(targetPeer.hash.getBytes())); final yacySearch searchThread = new yacySearch( - wordhashes, excludehashes, urlhashes, "", "", "", "", "", 0, 9999, true, 0, targetPeer, + wordhashes, excludehashes, urlhashes, Pattern.compile(""), Pattern.compile(".*"), "", "", "", 0, 9999, true, 0, targetPeer, indexSegment, peers, crawlResults, containerCache, new TreeMap>(), blacklist, rankingProfile, constraint); searchThread.start(); return searchThread; diff --git a/source/net/yacy/kelondro/data/meta/URIMetadataRow.java b/source/net/yacy/kelondro/data/meta/URIMetadataRow.java index d9c3d6523..b3d5c5ef9 100644 --- a/source/net/yacy/kelondro/data/meta/URIMetadataRow.java +++ b/source/net/yacy/kelondro/data/meta/URIMetadataRow.java @@ -32,6 +32,7 @@ import java.text.ParseException; import java.util.Date; import java.util.Iterator; import java.util.Properties; +import java.util.regex.Pattern; import net.yacy.kelondro.data.word.WordReferenceRow; import net.yacy.kelondro.data.word.WordReferenceVars; @@ -369,19 +370,14 @@ public class URIMetadataRow implements URIMetadata { if (this.comp != null) return this.comp; // parse elements from comp string; final Iterator cl = FileUtils.strings(this.entry.getCol("comp", null)); - try { - this.comp = new Components( + this.comp = new Components( (cl.hasNext()) ? cl.next() : "", hash(), (cl.hasNext()) ? cl.next() : "", (cl.hasNext()) ? cl.next() : "", (cl.hasNext()) ? cl.next() : "", (cl.hasNext()) ? cl.next() : ""); - return this.comp; - } catch (MalformedURLException e) { - Log.logWarning("URLMetadataRow", "corrupted component / url: " + e.getMessage(), e); - return null; - } + return this.comp; } public Date moddate() { @@ -521,11 +517,14 @@ public class URIMetadataRow implements URIMetadata { } public class Components { - private final DigestURI url; + private DigestURI url; + private String urlRaw, urlHash; private final String dc_title, dc_creator, dc_subject, ETag; - public Components(final String url, final String urlhash, final String title, final String author, final String tags, final String ETag) throws MalformedURLException { - this.url = new DigestURI(url, urlhash); + public Components(final String urlRaw, final String urlhash, final String title, final String author, final String tags, final String ETag) { + this.url = null; + this.urlRaw = urlRaw; + this.urlHash = urlhash; this.dc_title = title; this.dc_creator = author; this.dc_subject = tags; @@ -533,12 +532,30 @@ public class URIMetadataRow implements URIMetadata { } public Components(final DigestURI url, final String descr, final String author, final String tags, final String ETag) { this.url = url; + this.urlRaw = null; + this.urlHash = null; this.dc_title = descr; this.dc_creator = author; this.dc_subject = tags; this.ETag = ETag; } - public DigestURI url() { return this.url; } + public boolean matches(Pattern matcher) { + if (this.urlRaw != null) return matcher.matcher(this.urlRaw).matches(); + if (this.url != null) return matcher.matcher(this.url.toNormalform(true, true)).matches(); + return false; + } + public DigestURI url() { + if (this.url == null) { + try { + this.url = new DigestURI(this.urlRaw, this.urlHash); + } catch (MalformedURLException e) { + this.url = null; + } + this.urlRaw = null; + this.urlHash = null; + } + return this.url; + } public String dc_title() { return this.dc_title; } public String dc_creator() { return this.dc_creator; } public String dc_subject() { return this.dc_subject; }