From 0f5b6f38c18f0435a77f9f9cf99f551b23a099be Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Thu, 3 Jan 2013 19:21:21 +0100 Subject: [PATCH] enhanced root-url detection --- htroot/IndexControlRWIs_p.java | 3 +- .../yacy/kelondro/data/meta/DigestURI.java | 32 ++++++++++++------- .../net/yacy/kelondro/index/RowHandleMap.java | 2 +- .../yacy/search/index/SolrConfiguration.java | 16 ++++++---- .../yacy/search/ranking/ReferenceOrder.java | 6 ++-- 5 files changed, 35 insertions(+), 24 deletions(-) diff --git a/htroot/IndexControlRWIs_p.java b/htroot/IndexControlRWIs_p.java index 2f28e0663..d81fae130 100644 --- a/htroot/IndexControlRWIs_p.java +++ b/htroot/IndexControlRWIs_p.java @@ -541,8 +541,7 @@ public class IndexControlRWIs_p { + ((entry.word().flags().get(WordReferenceRow.flag_app_dc_creator)) ? "appears in author, " : "") + ((entry.word().flags().get(WordReferenceRow.flag_app_dc_subject)) ? "appears in subject, " : "") + ((entry.word().flags().get(WordReferenceRow.flag_app_dc_description)) ? "appears in description, " : "") - + ((entry.word().flags().get(WordReferenceRow.flag_app_emphasized)) ? "appears emphasized, " : "") - + ((DigestURI.probablyRootURL(entry.word().urlhash())) ? "probably root url" : "")); + + ((entry.word().flags().get(WordReferenceRow.flag_app_emphasized)) ? "appears emphasized, " : "")); if ( Switchboard.urlBlacklist.isListed(BlacklistType.DHT, url) ) { prop.put("genUrlList_urlList_" + i + "_urlExists_urlhxChecked", "1"); } diff --git a/source/net/yacy/kelondro/data/meta/DigestURI.java b/source/net/yacy/kelondro/data/meta/DigestURI.java index a43674d29..190ef46d7 100644 --- a/source/net/yacy/kelondro/data/meta/DigestURI.java +++ b/source/net/yacy/kelondro/data/meta/DigestURI.java @@ -40,6 +40,7 @@ import net.yacy.cora.order.Base64Order; import net.yacy.cora.order.Digest; import net.yacy.cora.protocol.Domains; import net.yacy.cora.util.CommonPattern; +import net.yacy.kelondro.index.RowHandleSet; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.util.ByteArray; @@ -278,20 +279,29 @@ public class DigestURI extends MultiProtocolURI implements Serializable { return Base64Order.enhancedCoder.encode(Digest.encodeMD5Raw(sb.toString())).charAt(0); } - private static final char rootURLFlag0 = subdomPortPath("", 80, ""); - private static final char rootURLFlag1 = subdomPortPath("www", 80, ""); - private static final char rootURLFlag2 = subdomPortPath("", 21, ""); - private static final char rootURLFlag3 = subdomPortPath("ftp", 21, ""); - - public final Pattern rootPattern = Pattern.compile("/|/index.htm(l?)|/index.php"); + public final Pattern rootPattern = Pattern.compile("/|/index.htm(l?)|/index.php|/home.htm(l?)|/home.php|/default.htm(l?)|/default.php"); public final boolean probablyRootURL() { - return this.path.length() == 0 || rootPattern.matcher(this.path).matches() || probablyRootURL(this.hash); + return this.path.length() <= 1 || rootPattern.matcher(this.path).matches(); } - - public static final boolean probablyRootURL(final byte[] urlHash) { - final char c = (char) urlHash[5]; - return c == rootURLFlag0 || c == rootURLFlag1 || c == rootURLFlag2 || c == rootURLFlag3; + + public RowHandleSet getPossibleRootHashes() { + RowHandleSet rootCandidates = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 10); + String rootStub = this.getProtocol() + "://" + this.getHost(); + try { + rootCandidates.put(new DigestURI(rootStub).hash()); + rootCandidates.put(new DigestURI(rootStub + "/").hash()); + rootCandidates.put(new DigestURI(rootStub + "/index.htm").hash()); + rootCandidates.put(new DigestURI(rootStub + "/index.html").hash()); + rootCandidates.put(new DigestURI(rootStub + "/index.php").hash()); + rootCandidates.put(new DigestURI(rootStub + "/home.htm").hash()); + rootCandidates.put(new DigestURI(rootStub + "/home.html").hash()); + rootCandidates.put(new DigestURI(rootStub + "/home.php").hash()); + rootCandidates.put(new DigestURI(rootStub + "/default.htm").hash()); + rootCandidates.put(new DigestURI(rootStub + "/default.html").hash()); + rootCandidates.put(new DigestURI(rootStub + "/default.php").hash()); + } catch (Throwable e) {} + return rootCandidates; } private static final String hosthash5(final String protocol, final String host, final int port) { diff --git a/source/net/yacy/kelondro/index/RowHandleMap.java b/source/net/yacy/kelondro/index/RowHandleMap.java index b7706eefb..0cf8f825d 100644 --- a/source/net/yacy/kelondro/index/RowHandleMap.java +++ b/source/net/yacy/kelondro/index/RowHandleMap.java @@ -267,7 +267,7 @@ public final class RowHandleMap implements HandleMap, Iterable 0; // it does not make sense to add 0. If this occurres, it is a performance issue + assert a >= 0; // it does not make sense to add 0. If this occurres, it is a performance issue synchronized (this.index) { final Row.Entry indexentry = this.index.get(key, true); if (indexentry == null) { diff --git a/source/net/yacy/search/index/SolrConfiguration.java b/source/net/yacy/search/index/SolrConfiguration.java index 099967c28..249d2ae74 100644 --- a/source/net/yacy/search/index/SolrConfiguration.java +++ b/source/net/yacy/search/index/SolrConfiguration.java @@ -330,15 +330,16 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable String docurl = digestURI.toNormalform(true); add(doc, YaCySchema.sku, docurl); - if (allAttr || contains(YaCySchema.clickdepth_i)) { - boolean fronturl = digestURI.probablyRootURL(); - if (fronturl) { + if ((allAttr || contains(YaCySchema.clickdepth_i)) && citations != null) { + if (digestURI.probablyRootURL()) { + boolean lc = this.lazy; this.lazy = false; add(doc, YaCySchema.clickdepth_i, 0); + this.lazy = lc; } else { // search the citations for references int clickdepth = -1; try { - clickdepth = getClickDepth(citations, digestURI.hash()); + clickdepth = getClickDepth(citations, digestURI); } catch (IOException e) { add(doc, YaCySchema.clickdepth_i, -1); } @@ -840,8 +841,11 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable * @return the clickdepth level or -1 if the root url cannot be found or a recursion limit is reached * @throws IOException */ - private int getClickDepth(final IndexCell citations, byte[] searchhash) throws IOException { + private static int getClickDepth(final IndexCell citations, final DigestURI url) throws IOException { + final byte[] searchhash = url.hash(); + RowHandleSet rootCandidates = url.getPossibleRootHashes(); + RowHandleSet ignore = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 100); // a set of urlhashes to be ignored. This is generated from all hashes that are seen during recursion to prevent enless loops RowHandleSet levelhashes = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 1); // all hashes of a clickdepth. The first call contains the target hash only and therefore just one entry try {levelhashes.put(searchhash);} catch (SpaceExceededException e) {throw new IOException(e);} @@ -873,7 +877,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable if (!ByteBuffer.equals(u, 6, hosthash, 0, 6)) continue nextloop; // check if the url is a root url - if (DigestURI.probablyRootURL(u)) { + if (rootCandidates.has(u)) { return leveldepth + 1; } diff --git a/source/net/yacy/search/ranking/ReferenceOrder.java b/source/net/yacy/search/ranking/ReferenceOrder.java index 961fdf046..94238cda3 100644 --- a/source/net/yacy/search/ranking/ReferenceOrder.java +++ b/source/net/yacy/search/ranking/ReferenceOrder.java @@ -255,8 +255,7 @@ public class ReferenceOrder { + ((flags.get(Condenser.flag_cat_hasaudio)) ? 255 << this.ranking.coeff_cathasaudio : 0) + ((flags.get(Condenser.flag_cat_hasvideo)) ? 255 << this.ranking.coeff_cathasvideo : 0) + ((flags.get(Condenser.flag_cat_hasapp)) ? 255 << this.ranking.coeff_cathasapp : 0) - + ((ByteBuffer.equals(t.getLanguage(), this.language)) ? 255 << this.ranking.coeff_language : 0) - + ((DigestURI.probablyRootURL(t.urlhash())) ? 15 << this.ranking.coeff_urllength : 0); + + ((ByteBuffer.equals(t.getLanguage(), this.language)) ? 255 << this.ranking.coeff_language : 0); //if (searchWords != null) r += (yacyURL.probablyWordURL(t.urlHash(), searchWords) != null) ? 256 << ranking.coeff_appurl : 0; @@ -290,8 +289,7 @@ public class ReferenceOrder { + ((flags.get(Condenser.flag_cat_hasaudio)) ? 255 << this.ranking.coeff_cathasaudio : 0) + ((flags.get(Condenser.flag_cat_hasvideo)) ? 255 << this.ranking.coeff_cathasvideo : 0) + ((flags.get(Condenser.flag_cat_hasapp)) ? 255 << this.ranking.coeff_cathasapp : 0) - + ((ByteBuffer.equals(t.language(), this.language)) ? 255 << this.ranking.coeff_language : 0) - + ((DigestURI.probablyRootURL(t.hash())) ? 15 << this.ranking.coeff_urllength : 0); + + ((ByteBuffer.equals(t.language(), this.language)) ? 255 << this.ranking.coeff_language : 0); return r; // the higher the number the better the ranking. }