Removed overhead by preventing generation of full search results when only the URL is requested
12 years ago · 4eab3aae60
parent a114bb23bb
commit 4eab3aae60
11 changed files with 85 additions and 56 deletions
--- a/htroot/IndexControlRWIs_p.java
+++ b/htroot/IndexControlRWIs_p.java
@@ -378,10 +378,9 @@ public class IndexControlRWIs_p {
                            } catch ( final SpaceExceededException e ) {
                                Log.logException(e);
                            }
-                            final URIMetadataNode e = segment.fulltext().getMetadata(b);
+                            url = segment.fulltext().getURL(b);
                            segment.fulltext().remove(b);
-                            if ( e != null ) {
-                                url = e.url();
+                            if ( url != null ) {
                                pw.println(url.getHost() + "/" + url.getFile());
                                for ( final String supportedBlacklistType : supportedBlacklistTypes ) {
                                    if ( ListManager.listSetContains(
@@ -413,10 +412,9 @@ public class IndexControlRWIs_p {
                            } catch ( final SpaceExceededException e ) {
                                Log.logException(e);
                            }
-                            final URIMetadataNode e = segment.fulltext().getMetadata(b);
+                            url = segment.fulltext().getURL(b);
                            segment.fulltext().remove(b);
-                            if ( e != null ) {
-                                url = e.url();
+                            if ( url != null ) {
                                pw.println(url.getHost() + "/.*");
                                for ( final BlacklistType supportedBlacklistType : BlacklistType.values() ) {
                                    if ( ListManager.listSetContains(
--- a/htroot/IndexControlURLs_p.java
+++ b/htroot/IndexControlURLs_p.java
@@ -175,11 +175,11 @@ public class IndexControlURLs_p {
        }

        if (post.containsKey("urlhashdelete")) {
-            final URIMetadataNode entry = segment.fulltext().getMetadata(ASCII.getBytes(urlhash));
-            if (entry == null) {
+            final DigestURI url = segment.fulltext().getURL(ASCII.getBytes(urlhash));
+            if (url == null) {
                prop.putHTML("result", "No Entry for URL hash " + urlhash + "; nothing deleted.");
            } else {
-                urlstring = entry.url().toNormalform(true);
+                urlstring = url.toNormalform(true);
                prop.put("urlstring", "");
                sb.urlRemove(segment, urlhash.getBytes());
                prop.putHTML("result", "Removed URL " + urlstring);
@@ -233,9 +233,9 @@ public class IndexControlURLs_p {

        // generate list
        if (post.containsKey("urlhashsimilar")) {
-            final Iterator<URIMetadataNode> entryIt = new RotateIterator<URIMetadataNode>(segment.fulltext().entries(), ASCII.String(Base64Order.zero((urlhash == null ? 0 : urlhash.length()))), (int) segment.RWICount());
+            final Iterator<DigestURI> entryIt = new RotateIterator<DigestURI>(segment.fulltext().urls(), ASCII.String(Base64Order.zero((urlhash == null ? 0 : urlhash.length()))), (int) segment.RWICount());
 			final StringBuilder result = new StringBuilder("Sequential List of URL-Hashes:<br />");
-			URIMetadataNode entry;
+			DigestURI entry;
 			int i = 0, rows = 0, cols = 0;
 			prop.put("urlhashsimilar", "1");
 			while (entryIt.hasNext() && i < 256) {
--- a/htroot/api/ymarks/add_ymark.java
+++ b/htroot/api/ymarks/add_ymark.java
@@ -33,7 +33,7 @@ public class add_ymark {

            if(post.containsKey("urlHash")) {
            	final String urlHash = post.get("urlHash",YMarkUtil.EMPTY_STRING);
-            	final DigestURI url = sb.index.fulltext().getMetadata(urlHash.getBytes()).url();
+            	final DigestURI url = sb.index.fulltext().getURL(urlHash.getBytes());
            	final String folders = post.get(YMarkEntry.BOOKMARK.FOLDERS.key(),YMarkEntry.BOOKMARK.FOLDERS.deflt());
            	final String tags = post.get(YMarkEntry.BOOKMARK.TAGS.key(),YMarkUtil.EMPTY_STRING);
            	try {
--- a/htroot/gsa/searchresult.java
+++ b/htroot/gsa/searchresult.java
@@ -115,7 +115,7 @@ public class searchresult {
        post.put(CommonParams.ROWS, post.remove("num"));
        post.put(CommonParams.ROWS, Math.min(post.getInt(CommonParams.ROWS, 10), (authenticated) ? 5000 : 100));
        post.put("defType", "edismax");
-        post.put("bq", YaCySchema.fuzzy_signature_unique_b.getSolrFieldName() + ":true^200.0"); // a bost query that moves double content to the back
+        post.put("bq", YaCySchema.fuzzy_signature_unique_b.getSolrFieldName() + ":true^100000.0"); // a bost query that moves double content to the back
        post.put(CommonParams.FL,
                YaCySchema.content_type.getSolrFieldName() + ',' +
                YaCySchema.id.getSolrFieldName() + ',' +
--- a/htroot/yacysearch.java
+++ b/htroot/yacysearch.java
@@ -677,12 +677,12 @@ public class yacysearch {
                    return prop;
                }
                final String bookmarkHash = post.get("bookmarkref", ""); // urlhash
-                final URIMetadataNode urlentry = indexSegment.fulltext().getMetadata(UTF8.getBytes(bookmarkHash));
-                if ( urlentry != null ) {
+                final DigestURI url = indexSegment.fulltext().getURL(UTF8.getBytes(bookmarkHash));
+                if ( url != null ) {
                    try {
                        sb.tables.bookmarks.createBookmark(
                            sb.loader,
-                            urlentry.url(),
+                            url,
                            YMarkTables.USER_ADMIN,
                            true,
                            "searchresult",
--- a/source/net/yacy/data/ymark/YMarkMetadata.java
+++ b/source/net/yacy/data/ymark/YMarkMetadata.java
@@ -82,7 +82,7 @@ public class YMarkMetadata {
 	public YMarkMetadata(final byte[] urlHash, final Segment indexSegment) {
 		this.document = null;
 		this.indexSegment = indexSegment;
-		this.uri = this.indexSegment.fulltext().getMetadata(urlHash).url();
+		this.uri = this.indexSegment.fulltext().getURL(urlHash);
 	}

 	public YMarkMetadata(final Document document) {
--- a/source/net/yacy/search/Switchboard.java
+++ b/source/net/yacy/search/Switchboard.java
@@ -1521,16 +1521,10 @@ public final class Switchboard extends serverSwitch {
    }

    public DigestURI getURL(final byte[] urlhash) {
-        if ( urlhash == null ) {
-            return null;
-        }
-        if ( urlhash.length == 0 ) {
-            return null;
-        }
-        final URIMetadataNode le = this.index.fulltext().getMetadata(urlhash);
-        if ( le != null ) {
-            return le.url();
-        }
+        if (urlhash == null) return null;
+        if (urlhash.length == 0) return null;
+        final DigestURI url = this.index.fulltext().getURL(urlhash);
+        if (url != null) return url;
        return this.crawlQueues.getURL(urlhash);
    }

--- a/source/net/yacy/search/index/Fulltext.java
+++ b/source/net/yacy/search/index/Fulltext.java
@@ -227,13 +227,26 @@ public final class Fulltext implements Iterable<byte[]> {
        Date now = new Date();
        return x.after(now) ? now : x;
    }
+
+    public DigestURI getURL(final byte[] urlHash) {
+        if (urlHash == null) return null;
+        SolrDocument doc;
+        try {
+            doc = this.solr.getById(ASCII.String(urlHash), YaCySchema.sku.getSolrFieldName());
+        } catch (IOException e) {
+            return null;
+        }
+        if (doc == null) return null;
+        String x = (String) doc.getFieldValue(YaCySchema.sku.getSolrFieldName());
+        if (x == null) return null;
+        try {
+            DigestURI uri = new DigestURI(x, urlHash);
+            return uri;
+        } catch (MalformedURLException e) {
+            return null;
+        }
+    }
    
-    /**
-     * generates an plasmaLURLEntry using the url hash
-     * if the url cannot be found, this returns null
-     * @param obrwi
-     * @return
-     */
    public URIMetadataNode getMetadata(WordReference wre, long weight) {
        if (wre == null) return null; // all time was already wasted in takeRWI to get another element
        return getMetadata(wre.urlhash(), wre, weight);
@@ -243,7 +256,7 @@ public final class Fulltext implements Iterable<byte[]> {
        if (urlHash == null) return null;
        return getMetadata(urlHash, null, 0);
    }
-
+    
    private URIMetadataNode getMetadata(final byte[] urlHash, WordReference wre, long weight) {

        // get the metadata from Solr
@@ -519,9 +532,37 @@ public final class Fulltext implements Iterable<byte[]> {
                true);
    }

+    public CloneableIterator<DigestURI> urls() {
+        // enumerates entry elements
+        final Iterator<byte[]> ids = iterator();
+        return new CloneableIterator<DigestURI>() {
+            @Override
+            public CloneableIterator<DigestURI> clone(final Object secondHash) {
+                return this;
+            }
+            @Override
+            public final boolean hasNext() {
+                return ids.hasNext();
+            }
+            @Override
+            public final DigestURI next() {
+                byte[] id = ids.next();
+                if (id == null) return null;
+                return getURL(id);
+            }
+            @Override
+            public final void remove() {
+                ids.remove();
+            }
+            @Override
+            public void close() {
+            }
+        };
+    }
+
    public CloneableIterator<URIMetadataNode> entries() {
        // enumerates entry elements
-    	final Iterator<byte[]> ids = iterator();
+        final Iterator<byte[]> ids = iterator();
        return new CloneableIterator<URIMetadataNode>() {
            @Override
            public CloneableIterator<URIMetadataNode> clone(final Object secondHash) {
@@ -783,15 +824,15 @@ public final class Fulltext implements Iterable<byte[]> {
        // collect hashes from all domains

        // fetch urls from the database to determine the host in clear text
-        URIMetadataNode urlref;
+        DigestURI url;
        if (count < 0 || count > domainSamples.size()) count = domainSamples.size();
        this.statsDump = new ArrayList<HostStat>();
        final TreeSet<String> set = new TreeSet<String>();
        for (final URLHashCounter hs: domainSamples.values()) {
            if (hs == null) continue;
-            urlref = this.getMetadata(hs.urlhashb);
-            if (urlref == null || urlref.url() == null || urlref.url().getHost() == null) continue;
-            set.add(urlref.url().getHost());
+            url = this.getURL(hs.urlhashb);
+            if (url == null || url.getHost() == null) continue;
+            set.add(url.getHost());
            count--;
            if (count == 0) break;
        }
@@ -820,7 +861,6 @@ public final class Fulltext implements Iterable<byte[]> {
     */
    public Map<String, HostStat> domainHashResolver(final Map<String, URLHashCounter> domainSamples) {
        final HashMap<String, HostStat> hostMap = new HashMap<String, HostStat>();
-        URIMetadataNode urlref;

        final ScoreMap<String> hosthashScore = new ConcurrentScoreMap<String>();
        for (final Map.Entry<String, URLHashCounter> e: domainSamples.entrySet()) {
@@ -828,8 +868,7 @@ public final class Fulltext implements Iterable<byte[]> {
        }
        DigestURI url;
        for (final Map.Entry<String, URLHashCounter> e: domainSamples.entrySet()) {
-            urlref = this.getMetadata(e.getValue().urlhashb);
-            url = urlref.url();
+            url = this.getURL(e.getValue().urlhashb);
            hostMap.put(e.getKey(), new HostStat(url.getHost(), url.getPort(), e.getKey(), hosthashScore.get(e.getKey())));
        }
        return hostMap;
@@ -841,7 +880,6 @@ public final class Fulltext implements Iterable<byte[]> {

        // fetch urls from the database to determine the host in clear text
        final Iterator<String> j = domainScore.keys(false); // iterate urlhash-examples in reverse order (biggest first)
-        URIMetadataNode urlref;
        String urlhash;
        count += 10; // make some more to prevent that we have to do this again after deletions too soon.
        if (count < 0 || domainScore.sizeSmaller(count)) count = domainScore.size();
@@ -850,10 +888,9 @@ public final class Fulltext implements Iterable<byte[]> {
        while (j.hasNext()) {
            urlhash = j.next();
            if (urlhash == null) continue;
-            urlref = this.getMetadata(ASCII.getBytes(urlhash));
-            if (urlref == null || urlref.url() == null || urlref.url().getHost() == null) continue;
+            url = this.getURL(ASCII.getBytes(urlhash));
+            if (url == null || url.getHost() == null) continue;
            if (this.statsDump == null) return new ArrayList<HostStat>().iterator(); // some other operation has destroyed the object
-            url = urlref.url();
            this.statsDump.add(new HostStat(url.getHost(), url.getPort(), urlhash.substring(6), domainScore.get(urlhash)));
            count--;
            if (count == 0) break;
--- a/source/net/yacy/search/index/Segment.java
+++ b/source/net/yacy/search/index/Segment.java
@@ -58,7 +58,6 @@ import net.yacy.document.Parser;
 import net.yacy.kelondro.data.citation.CitationReference;
 import net.yacy.kelondro.data.citation.CitationReferenceFactory;
 import net.yacy.kelondro.data.meta.DigestURI;
-import net.yacy.kelondro.data.meta.URIMetadataNode;
 import net.yacy.kelondro.data.word.Word;
 import net.yacy.kelondro.data.word.WordReference;
 import net.yacy.kelondro.data.word.WordReferenceFactory;
@@ -235,7 +234,7 @@ public class Segment {
                        return null;
                    }
                    if (id == null || id == AbstractSolrConnector.POISON_ID) return null;
-                    DigestURI u = Segment.this.fulltext.getMetadata(ASCII.getBytes(id)).url();
+                    DigestURI u = Segment.this.fulltext.getURL(ASCII.getBytes(id));
                    if (u.toNormalform(true).startsWith(urlstub)) return u;
                }
            }
@@ -508,13 +507,12 @@ public class Segment {

        if (urlhash == null) return 0;
        // determine the url string
-        final URIMetadataNode entry = fulltext().getMetadata(urlhash);
-        if (entry == null) return 0;
-        if (entry.url() == null) return 0;
+        final DigestURI url = fulltext().getURL(urlhash);
+        if (url == null) return 0;

        try {
            // parse the resource
-            final Document document = Document.mergeDocuments(entry.url(), null, loader.loadDocuments(loader.request(entry.url(), true, false), cacheStrategy, Integer.MAX_VALUE, null, CrawlQueues.queuedMinLoadDelay));
+            final Document document = Document.mergeDocuments(url, null, loader.loadDocuments(loader.request(url, true, false), cacheStrategy, Integer.MAX_VALUE, null, CrawlQueues.queuedMinLoadDelay));
            if (document == null) {
                // delete just the url entry
                fulltext().remove(urlhash);
--- a/source/net/yacy/search/query/QueryParams.java
+++ b/source/net/yacy/search/query/QueryParams.java
@@ -429,6 +429,8 @@ public final class QueryParams {

        // construct query
        final SolrQuery params = new SolrQuery();
+        params.setParam("defType", "edismax");
+        params.setParam("bq", YaCySchema.fuzzy_signature_unique_b.getSolrFieldName() + ":true^100000.0"); // a bost query that moves double content to the back
        params.setStart(this.offset);
        params.setRows(this.itemsPerPage);
        params.setFacet(false);
--- a/source/net/yacy/search/query/RankingProcess.java
+++ b/source/net/yacy/search/query/RankingProcess.java
@@ -51,7 +51,7 @@ import net.yacy.cora.storage.HandleSet;
 import net.yacy.cora.util.SpaceExceededException;
 import net.yacy.document.Condenser;
 import net.yacy.document.LibraryProvider;
-import net.yacy.kelondro.data.meta.URIMetadataNode;
+import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.data.meta.URIMetadataRow;
 import net.yacy.kelondro.data.word.Word;
 import net.yacy.kelondro.data.word.WordReference;
@@ -411,7 +411,7 @@ public final class RankingProcess extends Thread {
        final ScoreMap<String> result = new ConcurrentScoreMap<String>();

        final Iterator<String> domhashs = this.hostHashNavigator.keys(false);
-        URIMetadataNode row;
+        DigestURI url;
        byte[] urlhash;
        String hosthash, hostname;
        if ( this.hostHashResolver != null ) {
@@ -421,8 +421,8 @@ public final class RankingProcess extends Thread {
                    continue;
                }
                urlhash = this.hostHashResolver.get(hosthash);
-                row = urlhash == null ? null : this.query.getSegment().fulltext().getMetadata(urlhash);
-                hostname = row == null ? null : row.url().getHost();
+                url = urlhash == null ? null : this.query.getSegment().fulltext().getURL(urlhash);
+                hostname = url == null ? null : url.getHost();
                if ( hostname != null ) {
                    result.set(hostname, this.hostHashNavigator.get(hosthash));
                }