added a fascinating new way to search _and_ start a web crawl at the same time:

implemented a hint from dulcedo: "use the site: operator as crawl start point".
YaCy was already able to search using a site constraint. This function is now extended with an instant crawling feature.
When you now use the site: operator, the landing page of the site and every page that is linked from that page are loaded, indexed and considered for the search result within the same search request. If the remote server responds quickly enough, this process can deliver results while the normal search results are still being prepared, within just a few seconds.
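
To illustrate the query syntax: a search like "yacy peer site:yacy.net" now both constrains the result set to that host and seeds an instant crawl of it. Below is a minimal, self-contained sketch (the class name and example query are made up; only the parsing rules follow the yacysearch hunk further down) of how the site: operator is cut out of the query string:

// Hypothetical standalone sketch, not YaCy code: site:-operator parsing
// as implemented in the yacysearch hunk below.
public class SiteOperatorSketch {
    public static void main(String[] args) {
        String querystring = "yacy peer site:.yacy.net.";
        String sitehost = null;
        int site = querystring.indexOf("site:");
        if (site >= 0) {
            int ftb = querystring.indexOf(' ', site);
            if (ftb == -1) ftb = querystring.length();
            sitehost = querystring.substring(site + 5, ftb);
            // remove the operator from the query; trim() added here only for tidy output
            querystring = querystring.replace("site:" + sitehost, "").trim();
            // strip leading and trailing dots from the host, as the diff does
            while (sitehost.length() > 0 && sitehost.charAt(0) == '.') sitehost = sitehost.substring(1);
            while (sitehost.endsWith(".")) sitehost = sitehost.substring(0, sitehost.length() - 1);
        }
        // prints: query='yacy peer' host='yacy.net'; yacysearch then hands the
        // host to sb.quickFillSite(sitehost, theSearch) to start the instant crawl.
        System.out.println("query='" + querystring + "' host='" + sitehost + "'");
    }
}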


git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6941 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 15 years ago
parent 8e3cbbb6a9
commit 3a9dc52ac2

@@ -322,14 +322,15 @@ public class yacysearch {
         }
         int site = querystring.indexOf("site:");
         String sitehash = null;
+        String sitehost = null;
         if (site >= 0) {
             int ftb = querystring.indexOf(' ', site);
             if (ftb == -1) ftb = querystring.length();
-            String domain = querystring.substring(site + 5, ftb);
-            querystring = querystring.replace("site:" + domain, "");
-            while (domain.length() > 0 && domain.charAt(0) == '.') domain = domain.substring(1);
-            while (domain.endsWith(".")) domain = domain.substring(0, domain.length() - 1);
-            sitehash = DigestURI.domhash(domain);
+            sitehost = querystring.substring(site + 5, ftb);
+            querystring = querystring.replace("site:" + sitehost, "");
+            while (sitehost.length() > 0 && sitehost.charAt(0) == '.') sitehost = sitehost.substring(1);
+            while (sitehost.endsWith(".")) sitehost = sitehost.substring(0, sitehost.length() - 1);
+            sitehash = DigestURI.domhash(sitehost);
         }
         int authori = querystring.indexOf("author:");
         String authorhash = null;
@@ -502,6 +503,7 @@ public class yacysearch {
         final SearchEvent theSearch = SearchEventCache.getEvent(theQuery, sb.peers, sb.crawlResults, (sb.isRobinsonMode()) ? sb.clusterhashes : null, false, sb.loader);
         try {Thread.sleep(global ? 100 : 10);} catch (InterruptedException e1) {} // wait a little time to get first results in the search
+        if (sitehost != null && authenticated) sb.quickFillSite(sitehost, theSearch);
         // generate result object
         //serverLog.logFine("LOCAL_SEARCH", "SEARCH TIME AFTER ORDERING OF SEARCH RESULTS: " + (System.currentTimeMillis() - timestamp) + " ms");
         //serverLog.logFine("LOCAL_SEARCH", "SEARCH TIME AFTER RESULT PREPARATION: " + (System.currentTimeMillis() - timestamp) + " ms");

@@ -137,7 +137,8 @@ public class DocumentIndex extends Segment {
                 new Date(),
                 url.length(),
                 document,
-                condenser
+                condenser,
+                null
                 );
     }

@@ -26,10 +26,13 @@
 package de.anomic.search;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.Map;
 import java.util.TreeSet;
 import java.util.regex.Pattern;
 import net.yacy.cora.document.MultiProtocolURI;
 import net.yacy.document.Condenser;
 import net.yacy.document.parser.html.AbstractScraper;
 import net.yacy.document.parser.html.CharacterCoding;
@@ -280,7 +283,7 @@ public final class QueryParams {
      * @param text
      * @return true if the query matches with the given text
      */
-    public final boolean matches(final String text) {
+    public final boolean matchesText(final String text) {
         final HandleSet wordhashes = Word.words2hashesHandles(Condenser.getWords(text).keySet());
         if (SetTools.anymatch(wordhashes, this.excludeHashes)) return false;
         return SetTools.totalInclusion(this.queryHashes, wordhashes);
@@ -352,6 +355,25 @@ public final class QueryParams {
         for (byte[] b: blues) queryHashes.remove(b);
     }
+    public final Map<MultiProtocolURI, String> separateMatches(Map<MultiProtocolURI, String> links) {
+        Map<MultiProtocolURI, String> matcher = new HashMap<MultiProtocolURI, String>();
+        Iterator <Map.Entry<MultiProtocolURI, String>> i = links.entrySet().iterator();
+        Map.Entry<MultiProtocolURI, String> entry;
+        MultiProtocolURI url;
+        String anchorText;
+        while (i.hasNext()) {
+            entry = i.next();
+            url = entry.getKey();
+            anchorText = entry.getValue();
+            if (this.matchesText(anchorText)) {
+                matcher.put(url, anchorText);
+                i.remove();
+            }
+        }
+        return matcher;
+    }
     public String id(final boolean anonymized) {
         // generate a string that identifies a search so results can be re-used in a cache
         String context =

@@ -49,6 +49,7 @@ import net.yacy.kelondro.data.word.WordReferenceFactory;
 import net.yacy.kelondro.data.word.WordReferenceRow;
 import net.yacy.kelondro.data.word.WordReferenceVars;
 import net.yacy.kelondro.index.HandleSet;
+import net.yacy.kelondro.index.RowSpaceExceededException;
 import net.yacy.kelondro.logging.Log;
 import net.yacy.kelondro.order.Base64Order;
 import net.yacy.kelondro.order.ByteOrder;
@@ -195,7 +196,16 @@ public class Segment {
      * @param outlinksOther
      * @return
      */
-    private int addPageIndex(final DigestURI url, final Date urlModified, final Document document, final Condenser condenser, final String language, final char doctype, final int outlinksSame, final int outlinksOther) {
+    private int addPageIndex(
+            final DigestURI url,
+            final Date urlModified,
+            final Document document,
+            final Condenser condenser,
+            final String language,
+            final char doctype,
+            final int outlinksSame,
+            final int outlinksOther,
+            final SearchEvent searchEvent) {
         int wordCount = 0;
         final int urlLength = url.toNormalform(true, true).length();
         final int urlComps = MultiProtocolURI.urlComps(url.toString()).length;
@@ -215,18 +225,30 @@ public class Segment {
                 doctype,
                 outlinksSame, outlinksOther);
         Word wprop;
+        byte[] wordhash;
         while (i.hasNext()) {
             wentry = i.next();
             word = wentry.getKey();
             wprop = wentry.getValue();
             assert (wprop.flags != null);
             ientry.setWord(wprop);
+            wordhash = Word.word2hash(word);
             try {
-                this.termIndex.add(Word.word2hash(word), ientry);
+                this.termIndex.add(wordhash, ientry);
             } catch (Exception e) {
                 Log.logException(e);
             }
             wordCount++;
+            if (searchEvent != null && !searchEvent.getQuery().excludeHashes.has(wordhash) && searchEvent.getQuery().queryHashes.has(wordhash)) {
+                ReferenceContainer<WordReference> container;
+                try {
+                    container = ReferenceContainer.emptyContainer(Segment.wordReferenceFactory, wordhash, 1);
+                    container.add(ientry);
+                } catch (RowSpaceExceededException e) {
+                    continue;
+                }
+                searchEvent.getRankingResult().add(container, false, -1);
+            }
         }
         return wordCount;
@@ -245,7 +267,8 @@ public class Segment {
             final Date loadDate,
             final long sourcesize,
             final Document document,
-            final Condenser condenser
+            final Condenser condenser,
+            final SearchEvent searchEvent
             ) throws IOException {
         final long startTime = System.currentTimeMillis();
@@ -333,9 +356,10 @@ public class Segment {
                 document, // document content
                 condenser, // document condenser
                 language, // document language
-                Response.docType(document.dc_format()), // document type
+                Response.docType(document.dc_format()), // document type
                 document.inboundLinks(), // inbound links
-                document.outboundLinks() // outbound links
+                document.outboundLinks(), // outbound links
+                searchEvent // a search event that can have results directly
                 );
         final long indexingEndTime = System.currentTimeMillis();

@@ -209,7 +209,8 @@ public class Segments implements Iterable<Segment> {
             final Date loadDate,
             final long sourcesize,
             final Document document,
-            final Condenser condenser
+            final Condenser condenser,
+            final SearchEvent searchEvent
             ) throws IOException {
         return segment(segmentName).storeDocument(
             url,
@@ -218,7 +219,8 @@ public class Segments implements Iterable<Segment> {
             loadDate,
             sourcesize,
             document,
-            condenser
+            condenser,
+            searchEvent
             );
     }

@@ -117,6 +117,7 @@ import de.anomic.crawler.ResourceObserver;
 import de.anomic.crawler.ResultImages;
 import de.anomic.crawler.ResultURLs;
 import de.anomic.crawler.RobotsTxt;
+import de.anomic.crawler.CrawlProfile.CacheStrategy;
 import de.anomic.crawler.CrawlProfile.entry;
 import de.anomic.crawler.retrieval.EventOrigin;
 import de.anomic.crawler.retrieval.HTTPLoader;
@@ -1815,11 +1816,11 @@ public final class Switchboard extends serverSwitch {
     public void storeDocumentIndex(final indexingQueueEntry in) {
         in.queueEntry.updateStatus(Response.QUEUE_STATE_INDEXSTORAGE);
-        storeDocumentIndex(in.process, in.queueEntry, in.document, in.condenser);
+        storeDocumentIndex(in.process, in.queueEntry, in.document, in.condenser, null);
         in.queueEntry.updateStatus(Response.QUEUE_STATE_FINISHED);
     }
-    private void storeDocumentIndex(final Segments.Process process, final Response queueEntry, final Document document, final Condenser condenser) {
+    private void storeDocumentIndex(final Segments.Process process, final Response queueEntry, final Document document, final Condenser condenser, final SearchEvent searchEvent) {
         // CREATE INDEX
         final String dc_title = document.dc_title();
@@ -1834,7 +1835,7 @@ public final class Switchboard extends serverSwitch {
         }
         if (!queueEntry.profile().indexText() && !queueEntry.profile().indexMedia()) {
-            if (this.log.isInfo()) log.logInfo("Not Indexed Resource '" + queueEntry.url().toNormalform(false, true) + "': denied by profile rule, process case=" + processCase);
+            if (this.log.isInfo()) log.logInfo("Not Indexed Resource '" + queueEntry.url().toNormalform(false, true) + "': denied by profile rule, process case=" + processCase + ", profile name = " + queueEntry.profile().name());
             addURLtoErrorDB(queueEntry.url(), (referrerURL == null) ? null : referrerURL.hash(), queueEntry.initiator(), dc_title, "denied by profile rule");
             return;
         }
@@ -1852,7 +1853,8 @@ public final class Switchboard extends serverSwitch {
                 new Date(),
                 queueEntry.size(),
                 document,
-                condenser);
+                condenser,
+                searchEvent);
             RSSFeed.channels(Base64Order.enhancedCoder.equal(queueEntry.initiator(), peers.mySeed().hash.getBytes()) ? RSSFeed.LOCALINDEXING : RSSFeed.REMOTEINDEXING).addMessage(new RSSMessage("Indexed web page", dc_title, queueEntry.url().toNormalform(true, false)));
         } catch (final IOException e) {
             if (this.log.isFine()) log.logFine("Not Indexed Resource '" + queueEntry.url().toNormalform(false, true) + "': process case=" + processCase);
@@ -1892,6 +1894,66 @@ public final class Switchboard extends serverSwitch {
         }
     }
+    /**
+     * load the content of a URL, parse the content and add the content to the index
+     * This process is started concurrently. The method returns immediately after the call.
+     * @param url the url that shall be indexed
+     * @param searchEvent (optional) a search event that shall get results from the indexed pages directly feeded. If object is null then it is ignored
+     * @throws IOException
+     * @throws ParserException
+     */
+    public void addToIndex(final DigestURI url, final SearchEvent searchEvent) throws IOException, ParserException {
+        new Thread() {public void run() {
+            try {
+                Segments.Process process = Segments.Process.LOCALCRAWLING;
+                if (indexSegments.segment(process).urlMetadata.exists(url.hash())) return; // don't do double-work
+                Request request = loader.request(url, true, true);
+                Response response = loader.load(request, CacheStrategy.IFFRESH, Long.MAX_VALUE);
+                if (response == null) throw new IOException("response == null");
+                if (response.getContent() == null) throw new IOException("content == null");
+                if (response.getResponseHeader() == null) throw new IOException("header == null");
+                Document document = response.parse();
+                if (document.indexingDenied()) throw new ParserException("indexing is denied", url);
+                Condenser condenser = new Condenser(document, true, true);
+                ResultImages.registerImages(document, true);
+                webStructure.generateCitationReference(document, condenser, response.lastModified());
+                storeDocumentIndex(process, response, document, condenser, searchEvent);
+                log.logInfo("QuickFill of url " + url.toNormalform(true, true) + " finished");
+            } catch (IOException e) {
+                Log.logException(e);
+            } catch (ParserException e) {
+                Log.logException(e);
+            }
+        }}.start();
+    }
+    public final void addAllToIndex(final DigestURI url, final Map<MultiProtocolURI, String> links, final SearchEvent searchEvent) {
+        // add the landing page to the index. should not load that again since it should be in the cache
+        try {
+            this.addToIndex(url, searchEvent);
+        } catch (IOException e) {} catch (ParserException e) {}
+        // check if some of the links match with the query
+        Map<MultiProtocolURI, String> matcher = searchEvent.getQuery().separateMatches(links);
+        // take the matcher and load them all
+        for (Map.Entry<MultiProtocolURI, String> entry: matcher.entrySet()) {
+            try {
+                this.addToIndex(new DigestURI(entry.getKey(), (byte[]) null), searchEvent);
+            } catch (IOException e) {} catch (ParserException e) {}
+        }
+        // take then the no-matcher and load them also
+        for (Map.Entry<MultiProtocolURI, String> entry: links.entrySet()) {
+            try {
+                this.addToIndex(new DigestURI(entry.getKey(), (byte[]) null), searchEvent);
+            } catch (IOException e) {} catch (ParserException e) {}
+        }
+    }
     public class receiptSending implements Runnable {
         yacySeed initiatorPeer;
         URIMetadataRow reference;
@@ -2103,6 +2165,39 @@ public final class Switchboard extends serverSwitch {
         crawlQueues.errorURL.push(bentry, initiator, new Date(), 0, failreason);
     }
+    public final void quickFillSite(final String host, final SearchEvent searchEvent) {
+        new Thread() {public void run() {
+            String r = host;
+            if (r.indexOf("//") < 0) r = "http://" + r;
+            // get the links for a specific site
+            DigestURI url;
+            try {
+                url = new DigestURI(r, null);
+            } catch (MalformedURLException e) {
+                Log.logException(e);
+                return;
+            }
+            Map<MultiProtocolURI, String> links = null;
+            try {
+                links = loader.loadLinks(url, CrawlProfile.CacheStrategy.NOCACHE);
+            } catch (IOException e) {
+                Log.logException(e);
+                return;
+            }
+            Iterator<MultiProtocolURI> i = links.keySet().iterator();
+            MultiProtocolURI u;
+            while (i.hasNext()) {
+                u = i.next();
+                if (!u.getHost().endsWith(host)) i.remove();
+            }
+            // add all pages to the index
+            addAllToIndex(url, links, searchEvent);
+        }}.start();
+    }
     public int currentPPM() {
         return EventTracker.countEvents("indexed", 20000) * 3;
     }
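
Read together, the hunks wire up the following call chain (a reading aid reconstructed from the diff above, not text from the commit):

// yacysearch: parse "site:<host>" out of the query string, then
//   sb.quickFillSite(sitehost, theSearch)            // only for authenticated users
//     -> loader.loadLinks(url, NOCACHE)              // fetch the landing page and its links
//     -> addAllToIndex(url, links, searchEvent)
//          -> searchEvent.getQuery().separateMatches(links)  // query-matching anchors first
//          -> addToIndex(url, searchEvent)           // one concurrent thread per URL
//               -> storeDocumentIndex(..., searchEvent)
//                    -> Segments/Segment.storeDocument -> addPageIndex,
//                       which feeds matching words straight into
//                       searchEvent.getRankingResult(), so hits can appear
//                       while the search is still running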
