From 3a9dc52ac21d62a21f865627c98f2498105c54df Mon Sep 17 00:00:00 2001
From: orbiter
Date: Wed, 23 Jun 2010 11:19:32 +0000
Subject: [PATCH] added a fascinating new way to search _and_ start a web
 crawl at the same time: implemented a hint from dulcedo "use site: -
 operator as crawl start point".

YaCy was already able to search using a site constraint. This function is
now extended with an instant crawling feature: when the site operator is
used, the landing page of the site and every page linked from it are
loaded, indexed, and selected for the search result within that same
search request. If the remote server responds quickly enough, this process
can deliver results during the normal search result preparation, within
just a few seconds.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6941 6c8d7289-2bf4-0310-a012-ef5d649a1542
---
 htroot/yacysearch.java                     |  12 ++-
 source/de/anomic/search/DocumentIndex.java |   3 +-
 source/de/anomic/search/QueryParams.java   |  24 ++++-
 source/de/anomic/search/Segment.java       |  34 ++++++-
 source/de/anomic/search/Segments.java      |   6 +-
 source/de/anomic/search/Switchboard.java   | 103 ++++++++++++++++++++-
 6 files changed, 164 insertions(+), 18 deletions(-)

diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java
index 398c928e6..058eca2d3 100644
--- a/htroot/yacysearch.java
+++ b/htroot/yacysearch.java
@@ -322,14 +322,15 @@ public class yacysearch {
         }
         int site = querystring.indexOf("site:");
         String sitehash = null;
+        String sitehost = null;
         if (site >= 0) {
             int ftb = querystring.indexOf(' ', site);
             if (ftb == -1) ftb = querystring.length();
-            String domain = querystring.substring(site + 5, ftb);
-            querystring = querystring.replace("site:" + domain, "");
-            while (domain.length() > 0 && domain.charAt(0) == '.') domain = domain.substring(1);
-            while (domain.endsWith(".")) domain = domain.substring(0, domain.length() - 1);
-            sitehash = DigestURI.domhash(domain);
+            sitehost = querystring.substring(site + 5, ftb);
+            querystring = querystring.replace("site:" + sitehost, "");
+            while (sitehost.length() > 0 && sitehost.charAt(0) == '.') sitehost = sitehost.substring(1);
+            while (sitehost.endsWith(".")) sitehost = sitehost.substring(0, sitehost.length() - 1);
+            sitehash = DigestURI.domhash(sitehost);
         }
         int authori = querystring.indexOf("author:");
         String authorhash = null;
@@ -502,6 +503,7 @@ public class yacysearch {
             final SearchEvent theSearch = SearchEventCache.getEvent(theQuery, sb.peers, sb.crawlResults, (sb.isRobinsonMode()) ? sb.clusterhashes : null, false, sb.loader);
             try {Thread.sleep(global ? 100 : 10);} catch (InterruptedException e1) {} // wait a little time to get first results in the search
+            if (sitehost != null && authenticated) sb.quickFillSite(sitehost, theSearch);
 
             // generate result object
             //serverLog.logFine("LOCAL_SEARCH", "SEARCH TIME AFTER ORDERING OF SEARCH RESULTS: " + (System.currentTimeMillis() - timestamp) + " ms");
             //serverLog.logFine("LOCAL_SEARCH", "SEARCH TIME AFTER RESULT PREPARATION: " + (System.currentTimeMillis() - timestamp) + " ms");
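The yacysearch.java change keeps the existing sitehash computation but now also retains the raw host so it can serve as a crawl start point. The operator parsing itself is plain string handling; here is a self-contained sketch of the same normalization (the method name and the String[] return shape are illustrative, not part of the patch):

```java
// Extracts the site: operator from a query string, as in the hunk above:
// "plushies site:.example.com." -> query "plushies", host "example.com"
static String[] parseSiteOperator(String querystring) {
    int site = querystring.indexOf("site:");
    if (site < 0) return new String[] {querystring, null};
    int ftb = querystring.indexOf(' ', site);
    if (ftb == -1) ftb = querystring.length();
    String sitehost = querystring.substring(site + 5, ftb);
    querystring = querystring.replace("site:" + sitehost, "").trim();
    // strip leading and trailing dots from the host
    while (sitehost.length() > 0 && sitehost.charAt(0) == '.') sitehost = sitehost.substring(1);
    while (sitehost.endsWith(".")) sitehost = sitehost.substring(0, sitehost.length() - 1);
    return new String[] {querystring, sitehost};
}
```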
diff --git a/source/de/anomic/search/DocumentIndex.java b/source/de/anomic/search/DocumentIndex.java
index b0a3e5b0a..2e0a0867f 100644
--- a/source/de/anomic/search/DocumentIndex.java
+++ b/source/de/anomic/search/DocumentIndex.java
@@ -137,7 +137,8 @@ public class DocumentIndex extends Segment {
                 new Date(),
                 url.length(),
                 document,
-                condenser
+                condenser,
+                null
                 );
     }
 
diff --git a/source/de/anomic/search/QueryParams.java b/source/de/anomic/search/QueryParams.java
index e0f7cf323..3449449c5 100644
--- a/source/de/anomic/search/QueryParams.java
+++ b/source/de/anomic/search/QueryParams.java
@@ -26,10 +26,13 @@
 
 package de.anomic.search;
 
+import java.util.HashMap;
 import java.util.Iterator;
+import java.util.Map;
 import java.util.TreeSet;
 import java.util.regex.Pattern;
 
+import net.yacy.cora.document.MultiProtocolURI;
 import net.yacy.document.Condenser;
 import net.yacy.document.parser.html.AbstractScraper;
 import net.yacy.document.parser.html.CharacterCoding;
@@ -280,7 +283,7 @@ public final class QueryParams {
      * @param text
      * @return true if the query matches with the given text
      */
-    public final boolean matches(final String text) {
+    public final boolean matchesText(final String text) {
         final HandleSet wordhashes = Word.words2hashesHandles(Condenser.getWords(text).keySet());
         if (SetTools.anymatch(wordhashes, this.excludeHashes)) return false;
         return SetTools.totalInclusion(this.queryHashes, wordhashes);
@@ -352,6 +355,25 @@ public final class QueryParams {
         for (byte[] b: blues) queryHashes.remove(b);
     }
 
+    public final Map<MultiProtocolURI, String> separateMatches(Map<MultiProtocolURI, String> links) {
+        Map<MultiProtocolURI, String> matcher = new HashMap<MultiProtocolURI, String>();
+        Iterator<Map.Entry<MultiProtocolURI, String>> i = links.entrySet().iterator();
+        Map.Entry<MultiProtocolURI, String> entry;
+        MultiProtocolURI url;
+        String anchorText;
+        while (i.hasNext()) {
+            entry = i.next();
+            url = entry.getKey();
+            anchorText = entry.getValue();
+            if (this.matchesText(anchorText)) {
+                matcher.put(url, anchorText);
+                i.remove();
+            }
+        }
+        return matcher;
+    }
+
     public String id(final boolean anonymized) {
         // generate a string that identifies a search so results can be re-used in a cache
         String context =
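separateMatches is destructive by design: entries whose anchor text matches the query are moved out of the link map and returned, so the caller can load them with priority, while the non-matching remainder stays in the argument. A usage sketch, assuming an existing QueryParams instance `query` for the term "yacy" (URLs and anchor texts are illustrative; MultiProtocolURI construction may throw MalformedURLException in the real API):

```java
Map<MultiProtocolURI, String> links = new HashMap<MultiProtocolURI, String>();
links.put(new MultiProtocolURI("http://example.com/docs.html"), "yacy documentation");
links.put(new MultiProtocolURI("http://example.com/imprint.html"), "imprint");

Map<MultiProtocolURI, String> matcher = query.separateMatches(links);
// matcher now holds the entry whose anchor text contains the query term;
// links keeps only the "imprint" entry, to be crawled with lower priority.
```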
diff --git a/source/de/anomic/search/Segment.java b/source/de/anomic/search/Segment.java
index e7e89988f..74144c836 100644
--- a/source/de/anomic/search/Segment.java
+++ b/source/de/anomic/search/Segment.java
@@ -49,6 +49,7 @@ import net.yacy.kelondro.data.word.WordReferenceFactory;
 import net.yacy.kelondro.data.word.WordReferenceRow;
 import net.yacy.kelondro.data.word.WordReferenceVars;
 import net.yacy.kelondro.index.HandleSet;
+import net.yacy.kelondro.index.RowSpaceExceededException;
 import net.yacy.kelondro.logging.Log;
 import net.yacy.kelondro.order.Base64Order;
 import net.yacy.kelondro.order.ByteOrder;
@@ -195,7 +196,16 @@ public class Segment {
      * @param outlinksOther
      * @return
      */
-    private int addPageIndex(final DigestURI url, final Date urlModified, final Document document, final Condenser condenser, final String language, final char doctype, final int outlinksSame, final int outlinksOther) {
+    private int addPageIndex(
+            final DigestURI url,
+            final Date urlModified,
+            final Document document,
+            final Condenser condenser,
+            final String language,
+            final char doctype,
+            final int outlinksSame,
+            final int outlinksOther,
+            final SearchEvent searchEvent) {
         int wordCount = 0;
         final int urlLength = url.toNormalform(true, true).length();
         final int urlComps = MultiProtocolURI.urlComps(url.toString()).length;
@@ -215,18 +225,30 @@ public class Segment {
                         doctype,
                         outlinksSame, outlinksOther);
         Word wprop;
+        byte[] wordhash;
         while (i.hasNext()) {
             wentry = i.next();
             word = wentry.getKey();
             wprop = wentry.getValue();
             assert (wprop.flags != null);
             ientry.setWord(wprop);
+            wordhash = Word.word2hash(word);
             try {
-                this.termIndex.add(Word.word2hash(word), ientry);
+                this.termIndex.add(wordhash, ientry);
             } catch (Exception e) {
                 Log.logException(e);
             }
             wordCount++;
+            if (searchEvent != null && !searchEvent.getQuery().excludeHashes.has(wordhash) && searchEvent.getQuery().queryHashes.has(wordhash)) {
+                ReferenceContainer<WordReference> container;
+                try {
+                    container = ReferenceContainer.emptyContainer(Segment.wordReferenceFactory, wordhash, 1);
+                    container.add(ientry);
+                } catch (RowSpaceExceededException e) {
+                    continue;
+                }
+                searchEvent.getRankingResult().add(container, false, -1);
+            }
         }
 
         return wordCount;
@@ -245,7 +267,8 @@ public class Segment {
             final Date loadDate,
             final long sourcesize,
             final Document document,
-            final Condenser condenser
+            final Condenser condenser,
+            final SearchEvent searchEvent
             ) throws IOException {
         final long startTime = System.currentTimeMillis();
 
@@ -333,9 +356,10 @@ public class Segment {
                 document,                 // document content
                 condenser,                // document condenser
                 language,                 // document language
-                Response.docType(document.dc_format()), // document type
+                Response.docType(document.dc_format()),  // document type
                 document.inboundLinks(),  // inbound links
-                document.outboundLinks()  // outbound links
+                document.outboundLinks(), // outbound links
+                searchEvent               // a search event that can have results directly
                 );
         final long indexingEndTime = System.currentTimeMillis();
 
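The key move in Segment.addPageIndex is that indexing and searching briefly share a data path: while a page's terms are written to the term index, any term belonging to a live query is also wrapped in a one-entry reference container and pushed into the running search's ranking. A reduced sketch of that feed-while-indexing pattern, with standard collections standing in for HandleSet and ReferenceContainer (all names here are illustrative):

```java
import java.util.Set;

// feed-while-indexing: offer each indexed term to a live search
final class LiveFeedSketch {
    interface Ranking { void add(String wordhash, String urlRef); }

    static int indexTerms(Set<String> pageTerms, String urlRef,
                          Set<String> queryHashes, Set<String> excludeHashes,
                          Ranking liveRanking) {
        int wordCount = 0;
        for (String wordhash : pageTerms) {
            // normal path: write the reference into the term index (elided here)
            wordCount++;
            // new path: a running search that wants this term receives the
            // reference immediately instead of waiting for a re-query
            if (liveRanking != null
                    && !excludeHashes.contains(wordhash)
                    && queryHashes.contains(wordhash)) {
                liveRanking.add(wordhash, urlRef);
            }
        }
        return wordCount;
    }
}
```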
diff --git a/source/de/anomic/search/Segments.java b/source/de/anomic/search/Segments.java
index 9370ce812..b550eb2e0 100644
--- a/source/de/anomic/search/Segments.java
+++ b/source/de/anomic/search/Segments.java
@@ -209,7 +209,8 @@ public class Segments implements Iterable<Segment> {
             final Date loadDate,
             final long sourcesize,
             final Document document,
-            final Condenser condenser
+            final Condenser condenser,
+            final SearchEvent searchEvent
             ) throws IOException {
         return segment(segmentName).storeDocument(
             url,
@@ -218,7 +219,8 @@ public class Segments implements Iterable<Segment> {
             loadDate,
             sourcesize,
             document,
-            condenser
+            condenser,
+            searchEvent
         );
     }
 
diff --git a/source/de/anomic/search/Switchboard.java b/source/de/anomic/search/Switchboard.java
index 5341253b0..7fc4e143e 100644
--- a/source/de/anomic/search/Switchboard.java
+++ b/source/de/anomic/search/Switchboard.java
@@ -117,6 +117,7 @@ import de.anomic.crawler.ResourceObserver;
 import de.anomic.crawler.ResultImages;
 import de.anomic.crawler.ResultURLs;
 import de.anomic.crawler.RobotsTxt;
+import de.anomic.crawler.CrawlProfile.CacheStrategy;
 import de.anomic.crawler.CrawlProfile.entry;
 import de.anomic.crawler.retrieval.EventOrigin;
 import de.anomic.crawler.retrieval.HTTPLoader;
@@ -1815,11 +1816,11 @@ public final class Switchboard extends serverSwitch {
 
     public void storeDocumentIndex(final indexingQueueEntry in) {
         in.queueEntry.updateStatus(Response.QUEUE_STATE_INDEXSTORAGE);
-        storeDocumentIndex(in.process, in.queueEntry, in.document, in.condenser);
+        storeDocumentIndex(in.process, in.queueEntry, in.document, in.condenser, null);
         in.queueEntry.updateStatus(Response.QUEUE_STATE_FINISHED);
     }
 
-    private void storeDocumentIndex(final Segments.Process process, final Response queueEntry, final Document document, final Condenser condenser) {
+    private void storeDocumentIndex(final Segments.Process process, final Response queueEntry, final Document document, final Condenser condenser, final SearchEvent searchEvent) {
 
         // CREATE INDEX
         final String dc_title = document.dc_title();
@@ -1834,7 +1835,7 @@ public final class Switchboard extends serverSwitch {
         }
 
         if (!queueEntry.profile().indexText() && !queueEntry.profile().indexMedia()) {
-            if (this.log.isInfo()) log.logInfo("Not Indexed Resource '" + queueEntry.url().toNormalform(false, true) + "': denied by profile rule, process case=" + processCase);
+            if (this.log.isInfo()) log.logInfo("Not Indexed Resource '" + queueEntry.url().toNormalform(false, true) + "': denied by profile rule, process case=" + processCase + ", profile name = " + queueEntry.profile().name());
             addURLtoErrorDB(queueEntry.url(), (referrerURL == null) ? null : referrerURL.hash(), queueEntry.initiator(), dc_title, "denied by profile rule");
             return;
         }
@@ -1852,7 +1853,8 @@ public final class Switchboard extends serverSwitch {
                     new Date(),
                     queueEntry.size(),
                     document,
-                    condenser);
+                    condenser,
+                    searchEvent);
             RSSFeed.channels(Base64Order.enhancedCoder.equal(queueEntry.initiator(), peers.mySeed().hash.getBytes()) ? RSSFeed.LOCALINDEXING : RSSFeed.REMOTEINDEXING).addMessage(new RSSMessage("Indexed web page", dc_title, queueEntry.url().toNormalform(true, false)));
         } catch (final IOException e) {
             if (this.log.isFine()) log.logFine("Not Indexed Resource '" + queueEntry.url().toNormalform(false, true) + "': process case=" + processCase);
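The Segments and Switchboard hunks so far only thread the new parameter through; behavior for all existing callers is preserved because they pass null, which makes Segment.addPageIndex skip the feed branch entirely. The contrast between the two call sites (the first from the hunk above, the second from addToIndex in the following hunk; argument lists as they appear in the patch):

```java
// queue-driven indexing, unchanged behavior: no live search attached
storeDocumentIndex(in.process, in.queueEntry, in.document, in.condenser, null);

// quick-crawl indexing: freshly indexed pages feed the running search
storeDocumentIndex(process, response, document, condenser, searchEvent);
```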
@@ -1892,6 +1894,66 @@ public final class Switchboard extends serverSwitch {
         }
     }
 
+    /**
+     * load the content of a URL, parse the content and add the content to the index.
+     * This process is started concurrently; the method returns immediately after the call.
+     * @param url the url that shall be indexed
+     * @param searchEvent (optional) a search event to which results from the indexed pages
+     *        are fed directly; if the object is null it is ignored
+     * @throws IOException
+     * @throws ParserException
+     */
+    public void addToIndex(final DigestURI url, final SearchEvent searchEvent) throws IOException, ParserException {
+        new Thread() {public void run() {
+            try {
+                Segments.Process process = Segments.Process.LOCALCRAWLING;
+                if (indexSegments.segment(process).urlMetadata.exists(url.hash())) return; // don't do double-work
+                Request request = loader.request(url, true, true);
+                Response response = loader.load(request, CacheStrategy.IFFRESH, Long.MAX_VALUE);
+                if (response == null) throw new IOException("response == null");
+                if (response.getContent() == null) throw new IOException("content == null");
+                if (response.getResponseHeader() == null) throw new IOException("header == null");
+                Document document = response.parse();
+                if (document.indexingDenied()) throw new ParserException("indexing is denied", url);
+                Condenser condenser = new Condenser(document, true, true);
+                ResultImages.registerImages(document, true);
+                webStructure.generateCitationReference(document, condenser, response.lastModified());
+                storeDocumentIndex(process, response, document, condenser, searchEvent);
+                log.logInfo("QuickFill of url " + url.toNormalform(true, true) + " finished");
+            } catch (IOException e) {
+                Log.logException(e);
+            } catch (ParserException e) {
+                Log.logException(e);
+            }
+        }}.start();
+    }
+
+    public final void addAllToIndex(final DigestURI url, final Map<MultiProtocolURI, String> links, final SearchEvent searchEvent) {
+
+        // add the landing page to the index; this should not load it again since it should be in the cache
+        try {
+            this.addToIndex(url, searchEvent);
+        } catch (IOException e) {} catch (ParserException e) {}
+
+        // check if some of the links match with the query
+        Map<MultiProtocolURI, String> matcher = searchEvent.getQuery().separateMatches(links);
+
+        // load all matching links first
+        for (Map.Entry<MultiProtocolURI, String> entry: matcher.entrySet()) {
+            try {
+                this.addToIndex(new DigestURI(entry.getKey(), (byte[]) null), searchEvent);
+            } catch (IOException e) {} catch (ParserException e) {}
+        }
+
+        // then load the remaining, non-matching links as well
+        for (Map.Entry<MultiProtocolURI, String> entry: links.entrySet()) {
+            try {
+                this.addToIndex(new DigestURI(entry.getKey(), (byte[]) null), searchEvent);
+            } catch (IOException e) {} catch (ParserException e) {}
+        }
+    }
+
     public class receiptSending implements Runnable {
         yacySeed initiatorPeer;
         URIMetadataRow reference;
@@ -2103,6 +2165,39 @@ public final class Switchboard extends serverSwitch {
         crawlQueues.errorURL.push(bentry, initiator, new Date(), 0, failreason);
     }
 
+    public final void quickFillSite(final String host, final SearchEvent searchEvent) {
+        new Thread() {public void run() {
+            String r = host;
+            if (r.indexOf("//") < 0) r = "http://" + r;
+
+            // get the links for a specific site
+            DigestURI url;
+            try {
+                url = new DigestURI(r, null);
+            } catch (MalformedURLException e) {
+                Log.logException(e);
+                return;
+            }
+
+            Map<MultiProtocolURI, String> links = null;
+            try {
+                links = loader.loadLinks(url, CrawlProfile.CacheStrategy.NOCACHE);
+            } catch (IOException e) {
+                Log.logException(e);
+                return;
+            }
+            Iterator<MultiProtocolURI> i = links.keySet().iterator();
+            MultiProtocolURI u;
+            while (i.hasNext()) {
+                u = i.next();
+                if (!u.getHost().endsWith(host)) i.remove();
+            }
+
+            // add all pages to the index
+            addAllToIndex(url, links, searchEvent);
+        }}.start();
+    }
+
     public int currentPPM() {
         return EventTracker.countEvents("indexed", 20000) * 3;
     }
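Taken together, a query like `plushies site:example.com` now triggers quickFillSite, which fetches the landing page's links, discards links to foreign hosts, and hands everything to addAllToIndex. The scheme-defaulting and host-filter steps can be reproduced with standard java.net types (a self-contained sketch; java.net.URL stands in for DigestURI/MultiProtocolURI, values are illustrative):

```java
import java.net.MalformedURLException;
import java.net.URL;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;

public class SiteFilterSketch {
    public static void main(String[] args) throws MalformedURLException {
        String host = "example.com";
        String r = host;
        if (r.indexOf("//") < 0) r = "http://" + r; // default scheme, as in quickFillSite
        URL start = new URL(r);

        Map<URL, String> links = new HashMap<URL, String>();
        links.put(new URL("http://example.com/a.html"), "on site");
        links.put(new URL("http://other.org/b.html"), "foreign host");

        // keep only links on the requested site (same host or a subdomain),
        // mirroring the endsWith(host) filter in quickFillSite
        Iterator<URL> i = links.keySet().iterator();
        while (i.hasNext()) {
            if (!i.next().getHost().endsWith(host)) i.remove();
        }
        System.out.println(start + " -> " + links.keySet()); // only example.com remains
    }
}
```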