From 3a9dc52ac21d62a21f865627c98f2498105c54df Mon Sep 17 00:00:00 2001
From: orbiter
Date: Wed, 23 Jun 2010 11:19:32 +0000
Subject: [PATCH] added a fascinating new way to search _and_ start a web
 crawl at the same time: implemented a hint from dulcedo "use site: -
 operator as crawl start point".

YaCy was already able to search using a site constraint. This function is
now extended with an instant crawling feature: when the site operator is
used, the landing page of the site and every page linked from it are
loaded, indexed, and selected for the search result within that same
search request. If the remote server responds quickly enough, this process
can deliver results during the normal search result preparation, within
just a few seconds.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6941 6c8d7289-2bf4-0310-a012-ef5d649a1542
---
 htroot/yacysearch.java                     |  12 ++-
 source/de/anomic/search/DocumentIndex.java |   3 +-
 source/de/anomic/search/QueryParams.java   |  24 ++++-
 source/de/anomic/search/Segment.java       |  34 ++++++-
 source/de/anomic/search/Segments.java      |   6 +-
 source/de/anomic/search/Switchboard.java   | 103 ++++++++++++++++++++-
 6 files changed, 164 insertions(+), 18 deletions(-)

diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java
index 398c928e6..058eca2d3 100644
--- a/htroot/yacysearch.java
+++ b/htroot/yacysearch.java
@@ -322,14 +322,15 @@ public class yacysearch {
         }
         int site = querystring.indexOf("site:");
         String sitehash = null;
+        String sitehost = null;
         if (site >= 0) {
             int ftb = querystring.indexOf(' ', site);
             if (ftb == -1) ftb = querystring.length();
-            String domain = querystring.substring(site + 5, ftb);
-            querystring = querystring.replace("site:" + domain, "");
-            while (domain.length() > 0 && domain.charAt(0) == '.') domain = domain.substring(1);
-            while (domain.endsWith(".")) domain = domain.substring(0, domain.length() - 1);
-            sitehash = DigestURI.domhash(domain);
+            sitehost = querystring.substring(site + 5, ftb);
+            querystring = querystring.replace("site:" + sitehost, "");
+            while (sitehost.length() > 0 && sitehost.charAt(0) == '.') sitehost = sitehost.substring(1);
+            while (sitehost.endsWith(".")) sitehost = sitehost.substring(0, sitehost.length() - 1);
+            sitehash = DigestURI.domhash(sitehost);
         }
         int authori = querystring.indexOf("author:");
         String authorhash = null;
@@ -502,6 +503,7 @@ public class yacysearch {
             final SearchEvent theSearch = SearchEventCache.getEvent(theQuery, sb.peers, sb.crawlResults, (sb.isRobinsonMode()) ? sb.clusterhashes : null, false, sb.loader);
             try {Thread.sleep(global ? 100 : 10);} catch (InterruptedException e1) {} // wait a little time to get first results in the search
+            if (sitehost != null && authenticated) sb.quickFillSite(sitehost, theSearch);
 
             // generate result object
             //serverLog.logFine("LOCAL_SEARCH", "SEARCH TIME AFTER ORDERING OF SEARCH RESULTS: " + (System.currentTimeMillis() - timestamp) + " ms");
             //serverLog.logFine("LOCAL_SEARCH", "SEARCH TIME AFTER RESULT PREPARATION: " + (System.currentTimeMillis() - timestamp) + " ms");
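The yacysearch.java change keeps the existing sitehash computation but now also retains the raw host so it can serve as a crawl start point. The operator parsing itself is plain string handling; here is a self-contained sketch of the same normalization (the method name and the String[] return shape are illustrative, not part of the patch):

```java
// Extracts the site: operator from a query string, as in the hunk above:
// "plushies site:.example.com." -> query "plushies", host "example.com"
static String[] parseSiteOperator(String querystring) {
    int site = querystring.indexOf("site:");
    if (site < 0) return new String[] {querystring, null};
    int ftb = querystring.indexOf(' ', site);
    if (ftb == -1) ftb = querystring.length();
    String sitehost = querystring.substring(site + 5, ftb);
    querystring = querystring.replace("site:" + sitehost, "").trim();
    // strip leading and trailing dots from the host
    while (sitehost.length() > 0 && sitehost.charAt(0) == '.') sitehost = sitehost.substring(1);
    while (sitehost.endsWith(".")) sitehost = sitehost.substring(0, sitehost.length() - 1);
    return new String[] {querystring, sitehost};
}
```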
diff --git a/source/de/anomic/search/DocumentIndex.java b/source/de/anomic/search/DocumentIndex.java
index b0a3e5b0a..2e0a0867f 100644
--- a/source/de/anomic/search/DocumentIndex.java
+++ b/source/de/anomic/search/DocumentIndex.java
@@ -137,7 +137,8 @@ public class DocumentIndex extends Segment {
                 new Date(),
                 url.length(),
                 document,
-                condenser
+                condenser,
+                null
                 );
     }
 
diff --git a/source/de/anomic/search/QueryParams.java b/source/de/anomic/search/QueryParams.java
index e0f7cf323..3449449c5 100644
--- a/source/de/anomic/search/QueryParams.java
+++ b/source/de/anomic/search/QueryParams.java
@@ -26,10 +26,13 @@
 
 package de.anomic.search;
 
+import java.util.HashMap;
 import java.util.Iterator;
+import java.util.Map;
 import java.util.TreeSet;
 import java.util.regex.Pattern;
 
+import net.yacy.cora.document.MultiProtocolURI;
 import net.yacy.document.Condenser;
 import net.yacy.document.parser.html.AbstractScraper;
 import net.yacy.document.parser.html.CharacterCoding;
@@ -280,7 +283,7 @@ public final class QueryParams {
      * @param text
      * @return true if the query matches with the given text
      */
-    public final boolean matches(final String text) {
+    public final boolean matchesText(final String text) {
         final HandleSet wordhashes = Word.words2hashesHandles(Condenser.getWords(text).keySet());
         if (SetTools.anymatch(wordhashes, this.excludeHashes)) return false;
         return SetTools.totalInclusion(this.queryHashes, wordhashes);
@@ -352,6 +355,25 @@ public final class QueryParams {
         for (byte[] b: blues) queryHashes.remove(b);
     }
 
+    public final Map<MultiProtocolURI, String> separateMatches(Map<MultiProtocolURI, String> links) {
+        Map<MultiProtocolURI, String> matcher = new HashMap<MultiProtocolURI, String>();
+        Iterator<Map.Entry<MultiProtocolURI, String>> i = links.entrySet().iterator();
+        Map.Entry<MultiProtocolURI, String> entry;
+        MultiProtocolURI url;
+        String anchorText;
+        while (i.hasNext()) {
+            entry = i.next();
+            url = entry.getKey();
+            anchorText = entry.getValue();
+            if (this.matchesText(anchorText)) {
+                matcher.put(url, anchorText);
+                i.remove();
+            }
+        }
+        return matcher;
+    }
+
     public String id(final boolean anonymized) {
         // generate a string that identifies a search so results can be re-used in a cache
         String context =
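separateMatches is destructive by design: entries whose anchor text matches the query are moved out of the link map and returned, so the caller can load them with priority, while the non-matching remainder stays in the argument. A usage sketch, assuming an existing QueryParams instance `query` for the term "yacy" (URLs and anchor texts are illustrative; MultiProtocolURI construction may throw MalformedURLException in the real API):

```java
Map<MultiProtocolURI, String> links = new HashMap<MultiProtocolURI, String>();
links.put(new MultiProtocolURI("http://example.com/docs.html"), "yacy documentation");
links.put(new MultiProtocolURI("http://example.com/imprint.html"), "imprint");

Map<MultiProtocolURI, String> matcher = query.separateMatches(links);
// matcher now holds the entry whose anchor text contains the query term;
// links keeps only the "imprint" entry, to be crawled with lower priority.
```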
diff --git a/source/de/anomic/search/Segment.java b/source/de/anomic/search/Segment.java
index e7e89988f..74144c836 100644
--- a/source/de/anomic/search/Segment.java
+++ b/source/de/anomic/search/Segment.java
@@ -49,6 +49,7 @@ import net.yacy.kelondro.data.word.WordReferenceFactory;
 import net.yacy.kelondro.data.word.WordReferenceRow;
 import net.yacy.kelondro.data.word.WordReferenceVars;
 import net.yacy.kelondro.index.HandleSet;
+import net.yacy.kelondro.index.RowSpaceExceededException;
 import net.yacy.kelondro.logging.Log;
 import net.yacy.kelondro.order.Base64Order;
 import net.yacy.kelondro.order.ByteOrder;
@@ -195,7 +196,16 @@ public class Segment {
      * @param outlinksOther
      * @return
      */
-    private int addPageIndex(final DigestURI url, final Date urlModified, final Document document, final Condenser condenser, final String language, final char doctype, final int outlinksSame, final int outlinksOther) {
+    private int addPageIndex(
+            final DigestURI url,
+            final Date urlModified,
+            final Document document,
+            final Condenser condenser,
+            final String language,
+            final char doctype,
+            final int outlinksSame,
+            final int outlinksOther,
+            final SearchEvent searchEvent) {
         int wordCount = 0;
         final int urlLength = url.toNormalform(true, true).length();
         final int urlComps = MultiProtocolURI.urlComps(url.toString()).length;
@@ -215,18 +225,30 @@ public class Segment {
                         doctype,
                         outlinksSame, outlinksOther);
         Word wprop;
+        byte[] wordhash;
         while (i.hasNext()) {
             wentry = i.next();
             word = wentry.getKey();
             wprop = wentry.getValue();
             assert (wprop.flags != null);
             ientry.setWord(wprop);
+            wordhash = Word.word2hash(word);
             try {
-                this.termIndex.add(Word.word2hash(word), ientry);
+                this.termIndex.add(wordhash, ientry);
             } catch (Exception e) {
                 Log.logException(e);
             }
             wordCount++;
+            if (searchEvent != null && !searchEvent.getQuery().excludeHashes.has(wordhash) && searchEvent.getQuery().queryHashes.has(wordhash)) {
+                ReferenceContainer<WordReference> container;
+                try {
+                    container = ReferenceContainer.emptyContainer(Segment.wordReferenceFactory, wordhash, 1);
+                    container.add(ientry);
+                } catch (RowSpaceExceededException e) {
+                    continue;
+                }
+                searchEvent.getRankingResult().add(container, false, -1);
+            }
         }
 
         return wordCount;
@@ -245,7 +267,8 @@ public class Segment {
             final Date loadDate,
             final long sourcesize,
             final Document document,
-            final Condenser condenser
+            final Condenser condenser,
+            final SearchEvent searchEvent
             ) throws IOException {
         final long startTime = System.currentTimeMillis();
 
@@ -333,9 +356,10 @@ public class Segment {
                 document,                 // document content
                 condenser,                // document condenser
                 language,                 // document language
-                Response.docType(document.dc_format()), // document type
+                Response.docType(document.dc_format()),  // document type
                 document.inboundLinks(),  // inbound links
-                document.outboundLinks()  // outbound links
+                document.outboundLinks(), // outbound links
+                searchEvent               // a search event that can have results directly
                 );
         final long indexingEndTime = System.currentTimeMillis();
 
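The key move in Segment.addPageIndex is that indexing and searching briefly share a data path: while a page's terms are written to the term index, any term belonging to a live query is also wrapped in a one-entry reference container and pushed into the running search's ranking. A reduced sketch of that feed-while-indexing pattern, with standard collections standing in for HandleSet and ReferenceContainer (all names here are illustrative):

```java
import java.util.Set;

// feed-while-indexing: offer each indexed term to a live search
final class LiveFeedSketch {
    interface Ranking { void add(String wordhash, String urlRef); }

    static int indexTerms(Set<String> pageTerms, String urlRef,
                          Set<String> queryHashes, Set<String> excludeHashes,
                          Ranking liveRanking) {
        int wordCount = 0;
        for (String wordhash : pageTerms) {
            // normal path: write the reference into the term index (elided here)
            wordCount++;
            // new path: a running search that wants this term receives the
            // reference immediately instead of waiting for a re-query
            if (liveRanking != null
                    && !excludeHashes.contains(wordhash)
                    && queryHashes.contains(wordhash)) {
                liveRanking.add(wordhash, urlRef);
            }
        }
        return wordCount;
    }
}
```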
diff --git a/source/de/anomic/search/Segments.java b/source/de/anomic/search/Segments.java
index 9370ce812..b550eb2e0 100644
--- a/source/de/anomic/search/Segments.java
+++ b/source/de/anomic/search/Segments.java
@@ -209,7 +209,8 @@ public class Segments implements Iterable<Segment> {
             final Date loadDate,
             final long sourcesize,
             final Document document,
-            final Condenser condenser
+            final Condenser condenser,
+            final SearchEvent searchEvent
             ) throws IOException {
         return segment(segmentName).storeDocument(
             url,
@@ -218,7 +219,8 @@ public class Segments implements Iterable<Segment> {
             loadDate,
             sourcesize,
             document,
-            condenser
+            condenser,
+            searchEvent
         );
     }
 
diff --git a/source/de/anomic/search/Switchboard.java b/source/de/anomic/search/Switchboard.java
index 5341253b0..7fc4e143e 100644
--- a/source/de/anomic/search/Switchboard.java
+++ b/source/de/anomic/search/Switchboard.java
@@ -117,6 +117,7 @@ import de.anomic.crawler.ResourceObserver;
 import de.anomic.crawler.ResultImages;
 import de.anomic.crawler.ResultURLs;
 import de.anomic.crawler.RobotsTxt;
+import de.anomic.crawler.CrawlProfile.CacheStrategy;
 import de.anomic.crawler.CrawlProfile.entry;
 import de.anomic.crawler.retrieval.EventOrigin;
 import de.anomic.crawler.retrieval.HTTPLoader;
@@ -1815,11 +1816,11 @@ public final class Switchboard extends serverSwitch {
 
     public void storeDocumentIndex(final indexingQueueEntry in) {
         in.queueEntry.updateStatus(Response.QUEUE_STATE_INDEXSTORAGE);
-        storeDocumentIndex(in.process, in.queueEntry, in.document, in.condenser);
+        storeDocumentIndex(in.process, in.queueEntry, in.document, in.condenser, null);
         in.queueEntry.updateStatus(Response.QUEUE_STATE_FINISHED);
     }
 
-    private void storeDocumentIndex(final Segments.Process process, final Response queueEntry, final Document document, final Condenser condenser) {
+    private void storeDocumentIndex(final Segments.Process process, final Response queueEntry, final Document document, final Condenser condenser, final SearchEvent searchEvent) {
 
         // CREATE INDEX
         final String dc_title = document.dc_title();
@@ -1834,7 +1835,7 @@ public final class Switchboard extends serverSwitch {
         }
 
         if (!queueEntry.profile().indexText() && !queueEntry.profile().indexMedia()) {
-            if (this.log.isInfo()) log.logInfo("Not Indexed Resource '" + queueEntry.url().toNormalform(false, true) + "': denied by profile rule, process case=" + processCase);
+            if (this.log.isInfo()) log.logInfo("Not Indexed Resource '" + queueEntry.url().toNormalform(false, true) + "': denied by profile rule, process case=" + processCase + ", profile name = " + queueEntry.profile().name());
             addURLtoErrorDB(queueEntry.url(), (referrerURL == null) ? null : referrerURL.hash(), queueEntry.initiator(), dc_title, "denied by profile rule");
             return;
         }
@@ -1852,7 +1853,8 @@ public final class Switchboard extends serverSwitch {
                     new Date(),
                     queueEntry.size(),
                     document,
-                    condenser);
+                    condenser,
+                    searchEvent);
             RSSFeed.channels(Base64Order.enhancedCoder.equal(queueEntry.initiator(), peers.mySeed().hash.getBytes()) ? RSSFeed.LOCALINDEXING : RSSFeed.REMOTEINDEXING).addMessage(new RSSMessage("Indexed web page", dc_title, queueEntry.url().toNormalform(true, false)));
         } catch (final IOException e) {
             if (this.log.isFine()) log.logFine("Not Indexed Resource '" + queueEntry.url().toNormalform(false, true) + "': process case=" + processCase);
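The Segments and Switchboard hunks so far only thread the new parameter through; behavior for all existing callers is preserved because they pass null, which makes Segment.addPageIndex skip the feed branch entirely. The contrast between the two call sites (the first from the hunk above, the second from addToIndex in the following hunk; argument lists as they appear in the patch):

```java
// queue-driven indexing, unchanged behavior: no live search attached
storeDocumentIndex(in.process, in.queueEntry, in.document, in.condenser, null);

// quick-crawl indexing: freshly indexed pages feed the running search
storeDocumentIndex(process, response, document, condenser, searchEvent);
```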
@@ -1892,6 +1894,66 @@ public final class Switchboard extends serverSwitch {
         }
     }
 
+    /**
+     * load the content of a URL, parse the content and add the content to the index.
+     * This process is started concurrently; the method returns immediately after the call.
+     * @param url the url that shall be indexed
+     * @param searchEvent (optional) a search event to which results from the indexed pages
+     *        are fed directly; if the object is null it is ignored
+     * @throws IOException
+     * @throws ParserException
+     */
+    public void addToIndex(final DigestURI url, final SearchEvent searchEvent) throws IOException, ParserException {
+        new Thread() {public void run() {
+            try {
+                Segments.Process process = Segments.Process.LOCALCRAWLING;
+                if (indexSegments.segment(process).urlMetadata.exists(url.hash())) return; // don't do double-work
+                Request request = loader.request(url, true, true);
+                Response response = loader.load(request, CacheStrategy.IFFRESH, Long.MAX_VALUE);
+                if (response == null) throw new IOException("response == null");
+                if (response.getContent() == null) throw new IOException("content == null");
+                if (response.getResponseHeader() == null) throw new IOException("header == null");
+                Document document = response.parse();
+                if (document.indexingDenied()) throw new ParserException("indexing is denied", url);
+                Condenser condenser = new Condenser(document, true, true);
+                ResultImages.registerImages(document, true);
+                webStructure.generateCitationReference(document, condenser, response.lastModified());
+                storeDocumentIndex(process, response, document, condenser, searchEvent);
+                log.logInfo("QuickFill of url " + url.toNormalform(true, true) + " finished");
+            } catch (IOException e) {
+                Log.logException(e);
+            } catch (ParserException e) {
+                Log.logException(e);
+            }
+        }}.start();
+    }
+
+    public final void addAllToIndex(final DigestURI url, final Map<MultiProtocolURI, String> links, final SearchEvent searchEvent) {
+
+        // add the landing page to the index; this should not load it again since it should be in the cache
+        try {
+            this.addToIndex(url, searchEvent);
+        } catch (IOException e) {} catch (ParserException e) {}
+
+        // check if some of the links match with the query
+        Map<MultiProtocolURI, String> matcher = searchEvent.getQuery().separateMatches(links);
+
+        // load all matching links first
+        for (Map.Entry<MultiProtocolURI, String> entry: matcher.entrySet()) {
+            try {
+                this.addToIndex(new DigestURI(entry.getKey(), (byte[]) null), searchEvent);
+            } catch (IOException e) {} catch (ParserException e) {}
+        }
+
+        // then load the remaining, non-matching links as well
+        for (Map.Entry<MultiProtocolURI, String> entry: links.entrySet()) {
+            try {
+                this.addToIndex(new DigestURI(entry.getKey(), (byte[]) null), searchEvent);
+            } catch (IOException e) {} catch (ParserException e) {}
+        }
+    }
+
     public class receiptSending implements Runnable {
         yacySeed initiatorPeer;
         URIMetadataRow reference;
@@ -2103,6 +2165,39 @@ public final class Switchboard extends serverSwitch {
         crawlQueues.errorURL.push(bentry, initiator, new Date(), 0, failreason);
     }
 
+    public final void quickFillSite(final String host, final SearchEvent searchEvent) {
+        new Thread() {public void run() {
+            String r = host;
+            if (r.indexOf("//") < 0) r = "http://" + r;
+
+            // get the links for a specific site
+            DigestURI url;
+            try {
+                url = new DigestURI(r, null);
+            } catch (MalformedURLException e) {
+                Log.logException(e);
+                return;
+            }
+
+            Map<MultiProtocolURI, String> links = null;
+            try {
+                links = loader.loadLinks(url, CrawlProfile.CacheStrategy.NOCACHE);
+            } catch (IOException e) {
+                Log.logException(e);
+                return;
+            }
+            Iterator<MultiProtocolURI> i = links.keySet().iterator();
+            MultiProtocolURI u;
+            while (i.hasNext()) {
+                u = i.next();
+                if (!u.getHost().endsWith(host)) i.remove();
+            }
+
+            // add all pages to the index
+            addAllToIndex(url, links, searchEvent);
+        }}.start();
+    }
+
     public int currentPPM() {
         return EventTracker.countEvents("indexed", 20000) * 3;
     }
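Taken together, a query like `plushies site:example.com` now triggers quickFillSite, which fetches the landing page's links, discards links to foreign hosts, and hands everything to addAllToIndex. The scheme-defaulting and host-filter steps can be reproduced with standard java.net types (a self-contained sketch; java.net.URL stands in for DigestURI/MultiProtocolURI, values are illustrative):

```java
import java.net.MalformedURLException;
import java.net.URL;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;

public class SiteFilterSketch {
    public static void main(String[] args) throws MalformedURLException {
        String host = "example.com";
        String r = host;
        if (r.indexOf("//") < 0) r = "http://" + r; // default scheme, as in quickFillSite
        URL start = new URL(r);

        Map<URL, String> links = new HashMap<URL, String>();
        links.put(new URL("http://example.com/a.html"), "on site");
        links.put(new URL("http://other.org/b.html"), "foreign host");

        // keep only links on the requested site (same host or a subdomain),
        // mirroring the endsWith(host) filter in quickFillSite
        Iterator<URL> i = links.keySet().iterator();
        while (i.hasNext()) {
            if (!i.next().getHost().endsWith(host)) i.remove();
        }
        System.out.println(start + " -> " + links.keySet()); // only example.com remains
    }
}
```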