diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java
index 398c928e6..058eca2d3 100644
--- a/htroot/yacysearch.java
+++ b/htroot/yacysearch.java
@@ -322,14 +322,15 @@ public class yacysearch {
             }
             int site = querystring.indexOf("site:");
             String sitehash = null;
+            String sitehost = null;
             if (site >= 0) {
                 int ftb = querystring.indexOf(' ', site);
                 if (ftb == -1) ftb = querystring.length();
-                String domain = querystring.substring(site + 5, ftb);
-                querystring = querystring.replace("site:" + domain, "");
-                while (domain.length() > 0 && domain.charAt(0) == '.') domain = domain.substring(1);
-                while (domain.endsWith(".")) domain = domain.substring(0, domain.length() - 1);
-                sitehash = DigestURI.domhash(domain);
+                sitehost = querystring.substring(site + 5, ftb);
+                querystring = querystring.replace("site:" + sitehost, "");
+                while (sitehost.length() > 0 && sitehost.charAt(0) == '.') sitehost = sitehost.substring(1);
+                while (sitehost.endsWith(".")) sitehost = sitehost.substring(0, sitehost.length() - 1);
+                sitehash = DigestURI.domhash(sitehost);
             }
             int authori = querystring.indexOf("author:");
             String authorhash = null;
@@ -502,6 +503,7 @@ public class yacysearch {
             final SearchEvent theSearch = SearchEventCache.getEvent(theQuery, sb.peers, sb.crawlResults, (sb.isRobinsonMode()) ? sb.clusterhashes : null, false, sb.loader);
             try {Thread.sleep(global ? 100 : 10);} catch (InterruptedException e1) {} // wait a little time to get first results in the search
+            if (sitehost != null && authenticated) sb.quickFillSite(sitehost, theSearch);
             // generate result object
             //serverLog.logFine("LOCAL_SEARCH", "SEARCH TIME AFTER ORDERING OF SEARCH RESULTS: " + (System.currentTimeMillis() - timestamp) + " ms");
             //serverLog.logFine("LOCAL_SEARCH", "SEARCH TIME AFTER RESULT PREPARATION: " + (System.currentTimeMillis() - timestamp) + " ms");
diff --git a/source/de/anomic/search/DocumentIndex.java b/source/de/anomic/search/DocumentIndex.java
index b0a3e5b0a..2e0a0867f 100644
--- a/source/de/anomic/search/DocumentIndex.java
+++ b/source/de/anomic/search/DocumentIndex.java
@@ -137,7 +137,8 @@ public class DocumentIndex extends Segment {
                 new Date(),
                 url.length(),
                 document,
-                condenser
+                condenser,
+                null
                 );
     }
diff --git a/source/de/anomic/search/QueryParams.java b/source/de/anomic/search/QueryParams.java
index e0f7cf323..3449449c5 100644
--- a/source/de/anomic/search/QueryParams.java
+++ b/source/de/anomic/search/QueryParams.java
@@ -26,10 +26,13 @@
 package de.anomic.search;
+import java.util.HashMap;
 import java.util.Iterator;
+import java.util.Map;
 import java.util.TreeSet;
 import java.util.regex.Pattern;
+import net.yacy.cora.document.MultiProtocolURI;
 import net.yacy.document.Condenser;
 import net.yacy.document.parser.html.AbstractScraper;
 import net.yacy.document.parser.html.CharacterCoding;
@@ -280,7 +283,7 @@ public final class QueryParams {
      * @param text
      * @return true if the query matches with the given text
      */
-    public final boolean matches(final String text) {
+    public final boolean matchesText(final String text) {
         final HandleSet wordhashes = Word.words2hashesHandles(Condenser.getWords(text).keySet());
         if (SetTools.anymatch(wordhashes, this.excludeHashes)) return false;
         return SetTools.totalInclusion(this.queryHashes, wordhashes);
@@ -352,6 +355,25 @@ public final class QueryParams {
         for (byte[] b: blues) queryHashes.remove(b);
     }
+
+    public final Map<MultiProtocolURI, String> separateMatches(Map<MultiProtocolURI, String> links) {
+        Map<MultiProtocolURI, String> matcher = new HashMap<MultiProtocolURI, String>();
+        Iterator<Map.Entry<MultiProtocolURI, String>> i = links.entrySet().iterator();
+        Map.Entry<MultiProtocolURI, String> entry;
+        MultiProtocolURI url;
+        String anchorText;
+        while (i.hasNext()) {
+            entry = i.next();
+            url = entry.getKey();
+            anchorText = entry.getValue();
+            if (this.matchesText(anchorText)) {
+                matcher.put(url, anchorText);
+                i.remove();
+            }
+        }
+        return matcher;
+    }
+
     public String id(final boolean anonymized) {
         // generate a string that identifies a search so results can be re-used in a cache
         String context =
diff --git a/source/de/anomic/search/Segment.java b/source/de/anomic/search/Segment.java
index e7e89988f..74144c836 100644
--- a/source/de/anomic/search/Segment.java
+++ b/source/de/anomic/search/Segment.java
@@ -49,6 +49,7 @@ import net.yacy.kelondro.data.word.WordReferenceFactory;
 import net.yacy.kelondro.data.word.WordReferenceRow;
 import net.yacy.kelondro.data.word.WordReferenceVars;
 import net.yacy.kelondro.index.HandleSet;
+import net.yacy.kelondro.index.RowSpaceExceededException;
 import net.yacy.kelondro.logging.Log;
 import net.yacy.kelondro.order.Base64Order;
 import net.yacy.kelondro.order.ByteOrder;
@@ -195,7 +196,16 @@ public class Segment {
      * @param outlinksOther
      * @return
      */
-    private int addPageIndex(final DigestURI url, final Date urlModified, final Document document, final Condenser condenser, final String language, final char doctype, final int outlinksSame, final int outlinksOther) {
+    private int addPageIndex(
+            final DigestURI url,
+            final Date urlModified,
+            final Document document,
+            final Condenser condenser,
+            final String language,
+            final char doctype,
+            final int outlinksSame,
+            final int outlinksOther,
+            final SearchEvent searchEvent) {
         int wordCount = 0;
         final int urlLength = url.toNormalform(true, true).length();
         final int urlComps = MultiProtocolURI.urlComps(url.toString()).length;
@@ -215,18 +225,30 @@ public class Segment {
                         doctype,
                         outlinksSame, outlinksOther);
         Word wprop;
+        byte[] wordhash;
         while (i.hasNext()) {
             wentry = i.next();
             word = wentry.getKey();
             wprop = wentry.getValue();
             assert (wprop.flags != null);
             ientry.setWord(wprop);
+            wordhash = Word.word2hash(word);
             try {
-                this.termIndex.add(Word.word2hash(word), ientry);
+                this.termIndex.add(wordhash, ientry);
             } catch (Exception e) {
                 Log.logException(e);
             }
             wordCount++;
+            if (searchEvent != null && !searchEvent.getQuery().excludeHashes.has(wordhash) && searchEvent.getQuery().queryHashes.has(wordhash)) {
+                ReferenceContainer<WordReference> container;
+                try {
+                    container = ReferenceContainer.emptyContainer(Segment.wordReferenceFactory, wordhash, 1);
+                    container.add(ientry);
+                } catch (RowSpaceExceededException e) {
+                    continue;
+                }
+                searchEvent.getRankingResult().add(container, false, -1);
+            }
         }
         return wordCount;
@@ -245,7 +267,8 @@ public class Segment {
             final Date loadDate,
             final long sourcesize,
             final Document document,
-            final Condenser condenser
+            final Condenser condenser,
+            final SearchEvent searchEvent
             ) throws IOException {
         final long startTime = System.currentTimeMillis();
@@ -333,9 +356,10 @@ public class Segment {
                 document,                                // document content
                 condenser,                               // document condenser
                 language,                                // document language
-                Response.docType(document.dc_format()), // document type
+                Response.docType(document.dc_format()),  // document type
                 document.inboundLinks(),                 // inbound links
-                document.outboundLinks()                 // outbound links
+                document.outboundLinks(),                // outbound links
+                searchEvent                              // a search event that can have results directly
                 );
         final long indexingEndTime = System.currentTimeMillis();
diff --git a/source/de/anomic/search/Segments.java b/source/de/anomic/search/Segments.java
index 9370ce812..b550eb2e0 100644
--- a/source/de/anomic/search/Segments.java
+++ b/source/de/anomic/search/Segments.java
@@ -209,7 +209,8 @@ public class Segments implements Iterable<Segment> {
             final Date loadDate,
             final long sourcesize,
             final Document document,
-            final Condenser condenser
+            final Condenser condenser,
+            final SearchEvent searchEvent
             ) throws IOException {
         return segment(segmentName).storeDocument(
                 url,
@@ -218,7 +219,8 @@
                 loadDate,
                 sourcesize,
                 document,
-                condenser
+                condenser,
+                searchEvent
                 );
     }
diff --git a/source/de/anomic/search/Switchboard.java b/source/de/anomic/search/Switchboard.java
index 5341253b0..7fc4e143e 100644
--- a/source/de/anomic/search/Switchboard.java
+++ b/source/de/anomic/search/Switchboard.java
@@ -117,6 +117,7 @@ import de.anomic.crawler.ResourceObserver;
 import de.anomic.crawler.ResultImages;
 import de.anomic.crawler.ResultURLs;
 import de.anomic.crawler.RobotsTxt;
+import de.anomic.crawler.CrawlProfile.CacheStrategy;
 import de.anomic.crawler.CrawlProfile.entry;
 import de.anomic.crawler.retrieval.EventOrigin;
 import de.anomic.crawler.retrieval.HTTPLoader;
@@ -1815,11 +1816,11 @@ public final class Switchboard extends serverSwitch {
     public void storeDocumentIndex(final indexingQueueEntry in) {
         in.queueEntry.updateStatus(Response.QUEUE_STATE_INDEXSTORAGE);
-        storeDocumentIndex(in.process, in.queueEntry, in.document, in.condenser);
+        storeDocumentIndex(in.process, in.queueEntry, in.document, in.condenser, null);
         in.queueEntry.updateStatus(Response.QUEUE_STATE_FINISHED);
     }
-    private void storeDocumentIndex(final Segments.Process process, final Response queueEntry, final Document document, final Condenser condenser) {
+    private void storeDocumentIndex(final Segments.Process process, final Response queueEntry, final Document document, final Condenser condenser, final SearchEvent searchEvent) {
         // CREATE INDEX
         final String dc_title = document.dc_title();
@@ -1834,7 +1835,7 @@ public final class Switchboard extends serverSwitch {
         }
         if (!queueEntry.profile().indexText() && !queueEntry.profile().indexMedia()) {
-            if (this.log.isInfo()) log.logInfo("Not Indexed Resource '" + queueEntry.url().toNormalform(false, true) + "': denied by profile rule, process case=" + processCase);
+            if (this.log.isInfo()) log.logInfo("Not Indexed Resource '" + queueEntry.url().toNormalform(false, true) + "': denied by profile rule, process case=" + processCase + ", profile name = " + queueEntry.profile().name());
             addURLtoErrorDB(queueEntry.url(), (referrerURL == null) ? null : referrerURL.hash(), queueEntry.initiator(), dc_title, "denied by profile rule");
             return;
         }
@@ -1852,7 +1853,8 @@ public final class Switchboard extends serverSwitch {
                     new Date(),
                     queueEntry.size(),
                     document,
-                    condenser);
+                    condenser,
+                    searchEvent);
             RSSFeed.channels(Base64Order.enhancedCoder.equal(queueEntry.initiator(), peers.mySeed().hash.getBytes()) ? RSSFeed.LOCALINDEXING : RSSFeed.REMOTEINDEXING).addMessage(new RSSMessage("Indexed web page", dc_title, queueEntry.url().toNormalform(true, false)));
         } catch (final IOException e) {
             if (this.log.isFine()) log.logFine("Not Indexed Resource '" + queueEntry.url().toNormalform(false, true) + "': process case=" + processCase);
@@ -1892,6 +1894,66 @@ public final class Switchboard extends serverSwitch {
         }
     }
+    /**
+     * load the content of a URL, parse the content and add it to the index.
+     * This process is started concurrently. The method returns immediately after the call.
+     * @param url the url that shall be indexed
+     * @param searchEvent (optional) a search event that shall be fed directly with results from the indexed pages. If the object is null it is ignored
+     * @throws IOException
+     * @throws ParserException
+     */
+    public void addToIndex(final DigestURI url, final SearchEvent searchEvent) throws IOException, ParserException {
+        new Thread() {public void run() {
+            try {
+                Segments.Process process = Segments.Process.LOCALCRAWLING;
+                if (indexSegments.segment(process).urlMetadata.exists(url.hash())) return; // don't do double-work
+                Request request = loader.request(url, true, true);
+                Response response = loader.load(request, CacheStrategy.IFFRESH, Long.MAX_VALUE);
+                if (response == null) throw new IOException("response == null");
+                if (response.getContent() == null) throw new IOException("content == null");
+                if (response.getResponseHeader() == null) throw new IOException("header == null");
+                Document document = response.parse();
+                if (document.indexingDenied()) throw new ParserException("indexing is denied", url);
+                Condenser condenser = new Condenser(document, true, true);
+                ResultImages.registerImages(document, true);
+                webStructure.generateCitationReference(document, condenser, response.lastModified());
+                storeDocumentIndex(process, response, document, condenser, searchEvent);
+                log.logInfo("QuickFill of url " + url.toNormalform(true, true) + " finished");
+            } catch (IOException e) {
+                Log.logException(e);
+            } catch (ParserException e) {
+                Log.logException(e);
+            }
+        }}.start();
+    }
+
+    public final void addAllToIndex(final DigestURI url, final Map<MultiProtocolURI, String> links, final SearchEvent searchEvent) {
+
+        // add the landing page to the index; this should not load it again since it should still be in the cache
+        try {
+            this.addToIndex(url, searchEvent);
+        } catch (IOException e) {} catch (ParserException e) {}
+
+        // check if some of the links match with the query
+        Map<MultiProtocolURI, String> matcher = searchEvent.getQuery().separateMatches(links);
+
+        // take the matching links and load them all
+        for (Map.Entry<MultiProtocolURI, String> entry: matcher.entrySet()) {
+            try {
+                this.addToIndex(new DigestURI(entry.getKey(), (byte[]) null), searchEvent);
+            } catch (IOException e) {} catch (ParserException e) {}
+        }
+
+        // then take the non-matching links and load them as well
+        for (Map.Entry<MultiProtocolURI, String> entry: links.entrySet()) {
+            try {
+                this.addToIndex(new DigestURI(entry.getKey(), (byte[]) null), searchEvent);
+            } catch (IOException e) {} catch (ParserException e) {}
+        }
+    }
+
+
     public class receiptSending implements Runnable {
         yacySeed initiatorPeer;
         URIMetadataRow reference;
@@ -2103,6 +2165,39 @@ public final class Switchboard extends serverSwitch {
             crawlQueues.errorURL.push(bentry, initiator, new Date(), 0, failreason);
         }
+    public final void quickFillSite(final String host, final SearchEvent searchEvent) {
+        new Thread() {public void run() {
+            String r = host;
+            if (r.indexOf("//") < 0) r = "http://" + r;
+
+            // get the links for a specific site
+            DigestURI url;
+            try {
+                url = new DigestURI(r, null);
+            } catch (MalformedURLException e) {
+                Log.logException(e);
+                return;
+            }
+
+            Map<MultiProtocolURI, String> links = null;
+            try {
+                links = loader.loadLinks(url, CrawlProfile.CacheStrategy.NOCACHE);
+            } catch (IOException e) {
+                Log.logException(e);
+                return;
+            }
+            Iterator<MultiProtocolURI> i = links.keySet().iterator();
+            MultiProtocolURI u;
+            while (i.hasNext()) {
+                u = i.next();
+                if (!u.getHost().endsWith(host)) i.remove();
+            }
+
+            // add all pages to the index
+            addAllToIndex(url, links, searchEvent);
+        }}.start();
+    }
+
     public int currentPPM() {
         return EventTracker.countEvents("indexed", 20000) * 3;
     }
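
Note on the QueryParams.separateMatches() helper introduced above: it splits a link map (URL -> anchor text) into the entries whose anchor text matches the current query, removing them from the input map, so that Switchboard.addAllToIndex() can index the matching links first and the remaining links afterwards. The following standalone sketch only illustrates that splitting logic and is not part of the patch; java.net.URI and a plain substring check are stand-ins for the YaCy-internal MultiProtocolURI and QueryParams.matchesText(), and the class name is invented for the example.

// SeparateMatchesSketch.java - illustrative sketch of the separateMatches() splitting logic
import java.net.URI;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;

public class SeparateMatchesSketch {

    // entries whose anchor text matches are moved out of the input map and returned
    static Map<URI, String> separateMatches(Map<URI, String> links, String queryWord) {
        Map<URI, String> matcher = new HashMap<URI, String>();
        Iterator<Map.Entry<URI, String>> i = links.entrySet().iterator();
        while (i.hasNext()) {
            Map.Entry<URI, String> entry = i.next();
            // stand-in for QueryParams.matchesText(anchorText)
            if (entry.getValue().toLowerCase().contains(queryWord.toLowerCase())) {
                matcher.put(entry.getKey(), entry.getValue());
                i.remove(); // matching links leave the input map
            }
        }
        return matcher;
    }

    public static void main(String[] args) throws Exception {
        Map<URI, String> links = new HashMap<URI, String>();
        links.put(new URI("http://example.org/a"), "YaCy search engine");
        links.put(new URI("http://example.org/b"), "imprint");
        Map<URI, String> hits = separateMatches(links, "yacy");
        System.out.println("matching links: " + hits);   // candidates to index first
        System.out.println("remaining links: " + links); // indexed afterwards
    }
}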