added a fascinating new way to search _and_ start a web crawl at the same time:

implemented a hint from dulcedo: "use site: - operator as crawl start point".
YaCy was already able to search using a site-constraint. This function is now extended with an instant crawling feature.
When you now use the site-operator, the landing page of the site and every page that is linked from this page are loaded, indexed and selected for the search result within that same search request. When the remote server responds quickly enough, this process can deliver search results during the normal search result preparation, i.e. within a few seconds.
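
For illustration, here is a minimal standalone sketch of how the site-operator is split off the query string; the class and method names are invented for this example and are not YaCy API (the real parsing is in the first yacysearch hunk below):

// hypothetical demo class, not part of YaCy
public class SiteOperatorDemo {

    // returns {remaining query, site host or null}
    static String[] splitSiteOperator(String querystring) {
        String sitehost = null;
        int site = querystring.indexOf("site:");
        if (site >= 0) {
            int ftb = querystring.indexOf(' ', site);
            if (ftb == -1) ftb = querystring.length();
            sitehost = querystring.substring(site + 5, ftb);
            querystring = querystring.replace("site:" + sitehost, "").trim();
            // strip leading and trailing dots from the host name
            while (sitehost.length() > 0 && sitehost.charAt(0) == '.') sitehost = sitehost.substring(1);
            while (sitehost.endsWith(".")) sitehost = sitehost.substring(0, sitehost.length() - 1);
        }
        return new String[] { querystring, sitehost };
    }

    public static void main(String[] args) {
        String[] r = splitSiteOperator("yacy p2p site:yacy.net");
        System.out.println("query='" + r[0] + "' host='" + r[1] + "'"); // query='yacy p2p' host='yacy.net'
    }
}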


git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6941 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 15 years ago
parent 8e3cbbb6a9
commit 3a9dc52ac2

@@ -322,14 +322,15 @@ public class yacysearch {
         }
         int site = querystring.indexOf("site:");
         String sitehash = null;
+        String sitehost = null;
         if (site >= 0) {
             int ftb = querystring.indexOf(' ', site);
             if (ftb == -1) ftb = querystring.length();
-            String domain = querystring.substring(site + 5, ftb);
-            querystring = querystring.replace("site:" + domain, "");
-            while (domain.length() > 0 && domain.charAt(0) == '.') domain = domain.substring(1);
-            while (domain.endsWith(".")) domain = domain.substring(0, domain.length() - 1);
-            sitehash = DigestURI.domhash(domain);
+            sitehost = querystring.substring(site + 5, ftb);
+            querystring = querystring.replace("site:" + sitehost, "");
+            while (sitehost.length() > 0 && sitehost.charAt(0) == '.') sitehost = sitehost.substring(1);
+            while (sitehost.endsWith(".")) sitehost = sitehost.substring(0, sitehost.length() - 1);
+            sitehash = DigestURI.domhash(sitehost);
         }
         int authori = querystring.indexOf("author:");
         String authorhash = null;
@@ -502,6 +503,7 @@ public class yacysearch {
             final SearchEvent theSearch = SearchEventCache.getEvent(theQuery, sb.peers, sb.crawlResults, (sb.isRobinsonMode()) ? sb.clusterhashes : null, false, sb.loader);
             try {Thread.sleep(global ? 100 : 10);} catch (InterruptedException e1) {} // wait a little time to get first results in the search
+            if (sitehost != null && authenticated) sb.quickFillSite(sitehost, theSearch);
             // generate result object
             //serverLog.logFine("LOCAL_SEARCH", "SEARCH TIME AFTER ORDERING OF SEARCH RESULTS: " + (System.currentTimeMillis() - timestamp) + " ms");
             //serverLog.logFine("LOCAL_SEARCH", "SEARCH TIME AFTER RESULT PREPARATION: " + (System.currentTimeMillis() - timestamp) + " ms");

@@ -137,7 +137,8 @@ public class DocumentIndex extends Segment {
                 new Date(),
                 url.length(),
                 document,
-                condenser
+                condenser,
+                null
                 );
     }

@ -26,10 +26,13 @@
package de.anomic.search; package de.anomic.search;
import java.util.HashMap;
import java.util.Iterator; import java.util.Iterator;
import java.util.Map;
import java.util.TreeSet; import java.util.TreeSet;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.document.Condenser; import net.yacy.document.Condenser;
import net.yacy.document.parser.html.AbstractScraper; import net.yacy.document.parser.html.AbstractScraper;
import net.yacy.document.parser.html.CharacterCoding; import net.yacy.document.parser.html.CharacterCoding;
@@ -280,7 +283,7 @@ public final class QueryParams {
      * @param text
      * @return true if the query matches with the given text
      */
-    public final boolean matches(final String text) {
+    public final boolean matchesText(final String text) {
         final HandleSet wordhashes = Word.words2hashesHandles(Condenser.getWords(text).keySet());
         if (SetTools.anymatch(wordhashes, this.excludeHashes)) return false;
         return SetTools.totalInclusion(this.queryHashes, wordhashes);
@@ -352,6 +355,25 @@ public final class QueryParams {
         for (byte[] b: blues) queryHashes.remove(b);
     }

+    public final Map<MultiProtocolURI, String> separateMatches(Map<MultiProtocolURI, String> links) {
+        Map<MultiProtocolURI, String> matcher = new HashMap<MultiProtocolURI, String>();
+        Iterator<Map.Entry<MultiProtocolURI, String>> i = links.entrySet().iterator();
+        Map.Entry<MultiProtocolURI, String> entry;
+        MultiProtocolURI url;
+        String anchorText;
+        while (i.hasNext()) {
+            entry = i.next();
+            url = entry.getKey();
+            anchorText = entry.getValue();
+            if (this.matchesText(anchorText)) {
+                matcher.put(url, anchorText);
+                i.remove();
+            }
+        }
+        return matcher;
+    }
+
     public String id(final boolean anonymized) {
         // generate a string that identifies a search so results can be re-used in a cache
         String context =
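
The effect of separateMatches is easiest to see in isolation. Below is a self-contained sketch with a plain substring test standing in for the word-hash matching above; all names are illustrative, not YaCy API. The point of the iterator-based removal is that one pass leaves the input map holding only the non-matching links:

import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;

public class SeparateMatchesDemo {

    // moves every link whose anchor text contains all query words out of
    // `links` into the returned map; `links` keeps the rest
    static Map<String, String> separateMatches(Map<String, String> links, String[] queryWords) {
        Map<String, String> matcher = new HashMap<String, String>();
        Iterator<Map.Entry<String, String>> i = links.entrySet().iterator();
        while (i.hasNext()) {
            Map.Entry<String, String> entry = i.next();
            boolean matches = true;
            for (String w : queryWords) {
                if (!entry.getValue().toLowerCase().contains(w)) { matches = false; break; }
            }
            if (matches) {
                matcher.put(entry.getKey(), entry.getValue());
                i.remove();
            }
        }
        return matcher;
    }

    public static void main(String[] args) {
        Map<String, String> links = new HashMap<String, String>();
        links.put("http://example.net/a", "peer-to-peer web search");
        links.put("http://example.net/b", "imprint");
        Map<String, String> hits = separateMatches(links, new String[] {"search"});
        System.out.println("matches: " + hits.keySet()); // [http://example.net/a]
        System.out.println("rest: " + links.keySet());   // [http://example.net/b]
    }
}

addAllToIndex in the Switchboard (further down) uses exactly this split to load the matching links first, so they can surface in the still-running search before the remaining links are indexed.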

@@ -49,6 +49,7 @@ import net.yacy.kelondro.data.word.WordReferenceFactory;
 import net.yacy.kelondro.data.word.WordReferenceRow;
 import net.yacy.kelondro.data.word.WordReferenceVars;
 import net.yacy.kelondro.index.HandleSet;
+import net.yacy.kelondro.index.RowSpaceExceededException;
 import net.yacy.kelondro.logging.Log;
 import net.yacy.kelondro.order.Base64Order;
 import net.yacy.kelondro.order.ByteOrder;
@@ -195,7 +196,16 @@ public class Segment {
      * @param outlinksOther
      * @return
      */
-    private int addPageIndex(final DigestURI url, final Date urlModified, final Document document, final Condenser condenser, final String language, final char doctype, final int outlinksSame, final int outlinksOther) {
+    private int addPageIndex(
+            final DigestURI url,
+            final Date urlModified,
+            final Document document,
+            final Condenser condenser,
+            final String language,
+            final char doctype,
+            final int outlinksSame,
+            final int outlinksOther,
+            final SearchEvent searchEvent) {
         int wordCount = 0;
         final int urlLength = url.toNormalform(true, true).length();
         final int urlComps = MultiProtocolURI.urlComps(url.toString()).length;
@@ -215,18 +225,30 @@
                 doctype,
                 outlinksSame, outlinksOther);
         Word wprop;
+        byte[] wordhash;
         while (i.hasNext()) {
             wentry = i.next();
             word = wentry.getKey();
             wprop = wentry.getValue();
             assert (wprop.flags != null);
             ientry.setWord(wprop);
+            wordhash = Word.word2hash(word);
             try {
-                this.termIndex.add(Word.word2hash(word), ientry);
+                this.termIndex.add(wordhash, ientry);
             } catch (Exception e) {
                 Log.logException(e);
             }
             wordCount++;
+            if (searchEvent != null && !searchEvent.getQuery().excludeHashes.has(wordhash) && searchEvent.getQuery().queryHashes.has(wordhash)) {
+                ReferenceContainer<WordReference> container;
+                try {
+                    container = ReferenceContainer.emptyContainer(Segment.wordReferenceFactory, wordhash, 1);
+                    container.add(ientry);
+                } catch (RowSpaceExceededException e) {
+                    continue;
+                }
+                searchEvent.getRankingResult().add(container, false, -1);
+            }
         }

         return wordCount;
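
The core mechanism of this hunk: while a page is being indexed, every word hash is compared against the live query, and on a hit the fresh reference is handed straight to the search event's ranking. Stripped of the YaCy types, this is a producer feeding a consumer that is already waiting; a sketch under invented names (the queue stands in for the ranking result):

import java.util.Set;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.TimeUnit;

public class LiveFeedDemo {

    // stand-ins for SearchEvent internals: the query terms and a result sink
    static final Set<String> queryWords = Set.of("crawl");
    static final BlockingQueue<String> rankingInbox = new LinkedBlockingQueue<String>();

    // called for every word of a freshly indexed page; on a query hit the
    // page is offered to the running search immediately
    static void indexWord(String word, String pageUrl) {
        // ... store (word -> pageUrl) in the term index here ...
        if (queryWords.contains(word)) rankingInbox.offer(pageUrl);
    }

    public static void main(String[] args) throws InterruptedException {
        new Thread(() -> { // indexer thread
            indexWord("web", "http://example.net/start");
            indexWord("crawl", "http://example.net/start");
        }).start();
        // search side: the result arrives while indexing is still in progress
        String hit = rankingInbox.poll(1, TimeUnit.SECONDS);
        System.out.println("live result: " + hit);
    }
}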
@@ -245,7 +267,8 @@
             final Date loadDate,
             final long sourcesize,
             final Document document,
-            final Condenser condenser
+            final Condenser condenser,
+            final SearchEvent searchEvent
             ) throws IOException {
         final long startTime = System.currentTimeMillis();
@@ -333,9 +356,10 @@
                 document,                     // document content
                 condenser,                    // document condenser
                 language,                     // document language
                 Response.docType(document.dc_format()), // document type
                 document.inboundLinks(),      // inbound links
-                document.outboundLinks()      // outbound links
+                document.outboundLinks(),     // outbound links
+                searchEvent                   // a search event that can have results directly
                 );
         final long indexingEndTime = System.currentTimeMillis();

@@ -209,7 +209,8 @@ public class Segments implements Iterable<Segment> {
             final Date loadDate,
             final long sourcesize,
             final Document document,
-            final Condenser condenser
+            final Condenser condenser,
+            final SearchEvent searchEvent
             ) throws IOException {
         return segment(segmentName).storeDocument(
                 url,
@@ -218,7 +219,8 @@ public class Segments implements Iterable<Segment> {
                 loadDate,
                 sourcesize,
                 document,
-                condenser
+                condenser,
+                searchEvent
                 );
     }

@ -117,6 +117,7 @@ import de.anomic.crawler.ResourceObserver;
import de.anomic.crawler.ResultImages; import de.anomic.crawler.ResultImages;
import de.anomic.crawler.ResultURLs; import de.anomic.crawler.ResultURLs;
import de.anomic.crawler.RobotsTxt; import de.anomic.crawler.RobotsTxt;
import de.anomic.crawler.CrawlProfile.CacheStrategy;
import de.anomic.crawler.CrawlProfile.entry; import de.anomic.crawler.CrawlProfile.entry;
import de.anomic.crawler.retrieval.EventOrigin; import de.anomic.crawler.retrieval.EventOrigin;
import de.anomic.crawler.retrieval.HTTPLoader; import de.anomic.crawler.retrieval.HTTPLoader;
@@ -1815,11 +1816,11 @@ public final class Switchboard extends serverSwitch {
     public void storeDocumentIndex(final indexingQueueEntry in) {
         in.queueEntry.updateStatus(Response.QUEUE_STATE_INDEXSTORAGE);
-        storeDocumentIndex(in.process, in.queueEntry, in.document, in.condenser);
+        storeDocumentIndex(in.process, in.queueEntry, in.document, in.condenser, null);
         in.queueEntry.updateStatus(Response.QUEUE_STATE_FINISHED);
     }

-    private void storeDocumentIndex(final Segments.Process process, final Response queueEntry, final Document document, final Condenser condenser) {
+    private void storeDocumentIndex(final Segments.Process process, final Response queueEntry, final Document document, final Condenser condenser, final SearchEvent searchEvent) {

         // CREATE INDEX
         final String dc_title = document.dc_title();
@@ -1834,7 +1835,7 @@ public final class Switchboard extends serverSwitch {
         }

         if (!queueEntry.profile().indexText() && !queueEntry.profile().indexMedia()) {
-            if (this.log.isInfo()) log.logInfo("Not Indexed Resource '" + queueEntry.url().toNormalform(false, true) + "': denied by profile rule, process case=" + processCase);
+            if (this.log.isInfo()) log.logInfo("Not Indexed Resource '" + queueEntry.url().toNormalform(false, true) + "': denied by profile rule, process case=" + processCase + ", profile name = " + queueEntry.profile().name());
             addURLtoErrorDB(queueEntry.url(), (referrerURL == null) ? null : referrerURL.hash(), queueEntry.initiator(), dc_title, "denied by profile rule");
             return;
         }
@@ -1852,7 +1853,8 @@
                     new Date(),
                     queueEntry.size(),
                     document,
-                    condenser);
+                    condenser,
+                    searchEvent);
             RSSFeed.channels(Base64Order.enhancedCoder.equal(queueEntry.initiator(), peers.mySeed().hash.getBytes()) ? RSSFeed.LOCALINDEXING : RSSFeed.REMOTEINDEXING).addMessage(new RSSMessage("Indexed web page", dc_title, queueEntry.url().toNormalform(true, false)));
         } catch (final IOException e) {
             if (this.log.isFine()) log.logFine("Not Indexed Resource '" + queueEntry.url().toNormalform(false, true) + "': process case=" + processCase);
@@ -1892,6 +1894,66 @@
         }
     }

+    /**
+     * load the content of a URL, parse the content and add the content to the index
+     * This process is started concurrently. The method returns immediately after the call.
+     * @param url the url that shall be indexed
+     * @param searchEvent (optional) a search event that shall get results from the indexed pages fed directly. If the object is null then it is ignored
+     * @throws IOException
+     * @throws ParserException
+     */
+    public void addToIndex(final DigestURI url, final SearchEvent searchEvent) throws IOException, ParserException {
+        new Thread() {public void run() {
+            try {
+                Segments.Process process = Segments.Process.LOCALCRAWLING;
+                if (indexSegments.segment(process).urlMetadata.exists(url.hash())) return; // don't do double-work
+                Request request = loader.request(url, true, true);
+                Response response = loader.load(request, CacheStrategy.IFFRESH, Long.MAX_VALUE);
+                if (response == null) throw new IOException("response == null");
+                if (response.getContent() == null) throw new IOException("content == null");
+                if (response.getResponseHeader() == null) throw new IOException("header == null");
+                Document document = response.parse();
+                if (document.indexingDenied()) throw new ParserException("indexing is denied", url);
+                Condenser condenser = new Condenser(document, true, true);
+                ResultImages.registerImages(document, true);
+                webStructure.generateCitationReference(document, condenser, response.lastModified());
+                storeDocumentIndex(process, response, document, condenser, searchEvent);
+                log.logInfo("QuickFill of url " + url.toNormalform(true, true) + " finished");
+            } catch (IOException e) {
+                Log.logException(e);
+            } catch (ParserException e) {
+                Log.logException(e);
+            }
+        }}.start();
+    }
+
+    public final void addAllToIndex(final DigestURI url, final Map<MultiProtocolURI, String> links, final SearchEvent searchEvent) {
+
+        // add the landing page to the index. should not load that again since it should be in the cache
+        try {
+            this.addToIndex(url, searchEvent);
+        } catch (IOException e) {} catch (ParserException e) {}
+
+        // check if some of the links match with the query
+        Map<MultiProtocolURI, String> matcher = searchEvent.getQuery().separateMatches(links);
+
+        // take the matchers and load them all
+        for (Map.Entry<MultiProtocolURI, String> entry: matcher.entrySet()) {
+            try {
+                this.addToIndex(new DigestURI(entry.getKey(), (byte[]) null), searchEvent);
+            } catch (IOException e) {} catch (ParserException e) {}
+        }
+
+        // then take the no-matchers and load them too
+        for (Map.Entry<MultiProtocolURI, String> entry: links.entrySet()) {
+            try {
+                this.addToIndex(new DigestURI(entry.getKey(), (byte[]) null), searchEvent);
+            } catch (IOException e) {} catch (ParserException e) {}
+        }
+    }
+
     public class receiptSending implements Runnable {
         yacySeed initiatorPeer;
         URIMetadataRow reference;
@@ -2103,6 +2165,39 @@
         crawlQueues.errorURL.push(bentry, initiator, new Date(), 0, failreason);
     }

+    public final void quickFillSite(final String host, final SearchEvent searchEvent) {
+        new Thread() {public void run() {
+            String r = host;
+            if (r.indexOf("//") < 0) r = "http://" + r;
+
+            // get the links for a specific site
+            DigestURI url;
+            try {
+                url = new DigestURI(r, null);
+            } catch (MalformedURLException e) {
+                Log.logException(e);
+                return;
+            }
+
+            Map<MultiProtocolURI, String> links = null;
+            try {
+                links = loader.loadLinks(url, CrawlProfile.CacheStrategy.NOCACHE);
+            } catch (IOException e) {
+                Log.logException(e);
+                return;
+            }
+            Iterator<MultiProtocolURI> i = links.keySet().iterator();
+            MultiProtocolURI u;
+            while (i.hasNext()) {
+                u = i.next();
+                if (!u.getHost().endsWith(host)) i.remove();
+            }
+
+            // add all pages to the index
+            addAllToIndex(url, links, searchEvent);
+        }}.start();
+    }
+
     public int currentPPM() {
         return EventTracker.countEvents("indexed", 20000) * 3;
     }
