This is the beginning of some architecture changes that will hopefully bring more stability, speed, and transparency to the search process. git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6260 6c8d7289-2bf4-0310-a012-ef5d649a1542 pull/1/head
parent
c4d0e22a77
commit
72ac5bd80f
@ -1,40 +1,41 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<classpath>
|
||||
<classpathentry excluding="env/|htdocsdefault/|proxymsg/|yacy/|env/|yacy/user/|yacy/user/|yacy/ui/|processing/domaingraph/applet/|processing/domaingraph/|api/|api/bookmarks/posts/|api/bookmarks/|api/util/|api/bookmarks/xbel/|api/bookmarks/tags/" kind="src" path="htroot"/>
|
||||
<classpathentry kind="src" path="test"/>
|
||||
<classpathentry excluding="user/|user/|ui/" kind="src" path="htroot/yacy"/>
|
||||
<classpathentry kind="src" path="htroot/env"/>
|
||||
<classpathentry kind="src" path="source"/>
|
||||
<classpathentry kind="src" path="htroot/yacy/ui"/>
|
||||
<classpathentry excluding="bookmarks/posts/|bookmarks/|util/|bookmarks/xbel/|bookmarks/tags/" kind="src" path="htroot/api"/>
|
||||
<classpathentry kind="src" path="htroot/api/bookmarks/posts"/>
|
||||
<classpathentry excluding="posts/|xbel/|tags/" kind="src" path="htroot/api/bookmarks"/>
|
||||
<classpathentry kind="src" path="htroot/api/util"/>
|
||||
<classpathentry kind="src" path="htroot/api/bookmarks/xbel"/>
|
||||
<classpathentry kind="src" path="htroot/api/bookmarks/tags"/>
|
||||
<classpathentry exported="true" kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
|
||||
<classpathentry exported="true" kind="lib" path="lib/commons-httpclient-3.1.jar"/>
|
||||
<classpathentry exported="true" kind="lib" path="lib/commons-logging-1.1.1.jar"/>
|
||||
<classpathentry exported="true" kind="lib" path="lib/commons-io-1.4.jar"/>
|
||||
<classpathentry exported="true" kind="lib" path="lib/commons-fileupload-1.2.1.jar"/>
|
||||
<classpathentry exported="true" kind="lib" path="lib/servlet-api.jar"/>
|
||||
<classpathentry kind="lib" path="lib/xerces.jar"/>
|
||||
<classpathentry kind="lib" path="lib/bzip2.jar"/>
|
||||
<classpathentry kind="lib" path="lib/mysql-connector-java-5.1.7-bin.jar"/>
|
||||
<classpathentry kind="lib" path="lib/bcmail-jdk14-139.jar"/>
|
||||
<classpathentry kind="lib" path="lib/bcprov-jdk14-139.jar"/>
|
||||
<classpathentry kind="lib" path="lib/commons-codec-1.3.jar"/>
|
||||
<classpathentry kind="lib" path="lib/FontBox-0.1.0-dev.jar"/>
|
||||
<classpathentry kind="lib" path="lib/J7Zip-modified.jar"/>
|
||||
<classpathentry kind="lib" path="lib/jakarta-oro-2.0.7.jar"/>
|
||||
<classpathentry kind="lib" path="lib/jsch-0.1.21.jar"/>
|
||||
<classpathentry kind="lib" path="lib/log4j-1.2.9.jar"/>
|
||||
<classpathentry kind="lib" path="lib/PDFBox-0.7.3.jar"/>
|
||||
<classpathentry kind="lib" path="lib/poi-3.2-FINAL-20081019.jar"/>
|
||||
<classpathentry kind="lib" path="lib/poi-scratchpad-3.2-FINAL-20081019.jar"/>
|
||||
<classpathentry kind="lib" path="lib/webcat-0.1-swf.jar"/>
|
||||
<classpathentry kind="lib" path="lib/activation.jar"/>
|
||||
<classpathentry kind="lib" path="lib/commons-jxpath-1.3.jar"/>
|
||||
<classpathentry kind="con" path="org.eclipse.jdt.junit.JUNIT_CONTAINER/4"/>
|
||||
<classpathentry kind="output" path="gen"/>
|
||||
</classpath>
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<classpath>
|
||||
<classpathentry excluding="env/|htdocsdefault/|proxymsg/|yacy/|env/|yacy/user/|yacy/user/|yacy/ui/|processing/domaingraph/applet/|processing/domaingraph/|api/|api/bookmarks/posts/|api/bookmarks/|api/util/|api/bookmarks/xbel/|api/bookmarks/tags/" kind="src" path="htroot"/>
|
||||
<classpathentry kind="src" path="test"/>
|
||||
<classpathentry excluding="user/|user/|ui/" kind="src" path="htroot/yacy"/>
|
||||
<classpathentry kind="src" path="htroot/env"/>
|
||||
<classpathentry kind="src" path="source"/>
|
||||
<classpathentry kind="src" path="htroot/yacy/ui"/>
|
||||
<classpathentry excluding="bookmarks/posts/|bookmarks/|util/|bookmarks/xbel/|bookmarks/tags/" kind="src" path="htroot/api"/>
|
||||
<classpathentry kind="src" path="htroot/api/bookmarks/posts"/>
|
||||
<classpathentry excluding="posts/|xbel/|tags/" kind="src" path="htroot/api/bookmarks"/>
|
||||
<classpathentry kind="src" path="htroot/api/util"/>
|
||||
<classpathentry kind="src" path="htroot/api/bookmarks/xbel"/>
|
||||
<classpathentry kind="src" path="htroot/api/bookmarks/tags"/>
|
||||
<classpathentry exported="true" kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
|
||||
<classpathentry exported="true" kind="lib" path="lib/commons-httpclient-3.1.jar"/>
|
||||
<classpathentry exported="true" kind="lib" path="lib/commons-logging-1.1.1.jar"/>
|
||||
<classpathentry exported="true" kind="lib" path="lib/commons-io-1.4.jar"/>
|
||||
<classpathentry exported="true" kind="lib" path="lib/commons-fileupload-1.2.1.jar"/>
|
||||
<classpathentry exported="true" kind="lib" path="lib/servlet-api.jar"/>
|
||||
<classpathentry kind="lib" path="lib/xerces.jar"/>
|
||||
<classpathentry kind="lib" path="lib/bzip2.jar"/>
|
||||
<classpathentry kind="lib" path="lib/mysql-connector-java-5.1.7-bin.jar"/>
|
||||
<classpathentry kind="lib" path="lib/bcmail-jdk14-139.jar"/>
|
||||
<classpathentry kind="lib" path="lib/bcprov-jdk14-139.jar"/>
|
||||
<classpathentry kind="lib" path="lib/commons-codec-1.3.jar"/>
|
||||
<classpathentry kind="lib" path="lib/FontBox-0.1.0-dev.jar"/>
|
||||
<classpathentry kind="lib" path="lib/J7Zip-modified.jar"/>
|
||||
<classpathentry kind="lib" path="lib/jakarta-oro-2.0.7.jar"/>
|
||||
<classpathentry kind="lib" path="lib/jsch-0.1.21.jar"/>
|
||||
<classpathentry kind="lib" path="lib/log4j-1.2.9.jar"/>
|
||||
<classpathentry kind="lib" path="lib/PDFBox-0.7.3.jar"/>
|
||||
<classpathentry kind="lib" path="lib/poi-3.2-FINAL-20081019.jar"/>
|
||||
<classpathentry kind="lib" path="lib/poi-scratchpad-3.2-FINAL-20081019.jar"/>
|
||||
<classpathentry kind="lib" path="lib/webcat-0.1-swf.jar"/>
|
||||
<classpathentry kind="lib" path="lib/activation.jar"/>
|
||||
<classpathentry kind="lib" path="lib/commons-jxpath-1.3.jar"/>
|
||||
<classpathentry kind="lib" path="libt/junit.jar"/>
|
||||
<classpathentry kind="con" path="org.eclipse.jdt.junit.JUNIT_CONTAINER/4"/>
|
||||
<classpathentry kind="output" path="gen"/>
|
||||
</classpath>
|
||||
|
@ -0,0 +1,163 @@
|
||||
// ResultEntry.java
|
||||
// (C) 2005 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
|
||||
// first published 10.10.2005 on http://yacy.net
|
||||
//
|
||||
// This is a part of YaCy, a peer-to-peer based web search engine
|
||||
//
|
||||
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
|
||||
// $LastChangedRevision: 1986 $
|
||||
// $LastChangedBy: orbiter $
|
||||
//
|
||||
// LICENSE
|
||||
//
|
||||
// This program is free software; you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation; either version 2 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with this program; if not, write to the Free Software
|
||||
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
|
||||
package de.anomic.search;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Date;
|
||||
|
||||
import de.anomic.document.Condenser;
|
||||
import de.anomic.document.Word;
|
||||
import de.anomic.kelondro.order.Bitfield;
|
||||
import de.anomic.kelondro.text.Reference;
|
||||
import de.anomic.kelondro.text.Segment;
|
||||
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
|
||||
import de.anomic.kelondro.text.referencePrototype.WordReferenceVars;
|
||||
import de.anomic.yacy.yacySeed;
|
||||
import de.anomic.yacy.yacySeedDB;
|
||||
import de.anomic.yacy.yacyURL;
|
||||
|
||||
public class ResultEntry {
|
||||
|
||||
// payload objects
|
||||
private final URLMetadataRow urlentry;
|
||||
private final URLMetadataRow.Components urlcomps; // buffer for components
|
||||
private String alternative_urlstring;
|
||||
private String alternative_urlname;
|
||||
private final SnippetCache.TextSnippet textSnippet;
|
||||
private final ArrayList<SnippetCache.MediaSnippet> mediaSnippets;
|
||||
|
||||
// statistic objects
|
||||
public long dbRetrievalTime, snippetComputationTime;
|
||||
|
||||
public ResultEntry(final URLMetadataRow urlentry,
|
||||
final Segment indexSegment,
|
||||
yacySeedDB peers,
|
||||
final SnippetCache.TextSnippet textSnippet,
|
||||
final ArrayList<SnippetCache.MediaSnippet> mediaSnippets,
|
||||
final long dbRetrievalTime, final long snippetComputationTime) {
|
||||
this.urlentry = urlentry;
|
||||
this.urlcomps = urlentry.metadata();
|
||||
this.alternative_urlstring = null;
|
||||
this.alternative_urlname = null;
|
||||
this.textSnippet = textSnippet;
|
||||
this.mediaSnippets = mediaSnippets;
|
||||
this.dbRetrievalTime = dbRetrievalTime;
|
||||
this.snippetComputationTime = snippetComputationTime;
|
||||
final String host = urlcomps.url().getHost();
|
||||
if (host.endsWith(".yacyh")) {
|
||||
// translate host into current IP
|
||||
int p = host.indexOf(".");
|
||||
final String hash = yacySeed.hexHash2b64Hash(host.substring(p + 1, host.length() - 6));
|
||||
final yacySeed seed = peers.getConnected(hash);
|
||||
final String filename = urlcomps.url().getFile();
|
||||
String address = null;
|
||||
if ((seed == null) || ((address = seed.getPublicAddress()) == null)) {
|
||||
// seed is not known from here
|
||||
try {
|
||||
indexSegment.termIndex().remove(
|
||||
Word.words2hashes(Condenser.getWords(
|
||||
("yacyshare " +
|
||||
filename.replace('?', ' ') +
|
||||
" " +
|
||||
urlcomps.dc_title())).keySet()),
|
||||
urlentry.hash());
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
indexSegment.urlMetadata().remove(urlentry.hash()); // clean up
|
||||
throw new RuntimeException("index void");
|
||||
}
|
||||
alternative_urlstring = "http://" + address + "/" + host.substring(0, p) + filename;
|
||||
alternative_urlname = "http://share." + seed.getName() + ".yacy" + filename;
|
||||
if ((p = alternative_urlname.indexOf("?")) > 0) alternative_urlname = alternative_urlname.substring(0, p);
|
||||
}
|
||||
}
|
||||
public int hashCode() {
|
||||
return urlentry.hash().hashCode();
|
||||
}
|
||||
public String hash() {
|
||||
return urlentry.hash();
|
||||
}
|
||||
public yacyURL url() {
|
||||
return urlcomps.url();
|
||||
}
|
||||
public Bitfield flags() {
|
||||
return urlentry.flags();
|
||||
}
|
||||
public String urlstring() {
|
||||
return (alternative_urlstring == null) ? urlcomps.url().toNormalform(false, true) : alternative_urlstring;
|
||||
}
|
||||
public String urlname() {
|
||||
return (alternative_urlname == null) ? yacyURL.unescape(urlcomps.url().toNormalform(false, true)) : alternative_urlname;
|
||||
}
|
||||
public String title() {
|
||||
return urlcomps.dc_title();
|
||||
}
|
||||
public SnippetCache.TextSnippet textSnippet() {
|
||||
return this.textSnippet;
|
||||
}
|
||||
public ArrayList<SnippetCache.MediaSnippet> mediaSnippets() {
|
||||
return this.mediaSnippets;
|
||||
}
|
||||
public Date modified() {
|
||||
return urlentry.moddate();
|
||||
}
|
||||
public int filesize() {
|
||||
return urlentry.size();
|
||||
}
|
||||
public int limage() {
|
||||
return urlentry.limage();
|
||||
}
|
||||
public int laudio() {
|
||||
return urlentry.laudio();
|
||||
}
|
||||
public int lvideo() {
|
||||
return urlentry.lvideo();
|
||||
}
|
||||
public int lapp() {
|
||||
return urlentry.lapp();
|
||||
}
|
||||
public WordReferenceVars word() {
|
||||
final Reference word = urlentry.word();
|
||||
assert word instanceof WordReferenceVars;
|
||||
return (WordReferenceVars) word;
|
||||
}
|
||||
public boolean hasTextSnippet() {
|
||||
return (this.textSnippet != null) && (this.textSnippet.getErrorCode() < 11);
|
||||
}
|
||||
public boolean hasMediaSnippets() {
|
||||
return (this.mediaSnippets != null) && (this.mediaSnippets.size() > 0);
|
||||
}
|
||||
public String resource() {
|
||||
// generate transport resource
|
||||
if ((textSnippet == null) || (!textSnippet.exists())) {
|
||||
return urlentry.toString();
|
||||
}
|
||||
return urlentry.toString(textSnippet.getLineRaw());
|
||||
}
|
||||
}
|
@ -0,0 +1,129 @@
|
||||
// SearchEventCache.java
|
||||
// (C) 2005 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
|
||||
// first published 10.10.2005 on http://yacy.net
|
||||
//
|
||||
// This is a part of YaCy, a peer-to-peer based web search engine
|
||||
//
|
||||
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
|
||||
// $LastChangedRevision: 1986 $
|
||||
// $LastChangedBy: orbiter $
|
||||
//
|
||||
// LICENSE
|
||||
//
|
||||
// This program is free software; you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation; either version 2 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with this program; if not, write to the Free Software
|
||||
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
|
||||
package de.anomic.search;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Iterator;
|
||||
import java.util.TreeMap;
|
||||
import java.util.TreeSet;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
|
||||
import de.anomic.crawler.ResultURLs;
|
||||
import de.anomic.kelondro.text.Segment;
|
||||
import de.anomic.search.SearchEvent.SnippetFetcher;
|
||||
import de.anomic.yacy.yacySeedDB;
|
||||
import de.anomic.yacy.logging.Log;
|
||||
|
||||
public class SearchEventCache {
|
||||
|
||||
protected static ConcurrentHashMap<String, SearchEvent> lastEvents = new ConcurrentHashMap<String, SearchEvent>(); // a cache for objects from this class: re-use old search requests
|
||||
public static final long eventLifetime = 60000; // the time an event will stay in the cache, 1 Minute
|
||||
|
||||
public static void cleanupEvents(final boolean all) {
|
||||
// remove old events in the event cache
|
||||
final Iterator<SearchEvent> i = lastEvents.values().iterator();
|
||||
SearchEvent cleanEvent;
|
||||
while (i.hasNext()) {
|
||||
cleanEvent = i.next();
|
||||
if ((all) || (cleanEvent.eventTime + eventLifetime < System.currentTimeMillis())) {
|
||||
// execute deletion of failed words
|
||||
int rw = cleanEvent.failedURLs.size();
|
||||
if (rw > 0) {
|
||||
final TreeSet<byte[]> removeWords = cleanEvent.query.queryHashes;
|
||||
removeWords.addAll(cleanEvent.query.excludeHashes);
|
||||
try {
|
||||
final Iterator<byte[]> j = removeWords.iterator();
|
||||
// remove the same url hashes for multiple words
|
||||
while (j.hasNext()) {
|
||||
cleanEvent.indexSegment.termIndex().remove(j.next(), cleanEvent.failedURLs.keySet());
|
||||
}
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
Log.logInfo("SearchEvents", "cleaning up event " + cleanEvent.query.id(true) + ", removed " + rw + " URL references on " + removeWords.size() + " words");
|
||||
}
|
||||
|
||||
// remove the event
|
||||
i.remove();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public static SearchEvent getEvent(final String eventID) {
|
||||
return lastEvents.get(eventID);
|
||||
}
|
||||
|
||||
public static SearchEvent getEvent(
|
||||
final QueryParams query,
|
||||
final Segment indexSegment,
|
||||
final yacySeedDB peers,
|
||||
final ResultURLs crawlResults,
|
||||
final TreeMap<byte[], String> preselectedPeerHashes,
|
||||
final boolean generateAbstracts) {
|
||||
|
||||
String id = query.id(false);
|
||||
SearchEvent event = SearchEventCache.lastEvents.get(id);
|
||||
if (Switchboard.getSwitchboard().crawlQueues.noticeURL.size() > 0 && event != null && System.currentTimeMillis() - event.eventTime > 60000) {
|
||||
// if a local crawl is ongoing, don't use the result from the cache to use possibly more results that come from the current crawl
|
||||
// to prevent that this happens during a person switches between the different result pages, a re-search happens no more than
|
||||
// once a minute
|
||||
SearchEventCache.lastEvents.remove(id);
|
||||
event = null;
|
||||
} else {
|
||||
if (event != null) {
|
||||
//re-new the event time for this event, so it is not deleted next time too early
|
||||
event.eventTime = System.currentTimeMillis();
|
||||
// replace the query, because this contains the current result offset
|
||||
event.query = query;
|
||||
}
|
||||
}
|
||||
if (event == null) {
|
||||
// generate a new event
|
||||
event = new SearchEvent(query, indexSegment, peers, crawlResults, preselectedPeerHashes, generateAbstracts);
|
||||
} else {
|
||||
// if worker threads had been alive, but did not succeed, start them again to fetch missing links
|
||||
if ((!event.anyWorkerAlive()) &&
|
||||
(((query.contentdom == QueryParams.CONTENTDOM_IMAGE) && (event.images.size() + 30 < query.neededResults())) ||
|
||||
(event.result.size() < query.neededResults() + 10)) &&
|
||||
//(event.query.onlineSnippetFetch) &&
|
||||
(event.getRankingResult().getLocalResourceSize() + event.getRankingResult().getRemoteResourceSize() > event.result.size())) {
|
||||
// set new timeout
|
||||
event.eventTime = System.currentTimeMillis();
|
||||
// start worker threads to fetch urls and snippets
|
||||
event.workerThreads = new SnippetFetcher[SearchEvent.workerThreadCount];
|
||||
SnippetFetcher worker;
|
||||
for (int i = 0; i < event.workerThreads.length; i++) {
|
||||
worker = event.new SnippetFetcher(i, 6000, (query.onlineSnippetFetch) ? 2 : 0);
|
||||
worker.start();
|
||||
event.workerThreads[i] = worker;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return event;
|
||||
}
|
||||
}
|
Loading…
Reference in new issue