added snippet-routines (not yet finished)

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@218 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 20 years ago
parent 9244b6ad6f
commit ca3b4ccaf4

@ -181,7 +181,7 @@ public class Status {
try {
long mins = Long.parseLong(minsAsString);
StringBuilder uptime = new StringBuilder();
StringBuffer uptime = new StringBuffer();
int uptimeDays = (int) (Math.floor(mins/1440));
int uptimeHours = (int) (Math.floor(mins/60)%24);
@ -189,7 +189,7 @@ public class Status {
uptime.append(uptimeDays)
.append(((uptimeDays == 1)?" day ":" days "))
.append((uptimeHours < 10)?"0":"")
.append((uptimeHours < 10)?"0":"")
.append(uptimeHours)
.append(":")
.append((uptimeMins < 10)?"0":"")

@ -333,6 +333,7 @@ public class plasmaCrawlLURL extends plasmaURL {
private char doctype;
private long size;
private int wordCount;
private String snippet;
public entry(URL url, String descr, Date moddate, Date loaddate,
String referrerHash, int copyCount, boolean localNeed,
@ -351,6 +352,7 @@ public class plasmaCrawlLURL extends plasmaURL {
this.doctype = doctype;
this.size = size;
this.wordCount = wordCount;
this.snippet = null;
store();
}
@ -378,6 +380,7 @@ public class plasmaCrawlLURL extends plasmaURL {
this.doctype = (char) entry[10][0];
this.size = (long) serverCodings.enhancedCoder.decodeBase64Long(new String(entry[11]));
this.wordCount = (int) serverCodings.enhancedCoder.decodeBase64Long(new String(entry[12]));
this.snippet = null;
return;
}
} catch (Exception e) {
@ -409,6 +412,8 @@ public class plasmaCrawlLURL extends plasmaURL {
this.doctype = prop.getProperty("dt", "t").charAt(0);
this.size = Long.parseLong(prop.getProperty("size", "0"));
this.wordCount = Integer.parseInt(prop.getProperty("wc", "0"));
this.snippet = prop.getProperty("snippet", "");
if (snippet.length() == 0) snippet = null; else snippet = crypt.simpleDecode(snippet, null);
store();
//}
} catch (Exception e) {
@ -507,6 +512,12 @@ public class plasmaCrawlLURL extends plasmaURL {
return wordCount;
}
public String snippet() {
// Returns the text snippet attached to this URL entry, or null if none was set.
// The snippet may appear here if the url was transported in a remote search;
// it will not be saved anywhere, but can only be requested here.
return snippet;
}
private String corePropList() {
// generate a parseable string; this is a simple property-list
try {

@ -83,7 +83,7 @@ public final class plasmaHTCache {
public long currCacheSize;
public long maxCacheSize;
private long lastAcc;
private final File cachePath;
public final File cachePath;
public static serverLog log;
public static final int CACHE_UNFILLED = 0; // default case without assignment

@ -0,0 +1,234 @@
// plasmaSnippetCache.java
// -----------------------
// part of YaCy
// (C) by Michael Peter Christen; mc@anomic.de
// first published on http://www.anomic.de
// Frankfurt, Germany, 2005
// last major change: 07.06.2005
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// Using this software in any meaning (reading, learning, copying, compiling,
// running) means that you agree that the Author(s) is (are) not responsible
// for cost, loss of data or any harm that may be caused directly or indirectly
// by usage of this softare or this documentation. The usage of this software
// is on your own risk. The installation and usage (starting/running) of this
// software may allow other people or application to access your computer and
// any attached devices and is highly dependent on the configuration of the
// software which must be done by the user of the software; the author(s) is
// (are) also not responsible for proper configuration and usage of the
// software, even if provoked by documentation provided together with
// the software.
//
// Any changes to this file according to the GPL as documented in the file
// gpl.txt aside this file in the shipment you received can be done to the
// lines that follows this copyright notice here, but changes must not be
// done inside the copyright notive above. A re-distribution must contain
// the intact and unchanged copyright notice.
// Contributions and changes to the program code must be marked as such.
package de.anomic.plasma;
import java.util.*;
import java.io.File;
import java.io.IOException;
import java.net.URL;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.kelondro.kelondroMScoreCluster;
import de.anomic.server.serverFileUtils;
import de.anomic.server.serverLog;
import de.anomic.http.httpHeader;
import de.anomic.yacy.yacySearch;
public class plasmaSnippetCache {

    // maximum number of snippets held in the in-memory cache; when exceeded,
    // the entries with the lowest (oldest) insertion stamp are evicted
    private static final int maxCache = 500;

    private int snippetsScoreCounter;            // monotonically increasing insertion stamp
    private kelondroMScoreCluster snippetsScore; // cache key -> insertion stamp (eviction order)
    private HashMap snippetsCache;               // cache key (urlhash + wordhashes) -> snippet text
    private plasmaHTCache cacheManager;          // access to the local HTCACHE (cached web resources)
    private plasmaParser parser;                 // parses cached resources into documents
    private serverLog log;
    private String remoteProxyHost;
    private int remoteProxyPort;
    private boolean remoteProxyUse;

    /**
     * Creates a snippet cache that can compute search-result snippets from
     * documents in the given HTCACHE, optionally fetching missing resources
     * from the web (proxy-aware).
     */
    public plasmaSnippetCache(plasmaHTCache cacheManager, plasmaParser parser,
                              String remoteProxyHost, int remoteProxyPort, boolean remoteProxyUse,
                              serverLog log) {
        this.cacheManager = cacheManager;
        this.parser = parser;
        this.log = log;
        this.remoteProxyHost = remoteProxyHost;
        this.remoteProxyPort = remoteProxyPort;
        this.remoteProxyUse = remoteProxyUse;
        this.snippetsScoreCounter = 0;
        this.snippetsScore = new kelondroMScoreCluster();
        this.snippetsCache = new HashMap();
    }

    /**
     * Stores a snippet for the given (word hashes, url hash) combination.
     * Does nothing if a snippet for that combination is already cached.
     */
    public synchronized void store(String wordhashes, String urlhash, String snippet) {
        // generate key
        String key = urlhash + wordhashes;

        // do nothing if snippet is known
        if (snippetsCache.containsKey(key)) return;

        // learn new snippet
        snippetsScore.addScore(key, snippetsScoreCounter++);
        snippetsCache.put(key, snippet);

        // care for counter: on (theoretical) overflow of the insertion stamp
        // the whole cache is dropped and restarted fresh
        if (snippetsScoreCounter == java.lang.Integer.MAX_VALUE) {
            snippetsScoreCounter = 0;
            snippetsScore = new kelondroMScoreCluster();
            snippetsCache = new HashMap();
        }

        // flush cache if cache is full: evict the oldest entries first
        while (snippetsCache.size() > maxCache) {
            key = (String) snippetsScore.getMinObject();
            snippetsScore.deleteScore(key);
            snippetsCache.remove(key);
        }
    }

    /**
     * Cache lookup only; returns null on a miss.
     * FIX: declared synchronized — store() mutates snippetsCache under the
     * object monitor, so an unsynchronized reader could observe the HashMap
     * in the middle of a rehash from another thread.
     */
    private synchronized String retrieve(String wordhashes, String urlhash) {
        // generate key
        String key = urlhash + wordhashes;
        return (String) snippetsCache.get(key);
    }

    /**
     * Returns a snippet (one matching sentence, at most 140 characters) for the
     * given url and query, or null if none can be produced. Consults the
     * snippet cache first, then tries to compute one from the (possibly
     * online-fetched) cached document.
     *
     * @param fetchOnline    if true, a missing resource is fetched from the web
     * @param query          set of query words, or of word hashes
     * @param queryAreHashes true if the query set already contains word hashes
     */
    public String retrieve(java.net.URL url, boolean fetchOnline, Set query, boolean queryAreHashes) {
        if (query.size() == 0) return null;
        if (!(queryAreHashes)) query = plasmaSearch.words2hashes(query);
        String urlhash = plasmaURL.urlHash(url);

        // try to get snippet from snippetCache
        String wordhashes = yacySearch.set2string(query);
        String snippet = retrieve(wordhashes, urlhash);
        if (snippet != null) return snippet;

        // if the snippet is not in the cache, we can try to get it from the htcache
        plasmaParserDocument document = getDocument(url, fetchOnline);
        if (document == null) return null;
        String[] sentences = document.getSentences();
        //System.out.println("----" + url.toString()); for (int l = 0; l < sentences.length; l++) System.out.println(sentences[l]);
        if ((sentences == null) || (sentences.length == 0)) return null;

        // we have found a parseable non-empty file: score each sentence by the
        // summed lengths of the sentences that contain a query hash, then pick
        // the highest-scoring sentence
        TreeMap sentencematrix = hashMatrix(sentences);
        Iterator i = query.iterator();
        String hash;
        kelondroMScoreCluster hitTable = new kelondroMScoreCluster();
        Iterator j;
        Integer sentencenumber;
        Map.Entry entry;
        while (i.hasNext()) {
            hash = (String) i.next();
            j = sentencematrix.entrySet().iterator();
            while (j.hasNext()) {
                entry = (Map.Entry) j.next();
                sentencenumber = (Integer) entry.getKey();
                if (((HashSet) entry.getValue()).contains(hash)) hitTable.addScore(sentencenumber, sentences[sentencenumber.intValue()].length());
            }
        }
        Integer maxLine = (Integer) hitTable.getMaxObject();
        if (maxLine == null) return null;
        snippet = sentences[maxLine.intValue()];
        if (snippet.length() > 140) return null; // too long to serve as a snippet

        // finally store this snippet in our own cache
        store(wordhashes, urlhash, snippet);
        return snippet;
    }

    /**
     * Maps each sentence index (Integer) to the HashSet of word hashes
     * occurring in that sentence.
     */
    private TreeMap hashMatrix(String[] sentences) {
        TreeMap map = new TreeMap();
        HashSet set;
        Enumeration words;
        for (int i = 0; i < sentences.length; i++) {
            set = new HashSet();
            words = plasmaCondenser.wordTokenizer(sentences[i]);
            while (words.hasMoreElements()) set.add(plasmaWordIndexEntry.word2hash((String) words.nextElement()));
            map.put(new Integer(i), set);
        }
        return map;
    }

    /**
     * Returns the raw resource bytes for the url from the local cache,
     * optionally fetching it from the web first; null if unavailable.
     */
    private byte[] getResource(URL url, boolean fetchOnline) {
        // load the url as resource from the web
        try {
            //return httpc.singleGET(url, 5000, null, null, remoteProxyHost, remoteProxyPort);
            byte[] resource = getResourceFromCache(url);
            if ((fetchOnline) && (resource == null)) {
                loadResourceFromWeb(url, 5000);
                resource = getResourceFromCache(url);
            }
            return resource;
        } catch (IOException e) {
            return null;
        }
    }

    /**
     * Reads the cached file for the url from the HTCACHE directory;
     * null if the file does not exist or cannot be read.
     */
    private byte[] getResourceFromCache(URL url) {
        // load the url as resource from the cache
        // substring(6) strips a fixed-length protocol prefix from the normalized
        // url to obtain the relative cache path — NOTE(review): assumes a
        // 6-character prefix; confirm against htmlFilterContentScraper.urlNormalform
        String path = htmlFilterContentScraper.urlNormalform(url).substring(6);
        File cache = cacheManager.cachePath;
        File f = new File(cache, path);
        if (f.exists()) try {
            return serverFileUtils.read(f);
        } catch (IOException e) {
            return null;
        } else {
            return null;
        }
    }

    /** Fetches the url into the local cache via the crawl worker (proxy-aware). */
    private void loadResourceFromWeb(URL url, int socketTimeout) throws IOException {
        plasmaCrawlWorker.load(
                url,
                null,
                null,
                0,
                null,
                socketTimeout,
                remoteProxyHost,
                remoteProxyPort,
                remoteProxyUse,
                cacheManager,
                log);
    }

    /**
     * Parses the cached resource of the url into a document; null if the
     * resource, its cached response header, or a supported mime type is missing.
     */
    public plasmaParserDocument getDocument(URL url, boolean fetchOnline) {
        byte[] resource = getResource(url, fetchOnline);
        if (resource == null) return null;
        httpHeader header = null;
        try {
            header = cacheManager.getCachedResponse(plasmaURL.urlHash(url));
        } catch (IOException e) {
            return null;
        }
        if (header == null) return null;
        if (plasmaParser.supportedMimeTypesContains(header.mime())) {
            return parser.parseSource(url, header.mime(), resource);
        } else {
            return null;
        }
    }
}

@ -137,6 +137,7 @@ import de.anomic.server.serverCodings;
import de.anomic.server.serverCore;
import de.anomic.server.serverDate;
import de.anomic.server.serverFileUtils;
import de.anomic.server.serverThread;
import de.anomic.server.serverInstantThread;
import de.anomic.server.serverLog;
import de.anomic.server.serverObjects;
@ -170,6 +171,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
public plasmaWordIndex wordIndex;
public plasmaSearch searchManager;
public plasmaHTCache cacheManager;
public plasmaSnippetCache snippetCache;
public plasmaCrawlLoader cacheLoader;
public LinkedList processStack = new LinkedList();
public messageBoard messageDB;
@ -216,6 +218,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
remoteProxyPort = 0;
}
if (!(listsPath.exists())) listsPath.mkdirs();
// load coloured lists
@ -317,6 +320,13 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
long[] testresult = facilityDB.selectLong("statistik", "yyyyMMddHHm");
testresult = facilityDB.selectLong("statistik", (new serverDate()).toShortString(false).substring(0, 11));
// generate snippets cache
log.logSystem("Initializing Snippet Cache");
snippetCache = new plasmaSnippetCache(cacheManager, parser,
remoteProxyHost, remoteProxyPort, remoteProxyUse,
log);
// start yacy core
log.logSystem("Starting YaCy Protocol Core");
yacyCore yc = new yacyCore(this);
@ -328,6 +338,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
new serverInstantThread(this, "cleanupJob", "cleanupJobSize"), 10000); // all 5 Minutes
deployThread("80_dequeue", "Indexing Dequeue", "thread that creates database entries from scraped web content and performes indexing" ,
new serverInstantThread(this, "deQueue", "queueSize"), 10000);
setConfig("81_dequeue_idlesleep" , getConfig("80_dequeue_idlesleep", ""));
setConfig("81_dequeue_busysleep" , getConfig("80_dequeue_busysleep", ""));
deployThread("81_dequeue", "Indexing Dequeue (second job, test run)", "thread that creates database entries from scraped web content and performes indexing" ,
new serverInstantThread(this, "deQueue", "queueSize"), 11000);
deployThread("70_cachemanager", "Proxy Cache Enqueue", "job takes new proxy files from RAM stack, stores them, and hands over to the Indexing Stack",
new serverInstantThread(cacheManager, "job", "size"), 10000);
deployThread("62_remotetriggeredcrawl", "Remote Crawl Job", "thread that performes a single crawl/indexing step triggered by a remote peer",
@ -986,36 +1000,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
if (u == null) return plasmaURL.dummyHash; else return u.toString();
}
/*
private void processCrawlingX(plasmaCrawlNURL.entry urlEntry, String initiator) {
if (urlEntry.url() == null) return;
String profileHandle = urlEntry.profileHandle();
//System.out.println("DEBUG plasmaSwitchboard.processCrawling: profileHandle = " + profileHandle + ", urlEntry.url = " + urlEntry.url());
plasmaCrawlProfile.entry profile = profiles.getEntry(profileHandle);
if (profile == null) {
log.logError("CRAWL[" + noticeURL.coreStackSize() + ", " + noticeURL.remoteStackSize() + "]: LOST PROFILE HANDLE '" + urlEntry.profileHandle() + "' (must be internal error) for URL " + urlEntry.url());
return;
}
log.logDebug("plasmaSwitchboard.processCrawling: url=" + urlEntry.url() + ", initiator=" + urlEntry.initiator() +
", crawlOrder=" + ((profile.remoteIndexing()) ? "true" : "false") + ", depth=" + urlEntry.depth() + ", crawlDepth=" + profile.generalDepth() + ", filter=" + profile.generalFilter() +
", permission=" + ((yacyCore.seedDB == null) ? "undefined" : (((yacyCore.seedDB.mySeed.isSenior()) || (yacyCore.seedDB.mySeed.isPrincipal())) ? "true" : "false")));
boolean tryRemote =
(profile.remoteIndexing()) &&
(urlEntry.depth() == profile.generalDepth()) &&
(urlEntry.initiator() != null) && (!(urlEntry.initiator().equals(plasmaURL.dummyHash))) &&
((yacyCore.seedDB.mySeed.isSenior()) ||
(yacyCore.seedDB.mySeed.isPrincipal())) ;
if (tryRemote) {
boolean success = processRemoteCrawlTrigger(urlEntry);
if (!(success)) processLocalCrawling(urlEntry, profile);
} else {
processLocalCrawling(urlEntry, profile);
}
}
*/
private boolean processLocalCrawling(plasmaCrawlNURL.entry urlEntry, plasmaCrawlProfile.entry profile) {
// work off one Crawl stack entry
if ((urlEntry == null) && (urlEntry.url() == null)) {
@ -1118,6 +1102,42 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
if (date == null) return ""; else return DateFormatter.format(date);
}
// Background thread that runs a local search and pre-fetches snippets for the
// first few matching URLs, so a later searchFromLocal pass finds them cached.
public class presearch extends Thread {
Set queryhashes;
char[] order;
String urlmask;
long time;
public presearch(Set queryhashes, char[] order, long time /*milliseconds*/, String urlmask) {
this.queryhashes = queryhashes;
this.order = order;
this.urlmask = urlmask;
this.time = time;
}
public void run() {
try {
// search the database locally
plasmaWordIndexEntity idx = searchManager.searchHashes(queryhashes, time);
plasmaSearch.result acc = searchManager.order(idx, queryhashes, stopwords, order, time, 3);
if (acc == null) return;
// take some elements (at most 3) and fetch the snippets into the cache
int i = 0;
plasmaCrawlLURL.entry urlentry;
String urlstring;
while ((acc.hasMoreElements()) && (i < 3)) {
urlentry = acc.nextElement();
// skip hashed-host pseudo-domains; NOTE(review): assumes getHost() is non-null
if (urlentry.url().getHost().endsWith(".yacyh")) continue;
urlstring = htmlFilterContentScraper.urlNormalform(urlentry.url());
if (urlstring.matches(urlmask)) { //.* is default
// result is discarded here on purpose: retrieve() stores the snippet
// in the snippetCache as a side effect
snippetCache.retrieve(urlentry.url(), true, queryhashes, true);
i++;
}
}
} catch (IOException e) {
e.printStackTrace();
}
}
}
public serverObjects searchFromLocal(Set querywords, String order1, String order2, int count, boolean global, long time /*milliseconds*/, String urlmask) {
@ -1141,6 +1161,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
log.logInfo("INIT WORD SEARCH: " + gs + " - " + count + " links, " + (time / 1000) + " seconds");
long timestamp = System.currentTimeMillis();
//Thread preselect = new presearch(querywords, order, time / 10, urlmask);
//preselect.start();
// do global fetching
int globalresults = 0;
if (global) {
@ -1148,7 +1171,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
int fetchpeers = ((int) time / 1000) * 3; // number of target peers; means 30 peers in 10 seconds
long fetchtime = time * 7 / 10; // time to waste
if (fetchcount > count) fetchcount = count;
globalresults = yacySearch.search(querywords, loadedURL, searchManager, fetchcount, fetchpeers, fetchtime);
globalresults = yacySearch.searchHashes(queryhashes, loadedURL, searchManager, fetchcount, fetchpeers, snippetCache, fetchtime);
log.logDebug("SEARCH TIME AFTER GLOBAL-TRIGGER TO " + fetchpeers + " PEERS: " + ((System.currentTimeMillis() - timestamp) / 1000) + " seconds");
}
prop.put("globalresults", globalresults); // the result are written to the local DB
@ -1156,7 +1179,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// now search locally (the global results should be now in the local db)
long remainingTime = time - (System.currentTimeMillis() - timestamp);
plasmaWordIndexEntity idx = searchManager.searchWords(querywords, remainingTime * 8 / 10); // the search
plasmaWordIndexEntity idx = searchManager.searchHashes(queryhashes, remainingTime * 8 / 10); // the search
log.logDebug("SEARCH TIME AFTER FINDING " + idx.size() + " ELEMENTS: " + ((System.currentTimeMillis() - timestamp) / 1000) + " seconds");
remainingTime = time - (System.currentTimeMillis() - timestamp);
@ -1176,10 +1199,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
URL url;
plasmaCrawlLURL.entry urlentry;
String urlstring, urlname, filename;
String host, hash;
String descr = "";
String host, hash, address, snippet, descr = "";
yacySeed seed;
String address;
//kelondroMScoreCluster ref = new kelondroMScoreCluster();
while ((acc.hasMoreElements()) && (i < count)) {
urlentry = acc.nextElement();
@ -1218,14 +1239,13 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
*/
//addScoreForked(ref, gs, descr.split(" "));
//addScoreForked(ref, gs, urlstring.split("/"));
String snippet;
if (urlstring.matches(urlmask)) { //.* is default
prop.put("results_" + i + "_description", descr);
prop.put("results_" + i + "_url", urlstring);
prop.put("results_" + i + "_urlname", urlname);
prop.put("results_" + i + "_date", dateString(urlentry.moddate()));
prop.put("results_" + i + "_size", Long.toString(urlentry.size()));
snippet = getSnippet(url, false, querywords, false);
snippet = snippetCache.retrieve(url, false, querywords, false);
if ((snippet == null) || (snippet.length() < 10)) {
prop.put("results_" + i + "_snippet", 0);
prop.put("results_" + i + "_snippet_text", "");
@ -1302,7 +1322,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
String snippet;
while ((acc.hasMoreElements()) && (i < count)) {
urlentry = acc.nextElement();
snippet = getSnippet(urlentry.url(), false, hashes, true);
snippet = snippetCache.retrieve(urlentry.url(), false, hashes, true);
if ((snippet == null) || (snippet.length() < 10)) {
resource = urlentry.toString();
} else {
@ -1375,7 +1395,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
if (url == null) return 0;
// get set of words
//Set words = plasmaCondenser.getWords(getText(getResource(url, fetchOnline)));
Set words = plasmaCondenser.getWords(getDocument(url, fetchOnline).getText());
Set words = plasmaCondenser.getWords(snippetCache.getDocument(url, fetchOnline).getText());
// delete all word references
int count = removeReferences(urlhash, words);
// finally delete the url entry itself
@ -1401,112 +1421,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
}
return count;
}
// Returns the raw resource bytes for the url from the local cache, optionally
// fetching from the web first; null if unavailable.
private byte[] getResource(URL url, boolean fetchOnline) {
// load the url as resource from the web
try {
//return httpc.singleGET(url, 5000, null, null, remoteProxyHost, remoteProxyPort);
byte[] resource = getResourceFromCache(url);
if ((fetchOnline) && (resource == null)) {
// 5000 ms socket timeout for the online fetch
loadResourceFromWeb(url, 5000);
resource = getResourceFromCache(url);
}
return resource;
} catch (IOException e) {
return null;
}
}
// Reads the cached file for the url from the configured HTCACHE directory;
// null if the file does not exist or cannot be read.
private byte[] getResourceFromCache(URL url) {
// load the url as resource from the cache
// substring(6) strips a fixed-length protocol prefix from the normalized url
// to obtain the relative cache path — NOTE(review): assumes a 6-character
// prefix; confirm against htmlFilterContentScraper.urlNormalform
String path = htmlFilterContentScraper.urlNormalform(url).substring(6);
File cache = new File(getRootPath(), getConfig("proxyCache", "DATA/HTCACHE"));
File f = new File(cache, path);
if (f.exists()) try {
return serverFileUtils.read(f);
} catch (IOException e) {
return null;
} else {
return null;
}
}
// Fetches the url into the local cache via the crawl worker (proxy-aware);
// the result becomes visible through getResourceFromCache afterwards.
private void loadResourceFromWeb(URL url, int socketTimeout) throws IOException {
plasmaCrawlWorker.load(
url,
null,
null,
0,
null,
socketTimeout,
remoteProxyHost,
remoteProxyPort,
remoteProxyUse,
cacheManager,
log);
}
// Parses the cached resource of the url into a document; null if the resource,
// its cached response header, or a supported mime type is missing.
private plasmaParserDocument getDocument(URL url, boolean fetchOnline) {
byte[] resource = getResource(url, fetchOnline);
if (resource == null) return null;
httpHeader header = null;
try {
header = cacheManager.getCachedResponse(plasmaURL.urlHash(url));
} catch (IOException e) {
return null;
}
if (header == null) return null;
if (plasmaParser.supportedMimeTypesContains(header.mime())) {
return parser.parseSource(url, header.mime(), resource);
} else {
return null;
}
}
// Computes a snippet (one matching sentence, at most 140 characters) for the
// given url and query, or null if none can be produced. Sentences are scored
// by the summed lengths of sentences containing a query word hash; the
// highest-scoring sentence wins.
private String getSnippet(URL url, boolean fetchOnline, Set query, boolean queryAreHashes) {
if (query.size() == 0) return null;
plasmaParserDocument document = getDocument(url, fetchOnline);
if (document == null) return null;
String[] sentences = document.getSentences();
//System.out.println("----" + url.toString()); for (int l = 0; l < sentences.length; l++) System.out.println(sentences[l]);
if ((sentences == null) || (sentences.length == 0)) return null;
// sentence index -> set of word hashes occurring in that sentence
TreeMap sentencematrix = hashMatrix(sentences);
if (!(queryAreHashes)) query = plasmaSearch.words2hashes(query);
Iterator i = query.iterator();
String hash;
kelondroMScoreCluster hitTable = new kelondroMScoreCluster();
Iterator j;
Integer sentencenumber;
Map.Entry entry;
while (i.hasNext()) {
hash = (String) i.next();
j = sentencematrix.entrySet().iterator();
while (j.hasNext()) {
entry = (Map.Entry) j.next();
sentencenumber = (Integer) entry.getKey();
if (((HashSet) entry.getValue()).contains(hash)) hitTable.addScore(sentencenumber, sentences[sentencenumber.intValue()].length());
}
}
Integer maxLine = (Integer) hitTable.getMaxObject();
if (maxLine == null) return null;
String snippet = sentences[maxLine.intValue()];
// reject over-long sentences; they do not make useful snippets
if (snippet.length() > 140) return null;
return snippet;
}
// Maps each sentence index (Integer) to the HashSet of word hashes occurring
// in that sentence.
private TreeMap hashMatrix(String[] sentences) {
TreeMap map = new TreeMap();
HashSet set;
Enumeration words;
for (int i = 0; i < sentences.length; i++) {
set = new HashSet();
words = plasmaCondenser.wordTokenizer(sentences[i]);
while (words.hasMoreElements()) set.add(plasmaWordIndexEntry.word2hash((String) words.nextElement()));
map.put(new Integer(i), set);
}
return map;
}
public class distributeIndex {
// distributes parts of the index to other peers
// stops as soon as an error occurrs

@ -1,3 +1,42 @@
// serverInstantThread.java
// -----------------------
// (C) by Michael Peter Christen; mc@anomic.de
// first published on http://www.anomic.de
// Frankfurt, Germany, 2005
// last major change: 14.03.2005
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// Using this software in any meaning (reading, learning, copying, compiling,
// running) means that you agree that the Author(s) is (are) not responsible
// for cost, loss of data or any harm that may be caused directly or indirectly
// by usage of this softare or this documentation. The usage of this software
// is on your own risk. The installation and usage (starting/running) of this
// software may allow other people or application to access your computer and
// any attached devices and is highly dependent on the configuration of the
// software which must be done by the user of the software; the author(s) is
// (are) also not responsible for proper configuration and usage of the
// software, even if provoked by documentation provided together with
// the software.
//
// Any changes to this file according to the GPL as documented in the file
// gpl.txt aside this file in the shipment you received can be done to the
// lines that follows this copyright notice here, but changes must not be
// done inside the copyright notive above. A re-distribution must contain
// the intact and unchanged copyright notice.
// Contributions and changes to the program code must be marked as such.
package de.anomic.server;

@ -56,6 +56,7 @@ import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaWordIndexEntity;
import de.anomic.plasma.plasmaWordIndexEntry;
import de.anomic.plasma.plasmaWordIndexEntryContainer;
import de.anomic.plasma.plasmaSnippetCache;
import de.anomic.server.serverCore;
import de.anomic.server.serverObjects;
import de.anomic.tools.crypt;
@ -215,8 +216,9 @@ public class yacyClient {
}
public static int search(String wordhashes, int count, boolean global,
yacySeed targetPeer, plasmaCrawlLURL urlManager, plasmaSearch searchManager,
long duetime) {
yacySeed targetPeer, plasmaCrawlLURL urlManager,
plasmaSearch searchManager, plasmaSnippetCache snippets,
long duetime) {
// send a search request to peer with remote Hash
// this mainly converts the words into word hashes
@ -294,10 +296,19 @@ public class yacyClient {
// insert results to containers
for (int n = 0; n < results; n++) {
// get one single search result
link = urlManager.newEntry((String) result.get("resource" + n), true, yacyCore.seedDB.mySeed.hash, targetPeer.hash, 2);
// save the url entry
plasmaWordIndexEntry entry = new plasmaWordIndexEntry(link.hash(), link.wordCount(), 0, 0, 0,
plasmaSearch.calcVirtualAge(link.moddate()), link.quality(),
link.language(), link.doctype(), false);
if (link.snippet() != null) {
// we don't store the snippets along the url entry, because they are search-specific.
// instead, they are placed in a snippet-search cache.
//System.out.println("--- RECEIVED SNIPPET '" + link.snippet() + "'");
snippets.store(wordhashes, link.hash(), link.snippet());
}
// add the url entry to the word indexes
for (int m = 0; m < words; m++) {
container[m].add(new plasmaWordIndexEntry[]{entry}, System.currentTimeMillis());
}

@ -45,6 +45,7 @@ import java.util.Iterator;
import java.util.Set;
import de.anomic.kelondro.kelondroMScoreCluster;
import de.anomic.plasma.plasmaSnippetCache;
import de.anomic.plasma.plasmaCrawlLURL;
import de.anomic.plasma.plasmaSearch;
@ -55,28 +56,27 @@ public class yacySearch extends Thread {
private boolean global;
private plasmaCrawlLURL urlManager;
private plasmaSearch searchManager;
private plasmaSnippetCache snippetCache;
private yacySeed targetPeer;
private int links;
private long duetime;
public yacySearch(Set wordhashes, int count, boolean global, yacySeed targetPeer,
plasmaCrawlLURL urlManager, plasmaSearch searchManager, long duetime) {
plasmaCrawlLURL urlManager, plasmaSearch searchManager, plasmaSnippetCache snippetCache, long duetime) {
super("yacySearch_" + targetPeer.getName());
this.wordhashes = wordhashes;
this.count = count;
this.global = global;
this.urlManager = urlManager;
this.searchManager = searchManager;
this.snippetCache = snippetCache;
this.targetPeer = targetPeer;
this.links = -1;
this.duetime = duetime;
}
public void run() {
String wh = "";
Iterator i = wordhashes.iterator();
while (i.hasNext()) wh = wh + (String) i.next();
this.links = yacyClient.search(wh, count, global, targetPeer, urlManager, searchManager, duetime);
this.links = yacyClient.search(set2string(wordhashes), count, global, targetPeer, urlManager, searchManager, snippetCache, duetime);
if (links != 0) {
//yacyCore.log.logInfo("REMOTE SEARCH - remote peer '" + targetPeer.get("Name", "anonymous") + "' contributed " + links + " links for word hash " + wordhashes);
yacyCore.seedDB.mySeed.incRI(links);
@ -84,6 +84,13 @@ public class yacySearch extends Thread {
}
}
// Concatenates all word hashes in the given set into one string, in the set's
// iteration order, with no separator. Returns "" for an empty set.
// FIX: uses a StringBuffer instead of repeated String concatenation in the
// loop, which copies the accumulated string on every iteration (O(n^2) work).
// StringBuffer (not StringBuilder) matches this codebase's convention.
public static String set2string(Set hashes) {
StringBuffer wh = new StringBuffer();
Iterator i = hashes.iterator();
while (i.hasNext()) wh.append((String) i.next());
return wh.toString();
}
public int links() {
return this.links;
}
@ -119,8 +126,8 @@ public class yacySearch extends Thread {
return result;
}
public static int search(Set querywords, plasmaCrawlLURL urlManager, plasmaSearch searchManager,
int count, int targets, long waitingtime) {
public static int searchHashes(Set wordhashes, plasmaCrawlLURL urlManager, plasmaSearch searchManager,
int count, int targets, plasmaSnippetCache snippetCache, long waitingtime) {
// check own peer status
if ((yacyCore.seedDB.mySeed == null) || (yacyCore.seedDB.mySeed.getAddress() == null)) return 0;
@ -132,7 +139,7 @@ public class yacySearch extends Thread {
if (duetime < 1000) duetime = 1000;
// prepare seed targets and threads
Set wordhashes = plasmaSearch.words2hashes(querywords);
//Set wordhashes = plasmaSearch.words2hashes(querywords);
yacySeed[] targetPeers = selectPeers(wordhashes, targets);
if (targetPeers == null) return 0;
targets = targetPeers.length;
@ -140,7 +147,7 @@ public class yacySearch extends Thread {
yacySearch[] searchThreads = new yacySearch[targets];
for (int i = 0; i < targets; i++) {
searchThreads[i]= new yacySearch(wordhashes, count, true, targetPeers[i],
urlManager, searchManager, duetime);
urlManager, searchManager, snippetCache, duetime);
searchThreads[i].start();
try {Thread.currentThread().sleep(20);} catch (InterruptedException e) {}
if ((System.currentTimeMillis() - start) > waitingtime) {

Loading…
Cancel
Save