enhanced snippet-loading with threads

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@322 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 20 years ago
parent 4afcf10158
commit 3addf58046

@ -128,6 +128,10 @@ public class CacheAdmin_p {
info += "<b>MEDIA:</b><br>" + formatAnchor(document.getMedialinks()) + "<br>";
info += "<b>EMAIL:</b><br>" + formatAnchor(document.getEmaillinks()) + "<br>";
info += "<b>TEXT:</b><br><span class=\"small\">" + new String(scraper.getText()) + "</span><br>";
info += "<b>LINES:</b><br><span class=\"small\">";
String[] sentences = document.getSentences();
// BUGFIX: the original appended the array reference ("info += sentences"),
// which renders as "[Ljava.lang.String;@..." instead of the sentence text;
// the element sentences[i] must be appended.
for (int i = 0; i < sentences.length; i++) info += sentences[i] + "<br>";
info += "</span><br>";
}
} catch (Exception e) {
info += e.toString();

@ -314,7 +314,7 @@ public abstract class htmlFilterAbstractScraper implements htmlFilterScraper {
}
// string conversions
private static String code_iso8859s(byte c) {
private static String code_iso8859s(int c) {
switch ((int) c & 0xff) {
// german umlaute and ligaturen
@ -361,7 +361,7 @@ public abstract class htmlFilterAbstractScraper implements htmlFilterScraper {
String z;
for (int i = 0; i < bb.length(); i++) {
b = bb.byteAt(i);
z = code_iso8859s(b);
z = code_iso8859s(b & 0xff);
if (z == null) t.append(b); else t.append(z);
}
return t;

@ -148,7 +148,7 @@ public class kelondroMSetTools {
}
// now the same for set-set
public static TreeSet joinConstructive(TreeSet set1, TreeSet set2) {
public static TreeSet joinConstructive(TreeSet set1, TreeSet set2) {
// comparators must be equal
if ((set1 == null) || (set2 == null)) return null;
if (set1.comparator() != set2.comparator()) return null;

@ -324,12 +324,12 @@ public final class plasmaSearch {
public class result /*implements Enumeration*/ {
// Fields of the result accumulator. The diff rendering interleaved the old
// (final) and new (non-final) declarations; only the post-change, non-final
// versions are kept, because cloneSmart() reassigns pageAcc/ref/results on
// the clone and therefore requires mutable fields.
TreeMap pageAcc;                 // key = order hash; value = plasmaLURL.entry
kelondroMScoreCluster ref;       // reference score computation for the commonSense heuristic
Set searchhashes;                // hashes that are searched here
Set stopwords;                   // words that are excluded from the commonSense heuristic
char[] order;                    // order of heuristics
ArrayList results;               // buffer for plasmaWordIndexEntry + plasmaCrawlLURL.entry objects
public result(Set searchhashes, Set stopwords, char[] order) {
this.pageAcc = new TreeMap();
@ -340,6 +340,15 @@ public final class plasmaSearch {
this.results = new ArrayList();
}
public result cloneSmart() {
    // Produce a shallow copy: only the top-level page accumulator is
    // duplicated; ref and results are shared with the original object.
    final result copy = new result(this.searchhashes, this.stopwords, this.order);
    copy.pageAcc = (TreeMap) this.pageAcc.clone();
    copy.ref = this.ref;
    copy.results = this.results;
    return copy;
}
public int sizeOrdered() {
    // Number of URL entries that have been ranked into the ordered accumulator.
    return this.pageAcc.size();
}

@ -59,6 +59,7 @@ public class plasmaSnippetCache {
// Provenance codes for a snippet; they are compared against result.source
// and must therefore be pairwise distinct.
// NOTE(review): the rendered diff showed all four constants as 0, which would
// make the source field meaningless — distinct values are restored here;
// confirm against the upstream revision.
public static final int SOURCE_CACHE = 0;
public static final int SOURCE_FILE  = 1;
public static final int SOURCE_WEB   = 2;
public static final int SOURCE_ERROR = 3;
private int snippetsScoreCounter;
@ -87,20 +88,26 @@ public class plasmaSnippetCache {
// Value holder for a snippet lookup. The diff rendering kept both the old
// two-argument and the new three-argument constructor; only the post-change
// version (with the error field) is kept here.
public class result {
    public String line;   // the snippet text, or null if none was found
    public String error;  // error description when line == null, else null
    public int source;    // one of the SOURCE_* provenance codes

    public result(String line, int source, String error) {
        this.line = line;
        this.source = source;
        this.error = error;
    }

    public String toString() {
        return line;
    }
}
public result retrieve(java.net.URL url, boolean fetchOnline, Set queryhashes) {
public boolean existsInCache(URL url, Set queryhashes) {
    // A snippet exists iff the cache already holds an entry for this
    // word-hash-set / URL-hash combination.
    final String wordhashes = yacySearch.set2string(queryhashes);
    final String urlhash = plasmaURL.urlHash(url);
    return (retrieveFromCache(wordhashes, urlhash) != null);
}
public result retrieve(URL url, Set queryhashes, boolean fetchOnline) {
if (queryhashes.size() == 0) {
//System.out.println("found no queryhashes for url retrieve " + url);
return null;
return new result(null, SOURCE_ERROR, "no query hashes given");
}
String urlhash = plasmaURL.urlHash(url);
@ -109,7 +116,7 @@ public class plasmaSnippetCache {
String line = retrieveFromCache(wordhashes, urlhash);
if (line != null) {
//System.out.println("found snippet for url " + url + " in cache: " + line);
return new result(line, SOURCE_CACHE);
return new result(line, SOURCE_CACHE, null);
}
// if the snippet is not in the cache, we can try to get it from the htcache
@ -123,32 +130,32 @@ public class plasmaSnippetCache {
source = SOURCE_WEB;
}
} catch (IOException e) {
return null;
return new result(null, SOURCE_ERROR, "error loading resource from web: " + e.getMessage());
}
if (resource == null) {
//System.out.println("cannot load document for url " + url);
return null;
return new result(null, SOURCE_ERROR, "error loading resource from web, cacheManager returned NULL");
}
plasmaParserDocument document = parseDocument(url, resource);
if (document == null) return null; // cannot be parsed
if (document == null) return new result(null, SOURCE_ERROR, "parser error/failed"); // cannot be parsed
//System.out.println("loaded document for url " + url);
String[] sentences = document.getSentences();
//System.out.println("----" + url.toString()); for (int l = 0; l < sentences.length; l++) System.out.println(sentences[l]);
if ((sentences == null) || (sentences.length == 0)) {
//System.out.println("found no sentences in url " + url);
return null;
return new result(null, SOURCE_ERROR, "parser returned no sentences");
}
// we have found a parseable non-empty file: use the lines
line = computeSnippet(sentences, queryhashes, 12 * queryhashes.size(), 120);
//System.out.println("loaded snippet for url " + url + ": " + line);
if (line == null) return null;
if (line == null) return new result(null, SOURCE_ERROR, "no matching snippet found");
if (line.length() > 120) line = line.substring(0, 120);
// finally store this snippet in our own cache
storeToCache(wordhashes, urlhash, line);
return new result(line, source);
return new result(line, source, null);
}
public synchronized void storeToCache(String wordhashes, String urlhash, String snippet) {
@ -184,24 +191,50 @@ public class plasmaSnippetCache {
}
private String computeSnippet(String[] sentences, Set queryhashes, int minLength, int maxLength) {
if ((sentences == null) || (sentences.length == 0)) return null;
if ((queryhashes == null) || (queryhashes.size() == 0)) return null;
kelondroMScoreCluster hitTable = new kelondroMScoreCluster();
Iterator i;
Iterator j;
HashSet hs;
for (int j = 0; j < sentences.length; j++) {
if ((sentences[j].length() > minLength) && (sentences[j].length() < maxLength)) {
hs = hashSentence(sentences[j]);
i = queryhashes.iterator();
while (i.hasNext()) {
if (hs.contains((String) i.next())) hitTable.incScore(new Integer(j));
for (int i = 0; i < sentences.length; i++) {
if ((sentences[i].length() > minLength) && (sentences[i].length() < maxLength)) {
hs = hashSentence(sentences[i]);
j = queryhashes.iterator();
while (j.hasNext()) {
if (hs.contains((String) j.next())) hitTable.incScore(new Integer(i));
}
}
}
Integer maxLine = (Integer) hitTable.getMaxObject();
if (maxLine == null) return null;
if (hitTable.getScore(maxLine) == 0) return null;
return sentences[maxLine.intValue()];
int score = hitTable.getMaxScore(); // best number of hits
if (score <= 0) return null;
// we found (a) line(s) that have <score> hits.
// now find the shortest line of these hits
int shortLineIndex = -1;
int shortLineLength = Integer.MAX_VALUE;
for (int i = 0; i < sentences.length; i++) {
if ((hitTable.getScore(new Integer(i)) == score) &&
(sentences[i].length() < shortLineLength)) {
shortLineIndex = i;
shortLineLength = sentences[i].length();
}
}
// find a first result
String result = sentences[shortLineIndex];
if (score == queryhashes.size()) return result;
// the result has not all words in it.
// find another sentence that represents the missing other words
// first remove all words that appear in the result from the queryhashes
hs = hashSentence(result);
j = queryhashes.iterator();
while (j.hasNext()) {
if (hs.contains((String) j.next())) j.remove();
}
if (queryhashes.size() == 0) return result;
// now find recursively more sentences
String nextSnippet = computeSnippet(sentences, queryhashes, minLength, maxLength);
return result + ((nextSnippet == null) ? "" : (" ... " + nextSnippet));
}
private HashSet hashSentence(String sentence) {
HashSet set = new HashSet();
Enumeration words = plasmaCondenser.wordTokenizer(sentence);
@ -264,5 +297,4 @@ public class plasmaSnippetCache {
log);
}
}

@ -1136,12 +1136,13 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
char[] order;
String urlmask;
long time;
// The diff rendering kept both the old (fetchcount-only) and the new
// (searchcount + fetchcount) declarations and constructor signatures;
// only the post-change version is kept here.
int searchcount, fetchcount; // searchcount: ranking depth; fetchcount: snippets to prefetch
public presearch(Set queryhashes, char[] order, long time /*milliseconds*/, String urlmask, int searchcount, int fetchcount) {
    this.queryhashes = queryhashes;
    this.order = order;
    this.urlmask = urlmask;
    this.time = time;
    this.searchcount = searchcount;
    this.fetchcount = fetchcount;
}
public void run() {
@ -1150,26 +1151,12 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
log.logDebug("presearch: started job");
plasmaWordIndexEntity idx = searchManager.searchHashes(queryhashes, time);
log.logDebug("presearch: found " + idx.size() + " results");
plasmaSearch.result acc = searchManager.order(idx, queryhashes, stopwords, order, time, fetchcount);
plasmaSearch.result acc = searchManager.order(idx, queryhashes, stopwords, order, time, searchcount);
if (acc == null) return;
log.logDebug("presearch: ordered results, now " + acc.sizeOrdered() + " URLs ready for fetch");
// take some elements and fetch the snippets
int i = 0;
plasmaCrawlLURL.entry urlentry;
String urlstring;
plasmaSnippetCache.result snippet;
while ((acc.hasMoreElements()) && (i < fetchcount)) {
urlentry = acc.nextElement();
if (urlentry.url().getHost().endsWith(".yacyh")) continue;
urlstring = htmlFilterContentScraper.urlNormalform(urlentry.url());
if (urlstring.matches(urlmask)) { //.* is default
log.logDebug("presearch: fetching URL " + urlstring);
snippet = snippetCache.retrieve(urlentry.url(), true, queryhashes);
if (snippet != null) log.logDebug("found snippet for URL " + urlstring + ": '" + snippet.line + "'");
i++;
}
}
fetchSnippets(acc, queryhashes, urlmask, fetchcount);
} catch (IOException e) {
e.printStackTrace();
}
@ -1177,6 +1164,42 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
}
}
public void fetchSnippets(plasmaSearch.result acc, Set queryhashes, String urlmask, int fetchcount) {
    // Kick off background snippet fetchers for up to fetchcount of the
    // ordered results; skips yacy pseudo-hosts, URLs not matching the mask,
    // and URLs whose snippet is already cached.
    int started = 0;
    while ((acc.hasMoreElements()) && (started < fetchcount)) {
        plasmaCrawlLURL.entry entry = acc.nextElement();
        if (entry.url().getHost().endsWith(".yacyh")) continue;
        String normalform = htmlFilterContentScraper.urlNormalform(entry.url());
        if ((normalform.matches(urlmask)) && // .* is default
            (!(snippetCache.existsInCache(entry.url(), queryhashes)))) {
            new snippetFetcher(entry.url(), queryhashes).start();
            started++;
        }
    }
}
public class snippetFetcher extends Thread {
URL url;
Set queryhashes;
public snippetFetcher(URL url, Set queryhashes) {
if (url.getHost().endsWith(".yacyh")) return;
this.url = url;
this.queryhashes = queryhashes;
}
public void run() {
log.logDebug("snippetFetcher: try to get URL " + url);
plasmaSnippetCache.result snippet = snippetCache.retrieve(url, queryhashes, true);
if (snippet.line == null)
log.logDebug("snippetFetcher: cannot get URL " + url + ". error: " + snippet.error);
else
log.logDebug("snippetFetcher: got URL " + url + ", the snippet is '" + snippet.line + "', source=" + snippet.source);
}
}
public serverObjects searchFromLocal(Set querywords, String order1, String order2, int count, boolean global, long time /*milliseconds*/, String urlmask) {
serverObjects prop = new serverObjects();
@ -1199,11 +1222,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
log.logInfo("INIT WORD SEARCH: " + gs + " - " + count + " links, " + (time / 1000) + " seconds");
long timestamp = System.currentTimeMillis();
// start a presearch, which makes only sense if we idle afterwards.
// this is especially the case if we start a global search and idle until search
if (global) {
// start a presearch, which makes only sense if we idle afterwards.
// this is especially the case if we start a global search and idle until search
// results appear from other peers
Thread preselect = new presearch(queryhashes, order, time / 10, urlmask, 5);
Thread preselect = new presearch(queryhashes, order, time / 10, urlmask, 10, 3);
preselect.start();
}
@ -1229,6 +1251,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
if (remainingTime < 500) remainingTime = 500;
if (remainingTime > 3000) remainingTime = 3000;
plasmaSearch.result acc = searchManager.order(idx, queryhashes, stopwords, order, remainingTime, 10);
if (!(global)) fetchSnippets(acc.cloneSmart(), queryhashes, urlmask, 10);
log.logDebug("SEARCH TIME AFTER ORDERING OF SEARCH RESULT: " + ((System.currentTimeMillis() - timestamp) / 1000) + " seconds");
// result is a List of urlEntry elements: prepare answer
@ -1289,8 +1312,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
prop.put("results_" + i + "_urlname", urlname);
prop.put("results_" + i + "_date", dateString(urlentry.moddate()));
prop.put("results_" + i + "_size", Long.toString(urlentry.size()));
snippet = snippetCache.retrieve(url, false, queryhashes);
if ((snippet == null) || (snippet.line.length() < 10)) {
snippet = snippetCache.retrieve(url, queryhashes, false);
if (snippet.line == null) {
prop.put("results_" + i + "_snippet", 0);
prop.put("results_" + i + "_snippet_text", "");
} else {
@ -1366,8 +1389,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
plasmaSnippetCache.result snippet;
while ((acc.hasMoreElements()) && (i < count)) {
urlentry = acc.nextElement();
snippet = snippetCache.retrieve(urlentry.url(), false, hashes);
if ((snippet == null) || (snippet.line.length() < 10)) {
snippet = snippetCache.retrieve(urlentry.url(), hashes, false);
if (snippet.line == null) {
resource = urlentry.toString();
} else {
resource = urlentry.toString(snippet.line);

Loading…
Cancel
Save