diff --git a/htroot/CacheAdmin_p.java b/htroot/CacheAdmin_p.java
index 72d6035ad..1b83dc28b 100644
--- a/htroot/CacheAdmin_p.java
+++ b/htroot/CacheAdmin_p.java
@@ -128,6 +128,10 @@ public class CacheAdmin_p {
info += "MEDIA:
" + formatAnchor(document.getMedialinks()) + "
";
info += "EMAIL:
" + formatAnchor(document.getEmaillinks()) + "
";
info += "TEXT:
" + new String(scraper.getText()) + "
";
+ info += "LINES:
";
+ String[] sentences = document.getSentences();
+ for (int i = 0; i < sentences.length; i++) info += sentences[i] + "
";
+ info += "
";
}
} catch (Exception e) {
info += e.toString();
diff --git a/source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java b/source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java
index 301bcab8c..17b1a3ffd 100644
--- a/source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java
+++ b/source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java
@@ -314,7 +314,7 @@ public abstract class htmlFilterAbstractScraper implements htmlFilterScraper {
}
// string conversions
- private static String code_iso8859s(byte c) {
+ private static String code_iso8859s(int c) {
switch ((int) c & 0xff) {
// german umlaute and ligaturen
@@ -361,7 +361,7 @@ public abstract class htmlFilterAbstractScraper implements htmlFilterScraper {
String z;
for (int i = 0; i < bb.length(); i++) {
b = bb.byteAt(i);
- z = code_iso8859s(b);
+ z = code_iso8859s(b & 0xff);
if (z == null) t.append(b); else t.append(z);
}
return t;
diff --git a/source/de/anomic/kelondro/kelondroMSetTools.java b/source/de/anomic/kelondro/kelondroMSetTools.java
index 50ce7c04a..ff1bd17b6 100644
--- a/source/de/anomic/kelondro/kelondroMSetTools.java
+++ b/source/de/anomic/kelondro/kelondroMSetTools.java
@@ -148,7 +148,7 @@ public class kelondroMSetTools {
}
// now the same for set-set
- public static TreeSet joinConstructive(TreeSet set1, TreeSet set2) {
+ public static TreeSet joinConstructive(TreeSet set1, TreeSet set2) {
// comparators must be equal
if ((set1 == null) || (set2 == null)) return null;
if (set1.comparator() != set2.comparator()) return null;
diff --git a/source/de/anomic/plasma/plasmaSearch.java b/source/de/anomic/plasma/plasmaSearch.java
index b22a3d79a..da5529f69 100644
--- a/source/de/anomic/plasma/plasmaSearch.java
+++ b/source/de/anomic/plasma/plasmaSearch.java
@@ -324,12 +324,12 @@ public final class plasmaSearch {
public class result /*implements Enumeration*/ {
- final TreeMap pageAcc; // key = order hash; value = plasmaLURL.entry
- final kelondroMScoreCluster ref; // reference score computation for the commonSense heuristic
- final Set searchhashes; // hashes that are searched here
- final Set stopwords; // words that are excluded from the commonSense heuristic
- final char[] order; // order of heuristics
- ArrayList results; // this is a buffer for plasmaWordIndexEntry + plasmaCrawlLURL.entry - objects
+ TreeMap pageAcc; // key = order hash; value = plasmaLURL.entry
+ kelondroMScoreCluster ref; // reference score computation for the commonSense heuristic
+ Set searchhashes; // hashes that are searched here
+ Set stopwords; // words that are excluded from the commonSense heuristic
+ char[] order; // order of heuristics
+ ArrayList results; // this is a buffer for plasmaWordIndexEntry + plasmaCrawlLURL.entry - objects
public result(Set searchhashes, Set stopwords, char[] order) {
this.pageAcc = new TreeMap();
@@ -340,6 +340,15 @@ public final class plasmaSearch {
this.results = new ArrayList();
}
+ public result cloneSmart() {
+ // clones only the top structure
+ result theClone = new result(this.searchhashes, this.stopwords, this.order);
+ theClone.pageAcc = (TreeMap) this.pageAcc.clone();
+ theClone.ref = this.ref;
+ theClone.results = this.results;
+ return theClone;
+ }
+
public int sizeOrdered() {
return pageAcc.size();
}
diff --git a/source/de/anomic/plasma/plasmaSnippetCache.java b/source/de/anomic/plasma/plasmaSnippetCache.java
index 5d824c3bf..a51785167 100644
--- a/source/de/anomic/plasma/plasmaSnippetCache.java
+++ b/source/de/anomic/plasma/plasmaSnippetCache.java
@@ -59,6 +59,7 @@ public class plasmaSnippetCache {
public static final int SOURCE_CACHE = 0;
public static final int SOURCE_FILE = 0;
public static final int SOURCE_WEB = 0;
+ public static final int SOURCE_ERROR = -1; // must be distinct from the success source codes so callers can tell errors apart
private int snippetsScoreCounter;
@@ -87,20 +88,26 @@ public class plasmaSnippetCache {
public class result {
public String line;
+ public String error;
public int source;
- public result(String line, int source) {
+ public result(String line, int source, String error) {
this.line = line;
this.source = source;
+ this.error = error;
}
public String toString() {
return line;
}
}
- public result retrieve(java.net.URL url, boolean fetchOnline, Set queryhashes) {
+ public boolean existsInCache(URL url, Set queryhashes) {
+ return retrieveFromCache(yacySearch.set2string(queryhashes), plasmaURL.urlHash(url)) != null;
+ }
+
+ public result retrieve(URL url, Set queryhashes, boolean fetchOnline) {
if (queryhashes.size() == 0) {
//System.out.println("found no queryhashes for url retrieve " + url);
- return null;
+ return new result(null, SOURCE_ERROR, "no query hashes given");
}
String urlhash = plasmaURL.urlHash(url);
@@ -109,7 +116,7 @@ public class plasmaSnippetCache {
String line = retrieveFromCache(wordhashes, urlhash);
if (line != null) {
//System.out.println("found snippet for url " + url + " in cache: " + line);
- return new result(line, SOURCE_CACHE);
+ return new result(line, SOURCE_CACHE, null);
}
// if the snippet is not in the cache, we can try to get it from the htcache
@@ -123,32 +130,32 @@ public class plasmaSnippetCache {
source = SOURCE_WEB;
}
} catch (IOException e) {
- return null;
+ return new result(null, SOURCE_ERROR, "error loading resource from web: " + e.getMessage());
}
if (resource == null) {
//System.out.println("cannot load document for url " + url);
- return null;
+ return new result(null, SOURCE_ERROR, "error loading resource from web, cacheManager returned NULL");
}
plasmaParserDocument document = parseDocument(url, resource);
- if (document == null) return null; // cannot be parsed
+ if (document == null) return new result(null, SOURCE_ERROR, "parser error/failed"); // cannot be parsed
//System.out.println("loaded document for url " + url);
String[] sentences = document.getSentences();
//System.out.println("----" + url.toString()); for (int l = 0; l < sentences.length; l++) System.out.println(sentences[l]);
if ((sentences == null) || (sentences.length == 0)) {
//System.out.println("found no sentences in url " + url);
- return null;
+ return new result(null, SOURCE_ERROR, "parser returned no sentences");
}
// we have found a parseable non-empty file: use the lines
line = computeSnippet(sentences, queryhashes, 12 * queryhashes.size(), 120);
//System.out.println("loaded snippet for url " + url + ": " + line);
- if (line == null) return null;
+ if (line == null) return new result(null, SOURCE_ERROR, "no matching snippet found");
if (line.length() > 120) line = line.substring(0, 120);
// finally store this snippet in our own cache
storeToCache(wordhashes, urlhash, line);
- return new result(line, source);
+ return new result(line, source, null);
}
public synchronized void storeToCache(String wordhashes, String urlhash, String snippet) {
@@ -184,24 +191,50 @@ public class plasmaSnippetCache {
}
private String computeSnippet(String[] sentences, Set queryhashes, int minLength, int maxLength) {
+ if ((sentences == null) || (sentences.length == 0)) return null;
+ if ((queryhashes == null) || (queryhashes.size() == 0)) return null;
kelondroMScoreCluster hitTable = new kelondroMScoreCluster();
- Iterator i;
+ Iterator j;
HashSet hs;
- for (int j = 0; j < sentences.length; j++) {
- if ((sentences[j].length() > minLength) && (sentences[j].length() < maxLength)) {
- hs = hashSentence(sentences[j]);
- i = queryhashes.iterator();
- while (i.hasNext()) {
- if (hs.contains((String) i.next())) hitTable.incScore(new Integer(j));
+ for (int i = 0; i < sentences.length; i++) {
+ if ((sentences[i].length() > minLength) && (sentences[i].length() < maxLength)) {
+ hs = hashSentence(sentences[i]);
+ j = queryhashes.iterator();
+ while (j.hasNext()) {
+ if (hs.contains((String) j.next())) hitTable.incScore(new Integer(i));
}
}
}
- Integer maxLine = (Integer) hitTable.getMaxObject();
- if (maxLine == null) return null;
- if (hitTable.getScore(maxLine) == 0) return null;
- return sentences[maxLine.intValue()];
+ int score = hitTable.getMaxScore(); // best number of hits
+ if (score <= 0) return null;
+ // we found (a) line(s) that have hits.
+ // now find the shortest line of these hits
+ int shortLineIndex = -1;
+ int shortLineLength = Integer.MAX_VALUE;
+ for (int i = 0; i < sentences.length; i++) {
+ if ((hitTable.getScore(new Integer(i)) == score) &&
+ (sentences[i].length() < shortLineLength)) {
+ shortLineIndex = i;
+ shortLineLength = sentences[i].length();
+ }
+ }
+ // find a first result
+ String result = sentences[shortLineIndex];
+ if (score == queryhashes.size()) return result;
+ // the result has not all words in it.
+ // find another sentence that represents the missing other words
+ // first remove all words that appear in the result from the queryhashes
+ hs = hashSentence(result);
+ j = queryhashes.iterator();
+ while (j.hasNext()) {
+ if (hs.contains((String) j.next())) j.remove();
+ }
+ if (queryhashes.size() == 0) return result;
+ // now find recursively more sentences
+ String nextSnippet = computeSnippet(sentences, queryhashes, minLength, maxLength);
+ return result + ((nextSnippet == null) ? "" : (" ... " + nextSnippet));
}
-
+
private HashSet hashSentence(String sentence) {
HashSet set = new HashSet();
Enumeration words = plasmaCondenser.wordTokenizer(sentence);
@@ -264,5 +297,4 @@ public class plasmaSnippetCache {
log);
}
-
}
diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java
index 634019d34..537fdc58c 100644
--- a/source/de/anomic/plasma/plasmaSwitchboard.java
+++ b/source/de/anomic/plasma/plasmaSwitchboard.java
@@ -1136,12 +1136,13 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
char[] order;
String urlmask;
long time;
- int fetchcount;
- public presearch(Set queryhashes, char[] order, long time /*milliseconds*/, String urlmask, int fetchcount) {
+ int searchcount, fetchcount;
+ public presearch(Set queryhashes, char[] order, long time /*milliseconds*/, String urlmask, int searchcount, int fetchcount) {
this.queryhashes = queryhashes;
this.order = order;
this.urlmask = urlmask;
this.time = time;
+ this.searchcount = searchcount;
this.fetchcount = fetchcount;
}
public void run() {
@@ -1150,26 +1151,12 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
log.logDebug("presearch: started job");
plasmaWordIndexEntity idx = searchManager.searchHashes(queryhashes, time);
log.logDebug("presearch: found " + idx.size() + " results");
- plasmaSearch.result acc = searchManager.order(idx, queryhashes, stopwords, order, time, fetchcount);
+ plasmaSearch.result acc = searchManager.order(idx, queryhashes, stopwords, order, time, searchcount);
if (acc == null) return;
log.logDebug("presearch: ordered results, now " + acc.sizeOrdered() + " URLs ready for fetch");
// take some elements and fetch the snippets
- int i = 0;
- plasmaCrawlLURL.entry urlentry;
- String urlstring;
- plasmaSnippetCache.result snippet;
- while ((acc.hasMoreElements()) && (i < fetchcount)) {
- urlentry = acc.nextElement();
- if (urlentry.url().getHost().endsWith(".yacyh")) continue;
- urlstring = htmlFilterContentScraper.urlNormalform(urlentry.url());
- if (urlstring.matches(urlmask)) { //.* is default
- log.logDebug("presearch: fetching URL " + urlstring);
- snippet = snippetCache.retrieve(urlentry.url(), true, queryhashes);
- if (snippet != null) log.logDebug("found snippet for URL " + urlstring + ": '" + snippet.line + "'");
- i++;
- }
- }
+ fetchSnippets(acc, queryhashes, urlmask, fetchcount);
} catch (IOException e) {
e.printStackTrace();
}
@@ -1177,6 +1164,42 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
}
}
+ public void fetchSnippets(plasmaSearch.result acc, Set queryhashes, String urlmask, int fetchcount) {
+ // fetch the snippets
+ int i = 0;
+ plasmaCrawlLURL.entry urlentry;
+ String urlstring;
+ plasmaSnippetCache.result snippet;
+ while ((acc.hasMoreElements()) && (i < fetchcount)) {
+ urlentry = acc.nextElement();
+ if (urlentry.url().getHost().endsWith(".yacyh")) continue;
+ urlstring = htmlFilterContentScraper.urlNormalform(urlentry.url());
+ if ((urlstring.matches(urlmask)) &&
+ (!(snippetCache.existsInCache(urlentry.url(), queryhashes)))) {
+ new snippetFetcher(urlentry.url(), queryhashes).start();
+ i++;
+ }
+ }
+ }
+
+ public class snippetFetcher extends Thread {
+ URL url;
+ Set queryhashes;
+ public snippetFetcher(URL url, Set queryhashes) {
+ this.url = url;
+ this.queryhashes = queryhashes;
+ }
+ public void run() {
+ log.logDebug("snippetFetcher: try to get URL " + url);
+ plasmaSnippetCache.result snippet = snippetCache.retrieve(url, queryhashes, true);
+ if (snippet.line == null)
+ log.logDebug("snippetFetcher: cannot get URL " + url + ". error: " + snippet.error);
+ else
+ log.logDebug("snippetFetcher: got URL " + url + ", the snippet is '" + snippet.line + "', source=" + snippet.source);
+ }
+ }
+
public serverObjects searchFromLocal(Set querywords, String order1, String order2, int count, boolean global, long time /*milliseconds*/, String urlmask) {
serverObjects prop = new serverObjects();
@@ -1199,11 +1222,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
log.logInfo("INIT WORD SEARCH: " + gs + " - " + count + " links, " + (time / 1000) + " seconds");
long timestamp = System.currentTimeMillis();
+ // start a presearch, which makes only sense if we idle afterwards.
+ // this is especially the case if we start a global search and idle until search
if (global) {
- // start a presearch, which makes only sense if we idle afterwards.
- // this is especially the case if we start a global search and idle until search
- // results appear from other peers
- Thread preselect = new presearch(queryhashes, order, time / 10, urlmask, 5);
+ Thread preselect = new presearch(queryhashes, order, time / 10, urlmask, 10, 3);
preselect.start();
}
@@ -1229,6 +1251,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
if (remainingTime < 500) remainingTime = 500;
if (remainingTime > 3000) remainingTime = 3000;
plasmaSearch.result acc = searchManager.order(idx, queryhashes, stopwords, order, remainingTime, 10);
+ if ((!(global)) && (acc != null)) fetchSnippets(acc.cloneSmart(), queryhashes, urlmask, 10);
log.logDebug("SEARCH TIME AFTER ORDERING OF SEARCH RESULT: " + ((System.currentTimeMillis() - timestamp) / 1000) + " seconds");
// result is a List of urlEntry elements: prepare answer
@@ -1289,8 +1312,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
prop.put("results_" + i + "_urlname", urlname);
prop.put("results_" + i + "_date", dateString(urlentry.moddate()));
prop.put("results_" + i + "_size", Long.toString(urlentry.size()));
- snippet = snippetCache.retrieve(url, false, queryhashes);
- if ((snippet == null) || (snippet.line.length() < 10)) {
+ snippet = snippetCache.retrieve(url, queryhashes, false);
+ if (snippet.line == null) {
prop.put("results_" + i + "_snippet", 0);
prop.put("results_" + i + "_snippet_text", "");
} else {
@@ -1366,8 +1389,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
plasmaSnippetCache.result snippet;
while ((acc.hasMoreElements()) && (i < count)) {
urlentry = acc.nextElement();
- snippet = snippetCache.retrieve(urlentry.url(), false, hashes);
- if ((snippet == null) || (snippet.line.length() < 10)) {
+ snippet = snippetCache.retrieve(urlentry.url(), hashes, false);
+ if (snippet.line == null) {
resource = urlentry.toString();
} else {
resource = urlentry.toString(snippet.line);