diff --git a/htroot/CacheAdmin_p.java b/htroot/CacheAdmin_p.java index 72d6035ad..1b83dc28b 100644 --- a/htroot/CacheAdmin_p.java +++ b/htroot/CacheAdmin_p.java @@ -128,6 +128,10 @@ public class CacheAdmin_p { info += "MEDIA:
" + formatAnchor(document.getMedialinks()) + "
"; info += "EMAIL:
" + formatAnchor(document.getEmaillinks()) + "
"; info += "TEXT:
" + new String(scraper.getText()) + "
"; + info += "LINES:
" + String[] sentences = document.getSentences(); // appended to info below, one array element at a time + for (int i = 0; i < sentences.length; i++) info += sentences[i] + "
"; + info += "

"; } } catch (Exception e) { info += e.toString(); diff --git a/source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java b/source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java index 301bcab8c..17b1a3ffd 100644 --- a/source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java +++ b/source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java @@ -314,7 +314,7 @@ public abstract class htmlFilterAbstractScraper implements htmlFilterScraper { } // string conversions - private static String code_iso8859s(byte c) { + private static String code_iso8859s(int c) { switch ((int) c & 0xff) { // german umlaute and ligaturen @@ -361,7 +361,7 @@ public abstract class htmlFilterAbstractScraper implements htmlFilterScraper { String z; for (int i = 0; i < bb.length(); i++) { b = bb.byteAt(i); - z = code_iso8859s(b); + z = code_iso8859s(b & 0xff); if (z == null) t.append(b); else t.append(z); } return t; diff --git a/source/de/anomic/kelondro/kelondroMSetTools.java b/source/de/anomic/kelondro/kelondroMSetTools.java index 50ce7c04a..ff1bd17b6 100644 --- a/source/de/anomic/kelondro/kelondroMSetTools.java +++ b/source/de/anomic/kelondro/kelondroMSetTools.java @@ -148,7 +148,7 @@ public class kelondroMSetTools { } // now the same for set-set - public static TreeSet joinConstructive(TreeSet set1, TreeSet set2) { + public static TreeSet joinConstructive(TreeSet set1, TreeSet set2) { // comparators must be equal if ((set1 == null) || (set2 == null)) return null; if (set1.comparator() != set2.comparator()) return null; diff --git a/source/de/anomic/plasma/plasmaSearch.java b/source/de/anomic/plasma/plasmaSearch.java index b22a3d79a..da5529f69 100644 --- a/source/de/anomic/plasma/plasmaSearch.java +++ b/source/de/anomic/plasma/plasmaSearch.java @@ -324,12 +324,12 @@ public final class plasmaSearch { public class result /*implements Enumeration*/ { - final TreeMap pageAcc; // key = order hash; value = plasmaLURL.entry - final kelondroMScoreCluster ref; // reference score computation for 
the commonSense heuristic - final Set searchhashes; // hashes that are searched here - final Set stopwords; // words that are excluded from the commonSense heuristic - final char[] order; // order of heuristics - ArrayList results; // this is a buffer for plasmaWordIndexEntry + plasmaCrawlLURL.entry - objects + TreeMap pageAcc; // key = order hash; value = plasmaLURL.entry + kelondroMScoreCluster ref; // reference score computation for the commonSense heuristic + Set searchhashes; // hashes that are searched here + Set stopwords; // words that are excluded from the commonSense heuristic + char[] order; // order of heuristics + ArrayList results; // this is a buffer for plasmaWordIndexEntry + plasmaCrawlLURL.entry - objects public result(Set searchhashes, Set stopwords, char[] order) { this.pageAcc = new TreeMap(); @@ -340,6 +340,15 @@ public final class plasmaSearch { this.results = new ArrayList(); } + public result cloneSmart() { + // clones only the top structure + result theClone = new result(this.searchhashes, this.stopwords, this.order); + theClone.pageAcc = (TreeMap) this.pageAcc.clone(); + theClone.ref = this.ref; + theClone.results = this.results; + return theClone; + } + public int sizeOrdered() { return pageAcc.size(); } diff --git a/source/de/anomic/plasma/plasmaSnippetCache.java b/source/de/anomic/plasma/plasmaSnippetCache.java index 5d824c3bf..a51785167 100644 --- a/source/de/anomic/plasma/plasmaSnippetCache.java +++ b/source/de/anomic/plasma/plasmaSnippetCache.java @@ -59,6 +59,7 @@ public class plasmaSnippetCache { public static final int SOURCE_CACHE = 0; public static final int SOURCE_FILE = 0; public static final int SOURCE_WEB = 0; + public static final int SOURCE_ERROR = -1; // must not be 0: the three source codes above are all 0, an error result has to stay distinguishable by its source field private int snippetsScoreCounter; @@ -87,20 +88,26 @@ public class plasmaSnippetCache { public class result { public String line; + public String error; public int source; - public result(String line, int source) { + public result(String line, int source, String error) { 
this.line = line; this.source = source; + this.error = error; } public String toString() { return line; } } - public result retrieve(java.net.URL url, boolean fetchOnline, Set queryhashes) { + public boolean existsInCache(URL url, Set queryhashes) { + return retrieveFromCache(yacySearch.set2string(queryhashes), plasmaURL.urlHash(url)) != null; + } + + public result retrieve(URL url, Set queryhashes, boolean fetchOnline) { if (queryhashes.size() == 0) { //System.out.println("found no queryhashes for url retrieve " + url); - return null; + return new result(null, SOURCE_ERROR, "no query hashes given"); } String urlhash = plasmaURL.urlHash(url); @@ -109,7 +116,7 @@ public class plasmaSnippetCache { String line = retrieveFromCache(wordhashes, urlhash); if (line != null) { //System.out.println("found snippet for url " + url + " in cache: " + line); - return new result(line, SOURCE_CACHE); + return new result(line, SOURCE_CACHE, null); } // if the snippet is not in the cache, we can try to get it from the htcache @@ -123,32 +130,32 @@ public class plasmaSnippetCache { source = SOURCE_WEB; } } catch (IOException e) { - return null; + return new result(null, SOURCE_ERROR, "error loading resource from web: " + e.getMessage()); } if (resource == null) { //System.out.println("cannot load document for url " + url); - return null; + return new result(null, SOURCE_ERROR, "error loading resource from web, cacheManager returned NULL"); } plasmaParserDocument document = parseDocument(url, resource); - if (document == null) return null; // cannot be parsed + if (document == null) return new result(null, SOURCE_ERROR, "parser error/failed"); // cannot be parsed //System.out.println("loaded document for url " + url); String[] sentences = document.getSentences(); //System.out.println("----" + url.toString()); for (int l = 0; l < sentences.length; l++) System.out.println(sentences[l]); if ((sentences == null) || (sentences.length == 0)) { //System.out.println("found no sentences in url 
" + url); - return null; + return new result(null, SOURCE_ERROR, "parser returned no sentences"); } // we have found a parseable non-empty file: use the lines line = computeSnippet(sentences, queryhashes, 12 * queryhashes.size(), 120); //System.out.println("loaded snippet for url " + url + ": " + line); - if (line == null) return null; + if (line == null) return new result(null, SOURCE_ERROR, "no matching snippet found"); if (line.length() > 120) line = line.substring(0, 120); // finally store this snippet in our own cache storeToCache(wordhashes, urlhash, line); - return new result(line, source); + return new result(line, source, null); } public synchronized void storeToCache(String wordhashes, String urlhash, String snippet) { @@ -184,24 +191,50 @@ public class plasmaSnippetCache { } private String computeSnippet(String[] sentences, Set queryhashes, int minLength, int maxLength) { + if ((sentences == null) || (sentences.length == 0)) return null; + if ((queryhashes == null) || (queryhashes.size() == 0)) return null; kelondroMScoreCluster hitTable = new kelondroMScoreCluster(); - Iterator i; + Iterator j; HashSet hs; - for (int j = 0; j < sentences.length; j++) { - if ((sentences[j].length() > minLength) && (sentences[j].length() < maxLength)) { - hs = hashSentence(sentences[j]); - i = queryhashes.iterator(); - while (i.hasNext()) { - if (hs.contains((String) i.next())) hitTable.incScore(new Integer(j)); + for (int i = 0; i < sentences.length; i++) { + if ((sentences[i].length() > minLength) && (sentences[i].length() < maxLength)) { + hs = hashSentence(sentences[i]); + j = queryhashes.iterator(); + while (j.hasNext()) { + if (hs.contains((String) j.next())) hitTable.incScore(new Integer(i)); } } } - Integer maxLine = (Integer) hitTable.getMaxObject(); - if (maxLine == null) return null; - if (hitTable.getScore(maxLine) == 0) return null; - return sentences[maxLine.intValue()]; + int score = hitTable.getMaxScore(); // best number of hits + if (score <= 0) return 
null; + // we found (a) line(s) that have hits. + // now find the shortest line of these hits + int shortLineIndex = -1; + int shortLineLength = Integer.MAX_VALUE; + for (int i = 0; i < sentences.length; i++) { + if ((hitTable.getScore(new Integer(i)) == score) && + (sentences[i].length() < shortLineLength)) { + shortLineIndex = i; + shortLineLength = sentences[i].length(); + } + } + // find a first result + String result = sentences[shortLineIndex]; + if (score == queryhashes.size()) return result; + // the result has not all words in it. + // find another sentence that represents the missing other words. + // the still-missing words are collected in a private copy: queryhashes is + // the caller's Set, the same Set is used to build the snippet cache key and + // is passed on to further retrieve() calls, so it must not be modified + // here. + HashSet remaininghashes = new HashSet(queryhashes); + remaininghashes.removeAll(hashSentence(result)); + if (remaininghashes.size() == 0) return result; + // now find recursively more sentences + String nextSnippet = computeSnippet(sentences, remaininghashes, minLength, maxLength); + return result + ((nextSnippet == null) ? "" : (" ... 
" + nextSnippet)); } - + private HashSet hashSentence(String sentence) { HashSet set = new HashSet(); Enumeration words = plasmaCondenser.wordTokenizer(sentence); @@ -264,5 +297,4 @@ public class plasmaSnippetCache { log); } - } diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 634019d34..537fdc58c 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -1136,12 +1136,13 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser char[] order; String urlmask; long time; - int fetchcount; - public presearch(Set queryhashes, char[] order, long time /*milliseconds*/, String urlmask, int fetchcount) { + int searchcount, fetchcount; + public presearch(Set queryhashes, char[] order, long time /*milliseconds*/, String urlmask, int searchcount, int fetchcount) { this.queryhashes = queryhashes; this.order = order; this.urlmask = urlmask; this.time = time; + this.searchcount = searchcount; this.fetchcount = fetchcount; } public void run() { @@ -1150,26 +1151,12 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser log.logDebug("presearch: started job"); plasmaWordIndexEntity idx = searchManager.searchHashes(queryhashes, time); log.logDebug("presearch: found " + idx.size() + " results"); - plasmaSearch.result acc = searchManager.order(idx, queryhashes, stopwords, order, time, fetchcount); + plasmaSearch.result acc = searchManager.order(idx, queryhashes, stopwords, order, time, searchcount); if (acc == null) return; log.logDebug("presearch: ordered results, now " + acc.sizeOrdered() + " URLs ready for fetch"); // take some elements and fetch the snippets - int i = 0; - plasmaCrawlLURL.entry urlentry; - String urlstring; - plasmaSnippetCache.result snippet; - while ((acc.hasMoreElements()) && (i < fetchcount)) { - urlentry = acc.nextElement(); - if (urlentry.url().getHost().endsWith(".yacyh")) 
continue; - urlstring = htmlFilterContentScraper.urlNormalform(urlentry.url()); - if (urlstring.matches(urlmask)) { //.* is default - log.logDebug("presearch: fetching URL " + urlstring); - snippet = snippetCache.retrieve(urlentry.url(), true, queryhashes); - if (snippet != null) log.logDebug("found snippet for URL " + urlstring + ": '" + snippet.line + "'"); - i++; - } - } + fetchSnippets(acc, queryhashes, urlmask, fetchcount); } catch (IOException e) { e.printStackTrace(); } @@ -1177,6 +1164,42 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser } } + public void fetchSnippets(plasmaSearch.result acc, Set queryhashes, String urlmask, int fetchcount) { + // start a snippetFetcher thread for every result URL that matches urlmask and + // has no snippet in the cache yet; at most fetchcount threads are started + int i = 0; + plasmaCrawlLURL.entry urlentry; + String urlstring; + while ((acc.hasMoreElements()) && (i < fetchcount)) { + urlentry = acc.nextElement(); + if (urlentry.url().getHost().endsWith(".yacyh")) continue; + urlstring = htmlFilterContentScraper.urlNormalform(urlentry.url()); + if ((urlstring.matches(urlmask)) && + (!(snippetCache.existsInCache(urlentry.url(), queryhashes)))) { + new snippetFetcher(urlentry.url(), queryhashes).start(); + i++; + } + } + } + + public class snippetFetcher extends Thread { + URL url; + Set queryhashes; + public snippetFetcher(URL url, Set queryhashes) { // no early return here: run() reads both fields, they must always be set + this.url = url; + this.queryhashes = queryhashes; + } + public void run() { + if (url.getHost().endsWith(".yacyh")) return; // such hosts are skipped, exactly as in fetchSnippets + log.logDebug("snippetFetcher: try to get URL " + url); + plasmaSnippetCache.result snippet = snippetCache.retrieve(url, queryhashes, true); + if (snippet.line == null) + log.logDebug("snippetFetcher: cannot get URL " + url + ". 
error: " + snippet.error); + else + log.logDebug("snippetFetcher: got URL " + url + ", the snippet is '" + snippet.line + "', source=" + snippet.source); + } + } + public serverObjects searchFromLocal(Set querywords, String order1, String order2, int count, boolean global, long time /*milliseconds*/, String urlmask) { serverObjects prop = new serverObjects(); @@ -1199,11 +1222,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser log.logInfo("INIT WORD SEARCH: " + gs + " - " + count + " links, " + (time / 1000) + " seconds"); long timestamp = System.currentTimeMillis(); + // start a presearch, which makes only sense if we idle afterwards. + // this is especially the case if we start a global search and idle until search if (global) { - // start a presearch, which makes only sense if we idle afterwards. - // this is especially the case if we start a global search and idle until search - // results appear from other peers - Thread preselect = new presearch(queryhashes, order, time / 10, urlmask, 5); + Thread preselect = new presearch(queryhashes, order, time / 10, urlmask, 10, 3); preselect.start(); } @@ -1229,6 +1251,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser if (remainingTime < 500) remainingTime = 500; if (remainingTime > 3000) remainingTime = 3000; plasmaSearch.result acc = searchManager.order(idx, queryhashes, stopwords, order, remainingTime, 10); + if (!(global)) fetchSnippets(acc.cloneSmart(), queryhashes, urlmask, 10); log.logDebug("SEARCH TIME AFTER ORDERING OF SEARCH RESULT: " + ((System.currentTimeMillis() - timestamp) / 1000) + " seconds"); // result is a List of urlEntry elements: prepare answer @@ -1289,8 +1312,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser prop.put("results_" + i + "_urlname", urlname); prop.put("results_" + i + "_date", dateString(urlentry.moddate())); prop.put("results_" + i + "_size", Long.toString(urlentry.size())); - 
snippet = snippetCache.retrieve(url, false, queryhashes); - if ((snippet == null) || (snippet.line.length() < 10)) { + snippet = snippetCache.retrieve(url, queryhashes, false); + if (snippet.line == null) { prop.put("results_" + i + "_snippet", 0); prop.put("results_" + i + "_snippet_text", ""); } else { @@ -1366,8 +1389,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser plasmaSnippetCache.result snippet; while ((acc.hasMoreElements()) && (i < count)) { urlentry = acc.nextElement(); - snippet = snippetCache.retrieve(urlentry.url(), false, hashes); - if ((snippet == null) || (snippet.line.length() < 10)) { + snippet = snippetCache.retrieve(urlentry.url(), hashes, false); + if (snippet.line == null) { resource = urlentry.toString(); } else { resource = urlentry.toString(snippet.line);