From d6c85228a669b816f3c965e269798f4e433aa02c Mon Sep 17 00:00:00 2001 From: orbiter Date: Thu, 23 Jun 2005 12:12:12 +0000 Subject: [PATCH] enhanced snippet computation git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@319 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- build.properties | 2 +- .../de/anomic/plasma/plasmaSnippetCache.java | 202 ++++++++++-------- .../de/anomic/plasma/plasmaSwitchboard.java | 21 +- .../anomic/plasma/plasmaWordIndexCache.java | 2 +- source/de/anomic/yacy/yacyClient.java | 2 +- 5 files changed, 128 insertions(+), 101 deletions(-) diff --git a/build.properties b/build.properties index 1d0c85525..f0152f69e 100644 --- a/build.properties +++ b/build.properties @@ -3,7 +3,7 @@ javacSource=1.4 javacTarget=1.4 # Release Configuration -releaseVersion=0.383 +releaseVersion=0.384 releaseFile=yacy_dev_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz #releaseFile=yacy_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz releaseDir=yacy_dev_v${releaseVersion}_${DSTAMP}_${releaseNr} diff --git a/source/de/anomic/plasma/plasmaSnippetCache.java b/source/de/anomic/plasma/plasmaSnippetCache.java index 708927b3f..5d824c3bf 100644 --- a/source/de/anomic/plasma/plasmaSnippetCache.java +++ b/source/de/anomic/plasma/plasmaSnippetCache.java @@ -56,6 +56,10 @@ import de.anomic.yacy.yacySearch; public class plasmaSnippetCache { private static final int maxCache = 500; + public static final int SOURCE_CACHE = 0; + public static final int SOURCE_FILE = 0; + public static final int SOURCE_WEB = 0; + private int snippetsScoreCounter; private kelondroMScoreCluster snippetsScore; @@ -81,8 +85,73 @@ public class plasmaSnippetCache { this.snippetsCache = new HashMap(); } + public class result { + public String line; + public int source; + public result(String line, int source) { + this.line = line; + this.source = source; + } + public String toString() { + return line; + } + } - public synchronized void store(String wordhashes, String urlhash, String snippet) { + public result retrieve(java.net.URL url, boolean fetchOnline, Set queryhashes) { + if (queryhashes.size() == 0) { + //System.out.println("found no queryhashes for url retrieve " + url); + return null; + } + String urlhash = plasmaURL.urlHash(url); + + // try to get snippet from snippetCache + String wordhashes = yacySearch.set2string(queryhashes); + String line = retrieveFromCache(wordhashes, urlhash); + if (line != null) { + //System.out.println("found snippet for url " + url + " in cache: " + line); + return new result(line, SOURCE_CACHE); + } + + // if the snippet is not in the cache, we can try to get it from the htcache + byte[] resource = null; + int source = SOURCE_CACHE; + try { + resource = cacheManager.loadResource(url); + if ((fetchOnline) && (resource == null)) { + loadResourceFromWeb(url, 5000); + resource = cacheManager.loadResource(url); + source = SOURCE_WEB; + } + } catch (IOException e) { + return null; + } + if (resource == null) { + //System.out.println("cannot load document for url " + url); + return null; + } + plasmaParserDocument document = parseDocument(url, resource); + + if (document == null) return null; // cannot be parsed + //System.out.println("loaded document for url " + url); + String[] sentences = document.getSentences(); + //System.out.println("----" + url.toString()); for (int l = 0; l < sentences.length; l++) System.out.println(sentences[l]); + if ((sentences == null) || (sentences.length == 0)) { + //System.out.println("found no sentences in url " + url); + return null; + } + + // we have found a parseable non-empty file: use the lines + line = computeSnippet(sentences, queryhashes, 12 * queryhashes.size(), 120); + //System.out.println("loaded snippet for url " + url + ": " + line); + if (line == null) return null; + if (line.length() > 120) line = line.substring(0, 120); + + // finally store this snippet in our own cache + storeToCache(wordhashes, urlhash, line); + return new result(line, source); + } + + public synchronized void storeToCache(String wordhashes, String urlhash, String snippet) { // generate key String key = urlhash + wordhashes; @@ -108,83 +177,64 @@ public class plasmaSnippetCache { } } - private String retrieve(String wordhashes, String urlhash) { + private String retrieveFromCache(String wordhashes, String urlhash) { // generate key String key = urlhash + wordhashes; return (String) snippetsCache.get(key); } - public String retrieve(java.net.URL url, boolean fetchOnline, Set queryhashes) { - if (queryhashes.size() == 0) { - //System.out.println("found no queryhashes for url retrieve " + url); - return null; - } - String urlhash = plasmaURL.urlHash(url); - - // try to get snippet from snippetCache - String wordhashes = yacySearch.set2string(queryhashes); - String snippet = retrieve(wordhashes, urlhash); - if (snippet != null) { - //System.out.println("found snippet for url " + url + " in cache: " + snippet); - return snippet; - } - - // if the snippet is not in the cache, we can try to get it from the htcache - plasmaParserDocument document = getDocument(url, fetchOnline); - if (document == null) { - //System.out.println("cannot load document for url " + url); - return null; - } - //System.out.println("loaded document for url " + url); - String[] sentences = document.getSentences(); - //System.out.println("----" + url.toString()); for (int l = 0; l < sentences.length; l++) System.out.println(sentences[l]); - if ((sentences == null) || (sentences.length == 0)) { - //System.out.println("found no sentences in url " + url); - return null; - } - - // we have found a parseable non-empty file: use the lines - TreeMap sentencematrix = hashMatrix(sentences); - Iterator i = queryhashes.iterator(); - String hash; + private String computeSnippet(String[] sentences, Set queryhashes, int minLength, int maxLength) { kelondroMScoreCluster hitTable = new kelondroMScoreCluster(); - Iterator j; - Integer sentencenumber; - Map.Entry entry; - while (i.hasNext()) { - hash = (String) i.next(); - j = sentencematrix.entrySet().iterator(); - while (j.hasNext()) { - entry = (Map.Entry) j.next(); - sentencenumber = (Integer) entry.getKey(); - if (((HashSet) entry.getValue()).contains(hash)) hitTable.addScore(sentencenumber, sentences[sentencenumber.intValue()].length()); + Iterator i; + HashSet hs; + for (int j = 0; j < sentences.length; j++) { + if ((sentences[j].length() > minLength) && (sentences[j].length() < maxLength)) { + hs = hashSentence(sentences[j]); + i = queryhashes.iterator(); + while (i.hasNext()) { + if (hs.contains((String) i.next())) hitTable.incScore(new Integer(j)); + } } } Integer maxLine = (Integer) hitTable.getMaxObject(); if (maxLine == null) return null; - snippet = sentences[maxLine.intValue()]; - //System.out.println("loaded snippet for url " + url + ": " + snippet); - if (snippet.length() > 120) snippet = snippet.substring(0, 120); - - // finally store this snippet in our own cache - store(wordhashes, urlhash, snippet); - return snippet; + if (hitTable.getScore(maxLine) == 0) return null; + return sentences[maxLine.intValue()]; + } + + private HashSet hashSentence(String sentence) { + HashSet set = new HashSet(); + Enumeration words = plasmaCondenser.wordTokenizer(sentence); + while (words.hasMoreElements()) set.add(plasmaWordIndexEntry.word2hash((String) words.nextElement())); + return set; } + + public plasmaParserDocument parseDocument(URL url, byte[] resource) { + if (resource == null) return null; + httpHeader header = null; + try { + header = cacheManager.getCachedResponse(plasmaURL.urlHash(url)); + } catch (IOException e) {} - private TreeMap hashMatrix(String[] sentences) { - TreeMap map = new TreeMap(); - HashSet set; - Enumeration words; - for (int i = 0; i < sentences.length; i++) { - set = new HashSet(); - words = plasmaCondenser.wordTokenizer(sentences[i]); - while (words.hasMoreElements()) set.add(plasmaWordIndexEntry.word2hash((String) words.nextElement())); - map.put(new Integer(i), set); + if (header == null) { + String filename = url.getFile(); + int p = filename.lastIndexOf('.'); + if ((p < 0) || + ((p >= 0) && (plasmaParser.supportedFileExtContains(filename.substring(p + 1))))) { + return parser.parseSource(url, "text/html", resource); + } else { + return null; + } + } else { + if (plasmaParser.supportedMimeTypesContains(header.mime())) { + return parser.parseSource(url, header.mime(), resource); + } else { + return null; + } } - return map; } - private byte[] getResource(URL url, boolean fetchOnline) { + public byte[] getResource(URL url, boolean fetchOnline) { // load the url as resource from the web try { //return httpc.singleGET(url, 5000, null, null, remoteProxyHost, remoteProxyPort); @@ -214,29 +264,5 @@ public class plasmaSnippetCache { log); } - public plasmaParserDocument getDocument(URL url, boolean fetchOnline) { - byte[] resource = getResource(url, fetchOnline); - if (resource == null) return null; - httpHeader header = null; - try { - header = cacheManager.getCachedResponse(plasmaURL.urlHash(url)); - } catch (IOException e) {} - - if (header == null) { - String filename = url.getFile(); - int p = filename.lastIndexOf('.'); - if ((p < 0) || - ((p >= 0) && (plasmaParser.supportedFileExtContains(filename.substring(p + 1))))) { - return parser.parseSource(url, "text/html", resource); - } else { - return null; - } - } else { - if (plasmaParser.supportedMimeTypesContains(header.mime())) { - return parser.parseSource(url, header.mime(), resource); - } else { - return null; - } - } - } + } diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 213ba6097..a533d7766 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -585,7 +585,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser } public boolean coreCrawlJob() { - System.gc(); // debug if (urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) == 0) { //log.logDebug("CoreCrawl: queue is empty"); return false; @@ -1158,7 +1157,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser // take some elements and fetch the snippets int i = 0; plasmaCrawlLURL.entry urlentry; - String urlstring, snippet; + String urlstring; + plasmaSnippetCache.result snippet; while ((acc.hasMoreElements()) && (i < fetchcount)) { urlentry = acc.nextElement(); if (urlentry.url().getHost().endsWith(".yacyh")) continue; @@ -1166,7 +1166,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser if (urlstring.matches(urlmask)) { //.* is default log.logDebug("presearch: fetching URL " + urlstring); snippet = snippetCache.retrieve(urlentry.url(), true, queryhashes); - if (snippet != null) log.logDebug("found snippet for URL " + urlstring + ": '" + snippet + "'"); + if (snippet != null) log.logDebug("found snippet for URL " + urlstring + ": '" + snippet.line + "'"); i++; } } @@ -1237,8 +1237,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser URL url; plasmaCrawlLURL.entry urlentry; String urlstring, urlname, filename; - String host, hash, address, snippet, descr = ""; + String host, hash, address, descr = ""; yacySeed seed; + plasmaSnippetCache.result snippet; //kelondroMScoreCluster ref = new kelondroMScoreCluster(); while ((acc.hasMoreElements()) && (i < count)) { urlentry = acc.nextElement(); @@ -1284,12 +1285,12 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser prop.put("results_" + i + "_date", dateString(urlentry.moddate())); prop.put("results_" + i + "_size", Long.toString(urlentry.size())); snippet = snippetCache.retrieve(url, false, queryhashes); - if ((snippet == null) || (snippet.length() < 10)) { + if ((snippet == null) || (snippet.line.length() < 10)) { prop.put("results_" + i + "_snippet", 0); prop.put("results_" + i + "_snippet_text", ""); } else { prop.put("results_" + i + "_snippet", 1); - prop.put("results_" + i + "_snippet_text", snippet); + prop.put("results_" + i + "_snippet_text", snippet.line); } i++; } @@ -1357,14 +1358,14 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser String resource = ""; //plasmaIndexEntry pie; plasmaCrawlLURL.entry urlentry; - String snippet; + plasmaSnippetCache.result snippet; while ((acc.hasMoreElements()) && (i < count)) { urlentry = acc.nextElement(); snippet = snippetCache.retrieve(urlentry.url(), false, hashes); - if ((snippet == null) || (snippet.length() < 10)) { + if ((snippet == null) || (snippet.line.length() < 10)) { resource = urlentry.toString(); } else { - resource = urlentry.toString(snippet); + resource = urlentry.toString(snippet.line); } if (resource != null) { links.append("resource").append(i).append("=").append(resource).append(serverCore.crlfString); @@ -1433,7 +1434,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser if (url == null) return 0; // get set of words //Set words = plasmaCondenser.getWords(getText(getResource(url, fetchOnline))); - Set words = plasmaCondenser.getWords(snippetCache.getDocument(url, fetchOnline).getText()); + Set words = plasmaCondenser.getWords(snippetCache.parseDocument(url, snippetCache.getResource(url, fetchOnline)).getText()); // delete all word references int count = removeReferences(urlhash, words); // finally delete the url entry itself diff --git a/source/de/anomic/plasma/plasmaWordIndexCache.java b/source/de/anomic/plasma/plasmaWordIndexCache.java index e68b01cdf..2a655d241 100644 --- a/source/de/anomic/plasma/plasmaWordIndexCache.java +++ b/source/de/anomic/plasma/plasmaWordIndexCache.java @@ -209,7 +209,7 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface { addEntry(wordHash, wordEntry, creationTime); urlCount++; // protect against memory shortage - while (rt.freeMemory() < 1000000) {flushFromMem(); System.gc();} + while (rt.freeMemory() < 1000000) flushFromMem(); // write a log if (System.currentTimeMillis() > messageTime) { System.gc(); // for better statistic diff --git a/source/de/anomic/yacy/yacyClient.java b/source/de/anomic/yacy/yacyClient.java index 703a2d22c..fe098a4fb 100644 --- a/source/de/anomic/yacy/yacyClient.java +++ b/source/de/anomic/yacy/yacyClient.java @@ -329,7 +329,7 @@ public class yacyClient { // we don't store the snippets along the url entry, because they are search-specific. // instead, they are placed in a snipped-search cache. //System.out.println("--- RECEIVED SNIPPET '" + link.snippet() + "'"); - snippets.store(wordhashes, link.hash(), link.snippet()); + snippets.storeToCache(wordhashes, link.hash(), link.snippet()); } // add the url entry to the word indexes for (int m = 0; m < words; m++) {