enhanced snippet computation

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@319 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 20 years ago
parent d53b2393e5
commit d6c85228a6

@ -3,7 +3,7 @@ javacSource=1.4
javacTarget=1.4
# Release Configuration
releaseVersion=0.383
releaseVersion=0.384
releaseFile=yacy_dev_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
#releaseFile=yacy_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
releaseDir=yacy_dev_v${releaseVersion}_${DSTAMP}_${releaseNr}

@ -56,6 +56,10 @@ import de.anomic.yacy.yacySearch;
public class plasmaSnippetCache {
private static final int maxCache = 500;
public static final int SOURCE_CACHE = 0;
public static final int SOURCE_FILE = 0;
public static final int SOURCE_WEB = 0;
private int snippetsScoreCounter;
private kelondroMScoreCluster snippetsScore;
@ -81,8 +85,73 @@ public class plasmaSnippetCache {
this.snippetsCache = new HashMap();
}
public class result {
    // the snippet text that is shown to the user
    public String line;
    // where the snippet came from: SOURCE_CACHE, SOURCE_FILE or SOURCE_WEB
    public int source;

    public result(String snippetLine, int snippetSource) {
        line = snippetLine;
        source = snippetSource;
    }

    // the textual representation of a result is just its snippet line
    public String toString() {
        return line;
    }
}
public synchronized void store(String wordhashes, String urlhash, String snippet) {
public result retrieve(java.net.URL url, boolean fetchOnline, Set queryhashes) {
if (queryhashes.size() == 0) {
//System.out.println("found no queryhashes for url retrieve " + url);
return null;
}
String urlhash = plasmaURL.urlHash(url);
// try to get snippet from snippetCache
String wordhashes = yacySearch.set2string(queryhashes);
String line = retrieveFromCache(wordhashes, urlhash);
if (line != null) {
//System.out.println("found snippet for url " + url + " in cache: " + line);
return new result(line, SOURCE_CACHE);
}
// if the snippet is not in the cache, we can try to get it from the htcache
byte[] resource = null;
int source = SOURCE_CACHE;
try {
resource = cacheManager.loadResource(url);
if ((fetchOnline) && (resource == null)) {
loadResourceFromWeb(url, 5000);
resource = cacheManager.loadResource(url);
source = SOURCE_WEB;
}
} catch (IOException e) {
return null;
}
if (resource == null) {
//System.out.println("cannot load document for url " + url);
return null;
}
plasmaParserDocument document = parseDocument(url, resource);
if (document == null) return null; // cannot be parsed
//System.out.println("loaded document for url " + url);
String[] sentences = document.getSentences();
//System.out.println("----" + url.toString()); for (int l = 0; l < sentences.length; l++) System.out.println(sentences[l]);
if ((sentences == null) || (sentences.length == 0)) {
//System.out.println("found no sentences in url " + url);
return null;
}
// we have found a parseable non-empty file: use the lines
line = computeSnippet(sentences, queryhashes, 12 * queryhashes.size(), 120);
//System.out.println("loaded snippet for url " + url + ": " + line);
if (line == null) return null;
if (line.length() > 120) line = line.substring(0, 120);
// finally store this snippet in our own cache
storeToCache(wordhashes, urlhash, line);
return new result(line, source);
}
public synchronized void storeToCache(String wordhashes, String urlhash, String snippet) {
// generate key
String key = urlhash + wordhashes;
@ -108,83 +177,64 @@ public class plasmaSnippetCache {
}
}
private String retrieve(String wordhashes, String urlhash) {
private String retrieveFromCache(String wordhashes, String urlhash) {
// generate key
String key = urlhash + wordhashes;
return (String) snippetsCache.get(key);
}
public String retrieve(java.net.URL url, boolean fetchOnline, Set queryhashes) {
if (queryhashes.size() == 0) {
//System.out.println("found no queryhashes for url retrieve " + url);
return null;
}
String urlhash = plasmaURL.urlHash(url);
// try to get snippet from snippetCache
String wordhashes = yacySearch.set2string(queryhashes);
String snippet = retrieve(wordhashes, urlhash);
if (snippet != null) {
//System.out.println("found snippet for url " + url + " in cache: " + snippet);
return snippet;
}
// if the snippet is not in the cache, we can try to get it from the htcache
plasmaParserDocument document = getDocument(url, fetchOnline);
if (document == null) {
//System.out.println("cannot load document for url " + url);
return null;
}
//System.out.println("loaded document for url " + url);
String[] sentences = document.getSentences();
//System.out.println("----" + url.toString()); for (int l = 0; l < sentences.length; l++) System.out.println(sentences[l]);
if ((sentences == null) || (sentences.length == 0)) {
//System.out.println("found no sentences in url " + url);
return null;
}
// we have found a parseable non-empty file: use the lines
TreeMap sentencematrix = hashMatrix(sentences);
Iterator i = queryhashes.iterator();
String hash;
private String computeSnippet(String[] sentences, Set queryhashes, int minLength, int maxLength) {
kelondroMScoreCluster hitTable = new kelondroMScoreCluster();
Iterator j;
Integer sentencenumber;
Map.Entry entry;
while (i.hasNext()) {
hash = (String) i.next();
j = sentencematrix.entrySet().iterator();
while (j.hasNext()) {
entry = (Map.Entry) j.next();
sentencenumber = (Integer) entry.getKey();
if (((HashSet) entry.getValue()).contains(hash)) hitTable.addScore(sentencenumber, sentences[sentencenumber.intValue()].length());
Iterator i;
HashSet hs;
for (int j = 0; j < sentences.length; j++) {
if ((sentences[j].length() > minLength) && (sentences[j].length() < maxLength)) {
hs = hashSentence(sentences[j]);
i = queryhashes.iterator();
while (i.hasNext()) {
if (hs.contains((String) i.next())) hitTable.incScore(new Integer(j));
}
}
}
Integer maxLine = (Integer) hitTable.getMaxObject();
if (maxLine == null) return null;
snippet = sentences[maxLine.intValue()];
//System.out.println("loaded snippet for url " + url + ": " + snippet);
if (snippet.length() > 120) snippet = snippet.substring(0, 120);
// finally store this snippet in our own cache
store(wordhashes, urlhash, snippet);
return snippet;
if (hitTable.getScore(maxLine) == 0) return null;
return sentences[maxLine.intValue()];
}
private HashSet hashSentence(String sentence) {
HashSet set = new HashSet();
Enumeration words = plasmaCondenser.wordTokenizer(sentence);
while (words.hasMoreElements()) set.add(plasmaWordIndexEntry.word2hash((String) words.nextElement()));
return set;
}
public plasmaParserDocument parseDocument(URL url, byte[] resource) {
if (resource == null) return null;
httpHeader header = null;
try {
header = cacheManager.getCachedResponse(plasmaURL.urlHash(url));
} catch (IOException e) {}
private TreeMap hashMatrix(String[] sentences) {
TreeMap map = new TreeMap();
HashSet set;
Enumeration words;
for (int i = 0; i < sentences.length; i++) {
set = new HashSet();
words = plasmaCondenser.wordTokenizer(sentences[i]);
while (words.hasMoreElements()) set.add(plasmaWordIndexEntry.word2hash((String) words.nextElement()));
map.put(new Integer(i), set);
if (header == null) {
String filename = url.getFile();
int p = filename.lastIndexOf('.');
if ((p < 0) ||
((p >= 0) && (plasmaParser.supportedFileExtContains(filename.substring(p + 1))))) {
return parser.parseSource(url, "text/html", resource);
} else {
return null;
}
} else {
if (plasmaParser.supportedMimeTypesContains(header.mime())) {
return parser.parseSource(url, header.mime(), resource);
} else {
return null;
}
}
return map;
}
private byte[] getResource(URL url, boolean fetchOnline) {
public byte[] getResource(URL url, boolean fetchOnline) {
// load the url as resource from the web
try {
//return httpc.singleGET(url, 5000, null, null, remoteProxyHost, remoteProxyPort);
@ -214,29 +264,5 @@ public class plasmaSnippetCache {
log);
}
public plasmaParserDocument getDocument(URL url, boolean fetchOnline) {
byte[] resource = getResource(url, fetchOnline);
if (resource == null) return null;
httpHeader header = null;
try {
header = cacheManager.getCachedResponse(plasmaURL.urlHash(url));
} catch (IOException e) {}
if (header == null) {
String filename = url.getFile();
int p = filename.lastIndexOf('.');
if ((p < 0) ||
((p >= 0) && (plasmaParser.supportedFileExtContains(filename.substring(p + 1))))) {
return parser.parseSource(url, "text/html", resource);
} else {
return null;
}
} else {
if (plasmaParser.supportedMimeTypesContains(header.mime())) {
return parser.parseSource(url, header.mime(), resource);
} else {
return null;
}
}
}
}

@ -585,7 +585,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
}
public boolean coreCrawlJob() {
System.gc(); // debug
if (urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) == 0) {
//log.logDebug("CoreCrawl: queue is empty");
return false;
@ -1158,7 +1157,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// take some elements and fetch the snippets
int i = 0;
plasmaCrawlLURL.entry urlentry;
String urlstring, snippet;
String urlstring;
plasmaSnippetCache.result snippet;
while ((acc.hasMoreElements()) && (i < fetchcount)) {
urlentry = acc.nextElement();
if (urlentry.url().getHost().endsWith(".yacyh")) continue;
@ -1166,7 +1166,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
if (urlstring.matches(urlmask)) { //.* is default
log.logDebug("presearch: fetching URL " + urlstring);
snippet = snippetCache.retrieve(urlentry.url(), true, queryhashes);
if (snippet != null) log.logDebug("found snippet for URL " + urlstring + ": '" + snippet + "'");
if (snippet != null) log.logDebug("found snippet for URL " + urlstring + ": '" + snippet.line + "'");
i++;
}
}
@ -1237,8 +1237,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
URL url;
plasmaCrawlLURL.entry urlentry;
String urlstring, urlname, filename;
String host, hash, address, snippet, descr = "";
String host, hash, address, descr = "";
yacySeed seed;
plasmaSnippetCache.result snippet;
//kelondroMScoreCluster ref = new kelondroMScoreCluster();
while ((acc.hasMoreElements()) && (i < count)) {
urlentry = acc.nextElement();
@ -1284,12 +1285,12 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
prop.put("results_" + i + "_date", dateString(urlentry.moddate()));
prop.put("results_" + i + "_size", Long.toString(urlentry.size()));
snippet = snippetCache.retrieve(url, false, queryhashes);
if ((snippet == null) || (snippet.length() < 10)) {
if ((snippet == null) || (snippet.line.length() < 10)) {
prop.put("results_" + i + "_snippet", 0);
prop.put("results_" + i + "_snippet_text", "");
} else {
prop.put("results_" + i + "_snippet", 1);
prop.put("results_" + i + "_snippet_text", snippet);
prop.put("results_" + i + "_snippet_text", snippet.line);
}
i++;
}
@ -1357,14 +1358,14 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
String resource = "";
//plasmaIndexEntry pie;
plasmaCrawlLURL.entry urlentry;
String snippet;
plasmaSnippetCache.result snippet;
while ((acc.hasMoreElements()) && (i < count)) {
urlentry = acc.nextElement();
snippet = snippetCache.retrieve(urlentry.url(), false, hashes);
if ((snippet == null) || (snippet.length() < 10)) {
if ((snippet == null) || (snippet.line.length() < 10)) {
resource = urlentry.toString();
} else {
resource = urlentry.toString(snippet);
resource = urlentry.toString(snippet.line);
}
if (resource != null) {
links.append("resource").append(i).append("=").append(resource).append(serverCore.crlfString);
@ -1433,7 +1434,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
if (url == null) return 0;
// get set of words
//Set words = plasmaCondenser.getWords(getText(getResource(url, fetchOnline)));
Set words = plasmaCondenser.getWords(snippetCache.getDocument(url, fetchOnline).getText());
Set words = plasmaCondenser.getWords(snippetCache.parseDocument(url, snippetCache.getResource(url, fetchOnline)).getText());
// delete all word references
int count = removeReferences(urlhash, words);
// finally delete the url entry itself

@ -209,7 +209,7 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface {
addEntry(wordHash, wordEntry, creationTime);
urlCount++;
// protect against memory shortage
while (rt.freeMemory() < 1000000) {flushFromMem(); System.gc();}
while (rt.freeMemory() < 1000000) flushFromMem();
// write a log
if (System.currentTimeMillis() > messageTime) {
System.gc(); // for better statistic

@ -329,7 +329,7 @@ public class yacyClient {
// we don't store the snippets along the url entry, because they are search-specific.
// instead, they are placed in a snipped-search cache.
//System.out.println("--- RECEIVED SNIPPET '" + link.snippet() + "'");
snippets.store(wordhashes, link.hash(), link.snippet());
snippets.storeToCache(wordhashes, link.hash(), link.snippet());
}
// add the url entry to the word indexes
for (int m = 0; m < words; m++) {

Loading…
Cancel
Save