git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@218 6c8d7289-2bf4-0310-a012-ef5d649a1542pull/1/head
parent
9244b6ad6f
commit
ca3b4ccaf4
@ -0,0 +1,234 @@
|
||||
// plasmaSnippetCache.java
|
||||
// -----------------------
|
||||
// part of YaCy
|
||||
// (C) by Michael Peter Christen; mc@anomic.de
|
||||
// first published on http://www.anomic.de
|
||||
// Frankfurt, Germany, 2005
|
||||
// last major change: 07.06.2005
|
||||
//
|
||||
// This program is free software; you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation; either version 2 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with this program; if not, write to the Free Software
|
||||
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
//
|
||||
// Using this software in any meaning (reading, learning, copying, compiling,
|
||||
// running) means that you agree that the Author(s) is (are) not responsible
|
||||
// for cost, loss of data or any harm that may be caused directly or indirectly
|
||||
// by usage of this softare or this documentation. The usage of this software
|
||||
// is on your own risk. The installation and usage (starting/running) of this
|
||||
// software may allow other people or application to access your computer and
|
||||
// any attached devices and is highly dependent on the configuration of the
|
||||
// software which must be done by the user of the software; the author(s) is
|
||||
// (are) also not responsible for proper configuration and usage of the
|
||||
// software, even if provoked by documentation provided together with
|
||||
// the software.
|
||||
//
|
||||
// Any changes to this file according to the GPL as documented in the file
|
||||
// gpl.txt aside this file in the shipment you received can be done to the
|
||||
// lines that follows this copyright notice here, but changes must not be
|
||||
// done inside the copyright notive above. A re-distribution must contain
|
||||
// the intact and unchanged copyright notice.
|
||||
// Contributions and changes to the program code must be marked as such.
|
||||
|
||||
|
||||
package de.anomic.plasma;
|
||||
|
||||
import java.util.*;
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.net.URL;
|
||||
import de.anomic.htmlFilter.htmlFilterContentScraper;
|
||||
import de.anomic.kelondro.kelondroMScoreCluster;
|
||||
import de.anomic.server.serverFileUtils;
|
||||
import de.anomic.server.serverLog;
|
||||
import de.anomic.http.httpHeader;
|
||||
import de.anomic.yacy.yacySearch;
|
||||
|
||||
public class plasmaSnippetCache {
|
||||
|
||||
private static final int maxCache = 500;
|
||||
|
||||
private int snippetsScoreCounter;
|
||||
private kelondroMScoreCluster snippetsScore;
|
||||
private HashMap snippetsCache;
|
||||
private plasmaHTCache cacheManager;
|
||||
private plasmaParser parser;
|
||||
private serverLog log;
|
||||
private String remoteProxyHost;
|
||||
private int remoteProxyPort;
|
||||
private boolean remoteProxyUse;
|
||||
|
||||
public plasmaSnippetCache(plasmaHTCache cacheManager, plasmaParser parser,
|
||||
String remoteProxyHost, int remoteProxyPort, boolean remoteProxyUse,
|
||||
serverLog log) {
|
||||
this.cacheManager = cacheManager;
|
||||
this.parser = parser;
|
||||
this.log = log;
|
||||
this.remoteProxyHost = remoteProxyHost;
|
||||
this.remoteProxyPort = remoteProxyPort;
|
||||
this.remoteProxyUse = remoteProxyUse;
|
||||
this.snippetsScoreCounter = 0;
|
||||
this.snippetsScore = new kelondroMScoreCluster();
|
||||
this.snippetsCache = new HashMap();
|
||||
}
|
||||
|
||||
|
||||
public synchronized void store(String wordhashes, String urlhash, String snippet) {
|
||||
// generate key
|
||||
String key = urlhash + wordhashes;
|
||||
|
||||
// do nothing if snippet is known
|
||||
if (snippetsCache.containsKey(key)) return;
|
||||
|
||||
// learn new snippet
|
||||
snippetsScore.addScore(key, snippetsScoreCounter++);
|
||||
snippetsCache.put(key, snippet);
|
||||
|
||||
// care for counter
|
||||
if (snippetsScoreCounter == java.lang.Integer.MAX_VALUE) {
|
||||
snippetsScoreCounter = 0;
|
||||
snippetsScore = new kelondroMScoreCluster();
|
||||
snippetsCache = new HashMap();
|
||||
}
|
||||
|
||||
// flush cache if cache is full
|
||||
while (snippetsCache.size() > maxCache) {
|
||||
key = (String) snippetsScore.getMinObject();
|
||||
snippetsScore.deleteScore(key);
|
||||
snippetsCache.remove(key);
|
||||
}
|
||||
}
|
||||
|
||||
private String retrieve(String wordhashes, String urlhash) {
|
||||
// generate key
|
||||
String key = urlhash + wordhashes;
|
||||
return (String) snippetsCache.get(key);
|
||||
}
|
||||
|
||||
public String retrieve(java.net.URL url, boolean fetchOnline, Set query, boolean queryAreHashes) {
|
||||
if (query.size() == 0) return null;
|
||||
if (!(queryAreHashes)) query = plasmaSearch.words2hashes(query);
|
||||
String urlhash = plasmaURL.urlHash(url);
|
||||
|
||||
// try to get snippet from snippetCache
|
||||
String wordhashes = yacySearch.set2string(query);
|
||||
String snippet = retrieve(wordhashes, urlhash);
|
||||
if (snippet != null) return snippet;
|
||||
|
||||
// if the snippet is not in the cache, we can try to get it from the htcache
|
||||
plasmaParserDocument document = getDocument(url, fetchOnline);
|
||||
if (document == null) return null;
|
||||
String[] sentences = document.getSentences();
|
||||
//System.out.println("----" + url.toString()); for (int l = 0; l < sentences.length; l++) System.out.println(sentences[l]);
|
||||
if ((sentences == null) || (sentences.length == 0)) return null;
|
||||
|
||||
// we have found a parseable non-empty file: use the lines
|
||||
TreeMap sentencematrix = hashMatrix(sentences);
|
||||
Iterator i = query.iterator();
|
||||
String hash;
|
||||
kelondroMScoreCluster hitTable = new kelondroMScoreCluster();
|
||||
Iterator j;
|
||||
Integer sentencenumber;
|
||||
Map.Entry entry;
|
||||
while (i.hasNext()) {
|
||||
hash = (String) i.next();
|
||||
j = sentencematrix.entrySet().iterator();
|
||||
while (j.hasNext()) {
|
||||
entry = (Map.Entry) j.next();
|
||||
sentencenumber = (Integer) entry.getKey();
|
||||
if (((HashSet) entry.getValue()).contains(hash)) hitTable.addScore(sentencenumber, sentences[sentencenumber.intValue()].length());
|
||||
}
|
||||
}
|
||||
Integer maxLine = (Integer) hitTable.getMaxObject();
|
||||
if (maxLine == null) return null;
|
||||
snippet = sentences[maxLine.intValue()];
|
||||
if (snippet.length() > 140) return null;
|
||||
|
||||
// finally store this snippet in our own cache
|
||||
store(wordhashes, urlhash, snippet);
|
||||
return snippet;
|
||||
}
|
||||
|
||||
private TreeMap hashMatrix(String[] sentences) {
|
||||
TreeMap map = new TreeMap();
|
||||
HashSet set;
|
||||
Enumeration words;
|
||||
for (int i = 0; i < sentences.length; i++) {
|
||||
set = new HashSet();
|
||||
words = plasmaCondenser.wordTokenizer(sentences[i]);
|
||||
while (words.hasMoreElements()) set.add(plasmaWordIndexEntry.word2hash((String) words.nextElement()));
|
||||
map.put(new Integer(i), set);
|
||||
}
|
||||
return map;
|
||||
}
|
||||
|
||||
private byte[] getResource(URL url, boolean fetchOnline) {
|
||||
// load the url as resource from the web
|
||||
try {
|
||||
//return httpc.singleGET(url, 5000, null, null, remoteProxyHost, remoteProxyPort);
|
||||
byte[] resource = getResourceFromCache(url);
|
||||
if ((fetchOnline) && (resource == null)) {
|
||||
loadResourceFromWeb(url, 5000);
|
||||
resource = getResourceFromCache(url);
|
||||
}
|
||||
return resource;
|
||||
} catch (IOException e) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
private byte[] getResourceFromCache(URL url) {
|
||||
// load the url as resource from the cache
|
||||
String path = htmlFilterContentScraper.urlNormalform(url).substring(6);
|
||||
File cache = cacheManager.cachePath;
|
||||
File f = new File(cache, path);
|
||||
if (f.exists()) try {
|
||||
return serverFileUtils.read(f);
|
||||
} catch (IOException e) {
|
||||
return null;
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
private void loadResourceFromWeb(URL url, int socketTimeout) throws IOException {
|
||||
plasmaCrawlWorker.load(
|
||||
url,
|
||||
null,
|
||||
null,
|
||||
0,
|
||||
null,
|
||||
socketTimeout,
|
||||
remoteProxyHost,
|
||||
remoteProxyPort,
|
||||
remoteProxyUse,
|
||||
cacheManager,
|
||||
log);
|
||||
}
|
||||
|
||||
public plasmaParserDocument getDocument(URL url, boolean fetchOnline) {
|
||||
byte[] resource = getResource(url, fetchOnline);
|
||||
if (resource == null) return null;
|
||||
httpHeader header = null;
|
||||
try {
|
||||
header = cacheManager.getCachedResponse(plasmaURL.urlHash(url));
|
||||
} catch (IOException e) {
|
||||
return null;
|
||||
}
|
||||
if (header == null) return null;
|
||||
if (plasmaParser.supportedMimeTypesContains(header.mime())) {
|
||||
return parser.parseSource(url, header.mime(), resource);
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
}
|
Loading…
Reference in new issue