git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@218 6c8d7289-2bf4-0310-a012-ef5d649a1542pull/1/head
parent
9244b6ad6f
commit
ca3b4ccaf4
@ -0,0 +1,234 @@
|
|||||||
|
// plasmaSnippetCache.java
|
||||||
|
// -----------------------
|
||||||
|
// part of YaCy
|
||||||
|
// (C) by Michael Peter Christen; mc@anomic.de
|
||||||
|
// first published on http://www.anomic.de
|
||||||
|
// Frankfurt, Germany, 2005
|
||||||
|
// last major change: 07.06.2005
|
||||||
|
//
|
||||||
|
// This program is free software; you can redistribute it and/or modify
|
||||||
|
// it under the terms of the GNU General Public License as published by
|
||||||
|
// the Free Software Foundation; either version 2 of the License, or
|
||||||
|
// (at your option) any later version.
|
||||||
|
//
|
||||||
|
// This program is distributed in the hope that it will be useful,
|
||||||
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
// GNU General Public License for more details.
|
||||||
|
//
|
||||||
|
// You should have received a copy of the GNU General Public License
|
||||||
|
// along with this program; if not, write to the Free Software
|
||||||
|
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||||
|
//
|
||||||
|
// Using this software in any meaning (reading, learning, copying, compiling,
|
||||||
|
// running) means that you agree that the Author(s) is (are) not responsible
|
||||||
|
// for cost, loss of data or any harm that may be caused directly or indirectly
|
||||||
|
// by usage of this softare or this documentation. The usage of this software
|
||||||
|
// is on your own risk. The installation and usage (starting/running) of this
|
||||||
|
// software may allow other people or application to access your computer and
|
||||||
|
// any attached devices and is highly dependent on the configuration of the
|
||||||
|
// software which must be done by the user of the software; the author(s) is
|
||||||
|
// (are) also not responsible for proper configuration and usage of the
|
||||||
|
// software, even if provoked by documentation provided together with
|
||||||
|
// the software.
|
||||||
|
//
|
||||||
|
// Any changes to this file according to the GPL as documented in the file
|
||||||
|
// gpl.txt aside this file in the shipment you received can be done to the
|
||||||
|
// lines that follows this copyright notice here, but changes must not be
|
||||||
|
// done inside the copyright notive above. A re-distribution must contain
|
||||||
|
// the intact and unchanged copyright notice.
|
||||||
|
// Contributions and changes to the program code must be marked as such.
|
||||||
|
|
||||||
|
|
||||||
|
package de.anomic.plasma;
|
||||||
|
|
||||||
|
import java.util.*;
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.net.URL;
|
||||||
|
import de.anomic.htmlFilter.htmlFilterContentScraper;
|
||||||
|
import de.anomic.kelondro.kelondroMScoreCluster;
|
||||||
|
import de.anomic.server.serverFileUtils;
|
||||||
|
import de.anomic.server.serverLog;
|
||||||
|
import de.anomic.http.httpHeader;
|
||||||
|
import de.anomic.yacy.yacySearch;
|
||||||
|
|
||||||
|
public class plasmaSnippetCache {
|
||||||
|
|
||||||
|
private static final int maxCache = 500;
|
||||||
|
|
||||||
|
private int snippetsScoreCounter;
|
||||||
|
private kelondroMScoreCluster snippetsScore;
|
||||||
|
private HashMap snippetsCache;
|
||||||
|
private plasmaHTCache cacheManager;
|
||||||
|
private plasmaParser parser;
|
||||||
|
private serverLog log;
|
||||||
|
private String remoteProxyHost;
|
||||||
|
private int remoteProxyPort;
|
||||||
|
private boolean remoteProxyUse;
|
||||||
|
|
||||||
|
public plasmaSnippetCache(plasmaHTCache cacheManager, plasmaParser parser,
|
||||||
|
String remoteProxyHost, int remoteProxyPort, boolean remoteProxyUse,
|
||||||
|
serverLog log) {
|
||||||
|
this.cacheManager = cacheManager;
|
||||||
|
this.parser = parser;
|
||||||
|
this.log = log;
|
||||||
|
this.remoteProxyHost = remoteProxyHost;
|
||||||
|
this.remoteProxyPort = remoteProxyPort;
|
||||||
|
this.remoteProxyUse = remoteProxyUse;
|
||||||
|
this.snippetsScoreCounter = 0;
|
||||||
|
this.snippetsScore = new kelondroMScoreCluster();
|
||||||
|
this.snippetsCache = new HashMap();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public synchronized void store(String wordhashes, String urlhash, String snippet) {
|
||||||
|
// generate key
|
||||||
|
String key = urlhash + wordhashes;
|
||||||
|
|
||||||
|
// do nothing if snippet is known
|
||||||
|
if (snippetsCache.containsKey(key)) return;
|
||||||
|
|
||||||
|
// learn new snippet
|
||||||
|
snippetsScore.addScore(key, snippetsScoreCounter++);
|
||||||
|
snippetsCache.put(key, snippet);
|
||||||
|
|
||||||
|
// care for counter
|
||||||
|
if (snippetsScoreCounter == java.lang.Integer.MAX_VALUE) {
|
||||||
|
snippetsScoreCounter = 0;
|
||||||
|
snippetsScore = new kelondroMScoreCluster();
|
||||||
|
snippetsCache = new HashMap();
|
||||||
|
}
|
||||||
|
|
||||||
|
// flush cache if cache is full
|
||||||
|
while (snippetsCache.size() > maxCache) {
|
||||||
|
key = (String) snippetsScore.getMinObject();
|
||||||
|
snippetsScore.deleteScore(key);
|
||||||
|
snippetsCache.remove(key);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private String retrieve(String wordhashes, String urlhash) {
|
||||||
|
// generate key
|
||||||
|
String key = urlhash + wordhashes;
|
||||||
|
return (String) snippetsCache.get(key);
|
||||||
|
}
|
||||||
|
|
||||||
|
public String retrieve(java.net.URL url, boolean fetchOnline, Set query, boolean queryAreHashes) {
|
||||||
|
if (query.size() == 0) return null;
|
||||||
|
if (!(queryAreHashes)) query = plasmaSearch.words2hashes(query);
|
||||||
|
String urlhash = plasmaURL.urlHash(url);
|
||||||
|
|
||||||
|
// try to get snippet from snippetCache
|
||||||
|
String wordhashes = yacySearch.set2string(query);
|
||||||
|
String snippet = retrieve(wordhashes, urlhash);
|
||||||
|
if (snippet != null) return snippet;
|
||||||
|
|
||||||
|
// if the snippet is not in the cache, we can try to get it from the htcache
|
||||||
|
plasmaParserDocument document = getDocument(url, fetchOnline);
|
||||||
|
if (document == null) return null;
|
||||||
|
String[] sentences = document.getSentences();
|
||||||
|
//System.out.println("----" + url.toString()); for (int l = 0; l < sentences.length; l++) System.out.println(sentences[l]);
|
||||||
|
if ((sentences == null) || (sentences.length == 0)) return null;
|
||||||
|
|
||||||
|
// we have found a parseable non-empty file: use the lines
|
||||||
|
TreeMap sentencematrix = hashMatrix(sentences);
|
||||||
|
Iterator i = query.iterator();
|
||||||
|
String hash;
|
||||||
|
kelondroMScoreCluster hitTable = new kelondroMScoreCluster();
|
||||||
|
Iterator j;
|
||||||
|
Integer sentencenumber;
|
||||||
|
Map.Entry entry;
|
||||||
|
while (i.hasNext()) {
|
||||||
|
hash = (String) i.next();
|
||||||
|
j = sentencematrix.entrySet().iterator();
|
||||||
|
while (j.hasNext()) {
|
||||||
|
entry = (Map.Entry) j.next();
|
||||||
|
sentencenumber = (Integer) entry.getKey();
|
||||||
|
if (((HashSet) entry.getValue()).contains(hash)) hitTable.addScore(sentencenumber, sentences[sentencenumber.intValue()].length());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Integer maxLine = (Integer) hitTable.getMaxObject();
|
||||||
|
if (maxLine == null) return null;
|
||||||
|
snippet = sentences[maxLine.intValue()];
|
||||||
|
if (snippet.length() > 140) return null;
|
||||||
|
|
||||||
|
// finally store this snippet in our own cache
|
||||||
|
store(wordhashes, urlhash, snippet);
|
||||||
|
return snippet;
|
||||||
|
}
|
||||||
|
|
||||||
|
private TreeMap hashMatrix(String[] sentences) {
|
||||||
|
TreeMap map = new TreeMap();
|
||||||
|
HashSet set;
|
||||||
|
Enumeration words;
|
||||||
|
for (int i = 0; i < sentences.length; i++) {
|
||||||
|
set = new HashSet();
|
||||||
|
words = plasmaCondenser.wordTokenizer(sentences[i]);
|
||||||
|
while (words.hasMoreElements()) set.add(plasmaWordIndexEntry.word2hash((String) words.nextElement()));
|
||||||
|
map.put(new Integer(i), set);
|
||||||
|
}
|
||||||
|
return map;
|
||||||
|
}
|
||||||
|
|
||||||
|
private byte[] getResource(URL url, boolean fetchOnline) {
|
||||||
|
// load the url as resource from the web
|
||||||
|
try {
|
||||||
|
//return httpc.singleGET(url, 5000, null, null, remoteProxyHost, remoteProxyPort);
|
||||||
|
byte[] resource = getResourceFromCache(url);
|
||||||
|
if ((fetchOnline) && (resource == null)) {
|
||||||
|
loadResourceFromWeb(url, 5000);
|
||||||
|
resource = getResourceFromCache(url);
|
||||||
|
}
|
||||||
|
return resource;
|
||||||
|
} catch (IOException e) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private byte[] getResourceFromCache(URL url) {
|
||||||
|
// load the url as resource from the cache
|
||||||
|
String path = htmlFilterContentScraper.urlNormalform(url).substring(6);
|
||||||
|
File cache = cacheManager.cachePath;
|
||||||
|
File f = new File(cache, path);
|
||||||
|
if (f.exists()) try {
|
||||||
|
return serverFileUtils.read(f);
|
||||||
|
} catch (IOException e) {
|
||||||
|
return null;
|
||||||
|
} else {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void loadResourceFromWeb(URL url, int socketTimeout) throws IOException {
|
||||||
|
plasmaCrawlWorker.load(
|
||||||
|
url,
|
||||||
|
null,
|
||||||
|
null,
|
||||||
|
0,
|
||||||
|
null,
|
||||||
|
socketTimeout,
|
||||||
|
remoteProxyHost,
|
||||||
|
remoteProxyPort,
|
||||||
|
remoteProxyUse,
|
||||||
|
cacheManager,
|
||||||
|
log);
|
||||||
|
}
|
||||||
|
|
||||||
|
public plasmaParserDocument getDocument(URL url, boolean fetchOnline) {
|
||||||
|
byte[] resource = getResource(url, fetchOnline);
|
||||||
|
if (resource == null) return null;
|
||||||
|
httpHeader header = null;
|
||||||
|
try {
|
||||||
|
header = cacheManager.getCachedResponse(plasmaURL.urlHash(url));
|
||||||
|
} catch (IOException e) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
if (header == null) return null;
|
||||||
|
if (plasmaParser.supportedMimeTypesContains(header.mime())) {
|
||||||
|
return parser.parseSource(url, header.mime(), resource);
|
||||||
|
} else {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in new issue