added experimental snipplet-generation (to be disabled for 0.38)

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@206 6c8d7289-2bf4-0310-a012-ef5d649a1542
author orbiter
parent 3771b10b89
commit d8fdc2526e

@@ -3,7 +3,7 @@ javacSource=1.4
javacTarget=1.4
# Release Configuration
releaseVersion=0.376
releaseVersion=0.377
releaseFile=yacy_dev_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
releaseDir=yacy_dev_v${releaseVersion}_${DSTAMP}_${releaseNr}
releaseNr=$Revision$

@@ -137,7 +137,7 @@
<include name="de/anomic/yacy/seedUpload/yacySeedUploadFtp.java"/>
</javac>
<!-- compiling htroot and htdocsdefault -->
<!-- compiling htroot, htroot/yacy and htroot/htdocsdefault -->
<javac srcdir="${htroot}/" destdir="${htroot}"
classpathref="project.class.path"
source="${javacSource}" target="${javacTarget}"/>

@@ -104,6 +104,12 @@ from 'late' peers to enrich this search result.
<p><b>
#[description]#
</b><br>
#(snipplet)#
::
<i>
#[text]#
</i><br>
#(/snipplet)#
<a href="#[url]#">#[urlname]#</a><br>
#[date]#<br></p>
<!-- link end -->
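For reference, the #(snipplet)# block above follows YaCy's servlet template convention: #[key]# is a plain substitution, while #(key)# ... :: ... #(/key)# encloses numbered alternatives selected by the integer the servlet stores under "key". Alternative 0 (before the ::) renders nothing; alternative 1 renders the snippet in italics. The matching servlet side, as wired up in plasmaSwitchboard further down in this commit:

// no snippet found: select the empty alternative 0
prop.put("results_" + i + "_snipplet", 0);
prop.put("results_" + i + "_snipplet_text", "");
// snippet found: select alternative 1, which renders <i>#[text]#</i>
prop.put("results_" + i + "_snipplet", 1);
prop.put("results_" + i + "_snipplet_text", snipplet);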

@@ -127,8 +127,6 @@ public class plasmaCondenser {
}
public static String intString(int number, int length) {
String s = "" + number;
while (s.length() < length) s = "0" + s;
@@ -323,7 +321,16 @@ public class plasmaCondenser {
}
public void reconstruct() {
public void print() {
String[] s = sentences();
// print out a reconstruction of the text
for (int i = 0; i < s.length; i++) {
if (s[i] != null) System.out.print("#T " + intString(i, numlength) + " " + s[i]);
}
}
public String[] sentences() {
// we reconstruct the word hashtable
// and order the entries by the number of the sentence
// this structure is only needed to reconstruct the text
@@ -342,20 +349,24 @@ public class plasmaCondenser {
Object[] orderedSentences = makeOrderedSentences();
// printout a reconstruction of the text
// create a reconstruction of the text
String[] result = new String[orderedSentences.length];
String s;
for (int i = 0; i < orderedSentences.length; i++) {
if (orderedSentences[i] != null) {
System.out.print("#T " + intString(i, numlength) + " " + ((String[]) orderedSentences[i])[0] + " ");
s = "";
for (int j = 2; j < ((String[]) orderedSentences[i]).length; j++) {
System.out.print(" " +
orderedWords[Integer.parseInt(((String[]) orderedSentences[i])[j])]
);
s += " " + orderedWords[Integer.parseInt(((String[]) orderedSentences[i])[j])];
}
System.out.println(((String[]) orderedSentences[i])[1]);
}
s += ((String[]) orderedSentences[i])[1];
result[i] = (s.length() > 1) ? s.substring(1) : s;
} else {
result[i] = "";
}
}
return result;
}
private Object[] makeOrderedSentences() {
// we reconstruct the sentence hashtable again and create by-handle ordered entries
// this structure is needed to present the strings in the right order in a printout
@@ -652,7 +663,7 @@ public class plasmaCondenser {
textStream.close();
// output result
pc.writeMapToFile(new File(args[2]));
pc.reconstruct();
pc.print();
System.out.println("ANALYSIS:" + pc.getAnalysis().toString());
} catch (IOException e) {
System.out.println("Problem with input file: " + e.getMessage());

@@ -543,7 +543,14 @@ public class plasmaCrawlLURL extends plasmaURL {
"}";
}
public String toString() {
public String toString(String snipplet) {
// add information needed for remote transport
String core = corePropList();
if (core == null) return null;
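// the snippet is run through crypt.simpleEncode, presumably so that characters
// like ',' or '}' in the text cannot break the prop-list format below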
return "{" + core + ",snipplet=" + crypt.simpleEncode(snipplet) + "}";
}
public String toString() {
String core = corePropList();
if (core == null) return null;
return "{" + core + "}";

@@ -73,7 +73,7 @@ public final class plasmaCrawlWorker extends Thread {
private int depth;
private long startdate;
private plasmaCrawlProfile.entry profile;
private String error;
//private String error;
private boolean running = false;
private boolean stopped = false;
@@ -110,7 +110,7 @@
this.profile = theMsg.profile;
this.startdate = System.currentTimeMillis();
this.error = null;
//this.error = null;
this.done = false;
@@ -129,7 +129,7 @@
this.depth = 0;
this.startdate = 0;
this.profile = null;
this.error = null;
//this.error = null;
}
public void run() {
@@ -177,7 +177,10 @@
public void execute() throws IOException {
try {
this.setName(this.threadBaseName + "_" + this.url);
load(this.url, this.referer, this.initiator, this.depth, this.profile);
load(this.url, this.referer, this.initiator, this.depth, this.profile,
this.socketTimeout, this.remoteProxyHost, this.remoteProxyPort, this.remoteProxyUse,
this.cacheManager, this.log);
} catch (IOException e) {
//throw e;
}
@@ -186,6 +189,7 @@
}
}
/*
private httpc newhttpc(String server, int port, boolean ssl) throws IOException {
// a new httpc connection, combined with possible remote proxy
if (remoteProxyUse)
@@ -289,7 +293,8 @@
if (remote != null) httpc.returnInstance(remote);
}
}
*/
public void setStopped(boolean stopped) {
this.stopped = stopped;
}
@@ -298,5 +303,112 @@
return this.running;
}
public static void load(
URL url,
String referer,
String initiator,
int depth,
plasmaCrawlProfile.entry profile,
int socketTimeout,
String remoteProxyHost,
int remoteProxyPort,
boolean remoteProxyUse,
plasmaHTCache cacheManager,
serverLog log
) throws IOException {
if (url == null) return;
Date requestDate = new Date(); // remember the time...
String host = url.getHost();
String path = url.getPath();
int port = url.getPort();
boolean ssl = url.getProtocol().equals("https");
if (port < 0) port = (ssl) ? 443 : 80;
// set referrer; in some cases advertise a little bit:
referer = (referer == null) ? "" : referer.trim();
if (referer.length() == 0) referer = "http://www.yacy.net/yacy/";
// take a file from the net
httpc remote = null;
try {
// create a request header
httpHeader requestHeader = new httpHeader();
requestHeader.put("User-Agent", httpdProxyHandler.userAgent);
requestHeader.put("Referer", referer);
requestHeader.put("Accept-Encoding", "gzip,deflate");
//System.out.println("CRAWLER_REQUEST_HEADER=" + requestHeader.toString()); // DEBUG
// open the connection
if (remoteProxyUse)
remote = httpc.getInstance(host, port, socketTimeout, ssl, remoteProxyHost, remoteProxyPort);
else
remote = httpc.getInstance(host, port, socketTimeout, ssl);
// send request
httpc.response res = remote.GET(path, requestHeader);
if (res.status.startsWith("200")) {
// the transfer is ok
long contentLength = res.responseHeader.contentLength();
// reserve cache entry
plasmaHTCache.Entry htCache = cacheManager.newEntry(requestDate, depth, url, requestHeader, res.status, res.responseHeader, initiator, profile);
// request has been placed and result has been returned. work off response
File cacheFile = cacheManager.getCachePath(url);
try {
String error = null;
if (!(plasmaParser.supportedMimeTypesContains(res.responseHeader.mime()))) {
// if the response does not have a supported mime type then reject the file
remote.close();
log.logInfo("REJECTED WRONG MIME TYPE " + res.responseHeader.mime() + " for url " + url.toString());
htCache.status = plasmaHTCache.CACHE_UNFILLED;
} else if ((profile == null) || ((profile.storeHTCache()) && ((error = htCache.shallStoreCache()) == null))) {
// we write the new cache entry to file system directly
cacheFile.getParentFile().mkdirs();
FileOutputStream fos = new FileOutputStream(cacheFile);
htCache.cacheArray = res.writeContent(fos); // writes in cacheArray and cache file
fos.close();
htCache.status = plasmaHTCache.CACHE_FILL;
} else {
if (error != null) log.logDebug("CRAWLER NOT STORED RESOURCE " + url.toString() + ": " + error);
// anyway, the content still lives in the content scraper
htCache.cacheArray = res.writeContent(null); // writes only into cacheArray
htCache.status = plasmaHTCache.CACHE_PASSING;
}
// enQueue new entry with response header
if ((initiator == null) || (initiator.length() == 0)) {
// enqueued for proxy writings
cacheManager.stackProcess(htCache);
} else {
// direct processing for crawling
cacheManager.process(htCache);
}
} catch (SocketException e) {
// this may happen if the client suddenly closes its connection
// maybe the user has stopped loading
// in that case, we are not responsible and just forget it
// but we clean the cache also, since it may be only partial
// and most likely corrupted
if (cacheFile.exists()) cacheFile.delete();
log.logError("CRAWLER LOADER ERROR1: with url=" + url.toString() + ": " + e.toString());
}
} else {
// if the response does not have the right status code then reject the file
log.logInfo("REJECTED WRONG STATUS TYPE '" + res.status + "' for url " + url.toString());
// not processed any further
}
remote.close();
} catch (Exception e) {
// this may happen if the target host does not exist or something else went
// wrong with the remote server.
log.logError("CRAWLER LOADER ERROR2 with url=" + url.toString() + ": " + e.toString());
e.printStackTrace();
} finally {
if (remote != null) httpc.returnInstance(remote);
}
}
}
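Because load() is now a static method that receives all of its collaborators as parameters, it can be called from outside a crawler worker thread; plasmaSwitchboard.loadResourceFromWeb further down does exactly that. A minimal usage sketch (the URL and timeout are example values, not taken from the commit):

// fetch one page through the crawler code path, without a worker thread
plasmaCrawlWorker.load(
new URL("http://www.yacy.net/"), // example target
null, // referer: load() advertises a default
null, // initiator: none, so the result is stacked like a proxy entry
0, // crawl depth
null, // no crawl profile
5000, // socket timeout in milliseconds
null, 0, false, // no remote proxy
cacheManager, log);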

@@ -42,6 +42,8 @@
package de.anomic.plasma;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.net.URL;
import java.util.HashMap;
import java.util.Iterator;
@@ -65,6 +67,7 @@ public class plasmaParserDocument {
Map hyperlinks;
Map medialinks;
Map emaillinks;
plasmaCondenser condenser;
public plasmaParserDocument(URL location, String mimeType,
String keywords, String shortTitle, String longTitle,
@@ -83,6 +86,7 @@ public class plasmaParserDocument {
this.hyperlinks = null;
this.medialinks = null;
this.emaillinks = null;
this.condenser = null;
}
private String absolutePath(String relativePath) {
@@ -114,10 +118,20 @@
return text;
}
public plasmaCondenser getCondenser() {
if (condenser == null) try {
condenser = new plasmaCondenser(new ByteArrayInputStream(getText()), 0, 0);
} catch (IOException e) {}
return condenser;
}
public String[] getSentences() {
return getCondenser().sentences();
}
public String getKeywords() {
return this.keywords;
}
}
public Map getAnchors() {
// returns all links embedded as anchors (clickable entities)

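Note that getCondenser() above swallows the IOException, so it can still return null when condensing fails; getSentences() would then throw a NullPointerException. A defensive caller-side sketch ("document" stands for any plasmaParserDocument instance; illustration only):

plasmaCondenser condenser = document.getCondenser();
String[] sentences = (condenser == null) ? new String[0] : condenser.sentences();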
@@ -117,6 +117,7 @@ import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Map;
import java.util.TreeMap;
import java.util.Set;
import java.util.TreeSet;
import java.util.Vector;
@@ -129,6 +130,7 @@ import de.anomic.http.httpHeader;
import de.anomic.http.httpc;
import de.anomic.kelondro.kelondroException;
import de.anomic.kelondro.kelondroMSetTools;
import de.anomic.kelondro.kelondroMScoreCluster;
import de.anomic.kelondro.kelondroTables;
import de.anomic.server.serverAbstractSwitch;
import de.anomic.server.serverCodings;
@@ -174,6 +176,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
public wikiBoard wikiDB;
public String remoteProxyHost;
public int remoteProxyPort;
public boolean remoteProxyUse;
public plasmaCrawlProfile profiles;
public plasmaCrawlProfile.entry defaultProxyProfile;
public plasmaCrawlProfile.entry defaultRemoteProfile;
@@ -205,7 +208,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
} catch (NumberFormatException e) {
remoteProxyPort = 3128;
}
if (!(getConfig("remoteProxyUse", "false").equals("true"))) {
if (getConfig("remoteProxyUse", "false").equals("true")) {
remoteProxyUse = true;
} else {
remoteProxyUse = false;
remoteProxyHost = null;
remoteProxyPort = 0;
}
@@ -340,11 +346,12 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// init migration from 0.37 -> 0.38
classicCache = new plasmaWordIndexClassicCacheMigration(plasmaPath, wordIndex);
setConfig("99_indexcachemigration_idlesleep" , 10000);
setConfig("99_indexcachemigration_busysleep" , 40);
deployThread("99_indexcachemigration", "index cache migration", "migration of index cache data structures 0.37 -> 0.38",
new serverInstantThread(classicCache, "oneStepMigration", "size"), 30000);
if (classicCache.size() > 0) {
setConfig("99_indexcachemigration_idlesleep" , 10000);
setConfig("99_indexcachemigration_busysleep" , 40);
deployThread("99_indexcachemigration", "index cache migration", "migration of index cache data structures 0.37 -> 0.38",
new serverInstantThread(classicCache, "oneStepMigration", "size"), 30000);
}
}
private static String ppRamString(int bytes) {
@@ -1211,12 +1218,21 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
*/
//addScoreForked(ref, gs, descr.split(" "));
//addScoreForked(ref, gs, urlstring.split("/"));
String snipplet;
if (urlstring.matches(urlmask)) { //.* is default
prop.put("results_" + i + "_description", descr);
prop.put("results_" + i + "_url", urlstring);
prop.put("results_" + i + "_urlname", urlname);
prop.put("results_" + i + "_date", dateString(urlentry.moddate()));
prop.put("results_" + i + "_size", Long.toString(urlentry.size()));
prop.put("results_" + i + "_size", Long.toString(urlentry.size()));
snipplet = getSnipplet(url, false, querywords, false);
if ((snipplet == null) || (snipplet.length() < 10)) {
prop.put("results_" + i + "_snipplet", 0);
prop.put("results_" + i + "_snipplet_text", "");
} else {
prop.put("results_" + i + "_snipplet", 1);
prop.put("results_" + i + "_snipplet_text", snipplet);
}
i++;
}
}
@@ -1283,9 +1299,15 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
String resource = "";
//plasmaIndexEntry pie;
plasmaCrawlLURL.entry urlentry;
String snipplet;
while ((acc.hasMoreElements()) && (i < count)) {
urlentry = acc.nextElement();
resource = urlentry.toString();
snipplet = getSnipplet(urlentry.url(), false, hashes, true);
if ((snipplet == null) || (snipplet.length() < 10)) {
resource = urlentry.toString();
} else {
resource = urlentry.toString(snipplet);
}
if (resource != null) {
links.append("resource").append(i).append("=").append(resource).append(serverCore.crlfString);
i++;
@@ -1352,7 +1374,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
URL url = entry.url();
if (url == null) return 0;
// get set of words
Set words = plasmaCondenser.getWords(getText(getResource(url, fetchOnline)));
//Set words = plasmaCondenser.getWords(getText(getResource(url, fetchOnline)));
Set words = plasmaCondenser.getWords(getDocument(url, fetchOnline).getText());
// delete all word references
int count = removeReferences(urlhash, words);
// finally delete the url entry itself
@@ -1380,13 +1403,18 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
}
private byte[] getResource(URL url, boolean fetchOnline) {
byte[] resource = null;
// first load from cache
resource = getResourceFromCache(url);
// if that did not succeed then load from the web
if ((fetchOnline) && (resource == null)) resource = getResourceFromWeb(url);
// the result
return resource;
// load the url as resource from the web
try {
//return httpc.singleGET(url, 5000, null, null, remoteProxyHost, remoteProxyPort);
byte[] resource = getResourceFromCache(url);
if ((fetchOnline) && (resource == null)) {
loadResourceFromWeb(url, 5000);
resource = getResourceFromCache(url);
}
return resource;
} catch (IOException e) {
return null;
}
}
private byte[] getResourceFromCache(URL url) {
@@ -1394,33 +1422,89 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
String path = htmlFilterContentScraper.urlNormalform(url).substring(6);
File cache = new File(getRootPath(), getConfig("proxyCache", "DATA/HTCACHE"));
File f = new File(cache, path);
try {
if (f.exists()) try {
return serverFileUtils.read(f);
} catch (IOException e) {
return null;
} else {
return null;
}
}
private byte[] getResourceFromWeb(URL url) {
// load the url as resource from the web
try {
return httpc.singleGET(url, 5000, null, null, remoteProxyHost, remoteProxyPort);
} catch (IOException e) {
return null;
}
private void loadResourceFromWeb(URL url, int socketTimeout) throws IOException {
plasmaCrawlWorker.load(
url,
null,
null,
0,
null,
socketTimeout,
remoteProxyHost,
remoteProxyPort,
remoteProxyUse,
cacheManager,
log);
}
private static byte[] getText(byte[] resource) {
private plasmaParserDocument getDocument(URL url, boolean fetchOnline) {
byte[] resource = getResource(url, fetchOnline);
if (resource == null) return null;
// generate word list from resource
htmlFilterContentScraper scraper = new htmlFilterContentScraper(null);
OutputStream os = new htmlFilterOutputStream(null, scraper, null, false);
httpHeader header = null;
try {
serverFileUtils.write(resource, os);
return scraper.getText();
header = cacheManager.getCachedResponse(plasmaURL.urlHash(url));
} catch (IOException e) {
return null;
}
if (header == null) return null;
if (plasmaParser.supportedMimeTypesContains(header.mime())) {
return parser.parseSource(url, header.mime(), resource);
} else {
return null;
}
}
private String getSnipplet(URL url, boolean fetchOnline, Set query, boolean queryAreHashes) {
if (query.size() == 0) return null;
plasmaParserDocument document = getDocument(url, fetchOnline);
if (document == null) return null;
String[] sentences = document.getSentences();
//System.out.println("----" + url.toString()); for (int l = 0; l < sentences.length; l++) System.out.println(sentences[l]);
if ((sentences == null) || (sentences.length == 0)) return null;
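// sentencematrix maps each sentence number to the set of word hashes occurring in that sentence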
TreeMap sentencematrix = hashMatrix(sentences);
if (!(queryAreHashes)) query = plasmaSearch.words2hashes(query);
Iterator i = query.iterator();
String hash;
kelondroMScoreCluster hitTable = new kelondroMScoreCluster();
Iterator j;
Integer sentencenumber;
Map.Entry entry;
while (i.hasNext()) {
hash = (String) i.next();
j = sentencematrix.entrySet().iterator();
while (j.hasNext()) {
entry = (Map.Entry) j.next();
sentencenumber = (Integer) entry.getKey();
if (((HashSet) entry.getValue()).contains(hash)) hitTable.addScore(sentencenumber, sentences[sentencenumber.intValue()].length());
}
}
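// choose the sentence with the highest accumulated score as the snippet candidate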
Integer maxLine = (Integer) hitTable.getMaxObject();
if (maxLine == null) return null;
String snipplet = sentences[maxLine.intValue()];
if (snipplet.length() > 140) return null;
return snipplet;
}
private TreeMap hashMatrix(String[] sentences) {
TreeMap map = new TreeMap();
HashSet set;
Enumeration words;
for (int i = 0; i < sentences.length; i++) {
set = new HashSet();
words = plasmaCondenser.wordTokenizer(sentences[i]);
while (words.hasMoreElements()) set.add(plasmaWordIndexEntry.word2hash((String) words.nextElement()));
map.put(new Integer(i), set);
}
return map;
}
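Taken together, getSnipplet() and hashMatrix() implement a simple heuristic: every sentence earns its own length as score for each query word (hash) it contains, the sentence with the highest total wins, and a winner longer than 140 characters is discarded rather than truncated. A self-contained sketch of the same heuristic, with lower-cased words standing in for plasmaWordIndexEntry.word2hash and a plain loop standing in for kelondroMScoreCluster (illustration only, not the YaCy code):

import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

public class SnippetSketch {
// score = sum of sentence.length() over every query word the sentence contains
public static String pickSnippet(String[] sentences, Set<String> query) {
String best = null;
int bestScore = 0;
for (String sentence : sentences) {
Set<String> words = new HashSet<String>(
Arrays.asList(sentence.toLowerCase().split("\\W+")));
int score = 0;
for (String q : query) {
if (words.contains(q)) score += sentence.length();
}
if (score > bestScore) { bestScore = score; best = sentence; }
}
// like getSnipplet(): reject over-long candidates instead of truncating
if (best == null || best.length() > 140) return null;
return best;
}

public static void main(String[] args) {
String[] sentences = {
"YaCy is a peer-to-peer search engine.",
"Both peers exchange results over HTTP."
};
Set<String> query = new HashSet<String>(Arrays.asList("search", "peer"));
// prints the first sentence: it contains both query words,
// so it scores twice its length
System.out.println(pickSnippet(sentences, query));
}
}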
public class distributeIndex {
