From d8fdc2526e38d5a98029caadd583653179b4d435 Mon Sep 17 00:00:00 2001 From: orbiter Date: Thu, 2 Jun 2005 01:33:10 +0000 Subject: [PATCH] added experimental snipplet-generation (to be disabled for 0.38) git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@206 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- build.properties | 2 +- build.xml | 2 +- htroot/index.html | 6 + source/de/anomic/plasma/plasmaCondenser.java | 35 +++-- source/de/anomic/plasma/plasmaCrawlLURL.java | 9 +- .../de/anomic/plasma/plasmaCrawlWorker.java | 122 ++++++++++++++- .../anomic/plasma/plasmaParserDocument.java | 18 ++- .../de/anomic/plasma/plasmaSwitchboard.java | 144 ++++++++++++++---- 8 files changed, 286 insertions(+), 52 deletions(-) diff --git a/build.properties b/build.properties index 9f5fcbc67..41f8800fa 100644 --- a/build.properties +++ b/build.properties @@ -3,7 +3,7 @@ javacSource=1.4 javacTarget=1.4 # Release Configuration -releaseVersion=0.376 +releaseVersion=0.377 releaseFile=yacy_dev_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz releaseDir=yacy_dev_v${releaseVersion}_${DSTAMP}_${releaseNr} releaseNr=$Revision$ diff --git a/build.xml b/build.xml index f2b576321..3b13db3a2 100644 --- a/build.xml +++ b/build.xml @@ -137,7 +137,7 @@ - + diff --git a/htroot/index.html b/htroot/index.html index 5c7672f17..e70754137 100644 --- a/htroot/index.html +++ b/htroot/index.html @@ -104,6 +104,12 @@ from 'late' peers to enricht this search result.

#[description]#
+#(snipplet)# +:: + +#[text]# +
+#(/snipplet)# #[urlname]#
#[date]#

diff --git a/source/de/anomic/plasma/plasmaCondenser.java b/source/de/anomic/plasma/plasmaCondenser.java index c88f05f4b..bf20c3f3c 100644 --- a/source/de/anomic/plasma/plasmaCondenser.java +++ b/source/de/anomic/plasma/plasmaCondenser.java @@ -127,8 +127,6 @@ public class plasmaCondenser { } - - public static String intString(int number, int length) { String s = "" + number; while (s.length() < length) s = "0" + s; @@ -323,7 +321,16 @@ public class plasmaCondenser { } - public void reconstruct() { + public void print() { + String[] s = sentences(); + + // printout a reconstruction of the text + for (int i = 0; i < s.length; i++) { + if (s[i] != null) System.out.print("#T " + intString(i, numlength) + " " + s[i]); + } + } + + public String[] sentences() { // we reconstruct the word hashtable // and order the entries by the number of the sentence // this structure is only needed to reconstruct the text @@ -342,20 +349,24 @@ public class plasmaCondenser { Object[] orderedSentences = makeOrderedSentences(); - // printout a reconstruction of the text + // create a reconstruction of the text + String[] result = new String[orderedSentences.length]; + String s; for (int i = 0; i < orderedSentences.length; i++) { if (orderedSentences[i] != null) { - System.out.print("#T " + intString(i, numlength) + " " + ((String[]) orderedSentences[i])[0] + " "); + s = ""; for (int j = 2; j < ((String[]) orderedSentences[i]).length; j++) { - System.out.print(" " + - orderedWords[Integer.parseInt(((String[]) orderedSentences[i])[j])] - ); + s += " " + orderedWords[Integer.parseInt(((String[]) orderedSentences[i])[j])]; } - System.out.println(((String[]) orderedSentences[i])[1]); - } + s += ((String[]) orderedSentences[i])[1]; + result[i] = (s.length() > 1) ? s.substring(1) : s; + } else { + result[i] = ""; + } } + return result; } - + private Object[] makeOrderedSentences() { // we reconstruct the sentence hashtable again and create by-handle ordered entries // this structure is needed to present the strings in the right order in a printout @@ -652,7 +663,7 @@ public class plasmaCondenser { textStream.close(); // output result pc.writeMapToFile(new File(args[2])); - pc.reconstruct(); + pc.print(); System.out.println("ANALYSIS:" + pc.getAnalysis().toString()); } catch (IOException e) { System.out.println("Problem with input file: " + e.getMessage()); diff --git a/source/de/anomic/plasma/plasmaCrawlLURL.java b/source/de/anomic/plasma/plasmaCrawlLURL.java index e62275d31..68b3fc83a 100644 --- a/source/de/anomic/plasma/plasmaCrawlLURL.java +++ b/source/de/anomic/plasma/plasmaCrawlLURL.java @@ -543,7 +543,14 @@ public class plasmaCrawlLURL extends plasmaURL { "}"; } - public String toString() { + public String toString(String snipplet) { + // add information needed for remote transport + String core = corePropList(); + if (core == null) return null; + return "{" + core + ",snipplet=" + crypt.simpleEncode(snipplet) + "}"; + } + + public String toString() { String core = corePropList(); if (core == null) return null; return "{" + core + "}"; diff --git a/source/de/anomic/plasma/plasmaCrawlWorker.java b/source/de/anomic/plasma/plasmaCrawlWorker.java index f4964346c..282680fe5 100644 --- a/source/de/anomic/plasma/plasmaCrawlWorker.java +++ b/source/de/anomic/plasma/plasmaCrawlWorker.java @@ -73,7 +73,7 @@ public final class plasmaCrawlWorker extends Thread { private int depth; private long startdate; private plasmaCrawlProfile.entry profile; - private String error; + //private String error; private boolean running = false; private boolean stopped = false; @@ -110,7 +110,7 @@ public final class plasmaCrawlWorker extends Thread { this.profile = theMsg.profile; this.startdate = System.currentTimeMillis(); - this.error = null; + //this.error = null; this.done = false; @@ -129,7 +129,7 @@ public final class plasmaCrawlWorker extends Thread { this.depth = 0; this.startdate = 0; this.profile = null; - this.error = null; + //this.error = null; } public void run() { @@ -177,7 +177,10 @@ public final class plasmaCrawlWorker extends Thread { public void execute() throws IOException { try { this.setName(this.threadBaseName + "_" + this.url); - load(this.url, this.referer, this.initiator, this.depth, this.profile); + load(this.url, this.referer, this.initiator, this.depth, this.profile, + this.socketTimeout, this.remoteProxyHost, this.remoteProxyPort, this.remoteProxyUse, + this.cacheManager, this.log); + } catch (IOException e) { //throw e; } @@ -186,6 +189,7 @@ public final class plasmaCrawlWorker extends Thread { } } + /* private httpc newhttpc(String server, int port, boolean ssl) throws IOException { // a new httpc connection, combined with possible remote proxy if (remoteProxyUse) @@ -289,7 +293,8 @@ public final class plasmaCrawlWorker extends Thread { if (remote != null) httpc.returnInstance(remote); } } - + */ + public void setStopped(boolean stopped) { this.stopped = stopped; } @@ -298,5 +303,112 @@ public final class plasmaCrawlWorker extends Thread { return this.running; } + public static void load( + URL url, + String referer, + String initiator, + int depth, + plasmaCrawlProfile.entry profile, + int socketTimeout, + String remoteProxyHost, + int remoteProxyPort, + boolean remoteProxyUse, + plasmaHTCache cacheManager, + serverLog log + ) throws IOException { + if (url == null) return; + Date requestDate = new Date(); // remember the time... + String host = url.getHost(); + String path = url.getPath(); + int port = url.getPort(); + boolean ssl = url.getProtocol().equals("https"); + if (port < 0) port = (ssl) ? 443 : 80; + + // set referrer; in some case advertise a little bit: + referer = (referer == null) ? "" : referer.trim(); + if (referer.length() == 0) referer = "http://www.yacy.net/yacy/"; + + // take a file from the net + httpc remote = null; + try { + // create a request header + httpHeader requestHeader = new httpHeader(); + requestHeader.put("User-Agent", httpdProxyHandler.userAgent); + requestHeader.put("Referer", referer); + requestHeader.put("Accept-Encoding", "gzip,deflate"); + + //System.out.println("CRAWLER_REQUEST_HEADER=" + requestHeader.toString()); // DEBUG + + // open the connection + if (remoteProxyUse) + remote = httpc.getInstance(host, port, socketTimeout, ssl, remoteProxyHost, remoteProxyPort); + else + remote = httpc.getInstance(host, port, socketTimeout, ssl); + + // send request + httpc.response res = remote.GET(path, requestHeader); + + if (res.status.startsWith("200")) { + // the transfer is ok + long contentLength = res.responseHeader.contentLength(); + + // reserve cache entry + plasmaHTCache.Entry htCache = cacheManager.newEntry(requestDate, depth, url, requestHeader, res.status, res.responseHeader, initiator, profile); + + // request has been placed and result has been returned. work off response + File cacheFile = cacheManager.getCachePath(url); + try { + String error = null; + if (!(plasmaParser.supportedMimeTypesContains(res.responseHeader.mime()))) { + // if the response has not the right file type then reject file + remote.close(); + log.logInfo("REJECTED WRONG MIME TYPE " + res.responseHeader.mime() + " for url " + url.toString()); + htCache.status = plasmaHTCache.CACHE_UNFILLED; + } else if ((profile == null) || ((profile.storeHTCache()) && ((error = htCache.shallStoreCache()) == null))) { + // we write the new cache entry to file system directly + cacheFile.getParentFile().mkdirs(); + FileOutputStream fos = new FileOutputStream(cacheFile); + htCache.cacheArray = res.writeContent(fos); // writes in cacheArray and cache file + fos.close(); + htCache.status = plasmaHTCache.CACHE_FILL; + } else { + if (error != null) log.logDebug("CRAWLER NOT STORED RESOURCE " + url.toString() + ": " + error); + // anyway, the content still lives in the content scraper + htCache.cacheArray = res.writeContent(null); // writes only into cacheArray + htCache.status = plasmaHTCache.CACHE_PASSING; + } + // enQueue new entry with response header + if ((initiator == null) || (initiator.length() == 0)) { + // enqueued for proxy writings + cacheManager.stackProcess(htCache); + } else { + // direct processing for crawling + cacheManager.process(htCache); + } + } catch (SocketException e) { + // this may happen if the client suddenly closes its connection + // maybe the user has stopped loading + // in that case, we are not responsible and just forget it + // but we clean the cache also, since it may be only partial + // and most possible corrupted + if (cacheFile.exists()) cacheFile.delete(); + log.logError("CRAWLER LOADER ERROR1: with url=" + url.toString() + ": " + e.toString()); + } + } else { + // if the response has not the right response type then reject file + log.logInfo("REJECTED WRONG STATUS TYPE '" + res.status + "' for url " + url.toString()); + // not processed any further + } + remote.close(); + } catch (Exception e) { + // this may happen if the targeted host does not exist or anything with the + // remote server was wrong. + log.logError("CRAWLER LOADER ERROR2 with url=" + url.toString() + ": " + e.toString()); + e.printStackTrace(); + } finally { + if (remote != null) httpc.returnInstance(remote); + } + } + } diff --git a/source/de/anomic/plasma/plasmaParserDocument.java b/source/de/anomic/plasma/plasmaParserDocument.java index f5027efb6..30b223084 100644 --- a/source/de/anomic/plasma/plasmaParserDocument.java +++ b/source/de/anomic/plasma/plasmaParserDocument.java @@ -42,6 +42,8 @@ package de.anomic.plasma; +import java.io.ByteArrayInputStream; +import java.io.IOException; import java.net.URL; import java.util.HashMap; import java.util.Iterator; @@ -65,6 +67,7 @@ public class plasmaParserDocument { Map hyperlinks; Map medialinks; Map emaillinks; + plasmaCondenser condenser; public plasmaParserDocument(URL location, String mimeType, String keywords, String shortTitle, String longTitle, @@ -83,6 +86,7 @@ public class plasmaParserDocument { this.hyperlinks = null; this.medialinks = null; this.emaillinks = null; + this.condenser = null; } private String absolutePath(String relativePath) { @@ -114,10 +118,20 @@ public class plasmaParserDocument { return text; } + public plasmaCondenser getCondenser() { + if (condenser == null) try { + condenser = new plasmaCondenser(new ByteArrayInputStream(getText()), 0, 0); + } catch (IOException e) {} + return condenser; + } + + public String[] getSentences() { + return getCondenser().sentences(); + } + public String getKeywords() { return this.keywords; - - } + } public Map getAnchors() { // returns all links embedded as anchors (clickeable entities) diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 93074a9f4..538fe47b1 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -117,6 +117,7 @@ import java.util.HashSet; import java.util.Iterator; import java.util.LinkedList; import java.util.Map; +import java.util.TreeMap; import java.util.Set; import java.util.TreeSet; import java.util.Vector; @@ -129,6 +130,7 @@ import de.anomic.http.httpHeader; import de.anomic.http.httpc; import de.anomic.kelondro.kelondroException; import de.anomic.kelondro.kelondroMSetTools; +import de.anomic.kelondro.kelondroMScoreCluster; import de.anomic.kelondro.kelondroTables; import de.anomic.server.serverAbstractSwitch; import de.anomic.server.serverCodings; @@ -174,6 +176,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser public wikiBoard wikiDB; public String remoteProxyHost; public int remoteProxyPort; + public boolean remoteProxyUse; public plasmaCrawlProfile profiles; public plasmaCrawlProfile.entry defaultProxyProfile; public plasmaCrawlProfile.entry defaultRemoteProfile; @@ -205,7 +208,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser } catch (NumberFormatException e) { remoteProxyPort = 3128; } - if (!(getConfig("remoteProxyUse", "false").equals("true"))) { + if (getConfig("remoteProxyUse", "false").equals("true")) { + remoteProxyUse = true; + } else { + remoteProxyUse = false; remoteProxyHost = null; remoteProxyPort = 0; } @@ -340,11 +346,12 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser // init migratiion from 0.37 -> 0.38 classicCache = new plasmaWordIndexClassicCacheMigration(plasmaPath, wordIndex); - setConfig("99_indexcachemigration_idlesleep" , 10000); - setConfig("99_indexcachemigration_busysleep" , 40); - deployThread("99_indexcachemigration", "index cache migration", "migration of index cache data structures 0.37 -> 0.38", - new serverInstantThread(classicCache, "oneStepMigration", "size"), 30000); - + if (classicCache.size() > 0) { + setConfig("99_indexcachemigration_idlesleep" , 10000); + setConfig("99_indexcachemigration_busysleep" , 40); + deployThread("99_indexcachemigration", "index cache migration", "migration of index cache data structures 0.37 -> 0.38", + new serverInstantThread(classicCache, "oneStepMigration", "size"), 30000); + } } private static String ppRamString(int bytes) { @@ -1211,12 +1218,21 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser */ //addScoreForked(ref, gs, descr.split(" ")); //addScoreForked(ref, gs, urlstring.split("/")); + String snipplet; if (urlstring.matches(urlmask)) { //.* is default prop.put("results_" + i + "_description", descr); prop.put("results_" + i + "_url", urlstring); prop.put("results_" + i + "_urlname", urlname); prop.put("results_" + i + "_date", dateString(urlentry.moddate())); - prop.put("results_" + i + "_size", Long.toString(urlentry.size())); + prop.put("results_" + i + "_size", Long.toString(urlentry.size())); + snipplet = getSnipplet(url, false, querywords, false); + if ((snipplet == null) || (snipplet.length() < 10)) { + prop.put("results_" + i + "_snipplet", 0); + prop.put("results_" + i + "_snipplet_text", ""); + } else { + prop.put("results_" + i + "_snipplet", 1); + prop.put("results_" + i + "_snipplet_text", snipplet); + } i++; } } @@ -1283,9 +1299,15 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser String resource = ""; //plasmaIndexEntry pie; plasmaCrawlLURL.entry urlentry; + String snipplet; while ((acc.hasMoreElements()) && (i < count)) { urlentry = acc.nextElement(); - resource = urlentry.toString(); + snipplet = getSnipplet(urlentry.url(), false, hashes, true); + if ((snipplet == null) || (snipplet.length() < 10)) { + resource = urlentry.toString(); + } else { + resource = urlentry.toString(snipplet); + } if (resource != null) { links.append("resource").append(i).append("=").append(resource).append(serverCore.crlfString); i++; @@ -1352,7 +1374,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser URL url = entry.url(); if (url == null) return 0; // get set of words - Set words = plasmaCondenser.getWords(getText(getResource(url, fetchOnline))); + //Set words = plasmaCondenser.getWords(getText(getResource(url, fetchOnline))); + Set words = plasmaCondenser.getWords(getDocument(url, fetchOnline).getText()); // delete all word references int count = removeReferences(urlhash, words); // finally delete the url entry itself @@ -1380,13 +1403,18 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser } private byte[] getResource(URL url, boolean fetchOnline) { - byte[] resource = null; - // first load from cache - resource = getResourceFromCache(url); - // if not succedded then load from web - if ((fetchOnline) && (resource == null)) resource = getResourceFromWeb(url); - // the result - return resource; + // load the url as resource from the web + try { + //return httpc.singleGET(url, 5000, null, null, remoteProxyHost, remoteProxyPort); + byte[] resource = getResourceFromCache(url); + if ((fetchOnline) && (resource == null)) { + loadResourceFromWeb(url, 5000); + resource = getResourceFromCache(url); + } + return resource; + } catch (IOException e) { + return null; + } } private byte[] getResourceFromCache(URL url) { @@ -1394,33 +1422,89 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser String path = htmlFilterContentScraper.urlNormalform(url).substring(6); File cache = new File(getRootPath(), getConfig("proxyCache", "DATA/HTCACHE")); File f = new File(cache, path); - try { + if (f.exists()) try { return serverFileUtils.read(f); } catch (IOException e) { return null; + } else { + return null; } } - private byte[] getResourceFromWeb(URL url) { - // load the url as resource from the web - try { - return httpc.singleGET(url, 5000, null, null, remoteProxyHost, remoteProxyPort); - } catch (IOException e) { - return null; - } + private void loadResourceFromWeb(URL url, int socketTimeout) throws IOException { + plasmaCrawlWorker.load( + url, + null, + null, + 0, + null, + socketTimeout, + remoteProxyHost, + remoteProxyPort, + remoteProxyUse, + cacheManager, + log); } - private static byte[] getText(byte[] resource) { + private plasmaParserDocument getDocument(URL url, boolean fetchOnline) { + byte[] resource = getResource(url, fetchOnline); if (resource == null) return null; - // generate word list from resource - htmlFilterContentScraper scraper = new htmlFilterContentScraper(null); - OutputStream os = new htmlFilterOutputStream(null, scraper, null, false); + httpHeader header = null; try { - serverFileUtils.write(resource, os); - return scraper.getText(); + header = cacheManager.getCachedResponse(plasmaURL.urlHash(url)); } catch (IOException e) { return null; } + if (header == null) return null; + if (plasmaParser.supportedMimeTypesContains(header.mime())) { + return parser.parseSource(url, header.mime(), resource); + } else { + return null; + } + } + + private String getSnipplet(URL url, boolean fetchOnline, Set query, boolean queryAreHashes) { + if (query.size() == 0) return null; + plasmaParserDocument document = getDocument(url, fetchOnline); + if (document == null) return null; + String[] sentences = document.getSentences(); + //System.out.println("----" + url.toString()); for (int l = 0; l < sentences.length; l++) System.out.println(sentences[l]); + if ((sentences == null) || (sentences.length == 0)) return null; + TreeMap sentencematrix = hashMatrix(sentences); + if (!(queryAreHashes)) query = plasmaSearch.words2hashes(query); + Iterator i = query.iterator(); + String hash; + kelondroMScoreCluster hitTable = new kelondroMScoreCluster(); + Iterator j; + Integer sentencenumber; + Map.Entry entry; + while (i.hasNext()) { + hash = (String) i.next(); + j = sentencematrix.entrySet().iterator(); + while (j.hasNext()) { + entry = (Map.Entry) j.next(); + sentencenumber = (Integer) entry.getKey(); + if (((HashSet) entry.getValue()).contains(hash)) hitTable.addScore(sentencenumber, sentences[sentencenumber.intValue()].length()); + } + } + Integer maxLine = (Integer) hitTable.getMaxObject(); + if (maxLine == null) return null; + String snipplet = sentences[maxLine.intValue()]; + if (snipplet.length() > 140) return null; + return snipplet; + } + + private TreeMap hashMatrix(String[] sentences) { + TreeMap map = new TreeMap(); + HashSet set; + Enumeration words; + for (int i = 0; i < sentences.length; i++) { + set = new HashSet(); + words = plasmaCondenser.wordTokenizer(sentences[i]); + while (words.hasMoreElements()) set.add(plasmaWordIndexEntry.word2hash((String) words.nextElement())); + map.put(new Integer(i), set); + } + return map; } public class distributeIndex {