@@ -117,6 +117,7 @@ import java.util.HashSet;
 import java.util.Iterator;
 import java.util.LinkedList;
 import java.util.Map;
+import java.util.TreeMap;
 import java.util.Set;
 import java.util.TreeSet;
 import java.util.Vector;
@@ -129,6 +130,7 @@ import de.anomic.http.httpHeader;
 import de.anomic.http.httpc;
 import de.anomic.kelondro.kelondroException;
 import de.anomic.kelondro.kelondroMSetTools;
+import de.anomic.kelondro.kelondroMScoreCluster;
 import de.anomic.kelondro.kelondroTables;
 import de.anomic.server.serverAbstractSwitch;
 import de.anomic.server.serverCodings;
@@ -174,6 +176,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
     public wikiBoard wikiDB;
     public String remoteProxyHost;
     public int remoteProxyPort;
+    public boolean remoteProxyUse;
     public plasmaCrawlProfile profiles;
     public plasmaCrawlProfile.entry defaultProxyProfile;
     public plasmaCrawlProfile.entry defaultRemoteProfile;
@@ -205,7 +208,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
         } catch (NumberFormatException e) {
             remoteProxyPort = 3128;
         }
-        if (!(getConfig("remoteProxyUse", "false").equals("true"))) {
+        if (getConfig("remoteProxyUse", "false").equals("true")) {
+            remoteProxyUse = true;
+        } else {
+            remoteProxyUse = false;
             remoteProxyHost = null;
             remoteProxyPort = 0;
         }
@@ -340,11 +346,12 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
 
         // init migration from 0.37 -> 0.38
         classicCache = new plasmaWordIndexClassicCacheMigration(plasmaPath, wordIndex);
-        setConfig("99_indexcachemigration_idlesleep" , 10000);
-        setConfig("99_indexcachemigration_busysleep" , 40);
-        deployThread("99_indexcachemigration", "index cache migration", "migration of index cache data structures 0.37 -> 0.38",
-                     new serverInstantThread(classicCache, "oneStepMigration", "size"), 30000);
-
+        if (classicCache.size() > 0) {
+            setConfig("99_indexcachemigration_idlesleep" , 10000);
+            setConfig("99_indexcachemigration_busysleep" , 40);
+            deployThread("99_indexcachemigration", "index cache migration", "migration of index cache data structures 0.37 -> 0.38",
+                         new serverInstantThread(classicCache, "oneStepMigration", "size"), 30000);
+        }
     }
 
     private static String ppRamString(int bytes) {
@@ -1211,12 +1218,21 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
                 */
                 //addScoreForked(ref, gs, descr.split(" "));
                 //addScoreForked(ref, gs, urlstring.split("/"));
+                String snipplet;
                 if (urlstring.matches(urlmask)) { //.* is default
                     prop.put("results_" + i + "_description", descr);
                     prop.put("results_" + i + "_url", urlstring);
                     prop.put("results_" + i + "_urlname", urlname);
                     prop.put("results_" + i + "_date", dateString(urlentry.moddate()));
                     prop.put("results_" + i + "_size", Long.toString(urlentry.size()));
prop.put("results_" + i + "_size", Long.toString(urlentry.size()));
|
|
|
|
|
+                    snipplet = getSnipplet(url, false, querywords, false);
+                    if ((snipplet == null) || (snipplet.length() < 10)) {
+                        prop.put("results_" + i + "_snipplet", 0);
+                        prop.put("results_" + i + "_snipplet_text", "");
+                    } else {
+                        prop.put("results_" + i + "_snipplet", 1);
+                        prop.put("results_" + i + "_snipplet_text", snipplet);
+                    }
                     i++;
                 }
             }
@@ -1283,9 +1299,15 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
         String resource = "";
         //plasmaIndexEntry pie;
         plasmaCrawlLURL.entry urlentry;
+        String snipplet;
         while ((acc.hasMoreElements()) && (i < count)) {
             urlentry = acc.nextElement();
-            resource = urlentry.toString();
+            snipplet = getSnipplet(urlentry.url(), false, hashes, true);
+            if ((snipplet == null) || (snipplet.length() < 10)) {
+                resource = urlentry.toString();
+            } else {
+                resource = urlentry.toString(snipplet);
+            }
             if (resource != null) {
                 links.append("resource").append(i).append("=").append(resource).append(serverCore.crlfString);
                 i++;
@@ -1352,7 +1374,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
         URL url = entry.url();
         if (url == null) return 0;
         // get set of words
-        Set words = plasmaCondenser.getWords(getText(getResource(url, fetchOnline)));
+        //Set words = plasmaCondenser.getWords(getText(getResource(url, fetchOnline)));
+        Set words = plasmaCondenser.getWords(getDocument(url, fetchOnline).getText());
         // delete all word references
         int count = removeReferences(urlhash, words);
         // finally delete the url entry itself
@@ -1380,13 +1403,18 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
     }
 
     private byte[] getResource(URL url, boolean fetchOnline) {
-        byte[] resource = null;
-        // first load from cache
-        resource = getResourceFromCache(url);
-        // if not succeeded then load from web
-        if ((fetchOnline) && (resource == null)) resource = getResourceFromWeb(url);
-        // the result
-        return resource;
+        // load the url as resource from the web
+        try {
+            //return httpc.singleGET(url, 5000, null, null, remoteProxyHost, remoteProxyPort);
+            byte[] resource = getResourceFromCache(url);
+            if ((fetchOnline) && (resource == null)) {
+                loadResourceFromWeb(url, 5000);
+                resource = getResourceFromCache(url);
+            }
+            return resource;
+        } catch (IOException e) {
+            return null;
+        }
     }
 
     private byte[] getResourceFromCache(URL url) {
@@ -1394,33 +1422,89 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
         String path = htmlFilterContentScraper.urlNormalform(url).substring(6);
         File cache = new File(getRootPath(), getConfig("proxyCache", "DATA/HTCACHE"));
         File f = new File(cache, path);
-        try {
+        if (f.exists()) try {
             return serverFileUtils.read(f);
         } catch (IOException e) {
             return null;
+        } else {
+            return null;
         }
     }
 
-    private byte[] getResourceFromWeb(URL url) {
-        // load the url as resource from the web
-        try {
-            return httpc.singleGET(url, 5000, null, null, remoteProxyHost, remoteProxyPort);
-        } catch (IOException e) {
-            return null;
-        }
-    }
+    private void loadResourceFromWeb(URL url, int socketTimeout) throws IOException {
+        plasmaCrawlWorker.load(
+                url,
+                null,
+                null,
+                0,
+                null,
+                socketTimeout,
+                remoteProxyHost,
+                remoteProxyPort,
+                remoteProxyUse,
+                cacheManager,
+                log);
+    }
 
-    private static byte[] getText(byte[] resource) {
-        // generate word list from resource
-        htmlFilterContentScraper scraper = new htmlFilterContentScraper(null);
-        OutputStream os = new htmlFilterOutputStream(null, scraper, null, false);
+    private plasmaParserDocument getDocument(URL url, boolean fetchOnline) {
+        byte[] resource = getResource(url, fetchOnline);
+        if (resource == null) return null;
+        httpHeader header = null;
         try {
-            serverFileUtils.write(resource, os);
-            return scraper.getText();
+            header = cacheManager.getCachedResponse(plasmaURL.urlHash(url));
         } catch (IOException e) {
             return null;
         }
+        if (header == null) return null;
+        if (plasmaParser.supportedMimeTypesContains(header.mime())) {
+            return parser.parseSource(url, header.mime(), resource);
+        } else {
+            return null;
+        }
     }
 
+    private String getSnipplet(URL url, boolean fetchOnline, Set query, boolean queryAreHashes) {
+        if (query.size() == 0) return null;
+        plasmaParserDocument document = getDocument(url, fetchOnline);
+        if (document == null) return null;
+        String[] sentences = document.getSentences();
+        //System.out.println("----" + url.toString()); for (int l = 0; l < sentences.length; l++) System.out.println(sentences[l]);
+        if ((sentences == null) || (sentences.length == 0)) return null;
+        TreeMap sentencematrix = hashMatrix(sentences);
+        if (!(queryAreHashes)) query = plasmaSearch.words2hashes(query);
+        Iterator i = query.iterator();
+        String hash;
+        kelondroMScoreCluster hitTable = new kelondroMScoreCluster();
+        Iterator j;
+        Integer sentencenumber;
+        Map.Entry entry;
+        while (i.hasNext()) {
+            hash = (String) i.next();
+            j = sentencematrix.entrySet().iterator();
+            while (j.hasNext()) {
+                entry = (Map.Entry) j.next();
+                sentencenumber = (Integer) entry.getKey();
+                if (((HashSet) entry.getValue()).contains(hash)) hitTable.addScore(sentencenumber, sentences[sentencenumber.intValue()].length());
+            }
+        }
+        Integer maxLine = (Integer) hitTable.getMaxObject();
+        if (maxLine == null) return null;
+        String snipplet = sentences[maxLine.intValue()];
+        if (snipplet.length() > 140) return null;
+        return snipplet;
+    }
+
+    private TreeMap hashMatrix(String[] sentences) {
+        TreeMap map = new TreeMap();
+        HashSet set;
+        Enumeration words;
+        for (int i = 0; i < sentences.length; i++) {
+            set = new HashSet();
+            words = plasmaCondenser.wordTokenizer(sentences[i]);
+            while (words.hasMoreElements()) set.add(plasmaWordIndexEntry.word2hash((String) words.nextElement()));
+            map.put(new Integer(i), set);
+        }
+        return map;
+    }
+
     public class distributeIndex {