YaCy can now use the solr index to compute text snippets. This makes search result preparation MUCH faster because document fetching and parsing are no longer necessary.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7943 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 14 years ago
parent 0819e1d397
commit 85a5487d6d

@ -66,6 +66,7 @@ import de.anomic.search.RankingProcess;
import de.anomic.search.ReferenceOrder; import de.anomic.search.ReferenceOrder;
import de.anomic.search.SearchEventCache; import de.anomic.search.SearchEventCache;
import de.anomic.search.Segment; import de.anomic.search.Segment;
import de.anomic.search.Segments;
import de.anomic.search.Switchboard; import de.anomic.search.Switchboard;
import de.anomic.search.SwitchboardConstants; import de.anomic.search.SwitchboardConstants;
import de.anomic.server.serverObjects; import de.anomic.server.serverObjects;
@ -86,7 +87,7 @@ public class IndexControlRWIs_p {
prop.put("keyhash", ""); prop.put("keyhash", "");
prop.put("result", ""); prop.put("result", "");
prop.put("cleanup", post == null || post.containsKey("maxReferencesLimit") ? 1 : 0); prop.put("cleanup", post == null || post.containsKey("maxReferencesLimit") ? 1 : 0);
prop.put("cleanup_solr", sb.solrConnector == null || !sb.getConfigBool("federated.service.solr.indexing.enabled", false) ? 0 : 1); prop.put("cleanup_solr", sb.indexSegments.segment(Segments.Process.LOCALCRAWLING).getSolr() == null || !sb.getConfigBool("federated.service.solr.indexing.enabled", false) ? 0 : 1);
String segmentName = sb.getConfig(SwitchboardConstants.SEGMENT_PUBLIC, "default"); String segmentName = sb.getConfig(SwitchboardConstants.SEGMENT_PUBLIC, "default");
int i = 0; int i = 0;
@ -157,7 +158,7 @@ public class IndexControlRWIs_p {
segment.clear(); segment.clear();
} }
if (post.get("deleteSolr", "").equals("on") && sb.getConfigBool("federated.service.solr.indexing.enabled", false)) try { if (post.get("deleteSolr", "").equals("on") && sb.getConfigBool("federated.service.solr.indexing.enabled", false)) try {
sb.solrConnector.clear(); sb.indexSegments.segment(Segments.Process.LOCALCRAWLING).getSolr().clear();
} catch (final Exception e) { } catch (final Exception e) {
Log.logException(e); Log.logException(e);
} }

@ -33,9 +33,12 @@ import net.yacy.cora.document.UTF8;
import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.services.federated.solr.SolrChardingConnector; import net.yacy.cora.services.federated.solr.SolrChardingConnector;
import net.yacy.cora.services.federated.solr.SolrChardingSelection; import net.yacy.cora.services.federated.solr.SolrChardingSelection;
import net.yacy.cora.services.federated.solr.SolrConnector;
import net.yacy.cora.services.federated.solr.SolrScheme; import net.yacy.cora.services.federated.solr.SolrScheme;
import net.yacy.cora.services.federated.solr.SolrSingleConnector;
import net.yacy.cora.storage.ConfigurationSet; import net.yacy.cora.storage.ConfigurationSet;
import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.logging.Log;
import de.anomic.search.Segments;
import de.anomic.search.Switchboard; import de.anomic.search.Switchboard;
import de.anomic.server.serverObjects; import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch; import de.anomic.server.serverSwitch;
@ -75,8 +78,8 @@ public class IndexFederated_p {
if (solrWasOn) { if (solrWasOn) {
// switch off // switch off
sb.solrConnector.close(); sb.indexSegments.segment(Segments.Process.LOCALCRAWLING).getSolr().close();
sb.solrConnector = null; sb.indexSegments.segment(Segments.Process.LOCALCRAWLING).connectSolr(null);
} }
final SolrScheme scheme = new SolrScheme(new File(env.getDataPath(), "DATA/SETTINGS/" + schemename)); final SolrScheme scheme = new SolrScheme(new File(env.getDataPath(), "DATA/SETTINGS/" + schemename));
@ -85,10 +88,10 @@ public class IndexFederated_p {
// switch on // switch on
final boolean usesolr = sb.getConfigBool("federated.service.solr.indexing.enabled", false) & solrurls.length() > 0; final boolean usesolr = sb.getConfigBool("federated.service.solr.indexing.enabled", false) & solrurls.length() > 0;
try { try {
sb.solrConnector = (usesolr) ? new SolrChardingConnector(solrurls, scheme, SolrChardingSelection.Method.MODULO_HOST_MD5) : null; sb.indexSegments.segment(Segments.Process.LOCALCRAWLING).connectSolr((usesolr) ? new SolrChardingConnector(solrurls, scheme, SolrChardingSelection.Method.MODULO_HOST_MD5) : null);
} catch (final IOException e) { } catch (final IOException e) {
Log.logException(e); Log.logException(e);
sb.solrConnector = null; sb.indexSegments.segment(Segments.Process.LOCALCRAWLING).connectSolr(null);
} }
} }
@ -110,12 +113,13 @@ public class IndexFederated_p {
} }
// show solr host table // show solr host table
if (sb.solrConnector == null) { if (sb.indexSegments.segment(Segments.Process.LOCALCRAWLING).getSolr() == null) {
prop.put("table", 0); prop.put("table", 0);
} else { } else {
prop.put("table", 1); prop.put("table", 1);
final long[] size = sb.solrConnector.getSizeList(); final SolrConnector solr = sb.indexSegments.segment(Segments.Process.LOCALCRAWLING).getSolr();
final String[] urls = sb.solrConnector.getAdminInterfaceList(); final long[] size = (solr instanceof SolrChardingConnector) ? ((SolrChardingConnector) solr).getSizeList() : new long[]{((SolrSingleConnector) solr).getSize()};
final String[] urls = (solr instanceof SolrChardingConnector) ? ((SolrChardingConnector) solr).getAdminInterfaceList() : new String[]{((SolrSingleConnector) solr).getAdminInterface()};
boolean dark = false; boolean dark = false;
for (int i = 0; i < size.length; i++) { for (int i = 0; i < size.length; i++) {
prop.put("table_list_" + i + "_dark", dark ? 1 : 0); dark = !dark; prop.put("table_list_" + i + "_dark", dark ? 1 : 0); dark = !dark;
@ -126,7 +130,7 @@ public class IndexFederated_p {
} }
// write scheme // write scheme
SolrScheme scheme = (sb.solrConnector == null) ? null : sb.solrConnector.getScheme(); SolrScheme scheme = (sb.indexSegments.segment(Segments.Process.LOCALCRAWLING).getSolr() == null) ? null : sb.indexSegments.segment(Segments.Process.LOCALCRAWLING).getSolr().getScheme();
final String schemename = sb.getConfig("federated.service.solr.indexing.schemefile", "solr.keys.default.list"); final String schemename = sb.getConfig("federated.service.solr.indexing.schemefile", "solr.keys.default.list");
if (scheme == null) { if (scheme == null) {
scheme = new SolrScheme(new File(env.getDataPath(), "DATA/SETTINGS/" + schemename)); scheme = new SolrScheme(new File(env.getDataPath(), "DATA/SETTINGS/" + schemename));

@ -62,6 +62,7 @@ public class CrawlQueues {
private static final String ERROR_DB_FILENAME = "urlError3.db"; private static final String ERROR_DB_FILENAME = "urlError3.db";
private static final String DELEGATED_DB_FILENAME = "urlDelegated3.db"; private static final String DELEGATED_DB_FILENAME = "urlDelegated3.db";
private static final Segments.Process PROCESS = Segments.Process.LOCALCRAWLING;
protected Switchboard sb; protected Switchboard sb;
protected Log log; protected Log log;
@ -81,8 +82,8 @@ public class CrawlQueues {
this.log.logConfig("Starting Crawling Management"); this.log.logConfig("Starting Crawling Management");
this.noticeURL = new NoticedURL(queuePath, sb.peers.myBotIDs(), sb.useTailCache, sb.exceed134217727); this.noticeURL = new NoticedURL(queuePath, sb.peers.myBotIDs(), sb.useTailCache, sb.exceed134217727);
FileUtils.deletedelete(new File(queuePath, ERROR_DB_FILENAME)); FileUtils.deletedelete(new File(queuePath, ERROR_DB_FILENAME));
this.errorURL = new ZURL(sb.solrConnector, queuePath, ERROR_DB_FILENAME, false, sb.useTailCache, sb.exceed134217727); this.errorURL = new ZURL(sb.indexSegments.segment(PROCESS).getSolr(), queuePath, ERROR_DB_FILENAME, false, sb.useTailCache, sb.exceed134217727);
this.delegatedURL = new ZURL(sb.solrConnector, queuePath, DELEGATED_DB_FILENAME, true, sb.useTailCache, sb.exceed134217727); this.delegatedURL = new ZURL(sb.indexSegments.segment(PROCESS).getSolr(), queuePath, DELEGATED_DB_FILENAME, true, sb.useTailCache, sb.exceed134217727);
} }
public void relocate(final File newQueuePath) { public void relocate(final File newQueuePath) {
@ -93,8 +94,8 @@ public class CrawlQueues {
this.noticeURL = new NoticedURL(newQueuePath, this.sb.peers.myBotIDs(), this.sb.useTailCache, this.sb.exceed134217727); this.noticeURL = new NoticedURL(newQueuePath, this.sb.peers.myBotIDs(), this.sb.useTailCache, this.sb.exceed134217727);
FileUtils.deletedelete(new File(newQueuePath, ERROR_DB_FILENAME)); FileUtils.deletedelete(new File(newQueuePath, ERROR_DB_FILENAME));
this.errorURL = new ZURL(this.sb.solrConnector, newQueuePath, ERROR_DB_FILENAME, false, this.sb.useTailCache, this.sb.exceed134217727); this.errorURL = new ZURL(this.sb.indexSegments.segment(PROCESS).getSolr(), newQueuePath, ERROR_DB_FILENAME, false, this.sb.useTailCache, this.sb.exceed134217727);
this.delegatedURL = new ZURL(this.sb.solrConnector, newQueuePath, DELEGATED_DB_FILENAME, true, this.sb.useTailCache, this.sb.exceed134217727); this.delegatedURL = new ZURL(this.sb.indexSegments.segment(PROCESS).getSolr(), newQueuePath, DELEGATED_DB_FILENAME, true, this.sb.useTailCache, this.sb.exceed134217727);
} }
public void close() { public void close() {
@ -249,7 +250,7 @@ public class CrawlQueues {
return true; return true;
} }
try { try {
this.sb.indexingDocumentProcessor.enQueue(new indexingQueueEntry(Segments.Process.LOCALCRAWLING, new Response(urlEntry, profile), null, null)); this.sb.indexingDocumentProcessor.enQueue(new indexingQueueEntry(PROCESS, new Response(urlEntry, profile), null, null));
Log.logInfo("CrawlQueues", "placed NOLOAD URL on indexing queue: " + urlEntry.url().toNormalform(true, false)); Log.logInfo("CrawlQueues", "placed NOLOAD URL on indexing queue: " + urlEntry.url().toNormalform(true, false));
} catch (final InterruptedException e) { } catch (final InterruptedException e) {
Log.logException(e); Log.logException(e);

@ -36,6 +36,7 @@ import java.util.concurrent.ConcurrentLinkedQueue;
import net.yacy.cora.document.ASCII; import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.UTF8; import net.yacy.cora.document.UTF8;
import net.yacy.cora.services.federated.solr.SolrChardingConnector; import net.yacy.cora.services.federated.solr.SolrChardingConnector;
import net.yacy.cora.services.federated.solr.SolrConnector;
import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.index.Index; import net.yacy.kelondro.index.Index;
@ -76,10 +77,10 @@ public class ZURL implements Iterable<ZURL.Entry> {
// the class object // the class object
private Index urlIndex; private Index urlIndex;
private final ConcurrentLinkedQueue<byte[]> stack; private final ConcurrentLinkedQueue<byte[]> stack;
private final SolrChardingConnector solrConnector; private final SolrConnector solrConnector;
public ZURL( public ZURL(
final SolrChardingConnector solrConnector, final SolrConnector solrConnector,
final File cachePath, final File cachePath,
final String tablename, final String tablename,
final boolean startWithEmptyFile, final boolean startWithEmptyFile,

@ -31,11 +31,13 @@ import java.util.Iterator;
import java.util.List; import java.util.List;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.protocol.ResponseHeader; import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.cora.ranking.ScoreMap; import net.yacy.cora.ranking.ScoreMap;
import net.yacy.cora.ranking.WeakPriorityBlockingQueue; import net.yacy.cora.ranking.WeakPriorityBlockingQueue;
import net.yacy.cora.ranking.WeakPriorityBlockingQueue.ReverseElement; import net.yacy.cora.ranking.WeakPriorityBlockingQueue.ReverseElement;
import net.yacy.cora.services.federated.solr.SolrConnector;
import net.yacy.cora.services.federated.yacy.CacheStrategy; import net.yacy.cora.services.federated.yacy.CacheStrategy;
import net.yacy.document.Condenser; import net.yacy.document.Condenser;
import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.meta.URIMetadataRow;
@ -46,6 +48,10 @@ import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.EventTracker; import net.yacy.kelondro.util.EventTracker;
import net.yacy.kelondro.util.MemoryControl; import net.yacy.kelondro.util.MemoryControl;
import net.yacy.repository.LoaderDispatcher; import net.yacy.repository.LoaderDispatcher;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import de.anomic.data.WorkTables; import de.anomic.data.WorkTables;
import de.anomic.http.client.Cache; import de.anomic.http.client.Cache;
import de.anomic.yacy.yacySeedDB; import de.anomic.yacy.yacySeedDB;
@ -322,6 +328,7 @@ public class ResultFetcher {
private final int neededResults; private final int neededResults;
private final Pattern snippetPattern; private final Pattern snippetPattern;
private boolean shallrun; private boolean shallrun;
private final SolrConnector solr;
public Worker(final int id, final long maxlifetime, final CacheStrategy cacheStrategy, final Pattern snippetPattern, final int neededResults) { public Worker(final int id, final long maxlifetime, final CacheStrategy cacheStrategy, final Pattern snippetPattern, final int neededResults) {
this.id = id; this.id = id;
@ -331,6 +338,7 @@ public class ResultFetcher {
this.timeout = System.currentTimeMillis() + Math.max(1000, maxlifetime); this.timeout = System.currentTimeMillis() + Math.max(1000, maxlifetime);
this.neededResults = neededResults; this.neededResults = neededResults;
this.shallrun = true; this.shallrun = true;
this.solr = ResultFetcher.this.rankingProcess.getQuery().getSegment().getSolr();
} }
@Override @Override
@ -373,8 +381,18 @@ public class ResultFetcher {
} }
if (ResultFetcher.this.query.filterfailurls && ResultFetcher.this.workTables.failURLsContains(page.hash())) continue; if (ResultFetcher.this.query.filterfailurls && ResultFetcher.this.workTables.failURLsContains(page.hash())) continue;
// in case that we have an attached solr, we load also the solr document
String solrContent = null;
if (this.solr != null) {
SolrDocument sd = null;
final SolrDocumentList sdl = this.solr.get("id:" + ASCII.String(page.hash()), 0, 1);
if (sdl.size() > 0) sd = sdl.get(0);
if (sd != null) solrContent = this.solr.getScheme().solrGetText(sd);
}
loops++; loops++;
resultEntry = fetchSnippet(page, this.cacheStrategy); // does not fetch snippets if snippetMode == 0 resultEntry = fetchSnippet(page, solrContent, this.cacheStrategy); // does not fetch snippets if snippetMode == 0
if (resultEntry == null) continue; // the entry had some problems, cannot be used if (resultEntry == null) continue; // the entry had some problems, cannot be used
rawLine = resultEntry.textSnippet() == null ? null : resultEntry.textSnippet().getLineRaw(); rawLine = resultEntry.textSnippet() == null ? null : resultEntry.textSnippet().getLineRaw();
//System.out.println("***SNIPPET*** raw='" + rawLine + "', pattern='" + this.snippetPattern.toString() + "'"); //System.out.println("***SNIPPET*** raw='" + rawLine + "', pattern='" + this.snippetPattern.toString() + "'");
@ -412,7 +430,7 @@ public class ResultFetcher {
} }
} }
protected ResultEntry fetchSnippet(final URIMetadataRow page, final CacheStrategy cacheStrategy) { protected ResultEntry fetchSnippet(final URIMetadataRow page, final String solrText, final CacheStrategy cacheStrategy) {
// Snippet Fetching can has 3 modes: // Snippet Fetching can has 3 modes:
// 0 - do not fetch snippets // 0 - do not fetch snippets
// 1 - fetch snippets offline only // 1 - fetch snippets offline only
@ -429,6 +447,7 @@ public class ResultFetcher {
if (cacheStrategy == null) { if (cacheStrategy == null) {
final TextSnippet snippet = new TextSnippet( final TextSnippet snippet = new TextSnippet(
null, null,
solrText,
metadata, metadata,
this.snippetFetchWordHashes, this.snippetFetchWordHashes,
null, null,
@ -445,6 +464,7 @@ public class ResultFetcher {
startTime = System.currentTimeMillis(); startTime = System.currentTimeMillis();
final TextSnippet snippet = new TextSnippet( final TextSnippet snippet = new TextSnippet(
this.loader, this.loader,
solrText,
metadata, metadata,
this.snippetFetchWordHashes, this.snippetFetchWordHashes,
cacheStrategy, cacheStrategy,

@ -37,6 +37,7 @@ import java.util.TreeSet;
import net.yacy.cora.document.ASCII; import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.document.UTF8; import net.yacy.cora.document.UTF8;
import net.yacy.cora.services.federated.solr.SolrConnector;
import net.yacy.cora.services.federated.yacy.CacheStrategy; import net.yacy.cora.services.federated.yacy.CacheStrategy;
import net.yacy.document.Condenser; import net.yacy.document.Condenser;
import net.yacy.document.Document; import net.yacy.document.Document;
@ -81,6 +82,7 @@ public class Segment {
protected final IndexCell<WordReference> termIndex; protected final IndexCell<WordReference> termIndex;
//private final IndexCell<NavigationReference> authorNavIndex; //private final IndexCell<NavigationReference> authorNavIndex;
protected final MetadataRepository urlMetadata; protected final MetadataRepository urlMetadata;
private SolrConnector solr;
private final File segmentPath; private final File segmentPath;
public Segment( public Segment(
@ -98,6 +100,7 @@ public class Segment {
this.log = log; this.log = log;
this.segmentPath = segmentPath; this.segmentPath = segmentPath;
this.solr = null;
this.termIndex = new IndexCell<WordReference>( this.termIndex = new IndexCell<WordReference>(
segmentPath, segmentPath,
@ -126,6 +129,14 @@ public class Segment {
this.urlMetadata = new MetadataRepository(segmentPath, "text.urlmd", useTailCache, exceed134217727); this.urlMetadata = new MetadataRepository(segmentPath, "text.urlmd", useTailCache, exceed134217727);
} }
/**
 * Attaches a Solr connector to this segment, replacing any previously
 * attached connector. Passing {@code null} detaches Solr from the segment
 * (callers use this to switch the federated Solr service off).
 *
 * @param solr the connector to use for this segment, or {@code null} to detach
 */
public void connectSolr(final SolrConnector solr) {
this.solr = solr;
}
/**
 * Returns the Solr connector currently attached to this segment.
 *
 * @return the attached connector, or {@code null} if no Solr is connected
 *         (callers must check for {@code null} before use)
 */
public SolrConnector getSolr() {
return this.solr;
}
public static void migrateTextIndex(final File oldSegmentPath, final File newSegmentPath) { public static void migrateTextIndex(final File oldSegmentPath, final File newSegmentPath) {
final File oldCellPath = new File(oldSegmentPath, "RICELL"); final File oldCellPath = new File(oldSegmentPath, "RICELL");
if (!oldCellPath.exists()) return; if (!oldCellPath.exists()) return;
@ -254,6 +265,7 @@ public class Segment {
public void close() { public void close() {
this.termIndex.close(); this.termIndex.close();
this.urlMetadata.close(); this.urlMetadata.close();
if (this.solr != null) this.solr.close();
} }
public URIMetadataRow storeDocument( public URIMetadataRow storeDocument(

@ -38,13 +38,13 @@ import net.yacy.kelondro.rwi.IndexCell;
public class Segments implements Iterable<Segment> { public class Segments implements Iterable<Segment> {
/** /**
* process enumeration type * process enumeration type
* defines constants that can be used to assign process-related segment names * defines constants that can be used to assign process-related segment names
*/ */
public enum Process { public enum Process {
RECEIPTS, RECEIPTS,
QUERIES, QUERIES,
DHTIN, DHTIN,
@ -59,7 +59,7 @@ public class Segments implements Iterable<Segment> {
throw new UnsupportedOperationException("toString not allowed"); throw new UnsupportedOperationException("toString not allowed");
} }
} }
private final Log log; private final Log log;
private final File segmentsPath; private final File segmentsPath;
private final int entityCacheMaxSize; private final int entityCacheMaxSize;
@ -68,7 +68,7 @@ public class Segments implements Iterable<Segment> {
private final HashMap<Process, String> process_assignment; private final HashMap<Process, String> process_assignment;
private final boolean useTailCache; private final boolean useTailCache;
private final boolean exceed134217727; private final boolean exceed134217727;
public Segments( public Segments(
final Log log, final Log log,
final File segmentsPath, final File segmentsPath,
@ -96,41 +96,41 @@ public class Segments implements Iterable<Segment> {
this.process_assignment.put(Process.PUBLIC, "default"); this.process_assignment.put(Process.PUBLIC, "default");
this.process_assignment.put(Process.SURROGATES, "default"); this.process_assignment.put(Process.SURROGATES, "default");
} }
public void setSegment(Process process, String segmentName) { public void setSegment(final Process process, final String segmentName) {
this.process_assignment.put(process, segmentName); this.process_assignment.put(process, segmentName);
} }
public static void migrateOld(File oldSingleSegment, File newSegmentsPath, String newSegmentName) { public static void migrateOld(final File oldSingleSegment, final File newSegmentsPath, final String newSegmentName) {
if (!oldSingleSegment.exists()) return; if (!oldSingleSegment.exists()) return;
File newSegmentPath = new File(newSegmentsPath, newSegmentName); final File newSegmentPath = new File(newSegmentsPath, newSegmentName);
if (!newSegmentPath.exists()) newSegmentPath.mkdirs(); if (!newSegmentPath.exists()) newSegmentPath.mkdirs();
Segment.migrateTextIndex(oldSingleSegment, newSegmentPath); Segment.migrateTextIndex(oldSingleSegment, newSegmentPath);
Segment.migrateTextMetadata(oldSingleSegment, newSegmentPath); Segment.migrateTextMetadata(oldSingleSegment, newSegmentPath);
String[] oldFiles = oldSingleSegment.list(); final String[] oldFiles = oldSingleSegment.list();
for (String oldFile: oldFiles) { for (final String oldFile: oldFiles) {
if (oldFile.startsWith("text.")) { if (oldFile.startsWith("text.")) {
new File(oldSingleSegment, oldFile).renameTo(new File(newSegmentPath, oldFile)); new File(oldSingleSegment, oldFile).renameTo(new File(newSegmentPath, oldFile));
} }
} }
} }
public String[] segmentNames() { public String[] segmentNames() {
return this.segments.keySet().toArray(new String[this.segments.size()]); return this.segments.keySet().toArray(new String[this.segments.size()]);
} }
public boolean segmentExist(final String segmentName) { public boolean segmentExist(final String segmentName) {
return segments.containsKey(segmentName); return this.segments.containsKey(segmentName);
} }
public Segment segment(final Process process) { public Segment segment(final Process process) {
return segment(this.process_assignment.get(process)); return segment(this.process_assignment.get(process));
} }
public Segment segment(final String segmentName) { public Segment segment(final String segmentName) {
if (segments == null) return null; if (this.segments == null) return null;
Segment segment = segments.get(segmentName); Segment segment = this.segments.get(segmentName);
if (segment == null) { if (segment == null) {
// generate the segment // generate the segment
try { try {
@ -141,7 +141,7 @@ public class Segments implements Iterable<Segment> {
this.maxFileSize, this.maxFileSize,
this.useTailCache, this.useTailCache,
this.exceed134217727); this.exceed134217727);
} catch (IOException e) { } catch (final IOException e) {
Log.logException(e); Log.logException(e);
return null; return null;
} }
@ -149,28 +149,28 @@ public class Segments implements Iterable<Segment> {
} }
return segment; return segment;
} }
public long URLCount() { public long URLCount() {
if (this.segments == null) return 0; if (this.segments == null) return 0;
long c = 0; long c = 0;
for (Segment s: this.segments.values()) c += (long) s.urlMetadata().size(); for (final Segment s: this.segments.values()) c += s.urlMetadata().size();
return c; return c;
} }
public long RWICount() { public long RWICount() {
if (this.segments == null) return 0; if (this.segments == null) return 0;
long c = 0; long c = 0;
for (Segment s: this.segments.values()) c += (long) s.termIndex().sizesMax(); for (final Segment s: this.segments.values()) c += s.termIndex().sizesMax();
return c; return c;
} }
public int RWIBufferCount() { public int RWIBufferCount() {
if (this.segments == null) return 0; if (this.segments == null) return 0;
int c = 0; int c = 0;
for (Segment s: this.segments.values()) c += s.termIndex().getBufferSize(); for (final Segment s: this.segments.values()) c += s.termIndex().getBufferSize();
return c; return c;
} }
public MetadataRepository urlMetadata(final Process process) { public MetadataRepository urlMetadata(final Process process) {
return segment(this.process_assignment.get(process)).urlMetadata(); return segment(this.process_assignment.get(process)).urlMetadata();
} }
@ -178,11 +178,11 @@ public class Segments implements Iterable<Segment> {
public IndexCell<WordReference> termIndex(final Process process) { public IndexCell<WordReference> termIndex(final Process process) {
return segment(this.process_assignment.get(process)).termIndex(); return segment(this.process_assignment.get(process)).termIndex();
} }
public void clear(final Process process) { public void clear(final Process process) {
segment(this.process_assignment.get(process)).clear(); segment(this.process_assignment.get(process)).clear();
} }
public File getLocation(final Process process) { public File getLocation(final Process process) {
return segment(this.process_assignment.get(process)).getLocation(); return segment(this.process_assignment.get(process)).getLocation();
} }
@ -190,16 +190,16 @@ public class Segments implements Iterable<Segment> {
public void close(final Process process) { public void close(final Process process) {
segment(this.process_assignment.get(process)).close(); segment(this.process_assignment.get(process)).close();
} }
public void close() { public void close() {
if (segments != null) for (Segment s: this.segments.values()) s.close(); if (this.segments != null) for (final Segment s: this.segments.values()) s.close();
this.segments = null; this.segments = null;
} }
public void finalize() { public void finalize() {
this.close(); this.close();
} }
public synchronized Segment.ReferenceCleaner getReferenceCleaner(final String segmentName, final byte[] startHash) { public synchronized Segment.ReferenceCleaner getReferenceCleaner(final String segmentName, final byte[] startHash) {
return segment(segmentName).getReferenceCleaner(startHash); return segment(segmentName).getReferenceCleaner(startHash);
} }

@ -247,7 +247,6 @@ public final class Switchboard extends serverSwitch {
private final Semaphore shutdownSync = new Semaphore(0); private final Semaphore shutdownSync = new Semaphore(0);
private boolean terminate = false; private boolean terminate = false;
public SolrChardingConnector solrConnector = null;
//private Object crawlingPausedSync = new Object(); //private Object crawlingPausedSync = new Object();
//private boolean crawlingIsPaused = false; //private boolean crawlingIsPaused = false;
@ -592,10 +591,10 @@ public final class Switchboard extends serverSwitch {
final String solrurls = getConfig("federated.service.solr.indexing.url", "http://127.0.0.1:8983/solr"); final String solrurls = getConfig("federated.service.solr.indexing.url", "http://127.0.0.1:8983/solr");
final boolean usesolr = getConfigBool("federated.service.solr.indexing.enabled", false) & solrurls.length() > 0; final boolean usesolr = getConfigBool("federated.service.solr.indexing.enabled", false) & solrurls.length() > 0;
try { try {
this.solrConnector = (usesolr) ? new SolrChardingConnector(solrurls, workingScheme, SolrChardingSelection.Method.MODULO_HOST_MD5) : null; this.indexSegments.segment(Segments.Process.LOCALCRAWLING).connectSolr((usesolr) ? new SolrChardingConnector(solrurls, workingScheme, SolrChardingSelection.Method.MODULO_HOST_MD5) : null);
} catch (final IOException e) { } catch (final IOException e) {
Log.logException(e); Log.logException(e);
this.solrConnector = null; this.indexSegments.segment(Segments.Process.LOCALCRAWLING).connectSolr(null);
} }
// start a loader // start a loader
@ -1314,7 +1313,6 @@ public final class Switchboard extends serverSwitch {
Cache.close(); Cache.close();
this.tables.close(); this.tables.close();
Domains.close(); Domains.close();
if (this.solrConnector != null && getConfigBool("federated.service.solr.indexing.enabled", false)) this.solrConnector.close();
AccessTracker.dumpLog(new File("DATA/LOG/queries.log")); AccessTracker.dumpLog(new File("DATA/LOG/queries.log"));
UPnP.deletePortMapping(); UPnP.deletePortMapping();
Tray.removeTray(); Tray.removeTray();
@ -1989,7 +1987,7 @@ public final class Switchboard extends serverSwitch {
public indexingQueueEntry condenseDocument(final indexingQueueEntry in) { public indexingQueueEntry condenseDocument(final indexingQueueEntry in) {
in.queueEntry.updateStatus(Response.QUEUE_STATE_CONDENSING); in.queueEntry.updateStatus(Response.QUEUE_STATE_CONDENSING);
if (this.solrConnector != null && getConfigBool("federated.service.solr.indexing.enabled", false)/*in.queueEntry.profile().pushSolr()*/) { if (this.indexSegments.segment(Segments.Process.LOCALCRAWLING).getSolr() != null && getConfigBool("federated.service.solr.indexing.enabled", false)/*in.queueEntry.profile().pushSolr()*/) {
// send the documents to solr // send the documents to solr
for (final Document doc: in.documents) { for (final Document doc: in.documents) {
try { try {
@ -2000,7 +1998,7 @@ public final class Switchboard extends serverSwitch {
// in case that this happens it appears that the doc id is the right one // in case that this happens it appears that the doc id is the right one
} }
try { try {
this.solrConnector.add(id, in.queueEntry.getResponseHeader(), doc); this.indexSegments.segment(Segments.Process.LOCALCRAWLING).getSolr().add(id, in.queueEntry.getResponseHeader(), doc);
} catch (final IOException e) { } catch (final IOException e) {
Log.logWarning("SOLR", "failed to send " + in.queueEntry.url().toNormalform(true, false) + " to solr: " + e.getMessage()); Log.logWarning("SOLR", "failed to send " + in.queueEntry.url().toNormalform(true, false) + " to solr: " + e.getMessage());
} }

@ -24,6 +24,7 @@
package de.anomic.search; package de.anomic.search;
import java.io.ByteArrayInputStream;
import java.util.Collection; import java.util.Collection;
import java.util.Comparator; import java.util.Comparator;
import java.util.Iterator; import java.util.Iterator;
@ -34,6 +35,7 @@ import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import net.yacy.cora.document.ASCII; import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.services.federated.yacy.CacheStrategy; import net.yacy.cora.services.federated.yacy.CacheStrategy;
import net.yacy.cora.storage.ARC; import net.yacy.cora.storage.ARC;
import net.yacy.cora.storage.ConcurrentARC; import net.yacy.cora.storage.ConcurrentARC;
@ -140,6 +142,7 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
public TextSnippet( public TextSnippet(
final LoaderDispatcher loader, final LoaderDispatcher loader,
final String solrText,
final URIMetadataRow.Components comp, final URIMetadataRow.Components comp,
final HandleSet queryhashes, final HandleSet queryhashes,
final CacheStrategy cacheStrategy, final CacheStrategy cacheStrategy,
@ -156,7 +159,7 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
} }
// try to get snippet from snippetCache // try to get snippet from snippetCache
ResultClass source = ResultClass.SOURCE_CACHE; final ResultClass source = ResultClass.SOURCE_CACHE;
final String wordhashes = yacySearch.set2string(queryhashes); final String wordhashes = yacySearch.set2string(queryhashes);
final String urls = ASCII.String(url.hash()); final String urls = ASCII.String(url.hash());
String snippetLine = snippetsCache.get(wordhashes, urls); String snippetLine = snippetsCache.get(wordhashes, urls);
@ -165,32 +168,37 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
init(url.hash(), snippetLine, source, null); init(url.hash(), snippetLine, source, null);
return; return;
} }
// try to get the snippet from a document at the cache (or in the web)
// this requires that the document is parsed after loading
String textline = null; String textline = null;
HandleSet remainingHashes = queryhashes; HandleSet remainingHashes = queryhashes;
{ //encapsulate potential expensive sentences { //encapsulate potential expensive sentences
final Collection<StringBuilder> sentences; Collection<StringBuilder> sentences = null;
{ //encapsulate potential expensive document
final Document document = loadDocument(loader, comp, queryhashes, cacheStrategy, url, reindexing, source); // try the solr text first
if (document == null) { if (solrText != null) {
return; // compute sentences from solr query
} sentences = Document.getSentences(pre, new ByteArrayInputStream(UTF8.getBytes(solrText)));
}
/* ===========================================================================
* COMPUTE SNIPPET // if then no sentences are found, we fail-over to get the content from the re-loaded document
* =========================================================================== */ if (sentences == null) {
// we have found a parseable non-empty file: use the lines final Document document = loadDocument(loader, comp, queryhashes, cacheStrategy, url, reindexing, source);
if (document == null) {
// compute snippet from text return;
sentences = document.getSentences(pre); }
document.close();
} //encapsulate potential expensive document END // compute sentences from parsed document
sentences = document.getSentences(pre);
if (sentences == null) { document.close();
init(url.hash(), null, ResultClass.ERROR_PARSER_NO_LINES, "parser returned no sentences");
return; if (sentences == null) {
} init(url.hash(), null, ResultClass.ERROR_PARSER_NO_LINES, "parser returned no sentences");
return;
}
}
try { try {
final SnippetExtractor tsr = new SnippetExtractor(sentences, queryhashes, snippetMaxLength); final SnippetExtractor tsr = new SnippetExtractor(sentences, queryhashes, snippetMaxLength);
textline = tsr.getSnippet(); textline = tsr.getSnippet();
@ -227,7 +235,7 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
// document.close(); // document.close();
init(url.hash(), snippetLine, source, null); init(url.hash(), snippetLine, source, null);
} }
private Document loadDocument( private Document loadDocument(
final LoaderDispatcher loader, final LoaderDispatcher loader,
final URIMetadataRow.Components comp, final URIMetadataRow.Components comp,

@ -34,14 +34,13 @@ import net.yacy.cora.protocol.Domains;
import net.yacy.cora.protocol.ResponseHeader; import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.document.Document; import net.yacy.document.Document;
import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
import org.apache.solr.common.SolrDocument; import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList; import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrInputDocument; import org.apache.solr.common.SolrInputDocument;
public class SolrChardingConnector { public class SolrChardingConnector implements SolrConnector {
private final List<SolrSingleConnector> connectors; private final List<SolrSingleConnector> connectors;
private final SolrScheme scheme; private final SolrScheme scheme;
@ -164,13 +163,7 @@ public class SolrChardingConnector {
final long[] size = new long[this.connectors.size()]; final long[] size = new long[this.connectors.size()];
int i = 0; int i = 0;
for (final SolrSingleConnector connector: this.connectors) { for (final SolrSingleConnector connector: this.connectors) {
try { size[i++] = connector.getSize();
final SolrDocumentList list = connector.get("*:*", 0, 1);
size[i++] = list.getNumFound();
} catch (final Exception e) {
Log.logException(e);
size[i++] = 0;
}
} }
return size; return size;
} }

@ -0,0 +1,99 @@
/**
* SolrConnector
* Copyright 2011 by Michael Peter Christen
* First released 13.09.2011 at http://yacy.net
*
* $LastChangedDate: 2011-04-14 22:05:04 +0200 (Do, 14 Apr 2011) $
* $LastChangedRevision: 7654 $
* $LastChangedBy: orbiter $
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.cora.services.federated.solr;
import java.io.IOException;
import java.util.List;
import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.document.Document;
import net.yacy.kelondro.data.meta.DigestURI;
import org.apache.solr.common.SolrDocumentList;
public interface SolrConnector {

    /**
     * with a scheme the fields of a SolrDocument can be translated to actual data values
     * @return the solr scheme that can translate the SolrDocument
     */
    public SolrScheme getScheme();

    /**
     * close this connector and release the resources it holds
     */
    public void close();

    /**
     * delete everything in the solr index
     * @throws IOException
     */
    public void clear() throws IOException;

    /**
     * delete an entry from solr
     * @param id the url hash of the entry
     * @throws IOException
     */
    public void delete(final String id) throws IOException;

    /**
     * delete a set of entries from solr; entries are identified by their url hash
     * @param ids a list of url hashes
     * @throws IOException
     */
    public void delete(final List<String> ids) throws IOException;

    /**
     * add a YaCy document. This calls the scheme processor to add the document as solr document
     * @param id the url hash of the entry
     * @param header the http response header
     * @param doc the YaCy document
     * @throws IOException
     */
    public void add(final String id, final ResponseHeader header, final Document doc) throws IOException;

    /**
     * register an entry as error document
     * @param digestURI the url of the failed document
     * @param failReason a short description why the document failed
     * @param httpstatus the http status code of the failed fetch
     * @throws IOException
     */
    public void err(final DigestURI digestURI, final String failReason, final int httpstatus) throws IOException;

    /**
     * get a query result from solr
     * to get all results set the query String to "*:*"
     * @param querystring the solr query string
     * @param offset the offset of the first result to return
     * @param count the maximum number of results to return
     * @return the matching documents
     * @throws IOException
     */
    public SolrDocumentList get(final String querystring, final int offset, final int count) throws IOException;

    /**
     * get the size of the index
     * @return number of results if solr is queried with a catch-all pattern
     */
    public long getSize();
}

@ -27,6 +27,8 @@ package net.yacy.cora.services.federated.solr;
import java.io.File; import java.io.File;
import java.net.InetAddress; import java.net.InetAddress;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.Collection; import java.util.Collection;
import java.util.Date; import java.util.Date;
import java.util.Map; import java.util.Map;
@ -44,6 +46,7 @@ import net.yacy.document.parser.html.ContentScraper;
import net.yacy.document.parser.html.ImageEntry; import net.yacy.document.parser.html.ImageEntry;
import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.DigestURI;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrInputDocument; import org.apache.solr.common.SolrInputDocument;
public class SolrScheme extends ConfigurationSet { public class SolrScheme extends ConfigurationSet {
@ -349,6 +352,46 @@ public class SolrScheme extends ConfigurationSet {
return solrdoc; return solrdoc;
} }
/**
 * read the document id (the YaCy url hash) from a solr document
 * @param solr the solr document to read
 * @return the value of the "id" field
 */
public String solrGetID(final SolrDocument solr) {
    return (String) solr.getFieldValue("id");
}
/**
 * read the document url from a solr document
 * @param solr the solr document to read
 * @return the url stored in the "sku" field, or null if that value is not a well-formed URL
 */
public DigestURI solrGetURL(final SolrDocument solr) {
    try {
        return new DigestURI((String) solr.getFieldValue("sku"));
    } catch (final MalformedURLException e) {
        // a broken stored url is mapped to null instead of propagating the exception
        return null;
    }
}
/**
 * read the document title from a solr document
 * @param solr the solr document to read
 * @return the value of the "title" field
 */
public String solrGetTitle(final SolrDocument solr) {
    return (String) solr.getFieldValue("title");
}
/**
 * read the full document text from a solr document;
 * this is the text that snippet computation can use instead of re-loading the document
 * @param solr the solr document to read
 * @return the value of the "text_t" field
 */
public String solrGetText(final SolrDocument solr) {
    return (String) solr.getFieldValue("text_t");
}
/**
 * read the document author from a solr document
 * @param solr the solr document to read
 * @return the value of the "author" field
 */
public String solrGetAuthor(final SolrDocument solr) {
    return (String) solr.getFieldValue("author");
}
/**
 * read the document description from a solr document
 * @param solr the solr document to read
 * @return the value of the "description" field
 */
public String solrGetDescription(final SolrDocument solr) {
    return (String) solr.getFieldValue("description");
}
/**
 * read the last-modified date from a solr document
 * @param solr the solr document to read
 * @return the value of the "last_modified" field
 *         NOTE(review): assumes the field is stored as a Date object;
 *         a differently-typed value would cause a ClassCastException — confirm against the scheme
 */
public Date solrGetDate(final SolrDocument solr) {
    return (Date) solr.getFieldValue("last_modified");
}
/**
 * read the keywords of a solr document as a collection of strings
 * @param solr the solr document to read
 * @return the values of the "keywords" field; an empty list if the field is not present
 */
public Collection<String> solrGetKeywords(final SolrDocument solr) {
    final ArrayList<String> a = new ArrayList<String>();
    // SolrDocument.getFieldValues returns null for an absent field;
    // the unguarded loop would throw a NullPointerException in that case
    final Collection<Object> c = solr.getFieldValues("keywords");
    if (c == null) return a;
    for (final Object s: c) {
        a.add((String) s);
    }
    return a;
}
/* /*
* standard solr scheme * standard solr scheme

@ -57,7 +57,7 @@ import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrInputDocument; import org.apache.solr.common.SolrInputDocument;
public class SolrSingleConnector { public class SolrSingleConnector implements SolrConnector {
private final String solrurl, host, solrpath, solraccount, solrpw; private final String solrurl, host, solrpath, solraccount, solrpw;
private final int port; private final int port;
@ -178,6 +178,22 @@ public class SolrSingleConnector {
} }
} }
/**
 * with a scheme the fields of a SolrDocument can be translated to actual data values
 * @return the solr scheme used by this connector
 */
@Override
public SolrScheme getScheme() {
    return this.scheme;
}
/**
 * get the size of the index by issuing a catch-all query and reading
 * the total hit count from the result
 * @return the number of documents in the index, or 0 if the query fails
 */
@Override
public long getSize() {
    try {
        final SolrDocumentList list = get("*:*", 0, 1);
        return list.getNumFound();
    } catch (final Exception e) {
        // deliberate best-effort: a failing solr query is logged and
        // reported as an empty index instead of propagating the error
        Log.logException(e);
        return 0;
    }
}
/** /**
* delete everything in the solr index * delete everything in the solr index
* @throws IOException * @throws IOException
@ -325,6 +341,16 @@ public class SolrSingleConnector {
//return result; //return result;
} }
/**
 * compute the URL of the solr admin web interface, rewriting a loopback
 * host reference ("localhost" or "127.0.0.1") in the configured solr url
 * to this peer's public local IP so the link also works from other hosts
 * @return the admin interface URL, always ending with "admin/"
 */
public String getAdminInterface() {
    final InetAddress localhostExternAddress = Domains.myPublicLocalIP();
    final String localhostExtern = localhostExternAddress == null ? "127.0.0.1" : localhostExternAddress.getHostAddress();
    String u = this.solrurl;
    // remember which loopback literal matched so exactly its length is replaced;
    // the original hard-coded length 9 only worked because both literals happen to be 9 chars
    String loopback = "localhost";
    int p = u.indexOf(loopback);
    if (p < 0) {
        loopback = "127.0.0.1";
        p = u.indexOf(loopback);
    }
    if (p >= 0) u = u.substring(0, p) + localhostExtern + u.substring(p + loopback.length());
    return u + (u.endsWith("/") ? "admin/" : "/admin/");
}
public static void main(final String args[]) { public static void main(final String args[]) {
SolrSingleConnector solr; SolrSingleConnector solr;
try { try {
@ -347,5 +373,4 @@ public class SolrSingleConnector {
e.printStackTrace(); e.printStackTrace();
} }
} }
} }

@ -312,8 +312,12 @@ dc_rights
} }
public List<StringBuilder> getSentences(final boolean pre) { public List<StringBuilder> getSentences(final boolean pre) {
if (this.text == null) return null; return getSentences(pre, getText());
final SentenceReader e = new SentenceReader(getText()); }
public static List<StringBuilder> getSentences(final boolean pre, final InputStream text) {
if (text == null) return null;
final SentenceReader e = new SentenceReader(text);
e.pre(pre); e.pre(pre);
final List<StringBuilder> sentences = new ArrayList<StringBuilder>(); final List<StringBuilder> sentences = new ArrayList<StringBuilder>();
while (e.hasNext()) { while (e.hasNext()) {

Loading…
Cancel
Save