YaCy can now use the solr index to compute text snippets. This makes search result preparation MUCH faster because no document fetching and parsing is necessary any more.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7943 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 14 years ago
parent 0819e1d397
commit 85a5487d6d

@ -66,6 +66,7 @@ import de.anomic.search.RankingProcess;
import de.anomic.search.ReferenceOrder;
import de.anomic.search.SearchEventCache;
import de.anomic.search.Segment;
import de.anomic.search.Segments;
import de.anomic.search.Switchboard;
import de.anomic.search.SwitchboardConstants;
import de.anomic.server.serverObjects;
@ -86,7 +87,7 @@ public class IndexControlRWIs_p {
prop.put("keyhash", "");
prop.put("result", "");
prop.put("cleanup", post == null || post.containsKey("maxReferencesLimit") ? 1 : 0);
prop.put("cleanup_solr", sb.solrConnector == null || !sb.getConfigBool("federated.service.solr.indexing.enabled", false) ? 0 : 1);
prop.put("cleanup_solr", sb.indexSegments.segment(Segments.Process.LOCALCRAWLING).getSolr() == null || !sb.getConfigBool("federated.service.solr.indexing.enabled", false) ? 0 : 1);
String segmentName = sb.getConfig(SwitchboardConstants.SEGMENT_PUBLIC, "default");
int i = 0;
@ -157,7 +158,7 @@ public class IndexControlRWIs_p {
segment.clear();
}
if (post.get("deleteSolr", "").equals("on") && sb.getConfigBool("federated.service.solr.indexing.enabled", false)) try {
sb.solrConnector.clear();
sb.indexSegments.segment(Segments.Process.LOCALCRAWLING).getSolr().clear();
} catch (final Exception e) {
Log.logException(e);
}

@ -33,9 +33,12 @@ import net.yacy.cora.document.UTF8;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.services.federated.solr.SolrChardingConnector;
import net.yacy.cora.services.federated.solr.SolrChardingSelection;
import net.yacy.cora.services.federated.solr.SolrConnector;
import net.yacy.cora.services.federated.solr.SolrScheme;
import net.yacy.cora.services.federated.solr.SolrSingleConnector;
import net.yacy.cora.storage.ConfigurationSet;
import net.yacy.kelondro.logging.Log;
import de.anomic.search.Segments;
import de.anomic.search.Switchboard;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
@ -75,8 +78,8 @@ public class IndexFederated_p {
if (solrWasOn) {
// switch off
sb.solrConnector.close();
sb.solrConnector = null;
sb.indexSegments.segment(Segments.Process.LOCALCRAWLING).getSolr().close();
sb.indexSegments.segment(Segments.Process.LOCALCRAWLING).connectSolr(null);
}
final SolrScheme scheme = new SolrScheme(new File(env.getDataPath(), "DATA/SETTINGS/" + schemename));
@ -85,10 +88,10 @@ public class IndexFederated_p {
// switch on
final boolean usesolr = sb.getConfigBool("federated.service.solr.indexing.enabled", false) & solrurls.length() > 0;
try {
sb.solrConnector = (usesolr) ? new SolrChardingConnector(solrurls, scheme, SolrChardingSelection.Method.MODULO_HOST_MD5) : null;
sb.indexSegments.segment(Segments.Process.LOCALCRAWLING).connectSolr((usesolr) ? new SolrChardingConnector(solrurls, scheme, SolrChardingSelection.Method.MODULO_HOST_MD5) : null);
} catch (final IOException e) {
Log.logException(e);
sb.solrConnector = null;
sb.indexSegments.segment(Segments.Process.LOCALCRAWLING).connectSolr(null);
}
}
@ -110,12 +113,13 @@ public class IndexFederated_p {
}
// show solr host table
if (sb.solrConnector == null) {
if (sb.indexSegments.segment(Segments.Process.LOCALCRAWLING).getSolr() == null) {
prop.put("table", 0);
} else {
prop.put("table", 1);
final long[] size = sb.solrConnector.getSizeList();
final String[] urls = sb.solrConnector.getAdminInterfaceList();
final SolrConnector solr = sb.indexSegments.segment(Segments.Process.LOCALCRAWLING).getSolr();
final long[] size = (solr instanceof SolrChardingConnector) ? ((SolrChardingConnector) solr).getSizeList() : new long[]{((SolrSingleConnector) solr).getSize()};
final String[] urls = (solr instanceof SolrChardingConnector) ? ((SolrChardingConnector) solr).getAdminInterfaceList() : new String[]{((SolrSingleConnector) solr).getAdminInterface()};
boolean dark = false;
for (int i = 0; i < size.length; i++) {
prop.put("table_list_" + i + "_dark", dark ? 1 : 0); dark = !dark;
@ -126,7 +130,7 @@ public class IndexFederated_p {
}
// write scheme
SolrScheme scheme = (sb.solrConnector == null) ? null : sb.solrConnector.getScheme();
SolrScheme scheme = (sb.indexSegments.segment(Segments.Process.LOCALCRAWLING).getSolr() == null) ? null : sb.indexSegments.segment(Segments.Process.LOCALCRAWLING).getSolr().getScheme();
final String schemename = sb.getConfig("federated.service.solr.indexing.schemefile", "solr.keys.default.list");
if (scheme == null) {
scheme = new SolrScheme(new File(env.getDataPath(), "DATA/SETTINGS/" + schemename));

@ -62,6 +62,7 @@ public class CrawlQueues {
private static final String ERROR_DB_FILENAME = "urlError3.db";
private static final String DELEGATED_DB_FILENAME = "urlDelegated3.db";
private static final Segments.Process PROCESS = Segments.Process.LOCALCRAWLING;
protected Switchboard sb;
protected Log log;
@ -81,8 +82,8 @@ public class CrawlQueues {
this.log.logConfig("Starting Crawling Management");
this.noticeURL = new NoticedURL(queuePath, sb.peers.myBotIDs(), sb.useTailCache, sb.exceed134217727);
FileUtils.deletedelete(new File(queuePath, ERROR_DB_FILENAME));
this.errorURL = new ZURL(sb.solrConnector, queuePath, ERROR_DB_FILENAME, false, sb.useTailCache, sb.exceed134217727);
this.delegatedURL = new ZURL(sb.solrConnector, queuePath, DELEGATED_DB_FILENAME, true, sb.useTailCache, sb.exceed134217727);
this.errorURL = new ZURL(sb.indexSegments.segment(PROCESS).getSolr(), queuePath, ERROR_DB_FILENAME, false, sb.useTailCache, sb.exceed134217727);
this.delegatedURL = new ZURL(sb.indexSegments.segment(PROCESS).getSolr(), queuePath, DELEGATED_DB_FILENAME, true, sb.useTailCache, sb.exceed134217727);
}
public void relocate(final File newQueuePath) {
@ -93,8 +94,8 @@ public class CrawlQueues {
this.noticeURL = new NoticedURL(newQueuePath, this.sb.peers.myBotIDs(), this.sb.useTailCache, this.sb.exceed134217727);
FileUtils.deletedelete(new File(newQueuePath, ERROR_DB_FILENAME));
this.errorURL = new ZURL(this.sb.solrConnector, newQueuePath, ERROR_DB_FILENAME, false, this.sb.useTailCache, this.sb.exceed134217727);
this.delegatedURL = new ZURL(this.sb.solrConnector, newQueuePath, DELEGATED_DB_FILENAME, true, this.sb.useTailCache, this.sb.exceed134217727);
this.errorURL = new ZURL(this.sb.indexSegments.segment(PROCESS).getSolr(), newQueuePath, ERROR_DB_FILENAME, false, this.sb.useTailCache, this.sb.exceed134217727);
this.delegatedURL = new ZURL(this.sb.indexSegments.segment(PROCESS).getSolr(), newQueuePath, DELEGATED_DB_FILENAME, true, this.sb.useTailCache, this.sb.exceed134217727);
}
public void close() {
@ -249,7 +250,7 @@ public class CrawlQueues {
return true;
}
try {
this.sb.indexingDocumentProcessor.enQueue(new indexingQueueEntry(Segments.Process.LOCALCRAWLING, new Response(urlEntry, profile), null, null));
this.sb.indexingDocumentProcessor.enQueue(new indexingQueueEntry(PROCESS, new Response(urlEntry, profile), null, null));
Log.logInfo("CrawlQueues", "placed NOLOAD URL on indexing queue: " + urlEntry.url().toNormalform(true, false));
} catch (final InterruptedException e) {
Log.logException(e);

@ -36,6 +36,7 @@ import java.util.concurrent.ConcurrentLinkedQueue;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.services.federated.solr.SolrChardingConnector;
import net.yacy.cora.services.federated.solr.SolrConnector;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.index.Index;
@ -76,10 +77,10 @@ public class ZURL implements Iterable<ZURL.Entry> {
// the class object
private Index urlIndex;
private final ConcurrentLinkedQueue<byte[]> stack;
private final SolrChardingConnector solrConnector;
private final SolrConnector solrConnector;
public ZURL(
final SolrChardingConnector solrConnector,
final SolrConnector solrConnector,
final File cachePath,
final String tablename,
final boolean startWithEmptyFile,

@ -31,11 +31,13 @@ import java.util.Iterator;
import java.util.List;
import java.util.regex.Pattern;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.cora.ranking.ScoreMap;
import net.yacy.cora.ranking.WeakPriorityBlockingQueue;
import net.yacy.cora.ranking.WeakPriorityBlockingQueue.ReverseElement;
import net.yacy.cora.services.federated.solr.SolrConnector;
import net.yacy.cora.services.federated.yacy.CacheStrategy;
import net.yacy.document.Condenser;
import net.yacy.kelondro.data.meta.URIMetadataRow;
@ -46,6 +48,10 @@ import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.EventTracker;
import net.yacy.kelondro.util.MemoryControl;
import net.yacy.repository.LoaderDispatcher;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import de.anomic.data.WorkTables;
import de.anomic.http.client.Cache;
import de.anomic.yacy.yacySeedDB;
@ -322,6 +328,7 @@ public class ResultFetcher {
private final int neededResults;
private final Pattern snippetPattern;
private boolean shallrun;
private final SolrConnector solr;
public Worker(final int id, final long maxlifetime, final CacheStrategy cacheStrategy, final Pattern snippetPattern, final int neededResults) {
this.id = id;
@ -331,6 +338,7 @@ public class ResultFetcher {
this.timeout = System.currentTimeMillis() + Math.max(1000, maxlifetime);
this.neededResults = neededResults;
this.shallrun = true;
this.solr = ResultFetcher.this.rankingProcess.getQuery().getSegment().getSolr();
}
@Override
@ -373,8 +381,18 @@ public class ResultFetcher {
}
if (ResultFetcher.this.query.filterfailurls && ResultFetcher.this.workTables.failURLsContains(page.hash())) continue;
// in case that we have an attached solr, we load also the solr document
String solrContent = null;
if (this.solr != null) {
SolrDocument sd = null;
final SolrDocumentList sdl = this.solr.get("id:" + ASCII.String(page.hash()), 0, 1);
if (sdl.size() > 0) sd = sdl.get(0);
if (sd != null) solrContent = this.solr.getScheme().solrGetText(sd);
}
loops++;
resultEntry = fetchSnippet(page, this.cacheStrategy); // does not fetch snippets if snippetMode == 0
resultEntry = fetchSnippet(page, solrContent, this.cacheStrategy); // does not fetch snippets if snippetMode == 0
if (resultEntry == null) continue; // the entry had some problems, cannot be used
rawLine = resultEntry.textSnippet() == null ? null : resultEntry.textSnippet().getLineRaw();
//System.out.println("***SNIPPET*** raw='" + rawLine + "', pattern='" + this.snippetPattern.toString() + "'");
@ -412,7 +430,7 @@ public class ResultFetcher {
}
}
protected ResultEntry fetchSnippet(final URIMetadataRow page, final CacheStrategy cacheStrategy) {
protected ResultEntry fetchSnippet(final URIMetadataRow page, final String solrText, final CacheStrategy cacheStrategy) {
// Snippet Fetching can has 3 modes:
// 0 - do not fetch snippets
// 1 - fetch snippets offline only
@ -429,6 +447,7 @@ public class ResultFetcher {
if (cacheStrategy == null) {
final TextSnippet snippet = new TextSnippet(
null,
solrText,
metadata,
this.snippetFetchWordHashes,
null,
@ -445,6 +464,7 @@ public class ResultFetcher {
startTime = System.currentTimeMillis();
final TextSnippet snippet = new TextSnippet(
this.loader,
solrText,
metadata,
this.snippetFetchWordHashes,
cacheStrategy,

@ -37,6 +37,7 @@ import java.util.TreeSet;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.services.federated.solr.SolrConnector;
import net.yacy.cora.services.federated.yacy.CacheStrategy;
import net.yacy.document.Condenser;
import net.yacy.document.Document;
@ -81,6 +82,7 @@ public class Segment {
protected final IndexCell<WordReference> termIndex;
//private final IndexCell<NavigationReference> authorNavIndex;
protected final MetadataRepository urlMetadata;
private SolrConnector solr;
private final File segmentPath;
public Segment(
@ -98,6 +100,7 @@ public class Segment {
this.log = log;
this.segmentPath = segmentPath;
this.solr = null;
this.termIndex = new IndexCell<WordReference>(
segmentPath,
@ -126,6 +129,14 @@ public class Segment {
this.urlMetadata = new MetadataRepository(segmentPath, "text.urlmd", useTailCache, exceed134217727);
}
/**
 * Attach a solr connector to this segment, or detach the current one by passing null.
 * Only the reference is replaced: a connector that was attached before is NOT closed
 * here, so a caller that replaces a connector has to close the old one itself.
 * @param solr the connector to attach; null detaches
 */
public void connectSolr(final SolrConnector solr) {
this.solr = solr;
}
/**
 * @return the solr connector that was attached with connectSolr, or null if none is attached
 */
public SolrConnector getSolr() {
return this.solr;
}
public static void migrateTextIndex(final File oldSegmentPath, final File newSegmentPath) {
final File oldCellPath = new File(oldSegmentPath, "RICELL");
if (!oldCellPath.exists()) return;
@ -254,6 +265,7 @@ public class Segment {
/**
 * Close the term index, then the url metadata repository and finally,
 * if one is attached, the solr connector of this segment.
 */
public void close() {
    this.termIndex.close();
    this.urlMetadata.close();
    // a segment without an attached solr connector has nothing more to close
    if (this.solr == null) return;
    this.solr.close();
}
public URIMetadataRow storeDocument(

@ -38,13 +38,13 @@ import net.yacy.kelondro.rwi.IndexCell;
public class Segments implements Iterable<Segment> {
/**
* process enumeration type
* defines constants that can be used to assign process-related segment names
*/
public enum Process {
RECEIPTS,
QUERIES,
DHTIN,
@ -59,7 +59,7 @@ public class Segments implements Iterable<Segment> {
throw new UnsupportedOperationException("toString not allowed");
}
}
private final Log log;
private final File segmentsPath;
private final int entityCacheMaxSize;
@ -68,7 +68,7 @@ public class Segments implements Iterable<Segment> {
private final HashMap<Process, String> process_assignment;
private final boolean useTailCache;
private final boolean exceed134217727;
public Segments(
final Log log,
final File segmentsPath,
@ -96,41 +96,41 @@ public class Segments implements Iterable<Segment> {
this.process_assignment.put(Process.PUBLIC, "default");
this.process_assignment.put(Process.SURROGATES, "default");
}
public void setSegment(Process process, String segmentName) {
public void setSegment(final Process process, final String segmentName) {
this.process_assignment.put(process, segmentName);
}
public static void migrateOld(File oldSingleSegment, File newSegmentsPath, String newSegmentName) {
public static void migrateOld(final File oldSingleSegment, final File newSegmentsPath, final String newSegmentName) {
if (!oldSingleSegment.exists()) return;
File newSegmentPath = new File(newSegmentsPath, newSegmentName);
final File newSegmentPath = new File(newSegmentsPath, newSegmentName);
if (!newSegmentPath.exists()) newSegmentPath.mkdirs();
Segment.migrateTextIndex(oldSingleSegment, newSegmentPath);
Segment.migrateTextMetadata(oldSingleSegment, newSegmentPath);
String[] oldFiles = oldSingleSegment.list();
for (String oldFile: oldFiles) {
final String[] oldFiles = oldSingleSegment.list();
for (final String oldFile: oldFiles) {
if (oldFile.startsWith("text.")) {
new File(oldSingleSegment, oldFile).renameTo(new File(newSegmentPath, oldFile));
}
}
}
/**
 * @return the names (keys) of all segments that are currently registered, as a new array
 */
public String[] segmentNames() {
    final java.util.Set<String> names = this.segments.keySet();
    return names.toArray(new String[this.segments.size()]);
}
public boolean segmentExist(final String segmentName) {
return segments.containsKey(segmentName);
return this.segments.containsKey(segmentName);
}
/**
 * Get the segment that is assigned to a process.
 * @param process the process type
 * @return the result of segment(String) for the segment name assigned to the process
 */
public Segment segment(final Process process) {
    final String assignedName = this.process_assignment.get(process);
    return segment(assignedName);
}
public Segment segment(final String segmentName) {
if (segments == null) return null;
Segment segment = segments.get(segmentName);
if (this.segments == null) return null;
Segment segment = this.segments.get(segmentName);
if (segment == null) {
// generate the segment
try {
@ -141,7 +141,7 @@ public class Segments implements Iterable<Segment> {
this.maxFileSize,
this.useTailCache,
this.exceed134217727);
} catch (IOException e) {
} catch (final IOException e) {
Log.logException(e);
return null;
}
@ -149,28 +149,28 @@ public class Segments implements Iterable<Segment> {
}
return segment;
}
public long URLCount() {
if (this.segments == null) return 0;
long c = 0;
for (Segment s: this.segments.values()) c += (long) s.urlMetadata().size();
for (final Segment s: this.segments.values()) c += s.urlMetadata().size();
return c;
}
public long RWICount() {
if (this.segments == null) return 0;
long c = 0;
for (Segment s: this.segments.values()) c += (long) s.termIndex().sizesMax();
for (final Segment s: this.segments.values()) c += s.termIndex().sizesMax();
return c;
}
public int RWIBufferCount() {
if (this.segments == null) return 0;
int c = 0;
for (Segment s: this.segments.values()) c += s.termIndex().getBufferSize();
for (final Segment s: this.segments.values()) c += s.termIndex().getBufferSize();
return c;
}
/**
 * Get the url metadata repository of the segment that is assigned to a process.
 * No null check is done on the looked-up segment: if the lookup yields null,
 * this method fails with a NullPointerException.
 * @param process the process type
 * @return the url metadata repository of the assigned segment
 */
public MetadataRepository urlMetadata(final Process process) {
    final String assignedName = this.process_assignment.get(process);
    final Segment assigned = segment(assignedName);
    return assigned.urlMetadata();
}
@ -178,11 +178,11 @@ public class Segments implements Iterable<Segment> {
/**
 * Get the term index of the segment that is assigned to a process.
 * No null check is done on the looked-up segment: if the lookup yields null,
 * this method fails with a NullPointerException.
 * @param process the process type
 * @return the term index of the assigned segment
 */
public IndexCell<WordReference> termIndex(final Process process) {
    final String assignedName = this.process_assignment.get(process);
    final Segment assigned = segment(assignedName);
    return assigned.termIndex();
}
/**
 * Clear the segment that is assigned to a process.
 * No null check is done on the looked-up segment: if the lookup yields null,
 * this method fails with a NullPointerException.
 * @param process the process type
 */
public void clear(final Process process) {
    final String assignedName = this.process_assignment.get(process);
    final Segment assigned = segment(assignedName);
    assigned.clear();
}
/**
 * Get the location of the segment that is assigned to a process.
 * No null check is done on the looked-up segment: if the lookup yields null,
 * this method fails with a NullPointerException.
 * @param process the process type
 * @return the value of getLocation() of the assigned segment
 */
public File getLocation(final Process process) {
    final String assignedName = this.process_assignment.get(process);
    final Segment assigned = segment(assignedName);
    return assigned.getLocation();
}
@ -190,16 +190,16 @@ public class Segments implements Iterable<Segment> {
/**
 * Close the segment that is assigned to a process.
 * No null check is done on the looked-up segment: if the lookup yields null,
 * this method fails with a NullPointerException.
 * @param process the process type
 */
public void close(final Process process) {
    final String assignedName = this.process_assignment.get(process);
    final Segment assigned = segment(assignedName);
    assigned.close();
}
public void close() {
if (segments != null) for (Segment s: this.segments.values()) s.close();
if (this.segments != null) for (final Segment s: this.segments.values()) s.close();
this.segments = null;
}
/**
 * Finalizer hook: delegates to close() so that the segments get closed
 * even if close() was never called explicitly.
 * NOTE(review): this does not chain to super.finalize() and is declared public
 * without @Override; confirm that this is intended before relying on it.
 */
public void finalize() {
this.close();
}
/**
 * Get a reference cleaner from a named segment.
 * No null check is done on the looked-up segment: if the lookup yields null,
 * this method fails with a NullPointerException.
 * @param segmentName the name of the segment
 * @param startHash passed through unchanged to Segment.getReferenceCleaner
 * @return the reference cleaner created by the named segment
 */
public synchronized Segment.ReferenceCleaner getReferenceCleaner(final String segmentName, final byte[] startHash) {
    final Segment named = segment(segmentName);
    return named.getReferenceCleaner(startHash);
}

@ -247,7 +247,6 @@ public final class Switchboard extends serverSwitch {
private final Semaphore shutdownSync = new Semaphore(0);
private boolean terminate = false;
public SolrChardingConnector solrConnector = null;
//private Object crawlingPausedSync = new Object();
//private boolean crawlingIsPaused = false;
@ -592,10 +591,10 @@ public final class Switchboard extends serverSwitch {
final String solrurls = getConfig("federated.service.solr.indexing.url", "http://127.0.0.1:8983/solr");
final boolean usesolr = getConfigBool("federated.service.solr.indexing.enabled", false) & solrurls.length() > 0;
try {
this.solrConnector = (usesolr) ? new SolrChardingConnector(solrurls, workingScheme, SolrChardingSelection.Method.MODULO_HOST_MD5) : null;
this.indexSegments.segment(Segments.Process.LOCALCRAWLING).connectSolr((usesolr) ? new SolrChardingConnector(solrurls, workingScheme, SolrChardingSelection.Method.MODULO_HOST_MD5) : null);
} catch (final IOException e) {
Log.logException(e);
this.solrConnector = null;
this.indexSegments.segment(Segments.Process.LOCALCRAWLING).connectSolr(null);
}
// start a loader
@ -1314,7 +1313,6 @@ public final class Switchboard extends serverSwitch {
Cache.close();
this.tables.close();
Domains.close();
if (this.solrConnector != null && getConfigBool("federated.service.solr.indexing.enabled", false)) this.solrConnector.close();
AccessTracker.dumpLog(new File("DATA/LOG/queries.log"));
UPnP.deletePortMapping();
Tray.removeTray();
@ -1989,7 +1987,7 @@ public final class Switchboard extends serverSwitch {
public indexingQueueEntry condenseDocument(final indexingQueueEntry in) {
in.queueEntry.updateStatus(Response.QUEUE_STATE_CONDENSING);
if (this.solrConnector != null && getConfigBool("federated.service.solr.indexing.enabled", false)/*in.queueEntry.profile().pushSolr()*/) {
if (this.indexSegments.segment(Segments.Process.LOCALCRAWLING).getSolr() != null && getConfigBool("federated.service.solr.indexing.enabled", false)/*in.queueEntry.profile().pushSolr()*/) {
// send the documents to solr
for (final Document doc: in.documents) {
try {
@ -2000,7 +1998,7 @@ public final class Switchboard extends serverSwitch {
// in case that this happens it appears that the doc id is the right one
}
try {
this.solrConnector.add(id, in.queueEntry.getResponseHeader(), doc);
this.indexSegments.segment(Segments.Process.LOCALCRAWLING).getSolr().add(id, in.queueEntry.getResponseHeader(), doc);
} catch (final IOException e) {
Log.logWarning("SOLR", "failed to send " + in.queueEntry.url().toNormalform(true, false) + " to solr: " + e.getMessage());
}

@ -24,6 +24,7 @@
package de.anomic.search;
import java.io.ByteArrayInputStream;
import java.util.Collection;
import java.util.Comparator;
import java.util.Iterator;
@ -34,6 +35,7 @@ import java.util.regex.Matcher;
import java.util.regex.Pattern;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.services.federated.yacy.CacheStrategy;
import net.yacy.cora.storage.ARC;
import net.yacy.cora.storage.ConcurrentARC;
@ -140,6 +142,7 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
public TextSnippet(
final LoaderDispatcher loader,
final String solrText,
final URIMetadataRow.Components comp,
final HandleSet queryhashes,
final CacheStrategy cacheStrategy,
@ -156,7 +159,7 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
}
// try to get snippet from snippetCache
ResultClass source = ResultClass.SOURCE_CACHE;
final ResultClass source = ResultClass.SOURCE_CACHE;
final String wordhashes = yacySearch.set2string(queryhashes);
final String urls = ASCII.String(url.hash());
String snippetLine = snippetsCache.get(wordhashes, urls);
@ -165,32 +168,37 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
init(url.hash(), snippetLine, source, null);
return;
}
// try to get the snippet from a document at the cache (or in the web)
// this requires that the document is parsed after loading
String textline = null;
HandleSet remainingHashes = queryhashes;
{ //encapsulate potential expensive sentences
final Collection<StringBuilder> sentences;
{ //encapsulate potential expensive document
final Document document = loadDocument(loader, comp, queryhashes, cacheStrategy, url, reindexing, source);
if (document == null) {
return;
}
/* ===========================================================================
* COMPUTE SNIPPET
* =========================================================================== */
// we have found a parseable non-empty file: use the lines
// compute snippet from text
sentences = document.getSentences(pre);
document.close();
} //encapsulate potential expensive document END
if (sentences == null) {
init(url.hash(), null, ResultClass.ERROR_PARSER_NO_LINES, "parser returned no sentences");
return;
}
Collection<StringBuilder> sentences = null;
// try the solr text first
if (solrText != null) {
// compute sentences from solr query
sentences = Document.getSentences(pre, new ByteArrayInputStream(UTF8.getBytes(solrText)));
}
// if then no sentences are found, we fail-over to get the content from the re-loaded document
if (sentences == null) {
final Document document = loadDocument(loader, comp, queryhashes, cacheStrategy, url, reindexing, source);
if (document == null) {
return;
}
// compute sentences from parsed document
sentences = document.getSentences(pre);
document.close();
if (sentences == null) {
init(url.hash(), null, ResultClass.ERROR_PARSER_NO_LINES, "parser returned no sentences");
return;
}
}
try {
final SnippetExtractor tsr = new SnippetExtractor(sentences, queryhashes, snippetMaxLength);
textline = tsr.getSnippet();
@ -227,7 +235,7 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
// document.close();
init(url.hash(), snippetLine, source, null);
}
private Document loadDocument(
final LoaderDispatcher loader,
final URIMetadataRow.Components comp,

@ -34,14 +34,13 @@ import net.yacy.cora.protocol.Domains;
import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.document.Document;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrInputDocument;
public class SolrChardingConnector {
public class SolrChardingConnector implements SolrConnector {
private final List<SolrSingleConnector> connectors;
private final SolrScheme scheme;
@ -164,13 +163,7 @@ public class SolrChardingConnector {
final long[] size = new long[this.connectors.size()];
int i = 0;
for (final SolrSingleConnector connector: this.connectors) {
try {
final SolrDocumentList list = connector.get("*:*", 0, 1);
size[i++] = list.getNumFound();
} catch (final Exception e) {
Log.logException(e);
size[i++] = 0;
}
size[i++] = connector.getSize();
}
return size;
}

@ -0,0 +1,99 @@
/**
* SolrConnector
* Copyright 2011 by Michael Peter Christen
* First released 13.09.2011 at http://yacy.net
*
* $LastChangedDate: 2011-04-14 22:05:04 +0200 (Do, 14 Apr 2011) $
* $LastChangedRevision: 7654 $
* $LastChangedBy: orbiter $
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.cora.services.federated.solr;
import java.io.IOException;
import java.util.List;
import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.document.Document;
import net.yacy.kelondro.data.meta.DigestURI;
import org.apache.solr.common.SolrDocumentList;
/**
 * Common interface of the solr connectors. It declares the operations that are needed
 * to write documents to a solr index, to delete them again and to query the index.
 */
public interface SolrConnector {
/**
 * With a scheme the fields of a SolrDocument can be translated to actual data values.
 * @return the solr scheme that can translate the SolrDocument
 */
public SolrScheme getScheme();
/**
 * Close this connector.
 * NOTE(review): the state of the connector after close() is not specified here;
 * presumably it must not be used any more - confirm against the implementations.
 */
public void close();
/**
 * Delete everything in the solr index.
 * @throws IOException
 */
public void clear() throws IOException;
/**
 * Delete an entry from solr.
 * @param id the url hash of the entry
 * @throws IOException
 */
public void delete(final String id) throws IOException;
/**
 * Delete a set of entries from solr; entries are identified by their url hash.
 * @param ids a list of url hashes
 * @throws IOException
 */
public void delete(final List<String> ids) throws IOException;
/**
 * Add a YaCy document. This calls the scheme processor to add the document as solr document.
 * @param id the url hash of the entry
 * @param header the http response header
 * @param doc the YaCy document
 * @throws IOException
 */
public void add(final String id, final ResponseHeader header, final Document doc) throws IOException;
/**
 * Register an entry as error document.
 * @param digestURI the url of the failed document
 * @param failReason a description of the failure
 * @param httpstatus the http status - presumably the status code returned for the url; TODO confirm
 * @throws IOException
 */
public void err(final DigestURI digestURI, final String failReason, final int httpstatus) throws IOException;
/**
 * Get a query result from solr.
 * To get all results set the query String to "*:*".
 * @param querystring the solr query, i.e. "id:" followed by a url hash to look up a single document
 * @param offset presumably the index of the first result to return - TODO confirm against the implementations
 * @param count presumably the maximum number of results to return - TODO confirm against the implementations
 * @return the list of matching solr documents
 * @throws IOException
 */
public SolrDocumentList get(final String querystring, final int offset, final int count) throws IOException;
/**
 * Get the size of the index.
 * @return number of results if solr is queried with a catch-all pattern
 */
public long getSize();
}

@ -27,6 +27,8 @@ package net.yacy.cora.services.federated.solr;
import java.io.File;
import java.net.InetAddress;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.Map;
@ -44,6 +46,7 @@ import net.yacy.document.parser.html.ContentScraper;
import net.yacy.document.parser.html.ImageEntry;
import net.yacy.kelondro.data.meta.DigestURI;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrInputDocument;
public class SolrScheme extends ConfigurationSet {
@ -349,6 +352,46 @@ public class SolrScheme extends ConfigurationSet {
return solrdoc;
}
/**
 * Read the "id" field of a solr document.
 * @param solr the solr document
 * @return the value of the "id" field as returned by getFieldValue
 * @throws ClassCastException if the stored value is not a String
 */
public String solrGetID(final SolrDocument solr) {
    final Object id = solr.getFieldValue("id");
    return (String) id;
}
/**
 * Read the "sku" field of a solr document and turn it into a url object.
 * @param solr the solr document
 * @return the url made from the "sku" field, or null if that value is not a well-formed url
 * @throws ClassCastException if the stored value is not a String
 */
public DigestURI solrGetURL(final SolrDocument solr) {
    final String sku = (String) solr.getFieldValue("sku");
    DigestURI url;
    try {
        url = new DigestURI(sku);
    } catch (final MalformedURLException e) {
        // a malformed url is reported to the caller as null, not as an exception
        url = null;
    }
    return url;
}
/**
 * Read the "title" field of a solr document.
 * @param solr the solr document
 * @return the value of the "title" field as returned by getFieldValue
 * @throws ClassCastException if the stored value is not a String
 */
public String solrGetTitle(final SolrDocument solr) {
    final Object title = solr.getFieldValue("title");
    return (String) title;
}
/**
 * Read the "text_t" field of a solr document.
 * @param solr the solr document
 * @return the value of the "text_t" field as returned by getFieldValue
 * @throws ClassCastException if the stored value is not a String
 */
public String solrGetText(final SolrDocument solr) {
    final Object text = solr.getFieldValue("text_t");
    return (String) text;
}
/**
 * Read the "author" field of a solr document.
 * @param solr the solr document
 * @return the value of the "author" field as returned by getFieldValue
 * @throws ClassCastException if the stored value is not a String
 */
public String solrGetAuthor(final SolrDocument solr) {
    final Object author = solr.getFieldValue("author");
    return (String) author;
}
/**
 * Read the "description" field of a solr document.
 * @param solr the solr document
 * @return the value of the "description" field as returned by getFieldValue
 * @throws ClassCastException if the stored value is not a String
 */
public String solrGetDescription(final SolrDocument solr) {
    final Object description = solr.getFieldValue("description");
    return (String) description;
}
/**
 * Read the "last_modified" field of a solr document.
 * @param solr the solr document
 * @return the value of the "last_modified" field as returned by getFieldValue
 * @throws ClassCastException if the stored value is not a Date
 */
public Date solrGetDate(final SolrDocument solr) {
    final Object lastModified = solr.getFieldValue("last_modified");
    return (Date) lastModified;
}
/**
 * Read all values of the "keywords" field of a solr document.
 * @param solr the solr document
 * @return a new list with the keyword values in the order in which solr delivers them;
 *         the list is empty if the document has no "keywords" field
 * @throws ClassCastException if a stored value is not a String
 */
public Collection<String> solrGetKeywords(final SolrDocument solr) {
    final Collection<Object> values = solr.getFieldValues("keywords");
    // getFieldValues returns null for a field that is not present in the document;
    // iterating over that would throw a NullPointerException
    if (values == null) return new ArrayList<String>(0);
    final ArrayList<String> keywords = new ArrayList<String>(values.size());
    for (final Object value: values) {
        keywords.add((String) value);
    }
    return keywords;
}
/*
* standard solr scheme

@ -57,7 +57,7 @@ import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrInputDocument;
public class SolrSingleConnector {
public class SolrSingleConnector implements SolrConnector {
private final String solrurl, host, solrpath, solraccount, solrpw;
private final int port;
@ -178,6 +178,22 @@ public class SolrSingleConnector {
}
}
/**
 * @return the solr scheme that is held by this connector
 */
@Override
public SolrScheme getScheme() {
return this.scheme;
}
/**
 * Get the size of the index by sending a catch-all query ("*:*") that asks for
 * a single result and reading the total number of hits from the response.
 * @return the number of found documents; 0 if the query failed (the exception is logged)
 */
@Override
public long getSize() {
    long numFound;
    try {
        numFound = get("*:*", 0, 1).getNumFound();
    } catch (final Exception e) {
        // best effort: a failing query is logged and reported as an empty index
        Log.logException(e);
        numFound = 0;
    }
    return numFound;
}
/**
* delete everything in the solr index
* @throws IOException
@ -325,6 +341,16 @@ public class SolrSingleConnector {
//return result;
}
/**
 * Compute the url of the solr admin interface from the solr url of this connector.
 * If the solr url contains "localhost" or "127.0.0.1", the first such occurrence is
 * replaced by the address delivered by Domains.myPublicLocalIP() (or "127.0.0.1" if
 * that is null).
 * @return the solr url with the path "admin/" appended
 */
public String getAdminInterface() {
    final InetAddress publicAddress = Domains.myPublicLocalIP();
    final String publicHost;
    if (publicAddress == null) {
        publicHost = "127.0.0.1";
    } else {
        publicHost = publicAddress.getHostAddress();
    }
    String adminUrl = this.solrurl;
    int pos = adminUrl.indexOf("localhost");
    if (pos < 0) {
        pos = adminUrl.indexOf("127.0.0.1");
    }
    if (pos >= 0) {
        // "localhost" and "127.0.0.1" both have 9 characters, so one offset fits both cases
        adminUrl = adminUrl.substring(0, pos) + publicHost + adminUrl.substring(pos + 9);
    }
    if (adminUrl.endsWith("/")) {
        return adminUrl + "admin/";
    }
    return adminUrl + "/admin/";
}
public static void main(final String args[]) {
SolrSingleConnector solr;
try {
@ -347,5 +373,4 @@ public class SolrSingleConnector {
e.printStackTrace();
}
}
}

@ -312,8 +312,12 @@ dc_rights
}
public List<StringBuilder> getSentences(final boolean pre) {
if (this.text == null) return null;
final SentenceReader e = new SentenceReader(getText());
return getSentences(pre, getText());
}
public static List<StringBuilder> getSentences(final boolean pre, final InputStream text) {
if (text == null) return null;
final SentenceReader e = new SentenceReader(text);
e.pre(pre);
final List<StringBuilder> sentences = new ArrayList<StringBuilder>();
while (e.hasNext()) {

Loading…
Cancel
Save