YaCy can now use the solr index to compute text snippets. This makes search result preparation MUCH faster because document fetching and parsing are no longer necessary.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7943 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 14 years ago
parent 0819e1d397
commit 85a5487d6d

@ -66,6 +66,7 @@ import de.anomic.search.RankingProcess;
import de.anomic.search.ReferenceOrder; import de.anomic.search.ReferenceOrder;
import de.anomic.search.SearchEventCache; import de.anomic.search.SearchEventCache;
import de.anomic.search.Segment; import de.anomic.search.Segment;
import de.anomic.search.Segments;
import de.anomic.search.Switchboard; import de.anomic.search.Switchboard;
import de.anomic.search.SwitchboardConstants; import de.anomic.search.SwitchboardConstants;
import de.anomic.server.serverObjects; import de.anomic.server.serverObjects;
@ -86,7 +87,7 @@ public class IndexControlRWIs_p {
prop.put("keyhash", ""); prop.put("keyhash", "");
prop.put("result", ""); prop.put("result", "");
prop.put("cleanup", post == null || post.containsKey("maxReferencesLimit") ? 1 : 0); prop.put("cleanup", post == null || post.containsKey("maxReferencesLimit") ? 1 : 0);
prop.put("cleanup_solr", sb.solrConnector == null || !sb.getConfigBool("federated.service.solr.indexing.enabled", false) ? 0 : 1); prop.put("cleanup_solr", sb.indexSegments.segment(Segments.Process.LOCALCRAWLING).getSolr() == null || !sb.getConfigBool("federated.service.solr.indexing.enabled", false) ? 0 : 1);
String segmentName = sb.getConfig(SwitchboardConstants.SEGMENT_PUBLIC, "default"); String segmentName = sb.getConfig(SwitchboardConstants.SEGMENT_PUBLIC, "default");
int i = 0; int i = 0;
@ -157,7 +158,7 @@ public class IndexControlRWIs_p {
segment.clear(); segment.clear();
} }
if (post.get("deleteSolr", "").equals("on") && sb.getConfigBool("federated.service.solr.indexing.enabled", false)) try { if (post.get("deleteSolr", "").equals("on") && sb.getConfigBool("federated.service.solr.indexing.enabled", false)) try {
sb.solrConnector.clear(); sb.indexSegments.segment(Segments.Process.LOCALCRAWLING).getSolr().clear();
} catch (final Exception e) { } catch (final Exception e) {
Log.logException(e); Log.logException(e);
} }

@ -33,9 +33,12 @@ import net.yacy.cora.document.UTF8;
import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.services.federated.solr.SolrChardingConnector; import net.yacy.cora.services.federated.solr.SolrChardingConnector;
import net.yacy.cora.services.federated.solr.SolrChardingSelection; import net.yacy.cora.services.federated.solr.SolrChardingSelection;
import net.yacy.cora.services.federated.solr.SolrConnector;
import net.yacy.cora.services.federated.solr.SolrScheme; import net.yacy.cora.services.federated.solr.SolrScheme;
import net.yacy.cora.services.federated.solr.SolrSingleConnector;
import net.yacy.cora.storage.ConfigurationSet; import net.yacy.cora.storage.ConfigurationSet;
import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.logging.Log;
import de.anomic.search.Segments;
import de.anomic.search.Switchboard; import de.anomic.search.Switchboard;
import de.anomic.server.serverObjects; import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch; import de.anomic.server.serverSwitch;
@ -75,8 +78,8 @@ public class IndexFederated_p {
if (solrWasOn) { if (solrWasOn) {
// switch off // switch off
sb.solrConnector.close(); sb.indexSegments.segment(Segments.Process.LOCALCRAWLING).getSolr().close();
sb.solrConnector = null; sb.indexSegments.segment(Segments.Process.LOCALCRAWLING).connectSolr(null);
} }
final SolrScheme scheme = new SolrScheme(new File(env.getDataPath(), "DATA/SETTINGS/" + schemename)); final SolrScheme scheme = new SolrScheme(new File(env.getDataPath(), "DATA/SETTINGS/" + schemename));
@ -85,10 +88,10 @@ public class IndexFederated_p {
// switch on // switch on
final boolean usesolr = sb.getConfigBool("federated.service.solr.indexing.enabled", false) & solrurls.length() > 0; final boolean usesolr = sb.getConfigBool("federated.service.solr.indexing.enabled", false) & solrurls.length() > 0;
try { try {
sb.solrConnector = (usesolr) ? new SolrChardingConnector(solrurls, scheme, SolrChardingSelection.Method.MODULO_HOST_MD5) : null; sb.indexSegments.segment(Segments.Process.LOCALCRAWLING).connectSolr((usesolr) ? new SolrChardingConnector(solrurls, scheme, SolrChardingSelection.Method.MODULO_HOST_MD5) : null);
} catch (final IOException e) { } catch (final IOException e) {
Log.logException(e); Log.logException(e);
sb.solrConnector = null; sb.indexSegments.segment(Segments.Process.LOCALCRAWLING).connectSolr(null);
} }
} }
@ -110,12 +113,13 @@ public class IndexFederated_p {
} }
// show solr host table // show solr host table
if (sb.solrConnector == null) { if (sb.indexSegments.segment(Segments.Process.LOCALCRAWLING).getSolr() == null) {
prop.put("table", 0); prop.put("table", 0);
} else { } else {
prop.put("table", 1); prop.put("table", 1);
final long[] size = sb.solrConnector.getSizeList(); final SolrConnector solr = sb.indexSegments.segment(Segments.Process.LOCALCRAWLING).getSolr();
final String[] urls = sb.solrConnector.getAdminInterfaceList(); final long[] size = (solr instanceof SolrChardingConnector) ? ((SolrChardingConnector) solr).getSizeList() : new long[]{((SolrSingleConnector) solr).getSize()};
final String[] urls = (solr instanceof SolrChardingConnector) ? ((SolrChardingConnector) solr).getAdminInterfaceList() : new String[]{((SolrSingleConnector) solr).getAdminInterface()};
boolean dark = false; boolean dark = false;
for (int i = 0; i < size.length; i++) { for (int i = 0; i < size.length; i++) {
prop.put("table_list_" + i + "_dark", dark ? 1 : 0); dark = !dark; prop.put("table_list_" + i + "_dark", dark ? 1 : 0); dark = !dark;
@ -126,7 +130,7 @@ public class IndexFederated_p {
} }
// write scheme // write scheme
SolrScheme scheme = (sb.solrConnector == null) ? null : sb.solrConnector.getScheme(); SolrScheme scheme = (sb.indexSegments.segment(Segments.Process.LOCALCRAWLING).getSolr() == null) ? null : sb.indexSegments.segment(Segments.Process.LOCALCRAWLING).getSolr().getScheme();
final String schemename = sb.getConfig("federated.service.solr.indexing.schemefile", "solr.keys.default.list"); final String schemename = sb.getConfig("federated.service.solr.indexing.schemefile", "solr.keys.default.list");
if (scheme == null) { if (scheme == null) {
scheme = new SolrScheme(new File(env.getDataPath(), "DATA/SETTINGS/" + schemename)); scheme = new SolrScheme(new File(env.getDataPath(), "DATA/SETTINGS/" + schemename));

@ -62,6 +62,7 @@ public class CrawlQueues {
private static final String ERROR_DB_FILENAME = "urlError3.db"; private static final String ERROR_DB_FILENAME = "urlError3.db";
private static final String DELEGATED_DB_FILENAME = "urlDelegated3.db"; private static final String DELEGATED_DB_FILENAME = "urlDelegated3.db";
private static final Segments.Process PROCESS = Segments.Process.LOCALCRAWLING;
protected Switchboard sb; protected Switchboard sb;
protected Log log; protected Log log;
@ -81,8 +82,8 @@ public class CrawlQueues {
this.log.logConfig("Starting Crawling Management"); this.log.logConfig("Starting Crawling Management");
this.noticeURL = new NoticedURL(queuePath, sb.peers.myBotIDs(), sb.useTailCache, sb.exceed134217727); this.noticeURL = new NoticedURL(queuePath, sb.peers.myBotIDs(), sb.useTailCache, sb.exceed134217727);
FileUtils.deletedelete(new File(queuePath, ERROR_DB_FILENAME)); FileUtils.deletedelete(new File(queuePath, ERROR_DB_FILENAME));
this.errorURL = new ZURL(sb.solrConnector, queuePath, ERROR_DB_FILENAME, false, sb.useTailCache, sb.exceed134217727); this.errorURL = new ZURL(sb.indexSegments.segment(PROCESS).getSolr(), queuePath, ERROR_DB_FILENAME, false, sb.useTailCache, sb.exceed134217727);
this.delegatedURL = new ZURL(sb.solrConnector, queuePath, DELEGATED_DB_FILENAME, true, sb.useTailCache, sb.exceed134217727); this.delegatedURL = new ZURL(sb.indexSegments.segment(PROCESS).getSolr(), queuePath, DELEGATED_DB_FILENAME, true, sb.useTailCache, sb.exceed134217727);
} }
public void relocate(final File newQueuePath) { public void relocate(final File newQueuePath) {
@ -93,8 +94,8 @@ public class CrawlQueues {
this.noticeURL = new NoticedURL(newQueuePath, this.sb.peers.myBotIDs(), this.sb.useTailCache, this.sb.exceed134217727); this.noticeURL = new NoticedURL(newQueuePath, this.sb.peers.myBotIDs(), this.sb.useTailCache, this.sb.exceed134217727);
FileUtils.deletedelete(new File(newQueuePath, ERROR_DB_FILENAME)); FileUtils.deletedelete(new File(newQueuePath, ERROR_DB_FILENAME));
this.errorURL = new ZURL(this.sb.solrConnector, newQueuePath, ERROR_DB_FILENAME, false, this.sb.useTailCache, this.sb.exceed134217727); this.errorURL = new ZURL(this.sb.indexSegments.segment(PROCESS).getSolr(), newQueuePath, ERROR_DB_FILENAME, false, this.sb.useTailCache, this.sb.exceed134217727);
this.delegatedURL = new ZURL(this.sb.solrConnector, newQueuePath, DELEGATED_DB_FILENAME, true, this.sb.useTailCache, this.sb.exceed134217727); this.delegatedURL = new ZURL(this.sb.indexSegments.segment(PROCESS).getSolr(), newQueuePath, DELEGATED_DB_FILENAME, true, this.sb.useTailCache, this.sb.exceed134217727);
} }
public void close() { public void close() {
@ -249,7 +250,7 @@ public class CrawlQueues {
return true; return true;
} }
try { try {
this.sb.indexingDocumentProcessor.enQueue(new indexingQueueEntry(Segments.Process.LOCALCRAWLING, new Response(urlEntry, profile), null, null)); this.sb.indexingDocumentProcessor.enQueue(new indexingQueueEntry(PROCESS, new Response(urlEntry, profile), null, null));
Log.logInfo("CrawlQueues", "placed NOLOAD URL on indexing queue: " + urlEntry.url().toNormalform(true, false)); Log.logInfo("CrawlQueues", "placed NOLOAD URL on indexing queue: " + urlEntry.url().toNormalform(true, false));
} catch (final InterruptedException e) { } catch (final InterruptedException e) {
Log.logException(e); Log.logException(e);

@ -36,6 +36,7 @@ import java.util.concurrent.ConcurrentLinkedQueue;
import net.yacy.cora.document.ASCII; import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.UTF8; import net.yacy.cora.document.UTF8;
import net.yacy.cora.services.federated.solr.SolrChardingConnector; import net.yacy.cora.services.federated.solr.SolrChardingConnector;
import net.yacy.cora.services.federated.solr.SolrConnector;
import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.index.Index; import net.yacy.kelondro.index.Index;
@ -76,10 +77,10 @@ public class ZURL implements Iterable<ZURL.Entry> {
// the class object // the class object
private Index urlIndex; private Index urlIndex;
private final ConcurrentLinkedQueue<byte[]> stack; private final ConcurrentLinkedQueue<byte[]> stack;
private final SolrChardingConnector solrConnector; private final SolrConnector solrConnector;
public ZURL( public ZURL(
final SolrChardingConnector solrConnector, final SolrConnector solrConnector,
final File cachePath, final File cachePath,
final String tablename, final String tablename,
final boolean startWithEmptyFile, final boolean startWithEmptyFile,

@ -31,11 +31,13 @@ import java.util.Iterator;
import java.util.List; import java.util.List;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.protocol.ResponseHeader; import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.cora.ranking.ScoreMap; import net.yacy.cora.ranking.ScoreMap;
import net.yacy.cora.ranking.WeakPriorityBlockingQueue; import net.yacy.cora.ranking.WeakPriorityBlockingQueue;
import net.yacy.cora.ranking.WeakPriorityBlockingQueue.ReverseElement; import net.yacy.cora.ranking.WeakPriorityBlockingQueue.ReverseElement;
import net.yacy.cora.services.federated.solr.SolrConnector;
import net.yacy.cora.services.federated.yacy.CacheStrategy; import net.yacy.cora.services.federated.yacy.CacheStrategy;
import net.yacy.document.Condenser; import net.yacy.document.Condenser;
import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.meta.URIMetadataRow;
@ -46,6 +48,10 @@ import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.EventTracker; import net.yacy.kelondro.util.EventTracker;
import net.yacy.kelondro.util.MemoryControl; import net.yacy.kelondro.util.MemoryControl;
import net.yacy.repository.LoaderDispatcher; import net.yacy.repository.LoaderDispatcher;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import de.anomic.data.WorkTables; import de.anomic.data.WorkTables;
import de.anomic.http.client.Cache; import de.anomic.http.client.Cache;
import de.anomic.yacy.yacySeedDB; import de.anomic.yacy.yacySeedDB;
@ -322,6 +328,7 @@ public class ResultFetcher {
private final int neededResults; private final int neededResults;
private final Pattern snippetPattern; private final Pattern snippetPattern;
private boolean shallrun; private boolean shallrun;
private final SolrConnector solr;
public Worker(final int id, final long maxlifetime, final CacheStrategy cacheStrategy, final Pattern snippetPattern, final int neededResults) { public Worker(final int id, final long maxlifetime, final CacheStrategy cacheStrategy, final Pattern snippetPattern, final int neededResults) {
this.id = id; this.id = id;
@ -331,6 +338,7 @@ public class ResultFetcher {
this.timeout = System.currentTimeMillis() + Math.max(1000, maxlifetime); this.timeout = System.currentTimeMillis() + Math.max(1000, maxlifetime);
this.neededResults = neededResults; this.neededResults = neededResults;
this.shallrun = true; this.shallrun = true;
this.solr = ResultFetcher.this.rankingProcess.getQuery().getSegment().getSolr();
} }
@Override @Override
@ -373,8 +381,18 @@ public class ResultFetcher {
} }
if (ResultFetcher.this.query.filterfailurls && ResultFetcher.this.workTables.failURLsContains(page.hash())) continue; if (ResultFetcher.this.query.filterfailurls && ResultFetcher.this.workTables.failURLsContains(page.hash())) continue;
// in case that we have an attached solr, we load also the solr document
String solrContent = null;
if (this.solr != null) {
SolrDocument sd = null;
final SolrDocumentList sdl = this.solr.get("id:" + ASCII.String(page.hash()), 0, 1);
if (sdl.size() > 0) sd = sdl.get(0);
if (sd != null) solrContent = this.solr.getScheme().solrGetText(sd);
}
loops++; loops++;
resultEntry = fetchSnippet(page, this.cacheStrategy); // does not fetch snippets if snippetMode == 0 resultEntry = fetchSnippet(page, solrContent, this.cacheStrategy); // does not fetch snippets if snippetMode == 0
if (resultEntry == null) continue; // the entry had some problems, cannot be used if (resultEntry == null) continue; // the entry had some problems, cannot be used
rawLine = resultEntry.textSnippet() == null ? null : resultEntry.textSnippet().getLineRaw(); rawLine = resultEntry.textSnippet() == null ? null : resultEntry.textSnippet().getLineRaw();
//System.out.println("***SNIPPET*** raw='" + rawLine + "', pattern='" + this.snippetPattern.toString() + "'"); //System.out.println("***SNIPPET*** raw='" + rawLine + "', pattern='" + this.snippetPattern.toString() + "'");
@ -412,7 +430,7 @@ public class ResultFetcher {
} }
} }
protected ResultEntry fetchSnippet(final URIMetadataRow page, final CacheStrategy cacheStrategy) { protected ResultEntry fetchSnippet(final URIMetadataRow page, final String solrText, final CacheStrategy cacheStrategy) {
// Snippet Fetching can has 3 modes: // Snippet Fetching can has 3 modes:
// 0 - do not fetch snippets // 0 - do not fetch snippets
// 1 - fetch snippets offline only // 1 - fetch snippets offline only
@ -429,6 +447,7 @@ public class ResultFetcher {
if (cacheStrategy == null) { if (cacheStrategy == null) {
final TextSnippet snippet = new TextSnippet( final TextSnippet snippet = new TextSnippet(
null, null,
solrText,
metadata, metadata,
this.snippetFetchWordHashes, this.snippetFetchWordHashes,
null, null,
@ -445,6 +464,7 @@ public class ResultFetcher {
startTime = System.currentTimeMillis(); startTime = System.currentTimeMillis();
final TextSnippet snippet = new TextSnippet( final TextSnippet snippet = new TextSnippet(
this.loader, this.loader,
solrText,
metadata, metadata,
this.snippetFetchWordHashes, this.snippetFetchWordHashes,
cacheStrategy, cacheStrategy,

@ -37,6 +37,7 @@ import java.util.TreeSet;
import net.yacy.cora.document.ASCII; import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.document.UTF8; import net.yacy.cora.document.UTF8;
import net.yacy.cora.services.federated.solr.SolrConnector;
import net.yacy.cora.services.federated.yacy.CacheStrategy; import net.yacy.cora.services.federated.yacy.CacheStrategy;
import net.yacy.document.Condenser; import net.yacy.document.Condenser;
import net.yacy.document.Document; import net.yacy.document.Document;
@ -81,6 +82,7 @@ public class Segment {
protected final IndexCell<WordReference> termIndex; protected final IndexCell<WordReference> termIndex;
//private final IndexCell<NavigationReference> authorNavIndex; //private final IndexCell<NavigationReference> authorNavIndex;
protected final MetadataRepository urlMetadata; protected final MetadataRepository urlMetadata;
private SolrConnector solr;
private final File segmentPath; private final File segmentPath;
public Segment( public Segment(
@ -98,6 +100,7 @@ public class Segment {
this.log = log; this.log = log;
this.segmentPath = segmentPath; this.segmentPath = segmentPath;
this.solr = null;
this.termIndex = new IndexCell<WordReference>( this.termIndex = new IndexCell<WordReference>(
segmentPath, segmentPath,
@ -126,6 +129,14 @@ public class Segment {
this.urlMetadata = new MetadataRepository(segmentPath, "text.urlmd", useTailCache, exceed134217727); this.urlMetadata = new MetadataRepository(segmentPath, "text.urlmd", useTailCache, exceed134217727);
} }
/**
 * Attaches a Solr connector to this segment, replacing any previously
 * attached connector. Passing {@code null} detaches Solr from the segment
 * (callers use this to switch the federated Solr service off).
 *
 * @param solr the connector to use for this segment, or {@code null} to detach
 */
public void connectSolr(final SolrConnector solr) {
this.solr = solr;
}
/**
 * Returns the Solr connector currently attached to this segment.
 *
 * @return the attached connector, or {@code null} if no Solr is connected
 *         (callers must check for {@code null} before use)
 */
public SolrConnector getSolr() {
return this.solr;
}
public static void migrateTextIndex(final File oldSegmentPath, final File newSegmentPath) { public static void migrateTextIndex(final File oldSegmentPath, final File newSegmentPath) {
final File oldCellPath = new File(oldSegmentPath, "RICELL"); final File oldCellPath = new File(oldSegmentPath, "RICELL");
if (!oldCellPath.exists()) return; if (!oldCellPath.exists()) return;
@ -254,6 +265,7 @@ public class Segment {
public void close() { public void close() {
this.termIndex.close(); this.termIndex.close();
this.urlMetadata.close(); this.urlMetadata.close();
if (this.solr != null) this.solr.close();
} }
public URIMetadataRow storeDocument( public URIMetadataRow storeDocument(

@ -38,13 +38,13 @@ import net.yacy.kelondro.rwi.IndexCell;
public class Segments implements Iterable<Segment> { public class Segments implements Iterable<Segment> {
/** /**
* process enumeration type * process enumeration type
* defines constants that can be used to assign process-related segment names * defines constants that can be used to assign process-related segment names
*/ */
public enum Process { public enum Process {
RECEIPTS, RECEIPTS,
QUERIES, QUERIES,
DHTIN, DHTIN,
@ -59,7 +59,7 @@ public class Segments implements Iterable<Segment> {
throw new UnsupportedOperationException("toString not allowed"); throw new UnsupportedOperationException("toString not allowed");
} }
} }
private final Log log; private final Log log;
private final File segmentsPath; private final File segmentsPath;
private final int entityCacheMaxSize; private final int entityCacheMaxSize;
@ -68,7 +68,7 @@ public class Segments implements Iterable<Segment> {
private final HashMap<Process, String> process_assignment; private final HashMap<Process, String> process_assignment;
private final boolean useTailCache; private final boolean useTailCache;
private final boolean exceed134217727; private final boolean exceed134217727;
public Segments( public Segments(
final Log log, final Log log,
final File segmentsPath, final File segmentsPath,
@ -96,41 +96,41 @@ public class Segments implements Iterable<Segment> {
this.process_assignment.put(Process.PUBLIC, "default"); this.process_assignment.put(Process.PUBLIC, "default");
this.process_assignment.put(Process.SURROGATES, "default"); this.process_assignment.put(Process.SURROGATES, "default");
} }
public void setSegment(Process process, String segmentName) { public void setSegment(final Process process, final String segmentName) {
this.process_assignment.put(process, segmentName); this.process_assignment.put(process, segmentName);
} }
public static void migrateOld(File oldSingleSegment, File newSegmentsPath, String newSegmentName) { public static void migrateOld(final File oldSingleSegment, final File newSegmentsPath, final String newSegmentName) {
if (!oldSingleSegment.exists()) return; if (!oldSingleSegment.exists()) return;
File newSegmentPath = new File(newSegmentsPath, newSegmentName); final File newSegmentPath = new File(newSegmentsPath, newSegmentName);
if (!newSegmentPath.exists()) newSegmentPath.mkdirs(); if (!newSegmentPath.exists()) newSegmentPath.mkdirs();
Segment.migrateTextIndex(oldSingleSegment, newSegmentPath); Segment.migrateTextIndex(oldSingleSegment, newSegmentPath);
Segment.migrateTextMetadata(oldSingleSegment, newSegmentPath); Segment.migrateTextMetadata(oldSingleSegment, newSegmentPath);
String[] oldFiles = oldSingleSegment.list(); final String[] oldFiles = oldSingleSegment.list();
for (String oldFile: oldFiles) { for (final String oldFile: oldFiles) {
if (oldFile.startsWith("text.")) { if (oldFile.startsWith("text.")) {
new File(oldSingleSegment, oldFile).renameTo(new File(newSegmentPath, oldFile)); new File(oldSingleSegment, oldFile).renameTo(new File(newSegmentPath, oldFile));
} }
} }
} }
public String[] segmentNames() { public String[] segmentNames() {
return this.segments.keySet().toArray(new String[this.segments.size()]); return this.segments.keySet().toArray(new String[this.segments.size()]);
} }
public boolean segmentExist(final String segmentName) { public boolean segmentExist(final String segmentName) {
return segments.containsKey(segmentName); return this.segments.containsKey(segmentName);
} }
public Segment segment(final Process process) { public Segment segment(final Process process) {
return segment(this.process_assignment.get(process)); return segment(this.process_assignment.get(process));
} }
public Segment segment(final String segmentName) { public Segment segment(final String segmentName) {
if (segments == null) return null; if (this.segments == null) return null;
Segment segment = segments.get(segmentName); Segment segment = this.segments.get(segmentName);
if (segment == null) { if (segment == null) {
// generate the segment // generate the segment
try { try {
@ -141,7 +141,7 @@ public class Segments implements Iterable<Segment> {
this.maxFileSize, this.maxFileSize,
this.useTailCache, this.useTailCache,
this.exceed134217727); this.exceed134217727);
} catch (IOException e) { } catch (final IOException e) {
Log.logException(e); Log.logException(e);
return null; return null;
} }
@ -149,28 +149,28 @@ public class Segments implements Iterable<Segment> {
} }
return segment; return segment;
} }
public long URLCount() { public long URLCount() {
if (this.segments == null) return 0; if (this.segments == null) return 0;
long c = 0; long c = 0;
for (Segment s: this.segments.values()) c += (long) s.urlMetadata().size(); for (final Segment s: this.segments.values()) c += s.urlMetadata().size();
return c; return c;
} }
public long RWICount() { public long RWICount() {
if (this.segments == null) return 0; if (this.segments == null) return 0;
long c = 0; long c = 0;
for (Segment s: this.segments.values()) c += (long) s.termIndex().sizesMax(); for (final Segment s: this.segments.values()) c += s.termIndex().sizesMax();
return c; return c;
} }
public int RWIBufferCount() { public int RWIBufferCount() {
if (this.segments == null) return 0; if (this.segments == null) return 0;
int c = 0; int c = 0;
for (Segment s: this.segments.values()) c += s.termIndex().getBufferSize(); for (final Segment s: this.segments.values()) c += s.termIndex().getBufferSize();
return c; return c;
} }
public MetadataRepository urlMetadata(final Process process) { public MetadataRepository urlMetadata(final Process process) {
return segment(this.process_assignment.get(process)).urlMetadata(); return segment(this.process_assignment.get(process)).urlMetadata();
} }
@ -178,11 +178,11 @@ public class Segments implements Iterable<Segment> {
public IndexCell<WordReference> termIndex(final Process process) { public IndexCell<WordReference> termIndex(final Process process) {
return segment(this.process_assignment.get(process)).termIndex(); return segment(this.process_assignment.get(process)).termIndex();
} }
public void clear(final Process process) { public void clear(final Process process) {
segment(this.process_assignment.get(process)).clear(); segment(this.process_assignment.get(process)).clear();
} }
public File getLocation(final Process process) { public File getLocation(final Process process) {
return segment(this.process_assignment.get(process)).getLocation(); return segment(this.process_assignment.get(process)).getLocation();
} }
@ -190,16 +190,16 @@ public class Segments implements Iterable<Segment> {
public void close(final Process process) { public void close(final Process process) {
segment(this.process_assignment.get(process)).close(); segment(this.process_assignment.get(process)).close();
} }
public void close() { public void close() {
if (segments != null) for (Segment s: this.segments.values()) s.close(); if (this.segments != null) for (final Segment s: this.segments.values()) s.close();
this.segments = null; this.segments = null;
} }
public void finalize() { public void finalize() {
this.close(); this.close();
} }
public synchronized Segment.ReferenceCleaner getReferenceCleaner(final String segmentName, final byte[] startHash) { public synchronized Segment.ReferenceCleaner getReferenceCleaner(final String segmentName, final byte[] startHash) {
return segment(segmentName).getReferenceCleaner(startHash); return segment(segmentName).getReferenceCleaner(startHash);
} }

@ -247,7 +247,6 @@ public final class Switchboard extends serverSwitch {
private final Semaphore shutdownSync = new Semaphore(0); private final Semaphore shutdownSync = new Semaphore(0);
private boolean terminate = false; private boolean terminate = false;
public SolrChardingConnector solrConnector = null;
//private Object crawlingPausedSync = new Object(); //private Object crawlingPausedSync = new Object();
//private boolean crawlingIsPaused = false; //private boolean crawlingIsPaused = false;
@ -592,10 +591,10 @@ public final class Switchboard extends serverSwitch {
final String solrurls = getConfig("federated.service.solr.indexing.url", "http://127.0.0.1:8983/solr"); final String solrurls = getConfig("federated.service.solr.indexing.url", "http://127.0.0.1:8983/solr");
final boolean usesolr = getConfigBool("federated.service.solr.indexing.enabled", false) & solrurls.length() > 0; final boolean usesolr = getConfigBool("federated.service.solr.indexing.enabled", false) & solrurls.length() > 0;
try { try {
this.solrConnector = (usesolr) ? new SolrChardingConnector(solrurls, workingScheme, SolrChardingSelection.Method.MODULO_HOST_MD5) : null; this.indexSegments.segment(Segments.Process.LOCALCRAWLING).connectSolr((usesolr) ? new SolrChardingConnector(solrurls, workingScheme, SolrChardingSelection.Method.MODULO_HOST_MD5) : null);
} catch (final IOException e) { } catch (final IOException e) {
Log.logException(e); Log.logException(e);
this.solrConnector = null; this.indexSegments.segment(Segments.Process.LOCALCRAWLING).connectSolr(null);
} }
// start a loader // start a loader
@ -1314,7 +1313,6 @@ public final class Switchboard extends serverSwitch {
Cache.close(); Cache.close();
this.tables.close(); this.tables.close();
Domains.close(); Domains.close();
if (this.solrConnector != null && getConfigBool("federated.service.solr.indexing.enabled", false)) this.solrConnector.close();
AccessTracker.dumpLog(new File("DATA/LOG/queries.log")); AccessTracker.dumpLog(new File("DATA/LOG/queries.log"));
UPnP.deletePortMapping(); UPnP.deletePortMapping();
Tray.removeTray(); Tray.removeTray();
@ -1989,7 +1987,7 @@ public final class Switchboard extends serverSwitch {
public indexingQueueEntry condenseDocument(final indexingQueueEntry in) { public indexingQueueEntry condenseDocument(final indexingQueueEntry in) {
in.queueEntry.updateStatus(Response.QUEUE_STATE_CONDENSING); in.queueEntry.updateStatus(Response.QUEUE_STATE_CONDENSING);
if (this.solrConnector != null && getConfigBool("federated.service.solr.indexing.enabled", false)/*in.queueEntry.profile().pushSolr()*/) { if (this.indexSegments.segment(Segments.Process.LOCALCRAWLING).getSolr() != null && getConfigBool("federated.service.solr.indexing.enabled", false)/*in.queueEntry.profile().pushSolr()*/) {
// send the documents to solr // send the documents to solr
for (final Document doc: in.documents) { for (final Document doc: in.documents) {
try { try {
@ -2000,7 +1998,7 @@ public final class Switchboard extends serverSwitch {
// in case that this happens it appears that the doc id is the right one // in case that this happens it appears that the doc id is the right one
} }
try { try {
this.solrConnector.add(id, in.queueEntry.getResponseHeader(), doc); this.indexSegments.segment(Segments.Process.LOCALCRAWLING).getSolr().add(id, in.queueEntry.getResponseHeader(), doc);
} catch (final IOException e) { } catch (final IOException e) {
Log.logWarning("SOLR", "failed to send " + in.queueEntry.url().toNormalform(true, false) + " to solr: " + e.getMessage()); Log.logWarning("SOLR", "failed to send " + in.queueEntry.url().toNormalform(true, false) + " to solr: " + e.getMessage());
} }

@ -24,6 +24,7 @@
package de.anomic.search; package de.anomic.search;
import java.io.ByteArrayInputStream;
import java.util.Collection; import java.util.Collection;
import java.util.Comparator; import java.util.Comparator;
import java.util.Iterator; import java.util.Iterator;
@ -34,6 +35,7 @@ import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import net.yacy.cora.document.ASCII; import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.services.federated.yacy.CacheStrategy; import net.yacy.cora.services.federated.yacy.CacheStrategy;
import net.yacy.cora.storage.ARC; import net.yacy.cora.storage.ARC;
import net.yacy.cora.storage.ConcurrentARC; import net.yacy.cora.storage.ConcurrentARC;
@ -140,6 +142,7 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
public TextSnippet( public TextSnippet(
final LoaderDispatcher loader, final LoaderDispatcher loader,
final String solrText,
final URIMetadataRow.Components comp, final URIMetadataRow.Components comp,
final HandleSet queryhashes, final HandleSet queryhashes,
final CacheStrategy cacheStrategy, final CacheStrategy cacheStrategy,
@ -156,7 +159,7 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
} }
// try to get snippet from snippetCache // try to get snippet from snippetCache
ResultClass source = ResultClass.SOURCE_CACHE; final ResultClass source = ResultClass.SOURCE_CACHE;
final String wordhashes = yacySearch.set2string(queryhashes); final String wordhashes = yacySearch.set2string(queryhashes);
final String urls = ASCII.String(url.hash()); final String urls = ASCII.String(url.hash());
String snippetLine = snippetsCache.get(wordhashes, urls); String snippetLine = snippetsCache.get(wordhashes, urls);
@ -165,32 +168,37 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
init(url.hash(), snippetLine, source, null); init(url.hash(), snippetLine, source, null);
return; return;
} }
// try to get the snippet from a document at the cache (or in the web)
// this requires that the document is parsed after loading
String textline = null; String textline = null;
HandleSet remainingHashes = queryhashes; HandleSet remainingHashes = queryhashes;
{ //encapsulate potential expensive sentences { //encapsulate potential expensive sentences
final Collection<StringBuilder> sentences; Collection<StringBuilder> sentences = null;
{ //encapsulate potential expensive document
final Document document = loadDocument(loader, comp, queryhashes, cacheStrategy, url, reindexing, source); // try the solr text first
if (document == null) { if (solrText != null) {
return; // compute sentences from solr query
} sentences = Document.getSentences(pre, new ByteArrayInputStream(UTF8.getBytes(solrText)));
}
/* ===========================================================================
* COMPUTE SNIPPET // if then no sentences are found, we fail-over to get the content from the re-loaded document
* =========================================================================== */ if (sentences == null) {
// we have found a parseable non-empty file: use the lines final Document document = loadDocument(loader, comp, queryhashes, cacheStrategy, url, reindexing, source);
if (document == null) {
// compute snippet from text return;
sentences = document.getSentences(pre); }
document.close();
} //encapsulate potential expensive document END // compute sentences from parsed document
sentences = document.getSentences(pre);
if (sentences == null) { document.close();
init(url.hash(), null, ResultClass.ERROR_PARSER_NO_LINES, "parser returned no sentences");
return; if (sentences == null) {
} init(url.hash(), null, ResultClass.ERROR_PARSER_NO_LINES, "parser returned no sentences");
return;
}
}
try { try {
final SnippetExtractor tsr = new SnippetExtractor(sentences, queryhashes, snippetMaxLength); final SnippetExtractor tsr = new SnippetExtractor(sentences, queryhashes, snippetMaxLength);
textline = tsr.getSnippet(); textline = tsr.getSnippet();
@ -227,7 +235,7 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
// document.close(); // document.close();
init(url.hash(), snippetLine, source, null); init(url.hash(), snippetLine, source, null);
} }
private Document loadDocument( private Document loadDocument(
final LoaderDispatcher loader, final LoaderDispatcher loader,
final URIMetadataRow.Components comp, final URIMetadataRow.Components comp,

@ -34,14 +34,13 @@ import net.yacy.cora.protocol.Domains;
import net.yacy.cora.protocol.ResponseHeader; import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.document.Document; import net.yacy.document.Document;
import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
import org.apache.solr.common.SolrDocument; import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList; import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrInputDocument; import org.apache.solr.common.SolrInputDocument;
public class SolrChardingConnector { public class SolrChardingConnector implements SolrConnector {
private final List<SolrSingleConnector> connectors; private final List<SolrSingleConnector> connectors;
private final SolrScheme scheme; private final SolrScheme scheme;
@ -164,13 +163,7 @@ public class SolrChardingConnector {
final long[] size = new long[this.connectors.size()]; final long[] size = new long[this.connectors.size()];
int i = 0; int i = 0;
for (final SolrSingleConnector connector: this.connectors) { for (final SolrSingleConnector connector: this.connectors) {
try { size[i++] = connector.getSize();
final SolrDocumentList list = connector.get("*:*", 0, 1);
size[i++] = list.getNumFound();
} catch (final Exception e) {
Log.logException(e);
size[i++] = 0;
}
} }
return size; return size;
} }

@ -0,0 +1,99 @@
/**
* SolrConnector
* Copyright 2011 by Michael Peter Christen
* First released 13.09.2011 at http://yacy.net
*
* $LastChangedDate: 2011-04-14 22:05:04 +0200 (Do, 14 Apr 2011) $
* $LastChangedRevision: 7654 $
* $LastChangedBy: orbiter $
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.cora.services.federated.solr;
import java.io.IOException;
import java.util.List;
import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.document.Document;
import net.yacy.kelondro.data.meta.DigestURI;
import org.apache.solr.common.SolrDocumentList;
public interface SolrConnector {

    /**
     * with a scheme the fields of a SolrDocument can be translated to actual data values
     * @return the solr scheme that can translate the SolrDocument
     */
    public SolrScheme getScheme();

    /**
     * close this connector and release the resources it holds
     */
    public void close();

    /**
     * delete everything in the solr index
     * @throws IOException
     */
    public void clear() throws IOException;

    /**
     * delete an entry from solr
     * @param id the url hash of the entry
     * @throws IOException
     */
    public void delete(final String id) throws IOException;

    /**
     * delete a set of entries from solr; entries are identified by their url hash
     * @param ids a list of url hashes
     * @throws IOException
     */
    public void delete(final List<String> ids) throws IOException;

    /**
     * add a YaCy document. This calls the scheme processor to add the document as solr document
     * @param id the url hash of the entry
     * @param header the http response header
     * @param doc the YaCy document
     * @throws IOException
     */
    public void add(final String id, final ResponseHeader header, final Document doc) throws IOException;

    /**
     * register an entry as error document
     * @param digestURI the url of the failed document
     * @param failReason a short description why the document failed
     * @param httpstatus the http status code of the failed fetch
     * @throws IOException
     */
    public void err(final DigestURI digestURI, final String failReason, final int httpstatus) throws IOException;

    /**
     * get a query result from solr
     * to get all results set the query String to "*:*"
     * @param querystring the solr query string
     * @param offset the offset of the first result to return
     * @param count the maximum number of results to return
     * @return the matching documents
     * @throws IOException
     */
    public SolrDocumentList get(final String querystring, final int offset, final int count) throws IOException;

    /**
     * get the size of the index
     * @return number of results if solr is queried with a catch-all pattern
     */
    public long getSize();
}

@ -27,6 +27,8 @@ package net.yacy.cora.services.federated.solr;
import java.io.File; import java.io.File;
import java.net.InetAddress; import java.net.InetAddress;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.Collection; import java.util.Collection;
import java.util.Date; import java.util.Date;
import java.util.Map; import java.util.Map;
@ -44,6 +46,7 @@ import net.yacy.document.parser.html.ContentScraper;
import net.yacy.document.parser.html.ImageEntry; import net.yacy.document.parser.html.ImageEntry;
import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.DigestURI;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrInputDocument; import org.apache.solr.common.SolrInputDocument;
public class SolrScheme extends ConfigurationSet { public class SolrScheme extends ConfigurationSet {
@ -349,6 +352,46 @@ public class SolrScheme extends ConfigurationSet {
return solrdoc; return solrdoc;
} }
/**
 * read the document id (the YaCy url hash) from a solr document
 * @param solr the solr document to read
 * @return the value of the "id" field
 */
public String solrGetID(final SolrDocument solr) {
    return (String) solr.getFieldValue("id");
}
/**
 * read the document url from a solr document
 * @param solr the solr document to read
 * @return the url stored in the "sku" field, or null if that value is not a well-formed URL
 */
public DigestURI solrGetURL(final SolrDocument solr) {
    try {
        return new DigestURI((String) solr.getFieldValue("sku"));
    } catch (final MalformedURLException e) {
        // a broken stored url is mapped to null instead of propagating the exception
        return null;
    }
}
/**
 * read the document title from a solr document
 * @param solr the solr document to read
 * @return the value of the "title" field
 */
public String solrGetTitle(final SolrDocument solr) {
    return (String) solr.getFieldValue("title");
}
/**
 * read the full document text from a solr document;
 * this is the text that snippet computation can use instead of re-loading the document
 * @param solr the solr document to read
 * @return the value of the "text_t" field
 */
public String solrGetText(final SolrDocument solr) {
    return (String) solr.getFieldValue("text_t");
}
/**
 * read the document author from a solr document
 * @param solr the solr document to read
 * @return the value of the "author" field
 */
public String solrGetAuthor(final SolrDocument solr) {
    return (String) solr.getFieldValue("author");
}
/**
 * read the document description from a solr document
 * @param solr the solr document to read
 * @return the value of the "description" field
 */
public String solrGetDescription(final SolrDocument solr) {
    return (String) solr.getFieldValue("description");
}
/**
 * read the last-modified date from a solr document
 * @param solr the solr document to read
 * @return the value of the "last_modified" field
 *         NOTE(review): assumes the field is stored as a Date object;
 *         a differently-typed value would cause a ClassCastException — confirm against the scheme
 */
public Date solrGetDate(final SolrDocument solr) {
    return (Date) solr.getFieldValue("last_modified");
}
/**
 * read the keywords of a solr document as a collection of strings
 * @param solr the solr document to read
 * @return the values of the "keywords" field; an empty list if the field is not present
 */
public Collection<String> solrGetKeywords(final SolrDocument solr) {
    final ArrayList<String> a = new ArrayList<String>();
    // SolrDocument.getFieldValues returns null for an absent field;
    // the unguarded loop would throw a NullPointerException in that case
    final Collection<Object> c = solr.getFieldValues("keywords");
    if (c == null) return a;
    for (final Object s: c) {
        a.add((String) s);
    }
    return a;
}
/* /*
* standard solr scheme * standard solr scheme

@ -57,7 +57,7 @@ import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrInputDocument; import org.apache.solr.common.SolrInputDocument;
public class SolrSingleConnector { public class SolrSingleConnector implements SolrConnector {
private final String solrurl, host, solrpath, solraccount, solrpw; private final String solrurl, host, solrpath, solraccount, solrpw;
private final int port; private final int port;
@ -178,6 +178,22 @@ public class SolrSingleConnector {
} }
} }
/**
 * with a scheme the fields of a SolrDocument can be translated to actual data values
 * @return the solr scheme used by this connector
 */
@Override
public SolrScheme getScheme() {
    return this.scheme;
}
/**
 * get the size of the index by issuing a catch-all query and reading
 * the total hit count from the result
 * @return the number of documents in the index, or 0 if the query fails
 */
@Override
public long getSize() {
    try {
        final SolrDocumentList list = get("*:*", 0, 1);
        return list.getNumFound();
    } catch (final Exception e) {
        // deliberate best-effort: a failing solr query is logged and
        // reported as an empty index instead of propagating the error
        Log.logException(e);
        return 0;
    }
}
/** /**
* delete everything in the solr index * delete everything in the solr index
* @throws IOException * @throws IOException
@ -325,6 +341,16 @@ public class SolrSingleConnector {
//return result; //return result;
} }
/**
 * compute the URL of the solr admin web interface, rewriting a loopback
 * host reference ("localhost" or "127.0.0.1") in the configured solr url
 * to this peer's public local IP so the link also works from other hosts
 * @return the admin interface URL, always ending with "admin/"
 */
public String getAdminInterface() {
    final InetAddress localhostExternAddress = Domains.myPublicLocalIP();
    final String localhostExtern = localhostExternAddress == null ? "127.0.0.1" : localhostExternAddress.getHostAddress();
    String u = this.solrurl;
    // remember which loopback literal matched so exactly its length is replaced;
    // the original hard-coded length 9 only worked because both literals happen to be 9 chars
    String loopback = "localhost";
    int p = u.indexOf(loopback);
    if (p < 0) {
        loopback = "127.0.0.1";
        p = u.indexOf(loopback);
    }
    if (p >= 0) u = u.substring(0, p) + localhostExtern + u.substring(p + loopback.length());
    return u + (u.endsWith("/") ? "admin/" : "/admin/");
}
public static void main(final String args[]) { public static void main(final String args[]) {
SolrSingleConnector solr; SolrSingleConnector solr;
try { try {
@ -347,5 +373,4 @@ public class SolrSingleConnector {
e.printStackTrace(); e.printStackTrace();
} }
} }
} }

@ -312,8 +312,12 @@ dc_rights
} }
public List<StringBuilder> getSentences(final boolean pre) { public List<StringBuilder> getSentences(final boolean pre) {
if (this.text == null) return null; return getSentences(pre, getText());
final SentenceReader e = new SentenceReader(getText()); }
public static List<StringBuilder> getSentences(final boolean pre, final InputStream text) {
if (text == null) return null;
final SentenceReader e = new SentenceReader(text);
e.pre(pre); e.pre(pre);
final List<StringBuilder> sentences = new ArrayList<StringBuilder>(); final List<StringBuilder> sentences = new ArrayList<StringBuilder>();
while (e.hasNext()) { while (e.hasNext()) {

Loading…
Cancel
Save