changed options in IndexFederated_p to switch on/off parts of the index

individually. The settings are experimental and the values of the
settings will be overwritten when an index migration from urldb to solr
starts.
pull/1/head
Michael Peter Christen 12 years ago
parent cba4ab862e
commit 826967513b

@ -1047,11 +1047,15 @@ federated.service.solr.indexing.schemefile = solr.keys.default.list
# the lazy attribute causes fields containing "" or 0 to be neither added nor written
federated.service.solr.indexing.lazy = true
# the indexing engine in YaCy can be switched off or on
# (off may make sense if federated.service.solr.indexing.enabled = true)
# for experiments the value federated.service.yacy.indexing.engine = solr may be used
# allowed values are: classic, solr, off
federated.service.yacy.indexing.engine = classic
# Temporary definition of the backend services to use. The standard is urldb+rwi, but in the future it should be rwi+solr.
# To get a handle for a migration, these values are defined as temporary; when the migration starts, the values are renamed
# and defined with different default values.
# The citation service is used for ranking; this is a reverse linking index. It should be on before and after the migration.
# It can be switched off if only a remote solr index is used.
core.service.urldb.tmp = true
core.service.rwi.tmp = true
core.service.solr.tmp = false
core.service.citation.tmp = true
# RDF triplestore settings
triplestore.persistent = true

@ -21,12 +21,15 @@
<form action="IndexFederated_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8">
<fieldset>
<legend>
The built-in search index can either be 'classic' (as before YaCy 1.03), 'solr' (experimental since 1.03) and 'off' (useful only if a remote solr index is used)
Local Search Index
</legend>
This is an experimental switchboard to test an index migration from embedded metadata to embedded solr. The 'classic' configuration is rwi + metadata switched on. The future configuration is rwi + solr switched on.
The rwi index is necessary for index transmission and shall be switched off in future portalmode configurations.
<dl>
<dt><input type="radio" name="yacy.indexing" value="classic" id="yacy.indexing.engine.classic" #(yacy.indexing.engine.classic.checked)#:: checked="checked"#(/yacy.indexing.engine.classic.checked)# /></dt><dd>embedded 'classic' search index</dd>
<dt><input type="radio" name="yacy.indexing" value="solr" id="yacy.indexing.engine.solr" #(yacy.indexing.engine.solr.checked)#:: checked="checked"#(/yacy.indexing.engine.solr.checked)# /></dt><dd>embedded solr search index</dd>
<dt><input type="radio" name="yacy.indexing" value="off" id="yacy.indexing.engine.off" #(yacy.indexing.engine.off.checked)#:: checked="checked"#(/yacy.indexing.engine.off.checked)# /></dt><dd>no local index</dd>
<dt><input type="checkbox" name="core.service.rwi.tmp" id="core.service.rwi" #(core.service.rwi.tmp.checked)#:: checked="checked"#(/core.service.rwi.tmp.checked)# /></dt><dd>embedded 'classic' rwi index</dd>
<dt><input type="checkbox" name="core.service.urldb.tmp" id="core.service.urldb" #(core.service.urldb.tmp.checked)#:: checked="checked"#(/core.service.urldb.tmp.checked)# /></dt><dd>embedded 'classic' metadata index</dd>
<dt><input type="checkbox" name="core.service.solr.tmp" id="core.service.solr" #(core.service.solr.tmp.checked)#:: checked="checked"#(/core.service.solr.tmp.checked)# /></dt><dd>embedded solr search index</dd>
<dt><input type="checkbox" name="core.service.citation.tmp" id="core.service.citation" #(core.service.citation.tmp.checked)#:: checked="checked"#(/core.service.citation.tmp.checked)# /></dt><dd>embedded citation reference index (link structure, used for ranking)</dd>
<dt></dt><dd><input type="submit" name="set" value="Set" /></dd>
</dl>
</fieldset>

@ -36,6 +36,7 @@ import net.yacy.cora.services.federated.solr.SingleSolrConnector;
import net.yacy.cora.services.federated.solr.SolrConnector;
import net.yacy.cora.storage.ConfigurationSet;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.OS;
import net.yacy.search.Switchboard;
import net.yacy.search.SwitchboardConstants;
import net.yacy.search.index.SolrField;
@ -51,26 +52,40 @@ public class IndexFederated_p {
if (post != null && post.containsKey("set")) {
// yacy
String localindex = post.get("yacy.indexing", "off"); // possible values: classic, solr, off
final boolean solrLocalWasOn = sb.index.getLocalSolr() != null && env.getConfig(SwitchboardConstants.FEDERATED_SERVICE_YACY_INDEXING_ENGINE, "off").equals("solr");
final boolean solrLocalIsOnAfterwards = localindex.equals("solr");
env.setConfig(SwitchboardConstants.FEDERATED_SERVICE_YACY_INDEXING_ENGINE, localindex);
if (solrLocalWasOn && !solrLocalIsOnAfterwards) {
sb.index.disconnectLocalSolr();
}
boolean post_core_rwi = post.getBoolean(SwitchboardConstants.CORE_SERVICE_RWI);
final boolean previous_core_rwi = sb.index.connectedRWI() && env.getConfigBool(SwitchboardConstants.CORE_SERVICE_RWI, false);
env.setConfig(SwitchboardConstants.CORE_SERVICE_RWI, post_core_rwi);
if (previous_core_rwi && !post_core_rwi) sb.index.disconnectRWI(); // switch off
if (!previous_core_rwi && post_core_rwi) try {
final int wordCacheMaxCount = (int) sb.getConfigLong(SwitchboardConstants.WORDCACHE_MAX_COUNT, 20000);
final long fileSizeMax = (OS.isWindows) ? sb.getConfigLong("filesize.max.win", Integer.MAX_VALUE) : sb.getConfigLong( "filesize.max.other", Integer.MAX_VALUE);
sb.index.connectRWI(wordCacheMaxCount, fileSizeMax);
} catch (IOException e) { Log.logException(e); } // switch on
boolean post_core_citation = post.getBoolean(SwitchboardConstants.CORE_SERVICE_CITATION);
final boolean previous_core_citation = sb.index.connectedCitation() && env.getConfigBool(SwitchboardConstants.CORE_SERVICE_CITATION, false);
env.setConfig(SwitchboardConstants.CORE_SERVICE_CITATION, post_core_citation);
if (previous_core_citation && !post_core_citation) sb.index.disconnectCitation(); // switch off
if (!previous_core_citation && post_core_citation) try {
final int wordCacheMaxCount = (int) sb.getConfigLong(SwitchboardConstants.WORDCACHE_MAX_COUNT, 20000);
final long fileSizeMax = (OS.isWindows) ? sb.getConfigLong("filesize.max.win", Integer.MAX_VALUE) : sb.getConfigLong( "filesize.max.other", Integer.MAX_VALUE);
sb.index.connectCitation(wordCacheMaxCount, fileSizeMax);
} catch (IOException e) { Log.logException(e); } // switch on
boolean post_core_solr = post.getBoolean(SwitchboardConstants.CORE_SERVICE_SOLR);
final boolean previous_core_solr = sb.index.connectedLocalSolr() && env.getConfigBool(SwitchboardConstants.CORE_SERVICE_SOLR, false);
env.setConfig(SwitchboardConstants.CORE_SERVICE_SOLR, post_core_solr);
if (previous_core_solr && !post_core_solr) sb.index.disconnectLocalSolr(); // switch off
if (!previous_core_solr && post_core_solr) try { sb.index.connectLocalSolr(); } catch (IOException e) { Log.logException(e); } // switch on
boolean post_core_urldb = post.getBoolean(SwitchboardConstants.CORE_SERVICE_URLDB);
final boolean previous_core_urldb = sb.index.connectedUrlDb() && env.getConfigBool(SwitchboardConstants.CORE_SERVICE_URLDB, false);
env.setConfig(SwitchboardConstants.CORE_SERVICE_URLDB, post_core_urldb);
if (previous_core_urldb && !post_core_urldb) sb.index.disconnectUrlDb(); // switch off
if (!previous_core_urldb && post_core_urldb) sb.index.connectUrlDb(sb.useTailCache, sb.exceed134217727);
if (!solrLocalWasOn && solrLocalIsOnAfterwards) {
// switch on
try {
sb.index.connectLocalSolr();
} catch (IOException e) {
Log.logException(e);
}
}
// solr
final boolean solrRemoteWasOn = sb.index.getRemoteSolr() != null && env.getConfigBool(SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_ENABLED, true);
final boolean solrRemoteWasOn = sb.index.connectedRemoteSolr() && env.getConfigBool(SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_ENABLED, true);
final boolean solrRemoteIsOnAfterwards = post.getBoolean("solr.indexing.solrremote");
env.setConfig(SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_ENABLED, solrRemoteIsOnAfterwards);
String solrurls = post.get("solr.indexing.url", env.getConfig(SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_URL, "http://127.0.0.1:8983/solr"));
@ -198,9 +213,11 @@ public class IndexFederated_p {
// fill attribute fields
// allowed values are: classic, solr, off
// federated.service.yacy.indexing.engine = classic
prop.put("yacy.indexing.engine.classic.checked", env.getConfig(SwitchboardConstants.FEDERATED_SERVICE_YACY_INDEXING_ENGINE, "classic").equals("classic") ? 1 : 0);
prop.put("yacy.indexing.engine.solr.checked", env.getConfig(SwitchboardConstants.FEDERATED_SERVICE_YACY_INDEXING_ENGINE, "classic").equals("solr") ? 1 : 0);
prop.put("yacy.indexing.engine.off.checked", env.getConfig(SwitchboardConstants.FEDERATED_SERVICE_YACY_INDEXING_ENGINE, "classic").equals("off") ? 1 : 0);
prop.put(SwitchboardConstants.CORE_SERVICE_URLDB + ".checked", env.getConfigBool(SwitchboardConstants.CORE_SERVICE_URLDB, false) ? 1 : 0);
prop.put(SwitchboardConstants.CORE_SERVICE_RWI + ".checked", env.getConfigBool(SwitchboardConstants.CORE_SERVICE_RWI, false) ? 1 : 0);
prop.put(SwitchboardConstants.CORE_SERVICE_SOLR + ".checked", env.getConfigBool(SwitchboardConstants.CORE_SERVICE_SOLR, false) ? 1 : 0);
prop.put(SwitchboardConstants.CORE_SERVICE_CITATION + ".checked", env.getConfigBool(SwitchboardConstants.CORE_SERVICE_CITATION, false) ? 1 : 0);
prop.put("solr.indexing.solrremote.checked", env.getConfigBool(SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_ENABLED, false) ? 1 : 0);
prop.put("solr.indexing.url", env.getConfig(SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_URL, "http://127.0.0.1:8983/solr").replace(",", "\n"));
prop.put("solr.indexing.commitWithinMs", env.getConfigInt(SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_COMMITWITHINMS, 180000));

@ -32,7 +32,6 @@ import net.yacy.cora.lod.vocabulary.Tagging;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.sorting.ScoreMap;
import net.yacy.document.LibraryProvider;
import net.yacy.kelondro.util.Formatter;
import net.yacy.peers.graphics.ProfilingGraph;
import net.yacy.search.EventTracker;
import net.yacy.search.query.QueryParams;

@ -40,6 +40,7 @@ import net.yacy.kelondro.index.Row;
import net.yacy.kelondro.index.RowSet;
import net.yacy.kelondro.index.RowSpaceExceededException;
import net.yacy.kelondro.logging.Log;
import net.yacy.search.index.Segment;
public final class ReferenceContainerArray<ReferenceType extends Reference> {
@ -453,7 +454,7 @@ public final class ReferenceContainerArray<ReferenceType extends Reference> {
final HandleMap references = new HandleMap(payloadrow.primaryKeyLength, termOrder, 4, 1000000, heapLocation.getAbsolutePath());
final String[] files = heapLocation.list();
for (final String f: files) {
if (f.length() < 22 || !f.startsWith("text.index") || !f.endsWith(".blob")) continue;
if (f.length() < 22 || !f.startsWith(Segment.termIndexName) || !f.endsWith(".blob")) continue;
final File fl = new File(heapLocation, f);
System.out.println("CELL REFERENCE COLLECTION opening blob " + fl);
final CloneableIterator<ReferenceContainer<ReferenceType>> ei = new ReferenceIterator<ReferenceType>(fl, factory);

@ -369,10 +369,7 @@ public final class Switchboard extends serverSwitch
// start indexing management
this.log.logConfig("Starting Indexing Management");
final String networkName = getConfig(SwitchboardConstants.NETWORK_NAME, "");
final long fileSizeMax =
(OS.isWindows) ? sb.getConfigLong("filesize.max.win", Integer.MAX_VALUE) : sb.getConfigLong(
"filesize.max.other",
Integer.MAX_VALUE);
final long fileSizeMax = (OS.isWindows) ? sb.getConfigLong("filesize.max.win", Integer.MAX_VALUE) : sb.getConfigLong( "filesize.max.other", Integer.MAX_VALUE);
final int redundancy = (int) sb.getConfigLong("network.unit.dhtredundancy.senior", 1);
final int partitionExponent = (int) sb.getConfigLong("network.unit.dht.partitionExponent", 0);
this.networkRoot = new File(new File(indexPath, networkName), "NETWORK");
@ -383,21 +380,12 @@ public final class Switchboard extends serverSwitch
// initialize index
ReferenceContainer.maxReferences = getConfigInt("index.maxReferences", 0);
final File segmentsPath = new File(new File(indexPath, networkName), "SEGMENTS");
final boolean solrLocal = this.getConfig(SwitchboardConstants.FEDERATED_SERVICE_YACY_INDEXING_ENGINE, "off").equals("solr");
this.index =
new Segment(
this.log,
new File(segmentsPath, "default"),
wordCacheMaxCount,
fileSizeMax,
this.useTailCache,
this.exceed134217727,
solrLocal,
true, // useCitationIndex
true, // useRWI
true // useMetadata
);
this.index = new Segment(this.log, new File(segmentsPath, "default"));
if (this.getConfigBool(SwitchboardConstants.CORE_SERVICE_RWI, true)) this.index.connectRWI(wordCacheMaxCount, fileSizeMax);
if (this.getConfigBool(SwitchboardConstants.CORE_SERVICE_CITATION, true)) this.index.connectCitation(wordCacheMaxCount, fileSizeMax);
if (this.getConfigBool(SwitchboardConstants.CORE_SERVICE_URLDB, true)) this.index.connectUrlDb(this.useTailCache, this.exceed134217727);
if (this.getConfigBool(SwitchboardConstants.CORE_SERVICE_SOLR, true)) this.index.connectLocalSolr();
// prepare a solr index profile switch list
final File solrBackupProfile = new File("defaults/solr.keys.list");
final String schemename =
@ -417,7 +405,6 @@ public final class Switchboard extends serverSwitch
// set up the solr interface
final String solrurls = getConfig(SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_URL, "http://127.0.0.1:8983/solr");
final boolean usesolr = getConfigBool(SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_ENABLED, false) & solrurls.length() > 0;
int commitWithinMs = getConfigInt(SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_COMMITWITHINMS, 180000);
if (usesolr && solrurls != null && solrurls.length() > 0) {
try {
@ -425,7 +412,7 @@ public final class Switchboard extends serverSwitch
solrurls,
ShardSelection.Method.MODULO_HOST_MD5,
10000, true);
solr.setCommitWithinMs(commitWithinMs);
solr.setCommitWithinMs(getConfigInt(SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_COMMITWITHINMS, 180000));
this.index.connectRemoteSolr(solr);
} catch ( final IOException e ) {
Log.logException(e);
@ -1186,7 +1173,6 @@ public final class Switchboard extends serverSwitch
setConfig("heuristic.site", false);
setConfig("heuristic.blekko", false);
final boolean solrLocal = this.getConfig(SwitchboardConstants.FEDERATED_SERVICE_YACY_INDEXING_ENGINE, "off").equals("solr");
// relocate
this.peers.relocate(
this.networkRoot,
@ -1194,22 +1180,31 @@ public final class Switchboard extends serverSwitch
partitionExponent,
this.useTailCache,
this.exceed134217727);
this.index =
new Segment(
this.log,
new File(new File(new File(indexPrimaryPath, networkName), "SEGMENTS"), "default"),
wordCacheMaxCount,
fileSizeMax,
this.useTailCache,
this.exceed134217727,
solrLocal,
true, // useCitationIndex
true, // useRWI
true // useMetadata
);
this.crawlQueues.relocate(this.queuesRoot); // cannot be closed because the busy threads are working with that object
this.index = new Segment(this.log, new File(new File(new File(indexPrimaryPath, networkName), "SEGMENTS"), "default"));
if (this.getConfigBool(SwitchboardConstants.CORE_SERVICE_RWI, true)) this.index.connectRWI(wordCacheMaxCount, fileSizeMax);
if (this.getConfigBool(SwitchboardConstants.CORE_SERVICE_CITATION, true)) this.index.connectCitation(wordCacheMaxCount, fileSizeMax);
if (this.getConfigBool(SwitchboardConstants.CORE_SERVICE_SOLR, true)) this.index.connectLocalSolr();
if (this.getConfigBool(SwitchboardConstants.CORE_SERVICE_URLDB, true)) this.index.connectUrlDb(this.useTailCache, this.exceed134217727);
// set up the solr interface
final String solrurls = getConfig(SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_URL, "http://127.0.0.1:8983/solr");
final boolean usesolr = getConfigBool(SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_ENABLED, false) & solrurls.length() > 0;
if (usesolr && solrurls != null && solrurls.length() > 0) {
try {
SolrConnector solr = new ShardSolrConnector(
solrurls,
ShardSelection.Method.MODULO_HOST_MD5,
10000, true);
solr.setCommitWithinMs(getConfigInt(SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_COMMITWITHINMS, 180000));
this.index.connectRemoteSolr(solr);
} catch ( final IOException e ) {
Log.logException(e);
}
}
// create a crawler
this.crawlQueues.relocate(this.queuesRoot); // cannot be closed because the busy threads are working with that object
this.crawler = new CrawlSwitchboard(networkName, this.log, this.queuesRoot);
// init a DHT transmission dispatcher
@ -2456,7 +2451,7 @@ public final class Switchboard extends serverSwitch
}
// check if we should accept the document for our index
if (!getConfig(SwitchboardConstants.FEDERATED_SERVICE_YACY_INDEXING_ENGINE, "classic").equals("classic")) {
if (!this.getConfigBool(SwitchboardConstants.CORE_SERVICE_RWI, true)) {
if ( this.log.isInfo() ) {
this.log.logInfo("Not Condensed Resource '"
+ in.queueEntry.url().toNormalform(false, true)

@ -296,8 +296,12 @@ public final class SwitchboardConstants {
public static final String FEDERATED_SERVICE_SOLR_INDEXING_SHARDING = "federated.service.solr.indexing.sharding";
public static final String FEDERATED_SERVICE_SOLR_INDEXING_SCHEMEFILE = "federated.service.solr.indexing.schemefile";
public static final String FEDERATED_SERVICE_SOLR_INDEXING_LAZY = "federated.service.solr.indexing.lazy";
public static final String FEDERATED_SERVICE_YACY_INDEXING_ENGINE = "federated.service.yacy.indexing.engine";
// Configuration keys that switch the individual core index services on or off.
// The ".tmp" suffix is part of each key name as written in the configuration.
public static final String CORE_SERVICE_URLDB = "core.service.urldb.tmp";
public static final String CORE_SERVICE_RWI = "core.service.rwi.tmp";
public static final String CORE_SERVICE_SOLR = "core.service.solr.tmp";
public static final String CORE_SERVICE_CITATION = "core.service.citation.tmp";
/**
* <p><code>public static final String <strong>CRAWLER_THREADS_ACTIVE_MAX</strong> = "crawler.MaxActiveThreads"</code></p>
* <p>Name of the setting for the maximum number of crawler threads that may be active at the same time</p>

@ -75,19 +75,14 @@ public class DocumentIndex extends Segment
public DocumentIndex(final File segmentPath, final CallbackListener callback, final int cachesize)
throws IOException {
super(
new Log("DocumentIndex"),
segmentPath,
cachesize,
targetFileSize * 4 - 1,
false, // useTailCache
false, // exceed134217727
true, // connectLocalSolr
true, // useCitationIndex
true, // useRWI
true // useMetadata
);
super(new Log("DocumentIndex"), segmentPath);
super.connectRWI(cachesize, targetFileSize * 4 - 1);
super.connectCitation(cachesize, targetFileSize * 4 - 1);
super.connectUrlDb(
false, // useTailCache
false // exceed134217727
);
super.connectLocalSolr();
final int cores = Runtime.getRuntime().availableProcessors() + 1;
this.callback = callback;
this.queue = new LinkedBlockingQueue<DigestURI>(cores * 300);

@ -66,6 +66,7 @@ import net.yacy.search.Switchboard;
import net.yacy.search.solr.EmbeddedSolrConnector;
import org.apache.lucene.util.Version;
import de.anomic.crawler.CrawlStacker;
public final class MetadataRepository implements /*Metadata,*/ Iterable<byte[]> {
@ -87,7 +88,11 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable<byte[]>
this.remoteSolr = null;
this.localSolr = null;
}
/**
 * Tells whether the url database is currently connected.
 *
 * @return true if the urlIndexFile field is set, false if it is null
 */
public boolean connectedUrlDb() {
return this.urlIndexFile != null;
}
public void connectUrlDb(final String tablename, final boolean useTailCache, final boolean exceed134217727) {
if (this.urlIndexFile != null) return;
this.tablename = tablename;
@ -100,6 +105,10 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable<byte[]>
this.urlIndexFile = null;
}
/**
 * Tells whether a local solr connector is currently set.
 *
 * @return true if the localSolr field is set, false if it is null
 */
public boolean connectedLocalSolr() {
return this.localSolr != null;
}
public void connectLocalSolr() throws IOException {
File solrLocation = this.location;
if (solrLocation.getName().equals("default")) solrLocation = solrLocation.getParentFile();
@ -120,6 +129,10 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable<byte[]>
this.localSolr = null;
}
/**
 * Tells whether a remote solr connector is currently set.
 *
 * @return true if the remoteSolr field is set, false if it is null
 */
public boolean connectedRemoteSolr() {
return this.remoteSolr != null;
}
/**
 * Stores the given connector as the remote solr connector.
 * This method only assigns the field; a connector that was set before is
 * replaced without being closed here.
 *
 * @param solr the connector to use as remote solr
 */
public void connectRemoteSolr(final SolrConnector solr) {
this.remoteSolr = solr;
}
@ -129,7 +142,7 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable<byte[]>
this.remoteSolr.close();
this.remoteSolr = null;
}
/**
 * Gives access to the local solr connector.
 *
 * @return the current value of the localSolr field; null when no local solr is set
 */
public SolrConnector getLocalSolr() {
return this.localSolr;
}
@ -248,7 +261,7 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable<byte[]>
//entry = oldEntry;
return; // this did not need to be stored, but is updated
}
try {
this.urlIndexFile.put(((URIMetadataRow) entry).toRowEntry());
} catch (final RowSpaceExceededException e) {

@ -90,87 +90,106 @@ public class Segment {
// size and name constants handed to the IndexCell and url database constructors
public static final long targetFileSize = 64 * 1024 * 1024; // 64 MB
public static final int writeBufferSize = 4 * 1024 * 1024; // 4 MB
public static final String UrlDbName = "text.urlmd"; // table name passed to connectUrlDb
public static final String termIndexName = "text.index"; // name passed to the term IndexCell
public static final String citationIndexName = "citation.index"; // name passed to the citation IndexCell
// the reference factory
public static final ReferenceFactory<WordReference> wordReferenceFactory = new WordReferenceFactory();
public static final ReferenceFactory<CitationReference> citationReferenceFactory = new CitationReferenceFactory();
//public static final ReferenceFactory<NavigationReference> navigationReferenceFactory = new NavigationReferenceFactory();
public static final ByteOrder wordOrder = Base64Order.enhancedCoder;
private final Log log;
protected final IndexCell<WordReference> termIndex;
protected final IndexCell<CitationReference> urlCitationIndex;
//private final IndexCell<NavigationReference> authorNavIndex;
protected final MetadataRepository urlMetadata;
private final File segmentPath;
protected final MetadataRepository urlMetadata;
protected IndexCell<WordReference> termIndex;
protected IndexCell<CitationReference> urlCitationIndex;
public Segment(
final Log log,
final File segmentPath,
final int entityCacheMaxSize,
final long maxFileSize,
final boolean useTailCache,
final boolean exceed134217727,
final boolean connectLocalSolr,
final boolean useCitationIndex,
final boolean useRWI,
final boolean useMetadata) throws IOException {
public Segment(final Log log, final File segmentPath) {
log.logInfo("Initializing Segment '" + segmentPath + ".");
this.log = log;
this.segmentPath = segmentPath;
this.termIndex = useRWI ? new IndexCell<WordReference>(
segmentPath,
"text.index",
wordReferenceFactory,
wordOrder,
Word.commonHashLength,
entityCacheMaxSize,
targetFileSize,
maxFileSize,
writeBufferSize) : null;
this.urlCitationIndex = useCitationIndex ? new IndexCell<CitationReference>(
segmentPath,
"citation.index",
citationReferenceFactory,
wordOrder,
Word.commonHashLength,
entityCacheMaxSize,
targetFileSize,
maxFileSize,
writeBufferSize) : null;
// create LURL-db
this.urlMetadata = new MetadataRepository(segmentPath);
if (useMetadata) this.urlMetadata.connectUrlDb(UrlDbName, useTailCache, exceed134217727);
if (connectLocalSolr) this.connectLocalSolr();
}
public long URLCount() {
return this.urlMetadata.size();
/**
 * Tells whether the term index (rwi) is currently connected.
 *
 * @return true if the termIndex field is set, false if it is null
 */
public boolean connectedRWI() {
return this.termIndex != null;
}
public long RWICount() {
if (this.termIndex == null) return 0;
return this.termIndex.sizesMax();
/**
 * Creates the term index (rwi) for this segment unless one is already connected.
 * A second call while the index is connected leaves the existing index untouched.
 *
 * @param entityCacheMaxSize forwarded to the IndexCell constructor
 * @param maxFileSize forwarded to the IndexCell constructor
 * @throws IOException as declared; raised while creating the IndexCell
 */
public void connectRWI(final int entityCacheMaxSize, final long maxFileSize) throws IOException {
    if (this.termIndex == null) {
        // only build a new cell when none is connected yet
        this.termIndex = new IndexCell<WordReference>(
                this.segmentPath, termIndexName, wordReferenceFactory, wordOrder,
                Word.commonHashLength, entityCacheMaxSize, targetFileSize, maxFileSize,
                writeBufferSize);
    }
}
public int RWIBufferCount() {
if (this.termIndex == null) return 0;
return this.termIndex.getBufferSize();
/**
 * Closes the term index (rwi) and clears the reference to it.
 * Does nothing when no term index is connected.
 */
public void disconnectRWI() {
    if (this.termIndex != null) {
        this.termIndex.close();
        this.termIndex = null;
    }
}
/**
 * Tells whether the citation index is currently connected.
 *
 * @return true if the urlCitationIndex field is set, false if it is null
 */
public boolean connectedCitation() {
return this.urlCitationIndex != null;
}
/**
 * Creates the citation index for this segment unless one is already connected.
 * A second call while the index is connected leaves the existing index untouched.
 *
 * @param entityCacheMaxSize forwarded to the IndexCell constructor
 * @param maxFileSize forwarded to the IndexCell constructor
 * @throws IOException as declared; raised while creating the IndexCell
 */
public void connectCitation(final int entityCacheMaxSize, final long maxFileSize) throws IOException {
    if (this.urlCitationIndex == null) {
        // only build a new cell when none is connected yet
        this.urlCitationIndex = new IndexCell<CitationReference>(
                this.segmentPath, citationIndexName, citationReferenceFactory, wordOrder,
                Word.commonHashLength, entityCacheMaxSize, targetFileSize, maxFileSize,
                writeBufferSize);
    }
}
/**
 * Closes the citation index and clears the reference to it.
 * Does nothing when no citation index is connected.
 */
public void disconnectCitation() {
    if (this.urlCitationIndex != null) {
        this.urlCitationIndex.close();
        this.urlCitationIndex = null;
    }
}
/**
 * Tells whether the url database of the metadata repository is connected.
 *
 * @return the result of connectedUrlDb() on the metadata repository
 */
public boolean connectedUrlDb() {
return this.urlMetadata.connectedUrlDb();
}
/**
 * Connects the url database of the metadata repository, using UrlDbName as table name.
 *
 * @param useTailCache forwarded unchanged to the metadata repository
 * @param exceed134217727 forwarded unchanged to the metadata repository
 */
public void connectUrlDb(final boolean useTailCache, final boolean exceed134217727) {
this.urlMetadata.connectUrlDb(UrlDbName, useTailCache, exceed134217727);
}
/**
 * Disconnects the url database by delegating to the metadata repository.
 */
public void disconnectUrlDb() {
this.urlMetadata.disconnectUrlDb();
}
/**
 * Tells whether the metadata repository has a remote solr connector.
 *
 * @return the result of connectedRemoteSolr() on the metadata repository
 */
public boolean connectedRemoteSolr() {
return this.urlMetadata.connectedRemoteSolr();
}
/**
 * Hands the given connector to the metadata repository as its remote solr connector.
 *
 * @param solr the connector to use as remote solr
 */
public void connectRemoteSolr(final SolrConnector solr) {
this.urlMetadata.connectRemoteSolr(solr);
}
/**
 * Disconnects the remote solr connector by delegating to the metadata repository.
 */
public void disconnectRemoteSolr() {
this.urlMetadata.disconnectRemoteSolr();
}
/**
 * Tells whether the metadata repository has a local solr connector.
 *
 * @return the result of connectedLocalSolr() on the metadata repository
 */
public boolean connectedLocalSolr() {
return this.urlMetadata.connectedLocalSolr();
}
/**
 * Connects the local solr index by delegating to the metadata repository.
 *
 * @throws IOException passed on from the metadata repository's connectLocalSolr()
 */
public void connectLocalSolr() throws IOException {
this.urlMetadata.connectLocalSolr();
}
@ -199,6 +218,20 @@ public class Segment {
return this.urlCitationIndex;
}
/**
 * Reports the size of the metadata repository.
 *
 * @return the value of size() on the metadata repository
 */
public long URLCount() {
return this.urlMetadata.size();
}
/**
 * Reports the size of the term index (rwi) as given by its sizesMax() method.
 *
 * @return 0 when no term index is connected, otherwise termIndex.sizesMax()
 */
public long RWICount() {
    return (this.termIndex == null) ? 0 : this.termIndex.sizesMax();
}
/**
 * Reports the buffer size of the term index (rwi) as given by its getBufferSize() method.
 *
 * @return 0 when no term index is connected, otherwise termIndex.getBufferSize()
 */
public int RWIBufferCount() {
    return (this.termIndex == null) ? 0 : this.termIndex.getBufferSize();
}
/**
 * Asks the metadata repository whether it has an entry for the given url hash.
 *
 * @param urlhash the url hash to look up
 * @return the result of exists(urlhash) on the metadata repository
 */
public boolean exists(final byte[] urlhash) {
return this.urlMetadata.exists(urlhash);
}
@ -382,7 +415,7 @@ public class Segment {
MultiProtocolURI anchor = anchorEntry.getKey();
byte[] refhash = new DigestURI(anchor).hash();
//System.out.println("*** addCitationIndex: urlhash = " + ASCII.String(urlhash) + ", refhash = " + ASCII.String(refhash) + ", urldate = " + urlModified.toString());
try {
if (this.urlCitationIndex != null) try {
this.urlCitationIndex.add(refhash, new CitationReference(urlhash, urldate));
} catch (final Exception e) {
Log.logException(e);

@ -662,22 +662,13 @@ public final class yacy {
// db used to hold all needed urls
final MetadataRepository minimizedUrlDB = new MetadataRepository(new File(new File(indexRoot2, networkName), "TEXT"));
minimizedUrlDB.connectUrlDb(Segment.UrlDbName, false, false);
final int cacheMem = (int)(MemoryControl.maxMemory() - MemoryControl.total());
if (cacheMem < 2048000) throw new OutOfMemoryError("Not enough memory available to start clean up.");
final Segment wordIndex = new Segment(
log,
new File(new File(indexPrimaryRoot, "freeworld"), "TEXT"),
10000,
Integer.MAX_VALUE,
false, // useTailCache
false, // exceed134217727
false, // connectLocalSolr
false, // useCitationIndex
true, // useRWI
true // useMetadata
);
final Segment wordIndex = new Segment(log, new File(new File(indexPrimaryRoot, "freeworld"), "TEXT"));
wordIndex.connectRWI(10000, Integer.MAX_VALUE);
wordIndex.connectUrlDb(false, false);
final Iterator<ReferenceContainer<WordReference>> indexContainerIterator = wordIndex.termIndex().referenceContainerIterator("AAAAAAAAAAAA".getBytes(), false, false);
long urlCounter = 0, wordCounter = 0;
@ -854,18 +845,9 @@ public final class yacy {
try {
Iterator<ReferenceContainer<WordReference>> indexContainerIterator = null;
if (resource.equals("all")) {
WordIndex = new Segment(
log,
new File(new File(indexPrimaryRoot, "freeworld"), "TEXT"),
10000,
Integer.MAX_VALUE,
false, // useTailCache
false, // exceed134217727
false, // connectLocalSolr
false, // useCitationIndex
true, // useRWI
true // useMetadata
);
WordIndex = new Segment(log, new File(new File(indexPrimaryRoot, "freeworld"), "TEXT"));
WordIndex.connectRWI(10000, Integer.MAX_VALUE);
WordIndex.connectUrlDb(false, false);
indexContainerIterator = WordIndex.termIndex().referenceContainerIterator(wordChunkStartHash.getBytes(), false, false);
}
int counter = 0;

Loading…
Cancel
Save