added reindex option for documents with disabled or obsolete fields to Solr Schema Editor page (IndexSchema_p.html)

this allows removing obsolete fields from the index (according to the current schema config)
by selecting all documents containing disabled fields.
pull/1/head
reger 12 years ago
parent cf36c1614f
commit 79401cb938

@ -0,0 +1,70 @@
/**
* IndexReIndexMonitor_p Copyright 2013 by Michael Peter Christen First released
* 29.04.2013 at http://yacy.net
*
* This library is free software; you can redistribute it and/or modify it under
* the terms of the GNU Lesser General Public License as published by the Free
* Software Foundation; either version 2.1 of the License, or (at your option)
* any later version.
*
* This library is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
* details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt If not, see
* <http://www.gnu.org/licenses/>.
*/
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.kelondro.workflow.BusyThread;
import net.yacy.migration;
import net.yacy.search.Switchboard;
import net.yacy.search.index.ReindexSolrBusyThread;
import net.yacy.server.serverObjects;
import net.yacy.server.serverSwitch;
public class IndexReIndexMonitor_p {

    /**
     * Servlet entry point of the reindex monitor page.
     * Reports the state of the "reindexSolr" worker thread and handles the
     * two form actions: "stopreindex" terminates a running job, "reindexnow"
     * starts a new one when no job is running.
     *
     * @param header request header (not used)
     * @param post   submitted form values, may be null
     * @param env    server environment, cast to Switchboard
     * @return the template properties for the page
     */
    public static serverObjects respond(@SuppressWarnings("unused") final RequestHeader header, final serverObjects post, final serverSwitch env) {
        final Switchboard sb = (Switchboard) env;
        final serverObjects prop = new serverObjects();

        // defaults, overwritten below when a reindex worker is found
        prop.put("docsprocessed", "0");
        prop.put("currentselectquery","");

        final BusyThread reindexThread = sb.getThread("reindexSolr");

        if (reindexThread == null) {
            // no job deployed: either start one on request or just show the idle state
            final boolean startRequested = post != null && post.containsKey("reindexnow");
            if (startRequested) {
                migration.reindexToschema(sb);
                prop.put("showstartbutton", 0);
                prop.put("querysize", "0");
                prop.put("infomessage","reindex job started");
            } else {
                prop.put("showstartbutton", 1);
                prop.put("querysize", "is empty");
                prop.put("infomessage", "no reindex job running");
            }
            // return rewrite properties
            return prop;
        }

        // a job is deployed: publish its counters
        prop.put("querysize", reindexThread.getJobCount());
        if (reindexThread instanceof ReindexSolrBusyThread) {
            final ReindexSolrBusyThread worker = (ReindexSolrBusyThread) reindexThread;
            prop.put("docsprocessed", worker.getProcessed());
            prop.put("currentselectquery","q="+worker.getCurrentQuery());
        }

        final boolean stopRequested = post != null && post.containsKey("stopreindex");
        if (stopRequested) {
            sb.terminateThread("reindexSolr", false);
            prop.put("infomessage", "reindex job stopped");
            prop.put("showstartbutton", 1);
        } else {
            prop.put("infomessage", "reindex is running");
            prop.put("showstartbutton", 0);
        }

        // return rewrite properties
        return prop;
    }
}

@ -0,0 +1,46 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" >
<head>
<title>YaCy '#[clientname]#': ReIndex Monitor</title>
#%env/templates/metas.template%#
</head>
<body id="IndexReindexMonitor">
#%env/templates/header.template%#
#%env/templates/submenuIndexControl.template%#
<h2>ReIndex Monitor</h2>
<p></p>
<!-- The form posts back to this page; the target name must match the servlet name IndexReIndexMonitor_p exactly (case-sensitive). -->
<!-- showstartbutton switch: alternative 0 is rendered while a reindex job is running, alternative 1 when no job is running. -->
<form action="IndexReIndexMonitor_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8">
<fieldset>
<table>
<tr>
<td>Documents in current queue</td>
<td>#[querysize]#</td>
<td>#(showstartbutton)#<input type="submit" value="refresh page"/>::#(/showstartbutton)#</td>
</tr>
<tr>
<td>Documents processed</td>
<td>#[docsprocessed]#</td>
<td></td>
</tr>
<tr>
<td>current select query </td>
<td>#[currentselectquery]#</td>
<td></td>
</tr>
<tr>
<td>&nbsp;</td>
<td></td>
<td></td>
</tr>
</table>
#(showstartbutton)#
<input type="submit" name="stopreindex" value="stop reindexing"/>
::<input type="submit" name="reindexnow" value="start reindex job now"/>
#(/showstartbutton)#
<p class="info">#[infomessage]#</p>
</fieldset>
</form>
#%env/templates/footer.template%#
</body>
</html>

@ -50,9 +50,22 @@
<div>
<input type="submit" name="set" value="Set" />
<input style="float:right" type="submit" name="resetselectiontodefault" value="reset selection to default" />
</div>
</div>
</form>
<p><br /></p>
<!-- Posts the hidden "reindexnow" field to the ReIndex monitor page, which starts a reindex job if none is running. -->
<form action="IndexReIndexMonitor_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8">
<fieldset>
<legend><label>Reindex documents</label></legend>
<p>If you unselected some fields, old documents in the index still contain the unselected fields.
To physically remove them from the index you need to reindex the documents.
Here you can reindex all documents with inactive fields.</p>
<div style="text-align:center ">
<input type="submit" name="reindexSolr" value="reindex Solr" />
<input type="hidden" name="reindexnow"/>
</div>
<p>You may monitor progress (or stop the job) under <a href="IndexReIndexMonitor_p.html">IndexReIndexMonitor_p.html</a></p>
</fieldset>
</form>
#%env/templates/footer.template%#
</body>
</html>

@ -21,6 +21,7 @@ package net.yacy;
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
import net.yacy.search.index.ReindexSolrBusyThread;
import java.io.File;
import java.io.IOException;
import java.util.List;
@ -33,11 +34,25 @@ import net.yacy.search.Switchboard;
import net.yacy.search.SwitchboardConstants;
import com.google.common.io.Files;
import static java.lang.Thread.MIN_PRIORITY;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.concurrent.Semaphore;
import net.yacy.cora.federate.solr.connector.EmbeddedSolrConnector;
import net.yacy.cora.storage.Configuration.Entry;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.index.Index;
import net.yacy.kelondro.index.Row;
import net.yacy.kelondro.workflow.AbstractBusyThread;
import net.yacy.kelondro.workflow.AbstractThread;
import net.yacy.kelondro.workflow.BusyThread;
import net.yacy.kelondro.workflow.InstantBusyThread;
import net.yacy.kelondro.workflow.WorkflowThread;
import net.yacy.search.index.Fulltext;
import net.yacy.search.schema.CollectionConfiguration;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrInputDocument;
public class migration {
//SVN constants
@ -45,7 +60,7 @@ public class migration {
public static final int TAGDB_WITH_TAGHASH=1635; //tagDB keys are tagHashes instead of plain tagname.
public static final int NEW_OVERLAYS=4422;
public static final int IDX_HOST=7724; // api for index retrieval: host index
public static void migrate(final Switchboard sb, final int fromRev, final int toRev){
if(fromRev < toRev){
if(fromRev < TAGDB_WITH_TAGHASH){
@ -334,4 +349,50 @@ public class migration {
}
return ret;
}
/**
 * Reindex embedded solr index
 * - all documents with inactive fields (according to current schema)
 * - all documents with obsolete fields
 * A worker thread is initialized with fieldnames which select the documents for reindexing.
 * Implemented via a deployed BusyThread which is called repeatedly by the system;
 * it reindexes a fixed chunk of documents per cycle (allowing to easily interrupt the
 * process after completion of a chunk) and can be monitored in the default process
 * monitor (PerformanceQueues_p.html).
 *
 * @param sb the switchboard used to look up the running job, to read the
 *           current schema configuration and to deploy the worker thread
 * @return the job count of an already running reindex job, or 0 if a new job was deployed
 */
public static int reindexToschema (final Switchboard sb) {

    // a reindex job is already running
    final BusyThread running = sb.getThread("reindexSolr");
    if (running != null) {
        return running.getJobCount();
    }

    final ReindexSolrBusyThread reidx = new ReindexSolrBusyThread(null); // ("*:*" would reindex all)

    // add all disabled fields (read from the switchboard that was handed in)
    final CollectionConfiguration colcfg = sb.index.fulltext().getDefaultConfiguration();
    final Iterator<Entry> itcol = colcfg.entryIterator();
    while (itcol.hasNext()) {
        final Entry etr = itcol.next();
        if (!etr.enabled()) {
            reidx.addSelectFieldname(etr.key());
        }
    }

    // add obsolete fields (not longer part of main index)
    final String[] obsoleteFields = {
        "inboundlinks_tag_txt",
        "inboundlinks_relflags_val",
        "inboundlinks_rel_sxt",
        "inboundlinks_text_txt",
        "inboundlinks_alttag_txt",
        "outboundlinks_tag_txt",
        "outboundlinks_relflags_val",
        "outboundlinks_rel_sxt",
        "outboundlinks_text_txt",
        "outboundlinks_alttag_txt"
    };
    for (final String field : obsoleteFields) {
        reidx.addSelectFieldname(field);
    }

    sb.deployThread("reindexSolr", "Reindex Solr", "reindex documents with obsolete fields in embedded Solr index", "/IndexReIndexMonitor_p.html",reidx /*privateWorkerThread*/, 0);
    return 0;
}
}

@ -0,0 +1,192 @@
package net.yacy.search.index;
/**
* ReindexSolrBusyThread
* Copyright 2013 by Michael Peter Christen
* First released 13.05.2013 at http://yacy.net
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
import java.io.IOException;
import net.yacy.kelondro.logging.Log;
import net.yacy.search.Switchboard;
import java.util.ArrayList;
import java.util.concurrent.Semaphore;
import net.yacy.cora.federate.solr.connector.EmbeddedSolrConnector;
import net.yacy.kelondro.workflow.AbstractBusyThread;
import net.yacy.search.schema.CollectionConfiguration;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrInputDocument;
/**
 * Reindex selected documents of embedded Solr index.
 * As the <b>toSolrInputDocument</b> acts only on current schema fields
 * this can be used to remove obsolete fields physically from index
 *
 * can be deployed as BusyThread which is periodically called by system allowing easy interruption
 * after each reindex chunk of (at most) 100 documents.
 * If the queue is empty this removes itself from the server's list of worker threads.
 * Process: - initialize with one or more select queries
 *          - deploy as BusyThread (or call job repeatedly until it returns false)
 *          - job reindexes on each call a chunk of up to 100 documents
 *
 * The list of select queries is read and modified by the worker (job) as well as
 * by callers of terminate, getCurrentQuery and the addSelect methods; every access
 * to it after construction is therefore synchronized on the list itself.
 */
public class ReindexSolrBusyThread extends AbstractBusyThread {

    /** initial and maximum number of documents to reindex per cycle */
    private static final int MAX_CHUNKSIZE = 100;

    final EmbeddedSolrConnector esc;
    final CollectionConfiguration colcfg; // collection config
    int processed = 0; // total number of reindexed documents
    int docstoreindex = 0; // documents found to reindex for current query
    Semaphore sem = new Semaphore(1); // prevents overlapping job cycles
    ArrayList<String> querylist = new ArrayList<String>(); // list of select statements to reindex
    int start = 0; // startindex
    int chunksize = MAX_CHUNKSIZE; // number of documents to reindex per cycle

    /**
     * @param query = a solr query to select documents to reindex (like h5_txt:[* TO *]),
     *              may be null if queries are added later
     */
    public ReindexSolrBusyThread(String query) {
        super(100,1000,0,500);
        this.esc = Switchboard.getSwitchboard().index.fulltext().getDefaultEmbeddedConnector();
        this.colcfg = Switchboard.getSwitchboard().index.fulltext().getDefaultConfiguration();

        if (Switchboard.getSwitchboard().getThread("reindexSolr") != null) {
            this.interrupt(); // only one active reindex job should exist
        } else {
            if (query != null) {
                this.querylist.add(query);
            }
        }
        setName("reindexSolr");
        this.setPriority(Thread.MIN_PRIORITY);
    }

    /**
     * add a query selecting documents to reindex
     *
     * @param query a solr select query; null or empty values are ignored
     */
    public void addSelectQuery(String query) {
        if (query != null && !query.isEmpty()) {
            synchronized (querylist) {
                querylist.add(query);
            }
        }
    }

    /**
     * add a fieldname to select documents to reindex all documents
     * containing the given fieldname are reindexed
     *
     * @param field a solr fieldname; null or empty values are ignored
     */
    public void addSelectFieldname(String field) {
        if (field != null && !field.isEmpty()) {
            synchronized (querylist) {
                querylist.add(field + ":[* TO *]");
            }
        }
    }

    /**
     * each call reindexes a chunk of up to 100 documents until all selected documents are reindexed
     *
     * @return false if no documents selected
     */
    @Override
    public boolean job() {
        boolean ret = true;
        // take the head of the queue once; a concurrent terminate() may clear the list at any time
        final String query = currentQueryOrNull();
        if (esc != null && colcfg != null && query != null) {
            if (sem.tryAcquire()) {
                try {
                    SolrDocumentList xdocs = esc.getDocumentListByQuery(query, start, chunksize);
                    docstoreindex = (int) xdocs.getNumFound();

                    if (xdocs.size() == 0) { // no documents returned = all of current query reindexed (or eventual start to large)
                        synchronized (querylist) {
                            // consider normal case and remove current query
                            // (remove by value: a no-op if the list was cleared in the meantime)
                            querylist.remove(query);
                        }
                        if (start > 0) { // if previous cycle reindexed, commit to prevent reindex of same documents
                            esc.commit(true);
                            start = 0;
                        }
                        if (chunksize < MAX_CHUNKSIZE) { // try to increase chunksize (if reduced by freemem), never above the maximum
                            chunksize = Math.min(MAX_CHUNKSIZE, chunksize + 10);
                        }
                    } else {
                        Log.logInfo("MIGRATION-REINDEX", "reindex docs with query=" + query + " found=" + docstoreindex + " start=" + start);
                        start = start + chunksize;
                    }

                    for (SolrDocument doc : xdocs) {
                        SolrInputDocument idoc = colcfg.toSolrInputDocument(doc);
                        Switchboard.getSwitchboard().index.fulltext().putDocument(idoc);
                        processed++;
                    }
                } catch (IOException ex) {
                    Log.logException(ex);
                } finally {
                    sem.release();
                }
            }
        } else {
            ret = false;
        }

        final boolean allDone;
        synchronized (querylist) {
            allDone = querylist.isEmpty();
        }
        if (allDone) { // if all processed remove from scheduled list (and terminate thread)
            Switchboard.getSwitchboard().terminateThread("reindexSolr", false);
            ret = false;
        }
        return ret;
    }

    /**
     * Drops all pending select queries and then terminates the thread.
     *
     * @param waitFor passed on unchanged to the superclass implementation
     */
    @Override
    public void terminate(final boolean waitFor) {
        synchronized (querylist) {
            querylist.clear();
        }
        super.terminate(waitFor);
    }

    /**
     * @return total number of processed documents
     */
    public int getProcessed() {
        return processed;
    }

    /**
     * @return the currently processed Solr select query, or an empty string if none is pending
     */
    public String getCurrentQuery() {
        final String current = currentQueryOrNull();
        return current == null ? "" : current;
    }

    /**
     * @return number of currently selected (found) documents
     */
    @Override
    public int getJobCount() {
        return docstoreindex;
    }

    /**
     * Halves the number of documents processed per job cycle (down to a minimum of 1).
     */
    @Override
    public void freemem() {
        // reduce number of docs processed in one job cycle
        if (chunksize > 2) {
            this.chunksize = this.chunksize / 2;
        }
    }

    /**
     * @return the first pending select query, or null if the list is empty
     */
    private String currentQueryOrNull() {
        synchronized (querylist) {
            return querylist.isEmpty() ? null : querylist.get(0);
        }
    }
}
Loading…
Cancel
Save