Added a reindex option for documents with disabled or obsolete fields to the Solr Schema Editor page (IndexSchema_p.html).
This allows obsolete fields to be removed from the index (according to the current schema config) by selecting all documents containing disabled fields.
pull/1/head
parent
cf36c1614f
commit
79401cb938
@ -0,0 +1,70 @@
|
||||
|
||||
/**
|
||||
* IndexReIndexMonitor_p Copyright 2013 by Michael Peter Christen First released
|
||||
* 29.04.2013 at http://yacy.net
|
||||
*
|
||||
* This library is free software; you can redistribute it and/or modify it under
|
||||
* the terms of the GNU Lesser General Public License as published by the Free
|
||||
* Software Foundation; either version 2.1 of the License, or (at your option)
|
||||
* any later version.
|
||||
*
|
||||
* This library is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
|
||||
* FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
|
||||
* details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with this program in the file lgpl21.txt If not, see
|
||||
* <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
import net.yacy.cora.protocol.RequestHeader;
|
||||
import net.yacy.kelondro.workflow.BusyThread;
|
||||
import net.yacy.migration;
|
||||
|
||||
import net.yacy.search.Switchboard;
|
||||
import net.yacy.search.index.ReindexSolrBusyThread;
|
||||
import net.yacy.server.serverObjects;
|
||||
import net.yacy.server.serverSwitch;
|
||||
|
||||
public class IndexReIndexMonitor_p {
|
||||
|
||||
public static serverObjects respond(@SuppressWarnings("unused") final RequestHeader header, final serverObjects post, final serverSwitch env) {
|
||||
|
||||
final Switchboard sb = (Switchboard) env;
|
||||
final serverObjects prop = new serverObjects();
|
||||
|
||||
prop.put("docsprocessed", "0");
|
||||
prop.put("currentselectquery","");
|
||||
BusyThread bt = sb.getThread("reindexSolr");
|
||||
if (bt != null) {
|
||||
prop.put("querysize", bt.getJobCount());
|
||||
|
||||
if (bt instanceof ReindexSolrBusyThread) {
|
||||
prop.put("docsprocessed", ((ReindexSolrBusyThread) bt).getProcessed());
|
||||
prop.put("currentselectquery","q="+((ReindexSolrBusyThread) bt).getCurrentQuery());
|
||||
}
|
||||
|
||||
if (post != null && post.containsKey("stopreindex")) {
|
||||
sb.terminateThread("reindexSolr", false);
|
||||
prop.put("infomessage", "reindex job stopped");
|
||||
prop.put("showstartbutton", 1);
|
||||
} else {
|
||||
prop.put("infomessage", "reindex is running");
|
||||
prop.put("showstartbutton", 0);
|
||||
}
|
||||
} else {
|
||||
if (post != null && post.containsKey("reindexnow")) {
|
||||
migration.reindexToschema(sb);
|
||||
prop.put("showstartbutton", 0);
|
||||
prop.put("querysize", "0");
|
||||
prop.put("infomessage","reindex job started");
|
||||
} else {
|
||||
prop.put("showstartbutton", 1);
|
||||
prop.put("querysize", "is empty");
|
||||
prop.put("infomessage", "no reindex job running");
|
||||
}
|
||||
}
|
||||
// return rewrite properties
|
||||
return prop;
|
||||
}
|
||||
}
|
@ -0,0 +1,46 @@
|
||||
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
|
||||
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" >
|
||||
<head>
|
||||
<title>YaCy '#[clientname]#': ReIndex Monitor</title>
|
||||
#%env/templates/metas.template%#
|
||||
</head>
|
||||
<body id="IndexReindexMonitor">
|
||||
|
||||
#%env/templates/header.template%#
|
||||
#%env/templates/submenuIndexControl.template%#
|
||||
<h2>ReIndex Monitor</h2>
|
||||
<p></p>
|
||||
<form action="IndexReIndexMonitor_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8">
|
||||
<fieldset>
|
||||
<table>
|
||||
<tr>
|
||||
<td>Documents in current queue</td>
|
||||
<td>#[querysize]#</td>
|
||||
<td>#(showstartbutton)#<input type="submit" value="refresh page"/>::#(/showstartbutton)#</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Documents processed</td>
|
||||
<td>#[docsprocessed]#</td>
|
||||
<td></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>current select query </td>
|
||||
<td>#[currentselectquery]#</td>
|
||||
<td></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td> </td>
|
||||
<td></td>
|
||||
<td></td>
|
||||
</tr>
|
||||
</table>
|
||||
#(showstartbutton)#
|
||||
<input type="submit" name="stopreindex" value="stop reindexing"/>
|
||||
::<input type="submit" name="reindexnow" value="start reindex job now"/>
|
||||
#(/showstartbutton)#
|
||||
<p class="info">#[infomessage]#</p>
|
||||
</fieldset>
|
||||
</form>
|
||||
#%env/templates/footer.template%#
|
||||
</body>
|
||||
</html>
|
@ -0,0 +1,192 @@
|
||||
package net.yacy.search.index;
|
||||
/**
|
||||
* ReindexSolrBusyThread
|
||||
* Copyright 2013 by Michael Peter Christen
|
||||
* First released 13.05.2013 at http://yacy.net
|
||||
*
|
||||
* This library is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This library is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with this program in the file lgpl21.txt
|
||||
* If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import net.yacy.kelondro.logging.Log;
|
||||
import net.yacy.search.Switchboard;
|
||||
import java.util.ArrayList;
|
||||
import java.util.concurrent.Semaphore;
|
||||
import net.yacy.cora.federate.solr.connector.EmbeddedSolrConnector;
|
||||
import net.yacy.kelondro.workflow.AbstractBusyThread;
|
||||
import net.yacy.search.schema.CollectionConfiguration;
|
||||
import org.apache.solr.common.SolrDocument;
|
||||
import org.apache.solr.common.SolrDocumentList;
|
||||
import org.apache.solr.common.SolrInputDocument;
|
||||
|
||||
|
||||
/**
|
||||
* Reindex selected documents of embedded Solr index.
|
||||
* As the <b>toSolrInputDocument</b> acts only on current schema fields
|
||||
* this can be used to remove obsolete fields physically from index
|
||||
*
|
||||
* can be deployed as BusyThread which is periodically called by system allowing easy interruption
|
||||
* after each reindex chunk of 100 documents.
|
||||
* If queue is empty this removes itself from list of servers workerthreads list
|
||||
* Process: - initialize with one or more select queries
|
||||
* - deploy as BusyThread (or call job repeatedly until it returns false)
|
||||
* - job reindexes on each call chunk of 100 documents
|
||||
*/
|
||||
public class ReindexSolrBusyThread extends AbstractBusyThread {
|
||||
|
||||
final EmbeddedSolrConnector esc;
|
||||
final CollectionConfiguration colcfg; // collection config
|
||||
int processed = 0; // total number of reindexed documents
|
||||
int docstoreindex = 0; // documents found to reindex for current query
|
||||
Semaphore sem = new Semaphore(1);
|
||||
ArrayList<String> querylist = new ArrayList<String>(); // list of select statements to reindex
|
||||
int start = 0; // startindex
|
||||
int chunksize = 100; // number of documents to reindex per cycle
|
||||
|
||||
/**
|
||||
* @param query = a solr query to select documents to reindex (like h5_txt:[* TO *])
|
||||
*/
|
||||
public ReindexSolrBusyThread(String query) {
|
||||
super(100,1000,0,500);
|
||||
this.esc = Switchboard.getSwitchboard().index.fulltext().getDefaultEmbeddedConnector();
|
||||
this.colcfg = Switchboard.getSwitchboard().index.fulltext().getDefaultConfiguration();
|
||||
|
||||
if (Switchboard.getSwitchboard().getThread("reindexSolr") != null) {
|
||||
this.interrupt(); // only one active reindex job should exist
|
||||
} else {
|
||||
if (query != null) {
|
||||
this.querylist.add(query);
|
||||
}
|
||||
}
|
||||
setName("reindexSolr");
|
||||
this.setPriority(Thread.MIN_PRIORITY);
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* add a query selecting documents to reindex
|
||||
*/
|
||||
public void addSelectQuery(String query) {
|
||||
if (query != null && !query.isEmpty()) {
|
||||
querylist.add(query);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* add a fieldname to select documents to reindex all documents
|
||||
* containing the given fieldname are reindexed
|
||||
*
|
||||
* @param field a solr fieldname
|
||||
*/
|
||||
public void addSelectFieldname(String field) {
|
||||
if (field != null && !field.isEmpty()) {
|
||||
querylist.add(field + ":[* TO *]");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* each call reindexes a chunk of 100 documents until all selected documents are reindexed
|
||||
* @return false if no documents selected
|
||||
*/
|
||||
@Override
|
||||
public boolean job() {
|
||||
boolean ret = true;
|
||||
if (esc != null && colcfg != null && querylist.size() > 0) {
|
||||
|
||||
if (sem.tryAcquire()) {
|
||||
try {
|
||||
String query = querylist.get(0);
|
||||
boolean go = true;
|
||||
SolrDocumentList xdocs = esc.getDocumentListByQuery(query, start, chunksize);
|
||||
docstoreindex = (int) xdocs.getNumFound();
|
||||
|
||||
if (xdocs.size() == 0) { // no documents returned = all of current query reindexed (or eventual start to large)
|
||||
querylist.remove(0); // consider normal case and remove current query
|
||||
|
||||
if (start > 0) { // if previous cycle reindexed, commit to prevent reindex of same documents
|
||||
esc.commit(true);
|
||||
start = 0;
|
||||
}
|
||||
|
||||
if (chunksize < 100) { // try to increase chunksize (if reduced by freemem)
|
||||
chunksize = chunksize + 10;
|
||||
}
|
||||
} else {
|
||||
Log.logInfo("MIGRATION-REINDEX", "reindex docs with query=" + query + " found=" + docstoreindex + " start=" + start);
|
||||
start = start + chunksize;
|
||||
}
|
||||
|
||||
for (SolrDocument doc : xdocs) {
|
||||
SolrInputDocument idoc = colcfg.toSolrInputDocument(doc);
|
||||
Switchboard.getSwitchboard().index.fulltext().putDocument(idoc);
|
||||
processed++;
|
||||
}
|
||||
|
||||
} catch (IOException ex) {
|
||||
Log.logException(ex);
|
||||
} finally {
|
||||
sem.release();
|
||||
}
|
||||
}
|
||||
} else {
|
||||
ret = false;
|
||||
}
|
||||
|
||||
if (querylist.isEmpty()) { // if all processed remove from scheduled list (and terminate thread)
|
||||
Switchboard.getSwitchboard().terminateThread("reindexSolr", false);
|
||||
ret = false;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void terminate(final boolean waitFor) {
|
||||
querylist.clear();
|
||||
super.terminate(waitFor);
|
||||
}
|
||||
|
||||
/**
|
||||
* @return total number of processed documents
|
||||
*/
|
||||
public int getProcessed() {
|
||||
return processed;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the currently processed Solr select query
|
||||
*/
|
||||
public String getCurrentQuery() {
|
||||
return querylist.isEmpty() ? "" : querylist.get(0);
|
||||
}
|
||||
|
||||
/**
|
||||
* @return number of currently selected (found) documents
|
||||
*/
|
||||
@Override
|
||||
public int getJobCount() {
|
||||
return docstoreindex;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void freemem() {
|
||||
// reduce number of docs processed in one job cycle
|
||||
if (chunksize > 2) {
|
||||
this.chunksize = this.chunksize / 2;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
Loading…
Reference in new issue