From 79401cb9386e3965edbcb52051338c0e23f7519a Mon Sep 17 00:00:00 2001 From: reger Date: Mon, 13 May 2013 04:06:57 +0200 Subject: [PATCH] added reindex option for documents with disabled or obsolete fields to Solr Schema Editor page (IndexSchema_p.html) this allows removing obsolete fields from the index (according to current schema config) by selecting all documents containing disabled fields. --- htroot/IndexReIndexMonitor_p.java | 70 +++++++ htroot/IndexReindexMonitor_p.html | 46 +++++ htroot/IndexSchema_p.html | 17 +- source/net/yacy/migration.java | 63 +++++- .../search/index/ReindexSolrBusyThread.java | 192 ++++++++++++++++++ 5 files changed, 385 insertions(+), 3 deletions(-) create mode 100644 htroot/IndexReIndexMonitor_p.java create mode 100644 htroot/IndexReindexMonitor_p.html create mode 100644 source/net/yacy/search/index/ReindexSolrBusyThread.java diff --git a/htroot/IndexReIndexMonitor_p.java b/htroot/IndexReIndexMonitor_p.java new file mode 100644 index 000000000..d77525cb3 --- /dev/null +++ b/htroot/IndexReIndexMonitor_p.java @@ -0,0 +1,70 @@ + +/** + * IndexReIndexMonitor_p Copyright 2013 by Michael Peter Christen First released + * 29.04.2013 at http://yacy.net + * + * This library is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the Free + * Software Foundation; either version 2.1 of the License, or (at your option) + * any later version. + * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more + * details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt If not, see + * . 
+ */ +import net.yacy.cora.protocol.RequestHeader; +import net.yacy.kelondro.workflow.BusyThread; +import net.yacy.migration; + +import net.yacy.search.Switchboard; +import net.yacy.search.index.ReindexSolrBusyThread; +import net.yacy.server.serverObjects; +import net.yacy.server.serverSwitch; + +public class IndexReIndexMonitor_p { + + public static serverObjects respond(@SuppressWarnings("unused") final RequestHeader header, final serverObjects post, final serverSwitch env) { + + final Switchboard sb = (Switchboard) env; + final serverObjects prop = new serverObjects(); + + prop.put("docsprocessed", "0"); + prop.put("currentselectquery",""); + BusyThread bt = sb.getThread("reindexSolr"); + if (bt != null) { + prop.put("querysize", bt.getJobCount()); + + if (bt instanceof ReindexSolrBusyThread) { + prop.put("docsprocessed", ((ReindexSolrBusyThread) bt).getProcessed()); + prop.put("currentselectquery","q="+((ReindexSolrBusyThread) bt).getCurrentQuery()); + } + + if (post != null && post.containsKey("stopreindex")) { + sb.terminateThread("reindexSolr", false); + prop.put("infomessage", "reindex job stopped"); + prop.put("showstartbutton", 1); + } else { + prop.put("infomessage", "reindex is running"); + prop.put("showstartbutton", 0); + } + } else { + if (post != null && post.containsKey("reindexnow")) { + migration.reindexToschema(sb); + prop.put("showstartbutton", 0); + prop.put("querysize", "0"); + prop.put("infomessage","reindex job started"); + } else { + prop.put("showstartbutton", 1); + prop.put("querysize", "is empty"); + prop.put("infomessage", "no reindex job running"); + } + } + // return rewrite properties + return prop; + } +} diff --git a/htroot/IndexReindexMonitor_p.html b/htroot/IndexReindexMonitor_p.html new file mode 100644 index 000000000..f9d32cf3f --- /dev/null +++ b/htroot/IndexReindexMonitor_p.html @@ -0,0 +1,46 @@ + + + + YaCy '#[clientname]#': ReIndex Monitor + #%env/templates/metas.template%# + + + + #%env/templates/header.template%# + 
#%env/templates/submenuIndexControl.template%# +

ReIndex Monitor

+

+
+
+ + + + + + + + + + + + + + + + + + + + + +
Documents in current queue#[querysize]##(showstartbutton)#::#(/showstartbutton)#
Documents processed#[docsprocessed]#
current select query #[currentselectquery]#
 
+ #(showstartbutton)# + + :: + #(/showstartbutton)# +

#[infomessage]#

+
+
+ #%env/templates/footer.template%# + + diff --git a/htroot/IndexSchema_p.html b/htroot/IndexSchema_p.html index 9b80ab0b3..faf34e4e5 100644 --- a/htroot/IndexSchema_p.html +++ b/htroot/IndexSchema_p.html @@ -50,9 +50,22 @@
-
+ + +


+
+
+ +

If you unselected some fields, old documents in the index still contain the unselected fields. + To physically remove them from the index you need to reindex the documents. + Here you can reindex all documents with inactive fields.

+
+ + +
+

You may monitor progress (or stop the job) under IndexReIndexMonitor_p.html

+
- #%env/templates/footer.template%# diff --git a/source/net/yacy/migration.java b/source/net/yacy/migration.java index 555f7c921..ec192dc42 100644 --- a/source/net/yacy/migration.java +++ b/source/net/yacy/migration.java @@ -21,6 +21,7 @@ package net.yacy; // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +import net.yacy.search.index.ReindexSolrBusyThread; import java.io.File; import java.io.IOException; import java.util.List; @@ -33,11 +34,25 @@ import net.yacy.search.Switchboard; import net.yacy.search.SwitchboardConstants; import com.google.common.io.Files; +import static java.lang.Thread.MIN_PRIORITY; +import java.util.ArrayList; import java.util.Iterator; +import java.util.concurrent.Semaphore; +import net.yacy.cora.federate.solr.connector.EmbeddedSolrConnector; +import net.yacy.cora.storage.Configuration.Entry; import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.index.Index; import net.yacy.kelondro.index.Row; +import net.yacy.kelondro.workflow.AbstractBusyThread; +import net.yacy.kelondro.workflow.AbstractThread; +import net.yacy.kelondro.workflow.BusyThread; +import net.yacy.kelondro.workflow.InstantBusyThread; +import net.yacy.kelondro.workflow.WorkflowThread; import net.yacy.search.index.Fulltext; +import net.yacy.search.schema.CollectionConfiguration; +import org.apache.solr.common.SolrDocument; +import org.apache.solr.common.SolrDocumentList; +import org.apache.solr.common.SolrInputDocument; public class migration { //SVN constants @@ -45,7 +60,7 @@ public class migration { public static final int TAGDB_WITH_TAGHASH=1635; //tagDB keys are tagHashes instead of plain tagname. 
public static final int NEW_OVERLAYS=4422; public static final int IDX_HOST=7724; // api for index retrieval: host index - + public static void migrate(final Switchboard sb, final int fromRev, final int toRev){ if(fromRev < toRev){ if(fromRev < TAGDB_WITH_TAGHASH){ @@ -334,4 +349,50 @@ public class migration { } return ret; } + + /** + * Reindex embedded solr index + * - all documents with inactive fields (according to current schema) + * - all documents with obsolete fields + * A worker thread is initialized with fieldnames or a solr query which selects the documents for reindexing + * implemented via deployed BusyThread which is called repeatedly by system + * reindexes a fixed chunk of documents per cycle (allowing the process to be interrupted easily after completion of a chunk) + * and monitoring in default process monitor (PerformanceQueues_p.html) + */ + public static int reindexToschema (final Switchboard sb) { + + BusyThread bt = sb.getThread("reindexSolr"); + // a reindex job is already running + if (bt != null) { + return bt.getJobCount(); + } + + ReindexSolrBusyThread reidx = new ReindexSolrBusyThread(null); // ("*:*" would reindex all) + + // add all disabled fields + CollectionConfiguration colcfg = Switchboard.getSwitchboard().index.fulltext().getDefaultConfiguration(); + Iterator itcol = colcfg.entryIterator(); + while (itcol.hasNext()) { + Entry etr = itcol.next(); + if (!etr.enabled()) { + reidx.addSelectFieldname(etr.key()); + } + } + + // add obsolete fields (no longer part of main index) + reidx.addSelectFieldname("inboundlinks_tag_txt"); + reidx.addSelectFieldname("inboundlinks_relflags_val"); + reidx.addSelectFieldname("inboundlinks_rel_sxt"); + reidx.addSelectFieldname("inboundlinks_text_txt"); + reidx.addSelectFieldname("inboundlinks_alttag_txt"); + + reidx.addSelectFieldname("outboundlinks_tag_txt"); + reidx.addSelectFieldname("outboundlinks_relflags_val"); + reidx.addSelectFieldname("outboundlinks_rel_sxt"); + 
reidx.addSelectFieldname("outboundlinks_text_txt"); + reidx.addSelectFieldname("outboundlinks_alttag_txt"); + + sb.deployThread("reindexSolr", "Reindex Solr", "reindex documents with obsolete fields in embedded Solr index", "/IndexReIndexMonitor_p.html",reidx /*privateWorkerThread*/, 0); + return 0; + } } diff --git a/source/net/yacy/search/index/ReindexSolrBusyThread.java b/source/net/yacy/search/index/ReindexSolrBusyThread.java new file mode 100644 index 000000000..1697b25d1 --- /dev/null +++ b/source/net/yacy/search/index/ReindexSolrBusyThread.java @@ -0,0 +1,192 @@ +package net.yacy.search.index; +/** + * ReindexSolrBusyThread + * Copyright 2013 by Michael Peter Christen + * First released 13.05.2013 at http://yacy.net + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt + * If not, see . + */ + +import java.io.IOException; +import net.yacy.kelondro.logging.Log; +import net.yacy.search.Switchboard; +import java.util.ArrayList; +import java.util.concurrent.Semaphore; +import net.yacy.cora.federate.solr.connector.EmbeddedSolrConnector; +import net.yacy.kelondro.workflow.AbstractBusyThread; +import net.yacy.search.schema.CollectionConfiguration; +import org.apache.solr.common.SolrDocument; +import org.apache.solr.common.SolrDocumentList; +import org.apache.solr.common.SolrInputDocument; + + + /** + * Reindex selected documents of embedded Solr index. 
+ * As the toSolrInputDocument acts only on current schema fields + * this can be used to remove obsolete fields physically from index + * + * can be deployed as BusyThread which is periodically called by system allowing easy interruption + * after each reindex chunk of 100 documents. + * If queue is empty this removes itself from the server's list of worker threads + * Process: - initialize with one or more select queries + * - deploy as BusyThread (or call job repeatedly until it returns false) + * - each call of job reindexes a chunk of 100 documents + */ + public class ReindexSolrBusyThread extends AbstractBusyThread { + + final EmbeddedSolrConnector esc; + final CollectionConfiguration colcfg; // collection config + int processed = 0; // total number of reindexed documents + int docstoreindex = 0; // documents found to reindex for current query + Semaphore sem = new Semaphore(1); + ArrayList querylist = new ArrayList(); // list of select statements to reindex + int start = 0; // startindex + int chunksize = 100; // number of documents to reindex per cycle + + /** + * @param query = a solr query to select documents to reindex (like h5_txt:[* TO *]) + */ + public ReindexSolrBusyThread(String query) { + super(100,1000,0,500); + this.esc = Switchboard.getSwitchboard().index.fulltext().getDefaultEmbeddedConnector(); + this.colcfg = Switchboard.getSwitchboard().index.fulltext().getDefaultConfiguration(); + + if (Switchboard.getSwitchboard().getThread("reindexSolr") != null) { + this.interrupt(); // only one active reindex job should exist + } else { + if (query != null) { + this.querylist.add(query); + } + } + setName("reindexSolr"); + this.setPriority(Thread.MIN_PRIORITY); + + } + + /** + * add a query selecting documents to reindex + */ + public void addSelectQuery(String query) { + if (query != null && !query.isEmpty()) { + querylist.add(query); + } + } + + /** + * add a fieldname to select documents to reindex all documents + * containing the given fieldname are 
reindexed + * + * @param field a solr fieldname + */ + public void addSelectFieldname(String field) { + if (field != null && !field.isEmpty()) { + querylist.add(field + ":[* TO *]"); + } + } + + /** + * each call reindexes a chunk of 100 documents until all selected documents are reindexed + * @return false if no documents selected + */ + @Override + public boolean job() { + boolean ret = true; + if (esc != null && colcfg != null && querylist.size() > 0) { + + if (sem.tryAcquire()) { + try { + String query = querylist.get(0); + boolean go = true; + SolrDocumentList xdocs = esc.getDocumentListByQuery(query, start, chunksize); + docstoreindex = (int) xdocs.getNumFound(); + + if (xdocs.size() == 0) { // no documents returned = all of current query reindexed (or possibly start too large) + querylist.remove(0); // consider normal case and remove current query + + if (start > 0) { // if previous cycle reindexed, commit to prevent reindex of same documents + esc.commit(true); + start = 0; + } + + if (chunksize < 100) { // try to increase chunksize (if reduced by freemem) + chunksize = chunksize + 10; + } + } else { + Log.logInfo("MIGRATION-REINDEX", "reindex docs with query=" + query + " found=" + docstoreindex + " start=" + start); + start = start + chunksize; + } + + for (SolrDocument doc : xdocs) { + SolrInputDocument idoc = colcfg.toSolrInputDocument(doc); + Switchboard.getSwitchboard().index.fulltext().putDocument(idoc); + processed++; + } + + } catch (IOException ex) { + Log.logException(ex); + } finally { + sem.release(); + } + } + } else { + ret = false; + } + + if (querylist.isEmpty()) { // if all processed remove from scheduled list (and terminate thread) + Switchboard.getSwitchboard().terminateThread("reindexSolr", false); + ret = false; + } + return ret; + } + + + @Override + public void terminate(final boolean waitFor) { + querylist.clear(); + super.terminate(waitFor); + } + + /** + * @return total number of processed documents + */ + public int getProcessed() 
{ + return processed; + } + + /** + * @return the currently processed Solr select query + */ + public String getCurrentQuery() { + return querylist.isEmpty() ? "" : querylist.get(0); + } + + /** + * @return number of currently selected (found) documents + */ + @Override + public int getJobCount() { + return docstoreindex; + } + + @Override + public void freemem() { + // reduce number of docs processed in one job cycle + if (chunksize > 2) { + this.chunksize = this.chunksize / 2; + } + } + + } +