- added regular-expression-based deletions

- on-demand collection-list generation for collection-based deletions
instead of a default collection-list presentation (this makes calling
the interface much faster, since computing the collection list for
large indexes may take several seconds)
pull/1/head
Michael Peter Christen 12 years ago
parent 3841854c97
commit d7fd346917

@ -22,7 +22,7 @@
<dt>Matching Method</dt>
<dd>
<input type="radio" name="urldelete-mm" value="subpath" #(urldelete-mm-subpath-checked)#::checked="checked"#(/urldelete-mm-subpath-checked)# />sub-path of given URLs&nbsp;&nbsp;&nbsp;
<input type="radio" name="urldelete-mm" value="regexp" #(urldelete-mm-regexp-checked)#::checked="checked"#(/urldelete-mm-regexp-checked)# disabled="disabled"/>matching with regular expression
<input type="radio" name="urldelete-mm" value="regexp" #(urldelete-mm-regexp-checked)#::checked="checked"#(/urldelete-mm-regexp-checked)# />matching with regular expression
<dt><input type="submit" name="simulate-urldelete" value="Simulate Deletion" class="submitready" title="no actual deletion, generates only a deletion count"/></dt>
<dd><input type="submit" name="engage-urldelete" id="engage-urldelete" value="Engage Deletion" #(urldelete-active)#class="dangerdisarmed" disabled="disabled" title="simulate a deletion first to calculate the deletion count"::class="dangerready"::class="dangerdisarmed" disabled="disabled" title="engaged"#(/urldelete-active)#/>
#(urldelete-active)#::<span class="pending">selected #[count]# documents for deletion</span><input type="hidden" name="count" id="count" value="#[count]#" />::<span class="commit">deleted #[count]# documents</span>#(/urldelete-active)#
@ -81,15 +81,19 @@
<form id="IndexDeletionCollection" action="IndexDeletion_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8">
<fieldset>
<legend><label>Delete Collections</label></legend>
<p>Delete all documents which are inside specific collections. This is the list of known collections: #[collectionlist]#</p>
<p>Delete all documents which are inside specific collections.</p>
<dl>
<dt>Not Assigned</dt>
<dd><input type="radio" name="collectiondelete-mode" id="collectiondelete-mode-unassigned" value="unassigned" #(collectiondelete-mode-unassigned-checked)#::checked="checked"#(/collectiondelete-mode-unassigned-checked)# onClick="d=document.getElementById('engage-collectiondelete');d.disabled=true;d.className='dangerdisarmed';"/>Delete all documents which are not assigned to any collection
</dd>
<dt>Assigned</dt>
<dd><input type="radio" name="collectiondelete-mode" id="collectiondelete-mode-assigned" value="assigned" #(collectiondelete-mode-assigned-checked)#::checked="checked"#(/collectiondelete-mode-assigned-checked)# onClick="d=document.getElementById('engage-collectiondelete');d.disabled=true;d.className='dangerdisarmed';"/>Delete all documents which are assigned to the following collection(s), separated by ',' (comma) or '|' (vertical bar)<br/>
<input type="text" name="collectiondelete" id="collections" value="#[collectiondelete]#" size="96" maxlength="1024" onClick="d=document.getElementById('engage-collectiondelete');d.disabled=true;d.className='dangerdisarmed';document.getElementById('collectiondelete-mode-assigned').checked=true;"/>
<dd><input type="radio" name="collectiondelete-mode" id="collectiondelete-mode-assigned" value="assigned" #(collectiondelete-mode-assigned-checked)#::checked="checked"#(/collectiondelete-mode-assigned-checked)# onClick="d=document.getElementById('engage-collectiondelete');d.disabled=true;d.className='dangerdisarmed';"/>Delete all documents which are assigned to the following collection(s)
#(collectiondelete-select)#, separated by ',' (comma) or '|' (vertical bar); or <a href="IndexDeletion_p.html?collectionlist=">generate the collection list...</a><br/><input type="text" name="collectiondelete" id="collections" value="#[collectiondelete]#" size="96" maxlength="1024" onClick="d=document.getElementById('engage-collectiondelete');d.disabled=true;d.className='dangerdisarmed';document.getElementById('collectiondelete-mode-assigned').checked=true;"/>::
<br/><select id="collectiondelete" name="collectiondelete">
#{list}#<option value="#[collection-value]#" onClick="d=document.getElementById('engage-collectiondelete');d.disabled=true;d.className='dangerdisarmed';document.getElementById('collectiondelete-mode-assigned').checked=true;">#[collection-name]#</option>#{/list}#
</select>
#(/collectiondelete-select)#
</dd>
<dt><input type="submit" name="simulate-collectiondelete" value="Simulate Deletion" class="submitready" title="no actual deletion, generates only a deletion count"/></dt>
<dd><input type="submit" name="engage-collectiondelete" id="engage-collectiondelete" value="Engage Deletion" #(collectiondelete-active)#class="dangerdisarmed" disabled="disabled" title="simulate a deletion first to calculate the deletion count"::class="dangerready"::class="dangerdisarmed" disabled="disabled" title="engaged"#(/collectiondelete-active)#/>

@ -22,6 +22,7 @@ import java.io.IOException;
import java.net.MalformedURLException;
import java.util.Date;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
import java.util.concurrent.BlockingQueue;
import java.util.regex.Pattern;
@ -32,6 +33,7 @@ import net.yacy.cora.date.ISO8601Formatter;
import net.yacy.cora.federate.solr.connector.AbstractSolrConnector;
import net.yacy.cora.federate.solr.connector.SolrConnector;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.sorting.ScoreMap;
import net.yacy.data.WorkTables;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.search.Switchboard;
@ -52,11 +54,6 @@ public class IndexDeletion_p {
SolrConnector webgraphConnector = sb.index.fulltext().getWebgraphConnector();
defaultConnector.commit(false); // we must do a commit here because the user cannot see a proper count.
prop.put("doccount", defaultConnector.getSize());
try {
prop.put("collectionlist", defaultConnector.getFacets("*:*", 1000, CollectionSchema.collection_sxt.getSolrFieldName()).get(CollectionSchema.collection_sxt.getSolrFieldName()).toString());
} catch (IOException e1) {
prop.put("collectionlist", "[]");
}
// Delete by URL Matching
String urldelete = post == null ? "" : post.get("urldelete", "");
@ -83,9 +80,29 @@ public class IndexDeletion_p {
// Delete Collections
boolean collectiondelete_mode_unassigned_checked = post == null ? true : post.get("collectiondelete-mode", "unassigned").equals("unassigned");
String collectiondelete = post == null ? "" : post.get("collectiondelete", "");
if (post != null && post.containsKey("collectionlist")) {
collectiondelete_mode_unassigned_checked = false;
prop.put("collectiondelete-select", 1);
try {
ScoreMap<String> collectionMap = defaultConnector.getFacets("*:*", 1000, CollectionSchema.collection_sxt.getSolrFieldName()).get(CollectionSchema.collection_sxt.getSolrFieldName());
Iterator<String> i = collectionMap.iterator();
int c = 0;
while (i.hasNext()) {
String collection = i.next();
prop.put("collectiondelete-select_list_" + c + "_collection-name", collection + "/" + collectionMap.get(collection));
prop.put("collectiondelete-select_list_" + c + "_collection-value", collection);
c++;
}
prop.put("collectiondelete-select_list", c );
} catch (IOException e1) {
prop.put("collectiondelete-select", 0);
}
} else {
prop.put("collectiondelete-select", 0);
}
prop.put("collectiondelete-mode-unassigned-checked", collectiondelete_mode_unassigned_checked ? 1 : 0);
prop.put("collectiondelete-mode-assigned-checked", collectiondelete_mode_unassigned_checked ? 0 : 1);
prop.put("collectiondelete", collectiondelete);
prop.put("collectiondelete-select_collectiondelete", collectiondelete);
prop.put("collectiondelete-active", 0);
// Delete by Solr Query
@ -94,46 +111,67 @@ public class IndexDeletion_p {
prop.put("querydelete", querydelete);
prop.put("querydelete-active", 0);
int count = post == null ? -1 : post.getInt("count", -1);
if (post != null && (post.containsKey("simulate-urldelete") || post.containsKey("engage-urldelete"))) {
boolean simulate = post.containsKey("simulate-urldelete");
// parse the input
urldelete = urldelete.trim();
String[] stubURLs = urldelete.indexOf('\n') > 0 || urldelete.indexOf('\r') > 0 ? urldelete.split("[\\r\\n]+") : urldelete.split(Pattern.quote("|"));
Set<String> ids = new HashSet<String>();
for (String urlStub: stubURLs) {
if (urlStub == null || urlStub.length() == 0) continue;
int pos = urlStub.indexOf("://",0);
if (pos == -1) {
if (urlStub.startsWith("www")) urlStub = "http://" + urlStub;
if (urlStub.startsWith("ftp")) urlStub = "ftp://" + urlStub;
}
try {
DigestURI u = new DigestURI(urlStub);
BlockingQueue<SolrDocument> dq = defaultConnector.concurrentDocumentsByQuery(CollectionSchema.host_s.getSolrFieldName() + ":" + u.getHost(), 0, 100000000, Long.MAX_VALUE, 100, CollectionSchema.id.getSolrFieldName(), CollectionSchema.sku.getSolrFieldName());
SolrDocument doc;
if (urldelete_mm_subpath_checked) {
// collect using url stubs
Set<String> ids = new HashSet<String>();
String[] stubURLs = urldelete.indexOf('\n') > 0 || urldelete.indexOf('\r') > 0 ? urldelete.split("[\\r\\n]+") : urldelete.split(Pattern.quote("|"));
for (String urlStub: stubURLs) {
if (urlStub == null || urlStub.length() == 0) continue;
int pos = urlStub.indexOf("://",0);
if (pos == -1) {
if (urlStub.startsWith("www")) urlStub = "http://" + urlStub;
if (urlStub.startsWith("ftp")) urlStub = "ftp://" + urlStub;
}
try {
while ((doc = dq.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
String url = (String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName());
if (url.startsWith(urlStub)) ids.add((String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName()));
DigestURI u = new DigestURI(urlStub);
BlockingQueue<SolrDocument> dq = defaultConnector.concurrentDocumentsByQuery(CollectionSchema.host_s.getSolrFieldName() + ":" + u.getHost(), 0, 100000000, Long.MAX_VALUE, 100, CollectionSchema.id.getSolrFieldName(), CollectionSchema.sku.getSolrFieldName());
SolrDocument doc;
try {
while ((doc = dq.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
String url = (String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName());
if (url.startsWith(urlStub)) ids.add((String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName()));
}
} catch (InterruptedException e) {
}
} catch (InterruptedException e) {
}
sb.tables.recordAPICall(post, "IndexDeletion_p.html", WorkTables.TABLE_API_TYPE_DELETION, "deletion, docs matching with " + urldelete);
} catch (MalformedURLException e) {}
}
sb.tables.recordAPICall(post, "IndexDeletion_p.html", WorkTables.TABLE_API_TYPE_DELETION, "deletion, docs matching with " + urldelete);
} catch (MalformedURLException e) {}
}
if (simulate) {
count = ids.size();
prop.put("urldelete-active", count == 0 ? 2 : 1);
if (simulate) {
count = ids.size();
prop.put("urldelete-active", count == 0 ? 2 : 1);
} else {
try {
defaultConnector.deleteByIds(ids);
//webgraphConnector.deleteByQuery(webgraphQuery);
} catch (IOException e) {
}
prop.put("urldelete-active", 2);
}
} else {
try {
defaultConnector.deleteByIds(ids);
//webgraphConnector.deleteByQuery(webgraphQuery);
} catch (IOException e) {
// collect using a regular expression on urls
String regexquery = CollectionSchema.sku.getSolrFieldName() + ":/" + urldelete + "/";
if (simulate) {
try {
count = (int) defaultConnector.getCountByQuery(regexquery);
} catch (IOException e) {
}
prop.put("urldelete-active", count == 0 ? 2 : 1);
} else {
try {
defaultConnector.deleteByQuery(regexquery);
sb.tables.recordAPICall(post, "IndexDeletion_p.html", WorkTables.TABLE_API_TYPE_DELETION, "deletion, regex match = " + urldelete);
} catch (IOException e) {
}
prop.put("urldelete-active", 2);
}
prop.put("urldelete-active", 2);
}
prop.put("urldelete-active_count", count);
}

Loading…
Cancel
Save