On reindex, delete index documents with an invalid URL

if discovered
pull/14/head
reger 10 years ago
parent 1e8369e18b
commit 98ab655917

@@ -23,6 +23,7 @@
*/
package net.yacy.crawler;
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.HashSet;
import java.util.Set;
@@ -54,7 +55,7 @@ public class RecrawlBusyThread extends AbstractBusyThread {
private int chunkstart = 0;
private int chunksize = 200;
final Switchboard sb;
private Set<DigestURL> urlstack; // buffer of urls to recrawl
private final Set<DigestURL> urlstack; // buffer of urls to recrawl
public long urlsfound = 0;
public RecrawlBusyThread(Switchboard xsb) {
@@ -181,6 +182,12 @@ public class RecrawlBusyThread extends AbstractBusyThread {
try {
this.urlstack.add(new DigestURL((String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName())));
} catch (MalformedURLException ex) {
try { // if index entry hasn't a valid url (useless), delete it
solrConnector.deleteById((String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName()));
ConcurrentLog.severe(THREAD_NAME, "deleted index document with invalid url " + (String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName()));
} catch (IOException ex1) {
ConcurrentLog.severe(THREAD_NAME, ex1.getMessage());
}
}
}
this.chunkstart = this.chunkstart + this.chunksize;

Loading…
Cancel
Save