From 98ab6559173ee948edbdcf939f2367681f4f57dc Mon Sep 17 00:00:00 2001
From: reger
Date: Sat, 12 Sep 2015 23:06:13 +0200
Subject: [PATCH] on reindex delete index document with invalid url if discovered

---
 source/net/yacy/crawler/RecrawlBusyThread.java | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/source/net/yacy/crawler/RecrawlBusyThread.java b/source/net/yacy/crawler/RecrawlBusyThread.java
index d1e8eaea7..840f9fc90 100644
--- a/source/net/yacy/crawler/RecrawlBusyThread.java
+++ b/source/net/yacy/crawler/RecrawlBusyThread.java
@@ -23,6 +23,7 @@
  */
 package net.yacy.crawler;
 
+import java.io.IOException;
 import java.net.MalformedURLException;
 import java.util.HashSet;
 import java.util.Set;
@@ -54,7 +55,7 @@ public class RecrawlBusyThread extends AbstractBusyThread {
     private int chunkstart = 0;
     private int chunksize = 200;
     final Switchboard sb;
-    private Set urlstack; // buffer of urls to recrawl
+    private final Set urlstack; // buffer of urls to recrawl
     public long urlsfound = 0;
 
     public RecrawlBusyThread(Switchboard xsb) {
@@ -181,6 +182,12 @@ public class RecrawlBusyThread extends AbstractBusyThread {
                 try {
                     this.urlstack.add(new DigestURL((String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName())));
                 } catch (MalformedURLException ex) {
+                    try { // if index entry hasn't a valid url (useless), delete it
+                        solrConnector.deleteById((String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName()));
+                        ConcurrentLog.severe(THREAD_NAME, "deleted index document with invalid url " + (String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName()));
+                    } catch (IOException ex1) {
+                        ConcurrentLog.severe(THREAD_NAME, ex1.getMessage());
+                    }
                 }
             }
             this.chunkstart = this.chunkstart + this.chunksize;