On reindex, delete index documents that have an invalid URL

whenever such a document is discovered
10 years ago · 98ab655917
parent 1e8369e18b
commit 98ab655917
1 changed files with 8 additions and 1 deletions
--- a/source/net/yacy/crawler/RecrawlBusyThread.java
+++ b/source/net/yacy/crawler/RecrawlBusyThread.java
@ -23,6 +23,7 @@
 */
 package net.yacy.crawler;

+import java.io.IOException;
 import java.net.MalformedURLException;
 import java.util.HashSet;
 import java.util.Set;
@ -54,7 +55,7 @@ public class RecrawlBusyThread extends AbstractBusyThread {
    private int chunkstart = 0;
    private int chunksize = 200;
    final Switchboard sb;
-    private Set<DigestURL> urlstack; // buffer of urls to recrawl
+    private final Set<DigestURL> urlstack; // buffer of urls to recrawl
    public long urlsfound = 0;

    public RecrawlBusyThread(Switchboard xsb) {
@ -181,6 +182,12 @@ public class RecrawlBusyThread extends AbstractBusyThread {
                try {
                    this.urlstack.add(new DigestURL((String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName())));
                } catch (MalformedURLException ex) {
+                    try { // if index entry hasn't a valid url (useless), delete it
+                        solrConnector.deleteById((String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName()));
+                        ConcurrentLog.severe(THREAD_NAME, "deleted index document with invalid url " + (String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName()));
+                    } catch (IOException ex1) {
+                        ConcurrentLog.severe(THREAD_NAME, ex1.getMessage());
+                    }
                }
            }
            this.chunkstart = this.chunkstart + this.chunksize;