diff --git a/source/net/yacy/cora/document/id/MultiProtocolURL.java b/source/net/yacy/cora/document/id/MultiProtocolURL.java index e9a46fc91..a3404bec0 100644 --- a/source/net/yacy/cora/document/id/MultiProtocolURL.java +++ b/source/net/yacy/cora/document/id/MultiProtocolURL.java @@ -37,6 +37,7 @@ import java.io.UnsupportedEncodingException; import java.net.InetAddress; import java.net.MalformedURLException; import java.net.URLDecoder; +import java.net.http.HttpResponse; import java.nio.ByteBuffer; import java.nio.charset.StandardCharsets; import java.util.BitSet; @@ -2578,6 +2579,32 @@ public class MultiProtocolURL implements Serializable, Comparable 0; + } + if (isHTTP() || isHTTPS()) { + try (final HTTPClient client = new HTTPClient(agent)) { + client.setHost(getHost()); + org.apache.http.HttpResponse response = client.HEADResponse(this, true); + return response != null && (response.getStatusLine().getStatusCode() == 200 || response.getStatusLine().getStatusCode() == 301); + } + } + return false; + } catch (IOException e) { + return false; + } + } + /** * Read fully the source, close it and return its content as a bytes array. * @param source the source to read diff --git a/source/net/yacy/document/importer/ZimImporter.java b/source/net/yacy/document/importer/ZimImporter.java index a96a79b18..26f36f787 100644 --- a/source/net/yacy/document/importer/ZimImporter.java +++ b/source/net/yacy/document/importer/ZimImporter.java @@ -30,6 +30,7 @@ import java.util.Map; import java.util.TreeMap; import net.yacy.cora.document.id.DigestURL; +import net.yacy.cora.protocol.ClientIdentification; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.ResponseHeader; import net.yacy.cora.util.ConcurrentLog; @@ -84,6 +85,12 @@ public class ZimImporter extends Thread implements Importer { this.reader = new ZIMReader(this.file); this.guessedSource = getSource(this.reader); + // verify the source + DirectoryEntry mainEntry = this.reader.getMainDirectoryEntry(); + DigestURL url = new DigestURL(mainEntry.url); + if (!url.exists(ClientIdentification.browserAgent)) return; + + // read all documents for (int i = 0; i < this.file.header_entryCount; i++) { if (this.abort) break; DirectoryEntry de = this.reader.getDirectoryInfo(i); @@ -304,7 +311,9 @@ public class ZimImporter extends Thread implements Importer { System.out.println("guessed domain: " + guessDomainName(f.getName())); String source = getSource(r); System.out.println("guessed Source: " + source); - System.out.println("guessed main article: " + guessURL(source, de)); + String mainURL = guessURL(source, de); + System.out.println("guessed main article: " + mainURL); + System.out.println("main article exists: " + new DigestURL(mainURL).exists(ClientIdentification.browserAgent)); System.out.println(); } catch (IOException e) { e.printStackTrace();