diff --git a/htroot/IndexImportOAIPMHList_p.html b/htroot/IndexImportOAIPMHList_p.html index 168a1f85d..2b2e78440 100644 --- a/htroot/IndexImportOAIPMHList_p.html +++ b/htroot/IndexImportOAIPMHList_p.html @@ -57,6 +57,7 @@ Source Processed
Chunks Imported
Records + Complete at
# Records Speed
(records/second) #{table}# @@ -65,6 +66,7 @@ #[source]# #[chunkCount]# #[recordsCount]# + #[completeListSize]# #[speed]# #{/table}# diff --git a/htroot/IndexImportOAIPMHList_p.java b/htroot/IndexImportOAIPMHList_p.java index ceba573a2..4fb79abda 100644 --- a/htroot/IndexImportOAIPMHList_p.java +++ b/htroot/IndexImportOAIPMHList_p.java @@ -75,6 +75,7 @@ public class IndexImportOAIPMHList_p { prop.put("import_table_" + count + "_source", job.source()); prop.put("import_table_" + count + "_chunkCount", job.chunkCount()); prop.put("import_table_" + count + "_recordsCount", job.count()); + prop.put("import_table_" + count + "_completeListSize", job.getCompleteListSize()); prop.put("import_table_" + count + "_speed", job.speed()); dark = !dark; count++; diff --git a/htroot/IndexImportOAIPMH_p.java b/htroot/IndexImportOAIPMH_p.java index dc6985979..300896331 100644 --- a/htroot/IndexImportOAIPMH_p.java +++ b/htroot/IndexImportOAIPMH_p.java @@ -68,7 +68,7 @@ public class IndexImportOAIPMH_p { // set next default url try { - DigestURI nexturl = (rt == null) ? null : rt.resumptionURL(url); + DigestURI nexturl = (rt == null) ? null : rt.resumptionURL(); if (rt != null) prop.put("defaulturl", (nexturl == null) ? "" : nexturl.toNormalform(true, false)); } catch (MalformedURLException e) { prop.put("defaulturl", e.getMessage()); diff --git a/source/de/anomic/search/Switchboard.java b/source/de/anomic/search/Switchboard.java index 12c64ff69..3fe6a217b 100644 --- a/source/de/anomic/search/Switchboard.java +++ b/source/de/anomic/search/Switchboard.java @@ -1318,9 +1318,9 @@ public final class Switchboard extends serverSwitch { // check if url is in accepted domain assert surrogate != null; assert crawlStacker != null; - final String urlRejectReason = crawlStacker.urlInAcceptedDomain(surrogate.getIdentifier()); + final String urlRejectReason = crawlStacker.urlInAcceptedDomain(surrogate.getIdentifier(true)); if (urlRejectReason != null) { - if (this.log.isFine()) this.log.logInfo("Rejected URL '" + surrogate.getIdentifier() + "': " + urlRejectReason); + if (this.log.isFine()) this.log.logInfo("Rejected URL '" + surrogate.getIdentifier(true) + "': " + urlRejectReason); continue; } @@ -1328,7 +1328,7 @@ public final class Switchboard extends serverSwitch { Document document = surrogate.document(); Request request = new Request( peers.mySeed().hash.getBytes(), - surrogate.getIdentifier(), + surrogate.getIdentifier(true), null, "", new Date(), diff --git a/source/net/yacy/document/content/DCEntry.java b/source/net/yacy/document/content/DCEntry.java index 7e6ba94bb..b388baee9 100644 --- a/source/net/yacy/document/content/DCEntry.java +++ b/source/net/yacy/document/content/DCEntry.java @@ -104,9 +104,30 @@ public class DCEntry extends TreeMap { } } - public DigestURI getIdentifier() { + public DigestURI getIdentifier(boolean useRelationAsAlternative) { String u = this.get("url"); if (u == null) u = this.get("dc:identifier"); + if (u == null) return useRelationAsAlternative ? getRelation() : null; + String[] urls = u.split(";"); + if (urls.length > 1) { + // select one that fits + u = bestU(urls); + } + try { + return new DigestURI(u, null); + } catch (MalformedURLException e) { + if (useRelationAsAlternative) { + DigestURI relation = this.getRelation(); + if (relation != null) return relation; + Log.logWarning("DCEntry", "getIdentifier: url is bad, relation also: " + e.getMessage()); + } + Log.logWarning("DCEntry", "getIdentifier: url is bad: " + e.getMessage()); + return null; + } + } + + public DigestURI getRelation() { + String u = this.get("dc:relation"); if (u == null) return null; String[] urls = u.split(";"); if (urls.length > 1) { @@ -116,7 +137,7 @@ public class DCEntry extends TreeMap { try { return new DigestURI(u, null); } catch (MalformedURLException e) { - Log.logException(e); + Log.logWarning("DCEntry", "getRelation: url is bad: " + e.getMessage()); return null; } } @@ -139,7 +160,7 @@ public class DCEntry extends TreeMap { public String getLanguage() { String l = this.get("language"); if (l == null) l = this.get("dc:language"); - if (l == null) return getIdentifier().language(); + if (l == null) return getIdentifier(true).language(); return l; } @@ -220,7 +241,7 @@ public class DCEntry extends TreeMap { try { return new Document( - getIdentifier(), + getIdentifier(true), "text/html", "UTF-8", languages, diff --git a/source/net/yacy/document/content/SurrogateReader.java b/source/net/yacy/document/content/SurrogateReader.java index 3b3682044..cbaaee514 100644 --- a/source/net/yacy/document/content/SurrogateReader.java +++ b/source/net/yacy/document/content/SurrogateReader.java @@ -29,8 +29,10 @@ import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; +import java.util.ArrayList; import java.util.concurrent.ArrayBlockingQueue; import java.util.concurrent.BlockingQueue; +import java.util.zip.GZIPInputStream; import javax.xml.parsers.ParserConfigurationException; import javax.xml.parsers.SAXParser; @@ -170,21 +172,20 @@ public class SurrogateReader extends DefaultHandler implements Runnable { File f = new File(args[0]); SurrogateReader sr; try { - sr = new SurrogateReader(new BufferedInputStream(new FileInputStream(f)), 1); + InputStream is = new BufferedInputStream(new FileInputStream(f)); + if (f.getName().endsWith(".gz")) is = new GZIPInputStream(is); + sr = new SurrogateReader(is, 1); Thread t = new Thread(sr, "Surrogate-Reader " + f.getAbsolutePath()); t.start(); DCEntry s; - System.out.println("1"); while ((s = sr.take()) != DCEntry.poison) { System.out.println("Title: " + s.getTitle()); System.out.println("Date: " + s.getDate()); - System.out.println("URL: " + s.getIdentifier()); + System.out.println("URL: " + s.getIdentifier(true)); System.out.println("Language: " + s.getLanguage()); System.out.println("Body: " + s.getDescription()); - System.out.println("Categories: " + s.getSubject()); } - System.out.println("2"); } catch (IOException e) { Log.logException(e); } diff --git a/source/net/yacy/document/importer/OAIPMHImporter.java b/source/net/yacy/document/importer/OAIPMHImporter.java index 7fc5b453d..b557e37c1 100644 --- a/source/net/yacy/document/importer/OAIPMHImporter.java +++ b/source/net/yacy/document/importer/OAIPMHImporter.java @@ -53,7 +53,7 @@ public class OAIPMHImporter extends Thread implements Importer, Comparable { int recordCounter; - public ResumptionToken(final byte[] b) throws IOException { + private DigestURI source; + + public ResumptionToken(DigestURI source, final byte[] b) throws IOException { super((Collator) insensitiveCollator.clone()); + this.source = source; this.recordCounter = 0; new Parser(b); } + /* public ResumptionToken( + DigestURI source, Date expirationDate, int completeListSize, int cursor, String token ) { super((Collator) insensitiveCollator.clone()); + this.source = source; this.recordCounter = 0; this.put("expirationDate", DateFormatter.formatISO8601(expirationDate)); this.put("completeListSize", Integer.toString(completeListSize)); @@ -77,18 +83,21 @@ public class ResumptionToken extends TreeMap { } public ResumptionToken( + DigestURI source, String expirationDate, int completeListSize, int cursor, String token ) { super((Collator) insensitiveCollator.clone()); + this.source = source; this.recordCounter = 0; this.put("expirationDate", expirationDate); this.put("completeListSize", Integer.toString(completeListSize)); this.put("cursor", Integer.toString(cursor)); this.put("token", token); } + */ /** * truncate the given url at the '?' @@ -116,12 +125,13 @@ public class ResumptionToken extends TreeMap { * @return * @throws IOException in case that no follow-up url can be generated; i.e. if the expiration date is exceeded */ - public DigestURI resumptionURL(DigestURI givenURL) throws IOException { - // decide which kind of encoding stratgy was used to get a resumptionToken: + public DigestURI resumptionURL() throws IOException { + // decide which kind of encoding strategy was used to get a resumptionToken: String token = this.getToken(); - if (token == null || token.length() == 0) throw new IOException("end of resumption reached"); - String url = truncatedURL(givenURL); + if (token == null) throw new IOException("end of resumption reached - token == null"); + if (token.length() == 0) throw new IOException("end of resumption reached - token.length() == 0"); + String url = truncatedURL(this.source); // encoded state if (token.indexOf("from=") >= 0) { @@ -135,8 +145,40 @@ public class ResumptionToken extends TreeMap { if (expiration.before(new Date())) throw new IOException("the resumption is expired at " + DateFormatter.formatISO8601(expiration) + " (now: " + DateFormatter.formatISO8601(new Date())); // the resumption token is still fresh } - - return new DigestURI(url + "verb=ListRecords&resumptionToken=" + token, null); + String u = url + "verb=ListRecords&resumptionToken=" + escape(token); + return new DigestURI(u, null); + } + + public static StringBuilder escape(final String s) { + final int len = s.length(); + final StringBuilder sbuf = new StringBuilder(len + 10); + for (int i = 0; i < len; i++) { + final int ch = s.charAt(i); + if (ch == '/') { + sbuf.append("%2F"); + } else if (ch == '?') { + sbuf.append("%3F"); + } else if (ch == '#') { + sbuf.append("%23"); + } else if (ch == '=') { + sbuf.append("%3D"); + } else if (ch == '&') { + sbuf.append("%26"); + } else if (ch == ':') { + sbuf.append("%3A"); + } else if (ch == ';') { + sbuf.append("%3B"); + } else if (ch == ' ') { + sbuf.append("%20"); + } else if (ch == '%') { + sbuf.append("%25"); + } else if (ch == '+') { + sbuf.append("%2B"); + } else { + sbuf.append((char)ch); + } + } + return sbuf; } /** @@ -199,7 +241,7 @@ public class ResumptionToken extends TreeMap { } public String toString() { - return "expirationDate=" + DateFormatter.formatISO8601(this.getExpirationDate()) + ", completeListSize=" + getCompleteListSize() + + return "source = " + this.source + ", expirationDate=" + DateFormatter.formatISO8601(this.getExpirationDate()) + ", completeListSize=" + getCompleteListSize() + ", cursor=" + this.getCursor() + ", token=" + this.getToken(); } @@ -224,13 +266,13 @@ public class ResumptionToken extends TreeMap { this.saxParser.parse(this.stream, this); } catch (SAXException e) { Log.logException(e); - Log.logWarning("ResumptionToken", "token was not parsed:\n" + new String(b)); + Log.logWarning("ResumptionToken", "token was not parsed (1):\n" + new String(b)); } catch (IOException e) { Log.logException(e); - Log.logWarning("ResumptionToken", "token was not parsed:\n" + new String(b)); + Log.logWarning("ResumptionToken", "token was not parsed (2):\n" + new String(b)); } catch (ParserConfigurationException e) { Log.logException(e); - Log.logWarning("ResumptionToken", "token was not parsed:\n" + new String(b)); + Log.logWarning("ResumptionToken", "token was not parsed (3):\n" + new String(b)); throw new IOException(e.getMessage()); } finally { try { @@ -246,7 +288,13 @@ public class ResumptionToken extends TreeMap { completeListSize="226" cursor="0">688 */ - + + /* + 1518323588 + */ + public void startElement(final String uri, final String name, final String tag, final Attributes atts) throws SAXException { if ("record".equals(tag)) { recordCounter++;