diff --git a/htroot/IndexImportOAIPMHList_p.html b/htroot/IndexImportOAIPMHList_p.html
index 168a1f85d..2b2e78440 100644
--- a/htroot/IndexImportOAIPMHList_p.html
+++ b/htroot/IndexImportOAIPMHList_p.html
@@ -57,6 +57,7 @@
Source |
Processed Chunks |
Imported Records |
+ Complete at # Records |
Speed (records/second) |
#{table}#
@@ -65,6 +66,7 @@
#[source]# |
#[chunkCount]# |
#[recordsCount]# |
+ #[completeListSize]# |
#[speed]# |
#{/table}#
diff --git a/htroot/IndexImportOAIPMHList_p.java b/htroot/IndexImportOAIPMHList_p.java
index ceba573a2..4fb79abda 100644
--- a/htroot/IndexImportOAIPMHList_p.java
+++ b/htroot/IndexImportOAIPMHList_p.java
@@ -75,6 +75,7 @@ public class IndexImportOAIPMHList_p {
prop.put("import_table_" + count + "_source", job.source());
prop.put("import_table_" + count + "_chunkCount", job.chunkCount());
prop.put("import_table_" + count + "_recordsCount", job.count());
+ prop.put("import_table_" + count + "_completeListSize", job.getCompleteListSize());
prop.put("import_table_" + count + "_speed", job.speed());
dark = !dark;
count++;
diff --git a/htroot/IndexImportOAIPMH_p.java b/htroot/IndexImportOAIPMH_p.java
index dc6985979..300896331 100644
--- a/htroot/IndexImportOAIPMH_p.java
+++ b/htroot/IndexImportOAIPMH_p.java
@@ -68,7 +68,7 @@ public class IndexImportOAIPMH_p {
// set next default url
try {
- DigestURI nexturl = (rt == null) ? null : rt.resumptionURL(url);
+ DigestURI nexturl = (rt == null) ? null : rt.resumptionURL();
if (rt != null) prop.put("defaulturl", (nexturl == null) ? "" : nexturl.toNormalform(true, false));
} catch (MalformedURLException e) {
prop.put("defaulturl", e.getMessage());
diff --git a/source/de/anomic/search/Switchboard.java b/source/de/anomic/search/Switchboard.java
index 12c64ff69..3fe6a217b 100644
--- a/source/de/anomic/search/Switchboard.java
+++ b/source/de/anomic/search/Switchboard.java
@@ -1318,9 +1318,9 @@ public final class Switchboard extends serverSwitch {
// check if url is in accepted domain
assert surrogate != null;
assert crawlStacker != null;
- final String urlRejectReason = crawlStacker.urlInAcceptedDomain(surrogate.getIdentifier());
+ final String urlRejectReason = crawlStacker.urlInAcceptedDomain(surrogate.getIdentifier(true));
if (urlRejectReason != null) {
- if (this.log.isFine()) this.log.logInfo("Rejected URL '" + surrogate.getIdentifier() + "': " + urlRejectReason);
+ if (this.log.isFine()) this.log.logInfo("Rejected URL '" + surrogate.getIdentifier(true) + "': " + urlRejectReason);
continue;
}
@@ -1328,7 +1328,7 @@ public final class Switchboard extends serverSwitch {
Document document = surrogate.document();
Request request = new Request(
peers.mySeed().hash.getBytes(),
- surrogate.getIdentifier(),
+ surrogate.getIdentifier(true),
null,
"",
new Date(),
diff --git a/source/net/yacy/document/content/DCEntry.java b/source/net/yacy/document/content/DCEntry.java
index 7e6ba94bb..b388baee9 100644
--- a/source/net/yacy/document/content/DCEntry.java
+++ b/source/net/yacy/document/content/DCEntry.java
@@ -104,9 +104,30 @@ public class DCEntry extends TreeMap {
}
}
- public DigestURI getIdentifier() {
+ /** Returns the entry's URL ("url", else "dc:identifier"); optionally falls back to getRelation(). May return null. */
+ public DigestURI getIdentifier(boolean useRelationAsAlternative) {
String u = this.get("url");
if (u == null) u = this.get("dc:identifier");
+ if (u == null) return useRelationAsAlternative ? getRelation() : null;
+ String[] urls = u.split(";");
+ // several ';'-separated candidates: select one that fits
+ if (urls.length > 1) u = bestU(urls);
+ try {
+ return new DigestURI(u, null);
+ } catch (MalformedURLException e) {
+ if (!useRelationAsAlternative) {
+ Log.logWarning("DCEntry", "getIdentifier: url is bad: " + e.getMessage());
+ return null;
+ }
+ DigestURI relation = this.getRelation();
+ // warn exactly once, and only if the fallback produced nothing either
+ if (relation == null) Log.logWarning("DCEntry", "getIdentifier: url is bad, relation also: " + e.getMessage());
+ return relation;
+ }
+ }
+
+ public DigestURI getRelation() {
+ String u = this.get("dc:relation");
if (u == null) return null;
String[] urls = u.split(";");
if (urls.length > 1) {
@@ -116,7 +137,7 @@ public class DCEntry extends TreeMap {
try {
return new DigestURI(u, null);
} catch (MalformedURLException e) {
- Log.logException(e);
+ Log.logWarning("DCEntry", "getRelation: url is bad: " + e.getMessage());
return null;
}
}
@@ -139,7 +160,7 @@ public class DCEntry extends TreeMap {
public String getLanguage() {
String l = this.get("language");
if (l == null) l = this.get("dc:language");
- if (l == null) return getIdentifier().language();
+ if (l == null) return getIdentifier(true).language();
return l;
}
@@ -220,7 +241,7 @@ public class DCEntry extends TreeMap {
try {
return new Document(
- getIdentifier(),
+ getIdentifier(true),
"text/html",
"UTF-8",
languages,
diff --git a/source/net/yacy/document/content/SurrogateReader.java b/source/net/yacy/document/content/SurrogateReader.java
index 3b3682044..cbaaee514 100644
--- a/source/net/yacy/document/content/SurrogateReader.java
+++ b/source/net/yacy/document/content/SurrogateReader.java
@@ -29,8 +29,10 @@ import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
+import java.util.ArrayList;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
+import java.util.zip.GZIPInputStream;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
@@ -170,21 +172,20 @@ public class SurrogateReader extends DefaultHandler implements Runnable {
File f = new File(args[0]);
SurrogateReader sr;
try {
- sr = new SurrogateReader(new BufferedInputStream(new FileInputStream(f)), 1);
+ InputStream is = new BufferedInputStream(new FileInputStream(f));
+ if (f.getName().endsWith(".gz")) is = new GZIPInputStream(is);
+ sr = new SurrogateReader(is, 1);
Thread t = new Thread(sr, "Surrogate-Reader " + f.getAbsolutePath());
t.start();
DCEntry s;
- System.out.println("1");
while ((s = sr.take()) != DCEntry.poison) {
System.out.println("Title: " + s.getTitle());
System.out.println("Date: " + s.getDate());
- System.out.println("URL: " + s.getIdentifier());
+ System.out.println("URL: " + s.getIdentifier(true));
System.out.println("Language: " + s.getLanguage());
System.out.println("Body: " + s.getDescription());
- System.out.println("Categories: " + s.getSubject());
}
- System.out.println("2");
} catch (IOException e) {
Log.logException(e);
}
diff --git a/source/net/yacy/document/importer/OAIPMHImporter.java b/source/net/yacy/document/importer/OAIPMHImporter.java
index 7fc5b453d..b557e37c1 100644
--- a/source/net/yacy/document/importer/OAIPMHImporter.java
+++ b/source/net/yacy/document/importer/OAIPMHImporter.java
@@ -53,7 +53,7 @@ public class OAIPMHImporter extends Thread implements Importer, Comparable {
int recordCounter;
- public ResumptionToken(final byte[] b) throws IOException {
+ private DigestURI source;
+
+ public ResumptionToken(DigestURI source, final byte[] b) throws IOException {
super((Collator) insensitiveCollator.clone());
+ this.source = source;
this.recordCounter = 0;
new Parser(b);
}
+ /*
public ResumptionToken(
+ DigestURI source,
Date expirationDate,
int completeListSize,
int cursor,
String token
) {
super((Collator) insensitiveCollator.clone());
+ this.source = source;
this.recordCounter = 0;
this.put("expirationDate", DateFormatter.formatISO8601(expirationDate));
this.put("completeListSize", Integer.toString(completeListSize));
@@ -77,18 +83,21 @@ public class ResumptionToken extends TreeMap {
}
public ResumptionToken(
+ DigestURI source,
String expirationDate,
int completeListSize,
int cursor,
String token
) {
super((Collator) insensitiveCollator.clone());
+ this.source = source;
this.recordCounter = 0;
this.put("expirationDate", expirationDate);
this.put("completeListSize", Integer.toString(completeListSize));
this.put("cursor", Integer.toString(cursor));
this.put("token", token);
}
+ */
/**
* truncate the given url at the '?'
@@ -116,12 +125,13 @@ public class ResumptionToken extends TreeMap {
* @return
* @throws IOException in case that no follow-up url can be generated; i.e. if the expiration date is exceeded
*/
- public DigestURI resumptionURL(DigestURI givenURL) throws IOException {
- // decide which kind of encoding stratgy was used to get a resumptionToken:
+ public DigestURI resumptionURL() throws IOException {
+ // decide which kind of encoding strategy was used to get a resumptionToken:
String token = this.getToken();
- if (token == null || token.length() == 0) throw new IOException("end of resumption reached");
- String url = truncatedURL(givenURL);
+ if (token == null) throw new IOException("end of resumption reached - token == null");
+ if (token.length() == 0) throw new IOException("end of resumption reached - token.length() == 0");
+ String url = truncatedURL(this.source);
// encoded state
if (token.indexOf("from=") >= 0) {
@@ -135,8 +145,40 @@ public class ResumptionToken extends TreeMap {
if (expiration.before(new Date())) throw new IOException("the resumption is expired at " + DateFormatter.formatISO8601(expiration) + " (now: " + DateFormatter.formatISO8601(new Date()));
// the resumption token is still fresh
}
-
- return new DigestURI(url + "verb=ListRecords&resumptionToken=" + token, null);
+ String u = url + "verb=ListRecords&resumptionToken=" + escape(token);
+ return new DigestURI(u, null);
+ }
+
+ /**
+ * Percent-encodes the given text so it can be appended as a URL query value.
+ * The characters / ? # = & : ; space % and + are replaced by their
+ * %XX escapes; every other character is copied unchanged.
+ *
+ * @param s the raw text to encode
+ * @return a new StringBuilder holding the encoded text
+ */
+ public static StringBuilder escape(final String s) {
+ final int len = s.length();
+ // reserve a little headroom for the escape sequences
+ final StringBuilder encoded = new StringBuilder(len + 10);
+ for (final char c : s.toCharArray()) {
+ // one case per character that must be escaped
+ switch (c) {
+ case '/': encoded.append("%2F"); break;
+ case '?': encoded.append("%3F"); break;
+ case '#': encoded.append("%23"); break;
+ case '=': encoded.append("%3D"); break;
+ case '&': encoded.append("%26"); break;
+ case ':': encoded.append("%3A"); break;
+ case ';': encoded.append("%3B"); break;
+ case ' ': encoded.append("%20"); break;
+ case '%': encoded.append("%25"); break;
+ case '+': encoded.append("%2B"); break;
+ // anything else passes through untouched
+ default: encoded.append(c); break;
+ }
+ }
+ return encoded;
+ }
/**
@@ -199,7 +241,7 @@ public class ResumptionToken extends TreeMap {
}
public String toString() {
- return "expirationDate=" + DateFormatter.formatISO8601(this.getExpirationDate()) + ", completeListSize=" + getCompleteListSize() +
+ return "source = " + this.source + ", expirationDate=" + DateFormatter.formatISO8601(this.getExpirationDate()) + ", completeListSize=" + getCompleteListSize() +
", cursor=" + this.getCursor() + ", token=" + this.getToken();
}
@@ -224,13 +266,13 @@ public class ResumptionToken extends TreeMap {
this.saxParser.parse(this.stream, this);
} catch (SAXException e) {
Log.logException(e);
- Log.logWarning("ResumptionToken", "token was not parsed:\n" + new String(b));
+ Log.logWarning("ResumptionToken", "token was not parsed (1):\n" + new String(b));
} catch (IOException e) {
Log.logException(e);
- Log.logWarning("ResumptionToken", "token was not parsed:\n" + new String(b));
+ Log.logWarning("ResumptionToken", "token was not parsed (2):\n" + new String(b));
} catch (ParserConfigurationException e) {
Log.logException(e);
- Log.logWarning("ResumptionToken", "token was not parsed:\n" + new String(b));
+ Log.logWarning("ResumptionToken", "token was not parsed (3):\n" + new String(b));
throw new IOException(e.getMessage());
} finally {
try {
@@ -246,7 +288,13 @@ public class ResumptionToken extends TreeMap {
completeListSize="226"
cursor="0">688
*/
-
+
+ /*
+ 1518323588
+ */
+
public void startElement(final String uri, final String name, final String tag, final Attributes atts) throws SAXException {
if ("record".equals(tag)) {
recordCounter++;