- fixed resumption token generation for oai-pmh import

- relaxed Dublin Core parsing: the dc:relation tag may replace dc:identifier if the latter does not contain a valid URL
- parsing of the completeListSize number and presentation of it in the download list of the OAI import

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6850 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 15 years ago
parent af13a02307
commit 5fbf866cae

@ -57,6 +57,7 @@
<td>Source</td>
<td>Processed<br />Chunks</td>
<td>Imported<br />Records</td>
<td>Complete at<br /># Records</td>
<td>Speed<br />(records/second)</td>
</tr>
#{table}#
@ -65,6 +66,7 @@
<td>#[source]#</td>
<td>#[chunkCount]#</td>
<td>#[recordsCount]#</td>
<td>#[completeListSize]#</td>
<td>#[speed]#</td>
</tr>
#{/table}#

@ -75,6 +75,7 @@ public class IndexImportOAIPMHList_p {
prop.put("import_table_" + count + "_source", job.source());
prop.put("import_table_" + count + "_chunkCount", job.chunkCount());
prop.put("import_table_" + count + "_recordsCount", job.count());
prop.put("import_table_" + count + "_completeListSize", job.getCompleteListSize());
prop.put("import_table_" + count + "_speed", job.speed());
dark = !dark;
count++;

@ -68,7 +68,7 @@ public class IndexImportOAIPMH_p {
// set next default url
try {
DigestURI nexturl = (rt == null) ? null : rt.resumptionURL(url);
DigestURI nexturl = (rt == null) ? null : rt.resumptionURL();
if (rt != null) prop.put("defaulturl", (nexturl == null) ? "" : nexturl.toNormalform(true, false));
} catch (MalformedURLException e) {
prop.put("defaulturl", e.getMessage());

@ -1318,9 +1318,9 @@ public final class Switchboard extends serverSwitch {
// check if url is in accepted domain
assert surrogate != null;
assert crawlStacker != null;
final String urlRejectReason = crawlStacker.urlInAcceptedDomain(surrogate.getIdentifier());
final String urlRejectReason = crawlStacker.urlInAcceptedDomain(surrogate.getIdentifier(true));
if (urlRejectReason != null) {
if (this.log.isFine()) this.log.logInfo("Rejected URL '" + surrogate.getIdentifier() + "': " + urlRejectReason);
if (this.log.isFine()) this.log.logInfo("Rejected URL '" + surrogate.getIdentifier(true) + "': " + urlRejectReason);
continue;
}
@ -1328,7 +1328,7 @@ public final class Switchboard extends serverSwitch {
Document document = surrogate.document();
Request request = new Request(
peers.mySeed().hash.getBytes(),
surrogate.getIdentifier(),
surrogate.getIdentifier(true),
null,
"",
new Date(),

@ -104,9 +104,30 @@ public class DCEntry extends TreeMap<String, String> {
}
}
public DigestURI getIdentifier() {
/**
 * Resolves the URL that identifies this entry.
 *
 * The value stored under "url" is used if present, otherwise the value stored under
 * "dc:identifier". A value containing ';' is split and one candidate is chosen by
 * {@code bestU}. If no value exists, or the chosen value is not a well-formed URL,
 * the result of {@link #getRelation()} is used instead when the caller allows it.
 *
 * @param useRelationAsAlternative if true, fall back to {@link #getRelation()} when the
 *        identifier is missing or malformed
 * @return the identifying URL, or null if none could be derived (a warning is logged when
 *         the identifier was present but malformed)
 */
public DigestURI getIdentifier(boolean useRelationAsAlternative) {
    String u = this.get("url");
    if (u == null) u = this.get("dc:identifier");
    if (u == null) return useRelationAsAlternative ? getRelation() : null;
    String[] urls = u.split(";");
    if (urls.length > 1) {
        // select one that fits
        u = bestU(urls);
    }
    try {
        return new DigestURI(u, null);
    } catch (MalformedURLException e) {
        if (useRelationAsAlternative) {
            DigestURI relation = this.getRelation();
            if (relation != null) return relation;
            // both sources failed: report once, with the message that says so
            Log.logWarning("DCEntry", "getIdentifier: url is bad, relation also: " + e.getMessage());
        } else {
            Log.logWarning("DCEntry", "getIdentifier: url is bad: " + e.getMessage());
        }
        return null;
    }
}
public DigestURI getRelation() {
String u = this.get("dc:relation");
if (u == null) return null;
String[] urls = u.split(";");
if (urls.length > 1) {
@ -116,7 +137,7 @@ public class DCEntry extends TreeMap<String, String> {
try {
return new DigestURI(u, null);
} catch (MalformedURLException e) {
Log.logException(e);
Log.logWarning("DCEntry", "getRelation: url is bad: " + e.getMessage());
return null;
}
}
@ -139,7 +160,7 @@ public class DCEntry extends TreeMap<String, String> {
public String getLanguage() {
String l = this.get("language");
if (l == null) l = this.get("dc:language");
if (l == null) return getIdentifier().language();
if (l == null) return getIdentifier(true).language();
return l;
}
@ -220,7 +241,7 @@ public class DCEntry extends TreeMap<String, String> {
try {
return new Document(
getIdentifier(),
getIdentifier(true),
"text/html",
"UTF-8",
languages,

@ -29,8 +29,10 @@ import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.zip.GZIPInputStream;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
@ -170,21 +172,20 @@ public class SurrogateReader extends DefaultHandler implements Runnable {
File f = new File(args[0]);
SurrogateReader sr;
try {
sr = new SurrogateReader(new BufferedInputStream(new FileInputStream(f)), 1);
InputStream is = new BufferedInputStream(new FileInputStream(f));
if (f.getName().endsWith(".gz")) is = new GZIPInputStream(is);
sr = new SurrogateReader(is, 1);
Thread t = new Thread(sr, "Surrogate-Reader " + f.getAbsolutePath());
t.start();
DCEntry s;
System.out.println("1");
while ((s = sr.take()) != DCEntry.poison) {
System.out.println("Title: " + s.getTitle());
System.out.println("Date: " + s.getDate());
System.out.println("URL: " + s.getIdentifier());
System.out.println("URL: " + s.getIdentifier(true));
System.out.println("Language: " + s.getLanguage());
System.out.println("Body: " + s.getDescription());
System.out.println("Categories: " + s.getSubject());
}
System.out.println("2");
} catch (IOException e) {
Log.logException(e);
}

@ -53,7 +53,7 @@ public class OAIPMHImporter extends Thread implements Importer, Comparable<OAIPM
private final LoaderDispatcher loader;
private DigestURI source;
private int recordsCount, chunkCount;
private int recordsCount, chunkCount, completeListSize;
private final long startTime;
private long finishTime;
private final ResumptionToken resumptionToken;
@ -65,6 +65,7 @@ public class OAIPMHImporter extends Thread implements Importer, Comparable<OAIPM
this.loader = loader;
this.recordsCount = 0;
this.chunkCount = 0;
this.completeListSize = 0;
this.startTime = System.currentTimeMillis();
this.finishTime = 0;
this.resumptionToken = null;
@ -97,6 +98,10 @@ public class OAIPMHImporter extends Thread implements Importer, Comparable<OAIPM
return this.resumptionToken;
}
/**
 * Reports the complete list size recorded by this importer.
 *
 * @return the current value of the completeListSize field
 */
public int getCompleteListSize() {
    return completeListSize;
}
/**
 * Reports the remaining run time of this import. No real estimate is computed:
 * the answer only distinguishes a running thread from one that is not alive.
 *
 * @return Long.MAX_VALUE while this thread is alive, otherwise 0
 */
public long remainingTime() {
    if (isAlive()) {
        return Long.MAX_VALUE; // we don't know
    }
    return 0;
}
@ -123,9 +128,10 @@ public class OAIPMHImporter extends Thread implements Importer, Comparable<OAIPM
while (true) {
try {
OAIPMHLoader loader = new OAIPMHLoader(this.loader, this.source, Switchboard.getSwitchboard().surrogatesInPath, filenamePrefix);
this.completeListSize = Math.max(this.completeListSize, loader.getResumptionToken().getCompleteListSize());
this.chunkCount++;
this.recordsCount += loader.getResumptionToken().getRecordCounter();
this.source = loader.getResumptionToken().resumptionURL(this.source);
this.source = loader.getResumptionToken().resumptionURL();
if (this.source == null) {
this.message = "import terminated with source = null";
break;

@ -50,7 +50,8 @@ public class OAIPMHLoader {
// load the file from the net
Response response = loader.load(source, false, true, CrawlProfile.CACHE_STRATEGY_NOCACHE);
byte[] b = response.getContent();
this.resumptionToken = new ResumptionToken(b);
this.resumptionToken = new ResumptionToken(source, b);
//System.out.println("*** ResumptionToken = " + this.resumptionToken.toString());
File f1 = new File(targetDir, OAIPMHImporter.filename4Source(source));
File f0 = new File(targetDir, f1.getName() + ".tmp");

@ -56,19 +56,25 @@ public class ResumptionToken extends TreeMap<String, String> {
int recordCounter;
public ResumptionToken(final byte[] b) throws IOException {
private DigestURI source;
/**
 * Creates a resumption token from a raw server response.
 *
 * @param source the URL that is stored in this token's source field
 * @param b the response bytes handed to the Parser
 * @throws IOException presumably raised by the Parser when the content cannot be processed
 *         (TODO confirm against the Parser constructor)
 */
public ResumptionToken(DigestURI source, final byte[] b) throws IOException {
// the backing map is ordered by a private copy of the shared insensitiveCollator
super((Collator) insensitiveCollator.clone());
this.source = source;
this.recordCounter = 0;
// the Parser instance is discarded; presumably its constructor runs the parse and fills
// this map and recordCounter as a side effect, so it must come last (TODO confirm)
new Parser(b);
}
/*
public ResumptionToken(
DigestURI source,
Date expirationDate,
int completeListSize,
int cursor,
String token
) {
super((Collator) insensitiveCollator.clone());
this.source = source;
this.recordCounter = 0;
this.put("expirationDate", DateFormatter.formatISO8601(expirationDate));
this.put("completeListSize", Integer.toString(completeListSize));
@ -77,18 +83,21 @@ public class ResumptionToken extends TreeMap<String, String> {
}
public ResumptionToken(
DigestURI source,
String expirationDate,
int completeListSize,
int cursor,
String token
) {
super((Collator) insensitiveCollator.clone());
this.source = source;
this.recordCounter = 0;
this.put("expirationDate", expirationDate);
this.put("completeListSize", Integer.toString(completeListSize));
this.put("cursor", Integer.toString(cursor));
this.put("token", token);
}
*/
/**
* truncate the given url at the '?'
@ -116,12 +125,13 @@ public class ResumptionToken extends TreeMap<String, String> {
* @return
* @throws IOException in case that no follow-up url can be generated; i.e. if the expiration date is exceeded
*/
public DigestURI resumptionURL(DigestURI givenURL) throws IOException {
// decide which kind of encoding stratgy was used to get a resumptionToken:
public DigestURI resumptionURL() throws IOException {
// decide which kind of encoding strategy was used to get a resumptionToken:
String token = this.getToken();
if (token == null || token.length() == 0) throw new IOException("end of resumption reached");
String url = truncatedURL(givenURL);
if (token == null) throw new IOException("end of resumption reached - token == null");
if (token.length() == 0) throw new IOException("end of resumption reached - token.length() == 0");
String url = truncatedURL(this.source);
// encoded state
if (token.indexOf("from=") >= 0) {
@ -135,8 +145,40 @@ public class ResumptionToken extends TreeMap<String, String> {
if (expiration.before(new Date())) throw new IOException("the resumption is expired at " + DateFormatter.formatISO8601(expiration) + " (now: " + DateFormatter.formatISO8601(new Date()));
// the resumption token is still fresh
}
return new DigestURI(url + "verb=ListRecords&resumptionToken=" + token, null);
String u = url + "verb=ListRecords&resumptionToken=" + escape(token);
return new DigestURI(u, null);
}
/**
 * Percent-encodes a fixed set of characters so that the text can be placed in a URL
 * query value. Only '/', '?', '#', '=', '&', ':', ';', ' ', '%' and '+' are replaced;
 * every other character is copied unchanged.
 *
 * @param s the text to encode; must not be null
 * @return a new StringBuilder holding the encoded text
 */
public static StringBuilder escape(final String s) {
    final int length = s.length();
    final StringBuilder escaped = new StringBuilder(length + 10);
    for (int pos = 0; pos < length; pos++) {
        final char c = s.charAt(pos);
        switch (c) {
            case '/': escaped.append("%2F"); break;
            case '?': escaped.append("%3F"); break;
            case '#': escaped.append("%23"); break;
            case '=': escaped.append("%3D"); break;
            case '&': escaped.append("%26"); break;
            case ':': escaped.append("%3A"); break;
            case ';': escaped.append("%3B"); break;
            case ' ': escaped.append("%20"); break;
            case '%': escaped.append("%25"); break;
            case '+': escaped.append("%2B"); break;
            default:  escaped.append(c);     break;
        }
    }
    return escaped;
}
/**
@ -199,7 +241,7 @@ public class ResumptionToken extends TreeMap<String, String> {
}
public String toString() {
return "expirationDate=" + DateFormatter.formatISO8601(this.getExpirationDate()) + ", completeListSize=" + getCompleteListSize() +
return "source = " + this.source + ", expirationDate=" + DateFormatter.formatISO8601(this.getExpirationDate()) + ", completeListSize=" + getCompleteListSize() +
", cursor=" + this.getCursor() + ", token=" + this.getToken();
}
@ -224,13 +266,13 @@ public class ResumptionToken extends TreeMap<String, String> {
this.saxParser.parse(this.stream, this);
} catch (SAXException e) {
Log.logException(e);
Log.logWarning("ResumptionToken", "token was not parsed:\n" + new String(b));
Log.logWarning("ResumptionToken", "token was not parsed (1):\n" + new String(b));
} catch (IOException e) {
Log.logException(e);
Log.logWarning("ResumptionToken", "token was not parsed:\n" + new String(b));
Log.logWarning("ResumptionToken", "token was not parsed (2):\n" + new String(b));
} catch (ParserConfigurationException e) {
Log.logException(e);
Log.logWarning("ResumptionToken", "token was not parsed:\n" + new String(b));
Log.logWarning("ResumptionToken", "token was not parsed (3):\n" + new String(b));
throw new IOException(e.getMessage());
} finally {
try {
@ -246,7 +288,13 @@ public class ResumptionToken extends TreeMap<String, String> {
completeListSize="226"
cursor="0">688</resumptionToken>
*/
/*
<resumptionToken expirationDate="2010-05-03T19:30:43Z"
completeListSize="578"
cursor="0">1518323588</resumptionToken>
*/
public void startElement(final String uri, final String name, final String tag, final Attributes atts) throws SAXException {
if ("record".equals(tag)) {
recordCounter++;

Loading…
Cancel
Save