- fixed resumption token generation for oai-pmh import

- relaxed dublin core parsing: the dc:relation tag may replace dc:identifier if the latter does not contain a valid url
- parsing of the completeListSize number and presentation of it in the download list of the oai import

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6850 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 15 years ago
parent af13a02307
commit 5fbf866cae

@ -57,6 +57,7 @@
<td>Source</td> <td>Source</td>
<td>Processed<br />Chunks</td> <td>Processed<br />Chunks</td>
<td>Imported<br />Records</td> <td>Imported<br />Records</td>
<td>Complete at<br /># Records</td>
<td>Speed<br />(records/second)</td> <td>Speed<br />(records/second)</td>
</tr> </tr>
#{table}# #{table}#
@ -65,6 +66,7 @@
<td>#[source]#</td> <td>#[source]#</td>
<td>#[chunkCount]#</td> <td>#[chunkCount]#</td>
<td>#[recordsCount]#</td> <td>#[recordsCount]#</td>
<td>#[completeListSize]#</td>
<td>#[speed]#</td> <td>#[speed]#</td>
</tr> </tr>
#{/table}# #{/table}#

@ -75,6 +75,7 @@ public class IndexImportOAIPMHList_p {
prop.put("import_table_" + count + "_source", job.source()); prop.put("import_table_" + count + "_source", job.source());
prop.put("import_table_" + count + "_chunkCount", job.chunkCount()); prop.put("import_table_" + count + "_chunkCount", job.chunkCount());
prop.put("import_table_" + count + "_recordsCount", job.count()); prop.put("import_table_" + count + "_recordsCount", job.count());
prop.put("import_table_" + count + "_completeListSize", job.getCompleteListSize());
prop.put("import_table_" + count + "_speed", job.speed()); prop.put("import_table_" + count + "_speed", job.speed());
dark = !dark; dark = !dark;
count++; count++;

@ -68,7 +68,7 @@ public class IndexImportOAIPMH_p {
// set next default url // set next default url
try { try {
DigestURI nexturl = (rt == null) ? null : rt.resumptionURL(url); DigestURI nexturl = (rt == null) ? null : rt.resumptionURL();
if (rt != null) prop.put("defaulturl", (nexturl == null) ? "" : nexturl.toNormalform(true, false)); if (rt != null) prop.put("defaulturl", (nexturl == null) ? "" : nexturl.toNormalform(true, false));
} catch (MalformedURLException e) { } catch (MalformedURLException e) {
prop.put("defaulturl", e.getMessage()); prop.put("defaulturl", e.getMessage());

@ -1318,9 +1318,9 @@ public final class Switchboard extends serverSwitch {
// check if url is in accepted domain // check if url is in accepted domain
assert surrogate != null; assert surrogate != null;
assert crawlStacker != null; assert crawlStacker != null;
final String urlRejectReason = crawlStacker.urlInAcceptedDomain(surrogate.getIdentifier()); final String urlRejectReason = crawlStacker.urlInAcceptedDomain(surrogate.getIdentifier(true));
if (urlRejectReason != null) { if (urlRejectReason != null) {
if (this.log.isFine()) this.log.logInfo("Rejected URL '" + surrogate.getIdentifier() + "': " + urlRejectReason); if (this.log.isFine()) this.log.logInfo("Rejected URL '" + surrogate.getIdentifier(true) + "': " + urlRejectReason);
continue; continue;
} }
@ -1328,7 +1328,7 @@ public final class Switchboard extends serverSwitch {
Document document = surrogate.document(); Document document = surrogate.document();
Request request = new Request( Request request = new Request(
peers.mySeed().hash.getBytes(), peers.mySeed().hash.getBytes(),
surrogate.getIdentifier(), surrogate.getIdentifier(true),
null, null,
"", "",
new Date(), new Date(),

@ -104,9 +104,30 @@ public class DCEntry extends TreeMap<String, String> {
} }
} }
public DigestURI getIdentifier() { public DigestURI getIdentifier(boolean useRelationAsAlternative) {
String u = this.get("url"); String u = this.get("url");
if (u == null) u = this.get("dc:identifier"); if (u == null) u = this.get("dc:identifier");
if (u == null) return useRelationAsAlternative ? getRelation() : null;
String[] urls = u.split(";");
if (urls.length > 1) {
// select one that fits
u = bestU(urls);
}
try {
return new DigestURI(u, null);
} catch (MalformedURLException e) {
if (useRelationAsAlternative) {
DigestURI relation = this.getRelation();
if (relation != null) return relation;
Log.logWarning("DCEntry", "getIdentifier: url is bad, relation also: " + e.getMessage());
}
Log.logWarning("DCEntry", "getIdentifier: url is bad: " + e.getMessage());
return null;
}
}
public DigestURI getRelation() {
String u = this.get("dc:relation");
if (u == null) return null; if (u == null) return null;
String[] urls = u.split(";"); String[] urls = u.split(";");
if (urls.length > 1) { if (urls.length > 1) {
@ -116,7 +137,7 @@ public class DCEntry extends TreeMap<String, String> {
try { try {
return new DigestURI(u, null); return new DigestURI(u, null);
} catch (MalformedURLException e) { } catch (MalformedURLException e) {
Log.logException(e); Log.logWarning("DCEntry", "getRelation: url is bad: " + e.getMessage());
return null; return null;
} }
} }
@ -139,7 +160,7 @@ public class DCEntry extends TreeMap<String, String> {
public String getLanguage() { public String getLanguage() {
String l = this.get("language"); String l = this.get("language");
if (l == null) l = this.get("dc:language"); if (l == null) l = this.get("dc:language");
if (l == null) return getIdentifier().language(); if (l == null) return getIdentifier(true).language();
return l; return l;
} }
@ -220,7 +241,7 @@ public class DCEntry extends TreeMap<String, String> {
try { try {
return new Document( return new Document(
getIdentifier(), getIdentifier(true),
"text/html", "text/html",
"UTF-8", "UTF-8",
languages, languages,

@ -29,8 +29,10 @@ import java.io.File;
import java.io.FileInputStream; import java.io.FileInputStream;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.util.ArrayList;
import java.util.concurrent.ArrayBlockingQueue; import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue; import java.util.concurrent.BlockingQueue;
import java.util.zip.GZIPInputStream;
import javax.xml.parsers.ParserConfigurationException; import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser; import javax.xml.parsers.SAXParser;
@ -170,21 +172,20 @@ public class SurrogateReader extends DefaultHandler implements Runnable {
File f = new File(args[0]); File f = new File(args[0]);
SurrogateReader sr; SurrogateReader sr;
try { try {
sr = new SurrogateReader(new BufferedInputStream(new FileInputStream(f)), 1); InputStream is = new BufferedInputStream(new FileInputStream(f));
if (f.getName().endsWith(".gz")) is = new GZIPInputStream(is);
sr = new SurrogateReader(is, 1);
Thread t = new Thread(sr, "Surrogate-Reader " + f.getAbsolutePath()); Thread t = new Thread(sr, "Surrogate-Reader " + f.getAbsolutePath());
t.start(); t.start();
DCEntry s; DCEntry s;
System.out.println("1");
while ((s = sr.take()) != DCEntry.poison) { while ((s = sr.take()) != DCEntry.poison) {
System.out.println("Title: " + s.getTitle()); System.out.println("Title: " + s.getTitle());
System.out.println("Date: " + s.getDate()); System.out.println("Date: " + s.getDate());
System.out.println("URL: " + s.getIdentifier()); System.out.println("URL: " + s.getIdentifier(true));
System.out.println("Language: " + s.getLanguage()); System.out.println("Language: " + s.getLanguage());
System.out.println("Body: " + s.getDescription()); System.out.println("Body: " + s.getDescription());
System.out.println("Categories: " + s.getSubject());
} }
System.out.println("2");
} catch (IOException e) { } catch (IOException e) {
Log.logException(e); Log.logException(e);
} }

@ -53,7 +53,7 @@ public class OAIPMHImporter extends Thread implements Importer, Comparable<OAIPM
private final LoaderDispatcher loader; private final LoaderDispatcher loader;
private DigestURI source; private DigestURI source;
private int recordsCount, chunkCount; private int recordsCount, chunkCount, completeListSize;
private final long startTime; private final long startTime;
private long finishTime; private long finishTime;
private final ResumptionToken resumptionToken; private final ResumptionToken resumptionToken;
@ -65,6 +65,7 @@ public class OAIPMHImporter extends Thread implements Importer, Comparable<OAIPM
this.loader = loader; this.loader = loader;
this.recordsCount = 0; this.recordsCount = 0;
this.chunkCount = 0; this.chunkCount = 0;
this.completeListSize = 0;
this.startTime = System.currentTimeMillis(); this.startTime = System.currentTimeMillis();
this.finishTime = 0; this.finishTime = 0;
this.resumptionToken = null; this.resumptionToken = null;
@ -97,6 +98,10 @@ public class OAIPMHImporter extends Thread implements Importer, Comparable<OAIPM
return this.resumptionToken; return this.resumptionToken;
} }
/**
 * Returns the complete-list size recorded by this importer.
 * The field starts at 0 and is raised to the largest value reported by
 * the resumption tokens of the chunks loaded so far.
 *
 * @return the currently known complete list size
 */
public int getCompleteListSize() {
    return completeListSize;
}
public long remainingTime() { public long remainingTime() {
return (this.isAlive()) ? Long.MAX_VALUE : 0; // we don't know return (this.isAlive()) ? Long.MAX_VALUE : 0; // we don't know
} }
@ -123,9 +128,10 @@ public class OAIPMHImporter extends Thread implements Importer, Comparable<OAIPM
while (true) { while (true) {
try { try {
OAIPMHLoader loader = new OAIPMHLoader(this.loader, this.source, Switchboard.getSwitchboard().surrogatesInPath, filenamePrefix); OAIPMHLoader loader = new OAIPMHLoader(this.loader, this.source, Switchboard.getSwitchboard().surrogatesInPath, filenamePrefix);
this.completeListSize = Math.max(this.completeListSize, loader.getResumptionToken().getCompleteListSize());
this.chunkCount++; this.chunkCount++;
this.recordsCount += loader.getResumptionToken().getRecordCounter(); this.recordsCount += loader.getResumptionToken().getRecordCounter();
this.source = loader.getResumptionToken().resumptionURL(this.source); this.source = loader.getResumptionToken().resumptionURL();
if (this.source == null) { if (this.source == null) {
this.message = "import terminated with source = null"; this.message = "import terminated with source = null";
break; break;

@ -50,7 +50,8 @@ public class OAIPMHLoader {
// load the file from the net // load the file from the net
Response response = loader.load(source, false, true, CrawlProfile.CACHE_STRATEGY_NOCACHE); Response response = loader.load(source, false, true, CrawlProfile.CACHE_STRATEGY_NOCACHE);
byte[] b = response.getContent(); byte[] b = response.getContent();
this.resumptionToken = new ResumptionToken(b); this.resumptionToken = new ResumptionToken(source, b);
//System.out.println("*** ResumptionToken = " + this.resumptionToken.toString());
File f1 = new File(targetDir, OAIPMHImporter.filename4Source(source)); File f1 = new File(targetDir, OAIPMHImporter.filename4Source(source));
File f0 = new File(targetDir, f1.getName() + ".tmp"); File f0 = new File(targetDir, f1.getName() + ".tmp");

@ -56,19 +56,25 @@ public class ResumptionToken extends TreeMap<String, String> {
int recordCounter; int recordCounter;
public ResumptionToken(final byte[] b) throws IOException { private DigestURI source;
public ResumptionToken(DigestURI source, final byte[] b) throws IOException {
super((Collator) insensitiveCollator.clone()); super((Collator) insensitiveCollator.clone());
this.source = source;
this.recordCounter = 0; this.recordCounter = 0;
new Parser(b); new Parser(b);
} }
/*
public ResumptionToken( public ResumptionToken(
DigestURI source,
Date expirationDate, Date expirationDate,
int completeListSize, int completeListSize,
int cursor, int cursor,
String token String token
) { ) {
super((Collator) insensitiveCollator.clone()); super((Collator) insensitiveCollator.clone());
this.source = source;
this.recordCounter = 0; this.recordCounter = 0;
this.put("expirationDate", DateFormatter.formatISO8601(expirationDate)); this.put("expirationDate", DateFormatter.formatISO8601(expirationDate));
this.put("completeListSize", Integer.toString(completeListSize)); this.put("completeListSize", Integer.toString(completeListSize));
@ -77,18 +83,21 @@ public class ResumptionToken extends TreeMap<String, String> {
} }
public ResumptionToken( public ResumptionToken(
DigestURI source,
String expirationDate, String expirationDate,
int completeListSize, int completeListSize,
int cursor, int cursor,
String token String token
) { ) {
super((Collator) insensitiveCollator.clone()); super((Collator) insensitiveCollator.clone());
this.source = source;
this.recordCounter = 0; this.recordCounter = 0;
this.put("expirationDate", expirationDate); this.put("expirationDate", expirationDate);
this.put("completeListSize", Integer.toString(completeListSize)); this.put("completeListSize", Integer.toString(completeListSize));
this.put("cursor", Integer.toString(cursor)); this.put("cursor", Integer.toString(cursor));
this.put("token", token); this.put("token", token);
} }
*/
/** /**
* truncate the given url at the '?' * truncate the given url at the '?'
@ -116,12 +125,13 @@ public class ResumptionToken extends TreeMap<String, String> {
* @return * @return
* @throws IOException in case that no follow-up url can be generated; i.e. if the expiration date is exceeded * @throws IOException in case that no follow-up url can be generated; i.e. if the expiration date is exceeded
*/ */
public DigestURI resumptionURL(DigestURI givenURL) throws IOException { public DigestURI resumptionURL() throws IOException {
// decide which kind of encoding stratgy was used to get a resumptionToken: // decide which kind of encoding strategy was used to get a resumptionToken:
String token = this.getToken(); String token = this.getToken();
if (token == null || token.length() == 0) throw new IOException("end of resumption reached"); if (token == null) throw new IOException("end of resumption reached - token == null");
String url = truncatedURL(givenURL); if (token.length() == 0) throw new IOException("end of resumption reached - token.length() == 0");
String url = truncatedURL(this.source);
// encoded state // encoded state
if (token.indexOf("from=") >= 0) { if (token.indexOf("from=") >= 0) {
@ -135,8 +145,40 @@ public class ResumptionToken extends TreeMap<String, String> {
if (expiration.before(new Date())) throw new IOException("the resumption is expired at " + DateFormatter.formatISO8601(expiration) + " (now: " + DateFormatter.formatISO8601(new Date())); if (expiration.before(new Date())) throw new IOException("the resumption is expired at " + DateFormatter.formatISO8601(expiration) + " (now: " + DateFormatter.formatISO8601(new Date()));
// the resumption token is still fresh // the resumption token is still fresh
} }
String u = url + "verb=ListRecords&resumptionToken=" + escape(token);
return new DigestURI(url + "verb=ListRecords&resumptionToken=" + token, null); return new DigestURI(u, null);
}
public static StringBuilder escape(final String s) {
final int len = s.length();
final StringBuilder sbuf = new StringBuilder(len + 10);
for (int i = 0; i < len; i++) {
final int ch = s.charAt(i);
if (ch == '/') {
sbuf.append("%2F");
} else if (ch == '?') {
sbuf.append("%3F");
} else if (ch == '#') {
sbuf.append("%23");
} else if (ch == '=') {
sbuf.append("%3D");
} else if (ch == '&') {
sbuf.append("%26");
} else if (ch == ':') {
sbuf.append("%3A");
} else if (ch == ';') {
sbuf.append("%3B");
} else if (ch == ' ') {
sbuf.append("%20");
} else if (ch == '%') {
sbuf.append("%25");
} else if (ch == '+') {
sbuf.append("%2B");
} else {
sbuf.append((char)ch);
}
}
return sbuf;
} }
/** /**
@ -199,7 +241,7 @@ public class ResumptionToken extends TreeMap<String, String> {
} }
public String toString() { public String toString() {
return "expirationDate=" + DateFormatter.formatISO8601(this.getExpirationDate()) + ", completeListSize=" + getCompleteListSize() + return "source = " + this.source + ", expirationDate=" + DateFormatter.formatISO8601(this.getExpirationDate()) + ", completeListSize=" + getCompleteListSize() +
", cursor=" + this.getCursor() + ", token=" + this.getToken(); ", cursor=" + this.getCursor() + ", token=" + this.getToken();
} }
@ -224,13 +266,13 @@ public class ResumptionToken extends TreeMap<String, String> {
this.saxParser.parse(this.stream, this); this.saxParser.parse(this.stream, this);
} catch (SAXException e) { } catch (SAXException e) {
Log.logException(e); Log.logException(e);
Log.logWarning("ResumptionToken", "token was not parsed:\n" + new String(b)); Log.logWarning("ResumptionToken", "token was not parsed (1):\n" + new String(b));
} catch (IOException e) { } catch (IOException e) {
Log.logException(e); Log.logException(e);
Log.logWarning("ResumptionToken", "token was not parsed:\n" + new String(b)); Log.logWarning("ResumptionToken", "token was not parsed (2):\n" + new String(b));
} catch (ParserConfigurationException e) { } catch (ParserConfigurationException e) {
Log.logException(e); Log.logException(e);
Log.logWarning("ResumptionToken", "token was not parsed:\n" + new String(b)); Log.logWarning("ResumptionToken", "token was not parsed (3):\n" + new String(b));
throw new IOException(e.getMessage()); throw new IOException(e.getMessage());
} finally { } finally {
try { try {
@ -246,7 +288,13 @@ public class ResumptionToken extends TreeMap<String, String> {
completeListSize="226" completeListSize="226"
cursor="0">688</resumptionToken> cursor="0">688</resumptionToken>
*/ */
/*
<resumptionToken expirationDate="2010-05-03T19:30:43Z"
completeListSize="578"
cursor="0">1518323588</resumptionToken>
*/
public void startElement(final String uri, final String name, final String tag, final Attributes atts) throws SAXException { public void startElement(final String uri, final String name, final String tag, final Attributes atts) throws SAXException {
if ("record".equals(tag)) { if ("record".equals(tag)) {
recordCounter++; recordCounter++;

Loading…
Cancel
Save