diff --git a/htroot/IndexImportOAIPMHList_p.java b/htroot/IndexImportOAIPMHList_p.java index b6e0593d6..6c8859540 100644 --- a/htroot/IndexImportOAIPMHList_p.java +++ b/htroot/IndexImportOAIPMHList_p.java @@ -47,7 +47,7 @@ public class IndexImportOAIPMHList_p { if (post != null && post.containsKey("source")) { ClientIdentification.Agent agent = ClientIdentification.getAgent(post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName)); - final Set oaiRoots = OAIListFriendsLoader.getListFriends(sb.loader, agent).keySet(); + final Set oaiRoots = new OAIListFriendsLoader().getListFriends(sb.loader, agent).keySet(); boolean dark = false; int count = 0; diff --git a/source/net/yacy/document/importer/OAIListFriendsLoader.java b/source/net/yacy/document/importer/OAIListFriendsLoader.java index d492f9065..730189338 100644 --- a/source/net/yacy/document/importer/OAIListFriendsLoader.java +++ b/source/net/yacy/document/importer/OAIListFriendsLoader.java @@ -83,7 +83,7 @@ public class OAIListFriendsLoader implements Serializable { } - public static Map getListFriends(final LoaderDispatcher loader, final ClientIdentification.Agent agent) { + public Map getListFriends(final LoaderDispatcher loader, final ClientIdentification.Agent agent) { final Map map = new TreeMap(); Map m; for (final Map.Entry oaiFriend: listFriends.entrySet()) try { @@ -105,7 +105,7 @@ public class OAIListFriendsLoader implements Serializable { } private static final ThreadLocal tlSax = new ThreadLocal(); - private static SAXParser getParser() throws SAXException { + private SAXParser getParser() throws SAXException { SAXParser parser = tlSax.get(); if (parser == null) { try { @@ -119,7 +119,7 @@ public class OAIListFriendsLoader implements Serializable { } // get a resumption token using a SAX xml parser from am input stream - public static class Parser extends DefaultHandler { + private class Parser extends DefaultHandler { // class variables private final StringBuilder buffer; diff --git a/source/net/yacy/document/importer/OAIPMHImporter.java b/source/net/yacy/document/importer/OAIPMHImporter.java index 222ea279e..74fe2b269 100644 --- a/source/net/yacy/document/importer/OAIPMHImporter.java +++ b/source/net/yacy/document/importer/OAIPMHImporter.java @@ -29,7 +29,6 @@ import java.text.ParseException; import java.util.Date; import java.util.HashMap; import java.util.Map; -import java.util.Set; import java.util.concurrent.ConcurrentHashMap; import net.yacy.cora.date.GenericFormatter; @@ -46,7 +45,7 @@ import net.yacy.search.Switchboard; public class OAIPMHImporter extends Thread implements Importer, Comparable { private static int importerCounter = Integer.MAX_VALUE; - private static Object N = new Object(); + private static final Object N = new Object(); public static ConcurrentHashMap startedJobs = new ConcurrentHashMap(); public static ConcurrentHashMap runningJobs = new ConcurrentHashMap(); @@ -137,16 +136,16 @@ public class OAIPMHImporter extends Thread implements Importer, Comparable getUnloadedOAIServer( - LoaderDispatcher loader, - File surrogatesIn, - File surrogatesOut, - long staleLimit, - ClientIdentification.Agent agent) { - Set plainList = OAIListFriendsLoader.getListFriends(loader, agent).keySet(); - Map loaded = getLoadedOAIServer(surrogatesIn, surrogatesOut); - long limit = System.currentTimeMillis() - staleLimit; - for (Map.Entry a: loaded.entrySet()) { - if (a.getValue().getTime() > limit) plainList.remove(a.getKey()); - } - return plainList; - } - /** * get a map for already loaded oai-pmh servers and their latest access date * @param surrogatesIn diff --git a/source/net/yacy/document/importer/OAIPMHLoader.java b/source/net/yacy/document/importer/OAIPMHLoader.java index fdda7dd30..6e4e77d4b 100644 --- a/source/net/yacy/document/importer/OAIPMHLoader.java +++ b/source/net/yacy/document/importer/OAIPMHLoader.java @@ -82,61 +82,6 @@ public class OAIPMHLoader { public String source() { return this.source.toNormalform(true); } - - public static StringBuilder escape(final String s) { - final int len = s.length(); - final StringBuilder sbuf = new StringBuilder(len + 10); - for (int i = 0; i < len; i++) { - final int ch = s.charAt(i); - if ('A' <= ch && ch <= 'Z') { // 'A'..'Z' - sbuf.append((char)ch); - } else if ('a' <= ch && ch <= 'z') { // 'a'..'z' - sbuf.append((char)ch); - } else if ('0' <= ch && ch <= '9') { // '0'..'9' - sbuf.append((char)ch); - } else if (ch == ' ') { // space - sbuf.append("%20"); - } else if (ch == '&' || ch == ':' // unreserved - || ch == '-' || ch == '_' - || ch == '.' || ch == '!' - || ch == '~' || ch == '*' - || ch == '\'' || ch == '(' - || ch == ')' || ch == ';') { - sbuf.append((char)ch); - } - } - return sbuf; - } - - public static String unescape(final String s) { - final int l = s.length(); - final StringBuilder sbuf = new StringBuilder(l); - int ch = -1; - int b; - for (int i = 0; i < l; i++) { - /* Get next byte b from URL segment s */ - switch (ch = s.charAt(i)) { - case '%': - if (i + 2 < l) { - ch = s.charAt(++i); - final int hb = (Character.isDigit ((char) ch) ? ch - '0' : 10 + Character.toLowerCase((char) ch) - 'a') & 0xF; - ch = s.charAt(++i); - final int lb = (Character.isDigit ((char) ch) ? ch - '0' : 10 + Character.toLowerCase ((char) ch) - 'a') & 0xF; - b = (hb << 4) | lb; - } else { - b = ch; - } - break; - case '+': - b = ' '; - break; - default: - b = ch; - } - sbuf.append(b); - } - return sbuf.toString(); - } } /* diff --git a/source/net/yacy/document/importer/ResumptionToken.java b/source/net/yacy/document/importer/ResumptionToken.java index da15ecca4..785c12d26 100644 --- a/source/net/yacy/document/importer/ResumptionToken.java +++ b/source/net/yacy/document/importer/ResumptionToken.java @@ -55,7 +55,7 @@ public class ResumptionToken extends TreeMap { insensitiveCollator.setDecomposition(Collator.NO_DECOMPOSITION); } - int recordCounter; + private int recordCounter; private final DigestURL source; @@ -66,40 +66,6 @@ public class ResumptionToken extends TreeMap { new Parser(b); } - /* - public ResumptionToken( - DigestURI source, - Date expirationDate, - int completeListSize, - int cursor, - String token - ) { - super((Collator) insensitiveCollator.clone()); - this.source = source; - this.recordCounter = 0; - this.put("expirationDate", DateFormatter.formatISO8601(expirationDate)); - this.put("completeListSize", Integer.toString(completeListSize)); - this.put("cursor", Integer.toString(cursor)); - this.put("token", token); - } - - public ResumptionToken( - DigestURI source, - String expirationDate, - int completeListSize, - int cursor, - String token - ) { - super((Collator) insensitiveCollator.clone()); - this.source = source; - this.recordCounter = 0; - this.put("expirationDate", expirationDate); - this.put("completeListSize", Integer.toString(completeListSize)); - this.put("cursor", Integer.toString(cursor)); - this.put("token", token); - } - */ - /** * truncate the given url at the '?' * @param url @@ -150,7 +116,7 @@ public class ResumptionToken extends TreeMap { return new DigestURL(u); } - public static StringBuilder escape(final String s) { + private StringBuilder escape(final String s) { final int len = s.length(); final StringBuilder sbuf = new StringBuilder(len + 10); for (int i = 0; i < len; i++) { @@ -260,7 +226,7 @@ public class ResumptionToken extends TreeMap { } return parser; } - + // get a resumption token using a SAX xml parser from am input stream private class Parser extends DefaultHandler { @@ -268,27 +234,49 @@ public class ResumptionToken extends TreeMap { private final StringBuilder buffer; private boolean parsingValue; private SAXParser saxParser; - private final InputStream stream; private Attributes atts; public Parser(final byte[] b) throws IOException { this.buffer = new StringBuilder(); this.parsingValue = false; this.atts = null; - this.stream = new ByteArrayInputStream(b); + InputStream stream = new ByteArrayInputStream(b); try { this.saxParser = getParser(); - this.saxParser.parse(this.stream, this); + this.saxParser.parse(stream, this); } catch (final SAXException e) { - ConcurrentLog.logException(e); - ConcurrentLog.warn("ResumptionToken", "token was not parsed (1):\n" + UTF8.String(b)); + // some received xml are not valid, common error is '&' sign in xml text + // causing a fatal sax error before entry is reached + // this patch tries to extract the only to allow loading + // the next oai-pmh xml. + + // extract and parse only resumptionToken line + String in = UTF8.String(b); + final int istart = in.lastIndexOf("= 0) { + final int iend = in.indexOf("= 0) { + in = in.substring(istart, iend) + ""; + stream = new ByteArrayInputStream(UTF8.getBytes(in)); + try { + this.saxParser.parse(stream, this); + ResumptionToken.this.recordCounter++; + } catch (final SAXException e2) { + ConcurrentLog.warn("ResumptionToken", "token was not parsed (invalid resumption token): " + in); + } + ConcurrentLog.warn("ResumptionToken", "input file with error: " + e.getMessage()); + } + } else { + ConcurrentLog.logException(e); + ConcurrentLog.warn("ResumptionToken", "token was not parsed (parser error):\n" + UTF8.String(b)); + } } catch (final IOException e) { ConcurrentLog.logException(e); - ConcurrentLog.warn("ResumptionToken", "token was not parsed (2):\n" + UTF8.String(b)); + ConcurrentLog.warn("ResumptionToken", "token was not parsed (IO error)"); throw new IOException(e.getMessage()); } finally { try { - this.stream.close(); + stream.close(); } catch (final IOException e) { ConcurrentLog.logException(e); }