From 5c9dcc269d4dab8e185098b6e428e7dbb2cc2c55 Mon Sep 17 00:00:00 2001 From: reger Date: Tue, 4 Mar 2014 03:08:37 +0100 Subject: [PATCH] improve OAI-PMH import identifier recognition - find best fitting identifier (url) by checking all given dc:identifier in record (many entries provide several identifiers); as identifier is currently a multivalued field, use "getParams" in preference to splitting the 1st string by ";" - add resolve DOI:... identifier via http://dx.doi.org/ --- htroot/IndexImportOAIPMHList_p.java | 2 +- source/net/yacy/document/content/DCEntry.java | 47 ++++++++++++++----- 2 files changed, 35 insertions(+), 14 deletions(-) diff --git a/htroot/IndexImportOAIPMHList_p.java b/htroot/IndexImportOAIPMHList_p.java index 68e1cf997..b6e0593d6 100644 --- a/htroot/IndexImportOAIPMHList_p.java +++ b/htroot/IndexImportOAIPMHList_p.java @@ -55,7 +55,7 @@ public class IndexImportOAIPMHList_p { prop.put("source_table_" + count + "_dark", (dark) ? "1" : "0"); prop.put("source_table_" + count + "_count", count); prop.put("source_table_" + count + "_source", CharacterCoding.unicode2html(root, true)); - prop.put("source_table_" + count + "_loadurl", "" + CharacterCoding.unicode2html(root, true) + ""); + prop.put("source_table_" + count + "_loadurl", "" + CharacterCoding.unicode2html(root, true) + ""); dark = !dark; count++; } diff --git a/source/net/yacy/document/content/DCEntry.java b/source/net/yacy/document/content/DCEntry.java index 42d57d33f..f513f533c 100644 --- a/source/net/yacy/document/content/DCEntry.java +++ b/source/net/yacy/document/content/DCEntry.java @@ -115,12 +115,22 @@ public class DCEntry extends MultiMapSolrParams { } } + /** + * get Identifier (url) (so far only used for surrogate processing) + * @param useRelationAsAlternative true = take relation if no identifier resolves to url + * @return + */ public DigestURL getIdentifier(boolean useRelationAsAlternative) { + // identifier may be included multiple times (with all kinds of syntax - example is
from one record) + // Astronomy and Astrophysics, 539, A99, 2012 + // http://hdl.handle.net/2104/8302 + // 10.1051/0004-6361/201117940 String u = this.get("url"); - if (u == null) u = this.get("dc:identifier"); - if (u == null) return useRelationAsAlternative ? getRelation() : null; - String[] urls = u.split(";"); - if (urls.length > 1) { + String[] urls = null; + if (u == null) urls = this.getParams("dc:identifier"); + if (urls == null) return useRelationAsAlternative ? getRelation() : null; + // String[] urls = u.split(";"); // splitting may not succeed (see above) + if (urls.length > 0) { // check best also with 1 in case it's not http urn // select one that fits u = bestU(urls); } @@ -154,16 +164,27 @@ public class DCEntry extends MultiMapSolrParams { } private static String bestU(String[] urls) { + if (urls.length > 1) { // with only one ... no choice + for (String uu: urls) { + if (uu.startsWith("http://") && (uu.endsWith(".html") || uu.endsWith(".htm") || uu.endsWith(".pdf") || uu.endsWith(".doc") || uu.endsWith(".rss") || uu.endsWith(".xml"))) return uu; + } + for (String uu: urls) { + if (uu.startsWith("http://")) return uu; + } + for (String uu: urls) { + if (uu.startsWith("https://")) return uu; + } + for (String uu: urls) { + if (uu.startsWith("ftp://")) return uu; + } + } // but check urn:/doi: resolve for (String uu: urls) { - if (uu.startsWith("http://") && (uu.endsWith(".html") || uu.endsWith(".htm") || uu.endsWith(".pdf") || uu.endsWith(".doc") || uu.endsWith(".rss") || uu.endsWith(".xml"))) return uu; - } - for (String uu: urls) { - if (uu.startsWith("http://")) return uu; - } - for (String uu: urls) { - if (uu.startsWith("ftp://")) return uu; - } - for (String uu: urls) { + //doi identifier can be resolved through dx.doi.org: + // http://dx.doi.org/doi:10.12775/SIT.2010.010 + //or http://dx.doi.org/10.12775/SIT.2010.010 + if (uu.startsWith("DOI:") || uu.startsWith("doi:")) // saw it upper & lower-case + return "http://dx.doi.org/" + uu; + //urn
identifier koennen ueber den resolver der d-nb aufgeloest werden: //http://nbn-resolving.de/urn:nbn:de:bsz:960-opus-1860 if (uu.startsWith("urn:")) return "http://nbn-resolving.de/" + uu;