From 18c7563dbe9225d43ad452fb15df1c56b173828a Mon Sep 17 00:00:00 2001 From: reger Date: Sun, 5 Mar 2017 02:26:10 +0100 Subject: [PATCH] Extend DCEntry.getLanguage convert to ISO639-1 codes for more languages by using icu.ULocale for languages not already covered (ICU normalizes to ISO639-1 2 char codes). Add test class Use DublinCore vocabulary declarations in DCEntry and SurrogateReader for easier usage debugging, Init SurrogateReader.inputSource on first use. --- source/net/yacy/document/content/DCEntry.java | 58 ++++++++------- .../document/content/SurrogateReader.java | 57 +++++++-------- .../yacy/document/content/DCEntryTest.java | 72 +++++++++++++++++++ 3 files changed, 130 insertions(+), 57 deletions(-) create mode 100644 test/java/net/yacy/document/content/DCEntryTest.java diff --git a/source/net/yacy/document/content/DCEntry.java b/source/net/yacy/document/content/DCEntry.java index 2b2142a3b..17e364497 100644 --- a/source/net/yacy/document/content/DCEntry.java +++ b/source/net/yacy/document/content/DCEntry.java @@ -25,6 +25,8 @@ package net.yacy.document.content; +import com.ibm.icu.util.ULocale; + import java.io.IOException; import java.io.OutputStreamWriter; import java.net.MalformedURLException; @@ -42,6 +44,8 @@ import org.apache.solr.common.params.MultiMapSolrParams; import net.yacy.cora.date.ISO8601Formatter; import net.yacy.cora.document.id.DigestURL; +import net.yacy.cora.lod.vocabulary.DublinCore; +import net.yacy.cora.lod.vocabulary.Geo; import net.yacy.cora.util.CommonPattern; import net.yacy.cora.util.ConcurrentLog; import net.yacy.document.Document; @@ -74,13 +78,13 @@ public class DCEntry extends MultiMapSolrParams { double lon ) { super(new TreeMap((Collator) insensitiveCollator.clone())); - this.getMap().put("dc:identifier", new String[]{url.toNormalform(true)}); - this.getMap().put("dc:date", new String[]{ISO8601Formatter.FORMATTER.format(date)}); - this.getMap().put("dc:title", new String[]{title}); - this.getMap().put("dc:creator", new 
String[]{author}); - this.getMap().put("dc:description", new String[]{body}); - this.getMap().put("geo:lat", new String[]{Double.toString(lat)}); - this.getMap().put("geo:long", new String[]{Double.toString(lon)}); + this.getMap().put(DublinCore.Identifier.getURIref(), new String[]{url.toNormalform(true)}); + this.getMap().put(DublinCore.Date.getURIref(), new String[]{ISO8601Formatter.FORMATTER.format(date)}); + this.getMap().put(DublinCore.Title.getURIref(), new String[]{title}); + this.getMap().put(DublinCore.Creator.getURIref(), new String[]{author}); + this.getMap().put(DublinCore.Description.getURIref(), new String[]{body}); + this.getMap().put(Geo.Lat.getURIref(), new String[]{Double.toString(lat)}); + this.getMap().put(Geo.Long.getURIref(), new String[]{Double.toString(lon)}); } /* @@ -105,7 +109,7 @@ public class DCEntry extends MultiMapSolrParams { public Date getDate() { String d = this.get("docdatetime"); if (d == null) d = this.get("date"); - if (d == null) d = this.get("dc:date"); + if (d == null) d = this.get(DublinCore.Date.getURIref()); if (d == null) d = this.get("last-modified"); if (d == null) return null; if (d.isEmpty()) return null; @@ -133,7 +137,7 @@ public class DCEntry extends MultiMapSolrParams { if (u == null) u = this.get("sku"); if (u == null) { - final String[] urls = this.getParams("dc:identifier"); + final String[] urls = this.getParams(DublinCore.Identifier.getURIref()); if (urls == null) { return useRelationAsAlternative ? 
getRelation() : null; } @@ -157,7 +161,7 @@ public class DCEntry extends MultiMapSolrParams { } public DigestURL getRelation() { - String u = this.get("dc:relation"); + String u = this.get(DublinCore.Relation.getURIref()); if (u == null) return null; String[] urls = CommonPattern.SEMICOLON.split(u); if (urls.length > 1) { @@ -206,9 +210,8 @@ public class DCEntry extends MultiMapSolrParams { //modified by copperdust; Ukraine, 2012 public String getLanguage() {//final language computation - String l = this.get("dc:language");//from document metainfo + String l = this.get(DublinCore.Language.getURIref());//from document metainfo // OAI uses often 3-char languages (ISO639-2) convert to ISO639-1 2-char code) - // TODO: implement complete list of ISO639-2/ISO639-3 language codes if (l != null && l.length() == 3) { if (l.startsWith("ger") || l.startsWith("deu")) l = "de"; else if (l.startsWith("eng")) l = "en"; @@ -223,7 +226,12 @@ public class DCEntry extends MultiMapSolrParams { else if (l.startsWith("fre") || l.startsWith("fra")) l = "fr"; else if (l.startsWith("eus") || l.startsWith("baq")) l = "eu"; else if (l.startsWith("gre") || l.startsWith("ell")) l = "el"; - + else { + // icu.ULocale performs a normalization (of ISO639-2/T) to ISO639-1 2-char language code + // (fyi: ISO639-2 allows (T)erminology and (B)ibliographic (e.g. 
chi=zh and zho=zh), ICU handles (T) ) + ULocale loc = new ULocale(l); + l = loc.getLanguage(); + } return l; } if (l == null) l = getIdentifier(true).language(); // determine from identifier-url.TLD @@ -232,39 +240,39 @@ public class DCEntry extends MultiMapSolrParams { } public String getType() { - String t = this.get("dc:type"); + String t = this.get(DublinCore.Type.getURIref()); if (t == null) return ""; return t; } public String getFormat() { - String t = this.get("dc:format"); + String t = this.get(DublinCore.Format.getURIref()); if (t == null) return ""; return t; } public String getSource() { - String t = this.get("dc:source"); + String t = this.get(DublinCore.Source.getURIref()); if (t == null) return ""; return t; } public String getRights() { - String t = this.get("dc:rights"); + String t = this.get(DublinCore.Rights.getURIref()); if (t == null) return ""; return t; } public String getTitle() { String t = this.get("title"); - if (t == null) t = this.get("dc:title"); + if (t == null) t = this.get(DublinCore.Title.getURIref()); t = stripCDATA(t); if (t == null) return ""; return t; } public String getPublisher() { - String t = this.get("dc:publisher"); + String t = this.get(DublinCore.Publisher.getURIref()); t = stripCDATA(t); if (t == null) return ""; return t; @@ -272,14 +280,14 @@ public class DCEntry extends MultiMapSolrParams { public String getCreator() { String t = this.get("author"); - if (t == null) t = this.get("dc:creator"); + if (t == null) t = this.get(DublinCore.Creator.getURIref()); t = stripCDATA(t); if (t == null) return ""; return t; } public List getDescriptions() { - String[] t = this.getParams("dc:description"); + String[] t = this.getParams(DublinCore.Description.getURIref()); List descriptions = new ArrayList(); if (t == null) return descriptions; for (String s: t) descriptions.add(stripCDATA(s)); @@ -297,7 +305,7 @@ public class DCEntry extends MultiMapSolrParams { t = stripCDATA(t); return CommonPattern.SEMICOLON.split(t); } - tx = 
this.getParams("dc:subject"); + tx = this.getParams(DublinCore.Subject.getURIref()); if (tx != null) { for (int i = 0; i < tx.length; i++) { @@ -308,15 +316,15 @@ public class DCEntry extends MultiMapSolrParams { } public double getLon() { - String t = this.get("geo:long"); - if (t == null) t = this.get("geo:lon"); + String t = this.get(Geo.Long.getURIref()); + if (t == null) t = this.get("geo:lon"); // try geo:long with possible typing error t = stripCDATA(t); if (t == null) return 0.0d; return Double.parseDouble(t); } public double getLat() { - String t = this.get("geo:lat"); + String t = this.get(Geo.Lat.getURIref()); t = stripCDATA(t); if (t == null) return 0.0d; return Double.parseDouble(t); diff --git a/source/net/yacy/document/content/SurrogateReader.java b/source/net/yacy/document/content/SurrogateReader.java index 8caf267ec..dc818ea70 100644 --- a/source/net/yacy/document/content/SurrogateReader.java +++ b/source/net/yacy/document/content/SurrogateReader.java @@ -29,10 +29,8 @@ import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.PushbackInputStream; -import java.io.Reader; import java.io.StringReader; import java.net.MalformedURLException; -import java.nio.charset.Charset; import java.nio.charset.StandardCharsets; import java.util.Map; import java.util.concurrent.ArrayBlockingQueue; @@ -53,6 +51,8 @@ import org.xml.sax.SAXParseException; import org.xml.sax.helpers.DefaultHandler; import net.yacy.cora.document.id.DigestURL; +import net.yacy.cora.lod.vocabulary.DublinCore; +import net.yacy.cora.lod.vocabulary.Geo; import net.yacy.cora.util.ConcurrentLog; import net.yacy.crawler.CrawlStacker; import net.yacy.search.schema.CollectionConfiguration; @@ -65,9 +65,9 @@ public class SurrogateReader extends DefaultHandler implements Runnable { "surrogates"; public final static String SURROGATES_MAIN_ELEMENT_OPEN = "<" + SURROGATES_MAIN_ELEMENT_NAME + - " xmlns:dc=\"http://purl.org/dc/elements/1.1/\"" + + " 
xmlns:dc=\"" + DublinCore.NAMESPACE + "\"" + " xmlns:yacy=\"http://yacy.net/\"" + - " xmlns:geo=\"http://www.w3.org/2003/01/geo/wgs84_pos#\">"; + " xmlns:geo=\"" + Geo.NAMESPACE + "\">"; public final static String SURROGATES_MAIN_ELEMENT_CLOSE = ""; public final static SolrInputDocument POISON_DOCUMENT = new SolrInputDocument(); @@ -80,12 +80,10 @@ public class SurrogateReader extends DefaultHandler implements Runnable { /** Surrogates are either SolrInputDocument or DCEntry instances*/ private final BlockingQueue surrogates; private SAXParser saxParser; - private final InputSource inputSource; private final PushbackInputStream inputStream; private final CrawlStacker crawlStacker; private final CollectionConfiguration configuration; private final int concurrency; - private Charset charset = StandardCharsets.UTF_8; private static final ThreadLocal tlSax = new ThreadLocal(); private static SAXParser getParser() throws SAXException { @@ -114,10 +112,6 @@ public class SurrogateReader extends DefaultHandler implements Runnable { this.dcEntry = null; this.elementName = null; this.surrogates = new ArrayBlockingQueue<>(queueSize); - - Reader reader = new BufferedReader(new InputStreamReader(stream, this.charset)); - this.inputSource = new InputSource(reader); - this.inputSource.setEncoding(this.charset.name()); this.inputStream = stream; try { @@ -132,8 +126,8 @@ public class SurrogateReader extends DefaultHandler implements Runnable { public void run() { // test the syntax of the stream by reading parts of the beginning try { + BufferedReader br = new BufferedReader(new InputStreamReader(this.inputStream, StandardCharsets.UTF_8)); if (isSolrDump()) { - BufferedReader br = new BufferedReader(new InputStreamReader(this.inputStream, this.charset)); String line; while ((line = br.readLine()) != null) { if (!line.startsWith("")) continue; @@ -159,7 +153,9 @@ public class SurrogateReader extends DefaultHandler implements Runnable { } } } else { - 
this.saxParser.parse(this.inputSource, this); + final InputSource inputSource = new InputSource(br); + inputSource.setEncoding(StandardCharsets.UTF_8.name()); + this.saxParser.parse(inputSource, this); } } catch (final SAXParseException e) { ConcurrentLog.logException(e); @@ -169,17 +165,17 @@ public class SurrogateReader extends DefaultHandler implements Runnable { ConcurrentLog.logException(e); } finally { for (int i = 0; i < this.concurrency; i++) { - try { - this.surrogates.put(POISON_DOCUMENT); - } catch (final InterruptedException e1) { - ConcurrentLog.logException(e1); - } + try { + this.surrogates.put(POISON_DOCUMENT); + } catch (final InterruptedException e1) { + ConcurrentLog.logException(e1); + } + } + try { + this.inputStream.close(); + } catch (final IOException e) { + ConcurrentLog.logException(e); } - try { - this.inputStream.close(); - } catch (final IOException e) { - ConcurrentLog.logException(e); - } } } @@ -191,13 +187,13 @@ public class SurrogateReader extends DefaultHandler implements Runnable { byte[] b = new byte[100]; int nbRead = -1; try { - nbRead = this.inputStream.read(b); - if(nbRead > 0) { - String s = new String(b, 0, nbRead, this.charset); - if ((s.contains("") && s.contains("")) || s.startsWith("")) { - res = true; - } - } + nbRead = this.inputStream.read(b); + if (nbRead > 0) { + String s = new String(b, 0, nbRead, StandardCharsets.UTF_8); + if ((s.contains("") && s.contains("")) || s.startsWith("")) { + res = true; + } + } } catch (IOException e) { ConcurrentLog.logException(e); } finally { @@ -236,7 +232,6 @@ public class SurrogateReader extends DefaultHandler implements Runnable { if (tag == null) return; tag = tag.toLowerCase(); if ("record".equals(tag) || "document".equals(tag) || "doc".equals(tag)) { - //System.out.println("A Title: " + this.surrogate.title()); try { // check if url is in accepted domain final String urlRejectReason = this.crawlStacker.urlInAcceptedDomain(this.dcEntry.getIdentifier(true)); @@ -247,7 +242,6 
@@ public class SurrogateReader extends DefaultHandler implements Runnable { } catch (final InterruptedException e) { ConcurrentLog.logException(e); } finally { - //System.out.println("B Title: " + this.surrogate.title()); this.dcEntry = null; this.buffer.setLength(0); this.parsingValue = false; @@ -263,7 +257,6 @@ public class SurrogateReader extends DefaultHandler implements Runnable { this.buffer.setLength(0); this.parsingValue = false; } else if ("value".equals(tag)) { - //System.out.println("BUFFER-SIZE=" + buffer.length()); final String value = buffer.toString().trim(); if (this.elementName != null) { this.dcEntry.getMap().put(this.elementName, new String[]{value}); diff --git a/test/java/net/yacy/document/content/DCEntryTest.java b/test/java/net/yacy/document/content/DCEntryTest.java new file mode 100644 index 000000000..ef78c6e8e --- /dev/null +++ b/test/java/net/yacy/document/content/DCEntryTest.java @@ -0,0 +1,72 @@ +/** + * DCEntryTest + * part of YaCy + * Copyright 2017 by reger24; https://github.com/reger24 + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt + * If not, see <http://www.gnu.org/licenses/>.
+ */ +package net.yacy.document.content; + +import java.util.HashMap; +import java.util.Map; +import net.yacy.cora.lod.vocabulary.DublinCore; +import org.junit.Test; +import static org.junit.Assert.*; + + +public class DCEntryTest { + + /** + * Test of getLanguage method, of class DCEntry for ISO639-2 3-char language + * codes as input to test conversion of 3-char codes to the internally used ISO639-1 + * 2-char code. + */ + @Test + public void testGetLanguage_ISO639_2() { + Map testmap = new HashMap(); + + // key=ISO639-2 (3 char language code), value= corresponding ISO639-1 (2 char language code) + testmap.put("ger", "de"); testmap.put("deu", "de"); + testmap.put("eng", "en"); + testmap.put("rus", "ru"); + testmap.put("jpn", "ja"); + testmap.put("ita", "it"); + testmap.put("por", "pt"); + testmap.put("pol", "pl"); + testmap.put("spa", "es"); + testmap.put("ukr", "uk"); + testmap.put("chi", "zh"); testmap.put("zho", "zh"); + testmap.put("fre", "fr"); testmap.put("fra", "fr"); + testmap.put("eus", "eu"); testmap.put("baq", "eu"); + testmap.put("gre", "el"); testmap.put("ell", "el"); + // some additional languages to test icu.ULocale of .getLanguage() + testmap.put("ara", "ar"); + testmap.put("ces", "cs"); + testmap.put("nld", "nl"); + testmap.put("tur", "tr"); + + for (String testlang : testmap.keySet()) { + DCEntry dce = new DCEntry(); + // set a 3-char ISO639-2/ISO639-3 + dce.getMap().put(DublinCore.Language.getURIref(), new String[]{testlang}); + + String expectedresult = testmap.get(testlang); + String lng = dce.getLanguage(); + + assertEquals("convert language code=" + testlang, expectedresult, lng); + } + } + +}