From 651d057e93970bda213558b22febb37b01f6902f Mon Sep 17 00:00:00 2001
From: reger
Date: Sun, 23 Mar 2014 00:40:36 +0100
Subject: [PATCH] surrogate import translate dc:language 3-char codes

OAI records often use 3-char language codes,
start converting some 3-char lang's to the internal ISO639-1 2-char code
---
 source/net/yacy/document/content/DCEntry.java | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/source/net/yacy/document/content/DCEntry.java b/source/net/yacy/document/content/DCEntry.java
index 96b6e11fa..731b9181a 100644
--- a/source/net/yacy/document/content/DCEntry.java
+++ b/source/net/yacy/document/content/DCEntry.java
@@ -199,7 +199,23 @@ public class DCEntry extends MultiMapSolrParams {
     //modified by copperdust; Ukraine, 2012
     public String getLanguage() {//final language computation
         String l = this.get("dc:language");//from document metainfo
-        if (l == null) l = getIdentifier(true).language();//from symbolic frequency table
+        // OAI records often carry 3-char language codes (ISO 639-2, bibliographic or terminologic form);
+        // map the known ones, ignoring case, to the ISO 639-1 2-char code. TODO: complete the ISO639-2/ISO639-3 list
+        if (l != null && l.length() == 3) {
+            if (l.equalsIgnoreCase("ger") || l.equalsIgnoreCase("deu")) l = "de";
+            else if (l.equalsIgnoreCase("eng")) l = "en";
+            else if (l.equalsIgnoreCase("rus")) l = "ru";
+            else if (l.equalsIgnoreCase("jpn")) l = "ja";
+            else if (l.equalsIgnoreCase("ita")) l = "it";
+            else if (l.equalsIgnoreCase("por")) l = "pt";
+            else if (l.equalsIgnoreCase("spa")) l = "es";
+            else if (l.equalsIgnoreCase("chi") || l.equalsIgnoreCase("zho")) l = "zh";
+            else if (l.equalsIgnoreCase("fre") || l.equalsIgnoreCase("fra")) l = "fr";
+            else if (l.equalsIgnoreCase("eus") || l.equalsIgnoreCase("baq")) l = "eu";
+            else if (l.equalsIgnoreCase("gre") || l.equalsIgnoreCase("ell")) l = "el";
+            return l; // a 3-char code without a mapping is passed through unchanged
+        }
+        if (l == null) l = getIdentifier(true).language(); // determine from identifier-url.TLD
         if (l == null) return this.get("language");//from TLD
         return l;
     }