diff --git a/htroot/Ranking_p.java b/htroot/Ranking_p.java index 8f173d367..3768cefcf 100644 --- a/htroot/Ranking_p.java +++ b/htroot/Ranking_p.java @@ -75,6 +75,7 @@ public class Ranking_p { rankingParameters.put(plasmaSearchRankingProfile.WORDSINTEXT, "Words In Text"); rankingParameters.put(plasmaSearchRankingProfile.WORDSINTITLE, "Words In Title"); rankingParameters.put(plasmaSearchRankingProfile.YBR, "YaCy Block Rank"); + rankingParameters.put(plasmaSearchRankingProfile.LANGUAGE, "Preferred Language"); } private static serverObjects defaultValues() { diff --git a/htroot/xml/util/getpageinfo_p.java b/htroot/xml/util/getpageinfo_p.java index ae5dfebdf..70118e045 100644 --- a/htroot/xml/util/getpageinfo_p.java +++ b/htroot/xml/util/getpageinfo_p.java @@ -29,6 +29,7 @@ package xml.util; import java.io.IOException; import java.io.Writer; import java.net.MalformedURLException; +import java.util.Set; import de.anomic.crawler.HTTPLoader; import de.anomic.htmlFilter.htmlFilterContentScraper; @@ -103,8 +104,9 @@ public class getpageinfo_p { prop.put("tags", count); // put description prop.putHTML("desc", scraper.getDescription(), true); - // put language - prop.putHTML("lang", scraper.getContentLanguages()[0], true); + // put language + Set languages = scraper.getContentLanguages(); + prop.putHTML("lang", (languages == null) ? "unknown" : languages.iterator().next(), true); } catch (final MalformedURLException e) { /* ignore this */ } catch (final IOException e) { /* ignore this */ diff --git a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java index ea3a9ed16..d5f7bd89e 100644 --- a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java +++ b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java @@ -50,6 +50,7 @@ import de.anomic.http.httpRequestHeader; import de.anomic.server.serverCharBuffer; import de.anomic.server.serverFileUtils; import de.anomic.yacy.yacyURL; +import de.anomic.tools.iso639; public class htmlFilterContentScraper extends htmlFilterAbstractScraper implements htmlFilterScraper { @@ -381,11 +382,21 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen return s; } - public String[] getContentLanguages() { + public HashSet getContentLanguages() { String s = metas.get("content-language"); if (s == null) s = metas.get("dc.language"); - if (s == null) s = ""; - return s.split(" |,"); + if (s == null) return null; + HashSet hs = new HashSet(); + String[] cl = s.split(" |,"); + int p; + for (int i = 0; i < cl.length; i++) { + cl[i] = cl[i].toLowerCase(); + p = cl[i].indexOf('-'); + if (p > 0) cl[i] = cl[i].substring(0, p); + if (iso639.exists(cl[i])) hs.add(cl[i]); + } + if (hs.size() == 0) return null; + return hs; } public String[] getKeywords() { diff --git a/source/de/anomic/index/indexContainerHeap.java b/source/de/anomic/index/indexContainerHeap.java index 75f74fe89..2d7ba8f1f 100755 --- a/source/de/anomic/index/indexContainerHeap.java +++ b/source/de/anomic/index/indexContainerHeap.java @@ -118,6 +118,7 @@ public final class indexContainerHeap { int urlCount = 0; synchronized (cache) { for (final indexContainer container : new heapFileEntries(heapFile, this.payloadrow)) { + // TODO: in this loop a lot of memory may be allocated. A check if the memory gets low is necessary. But what do when the memory is low? if (container == null) break; cache.put(container.getWordHash(), container); urlCount += container.size(); @@ -252,6 +253,10 @@ public final class indexContainerHeap { } } + /** + * return an index container + * because they may get very large, it is wise to deallocate some memory before calling next() + */ public indexContainer next() { final indexContainer n = this.nextContainer; this.nextContainer = next0(); diff --git a/source/de/anomic/plasma/parser/doc/docParser.java b/source/de/anomic/plasma/parser/doc/docParser.java index ef1590977..97b2146ce 100644 --- a/source/de/anomic/plasma/parser/doc/docParser.java +++ b/source/de/anomic/plasma/parser/doc/docParser.java @@ -70,6 +70,7 @@ public class docParser extends AbstractParser implements Parser { mimeType, "UTF-8", null, + null, ((contents.length() > 80)? contents.substring(0, 80):contents.trim()). replaceAll("\r\n"," "). replaceAll("\n"," "). diff --git a/source/de/anomic/plasma/parser/odt/odtParser.java b/source/de/anomic/plasma/parser/odt/odtParser.java index 06d43d635..6aa609f10 100644 --- a/source/de/anomic/plasma/parser/odt/odtParser.java +++ b/source/de/anomic/plasma/parser/odt/odtParser.java @@ -32,7 +32,9 @@ import java.io.OutputStreamWriter; import java.io.Writer; import java.nio.charset.Charset; import java.util.Enumeration; +import java.util.HashSet; import java.util.Hashtable; +import java.util.Set; import java.util.zip.ZipEntry; import java.util.zip.ZipFile; @@ -89,6 +91,7 @@ public class odtParser extends AbstractParser implements Parser { String docShortTitle = null; String docLongTitle = null; String docAuthor = null; + String docLanguage = null; // opening the file as zip file final ZipFile zipFile= new ZipFile(dest); @@ -134,9 +137,14 @@ public class odtParser extends AbstractParser implements Parser { docShortTitle = metaData.getTitle(); docLongTitle = metaData.getSubject(); docAuthor = metaData.getCreator(); + docLanguage = metaData.getLanguage(); } } + // make the languages set + Set languages = new HashSet(1); + if (docLanguage != null) languages.add(docLanguage); + // if there is no title availabe we generate one if (docLongTitle == null) { if (docShortTitle != null) { @@ -156,6 +164,7 @@ public class odtParser extends AbstractParser implements Parser { location, mimeType, "UTF-8", + languages, docKeywords, docLongTitle, docAuthor, @@ -169,6 +178,7 @@ public class odtParser extends AbstractParser implements Parser { location, mimeType, "UTF-8", + languages, docKeywords, docLongTitle, docAuthor, diff --git a/source/de/anomic/plasma/parser/pdf/pdfParser.java b/source/de/anomic/plasma/parser/pdf/pdfParser.java index 822522b1d..7996231eb 100644 --- a/source/de/anomic/plasma/parser/pdf/pdfParser.java +++ b/source/de/anomic/plasma/parser/pdf/pdfParser.java @@ -143,6 +143,7 @@ public class pdfParser extends AbstractParser implements Parser { location, mimeType, "UTF-8", + null, docKeywords, (docTitle == null) ? docSubject : docTitle, docAuthor, @@ -156,6 +157,7 @@ public class pdfParser extends AbstractParser implements Parser { location, mimeType, "UTF-8", + null, docKeywords, (docTitle == null) ? docSubject : docTitle, docAuthor, diff --git a/source/de/anomic/plasma/parser/ppt/pptParser.java b/source/de/anomic/plasma/parser/ppt/pptParser.java index 95a2baf7d..9256201fe 100644 --- a/source/de/anomic/plasma/parser/ppt/pptParser.java +++ b/source/de/anomic/plasma/parser/ppt/pptParser.java @@ -88,6 +88,7 @@ public class pptParser extends AbstractParser implements Parser { mimeType, "UTF-8", null, + null, ((contents.length() > 80) ? contents.substring(0, 80) : contents.trim()). replaceAll("\r\n"," "). replaceAll("\n"," "). diff --git a/source/de/anomic/plasma/parser/ps/psParser.java b/source/de/anomic/plasma/parser/ps/psParser.java index e078d8372..34c71ca62 100644 --- a/source/de/anomic/plasma/parser/ps/psParser.java +++ b/source/de/anomic/plasma/parser/ps/psParser.java @@ -117,6 +117,7 @@ public class psParser extends AbstractParser implements Parser { "UTF-8", null, null, + null, "", null, null, diff --git a/source/de/anomic/plasma/parser/rpm/rpmParser.java b/source/de/anomic/plasma/parser/rpm/rpmParser.java index 4f6b92e62..7667c0e87 100644 --- a/source/de/anomic/plasma/parser/rpm/rpmParser.java +++ b/source/de/anomic/plasma/parser/rpm/rpmParser.java @@ -134,6 +134,7 @@ public class rpmParser extends AbstractParser implements Parser { mimeType, "UTF-8", null, + null, summary, packager, null, diff --git a/source/de/anomic/plasma/parser/rss/rssParser.java b/source/de/anomic/plasma/parser/rss/rssParser.java index a6df6224b..d1ecd2c95 100644 --- a/source/de/anomic/plasma/parser/rss/rssParser.java +++ b/source/de/anomic/plasma/parser/rss/rssParser.java @@ -158,6 +158,7 @@ public class rssParser extends AbstractParser implements Parser { mimeType, "UTF-8", null, + null, feedTitle, (authors.length() > 0)?authors.toString(1,authors.length()):"", feedSections.toArray(new String[feedSections.size()]), diff --git a/source/de/anomic/plasma/parser/rtf/rtfParser.java b/source/de/anomic/plasma/parser/rtf/rtfParser.java index 6a2f800e8..d6839e38d 100644 --- a/source/de/anomic/plasma/parser/rtf/rtfParser.java +++ b/source/de/anomic/plasma/parser/rtf/rtfParser.java @@ -75,6 +75,7 @@ public class rtfParser extends AbstractParser implements Parser { mimeType, "UTF-8", null, + null, ((bodyText.length() > 80)? bodyText.substring(0, 80):bodyText.trim()). replaceAll("\r\n"," "). replaceAll("\n"," "). diff --git a/source/de/anomic/plasma/parser/sevenzip/sevenzipParser.java b/source/de/anomic/plasma/parser/sevenzip/sevenzipParser.java index 9fde0a890..4bf324896 100644 --- a/source/de/anomic/plasma/parser/sevenzip/sevenzipParser.java +++ b/source/de/anomic/plasma/parser/sevenzip/sevenzipParser.java @@ -63,7 +63,7 @@ public class sevenzipParser extends AbstractParser implements Parser { public plasmaParserDocument parse(final yacyURL location, final String mimeType, final String charset, final IInStream source, final long maxRamSize) throws ParserException, InterruptedException { - final plasmaParserDocument doc = new plasmaParserDocument(location, mimeType, charset); + final plasmaParserDocument doc = new plasmaParserDocument(location, mimeType, charset, null); Handler archive; super.theLogger.logFine("opening 7zip archive..."); try { diff --git a/source/de/anomic/plasma/parser/swf/swfParser.java b/source/de/anomic/plasma/parser/swf/swfParser.java index 8e4be9b86..9a5df0c05 100644 --- a/source/de/anomic/plasma/parser/swf/swfParser.java +++ b/source/de/anomic/plasma/parser/swf/swfParser.java @@ -107,6 +107,7 @@ public class swfParser extends AbstractParser implements Parser { location, // url of the source document mimeType, // the documents mime type "UTF-8", // charset of the document text + null, null, //keywords ((contents.length() > 80)? contents.substring(0, 80):contents.trim()). replaceAll("\r\n"," "). diff --git a/source/de/anomic/plasma/parser/tar/tarParser.java b/source/de/anomic/plasma/parser/tar/tarParser.java index b2dc48cb1..0eef6abc1 100644 --- a/source/de/anomic/plasma/parser/tar/tarParser.java +++ b/source/de/anomic/plasma/parser/tar/tarParser.java @@ -188,6 +188,7 @@ public class tarParser extends AbstractParser implements Parser { location, mimeType, null, + null, docKeywords.toString().split(" |,"), docLongTitle.toString(), "", // TODO: AUTHOR @@ -201,6 +202,7 @@ public class tarParser extends AbstractParser implements Parser { location, mimeType, null, + null, docKeywords.toString().split(" |,"), docLongTitle.toString(), "", // TODO: AUTHOR diff --git a/source/de/anomic/plasma/parser/vcf/vcfParser.java b/source/de/anomic/plasma/parser/vcf/vcfParser.java index 7b5ad1ec4..085cb85b8 100644 --- a/source/de/anomic/plasma/parser/vcf/vcfParser.java +++ b/source/de/anomic/plasma/parser/vcf/vcfParser.java @@ -78,7 +78,7 @@ public class vcfParser extends AbstractParser implements Parser { return SUPPORTED_MIME_TYPES; } - public plasmaParserDocument parse(final yacyURL location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException { + public plasmaParserDocument parse(final yacyURL url, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException { try { final StringBuffer parsedTitle = new StringBuffer(); @@ -213,7 +213,7 @@ public class vcfParser extends AbstractParser implements Parser { } else { if (theLogger.isFinest()) this.theLogger.logFinest("Invalid data in vcf file" + - "\n\tURL: " + location + + "\n\tURL: " + url + "\n\tLine: " + line + "\n\tLine-Nr: " + lineNr); } @@ -222,10 +222,11 @@ public class vcfParser extends AbstractParser implements Parser { final String[] sections = parsedNames.toArray(new String[parsedNames.size()]); final byte[] text = parsedDataText.toString().getBytes(); final plasmaParserDocument theDoc = new plasmaParserDocument( - location, // url of the source document + url, // url of the source document mimeType, // the documents mime type null, null, // a list of extracted keywords + null, // the language parsedTitle.toString(), // a long document title "", // TODO: AUTHOR sections, // an array of section headlines @@ -238,7 +239,7 @@ public class vcfParser extends AbstractParser implements Parser { if (e instanceof InterruptedException) throw (InterruptedException) e; if (e instanceof ParserException) throw (ParserException) e; - throw new ParserException("Unexpected error while parsing vcf resource. " + e.getMessage(),location); + throw new ParserException("Unexpected error while parsing vcf resource. " + e.getMessage(),url); } } diff --git a/source/de/anomic/plasma/parser/xls/xlsParser.java b/source/de/anomic/plasma/parser/xls/xlsParser.java index 411425aff..dcb452ed4 100644 --- a/source/de/anomic/plasma/parser/xls/xlsParser.java +++ b/source/de/anomic/plasma/parser/xls/xlsParser.java @@ -116,6 +116,7 @@ public class xlsParser extends AbstractParser implements Parser, HSSFListener { mimeType, "UTF-8", null, + null, ((contents.length() > 80) ? contents.substring(0, 80) : contents.trim()). replaceAll("\r\n"," "). replaceAll("\n"," "). diff --git a/source/de/anomic/plasma/parser/zip/zipParser.java b/source/de/anomic/plasma/parser/zip/zipParser.java index 34108b1f9..c524984bd 100644 --- a/source/de/anomic/plasma/parser/zip/zipParser.java +++ b/source/de/anomic/plasma/parser/zip/zipParser.java @@ -172,6 +172,7 @@ public class zipParser extends AbstractParser implements Parser { location, mimeType, null, + null, docKeywords.toString().split(" |,"), docLongTitle.toString(), "", // TODO: AUTHOR @@ -185,6 +186,7 @@ public class zipParser extends AbstractParser implements Parser { location, mimeType, null, + null, docKeywords.toString().split(" |,"), docLongTitle.toString(), "", // TODO: AUTHOR diff --git a/source/de/anomic/plasma/plasmaParser.java b/source/de/anomic/plasma/plasmaParser.java index 3bc8e9c90..93feb65de 100644 --- a/source/de/anomic/plasma/plasmaParser.java +++ b/source/de/anomic/plasma/plasmaParser.java @@ -727,6 +727,7 @@ public final class plasmaParser { location, mimeType, charSet, + scraper.getContentLanguages(), scraper.getKeywords(), scraper.getTitle(), scraper.getAuthor(), diff --git a/source/de/anomic/plasma/plasmaParserDocument.java b/source/de/anomic/plasma/plasmaParserDocument.java index 14b6a6dde..263882661 100644 --- a/source/de/anomic/plasma/plasmaParserDocument.java +++ b/source/de/anomic/plasma/plasmaParserDocument.java @@ -36,6 +36,7 @@ import java.util.Iterator; import java.util.LinkedList; import java.util.List; import java.util.Map; +import java.util.Set; import java.util.TreeSet; import de.anomic.htmlFilter.htmlFilterContentScraper; @@ -67,8 +68,9 @@ public class plasmaParserDocument { private boolean resorted; private InputStream textStream; private int inboundLinks, outboundLinks; // counters for inbound and outbound links, are counted after calling notifyWebStructure + private Set languages; - protected plasmaParserDocument(final yacyURL location, final String mimeType, final String charset, + protected plasmaParserDocument(final yacyURL location, final String mimeType, final String charset, final Set languages, final String[] keywords, final String title, final String author, final String[] sections, final String abstrct, final Object text, final Map anchors, final HashMap images) { @@ -90,6 +92,7 @@ public class plasmaParserDocument { this.resorted = false; this.inboundLinks = -1; this.outboundLinks = -1; + this.languages = languages; if (text == null) try { this.text = new serverCachedFileOutputStream(Parser.MAX_KEEP_IN_MEMORY_SIZE); @@ -101,31 +104,48 @@ public class plasmaParserDocument { } } - public plasmaParserDocument(final yacyURL location, final String mimeType, final String charset) { - this(location, mimeType, charset, null, null, null, null, null, (Object)null, null, null); + public plasmaParserDocument(final yacyURL location, final String mimeType, final String charset, final Set languages) { + this(location, mimeType, charset, languages, null, null, null, null, null, (Object)null, null, null); } - public plasmaParserDocument(final yacyURL location, final String mimeType, final String charset, + public plasmaParserDocument(final yacyURL location, final String mimeType, final String charset, final Set languages, final String[] keywords, final String title, final String author, final String[] sections, final String abstrct, final byte[] text, final Map anchors, final HashMap images) { - this(location, mimeType, charset, keywords, title, author, sections, abstrct, (Object)text, anchors, images); + this(location, mimeType, charset, languages, keywords, title, author, sections, abstrct, (Object)text, anchors, images); } - public plasmaParserDocument(final yacyURL location, final String mimeType, final String charset, + public plasmaParserDocument(final yacyURL location, final String mimeType, final String charset, final Set languages, final String[] keywords, final String title, final String author, final String[] sections, final String abstrct, final File text, final Map anchors, final HashMap images) { - this(location, mimeType, charset, keywords, title, author, sections, abstrct, (Object)text, anchors, images); + this(location, mimeType, charset, languages, keywords, title, author, sections, abstrct, (Object)text, anchors, images); } - public plasmaParserDocument(final yacyURL location, final String mimeType, final String charset, + public plasmaParserDocument(final yacyURL location, final String mimeType, final String charset, final Set languages, final String[] keywords, final String title, final String author, final String[] sections, final String abstrct, final serverCachedFileOutputStream text, final Map anchors, final HashMap images) { - this(location, mimeType, charset, keywords, title, author, sections, abstrct, (Object)text, anchors, images); + this(location, mimeType, charset, languages, keywords, title, author, sections, abstrct, (Object)text, anchors, images); } + /** + * compute a set of languages that this document contains + * the language is not computed using a statistical analysis of the content, only from given metadata that came with the document + * if there are several languages defined in the document, the TLD is taken to check which one should be picked + * If there is no metadata at all, null is returned + * @return a string with a language name using the alpha-2 code of ISO 639 + */ + public String languageByMetadata() { + if (this.languages == null) return null; + if (this.languages.size() == 0) return null; + if (this.languages.size() == 1) return languages.iterator().next(); + if (this.languages.contains(this.source.language())) return this.source.language(); + // now we are confused: the declared languages differ all from the TLD + // just pick one of the languages that we have + return languages.iterator().next(); + } + /* DC according to rfc 5013 diff --git a/source/de/anomic/plasma/plasmaSearchRankingProfile.java b/source/de/anomic/plasma/plasmaSearchRankingProfile.java index f2b861edd..66a5a7083 100644 --- a/source/de/anomic/plasma/plasmaSearchRankingProfile.java +++ b/source/de/anomic/plasma/plasmaSearchRankingProfile.java @@ -57,6 +57,7 @@ public class plasmaSearchRankingProfile { public static final String CATHASVIDEO = "cathasvideo"; public static final String CATHASAPP = "cathasapp"; public static final String TERMFREQUENCY = "tf"; + public static final String LANGUAGE = "language"; // ranking of preferred language // post-sort predicates public static final String URLCOMPINTOPLIST = "urlcompintoplist"; @@ -74,7 +75,7 @@ public class plasmaSearchRankingProfile { coeff_appurl, coeff_app_dc_title, coeff_app_dc_creator, coeff_app_dc_subject, coeff_app_dc_description, coeff_appemph, coeff_catindexof, coeff_cathasimage, coeff_cathasaudio, coeff_cathasvideo, coeff_cathasapp, coeff_urlcompintoplist, coeff_descrcompintoplist, coeff_prefer, - coeff_termfrequency; + coeff_termfrequency, coeff_language; public plasmaSearchRankingProfile(final int mediatype) { // set default-values @@ -109,6 +110,7 @@ public class plasmaSearchRankingProfile { coeff_urlcompintoplist = 3; coeff_descrcompintoplist = 2; coeff_prefer = 14; + coeff_language = 13; } public plasmaSearchRankingProfile(final String prefix, final String profile) { @@ -160,6 +162,7 @@ public class plasmaSearchRankingProfile { coeff_urlcompintoplist = parseMap(coeff, URLCOMPINTOPLIST, coeff_urlcompintoplist); coeff_descrcompintoplist = parseMap(coeff, DESCRCOMPINTOPLIST, coeff_descrcompintoplist); coeff_prefer = parseMap(coeff, PREFER, coeff_prefer); + coeff_language = parseMap(coeff, LANGUAGE, coeff_language); } } @@ -209,6 +212,7 @@ public class plasmaSearchRankingProfile { ext.put(prefix + CATHASVIDEO, Integer.toString(coeff_cathasvideo)); ext.put(prefix + CATHASAPP, Integer.toString(coeff_cathasapp)); ext.put(prefix + TERMFREQUENCY, Integer.toString(coeff_termfrequency)); + ext.put(prefix + LANGUAGE, Integer.toString(coeff_language)); return ext; } diff --git a/source/de/anomic/plasma/plasmaWordIndex.java b/source/de/anomic/plasma/plasmaWordIndex.java index 70f60ed3b..1163583e1 100644 --- a/source/de/anomic/plasma/plasmaWordIndex.java +++ b/source/de/anomic/plasma/plasmaWordIndex.java @@ -819,12 +819,29 @@ public final class plasmaWordIndex implements indexRI { final yacyURL referrerURL = entry.referrerURL(); final Date docDate = entry.getModificationDate(); String language = condenser.language(); + String bymetadata = document.languageByMetadata(); // the languageByMetadata may return null if there was no declaration if (language == null) { - System.out.println("*** DEBUG LANGUAGE: identification of " + entry.url() + " FAILED, taking TLD"); - language = entry.url().language(); + language = (bymetadata == null) ? entry.url().language() : bymetadata; + System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " FAILED, taking " + ((bymetadata == null) ? "TLD" : "metadata") + ": " + language); } else { - System.out.println("*** DEBUG LANGUAGE: identification of " + entry.url() + " SUCCESS: " + language); - if (language.equals("pl")) language = entry.url().language(); // patch a bug TODO: remove this if bug is fixed + if (language.equals("pl")) { + System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " HAS BUG: " + language); + language = (bymetadata == null) ? entry.url().language() : bymetadata; // extra handling of this case: overwrite with bymetadata + } else { + if (bymetadata == null) { + if (language.equals(entry.url().language())) + System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFIRMED - TLD IDENTICAL: " + language); + else { + System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFLICTING: " + language + " (the language given by the TLD is " + entry.url().language() + ")"); + language = entry.url().language(); + } + } else { + if (language.equals(bymetadata)) + System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFIRMED - METADATA IDENTICAL: " + language); + else + System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFLICTING: " + language + " (the language given by metadata is " + bymetadata + ")"); + } + } } // create a new loaded URL db entry diff --git a/source/de/anomic/tools/iso639.java b/source/de/anomic/tools/iso639.java new file mode 100755 index 000000000..01cb2e9c5 --- /dev/null +++ b/source/de/anomic/tools/iso639.java @@ -0,0 +1,197 @@ +// iso639.java +// (C) 2008 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany +// first published 19.09.2008 on http://yacy.net +// +// This is a part of YaCy, a peer-to-peer based web search engine +// +// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $ +// $LastChangedRevision: 1986 $ +// $LastChangedBy: orbiter $ +// +// LICENSE +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +package de.anomic.tools; + +import java.util.HashMap; + +public class iso639 { + + static final String[] codes = { + "aa-Afar", + "ab-Abkhazian", + "af-Afrikaans", + "am-Amharic", + "ar-Arabic", + "as-Assamese", + "ay-Aymara", + "az-Azerbaijani", + "ba-Bashkir", + "be-Byelorussian", + "bg-Bulgarian", + "bh-Bihari", + "bi-Bislama", + "bn-Bengali;-Bangla", + "bo-Tibetan", + "br-Breton", + "ca-Catalan", + "co-Corsican", + "cs-Czech", + "cy-Welsh", + "da-Danish", + "de-German", + "dz-Bhutani", + "el-Greek", + "en-English", + "eo-Esperanto", + "es-Spanish", + "et-Estonian", + "eu-Basque", + "fa-Persian", + "fi-Finnish", + "fj-Fiji", + "fo-Faeroese", + "fr-French", + "fy-Frisian", + "ga-Irish", + "gd-Scots-Gaelic", + "gl-Galician", + "gn-Guarani", + "gu-Gujarati", + "ha-Hausa", + "hi-Hindi", + "hr-Croatian", + "hu-Hungarian", + "hy-Armenian", + "ia-Interlingua", + "ie-Interlingue", + "ik-Inupiak", + "in-Indonesian", + "is-Icelandic", + "it-Italian", + "iw-Hebrew", + "ja-Japanese", + "ji-Yiddish", + "jw-Javanese", + "ka-Georgian", + "kk-Kazakh", + "kl-Greenlandic", + "km-Cambodian", + "kn-Kannada", + "ko-Korean", + "ks-Kashmiri", + "ku-Kurdish", + "ky-Kirghiz", + "la-Latin", + "ln-Lingala", + "lo-Laothian", + "lt-Lithuanian", + "lv-Latvian,-Lettish", + "mg-Malagasy", + "mi-Maori", + "mk-Macedonian", + "ml-Malayalam", + "mn-Mongolian", + "mo-Moldavian", + "mr-Marathi", + "ms-Malay", + "mt-Maltese", + "my-Burmese", + "na-Nauru", + "ne-Nepali", + "nl-Dutch", + "no-Norwegian", + "oc-Occitan", + "om-(Afan)-Oromo", + "or-Oriya", + "pa-Punjabi", + "pl-Polish", + "ps-Pashto,-Pushto", + "pt-Portuguese", + "qu-Quechua", + "rm-Rhaeto-Romance", + "rn-Kirundi", + "ro-Romanian", + "ru-Russian", + "rw-Kinyarwanda", + "sa-Sanskrit", + "sd-Sindhi", + "sg-Sangro", + "sh-Serbo-Croatian", + "si-Singhalese", + "sk-Slovak", + "sl-Slovenian", + "sm-Samoan", + "sn-Shona", + "so-Somali", + "sq-Albanian", + "sr-Serbian", + "ss-Siswati", + "st-Sesotho", + "su-Sundanese", + "sv-Swedish", + "sw-Swahili", + "ta-Tamil", + "te-Tegulu", + "tg-Tajik", + "th-Thai", + "ti-Tigrinya", + "tk-Turkmen", + "tl-Tagalog", + "tn-Setswana", + "to-Tonga", + "tr-Turkish", + "ts-Tsonga", + "tt-Tatar", + "tw-Twi", + "uk-Ukrainian", + "ur-Urdu", + "uz-Uzbek", + "vi-Vietnamese", + "vo-Volapuk", + "wo-Wolof", + "xh-Xhosa", + "yo-Yoruba", + "zh-Chinese", + "zu-Zulu"}; + + static HashMap mapping = new HashMap(); + + static { + for (int i = 0; i < codes.length; i++) { + mapping.put(codes[i].substring(0, 2), codes[i].substring(3)); + } + } + + /** + * get the name of the alpha-2 country code + * @param code, the mnemonic of the country in alpha-2 + * @return the name of the country + */ + public static final String country(String code) { + return mapping.get(code.toLowerCase()); + } + + /** + * see if the given country in alpha-2 country code exists + * @param code, the mnemonic of the country in alpha-2 + * @return true if the code exists + */ + public static final boolean exists(String code) { + return mapping.containsKey(code.toLowerCase()); + } + +} \ No newline at end of file diff --git a/source/de/anomic/yacy/yacyURL.java b/source/de/anomic/yacy/yacyURL.java index a7466994f..4058f4546 100644 --- a/source/de/anomic/yacy/yacyURL.java +++ b/source/de/anomic/yacy/yacyURL.java @@ -848,7 +848,7 @@ public class yacyURL implements Serializable { // language calculation public String language() { - String language = "uk"; + String language = "en"; final int pos = host.lastIndexOf("."); if ((pos > 0) && (host.length() - pos == 3)) language = host.substring(pos + 1).toLowerCase(); return language;