- added language detection using metadata from documents: html and odt documents provide this information

- metadata and results from statistical analysis are compared and result is printed out as debug lines
- added ranking profile for wanted language
- added class with ISO 639 table, a list of all valid language codes (alpha-2) that will be used for the language identification

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5187 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 17 years ago
parent 3768a1bd32
commit bfcf9b7aa3

@ -75,6 +75,7 @@ public class Ranking_p {
rankingParameters.put(plasmaSearchRankingProfile.WORDSINTEXT, "Words In Text"); rankingParameters.put(plasmaSearchRankingProfile.WORDSINTEXT, "Words In Text");
rankingParameters.put(plasmaSearchRankingProfile.WORDSINTITLE, "Words In Title"); rankingParameters.put(plasmaSearchRankingProfile.WORDSINTITLE, "Words In Title");
rankingParameters.put(plasmaSearchRankingProfile.YBR, "YaCy Block Rank"); rankingParameters.put(plasmaSearchRankingProfile.YBR, "YaCy Block Rank");
rankingParameters.put(plasmaSearchRankingProfile.LANGUAGE, "Preferred Language");
} }
private static serverObjects defaultValues() { private static serverObjects defaultValues() {

@ -29,6 +29,7 @@ package xml.util;
import java.io.IOException; import java.io.IOException;
import java.io.Writer; import java.io.Writer;
import java.net.MalformedURLException; import java.net.MalformedURLException;
import java.util.Set;
import de.anomic.crawler.HTTPLoader; import de.anomic.crawler.HTTPLoader;
import de.anomic.htmlFilter.htmlFilterContentScraper; import de.anomic.htmlFilter.htmlFilterContentScraper;
@ -103,8 +104,9 @@ public class getpageinfo_p {
prop.put("tags", count); prop.put("tags", count);
// put description // put description
prop.putHTML("desc", scraper.getDescription(), true); prop.putHTML("desc", scraper.getDescription(), true);
// put language // put language
prop.putHTML("lang", scraper.getContentLanguages()[0], true); Set<String> languages = scraper.getContentLanguages();
prop.putHTML("lang", (languages == null) ? "unknown" : languages.iterator().next(), true);
} catch (final MalformedURLException e) { /* ignore this */ } catch (final MalformedURLException e) { /* ignore this */
} catch (final IOException e) { /* ignore this */ } catch (final IOException e) { /* ignore this */

@ -50,6 +50,7 @@ import de.anomic.http.httpRequestHeader;
import de.anomic.server.serverCharBuffer; import de.anomic.server.serverCharBuffer;
import de.anomic.server.serverFileUtils; import de.anomic.server.serverFileUtils;
import de.anomic.yacy.yacyURL; import de.anomic.yacy.yacyURL;
import de.anomic.tools.iso639;
public class htmlFilterContentScraper extends htmlFilterAbstractScraper implements htmlFilterScraper { public class htmlFilterContentScraper extends htmlFilterAbstractScraper implements htmlFilterScraper {
@ -381,11 +382,21 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
return s; return s;
} }
public String[] getContentLanguages() { public HashSet<String> getContentLanguages() {
String s = metas.get("content-language"); String s = metas.get("content-language");
if (s == null) s = metas.get("dc.language"); if (s == null) s = metas.get("dc.language");
if (s == null) s = ""; if (s == null) return null;
return s.split(" |,"); HashSet<String> hs = new HashSet<String>();
String[] cl = s.split(" |,");
int p;
for (int i = 0; i < cl.length; i++) {
cl[i] = cl[i].toLowerCase();
p = cl[i].indexOf('-');
if (p > 0) cl[i] = cl[i].substring(0, p);
if (iso639.exists(cl[i])) hs.add(cl[i]);
}
if (hs.size() == 0) return null;
return hs;
} }
public String[] getKeywords() { public String[] getKeywords() {

@ -118,6 +118,7 @@ public final class indexContainerHeap {
int urlCount = 0; int urlCount = 0;
synchronized (cache) { synchronized (cache) {
for (final indexContainer container : new heapFileEntries(heapFile, this.payloadrow)) { for (final indexContainer container : new heapFileEntries(heapFile, this.payloadrow)) {
// TODO: in this loop a lot of memory may be allocated. A check if the memory gets low is necessary. But what do when the memory is low?
if (container == null) break; if (container == null) break;
cache.put(container.getWordHash(), container); cache.put(container.getWordHash(), container);
urlCount += container.size(); urlCount += container.size();
@ -252,6 +253,10 @@ public final class indexContainerHeap {
} }
} }
/**
* return an index container
* because they may get very large, it is wise to deallocate some memory before calling next()
*/
public indexContainer next() { public indexContainer next() {
final indexContainer n = this.nextContainer; final indexContainer n = this.nextContainer;
this.nextContainer = next0(); this.nextContainer = next0();

@ -70,6 +70,7 @@ public class docParser extends AbstractParser implements Parser {
mimeType, mimeType,
"UTF-8", "UTF-8",
null, null,
null,
((contents.length() > 80)? contents.substring(0, 80):contents.trim()). ((contents.length() > 80)? contents.substring(0, 80):contents.trim()).
replaceAll("\r\n"," "). replaceAll("\r\n"," ").
replaceAll("\n"," "). replaceAll("\n"," ").

@ -32,7 +32,9 @@ import java.io.OutputStreamWriter;
import java.io.Writer; import java.io.Writer;
import java.nio.charset.Charset; import java.nio.charset.Charset;
import java.util.Enumeration; import java.util.Enumeration;
import java.util.HashSet;
import java.util.Hashtable; import java.util.Hashtable;
import java.util.Set;
import java.util.zip.ZipEntry; import java.util.zip.ZipEntry;
import java.util.zip.ZipFile; import java.util.zip.ZipFile;
@ -89,6 +91,7 @@ public class odtParser extends AbstractParser implements Parser {
String docShortTitle = null; String docShortTitle = null;
String docLongTitle = null; String docLongTitle = null;
String docAuthor = null; String docAuthor = null;
String docLanguage = null;
// opening the file as zip file // opening the file as zip file
final ZipFile zipFile= new ZipFile(dest); final ZipFile zipFile= new ZipFile(dest);
@ -134,9 +137,14 @@ public class odtParser extends AbstractParser implements Parser {
docShortTitle = metaData.getTitle(); docShortTitle = metaData.getTitle();
docLongTitle = metaData.getSubject(); docLongTitle = metaData.getSubject();
docAuthor = metaData.getCreator(); docAuthor = metaData.getCreator();
docLanguage = metaData.getLanguage();
} }
} }
// make the languages set
Set<String> languages = new HashSet<String>(1);
if (docLanguage != null) languages.add(docLanguage);
// if there is no title availabe we generate one // if there is no title availabe we generate one
if (docLongTitle == null) { if (docLongTitle == null) {
if (docShortTitle != null) { if (docShortTitle != null) {
@ -156,6 +164,7 @@ public class odtParser extends AbstractParser implements Parser {
location, location,
mimeType, mimeType,
"UTF-8", "UTF-8",
languages,
docKeywords, docKeywords,
docLongTitle, docLongTitle,
docAuthor, docAuthor,
@ -169,6 +178,7 @@ public class odtParser extends AbstractParser implements Parser {
location, location,
mimeType, mimeType,
"UTF-8", "UTF-8",
languages,
docKeywords, docKeywords,
docLongTitle, docLongTitle,
docAuthor, docAuthor,

@ -143,6 +143,7 @@ public class pdfParser extends AbstractParser implements Parser {
location, location,
mimeType, mimeType,
"UTF-8", "UTF-8",
null,
docKeywords, docKeywords,
(docTitle == null) ? docSubject : docTitle, (docTitle == null) ? docSubject : docTitle,
docAuthor, docAuthor,
@ -156,6 +157,7 @@ public class pdfParser extends AbstractParser implements Parser {
location, location,
mimeType, mimeType,
"UTF-8", "UTF-8",
null,
docKeywords, docKeywords,
(docTitle == null) ? docSubject : docTitle, (docTitle == null) ? docSubject : docTitle,
docAuthor, docAuthor,

@ -88,6 +88,7 @@ public class pptParser extends AbstractParser implements Parser {
mimeType, mimeType,
"UTF-8", "UTF-8",
null, null,
null,
((contents.length() > 80) ? contents.substring(0, 80) : contents.trim()). ((contents.length() > 80) ? contents.substring(0, 80) : contents.trim()).
replaceAll("\r\n"," "). replaceAll("\r\n"," ").
replaceAll("\n"," "). replaceAll("\n"," ").

@ -117,6 +117,7 @@ public class psParser extends AbstractParser implements Parser {
"UTF-8", "UTF-8",
null, null,
null, null,
null,
"", "",
null, null,
null, null,

@ -134,6 +134,7 @@ public class rpmParser extends AbstractParser implements Parser {
mimeType, mimeType,
"UTF-8", "UTF-8",
null, null,
null,
summary, summary,
packager, packager,
null, null,

@ -158,6 +158,7 @@ public class rssParser extends AbstractParser implements Parser {
mimeType, mimeType,
"UTF-8", "UTF-8",
null, null,
null,
feedTitle, feedTitle,
(authors.length() > 0)?authors.toString(1,authors.length()):"", (authors.length() > 0)?authors.toString(1,authors.length()):"",
feedSections.toArray(new String[feedSections.size()]), feedSections.toArray(new String[feedSections.size()]),

@ -75,6 +75,7 @@ public class rtfParser extends AbstractParser implements Parser {
mimeType, mimeType,
"UTF-8", "UTF-8",
null, null,
null,
((bodyText.length() > 80)? bodyText.substring(0, 80):bodyText.trim()). ((bodyText.length() > 80)? bodyText.substring(0, 80):bodyText.trim()).
replaceAll("\r\n"," "). replaceAll("\r\n"," ").
replaceAll("\n"," "). replaceAll("\n"," ").

@ -63,7 +63,7 @@ public class sevenzipParser extends AbstractParser implements Parser {
public plasmaParserDocument parse(final yacyURL location, final String mimeType, final String charset, public plasmaParserDocument parse(final yacyURL location, final String mimeType, final String charset,
final IInStream source, final long maxRamSize) throws ParserException, InterruptedException { final IInStream source, final long maxRamSize) throws ParserException, InterruptedException {
final plasmaParserDocument doc = new plasmaParserDocument(location, mimeType, charset); final plasmaParserDocument doc = new plasmaParserDocument(location, mimeType, charset, null);
Handler archive; Handler archive;
super.theLogger.logFine("opening 7zip archive..."); super.theLogger.logFine("opening 7zip archive...");
try { try {

@ -107,6 +107,7 @@ public class swfParser extends AbstractParser implements Parser {
location, // url of the source document location, // url of the source document
mimeType, // the documents mime type mimeType, // the documents mime type
"UTF-8", // charset of the document text "UTF-8", // charset of the document text
null,
null, //keywords null, //keywords
((contents.length() > 80)? contents.substring(0, 80):contents.trim()). ((contents.length() > 80)? contents.substring(0, 80):contents.trim()).
replaceAll("\r\n"," "). replaceAll("\r\n"," ").

@ -188,6 +188,7 @@ public class tarParser extends AbstractParser implements Parser {
location, location,
mimeType, mimeType,
null, null,
null,
docKeywords.toString().split(" |,"), docKeywords.toString().split(" |,"),
docLongTitle.toString(), docLongTitle.toString(),
"", // TODO: AUTHOR "", // TODO: AUTHOR
@ -201,6 +202,7 @@ public class tarParser extends AbstractParser implements Parser {
location, location,
mimeType, mimeType,
null, null,
null,
docKeywords.toString().split(" |,"), docKeywords.toString().split(" |,"),
docLongTitle.toString(), docLongTitle.toString(),
"", // TODO: AUTHOR "", // TODO: AUTHOR

@ -78,7 +78,7 @@ public class vcfParser extends AbstractParser implements Parser {
return SUPPORTED_MIME_TYPES; return SUPPORTED_MIME_TYPES;
} }
public plasmaParserDocument parse(final yacyURL location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException { public plasmaParserDocument parse(final yacyURL url, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException {
try { try {
final StringBuffer parsedTitle = new StringBuffer(); final StringBuffer parsedTitle = new StringBuffer();
@ -213,7 +213,7 @@ public class vcfParser extends AbstractParser implements Parser {
} else { } else {
if (theLogger.isFinest()) this.theLogger.logFinest("Invalid data in vcf file" + if (theLogger.isFinest()) this.theLogger.logFinest("Invalid data in vcf file" +
"\n\tURL: " + location + "\n\tURL: " + url +
"\n\tLine: " + line + "\n\tLine: " + line +
"\n\tLine-Nr: " + lineNr); "\n\tLine-Nr: " + lineNr);
} }
@ -222,10 +222,11 @@ public class vcfParser extends AbstractParser implements Parser {
final String[] sections = parsedNames.toArray(new String[parsedNames.size()]); final String[] sections = parsedNames.toArray(new String[parsedNames.size()]);
final byte[] text = parsedDataText.toString().getBytes(); final byte[] text = parsedDataText.toString().getBytes();
final plasmaParserDocument theDoc = new plasmaParserDocument( final plasmaParserDocument theDoc = new plasmaParserDocument(
location, // url of the source document url, // url of the source document
mimeType, // the documents mime type mimeType, // the documents mime type
null, null,
null, // a list of extracted keywords null, // a list of extracted keywords
null, // the language
parsedTitle.toString(), // a long document title parsedTitle.toString(), // a long document title
"", // TODO: AUTHOR "", // TODO: AUTHOR
sections, // an array of section headlines sections, // an array of section headlines
@ -238,7 +239,7 @@ public class vcfParser extends AbstractParser implements Parser {
if (e instanceof InterruptedException) throw (InterruptedException) e; if (e instanceof InterruptedException) throw (InterruptedException) e;
if (e instanceof ParserException) throw (ParserException) e; if (e instanceof ParserException) throw (ParserException) e;
throw new ParserException("Unexpected error while parsing vcf resource. " + e.getMessage(),location); throw new ParserException("Unexpected error while parsing vcf resource. " + e.getMessage(),url);
} }
} }

@ -116,6 +116,7 @@ public class xlsParser extends AbstractParser implements Parser, HSSFListener {
mimeType, mimeType,
"UTF-8", "UTF-8",
null, null,
null,
((contents.length() > 80) ? contents.substring(0, 80) : contents.trim()). ((contents.length() > 80) ? contents.substring(0, 80) : contents.trim()).
replaceAll("\r\n"," "). replaceAll("\r\n"," ").
replaceAll("\n"," "). replaceAll("\n"," ").

@ -172,6 +172,7 @@ public class zipParser extends AbstractParser implements Parser {
location, location,
mimeType, mimeType,
null, null,
null,
docKeywords.toString().split(" |,"), docKeywords.toString().split(" |,"),
docLongTitle.toString(), docLongTitle.toString(),
"", // TODO: AUTHOR "", // TODO: AUTHOR
@ -185,6 +186,7 @@ public class zipParser extends AbstractParser implements Parser {
location, location,
mimeType, mimeType,
null, null,
null,
docKeywords.toString().split(" |,"), docKeywords.toString().split(" |,"),
docLongTitle.toString(), docLongTitle.toString(),
"", // TODO: AUTHOR "", // TODO: AUTHOR

@ -727,6 +727,7 @@ public final class plasmaParser {
location, location,
mimeType, mimeType,
charSet, charSet,
scraper.getContentLanguages(),
scraper.getKeywords(), scraper.getKeywords(),
scraper.getTitle(), scraper.getTitle(),
scraper.getAuthor(), scraper.getAuthor(),

@ -36,6 +36,7 @@ import java.util.Iterator;
import java.util.LinkedList; import java.util.LinkedList;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Set;
import java.util.TreeSet; import java.util.TreeSet;
import de.anomic.htmlFilter.htmlFilterContentScraper; import de.anomic.htmlFilter.htmlFilterContentScraper;
@ -67,8 +68,9 @@ public class plasmaParserDocument {
private boolean resorted; private boolean resorted;
private InputStream textStream; private InputStream textStream;
private int inboundLinks, outboundLinks; // counters for inbound and outbound links, are counted after calling notifyWebStructure private int inboundLinks, outboundLinks; // counters for inbound and outbound links, are counted after calling notifyWebStructure
private Set<String> languages;
protected plasmaParserDocument(final yacyURL location, final String mimeType, final String charset, protected plasmaParserDocument(final yacyURL location, final String mimeType, final String charset, final Set<String> languages,
final String[] keywords, final String title, final String author, final String[] keywords, final String title, final String author,
final String[] sections, final String abstrct, final String[] sections, final String abstrct,
final Object text, final Map<yacyURL, String> anchors, final HashMap<String, htmlFilterImageEntry> images) { final Object text, final Map<yacyURL, String> anchors, final HashMap<String, htmlFilterImageEntry> images) {
@ -90,6 +92,7 @@ public class plasmaParserDocument {
this.resorted = false; this.resorted = false;
this.inboundLinks = -1; this.inboundLinks = -1;
this.outboundLinks = -1; this.outboundLinks = -1;
this.languages = languages;
if (text == null) try { if (text == null) try {
this.text = new serverCachedFileOutputStream(Parser.MAX_KEEP_IN_MEMORY_SIZE); this.text = new serverCachedFileOutputStream(Parser.MAX_KEEP_IN_MEMORY_SIZE);
@ -101,31 +104,48 @@ public class plasmaParserDocument {
} }
} }
public plasmaParserDocument(final yacyURL location, final String mimeType, final String charset) { public plasmaParserDocument(final yacyURL location, final String mimeType, final String charset, final Set<String> languages) {
this(location, mimeType, charset, null, null, null, null, null, (Object)null, null, null); this(location, mimeType, charset, languages, null, null, null, null, null, (Object)null, null, null);
} }
public plasmaParserDocument(final yacyURL location, final String mimeType, final String charset, public plasmaParserDocument(final yacyURL location, final String mimeType, final String charset, final Set<String> languages,
final String[] keywords, final String title, final String author, final String[] keywords, final String title, final String author,
final String[] sections, final String abstrct, final String[] sections, final String abstrct,
final byte[] text, final Map<yacyURL, String> anchors, final HashMap<String, htmlFilterImageEntry> images) { final byte[] text, final Map<yacyURL, String> anchors, final HashMap<String, htmlFilterImageEntry> images) {
this(location, mimeType, charset, keywords, title, author, sections, abstrct, (Object)text, anchors, images); this(location, mimeType, charset, languages, keywords, title, author, sections, abstrct, (Object)text, anchors, images);
} }
public plasmaParserDocument(final yacyURL location, final String mimeType, final String charset, public plasmaParserDocument(final yacyURL location, final String mimeType, final String charset, final Set<String> languages,
final String[] keywords, final String title, final String author, final String[] keywords, final String title, final String author,
final String[] sections, final String abstrct, final String[] sections, final String abstrct,
final File text, final Map<yacyURL, String> anchors, final HashMap<String, htmlFilterImageEntry> images) { final File text, final Map<yacyURL, String> anchors, final HashMap<String, htmlFilterImageEntry> images) {
this(location, mimeType, charset, keywords, title, author, sections, abstrct, (Object)text, anchors, images); this(location, mimeType, charset, languages, keywords, title, author, sections, abstrct, (Object)text, anchors, images);
} }
public plasmaParserDocument(final yacyURL location, final String mimeType, final String charset, public plasmaParserDocument(final yacyURL location, final String mimeType, final String charset, final Set<String> languages,
final String[] keywords, final String title, final String author, final String[] keywords, final String title, final String author,
final String[] sections, final String abstrct, final String[] sections, final String abstrct,
final serverCachedFileOutputStream text, final Map<yacyURL, String> anchors, final HashMap<String, htmlFilterImageEntry> images) { final serverCachedFileOutputStream text, final Map<yacyURL, String> anchors, final HashMap<String, htmlFilterImageEntry> images) {
this(location, mimeType, charset, keywords, title, author, sections, abstrct, (Object)text, anchors, images); this(location, mimeType, charset, languages, keywords, title, author, sections, abstrct, (Object)text, anchors, images);
} }
/**
 * Pick a single language for this document from its declared metadata.
 * No statistical analysis of the content takes place here; only the language
 * set handed to the constructor is consulted.
 * <ul>
 * <li>no language set, or an empty one: <code>null</code> is returned</li>
 * <li>exactly one declared language: that language is returned</li>
 * <li>several declared languages: the one equal to <code>source.language()</code>
 *     (the language derived from the document location, i.e. its TLD) wins if it
 *     is among them; otherwise whichever element the set iterator yields first</li>
 * </ul>
 * @return one of the declared language codes, or null if nothing was declared
 */
public String languageByMetadata() {
    if (this.languages == null || this.languages.isEmpty()) {
        return null;
    }
    if (this.languages.size() > 1) {
        // several candidates: let the language of the source location decide
        final String locationLanguage = this.source.language();
        if (this.languages.contains(locationLanguage)) {
            return locationLanguage;
        }
        // none of the declared languages matches the location: fall through and take any
    }
    return this.languages.iterator().next();
}
/* /*
DC according to rfc 5013 DC according to rfc 5013

@ -57,6 +57,7 @@ public class plasmaSearchRankingProfile {
public static final String CATHASVIDEO = "cathasvideo"; public static final String CATHASVIDEO = "cathasvideo";
public static final String CATHASAPP = "cathasapp"; public static final String CATHASAPP = "cathasapp";
public static final String TERMFREQUENCY = "tf"; public static final String TERMFREQUENCY = "tf";
public static final String LANGUAGE = "language"; // ranking of preferred language
// post-sort predicates // post-sort predicates
public static final String URLCOMPINTOPLIST = "urlcompintoplist"; public static final String URLCOMPINTOPLIST = "urlcompintoplist";
@ -74,7 +75,7 @@ public class plasmaSearchRankingProfile {
coeff_appurl, coeff_app_dc_title, coeff_app_dc_creator, coeff_app_dc_subject, coeff_app_dc_description, coeff_appemph, coeff_appurl, coeff_app_dc_title, coeff_app_dc_creator, coeff_app_dc_subject, coeff_app_dc_description, coeff_appemph,
coeff_catindexof, coeff_cathasimage, coeff_cathasaudio, coeff_cathasvideo, coeff_cathasapp, coeff_catindexof, coeff_cathasimage, coeff_cathasaudio, coeff_cathasvideo, coeff_cathasapp,
coeff_urlcompintoplist, coeff_descrcompintoplist, coeff_prefer, coeff_urlcompintoplist, coeff_descrcompintoplist, coeff_prefer,
coeff_termfrequency; coeff_termfrequency, coeff_language;
public plasmaSearchRankingProfile(final int mediatype) { public plasmaSearchRankingProfile(final int mediatype) {
// set default-values // set default-values
@ -109,6 +110,7 @@ public class plasmaSearchRankingProfile {
coeff_urlcompintoplist = 3; coeff_urlcompintoplist = 3;
coeff_descrcompintoplist = 2; coeff_descrcompintoplist = 2;
coeff_prefer = 14; coeff_prefer = 14;
coeff_language = 13;
} }
public plasmaSearchRankingProfile(final String prefix, final String profile) { public plasmaSearchRankingProfile(final String prefix, final String profile) {
@ -160,6 +162,7 @@ public class plasmaSearchRankingProfile {
coeff_urlcompintoplist = parseMap(coeff, URLCOMPINTOPLIST, coeff_urlcompintoplist); coeff_urlcompintoplist = parseMap(coeff, URLCOMPINTOPLIST, coeff_urlcompintoplist);
coeff_descrcompintoplist = parseMap(coeff, DESCRCOMPINTOPLIST, coeff_descrcompintoplist); coeff_descrcompintoplist = parseMap(coeff, DESCRCOMPINTOPLIST, coeff_descrcompintoplist);
coeff_prefer = parseMap(coeff, PREFER, coeff_prefer); coeff_prefer = parseMap(coeff, PREFER, coeff_prefer);
coeff_language = parseMap(coeff, LANGUAGE, coeff_language);
} }
} }
@ -209,6 +212,7 @@ public class plasmaSearchRankingProfile {
ext.put(prefix + CATHASVIDEO, Integer.toString(coeff_cathasvideo)); ext.put(prefix + CATHASVIDEO, Integer.toString(coeff_cathasvideo));
ext.put(prefix + CATHASAPP, Integer.toString(coeff_cathasapp)); ext.put(prefix + CATHASAPP, Integer.toString(coeff_cathasapp));
ext.put(prefix + TERMFREQUENCY, Integer.toString(coeff_termfrequency)); ext.put(prefix + TERMFREQUENCY, Integer.toString(coeff_termfrequency));
ext.put(prefix + LANGUAGE, Integer.toString(coeff_language));
return ext; return ext;
} }

@ -819,12 +819,29 @@ public final class plasmaWordIndex implements indexRI {
final yacyURL referrerURL = entry.referrerURL(); final yacyURL referrerURL = entry.referrerURL();
final Date docDate = entry.getModificationDate(); final Date docDate = entry.getModificationDate();
String language = condenser.language(); String language = condenser.language();
String bymetadata = document.languageByMetadata(); // the languageByMetadata may return null if there was no declaration
if (language == null) { if (language == null) {
System.out.println("*** DEBUG LANGUAGE: identification of " + entry.url() + " FAILED, taking TLD"); language = (bymetadata == null) ? entry.url().language() : bymetadata;
language = entry.url().language(); System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " FAILED, taking " + ((bymetadata == null) ? "TLD" : "metadata") + ": " + language);
} else { } else {
System.out.println("*** DEBUG LANGUAGE: identification of " + entry.url() + " SUCCESS: " + language); if (language.equals("pl")) {
if (language.equals("pl")) language = entry.url().language(); // patch a bug TODO: remove this if bug is fixed System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " HAS BUG: " + language);
language = (bymetadata == null) ? entry.url().language() : bymetadata; // extra handling of this case: overwrite with bymetadata
} else {
if (bymetadata == null) {
if (language.equals(entry.url().language()))
System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFIRMED - TLD IDENTICAL: " + language);
else {
System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFLICTING: " + language + " (the language given by the TLD is " + entry.url().language() + ")");
language = entry.url().language();
}
} else {
if (language.equals(bymetadata))
System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFIRMED - METADATA IDENTICAL: " + language);
else
System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFLICTING: " + language + " (the language given by metadata is " + bymetadata + ")");
}
}
} }
// create a new loaded URL db entry // create a new loaded URL db entry

@ -0,0 +1,197 @@
// iso639.java
// (C) 2008 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 19.09.2008 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
// $LastChangedRevision: 1986 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.tools;
import java.util.HashMap;
/**
 * Lookup table for ISO 639 alpha-2 language codes.
 * Each entry of {@link #codes} has the form <code>xx-Name</code>, where <code>xx</code>
 * is the two-letter code and everything after the first dash is the language name.
 * Lookups are case-insensitive and independent of the JVM default locale.
 */
public class iso639 {

    static final String[] codes = {
        "aa-Afar",
        "ab-Abkhazian",
        "af-Afrikaans",
        "am-Amharic",
        "ar-Arabic",
        "as-Assamese",
        "ay-Aymara",
        "az-Azerbaijani",
        "ba-Bashkir",
        "be-Byelorussian",
        "bg-Bulgarian",
        "bh-Bihari",
        "bi-Bislama",
        "bn-Bengali;-Bangla",
        "bo-Tibetan",
        "br-Breton",
        "ca-Catalan",
        "co-Corsican",
        "cs-Czech",
        "cy-Welsh",
        "da-Danish",
        "de-German",
        "dz-Bhutani",
        "el-Greek",
        "en-English",
        "eo-Esperanto",
        "es-Spanish",
        "et-Estonian",
        "eu-Basque",
        "fa-Persian",
        "fi-Finnish",
        "fj-Fiji",
        "fo-Faeroese",
        "fr-French",
        "fy-Frisian",
        "ga-Irish",
        "gd-Scots-Gaelic",
        "gl-Galician",
        "gn-Guarani",
        "gu-Gujarati",
        "ha-Hausa",
        "hi-Hindi",
        "hr-Croatian",
        "hu-Hungarian",
        "hy-Armenian",
        "ia-Interlingua",
        "ie-Interlingue",
        "ik-Inupiak",
        "in-Indonesian",
        "is-Icelandic",
        "it-Italian",
        "iw-Hebrew",
        "ja-Japanese",
        "ji-Yiddish",
        "jw-Javanese",
        "ka-Georgian",
        "kk-Kazakh",
        "kl-Greenlandic",
        "km-Cambodian",
        "kn-Kannada",
        "ko-Korean",
        "ks-Kashmiri",
        "ku-Kurdish",
        "ky-Kirghiz",
        "la-Latin",
        "ln-Lingala",
        "lo-Laothian",
        "lt-Lithuanian",
        "lv-Latvian,-Lettish",
        "mg-Malagasy",
        "mi-Maori",
        "mk-Macedonian",
        "ml-Malayalam",
        "mn-Mongolian",
        "mo-Moldavian",
        "mr-Marathi",
        "ms-Malay",
        "mt-Maltese",
        "my-Burmese",
        "na-Nauru",
        "ne-Nepali",
        "nl-Dutch",
        "no-Norwegian",
        "oc-Occitan",
        "om-(Afan)-Oromo",
        "or-Oriya",
        "pa-Punjabi",
        "pl-Polish",
        "ps-Pashto,-Pushto",
        "pt-Portuguese",
        "qu-Quechua",
        "rm-Rhaeto-Romance",
        "rn-Kirundi",
        "ro-Romanian",
        "ru-Russian",
        "rw-Kinyarwanda",
        "sa-Sanskrit",
        "sd-Sindhi",
        "sg-Sangro",
        "sh-Serbo-Croatian",
        "si-Singhalese",
        "sk-Slovak",
        "sl-Slovenian",
        "sm-Samoan",
        "sn-Shona",
        "so-Somali",
        "sq-Albanian",
        "sr-Serbian",
        "ss-Siswati",
        "st-Sesotho",
        "su-Sundanese",
        "sv-Swedish",
        "sw-Swahili",
        "ta-Tamil",
        "te-Tegulu",
        "tg-Tajik",
        "th-Thai",
        "ti-Tigrinya",
        "tk-Turkmen",
        "tl-Tagalog",
        "tn-Setswana",
        "to-Tonga",
        "tr-Turkish",
        "ts-Tsonga",
        "tt-Tatar",
        "tw-Twi",
        "uk-Ukrainian",
        "ur-Urdu",
        "uz-Uzbek",
        "vi-Vietnamese",
        "vo-Volapuk",
        "wo-Wolof",
        "xh-Xhosa",
        "yo-Yoruba",
        "zh-Chinese",
        "zu-Zulu"};

    // alpha-2 code (lower case) -> language name; filled once at class load and never reassigned
    static final HashMap<String, String> mapping = new HashMap<String, String>();

    static {
        for (final String entry : codes) {
            // the first two characters are the code, index 2 is the separating dash
            mapping.put(entry.substring(0, 2), entry.substring(3));
        }
    }

    /**
     * Bring a code into the form used as key of the lookup table.
     * A fixed locale is used on purpose: with the default locale a Turkish JVM
     * would lower-case "I" to a dotless i and valid codes like "IN" or "IT" would not be found.
     * @param code the code as given by the caller, may be null
     * @return the lower-case code, or null if code was null
     */
    private static String normalize(final String code) {
        return (code == null) ? null : code.toLowerCase(java.util.Locale.ENGLISH);
    }

    /**
     * get the name of the language that belongs to an alpha-2 language code
     * (the method name is kept for compatibility; ISO 639 describes languages, not countries)
     * @param code the mnemonic of the language in alpha-2, case-insensitive
     * @return the name of the language, or null if the code is null or unknown
     */
    public static final String country(String code) {
        final String key = normalize(code);
        return (key == null) ? null : mapping.get(key);
    }

    /**
     * see if the given alpha-2 language code exists
     * @param code the mnemonic of the language in alpha-2, case-insensitive
     * @return true if the code exists; false if it is null or unknown
     */
    public static final boolean exists(String code) {
        final String key = normalize(code);
        return key != null && mapping.containsKey(key);
    }
}

@ -848,7 +848,7 @@ public class yacyURL implements Serializable {
// language calculation // language calculation
public String language() { public String language() {
String language = "uk"; String language = "en";
final int pos = host.lastIndexOf("."); final int pos = host.lastIndexOf(".");
if ((pos > 0) && (host.length() - pos == 3)) language = host.substring(pos + 1).toLowerCase(); if ((pos > 0) && (host.length() - pos == 3)) language = host.substring(pos + 1).toLowerCase();
return language; return language;

Loading…
Cancel
Save