override detected language (statistic langdetect) only with TLD determined

language if langdetect probability is not high.
+ additionally truncate zh-cn / zh-tw returned by langdetect to 2 char ISO639-1 zh
used by YaCy
pull/44/head
reger 9 years ago
parent b65e2b527d
commit 6f0b073bf3

@ -266,6 +266,14 @@ public final class Condenser extends Tokenizer {
return this.languageIdentificator.getLanguage();
}
/**
 * Reports the probability that belongs to the detected language
 * delivered by {@link #language()}. The value is taken unchanged
 * from the language identificator of this condenser.
 * @return 0.0 to 1.0
 */
public double languageProbability() {
    final double probability = this.languageIdentificator.getProbability();
    return probability;
}
public static void main(final String[] args) {
// read a property file and convert them into configuration lines
try {

@ -55,6 +55,12 @@ public final class Identificator {
this.detector.append(" " + word); // detector internally caches text up to maxtextlen = default = 10000 chars
}
/**
* Get the detected language with highest probability
* if detection probability is above 0.3 (30%)
* The underlying detector differentiates zh-cn and zh-tw; both are returned as zh here.
* @return 2 char language code (ISO 639-1)
*/
public String getLanguage() {
try {
ArrayList<Language> probabilities = this.detector.getProbabilities();
@ -67,11 +73,25 @@ public final class Identificator {
}
// Return language only if probability is higher than 30% to account for missing language profiles
if (this.language.prob > 0.3) {
return this.language.lang;
if (this.language.lang.length() == 2)
return this.language.lang;
else
return this.language.lang.substring(0,2);
}
return null;
}
/**
 * Reports the probability stored for the detected language
 * (the language that {@link #getLanguage()} refers to).
 * @return 0.0 to 1.0; 0.0 if the internal language field is null
 */
public double getProbability() {
    // a null language field yields 0.0 instead of dereferencing it
    return (language == null) ? 0.0 : language.prob;
}
}

@ -502,17 +502,19 @@ public class Segment {
language = (bymetadata == null) ? url.language() : bymetadata;
} else {
if (bymetadata == null) {
// two possible results: compare and report conflicts
if (!language.equals(url.language())) {
// see if we have a hint in the url that the statistic was right
final String u = urlNormalform.toLowerCase();
String ISO639_country = ISO639.country(language);
if (u.contains("/" + language + "/") ||
(ISO639_country != null && u.contains("/" + ISO639.country(language).toLowerCase() + "/"))) {
// this is a strong hint that the statistics was in fact correct
} else {
// no confirmation using the url, use the TLD
language = url.language();
if (condenser.languageProbability() < 0.9) { // if probability of statistic is not very high, examine url
// two possible results: compare and report conflicts
if (!language.equals(url.language())) {
// see if we have a hint in the url that the statistic was right
final String u = urlNormalform.toLowerCase();
String ISO639_country = ISO639.country(language);
if (u.contains("/" + language + "/") ||
(ISO639_country != null && u.contains("/" + ISO639.country(language).toLowerCase() + "/"))) {
// this is a strong hint that the statistics was in fact correct
} else {
// no confirmation using the url, use the TLD
language = url.language();
}
}
}
} else {
@ -682,13 +684,12 @@ public class Segment {
final long storageEndTime = System.currentTimeMillis();
// STORE PAGE INDEX INTO WORD INDEX DB
int outlinksSame = document.inboundLinks().size();
int outlinksOther = document.outboundLinks().size();
final int urlLength = urlNormalform.length();
final int urlComps = MultiProtocolURL.urlComps(url.toNormalform(false)).length;
// create a word prototype which is re-used for all entries
if ((this.termIndex != null && storeToRWI) || searchEvent != null) {
final int outlinksSame = document.inboundLinks().size();
final int outlinksOther = document.outboundLinks().size();
final int urlLength = urlNormalform.length();
final int urlComps = MultiProtocolURL.urlComps(url.toNormalform(false)).length;
final int wordsintitle = CommonPattern.SPACES.split(dc_title).length; // same calculation as for CollectionSchema.title_words_val
final WordReferenceRow ientry = new WordReferenceRow(
url.hash(),

Loading…
Cancel
Save