|
|
|
@ -502,17 +502,19 @@ public class Segment {
|
|
|
|
|
language = (bymetadata == null) ? url.language() : bymetadata;
|
|
|
|
|
} else {
|
|
|
|
|
if (bymetadata == null) {
|
|
|
|
|
// two possible results: compare and report conflicts
|
|
|
|
|
if (!language.equals(url.language())) {
|
|
|
|
|
// see if we have a hint in the url that the statistic was right
|
|
|
|
|
final String u = urlNormalform.toLowerCase();
|
|
|
|
|
String ISO639_country = ISO639.country(language);
|
|
|
|
|
if (u.contains("/" + language + "/") ||
|
|
|
|
|
(ISO639_country != null && u.contains("/" + ISO639.country(language).toLowerCase() + "/"))) {
|
|
|
|
|
// this is a strong hint that the statistic was in fact correct
|
|
|
|
|
} else {
|
|
|
|
|
// no confirmation using the url, use the TLD
|
|
|
|
|
language = url.language();
|
|
|
|
|
if (condenser.languageProbability() < 0.9) { // if probability of statistic is not very high, examine url
|
|
|
|
|
// two possible results: compare and report conflicts
|
|
|
|
|
if (!language.equals(url.language())) {
|
|
|
|
|
// see if we have a hint in the url that the statistic was right
|
|
|
|
|
final String u = urlNormalform.toLowerCase();
|
|
|
|
|
String ISO639_country = ISO639.country(language);
|
|
|
|
|
if (u.contains("/" + language + "/") ||
|
|
|
|
|
(ISO639_country != null && u.contains("/" + ISO639.country(language).toLowerCase() + "/"))) {
|
|
|
|
|
// this is a strong hint that the statistic was in fact correct
|
|
|
|
|
} else {
|
|
|
|
|
// no confirmation using the url, use the TLD
|
|
|
|
|
language = url.language();
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
} else {
|
|
|
|
@ -682,13 +684,12 @@ public class Segment {
|
|
|
|
|
final long storageEndTime = System.currentTimeMillis();
|
|
|
|
|
|
|
|
|
|
// STORE PAGE INDEX INTO WORD INDEX DB
|
|
|
|
|
int outlinksSame = document.inboundLinks().size();
|
|
|
|
|
int outlinksOther = document.outboundLinks().size();
|
|
|
|
|
final int urlLength = urlNormalform.length();
|
|
|
|
|
final int urlComps = MultiProtocolURL.urlComps(url.toNormalform(false)).length;
|
|
|
|
|
|
|
|
|
|
// create a word prototype which is re-used for all entries
|
|
|
|
|
if ((this.termIndex != null && storeToRWI) || searchEvent != null) {
|
|
|
|
|
final int outlinksSame = document.inboundLinks().size();
|
|
|
|
|
final int outlinksOther = document.outboundLinks().size();
|
|
|
|
|
final int urlLength = urlNormalform.length();
|
|
|
|
|
final int urlComps = MultiProtocolURL.urlComps(url.toNormalform(false)).length;
|
|
|
|
|
final int wordsintitle = CommonPattern.SPACES.split(dc_title).length; // same calculation as for CollectionSchema.title_words_val
|
|
|
|
|
final WordReferenceRow ientry = new WordReferenceRow(
|
|
|
|
|
url.hash(),
|
|
|
|
|