added ranking and evaluation of language type in a search

the wanted language is taken from the browser user-agent string

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5192 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 17 years ago
parent e201ad0e9f
commit 00c1535f84

@ -3,7 +3,7 @@ javacSource=1.5
javacTarget=1.5 javacTarget=1.5
# Release Configuration # Release Configuration
releaseVersion=0.601 releaseVersion=0.602
stdReleaseFile=yacy_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz stdReleaseFile=yacy_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
embReleaseFile=yacy_emb_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz embReleaseFile=yacy_emb_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
proReleaseFile=yacy_pro_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz proReleaseFile=yacy_pro_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz

@ -51,6 +51,7 @@ import de.anomic.server.serverObjects;
import de.anomic.server.serverProfiling; import de.anomic.server.serverProfiling;
import de.anomic.server.serverSwitch; import de.anomic.server.serverSwitch;
import de.anomic.tools.crypt; import de.anomic.tools.crypt;
import de.anomic.tools.iso639;
import de.anomic.xml.RSSFeed; import de.anomic.xml.RSSFeed;
import de.anomic.xml.RSSMessage; import de.anomic.xml.RSSMessage;
import de.anomic.yacy.yacyCore; import de.anomic.yacy.yacyCore;
@ -86,6 +87,13 @@ public final class search {
final String prefer = post.get("prefer", ""); final String prefer = post.get("prefer", "");
final String contentdom = post.get("contentdom", "text"); final String contentdom = post.get("contentdom", "text");
final String filter = post.get("filter", ".*"); final String filter = post.get("filter", ".*");
String language = post.get("language", "");
if (!iso639.exists(language)) {
// take language from the user agent
String agent = header.get("User-Agent");
if (agent == null) agent = System.getProperty("user.language");
language = (agent == null) ? "en" : iso639.userAgentLanguageDetection(agent);
}
final int partitions = post.getInt("partitions", 30); final int partitions = post.getInt("partitions", 30);
String profile = post.get("profile", ""); // remote profile hand-over String profile = post.get("profile", ""); // remote profile hand-over
if (profile.length() > 0) profile = crypt.simpleDecode(profile, null); if (profile.length() > 0) profile = crypt.simpleDecode(profile, null);
@ -174,7 +182,7 @@ public final class search {
plasmaSearchEvent theSearch = null; plasmaSearchEvent theSearch = null;
if ((query.length() == 0) && (abstractSet != null)) { if ((query.length() == 0) && (abstractSet != null)) {
// this is _not_ a normal search, only a request for index abstracts // this is _not_ a normal search, only a request for index abstracts
theQuery = new plasmaSearchQuery(null, abstractSet, new TreeSet<String>(kelondroBase64Order.enhancedComparator), rankingProfile, maxdist, prefer, plasmaSearchQuery.contentdomParser(contentdom), false, count, 0, filter, plasmaSearchQuery.SEARCHDOM_LOCAL, null, -1, null, false, yacyURL.TLD_any_zone_filter, client, false); theQuery = new plasmaSearchQuery(null, abstractSet, new TreeSet<String>(kelondroBase64Order.enhancedComparator), rankingProfile, maxdist, prefer, plasmaSearchQuery.contentdomParser(contentdom), language, false, count, 0, filter, plasmaSearchQuery.SEARCHDOM_LOCAL, null, -1, null, false, yacyURL.TLD_any_zone_filter, client, false);
theQuery.domType = plasmaSearchQuery.SEARCHDOM_LOCAL; theQuery.domType = plasmaSearchQuery.SEARCHDOM_LOCAL;
yacyCore.log.logInfo("INIT HASH SEARCH (abstracts only): " + plasmaSearchQuery.anonymizedQueryHashes(theQuery.queryHashes) + " - " + theQuery.displayResults() + " links"); yacyCore.log.logInfo("INIT HASH SEARCH (abstracts only): " + plasmaSearchQuery.anonymizedQueryHashes(theQuery.queryHashes) + " - " + theQuery.displayResults() + " links");
@ -200,7 +208,7 @@ public final class search {
} else { } else {
// retrieve index containers from search request // retrieve index containers from search request
theQuery = new plasmaSearchQuery(null, queryhashes, excludehashes, rankingProfile, maxdist, prefer, plasmaSearchQuery.contentdomParser(contentdom), false, count, 0, filter, plasmaSearchQuery.SEARCHDOM_LOCAL, null, -1, constraint, false, yacyURL.TLD_any_zone_filter, client, false); theQuery = new plasmaSearchQuery(null, queryhashes, excludehashes, rankingProfile, maxdist, prefer, plasmaSearchQuery.contentdomParser(contentdom), language, false, count, 0, filter, plasmaSearchQuery.SEARCHDOM_LOCAL, null, -1, constraint, false, yacyURL.TLD_any_zone_filter, client, false);
theQuery.domType = plasmaSearchQuery.SEARCHDOM_LOCAL; theQuery.domType = plasmaSearchQuery.SEARCHDOM_LOCAL;
yacyCore.log.logInfo("INIT HASH SEARCH (query-" + abstracts + "): " + plasmaSearchQuery.anonymizedQueryHashes(theQuery.queryHashes) + " - " + theQuery.displayResults() + " links"); yacyCore.log.logInfo("INIT HASH SEARCH (query-" + abstracts + "): " + plasmaSearchQuery.anonymizedQueryHashes(theQuery.queryHashes) + " - " + theQuery.displayResults() + " links");
RSSFeed.channels(RSSFeed.REMOTESEARCH).addMessage(new RSSMessage("Remote Search Request from " + ((remoteSeed == null) ? "unknown" : remoteSeed.getName()), plasmaSearchQuery.anonymizedQueryHashes(theQuery.queryHashes), "")); RSSFeed.channels(RSSFeed.REMOTESEARCH).addMessage(new RSSMessage("Remote Search Request from " + ((remoteSeed == null) ? "unknown" : remoteSeed.getName()), plasmaSearchQuery.anonymizedQueryHashes(theQuery.queryHashes), ""));

@ -40,6 +40,7 @@ import de.anomic.server.serverCore;
import de.anomic.server.serverObjects; import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch; import de.anomic.server.serverSwitch;
import de.anomic.server.logging.serverLog; import de.anomic.server.logging.serverLog;
import de.anomic.tools.iso639;
import de.anomic.tools.yFormatter; import de.anomic.tools.yFormatter;
import de.anomic.xml.RSSFeed; import de.anomic.xml.RSSFeed;
import de.anomic.xml.RSSMessage; import de.anomic.xml.RSSMessage;
@ -66,6 +67,7 @@ public class ysearch {
// get query // get query
String querystring = (post == null) ? "" : post.get("search", "").trim(); String querystring = (post == null) ? "" : post.get("search", "").trim();
boolean fetchSnippets = (post != null && post.get("verify", "false").equals("true"));
final serverObjects prop = new serverObjects(); final serverObjects prop = new serverObjects();
final boolean rss = (post == null) ? false : post.get("rss", "false").equals("true"); final boolean rss = (post == null) ? false : post.get("rss", "false").equals("true");
@ -132,6 +134,11 @@ public class ysearch {
final int domainzone = (post == null ? yacyURL.TLD_any_zone_filter : post.getInt("zone", yacyURL.TLD_any_zone_filter)); final int domainzone = (post == null ? yacyURL.TLD_any_zone_filter : post.getInt("zone", yacyURL.TLD_any_zone_filter));
// find out language of the user by reading of the user-agent string
String agent = header.get("User-Agent");
if (agent == null) agent = System.getProperty("user.language");
String language = (agent == null) ? "en" : iso639.userAgentLanguageDetection(agent);
// SEARCH // SEARCH
//final boolean indexDistributeGranted = sb.getConfig(plasmaSwitchboard.INDEX_DIST_ALLOW, "true").equals("true"); //final boolean indexDistributeGranted = sb.getConfig(plasmaSwitchboard.INDEX_DIST_ALLOW, "true").equals("true");
//final boolean indexReceiveGranted = sb.getConfig("allowReceiveIndex", "true").equals("true"); //final boolean indexReceiveGranted = sb.getConfig("allowReceiveIndex", "true").equals("true");
@ -152,18 +159,23 @@ public class ysearch {
TreeSet<Long> trackerHandles = sb.localSearchTracker.get(client); TreeSet<Long> trackerHandles = sb.localSearchTracker.get(client);
if (trackerHandles == null) trackerHandles = new TreeSet<Long>(); if (trackerHandles == null) trackerHandles = new TreeSet<Long>();
boolean block = false; boolean block = false;
if (trackerHandles.tailSet(Long.valueOf(System.currentTimeMillis() - 3000)).size() > 1) try { if (global || fetchSnippets) {
Thread.sleep(3000); // in case that we do a global search or we want to fetch snippets, we check for DoS cases
block = true; if (trackerHandles.tailSet(Long.valueOf(System.currentTimeMillis() - 3000)).size() > 1) {
} catch (final InterruptedException e) { e.printStackTrace(); } global = false;
if (trackerHandles.tailSet(Long.valueOf(System.currentTimeMillis() - 60000)).size() > 12) try { fetchSnippets = false;
Thread.sleep(10000); }
if (trackerHandles.tailSet(Long.valueOf(System.currentTimeMillis() - 60000)).size() > 30) {
global = false;
fetchSnippets = false;
block = true; block = true;
} catch (final InterruptedException e) { e.printStackTrace(); } }
if (trackerHandles.tailSet(Long.valueOf(System.currentTimeMillis() - 600000)).size() > 36) try { if (trackerHandles.tailSet(Long.valueOf(System.currentTimeMillis() - 600000)).size() > 100) {
Thread.sleep(30000); global = false;
fetchSnippets = false;
block = true; block = true;
} catch (final InterruptedException e) { e.printStackTrace(); } }
}
if ((!block) && (post == null || post.get("cat", "href").equals("href"))) { if ((!block) && (post == null || post.get("cat", "href").equals("href"))) {
@ -198,6 +210,7 @@ public class ysearch {
maxDistance, maxDistance,
prefermask, prefermask,
contentdomCode, contentdomCode,
language,
true, true,
itemsPerPage, itemsPerPage,
offset, offset,

@ -50,6 +50,7 @@ import de.anomic.server.serverObjects;
import de.anomic.server.serverProfiling; import de.anomic.server.serverProfiling;
import de.anomic.server.serverSwitch; import de.anomic.server.serverSwitch;
import de.anomic.server.logging.serverLog; import de.anomic.server.logging.serverLog;
import de.anomic.tools.iso639;
import de.anomic.tools.yFormatter; import de.anomic.tools.yFormatter;
import de.anomic.xml.RSSFeed; import de.anomic.xml.RSSFeed;
import de.anomic.xml.RSSMessage; import de.anomic.xml.RSSMessage;
@ -76,7 +77,7 @@ public class yacysearch {
// get query // get query
String querystring = (post == null) ? "" : post.get("query", post.get("search", "")).trim(); // SRU compliance String querystring = (post == null) ? "" : post.get("query", post.get("search", "")).trim(); // SRU compliance
final boolean fetchSnippets = (post != null && post.get("verify", "false").equals("true")); boolean fetchSnippets = (post != null && post.get("verify", "false").equals("true"));
final serverObjects prop = new serverObjects(); final serverObjects prop = new serverObjects();
final boolean rss = (post == null) ? false : post.get("rss", "false").equals("true"); final boolean rss = (post == null) ? false : post.get("rss", "false").equals("true");
@ -137,6 +138,11 @@ public class yacysearch {
constraint.set(plasmaCondenser.flag_cat_indexof, true); constraint.set(plasmaCondenser.flag_cat_indexof, true);
} }
// find out language of the user by reading of the user-agent string
String agent = header.get("User-Agent");
if (agent == null) agent = System.getProperty("user.language");
String language = (agent == null) ? "en" : iso639.userAgentLanguageDetection(agent);
// SEARCH // SEARCH
//final boolean indexDistributeGranted = sb.getConfig(plasmaSwitchboard.INDEX_DIST_ALLOW, "true").equals("true"); //final boolean indexDistributeGranted = sb.getConfig(plasmaSwitchboard.INDEX_DIST_ALLOW, "true").equals("true");
//final boolean indexReceiveGranted = sb.getConfig("allowReceiveIndex", "true").equals("true"); //final boolean indexReceiveGranted = sb.getConfig("allowReceiveIndex", "true").equals("true");
@ -159,23 +165,23 @@ public class yacysearch {
boolean block = false; boolean block = false;
if (global || fetchSnippets) { if (global || fetchSnippets) {
// in case that we do a global search or we want to fetch snippets, we check for DoS cases // in case that we do a global search or we want to fetch snippets, we check for DoS cases
if (trackerHandles.tailSet(Long.valueOf(System.currentTimeMillis() - 3000)).size() > 1) try { if (trackerHandles.tailSet(Long.valueOf(System.currentTimeMillis() - 1000)).size() > 2) {
Thread.sleep(3000); global = false;
block = true; fetchSnippets = false;
} catch (final InterruptedException e) { }
e.printStackTrace(); if (trackerHandles.tailSet(Long.valueOf(System.currentTimeMillis() - 3000)).size() > 1) {
global = false;
fetchSnippets = false;
} }
if (trackerHandles.tailSet(Long.valueOf(System.currentTimeMillis() - 60000)).size() > 12) try { if (trackerHandles.tailSet(Long.valueOf(System.currentTimeMillis() - 60000)).size() > 200) {
Thread.sleep(10000); global = false;
fetchSnippets = false;
block = true; block = true;
} catch (final InterruptedException e) {
e.printStackTrace();
} }
if (trackerHandles.tailSet(Long.valueOf(System.currentTimeMillis() - 600000)).size() > 36) try { if (trackerHandles.tailSet(Long.valueOf(System.currentTimeMillis() - 600000)).size() > 600) {
Thread.sleep(30000); global = false;
fetchSnippets = false;
block = true; block = true;
} catch (final InterruptedException e) {
e.printStackTrace();
} }
} }
@ -265,6 +271,7 @@ public class yacysearch {
maxDistance, maxDistance,
prefermask, prefermask,
contentdomCode, contentdomCode,
language,
fetchSnippets, fetchSnippets,
itemsPerPage, itemsPerPage,
offset, offset,

@ -44,13 +44,15 @@ public class indexRWIEntryOrder {
private final plasmaSearchRankingProfile ranking; private final plasmaSearchRankingProfile ranking;
private final kelondroMScoreCluster<String> doms; // collected for "authority" heuristic private final kelondroMScoreCluster<String> doms; // collected for "authority" heuristic
private int maxdomcount; private int maxdomcount;
private String language;
public indexRWIEntryOrder(final plasmaSearchRankingProfile profile) { public indexRWIEntryOrder(final plasmaSearchRankingProfile profile, String language) {
this.min = null; this.min = null;
this.max = null; this.max = null;
this.ranking = profile; this.ranking = profile;
this.doms = new kelondroMScoreCluster<String>(); this.doms = new kelondroMScoreCluster<String>();
this.maxdomcount = 0; this.maxdomcount = 0;
this.language = language;
} }
public ArrayList<indexRWIVarEntry> normalizeWith(final indexContainer container) { public ArrayList<indexRWIVarEntry> normalizeWith(final indexContainer container) {
@ -134,23 +136,29 @@ public class indexRWIEntryOrder {
+ ((max.hitcount() == min.hitcount()) ? 0 : (((t.hitcount() - min.hitcount() ) << 8) / (max.hitcount() - min.hitcount()) ) << ranking.coeff_hitcount) + ((max.hitcount() == min.hitcount()) ? 0 : (((t.hitcount() - min.hitcount() ) << 8) / (max.hitcount() - min.hitcount()) ) << ranking.coeff_hitcount)
+ tf + tf
+ ((ranking.coeff_authority > 12) ? (authority(t.urlHash()) << ranking.coeff_authority) : 0) + ((ranking.coeff_authority > 12) ? (authority(t.urlHash()) << ranking.coeff_authority) : 0)
+ (((flags.get(indexRWIEntry.flag_app_dc_identifier)) ? 255 << ranking.coeff_appurl : 0)) + ((flags.get(indexRWIEntry.flag_app_dc_identifier)) ? 255 << ranking.coeff_appurl : 0)
+ (((flags.get(indexRWIEntry.flag_app_dc_title)) ? 255 << ranking.coeff_app_dc_title : 0)) + ((flags.get(indexRWIEntry.flag_app_dc_title)) ? 255 << ranking.coeff_app_dc_title : 0)
+ (((flags.get(indexRWIEntry.flag_app_dc_creator)) ? 255 << ranking.coeff_app_dc_creator : 0)) + ((flags.get(indexRWIEntry.flag_app_dc_creator)) ? 255 << ranking.coeff_app_dc_creator : 0)
+ (((flags.get(indexRWIEntry.flag_app_dc_subject)) ? 255 << ranking.coeff_app_dc_subject : 0)) + ((flags.get(indexRWIEntry.flag_app_dc_subject)) ? 255 << ranking.coeff_app_dc_subject : 0)
+ (((flags.get(indexRWIEntry.flag_app_dc_description)) ? 255 << ranking.coeff_app_dc_description : 0)) + ((flags.get(indexRWIEntry.flag_app_dc_description)) ? 255 << ranking.coeff_app_dc_description : 0)
+ (((flags.get(indexRWIEntry.flag_app_emphasized)) ? 255 << ranking.coeff_appemph : 0)) + ((flags.get(indexRWIEntry.flag_app_emphasized)) ? 255 << ranking.coeff_appemph : 0)
+ (((flags.get(plasmaCondenser.flag_cat_indexof)) ? 255 << ranking.coeff_catindexof : 0)) + ((flags.get(plasmaCondenser.flag_cat_indexof)) ? 255 << ranking.coeff_catindexof : 0)
+ (((flags.get(plasmaCondenser.flag_cat_hasimage)) ? 255 << ranking.coeff_cathasimage : 0)) + ((flags.get(plasmaCondenser.flag_cat_hasimage)) ? 255 << ranking.coeff_cathasimage : 0)
+ (((flags.get(plasmaCondenser.flag_cat_hasaudio)) ? 255 << ranking.coeff_cathasaudio : 0)) + ((flags.get(plasmaCondenser.flag_cat_hasaudio)) ? 255 << ranking.coeff_cathasaudio : 0)
+ (((flags.get(plasmaCondenser.flag_cat_hasvideo)) ? 255 << ranking.coeff_cathasvideo : 0)) + ((flags.get(plasmaCondenser.flag_cat_hasvideo)) ? 255 << ranking.coeff_cathasvideo : 0)
+ (((flags.get(plasmaCondenser.flag_cat_hasapp)) ? 255 << ranking.coeff_cathasapp : 0)) + ((flags.get(plasmaCondenser.flag_cat_hasapp)) ? 255 << ranking.coeff_cathasapp : 0)
+ (((yacyURL.probablyRootURL(t.urlHash())) ? 15 << ranking.coeff_urllength : 0)); + ((patchUK(t.language).equals(this.language)) ? 255 << ranking.coeff_language : 0)
+ ((yacyURL.probablyRootURL(t.urlHash())) ? 15 << ranking.coeff_urllength : 0);
//if (searchWords != null) r += (yacyURL.probablyWordURL(t.urlHash(), searchWords) != null) ? 256 << ranking.coeff_appurl : 0; //if (searchWords != null) r += (yacyURL.probablyWordURL(t.urlHash(), searchWords) != null) ? 256 << ranking.coeff_appurl : 0;
return Long.MAX_VALUE - r; // returns a reversed number: the lower the number the better the ranking. This is used for simple sorting with a TreeMap return Long.MAX_VALUE - r; // returns a reversed number: the lower the number the better the ranking. This is used for simple sorting with a TreeMap
} }
/**
 * Normalizes a language tag read from an index entry.
 * Releases up to and including 0.60 stored a bad language name, "uk";
 * that value is mapped to "en" here, every other value is returned unchanged.
 *
 * @param l the language tag as stored in the entry; must not be null
 * @return "en" if the tag is "uk", otherwise the tag itself
 */
private static final String patchUK(String l) {
    // patch for the bad language name setting that was used in 0.60 and before
    return l.equals("uk") ? "en" : l;
}
public static class minmaxfinder extends Thread { public static class minmaxfinder extends Thread {
indexRWIVarEntry entryMin; indexRWIVarEntry entryMin;

@ -142,6 +142,7 @@ public final class plasmaSearchEvent {
"", "",
query.prefer, query.prefer,
query.urlMask, query.urlMask,
query.targetlang,
query.displayResults(), query.displayResults(),
query.maxDistance, query.maxDistance,
wordIndex, wordIndex,

@ -62,6 +62,7 @@ public final class plasmaSearchQuery {
public String prefer; public String prefer;
public int contentdom; public int contentdom;
public String urlMask; public String urlMask;
public String targetlang;
public int domType; public int domType;
public int zonecode; public int zonecode;
public int domMaxTargets; public int domMaxTargets;
@ -100,6 +101,7 @@ public final class plasmaSearchQuery {
this.linesPerPage = lines; this.linesPerPage = lines;
this.offset = 0; this.offset = 0;
this.urlMask = ".*"; this.urlMask = ".*";
this.targetlang = "en";
this.domType = SEARCHDOM_LOCAL; this.domType = SEARCHDOM_LOCAL;
this.zonecode = yacyURL.TLD_any_zone_filter; this.zonecode = yacyURL.TLD_any_zone_filter;
this.domMaxTargets = 0; this.domMaxTargets = 0;
@ -117,6 +119,7 @@ public final class plasmaSearchQuery {
final TreeSet<String> excludeHashes, final TreeSet<String> excludeHashes,
final plasmaSearchRankingProfile ranking, final plasmaSearchRankingProfile ranking,
final int maxDistance, final String prefer, final int contentdom, final int maxDistance, final String prefer, final int contentdom,
final String language,
final boolean onlineSnippetFetch, final boolean onlineSnippetFetch,
final int lines, final int offset, final String urlMask, final int lines, final int offset, final String urlMask,
final int domType, final String domGroupName, final int domMaxTargets, final int domType, final String domGroupName, final int domMaxTargets,
@ -134,6 +137,7 @@ public final class plasmaSearchQuery {
this.linesPerPage = Math.min((specialRights) ? 1000 : 10, lines); this.linesPerPage = Math.min((specialRights) ? 1000 : 10, lines);
this.offset = Math.min((specialRights) ? 10000 : 100, offset); this.offset = Math.min((specialRights) ? 10000 : 100, offset);
this.urlMask = urlMask; this.urlMask = urlMask;
this.targetlang = language;
this.domType = domType; this.domType = domType;
this.zonecode = domainzone; this.zonecode = domainzone;
this.domMaxTargets = domMaxTargets; this.domMaxTargets = domMaxTargets;
@ -286,6 +290,7 @@ public final class plasmaSearchQuery {
"*" + indexWord.word2hash(this.ranking.toExternalString()) + "*" + indexWord.word2hash(this.ranking.toExternalString()) +
"*" + this.prefer + "*" + this.prefer +
"*" + this.urlMask + "*" + this.urlMask +
"*" + this.targetlang +
"*" + this.constraint + "*" + this.constraint +
"*" + this.maxDistance; "*" + this.maxDistance;
if (anonymized) if (anonymized)

@ -81,7 +81,7 @@ public final class plasmaSearchRankingProcess {
this.stack = new kelondroSortStack<indexRWIVarEntry>(maxentries); this.stack = new kelondroSortStack<indexRWIVarEntry>(maxentries);
this.doubleDomCache = new HashMap<String, kelondroSortStack<indexRWIVarEntry>>(); this.doubleDomCache = new HashMap<String, kelondroSortStack<indexRWIVarEntry>>();
this.handover = new HashMap<String, String>(); this.handover = new HashMap<String, String>();
this.order = (query == null) ? null : new indexRWIEntryOrder(query.ranking); this.order = (query == null) ? null : new indexRWIEntryOrder(query.ranking, query.targetlang);
this.query = query; this.query = query;
this.maxentries = maxentries; this.maxentries = maxentries;
this.remote_peerCount = 0; this.remote_peerCount = 0;

@ -64,6 +64,7 @@ import de.anomic.kelondro.kelondroRowCollection;
import de.anomic.server.serverMemory; import de.anomic.server.serverMemory;
import de.anomic.server.serverProfiling; import de.anomic.server.serverProfiling;
import de.anomic.server.logging.serverLog; import de.anomic.server.logging.serverLog;
import de.anomic.tools.iso639;
import de.anomic.xml.RSSFeed; import de.anomic.xml.RSSFeed;
import de.anomic.xml.RSSMessage; import de.anomic.xml.RSSMessage;
import de.anomic.yacy.yacyDHTAction; import de.anomic.yacy.yacyDHTAction;
@ -815,31 +816,49 @@ public final class plasmaWordIndex implements indexRI {
final long startTime = System.currentTimeMillis(); final long startTime = System.currentTimeMillis();
// CREATE INDEX // CREATE INDEX
// load some document metadata
final String dc_title = document.dc_title(); final String dc_title = document.dc_title();
final yacyURL referrerURL = entry.referrerURL(); final yacyURL referrerURL = entry.referrerURL();
final Date docDate = entry.getModificationDate(); final Date docDate = entry.getModificationDate();
String language = condenser.language();
// do a identification of the language
String language = condenser.language(); // this is a statistical analysation of the content: will be compared with other attributes
String bymetadata = document.languageByMetadata(); // the languageByMetadata may return null if there was no declaration String bymetadata = document.languageByMetadata(); // the languageByMetadata may return null if there was no declaration
if (language == null) { if (language == null) {
// no statistics available, we take either the metadata (if given) or the TLD
language = (bymetadata == null) ? entry.url().language() : bymetadata; language = (bymetadata == null) ? entry.url().language() : bymetadata;
System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " FAILED, taking " + ((bymetadata == null) ? "TLD" : "metadata") + ": " + language); System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " FAILED, taking " + ((bymetadata == null) ? "TLD" : "metadata") + ": " + language);
} else {
if (language.equals("pl")) {
System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " HAS BUG: " + language);
language = (bymetadata == null) ? entry.url().language() : bymetadata; // extra handling of this case: overwrite with bymetadata
} else { } else {
if (bymetadata == null) { if (bymetadata == null) {
// two possible results: compare and report conflicts
if (language.equals(entry.url().language())) if (language.equals(entry.url().language()))
System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFIRMED - TLD IDENTICAL: " + language); System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFIRMED - TLD IDENTICAL: " + language);
else { else {
System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFLICTING: " + language + " (the language given by the TLD is " + entry.url().language() + ")"); String error = "*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFLICTING: " + language + " (the language given by the TLD is " + entry.url().language() + ")";
// see if we have a hint in the url that the statistic was right
String u = entry.url().toNormalform(true, false).toLowerCase();
if (!u.contains("/" + language + "/") && !u.contains("/" + iso639.country(language).toLowerCase() + "/")) {
// no confirmation using the url, use the TLD
language = entry.url().language(); language = entry.url().language();
System.out.println(error + ", corrected using the TLD");
} else {
// this is a strong hint that the statistics was in fact correct
System.out.println(error + ", but the url proves that the statistic is correct");
}
} }
} else { } else {
if (language.equals(bymetadata)) // here we have three results: we can do a voting
if (language.equals(bymetadata)) {
System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFIRMED - METADATA IDENTICAL: " + language); System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFIRMED - METADATA IDENTICAL: " + language);
else } else if (language.equals(entry.url().language())) {
System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFLICTING: " + language + " (the language given by metadata is " + bymetadata + ")"); System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFIRMED - TLD IS IDENTICAL: " + language);
} else if (bymetadata.equals(entry.url().language())) {
System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFLICTING: " + language + " BUT METADATA AND TLD ARE IDENTICAL: " + bymetadata + ")");
language = bymetadata;
} else {
System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFLICTING: ALL DIFFERENT! statistic: " + language + ", metadata: " + bymetadata + ", TLD: + " + entry.url().language() + ". taking metadata.");
language = bymetadata;
} }
} }
} }

@ -194,4 +194,28 @@ public class iso639 {
return mapping.containsKey(code.toLowerCase()); return mapping.containsKey(code.toLowerCase());
} }
/**
 * Analyse a user-agent string and return the language declared in it.
 * <p>
 * The following forms are recognized, in this order:
 * <ol>
 * <li>a bare two-letter code, e.g. "en"</li>
 * <li>a bare five-character locale, e.g. "en-us" or "en_us"</li>
 * <li>a code preceded by a space and followed by '-', e.g. "; en-US;"</li>
 * <li>a code preceded by a space and followed by ';', e.g. "; de;"</li>
 * </ol>
 * Matching is case-insensitive; a candidate is only accepted if it is a key of
 * the language mapping of this class.
 *
 * @param userAgent the user-agent string (or a plain language/locale name); may be null
 * @return the lower-case two-letter language code, or null if the string is null,
 *         shorter than two characters, or contains no recognizable language code.
 *         Callers must be prepared to handle null.
 */
public static final String userAgentLanguageDetection(String userAgent) {
    if (userAgent == null || userAgent.length() < 2) return null;
    // lower-case with a fixed locale: with the platform default a Turkish locale
    // would turn 'I' into a dotless i and codes such as "ID" would never match
    userAgent = userAgent.toLowerCase(java.util.Locale.ENGLISH);
    final int length = userAgent.length();

    // the whole string is a language code
    if (length == 2) return mapping.containsKey(userAgent) ? userAgent : null;

    // the whole string is a locale like "en-us"; require the separator so that an
    // arbitrary five-letter agent name is not mistaken for a locale
    if (length == 5 && (userAgent.charAt(2) == '-' || userAgent.charAt(2) == '_')) {
        final String prefix = userAgent.substring(0, 2);
        if (mapping.containsKey(prefix)) return prefix;
    }

    // search for entries like ' en-' first, then for entries like ' en;'
    String code = findLanguageBefore(userAgent, '-');
    if (code == null) code = findLanguageBefore(userAgent, ';');
    return code;
}

/**
 * Finds the first known two-letter language code that is preceded by a space
 * and directly followed by the given separator.
 *
 * @param text the lower-cased string to scan
 * @param separator the character that must follow the language code
 * @return the language code, or null if no such occurrence exists
 */
private static String findLanguageBefore(final String text, final char separator) {
    // a match needs three characters in front of the separator (' ' + code), so
    // start at index 3; separators further left are skipped instead of ending the scan
    for (int p = text.indexOf(separator, 3); p >= 0; p = text.indexOf(separator, p + 1)) {
        if (text.charAt(p - 3) != ' ') continue;
        final String candidate = text.substring(p - 2, p);
        if (mapping.containsKey(candidate)) return candidate;
    }
    return null;
}
} }

@ -422,6 +422,7 @@ public final class yacyClient {
final String urlhashes, final String urlhashes,
final String prefer, final String prefer,
final String filter, final String filter,
final String language,
final int count, final int count,
final int maxDistance, final int maxDistance,
final boolean global, final boolean global,
@ -464,6 +465,7 @@ public final class yacyClient {
post.add(new DefaultCharsetStringPart("urls", urlhashes)); post.add(new DefaultCharsetStringPart("urls", urlhashes));
post.add(new DefaultCharsetStringPart("prefer", prefer)); post.add(new DefaultCharsetStringPart("prefer", prefer));
post.add(new DefaultCharsetStringPart("filter", filter)); post.add(new DefaultCharsetStringPart("filter", filter));
post.add(new DefaultCharsetStringPart("language", language));
post.add(new DefaultCharsetStringPart("ttl", "0")); post.add(new DefaultCharsetStringPart("ttl", "0"));
post.add(new DefaultCharsetStringPart("maxdist", Integer.toString(maxDistance))); post.add(new DefaultCharsetStringPart("maxdist", Integer.toString(maxDistance)));
post.add(new DefaultCharsetStringPart("profile", crypt.simpleEncode(rankingProfile.toExternalString()))); post.add(new DefaultCharsetStringPart("profile", crypt.simpleEncode(rankingProfile.toExternalString())));

@ -73,11 +73,14 @@ public class yacySearch extends Thread {
private String[] urls; private String[] urls;
private final int count, maxDistance; private final int count, maxDistance;
final private plasmaSearchRankingProfile rankingProfile; final private plasmaSearchRankingProfile rankingProfile;
final private String prefer, filter; final private String prefer, filter, language;
final private kelondroBitfield constraint; final private kelondroBitfield constraint;
ResultURLs crawlResults; ResultURLs crawlResults;
public yacySearch(final String wordhashes, final String excludehashes, final String urlhashes, final String prefer, final String filter, final int count, final int maxDistance, public yacySearch(final String wordhashes, final String excludehashes, final String urlhashes,
final String prefer, final String filter, final String language,
final int count, final int maxDistance,
final boolean global, final int partitions, final yacySeed targetPeer, final plasmaWordIndex wordIndex, final boolean global, final int partitions, final yacySeed targetPeer, final plasmaWordIndex wordIndex,
final ResultURLs crawlResults, final ResultURLs crawlResults,
final plasmaSearchRankingProcess containerCache, final plasmaSearchRankingProcess containerCache,
@ -92,6 +95,7 @@ public class yacySearch extends Thread {
this.urlhashes = urlhashes; this.urlhashes = urlhashes;
this.prefer = prefer; this.prefer = prefer;
this.filter = filter; this.filter = filter;
this.language = language;
this.global = global; this.global = global;
this.partitions = partitions; this.partitions = partitions;
this.wordIndex = wordIndex; this.wordIndex = wordIndex;
@ -110,7 +114,7 @@ public class yacySearch extends Thread {
public void run() { public void run() {
this.urls = yacyClient.search( this.urls = yacyClient.search(
wordIndex.seedDB.mySeed(), wordIndex.seedDB.mySeed(),
wordhashes, excludehashes, urlhashes, prefer, filter, count, maxDistance, global, partitions, wordhashes, excludehashes, urlhashes, prefer, filter, language, count, maxDistance, global, partitions,
targetPeer, wordIndex, crawlResults, containerCache, abstractCache, targetPeer, wordIndex, crawlResults, containerCache, abstractCache,
blacklist, rankingProfile, constraint); blacklist, rankingProfile, constraint);
if (urls != null) { if (urls != null) {
@ -276,7 +280,8 @@ public class yacySearch extends Thread {
public static yacySearch[] primaryRemoteSearches( public static yacySearch[] primaryRemoteSearches(
final String wordhashes, final String excludehashes, final String urlhashes, final String wordhashes, final String excludehashes, final String urlhashes,
final String prefer, final String filter, final int count, final int maxDist, final String prefer, final String filter, String language,
final int count, final int maxDist,
final plasmaWordIndex wordIndex, final plasmaWordIndex wordIndex,
final ResultURLs crawlResults, final ResultURLs crawlResults,
final plasmaSearchRankingProcess containerCache, final plasmaSearchRankingProcess containerCache,
@ -297,7 +302,7 @@ public class yacySearch extends Thread {
final yacySearch[] searchThreads = new yacySearch[targets]; final yacySearch[] searchThreads = new yacySearch[targets];
for (int i = 0; i < targets; i++) { for (int i = 0; i < targets; i++) {
if (targetPeers[i] == null || targetPeers[i].hash == null) continue; if (targetPeers[i] == null || targetPeers[i].hash == null) continue;
searchThreads[i] = new yacySearch(wordhashes, excludehashes, urlhashes, prefer, filter, count, maxDist, true, targets, targetPeers[i], searchThreads[i] = new yacySearch(wordhashes, excludehashes, urlhashes, prefer, filter, language, count, maxDist, true, targets, targetPeers[i],
wordIndex, crawlResults, containerCache, abstractCache, blacklist, rankingProfile, constraint); wordIndex, crawlResults, containerCache, abstractCache, blacklist, rankingProfile, constraint);
searchThreads[i].start(); searchThreads[i].start();
//try {Thread.sleep(20);} catch (InterruptedException e) {} //try {Thread.sleep(20);} catch (InterruptedException e) {}
@ -305,7 +310,8 @@ public class yacySearch extends Thread {
return searchThreads; return searchThreads;
} }
public static yacySearch secondaryRemoteSearch(final String wordhashes, final String excludehashes, final String urlhashes, public static yacySearch secondaryRemoteSearch(
final String wordhashes, final String excludehashes, final String urlhashes,
final plasmaWordIndex wordIndex, final plasmaWordIndex wordIndex,
final ResultURLs crawlResults, final ResultURLs crawlResults,
final plasmaSearchRankingProcess containerCache, final plasmaSearchRankingProcess containerCache,
@ -319,7 +325,7 @@ public class yacySearch extends Thread {
final yacySeed targetPeer = wordIndex.seedDB.getConnected(targethash); final yacySeed targetPeer = wordIndex.seedDB.getConnected(targethash);
if (targetPeer == null || targetPeer.hash == null) return null; if (targetPeer == null || targetPeer.hash == null) return null;
if (clusterselection != null) targetPeer.setAlternativeAddress(clusterselection.get(targetPeer.hash)); if (clusterselection != null) targetPeer.setAlternativeAddress(clusterselection.get(targetPeer.hash));
final yacySearch searchThread = new yacySearch(wordhashes, excludehashes, urlhashes, "", "", 0, 9999, true, 0, targetPeer, final yacySearch searchThread = new yacySearch(wordhashes, excludehashes, urlhashes, "", "", "en", 0, 9999, true, 0, targetPeer,
wordIndex, crawlResults, containerCache, new TreeMap<String, TreeMap<String, String>>(), blacklist, rankingProfile, constraint); wordIndex, crawlResults, containerCache, new TreeMap<String, TreeMap<String, String>>(), blacklist, rankingProfile, constraint);
searchThread.start(); searchThread.start();
return searchThread; return searchThread;

Loading…
Cancel
Save