From 00c1535f84db648098235b99a4717d51dbaa889e Mon Sep 17 00:00:00 2001 From: orbiter Date: Sun, 21 Sep 2008 00:04:42 +0000 Subject: [PATCH] added ranking and evaluation of language type in a search the wanted language is taken from the browser user-agent string git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5192 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- build.properties | 2 +- htroot/yacy/search.java | 12 ++++- htroot/yacy/user/ysearch.java | 37 ++++++++++----- htroot/yacysearch.java | 35 ++++++++------ .../de/anomic/index/indexRWIEntryOrder.java | 34 +++++++++----- .../de/anomic/plasma/plasmaSearchEvent.java | 1 + .../de/anomic/plasma/plasmaSearchQuery.java | 7 ++- .../plasma/plasmaSearchRankingProcess.java | 2 +- source/de/anomic/plasma/plasmaWordIndex.java | 47 +++++++++++++------ source/de/anomic/tools/iso639.java | 24 ++++++++++ source/de/anomic/yacy/yacyClient.java | 2 + source/de/anomic/yacy/yacySearch.java | 20 +++++--- 12 files changed, 158 insertions(+), 65 deletions(-) diff --git a/build.properties b/build.properties index 4ed50b995..b1aba106b 100644 --- a/build.properties +++ b/build.properties @@ -3,7 +3,7 @@ javacSource=1.5 javacTarget=1.5 # Release Configuration -releaseVersion=0.601 +releaseVersion=0.602 stdReleaseFile=yacy_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz embReleaseFile=yacy_emb_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz proReleaseFile=yacy_pro_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz diff --git a/htroot/yacy/search.java b/htroot/yacy/search.java index 5ad00c3f2..1f7a545df 100644 --- a/htroot/yacy/search.java +++ b/htroot/yacy/search.java @@ -51,6 +51,7 @@ import de.anomic.server.serverObjects; import de.anomic.server.serverProfiling; import de.anomic.server.serverSwitch; import de.anomic.tools.crypt; +import de.anomic.tools.iso639; import de.anomic.xml.RSSFeed; import de.anomic.xml.RSSMessage; import de.anomic.yacy.yacyCore; @@ -86,6 +87,13 @@ public final class search { final String prefer = post.get("prefer", ""); final String contentdom = post.get("contentdom", "text"); final String filter = post.get("filter", ".*"); + String language = post.get("language", ""); + if (!iso639.exists(language)) { + // take language from the user agent + String agent = header.get("User-Agent"); + if (agent == null) agent = System.getProperty("user.language"); + language = (agent == null) ? "en" : iso639.userAgentLanguageDetection(agent); + } final int partitions = post.getInt("partitions", 30); String profile = post.get("profile", ""); // remote profile hand-over if (profile.length() > 0) profile = crypt.simpleDecode(profile, null); @@ -174,7 +182,7 @@ public final class search { plasmaSearchEvent theSearch = null; if ((query.length() == 0) && (abstractSet != null)) { // this is _not_ a normal search, only a request for index abstracts - theQuery = new plasmaSearchQuery(null, abstractSet, new TreeSet(kelondroBase64Order.enhancedComparator), rankingProfile, maxdist, prefer, plasmaSearchQuery.contentdomParser(contentdom), false, count, 0, filter, plasmaSearchQuery.SEARCHDOM_LOCAL, null, -1, null, false, yacyURL.TLD_any_zone_filter, client, false); + theQuery = new plasmaSearchQuery(null, abstractSet, new TreeSet(kelondroBase64Order.enhancedComparator), rankingProfile, maxdist, prefer, plasmaSearchQuery.contentdomParser(contentdom), language, false, count, 0, filter, plasmaSearchQuery.SEARCHDOM_LOCAL, null, -1, null, false, yacyURL.TLD_any_zone_filter, client, false); theQuery.domType = plasmaSearchQuery.SEARCHDOM_LOCAL; yacyCore.log.logInfo("INIT HASH SEARCH (abstracts only): " + plasmaSearchQuery.anonymizedQueryHashes(theQuery.queryHashes) + " - " + theQuery.displayResults() + " links"); @@ -200,7 +208,7 @@ public final class search { } else { // retrieve index containers from search request - theQuery = new plasmaSearchQuery(null, queryhashes, excludehashes, rankingProfile, maxdist, prefer, plasmaSearchQuery.contentdomParser(contentdom), false, count, 0, filter, plasmaSearchQuery.SEARCHDOM_LOCAL, null, -1, constraint, false, yacyURL.TLD_any_zone_filter, client, false); + theQuery = new plasmaSearchQuery(null, queryhashes, excludehashes, rankingProfile, maxdist, prefer, plasmaSearchQuery.contentdomParser(contentdom), language, false, count, 0, filter, plasmaSearchQuery.SEARCHDOM_LOCAL, null, -1, constraint, false, yacyURL.TLD_any_zone_filter, client, false); theQuery.domType = plasmaSearchQuery.SEARCHDOM_LOCAL; yacyCore.log.logInfo("INIT HASH SEARCH (query-" + abstracts + "): " + plasmaSearchQuery.anonymizedQueryHashes(theQuery.queryHashes) + " - " + theQuery.displayResults() + " links"); RSSFeed.channels(RSSFeed.REMOTESEARCH).addMessage(new RSSMessage("Remote Search Request from " + ((remoteSeed == null) ? "unknown" : remoteSeed.getName()), plasmaSearchQuery.anonymizedQueryHashes(theQuery.queryHashes), "")); diff --git a/htroot/yacy/user/ysearch.java b/htroot/yacy/user/ysearch.java index b06991969..4a1d57c04 100644 --- a/htroot/yacy/user/ysearch.java +++ b/htroot/yacy/user/ysearch.java @@ -40,6 +40,7 @@ import de.anomic.server.serverCore; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; import de.anomic.server.logging.serverLog; +import de.anomic.tools.iso639; import de.anomic.tools.yFormatter; import de.anomic.xml.RSSFeed; import de.anomic.xml.RSSMessage; @@ -66,6 +67,7 @@ public class ysearch { // get query String querystring = (post == null) ? "" : post.get("search", "").trim(); + boolean fetchSnippets = (post != null && post.get("verify", "false").equals("true")); final serverObjects prop = new serverObjects(); final boolean rss = (post == null) ? false : post.get("rss", "false").equals("true"); @@ -132,6 +134,11 @@ public class ysearch { final int domainzone = (post == null ? yacyURL.TLD_any_zone_filter : post.getInt("zone", yacyURL.TLD_any_zone_filter)); + // find out language of the user by reading of the user-agent string + String agent = header.get("User-Agent"); + if (agent == null) agent = System.getProperty("user.language"); + String language = (agent == null) ? "en" : iso639.userAgentLanguageDetection(agent); + // SEARCH //final boolean indexDistributeGranted = sb.getConfig(plasmaSwitchboard.INDEX_DIST_ALLOW, "true").equals("true"); //final boolean indexReceiveGranted = sb.getConfig("allowReceiveIndex", "true").equals("true"); @@ -152,18 +159,23 @@ public class ysearch { TreeSet trackerHandles = sb.localSearchTracker.get(client); if (trackerHandles == null) trackerHandles = new TreeSet(); boolean block = false; - if (trackerHandles.tailSet(Long.valueOf(System.currentTimeMillis() - 3000)).size() > 1) try { - Thread.sleep(3000); - block = true; - } catch (final InterruptedException e) { e.printStackTrace(); } - if (trackerHandles.tailSet(Long.valueOf(System.currentTimeMillis() - 60000)).size() > 12) try { - Thread.sleep(10000); - block = true; - } catch (final InterruptedException e) { e.printStackTrace(); } - if (trackerHandles.tailSet(Long.valueOf(System.currentTimeMillis() - 600000)).size() > 36) try { - Thread.sleep(30000); - block = true; - } catch (final InterruptedException e) { e.printStackTrace(); } + if (global || fetchSnippets) { + // in case that we do a global search or we want to fetch snippets, we check for DoS cases + if (trackerHandles.tailSet(Long.valueOf(System.currentTimeMillis() - 3000)).size() > 1) { + global = false; + fetchSnippets = false; + } + if (trackerHandles.tailSet(Long.valueOf(System.currentTimeMillis() - 60000)).size() > 30) { + global = false; + fetchSnippets = false; + block = true; + } + if (trackerHandles.tailSet(Long.valueOf(System.currentTimeMillis() - 600000)).size() > 100) { + global = false; + fetchSnippets = false; + block = true; + } + } if ((!block) && (post == null || post.get("cat", "href").equals("href"))) { @@ -198,6 +210,7 @@ public class ysearch { maxDistance, prefermask, contentdomCode, + language, true, itemsPerPage, offset, diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java index 168a41ff8..3b88d6fb5 100644 --- a/htroot/yacysearch.java +++ b/htroot/yacysearch.java @@ -50,6 +50,7 @@ import de.anomic.server.serverObjects; import de.anomic.server.serverProfiling; import de.anomic.server.serverSwitch; import de.anomic.server.logging.serverLog; +import de.anomic.tools.iso639; import de.anomic.tools.yFormatter; import de.anomic.xml.RSSFeed; import de.anomic.xml.RSSMessage; @@ -76,7 +77,7 @@ public class yacysearch { // get query String querystring = (post == null) ? "" : post.get("query", post.get("search", "")).trim(); // SRU compliance - final boolean fetchSnippets = (post != null && post.get("verify", "false").equals("true")); + boolean fetchSnippets = (post != null && post.get("verify", "false").equals("true")); final serverObjects prop = new serverObjects(); final boolean rss = (post == null) ? false : post.get("rss", "false").equals("true"); @@ -137,6 +138,11 @@ public class yacysearch { constraint.set(plasmaCondenser.flag_cat_indexof, true); } + // find out language of the user by reading of the user-agent string + String agent = header.get("User-Agent"); + if (agent == null) agent = System.getProperty("user.language"); + String language = (agent == null) ? "en" : iso639.userAgentLanguageDetection(agent); + // SEARCH //final boolean indexDistributeGranted = sb.getConfig(plasmaSwitchboard.INDEX_DIST_ALLOW, "true").equals("true"); //final boolean indexReceiveGranted = sb.getConfig("allowReceiveIndex", "true").equals("true"); @@ -159,23 +165,23 @@ public class yacysearch { boolean block = false; if (global || fetchSnippets) { // in case that we do a global search or we want to fetch snippets, we check for DoS cases - if (trackerHandles.tailSet(Long.valueOf(System.currentTimeMillis() - 3000)).size() > 1) try { - Thread.sleep(3000); - block = true; - } catch (final InterruptedException e) { - e.printStackTrace(); + if (trackerHandles.tailSet(Long.valueOf(System.currentTimeMillis() - 1000)).size() > 2) { + global = false; + fetchSnippets = false; + } + if (trackerHandles.tailSet(Long.valueOf(System.currentTimeMillis() - 3000)).size() > 1) { + global = false; + fetchSnippets = false; } - if (trackerHandles.tailSet(Long.valueOf(System.currentTimeMillis() - 60000)).size() > 12) try { - Thread.sleep(10000); + if (trackerHandles.tailSet(Long.valueOf(System.currentTimeMillis() - 60000)).size() > 200) { + global = false; + fetchSnippets = false; block = true; - } catch (final InterruptedException e) { - e.printStackTrace(); } - if (trackerHandles.tailSet(Long.valueOf(System.currentTimeMillis() - 600000)).size() > 36) try { - Thread.sleep(30000); + if (trackerHandles.tailSet(Long.valueOf(System.currentTimeMillis() - 600000)).size() > 600) { + global = false; + fetchSnippets = false; block = true; - } catch (final InterruptedException e) { - e.printStackTrace(); } } @@ -265,6 +271,7 @@ public class yacysearch { maxDistance, prefermask, contentdomCode, + language, fetchSnippets, itemsPerPage, offset, diff --git a/source/de/anomic/index/indexRWIEntryOrder.java b/source/de/anomic/index/indexRWIEntryOrder.java index 487902faf..c0f034e42 100644 --- a/source/de/anomic/index/indexRWIEntryOrder.java +++ b/source/de/anomic/index/indexRWIEntryOrder.java @@ -44,13 +44,15 @@ public class indexRWIEntryOrder { private final plasmaSearchRankingProfile ranking; private final kelondroMScoreCluster doms; // collected for "authority" heuristic private int maxdomcount; + private String language; - public indexRWIEntryOrder(final plasmaSearchRankingProfile profile) { + public indexRWIEntryOrder(final plasmaSearchRankingProfile profile, String language) { this.min = null; this.max = null; this.ranking = profile; this.doms = new kelondroMScoreCluster(); this.maxdomcount = 0; + this.language = language; } public ArrayList normalizeWith(final indexContainer container) { @@ -134,23 +136,29 @@ public class indexRWIEntryOrder { + ((max.hitcount() == min.hitcount()) ? 0 : (((t.hitcount() - min.hitcount() ) << 8) / (max.hitcount() - min.hitcount()) ) << ranking.coeff_hitcount) + tf + ((ranking.coeff_authority > 12) ? (authority(t.urlHash()) << ranking.coeff_authority) : 0) - + (((flags.get(indexRWIEntry.flag_app_dc_identifier)) ? 255 << ranking.coeff_appurl : 0)) - + (((flags.get(indexRWIEntry.flag_app_dc_title)) ? 255 << ranking.coeff_app_dc_title : 0)) - + (((flags.get(indexRWIEntry.flag_app_dc_creator)) ? 255 << ranking.coeff_app_dc_creator : 0)) - + (((flags.get(indexRWIEntry.flag_app_dc_subject)) ? 255 << ranking.coeff_app_dc_subject : 0)) - + (((flags.get(indexRWIEntry.flag_app_dc_description)) ? 255 << ranking.coeff_app_dc_description : 0)) - + (((flags.get(indexRWIEntry.flag_app_emphasized)) ? 255 << ranking.coeff_appemph : 0)) - + (((flags.get(plasmaCondenser.flag_cat_indexof)) ? 255 << ranking.coeff_catindexof : 0)) - + (((flags.get(plasmaCondenser.flag_cat_hasimage)) ? 255 << ranking.coeff_cathasimage : 0)) - + (((flags.get(plasmaCondenser.flag_cat_hasaudio)) ? 255 << ranking.coeff_cathasaudio : 0)) - + (((flags.get(plasmaCondenser.flag_cat_hasvideo)) ? 255 << ranking.coeff_cathasvideo : 0)) - + (((flags.get(plasmaCondenser.flag_cat_hasapp)) ? 255 << ranking.coeff_cathasapp : 0)) - + (((yacyURL.probablyRootURL(t.urlHash())) ? 15 << ranking.coeff_urllength : 0)); + + ((flags.get(indexRWIEntry.flag_app_dc_identifier)) ? 255 << ranking.coeff_appurl : 0) + + ((flags.get(indexRWIEntry.flag_app_dc_title)) ? 255 << ranking.coeff_app_dc_title : 0) + + ((flags.get(indexRWIEntry.flag_app_dc_creator)) ? 255 << ranking.coeff_app_dc_creator : 0) + + ((flags.get(indexRWIEntry.flag_app_dc_subject)) ? 255 << ranking.coeff_app_dc_subject : 0) + + ((flags.get(indexRWIEntry.flag_app_dc_description)) ? 255 << ranking.coeff_app_dc_description : 0) + + ((flags.get(indexRWIEntry.flag_app_emphasized)) ? 255 << ranking.coeff_appemph : 0) + + ((flags.get(plasmaCondenser.flag_cat_indexof)) ? 255 << ranking.coeff_catindexof : 0) + + ((flags.get(plasmaCondenser.flag_cat_hasimage)) ? 255 << ranking.coeff_cathasimage : 0) + + ((flags.get(plasmaCondenser.flag_cat_hasaudio)) ? 255 << ranking.coeff_cathasaudio : 0) + + ((flags.get(plasmaCondenser.flag_cat_hasvideo)) ? 255 << ranking.coeff_cathasvideo : 0) + + ((flags.get(plasmaCondenser.flag_cat_hasapp)) ? 255 << ranking.coeff_cathasapp : 0) + + ((patchUK(t.language).equals(this.language)) ? 255 << ranking.coeff_language : 0) + + ((yacyURL.probablyRootURL(t.urlHash())) ? 15 << ranking.coeff_urllength : 0); //if (searchWords != null) r += (yacyURL.probablyWordURL(t.urlHash(), searchWords) != null) ? 256 << ranking.coeff_appurl : 0; return Long.MAX_VALUE - r; // returns a reversed number: the lower the number the better the ranking. This is used for simple sorting with a TreeMap } + private static final String patchUK(String l) { + // this is to patch a bad language name setting that was used in 0.60 and before + if (l.equals("uk")) return "en"; else return l; + } + public static class minmaxfinder extends Thread { indexRWIVarEntry entryMin; diff --git a/source/de/anomic/plasma/plasmaSearchEvent.java b/source/de/anomic/plasma/plasmaSearchEvent.java index 731805f72..14b8d17de 100644 --- a/source/de/anomic/plasma/plasmaSearchEvent.java +++ b/source/de/anomic/plasma/plasmaSearchEvent.java @@ -142,6 +142,7 @@ public final class plasmaSearchEvent { "", query.prefer, query.urlMask, + query.targetlang, query.displayResults(), query.maxDistance, wordIndex, diff --git a/source/de/anomic/plasma/plasmaSearchQuery.java b/source/de/anomic/plasma/plasmaSearchQuery.java index 6fbd199c5..86936c0db 100644 --- a/source/de/anomic/plasma/plasmaSearchQuery.java +++ b/source/de/anomic/plasma/plasmaSearchQuery.java @@ -62,6 +62,7 @@ public final class plasmaSearchQuery { public String prefer; public int contentdom; public String urlMask; + public String targetlang; public int domType; public int zonecode; public int domMaxTargets; @@ -100,6 +101,7 @@ public final class plasmaSearchQuery { this.linesPerPage = lines; this.offset = 0; this.urlMask = ".*"; + this.targetlang = "en"; this.domType = SEARCHDOM_LOCAL; this.zonecode = yacyURL.TLD_any_zone_filter; this.domMaxTargets = 0; @@ -117,6 +119,7 @@ public final class plasmaSearchQuery { final TreeSet excludeHashes, final plasmaSearchRankingProfile ranking, final int maxDistance, final String prefer, final int contentdom, + final String language, final boolean onlineSnippetFetch, final int lines, final int offset, final String urlMask, final int domType, final String domGroupName, final int domMaxTargets, @@ -134,7 +137,8 @@ public final class plasmaSearchQuery { this.linesPerPage = Math.min((specialRights) ? 1000 : 10, lines); this.offset = Math.min((specialRights) ? 10000 : 100, offset); this.urlMask = urlMask; - this.domType = domType; + this.targetlang = language; + this.domType = domType; this.zonecode = domainzone; this.domMaxTargets = domMaxTargets; this.constraint = constraint; @@ -286,6 +290,7 @@ public final class plasmaSearchQuery { "*" + indexWord.word2hash(this.ranking.toExternalString()) + "*" + this.prefer + "*" + this.urlMask + + "*" + this.targetlang + "*" + this.constraint + "*" + this.maxDistance; if (anonymized) diff --git a/source/de/anomic/plasma/plasmaSearchRankingProcess.java b/source/de/anomic/plasma/plasmaSearchRankingProcess.java index 9e4304d88..676c7b6da 100644 --- a/source/de/anomic/plasma/plasmaSearchRankingProcess.java +++ b/source/de/anomic/plasma/plasmaSearchRankingProcess.java @@ -81,7 +81,7 @@ public final class plasmaSearchRankingProcess { this.stack = new kelondroSortStack(maxentries); this.doubleDomCache = new HashMap>(); this.handover = new HashMap(); - this.order = (query == null) ? null : new indexRWIEntryOrder(query.ranking); + this.order = (query == null) ? null : new indexRWIEntryOrder(query.ranking, query.targetlang); this.query = query; this.maxentries = maxentries; this.remote_peerCount = 0; diff --git a/source/de/anomic/plasma/plasmaWordIndex.java b/source/de/anomic/plasma/plasmaWordIndex.java index 1163583e1..17229b805 100644 --- a/source/de/anomic/plasma/plasmaWordIndex.java +++ b/source/de/anomic/plasma/plasmaWordIndex.java @@ -64,6 +64,7 @@ import de.anomic.kelondro.kelondroRowCollection; import de.anomic.server.serverMemory; import de.anomic.server.serverProfiling; import de.anomic.server.logging.serverLog; +import de.anomic.tools.iso639; import de.anomic.xml.RSSFeed; import de.anomic.xml.RSSMessage; import de.anomic.yacy.yacyDHTAction; @@ -815,31 +816,49 @@ public final class plasmaWordIndex implements indexRI { final long startTime = System.currentTimeMillis(); // CREATE INDEX + + // load some document metadata final String dc_title = document.dc_title(); final yacyURL referrerURL = entry.referrerURL(); final Date docDate = entry.getModificationDate(); - String language = condenser.language(); + + // do a identification of the language + String language = condenser.language(); // this is a statistical analysation of the content: will be compared with other attributes String bymetadata = document.languageByMetadata(); // the languageByMetadata may return null if there was no declaration if (language == null) { + // no statistics available, we take either the metadata (if given) or the TLD language = (bymetadata == null) ? entry.url().language() : bymetadata; System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " FAILED, taking " + ((bymetadata == null) ? "TLD" : "metadata") + ": " + language); } else { - if (language.equals("pl")) { - System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " HAS BUG: " + language); - language = (bymetadata == null) ? entry.url().language() : bymetadata; // extra handling of this case: overwrite with bymetadata - } else { - if (bymetadata == null) { - if (language.equals(entry.url().language())) - System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFIRMED - TLD IDENTICAL: " + language); - else { - System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFLICTING: " + language + " (the language given by the TLD is " + entry.url().language() + ")"); + if (bymetadata == null) { + // two possible results: compare and report conflicts + if (language.equals(entry.url().language())) + System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFIRMED - TLD IDENTICAL: " + language); + else { + String error = "*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFLICTING: " + language + " (the language given by the TLD is " + entry.url().language() + ")"; + // see if we have a hint in the url that the statistic was right + String u = entry.url().toNormalform(true, false).toLowerCase(); + if (!u.contains("/" + language + "/") && !u.contains("/" + iso639.country(language).toLowerCase() + "/")) { + // no confirmation using the url, use the TLD language = entry.url().language(); + System.out.println(error + ", corrected using the TLD"); + } else { + // this is a strong hint that the statistics was in fact correct + System.out.println(error + ", but the url proves that the statistic is correct"); } + } + } else { + // here we have three results: we can do a voting + if (language.equals(bymetadata)) { + System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFIRMED - METADATA IDENTICAL: " + language); + } else if (language.equals(entry.url().language())) { + System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFIRMED - TLD IS IDENTICAL: " + language); + } else if (bymetadata.equals(entry.url().language())) { + System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFLICTING: " + language + " BUT METADATA AND TLD ARE IDENTICAL: " + bymetadata + ")"); + language = bymetadata; } else { - if (language.equals(bymetadata)) - System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFIRMED - METADATA IDENTICAL: " + language); - else - System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFLICTING: " + language + " (the language given by metadata is " + bymetadata + ")"); + System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFLICTING: ALL DIFFERENT! statistic: " + language + ", metadata: " + bymetadata + ", TLD: + " + entry.url().language() + ". taking metadata."); + language = bymetadata; } } } diff --git a/source/de/anomic/tools/iso639.java b/source/de/anomic/tools/iso639.java index 01cb2e9c5..b72992a15 100755 --- a/source/de/anomic/tools/iso639.java +++ b/source/de/anomic/tools/iso639.java @@ -194,4 +194,28 @@ public class iso639 { return mapping.containsKey(code.toLowerCase()); } + /** + * analyse a user-agent string and return language as given in the agent string + * @param userAgent string + * @return the language code if it is possible to parse the string and find a language code or null if not + */ + public static final String userAgentLanguageDetection(String userAgent) { + if (userAgent == null || userAgent.length() < 2) return null; + userAgent = userAgent.toLowerCase(); + if (userAgent.length() == 2 && mapping.containsKey(userAgent)) return userAgent; + if (userAgent.length() == 5 && mapping.containsKey(userAgent.substring(0, 2))) return userAgent.substring(0, 2); + int p = 2; + // search for entries like ' en-' + while (p < userAgent.length() - 1 && (p = userAgent.indexOf('-', p)) > 2) { + if (userAgent.charAt(p - 3) == ' ' && mapping.containsKey(userAgent.substring(p - 2, p))) return userAgent.substring(p - 2, p); + p++; + } + // search for entries like ' en;' + p = 1; + while (p < userAgent.length() - 1 && (p = userAgent.indexOf(';', p)) > 2) { + if (userAgent.charAt(p - 3) == ' ' && mapping.containsKey(userAgent.substring(p - 2, p))) return userAgent.substring(p - 2, p); + p++; + } + return null; + } } \ No newline at end of file diff --git a/source/de/anomic/yacy/yacyClient.java b/source/de/anomic/yacy/yacyClient.java index b1a178adb..e0af3fb32 100644 --- a/source/de/anomic/yacy/yacyClient.java +++ b/source/de/anomic/yacy/yacyClient.java @@ -422,6 +422,7 @@ public final class yacyClient { final String urlhashes, final String prefer, final String filter, + final String language, final int count, final int maxDistance, final boolean global, @@ -464,6 +465,7 @@ public final class yacyClient { post.add(new DefaultCharsetStringPart("urls", urlhashes)); post.add(new DefaultCharsetStringPart("prefer", prefer)); post.add(new DefaultCharsetStringPart("filter", filter)); + post.add(new DefaultCharsetStringPart("language", language)); post.add(new DefaultCharsetStringPart("ttl", "0")); post.add(new DefaultCharsetStringPart("maxdist", Integer.toString(maxDistance))); post.add(new DefaultCharsetStringPart("profile", crypt.simpleEncode(rankingProfile.toExternalString()))); diff --git a/source/de/anomic/yacy/yacySearch.java b/source/de/anomic/yacy/yacySearch.java index 4b28e78b5..cf305f0cc 100644 --- a/source/de/anomic/yacy/yacySearch.java +++ b/source/de/anomic/yacy/yacySearch.java @@ -73,11 +73,14 @@ public class yacySearch extends Thread { private String[] urls; private final int count, maxDistance; final private plasmaSearchRankingProfile rankingProfile; - final private String prefer, filter; + final private String prefer, filter, language; final private kelondroBitfield constraint; + ResultURLs crawlResults; - public yacySearch(final String wordhashes, final String excludehashes, final String urlhashes, final String prefer, final String filter, final int count, final int maxDistance, + public yacySearch(final String wordhashes, final String excludehashes, final String urlhashes, + final String prefer, final String filter, final String language, + final int count, final int maxDistance, final boolean global, final int partitions, final yacySeed targetPeer, final plasmaWordIndex wordIndex, final ResultURLs crawlResults, final plasmaSearchRankingProcess containerCache, @@ -92,6 +95,7 @@ public class yacySearch extends Thread { this.urlhashes = urlhashes; this.prefer = prefer; this.filter = filter; + this.language = language; this.global = global; this.partitions = partitions; this.wordIndex = wordIndex; @@ -110,7 +114,7 @@ public class yacySearch extends Thread { public void run() { this.urls = yacyClient.search( wordIndex.seedDB.mySeed(), - wordhashes, excludehashes, urlhashes, prefer, filter, count, maxDistance, global, partitions, + wordhashes, excludehashes, urlhashes, prefer, filter, language, count, maxDistance, global, partitions, targetPeer, wordIndex, crawlResults, containerCache, abstractCache, blacklist, rankingProfile, constraint); if (urls != null) { @@ -276,7 +280,8 @@ public class yacySearch extends Thread { public static yacySearch[] primaryRemoteSearches( final String wordhashes, final String excludehashes, final String urlhashes, - final String prefer, final String filter, final int count, final int maxDist, + final String prefer, final String filter, String language, + final int count, final int maxDist, final plasmaWordIndex wordIndex, final ResultURLs crawlResults, final plasmaSearchRankingProcess containerCache, @@ -297,7 +302,7 @@ public class yacySearch extends Thread { final yacySearch[] searchThreads = new yacySearch[targets]; for (int i = 0; i < targets; i++) { if (targetPeers[i] == null || targetPeers[i].hash == null) continue; - searchThreads[i] = new yacySearch(wordhashes, excludehashes, urlhashes, prefer, filter, count, maxDist, true, targets, targetPeers[i], + searchThreads[i] = new yacySearch(wordhashes, excludehashes, urlhashes, prefer, filter, language, count, maxDist, true, targets, targetPeers[i], wordIndex, crawlResults, containerCache, abstractCache, blacklist, rankingProfile, constraint); searchThreads[i].start(); //try {Thread.sleep(20);} catch (InterruptedException e) {} @@ -305,7 +310,8 @@ public class yacySearch extends Thread { return searchThreads; } - public static yacySearch secondaryRemoteSearch(final String wordhashes, final String excludehashes, final String urlhashes, + public static yacySearch secondaryRemoteSearch( + final String wordhashes, final String excludehashes, final String urlhashes, final plasmaWordIndex wordIndex, final ResultURLs crawlResults, final plasmaSearchRankingProcess containerCache, @@ -319,7 +325,7 @@ public class yacySearch extends Thread { final yacySeed targetPeer = wordIndex.seedDB.getConnected(targethash); if (targetPeer == null || targetPeer.hash == null) return null; if (clusterselection != null) targetPeer.setAlternativeAddress(clusterselection.get(targetPeer.hash)); - final yacySearch searchThread = new yacySearch(wordhashes, excludehashes, urlhashes, "", "", 0, 9999, true, 0, targetPeer, + final yacySearch searchThread = new yacySearch(wordhashes, excludehashes, urlhashes, "", "", "en", 0, 9999, true, 0, targetPeer, wordIndex, crawlResults, containerCache, new TreeMap>(), blacklist, rankingProfile, constraint); searchThread.start(); return searchThread;