diff --git a/.classpath b/.classpath index 58442a9c2..fe06ef8b2 100644 --- a/.classpath +++ b/.classpath @@ -3,119 +3,6 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -126,6 +13,16 @@ + + + + + + + + + + diff --git a/.project b/.project index 6cbb7fca6..7bc830b00 100644 --- a/.project +++ b/.project @@ -23,5 +23,6 @@ org.eclipse.jdt.core.javanature + org.apache.ivyde.eclipse.ivynature diff --git a/defaults/yacy.init b/defaults/yacy.init index bf639f6cb..a93162a3c 100644 --- a/defaults/yacy.init +++ b/defaults/yacy.init @@ -1370,16 +1370,6 @@ core.service.webgraph.tmp = false parserAugmentation = false parserAugmentation.RDFa = false -# Content control settings -contentcontrol.enabled = false -contentcontrol.bookmarklist = contentcontrol -contentcontrol.mandatoryfilterlist = yacy -contentcontrol.smwimport.enabled = false -contentcontrol.smwimport.baseurl = -contentcontrol.smwimport.purgelistoninit = true -contentcontrol.smwimport.targetlist = contentcontrol -contentcontrol.smwimport.defaultcategory = yacy - # host browser settings # Allow the administrator to stack URLs to the local crawl queue from the host browser page, automatically (when a path is unknown) or manually through a "load and index" link browser.autoload = false diff --git a/htroot/ContentControl_p.html b/htroot/ContentControl_p.html deleted file mode 100644 index b3b234248..000000000 --- a/htroot/ContentControl_p.html +++ /dev/null @@ -1,95 +0,0 @@ - - - - YaCy '#[clientname]#': Content Control - #%env/templates/metas.template%# - - - #%env/templates/header.template%# - #%env/templates/submenuBlacklist.template%# - -

Content Control

- -
- -
Peer Content Control URL Filter -

- With this settings you can activate or deactivate content control on this peer. -

- - -
- -
-
- Enabled
-

- Enables or disables content control. -

-
- - -
-
-

-

- Define a table. Default: contentcontrol -

-
-
-
-
-
-
- - - -
-
Content Control SMW Import Settings -

- With this settings you can define the content control import settings. You can define a Semantic Media Wiki with the appropriate extensions. -

- -
- -
-
- Enabled
-

- Enable or disable constant background synchronization of content control list from SMW (Semantic Mediawiki). Requires restart! -

-
- -
-
-

-

- Define base URL for SMW special page "Ask". Example: http://my.wiki.cc/wiki/Special:Ask -

-
- -
-
-

-

- Define import target table. Default: contentcontrol -

-
- -
-
- Enabled
-

- Purge content control list on initial synchronisation after startup. -

-
-
-
-
-
-
- - - #%env/templates/footer.template%# - - diff --git a/htroot/ContentControl_p.java b/htroot/ContentControl_p.java deleted file mode 100644 index e39632f15..000000000 --- a/htroot/ContentControl_p.java +++ /dev/null @@ -1,68 +0,0 @@ -import net.yacy.cora.protocol.RequestHeader; -import net.yacy.server.serverObjects; -import net.yacy.server.serverSwitch; - -public final class ContentControl_p { - - public static serverObjects respond(@SuppressWarnings("unused") final RequestHeader header, - final serverObjects post, final serverSwitch env) { - - final serverObjects prop = new serverObjects(); - - if (post != null) { - - if (post.containsKey("contentcontrolExtraSettings")) { - - env.setConfig("contentcontrol.smwimport.baseurl", - post.get("ccsmwimporturl")); - - env.setConfig("contentcontrol.smwimport.enabled", - "on".equals(post.get("ccsmwimport")) ? true : false); - - env.setConfig("contentcontrol.smwimport.purgelistoninit", - "on".equals(post.get("ccsmwpurge")) ? true : false); - - env.setConfig("contentcontrol.smwimport.targetlist", - post.get("ccsmwimportlist")); - - - } - - if (post.containsKey("contentcontrolSettings")) { - - env.setConfig("contentcontrol.enabled", - "on".equals(post.get("contentcontrolenabled")) ? true : false); - - - env.setConfig("contentcontrol.bookmarklist", - post.get("contentcontrolbml")); - - } - - } - - - prop.putHTML("ccsmwimportlist", - env.getConfig("contentcontrol.smwimport.targetlist", "contentcontrol")); - - prop.put("ccsmwpurge_checked", env.getConfigBool( - "contentcontrol.smwimport.purgelistoninit", false) ? "1" : "0"); - - prop.putHTML("ccsmwimporturl", - env.getConfig("contentcontrol.smwimport.baseurl", "")); - - prop.put("ccsmwimport_checked", env.getConfigBool( - "contentcontrol.smwimport.enabled", false) ? "1" : "0"); - - - prop.put("contentcontrolenabled_checked", - env.getConfigBool("contentcontrol.enabled", false) ? "1" : "0"); - - prop.putHTML("contentcontrolbml", - env.getConfig("contentcontrol.bookmarklist", "")); - - // return rewrite properties - return prop; - } - -} diff --git a/htroot/env/templates/submenuBlacklist.template b/htroot/env/templates/submenuBlacklist.template index 100b5cacb..d1003f050 100644 --- a/htroot/env/templates/submenuBlacklist.template +++ b/htroot/env/templates/submenuBlacklist.template @@ -5,6 +5,5 @@
  • Blacklist Cleaner
  • Blacklist Test
  • Import/Export
  • -
  • Content Control
  • \ No newline at end of file diff --git a/ivy.xml b/ivy.xml index 1cd5276a5..3c330c96e 100644 --- a/ivy.xml +++ b/ivy.xml @@ -13,7 +13,6 @@ - @@ -94,7 +93,6 @@ - diff --git a/locales/de.lng b/locales/de.lng index 97ecf4df0..f38fa6dec 100644 --- a/locales/de.lng +++ b/locales/de.lng @@ -953,30 +953,6 @@ Duration==Dauer #ID==ID #----------------------------- -#File: ContentControl_p.html -#--------------------------- -Content Control<==Inhaltskontrolle< -Peer Content Control URL Filter==Peer Inhaltskontrolle URL Filter -With this settings you can activate or deactivate content control on this peer.==Mit dieser Einstellung kann die Inhaltskontrolle auf diesem Peer an- oder abgeschalten werden. -Use content control filtering:==Verwende Inhaltskontrollfilter: ->Enabled<==>Aktiviert< -Enables or disables content control.==Schaltet Inhaltskontrolle an- oder ab. -Use this table to create filter:==Verwenden Sie diese Tabelle, um Filter zu erzeugen: -Define a table. Default:==Definieren Sie ein Tabelle. Standardeinstellung: -Content Control SMW Import Settings==Inhaltskontrolle SMW Importeinstellungen -With this settings you can define the content control import settings. You can define a==Mit diesen Einstellungen können Sie die Importeinstellungen für die Inhaltskontrolle definieren. Definieren Sie ein -Semantic Media Wiki with the appropriate extensions.==Semantisches Media Wiki mit den passenden Erweiterungen. -SMW import to content control list:==SMW Import für die Inhalts-Kontroll-Liste: -Enable or disable constant background synchronization of content control list from SMW (Semantic Mediawiki). Requires restart!==Konstante Synchronisation der Inhalts-Kontroll-Liste vom SMW (Semantisches Medienwiki) im Hintergrund. Benötigt Neustart! -SMW import base URL:==SMW Import Basis URL: -Define base URL for SMW special page "Ask". Example: ==Definiere Basis URL für SMW Spezialseite "Ask". Beispiel: -SMW import target table:==SMW Import Ziele Tabelle: -Define import target table. Default: contentcontrol==Definieren Import Ziel Tabelle. Standardeinstellung: contentcontrol -Purge content control list on initial sync:==Verwerfe Inhalts-Kontroll-Listen bei der ersten Synchronisation: -Purge content control list on initial synchronisation after startup.==Verwerfe Inhalts-Kontroll-Listen bei der ersten Synchronisation nach dem Start. -"Submit"=="Absenden" -#----------------------------- - #File: CookieMonitorIncoming_p.html #--------------------------- diff --git a/locales/es.lng b/locales/es.lng index 82b9dabcf..a6ec220ea 100644 --- a/locales/es.lng +++ b/locales/es.lng @@ -530,13 +530,6 @@ Duration==Duración ID==ID #----------------------------- -#File: ContentControl_p.html -#--------------------------- -Content Control<==Control de contenido< ->Enabled<==>Habilitado -"Submit"=="Enviar" -#----------------------------- - #File: CookieMonitorIncoming_p.html #--------------------------- diff --git a/locales/it.lng b/locales/it.lng index e767631aa..cc0a435b9 100644 --- a/locales/it.lng +++ b/locales/it.lng @@ -510,13 +510,6 @@ Duration==Durata ID==ID #----------------------------- -#File: ContentControl_p.html -#--------------------------- -Content Control<==Controllo dei contenuti< ->Enabled<==>Abilitato -"Submit"=="Invia" -#----------------------------- - #File: CookieMonitorIncoming_p.html #--------------------------- diff --git a/locales/ja.lng b/locales/ja.lng index bf8e4ca27..8a4b6ab28 100644 --- a/locales/ja.lng +++ b/locales/ja.lng @@ -714,13 +714,6 @@ Last Deploy==最後の展開 Connection Tracking==接続の追跡 #----------------------------- -#File: ContentControl_p.html -#--------------------------- -Content Control<==コンテントの制御< -"Submit"=="確定する" -#----------------------------- - - #File: CookieMonitorIncoming_p.html #--------------------------- Incoming Cookies Monitor==着信したCookieのモニター diff --git a/locales/master.lng.xlf b/locales/master.lng.xlf index 1ca931d41..1e7c429d0 100644 --- a/locales/master.lng.xlf +++ b/locales/master.lng.xlf @@ -2429,71 +2429,6 @@ - - - - Content Control< - - - Peer Content Control URL Filter - - - With this settings you can activate or deactivate content control on this peer. - - - Use content control filtering: - - - >Enabled< - - - Enables or disables content control. - - - Use this table to create filter: - - - Define a table. Default: - - - Content Control SMW Import Settings - - - With this settings you can define the content control import settings. You can define a - - - Semantic Media Wiki with the appropriate extensions. - - - SMW import to content control list: - - - Enable or disable constant background synchronization of content control list from SMW (Semantic Mediawiki). Requires restart! - - - SMW import base URL: - - - Define base URL for SMW special page "Ask". Example: - - - SMW import target table: - - - Define import target table. Default: contentcontrol - - - Purge content control list on initial sync: - - - Purge content control list on initial synchronisation after startup. - - - "Submit" - - - - diff --git a/locales/ru.lng b/locales/ru.lng index d98dbe29e..f4af9dafc 100644 --- a/locales/ru.lng +++ b/locales/ru.lng @@ -1059,30 +1059,6 @@ Duration==Длительность #ID==ID #----------------------------- -#File: ContentControl_p.html -#--------------------------- -Content Control<==Управление контентом< -Peer Content Control URL Filter==Управление контентом узла -With this settings you can activate or deactivate content control on this peer.==Эти настройки позволяют включить или отключить управление контентом для вашего узла. -Use content control filtering:==Использовать фильтр управления контентом: ->Enabled<==>Включить< -Enables or disables content control.==Включение или отключение управления контентом. -Use this table to create filter:==Использовать это поле для создания фильтра: -Define a table. Default:==Задать значение поля. По-умолчанию: -Content Control SMW Import Settings==Импорт настроек управления контентом SMW -With this settings you can define the content control import settings. You can define a==Эти настройки позволяют задать параметры импорта настроек управления контентом -Semantic Media Wiki with the appropriate extensions.==Semantic Media Wiki с соответствующими расширениями. -SMW import to content control list:== Импорт SMW в список управления контентом: -Enable or disable constant background synchronization of content control list from SMW (Semantic Mediawiki). Requires restart!==Включение или отключение постоянной фоновой синхронизации списка управления контентом из SMW (Semantic Mediawiki). Потребуется перезапуск программы! -SMW import base URL:==Ссылка на импортируемую базу SMW: -Define base URL for SMW special page "Ask". Example: ==Укажите ссылку на базу SMW на специальной странице "Ask". Например: -SMW import target table:==Поле назначения импорта SMW: -Define import target table. Default: contentcontrol==Укажите поле назначения импорта. По-умолчанию: contentcontrol -Purge content control list on initial sync:==Удалить список управления контентом в начале синхронизации: -Purge content control list on initial synchronisation after startup.==Удалить список управления контентом в начале синхронизации после запуска программы. -"Submit"=="Сохранить" -#----------------------------- - #File: CookieMonitorIncoming_p.html #--------------------------- diff --git a/locales/zh.lng b/locales/zh.lng index e1c7d5262..8f78c26e6 100644 --- a/locales/zh.lng +++ b/locales/zh.lng @@ -1033,31 +1033,6 @@ For minTokenLen = 2 the quantRate value should not be below 0.24; for minTokenLe The quantRate is a measurement for the number of words that take part in a signature computation. The higher the number==quantRate是参与签名计算的单词数量的度量。 数字越高,越少 #----------------------------- -#File: ContentControl_p.html -#--------------------------- -Content Control<==内容控制< -Peer Content Control URL Filter==节点内容控制地址过滤器 -With this settings you can activate or deactivate content control on this peer==使用此设置,你可以激活或取消激活此YaCy节点上的内容控制 -Use content control filtering:==使用内容控制过滤: ->Enabled<==>已启用< -Enables or disables content control==启用或禁用内容控制 -Use this table to create filter:==使用此表创建过滤器: -Define a table. Default:==定义一个表格. 默认: -Content Control SMW Import Settings==内容控制SMW导入设置 -With this settings you can define the content control import settings. You can define a==使用此设置,你可以定义内容控制导入设置. 你可以定义一个 -Semantic Media Wiki with the appropriate extensions==语义媒体百科与适当的扩展 -SMW import to content control list:==SMW导入到内容控制列表: -Enable or disable constant background synchronization of content control list from SMW (Semantic Mediawiki). Requires restart!==启用或禁用来自SMW(Semantic Mediawiki)的内容控制列表的恒定后台同步。 需要重启! -SMW import base URL:==SMW导入基URL: -Define base URL for SMW special page "Ask". Example: ==为SMW特殊页面“Ask”定义基础地址.例: -SMW import target table:==SMW导入目标表: -Define import target table. Default: contentcontrol==定义导入目标表. 默认值:contentcontrol -Purge content control list on initial sync:==在初始同步时清除内容控制列表: -Purge content control list on initial synchronisation after startup.==重启后,清除初始同步的内容控制列表. -"Submit"=="提交" -Define base URL for SMW special page "Ask". Example:==为SMW特殊页面“Ask”定义基础地址.例: -#----------------------------- - #File: ContentIntegrationPHPBB3_p.html #--------------------------- Content Integration: Retrieval from phpBB3 Databases==内容集成: 从phpBB3数据库中导入 diff --git a/source/net/yacy/contentcontrol/ContentControlFilterUpdateThread.java b/source/net/yacy/contentcontrol/ContentControlFilterUpdateThread.java deleted file mode 100644 index 9a3d12e2f..000000000 --- a/source/net/yacy/contentcontrol/ContentControlFilterUpdateThread.java +++ /dev/null @@ -1,90 +0,0 @@ -package net.yacy.contentcontrol; - -import java.io.IOException; -import java.util.Iterator; - -import net.yacy.kelondro.blob.Tables; -import net.yacy.kelondro.blob.Tables.Row; -import net.yacy.repository.FilterEngine; -import net.yacy.search.Switchboard; - -public class ContentControlFilterUpdateThread implements Runnable { - - private final Switchboard sb; - - private Boolean locked = false; - - private static FilterEngine networkfilter; - - public ContentControlFilterUpdateThread(final Switchboard sb) { - - this.sb = sb; - - } - - @Override - public final void run() { - - if (!this.locked) { - - this.locked = true; - - if (this.sb.getConfigBool("contentcontrol.enabled", false) == true) { - - if (SMWListSyncThread.dirty) { - - networkfilter = updateFilter(); - - SMWListSyncThread.dirty = false; - - } - - } - - this.locked = false; - - } - - return; - } - - private static FilterEngine updateFilter () { - - FilterEngine newfilter = new FilterEngine(); - - Switchboard sb = Switchboard.getSwitchboard(); - - Iterator it; - try { - it = sb.tables.iterator(sb.getConfig("contentcontrol.bookmarklist", - "contentcontrol")); - - while (it.hasNext()) { - Row b = it.next(); - - if (!b.get("filter", "").equals("")) { - - newfilter.add(b.get("filter", ""), null); - } - } - - } catch (final IOException e) { - // TODO Auto-generated catch block - e.printStackTrace(); - } - - return newfilter; - } - - - public static FilterEngine getNetworkFilter() { - FilterEngine f = networkfilter; - - if (f != null && f.size() > 0) - return f; - - return null; - - } - -} diff --git a/source/net/yacy/contentcontrol/SMWListImporter.java b/source/net/yacy/contentcontrol/SMWListImporter.java deleted file mode 100644 index 228a5aa57..000000000 --- a/source/net/yacy/contentcontrol/SMWListImporter.java +++ /dev/null @@ -1,163 +0,0 @@ -package net.yacy.contentcontrol; - -import java.io.IOException; -import java.io.Reader; -import java.util.HashMap; -import java.util.Map.Entry; -import java.util.concurrent.ArrayBlockingQueue; - -import net.yacy.cora.util.ConcurrentLog; - -import org.json.simple.parser.ContentHandler; -import org.json.simple.parser.JSONParser; -import org.json.simple.parser.ParseException; - -public class SMWListImporter implements Runnable, ContentHandler{ - - // Importer Variables - private final ArrayBlockingQueue listEntries; - private final Reader importFile; - - private SMWListRow row; - private final JSONParser parser; - - // Parser Variables - private final StringBuilder value; - private final StringBuilder key; - private final HashMap obj; - - private Boolean isElement; - - public SMWListImporter(final Reader importFile, final int queueSize) { - this.listEntries = new ArrayBlockingQueue(queueSize); - this.importFile = importFile; - - this.row = new SMWListRow(); - - this.parser = new JSONParser(); - - this.value = new StringBuilder(128); - this.key = new StringBuilder(16); - this.obj = new HashMap(); - - this.isElement = false; - - } - - @Override - public void startJSON() throws ParseException, IOException { - } - - @Override - public void endJSON() throws ParseException, IOException { - } - - @Override - public boolean startArray() throws ParseException, IOException { - final String key = this.key.toString(); - - if (key.equals("items")) { - - this.isElement = true; - - } - return true; - } - - @Override - public boolean endArray() throws ParseException, IOException { - - return true; - } - - @Override - public boolean startObject() throws ParseException, IOException { - - return true; - } - - @Override - public boolean endObject() throws ParseException, IOException { - - if(this.isElement) { - - for (Entry e: this.obj.entrySet()) { - this.row.add (e.getKey(), e.getValue()); - } - try { - this.listEntries.put(this.row); - //this.count++; - } catch (final InterruptedException e) { - ConcurrentLog.logException(e); - } - this.obj.clear(); - this.row = new SMWListRow(); - } - - return true; - } - - @Override - public boolean startObjectEntry(String key) throws ParseException, IOException { - this.key.setLength(0); - this.key.append(key); - - return true; - } - - @Override - public boolean primitive(Object value) throws ParseException, IOException { - - this.value.setLength(0); - if(value instanceof java.lang.String) { - this.value.append((String)value); - } else if(value instanceof java.lang.Boolean) { - this.value.append(value); - } else if(value instanceof java.lang.Number) { - this.value.append(value); - } - - return true; - } - - @Override - public boolean endObjectEntry() throws ParseException, IOException { - - final String key = this.key.toString(); - final String value = this.value.toString(); - - this.obj.put(key, value); - - return true; - } - - @Override - public void run() { - try { - ConcurrentLog.info("SMWLISTSYNC", "Importer run()"); - this.parser.parse(this.importFile, this, true); - - } catch (final IOException e) { - ConcurrentLog.logException(e); - } catch (final ParseException e) { - ConcurrentLog.logException(e); - } finally { - - try { - ConcurrentLog.info("SMWLISTSYNC", "Importer inserted poison pill in queue"); - this.listEntries.put(SMWListRow.POISON); - } catch (final InterruptedException e) { - ConcurrentLog.logException(e); - } - } - } - - public SMWListRow take() { - try { - return this.listEntries.take(); - } catch (final InterruptedException e) { - ConcurrentLog.logException(e); - return null; - } - } -} diff --git a/source/net/yacy/contentcontrol/SMWListImporterFormatObsolete.java b/source/net/yacy/contentcontrol/SMWListImporterFormatObsolete.java deleted file mode 100644 index 85351c040..000000000 --- a/source/net/yacy/contentcontrol/SMWListImporterFormatObsolete.java +++ /dev/null @@ -1,117 +0,0 @@ -package net.yacy.contentcontrol; - -import java.io.IOException; -import java.io.Reader; -import java.util.Iterator; -import java.util.concurrent.ArrayBlockingQueue; - -import net.yacy.cora.util.ConcurrentLog; - -import org.json.simple.JSONArray; -import org.json.simple.JSONObject; -import org.json.simple.parser.JSONParser; -import org.json.simple.parser.ParseException; - -public class SMWListImporterFormatObsolete implements Runnable{ - - private final ArrayBlockingQueue listEntries; - private final Reader importFile; - private final JSONParser parser; - - public SMWListImporterFormatObsolete(final Reader importFile, final int queueSize) { - this.listEntries = new ArrayBlockingQueue(queueSize); - this.importFile = importFile; - this.parser = new JSONParser(); - - } - - - @Override - public void run() { - try { - ConcurrentLog.info("SMWLISTSYNC", "Importer run()"); - Object obj = this.parser.parse(this.importFile); - - JSONObject jsonObject = (JSONObject) obj; - - JSONArray items = (JSONArray) jsonObject.get("items"); - - @SuppressWarnings("unchecked") - Iterator iterator = items.iterator(); - while (iterator.hasNext()) { - this.parseItem (iterator.next()); - } - - } catch (final IOException e) { - ConcurrentLog.logException(e); - } catch (final ParseException e) { - ConcurrentLog.logException(e); - } finally { - - try { - ConcurrentLog.info("SMWLISTSYNC", "Importer inserted poison pill in queue"); - this.listEntries.put(SMWListRow.POISON); - } catch (final InterruptedException e) { - ConcurrentLog.logException(e); - } - } - } - - private void parseItem(JSONObject jsonObject) { - - try { - SMWListRow row = new SMWListRow(); - @SuppressWarnings("unchecked") - Iterator iterator = jsonObject.keySet().iterator(); - - while (iterator.hasNext()) { - String entryKey = iterator.next(); - - Object value = jsonObject.get (entryKey); - String valueKey = ""; - - if (value instanceof java.lang.String) { - valueKey = value.toString(); - } else if (value instanceof JSONArray) { - valueKey = jsonListAll ((JSONArray) value); - } - - row.add (entryKey, valueKey); - } - - this.listEntries.put(row); - - } catch (final Exception e) { - ConcurrentLog.info("SMWLISTSYNC", "import of entry failed"); - } - - } - - - private String jsonListAll(JSONArray value) { - String res = ""; - - @SuppressWarnings("unchecked") - Iterator iterator = value.listIterator(); - while (iterator.hasNext()) { - Object val = iterator.next(); - res += val.toString()+","; - } - - if (res.endsWith (",")) { - res = res.substring (0, res.length()-1); - } - - return res; - } - - - public SMWListRow take() { - try { - return this.listEntries.take(); - } catch (final InterruptedException e) { - ConcurrentLog.logException(e); - return null; - } - } -} diff --git a/source/net/yacy/contentcontrol/SMWListRow.java b/source/net/yacy/contentcontrol/SMWListRow.java deleted file mode 100644 index 78c085d40..000000000 --- a/source/net/yacy/contentcontrol/SMWListRow.java +++ /dev/null @@ -1,24 +0,0 @@ -package net.yacy.contentcontrol; - -import net.yacy.kelondro.blob.Tables; - -public class SMWListRow { - - private Tables.Data data; - - public static final SMWListRow POISON = new SMWListRow(); - public static final SMWListRow EMPTY = new SMWListRow(); - - public SMWListRow() { - this.data = new Tables.Data(); - } - - public void add (String key, String value) { - this.data.put(key, value); - } - - public Tables.Data getData() { - return this.data; - } - -} diff --git a/source/net/yacy/contentcontrol/SMWListSyncThread.java b/source/net/yacy/contentcontrol/SMWListSyncThread.java deleted file mode 100644 index 80cf8a0d3..000000000 --- a/source/net/yacy/contentcontrol/SMWListSyncThread.java +++ /dev/null @@ -1,201 +0,0 @@ -package net.yacy.contentcontrol; - -import java.io.IOException; -import java.io.InputStreamReader; -import java.net.MalformedURLException; -import java.net.URL; -import java.nio.charset.StandardCharsets; - -import net.yacy.cora.document.encoding.UTF8; -import net.yacy.cora.protocol.ClientIdentification; -import net.yacy.cora.protocol.http.HTTPClient; -import net.yacy.cora.util.CommonPattern; -import net.yacy.cora.util.ConcurrentLog; -import net.yacy.search.Switchboard; - -public class SMWListSyncThread implements Runnable { - - private final Switchboard sb; - private Boolean locked = false; - private String lastsync = "1900-01-01T01:00:00"; - private String currenttimestamp = "1900-01-01T01:00:00"; - private long offset = 0; - private final long limit = 500; - private long currentmax = 0; - private boolean runningjob = false; - - private String targetList; - private String parameters; - private String query; - - public static Boolean dirty = false; - - public SMWListSyncThread(final Switchboard sb, final String targetList, final String query, final String parameters, final Boolean purgeOnInit) { - this.sb = sb; - this.targetList = targetList; - this.parameters = parameters; - this.query = query; - if (purgeOnInit) { - this.sb.tables.clear(targetList); - - } - } - - private final String wikiurlify (String s) { - String ret = s; - ret = ret.replace("-", "-2D"); - ret = ret.replace("+", "-2B"); - ret = ret.replace(" ", "-20"); - ret = ret.replace("[", "-5B"); - ret = ret.replace("]", "-5D"); - ret = ret.replace(":", "-3A"); - ret = ret.replace(">", "-3E"); - ret = ret.replace("?", "-3F"); - - return ret; - } - - @Override - public final void run() { - - if (!this.locked) { - this.locked = true; - if (this.sb.getConfigBool("contentcontrol.smwimport.enabled", false) == true) { - - if (!this.runningjob) { - - // we have to count all new elements first - try { - if (!this.sb.getConfig("contentcontrol.smwimport.baseurl","").equals("")) { - URL urlCount; - - urlCount = new URL( - this.sb.getConfig( - "contentcontrol.smwimport.baseurl", - "") - + wikiurlify ("/[["+this.query+"]] [[Modification date::>" +this.lastsync+ "]]") - - + wikiurlify (this.parameters) - - + "/mainlabel%3D" - + "/offset%3D0" - + "/limit%3D200000" - + "/format%3Dystat"); - - String reply = UTF8.String(new HTTPClient(ClientIdentification.yacyInternetCrawlerAgent).GETbytes(urlCount.toString(), null, null, false)); - String overallcount = CommonPattern.COMMA.split(reply)[0]; - String lastsyncstring = CommonPattern.COMMA.split(reply)[1]; - this.currentmax = Integer.parseInt(overallcount); - - if (this.currentmax > 0) { - ConcurrentLog.info("SMWLISTSYNC", - "import job counts " - + this.currentmax - + " new elements between " - + this.lastsync + " and " - + this.currenttimestamp); - - this.currenttimestamp = this.lastsync; - - this.runningjob = true; - this.lastsync = lastsyncstring; - this.offset = 0; - } - } else { - ConcurrentLog.warn("SMWLISTSYNC", - "No SMWimport URL defined"); - } - } catch (final MalformedURLException e) { - // TODO Auto-generated catch block - e.printStackTrace(); - } catch (final IOException e) { - // TODO Auto-generated catch block - e.printStackTrace(); - } - - - } else { - - // there are new elements to be imported - ConcurrentLog.info("SMWLISTSYNC", - "importing max. " + this.limit - + " elements at " + this.offset + " of " - + this.currentmax + ", since " - + this.currenttimestamp); - URL urlImport; - try { - if (!this.sb.getConfig("contentcontrol.smwimport.baseurl","").equals("")) { - urlImport = new URL( - this.sb.getConfig( - "contentcontrol.smwimport.baseurl", - "") - + wikiurlify ("/[["+this.query+"]] [[Modification date::>" +this.currenttimestamp+ "]]") - - + wikiurlify (this.parameters) - - + "/mainlabel%3D" - + "/syntax%3Dobsolete" - + "/offset%3D" + this.offset - + "/limit%3D" + this.limit - + "/format%3Djson"); - - this.offset += this.limit; - if (this.offset > this.currentmax) { - this.runningjob = false; - } - - InputStreamReader reader = null; - try { - reader = new InputStreamReader( - urlImport.openStream(), StandardCharsets.UTF_8); - } catch (final Exception e) { - ConcurrentLog.logException(e); - this.runningjob = false; - } - - if (reader != null) { - SMWListImporterFormatObsolete smwListImporter = null; - try { - smwListImporter = new SMWListImporterFormatObsolete( - reader, 200); - } catch (final Exception e) { - // TODO: display an error message - ConcurrentLog.logException(e); - this.runningjob = false; - } - Thread t; - SMWListRow row; - t = new Thread(smwListImporter,"SMW List Importer"); - t.start(); - while ((row = smwListImporter.take()) != SMWListRow.POISON) { - if (row == SMWListRow.EMPTY) { - this.runningjob = false; - } else { - try { - this.sb.tables.insert(targetList, row.getData()); - - dirty = true; - - } catch (final Exception e) { - // TODO Auto-generated catch block - e.printStackTrace(); - } - } - } - } - } - - - } catch (final MalformedURLException e2) { - // TODO Auto-generated catch block - e2.printStackTrace(); - } - - } - this.locked = false; - } - } - return; - } - -} diff --git a/source/net/yacy/cora/federate/solr/connector/SolrServerConnector.java b/source/net/yacy/cora/federate/solr/connector/SolrServerConnector.java index de0ecc351..188bd21c0 100644 --- a/source/net/yacy/cora/federate/solr/connector/SolrServerConnector.java +++ b/source/net/yacy/cora/federate/solr/connector/SolrServerConnector.java @@ -26,31 +26,31 @@ import java.util.ArrayList; import java.util.Collection; import java.util.List; -import net.yacy.cora.federate.solr.instance.ServerShard; -import net.yacy.cora.util.ConcurrentLog; -import net.yacy.search.schema.CollectionSchema; - -import org.apache.solr.common.SolrDocumentList; -import org.apache.solr.common.SolrException; -import org.apache.solr.common.SolrInputDocument; -import org.apache.solr.common.params.ModifiableSolrParams; -import org.apache.solr.common.util.NamedList; -import org.apache.solr.client.solrj.embedded.EmbeddedSolrServer; -import org.apache.solr.client.solrj.impl.XMLResponseParser; import org.apache.solr.client.solrj.SolrClient; import org.apache.solr.client.solrj.SolrServerException; +import org.apache.solr.client.solrj.embedded.EmbeddedSolrServer; +import org.apache.solr.client.solrj.impl.XMLResponseParser; import org.apache.solr.client.solrj.request.ContentStreamUpdateRequest; import org.apache.solr.client.solrj.request.LukeRequest; import org.apache.solr.client.solrj.request.UpdateRequest; -import org.apache.solr.client.solrj.response.LukeResponse.FieldInfo; import org.apache.solr.client.solrj.response.LukeResponse; +import org.apache.solr.client.solrj.response.LukeResponse.FieldInfo; import org.apache.solr.client.solrj.response.QueryResponse; +import org.apache.solr.common.SolrDocumentList; +import org.apache.solr.common.SolrException; +import org.apache.solr.common.SolrInputDocument; import org.apache.solr.common.params.CommonParams; +import org.apache.solr.common.params.ModifiableSolrParams; +import org.apache.solr.common.util.NamedList; + +import net.yacy.cora.federate.solr.instance.ServerShard; +import net.yacy.cora.util.ConcurrentLog; +import net.yacy.search.schema.CollectionSchema; public abstract class SolrServerConnector extends AbstractSolrConnector implements SolrConnector { protected final static ConcurrentLog log = new ConcurrentLog(SolrServerConnector.class.getName()); - public final static org.apache.lucene.analysis.CharArrayMap classLoaderSynchro = new org.apache.lucene.analysis.CharArrayMap(0, true); + public final static org.apache.lucene.analysis.CharArrayMap classLoaderSynchro = new org.apache.lucene.analysis.CharArrayMap<>(0, true); // pre-instantiate this object to prevent sun.misc.Launcher$AppClassLoader deadlocks // this is a very nasty problem; solr instantiates objects dynamically which can cause deadlocks static { @@ -69,7 +69,7 @@ public abstract class SolrServerConnector extends AbstractSolrConnector implemen public SolrClient getServer() { return this.server; } - + @Override public void commit(final boolean softCommit) { if (this.server == null) return; @@ -158,8 +158,8 @@ public abstract class SolrServerConnector extends AbstractSolrConnector implemen @Override public void deleteByIds(final Collection ids) throws IOException { if (this.server == null) return; - List l = new ArrayList(); - for (String s: ids) l.add(s); + final List l = new ArrayList<>(); + for (final String s: ids) l.add(s); synchronized (this.server) { try { this.server.deleteById(l, -1); @@ -247,7 +247,7 @@ public abstract class SolrServerConnector extends AbstractSolrConnector implemen @Override public void add(final Collection solrdocs) throws IOException, SolrException { if (this.server == null) return; - for (SolrInputDocument solrdoc : solrdocs) { + for (final SolrInputDocument solrdoc : solrdocs) { if (solrdoc.containsKey("_version_")) solrdoc.setField("_version_",0L); // prevent Solr "version conflict" } synchronized (this.server) { @@ -278,8 +278,8 @@ public abstract class SolrServerConnector extends AbstractSolrConnector implemen this.server.add(solrdocs, -1); } catch (final Throwable ee) { ConcurrentLog.logException(ee); - List ids = new ArrayList(); - for (SolrInputDocument solrdoc : solrdocs) ids.add((String) solrdoc.getFieldValue(CollectionSchema.id.getSolrFieldName())); + final List ids = new ArrayList<>(); + for (final SolrInputDocument solrdoc : solrdocs) ids.add((String) solrdoc.getFieldValue(CollectionSchema.id.getSolrFieldName())); log.warn(e.getMessage() + " IDs=" + ids.toString()); throw new IOException(ee); } @@ -300,11 +300,11 @@ public abstract class SolrServerConnector extends AbstractSolrConnector implemen public SolrDocumentList getDocumentListByParams(ModifiableSolrParams params) throws IOException { if (this.server == null) throw new IOException("server disconnected"); // during the solr query we set the thread name to the query string to get more debugging info in thread dumps - String q = params.get(CommonParams.Q); - String fq = params.get(CommonParams.FQ); - String sort = params.get(CommonParams.SORT); - String fl = params.get(CommonParams.FL); - String threadname = Thread.currentThread().getName(); + final String q = params.get(CommonParams.Q); + final String fq = params.get(CommonParams.FQ); + final String sort = params.get(CommonParams.SORT); + final String fl = params.get(CommonParams.FL); + final String threadname = Thread.currentThread().getName(); QueryResponse rsp; int retry = 0; Throwable error = null; @@ -322,13 +322,13 @@ public abstract class SolrServerConnector extends AbstractSolrConnector implemen clearCaches(); // prevent further OOM if this was caused by OOM } ConcurrentLog.severe("SolrServerConnector", "Failed to query remote Solr: " + error.getMessage() + ", query:" + q + (fq == null ? "" : ", fq = " + fq)); - try {Thread.sleep(1000);} catch (InterruptedException e) {} + try {Thread.sleep(1000);} catch (final InterruptedException e) {} } throw new IOException("Error executing query", error); } - + // luke requests: these do not work for attached SolrCloud Server - + public Collection getFields() throws SolrServerException { // get all fields contained in index return getIndexBrowser(false).getFieldInfo().values(); @@ -342,10 +342,10 @@ public abstract class SolrServerConnector extends AbstractSolrConnector implemen public int getSegmentCount() { if (this.server == null) return 0; try { - LukeResponse lukeResponse = getIndexBrowser(false); - NamedList info = lukeResponse.getIndexInfo(); + final LukeResponse lukeResponse = getIndexBrowser(false); + final NamedList info = lukeResponse.getIndexInfo(); if (info == null) return 0; - Integer segmentCount = (Integer) info.get("segmentCount"); + final Integer segmentCount = (Integer) info.get("segmentCount"); if (segmentCount == null) return 1; return segmentCount.intValue(); } catch (final Throwable e) { @@ -356,31 +356,31 @@ public abstract class SolrServerConnector extends AbstractSolrConnector implemen } private int useluke = 0; // 3-value logic: 1=yes, -1=no, 0=dontknow - + @Override public long getSize() { if (this.server == null) return 0; if (this.server instanceof ServerShard) { // the server can be a single shard; we don't know here // to test that, we submit requests to bots variants - if (useluke == 1) return getSizeLukeRequest(); - if (useluke == -1) return getSizeQueryRequest(); - long ls = getSizeLukeRequest(); - long qs = getSizeQueryRequest(); + if (this.useluke == 1) return getSizeLukeRequest(); + if (this.useluke == -1) return getSizeQueryRequest(); + final long ls = getSizeLukeRequest(); + final long qs = getSizeQueryRequest(); if (ls == 0 && qs == 0) { // we don't know if this is caused by an error or not; don't change the useluke value return 0; } if (ls == qs) { - useluke = 1; + this.useluke = 1; return ls; } - useluke = -1; + this.useluke = -1; return qs; } return getSizeLukeRequest(); } - + private long getSizeQueryRequest() { if (this.server == null) return 0; try { @@ -394,13 +394,13 @@ public abstract class SolrServerConnector extends AbstractSolrConnector implemen return 0; } } - + private long getSizeLukeRequest() { if (this.server == null) return 0; try { - LukeResponse lukeResponse = getIndexBrowser(false); + final LukeResponse lukeResponse = getIndexBrowser(false); if (lukeResponse == null) return 0; - Integer numDocs = lukeResponse.getNumDocs(); + final Integer numDocs = lukeResponse.getNumDocs(); if (numDocs == null) return 0; return numDocs.longValue(); } catch (final Throwable e) { @@ -409,7 +409,7 @@ public abstract class SolrServerConnector extends AbstractSolrConnector implemen return 0; } } - + private LukeResponse getIndexBrowser(final boolean showSchema) throws SolrServerException { // get all fields contained in index final LukeRequest lukeRequest = new LukeRequest(); @@ -419,7 +419,7 @@ public abstract class SolrServerConnector extends AbstractSolrConnector implemen LukeResponse lukeResponse = null; try { lukeResponse = lukeRequest.process(this.server); - } catch (IOException e) { + } catch (final IOException e) { throw new SolrServerException(e.getMessage()); } return lukeResponse; diff --git a/source/net/yacy/cora/federate/solr/instance/EmbeddedInstance.java b/source/net/yacy/cora/federate/solr/instance/EmbeddedInstance.java index 45704e805..5f10da685 100644 --- a/source/net/yacy/cora/federate/solr/instance/EmbeddedInstance.java +++ b/source/net/yacy/cora/federate/solr/instance/EmbeddedInstance.java @@ -27,10 +27,6 @@ import java.util.Collection; import java.util.Map; import java.util.concurrent.ConcurrentHashMap; -import net.yacy.cora.document.encoding.ASCII; -import net.yacy.cora.util.ConcurrentLog; -import net.yacy.kelondro.util.MemoryControl; - import org.apache.solr.client.solrj.SolrClient; import org.apache.solr.client.solrj.embedded.EmbeddedSolrServer; import org.apache.solr.core.CoreContainer; @@ -38,17 +34,21 @@ import org.apache.solr.core.SolrCore; import com.google.common.io.Files; +import net.yacy.cora.document.encoding.ASCII; +import net.yacy.cora.util.ConcurrentLog; +import net.yacy.kelondro.util.MemoryControl; + public class EmbeddedInstance implements SolrInstance { private final static String[] confFiles = {"solrconfig.xml", "schema.xml", "stopwords.txt", "synonyms.txt", "protwords.txt", "currency.xml", "elevate.xml", "xslt/example.xsl", "xslt/json.xsl", "lang/"}; - // additional a optional solrcore.properties (or solrcore.x86.properties for 32bit systems is copied + // additional a optional solrcore.properties (or solrcore.x86.properties for 32bit systems is copied private CoreContainer coreContainer; - private String defaultCoreName; - private SolrCore defaultCore; - private SolrClient defaultCoreServer; - private File containerPath; - private Map cores; - private Map server; + private final String defaultCoreName; + private final SolrCore defaultCore; + private final SolrClient defaultCoreServer; + private final File containerPath; + private final Map cores; + private final Map server; public EmbeddedInstance(final File solr_config, final File containerPath, String givenDefaultCoreName, String[] initializeCoreNames) throws IOException { super(); @@ -56,30 +56,30 @@ public class EmbeddedInstance implements SolrInstance { this.containerPath = containerPath; // ensure that default core path exists - File defaultCorePath = new File(containerPath, givenDefaultCoreName); + final File defaultCorePath = new File(containerPath, givenDefaultCoreName); if (!defaultCorePath.exists()) defaultCorePath.mkdirs(); // migrate old conf directory - File oldConf = new File(containerPath, "conf"); - File confDir = new File(defaultCorePath, "conf"); + final File oldConf = new File(containerPath, "conf"); + final File confDir = new File(defaultCorePath, "conf"); if (oldConf.exists()) oldConf.renameTo(confDir); // migrate old data directory - File oldData = new File(containerPath, "data"); - File dataDir = new File(defaultCorePath, "data"); + final File oldData = new File(containerPath, "data"); + final File dataDir = new File(defaultCorePath, "data"); if (oldData.exists()) oldData.renameTo(dataDir); // create index subdirectory in data if it does not exist - File indexDir = new File(dataDir, "index"); + final File indexDir = new File(dataDir, "index"); if (!indexDir.exists()) indexDir.mkdirs(); // initialize the cores' configuration - for (String coreName: initializeCoreNames) { + for (final String coreName: initializeCoreNames) { initializeCoreConf(solr_config, containerPath, coreName); } // initialize the coreContainer - File configFile = new File(solr_config, "solr.xml"); // the configuration file for all cores + final File configFile = new File(solr_config, "solr.xml"); // the configuration file for all cores this.coreContainer = CoreContainer.createAndLoad(containerPath.toPath(), configFile.toPath()); // this may take indefinitely long if solr files are broken if (this.coreContainer == null) throw new IOException("cannot create core container dir = " + containerPath + ", configFile = " + configFile); @@ -94,9 +94,9 @@ public class EmbeddedInstance implements SolrInstance { this.defaultCoreServer = new EmbeddedSolrServer(this.coreContainer, this.defaultCoreName); // initialize core cache - this.cores = new ConcurrentHashMap(); + this.cores = new ConcurrentHashMap<>(); this.cores.put(this.defaultCoreName, this.defaultCore); - this.server = new ConcurrentHashMap(); + this.server = new ConcurrentHashMap<>(); this.server.put(this.defaultCoreName, this.defaultCoreServer); } @@ -113,42 +113,42 @@ public class EmbeddedInstance implements SolrInstance { private static void initializeCoreConf(final File solr_config, final File containerPath, String coreName) { // ensure that default core path exists - File corePath = new File(containerPath, coreName); + final File corePath = new File(containerPath, coreName); if (!corePath.exists()) corePath.mkdirs(); // check if core.properties exists in each path (thats new in Solr 5.0) - File core_properties = new File(corePath, "core.properties"); + final File core_properties = new File(corePath, "core.properties"); if (!core_properties.exists()) { // create the file try ( - /* Automatically closed by this try-with-resources statement */ - FileOutputStream fos = new FileOutputStream(core_properties); - ) { + /* Automatically closed by this try-with-resources statement */ + FileOutputStream fos = new FileOutputStream(core_properties); + ) { fos.write(ASCII.getBytes("name=" + coreName + "\n")); fos.write(ASCII.getBytes("shard=${shard:}\n")); fos.write(ASCII.getBytes("collection=${collection:" + coreName + "}\n")); fos.write(ASCII.getBytes("config=${solrconfig:solrconfig.xml}\n")); fos.write(ASCII.getBytes("schema=${schema:schema.xml}\n")); fos.write(ASCII.getBytes("coreNodeName=${coreNodeName:}\n")); - } catch (IOException e) { + } catch (final IOException e) { ConcurrentLog.logException(e); } } // ensure necessary subpaths exist - File conf = new File(corePath, "conf"); + final File conf = new File(corePath, "conf"); conf.mkdirs(); - File data = new File(corePath, "data"); + final File data = new File(corePath, "data"); data.mkdirs(); // (over-)write configuration into conf path File source, target; - for (String cf: confFiles) { + for (final String cf: confFiles) { source = new File(solr_config, cf); if (source.isDirectory()) { target = new File(conf, cf); target.mkdirs(); - for (String cfl: source.list()) { + for (final String cfl: source.list()) { try { Files.copy(new File(source, cfl), new File(target, cfl)); } catch (final IOException e) { @@ -168,7 +168,7 @@ public class EmbeddedInstance implements SolrInstance { // copy the solrcore.properties // for 32bit systems (os.arch name not containing '64') take the solrcore.x86.properties as solrcore.properties if exists - String os = System.getProperty("os.arch"); + final String os = System.getProperty("os.arch"); if (os.contains("64")) { source = new File(solr_config, "solrcore.properties"); } else { @@ -242,7 +242,7 @@ public class EmbeddedInstance implements SolrInstance { @Override public synchronized void close() { - for (SolrCore core: cores.values()) core.close(); + for (final SolrCore core: this.cores.values()) core.close(); if (this.coreContainer != null) try { this.coreContainer.shutdown(); this.coreContainer = null; diff --git a/source/net/yacy/crawler/CrawlStacker.java b/source/net/yacy/crawler/CrawlStacker.java index 643df9e00..3c5c7174e 100644 --- a/source/net/yacy/crawler/CrawlStacker.java +++ b/source/net/yacy/crawler/CrawlStacker.java @@ -36,7 +36,6 @@ import java.util.Set; import java.util.concurrent.BlockingQueue; import java.util.concurrent.atomic.AtomicInteger; -import net.yacy.contentcontrol.ContentControlFilterUpdateThread; import net.yacy.cora.date.ISO8601Formatter; import net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.document.encoding.UTF8; @@ -360,13 +359,13 @@ public final class CrawlStacker implements WorkflowTask{ final boolean proxy = (entry.initiator() == null || entry.initiator().length == 0 || ASCII.String(entry.initiator()).equals("------------")) && profile.handle().equals(this.crawler.defaultProxyProfile.handle()); final boolean remote = profile.handle().equals(this.crawler.defaultRemoteProfile.handle()); final boolean global = - (profile.remoteIndexing()) /* granted */ && - (entry.depth() == profile.depth()) /* leaf node */ && - //(initiatorHash.equals(yacyCore.seedDB.mySeed.hash)) /* not proxy */ && - ( - (this.peers.mySeed().isSenior()) || - (this.peers.mySeed().isPrincipal()) - ) /* qualified */; + (profile.remoteIndexing()) /* granted */ && + (entry.depth() == profile.depth()) /* leaf node */ && + //(initiatorHash.equals(yacyCore.seedDB.mySeed.hash)) /* not proxy */ && + ( + (this.peers.mySeed().isSenior()) || + (this.peers.mySeed().isPrincipal()) + ) /* qualified */; if (!local && !global && !remote && !proxy) { error = "URL '" + entry.url().toString() + "' cannot be crawled. initiator = " + ((entry.initiator() == null) ? "" : ASCII.String(entry.initiator())) + ", profile.handle = " + profile.handle(); @@ -424,7 +423,7 @@ public final class CrawlStacker implements WorkflowTask{ if (dbocc != null) { return CRAWL_REJECT_REASON_DOUBLE_IN_PREFIX + ": " + dbocc.name(); } - String urls = url.toNormalform(false); + final String urls = url.toNormalform(false); final long oldDate = this.indexSegment.getLoadTime(url.hash()); // deny urls that exceed allowed number of occurrences @@ -441,7 +440,7 @@ public final class CrawlStacker implements WorkflowTask{ if (this.log.isFine()) this.log.fine("URL '" + urlstring + "' appeared too often in result stack, a maximum of " + maxAllowedPagesPerDomain + " is allowed."); return "result stack domain counter exceeded (test by domainCount)"; } - */ + */ } //final Long oldDate = oldEntry == null ? null : oldEntry.date; @@ -453,7 +452,7 @@ public final class CrawlStacker implements WorkflowTask{ if (recrawl) { if (CrawlStacker.log.isFine()) CrawlStacker.log.fine("RE-CRAWL of URL '" + urlstring + "': this url was crawled " + - ((System.currentTimeMillis() - oldDate) / 60000 / 60 / 24) + " days ago."); + ((System.currentTimeMillis() - oldDate) / 60000 / 60 / 24) + " days ago."); } else { return CRAWL_REJECT_REASON_DOUBLE_IN_PREFIX + ": local index, recrawl rejected. Document date = " + ISO8601Formatter.FORMATTER.format(new Date(oldDate)) + " is not older than crawl profile recrawl minimum date = " @@ -574,26 +573,6 @@ public final class CrawlStacker implements WorkflowTask{ } } - if (Switchboard.getSwitchboard().getConfigBool( - "contentcontrol.enabled", false) == true) { - - if (!Switchboard.getSwitchboard() - .getConfig("contentcontrol.mandatoryfilterlist", "") - .equals("")) { - final FilterEngine f = ContentControlFilterUpdateThread.getNetworkFilter(); - if (f != null) { - if (!f.isListed(url, null)) { - - return "the url '" - + url - + "' does not belong to the network mandatory filter list"; - - } - } - } - - } - final boolean local = url.isLocal(); if (this.acceptLocalURLs && local) return null; if (this.acceptGlobalURLs && !local) return null; @@ -604,8 +583,8 @@ public final class CrawlStacker implements WorkflowTask{ //assert local == yacyURL.isLocalDomain(url.hash()); // TODO: remove the dnsResolve above! final InetAddress ia = Domains.dnsResolve(host); return (local) ? - ("the host '" + host + "' is local, but local addresses are not accepted: " + ((ia == null) ? "DNS lookup resulted in null (unknown host name)" : ia.getHostAddress())) : - ("the host '" + host + "' is global, but global addresses are not accepted: " + ((ia == null) ? "null" : ia.getHostAddress())); + ("the host '" + host + "' is local, but local addresses are not accepted: " + ((ia == null) ? "DNS lookup resulted in null (unknown host name)" : ia.getHostAddress())) : + ("the host '" + host + "' is global, but global addresses are not accepted: " + ((ia == null) ? "null" : ia.getHostAddress())); } public String urlInAcceptedDomainHash(final byte[] urlhash) { @@ -617,8 +596,8 @@ public final class CrawlStacker implements WorkflowTask{ if (this.acceptLocalURLs && local) return null; if (this.acceptGlobalURLs && !local) return null; return (local) ? - ("the urlhash '" + ASCII.String(urlhash) + "' is local, but local addresses are not accepted") : - ("the urlhash '" + ASCII.String(urlhash) + "' is global, but global addresses are not accepted"); + ("the urlhash '" + ASCII.String(urlhash) + "' is local, but local addresses are not accepted") : + ("the urlhash '" + ASCII.String(urlhash) + "' is global, but global addresses are not accepted"); } public boolean acceptLocalURLs() { diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index 62e033b34..c33924ef1 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -114,8 +114,6 @@ import com.hazelcast.config.NetworkConfig; import com.hazelcast.core.Hazelcast; import com.hazelcast.core.HazelcastInstance; -import net.yacy.contentcontrol.ContentControlFilterUpdateThread; -import net.yacy.contentcontrol.SMWListSyncThread; import net.yacy.cora.date.AbstractFormatter; import net.yacy.cora.date.GenericFormatter; import net.yacy.cora.date.ISO8601Formatter; @@ -343,8 +341,8 @@ public final class Switchboard extends serverSwitch { if (TimeoutRequest.ping(Domains.LOCALHOST, port, 500)) { throw new RuntimeException( "a server is already running on the YaCy port " - + port - + "; possibly another YaCy process has not terminated yet. Please stop YaCy before running a new instance."); + + port + + "; possibly another YaCy process has not terminated yet. Please stop YaCy before running a new instance."); } MemoryTracker.startSystemProfiling(); @@ -391,10 +389,10 @@ public final class Switchboard extends serverSwitch { final File archivePath = this.getDataPath(SwitchboardConstants.INDEX_ARCHIVE_PATH, SwitchboardConstants.INDEX_ARCHIVE_DEFAULT); this.log.config("Index Archive Path: " + archivePath.toString()); this.listsPath = - this.getDataPath(SwitchboardConstants.LISTS_PATH, SwitchboardConstants.LISTS_PATH_DEFAULT); + this.getDataPath(SwitchboardConstants.LISTS_PATH, SwitchboardConstants.LISTS_PATH_DEFAULT); this.log.config("Lists Path: " + this.listsPath.toString()); this.htDocsPath = - this.getDataPath(SwitchboardConstants.HTDOCS_PATH, SwitchboardConstants.HTDOCS_PATH_DEFAULT); + this.getDataPath(SwitchboardConstants.HTDOCS_PATH, SwitchboardConstants.HTDOCS_PATH_DEFAULT); this.log.config("HTDOCS Path: " + this.htDocsPath.toString()); this.workPath = this.getDataPath(SwitchboardConstants.WORK_PATH, SwitchboardConstants.WORK_PATH_DEFAULT); this.workPath.mkdirs(); @@ -416,17 +414,17 @@ public final class Switchboard extends serverSwitch { this.log.config("Work Path: " + this.workPath.toString()); this.dictionariesPath = - this.getDataPath( - SwitchboardConstants.DICTIONARY_SOURCE_PATH, - SwitchboardConstants.DICTIONARY_SOURCE_PATH_DEFAULT); + this.getDataPath( + SwitchboardConstants.DICTIONARY_SOURCE_PATH, + SwitchboardConstants.DICTIONARY_SOURCE_PATH_DEFAULT); this.log.config("Dictionaries Path:" + this.dictionariesPath.toString()); if (!this.dictionariesPath.exists()) this.dictionariesPath.mkdirs(); this.classificationPath = this.getDataPath( - SwitchboardConstants.CLASSIFICATION_SOURCE_PATH, - SwitchboardConstants.CLASSIFICATION_SOURCE_PATH_DEFAULT); - this.log.config("Classification Path:" + this.classificationPath.toString()); + SwitchboardConstants.CLASSIFICATION_SOURCE_PATH, + SwitchboardConstants.CLASSIFICATION_SOURCE_PATH_DEFAULT); + this.log.config("Classification Path:" + this.classificationPath.toString()); if (!this.classificationPath.exists()) this.classificationPath.mkdirs(); CollectionConfiguration.UNIQUE_HEURISTIC_PREFER_HTTPS = this.getConfigBool("search.ranking.uniqueheuristic.preferhttps", false); @@ -501,8 +499,6 @@ public final class Switchboard extends serverSwitch { // load the network definition try { this.overwriteNetworkDefinition(this.getSysinfo()); - } catch (final FileNotFoundException e) { - ConcurrentLog.logException(e); } catch (final IOException e) { ConcurrentLog.logException(e); } @@ -580,7 +576,7 @@ public final class Switchboard extends serverSwitch { String bf = this.getConfig(SwitchboardConstants.SEARCH_RANKING_SOLR_COLLECTION_BOOSTFUNCTION_ + i, ""); // apply some hard-coded patches for earlier experiments we do not want any more if (bf.equals("product(recip(rord(last_modified),1,1000,1000),div(product(log(product(references_external_i,references_exthosts_i)),div(references_internal_i,host_extent_i)),add(crawldepth_i,1)))") || - bf.equals("scale(cr_host_norm_i,1,20)")) bf = ""; + bf.equals("scale(cr_host_norm_i,1,20)")) bf = ""; if (bf.equals("recip(rord(last_modified),1,1000,1000)")) bf = "recip(ms(NOW,last_modified),3.16e-11,1,1)"; // that was an outdated date boost that did not work well if (i == 0 && bq.equals("fuzzy_signature_unique_b:true^100000.0")) bq = "crawldepth_i:0^0.8 crawldepth_i:1^0.4"; if (bq.equals("crawldepth_i:0^0.8 crawldepth_i:1^0.4")) bq = "crawldepth_i:0^0.8\ncrawldepth_i:1^0.4"; // Fix issue with multiple Boost Queries @@ -612,698 +608,677 @@ public final class Switchboard extends serverSwitch { final String solrurls = this.getConfig(SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_URL, "http://127.0.0.1:8983/solr"); final boolean usesolr = this.getConfigBool(SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_ENABLED, SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_ENABLED_DEFAULT) & solrurls.length() > 0; - final int solrtimeout = this.getConfigInt(SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_TIMEOUT, 60000); - final boolean writeEnabled = this.getConfigBool(SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_WRITEENABLED, true); - final boolean trustSelfSignedOnAuthenticatedServer = Switchboard.getSwitchboard().getConfigBool( - SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_AUTHENTICATED_ALLOW_SELF_SIGNED, - SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_AUTHENTICATED_ALLOW_SELF_SIGNED_DEFAULT); - - if (usesolr && solrurls != null && solrurls.length() > 0) { - try { - final ArrayList instances = RemoteInstance.getShardInstances(solrurls, null, null, solrtimeout, trustSelfSignedOnAuthenticatedServer); - final String shardMethodName = this.getConfig(SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_SHARDING, ShardSelection.Method.MODULO_HOST_MD5.name()); - final ShardSelection.Method shardMethod = ShardSelection.Method.valueOf(shardMethodName); - this.index.fulltext().connectRemoteSolr(instances, shardMethod, writeEnabled); - } catch (final IOException e ) { - ConcurrentLog.logException(e); - } - } + final int solrtimeout = this.getConfigInt(SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_TIMEOUT, 60000); + final boolean writeEnabled = this.getConfigBool(SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_WRITEENABLED, true); + final boolean trustSelfSignedOnAuthenticatedServer = Switchboard.getSwitchboard().getConfigBool( + SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_AUTHENTICATED_ALLOW_SELF_SIGNED, + SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_AUTHENTICATED_ALLOW_SELF_SIGNED_DEFAULT); - // initialize network database - final File mySeedFile = new File(this.networkRoot, SeedDB.DBFILE_OWN_SEED); - this.peers = - new SeedDB( - this.networkRoot, - "seed.new.heap", - "seed.old.heap", - "seed.pot.heap", - mySeedFile, - redundancy, - partitionExponent, - false, - this.exceed134217727); - final String agent = this.getConfig(SwitchboardConstants.NETWORK_UNIT_AGENT, ""); - if (!agent.isEmpty()) this.peers.setMyName(agent); // this can thus be set using the environment variable yacy.network.unit.agent - - // initialize peer scan - this.localcluster_scan = Collections.newSetFromMap(new ConcurrentHashMap<>()); - if (this.getConfigBool("scan.enabled", false)) { - new OneTimeBusyThread("Switchboard.scanForOtherYaCyInIntranet") { - @Override - public boolean jobImpl() throws Exception { - Switchboard.this.localcluster_scan.addAll(Scanner.scanForOtherYaCyInIntranet()); - return true; + if (usesolr && solrurls != null && solrurls.length() > 0) { + try { + final ArrayList instances = RemoteInstance.getShardInstances(solrurls, null, null, solrtimeout, trustSelfSignedOnAuthenticatedServer); + final String shardMethodName = this.getConfig(SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_SHARDING, ShardSelection.Method.MODULO_HOST_MD5.name()); + final ShardSelection.Method shardMethod = ShardSelection.Method.valueOf(shardMethodName); + this.index.fulltext().connectRemoteSolr(instances, shardMethod, writeEnabled); + } catch (final IOException e ) { + ConcurrentLog.logException(e); + } } - }.start(); - } - // initialize hazelcast - final InterfacesConfig interfacesConfig = new InterfacesConfig(); - Domains.myIntranetIPs().forEach(ip -> interfacesConfig.addInterface(ip.getHostAddress())); - final NetworkConfig networkConfig = new NetworkConfig().setInterfaces(interfacesConfig); - final JoinConfig join = networkConfig.getJoin(); - join.getMulticastConfig().setEnabled(true); - final Config config = new Config().setClusterName("YaCyP2P").setInstanceName("Peer").setNetworkConfig(networkConfig); - config.getCPSubsystemConfig().setCPMemberCount(3); - try { - this.localcluster_hazelcast = Hazelcast.newHazelcastInstance(config); - final String uuid = this.localcluster_hazelcast.getCluster().getLocalMember().getUuid().toString(); - this.localcluster_hazelcast.getMap("status").put(uuid, Memory.status()); - } catch (final Exception e) { - this.log.warn(e); - this.localcluster_hazelcast = null; - } + // initialize network database + final File mySeedFile = new File(this.networkRoot, SeedDB.DBFILE_OWN_SEED); + this.peers = + new SeedDB( + this.networkRoot, + "seed.new.heap", + "seed.old.heap", + "seed.pot.heap", + mySeedFile, + redundancy, + partitionExponent, + false, + this.exceed134217727); + final String agent = this.getConfig(SwitchboardConstants.NETWORK_UNIT_AGENT, ""); + if (!agent.isEmpty()) this.peers.setMyName(agent); // this can thus be set using the environment variable yacy.network.unit.agent + + // initialize peer scan + this.localcluster_scan = Collections.newSetFromMap(new ConcurrentHashMap<>()); + if (this.getConfigBool("scan.enabled", false)) { + new OneTimeBusyThread("Switchboard.scanForOtherYaCyInIntranet") { + @Override + public boolean jobImpl() throws Exception { + Switchboard.this.localcluster_scan.addAll(Scanner.scanForOtherYaCyInIntranet()); + return true; + } + }.start(); + } - // load domainList - try { - this.domainList = null; - if (!this.getConfig("network.unit.domainlist", "").equals("")) { - final Reader r = this.getConfigFileFromWebOrLocally( - this.getConfig("network.unit.domainlist", ""), - this.getAppPath().getAbsolutePath(), - new File(this.networkRoot, "domainlist.txt")); - this.domainList = new FilterEngine(); - final BufferedReader br = new BufferedReader(r); - this.domainList.loadList(br, null); - br.close(); - } - } catch (final FileNotFoundException e ) { - this.log.severe("CONFIG: domainlist not found: " + e.getMessage()); - } catch (final IOException e ) { - this.log.severe("CONFIG: error while retrieving domainlist: " + e.getMessage()); - } + // initialize hazelcast + final InterfacesConfig interfacesConfig = new InterfacesConfig(); + Domains.myIntranetIPs().forEach(ip -> interfacesConfig.addInterface(ip.getHostAddress())); + final NetworkConfig networkConfig = new NetworkConfig().setInterfaces(interfacesConfig); + final JoinConfig join = networkConfig.getJoin(); + join.getMulticastConfig().setEnabled(true); + final Config config = new Config().setClusterName("YaCyP2P").setInstanceName("Peer").setNetworkConfig(networkConfig); + config.getCPSubsystemConfig().setCPMemberCount(3); + try { + this.localcluster_hazelcast = Hazelcast.newHazelcastInstance(config); + final String uuid = this.localcluster_hazelcast.getCluster().getLocalMember().getUuid().toString(); + this.localcluster_hazelcast.getMap("status").put(uuid, Memory.status()); + } catch (final Exception e) { + this.log.warn(e); + this.localcluster_hazelcast = null; + } - // create a crawler - this.crawler = new CrawlSwitchboard(this); + // load domainList + try { + this.domainList = null; + if (!this.getConfig("network.unit.domainlist", "").equals("")) { + final Reader r = this.getConfigFileFromWebOrLocally( + this.getConfig("network.unit.domainlist", ""), + this.getAppPath().getAbsolutePath(), + new File(this.networkRoot, "domainlist.txt")); + this.domainList = new FilterEngine(); + final BufferedReader br = new BufferedReader(r); + this.domainList.loadList(br, null); + br.close(); + } + } catch (final FileNotFoundException e ) { + this.log.severe("CONFIG: domainlist not found: " + e.getMessage()); + } catch (final IOException e ) { + this.log.severe("CONFIG: error while retrieving domainlist: " + e.getMessage()); + } - // start yacy core - this.log.config("Starting YaCy Protocol Core"); - this.yc = new Network(this); - new OneTimeBusyThread("Switchboard.loadSeedLists") { + // create a crawler + this.crawler = new CrawlSwitchboard(this); - @Override - public boolean jobImpl() throws Exception { - Switchboard.this.loadSeedLists(); - return true; - } - }.start(); - //final long startedSeedListAquisition = System.currentTimeMillis(); - - // init a DHT transmission dispatcher - this.dhtDispatcher = (this.peers.sizeConnected() == 0) ? null : new Dispatcher(this, true, 10000); - - // set up local robots.txt - this.robotstxtConfig = RobotsTxtConfig.init(this); - - // setting timestamp of last proxy access - this.proxyLastAccess = System.currentTimeMillis() - 10000; - this.localSearchLastAccess = System.currentTimeMillis() - 10000; - this.remoteSearchLastAccess = System.currentTimeMillis() - 10000; - this.adminAuthenticationLastAccess = 0; // timestamp last admin authentication (as not autenticated here, stamp with 0) - this.optimizeLastRun = System.currentTimeMillis(); - this.webStructure = new WebStructureGraph(new File(this.queuesRoot, "webStructure.map")); - - // configuring list path - if ( !(this.listsPath.exists()) ) { - this.listsPath.mkdirs(); - } - - // load coloured lists - if ( blueList == null ) { - // read only once upon first instantiation of this class - final String f = - this.getConfig(SwitchboardConstants.LIST_BLUE, SwitchboardConstants.LIST_BLUE_DEFAULT); - final File plasmaBlueListFile = new File(f); - if ( f != null ) { - blueList = SetTools.loadList(plasmaBlueListFile, NaturalOrder.naturalComparator); - } else { - blueList = new TreeSet<>(); - } - this.log.config("loaded blue-list from file " - + plasmaBlueListFile.getName() - + ", " - + blueList.size() - + " entries, " - + ppRamString(plasmaBlueListFile.length() / 1024)); - } - - // load blacklist - this.log.config("Loading blacklist ..."); - final File blacklistsPath = - this.getDataPath(SwitchboardConstants.LISTS_PATH, SwitchboardConstants.LISTS_PATH_DEFAULT); - urlBlacklist = new Blacklist(blacklistsPath); - ListManager.switchboard = this; - ListManager.listsPath = blacklistsPath; - ListManager.reloadBlacklists(); - - // Set jvm default locale to match UI language ( - String lng = this.getConfig("locale.language", "en"); - if (!"browser".equals(lng) && !"default".equals(lng)) { - Locale.setDefault(new Locale(lng)); - } else { - lng = "en"; // default = English - } + // start yacy core + this.log.config("Starting YaCy Protocol Core"); + this.yc = new Network(this); + new OneTimeBusyThread("Switchboard.loadSeedLists") { - // load badwords (to filter the topwords) - if ( badwords == null || badwords.isEmpty() ) { - File badwordsFile = new File(appPath, "DATA/SETTINGS/" + SwitchboardConstants.LIST_BADWORDS_DEFAULT); - if (!badwordsFile.exists()) { - badwordsFile = new File(appPath, "defaults/" + SwitchboardConstants.LIST_BADWORDS_DEFAULT); - } - badwords = SetTools.loadList(badwordsFile, NaturalOrder.naturalComparator); - this.log.config("loaded badwords from file " - + badwordsFile.getName() - + ", " - + badwords.size() - + " entries, " - + ppRamString(badwordsFile.length() / 1024)); - } + @Override + public boolean jobImpl() throws Exception { + Switchboard.this.loadSeedLists(); + return true; + } + }.start(); + //final long startedSeedListAquisition = System.currentTimeMillis(); + + // init a DHT transmission dispatcher + this.dhtDispatcher = (this.peers.sizeConnected() == 0) ? null : new Dispatcher(this, true, 10000); + + // set up local robots.txt + this.robotstxtConfig = RobotsTxtConfig.init(this); + + // setting timestamp of last proxy access + this.proxyLastAccess = System.currentTimeMillis() - 10000; + this.localSearchLastAccess = System.currentTimeMillis() - 10000; + this.remoteSearchLastAccess = System.currentTimeMillis() - 10000; + this.adminAuthenticationLastAccess = 0; // timestamp last admin authentication (as not autenticated here, stamp with 0) + this.optimizeLastRun = System.currentTimeMillis(); + this.webStructure = new WebStructureGraph(new File(this.queuesRoot, "webStructure.map")); + + // configuring list path + if ( !(this.listsPath.exists()) ) { + this.listsPath.mkdirs(); + } - // load stopwords (to filter query and topwords) - if ( stopwords == null || stopwords.isEmpty() ) { - File stopwordsFile = new File(dataPath, "DATA/SETTINGS/" + SwitchboardConstants.LIST_STOPWORDS_DEFAULT); - if (!stopwordsFile.exists()) { - stopwordsFile = new File(appPath, "defaults/"+SwitchboardConstants.LIST_STOPWORDS_DEFAULT); - } - stopwords = SetTools.loadList(stopwordsFile, NaturalOrder.naturalComparator); - // append locale language stopwords using setting of interface language (file yacy.stopwords.xx) - // english is stored as default (needed for locale html file overlay) - File stopwordsFilelocale = new File (dataPath, "DATA/SETTINGS/"+stopwordsFile.getName()+"."+lng); - if (!stopwordsFilelocale.exists()) stopwordsFilelocale = new File (appPath, "defaults/"+stopwordsFile.getName()+"."+lng); - if (stopwordsFilelocale.exists()) { - // load YaCy locale stopword list - stopwords.addAll(SetTools.loadList(stopwordsFilelocale, NaturalOrder.naturalComparator)); - this.log.config("append stopwords from file " + stopwordsFilelocale.getName()); - } else { - // alternatively load/append default solr stopword list - stopwordsFilelocale = new File (appPath, "defaults/solr/lang/stopwords_" + lng + ".txt"); - if (stopwordsFilelocale.exists()) { - stopwords.addAll(SetTools.loadList(stopwordsFilelocale, NaturalOrder.naturalComparator)); - this.log.config("append stopwords from file " + stopwordsFilelocale.getName()); + // load coloured lists + if ( blueList == null ) { + // read only once upon first instantiation of this class + final String f = + this.getConfig(SwitchboardConstants.LIST_BLUE, SwitchboardConstants.LIST_BLUE_DEFAULT); + final File plasmaBlueListFile = new File(f); + if ( f != null ) { + blueList = SetTools.loadList(plasmaBlueListFile, NaturalOrder.naturalComparator); + } else { + blueList = new TreeSet<>(); + } + this.log.config("loaded blue-list from file " + + plasmaBlueListFile.getName() + + ", " + + blueList.size() + + " entries, " + + ppRamString(plasmaBlueListFile.length() / 1024)); } - } - } - // start a cache manager - this.log.config("Starting HT Cache Manager"); - - // create the cache directory - this.htCachePath = - this.getDataPath(SwitchboardConstants.HTCACHE_PATH, SwitchboardConstants.HTCACHE_PATH_DEFAULT); - this.log.info("HTCACHE Path = " + this.htCachePath.getAbsolutePath()); - final long maxCacheSize = - 1024L * 1024L * Long.parseLong(this.getConfig(SwitchboardConstants.PROXY_CACHE_SIZE, "2")); // this is megabyte - Cache.init(this.htCachePath, this.peers.mySeed().hash, maxCacheSize, - this.getConfigLong(SwitchboardConstants.HTCACHE_SYNC_LOCK_TIMEOUT, - SwitchboardConstants.HTCACHE_SYNC_LOCK_TIMEOUT_DEFAULT), - this.getConfigInt(SwitchboardConstants.HTCACHE_COMPRESSION_LEVEL, - SwitchboardConstants.HTCACHE_COMPRESSION_LEVEL_DEFAULT)); - final File transactiondir = new File(this.htCachePath, "snapshots"); - Transactions.init(transactiondir, this.getConfigLong(SwitchboardConstants.SNAPSHOTS_WKHTMLTOPDF_TIMEOUT, - SwitchboardConstants.SNAPSHOTS_WKHTMLTOPDF_TIMEOUT_DEFAULT)); - - // create the surrogates directories - this.surrogatesInPath = - this.getDataPath( - SwitchboardConstants.SURROGATES_IN_PATH, - SwitchboardConstants.SURROGATES_IN_PATH_DEFAULT); - this.log.info("surrogates.in Path = " + this.surrogatesInPath.getAbsolutePath()); - this.surrogatesInPath.mkdirs(); - this.surrogatesOutPath = - this.getDataPath( - SwitchboardConstants.SURROGATES_OUT_PATH, - SwitchboardConstants.SURROGATES_OUT_PATH_DEFAULT); - this.log.info("surrogates.out Path = " + this.surrogatesOutPath.getAbsolutePath()); - this.surrogatesOutPath.mkdirs(); - - // copy opensearch heuristic config (if not exist) - final File osdConfig = new File(this.getDataPath(), "DATA/SETTINGS/heuristicopensearch.conf"); - if (!osdConfig.exists()) { - final File osdDefaultConfig = new File(appPath, "defaults/heuristicopensearch.conf"); - this.log.info("heuristic.opensearch list Path = " + osdDefaultConfig.getAbsolutePath()); - try { - Files.copy(osdDefaultConfig, osdConfig); - } catch (final IOException ex) { } - } + // load blacklist + this.log.config("Loading blacklist ..."); + final File blacklistsPath = + this.getDataPath(SwitchboardConstants.LISTS_PATH, SwitchboardConstants.LISTS_PATH_DEFAULT); + urlBlacklist = new Blacklist(blacklistsPath); + ListManager.switchboard = this; + ListManager.listsPath = blacklistsPath; + ListManager.reloadBlacklists(); + + // Set jvm default locale to match UI language ( + String lng = this.getConfig("locale.language", "en"); + if (!"browser".equals(lng) && !"default".equals(lng)) { + Locale.setDefault(new Locale(lng)); + } else { + lng = "en"; // default = English + } - // create the release download directory - this.releasePath = - this.getDataPath(SwitchboardConstants.RELEASE_PATH, SwitchboardConstants.RELEASE_PATH_DEFAULT); - this.releasePath.mkdirs(); - this.log.info("RELEASE Path = " + this.releasePath.getAbsolutePath()); + // load badwords (to filter the topwords) + if ( badwords == null || badwords.isEmpty() ) { + File badwordsFile = new File(appPath, "DATA/SETTINGS/" + SwitchboardConstants.LIST_BADWORDS_DEFAULT); + if (!badwordsFile.exists()) { + badwordsFile = new File(appPath, "defaults/" + SwitchboardConstants.LIST_BADWORDS_DEFAULT); + } + badwords = SetTools.loadList(badwordsFile, NaturalOrder.naturalComparator); + this.log.config("loaded badwords from file " + + badwordsFile.getName() + + ", " + + badwords.size() + + " entries, " + + ppRamString(badwordsFile.length() / 1024)); + } - // starting message board - try { - this.initMessages(); - } catch (final IOException e) { - ConcurrentLog.logException(e); - } + // load stopwords (to filter query and topwords) + if ( stopwords == null || stopwords.isEmpty() ) { + File stopwordsFile = new File(dataPath, "DATA/SETTINGS/" + SwitchboardConstants.LIST_STOPWORDS_DEFAULT); + if (!stopwordsFile.exists()) { + stopwordsFile = new File(appPath, "defaults/"+SwitchboardConstants.LIST_STOPWORDS_DEFAULT); + } + stopwords = SetTools.loadList(stopwordsFile, NaturalOrder.naturalComparator); + // append locale language stopwords using setting of interface language (file yacy.stopwords.xx) + // english is stored as default (needed for locale html file overlay) + File stopwordsFilelocale = new File (dataPath, "DATA/SETTINGS/"+stopwordsFile.getName()+"."+lng); + if (!stopwordsFilelocale.exists()) stopwordsFilelocale = new File (appPath, "defaults/"+stopwordsFile.getName()+"."+lng); + if (stopwordsFilelocale.exists()) { + // load YaCy locale stopword list + stopwords.addAll(SetTools.loadList(stopwordsFilelocale, NaturalOrder.naturalComparator)); + this.log.config("append stopwords from file " + stopwordsFilelocale.getName()); + } else { + // alternatively load/append default solr stopword list + stopwordsFilelocale = new File (appPath, "defaults/solr/lang/stopwords_" + lng + ".txt"); + if (stopwordsFilelocale.exists()) { + stopwords.addAll(SetTools.loadList(stopwordsFilelocale, NaturalOrder.naturalComparator)); + this.log.config("append stopwords from file " + stopwordsFilelocale.getName()); + } + } + } - // starting wiki - try { - this.initWiki(); - } catch (final IOException e) { - ConcurrentLog.logException(e); - } + // start a cache manager + this.log.config("Starting HT Cache Manager"); + + // create the cache directory + this.htCachePath = + this.getDataPath(SwitchboardConstants.HTCACHE_PATH, SwitchboardConstants.HTCACHE_PATH_DEFAULT); + this.log.info("HTCACHE Path = " + this.htCachePath.getAbsolutePath()); + final long maxCacheSize = + 1024L * 1024L * Long.parseLong(this.getConfig(SwitchboardConstants.PROXY_CACHE_SIZE, "2")); // this is megabyte + Cache.init(this.htCachePath, this.peers.mySeed().hash, maxCacheSize, + this.getConfigLong(SwitchboardConstants.HTCACHE_SYNC_LOCK_TIMEOUT, + SwitchboardConstants.HTCACHE_SYNC_LOCK_TIMEOUT_DEFAULT), + this.getConfigInt(SwitchboardConstants.HTCACHE_COMPRESSION_LEVEL, + SwitchboardConstants.HTCACHE_COMPRESSION_LEVEL_DEFAULT)); + final File transactiondir = new File(this.htCachePath, "snapshots"); + Transactions.init(transactiondir, this.getConfigLong(SwitchboardConstants.SNAPSHOTS_WKHTMLTOPDF_TIMEOUT, + SwitchboardConstants.SNAPSHOTS_WKHTMLTOPDF_TIMEOUT_DEFAULT)); + + // create the surrogates directories + this.surrogatesInPath = + this.getDataPath( + SwitchboardConstants.SURROGATES_IN_PATH, + SwitchboardConstants.SURROGATES_IN_PATH_DEFAULT); + this.log.info("surrogates.in Path = " + this.surrogatesInPath.getAbsolutePath()); + this.surrogatesInPath.mkdirs(); + this.surrogatesOutPath = + this.getDataPath( + SwitchboardConstants.SURROGATES_OUT_PATH, + SwitchboardConstants.SURROGATES_OUT_PATH_DEFAULT); + this.log.info("surrogates.out Path = " + this.surrogatesOutPath.getAbsolutePath()); + this.surrogatesOutPath.mkdirs(); + + // copy opensearch heuristic config (if not exist) + final File osdConfig = new File(this.getDataPath(), "DATA/SETTINGS/heuristicopensearch.conf"); + if (!osdConfig.exists()) { + final File osdDefaultConfig = new File(appPath, "defaults/heuristicopensearch.conf"); + this.log.info("heuristic.opensearch list Path = " + osdDefaultConfig.getAbsolutePath()); + try { + Files.copy(osdDefaultConfig, osdConfig); + } catch (final IOException ex) { } + } - //starting blog - try { - this.initBlog(); - } catch (final IOException e) { - ConcurrentLog.logException(e); - } + // create the release download directory + this.releasePath = + this.getDataPath(SwitchboardConstants.RELEASE_PATH, SwitchboardConstants.RELEASE_PATH_DEFAULT); + this.releasePath.mkdirs(); + this.log.info("RELEASE Path = " + this.releasePath.getAbsolutePath()); - // init User DB - this.log.config("Loading User DB"); - final File userDbFile = new File(this.getDataPath(), "DATA/SETTINGS/user.heap"); - try { - this.userDB = new UserDB(userDbFile); - this.log.config("Loaded User DB from file " - + userDbFile.getName() - + ", " - + this.userDB.size() - + " entries" - + ", " - + ppRamString(userDbFile.length() / 1024)); - } catch (final IOException e) { - ConcurrentLog.logException(e); - } + // starting message board + try { + this.initMessages(); + } catch (final IOException e) { + ConcurrentLog.logException(e); + } - // init html parser evaluation scheme - File parserPropertiesPath = new File(appPath, "defaults/"); - String[] settingsList = parserPropertiesPath.list(); - for ( final String l : settingsList ) { - if ( l.startsWith("parser.") && l.endsWith(".properties") ) { + // starting wiki try { - Evaluation.add(new File(parserPropertiesPath, l)); + this.initWiki(); } catch (final IOException e) { ConcurrentLog.logException(e); } - } - } - parserPropertiesPath = new File(this.getDataPath(), "DATA/SETTINGS/"); - settingsList = parserPropertiesPath.list(); - for ( final String l : settingsList ) { - if ( l.startsWith("parser.") && l.endsWith(".properties") ) { + + //starting blog try { - Evaluation.add(new File(parserPropertiesPath, l)); + this.initBlog(); } catch (final IOException e) { ConcurrentLog.logException(e); } - } - } - // init bookmarks DB: needs more time since this does a DNS lookup for each Bookmark. - // Can be started concurrently - new Thread("Switchboard.initBookmarks") { - @Override - public void run() { + // init User DB + this.log.config("Loading User DB"); + final File userDbFile = new File(this.getDataPath(), "DATA/SETTINGS/user.heap"); try { - Switchboard.this.initBookmarks(); - } catch (final IOException e ) { + this.userDB = new UserDB(userDbFile); + this.log.config("Loaded User DB from file " + + userDbFile.getName() + + ", " + + this.userDB.size() + + " entries" + + ", " + + ppRamString(userDbFile.length() / 1024)); + } catch (final IOException e) { ConcurrentLog.logException(e); } - } - }.start(); - // define a realtime parsable mimetype list - this.log.config("Parser: Initializing Mime Type deny list"); + // init html parser evaluation scheme + File parserPropertiesPath = new File(appPath, "defaults/"); + String[] settingsList = parserPropertiesPath.list(); + for ( final String l : settingsList ) { + if ( l.startsWith("parser.") && l.endsWith(".properties") ) { + try { + Evaluation.add(new File(parserPropertiesPath, l)); + } catch (final IOException e) { + ConcurrentLog.logException(e); + } + } + } + parserPropertiesPath = new File(this.getDataPath(), "DATA/SETTINGS/"); + settingsList = parserPropertiesPath.list(); + for ( final String l : settingsList ) { + if ( l.startsWith("parser.") && l.endsWith(".properties") ) { + try { + Evaluation.add(new File(parserPropertiesPath, l)); + } catch (final IOException e) { + ConcurrentLog.logException(e); + } + } + } - final boolean enableAudioTags = this.getConfigBool("parser.enableAudioTags", false); - this.log.config("Parser: parser.enableAudioTags= "+enableAudioTags); - final Set denyExt = this.getConfigSet(SwitchboardConstants.PARSER_EXTENSIONS_DENY); - final Set denyMime = this.getConfigSet(SwitchboardConstants.PARSER_MIME_DENY); + // init bookmarks DB: needs more time since this does a DNS lookup for each Bookmark. + // Can be started concurrently + new Thread("Switchboard.initBookmarks") { + @Override + public void run() { + try { + Switchboard.this.initBookmarks(); + } catch (final IOException e ) { + ConcurrentLog.logException(e); + } + } + }.start(); - /* audioTagParser is disabled by default as it needs a temporary file (because of the JAudiotagger implementation) for each parsed document */ - if (!enableAudioTags) { - denyExt.addAll(audioTagParser.SupportedAudioFormat.getAllFileExtensions()); - denyMime.addAll(audioTagParser.SupportedAudioFormat.getAllMediaTypes()); + // define a realtime parsable mimetype list + this.log.config("Parser: Initializing Mime Type deny list"); - this.setConfig(SwitchboardConstants.PARSER_EXTENSIONS_DENY, denyExt); - this.setConfig(SwitchboardConstants.PARSER_MIME_DENY, denyMime); - this.setConfig("parser.enableAudioTags", true); - } + final boolean enableAudioTags = this.getConfigBool("parser.enableAudioTags", false); + this.log.config("Parser: parser.enableAudioTags= "+enableAudioTags); + final Set denyExt = this.getConfigSet(SwitchboardConstants.PARSER_EXTENSIONS_DENY); + final Set denyMime = this.getConfigSet(SwitchboardConstants.PARSER_MIME_DENY); - TextParser.setDenyMime(this.getConfig(SwitchboardConstants.PARSER_MIME_DENY, "")); - TextParser.setDenyExtension(this.getConfig(SwitchboardConstants.PARSER_EXTENSIONS_DENY, "")); - pdfParser.individualPages = this.getConfigBool(SwitchboardConstants.PARSER_PDF_INDIVIDUALPAGES, false); - pdfParser.individualPagePropertyname = this.getConfig(SwitchboardConstants.PARSER_PDF_INDIVIDUALPAGES_KEY, "page"); + /* audioTagParser is disabled by default as it needs a temporary file (because of the JAudiotagger implementation) for each parsed document */ + if (!enableAudioTags) { + denyExt.addAll(audioTagParser.SupportedAudioFormat.getAllFileExtensions()); + denyMime.addAll(audioTagParser.SupportedAudioFormat.getAllMediaTypes()); - // start a loader - this.log.config("Starting Crawl Loader"); - this.loader = new LoaderDispatcher(this); + this.setConfig(SwitchboardConstants.PARSER_EXTENSIONS_DENY, denyExt); + this.setConfig(SwitchboardConstants.PARSER_MIME_DENY, denyMime); + this.setConfig("parser.enableAudioTags", true); + } - // load the robots.txt db - this.log.config("Initializing robots.txt DB"); - this.robots = new RobotsTxt(this.tables, this.loader, - this.getConfigInt(SwitchboardConstants.ROBOTS_TXT_THREADS_ACTIVE_MAX, SwitchboardConstants.ROBOTS_TXT_THREADS_ACTIVE_MAX_DEFAULT)); - try { - this.log.config("Loaded robots.txt DB: " + this.robots.size() + " entries"); - } catch (final IOException e) { - ConcurrentLog.logException(e); - } + TextParser.setDenyMime(this.getConfig(SwitchboardConstants.PARSER_MIME_DENY, "")); + TextParser.setDenyExtension(this.getConfig(SwitchboardConstants.PARSER_EXTENSIONS_DENY, "")); + pdfParser.individualPages = this.getConfigBool(SwitchboardConstants.PARSER_PDF_INDIVIDUALPAGES, false); + pdfParser.individualPagePropertyname = this.getConfig(SwitchboardConstants.PARSER_PDF_INDIVIDUALPAGES_KEY, "page"); - // load oai tables - final Map oaiFriends = - OAIListFriendsLoader.loadListFriendsSources( - new File(appPath, "defaults/oaiListFriendsSource.xml"), - this.getDataPath()); - OAIListFriendsLoader.init(this.loader, oaiFriends, ClientIdentification.yacyInternetCrawlerAgent); - this.crawlQueues = new CrawlQueues(this, this.queuesRoot); - - // on startup, resume all crawls - this.setConfig(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL + "_isPaused", "false"); - this.setConfig(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL + "_isPaused_cause", ""); - this.setConfig(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL + "_isPaused", "false"); - this.setConfig(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL + "_isPaused_cause", ""); - this.setConfig(SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER + "_isPaused", "false"); - this.setConfig(SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER + "_isPaused_cause", ""); - this.crawlJobsStatus.put(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL, new Object[] {new Object(), false}); - this.crawlJobsStatus.put(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL, new Object[] {new Object(), false}); - this.crawlJobsStatus.put(SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER, new Object[] {new Object(), false}); - - // init cookie-Monitor - this.log.config("Starting Cookie Monitor"); - this.outgoingCookies = new ConcurrentHashMap<>(); - this.incomingCookies = new ConcurrentHashMap<>(); - - // init search history trackers - this.localSearchTracker = new ConcurrentHashMap<>(); // String:TreeSet - IP:set of Long(accessTime) - this.remoteSearchTracker = new ConcurrentHashMap<>(); - - // init messages: clean up message symbol - final File notifierSource = - new File(this.getAppPath(), this.getConfig( - SwitchboardConstants.HTROOT_PATH, - SwitchboardConstants.HTROOT_PATH_DEFAULT) + "/env/grafics/empty.gif"); - final File notifierDest = - new File( - this.getDataPath(SwitchboardConstants.HTDOCS_PATH, SwitchboardConstants.HTDOCS_PATH_DEFAULT), - "notifier.gif"); - try { - Files.copy(notifierSource, notifierDest); - } catch (final IOException e ) { - } + // start a loader + this.log.config("Starting Crawl Loader"); + this.loader = new LoaderDispatcher(this); - // init nameCacheNoCachingList - try { - Domains.setNoCachingPatterns(this.getConfig(SwitchboardConstants.HTTPC_NAME_CACHE_CACHING_PATTERNS_NO, "")); - } catch (final PatternSyntaxException pse) { - ConcurrentLog.severe("Switchboard", "Invalid regular expression in " + // load the robots.txt db + this.log.config("Initializing robots.txt DB"); + this.robots = new RobotsTxt(this.tables, this.loader, + this.getConfigInt(SwitchboardConstants.ROBOTS_TXT_THREADS_ACTIVE_MAX, SwitchboardConstants.ROBOTS_TXT_THREADS_ACTIVE_MAX_DEFAULT)); + try { + this.log.config("Loaded robots.txt DB: " + this.robots.size() + " entries"); + } catch (final IOException e) { + ConcurrentLog.logException(e); + } + + // load oai tables + final Map oaiFriends = + OAIListFriendsLoader.loadListFriendsSources( + new File(appPath, "defaults/oaiListFriendsSource.xml"), + this.getDataPath()); + OAIListFriendsLoader.init(this.loader, oaiFriends, ClientIdentification.yacyInternetCrawlerAgent); + this.crawlQueues = new CrawlQueues(this, this.queuesRoot); + + // on startup, resume all crawls + this.setConfig(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL + "_isPaused", "false"); + this.setConfig(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL + "_isPaused_cause", ""); + this.setConfig(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL + "_isPaused", "false"); + this.setConfig(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL + "_isPaused_cause", ""); + this.setConfig(SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER + "_isPaused", "false"); + this.setConfig(SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER + "_isPaused_cause", ""); + this.crawlJobsStatus.put(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL, new Object[] {new Object(), false}); + this.crawlJobsStatus.put(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL, new Object[] {new Object(), false}); + this.crawlJobsStatus.put(SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER, new Object[] {new Object(), false}); + + // init cookie-Monitor + this.log.config("Starting Cookie Monitor"); + this.outgoingCookies = new ConcurrentHashMap<>(); + this.incomingCookies = new ConcurrentHashMap<>(); + + // init search history trackers + this.localSearchTracker = new ConcurrentHashMap<>(); // String:TreeSet - IP:set of Long(accessTime) + this.remoteSearchTracker = new ConcurrentHashMap<>(); + + // init messages: clean up message symbol + final File notifierSource = + new File(this.getAppPath(), this.getConfig( + SwitchboardConstants.HTROOT_PATH, + SwitchboardConstants.HTROOT_PATH_DEFAULT) + "/env/grafics/empty.gif"); + final File notifierDest = + new File( + this.getDataPath(SwitchboardConstants.HTDOCS_PATH, SwitchboardConstants.HTDOCS_PATH_DEFAULT), + "notifier.gif"); + try { + Files.copy(notifierSource, notifierDest); + } catch (final IOException e ) { + } + + // init nameCacheNoCachingList + try { + Domains.setNoCachingPatterns(this.getConfig(SwitchboardConstants.HTTPC_NAME_CACHE_CACHING_PATTERNS_NO, "")); + } catch (final PatternSyntaxException pse) { + ConcurrentLog.severe("Switchboard", "Invalid regular expression in " + SwitchboardConstants.HTTPC_NAME_CACHE_CACHING_PATTERNS_NO + " property: " + pse.getMessage()); - System.exit(-1); - } + System.exit(-1); + } - // generate snippets cache - this.log.config("Initializing Snippet Cache"); + // generate snippets cache + this.log.config("Initializing Snippet Cache"); - TextSnippet.statistics.setEnabled(this.getConfigBool(SwitchboardConstants.DEBUG_SNIPPETS_STATISTICS_ENABLED, - SwitchboardConstants.DEBUG_SNIPPETS_STATISTICS_ENABLED_DEFAULT)); + TextSnippet.statistics.setEnabled(this.getConfigBool(SwitchboardConstants.DEBUG_SNIPPETS_STATISTICS_ENABLED, + SwitchboardConstants.DEBUG_SNIPPETS_STATISTICS_ENABLED_DEFAULT)); - // init the wiki - wikiParser = new WikiCode(); + // init the wiki + wikiParser = new WikiCode(); - // initializing the resourceObserver - this.observer = new ResourceObserver(this); + // initializing the resourceObserver + this.observer = new ResourceObserver(this); - final ResourceObserver resourceObserver = this.observer; - new OneTimeBusyThread("ResourceObserver.resourceObserverJob") { + final ResourceObserver resourceObserver = this.observer; + new OneTimeBusyThread("ResourceObserver.resourceObserverJob") { - @Override - public boolean jobImpl() throws Exception { - resourceObserver.resourceObserverJob(); - return true; - } - }.start(); + @Override + public boolean jobImpl() throws Exception { + resourceObserver.resourceObserverJob(); + return true; + } + }.start(); + + // initializing the stackCrawlThread + this.crawlStacker = + new CrawlStacker( + this.robots, + this.crawlQueues, + this.crawler, + this.index, + this.peers, + this.isIntranetMode(), + this.isGlobalMode(), + this.domainList); // Intranet and Global mode may be both true! + + // possibly switch off localIP check + Domains.setNoLocalCheck(this.isAllIPMode()); + + // check status of account configuration: when local url crawling is allowed, it is not allowed + // that an automatic authorization of localhost is done, because in this case crawls from local + // addresses are blocked to prevent attack szenarios where remote pages contain links to localhost + // addresses that can steer a YaCy peer + if ( !this.getConfigBool(SwitchboardConstants.ADMIN_ACCOUNT_FOR_LOCALHOST, false) ) { + if ( this.getConfig(SwitchboardConstants.ADMIN_ACCOUNT_B64MD5, "").startsWith("0000") ) { + // the password was set automatically with a random value. + // We must remove that here to prevent that a user cannot log in any more + this.setConfig(SwitchboardConstants.ADMIN_ACCOUNT_B64MD5, ""); + // after this a message must be generated to alert the user to set a new password + this.log.info("RANDOM PASSWORD REMOVED! User must set a new password"); + } + } - // initializing the stackCrawlThread - this.crawlStacker = - new CrawlStacker( - this.robots, - this.crawlQueues, - this.crawler, - this.index, - this.peers, - this.isIntranetMode(), - this.isGlobalMode(), - this.domainList); // Intranet and Global mode may be both true! - - // possibly switch off localIP check - Domains.setNoLocalCheck(this.isAllIPMode()); - - // check status of account configuration: when local url crawling is allowed, it is not allowed - // that an automatic authorization of localhost is done, because in this case crawls from local - // addresses are blocked to prevent attack szenarios where remote pages contain links to localhost - // addresses that can steer a YaCy peer - if ( !this.getConfigBool(SwitchboardConstants.ADMIN_ACCOUNT_FOR_LOCALHOST, false) ) { - if ( this.getConfig(SwitchboardConstants.ADMIN_ACCOUNT_B64MD5, "").startsWith("0000") ) { - // the password was set automatically with a random value. - // We must remove that here to prevent that a user cannot log in any more - this.setConfig(SwitchboardConstants.ADMIN_ACCOUNT_B64MD5, ""); - // after this a message must be generated to alert the user to set a new password - this.log.info("RANDOM PASSWORD REMOVED! User must set a new password"); - } - } + // initializing dht chunk generation + this.dhtMaxReferenceCount = (int) this.getConfigLong(SwitchboardConstants.INDEX_DIST_CHUNK_SIZE_START, 50); + + // init robinson cluster + // before we do that, we wait some time until the seed list is loaded. + this.clusterhashes = this.peers.clusterHashes(this.getConfig("cluster.peers.yacydomain", "")); + + // deploy blocking threads + this.indexingStorageProcessor = + new WorkflowProcessor<>( + "storeDocumentIndex", + "This is the sequencing step of the indexing queue. Files are written as streams, too much councurrency would destroy IO performance. In this process the words are written to the RWI cache, which flushes if it is full.", + new String[] { + "RWI/Cache/Collections" + }, + in -> { + Switchboard.this.storeDocumentIndex(in); + return null; + }, + 2, + null, + 1); + this.indexingAnalysisProcessor = + new WorkflowProcessor<>( + "webStructureAnalysis", + "This just stores the link structure of the document into a web structure database.", + new String[] { + "storeDocumentIndex" + }, + in -> Switchboard.this.webStructureAnalysis(in), + WorkflowProcessor.availableCPU + 1, + this.indexingStorageProcessor, + WorkflowProcessor.availableCPU); + this.indexingCondensementProcessor = + new WorkflowProcessor<>( + "condenseDocument", + "This does a structural analysis of plain texts: markup of headlines, slicing into phrases (i.e. sentences), markup with position, counting of words, calculation of term frequency.", + new String[] { + "webStructureAnalysis" + }, + in -> Switchboard.this.condenseDocument(in), + WorkflowProcessor.availableCPU + 1, + this.indexingAnalysisProcessor, + WorkflowProcessor.availableCPU); + this.indexingDocumentProcessor = + new WorkflowProcessor<>( + "parseDocument", + "This does the parsing of the newly loaded documents from the web. The result is not only a plain text document, but also a list of URLs that are embedded into the document. The urls are handed over to the CrawlStacker. This process has two child process queues!", + new String[] { + "condenseDocument", "CrawlStacker" + }, + in -> Switchboard.this.parseDocument(in), + Math.max(20, WorkflowProcessor.availableCPU * 2), // it may happen that this is filled with new files from the search process. That means there should be enough place for two result pages + this.indexingCondensementProcessor, + WorkflowProcessor.availableCPU); + + // deploy busy threads + this.log.config("Starting Threads"); + MemoryControl.gc(10000, "plasmaSwitchboard, help for profiler"); // help for profiler - thq - // initializing dht chunk generation - this.dhtMaxReferenceCount = (int) this.getConfigLong(SwitchboardConstants.INDEX_DIST_CHUNK_SIZE_START, 50); + this.deployThread( + SwitchboardConstants.CLEANUP, + "Cleanup", + "cleaning process", + null, + new InstantBusyThread("Switchboard.cleanupJob", 30000, 10000) { - // init robinson cluster - // before we do that, we wait some time until the seed list is loaded. - this.clusterhashes = this.peers.clusterHashes(this.getConfig("cluster.peers.yacydomain", "")); + @Override + public boolean jobImpl() throws Exception { + return Switchboard.this.cleanupJob(); + } - // deploy blocking threads - this.indexingStorageProcessor = - new WorkflowProcessor<>( - "storeDocumentIndex", - "This is the sequencing step of the indexing queue. Files are written as streams, too much councurrency would destroy IO performance. In this process the words are written to the RWI cache, which flushes if it is full.", - new String[] { - "RWI/Cache/Collections" - }, - in -> { - Switchboard.this.storeDocumentIndex(in); - return null; - }, - 2, - null, - 1); - this.indexingAnalysisProcessor = - new WorkflowProcessor<>( - "webStructureAnalysis", - "This just stores the link structure of the document into a web structure database.", - new String[] { - "storeDocumentIndex" - }, - in -> Switchboard.this.webStructureAnalysis(in), - WorkflowProcessor.availableCPU + 1, - this.indexingStorageProcessor, - WorkflowProcessor.availableCPU); - this.indexingCondensementProcessor = - new WorkflowProcessor<>( - "condenseDocument", - "This does a structural analysis of plain texts: markup of headlines, slicing into phrases (i.e. sentences), markup with position, counting of words, calculation of term frequency.", - new String[] { - "webStructureAnalysis" - }, - in -> Switchboard.this.condenseDocument(in), - WorkflowProcessor.availableCPU + 1, - this.indexingAnalysisProcessor, - WorkflowProcessor.availableCPU); - this.indexingDocumentProcessor = - new WorkflowProcessor<>( - "parseDocument", - "This does the parsing of the newly loaded documents from the web. The result is not only a plain text document, but also a list of URLs that are embedded into the document. The urls are handed over to the CrawlStacker. This process has two child process queues!", - new String[] { - "condenseDocument", "CrawlStacker" - }, - in -> Switchboard.this.parseDocument(in), - Math.max(20, WorkflowProcessor.availableCPU * 2), // it may happen that this is filled with new files from the search process. That means there should be enough place for two result pages - this.indexingCondensementProcessor, - WorkflowProcessor.availableCPU); - - // deploy busy threads - this.log.config("Starting Threads"); - MemoryControl.gc(10000, "plasmaSwitchboard, help for profiler"); // help for profiler - thq - - this.deployThread( - SwitchboardConstants.CLEANUP, - "Cleanup", - "cleaning process", - null, - new InstantBusyThread("Switchboard.cleanupJob", 30000, 10000) { + @Override + public int getJobCount() { + return Switchboard.this.cleanupJobSize(); + } - @Override - public boolean jobImpl() throws Exception { - return Switchboard.this.cleanupJob(); - } + @Override + public void freememImpl() { + } - @Override - public int getJobCount() { - return Switchboard.this.cleanupJobSize(); - } + }, + 60000); // all 10 minutes, wait 1 minute until first run - @Override - public void freememImpl() { - } + this.deployThread( + SwitchboardConstants.SCHEDULER, + "Scheduler", + "starts scheduled processes from the API Processing table", + null, + new InstantBusyThread("Switchboard.schedulerJob", 30000, 10000) { + @Override + public boolean jobImpl() throws Exception { + return Switchboard.this.schedulerJob(); + } - }, - 60000); // all 10 minutes, wait 1 minute until first run + @Override + public int getJobCount() { + return Switchboard.this.schedulerJobSize(); + } - this.deployThread( - SwitchboardConstants.SCHEDULER, - "Scheduler", - "starts scheduled processes from the API Processing table", - null, - new InstantBusyThread("Switchboard.schedulerJob", 30000, 10000) { - @Override - public boolean jobImpl() throws Exception { - return Switchboard.this.schedulerJob(); - } + @Override + public void freememImpl() { + } + }, + 60000); // all 10 minutes, wait 1 minute until first run - @Override - public int getJobCount() { - return Switchboard.this.schedulerJobSize(); - } + this.deployThread( + SwitchboardConstants.SURROGATES, + "Surrogates", + "A thread that polls the SURROGATES path and puts all Documents in one surroagte file into the indexing queue.", + null, + new InstantBusyThread("Switchboard.surrogateProcess", 20000, 0) { + @Override + public boolean jobImpl() throws Exception { + return Switchboard.this.surrogateProcess(); + } - @Override - public void freememImpl() { - } - }, - 60000); // all 10 minutes, wait 1 minute until first run - - this.deployThread( - SwitchboardConstants.SURROGATES, - "Surrogates", - "A thread that polls the SURROGATES path and puts all Documents in one surroagte file into the indexing queue.", - null, - new InstantBusyThread("Switchboard.surrogateProcess", 20000, 0) { - @Override - public boolean jobImpl() throws Exception { - return Switchboard.this.surrogateProcess(); - } + @Override + public int getJobCount() { + return Switchboard.this.surrogateQueueSize(); + } - @Override - public int getJobCount() { - return Switchboard.this.surrogateQueueSize(); - } + @Override + public void freememImpl() { + Switchboard.this.surrogateFreeMem(); + } + }, + 10000); - @Override - public void freememImpl() { - Switchboard.this.surrogateFreeMem(); - } - }, - 10000); - - this.initRemoteCrawler(this.getConfigBool(SwitchboardConstants.CRAWLJOB_REMOTE, false)); - this.initAutocrawl(this.getConfigBool(SwitchboardConstants.AUTOCRAWL, false)); - - final CrawlQueues crawlQueue = this.crawlQueues; - this.deployThread( - SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL, - "Local Crawl", - "thread that performes a single crawl step from the local crawl queue", - "/IndexCreateQueues_p.html?stack=LOCAL", - new InstantBusyThread("CrawlQueues.coreCrawlJob", 0, 0) { - @Override - public boolean jobImpl() throws Exception { - return crawlQueue.coreCrawlJob(); - } + this.initRemoteCrawler(this.getConfigBool(SwitchboardConstants.CRAWLJOB_REMOTE, false)); + this.initAutocrawl(this.getConfigBool(SwitchboardConstants.AUTOCRAWL, false)); - @Override - public int getJobCount() { - return crawlQueue.coreCrawlJobSize(); - } + final CrawlQueues crawlQueue = this.crawlQueues; + this.deployThread( + SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL, + "Local Crawl", + "thread that performes a single crawl step from the local crawl queue", + "/IndexCreateQueues_p.html?stack=LOCAL", + new InstantBusyThread("CrawlQueues.coreCrawlJob", 0, 0) { + @Override + public boolean jobImpl() throws Exception { + return crawlQueue.coreCrawlJob(); + } - @Override - public void freememImpl() { - crawlQueue.freemem(); - } - }, - 10000); - - final Network net = this.yc; - this.deployThread( - SwitchboardConstants.SEED_UPLOAD, - "Seed-List Upload", - "task that a principal peer performes to generate and upload a seed-list to a ftp account", - null, - new InstantBusyThread("Network.publishSeedList", 600000, 300000) { - @Override - public boolean jobImpl() throws Exception { - net.publishSeedList(); - return true; - } - }, - 180000); - - this.deployThread( - SwitchboardConstants.PEER_PING, - "YaCy Core", - "this is the p2p-control and peer-ping task", - null, - new InstantBusyThread("Network.peerPing", 30000, 30000) { - @Override - public boolean jobImpl() throws Exception { - net.peerPing(); - return true; - } - }, - 10000); - this.deployThread( - SwitchboardConstants.INDEX_DIST, - "DHT Distribution", - "selection, transfer and deletion of index entries that are not searched on your peer, but on others", - null, - new InstantBusyThread("Switchboard.dhtTransferJob", 10000, 1000) { - @Override - public boolean jobImpl() throws Exception { - return Switchboard.this.dhtTransferJob(); + @Override + public int getJobCount() { + return crawlQueue.coreCrawlJobSize(); + } + + @Override + public void freememImpl() { + crawlQueue.freemem(); + } + }, + 10000); + + final Network net = this.yc; + this.deployThread( + SwitchboardConstants.SEED_UPLOAD, + "Seed-List Upload", + "task that a principal peer performes to generate and upload a seed-list to a ftp account", + null, + new InstantBusyThread("Network.publishSeedList", 600000, 300000) { + @Override + public boolean jobImpl() throws Exception { + net.publishSeedList(); + return true; + } + }, + 180000); + + this.deployThread( + SwitchboardConstants.PEER_PING, + "YaCy Core", + "this is the p2p-control and peer-ping task", + null, + new InstantBusyThread("Network.peerPing", 30000, 30000) { + @Override + public boolean jobImpl() throws Exception { + net.peerPing(); + return true; + } + }, + 10000); + this.deployThread( + SwitchboardConstants.INDEX_DIST, + "DHT Distribution", + "selection, transfer and deletion of index entries that are not searched on your peer, but on others", + null, + new InstantBusyThread("Switchboard.dhtTransferJob", 10000, 1000) { + @Override + public boolean jobImpl() throws Exception { + return Switchboard.this.dhtTransferJob(); + } + }, + 60000, + Long.parseLong(this.getConfig(SwitchboardConstants.INDEX_DIST_IDLESLEEP, "5000")), + Long.parseLong(this.getConfig(SwitchboardConstants.INDEX_DIST_BUSYSLEEP, "0")), + Long.parseLong(this.getConfig(SwitchboardConstants.INDEX_DIST_MEMPREREQ, "1000000")), + Double.parseDouble(this.getConfig(SwitchboardConstants.INDEX_DIST_LOADPREREQ, "9.0"))); + + // set network-specific performance attributes + if ( this.firstInit ) { + this.setRemotecrawlPPM(Math.max(1, (int) this.getConfigLong("network.unit.remotecrawl.speed", 60))); } - }, - 60000, - Long.parseLong(this.getConfig(SwitchboardConstants.INDEX_DIST_IDLESLEEP, "5000")), - Long.parseLong(this.getConfig(SwitchboardConstants.INDEX_DIST_BUSYSLEEP, "0")), - Long.parseLong(this.getConfig(SwitchboardConstants.INDEX_DIST_MEMPREREQ, "1000000")), - Double.parseDouble(this.getConfig(SwitchboardConstants.INDEX_DIST_LOADPREREQ, "9.0"))); - - // content control: initialize list sync thread - this.deployThread( - "720_ccimport", - "Content Control Import", - "this is the content control import thread", - null, - InstantBusyThread.createFromRunnable( - new SMWListSyncThread(this, sb.getConfig("contentcontrol.bookmarklist", "contentcontrol"), - "Category:Content Source", "/?Url/?Filter/?Category/?Modification date", - sb.getConfigBool("contentcontrol.smwimport.purgelistoninit", false)), - 3000, 3000), - 2000); - - this.deployThread( - "730_ccfilter", - "Content Control Filter", - "this is the content control filter update thread", - null, - InstantBusyThread.createFromRunnable(new ContentControlFilterUpdateThread(this), 3000, 3000), - 2000); - - // set network-specific performance attributes - if ( this.firstInit ) { - this.setRemotecrawlPPM(Math.max(1, (int) this.getConfigLong("network.unit.remotecrawl.speed", 60))); - } - - // test routine for snippet fetch - //Set query = new HashSet(); - //query.add(CrawlSwitchboardEntry.word2hash("Weitergabe")); - //query.add(CrawlSwitchboardEntry.word2hash("Zahl")); - //plasmaSnippetCache.result scr = snippetCache.retrieve(new URL("http://www.heise.de/mobil/newsticker/meldung/mail/54980"), query, true); - //plasmaSnippetCache.result scr = snippetCache.retrieve(new URL("http://www.heise.de/security/news/foren/go.shtml?read=1&msg_id=7301419&forum_id=72721"), query, true); - //plasmaSnippetCache.result scr = snippetCache.retrieve(new URL("http://www.heise.de/kiosk/archiv/ct/2003/4/20"), query, true, 260); - - this.trail = new LinkedBlockingQueue<>(); - - this.log.config("Finished Switchboard Initialization"); + + // test routine for snippet fetch + //Set query = new HashSet(); + //query.add(CrawlSwitchboardEntry.word2hash("Weitergabe")); + //query.add(CrawlSwitchboardEntry.word2hash("Zahl")); + //plasmaSnippetCache.result scr = snippetCache.retrieve(new URL("http://www.heise.de/mobil/newsticker/meldung/mail/54980"), query, true); + //plasmaSnippetCache.result scr = snippetCache.retrieve(new URL("http://www.heise.de/security/news/foren/go.shtml?read=1&msg_id=7301419&forum_id=72721"), query, true); + //plasmaSnippetCache.result scr = snippetCache.retrieve(new URL("http://www.heise.de/kiosk/archiv/ct/2003/4/20"), query, true, 260); + + this.trail = new LinkedBlockingQueue<>(); + + this.log.config("Finished Switchboard Initialization"); } /** @@ -1314,11 +1289,11 @@ public final class Switchboard extends serverSwitch { if(systemEnableSniExt == null) { /* Only apply custom configuration when the JVM system option jsse.enableSNIExtension is not defined */ HTTPClient.ENABLE_SNI_EXTENSION - .set(this.getConfigBool(SwitchboardConstants.HTTP_OUTGOING_GENERAL_TLS_SNI_EXTENSION_ENABLED, - HTTPClient.ENABLE_SNI_EXTENSION_DEFAULT)); + .set(this.getConfigBool(SwitchboardConstants.HTTP_OUTGOING_GENERAL_TLS_SNI_EXTENSION_ENABLED, + HTTPClient.ENABLE_SNI_EXTENSION_DEFAULT)); RemoteInstance.ENABLE_SNI_EXTENSION.set(this.getConfigBool(SwitchboardConstants.HTTP_OUTGOING_REMOTE_SOLR_TLS_SNI_EXTENSION_ENABLED, - RemoteInstance.ENABLE_SNI_EXTENSION_DEFAULT)); + RemoteInstance.ENABLE_SNI_EXTENSION_DEFAULT)); } } @@ -1365,16 +1340,16 @@ public final class Switchboard extends serverSwitch { public int getIndexingProcessorsQueueSize() { return this.indexingDocumentProcessor.getQueueSize() - + this.indexingCondensementProcessor.getQueueSize() - + this.indexingAnalysisProcessor.getQueueSize() - + this.indexingStorageProcessor.getQueueSize(); + + this.indexingCondensementProcessor.getQueueSize() + + this.indexingAnalysisProcessor.getQueueSize() + + this.indexingStorageProcessor.getQueueSize(); } public void overwriteNetworkDefinition(final String sysinfo) throws FileNotFoundException, IOException { // load network configuration into settings String networkUnitDefinition = - this.getConfig("network.unit.definition", "defaults/yacy.network.freeworld.unit"); + this.getConfig("network.unit.definition", "defaults/yacy.network.freeworld.unit"); if (networkUnitDefinition.isEmpty()) networkUnitDefinition = "defaults/yacy.network.freeworld.unit"; // patch for a strange failure case where the path was overwritten by empty string // patch old values @@ -1404,9 +1379,9 @@ public final class Switchboard extends serverSwitch { // or independently using a bootstrap URL Map initProps; final Reader netDefReader = - this.getConfigFileFromWebOrLocally(networkUnitDefinition, this.getAppPath().getAbsolutePath(), new File( - this.workPath, - "network.definition.backup")); + this.getConfigFileFromWebOrLocally(networkUnitDefinition, this.getAppPath().getAbsolutePath(), new File( + this.workPath, + "network.definition.backup")); initProps = FileUtils.table(netDefReader); this.setConfig(initProps); @@ -1431,10 +1406,10 @@ public final class Switchboard extends serverSwitch { // get public key if it's in config try { final String publicKeyString = - this.getConfig("network.unit.update.location" + i + ".key", null); + this.getConfig("network.unit.update.location" + i + ".key", null); if ( publicKeyString != null ) { final byte[] publicKeyBytes = - Base64Order.standardCoder.decode(publicKeyString.trim()); + Base64Order.standardCoder.decode(publicKeyString.trim()); publicKey = cryptoLib.getPublicKeyFromBytes(publicKeyBytes); } } catch (final InvalidKeySpecException e ) { @@ -1469,7 +1444,7 @@ public final class Switchboard extends serverSwitch { setConfig(plasmaSwitchboardConstants.INDEX_DIST_ALLOW, true); setConfig(plasmaSwitchboardConstants.INDEX_RECEIVE_ALLOW, true); } - */ + */ // write the YaCy network identification inside the yacybot client user agent to distinguish networks ClientIdentification.generateYaCyBot(sysinfo); } @@ -1522,11 +1497,11 @@ public final class Switchboard extends serverSwitch { this.setConfig("network.unit.definition", networkDefinition); this.overwriteNetworkDefinition(this.getSysinfo()); final File indexPrimaryPath = - this.getDataPath(SwitchboardConstants.INDEX_PRIMARY_PATH, SwitchboardConstants.INDEX_PATH_DEFAULT); + this.getDataPath(SwitchboardConstants.INDEX_PRIMARY_PATH, SwitchboardConstants.INDEX_PATH_DEFAULT); final int wordCacheMaxCount = - (int) this.getConfigLong(SwitchboardConstants.WORDCACHE_MAX_COUNT, 20000); + (int) this.getConfigLong(SwitchboardConstants.WORDCACHE_MAX_COUNT, 20000); final long fileSizeMax = - (OS.isWindows) ? this.getConfigLong("filesize.max.win", Integer.MAX_VALUE) : this.getConfigLong("filesize.max.other", Integer.MAX_VALUE); + (OS.isWindows) ? this.getConfigLong("filesize.max.win", Integer.MAX_VALUE) : this.getConfigLong("filesize.max.other", Integer.MAX_VALUE); final int redundancy = (int) this.getConfigLong("network.unit.dhtredundancy.senior", 1); final int partitionExponent = (int) this.getConfigLong("network.unit.dht.partitionExponent", 0); final String networkName = this.getConfig(SwitchboardConstants.NETWORK_NAME, ""); @@ -1544,11 +1519,11 @@ public final class Switchboard extends serverSwitch { // relocate this.peers.relocate( - this.networkRoot, - redundancy, - partitionExponent, - this.useTailCache, - this.exceed134217727); + this.networkRoot, + redundancy, + partitionExponent, + this.useTailCache, + this.exceed134217727); final File segmentsPath = new File(new File(indexPrimaryPath, networkName), "SEGMENTS"); final File archivePath = this.getDataPath(SwitchboardConstants.INDEX_ARCHIVE_PATH, SwitchboardConstants.INDEX_ARCHIVE_DEFAULT); this.index = new Segment(this.log, segmentsPath, archivePath, collectionConfiguration, webgraphConfiguration); @@ -1564,62 +1539,62 @@ public final class Switchboard extends serverSwitch { final String solrurls = this.getConfig(SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_URL, "http://127.0.0.1:8983/solr"); final boolean usesolr = this.getConfigBool(SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_ENABLED, SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_ENABLED_DEFAULT) & solrurls.length() > 0; - final int solrtimeout = this.getConfigInt(SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_TIMEOUT, 60000); - final boolean writeEnabled = this.getConfigBool(SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_WRITEENABLED, true); - final boolean trustSelfSignedOnAuthenticatedServer = Switchboard.getSwitchboard().getConfigBool( - SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_AUTHENTICATED_ALLOW_SELF_SIGNED, - SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_AUTHENTICATED_ALLOW_SELF_SIGNED_DEFAULT); + final int solrtimeout = this.getConfigInt(SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_TIMEOUT, 60000); + final boolean writeEnabled = this.getConfigBool(SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_WRITEENABLED, true); + final boolean trustSelfSignedOnAuthenticatedServer = Switchboard.getSwitchboard().getConfigBool( + SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_AUTHENTICATED_ALLOW_SELF_SIGNED, + SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_AUTHENTICATED_ALLOW_SELF_SIGNED_DEFAULT); - if (usesolr && solrurls != null && solrurls.length() > 0) { - try { - final ArrayList instances = RemoteInstance.getShardInstances(solrurls, null, null, solrtimeout, trustSelfSignedOnAuthenticatedServer); - final String shardMethodName = this.getConfig(SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_SHARDING, ShardSelection.Method.MODULO_HOST_MD5.name()); - final ShardSelection.Method shardMethod = ShardSelection.Method.valueOf(shardMethodName); - this.index.fulltext().connectRemoteSolr(instances, shardMethod, writeEnabled); - } catch (final IOException e ) { - ConcurrentLog.logException(e); - } - } + if (usesolr && solrurls != null && solrurls.length() > 0) { + try { + final ArrayList instances = RemoteInstance.getShardInstances(solrurls, null, null, solrtimeout, trustSelfSignedOnAuthenticatedServer); + final String shardMethodName = this.getConfig(SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_SHARDING, ShardSelection.Method.MODULO_HOST_MD5.name()); + final ShardSelection.Method shardMethod = ShardSelection.Method.valueOf(shardMethodName); + this.index.fulltext().connectRemoteSolr(instances, shardMethod, writeEnabled); + } catch (final IOException e ) { + ConcurrentLog.logException(e); + } + } - // create a crawler - this.crawlQueues.relocate(this.queuesRoot); // cannot be closed because the busy threads are working with that object - this.crawler = new CrawlSwitchboard(this); + // create a crawler + this.crawlQueues.relocate(this.queuesRoot); // cannot be closed because the busy threads are working with that object + this.crawler = new CrawlSwitchboard(this); - // init a DHT transmission dispatcher - this.dhtDispatcher = (this.peers.sizeConnected() == 0) ? null : new Dispatcher(this, true, 10000); + // init a DHT transmission dispatcher + this.dhtDispatcher = (this.peers.sizeConnected() == 0) ? null : new Dispatcher(this, true, 10000); - // create new web structure - this.webStructure = new WebStructureGraph(new File(this.queuesRoot, "webStructure.map")); + // create new web structure + this.webStructure = new WebStructureGraph(new File(this.queuesRoot, "webStructure.map")); - // load domainList - try { - this.domainList = null; - if ( !this.getConfig("network.unit.domainlist", "").equals("") ) { - final Reader r = this.getConfigFileFromWebOrLocally( - this.getConfig("network.unit.domainlist", ""), - this.getAppPath().getAbsolutePath(), - new File(this.networkRoot, "domainlist.txt")); - this.domainList = new FilterEngine(); - final BufferedReader br = new BufferedReader(r); - this.domainList.loadList(br, null); - br.close(); - } - } catch (final FileNotFoundException e ) { - this.log.severe("CONFIG: domainlist not found: " + e.getMessage()); - } catch (final IOException e ) { - this.log.severe("CONFIG: error while retrieving domainlist: " + e.getMessage()); - } + // load domainList + try { + this.domainList = null; + if ( !this.getConfig("network.unit.domainlist", "").equals("") ) { + final Reader r = this.getConfigFileFromWebOrLocally( + this.getConfig("network.unit.domainlist", ""), + this.getAppPath().getAbsolutePath(), + new File(this.networkRoot, "domainlist.txt")); + this.domainList = new FilterEngine(); + final BufferedReader br = new BufferedReader(r); + this.domainList.loadList(br, null); + br.close(); + } + } catch (final FileNotFoundException e ) { + this.log.severe("CONFIG: domainlist not found: " + e.getMessage()); + } catch (final IOException e ) { + this.log.severe("CONFIG: error while retrieving domainlist: " + e.getMessage()); + } - this.crawlStacker = - new CrawlStacker( - this.robots, - this.crawlQueues, - this.crawler, - this.index, - this.peers, - "local.any".indexOf(this.getConfig(SwitchboardConstants.NETWORK_DOMAIN, "global")) >= 0, - "global.any".indexOf(this.getConfig(SwitchboardConstants.NETWORK_DOMAIN, "global")) >= 0, - this.domainList); + this.crawlStacker = + new CrawlStacker( + this.robots, + this.crawlQueues, + this.crawler, + this.index, + this.peers, + "local.any".indexOf(this.getConfig(SwitchboardConstants.NETWORK_DOMAIN, "global")) >= 0, + "global.any".indexOf(this.getConfig(SwitchboardConstants.NETWORK_DOMAIN, "global")) >= 0, + this.domainList); } Domains.setNoLocalCheck(this.isAllIPMode()); // possibly switch off localIP check @@ -1628,7 +1603,7 @@ public final class Switchboard extends serverSwitch { this.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL); this.continueCrawlJob(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL); this.log - .info("SWITCH NETWORK: FINISHED START UP, new network is now '" + networkDefinition + "'."); + .info("SWITCH NETWORK: FINISHED START UP, new network is now '" + networkDefinition + "'."); // set the network-specific remote crawl ppm this.setRemotecrawlPPM(Math.max(1, (int) this.getConfigLong("network.unit.remotecrawl.speed", 60))); @@ -1762,12 +1737,12 @@ public final class Switchboard extends serverSwitch { final File messageDbFile = new File(this.workPath, "message.heap"); this.messageDB = new MessageBoard(messageDbFile); this.log.config("Loaded Message Board DB from file " - + messageDbFile.getName() - + ", " - + this.messageDB.size() - + " entries" - + ", " - + ppRamString(messageDbFile.length() / 1024)); + + messageDbFile.getName() + + ", " + + this.messageDB.size() + + " entries" + + ", " + + ppRamString(messageDbFile.length() / 1024)); } public void initWiki() throws IOException { @@ -1775,12 +1750,12 @@ public final class Switchboard extends serverSwitch { final File wikiDbFile = new File(this.workPath, "wiki.heap"); this.wikiDB = new WikiBoard(wikiDbFile, new File(this.workPath, "wiki-bkp.heap")); this.log.config("Loaded Wiki Board DB from file " - + wikiDbFile.getName() - + ", " - + this.wikiDB.size() - + " entries" - + ", " - + ppRamString(wikiDbFile.length() / 1024)); + + wikiDbFile.getName() + + ", " + + this.wikiDB.size() + + " entries" + + ", " + + ppRamString(wikiDbFile.length() / 1024)); } public void initBlog() throws IOException { @@ -1788,22 +1763,22 @@ public final class Switchboard extends serverSwitch { final File blogDbFile = new File(this.workPath, "blog.heap"); this.blogDB = new BlogBoard(blogDbFile); this.log.config("Loaded Blog DB from file " - + blogDbFile.getName() - + ", " - + this.blogDB.size() - + " entries" - + ", " - + ppRamString(blogDbFile.length() / 1024)); + + blogDbFile.getName() + + ", " + + this.blogDB.size() + + " entries" + + ", " + + ppRamString(blogDbFile.length() / 1024)); final File blogCommentDbFile = new File(this.workPath, "blogComment.heap"); this.blogCommentDB = new BlogBoardComments(blogCommentDbFile); this.log.config("Loaded Blog-Comment DB from file " - + blogCommentDbFile.getName() - + ", " - + this.blogCommentDB.size() - + " entries" - + ", " - + ppRamString(blogCommentDbFile.length() / 1024)); + + blogCommentDbFile.getName() + + ", " + + this.blogCommentDB.size() + + " entries" + + ", " + + ppRamString(blogCommentDbFile.length() / 1024)); } public void initBookmarks() throws IOException { @@ -1814,13 +1789,13 @@ public final class Switchboard extends serverSwitch { tagsFile.delete(); this.bookmarksDB = new BookmarksDB(bookmarksFile, datesFile); this.log.config("Loaded Bookmarks DB from files " - + bookmarksFile.getName() - + ", " - + tagsFile.getName()); + + bookmarksFile.getName() + + ", " + + tagsFile.getName()); this.log.config(this.bookmarksDB.tagsSize() - + " Tag, " - + this.bookmarksDB.bookmarksSize() - + " Bookmarks"); + + " Tag, " + + this.bookmarksDB.bookmarksSize() + + " Bookmarks"); } public static Switchboard getSwitchboard() { @@ -1857,17 +1832,17 @@ public final class Switchboard extends serverSwitch { // we need to take care that search requests and remote indexing requests go only // to the peers in the same cluster, if we run a robinson cluster. return (this.peers != null && this.peers.sizeConnected() == 0) - || (!this.getConfigBool(SwitchboardConstants.INDEX_DIST_ALLOW, false) && - !this.getConfigBool(SwitchboardConstants.INDEX_RECEIVE_ALLOW, false)); + || (!this.getConfigBool(SwitchboardConstants.INDEX_DIST_ALLOW, false) && + !this.getConfigBool(SwitchboardConstants.INDEX_RECEIVE_ALLOW, false)); } public boolean isPublicRobinson() { // robinson peers may be member of robinson clusters, which can be public or private // this does not check the robinson attribute, only the specific subtype of the cluster final String clustermode = - this.getConfig(SwitchboardConstants.CLUSTER_MODE, SwitchboardConstants.CLUSTER_MODE_PUBLIC_PEER); + this.getConfig(SwitchboardConstants.CLUSTER_MODE, SwitchboardConstants.CLUSTER_MODE_PUBLIC_PEER); return (clustermode.equals(SwitchboardConstants.CLUSTER_MODE_PUBLIC_CLUSTER)) - || (clustermode.equals(SwitchboardConstants.CLUSTER_MODE_PUBLIC_PEER)); + || (clustermode.equals(SwitchboardConstants.CLUSTER_MODE_PUBLIC_PEER)); } public boolean isInMyCluster(final String peer) { @@ -1882,7 +1857,7 @@ public final class Switchboard extends serverSwitch { return false; } final String clustermode = - this.getConfig(SwitchboardConstants.CLUSTER_MODE, SwitchboardConstants.CLUSTER_MODE_PUBLIC_PEER); + this.getConfig(SwitchboardConstants.CLUSTER_MODE, SwitchboardConstants.CLUSTER_MODE_PUBLIC_PEER); if ( clustermode.equals(SwitchboardConstants.CLUSTER_MODE_PUBLIC_CLUSTER) ) { // check if we got the request from a peer in the public cluster return this.clusterhashes.contains(ASCII.getBytes(peer)); @@ -1900,7 +1875,7 @@ public final class Switchboard extends serverSwitch { return false; } final String clustermode = - this.getConfig(SwitchboardConstants.CLUSTER_MODE, SwitchboardConstants.CLUSTER_MODE_PUBLIC_PEER); + this.getConfig(SwitchboardConstants.CLUSTER_MODE, SwitchboardConstants.CLUSTER_MODE_PUBLIC_PEER); if ( clustermode.equals(SwitchboardConstants.CLUSTER_MODE_PUBLIC_CLUSTER) ) { // check if we got the request from a peer in the public cluster return this.clusterhashes.contains(ASCII.getBytes(seed.hash)); @@ -1950,8 +1925,8 @@ public final class Switchboard extends serverSwitch { public RankingProfile getRanking() { return (this.getConfig(SwitchboardConstants.SEARCH_RANKING_RWI_PROFILE, "").isEmpty()) - ? new RankingProfile(Classification.ContentDomain.TEXT) - : new RankingProfile("", crypt.simpleDecode(this.getConfig(SwitchboardConstants.SEARCH_RANKING_RWI_PROFILE, ""))); + ? new RankingProfile(Classification.ContentDomain.TEXT) + : new RankingProfile("", crypt.simpleDecode(this.getConfig(SwitchboardConstants.SEARCH_RANKING_RWI_PROFILE, ""))); } /** @@ -1963,18 +1938,18 @@ public final class Switchboard extends serverSwitch { */ public String onlineCaution() { if ( System.currentTimeMillis() - this.proxyLastAccess < Integer.parseInt(this.getConfig( - SwitchboardConstants.PROXY_ONLINE_CAUTION_DELAY, - "100")) ) { + SwitchboardConstants.PROXY_ONLINE_CAUTION_DELAY, + "100")) ) { return "proxy"; } if ( System.currentTimeMillis() - this.localSearchLastAccess < Integer.parseInt(this.getConfig( - SwitchboardConstants.LOCALSEACH_ONLINE_CAUTION_DELAY, - "1000")) ) { + SwitchboardConstants.LOCALSEACH_ONLINE_CAUTION_DELAY, + "1000")) ) { return "localsearch"; } if ( System.currentTimeMillis() - this.remoteSearchLastAccess < Integer.parseInt(this.getConfig( - SwitchboardConstants.REMOTESEARCH_ONLINE_CAUTION_DELAY, - "500")) ) { + SwitchboardConstants.REMOTESEARCH_ONLINE_CAUTION_DELAY, + "500")) ) { return "remotesearch"; } return null; @@ -2025,13 +2000,13 @@ public final class Switchboard extends serverSwitch { */ public boolean cleanProfiles() throws InterruptedException { if (this.getIndexingProcessorsQueueSize() > 0 || - this.crawlQueues.activeWorkerEntries().size() > 0 || - this.crawlQueues.coreCrawlJobSize() > 0 || - this.crawlQueues.limitCrawlJobSize() > 0 || - this.crawlQueues.remoteTriggeredCrawlJobSize() > 0 || - this.crawlQueues.noloadCrawlJobSize() > 0 || - (this.crawlStacker != null && !this.crawlStacker.isEmpty()) || - !this.crawlQueues.noticeURL.isEmpty()) { + this.crawlQueues.activeWorkerEntries().size() > 0 || + this.crawlQueues.coreCrawlJobSize() > 0 || + this.crawlQueues.limitCrawlJobSize() > 0 || + this.crawlQueues.remoteTriggeredCrawlJobSize() > 0 || + this.crawlQueues.noloadCrawlJobSize() > 0 || + (this.crawlStacker != null && !this.crawlStacker.isEmpty()) || + !this.crawlQueues.noticeURL.isEmpty()) { return false; } return this.crawler.clear(); @@ -2059,7 +2034,7 @@ public final class Switchboard extends serverSwitch { if ( this.dhtDispatcher != null ) { this.dhtDispatcher.close(); } -// de.anomic.http.client.Client.closeAllConnections(); + // de.anomic.http.client.Client.closeAllConnections(); this.wikiDB.close(); this.blogDB.close(); this.blogCommentDB.close(); @@ -2158,9 +2133,9 @@ public final class Switchboard extends serverSwitch { } this.indexingDocumentProcessor.enQueue(new IndexingQueueEntry( - response, - null, - null)); + response, + null, + null)); return null; } @@ -2233,18 +2208,16 @@ public final class Switchboard extends serverSwitch { final String gzname = outfile.getName() + ".gz"; final File gzfile = new File(outfile.getParentFile(), gzname); try ( - /* Resources automatically closed by this try-with-resources statement */ - final FileOutputStream fileOutStream = new FileOutputStream(gzfile); - final OutputStream os = new BufferedOutputStream(new GZIPOutputStream(fileOutStream, 65536){{this.def.setLevel(Deflater.BEST_COMPRESSION);}}); - final FileInputStream fileInStream = new FileInputStream(outfile); - final BufferedInputStream bis = new BufferedInputStream(fileInStream); - ) { + /* Resources automatically closed by this try-with-resources statement */ + final FileOutputStream fileOutStream = new FileOutputStream(gzfile); + final OutputStream os = new BufferedOutputStream(new GZIPOutputStream(fileOutStream, 65536){{this.def.setLevel(Deflater.BEST_COMPRESSION);}}); + final FileInputStream fileInStream = new FileInputStream(outfile); + final BufferedInputStream bis = new BufferedInputStream(fileInStream); + ) { FileUtils.copy(bis, os); if ( gzfile.exists() ) { FileUtils.deletedelete(outfile); } - } catch (final FileNotFoundException e ) { - ConcurrentLog.logException(e); } catch (final IOException e ) { /* Catch but log any IO exception that can occur on copy, automatic closing or streams creation */ ConcurrentLog.logException(e); @@ -2488,18 +2461,18 @@ public final class Switchboard extends serverSwitch { final DCEntry entry = (DCEntry)surrogateObj; final Document document = entry.document(); final Request request = - new Request( - ASCII.getBytes(Switchboard.this.peers.mySeed().hash), - entry.getIdentifier(true), - null, - "", - entry.getDate(), - Switchboard.this.crawler.defaultSurrogateProfile.handle(), - 0, - Switchboard.this.crawler.defaultSurrogateProfile.timezoneOffset()); + new Request( + ASCII.getBytes(Switchboard.this.peers.mySeed().hash), + entry.getIdentifier(true), + null, + "", + entry.getDate(), + Switchboard.this.crawler.defaultSurrogateProfile.handle(), + 0, + Switchboard.this.crawler.defaultSurrogateProfile.timezoneOffset()); final Response response = new Response(request, null, null, Switchboard.this.crawler.defaultSurrogateProfile, false, null); final IndexingQueueEntry queueEntry = - new IndexingQueueEntry(response, new Document[] {document}, null); + new IndexingQueueEntry(response, new Document[] {document}, null); Switchboard.this.indexingCondensementProcessor.enQueue(queueEntry); } @@ -2526,12 +2499,12 @@ public final class Switchboard extends serverSwitch { int count = 0; for ( final String s : surrogatelist ) { if ( s.endsWith(".xml") - || s.endsWith(".xml.gz") - || s.endsWith(".xml.zip") - || s.endsWith(".warc") - || s.endsWith(".warc.gz") - || s.endsWith(".jsonlist") - || s.endsWith(".flatjson") ) { + || s.endsWith(".xml.gz") + || s.endsWith(".xml.zip") + || s.endsWith(".warc") + || s.endsWith(".warc.gz") + || s.endsWith(".jsonlist") + || s.endsWith(".flatjson") ) { count++; } if ( count >= 100 ) { @@ -2551,8 +2524,8 @@ public final class Switchboard extends serverSwitch { if ( cautionCause != null ) { if ( this.log.isFine() ) { this.log.fine("deQueue: online caution for " - + cautionCause - + ", omitting resource stack processing"); + + cautionCause + + ", omitting resource stack processing"); } return false; } @@ -2568,12 +2541,12 @@ public final class Switchboard extends serverSwitch { this.checkInterruption(); if ( surrogate.endsWith(".xml") - || surrogate.endsWith(".xml.gz") - || surrogate.endsWith(".xml.zip") - || surrogate.endsWith(".warc") - || surrogate.endsWith(".warc.gz") - || surrogate.endsWith(".jsonlist") - || surrogate.endsWith(".flatjson") ) { + || surrogate.endsWith(".xml.gz") + || surrogate.endsWith(".xml.zip") + || surrogate.endsWith(".warc") + || surrogate.endsWith(".warc.gz") + || surrogate.endsWith(".jsonlist") + || surrogate.endsWith(".flatjson") ) { // read the surrogate file and store entry in index if ( this.processSurrogate(surrogate) ) { return true; @@ -2794,8 +2767,8 @@ public final class Switchboard extends serverSwitch { if (this.crawlQueues.delegatedURL != null && (this.crawlQueues.delegatedURL.size() > 1000) ) { if ( this.log.isFine() ) { this.log.fine("Cleaning Delegated-URLs report stack, " - + this.crawlQueues.delegatedURL.size() - + " entries on stack"); + + this.crawlQueues.delegatedURL.size() + + " entries on stack"); } this.crawlQueues.delegatedURL.clear(); } @@ -2805,8 +2778,8 @@ public final class Switchboard extends serverSwitch { if ( (this.crawlQueues.errorURL.stackSize() > 1000) ) { if ( this.log.isFine() ) { this.log.fine("Cleaning Error-URLs report stack, " - + this.crawlQueues.errorURL.stackSize() - + " entries on stack"); + + this.crawlQueues.errorURL.stackSize() + + " entries on stack"); } this.crawlQueues.errorURL.clearStack(); } @@ -2817,9 +2790,9 @@ public final class Switchboard extends serverSwitch { if ( ResultURLs.getStackSize(origin) > 1000 ) { if ( this.log.isFine() ) { this.log.fine("Cleaning Loaded-URLs report stack, " - + ResultURLs.getStackSize(origin) - + " entries on stack " - + origin.getCode()); + + ResultURLs.getStackSize(origin) + + " entries on stack " + + origin.getCode()); } ResultURLs.clearStack(origin); } @@ -2830,8 +2803,8 @@ public final class Switchboard extends serverSwitch { try { if ( this.log.isFine() ) { this.log.fine("Cleaning Incoming News, " - + this.peers.newsPool.size(NewsPool.INCOMING_DB) - + " entries on stack"); + + this.peers.newsPool.size(NewsPool.INCOMING_DB) + + " entries on stack"); } this.peers.newsPool.automaticProcess(this.peers); } catch (final Exception e ) { @@ -2847,7 +2820,7 @@ public final class Switchboard extends serverSwitch { // clean up seed-dbs if ( this.getConfigBool("routing.deleteOldSeeds.permission", true) ) { final long deleteOldSeedsTime = - this.getConfigLong("routing.deleteOldSeeds.time", 7) * 24 * 3600000; + this.getConfigLong("routing.deleteOldSeeds.time", 7) * 24 * 3600000; Iterator e = this.peers.seedsSortedDisconnected(true, Seed.LASTSEEN); Seed seed = null; final List deleteQueue = new ArrayList<>(); @@ -2895,10 +2868,10 @@ public final class Switchboard extends serverSwitch { final boolean devenvironment = new File(this.getAppPath(), ".git").exists(); if ( devenvironment ) { this.log - .info("AUTO-UPDATE: omitting update because this is a development environment"); + .info("AUTO-UPDATE: omitting update because this is a development environment"); } else if ( (downloaded == null) || (!downloaded.exists()) || (downloaded.length() == 0) ) { this.log - .info("AUTO-UPDATE: omitting update because download failed (file cannot be found, is too small or signature is bad)"); + .info("AUTO-UPDATE: omitting update because download failed (file cannot be found, is too small or signature is bad)"); } else { if(yacyRelease.deployRelease(downloaded)) { this.terminate(10, "auto-update to install " + downloaded.getName()); @@ -2934,9 +2907,9 @@ public final class Switchboard extends serverSwitch { final Properties news = new Properties(); news.put("homepage", profile.get("homepage")); this.peers.newsPool.publishMyNews( - this.peers.mySeed(), - NewsPool.CATEGORY_PROFILE_BROADCAST, - news); + this.peers.mySeed(), + NewsPool.CATEGORY_PROFILE_BROADCAST, + news); } } @@ -3026,7 +2999,7 @@ public final class Switchboard extends serverSwitch { fulltext.optimize(opts); this.optimizeLastRun = System.currentTimeMillis(); } - */ + */ } // write statistics @@ -3112,8 +3085,6 @@ public final class Switchboard extends serverSwitch { Document[] documents = null; try { documents = this.parseDocument(in.queueEntry); - } catch (final InterruptedException e ) { - documents = null; } catch (final Exception e ) { documents = null; } @@ -3130,13 +3101,13 @@ public final class Switchboard extends serverSwitch { if ( this.log.isFine() ) { this.log.fine( - "processResourceStack processCase=" + processCase - + ", depth=" + response.depth() - + ", maxDepth=" + ((response.profile() == null) ? "null" : Integer.toString(response.profile().depth())) - + ", must-match=" + ((response.profile() == null) ? "null" : response.profile().formattedUrlMustMatchPattern()) - + ", must-not-match=" + ((response.profile() == null) ? "null" : response.profile().urlMustNotMatchPattern().toString()) - + ", initiatorHash=" + ((response.initiator() == null) ? "null" : ASCII.String(response.initiator())) - + ", url=" + response.url()); // DEBUG + "processResourceStack processCase=" + processCase + + ", depth=" + response.depth() + + ", maxDepth=" + ((response.profile() == null) ? "null" : Integer.toString(response.profile().depth())) + + ", must-match=" + ((response.profile() == null) ? "null" : response.profile().formattedUrlMustMatchPattern()) + + ", must-not-match=" + ((response.profile() == null) ? "null" : response.profile().urlMustNotMatchPattern().toString()) + + ", initiatorHash=" + ((response.initiator() == null) ? "null" : ASCII.String(response.initiator())) + + ", url=" + response.url()); // DEBUG } // PARSE CONTENT @@ -3160,13 +3131,13 @@ public final class Switchboard extends serverSwitch { if(response.profile().isIndexNonParseableUrls()) { /* Apply the generic parser add the URL as a simple link (no content metadata) to the index */ documents = TextParser.genericParseSource(new AnchorURL(response.url()), - response.getMimeType(), - response.getCharacterEncoding(), - response.profile().ignoreDivClassName(), - response.profile().scraper(), - response.profile().timezoneOffset(), - response.depth(), - response.getContent()); + response.getMimeType(), + response.getCharacterEncoding(), + response.profile().ignoreDivClassName(), + response.profile().scraper(), + response.profile().timezoneOffset(), + response.depth(), + response.getContent()); } else { this.log.warn("Resource '" + response.url().toNormalform(true) + "' is not supported. " + supportError); // create a new errorURL DB entry @@ -3222,19 +3193,19 @@ public final class Switchboard extends serverSwitch { // collect anchors within remaining documents if ((processCase == EventOrigin.PROXY_LOAD || processCase == EventOrigin.LOCAL_CRAWLING) && - ( - response.profile() == null || - response.depth() < response.profile().depth() || - response.profile().crawlerNoDepthLimitMatchPattern().matcher(response.url().toNormalform(true)).matches() - ) - ) { + ( + response.profile() == null || + response.depth() < response.profile().depth() || + response.profile().crawlerNoDepthLimitMatchPattern().matcher(response.url().toNormalform(true)).matches() + ) + ) { final Pattern crawlerOriginUrlMustMatch = response.profile().getCrawlerOriginUrlMustMatchPattern(); final Pattern crawlerOriginUrlMustNotMatch = response.profile().getCrawlerOriginUrlMustNotMatchPattern(); if (!(crawlerOriginUrlMustMatch == CrawlProfile.MATCH_ALL_PATTERN || crawlerOriginUrlMustMatch.matcher(response.url().toNormalform(true)).matches()) || (crawlerOriginUrlMustNotMatch != CrawlProfile.MATCH_NEVER_PATTERN - && crawlerOriginUrlMustNotMatch.matcher(response.url().toNormalform(true)).matches())) { + && crawlerOriginUrlMustNotMatch.matcher(response.url().toNormalform(true)).matches())) { if (this.log.isInfo()) { this.log.info("CRAWL: Ignored links from document at " + response.url().toNormalform(true) + " : prevented by regular expression on URL origin of links, " @@ -3292,9 +3263,9 @@ public final class Switchboard extends serverSwitch { nextUrl = nextEntry.getKey(); String u = nextUrl.toNormalform(true, true); if ( !(u.startsWith("http://") - || u.startsWith("https://") - || u.startsWith("ftp://") - || u.startsWith("smb://") || u.startsWith("file://")) ) { + || u.startsWith("https://") + || u.startsWith("ftp://") + || u.startsWith("smb://") || u.startsWith("file://")) ) { continue; } @@ -3311,14 +3282,14 @@ public final class Switchboard extends serverSwitch { final int nextdepth = nextEntry.getValue() != null && nextEntry.getValue().equals(Document.CANONICAL_MARKER) ? response.depth() : response.depth() + 1; // canonical documents are on the same depth try { this.crawlStacker.enqueueEntry(new Request( - response.initiator(), - new DigestURL(u), - response.url().hash(), - nextEntry.getValue(), - new Date(), - response.profile().handle(), - nextdepth, - response.profile().timezoneOffset())); + response.initiator(), + new DigestURL(u), + response.url().hash(), + nextEntry.getValue(), + new Date(), + response.profile().handle(), + nextdepth, + response.profile().timezoneOffset())); } catch (final MalformedURLException e ) { ConcurrentLog.logException(e); } @@ -3326,13 +3297,13 @@ public final class Switchboard extends serverSwitch { final long stackEndTime = System.currentTimeMillis(); if ( this.log.isInfo() ) { this.log.info("CRAWL: ADDED " - + hl.size() - + " LINKS FROM " - + response.url().toNormalform(true) - + ", STACKING TIME = " - + (stackEndTime - stackStartTime) - + ", PARSING TIME = " - + (parsingEndTime - parsingStartTime)); + + hl.size() + + " LINKS FROM " + + response.url().toNormalform(true) + + ", STACKING TIME = " + + (stackEndTime - stackStartTime) + + ", PARSING TIME = " + + (parsingEndTime - parsingStartTime)); } } } @@ -3368,7 +3339,7 @@ public final class Switchboard extends serverSwitch { } } if (!(profile.indexUrlMustMatchPattern() == CrawlProfile.MATCH_ALL_PATTERN || profile.indexUrlMustMatchPattern().matcher(urls).matches()) || - (profile.indexUrlMustNotMatchPattern() != CrawlProfile.MATCH_NEVER_PATTERN && profile.indexUrlMustNotMatchPattern().matcher(urls).matches())) { + (profile.indexUrlMustNotMatchPattern() != CrawlProfile.MATCH_NEVER_PATTERN && profile.indexUrlMustNotMatchPattern().matcher(urls).matches())) { if (this.log.isInfo()) this.log.info("Not Condensed Resource '" + urls + "': indexing prevented by regular expression on url; indexUrlMustMatchPattern = " + profile.indexUrlMustMatchPattern().pattern() + ", indexUrlMustNotMatchPattern = " + profile.indexUrlMustNotMatchPattern().pattern()); // create a new errorURL DB entry this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "indexing prevented by regular expression on url; indexUrlMustMatchPattern = " + profile.indexUrlMustMatchPattern().pattern() + ", indexUrlMustNotMatchPattern = " + profile.indexUrlMustNotMatchPattern().pattern(), -1); @@ -3385,7 +3356,7 @@ public final class Switchboard extends serverSwitch { continue docloop; } if (!(profile.indexContentMustMatchPattern() == CrawlProfile.MATCH_ALL_PATTERN || profile.indexContentMustMatchPattern().matcher(document.getTextString()).matches()) || - (profile.indexContentMustNotMatchPattern() != CrawlProfile.MATCH_NEVER_PATTERN && profile.indexContentMustNotMatchPattern().matcher(document.getTextString()).matches())) { + (profile.indexContentMustNotMatchPattern() != CrawlProfile.MATCH_NEVER_PATTERN && profile.indexContentMustNotMatchPattern().matcher(document.getTextString()).matches())) { if (this.log.isInfo()) this.log.info("Not Condensed Resource '" + urls + "': indexing prevented by regular expression on content; indexContentMustMatchPattern = " + profile.indexContentMustMatchPattern().pattern() + ", indexContentMustNotMatchPattern = " + profile.indexContentMustNotMatchPattern().pattern()); // create a new errorURL DB entry this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "indexing prevented by regular expression on content; indexContentMustMatchPattern = " + profile.indexContentMustMatchPattern().pattern() + ", indexContentMustNotMatchPattern = " + profile.indexContentMustNotMatchPattern().pattern(), -1); @@ -3398,13 +3369,13 @@ public final class Switchboard extends serverSwitch { if (!(mustMatchMediaType == CrawlProfile.MATCH_ALL_PATTERN || mustMatchMediaType.matcher(document.dc_format()).matches()) || (mustNotMatchMediaType != CrawlProfile.MATCH_NEVER_PATTERN - && mustNotMatchMediaType.matcher(document.dc_format()).matches())) { + && mustNotMatchMediaType.matcher(document.dc_format()).matches())) { final String failReason = new StringBuilder( "indexing prevented by regular expression on media type; indexContentMustMatchPattern = ") - .append(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH).append(" = ") - .append(mustMatchMediaType.pattern()).append(", ") - .append(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH).append(" = ") - .append(mustNotMatchMediaType.pattern()).toString(); + .append(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH).append(" = ") + .append(mustMatchMediaType.pattern()).append(", ") + .append(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH).append(" = ") + .append(mustNotMatchMediaType.pattern()).toString(); if (this.log.isInfo()) { this.log.info("Not Condensed Resource '" + urls + " : " + failReason); } @@ -3427,19 +3398,19 @@ public final class Switchboard extends serverSwitch { final Condenser[] condenser = new Condenser[in.documents.length]; for ( int i = 0; i < in.documents.length; i++ ) { condenser[i] = - new Condenser( - in.documents[i], in.queueEntry.profile().scraper(), in.queueEntry.profile().indexText(), - in.queueEntry.profile().indexMedia(), - LibraryProvider.dymLib, true, - this.index.fulltext().getDefaultConfiguration().contains(CollectionSchema.dates_in_content_dts), - profile.timezoneOffset()); + new Condenser( + in.documents[i], in.queueEntry.profile().scraper(), in.queueEntry.profile().indexText(), + in.queueEntry.profile().indexMedia(), + LibraryProvider.dymLib, true, + this.index.fulltext().getDefaultConfiguration().contains(CollectionSchema.dates_in_content_dts), + profile.timezoneOffset()); // update image result list statistics // its good to do this concurrently here, because it needs a DNS lookup // to compute a URL hash which is necessary for a double-check ResultImages.registerImages(in.queueEntry.url(), in.documents[i], (profile == null) - ? true - : !profile.remoteIndexing()); + ? true + : !profile.remoteIndexing()); } return new IndexingQueueEntry(in.queueEntry, in.documents, condenser); } @@ -3470,12 +3441,12 @@ public final class Switchboard extends serverSwitch { for ( int i = 0; i < in.documents.length; i++ ) { final CrawlProfile profile = in.queueEntry.profile(); this.storeDocumentIndex( - in.queueEntry, - in.queueEntry.profile().collections(), - in.documents[i], - in.condenser[i], - null, - profile == null ? "crawler" : profile.handle()); + in.queueEntry, + in.queueEntry.profile().collections(), + in.documents[i], + in.condenser[i], + null, + profile == null ? "crawler" : profile.handle()); } } in.queueEntry.updateStatus(Response.QUEUE_STATE_FINISHED); @@ -3491,12 +3462,12 @@ public final class Switchboard extends serverSwitch { * @param sourceName if this document was created by a crawl, then the sourceName contains the crawl hash */ private void storeDocumentIndex( - final Response queueEntry, - final Map collections, - final Document document, - final Condenser condenser, - final SearchEvent searchEvent, - final String sourceName) { + final Response queueEntry, + final Map collections, + final Document document, + final Condenser condenser, + final SearchEvent searchEvent, + final String sourceName) { //TODO: document must carry referer, size and last modified @@ -3524,9 +3495,9 @@ public final class Switchboard extends serverSwitch { //if (this.log.isInfo()) log.logInfo("Not Indexed Resource '" + queueEntry.url().toNormalform(false, true) + "': denied by profile rule, process case=" + processCase + ", profile name = " + queueEntry.profile().name()); // create a new errorURL DB entry this.crawlQueues.errorURL.push(url, queueEntry.depth(), profile, FailCategory.FINAL_LOAD_CONTEXT, "denied by profile rule, process case=" - + processCase - + ", profile name = " - + profile.collectionName(), -1); + + processCase + + ", profile name = " + + profile.collectionName(), -1); return; } @@ -3549,33 +3520,33 @@ public final class Switchboard extends serverSwitch { this.crawlQueues.errorURL.push(url, queueEntry.depth(), profile, FailCategory.FINAL_LOAD_CONTEXT, profileSolrFilterError + ", process case=" + processCase + ", profile name = " + profile.collectionName(), - -1); + -1); return; } // STORE WORD INDEX final SolrInputDocument newEntry = - this.index.storeDocument( - url, - profile, - queueEntry.getResponseHeader(), - document, - vector, - language, - condenser, - searchEvent, - sourceName, - this.getConfigBool(SwitchboardConstants.NETWORK_UNIT_DHT, false), - this.getConfigBool(SwitchboardConstants.PROXY_TRANSPARENT_PROXY, false) ? "http://127.0.0.1:" + sb.getConfigInt(SwitchboardConstants.SERVER_PORT, 8090) : null, - this.getConfig("crawler.http.acceptLanguage", null)); + this.index.storeDocument( + url, + profile, + queueEntry.getResponseHeader(), + document, + vector, + language, + condenser, + searchEvent, + sourceName, + this.getConfigBool(SwitchboardConstants.NETWORK_UNIT_DHT, false), + this.getConfigBool(SwitchboardConstants.PROXY_TRANSPARENT_PROXY, false) ? "http://127.0.0.1:" + sb.getConfigInt(SwitchboardConstants.SERVER_PORT, 8090) : null, + this.getConfig("crawler.http.acceptLanguage", null)); final RSSFeed feed = - EventChannel.channels(queueEntry.initiator() == null + EventChannel.channels(queueEntry.initiator() == null ? EventChannel.PROXY - : Base64Order.enhancedCoder.equal( - queueEntry.initiator(), - ASCII.getBytes(this.peers.mySeed().hash)) - ? EventChannel.LOCALINDEXING - : EventChannel.REMOTEINDEXING); + : Base64Order.enhancedCoder.equal( + queueEntry.initiator(), + ASCII.getBytes(this.peers.mySeed().hash)) + ? EventChannel.LOCALINDEXING + : EventChannel.REMOTEINDEXING); feed.addMessage(new RSSMessage("Indexed web page", dc_title, queueEntry.url(), ASCII.String(queueEntry.url().hash()))); if (this.getConfigBool(SwitchboardConstants.DECORATION_AUDIO, false)) Audio.Soundclip.newdoc.play(-20.0f); @@ -3595,12 +3566,12 @@ public final class Switchboard extends serverSwitch { // update url result list statistics ResultURLs.stack( - ASCII.String(url.hash()), // loaded url db entry - url.getHost(), - queueEntry.initiator(), // initiator peer hash - UTF8.getBytes(this.peers.mySeed().hash), // executor peer hash - processCase // process case - ); + ASCII.String(url.hash()), // loaded url db entry + url.getHost(), + queueEntry.initiator(), // initiator peer hash + UTF8.getBytes(this.peers.mySeed().hash), // executor peer hash + processCase // process case + ); // update profiling info if ( System.currentTimeMillis() - lastPPMUpdate > 20000 ) { @@ -3647,7 +3618,7 @@ public final class Switchboard extends serverSwitch { if ((indexFilterQuery != null && !indexFilterQuery.isEmpty() && !CrawlProfile.SOLR_MATCH_ALL_QUERY.equals(indexFilterQuery)) || (indexSolrQueryMustNotMatch != null - && !CrawlProfile.SOLR_EMPTY_QUERY.equals(indexSolrQueryMustNotMatch))) { + && !CrawlProfile.SOLR_EMPTY_QUERY.equals(indexSolrQueryMustNotMatch))) { final EmbeddedInstance embeddedSolr = this.index.fulltext().getEmbeddedInstance(); final SolrCore embeddedCore = embeddedSolr != null ? embeddedSolr.getDefaultCore() : null; final boolean embeddedSolrConnected = embeddedSolr != null && embeddedCore != null; @@ -3687,12 +3658,12 @@ public final class Switchboard extends serverSwitch { } public final void addAllToIndex( - final DigestURL url, - final Map links, - final SearchEvent searchEvent, - final String heuristicName, - final Map collections, - final boolean doublecheck) { + final DigestURL url, + final Map links, + final SearchEvent searchEvent, + final String heuristicName, + final Map collections, + final boolean doublecheck) { final List urls = new ArrayList<>(); // add the landing page to the index. should not load that again since it should be in the cache @@ -3948,7 +3919,7 @@ public final class Switchboard extends serverSwitch { continue; } requests.add(request); - } + } new Thread() { @Override @@ -3982,12 +3953,12 @@ public final class Switchboard extends serverSwitch { ResultImages.registerImages(url, document, true); Switchboard.this.webStructure.generateCitationReference(url, document); Switchboard.this.storeDocumentIndex( - response, - collections, - document, - condenser, - searchEvent, - "heuristic:" + heuristicName); + response, + collections, + document, + condenser, + searchEvent, + "heuristic:" + heuristicName); Switchboard.this.log.info("addToIndex fill of url " + urlName + " finished"); } } @@ -4001,7 +3972,7 @@ public final class Switchboard extends serverSwitch { }.start(); } - /** + /** * add urls to Crawler - which itself loads the URL, parses the content and adds it to the index * transparent alternative to "addToIndex" including, double in crawler check, display in crawl monitor * but doesn't return results for a ongoing search @@ -4054,22 +4025,22 @@ public final class Switchboard extends serverSwitch { ""); if ( response == null ) { Switchboard.this.log.info("Sending crawl receipt for '" - + this.reference.url().toNormalform(true) - + "' to " - + this.initiatorPeer.getName() - + " FAILED, send time = " - + (System.currentTimeMillis() - t)); + + this.reference.url().toNormalform(true) + + "' to " + + this.initiatorPeer.getName() + + " FAILED, send time = " + + (System.currentTimeMillis() - t)); return; } final String delay = response.get("delay"); Switchboard.this.log.info("Sending crawl receipt for '" - + this.reference.url().toNormalform(true) - + "' to " - + this.initiatorPeer.getName() - + " success, delay = " - + delay - + ", send time = " - + (System.currentTimeMillis() - t)); + + this.reference.url().toNormalform(true) + + "' to " + + this.initiatorPeer.getName() + + " success, delay = " + + delay + + ", send time = " + + (System.currentTimeMillis() - t)); } } @@ -4098,7 +4069,7 @@ public final class Switchboard extends serverSwitch { // TODO: same is true for credential checks below (at least with BASIC auth -> login should expire at least on restart if (requestHeader.isUserInRole(UserDB.AccessRight.ADMIN_RIGHT.toString())) { if (this.adminAuthenticationLastAccess + 60000 > System.currentTimeMillis()) // 1 minute - return 4; // hard-authenticated, quick return + return 4; // hard-authenticated, quick return } // authorization in case that there is no account stored @@ -4206,19 +4177,19 @@ public final class Switchboard extends serverSwitch { public boolean verifyAuthentication(final RequestHeader header) { // handle access rights switch ( this.adminAuthenticated(header) ) { - case 0: // wrong password given - //try { Thread.sleep(3000); } catch (final InterruptedException e) { } // prevent brute-force - return false; - case 1: // no password given - return false; - case 2: // no password stored - return true; - case 3: // soft-authenticated for localhost only - return true; - case 4: // hard-authenticated, all ok - return true; - default: - return false; + case 0: // wrong password given + //try { Thread.sleep(3000); } catch (final InterruptedException e) { } // prevent brute-force + return false; + case 1: // no password given + return false; + case 2: // no password stored + return true; + case 3: // soft-authenticated for localhost only + return true; + case 4: // hard-authenticated, all ok + return true; + default: + return false; } } @@ -4248,19 +4219,19 @@ public final class Switchboard extends serverSwitch { final Segment indexSegment = this.index; if ( indexSegment.RWICount() < 100 ) { return "no DHT distribution: not enough words - wordIndex.size() = " - + indexSegment.RWICount(); + + indexSegment.RWICount(); } if ( (this.getConfig(SwitchboardConstants.INDEX_DIST_ALLOW_WHILE_CRAWLING, "false").equalsIgnoreCase("false")) && (!this.crawlQueues.noticeURL.isEmptyLocal()) ) { return "no DHT distribution: crawl in progress: noticeURL.stackSize() = " - + this.crawlQueues.noticeURL.size() - + ", sbQueue.size() = " - + this.getIndexingProcessorsQueueSize(); + + this.crawlQueues.noticeURL.size() + + ", sbQueue.size() = " + + this.getIndexingProcessorsQueueSize(); } if ( (this.getConfig(SwitchboardConstants.INDEX_DIST_ALLOW_WHILE_INDEXING, "false").equalsIgnoreCase("false")) && (this.getIndexingProcessorsQueueSize() > 1) ) { return "no DHT distribution: indexing in progress: noticeURL.stackSize() = " - + this.crawlQueues.noticeURL.size() - + ", sbQueue.size() = " - + this.getIndexingProcessorsQueueSize(); + + this.crawlQueues.noticeURL.size() + + ", sbQueue.size() = " + + this.getIndexingProcessorsQueueSize(); } return null; // this means; yes, please do dht transfer @@ -4282,16 +4253,16 @@ public final class Switchboard extends serverSwitch { // accumulate RWIs to transmission buffer if ( this.dhtDispatcher.bufferSize() > this.peers.scheme.verticalPartitions() ) { this.log.info("dhtTransferJob: no selection, too many entries in transmission buffer: " - + this.dhtDispatcher.bufferSize()); + + this.dhtDispatcher.bufferSize()); } else if ( MemoryControl.available() < 1024 * 1024 * 25 ) { this.log.info("dhtTransferJob: no selection, too less memory available : " - + (MemoryControl.available() / 1024 / 1024) - + " MB"); + + (MemoryControl.available() / 1024 / 1024) + + " MB"); } else if ( ConnectionInfo.getLoadPercent() > 50 ) { this.log.info("dhtTransferJob: too many connections in httpc pool : " - + ConnectionInfo.getCount()); + + ConnectionInfo.getCount()); // close unused connections -// Client.cleanup(); + // Client.cleanup(); } else if ( kbytesUp > 128 ) { this.log.info("dhtTransferJob: too much upload(1), currently uploading: " + kbytesUp + " Kb"); } else { @@ -4312,12 +4283,12 @@ public final class Switchboard extends serverSwitch { this.log.info("dhtTransferJob: selected " + ASCII.String(startHash) + " as start hash"); this.log.info("dhtTransferJob: selected " + ASCII.String(limitHash) + " as limit hash"); final boolean enqueued = - this.dhtDispatcher.selectContainersEnqueueToBuffer( - startHash, - limitHash, - dhtMaxContainerCount, - this.dhtMaxReferenceCount, - 5000); + this.dhtDispatcher.selectContainersEnqueueToBuffer( + startHash, + limitHash, + dhtMaxContainerCount, + this.dhtMaxReferenceCount, + 5000); hasDoneSomething = hasDoneSomething | enqueued; this.log.info("dhtTransferJob: result from enqueueing: " + ((enqueued) ? "true" : "false")); } @@ -4325,13 +4296,13 @@ public final class Switchboard extends serverSwitch { // check if we can deliver entries to other peers if ( this.dhtDispatcher.transmissionSize() >= 10 ) { this.log - .info("dhtTransferJob: no dequeueing from buffer to transmission: too many concurrent sessions: " + .info("dhtTransferJob: no dequeueing from buffer to transmission: too many concurrent sessions: " + this.dhtDispatcher.transmissionSize()); } else if ( ConnectionInfo.getLoadPercent() > 75 ) { this.log.info("dhtTransferJob: too many connections in httpc pool : " - + ConnectionInfo.getCount()); + + ConnectionInfo.getCount()); // close unused connections -// Client.cleanup(); + // Client.cleanup(); } else if ( kbytesUp > 256 ) { this.log.info("dhtTransferJob: too much upload(2), currently uploading: " + kbytesUp + " Kb"); } else { @@ -4452,9 +4423,9 @@ public final class Switchboard extends serverSwitch { */ @Deprecated // not used (since 2015-01-18, v1.81) public final void heuristicRSS( - final String urlpattern, - final SearchEvent searchEvent, - final String feedName) { + final String urlpattern, + final SearchEvent searchEvent, + final String feedName) { new Thread("heuristicRSS:" + feedName) { @Override @@ -4472,7 +4443,7 @@ public final class Switchboard extends serverSwitch { searchEvent.oneFeederStarted(); try { final Response response = - Switchboard.this.loader.load(Switchboard.this.loader.request(url, true, false), CacheStrategy.NOCACHE, BlacklistType.SEARCH, ClientIdentification.yacyIntranetCrawlerAgent); + Switchboard.this.loader.load(Switchboard.this.loader.request(url, true, false), CacheStrategy.NOCACHE, BlacklistType.SEARCH, ClientIdentification.yacyIntranetCrawlerAgent); final byte[] resource = (response == null) ? null : response.getContent(); rss = resource == null ? null : RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, resource); if ( rss != null ) { @@ -4487,10 +4458,10 @@ public final class Switchboard extends serverSwitch { } ConcurrentLog.info("heuristicRSS", "Heuristic: adding " - + links.size() - + " links from '" - + feedName - + "' rss feed"); + + links.size() + + " links from '" + + feedName + + "' rss feed"); // add all pages to the index Switchboard.this.addAllToIndex(null, links, searchEvent, feedName, CrawlProfile.collectionParser("rss"), true); } diff --git a/source/net/yacy/search/query/SearchEvent.java b/source/net/yacy/search/query/SearchEvent.java index b64223300..afb1ee60d 100644 --- a/source/net/yacy/search/query/SearchEvent.java +++ b/source/net/yacy/search/query/SearchEvent.java @@ -51,7 +51,6 @@ import java.util.regex.Pattern; import org.apache.solr.common.SolrDocument; -import net.yacy.contentcontrol.ContentControlFilterUpdateThread; import net.yacy.cora.date.ISO8601Formatter; import net.yacy.cora.document.analysis.Classification; import net.yacy.cora.document.analysis.Classification.ContentDomain; @@ -95,7 +94,6 @@ import net.yacy.peers.RemoteSearch; import net.yacy.peers.SeedDB; import net.yacy.peers.graphics.ProfilingGraph; import net.yacy.repository.Blacklist.BlacklistType; -import net.yacy.repository.FilterEngine; import net.yacy.repository.LoaderDispatcher; import net.yacy.search.EventTracker; import net.yacy.search.Switchboard; @@ -110,10 +108,10 @@ import net.yacy.search.snippet.TextSnippet; import net.yacy.search.snippet.TextSnippet.ResultClass; public final class SearchEvent implements ScoreMapUpdatesListener { - - /** Supported protocols to be displayed in the protocol navigator. - * (Using here a single String constant is faster than a unmodifiable Set instance) */ - private static final String PROTOCOL_NAVIGATOR_SUPPORTED_VALUES = "http,https,smb,ftp,file"; + + /** Supported protocols to be displayed in the protocol navigator. + * (Using here a single String constant is faster than a unmodifiable Set instance) */ + private static final String PROTOCOL_NAVIGATOR_SUPPORTED_VALUES = "http,https,smb,ftp,file"; private static final int max_results_rwi = 3000; private static final int max_results_node = 150; @@ -126,12 +124,12 @@ public final class SearchEvent implements ScoreMapUpdatesListener { } catch (final ParseException e) { } } - */ + */ public final static ConcurrentLog log = new ConcurrentLog("SEARCH"); public static final int SNIPPET_MAX_LENGTH = 220; - + /** Default count of words for topicnavigagtor */ private static final int MAX_TOPWORDS = 12; @@ -150,250 +148,250 @@ public final class SearchEvent implements ScoreMapUpdatesListener { private byte[] IAmaxcounthash, IAneardhthash; public Thread rwiProcess; public Thread localsolrsearch; - + /** Offset of the next local Solr index request * Example : last local request with offset=10 and itemsPerPage=20, sets this attribute to 30. */ private int localsolroffset; - + /** counter for referenced that had been sorted out for other reasons */ private final AtomicInteger expectedRemoteReferences, maxExpectedRemoteReferences; /** a counter for the appearance of location coordinates .*/ public final ScoreMap locationNavigator; - + /** a counter for protocol types */ public final ScoreMap protocolNavigator; - + /** a counter for file types */ public final ConcurrentScoreMap dateNavigator; - + /** counters for Vocabularies; key is metatag.getVocabularyName() */ public final Map> vocabularyNavigator; - + /** if 0 no topicNavigator, holds expected number of terms for the topicNavigator */ private final int topicNavigatorCount; - + /** map of search custom/configured search navigators in addition to above standard navigators (which use special handling or display forms) */ public final Map navigatorPlugins; - - /** Holds the total number of successful write operations performed on all the active navigators since their initialization. */ - private final AtomicLong navGeneration = new AtomicLong(); - + + /** Holds the total number of successful write operations performed on all the active navigators since their initialization. */ + private final AtomicLong navGeneration = new AtomicLong(); + private final LoaderDispatcher loader; - + /** a set of word hashes that are used to match with the snippets */ private final HandleSet snippetFetchWordHashes; /** a set of words that are used to match with the snippets */ private final Set snippetFetchWords; private final boolean deleteIfSnippetFail; - private long urlRetrievalAllTime; - private long snippetComputationAllTime; - private ConcurrentHashMap> snippets; + private final long urlRetrievalAllTime; + private final long snippetComputationAllTime; + private final ConcurrentHashMap> snippets; private final boolean remote; - + /** add received results to local index (defult=true) */ public final boolean addResultsToLocalIndex; - + /** Maximum size allowed (in kbytes) for a remote document result to be stored to local index */ private long remoteStoredDocMaxSize; private SortedMap> localSearchInclusion; - + /** reference score computation for the commonSense heuristic */ private final ScoreMap ref; private final long maxtime; - + /** key = domhash (6 bytes); value = like stack */ private final ConcurrentHashMap> doubleDomCache; - + /** flag counter */ private final int[] flagcount; private final AtomicInteger feedersAlive, feedersTerminated, snippetFetchAlive; private boolean addRunning; private final AtomicInteger receivedRemoteReferences; private final ReferenceOrder order; - + /** map for double-check; String/Long relation, addresses ranking number (backreference for deletion) */ private final HandleSet urlhashes; - + /** a map from tagging vocabulary names to tagging predicate uris */ private final Map taggingPredicates; - + /** thats the bag where the RWI search process writes to. Contains both references from both local and remote RWIs. */ private final WeakPriorityBlockingQueue rwiStack; - + /** thats the bag where the solr results are written to */ private final WeakPriorityBlockingQueue nodeStack; - + /** thats the result list where the actual search result is waiting to be displayed */ private final WeakPriorityBlockingQueue resultList; - + /** if this is true, then every entry in result List is polled immediately to prevent a re-ranking in the resultList. This is usefull if there is only one index source. */ private final boolean pollImmediately; public final boolean excludeintext_image; - + // the following values are filled during the search process as statistics for the search // In the next comments "filtering" is doubles checking and applying eventual search query constraints/modifiers - + /** the number of hits generated/ranked by the local search in rwi index, after filtering */ public final AtomicInteger local_rwi_available; - + /** the number of existing hits by the local search in rwi index, before any supplementary filtering */ public final AtomicInteger local_rwi_stored; - + /** the number of hits imported from remote peers (rwi/solr mixed + eventual site heuristics), after filtering */ public final AtomicInteger remote_rwi_available; - + /** the number of existing hits at remote sites, before any filtering */ public final AtomicInteger remote_rwi_stored; - + /** the number of peers which contributed to the remote search result */ public final AtomicInteger remote_rwi_peerCount; - + /** The number of results evicted from local solr results after filtering up to the current query offset */ public final AtomicInteger local_solr_evicted; - + /** the total number of existing hits by the local search in solr, before any supplementary filtering */ public final AtomicInteger local_solr_stored; - + /** the number of hits imported from remote peers (rwi/solr mixed), after filtering */ public final AtomicInteger remote_solr_available; - + /** the number of existing hits at remote site (rwi/solr mixed)*/ public final AtomicInteger remote_solr_stored; - + /** the number of peers which contributed to the remote search result */ public final AtomicInteger remote_solr_peerCount; - + /** Ensure only one {@link #resortCachedResults()} operation to be performed on this search event */ public final Semaphore resortCacheAllowed; - /** - * Called when a search navigator has been updated : update the overall - * navGeneration counter to help then tracking changes and eventually refresh the yacysearchtrailer. - */ - @Override - public void updatedScoreMap() { - this.navGeneration.incrementAndGet(); - } - /** - * @return the total number of results currently available and filtered (checking doubles and eventual query constraints/modifiers) from the different data sources + * Called when a search navigator has been updated : update the overall + * navGeneration counter to help then tracking changes and eventually refresh the yacysearchtrailer. + */ + @Override + public void updatedScoreMap() { + this.navGeneration.incrementAndGet(); + } + + /** + * @return the total number of results currently available and filtered (checking doubles and eventual query constraints/modifiers) from the different data sources */ public int getResultCount() { return Math.max( this.local_rwi_available.get() + this.remote_rwi_available.get() + this.remote_solr_available.get() + Math.max(0, this.local_solr_stored.get() - this.local_solr_evicted.get()), - imageViewed.size() + sizeSpare() - ); + this.imageViewed.size() + sizeSpare() + ); } - + /** * @return the total number of successful write operations performed on all the active navigators since their initialization. */ public long getNavGeneration() { - return this.navGeneration.get(); - } - + return this.navGeneration.get(); + } + /** * Set maximum size allowed (in kbytes) for a remote document result to be stored to local index. - * @param maxSize document content max size in kbytes. Zero or negative value means no limit. + * @param maxSize document content max size in kbytes. Zero or negative value means no limit. */ public void setRemoteDocStoredMaxSize(long maxSize) { - this.remoteStoredDocMaxSize = maxSize; + this.remoteStoredDocMaxSize = maxSize; } - + /** * @return maximum size allowed (in kbytes) for a remote document result to be stored to local index. - * Zero or negative value means no limit. + * Zero or negative value means no limit. */ public long getRemoteDocStoredMaxSize() { - return this.remoteStoredDocMaxSize; + return this.remoteStoredDocMaxSize; } - + protected SearchEvent( - final QueryParams query, - final SeedDB peers, - final WorkTables workTables, - final SortedSet preselectedPeerHashes, - final boolean generateAbstracts, - final LoaderDispatcher loader, - final int remote_maxcount, - final long remote_maxtime, - final boolean deleteIfSnippetFail, - final boolean addResultsToLocalIdx) { + final QueryParams query, + final SeedDB peers, + final WorkTables workTables, + final SortedSet preselectedPeerHashes, + final boolean generateAbstracts, + final LoaderDispatcher loader, + final int remote_maxcount, + final long remote_maxtime, + final boolean deleteIfSnippetFail, + final boolean addResultsToLocalIdx) { long ab = MemoryControl.available(); if (ab < 1024 * 1024 * 200) { - int eb = SearchEventCache.size(); + final int eb = SearchEventCache.size(); SearchEventCache.cleanupEvents(false); - int en = SearchEventCache.size(); + final int en = SearchEventCache.size(); if (en < eb) { log.info("Cleaned up search event cache (1) " + eb + "->" + en + ", " + (ab - MemoryControl.available()) / 1024 / 1024 + " MB freed"); } } ab = MemoryControl.available(); - int eb = SearchEventCache.size(); + final int eb = SearchEventCache.size(); SearchEventCache.cleanupEvents(Math.max(1, (int) (MemoryControl.available() / (1024 * 1024 * 120)))); - int en = SearchEventCache.size(); + final int en = SearchEventCache.size(); if (en < eb) { log.info("Cleaned up search event cache (2) " + eb + "->" + en + ", " + (ab - MemoryControl.available()) / 1024 / 1024 + " MB freed"); } - + this.eventTime = System.currentTimeMillis(); // for lifetime check this.peers = peers; this.workTables = workTables; this.query = query; if(query != null) { - /* Image counter will eventually grow up faster than offset, but must start first with the same value as query offset */ - this.imagePageCounter = query.offset; + /* Image counter will eventually grow up faster than offset, but must start first with the same value as query offset */ + this.imagePageCounter = query.offset; } this.loader = loader; - this.nodeStack = new WeakPriorityBlockingQueue(max_results_node, false); + this.nodeStack = new WeakPriorityBlockingQueue<>(max_results_node, false); this.maxExpectedRemoteReferences = new AtomicInteger(0); this.expectedRemoteReferences = new AtomicInteger(0); this.excludeintext_image = Switchboard.getSwitchboard().getConfigBool("search.excludeintext.image", true); - + // prepare configured search navigation final Set navConfigs = Switchboard.getSwitchboard().getConfigSet("search.navigation"); - + boolean locationNavEnabled = false; boolean protocolNavEnabled = false; boolean topicsNavEnabled = false; boolean dateNavEnabled = false; for(final String navConfig : navConfigs) { - final String navName = NavigatorPlugins.getNavName(navConfig); - if("location".equals(navName)) { - locationNavEnabled = true; - } else if("protocol".equals(navName)) { - protocolNavEnabled = true; - } else if("topics".equals(navName)) { - topicsNavEnabled = true; - } else if("date".equals(navName)) { - dateNavEnabled = true; - } - } - + final String navName = NavigatorPlugins.getNavName(navConfig); + if("location".equals(navName)) { + locationNavEnabled = true; + } else if("protocol".equals(navName)) { + protocolNavEnabled = true; + } else if("topics".equals(navName)) { + topicsNavEnabled = true; + } else if("date".equals(navName)) { + dateNavEnabled = true; + } + } + this.locationNavigator = locationNavEnabled ? new ConcurrentScoreMap<>(this) : null; this.protocolNavigator = protocolNavEnabled ? new ConcurrentScoreMap<>(this) : null; this.dateNavigator = dateNavEnabled ? new ConcurrentScoreMap<>(this) : null; this.topicNavigatorCount = topicsNavEnabled ? MAX_TOPWORDS : 0; - this.vocabularyNavigator = new TreeMap>(); + this.vocabularyNavigator = new TreeMap<>(); // prepare configured search navigation (plugins) this.navigatorPlugins = NavigatorPlugins.initFromCfgStrings(navConfigs); if(this.navigatorPlugins != null) { - for(final Navigator nav : this.navigatorPlugins.values()) { - nav.setUpdatesListener(this); - } + for(final Navigator nav : this.navigatorPlugins.values()) { + nav.setUpdatesListener(this); + } } - this.snippets = new ConcurrentHashMap>(); + this.snippets = new ConcurrentHashMap<>(); this.secondarySearchSuperviser = (this.query.getQueryGoal().getIncludeHashes().size() > 1) ? new SecondarySearchSuperviser(this) : null; // generate abstracts only for combined searches if (this.secondarySearchSuperviser != null) this.secondarySearchSuperviser.start(); this.secondarySearchThreads = null; this.preselectedPeerHashes = preselectedPeerHashes; - this.IAResults = new TreeMap(Base64Order.enhancedCoder); - this.IACount = new TreeMap(Base64Order.enhancedCoder); - this.heuristics = new TreeMap(Base64Order.enhancedCoder); + this.IAResults = new TreeMap<>(Base64Order.enhancedCoder); + this.IACount = new TreeMap<>(Base64Order.enhancedCoder); + this.heuristics = new TreeMap<>(Base64Order.enhancedCoder); this.IAmaxcounthash = null; this.IAneardhthash = null; this.remote = (peers != null && peers.sizeConnected() > 0) && (this.query.domType == QueryParams.Searchdom.CLUSTER || (this.query.domType == QueryParams.Searchdom.GLOBAL && Switchboard.getSwitchboard().getConfigBool(SwitchboardConstants.INDEX_RECEIVE_ALLOW_SEARCH, false))); @@ -415,15 +413,15 @@ public final class SearchEvent implements ScoreMapUpdatesListener { // do a soft commit for fresh results //query.getSegment().fulltext().commit(true); - + // we collect the urlhashes and construct a list with urlEntry objects // attention: if minEntries is too high, this method will not terminate within the maxTime // sortorder: 0 = hash, 1 = url, 2 = ranking this.localSearchInclusion = null; - this.ref = new ConcurrentScoreMap(this); + this.ref = new ConcurrentScoreMap<>(this); this.maxtime = query.maxtime; - this.rwiStack = new WeakPriorityBlockingQueue(max_results_rwi, false); - this.doubleDomCache = new ConcurrentHashMap>(); + this.rwiStack = new WeakPriorityBlockingQueue<>(max_results_rwi, false); + this.doubleDomCache = new ConcurrentHashMap<>(); this.flagcount = new int[32]; for ( int i = 0; i < 32; i++ ) { this.flagcount[i] = 0; @@ -435,26 +433,26 @@ public final class SearchEvent implements ScoreMapUpdatesListener { this.receivedRemoteReferences = new AtomicInteger(0); this.order = new ReferenceOrder(this.query.ranking, this.query.targetlang); this.urlhashes = new RowHandleSet(Word.commonHashLength, Word.commonHashOrder, 100); - this.taggingPredicates = new HashMap(); - for (Tagging t: LibraryProvider.autotagging.getVocabularies()) { + this.taggingPredicates = new HashMap<>(); + for (final Tagging t: LibraryProvider.autotagging.getVocabularies()) { this.taggingPredicates.put(t.getName(), t.getPredicate()); } // start a local solr search if (!Switchboard.getSwitchboard().getConfigBool(SwitchboardConstants.DEBUG_SEARCH_LOCAL_SOLR_OFF, false)) { - final boolean useSolrFacets = true; - this.localsolrsearch = RemoteSearch.solrRemoteSearch(this, - this.query.solrQuery(this.query.contentdom, this.query.isStrictContentDom(), useSolrFacets, this.excludeintext_image), this.query.offset, - this.query.itemsPerPage, null /* this peer */, 0, Switchboard.urlBlacklist, useSolrFacets, true); + final boolean useSolrFacets = true; + this.localsolrsearch = RemoteSearch.solrRemoteSearch(this, + this.query.solrQuery(this.query.contentdom, this.query.isStrictContentDom(), useSolrFacets, this.excludeintext_image), this.query.offset, + this.query.itemsPerPage, null /* this peer */, 0, Switchboard.urlBlacklist, useSolrFacets, true); } this.localsolroffset = this.query.offset + this.query.itemsPerPage; - + // start a local RWI search concurrently this.rwiProcess = null; if (query.getSegment().connectedRWI() && !Switchboard.getSwitchboard().getConfigBool(SwitchboardConstants.DEBUG_SEARCH_LOCAL_DHT_OFF, false)) { // we start the local search only if this peer is doing a remote search or when it is doing a local search and the peer is old - rwiProcess = new RWIProcess(this.localsolrsearch); - rwiProcess.start(); + this.rwiProcess = new RWIProcess(this.localsolrsearch); + this.rwiProcess.start(); } if (this.remote) { @@ -465,8 +463,8 @@ public final class SearchEvent implements ScoreMapUpdatesListener { this.primarySearchThreadsL = null; this.nodeSearchThreads = null; } else { - this.primarySearchThreadsL = new ArrayList(); - this.nodeSearchThreads = new ArrayList(); + this.primarySearchThreadsL = new ArrayList<>(); + this.nodeSearchThreads = new ArrayList<>(); // start this concurrently because the remote search needs an enumeration // of the remote peers which may block in some cases when i.e. DHT is active // at the same time. @@ -474,20 +472,20 @@ public final class SearchEvent implements ScoreMapUpdatesListener { @Override public void run() { RemoteSearch.primaryRemoteSearches( - SearchEvent.this, - 0, remote_maxcount, - remote_maxtime, - Switchboard.urlBlacklist, - (SearchEvent.this.query.domType == QueryParams.Searchdom.GLOBAL) ? null : preselectedPeerHashes); + SearchEvent.this, + 0, remote_maxcount, + remote_maxtime, + Switchboard.urlBlacklist, + (SearchEvent.this.query.domType == QueryParams.Searchdom.GLOBAL) ? null : preselectedPeerHashes); } }.start(); } if ( this.primarySearchThreadsL != null ) { ConcurrentLog.fine("SEARCH_EVENT", "STARTING " - + this.primarySearchThreadsL.size() - + " THREADS TO CATCH EACH " - + remote_maxcount - + " URLs"); + + this.primarySearchThreadsL.size() + + " THREADS TO CATCH EACH " + + remote_maxcount + + " URLs"); EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.EventSearch(this.query.id(true), SearchEventType.REMOTESEARCH_START, "", this.primarySearchThreadsL.size(), System.currentTimeMillis() - timer), false); // finished searching ConcurrentLog.fine("SEARCH_EVENT", "SEARCH TIME AFTER GLOBAL-TRIGGER TO " + this.primarySearchThreadsL.size() + " PEERS: " + ((System.currentTimeMillis() - start) / 1000) + " seconds"); @@ -502,7 +500,7 @@ public final class SearchEvent implements ScoreMapUpdatesListener { if ( generateAbstracts ) { // we need the results now try { - if (rwiProcess != null && query.getSegment().connectedRWI()) rwiProcess.join(); + if (this.rwiProcess != null && query.getSegment().connectedRWI()) this.rwiProcess.join(); } catch (final Throwable e ) { } // compute index abstracts @@ -535,7 +533,7 @@ public final class SearchEvent implements ScoreMapUpdatesListener { // give process time to accumulate a certain amount of data // before a reading process wants to get results from it try { - if (rwiProcess != null && query.getSegment().connectedRWI() && rwiProcess.isAlive()) rwiProcess.join(100); + if (this.rwiProcess != null && query.getSegment().connectedRWI() && this.rwiProcess.isAlive()) this.rwiProcess.join(100); } catch (final Throwable e ) { } // this will reduce the maximum waiting time until results are available to 100 milliseconds @@ -547,14 +545,14 @@ public final class SearchEvent implements ScoreMapUpdatesListener { this.deleteIfSnippetFail = deleteIfSnippetFail; this.urlRetrievalAllTime = 0; this.snippetComputationAllTime = 0; - this.resultList = new WeakPriorityBlockingQueue(Math.max(max_results_node, 10 * query.itemsPerPage()), true); // this is the result, enriched with snippets, ranked and ordered by ranking + this.resultList = new WeakPriorityBlockingQueue<>(Math.max(max_results_node, 10 * query.itemsPerPage()), true); // this is the result, enriched with snippets, ranked and ordered by ranking // snippets do not need to match with the complete query hashes, - // only with the query minus the stopwords which had not been used for the search + // only with the query minus the stopwords which had not been used for the search boolean filtered = false; // check if query contains stopword if (Switchboard.stopwordHashes != null) { - Iterator it = query.getQueryGoal().getIncludeHashes().iterator(); + final Iterator it = query.getQueryGoal().getIncludeHashes().iterator(); while (it.hasNext()) { if (Switchboard.stopwordHashes.contains((it.next()))) { filtered = true; @@ -562,11 +560,11 @@ public final class SearchEvent implements ScoreMapUpdatesListener { } } } - this.snippetFetchWordHashes = query.getQueryGoal().getIncludeHashes().clone(); + this.snippetFetchWordHashes = query.getQueryGoal().getIncludeHashes().clone(); if (filtered) { // remove stopwords this.snippetFetchWordHashes.excludeDestructive(Switchboard.stopwordHashes); } - + this.snippetFetchWords = query.getQueryGoal().getIncludeWordsSet(); // remove stopwords this.snippetFetchWords.removeAll(Switchboard.stopwords); @@ -586,39 +584,39 @@ public final class SearchEvent implements ScoreMapUpdatesListener { * A concurrent task to perform the current search query on the local RWI. */ private class RWIProcess extends Thread { - + final Thread waitForThread; - + public RWIProcess(final Thread waitForThread) { super("SearchEvent.RWIProcess(" + (waitForThread != null ? waitForThread.getName() : "") + ")"); this.waitForThread = waitForThread; } - + /** * Query the local RWI and feed the search event with the obtained results. */ @Override public void run() { - - if (query.getSegment().termIndex() == null) return; // nothing to do; this index is not used - + + if (SearchEvent.this.query.getSegment().termIndex() == null) return; // nothing to do; this index is not used + // do a search oneFeederStarted(); - + // sort the local containers and truncate it to a limited count, // so following sortings together with the global results will be fast try { final long timer = System.currentTimeMillis(); TermSearch search = - SearchEvent.this.query + SearchEvent.this.query .getSegment() .termIndex() .query( SearchEvent.this.query.getQueryGoal().getIncludeHashes(), SearchEvent.this.query.getQueryGoal().getExcludeHashes(), - null, - Segment.wordReferenceFactory, - SearchEvent.this.query.maxDistance); + null, + Segment.wordReferenceFactory, + SearchEvent.this.query.maxDistance); SearchEvent.this.localSearchInclusion = search.inclusion(); ReferenceContainer index = search.joined(); if ( !index.isEmpty() ) { @@ -626,22 +624,22 @@ public final class SearchEvent implements ScoreMapUpdatesListener { if (this.waitForThread != null && this.waitForThread.isAlive()) { this.waitForThread.join(); } - + // add the index to the result int successcount = addRWIs(index, true, "local index: " + SearchEvent.this.query.getSegment().getLocation(), index.size(), SearchEvent.this.maxtime); if (successcount == 0 && - SearchEvent.this.query.getQueryGoal().getIncludeHashes().has(Segment.catchallHash) && - SearchEvent.this.query.modifier.sitehost != null && SearchEvent.this.query.modifier.sitehost.length() > 0 - ) { + SearchEvent.this.query.getQueryGoal().getIncludeHashes().has(Segment.catchallHash) && + SearchEvent.this.query.modifier.sitehost != null && SearchEvent.this.query.modifier.sitehost.length() > 0 + ) { // try again with sitehost - String newGoal = Domains.getSmartSLD(SearchEvent.this.query.modifier.sitehost); + final String newGoal = Domains.getSmartSLD(SearchEvent.this.query.modifier.sitehost); search = SearchEvent.this.query - .getSegment() - .termIndex() - .query( - QueryParams.hashes2Set(ASCII.String(Word.word2hash(newGoal))), - SearchEvent.this.query.getQueryGoal().getExcludeHashes(), + .getSegment() + .termIndex() + .query( + QueryParams.hashes2Set(ASCII.String(Word.word2hash(newGoal))), + SearchEvent.this.query.getQueryGoal().getExcludeHashes(), null, Segment.wordReferenceFactory, SearchEvent.this.query.maxDistance); @@ -655,11 +653,11 @@ public final class SearchEvent implements ScoreMapUpdatesListener { EventTracker.EClass.SEARCH, new ProfilingGraph.EventSearch( SearchEvent.this.query.id(true), - SearchEventType.JOIN, - SearchEvent.this.query.getQueryGoal().getQueryString(false), - successcount, - System.currentTimeMillis() - timer), - false); + SearchEventType.JOIN, + SearchEvent.this.query.getQueryGoal().getQueryString(false), + successcount, + System.currentTimeMillis() - timer), + false); SearchEvent.this.addFinalize(); } } catch (final Exception e ) { @@ -671,11 +669,11 @@ public final class SearchEvent implements ScoreMapUpdatesListener { } public int addRWIs( - final ReferenceContainer index, - final boolean local, - final String resourceName, - final int fullResource, - final long maxtime) { + final ReferenceContainer index, + final boolean local, + final String resourceName, + final int fullResource, + final long maxtime) { // we collect the urlhashes and construct a list with urlEntry objects // attention: if minEntries is too high, this method will not terminate within the maxTime //Log.logInfo("SearchEvent", "added a container, size = " + index.size()); @@ -695,20 +693,20 @@ public final class SearchEvent implements ScoreMapUpdatesListener { // normalize entries final BlockingQueue decodedEntries = this.order.normalizeWith(index, maxtime, local); - int is = index.size(); + final int is = index.size(); EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.EventSearch( - this.query.id(true), - SearchEventType.NORMALIZING, - resourceName, - is, - System.currentTimeMillis() - timer), false); + this.query.id(true), + SearchEventType.NORMALIZING, + resourceName, + is, + System.currentTimeMillis() - timer), false); if (!local) this.receivedRemoteReferences.addAndGet(is); // iterate over normalized entries and select some that are better than currently stored timer = System.currentTimeMillis(); // apply all constraints - long timeout = maxtime == Long.MAX_VALUE ? Long.MAX_VALUE : System.currentTimeMillis() + maxtime; + final long timeout = maxtime == Long.MAX_VALUE ? Long.MAX_VALUE : System.currentTimeMillis() + maxtime; int successcounter = 0; try { WordReferenceVars iEntry; @@ -716,7 +714,7 @@ public final class SearchEvent implements ScoreMapUpdatesListener { String acceptableAlternativeSitehash = null; if (this.query.modifier.sitehost != null && this.query.modifier.sitehost.length() > 0) try { acceptableAlternativeSitehash = DigestURL.hosthash(this.query.modifier.sitehost.startsWith("www.") ? this.query.modifier.sitehost.substring(4) : "www." + this.query.modifier.sitehost, 80); - } catch (MalformedURLException e1) {} + } catch (final MalformedURLException e1) {} pollloop: while ( true ) { remaining = timeout - System.currentTimeMillis(); if (remaining <= 0) { @@ -738,9 +736,9 @@ public final class SearchEvent implements ScoreMapUpdatesListener { if (log.isFine()) log.fine("dropped RWI: doublecheck"); continue pollloop; } - + // increase flag counts - Bitfield flags = iEntry.flags(); + final Bitfield flags = iEntry.flags(); for (int j = 0; j < 32; j++) { if (flags.get(j)) this.flagcount[j]++; } @@ -753,37 +751,37 @@ public final class SearchEvent implements ScoreMapUpdatesListener { // check document domain if (this.query.contentdom.getCode() > 0) { - boolean domainMatch = true; - if(this.query.isStrictContentDom()) { - if((this.query.contentdom == ContentDomain.AUDIO && iEntry.getType() != Response.DT_AUDIO) || + boolean domainMatch = true; + if(this.query.isStrictContentDom()) { + if((this.query.contentdom == ContentDomain.AUDIO && iEntry.getType() != Response.DT_AUDIO) || (this.query.contentdom == ContentDomain.VIDEO && iEntry.getType() != Response.DT_MOVIE) || (this.query.contentdom == ContentDomain.IMAGE && iEntry.getType() != Response.DT_IMAGE) || (this.query.contentdom == ContentDomain.APP && !(flags.get(Tokenizer.flag_cat_hasapp)))) { - domainMatch = false; - } - } else if((this.query.contentdom == ContentDomain.AUDIO && !(flags.get(Tokenizer.flag_cat_hasaudio))) || - (this.query.contentdom == ContentDomain.VIDEO && !(flags.get(Tokenizer.flag_cat_hasvideo))) || - (this.query.contentdom == ContentDomain.IMAGE && !(flags.get(Tokenizer.flag_cat_hasimage))) || - (this.query.contentdom == ContentDomain.APP && !(flags.get(Tokenizer.flag_cat_hasapp)))) { - domainMatch = false; - } - if(!domainMatch) { - if (log.isFine()) { - log.fine("dropped RWI: contentdom fail"); - } - continue pollloop; - } - } - + domainMatch = false; + } + } else if((this.query.contentdom == ContentDomain.AUDIO && !(flags.get(Tokenizer.flag_cat_hasaudio))) || + (this.query.contentdom == ContentDomain.VIDEO && !(flags.get(Tokenizer.flag_cat_hasvideo))) || + (this.query.contentdom == ContentDomain.IMAGE && !(flags.get(Tokenizer.flag_cat_hasimage))) || + (this.query.contentdom == ContentDomain.APP && !(flags.get(Tokenizer.flag_cat_hasapp)))) { + domainMatch = false; + } + if(!domainMatch) { + if (log.isFine()) { + log.fine("dropped RWI: contentdom fail"); + } + continue pollloop; + } + } + // check language - if (this.query.modifier.language != null && !this.query.modifier.language.isEmpty() - && !this.query.modifier.language.equals(iEntry.getLanguageString())) { - if (log.isFine()) { - log.fine("dropped RWI: language constraint = " + this.query.modifier.language); - } - continue pollloop; - } - + if (this.query.modifier.language != null && !this.query.modifier.language.isEmpty() + && !this.query.modifier.language.equals(iEntry.getLanguageString())) { + if (log.isFine()) { + log.fine("dropped RWI: language constraint = " + this.query.modifier.language); + } + continue pollloop; + } + // count domZones //this.domZones[DigestURI.domDomain(iEntry.metadataHash())]++; @@ -806,7 +804,7 @@ public final class SearchEvent implements ScoreMapUpdatesListener { this.urlhashes.putUnique(iEntry.urlhash()); rankingtryloop: while (true) { try { - this.rwiStack.put(new ReverseElement(iEntry, this.order.cardinal(iEntry))); // inserts the element and removes the worst (which is smallest) + this.rwiStack.put(new ReverseElement<>(iEntry, this.order.cardinal(iEntry))); // inserts the element and removes the worst (which is smallest) break rankingtryloop; } catch (final ArithmeticException e ) { // this may happen if the concurrent normalizer changes values during cardinal computation @@ -816,25 +814,24 @@ public final class SearchEvent implements ScoreMapUpdatesListener { } // increase counter for statistics if (local) this.local_rwi_available.incrementAndGet(); else this.remote_rwi_available.incrementAndGet(); - + successcounter++; } if (System.currentTimeMillis() >= timeout) ConcurrentLog.warn("SearchEvent", "rwi normalization ended with timeout = " + maxtime); - } catch (final InterruptedException e ) { - } catch (final SpaceExceededException e ) { + } catch (final InterruptedException | SpaceExceededException e ) { } //if ((query.neededResults() > 0) && (container.size() > query.neededResults())) remove(true, true); EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.EventSearch( - this.query.id(true), - SearchEventType.PRESORT, - resourceName, - index.size(), - System.currentTimeMillis() - timer), false); + this.query.id(true), + SearchEventType.PRESORT, + resourceName, + index.size(), + System.currentTimeMillis() - timer), false); return successcounter; } - + public long getEventTime() { return this.eventTime; } @@ -847,7 +844,7 @@ public final class SearchEvent implements ScoreMapUpdatesListener { // stop all threads if (this.localsolrsearch != null) { - if (localsolrsearch.isAlive()) synchronized (this.localsolrsearch) {this.localsolrsearch.interrupt();} + if (this.localsolrsearch.isAlive()) synchronized (this.localsolrsearch) {this.localsolrsearch.interrupt();} } if (this.nodeSearchThreads != null) { for (final Thread search : this.nodeSearchThreads) { @@ -924,11 +921,11 @@ public final class SearchEvent implements ScoreMapUpdatesListener { return this.heuristics.get(urlhash); } } - + /** * Add result entries to this nodes stack and update eventual navigators counters. * @param nodeList a list of entries from a Solr instance - * @param facets a map from a field name to scored values (aka Solr facet). May be null : in that case the navigators counters are updated one by one when inserting each result in the nodes stack. + * @param facets a map from a field name to scored values (aka Solr facet). May be null : in that case the navigators counters are updated one by one when inserting each result in the nodes stack. * @param solrsnippets a map from urlhash to snippet text * @param local true when the nodeList comes from the local Solr * @param resourceName the name of the data source to use for monitoring in the event tracker @@ -936,16 +933,16 @@ public final class SearchEvent implements ScoreMapUpdatesListener { * @param incrementNavigators when true, increment event navigators either with facet counts or with individual results */ public void addNodes( - final List nodeList, - final Map> facets, - final Map> solrsnippets, - final boolean local, - final String resourceName, - final int fullResource, - final boolean incrementNavigators) { + final List nodeList, + final Map> facets, + final Map> solrsnippets, + final boolean local, + final String resourceName, + final int fullResource, + final boolean incrementNavigators) { this.addBegin(); - + // check if all results have snippets /* for (URIMetadataNode node: nodeList) { @@ -953,7 +950,7 @@ public final class SearchEvent implements ScoreMapUpdatesListener { log.logInfo("no snippet from Solr for " + node.url().toNormalform(true)); } } - */ + */ this.snippets.putAll(solrsnippets); assert (nodeList != null); if (nodeList.isEmpty()) return; @@ -969,7 +966,7 @@ public final class SearchEvent implements ScoreMapUpdatesListener { long timer = System.currentTimeMillis(); // normalize entries - int is = nodeList.size(); + final int is = nodeList.size(); EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.EventSearch(this.query.id(true), SearchEventType.NORMALIZING, resourceName, is, System.currentTimeMillis() - timer), false); if (!local) { this.receivedRemoteReferences.addAndGet(is); @@ -980,23 +977,23 @@ public final class SearchEvent implements ScoreMapUpdatesListener { // collect navigation information from Solr facets when available if(incrementNavigators) { - incrNavigatorsFromSolrFacets(facets); + incrNavigatorsFromSolrFacets(facets); } - + // apply all constraints try { - pollloop: for (URIMetadataNode iEntry: nodeList) { - + pollloop: for (final URIMetadataNode iEntry: nodeList) { + // check url related eventual constraints (protocol, tld, sitehost, and filetype) - final String matchingResult = QueryParams.matchesURL(this.query.modifier, this.query.tld, iEntry.url()); + final String matchingResult = QueryParams.matchesURL(this.query.modifier, this.query.tld, iEntry.url()); if (!matchingResult.isEmpty()) { if (log.isFine()) { - log.fine("dropped Node: " + matchingResult); + log.fine("dropped Node: " + matchingResult); } updateCountsOnSolrEntryToEvict(iEntry, facets, local, !incrementNavigators); continue pollloop; } - + if ( !this.query.urlMask_isCatchall && this.query.urlMaskPattern != null) { // check url mask, only when not redundant with query modifier and tld constraints if (!iEntry.matches(this.query.urlMaskPattern)) { @@ -1005,7 +1002,7 @@ public final class SearchEvent implements ScoreMapUpdatesListener { continue pollloop; } } - + // doublecheck for urls if (this.urlhashes.has(iEntry.hash())) { if (log.isFine()) log.fine("dropped Node: double check"); @@ -1019,7 +1016,7 @@ public final class SearchEvent implements ScoreMapUpdatesListener { } // check constraints - Bitfield flags = iEntry.flags(); + final Bitfield flags = iEntry.flags(); if (!this.testFlags(flags)) { if (log.isFine()) log.fine("dropped Node: flag test"); updateCountsOnSolrEntryToEvict(iEntry, facets, local, !incrementNavigators); @@ -1028,28 +1025,28 @@ public final class SearchEvent implements ScoreMapUpdatesListener { // check document domain if (this.query.contentdom.getCode() > 0) { - boolean domainMatch = true; - if(this.query.isStrictContentDom()) { + boolean domainMatch = true; + if(this.query.isStrictContentDom()) { if(this.query.contentdom != iEntry.getContentDomain()) { - domainMatch = false; + domainMatch = false; + } + } else if((this.query.contentdom == ContentDomain.AUDIO && !(flags.get(Tokenizer.flag_cat_hasaudio))) || + (this.query.contentdom == ContentDomain.VIDEO && !(flags.get(Tokenizer.flag_cat_hasvideo))) || + (this.query.contentdom == ContentDomain.IMAGE && !(flags.get(Tokenizer.flag_cat_hasimage))) || + (this.query.contentdom == ContentDomain.APP && !(flags.get(Tokenizer.flag_cat_hasapp)))) { + domainMatch = false; + } + if(!domainMatch) { + if (log.isFine()) { + log.fine("dropped Node: content domain does not match"); } - } else if((this.query.contentdom == ContentDomain.AUDIO && !(flags.get(Tokenizer.flag_cat_hasaudio))) || - (this.query.contentdom == ContentDomain.VIDEO && !(flags.get(Tokenizer.flag_cat_hasvideo))) || - (this.query.contentdom == ContentDomain.IMAGE && !(flags.get(Tokenizer.flag_cat_hasimage))) || - (this.query.contentdom == ContentDomain.APP && !(flags.get(Tokenizer.flag_cat_hasapp)))) { - domainMatch = false; + updateCountsOnSolrEntryToEvict(iEntry, facets, local, !incrementNavigators); + continue pollloop; } - if(!domainMatch) { - if (log.isFine()) { - log.fine("dropped Node: content domain does not match"); - } - updateCountsOnSolrEntryToEvict(iEntry, facets, local, !incrementNavigators); - continue pollloop; - } - } - + } + // filter out media links in text search, if wanted - String ext = MultiProtocolURL.getFileExtension(iEntry.url().getFileName()); + final String ext = MultiProtocolURL.getFileExtension(iEntry.url().getFileName()); if (this.query.contentdom == ContentDomain.TEXT && Classification.isImageExtension(ext) && this.excludeintext_image) { if (log.isFine()) log.fine("dropped Node: file name domain does not match"); updateCountsOnSolrEntryToEvict(iEntry, facets, local, !incrementNavigators); @@ -1097,12 +1094,12 @@ public final class SearchEvent implements ScoreMapUpdatesListener { long score; // determine nodestack ranking (will be altered by postranking) // so far Solr score is used (with abitrary factor to get value similar to rwi ranking values) - Float scorex = (Float) iEntry.getFieldValue("score"); // this is a special field containing the ranking score of a Solr search result + final Float scorex = (Float) iEntry.getFieldValue("score"); // this is a special field containing the ranking score of a Solr search result if (scorex != null && scorex > 0) score = (long) ((1000000.0f * scorex) - iEntry.urllength()); // we modify the score here since the solr score is equal in many cases and then the order would simply depend on the url hash which would be silly else score = this.order.cardinal(iEntry); - this.nodeStack.put(new ReverseElement(iEntry, score)); // inserts the element and removes the worst (which is smallest) + this.nodeStack.put(new ReverseElement<>(iEntry, score)); // inserts the element and removes the worst (which is smallest) break rankingtryloop; } catch (final ArithmeticException e ) { // this may happen if the concurrent normalizer changes values during cardinal computation @@ -1111,12 +1108,12 @@ public final class SearchEvent implements ScoreMapUpdatesListener { } // increase counter for statistics if (!local) { - this.remote_solr_available.incrementAndGet(); + this.remote_solr_available.incrementAndGet(); } - + // collect navigation information not available in facets if(incrementNavigators) { - incrNavigatorsFromSingleDocument(iEntry, facets); + incrNavigatorsFromSingleDocument(iEntry, facets); } } } catch (final SpaceExceededException e ) { @@ -1128,157 +1125,157 @@ public final class SearchEvent implements ScoreMapUpdatesListener { * Increment this event eventual navigators with the given facets processed by a Solr instance * @param facets facets counts from a Solr instance */ - private void incrNavigatorsFromSolrFacets(final Map> facets) { - if(facets != null && !facets.isEmpty()) { - /* Iterate over active navigator plugins to let them update the counters */ - for (String s : this.navigatorPlugins.keySet()) { - Navigator navi = this.navigatorPlugins.get(s); - if (navi != null) { - navi.incFacet(facets); - } - } - - ReversibleScoreMap fcts; - if (this.locationNavigator != null) { - /* Is is still relevant? It looks like this nav is currently never filled, as a constraint on coordinates - * is expressed as a spatial filter not producing facets counts (see QueryParams.getFacetsFilterQueries()). */ - fcts = facets.get(CollectionSchema.coordinate_p_0_coordinate.getSolrFieldName()); - if (fcts != null) { - for (String coordinate: fcts) { - int hc = fcts.get(coordinate); - if (hc == 0) continue; - this.locationNavigator.inc(coordinate, hc); - } - } - } - - if (this.dateNavigator != null) { - fcts = facets.get(CollectionSchema.dates_in_content_dts.getSolrFieldName()); - if (fcts != null) this.dateNavigator.inc(fcts); - } - - if (this.protocolNavigator != null) { - fcts = facets.get(CollectionSchema.url_protocol_s.getSolrFieldName()); - if (fcts != null) { - // remove all protocols that we don't know - Iterator i = fcts.iterator(); - while (i.hasNext()) { - String protocol = i.next(); - if (PROTOCOL_NAVIGATOR_SUPPORTED_VALUES.indexOf(protocol) < 0) { - i.remove(); - } - } - this.protocolNavigator.inc(fcts); - } - } - - // get the vocabulary navigation - Set genericFacets = new LinkedHashSet<>(); - for (Tagging v: LibraryProvider.autotagging.getVocabularies()) genericFacets.add(v.getName()); - genericFacets.addAll(ProbabilisticClassifier.getContextNames()); - for (String vocName: genericFacets) { - fcts = facets.get(CollectionSchema.VOCABULARY_PREFIX + vocName + CollectionSchema.VOCABULARY_TERMS_SUFFIX); - if (fcts != null) { - ScoreMap vocNav = this.vocabularyNavigator.get(vocName); - if (vocNav == null) { - vocNav = new ConcurrentScoreMap(); - this.vocabularyNavigator.put(vocName, vocNav); - } - vocNav.inc(fcts); - } - } - } - } - + private void incrNavigatorsFromSolrFacets(final Map> facets) { + if(facets != null && !facets.isEmpty()) { + /* Iterate over active navigator plugins to let them update the counters */ + for (final String s : this.navigatorPlugins.keySet()) { + final Navigator navi = this.navigatorPlugins.get(s); + if (navi != null) { + navi.incFacet(facets); + } + } + + ReversibleScoreMap fcts; + if (this.locationNavigator != null) { + /* Is is still relevant? It looks like this nav is currently never filled, as a constraint on coordinates + * is expressed as a spatial filter not producing facets counts (see QueryParams.getFacetsFilterQueries()). */ + fcts = facets.get(CollectionSchema.coordinate_p_0_coordinate.getSolrFieldName()); + if (fcts != null) { + for (final String coordinate: fcts) { + final int hc = fcts.get(coordinate); + if (hc == 0) continue; + this.locationNavigator.inc(coordinate, hc); + } + } + } + + if (this.dateNavigator != null) { + fcts = facets.get(CollectionSchema.dates_in_content_dts.getSolrFieldName()); + if (fcts != null) this.dateNavigator.inc(fcts); + } + + if (this.protocolNavigator != null) { + fcts = facets.get(CollectionSchema.url_protocol_s.getSolrFieldName()); + if (fcts != null) { + // remove all protocols that we don't know + final Iterator i = fcts.iterator(); + while (i.hasNext()) { + final String protocol = i.next(); + if (PROTOCOL_NAVIGATOR_SUPPORTED_VALUES.indexOf(protocol) < 0) { + i.remove(); + } + } + this.protocolNavigator.inc(fcts); + } + } + + // get the vocabulary navigation + final Set genericFacets = new LinkedHashSet<>(); + for (final Tagging v: LibraryProvider.autotagging.getVocabularies()) genericFacets.add(v.getName()); + genericFacets.addAll(ProbabilisticClassifier.getContextNames()); + for (final String vocName: genericFacets) { + fcts = facets.get(CollectionSchema.VOCABULARY_PREFIX + vocName + CollectionSchema.VOCABULARY_TERMS_SUFFIX); + if (fcts != null) { + ScoreMap vocNav = this.vocabularyNavigator.get(vocName); + if (vocNav == null) { + vocNav = new ConcurrentScoreMap<>(); + this.vocabularyNavigator.put(vocName, vocNav); + } + vocNav.inc(fcts); + } + } + } + } + /** * Increment this event eventual navigators with the given entry, only when the concerned field is not present in facets * @param doc a document entry from a Solr source * @param facets facets counts from a Solr instance */ - private void incrNavigatorsFromSingleDocument(final URIMetadataNode doc, - final Map> facets) { - - /* Iterate over active navigator plugins to let them update the counters */ - for (String s : this.navigatorPlugins.keySet()) { - Navigator navi = this.navigatorPlugins.get(s); - if (navi != null && facets == null || !facets.containsKey(navi.getIndexFieldName())) { - navi.incDoc(doc); - } - } - - /* Note : would it be relevant to update here this.locationNavigator ? + private void incrNavigatorsFromSingleDocument(final URIMetadataNode doc, + final Map> facets) { + + /* Iterate over active navigator plugins to let them update the counters */ + for (final String s : this.navigatorPlugins.keySet()) { + final Navigator navi = this.navigatorPlugins.get(s); + if (navi != null && facets == null || !facets.containsKey(navi.getIndexFieldName())) { + navi.incDoc(doc); + } + } + + /* Note : would it be relevant to update here this.locationNavigator ? It looks like this nav is currently never filled */ - if (this.dateNavigator != null) { - if (facets == null || !facets.containsKey(CollectionSchema.dates_in_content_dts.getSolrFieldName())) { - Date[] dates = doc.datesInContent(); - if (dates != null) { - for (final Date date : dates) { - if (date != null) { - this.dateNavigator.inc(ISO8601Formatter.FORMATTER.format(date)); - } - } - } - } - } - - if (this.protocolNavigator != null) { - if (facets == null || !facets.containsKey(CollectionSchema.url_protocol_s.getSolrFieldName())) { - final String protocol = doc.url().getProtocol(); - // include only protocols supported protocols - if (protocol != null && PROTOCOL_NAVIGATOR_SUPPORTED_VALUES.indexOf(protocol) >= 0) { - this.protocolNavigator.inc(protocol); - } - } - } - - // get the vocabulary navigation - if(this.vocabularyNavigator != null) { - Set genericFacets = new LinkedHashSet<>(); - for (Tagging v : LibraryProvider.autotagging.getVocabularies()) { - genericFacets.add(v.getName()); - } - genericFacets.addAll(ProbabilisticClassifier.getContextNames()); - for (String vocName : genericFacets) { - final String fieldName = CollectionSchema.VOCABULARY_PREFIX + vocName + CollectionSchema.VOCABULARY_TERMS_SUFFIX; - if (facets == null || !facets.containsKey(fieldName)) { - incrementVocNavigator(doc, vocName, fieldName); - } - } - } - } - - /** - * Increment a vocabulary navigator with the given document - * @param doc a document entry. Must not be null. - * @param vocName the name of the vocabulary. Must not be null. - * @param fieldName the name of the field eventually holding the vocabulary information in the document entry - */ - protected void incrementVocNavigator(final URIMetadataNode doc, final String vocName, final String fieldName) { - final Object docValue = doc.getFieldValue(fieldName); - if(docValue instanceof String) { - ScoreMap vocNav = this.vocabularyNavigator.get(vocName); - if (vocNav == null) { - vocNav = new ConcurrentScoreMap(); - this.vocabularyNavigator.put(vocName, vocNav); - } - vocNav.inc((String)docValue); - } else if(docValue instanceof Collection) { - if (!((Collection) docValue).isEmpty()) { - ScoreMap vocNav = this.vocabularyNavigator.get(vocName); - if (vocNav == null) { - vocNav = new ConcurrentScoreMap(); - this.vocabularyNavigator.put(vocName, vocNav); - } - for (final Object singleDocValue : (Collection) docValue) { - if (singleDocValue instanceof String) { - vocNav.inc((String) singleDocValue); - } - } - } - } - } - + if (this.dateNavigator != null) { + if (facets == null || !facets.containsKey(CollectionSchema.dates_in_content_dts.getSolrFieldName())) { + final Date[] dates = doc.datesInContent(); + if (dates != null) { + for (final Date date : dates) { + if (date != null) { + this.dateNavigator.inc(ISO8601Formatter.FORMATTER.format(date)); + } + } + } + } + } + + if (this.protocolNavigator != null) { + if (facets == null || !facets.containsKey(CollectionSchema.url_protocol_s.getSolrFieldName())) { + final String protocol = doc.url().getProtocol(); + // include only protocols supported protocols + if (protocol != null && PROTOCOL_NAVIGATOR_SUPPORTED_VALUES.indexOf(protocol) >= 0) { + this.protocolNavigator.inc(protocol); + } + } + } + + // get the vocabulary navigation + if(this.vocabularyNavigator != null) { + final Set genericFacets = new LinkedHashSet<>(); + for (final Tagging v : LibraryProvider.autotagging.getVocabularies()) { + genericFacets.add(v.getName()); + } + genericFacets.addAll(ProbabilisticClassifier.getContextNames()); + for (final String vocName : genericFacets) { + final String fieldName = CollectionSchema.VOCABULARY_PREFIX + vocName + CollectionSchema.VOCABULARY_TERMS_SUFFIX; + if (facets == null || !facets.containsKey(fieldName)) { + incrementVocNavigator(doc, vocName, fieldName); + } + } + } + } + + /** + * Increment a vocabulary navigator with the given document + * @param doc a document entry. Must not be null. + * @param vocName the name of the vocabulary. Must not be null. + * @param fieldName the name of the field eventually holding the vocabulary information in the document entry + */ + protected void incrementVocNavigator(final URIMetadataNode doc, final String vocName, final String fieldName) { + final Object docValue = doc.getFieldValue(fieldName); + if(docValue instanceof String) { + ScoreMap vocNav = this.vocabularyNavigator.get(vocName); + if (vocNav == null) { + vocNav = new ConcurrentScoreMap<>(); + this.vocabularyNavigator.put(vocName, vocNav); + } + vocNav.inc((String)docValue); + } else if(docValue instanceof Collection) { + if (!((Collection) docValue).isEmpty()) { + ScoreMap vocNav = this.vocabularyNavigator.get(vocName); + if (vocNav == null) { + vocNav = new ConcurrentScoreMap<>(); + this.vocabularyNavigator.put(vocName, vocNav); + } + for (final Object singleDocValue : (Collection) docValue) { + if (singleDocValue instanceof String) { + vocNav.inc((String) singleDocValue); + } + } + } + } + } + public void addExpectedRemoteReferences(int x) { if ( x > 0 ) { this.maxExpectedRemoteReferences.addAndGet(x); @@ -1306,14 +1303,14 @@ public final class SearchEvent implements ScoreMapUpdatesListener { rwi = this.rwiStack.poll(); if (rwi == null) return null; if (!skipDoubleDom) { - URIMetadataNode node = this.query.getSegment().fulltext().getMetadata(rwi); + final URIMetadataNode node = this.query.getSegment().fulltext().getMetadata(rwi); if (node == null) { - decrementCounts(rwi.getElement()); - continue pollloop; + decrementCounts(rwi.getElement()); + continue pollloop; } return node; } - + // check doubledom final String hosthash = rwi.getElement().hosthash(); m = this.doubleDomCache.get(hosthash); @@ -1322,12 +1319,12 @@ public final class SearchEvent implements ScoreMapUpdatesListener { m = this.doubleDomCache.get(hosthash); if (m == null) { // first appearance of dom. we create an entry to signal that one of that domain was already returned - m = new WeakPriorityBlockingQueue(max_results_rwi, false); + m = new WeakPriorityBlockingQueue<>(max_results_rwi, false); this.doubleDomCache.put(hosthash, m); - URIMetadataNode node = this.query.getSegment().fulltext().getMetadata(rwi); + final URIMetadataNode node = this.query.getSegment().fulltext().getMetadata(rwi); if (node == null) { - decrementCounts(rwi.getElement()); - continue pollloop; + decrementCounts(rwi.getElement()); + continue pollloop; } return node; } @@ -1338,13 +1335,13 @@ public final class SearchEvent implements ScoreMapUpdatesListener { m.put(rwi); } } - + // no more entries in sorted RWI entries. Now take Elements from the doubleDomCache if (this.doubleDomCache.isEmpty()) { //Log.logWarning("SearchEvent", "doubleDomCache.isEmpty"); return null; } - + // find best entry from all caches WeakPriorityBlockingQueue.Element bestEntry = null; WeakPriorityBlockingQueue.Element o; @@ -1370,7 +1367,7 @@ public final class SearchEvent implements ScoreMapUpdatesListener { //Log.logWarning("SearchEvent", "bestEntry == null (1)"); return null; } - + // finally remove the best entry from the doubledom cache m = this.doubleDomCache.get(bestEntry.getElement().hosthash()); if (m != null) { @@ -1390,7 +1387,7 @@ public final class SearchEvent implements ScoreMapUpdatesListener { URIMetadataNode node = null; try { node = this.query.getSegment().fulltext().getMetadata(bestEntry); - } catch (Throwable e) { + } catch (final Throwable e) { ConcurrentLog.logException(e); } if (node == null) { @@ -1401,7 +1398,7 @@ public final class SearchEvent implements ScoreMapUpdatesListener { return node; } } - + /** * get one metadata entry from the ranked results. This will be the 'best' entry so far according to the * applied ranking. If there are no more entries left or the timeout limit is reached then null is @@ -1415,20 +1412,20 @@ public final class SearchEvent implements ScoreMapUpdatesListener { // returns from the current RWI list the best URL entry and removes this entry from the list URIMetadataNode page; mainloop: while ((page = pullOneRWI(skipDoubleDom)) != null) { - + // check url related eventual constraints (protocol, tld, sitehost, and filetype) - final String matchingResult = QueryParams.matchesURL(this.query.modifier, this.query.tld, page.url()); + final String matchingResult = QueryParams.matchesURL(this.query.modifier, this.query.tld, page.url()); if (!matchingResult.isEmpty()) { if (log.isFine()) { - log.fine("dropped RWI: no match on " + matchingResult); + log.fine("dropped RWI: no match on " + matchingResult); } decrementCounts(page.word()); continue; } - if (!this.query.urlMask_isCatchall && this.query.urlMaskPattern != null - && !page.matches(this.query.urlMaskPattern)) { - // check url mask, only when not redundant with query modifier and tld constraints + if (!this.query.urlMask_isCatchall && this.query.urlMaskPattern != null + && !page.matches(this.query.urlMaskPattern)) { + // check url mask, only when not redundant with query modifier and tld constraints if (log.isFine()) log.fine("dropped RWI: no match with urlMask"); decrementCounts(page.word()); continue; @@ -1442,19 +1439,19 @@ public final class SearchEvent implements ScoreMapUpdatesListener { } // check content domain - ContentDomain contentDomain = page.getContentDomain(); + final ContentDomain contentDomain = page.getContentDomain(); if (this.query.contentdom.getCode() > 0 && ( - (this.query.contentdom == Classification.ContentDomain.IMAGE && contentDomain != Classification.ContentDomain.IMAGE) || - (this.query.contentdom == Classification.ContentDomain.AUDIO && contentDomain != Classification.ContentDomain.AUDIO) || - (this.query.contentdom == Classification.ContentDomain.VIDEO && contentDomain != Classification.ContentDomain.VIDEO) || - (this.query.contentdom == Classification.ContentDomain.APP && contentDomain != Classification.ContentDomain.APP)) && this.query.urlMask_isCatchall) { + (this.query.contentdom == Classification.ContentDomain.IMAGE && contentDomain != Classification.ContentDomain.IMAGE) || + (this.query.contentdom == Classification.ContentDomain.AUDIO && contentDomain != Classification.ContentDomain.AUDIO) || + (this.query.contentdom == Classification.ContentDomain.VIDEO && contentDomain != Classification.ContentDomain.VIDEO) || + (this.query.contentdom == Classification.ContentDomain.APP && contentDomain != Classification.ContentDomain.APP)) && this.query.urlMask_isCatchall) { if (log.isFine()) log.fine("dropped RWI: wrong contentdom = " + this.query.contentdom + ", domain = " + contentDomain); decrementCounts(page.word()); continue; } - + // filter out media links in text search, if wanted - String ext = MultiProtocolURL.getFileExtension(page.url().getFileName()); + final String ext = MultiProtocolURL.getFileExtension(page.url().getFileName()); if (this.query.contentdom == ContentDomain.TEXT && Classification.isImageExtension(ext) && this.excludeintext_image) { if (log.isFine()) log.fine("dropped RWI: file name domain does not match"); continue; @@ -1462,7 +1459,7 @@ public final class SearchEvent implements ScoreMapUpdatesListener { // filter query modifiers variables (these are host, filetype, protocol, language, author, collection, dates_in_content(on,from,to,timezone) ) - /* check again modifier constraint (language) with the language in the full metadata, + /* check again modifier constraint (language) with the language in the full metadata, * that may differ from the one in the reverse word reference which is already checked in addRWIs()*/ if (this.query.modifier.language != null && !this.query.modifier.language.equals(page.language())) { if (log.isFine()) log.fine("dropped RWI: language constraint = " + this.query.modifier.language); @@ -1478,9 +1475,9 @@ public final class SearchEvent implements ScoreMapUpdatesListener { } // check modifier constraint collection - // this is not available in pure RWI entries (but in local or via solr query received metadate/entries), + // this is not available in pure RWI entries (but in local or via solr query received metadate/entries), if (this.query.modifier.collection != null) { - Collection docCols = page.getFieldValues(CollectionSchema.collection_sxt.getSolrFieldName()); // get multivalued value + final Collection docCols = page.getFieldValues(CollectionSchema.collection_sxt.getSolrFieldName()); // get multivalued value if (docCols == null) { // no collection info decrementCounts(page.word()); continue; @@ -1504,25 +1501,15 @@ public final class SearchEvent implements ScoreMapUpdatesListener { continue; } - // content control - if (Switchboard.getSwitchboard().getConfigBool("contentcontrol.enabled", false)) { - FilterEngine f = ContentControlFilterUpdateThread.getNetworkFilter(); - if (f != null && !f.isListed(page.url(), null)) { - if (log.isFine()) log.fine("dropped RWI: url is blacklisted in contentcontrol"); - decrementCounts(page.word()); - continue; - } - } - final String pageurl = page.url().toNormalform(true); final String pageauthor = page.dc_creator(); final String pagetitle = page.dc_title().toLowerCase(); // check exclusion if (this.query.getQueryGoal().getExcludeSize() != 0 && - ((QueryParams.anymatch(pagetitle, this.query.getQueryGoal().getExcludeWords())) - || (QueryParams.anymatch(pageurl.toLowerCase(), this.query.getQueryGoal().getExcludeWords())) - || (QueryParams.anymatch(pageauthor.toLowerCase(), this.query.getQueryGoal().getExcludeWords())))) { + ((QueryParams.anymatch(pagetitle, this.query.getQueryGoal().getExcludeWords())) + || (QueryParams.anymatch(pageurl.toLowerCase(), this.query.getQueryGoal().getExcludeWords())) + || (QueryParams.anymatch(pageauthor.toLowerCase(), this.query.getQueryGoal().getExcludeWords())))) { if (log.isFine()) log.fine("dropped RWI: no match with query goal exclusion"); decrementCounts(page.word()); continue; @@ -1551,75 +1538,75 @@ public final class SearchEvent implements ScoreMapUpdatesListener { // check geo coordinates double lat, lon; if (this.query.radius > 0.0d && this.query.lat != 0.0d && this.query.lon != 0.0d && (lat = page.lat()) != 0.0d && (lon = page.lon()) != 0.0d) { - double latDelta = this.query.lat - lat; - double lonDelta = this.query.lon - lon; - double distance = Math.sqrt(latDelta * latDelta + lonDelta * lonDelta); // pythagoras + final double latDelta = this.query.lat - lat; + final double lonDelta = this.query.lon - lon; + final double distance = Math.sqrt(latDelta * latDelta + lonDelta * lonDelta); // pythagoras if (distance > this.query.radius) { if (log.isFine()) log.fine("dropped RWI: radius constraint"); decrementCounts(page.word()); continue; } } - + // check vocabulary terms (metatags) {only available in Solr index as vocabulary_xxyyzzz_sxt field} // TODO: vocabulary is only valid and available in local Solr index (consider to auto-switch to Searchdom.LOCAL) if (this.query.metatags != null && !this.query.metatags.isEmpty()) { - tagloop: for (Tagging.Metatag tag : this.query.metatags) { - SolrDocument sdoc = page; + tagloop: for (final Tagging.Metatag tag : this.query.metatags) { + final SolrDocument sdoc = page; if (sdoc != null) { - Collection tagvalues = sdoc.getFieldValues(CollectionSchema.VOCABULARY_PREFIX + tag.getVocabularyName() + CollectionSchema.VOCABULARY_TERMS_SUFFIX); + final Collection tagvalues = sdoc.getFieldValues(CollectionSchema.VOCABULARY_PREFIX + tag.getVocabularyName() + CollectionSchema.VOCABULARY_TERMS_SUFFIX); if (tagvalues != null && tagvalues.contains(tag.getObject())) { - continue tagloop; // metatag exists check next tag (filter may consist of several tags) - } + continue tagloop; // metatag exists check next tag (filter may consist of several tags) + } } // if we reach this point the metatag was not found (= drop entry) if (log.isFine()) log.fine("dropped RWI: url not tagged with vocabulary " + tag.getVocabularyName()); decrementCounts(page.word()); continue mainloop; } } - + // from here: collect navigation information // TODO: it may be a little bit late here, to update navigator counters // iterate over active navigator plugins (the rwi metadata may contain the field the plugin counts) - for (String s : this.navigatorPlugins.keySet()) { - Navigator navi = this.navigatorPlugins.get(s); + for (final String s : this.navigatorPlugins.keySet()) { + final Navigator navi = this.navigatorPlugins.get(s); if (navi != null) { navi.incDoc(page); } } - + if(this.protocolNavigator != null && page.url() != null) { - final String protocol = page.url().getProtocol(); - if(protocol != null) { - this.protocolNavigator.inc(protocol); - } + final String protocol = page.url().getProtocol(); + if(protocol != null) { + this.protocolNavigator.inc(protocol); + } } - + if(this.dateNavigator != null) { - Date[] dates = page.datesInContent(); - if (dates != null) { - for (final Date date : dates) { - if (date != null) { - this.dateNavigator.inc(ISO8601Formatter.FORMATTER.format(date)); - } - } - } - } - - // handle the vocabulary navigator - if (this.vocabularyNavigator != null) { - Set genericFacets = new LinkedHashSet<>(); - for (Tagging v : LibraryProvider.autotagging.getVocabularies()) { - genericFacets.add(v.getName()); - } - genericFacets.addAll(ProbabilisticClassifier.getContextNames()); - for (final String vocName : genericFacets) { - final String fieldName = CollectionSchema.VOCABULARY_PREFIX + vocName - + CollectionSchema.VOCABULARY_TERMS_SUFFIX; - incrementVocNavigator(page, vocName, fieldName); - } - } + final Date[] dates = page.datesInContent(); + if (dates != null) { + for (final Date date : dates) { + if (date != null) { + this.dateNavigator.inc(ISO8601Formatter.FORMATTER.format(date)); + } + } + } + } + + // handle the vocabulary navigator + if (this.vocabularyNavigator != null) { + final Set genericFacets = new LinkedHashSet<>(); + for (final Tagging v : LibraryProvider.autotagging.getVocabularies()) { + genericFacets.add(v.getName()); + } + genericFacets.addAll(ProbabilisticClassifier.getContextNames()); + for (final String vocName : genericFacets) { + final String fieldName = CollectionSchema.VOCABULARY_PREFIX + vocName + + CollectionSchema.VOCABULARY_TERMS_SUFFIX; + incrementVocNavigator(page, vocName, fieldName); + } + } return page; // accept url } @@ -1630,21 +1617,21 @@ public final class SearchEvent implements ScoreMapUpdatesListener { * Decrement statistics counts for the given RWI entry. * @param entry an RWI entry result */ - private void decrementCounts(final WordReferenceVars entry) { - if(entry == null) { - return; - } - if (entry.local()) { - if(this.local_rwi_available.get() > 0) { - this.local_rwi_available.decrementAndGet(); - } - } else { - if(this.remote_rwi_available.get() > 0) { - this.remote_rwi_available.decrementAndGet(); - } - } - } - + private void decrementCounts(final WordReferenceVars entry) { + if(entry == null) { + return; + } + if (entry.local()) { + if(this.local_rwi_available.get() > 0) { + this.local_rwi_available.decrementAndGet(); + } + } else { + if(this.remote_rwi_available.get() > 0) { + this.remote_rwi_available.decrementAndGet(); + } + } + } + /** * Update counters when evicting a Solr entry from results. * @param entry a Solr entry result to be evicted @@ -1652,145 +1639,145 @@ public final class SearchEvent implements ScoreMapUpdatesListener { * @param facets facets counts from Solr * @param navIncrementedEarlier when true, navigators have been incremented earlier with other facets or individual documents */ - private void updateCountsOnSolrEntryToEvict(final URIMetadataNode entry, - final Map> facets, final boolean local, - final boolean navIncrementedEarlier) { - if (entry == null) { - return; - } - if (local) { - this.local_solr_evicted.incrementAndGet(); - /* - * No need to decrement remote_solr_available as this counter is only - * incremented after all filterings have been applied - */ - } - - /* - * Update eventual navigators counters when relevant - */ - final boolean navIncrementedWithFacets = facets != null && !facets.isEmpty() && !navIncrementedEarlier; - ReversibleScoreMap fcts; - - - /* Iterate over active navigator plugins to let them update the counters */ - for (String s : this.navigatorPlugins.keySet()) { - Navigator navi = this.navigatorPlugins.get(s); - if (navi != null) { - if (navIncrementedWithFacets) { - fcts = facets.get(navi.getIndexFieldName()); - } else { - fcts = null; - } - final Object value = entry.getFieldValue(navi.getIndexFieldName()); - if (value != null) { - if(value instanceof Collection) { - for (final Object singleVal : (Collection) value) { - if (singleVal instanceof String) { - final String singleStringVal = (String)singleVal; - if (navIncrementedEarlier || (fcts != null && fcts.containsKey(singleStringVal))) { - if (navi.get(singleStringVal) > 0) { - navi.dec(singleStringVal); - } - } - } - } - } else if(value instanceof String){ - final String stringValue = (String)value; - if (navIncrementedEarlier || (fcts != null && fcts.containsKey(stringValue))) { - if (navi.get(stringValue) > 0) { - navi.dec(stringValue); - } - } - } - } - } - } - - - /* Note : would it be relevant to update here this.locationNavigator ? - * It looks like this nav is currently never filled, as a constraint on coordinates - * is expressed as a spatial filter not producing facets counts (see QueryParams.getFacetsFilterQueries()) - */ - - if (this.dateNavigator != null) { - if (navIncrementedWithFacets) { - fcts = facets.get(CollectionSchema.dates_in_content_dts.getSolrFieldName()); - } else { - fcts = null; - } - Date[] dates = entry.datesInContent(); - if (dates != null) { - for (final Date date : dates) { - if (date != null) { - final String dateStr = ISO8601Formatter.FORMATTER.format(date); - if (navIncrementedEarlier || (fcts != null && fcts.containsKey(dateStr))) { - if (this.dateNavigator.get(dateStr) > 0) { - this.dateNavigator.dec(dateStr); - } - } - } - } - } - } - - if (this.protocolNavigator != null) { - if (navIncrementedWithFacets) { - fcts = facets.get(CollectionSchema.url_protocol_s.getSolrFieldName()); - } else { - fcts = null; - } - final String protocol = entry.url().getProtocol(); - if (protocol != null) { - if (navIncrementedEarlier || (fcts != null && fcts.containsKey(protocol))) { - if (this.protocolNavigator.get(protocol) > 0) { - this.protocolNavigator.dec(protocol); - } - } - } - } - - // get the vocabulary navigation - if (this.vocabularyNavigator != null) { - Set genericFacets = new LinkedHashSet<>(); - for (Tagging v : LibraryProvider.autotagging.getVocabularies()) { - genericFacets.add(v.getName()); - } - genericFacets.addAll(ProbabilisticClassifier.getContextNames()); - for (String vocName : genericFacets) { - final String fieldName = CollectionSchema.VOCABULARY_PREFIX + vocName - + CollectionSchema.VOCABULARY_TERMS_SUFFIX; - if (navIncrementedWithFacets) { - fcts = facets.get(fieldName); - } else { - fcts = null; - } - Object docValue = entry.getFieldValue(fieldName); - if (docValue instanceof String) { - if (navIncrementedEarlier || (fcts != null && fcts.containsKey((String) docValue))) { - ScoreMap vocNav = this.vocabularyNavigator.get(vocName); - if (vocNav != null && vocNav.get((String) docValue) > 0) { - vocNav.dec((String) docValue); - } - } - } else if(docValue instanceof Collection) { - if (!((Collection) docValue).isEmpty()) { - for (Object singleDocValue : (Collection) docValue) { - if (singleDocValue instanceof String) { - if (navIncrementedEarlier || (fcts != null && fcts.containsKey((String) singleDocValue))) { - ScoreMap vocNav = this.vocabularyNavigator.get(vocName); - if (vocNav != null && vocNav.get((String) singleDocValue) > 0) { - vocNav.dec((String) singleDocValue); - } - } - } - } - } - } - } - } - } - + private void updateCountsOnSolrEntryToEvict(final URIMetadataNode entry, + final Map> facets, final boolean local, + final boolean navIncrementedEarlier) { + if (entry == null) { + return; + } + if (local) { + this.local_solr_evicted.incrementAndGet(); + /* + * No need to decrement remote_solr_available as this counter is only + * incremented after all filterings have been applied + */ + } + + /* + * Update eventual navigators counters when relevant + */ + final boolean navIncrementedWithFacets = facets != null && !facets.isEmpty() && !navIncrementedEarlier; + ReversibleScoreMap fcts; + + + /* Iterate over active navigator plugins to let them update the counters */ + for (final String s : this.navigatorPlugins.keySet()) { + final Navigator navi = this.navigatorPlugins.get(s); + if (navi != null) { + if (navIncrementedWithFacets) { + fcts = facets.get(navi.getIndexFieldName()); + } else { + fcts = null; + } + final Object value = entry.getFieldValue(navi.getIndexFieldName()); + if (value != null) { + if(value instanceof Collection) { + for (final Object singleVal : (Collection) value) { + if (singleVal instanceof String) { + final String singleStringVal = (String)singleVal; + if (navIncrementedEarlier || (fcts != null && fcts.containsKey(singleStringVal))) { + if (navi.get(singleStringVal) > 0) { + navi.dec(singleStringVal); + } + } + } + } + } else if(value instanceof String){ + final String stringValue = (String)value; + if (navIncrementedEarlier || (fcts != null && fcts.containsKey(stringValue))) { + if (navi.get(stringValue) > 0) { + navi.dec(stringValue); + } + } + } + } + } + } + + + /* Note : would it be relevant to update here this.locationNavigator ? + * It looks like this nav is currently never filled, as a constraint on coordinates + * is expressed as a spatial filter not producing facets counts (see QueryParams.getFacetsFilterQueries()) + */ + + if (this.dateNavigator != null) { + if (navIncrementedWithFacets) { + fcts = facets.get(CollectionSchema.dates_in_content_dts.getSolrFieldName()); + } else { + fcts = null; + } + final Date[] dates = entry.datesInContent(); + if (dates != null) { + for (final Date date : dates) { + if (date != null) { + final String dateStr = ISO8601Formatter.FORMATTER.format(date); + if (navIncrementedEarlier || (fcts != null && fcts.containsKey(dateStr))) { + if (this.dateNavigator.get(dateStr) > 0) { + this.dateNavigator.dec(dateStr); + } + } + } + } + } + } + + if (this.protocolNavigator != null) { + if (navIncrementedWithFacets) { + fcts = facets.get(CollectionSchema.url_protocol_s.getSolrFieldName()); + } else { + fcts = null; + } + final String protocol = entry.url().getProtocol(); + if (protocol != null) { + if (navIncrementedEarlier || (fcts != null && fcts.containsKey(protocol))) { + if (this.protocolNavigator.get(protocol) > 0) { + this.protocolNavigator.dec(protocol); + } + } + } + } + + // get the vocabulary navigation + if (this.vocabularyNavigator != null) { + final Set genericFacets = new LinkedHashSet<>(); + for (final Tagging v : LibraryProvider.autotagging.getVocabularies()) { + genericFacets.add(v.getName()); + } + genericFacets.addAll(ProbabilisticClassifier.getContextNames()); + for (final String vocName : genericFacets) { + final String fieldName = CollectionSchema.VOCABULARY_PREFIX + vocName + + CollectionSchema.VOCABULARY_TERMS_SUFFIX; + if (navIncrementedWithFacets) { + fcts = facets.get(fieldName); + } else { + fcts = null; + } + final Object docValue = entry.getFieldValue(fieldName); + if (docValue instanceof String) { + if (navIncrementedEarlier || (fcts != null && fcts.containsKey((String) docValue))) { + final ScoreMap vocNav = this.vocabularyNavigator.get(vocName); + if (vocNav != null && vocNav.get((String) docValue) > 0) { + vocNav.dec((String) docValue); + } + } + } else if(docValue instanceof Collection) { + if (!((Collection) docValue).isEmpty()) { + for (final Object singleDocValue : (Collection) docValue) { + if (singleDocValue instanceof String) { + if (navIncrementedEarlier || (fcts != null && fcts.containsKey((String) singleDocValue))) { + final ScoreMap vocNav = this.vocabularyNavigator.get(vocName); + if (vocNav != null && vocNav.get((String) singleDocValue) > 0) { + vocNav.dec((String) singleDocValue); + } + } + } + } + } + } + } + } + } + public long getURLRetrievalTime() { return this.urlRetrievalAllTime; } @@ -1815,10 +1802,10 @@ public final class SearchEvent implements ScoreMapUpdatesListener { if (this.ref.size() <= ic) { // size matches return map directly result = this.getTopics(/*ic, 500*/); } else { // collect top most count topics - result = new ConcurrentScoreMap(); - Iterator it = this.getTopics(/*ic, 500*/).keys(false); + result = new ConcurrentScoreMap<>(); + final Iterator it = this.getTopics(/*ic, 500*/).keys(false); while (ic-- > 0 && it.hasNext()) { - String word = it.next(); + final String word = it.next(); result.set(word, this.ref.get(word)); } } @@ -1836,20 +1823,20 @@ public final class SearchEvent implements ScoreMapUpdatesListener { */ public boolean drainStacksToResult(boolean concurrentSnippetFetch) { // we take one entry from both stacks at the same time - boolean solrSuccess = drainSolrStackToResult(concurrentSnippetFetch); - boolean rwiSuccess = drainRWIStackToResult(concurrentSnippetFetch); + final boolean solrSuccess = drainSolrStackToResult(concurrentSnippetFetch); + final boolean rwiSuccess = drainRWIStackToResult(concurrentSnippetFetch); return solrSuccess || rwiSuccess; } /** * Adds the retrieved results from local and remotes RWI to the result list and * computes the text snippets - * @param concurrentSnippetFetch when true, allow starting a concurrent task to fetch a snippet when no one is already available + * @param concurrentSnippetFetch when true, allow starting a concurrent task to fetch a snippet when no one is already available * @return true when an entry has been effectively added to resultlist otherwise false */ - private boolean drainRWIStackToResult(boolean concurrentSnippetFetch) { - boolean success = false; - if (SearchEvent.this.snippetFetchAlive.get() >= 10 || MemoryControl.shortStatus() || !concurrentSnippetFetch) { + private boolean drainRWIStackToResult(boolean concurrentSnippetFetch) { + boolean success = false; + if (SearchEvent.this.snippetFetchAlive.get() >= 10 || MemoryControl.shortStatus() || !concurrentSnippetFetch) { // too many concurrent processes final URIMetadataNode noderwi = pullOneFilteredFromRWI(true); if (noderwi != null) { @@ -1857,7 +1844,7 @@ public final class SearchEvent implements ScoreMapUpdatesListener { success = true; } } else { - Thread t = new Thread("SearchEvent.drainStacksToResult.oneFilteredFromRWI") { + final Thread t = new Thread("SearchEvent.drainStacksToResult.oneFilteredFromRWI") { @Override public void run() { SearchEvent.this.oneFeederStarted(); @@ -1869,7 +1856,7 @@ public final class SearchEvent implements ScoreMapUpdatesListener { addResult(getSnippet(noderwi, SearchEvent.this.query.snippetCacheStrategy), noderwi.score()); } catch (final Throwable e) { ConcurrentLog.logException(e); - } finally { + } finally { SearchEvent.this.snippetFetchAlive.decrementAndGet(); } } @@ -1881,20 +1868,20 @@ public final class SearchEvent implements ScoreMapUpdatesListener { if (SearchEvent.this.query.snippetCacheStrategy == null) t.run(); else t.start(); //no need for concurrency if there is no latency } return success; - } + } /** * Adds the retrieved full text results from local and remotes Solr to the result list and * computes the text snippets - * @param concurrentSnippetFetch when true, allow starting a concurrent task to fetch a snippet when no one is already available + * @param concurrentSnippetFetch when true, allow starting a concurrent task to fetch a snippet when no one is already available * @return true when an entry has been effectively added to resultlist otherwise false */ - private boolean drainSolrStackToResult(boolean concurrentSnippetFetch) { - boolean success = false; - final Element localEntryElement = this.nodeStack.sizeQueue() > 0 ? this.nodeStack.poll() : null; + private boolean drainSolrStackToResult(boolean concurrentSnippetFetch) { + boolean success = false; + final Element localEntryElement = this.nodeStack.sizeQueue() > 0 ? this.nodeStack.poll() : null; final URIMetadataNode node = localEntryElement == null ? null : localEntryElement.getElement(); if (node != null) { - LinkedHashSet solrsnippetlines = this.snippets.remove(ASCII.String(node.hash())); // we can remove this because it's used only once + final LinkedHashSet solrsnippetlines = this.snippets.remove(ASCII.String(node.hash())); // we can remove this because it's used only once if (solrsnippetlines != null && solrsnippetlines.size() > 0) { OpensearchResponseWriter.removeSubsumedTitle(solrsnippetlines, node.dc_title()); final TextSnippet solrsnippet = new TextSnippet(node.url(), OpensearchResponseWriter.getLargestSnippet(solrsnippetlines), true, ResultClass.SOURCE_SOLR, ""); @@ -1908,7 +1895,7 @@ public final class SearchEvent implements ScoreMapUpdatesListener { false); final String solrsnippetline = solrsnippet.descriptionline(this.getQuery().getQueryGoal()); final String yacysnippetline = yacysnippet.descriptionline(this.getQuery().getQueryGoal()); - URIMetadataNode re = node.makeResultEntry(this.query.getSegment(), this.peers, solrsnippetline.length() > yacysnippetline.length() ? solrsnippet : yacysnippet); + final URIMetadataNode re = node.makeResultEntry(this.query.getSegment(), this.peers, solrsnippetline.length() > yacysnippetline.length() ? solrsnippet : yacysnippet); addResult(re, localEntryElement.getWeight()); success = true; } else { @@ -1938,12 +1925,12 @@ public final class SearchEvent implements ScoreMapUpdatesListener { } } } - return success; - } - + return success; + } + /** * place the result to the result vector and apply post-ranking - * post-ranking is added to the current score, + * post-ranking is added to the current score, * @param resultEntry to add * @param score current ranking */ @@ -1955,8 +1942,8 @@ public final class SearchEvent implements ScoreMapUpdatesListener { // final long ranking = ((long) (score * 128.f)) + postRanking(resultEntry, this.getTopicNavigator(MAX_TOPWORDS)); resultEntry.setScore(ranking); // update the score of resultEntry for access by search interface / api - this.resultList.put(new ReverseElement(resultEntry, ranking)); // remove smallest in case of overflow - if (pollImmediately) this.resultList.poll(); // prevent re-ranking in case there is only a single index source which has already ranked entries. + this.resultList.put(new ReverseElement<>(resultEntry, ranking)); // remove smallest in case of overflow + if (this.pollImmediately) this.resultList.poll(); // prevent re-ranking in case there is only a single index source which has already ranked entries. this.addTopics(resultEntry); } @@ -1965,26 +1952,26 @@ public final class SearchEvent implements ScoreMapUpdatesListener { // for media search: prefer pages with many links switch (this.query.contentdom) { - case IMAGE: - r += rentry.limage() << this.query.ranking.coeff_cathasimage; - break; - case AUDIO: - r += rentry.laudio() << this.query.ranking.coeff_cathasaudio; - break; - case VIDEO: - r += rentry.lvideo() << this.query.ranking.coeff_cathasvideo; - break; - case APP: - r += rentry.lapp() << this.query.ranking.coeff_cathasapp; - break; - default: - break; + case IMAGE: + r += rentry.limage() << this.query.ranking.coeff_cathasimage; + break; + case AUDIO: + r += rentry.laudio() << this.query.ranking.coeff_cathasaudio; + break; + case VIDEO: + r += rentry.lvideo() << this.query.ranking.coeff_cathasvideo; + break; + case APP: + r += rentry.lapp() << this.query.ranking.coeff_cathasapp; + break; + default: + break; } // apply citation count //System.out.println("POSTRANKING CITATION: references = " + rentry.referencesCount() + ", inbound = " + rentry.llocal() + ", outbound = " + rentry.lother()); if (this.query.getSegment().connectedCitation()) { - int referencesCount = this.query.getSegment().urlCitation().count(rentry.hash()); + final int referencesCount = this.query.getSegment().urlCitation().count(rentry.hash()); r += (128 * referencesCount / (1 + 2 * rentry.llocal() + rentry.lother())) << this.query.ranking.coeff_citation; } // prefer hit with 'prefer' pattern @@ -2002,11 +1989,11 @@ public final class SearchEvent implements ScoreMapUpdatesListener { // the token map is used (instead of urlcomps/descrcomps) to determine appearance in url/title and eliminate double occurances // (example Title="News News News News News News - today is party -- News News News News News News" to add one score instead of 12 * score !) for (final String urlcomp : urlcompmap) { - int tc = topwords.get(urlcomp); + final int tc = topwords.get(urlcomp); if (tc > 0) r += tc << this.query.ranking.coeff_urlcompintoplist; } for (final String descrcomp : descrcompmap) { - int tc = topwords.get(descrcomp); + final int tc = topwords.get(descrcomp); if (tc > 0) r += tc << this.query.ranking.coeff_descrcompintoplist; } @@ -2019,7 +2006,7 @@ public final class SearchEvent implements ScoreMapUpdatesListener { } return r; } - + public URIMetadataNode getSnippet(URIMetadataNode page, final CacheStrategy cacheStrategy) { if (page == null) return null; @@ -2037,10 +2024,10 @@ public final class SearchEvent implements ScoreMapUpdatesListener { } // load snippet - ContentDomain contentDomain = page.getContentDomain(); + final ContentDomain contentDomain = page.getContentDomain(); if (contentDomain == Classification.ContentDomain.TEXT || contentDomain == Classification.ContentDomain.ALL) { // attach text snippet - long startTime = System.currentTimeMillis(); + final long startTime = System.currentTimeMillis(); final TextSnippet snippet = new TextSnippet( this.loader, page, @@ -2075,7 +2062,7 @@ public final class SearchEvent implements ScoreMapUpdatesListener { } return page.makeResultEntry(this.query.getSegment(), this.peers, null); // result without snippet } - + /** * This is the access point for the search interface to retrive ranked results. * for display. @@ -2089,73 +2076,73 @@ public final class SearchEvent implements ScoreMapUpdatesListener { // (happens if a search pages is accessed a second time) final long finishTime = timeout == Long.MAX_VALUE ? Long.MAX_VALUE : System.currentTimeMillis() + timeout; EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.EventSearch(this.query.id(true), SearchEventType.ONERESULT, "started, item = " + item + ", available = " + this.getResultCount(), 0, 0), false); - + // wait until a local solr is finished, we must do that to be able to check if we need more - if (this.localsolrsearch != null && this.localsolrsearch.isAlive()) { - try { - this.localsolrsearch.join(100); - } catch (final InterruptedException e) { - log.warn("Wait for local solr search was interrupted."); - } - } - if (item >= this.localsolroffset && this.local_solr_stored.get() == 0 && (this.localsolrsearch != null && this.localsolrsearch.isAlive())) { - try { - this.localsolrsearch.join(); - } catch (final InterruptedException e) { - log.warn("Wait for local solr search was interrupted."); - } - } + if (this.localsolrsearch != null && this.localsolrsearch.isAlive()) { + try { + this.localsolrsearch.join(100); + } catch (final InterruptedException e) { + log.warn("Wait for local solr search was interrupted."); + } + } + if (item >= this.localsolroffset && this.local_solr_stored.get() == 0 && (this.localsolrsearch != null && this.localsolrsearch.isAlive())) { + try { + this.localsolrsearch.join(); + } catch (final InterruptedException e) { + log.warn("Wait for local solr search was interrupted."); + } + } if (this.remote && item >= this.localsolroffset && this.local_solr_stored.get() > item) { - /* Request mixing remote and local Solr results : load remaining local solr results now. - * For local only search, a new SearchEvent should be created, starting directly at the requested offset, - * thus allowing to handle last pages of large resultsets - */ - int nextitems = item - this.localsolroffset + this.query.itemsPerPage; // example: suddenly switch to item 60, just 10 had been shown, 20 loaded. + /* Request mixing remote and local Solr results : load remaining local solr results now. + * For local only search, a new SearchEvent should be created, starting directly at the requested offset, + * thus allowing to handle last pages of large resultsets + */ + final int nextitems = item - this.localsolroffset + this.query.itemsPerPage; // example: suddenly switch to item 60, just 10 had been shown, 20 loaded. if (this.localsolrsearch != null && this.localsolrsearch.isAlive()) {try {this.localsolrsearch.join();} catch (final InterruptedException e) {}} - if (!Switchboard.getSwitchboard().getConfigBool(SwitchboardConstants.DEBUG_SEARCH_LOCAL_SOLR_OFF, false)) { - // Do not increment again navigators from the local Solr on next local pages retrieval, as facets counts scope is on the total results and should already have been added - final boolean useSolrFacets = (this.localsolrsearch == null); - final boolean incrementNavigators = false; - this.localsolrsearch = RemoteSearch.solrRemoteSearch(this, - this.query.solrQuery(this.query.contentdom, this.query.isStrictContentDom(), useSolrFacets, this.excludeintext_image), - this.localsolroffset, nextitems, null /* this peer */, 0, Switchboard.urlBlacklist, useSolrFacets, incrementNavigators); - } - this.localsolroffset += nextitems; - } - + if (!Switchboard.getSwitchboard().getConfigBool(SwitchboardConstants.DEBUG_SEARCH_LOCAL_SOLR_OFF, false)) { + // Do not increment again navigators from the local Solr on next local pages retrieval, as facets counts scope is on the total results and should already have been added + final boolean useSolrFacets = (this.localsolrsearch == null); + final boolean incrementNavigators = false; + this.localsolrsearch = RemoteSearch.solrRemoteSearch(this, + this.query.solrQuery(this.query.contentdom, this.query.isStrictContentDom(), useSolrFacets, this.excludeintext_image), + this.localsolroffset, nextitems, null /* this peer */, 0, Switchboard.urlBlacklist, useSolrFacets, incrementNavigators); + } + this.localsolroffset += nextitems; + } + // now pull results as long as needed and as long as possible - if (this.remote && item < 10 && this.resultList.sizeAvailable() <= item) { - try { - Thread.sleep(100); - } catch (final InterruptedException e) { - log.warn("Remote search results wait was interrupted."); - } - } - + if (this.remote && item < 10 && this.resultList.sizeAvailable() <= item) { + try { + Thread.sleep(100); + } catch (final InterruptedException e) { + log.warn("Remote search results wait was interrupted."); + } + } + final int resultListIndex; if (this.remote) { - resultListIndex = item; + resultListIndex = item; } else { - resultListIndex = item - (this.localsolroffset - this.query.itemsPerPage); + resultListIndex = item - (this.localsolroffset - this.query.itemsPerPage); } while ( this.resultList.sizeAvailable() <= resultListIndex && (this.rwiQueueSize() > 0 || this.nodeStack.sizeQueue() > 0 || - (!this.isFeedingFinished() && System.currentTimeMillis() < finishTime))) { - if (!drainStacksToResult(true)) { - try { - Thread.sleep(10); - } catch (final InterruptedException e) { - log.warn("Search results wait was interrupted."); - } - } - } - + (!this.isFeedingFinished() && System.currentTimeMillis() < finishTime))) { + if (!drainStacksToResult(true)) { + try { + Thread.sleep(10); + } catch (final InterruptedException e) { + log.warn("Search results wait was interrupted."); + } + } + } + // check if we have a success if (this.resultList.sizeAvailable() > resultListIndex) { // we have the wanted result already in the result array .. return that final URIMetadataNode re = this.resultList.element(resultListIndex).getElement(); EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.EventSearch(this.query.id(true), SearchEventType.ONERESULT, "fetched, item = " + item + ", available = " + this.getResultCount() + ": " + re.urlstring(), 0, 0), false); - + /* if (this.localsolrsearch == null || (!this.localsolrsearch.isAlive() && this.local_solr_stored.get() > this.localsolroffset && (item + 1) % this.query.itemsPerPage == 0)) { // at the end of a list, trigger a next solr search @@ -2164,7 +2151,7 @@ public final class SearchEvent implements ScoreMapUpdatesListener { } this.localsolroffset += this.query.itemsPerPage; } - */ + */ return re; } @@ -2175,43 +2162,43 @@ public final class SearchEvent implements ScoreMapUpdatesListener { /** Image results counter */ private int imagePageCounter = 0; - private LinkedHashMap imageViewed = new LinkedHashMap(); - private LinkedHashMap imageSpareGood = new LinkedHashMap(); - private LinkedHashMap imageSpareBad = new LinkedHashMap(); + private final LinkedHashMap imageViewed = new LinkedHashMap<>(); + private final LinkedHashMap imageSpareGood = new LinkedHashMap<>(); + private final LinkedHashMap imageSpareBad = new LinkedHashMap<>(); private ImageResult nthImage(int item) { - Object o = SetTools.nth(this.imageViewed.values(), item); + final Object o = SetTools.nth(this.imageViewed.values(), item); if (o == null) return null; return (ImageResult) o; } private boolean hasSpare() { - return imageSpareGood.size() > 0 || imageSpareBad.size() > 0; + return this.imageSpareGood.size() > 0 || this.imageSpareBad.size() > 0; } private boolean containsSpare(String id) { - return imageSpareGood.containsKey(id) || imageSpareBad.containsKey(id); + return this.imageSpareGood.containsKey(id) || this.imageSpareBad.containsKey(id); } private int sizeSpare() { - return imageSpareGood.size() + imageSpareBad.size(); + return this.imageSpareGood.size() + this.imageSpareBad.size(); } private ImageResult nextSpare() { - if (imageSpareGood.size() > 0) { - Map.Entry next = imageSpareGood.entrySet().iterator().next(); - imageViewed.put(next.getKey(), next.getValue()); - imageSpareGood.remove(next.getKey()); + if (this.imageSpareGood.size() > 0) { + final Map.Entry next = this.imageSpareGood.entrySet().iterator().next(); + this.imageViewed.put(next.getKey(), next.getValue()); + this.imageSpareGood.remove(next.getKey()); return next.getValue(); } - if (imageSpareBad.size() > 0) { - Map.Entry next = imageSpareBad.entrySet().iterator().next(); - imageViewed.put(next.getKey(), next.getValue()); - imageSpareBad.remove(next.getKey()); + if (this.imageSpareBad.size() > 0) { + final Map.Entry next = this.imageSpareBad.entrySet().iterator().next(); + this.imageViewed.put(next.getKey(), next.getValue()); + this.imageSpareBad.remove(next.getKey()); return next.getValue(); } return null; } - + public ImageResult oneImageResult(final int item, final long timeout, final boolean strictContentDom) throws MalformedURLException { - if (item < imageViewed.size()) return nthImage(item); - if (imageSpareGood.size() > 0) return nextSpare(); // first put out all good spare, but no bad spare - URIMetadataNode doc = oneResult(imagePageCounter++, timeout); // we must use a different counter here because the image counter can be higher when one page filled up several spare + if (item < this.imageViewed.size()) return nthImage(item); + if (this.imageSpareGood.size() > 0) return nextSpare(); // first put out all good spare, but no bad spare + final URIMetadataNode doc = oneResult(this.imagePageCounter++, timeout); // we must use a different counter here because the image counter can be higher when one page filled up several spare // check if the match was made in the url or in the image links if (doc == null) { if (hasSpare()) return nextSpare(); @@ -2224,52 +2211,52 @@ public final class SearchEvent implements ScoreMapUpdatesListener { // boolean fakeImageHost = ms.url().getHost() != null && ms.url().getHost().indexOf("wikipedia") > 0; // pages with image extension from wikipedia do not contain image files but html files... I know this is a bad hack, but many results come from wikipedia and we must handle that // generalize above hack (regarding url with file extension but beeing a html (with html mime) if (doc.doctype() == Response.DT_IMAGE) { - /* Icons are not always .ico files and should now be indexed in icons_urlstub_sxt. But this test still makes sense for older indexed documents, - * or documents coming from previous versions peers */ + /* Icons are not always .ico files and should now be indexed in icons_urlstub_sxt. But this test still makes sense for older indexed documents, + * or documents coming from previous versions peers */ if (!doc.url().getFileName().endsWith(".ico")) { // we don't want favicons final String id = ASCII.String(doc.hash()); // check image size final Collection height = doc.getFieldValues(CollectionSchema.images_height_val.getSolrFieldName()); final Collection width = doc.getFieldValues(CollectionSchema.images_width_val.getSolrFieldName()); - int h = height == null ? 0 : (Integer) height.iterator().next(); // might be -1 for unknown - int w = width == null ? 0 : (Integer) width.iterator().next(); + final int h = height == null ? 0 : (Integer) height.iterator().next(); // might be -1 for unknown + final int w = width == null ? 0 : (Integer) width.iterator().next(); if ((h <= 0 || h > 16) && (w <= 0 || w > 16)) { // we don't want too small images (< 16x16) - if (!imageViewed.containsKey(id) && !containsSpare(id)) imageSpareGood.put(id, new ImageResult(doc.url(), doc.url(), doc.mime(), doc.title(), w, h, 0)); + if (!this.imageViewed.containsKey(id) && !containsSpare(id)) this.imageSpareGood.put(id, new ImageResult(doc.url(), doc.url(), doc.mime(), doc.title(), w, h, 0)); } } } else if(!strictContentDom) { - Collection altO = doc.getFieldValues(CollectionSchema.images_alt_sxt.getSolrFieldName()); - Collection imgO = doc.getFieldValues(CollectionSchema.images_urlstub_sxt.getSolrFieldName()); + final Collection altO = doc.getFieldValues(CollectionSchema.images_alt_sxt.getSolrFieldName()); + final Collection imgO = doc.getFieldValues(CollectionSchema.images_urlstub_sxt.getSolrFieldName()); if (imgO != null && imgO.size() > 0 && imgO instanceof List) { - List alt = altO == null ? null : (List) altO; - List img = (List) imgO; - List prt = CollectionConfiguration.indexedList2protocolList(doc.getFieldValues(CollectionSchema.images_protocol_sxt.getSolrFieldName()), img.size()); - Collection heightO = doc.getFieldValues(CollectionSchema.images_height_val.getSolrFieldName()); - Collection widthO = doc.getFieldValues(CollectionSchema.images_width_val.getSolrFieldName()); - List height = heightO == null ? null : (List) heightO; - List width = widthO == null ? null : (List) widthO; + final List alt = altO == null ? null : (List) altO; + final List img = (List) imgO; + final List prt = CollectionConfiguration.indexedList2protocolList(doc.getFieldValues(CollectionSchema.images_protocol_sxt.getSolrFieldName()), img.size()); + final Collection heightO = doc.getFieldValues(CollectionSchema.images_height_val.getSolrFieldName()); + final Collection widthO = doc.getFieldValues(CollectionSchema.images_width_val.getSolrFieldName()); + final List height = heightO == null ? null : (List) heightO; + final List width = widthO == null ? null : (List) widthO; for (int c = 0; c < img.size(); c++) { - String image_urlstub = (String) img.get(c); - /* Icons are not always .ico files and should now be indexed in icons_urlstub_sxt. But this test still makes sense for older indexed documents, - * or documents coming from previous versions peers */ + final String image_urlstub = (String) img.get(c); + /* Icons are not always .ico files and should now be indexed in icons_urlstub_sxt. But this test still makes sense for older indexed documents, + * or documents coming from previous versions peers */ if (image_urlstub.endsWith(".ico")) continue; // we don't want favicons, makes the result look idiotic try { - int h = height == null ? 0 : (Integer) height.get(c); - int w = width == null ? 0 : (Integer) width.get(c); + final int h = height == null ? 0 : (Integer) height.get(c); + final int w = width == null ? 0 : (Integer) width.get(c); // check size good for display (parser may init unknown dimension with -1) if (h > 0 && h <= 16) continue; // to small for display if (w > 0 && w <= 16) continue; // to small for display - - DigestURL imageUrl = new DigestURL((prt != null && prt.size() > c ? prt.get(c) : "http") + "://" + image_urlstub); - String id = ASCII.String(imageUrl.hash()); - if (!imageViewed.containsKey(id) && !containsSpare(id)) { - String image_alt = (alt != null && alt.size() > c) ? (String) alt.get(c) : ""; - ImageResult imageResult = new ImageResult(doc.url(), imageUrl, "", image_alt, w, h, 0); - boolean match = (query.getQueryGoal().matches(image_urlstub) || query.getQueryGoal().matches(image_alt)); - if (match) imageSpareGood.put(id, imageResult); else imageSpareBad.put(id, imageResult); + + final DigestURL imageUrl = new DigestURL((prt != null && prt.size() > c ? prt.get(c) : "http") + "://" + image_urlstub); + final String id = ASCII.String(imageUrl.hash()); + if (!this.imageViewed.containsKey(id) && !containsSpare(id)) { + final String image_alt = (alt != null && alt.size() > c) ? (String) alt.get(c) : ""; + final ImageResult imageResult = new ImageResult(doc.url(), imageUrl, "", image_alt, w, h, 0); + final boolean match = (this.query.getQueryGoal().matches(image_urlstub) || this.query.getQueryGoal().matches(image_alt)); + if (match) this.imageSpareGood.put(id, imageResult); else this.imageSpareBad.put(id, imageResult); } - } catch (MalformedURLException e) { + } catch (final MalformedURLException e) { continue; } } @@ -2297,76 +2284,76 @@ public final class SearchEvent implements ScoreMapUpdatesListener { return this.imageUrl.toNormalform(false); } } - + public ArrayList> completeResults(final long waitingtime) { final long timeout = waitingtime == Long.MAX_VALUE ? Long.MAX_VALUE : System.currentTimeMillis() + waitingtime; int i = 0; - + while (this.resultList.sizeAvailable() < this.query.neededResults() && System.currentTimeMillis() < timeout) { - URIMetadataNode re = oneResult(i++, timeout - System.currentTimeMillis()); + final URIMetadataNode re = oneResult(i++, timeout - System.currentTimeMillis()); if (re == null) break; } return this.resultList.list(Math.min(this.query.neededResults(), this.resultList.sizeAvailable())); } - - /** - * Re-sort results cached in the resultList and eventually include in that list - * elements with higher ranks from the Solr and RWI stacks. - */ - public void resortCachedResults() { - /* - * If stacks feeding is finished, drain as much as possible elements from stacks - * while their ranking is higher than the last element in the result list - */ - if (isFeedingFinished() && this.resortCacheAllowed.tryAcquire()) { - /* - * First put all elements of the resultList in its own sorted queue to have a - * consistent sorting on the whole set - */ - this.resultList.requeueDrainedElements(); - - /* - * Note : if the resultList is full (its maxSize has been reached) some elements - * with the lowest ranking may be lost in this next step. Not really a problem - * because they were not supposed to be here. If really necessary to keep them, - * growing the maxSize of the resultList should be considered here. - */ - WeakPriorityBlockingQueue.Element initialLastResult = this.resultList.getLastInQueue(); - - /* - * Drain stacks in two steps (Solr, then RWI), because one stack might still - * contains higher ranked results when only lower ranked remain in the other - */ - - /* - * Here we do not fetch snippets concurrently as we want to know immediately the - * drained element position in the final result list - */ - boolean drained = drainSolrStackToResult(false); - WeakPriorityBlockingQueue.Element newLastResult = this.resultList.getLastInQueue(); - - /* - * Loop while at least one element has been added to the results list and is not - * the last considering its final rank - */ - while (drained && newLastResult == initialLastResult) { - drained = drainSolrStackToResult(false); - newLastResult = this.resultList.getLastInQueue(); - } - - drained = drainRWIStackToResult(false); - newLastResult = this.resultList.getLastInQueue(); - - /* - * Loop while at least one element has been added to the results list and is not - * the last considering its final rank - */ - while (drained && newLastResult == initialLastResult) { - drained = drainRWIStackToResult(false); - newLastResult = this.resultList.getLastInQueue(); - } - } - } + + /** + * Re-sort results cached in the resultList and eventually include in that list + * elements with higher ranks from the Solr and RWI stacks. + */ + public void resortCachedResults() { + /* + * If stacks feeding is finished, drain as much as possible elements from stacks + * while their ranking is higher than the last element in the result list + */ + if (isFeedingFinished() && this.resortCacheAllowed.tryAcquire()) { + /* + * First put all elements of the resultList in its own sorted queue to have a + * consistent sorting on the whole set + */ + this.resultList.requeueDrainedElements(); + + /* + * Note : if the resultList is full (its maxSize has been reached) some elements + * with the lowest ranking may be lost in this next step. Not really a problem + * because they were not supposed to be here. If really necessary to keep them, + * growing the maxSize of the resultList should be considered here. + */ + final WeakPriorityBlockingQueue.Element initialLastResult = this.resultList.getLastInQueue(); + + /* + * Drain stacks in two steps (Solr, then RWI), because one stack might still + * contains higher ranked results when only lower ranked remain in the other + */ + + /* + * Here we do not fetch snippets concurrently as we want to know immediately the + * drained element position in the final result list + */ + boolean drained = drainSolrStackToResult(false); + WeakPriorityBlockingQueue.Element newLastResult = this.resultList.getLastInQueue(); + + /* + * Loop while at least one element has been added to the results list and is not + * the last considering its final rank + */ + while (drained && newLastResult == initialLastResult) { + drained = drainSolrStackToResult(false); + newLastResult = this.resultList.getLastInQueue(); + } + + drained = drainRWIStackToResult(false); + newLastResult = this.resultList.getLastInQueue(); + + /* + * Loop while at least one element has been added to the results list and is not + * the last considering its final rank + */ + while (drained && newLastResult == initialLastResult) { + drained = drainRWIStackToResult(false); + newLastResult = this.resultList.getLastInQueue(); + } + } + } /** * delete a specific entry from the search results @@ -2390,17 +2377,17 @@ public final class SearchEvent implements ScoreMapUpdatesListener { public ReferenceOrder getOrder() { return this.order; } - - /** - * Check whether feeding from all available data sources is finished (remote - * RWI and Solr requests, local RWI and Solr requests, Heuristics - * requests...) - * @return true when all available feeders on this search event are terminated - */ + + /** + * Check whether feeding from all available data sources is finished (remote + * RWI and Solr requests, local RWI and Solr requests, Heuristics + * requests...) + * @return true when all available feeders on this search event are terminated + */ public boolean isFeedingFinished() { return - this.feedersTerminated.intValue() > (this.remote ? 1 : 0) && - this.feedersAlive.get() == 0; + this.feedersTerminated.intValue() > (this.remote ? 1 : 0) && + this.feedersAlive.get() == 0; } /** @@ -2415,7 +2402,7 @@ public final class SearchEvent implements ScoreMapUpdatesListener { public void oneFeederStarted() { this.feedersAlive.incrementAndGet(); } - + public QueryParams getQuery() { return this.query; } @@ -2435,7 +2422,7 @@ public final class SearchEvent implements ScoreMapUpdatesListener { protected boolean addRunning() { return this.addRunning; } - + public boolean rwiIsEmpty() { if ( !this.rwiStack.isEmpty() ) { return false; @@ -2455,7 +2442,7 @@ public final class SearchEvent implements ScoreMapUpdatesListener { } return c; } - + protected boolean testFlags(final Bitfield flags) { if (this.query.constraint == null) return true; // test if ientry matches with filter @@ -2472,7 +2459,7 @@ public final class SearchEvent implements ScoreMapUpdatesListener { } return false; } - + protected Map> searchContainerMap() { // direct access to the result maps is needed for abstract generation // this is only available if execQuery() was called before @@ -2536,12 +2523,12 @@ public final class SearchEvent implements ScoreMapUpdatesListener { for ( final String w : words ) { word = w.toLowerCase(); if ( word.length() > 2 - && "http_html_php_ftp_www_com_org_net_gov_edu_index_home_page_for_usage_the_and_zum_der_die_das_und_the_zur_bzw_mit_blog_wiki_aus_bei_off" + && "http_html_php_ftp_www_com_org_net_gov_edu_index_home_page_for_usage_the_and_zum_der_die_das_und_the_zur_bzw_mit_blog_wiki_aus_bei_off" .indexOf(word) < 0 - && !this.query.getQueryGoal().containsInclude(word) - && lettermatch.matcher(word).matches() - && !Switchboard.badwords.contains(word) - && !Switchboard.stopwords.contains(word) ) { + && !this.query.getQueryGoal().containsInclude(word) + && lettermatch.matcher(word).matches() + && !Switchboard.badwords.contains(word) + && !Switchboard.stopwords.contains(word) ) { this.ref.inc(word); } }