From 54207c821b75070b3e7d4e490cd10f8be5d330b7 Mon Sep 17 00:00:00 2001 From: reger Date: Fri, 24 Jun 2016 01:42:04 +0200 Subject: [PATCH 01/12] fix missing quote in gr.lng, it.lng, add sentence in master.lng.xlf and remove 2 not needed entries --- locales/gr.lng | 2 +- locales/it.lng | 2 +- locales/master.lng.xlf | 23 +++++++++++++---------- 3 files changed, 15 insertions(+), 12 deletions(-) diff --git a/locales/gr.lng b/locales/gr.lng index fa73b2581..bccae501d 100644 --- a/locales/gr.lng +++ b/locales/gr.lng @@ -101,7 +101,7 @@ You do not need to provide any personal data here, but if you want to distribute #Yahoo!==Yahoo! #MSN=MSN Comment==Σχόλιο -"Save"==Αποθήκευση +"Save"=="Αποθήκευση" #----------------------------- #File: Connections_p.html diff --git a/locales/it.lng b/locales/it.lng index aa0777d40..4fb128405 100644 --- a/locales/it.lng +++ b/locales/it.lng @@ -97,7 +97,7 @@ You do not need to provide any personal data here, but if you want to distribute #Yahoo!==Yahoo! #MSN=MSN Comment==Commento -"Save"==Salva +"Save"=="Salva" #----------------------------- #File: Connections_p.html diff --git a/locales/master.lng.xlf b/locales/master.lng.xlf index 2d2e5b2ce..8e01c1ff8 100644 --- a/locales/master.lng.xlf +++ b/locales/master.lng.xlf @@ -1587,9 +1587,6 @@ "Submit" - - http://www.iana.org/assignments/media-types/</a> - @@ -2117,6 +2114,9 @@ Release will be installed. Please wait. + + You installed YaCy with a package manager. + To update YaCy, use the package manager: @@ -2192,9 +2192,6 @@ Last Deploy - - You installed YaCy with a package manager. - @@ -6752,10 +6749,19 @@ "Set Boost Query" - + field not in local index (boost has no effect) + + You can boost with vocabularies, use the field + + + with values + + + You can also boost on logarithmic occurrence counters of the fields + "Set Field Boosts" @@ -8658,9 +8664,6 @@ See the page info about the url. - - YaCy '#[clientname]#': View URL Content - View URL Content From 1122c9f0e8b24634f90b60a4bf96fd0a6319a9a5 Mon Sep 17 00:00:00 2001 From: reger Date: Fri, 24 Jun 2016 23:52:42 +0200 Subject: [PATCH 02/12] add maven release profile plugin version (following maven's suggestion) + upd some plugin version --- pom.xml | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/pom.xml b/pom.xml index 1f626a6b2..40e4e71e4 100644 --- a/pom.xml +++ b/pom.xml @@ -151,7 +151,7 @@ org.apache.maven.plugins maven-javadoc-plugin - 2.10.3 + 2.10.4 javadoc true @@ -248,7 +248,7 @@ maven-assembly-plugin - 2.5.3 + 2.6 assembly.xml @@ -266,6 +266,16 @@ + + org.apache.maven.plugins + maven-source-plugin + 3.0.0 + + + org.apache.maven.plugins + maven-deploy-plugin + 2.8.2 + From d4da4805a836e1124003efaf04b0c65ebb5af882 Mon Sep 17 00:00:00 2001 From: reger Date: Sat, 25 Jun 2016 02:46:44 +0200 Subject: [PATCH 03/12] internal wiki code, require header line to start with markup (to allow something like "one=two" as text) + incl. test case --- source/net/yacy/data/wiki/WikiCode.java | 12 ++++++++- .../java/net/yacy/data/wiki/WikiCodeTest.java | 25 ++++++++++++++++++- 2 files changed, 35 insertions(+), 2 deletions(-) diff --git a/source/net/yacy/data/wiki/WikiCode.java b/source/net/yacy/data/wiki/WikiCode.java index 7ca013074..f28072213 100644 --- a/source/net/yacy/data/wiki/WikiCode.java +++ b/source/net/yacy/data/wiki/WikiCode.java @@ -937,7 +937,17 @@ public class WikiCode extends AbstractWikiParser implements WikiParser { //extra treatment for headlines if (Arrays.binarySearch(HEADLINE_TAGS, tags.openWiki) >= 0) { - processHeadline(stringBuilder, firstPosition, tags, secondPosition, direlem); + // require line starts with headline markup (hdr e.g. " == Title == " but not "Seven = six plus one" ) + int i = 0; + boolean beginsWith = true; + while (i < firstPosition) { + if (stringBuilder.charAt(i) > ' ') { + beginsWith = false; + break; + } + i++; + } + if (beginsWith) processHeadline(stringBuilder, firstPosition, tags, secondPosition, direlem); } else { final int oldLength = stringBuilder.length(); stringBuilder.replace(firstPosition, firstPosition + tags.openWikiLength, tags.openHTML); diff --git a/test/java/net/yacy/data/wiki/WikiCodeTest.java b/test/java/net/yacy/data/wiki/WikiCodeTest.java index c4b75a511..5a2d4e08d 100644 --- a/test/java/net/yacy/data/wiki/WikiCodeTest.java +++ b/test/java/net/yacy/data/wiki/WikiCodeTest.java @@ -1,5 +1,6 @@ package net.yacy.data.wiki; +import java.io.BufferedReader; import org.junit.Test; import static org.junit.Assert.*; @@ -32,4 +33,26 @@ public class WikiCodeTest { } } -} \ No newline at end of file + /** + * test header wiki markup + */ + @Test + public void testProcessLineOfWikiCode() { + String[] hdrTeststr = new String[]{ // ok test header + "== Header ==", "==Header=="}; + + String[] nohdrTeststr = new String[]{ // wrong test header + "Text of = Header =", "One=Two"}; + + WikiCode wc = new WikiCode(); + + for (String s : hdrTeststr) { // test ok header + String erg = wc.transform("8090", s); + assertTrue("

tag expected:"+erg, erg.contains("

")); + } + for (String s : nohdrTeststr) { // test wrong header + String erg = wc.transform("8090", s); + assertFalse("no header tag expected:"+erg, erg.contains("

")); + } + } +} From a476d06aec7e39a9fcc39a083c75cdbe3f083be0 Mon Sep 17 00:00:00 2001 From: reger Date: Sat, 25 Jun 2016 02:59:44 +0200 Subject: [PATCH 04/12] wiki header code test string add "closing" tag --- test/java/net/yacy/data/wiki/WikiCodeTest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/java/net/yacy/data/wiki/WikiCodeTest.java b/test/java/net/yacy/data/wiki/WikiCodeTest.java index 5a2d4e08d..ca110fb9a 100644 --- a/test/java/net/yacy/data/wiki/WikiCodeTest.java +++ b/test/java/net/yacy/data/wiki/WikiCodeTest.java @@ -42,7 +42,7 @@ public class WikiCodeTest { "== Header ==", "==Header=="}; String[] nohdrTeststr = new String[]{ // wrong test header - "Text of = Header =", "One=Two"}; + "Text of = Header, false = wrong", "One=Two"}; WikiCode wc = new WikiCode(); From 41c36ffd751dadb84376ca4d8c1556c09d6375d9 Mon Sep 17 00:00:00 2001 From: reger Date: Sun, 26 Jun 2016 06:46:26 +0200 Subject: [PATCH 05/12] exclude rejected results from result count (by using the resultcontainer.size instead of input docList.size) skip waiting for write-search-result-to-local-index (by removing the Thread.join - which will bring a small performance increase) --- source/net/yacy/peers/Protocol.java | 111 +++++++++++++--------------- 1 file changed, 51 insertions(+), 60 deletions(-) diff --git a/source/net/yacy/peers/Protocol.java b/source/net/yacy/peers/Protocol.java index bb5ea5954..1acfef42a 100644 --- a/source/net/yacy/peers/Protocol.java +++ b/source/net/yacy/peers/Protocol.java @@ -1147,25 +1147,25 @@ public final class Protocol { return 0; } - List container = new ArrayList(); + List resultContainer = new ArrayList(); Network.log.info("SEARCH (solr), returned " + docList[0].size() + " out of " + docList[0].getNumFound() + " documents and " + facets.size() + " facets " + facets.keySet().toString() + " from " + (target == null ? "shard" : ("peer " + target.hash + ":" + target.getName()))); int term = count; Collection docs; if (event.addResultsToLocalIndex) { // only needed to store remote results docs = new ArrayList(docList[0].size()); } else docs = null; - for (final SolrDocument doc: docList[0]) { + for (final SolrDocument tmpdoc: docList[0]) { //System.out.println("***DEBUG*** " + ((String) doc.getFieldValue("sku"))); if ( term-- <= 0 ) { break; // do not process more that requested (in case that evil peers fill us up with rubbish) } // get one single search result - if ( doc == null ) { + if ( tmpdoc == null ) { continue; } URIMetadataNode urlEntry; try { - urlEntry = new URIMetadataNode(doc); + urlEntry = new URIMetadataNode(tmpdoc); } catch (MalformedURLException ex) { continue; } @@ -1198,73 +1198,61 @@ public final class Protocol { // put the remote documents to the local index. We must convert the solr document to a solr input document: if (event.addResultsToLocalIndex) { - /* Check document size, only if a limit is set on remote documents size allowed to be stored to local index */ - if(checkDocumentSize(doc, event.getRemoteDocStoredMaxSize() * 1024)) { - final SolrInputDocument sid = event.query.getSegment().fulltext().getDefaultConfiguration().toSolrInputDocument(doc); - - // the input document stays untouched because it contains top-level cloned objects - docs.add(sid); - // will be stored to index, and is a full solr document, can be added to firstseen - event.query.getSegment().setFirstSeenTime(urlEntry.hash(), Math.min(urlEntry.moddate().getTime(), System.currentTimeMillis())); - } else { - Network.log.info("Document size greater than " + event.getRemoteDocStoredMaxSize() + " kbytes, excludes it from being stored to local index. Url : " + urlEntry.urlstring()); - } + /* Check document size, only if a limit is set on remote documents size allowed to be stored to local index */ + if (checkDocumentSize(tmpdoc, event.getRemoteDocStoredMaxSize() * 1024)) { + final SolrInputDocument sid = event.query.getSegment().fulltext().getDefaultConfiguration().toSolrInputDocument(tmpdoc); + + // the input document stays untouched because it contains top-level cloned objects + docs.add(sid); + // will be stored to index, and is a full solr document, can be added to firstseen + event.query.getSegment().setFirstSeenTime(urlEntry.hash(), Math.min(urlEntry.moddate().getTime(), System.currentTimeMillis())); + } else { + Network.log.info("Document size greater than " + event.getRemoteDocStoredMaxSize() + " kbytes, excludes it from being stored to local index. Url : " + urlEntry.urlstring()); + } } // after this conversion we can remove the largest and not used field text_t and synonyms_sxt from the document // because that goes into a search cache and would take a lot of memory in the search cache //doc.removeFields(CollectionSchema.text_t.getSolrFieldName()); - doc.removeFields(CollectionSchema.synonyms_sxt.getSolrFieldName()); - + tmpdoc.removeFields(CollectionSchema.synonyms_sxt.getSolrFieldName()); + ResultURLs.stack( - ASCII.String(urlEntry.url().hash()), - urlEntry.url().getHost(), - event.peers.mySeed().hash.getBytes(), - UTF8.getBytes(target.hash), - EventOrigin.QUERIES); + ASCII.String(urlEntry.url().hash()), + urlEntry.url().getHost(), + event.peers.mySeed().hash.getBytes(), + UTF8.getBytes(target.hash), + EventOrigin.QUERIES); } - // add the url entry to the word indexes - container.add(urlEntry); + // add the url entry to the checked results + resultContainer.add(urlEntry); } - final int dls = docList[0].size(); final int numFound = (int) docList[0].getNumFound(); docList[0].clear(); docList[0] = null; if (localsearch) { - event.addNodes(container, facets, snippets, true, "localpeer", numFound); + event.addNodes(resultContainer, facets, snippets, true, "localpeer", numFound); event.addFinalize(); event.addExpectedRemoteReferences(-count); - Network.log.info("local search (solr): localpeer sent " + container.size() + "/" + numFound + " references"); + Network.log.info("local search (solr): localpeer sent " + resultContainer.size() + "/" + numFound + " references"); } else { if (event.addResultsToLocalIndex) { - /* - * Current thread might be interrupted by SearchEvent.cleanup() - */ - if (Thread.interrupted()) { - throw new InterruptedException("solrQuery interrupted"); - } - WriteToLocalIndexThread writeToLocalIndexThread = new WriteToLocalIndexThread(event.query.getSegment(), - docs); - writeToLocalIndexThread.start(); - try { - writeToLocalIndexThread.join(); - } catch (InterruptedException e) { - /* - * Current thread interruption might happen while waiting - * for writeToLocalIndexThread. - */ - writeToLocalIndexThread.stopWriting(); - throw new InterruptedException("solrQuery interrupted"); - } - docs.clear(); + /* + * Current thread might be interrupted by SearchEvent.cleanup() + */ + if (Thread.interrupted()) { + throw new InterruptedException("solrQuery interrupted"); + } + WriteToLocalIndexThread writeToLocalIndexThread = new WriteToLocalIndexThread(event.query.getSegment(), + docs); // will clear docs on return + writeToLocalIndexThread.start(); } - event.addNodes(container, facets, snippets, false, target.getName() + "/" + target.hash, numFound); + event.addNodes(resultContainer, facets, snippets, false, target.getName() + "/" + target.hash, numFound); event.addFinalize(); event.addExpectedRemoteReferences(-count); - Network.log.info("remote search (solr): peer " + target.getName() + " sent " + (container.size() == 0 ? 0 : container.size()) + "/" + numFound + " references"); + Network.log.info("remote search (solr): peer " + target.getName() + " sent " + (resultContainer.size()) + "/" + numFound + " references"); } - return dls; + return resultContainer.size(); } /** @@ -1285,6 +1273,7 @@ public final class Protocol { /** * Parameters must be not null. + * After writing the collection is cleared * @param segment solr segment to write * @param docs solr documents collection to put to segment */ @@ -1300,17 +1289,19 @@ public final class Protocol { this.stop.set(true); } - @Override - public void run() { - for (SolrInputDocument doc: docs) { - if(stop.get()) { - Network.log.info("Writing documents collection to Solr segment was stopped."); - return; - } - segment.putDocument(doc); + @Override + public void run() { + for (SolrInputDocument doc : docs) { + if (stop.get()) { + docs.clear(); + Network.log.info("Writing documents collection to Solr segment was stopped."); + return; + } + segment.putDocument(doc); } - } - } + docs.clear(); + } + } /** * Only when maxSize is greater than zero, check that doc size is lower. To From fb7af84d579e5bb36479472fb33f02fbaabe5acd Mon Sep 17 00:00:00 2001 From: reger Date: Mon, 27 Jun 2016 00:17:18 +0200 Subject: [PATCH 06/12] remove redundant/unused translation in ru.lng, unify Network.html header + dto. in uk.lng and comment-out some German leftovers. + upd master.lng accordingly --- locales/master.lng.xlf | 12 +++--------- locales/ru.lng | 9 ++------- locales/uk.lng | 31 +++++++++++++++---------------- 3 files changed, 20 insertions(+), 32 deletions(-) diff --git a/locales/master.lng.xlf b/locales/master.lng.xlf index 8e01c1ff8..e472cec81 100644 --- a/locales/master.lng.xlf +++ b/locales/master.lng.xlf @@ -5515,12 +5515,9 @@ - - YaCy '#[clientname]#': YaCy Search Network - - - YaCy Search Network '#[networkName]#' - + + YaCy Search Network + YaCy Network< @@ -5812,9 +5809,6 @@ >DHT-in< - - YaCy Search Network - Count of Connected Senior Peers diff --git a/locales/ru.lng b/locales/ru.lng index 05bbf3e7b..12a205810 100644 --- a/locales/ru.lng +++ b/locales/ru.lng @@ -1131,7 +1131,6 @@ List of possible crawl start URLs==Список ссылок для провер #File: CrawlProfileEditor_p.html #--------------------------- Crawl Profile Editor==Изменение профиля индексирования ->Crawl Profile Editor<==>Изменение профиля индексирования< >Crawler Steering<==>Управление индексатором< >Crawl Scheduler<==>Планировщик индексирования< >Scheduled Crawls can be modified in this table<==>Запланированное индексирование можно изменить в этой таблице< @@ -2383,8 +2382,7 @@ The target peer is alive but did not receive your message. Sorry.==Узел по #File: Network.html #--------------------------- -YaCy '#[clientname]#': YaCy Search Network==YaCy '#[clientname]#': Мониторинг сети -YaCy Search Network '#[networkName]#'==Мониторинг сети YaCy +YaCy Search Network==Мониторинг сети YaCy YaCy Network<==Сеть YaCy< The information that is presented on this page can also be retrieved as XML.==Информация, указанная на этой странице, также может быть получена как XML. Click the API icon to see the XML.==Нажмите на иконку API, чтобы увидеть XML. @@ -2876,7 +2874,6 @@ field not in local index (boost has no effect)==поля нет в локаль #File: RegexTest.html #--------------------------- -RegexTest==Тест регулярного выражения Regex Test==Тест регулярного выражения Test String==Тест строки Regular Expression==Регулярное выражение @@ -3767,7 +3764,6 @@ Parsed Sentences==Разобранные предложения Parsed Tokens/Words==Разобранные маркеры/слова Link List==Список ссылок Citation Report==Отчет цитирования ->CitationReport<==>Отчет цитирования< "Show"=="Показать" Unable to find URL Entry in DB==Невозможно найти запись ссылки в базе данных. Invalid URL==Неправильный URL-адрес @@ -3938,14 +3934,13 @@ Title==Заголовок #File: WatchWebStructure_p.html #--------------------------- -Web Structure<==Вэб-структура< +Web Structure==Вэб-структура The data that is visualized here can also be retrieved in a XML file, which lists the reference relation between the domains.==Эти данные, также могут быть получены в виде XML-файла с перекрёстными ссылками между доменами. With a GET-property 'about' you get only reference relations about the host that you give in the argument field for 'about'.==Указав параметр "GET" 'about' вы получите только перекрёстные ссылки о хосте, которые указан в поле 'about'. With a GET-property 'latest' you get a list of references that had been computed during the current run-time of YaCy, and with each next call only an update to the next list of references.==Указав параметр GET" 'latest' вы получите список ссылок вычисленных во время текущей работы YaCy, обновляющийся при каждом следующем вызове. Click the API icon to see the XML file.==Нажмите на иконку API для просмотра XML-файла. To see a list of all APIs, please visit the==Для просмотра списка всех API, пожалуйста, посетите API wiki page==страницу API Wiki -Web Structure==Вэб-структура >Host List<==>Список хостов< >#[count]# outlinks==>#[count]# внешних ссылок host<==Хост< diff --git a/locales/uk.lng b/locales/uk.lng index 15f99fdae..7ab57e9a0 100644 --- a/locales/uk.lng +++ b/locales/uk.lng @@ -427,7 +427,7 @@ You can also use your peer without opening it, but this is not recomended.==Ви #File: ConfigHeuristics_p.html #--------------------------- Heuristics Configuration==Настройки евристики -A heuristic is an 'experience-based technique that help in problem solving, learning and discovery' (wikipedia).==Heuristik 'bezeichnet die Kunst, mit begrenztem Wissen und wenig Zeit zu guten Lösungen zu kommen.' (Wikipedia). +#A heuristic is an 'experience-based technique that help in problem solving, learning and discovery' (wikipedia).==Heuristik 'bezeichnet die Kunst, mit begrenztem Wissen und wenig Zeit zu guten Lösungen zu kommen.' (Wikipedia). The search heuristics that can be switched on here are techniques that help the discovery of possible search results based on link guessing, in-search crawling and requests to other search engines.==Пошукова евристика може бути використовувати методи, які допомагають виявити можливі результати пошуку з використанням запитів по посиланнях, вбудованого сканування та запитів до інших пошукових систем. When a search heuristic is used, the resulting links are not used directly as search result but the loaded pages are indexed and stored like other content.==При використанні пошукової евристики знайдені посилання не відображаються як пошукові результати, а індексуються та зберігаються разом з іншим вмістом. This ensures that blacklists can be used and that the searched word actually appears on the page that was discovered by the heuristic.==Це гарантує, що чорні списки можуть бути використані, і що пошукові терміни з’являються дійсно на сторінках, які були знайдені за допомогою евристики. @@ -1993,8 +1993,7 @@ You cannot call this page directly. Instead, use a link on the setup the proxy befor #File: QuickCrawlLink_p.html #--------------------------- -Quick Crawl Link==Schnell Crawl Link -Quickly adding Bookmarks:==Schnell Crawl Lesezeichen: -Simply drag and drop the link shown below to your Browsers Toolbar/Link-Bar.==Ziehen Sie einfach den unten stehenden Link auf Ihre Browser Toolbar/Linkbar. -If you click on it while browsing, the currently viewed website will be inserted into the YaCy crawling queue for indexing.==Wenn Sie, während Sie surfen, auf dieses Lesezeichen klicken, wird die gerade betrachtete Seite zum YaCy Crawler-Puffer hinzugefügt, um indexiert zu werden. -Crawl with YaCy==Mit YaCy crawlen -Title:==Titel: -Link:==link: -Status:==Status: -URL successfully added to Crawler Queue==Die Url wurde erfolgreich zum Crawler-Puffer hinzugefügt. -Malformed URL==Fehler in der URL -Unable to create new crawling profile for URL:==Es ist nicht möglich für diese URL ein Crawling Profil zu erstellen: -Unable to add URL to crawler queue:==Es ist nicht möglich die URL zum Crawler-Puffer hinzuzufügen: +Quick Crawl Link==Швидке сканування посилання +#Quickly adding Bookmarks:==Schnell Crawl Lesezeichen: +#Simply drag and drop the link shown below to your Browsers Toolbar/Link-Bar.==Ziehen Sie einfach den unten stehenden Link auf Ihre Browser Toolbar/Linkbar. +#If you click on it while browsing, the currently viewed website will be inserted into the YaCy crawling queue for indexing.==Wenn Sie, während Sie surfen, auf dieses Lesezeichen klicken, wird die gerade betrachtete Seite zum YaCy Crawler-Puffer hinzugefügt, um indexiert zu werden. +#Crawl with YaCy==Mit YaCy crawlen +#Title:==Titel: +#Link:==link: +#Status:==Status: +#URL successfully added to Crawler Queue==Die Url wurde erfolgreich zum Crawler-Puffer hinzugefügt. +#Malformed URL==Fehler in der URL +#Unable to create new crawling profile for URL:==Es ist nicht möglich für diese URL ein Crawling Profil zu erstellen: +#Unable to add URL to crawler queue:==Es ist nicht möglich die URL zum Crawler-Puffer hinzuzufügen: #----------------------------- #File: Ranking_p.html @@ -2941,7 +2940,7 @@ Go back to the Settings page==Назад до ст Your system is not protected by a password==Ваша система не захищена паролем Please go to the User Administration page and set an administration password.==Будь-ласка, перейдіть на сторінку керування користувачами і виставте основний пароль. You don't have the correct access right to perform this task.==У вас немає дозволу на запуск цього додатка. -Please log in.==Bitte melden Sie sich an. +#Please log in.==Bitte melden Sie sich an. You can now go back to the Settings page if you want to make more changes.==Якщо хочете зробити інші зміни, можна перейти назад на сторінку налаштувань. See you soon!==До зустрічі! Just a moment, please!==Зачекайте трохи, будь ласка! From b71a60c04b022868e628cbfba02abc5ddfccd3c5 Mon Sep 17 00:00:00 2001 From: reger Date: Mon, 27 Jun 2016 03:12:39 +0200 Subject: [PATCH 07/12] fix NPE in CrawlMonitorRemoteStart servlet due to missing startURL + add a startURL attribute while generating news record for above (in Crawler_p) --- htroot/CrawlMonitorRemoteStart.java | 8 ++++---- htroot/Crawler_p.java | 3 +++ 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/htroot/CrawlMonitorRemoteStart.java b/htroot/CrawlMonitorRemoteStart.java index 8accaf87f..906de0ea5 100644 --- a/htroot/CrawlMonitorRemoteStart.java +++ b/htroot/CrawlMonitorRemoteStart.java @@ -61,8 +61,8 @@ public class CrawlMonitorRemoteStart { prop.put("otherCrawlStartInProgress_" + showedCrawl + "_dark", dark ? "1" : "0"); prop.put("otherCrawlStartInProgress_" + showedCrawl + "_cre", record.created().toString()); prop.put("otherCrawlStartInProgress_" + showedCrawl + "_peername", peername); - prop.put("otherCrawlStartInProgress_" + showedCrawl + "_startURL", record.attributes().get("startURL").toString()); - prop.put("otherCrawlStartInProgress_" + showedCrawl + "_intention", record.attributes().get("intention").toString()); + prop.put("otherCrawlStartInProgress_" + showedCrawl + "_startURL", record.attributes().get("startURL")); + prop.put("otherCrawlStartInProgress_" + showedCrawl + "_intention", record.attributes().get("intention")); prop.put("otherCrawlStartInProgress_" + showedCrawl + "_generalDepth", record.attributes().get("generalDepth")); prop.put("otherCrawlStartInProgress_" + showedCrawl + "_crawlingQ", ("true".equals(record.attributes().get("crawlingQ"))) ? "1" : "0"); showedCrawl++; @@ -88,8 +88,8 @@ public class CrawlMonitorRemoteStart { prop.put("otherCrawlStartFinished_" + showedCrawl + "_dark", dark ? "1" : "0"); prop.put("otherCrawlStartFinished_" + showedCrawl + "_cre", record.created().toString()); prop.putHTML("otherCrawlStartFinished_" + showedCrawl + "_peername", peername); - prop.putHTML("otherCrawlStartFinished_" + showedCrawl + "_startURL", record.attributes().get("startURL").toString()); - prop.put("otherCrawlStartFinished_" + showedCrawl + "_intention", record.attributes().get("intention").toString()); + prop.putHTML("otherCrawlStartFinished_" + showedCrawl + "_startURL", record.attributes().get("startURL")); + prop.put("otherCrawlStartFinished_" + showedCrawl + "_intention", record.attributes().get("intention")); prop.put("otherCrawlStartFinished_" + showedCrawl + "_generalDepth", record.attributes().get("generalDepth")); prop.put("otherCrawlStartFinished_" + showedCrawl + "_crawlingQ", ("true".equals(record.attributes().get("crawlingQ"))) ? "1" : "0"); showedCrawl++; diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java index 4f440a0ac..c964596e8 100644 --- a/htroot/Crawler_p.java +++ b/htroot/Crawler_p.java @@ -594,6 +594,9 @@ public class Crawler_p { m.remove("generalFilter"); m.remove("specificFilter"); m.put("intention", post.get("intention", "").replace(',', '/')); + if (successurls.size() > 0) { // just include at least one of the startURL's in case of multiple for the news service + m.put("startURL", successurls.iterator().next().toNormalform(true)); + } sb.peers.newsPool.publishMyNews(sb.peers.mySeed(), NewsPool.CATEGORY_CRAWL_START, m); } } else { From 5523998fdf2a33d700e0e8b6343263360a7bbeb4 Mon Sep 17 00:00:00 2001 From: sixcooler Date: Tue, 28 Jun 2016 20:58:58 +0200 Subject: [PATCH 08/12] rise limit of reversion to >9999 --- libbuild/GitRevMavenTask/src/GitRevMavenTask.java | 2 +- libbuild/GitRevTask/GitRevTask.java | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/libbuild/GitRevMavenTask/src/GitRevMavenTask.java b/libbuild/GitRevMavenTask/src/GitRevMavenTask.java index ebee6b1db..0775905e4 100644 --- a/libbuild/GitRevMavenTask/src/GitRevMavenTask.java +++ b/libbuild/GitRevMavenTask/src/GitRevMavenTask.java @@ -87,7 +87,7 @@ public class GitRevMavenTask extends AbstractMojo { break; } } - if (lastTag != null || distance++ > 999) { + if (lastTag != null || distance++ > 90999) { break; } } diff --git a/libbuild/GitRevTask/GitRevTask.java b/libbuild/GitRevTask/GitRevTask.java index 959e0b50f..8834a87bf 100644 --- a/libbuild/GitRevTask/GitRevTask.java +++ b/libbuild/GitRevTask/GitRevTask.java @@ -79,7 +79,7 @@ public class GitRevTask extends org.apache.tools.ant.Task { break; } } - if (lastTag != null || distance++ > 999) break; + if (lastTag != null || distance++ > 90999) break; } walk.dispose(); if (lastTag == null) { From 5aaa057c65da7814dd3dbc301eecdd3099cee413 Mon Sep 17 00:00:00 2001 From: reger Date: Tue, 28 Jun 2016 23:44:28 +0200 Subject: [PATCH 09/12] ignore empty input lines in FileUtils.getListArray() to poka joke blacklist read. equalizes behavior with getListString() improves: case were blacklist file contained a undesired empty line, not fixed by blacklist-cleaner. --- source/net/yacy/kelondro/util/FileUtils.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/source/net/yacy/kelondro/util/FileUtils.java b/source/net/yacy/kelondro/util/FileUtils.java index e3f748c65..c9e8564fb 100644 --- a/source/net/yacy/kelondro/util/FileUtils.java +++ b/source/net/yacy/kelondro/util/FileUtils.java @@ -517,6 +517,7 @@ public final class FileUtils { /** * Read lines of a file into an ArrayList. + * Empty lines in the file are ignored. * * @param listFile the file * @return the resulting array as an ArrayList @@ -529,7 +530,7 @@ public final class FileUtils { br = new BufferedReader(new InputStreamReader(new FileInputStream(listFile), StandardCharsets.UTF_8)); while ( (line = br.readLine()) != null ) { - list.add(line); + if (!line.isEmpty()) list.add(line); } br.close(); } catch (final IOException e ) { @@ -576,6 +577,7 @@ public final class FileUtils { /** * Read lines of a text file into a String, optionally ignoring comments. + * Empty lines are always ignored. * * @param listFile the File to read from. * @param withcomments If false ignore lines starting with '#'. From 900ec17d1ab5c2021ed08d21917aede59a7c4e55 Mon Sep 17 00:00:00 2001 From: reger Date: Wed, 29 Jun 2016 23:27:59 +0200 Subject: [PATCH 10/12] add de hint translation for CrawlStartScanner_p rem missing translation line in other lng --- locales/cn.lng | 3 +-- locales/de.lng | 5 ++--- locales/hi.lng | 2 +- locales/master.lng.xlf | 4 ++-- locales/ru.lng | 2 +- 5 files changed, 7 insertions(+), 9 deletions(-) diff --git a/locales/cn.lng b/locales/cn.lng index 98b01c6fe..a200bd6e2 100644 --- a/locales/cn.lng +++ b/locales/cn.lng @@ -1069,7 +1069,7 @@ Network Scanner==网络扫描器 YaCy can scan a network segment for available http, ftp and smb server.==YaCy可扫描http, ftp 和smb服务器. You must first select a IP range and then, after this range is scanned,==须先指定IP范围, 再进行扫描, it is possible to select servers that had been found for a full-site crawl.==才有可能选择主机并将其作为全站crawl的服务器. -No servers had been detected in the given IP range #[iprange]#. +#No servers had been detected in the given IP range== Please enter a different IP range for another scan.==未检测到可用服务器, 请重新指定IP范围. Please wait...==请稍候... >Scan the network<==>扫描网络< @@ -2952,7 +2952,6 @@ New Password is empty.==新密码为空. #File: ViewFile.html #--------------------------- -YaCy '#[clientname]#': View URL Content==YaCy '#[clientname]#': 查看文件内容 View URL Content==查看链接内容 >Get URL Viewer<==>获取链接浏览器< >URL Metadata<==>链接元数据< diff --git a/locales/de.lng b/locales/de.lng index f03046679..8de7210a6 100644 --- a/locales/de.lng +++ b/locales/de.lng @@ -1334,7 +1334,7 @@ Network Scanner==Netzwerk Scanner YaCy can scan a network segment for available http, ftp and smb server.==YaCy kann ein Netzwerksegment auf verfügbare HTTP, FTP und SMB Server hin absuchen. You must first select a IP range and then, after this range is scanned,==Sie müssen zuerst einen IP Bereich festlegen und dann, nachdem dieser Bereich gescannt wurde, it is possible to select servers that had been found for a full-site crawl.==ist es möglich einen gefunden Server für eine volle Seiten Suche crawlen zu lassen. -No servers had been detected in the given IP range #[iprange]#. +No servers had been detected in the given IP range==Es wurde kein Server im angegebenen IP Bereich gefunden Please enter a different IP range for another scan.==Bitte geben Sie einen anderen IP Bereich ein für einen weiteren Scan. Please wait...==Bitte warten... >Scan the network<==>Das Netzwerk Scannen< @@ -3147,7 +3147,7 @@ For community support, please visit our==Für Unterstützung aus der Community, #File: Status_p.inc #--------------------------- -#System Status==System Status +System Status==Systemstatus Unknown==unbekannt YaCy version:==YaCy Version: Uptime:==Online seit: @@ -3493,7 +3493,6 @@ New Password is empty.==Das neue Passwort ist leer. #File: ViewFile.html #--------------------------- -YaCy '#[clientname]#': View URL Content==YaCy '#[clientname]#': Zeige URL Inhalte View URL Content==Zeige URL Inhalte >Get URL Viewer<==>URL Betrachter< "Show Metadata"=="Metadaten anzeigen" diff --git a/locales/hi.lng b/locales/hi.lng index 5c69825ce..3f2f7774e 100644 --- a/locales/hi.lng +++ b/locales/hi.lng @@ -1077,7 +1077,7 @@ Network Scanner==नेटवर्क स्कैनर YaCy can scan a network segment for available http, ftp and smb server.==YaCy उपलब्ध HTTP, FTP और किसी सर्वर के लिए एक नेटवर्क खंड स्कैन कर सकते हैं. You must first select a IP range and then, after this range is scanned,==इस श्रृंखला स्कैन के बाद आप पहली बार, तो एक आईपी श्रेणी का चयन करना चाहिए it is possible to select servers that had been found for a full-site crawl.==यह एक पूरी साइट क्रॉल के लिए पाया गया था कि सर्वर का चयन करने के लिए संभव है. -No servers had been detected in the given IP range #[iprange]#. +#No servers had been detected in the given IP range== Please enter a different IP range for another scan.==एक और स्कैन के लिए एक अलग आईपी रेंज दर्ज करें. Please wait...==कृपया प्रतीक्षा करें ... >Scan the network<==>नेटवर्क स्कैन< diff --git a/locales/master.lng.xlf b/locales/master.lng.xlf index e472cec81..294c984de 100644 --- a/locales/master.lng.xlf +++ b/locales/master.lng.xlf @@ -3294,8 +3294,8 @@ it is possible to select servers that had been found for a full-site crawl. - - No servers had been detected in the given IP range #[iprange]#. + + No servers had been detected in the given IP range Please enter a different IP range for another scan. diff --git a/locales/ru.lng b/locales/ru.lng index 12a205810..48189adf4 100644 --- a/locales/ru.lng +++ b/locales/ru.lng @@ -1460,7 +1460,7 @@ Network Scanner==Сканер сети YaCy can scan a network segment for available http, ftp and smb server.==YaCy может сканировать такие сегменты сети как http-, ftp- и smb-серверы . You must first select a IP range and then, after this range is scanned,==Сначала вы должны выбрать диапазон IP-адресов, а затем диапазон сканирования. it is possible to select servers that had been found for a full-site crawl.==После этого можно выбрать серверы для полного индексирования сайта. -No servers had been detected in the given IP range #[iprange]#.==Серверы не обнаружены в заданном диапазоне IP-адресов. +No servers had been detected in the given IP range==Серверы не обнаружены в заданном диапазоне IP-адресов Please enter a different IP range for another scan.==Пожалуйста, введите другой диапазон IP-адресов, для повторного сканирования. Please wait...==Пожалуйста, подождите... >Scan the network<==>Сканирование сети< From 7bac7567208a211659ee27e4c0d88b15a9885cc3 Mon Sep 17 00:00:00 2001 From: reger Date: Fri, 1 Jul 2016 00:02:10 +0200 Subject: [PATCH 11/12] prevent dealing with -UNRESOLVED_PATTERN- eventID parameter in html includes on first landing on search page --- htroot/yacysearch.java | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java index f00bba375..fd8e604e1 100644 --- a/htroot/yacysearch.java +++ b/htroot/yacysearch.java @@ -60,9 +60,7 @@ import net.yacy.data.BookmarksDB.Bookmark; import net.yacy.data.DidYouMean; import net.yacy.data.UserDB; import net.yacy.data.ymark.YMarkTables; -import net.yacy.document.Document; import net.yacy.document.LibraryProvider; -import net.yacy.document.Parser; import net.yacy.document.Tokenizer; import net.yacy.kelondro.data.meta.URIMetadataNode; import net.yacy.kelondro.util.Bitfield; @@ -73,7 +71,6 @@ import net.yacy.kelondro.util.SetTools; import net.yacy.peers.EventChannel; import net.yacy.peers.NewsPool; import net.yacy.peers.graphics.ProfilingGraph; -import net.yacy.repository.Blacklist.BlacklistType; import net.yacy.search.EventTracker; import net.yacy.search.Switchboard; import net.yacy.search.SwitchboardConstants; @@ -195,6 +192,7 @@ public class yacysearch { prop.put("geoinfo", "0"); prop.put("rss_queryenc", ""); prop.put("meanCount", 5); + prop.put("eventID",""); // mandatory parameter for yacysearchtrailer/yacysearchitem includes return prop; } From 8d58a480294a41ac5ef7f907dcada13679a8b317 Mon Sep 17 00:00:00 2001 From: reger Date: Sat, 2 Jul 2016 20:33:23 +0200 Subject: [PATCH 12/12] remove wrong log line in CrawlSwitchboard + don't allow CrawlSwitchboard to exit application making network param unused --- source/net/yacy/crawler/CrawlSwitchboard.java | 12 +++--------- source/net/yacy/search/Switchboard.java | 4 ++-- 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/source/net/yacy/crawler/CrawlSwitchboard.java b/source/net/yacy/crawler/CrawlSwitchboard.java index f6e5c1619..ac2a6244c 100644 --- a/source/net/yacy/crawler/CrawlSwitchboard.java +++ b/source/net/yacy/crawler/CrawlSwitchboard.java @@ -59,8 +59,8 @@ import net.yacy.search.SwitchboardConstants; public final class CrawlSwitchboard { - public static final String CRAWL_PROFILE_AUTOCRAWL_DEEP = "autocrawlDeep"; - public static final String CRAWL_PROFILE_AUTOCRAWL_SHALLOW = "autocrawlShallow"; + public static final String CRAWL_PROFILE_AUTOCRAWL_DEEP = "autocrawlDeep"; + public static final String CRAWL_PROFILE_AUTOCRAWL_SHALLOW = "autocrawlShallow"; public static final String CRAWL_PROFILE_PROXY = "proxy"; public static final String CRAWL_PROFILE_REMOTE = "remote"; public static final String CRAWL_PROFILE_SNIPPET_LOCAL_TEXT = "snippetLocalText"; @@ -107,18 +107,12 @@ public final class CrawlSwitchboard { private final File queuesRoot; private Switchboard switchboard; - public CrawlSwitchboard(final String networkName, Switchboard switchboard) { + public CrawlSwitchboard(Switchboard switchboard) { this.switchboard = switchboard; this.log = this.switchboard.log; this.queuesRoot = this.switchboard.queuesRoot; this.defaultPushProfiles = new ConcurrentHashMap<>(); - this.log.info("Initializing Word Index for the network '" + networkName + "'."); - - if ( networkName == null || networkName.isEmpty() ) { - log.severe("no network name given - shutting down"); - System.exit(0); - } this.profilesActiveCrawlsCache = Collections.synchronizedMap(new TreeMap(Base64Order.enhancedCoder)); this.profilesActiveCrawlsCounter = new ConcurrentHashMap(); diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index 174ba7d57..290a32115 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -607,7 +607,7 @@ public final class Switchboard extends serverSwitch { } // create a crawler - this.crawler = new CrawlSwitchboard(networkName, this); + this.crawler = new CrawlSwitchboard(this); // start yacy core this.log.config("Starting YaCy Protocol Core"); @@ -1398,7 +1398,7 @@ public final class Switchboard extends serverSwitch { // create a crawler this.crawlQueues.relocate(this.queuesRoot); // cannot be closed because the busy threads are working with that object - this.crawler = new CrawlSwitchboard(networkName, this); + this.crawler = new CrawlSwitchboard(this); // init a DHT transmission dispatcher this.dhtDispatcher =