From 3959d43a5c3412704f21b4c69aa723e71e2134e8 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Tue, 3 Aug 2021 16:57:24 +0200 Subject: [PATCH] fixed doku link --- htroot/CrawlStartExpert.html | 4 +- htroot/IndexFederated_p.html | 2 +- locales/master.lng.xlf | 2 +- locales/ru.lng | 2 +- startYACY.sh | 268 +++++++++--------- .../document/parser/GenericXMLParserTest.java | 100 +++---- 6 files changed, 191 insertions(+), 187 deletions(-) diff --git a/htroot/CrawlStartExpert.html b/htroot/CrawlStartExpert.html index e3cdb0d25..4093df5a7 100644 --- a/htroot/CrawlStartExpert.html +++ b/htroot/CrawlStartExpert.html @@ -217,7 +217,7 @@ #%env/templates/submenuIndexCreate.template%#
-API +API Click on this API button to see a documentation of the POST request parameter for crawl starts.
@@ -228,7 +228,7 @@ You can define URLs as start points for Web page crawling and start crawling here. "Crawling" means that YaCy will download the given website, extract all links in it and then download the content behind these links. This is repeated as long as specified under "Crawling Depth". - A crawl can also be started using wget and the post arguments for this web page. + A crawl can also be started using wget and the post arguments for this web page.

diff --git a/htroot/IndexFederated_p.html b/htroot/IndexFederated_p.html index dc7902ce4..3edd5ceb7 100644 --- a/htroot/IndexFederated_p.html +++ b/htroot/IndexFederated_p.html @@ -34,7 +34,7 @@ If you switch off this index, a remote Solr must be activated.
Use remote Solr server(s) 
-
It's easy to attach an external Solr to YaCy. +
It's easy to attach an external Solr to YaCy. This external Solr can be used instead the internal Solr. It can also be used additionally to the internal Solr, then both Solr indexes are mirrored.
diff --git a/locales/master.lng.xlf b/locales/master.lng.xlf index 7c77feffb..83d5b1317 100644 --- a/locales/master.lng.xlf +++ b/locales/master.lng.xlf @@ -4935,7 +4935,7 @@ Index Size - It's easy to <a href="http://www.yacy-websearch.net/wiki/index.php/Dev:Solr" target="_blank">attach an external Solr to YaCy</a>. + It's easy to <a href="https://wiki.yacy.net/index.php/Dev:Solr" target="_blank">attach an external Solr to YaCy</a>. This external Solr can be used instead the internal Solr. It can also be used additionally to the internal Solr, then both Solr indexes are mirrored. diff --git a/locales/ru.lng b/locales/ru.lng index b28f21170..e90ef4f18 100644 --- a/locales/ru.lng +++ b/locales/ru.lng @@ -578,7 +578,7 @@ Use remote Solr server(s)==Использовать удалённую базу Solr Hosts==Хосты Solr Solr Host Administration Interface==Интерфейс управления Solr Index Size==Документов в индексе -It's easy to attach an external Solr to YaCy.==Присоединить внешнюю базу Solr просто. +It's easy to attach an external Solr to YaCy.==Присоединить внешнюю базу Solr просто. This external Solr can be used instead the internal Solr. It can also be used additionally to the internal Solr, then both Solr indexes are mirrored.==Внешняя база данных Solr будет использоваться вместо встроенной. Вы также можете использовать дополнительно встроенную базу, но тогда индексы будут сохраняться в обе базы. Solr URL(s)==Ссылки на базу Solr You can set one or more Solr targets here which are accessed as a shard. For several targets, list them using a ',' (comma) as separator.==Вы можете установить одну или более баз Solr, которые будут доступны распределённо. Адреса нескольких баз указывайте через запятую. diff --git a/startYACY.sh b/startYACY.sh index 0280b01ee..7e5d0365a 100755 --- a/startYACY.sh +++ b/startYACY.sh @@ -22,26 +22,26 @@ fi if [ ! -x "$JAVA" ] then - echo "The java command is not executable." - echo "Either you have not installed java or it is not in your PATH" - #Cron supports setting the path in - #echo "Has this script been invoked by CRON?" - #echo "if so, please set PATH in the crontab, or set the correct path in the variable in this script." - exit 1 + echo "The java command is not executable." + echo "Either you have not installed java or it is not in your PATH" + #Cron supports setting the path in + #echo "Has this script been invoked by CRON?" + #echo "if so, please set PATH in the crontab, or set the correct path in the variable in this script." + exit 1 fi usage() { - cat - <.log) **" - echo "** STOP YaCy: execute stopYACY.sh and wait some seconds **" + echo "****************** YaCy Web Crawler/Indexer & Search Engine *******************" + echo "**** (C) by Michael Peter Christen, usage granted under the GPL Version 2 ****" + echo "**** USE AT YOUR OWN RISK! Project home and releases: http://yacy.net/ ****" + echo "** LOG of YaCy: DATA/LOG/yacy00.log (and yacy.log) **" + echo "** STOP YaCy: execute stopYACY.sh and wait some seconds **" echo "** GET HELP for YaCy: join our community at https://searchlab.eu **" - echo "*******************************************************************************" - if [ $DEBUG -eq 1 ] #debug - then - # with exec the java process become the main process and will receive signals such as SIGTERM - exec $cmdline - elif [ $FOREGROUND -eq 1 ];then # foreground process without remote JMX monitoring - # with exec the java process become the main process and will receive signals such as SIGTERM - exec $cmdline - else - echo " >> YaCy started as daemon process. Administration at http://localhost:$PORT << " - eval $cmdline - if [ "$TAILLOG" -eq "1" -a ! "$DEBUG" -eq "1" ];then - sleep 1 - tail -f DATA/LOG/yacy00.log - fi - fi + echo "*******************************************************************************" + if [ $DEBUG -eq 1 ] #debug + then + # with exec the java process become the main process and will receive signals such as SIGTERM + exec $cmdline + elif [ $FOREGROUND -eq 1 ];then # foreground process without remote JMX monitoring + # with exec the java process become the main process and will receive signals such as SIGTERM + exec $cmdline + else + echo " >> YaCy started as daemon process. Administration at http://localhost:$PORT << " + eval $cmdline + if [ "$TAILLOG" -eq "1" -a ! "$DEBUG" -eq "1" ];then + sleep 1 + tail -f DATA/LOG/yacy00.log + fi + fi fi diff --git a/test/java/net/yacy/document/parser/GenericXMLParserTest.java b/test/java/net/yacy/document/parser/GenericXMLParserTest.java index d18ed22bc..3d2ae67a3 100644 --- a/test/java/net/yacy/document/parser/GenericXMLParserTest.java +++ b/test/java/net/yacy/document/parser/GenericXMLParserTest.java @@ -23,9 +23,9 @@ package net.yacy.document.parser; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertTrue; -import static org.junit.Assert.assertFalse; import java.io.ByteArrayInputStream; import java.io.File; @@ -45,7 +45,7 @@ import net.yacy.document.VocabularyScraper; /** * Unit tests for the {@link GenericXMLParser} class - * + * * @author luccioman * */ @@ -58,13 +58,13 @@ public class GenericXMLParserTest { @Before public void setUp() { - this.parser = new GenericXMLParser(); + parser = new GenericXMLParser(); } /** * Unit test for the GenericXMLParser.parse() function with some small XML * test files. - * + * * @throws Exception * when an unexpected error occurred */ @@ -77,7 +77,7 @@ public class GenericXMLParserTest { FileInputStream inStream = new FileInputStream(new File(folder, fileName)); DigestURL location = new DigestURL("http://localhost/" + fileName); try { - Document[] documents = this.parser.parse(location, "text/xml", null, new VocabularyScraper(), 0, + Document[] documents = parser.parse(location, "text/xml", null, new VocabularyScraper(), 0, inStream); assertNotNull("Parser result must not be null for file " + fileName, documents); assertNotNull("Parsed text must not be empty for file " + fileName, documents[0].getTextString()); @@ -90,7 +90,7 @@ public class GenericXMLParserTest { } /** - * + * * @param parser * generic xml parser instance. Must not be null. * @param encodedXML @@ -123,10 +123,10 @@ public class GenericXMLParserTest { /** * Test UTF-8 charset detection - * + * * @see RFC 7303 "UTF-8 Charset" example * (https://tools.ietf.org/html/rfc7303#section-8.1) - * + * * @throws Exception * when an unexpected error occurred */ @@ -138,7 +138,7 @@ public class GenericXMLParserTest { */ byte[] encodedXML = ("" + UMLAUT_TEXT_TAG) .getBytes(StandardCharsets.UTF_8); - testCharsetDetection(this.parser, encodedXML, "application/xml; charset=utf-8", StandardCharsets.UTF_8.name(), + testCharsetDetection(parser, encodedXML, "application/xml; charset=utf-8", StandardCharsets.UTF_8.name(), "Maßkrügen"); /* @@ -146,18 +146,18 @@ public class GenericXMLParserTest { * declaration */ encodedXML = ("" + UMLAUT_TEXT_TAG).getBytes(StandardCharsets.UTF_8); - testCharsetDetection(this.parser, encodedXML, "application/xml; charset=utf-8", StandardCharsets.UTF_8.name(), + testCharsetDetection(parser, encodedXML, "application/xml; charset=utf-8", StandardCharsets.UTF_8.name(), "Maßkrügen"); } /** * Test UTF-16 charset detection - * + * * @see RFC 7303 "UTF-16 Charset" and * "Omitted Charset and 16-Bit MIME Entity" examples * (https://tools.ietf.org/html/rfc7303#section-8.2 and * https://tools.ietf.org/html/rfc7303#section-8.4) - * + * * @throws Exception * when an unexpected error occurred */ @@ -169,7 +169,7 @@ public class GenericXMLParserTest { */ byte[] encodedXML = ("" + UMLAUT_TEXT_TAG) .getBytes(StandardCharsets.UTF_16); - testCharsetDetection(this.parser, encodedXML, "application/xml; charset=utf-16", StandardCharsets.UTF_16.name(), + testCharsetDetection(parser, encodedXML, "application/xml; charset=utf-16", StandardCharsets.UTF_16.name(), "Maßkrügen"); /* @@ -177,7 +177,7 @@ public class GenericXMLParserTest { * XML declaration having only BOM (Byte Order Mark) */ encodedXML = ("" + UMLAUT_TEXT_TAG).getBytes(StandardCharsets.UTF_16); - testCharsetDetection(this.parser, encodedXML, "application/xml; charset=utf-16", + testCharsetDetection(parser, encodedXML, "application/xml; charset=utf-16", StandardCharsets.UTF_16BE.name(), "Maßkrügen"); /* @@ -186,22 +186,22 @@ public class GenericXMLParserTest { */ encodedXML = ("" + UMLAUT_TEXT_TAG) .getBytes(StandardCharsets.UTF_16); - testCharsetDetection(this.parser, encodedXML, "application/xml", StandardCharsets.UTF_16.name(), "Maßkrügen"); + testCharsetDetection(parser, encodedXML, "application/xml", StandardCharsets.UTF_16.name(), "Maßkrügen"); /* * Charset is omitted in both Content-Type HTTP header and XML * declaration with BOM (Byte Order Mark) */ encodedXML = ("" + UMLAUT_TEXT_TAG).getBytes(StandardCharsets.UTF_16); - testCharsetDetection(this.parser, encodedXML, "application/xml", StandardCharsets.UTF_16BE.name(), "Maßkrügen"); + testCharsetDetection(parser, encodedXML, "application/xml", StandardCharsets.UTF_16BE.name(), "Maßkrügen"); } /** * Test ISO-8859-1 charset detection - * + * * @see RFC 7303 "Omitted Charset and 8-Bit MIME Entity" example * (https://tools.ietf.org/html/rfc7303#section-8.3) - * + * * @throws Exception * when an unexpected error occurred */ @@ -213,7 +213,7 @@ public class GenericXMLParserTest { */ byte[] encodedXML = ("" + UMLAUT_TEXT_TAG) .getBytes(StandardCharsets.ISO_8859_1); - testCharsetDetection(this.parser, encodedXML, "application/xml", StandardCharsets.ISO_8859_1.name(), + testCharsetDetection(parser, encodedXML, "application/xml", StandardCharsets.ISO_8859_1.name(), "Maßkrügen"); } @@ -221,10 +221,10 @@ public class GenericXMLParserTest { * Test charset detection when the character encoding is omitted in * Content-Type header, and content has a XML declaration with no encoding * declaration - * + * * @see RFC 7303 "Omitted Charset, No Internal Encoding Declaration" example * (https://tools.ietf.org/html/rfc7303#section-8.5) - * + * * @throws Exception * when an unexpected error occurred */ @@ -242,15 +242,15 @@ public class GenericXMLParserTest { encodedXML = ("" + "In München steht ein Hofbräuhaus, dort gibt es Bier in Maßkrügen") .getBytes(StandardCharsets.US_ASCII); - testCharsetDetection(this.parser, encodedXML, "application/xml", StandardCharsets.UTF_8.name(), "Maßkrügen"); + testCharsetDetection(parser, encodedXML, "application/xml", StandardCharsets.UTF_8.name(), "Maßkrügen"); } /** * Test UTF-16BE charset detection - * + * * @see RFC 7303 "UTF-16BE Charset" example * (https://tools.ietf.org/html/rfc7303#section-8.6) - * + * * @throws Exception * when an unexpected error occurred */ @@ -262,13 +262,13 @@ public class GenericXMLParserTest { */ byte[] encodedXML = ("" + UMLAUT_TEXT_TAG) .getBytes(StandardCharsets.UTF_16BE); - testCharsetDetection(this.parser, encodedXML, "application/xml; charset=utf-16be", + testCharsetDetection(parser, encodedXML, "application/xml; charset=utf-16be", StandardCharsets.UTF_16BE.name(), "Maßkrügen"); } /** * Test absolute URLs detection in XML elements attributes. - * + * * @throws Exception * when an unexpected error occurred */ @@ -288,7 +288,7 @@ public class GenericXMLParserTest { String charsetFromHttpHeader = HeaderFramework.getCharacterEncoding(contentTypeHeader); DigestURL location = new DigestURL("http://localhost/testfile.xml"); try { - Document[] documents = this.parser.parse(location, contentTypeHeader, charsetFromHttpHeader, + Document[] documents = parser.parse(location, contentTypeHeader, charsetFromHttpHeader, new VocabularyScraper(), 0, inStream); assertEquals(1, documents.length); Collection detectedAnchors = documents[0].getAnchors(); @@ -304,7 +304,7 @@ public class GenericXMLParserTest { /** * Test absolute URLs detection in XML elements text. - * + * * @throws Exception * when an unexpected error occurred */ @@ -324,7 +324,7 @@ public class GenericXMLParserTest { String charsetFromHttpHeader = HeaderFramework.getCharacterEncoding(contentTypeHeader); DigestURL location = new DigestURL("http://localhost/testfile.xml"); try { - Document[] documents = this.parser.parse(location, contentTypeHeader, charsetFromHttpHeader, + Document[] documents = parser.parse(location, contentTypeHeader, charsetFromHttpHeader, new VocabularyScraper(), 0, inStream); assertEquals(1, documents.length); Collection detectedAnchors = documents[0].getAnchors(); @@ -337,7 +337,7 @@ public class GenericXMLParserTest { inStream.close(); } } - + /** * Test parsing well-formed XML fragment (no XML declaration, no DTD or schema) * @throws Exception when an unexpected error occurred @@ -351,18 +351,18 @@ public class GenericXMLParserTest { String charsetFromHttpHeader = HeaderFramework.getCharacterEncoding(contentTypeHeader); DigestURL location = new DigestURL("http://localhost/testfile.xml"); try { - Document[] documents = this.parser.parse(location, contentTypeHeader, charsetFromHttpHeader, + Document[] documents = parser.parse(location, contentTypeHeader, charsetFromHttpHeader, new VocabularyScraper(), 0, inStream); assertEquals(1, documents.length); assertEquals("Node content1 Node content2", documents[0].getTextString()); } finally { inStream.close(); - } + } } - + /** * Test URLs detection when applying limits. - * + * * @throws Exception * when an unexpected error occurred */ @@ -376,7 +376,7 @@ public class GenericXMLParserTest { + "Home page : http://yacy.net - International Forum : " + "https://searchlab.eu " + "and this is a mention to a relative URL : /document.html

" - + "

Here are YaCybug tracker and Wiki." + + "

Here are YaCybug tracker and Wiki." + "And this is a relative link to another sub document

" + "" + ""; @@ -386,12 +386,12 @@ public class GenericXMLParserTest { String charsetFromHttpHeader = HeaderFramework.getCharacterEncoding(contentTypeHeader); DigestURL location = new DigestURL("http://localhost/testfile.xml"); try { - Document[] documents = this.parser.parseWithLimits(location, contentTypeHeader, charsetFromHttpHeader, new VocabularyScraper(), 0, inStream, Integer.MAX_VALUE, Long.MAX_VALUE); + Document[] documents = parser.parseWithLimits(location, contentTypeHeader, charsetFromHttpHeader, new VocabularyScraper(), 0, inStream, Integer.MAX_VALUE, Long.MAX_VALUE); assertEquals(1, documents.length); assertFalse(documents[0].isPartiallyParsed()); - + assertTrue(documents[0].getTextString().contains("And this is a relative link")); - + Collection detectedAnchors = documents[0].getAnchors(); assertNotNull(detectedAnchors); assertEquals(5, detectedAnchors.size()); @@ -399,22 +399,22 @@ public class GenericXMLParserTest { assertTrue(detectedAnchors.contains(new AnchorURL("http://yacy.net"))); assertTrue(detectedAnchors.contains(new AnchorURL("https://searchlab.eu"))); assertTrue(detectedAnchors.contains(new AnchorURL("http://mantis.tokeek.de"))); - assertTrue(detectedAnchors.contains(new AnchorURL("http://www.yacy-websearch.net/wiki/"))); + assertTrue(detectedAnchors.contains(new AnchorURL("https://wiki.yacy.net/index.php/"))); } finally { inStream.close(); } - + /* Links limit exceeded */ inStream = new ByteArrayInputStream(xhtml.getBytes(StandardCharsets.UTF_8.name())); try { - Document[] documents = this.parser.parseWithLimits(location, contentTypeHeader, charsetFromHttpHeader, + Document[] documents = parser.parseWithLimits(location, contentTypeHeader, charsetFromHttpHeader, new VocabularyScraper(), 0, inStream, 2, Long.MAX_VALUE); assertEquals(1, documents.length); assertTrue(documents[0].isPartiallyParsed()); - + assertTrue(documents[0].getTextString().contains("Home page")); assertFalse(documents[0].getTextString().contains("And this is a relative link")); - + Collection detectedAnchors = documents[0].getAnchors(); assertNotNull(detectedAnchors); assertEquals(2, detectedAnchors.size()); @@ -423,7 +423,7 @@ public class GenericXMLParserTest { } finally { inStream.close(); } - + /* Bytes limit exceeded */ StringBuilder xhtmlBuilder = new StringBuilder("") .append("") @@ -436,25 +436,25 @@ public class GenericXMLParserTest { .append("Home page : http://yacy.net - International Forum : ") .append("https://searchlab.eu ") .append("and this is a mention to a relative URL : /document.html

"); - + /* Add some filler text to reach a total size beyond SAX parser internal input stream buffers */ while(xhtmlBuilder.length() < 1024 * 20) { xhtmlBuilder.append("

Some text to parse

"); } - + int firstBytes = xhtmlBuilder.toString().getBytes(StandardCharsets.UTF_8.name()).length; - xhtmlBuilder.append("

Here are YaCybug tracker and Wiki.") + xhtmlBuilder.append("

Here are YaCybug tracker and Wiki.") .append("And this is a relative link to another sub document

") .append(""); inStream = new ByteArrayInputStream(xhtmlBuilder.toString().getBytes(StandardCharsets.UTF_8.name())); try { - Document[] documents = this.parser.parseWithLimits(location, contentTypeHeader, charsetFromHttpHeader, new VocabularyScraper(), 0, inStream, Integer.MAX_VALUE, firstBytes); + Document[] documents = parser.parseWithLimits(location, contentTypeHeader, charsetFromHttpHeader, new VocabularyScraper(), 0, inStream, Integer.MAX_VALUE, firstBytes); assertEquals(1, documents.length); assertTrue(documents[0].isPartiallyParsed()); - + assertTrue(documents[0].getTextString().contains("and this is a mention to a relative URL")); assertFalse(documents[0].getTextString().contains("And this is a relative link to another")); - + Collection detectedAnchors = documents[0].getAnchors(); assertNotNull(detectedAnchors); assertEquals(3, detectedAnchors.size());