Merge branch 'master' of ssh://git@gitorious.org/yacy/rc1.git

pull/1/head
Michael Peter Christen 10 years ago
commit 09d2867050

@ -72,20 +72,20 @@
<classpathentry kind="lib" path="lib/icu4j-core.jar"/>
<classpathentry kind="lib" path="lib/htmllexer.jar"/>
<classpathentry kind="lib" path="lib/jsoup-1.8.1.jar"/>
<classpathentry kind="lib" path="lib/jetty-client-9.2.3.v20140905.jar"/>
<classpathentry kind="lib" path="lib/jetty-continuation-9.2.3.v20140905.jar"/>
<classpathentry kind="lib" path="lib/jetty-deploy-9.2.3.v20140905.jar"/>
<classpathentry kind="lib" path="lib/jetty-http-9.2.3.v20140905.jar"/>
<classpathentry kind="lib" path="lib/jetty-io-9.2.3.v20140905.jar"/>
<classpathentry kind="lib" path="lib/jetty-jmx-9.2.3.v20140905.jar"/>
<classpathentry kind="lib" path="lib/jetty-proxy-9.2.3.v20140905.jar"/>
<classpathentry kind="lib" path="lib/jetty-security-9.2.3.v20140905.jar"/>
<classpathentry kind="lib" path="lib/jetty-server-9.2.3.v20140905.jar"/>
<classpathentry kind="lib" path="lib/jetty-servlet-9.2.3.v20140905.jar"/>
<classpathentry kind="lib" path="lib/jetty-servlets-9.2.3.v20140905.jar"/>
<classpathentry kind="lib" path="lib/jetty-util-9.2.3.v20140905.jar"/>
<classpathentry kind="lib" path="lib/jetty-webapp-9.2.3.v20140905.jar"/>
<classpathentry kind="lib" path="lib/jetty-xml-9.2.3.v20140905.jar"/>
<classpathentry kind="lib" path="lib/jetty-client-9.2.4.v20141103.jar"/>
<classpathentry kind="lib" path="lib/jetty-continuation-9.2.4.v20141103.jar"/>
<classpathentry kind="lib" path="lib/jetty-deploy-9.2.4.v20141103.jar"/>
<classpathentry kind="lib" path="lib/jetty-http-9.2.4.v20141103.jar"/>
<classpathentry kind="lib" path="lib/jetty-io-9.2.4.v20141103.jar"/>
<classpathentry kind="lib" path="lib/jetty-jmx-9.2.4.v20141103.jar"/>
<classpathentry kind="lib" path="lib/jetty-proxy-9.2.4.v20141103.jar"/>
<classpathentry kind="lib" path="lib/jetty-security-9.2.4.v20141103.jar"/>
<classpathentry kind="lib" path="lib/jetty-server-9.2.4.v20141103.jar"/>
<classpathentry kind="lib" path="lib/jetty-servlet-9.2.4.v20141103.jar"/>
<classpathentry kind="lib" path="lib/jetty-servlets-9.2.4.v20141103.jar"/>
<classpathentry kind="lib" path="lib/jetty-util-9.2.4.v20141103.jar"/>
<classpathentry kind="lib" path="lib/jetty-webapp-9.2.4.v20141103.jar"/>
<classpathentry kind="lib" path="lib/jetty-xml-9.2.4.v20141103.jar"/>
<classpathentry kind="lib" path="lib/javax.servlet-api-3.1.0.jar"/>
<classpathentry kind="lib" path="lib/weupnp-0.1.2.jar"/>
<classpathentry kind="output" path="gen"/>

@ -62,20 +62,20 @@
<string>$JAVAROOT/lib/jcl-over-slf4j-1.7.2.jar</string>
<string>$JAVAROOT/lib/jempbox-1.8.7.jar</string>
<string>$JAVAROOT/lib/javax.servlet-api-3.1.0.jar</string>
<string>$JAVAROOT/lib/jetty-client-9.2.3.v20140905.jar</string>
<string>$JAVAROOT/lib/jetty-continuation-9.2.3.v20140905.jar</string>
<string>$JAVAROOT/lib/jetty-deploy-9.2.3.v20140905.jar</string>
<string>$JAVAROOT/lib/jetty-http-9.2.3.v20140905.jar</string>
<string>$JAVAROOT/lib/jetty-io-9.2.3.v20140905.jar</string>
<string>$JAVAROOT/lib/jetty-jmx-9.2.3.v20140905.jar</string>
<string>$JAVAROOT/lib/jetty-proxy-9.2.3.v20140905.jar</string>
<string>$JAVAROOT/lib/jetty-security-9.2.3.v20140905.jar</string>
<string>$JAVAROOT/lib/jetty-server-9.2.3.v20140905.jar</string>
<string>$JAVAROOT/lib/jetty-servlet-9.2.3.v20140905.jar</string>
<string>$JAVAROOT/lib/jetty-servlets-9.2.3.v20140905.jar</string>
<string>$JAVAROOT/lib/jetty-util-9.2.3.v20140905.jar</string>
<string>$JAVAROOT/lib/jetty-webapp-9.2.3.v20140905.jar</string>
<string>$JAVAROOT/lib/jetty-xml-9.2.3.v20140905.jar</string>
<string>$JAVAROOT/lib/jetty-client-9.2.4.v20141103.jar</string>
<string>$JAVAROOT/lib/jetty-continuation-9.2.4.v20141103.jar</string>
<string>$JAVAROOT/lib/jetty-deploy-9.2.4.v20141103.jar</string>
<string>$JAVAROOT/lib/jetty-http-9.2.4.v20141103.jar</string>
<string>$JAVAROOT/lib/jetty-io-9.2.4.v20141103.jar</string>
<string>$JAVAROOT/lib/jetty-jmx-9.2.4.v20141103.jar</string>
<string>$JAVAROOT/lib/jetty-proxy-9.2.4.v20141103.jar</string>
<string>$JAVAROOT/lib/jetty-security-9.2.4.v20141103.jar</string>
<string>$JAVAROOT/lib/jetty-server-9.2.4.v20141103.jar</string>
<string>$JAVAROOT/lib/jetty-servlet-9.2.4.v20141103.jar</string>
<string>$JAVAROOT/lib/jetty-servlets-9.2.4.v20141103.jar</string>
<string>$JAVAROOT/lib/jetty-util-9.2.4.v20141103.jar</string>
<string>$JAVAROOT/lib/jetty-webapp-9.2.4.v20141103.jar</string>
<string>$JAVAROOT/lib/jetty-xml-9.2.4.v20141103.jar</string>
<string>$JAVAROOT/lib/jsch-0.1.50.jar</string>
<string>$JAVAROOT/lib/json-simple-1.1.1.jar</string>
<string>$JAVAROOT/lib/jsoup-1.8.1.jar</string>

@ -186,20 +186,20 @@
<pathelement location="${lib}/jcl-over-slf4j-1.7.2.jar" />
<pathelement location="${lib}/jempbox-1.8.7" />
<pathelement location="${lib}/javax.servlet-api-3.1.0.jar" />
<pathelement location="${lib}/jetty-client-9.2.3.v20140905.jar" />
<pathelement location="${lib}/jetty-continuation-9.2.3.v20140905.jar" />
<pathelement location="${lib}/jetty-deploy-9.2.3.v20140905.jar" />
<pathelement location="${lib}/jetty-http-9.2.3.v20140905.jar" />
<pathelement location="${lib}/jetty-io-9.2.3.v20140905.jar" />
<pathelement location="${lib}/jetty-jmx-9.2.3.v20140905.jar" />
<pathelement location="${lib}/jetty-proxy-9.2.3.v20140905.jar" />
<pathelement location="${lib}/jetty-security-9.2.3.v20140905.jar" />
<pathelement location="${lib}/jetty-server-9.2.3.v20140905.jar" />
<pathelement location="${lib}/jetty-servlet-9.2.3.v20140905.jar" />
<pathelement location="${lib}/jetty-servlets-9.2.3.v20140905.jar" />
<pathelement location="${lib}/jetty-util-9.2.3.v20140905.jar" />
<pathelement location="${lib}/jetty-webapp-9.2.3.v20140905.jar" />
<pathelement location="${lib}/jetty-xml-9.2.3.v20140905.jar" />
<pathelement location="${lib}/jetty-client-9.2.4.v20141103.jar" />
<pathelement location="${lib}/jetty-continuation-9.2.4.v20141103.jar" />
<pathelement location="${lib}/jetty-deploy-9.2.4.v20141103.jar" />
<pathelement location="${lib}/jetty-http-9.2.4.v20141103.jar" />
<pathelement location="${lib}/jetty-io-9.2.4.v20141103.jar" />
<pathelement location="${lib}/jetty-jmx-9.2.4.v20141103.jar" />
<pathelement location="${lib}/jetty-proxy-9.2.4.v20141103.jar" />
<pathelement location="${lib}/jetty-security-9.2.4.v20141103.jar" />
<pathelement location="${lib}/jetty-server-9.2.4.v20141103.jar" />
<pathelement location="${lib}/jetty-servlet-9.2.4.v20141103.jar" />
<pathelement location="${lib}/jetty-servlets-9.2.4.v20141103.jar" />
<pathelement location="${lib}/jetty-util-9.2.4.v20141103.jar" />
<pathelement location="${lib}/jetty-webapp-9.2.4.v20141103.jar" />
<pathelement location="${lib}/jetty-xml-9.2.4.v20141103.jar" />
<pathelement location="${lib}/jsch-0.1.50.jar" />
<pathelement location="${lib}/json-simple-1.1.1.jar" />
<pathelement location="${lib}/jsoup-1.8.1.jar" />

@ -462,6 +462,7 @@ public class Crawler_p {
indexMedia,
storeHTCache,
crawlOrder,
-1, // temporary; stub commit
cachePolicy,
collection,
agentName);

@ -152,6 +152,7 @@ public class QuickCrawlLink_p {
obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow,
indexText, indexMedia,
storeHTCache, remoteIndexing,
-1,
CacheStrategy.IFFRESH,
collection,
ClientIdentification.yacyIntranetCrawlerAgentName);

@ -2,9 +2,7 @@
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>YaCy '#[clientname]#': Peer Steering</title>
#(showtable)#::
<link rel="alternate" type="application/xml" title="Tables" href="Tables.rss?table=#[table]#" />
#(/showtable)#
#%env/templates/metas.template%#
<script type="text/javascript">
<!--
@ -40,7 +38,7 @@
</p>::#(/inline)#
<div id="api">
<a href="api/table_p.xml?table=api&count=100&search=" id="apilink"><img src="env/grafics/api.png" width="60" height="40" alt="API"/></a>
<a href="Tables_p.xml?table=api&count=100&search=" id="apilink"><img src="env/grafics/api.png" width="60" height="40" alt="API"/></a>
<span>The information that is presented on this page can also be retrieved as XML.
Click the API icon to see the XML.
To see a list of all APIs, please visit the <a href="http://www.yacy-websuche.de/wiki/index.php/Dev:API" target="_blank">API wiki page</a>.</span>

@ -115,7 +115,7 @@ Use the RSS search result format to add static searches to your RSS reader, if y
::
<p>No Results.</p>
::
<p>No Results. (length of search words must be at least 2 characters)</p>
<p>No Results. (length of search words must be at least 1 character)</p>
::
<div id="results"></div>
<div class="progress">

@ -845,8 +845,8 @@ public class yacysearch {
prop.put("excluded", "0");
}
if ( prop == null || prop.isEmpty() ) {
if ( post.get("query", post.get("search", "")).length() < 2 ) {
if (prop.isEmpty() || querystring.length() == 0) {
if ( querystring.length() == 0 ) { // querystring is trimmed originalquerystring
prop.put("num-results", "2"); // no results - at least 2 chars
} else {
prop.put("num-results", "1"); // no results

@ -3787,7 +3787,7 @@ Illegal prefer mask:==Ungültige bevorzugte Maske:
Did you mean:==Meinten Sie vielleicht:
The following words are stop-words and had been excluded from the search:==Folgende Wörter sind Stopwords und wurden von der Suche ausgeschlossen:
No Results.==Keine Ergebnisse.
length of search words must be at least 2 characters==Die Länge der Suchbegriffe muss mindestens 2 Zeichen betragen
length of search words must be at least 1 character==Die Länge der Suchbegriffe muss mindestens 1 Zeichen betragen
Searching the web with this peer is disabled for unauthorized users. Please==Die Websuche auf diesem Peer ist für nicht angemeldete Benutzer deaktiviert. Bitte
>log in<==>loggen Sie sich<
as administrator to use the search function==als Administrator ein, um die Suchfunktion nutzen zu können.

@ -4143,7 +4143,7 @@ Illegal prefer mask:==Недопустимый предпочтительный
Did you mean:==Возможно вы имели ввиду:
The following words are stop-words and had been excluded from the search:==Следующие слова являются стоп-словами и были исключены из поиска:
No Results.==Нет результатов.
length of search words must be at least 2 characters==Длина искомого слова должно быть не менее 2 символов
length of search words must be at least 1 character==Длина искомого слова должно быть не менее 1 символов
Searching the web with this peer is disabled for unauthorized users. Please==Поиск с помощью этого узла невозможен для неавторизованных пользователей. Пожалуйста,
>log in<==>войдите<
as administrator to use the search function==под своей учётной записью, для использования поиска.

@ -83,7 +83,7 @@
<compilation-unit>
<package-root>source</package-root>
<package-root>htroot</package-root>
<classpath mode="compile">lib/J7Zip-modified.jar;lib/apache-mime4j-0.6.jar;lib/bcmail-jdk15-1.46.jar;lib/bcprov-jdk15-1.46.jar;lib/commons-codec-1.9.jar;lib/commons-compress-1.8.1.jar;lib/commons-fileupload-1.2.2.jar;lib/commons-io-2.3.jar;lib/commons-jxpath-1.3.jar;lib/commons-lang-2.6.jar;lib/commons-logging-1.1.3.jar;lib/fontbox-1.8.7.jar;lib/geronimo-stax-api_1.0_spec-1.0.1.jar;lib/guava-16.0.1.jar;lib/htmllexer.jar;lib/httpclient-4.3.6.jar;lib/httpcore-4.3.3.jar;lib/httpmime-4.3.6.jar;lib/icu4j-core.jar;lib/jakarta-oro-2.0.8.jar;lib/jaudiotagger-2.0.4-20111207.115108-15.jar;lib/javax.servlet-api-3.1.0.jar;lib/jcifs-1.3.17.jar;lib/jcl-over-slf4j-1.7.2.jar;lib/jempbox-1.8.7.jar;lib/jetty-client-9.2.3.v20140905.jar;lib/jetty-continuation-9.2.3.v20140905.jar;lib/jetty-deploy-9.2.3.v20140905.jar;lib/jetty-http-9.2.3.v20140905.jar;lib/jetty-io-9.2.3.v20140905.jar;lib/jetty-jmx-9.2.3.v20140905.jar;lib/jetty-proxy-9.2.3.v20140905.jar;lib/jetty-security-9.2.3.v20140905.jar;lib/jetty-server-9.2.3.v20140905.jar;lib/jetty-servlet-9.2.3.v20140905.jar;lib/jetty-servlets-9.2.3.v20140905.jar;lib/jetty-util-9.2.3.v20140905.jar;lib/jetty-webapp-9.2.3.v20140905.jar;lib/jetty-xml-9.2.3.v20140905.jar;lib/jsch-0.1.50.jar;lib/json-simple-1.1.1.jar;lib/jsoup-1.8.1.jar;lib/log4j-over-slf4j-1.7.2.jar;lib/lucene-analyzers-common-4.10.2.jar;lib/lucene-analyzers-phonetic-4.10.2.jar;lib/lucene-classification-4.10.2.jar;lib/lucene-codecs-4.10.2.jar;lib/lucene-core-4.10.2.jar;lib/lucene-facet-4.10.2.jar;lib/lucene-grouping-4.10.2.jar;lib/lucene-highlighter-4.10.2.jar;lib/lucene-join-4.10.2.jar;lib/lucene-memory-4.10.2.jar;lib/lucene-misc-4.10.2.jar;lib/lucene-queries-4.10.2.jar;lib/lucene-queryparser-4.10.2.jar;lib/lucene-spatial-4.10.2.jar;lib/lucene-suggest-4.10.2.jar;lib/metadata-extractor-2.6.2.jar;lib/noggit-0.5.jar;lib/org.restlet.jar;lib/pdfbox-1.8.7.jar;lib/poi-3.10-FINAL-20140208.jar;lib/poi-scratchpad-3.10-FINAL-20140208.jar;lib/slf4j-api-1.7.6.jar;lib/slf4j-jdk14-1.7.2.jar;lib/solr-core-4.10.2.jar;lib/solr-solrj-4.10.2.jar;lib/spatial4j-0.4.1.jar;lib/webcat-0.1-swf.jar;lib/weupnp-0.1.2.jar;lib/wstx-asl-3.2.9.jar;lib/xercesImpl.jar;lib/xml-apis.jar;lib/zookeeper-3.4.6.jar</classpath>
<classpath mode="compile">lib/J7Zip-modified.jar;lib/apache-mime4j-0.6.jar;lib/bcmail-jdk15-1.46.jar;lib/bcprov-jdk15-1.46.jar;lib/commons-codec-1.9.jar;lib/commons-compress-1.8.1.jar;lib/commons-fileupload-1.2.2.jar;lib/commons-io-2.3.jar;lib/commons-jxpath-1.3.jar;lib/commons-lang-2.6.jar;lib/commons-logging-1.1.3.jar;lib/fontbox-1.8.7.jar;lib/geronimo-stax-api_1.0_spec-1.0.1.jar;lib/guava-16.0.1.jar;lib/htmllexer.jar;lib/httpclient-4.3.6.jar;lib/httpcore-4.3.3.jar;lib/httpmime-4.3.6.jar;lib/icu4j-core.jar;lib/jakarta-oro-2.0.8.jar;lib/jaudiotagger-2.0.4-20111207.115108-15.jar;lib/javax.servlet-api-3.1.0.jar;lib/jcifs-1.3.17.jar;lib/jcl-over-slf4j-1.7.2.jar;lib/jempbox-1.8.7.jar;lib/jetty-client-9.2.4.v20141103.jar;lib/jetty-continuation-9.2.4.v20141103.jar;lib/jetty-deploy-9.2.4.v20141103.jar;lib/jetty-http-9.2.4.v20141103.jar;lib/jetty-io-9.2.4.v20141103.jar;lib/jetty-jmx-9.2.4.v20141103.jar;lib/jetty-proxy-9.2.4.v20141103.jar;lib/jetty-security-9.2.4.v20141103.jar;lib/jetty-server-9.2.4.v20141103.jar;lib/jetty-servlet-9.2.4.v20141103.jar;lib/jetty-servlets-9.2.4.v20141103.jar;lib/jetty-util-9.2.4.v20141103.jar;lib/jetty-webapp-9.2.4.v20141103.jar;lib/jetty-xml-9.2.4.v20141103.jar;lib/jsch-0.1.50.jar;lib/json-simple-1.1.1.jar;lib/jsoup-1.8.1.jar;lib/log4j-over-slf4j-1.7.2.jar;lib/lucene-analyzers-common-4.10.2.jar;lib/lucene-analyzers-phonetic-4.10.2.jar;lib/lucene-classification-4.10.2.jar;lib/lucene-codecs-4.10.2.jar;lib/lucene-core-4.10.2.jar;lib/lucene-facet-4.10.2.jar;lib/lucene-grouping-4.10.2.jar;lib/lucene-highlighter-4.10.2.jar;lib/lucene-join-4.10.2.jar;lib/lucene-memory-4.10.2.jar;lib/lucene-misc-4.10.2.jar;lib/lucene-queries-4.10.2.jar;lib/lucene-queryparser-4.10.2.jar;lib/lucene-spatial-4.10.2.jar;lib/lucene-suggest-4.10.2.jar;lib/metadata-extractor-2.6.2.jar;lib/noggit-0.5.jar;lib/org.restlet.jar;lib/pdfbox-1.8.7.jar;lib/poi-3.10-FINAL-20140208.jar;lib/poi-scratchpad-3.10-FINAL-20140208.jar;lib/slf4j-api-1.7.6.jar;lib/slf4j-jdk14-1.7.2.jar;lib/solr-core-4.10.2.jar;lib/solr-solrj-4.10.2.jar;lib/spatial4j-0.4.1.jar;lib/webcat-0.1-swf.jar;lib/weupnp-0.1.2.jar;lib/wstx-asl-3.2.9.jar;lib/xercesImpl.jar;lib/xml-apis.jar;lib/zookeeper-3.4.6.jar</classpath>
<built-to>lib/yacycore.jar</built-to>
<source-level>1.7</source-level>
</compilation-unit>

@ -533,68 +533,67 @@
<dependency>
<groupId>org.eclipse.jetty</groupId>
<artifactId>jetty-server</artifactId>
<version>9.2.3.v20140905</version>
<version>9.2.4.v20141103</version>
</dependency>
<dependency>
<groupId>org.eclipse.jetty</groupId>
<artifactId>jetty-servlet</artifactId>
<version>9.2.3.v20140905</version>
<version>9.2.4.v20141103</version>
</dependency>
<dependency>
<groupId>org.eclipse.jetty</groupId>
<artifactId>jetty-servlets</artifactId>
<version>9.2.3.v20140905</version>
<version>9.2.4.v20141103</version>
</dependency>
<dependency>
<groupId>org.eclipse.jetty</groupId>
<artifactId>jetty-webapp</artifactId>
<version>9.2.3.v20140905</version>
<version>9.2.4.v20141103</version>
</dependency>
<dependency>
<groupId>org.eclipse.jetty</groupId>
<artifactId>jetty-util</artifactId>
<version>9.2.3.v20140905</version>
<type>jar</type>
<version>9.2.4.v20141103</version>
</dependency>
<dependency>
<groupId>org.eclipse.jetty</groupId>
<artifactId>jetty-xml</artifactId>
<version>9.2.3.v20140905</version>
<version>9.2.4.v20141103</version>
</dependency>
<dependency>
<groupId>org.eclipse.jetty</groupId>
<artifactId>jetty-http</artifactId>
<version>9.2.3.v20140905</version>
<version>9.2.4.v20141103</version>
</dependency>
<dependency>
<groupId>org.eclipse.jetty</groupId>
<artifactId>jetty-security</artifactId>
<version>9.2.3.v20140905</version>
<version>9.2.4.v20141103</version>
</dependency>
<dependency>
<groupId>org.eclipse.jetty</groupId>
<artifactId>jetty-io</artifactId>
<version>9.2.3.v20140905</version>
<version>9.2.4.v20141103</version>
</dependency>
<dependency>
<groupId>org.eclipse.jetty</groupId>
<artifactId>jetty-continuation</artifactId>
<version>9.2.3.v20140905</version>
<version>9.2.4.v20141103</version>
</dependency>
<dependency>
<groupId>org.eclipse.jetty</groupId>
<artifactId>jetty-jmx</artifactId>
<version>9.2.3.v20140905</version>
<version>9.2.4.v20141103</version>
</dependency>
<dependency>
<groupId>org.eclipse.jetty</groupId>
<artifactId>jetty-proxy</artifactId>
<version>9.2.3.v20140905</version>
<version>9.2.4.v20141103</version>
</dependency>
<dependency>
<groupId>org.eclipse.jetty</groupId>
<artifactId>jetty-deploy</artifactId>
<version>9.2.3.v20140905</version>
<version>9.2.4.v20141103</version>
</dependency>
<dependency>
<groupId>org.bitlet</groupId>

@ -293,6 +293,7 @@ public final class CrawlSwitchboard {
sb.getConfigBool(SwitchboardConstants.PROXY_INDEXING_LOCAL_MEDIA, true),
true,
sb.getConfigBool(SwitchboardConstants.PROXY_INDEXING_REMOTE, false),
-1,
CacheStrategy.IFFRESH,
"robot_" + CRAWL_PROFILE_PROXY,
ClientIdentification.yacyProxyAgentName);
@ -322,6 +323,7 @@ public final class CrawlSwitchboard {
true,
false,
false,
-1,
CacheStrategy.IFFRESH,
"robot_" + CRAWL_PROFILE_REMOTE,
ClientIdentification.yacyInternetCrawlerAgentName);
@ -351,6 +353,7 @@ public final class CrawlSwitchboard {
false,
true,
false,
-1,
CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_SNIPPET_LOCAL_TEXT,
ClientIdentification.yacyIntranetCrawlerAgentName);
@ -380,6 +383,7 @@ public final class CrawlSwitchboard {
true,
true,
false,
-1,
CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT,
ClientIdentification.yacyIntranetCrawlerAgentName);
@ -410,6 +414,7 @@ public final class CrawlSwitchboard {
false,
true,
false,
-1,
CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_GREEDY_LEARNING_TEXT,
ClientIdentification.browserAgentName);
@ -439,6 +444,7 @@ public final class CrawlSwitchboard {
false,
true,
false,
-1,
CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA,
ClientIdentification.yacyIntranetCrawlerAgentName);
@ -468,6 +474,7 @@ public final class CrawlSwitchboard {
true,
true,
false,
-1,
CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA,
ClientIdentification.yacyIntranetCrawlerAgentName);
@ -497,6 +504,7 @@ public final class CrawlSwitchboard {
false,
false,
false,
-1,
CacheStrategy.NOCACHE,
"robot_" + CRAWL_PROFILE_SURROGATE,
ClientIdentification.yacyIntranetCrawlerAgentName);
@ -529,6 +537,7 @@ public final class CrawlSwitchboard {
true,
false,
false,
-1,
CacheStrategy.NOCACHE,
collection,
ClientIdentification.yacyIntranetCrawlerAgentName);

@ -86,6 +86,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
public static final String INDEXING_URL_MUSTNOTMATCH = "indexURLMustNotMatch";
public static final String INDEXING_CONTENT_MUSTMATCH = "indexContentMustMatch";
public static final String INDEXING_CONTENT_MUSTNOTMATCH = "indexContentMustNotMatch";
public static final String LOADPREVIEWMAXDEPTH = "loadpreviewmaxdepth"; // if previews shall be loaded, this is positive and denotes the maximum depth; if not this is -1
private Pattern crawlerurlmustmatch = null, crawlerurlmustnotmatch = null;
private Pattern crawleripmustmatch = null, crawleripmustnotmatch = null;
@ -141,6 +142,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
final boolean indexMedia,
final boolean storeHTCache,
final boolean remoteIndexing,
final int loadPreviewMaxdepth,
final CacheStrategy cacheStrategy,
final String collections,
final String userAgentName) {
@ -176,6 +178,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
put(INDEX_MEDIA, indexMedia);
put(STORE_HTCACHE, storeHTCache);
put(REMOTE_INDEXING, remoteIndexing);
put(LOADPREVIEWMAXDEPTH, loadPreviewMaxdepth);
put(CACHE_STRAGEGY, cacheStrategy.toString());
put(COLLECTIONS, CommonPattern.SPACE.matcher(collections.trim()).replaceAll(""));
}
@ -565,11 +568,25 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
if (r == null) return false;
return (r.equals(Boolean.TRUE.toString()));
}
public boolean remoteIndexing() {
final String r = get(REMOTE_INDEXING);
if (r == null) return false;
return (r.equals(Boolean.TRUE.toString()));
}
public int loadPreviewMaxdepth() {
final String r = get(LOADPREVIEWMAXDEPTH);
if (r == null) return -1;
try {
final int i = Integer.parseInt(r);
if (i < 0) return -1;
return i;
} catch (final NumberFormatException e) {
ConcurrentLog.logException(e);
return -1;
}
}
/**
* get a recrawl date for a given age in minutes
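
A note on the hunks above: the new loadpreviewmaxdepth property is stored as a string in the profile map and read back through loadPreviewMaxdepth() with a -1 fallback, so the -1 that every call site passes simply keeps previews disabled, while a value of N would allow PDF snapshots for documents up to crawl depth N. A minimal, self-contained sketch of that gate (class and variable names are invented for illustration and are not part of the commit):

public final class PreviewGateSketch {
    // mirrors the guard used in HTTPLoader.load(): entry.depth() <= profile.loadPreviewMaxdepth()
    static boolean loadPreview(final int requestDepth, final int loadPreviewMaxdepth) {
        return requestDepth <= loadPreviewMaxdepth;
    }
    public static void main(final String[] args) {
        System.out.println(loadPreview(0, -1)); // false: the default -1 disables previews entirely
        System.out.println(loadPreview(2, 3));  // true: depth 2 is within a preview limit of 3
    }
}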

@ -0,0 +1,160 @@
/**
* DocumentImage
* Copyright 2014 by Michael Peter Christen
* First released 29.11.2014 at http://yacy.net
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.crawler.data;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import org.apache.solr.common.SolrDocument;
import net.yacy.cora.date.GenericFormatter;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.search.index.Fulltext;
import net.yacy.search.schema.CollectionSchema;
/**
* This class hosts document snapshots.
*
* The storage is organized in the following hierarchy:
* - in the root path are subpaths for each host:port
* - in the host:port path are subpaths for the crawl depth, two digits length
* - in the crawl depth path are subpaths for the first two characters of the url-hash, called shard
* - in the shard path are files, named with <urlhash>'.'<date>.<ext>
* .. where the <date> has the form "yyyyMMdd" and ext may be one of {pdf,jpg,png,xml,json}.
* The pdf is created with wkhtmltopdf, jpg/png is created with convert
* and the xml/json is an extract from solr.
*
* Including the date in the file name makes it possible to keep several copies of the same document
* for different snapshot times. Using the crawl depth makes it easier to extract a specific part
* of the domain.
*/
public class Snapshots {
private File storageLocation;
public Snapshots(File location) {
this.storageLocation = location;
}
/**
* Load a pdf snapshot of a document.
* A proxy must be given to ensure that repeated loads of embedded resources (e.g. images) are cached.
* Use http://localhost:<thisport> as proxy.
* @param url
* @param depth
* @param date
* @param proxy - a string of the form 'http://<host>:<port>'
* @return
*/
public File downloadPDFSnapshot(final DigestURL url, final int depth, final Date date, String proxy) {
File path = definePath(url, "pdf", depth, date);
path.getParentFile().mkdirs();
// STUB
return path;
}
/**
* Compute the path of a snapshot. This does not create the snapshot, only gives a path.
* Also, the path to the storage location is not created.
* @param url
* @param ext
* @param depth
* @param date
* @return a file to the snapshot
*/
public File definePath(final DigestURL url, final String ext, final int depth, final Date date) {
String id = ASCII.String(url.hash());
String ds = GenericFormatter.SHORT_DAY_FORMATTER.format(date);
File path = new File(pathToShard(url, depth), id + "." + ds + "." + ext);
return path;
}
/**
* get the depth to a document, helper method for definePath to determine the depth value
* @param url
* @param fulltext
* @return the crawldepth of the document
*/
public int getDepth(final DigestURL url, final Fulltext fulltext) {
Integer depth = null;
if (fulltext.getDefaultConfiguration().contains(CollectionSchema.crawldepth_i)) {
try {
SolrDocument doc = fulltext.getDefaultConnector().getDocumentById(ASCII.String(url.hash()), CollectionSchema.crawldepth_i.getSolrFieldName());
if (doc != null) {
depth = (Integer) doc.getFieldValue(CollectionSchema.crawldepth_i.getSolrFieldName());
}
} catch (IOException e) {
}
}
return depth == null ? 0 : depth;
}
/**
* for a given url, get all paths for storage locations.
* The locations are all for the single url but may represent different storage times.
* This method is inefficient because it probes every depth from 0 to 99; it is better to use
* findPaths/3 with a given depth.
* @param url
* @param ext
* @return a set of files for snapshots of the url
*/
public Collection<File> findPaths(final DigestURL url, final String ext) {
for (int i = 0; i < 100; i++) {
Collection<File> paths = findPaths(url, ext, i);
if (paths.size() > 0) return paths;
}
return new ArrayList<>(0);
}
/**
* for a given url, get all paths for storage locations.
* The locations are all for the single url but may represent different storage times.
* @param url
* @param ext
* @param depth
* @return a set of files for snapshots of the url
*/
public Collection<File> findPaths(final DigestURL url, final String ext, final int depth) {
String id = ASCII.String(url.hash());
File pathToShard = pathToShard(url, depth);
String[] list = pathToShard.list();
ArrayList<File> paths = new ArrayList<>();
if (list != null) { // the shard directory may not exist yet
for (String f: list) {
if (f.startsWith(id) && f.endsWith(ext)) paths.add(new File(pathToShard, f));
}
}
return paths;
}
private File pathToShard(final DigestURL url, final int depth) {
String id = ASCII.String(url.hash());
File pathToHostDir = new File(storageLocation, url.getHost() + ":" + url.getPort());
File pathToDepthDir = new File(pathToHostDir, depth < 10 ? "0" + depth : Integer.toString(depth));
File pathToShard = new File(pathToDepthDir, id.substring(0, 2));
return pathToShard;
}
}
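
To make the storage hierarchy described in the class comment concrete, here is a stand-alone sketch (not part of the commit) that mirrors definePath() and pathToShard() with plain strings. The storage root is assumed to be DATA/HTDOCS/SNAPSHOTS, matching the Switchboard wiring further below; the host, url-hash and date are invented for illustration.

import java.io.File;

public final class SnapshotPathSketch {
    public static void main(final String[] args) {
        final File root = new File("DATA/HTDOCS/SNAPSHOTS");   // assumed: htDocsPath + "SNAPSHOTS"
        final String urlhash = "AbCdEfGhIjKl";                  // invented url-hash
        final String day = "20141129";                          // SHORT_DAY_FORMATTER-style date
        final int depth = 3;

        final File hostDir  = new File(root, "example.org:80"); // host:port
        final File depthDir = new File(hostDir, depth < 10 ? "0" + depth : Integer.toString(depth)); // two-digit depth
        final File shardDir = new File(depthDir, urlhash.substring(0, 2)); // shard = first two hash characters
        final File snapshot = new File(shardDir, urlhash + "." + day + ".pdf");

        // prints DATA/HTDOCS/SNAPSHOTS/example.org:80/03/Ab/AbCdEfGhIjKl.20141129.pdf (with unix separators)
        System.out.println(snapshot.getPath());
    }
}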

@ -25,6 +25,7 @@
package net.yacy.crawler.retrieval;
import java.io.IOException;
import java.util.Date;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.federate.solr.FailCategory;
@ -68,10 +69,16 @@ public final class HTTPLoader {
}
public Response load(final Request entry, CrawlProfile profile, final int maxFileSize, final BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException {
// load fulltext of html page
Latency.updateBeforeLoad(entry.url());
final long start = System.currentTimeMillis();
final Response doc = load(entry, profile, DEFAULT_CRAWLING_RETRY_COUNT, maxFileSize, blacklistType, agent);
Latency.updateAfterLoad(entry.url(), System.currentTimeMillis() - start);
// load pdf in case that is wanted. This can later be used to compute a web page preview in the search results
if (entry.depth() <= profile.loadPreviewMaxdepth() && "html|shtml|php".indexOf(entry.url().getFile()) >= 0) {
sb.snapshots.downloadPDFSnapshot(entry.url(), entry.depth(), new Date(), "http://127.0.0.1:" + sb.getConfigInt("port", 8090));
}
return doc;
}

@ -186,6 +186,7 @@ public class YMarkCrawlStart extends HashMap<String,String>{
crawlingQ,
true, true, true, false,
true, true, false,
-1,
CacheStrategy.IFFRESH,
"robot_" + CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA,
ClientIdentification.yacyIntranetCrawlerAgentName); // TODO: make this a default profile in CrawlSwitchboard

@ -58,6 +58,10 @@ public class BufferedObjectIndex implements Index, Iterable<Row.Entry> {
this.entryComparator = new Row.EntryComparator(backend.row().objectOrder);
}
public boolean isOnDemand() {
return this.backend instanceof OnDemandOpenFileIndex;
}
@Override
public byte[] smallestKey() {
if (this.buffer == null || this.buffer.isEmpty()) return this.backend.smallestKey();

@ -31,7 +31,6 @@ import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Vector;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.util.ConcurrentLog;
@ -197,13 +196,13 @@ public final class OS {
FileUtils.deletedelete(starterFile);
}
public static Vector<String> execSynchronous(final String command) throws IOException {
public static List<String> execSynchronous(final String command) throws IOException {
// runs a unix/linux command and returns its output as a List of Strings
// this method blocks until the command is executed
final Process p = Runtime.getRuntime().exec(command);
final BufferedReader in = new BufferedReader(new InputStreamReader(p.getInputStream()));
String text;
final Vector<String> output = new Vector<String>();
final List<String> output = new ArrayList<String>();
while ((text = in.readLine()) != null) {
output.add(text);
}
@ -212,9 +211,16 @@ public final class OS {
}
public static void main(final String[] args) {
try {
List<String> v = execSynchronous("/usr/local/bin/wkhtmltoimage");
for (String r: v) java.lang.System.out.println(r);
} catch (IOException e) {
}
/*
if (args[0].equals("-m")) {
java.lang.System.out.println("Maximum possible memory: " + Integer.toString(getWin32MaxHeap()) + "m");
}
*/
}
}
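
The new List-returning execSynchronous() together with the wkhtmltoimage test in main() hints at how the Snapshots.downloadPDFSnapshot() stub above could later be filled in. The following is only a hedged sketch under that assumption: the binary path and wkhtmltopdf's --proxy option are not confirmed by this diff, and the class and method names are invented.

import java.io.File;
import java.io.IOException;
import java.util.List;

import net.yacy.kelondro.util.OS; // the helper class changed above; package assumed from the project layout

public final class PdfSnapshotSketch {
    public static void renderPdf(final String url, final File target, final String proxy) {
        // hypothetical call: render the page to PDF, fetching it through the local YaCy proxy
        final String command = "/usr/local/bin/wkhtmltopdf --proxy " + proxy + " " + url + " " + target.getAbsolutePath();
        try {
            final List<String> output = OS.execSynchronous(command);
            for (final String line : output) System.out.println(line); // pass wkhtmltopdf's output to the log
        } catch (final IOException e) {
            // wkhtmltopdf missing or rendering failed: no snapshot is written
        }
    }
}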

@ -122,6 +122,7 @@ import net.yacy.crawler.HarvestProcess;
import net.yacy.crawler.data.Cache;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.data.CrawlQueues;
import net.yacy.crawler.data.Snapshots;
import net.yacy.crawler.data.NoticedURL;
import net.yacy.crawler.data.ResultImages;
import net.yacy.crawler.data.ResultURLs;
@ -243,6 +244,7 @@ public final class Switchboard extends serverSwitch {
public File queuesRoot;
public File surrogatesInPath;
//public File surrogatesOutPath;
public Snapshots snapshots;
public Segment index;
public LoaderDispatcher loader;
public CrawlSwitchboard crawler;
@ -344,6 +346,7 @@ public final class Switchboard extends serverSwitch {
this.htDocsPath =
getDataPath(SwitchboardConstants.HTDOCS_PATH, SwitchboardConstants.HTDOCS_PATH_DEFAULT);
this.log.config("HTDOCS Path: " + this.htDocsPath.toString());
this.snapshots = new Snapshots(new File(this.htDocsPath, "SNAPSHOTS"));
this.workPath = getDataPath(SwitchboardConstants.WORK_PATH, SwitchboardConstants.WORK_PATH_DEFAULT);
this.workPath.mkdirs();
// if default work files exist, copy them (don't overwrite existing!)
@ -3853,27 +3856,6 @@ public final class Switchboard extends serverSwitch {
i++;
}
}
/*
public File getPDF(DigestURL url) {
String depth = "00";
String idstub = ASCII.String(url.hash()).substring(0, 6);
if (this.index.fulltext().getDefaultConfiguration().contains(CollectionSchema.crawldepth_i)) {
try {
SolrDocument doc = this.index.fulltext().getDefaultConnector().getDocumentById(ASCII.String(url.hash()), CollectionSchema.crawldepth_i.getSolrFieldName());
if (doc != null) {
depth = (String) doc.getFieldValue(CollectionSchema.crawldepth_i.getSolrFieldName());
if (depth == null) depth = "00"; else if (depth.length() < 2) depth = "0" + depth;
}
} catch (IOException e) {
}
}
File pathToPdf = new File(this.htCachePath, url.getHost() + ":" + url.getPort());
File pdfFile = new File(pathToPdf, depth + "-" + idstub);
}
*/
public void checkInterruption() throws InterruptedException {
final Thread curThread = Thread.currentThread();
