Merge branch 'master' of gitorious.org:yacy/rc1

pull/1/head
Marc Nause 10 years ago
commit ce9368246b

@ -32,11 +32,12 @@
</colgroup>
<tr class="TableHeader">
<td><strong>Crawl Thread</strong></td>
<td><strong>Collections</strong></td>
<td><strong>Status</strong></td>
<td><strong>Depth</strong></td>
<td><strong>Must Match</strong></td>
<td><strong>Must Not Match</strong></td>
<td><strong>MaxAge</strong></td>
<td><strong>Recrawl if older than</strong></td>
<td><strong>Domain Counter Content</strong></td>
<td><strong>Max Page Per Domain</strong></td>
<td><strong>Accept '?' URLs</strong></td>
@ -48,6 +49,7 @@
#{crawlProfiles}#
<tr class="TableCell#(dark)#Light::Dark#(/dark)#">
<td>#[name]#</td>
<td>#[collections]#</td>
<td>#(terminateButton)#::
<div style="text-decoration:blink">Running</div>
<form action="CrawlProfileEditor_p.html" method="get" enctype="multipart/form-data" accept-charset="UTF-8"><div>
@ -64,13 +66,13 @@
#(/deleteButton)#
</td>
<td>#[depth]#</td>
<td>#[mustmatch]#</td>
<td>#[mustnotmatch]#</td>
<td>#[crawlingIfOlder]#</td>
<td>#[crawlerURLMustMatch]#</td>
<td>#[crawlerURLMustNotMatch]#</td>
<td>#[recrawlIfOlder]#</td>
<td>#{crawlingDomFilterContent}##[item]#<br />#{/crawlingDomFilterContent}#</td>
<td>#[crawlingDomMaxPages]#</td>
<td>#[domMaxPages]#</td>
<td>#(withQuery)#no::yes#(/withQuery)#</td>
<td>#(storeCache)#no::yes#(/storeCache)#</td>
<td>#(storeHTCache)#no::yes#(/storeHTCache)#</td>
<td>#(indexText)#no::yes#(/indexText)#</td>
<td>#(indexMedia)#no::yes#(/indexMedia)#</td>
<td>#(remoteIndexing)#no::yes#(/remoteIndexing)#</td>

@ -2,23 +2,40 @@
<crawlProfiles>
#{crawlProfiles}#
<crawlProfile>
<handle>#[handle]#</handle>
<name>#[name]#</name>
<status>#(status)#terminated::active::system#(/status)#</status>
<collections>#[collections]#</collections>
<agentName>#[agentName]#</agentName>
<userAgent>#[userAgent]#</userAgent>
<depth>#[depth]#</depth>
<mustmatch>#[mustmatch]#</mustmatch>
<mustnotmatch>#[mustnotmatch]#</mustnotmatch>
<crawlingIfOlder>#[crawlingIfOlder]#</crawlingIfOlder>
<directDocByURL>#(directDocByURL)#false::true#(/directDocByURL)#</directDocByURL>
<recrawlIfOlder>#[recrawlIfOlder]#</recrawlIfOlder>
<domMaxPages>#[domMaxPages]#</domMaxPages>
<crawlingQ>#(crawlingQ)#false::true#(/crawlingQ)#</crawlingQ>
<followFrames>#(followFrames)#false::true#(/followFrames)#</followFrames>
<obeyHtmlRobotsNoindex>#(obeyHtmlRobotsNoindex)#false::true#(/obeyHtmlRobotsNoindex)#</obeyHtmlRobotsNoindex>
<obeyHtmlRobotsNofollow>#(obeyHtmlRobotsNofollow)#false::true#(/obeyHtmlRobotsNofollow)#</obeyHtmlRobotsNofollow>
<indexText>#(indexText)#false::true#(/indexText)#</indexText>
<indexMedia>#(indexMedia)#false::true#(/indexMedia)#</indexMedia>
<storeHTCache>#(storeHTCache)#false::true#(/storeHTCache)#</storeHTCache>
<remoteIndexing>#(remoteIndexing)#false::true#(/remoteIndexing)#</remoteIndexing>
<cacheStrategy>#[cacheStrategy]#</cacheStrategy>
<crawlerURLMustMatch>#[crawlerURLMustMatch]#</crawlerURLMustMatch>
<crawlerURLMustNotMatch>#[crawlerURLMustNotMatch]#</crawlerURLMustNotMatch>
<crawlerIPMustMatch>#[crawlerIPMustMatch]#</crawlerIPMustMatch>
<crawlerIPMustNotMatch>#[crawlerIPMustNotMatch]#</crawlerIPMustNotMatch>
<crawlerCountryMustMatch>#[crawlerCountryMustMatch]#</crawlerCountryMustMatch>
<crawlerNoLimitURLMustMatch>#[crawlerNoLimitURLMustMatch]#</crawlerNoLimitURLMustMatch>
<indexURLMustMatch>#[indexURLMustMatch]#</indexURLMustMatch>
<indexURLMustNotMatch>#[indexURLMustNotMatch]#</indexURLMustNotMatch>
<indexContentMustMatch>#[indexContentMustMatch]#</indexContentMustMatch>
<indexContentMustNotMatch>#[indexContentMustNotMatch]#</indexContentMustNotMatch>
<status>#(status)#terminated::active::system#(/status)#</status>
<crawlingDomFilterContent>
#{crawlingDomFilterContent}#
<item>#[item]#</item>
#{/crawlingDomFilterContent}#
</crawlingDomFilterContent>
<crawlingDomMaxPages>#[crawlingDomMaxPages]#</crawlingDomMaxPages>
<withQuery>#(withQuery)#no::yes#(/withQuery)#</withQuery>
<storeCache>#(storeCache)#no::yes#(/storeCache)#</storeCache>
<indexText>#(indexText)#no::yes#(/indexText)#</indexText>
<indexMedia>#(indexMedia)#no::yes#(/indexMedia)#</indexMedia>
<remoteIndexing>#(remoteIndexing)#no::yes#(/remoteIndexing)#</remoteIndexing>
</crawlProfile>
#{/crawlProfiles}#
</crawlProfiles>

@ -25,7 +25,6 @@
package net.yacy.crawler.data;
import java.text.DateFormat;
import java.util.Collection;
import java.util.Date;
import java.util.HashMap;
@ -625,39 +624,58 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
boolean terminateButton = active && !CrawlSwitchboard.DEFAULT_PROFILES.contains(this.name());
boolean deleteButton = !active;
prop.put(CRAWL_PROFILE_PREFIX + count + "_dark", dark ? "1" : "0");
prop.put(CRAWL_PROFILE_PREFIX + count + "_name", this.collectionName());
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_handle", this.handle());
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_name", this.name());
//prop.putXML(CRAWL_PROFILE_PREFIX + count + "_collection", this.get(COLLECTIONS)); // TODO: remove, replace with 'collections'
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_collections", this.get(COLLECTIONS));
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_agentName", this.get(AGENT_NAME));
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_userAgent", this.getAgent().userAgent);
prop.put(CRAWL_PROFILE_PREFIX + count + "_depth", this.depth());
prop.put(CRAWL_PROFILE_PREFIX + count + "_directDocByURL", this.directDocByURL() ? 1 : 0);
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_recrawlIfOlder", this.recrawlIfOlder() == Long.MAX_VALUE ? "eternity" : (new Date(this.recrawlIfOlder()).toString()));
prop.put(CRAWL_PROFILE_PREFIX + count + "_domMaxPages", this.domMaxPages());
//prop.put(CRAWL_PROFILE_PREFIX + count + "_crawlingDomMaxPages", (this.domMaxPages() == Integer.MAX_VALUE) ? "unlimited" : Integer.toString(this.domMaxPages())); // TODO: remove, replace with 'domMaxPages'
prop.put(CRAWL_PROFILE_PREFIX + count + "_crawlingQ", this.crawlingQ() ? 1 : 0);
//prop.put(CRAWL_PROFILE_PREFIX + count + "_withQuery", (this.crawlingQ()) ? "1" : "0"); // TODO: remove, replace with crawlingQ
prop.put(CRAWL_PROFILE_PREFIX + count + "_followFrames", this.followFrames() ? 1 : 0);
prop.put(CRAWL_PROFILE_PREFIX + count + "_obeyHtmlRobotsNoindex", this.obeyHtmlRobotsNoindex() ? 1 : 0);
prop.put(CRAWL_PROFILE_PREFIX + count + "_obeyHtmlRobotsNofollow", this.obeyHtmlRobotsNofollow() ? 1 : 0);
prop.put(CRAWL_PROFILE_PREFIX + count + "_indexText", this.indexText() ? 1 : 0);
prop.put(CRAWL_PROFILE_PREFIX + count + "_indexMedia", this.indexMedia() ? 1 : 0);
//prop.put(CRAWL_PROFILE_PREFIX + count + "_storeCache", this.storeHTCache() ? 1 : 0); // TODO: remove, replace with 'storeHTCache'
prop.put(CRAWL_PROFILE_PREFIX + count + "_storeHTCache", this.storeHTCache() ? 1 : 0);
prop.put(CRAWL_PROFILE_PREFIX + count + "_remoteIndexing", this.remoteIndexing() ? 1 : 0);
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_cacheStrategy", this.get(CACHE_STRAGEGY));
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_crawlerURLMustMatch", this.get(CRAWLER_URL_MUSTMATCH));
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_crawlerURLMustNotMatch", this.get(CRAWLER_URL_MUSTNOTMATCH));
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_crawlerIPMustMatch", this.get(CRAWLER_IP_MUSTMATCH));
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_crawlerIPMustNotMatch", this.get(CRAWLER_IP_MUSTNOTMATCH));
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_crawlerCountryMustMatch", this.get(CRAWLER_COUNTRY_MUSTMATCH));
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_crawlerNoLimitURLMustMatch", this.get(CRAWLER_URL_NODEPTHLIMITMATCH));
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_indexURLMustMatch", this.get(INDEXING_URL_MUSTMATCH));
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_indexURLMustNotMatch", this.get(INDEXING_URL_MUSTNOTMATCH));
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_indexContentMustMatch", this.get(INDEXING_CONTENT_MUSTMATCH));
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_indexContentMustNotMatch", this.get(INDEXING_CONTENT_MUSTNOTMATCH));
//prop.putXML(CRAWL_PROFILE_PREFIX + count + "_mustmatch", this.urlMustMatchPattern().toString()); // TODO: remove, replace with crawlerURLMustMatch
//prop.putXML(CRAWL_PROFILE_PREFIX + count + "_mustnotmatch", this.urlMustNotMatchPattern().toString()); // TODO: remove, replace with crawlerURLMustNotMatch
//prop.put(CRAWL_PROFILE_PREFIX + count + "_crawlingIfOlder", (this.recrawlIfOlder() == 0L) ? "no re-crawl" : DateFormat.getDateTimeInstance().format(this.recrawlIfOlder())); // TODO: remove, replace with recrawlIfOlder
prop.put(CRAWL_PROFILE_PREFIX + count + "_crawlingDomFilterDepth", "inactive");
prop.put(CRAWL_PROFILE_PREFIX + count + "_status", terminateButton ? 1 : deleteButton ? 0 : 2);
prop.put(CRAWL_PROFILE_PREFIX + count + "_terminateButton", terminateButton);
prop.put(CRAWL_PROFILE_PREFIX + count + "_terminateButton_handle", this.handle());
prop.put(CRAWL_PROFILE_PREFIX + count + "_deleteButton", deleteButton);
prop.put(CRAWL_PROFILE_PREFIX + count + "_deleteButton_handle", this.handle());
prop.put(CRAWL_PROFILE_PREFIX + count + "_handle", this.handle());
prop.put(CRAWL_PROFILE_PREFIX + count + "_depth", this.depth());
prop.put(CRAWL_PROFILE_PREFIX + count + "_mustmatch", this.urlMustMatchPattern().toString());
prop.put(CRAWL_PROFILE_PREFIX + count + "_mustnotmatch", this.urlMustNotMatchPattern().toString());
prop.put(CRAWL_PROFILE_PREFIX + count + "_crawlingIfOlder", (this.recrawlIfOlder() == 0L) ? "no re-crawl" : DateFormat.getDateTimeInstance().format(this.recrawlIfOlder()));
prop.put(CRAWL_PROFILE_PREFIX + count + "_crawlingDomFilterDepth", "inactive");
int i = 0;
if (active && this.domMaxPages() > 0
&& this.domMaxPages() != Integer.MAX_VALUE) {
String item;
while (i <= domlistlength && !(item = this.domName(true, i)).isEmpty()){
if (i == domlistlength) {
item += " ...";
if (active && this.domMaxPages() > 0 && this.domMaxPages() != Integer.MAX_VALUE) {
String item;
while (i <= domlistlength && !(item = this.domName(true, i)).isEmpty()) {
if (i == domlistlength) item += " ...";
prop.putHTML(CRAWL_PROFILE_PREFIX + count + "_crawlingDomFilterContent_" + i + "_item", item);
i++;
}
prop.putHTML(CRAWL_PROFILE_PREFIX + count + "_crawlingDomFilterContent_" + i + "_item", item);
i++;
}
}
prop.put(CRAWL_PROFILE_PREFIX+count+"_crawlingDomFilterContent", i);
prop.put(CRAWL_PROFILE_PREFIX + count + "_crawlingDomMaxPages", (this.domMaxPages() == Integer.MAX_VALUE) ? "unlimited" : Integer.toString(this.domMaxPages()));
prop.put(CRAWL_PROFILE_PREFIX + count + "_withQuery", (this.crawlingQ()) ? "1" : "0");
prop.put(CRAWL_PROFILE_PREFIX + count + "_storeCache", (this.storeHTCache()) ? "1" : "0");
prop.put(CRAWL_PROFILE_PREFIX + count + "_indexText", (this.indexText()) ? "1" : "0");
prop.put(CRAWL_PROFILE_PREFIX + count + "_indexMedia", (this.indexMedia()) ? "1" : "0");
prop.put(CRAWL_PROFILE_PREFIX + count + "_remoteIndexing", (this.remoteIndexing()) ? "1" : "0");
}
}

@ -307,11 +307,11 @@ public class serverObjects implements Serializable, Cloneable {
* @see CharacterCoding#encodeUnicode2html(String, boolean)
*/
public void putHTML(final String key, final String value) {
    // Store the HTML-encoded value exactly once. A null value is stored as the
    // empty string and is never handed to the encoder.
    put(key, value == null ? "" : CharacterCoding.unicode2html(value, true));
}
/**
 * Stores a byte-array value under {@code key} by delegating to
 * {@link #putHTML(String, String)}.
 *
 * @param key   the property key to store the value under
 * @param value the bytes to convert with {@code UTF8.String}; a {@code null}
 *              array is stored as the empty string
 */
public void putHTML(final String key, final byte[] value) {
    // Delegate exactly once, and only convert the bytes when there are any.
    putHTML(key, value == null ? "" : UTF8.String(value));
}
/**
@ -321,7 +321,7 @@ public class serverObjects implements Serializable, Cloneable {
* replaced in the returned String.
*/
public void putXML(final String key, final String value) {
    // Store the XML-encoded value exactly once. A null value is stored as the
    // empty string and is never handed to the encoder.
    put(key, value == null ? "" : CharacterCoding.unicode2xml(value, true));
}
/**
@ -332,9 +332,9 @@ public class serverObjects implements Serializable, Cloneable {
* @return
*/
public void put(final RequestHeader.FileType fileType, final String key, final String value) {
    // Normalise null to the empty string once, so every setter below
    // receives a non-null value.
    final String safeValue = value == null ? "" : value;

    // Dispatch to exactly one setter: JSON and XML are handled explicitly,
    // every other file type falls through to HTML.
    if (fileType == FileType.JSON) {
        putJSON(key, safeValue);
    } else if (fileType == FileType.XML) {
        putXML(key, safeValue);
    } else {
        putHTML(key, safeValue);
    }
}
/**

Loading…
Cancel
Save