Fix for values in the CrawlProfileEditor table and XML; the full profile is now available in the XML.
pull/1/head
Michael Peter Christen 11 years ago
parent ec95dfa2e6
commit d8beafba3a

@ -32,11 +32,12 @@
</colgroup> </colgroup>
<tr class="TableHeader"> <tr class="TableHeader">
<td><strong>Crawl Thread</strong></td> <td><strong>Crawl Thread</strong></td>
<td><strong>Collections</strong></td>
<td><strong>Status</strong></td> <td><strong>Status</strong></td>
<td><strong>Depth</strong></td> <td><strong>Depth</strong></td>
<td><strong>Must Match</strong></td> <td><strong>Must Match</strong></td>
<td><strong>Must Not Match</strong></td> <td><strong>Must Not Match</strong></td>
<td><strong>MaxAge</strong></td> <td><strong>Recrawl if older than</strong></td>
<td><strong>Domain Counter Content</strong></td> <td><strong>Domain Counter Content</strong></td>
<td><strong>Max Page Per Domain</strong></td> <td><strong>Max Page Per Domain</strong></td>
<td><strong>Accept '?' URLs</strong></td> <td><strong>Accept '?' URLs</strong></td>
@ -48,6 +49,7 @@
#{crawlProfiles}# #{crawlProfiles}#
<tr class="TableCell#(dark)#Light::Dark#(/dark)#"> <tr class="TableCell#(dark)#Light::Dark#(/dark)#">
<td>#[name]#</td> <td>#[name]#</td>
<td>#[collections]#</td>
<td>#(terminateButton)#:: <td>#(terminateButton)#::
<div style="text-decoration:blink">Running</div> <div style="text-decoration:blink">Running</div>
<form action="CrawlProfileEditor_p.html" method="get" enctype="multipart/form-data" accept-charset="UTF-8"><div> <form action="CrawlProfileEditor_p.html" method="get" enctype="multipart/form-data" accept-charset="UTF-8"><div>
@ -64,13 +66,13 @@
#(/deleteButton)# #(/deleteButton)#
</td> </td>
<td>#[depth]#</td> <td>#[depth]#</td>
<td>#[mustmatch]#</td> <td>#[crawlerURLMustMatch]#</td>
<td>#[mustnotmatch]#</td> <td>#[crawlerURLMustNotMatch]#</td>
<td>#[crawlingIfOlder]#</td> <td>#[recrawlIfOlder]#</td>
<td>#{crawlingDomFilterContent}##[item]#<br />#{/crawlingDomFilterContent}#</td> <td>#{crawlingDomFilterContent}##[item]#<br />#{/crawlingDomFilterContent}#</td>
<td>#[crawlingDomMaxPages]#</td> <td>#[domMaxPages]#</td>
<td>#(withQuery)#no::yes#(/withQuery)#</td> <td>#(withQuery)#no::yes#(/withQuery)#</td>
<td>#(storeCache)#no::yes#(/storeCache)#</td> <td>#(storeHTCache)#no::yes#(/storeHTCache)#</td>
<td>#(indexText)#no::yes#(/indexText)#</td> <td>#(indexText)#no::yes#(/indexText)#</td>
<td>#(indexMedia)#no::yes#(/indexMedia)#</td> <td>#(indexMedia)#no::yes#(/indexMedia)#</td>
<td>#(remoteIndexing)#no::yes#(/remoteIndexing)#</td> <td>#(remoteIndexing)#no::yes#(/remoteIndexing)#</td>

@ -2,23 +2,40 @@
<crawlProfiles> <crawlProfiles>
#{crawlProfiles}# #{crawlProfiles}#
<crawlProfile> <crawlProfile>
<handle>#[handle]#</handle>
<name>#[name]#</name> <name>#[name]#</name>
<status>#(status)#terminated::active::system#(/status)#</status> <collections>#[collections]#</collections>
<agentName>#[agentName]#</agentName>
<userAgent>#[userAgent]#</userAgent>
<depth>#[depth]#</depth> <depth>#[depth]#</depth>
<mustmatch>#[mustmatch]#</mustmatch> <directDocByURL>#(directDocByURL)#false::true#(/directDocByURL)#</directDocByURL>
<mustnotmatch>#[mustnotmatch]#</mustnotmatch> <recrawlIfOlder>#[recrawlIfOlder]#</recrawlIfOlder>
<crawlingIfOlder>#[crawlingIfOlder]#</crawlingIfOlder> <domMaxPages>#[domMaxPages]#</domMaxPages>
<crawlingQ>#(crawlingQ)#false::true#(/crawlingQ)#</crawlingQ>
<followFrames>#(followFrames)#false::true#(/followFrames)#</followFrames>
<obeyHtmlRobotsNoindex>#(obeyHtmlRobotsNoindex)#false::true#(/obeyHtmlRobotsNoindex)#</obeyHtmlRobotsNoindex>
<obeyHtmlRobotsNofollow>#(obeyHtmlRobotsNofollow)#false::true#(/obeyHtmlRobotsNofollow)#</obeyHtmlRobotsNofollow>
<indexText>#(indexText)#false::true#(/indexText)#</indexText>
<indexMedia>#(indexMedia)#false::true#(/indexMedia)#</indexMedia>
<storeHTCache>#(storeHTCache)#false::true#(/storeHTCache)#</storeHTCache>
<remoteIndexing>#(remoteIndexing)#false::true#(/remoteIndexing)#</remoteIndexing>
<cacheStrategy>#[cacheStrategy]#</cacheStrategy>
<crawlerURLMustMatch>#[crawlerURLMustMatch]#</crawlerURLMustMatch>
<crawlerURLMustNotMatch>#[crawlerURLMustNotMatch]#</crawlerURLMustNotMatch>
<crawlerIPMustMatch>#[crawlerIPMustMatch]#</crawlerIPMustMatch>
<crawlerIPMustNotMatch>#[crawlerIPMustNotMatch]#</crawlerIPMustNotMatch>
<crawlerCountryMustMatch>#[crawlerCountryMustMatch]#</crawlerCountryMustMatch>
<crawlerNoLimitURLMustMatch>#[crawlerNoLimitURLMustMatch]#</crawlerNoLimitURLMustMatch>
<indexURLMustMatch>#[indexURLMustMatch]#</indexURLMustMatch>
<indexURLMustNotMatch>#[indexURLMustNotMatch]#</indexURLMustNotMatch>
<indexContentMustMatch>#[indexContentMustMatch]#</indexContentMustMatch>
<indexContentMustNotMatch>#[indexContentMustNotMatch]#</indexContentMustNotMatch>
<status>#(status)#terminated::active::system#(/status)#</status>
<crawlingDomFilterContent> <crawlingDomFilterContent>
#{crawlingDomFilterContent}# #{crawlingDomFilterContent}#
<item>#[item]#</item> <item>#[item]#</item>
#{/crawlingDomFilterContent}# #{/crawlingDomFilterContent}#
</crawlingDomFilterContent> </crawlingDomFilterContent>
<crawlingDomMaxPages>#[crawlingDomMaxPages]#</crawlingDomMaxPages>
<withQuery>#(withQuery)#no::yes#(/withQuery)#</withQuery>
<storeCache>#(storeCache)#no::yes#(/storeCache)#</storeCache>
<indexText>#(indexText)#no::yes#(/indexText)#</indexText>
<indexMedia>#(indexMedia)#no::yes#(/indexMedia)#</indexMedia>
<remoteIndexing>#(remoteIndexing)#no::yes#(/remoteIndexing)#</remoteIndexing>
</crawlProfile> </crawlProfile>
#{/crawlProfiles}# #{/crawlProfiles}#
</crawlProfiles> </crawlProfiles>

@ -25,7 +25,6 @@
package net.yacy.crawler.data; package net.yacy.crawler.data;
import java.text.DateFormat;
import java.util.Collection; import java.util.Collection;
import java.util.Date; import java.util.Date;
import java.util.HashMap; import java.util.HashMap;
@ -625,39 +624,58 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
boolean terminateButton = active && !CrawlSwitchboard.DEFAULT_PROFILES.contains(this.name()); boolean terminateButton = active && !CrawlSwitchboard.DEFAULT_PROFILES.contains(this.name());
boolean deleteButton = !active; boolean deleteButton = !active;
prop.put(CRAWL_PROFILE_PREFIX + count + "_dark", dark ? "1" : "0"); prop.put(CRAWL_PROFILE_PREFIX + count + "_dark", dark ? "1" : "0");
prop.put(CRAWL_PROFILE_PREFIX + count + "_name", this.collectionName()); prop.putXML(CRAWL_PROFILE_PREFIX + count + "_handle", this.handle());
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_name", this.name());
//prop.putXML(CRAWL_PROFILE_PREFIX + count + "_collection", this.get(COLLECTIONS)); // TODO: remove, replace with 'collections'
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_collections", this.get(COLLECTIONS));
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_agentName", this.get(AGENT_NAME));
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_userAgent", this.getAgent().userAgent);
prop.put(CRAWL_PROFILE_PREFIX + count + "_depth", this.depth());
prop.put(CRAWL_PROFILE_PREFIX + count + "_directDocByURL", this.directDocByURL() ? 1 : 0);
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_recrawlIfOlder", this.recrawlIfOlder() == Long.MAX_VALUE ? "eternity" : (new Date(this.recrawlIfOlder()).toString()));
prop.put(CRAWL_PROFILE_PREFIX + count + "_domMaxPages", this.domMaxPages());
//prop.put(CRAWL_PROFILE_PREFIX + count + "_crawlingDomMaxPages", (this.domMaxPages() == Integer.MAX_VALUE) ? "unlimited" : Integer.toString(this.domMaxPages())); // TODO: remove, replace with 'domMaxPages'
prop.put(CRAWL_PROFILE_PREFIX + count + "_crawlingQ", this.crawlingQ() ? 1 : 0);
//prop.put(CRAWL_PROFILE_PREFIX + count + "_withQuery", (this.crawlingQ()) ? "1" : "0"); // TODO: remove, replace with crawlingQ
prop.put(CRAWL_PROFILE_PREFIX + count + "_followFrames", this.followFrames() ? 1 : 0);
prop.put(CRAWL_PROFILE_PREFIX + count + "_obeyHtmlRobotsNoindex", this.obeyHtmlRobotsNoindex() ? 1 : 0);
prop.put(CRAWL_PROFILE_PREFIX + count + "_obeyHtmlRobotsNofollow", this.obeyHtmlRobotsNofollow() ? 1 : 0);
prop.put(CRAWL_PROFILE_PREFIX + count + "_indexText", this.indexText() ? 1 : 0);
prop.put(CRAWL_PROFILE_PREFIX + count + "_indexMedia", this.indexMedia() ? 1 : 0);
//prop.put(CRAWL_PROFILE_PREFIX + count + "_storeCache", this.storeHTCache() ? 1 : 0); // TODO: remove, replace with 'storeHTCache'
prop.put(CRAWL_PROFILE_PREFIX + count + "_storeHTCache", this.storeHTCache() ? 1 : 0);
prop.put(CRAWL_PROFILE_PREFIX + count + "_remoteIndexing", this.remoteIndexing() ? 1 : 0);
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_cacheStrategy", this.get(CACHE_STRAGEGY));
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_crawlerURLMustMatch", this.get(CRAWLER_URL_MUSTMATCH));
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_crawlerURLMustNotMatch", this.get(CRAWLER_URL_MUSTNOTMATCH));
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_crawlerIPMustMatch", this.get(CRAWLER_IP_MUSTMATCH));
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_crawlerIPMustNotMatch", this.get(CRAWLER_IP_MUSTNOTMATCH));
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_crawlerCountryMustMatch", this.get(CRAWLER_COUNTRY_MUSTMATCH));
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_crawlerNoLimitURLMustMatch", this.get(CRAWLER_URL_NODEPTHLIMITMATCH));
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_indexURLMustMatch", this.get(INDEXING_URL_MUSTMATCH));
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_indexURLMustNotMatch", this.get(INDEXING_URL_MUSTNOTMATCH));
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_indexContentMustMatch", this.get(INDEXING_CONTENT_MUSTMATCH));
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_indexContentMustNotMatch", this.get(INDEXING_CONTENT_MUSTNOTMATCH));
//prop.putXML(CRAWL_PROFILE_PREFIX + count + "_mustmatch", this.urlMustMatchPattern().toString()); // TODO: remove, replace with crawlerURLMustMatch
//prop.putXML(CRAWL_PROFILE_PREFIX + count + "_mustnotmatch", this.urlMustNotMatchPattern().toString()); // TODO: remove, replace with crawlerURLMustNotMatch
//prop.put(CRAWL_PROFILE_PREFIX + count + "_crawlingIfOlder", (this.recrawlIfOlder() == 0L) ? "no re-crawl" : DateFormat.getDateTimeInstance().format(this.recrawlIfOlder())); // TODO: remove, replace with recrawlIfOlder
prop.put(CRAWL_PROFILE_PREFIX + count + "_crawlingDomFilterDepth", "inactive");
prop.put(CRAWL_PROFILE_PREFIX + count + "_status", terminateButton ? 1 : deleteButton ? 0 : 2); prop.put(CRAWL_PROFILE_PREFIX + count + "_status", terminateButton ? 1 : deleteButton ? 0 : 2);
prop.put(CRAWL_PROFILE_PREFIX + count + "_terminateButton", terminateButton); prop.put(CRAWL_PROFILE_PREFIX + count + "_terminateButton", terminateButton);
prop.put(CRAWL_PROFILE_PREFIX + count + "_terminateButton_handle", this.handle()); prop.put(CRAWL_PROFILE_PREFIX + count + "_terminateButton_handle", this.handle());
prop.put(CRAWL_PROFILE_PREFIX + count + "_deleteButton", deleteButton); prop.put(CRAWL_PROFILE_PREFIX + count + "_deleteButton", deleteButton);
prop.put(CRAWL_PROFILE_PREFIX + count + "_deleteButton_handle", this.handle()); prop.put(CRAWL_PROFILE_PREFIX + count + "_deleteButton_handle", this.handle());
prop.put(CRAWL_PROFILE_PREFIX + count + "_handle", this.handle());
prop.put(CRAWL_PROFILE_PREFIX + count + "_depth", this.depth());
prop.put(CRAWL_PROFILE_PREFIX + count + "_mustmatch", this.urlMustMatchPattern().toString());
prop.put(CRAWL_PROFILE_PREFIX + count + "_mustnotmatch", this.urlMustNotMatchPattern().toString());
prop.put(CRAWL_PROFILE_PREFIX + count + "_crawlingIfOlder", (this.recrawlIfOlder() == 0L) ? "no re-crawl" : DateFormat.getDateTimeInstance().format(this.recrawlIfOlder()));
prop.put(CRAWL_PROFILE_PREFIX + count + "_crawlingDomFilterDepth", "inactive");
int i = 0; int i = 0;
if (active && this.domMaxPages() > 0 if (active && this.domMaxPages() > 0 && this.domMaxPages() != Integer.MAX_VALUE) {
&& this.domMaxPages() != Integer.MAX_VALUE) { String item;
String item; while (i <= domlistlength && !(item = this.domName(true, i)).isEmpty()) {
while (i <= domlistlength && !(item = this.domName(true, i)).isEmpty()){ if (i == domlistlength) item += " ...";
if (i == domlistlength) { prop.putHTML(CRAWL_PROFILE_PREFIX + count + "_crawlingDomFilterContent_" + i + "_item", item);
item += " ..."; i++;
} }
prop.putHTML(CRAWL_PROFILE_PREFIX + count + "_crawlingDomFilterContent_" + i + "_item", item);
i++;
} }
}
prop.put(CRAWL_PROFILE_PREFIX+count+"_crawlingDomFilterContent", i); prop.put(CRAWL_PROFILE_PREFIX+count+"_crawlingDomFilterContent", i);
prop.put(CRAWL_PROFILE_PREFIX + count + "_crawlingDomMaxPages", (this.domMaxPages() == Integer.MAX_VALUE) ? "unlimited" : Integer.toString(this.domMaxPages()));
prop.put(CRAWL_PROFILE_PREFIX + count + "_withQuery", (this.crawlingQ()) ? "1" : "0");
prop.put(CRAWL_PROFILE_PREFIX + count + "_storeCache", (this.storeHTCache()) ? "1" : "0");
prop.put(CRAWL_PROFILE_PREFIX + count + "_indexText", (this.indexText()) ? "1" : "0");
prop.put(CRAWL_PROFILE_PREFIX + count + "_indexMedia", (this.indexMedia()) ? "1" : "0");
prop.put(CRAWL_PROFILE_PREFIX + count + "_remoteIndexing", (this.remoteIndexing()) ? "1" : "0");
} }
} }

@ -307,11 +307,11 @@ public class serverObjects implements Serializable, Cloneable {
* @see CharacterCoding#encodeUnicode2html(String, boolean) * @see CharacterCoding#encodeUnicode2html(String, boolean)
*/ */
public void putHTML(final String key, final String value) { public void putHTML(final String key, final String value) {
put(key, CharacterCoding.unicode2html(value, true)); put(key, value == null ? "" : CharacterCoding.unicode2html(value, true));
} }
public void putHTML(final String key, final byte[] value) { public void putHTML(final String key, final byte[] value) {
putHTML(key, UTF8.String(value)); putHTML(key, value == null ? "" : UTF8.String(value));
} }
/** /**
@ -321,7 +321,7 @@ public class serverObjects implements Serializable, Cloneable {
* replaced in the returned String. * replaced in the returned String.
*/ */
public void putXML(final String key, final String value) { public void putXML(final String key, final String value) {
put(key, CharacterCoding.unicode2xml(value, true)); put(key, value == null ? "" : CharacterCoding.unicode2xml(value, true));
} }
/** /**
@ -332,9 +332,9 @@ public class serverObjects implements Serializable, Cloneable {
* @return * @return
*/ */
public void put(final RequestHeader.FileType fileType, final String key, final String value) { public void put(final RequestHeader.FileType fileType, final String key, final String value) {
if (fileType == FileType.JSON) putJSON(key, value); if (fileType == FileType.JSON) putJSON(key, value == null ? "" : value);
else if (fileType == FileType.XML) putXML(key, value); else if (fileType == FileType.XML) putXML(key, value == null ? "" : value);
else putHTML(key, value); else putHTML(key, value == null ? "" : value);
} }
/** /**

Loading…
Cancel
Save