Removed the stop-word options, which were not used

pull/1/head
Michael Peter Christen 13 years ago
parent ce3fed8882
commit ac9540dfb6

@ -310,14 +310,6 @@
so they can omit starting a crawl with the same start point. so they can omit starting a crawl with the same start point.
</td> </td>
</tr> </tr>
<tr valign="top" class="TableCellDark">
<td><label for="xsstopw">Exclude <em>static</em> Stop-Words</label>:</td>
<td><input type="checkbox" name="xsstopw" id="xsstopw" #(xsstopwChecked)#::checked="checked"#(/xsstopwChecked)# /></td>
<td>
This can be useful to circumvent that extremely common words are added to the database, i.e. "the", "he", "she", "it"... To exclude all words given in the file <tt>yacy.stopwords</tt> from indexing,
check this box.
</td>
</tr>
<tr valign="top" class="TableCellLight"> <tr valign="top" class="TableCellLight">
<td><label for="collection">Add Crawl result to collection(s)</label>:</td> <td><label for="collection">Add Crawl result to collection(s)</label>:</td>
<td> <td>
@ -327,24 +319,6 @@
A crawl result can be tagged with names which are candidates for a collection request. These tags can be selected with the <a href="/gsa/search?q=www&site=#[collection]#">GSA interface</a> using the 'site' operator. To use this option, the 'collection_sxt'-field must be switched on in the <a href="/IndexFederated_p.html">Solr Schema</a> A crawl result can be tagged with names which are candidates for a collection request. These tags can be selected with the <a href="/gsa/search?q=www&site=#[collection]#">GSA interface</a> using the 'site' operator. To use this option, the 'collection_sxt'-field must be switched on in the <a href="/IndexFederated_p.html">Solr Schema</a>
</td> </td>
</tr> </tr>
<!--
<tr valign="top" class="TableCellDark">
<td>Exclude <em>dynamic</em> Stop-Words</td>
<td><input type="checkbox" name="xdstopw" #(xdstopwChecked)#::checked="checked"#(/xdstopwChecked)# /></td>
<td colspan="3">
Excludes all words from indexing which are listed by statistic rules.
<em>THIS IS NOT YET FUNCTIONAL</em>
</td>
</tr>
<tr valign="top" class="TableCellDark">
<td>Exclude <em>parent-indexed</em> words</td>
<td><input type="checkbox" name="xpstopw" #(xpstopwChecked)#::checked="checked"#(/xpstopwChecked)# /></td>
<td colspan="3">
Excludes all words from indexing which had been indexed in the parent web page.
<em>THIS IS NOT YET FUNCTIONAL</em>
</td>
</tr>
-->
<tr valign="top" class="TableCellSummary"> <tr valign="top" class="TableCellSummary">
<td colspan="5"><input type="submit" name="crawlingstart" value="Start New Crawl" class="submitready"/></td> <td colspan="5"><input type="submit" name="crawlingstart" value="Start New Crawl" class="submitready"/></td>
</tr> </tr>

@ -101,9 +101,6 @@
<input type="hidden" name="indexText" id="indexText" value="on" /> <input type="hidden" name="indexText" id="indexText" value="on" />
<input type="hidden" name="indexMedia" id="indexMedia" value="on" /> <input type="hidden" name="indexMedia" id="indexMedia" value="on" />
<input type="hidden" name="intention" id="intention" value="" /> <input type="hidden" name="intention" id="intention" value="" />
<input type="hidden" name="xsstopw" id="xsstopw" value="on" />
<input type="hidden" name="xdstopw" id="xdstopw" value="off" />
<input type="hidden" name="xpstopw" id="xpstopw" value="off" />
<input type="hidden" name="collection" id="collection" value="" /> <input type="hidden" name="collection" id="collection" value="" />
</dd> </dd>
<dt><label>Start</label></dt> <dt><label>Start</label></dt>

@ -265,15 +265,6 @@ public class Crawler_p {
CacheStrategy cachePolicy = CacheStrategy.parse(post.get("cachePolicy", "iffresh")); CacheStrategy cachePolicy = CacheStrategy.parse(post.get("cachePolicy", "iffresh"));
if (cachePolicy == null) cachePolicy = CacheStrategy.IFFRESH; if (cachePolicy == null) cachePolicy = CacheStrategy.IFFRESH;
final boolean xsstopw = "on".equals(post.get("xsstopw", "off"));
env.setConfig("xsstopw", xsstopw);
final boolean xdstopw = "on".equals(post.get("xdstopw", "off"));
env.setConfig("xdstopw", xdstopw);
final boolean xpstopw = "on".equals(post.get("xpstopw", "off"));
env.setConfig("xpstopw", xpstopw);
String crawlingMode = post.get("crawlingMode","url"); String crawlingMode = post.get("crawlingMode","url");
if ("file".equals(crawlingMode) && post.containsKey("crawlingFile")) { if ("file".equals(crawlingMode) && post.containsKey("crawlingFile")) {
@ -365,9 +356,6 @@ public class Crawler_p {
indexMedia, indexMedia,
storeHTCache, storeHTCache,
crawlOrder, crawlOrder,
xsstopw,
xdstopw,
xpstopw,
cachePolicy, cachePolicy,
collection); collection);
byte[] handle = ASCII.getBytes(profile.handle()); byte[] handle = ASCII.getBytes(profile.handle());

@ -102,9 +102,6 @@ public class QuickCrawlLink_p {
final boolean indexMedia = post.get("indexMedia", "off").equals("on"); final boolean indexMedia = post.get("indexMedia", "off").equals("on");
final boolean storeHTCache = post.get("storeHTCache", "").equals("on"); final boolean storeHTCache = post.get("storeHTCache", "").equals("on");
final boolean remoteIndexing = post.get("crawlOrder", "").equals("on"); final boolean remoteIndexing = post.get("crawlOrder", "").equals("on");
final boolean xsstopw = post.get("xsstopw", "").equals("on");
final boolean xdstopw = post.get("xdstopw", "").equals("on");
final boolean xpstopw = post.get("xpstopw", "").equals("on");
final String collection = post.get("collection", "user"); final String collection = post.get("collection", "user");
prop.put("mode_url", (crawlingStart == null) ? "unknown" : crawlingStart); prop.put("mode_url", (crawlingStart == null) ? "unknown" : crawlingStart);
@ -151,9 +148,6 @@ public class QuickCrawlLink_p {
indexMedia, indexMedia,
storeHTCache, storeHTCache,
remoteIndexing, remoteIndexing,
xsstopw,
xdstopw,
xpstopw,
CacheStrategy.IFFRESH, CacheStrategy.IFFRESH,
collection); collection);
sb.crawler.putActive(pe.handle().getBytes(), pe); sb.crawler.putActive(pe.handle().getBytes(), pe);

@ -245,9 +245,6 @@ public final class CrawlSwitchboard {
true /*getConfigBool(PROXY_INDEXING_LOCAL_MEDIA, true)*/, true /*getConfigBool(PROXY_INDEXING_LOCAL_MEDIA, true)*/,
true, true,
false /*getConfigBool(PROXY_INDEXING_REMOTE, false)*/, false /*getConfigBool(PROXY_INDEXING_REMOTE, false)*/,
true,
true,
true,
CacheStrategy.IFFRESH, CacheStrategy.IFFRESH,
"robot_" + CRAWL_PROFILE_PROXY); "robot_" + CRAWL_PROFILE_PROXY);
this.profilesActiveCrawls.put( this.profilesActiveCrawls.put(
@ -274,9 +271,6 @@ public final class CrawlSwitchboard {
true, true,
false, false,
false, false,
true,
true,
false,
CacheStrategy.IFFRESH, CacheStrategy.IFFRESH,
"robot_" + CRAWL_PROFILE_REMOTE); "robot_" + CRAWL_PROFILE_REMOTE);
this.profilesActiveCrawls.put( this.profilesActiveCrawls.put(
@ -303,9 +297,6 @@ public final class CrawlSwitchboard {
false, false,
true, true,
false, false,
true,
true,
false,
CacheStrategy.IFEXIST, CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_SNIPPET_LOCAL_TEXT); "robot_" + CRAWL_PROFILE_SNIPPET_LOCAL_TEXT);
this.profilesActiveCrawls.put( this.profilesActiveCrawls.put(
@ -332,9 +323,6 @@ public final class CrawlSwitchboard {
true, true,
true, true,
false, false,
true,
true,
false,
CacheStrategy.IFEXIST, CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT); "robot_" + CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT);
this.profilesActiveCrawls.put( this.profilesActiveCrawls.put(
@ -362,9 +350,6 @@ public final class CrawlSwitchboard {
false, false,
true, true,
false, false,
true,
true,
false,
CacheStrategy.IFEXIST, CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA); "robot_" + CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA);
this.profilesActiveCrawls.put( this.profilesActiveCrawls.put(
@ -391,9 +376,6 @@ public final class CrawlSwitchboard {
true, true,
true, true,
false, false,
true,
true,
false,
CacheStrategy.IFEXIST, CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA); "robot_" + CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA);
this.profilesActiveCrawls.put( this.profilesActiveCrawls.put(
@ -420,9 +402,6 @@ public final class CrawlSwitchboard {
false, false,
false, false,
false, false,
true,
true,
false,
CacheStrategy.NOCACHE, CacheStrategy.NOCACHE,
"robot_" + CRAWL_PROFILE_SURROGATE); "robot_" + CRAWL_PROFILE_SURROGATE);
this.profilesActiveCrawls.put( this.profilesActiveCrawls.put(

@ -126,9 +126,6 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
final boolean indexMedia, final boolean indexMedia,
final boolean storeHTCache, final boolean storeHTCache,
final boolean remoteIndexing, final boolean remoteIndexing,
final boolean xsstopw,
final boolean xdstopw,
final boolean xpstopw,
final CacheStrategy cacheStrategy, final CacheStrategy cacheStrategy,
final String collections) { final String collections) {
super(40); super(40);

@ -56,12 +56,6 @@ public class YMarkCrawlStart extends HashMap<String,String>{
SINGLE, ONE_LINK, FULL_DOMAIN SINGLE, ONE_LINK, FULL_DOMAIN
} }
/**
 * Creates a crawl start that is bound to the given work tables only.
 * The recording date is initialised to {@code new Date(0)}.
 *
 * @param worktables the work tables instance stored on this object
 */
public YMarkCrawlStart(final WorkTables worktables) {
    // The no-arg superclass constructor is invoked implicitly here.
    this.worktables = worktables;
    this.date_recording = new Date(0);
}
public YMarkCrawlStart(final WorkTables worktables, final String url) { public YMarkCrawlStart(final WorkTables worktables, final String url) {
super(); super();
this.worktables = worktables; this.worktables = worktables;
@ -187,7 +181,7 @@ public class YMarkCrawlStart extends HashMap<String,String>{
CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_PROXY_RECRAWL_CYCLE), CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_PROXY_RECRAWL_CYCLE),
-1, -1,
crawlingQ, crawlingQ,
true, true, true, false, true, true, true, true, true, true, false,
CacheStrategy.IFFRESH, CacheStrategy.IFFRESH,
"robot_" + CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA); // TODO: make this a default profile in CrawlSwitchboard "robot_" + CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA); // TODO: make this a default profile in CrawlSwitchboard
sb.crawler.putActive(pe.handle().getBytes(), pe); sb.crawler.putActive(pe.handle().getBytes(), pe);

Loading…
Cancel
Save