Removed stopword options (xsstopw, xdstopw, xpstopw) which are no longer used

pull/1/head
Michael Peter Christen 13 years ago
parent ce3fed8882
commit ac9540dfb6

@ -310,14 +310,6 @@
so they can omit starting a crawl with the same start point.
</td>
</tr>
<tr valign="top" class="TableCellDark">
<td><label for="xsstopw">Exclude <em>static</em> Stop-Words</label>:</td>
<td><input type="checkbox" name="xsstopw" id="xsstopw" #(xsstopwChecked)#::checked="checked"#(/xsstopwChecked)# /></td>
<td>
This can be useful to circumvent that extremely common words are added to the database, i.e. "the", "he", "she", "it"... To exclude all words given in the file <tt>yacy.stopwords</tt> from indexing,
check this box.
</td>
</tr>
<tr valign="top" class="TableCellLight">
<td><label for="collection">Add Crawl result to collection(s)</label>:</td>
<td>
@ -327,24 +319,6 @@
A crawl result can be tagged with names which are candidates for a collection request. These tags can be selected with the <a href="/gsa/search?q=www&site=#[collection]#">GSA interface</a> using the 'site' operator. To use this option, the 'collection_sxt'-field must be switched on in the <a href="/IndexFederated_p.html">Solr Schema</a>
</td>
</tr>
<!--
<tr valign="top" class="TableCellDark">
<td>Exclude <em>dynamic</em> Stop-Words</td>
<td><input type="checkbox" name="xdstopw" #(xdstopwChecked)#::checked="checked"#(/xdstopwChecked)# /></td>
<td colspan="3">
Excludes all words from indexing which are listed by statistic rules.
<em>THIS IS NOT YET FUNCTIONAL</em>
</td>
</tr>
<tr valign="top" class="TableCellDark">
<td>Exclude <em>parent-indexed</em> words</td>
<td><input type="checkbox" name="xpstopw" #(xpstopwChecked)#::checked="checked"#(/xpstopwChecked)# /></td>
<td colspan="3">
Excludes all words from indexing which had been indexed in the parent web page.
<em>THIS IS NOT YET FUNCTIONAL</em>
</td>
</tr>
-->
<tr valign="top" class="TableCellSummary">
<td colspan="5"><input type="submit" name="crawlingstart" value="Start New Crawl" class="submitready"/></td>
</tr>

@ -101,9 +101,6 @@
<input type="hidden" name="indexText" id="indexText" value="on" />
<input type="hidden" name="indexMedia" id="indexMedia" value="on" />
<input type="hidden" name="intention" id="intention" value="" />
<input type="hidden" name="xsstopw" id="xsstopw" value="on" />
<input type="hidden" name="xdstopw" id="xdstopw" value="off" />
<input type="hidden" name="xpstopw" id="xpstopw" value="off" />
<input type="hidden" name="collection" id="collection" value="" />
</dd>
<dt><label>Start</label></dt>

@ -265,15 +265,6 @@ public class Crawler_p {
CacheStrategy cachePolicy = CacheStrategy.parse(post.get("cachePolicy", "iffresh"));
if (cachePolicy == null) cachePolicy = CacheStrategy.IFFRESH;
final boolean xsstopw = "on".equals(post.get("xsstopw", "off"));
env.setConfig("xsstopw", xsstopw);
final boolean xdstopw = "on".equals(post.get("xdstopw", "off"));
env.setConfig("xdstopw", xdstopw);
final boolean xpstopw = "on".equals(post.get("xpstopw", "off"));
env.setConfig("xpstopw", xpstopw);
String crawlingMode = post.get("crawlingMode","url");
if ("file".equals(crawlingMode) && post.containsKey("crawlingFile")) {
@ -365,9 +356,6 @@ public class Crawler_p {
indexMedia,
storeHTCache,
crawlOrder,
xsstopw,
xdstopw,
xpstopw,
cachePolicy,
collection);
byte[] handle = ASCII.getBytes(profile.handle());

@ -102,9 +102,6 @@ public class QuickCrawlLink_p {
final boolean indexMedia = post.get("indexMedia", "off").equals("on");
final boolean storeHTCache = post.get("storeHTCache", "").equals("on");
final boolean remoteIndexing = post.get("crawlOrder", "").equals("on");
final boolean xsstopw = post.get("xsstopw", "").equals("on");
final boolean xdstopw = post.get("xdstopw", "").equals("on");
final boolean xpstopw = post.get("xpstopw", "").equals("on");
final String collection = post.get("collection", "user");
prop.put("mode_url", (crawlingStart == null) ? "unknown" : crawlingStart);
@ -151,9 +148,6 @@ public class QuickCrawlLink_p {
indexMedia,
storeHTCache,
remoteIndexing,
xsstopw,
xdstopw,
xpstopw,
CacheStrategy.IFFRESH,
collection);
sb.crawler.putActive(pe.handle().getBytes(), pe);

@ -245,9 +245,6 @@ public final class CrawlSwitchboard {
true /*getConfigBool(PROXY_INDEXING_LOCAL_MEDIA, true)*/,
true,
false /*getConfigBool(PROXY_INDEXING_REMOTE, false)*/,
true,
true,
true,
CacheStrategy.IFFRESH,
"robot_" + CRAWL_PROFILE_PROXY);
this.profilesActiveCrawls.put(
@ -274,9 +271,6 @@ public final class CrawlSwitchboard {
true,
false,
false,
true,
true,
false,
CacheStrategy.IFFRESH,
"robot_" + CRAWL_PROFILE_REMOTE);
this.profilesActiveCrawls.put(
@ -303,9 +297,6 @@ public final class CrawlSwitchboard {
false,
true,
false,
true,
true,
false,
CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_SNIPPET_LOCAL_TEXT);
this.profilesActiveCrawls.put(
@ -332,9 +323,6 @@ public final class CrawlSwitchboard {
true,
true,
false,
true,
true,
false,
CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT);
this.profilesActiveCrawls.put(
@ -362,9 +350,6 @@ public final class CrawlSwitchboard {
false,
true,
false,
true,
true,
false,
CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA);
this.profilesActiveCrawls.put(
@ -391,9 +376,6 @@ public final class CrawlSwitchboard {
true,
true,
false,
true,
true,
false,
CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA);
this.profilesActiveCrawls.put(
@ -420,9 +402,6 @@ public final class CrawlSwitchboard {
false,
false,
false,
true,
true,
false,
CacheStrategy.NOCACHE,
"robot_" + CRAWL_PROFILE_SURROGATE);
this.profilesActiveCrawls.put(

@ -126,9 +126,6 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
final boolean indexMedia,
final boolean storeHTCache,
final boolean remoteIndexing,
final boolean xsstopw,
final boolean xdstopw,
final boolean xpstopw,
final CacheStrategy cacheStrategy,
final String collections) {
super(40);

@ -56,12 +56,6 @@ public class YMarkCrawlStart extends HashMap<String,String>{
SINGLE, ONE_LINK, FULL_DOMAIN
}
public YMarkCrawlStart(final WorkTables worktables) {
super();
this.date_recording = new Date(0);
this.worktables = worktables;
}
public YMarkCrawlStart(final WorkTables worktables, final String url) {
super();
this.worktables = worktables;
@ -187,7 +181,7 @@ public class YMarkCrawlStart extends HashMap<String,String>{
CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_PROXY_RECRAWL_CYCLE),
-1,
crawlingQ,
true, true, true, false, true, true, true,
true, true, true, false,
CacheStrategy.IFFRESH,
"robot_" + CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA); // TODO: make this a default profile in CrawlSwitchboard
sb.crawler.putActive(pe.handle().getBytes(), pe);

Loading…
Cancel
Save