- replaced String with StringBuffer in the condenser

- added a CamelCase parser to the condenser
- added an option to switch proxy indexing on or off

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3292 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 18 years ago
parent 14f2068daf
commit f25c0e98d1
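The CamelCase parser mentioned in the commit message does not appear in the hunks below. As a rough illustration only, a tokenizer of that kind could look like the following hypothetical sketch (class and method names are invented here, not taken from the condenser):

// Hypothetical sketch of CamelCase splitting; not the actual plasmaCondenser code.
public class CamelCaseSketch {
    public static java.util.List splitCamelCase(String token) {
        java.util.List parts = new java.util.ArrayList();
        StringBuffer current = new StringBuffer();
        for (int i = 0; i < token.length(); i++) {
            char c = token.charAt(i);
            // an upper-case letter following a lower-case one starts a new word part
            if (Character.isUpperCase(c) && (current.length() > 0)
                    && Character.isLowerCase(current.charAt(current.length() - 1))) {
                parts.add(current.toString());
                current = new StringBuffer();
            }
            current.append(c);
        }
        if (current.length() > 0) parts.add(current.toString());
        return parts;
    }
    public static void main(String[] args) {
        System.out.println(splitCamelCase("plasmaCondenser")); // prints [plasma, Condenser]
    }
}

Splitting such tokens would let the indexer find a document containing "plasmaCondenser" under the query words "plasma" and "condenser" as well.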

@@ -166,7 +166,7 @@ public class CacheAdmin_p {
if (sentences != null)
while (sentences.hasNext()) {
prop.put("info_type_lines_" + i + "_line",
- de.anomic.data.wikiCode.replaceXMLEntities(sentences.next().toString().replaceAll("\n", "").trim()));
+ de.anomic.data.wikiCode.replaceXMLEntities(((StringBuffer) sentences.next()).toString().replaceAll("\n", "").trim()));
i++;
}
prop.put("info_type_lines", i);

@@ -38,9 +38,23 @@
<td><input type="checkbox" id="prxy_storeHTCache" name="proxyStoreHTCache"#(proxyStoreHTCacheChecked)#:: checked="checked"#(/proxyStoreHTCacheChecked)# /></td>
<td>It is almost always recommended to switch this on. The only exception is if you have another caching proxy running as a secondary proxy and YaCy is configured to use that proxy in proxy-proxy mode.</td>
</tr>
+ <tr valign="top" class="TableCellLight">
+ <td><label for="prxy_index_text">Do Local Text-Indexing</label></td>
+ <td><input type="checkbox" id="prxy_index_text" name="proxyIndexingLocalText"#(proxyIndexingLocalText)#:: checked="checked"#(/proxyIndexingLocalText)# /></td>
+ <td>
+ If this is on, all pages (except private content) that pass the proxy are indexed.
+ </td>
+ </tr>
+ <tr valign="top" class="TableCellLight">
+ <td><label for="prxy_index_media">Do Local Media-Indexing</label></td>
+ <td><input type="checkbox" id="prxy_index_media" name="proxyIndexingLocalMedia"#(proxyIndexingLocalMedia)#:: checked="checked"#(/proxyIndexingLocalMedia)# /></td>
+ <td>
+ This works like Local Text-Indexing, but switches on the indexing of media content only.
+ </td>
+ </tr>
<tr valign="top" class="TableCellLight">
<td><label for="prxy_crawl_order">Do Remote Indexing</label></td>
<td><input type="checkbox" id="prxy_crawl_order" name="proxyCrawlOrder"#(proxyCrawlOrder)#:: checked="checked"#(/proxyCrawlOrder)# /></td>
<td><input type="checkbox" id="prxy_crawl_order" name="proxyIndexingRemote"#(proxyIndexingRemote)#:: checked="checked"#(/proxyIndexingRemote)# /></td>
<td>
If checked, the crawler will contact other peers and use them as remote indexers for your crawl.
If you need your crawling results locally, you should switch this off.
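Each of the three checkboxes above follows the same round trip through the servlet shown further below: the presence of the form key decides the boolean, which is persisted via setConfig and fed back into the template placeholder. A condensed sketch of that pattern, with YaCy's serverObjects/serverSwitch replaced by plain maps for illustration:

// Sketch of the checkbox round trip used for the new proxyIndexing* flags;
// post and config stand in for YaCy's serverObjects and serverSwitch here.
import java.util.HashMap;

public class CheckboxRoundTripSketch {
    public static void main(String[] args) {
        HashMap post = new HashMap();     // submitted form fields
        HashMap config = new HashMap();   // persisted settings (yacy.init)
        post.put("proxyIndexingLocalText", "on"); // the checkbox was ticked
        // presence of the key decides the flag, as in ProxyIndexingMonitor_p.java
        boolean localText = post.containsKey("proxyIndexingLocalText");
        config.put("proxyIndexingLocalText", localText ? "true" : "false");
        // template placeholder: 1 renders checked="checked", 0 renders nothing
        int checked = "true".equals(config.get("proxyIndexingLocalText")) ? 1 : 0;
        System.out.println("proxyIndexingLocalText checked=" + checked);
    }
}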
@@ -82,7 +96,9 @@
<!-- info 2 -->
<p><strong>Pre-fetch is now set to depth-#[message]#.</strong></p>
<p><strong>Caching is now #(caching)#off::on#(/caching)#.</strong></p>
- <p><strong>Remote Indexing is now #(crawlOrder)#off::on#(/crawlOrder)#.</strong></p>
+ <p><strong>Local Text Indexing is now #(indexingLocalText)#off::on#(/indexingLocalText)#.</strong></p>
+ <p><strong>Local Media Indexing is now #(indexingLocalMedia)#off::on#(/indexingLocalMedia)#.</strong></p>
+ <p><strong>Remote Indexing is now #(indexingRemote)#off::on#(/indexingRemote)#.</strong></p>
#(path)#::<p><strong>Cachepath is now set to '#[return]#'.</strong> Please move the old data into the new directory.</p>#(/path)#
#(size)#::<p><strong>Cachesize is now set to #[return]#MB.</strong></p>#(/size)#
#(restart)#::<p style="color:red;"><strong>Changes will take effect after restart only.</strong></p>#(/restart)#

@@ -86,8 +86,12 @@ public class ProxyIndexingMonitor_p {
env.setConfig("proxyPrefetchDepth", Integer.toString(newProxyPrefetchDepth));
boolean proxyStoreHTCache = post.containsKey("proxyStoreHTCache");
env.setConfig("proxyStoreHTCache", (proxyStoreHTCache) ? "true" : "false");
- boolean proxyCrawlOrder = post.containsKey("proxyCrawlOrder");
- env.setConfig("proxyCrawlOrder", proxyCrawlOrder ? "true" : "false");
+ boolean proxyIndexingRemote = post.containsKey("proxyIndexingRemote");
+ env.setConfig("proxyIndexingRemote", proxyIndexingRemote ? "true" : "false");
+ boolean proxyIndexingLocalText = post.containsKey("proxyIndexingLocalText");
+ env.setConfig("proxyIndexingLocalText", proxyIndexingLocalText ? "true" : "false");
+ boolean proxyIndexingLocalMedia = post.containsKey("proxyIndexingLocalMedia");
+ env.setConfig("proxyIndexingLocalMedia", proxyIndexingLocalMedia ? "true" : "false");
// added proxyCache, proxyCacheSize - Borg-0300
// proxyCache - check and create the directory
@@ -115,12 +119,16 @@ public class ProxyIndexingMonitor_p {
try {
sb.defaultProxyProfile.changeEntry("generalDepth", Integer.toString(newProxyPrefetchDepth));
sb.defaultProxyProfile.changeEntry("storeHTCache", (proxyStoreHTCache) ? "true": "false");
sb.defaultProxyProfile.changeEntry("remoteIndexing",proxyCrawlOrder ? "true":"false");
sb.defaultProxyProfile.changeEntry("remoteIndexing",proxyIndexingRemote ? "true":"false");
sb.defaultProxyProfile.changeEntry("indexText",proxyIndexingLocalText ? "true":"false");
sb.defaultProxyProfile.changeEntry("indexMedia",proxyIndexingLocalMedia ? "true":"false");
prop.put("info", 2);//new proxyPrefetchdepth
prop.put("info_message", newProxyPrefetchDepth);
prop.put("info_caching", (proxyStoreHTCache) ? 1 : 0);
prop.put("info_crawlOrder", (proxyCrawlOrder) ? 1 : 0);
prop.put("info_indexingLocalText", (proxyIndexingLocalText) ? 1 : 0);
prop.put("info_indexingLocalMedia", (proxyIndexingLocalMedia) ? 1 : 0);
prop.put("info_indexingRemote", (proxyIndexingRemote) ? 1 : 0);
// proxyCache - only display on change
if (oldProxyCachePath.equals(newProxyCachePath)) {
@@ -159,7 +167,9 @@ public class ProxyIndexingMonitor_p {
prop.put("proxyPrefetchDepth", env.getConfig("proxyPrefetchDepth", "0"));
prop.put("proxyStoreHTCacheChecked", env.getConfig("proxyStoreHTCache", "").equals("true") ? 1 : 0);
prop.put("proxyCrawlOrder", env.getConfig("proxyCrawlOrder", "").equals("true") ? 1 : 0);
prop.put("proxyIndexingRemote", env.getConfig("proxyIndexingRemote", "").equals("true") ? 1 : 0);
prop.put("proxyIndexingLocalText", env.getConfig("proxyIndexingLocalText", "").equals("true") ? 1 : 0);
prop.put("proxyIndexingLocalMedia", env.getConfig("proxyIndexingLocalMedia", "").equals("true") ? 1 : 0);
prop.put("proxyCache", env.getConfig("proxyCache", "DATA/HTCACHE"));
prop.put("proxyCacheSize", env.getConfig("proxyCacheSize", "64"));
// return rewrite properties

@@ -320,7 +320,7 @@ public class ViewFile {
// Search word highlighting
while (sentences.hasNext()) {
- sentence = (String)sentences.next();
+ sentence = ((StringBuffer) sentences.next()).toString();
if (sentence.trim().length() > 0) {
prop.put("viewMode_sentences_" + i + "_nr", Integer.toString(i + 1));
prop.put("viewMode_sentences_" + i + "_text", markup(wordArray, sentence));

@@ -236,7 +236,7 @@ public final class plasmaCondenser {
}
int pip = 0;
while (wordenum.hasMoreElements()) {
- word = ((String) wordenum.nextElement()).toLowerCase();
+ word = (new String((StringBuffer) wordenum.nextElement())).toLowerCase();
wprop = (wordStatProp) words.get(word);
if (wprop == null) wprop = new wordStatProp(0, pip, phrase);
if (wprop.flags == null) wprop.flags = (kelondroBitfield) flagstemplate.clone();
@@ -387,7 +387,7 @@ public final class plasmaCondenser {
// read source
sievedWordsEnum wordenum = new sievedWordsEnum(is, charset, wordminsize);
while (wordenum.hasMoreElements()) {
- word = ((String) wordenum.nextElement()).toLowerCase(); // TODO: does toLowerCase work for non ISO-8859-1 chars?
+ word = (new String((StringBuffer) wordenum.nextElement())).toLowerCase(); // TODO: does toLowerCase work for non ISO-8859-1 chars?
//System.out.println("PARSED-WORD " + word);
// distinguish punctuation and words
@@ -665,10 +665,10 @@ public final class plasmaCondenser {
}
private Object nextElement0() {
- String s;
+ StringBuffer s;
char c;
loop: while (e.hasMoreElements()) {
- s = (String) e.nextElement();
+ s = (StringBuffer) e.nextElement();
if ((s.length() == 1) && (htmlFilterContentScraper.punctuation(s.charAt(0)))) return s;
if ((s.length() < ml) && (!(s.equals("of")))) continue loop;
for (int i = 0; i < s.length(); i++) {
@@ -697,14 +697,14 @@ public final class plasmaCondenser {
}
private static class unsievedWordsEnum implements Enumeration {
// returns an enumeration of StringBuffer Objects
Object buffer = null;
sentencesFromInputStreamEnum e;
- String s;
+ StringBuffer s;
public unsievedWordsEnum(InputStream is, String charset) throws UnsupportedEncodingException {
e = new sentencesFromInputStreamEnum(is, charset);
s = "";
s = new StringBuffer();
buffer = nextElement0();
}
@@ -712,15 +712,15 @@ public final class plasmaCondenser {
e.pre(x);
}
- private Object nextElement0() {
- String r;
+ private StringBuffer nextElement0() {
+ StringBuffer r;
StringBuffer sb;
char c;
while (s.length() == 0) {
if (e.hasNext()) {
- r = (String) e.next();
+ r = (StringBuffer) e.next();
if (r == null) return null;
- r = r.trim();
+ r = trim(r);
sb = new StringBuffer(r.length() * 2);
for (int i = 0; i < r.length(); i++) {
c = r.charAt(i);
@@ -728,7 +728,7 @@ public final class plasmaCondenser {
else if (htmlFilterContentScraper.punctuation(c)) sb = sb.append(' ').append(c).append(' ');
else sb = sb.append(c);
}
- s = sb.toString().trim();
+ s = trim(sb);
//System.out.println("PARSING-LINE '" + r + "'->'" + s + "'");
} else {
return null;
@@ -737,11 +737,11 @@ public final class plasmaCondenser {
int p = s.indexOf(" ");
if (p < 0) {
r = s;
s = "";
s = new StringBuffer();
return r;
}
- r = s.substring(0, p);
- s = s.substring(p + 1).trim();
+ r = trim(new StringBuffer(s.substring(0, p)));
+ s = trim(s.delete(0, p + 1));
return r;
}
@@ -757,6 +757,14 @@ public final class plasmaCondenser {
}
+ public static StringBuffer trim(StringBuffer sb) {
+ synchronized (sb) {
+ while ((sb.length() > 0) && (sb.charAt(0) <= ' ')) sb = sb.deleteCharAt(0);
+ while ((sb.length() > 0) && (sb.charAt(sb.length() - 1) <= ' ')) sb = sb.deleteCharAt(sb.length() - 1);
+ }
+ return sb;
+ }
public static sentencesFromInputStreamEnum sentencesFromInputStream(InputStream is, String charset) {
try {
return new sentencesFromInputStreamEnum(is, charset);
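The new trim helper above removes leading and trailing whitespace in place by deleting characters from the StringBuffer, instead of allocating a fresh String the way String.trim() does. A quick standalone check of its behaviour (the helper is copied verbatim from the hunk; the wrapper class is invented for the demo):

// Standalone check of the in-place StringBuffer trim added in this commit.
public class TrimCheck {
    public static StringBuffer trim(StringBuffer sb) {
        synchronized (sb) {
            while ((sb.length() > 0) && (sb.charAt(0) <= ' ')) sb = sb.deleteCharAt(0);
            while ((sb.length() > 0) && (sb.charAt(sb.length() - 1) <= ' ')) sb = sb.deleteCharAt(sb.length() - 1);
        }
        return sb;
    }
    public static void main(String[] args) {
        // like String.trim(), anything <= ' ' counts as whitespace
        System.out.println("'" + trim(new StringBuffer("  hello world \n")) + "'"); // prints 'hello world'
    }
}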
@@ -767,9 +775,9 @@ public final class plasmaCondenser {
public static class sentencesFromInputStreamEnum implements Iterator {
// read sentences from a given input stream
- // this enumerates String objects
+ // this enumerates StringBuffer objects
- Object buffer = null;
+ StringBuffer buffer = null;
BufferedReader raf;
int counter = 0;
boolean pre = false;
@@ -785,9 +793,9 @@ public final class plasmaCondenser {
this.pre = x;
}
- private Object nextElement0() {
+ private StringBuffer nextElement0() {
try {
- String s = readSentence(raf, pre);
+ StringBuffer s = readSentence(raf, pre);
//System.out.println(" SENTENCE='" + s + "'"); // DEBUG
if (s == null) {
raf.close();
@@ -811,8 +819,8 @@ public final class plasmaCondenser {
if (buffer == null) {
return null;
} else {
- counter = counter + ((String) buffer).length() + 1;
- Object r = buffer;
+ counter = counter + buffer.length() + 1;
+ StringBuffer r = buffer;
buffer = nextElement0();
return r;
}
@@ -827,7 +835,7 @@ public final class plasmaCondenser {
}
}
- static String readSentence(Reader reader, boolean pre) throws IOException {
+ static StringBuffer readSentence(Reader reader, boolean pre) throws IOException {
StringBuffer s = new StringBuffer();
int nextChar;
char c;
@@ -854,8 +862,7 @@ public final class plasmaCondenser {
}
// remove all double-spaces
int p; while ((p = s.indexOf(" ")) >= 0) s.deleteCharAt(p);
- return new String(s);
+ return s;
}
public static Map getWords(byte[] text, String charset) throws UnsupportedEncodingException {

@@ -958,7 +958,7 @@ public final class plasmaParser {
int i = 0;
if (sentences != null) while (sentences.hasNext()) {
System.out.print("line " + i + ": ");
- System.out.println((String) sentences.next());
+ System.out.println(((StringBuffer) sentences.next()).toString());
i++;
}

@@ -465,15 +465,15 @@ public class plasmaSnippetCache {
Iterator j;
HashMap hs;
String hash;
- String sentence;
+ StringBuffer sentence;
TreeMap os = new TreeMap();
int uniqCounter = 9999;
int score;
while (sentences.hasNext()) {
- sentence = (String) sentences.next();
+ sentence = (StringBuffer) sentences.next();
//System.out.println("Snippet-Sentence :" + sentence); // DEBUG
if (sentence.length() > minLength) {
- hs = hashSentence(sentence);
+ hs = hashSentence(sentence.toString());
j = queryhashes.iterator();
score = 0;
while (j.hasNext()) {
@@ -492,8 +492,8 @@ public class plasmaSnippetCache {
String result;
Set remaininghashes;
while (os.size() > 0) {
- sentence = (String) os.remove((Integer) os.lastKey()); // sentence with the biggest score
- result = computeTextSnippet(sentence, queryhashes, minLength, maxLength);
+ sentence = (StringBuffer) os.remove((Integer) os.lastKey()); // sentence with the biggest score
+ result = computeTextSnippet(sentence.toString(), queryhashes, minLength, maxLength);
if ((result != null) && (result.length() > 0)) {
remaininghashes = removeAppearanceHashes(result, queryhashes);
if (remaininghashes.size() == 0) {
@@ -688,10 +688,10 @@ public class plasmaSnippetCache {
HashMap map = new HashMap();
Enumeration words = plasmaCondenser.wordTokenizer(sentence, "UTF-8", 0);
int pos = 0;
- String word;
+ StringBuffer word;
while (words.hasMoreElements()) {
- word = (String) words.nextElement();
- map.put(plasmaCondenser.word2hash(word), new Integer(pos));
+ word = (StringBuffer) words.nextElement();
+ map.put(plasmaCondenser.word2hash(new String(word)), new Integer(pos));
pos += word.length() + 1;
}
return map;

@@ -814,7 +814,11 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
this.defaultProxyProfile = this.profiles.newEntry("proxy", "", ".*", ".*",
Integer.parseInt(getConfig("proxyPrefetchDepth", "0")),
Integer.parseInt(getConfig("proxyPrefetchDepth", "0")),
- 60 * 24, -1, -1, false, true, true, true, true, getConfigBool("proxyCrawlOrder", false), true, true, true);
+ 60 * 24, -1, -1, false,
+ getConfigBool("proxyIndexingLocalText", true),
+ getConfigBool("proxyIndexingLocalMedia", true),
+ true, true,
+ getConfigBool("proxyIndexingRemote", false), true, true, true);
}
if (this.defaultRemoteProfile == null) {
// generate new default entry for remote crawling

@@ -426,7 +426,9 @@ defaultLinkReceiveFrequency=30
# of 2 would result in hundreds of prefetched URLs for each single proxy fill.
proxyPrefetchDepth=0
proxyStoreHTCache=true
- proxyCrawlOrder=false
+ proxyIndexingRemote=false
+ proxyIndexingLocalText=true
+ proxyIndexingLocalMedia=true
# From the 'IndexCreate' menu point you can also define a crawling start point.
# The crawling works the same way as the prefetch, but it is possible to
