- replaced String by StringBuffer in condenser

- added CamelCase parser in condenser
- added option to switch on or off indexing for proxy

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3292 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 18 years ago
parent 14f2068daf
commit f25c0e98d1

@ -166,7 +166,7 @@ public class CacheAdmin_p {
if (sentences != null) if (sentences != null)
while (sentences.hasNext()) { while (sentences.hasNext()) {
prop.put("info_type_lines_" + i + "_line", prop.put("info_type_lines_" + i + "_line",
de.anomic.data.wikiCode.replaceXMLEntities(sentences.next().toString().replaceAll("\n", "").trim())); de.anomic.data.wikiCode.replaceXMLEntities(((StringBuffer) sentences.next()).toString().replaceAll("\n", "").trim()));
i++; i++;
} }
prop.put("info_type_lines", i); prop.put("info_type_lines", i);

@ -38,9 +38,23 @@
<td><input type="checkbox" id="prxy_storeHTCache" name="proxyStoreHTCache"#(proxyStoreHTCacheChecked)#:: checked="checked"#(/proxyStoreHTCacheChecked)# /></td> <td><input type="checkbox" id="prxy_storeHTCache" name="proxyStoreHTCache"#(proxyStoreHTCacheChecked)#:: checked="checked"#(/proxyStoreHTCacheChecked)# /></td>
<td>It is almost always recommended to set this on. The only exception is that you have another caching proxy running as secondary proxy and YaCy is configured to use that proxy in proxy-proxy mode.</td> <td>It is almost always recommended to set this on. The only exception is that you have another caching proxy running as secondary proxy and YaCy is configured to use that proxy in proxy-proxy mode.</td>
</tr> </tr>
<tr valign="top" class="TableCellLight">
<td><label for="prxy_index_text">Do Local Text-Indexing</label></td>
<td><input type="checkbox" id="prxy_index_text" name="proxyIndexingLocalText"#(proxyIndexingLocalText)#:: checked="checked"#(/proxyIndexingLocalText)# /></td>
<td>
If this is on, all pages (except private content) that pass through the proxy are indexed.
</td>
</tr>
<tr valign="top" class="TableCellLight">
<td><label for="prxy_index_media">Do Local Media-Indexing</label></td>
<td><input type="checkbox" id="prxy_index_media" name="proxyIndexingLocalMedia"#(proxyIndexingLocalMedia)#:: checked="checked"#(/proxyIndexingLocalMedia)# /></td>
<td>
This works the same way as Local Text-Indexing, but only switches the indexing of media content on or off.
</td>
</tr>
<tr valign="top" class="TableCellLight"> <tr valign="top" class="TableCellLight">
<td><label for="prxy_crawl_order">Do Remote Indexing</label></td> <td><label for="prxy_crawl_order">Do Remote Indexing</label></td>
<td><input type="checkbox" id="prxy_crawl_order" name="proxyCrawlOrder"#(proxyCrawlOrder)#:: checked="checked"#(/proxyCrawlOrder)# /></td> <td><input type="checkbox" id="prxy_crawl_order" name="proxyIndexingRemote"#(proxyIndexingRemote)#:: checked="checked"#(/proxyIndexingRemote)# /></td>
<td> <td>
If checked, the crawler will contact other peers and use them as remote indexers for your crawl. If checked, the crawler will contact other peers and use them as remote indexers for your crawl.
If you need your crawling results locally, you should switch this off. If you need your crawling results locally, you should switch this off.
@ -82,7 +96,9 @@
<!-- info 2 --> <!-- info 2 -->
<p><strong>Pre-fetch is now set to depth-#[message]#.</strong></p> <p><strong>Pre-fetch is now set to depth-#[message]#.</strong></p>
<p><strong>Caching is now #(caching)#off::on#(/caching)#.</strong></p> <p><strong>Caching is now #(caching)#off::on#(/caching)#.</strong></p>
<p><strong>Remote Indexing is now #(crawlOrder)#off::on#(/crawlOrder)#.</strong></p> <p><strong>Local Text Indexing is now #(indexingLocalText)#off::on#(/indexingLocalText)#.</strong></p>
<p><strong>Local Media Indexing is now #(indexingLocalMedia)#off::on#(/indexingLocalMedia)#.</strong></p>
<p><strong>Remote Indexing is now #(indexingRemote)#off::on#(/indexingRemote)#.</strong></p>
#(path)#::<p><strong>Cachepath is now set to '#[return]#'.</strong> Please move the old data in the new directory.</p>#(/path)# #(path)#::<p><strong>Cachepath is now set to '#[return]#'.</strong> Please move the old data in the new directory.</p>#(/path)#
#(size)#::<p><strong>Cachesize is now set to #[return]#MB.</strong></p>#(/size)# #(size)#::<p><strong>Cachesize is now set to #[return]#MB.</strong></p>#(/size)#
#(restart)#::<p style="color:red;"><strong>Changes will take effect after restart only.</strong></p>#(/restart)# #(restart)#::<p style="color:red;"><strong>Changes will take effect after restart only.</strong></p>#(/restart)#

@ -86,8 +86,12 @@ public class ProxyIndexingMonitor_p {
env.setConfig("proxyPrefetchDepth", Integer.toString(newProxyPrefetchDepth)); env.setConfig("proxyPrefetchDepth", Integer.toString(newProxyPrefetchDepth));
boolean proxyStoreHTCache = post.containsKey("proxyStoreHTCache"); boolean proxyStoreHTCache = post.containsKey("proxyStoreHTCache");
env.setConfig("proxyStoreHTCache", (proxyStoreHTCache) ? "true" : "false"); env.setConfig("proxyStoreHTCache", (proxyStoreHTCache) ? "true" : "false");
boolean proxyCrawlOrder = post.containsKey("proxyCrawlOrder"); boolean proxyIndexingRemote = post.containsKey("proxyIndexingRemote");
env.setConfig("proxyCrawlOrder", proxyCrawlOrder ? "true" : "false"); env.setConfig("proxyIndexingRemote", proxyIndexingRemote ? "true" : "false");
boolean proxyIndexingLocalText = post.containsKey("proxyIndexingLocalText");
env.setConfig("proxyIndexingLocalText", proxyIndexingLocalText ? "true" : "false");
boolean proxyIndexingLocalMedia = post.containsKey("proxyIndexingLocalMedia");
env.setConfig("proxyIndexingLocalMedia", proxyIndexingLocalMedia ? "true" : "false");
// added proxyCache, proxyCacheSize - Borg-0300 // added proxyCache, proxyCacheSize - Borg-0300
// proxyCache - check and create the directory // proxyCache - check and create the directory
@ -115,12 +119,16 @@ public class ProxyIndexingMonitor_p {
try { try {
sb.defaultProxyProfile.changeEntry("generalDepth", Integer.toString(newProxyPrefetchDepth)); sb.defaultProxyProfile.changeEntry("generalDepth", Integer.toString(newProxyPrefetchDepth));
sb.defaultProxyProfile.changeEntry("storeHTCache", (proxyStoreHTCache) ? "true": "false"); sb.defaultProxyProfile.changeEntry("storeHTCache", (proxyStoreHTCache) ? "true": "false");
sb.defaultProxyProfile.changeEntry("remoteIndexing",proxyCrawlOrder ? "true":"false"); sb.defaultProxyProfile.changeEntry("remoteIndexing",proxyIndexingRemote ? "true":"false");
sb.defaultProxyProfile.changeEntry("indexText",proxyIndexingLocalText ? "true":"false");
sb.defaultProxyProfile.changeEntry("indexMedia",proxyIndexingLocalMedia ? "true":"false");
prop.put("info", 2);//new proxyPrefetchdepth prop.put("info", 2);//new proxyPrefetchdepth
prop.put("info_message", newProxyPrefetchDepth); prop.put("info_message", newProxyPrefetchDepth);
prop.put("info_caching", (proxyStoreHTCache) ? 1 : 0); prop.put("info_caching", (proxyStoreHTCache) ? 1 : 0);
prop.put("info_crawlOrder", (proxyCrawlOrder) ? 1 : 0); prop.put("info_indexingLocalText", (proxyIndexingLocalText) ? 1 : 0);
prop.put("info_indexingLocalMedia", (proxyIndexingLocalMedia) ? 1 : 0);
prop.put("info_indexingRemote", (proxyIndexingRemote) ? 1 : 0);
// proxyCache - only display on change // proxyCache - only display on change
if (oldProxyCachePath.equals(newProxyCachePath)) { if (oldProxyCachePath.equals(newProxyCachePath)) {
@ -159,7 +167,9 @@ public class ProxyIndexingMonitor_p {
prop.put("proxyPrefetchDepth", env.getConfig("proxyPrefetchDepth", "0")); prop.put("proxyPrefetchDepth", env.getConfig("proxyPrefetchDepth", "0"));
prop.put("proxyStoreHTCacheChecked", env.getConfig("proxyStoreHTCache", "").equals("true") ? 1 : 0); prop.put("proxyStoreHTCacheChecked", env.getConfig("proxyStoreHTCache", "").equals("true") ? 1 : 0);
prop.put("proxyCrawlOrder", env.getConfig("proxyCrawlOrder", "").equals("true") ? 1 : 0); prop.put("proxyIndexingRemote", env.getConfig("proxyIndexingRemote", "").equals("true") ? 1 : 0);
prop.put("proxyIndexingLocalText", env.getConfig("proxyIndexingLocalText", "").equals("true") ? 1 : 0);
prop.put("proxyIndexingLocalMedia", env.getConfig("proxyIndexingLocalMedia", "").equals("true") ? 1 : 0);
prop.put("proxyCache", env.getConfig("proxyCache", "DATA/HTCACHE")); prop.put("proxyCache", env.getConfig("proxyCache", "DATA/HTCACHE"));
prop.put("proxyCacheSize", env.getConfig("proxyCacheSize", "64")); prop.put("proxyCacheSize", env.getConfig("proxyCacheSize", "64"));
// return rewrite properties // return rewrite properties

@ -320,7 +320,7 @@ public class ViewFile {
// Search word highlighting // Search word highlighting
while (sentences.hasNext()) { while (sentences.hasNext()) {
sentence = (String)sentences.next(); sentence = ((StringBuffer) sentences.next()).toString();
if (sentence.trim().length() > 0) { if (sentence.trim().length() > 0) {
prop.put("viewMode_sentences_" + i + "_nr", Integer.toString(i + 1)); prop.put("viewMode_sentences_" + i + "_nr", Integer.toString(i + 1));
prop.put("viewMode_sentences_" + i + "_text", markup(wordArray, sentence)); prop.put("viewMode_sentences_" + i + "_text", markup(wordArray, sentence));

@ -236,7 +236,7 @@ public final class plasmaCondenser {
} }
int pip = 0; int pip = 0;
while (wordenum.hasMoreElements()) { while (wordenum.hasMoreElements()) {
word = ((String) wordenum.nextElement()).toLowerCase(); word = (new String((StringBuffer) wordenum.nextElement())).toLowerCase();
wprop = (wordStatProp) words.get(word); wprop = (wordStatProp) words.get(word);
if (wprop == null) wprop = new wordStatProp(0, pip, phrase); if (wprop == null) wprop = new wordStatProp(0, pip, phrase);
if (wprop.flags == null) wprop.flags = (kelondroBitfield) flagstemplate.clone(); if (wprop.flags == null) wprop.flags = (kelondroBitfield) flagstemplate.clone();
@ -387,7 +387,7 @@ public final class plasmaCondenser {
// read source // read source
sievedWordsEnum wordenum = new sievedWordsEnum(is, charset, wordminsize); sievedWordsEnum wordenum = new sievedWordsEnum(is, charset, wordminsize);
while (wordenum.hasMoreElements()) { while (wordenum.hasMoreElements()) {
word = ((String) wordenum.nextElement()).toLowerCase(); // TODO: does toLowerCase work for non ISO-8859-1 chars? word = (new String((StringBuffer) wordenum.nextElement())).toLowerCase(); // TODO: does toLowerCase work for non ISO-8859-1 chars?
//System.out.println("PARSED-WORD " + word); //System.out.println("PARSED-WORD " + word);
// distinguish punctuation and words // distinguish punctuation and words
@ -665,10 +665,10 @@ public final class plasmaCondenser {
} }
private Object nextElement0() { private Object nextElement0() {
String s; StringBuffer s;
char c; char c;
loop: while (e.hasMoreElements()) { loop: while (e.hasMoreElements()) {
s = (String) e.nextElement(); s = (StringBuffer) e.nextElement();
if ((s.length() == 1) && (htmlFilterContentScraper.punctuation(s.charAt(0)))) return s; if ((s.length() == 1) && (htmlFilterContentScraper.punctuation(s.charAt(0)))) return s;
if ((s.length() < ml) && (!(s.equals("of")))) continue loop; if ((s.length() < ml) && (!(s.equals("of")))) continue loop;
for (int i = 0; i < s.length(); i++) { for (int i = 0; i < s.length(); i++) {
@ -697,14 +697,14 @@ public final class plasmaCondenser {
} }
private static class unsievedWordsEnum implements Enumeration { private static class unsievedWordsEnum implements Enumeration {
// returns an enumeration of StringBuffer Objects
Object buffer = null; Object buffer = null;
sentencesFromInputStreamEnum e; sentencesFromInputStreamEnum e;
String s; StringBuffer s;
public unsievedWordsEnum(InputStream is, String charset) throws UnsupportedEncodingException { public unsievedWordsEnum(InputStream is, String charset) throws UnsupportedEncodingException {
e = new sentencesFromInputStreamEnum(is, charset); e = new sentencesFromInputStreamEnum(is, charset);
s = ""; s = new StringBuffer();
buffer = nextElement0(); buffer = nextElement0();
} }
@ -712,15 +712,15 @@ public final class plasmaCondenser {
e.pre(x); e.pre(x);
} }
private Object nextElement0() { private StringBuffer nextElement0() {
String r; StringBuffer r;
StringBuffer sb; StringBuffer sb;
char c; char c;
while (s.length() == 0) { while (s.length() == 0) {
if (e.hasNext()) { if (e.hasNext()) {
r = (String) e.next(); r = (StringBuffer) e.next();
if (r == null) return null; if (r == null) return null;
r = r.trim(); r = trim(r);
sb = new StringBuffer(r.length() * 2); sb = new StringBuffer(r.length() * 2);
for (int i = 0; i < r.length(); i++) { for (int i = 0; i < r.length(); i++) {
c = r.charAt(i); c = r.charAt(i);
@ -728,7 +728,7 @@ public final class plasmaCondenser {
else if (htmlFilterContentScraper.punctuation(c)) sb = sb.append(' ').append(c).append(' '); else if (htmlFilterContentScraper.punctuation(c)) sb = sb.append(' ').append(c).append(' ');
else sb = sb.append(c); else sb = sb.append(c);
} }
s = sb.toString().trim(); s = trim(sb);
//System.out.println("PARSING-LINE '" + r + "'->'" + s + "'"); //System.out.println("PARSING-LINE '" + r + "'->'" + s + "'");
} else { } else {
return null; return null;
@ -737,11 +737,11 @@ public final class plasmaCondenser {
int p = s.indexOf(" "); int p = s.indexOf(" ");
if (p < 0) { if (p < 0) {
r = s; r = s;
s = ""; s = new StringBuffer();
return r; return r;
} }
r = s.substring(0, p); r = trim(new StringBuffer(s.substring(0, p)));
s = s.substring(p + 1).trim(); s = trim(s.delete(0, p + 1));
return r; return r;
} }
@ -757,6 +757,14 @@ public final class plasmaCondenser {
} }
/**
 * Strips leading and trailing whitespace from the given buffer in place.
 * A character counts as whitespace when it is less than or equal to
 * <code>' '</code>, which is the same rule the original loops applied.
 *
 * The buffer itself is modified and the very same instance is returned,
 * so callers may either use the return value or keep using their reference.
 *
 * @param sb the buffer to trim; must not be null (a null argument raises a
 *           NullPointerException when the monitor is entered)
 * @return the same <code>sb</code> instance, now without surrounding whitespace
 */
public static StringBuffer trim(StringBuffer sb) {
    synchronized (sb) {
        // find the end of the content first, so an all-blank buffer collapses to length 0
        int end = sb.length();
        while ((end > 0) && (sb.charAt(end - 1) <= ' ')) {
            end--;
        }
        // find the first non-blank character
        int start = 0;
        while ((start < end) && (sb.charAt(start) <= ' ')) {
            start++;
        }
        // cut the tail, then remove the head with a single shift instead of
        // one deleteCharAt(0) per leading blank (each of which moves the whole buffer)
        sb.setLength(end);
        if (start > 0) {
            sb.delete(0, start);
        }
    }
    return sb;
}
public static sentencesFromInputStreamEnum sentencesFromInputStream(InputStream is, String charset) { public static sentencesFromInputStreamEnum sentencesFromInputStream(InputStream is, String charset) {
try { try {
return new sentencesFromInputStreamEnum(is, charset); return new sentencesFromInputStreamEnum(is, charset);
@ -767,9 +775,9 @@ public final class plasmaCondenser {
public static class sentencesFromInputStreamEnum implements Iterator { public static class sentencesFromInputStreamEnum implements Iterator {
// read sentences from a given input stream // read sentences from a given input stream
// this enumerates String objects // this enumerates StringBuffer objects
Object buffer = null; StringBuffer buffer = null;
BufferedReader raf; BufferedReader raf;
int counter = 0; int counter = 0;
boolean pre = false; boolean pre = false;
@ -785,9 +793,9 @@ public final class plasmaCondenser {
this.pre = x; this.pre = x;
} }
private Object nextElement0() { private StringBuffer nextElement0() {
try { try {
String s = readSentence(raf, pre); StringBuffer s = readSentence(raf, pre);
//System.out.println(" SENTENCE='" + s + "'"); // DEBUG //System.out.println(" SENTENCE='" + s + "'"); // DEBUG
if (s == null) { if (s == null) {
raf.close(); raf.close();
@ -811,8 +819,8 @@ public final class plasmaCondenser {
if (buffer == null) { if (buffer == null) {
return null; return null;
} else { } else {
counter = counter + ((String) buffer).length() + 1; counter = counter + buffer.length() + 1;
Object r = buffer; StringBuffer r = buffer;
buffer = nextElement0(); buffer = nextElement0();
return r; return r;
} }
@ -827,7 +835,7 @@ public final class plasmaCondenser {
} }
} }
static String readSentence(Reader reader, boolean pre) throws IOException { static StringBuffer readSentence(Reader reader, boolean pre) throws IOException {
StringBuffer s = new StringBuffer(); StringBuffer s = new StringBuffer();
int nextChar; int nextChar;
char c; char c;
@ -854,8 +862,7 @@ public final class plasmaCondenser {
} }
// remove all double-spaces // remove all double-spaces
int p; while ((p = s.indexOf(" ")) >= 0) s.deleteCharAt(p); int p; while ((p = s.indexOf(" ")) >= 0) s.deleteCharAt(p);
return new String(s); return s;
} }
public static Map getWords(byte[] text, String charset) throws UnsupportedEncodingException { public static Map getWords(byte[] text, String charset) throws UnsupportedEncodingException {

@ -958,7 +958,7 @@ public final class plasmaParser {
int i = 0; int i = 0;
if (sentences != null) while (sentences.hasNext()) { if (sentences != null) while (sentences.hasNext()) {
System.out.print("line " + i + ": "); System.out.print("line " + i + ": ");
System.out.println((String) sentences.next()); System.out.println(((StringBuffer) sentences.next()).toString());
i++; i++;
} }

@ -465,15 +465,15 @@ public class plasmaSnippetCache {
Iterator j; Iterator j;
HashMap hs; HashMap hs;
String hash; String hash;
String sentence; StringBuffer sentence;
TreeMap os = new TreeMap(); TreeMap os = new TreeMap();
int uniqCounter = 9999; int uniqCounter = 9999;
int score; int score;
while (sentences.hasNext()) { while (sentences.hasNext()) {
sentence = (String) sentences.next(); sentence = (StringBuffer) sentences.next();
//System.out.println("Snippet-Sentence :" + sentence); // DEBUG //System.out.println("Snippet-Sentence :" + sentence); // DEBUG
if (sentence.length() > minLength) { if (sentence.length() > minLength) {
hs = hashSentence(sentence); hs = hashSentence(sentence.toString());
j = queryhashes.iterator(); j = queryhashes.iterator();
score = 0; score = 0;
while (j.hasNext()) { while (j.hasNext()) {
@ -492,8 +492,8 @@ public class plasmaSnippetCache {
String result; String result;
Set remaininghashes; Set remaininghashes;
while (os.size() > 0) { while (os.size() > 0) {
sentence = (String) os.remove((Integer) os.lastKey()); // sentence with the biggest score sentence = (StringBuffer) os.remove((Integer) os.lastKey()); // sentence with the biggest score
result = computeTextSnippet(sentence, queryhashes, minLength, maxLength); result = computeTextSnippet(sentence.toString(), queryhashes, minLength, maxLength);
if ((result != null) && (result.length() > 0)) { if ((result != null) && (result.length() > 0)) {
remaininghashes = removeAppearanceHashes(result, queryhashes); remaininghashes = removeAppearanceHashes(result, queryhashes);
if (remaininghashes.size() == 0) { if (remaininghashes.size() == 0) {
@ -688,10 +688,10 @@ public class plasmaSnippetCache {
HashMap map = new HashMap(); HashMap map = new HashMap();
Enumeration words = plasmaCondenser.wordTokenizer(sentence, "UTF-8", 0); Enumeration words = plasmaCondenser.wordTokenizer(sentence, "UTF-8", 0);
int pos = 0; int pos = 0;
String word; StringBuffer word;
while (words.hasMoreElements()) { while (words.hasMoreElements()) {
word = (String) words.nextElement(); word = (StringBuffer) words.nextElement();
map.put(plasmaCondenser.word2hash(word), new Integer(pos)); map.put(plasmaCondenser.word2hash(new String(word)), new Integer(pos));
pos += word.length() + 1; pos += word.length() + 1;
} }
return map; return map;

@ -814,7 +814,11 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
this.defaultProxyProfile = this.profiles.newEntry("proxy", "", ".*", ".*", this.defaultProxyProfile = this.profiles.newEntry("proxy", "", ".*", ".*",
Integer.parseInt(getConfig("proxyPrefetchDepth", "0")), Integer.parseInt(getConfig("proxyPrefetchDepth", "0")),
Integer.parseInt(getConfig("proxyPrefetchDepth", "0")), Integer.parseInt(getConfig("proxyPrefetchDepth", "0")),
60 * 24, -1, -1, false, true, true, true, true, getConfigBool("proxyCrawlOrder", false), true, true, true); 60 * 24, -1, -1, false,
getConfigBool("proxyIndexingLocalText", true),
getConfigBool("proxyIndexingLocalMedia", true),
true, true,
getConfigBool("proxyIndexingRemote", false), true, true, true);
} }
if (this.defaultRemoteProfile == null) { if (this.defaultRemoteProfile == null) {
// generate new default entry for remote crawling // generate new default entry for remote crawling

@ -426,7 +426,9 @@ defaultLinkReceiveFrequency=30
# of 2 would result in hundreds of prefetched URLs for each single proxy fill. # of 2 would result in hundreds of prefetched URLs for each single proxy fill.
proxyPrefetchDepth=0 proxyPrefetchDepth=0
proxyStoreHTCache=true proxyStoreHTCache=true
proxyCrawlOrder=false proxyIndexingRemote=false
proxyIndexingLocalText=true
proxyIndexingLocalMedia=true
# From the 'IndexCreate' menu point you can also define a crawling start point. # From the 'IndexCreate' menu point you can also define a crawling start point.
# The crawling works the same way as the prefetch, but it is possible to # The crawling works the same way as the prefetch, but it is possible to

Loading…
Cancel
Save