From f25c0e98d19c5e58bbf73cf4483e698c30976574 Mon Sep 17 00:00:00 2001 From: orbiter Date: Mon, 29 Jan 2007 01:11:22 +0000 Subject: [PATCH] - replaced String by StringBuffer in condenser - added CamelCase parser in condenser - added option to switch on or off indexing for proxy git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3292 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/CacheAdmin_p.java | 2 +- htroot/ProxyIndexingMonitor_p.html | 20 ++++++- htroot/ProxyIndexingMonitor_p.java | 20 +++++-- htroot/ViewFile.java | 2 +- source/de/anomic/plasma/plasmaCondenser.java | 55 +++++++++++-------- source/de/anomic/plasma/plasmaParser.java | 2 +- .../de/anomic/plasma/plasmaSnippetCache.java | 16 +++--- .../de/anomic/plasma/plasmaSwitchboard.java | 6 +- yacy.init | 4 +- 9 files changed, 83 insertions(+), 44 deletions(-) diff --git a/htroot/CacheAdmin_p.java b/htroot/CacheAdmin_p.java index 0b644b556..4e8c8a689 100644 --- a/htroot/CacheAdmin_p.java +++ b/htroot/CacheAdmin_p.java @@ -166,7 +166,7 @@ public class CacheAdmin_p { if (sentences != null) while (sentences.hasNext()) { prop.put("info_type_lines_" + i + "_line", - de.anomic.data.wikiCode.replaceXMLEntities(sentences.next().toString().replaceAll("\n", "").trim())); + de.anomic.data.wikiCode.replaceXMLEntities(((StringBuffer) sentences.next()).toString().replaceAll("\n", "").trim())); i++; } prop.put("info_type_lines", i); diff --git a/htroot/ProxyIndexingMonitor_p.html b/htroot/ProxyIndexingMonitor_p.html index b89efe87b..91bef0c89 100644 --- a/htroot/ProxyIndexingMonitor_p.html +++ b/htroot/ProxyIndexingMonitor_p.html @@ -38,9 +38,23 @@ It is almost always recommended to set this on. The only exception is that you have another caching proxy running as secondary proxy and YaCy is configured to used that proxy in proxy-proxy - mode. + + + + + If this is on, all pages (except private content) that passes the proxy is indexed. + + + + + + + This is the same as for Local Text-Indexing, but switches only the indexing of media content on. + + - + If checked, the crawler will contact other peers and use them as remote indexers for your crawl. If you need your crawling results locally, you should switch this off. @@ -82,7 +96,9 @@

Pre-fetch is now set to depth-#[message]#.

Caching is now #(caching)#off::on#(/caching)#.

-

Remote Indexing is now #(crawlOrder)#off::on#(/crawlOrder)#.

+

Local Text Indexing is now #(indexingLocalText)#off::on#(/indexingLocalText)#.

+

Local Media Indexing is now #(indexingLocalMedia)#off::on#(/indexingLocalMedia)#.

+

Remote Indexing is now #(indexingRemote)#off::on#(/indexingRemote)#.

#(path)#::

Cachepath is now set to '#[return]#'. Please move the old data in the new directory.

#(/path)# #(size)#::

Cachesize is now set to #[return]#MB.

#(/size)# #(restart)#::

Changes will take effect after restart only.

#(/restart)# diff --git a/htroot/ProxyIndexingMonitor_p.java b/htroot/ProxyIndexingMonitor_p.java index 91badc907..5bf82335b 100644 --- a/htroot/ProxyIndexingMonitor_p.java +++ b/htroot/ProxyIndexingMonitor_p.java @@ -86,8 +86,12 @@ public class ProxyIndexingMonitor_p { env.setConfig("proxyPrefetchDepth", Integer.toString(newProxyPrefetchDepth)); boolean proxyStoreHTCache = post.containsKey("proxyStoreHTCache"); env.setConfig("proxyStoreHTCache", (proxyStoreHTCache) ? "true" : "false"); - boolean proxyCrawlOrder = post.containsKey("proxyCrawlOrder"); - env.setConfig("proxyCrawlOrder", proxyCrawlOrder ? "true" : "false"); + boolean proxyIndexingRemote = post.containsKey("proxyIndexingRemote"); + env.setConfig("proxyIndexingRemote", proxyIndexingRemote ? "true" : "false"); + boolean proxyIndexingLocalText = post.containsKey("proxyIndexingLocalText"); + env.setConfig("proxyIndexingLocalText", proxyIndexingLocalText ? "true" : "false"); + boolean proxyIndexingLocalMedia = post.containsKey("proxyIndexingLocalMedia"); + env.setConfig("proxyIndexingLocalMedia", proxyIndexingLocalMedia ? "true" : "false"); // added proxyCache, proxyCacheSize - Borg-0300 // proxyCache - check and create the directory @@ -115,12 +119,16 @@ public class ProxyIndexingMonitor_p { try { sb.defaultProxyProfile.changeEntry("generalDepth", Integer.toString(newProxyPrefetchDepth)); sb.defaultProxyProfile.changeEntry("storeHTCache", (proxyStoreHTCache) ? "true": "false"); - sb.defaultProxyProfile.changeEntry("remoteIndexing",proxyCrawlOrder ? "true":"false"); + sb.defaultProxyProfile.changeEntry("remoteIndexing",proxyIndexingRemote ? "true":"false"); + sb.defaultProxyProfile.changeEntry("indexText",proxyIndexingLocalText ? "true":"false"); + sb.defaultProxyProfile.changeEntry("indexMedia",proxyIndexingLocalMedia ? "true":"false"); prop.put("info", 2);//new proxyPrefetchdepth prop.put("info_message", newProxyPrefetchDepth); prop.put("info_caching", (proxyStoreHTCache) ? 1 : 0); - prop.put("info_crawlOrder", (proxyCrawlOrder) ? 1 : 0); + prop.put("info_indexingLocalText", (proxyIndexingLocalText) ? 1 : 0); + prop.put("info_indexingLocalMedia", (proxyIndexingLocalMedia) ? 1 : 0); + prop.put("info_indexingRemote", (proxyIndexingRemote) ? 1 : 0); // proxyCache - only display on change if (oldProxyCachePath.equals(newProxyCachePath)) { @@ -159,7 +167,9 @@ public class ProxyIndexingMonitor_p { prop.put("proxyPrefetchDepth", env.getConfig("proxyPrefetchDepth", "0")); prop.put("proxyStoreHTCacheChecked", env.getConfig("proxyStoreHTCache", "").equals("true") ? 1 : 0); - prop.put("proxyCrawlOrder", env.getConfig("proxyCrawlOrder", "").equals("true") ? 1 : 0); + prop.put("proxyIndexingRemote", env.getConfig("proxyIndexingRemote", "").equals("true") ? 1 : 0); + prop.put("proxyIndexingLocalText", env.getConfig("proxyIndexingLocalText", "").equals("true") ? 1 : 0); + prop.put("proxyIndexingLocalMedia", env.getConfig("proxyIndexingLocalMedia", "").equals("true") ? 1 : 0); prop.put("proxyCache", env.getConfig("proxyCache", "DATA/HTCACHE")); prop.put("proxyCacheSize", env.getConfig("proxyCacheSize", "64")); // return rewrite properties diff --git a/htroot/ViewFile.java b/htroot/ViewFile.java index 0a95a7ba7..d39053f72 100644 --- a/htroot/ViewFile.java +++ b/htroot/ViewFile.java @@ -320,7 +320,7 @@ public class ViewFile { // Search word highlighting while (sentences.hasNext()) { - sentence = (String)sentences.next(); + sentence = ((StringBuffer) sentences.next()).toString(); if (sentence.trim().length() > 0) { prop.put("viewMode_sentences_" + i + "_nr", Integer.toString(i + 1)); prop.put("viewMode_sentences_" + i + "_text", markup(wordArray, sentence)); diff --git a/source/de/anomic/plasma/plasmaCondenser.java b/source/de/anomic/plasma/plasmaCondenser.java index b4f182578..54ea93751 100644 --- a/source/de/anomic/plasma/plasmaCondenser.java +++ b/source/de/anomic/plasma/plasmaCondenser.java @@ -236,7 +236,7 @@ public final class plasmaCondenser { } int pip = 0; while (wordenum.hasMoreElements()) { - word = ((String) wordenum.nextElement()).toLowerCase(); + word = (new String((StringBuffer) wordenum.nextElement())).toLowerCase(); wprop = (wordStatProp) words.get(word); if (wprop == null) wprop = new wordStatProp(0, pip, phrase); if (wprop.flags == null) wprop.flags = (kelondroBitfield) flagstemplate.clone(); @@ -387,7 +387,7 @@ public final class plasmaCondenser { // read source sievedWordsEnum wordenum = new sievedWordsEnum(is, charset, wordminsize); while (wordenum.hasMoreElements()) { - word = ((String) wordenum.nextElement()).toLowerCase(); // TODO: does toLowerCase work for non ISO-8859-1 chars? + word = (new String((StringBuffer) wordenum.nextElement())).toLowerCase(); // TODO: does toLowerCase work for non ISO-8859-1 chars? //System.out.println("PARSED-WORD " + word); // distinguish punctuation and words @@ -665,10 +665,10 @@ public final class plasmaCondenser { } private Object nextElement0() { - String s; + StringBuffer s; char c; loop: while (e.hasMoreElements()) { - s = (String) e.nextElement(); + s = (StringBuffer) e.nextElement(); if ((s.length() == 1) && (htmlFilterContentScraper.punctuation(s.charAt(0)))) return s; if ((s.length() < ml) && (!(s.equals("of")))) continue loop; for (int i = 0; i < s.length(); i++) { @@ -697,14 +697,14 @@ public final class plasmaCondenser { } private static class unsievedWordsEnum implements Enumeration { - + // returns an enumeration of StringBuffer Objects Object buffer = null; sentencesFromInputStreamEnum e; - String s; + StringBuffer s; public unsievedWordsEnum(InputStream is, String charset) throws UnsupportedEncodingException { e = new sentencesFromInputStreamEnum(is, charset); - s = ""; + s = new StringBuffer(); buffer = nextElement0(); } @@ -712,15 +712,15 @@ public final class plasmaCondenser { e.pre(x); } - private Object nextElement0() { - String r; + private StringBuffer nextElement0() { + StringBuffer r; StringBuffer sb; char c; while (s.length() == 0) { if (e.hasNext()) { - r = (String) e.next(); + r = (StringBuffer) e.next(); if (r == null) return null; - r = r.trim(); + r = trim(r); sb = new StringBuffer(r.length() * 2); for (int i = 0; i < r.length(); i++) { c = r.charAt(i); @@ -728,7 +728,7 @@ public final class plasmaCondenser { else if (htmlFilterContentScraper.punctuation(c)) sb = sb.append(' ').append(c).append(' '); else sb = sb.append(c); } - s = sb.toString().trim(); + s = trim(sb); //System.out.println("PARSING-LINE '" + r + "'->'" + s + "'"); } else { return null; @@ -737,11 +737,11 @@ public final class plasmaCondenser { int p = s.indexOf(" "); if (p < 0) { r = s; - s = ""; + s = new StringBuffer(); return r; } - r = s.substring(0, p); - s = s.substring(p + 1).trim(); + r = trim(new StringBuffer(s.substring(0, p))); + s = trim(s.delete(0, p + 1)); return r; } @@ -757,6 +757,14 @@ public final class plasmaCondenser { } + public static StringBuffer trim(StringBuffer sb) { + synchronized (sb) { + while ((sb.length() > 0) && (sb.charAt(0) <= ' ')) sb = sb.deleteCharAt(0); + while ((sb.length() > 0) && (sb.charAt(sb.length() - 1) <= ' ')) sb = sb.deleteCharAt(sb.length() - 1); + } + return sb; + } + public static sentencesFromInputStreamEnum sentencesFromInputStream(InputStream is, String charset) { try { return new sentencesFromInputStreamEnum(is, charset); @@ -767,9 +775,9 @@ public final class plasmaCondenser { public static class sentencesFromInputStreamEnum implements Iterator { // read sentences from a given input stream - // this enumerates String objects + // this enumerates StringBuffer objects - Object buffer = null; + StringBuffer buffer = null; BufferedReader raf; int counter = 0; boolean pre = false; @@ -785,9 +793,9 @@ public final class plasmaCondenser { this.pre = x; } - private Object nextElement0() { + private StringBuffer nextElement0() { try { - String s = readSentence(raf, pre); + StringBuffer s = readSentence(raf, pre); //System.out.println(" SENTENCE='" + s + "'"); // DEBUG if (s == null) { raf.close(); @@ -811,8 +819,8 @@ public final class plasmaCondenser { if (buffer == null) { return null; } else { - counter = counter + ((String) buffer).length() + 1; - Object r = buffer; + counter = counter + buffer.length() + 1; + StringBuffer r = buffer; buffer = nextElement0(); return r; } @@ -827,7 +835,7 @@ public final class plasmaCondenser { } } - static String readSentence(Reader reader, boolean pre) throws IOException { + static StringBuffer readSentence(Reader reader, boolean pre) throws IOException { StringBuffer s = new StringBuffer(); int nextChar; char c; @@ -854,8 +862,7 @@ public final class plasmaCondenser { } // remove all double-spaces int p; while ((p = s.indexOf(" ")) >= 0) s.deleteCharAt(p); - return new String(s); - + return s; } public static Map getWords(byte[] text, String charset) throws UnsupportedEncodingException { diff --git a/source/de/anomic/plasma/plasmaParser.java b/source/de/anomic/plasma/plasmaParser.java index 1c06181f5..c5b819819 100644 --- a/source/de/anomic/plasma/plasmaParser.java +++ b/source/de/anomic/plasma/plasmaParser.java @@ -958,7 +958,7 @@ public final class plasmaParser { int i = 0; if (sentences != null) while (sentences.hasNext()) { System.out.print("line " + i + ": "); - System.out.println((String) sentences.next()); + System.out.println(((StringBuffer) sentences.next()).toString()); i++; } diff --git a/source/de/anomic/plasma/plasmaSnippetCache.java b/source/de/anomic/plasma/plasmaSnippetCache.java index 00df6509d..ce20ef7d5 100644 --- a/source/de/anomic/plasma/plasmaSnippetCache.java +++ b/source/de/anomic/plasma/plasmaSnippetCache.java @@ -465,15 +465,15 @@ public class plasmaSnippetCache { Iterator j; HashMap hs; String hash; - String sentence; + StringBuffer sentence; TreeMap os = new TreeMap(); int uniqCounter = 9999; int score; while (sentences.hasNext()) { - sentence = (String) sentences.next(); + sentence = (StringBuffer) sentences.next(); //System.out.println("Snippet-Sentence :" + sentence); // DEBUG if (sentence.length() > minLength) { - hs = hashSentence(sentence); + hs = hashSentence(sentence.toString()); j = queryhashes.iterator(); score = 0; while (j.hasNext()) { @@ -492,8 +492,8 @@ public class plasmaSnippetCache { String result; Set remaininghashes; while (os.size() > 0) { - sentence = (String) os.remove((Integer) os.lastKey()); // sentence with the biggest score - result = computeTextSnippet(sentence, queryhashes, minLength, maxLength); + sentence = (StringBuffer) os.remove((Integer) os.lastKey()); // sentence with the biggest score + result = computeTextSnippet(sentence.toString(), queryhashes, minLength, maxLength); if ((result != null) && (result.length() > 0)) { remaininghashes = removeAppearanceHashes(result, queryhashes); if (remaininghashes.size() == 0) { @@ -688,10 +688,10 @@ public class plasmaSnippetCache { HashMap map = new HashMap(); Enumeration words = plasmaCondenser.wordTokenizer(sentence, "UTF-8", 0); int pos = 0; - String word; + StringBuffer word; while (words.hasMoreElements()) { - word = (String) words.nextElement(); - map.put(plasmaCondenser.word2hash(word), new Integer(pos)); + word = (StringBuffer) words.nextElement(); + map.put(plasmaCondenser.word2hash(new String(word)), new Integer(pos)); pos += word.length() + 1; } return map; diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index a304706e6..cd2485ebc 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -814,7 +814,11 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser this.defaultProxyProfile = this.profiles.newEntry("proxy", "", ".*", ".*", Integer.parseInt(getConfig("proxyPrefetchDepth", "0")), Integer.parseInt(getConfig("proxyPrefetchDepth", "0")), - 60 * 24, -1, -1, false, true, true, true, true, getConfigBool("proxyCrawlOrder", false), true, true, true); + 60 * 24, -1, -1, false, + getConfigBool("proxyIndexingLocalText", true), + getConfigBool("proxyIndexingLocalMedia", true), + true, true, + getConfigBool("proxyIndexingRemote", false), true, true, true); } if (this.defaultRemoteProfile == null) { // generate new default entry for remote crawling diff --git a/yacy.init b/yacy.init index 64a54278f..42100230b 100644 --- a/yacy.init +++ b/yacy.init @@ -426,7 +426,9 @@ defaultLinkReceiveFrequency=30 # of 2 would result in hundreds of prefetched URLs for each single proxy fill. proxyPrefetchDepth=0 proxyStoreHTCache=true -proxyCrawlOrder=false +proxyIndexingRemote=false +proxyIndexingLocalText=true +proxyIndexingLocalMedia=true # From the 'IndexCreate' menu point you can also define a crawling start point. # The crawling works the same way as the prefetch, but it is possible to