- the wiki parser is now configurable via the config setting wikiParser.class, which holds the class name of the parser to use

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3742 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
karlchenofhell 18 years ago
parent 601fc7d1c5
commit baa9402b97

@ -55,7 +55,6 @@ import java.util.Iterator;
import java.util.Map; import java.util.Map;
import java.util.Properties; import java.util.Properties;
import de.anomic.data.wikiCode;
import de.anomic.http.httpHeader; import de.anomic.http.httpHeader;
import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaSwitchboard;
@ -73,7 +72,6 @@ public class ViewProfile {
// listManager.switchboard = (plasmaSwitchboard) env; // listManager.switchboard = (plasmaSwitchboard) env;
serverObjects prop = new serverObjects(); serverObjects prop = new serverObjects();
plasmaSwitchboard switchboard = (plasmaSwitchboard) env; plasmaSwitchboard switchboard = (plasmaSwitchboard) env;
wikiCode wikiTransformer = new wikiCode(switchboard);
boolean authenticated = switchboard.adminAuthenticated(header) >= 2; boolean authenticated = switchboard.adminAuthenticated(header) >= 2;
int display = ((post == null) || (!authenticated)) ? 0 : post.getInt("display", 0); int display = ((post == null) || (!authenticated)) ? 0 : post.getInt("display", 0);
prop.put("display", display); prop.put("display", display);
@ -175,9 +173,9 @@ public class ViewProfile {
prop.put("success_" + key, 1); prop.put("success_" + key, 1);
// only comments get "wikified" // only comments get "wikified"
if(key.equals("comment")){ if(key.equals("comment")){
prop.putASIS( prop.putWiki(
"success_" + key + "_value", "success_" + key + "_value",
wikiTransformer.transform(((String) entry.getValue()).replaceAll("\r", "").replaceAll("\\\\n", "\n")) ((String) entry.getValue()).replaceAll("\r", "").replaceAll("\\\\n", "\n")
); );
prop.putASIS("success_" + key + "_b64value",kelondroBase64Order.standardCoder.encodeString((String) entry.getValue())); prop.putASIS("success_" + key + "_b64value",kelondroBase64Order.standardCoder.encodeString((String) entry.getValue()));
}else{ }else{

@ -62,34 +62,12 @@ import de.anomic.yacy.yacyCore;
public class knwikiParser implements wikiParser { public class knwikiParser implements wikiParser {
public final Token[] tokens; public Token[] tokens;
private final String[] BEs; private String[] BEs;
private final plasmaSwitchboard sb;
public knwikiParser(plasmaSwitchboard sb) { public knwikiParser(plasmaSwitchboard sb) {
tokens = new Token[] { this.sb = sb;
new SimpleToken('=', '=', new String[][] { null, { "h2" }, { "h3" }, { "h4" } }, true),
new SimpleToken('\'', '\'', new String[][] { null, { "i" }, { "b" }, null, { "b", "i" } }, false),
new LinkToken(yacyCore.seedDB.mySeed.getPublicAddress(), "Wiki.html?page=", sb),
new ListToken('*', "ul"),
new ListToken('#', "ol"),
new ListToken(':', "blockquote", null),
new ListToken(' ', null, "tt", false),
new DefinitionListToken(),
new TableToken()
};
ArrayList r = new ArrayList();
for (int i=0, k, j; i<tokens.length; i++)
if (tokens[i].getBlockElementNames() != null)
for (j=0; j<tokens[i].getBlockElementNames().length; j++) {
if (tokens[i].getBlockElementNames()[j] == null) continue;
if ((k = tokens[i].getBlockElementNames()[j].indexOf(' ')) > 1) {
r.add(tokens[i].getBlockElementNames()[j].substring(0, k));
} else {
r.add(tokens[i].getBlockElementNames()[j]);
}
}
r.add("hr");
BEs = (String[])r.toArray(new String[r.size()]);
} }
public static void main(String[] args) { public static void main(String[] args) {
@ -157,6 +135,31 @@ public class knwikiParser implements wikiParser {
} }
public String parse(String text) { public String parse(String text) {
tokens = new Token[] {
new SimpleToken('=', '=', new String[][] { null, { "h2" }, { "h3" }, { "h4" } }, true),
new SimpleToken('\'', '\'', new String[][] { null, { "i" }, { "b" }, null, { "b", "i" } }, false),
new LinkToken(yacyCore.seedDB.mySeed.getPublicAddress(), "Wiki.html?page=", sb),
new ListToken('*', "ul"),
new ListToken('#', "ol"),
new ListToken(':', "blockquote", null),
new ListToken(' ', null, "tt", false),
new DefinitionListToken(),
new TableToken()
};
ArrayList r = new ArrayList();
for (int i=0, k, j; i<tokens.length; i++)
if (tokens[i].getBlockElementNames() != null)
for (j=0; j<tokens[i].getBlockElementNames().length; j++) {
if (tokens[i].getBlockElementNames()[j] == null) continue;
if ((k = tokens[i].getBlockElementNames()[j].indexOf(' ')) > 1) {
r.add(tokens[i].getBlockElementNames()[j].substring(0, k));
} else {
r.add(tokens[i].getBlockElementNames()[j]);
}
}
r.add("hr");
BEs = (String[])r.toArray(new String[r.size()]);
Text[] tt = Text.split2Texts(text, "[=", "=]"); Text[] tt = Text.split2Texts(text, "[=", "=]");
for (int i=0; i<tt.length; i+=2) for (int i=0; i<tt.length; i+=2)
tt[i].setText(parseUnescaped(tt[i].getText())); tt[i].setText(parseUnescaped(tt[i].getText()));

@ -384,6 +384,16 @@ public final class plasmaCondenser {
int wordInSentenceCounter = 1; int wordInSentenceCounter = 1;
Iterator it, it1; Iterator it, it1;
boolean comb_indexof = false, last_last = false, last_index = false; boolean comb_indexof = false, last_last = false, last_index = false;
RandomAccessFile fa;
final boolean dumpWords = false;
if (dumpWords) try {
fa = new RandomAccessFile(new File("dump.txt"), "rw");
fa.seek(fa.length());
} catch (IOException e) {
e.printStackTrace();
fa = null;
}
// read source // read source
sievedWordsEnum wordenum = new sievedWordsEnum(is, charset, wordminsize); sievedWordsEnum wordenum = new sievedWordsEnum(is, charset, wordminsize);
@ -392,19 +402,12 @@ public final class plasmaCondenser {
//System.out.println("PARSED-WORD " + word); //System.out.println("PARSED-WORD " + word);
//This is useful for testing what YaCy "sees" of a website. //This is useful for testing what YaCy "sees" of a website.
if (false) { if (dumpWords && fa != null) try {
File f = new File("dump.txt"); fa.writeBytes(word);
RandomAccessFile fa = null; fa.write(160);
try { } catch (IOException e) {
fa = new RandomAccessFile(f, "rw"); e.printStackTrace();
fa.seek(fa.length()); }
fa.writeBytes(word);
fa.write(160);
fa.close();
} catch (IOException e) {
e.printStackTrace();
}
}
// distinguish punctuation and words // distinguish punctuation and words
wordlen = word.length(); wordlen = word.length();
@ -479,6 +482,13 @@ public final class plasmaCondenser {
sentences.put(sentence, new phraseStatProp(sentenceHandleCount++)); sentences.put(sentence, new phraseStatProp(sentenceHandleCount++));
} }
} }
if (dumpWords && fa != null) try {
fa.write('\n');
fa.close();
} catch (IOException e) {
e.printStackTrace();
}
// ------------------- // -------------------

@ -128,6 +128,7 @@ import de.anomic.data.listManager;
import de.anomic.data.messageBoard; import de.anomic.data.messageBoard;
import de.anomic.data.userDB; import de.anomic.data.userDB;
import de.anomic.data.wikiBoard; import de.anomic.data.wikiBoard;
import de.anomic.data.wiki.wikiParser;
import de.anomic.htmlFilter.htmlFilterContentScraper; import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.http.httpHeader; import de.anomic.http.httpHeader;
import de.anomic.http.httpRemoteProxyConfig; import de.anomic.http.httpRemoteProxyConfig;
@ -200,6 +201,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
public static TreeSet stopwords = null; public static TreeSet stopwords = null;
public static plasmaURLPattern urlBlacklist; public static plasmaURLPattern urlBlacklist;
public static wikiParser wikiParser = null;
// storage management // storage management
public File htCachePath; public File htCachePath;
private File plasmaPath; private File plasmaPath;
@ -672,6 +675,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
public static final String ROBOTS_TXT = "httpd.robots.txt"; public static final String ROBOTS_TXT = "httpd.robots.txt";
public static final String ROBOTS_TXT_DEFAULT = httpdRobotsTxtConfig.LOCKED + "," + httpdRobotsTxtConfig.DIRS; public static final String ROBOTS_TXT_DEFAULT = httpdRobotsTxtConfig.LOCKED + "," + httpdRobotsTxtConfig.DIRS;
public static final String WIKIPARSER_CLASS = "wikiParser.class";
public static final String WIKIPARSER_CLASS_DEFAULT = "de.anomic.data.wikiCode";
////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////
// Lists // Lists
////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////
@ -963,7 +969,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
ppRamString(stopwordsFile.length()/1024)); ppRamString(stopwordsFile.length()/1024));
} }
// load ranking tables // load ranking tables
File YBRPath = new File(rootPath, "ranking/YBR"); File YBRPath = new File(rootPath, "ranking/YBR");
if (YBRPath.exists()) { if (YBRPath.exists()) {
plasmaSearchPreOrder.loadYBR(YBRPath, 15); plasmaSearchPreOrder.loadYBR(YBRPath, 15);
@ -1219,6 +1225,16 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// System.gc(); try{Thread.currentThread().sleep(5000);} catch (InterruptedException e) {} // for profiler // System.gc(); try{Thread.currentThread().sleep(5000);} catch (InterruptedException e) {} // for profiler
serverInstantThread.oneTimeJob(yc, "loadSeeds", yacyCore.log, 3000); serverInstantThread.oneTimeJob(yc, "loadSeeds", yacyCore.log, 3000);
String wikiParserClassName = getConfig(WIKIPARSER_CLASS, WIKIPARSER_CLASS_DEFAULT);
this.log.logConfig("Loading wiki parser " + wikiParserClassName + " ...");
try {
Class wikiParserClass = Class.forName(wikiParserClassName);
Constructor wikiParserClassConstr = wikiParserClass.getConstructor(new Class[] { plasmaSwitchboard.class });
wikiParser = (wikiParser)wikiParserClassConstr.newInstance(new Object[] { this });
} catch (Exception e) {
this.log.logSevere("Unable to load wiki parser, the wiki won't work", e);
}
// initializing the stackCrawlThread // initializing the stackCrawlThread
this.sbStackCrawlThread = new plasmaCrawlStacker(this, this.plasmaPath, ramPreNURL_time, (int) getConfigLong("tableTypeForPreNURL", 0)); this.sbStackCrawlThread = new plasmaCrawlStacker(this, this.plasmaPath, ramPreNURL_time, (int) getConfigLong("tableTypeForPreNURL", 0));
//this.sbStackCrawlThread = new plasmaStackCrawlThread(this,this.plasmaPath,ramPreNURL); //this.sbStackCrawlThread = new plasmaStackCrawlThread(this,this.plasmaPath,ramPreNURL);

@ -68,7 +68,6 @@ import java.util.Hashtable;
import java.util.Map; import java.util.Map;
import de.anomic.data.htmlTools; import de.anomic.data.htmlTools;
import de.anomic.data.wikiCode;
import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaSwitchboard;
public class serverObjects extends Hashtable implements Cloneable { public class serverObjects extends Hashtable implements Cloneable {
@ -124,15 +123,13 @@ public class serverObjects extends Hashtable implements Cloneable {
return (String) this.put(key, (Object) value); return (String) this.put(key, (Object) value);
} }
public String putWiki(Object key, String wikiCode){ public String putWiki(Object key, String wikiCode){
//XXX: This is inefficient if a lot of wikiCode is used on the same page. return this.putASIS(key, plasmaSwitchboard.wikiParser.transform(wikiCode));
//TODO: Cache the wikiCode Object?
return this.putASIS(key, (new wikiCode(plasmaSwitchboard.getSwitchboard())).transform(wikiCode));
} }
public String putWiki(Object key, byte[] wikiCode) { public String putWiki(Object key, byte[] wikiCode) {
try { try {
return this.putASIS(key, (new wikiCode(plasmaSwitchboard.getSwitchboard())).transform(wikiCode)); return this.putASIS(key, plasmaSwitchboard.wikiParser.transform(wikiCode));
} catch (UnsupportedEncodingException e) { } catch (UnsupportedEncodingException e) {
return "Internal error: no UTF-8 supported"; return this.putASIS(key, "Internal error pasting wiki-code: " + e.getMessage());
} }
} }

@ -13,7 +13,7 @@
# e.g. 8080 # e.g. 8080
# #eth0:8080 # #eth0:8080
# 192.168.0.1:8080 # 192.168.0.1:8080
port = 8080 port = 8085
#sometimes you may want yacy to bind to a different port than the one reachable from outside. #sometimes you may want yacy to bind to a different port than the one reachable from outside.
#then set bindPort to the port yacy should bind on, and port to the port visible from outside #then set bindPort to the port yacy should bind on, and port to the port visible from outside
@ -909,3 +909,6 @@ thumbnailProgram =
# - surftips : the surftips-page # - surftips : the surftips-page
# - wiki : the wiki-page # - wiki : the wiki-page
httpd.robots.txt = locked,dirs httpd.robots.txt = locked,dirs
# class to use for parsing wikicode
wikiParser.class = de.anomic.data.wikiCode

Loading…
Cancel
Save