enhanced computation speed of many replaceAll string operations

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7107 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 15 years ago
parent e8228fba09
commit 22047ffad5

@ -40,7 +40,6 @@ import net.yacy.cora.document.MultiProtocolURI;
public class RobotsEntry {
public static final String ROBOTS_DB_PATH_SEPARATOR = ";";
public static final String HOST_NAME = "hostname";
public static final String ALLOW_PATH_LIST = "allow";
public static final String DISALLOW_PATH_LIST = "disallow";
@ -64,7 +63,7 @@ public class RobotsEntry {
this.denyPathList = new LinkedList<String>();
final String csPl = new String(this.mem.get(DISALLOW_PATH_LIST));
if (csPl.length() > 0){
final String[] pathArray = csPl.split(ROBOTS_DB_PATH_SEPARATOR);
final String[] pathArray = csPl.split(RobotsTxt.ROBOTS_DB_PATH_SEPARATOR);
if ((pathArray != null)&&(pathArray.length > 0)) {
this.denyPathList.addAll(Arrays.asList(pathArray));
}
@ -76,7 +75,7 @@ public class RobotsEntry {
this.allowPathList = new LinkedList<String>();
final String csPl = new String(this.mem.get(ALLOW_PATH_LIST));
if (csPl.length() > 0){
final String[] pathArray = csPl.split(ROBOTS_DB_PATH_SEPARATOR);
final String[] pathArray = csPl.split(RobotsTxt.ROBOTS_DB_PATH_SEPARATOR);
if ((pathArray != null)&&(pathArray.length > 0)) {
this.allowPathList.addAll(Arrays.asList(pathArray));
}
@ -116,7 +115,7 @@ public class RobotsEntry {
final StringBuilder pathListStr = new StringBuilder(allowPathList.size() * 30);
for (String element : allowPathList) {
pathListStr.append(element)
.append(ROBOTS_DB_PATH_SEPARATOR);
.append(RobotsTxt.ROBOTS_DB_PATH_SEPARATOR);
}
this.mem.put(ALLOW_PATH_LIST, pathListStr.substring(0,pathListStr.length()-1).getBytes());
}
@ -127,7 +126,7 @@ public class RobotsEntry {
final StringBuilder pathListStr = new StringBuilder(disallowPathList.size() * 30);
for (String element : disallowPathList) {
pathListStr.append(element)
.append(ROBOTS_DB_PATH_SEPARATOR);
.append(RobotsTxt.ROBOTS_DB_PATH_SEPARATOR);
}
this.mem.put(DISALLOW_PATH_LIST,pathListStr.substring(0, pathListStr.length()-1).getBytes());
}
@ -197,7 +196,7 @@ public class RobotsEntry {
// if the path is null or empty we set it to /
if ((path == null) || (path.length() == 0)) path = "/";
// escaping all occurrences of ; because this char is used as special char in the Robots DB
else path = path.replaceAll(ROBOTS_DB_PATH_SEPARATOR,"%3B");
else path = RobotsTxt.ROBOTS_DB_PATH_SEPARATOR_MATCHER.matcher(path).replaceAll("%3B");
for (String element : this.denyPathList) {

@ -32,6 +32,7 @@ import java.util.ArrayList;
import java.util.Date;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.regex.Pattern;
import org.apache.log4j.Logger;
@ -50,6 +51,7 @@ public class RobotsTxt {
private static Logger log = Logger.getLogger(RobotsTxt.class);
public static final String ROBOTS_DB_PATH_SEPARATOR = ";";
public static final Pattern ROBOTS_DB_PATH_SEPARATOR_MATCHER = Pattern.compile(ROBOTS_DB_PATH_SEPARATOR);
BEncodedHeap robotsTable;
private final ConcurrentHashMap<String, DomSync> syncObjects;

@ -33,6 +33,7 @@ import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.regex.Pattern;
/*
* A class for Parsing robots.txt files.
@ -55,6 +56,8 @@ import java.util.ArrayList;
public final class robotsParser {
private static final Pattern patternTab = Pattern.compile("\t");
public static final String ROBOTS_USER_AGENT = "User-agent:".toUpperCase();
public static final String ROBOTS_DISALLOW = "Disallow:".toUpperCase();
public static final String ROBOTS_ALLOW = "Allow:".toUpperCase();
@ -109,7 +112,7 @@ public final class robotsParser {
try {
lineparser: while ((line = reader.readLine()) != null) {
// replacing all tabs with spaces
line = line.replaceAll("\t"," ").trim();
line = patternTab.matcher(line).replaceAll(" ").trim();
lineUpper = line.toUpperCase();
// parse empty line
@ -218,7 +221,7 @@ public final class robotsParser {
}
// escaping all occurrences of ; because this char is used as special char in the Robots DB
path = path.replaceAll(RobotsTxt.ROBOTS_DB_PATH_SEPARATOR,"%3B");
path = RobotsTxt.ROBOTS_DB_PATH_SEPARATOR_MATCHER.matcher(path).replaceAll("%3B");
// adding it to the pathlist
if (isDisallowRule) {

@ -67,6 +67,8 @@ import de.anomic.search.MetadataRepository.Export;
public class URLAnalysis {
private static final Pattern patternMinus = Pattern.compile("-");
/**
* processes to analyse URL lists
*/
@ -99,7 +101,7 @@ public class URLAnalysis {
try {
url = in.take();
if (url == poison) break;
update(url.getHost().replaceAll("-", "\\.").split("\\."));
update(patternMinus.matcher(url.getHost()).replaceAll("\\.").split("\\."));
update(p.matcher(url.getPath()).replaceAll("/").split("/"));
} catch (InterruptedException e) {
Log.logException(e);

@ -51,6 +51,7 @@ import java.net.InetAddress;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Pattern;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.document.parser.html.CharacterCoding;
@ -62,6 +63,14 @@ import de.anomic.search.Switchboard;
public class serverObjects extends HashMap<String, String> implements Cloneable {
private final static Pattern patternNewline = Pattern.compile("\n");
private final static Pattern patternDoublequote = Pattern.compile("\"");
private final static Pattern patternSlash = Pattern.compile("/");
private final static Pattern patternB = Pattern.compile("\b");
private final static Pattern patternF = Pattern.compile("\f");
private final static Pattern patternR = Pattern.compile("\r");
private final static Pattern patternT = Pattern.compile("\t");
private static final long serialVersionUID = 1L;
private boolean localized = true;
@ -165,14 +174,14 @@ public class serverObjects extends HashMap<String, String> implements Cloneable
* @return the modified String that was added to the map.
*/
public String putJSON(final String key, String value) {
value = value.replaceAll("\"", "'");
value = value.replaceAll("/", "\\/");
// value = value.replaceAll("\\", "\\\\");
value = value.replaceAll("\b", "\\b");
value = value.replaceAll("\f", "\\f");
value = value.replaceAll("\n", "\\r");
value = value.replaceAll("\r", "\\r");
value = value.replaceAll("\t", "\\t");
value = patternDoublequote.matcher(value).replaceAll("'");
value = patternSlash.matcher(value).replaceAll("\\/");
value = patternB.matcher(value).replaceAll("\\b");
value = patternF.matcher(value).replaceAll("\\f");
value = patternNewline.matcher(value).replaceAll("\\r");
value = patternR.matcher(value).replaceAll("\\r");
value = patternT.matcher(value).replaceAll("\\t");
return put(key, value);
}
public String putJSON(final String key, final byte[] value) {
@ -333,7 +342,7 @@ public class serverObjects extends HashMap<String, String> implements Cloneable
String key, value;
for (Map.Entry<String, String> entry: entrySet()) {
key = entry.getKey();
value = entry.getValue().replaceAll("\n", "\\\\n");
value = patternNewline.matcher(entry.getValue()).replaceAll("\\\\n");
fos.write((key + "=" + value + "\r\n").getBytes());
}
} finally {

@ -0,0 +1,209 @@
package net.yacy.ai.example;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;
import net.yacy.ai.greedy.AbstractFinding;
import net.yacy.ai.greedy.AbstractModel;
import net.yacy.ai.greedy.Finding;
import net.yacy.ai.greedy.Model;
import net.yacy.ai.greedy.Role;
/**
 * Skeleton model of the card game "Schwarzer Peter" built on the
 * {@code net.yacy.ai.greedy} framework types ({@code Role}, {@code Finding}, {@code Model}).
 * <p>
 * The class bundles the card deck, the players (as roles), the hands and a
 * game model. Several model methods are still stubs, see the notes on {@link Spiel}.
 */
public class SchwarzerPeter {

    /** Card type; two cards of the same type count as a pair, {@code P} marks the "Schwarzer Peter". */
    public static enum Kartentyp {
        A, B, C, D, E, F, G, H, P;
    }

    /** Distinguishes the two cards that share one {@link Kartentyp}. */
    public static enum Kartenzaehler {
        p, q;
    }

    /** An immutable playing card, identified by its type and its counter. */
    public static class Karte {
        private final Kartentyp kartentyp;
        private final Kartenzaehler kartenzaehler;

        public Karte(Kartentyp kartentyp, Kartenzaehler kartenzaehler) {
            this.kartentyp = kartentyp;
            this.kartenzaehler = kartenzaehler;
        }

        /**
         * Two cards are equal when both type and counter match.
         *
         * @param obj the object to compare with; may be {@code null} or of a foreign type
         * @return {@code true} only if {@code obj} is a {@code Karte} with the same type and counter
         */
        @Override
        public boolean equals(Object obj) {
            if (this == obj) return true;
            // guard against null and foreign types instead of failing with an exception
            if (!(obj instanceof Karte)) return false;
            final Karte other = (Karte) obj;
            return this.kartentyp == other.kartentyp && this.kartenzaehler == other.kartenzaehler;
        }

        /** Hash code consistent with {@link #equals(Object)}. */
        @Override
        public int hashCode() {
            return 31 * this.kartentyp.hashCode() + this.kartenzaehler.hashCode();
        }

        /** @return {@code true} if this card has the type {@link Kartentyp#P} */
        public boolean istSchwarzerPeter() {
            return this.kartentyp == Kartentyp.P;
        }

        /**
         * @param k1 first card, must not be {@code null}
         * @param k2 second card, must not be {@code null}
         * @return {@code true} if both cards have the same type
         */
        public static boolean istPaar(Karte k1, Karte k2) {
            return k1.kartentyp == k2.kartentyp;
        }
    }

    /** The complete deck in construction order. */
    public static final List<Karte> alleKarten;
    static {
        alleKarten = new ArrayList<Karte>(33);
        // NOTE(review): this loop also covers Kartentyp.P, so together with the extra
        // card added below the deck holds three P cards (19 cards in total) - confirm
        // that this is the intended deck.
        for (Kartentyp typ : Kartentyp.values()) {
            alleKarten.add(new Karte(typ, Kartenzaehler.p));
            alleKarten.add(new Karte(typ, Kartenzaehler.q));
        }
        alleKarten.add(new Karte(Kartentyp.P, Kartenzaehler.p));
    }

    /**
     * Creates a new, randomly ordered stack containing every card of {@link #alleKarten}.
     *
     * @param r the random source that determines the order
     * @return a fresh list; {@link #alleKarten} itself is not modified
     */
    public static final List<Karte> neuerStapel(Random r) {
        final List<Karte> vorrat = new ArrayList<Karte>(alleKarten);
        final List<Karte> gemischt = new ArrayList<Karte>(vorrat.size());
        // draw cards from random positions until the supply is used up
        while (!vorrat.isEmpty()) {
            gemischt.add(vorrat.remove(r.nextInt(vorrat.size())));
        }
        return gemischt;
    }

    /** A player, identified by a number in the range {@code 0 .. spieleranzahl - 1}. */
    public static class Spieler implements Role {
        private final int spielernummer;
        private final int spieleranzahl;

        /**
         * @param spielernummer number of this player
         * @param spieleranzahl total number of players
         */
        public Spieler(int spielernummer, int spieleranzahl) {
            this.spielernummer = spielernummer;
            this.spieleranzahl = spieleranzahl;
        }

        /** @return the player with the next higher number, wrapping around to 0 after the last one */
        @Override
        public Spieler nextRole() {
            final int n = (this.spielernummer == this.spieleranzahl - 1) ? 0 : this.spielernummer + 1;
            return new Spieler(n, this.spieleranzahl);
        }

        /** @return the player with the next lower number, wrapping around to the last one before 0 */
        public Spieler linkerNachbar() {
            final int n = (this.spielernummer == 0) ? this.spieleranzahl - 1 : this.spielernummer - 1;
            return new Spieler(n, this.spieleranzahl);
        }

        /**
         * Players are equal when their numbers match; the number of players is not compared.
         *
         * @param obj the object to compare with; may be {@code null} or of a foreign type
         */
        @Override
        public boolean equals(Object obj) {
            if (this == obj) return true;
            if (!(obj instanceof Spieler)) return false;
            return this.spielernummer == ((Spieler) obj).spielernummer;
        }

        @Override
        public int hashCode() {
            return this.spielernummer;
        }
    }

    /** Determines how a {@link Hand} stores received cards and which card it gives away. */
    public static enum Strategy {
        nichtsortieren_linksziehen,
        nichtsortieren_zufallsziehen,
        sortieren_linksziehen,
        sortieren_zufallsziehen;
    }

    /** The cards held by one player, together with the strategy used to handle them. */
    public static class Hand extends ArrayList<Karte> {
        private static final long serialVersionUID = -5274023237476645059L;

        private final Strategy strategy;

        public Hand(Strategy strategy) {
            this.strategy = strategy;
        }

        /**
         * Takes a card into this hand.
         * <p>
         * With one of the {@code nichtsortieren_*} strategies the new card is put at a
         * random position and the card that was there moves to the end; with the other
         * strategies the card is appended.
         *
         * @param r     random source for the insert position
         * @param karte the card to take
         */
        public void annehmen(Random r, Karte karte) {
            final boolean zufallsposition = this.strategy == Strategy.nichtsortieren_linksziehen
                    || this.strategy == Strategy.nichtsortieren_zufallsziehen;
            // an empty hand has no position to swap with; Random.nextInt(0) would throw,
            // which made dealing the very first card impossible
            if (zufallsposition && !this.isEmpty()) {
                this.add(this.set(r.nextInt(this.size()), karte));
            } else {
                this.add(karte);
            }
        }

        /**
         * Removes one card from this hand and returns it: the first card for the
         * {@code *_linksziehen} strategies, a random one otherwise.
         *
         * @param r random source for the {@code *_zufallsziehen} strategies
         * @return the removed card
         * @throws IndexOutOfBoundsException if the hand is empty and the first card is requested
         * @throws IllegalArgumentException  if the hand is empty and a random card is requested
         */
        public Karte abgeben(Random r) {
            final boolean linksziehen = this.strategy == Strategy.nichtsortieren_linksziehen
                    || this.strategy == Strategy.sortieren_linksziehen;
            if (linksziehen) {
                return this.remove(0);
            }
            return this.remove(r.nextInt(this.size()));
        }

        /**
         * Placeholder for discarding pairs; currently does not touch the hand.
         *
         * @return always {@code true}
         */
        public boolean paerchenAblegen() {
            return true;
        }
    }

    /**
     * A move of a player. It carries no data of its own beyond what is passed to the
     * super class, so all instances are treated as equal.
     */
    public static class Zug extends AbstractFinding<Spieler> implements Finding<Spieler> {

        public Zug(Spieler spieler, int priority) {
            super(spieler, priority);
        }

        /** @return this very instance, no copy is made */
        @Override
        public Object clone() {
            return this;
        }

        /** @return always {@code true} */
        @Override
        public boolean equals(Object other) {
            return true;
        }

        /** @return always {@code 0}, consistent with {@link #equals(Object)} */
        @Override
        public int hashCode() {
            return 0;
        }
    }

    /**
     * The game state: one {@link Hand} per player.
     * <p>
     * {@link #getRanking(int, Spieler)}, both {@code isTermination} variants,
     * {@link #clone()}, {@link #equals(Object)} and {@link #hashCode()} are
     * unfinished stubs that return fixed values.
     */
    public static class Spiel extends AbstractModel<Spieler, Zug> implements Model<Spieler, Zug>, Cloneable {

        private final Hand[] haende;
        private final Random random;

        /**
         * Creates the hands and deals a freshly shuffled stack card by card, starting
         * with the given player and continuing with {@link Spieler#nextRole()}.
         *
         * @param spieler the player who receives the first card; also passed to the super class
         * @param r       random source for shuffling, dealing and all later moves
         */
        public Spiel(Spieler spieler, Random r) {
            super(spieler);
            this.random = r;
            final int anzahl = spieler.spieleranzahl;
            this.haende = new Hand[anzahl];
            for (int i = 0; i < anzahl; i++) {
                this.haende[i] = new Hand(Strategy.nichtsortieren_linksziehen);
            }
            final List<Karte> geben = neuerStapel(r);
            Spieler empfaenger = spieler;
            while (!geben.isEmpty()) {
                this.haende[empfaenger.spielernummer].annehmen(r, geben.remove(0));
                empfaenger = empfaenger.nextRole();
            }
        }

        /** @return an empty list; no moves are generated yet */
        @Override
        public List<Zug> explore() {
            return new ArrayList<Zug>(0);
        }

        /**
         * Lets the current role take one card from the hand of its left neighbour.
         *
         * @param finding the move to apply; its content is not evaluated
         */
        @Override
        public void applyFinding(Zug finding) {
            final Spieler ziehender = this.currentRole();
            final Karte gezogen = this.haende[ziehender.linkerNachbar().spielernummer].abgeben(this.random);
            this.haende[ziehender.spielernummer].annehmen(this.random, gezogen);
        }

        /** Stub: always returns {@code 0}. */
        @Override
        public int getRanking(int findings, Spieler role) {
            // TODO Auto-generated method stub
            return 0;
        }

        /** Stub: always returns {@code false}. */
        @Override
        public boolean isTermination(Spieler role) {
            // TODO Auto-generated method stub
            return false;
        }

        /** Stub: always returns {@code null}. */
        @Override
        public Spieler isTermination() {
            // TODO Auto-generated method stub
            return null;
        }

        /** Stub: always returns {@code null}. */
        @Override
        public Object clone() {
            // TODO Auto-generated method stub
            return null;
        }

        /** Stub: always returns {@code false}. */
        @Override
        public boolean equals(Object other) {
            // TODO Auto-generated method stub
            return false;
        }

        /** Stub: always returns {@code 0}. */
        @Override
        public int hashCode() {
            // TODO Auto-generated method stub
            return 0;
        }
    }
}

@ -56,6 +56,7 @@ public class MultiProtocolURI implements Serializable {
private static final Pattern backPathPattern = Pattern.compile("(/[^/]+(?<!/\\.{1,2})/)[.]{2}(?=/|$)|/\\.(?=/)|/(?=/)");
private static final Pattern patternDot = Pattern.compile("\\.");
private static final Pattern patternSlash = Pattern.compile("/");
private static final Pattern patternBackSlash = Pattern.compile("\\\\");
private static final Pattern patternAmp = Pattern.compile("&");
private static final Pattern patternMail = Pattern.compile("^[a-z]+:.*?");
@ -116,7 +117,7 @@ public class MultiProtocolURI implements Serializable {
assert (url != null);
url = url.trim();
if (url.startsWith("\\\\")) {
url = "smb://" + url.substring(2).replaceAll("\\\\", "/");
url = "smb://" + patternBackSlash.matcher(url.substring(2)).replaceAll("/");
}
if (url.length() > 1 && url.charAt(1) == ':') {
@ -684,10 +685,11 @@ public class MultiProtocolURI implements Serializable {
return toNormalform(excludeReference, stripAmp, false);
}
private static final Pattern ampPattern = Pattern.compile("&amp;");
public String toNormalform(final boolean excludeReference, final boolean stripAmp, final boolean removeSessionID) {
String result = toNormalform0(excludeReference, removeSessionID);
if (stripAmp) {
result = result.replaceAll("&amp;", "&");
result = ampPattern.matcher(result).replaceAll("&");
}
return result;
}

@ -27,6 +27,7 @@ import java.net.MalformedURLException;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;
import java.util.regex.Pattern;
import de.anomic.crawler.retrieval.HTTPLoader;
@ -43,6 +44,8 @@ import net.yacy.kelondro.util.FileUtils;
public class htmlParser extends AbstractParser implements Parser {
private static final Pattern patternUnderline = Pattern.compile("_");
public htmlParser() {
super("HTML Parser");
SUPPORTED_EXTENSIONS.add("htm");
@ -176,7 +179,7 @@ public class htmlParser extends AbstractParser implements Parser {
if (encoding.startsWith("MACINTOSH")) encoding = "MacRoman";
// fix wrong fill characters
encoding = encoding.replaceAll("_", "-");
encoding = patternUnderline.matcher(encoding).replaceAll("-");
if (encoding.matches("GB[_-]?2312([-_]80)?")) return "GB2312";
if (encoding.matches(".*UTF[-_]?8.*")) return "UTF-8";

Loading…
Cancel
Save