diff --git a/source/de/anomic/htmlFilter/htmlFilterContentTransformer.java b/source/de/anomic/htmlFilter/htmlFilterContentTransformer.java
index 5cd92349b..d2429a2cc 100644
--- a/source/de/anomic/htmlFilter/htmlFilterContentTransformer.java
+++ b/source/de/anomic/htmlFilter/htmlFilterContentTransformer.java
@@ -52,6 +52,8 @@ import java.util.ArrayList;
import java.util.Locale;
import java.util.Properties;
import java.util.TreeSet;
+
+import de.anomic.http.httpTemplate;
import de.anomic.server.serverByteBuffer;
public class htmlFilterContentTransformer extends htmlFilterAbstractTransformer implements htmlFilterTransformer {
@@ -69,20 +71,24 @@ public class htmlFilterContentTransformer extends htmlFilterAbstractTransformer
+ // Fills the two case-insensitive tag-name sets that the constructor hands to the superclass.
+ // linkTags0 holds the names that transformTag0 distinguishes (img, input); linkTags1 holds "a".
static {
linkTags0 = new TreeSet(insensitiveCollator);
linkTags0.add("img");
+ // input is registered so that transformTag0 can rewrite the value of submit buttons
+ linkTags0.add("input");
linkTags1 = new TreeSet(insensitiveCollator);
linkTags1.add("a");
}
- private static ArrayList bluelist = null;
+ // per-instance list of bluelisted words; init() creates it only when its argument is not "gettext",
+ // so it is still null after init("gettext") or when init() was never called
+ private ArrayList bluelist = null;
+ // set to true by init("gettext"); switches transformText to generating _( ... ) quotations
+ private boolean gettext = false;
+ /**
+ * Creates a transformer and passes the static tag-name sets linkTags0 and linkTags1
+ * to the superclass constructor. The instance starts with no bluelist and gettext mode off.
+ */
public htmlFilterContentTransformer() {
super(linkTags0, linkTags1);
}
public void init(String initarg) {
-// System.out.println("Transformer init: " + initarg);
- if (bluelist == null) {
+ if (initarg.equals("gettext")) {
+ // the initarg declares that the transformer applies a gettext-quotation on strings
+ gettext = true;
+ } else if (bluelist == null) {
// here, the initarg is used to load a list of bluelisted words
bluelist = new ArrayList();
File f = new File(initarg);
@@ -102,7 +108,7 @@ public class htmlFilterContentTransformer extends htmlFilterAbstractTransformer
}
+ /**
+ * Tells whether this transformer leaves its input unchanged.
+ *
+ * @return true if no bluelist words are loaded and gettext mode is off
+ */
public boolean isIdentityTransformer() {
- return bluelist.size() == 0;
+ // bluelist is null when init() was never called or was called with "gettext";
+ // a missing list counts as an empty one instead of raising a NullPointerException
+ return ((bluelist == null) || (bluelist.size() == 0)) && (!gettext);
}
private static byte[] genBlueLetters(int length) {
@@ -116,7 +122,7 @@ public class htmlFilterContentTransformer extends htmlFilterAbstractTransformer
return bb.getBytes();
}
- private boolean hit(byte[] text) {
+ private boolean bluelistHit(byte[] text) {
if (text == null || bluelist == null) return false;
String lc;
try {
@@ -131,22 +137,61 @@ public class htmlFilterContentTransformer extends htmlFilterAbstractTransformer
}
+ /**
+ * Transforms a piece of text.
+ * In gettext mode the text is trimmed, split with httpTemplate.splitQuotations and re-assembled:
+ * a fragment that is empty after trimming becomes a single space, a fragment that starts with
+ * '#' or "::" is copied unchanged, and every other fragment is wrapped as _(fragment).
+ * Outside gettext mode the text is replaced by genBlueLetters(text.length) when bluelistHit
+ * reports a hit and is returned unchanged otherwise.
+ *
+ * @param text the text bytes to transform
+ * @return the transformed bytes, or text itself when nothing applies
+ */
public byte[] transformText(byte[] text) {
- if (hit(text)) {
-// System.out.println("FILTERHIT: " + text);
- return genBlueLetters(text.length);
+ if (gettext) {
+ serverByteBuffer sbb = new serverByteBuffer(text);
+ sbb.trim();
+ //if (sbb.length() > 0) System.out.println(" TEXT: " + sbb.toString());
+ serverByteBuffer[] sbbs = httpTemplate.splitQuotations(sbb);
+ // sbb is reused as the output buffer from here on
+ sbb = new serverByteBuffer();
+ for (int i = 0; i < sbbs.length; i++) {
+ sbbs[i].trim();
+ if (sbbs[i].length() == 0) {
+ sbb.append(' ');
+ } else if ((sbbs[i].byteAt(0) == httpTemplate.hash) ||
+ (sbbs[i].startsWith(httpTemplate.dpdpa))) {
+ // this is a template or a part of a template
+ sbb.append(sbbs[i]);
+ } else {
+ // this is a text fragment, generate gettext quotation
+ sbb.append('_');
+ sbb.append('(');
+ sbb.append(sbbs[i]);
+ sbb.append(')');
+ }
+ }
+ //if (sbb.length() > 0) System.out.println("GETTEXT: " + sbb.toString());
+ return sbb.getBytes();
+ }
+ // bluelist mode; gettext mode has already returned above
+ if (bluelist != null) {
+ if (bluelistHit(text)) {
+ // System.out.println("FILTERHIT: " + text);
+ return genBlueLetters(text.length);
+ } else {
+ return text;
+ }
}
return text;
}
+ /**
+ * Rewrites a tag that is passed without text (img and input are registered for this callback).
+ * For img tags: returns genBlueLetters(5) when the src or alt attribute hits the bluelist;
+ * otherwise the alt attribute is replaced by its transformText result.
+ * For input tags whose type is submit: the value attribute is replaced by its transformText result.
+ * Tag names and the type value are compared case-insensitively; an input tag without a type
+ * attribute is left untouched.
+ *
+ * @param tagname name of the tag
+ * @param tagopts attributes of the tag; alt and value may be overwritten in place
+ * @param quotechar handed on to htmlFilterOutputStream.genTag0
+ * @return the bytes that replace the tag
+ */
public byte[] transformTag0(String tagname, Properties tagopts, byte quotechar) {
- if (hit(tagopts.getProperty("src","").getBytes())) return genBlueLetters(5);
- if (hit(tagopts.getProperty("alt","").getBytes())) return genBlueLetters(5);
+ if ("img".equalsIgnoreCase(tagname)) {
+ // check bluelist
+ if (bluelistHit(tagopts.getProperty("src","").getBytes())) return genBlueLetters(5);
+ if (bluelistHit(tagopts.getProperty("alt","").getBytes())) return genBlueLetters(5);
+
+ // replace image alternative name
+ tagopts.setProperty("alt", new String(transformText(tagopts.getProperty("alt","").getBytes())));
+ }
+ // getProperty("type") is null for an input tag without a type attribute, so the comparison
+ // starts from the constant to avoid a NullPointerException
+ if (("input".equalsIgnoreCase(tagname)) && ("submit".equalsIgnoreCase(tagopts.getProperty("type")))) {
+ // rewrite button name
+ tagopts.setProperty("value", new String(transformText(tagopts.getProperty("value","").getBytes())));
+ }
return htmlFilterOutputStream.genTag0(tagname, tagopts, quotechar);
}
+ /**
+ * Rewrites a tag that is passed together with its text.
+ * Returns genBlueLetters(text.length) when the href attribute or the text hits the bluelist;
+ * otherwise returns the result of htmlFilterOutputStream.genTag1 for the unchanged arguments.
+ *
+ * @param tagname name of the tag
+ * @param tagopts attributes of the tag
+ * @param text the text passed along with the tag
+ * @param quotechar handed on to htmlFilterOutputStream.genTag1
+ * @return the bytes that replace the tag and its text
+ */
public byte[] transformTag1(String tagname, Properties tagopts, byte[] text, byte quotechar) {
- if (hit(tagopts.getProperty("href","").getBytes())) return genBlueLetters(text.length);
- if (hit(text)) return genBlueLetters(text.length);
+ if (bluelistHit(tagopts.getProperty("href","").getBytes())) return genBlueLetters(text.length);
+ if (bluelistHit(text)) return genBlueLetters(text.length);
return htmlFilterOutputStream.genTag1(tagname, tagopts, text, quotechar);
}
diff --git a/source/de/anomic/htmlFilter/htmlFilterOutputStream.java b/source/de/anomic/htmlFilter/htmlFilterOutputStream.java
index ef7d8b285..8a60a01ac 100644
--- a/source/de/anomic/htmlFilter/htmlFilterOutputStream.java
+++ b/source/de/anomic/htmlFilter/htmlFilterOutputStream.java
@@ -59,7 +59,6 @@ import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.MalformedURLException;
-import java.util.ArrayList;
import java.util.Enumeration;
import java.util.Properties;
import de.anomic.server.serverByteBuffer;
@@ -491,27 +490,27 @@ public final class htmlFilterOutputStream extends OutputStream {
}
+ /**
+ * Command-line test driver: pipes the file named by the single argument through a
+ * htmlFilterOutputStream that uses a content transformer initialised with "gettext",
+ * writes the result to a file with ".out" appended to the name, and then prints the scraper.
+ * Returns without doing anything unless exactly one argument is given.
+ *
+ * @param args one element: the name of the input file
+ */
public static void main(String[] args) {
- // test app
- // takes one argument: a file name
+ // takes one argument: a file name
if (args.length != 1) return;
byte[] buffer = new byte[512];
try {
- htmlFilterContentScraper lc = new htmlFilterContentScraper(new URL("http://www.anomic.de/"));
- ArrayList v = new ArrayList();
- v.add("proxy");
- htmlFilterTransformer lt = new htmlFilterContentTransformer();
+ htmlFilterContentScraper scraper = new htmlFilterContentScraper(new URL("http://localhost:8080"));
+ htmlFilterTransformer transformer = new htmlFilterContentTransformer();
+ transformer.init("gettext");
InputStream is = new FileInputStream(args[0]);
FileOutputStream fos = new FileOutputStream(new File(args[0] + ".out"));
- OutputStream os = new htmlFilterOutputStream(fos, lc, lt, false);
+ OutputStream os = new htmlFilterOutputStream(fos, scraper, transformer, false);
int i;
+ // copy the input through the filter in chunks of up to 512 bytes
while ((i = is.read(buffer)) > 0) os.write(buffer, 0, i);
+ // NOTE(review): the close() calls are skipped when an exception is thrown above; no finally block
os.close();
fos.close();
is.close();
- lc.print();
+ scraper.print();
+ } catch (MalformedURLException e) {
+ e.printStackTrace();
+ } catch (IOException e) {
+ e.printStackTrace();
}
- catch (MalformedURLException e) {}
- catch (IOException e) {}
}
}
\ No newline at end of file
diff --git a/source/de/anomic/http/httpTemplate.java b/source/de/anomic/http/httpTemplate.java
index 40ee85349..a3def41aa 100644
--- a/source/de/anomic/http/httpTemplate.java
+++ b/source/de/anomic/http/httpTemplate.java
@@ -57,9 +57,12 @@ import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.PushbackInputStream;
import java.io.UnsupportedEncodingException;
+import java.util.ArrayList;
import java.util.HashMap;
import java.util.Hashtable;
+import java.util.List;
+import de.anomic.server.serverByteBuffer;
import de.anomic.server.serverFileUtils;
import de.anomic.server.logging.serverLog;
@@ -120,12 +123,15 @@ import de.anomic.server.logging.serverLog;
*/
public final class httpTemplate {
- private static final byte hash = (byte)'#';
+ public static final byte hash = (byte)'#';
private static final byte[] hasha = {hash};
+ private static final byte dp = (byte)':';
+ public static final byte[] dpdpa = {dp, dp};
+
private static final byte lbr = (byte)'[';
private static final byte rbr = (byte)']';
- //private static final byte[] pOpen = {hash, lbr};
+ private static final byte[] pOpen = {hash, lbr};
private static final byte[] pClose = {rbr, hash};
private static final byte lcbr = (byte)'{';
@@ -135,13 +141,65 @@ public final class httpTemplate {
private static final byte lrbr = (byte)'(';
private static final byte rrbr = (byte)')';
- //private static final byte[] aOpen = {hash, lrbr};
+ private static final byte[] aOpen = {hash, lrbr};
private static final byte[] aClose = {rrbr, hash};
private static final byte ps = (byte)'%';
- //private static final byte[] iOpen = {hash, ps};
+ private static final byte[] iOpen = {hash, ps};
private static final byte[] iClose = {ps, hash};
+ // Table of template quotation markers. Each entry is an Object[] of two byte arrays:
+ // element 0 is the opening marker and element 1 the closing marker
+ // (pOpen/pClose = "#[" / "]#", aOpen/aClose = "#(" / ")#", iOpen/iClose = "#%" / "%#").
+ // splitQuotation works through the entries in array order, one per recursion level.
+ // NOTE(review): mOpen and mClose are declared outside the lines shown here - confirm their values.
+ public static final Object[] meta_quotation = new Object[] {
+ new Object[] {pOpen, pClose},
+ new Object[] {mOpen, mClose},
+ new Object[] {aOpen, aClose},
+ new Object[] {iOpen, iClose}
+ };
+
+    /**
+     * Splits a text into its fragments and returns them as an array.
+     * The work is done by splitQuotation(text, 0); this method only converts the resulting list.
+     *
+     * @param text the text to split
+     * @return the fragments in the order in which splitQuotation produced them
+     */
+    public static serverByteBuffer[] splitQuotations(serverByteBuffer text) {
+        final List fragments = splitQuotation(text, 0);
+        // the list only contains serverByteBuffer objects, so it can be copied into a typed array
+        return (serverByteBuffer[]) fragments.toArray(new serverByteBuffer[fragments.size()]);
+    }
+
+    /**
+     * Recursively splits a text into template patterns, "::" separators and remaining fragments.
+     * Each recursion level handles one marker pair of meta_quotation (selected by qoff): every
+     * complete pattern found becomes one list element, and the text in front of it is split
+     * further with the next marker pair. After that the text is split once at the first "::".
+     * When qoff is past the end of meta_quotation, a non-empty text is returned as the only element.
+     * The buffer passed in may be emptied by this call.
+     *
+     * @param text the text to split
+     * @param qoff index into meta_quotation of the marker pair to look for at this level
+     * @return a list of serverByteBuffer fragments in their original order
+     */
+    public static List splitQuotation(serverByteBuffer text, int qoff) {
+        ArrayList l = new ArrayList();
+        if (qoff >= meta_quotation.length) {
+            // all marker pairs have been tried; whatever is left is a fragment of its own
+            if (text.length() > 0) l.add(text);
+            return l;
+        }
+        int p, q;
+        byte[] left = (byte[]) ((Object[]) meta_quotation[qoff])[0];
+        byte[] right = (byte[]) ((Object[]) meta_quotation[qoff])[1];
+        qoff++;
+        while ((text.length() > 0) && ((p = text.indexOf(left)) >= 0)) {
+            // search for the closing marker behind the complete opening marker; starting at p + 1
+            // would let markers that share a byte overlap, e.g. "#%#" matching both "#%" and "%#"
+            q = text.indexOf(right, p + left.length);
+            if (q >= 0) {
+                // found a pattern
+                // NOTE(review): getBytes(p, q + right.length) is assumed to take an exclusive end index - confirm
+                l.addAll(splitQuotation(new serverByteBuffer(text.getBytes(0, p)), qoff));
+                l.add(new serverByteBuffer(text.getBytes(p, q + right.length)));
+                text = new serverByteBuffer(text.getBytes(q + right.length));
+            } else {
+                // found only pattern start, no closing parenthesis (a syntax error that is silently accepted here)
+                l.addAll(splitQuotation(new serverByteBuffer(text.getBytes(0, p)), qoff));
+                l.addAll(splitQuotation(new serverByteBuffer(text.getBytes(p)), qoff));
+                text.clear();
+            }
+        }
+
+        // find double-points; the text is consumed completely here, so one pass is enough
+        if ((text.length() > 0) && ((p = text.indexOf(dpdpa)) >= 0)) {
+            l.addAll(splitQuotation(new serverByteBuffer(text.getBytes(0, p)), qoff));
+            l.add(new serverByteBuffer(dpdpa));
+            l.addAll(splitQuotation(new serverByteBuffer(text.getBytes(p + 2)), qoff));
+            text.clear();
+        }
+
+        // add remaining
+        if (text.length() > 0) l.addAll(splitQuotation(text, qoff));
+        return l;
+    }
+
/**
* transfer until a specified pattern is found; everything but the pattern is transfered so far
* the function returns true, if the pattern is found
diff --git a/source/de/anomic/server/serverByteBuffer.java b/source/de/anomic/server/serverByteBuffer.java
index dab691662..829bfbdac 100644
--- a/source/de/anomic/server/serverByteBuffer.java
+++ b/source/de/anomic/server/serverByteBuffer.java
@@ -59,7 +59,7 @@ public final class serverByteBuffer extends OutputStream {
+ /**
+ * Creates an empty buffer: length and offset are 0 and the backing array has room for 10 bytes.
+ */
public serverByteBuffer() {
- buffer = new byte[80];
+ buffer = new byte[10];
length = 0;
offset = 0;
}
@@ -125,6 +125,12 @@ public final class serverByteBuffer extends OutputStream {
}
}
+ /**
+ * Empties this buffer: the backing array is replaced by a zero-length array and
+ * length and offset are reset to 0.
+ */
+ public void clear() {
+ // NOTE(review): the backing array has length 0 afterwards - confirm that the append/grow path copes with that
+ this.buffer = new byte[0];
+ length = 0;
+ offset = 0;
+ }
+
public int length() {
return length;
}
@@ -201,17 +207,46 @@ public final class serverByteBuffer extends OutputStream {
return indexOf(b, 0);
}
+ /**
+ * Finds the first occurrence of a byte sequence, searching from the beginning of the buffer.
+ *
+ * @param bs the byte sequence to search for
+ * @return the result of indexOf(bs, 0)
+ */
+ public int indexOf(byte[] bs) {
+ return indexOf(bs, 0);
+ }
+
+ /**
+ * Finds the first occurrence of a byte at or after a given position.
+ *
+ * @param b the byte to search for
+ * @param start position, relative to the start of the buffer content, where the search begins
+ * @return the position of the first match relative to the start of the buffer content,
+ *         or -1 if start is not below length or the byte does not occur
+ */
public int indexOf(byte b, int start) {
if (start >= length) return -1;
for (int i = start; i < length; i++) if (buffer[offset + i] == b) return i;
return -1;
}
+    /**
+     * Finds the first occurrence of a byte sequence at or after a given position.
+     *
+     * @param bs the byte sequence to search for
+     * @param start position, relative to the start of the buffer content, where the search begins;
+     *        negative values are treated as 0
+     * @return the position of the first match relative to the start of the buffer content, or -1
+     *         if there is none; an empty sequence matches at start as long as start does not
+     *         exceed the buffer length
+     */
+    public int indexOf(byte[] bs, int start) {
+        // a negative start would index bytes in front of this buffer's offset
+        if (start < 0) start = 0;
+        if (start + bs.length > length) return -1;
+        // an empty sequence has no first byte to compare; it matches trivially
+        if (bs.length == 0) return start;
+        loop: for (int i = start; i <= length - bs.length; i++) {
+            // first test only first byte
+            if (buffer[offset + i] != bs[0]) continue loop;
+
+            // then test all remaining bytes
+            for (int j = 1; j < bs.length; j++) {
+                if (buffer[offset + i + j] != bs[j]) continue loop;
+            }
+
+            // found hit
+            return i;
+        }
+        return -1;
+    }
+
+ /**
+ * Finds the last occurrence of a byte by scanning the buffer content backwards.
+ *
+ * @param b the byte to search for
+ * @return the position of the last match relative to the start of the buffer content, or -1 if the byte does not occur
+ */
public int lastIndexOf(byte b) {
for (int i = length - 1; i >= 0; i--) if (buffer[offset + i] == b) return i;
return -1;
}
+    /**
+     * Tests whether the buffer content begins with the given bytes.
+     *
+     * @param bs the expected leading bytes
+     * @return true if the buffer holds at least bs.length bytes and its first bs.length bytes
+     *         equal bs; an empty bs therefore always yields true
+     */
+    public boolean startsWith(byte[] bs) {
+        final int wanted = bs.length;
+        if (wanted > length) return false;
+        // advance while the bytes agree; reaching the end of bs means everything matched
+        int pos = 0;
+        while ((pos < wanted) && (buffer[offset + pos] == bs[pos])) pos++;
+        return pos == wanted;
+    }
+
+ /**
+ * Returns the result of getBytes(0), i.e. the bytes starting at position 0 of the buffer content.
+ */
public byte[] getBytes() {
return getBytes(0);
}