added gettext support:

Text strings that appear in HTML files are automatically wrapped in
gettext quotes, i.e. _(...).
see also: http://www.yacy-forum.de/viewtopic.php?p=23901#23901

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2309 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 19 years ago
parent b3c569f706
commit 97fa6788a1

@ -52,6 +52,8 @@ import java.util.ArrayList;
import java.util.Locale; import java.util.Locale;
import java.util.Properties; import java.util.Properties;
import java.util.TreeSet; import java.util.TreeSet;
import de.anomic.http.httpTemplate;
import de.anomic.server.serverByteBuffer; import de.anomic.server.serverByteBuffer;
public class htmlFilterContentTransformer extends htmlFilterAbstractTransformer implements htmlFilterTransformer { public class htmlFilterContentTransformer extends htmlFilterAbstractTransformer implements htmlFilterTransformer {
@ -69,20 +71,24 @@ public class htmlFilterContentTransformer extends htmlFilterAbstractTransformer
static { static {
linkTags0 = new TreeSet(insensitiveCollator); linkTags0 = new TreeSet(insensitiveCollator);
linkTags0.add("img"); linkTags0.add("img");
linkTags0.add("input");
linkTags1 = new TreeSet(insensitiveCollator); linkTags1 = new TreeSet(insensitiveCollator);
linkTags1.add("a"); linkTags1.add("a");
} }
private static ArrayList bluelist = null; private ArrayList bluelist = null;
private boolean gettext = false;
public htmlFilterContentTransformer() { public htmlFilterContentTransformer() {
super(linkTags0, linkTags1); super(linkTags0, linkTags1);
} }
public void init(String initarg) { public void init(String initarg) {
// System.out.println("Transformer init: " + initarg); if (initarg.equals("gettext")) {
if (bluelist == null) { // the initarg declares that the transformer applies a gettext-quotation on strings
gettext = true;
} else if (bluelist == null) {
// here, the initarg is used to load a list of bluelisted words // here, the initarg is used to load a list of bluelisted words
bluelist = new ArrayList(); bluelist = new ArrayList();
File f = new File(initarg); File f = new File(initarg);
@ -102,7 +108,7 @@ public class htmlFilterContentTransformer extends htmlFilterAbstractTransformer
} }
public boolean isIdentityTransformer() { public boolean isIdentityTransformer() {
return bluelist.size() == 0; return (bluelist.size() == 0) && (!gettext);
} }
private static byte[] genBlueLetters(int length) { private static byte[] genBlueLetters(int length) {
@ -116,7 +122,7 @@ public class htmlFilterContentTransformer extends htmlFilterAbstractTransformer
return bb.getBytes(); return bb.getBytes();
} }
private boolean hit(byte[] text) { private boolean bluelistHit(byte[] text) {
if (text == null || bluelist == null) return false; if (text == null || bluelist == null) return false;
String lc; String lc;
try { try {
@ -131,22 +137,61 @@ public class htmlFilterContentTransformer extends htmlFilterAbstractTransformer
} }
public byte[] transformText(byte[] text) { public byte[] transformText(byte[] text) {
if (hit(text)) { if (gettext) {
// System.out.println("FILTERHIT: " + text); serverByteBuffer sbb = new serverByteBuffer(text);
return genBlueLetters(text.length); sbb.trim();
//if (sbb.length() > 0) System.out.println(" TEXT: " + sbb.toString());
serverByteBuffer[] sbbs = httpTemplate.splitQuotations(sbb);
sbb = new serverByteBuffer();
for (int i = 0; i < sbbs.length; i++) {
sbbs[i].trim();
if (sbbs[i].length() == 0) {
sbb.append(' ');
} else if ((sbbs[i].byteAt(0) == httpTemplate.hash) ||
(sbbs[i].startsWith(httpTemplate.dpdpa))) {
// this is a template or a part of a template
sbb.append(sbbs[i]);
} else {
// this is a text fragment, generate gettext quotation
sbb.append('_');
sbb.append('(');
sbb.append(sbbs[i]);
sbb.append(')');
}
}
//if (sbb.length() > 0) System.out.println("GETTEXT: " + sbb.toString());
return sbb.getBytes();
}
if (bluelist != null) {
if (bluelistHit(text)) {
// System.out.println("FILTERHIT: " + text);
return genBlueLetters(text.length);
} else {
return text;
}
} }
return text; return text;
} }
public byte[] transformTag0(String tagname, Properties tagopts, byte quotechar) { public byte[] transformTag0(String tagname, Properties tagopts, byte quotechar) {
if (hit(tagopts.getProperty("src","").getBytes())) return genBlueLetters(5); if (tagname.equals("img")) {
if (hit(tagopts.getProperty("alt","").getBytes())) return genBlueLetters(5); // check bluelist
if (bluelistHit(tagopts.getProperty("src","").getBytes())) return genBlueLetters(5);
if (bluelistHit(tagopts.getProperty("alt","").getBytes())) return genBlueLetters(5);
// replace image alternative name
tagopts.setProperty("alt", new String(transformText(tagopts.getProperty("alt","").getBytes())));
}
if ((tagname.equals("input")) && (tagopts.getProperty("type").equals("submit"))) {
// rewrite button name
tagopts.setProperty("value", new String(transformText(tagopts.getProperty("value","").getBytes())));
}
return htmlFilterOutputStream.genTag0(tagname, tagopts, quotechar); return htmlFilterOutputStream.genTag0(tagname, tagopts, quotechar);
} }
public byte[] transformTag1(String tagname, Properties tagopts, byte[] text, byte quotechar) { public byte[] transformTag1(String tagname, Properties tagopts, byte[] text, byte quotechar) {
if (hit(tagopts.getProperty("href","").getBytes())) return genBlueLetters(text.length); if (bluelistHit(tagopts.getProperty("href","").getBytes())) return genBlueLetters(text.length);
if (hit(text)) return genBlueLetters(text.length); if (bluelistHit(text)) return genBlueLetters(text.length);
return htmlFilterOutputStream.genTag1(tagname, tagopts, text, quotechar); return htmlFilterOutputStream.genTag1(tagname, tagopts, text, quotechar);
} }

@ -59,7 +59,6 @@ import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.io.OutputStream; import java.io.OutputStream;
import java.net.MalformedURLException; import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.Enumeration; import java.util.Enumeration;
import java.util.Properties; import java.util.Properties;
import de.anomic.server.serverByteBuffer; import de.anomic.server.serverByteBuffer;
@ -491,27 +490,27 @@ public final class htmlFilterOutputStream extends OutputStream {
} }
public static void main(String[] args) { public static void main(String[] args) {
// test app
// takes one argument: a file name // takes one argument: a file name
if (args.length != 1) return; if (args.length != 1) return;
byte[] buffer = new byte[512]; byte[] buffer = new byte[512];
try { try {
htmlFilterContentScraper lc = new htmlFilterContentScraper(new URL("http://www.anomic.de/")); htmlFilterContentScraper scraper = new htmlFilterContentScraper(new URL("http://localhost:8080"));
ArrayList v = new ArrayList(); htmlFilterTransformer transformer = new htmlFilterContentTransformer();
v.add("proxy"); transformer.init("gettext");
htmlFilterTransformer lt = new htmlFilterContentTransformer();
InputStream is = new FileInputStream(args[0]); InputStream is = new FileInputStream(args[0]);
FileOutputStream fos = new FileOutputStream(new File(args[0] + ".out")); FileOutputStream fos = new FileOutputStream(new File(args[0] + ".out"));
OutputStream os = new htmlFilterOutputStream(fos, lc, lt, false); OutputStream os = new htmlFilterOutputStream(fos, scraper, transformer, false);
int i; int i;
while ((i = is.read(buffer)) > 0) os.write(buffer, 0, i); while ((i = is.read(buffer)) > 0) os.write(buffer, 0, i);
os.close(); os.close();
fos.close(); fos.close();
is.close(); is.close();
lc.print(); scraper.print();
} catch (MalformedURLException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
} }
catch (MalformedURLException e) {}
catch (IOException e) {}
} }
} }

@ -57,9 +57,12 @@ import java.io.InputStreamReader;
import java.io.OutputStream; import java.io.OutputStream;
import java.io.PushbackInputStream; import java.io.PushbackInputStream;
import java.io.UnsupportedEncodingException; import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.HashMap; import java.util.HashMap;
import java.util.Hashtable; import java.util.Hashtable;
import java.util.List;
import de.anomic.server.serverByteBuffer;
import de.anomic.server.serverFileUtils; import de.anomic.server.serverFileUtils;
import de.anomic.server.logging.serverLog; import de.anomic.server.logging.serverLog;
@ -120,12 +123,15 @@ import de.anomic.server.logging.serverLog;
*/ */
public final class httpTemplate { public final class httpTemplate {
private static final byte hash = (byte)'#'; public static final byte hash = (byte)'#';
private static final byte[] hasha = {hash}; private static final byte[] hasha = {hash};
private static final byte dp = (byte)':';
public static final byte[] dpdpa = {dp, dp};
private static final byte lbr = (byte)'['; private static final byte lbr = (byte)'[';
private static final byte rbr = (byte)']'; private static final byte rbr = (byte)']';
//private static final byte[] pOpen = {hash, lbr}; private static final byte[] pOpen = {hash, lbr};
private static final byte[] pClose = {rbr, hash}; private static final byte[] pClose = {rbr, hash};
private static final byte lcbr = (byte)'{'; private static final byte lcbr = (byte)'{';
@ -135,13 +141,65 @@ public final class httpTemplate {
private static final byte lrbr = (byte)'('; private static final byte lrbr = (byte)'(';
private static final byte rrbr = (byte)')'; private static final byte rrbr = (byte)')';
//private static final byte[] aOpen = {hash, lrbr}; private static final byte[] aOpen = {hash, lrbr};
private static final byte[] aClose = {rrbr, hash}; private static final byte[] aClose = {rrbr, hash};
private static final byte ps = (byte)'%'; private static final byte ps = (byte)'%';
//private static final byte[] iOpen = {hash, ps}; private static final byte[] iOpen = {hash, ps};
private static final byte[] iClose = {ps, hash}; private static final byte[] iClose = {ps, hash};
// Delimiter table used by splitQuotation(): every entry is an Object[] holding
// two byte[] values, {opening pattern, closing pattern}. splitQuotation()
// handles entry number qoff at recursion depth qoff, so the order of the
// entries is the order in which the patterns are searched.
// pOpen/pClose are "#[" and "]#", aOpen/aClose are "#(" and ")#",
// iOpen/iClose are "#%" and "%#".
// NOTE(review): mOpen/mClose are presumably the "#{" / "}#" pair -- confirm
// at their declaration.
public static final Object[] meta_quotation = new Object[] {
new Object[] {pOpen, pClose},
new Object[] {mOpen, mClose},
new Object[] {aOpen, aClose},
new Object[] {iOpen, iClose}
};
/**
 * Splits a text into its fragments and returns them as an array.
 *
 * This is the array-returning entry point for splitQuotation(text, 0).
 *
 * @param text the text to split
 * @return the fragments in the order in which they occur in the text
 */
public static serverByteBuffer[] splitQuotations(serverByteBuffer text) {
    final List fragments = splitQuotation(text, 0);
    // the list is raw; toArray copies it into an array of the element type
    return (serverByteBuffer[]) fragments.toArray(new serverByteBuffer[fragments.size()]);
}
/**
 * Splits a text along the delimiter pair meta_quotation[qoff] and recursively
 * along the following pairs.
 *
 * Every section enclosed by the current opening and closing pattern becomes
 * one fragment, delimiters included. The text before such a section is split
 * further with the next delimiter pair. In the text that remains, the first
 * "::" becomes a fragment of its own and the parts around it are split
 * further. Once all delimiter pairs are used up, the remaining text is one
 * fragment; empty text produces no fragment.
 *
 * The given buffer is not modified. It may be returned as an element of the
 * result list when it contains nothing to split.
 *
 * NOTE(review): the getBytes(a, b) calls below are written as if b were an
 * exclusive end index -- confirm against serverByteBuffer.getBytes(int, int).
 *
 * @param text the text to split
 * @param qoff index into meta_quotation of the delimiter pair to use
 * @return a list of serverByteBuffer fragments in text order
 */
public static List splitQuotation(serverByteBuffer text, int qoff) {
    ArrayList l = new ArrayList();
    if (qoff >= meta_quotation.length) {
        // all delimiter pairs have been tried; the rest is a single fragment
        if (text.length() > 0) l.add(text);
        return l;
    }
    int p = -1, q;
    byte[] left = (byte[]) ((Object[]) meta_quotation[qoff])[0];
    byte[] right = (byte[]) ((Object[]) meta_quotation[qoff])[1];
    qoff++;
    while ((text.length() > 0) && ((p = text.indexOf(left)) >= 0)) {
        q = text.indexOf(right, p + 1);
        if (q < 0) {
            // found only pattern start, no closing parenthesis (a syntax error that is silently accepted here)
            l.addAll(splitQuotation(new serverByteBuffer(text.getBytes(0, p)), qoff));
            l.addAll(splitQuotation(new serverByteBuffer(text.getBytes(p)), qoff));
            // everything is consumed; return instead of clearing 'text',
            // which may still be the caller's buffer
            return l;
        }
        // found a pattern
        l.addAll(splitQuotation(new serverByteBuffer(text.getBytes(0, p)), qoff));
        l.add(new serverByteBuffer(text.getBytes(p, q + right.length)));
        text = new serverByteBuffer(text.getBytes(q + right.length));
    }
    // find double-points; only the first occurrence is handled at this level,
    // further ones are found by the recursive calls
    if ((text.length() > 0) && ((p = text.indexOf(dpdpa)) >= 0)) {
        l.addAll(splitQuotation(new serverByteBuffer(text.getBytes(0, p)), qoff));
        l.add(new serverByteBuffer(dpdpa));
        l.addAll(splitQuotation(new serverByteBuffer(text.getBytes(p + dpdpa.length)), qoff));
        return l;
    }
    // add remaining
    if (text.length() > 0) l.addAll(splitQuotation(text, qoff));
    return l;
}
/** /**
* transfer until a specified pattern is found; everything but the pattern is transfered so far * transfer until a specified pattern is found; everything but the pattern is transfered so far
* the function returns true, if the pattern is found * the function returns true, if the pattern is found

@ -59,7 +59,7 @@ public final class serverByteBuffer extends OutputStream {
public serverByteBuffer() { public serverByteBuffer() {
buffer = new byte[80]; buffer = new byte[10];
length = 0; length = 0;
offset = 0; offset = 0;
} }
@ -125,6 +125,12 @@ public final class serverByteBuffer extends OutputStream {
} }
} }
/**
 * Empties this buffer: the backing array is replaced by a zero-length
 * array and both the length and the offset are reset to 0.
 */
public void clear() {
    this.offset = 0;
    this.length = 0;
    this.buffer = new byte[0];
}
public int length() { public int length() {
return length; return length;
} }
@ -201,17 +207,46 @@ public final class serverByteBuffer extends OutputStream {
return indexOf(b, 0); return indexOf(b, 0);
} }
/**
 * Searches for a byte pattern, starting at the beginning of the buffer.
 *
 * @param bs the pattern to search for
 * @return the result of indexOf(bs, 0)
 */
public int indexOf(byte[] bs) {
    final int fromBeginning = 0;
    return this.indexOf(bs, fromBeginning);
}
public int indexOf(byte b, int start) { public int indexOf(byte b, int start) {
if (start >= length) return -1; if (start >= length) return -1;
for (int i = start; i < length; i++) if (buffer[offset + i] == b) return i; for (int i = start; i < length; i++) if (buffer[offset + i] == b) return i;
return -1; return -1;
} }
/**
 * Searches for the first occurrence of a byte pattern at or after a
 * given position.
 *
 * Positions are relative to the start of the buffer content. A negative
 * start is treated as 0. An empty pattern matches at start, provided that
 * start does not lie behind the end of the content.
 *
 * @param bs    the pattern to search for
 * @param start the position at which the search begins
 * @return the position of the first match, or -1 if there is none
 */
public int indexOf(byte[] bs, int start) {
    // a negative start would read bytes in front of the logical content
    if (start < 0) start = 0;
    // empty pattern: nothing to compare, and bs[0] below would throw
    if (bs.length == 0) return (start <= length) ? start : -1;
    // written as a subtraction so that a huge start cannot overflow
    if (bs.length > length - start) return -1;
    final int lastCandidate = length - bs.length;
    for (int i = start; i <= lastCandidate; i++) {
        int matched = 0;
        while ((matched < bs.length) && (buffer[offset + i + matched] == bs[matched])) matched++;
        // found hit
        if (matched == bs.length) return i;
    }
    return -1;
}
public int lastIndexOf(byte b) { public int lastIndexOf(byte b) {
for (int i = length - 1; i >= 0; i--) if (buffer[offset + i] == b) return i; for (int i = length - 1; i >= 0; i--) if (buffer[offset + i] == b) return i;
return -1; return -1;
} }
/**
 * Tests whether the buffer content begins with the given bytes.
 *
 * @param bs the expected prefix
 * @return true if the content is at least as long as bs and its first
 *         bs.length bytes equal bs; an empty bs always matches
 */
public boolean startsWith(byte[] bs) {
    boolean matches = (bs.length <= length);
    // stop at the first differing byte
    for (int i = 0; matches && (i < bs.length); i++) {
        matches = (buffer[offset + i] == bs[i]);
    }
    return matches;
}
public byte[] getBytes() { public byte[] getBytes() {
return getBytes(0); return getBytes(0);
} }

Loading…
Cancel
Save