added gettext support:

Text strings that appear in HTML files are automatically wrapped in
gettext quotations of the form _(...).
see also: http://www.yacy-forum.de/viewtopic.php?p=23901#23901

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2309 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 19 years ago
parent b3c569f706
commit 97fa6788a1

@ -52,6 +52,8 @@ import java.util.ArrayList;
import java.util.Locale;
import java.util.Properties;
import java.util.TreeSet;
import de.anomic.http.httpTemplate;
import de.anomic.server.serverByteBuffer;
public class htmlFilterContentTransformer extends htmlFilterAbstractTransformer implements htmlFilterTransformer {
@ -69,20 +71,24 @@ public class htmlFilterContentTransformer extends htmlFilterAbstractTransformer
static {
linkTags0 = new TreeSet(insensitiveCollator);
linkTags0.add("img");
linkTags0.add("input");
linkTags1 = new TreeSet(insensitiveCollator);
linkTags1.add("a");
}
private static ArrayList bluelist = null;
private ArrayList bluelist = null;
private boolean gettext = false;
/**
 * Creates a transformer; the static tag sets linkTags0 and linkTags1 are
 * passed to the superclass constructor.
 */
public htmlFilterContentTransformer() {
super(linkTags0, linkTags1);
}
public void init(String initarg) {
// System.out.println("Transformer init: " + initarg);
if (bluelist == null) {
if (initarg.equals("gettext")) {
// the initarg declares that the transformer applies a gettext-quotation on strings
gettext = true;
} else if (bluelist == null) {
// here, the initarg is used to load a list of bluelisted words
bluelist = new ArrayList();
File f = new File(initarg);
@ -102,7 +108,7 @@ public class htmlFilterContentTransformer extends htmlFilterAbstractTransformer
}
/**
 * Tells whether this transformer would hand back its input unchanged.
 *
 * @return true only if gettext quotation is switched off and no bluelist
 *         words are loaded
 */
public boolean isIdentityTransformer() {
    // gettext mode always rewrites text, regardless of the bluelist
    if (gettext) return false;
    // init() allocates the bluelist only when its argument is not "gettext",
    // so the list may still be null here and must not be dereferenced blindly
    return (bluelist == null) || (bluelist.size() == 0);
}
private static byte[] genBlueLetters(int length) {
@ -116,7 +122,7 @@ public class htmlFilterContentTransformer extends htmlFilterAbstractTransformer
return bb.getBytes();
}
private boolean hit(byte[] text) {
private boolean bluelistHit(byte[] text) {
if (text == null || bluelist == null) return false;
String lc;
try {
@ -131,22 +137,61 @@ public class htmlFilterContentTransformer extends htmlFilterAbstractTransformer
}
/**
 * Transforms a piece of text content.
 *
 * If the gettext flag is set, the text is trimmed and split with
 * httpTemplate.splitQuotations. Each fragment is trimmed again; a fragment
 * that is empty afterwards contributes a single space, a fragment that
 * begins with httpTemplate.hash or with httpTemplate.dpdpa is copied as it
 * is, and every other fragment is emitted as _(fragment).
 *
 * Otherwise, if a bluelist is present and bluelistHit(text) is true, the
 * result of genBlueLetters(text.length) is returned.
 *
 * @param text the text bytes to transform
 * @return the transformed bytes, or text itself if neither rule applies
 */
public byte[] transformText(byte[] text) {
// NOTE(review): the following three lines open a brace that is never closed and
// call hit() rather than bluelistHit(); they look like pre-change lines kept by
// the diff rendering -- confirm before relying on them
if (hit(text)) {
// System.out.println("FILTERHIT: " + text);
return genBlueLetters(text.length);
if (gettext) {
serverByteBuffer sbb = new serverByteBuffer(text);
sbb.trim();
//if (sbb.length() > 0) System.out.println(" TEXT: " + sbb.toString());
serverByteBuffer[] sbbs = httpTemplate.splitQuotations(sbb);
// sbb is reused from here on as the output buffer
sbb = new serverByteBuffer();
for (int i = 0; i < sbbs.length; i++) {
sbbs[i].trim();
if (sbbs[i].length() == 0) {
// fragment consisted of whitespace only: keep one separating blank
sbb.append(' ');
} else if ((sbbs[i].byteAt(0) == httpTemplate.hash) ||
(sbbs[i].startsWith(httpTemplate.dpdpa))) {
// this is a template or a part of a template
sbb.append(sbbs[i]);
} else {
// this is a text fragment, generate gettext quotation
sbb.append('_');
sbb.append('(');
sbb.append(sbbs[i]);
sbb.append(')');
}
}
//if (sbb.length() > 0) System.out.println("GETTEXT: " + sbb.toString());
return sbb.getBytes();
}
if (bluelist != null) {
if (bluelistHit(text)) {
// System.out.println("FILTERHIT: " + text);
return genBlueLetters(text.length);
} else {
return text;
}
}
// neither gettext mode nor a bluelist: nothing to do
return text;
}
/**
 * Rewrites the attributes of a tag and renders it with
 * htmlFilterOutputStream.genTag0.
 *
 * For "img" tags the src and alt attributes are checked with bluelistHit;
 * afterwards the alt attribute is replaced by its transformText result.
 * For "input" tags whose type attribute equals "submit" the value attribute
 * is replaced by its transformText result.
 *
 * @param tagname   name of the tag
 * @param tagopts   attributes of the tag; modified in place
 * @param quotechar passed through to genTag0
 * @return genBlueLetters(5) if an img attribute hits the bluelist, otherwise
 *         the tag as rendered by genTag0
 */
public byte[] transformTag0(String tagname, Properties tagopts, byte quotechar) {
    if (tagname.equals("img")) {
        // check bluelist
        if (bluelistHit(tagopts.getProperty("src", "").getBytes())) return genBlueLetters(5);
        if (bluelistHit(tagopts.getProperty("alt", "").getBytes())) return genBlueLetters(5);
        // replace image alternative name
        tagopts.setProperty("alt", new String(transformText(tagopts.getProperty("alt", "").getBytes())));
    }
    // getProperty returns null for an <input> without a type attribute, so the
    // comparison is made from the constant side to stay null-safe
    if ((tagname.equals("input")) && ("submit".equals(tagopts.getProperty("type")))) {
        // rewrite button name
        tagopts.setProperty("value", new String(transformText(tagopts.getProperty("value", "").getBytes())));
    }
    return htmlFilterOutputStream.genTag0(tagname, tagopts, quotechar);
}
/**
 * Checks the href attribute and the enclosed text of a tag against the
 * bluelist and otherwise renders the tag with htmlFilterOutputStream.genTag1.
 *
 * @param tagname   name of the tag
 * @param tagopts   attributes of the tag
 * @param text      the bytes enclosed by the tag
 * @param quotechar passed through to genTag1
 * @return genBlueLetters(text.length) on a bluelist hit, otherwise the
 *         result of genTag1
 */
public byte[] transformTag1(String tagname, Properties tagopts, byte[] text, byte quotechar) {
// NOTE(review): the two hit() lines duplicate the bluelistHit() lines below and
// look like pre-change lines kept by the diff rendering -- confirm
if (hit(tagopts.getProperty("href","").getBytes())) return genBlueLetters(text.length);
if (hit(text)) return genBlueLetters(text.length);
if (bluelistHit(tagopts.getProperty("href","").getBytes())) return genBlueLetters(text.length);
if (bluelistHit(text)) return genBlueLetters(text.length);
return htmlFilterOutputStream.genTag1(tagname, tagopts, text, quotechar);
}

@ -59,7 +59,6 @@ import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.Properties;
import de.anomic.server.serverByteBuffer;
@ -491,27 +490,27 @@ public final class htmlFilterOutputStream extends OutputStream {
}
/**
 * Test application: pipes one file through an htmlFilterOutputStream that
 * uses a content scraper and a content transformer initialised with
 * "gettext", and writes the result to a file named like the input file
 * plus the suffix ".out". Afterwards scraper.print() is called.
 *
 * @param args exactly one element, the name of the input file; with any
 *             other argument count the method returns without doing anything
 */
public static void main(String[] args) {
    // test app
    // takes one argument: a file name
    if (args.length != 1) return;
    byte[] buffer = new byte[512];
    InputStream is = null;
    FileOutputStream fos = null;
    OutputStream os = null;
    try {
        htmlFilterContentScraper scraper = new htmlFilterContentScraper(new URL("http://localhost:8080"));
        htmlFilterTransformer transformer = new htmlFilterContentTransformer();
        transformer.init("gettext");
        is = new FileInputStream(args[0]);
        fos = new FileOutputStream(new File(args[0] + ".out"));
        os = new htmlFilterOutputStream(fos, scraper, transformer, false);
        int i;
        while ((i = is.read(buffer)) > 0) os.write(buffer, 0, i);
        // close the filter on the regular path so that a failing flush is reported
        os.close();
        os = null;
        scraper.print();
    } catch (MalformedURLException e) {
        // report instead of swallowing: this is a diagnostic tool
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // release whatever is still open, also when reading or writing failed;
        // secondary close errors are ignored because the primary error was already reported
        if (os != null) try { os.close(); } catch (IOException ignored) { }
        if (fos != null) try { fos.close(); } catch (IOException ignored) { }
        if (is != null) try { is.close(); } catch (IOException ignored) { }
    }
}
}

@ -57,9 +57,12 @@ import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.PushbackInputStream;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Hashtable;
import java.util.List;
import de.anomic.server.serverByteBuffer;
import de.anomic.server.serverFileUtils;
import de.anomic.server.logging.serverLog;
@ -120,12 +123,15 @@ import de.anomic.server.logging.serverLog;
*/
public final class httpTemplate {
private static final byte hash = (byte)'#';
public static final byte hash = (byte)'#';
private static final byte[] hasha = {hash};
private static final byte dp = (byte)':';
public static final byte[] dpdpa = {dp, dp};
private static final byte lbr = (byte)'[';
private static final byte rbr = (byte)']';
//private static final byte[] pOpen = {hash, lbr};
private static final byte[] pOpen = {hash, lbr};
private static final byte[] pClose = {rbr, hash};
private static final byte lcbr = (byte)'{';
@ -135,13 +141,65 @@ public final class httpTemplate {
private static final byte lrbr = (byte)'(';
private static final byte rrbr = (byte)')';
//private static final byte[] aOpen = {hash, lrbr};
private static final byte[] aOpen = {hash, lrbr};
private static final byte[] aClose = {rrbr, hash};
private static final byte ps = (byte)'%';
//private static final byte[] iOpen = {hash, ps};
private static final byte[] iOpen = {hash, ps};
private static final byte[] iClose = {ps, hash};
/**
 * Table of quotation delimiters. Every entry is an Object[] holding two
 * byte[] values: the opening pattern at index 0 and the closing pattern at
 * index 1. splitQuotation selects an entry with its qoff argument.
 */
public static final Object[] meta_quotation = {
    new Object[] {pOpen, pClose},
    new Object[] {mOpen, mClose},
    new Object[] {aOpen, aClose},
    new Object[] {iOpen, iClose}
};
/**
 * Array-returning front end of splitQuotation: splits the text starting with
 * the first entry of meta_quotation.
 *
 * @param text the buffer to split
 * @return the fragments produced by splitQuotation(text, 0), in list order
 */
public static serverByteBuffer[] splitQuotations(serverByteBuffer text) {
    final List fragments = splitQuotation(text, 0);
    return (serverByteBuffer[]) fragments.toArray(new serverByteBuffer[fragments.size()]);
}
/**
 * Splits a text into fragments along the quotation delimiters listed in
 * meta_quotation, beginning with entry qoff, and along the double-colon
 * pattern dpdpa.
 *
 * A complete quotation (opening pattern up to and including the closing
 * pattern) becomes one fragment of its own. The text in front of it, and
 * whatever remains in the end, is split again with the next entry of
 * meta_quotation. Once qoff has run past the table, a non-empty text is
 * returned as a single fragment.
 *
 * Note that the passed buffer itself may be cleared by this method.
 *
 * @param text the buffer to split
 * @param qoff index of the first meta_quotation entry to apply
 * @return list of serverByteBuffer fragments in their original order
 */
public static List splitQuotation(serverByteBuffer text, int qoff) {
    ArrayList l = new ArrayList();
    if (qoff >= meta_quotation.length) {
        // all delimiter kinds are handled: the text is a leaf fragment
        if (text.length() > 0) l.add(text);
        return l;
    }
    int p, q;
    byte[] left = (byte[]) ((Object[]) meta_quotation[qoff])[0];
    byte[] right = (byte[]) ((Object[]) meta_quotation[qoff])[1];
    qoff++;
    while ((text.length() > 0) && ((p = text.indexOf(left)) >= 0)) {
        // start looking behind the complete opening pattern; otherwise the
        // closing pattern could overlap with it (e.g. "#%#" for "#%" and "%#")
        q = text.indexOf(right, p + left.length);
        if (q >= 0) {
            // found a pattern
            l.addAll(splitQuotation(new serverByteBuffer(text.getBytes(0, p)), qoff));
            l.add(new serverByteBuffer(text.getBytes(p, q + right.length)));
            text = new serverByteBuffer(text.getBytes(q + right.length));
        } else {
            // found only pattern start, no closing parenthesis (a syntax error that is silently accepted here)
            l.addAll(splitQuotation(new serverByteBuffer(text.getBytes(0, p)), qoff));
            l.addAll(splitQuotation(new serverByteBuffer(text.getBytes(p)), qoff));
            text.clear();
        }
    }
    // find double colons; the part behind the first one is handled by the
    // recursive call, so a single pass is sufficient
    if ((text.length() > 0) && ((p = text.indexOf(dpdpa)) >= 0)) {
        l.addAll(splitQuotation(new serverByteBuffer(text.getBytes(0, p)), qoff));
        l.add(new serverByteBuffer(dpdpa));
        l.addAll(splitQuotation(new serverByteBuffer(text.getBytes(p + dpdpa.length)), qoff));
        text.clear();
    }
    // add remaining
    if (text.length() > 0) l.addAll(splitQuotation(text, qoff));
    return l;
}
/**
* transfer until a specified pattern is found; everything but the pattern is transferred so far
* the function returns true, if the pattern is found

@ -59,7 +59,7 @@ public final class serverByteBuffer extends OutputStream {
/**
 * Creates an empty buffer: a small backing array is allocated and both
 * length and offset start at zero.
 */
public serverByteBuffer() {
// NOTE(review): buffer is assigned twice in a row, so only the second array
// (10 bytes) survives; the first line looks like the pre-change value kept by
// the diff rendering -- confirm
buffer = new byte[80];
buffer = new byte[10];
length = 0;
offset = 0;
}
@ -125,6 +125,12 @@ public final class serverByteBuffer extends OutputStream {
}
}
/**
 * Empties this buffer: offset and length are reset to zero and the backing
 * array is replaced by a zero-length one.
 */
public void clear() {
    this.offset = 0;
    this.length = 0;
    this.buffer = new byte[0];
}
/**
 * @return the current value of the length field
 */
public int length() {
    return this.length;
}
@ -201,17 +207,46 @@ public final class serverByteBuffer extends OutputStream {
return indexOf(b, 0);
}
/**
 * Searches for a byte pattern from the beginning of the buffer.
 *
 * @param bs the pattern to look for
 * @return the result of indexOf(bs, 0)
 */
public int indexOf(byte[] bs) {
    final int from = 0;
    return this.indexOf(bs, from);
}
/**
 * Searches for a single byte, beginning at the given position.
 *
 * @param b     the byte to look for
 * @param start position (relative to offset) at which the search begins
 * @return position of the first match at or behind start, or -1 if there is
 *         none or start is not smaller than length
 */
public int indexOf(byte b, int start) {
    for (int pos = start; pos < length; pos++) {
        if (buffer[offset + pos] == b) {
            return pos;
        }
    }
    return -1;
}
/**
 * Searches for a byte pattern, beginning at the given position.
 *
 * Corner cases follow the conventions of String.indexOf(String, int): a
 * negative start is treated as 0, and an empty pattern matches at start as
 * long as start does not exceed the length.
 *
 * @param bs    the pattern to look for
 * @param start position (relative to offset) at which the search begins
 * @return position of the first complete match at or behind start, or -1
 */
public int indexOf(byte[] bs, int start) {
    // a negative start would index in front of the buffer window
    if (start < 0) start = 0;
    // an empty pattern has no first byte to compare; it matches trivially
    if (bs.length == 0) return (start <= length) ? start : -1;
    // last position at which the whole pattern still fits
    final int last = length - bs.length;
    search:
    for (int i = start; i <= last; i++) {
        for (int j = 0; j < bs.length; j++) {
            if (buffer[offset + i + j] != bs[j]) continue search;
        }
        // found hit
        return i;
    }
    return -1;
}
/**
 * Searches backwards for a single byte.
 *
 * @param b the byte to look for
 * @return position (relative to offset) of the last occurrence, or -1
 */
public int lastIndexOf(byte b) {
    int pos = length;
    while (--pos >= 0) {
        if (buffer[offset + pos] == b) {
            return pos;
        }
    }
    return -1;
}
/**
 * Tests whether the buffer content begins with the given bytes.
 *
 * @param bs the expected prefix
 * @return true if bs is not longer than the content and every byte of bs
 *         equals the content byte at the same position
 */
public boolean startsWith(byte[] bs) {
    final int n = bs.length;
    if (n > length) {
        return false;
    }
    int k = 0;
    while ((k < n) && (buffer[offset + k] == bs[k])) {
        k++;
    }
    // all n bytes matched exactly when the scan ran to the end
    return k == n;
}
/**
 * @return the result of getBytes(0)
 */
public byte[] getBytes() {
    final int from = 0;
    return this.getBytes(from);
}

Loading…
Cancel
Save