From 97fa6788a1661de5e04974823659038175d12fdd Mon Sep 17 00:00:00 2001 From: orbiter Date: Wed, 19 Jul 2006 22:35:36 +0000 Subject: [PATCH] added gettext support: automatic replacement of string appearances in html files by gettext quotes. see also: http://www.yacy-forum.de/viewtopic.php?p=23901#23901 git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2309 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- .../htmlFilterContentTransformer.java | 69 +++++++++++++++---- .../htmlFilter/htmlFilterOutputStream.java | 21 +++--- source/de/anomic/http/httpTemplate.java | 66 ++++++++++++++++-- source/de/anomic/server/serverByteBuffer.java | 37 +++++++++- 4 files changed, 165 insertions(+), 28 deletions(-) diff --git a/source/de/anomic/htmlFilter/htmlFilterContentTransformer.java b/source/de/anomic/htmlFilter/htmlFilterContentTransformer.java index 5cd92349b..d2429a2cc 100644 --- a/source/de/anomic/htmlFilter/htmlFilterContentTransformer.java +++ b/source/de/anomic/htmlFilter/htmlFilterContentTransformer.java @@ -52,6 +52,8 @@ import java.util.ArrayList; import java.util.Locale; import java.util.Properties; import java.util.TreeSet; + +import de.anomic.http.httpTemplate; import de.anomic.server.serverByteBuffer; public class htmlFilterContentTransformer extends htmlFilterAbstractTransformer implements htmlFilterTransformer { @@ -69,20 +71,24 @@ public class htmlFilterContentTransformer extends htmlFilterAbstractTransformer static { linkTags0 = new TreeSet(insensitiveCollator); linkTags0.add("img"); + linkTags0.add("input"); linkTags1 = new TreeSet(insensitiveCollator); linkTags1.add("a"); } - private static ArrayList bluelist = null; + private ArrayList bluelist = null; + private boolean gettext = false; public htmlFilterContentTransformer() { super(linkTags0, linkTags1); } public void init(String initarg) { -// System.out.println("Transformer init: " + initarg); - if (bluelist == null) { + if (initarg.equals("gettext")) { + // the initarg declares that the transformer 
applies a gettext-quotation on strings + gettext = true; + } else if (bluelist == null) { // here, the initarg is used to load a list of bluelisted words bluelist = new ArrayList(); File f = new File(initarg); @@ -102,7 +108,7 @@ public class htmlFilterContentTransformer extends htmlFilterAbstractTransformer } public boolean isIdentityTransformer() { - return bluelist.size() == 0; + return (bluelist.size() == 0) && (!gettext); } private static byte[] genBlueLetters(int length) { @@ -116,7 +122,7 @@ public class htmlFilterContentTransformer extends htmlFilterAbstractTransformer return bb.getBytes(); } - private boolean hit(byte[] text) { + private boolean bluelistHit(byte[] text) { if (text == null || bluelist == null) return false; String lc; try { @@ -131,22 +137,61 @@ public class htmlFilterContentTransformer extends htmlFilterAbstractTransformer } public byte[] transformText(byte[] text) { - if (hit(text)) { -// System.out.println("FILTERHIT: " + text); - return genBlueLetters(text.length); + if (gettext) { + serverByteBuffer sbb = new serverByteBuffer(text); + sbb.trim(); + //if (sbb.length() > 0) System.out.println(" TEXT: " + sbb.toString()); + serverByteBuffer[] sbbs = httpTemplate.splitQuotations(sbb); + sbb = new serverByteBuffer(); + for (int i = 0; i < sbbs.length; i++) { + sbbs[i].trim(); + if (sbbs[i].length() == 0) { + sbb.append(' '); + } else if ((sbbs[i].byteAt(0) == httpTemplate.hash) || + (sbbs[i].startsWith(httpTemplate.dpdpa))) { + // this is a template or a part of a template + sbb.append(sbbs[i]); + } else { + // this is a text fragment, generate gettext quotation + sbb.append('_'); + sbb.append('('); + sbb.append(sbbs[i]); + sbb.append(')'); + } + } + //if (sbb.length() > 0) System.out.println("GETTEXT: " + sbb.toString()); + return sbb.getBytes(); + } + if (bluelist != null) { + if (bluelistHit(text)) { + // System.out.println("FILTERHIT: " + text); + return genBlueLetters(text.length); + } else { + return text; + } } return text; } public 
byte[] transformTag0(String tagname, Properties tagopts, byte quotechar) { - if (hit(tagopts.getProperty("src","").getBytes())) return genBlueLetters(5); - if (hit(tagopts.getProperty("alt","").getBytes())) return genBlueLetters(5); + if (tagname.equals("img")) { + // check bluelist + if (bluelistHit(tagopts.getProperty("src","").getBytes())) return genBlueLetters(5); + if (bluelistHit(tagopts.getProperty("alt","").getBytes())) return genBlueLetters(5); + + // replace image alternative name + tagopts.setProperty("alt", new String(transformText(tagopts.getProperty("alt","").getBytes()))); + } + if ((tagname.equals("input")) && (tagopts.getProperty("type").equals("submit"))) { + // rewrite button name + tagopts.setProperty("value", new String(transformText(tagopts.getProperty("value","").getBytes()))); + } return htmlFilterOutputStream.genTag0(tagname, tagopts, quotechar); } public byte[] transformTag1(String tagname, Properties tagopts, byte[] text, byte quotechar) { - if (hit(tagopts.getProperty("href","").getBytes())) return genBlueLetters(text.length); - if (hit(text)) return genBlueLetters(text.length); + if (bluelistHit(tagopts.getProperty("href","").getBytes())) return genBlueLetters(text.length); + if (bluelistHit(text)) return genBlueLetters(text.length); return htmlFilterOutputStream.genTag1(tagname, tagopts, text, quotechar); } diff --git a/source/de/anomic/htmlFilter/htmlFilterOutputStream.java b/source/de/anomic/htmlFilter/htmlFilterOutputStream.java index ef7d8b285..8a60a01ac 100644 --- a/source/de/anomic/htmlFilter/htmlFilterOutputStream.java +++ b/source/de/anomic/htmlFilter/htmlFilterOutputStream.java @@ -59,7 +59,6 @@ import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; import java.net.MalformedURLException; -import java.util.ArrayList; import java.util.Enumeration; import java.util.Properties; import de.anomic.server.serverByteBuffer; @@ -491,27 +490,27 @@ public final class htmlFilterOutputStream extends 
OutputStream { } public static void main(String[] args) { - // test app - // takes one argument: a file name + // takes one argument: a file name if (args.length != 1) return; byte[] buffer = new byte[512]; try { - htmlFilterContentScraper lc = new htmlFilterContentScraper(new URL("http://www.anomic.de/")); - ArrayList v = new ArrayList(); - v.add("proxy"); - htmlFilterTransformer lt = new htmlFilterContentTransformer(); + htmlFilterContentScraper scraper = new htmlFilterContentScraper(new URL("http://localhost:8080")); + htmlFilterTransformer transformer = new htmlFilterContentTransformer(); + transformer.init("gettext"); InputStream is = new FileInputStream(args[0]); FileOutputStream fos = new FileOutputStream(new File(args[0] + ".out")); - OutputStream os = new htmlFilterOutputStream(fos, lc, lt, false); + OutputStream os = new htmlFilterOutputStream(fos, scraper, transformer, false); int i; while ((i = is.read(buffer)) > 0) os.write(buffer, 0, i); os.close(); fos.close(); is.close(); - lc.print(); + scraper.print(); + } catch (MalformedURLException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); } - catch (MalformedURLException e) {} - catch (IOException e) {} } } \ No newline at end of file diff --git a/source/de/anomic/http/httpTemplate.java b/source/de/anomic/http/httpTemplate.java index 40ee85349..a3def41aa 100644 --- a/source/de/anomic/http/httpTemplate.java +++ b/source/de/anomic/http/httpTemplate.java @@ -57,9 +57,12 @@ import java.io.InputStreamReader; import java.io.OutputStream; import java.io.PushbackInputStream; import java.io.UnsupportedEncodingException; +import java.util.ArrayList; import java.util.HashMap; import java.util.Hashtable; +import java.util.List; +import de.anomic.server.serverByteBuffer; import de.anomic.server.serverFileUtils; import de.anomic.server.logging.serverLog; @@ -120,12 +123,15 @@ import de.anomic.server.logging.serverLog; */ public final class httpTemplate { - private static final byte hash 
= (byte)'#'; + public static final byte hash = (byte)'#'; private static final byte[] hasha = {hash}; + private static final byte dp = (byte)':'; + public static final byte[] dpdpa = {dp, dp}; + private static final byte lbr = (byte)'['; private static final byte rbr = (byte)']'; - //private static final byte[] pOpen = {hash, lbr}; + private static final byte[] pOpen = {hash, lbr}; private static final byte[] pClose = {rbr, hash}; private static final byte lcbr = (byte)'{'; @@ -135,13 +141,65 @@ public final class httpTemplate { private static final byte lrbr = (byte)'('; private static final byte rrbr = (byte)')'; - //private static final byte[] aOpen = {hash, lrbr}; + private static final byte[] aOpen = {hash, lrbr}; private static final byte[] aClose = {rrbr, hash}; private static final byte ps = (byte)'%'; - //private static final byte[] iOpen = {hash, ps}; + private static final byte[] iOpen = {hash, ps}; private static final byte[] iClose = {ps, hash}; + public static final Object[] meta_quotation = new Object[] { + new Object[] {pOpen, pClose}, + new Object[] {mOpen, mClose}, + new Object[] {aOpen, aClose}, + new Object[] {iOpen, iClose} + }; + + public static serverByteBuffer[] splitQuotations(serverByteBuffer text) { + List l = splitQuotation(text, 0); + serverByteBuffer[] sbbs = new serverByteBuffer[l.size()]; + for (int i = 0; i < l.size(); i++) sbbs[i] = (serverByteBuffer) l.get(i); + return sbbs; + } + + public static List splitQuotation(serverByteBuffer text, int qoff) { + ArrayList l = new ArrayList(); + if (qoff >= meta_quotation.length) { + if (text.length() > 0) l.add(text); + return l; + } + int p = -1, q; + byte[] left = (byte[]) ((Object[]) meta_quotation[qoff])[0]; + byte[] right = (byte[]) ((Object[]) meta_quotation[qoff])[1]; + qoff++; + while ((text.length() > 0) && ((p = text.indexOf(left)) >= 0)) { + q = text.indexOf(right, p + 1); + if (q >= 0) { + // found a pattern + l.addAll(splitQuotation(new serverByteBuffer(text.getBytes(0, p)), 
qoff)); + l.add(new serverByteBuffer(text.getBytes(p, q + right.length))); + text = new serverByteBuffer(text.getBytes(q + right.length)); + } else { + // found only pattern start, no closing parenthesis (a syntax error that is silently accepted here) + l.addAll(splitQuotation(new serverByteBuffer(text.getBytes(0, p)), qoff)); + l.addAll(splitQuotation(new serverByteBuffer(text.getBytes(p)), qoff)); + text.clear(); + } + } + + // find double-points + while ((text.length() > 0) && ((p = text.indexOf(dpdpa)) >= 0)) { + l.addAll(splitQuotation(new serverByteBuffer(text.getBytes(0, p)), qoff)); + l.add(new serverByteBuffer(dpdpa)); + l.addAll(splitQuotation(new serverByteBuffer(text.getBytes(p + 2)), qoff)); + text.clear(); + } + + // add remaining + if (text.length() > 0) l.addAll(splitQuotation(text, qoff)); + return l; + } + /** * transfer until a specified pattern is found; everything but the pattern is transfered so far * the function returns true, if the pattern is found diff --git a/source/de/anomic/server/serverByteBuffer.java b/source/de/anomic/server/serverByteBuffer.java index dab691662..829bfbdac 100644 --- a/source/de/anomic/server/serverByteBuffer.java +++ b/source/de/anomic/server/serverByteBuffer.java @@ -59,7 +59,7 @@ public final class serverByteBuffer extends OutputStream { public serverByteBuffer() { - buffer = new byte[80]; + buffer = new byte[10]; length = 0; offset = 0; } @@ -125,6 +125,12 @@ public final class serverByteBuffer extends OutputStream { } } + public void clear() { + this.buffer = new byte[0]; + length = 0; + offset = 0; + } + public int length() { return length; } @@ -201,17 +207,46 @@ public final class serverByteBuffer extends OutputStream { return indexOf(b, 0); } + public int indexOf(byte[] bs) { + return indexOf(bs, 0); + } + public int indexOf(byte b, int start) { if (start >= length) return -1; for (int i = start; i < length; i++) if (buffer[offset + i] == b) return i; return -1; } + public int indexOf(byte[] bs, int start) { 
+ if (start + bs.length > length) return -1; + loop: for (int i = start; i <= length - bs.length; i++) { + // first test only first byte + if (buffer[offset + i] != bs[0]) continue loop; + + // then test all remaining bytes + for (int j = 1; j < bs.length; j++) { + if (buffer[offset + i + j] != bs[j]) continue loop; + } + + // found hit + return i; + } + return -1; + } + public int lastIndexOf(byte b) { for (int i = length - 1; i >= 0; i--) if (buffer[offset + i] == b) return i; return -1; } + public boolean startsWith(byte[] bs) { + if (length < bs.length) return false; + for (int i = 0; i < bs.length; i++) { + if (buffer[offset + i] != bs[i]) return false; + } + return true; + } + public byte[] getBytes() { return getBytes(0); }