From 4e9f02c8ec04dea7bdf260dfe788611c7e01f2ca Mon Sep 17 00:00:00 2001 From: allo Date: Wed, 26 Jul 2006 23:11:15 +0000 Subject: [PATCH] integration of Michaels string-extraction. git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2337 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- source/de/anomic/data/gettext.java | 30 +++++++++++++++++-- .../htmlFilterContentTransformer.java | 22 ++++++++++++++ 2 files changed, 50 insertions(+), 2 deletions(-) diff --git a/source/de/anomic/data/gettext.java b/source/de/anomic/data/gettext.java index 8996d2285..606d69227 100644 --- a/source/de/anomic/data/gettext.java +++ b/source/de/anomic/data/gettext.java @@ -27,6 +27,7 @@ import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; +import java.io.FileReader; import java.io.IOException; import java.io.InputStreamReader; import java.text.SimpleDateFormat; @@ -36,6 +37,11 @@ import java.util.HashMap; import java.util.Iterator; import java.util.Map; +import de.anomic.htmlFilter.htmlFilterAbstractTransformer; +import de.anomic.htmlFilter.htmlFilterContentTransformer; +import de.anomic.server.serverAbstractSwitch; +import de.anomic.server.logging.serverLog; + public class gettext{ public static ArrayList createGettextRecursive(File sourceDir, String extensions, String notdir, File oldgettextfile) throws FileNotFoundException{ if(oldgettextfile==null) @@ -121,7 +127,10 @@ public class gettext{ while(it.hasNext()){ try { filename=(String)it.next(); - tmp=getGettextSource(new File(filename), oldgettext); + //TODO: better possibility to switch the behaviour + //tmp=getGettextSource(new File(filename), oldgettext); + tmp=getGettextSourceFromHTML(new File(filename), oldgettext); + serverLog.logFinest("Gettext", "Extracting Strings from: "+filename); } catch (FileNotFoundException e) { System.out.println("File \""+filename+"\" not found."); } @@ -139,10 +148,27 @@ public class gettext{ return getGettextSource(inputfile, new HashMap()); } public static ArrayList getGettextSource(File inputfile, Map oldgettextmap) throws FileNotFoundException{ + ArrayList strings=getGettextItems(inputfile); + return getGettextSource(inputfile, oldgettextmap, strings); + } + public static ArrayList getGettextSourceFromHTML(File inputfile, Map oldgettextmap) throws FileNotFoundException{ + htmlFilterContentTransformer transformer=new htmlFilterContentTransformer(); + BufferedReader br=new BufferedReader(new FileReader(inputfile)); + StringBuffer content=new StringBuffer(); + String line=""; + try { + while((line=br.readLine())!=null){ + content.append(line).append("\n"); + } + } catch (IOException e) {} + ArrayList strings = transformer.getStrings(content.toString().getBytes()); + return getGettextSource(inputfile, oldgettextmap, strings); + } + public static ArrayList getGettextSource(File inputfile, Map oldgettextmap, ArrayList strings) throws FileNotFoundException{ if(oldgettextmap==null) oldgettextmap=new HashMap(); - ArrayList strings=getGettextItems(inputfile); + ArrayList list=new ArrayList(); Iterator it=strings.iterator(); if(strings.isEmpty()) diff --git a/source/de/anomic/htmlFilter/htmlFilterContentTransformer.java b/source/de/anomic/htmlFilter/htmlFilterContentTransformer.java index 26e315935..838ad8e16 100644 --- a/source/de/anomic/htmlFilter/htmlFilterContentTransformer.java +++ b/source/de/anomic/htmlFilter/htmlFilterContentTransformer.java @@ -136,6 +136,28 @@ public class htmlFilterContentTransformer extends htmlFilterAbstractTransformer return false; } + public ArrayList getStrings(byte[] text){ + ArrayList result=new ArrayList(); + + serverByteBuffer sbb = new serverByteBuffer(text); + serverByteBuffer[] sbbs = httpTemplate.splitQuotations(sbb); + //sbb = new serverByteBuffer(); + for (int i = 0; i < sbbs.length; i++) { + if (sbbs[i].isWhitespace(true)) { + //sbb.append(sbbs[i]); + } else if ((sbbs[i].byteAt(0) == httpTemplate.hash) || + (sbbs[i].startsWith(httpTemplate.dpdpa))) { + // this is a template or a part of a template + //sbb.append(sbbs[i]); + } else { + // this is a text fragment, generate gettext quotation + int ws = sbbs[i].whitespaceStart(true); + int we = sbbs[i].whitespaceEnd(true); + result.add(new String(sbbs[i].getBytes(ws, we))); + } + } + return result; + } public byte[] transformText(byte[] text) { if (gettext) { serverByteBuffer sbb = new serverByteBuffer(text);