From fd61209797c961db46e706929c1fe7d49ab4679f Mon Sep 17 00:00:00 2001 From: orbiter Date: Sun, 8 Oct 2006 01:24:00 +0000 Subject: [PATCH] lines inside tags without punctuation are extended by a single dot. This enables the condenser to distinguish the lines in a better way. The result is a better preparation of snippets. git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2715 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- .../htmlFilter/htmlFilterAbstractScraper.java | 2 +- .../htmlFilter/htmlFilterContentScraper.java | 19 ++++++++++++++++--- .../anomic/htmlFilter/htmlFilterScraper.java | 2 +- .../anomic/htmlFilter/htmlFilterWriter.java | 4 ++-- source/de/anomic/plasma/plasmaCondenser.java | 13 +++++-------- 5 files changed, 25 insertions(+), 15 deletions(-) diff --git a/source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java b/source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java index 5c2855d8d..f90f188db 100644 --- a/source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java +++ b/source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java @@ -304,7 +304,7 @@ public abstract class htmlFilterAbstractScraper implements htmlFilterScraper { } //the 'missing' method that shall be implemented: - public abstract void scrapeText(char[] text); + public abstract void scrapeText(char[] text, boolean insideTag); // the other methods must take into account to construct the return value correctly public abstract void scrapeTag0(String tagname, Properties tagopts); diff --git a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java index 44f33cb81..d7a99276e 100644 --- a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java +++ b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java @@ -115,10 +115,23 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen this.content = new serverCharBuffer(1024); } - public void scrapeText(char[] newtext) { + public final static 
boolean punctuation(char c) { + return (c == '.') || (c == '!') || (c == '?'); + } + + public void scrapeText(char[] newtext, boolean insideTag) { // System.out.println("SCRAPE: " + new String(newtext)); - if ((content.length() != 0) && (content.charAt(content.length() - 1) != 32)) content.append(32); - content.append(super.stripAll(new serverCharBuffer(newtext, newtext.length + 1)).trim()).append(32); + serverCharBuffer b = super.stripAll(new serverCharBuffer(newtext, newtext.length + 1)).trim(); + if (insideTag) { + // texts inside tags sometimes have no punctuation at the line end + // this is bad for the text semantics, because it is not possible for the + // condenser to distinguish headlines from text beginnings. + // to make it easier for the condenser, a dot ('.') is appended in case that + // no punctuation is part of the newtext line + if ((b.length() != 0) && (!(punctuation(b.charAt(b.length() - 1))))) b.append('.'); + //System.out.println("*** Appended dot: " + b.toString()); + } + if (b.length() != 0) content.append(b).append((char) 32); } public static final String splitrex = " |/|\\(|\\)|-|\\:|_|\\.|,|\\?|!|'|" + '"'; diff --git a/source/de/anomic/htmlFilter/htmlFilterScraper.java b/source/de/anomic/htmlFilter/htmlFilterScraper.java index 678eaaf4c..8c506de70 100644 --- a/source/de/anomic/htmlFilter/htmlFilterScraper.java +++ b/source/de/anomic/htmlFilter/htmlFilterScraper.java @@ -48,7 +48,7 @@ public interface htmlFilterScraper { public boolean isTag1(String tag); - public void scrapeText(char[] text); + public void scrapeText(char[] text, boolean insideTag); public void scrapeTag0(String tagname, Properties tagopts); diff --git a/source/de/anomic/htmlFilter/htmlFilterWriter.java b/source/de/anomic/htmlFilter/htmlFilterWriter.java index f174876a2..3045643c3 100644 --- a/source/de/anomic/htmlFilter/htmlFilterWriter.java +++ b/source/de/anomic/htmlFilter/htmlFilterWriter.java @@ -186,7 +186,7 @@ public final class htmlFilterWriter extends Writer { // 
we are not collection tag text if (tag == null) { // and this is not a tag opener/closer - if (scraper != null) scraper.scrapeText(content); + if (scraper != null) scraper.scrapeText(content, false); if (transformer != null) return transformer.transformText(content); return content; } @@ -221,7 +221,7 @@ public final class htmlFilterWriter extends Writer { // we are collection tag text for the tag 'filterTag' if (tag == null) { // go on collecting content - if (scraper != null) scraper.scrapeText(content); + if (scraper != null) scraper.scrapeText(content, true); if (transformer != null) { filterCont.append(transformer.transformText(content)); } else { diff --git a/source/de/anomic/plasma/plasmaCondenser.java b/source/de/anomic/plasma/plasmaCondenser.java index 6dc70d2f4..a6f91fb3f 100644 --- a/source/de/anomic/plasma/plasmaCondenser.java +++ b/source/de/anomic/plasma/plasmaCondenser.java @@ -61,6 +61,7 @@ import java.util.Map; import java.util.TreeMap; import java.util.TreeSet; +import de.anomic.htmlFilter.htmlFilterContentScraper; import de.anomic.kelondro.kelondroMSetTools; public final class plasmaCondenser { @@ -192,7 +193,7 @@ public final class plasmaCondenser { word = ((String) wordenum.nextElement()).toLowerCase(); // TODO: does toLowerCase work for non ISO-8859-1 chars? // System.out.println("PARSED-WORD " + word); wordlen = word.length(); - if ((wordlen == 1) && (punctuation(word.charAt(0)))) { + if ((wordlen == 1) && (htmlFilterContentScraper.punctuation(word.charAt(0)))) { // store sentence if (sentence.length() > 0) { // we store the punctuation symbol as first element of the sentence vector @@ -470,10 +471,6 @@ public final class plasmaCondenser { writer.close(); } - protected final static boolean punctuation(char c) { - return (c == '.') || (c == '!') || (c == '?'); - } - public final static boolean invisible(char c) { // TODO: Bugfix for UTF-8: does this work for non ISO-8859-1 chars? 
if ((c < ' ') || (c > 'z')) return true; @@ -507,7 +504,7 @@ public final class plasmaCondenser { char c; loop: while (e.hasMoreElements()) { s = (String) e.nextElement(); - if ((s.length() == 1) && (punctuation(s.charAt(0)))) return s; + if ((s.length() == 1) && (htmlFilterContentScraper.punctuation(s.charAt(0)))) return s; if (s.length() < ml) continue loop; for (int i = 0; i < s.length(); i++) { c = s.charAt(i); @@ -562,7 +559,7 @@ public final class plasmaCondenser { for (int i = 0; i < r.length(); i++) { c = r.charAt(i); if (invisible(c)) sb = sb.append(' '); // TODO: Bugfix needed for UTF-8 - else if (punctuation(c)) sb = sb.append(' ').append(c).append(' '); + else if (htmlFilterContentScraper.punctuation(c)) sb = sb.append(' ').append(c).append(' '); else sb = sb.append(c); } s = sb.toString().trim(); @@ -721,7 +718,7 @@ public final class plasmaCondenser { if (nextChar < 0) return null; c = (char) nextChar; s.append(c); - if (punctuation(c)) break; + if (htmlFilterContentScraper.punctuation(c)) break; } // replace line endings and tabs by blanks