Text lines inside tags that do not end with punctuation are extended by a single dot.

This enables the condenser to distinguish such lines (for example headlines) from the beginning of the text that follows.
The result is better preparation of snippets.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2715 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 19 years ago
parent e25172853a
commit fd61209797

@ -304,7 +304,7 @@ public abstract class htmlFilterAbstractScraper implements htmlFilterScraper {
} }
//the 'missing' method that shall be implemented: //the 'missing' method that shall be implemented:
public abstract void scrapeText(char[] text); public abstract void scrapeText(char[] text, boolean insideTag);
// the other methods must take into account to construct the return value correctly // the other methods must take into account to construct the return value correctly
public abstract void scrapeTag0(String tagname, Properties tagopts); public abstract void scrapeTag0(String tagname, Properties tagopts);

@ -115,10 +115,23 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
this.content = new serverCharBuffer(1024); this.content = new serverCharBuffer(1024);
} }
public void scrapeText(char[] newtext) { public final static boolean punctuation(char c) {
return (c == '.') || (c == '!') || (c == '?');
}
public void scrapeText(char[] newtext, boolean insideTag) {
// System.out.println("SCRAPE: " + new String(newtext)); // System.out.println("SCRAPE: " + new String(newtext));
if ((content.length() != 0) && (content.charAt(content.length() - 1) != 32)) content.append(32); serverCharBuffer b = super.stripAll(new serverCharBuffer(newtext, newtext.length + 1)).trim();
content.append(super.stripAll(new serverCharBuffer(newtext, newtext.length + 1)).trim()).append(32); if (insideTag) {
// texts inside tags sometimes have no punctuation at the line end
// this is bad for the text semantics, because it is not possible for the
// condenser to distinguish headlines from text beginnings.
// to make it easier for the condenser, a dot ('.') is appended in case that
// no punctuation is part of the newtext line
if ((b.length() != 0) && (!(punctuation(b.charAt(b.length() - 1))))) b.append('.');
//System.out.println("*** Appended dot: " + b.toString());
}
if (b.length() != 0) content.append(b).append((char) 32);
} }
public static final String splitrex = " |/|\\(|\\)|-|\\:|_|\\.|,|\\?|!|'|" + '"'; public static final String splitrex = " |/|\\(|\\)|-|\\:|_|\\.|,|\\?|!|'|" + '"';

@ -48,7 +48,7 @@ public interface htmlFilterScraper {
public boolean isTag1(String tag); public boolean isTag1(String tag);
public void scrapeText(char[] text); public void scrapeText(char[] text, boolean insideTag);
public void scrapeTag0(String tagname, Properties tagopts); public void scrapeTag0(String tagname, Properties tagopts);

@ -186,7 +186,7 @@ public final class htmlFilterWriter extends Writer {
// we are not collection tag text // we are not collection tag text
if (tag == null) { if (tag == null) {
// and this is not a tag opener/closer // and this is not a tag opener/closer
if (scraper != null) scraper.scrapeText(content); if (scraper != null) scraper.scrapeText(content, false);
if (transformer != null) return transformer.transformText(content); if (transformer != null) return transformer.transformText(content);
return content; return content;
} }
@ -221,7 +221,7 @@ public final class htmlFilterWriter extends Writer {
// we are collection tag text for the tag 'filterTag' // we are collection tag text for the tag 'filterTag'
if (tag == null) { if (tag == null) {
// go on collecting content // go on collecting content
if (scraper != null) scraper.scrapeText(content); if (scraper != null) scraper.scrapeText(content, true);
if (transformer != null) { if (transformer != null) {
filterCont.append(transformer.transformText(content)); filterCont.append(transformer.transformText(content));
} else { } else {

@ -61,6 +61,7 @@ import java.util.Map;
import java.util.TreeMap; import java.util.TreeMap;
import java.util.TreeSet; import java.util.TreeSet;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.kelondro.kelondroMSetTools; import de.anomic.kelondro.kelondroMSetTools;
public final class plasmaCondenser { public final class plasmaCondenser {
@ -192,7 +193,7 @@ public final class plasmaCondenser {
word = ((String) wordenum.nextElement()).toLowerCase(); // TODO: does toLowerCase work for non ISO-8859-1 chars? word = ((String) wordenum.nextElement()).toLowerCase(); // TODO: does toLowerCase work for non ISO-8859-1 chars?
// System.out.println("PARSED-WORD " + word); // System.out.println("PARSED-WORD " + word);
wordlen = word.length(); wordlen = word.length();
if ((wordlen == 1) && (punctuation(word.charAt(0)))) { if ((wordlen == 1) && (htmlFilterContentScraper.punctuation(word.charAt(0)))) {
// store sentence // store sentence
if (sentence.length() > 0) { if (sentence.length() > 0) {
// we store the punctuation symbol as first element of the sentence vector // we store the punctuation symbol as first element of the sentence vector
@ -470,10 +471,6 @@ public final class plasmaCondenser {
writer.close(); writer.close();
} }
protected final static boolean punctuation(char c) {
return (c == '.') || (c == '!') || (c == '?');
}
public final static boolean invisible(char c) { public final static boolean invisible(char c) {
// TODO: Bugfix for UTF-8: does this work for non ISO-8859-1 chars? // TODO: Bugfix for UTF-8: does this work for non ISO-8859-1 chars?
if ((c < ' ') || (c > 'z')) return true; if ((c < ' ') || (c > 'z')) return true;
@ -507,7 +504,7 @@ public final class plasmaCondenser {
char c; char c;
loop: while (e.hasMoreElements()) { loop: while (e.hasMoreElements()) {
s = (String) e.nextElement(); s = (String) e.nextElement();
if ((s.length() == 1) && (punctuation(s.charAt(0)))) return s; if ((s.length() == 1) && (htmlFilterContentScraper.punctuation(s.charAt(0)))) return s;
if (s.length() < ml) continue loop; if (s.length() < ml) continue loop;
for (int i = 0; i < s.length(); i++) { for (int i = 0; i < s.length(); i++) {
c = s.charAt(i); c = s.charAt(i);
@ -562,7 +559,7 @@ public final class plasmaCondenser {
for (int i = 0; i < r.length(); i++) { for (int i = 0; i < r.length(); i++) {
c = r.charAt(i); c = r.charAt(i);
if (invisible(c)) sb = sb.append(' '); // TODO: Bugfix needed for UTF-8 if (invisible(c)) sb = sb.append(' '); // TODO: Bugfix needed for UTF-8
else if (punctuation(c)) sb = sb.append(' ').append(c).append(' '); else if (htmlFilterContentScraper.punctuation(c)) sb = sb.append(' ').append(c).append(' ');
else sb = sb.append(c); else sb = sb.append(c);
} }
s = sb.toString().trim(); s = sb.toString().trim();
@ -721,7 +718,7 @@ public final class plasmaCondenser {
if (nextChar < 0) return null; if (nextChar < 0) return null;
c = (char) nextChar; c = (char) nextChar;
s.append(c); s.append(c);
if (punctuation(c)) break; if (htmlFilterContentScraper.punctuation(c)) break;
} }
// replace line endings and tabs by blanks // replace line endings and tabs by blanks

Loading…
Cancel
Save