diff --git a/source/net/yacy/cora/document/MultiProtocolURI.java b/source/net/yacy/cora/document/MultiProtocolURI.java index 0104e0cc5..a5579dbd5 100644 --- a/source/net/yacy/cora/document/MultiProtocolURI.java +++ b/source/net/yacy/cora/document/MultiProtocolURI.java @@ -914,6 +914,7 @@ public class MultiProtocolURI implements Serializable, Comparable failedParser = new HashMap(); - if (MemoryControl.request(sourceArray.length * 6, false)) { - for (final Parser parser: parsers) { + final Map failedParser = new HashMap(); + for (final Parser parser: parsers) { + if (MemoryControl.request(sourceArray.length * 6, false)) { ByteArrayInputStream bis; if (mimeType.equals("text/plain") && parser.getName().equals("HTML Parser")) { // a hack to simulate html files .. is needed for NOLOAD queues. This throws their data into virtual text/plain messages. diff --git a/source/net/yacy/document/parser/html/AbstractScraper.java b/source/net/yacy/document/parser/html/AbstractScraper.java index 889c4e310..ff0c404e6 100644 --- a/source/net/yacy/document/parser/html/AbstractScraper.java +++ b/source/net/yacy/document/parser/html/AbstractScraper.java @@ -36,6 +36,9 @@ import net.yacy.kelondro.util.MemoryControl; public abstract class AbstractScraper implements Scraper { + protected static final String EMPTY_STRING = new String(); + + public static final char sp = ' '; public static final char lb = '<'; public static final char rb = '>'; public static final char sl = '/'; @@ -53,20 +56,25 @@ public abstract class AbstractScraper implements Scraper { this.tags1 = tags1; } + @Override public boolean isTag0(final String tag) { return (this.tags0 != null) && (this.tags0.contains(tag.toLowerCase())); } + @Override public boolean isTag1(final String tag) { return (this.tags1 != null) && (this.tags1.contains(tag.toLowerCase())); } //the 'missing' method that shall be implemented: + @Override public abstract void scrapeText(char[] text, String insideTag); // the other methods must take into account to construct the return value correctly + @Override public abstract void scrapeTag0(String tagname, Properties tagopts); + @Override public abstract void scrapeTag1(String tagname, Properties tagopts, char[] text); protected static String stripAllTags(final char[] s) { @@ -76,7 +84,7 @@ public abstract class AbstractScraper implements Scraper { for (final char c : s) { if (c == lb) { bc++; - r.append(' '); + if (r.length() > 0 && r.charAt(r.length() - 1) != sp) r.append(sp); } else if (c == rb) { bc--; } else if (bc <= 0) { @@ -86,16 +94,42 @@ public abstract class AbstractScraper implements Scraper { return r.toString().trim(); } + protected final static String cleanLine(final String s) { + if (!MemoryControl.request(s.length() * 2, false)) return EMPTY_STRING; + final StringBuilder sb = new StringBuilder(s.length()); + char l = ' '; + char c; + for (int i = 0; i < s.length(); i++) { + c = s.charAt(i); + if (c < ' ') c = ' '; + if (c == ' ') { + if (l != ' ') sb.append(c); + } else { + sb.append(c); + } + l = c; + } + + // return result + return sb.toString().trim(); + } + public static String stripAll(final char[] s) { return CharacterCoding.html2unicode(stripAllTags(s)); } + @Override public void close() { // free resources this.tags0 = null; this.tags1 = null; } + public static void main(String[] args) { + String t = "\\n "; + System.out.println("'" + stripAllTags(t.toCharArray()) + "'"); + } + } diff --git a/source/net/yacy/document/parser/html/ContentScraper.java b/source/net/yacy/document/parser/html/ContentScraper.java index 3ddd7859d..78e176d22 100644 --- a/source/net/yacy/document/parser/html/ContentScraper.java +++ b/source/net/yacy/document/parser/html/ContentScraper.java @@ -55,11 +55,9 @@ import net.yacy.kelondro.io.CharBuffer; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.util.FileUtils; import net.yacy.kelondro.util.ISO639; -import net.yacy.kelondro.util.MemoryControl; public class ContentScraper extends AbstractScraper implements Scraper { - private static final String EMPTY_STRING = new String(); public static final int MAX_DOCSIZE = 40 * 1024 * 1024; private final char degree = '\u00B0'; @@ -364,7 +362,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { } } } else if (tagname.equalsIgnoreCase("area")) { - final String areatitle = cleanLine(tagopts.getProperty("title",EMPTY_STRING)); + final String areatitle = cleanLine(tagopts.getProperty("title", EMPTY_STRING)); //String alt = tagopts.getProperty("alt",EMPTY_STRING); final String href = tagopts.getProperty("href", EMPTY_STRING); if (href.length() > 0) { @@ -539,26 +537,6 @@ public class ContentScraper extends AbstractScraper implements Scraper { return line; } - private final static String cleanLine(final String s) { - if (!MemoryControl.request(s.length() * 2, false)) return EMPTY_STRING; - final StringBuilder sb = new StringBuilder(s.length()); - char l = ' '; - char c; - for (int i = 0; i < s.length(); i++) { - c = s.charAt(i); - if (c < ' ') c = ' '; - if (c == ' ') { - if (l != ' ') sb.append(c); - } else { - sb.append(c); - } - l = c; - } - - // return result - return sb.toString().trim(); - } - public String getTitle() { // construct a title string, even if the document has no title @@ -902,12 +880,14 @@ public class ContentScraper extends AbstractScraper implements Scraper { System.out.println("TEXT :" + this.content.toString()); } + @Override public void registerHtmlFilterEventListener(final ScraperListener listener) { if (listener != null) { this.htmlFilterEventListeners.add(ScraperListener.class, listener); } } + @Override public void deregisterHtmlFilterEventListener(final ScraperListener listener) { if (listener != null) { this.htmlFilterEventListeners.remove(ScraperListener.class, listener); diff --git a/source/net/yacy/search/snippet/TextSnippet.java b/source/net/yacy/search/snippet/TextSnippet.java index 417bb6f9a..3fc977ffc 100644 --- a/source/net/yacy/search/snippet/TextSnippet.java +++ b/source/net/yacy/search/snippet/TextSnippet.java @@ -266,6 +266,7 @@ public class TextSnippet implements Comparable, Comparator