parser refactoring & hacks

13 years ago · de903a53a0
parent 8a82609360
commit de903a53a0
5 changed files with 47 additions and 31 deletions
--- a/source/net/yacy/cora/document/MultiProtocolURI.java
+++ b/source/net/yacy/cora/document/MultiProtocolURI.java
@ -914,6 +914,7 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
          this.port == other.port;
    }

+    @Override
    public int compareTo(final MultiProtocolURI h) {
        return toString().compareTo(h.toString());
    }
@ -1842,7 +1843,7 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
     * Please check isHTTP(), isHTTPS() or isFTP() before calling this method
     */
    public java.net.URL getURL() throws MalformedURLException {
-        if (!(isHTTP() || isHTTPS() || isFTP())) throw new UnsupportedOperationException();
+        if (!(isHTTP() || isHTTPS() || isFTP())) throw new MalformedURLException();
        return new java.net.URL(this.toNormalform(false, true));
    }

@ -1850,8 +1851,8 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
     * create a standard java File.
     * Please check isFile() before calling this method
     */
-    public java.io.File getFSFile() {
-        if (!isFile()) throw new UnsupportedOperationException();
+    public java.io.File getFSFile() throws MalformedURLException {
+        if (!isFile()) throw new MalformedURLException();
        return new java.io.File(this.toNormalform(false, true).substring(7));
    }

@ -1861,7 +1862,7 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
     * @throws MalformedURLException
     */
    public SmbFile getSmbFile() throws MalformedURLException {
-        if (!isSMB()) throw new UnsupportedOperationException();
+        if (!isSMB()) throw new MalformedURLException();
        final String url = unescape(this.toNormalform(false, true));
        return new SmbFile(url);
    }
--- a/source/net/yacy/document/TextParser.java
+++ b/source/net/yacy/document/TextParser.java
@ -281,9 +281,9 @@ public final class TextParser {
        assert !parsers.isEmpty();

        Document[] docs = null;
-        final HashMap<Parser, Parser.Failure> failedParser = new HashMap<Parser, Parser.Failure>();
-        if (MemoryControl.request(sourceArray.length * 6, false)) {
-            for (final Parser parser: parsers) {
+        final Map<Parser, Parser.Failure> failedParser = new HashMap<Parser, Parser.Failure>();
+        for (final Parser parser: parsers) {
+            if (MemoryControl.request(sourceArray.length * 6, false)) {
            	ByteArrayInputStream bis;
            	if (mimeType.equals("text/plain") && parser.getName().equals("HTML Parser")) {
            	    // a hack to simulate html files .. is needed for NOLOAD queues. This throws their data into virtual text/plain messages.
--- a/source/net/yacy/document/parser/html/AbstractScraper.java
+++ b/source/net/yacy/document/parser/html/AbstractScraper.java
@ -36,6 +36,9 @@ import net.yacy.kelondro.util.MemoryControl;

 public abstract class AbstractScraper implements Scraper {

+    protected static final String EMPTY_STRING = new String();
+
+    public static final char sp = ' ';
    public static final char lb = '<';
    public static final char rb = '>';
    public static final char sl = '/';
@ -53,20 +56,25 @@ public abstract class AbstractScraper implements Scraper {
        this.tags1  = tags1;
    }

+    @Override
    public boolean isTag0(final String tag) {
        return (this.tags0 != null) && (this.tags0.contains(tag.toLowerCase()));
    }

+    @Override
    public boolean isTag1(final String tag) {
        return (this.tags1 != null) && (this.tags1.contains(tag.toLowerCase()));
    }

    //the 'missing' method that shall be implemented:
+    @Override
    public abstract void scrapeText(char[] text, String insideTag);

    // the other methods must take into account to construct the return value correctly
+    @Override
    public abstract void scrapeTag0(String tagname, Properties tagopts);

+    @Override
    public abstract void scrapeTag1(String tagname, Properties tagopts, char[] text);

    protected static String stripAllTags(final char[] s) {
@ -76,7 +84,7 @@ public abstract class AbstractScraper implements Scraper {
        for (final char c : s) {
            if (c == lb) {
                bc++;
-                r.append(' ');
+                if (r.length() > 0 && r.charAt(r.length() - 1) != sp) r.append(sp);
            } else if (c == rb) {
                bc--;
            } else if (bc <= 0) {
@ -86,16 +94,42 @@ public abstract class AbstractScraper implements Scraper {
        return r.toString().trim();
    }

+    protected final static String cleanLine(final String s) {
+        if (!MemoryControl.request(s.length() * 2, false)) return EMPTY_STRING;
+        final StringBuilder sb = new StringBuilder(s.length());
+        char l = ' ';
+        char c;
+        for (int i = 0; i < s.length(); i++) {
+            c = s.charAt(i);
+            if (c < ' ') c = ' ';
+            if (c == ' ') {
+                if (l != ' ') sb.append(c);
+            } else {
+                sb.append(c);
+            }
+            l = c;
+        }
+
+        // return result
+        return sb.toString().trim();
+    }
+
    public static String stripAll(final char[] s) {
        return CharacterCoding.html2unicode(stripAllTags(s));
    }

+    @Override
    public void close() {
        // free resources
        this.tags0 = null;
        this.tags1 = null;
    }

+    public static void main(String[] args) {
+        String t = "<script src=\"navigation.js\" type=\"text/javascript\"></script>\\n <script src=\"../js/prototype.js\" type=\"text/javascript\"></script>";
+        System.out.println("'" + stripAllTags(t.toCharArray()) + "'");
+    }
+
 }


--- a/source/net/yacy/document/parser/html/ContentScraper.java
+++ b/source/net/yacy/document/parser/html/ContentScraper.java
@ -55,11 +55,9 @@ import net.yacy.kelondro.io.CharBuffer;
 import net.yacy.kelondro.logging.Log;
 import net.yacy.kelondro.util.FileUtils;
 import net.yacy.kelondro.util.ISO639;
-import net.yacy.kelondro.util.MemoryControl;


 public class ContentScraper extends AbstractScraper implements Scraper {
-	private static final String EMPTY_STRING = new String();
 	public static final int MAX_DOCSIZE = 40 * 1024 * 1024;

    private final char degree = '\u00B0';
@ -364,7 +362,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
                }
            }
        } else if (tagname.equalsIgnoreCase("area")) {
-            final String areatitle = cleanLine(tagopts.getProperty("title",EMPTY_STRING));
+            final String areatitle = cleanLine(tagopts.getProperty("title", EMPTY_STRING));
            //String alt   = tagopts.getProperty("alt",EMPTY_STRING);
            final String href  = tagopts.getProperty("href", EMPTY_STRING);
            if (href.length() > 0) {
@ -539,26 +537,6 @@ public class ContentScraper extends AbstractScraper implements Scraper {
        return line;
    }

-    private final static String cleanLine(final String s) {
-        if (!MemoryControl.request(s.length() * 2, false)) return EMPTY_STRING;
-        final StringBuilder sb = new StringBuilder(s.length());
-        char l = ' ';
-        char c;
-        for (int i = 0; i < s.length(); i++) {
-            c = s.charAt(i);
-            if (c < ' ') c = ' ';
-            if (c == ' ') {
-                if (l != ' ') sb.append(c);
-            } else {
-                sb.append(c);
-            }
-            l = c;
-        }
-
-        // return result
-        return sb.toString().trim();
-    }
-
    public String getTitle() {
        // construct a title string, even if the document has no title

@ -902,12 +880,14 @@ public class ContentScraper extends AbstractScraper implements Scraper {
        System.out.println("TEXT     :" + this.content.toString());
    }

+    @Override
    public void registerHtmlFilterEventListener(final ScraperListener listener) {
        if (listener != null) {
            this.htmlFilterEventListeners.add(ScraperListener.class, listener);
        }
    }

+    @Override
    public void deregisterHtmlFilterEventListener(final ScraperListener listener) {
        if (listener != null) {
            this.htmlFilterEventListeners.remove(ScraperListener.class, listener);
--- a/source/net/yacy/search/snippet/TextSnippet.java
+++ b/source/net/yacy/search/snippet/TextSnippet.java
@ -266,6 +266,7 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
                this.resultStatus = ResultClass.SOURCE_WEB;
            }

+            // parse the document to get all sentences; available for snippet computation
            Document document = null;
            try {
                document = Document.mergeDocuments(response.url(), response.getMimeType(), response.parse());