@@ -27,6 +27,7 @@
 package net.yacy.document.parser.html;
 
 import java.io.ByteArrayInputStream;
+import java.io.CharArrayReader;
 import java.io.File;
 import java.io.IOException;
 import java.io.UnsupportedEncodingException;
@@ -46,6 +47,7 @@ import javax.swing.event.EventListenerList;
 import net.yacy.document.parser.htmlParser;
 import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.io.CharBuffer;
+import net.yacy.kelondro.logging.Log;
 import net.yacy.kelondro.util.FileUtils;
 import net.yacy.kelondro.util.ISO639;
 
@@ -118,7 +120,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
 
     public void scrapeText(final char[] newtext, final String insideTag) {
         // System.out.println("SCRAPE: " + new String(newtext));
-        String b = super.stripAll(new String(newtext)).trim();
+        String b = cleanLine(super.stripAll(new String(newtext)));
         if ((insideTag != null) && (!(insideTag.equals("a")))) {
             // texts inside tags sometimes have no punctuation at the line end
             // this is bad for the text sematics, because it is not possible for the
@@ -216,7 +218,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
 
     public void scrapeTag1(final String tagname, final Properties tagopts, final char[] text) {
         // System.out.println("ScrapeTag1: tagname=" + tagname + ", opts=" + tagopts.toString() + ", text=" + new String(text));
-        if ((tagname.equalsIgnoreCase("a")) && (text.length < 2048)) {
+        if (tagname.equalsIgnoreCase("a") && text.length < 2048) {
             final String href = tagopts.getProperty("href", "");
             DigestURI url;
             if ((href.length() > 0) && ((url = absolutePath(href)) != null)) {
@@ -225,38 +227,59 @@
                 final String type = (p < 0) ? "" : f.substring(p + 1);
                 if (type.equals("png") || type.equals("gif") || type.equals("jpg") || type.equals("jpeg")) {
                     // special handling of such urls: put them to the image urls
-                    final ImageEntry ie = new ImageEntry(url, super.stripAll(new String(text)).trim(), -1, -1, -1);
+                    final ImageEntry ie = new ImageEntry(url, recursiveParse(text), -1, -1, -1);
                     addImage(images, ie);
                 } else {
-                    anchors.put(url, super.stripAll(new String(text)).trim());
+                    anchors.put(url, recursiveParse(text));
                 }
             }
         }
         String h;
         if ((tagname.equalsIgnoreCase("h1")) && (text.length < 1024)) {
-            h = cleanLine(super.stripAll(new String(text)));
+            h = recursiveParse(text);
             if (h.length() > 0) headlines[0].add(h);
         }
         if ((tagname.equalsIgnoreCase("h2")) && (text.length < 1024)) {
-            h = cleanLine(super.stripAll(new String(text)));
+            h = recursiveParse(text);
             if (h.length() > 0) headlines[1].add(h);
         }
         if ((tagname.equalsIgnoreCase("h3")) && (text.length < 1024)) {
-            h = cleanLine(super.stripAll(new String(text)));
+            System.out.println("TTT " + new String(text));
+            h = recursiveParse(text);
             if (h.length() > 0) headlines[2].add(h);
         }
         if ((tagname.equalsIgnoreCase("h4")) && (text.length < 1024)) {
-            h = cleanLine(super.stripAll(new String(text)));
+            h = recursiveParse(text);
             if (h.length() > 0) headlines[3].add(h);
         }
         if ((tagname.equalsIgnoreCase("title")) && (text.length < 1024)) {
-            title = cleanLine(super.stripAll(new String(text)));
+            title = recursiveParse(text);
         }
 
         // fire event
         fireScrapeTag1(tagname, tagopts, text);
     }
 
+    private String recursiveParse(char[] inlineHtml) {
+        if (inlineHtml.length < 14) return cleanLine(super.stripAll(new String(inlineHtml)));
+        
+        // start a new scraper to parse links inside this text
+        // parsing the content
+        final ContentScraper scraper = new ContentScraper(this.root);
+        final TransformerWriter writer = new TransformerWriter(null, null, scraper, null, false);
+        try {
+            FileUtils.copy(new CharArrayReader(inlineHtml), writer);
+            writer.close();
+        } catch (IOException e) {
+            Log.logException(e);
+            return cleanLine(super.stripAll(new String(inlineHtml)));
+        }
+        this.anchors.putAll(scraper.getAnchors());
+        this.images.putAll(scraper.images);
+        
+        return cleanLine(super.stripAll(new String(scraper.content.getChars())));
+    }
+
     private static String cleanLine(String s) {
         // may contain too many funny symbols
         for (int i = 0; i < s.length(); i++)
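
Note (illustration only, not part of the patch): the new recursiveParse() runs a second ContentScraper over the markup found inside an anchor or headline tag, so links and images nested in that markup are collected instead of being stripped away, while only the cleaned plain text is returned as the tag's label. A minimal standalone sketch of the same pattern follows; the base URL and the DigestURI(String, byte[]) constructor are assumptions for the example, the rest mirrors the calls used in the patch.

import java.io.CharArrayReader;
import java.io.IOException;

import net.yacy.document.parser.html.ContentScraper;
import net.yacy.document.parser.html.TransformerWriter;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.util.FileUtils;

public class InlineScrapeSketch {
    public static void main(final String[] args) throws IOException {
        // markup as it may appear inside an <h1> or <a> tag
        final char[] inlineHtml = "see <a href=\"deep/link.html\">nested link</a> here".toCharArray();

        // nested scraper fed through a TransformerWriter, as in recursiveParse()
        final ContentScraper scraper = new ContentScraper(new DigestURI("http://example.net/page.html", null)); // hypothetical base URL
        final TransformerWriter writer = new TransformerWriter(null, null, scraper, null, false);
        FileUtils.copy(new CharArrayReader(inlineHtml), writer);
        writer.close();

        // the nested anchor was resolved against the base URL; the patch merges such results into the outer scraper via putAll
        System.out.println(scraper.getAnchors());
    }
}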