- added an 'add every media object linked in an HTML document as a new document' feature to the HTML parser. As a result, every image, application, video, or audio file that is linked in an HTML file is added as a document. This means that parsing a single HTML document may cause a number of documents to be inserted into the search index.

- some refactoring of MIME type discovery

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7919 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 14 years ago
parent 9a8937f8b6
commit 610b01e1c3

@ -33,12 +33,12 @@ import java.util.List;
import de.anomic.search.Segments;
import de.anomic.search.Switchboard;
import de.anomic.crawler.CrawlProfile;
import de.anomic.data.MimeTable;
import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.cora.protocol.ftp.FTPClient;
import net.yacy.document.Classification;
import net.yacy.document.TextParser;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
@ -102,7 +102,7 @@ public class FileLoader {
}
// create response header
String mime = MimeTable.ext2mime(url.getFileExtension());
String mime = Classification.ext2mime(url.getFileExtension());
ResponseHeader responseHeader = new ResponseHeader();
responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date(url.lastModified())));
responseHeader.put(HeaderFramework.CONTENT_TYPE, mime);

@ -42,13 +42,13 @@ import jcifs.smb.SmbFileInputStream;
import de.anomic.search.Segments;
import de.anomic.search.Switchboard;
import de.anomic.crawler.CrawlProfile;
import de.anomic.data.MimeTable;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.cora.protocol.ftp.FTPClient;
import net.yacy.document.Classification;
import net.yacy.document.TextParser;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
@ -122,7 +122,7 @@ public class SMBLoader {
}
// create response header
String mime = MimeTable.ext2mime(url.getFileExtension());
String mime = Classification.ext2mime(url.getFileExtension());
ResponseHeader responseHeader = new ResponseHeader();
responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date(url.lastModified())));
responseHeader.put(HeaderFramework.CONTENT_TYPE, mime);

@ -1,53 +0,0 @@
package de.anomic.data;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.util.Properties;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.kelondro.logging.Log;
/**
 * File-extension to MIME-type lookup backed by a properties file
 * (extension=mimetype lines). The table is loaded once via init(); until
 * then, lookups fall back to their default values.
 * NOTE(review): this class is deleted by this commit in favour of
 * net.yacy.document.Classification, which carries the same methods.
 */
public class MimeTable {

    // extension -> MIME type mapping, populated by init()
    private static final Properties mimeTable = new Properties();

    /**
     * Loads the MIME mapping from the given properties file. Does nothing
     * if the table already holds entries; load errors are logged, not thrown.
     *
     * @param mimeFile properties file mapping extensions to MIME types
     */
    public static void init(final File mimeFile) {
        if (!mimeTable.isEmpty()) return;
        BufferedInputStream in = null;
        try {
            in = new BufferedInputStream(new FileInputStream(mimeFile));
            mimeTable.load(in);
        } catch (final Exception e) {
            Log.logException(e);
        } finally {
            if (in != null) {
                try {
                    in.close();
                } catch (final Exception ignored) {
                    // best-effort close; nothing useful to do on failure
                }
            }
        }
    }

    /** @return the number of extension mappings currently loaded */
    public static int size() {
        return mimeTable.size();
    }

    /** @return true if init() has not (successfully) loaded any mapping yet */
    public static boolean isEmpty() {
        return mimeTable.isEmpty();
    }

    /**
     * Looks up the MIME type for a file extension, falling back to the
     * generic "application/&lt;ext&gt;" when the extension is unknown.
     */
    public static String ext2mime(final String ext) {
        return mimeTable.getProperty(ext, "application/" + ext);
    }

    /** Looks up the MIME type for an extension with a caller-supplied default. */
    public static String ext2mime(final String ext, final String dfltMime) {
        return mimeTable.getProperty(ext, dfltMime);
    }

    /** Derives the MIME type from the URL's file extension, with a default. */
    public static String url2mime(final MultiProtocolURI url, final String dfltMime) {
        return ext2mime(url.getFileExtension(), dfltMime);
    }

    /** Derives the MIME type from the URL's file extension. */
    public static String url2mime(final MultiProtocolURI url) {
        return ext2mime(url.getFileExtension());
    }
}

@ -105,7 +105,6 @@ import net.yacy.kelondro.util.ByteBuffer;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.util.MemoryControl;
import net.yacy.visualization.RasterPlotter;
import de.anomic.data.MimeTable;
import de.anomic.search.Switchboard;
import de.anomic.search.SwitchboardConstants;
import de.anomic.server.serverClassLoader;
@ -153,11 +152,11 @@ public final class HTTPDFileHandler {
if (switchboard == null) {
switchboard = theSwitchboard;
if (MimeTable.isEmpty()) {
if (Classification.countMimes() == 0) {
// load the mime table
final String mimeTablePath = theSwitchboard.getConfig("mimeTable","");
Log.logConfig("HTTPDFiles", "Loading mime mapping file " + mimeTablePath);
MimeTable.init(new File(theSwitchboard.getAppPath(), mimeTablePath));
Classification.init(new File(theSwitchboard.getAppPath(), mimeTablePath));
}
// create default files array
@ -586,7 +585,7 @@ public final class HTTPDFileHandler {
// send an image to client
targetDate = new Date(System.currentTimeMillis());
nocache = true;
final String mimeType = MimeTable.ext2mime(targetExt, "text/html");
final String mimeType = Classification.ext2mime(targetExt, "text/html");
final ByteBuffer result = RasterPlotter.exportImage(yp.getImage(), targetExt);
// write the array to the client
@ -600,7 +599,7 @@ public final class HTTPDFileHandler {
// send an image to client
targetDate = new Date(System.currentTimeMillis());
nocache = true;
final String mimeType = MimeTable.ext2mime(targetExt, "text/html");
final String mimeType = Classification.ext2mime(targetExt, "text/html");
final ByteBuffer result = yp.getImage();
// write the array to the client
@ -634,7 +633,7 @@ public final class HTTPDFileHandler {
// send an image to client
targetDate = new Date(System.currentTimeMillis());
nocache = true;
final String mimeType = MimeTable.ext2mime(targetExt, "text/html");
final String mimeType = Classification.ext2mime(targetExt, "text/html");
// generate an byte array from the generated image
int width = i.getWidth(null); if (width < 0) width = 96; // bad hack
@ -805,7 +804,7 @@ public final class HTTPDFileHandler {
// we have found a file that can be written to the client
// if this file uses templates, then we use the template
// re-write - method to create an result
String mimeType = MimeTable.ext2mime(targetExt, "text/html");
String mimeType = Classification.ext2mime(targetExt, "text/html");
String ext = (String) conProp.get("EXT"); if (ext == null) ext = "";
final boolean zipContent = requestHeader.acceptGzip() && HTTPDemon.shallTransportZipped("." + ext);
if (path.endsWith("html") ||

@ -37,6 +37,7 @@ import java.util.TreeSet;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.services.federated.yacy.CacheStrategy;
import net.yacy.document.Classification;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.WordTokenizer;
@ -47,7 +48,6 @@ import net.yacy.kelondro.index.RowSpaceExceededException;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.Base64Order;
import net.yacy.kelondro.util.ByteArray;
import de.anomic.data.MimeTable;
public class MediaSnippet implements Comparable<MediaSnippet>, Comparator<MediaSnippet> {
@ -167,7 +167,7 @@ public class MediaSnippet implements Comparable<MediaSnippet>, Comparator<MediaS
final int ranking = removeAppearanceHashes(url.toNormalform(false, false), queryhashes).size() +
removeAppearanceHashes(desc, queryhashes).size();
if (ranking < 2 * queryhashes.size()) {
result.add(new MediaSnippet(mediatype, url, MimeTable.url2mime(url), desc, document.getTextLength(), null, ranking, source));
result.add(new MediaSnippet(mediatype, url, Classification.url2mime(url), desc, document.getTextLength(), null, ranking, source));
}
}
return result;
@ -196,7 +196,7 @@ public class MediaSnippet implements Comparable<MediaSnippet>, Comparator<MediaS
removeAppearanceHashes(url.toNormalform(false, false), queryhashes).size() -
removeAppearanceHashes(desc, queryhashes).size();
final long ranking = Long.MAX_VALUE - (ientry.height() + 1) * (ientry.width() + 1) * (appcount + 1);
result.add(new MediaSnippet(ContentDomain.IMAGE, url, MimeTable.url2mime(url), desc, ientry.fileSize(), ientry.width(), ientry.height(), ranking, source));
result.add(new MediaSnippet(ContentDomain.IMAGE, url, Classification.url2mime(url), desc, ientry.fileSize(), ientry.width(), ientry.height(), ranking, source));
}
return result;
}

@ -23,11 +23,11 @@ package net.yacy.document;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.HashSet;
import java.util.Properties;
import java.util.Set;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.kelondro.logging.Log;
public class Classification {
@ -38,22 +38,8 @@ public class Classification {
private static final Set<String> videoExtSet = new HashSet<String>();
private static final Set<String> appsExtSet = new HashSet<String>();
private static final Properties ext2mime = new Properties();
static {
// load a list of extensions from file
BufferedInputStream bufferedIn = null;
File mimeFile = new File("defaults/httpd.mime");
if (!mimeFile.exists()) mimeFile = new File("config/mime.properties");
try {
ext2mime.load(bufferedIn = new BufferedInputStream(new FileInputStream(mimeFile)));
} catch (final IOException e) {
Log.logSevere("Classification", "httpd.mime not found in " + mimeFile.toString(), e);
} finally {
if (bufferedIn != null) try {
bufferedIn.close();
} catch (final Exception e) {}
}
final String apps = "7z,ace,arc,arj,apk,asf,asx,bat,bin,bkf,bz2,cab,com,css,dcm,deb,dll,dmg,exe,gho,ghs,gz,hqx,img,iso,jar,lha,rar,sh,sit,sitx,tar,tbz,tgz,tib,torrent,vbs,war,zip";
final String audio = "aac,aif,aiff,flac,m4a,m4p,mid,mp2,mp3,oga,ogg,ram,sid,wav,wma";
@ -102,4 +88,41 @@ public class Classification {
return mimeType.toUpperCase().startsWith("IMAGE");
}
private static final Properties mimeTable = new Properties();
public static void init(final File mimeFile) {
if (mimeTable.isEmpty()) {
// load the mime table
BufferedInputStream mimeTableInputStream = null;
try {
mimeTableInputStream = new BufferedInputStream(new FileInputStream(mimeFile));
mimeTable.load(mimeTableInputStream);
} catch (final Exception e) {
Log.logException(e);
} finally {
if (mimeTableInputStream != null) try { mimeTableInputStream.close(); } catch (final Exception e1) {}
}
}
}
public static int countMimes() {
return mimeTable.size();
}
public static String ext2mime(final String ext) {
return mimeTable.getProperty(ext, "application/" + ext);
}
public static String ext2mime(final String ext, final String dfltMime) {
return mimeTable.getProperty(ext, dfltMime);
}
public static String url2mime(final MultiProtocolURI url, final String dfltMime) {
return ext2mime(url.getFileExtension(), dfltMime);
}
public static String url2mime(final MultiProtocolURI url) {
return ext2mime(url.getFileExtension());
}
}

@ -32,15 +32,20 @@ import java.net.MalformedURLException;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;
import java.util.ArrayList;
import java.util.Map;
import java.util.Map.Entry;
import java.util.regex.Pattern;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.document.AbstractParser;
import net.yacy.document.Classification;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.parser.html.CharacterCoding;
import net.yacy.document.parser.html.ContentScraper;
import net.yacy.document.parser.html.ImageEntry;
import net.yacy.document.parser.html.ScraperInputStream;
import net.yacy.document.parser.html.TransformerWriter;
import net.yacy.kelondro.util.FileUtils;
@ -83,6 +88,133 @@ public class htmlParser extends AbstractParser implements Parser {
this.SUPPORTED_MIME_TYPES.add("text/csv");
}
/**
 * Parses an HTML resource and returns the page itself plus one synthetic
 * document per linked media object (application, audio, video, image), so
 * that every media file linked from the page gets its own index entry.
 *
 * @param location URL of the resource being parsed
 * @param mimeType MIME type of the resource
 * @param documentCharset charset hint for decoding; may be overridden by detection
 * @param sourceStream raw content stream of the resource
 * @return the main document first, followed by the virtual media documents
 * @throws Parser.Failure if reading or scraping the source stream fails
 */
public Document[] parse(
        final MultiProtocolURI location,
        final String mimeType,
        final String documentCharset,
        final InputStream sourceStream) throws Parser.Failure, InterruptedException {
    try {
        // scrape the page, then turn the scraper content into the main document
        final ContentScraper pageScraper = parseToScraper(location, documentCharset, sourceStream);
        final Document mainDocument = transformScraper(location, mimeType, documentCharset, pageScraper);

        // collect the main document plus one virtual document for every
        // media object that the page links to
        final ArrayList<Document> result = new ArrayList<Document>();
        result.add(mainDocument);
        for (final Map.Entry<MultiProtocolURI, String> link : mainDocument.getApplinks().entrySet()) {
            addLinkDocs(result, "application", link.getKey(), link.getValue(), pageScraper);
        }
        for (final Map.Entry<MultiProtocolURI, String> link : mainDocument.getAudiolinks().entrySet()) {
            addLinkDocs(result, "audio", link.getKey(), link.getValue(), pageScraper);
        }
        for (final Map.Entry<MultiProtocolURI, String> link : mainDocument.getVideolinks().entrySet()) {
            addLinkDocs(result, "video", link.getKey(), link.getValue(), pageScraper);
        }
        for (final Map.Entry<MultiProtocolURI, ImageEntry> image : mainDocument.getImages().entrySet()) {
            addImageDocs(result, image.getValue());
        }
        return result.toArray(new Document[result.size()]);
    } catch (final IOException e) {
        throw new Parser.Failure("IOException in htmlParser: " + e.getMessage(), location);
    }
}
/**
 * Appends a synthetic document for a media link (application, audio or
 * video) found in the parsed page, so the linked object becomes indexable.
 *
 * @param docs    target list the new document is appended to
 * @param type    media category, used as the section entry ("application", "audio", "video")
 * @param uri     URI of the linked media object
 * @param descr   link/anchor text describing the object
 * @param scraper scraper of the embedding page; supplies the content languages
 */
private final static void addLinkDocs(ArrayList<Document> docs, String type, MultiProtocolURI uri, String descr, ContentScraper scraper) {
    // the MIME type is guessed from the file extension; "UTF-8" is a fixed
    // charset placeholder since the media object carries no parsed text
    docs.add(new Document(
            uri,
            Classification.ext2mime(uri.getFileExtension()),
            "UTF-8",
            null,
            scraper.getContentLanguages(),
            null,
            descr,
            "",
            "",
            new String[]{descr},
            type,
            0.0f, 0.0f,
            uri.toNormalform(false, false),
            null,
            null,
            null,
            false));
}
/**
 * Appends a synthetic document for an image referenced by the parsed page,
 * so the image itself becomes indexable.
 *
 * @param docs target list the new document is appended to
 * @param img  image entry holding the image URL and its alt text
 */
private final static void addImageDocs(ArrayList<Document> docs, ImageEntry img) {
    // MIME type is guessed from the image file extension; the alt text
    // serves as both title and section content of the virtual document
    docs.add(new Document(
            img.url(),
            Classification.ext2mime(img.url().getFileExtension()),
            "UTF-8",
            null,
            null,
            null,
            img.alt(),
            "",
            "",
            new String[]{img.alt()},
            "image",
            0.0f, 0.0f,
            img.url().toNormalform(false, false),
            null,
            null,
            null,
            false));
}
/**
 * Converts the result of the HTML scraper into a single Document object.
 *
 * @param location URL of the parsed resource
 * @param mimeType MIME type of the resource
 * @param charSet  character set the resource was decoded with
 * @param scraper  filled scraper holding the extracted page content
 * @return the document assembled from the scraper's fields
 */
private static Document transformScraper(final MultiProtocolURI location, final String mimeType, final String charSet, final ContentScraper scraper) {
    // flatten the headlines of all six levels into one section array
    int sectionCount = 0;
    for (int level = 1; level <= 6; level++) {
        sectionCount += scraper.getHeadlines(level).length;
    }
    final String[] sections = new String[sectionCount];
    int pos = 0;
    for (int level = 1; level <= 6; level++) {
        for (final String headline : scraper.getHeadlines(level)) {
            sections[pos++] = headline;
        }
    }
    final Document doc = new Document(
            location,
            mimeType,
            charSet,
            scraper,
            scraper.getContentLanguages(),
            scraper.getKeywords(),
            scraper.getTitle(),
            scraper.getAuthor(),
            scraper.getPublisher(),
            sections,
            scraper.getDescription(),
            scraper.getLon(), scraper.getLat(),
            scraper.getText(),
            scraper.getAnchors(),
            scraper.getRSS(),
            scraper.getImages(),
            scraper.indexingDenied());
    //scraper.close();
    doc.setFavicon(scraper.getFavicon());
    return doc;
}
public static ContentScraper parseToScraper(
final MultiProtocolURI location,
final String documentCharset,
@ -109,12 +241,12 @@ public class htmlParser extends AbstractParser implements Parser {
// the author didn't tell us the encoding, try the mozilla-heuristic
if (charset == null) {
final CharsetDetector det = new CharsetDetector();
det.enableInputFilter(true);
final InputStream detStream = new BufferedInputStream(sourceStream);
det.setText(detStream);
charset = det.detect().getName();
sourceStream = detStream;
final CharsetDetector det = new CharsetDetector();
det.enableInputFilter(true);
final InputStream detStream = new BufferedInputStream(sourceStream);
det.setText(detStream);
charset = det.detect().getName();
sourceStream = detStream;
}
// wtf? still nothing, just take system-standard
@ -124,11 +256,11 @@ public class htmlParser extends AbstractParser implements Parser {
Charset c;
try {
c = Charset.forName(charset);
c = Charset.forName(charset);
} catch (final IllegalCharsetNameException e) {
c = Charset.defaultCharset();
c = Charset.defaultCharset();
} catch (final UnsupportedCharsetException e) {
c = Charset.defaultCharset();
c = Charset.defaultCharset();
}
// parsing the content
@ -139,7 +271,7 @@ public class htmlParser extends AbstractParser implements Parser {
} catch (final IOException e) {
throw new Parser.Failure("IO error:" + e.getMessage(), location);
} finally {
sourceStream.close();
sourceStream.close();
writer.close();
}
//OutputStream hfos = new htmlFilterOutputStream(null, scraper, null, false);
@ -152,59 +284,6 @@ public class htmlParser extends AbstractParser implements Parser {
return scraper;
}
/**
 * Pre-refactoring single-document parse (the removed side of this diff):
 * scrapes the HTML resource and returns exactly one Document for it,
 * without creating extra documents for linked media objects.
 *
 * @param location URL of the resource being parsed
 * @param mimeType MIME type of the resource
 * @param documentCharset charset hint from the caller; detection may apply downstream
 * @param sourceStream raw content stream of the resource
 * @return a one-element array holding the parsed document
 * @throws Parser.Failure if an I/O error occurs while reading or scraping
 */
public Document[] parse(
final MultiProtocolURI location,
final String mimeType,
final String documentCharset,
final InputStream sourceStream) throws Parser.Failure, InterruptedException {
try {
// scrape first, then convert the scraper content into the document array
return transformScraper(location, mimeType, documentCharset, parseToScraper(location, documentCharset, sourceStream));
} catch (final IOException e) {
// wrap the low-level I/O failure into the parser's failure type
throw new Parser.Failure("IOException in htmlParser: " + e.getMessage(), location);
}
}
/**
 * Pre-refactoring scraper-to-document conversion (the removed side of this
 * diff): builds a one-element Document array from the scraper's extracted
 * page content.
 *
 * @param location URL of the parsed resource
 * @param mimeType MIME type of the resource
 * @param charSet  character set the resource was decoded with
 * @param scraper  filled scraper holding the extracted page content
 * @return a one-element array with the assembled document
 */
private static Document[] transformScraper(final MultiProtocolURI location, final String mimeType, final String charSet, final ContentScraper scraper) {
// flatten the headlines of levels 1..6 into a single section array
final String[] sections = new String[
scraper.getHeadlines(1).length +
scraper.getHeadlines(2).length +
scraper.getHeadlines(3).length +
scraper.getHeadlines(4).length +
scraper.getHeadlines(5).length +
scraper.getHeadlines(6).length];
int p = 0;
for (int i = 1; i <= 6; i++) {
for (final String headline : scraper.getHeadlines(i)) {
sections[p++] = headline;
}
}
// assemble the single document from the scraper's fields
final Document[] ppds = new Document[]{new Document(
location,
mimeType,
charSet,
scraper,
scraper.getContentLanguages(),
scraper.getKeywords(),
scraper.getTitle(),
scraper.getAuthor(),
scraper.getPublisher(),
sections,
scraper.getDescription(),
scraper.getLon(), scraper.getLat(),
scraper.getText(),
scraper.getAnchors(),
scraper.getRSS(),
scraper.getImages(),
scraper.indexingDenied())};
//scraper.close();
// attach the page favicon to the (single) resulting document
for (final Document ppd: ppds) {
ppd.setFavicon(scraper.getFavicon());
}
return ppds;
}
/**
* some HTML authors use wrong encoding names, either because they don't know exactly what they
* are doing or they produce a typo. Many times, the upper/lowercase scheme of the name is fuzzy

Loading…
Cancel
Save