*) migration of mimeTypeParser to jmimemagic 0.1

- better mimetype detection for rss feeds
- better mimetype detection for odt documents (less memory consuming)
- two new detector classes implementing MagicDetector interface of jmimemagic

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2650 6c8d7289-2bf4-0310-a012-ef5d649a1542
19 years ago · 813a8a8179
parent 3f5a4153a0
commit 813a8a8179
12 changed files with 273 additions and 35 deletions
--- a/libx/jmimemagic-0.0.4a.jar
+++ b/libx/jmimemagic-0.0.4a.jar
--- a/libx/jmimemagic-0.1.0.jar
+++ b/libx/jmimemagic-0.1.0.jar
--- a/libx/jmimemagic-0.0.4a.license
+++ b/libx/jmimemagic-0.0.4a.license
@ -55,7 +55,7 @@ modified by someone else and passed on, the recipients should know
 that what they have is not the original version, so that the original
 author's reputation will not be affected by problems that might be
 introduced by others.
-
+
  Finally, software patents pose a constant threat to the existence of
 any free program.  We wish to make sure that a company cannot
 effectively restrict the users of a free program by obtaining a
@ -111,7 +111,7 @@ modification follow.  Pay close attention to the difference between a
 "work based on the library" and a "work that uses the library".  The
 former contains code derived from the library, whereas the latter must
 be combined with the library in order to run.
-
+
 		  GNU LESSER GENERAL PUBLIC LICENSE
   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION

@ -158,7 +158,7 @@ Library.
  You may charge a fee for the physical act of transferring a copy,
 and you may at your option offer warranty protection in exchange for a
 fee.
-
+
  2. You may modify your copy or copies of the Library or any portion
 of it, thus forming a work based on the Library, and copy and
 distribute such modifications or work under the terms of Section 1
@ -216,7 +216,7 @@ instead of to this License.  (If a newer version than version 2 of the
 ordinary GNU General Public License has appeared, then you can specify
 that version instead if you wish.)  Do not make any other change in
 these notices.
-
+
  Once this change is made in a given copy, it is irreversible for
 that copy, so the ordinary GNU General Public License applies to all
 subsequent copies and derivative works made from that copy.
@ -267,7 +267,7 @@ Library will still fall under Section 6.)
 distribute the object code for the work under the terms of Section 6.
 Any executables containing that work also fall under Section 6,
 whether or not they are linked directly with the Library itself.
-
+
  6. As an exception to the Sections above, you may also combine or
 link a "work that uses the Library" with the Library to produce a
 work containing portions of the Library, and distribute that work
@ -329,7 +329,7 @@ restrictions of other proprietary libraries that do not normally
 accompany the operating system.  Such a contradiction means you cannot
 use both them and the Library together in an executable that you
 distribute.
-
+
  7. You may place library facilities that are a work based on the
 Library side-by-side in a single library together with other library
 facilities not covered by this License, and distribute such a combined
@ -370,7 +370,7 @@ subject to these terms and conditions.  You may not impose any further
 restrictions on the recipients' exercise of the rights granted herein.
 You are not responsible for enforcing compliance by third parties with
 this License.
-
+
  11. If, as a consequence of a court judgment or allegation of patent
 infringement or for any other reason (not limited to patent issues),
 conditions are imposed on you (whether by court order, agreement or
@ -422,7 +422,7 @@ conditions either of that version or of any later version published by
 the Free Software Foundation.  If the Library does not specify a
 license version number, you may choose any version ever published by
 the Free Software Foundation.
-
+
  14. If you wish to incorporate parts of the Library into other free
 programs whose distribution conditions are incompatible with these,
 write to the author to ask for permission.  For software which is
@ -456,7 +456,7 @@ SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
 DAMAGES.

 		     END OF TERMS AND CONDITIONS
-
+
           How to Apply These Terms to Your New Libraries

  If you develop a new library, and you want it to be of the greatest
--- a/source/de/anomic/plasma/parser/mimeType/build.xml
+++ b/source/de/anomic/plasma/parser/mimeType/build.xml
@ -16,9 +16,10 @@
  	  	  <pathelement location="${build}" />	
  	  	
 		  <!-- main lib needed to parse doc files -->  	  	
-  		  <pathelement location="${libx}/jmimemagic-0.0.4a.jar" /> 
+  		  <pathelement location="${libx}/jmimemagic-0.1.0.jar" /> 
          <pathelement location="${libx}/jakarta-oro-2.0.7.jar" />                          
          <pathelement location="${libx}/log4j-1.2.9.jar" /> 
+          <pathelement location="${libx}/commons-logging.jar" />
          <pathelement location="${libx}/xerces.jar"/>     
  	  	</classpath>
  	  </javac>    	
@ -28,7 +29,7 @@
    <target name="zip" depends="compile">
  	  <tar destfile="${parserArchive}" compression="gzip">
  	  	<tarfileset dir="${libx}" 
-  	  				includes="jmimemagic-0.0.4a.jar,jakarta-oro-2.0.7.jar,log4j-1.2.9.jar,xerces.jar" 
+  	  				includes="jmimemagic-0.1.0.jar,jakarta-oro-2.0.7.jar,log4j-1.2.9.jar,commons-logging.jar,xerces.jar" 
  	  				prefix="${releaseFileParentDir}/libx/"
 			  	  	dirmode="755" mode="644"/>
  	  	<tarfileset dir="${src}/de/anomic/plasma/parser/${parserShortName}" 
@ -42,7 +43,7 @@

    <target name="copy" depends="compile">
        <copy todir="${release}/libx/">
-             <fileset dir="${libx}" includes="jmimemagic-0.0.4a.jar,jakarta-oro-2.0.7.jar,log4j-1.2.9.jar,xerces.jar"/> 
+             <fileset dir="${libx}" includes="jmimemagic-0.1.0.jar,jakarta-oro-2.0.7.jar,log4j-1.2.9.jar,commons-logging.jar,xerces.jar"/> 
        </copy>
        <copy todir="${release}/source/de/anomic/plasma/parser/${parserShortName}">
             <fileset dir="${src}/de/anomic/plasma/parser/${parserShortName}" includes="**/*"/> 
--- a/source/de/anomic/plasma/parser/mimeType/mimeTypeParser.java
+++ b/source/de/anomic/plasma/parser/mimeType/mimeTypeParser.java
@ -55,6 +55,7 @@ import org.apache.log4j.Logger;

 import net.sf.jmimemagic.Magic;
 import net.sf.jmimemagic.MagicMatch;
+import net.sf.jmimemagic.MagicMatchNotFoundException;

 import de.anomic.plasma.plasmaParser;
 import de.anomic.plasma.plasmaParserDocument;
@ -86,7 +87,7 @@ implements Parser {
     * @see Parser#getLibxDependences()
     */
    private static final String[] LIBX_DEPENDENCIES = new String[] {
-        "jmimemagic-0.0.4a.jar",
+        "jmimemagic-0.1.0.jar",
        "jakarta-oro-2.0.7.jar",
        "log4j-1.2.9.jar",
        "xerces.jar"
@ -106,9 +107,8 @@ implements Parser {
    public String getMimeType (File sourceFile) {
        String mimeType = null;
        
-        try {    
-            Magic theMagic = new Magic();           
-            MagicMatch match = theMagic.getMagicMatch(sourceFile);        
+        try {           
+            MagicMatch match = Magic.getMagicMatch(sourceFile,true);        
            
            // if a match was found we can return the new mimeType
            if (match!=null) {
@ -145,10 +145,8 @@ implements Parser {
            // deactivating the logging for jMimeMagic
            Logger jmimeMagicLogger = Logger.getLogger("net.sf.jmimemagic");
            jmimeMagicLogger.setLevel(Level.OFF);
-            
-            Magic theMagic = new Magic();           
-            MagicMatch match = theMagic.getMagicMatch(sourceFile);
-            
+
+            MagicMatch match = Magic.getMagicMatch(sourceFile,true,false);
            
            // if a match was found we can return the new mimeType
            if (match!=null) {
@ -172,7 +170,8 @@ implements Parser {
                return theParser.parseSource(location,mimeType,charset,sourceFile);
            }
            throw new ParserException("Unable to detect mimetype of resource.",location);
-            
+        } catch (MagicMatchNotFoundException e) {
+            throw new ParserException("Unable to detect mimetype of resource.",location);
        } catch (Exception e) {
            if (e instanceof InterruptedException) throw (InterruptedException) e;
            if (e instanceof ParserException) throw (ParserException) e;
--- a/source/de/anomic/plasma/parser/mimeType/odtDetector.java
+++ b/source/de/anomic/plasma/parser/mimeType/odtDetector.java
@ -0,0 +1,112 @@
+//odtDetector.java 
+//------------------------
+//part of YaCy
+//(C) by Michael Peter Christen; mc@anomic.de
+//first published on http://www.anomic.de
+//Frankfurt, Germany, 2005
+//
+//this file is contributed by Martin Thelian
+//last major change: 16.05.2005
+//
+//This program is free software; you can redistribute it and/or modify
+//it under the terms of the GNU General Public License as published by
+//the Free Software Foundation; either version 2 of the License, or
+//(at your option) any later version.
+//
+//This program is distributed in the hope that it will be useful,
+//but WITHOUT ANY WARRANTY; without even the implied warranty of
+//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+//GNU General Public License for more details.
+//
+//You should have received a copy of the GNU General Public License
+//along with this program; if not, write to the Free Software
+//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+//
+//Using this software in any meaning (reading, learning, copying, compiling,
+//running) means that you agree that the Author(s) is (are) not responsible
+//for cost, loss of data or any harm that may be caused directly or indirectly
+//by usage of this softare or this documentation. The usage of this software
+//is on your own risk. The installation and usage (starting/running) of this
+//software may allow other people or application to access your computer and
+//any attached devices and is highly dependent on the configuration of the
+//software which must be done by the user of the software; the author(s) is
+//(are) also not responsible for proper configuration and usage of the
+//software, even if provoked by documentation provided together with
+//the software.
+//
+//Any changes to this file according to the GPL as documented in the file
+//gpl.txt aside this file in the shipment you received can be done to the
+//lines that follows this copyright notice here, but changes must not be
+//done inside the copyright notive above. A re-distribution must contain
+//the intact and unchanged copyright notice.
+//Contributions and changes to the program code must be marked as such.
+
+
+package de.anomic.plasma.parser.mimeType;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Map;
+import java.util.zip.ZipEntry;
+import java.util.zip.ZipFile;
+
+import net.sf.jmimemagic.MagicDetector;
+import de.anomic.server.serverFileUtils;
+
+/**
+ * jMimeMagic detector that reports the mimetype stored inside a zip based
+ * document (e.g. an OpenDocument text file) by reading the content of the
+ * zip entry named <code>mimetype</code>.
+ */
+public class odtDetector implements MagicDetector {
+
+    /** @return the human readable name of this detector */
+    public String getDisplayName() {
+        return "ODT MimeType Detector";
+    }
+
+    /** @return the file extensions this detector declares to handle */
+    public String[] getHandledExtensions() {
+        return new String[]{"zip","odt"};
+    }
+
+    /** @return the mimetypes this detector declares to handle */
+    public String[] getHandledTypes() {
+        return new String[] { "application/vnd.oasis.opendocument.text", "application/x-vnd.oasis.opendocument.text" };
+    }
+
+    /** @return the internal name of this detector */
+    public String getName() {
+        return "odtfiledetector";
+    }
+
+    /** @return the version string of this detector */
+    public String getVersion() {
+        return "0.1";
+    }
+
+    /**
+     * Detects the mimetype of in-memory content. The data is written to a
+     * temporary file first, because the zip container is read through
+     * {@link ZipFile}, and the temporary file is removed again afterwards.
+     * All other arguments are handed through unchanged to
+     * {@link #process(File, int, int, long, char, String, Map)}.
+     *
+     * @param data the content to inspect
+     * @return a one element array containing the detected mimetype, or
+     *         <code>null</code> if the data could not be written or no
+     *         mimetype could be determined
+     */
+    public String[] process(byte[] data, int offset, int length, long bitmask, char comparator, String mimeType, Map params) {
+        File dstFile = null;
+        try {
+            dstFile = File.createTempFile("mimeTypeParser",".tmp");
+            serverFileUtils.write(data,dstFile);
+            return process(dstFile, offset, length, bitmask, comparator, mimeType, params);
+        } catch (IOException e) {
+            return null;
+        } finally {
+            if (dstFile != null) {dstFile.delete();}            
+        }
+    }
+
+    /**
+     * Detects the mimetype of a file by opening it as zip archive and
+     * reading the content of the entry named <code>mimetype</code>.
+     * Only <code>file</code> is evaluated; the remaining arguments are not
+     * used by this detector.
+     *
+     * @param file the file to inspect
+     * @return a one element array containing the content of the
+     *         <code>mimetype</code> entry, or <code>null</code> if the file
+     *         is no readable zip archive or contains no such entry
+     */
+    public String[] process(File file, int offset, int length, long bitmask, char comparator, String mimeType, Map params) {
+        ZipFile zipFile = null;
+        try {
+            // opening the zip file
+            zipFile = new ZipFile(file);
+            
+            // searching for a file named mimetype
+            ZipEntry mimeTypeInfo = zipFile.getEntry("mimetype");
+            if (mimeTypeInfo == null) return null;
+            
+            // read in the content of the file
+            InputStream zippedContent = zipFile.getInputStream(mimeTypeInfo); 
+            String realMimeType = new String(serverFileUtils.read(zippedContent, mimeTypeInfo.getSize()));
+            
+            return new String[]{realMimeType};
+        } catch (Exception e) {
+            return null;
+        } finally {
+            // Always release the file handle. Closing the ZipFile also closes
+            // the entry stream obtained from it. Without this the handle leaks
+            // on every call and the caller's temp file may not be deletable
+            // while it is still open.
+            if (zipFile != null) try { zipFile.close(); } catch (IOException e) { /* ignore this */ }
+        }
+        
+    }
+
+}
--- a/source/de/anomic/plasma/parser/mimeType/rssDetector.java
+++ b/source/de/anomic/plasma/parser/mimeType/rssDetector.java
@ -0,0 +1,118 @@
+//rssDetector.java 
+//------------------------
+//part of YaCy
+//(C) by Michael Peter Christen; mc@anomic.de
+//first published on http://www.anomic.de
+//Frankfurt, Germany, 2005
+//
+//this file is contributed by Martin Thelian
+//last major change: 16.05.2005
+//
+//This program is free software; you can redistribute it and/or modify
+//it under the terms of the GNU General Public License as published by
+//the Free Software Foundation; either version 2 of the License, or
+//(at your option) any later version.
+//
+//This program is distributed in the hope that it will be useful,
+//but WITHOUT ANY WARRANTY; without even the implied warranty of
+//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+//GNU General Public License for more details.
+//
+//You should have received a copy of the GNU General Public License
+//along with this program; if not, write to the Free Software
+//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+//
+//Using this software in any meaning (reading, learning, copying, compiling,
+//running) means that you agree that the Author(s) is (are) not responsible
+//for cost, loss of data or any harm that may be caused directly or indirectly
+//by usage of this softare or this documentation. The usage of this software
+//is on your own risk. The installation and usage (starting/running) of this
+//software may allow other people or application to access your computer and
+//any attached devices and is highly dependent on the configuration of the
+//software which must be done by the user of the software; the author(s) is
+//(are) also not responsible for proper configuration and usage of the
+//software, even if provoked by documentation provided together with
+//the software.
+//
+//Any changes to this file according to the GPL as documented in the file
+//gpl.txt aside this file in the shipment you received can be done to the
+//lines that follows this copyright notice here, but changes must not be
+//done inside the copyright notive above. A re-distribution must contain
+//the intact and unchanged copyright notice.
+//Contributions and changes to the program code must be marked as such.
+
+
+package de.anomic.plasma.parser.mimeType;
+
+import java.io.ByteArrayInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.InputStream;
+import java.lang.reflect.Method;
+import java.util.Map;
+
+import net.sf.jmimemagic.MagicDetector;
+
+/**
+ * jMimeMagic detector that classifies content as RSS or Atom feed by asking
+ * the class <code>de.nava.informa.utils.FormatDetector</code> for the format.
+ */
+public class rssDetector implements MagicDetector {
+
+    /** @return the human readable name of this detector */
+    public String getDisplayName() {
+        return "RSS MimeType Detector";
+    }
+
+    /** @return the file extensions this detector declares to handle */
+    public String[] getHandledExtensions() {
+        return new String[]{"xml","rss","rdf","atom"};
+    }
+
+    /** @return the mimetypes this detector declares to handle */
+    public String[] getHandledTypes() {
+        return new String[] { "text/rss", "application/rdf+xml", "application/rss+xml", "application/atom+xml" };
+    }
+
+    /** @return the internal name of this detector */
+    public String getName() {
+        return "rssfiledetector";
+    }
+
+    /** @return the version string of this detector */
+    public String getVersion() {
+        return "0.1";
+    }
+
+    /**
+     * Detects the feed type of a file. Only <code>file</code> is evaluated.
+     *
+     * @return a one element array with the detected mimetype, or
+     *         <code>null</code> if the file can not be read or is no
+     *         recognized feed
+     */
+    public String[] process(File file, int offset, int length, long bitmask, char comparator, String mimeType, Map params) {
+        InputStream content = null;
+        try {
+            content = new FileInputStream(file);
+            return detect(content);
+        } catch (Exception e) {
+            return null;
+        } finally {
+            // the stream is only needed for the detection itself
+            if (content != null) {
+                try { content.close(); } catch (Exception e) { /* ignore this */ }
+            }
+        }
+    }
+
+    /**
+     * Detects the feed type of in-memory content. Only <code>data</code> is
+     * evaluated.
+     *
+     * @return a one element array with the detected mimetype, or
+     *         <code>null</code> if the data is no recognized feed
+     */
+    public String[] process(byte[] data, int offset, int length, long bitmask, char comparator, String mimeType, Map params) {
+        return detect(new ByteArrayInputStream(data));
+    }
+    
+    /**
+     * Invokes the static method <code>getFormat(InputStream)</code> of
+     * <code>de.nava.informa.utils.FormatDetector</code> via reflection and
+     * maps the textual form of its result to a mimetype.
+     * NOTE(review): reflection is presumably used so that this class works
+     * without the informa library on the classpath - confirm.
+     *
+     * @return <code>application/rss+xml</code> or
+     *         <code>application/atom+xml</code> wrapped in an array, or
+     *         <code>null</code> if the format is unknown or the lookup failed
+     */
+    private String[] detect(InputStream input) {
+        try {
+            // look up the format detector class and its static getFormat method
+            Class detectorClass = Class.forName("de.nava.informa.utils.FormatDetector");
+            Method formatLookup = detectorClass.getMethod("getFormat", new Class[]{InputStream.class});
+            
+            // static method, therefore no target instance is passed
+            Object detected = formatLookup.invoke(null, new Object[] {input});
+            if (detected == null) return null;
+            
+            // only the prefix of the textual format name is evaluated
+            String formatName = detected.toString();
+            if (formatName.startsWith("RSS ")) return new String[]{"application/rss+xml"};
+            if (formatName.startsWith("Atom ")) return new String[]{"application/atom+xml"};
+            return null;
+        } catch (Exception e) {
+            return null;
+        } catch (Error e) {
+            return null;
+        }        
+    }
+
+}
--- a/source/de/anomic/plasma/parser/tar/tarParser.java
+++ b/source/de/anomic/plasma/parser/tar/tarParser.java
@ -45,7 +45,6 @@ package de.anomic.plasma.parser.tar;

 import java.io.File;
 import java.io.InputStream;
-import de.anomic.net.URL;
 import java.util.Arrays;
 import java.util.HashMap;
 import java.util.Hashtable;
@ -57,6 +56,7 @@ import java.util.zip.GZIPInputStream;
 import com.ice.tar.TarEntry;
 import com.ice.tar.TarInputStream;

+import de.anomic.net.URL;
 import de.anomic.plasma.plasmaParser;
 import de.anomic.plasma.plasmaParserDocument;
 import de.anomic.plasma.parser.AbstractParser;
@ -109,6 +109,8 @@ public class tarParser extends AbstractParser implements Parser {
                source = new GZIPInputStream(source);
            }
            
+            // TODO: what about bzip ....
+
            StringBuffer docKeywords = new StringBuffer();
            StringBuffer docShortTitle = new StringBuffer();  
            StringBuffer docLongTitle = new StringBuffer();   
@ -154,7 +156,7 @@ public class tarParser extends AbstractParser implements Parser {
                    // parsing the content                    
                    theDoc = theParser.parseSource(new URL(location,"#" + entryName),entryMime,null,tempFile);
                } catch (ParserException e) {
-                    this.theLogger.logInfo("Unable to parse tar file entry '" + entryName + "'. " + e.getErrorCode());
+                    this.theLogger.logInfo("Unable to parse tar file entry '" + entryName + "'. " + e.getMessage());
                } finally {
                    if (tempFile != null) try {tempFile.delete(); } catch(Exception ex){/* ignore this */}
                }
--- a/source/de/anomic/plasma/parser/zip/zipParser.java
+++ b/source/de/anomic/plasma/parser/zip/zipParser.java
@ -140,7 +140,7 @@ public class zipParser extends AbstractParser implements Parser {
                    // parsing the zip file entry
                    theDoc = theParser.parseSource(new URL(location,"#" + entryName),entryMime,null, tempFile);
                } catch (ParserException e) {
-                    this.theLogger.logInfo("Unable to parse zip file entry '" + entryName + "'. " + e.getErrorCode());
+                    this.theLogger.logInfo("Unable to parse zip file entry '" + entryName + "'. " + e.getMessage());
                } finally {
                    if (tempFile != null) try {tempFile.delete(); } catch(Exception ex){/* ignore this */}
                }
@ -185,7 +185,7 @@ public class zipParser extends AbstractParser implements Parser {
            if (e instanceof InterruptedException) throw (InterruptedException) e;
            if (e instanceof ParserException) throw (ParserException) e;
            
-            throw new ParserException("Unexpected error while parsing zip resource. " + e.getMessage(),location);
+            throw new ParserException("Unexpected error while parsing zip resource. " + e.getClass().getName() + ": "+ e.getMessage(),location);
        }
    }
    
--- a/source/de/anomic/plasma/plasmaParser.java
+++ b/source/de/anomic/plasma/plasmaParser.java
@ -50,7 +50,6 @@ import java.io.FileFilter;
 import java.io.FileInputStream;
 import java.io.FilenameFilter;
 import java.io.IOException;
-import java.io.UnsupportedEncodingException;
 import java.net.MalformedURLException;
 import java.net.URI;
 import java.util.Arrays;
@ -71,16 +70,13 @@ import org.apache.commons.pool.impl.GenericObjectPool;
 import de.anomic.htmlFilter.htmlFilterContentScraper;
 import de.anomic.htmlFilter.htmlFilterInputStream;
 import de.anomic.htmlFilter.htmlFilterWriter;
-import de.anomic.http.httpHeader;
 import de.anomic.http.httpc;
-import de.anomic.index.indexURL;
 import de.anomic.net.URL;
 import de.anomic.plasma.parser.Parser;
 import de.anomic.plasma.parser.ParserException;
 import de.anomic.plasma.parser.ParserInfo;
 import de.anomic.server.serverFileUtils;
 import de.anomic.server.logging.serverLog;
-import de.anomic.tools.bitfield;

 public final class plasmaParser {
    public static final String PARSER_MODE_PROXY   = "PROXY";
@ -512,7 +508,7 @@ public final class plasmaParser {
            
            // testing if parsing is supported for this resource
            if (!plasmaParser.supportedContent(location,mimeType)) {
-                String errorMsg = "No parser available to parse mimetype";
+                String errorMsg = "No parser available to parse mimetype '" + mimeType + "'";
                this.theLogger.logInfo("Unable to parse '" + location + "'. " + errorMsg);
                throw new ParserException(errorMsg,location,plasmaCrawlEURL.DENIED_WRONG_MIMETYPE_OR_EXT);
            }
@ -588,7 +584,7 @@ public final class plasmaParser {
            } else if (realtimeParsableMimeTypesContains(mimeType)) {                      
                doc = parseHtml(location, mimeType, documentCharset, sourceFile);
            } else {
-                String errorMsg = "No parser available to parse mimetype";
+                String errorMsg = "No parser available to parse mimetype '" + mimeType + "'";
                this.theLogger.logInfo("Unable to parse '" + location + "'. " + errorMsg);
                throw new ParserException(errorMsg,location,plasmaCrawlEURL.DENIED_WRONG_MIMETYPE_OR_EXT);                
            }
--- a/source/de/anomic/plasma/plasmaSnippetCache.java
+++ b/source/de/anomic/plasma/plasmaSnippetCache.java
@ -434,11 +434,15 @@ public class plasmaSnippetCache {
        try {
            if (resource == null) return null;

-            // try to get the header from the htcache directory
+            // if no resource metadata is available, try to load it
            if (docInfo == null) {
-                try {
+                // try to get the header from the htcache directory
+                try {                    
                    docInfo = this.cacheManager.loadResourceInfo(url);
                } catch (Exception e) {}
+                
+                // TODO: try to load it from web
+                
            }

            if (docInfo == null) {
--- a/source/de/anomic/server/serverFileUtils.java
+++ b/source/de/anomic/server/serverFileUtils.java
@ -274,8 +274,14 @@ public final class serverFileUtils {
    }

    public static byte[] read(InputStream source) throws IOException {
-        ByteArrayOutputStream baos = new ByteArrayOutputStream();
-        copy(source, baos, -1);
+        // -1 is the count the previous implementation handed to copy();
+        // presumably it means "no limit" - confirm against copy()
+        return read(source,-1);
+    }
+    
+    /**
+     * Reads bytes from the source stream into a byte array.
+     *
+     * @param source the stream to read from
+     * @param count  the value handed to copy() as count; if it is positive
+     *               and fits into an int it is also used as the initial
+     *               buffer capacity
+     * @return the bytes that were copied from the stream
+     * @throws IOException if copying from the stream fails
+     */
+    public static byte[] read(InputStream source, long count) throws IOException {
+        // only pre-size the buffer if the count survives the cast to int,
+        // otherwise the cast could yield a negative or wrong capacity
+        ByteArrayOutputStream baos = (count > 0 && count <= Integer.MAX_VALUE) 
+                                   ? new ByteArrayOutputStream((int)count) 
+                                   : new ByteArrayOutputStream();
+        copy(source, baos, count);
        baos.close();
        return baos.toByteArray();
    }